-
Type:
Bug
-
Resolution: Unresolved
-
Priority:
P4
-
None
-
Affects Version/s: 8, 25
-
Component/s: xml
-
generic
-
generic
ADDITIONAL SYSTEM INFORMATION :
Property settings:
file.encoding = UTF-8
file.separator = /
java.class.path =
java.class.version = 69.0
java.home = /home/user/.sdkman/candidates/java/25.0.1-tem
java.io.tmpdir = /tmp
java.library.path = /usr/java/packages/lib
/usr/lib64
/lib64
/lib
/usr/lib
java.runtime.name = OpenJDK Runtime Environment
java.runtime.version = 25.0.1+8-LTS
java.specification.name = Java Platform API Specification
java.specification.vendor = Oracle Corporation
java.specification.version = 25
java.vendor = Eclipse Adoptium
java.vendor.url = https://adoptium.net/
java.vendor.url.bug = https://github.com/adoptium/adoptium-support/issues
java.vendor.version = Temurin-25.0.1+8
java.version = 25.0.1
java.version.date = 2025-10-21
java.vm.compressedOopsMode = Zero based
java.vm.info = mixed mode, sharing
java.vm.name = OpenJDK 64-Bit Server VM
java.vm.specification.name = Java Virtual Machine Specification
java.vm.specification.vendor = Oracle Corporation
java.vm.specification.version = 25
java.vm.vendor = Eclipse Adoptium
java.vm.version = 25.0.1+8-LTS
jdk.debug = release
line.separator = \n
native.encoding = UTF-8
os.arch = amd64
os.name = Linux
os.version = 6.8.0-90-generic
path.separator = :
stderr.encoding = UTF-8
stdin.encoding = UTF-8
stdout.encoding = UTF-8
sun.arch.data.model = 64
sun.boot.library.path = /home/user/.sdkman/candidates/java/25.0.1-tem/lib
sun.cpu.endian = little
sun.io.unicode.encoding = UnicodeLittle
sun.java.launcher = SUN_STANDARD
sun.jnu.encoding = UTF-8
sun.management.compiler = HotSpot 64-Bit Tiered Compilers
user.country = US
user.dir = /home/user
user.home = /home/user
user.language = en
user.name = user
openjdk version "25.0.1" 2025-10-21 LTS
OpenJDK Runtime Environment Temurin-25.0.1+8 (build 25.0.1+8-LTS)
OpenJDK 64-Bit Server VM Temurin-25.0.1+8 (build 25.0.1+8-LTS, mixed mode, sharing)
A DESCRIPTION OF THE PROBLEM :
When an emoji with a variation selector (e.g., 🐧 = U+1F427 + U+FE0F) is placed inside a CDATA section, JDK XML Transformer incorrectly splits the emoji from its variation selector during transformation.
This violates "Unicode in XML and Other Markup Languages" (Unicode Technical Report #20, also published as a W3C Note), Section 4.3, which states that "For a variation selector to have an effect it must immediately follow its base character."
STEPS TO FOLLOW TO REPRODUCE THE PROBLEM :
When transforming XML with emoji containing variation selectors (like 🐧), the JDK's XML Transformer splits the emoji into multiple CDATA sections, separating the base character from its variation selector.
Input:
```xml
<div><![CDATA[🐧]]></div>
```
---------- BEGIN SOURCE ----------
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.w3c.dom.Document;
import java.io.StringWriter;
/**
 * Standalone reproducer for the JDK XML Transformer splitting an emoji from
 * its variation selector when the sequence sits inside a CDATA section.
 *
 * <p>The program parses a small document into a DOM, serializes it with an
 * identity {@link Transformer}, prints both forms, and exits with status 1
 * when the serialized output no longer contains the original CDATA section
 * in one piece.
 */
public class EmojiVariationSelectorBug {

    /**
     * U+1F427 (written as its UTF-16 surrogate pair) immediately followed by
     * U+FE0F. Unicode escapes are used on purpose: the variation selector is
     * invisible, so a literal character can be dropped by editors, copy/paste
     * or re-encoding without anyone noticing, which would silently change
     * what this reproducer tests.
     */
    private static final String EMOJI_WITH_SELECTOR = "\uD83D\uDC27\uFE0F";

    /** The CDATA section that must survive the round trip unchanged. */
    private static final String EXPECTED_CDATA = "<![CDATA[" + EMOJI_WITH_SELECTOR + "]]>";

    /**
     * Runs the reproducer.
     *
     * @param args ignored
     * @throws Exception if parsing or transforming the document fails
     */
    public static void main(String[] args) throws Exception {
        String input = "<div>" + EXPECTED_CDATA + "</div>";
        System.out.println("Input XML:");
        System.out.println(input);
        System.out.println();

        String output = roundTrip(input);
        System.out.println("Output XML:");
        System.out.println(output);
        System.out.println();

        // The check is on the CDATA section only, so the XML declaration the
        // serializer may prepend does not influence the verdict.
        if (output.contains(EXPECTED_CDATA)) {
            System.out.println("✓ TEST PASSED");
        } else {
            System.out.println("✗ TEST FAILED: Emoji with variation selector is split across CDATA sections");
            System.out.println();
            System.out.println("The transformer incorrectly splits the emoji from its variation selector,");
            System.out.println("violating Unicode in XML spec (W3C TR #20, Section 4.3).");
            System.exit(1);
        }
    }

    /**
     * Parses {@code xml} into a DOM and serializes it back to a string with
     * an identity transformer.
     *
     * <p>Package-private so the round trip can be exercised directly, without
     * going through {@link #main(String[])} and its {@code System.exit} call.
     *
     * @param xml the document text; it is fed to the parser as UTF-8 bytes
     * @return the text written by the transformer
     * @throws Exception if the parser or transformer cannot be created, or if
     *     parsing or transforming fails
     */
    static String roundTrip(String xml) throws Exception {
        DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        // Charset constant instead of a charset name: no lookup by string and
        // no UnsupportedEncodingException path.
        byte[] utf8 = xml.getBytes(java.nio.charset.StandardCharsets.UTF_8);
        Document document = builder.parse(new java.io.ByteArrayInputStream(utf8));

        StringWriter writer = new StringWriter();
        Transformer transformer = TransformerFactory.newInstance().newTransformer();
        transformer.transform(new DOMSource(document), new StreamResult(writer));
        return writer.toString();
    }
}
---------- END SOURCE ----------
FREQUENCY :
ALWAYS
Property settings:
file.encoding = UTF-8
file.separator = /
java.class.path =
java.class.version = 69.0
java.home = /home/user/.sdkman/candidates/java/25.0.1-tem
java.io.tmpdir = /tmp
java.library.path = /usr/java/packages/lib
/usr/lib64
/lib64
/lib
/usr/lib
java.runtime.name = OpenJDK Runtime Environment
java.runtime.version = 25.0.1+8-LTS
java.specification.name = Java Platform API Specification
java.specification.vendor = Oracle Corporation
java.specification.version = 25
java.vendor = Eclipse Adoptium
java.vendor.url = https://adoptium.net/
java.vendor.url.bug = https://github.com/adoptium/adoptium-support/issues
java.vendor.version = Temurin-25.0.1+8
java.version = 25.0.1
java.version.date = 2025-10-21
java.vm.compressedOopsMode = Zero based
java.vm.info = mixed mode, sharing
java.vm.name = OpenJDK 64-Bit Server VM
java.vm.specification.name = Java Virtual Machine Specification
java.vm.specification.vendor = Oracle Corporation
java.vm.specification.version = 25
java.vm.vendor = Eclipse Adoptium
java.vm.version = 25.0.1+8-LTS
jdk.debug = release
line.separator = \n
native.encoding = UTF-8
os.arch = amd64
os.name = Linux
os.version = 6.8.0-90-generic
path.separator = :
stderr.encoding = UTF-8
stdin.encoding = UTF-8
stdout.encoding = UTF-8
sun.arch.data.model = 64
sun.boot.library.path = /home/user/.sdkman/candidates/java/25.0.1-tem/lib
sun.cpu.endian = little
sun.io.unicode.encoding = UnicodeLittle
sun.java.launcher = SUN_STANDARD
sun.jnu.encoding = UTF-8
sun.management.compiler = HotSpot 64-Bit Tiered Compilers
user.country = US
user.dir = /home/user
user.home = /home/user
user.language = en
user.name = user
openjdk version "25.0.1" 2025-10-21 LTS
OpenJDK Runtime Environment Temurin-25.0.1+8 (build 25.0.1+8-LTS)
OpenJDK 64-Bit Server VM Temurin-25.0.1+8 (build 25.0.1+8-LTS, mixed mode, sharing)
A DESCRIPTION OF THE PROBLEM :
When an emoji with a variation selector (e.g., 🐧 = U+1F427 + U+FE0F) is placed inside a CDATA section, JDK XML Transformer incorrectly splits the emoji from its variation selector during transformation.
This violates "Unicode in XML and Other Markup Languages" (Unicode Technical Report #20, also published as a W3C Note), Section 4.3, which states that "For a variation selector to have an effect it must immediately follow its base character."
STEPS TO FOLLOW TO REPRODUCE THE PROBLEM :
When transforming XML with emoji containing variation selectors (like 🐧), the JDK's XML Transformer splits the emoji into multiple CDATA sections, separating the base character from its variation selector.
Input:
```xml
<div><![CDATA[🐧]]></div>
```
---------- BEGIN SOURCE ----------
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.w3c.dom.Document;
import java.io.StringWriter;
/**
 * Standalone reproducer for the JDK XML Transformer splitting an emoji from
 * its variation selector when the sequence sits inside a CDATA section.
 *
 * <p>The program parses a small document into a DOM, serializes it with an
 * identity {@link Transformer}, prints both forms, and exits with status 1
 * when the serialized output no longer contains the original CDATA section
 * in one piece.
 */
public class EmojiVariationSelectorBug {

    /**
     * U+1F427 (written as its UTF-16 surrogate pair) immediately followed by
     * U+FE0F. Unicode escapes are used on purpose: the variation selector is
     * invisible, so a literal character can be dropped by editors, copy/paste
     * or re-encoding without anyone noticing, which would silently change
     * what this reproducer tests.
     */
    private static final String EMOJI_WITH_SELECTOR = "\uD83D\uDC27\uFE0F";

    /** The CDATA section that must survive the round trip unchanged. */
    private static final String EXPECTED_CDATA = "<![CDATA[" + EMOJI_WITH_SELECTOR + "]]>";

    /**
     * Runs the reproducer.
     *
     * @param args ignored
     * @throws Exception if parsing or transforming the document fails
     */
    public static void main(String[] args) throws Exception {
        String input = "<div>" + EXPECTED_CDATA + "</div>";
        System.out.println("Input XML:");
        System.out.println(input);
        System.out.println();

        String output = roundTrip(input);
        System.out.println("Output XML:");
        System.out.println(output);
        System.out.println();

        // The check is on the CDATA section only, so the XML declaration the
        // serializer may prepend does not influence the verdict.
        if (output.contains(EXPECTED_CDATA)) {
            System.out.println("✓ TEST PASSED");
        } else {
            System.out.println("✗ TEST FAILED: Emoji with variation selector is split across CDATA sections");
            System.out.println();
            System.out.println("The transformer incorrectly splits the emoji from its variation selector,");
            System.out.println("violating Unicode in XML spec (W3C TR #20, Section 4.3).");
            System.exit(1);
        }
    }

    /**
     * Parses {@code xml} into a DOM and serializes it back to a string with
     * an identity transformer.
     *
     * <p>Package-private so the round trip can be exercised directly, without
     * going through {@link #main(String[])} and its {@code System.exit} call.
     *
     * @param xml the document text; it is fed to the parser as UTF-8 bytes
     * @return the text written by the transformer
     * @throws Exception if the parser or transformer cannot be created, or if
     *     parsing or transforming fails
     */
    static String roundTrip(String xml) throws Exception {
        DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        // Charset constant instead of a charset name: no lookup by string and
        // no UnsupportedEncodingException path.
        byte[] utf8 = xml.getBytes(java.nio.charset.StandardCharsets.UTF_8);
        Document document = builder.parse(new java.io.ByteArrayInputStream(utf8));

        StringWriter writer = new StringWriter();
        Transformer transformer = TransformerFactory.newInstance().newTransformer();
        transformer.transform(new DOMSource(document), new StreamResult(writer));
        return writer.toString();
    }
}
---------- END SOURCE ----------
FREQUENCY :
ALWAYS