import org.w3c.dom.bootstrap.DOMImplementationRegistry;
import org.w3c.dom.ls.*;
import org.w3c.dom.Document;
import java.nio.charset.StandardCharsets;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.util.Arrays;

/**
 * Small proof of concept that round-trips tiny UTF-8 encoded XML documents through the
 * DOM Load/Save (LS) parser and serializer and reports whether the serialized bytes are
 * identical to the input bytes.
 */
public class issue2 {

  /**
   * Runs all round-trip test cases and prints a report for each one to standard output.
   *
   * @param args ignored
   * @throws Exception if obtaining the DOM implementation, parsing, or serializing fails
   */
  public static void main(String[] args) throws Exception {
    // UTF-8 encodes a single code point in one of 4 lengths:
    // 1 code unit/byte for a single code point (ASCII characters)
    // 2 code units/bytes for a single code point
    // 3 code units/bytes for a single code point
    // 4 code units/bytes for a single code point
    //
    // Fuzz testing would be great here, but this is supposed to be a small POC
    // showing that XML serialization creates character references for certain inputs,
    // although *ALL* characters in both input and output are representable directly
    // in their encoding (both input and output are in UTF-8).
    //
    // We use bytes directly, so the test case does not depend on the source
    // encoding of this file. However, readable strings of the input bytes are given
    // in the comments, and to properly read them, you have to open this file as UTF-8.
    //
    // Tests were done on 64bit Ubuntu Linux 20.04, OpenJDK 8, 11 and 14.

    // Tests using a single 1 code unit character
    final byte[] test_1codeunit_1 = {
        (byte)0x3c, (byte)0x54, (byte)0x3e,                 // <T>
        (byte)0x61,                                         // a
        (byte)0x3c, (byte)0x2f, (byte)0x54, (byte)0x3e      // </T>
    };
    runTestCase("1 code unit, 1", test_1codeunit_1);

    final byte[] test_1codeunit_2 = {
        (byte)0x3c, (byte)0x54, (byte)0x3e,                 // <T>
        (byte)0x37,                                         // 7
        (byte)0x3c, (byte)0x2f, (byte)0x54, (byte)0x3e      // </T>
    };
    runTestCase("1 code unit, 2", test_1codeunit_2);

    // Tests using 2 code units characters
    final byte[] test_2codeunit_1 = {
        (byte)0x3c, (byte)0x54, (byte)0x3e,                 // <T>
        (byte)0xc2, (byte)0xa9,                             // ©
        (byte)0x3c, (byte)0x2f, (byte)0x54, (byte)0x3e      // </T>
    };
    runTestCase("2 code units, 1", test_2codeunit_1);

    final byte[] test_2codeunit_2 = {
        (byte)0x3c, (byte)0x54, (byte)0x3e,                 // <T>
        (byte)0xc4, (byte)0x8c,                             // Č
        (byte)0x3c, (byte)0x2f, (byte)0x54, (byte)0x3e      // </T>
    };
    runTestCase("2 code units, 2", test_2codeunit_2);

    // Tests using 3 code units characters
    final byte[] test_3codeunit_1 = {
        (byte)0x3c, (byte)0x54, (byte)0x3e,                 // <T>
        (byte)0xe1, (byte)0xae, (byte)0x9f,                 // ᮟ
        (byte)0x3c, (byte)0x2f, (byte)0x54, (byte)0x3e      // </T>
    };
    runTestCase("3 code units, 1", test_3codeunit_1);

    final byte[] test_3codeunit_2 = {
        (byte)0x3c, (byte)0x54, (byte)0x3e,                 // <T>
        (byte)0xe1, (byte)0xb8, (byte)0xaa,                 // Ḫ
        (byte)0x3c, (byte)0x2f, (byte)0x54, (byte)0x3e      // </T>
    };
    runTestCase("3 code units, 2", test_3codeunit_2);

    // Tests using 4 code units characters
    final byte[] test_4codeunit_1 = {
        (byte)0x3c, (byte)0x54, (byte)0x3e,                 // <T>
        (byte)0xf0, (byte)0x9f, (byte)0x9a, (byte)0xa9,     // 🚩
        (byte)0x3c, (byte)0x2f, (byte)0x54, (byte)0x3e      // </T>
    };
    runTestCase("4 code units, 1", test_4codeunit_1);

    final byte[] test_4codeunit_2 = {
        (byte)0x3c, (byte)0x54, (byte)0x3e,                 // <T>
        (byte)0xf0, (byte)0x9f, (byte)0x98, (byte)0x82,     // 😂
        (byte)0x3c, (byte)0x2f, (byte)0x54, (byte)0x3e      // </T>
    };
    runTestCase("4 code units, 2", test_4codeunit_2);

    // Let's also do some tests that combine characters with differing code unit sizes.
    // Start with the biggest (better would be to try different orderings as well).
    final byte[] test_2_1codeunit = {
        (byte)0x3c, (byte)0x54, (byte)0x3e,                 // <T>
        (byte)0xc2, (byte)0xa9,                             // ©
        (byte)0x61,                                         // a
        (byte)0x3c, (byte)0x2f, (byte)0x54, (byte)0x3e      // </T>
    };
    runTestCase("2 and 1 code units", test_2_1codeunit);

    final byte[] test_3_1codeunit = {
        (byte)0x3c, (byte)0x54, (byte)0x3e,                 // <T>
        (byte)0xe1, (byte)0xae, (byte)0x9f,                 // ᮟ
        (byte)0x61,                                         // a
        (byte)0x3c, (byte)0x2f, (byte)0x54, (byte)0x3e      // </T>
    };
    runTestCase("3 and 1 code units", test_3_1codeunit);

    final byte[] test_4_1codeunit = {
        (byte)0x3c, (byte)0x54, (byte)0x3e,                 // <T>
        (byte)0xf0, (byte)0x9f, (byte)0x9a, (byte)0xa9,     // 🚩
        (byte)0x61,                                         // a
        (byte)0x3c, (byte)0x2f, (byte)0x54, (byte)0x3e      // </T>
    };
    runTestCase("4 and 1 code units", test_4_1codeunit);

    final byte[] test_3_2codeunit = {
        (byte)0x3c, (byte)0x54, (byte)0x3e,                 // <T>
        (byte)0xe1, (byte)0xae, (byte)0x9f,                 // ᮟ
        (byte)0xc2, (byte)0xa9,                             // ©
        (byte)0x3c, (byte)0x2f, (byte)0x54, (byte)0x3e      // </T>
    };
    runTestCase("3 and 2 code units", test_3_2codeunit);

    final byte[] test_4_2codeunit = {
        (byte)0x3c, (byte)0x54, (byte)0x3e,                 // <T>
        (byte)0xf0, (byte)0x9f, (byte)0x9a, (byte)0xa9,     // 🚩
        (byte)0xc2, (byte)0xa9,                             // ©
        (byte)0x3c, (byte)0x2f, (byte)0x54, (byte)0x3e      // </T>
    };
    runTestCase("4 and 2 code units", test_4_2codeunit);

    final byte[] test_4_3codeunit = {
        (byte)0x3c, (byte)0x54, (byte)0x3e,                 // <T>
        (byte)0xf0, (byte)0x9f, (byte)0x9a, (byte)0xa9,     // 🚩
        (byte)0xe1, (byte)0xae, (byte)0x9f,                 // ᮟ
        (byte)0x3c, (byte)0x2f, (byte)0x54, (byte)0x3e      // </T>
    };
    runTestCase("4 and 3 code units", test_4_3codeunit);

  }

  /**
   * Parses {@code inputData} as an XML document, serializes the resulting DOM back to bytes
   * (UTF-8 output encoding, XML declaration suppressed) and prints the input, the output and
   * whether both byte sequences are equal.
   *
   * @param name      label printed to identify the test case
   * @param inputData raw bytes of the XML document to round-trip
   * @throws IllegalStateException if no DOM implementation supporting the "LS" feature is found
   * @throws Exception             if obtaining the registry, parsing, or serializing fails
   */
  private static void runTestCase(String name, final byte[] inputData) throws Exception {

    System.out.println("Running test case '" + name + "'...");

    ByteArrayInputStream in = new ByteArrayInputStream(inputData);
    ByteArrayOutputStream out = new ByteArrayOutputStream();

    DOMImplementationLS impl = (DOMImplementationLS) DOMImplementationRegistry.newInstance().getDOMImplementation("LS");
    if (impl == null) {
      // getDOMImplementation returns null when nothing matches; fail with a clear message
      // instead of a NullPointerException on the first use below.
      throw new IllegalStateException("No DOM implementation supporting the \"LS\" feature is available");
    }

    LSInput input = impl.createLSInput();
    input.setByteStream(in);

    LSOutput output = impl.createLSOutput();
    output.setEncoding("UTF-8");
    output.setByteStream(out);

    LSParser parser = impl.createLSParser(DOMImplementationLS.MODE_SYNCHRONOUS, null);

    LSSerializer serializer = impl.createLSSerializer();
    // Suppress the XML declaration so the output can be compared byte-for-byte with the input.
    serializer.getDomConfig().setParameter("xml-declaration", false);

    Document doc = parser.parse(input);
    serializer.write(doc, output);

    final String inputString = new String(inputData, StandardCharsets.UTF_8);
    final byte[] resultData = out.toByteArray();
    final String resultString = new String(resultData, StandardCharsets.UTF_8);

    System.out.println("Results:");
    System.out.println("Input:  " + inputString + " (" + getReadableByteArray(inputData) + ")");
    System.out.println("Output: " + resultString + " (" + getReadableByteArray(resultData) + ")");
    System.out.println("Data " + ((Arrays.equals(inputData, resultData)) ? "is" : "is not") + " equal!");
    System.out.println("***************************************************************************");
  }

  /**
   * Formats a byte array as lowercase hexadecimal, two digits per byte, with a space appended
   * after every second byte (so an even-length array yields a trailing space).
   *
   * @param arr bytes to format
   * @return the hexadecimal representation, e.g. {@code "3c54 3e"} for {@code {0x3c, 0x54, 0x3e}}
   */
  private static String getReadableByteArray(byte[] arr) {
    final StringBuilder builder = new StringBuilder(arr.length * 2 + arr.length / 2);
    for (int i = 0; i < arr.length; ++i) {
      // Mask to an unsigned value and emit both nibbles explicitly. Taking the last two
      // characters of Integer.toHexString(byte) fails for 0x00..0x0f, which yield only one digit.
      final int value = arr[i] & 0xff;
      builder.append(Character.forDigit(value >>> 4, 16));
      builder.append(Character.forDigit(value & 0x0f, 16));
      if (i % 2 != 0) {
        builder.append(" ");
      }
    }
    return builder.toString();
  }
}
