Uploaded image for project: 'JDK'
  1. JDK
  2. JDK-4251997

UTF-8 Surrogate Decoding is Broken

XMLWordPrintable

    • Icon: Bug Bug
    • Resolution: Fixed
    • Icon: P4 P4
    • 1.4.0
    • 1.2.0, 1.2.1
    • core-libs



      Name: rlT66838 Date: 07/06/99


      1 - Compile and run the included program.
      2 - program is included
      3 - JDK 1.1 and 1.2 report surrogates as EOF ... wrong!!
          JDK 1.1 moreover doesn't accept the standard "UTF-8" name.
      4 - trace information is irrelevant
      5 - Classic VM (build JDK-1.2.1-A, native threads)
          ... and on many other VMs
      6 - no other data should be relevant ... except that I've known
          about surrogate handling bugs in the JDK for some time, and
          only recently had a reason to try to come up with a test
          case that'd demonstrate them.

          The "test4" encoded pairs are mostly, if not completely,
          correct; they came from James Clark's XML test suite. If
          there's an error there I'd assume it's in the decodings I
          assigned to them ... but having eyeballed this carefully
          and double checked using some other folks' work, I think
          that is also correct.



      REPRODUCE USING:
      ----------------
      /**
       * This program is subject to the terms of the
       * GNU Library General Public License (LGPL) version 2.0
       *
       * It may freely be copied and modified, so long as the original
       * licencing terms are not removed.
       */
      import java.io.*;


      /**
       * Runs some UTF-8 test data through the java.io character conversion
       * support for the UTF-8 encoding, then gives that character conversion
       * support an overall pass or fail rating. It also sanity checks whether
       * the decodings are lenient (accepting some erroneous encodings).
       *
       * <P> Some of the test cases here are taken from standard XML test suites;
       * UTF-8 is one of the two encodings XML processors must support, so this
       * encoding support should be very correct to support next generation
       * web (and internet) applications with maximal interoperability. (Also, it
       * should be fast -- the JDK 1.1 and 1.2 sun.io converters are slow.)
       *
       * <P> Note that JDK 1.1 and JDK 1.2 don't currently pass these tests;
       * there are known problems in UTF-8 surrogate support at this time.
       *
       * @author David Brownell (###@###.###)
       * @version July 1, 1999
       */
      public class utf8
      {
          //
          // "UTF-8" is the only IANA registered encoding name. However,
          // JDK 1.1.x and some other JVM implementations only accept a
          // Java-proprietary encoding name ("UTF8") ... for compatibility,
          // that name should also be tested, but for correctness it's
          // an error if that's the only name that's supported.
          //
          // main() rewrites this to "UTF8" when the standard name is rejected.
          //
          private static String encodingName = "UTF-8";

          //
          // Positive tests -- test both output and input processing against
          // various "known good" data
          //
          // encoded: the UTF-8 byte sequence expected from writing "decoded"
          // decoded: the chars expected from reading "encoded"
          // label:   prefix for every diagnostic printed to System.err
          //
          // Returns true only when the written bytes match "encoded" exactly
          // and reading "encoded" yields exactly "decoded" followed by EOF.
          // Any exception is reported on System.err and counts as a failure.
          //
          private static boolean positive (
              byte encoded [],
              char decoded [],
              String label
          ) {
              boolean retval = true;
              // declared outside the try so the catch clause can report how
              // far the current comparison loop got
              int i = 0;

              try {
                  //
                  // Ensure that writing encodes correctly
                  //
                  ByteArrayOutputStream out;
                  OutputStreamWriter writer;
                  byte result [];

                  out = new ByteArrayOutputStream ();
                  writer = new OutputStreamWriter (out, encodingName);
                  writer.write (decoded);
                  writer.close ();
                  result = out.toByteArray ();

                  if (result.length != encoded.length) {
                      System.err.println (label + ": write length wrong, "
                          + result.length
                          + " (should be " + encoded.length + ")");
                      retval = false;
                  }
                  // still compare the common prefix after a length mismatch,
                  // so the first differing byte gets reported too
                  for (i = 0; i < encoded.length && i < result.length; i++) {
                      if (encoded [i] != result [i]) {
                          System.err.println (label + ": result [" + i + "] = 0x"
                              + Integer.toHexString (0x0ff & result [i])
                              + ", should be 0x"
                              + Integer.toHexString (0x0ff & encoded [i]));
                          retval = false;
                      }
                  }

                  //
                  // Ensure that reading decodes correctly
                  //
                  ByteArrayInputStream in;
                  InputStreamReader reader;

                  in = new ByteArrayInputStream (encoded);
                  reader = new InputStreamReader (in, encodingName);

                  try {
                      for (i = 0; i < decoded.length; i++) {
                          int c = reader.read ();

                          if (c != decoded [i]) {
                              System.err.print (label + ": read failed, char " + i);
                              System.err.print (" ... expected 0x"
                                  + Integer.toHexString (decoded [i]));
                              if (c == -1) {
                                  System.err.println (", got EOF");
                              } else {
                                  System.err.println (", got 0x"
                                      + Integer.toHexString (c));
                              }
                              retval = false;
                              // nothing more can be read after EOF
                              if (c == -1) {
                                  return retval;
                              }
                          }
                      }

                      // the input must be fully consumed by the expected chars
                      if (reader.read () != -1) {
                          System.err.println (label + ": read failed, no EOF");
                          return false;
                      }
                  } finally {
                      reader.close ();
                  }

              } catch (Exception e) {
                  System.err.println (label + ": failed "
                      + "(i = " + i + "), "
                      + e.getClass ().getName ()
                      + ", " + e.getMessage ());
                  return false;
              }
              return retval;
          }


          //
          // Negative tests -- only for input processing, make sure that
          // invalid or corrupt characters are rejected.
          //
          // encoded: a byte sequence the decoder is expected to reject
          // label:   prefix for every diagnostic printed to System.err
          //
          // Returns true only when the first read throws
          // CharConversionException. Returning a character, reporting EOF,
          // or throwing anything else is reported on System.err and
          // counts as a failure.
          //
          private static boolean negative (byte encoded [], String label)
          {
              try {
                  ByteArrayInputStream in;
                  InputStreamReader reader;
                  int c;

                  in = new ByteArrayInputStream (encoded);
                  reader = new InputStreamReader (in, encodingName);

                  try {
                      c = reader.read ();
                  } finally {
                      reader.close ();
                  }

                  // getting here at all means the bad input was not rejected
                  System.err.print (label + ": read failed, ");

                  if (c == -1) {
                      System.err.println ("reported EOF");
                  } else {
                      System.err.println ("returned char 0x"
                          + Integer.toHexString (c)
                          + ", expected exception");
                  }
                  return false;

              } catch (CharConversionException e) {
                  return true;

              } catch (Throwable t) {
                  System.err.println (label + ": failed, threw "
                      + t.getClass ().getName ()
                      + ", " + t.getMessage ());
              }
              return false;
          }


          //
          // TEST #0: Examples from RFC 2279
          // This is a positive test.
          //
          private static byte test0_bytes [] = {
              // A<NOT IDENTICAL TO><ALPHA>.
              (byte)0x41,
              (byte)0xE2, (byte)0x89, (byte)0xA2,
              (byte)0xCE, (byte)0x91,
              (byte)0x2E,
              // Korean word "hangugo"
              (byte)0xED, (byte)0x95, (byte)0x9C,
              (byte)0xEA, (byte)0xB5, (byte)0xAD,
              (byte)0xEC, (byte)0x96, (byte)0xB4,
              // Japanese word "nihongo"
              (byte)0xE6, (byte)0x97, (byte)0xA5,
              (byte)0xE6, (byte)0x9C, (byte)0xAC,
              (byte)0xE8, (byte)0xAA, (byte)0x9E
          };
          private static char test0_chars [] = {
              // A<NOT IDENTICAL TO><ALPHA>.
              0x0041, 0x2262, 0x0391, 0x002e,
              // Korean word "hangugo"
              0xD55C, 0xAD6D, 0xC5B4,
              // Japanese word "nihongo"
              0x65E5, 0x672C, 0x8A9E
          };


          //
          // From RFC 2279, the ranges which define the values we focus some
          // "organized" testing on -- test each boundary, and a little on each
          // side of the boundary.
          //
          // Note that some encodings are errors: the shortest encoding must be
          // used. On the "be lenient in what you accept" principle, those are
          // not tested as input cases; on the "be strict in what you send"
          // principle, they are tested as output cases instead.
          //
          // UCS-4 range (hex.) UTF-8 octet sequence (binary)
          // 0000 0000-0000 007F 0xxxxxxx
          // 0000 0080-0000 07FF 110xxxxx 10xxxxxx
          // 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
          //
          // 0001 0000-001F FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
          // 0020 0000-03FF FFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
          // 0400 0000-7FFF FFFF 1111110x 10xxxxxx ... 10xxxxxx
          //

          //
          // TEST #1: One byte encoded values. Works just like ASCII; these
          // values were chosen for boundary testing. This is a positive test.
          //
          // 0000 0000-0000 007F 0xxxxxxx
          //
          private static byte test1_bytes [] = {
              (byte) 0x00, (byte) 0x01, (byte) 0x7e, (byte) 0x7f
          };
          private static char test1_chars [] = {
              0x0000, 0x0001, 0x007e, 0x007f
          };


          //
          // TEST #2: Two byte encoded values, chosen for boundary testing.
          // This is a positive test.
          //
          // 0000 0080-0000 07FF 110xxxxx 10xxxxxx
          //
          // Encodings CX bb, with X = 0 or 1 and 'b' values irrelevant,
          // should have used a shorter encoding.
          //
          private static byte test2_bytes [] = {
              (byte) 0xc2, (byte) 0x80,
              (byte) 0xc2, (byte) 0x81,
              (byte) 0xc3, (byte) 0xa0,
              (byte) 0xdf, (byte) 0xbe,
              (byte) 0xdf, (byte) 0xbf
          };
          private static char test2_chars [] = {
              0x0080,
              0x0081,
              0x00E0,
              0x07FE,
              0x07FF
          };


          //
          // TEST #3: Three byte encoded values, chosen for boundary testing.
          // This is a positive test.
          //
          // 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
          //
          // Encodings E0 Xb bb, with X = 8 or 9 and 'b' values irrelevant,
          // should have used a shorter encoding.
          //
          private static byte test3_bytes [] = {
              (byte) 0xe0, (byte) 0xa0, (byte) 0x80,
              (byte) 0xe0, (byte) 0xa0, (byte) 0x81,
              (byte) 0xe1, (byte) 0x80, (byte) 0x80,
              (byte) 0xe8, (byte) 0x80, (byte) 0x80,
              (byte) 0xef, (byte) 0xbf, (byte) 0xbe,
              (byte) 0xef, (byte) 0xbf, (byte) 0xbf
          };
          private static char test3_chars [] = {
              0x0800,
              0x0801,
              0x1000,
              0x8000,
              0xFFFE,
              0xFFFF
          };


          //
          // TEST #4: Four byte encoded values, needing surrogate pairs,
          // chosen for boundary testing. This is a positive test.
          //
          // NOTE: some four byte encodings exceed the range of Unicode
          // with surrogate pairs (UTF-16); those MUST be negatively tested.
          //
          // 0001 0000-001F FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
          //
          // Encodings F0 8b bb bb, where again the 'b' values are irrelevant,
          // should have used a shorter encoding.
          //
          private static byte test4_bytes [] = {
              (byte) 0xf0, (byte) 0x90, (byte) 0x80, (byte) 0x80,
              (byte) 0xf0, (byte) 0x90, (byte) 0x80, (byte) 0x81,
              (byte) 0xf0, (byte) 0x90, (byte) 0x88, (byte) 0x80,
              (byte) 0xf0, (byte) 0x90, (byte) 0x90, (byte) 0x80,
              (byte) 0xf0, (byte) 0x90, (byte) 0x8f, (byte) 0xbf,
              (byte) 0xf1, (byte) 0x90, (byte) 0x8f, (byte) 0xbf,
              (byte) 0xf2, (byte) 0x90, (byte) 0x8f, (byte) 0xbf,
              (byte) 0xf4, (byte) 0x8f, (byte) 0xbf, (byte) 0xbf
          };
          private static char test4_chars [] = {
              0xD800, 0xDC00,
              0xD800, 0xDC01,
              0xD800, 0xDE00,
              0xD801, 0xDC00,
              0xD800, 0xDFFF,
              0xD900, 0xDFFF,
              0xDA00, 0xDFFF,
              0xDBFF, 0xDFFF,
          };


          //
          // NEGATIVE TESTS:
          //
          // four byte encodings that are out of range for UTF-16
          // as the result can't be encoded with surrogate pairs
          //
          // 0x1CFFFF
          private static byte test5_bytes []
              = { (byte) 0xf7, (byte) 0x8f, (byte) 0xbf, (byte) 0xbf };
          // 0x110000, the first value past the surrogate pair range (this
          // case used to repeat test5_bytes byte for byte, adding no coverage)
          private static byte test6_bytes []
              = { (byte) 0xf4, (byte) 0x90, (byte) 0x80, (byte) 0x80 };
          // 0x1C0000
          private static byte test13_bytes []
              = { (byte) 0xf7, (byte) 0x80, (byte) 0x80, (byte) 0x80 };

          // five and six byte encodings (leniency discouraged)
          private static byte test7_bytes []
              = { (byte) 0xf8, (byte) 0x80, (byte) 0x80,
                  (byte) 0x80, (byte) 0x80 };
          private static byte test8_bytes []
              = { (byte) 0xf8, (byte) 0xbf, (byte) 0x80,
                  (byte) 0x80, (byte) 0x80 };
          private static byte test9_bytes []
              = { (byte) 0xfc, (byte) 0x80, (byte) 0x80,
                  (byte) 0x80, (byte) 0x80, (byte) 0x80 };
          private static byte test10_bytes []
              = { (byte) 0xfc, (byte) 0x80, (byte) 0x80,
                  (byte) 0x80, (byte) 0x80, (byte) 0x81 };

          // orphan "extension" bytes (e.g. some ISO-8859-1 characters)
          private static byte test11_bytes []
              = { (byte) 0x80 };
          private static byte test12_bytes []
              = { (byte) 0xa9 };


          //
          // Just for information -- see if these cases are accepted; they're
          // all errors ("too short" encodings), but ones which generally
          // ought to be accepted for leniency (though see RFC 2279).
          //
          // three encodings of ASCII NUL
          private static byte bad0_bytes []
              = { (byte) 0xc0, (byte) 0x80 };
          private static byte bad1_bytes []
              = { (byte) 0xe0, (byte) 0x80, (byte) 0x80 };
          private static byte bad2_bytes []
              = { (byte) 0xf0, (byte) 0x80, (byte) 0x80, (byte) 0x80 };

          // ... and other values
          private static byte bad3_bytes []
              = { (byte) 0xc1, (byte) 0x80 };
          private static byte bad4_bytes []
              = { (byte) 0xe0, (byte) 0x81, (byte) 0x80 };
          private static byte bad5_bytes []
              = { (byte) 0xe0, (byte) 0x90, (byte) 0x80 };


          /**
           * Main program to give a pass or fail rating to a JVM's UTF-8 support.
           * No arguments needed.
           *
           * <P> Prints a PASS or FAIL line for the correctness tests, then an
           * informational line saying whether the decoder is strict or lenient
           * about overlong encodings, and finally exits with status 0 on PASS
           * and 1 on FAIL. The leniency result does not affect the exit status.
           *
           * @param argv ignored
           */
          public static void main (String argv [])
          {
              boolean pass = true;

              System.out.println ("");
              System.out.println ("------ checking UTF-8 correctness ...");

              // Probe for the standard encoding name; fall back to the
              // proprietary one (and fail overall) if it is not accepted.
              try {
                  new InputStreamReader (System.in, "UTF-8");
              } catch (Exception e) {
                  encodingName = "UTF8";
                  System.out.println ("... requires nonstandard encoding name "
                      + encodingName);
                  pass = false;
              }

              //
              // Positive tests -- good data is dealt with correctly
              //
              // (non-short-circuit "&=" so every test runs and reports)
              //
              pass &= positive (test0_bytes, test0_chars, "RFC 2279 Examples");
              pass &= positive (test1_bytes, test1_chars, "One Byte Characters");
              pass &= positive (test2_bytes, test2_chars, "Two Byte Characters");
              pass &= positive (test3_bytes, test3_chars, "Three Byte Characters");
              pass &= positive (test4_bytes, test4_chars, "Surrogate Pairs");

              //
              // Negative tests -- "bad" data is dealt with correctly ... in
              // this case, "bad" is just out-of-range for Unicode systems,
              // rather than values encoded contrary to spec (such as NUL
              // being encoded as '0xc0 0x80', not '0x00').
              //
              pass &= negative (test5_bytes, "Four byte range error (0)");
              pass &= negative (test6_bytes, "Four byte range error (1)");
              pass &= negative (test13_bytes, "Four byte range error (2)");

              pass &= negative (test7_bytes, "Five byte error (0)");
              pass &= negative (test8_bytes, "Five byte error (1)");
              pass &= negative (test9_bytes, "Six byte error (0)");
              pass &= negative (test10_bytes, "Six byte error (1)");

              pass &= negative (test11_bytes, "Orphan continuation (1)");
              pass &= negative (test12_bytes, "Orphan continuation (2)");

              //
              // PASS/FAIL status is what the whole thing is about.
              //
              if (pass) {
                  System.out.println ("PASS -- UTF-8 support works right!");
              } else {
                  System.out.println ("FAIL -- incorrect UTF-8 support.");
              }

              //
              // Just for information (most are lenient)
              //
              boolean strict;

              System.out.println ("");
              System.out.println ("------ checking decoder leniency ...");

              strict = negative (bad0_bytes, "Fat zero (0)");
              strict &= negative (bad1_bytes, "Fat zero (1)");
              strict &= negative (bad2_bytes, "Fat zero (2)");
              strict &= negative (bad3_bytes, "Fat '@' (0)");
              strict &= negative (bad4_bytes, "Fat '@' (1)");
              strict &= negative (bad5_bytes, "Fat 0x0400");

              if (strict) {
                  System.out.println ("... decoder is strict.");
              } else {
                  System.out.println ("... decoder is lenient.");
              }

              System.exit (pass ? 0 : 1);
          }
      }
      (Review ID: 85136)
      ======================================================================

            ilittlesunw Ian Little (Inactive)
            rlewis Roger Lewis (Inactive)
            Votes:
            0 Vote for this issue
            Watchers:
            0 Start watching this issue

              Created:
              Updated:
              Resolved:
              Imported:
              Indexed: