Uploaded image for project: 'JDK'
  1. JDK
  2. JDK-4406815

[Col] java.text.Collator.compare() gets wrong answer w/French & certain strings

XMLWordPrintable

    • Icon: Bug Bug
    • Resolution: Unresolved
    • Icon: P4 P4
    • tbd
    • 1.3.0, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
    • core-libs
    • x86
    • generic, windows_nt

      Name: boT120536 Date: 01/22/2001


      19 Dec 2000, eval1127@eng -- see:
      http://search.java.sun.com/query.html?qt=%2Bcollator++%2Bcompare+%2B%22in+progress%22&col=obug&qp=&qs=&qc=&pw=100%25&ws=0&qm=0&st=1&nh=10&lk=1&rf=0&rq=0
      ----------------
      java version "1.3.0"
      Java(TM) 2 Runtime Environment, Standard Edition (build 1.3.0-C)
      Java HotSpot(TM) Client VM (build 1.3.0-C, mixed mode)

      The compare() method in java.text.Collator with the French locale and TERTIARY
      strength returns the wrong answer for certain pairs of strings. Using
      Collator.getCollationKey() and comparing the resulting CollationKeys using
      CollationKey.compareTo() will get the correct answer.

      Source code to reproduce the problem for a certain set of strings is included
      below. Basically, the program runs through a number of pairs of strings and
      checks that Collator.compare() and Collator.getCollationKey().compareTo()
      return values with the same sign.

      Here are the first few mismatches it finds. The strings are all variations
      of "aa" where the 'a' characters have various accents & decorations and are
      upper and lower case.

      [] java TestCollator
      Testing French, TERTIARY
       Mismatch! "\u0061\u00e1" < "\u00e1\u0041" w/compare(), > w/collation keys
       Mismatch! "\u0061\u00c1" < "\u00e1\u0061" w/compare(), > w/collation keys
       Mismatch! "\u0061\u00e0" < "\u00e1\u00c1" w/compare(), > w/collation keys
       Mismatch! "\u0061\u00c0" < "\u00e1\u00e1" w/compare(), > w/collation keys
       Mismatch! "\u0061\u0103" < "\u00e1\u00c0" w/compare(), > w/collation keys
       Mismatch! "\u0061\u0102" < "\u00e1\u00e0" w/compare(), > w/collation keys
       Mismatch! "\u0061\u1eaf" < "\u00e1\u0102" w/compare(), > w/collation keys
       Mismatch! "\u0061\u1eae" < "\u00e1\u0103" w/compare(), > w/collation keys
       Mismatch! "\u0061\u1eb1" < "\u00e1\u1eae" w/compare(), > w/collation keys
       Mismatch! "\u0061\u1eb0" < "\u00e1\u1eaf" w/compare(), > w/collation keys
       Mismatch! "\u0061\u1eb5" < "\u00e1\u1eb0" w/compare(), > w/collation keys
       Mismatch! "\u0061\u1eb4" < "\u00e1\u1eb1" w/compare(), > w/collation keys
       Mismatch! "\u0061\u1eb7" < "\u00e1\u1eb4" w/compare(), > w/collation keys


      import java.text.*;
      import java.util.*;

      /**
       * Reproducer that checks whether Collator.compare() and a comparison of the
       * corresponding CollationKeys agree on the ordering of pairs of strings.
       *
       * Every two-character string that can be formed from the characters listed in
       * g_chars is compared against every other one using both methods; each pair
       * for which the two results differ in sign is printed as a mismatch.
       */
      public class TestCollator {
          /**
           * Source characters for the test strings. Each array element groups the
           * variants of one letter; every individual character of every element is
           * used as the first and as the second character of a test string.
           */
          public static String[] g_chars = new String[] {
              // these are variations of the letter A (w/various accents, upper & lower case)
              "aA", "\u00e1\u00c1", "\u00e0\u00c0", "\u0103\u0102",
              "\u1eaf\u1eae", "\u1eb1\u1eb0", "\u1eb5\u1eb4", "\u1eb7\u1eb6",
              "\u1eb3\u1eb2", "\u00e2\u00c2", "\u1ea5\u1ea4", "\u1ea7\u1ea6",
              "\u1eab\u1eaa", "\u1ead\u1eac", "\u1ea9\u1ea8", "\u00e5\u00c5\u212b", // 212b and c5 are the same char
              "\u01fb\u01fa", "\u00e4\u00c4", "\u00e3\u00c3", "\u0101\u0100",
              "\u0105\u0104", "\u1ea1\u1ea0", "\u1ea3\u1ea2",
              "\u00e6\u00c6", // the combined AE character (upper & lower case)
              "bB"
          };

          /**
           * Runs the consistency check for French/TERTIARY, English/TERTIARY and
           * French/SECONDARY, printing a heading before each run.
           *
           * @param args ignored
           */
          public static void main(String[] args) {
              System.out.println("Testing French, TERTIARY");
              testCompare(new Locale("fr","FR"), Collator.TERTIARY);

              System.out.println("Testing English, TERTIARY");
              testCompare(new Locale("en","US"), Collator.TERTIARY);

              System.out.println("Testing French, SECONDARY");
              testCompare(new Locale("fr","FR"), Collator.SECONDARY);
          }

          /**
           * Compares every pair of test strings with both Collator.compare() and
           * CollationKey.compareTo() and prints one line to System.out for each pair
           * whose results differ in sign. Once a string has taken part in a reported
           * mismatch it is not reported again.
           *
           * @param locale           locale used to obtain the Collator instance
           * @param collatorStrength strength passed to Collator.setStrength()
           */
          public static void testCompare(Locale locale, int collatorStrength) {
              Collator collator = Collator.getInstance(locale);
              collator.setStrength(collatorStrength);

              // build the list of 2-letter strings to test with
              // (2601 of them for the default contents of g_chars)
              List list = new ArrayList();
              for (int i=0; i < g_chars.length; i++) {
                  for (int j=0; j < g_chars[i].length(); j++) {
                      for (int k=0; k < g_chars.length; k++) {
                          for (int m=0; m < g_chars[k].length(); m++) {
                              list.add(""+g_chars[i].charAt(j)+g_chars[k].charAt(m));
                          }
                      }
                  }
              }

              // A string's collation key does not depend on what it is compared with,
              // so build each key once here instead of twice per pair in the loop below.
              CollationKey[] keys = new CollationKey[list.size()];
              for (int i=0; i < keys.length; i++) {
                  keys[i] = collator.getCollationKey((String)list.get(i));
              }

              // go through the strings and compare them using two methods:
              // Collator.compare() and Collator.getCollationKey().compareTo(), and
              // make sure the two methods reach the same answer.
              HashSet mismatchFound = new HashSet();
              for (int i=0; i < list.size(); i++) {
                  String a = (String)list.get(i);
                  if (mismatchFound.contains(a)) {
                      continue; // we already found a mismatch w/this string, skip the rest
                  }
                  for (int j=i; j < list.size(); j++) {
                      String b = (String)list.get(j);
                      if (mismatchFound.contains(b)) {
                          continue; // we already found a mismatch w/this string, skip the rest
                      }

                      // THIS IS THE IMPORTANT PART.
                      int compare = collator.compare(a, b);
                      int collationKey = keys[i].compareTo(keys[j]);

                      if (sign(compare) != sign(collationKey)) {
                          System.out.println(" Mismatch! " + toUnicodeString(a) + " "+ op(compare) + " " +
                              toUnicodeString(b) + " w/compare(), " + op(collationKey) + " w/collation keys");
                          mismatchFound.add(a); // don't show any more mismatches involving these
                          mismatchFound.add(b); // strings a & b since there are probably a lot
                          break;
                      }
                  }
              }
          }

          /** Returns -1, 0 or 1 according to whether z is negative, zero or positive. */
          private static int sign(int z) {
              return z < 0 ? -1 : z > 0 ? 1 : 0;
          }

          /** Returns the relational operator ("<", ">" or "==") that describes a comparison result z. */
          private static String op(int z) {
              return z < 0 ? "<" : z > 0 ? ">" : "==";
          }

          /** Pretty print a string using \\uXXXX unicode notation */
          private static String toUnicodeString(String s) {
              StringBuffer b = new StringBuffer();
              b.append("\"");
              for (int i=0; i < s.length(); i++) {
                  b.append("\\u");
                  // adding 0x10000 and dropping the leading digit zero-pads the value to 4 hex digits
                  b.append(Integer.toHexString(s.charAt(i)+0x10000).substring(1));
              }
              b.append("\"");
              return b.toString();
          }
      }
      (Review ID: 114009)
      ======================================================================
      ###@###.### 11/2/04 18:30 GMT

            naoto Naoto Sato
            bonealsunw Bret O'neal (Inactive)
            Votes:
            0 Vote for this issue
            Watchers:
            1 Start watching this issue

              Created:
              Updated:
              Imported:
              Indexed: