Loading...

Type: Bug
Resolution: Unresolved
Priority: P4
Fix Version/s: tbd
Affects Version/s: 1.3.0, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
Component/s: core-libs
Labels:
- 7-wnf
- 8-wnf
- i18n
- sqebug
- webbug

Subcomponent:
java.text
CPU:

x86
OS:

generic, windows_nt

Name: boT120536 Date: 01/22/2001

19 Dec 2000, eval1127@eng -- see:
http://search.java.sun.com/query.html?qt=%2Bcollator++%2Bcompare+%2B%22in+progress%22&col=obug&qp=&qs=&qc=&pw=100%25&ws=0&qm=0&st=1&nh=10&lk=1&rf=0&rq=0
----------------
java version "1.3.0"
Java(TM) 2 Runtime Environment, Standard Edition (build 1.3.0-C)
Java HotSpot(TM) Client VM (build 1.3.0-C, mixed mode)

The compare() method in java.text.Collator with the French locale and TERTIARY
strength returns the wrong answer for certain pairs of strings. Using
Collator.getCollationKey() and comparing the resulting CollationKeys using
CollationKey.compareTo() will get the correct answer.

Source code to reproduce the problem for a certain set of strings is included
below. Basically, the program runs through a number of pairs of strings and
checks that Collator.compare() and Collator.getCollationKey().compareTo()
return values with the same sign.

Here are the first few mismatches it finds. The strings are all variations
of "aa" where the 'a' characters have various accents & decorations and are
upper and lower case.

[] java TestCollator
Testing French, TERTIARY
Mismatch! "\u0061\u00e1" < "\u00e1\u0041" w/compare(), > w/collation keys
Mismatch! "\u0061\u00c1" < "\u00e1\u0061" w/compare(), > w/collation keys
Mismatch! "\u0061\u00e0" < "\u00e1\u00c1" w/compare(), > w/collation keys
Mismatch! "\u0061\u00c0" < "\u00e1\u00e1" w/compare(), > w/collation keys
Mismatch! "\u0061\u0103" < "\u00e1\u00c0" w/compare(), > w/collation keys
Mismatch! "\u0061\u0102" < "\u00e1\u00e0" w/compare(), > w/collation keys
Mismatch! "\u0061\u1eaf" < "\u00e1\u0102" w/compare(), > w/collation keys
Mismatch! "\u0061\u1eae" < "\u00e1\u0103" w/compare(), > w/collation keys
Mismatch! "\u0061\u1eb1" < "\u00e1\u1eae" w/compare(), > w/collation keys
Mismatch! "\u0061\u1eb0" < "\u00e1\u1eaf" w/compare(), > w/collation keys
Mismatch! "\u0061\u1eb5" < "\u00e1\u1eb0" w/compare(), > w/collation keys
Mismatch! "\u0061\u1eb4" < "\u00e1\u1eb1" w/compare(), > w/collation keys
Mismatch! "\u0061\u1eb7" < "\u00e1\u1eb4" w/compare(), > w/collation keys

import java.text.*;
import java.util.*;

public class TestCollator {
    public static String[] g_chars = new String[] {
        // these are variations of the letter A (w/various accents, upper & lower case)
        "aA", "\u00e1\u00c1", "\u00e0\u00c0", "\u0103\u0102",
        "\u1eaf\u1eae", "\u1eb1\u1eb0", "\u1eb5\u1eb4", "\u1eb7\u1eb6",
        "\u1eb3\u1eb2", "\u00e2\u00c2", "\u1ea5\u1ea4", "\u1ea7\u1ea6",
        "\u1eab\u1eaa", "\u1ead\u1eac", "\u1ea9\u1ea8", "\u00e5\u00c5\u212b", // 212b and c5 are the same char
        "\u01fb\u01fa", "\u00e4\u00c4", "\u00e3\u00c3", "\u0101\u0100",
        "\u0105\u0104", "\u1ea1\u1ea0", "\u1ea3\u1ea2",
        "\u00e6\u00c6", // the combined AE character (upper & lower case)
        "bB"
    };
    public static void main(String[] args) {
        System.out.println("Testing French, TERTIARY");
        testCompare(new Locale("fr","FR"), Collator.TERTIARY);

        System.out.println("Testing English, TERTIARY");
        testCompare(new Locale("en","US"), Collator.TERTIARY);

        System.out.println("Testing French, SECONDARY");
        testCompare(new Locale("fr","FR"), Collator.SECONDARY);
    }
    public static void testCompare(Locale locale, int collatorStrength) {
        Collator collator = Collator.getInstance(locale);
        collator.setStrength(collatorStrength);

        // build a list of about 2600 2-letter strings to test with
        List list = new ArrayList();
        for (int i=0; i < g_chars.length; i++) {
            for (int j=0; j < g_chars[i].length(); j++) {
                for (int k=0; k < g_chars.length; k++) {
                    for (int m=0; m < g_chars[k].length(); m++)
                        list.add(""+g_chars[i].charAt(j)+g_chars[k].charAt(m));
                }
            }
        }

        // go through the 2601 strings and compare them using two methods:
        // Collator.compare() and Collator.getCollationKey.compareTo(), and
        // make sure the two methods reach the same answer.
        HashSet mismatchFound = new HashSet();
        for (int i=0; i < list.size(); i++) {
            String a = (String)list.get(i);
            if (mismatchFound.contains(a))
                continue; // we already found a mismatch w/this string, skip the rest
            for (int j=i; j < list.size(); j++) {
                String b = (String)list.get(j);
                if (mismatchFound.contains(b))
                    continue; // we already found a mismatch w/this string, skip the rest

                // THIS IS THE IMPORTANT PART.
                int compare = collator.compare(a, b);
                int collationKey = collator.getCollationKey(a).compareTo(collator.getCollationKey(b));

                if (sign(compare) != sign(collationKey)) {
                    System.out.println(" Mismatch! " + toUnicodeString(a) + " "+ op(compare) + " " +
                        toUnicodeString(b) + " w/compare(), " + op(collationKey) + " w/collation keys");
                    mismatchFound.add(a); // don't show any more mismatchesinvolving these
                    mismatchFound.add(b); // strings a & b since there are a probably a lot
                    break;
                }
            }
        }
    }
    private static int sign(int z) {
        return z < 0 ? -1 : z > 0 ? 1 : 0;
    }
    private static String op(int z) {
        return z < 0 ? "<" : z > 0 ? ">" : "==";
    }
    /** Pretty print a string using \\uXXXX unicode notation */
    private static String toUnicodeString(String s) {
        StringBuffer b = new StringBuffer();
        b.append("\"");
        for (int i=0; i < s.length(); i++) {
            b.append("\\u");
            b.append(Integer.toHexString(s.charAt(i)+0x10000).substring(1));
        }
        b.append("\"");
        return b.toString();
    }
}
(Review ID: 114009)
======================================================================
###@###.### 11/2/04 18:30 GMT

Details

Description

Attachments

Activity

People

Dates