Name: boT120536 Date: 01/22/2001
19 Dec 2000, eval1127@eng -- see:
http://search.java.sun.com/query.html?qt=%2Bcollator++%2Bcompare+%2B%22in+progress%22&col=obug&qp=&qs=&qc=&pw=100%25&ws=0&qm=0&st=1&nh=10&lk=1&rf=0&rq=0
----------------
java version "1.3.0"
Java(TM) 2 Runtime Environment, Standard Edition (build 1.3.0-C)
Java HotSpot(TM) Client VM (build 1.3.0-C, mixed mode)
The compare() method in java.text.Collator with the French locale and TERTIARY
strength returns the wrong answer for certain pairs of strings. Using
Collator.getCollationKey() and comparing the resulting CollationKeys using
CollationKey.compareTo() gives the correct answer.
Source code to reproduce the problem for a certain set of strings is included
below. Basically, the program runs through a number of pairs of strings and
checks that Collator.compare() and Collator.getCollationKey().compareTo()
return values with the same sign.
Here are the first few mismatches it finds. The strings are all variations
of "aa" where the 'a' characters have various accents & decorations and are
upper and lower case.
[] java TestCollator
Testing French, TERTIARY
Mismatch! "\u0061\u00e1" < "\u00e1\u0041" w/compare(), > w/collation keys
Mismatch! "\u0061\u00c1" < "\u00e1\u0061" w/compare(), > w/collation keys
Mismatch! "\u0061\u00e0" < "\u00e1\u00c1" w/compare(), > w/collation keys
Mismatch! "\u0061\u00c0" < "\u00e1\u00e1" w/compare(), > w/collation keys
Mismatch! "\u0061\u0103" < "\u00e1\u00c0" w/compare(), > w/collation keys
Mismatch! "\u0061\u0102" < "\u00e1\u00e0" w/compare(), > w/collation keys
Mismatch! "\u0061\u1eaf" < "\u00e1\u0102" w/compare(), > w/collation keys
Mismatch! "\u0061\u1eae" < "\u00e1\u0103" w/compare(), > w/collation keys
Mismatch! "\u0061\u1eb1" < "\u00e1\u1eae" w/compare(), > w/collation keys
Mismatch! "\u0061\u1eb0" < "\u00e1\u1eaf" w/compare(), > w/collation keys
Mismatch! "\u0061\u1eb5" < "\u00e1\u1eb0" w/compare(), > w/collation keys
Mismatch! "\u0061\u1eb4" < "\u00e1\u1eb1" w/compare(), > w/collation keys
Mismatch! "\u0061\u1eb7" < "\u00e1\u1eb4" w/compare(), > w/collation keys
import java.text.*;
import java.util.*;
/**
 * Reproduction program: checks that {@link Collator#compare(String, String)}
 * and {@link CollationKey#compareTo(CollationKey)} agree on the relative order
 * of a set of two-character strings, and prints every pair on which the two
 * methods return results of different sign.
 *
 * <p>The code deliberately sticks to pre-generics syntax (raw collections,
 * {@code StringBuffer}) so that it still compiles on the old runtime the
 * report was filed against.
 */
public class TestCollator {
    /**
     * Character groups used to build the test strings. Each entry holds the
     * lower- and upper-case form of one variation of the letter A (plus the
     * AE ligature and plain B). There are 25 entries and 51 characters in total.
     */
    public static String[] g_chars = new String[] {
        // these are variations of the letter A (w/various accents, upper & lower case)
        "aA", "\u00e1\u00c1", "\u00e0\u00c0", "\u0103\u0102",
        "\u1eaf\u1eae", "\u1eb1\u1eb0", "\u1eb5\u1eb4", "\u1eb7\u1eb6",
        "\u1eb3\u1eb2", "\u00e2\u00c2", "\u1ea5\u1ea4", "\u1ea7\u1ea6",
        "\u1eab\u1eaa", "\u1ead\u1eac", "\u1ea9\u1ea8", "\u00e5\u00c5\u212b", // 212b and c5 are the same char
        "\u01fb\u01fa", "\u00e4\u00c4", "\u00e3\u00c3", "\u0101\u0100",
        "\u0105\u0104", "\u1ea1\u1ea0", "\u1ea3\u1ea2",
        "\u00e6\u00c6", // the combined AE character (upper & lower case)
        "bB"
    };

    /**
     * Runs the comparison check for French/TERTIARY, English/TERTIARY and
     * French/SECONDARY, printing a heading before each run.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        System.out.println("Testing French, TERTIARY");
        testCompare(new Locale("fr","FR"), Collator.TERTIARY);
        System.out.println("Testing English, TERTIARY");
        testCompare(new Locale("en","US"), Collator.TERTIARY);
        System.out.println("Testing French, SECONDARY");
        testCompare(new Locale("fr","FR"), Collator.SECONDARY);
    }

    /**
     * Compares every pair of generated two-character strings with both
     * {@code Collator.compare()} and collation keys, and prints one line to
     * standard output for each pair where the signs of the two results differ.
     * Once a string has taken part in a reported mismatch it is skipped for
     * the remainder of the run, which keeps the output short.
     *
     * @param locale           locale whose collator is tested
     * @param collatorStrength strength passed to {@link Collator#setStrength(int)}
     */
    public static void testCompare(Locale locale, int collatorStrength) {
        Collator collator = Collator.getInstance(locale);
        collator.setStrength(collatorStrength);

        // Build every ordered pair of characters drawn from g_chars:
        // 51 characters give 51 * 51 = 2601 two-character strings.
        List list = new ArrayList();
        for (int i = 0; i < g_chars.length; i++) {
            for (int j = 0; j < g_chars[i].length(); j++) {
                for (int k = 0; k < g_chars.length; k++) {
                    for (int m = 0; m < g_chars[k].length(); m++) {
                        list.add("" + g_chars[i].charAt(j) + g_chars[k].charAt(m));
                    }
                }
            }
        }

        // Compute each collation key exactly once, after the strength has been
        // set. The pair loop below visits millions of pairs; rebuilding two
        // keys per pair would repeat the same work for the same strings.
        int count = list.size();
        CollationKey[] keys = new CollationKey[count];
        for (int i = 0; i < count; i++) {
            keys[i] = collator.getCollationKey((String) list.get(i));
        }

        // Go through the strings and compare them using two methods:
        // Collator.compare() and CollationKey.compareTo(), and make sure
        // the two methods reach the same answer.
        HashSet mismatchFound = new HashSet();
        for (int i = 0; i < count; i++) {
            String a = (String) list.get(i);
            if (mismatchFound.contains(a)) {
                continue; // we already found a mismatch w/this string, skip the rest
            }
            for (int j = i; j < count; j++) {
                String b = (String) list.get(j);
                if (mismatchFound.contains(b)) {
                    continue; // we already found a mismatch w/this string, skip the rest
                }
                // THIS IS THE IMPORTANT PART.
                int compare = collator.compare(a, b);
                int collationKey = keys[i].compareTo(keys[j]);
                if (sign(compare) != sign(collationKey)) {
                    System.out.println(" Mismatch! " + toUnicodeString(a) + " "+ op(compare) + " " +
                            toUnicodeString(b) + " w/compare(), " + op(collationKey) + " w/collation keys");
                    // Don't show any more mismatches involving strings a and b,
                    // since there are probably a lot of them.
                    mismatchFound.add(a);
                    mismatchFound.add(b);
                    break;
                }
            }
        }
    }

    /** Returns -1, 0 or 1 according to whether {@code z} is negative, zero or positive. */
    private static int sign(int z) {
        return z < 0 ? -1 : z > 0 ? 1 : 0;
    }

    /** Returns the relational operator ("<", ">" or "==") matching the sign of {@code z}. */
    private static String op(int z) {
        return z < 0 ? "<" : z > 0 ? ">" : "==";
    }

    /**
     * Pretty prints a string inside double quotes, writing every char as a
     * backslash, the letter u and four lower-case hex digits.
     */
    private static String toUnicodeString(String s) {
        StringBuffer b = new StringBuffer();
        b.append("\"");
        for (int i = 0; i < s.length(); i++) {
            b.append("\\u");
            // Adding 0x10000 forces a five-digit hex string; dropping its first
            // digit leaves the char value zero-padded to four digits.
            b.append(Integer.toHexString(s.charAt(i) + 0x10000).substring(1));
        }
        b.append("\"");
        return b.toString();
    }
}
(Review ID: 114009)
======================================================================
###@###.### 11/2/04 18:30 GMT
19 Dec 2000, eval1127@eng -- see:
http://search.java.sun.com/query.html?qt=%2Bcollator++%2Bcompare+%2B%22in+progress%22&col=obug&qp=&qs=&qc=&pw=100%25&ws=0&qm=0&st=1&nh=10&lk=1&rf=0&rq=0
----------------
java version "1.3.0"
Java(TM) 2 Runtime Environment, Standard Edition (build 1.3.0-C)
Java HotSpot(TM) Client VM (build 1.3.0-C, mixed mode)
The compare() method in java.text.Collator with the French locale and TERTIARY
strength returns the wrong answer for certain pairs of strings. Using
Collator.getCollationKey() and comparing the resulting CollationKeys using
CollationKey.compareTo() gives the correct answer.
Source code to reproduce the problem for a certain set of strings is included
below. Basically, the program runs through a number of pairs of strings and
checks that Collator.compare() and Collator.getCollationKey().compareTo()
return values with the same sign.
Here are the first few mismatches it finds. The strings are all variations
of "aa" where the 'a' characters have various accents & decorations and are
upper and lower case.
[] java TestCollator
Testing French, TERTIARY
Mismatch! "\u0061\u00e1" < "\u00e1\u0041" w/compare(), > w/collation keys
Mismatch! "\u0061\u00c1" < "\u00e1\u0061" w/compare(), > w/collation keys
Mismatch! "\u0061\u00e0" < "\u00e1\u00c1" w/compare(), > w/collation keys
Mismatch! "\u0061\u00c0" < "\u00e1\u00e1" w/compare(), > w/collation keys
Mismatch! "\u0061\u0103" < "\u00e1\u00c0" w/compare(), > w/collation keys
Mismatch! "\u0061\u0102" < "\u00e1\u00e0" w/compare(), > w/collation keys
Mismatch! "\u0061\u1eaf" < "\u00e1\u0102" w/compare(), > w/collation keys
Mismatch! "\u0061\u1eae" < "\u00e1\u0103" w/compare(), > w/collation keys
Mismatch! "\u0061\u1eb1" < "\u00e1\u1eae" w/compare(), > w/collation keys
Mismatch! "\u0061\u1eb0" < "\u00e1\u1eaf" w/compare(), > w/collation keys
Mismatch! "\u0061\u1eb5" < "\u00e1\u1eb0" w/compare(), > w/collation keys
Mismatch! "\u0061\u1eb4" < "\u00e1\u1eb1" w/compare(), > w/collation keys
Mismatch! "\u0061\u1eb7" < "\u00e1\u1eb4" w/compare(), > w/collation keys
import java.text.*;
import java.util.*;
/**
 * Reproduction program: checks that {@link Collator#compare(String, String)}
 * and {@link CollationKey#compareTo(CollationKey)} agree on the relative order
 * of a set of two-character strings, and prints every pair on which the two
 * methods return results of different sign.
 *
 * <p>The code deliberately sticks to pre-generics syntax (raw collections,
 * {@code StringBuffer}) so that it still compiles on the old runtime the
 * report was filed against.
 */
public class TestCollator {
    /**
     * Character groups used to build the test strings. Each entry holds the
     * lower- and upper-case form of one variation of the letter A (plus the
     * AE ligature and plain B). There are 25 entries and 51 characters in total.
     */
    public static String[] g_chars = new String[] {
        // these are variations of the letter A (w/various accents, upper & lower case)
        "aA", "\u00e1\u00c1", "\u00e0\u00c0", "\u0103\u0102",
        "\u1eaf\u1eae", "\u1eb1\u1eb0", "\u1eb5\u1eb4", "\u1eb7\u1eb6",
        "\u1eb3\u1eb2", "\u00e2\u00c2", "\u1ea5\u1ea4", "\u1ea7\u1ea6",
        "\u1eab\u1eaa", "\u1ead\u1eac", "\u1ea9\u1ea8", "\u00e5\u00c5\u212b", // 212b and c5 are the same char
        "\u01fb\u01fa", "\u00e4\u00c4", "\u00e3\u00c3", "\u0101\u0100",
        "\u0105\u0104", "\u1ea1\u1ea0", "\u1ea3\u1ea2",
        "\u00e6\u00c6", // the combined AE character (upper & lower case)
        "bB"
    };

    /**
     * Runs the comparison check for French/TERTIARY, English/TERTIARY and
     * French/SECONDARY, printing a heading before each run.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        System.out.println("Testing French, TERTIARY");
        testCompare(new Locale("fr","FR"), Collator.TERTIARY);
        System.out.println("Testing English, TERTIARY");
        testCompare(new Locale("en","US"), Collator.TERTIARY);
        System.out.println("Testing French, SECONDARY");
        testCompare(new Locale("fr","FR"), Collator.SECONDARY);
    }

    /**
     * Compares every pair of generated two-character strings with both
     * {@code Collator.compare()} and collation keys, and prints one line to
     * standard output for each pair where the signs of the two results differ.
     * Once a string has taken part in a reported mismatch it is skipped for
     * the remainder of the run, which keeps the output short.
     *
     * @param locale           locale whose collator is tested
     * @param collatorStrength strength passed to {@link Collator#setStrength(int)}
     */
    public static void testCompare(Locale locale, int collatorStrength) {
        Collator collator = Collator.getInstance(locale);
        collator.setStrength(collatorStrength);

        // Build every ordered pair of characters drawn from g_chars:
        // 51 characters give 51 * 51 = 2601 two-character strings.
        List list = new ArrayList();
        for (int i = 0; i < g_chars.length; i++) {
            for (int j = 0; j < g_chars[i].length(); j++) {
                for (int k = 0; k < g_chars.length; k++) {
                    for (int m = 0; m < g_chars[k].length(); m++) {
                        list.add("" + g_chars[i].charAt(j) + g_chars[k].charAt(m));
                    }
                }
            }
        }

        // Compute each collation key exactly once, after the strength has been
        // set. The pair loop below visits millions of pairs; rebuilding two
        // keys per pair would repeat the same work for the same strings.
        int count = list.size();
        CollationKey[] keys = new CollationKey[count];
        for (int i = 0; i < count; i++) {
            keys[i] = collator.getCollationKey((String) list.get(i));
        }

        // Go through the strings and compare them using two methods:
        // Collator.compare() and CollationKey.compareTo(), and make sure
        // the two methods reach the same answer.
        HashSet mismatchFound = new HashSet();
        for (int i = 0; i < count; i++) {
            String a = (String) list.get(i);
            if (mismatchFound.contains(a)) {
                continue; // we already found a mismatch w/this string, skip the rest
            }
            for (int j = i; j < count; j++) {
                String b = (String) list.get(j);
                if (mismatchFound.contains(b)) {
                    continue; // we already found a mismatch w/this string, skip the rest
                }
                // THIS IS THE IMPORTANT PART.
                int compare = collator.compare(a, b);
                int collationKey = keys[i].compareTo(keys[j]);
                if (sign(compare) != sign(collationKey)) {
                    System.out.println(" Mismatch! " + toUnicodeString(a) + " "+ op(compare) + " " +
                            toUnicodeString(b) + " w/compare(), " + op(collationKey) + " w/collation keys");
                    // Don't show any more mismatches involving strings a and b,
                    // since there are probably a lot of them.
                    mismatchFound.add(a);
                    mismatchFound.add(b);
                    break;
                }
            }
        }
    }

    /** Returns -1, 0 or 1 according to whether {@code z} is negative, zero or positive. */
    private static int sign(int z) {
        return z < 0 ? -1 : z > 0 ? 1 : 0;
    }

    /** Returns the relational operator ("<", ">" or "==") matching the sign of {@code z}. */
    private static String op(int z) {
        return z < 0 ? "<" : z > 0 ? ">" : "==";
    }

    /**
     * Pretty prints a string inside double quotes, writing every char as a
     * backslash, the letter u and four lower-case hex digits.
     */
    private static String toUnicodeString(String s) {
        StringBuffer b = new StringBuffer();
        b.append("\"");
        for (int i = 0; i < s.length(); i++) {
            b.append("\\u");
            // Adding 0x10000 forces a five-digit hex string; dropping its first
            // digit leaves the char value zero-padded to four digits.
            b.append(Integer.toHexString(s.charAt(i) + 0x10000).substring(1));
        }
        b.append("\"");
        return b.toString();
    }
}
(Review ID: 114009)
======================================================================
###@###.### 11/2/04 18:30 GMT