-
Bug
-
Resolution: Fixed
-
P4
-
6
-
b75
-
generic
-
generic
Here's a program that finds all the non-ASCII-compatible encodings in
the JDK. It correctly finds the bug in KOI8-U, and also finds
apparently serious bugs in ISO-8859-7 and ISO-2022-KR.
Source file: src/toy/tt.java
The test (incorrectly) assumes all encodings are ASCII-compatible.
Please turn this test into a real regression test, and file bugs
against various charsets.
As I always say, the charset code is crawling with bugs...
import java.io.*;
import java.util.*;
import java.nio.charset.*;
import java.nio.*;
/**
 * Diagnostic program that lists every charset installed in the running JDK
 * and flags the ones that do not round-trip the 7-bit ASCII range unchanged.
 *
 * <p>For each charset the canonical name is printed first. If encoding the
 * 128 ASCII characters does not yield the bytes 0x00..0x7F, a line
 * {@code "<name> -> bytes"} follows; if decoding the bytes 0x00..0x7F does
 * not yield the 128 ASCII characters, a line {@code "<name> -> chars"}
 * follows. Charsets that are legitimately not ASCII-compatible (for example
 * UTF-16) are reported as well, so the output has to be triaged by hand.
 */
public class tt {
    /** Number of code points in the 7-bit ASCII range. */
    private static final int ASCII_SIZE = 0x80;

    /**
     * Runs the ASCII-compatibility check against every available charset.
     *
     * @param args ignored
     * @throws Exception declared for compatibility with existing callers;
     *         failures of an individual charset are caught and reported
     */
    public static void main(String[] args) throws Exception {
        final byte[] asciiBytes = new byte[ASCII_SIZE];
        final char[] asciiChars = new char[ASCII_SIZE];
        for (int i = 0; i < ASCII_SIZE; i++) {
            asciiBytes[i] = (byte) i;
            asciiChars[i] = (char) i;
        }
        final String asciiString = new String(asciiChars);

        for (Map.Entry<String,Charset> e
                 : Charset.availableCharsets().entrySet()) {
            try {
                String csn = e.getKey();
                Charset cs = e.getValue();

                // Print the name before any check so that a crash or hang
                // can be attributed to the charset being processed.
                System.out.println(csn);

                if (csn.equals("x-IBM933")) {
                    continue; // hangs!
                }
                if (! cs.canEncode()) {
                    continue; // decode-only charset, nothing to compare
                }

                // chars -> bytes: ASCII must encode to the identical byte values.
                if (! Arrays.equals(asciiString.getBytes(csn), asciiBytes)) {
                    System.out.printf("%s -> bytes%n", csn);
                }

                // bytes -> chars: ASCII bytes must decode to the identical chars.
                if (! new String(asciiBytes, csn).equals(asciiString)) {
                    System.out.printf("%s -> chars%n", csn);
                }
            } catch (Throwable t) {
                // Deliberately broad: a broken coder may throw an Error
                // (e.g. AssertionError under -esa). Report it and move on
                // to the next charset instead of aborting the survey.
                t.printStackTrace();
            }
        }
    }
}
Martin
$ /home/mb29450/bin/sun/jver /net/cremina/export/sherman/ws/rc/ jr tt
==> javac -source 1.6 -Xlint:all tt.java
==> java -esa -ea tt
Big5
Big5-HKSCS
COMPOUND_TEXT
COMPOUND_TEXT -> bytes
COMPOUND_TEXT -> chars
EUC-JP
EUC-KR
GB18030
GB2312
GBK
IBM-Thai
IBM-Thai -> bytes
IBM-Thai -> chars
IBM00858
IBM01140
IBM01140 -> bytes
IBM01140 -> chars
IBM01141
IBM01141 -> bytes
IBM01141 -> chars
IBM01142
IBM01142 -> bytes
IBM01142 -> chars
IBM01143
IBM01143 -> bytes
IBM01143 -> chars
IBM01144
IBM01144 -> bytes
IBM01144 -> chars
IBM01145
IBM01145 -> bytes
IBM01145 -> chars
IBM01146
IBM01146 -> bytes
IBM01146 -> chars
IBM01147
IBM01147 -> bytes
IBM01147 -> chars
IBM01148
IBM01148 -> bytes
IBM01148 -> chars
IBM01149
IBM01149 -> bytes
IBM01149 -> chars
IBM037
IBM037 -> bytes
IBM037 -> chars
IBM1026
IBM1026 -> bytes
IBM1026 -> chars
IBM1047
IBM1047 -> bytes
IBM1047 -> chars
IBM273
IBM273 -> bytes
IBM273 -> chars
IBM277
IBM277 -> bytes
IBM277 -> chars
IBM278
IBM278 -> bytes
IBM278 -> chars
IBM280
IBM280 -> bytes
IBM280 -> chars
IBM284
IBM284 -> bytes
IBM284 -> chars
IBM285
IBM285 -> bytes
IBM285 -> chars
IBM297
IBM297 -> bytes
IBM297 -> chars
IBM420
IBM420 -> bytes
IBM420 -> chars
IBM424
IBM424 -> bytes
IBM424 -> chars
IBM437
IBM500
IBM500 -> bytes
IBM500 -> chars
IBM775
IBM850
IBM852
IBM855
IBM857
IBM860
IBM861
IBM862
IBM863
IBM864
IBM864 -> bytes
IBM864 -> chars
IBM865
IBM866
IBM868
IBM869
IBM870
IBM870 -> bytes
IBM870 -> chars
IBM871
IBM871 -> bytes
IBM871 -> chars
IBM918
IBM918 -> bytes
IBM918 -> chars
ISO-2022-CN
ISO-2022-JP
ISO-2022-JP -> chars
ISO-2022-JP-2
ISO-2022-JP-2 -> chars
ISO-2022-KR
java.lang.NullPointerException
at sun.nio.cs.ext.ISO2022$Decoder.decodeArrayLoop(ISO2022.java:174)
at sun.nio.cs.ext.ISO2022$Decoder.decodeLoop(ISO2022.java:384)
at java.nio.charset.CharsetDecoder.decode(CharsetDecoder.java:544)
at java.lang.StringCoding$StringDecoder.decode(StringCoding.java:136)
at java.lang.StringCoding.decode(StringCoding.java:169)
at java.lang.String.<init>(String.java:401)
at java.lang.String.<init>(String.java:429)
at tt.main(tt.java:30)
ISO-8859-1
ISO-8859-13
ISO-8859-15
ISO-8859-2
ISO-8859-3
ISO-8859-4
ISO-8859-5
ISO-8859-6
ISO-8859-7
ISO-8859-7 -> bytes
ISO-8859-7 -> chars
ISO-8859-8
ISO-8859-9
JIS_X0201
JIS_X0212-1990
JIS_X0212-1990 -> bytes
JIS_X0212-1990 -> chars
KOI8-R
KOI8-U
KOI8-U -> bytes
KOI8-U -> chars
Shift_JIS
TIS-620
US-ASCII
UTF-16
UTF-16 -> bytes
UTF-16 -> chars
UTF-16BE
UTF-16BE -> bytes
UTF-16BE -> chars
UTF-16LE
UTF-16LE -> bytes
UTF-16LE -> chars
UTF-32
UTF-32 -> bytes
UTF-32 -> chars
UTF-32BE
UTF-32BE -> bytes
UTF-32BE -> chars
UTF-32LE
UTF-32LE -> bytes
UTF-32LE -> chars
UTF-8
windows-1250
windows-1251
windows-1252
windows-1253
windows-1254
windows-1255
windows-1256
windows-1257
windows-1258
windows-31j
x-Big5-Solaris
x-euc-jp-linux
x-EUC-TW
x-eucJP-Open
x-IBM1006
x-IBM1025
x-IBM1025 -> bytes
x-IBM1025 -> chars
x-IBM1046
x-IBM1097
x-IBM1097 -> bytes
x-IBM1097 -> chars
x-IBM1098
x-IBM1112
x-IBM1112 -> bytes
x-IBM1112 -> chars
x-IBM1122
x-IBM1122 -> bytes
x-IBM1122 -> chars
x-IBM1123
x-IBM1123 -> bytes
x-IBM1123 -> chars
x-IBM1124
x-IBM1381
x-IBM1383
x-IBM33722
x-IBM33722 -> bytes
x-IBM33722 -> chars
x-IBM737
x-IBM856
x-IBM874
x-IBM875
x-IBM875 -> bytes
x-IBM875 -> chars
x-IBM921
x-IBM922
x-IBM930
x-IBM930 -> bytes
x-IBM930 -> chars
x-IBM933
x-IBM935
x-IBM935 -> bytes
x-IBM935 -> chars
x-IBM937
x-IBM937 -> bytes
x-IBM937 -> chars
x-IBM939
x-IBM939 -> bytes
x-IBM939 -> chars
x-IBM942
x-IBM942 -> bytes
x-IBM942 -> chars
x-IBM942C
x-IBM943
x-IBM943 -> bytes
x-IBM943 -> chars
x-IBM943C
x-IBM948
x-IBM949
java.lang.AssertionError
at sun.nio.cs.Surrogate$Parser.parse(Surrogate.java:205)
at sun.nio.cs.ext.IBM949$Encoder.encodeArrayLoop(IBM949.java:1554)
at sun.nio.cs.ext.IBM949$Encoder.encodeLoop(IBM949.java:5239)
at java.nio.charset.CharsetEncoder.encode(CharsetEncoder.java:544)
at java.lang.StringCoding$StringEncoder.encode(StringCoding.java:230)
at java.lang.StringCoding.encode(StringCoding.java:262)
at java.lang.String.getBytes(String.java:820)
at tt.main(tt.java:26)
x-IBM949C
java.lang.AssertionError
at sun.nio.cs.Surrogate$Parser.parse(Surrogate.java:205)
at sun.nio.cs.ext.IBM949$Encoder.encodeArrayLoop(IBM949.java:1554)
at sun.nio.cs.ext.IBM949$Encoder.encodeLoop(IBM949.java:5239)
at java.nio.charset.CharsetEncoder.encode(CharsetEncoder.java:544)
at java.lang.StringCoding$StringEncoder.encode(StringCoding.java:230)
at java.lang.StringCoding.encode(StringCoding.java:262)
at java.lang.String.getBytes(String.java:820)
at tt.main(tt.java:26)
x-IBM950
x-IBM964
x-IBM970
java.lang.AssertionError
at sun.nio.cs.Surrogate$Parser.parse(Surrogate.java:205)
at sun.nio.cs.ext.IBM970$Encoder.encodeArrayLoop(IBM970.java:1272)
at sun.nio.cs.ext.IBM970$Encoder.encodeLoop(IBM970.java:1643)
at java.nio.charset.CharsetEncoder.encode(CharsetEncoder.java:544)
at java.lang.StringCoding$StringEncoder.encode(StringCoding.java:230)
at java.lang.StringCoding.encode(StringCoding.java:262)
at java.lang.String.getBytes(String.java:820)
at tt.main(tt.java:26)
x-ISCII91
x-ISO-2022-CN-CNS
x-ISO-2022-CN-CNS -> chars
x-ISO-2022-CN-GB
x-ISO-2022-CN-GB -> chars
x-iso-8859-11
x-JIS0208
x-JIS0208 -> bytes
x-JIS0208 -> chars
x-JISAutoDetect
x-Johab
x-MacArabic
x-MacCentralEurope
x-MacCroatian
x-MacCyrillic
x-MacDingbat
x-MacDingbat -> bytes
x-MacDingbat -> chars
x-MacGreek
x-MacHebrew
x-MacIceland
x-MacRoman
x-MacRomania
x-MacSymbol
x-MacSymbol -> bytes
x-MacSymbol -> chars
x-MacThai
x-MacTurkish
x-MacUkraine
x-MS950-HKSCS
x-mswin-936
x-PCK
x-UTF-16LE-BOM
x-UTF-16LE-BOM -> bytes
x-UTF-16LE-BOM -> chars
X-UTF-32BE-BOM
X-UTF-32BE-BOM -> bytes
X-UTF-32BE-BOM -> chars
X-UTF-32LE-BOM
X-UTF-32LE-BOM -> bytes
X-UTF-32LE-BOM -> chars
x-windows-50220
x-windows-50220 -> chars
x-windows-50221
x-windows-50221 -> chars
x-windows-874
x-windows-949
x-windows-950
x-windows-iso2022jp
x-windows-iso2022jp -> chars
Here's a program that finds all the non-ASCII-compatible encodings in
the JDK. It correctly finds the bug in KOI8-U, and also finds
apparently serious bugs in ISO-8859-7 and ISO-2022-KR.
Source file: src/toy/tt.java
The test (incorrectly) assumes all encodings are ASCII-compatible.
Please turn this test into a real regression test, and file bugs
against various charsets.
As I always say, the charset code is crawling with bugs...
import java.io.*;
import java.util.*;
import java.nio.charset.*;
import java.nio.*;
/**
 * Diagnostic program that lists every charset installed in the running JDK
 * and flags the ones that do not round-trip the 7-bit ASCII range unchanged.
 *
 * <p>For each charset the canonical name is printed first. If encoding the
 * 128 ASCII characters does not yield the bytes 0x00..0x7F, a line
 * {@code "<name> -> bytes"} follows; if decoding the bytes 0x00..0x7F does
 * not yield the 128 ASCII characters, a line {@code "<name> -> chars"}
 * follows. Charsets that are legitimately not ASCII-compatible (for example
 * UTF-16) are reported as well, so the output has to be triaged by hand.
 */
public class tt {
    /** Number of code points in the 7-bit ASCII range. */
    private static final int ASCII_SIZE = 0x80;

    /**
     * Runs the ASCII-compatibility check against every available charset.
     *
     * @param args ignored
     * @throws Exception declared for compatibility with existing callers;
     *         failures of an individual charset are caught and reported
     */
    public static void main(String[] args) throws Exception {
        final byte[] asciiBytes = new byte[ASCII_SIZE];
        final char[] asciiChars = new char[ASCII_SIZE];
        for (int i = 0; i < ASCII_SIZE; i++) {
            asciiBytes[i] = (byte) i;
            asciiChars[i] = (char) i;
        }
        final String asciiString = new String(asciiChars);

        for (Map.Entry<String,Charset> e
                 : Charset.availableCharsets().entrySet()) {
            try {
                String csn = e.getKey();
                Charset cs = e.getValue();

                // Print the name before any check so that a crash or hang
                // can be attributed to the charset being processed.
                System.out.println(csn);

                if (csn.equals("x-IBM933")) {
                    continue; // hangs!
                }
                if (! cs.canEncode()) {
                    continue; // decode-only charset, nothing to compare
                }

                // chars -> bytes: ASCII must encode to the identical byte values.
                if (! Arrays.equals(asciiString.getBytes(csn), asciiBytes)) {
                    System.out.printf("%s -> bytes%n", csn);
                }

                // bytes -> chars: ASCII bytes must decode to the identical chars.
                if (! new String(asciiBytes, csn).equals(asciiString)) {
                    System.out.printf("%s -> chars%n", csn);
                }
            } catch (Throwable t) {
                // Deliberately broad: a broken coder may throw an Error
                // (e.g. AssertionError under -esa). Report it and move on
                // to the next charset instead of aborting the survey.
                t.printStackTrace();
            }
        }
    }
}
Martin
$ /home/mb29450/bin/sun/jver /net/cremina/export/sherman/ws/rc/ jr tt
==> javac -source 1.6 -Xlint:all tt.java
==> java -esa -ea tt
Big5
Big5-HKSCS
COMPOUND_TEXT
COMPOUND_TEXT -> bytes
COMPOUND_TEXT -> chars
EUC-JP
EUC-KR
GB18030
GB2312
GBK
IBM-Thai
IBM-Thai -> bytes
IBM-Thai -> chars
IBM00858
IBM01140
IBM01140 -> bytes
IBM01140 -> chars
IBM01141
IBM01141 -> bytes
IBM01141 -> chars
IBM01142
IBM01142 -> bytes
IBM01142 -> chars
IBM01143
IBM01143 -> bytes
IBM01143 -> chars
IBM01144
IBM01144 -> bytes
IBM01144 -> chars
IBM01145
IBM01145 -> bytes
IBM01145 -> chars
IBM01146
IBM01146 -> bytes
IBM01146 -> chars
IBM01147
IBM01147 -> bytes
IBM01147 -> chars
IBM01148
IBM01148 -> bytes
IBM01148 -> chars
IBM01149
IBM01149 -> bytes
IBM01149 -> chars
IBM037
IBM037 -> bytes
IBM037 -> chars
IBM1026
IBM1026 -> bytes
IBM1026 -> chars
IBM1047
IBM1047 -> bytes
IBM1047 -> chars
IBM273
IBM273 -> bytes
IBM273 -> chars
IBM277
IBM277 -> bytes
IBM277 -> chars
IBM278
IBM278 -> bytes
IBM278 -> chars
IBM280
IBM280 -> bytes
IBM280 -> chars
IBM284
IBM284 -> bytes
IBM284 -> chars
IBM285
IBM285 -> bytes
IBM285 -> chars
IBM297
IBM297 -> bytes
IBM297 -> chars
IBM420
IBM420 -> bytes
IBM420 -> chars
IBM424
IBM424 -> bytes
IBM424 -> chars
IBM437
IBM500
IBM500 -> bytes
IBM500 -> chars
IBM775
IBM850
IBM852
IBM855
IBM857
IBM860
IBM861
IBM862
IBM863
IBM864
IBM864 -> bytes
IBM864 -> chars
IBM865
IBM866
IBM868
IBM869
IBM870
IBM870 -> bytes
IBM870 -> chars
IBM871
IBM871 -> bytes
IBM871 -> chars
IBM918
IBM918 -> bytes
IBM918 -> chars
ISO-2022-CN
ISO-2022-JP
ISO-2022-JP -> chars
ISO-2022-JP-2
ISO-2022-JP-2 -> chars
ISO-2022-KR
java.lang.NullPointerException
at sun.nio.cs.ext.ISO2022$Decoder.decodeArrayLoop(ISO2022.java:174)
at sun.nio.cs.ext.ISO2022$Decoder.decodeLoop(ISO2022.java:384)
at java.nio.charset.CharsetDecoder.decode(CharsetDecoder.java:544)
at java.lang.StringCoding$StringDecoder.decode(StringCoding.java:136)
at java.lang.StringCoding.decode(StringCoding.java:169)
at java.lang.String.<init>(String.java:401)
at java.lang.String.<init>(String.java:429)
at tt.main(tt.java:30)
ISO-8859-1
ISO-8859-13
ISO-8859-15
ISO-8859-2
ISO-8859-3
ISO-8859-4
ISO-8859-5
ISO-8859-6
ISO-8859-7
ISO-8859-7 -> bytes
ISO-8859-7 -> chars
ISO-8859-8
ISO-8859-9
JIS_X0201
JIS_X0212-1990
JIS_X0212-1990 -> bytes
JIS_X0212-1990 -> chars
KOI8-R
KOI8-U
KOI8-U -> bytes
KOI8-U -> chars
Shift_JIS
TIS-620
US-ASCII
UTF-16
UTF-16 -> bytes
UTF-16 -> chars
UTF-16BE
UTF-16BE -> bytes
UTF-16BE -> chars
UTF-16LE
UTF-16LE -> bytes
UTF-16LE -> chars
UTF-32
UTF-32 -> bytes
UTF-32 -> chars
UTF-32BE
UTF-32BE -> bytes
UTF-32BE -> chars
UTF-32LE
UTF-32LE -> bytes
UTF-32LE -> chars
UTF-8
windows-1250
windows-1251
windows-1252
windows-1253
windows-1254
windows-1255
windows-1256
windows-1257
windows-1258
windows-31j
x-Big5-Solaris
x-euc-jp-linux
x-EUC-TW
x-eucJP-Open
x-IBM1006
x-IBM1025
x-IBM1025 -> bytes
x-IBM1025 -> chars
x-IBM1046
x-IBM1097
x-IBM1097 -> bytes
x-IBM1097 -> chars
x-IBM1098
x-IBM1112
x-IBM1112 -> bytes
x-IBM1112 -> chars
x-IBM1122
x-IBM1122 -> bytes
x-IBM1122 -> chars
x-IBM1123
x-IBM1123 -> bytes
x-IBM1123 -> chars
x-IBM1124
x-IBM1381
x-IBM1383
x-IBM33722
x-IBM33722 -> bytes
x-IBM33722 -> chars
x-IBM737
x-IBM856
x-IBM874
x-IBM875
x-IBM875 -> bytes
x-IBM875 -> chars
x-IBM921
x-IBM922
x-IBM930
x-IBM930 -> bytes
x-IBM930 -> chars
x-IBM933
x-IBM935
x-IBM935 -> bytes
x-IBM935 -> chars
x-IBM937
x-IBM937 -> bytes
x-IBM937 -> chars
x-IBM939
x-IBM939 -> bytes
x-IBM939 -> chars
x-IBM942
x-IBM942 -> bytes
x-IBM942 -> chars
x-IBM942C
x-IBM943
x-IBM943 -> bytes
x-IBM943 -> chars
x-IBM943C
x-IBM948
x-IBM949
java.lang.AssertionError
at sun.nio.cs.Surrogate$Parser.parse(Surrogate.java:205)
at sun.nio.cs.ext.IBM949$Encoder.encodeArrayLoop(IBM949.java:1554)
at sun.nio.cs.ext.IBM949$Encoder.encodeLoop(IBM949.java:5239)
at java.nio.charset.CharsetEncoder.encode(CharsetEncoder.java:544)
at java.lang.StringCoding$StringEncoder.encode(StringCoding.java:230)
at java.lang.StringCoding.encode(StringCoding.java:262)
at java.lang.String.getBytes(String.java:820)
at tt.main(tt.java:26)
x-IBM949C
java.lang.AssertionError
at sun.nio.cs.Surrogate$Parser.parse(Surrogate.java:205)
at sun.nio.cs.ext.IBM949$Encoder.encodeArrayLoop(IBM949.java:1554)
at sun.nio.cs.ext.IBM949$Encoder.encodeLoop(IBM949.java:5239)
at java.nio.charset.CharsetEncoder.encode(CharsetEncoder.java:544)
at java.lang.StringCoding$StringEncoder.encode(StringCoding.java:230)
at java.lang.StringCoding.encode(StringCoding.java:262)
at java.lang.String.getBytes(String.java:820)
at tt.main(tt.java:26)
x-IBM950
x-IBM964
x-IBM970
java.lang.AssertionError
at sun.nio.cs.Surrogate$Parser.parse(Surrogate.java:205)
at sun.nio.cs.ext.IBM970$Encoder.encodeArrayLoop(IBM970.java:1272)
at sun.nio.cs.ext.IBM970$Encoder.encodeLoop(IBM970.java:1643)
at java.nio.charset.CharsetEncoder.encode(CharsetEncoder.java:544)
at java.lang.StringCoding$StringEncoder.encode(StringCoding.java:230)
at java.lang.StringCoding.encode(StringCoding.java:262)
at java.lang.String.getBytes(String.java:820)
at tt.main(tt.java:26)
x-ISCII91
x-ISO-2022-CN-CNS
x-ISO-2022-CN-CNS -> chars
x-ISO-2022-CN-GB
x-ISO-2022-CN-GB -> chars
x-iso-8859-11
x-JIS0208
x-JIS0208 -> bytes
x-JIS0208 -> chars
x-JISAutoDetect
x-Johab
x-MacArabic
x-MacCentralEurope
x-MacCroatian
x-MacCyrillic
x-MacDingbat
x-MacDingbat -> bytes
x-MacDingbat -> chars
x-MacGreek
x-MacHebrew
x-MacIceland
x-MacRoman
x-MacRomania
x-MacSymbol
x-MacSymbol -> bytes
x-MacSymbol -> chars
x-MacThai
x-MacTurkish
x-MacUkraine
x-MS950-HKSCS
x-mswin-936
x-PCK
x-UTF-16LE-BOM
x-UTF-16LE-BOM -> bytes
x-UTF-16LE-BOM -> chars
X-UTF-32BE-BOM
X-UTF-32BE-BOM -> bytes
X-UTF-32BE-BOM -> chars
X-UTF-32LE-BOM
X-UTF-32LE-BOM -> bytes
X-UTF-32LE-BOM -> chars
x-windows-50220
x-windows-50220 -> chars
x-windows-50221
x-windows-50221 -> chars
x-windows-874
x-windows-949
x-windows-950
x-windows-iso2022jp
x-windows-iso2022jp -> chars