Name: skT45625 Date: 06/05/2000
java version "1.3.0"
Java(TM) 2 Runtime Environment, Standard Edition (build 1.3.0-C)
Java HotSpot(TM) Client VM (build 1.3.0-C, mixed mode)
Your javadoc documentation for the DataInputStream class specifically states the
following:
"The null byte '\u0000' is encoded in 2-byte format rather than 1-byte, so that
the encoded strings never have embedded nulls"
In reality, the readUTF method of the DataInputStream class will actually read
in null bytes; they will get turned into the char of value zero.
Is this a bug? Or is your documentation merely wrong? In bug report 4140874
you seem to indicate that Java IS supposed to tolerate reading in null bytes.
Please change either the javadoc or your code.
Incidentally, I came across this "bug" while implementing an improvement of your
DataInputStream class.
You guys at Sun may be interested in my class, as I believe (based off of
preliminary benchmarks) that I can speed up the readUTF method by 5-10% for the
most common case of reading 1 byte chars.
I include an edited version of the entire class below. (If the pasted-in version
below gets mangled with stupid line breaks, feel free to ask me to send you an
email if you want a clean textfile version.) The part that is relevant to you
is the readUTF method at the end, which is extremely similar to the readUTF
method of DataInputStream.
My speed improvement in the readUTF method is that I avoid immediately doing a
"& 0xff" and a ">> 4" operation on the first byte, like you guys do. These
operations may be skipped for the byte pattern 0xxx xxxx by doing an immediate
compare of the int converted byte against 0. Also, I do not use a switch
statement, but do a series of explicit if-then-else blocks (it may be
compiler/microprocessor dependent as to whether that buys -- or costs? -- you
anything). On my machine, my code runs faster than DataInputStream does if you
are reading 1-byte chars.
(Note: in the code below, I am assuming that null bytes are illegal as per your
DataInputStream javadoc; if they are actually OK, then all you need to do to
correct the code is do "b1 >= 0" instead of "b1 > 0".)
************
import java.io.*;
/**
* (This class is a concatenation of DataInputStream & BufferedInputStream ...)
*
* WARNING: this class is NOT multithread safe
*/
public class QuickPackInput {
InputStream in;
int bufferSize;
byte[] buffer;
int start;
int end;
/**
* Construct the EventWriter with an OutputStream onto
* which the SAX events are to be recorded.
*/
public QuickPackInput(InputStream in, int bufferSize) {
if (in == null) throw new IllegalArgumentException("was supplied
with a null InputStream");
if (bufferSize < QuickPackOutput.MAX_UTF_WRITE_LENGTH) throw new
IllegalArgumentException("was supplied with bufferSize = " + bufferSize + "
which is < QuickPackOutput.MAX_UTF_WRITE_LENGTH = " +
QuickPackOutput.MAX_UTF_WRITE_LENGTH); // need bufferSize >=
QuickPackOutput.MAX_UTF_WRITE_LENGTH in order to be able to store an entire UTF
stream in the buffer
this.in = in;
this.bufferSize = bufferSize;
buffer = new byte[bufferSize];
start = 0;
end = 0;
}
public QuickPackInput(InputStream in) {
this(in, QuickPackOutput.MAX_UTF_WRITE_LENGTH);
}
protected void finalize() throws IOException {
close();
}
public void close() throws IOException {
in.close();
}
public boolean fill(int min) throws IOException {
// Compact the buffer
System.arraycopy(buffer, start, buffer, 0, end - start);
end -= start;
start = 0;
// Now fill it up
int read = in.read(buffer, end, buffer.length - end);
if (read > 0)
end += read;
return (end - start >= min);
}
public byte readByte() throws IOException {
if ((end - start < 1) && !fill(1))
throw new EOFException();
return buffer[start++];
}
public int readInt() throws IOException {
if (end - start < 4 && !fill(4))
throw new EOFException();
return
((int)buffer[start++] << 24) +
((int)buffer[start++] << 16) +
((int)buffer[start++] << 8) +
((int)buffer[start++] << 0);
}
public int readUnsignedShort() throws IOException {
if (end - start < 2 && !fill(2))
throw new EOFException();
return ((buffer[start++] & 0xFF) << 8) + ((buffer[start++] &
0xFF) << 0); // must do & 0xFF in order to properly sign extend the
byte to an int
}
public String readUTF() throws IOException {
int utfLength = readUnsignedShort();
if ((end - start < utfLength) && !fill(utfLength))
throw new UTFDataFormatException();
char str[] = new char[utfLength]; // this char[]
is guaranteed to have at least enough capacity to hold the least compressed
(i.e. 3 byte) UTF stream
int utfIndex = 0;
int strIndex = 0;
int b1, b1Shift, b2, b3;
while (utfIndex < utfLength) {
b1 = (int) buffer[start++];
// this block handles b1 of form: 0xxx
xxxx EXCEPT 0000 0000 (in Sun's modified UTF-8 format, nulls are written in 2
byte format so that a pure null should NEVER be encountered)
if (b1 > 0) {
utfIndex++;
str[strIndex++] = (char) b1;
}
else {
b1Shift = (b1 & 0xFF) >> 4; // the &
0xff eliminates any bits that were added onto b1 from sign extending the byte,
then we downshift leaving only the top 4 bits
// this block handles b1, b2 of form:
110x xxxx, 10xx xxxx
if ((b1Shift == 12) || (b1Shift == 13)) {
utfIndex += 2;
if (utfIndex > utfLength)
throw new
UTFDataFormatException("encountered byte b1 = " + b1 + " (which has the bit
pattern 110x xxxx) at a point where this pattern will require a UTF index = " +
utfIndex + " which will exceed the specifed UTF length of " + utfLength);
b2 = (int) buffer[start++];
if ((b2 & 0xC0) != 0x80)
throw new
UTFDataFormatException("encountered byte b1 = " + b1 + " (which has the bit
pattern 110x xxxx) followed by byte b2 = " + b2 + " (which does NOT have the
corect bit pattern 10xx xxxx)");
str[strIndex++] = (char) (((b1 & 0x1F)
<< 6) | (b2 & 0x3F));
}
// this block handles b1, b2, b3 of
form: 1110 xxxx, 10xx xxxx, 10xx xxxx
else if (b1Shift == 14) {
utfIndex += 3;
if (utfIndex > utfLength)
throw new
UTFDataFormatException("encountered byte b1 = " + b1 + " (which has the bit
pattern 1110 xxxx) at a point where this pattern will require a UTF index = " +
utfIndex + " which will exceed the specifed UTF length of " + utfLength);
b2 = (int) buffer[start++];
b3 = (int) buffer[start++];
if ((b2 & 0xC0) != 0x80)
throw new
UTFDataFormatException("encountered byte b1 = " + b1 + " (which has the bit
pattern 110x xxxx) followed by byte b2 = " + b2 + " (which does NOT have the bit
pattern 10xx xxxx)");
else if ((b3 & 0xC0) != 0x80)
throw new
UTFDataFormatException("encountered byte b1 = " + b1 + " (which has the bit
pattern 1110 xxxx) followed by byte b2 = " + b2 + " (which has the correct bit
pattern 10xx xxxx) followed by byte b3 = " + b3 + " (which does NOT have the
corect bit pattern 10xx xxxx)");
str[strIndex++] = (char) (((b1 & 0x0F)
<< 12) | ((b2 & 0x3F) << 6) | ((b3 & 0x3F) << 0));
}
// this block handles b1 of form: 10xx
xxxx or 1111 xxxx
else
throw new
UTFDataFormatException("encountered byte b1 = " + b1 + " (which has an INVALID
bit pattern of either 10xx xxxx, 1111 xxxx, or 0000 0000)");
}
}
return new String(str, 0, strIndex); // copy that part of str
which was actually assigned char values to a new String
}
public boolean remaining() throws IOException {
if (end > start)
return true;
else
return fill(1);
}
}
/*
Below are some old versions of the readUTF method, which I retained mainly for
benchmarking.
This version is similar to Sun's code: it uses a switch and always applies & and
>> to b1:
public String readUTF() throws IOException {
int utfLength = readUnsignedShort();
if ((end - start < utfLength) && !fill(utfLength))
throw new UTFDataFormatException();
char str[] = new char[utfLength]; // this char[]
is guaranteed to have at least enough capacity to hold the least compressed
(i.e. 3 byte) UTF stream
int utfIndex = 0;
int strIndex = 0;
int b1, b2, b3;
while (utfIndex < utfLength) {
b1 = ((int) buffer[start++]) & 0xff; // the &
0xff eliminates any bits that were added on from sign extension
switch (b1 >> 4) {
// this block handles b1 of form: 0xxx xxxx
(EXCEPT 0000 0000, because in Sun's modified UTF-8 format, nulls are written in
2 byte format so that a pure null should NEVER be encountered)
case 0: case 1: case 2: case 3: case 4: case 5:
case 6: case 7:
utfIndex++;
str[strIndex++] = (char) b1;
break;
// this block handles b1, b2 of form: 110x xxxx,
10xx xxxx
case 12: case 13:
utfIndex += 2;
if (utfIndex > utfLength)
throw new
UTFDataFormatException("encountered byte b1 = " + b1 + " (which has the bit
pattern 110x xxxx) at a point where this pattern will require a UTF index = " +
utfIndex + " which will exceed the specifed UTF length of " + utfLength);
b2 = (int) buffer[start++];
if ((b2 & 0xC0) != 0x80)
throw new
UTFDataFormatException("encountered byte b1 = " + b1 + " (which has the bit
pattern 110x xxxx) followed by byte b2 = " + b2 + " (which does NOT have the
corect bit pattern 10xx xxxx)");
str[strIndex++] = (char) (((b1 & 0x1F)
<< 6) | (b2 & 0x3F));
break;
// this block handles b1, b2, b3 of form: 1110
xxxx, 10xx xxxx, 10xx xxxx
case 14:
utfIndex += 3;
if (utfIndex > utfLength)
throw new
UTFDataFormatException("encountered byte b1 = " + b1 + " (which has the bit
pattern 1110 xxxx) at a point where this pattern will require a UTF index = " +
utfIndex + " which will exceed the specifed UTF length of " + utfLength);
b2 = (int) buffer[start++];
b3 = (int) buffer[start++];
if ((b2 & 0xC0) != 0x80)
throw new
UTFDataFormatException("encountered byte b1 = " + b1 + " (which has the bit
pattern 110x xxxx) followed by byte b2 = " + b2 + " (which does NOT have the bit
pattern 10xx xxxx)");
else if ((b3 & 0xC0) != 0x80)
throw new
UTFDataFormatException("encountered byte b1 = " + b1 + " (which has the bit
pattern 1110 xxxx) followed by byte b2 = " + b2 + " (which has the correct bit
pattern 10xx xxxx) followed by byte b3 = " + b3 + " (which does NOT have the
corect bit pattern 10xx xxxx)");
str[strIndex++] = (char) (((b1 & 0x0F)
<< 12) | ((b2 & 0x3F) << 6) | ((b3 & 0x3F) << 0));
break;
// this block handles b1 of form: 10xx xxxx or
1111 xxxx
default:
throw new
UTFDataFormatException("encountered byte b1 = " + b1 + " (which has an INVALID
bit pattern of either 10xx xxxx or 1111 xxxx)");
}
}
return new String(str, 0, strIndex); // copy that part of str
which was actually assigned char values to a new String
}
Below is a version similar to Sun's, except that it eliminates the switch in
favor of if-else blocks:
public String readUTF() throws IOException {
int utfLength = readUnsignedShort();
if ((end - start < utfLength) && !fill(utfLength))
throw new UTFDataFormatException();
char str[] = new char[utfLength]; // this char[]
is guaranteed to have at least enough capacity to hold the least compressed
(i.e. 3 byte) UTF stream
int utfIndex = 0;
int strIndex = 0;
int b1, b1Shift, b2, b3;
while (utfIndex < utfLength) {
b1 = ((int) buffer[start++]) & 0xff; // the &
0xff eliminates any bits that were added on from sign extension
b1Shift = b1 >> 4;
// this block handles b1 of form: 0xxx xxxx
if (b1Shift <= 7) {
utfIndex++;
str[strIndex++] = (char) b1;
}
// this block handles b1, b2 of form: 110x xxxx,
10xx xxxx
else if ((b1Shift == 12) || (b1Shift == 13)) {
utfIndex += 2;
if (utfIndex > utfLength)
throw new
UTFDataFormatException("encountered byte b1 = " + b1 + " (which has the bit
pattern 110x xxxx) at a point where this pattern will require a UTF index = " +
utfIndex + " which will exceed the specifed UTF length of " + utfLength);
b2 = (int) buffer[start++];
if ((b2 & 0xC0) != 0x80)
throw new
UTFDataFormatException("encountered byte b1 = " + b1 + " (which has the bit
pattern 110x xxxx) followed by byte b2 = " + b2 + " (which does NOT have the
corect bit pattern 10xx xxxx)");
str[strIndex++] = (char) (((b1 & 0x1F) << 6) |
(b2 & 0x3F));
}
// this block handles b1, b2, b3 of form: 1110
xxxx, 10xx xxxx, 10xx xxxx
else if (b1Shift == 14) {
utfIndex += 3;
if (utfIndex > utfLength)
throw new
UTFDataFormatException("encountered byte b1 = " + b1 + " (which has the bit
pattern 1110 xxxx) at a point where this pattern will require a UTF index = " +
utfIndex + " which will exceed the specifed UTF length of " + utfLength);
b2 = (int) buffer[start++];
b3 = (int) buffer[start++];
if ((b2 & 0xC0) != 0x80)
throw new
UTFDataFormatException("encountered byte b1 = " + b1 + " (which has the bit
pattern 110x xxxx) followed by byte b2 = " + b2 + " (which does NOT have the bit
pattern 10xx xxxx)");
else if ((b3 & 0xC0) != 0x80)
throw new
UTFDataFormatException("encountered byte b1 = " + b1 + " (which has the bit
pattern 1110 xxxx) followed by byte b2 = " + b2 + " (which has the correct bit
pattern 10xx xxxx) followed by byte b3 = " + b3 + " (which does NOT have the
corect bit pattern 10xx xxxx)");
str[strIndex++] = (char) (((b1 & 0x0F) << 12) |
((b2 & 0x3F) << 6) | ((b3 & 0x3F) << 0));
}
// this block handles b1 of form: 10xx xxxx or
1111 xxxx
else
throw new UTFDataFormatException("encountered
byte b1 = " + b1 + " (which has an INVALID bit pattern of either 10xx xxxx or
1111 xxxx)");
}
return new String(str, 0, strIndex); // copy that part of str
which was actually assigned char values to a new String
}
*/
(Review ID: 105573)
======================================================================
- relates to
-
JDK-6620047 DataOutputStream.writeUTF(null) should not throw NPE
-
- Closed
-