The UnicodeReader/JavaTokenizer code is made more obscure than needed by some suboptimal implementation & naming choices:
* UnicodeReader has both API to 'scan' a char (e.g. read next char) as well as to 'putChar' e.g. save char in a temp buffer. The buffer functionality should arguably be higher-level and moved in the JavaTokenizer itself?
* 'putChar' has subtle overloads, some of which also advance the reader state
* the reader 'scanChar' API could be made more intuitive by renaming it to something like 'nextChar' suggesting the underlying state change
This revised changeset addresses two one by one errors that occurred in the original submission.
diff --git a/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/JavadocTokenizer.java b/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/JavadocTokenizer.java
index 39d9eadcf3a..b8425ad1ecb 100644
--- a/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/JavadocTokenizer.java
+++ b/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/JavadocTokenizer.java
@@ -306,8 +306,9 @@ public class JavadocTokenizer extends JavaTokenizer {
*
* Thus, to find the source position of any position, p, in the comment
* string, find the index, i, of the pair whose string offset
- * ({@code map[i + SB_OFFSET] }) is closest to but not greater than p. Then,
- * {@code sourcePos(p) = map[i + POS_OFFSET] + (p - map[i + SB_OFFSET]) }.
+ * ({@code map[i * NOFFSETS + SB_OFFSET] }) is closest to but not greater
+ * than p. Then, {@code sourcePos(p) = map[i * NOFFSETS + POS_OFFSET] +
+ * (p - map[i * NOFFSETS + SB_OFFSET]) }.
*/
static class OffsetMap {
/**
@@ -426,7 +427,7 @@ public class JavadocTokenizer extends JavaTokenizer {
int start = 0;
int end = size / NOFFSETS;
- while (start < end - NOFFSETS) {
+ while (start < end - 1) {
// find an index midway between start and end
int index = (start + end) / 2;
int indexScaled = index * NOFFSETS;
diff --git a/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java b/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java
index 2472632dbcd..7584b79044b 100644
--- a/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java
+++ b/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java
@@ -221,48 +221,49 @@ public class UnicodeReader {
private boolean unicodeEscape() {
// Start of unicode escape (past backslash.)
int start = position + width;
- int index;
+
+ // Default to backslash result, unless proven otherwise.
+ character = '\\';
+ width = 1;
// Skip multiple 'u'.
+ int index;
for (index = start; index < length; index++) {
if (buffer[index] != 'u') {
break;
}
}
- // Needs to be at least backslash-u.
- if (index != start) {
- // If enough characters available.
- if (index + 4 < length) {
- // Convert four hex digits to codepoint. If any digit is invalid then the
- // result is negative.
- int code = (Character.digit(buffer[index++], 16) << 12) |
- (Character.digit(buffer[index++], 16) << 8) |
- (Character.digit(buffer[index++], 16) << 4) |
- Character.digit(buffer[index++], 16);
-
- // If all digits are good.
- if (code >= 0) {
- width = index - position;
- character = (char)code;
-
- return true;
- }
- }
+ // Needs to have been at least one u.
+ if (index == start) {
+ return false;
+ }
- // Did not work out.
- log.error(position, Errors.IllegalUnicodeEsc);
- width = index - position;
+ int code = 0;
- // Return true so that the invalid unicode escape is skipped.
- return true;
+ for (int i = 0; i < 4; i++) {
+ int digit = Character.digit(buffer[index], 16);
+ code = code << 4 | digit;
+
+ if (code < 0) {
+ break;
+ }
+
+ index++;
}
- // Must be just a backslash.
- character = '\\';
- width = 1;
+ // Skip digits even if error.
+ width = index - position;
- return false;
+ // If all digits are good.
+ if (code >= 0) {
+ character = (char)code;
+ } else {
+ log.error(position, Errors.IllegalUnicodeEsc);
+ }
+
+ // Return true even if error so that the invalid unicode escape is skipped.
+ return true;
}
/**
@@ -549,7 +550,7 @@ public class UnicodeReader {
/**
* Offset from the beginning of the original reader buffer.
*/
- private int offset;
+ final private int offset;
/**
* Current column in the comment.
* UnicodeReader has both API to 'scan' a char (e.g. read next char) as well as to 'putChar' e.g. save char in a temp buffer. The buffer functionality should arguably be higher-level and moved in the JavaTokenizer itself?
* 'putChar' has subtle overloads, some of which also advance the reader state
* the reader 'scanChar' API could be made more intuitive by renaming it to something like 'nextChar' suggesting the underlying state change
This revised changeset addresses two one by one errors that occurred in the original submission.
diff --git a/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/JavadocTokenizer.java b/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/JavadocTokenizer.java
index 39d9eadcf3a..b8425ad1ecb 100644
--- a/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/JavadocTokenizer.java
+++ b/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/JavadocTokenizer.java
@@ -306,8 +306,9 @@ public class JavadocTokenizer extends JavaTokenizer {
*
* Thus, to find the source position of any position, p, in the comment
* string, find the index, i, of the pair whose string offset
- * ({@code map[i + SB_OFFSET] }) is closest to but not greater than p. Then,
- * {@code sourcePos(p) = map[i + POS_OFFSET] + (p - map[i + SB_OFFSET]) }.
+ * ({@code map[i * NOFFSETS + SB_OFFSET] }) is closest to but not greater
+ * than p. Then, {@code sourcePos(p) = map[i * NOFFSETS + POS_OFFSET] +
+ * (p - map[i * NOFFSETS + SB_OFFSET]) }.
*/
static class OffsetMap {
/**
@@ -426,7 +427,7 @@ public class JavadocTokenizer extends JavaTokenizer {
int start = 0;
int end = size / NOFFSETS;
- while (start < end - NOFFSETS) {
+ while (start < end - 1) {
// find an index midway between start and end
int index = (start + end) / 2;
int indexScaled = index * NOFFSETS;
diff --git a/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java b/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java
index 2472632dbcd..7584b79044b 100644
--- a/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java
+++ b/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java
@@ -221,48 +221,49 @@ public class UnicodeReader {
private boolean unicodeEscape() {
// Start of unicode escape (past backslash.)
int start = position + width;
- int index;
+
+ // Default to backslash result, unless proven otherwise.
+ character = '\\';
+ width = 1;
// Skip multiple 'u'.
+ int index;
for (index = start; index < length; index++) {
if (buffer[index] != 'u') {
break;
}
}
- // Needs to be at least backslash-u.
- if (index != start) {
- // If enough characters available.
- if (index + 4 < length) {
- // Convert four hex digits to codepoint. If any digit is invalid then the
- // result is negative.
- int code = (Character.digit(buffer[index++], 16) << 12) |
- (Character.digit(buffer[index++], 16) << 8) |
- (Character.digit(buffer[index++], 16) << 4) |
- Character.digit(buffer[index++], 16);
-
- // If all digits are good.
- if (code >= 0) {
- width = index - position;
- character = (char)code;
-
- return true;
- }
- }
+ // Needs to have been at least one u.
+ if (index == start) {
+ return false;
+ }
- // Did not work out.
- log.error(position, Errors.IllegalUnicodeEsc);
- width = index - position;
+ int code = 0;
- // Return true so that the invalid unicode escape is skipped.
- return true;
+ for (int i = 0; i < 4; i++) {
+ int digit = Character.digit(buffer[index], 16);
+ code = code << 4 | digit;
+
+ if (code < 0) {
+ break;
+ }
+
+ index++;
}
- // Must be just a backslash.
- character = '\\';
- width = 1;
+ // Skip digits even if error.
+ width = index - position;
- return false;
+ // If all digits are good.
+ if (code >= 0) {
+ character = (char)code;
+ } else {
+ log.error(position, Errors.IllegalUnicodeEsc);
+ }
+
+ // Return true even if error so that the invalid unicode escape is skipped.
+ return true;
}
/**
@@ -549,7 +550,7 @@ public class UnicodeReader {
/**
* Offset from the beginning of the original reader buffer.
*/
- private int offset;
+ final private int offset;
/**
* Current column in the comment.
- clones
-
JDK-8224225 Tokenizer improvements
- Resolved
- relates to
-
JDK-8259050 Error recovery in lexer could be improved
- Resolved
-
JDK-8269150 UnicodeReader not translating \u005c\\u005d to \\]
- Closed
-
JDK-8253904 Revert Tokenizer improvements JDK-8224225
- Closed
-
JDK-8256071 Regression ~3% in Javac-Cold-Generate in 16-b20
- Closed
-
JDK-8252719 update DocCommentParser
- New
(1 relates to, 2 links to)