Uploaded image for project: 'JDK'
  1. JDK
  2. JDK-8254073

Tokenizer improvements (revised)

XMLWordPrintable

    • Icon: Enhancement Enhancement
    • Resolution: Fixed
    • Icon: P3 P3
    • 16
    • None
    • tools
    • None
    • b20

      The UnicodeReader/JavaTokenizer code is made more obscure than needed by some suboptimal implementation & naming choices:

      * UnicodeReader has both API to 'scan' a char (e.g. read next char) as well as to 'putChar' e.g. save char in a temp buffer. The buffer functionality should arguably be higher-level and moved in the JavaTokenizer itself?

      * 'putChar' has subtle overloads, some of which also advance the reader state

      * the reader 'scanChar' API could be made more intuitive by renaming it to something like 'nextChar' suggesting the underlying state change

      This revised changeset addresses two one by one errors that occurred in the original submission.

      diff --git a/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/JavadocTokenizer.java b/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/JavadocTokenizer.java
      index 39d9eadcf3a..b8425ad1ecb 100644
      --- a/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/JavadocTokenizer.java
      +++ b/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/JavadocTokenizer.java
      @@ -306,8 +306,9 @@ public class JavadocTokenizer extends JavaTokenizer {
            *
            * Thus, to find the source position of any position, p, in the comment
            * string, find the index, i, of the pair whose string offset
      - * ({@code map[i + SB_OFFSET] }) is closest to but not greater than p. Then,
      - * {@code sourcePos(p) = map[i + POS_OFFSET] + (p - map[i + SB_OFFSET]) }.
      + * ({@code map[i * NOFFSETS + SB_OFFSET] }) is closest to but not greater
      + * than p. Then, {@code sourcePos(p) = map[i * NOFFSETS + POS_OFFSET] +
      + * (p - map[i * NOFFSETS + SB_OFFSET]) }.
            */
           static class OffsetMap {
               /**
      @@ -426,7 +427,7 @@ public class JavadocTokenizer extends JavaTokenizer {
                   int start = 0;
                   int end = size / NOFFSETS;
       
      - while (start < end - NOFFSETS) {
      + while (start < end - 1) {
                       // find an index midway between start and end
                       int index = (start + end) / 2;
                       int indexScaled = index * NOFFSETS;
      diff --git a/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java b/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java
      index 2472632dbcd..7584b79044b 100644
      --- a/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java
      +++ b/src/jdk.compiler/share/classes/com/sun/tools/javac/parser/UnicodeReader.java
      @@ -221,48 +221,49 @@ public class UnicodeReader {
           private boolean unicodeEscape() {
               // Start of unicode escape (past backslash.)
               int start = position + width;
      - int index;
      +
      + // Default to backslash result, unless proven otherwise.
      + character = '\\';
      + width = 1;
       
               // Skip multiple 'u'.
      + int index;
               for (index = start; index < length; index++) {
                   if (buffer[index] != 'u') {
                       break;
                   }
               }
       
      - // Needs to be at least backslash-u.
      - if (index != start) {
      - // If enough characters available.
      - if (index + 4 < length) {
      - // Convert four hex digits to codepoint. If any digit is invalid then the
      - // result is negative.
      - int code = (Character.digit(buffer[index++], 16) << 12) |
      - (Character.digit(buffer[index++], 16) << 8) |
      - (Character.digit(buffer[index++], 16) << 4) |
      - Character.digit(buffer[index++], 16);
      -
      - // If all digits are good.
      - if (code >= 0) {
      - width = index - position;
      - character = (char)code;
      -
      - return true;
      - }
      - }
      + // Needs to have been at least one u.
      + if (index == start) {
      + return false;
      + }
       
      - // Did not work out.
      - log.error(position, Errors.IllegalUnicodeEsc);
      - width = index - position;
      + int code = 0;
       
      - // Return true so that the invalid unicode escape is skipped.
      - return true;
      + for (int i = 0; i < 4; i++) {
      + int digit = Character.digit(buffer[index], 16);
      + code = code << 4 | digit;
      +
      + if (code < 0) {
      + break;
      + }
      +
      + index++;
               }
       
      - // Must be just a backslash.
      - character = '\\';
      - width = 1;
      + // Skip digits even if error.
      + width = index - position;
       
      - return false;
      + // If all digits are good.
      + if (code >= 0) {
      + character = (char)code;
      + } else {
      + log.error(position, Errors.IllegalUnicodeEsc);
      + }
      +
      + // Return true even if error so that the invalid unicode escape is skipped.
      + return true;
           }
       
           /**
      @@ -549,7 +550,7 @@ public class UnicodeReader {
               /**
                * Offset from the beginning of the original reader buffer.
                */
      - private int offset;
      + final private int offset;
       
               /**
                * Current column in the comment.

            jlaskey Jim Laskey
            jlaskey Jim Laskey
            Votes:
            0 Vote for this issue
            Watchers:
            4 Start watching this issue

              Created:
              Updated:
              Resolved: