Uploaded image for project: 'JDK'
  1. JDK
  2. JDK-8037397

RegEx pattern matching loses character class after intersection (&&) operator

    XMLWordPrintable

Details

    • b20
    • x86
    • windows_7

    Description

      FULL PRODUCT VERSION :
      java version "1.7.0_51"
      Java(TM) SE Runtime Environment (build 1.7.0_51-b13)
      Java HotSpot(TM) Client VM (build 24.51-b03, mixed mode, sharing)

      A DESCRIPTION OF THE PROBLEM :
      This bug is triggered when both of the following conditions hold:
      - One or more consecutive nested character classes (each surrounded by [ ]) appear immediately after an intersection operator &&
      - That series of nested character classes is immediately followed by at least one "flat" (non-nested) character class element

      When both conditions hold, the series of nested character classes is lost.

      For example:
      // Working cases
      String ra = "[A-Z&&[0-9A-Z]]"; // Nothing after the nested character class
      String rb = "[A-Z&&0-9A-Z]"; // Everything is "flat"
      String rc = "[A-Z&&0-9[A-Z]]"; // [A-Z] is after "flat" character class 0-9
       
      // Failing cases
      String rx = "[A-Z&&[A-Z]0-9]"; // [A-Z] is lost
      String ry = "[A-Z&&[A-F][G-Z]0-9]"; // [A-F][G-Z] is lost


      REPRODUCIBILITY :
      This bug can be reproduced always.

      ---------- BEGIN SOURCE ----------
      /* Author: Hong Dai Thanh/nhahtdh */
      import java.lang.reflect.*;
      import java.util.regex.*;

      /**
       * Reproducer for the report that nested character classes placed right
       * after an intersection ({@code &&}) are dropped when a "flat" character
       * class element follows them.
       *
       * <p>The program first dumps the compiled node structure of several
       * patterns by reflecting on {@code java.util.regex.Pattern} internals,
       * then prints the result of matching {@code "A"} against each pattern.
       */
      class BugLostCharacterClass {
          /** Handle to the internal {@code Pattern$CharProperty} class. */
          private static final Class<?> CHAR_PROPERTY =
                  internalClass("java.util.regex.Pattern$CharProperty");

          /** Handle to the internal {@code Pattern$Node} class. */
          private static final Class<?> NODE =
                  internalClass("java.util.regex.Pattern$Node");

          /**
           * Loads a class by name, rethrowing any exception as a
           * {@link RuntimeException} that carries the original cause.
           */
          private static Class<?> internalClass(String className) {
              try {
                  return Class.forName(className);
              } catch (Exception e) {
                  throw new RuntimeException(e);
              }
          }

          /** Prints three spaces per indentation level, without a line break. */
          private static void indent(int level) {
              int width = level * 3;
              StringBuilder padding = new StringBuilder();
              for (int i = 0; i < width; i++) {
                  padding.append(' ');
              }
              System.out.print(padding.toString());
          }

          /**
           * Prints the runtime class name of {@code o} (prefixed by the name of
           * its enclosing method, when it has one) and then every field declared
           * directly on that class. Fields whose declared type is a
           * {@code CharProperty} are dissected recursively one level deeper; all
           * other fields are printed as {@code name:value}.
           */
          private static void dissectCharClass(int indent, Object o) throws Exception {
              Class<?> type = o.getClass();
              Field[] declared = type.getDeclaredFields();
              Method enclosing = type.getEnclosingMethod();

              String prefix = "";
              if (enclosing != null) {
                  prefix = enclosing.getName() + " ";
              }

              indent(indent);
              System.out.println(prefix + type.getName());

              int childLevel = indent + 1;
              for (Field field : declared) {
                  field.setAccessible(true);
                  Object value = field.get(o);

                  if (!CHAR_PROPERTY.isAssignableFrom(field.getType())) {
                      indent(childLevel);
                      System.out.println(field.getName() + ":" + value);
                      continue;
                  }
                  dissectCharClass(childLevel, value);
              }
          }

          /**
           * Follows the {@code next} links starting at {@code o}. Each node that
           * is a {@code CharProperty} instance is dissected in full; for any
           * other node only its class name is printed.
           */
          private static void dissectNode(Object o) throws Exception {
              Field next = NODE.getDeclaredField("next");
              next.setAccessible(true);

              for (Object node = o; node != null; node = next.get(node)) {
                  if (!CHAR_PROPERTY.isInstance(node)) {
                      System.out.println(node.getClass().getName());
                  } else {
                      dissectCharClass(0, node);
                  }
              }
          }

          /**
           * Reads the private {@code root} field of the compiled pattern and
           * dumps the node chain hanging off it, followed by a blank line.
           */
          private static void dissectPattern(Pattern p) throws Exception {
              Field root = Pattern.class.getDeclaredField("root");
              root.setAccessible(true);

              dissectNode(root.get(p));
              System.out.println();
          }

          public static void main(String[] args) throws Exception {
              // COMPARISON CASES: these three character classes work as expected.
              String[] working = {
                  "[A-Z&&[0-9A-Z]]",   // nothing after the nested character class
                  "[A-Z&&0-9A-Z]",     // everything is "flat"
                  "[A-Z&&0-9[A-Z]]",   // [A-Z] comes after the "flat" range 0-9
              };

              // EXPECTED: each regex below is equivalent to the comparison cases.
              // ACTUAL: the nested character classes ([A-Z], respectively [A-F]
              // and [G-Z]) are lost during compilation, so the regex cannot
              // match anything.
              String[] failing = {
                  "[A-Z&&[A-Z]0-9]",
                  "[A-Z&&[A-F][G-Z]0-9]",
              };

              // Structure verification via reflection.
              for (String regex : working) {
                  dissectPattern(Pattern.compile(regex));
              }
              for (String regex : failing) {
                  dissectPattern(Pattern.compile(regex));
              }

              // Normal black-box testing.

              // ACTUAL = EXPECTED: true
              for (String regex : working) {
                  System.out.println("A".matches(regex));
              }

              // EXPECTED: true
              // ACTUAL: false
              for (String regex : failing) {
                  System.out.println("A".matches(regex));
              }
          }
      }
      ---------- END SOURCE ----------

      CUSTOMER SUBMITTED WORKAROUND :
      * To fix this bug:

      In the clazz() method of the Pattern class:

                                 } else { // abc&&def
                                      unread();
                                      if (rightNode == null) { //L
                                          rightNode = clazz(false);
                                      } else { //L
                                          rightNode = union(rightNode, clazz(false)); //L
                                      }//L
                                  }

      Lines marked with a trailing //L are the lines to be added.

      * To workaround:

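      Enclose the entire right-hand side of the && operator inside a single nested character class, instead of leaving any "flat" character class elements outside it. For example, write [A-Z&&[[A-Z]0-9]] instead of [A-Z&&[A-Z]0-9].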

      Attachments

        Issue Links

          There are no Sub-Tasks for this issue.

          Activity

            People

              igraves Ian Graves
              webbuggrp Webbug Group
              Votes:
              0 Vote for this issue
              Watchers:
              5 Start watching this issue

              Dates

                Created:
                Updated:
                Resolved: