-
Bug
-
Resolution: Unresolved
-
P4
-
8, 11, 12, 13
-
generic
-
generic
A DESCRIPTION OF THE PROBLEM :
Matchers whose pattern matches trailing high surrogates report false for requireEnd(). This is incorrect because, as specified by the documentation for Matcher.requireEnd():
"Returns true if more input could change a positive match into a negative one."
When a low surrogate is now added to the input, the matcher will not find the previous match anymore, partially due to JDK-8149446, or because it finds no match at all. Therefore requireEnd() should have returned true.
The provided reproduction code shows that and how the match ranges change.
STEPS TO FOLLOW TO REPRODUCE THE PROBLEM :
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Reproducer for {@link Matcher#requireEnd()} returning {@code false} when a pattern
 * matches a trailing high surrogate.
 *
 * <p>Each test pattern is matched once against the lone high surrogate of U+1F30C and once
 * against the complete surrogate pair. The match regions of both runs are printed together
 * with the {@code requireEnd()} value observed after the first run.
 *
 * <p>Run with {@code -ea}; otherwise the sanity-check assertions are not evaluated.
 */
public class RequireEnd {
    public static void main(String[] args) {
        testSurrogateRequireEnd();
    }

    /**
     * Creates a regex matching the given code point.
     *
     * @param codePoint the code point to match
     * @return a regex of the form <code>\x{HEX}</code> for the code point
     */
    private static String toRegex(int codePoint) {
        return String.format("\\x{%X}", codePoint);
    }

    /**
     * {@link Matcher#find() Finds} matches for the given matcher and returns a list
     * of string representations of the match ranges.
     * No further matching will be attempted if a match reached the end of the input.
     * This allows using the {@code requireEnd()} method of the matcher afterwards.
     *
     * @param matcher the matcher to advance; its state is modified by this call
     * @return the match ranges in the order found, each formatted as {@code start-end}
     */
    private static List<String> findMatchRegions(Matcher matcher) {
        List<String> matchStrings = new ArrayList<>();
        int lastMatchEnd = 0;
        // Stop as soon as a match touches the region end: another find() call would
        // overwrite the matcher state that requireEnd() reports on.
        while (lastMatchEnd < matcher.regionEnd() && matcher.find()) {
            matchStrings.add(matcher.start() + "-" + matcher.end());
            lastMatchEnd = matcher.end();
        }
        return matchStrings;
    }

    /**
     * Prints, for every test pattern, the {@code requireEnd()} value after matching the lone
     * high surrogate and how the match regions change once the low surrogate is appended.
     */
    private static void testSurrogateRequireEnd() {
        final String surrogateString = String.valueOf(Character.toChars(0x1F30C));
        final String highSurrogateOnly = surrogateString.substring(0, 1);

        // List of regexes which match trailing high surrogate
        List<String> regexes = Arrays.asList(
            toRegex(surrogateString.charAt(0)),
            "\\p{gc=Cs}", // Cs: Surrogate
            "\\p{blk=High Surrogates}"
        );

        for (String regex : regexes) {
            Pattern pattern = Pattern.compile(regex);

            // Only uses high surrogate of surrogate pair
            Matcher highSurrogateMatcher = pattern.matcher(highSurrogateOnly);
            List<String> highSurrogateMatchRegions = findMatchRegions(highSurrogateMatcher);
            assert !highSurrogateMatchRegions.isEmpty()
                : "Bad test regex, should have matched high surrogate";
            boolean requireEnd = highSurrogateMatcher.requireEnd();

            // Uses complete surrogate pair
            Matcher surrogateMatcher = pattern.matcher(surrogateString);
            List<String> surrogateMatchRegions = findMatchRegions(surrogateMatcher);

            // Summary:
            System.out.println("Pattern: " + pattern);
            System.out.println(" requireEnd(): " + requireEnd);
            // The reproducer is only meaningful when the extra input changes the result,
            // so the two region lists must differ.
            assert !highSurrogateMatchRegions.equals(surrogateMatchRegions)
                : "Bad test regex, should have found different matches";
            System.out.println(" match regions: " + highSurrogateMatchRegions + " -> " + surrogateMatchRegions);
        }
    }
}
EXPECTED VERSUS ACTUAL BEHAVIOR :
EXPECTED -
Output "requireEnd(): true" for every entry
FREQUENCY : always
Matchers whose pattern matches trailing high surrogates report false for requireEnd(). This is incorrect because, as specified by the documentation for Matcher.requireEnd():
"Returns true if more input could change a positive match into a negative one."
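When a low surrogate is now added to the input, the matcher will not find the previous match anymore, partially due to JDK-8149446, or because it finds no match at all. Therefore requireEnd() should have returned true.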
The provided reproduction code shows that and how the match ranges change.
STEPS TO FOLLOW TO REPRODUCE THE PROBLEM :
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Reproducer for {@link Matcher#requireEnd()} returning {@code false} when a pattern
 * matches a trailing high surrogate.
 *
 * <p>Each test pattern is matched once against the lone high surrogate of U+1F30C and once
 * against the complete surrogate pair. The match regions of both runs are printed together
 * with the {@code requireEnd()} value observed after the first run.
 *
 * <p>Run with {@code -ea}; otherwise the sanity-check assertions are not evaluated.
 */
public class RequireEnd {
    public static void main(String[] args) {
        testSurrogateRequireEnd();
    }

    /**
     * Creates a regex matching the given code point.
     *
     * @param codePoint the code point to match
     * @return a regex of the form <code>\x{HEX}</code> for the code point
     */
    private static String toRegex(int codePoint) {
        return String.format("\\x{%X}", codePoint);
    }

    /**
     * {@link Matcher#find() Finds} matches for the given matcher and returns a list
     * of string representations of the match ranges.
     * No further matching will be attempted if a match reached the end of the input.
     * This allows using the {@code requireEnd()} method of the matcher afterwards.
     *
     * @param matcher the matcher to advance; its state is modified by this call
     * @return the match ranges in the order found, each formatted as {@code start-end}
     */
    private static List<String> findMatchRegions(Matcher matcher) {
        List<String> matchStrings = new ArrayList<>();
        int lastMatchEnd = 0;
        // Stop as soon as a match touches the region end: another find() call would
        // overwrite the matcher state that requireEnd() reports on.
        while (lastMatchEnd < matcher.regionEnd() && matcher.find()) {
            matchStrings.add(matcher.start() + "-" + matcher.end());
            lastMatchEnd = matcher.end();
        }
        return matchStrings;
    }

    /**
     * Prints, for every test pattern, the {@code requireEnd()} value after matching the lone
     * high surrogate and how the match regions change once the low surrogate is appended.
     */
    private static void testSurrogateRequireEnd() {
        final String surrogateString = String.valueOf(Character.toChars(0x1F30C));
        final String highSurrogateOnly = surrogateString.substring(0, 1);

        // List of regexes which match trailing high surrogate
        List<String> regexes = Arrays.asList(
            toRegex(surrogateString.charAt(0)),
            "\\p{gc=Cs}", // Cs: Surrogate
            "\\p{blk=High Surrogates}"
        );

        for (String regex : regexes) {
            Pattern pattern = Pattern.compile(regex);

            // Only uses high surrogate of surrogate pair
            Matcher highSurrogateMatcher = pattern.matcher(highSurrogateOnly);
            List<String> highSurrogateMatchRegions = findMatchRegions(highSurrogateMatcher);
            assert !highSurrogateMatchRegions.isEmpty()
                : "Bad test regex, should have matched high surrogate";
            boolean requireEnd = highSurrogateMatcher.requireEnd();

            // Uses complete surrogate pair
            Matcher surrogateMatcher = pattern.matcher(surrogateString);
            List<String> surrogateMatchRegions = findMatchRegions(surrogateMatcher);

            // Summary:
            System.out.println("Pattern: " + pattern);
            System.out.println(" requireEnd(): " + requireEnd);
            // The reproducer is only meaningful when the extra input changes the result,
            // so the two region lists must differ.
            assert !highSurrogateMatchRegions.equals(surrogateMatchRegions)
                : "Bad test regex, should have found different matches";
            System.out.println(" match regions: " + highSurrogateMatchRegions + " -> " + surrogateMatchRegions);
        }
    }
}
EXPECTED VERSUS ACTUAL BEHAVIOR :
EXPECTED -
Output "requireEnd(): true" for every entry
FREQUENCY : always