Type: Bug
Resolution: Fixed
Priority: P3
Fix Version: 9
Resolved In Build: b151
CPU: aarch64
OS: linux
Issue | Fix Version | Assignee | Priority | Status | Resolution | Resolved In Build
---|---|---|---|---|---|---
JDK-8261945 | openjdk8u292 | Ed Nevill | P3 | Resolved | Fixed | b04
The patterns for multiplyExact long in aarch64.ad shift by 31 instead of 63:
0x000003ff64b29aa8: mul x8, x19, x20
0x000003ff64b29aac: smulh x9, x19, x20
0x000003ff64b29ab0: cmp x9, x8, asr #31 <<<<<<< should be 63
0x000003ff64b29ab4: b.ne 0x000003ff64b29b4c
The effect of this is that the overflow branch is taken in cases where the multiply has not overflowed. For example, 0x5a5a5a5aL * 0xa5a5a5a5L does not overflow, yet the overflow branch above is still taken.
This was not detected in testing because, when the overflow branch is taken, C2 falls back to a call to the real (non-intrinsic) multiplyExact, which then executes correctly.
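To see why the shift amount matters, here is a minimal sketch of the check in plain Java (assuming Java 9+ for Math.multiplyHigh, which mirrors smulh; the class and method names are illustrative only):

class ShiftCheck {
    // Correct check: a * b overflows a signed 64-bit long iff the high
    // 64 bits are not the pure sign extension of the low 64 bits.
    static boolean overflows(long a, long b) {
        long lo = a * b;                   // mul:   result bits 0..63
        long hi = Math.multiplyHigh(a, b); // smulh: result bits 64..127
        return hi != (lo >> 63);           // cmp hi, lo, ASR #63
    }
    // The buggy check, shifting by 31 as the pattern did before the fix.
    static boolean buggyOverflows(long a, long b) {
        long lo = a * b;
        long hi = Math.multiplyHigh(a, b);
        return hi != (lo >> 31);           // cmp hi, lo, ASR #31 (wrong)
    }
    public static void main(String[] args) {
        long a = 0x5a5a5a5aL, b = 0xa5a5a5a5L;
        // The product is about 4.2e18, which fits in a long, so hi == 0
        // and lo >> 63 == 0.
        System.out.println(overflows(a, b));      // false (correct)
        // But lo >> 31 is non-zero, so the 31-bit shift reports a
        // spurious overflow.
        System.out.println(buggyOverflows(a, b)); // true (false positive)
    }
}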
The effect is therefore not incorrect results but a severe performance regression. Using the following test case:
class Mul {
    static long multest(long a, long b) {
        long res = a;
        for (int i = 0; i < 100000000; i++) {
            res += Math.multiplyExact(a, b);
            a ^= 1L; b ^= 1L; // Stop loop invariant hoisting
        }
        return res;
    }
    public static void main(String argv[]) {
        long a = 0x5a5a5a5aL;
        long b = 0xa5a5a5a5L;
        System.out.println("res " + a + ", " + b + " = " + multest(a, b));
    }
}
With -XX:-UseMathExactIntrinsics this takes 1.95 s.
With -XX:+UseMathExactIntrinsics this takes 13.42 s.
With the following webrev applied:
http://cr.openjdk.java.net/~enevill/8171410/webrev
-XX:-UseMathExactIntrinsics takes 1.98 s.
-XX:+UseMathExactIntrinsics takes 0.95 s.
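For reference, the timings above correspond to runs like the following (assuming the test case is saved as Mul.java):

javac Mul.java
java -XX:-UseMathExactIntrinsics Mul
java -XX:+UseMathExactIntrinsics Mul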
--- CUT HERE ---
# HG changeset patch
# User enevill
# Date 1482100004 18000
# Sun Dec 18 17:26:44 2016 -0500
# Node ID c0e2c655e28064fd01b54a47915037d8f2423f81
# Parent 66e2100be052e42af8064e519658185112be6dc3
8171410: aarch64: long multiplyExact shifts by 31 instead of 63
Reviewed-by: aph
diff --git a/src/cpu/aarch64/vm/aarch64.ad b/src/cpu/aarch64/vm/aarch64.ad
--- a/src/cpu/aarch64/vm/aarch64.ad
+++ b/src/cpu/aarch64/vm/aarch64.ad
@@ -14086,7 +14086,7 @@
format %{ "mul rscratch1, $op1, $op2\t#overflow check long\n\t"
"smulh rscratch2, $op1, $op2\n\t"
- "cmp rscratch2, rscratch1, ASR #31\n\t"
+ "cmp rscratch2, rscratch1, ASR #63\n\t"
"movw rscratch1, #0x80000000\n\t"
"cselw rscratch1, rscratch1, zr, NE\n\t"
"cmpw rscratch1, #1" %}
@@ -14094,7 +14094,7 @@
ins_encode %{
__ mul(rscratch1, $op1$$Register, $op2$$Register); // Result bits 0..63
__ smulh(rscratch2, $op1$$Register, $op2$$Register); // Result bits 64..127
- __ cmp(rscratch2, rscratch1, Assembler::ASR, 31); // Top is pure sign ext
+ __ cmp(rscratch2, rscratch1, Assembler::ASR, 63); // Top is pure sign ext
__ movw(rscratch1, 0x80000000); // Develop 0 (EQ),
__ cselw(rscratch1, rscratch1, zr, Assembler::NE); // or 0x80000000 (NE)
__ cmpw(rscratch1, 1); // 0x80000000 - 1 => VS
@@ -14112,7 +14112,7 @@
format %{ "mul rscratch1, $op1, $op2\t#overflow check long\n\t"
"smulh rscratch2, $op1, $op2\n\t"
- "cmp rscratch2, rscratch1, ASR #31\n\t"
+ "cmp rscratch2, rscratch1, ASR #63\n\t"
"b$cmp $labl" %}
ins_cost(4 * INSN_COST); // Branch is rare so treat as INSN_COST
ins_encode %{
@@ -14120,7 +14120,7 @@
Assembler::Condition cond = (Assembler::Condition)$cmp$$cmpcode;
__ mul(rscratch1, $op1$$Register, $op2$$Register); // Result bits 0..63
__ smulh(rscratch2, $op1$$Register, $op2$$Register); // Result bits 64..127
- __ cmp(rscratch2, rscratch1, Assembler::ASR, 31); // Top is pure sign ext
+ __ cmp(rscratch2, rscratch1, Assembler::ASR, 63); // Top is pure sign ext
__ br(cond == Assembler::VS ? Assembler::NE : Assembler::EQ, *L);
%}
--- CUT HERE ---
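The first hunk's pattern reports overflow to its consumer through the V flag, as the patch comments note: it selects 0x80000000 on mismatch (NE) or 0 on match (EQ) and then compares against 1, because 0x80000000 - 1 overflows signed 32-bit arithmetic and sets VS, while 0 - 1 does not. A minimal sketch of that flag arithmetic in Java (names are illustrative only):

class VFlagSketch {
    // Signed 32-bit subtraction x - y overflows (the condition the V flag
    // records) iff x and y have different signs and the result's sign
    // differs from x's sign.
    static boolean subOverflows(int x, int y) {
        int r = x - y;
        return ((x ^ y) & (x ^ r)) < 0;
    }
    public static void main(String[] args) {
        System.out.println(subOverflows(0x80000000, 1)); // true  -> VS
        System.out.println(subOverflows(0, 1));          // false -> no overflow
    }
}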
Backported by: JDK-8261945 aarch64: long multiplyExact shifts by 31 instead of 63 (Resolved)