Type: Bug
Resolution: Fixed
Priority: P3
Fix Version: 9
Resolved In Build: b151
CPU: aarch64
OS: linux
Issue | Fix Version | Assignee | Priority | Status | Resolution | Resolved In Build
---|---|---|---|---|---|---
JDK-8261945 | openjdk8u292 | Ed Nevill | P3 | Resolved | Fixed | b04
The patterns for multiplyExact long in aarch64.ad shift by 31 instead of 63:
0x000003ff64b29aa8: mul x8, x19, x20
0x000003ff64b29aac: smulh x9, x19, x20
0x000003ff64b29ab0: cmp x9, x8, asr #31 <<<<<<< should be 63
0x000003ff64b29ab4: b.ne 0x000003ff64b29b4c
The effect of this is that the overflow branch is taken in cases where the multiply has not overflowed. For example, 0x5a5a5a5aL * 0xa5a5a5a5L does not overflow, yet the overflow branch above is still taken.
This was not detected in testing because, when the overflow branch is taken, C2 falls back to a call to the real (non-intrinsic) multiplyExact, which then executes correctly.
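To see why the shift amount matters, here is a minimal sketch of the check in plain Java (assuming Java 9+ for Math.multiplyHigh, which mirrors smulh; the class and method names are illustrative only):

class ShiftCheck {
    // Correct check: a * b overflows a signed 64-bit long iff the high
    // 64 bits are not the pure sign extension of the low 64 bits.
    static boolean overflows(long a, long b) {
        long lo = a * b;                   // mul:   result bits 0..63
        long hi = Math.multiplyHigh(a, b); // smulh: result bits 64..127
        return hi != (lo >> 63);           // cmp hi, lo, ASR #63
    }
    // The buggy check, shifting by 31 as the pattern did before the fix.
    static boolean buggyOverflows(long a, long b) {
        long lo = a * b;
        long hi = Math.multiplyHigh(a, b);
        return hi != (lo >> 31);           // cmp hi, lo, ASR #31 (wrong)
    }
    public static void main(String[] args) {
        long a = 0x5a5a5a5aL, b = 0xa5a5a5a5L;
        // The product is about 4.2e18, which fits in a long, so hi == 0
        // and lo >> 63 == 0.
        System.out.println(overflows(a, b));      // false (correct)
        // But lo >> 31 is non-zero, so the 31-bit shift reports a
        // spurious overflow.
        System.out.println(buggyOverflows(a, b)); // true (false positive)
    }
}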
The effect is therefore not incorrect results but a severe performance regression. Using the following test case:
class Mul {
    static long multest(long a, long b) {
        long res = a;
        for (int i = 0; i < 100000000; i++) {
            res += Math.multiplyExact(a, b);
            a ^= 1L; b ^= 1L; // Stop loop invariant hoisting
        }
        return res;
    }
    public static void main(String argv[]) {
        long a = 0x5a5a5a5aL;
        long b = 0xa5a5a5a5L;
        System.out.println("res " + a + ", " + b + " = " + multest(a, b));
    }
}
With -XX:-UseMathExactIntrinsics this takes 1.95 s.
With -XX:+UseMathExactIntrinsics this takes 13.42 s.
With the following webrev applied:
http://cr.openjdk.java.net/~enevill/8171410/webrev
-XX:-UseMathExactIntrinsics takes 1.98 s.
-XX:+UseMathExactIntrinsics takes 0.95 s.
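For reference, the timings above correspond to runs like the following (assuming the test case is saved as Mul.java):

javac Mul.java
java -XX:-UseMathExactIntrinsics Mul
java -XX:+UseMathExactIntrinsics Mul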
--- CUT HERE ---
# HG changeset patch
# User enevill
# Date 1482100004 18000
# Sun Dec 18 17:26:44 2016 -0500
# Node ID c0e2c655e28064fd01b54a47915037d8f2423f81
# Parent 66e2100be052e42af8064e519658185112be6dc3
8171410: aarch64: long multiplyExact shifts by 31 instead of 63
Reviewed-by: aph
diff --git a/src/cpu/aarch64/vm/aarch64.ad b/src/cpu/aarch64/vm/aarch64.ad
--- a/src/cpu/aarch64/vm/aarch64.ad
+++ b/src/cpu/aarch64/vm/aarch64.ad
@@ -14086,7 +14086,7 @@
format %{ "mul rscratch1, $op1, $op2\t#overflow check long\n\t"
"smulh rscratch2, $op1, $op2\n\t"
- "cmp rscratch2, rscratch1, ASR #31\n\t"
+ "cmp rscratch2, rscratch1, ASR #63\n\t"
"movw rscratch1, #0x80000000\n\t"
"cselw rscratch1, rscratch1, zr, NE\n\t"
"cmpw rscratch1, #1" %}
@@ -14094,7 +14094,7 @@
ins_encode %{
__ mul(rscratch1, $op1$$Register, $op2$$Register); // Result bits 0..63
__ smulh(rscratch2, $op1$$Register, $op2$$Register); // Result bits 64..127
- __ cmp(rscratch2, rscratch1, Assembler::ASR, 31); // Top is pure sign ext
+ __ cmp(rscratch2, rscratch1, Assembler::ASR, 63); // Top is pure sign ext
__ movw(rscratch1, 0x80000000); // Develop 0 (EQ),
__ cselw(rscratch1, rscratch1, zr, Assembler::NE); // or 0x80000000 (NE)
__ cmpw(rscratch1, 1); // 0x80000000 - 1 => VS
@@ -14112,7 +14112,7 @@
format %{ "mul rscratch1, $op1, $op2\t#overflow check long\n\t"
"smulh rscratch2, $op1, $op2\n\t"
- "cmp rscratch2, rscratch1, ASR #31\n\t"
+ "cmp rscratch2, rscratch1, ASR #63\n\t"
"b$cmp $labl" %}
ins_cost(4 * INSN_COST); // Branch is rare so treat as INSN_COST
ins_encode %{
@@ -14120,7 +14120,7 @@
Assembler::Condition cond = (Assembler::Condition)$cmp$$cmpcode;
__ mul(rscratch1, $op1$$Register, $op2$$Register); // Result bits 0..63
__ smulh(rscratch2, $op1$$Register, $op2$$Register); // Result bits 64..127
- __ cmp(rscratch2, rscratch1, Assembler::ASR, 31); // Top is pure sign ext
+ __ cmp(rscratch2, rscratch1, Assembler::ASR, 63); // Top is pure sign ext
__ br(cond == Assembler::VS ? Assembler::NE : Assembler::EQ, *L);
%}
--- CUT HERE ---
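The first hunk's pattern reports overflow to its consumer through the V flag, as the patch comments note: it selects 0x80000000 on mismatch (NE) or 0 on match (EQ) and then compares against 1, because 0x80000000 - 1 overflows signed 32-bit arithmetic and sets VS, while 0 - 1 does not. A minimal sketch of that flag arithmetic in Java (names are illustrative only):

class VFlagSketch {
    // Signed 32-bit subtraction x - y overflows (the condition the V flag
    // records) iff x and y have different signs and the result's sign
    // differs from x's sign.
    static boolean subOverflows(int x, int y) {
        int r = x - y;
        return ((x ^ y) & (x ^ r)) < 0;
    }
    public static void main(String[] args) {
        System.out.println(subOverflows(0x80000000, 1)); // true  -> VS
        System.out.println(subOverflows(0, 1));          // false -> no overflow
    }
}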
Backported by: JDK-8261945 aarch64: long multiplyExact shifts by 31 instead of 63 (Resolved)