JDK-8171410: aarch64: long multiplyExact shifts by 31 instead of 63

    • Resolved in build: b151
    • CPU: aarch64
    • OS: linux

        The patterns for multiplyExact long in aarch64.ad shift by 31 instead of 63:

          0x000003ff64b29aa8: mul x8, x19, x20
          0x000003ff64b29aac: smulh x9, x19, x20
          0x000003ff64b29ab0: cmp x9, x8, asr #31 <<<<<<< should be 63
          0x000003ff64b29ab4: b.ne 0x000003ff64b29b4c

        The effect is that the overflow branch is taken in cases where the multiply has not overflowed. For example, 0x5a5a5a5aL * 0xa5a5a5a5L does not overflow, but the overflow branch above will be taken.
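
        In Java terms, the check the pattern should implement can be sketched as follows (a minimal sketch of the check only, not the generated code; it assumes JDK 9+ so that Math.multiplyHigh, the Java analogue of smulh, is available):

          class OverflowCheck {
            // The 128-bit signed product hi:lo fits in a long iff the high
            // half is a pure sign extension of the low half, i.e.
            // hi == lo >> 63. Comparing against lo >> 31, as the buggy
            // pattern does, flags spurious overflows.
            static boolean overflows(long a, long b) {
              long lo = a * b;                   // mul:   product bits 0..63
              long hi = Math.multiplyHigh(a, b); // smulh: product bits 64..127
              return hi != (lo >> 63);           // cmp hi, lo, ASR #63; b.ne
            }
            public static void main(String[] args) {
              // This product fits in 64 bits, so the correct check prints
              // false; the buggy lo >> 31 comparison would have reported true.
              System.out.println(overflows(0x5a5a5a5aL, 0xa5a5a5a5L));
            }
          }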

        This was not detected in testing because, when the overflow branch is taken, C2 falls back to a call to the real (non-intrinsic) multiplyExact, which then executes correctly.

        The net effect is a severe performance regression. Using the following test case:

        class Mul {
          static long multest(long a, long b) {
            long res = a;
            for (int i = 0; i < 100000000; i++) {
              res += Math.multiplyExact(a, b);
              a ^= 1L; b ^= 1L; // Stop loop invariant hoisting
            }
            return res;
          }
          public static void main(String argv[]) {
            long a = 0x5a5a5a5aL;
            long b = 0xa5a5a5a5L;
            System.out.println("res " + a + ", " + b + " = " + multest(a, b));
          }
        }

        With -XX:-UseMathExactIntrinsics this takes 1.95 s
        With -XX:+UseMathExactIntrinsics this takes 13.42 s

        With the following webrev

        http://cr.openjdk.java.net/~enevill/8171410/webrev

        -XX:-UseMathExactIntrinsics takes 1.98 s
        -XX:+UseMathExactIntrinsics takes 0.95 s

        --- CUT HERE ---
        # HG changeset patch
        # User enevill
        # Date 1482100004 18000
        # Sun Dec 18 17:26:44 2016 -0500
        # Node ID c0e2c655e28064fd01b54a47915037d8f2423f81
        # Parent 66e2100be052e42af8064e519658185112be6dc3
        8171410: aarch64: long multiplyExact shifts by 31 instead of 63
        Reviewed-by: aph

        diff --git a/src/cpu/aarch64/vm/aarch64.ad b/src/cpu/aarch64/vm/aarch64.ad
        --- a/src/cpu/aarch64/vm/aarch64.ad
        +++ b/src/cpu/aarch64/vm/aarch64.ad
        @@ -14086,7 +14086,7 @@
        
           format %{ "mul rscratch1, $op1, $op2\t#overflow check long\n\t"
                     "smulh rscratch2, $op1, $op2\n\t"
        -            "cmp rscratch2, rscratch1, ASR #31\n\t"
        +            "cmp rscratch2, rscratch1, ASR #63\n\t"
                     "movw rscratch1, #0x80000000\n\t"
                     "cselw rscratch1, rscratch1, zr, NE\n\t"
                     "cmpw rscratch1, #1" %}
        @@ -14094,7 +14094,7 @@
           ins_encode %{
             __ mul(rscratch1, $op1$$Register, $op2$$Register); // Result bits 0..63
             __ smulh(rscratch2, $op1$$Register, $op2$$Register); // Result bits 64..127
        -    __ cmp(rscratch2, rscratch1, Assembler::ASR, 31); // Top is pure sign ext
        +    __ cmp(rscratch2, rscratch1, Assembler::ASR, 63); // Top is pure sign ext
             __ movw(rscratch1, 0x80000000); // Develop 0 (EQ),
             __ cselw(rscratch1, rscratch1, zr, Assembler::NE); // or 0x80000000 (NE)
             __ cmpw(rscratch1, 1); // 0x80000000 - 1 => VS
        @@ -14112,7 +14112,7 @@
        
           format %{ "mul rscratch1, $op1, $op2\t#overflow check long\n\t"
                     "smulh rscratch2, $op1, $op2\n\t"
        -            "cmp rscratch2, rscratch1, ASR #31\n\t"
        +            "cmp rscratch2, rscratch1, ASR #63\n\t"
                     "b$cmp $labl" %}
           ins_cost(4 * INSN_COST); // Branch is rare so treat as INSN_COST
           ins_encode %{
        @@ -14120,7 +14120,7 @@
             Assembler::Condition cond = (Assembler::Condition)$cmp$$cmpcode;
             __ mul(rscratch1, $op1$$Register, $op2$$Register); // Result bits 0..63
             __ smulh(rscratch2, $op1$$Register, $op2$$Register); // Result bits 64..127
        -    __ cmp(rscratch2, rscratch1, Assembler::ASR, 31); // Top is pure sign ext
        +    __ cmp(rscratch2, rscratch1, Assembler::ASR, 63); // Top is pure sign ext
             __ br(cond == Assembler::VS ? Assembler::NE : Assembler::EQ, *L);
           %}
         
        --- CUT HERE ---
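
        For the record, the trailing cmpw rscratch1, #1 in the flags-producing pattern works because 0x80000000 - 1 overflows a 32-bit signed subtract: the NE result of the corrected compare is thereby turned into the V flag that C2 expects. The invariant the corrected compare relies on, that the 128-bit product hi:lo fits in a signed long exactly when hi == lo >> 63, can be cross-checked against BigInteger with a throwaway test like the following (again assuming JDK 9+ for Math.multiplyHigh; not part of the fix):

          import java.math.BigInteger;

          class CheckInvariant {
            public static void main(String[] args) {
              long[] samples = { 0L, 1L, -1L, 0x5a5a5a5aL, 0xa5a5a5a5L,
                                 3000000000L, Long.MAX_VALUE, Long.MIN_VALUE };
              for (long a : samples) {
                for (long b : samples) {
                  long lo = a * b;                    // mul:   bits 0..63
                  long hi = Math.multiplyHigh(a, b);  // smulh: bits 64..127
                  boolean flagged = hi != (lo >> 63); // corrected ASR #63 check
                  // A signed value fits in 64 bits iff its bit length,
                  // excluding the sign bit, is at most 63.
                  boolean actual = BigInteger.valueOf(a)
                                             .multiply(BigInteger.valueOf(b))
                                             .bitLength() > 63;
                  if (flagged != actual)
                    throw new AssertionError(a + " * " + b);
                }
              }
              System.out.println("hi == lo >> 63 agrees with exact overflow on all samples");
            }
          }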

