Uploaded image for project: 'JDK'
  1. JDK
  2. JDK-8255287

aarch64: fix SVE patterns for vector shift count

XMLWordPrintable

    • 16
    • b22
    • aarch64
    • generic

      Test case:

      /**
       * Stand-alone reproducer for constant-count shifts over {@code short} arrays.
       *
       * <p>Two simple loop kernels ({@link #test_lshift} and {@link #test_urshift})
       * shift every element of a {@code short[]} by the constant 10. The kernels are
       * invoked many times first, then run once more on freshly initialized data and
       * their output is compared against the same expression recomputed element by
       * element. Any mismatch raises a {@link RuntimeException}.
       *
       * <p>A shift count of 10 is larger than 8 but smaller than 16, the bit width
       * of a {@code short}, so a correct result is generally non-zero.
       */
      public class TestVectorShiftShorts {

          // Number of elements in each test array.
          private static final int ARRLEN = 1000;
          // Number of warm-up invocations of each kernel before the verified run.
          private static final int ITERS = 20000;

          /**
           * Runs the warm-up loop, then the verified left-shift and unsigned
           * right-shift passes, and prints "Test passed" if no check failed.
           *
           * @param args unused
           */
          public static void main(String args[]) {
              short[] a0 = new short[ARRLEN];
              short[] a1 = new short[ARRLEN];

              // Initialize
              test_init(a0, a1);

              // Warmup: call both kernels repeatedly.
              // NOTE(review): presumably this is what makes the JIT compile them
              // before the verified run; confirm against the VM flags in use.
              for (int i = 0; i < ITERS; i++) {
                  test_lshift(a0, a1);
                  test_urshift(a0, a1);
              }

              // Test and verify results. The arrays are re-initialized before each
              // kernel so the check does not depend on what the warm-up left in a0.
              test_init(a0, a1);
              test_lshift(a0, a1);
              verify_lshift(a0, a1);

              test_init(a0, a1);
              test_urshift(a0, a1);
              verify_urshift(a0, a1);

              // Finish
              System.out.println("Test passed");
          }

          /**
           * Fills both arrays with deterministic values.
           *
           * @param a0 destination array of the kernels; element i is set to {@code i & 3}
           * @param a1 source array of the kernels; element i is set to {@code (short) i},
           *           which is non-negative for every index below ARRLEN
           */
          static void test_init(short[] a0, short[] a1) {
              for (int i = 0; i < ARRLEN; i++) {
                  a0[i] = (short)(i & 3);
                  a1[i] = (short)i;
              }
          }

          /**
           * Left-shift kernel: stores {@code (short)(a1[i] << 10)} into {@code a0[i]}.
           * The shift is evaluated on the int-promoted value and the cast keeps the
           * low 16 bits.
           *
           * @param a0 output array, overwritten for indices 0..ARRLEN-1
           * @param a1 input array, read only
           */
          static void test_lshift(short[] a0, short[] a1) {
              for (int i = 0; i < ARRLEN; i++) {
                  a0[i] = (short)(a1[i] << 10);
              }
          }

          /**
           * Checks the output of {@link #test_lshift} by recomputing the same
           * expression for every element.
           *
           * @param a0 array holding the kernel's results
           * @param a1 array holding the kernel's inputs
           * @throws RuntimeException if any element of a0 differs from the recomputed value
           */
          static void verify_lshift(short[] a0, short[] a1) {
              for (int i = 0; i < ARRLEN; i++) {
                  if (a0[i] != (short)(a1[i] << 10)) {
                      throw new RuntimeException("LShift test failed.");
                  }
              }
          }

          /**
           * Unsigned-right-shift kernel: stores {@code (short)(a1[i] >>> 10)} into
           * {@code a0[i]}. The shift is evaluated on the int-promoted value and the
           * cast keeps the low 16 bits.
           *
           * @param a0 output array, overwritten for indices 0..ARRLEN-1
           * @param a1 input array, read only
           */
          static void test_urshift(short[] a0, short[] a1) {
              for (int i = 0; i < ARRLEN; i++) {
                  a0[i] = (short)(a1[i] >>> 10);
              }
          }

          /**
           * Checks the output of {@link #test_urshift} by recomputing the same
           * expression for every element.
           *
           * @param a0 array holding the kernel's results
           * @param a1 array holding the kernel's inputs
           * @throws RuntimeException if any element of a0 differs from the recomputed value
           */
          static void verify_urshift(short[] a0, short[] a1) {
              for (int i = 0; i < ARRLEN; i++) {
                  if (a0[i] != (short)(a1[i] >>> 10)) {
                      throw new RuntimeException("URshift test failed.");
                  }
              }
          }

      }

      Command line:
      $qemu-aarch64 -cpu max,sve-max-vq=2 java -XX:UseSVE=2 -Xbatch -XX:-TieredCompilation -XX:CompileCommand=compileonly,TestVectorShiftShorts::test_lshift -XX:+PrintCompilation -XX:+PrintAssembly TestVectorShiftShorts

      Take the following SVE pattern as an example:

      1504 instruct vlslS_imm(vReg dst, vReg src, immI shift) %{
      1505 predicate(UseSVE > 0 && n->as_Vector()->length() >= 8);
      1506 match(Set dst (LShiftVS src shift)); <========== Cannot match !!!
      1507 ins_cost(SVE_COST);
      1508 format %{ "sve_lsl $dst, $src, $shift\t# vector (sve) (H)" %}
      1509 ins_encode %{
      1510 int con = (int)$shift$$constant;
      1511 if (con >= 8) { <========== Should be 16 !!!
      1512 __ sve_eor(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg),
      1513 as_FloatRegister($src$$reg));
      1514 return;
      1515 }
      1516 __ sve_lsl(as_FloatRegister($dst$$reg), __ H,
      1517 as_FloatRegister($src$$reg), con);
      1518 %}
      1519 ins_pipe(pipe_slow);
      1520 %}


      1. Before fixing the matching rule
      329 ;; B13: # out( B13 B14 ) <- in( B12 B13 ) Loop( B13-B13 inner main of N98) Freq: 958.688
      330 0x000000551d61b610: sbfiz x10, x17, #1, #32 ;*sastore {reexecute=0 rethrow=0 return_oop=0}
      331 ; - TestVectorShiftShorts::test_lshift@18 (line 77)
      332 0x000000551d61b614: add x11, x2, x10 ;*saload {reexecute=0 rethrow=0 return_oop=0}
      333 ; - TestVectorShiftShorts::test_lshift@13 (line 77)
      334 0x000000551d61b618: add x12, x11, #0x10
      335 0x000000551d61b61c: ld1h {z20.h}, p7/z, [x12]
      336 0x000000551d61b620: lsl z20.h, p7/m, z20.h, z16.h <======================= Correct, but can be improved
      337 0x000000551d61b624: add x10, x1, x10
      338 0x000000551d61b628: add x12, x10, #0x10
      339 0x000000551d61b62c: st1h {z20.h}, p7, [x12]
      340 0x000000551d61b630: add x12, x11, #0x30
      341 0x000000551d61b634: ld1h {z20.h}, p7/z, [x12]
      342 0x000000551d61b638: lsl z20.h, p7/m, z20.h, z17.h
      343 0x000000551d61b63c: add x12, x10, #0x30
      344 0x000000551d61b640: st1h {z20.h}, p7, [x12]
      345 0x000000551d61b644: add x12, x11, #0x50
      346 0x000000551d61b648: ld1h {z20.h}, p7/z, [x12]
      347 0x000000551d61b64c: lsl z20.h, p7/m, z20.h, z19.h
      348 0x000000551d61b650: add x12, x10, #0x50
      349 0x000000551d61b654: st1h {z20.h}, p7, [x12]
      350 0x000000551d61b658: add x11, x11, #0x70
      351 0x000000551d61b65c: ld1h {z20.h}, p7/z, [x11]
      352 0x000000551d61b660: lsl z20.h, p7/m, z20.h, z18.h
      353 0x000000551d61b664: add x10, x10, #0x70
      354 0x000000551d61b668: st1h {z20.h}, p7, [x10] ;*sastore {reexecute=0 rethrow=0 return_oop=0}
      355 ; - TestVectorShiftShorts::test_lshift@18 (line 77)
      356 0x000000551d61b66c: add w17, w17, #0x40 ;*iinc {reexecute=0 rethrow=0 return_oop=0}
      357 ; - TestVectorShiftShorts::test_lshift@19 (line 76)
      358 0x000000551d61b670: cmp w17, #0x3a9
      359 0x000000551d61b674: b.lt 0x000000551d61b610

      2. After fixing the matching rule
      311 ;; B12: # out( B12 B13 ) <- in( B11 B12 ) Loop( B12-B12 inner main of N98) Freq: 958.688
      312 0x000000551d614270: sbfiz x10, x16, #1, #32 ;*sastore {reexecute=0 rethrow=0 return_oop=0}
      313 ; - TestVectorShiftShorts::test_lshift@18 (line 77)
      314 0x000000551d614274: add x11, x2, x10 ;*saload {reexecute=0 rethrow=0 return_oop=0}
      315 ; - TestVectorShiftShorts::test_lshift@13 (line 77)
      316 0x000000551d614278: add x12, x11, #0x10
      317 0x000000551d61427c: ld1h {z16.h}, p7/z, [x12]
      318 0x000000551d614280: eor z16.d, z16.d, z16.d <======================= WRONG !!!
      319 0x000000551d614284: add x10, x1, x10
      320 0x000000551d614288: add x12, x10, #0x10
      321 0x000000551d61428c: st1h {z16.h}, p7, [x12]
      322 0x000000551d614290: add x12, x11, #0x30
      323 0x000000551d614294: ld1h {z16.h}, p7/z, [x12]
      324 0x000000551d614298: eor z16.d, z16.d, z16.d
      325 0x000000551d61429c: add x12, x10, #0x30
      326 0x000000551d6142a0: st1h {z16.h}, p7, [x12]
      327 0x000000551d6142a4: add x12, x11, #0x50
      328 0x000000551d6142a8: ld1h {z16.h}, p7/z, [x12]
      329 0x000000551d6142ac: eor z16.d, z16.d, z16.d
      330 0x000000551d6142b0: add x12, x10, #0x50
      331 0x000000551d6142b4: st1h {z16.h}, p7, [x12]
      332 0x000000551d6142b8: add x11, x11, #0x70
      333 0x000000551d6142bc: ld1h {z16.h}, p7/z, [x11]
      334 0x000000551d6142c0: eor z16.d, z16.d, z16.d
      335 0x000000551d6142c4: add x10, x10, #0x70
      336 0x000000551d6142c8: st1h {z16.h}, p7, [x10] ;*sastore {reexecute=0 rethrow=0 return_oop=0}
      337 ; - TestVectorShiftShorts::test_lshift@18 (line 77)
      338 0x000000551d6142cc: add w16, w16, #0x40 ;*iinc {reexecute=0 rethrow=0 return_oop=0}
      339 ; - TestVectorShiftShorts::test_lshift@19 (line 76)
      340 0x000000551d6142d0: cmp w16, #0x3a9
      341 0x000000551d6142d4: b.lt 0x000000551d614270

      Note: The SVE eor instruction is incorrect here, and it causes the test case to fail.

      3. After fixing the code generation
      311 ;; B12: # out( B12 B13 ) <- in( B11 B12 ) Loop( B12-B12 inner main of N98) Freq: 958.688
      312 0x000000551d61b5f0: sbfiz x10, x16, #1, #32 ;*sastore {reexecute=0 rethrow=0 return_oop=0}
      313 ; - TestVectorShiftShorts::test_lshift@18 (line 77)
      314 0x000000551d61b5f4: add x11, x2, x10 ;*saload {reexecute=0 rethrow=0 return_oop=0}
      315 ; - TestVectorShiftShorts::test_lshift@13 (line 77)
      316 0x000000551d61b5f8: add x12, x11, #0x10
      317 0x000000551d61b5fc: ld1h {z16.h}, p7/z, [x12]
      318 0x000000551d61b600: lsl z16.h, z16.h, #10 <======================= Correct and better
      319 0x000000551d61b604: add x10, x1, x10
      320 0x000000551d61b608: add x12, x10, #0x10
      321 0x000000551d61b60c: st1h {z16.h}, p7, [x12]
      322 0x000000551d61b610: add x12, x11, #0x30
      323 0x000000551d61b614: ld1h {z16.h}, p7/z, [x12]
      324 0x000000551d61b618: lsl z16.h, z16.h, #10
      325 0x000000551d61b61c: add x12, x10, #0x30
      326 0x000000551d61b620: st1h {z16.h}, p7, [x12]
      327 0x000000551d61b624: add x12, x11, #0x50
      328 0x000000551d61b628: ld1h {z16.h}, p7/z, [x12]
      329 0x000000551d61b62c: lsl z16.h, z16.h, #10
      330 0x000000551d61b630: add x12, x10, #0x50
      331 0x000000551d61b634: st1h {z16.h}, p7, [x12]
      332 0x000000551d61b638: add x11, x11, #0x70
      333 0x000000551d61b63c: ld1h {z16.h}, p7/z, [x11]
      334 0x000000551d61b640: lsl z16.h, z16.h, #10
      335 0x000000551d61b644: add x10, x10, #0x70
      336 0x000000551d61b648: st1h {z16.h}, p7, [x10] ;*sastore {reexecute=0 rethrow=0 return_oop=0}
      337 ; - TestVectorShiftShorts::test_lshift@18 (line 77)
      338 0x000000551d61b64c: add w16, w16, #0x40 ;*iinc {reexecute=0 rethrow=0 return_oop=0}
      339 ; - TestVectorShiftShorts::test_lshift@19 (line 76)
      340 0x000000551d61b650: cmp w16, #0x3a9
      341 0x000000551d61b654: b.lt 0x000000551d61b5f0

      Please find the proposed patch attached. I will open a PR for review.

            fyang Fei Yang
            fyang Fei Yang
            Votes:
            0 Vote for this issue
            Watchers:
            4 Start watching this issue

              Created:
              Updated:
              Resolved: