Uploaded image for project: 'JDK'
  1. JDK
  2. JDK-8358474

Performance drop in SIMD kernel with records

XMLWordPrintable

    • Icon: Enhancement Enhancement
    • Resolution: Unresolved
    • Icon: P4 P4
    • 26
    • 25
    • hotspot
    • generic
    • generic

      The following microbenchmark shows a 6x performance degradation when a record is used to store an intermediate vector.

      import jdk.incubator.vector.*;
      import java.util.Arrays;
      import java.util.stream.IntStream;

      /**
       * Minimal wrapper record holding a single {@link ShortVector} component.
       * The reproducer uses it to route an intermediate vector value through an
       * object between two lanewise operations.
       */
      record Adapter(ShortVector vec) {
      }

      /**
       * Reproducer comparing two otherwise identical Vector API kernels:
       * {@link #micro1} passes the intermediate vector through an {@code Adapter}
       * record, {@link #micro2} uses the vector directly.
       *
       * <p>Usage: the first command-line argument selects the kernel to time:
       * {@code 0} runs {@code micro1}, {@code 1} runs {@code micro2}, and
       * {@code -1} runs both in that order.
       */
      public class test_ea_records {
          // Vector species used by both kernels for loads, loop bounds and strides.
          public static VectorSpecies<Short> SPEC = ShortVector.SPECIES_PREFERRED;

          /**
           * Wraps the given vector in a freshly constructed {@code Adapter}.
           *
           * @param vec vector to wrap
           * @return a new {@code Adapter} whose component is {@code vec}
           */
          public static Adapter getAdapter(ShortVector vec) {
             return new Adapter(vec);
          }

          /**
           * Kernel with the record: for each vector-sized chunk of {@code input},
           * multiplies the lanes by 3, 30, 300 and 3000 in turn, wraps the result
           * in an {@code Adapter}, reads it back through {@code vec()}, adds 10
           * and stores the result at the same offset in {@code output}.
           *
           * <p>Only indices below {@code SPEC.loopBound(input.length)} are
           * processed; there is no scalar tail loop.
           *
           * @param output destination array, written at the same indices that are read from {@code input}
           * @param input  source array
           */
          public static void micro1(short [] output, short [] input) {
              for (int i = 0; i < SPEC.loopBound(input.length); i += SPEC.length()) {
                  ShortVector inp = ShortVector.fromArray(SPEC, input, i);
                  inp = inp.lanewise(VectorOperators.MUL, (short)3);
                  inp = inp.lanewise(VectorOperators.MUL, (short)30);
                  inp = inp.lanewise(VectorOperators.MUL, (short)300);
                  inp = inp.lanewise(VectorOperators.MUL, (short)3000);
                  // The only difference from micro2: the vector is routed through a record.
                  Adapter obj = getAdapter(inp);
                  obj.vec().lanewise(VectorOperators.ADD, (short)10)
                           .intoArray(output, i);
              }
          }

          /**
           * Baseline kernel without the record: same multiply chain and final
           * add as {@link #micro1}, but the vector is used directly instead of
           * being wrapped in an {@code Adapter}.
           *
           * <p>Only indices below {@code SPEC.loopBound(input.length)} are
           * processed; there is no scalar tail loop.
           *
           * @param output destination array, written at the same indices that are read from {@code input}
           * @param input  source array
           */
          public static void micro2(short [] output, short [] input) {
              for (int i = 0; i < SPEC.loopBound(input.length); i += SPEC.length()) {
                  ShortVector inp = ShortVector.fromArray(SPEC, input, i);
                  inp = inp.lanewise(VectorOperators.MUL, (short)3);
                  inp = inp.lanewise(VectorOperators.MUL, (short)30);
                  inp = inp.lanewise(VectorOperators.MUL, (short)300);
                  inp = inp.lanewise(VectorOperators.MUL, (short)3000);
                  inp.lanewise(VectorOperators.ADD, (short)10)
                           .intoArray(output, i);
              }
          }

          /**
           * Entry point. Parses {@code args[0]} as the kernel selector, fills a
           * 2048-element input array with {@code input[i] = (short) i}, and for
           * each selected kernel runs 20000 untimed invocations followed by
           * 20000 timed invocations, printing the elapsed milliseconds and
           * {@code Arrays.hashCode(output)}.
           *
           * @param args {@code args[0]} must be an integer: 0, 1 or -1
           */
          public static void main(String [] args) {
              int algo = Integer.parseInt(args[0]);
              short [] input = new short[2048];
              short [] output = new short[2048];
              IntStream.range(0, input.length).forEach(i -> { input[i] = (short)i; });

              if (algo == 0 || algo == -1) {
                  // Untimed pass, presumably to let the JIT compile micro1 before measuring.
                  for (int i = 0; i < 20000; i++) {
                      micro1(output, input);
                  }
                  long t1 = System.currentTimeMillis();
                  for (int i = 0; i < 20000; i++) {
                      micro1(output, input);
                  }
                  long t2 = System.currentTimeMillis();
                  // The hash of the output lets the two kernels' results be compared by eye.
                  System.out.println("[time] " + (t2 - t1) + " ms [res] " + Arrays.hashCode(output));
              }

              if (algo == 1 || algo == -1) {
                  // Untimed pass, presumably to let the JIT compile micro2 before measuring.
                  for (int i = 0; i < 20000; i++) {
                      micro2(output, input);
                  }
                  long t1 = System.currentTimeMillis();
                  for (int i = 0; i < 20000; i++) {
                      micro2(output, input);
                  }
                  long t2 = System.currentTimeMillis();
                  // The hash of the output lets the two kernels' results be compared by eye.
                  System.out.println("[time] " + (t2 - t1) + " ms [res] " + Arrays.hashCode(output));
              }
          }
      }

      SPR2>java --add-modules=jdk.incubator.vector -Xbatch -XX:-TieredCompilation -cp . test_ea_records -1
      WARNING: Using incubator modules: jdk.incubator.vector
      [time] 31 ms [res] 1419051009
      [time] 5 ms [res] 1419051009

      The JIT-compiled sequence shows that the allocation for the record is not eliminated, despite the record being non-escaping.

        0x00007f69a4928b49: mov 0x1c8(%r15),%rax
        0x00007f69a4928b50: mov %rax,%r10
        0x00007f69a4928b53: add $0x50,%r10
        0x00007f69a4928b57: vpbroadcastd -0xe1(%rip),%zmm0 # 0x00007f69a4928a80
                                                                  ; {section_word}
        0x00007f69a4928b61: vpmullw 0x10(%r9,%rbp,2),%zmm0,%zmm0
        0x00007f69a4928b6c: vpbroadcastd -0xf2(%rip),%zmm1 # 0x00007f69a4928a84
                                                                  ; {section_word}
        0x00007f69a4928b76: vpmullw %zmm1,%zmm0,%zmm0
        0x00007f69a4928b7c: vpbroadcastd -0xfe(%rip),%zmm1 # 0x00007f69a4928a88
                                                                  ; {section_word}
        0x00007f69a4928b86: vpmullw %zmm1,%zmm0,%zmm1
        0x00007f69a4928b8c: cmp 0x1d8(%r15),%r10
        0x00007f69a4928b93: jae 0x00007f69a4928c4d
        0x00007f69a4928b99: mov %r10,0x1c8(%r15)
        0x00007f69a4928ba0: prefetchw 0xc0(%r10)
        0x00007f69a4928ba8: movq $0x1,(%rax)
        0x00007f69a4928baf: prefetchw 0x100(%r10)
        0x00007f69a4928bb7: movl $0x164930,0x8(%rax) ; {metadata({type array short})}
        0x00007f69a4928bbe: prefetchw 0x140(%r10)
        0x00007f69a4928bc6: movl $0x20,0xc(%rax)
        0x00007f69a4928bcd: prefetchw 0x180(%r10)
        0x00007f69a4928bd5: mov %rax,%rsi
        0x00007f69a4928bd8: add $0x10,%rsi
        0x00007f69a4928bdc: vpxord %zmm0,%zmm0,%zmm0
        0x00007f69a4928be2: vmovdqu64 %zmm0,(%rsi)
        0x00007f69a4928be8: vpbroadcastd -0x166(%rip),%zmm0 # 0x00007f69a4928a8c
                                                                  ; {section_word}
        0x00007f69a4928bf2: vpmullw %zmm0,%zmm1,%zmm0
        0x00007f69a4928bf8: vmovdqu32 %zmm0,0x10(%rax)
        0x00007f69a4928c02: vpbroadcastd -0x17c(%rip),%zmm1 # 0x00007f69a4928a90
                                                                  ; {section_word}
        0x00007f69a4928c0c: vpaddw %zmm0,%zmm1,%zmm0
        0x00007f69a4928c12: cmp %edi,%ebp
        0x00007f69a4928c14: jae 0x00007f69a4928d00
        0x00007f69a4928c1a: vmovdqu32 %zmm0,0x10(%rbx,%rbp,2)
        0x00007f69a4928c25: add %r11d,%ebp
        0x00007f69a4928c28: mov 0x30(%r15),%r10 ; ImmutableOopMap {r9=Oop rbx=Oop rdx=Oop }
                                                                  ;*goto {reexecute=1 rethrow=0 return_oop=0}
                                                                  ; - (reexecute) test_ea_records::micro@101 (line 13)
        0x00007f69a4928c2c: test %eax,(%r10) ; {poll}
        0x00007f69a4928c2f: cmp %ecx,%ebp
        0x00007f69a4928c31: jl 0x00007f69a4928b40
        0x00007f69a4928c37: vzeroupper
        0x00007f69a4928c3a: add $0xd0,%rsp
        0x00007f69a4928c41: pop %rbp
        0x00007f69a4928c42: cmp 0x28(%r15),%rsp ; {poll_return}
        0x00007f69a4928c46: ja 0x00007f69a4928deb
        0x00007f69a4928c4c: ret

            jbhateja Jatin Bhateja
            jbhateja Jatin Bhateja
            Votes:
            0 Vote for this issue
            Watchers:
            1 Start watching this issue

              Created:
              Updated: