JDK-8362596: RISC-V: Improve _vectorizedHashCode intrinsic

    • Type: Enhancement
    • Resolution: Unresolved
    • Priority: P4
    • Component: hotspot
    • CPU: riscv
    • OS: linux

      It has been observed that the scalar implementation of the
      _vectorizedHashCode intrinsic performs worse than the non-intrinsified
      code on BPI-F3 hardware [1] once the array size is large enough
      (~70 elements and above):
      {code}
      bpif3-16g% ( for i in "-XX:DisableIntrinsic=_vectorizedHashCode" "" ; \
          do ( echo "--- ${i} ---" && ${JAVA_HOME}/bin/java -jar benchmarks.jar \
                     --jvmArgs="-XX:+UnlockDiagnosticVMOptions -XX:+UnlockExperimentalVMOptions ${i}" \
                     org.openjdk.bench.java.lang.ArraysHashCode.ints \
                     -p size=1,5,10,20,30,40,50,60,70,80,90,100,200,300 \
                     -f 3 -r 1 -w 1 -wi 10 -i 10 2>&1 | tail -15 ) done )
      --- -XX:DisableIntrinsic=_vectorizedHashCode ---
      Benchmark (size) Mode Cnt Score Error Units
      ArraysHashCode.ints 1 avgt 30 11.297 ± 0.021 ns/op
      ArraysHashCode.ints 5 avgt 30 28.907 ± 0.117 ns/op
      ArraysHashCode.ints 10 avgt 30 41.196 ± 0.218 ns/op
      ArraysHashCode.ints 20 avgt 30 68.403 ± 0.118 ns/op
      ArraysHashCode.ints 30 avgt 30 88.732 ± 0.506 ns/op
      ArraysHashCode.ints 40 avgt 30 115.166 ± 0.103 ns/op
      ArraysHashCode.ints 50 avgt 30 136.047 ± 0.487 ns/op
      ArraysHashCode.ints 60 avgt 30 161.985 ± 0.193 ns/op
      ArraysHashCode.ints 70 avgt 30 170.613 ± 0.506 ns/op <---
      ArraysHashCode.ints 80 avgt 30 194.457 ± 0.547 ns/op
      ArraysHashCode.ints 90 avgt 30 207.872 ± 0.305 ns/op
      ArraysHashCode.ints 100 avgt 30 231.960 ± 0.338 ns/op
      ArraysHashCode.ints 200 avgt 30 448.387 ± 1.186 ns/op
      ArraysHashCode.ints 300 avgt 30 655.308 ± 0.146 ns/op
      --- ---
      Benchmark (size) Mode Cnt Score Error Units
      ArraysHashCode.ints 1 avgt 30 11.295 ± 0.022 ns/op
      ArraysHashCode.ints 5 avgt 30 24.426 ± 0.005 ns/op
      ArraysHashCode.ints 10 avgt 30 35.734 ± 0.034 ns/op
      ArraysHashCode.ints 20 avgt 30 58.876 ± 0.015 ns/op
      ArraysHashCode.ints 30 avgt 30 82.964 ± 0.271 ns/op
      ArraysHashCode.ints 40 avgt 30 105.866 ± 0.027 ns/op
      ArraysHashCode.ints 50 avgt 30 129.875 ± 0.230 ns/op
      ArraysHashCode.ints 60 avgt 30 153.074 ± 0.331 ns/op
      ArraysHashCode.ints 70 avgt 30 176.633 ± 0.072 ns/op <---
      ArraysHashCode.ints 80 avgt 30 199.799 ± 0.049 ns/op
      ArraysHashCode.ints 90 avgt 30 223.666 ± 0.087 ns/op
      ArraysHashCode.ints 100 avgt 30 247.609 ± 0.447 ns/op
      ArraysHashCode.ints 200 avgt 30 481.884 ± 0.612 ns/op
      ArraysHashCode.ints 300 avgt 30 716.558 ± 0.197 ns/op
      {code}
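
      For context, below is a minimal Java sketch (illustrative method names,
      not the actual benchmark code) of the polynomial hash being intrinsified
      and of the 4-way unrolled form computed by the intrinsic's wide loop;
      the 31^4, 31^3 and 31^2 coefficients correspond to the pow31_* constants
      seen in the diff further down:
      {code}
      // What java.util.Arrays.hashCode(int[]) computes for a non-null array.
      static int hashScalar(int[] a) {
          int h = 1;
          for (int v : a) {
              h = 31 * h + v;
          }
          return h;
      }

      // Equivalent 4-way unrolled form: applying h = 31*h + a[i] four times
      // collapses into a single step with constant coefficients.
      static int hashUnrolled4(int[] a) {
          int h = 1;
          int i = 0;
          for (; i + 4 <= a.length; i += 4) {
              h = 923521 * h         // 31^4 * h
                + 29791 * a[i]       // 31^3 * a[i+0]
                + 961 * a[i + 1]     // 31^2 * a[i+1]
                + 31 * a[i + 2]      // 31   * a[i+2]
                + a[i + 3];          //        a[i+3]
          }
          for (; i < a.length; i++) { // tail, analogous to the TAIL_LOOP in the stub
              h = 31 * h + a[i];
          }
          return h;
      }
      {code}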
       
      The following small change fixes the regression on BPI-F3 and does not
      introduce regressions on the other RISC-V CPUs available for testing.
      The core of the change is moving the first `mulw` of the loop body to
      after the four loads, which presumably lets the loads issue ahead of the
      multiply and overlaps their latency with it on an in-order pipeline:
       
      {code}
      $ git diff
      diff --git a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp
      index c62997310b3..f98b48adccd 100644
      --- a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp
      +++ b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp
      @@ -1953,16 +1953,15 @@ void C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register res
         mv(pow31_3, 29791); // [31^^3]
         mv(pow31_2, 961); // [31^^2]
      - slli(chunks_end, chunks, chunks_end_shift);
      - add(chunks_end, ary, chunks_end);
      + shadd(chunks_end, chunks, ary, t0, chunks_end_shift);
         andi(cnt, cnt, stride - 1); // don't forget about tail!
         bind(WIDE_LOOP);
      - mulw(result, result, pow31_4); // 31^^4 * h
         arrays_hashcode_elload(t0, Address(ary, 0 * elsize), eltype);
         arrays_hashcode_elload(t1, Address(ary, 1 * elsize), eltype);
         arrays_hashcode_elload(tmp5, Address(ary, 2 * elsize), eltype);
         arrays_hashcode_elload(tmp6, Address(ary, 3 * elsize), eltype);
      + mulw(result, result, pow31_4); // 31^^4 * h
         mulw(t0, t0, pow31_3); // 31^^3 * ary[i+0]
         addw(result, result, t0);
         mulw(t1, t1, pow31_2); // 31^^2 * ary[i+1]
      @@ -1977,8 +1976,7 @@ void C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register res
         beqz(cnt, DONE);
         bind(TAIL);
      - slli(chunks_end, cnt, chunks_end_shift);
      - add(chunks_end, ary, chunks_end);
      + shadd(chunks_end, cnt, ary, t0, chunks_end_shift);
         bind(TAIL_LOOP);
         arrays_hashcode_elload(t0, Address(ary), eltype);
      {code}
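
      For reference, `shadd(chunks_end, chunks, ary, t0, chunks_end_shift)`
      computes the same address as the removed `slli` + `add` pair (and can
      typically be folded into a single sh1add/sh2add/sh3add instruction when
      the Zba extension is available). A small Java sketch of the address
      arithmetic, with illustrative names:
      {code}
      // Illustrative only: how the end-of-chunks address is formed.
      static long chunksEnd(long ary, long chunks, int chunksEndShift) {
          // chunks_end = ary + (chunks << chunks_end_shift): the address just
          // past the last full 4-element chunk processed by the wide loop.
          return ary + (chunks << chunksEndShift);
      }
      {code}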

            Assignee: Vladimir Kempik
            Reporter: Vladimir Kempik