Uploaded image for project: 'JDK'
  1. JDK
  2. JDK-8362596

RISC-V: Improve _vectorizedHashCode intrinsic

XMLWordPrintable

    • b09
    • riscv
    • linux

        It has been observed that the scalar version of the _vectorizedHashCode
        intrinsic performs worse than the non-intrinsic code on BPI-F3
        hardware[1] for sufficiently large (~70 elements and above) array sizes:
        {code}
        bpif3-16g% ( for i in "-XX:DisableIntrinsic=_vectorizedHashCode" "" ; \
            do ( echo "--- ${i} ---" && ${JAVA_HOME}/bin/java -jar benchmarks.jar \
                       --jvmArgs="-XX:+UnlockDiagnosticVMOptions -XX:+UnlockExperimentalVMOptions ${i}" \
                       org.openjdk.bench.java.lang.ArraysHashCode.ints \
                       -p size=1,5,10,20,30,40,50,60,70,80,90,100,200,300 \
                       -f 3 -r 1 -w 1 -wi 10 -i 10 2>&1 | tail -15 ) done )
        --- -XX:DisableIntrinsic=_vectorizedHashCode ---
        Benchmark (size) Mode Cnt Score Error Units
        ArraysHashCode.ints 1 avgt 30 11.297 ± 0.021 ns/op
        ArraysHashCode.ints 5 avgt 30 28.907 ± 0.117 ns/op
        ArraysHashCode.ints 10 avgt 30 41.196 ± 0.218 ns/op
        ArraysHashCode.ints 20 avgt 30 68.403 ± 0.118 ns/op
        ArraysHashCode.ints 30 avgt 30 88.732 ± 0.506 ns/op
        ArraysHashCode.ints 40 avgt 30 115.166 ± 0.103 ns/op
        ArraysHashCode.ints 50 avgt 30 136.047 ± 0.487 ns/op
        ArraysHashCode.ints 60 avgt 30 161.985 ± 0.193 ns/op
        ArraysHashCode.ints 70 avgt 30 170.613 ± 0.506 ns/op <---
        ArraysHashCode.ints 80 avgt 30 194.457 ± 0.547 ns/op
        ArraysHashCode.ints 90 avgt 30 207.872 ± 0.305 ns/op
        ArraysHashCode.ints 100 avgt 30 231.960 ± 0.338 ns/op
        ArraysHashCode.ints 200 avgt 30 448.387 ± 1.186 ns/op
        ArraysHashCode.ints 300 avgt 30 655.308 ± 0.146 ns/op
        --- ---
        Benchmark (size) Mode Cnt Score Error Units
        ArraysHashCode.ints 1 avgt 30 11.295 ± 0.022 ns/op
        ArraysHashCode.ints 5 avgt 30 24.426 ± 0.005 ns/op
        ArraysHashCode.ints 10 avgt 30 35.734 ± 0.034 ns/op
        ArraysHashCode.ints 20 avgt 30 58.876 ± 0.015 ns/op
        ArraysHashCode.ints 30 avgt 30 82.964 ± 0.271 ns/op
        ArraysHashCode.ints 40 avgt 30 105.866 ± 0.027 ns/op
        ArraysHashCode.ints 50 avgt 30 129.875 ± 0.230 ns/op
        ArraysHashCode.ints 60 avgt 30 153.074 ± 0.331 ns/op
        ArraysHashCode.ints 70 avgt 30 176.633 ± 0.072 ns/op <---
        ArraysHashCode.ints 80 avgt 30 199.799 ± 0.049 ns/op
        ArraysHashCode.ints 90 avgt 30 223.666 ± 0.087 ns/op
        ArraysHashCode.ints 100 avgt 30 247.609 ± 0.447 ns/op
        ArraysHashCode.ints 200 avgt 30 481.884 ± 0.612 ns/op
        ArraysHashCode.ints 300 avgt 30 716.558 ± 0.197 ns/op
        {code}
         
        The following small change fixes the regression on BPI-F3 and
        doesn't introduce regressions on other available RISC-V CPUs.
        The core of the change is moving the first `mulw` in the loop
        to after the four loads:
         
        {code}
        $ git diff
        diff --git a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp
        index c62997310b3..f98b48adccd 100644
        --- a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp
        +++ b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp
        @@ -1953,16 +1953,15 @@ void C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register res
           mv(pow31_3, 29791); // [31^^3]
           mv(pow31_2, 961); // [31^^2]
        - slli(chunks_end, chunks, chunks_end_shift);
        - add(chunks_end, ary, chunks_end);
        + shadd(chunks_end, chunks, ary, t0, chunks_end_shift);
           andi(cnt, cnt, stride - 1); // don't forget about tail!
           bind(WIDE_LOOP);
        - mulw(result, result, pow31_4); // 31^^4 * h
           arrays_hashcode_elload(t0, Address(ary, 0 * elsize), eltype);
           arrays_hashcode_elload(t1, Address(ary, 1 * elsize), eltype);
           arrays_hashcode_elload(tmp5, Address(ary, 2 * elsize), eltype);
           arrays_hashcode_elload(tmp6, Address(ary, 3 * elsize), eltype);
        + mulw(result, result, pow31_4); // 31^^4 * h
           mulw(t0, t0, pow31_3); // 31^^3 * ary[i+0]
           addw(result, result, t0);
           mulw(t1, t1, pow31_2); // 31^^2 * ary[i+1]
        @@ -1977,8 +1976,7 @@ void C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register res
           beqz(cnt, DONE);
           bind(TAIL);
        - slli(chunks_end, cnt, chunks_end_shift);
        - add(chunks_end, ary, chunks_end);
        + shadd(chunks_end, cnt, ary, t0, chunks_end_shift);
           bind(TAIL_LOOP);
           arrays_hashcode_elload(t0, Address(ary), eltype);
        {code}

              vkempik Vladimir Kempik
              vkempik Vladimir Kempik
              Votes:
              0 Vote for this issue
              Watchers:
              3 Start watching this issue

                Created:
                Updated:
                Resolved: