The scalar version of the _vectorizedHashCode intrinsic has been found
to perform worse than the non-intrinsic code on BPI-F3 hardware[1] for
sufficiently large array sizes (about 70 elements and above):
{code}
bpif3-16g% ( for i in "-XX:DisableIntrinsic=_vectorizedHashCode" "" ; \
do ( echo "--- ${i} ---" && ${JAVA_HOME}/bin/java -jar benchmarks.jar \
--jvmArgs="-XX:+UnlockDiagnosticVMOptions -XX:+UnlockExperimentalVMOptions ${i}" \
org.openjdk.bench.java.lang.ArraysHashCode.ints \
-p size=1,5,10,20,30,40,50,60,70,80,90,100,200,300 \
-f 3 -r 1 -w 1 -wi 10 -i 10 2>&1 | tail -15 ) done )
--- -XX:DisableIntrinsic=_vectorizedHashCode ---
Benchmark (size) Mode Cnt Score Error Units
ArraysHashCode.ints 1 avgt 30 11.297 ± 0.021 ns/op
ArraysHashCode.ints 5 avgt 30 28.907 ± 0.117 ns/op
ArraysHashCode.ints 10 avgt 30 41.196 ± 0.218 ns/op
ArraysHashCode.ints 20 avgt 30 68.403 ± 0.118 ns/op
ArraysHashCode.ints 30 avgt 30 88.732 ± 0.506 ns/op
ArraysHashCode.ints 40 avgt 30 115.166 ± 0.103 ns/op
ArraysHashCode.ints 50 avgt 30 136.047 ± 0.487 ns/op
ArraysHashCode.ints 60 avgt 30 161.985 ± 0.193 ns/op
ArraysHashCode.ints 70 avgt 30 170.613 ± 0.506 ns/op <---
ArraysHashCode.ints 80 avgt 30 194.457 ± 0.547 ns/op
ArraysHashCode.ints 90 avgt 30 207.872 ± 0.305 ns/op
ArraysHashCode.ints 100 avgt 30 231.960 ± 0.338 ns/op
ArraysHashCode.ints 200 avgt 30 448.387 ± 1.186 ns/op
ArraysHashCode.ints 300 avgt 30 655.308 ± 0.146 ns/op
--- ---
Benchmark (size) Mode Cnt Score Error Units
ArraysHashCode.ints 1 avgt 30 11.295 ± 0.022 ns/op
ArraysHashCode.ints 5 avgt 30 24.426 ± 0.005 ns/op
ArraysHashCode.ints 10 avgt 30 35.734 ± 0.034 ns/op
ArraysHashCode.ints 20 avgt 30 58.876 ± 0.015 ns/op
ArraysHashCode.ints 30 avgt 30 82.964 ± 0.271 ns/op
ArraysHashCode.ints 40 avgt 30 105.866 ± 0.027 ns/op
ArraysHashCode.ints 50 avgt 30 129.875 ± 0.230 ns/op
ArraysHashCode.ints 60 avgt 30 153.074 ± 0.331 ns/op
ArraysHashCode.ints 70 avgt 30 176.633 ± 0.072 ns/op <---
ArraysHashCode.ints 80 avgt 30 199.799 ± 0.049 ns/op
ArraysHashCode.ints 90 avgt 30 223.666 ± 0.087 ns/op
ArraysHashCode.ints 100 avgt 30 247.609 ± 0.447 ns/op
ArraysHashCode.ints 200 avgt 30 481.884 ± 0.612 ns/op
ArraysHashCode.ints 300 avgt 30 716.558 ± 0.197 ns/op
{code}
The following small change fixes the regression on BPI-F3 and
does not introduce regressions on other available RISC-V CPUs.
The core of the change is moving the first `mulw` in the loop
so that it is issued after the four loads:
{code}
$ git diff
diff --git a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp
index c62997310b3..f98b48adccd 100644
--- a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp
+++ b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp
@@ -1953,16 +1953,15 @@ void C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register res
mv(pow31_3, 29791); // [31^^3]
mv(pow31_2, 961); // [31^^2]
- slli(chunks_end, chunks, chunks_end_shift);
- add(chunks_end, ary, chunks_end);
+ shadd(chunks_end, chunks, ary, t0, chunks_end_shift);
andi(cnt, cnt, stride - 1); // don't forget about tail!
bind(WIDE_LOOP);
- mulw(result, result, pow31_4); // 31^^4 * h
arrays_hashcode_elload(t0, Address(ary, 0 * elsize), eltype);
arrays_hashcode_elload(t1, Address(ary, 1 * elsize), eltype);
arrays_hashcode_elload(tmp5, Address(ary, 2 * elsize), eltype);
arrays_hashcode_elload(tmp6, Address(ary, 3 * elsize), eltype);
+ mulw(result, result, pow31_4); // 31^^4 * h
mulw(t0, t0, pow31_3); // 31^^3 * ary[i+0]
addw(result, result, t0);
mulw(t1, t1, pow31_2); // 31^^2 * ary[i+1]
@@ -1977,8 +1976,7 @@ void C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register res
beqz(cnt, DONE);
bind(TAIL);
- slli(chunks_end, cnt, chunks_end_shift);
- add(chunks_end, ary, chunks_end);
+ shadd(chunks_end, cnt, ary, t0, chunks_end_shift);
bind(TAIL_LOOP);
arrays_hashcode_elload(t0, Address(ary), eltype);
{code}
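The scalar version of the _vectorizedHashCode intrinsic has been found
to perform worse than the non-intrinsic code on BPI-F3 hardware[1] for
sufficiently large array sizes (about 70 elements and above):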
{code}
bpif3-16g% ( for i in "-XX:DisableIntrinsic=_vectorizedHashCode" "" ; \
do ( echo "--- ${i} ---" && ${JAVA_HOME}/bin/java -jar benchmarks.jar \
--jvmArgs="-XX:+UnlockDiagnosticVMOptions -XX:+UnlockExperimentalVMOptions ${i}" \
org.openjdk.bench.java.lang.ArraysHashCode.ints \
-p size=1,5,10,20,30,40,50,60,70,80,90,100,200,300 \
-f 3 -r 1 -w 1 -wi 10 -i 10 2>&1 | tail -15 ) done )
--- -XX:DisableIntrinsic=_vectorizedHashCode ---
Benchmark (size) Mode Cnt Score Error Units
ArraysHashCode.ints 1 avgt 30 11.297 ± 0.021 ns/op
ArraysHashCode.ints 5 avgt 30 28.907 ± 0.117 ns/op
ArraysHashCode.ints 10 avgt 30 41.196 ± 0.218 ns/op
ArraysHashCode.ints 20 avgt 30 68.403 ± 0.118 ns/op
ArraysHashCode.ints 30 avgt 30 88.732 ± 0.506 ns/op
ArraysHashCode.ints 40 avgt 30 115.166 ± 0.103 ns/op
ArraysHashCode.ints 50 avgt 30 136.047 ± 0.487 ns/op
ArraysHashCode.ints 60 avgt 30 161.985 ± 0.193 ns/op
ArraysHashCode.ints 70 avgt 30 170.613 ± 0.506 ns/op <---
ArraysHashCode.ints 80 avgt 30 194.457 ± 0.547 ns/op
ArraysHashCode.ints 90 avgt 30 207.872 ± 0.305 ns/op
ArraysHashCode.ints 100 avgt 30 231.960 ± 0.338 ns/op
ArraysHashCode.ints 200 avgt 30 448.387 ± 1.186 ns/op
ArraysHashCode.ints 300 avgt 30 655.308 ± 0.146 ns/op
--- ---
Benchmark (size) Mode Cnt Score Error Units
ArraysHashCode.ints 1 avgt 30 11.295 ± 0.022 ns/op
ArraysHashCode.ints 5 avgt 30 24.426 ± 0.005 ns/op
ArraysHashCode.ints 10 avgt 30 35.734 ± 0.034 ns/op
ArraysHashCode.ints 20 avgt 30 58.876 ± 0.015 ns/op
ArraysHashCode.ints 30 avgt 30 82.964 ± 0.271 ns/op
ArraysHashCode.ints 40 avgt 30 105.866 ± 0.027 ns/op
ArraysHashCode.ints 50 avgt 30 129.875 ± 0.230 ns/op
ArraysHashCode.ints 60 avgt 30 153.074 ± 0.331 ns/op
ArraysHashCode.ints 70 avgt 30 176.633 ± 0.072 ns/op <---
ArraysHashCode.ints 80 avgt 30 199.799 ± 0.049 ns/op
ArraysHashCode.ints 90 avgt 30 223.666 ± 0.087 ns/op
ArraysHashCode.ints 100 avgt 30 247.609 ± 0.447 ns/op
ArraysHashCode.ints 200 avgt 30 481.884 ± 0.612 ns/op
ArraysHashCode.ints 300 avgt 30 716.558 ± 0.197 ns/op
{code}
The following small change fixes the regression on BPI-F3 and
does not introduce regressions on other available RISC-V CPUs.
The core of the change is moving the first `mulw` in the loop
so that it is issued after the four loads:
{code}
$ git diff
diff --git a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp
index c62997310b3..f98b48adccd 100644
--- a/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp
+++ b/src/hotspot/cpu/riscv/c2_MacroAssembler_riscv.cpp
@@ -1953,16 +1953,15 @@ void C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register res
mv(pow31_3, 29791); // [31^^3]
mv(pow31_2, 961); // [31^^2]
- slli(chunks_end, chunks, chunks_end_shift);
- add(chunks_end, ary, chunks_end);
+ shadd(chunks_end, chunks, ary, t0, chunks_end_shift);
andi(cnt, cnt, stride - 1); // don't forget about tail!
bind(WIDE_LOOP);
- mulw(result, result, pow31_4); // 31^^4 * h
arrays_hashcode_elload(t0, Address(ary, 0 * elsize), eltype);
arrays_hashcode_elload(t1, Address(ary, 1 * elsize), eltype);
arrays_hashcode_elload(tmp5, Address(ary, 2 * elsize), eltype);
arrays_hashcode_elload(tmp6, Address(ary, 3 * elsize), eltype);
+ mulw(result, result, pow31_4); // 31^^4 * h
mulw(t0, t0, pow31_3); // 31^^3 * ary[i+0]
addw(result, result, t0);
mulw(t1, t1, pow31_2); // 31^^2 * ary[i+1]
@@ -1977,8 +1976,7 @@ void C2_MacroAssembler::arrays_hashcode(Register ary, Register cnt, Register res
beqz(cnt, DONE);
bind(TAIL);
- slli(chunks_end, cnt, chunks_end_shift);
- add(chunks_end, ary, chunks_end);
+ shadd(chunks_end, cnt, ary, t0, chunks_end_shift);
bind(TAIL_LOOP);
arrays_hashcode_elload(t0, Address(ary), eltype);
{code}
Relates to:
- JDK-8318217 RISC-V: C2 VectorizedHashCode (Resolved)
- JDK-8322174 RISC-V: C2 VectorizedHashCode RVV Version (Open)