-
Enhancement
-
Resolution: Fixed
-
P3
-
17
-
b08
See the attached benchmark.
Accessing a byte[] through an array view VarHandle that views the array as a sequence of ints (i.e. mismatched access) is about 3x slower than normal array access on an int[].
Benchmark Mode Cnt Score Error Units
MyBenchmark.array avgt 3 5.401 ± 1.426 us/op
MyBenchmark.array_view avgt 3 15.221 ± 11.024 us/op
Looking at the generated code, the array_view case is missing vectorization:
↗ 0x000001df903ec700: movsxd rbp,r11d ;*i2l {reexecute=0 rethrow=0 return_oop=0}
│ ; - java.lang.invoke.VarHandleByteArrayAsInts$ArrayHandle::get@22 (line 115)
│ ; - java.lang.invoke.VarHandleGuards::guard_LI_I@108 (line 193)
│ ; - org.sample.MyBenchmark::array_view@19 (line 96)
│ ; - org.sample.jmh_generated.MyBenchmark_array_view_jmhTest::array_view_avgt_jmhStub@17 (line 195)
0.23% │ 0x000001df903ec703: inc dword ptr [rdx+rbp+10h]
31.07% │ 0x000001df903ec707: inc dword ptr [rdx+rbp+14h]
23.35% │ 0x000001df903ec70b: inc dword ptr [rdx+rbp+18h]
6.55% │ 0x000001df903ec70f: inc dword ptr [rdx+rbp+1ch]
10.70% │ 0x000001df903ec713: inc dword ptr [rdx+rbp+20h]
3.11% │ 0x000001df903ec717: inc dword ptr [rdx+rbp+24h]
8.50% │ 0x000001df903ec71b: inc dword ptr [rdx+rbp+28h]
1.23% │ 0x000001df903ec71f: inc dword ptr [rdx+rbp+2ch] ;*invokevirtual putIntUnaligned {reexecute=0 rethrow=0 return_oop=0}
│ ; - jdk.internal.misc.Unsafe::putIntUnaligned@10 (line 3718)
│ ; - java.lang.invoke.VarHandleByteArrayAsInts$ArrayHandle::set@35 (line 123)
│ ; - java.lang.invoke.VarHandleGuards::guard_LII_V@116 (line 919)
│ ; - org.sample.MyBenchmark::array_view@34 (line 97)
│ ; - org.sample.jmh_generated.MyBenchmark_array_view_jmhTest::array_view_avgt_jmhStub@17 (line 195)
7.36% │ 0x000001df903ec723: add r11d,20h ;*iinc {reexecute=0 rethrow=0 return_oop=0}
│ ; - org.sample.MyBenchmark::array_view@37 (line 95)
│ ; - org.sample.jmh_generated.MyBenchmark_array_view_jmhTest::array_view_avgt_jmhStub@17 (line 195)
0.19% │ 0x000001df903ec727: cmp r11d,ecx
2.89% ╰ 0x000001df903ec72a: jl 1df903ec700h ;*goto {reexecute=0 rethrow=0 return_oop=0}
; - org.sample.MyBenchmark::array_view@40 (line 95)
; - org.sample.jmh_generated.MyBenchmark_array_view_jmhTest::array_view_avgt_jmhStub@17 (line 195)
Compared with the code generated for the plain array case, which is vectorized:
↗│ 0x0000024307b09c80: vpaddd ymm0,ymm1,ymmword ptr [rdx+rcx*4+10h]
6.75% ││ 0x0000024307b09c86: vmovdqu ymmword ptr [rdx+rcx*4+10h],ymm0
2.64% ││ 0x0000024307b09c8c: vpaddd ymm0,ymm1,ymmword ptr [rdx+rcx*4+30h]
11.80% ││ 0x0000024307b09c92: vmovdqu ymmword ptr [rdx+rcx*4+30h],ymm0
2.80% ││ 0x0000024307b09c98: vpaddd ymm0,ymm1,ymmword ptr [rdx+rcx*4+50h]
6.23% ││ 0x0000024307b09c9e: vmovdqu ymmword ptr [rdx+rcx*4+50h],ymm0
3.39% ││ 0x0000024307b09ca4: vpaddd ymm0,ymm1,ymmword ptr [rdx+rcx*4+70h]
9.65% ││ 0x0000024307b09caa: vmovdqu ymmword ptr [rdx+rcx*4+70h],ymm0
3.62% ││ 0x0000024307b09cb0: vpaddd ymm0,ymm1,ymmword ptr [rdx+rcx*4+90h]
6.52% ││ 0x0000024307b09cb9: vmovdqu ymmword ptr [rdx+rcx*4+90h],ymm0
3.68% ││ 0x0000024307b09cc2: vpaddd ymm0,ymm1,ymmword ptr [rdx+rcx*4+0b0h]
10.24% ││ 0x0000024307b09ccb: vmovdqu ymmword ptr [rdx+rcx*4+0b0h],ymm0
3.55% ││ 0x0000024307b09cd4: vpaddd ymm0,ymm1,ymmword ptr [rdx+rcx*4+0d0h]
6.98% ││ 0x0000024307b09cdd: vmovdqu ymmword ptr [rdx+rcx*4+0d0h],ymm0
4.34% ││ 0x0000024307b09ce6: vpaddd ymm0,ymm1,ymmword ptr [rdx+rcx*4+0f0h]
9.94% ││ 0x0000024307b09cef: vmovdqu ymmword ptr [rdx+rcx*4+0f0h],ymm0;*iastore {reexecute=0 rethrow=0 return_oop=0}
││ ; - org.sample.MyBenchmark::array@20 (line 82)
││ ; - org.sample.jmh_generated.MyBenchmark_array_jmhTest::array_avgt_jmhStub@17 (line 195)
3.98% ││ 0x0000024307b09cf8: add ecx,40h ;*iinc {reexecute=0 rethrow=0 return_oop=0}
││ ; - org.sample.MyBenchmark::array@21 (line 81)
││ ; - org.sample.jmh_generated.MyBenchmark_array_jmhTest::array_avgt_jmhStub@17 (line 195)
0.23% ││ 0x0000024307b09cfb: cmp ecx,r9d
╰│ 0x0000024307b09cfe: jl 24307b09c80h ;*goto {reexecute=0 rethrow=0 return_oop=0}
│ ; - org.sample.MyBenchmark::array@24 (line 81)
│ ; - org.sample.jmh_generated.MyBenchmark_array_jmhTest::array_avgt_jmhStub@17 (line 195)
This issue seems very similar to https://bugs.openjdk.java.net/browse/JDK-8257531 (and was, in fact, found with a benchmark derived from the one in that issue).
Accessing a byte[] through an array view VarHandle that views the array as a sequence of ints (i.e. mismatched access) is about 3x slower than normal array access on an int[].
Benchmark Mode Cnt Score Error Units
MyBenchmark.array avgt 3 5.401 ± 1.426 us/op
MyBenchmark.array_view avgt 3 15.221 ± 11.024 us/op
Looking at the generated code, the array_view case is missing vectorization:
↗ 0x000001df903ec700: movsxd rbp,r11d ;*i2l {reexecute=0 rethrow=0 return_oop=0}
│ ; - java.lang.invoke.VarHandleByteArrayAsInts$ArrayHandle::get@22 (line 115)
│ ; - java.lang.invoke.VarHandleGuards::guard_LI_I@108 (line 193)
│ ; - org.sample.MyBenchmark::array_view@19 (line 96)
│ ; - org.sample.jmh_generated.MyBenchmark_array_view_jmhTest::array_view_avgt_jmhStub@17 (line 195)
0.23% │ 0x000001df903ec703: inc dword ptr [rdx+rbp+10h]
31.07% │ 0x000001df903ec707: inc dword ptr [rdx+rbp+14h]
23.35% │ 0x000001df903ec70b: inc dword ptr [rdx+rbp+18h]
6.55% │ 0x000001df903ec70f: inc dword ptr [rdx+rbp+1ch]
10.70% │ 0x000001df903ec713: inc dword ptr [rdx+rbp+20h]
3.11% │ 0x000001df903ec717: inc dword ptr [rdx+rbp+24h]
8.50% │ 0x000001df903ec71b: inc dword ptr [rdx+rbp+28h]
1.23% │ 0x000001df903ec71f: inc dword ptr [rdx+rbp+2ch] ;*invokevirtual putIntUnaligned {reexecute=0 rethrow=0 return_oop=0}
│ ; - jdk.internal.misc.Unsafe::putIntUnaligned@10 (line 3718)
│ ; - java.lang.invoke.VarHandleByteArrayAsInts$ArrayHandle::set@35 (line 123)
│ ; - java.lang.invoke.VarHandleGuards::guard_LII_V@116 (line 919)
│ ; - org.sample.MyBenchmark::array_view@34 (line 97)
│ ; - org.sample.jmh_generated.MyBenchmark_array_view_jmhTest::array_view_avgt_jmhStub@17 (line 195)
7.36% │ 0x000001df903ec723: add r11d,20h ;*iinc {reexecute=0 rethrow=0 return_oop=0}
│ ; - org.sample.MyBenchmark::array_view@37 (line 95)
│ ; - org.sample.jmh_generated.MyBenchmark_array_view_jmhTest::array_view_avgt_jmhStub@17 (line 195)
0.19% │ 0x000001df903ec727: cmp r11d,ecx
2.89% ╰ 0x000001df903ec72a: jl 1df903ec700h ;*goto {reexecute=0 rethrow=0 return_oop=0}
; - org.sample.MyBenchmark::array_view@40 (line 95)
; - org.sample.jmh_generated.MyBenchmark_array_view_jmhTest::array_view_avgt_jmhStub@17 (line 195)
Compared with the code generated for the plain array case, which is vectorized:
↗│ 0x0000024307b09c80: vpaddd ymm0,ymm1,ymmword ptr [rdx+rcx*4+10h]
6.75% ││ 0x0000024307b09c86: vmovdqu ymmword ptr [rdx+rcx*4+10h],ymm0
2.64% ││ 0x0000024307b09c8c: vpaddd ymm0,ymm1,ymmword ptr [rdx+rcx*4+30h]
11.80% ││ 0x0000024307b09c92: vmovdqu ymmword ptr [rdx+rcx*4+30h],ymm0
2.80% ││ 0x0000024307b09c98: vpaddd ymm0,ymm1,ymmword ptr [rdx+rcx*4+50h]
6.23% ││ 0x0000024307b09c9e: vmovdqu ymmword ptr [rdx+rcx*4+50h],ymm0
3.39% ││ 0x0000024307b09ca4: vpaddd ymm0,ymm1,ymmword ptr [rdx+rcx*4+70h]
9.65% ││ 0x0000024307b09caa: vmovdqu ymmword ptr [rdx+rcx*4+70h],ymm0
3.62% ││ 0x0000024307b09cb0: vpaddd ymm0,ymm1,ymmword ptr [rdx+rcx*4+90h]
6.52% ││ 0x0000024307b09cb9: vmovdqu ymmword ptr [rdx+rcx*4+90h],ymm0
3.68% ││ 0x0000024307b09cc2: vpaddd ymm0,ymm1,ymmword ptr [rdx+rcx*4+0b0h]
10.24% ││ 0x0000024307b09ccb: vmovdqu ymmword ptr [rdx+rcx*4+0b0h],ymm0
3.55% ││ 0x0000024307b09cd4: vpaddd ymm0,ymm1,ymmword ptr [rdx+rcx*4+0d0h]
6.98% ││ 0x0000024307b09cdd: vmovdqu ymmword ptr [rdx+rcx*4+0d0h],ymm0
4.34% ││ 0x0000024307b09ce6: vpaddd ymm0,ymm1,ymmword ptr [rdx+rcx*4+0f0h]
9.94% ││ 0x0000024307b09cef: vmovdqu ymmword ptr [rdx+rcx*4+0f0h],ymm0;*iastore {reexecute=0 rethrow=0 return_oop=0}
││ ; - org.sample.MyBenchmark::array@20 (line 82)
││ ; - org.sample.jmh_generated.MyBenchmark_array_jmhTest::array_avgt_jmhStub@17 (line 195)
3.98% ││ 0x0000024307b09cf8: add ecx,40h ;*iinc {reexecute=0 rethrow=0 return_oop=0}
││ ; - org.sample.MyBenchmark::array@21 (line 81)
││ ; - org.sample.jmh_generated.MyBenchmark_array_jmhTest::array_avgt_jmhStub@17 (line 195)
0.23% ││ 0x0000024307b09cfb: cmp ecx,r9d
╰│ 0x0000024307b09cfe: jl 24307b09c80h ;*goto {reexecute=0 rethrow=0 return_oop=0}
│ ; - org.sample.MyBenchmark::array@24 (line 81)
│ ; - org.sample.jmh_generated.MyBenchmark_array_jmhTest::array_avgt_jmhStub@17 (line 195)
This issue seems very similar to https://bugs.openjdk.java.net/browse/JDK-8257531 (and was, in fact, found with a benchmark derived from the one in that issue).
- relates to
-
JDK-8257531 Super word not applied to a loop of simple Buffer operations
-
- Resolved
-