-
Enhancement
-
Resolution: Fixed
-
P3
-
16
-
b28
Daniel Lemire identifies a performance issue with IntBuffer expressions when compared to direct array expressions:
https://lemire.me/blog/2020/11/30/java-buffer-types-versus-native-arrays-which-is-faster/
See here for the benchmark:
https://github.com/lemire/Code-used-on-Daniel-Lemire-s-blog/tree/master/2020/11/30
@Benchmark
public void array(BenchmarkState s) {
for(int k = 0; k < s.array.length; k++) {
s.array[k] += 1;
}
}
@Benchmark
public void buffer(BenchmarkState s) throws IOException {
for(int k = 0; k < s.buffer.limit(); k++) {
s.buffer.put(k, s.buffer.get(k) + 1);
}
}
Analysis of the generated code shows that, for the buffer benchmark, the loop is unrolled but not auto-vectorized (SuperWord analysis is somehow failing).
Hot loop for buffer benchmark:
3.03% │↗│ 0x000000011882f142: mov %edi,%r9d
5.35% │││ 0x000000011882f145: add %esi,%r9d
1.97% │││ 0x000000011882f148: movslq %edi,%r14
1.76% │││ 0x000000011882f14b: add %rbx,%r14
2.81% │││ 0x000000011882f14e: incl 0x10(%r11,%r14,4)
8.83% │││ 0x000000011882f153: incl 0x14(%r11,%r14,4)
9.85% │││ 0x000000011882f158: movslq %r9d,%r9
2.01% │││ 0x000000011882f15b: incl 0x18(%r11,%r9,4)
9.89% │││ 0x000000011882f160: incl 0x1c(%r11,%r9,4)
9.54% │││ 0x000000011882f165: incl 0x20(%r11,%r9,4)
9.50% │││ 0x000000011882f16a: incl 0x24(%r11,%r9,4)
9.39% │││ 0x000000011882f16f: incl 0x28(%r11,%r9,4)
9.89% │││ 0x000000011882f174: incl 0x2c(%r11,%r9,4)
12.00% │││ 0x000000011882f179: add $0x8,%edi
1.34% │││ 0x000000011882f17c: nopl 0x0(%rax)
2.11% │││ 0x000000011882f180: cmp %r8d,%edi
│╰│ 0x000000011882f183: jl 0x000000011882f142
Hot loop for array benchmark:
1.83% ↗ 0x0000000110c00040: vpaddd 0x10(%rsi,%rcx,4),%ymm1,%ymm0
12.36% │ 0x0000000110c00046: vmovdqu %ymm0,0x10(%rsi,%rcx,4)
1.72% │ 0x0000000110c0004c: vpaddd 0x30(%rsi,%rcx,4),%ymm1,%ymm0
6.57% │ 0x0000000110c00052: vmovdqu %ymm0,0x30(%rsi,%rcx,4)
1.05% │ 0x0000000110c00058: vpaddd 0x50(%rsi,%rcx,4),%ymm1,%ymm0
15.41% │ 0x0000000110c0005e: vmovdqu %ymm0,0x50(%rsi,%rcx,4)
1.97% │ 0x0000000110c00064: vpaddd 0x70(%rsi,%rcx,4),%ymm1,%ymm0
6.92% │ 0x0000000110c0006a: vmovdqu %ymm0,0x70(%rsi,%rcx,4)
1.62% │ 0x0000000110c00070: vpaddd 0x90(%rsi,%rcx,4),%ymm1,%ymm0
13.69% │ 0x0000000110c00079: vmovdqu %ymm0,0x90(%rsi,%rcx,4)
1.51% │ 0x0000000110c00082: vpaddd 0xb0(%rsi,%rcx,4),%ymm1,%ymm0
6.53% │ 0x0000000110c0008b: vmovdqu %ymm0,0xb0(%rsi,%rcx,4)
1.90% │ 0x0000000110c00094: vpaddd 0xd0(%rsi,%rcx,4),%ymm1,%ymm0
12.75% │ 0x0000000110c0009d: vmovdqu %ymm0,0xd0(%rsi,%rcx,4)
1.54% │ 0x0000000110c000a6: vpaddd 0xf0(%rsi,%rcx,4),%ymm1,%ymm0
6.92% │ 0x0000000110c000af: vmovdqu %ymm0,0xf0(%rsi,%rcx,4)
1.62% │ 0x0000000110c000b8: add $0x40,%ecx
1.26% │ 0x0000000110c000bb: nopl 0x0(%rax,%rax,1)
1.79% │ 0x0000000110c000c0: cmp %r11d,%ecx
╰ 0x0000000110c000c3: jl 0x0000000110c00040
https://lemire.me/blog/2020/11/30/java-buffer-types-versus-native-arrays-which-is-faster/
See here for the benchmark:
https://github.com/lemire/Code-used-on-Daniel-Lemire-s-blog/tree/master/2020/11/30
@Benchmark
public void array(BenchmarkState s) {
for(int k = 0; k < s.array.length; k++) {
s.array[k] += 1;
}
}
@Benchmark
public void buffer(BenchmarkState s) throws IOException {
for(int k = 0; k < s.buffer.limit(); k++) {
s.buffer.put(k, s.buffer.get(k) + 1);
}
}
Analysis of the generated code shows that, for the buffer benchmark, the loop is unrolled but not auto-vectorized (SuperWord analysis is somehow failing).
Hot loop for buffer benchmark:
3.03% │↗│ 0x000000011882f142: mov %edi,%r9d
5.35% │││ 0x000000011882f145: add %esi,%r9d
1.97% │││ 0x000000011882f148: movslq %edi,%r14
1.76% │││ 0x000000011882f14b: add %rbx,%r14
2.81% │││ 0x000000011882f14e: incl 0x10(%r11,%r14,4)
8.83% │││ 0x000000011882f153: incl 0x14(%r11,%r14,4)
9.85% │││ 0x000000011882f158: movslq %r9d,%r9
2.01% │││ 0x000000011882f15b: incl 0x18(%r11,%r9,4)
9.89% │││ 0x000000011882f160: incl 0x1c(%r11,%r9,4)
9.54% │││ 0x000000011882f165: incl 0x20(%r11,%r9,4)
9.50% │││ 0x000000011882f16a: incl 0x24(%r11,%r9,4)
9.39% │││ 0x000000011882f16f: incl 0x28(%r11,%r9,4)
9.89% │││ 0x000000011882f174: incl 0x2c(%r11,%r9,4)
12.00% │││ 0x000000011882f179: add $0x8,%edi
1.34% │││ 0x000000011882f17c: nopl 0x0(%rax)
2.11% │││ 0x000000011882f180: cmp %r8d,%edi
│╰│ 0x000000011882f183: jl 0x000000011882f142
Hot loop for array benchmark:
1.83% ↗ 0x0000000110c00040: vpaddd 0x10(%rsi,%rcx,4),%ymm1,%ymm0
12.36% │ 0x0000000110c00046: vmovdqu %ymm0,0x10(%rsi,%rcx,4)
1.72% │ 0x0000000110c0004c: vpaddd 0x30(%rsi,%rcx,4),%ymm1,%ymm0
6.57% │ 0x0000000110c00052: vmovdqu %ymm0,0x30(%rsi,%rcx,4)
1.05% │ 0x0000000110c00058: vpaddd 0x50(%rsi,%rcx,4),%ymm1,%ymm0
15.41% │ 0x0000000110c0005e: vmovdqu %ymm0,0x50(%rsi,%rcx,4)
1.97% │ 0x0000000110c00064: vpaddd 0x70(%rsi,%rcx,4),%ymm1,%ymm0
6.92% │ 0x0000000110c0006a: vmovdqu %ymm0,0x70(%rsi,%rcx,4)
1.62% │ 0x0000000110c00070: vpaddd 0x90(%rsi,%rcx,4),%ymm1,%ymm0
13.69% │ 0x0000000110c00079: vmovdqu %ymm0,0x90(%rsi,%rcx,4)
1.51% │ 0x0000000110c00082: vpaddd 0xb0(%rsi,%rcx,4),%ymm1,%ymm0
6.53% │ 0x0000000110c0008b: vmovdqu %ymm0,0xb0(%rsi,%rcx,4)
1.90% │ 0x0000000110c00094: vpaddd 0xd0(%rsi,%rcx,4),%ymm1,%ymm0
12.75% │ 0x0000000110c0009d: vmovdqu %ymm0,0xd0(%rsi,%rcx,4)
1.54% │ 0x0000000110c000a6: vpaddd 0xf0(%rsi,%rcx,4),%ymm1,%ymm0
6.92% │ 0x0000000110c000af: vmovdqu %ymm0,0xf0(%rsi,%rcx,4)
1.62% │ 0x0000000110c000b8: add $0x40,%ecx
1.26% │ 0x0000000110c000bb: nopl 0x0(%rax,%rax,1)
1.79% │ 0x0000000110c000c0: cmp %r11d,%ecx
╰ 0x0000000110c000c3: jl 0x0000000110c00040
- relates to
-
JDK-8259398 Super word not applied to a loop with byteArrayViewVarHandle
- Resolved