-
Enhancement
-
Resolution: Unresolved
-
P4
-
25
-
generic
-
generic
The following micro-benchmark shows a 6x performance degradation when a record is used to store an intermediate vector.
import jdk.incubator.vector.*;
import java.util.Arrays;
import java.util.stream.IntStream;
/**
 * Minimal record with a single {@code ShortVector} component. Instances are created by
 * {@code test_ea_records.getAdapter} and the component is read back through {@code vec()}.
 */
record Adapter(ShortVector vec) {
}
/**
 * Reproducer comparing two vector kernels that differ only in whether the intermediate
 * vector is passed through an {@code Adapter} record ({@code micro1}) or used directly
 * ({@code micro2}).
 *
 * <p>Usage: the single command-line argument selects the kernel: {@code 0} runs
 * {@code micro1}, {@code 1} runs {@code micro2}, {@code -1} runs both.
 */
public class test_ea_records {
// Vector species shared by both kernels; it determines the loop stride and bound.
public static VectorSpecies<Short> SPEC = ShortVector.SPECIES_PREFERRED;
/**
 * Wraps the given vector in a newly allocated {@code Adapter}.
 *
 * @param vec vector to wrap
 * @return a new {@code Adapter} holding {@code vec}
 */
public static Adapter getAdapter(ShortVector vec) {
return new Adapter(vec);
}
/**
 * Kernel that stores the intermediate vector in an {@code Adapter} record before the
 * final add. Steps through {@code input} in strides of {@code SPEC.length()} while the
 * index is below {@code SPEC.loopBound(input.length)}; elements at or past that bound
 * are not processed by this loop.
 *
 * @param output array that receives each result vector at the same offset it was loaded from
 * @param input  array the vectors are loaded from
 */
public static void micro1(short [] output, short [] input) {
for (int i = 0; i < SPEC.loopBound(input.length); i += SPEC.length()) {
ShortVector inp = ShortVector.fromArray(SPEC, input, i);
// Chain of four lanewise multiplications by short constants.
inp = inp.lanewise(VectorOperators.MUL, (short)3);
inp = inp.lanewise(VectorOperators.MUL, (short)30);
inp = inp.lanewise(VectorOperators.MUL, (short)300);
inp = inp.lanewise(VectorOperators.MUL, (short)3000);
// Only difference from micro2: the vector is wrapped in a record and read back via vec().
Adapter obj = getAdapter(inp);
obj.vec().lanewise(VectorOperators.ADD, (short)10)
.intoArray(output, i);
}
}
/**
 * Baseline kernel: the same load / multiply / add / store sequence as {@code micro1},
 * but without the intermediate {@code Adapter} record.
 *
 * @param output array that receives each result vector at the same offset it was loaded from
 * @param input  array the vectors are loaded from
 */
public static void micro2(short [] output, short [] input) {
for (int i = 0; i < SPEC.loopBound(input.length); i += SPEC.length()) {
ShortVector inp = ShortVector.fromArray(SPEC, input, i);
inp = inp.lanewise(VectorOperators.MUL, (short)3);
inp = inp.lanewise(VectorOperators.MUL, (short)30);
inp = inp.lanewise(VectorOperators.MUL, (short)300);
inp = inp.lanewise(VectorOperators.MUL, (short)3000);
inp.lanewise(VectorOperators.ADD, (short)10)
.intoArray(output, i);
}
}
/**
 * Runs the selected kernel(s) and prints the elapsed time of the measured loop together
 * with a hash of the output array.
 *
 * @param args {@code args[0]} is parsed as an int selector: 0 = micro1, 1 = micro2, -1 = both
 */
public static void main(String [] args) {
int algo = Integer.parseInt(args[0]);
short [] input = new short[2048];
short [] output = new short[2048];
// Fill input with its own indices, narrowed to short.
IntStream.range(0, input.length).forEach(i -> { input[i] = (short)i; });
if (algo == 0 || algo == -1) {
// First 20000 calls are untimed (presumably warm-up so the kernel is compiled before measuring).
for (int i = 0; i < 20000; i++) {
micro1(output, input);
}
// Timed run: 20000 calls measured with wall-clock milliseconds.
long t1 = System.currentTimeMillis();
for (int i = 0; i < 20000; i++) {
micro1(output, input);
}
long t2 = System.currentTimeMillis();
System.out.println("[time] " + (t2 - t1) + " ms [res] " + Arrays.hashCode(output));
}
if (algo == 1 || algo == -1) {
// Same untimed-then-timed structure for the baseline kernel.
for (int i = 0; i < 20000; i++) {
micro2(output, input);
}
long t1 = System.currentTimeMillis();
for (int i = 0; i < 20000; i++) {
micro2(output, input);
}
long t2 = System.currentTimeMillis();
System.out.println("[time] " + (t2 - t1) + " ms [res] " + Arrays.hashCode(output));
}
}
}
SPR2>java --add-modules=jdk.incubator.vector -Xbatch -XX:-TieredCompilation -cp . test_ea_records -1
WARNING: Using incubator modules: jdk.incubator.vector
[time] 31 ms [res] 1419051009
[time] 5 ms [res] 1419051009
The JIT-compiled sequence shows that the allocations for the records are not eliminated despite the objects being non-escaping.
0x00007f69a4928b49: mov 0x1c8(%r15),%rax
0x00007f69a4928b50: mov %rax,%r10
0x00007f69a4928b53: add $0x50,%r10
0x00007f69a4928b57: vpbroadcastd -0xe1(%rip),%zmm0 # 0x00007f69a4928a80
; {section_word}
0x00007f69a4928b61: vpmullw 0x10(%r9,%rbp,2),%zmm0,%zmm0
0x00007f69a4928b6c: vpbroadcastd -0xf2(%rip),%zmm1 # 0x00007f69a4928a84
; {section_word}
0x00007f69a4928b76: vpmullw %zmm1,%zmm0,%zmm0
0x00007f69a4928b7c: vpbroadcastd -0xfe(%rip),%zmm1 # 0x00007f69a4928a88
; {section_word}
0x00007f69a4928b86: vpmullw %zmm1,%zmm0,%zmm1
0x00007f69a4928b8c: cmp 0x1d8(%r15),%r10
0x00007f69a4928b93: jae 0x00007f69a4928c4d
0x00007f69a4928b99: mov %r10,0x1c8(%r15)
0x00007f69a4928ba0: prefetchw 0xc0(%r10)
0x00007f69a4928ba8: movq $0x1,(%rax)
0x00007f69a4928baf: prefetchw 0x100(%r10)
0x00007f69a4928bb7: movl $0x164930,0x8(%rax) ; {metadata({type array short})}
0x00007f69a4928bbe: prefetchw 0x140(%r10)
0x00007f69a4928bc6: movl $0x20,0xc(%rax)
0x00007f69a4928bcd: prefetchw 0x180(%r10)
0x00007f69a4928bd5: mov %rax,%rsi
0x00007f69a4928bd8: add $0x10,%rsi
0x00007f69a4928bdc: vpxord %zmm0,%zmm0,%zmm0
0x00007f69a4928be2: vmovdqu64 %zmm0,(%rsi)
0x00007f69a4928be8: vpbroadcastd -0x166(%rip),%zmm0 # 0x00007f69a4928a8c
; {section_word}
0x00007f69a4928bf2: vpmullw %zmm0,%zmm1,%zmm0
0x00007f69a4928bf8: vmovdqu32 %zmm0,0x10(%rax)
0x00007f69a4928c02: vpbroadcastd -0x17c(%rip),%zmm1 # 0x00007f69a4928a90
; {section_word}
0x00007f69a4928c0c: vpaddw %zmm0,%zmm1,%zmm0
0x00007f69a4928c12: cmp %edi,%ebp
0x00007f69a4928c14: jae 0x00007f69a4928d00
0x00007f69a4928c1a: vmovdqu32 %zmm0,0x10(%rbx,%rbp,2)
0x00007f69a4928c25: add %r11d,%ebp
0x00007f69a4928c28: mov 0x30(%r15),%r10 ; ImmutableOopMap {r9=Oop rbx=Oop rdx=Oop }
;*goto {reexecute=1 rethrow=0 return_oop=0}
; - (reexecute) test_ea_records::micro@101 (line 13)
0x00007f69a4928c2c: test %eax,(%r10) ; {poll}
0x00007f69a4928c2f: cmp %ecx,%ebp
0x00007f69a4928c31: jl 0x00007f69a4928b40
0x00007f69a4928c37: vzeroupper
0x00007f69a4928c3a: add $0xd0,%rsp
0x00007f69a4928c41: pop %rbp
0x00007f69a4928c42: cmp 0x28(%r15),%rsp ; {poll_return}
0x00007f69a4928c46: ja 0x00007f69a4928deb
0x00007f69a4928c4c: ret
import jdk.incubator.vector.*;
import java.util.Arrays;
import java.util.stream.IntStream;
/**
 * Minimal record with a single {@code ShortVector} component. Instances are created by
 * {@code test_ea_records.getAdapter} and the component is read back through {@code vec()}.
 */
record Adapter(ShortVector vec) {
}
/**
 * Reproducer comparing two vector kernels that differ only in whether the intermediate
 * vector is passed through an {@code Adapter} record ({@code micro1}) or used directly
 * ({@code micro2}).
 *
 * <p>Usage: the single command-line argument selects the kernel: {@code 0} runs
 * {@code micro1}, {@code 1} runs {@code micro2}, {@code -1} runs both.
 */
public class test_ea_records {
// Vector species shared by both kernels; it determines the loop stride and bound.
public static VectorSpecies<Short> SPEC = ShortVector.SPECIES_PREFERRED;
/**
 * Wraps the given vector in a newly allocated {@code Adapter}.
 *
 * @param vec vector to wrap
 * @return a new {@code Adapter} holding {@code vec}
 */
public static Adapter getAdapter(ShortVector vec) {
return new Adapter(vec);
}
/**
 * Kernel that stores the intermediate vector in an {@code Adapter} record before the
 * final add. Steps through {@code input} in strides of {@code SPEC.length()} while the
 * index is below {@code SPEC.loopBound(input.length)}; elements at or past that bound
 * are not processed by this loop.
 *
 * @param output array that receives each result vector at the same offset it was loaded from
 * @param input  array the vectors are loaded from
 */
public static void micro1(short [] output, short [] input) {
for (int i = 0; i < SPEC.loopBound(input.length); i += SPEC.length()) {
ShortVector inp = ShortVector.fromArray(SPEC, input, i);
// Chain of four lanewise multiplications by short constants.
inp = inp.lanewise(VectorOperators.MUL, (short)3);
inp = inp.lanewise(VectorOperators.MUL, (short)30);
inp = inp.lanewise(VectorOperators.MUL, (short)300);
inp = inp.lanewise(VectorOperators.MUL, (short)3000);
// Only difference from micro2: the vector is wrapped in a record and read back via vec().
Adapter obj = getAdapter(inp);
obj.vec().lanewise(VectorOperators.ADD, (short)10)
.intoArray(output, i);
}
}
/**
 * Baseline kernel: the same load / multiply / add / store sequence as {@code micro1},
 * but without the intermediate {@code Adapter} record.
 *
 * @param output array that receives each result vector at the same offset it was loaded from
 * @param input  array the vectors are loaded from
 */
public static void micro2(short [] output, short [] input) {
for (int i = 0; i < SPEC.loopBound(input.length); i += SPEC.length()) {
ShortVector inp = ShortVector.fromArray(SPEC, input, i);
inp = inp.lanewise(VectorOperators.MUL, (short)3);
inp = inp.lanewise(VectorOperators.MUL, (short)30);
inp = inp.lanewise(VectorOperators.MUL, (short)300);
inp = inp.lanewise(VectorOperators.MUL, (short)3000);
inp.lanewise(VectorOperators.ADD, (short)10)
.intoArray(output, i);
}
}
/**
 * Runs the selected kernel(s) and prints the elapsed time of the measured loop together
 * with a hash of the output array.
 *
 * @param args {@code args[0]} is parsed as an int selector: 0 = micro1, 1 = micro2, -1 = both
 */
public static void main(String [] args) {
int algo = Integer.parseInt(args[0]);
short [] input = new short[2048];
short [] output = new short[2048];
// Fill input with its own indices, narrowed to short.
IntStream.range(0, input.length).forEach(i -> { input[i] = (short)i; });
if (algo == 0 || algo == -1) {
// First 20000 calls are untimed (presumably warm-up so the kernel is compiled before measuring).
for (int i = 0; i < 20000; i++) {
micro1(output, input);
}
// Timed run: 20000 calls measured with wall-clock milliseconds.
long t1 = System.currentTimeMillis();
for (int i = 0; i < 20000; i++) {
micro1(output, input);
}
long t2 = System.currentTimeMillis();
System.out.println("[time] " + (t2 - t1) + " ms [res] " + Arrays.hashCode(output));
}
if (algo == 1 || algo == -1) {
// Same untimed-then-timed structure for the baseline kernel.
for (int i = 0; i < 20000; i++) {
micro2(output, input);
}
long t1 = System.currentTimeMillis();
for (int i = 0; i < 20000; i++) {
micro2(output, input);
}
long t2 = System.currentTimeMillis();
System.out.println("[time] " + (t2 - t1) + " ms [res] " + Arrays.hashCode(output));
}
}
}
SPR2>java --add-modules=jdk.incubator.vector -Xbatch -XX:-TieredCompilation -cp . test_ea_records -1
WARNING: Using incubator modules: jdk.incubator.vector
[time] 31 ms [res] 1419051009
[time] 5 ms [res] 1419051009
The JIT-compiled sequence shows that the allocations for the records are not eliminated despite the objects being non-escaping.
0x00007f69a4928b49: mov 0x1c8(%r15),%rax
0x00007f69a4928b50: mov %rax,%r10
0x00007f69a4928b53: add $0x50,%r10
0x00007f69a4928b57: vpbroadcastd -0xe1(%rip),%zmm0 # 0x00007f69a4928a80
; {section_word}
0x00007f69a4928b61: vpmullw 0x10(%r9,%rbp,2),%zmm0,%zmm0
0x00007f69a4928b6c: vpbroadcastd -0xf2(%rip),%zmm1 # 0x00007f69a4928a84
; {section_word}
0x00007f69a4928b76: vpmullw %zmm1,%zmm0,%zmm0
0x00007f69a4928b7c: vpbroadcastd -0xfe(%rip),%zmm1 # 0x00007f69a4928a88
; {section_word}
0x00007f69a4928b86: vpmullw %zmm1,%zmm0,%zmm1
0x00007f69a4928b8c: cmp 0x1d8(%r15),%r10
0x00007f69a4928b93: jae 0x00007f69a4928c4d
0x00007f69a4928b99: mov %r10,0x1c8(%r15)
0x00007f69a4928ba0: prefetchw 0xc0(%r10)
0x00007f69a4928ba8: movq $0x1,(%rax)
0x00007f69a4928baf: prefetchw 0x100(%r10)
0x00007f69a4928bb7: movl $0x164930,0x8(%rax) ; {metadata({type array short})}
0x00007f69a4928bbe: prefetchw 0x140(%r10)
0x00007f69a4928bc6: movl $0x20,0xc(%rax)
0x00007f69a4928bcd: prefetchw 0x180(%r10)
0x00007f69a4928bd5: mov %rax,%rsi
0x00007f69a4928bd8: add $0x10,%rsi
0x00007f69a4928bdc: vpxord %zmm0,%zmm0,%zmm0
0x00007f69a4928be2: vmovdqu64 %zmm0,(%rsi)
0x00007f69a4928be8: vpbroadcastd -0x166(%rip),%zmm0 # 0x00007f69a4928a8c
; {section_word}
0x00007f69a4928bf2: vpmullw %zmm0,%zmm1,%zmm0
0x00007f69a4928bf8: vmovdqu32 %zmm0,0x10(%rax)
0x00007f69a4928c02: vpbroadcastd -0x17c(%rip),%zmm1 # 0x00007f69a4928a90
; {section_word}
0x00007f69a4928c0c: vpaddw %zmm0,%zmm1,%zmm0
0x00007f69a4928c12: cmp %edi,%ebp
0x00007f69a4928c14: jae 0x00007f69a4928d00
0x00007f69a4928c1a: vmovdqu32 %zmm0,0x10(%rbx,%rbp,2)
0x00007f69a4928c25: add %r11d,%ebp
0x00007f69a4928c28: mov 0x30(%r15),%r10 ; ImmutableOopMap {r9=Oop rbx=Oop rdx=Oop }
;*goto {reexecute=1 rethrow=0 return_oop=0}
; - (reexecute) test_ea_records::micro@101 (line 13)
0x00007f69a4928c2c: test %eax,(%r10) ; {poll}
0x00007f69a4928c2f: cmp %ecx,%ebp
0x00007f69a4928c31: jl 0x00007f69a4928b40
0x00007f69a4928c37: vzeroupper
0x00007f69a4928c3a: add $0xd0,%rsp
0x00007f69a4928c41: pop %rbp
0x00007f69a4928c42: cmp 0x28(%r15),%rsp ; {poll_return}
0x00007f69a4928c46: ja 0x00007f69a4928deb
0x00007f69a4928c4c: ret