-
Enhancement
-
Resolution: Fixed
-
P4
-
23
-
b26
-
riscv
-
linux
Hi, we want to support vector-scalar and vector-immediate arithmetic instructions. The implementation follows RVV v1.0 [1]. Please take a look and review it. Thanks a lot.
We can use Byte256VectorTests.java [2] to print the Opto JIT code and to verify and observe the nodes that are generated.
For example, the following command prints the Opto JIT code of a jtreg test case:
```
/home/zifeihan/jtreg/bin/jtreg \
-v:default \
-concurrency:16 -timeout:50 \
-javaoption:-XX:+UnlockExperimentalVMOptions \
-javaoption:-XX:+UseRVV \
-javaoption:-XX:+PrintOptoAssembly \
-javaoption:-XX:LogFile=/home/zifeihan/jdk/Byte256VectorTests_PrintOptoAssembly.log \
-jdk:/home/zifeihan/jdk/build/linux-riscv64-server-fastdebug/jdk \
/home/zifeihan/jdk/test/jdk/jdk/incubator/vector/Byte256VectorTests.java
```
We can then inspect the specified compilation log `Byte256VectorTests_PrintOptoAssembly.log`, which contains the vector-scalar and vector-immediate arithmetic instructions implemented by this PR.
vadd_immI Node
```
16c addw R11, R10, zr #@convI2L_reg_reg
170 add R9, R31, R11 # ptr, #@addP_reg_reg
174 addi R9, R9, #16 # ptr, #@addP_reg_imm
176 loadV V1, [R9] # vector (rvv)
17e vadd_immI V1, V1, #7
186 add R11, R15, R11 # ptr, #@addP_reg_reg
188 addi R11, R11, #16 # ptr, #@addP_reg_imm
18a storeV [R11], V1 # vector (rvv)
```
vadd_immI_masked Node
```
1e8 B31: # out( B37 B32 ) <- in( B30 ) Freq: 76.2281
1e8 loadV V2, [R31] # vector (rvv)
1f0 vloadmask V0, V1
1f8 vadd_immI_masked V2, V2, #7
200 addi R31, R10, #48 # ptr, #@addP_reg_imm
204 bgeu R30, R7, B37 #@cmpU_branch P=0.000001 C=-1.000000
```
vadd_regI Node
```
0c4 B4: # out( B9 B5 ) <- in( B8 B3 ) Freq: 1
0c4 vloadcon V1 # generate iota indices
0cc spill [sp, #4] -> R30 # spill size = 32
0ce vmul_regI V1, V1, R30
0d6 spill [sp, #0] -> R29 # spill size = 32
0d8 vadd_regI V1, V1, R29
```
vadd_regI_masked Node
```
244 B36: # out( B33 B37 ) <- in( B35 ) Freq: 7427.81
244 # castII of R30, #@castII
244 addw R31, R30, zr #@convI2L_reg_reg
248 spill [sp, #32] -> R10 # spill size = 64
24a add R10, R10, R31 # ptr, #@addP_reg_reg
24c addi R10, R10, #16 # ptr, #@addP_reg_imm
24e loadV V2, [R10] # vector (rvv)
256 vloadmask V0, V1
25e vadd_regI_masked V2, V2, R29
```
vsub_regI Node
```
112 B20: # out( B63 B21 ) <- in( B19 ) Freq: 77.0107
112 # castII of R20, #@castII
112 addw R11, R20, zr #@convI2L_reg_reg
116 add R12, R10, R11 # ptr, #@addP_reg_reg
11a addi R12, R12, #16 # ptr, #@addP_reg_imm
11c loadV V1, [R12] # vector (rvv)
124 vsub_regI V1, V1, R31
12c bgeu R20, R29, B63 #@cmpU_branch P=0.000001 C=-1.000000
```
vsub_regI_masked Node
```
1e8 B31: # out( B37 B32 ) <- in( B30 ) Freq: 76.2281
1e8 loadV V2, [R31] # vector (rvv)
1f0 vloadmask V0, V1
1f8 vsub_regI_masked V2, V2, R29
200 addi R31, R10, #48 # ptr, #@addP_reg_imm
204 bgeu R30, R7, B37 #@cmpU_branch P=0.000001 C=-1.000000
```
vmul_regI Node
```
0ca B4: # out( B9 B5 ) <- in( B8 B3 ) Freq: 1
0ca vloadcon V1 # generate iota indices
0d2 spill [sp, #0] -> R29 # spill size = 64
0d4 lwu R7, [R29, #12] # loadN, compressed ptr, #@loadN ! Field: jdk/internal/vm/vector/VectorSupport$VectorPayload.payload (constant)
0d8 decode_heap_oop R7, R7 #@decodeHeapOop
0da addi R7, R7, #16 # ptr, #@addP_reg_imm
0dc vmul_regI V1, V1, R30
0e4 loadV V2, [R7] # vector (rvv)
```
vmul_regI_masked Node
```
198 addw R30, R19, zr #@convI2L_reg_reg
19c spill [sp, #32] -> R31 # spill size = 64
19e add R31, R31, R30 # ptr, #@addP_reg_reg
1a0 addi R10, R31, #16 # ptr, #@addP_reg_imm
1a4 loadV V2, [R10] # vector (rvv)
1ac vloadmask V0, V1
1b4 vmul_regI_masked V2, V2, R29
```
We can test test/jdk/jdk/incubator/vector/Long256VectorTests.java in the same way. In the Opto logs we will see the corresponding long nodes: vadd_immL, vadd_immL_masked, vadd_regL, vadd_regL_masked, vsub_regL, vsub_regL_masked, vmul_regL, vmul_regL_masked.
vadd_immL Node
```
112 addw R11, R9, zr #@convI2L_reg_reg
116 slli R11, R11, (#3 & 0x3f) #@lShiftL_reg_imm
118 add R14, R29, R11 # ptr, #@addP_reg_reg
11c addi R14, R14, #16 # ptr, #@addP_reg_imm
11e loadV V1, [R14] # vector (rvv)
126 vadd_immL V1, V1, #7
```
vadd_immL_masked Node
```
194 addw R30, R19, zr #@convI2L_reg_reg
198 slli R30, R30, (#3 & 0x3f) #@lShiftL_reg_imm
19a spill [sp, #32] -> R31 # spill size = 64
19c add R31, R31, R30 # ptr, #@addP_reg_reg
19e addi R10, R31, #16 # ptr, #@addP_reg_imm
1a2 loadV V1, [R10] # vector (rvv)
1aa vadd_immL_masked V1, V1, #7
```
vadd_regL Node
```
104 B17: # out( B20 ) <- in( B16 ) Freq: 0.99999
104 replicateL_imm5 V4, #1
10c vadd_regL V4, V4, R17
114 -- // R23=Thread::current(), empty, #@tlsLoadP
114 mv R31, #0 # int, #@loadConI
116 j B20 #@branch
```
vadd_regL_masked Node
```
198 addw R30, R19, zr #@convI2L_reg_reg
19c slli R30, R30, (#3 & 0x3f) #@lShiftL_reg_imm
19e spill [sp, #32] -> R31 # spill size = 64
1a0 add R31, R31, R30 # ptr, #@addP_reg_reg
1a2 addi R10, R31, #16 # ptr, #@addP_reg_imm
1a6 loadV V1, [R10] # vector (rvv)
1ae vadd_regL_masked V1, V1, R11
```
vsub_regL Node
```
116 addw R11, R19, zr #@convI2L_reg_reg
11a slli R11, R11, (#3 & 0x3f) #@lShiftL_reg_imm
11c add R12, R31, R11 # ptr, #@addP_reg_reg
120 addi R12, R12, #16 # ptr, #@addP_reg_imm
122 loadV V1, [R12] # vector (rvv)
12a vsub_regL V1, V1, R14
```
vsub_regL_masked Node
```
198 addw R30, R19, zr #@convI2L_reg_reg
19c slli R30, R30, (#3 & 0x3f) #@lShiftL_reg_imm
19e spill [sp, #32] -> R31 # spill size = 64
1a0 add R31, R31, R30 # ptr, #@addP_reg_reg
1a2 addi R10, R31, #16 # ptr, #@addP_reg_imm
1a6 loadV V1, [R10] # vector (rvv)
1ae vsub_regL_masked V1, V1, R11
```
vmul_regL Node
```
0c2 vloadcon V1 # generate iota indices
0ca spill [sp, #0] -> R29 # spill size = 64
0cc lwu R7, [R29, #12] # loadN, compressed ptr, #@loadN ! Field: jdk/internal/vm/vector/VectorSupport$VectorPayload.payload (constant)
0d0 decode_heap_oop R7, R7 #@decodeHeapOop
0d2 addi R7, R7, #16 # ptr, #@addP_reg_imm
0d4 addw R28, R30, zr #@convI2L_reg_reg
0d8 loadV V2, [R7] # vector (rvv)
0e0 vmul_regL V1, V1, R28
```
vmul_regL_masked Node
```
19c slli R30, R30, (#3 & 0x3f) #@lShiftL_reg_imm
19e spill [sp, #32] -> R31 # spill size = 64
1a0 add R31, R31, R30 # ptr, #@addP_reg_reg
1a2 addi R10, R31, #16 # ptr, #@addP_reg_imm
1a6 loadV V1, [R10] # vector (rvv)
1ae vmul_regL_masked V1, V1, R11
1b6 spill [sp, #48] -> R10 # spill size = 64
```
We can use Byte256VectorTests.java [2] to print the Opto JIT code and to verify and observe the nodes that are generated.
For example, the following command prints the Opto JIT code of a jtreg test case:
```
/home/zifeihan/jtreg/bin/jtreg \
-v:default \
-concurrency:16 -timeout:50 \
-javaoption:-XX:+UnlockExperimentalVMOptions \
-javaoption:-XX:+UseRVV \
-javaoption:-XX:+PrintOptoAssembly \
-javaoption:-XX:LogFile=/home/zifeihan/jdk/Byte256VectorTests_PrintOptoAssembly.log \
-jdk:/home/zifeihan/jdk/build/linux-riscv64-server-fastdebug/jdk \
/home/zifeihan/jdk/test/jdk/jdk/incubator/vector/Byte256VectorTests.java
```
We can then inspect the specified compilation log `Byte256VectorTests_PrintOptoAssembly.log`, which contains the vector-scalar and vector-immediate arithmetic instructions implemented by this PR.
vadd_immI Node
```
16c addw R11, R10, zr #@convI2L_reg_reg
170 add R9, R31, R11 # ptr, #@addP_reg_reg
174 addi R9, R9, #16 # ptr, #@addP_reg_imm
176 loadV V1, [R9] # vector (rvv)
17e vadd_immI V1, V1, #7
186 add R11, R15, R11 # ptr, #@addP_reg_reg
188 addi R11, R11, #16 # ptr, #@addP_reg_imm
18a storeV [R11], V1 # vector (rvv)
```
vadd_immI_masked Node
```
1e8 B31: # out( B37 B32 ) <- in( B30 ) Freq: 76.2281
1e8 loadV V2, [R31] # vector (rvv)
1f0 vloadmask V0, V1
1f8 vadd_immI_masked V2, V2, #7
200 addi R31, R10, #48 # ptr, #@addP_reg_imm
204 bgeu R30, R7, B37 #@cmpU_branch P=0.000001 C=-1.000000
```
vadd_regI Node
```
0c4 B4: # out( B9 B5 ) <- in( B8 B3 ) Freq: 1
0c4 vloadcon V1 # generate iota indices
0cc spill [sp, #4] -> R30 # spill size = 32
0ce vmul_regI V1, V1, R30
0d6 spill [sp, #0] -> R29 # spill size = 32
0d8 vadd_regI V1, V1, R29
```
vadd_regI_masked Node
```
244 B36: # out( B33 B37 ) <- in( B35 ) Freq: 7427.81
244 # castII of R30, #@castII
244 addw R31, R30, zr #@convI2L_reg_reg
248 spill [sp, #32] -> R10 # spill size = 64
24a add R10, R10, R31 # ptr, #@addP_reg_reg
24c addi R10, R10, #16 # ptr, #@addP_reg_imm
24e loadV V2, [R10] # vector (rvv)
256 vloadmask V0, V1
25e vadd_regI_masked V2, V2, R29
```
vsub_regI Node
```
112 B20: # out( B63 B21 ) <- in( B19 ) Freq: 77.0107
112 # castII of R20, #@castII
112 addw R11, R20, zr #@convI2L_reg_reg
116 add R12, R10, R11 # ptr, #@addP_reg_reg
11a addi R12, R12, #16 # ptr, #@addP_reg_imm
11c loadV V1, [R12] # vector (rvv)
124 vsub_regI V1, V1, R31
12c bgeu R20, R29, B63 #@cmpU_branch P=0.000001 C=-1.000000
```
vsub_regI_masked Node
```
1e8 B31: # out( B37 B32 ) <- in( B30 ) Freq: 76.2281
1e8 loadV V2, [R31] # vector (rvv)
1f0 vloadmask V0, V1
1f8 vsub_regI_masked V2, V2, R29
200 addi R31, R10, #48 # ptr, #@addP_reg_imm
204 bgeu R30, R7, B37 #@cmpU_branch P=0.000001 C=-1.000000
```
vmul_regI Node
```
0ca B4: # out( B9 B5 ) <- in( B8 B3 ) Freq: 1
0ca vloadcon V1 # generate iota indices
0d2 spill [sp, #0] -> R29 # spill size = 64
0d4 lwu R7, [R29, #12] # loadN, compressed ptr, #@loadN ! Field: jdk/internal/vm/vector/VectorSupport$VectorPayload.payload (constant)
0d8 decode_heap_oop R7, R7 #@decodeHeapOop
0da addi R7, R7, #16 # ptr, #@addP_reg_imm
0dc vmul_regI V1, V1, R30
0e4 loadV V2, [R7] # vector (rvv)
```
vmul_regI_masked Node
```
198 addw R30, R19, zr #@convI2L_reg_reg
19c spill [sp, #32] -> R31 # spill size = 64
19e add R31, R31, R30 # ptr, #@addP_reg_reg
1a0 addi R10, R31, #16 # ptr, #@addP_reg_imm
1a4 loadV V2, [R10] # vector (rvv)
1ac vloadmask V0, V1
1b4 vmul_regI_masked V2, V2, R29
```
We can test test/jdk/jdk/incubator/vector/Long256VectorTests.java in the same way. In the Opto logs we will see the corresponding long nodes: vadd_immL, vadd_immL_masked, vadd_regL, vadd_regL_masked, vsub_regL, vsub_regL_masked, vmul_regL, vmul_regL_masked.
vadd_immL Node
```
112 addw R11, R9, zr #@convI2L_reg_reg
116 slli R11, R11, (#3 & 0x3f) #@lShiftL_reg_imm
118 add R14, R29, R11 # ptr, #@addP_reg_reg
11c addi R14, R14, #16 # ptr, #@addP_reg_imm
11e loadV V1, [R14] # vector (rvv)
126 vadd_immL V1, V1, #7
```
vadd_immL_masked Node
```
194 addw R30, R19, zr #@convI2L_reg_reg
198 slli R30, R30, (#3 & 0x3f) #@lShiftL_reg_imm
19a spill [sp, #32] -> R31 # spill size = 64
19c add R31, R31, R30 # ptr, #@addP_reg_reg
19e addi R10, R31, #16 # ptr, #@addP_reg_imm
1a2 loadV V1, [R10] # vector (rvv)
1aa vadd_immL_masked V1, V1, #7
```
vadd_regL Node
```
104 B17: # out( B20 ) <- in( B16 ) Freq: 0.99999
104 replicateL_imm5 V4, #1
10c vadd_regL V4, V4, R17
114 -- // R23=Thread::current(), empty, #@tlsLoadP
114 mv R31, #0 # int, #@loadConI
116 j B20 #@branch
```
vadd_regL_masked Node
```
198 addw R30, R19, zr #@convI2L_reg_reg
19c slli R30, R30, (#3 & 0x3f) #@lShiftL_reg_imm
19e spill [sp, #32] -> R31 # spill size = 64
1a0 add R31, R31, R30 # ptr, #@addP_reg_reg
1a2 addi R10, R31, #16 # ptr, #@addP_reg_imm
1a6 loadV V1, [R10] # vector (rvv)
1ae vadd_regL_masked V1, V1, R11
```
vsub_regL Node
```
116 addw R11, R19, zr #@convI2L_reg_reg
11a slli R11, R11, (#3 & 0x3f) #@lShiftL_reg_imm
11c add R12, R31, R11 # ptr, #@addP_reg_reg
120 addi R12, R12, #16 # ptr, #@addP_reg_imm
122 loadV V1, [R12] # vector (rvv)
12a vsub_regL V1, V1, R14
```
vsub_regL_masked Node
```
198 addw R30, R19, zr #@convI2L_reg_reg
19c slli R30, R30, (#3 & 0x3f) #@lShiftL_reg_imm
19e spill [sp, #32] -> R31 # spill size = 64
1a0 add R31, R31, R30 # ptr, #@addP_reg_reg
1a2 addi R10, R31, #16 # ptr, #@addP_reg_imm
1a6 loadV V1, [R10] # vector (rvv)
1ae vsub_regL_masked V1, V1, R11
```
vmul_regL Node
```
0c2 vloadcon V1 # generate iota indices
0ca spill [sp, #0] -> R29 # spill size = 64
0cc lwu R7, [R29, #12] # loadN, compressed ptr, #@loadN ! Field: jdk/internal/vm/vector/VectorSupport$VectorPayload.payload (constant)
0d0 decode_heap_oop R7, R7 #@decodeHeapOop
0d2 addi R7, R7, #16 # ptr, #@addP_reg_imm
0d4 addw R28, R30, zr #@convI2L_reg_reg
0d8 loadV V2, [R7] # vector (rvv)
0e0 vmul_regL V1, V1, R28
```
vmul_regL_masked Node
```
19c slli R30, R30, (#3 & 0x3f) #@lShiftL_reg_imm
19e spill [sp, #32] -> R31 # spill size = 64
1a0 add R31, R31, R30 # ptr, #@addP_reg_reg
1a2 addi R10, R31, #16 # ptr, #@addP_reg_imm
1a6 loadV V1, [R10] # vector (rvv)
1ae vmul_regL_masked V1, V1, R11
1b6 spill [sp, #48] -> R10 # spill size = 64
```