-
Enhancement
-
Resolution: Fixed
-
P4
-
17, 18, 19
-
b05
We have observed that C2's auto-vectorization fails in our machine learning programs.
Here is a small example that reproduces the problem.
Enabling auto-vectorization for this loop pattern would improve performance.
```
/**
 * Small benchmark that repeatedly performs an element-wise addition of two
 * two-dimensional double arrays into a third one and prints how long the
 * last batch of calls took.
 */
public class DoubleArray2 {
// Length used for both dimensions of every array below.
final private static int NUM = 64;
private static double[][] a = new double[NUM][NUM];
private static double[][] b = new double[NUM][NUM];
private static double[][] c = new double[NUM][NUM];
/**
 * Stores {@code b[i][j] + c[i][j]} into {@code a[i][j]} for every row index
 * {@code i} of {@code a} and every column index {@code j} below
 * {@code a[0].length}.
 *
 * The inner bound is always read from row 0 of {@code a}, so every row of
 * {@code a}, {@code b} and {@code c} must be at least that long (this holds
 * for the NUM x NUM arrays declared above).
 *
 * @param a destination array
 * @param b first operand
 * @param c second operand
 */
private static void test(double[][] a , double[][] b, double[][] c) {
for(int i = 0; i < a.length; i++) {
for (int j = 0; j < a[0].length; j++) {
a[i][j] = b[i][j] + c[i][j];
}
}
}
/**
 * Calls {@code test(a, b, c)} 400000 times in each of three consecutive loops
 * and prints the elapsed wall-clock milliseconds of the third loop only.
 *
 * @param args unused
 */
public static void main(String[] args) {
// The first two loops are not timed; presumably they serve as warm-up so
// that test() is JIT-compiled before the measurement starts.
for(long i = 0; i < 400000; i++) {
test(a, b, c);
}
for(long i = 0; i < 400000; i++) {
test(a, b, c);
}
// Timed run: time0 is taken just before the loop, time1 just after it.
long time1, time0 = System.currentTimeMillis();
for(long i = 0; i < 400000; i++) {
test(a, b, c);
}
time1 = System.currentTimeMillis();
System.out.println("Time: " + (time1 - time0));
}
}
```
Before this fix: auto-vectorization failed (the inner loop uses scalar `movsd`/`vaddsd` instructions).
```
220 B33: # out( B33 B34 ) <- in( B32 B33 ) Loop( B33-B33 inner main of N130) Freq: 5.99763e+07
220 movsd XMM7, [RDX + #16 + R8 << #3] # double
227 vaddsd XMM7, XMM7, [RAX + #16 + R8 << #3]
22e movsd [RCX + #16 + R8 << #3], XMM7 # double
235 movsd XMM7, [RDX + #24 + R8 << #3] # double
23c vaddsd XMM7, XMM7, [RAX + #24 + R8 << #3]
243 movsd [RCX + #24 + R8 << #3], XMM7 # double
24a movsd XMM7, [RAX + #32 + R8 << #3] # double
251 vaddsd XMM7, XMM7, [RDX + #32 + R8 << #3]
258 movsd [RCX + #32 + R8 << #3], XMM7 # double
25f movsd XMM7, [RAX + #40 + R8 << #3] # double
266 vaddsd XMM7, XMM7, [RDX + #40 + R8 << #3]
26d movsd [RCX + #40 + R8 << #3], XMM7 # double
274 addl R8, #4 # int
278 cmpl R8, R11
27b jl,s B33 # loop end P=0.999999 C=-1.000000
```
After this fix: auto-vectorization succeeded (the inner loop uses packed `vaddpd` with vector loads and stores).
```
292 B39: # out( B39 B40 ) <- in( B38 B39 ) Loop( B39-B39 inner main of N194) Freq: 5.26695e+07
292 load_vector XMM8,[RBX + #16 + RDI << #3]
298 vaddpd XMM8,XMM8,[RDX + #16 + RDI << #3] ! add packedD
29e store_vector [RCX + #16 + RDI << #3],XMM8
2a4 load_vector XMM8,[RBX + #48 + RDI << #3]
2aa vaddpd XMM8,XMM8,[RDX + #48 + RDI << #3] ! add packedD
2b0 store_vector [RCX + #48 + RDI << #3],XMM8
2b6 addl RDI, #8 # int
2b9 cmpl RDI, R8
2bc jl,s B39 # loop end P=0.999999 C=-1.000000
```