Type: Enhancement
Resolution: Fixed
Priority: P4
Fix Version: 17
Resolved In Build: b24
CPU: aarch64
OS: generic
The vector reduce_max, reduce_min and reduce_add operations can be optimized on AArch64 with NEON pairwise instructions.
## reduce_add2I, before
mov w10, v19.s[0]
mov w2, v19.s[1]
add w10, w0, w10
add w10, w10, w2
## reduce_add2I, optimized
addp v23.2s, v24.2s, v24.2s
mov w10, v23.s[0]
add w10, w10, w2
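For reference, here is a minimal Java sketch (not part of the original report; class and method names are illustrative) of the kind of Vector API add reduction that exercises this code path. It assumes the 64-bit int species (two lanes), matching the 2I shape above, and needs --add-modules jdk.incubator.vector:

import jdk.incubator.vector.IntVector;
import jdk.incubator.vector.VectorOperators;
import jdk.incubator.vector.VectorSpecies;

public class ReduceAdd2I {
    // 64-bit species: two int lanes, matching the 2I shape above.
    static final VectorSpecies<Integer> SPECIES = IntVector.SPECIES_64;

    // Sums a[]; each reduceLanes(ADD) call is the reduction that C2
    // can compile to an addp-based sequence like the one shown above.
    static int sum(int[] a) {
        int acc = 0;
        int i = 0;
        for (; i <= a.length - SPECIES.length(); i += SPECIES.length()) {
            acc += IntVector.fromArray(SPECIES, a, i).reduceLanes(VectorOperators.ADD);
        }
        for (; i < a.length; i++) {
            acc += a[i]; // scalar tail
        }
        return acc;
    }
}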
## reduce_min2I, before
dup v16.2d, v23.d[0]
sminv s16, v16.4s
mov w10, v16.s[0]
cmp w10, w0
csel w10, w10, w0, lt
## reduce_min2I, optimized
sminp v16.2s, v27.2s, v27.2s
mov w10, v16.s[0]
cmp w10, w0
csel w10, w10, w0, lt
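The min reduction above has a direct Vector API counterpart, and the max case follows the analogous smaxv/smaxp form. A hedged sketch, again with illustrative names and the 64-bit int species; the scalar tail is omitted for brevity (assumes a.length is a multiple of two):

import jdk.incubator.vector.IntVector;
import jdk.incubator.vector.VectorOperators;
import jdk.incubator.vector.VectorSpecies;

public class ReduceMinMax2I {
    static final VectorSpecies<Integer> SPECIES = IntVector.SPECIES_64;

    // reduceLanes(MIN) is the per-vector reduction behind the sminv/sminp
    // sequences above; reduceLanes(MAX) maps to smaxv/smaxp in the same way.
    static int min(int[] a) {
        int acc = Integer.MAX_VALUE;
        for (int i = 0; i < a.length; i += SPECIES.length()) {
            acc = Math.min(acc, IntVector.fromArray(SPECIES, a, i).reduceLanes(VectorOperators.MIN));
        }
        return acc;
    }

    static int max(int[] a) {
        int acc = Integer.MIN_VALUE;
        for (int i = 0; i < a.length; i += SPECIES.length()) {
            acc = Math.max(acc, IntVector.fromArray(SPECIES, a, i).reduceLanes(VectorOperators.MAX));
        }
        return acc;
    }
}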
Tested with the JMH benchmark available at [1]; observed about a 51.23% improvement for reduce_add2I and roughly 7.5% improvements for reduce_min2I and reduce_max2I.
[1] https://github.com/openjdk/panama-vector/blob/vectorIntrinsics/test/jdk/jdk/incubator/vector/benchmark/src/main/java/benchmark/jdk/incubator/vector/Int64Vector.java
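The benchmark in [1] covers these reductions for the 64-bit int species. A minimal hand-written JMH sketch in the same spirit (class and method names below are hypothetical, not the actual benchmark methods) would look roughly like this, run with --add-modules jdk.incubator.vector:

import java.util.concurrent.ThreadLocalRandom;
import jdk.incubator.vector.IntVector;
import jdk.incubator.vector.VectorOperators;
import jdk.incubator.vector.VectorSpecies;
import org.openjdk.jmh.annotations.*;

@State(Scope.Thread)
public class Int64ReduceBench {
    static final VectorSpecies<Integer> SPECIES = IntVector.SPECIES_64;

    @Param("1024") // assumed array size, divisible by the 2-lane species length
    int size;

    int[] a;

    @Setup
    public void setup() {
        a = new int[size];
        for (int i = 0; i < size; i++) {
            a[i] = ThreadLocalRandom.current().nextInt();
        }
    }

    @Benchmark
    public int addReduction() {
        int acc = 0;
        for (int i = 0; i < size; i += SPECIES.length()) {
            acc += IntVector.fromArray(SPECIES, a, i).reduceLanes(VectorOperators.ADD);
        }
        return acc;
    }

    @Benchmark
    public int minReduction() {
        int acc = Integer.MAX_VALUE;
        for (int i = 0; i < size; i += SPECIES.length()) {
            acc = Math.min(acc, IntVector.fromArray(SPECIES, a, i).reduceLanes(VectorOperators.MIN));
        }
        return acc;
    }
}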