-
Bug
-
Resolution: Fixed
-
P3
-
16
Test case:
/**
 * Reproducer that applies a constant shift by 10 to every element of a
 * {@code short[]} (left shift and unsigned right shift) and checks the
 * stored results element by element.
 */
public class TestVectorShiftShorts {
// Length of both test arrays.
private static final int ARRLEN = 1000;
// Number of warm-up rounds executed before the checked run.
private static final int ITERS = 20000;
/**
 * Warms up both shift kernels, then runs each one once on freshly
 * initialized arrays and verifies the output. Prints "Test passed" when
 * both checks succeed; a failed check surfaces as the RuntimeException
 * thrown by the verify methods.
 */
public static void main(String args[]) {
short[] a0 = new short[ARRLEN];
short[] a1 = new short[ARRLEN];
// Initialize
test_init(a0, a1);
// Warmup: call both kernels ITERS times before the checked run
// (presumably so the JIT compiles them - the accompanying command line
// uses -Xbatch and restricts compilation to test_lshift).
for (int i = 0; i < ITERS; i++) {
test_lshift(a0, a1);
test_urshift(a0, a1);
}
// Test and verify results. The arrays are re-initialized before each
// kernel so a0 does not carry over values written by an earlier call.
test_init(a0, a1);
test_lshift(a0, a1);
verify_lshift(a0, a1);
test_init(a0, a1);
test_urshift(a0, a1);
verify_urshift(a0, a1);
// Finish
System.out.println("Test passed");
}
/**
 * Fills the first ARRLEN elements of both arrays: a0[i] gets i & 3
 * (the repeating pattern 0,1,2,3) and a1[i] gets the index itself,
 * i.e. 0..ARRLEN-1.
 */
static void test_init(short[] a0, short[] a1) {
for (int i = 0; i < ARRLEN; i++) {
a0[i] = (short)(i & 3);
a1[i] = (short)i;
}
}
/**
 * Kernel under test: stores a1[i] << 10, narrowed to short, into a0[i]
 * for the first ARRLEN elements. a1 is only read.
 */
static void test_lshift(short[] a0, short[] a1) {
for (int i = 0; i < ARRLEN; i++) {
// a1[i] is promoted to int for the shift; the cast keeps the low 16 bits.
a0[i] = (short)(a1[i] << 10);
}
}
/**
 * Recomputes a1[i] << 10 for each of the first ARRLEN elements and
 * compares it with a0[i]; throws RuntimeException on the first mismatch.
 */
static void verify_lshift(short[] a0, short[] a1) {
for (int i = 0; i < ARRLEN; i++) {
if (a0[i] != (short)(a1[i] << 10)) {
throw new RuntimeException("LShift test failed.");
}
}
}
/**
 * Kernel under test: stores a1[i] >>> 10, narrowed to short, into a0[i]
 * for the first ARRLEN elements. a1 is only read.
 */
static void test_urshift(short[] a0, short[] a1) {
for (int i = 0; i < ARRLEN; i++) {
// a1[i] is promoted to int before the unsigned shift; the cast keeps
// the low 16 bits of the shifted value.
a0[i] = (short)(a1[i] >>> 10);
}
}
/**
 * Recomputes a1[i] >>> 10 for each of the first ARRLEN elements and
 * compares it with a0[i]; throws RuntimeException on the first mismatch.
 */
// NOTE(review): with the data written by test_init (a1[i] in 0..999) every
// expected value here is 0, so this check would not notice a kernel that
// wrongly zeroes its output - consider wider input values.
static void verify_urshift(short[] a0, short[] a1) {
for (int i = 0; i < ARRLEN; i++) {
if (a0[i] != (short)(a1[i] >>> 10)) {
throw new RuntimeException("URshift test failed.");
}
}
}
}
Command line:
$qemu-aarch64 -cpu max,sve-max-vq=2 java -XX:UseSVE=2 -Xbatch -XX:-TieredCompilation -XX:CompileCommand=compileonly,TestVectorShiftShorts::test_lshift -XX:+PrintCompilation -XX:+PrintAssembly TestVectorShiftShorts
Take the following SVE pattern as an example:
1504 instruct vlslS_imm(vReg dst, vReg src, immI shift) %{
1505 predicate(UseSVE > 0 && n->as_Vector()->length() >= 8);
1506 match(Set dst (LShiftVS src shift)); <========== Cannot match !!!
1507 ins_cost(SVE_COST);
1508 format %{ "sve_lsl $dst, $src, $shift\t# vector (sve) (H)" %}
1509 ins_encode %{
1510 int con = (int)$shift$$constant;
1511 if (con >= 8) { <========== Should be 16 !!!
1512 __ sve_eor(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg),
1513 as_FloatRegister($src$$reg));
1514 return;
1515 }
1516 __ sve_lsl(as_FloatRegister($dst$$reg), __ H,
1517 as_FloatRegister($src$$reg), con);
1518 %}
1519 ins_pipe(pipe_slow);
1520 %}
1. before fixing matching rule
329 ;; B13: # out( B13 B14 ) <- in( B12 B13 ) Loop( B13-B13 inner main of N98) Freq: 958.688
330 0x000000551d61b610: sbfiz x10, x17, #1, #32 ;*sastore {reexecute=0 rethrow=0 return_oop=0}
331 ; - TestVectorShiftShorts::test_lshift@18 (line 77)
332 0x000000551d61b614: add x11, x2, x10 ;*saload {reexecute=0 rethrow=0 return_oop=0}
333 ; - TestVectorShiftShorts::test_lshift@13 (line 77)
334 0x000000551d61b618: add x12, x11, #0x10
335 0x000000551d61b61c: ld1h {z20.h}, p7/z, [x12]
336 0x000000551d61b620: lsl z20.h, p7/m, z20.h, z16.h <======================= Correct, but can be improved
337 0x000000551d61b624: add x10, x1, x10
338 0x000000551d61b628: add x12, x10, #0x10
339 0x000000551d61b62c: st1h {z20.h}, p7, [x12]
340 0x000000551d61b630: add x12, x11, #0x30
341 0x000000551d61b634: ld1h {z20.h}, p7/z, [x12]
342 0x000000551d61b638: lsl z20.h, p7/m, z20.h, z17.h
343 0x000000551d61b63c: add x12, x10, #0x30
344 0x000000551d61b640: st1h {z20.h}, p7, [x12]
345 0x000000551d61b644: add x12, x11, #0x50
346 0x000000551d61b648: ld1h {z20.h}, p7/z, [x12]
347 0x000000551d61b64c: lsl z20.h, p7/m, z20.h, z19.h
348 0x000000551d61b650: add x12, x10, #0x50
349 0x000000551d61b654: st1h {z20.h}, p7, [x12]
350 0x000000551d61b658: add x11, x11, #0x70
351 0x000000551d61b65c: ld1h {z20.h}, p7/z, [x11]
352 0x000000551d61b660: lsl z20.h, p7/m, z20.h, z18.h
353 0x000000551d61b664: add x10, x10, #0x70
354 0x000000551d61b668: st1h {z20.h}, p7, [x10] ;*sastore {reexecute=0 rethrow=0 return_oop=0}
355 ; - TestVectorShiftShorts::test_lshift@18 (line 77)
356 0x000000551d61b66c: add w17, w17, #0x40 ;*iinc {reexecute=0 rethrow=0 return_oop=0}
357 ; - TestVectorShiftShorts::test_lshift@19 (line 76)
358 0x000000551d61b670: cmp w17, #0x3a9
359 0x000000551d61b674: b.lt 0x000000551d61b610
2. after fixing matching rule
311 ;; B12: # out( B12 B13 ) <- in( B11 B12 ) Loop( B12-B12 inner main of N98) Freq: 958.688
312 0x000000551d614270: sbfiz x10, x16, #1, #32 ;*sastore {reexecute=0 rethrow=0 return_oop=0}
313 ; - TestVectorShiftShorts::test_lshift@18 (line 77)
314 0x000000551d614274: add x11, x2, x10 ;*saload {reexecute=0 rethrow=0 return_oop=0}
315 ; - TestVectorShiftShorts::test_lshift@13 (line 77)
316 0x000000551d614278: add x12, x11, #0x10
317 0x000000551d61427c: ld1h {z16.h}, p7/z, [x12]
318 0x000000551d614280: eor z16.d, z16.d, z16.d <======================= WRONG !!!
319 0x000000551d614284: add x10, x1, x10
320 0x000000551d614288: add x12, x10, #0x10
321 0x000000551d61428c: st1h {z16.h}, p7, [x12]
322 0x000000551d614290: add x12, x11, #0x30
323 0x000000551d614294: ld1h {z16.h}, p7/z, [x12]
324 0x000000551d614298: eor z16.d, z16.d, z16.d
325 0x000000551d61429c: add x12, x10, #0x30
326 0x000000551d6142a0: st1h {z16.h}, p7, [x12]
327 0x000000551d6142a4: add x12, x11, #0x50
328 0x000000551d6142a8: ld1h {z16.h}, p7/z, [x12]
329 0x000000551d6142ac: eor z16.d, z16.d, z16.d
330 0x000000551d6142b0: add x12, x10, #0x50
331 0x000000551d6142b4: st1h {z16.h}, p7, [x12]
332 0x000000551d6142b8: add x11, x11, #0x70
333 0x000000551d6142bc: ld1h {z16.h}, p7/z, [x11]
334 0x000000551d6142c0: eor z16.d, z16.d, z16.d
335 0x000000551d6142c4: add x10, x10, #0x70
336 0x000000551d6142c8: st1h {z16.h}, p7, [x10] ;*sastore {reexecute=0 rethrow=0 return_oop=0}
337 ; - TestVectorShiftShorts::test_lshift@18 (line 77)
338 0x000000551d6142cc: add w16, w16, #0x40 ;*iinc {reexecute=0 rethrow=0 return_oop=0}
339 ; - TestVectorShiftShorts::test_lshift@19 (line 76)
340 0x000000551d6142d0: cmp w16, #0x3a9
341 0x000000551d6142d4: b.lt 0x000000551d614270
Note: the SVE eor instruction is incorrect here, and it causes the test case to fail.
3. after fixing code gen
311 ;; B12: # out( B12 B13 ) <- in( B11 B12 ) Loop( B12-B12 inner main of N98) Freq: 958.688
312 0x000000551d61b5f0: sbfiz x10, x16, #1, #32 ;*sastore {reexecute=0 rethrow=0 return_oop=0}
313 ; - TestVectorShiftShorts::test_lshift@18 (line 77)
314 0x000000551d61b5f4: add x11, x2, x10 ;*saload {reexecute=0 rethrow=0 return_oop=0}
315 ; - TestVectorShiftShorts::test_lshift@13 (line 77)
316 0x000000551d61b5f8: add x12, x11, #0x10
317 0x000000551d61b5fc: ld1h {z16.h}, p7/z, [x12]
318 0x000000551d61b600: lsl z16.h, z16.h, #10 <======================= Correct and better
319 0x000000551d61b604: add x10, x1, x10
320 0x000000551d61b608: add x12, x10, #0x10
321 0x000000551d61b60c: st1h {z16.h}, p7, [x12]
322 0x000000551d61b610: add x12, x11, #0x30
323 0x000000551d61b614: ld1h {z16.h}, p7/z, [x12]
324 0x000000551d61b618: lsl z16.h, z16.h, #10
325 0x000000551d61b61c: add x12, x10, #0x30
326 0x000000551d61b620: st1h {z16.h}, p7, [x12]
327 0x000000551d61b624: add x12, x11, #0x50
328 0x000000551d61b628: ld1h {z16.h}, p7/z, [x12]
329 0x000000551d61b62c: lsl z16.h, z16.h, #10
330 0x000000551d61b630: add x12, x10, #0x50
331 0x000000551d61b634: st1h {z16.h}, p7, [x12]
332 0x000000551d61b638: add x11, x11, #0x70
333 0x000000551d61b63c: ld1h {z16.h}, p7/z, [x11]
334 0x000000551d61b640: lsl z16.h, z16.h, #10
335 0x000000551d61b644: add x10, x10, #0x70
336 0x000000551d61b648: st1h {z16.h}, p7, [x10] ;*sastore {reexecute=0 rethrow=0 return_oop=0}
337 ; - TestVectorShiftShorts::test_lshift@18 (line 77)
338 0x000000551d61b64c: add w16, w16, #0x40 ;*iinc {reexecute=0 rethrow=0 return_oop=0}
339 ; - TestVectorShiftShorts::test_lshift@19 (line 76)
340 0x000000551d61b650: cmp w16, #0x3a9
341 0x000000551d61b654: b.lt 0x000000551d61b5f0
Please find the proposed patch attached. I will open a PR for review.
/**
 * Reproducer that applies a constant shift by 10 to every element of a
 * {@code short[]} (left shift and unsigned right shift) and checks the
 * stored results element by element.
 */
public class TestVectorShiftShorts {
// Length of both test arrays.
private static final int ARRLEN = 1000;
// Number of warm-up rounds executed before the checked run.
private static final int ITERS = 20000;
/**
 * Warms up both shift kernels, then runs each one once on freshly
 * initialized arrays and verifies the output. Prints "Test passed" when
 * both checks succeed; a failed check surfaces as the RuntimeException
 * thrown by the verify methods.
 */
public static void main(String args[]) {
short[] a0 = new short[ARRLEN];
short[] a1 = new short[ARRLEN];
// Initialize
test_init(a0, a1);
// Warmup: call both kernels ITERS times before the checked run
// (presumably so the JIT compiles them - the accompanying command line
// uses -Xbatch and restricts compilation to test_lshift).
for (int i = 0; i < ITERS; i++) {
test_lshift(a0, a1);
test_urshift(a0, a1);
}
// Test and verify results. The arrays are re-initialized before each
// kernel so a0 does not carry over values written by an earlier call.
test_init(a0, a1);
test_lshift(a0, a1);
verify_lshift(a0, a1);
test_init(a0, a1);
test_urshift(a0, a1);
verify_urshift(a0, a1);
// Finish
System.out.println("Test passed");
}
/**
 * Fills the first ARRLEN elements of both arrays: a0[i] gets i & 3
 * (the repeating pattern 0,1,2,3) and a1[i] gets the index itself,
 * i.e. 0..ARRLEN-1.
 */
static void test_init(short[] a0, short[] a1) {
for (int i = 0; i < ARRLEN; i++) {
a0[i] = (short)(i & 3);
a1[i] = (short)i;
}
}
/**
 * Kernel under test: stores a1[i] << 10, narrowed to short, into a0[i]
 * for the first ARRLEN elements. a1 is only read.
 */
static void test_lshift(short[] a0, short[] a1) {
for (int i = 0; i < ARRLEN; i++) {
// a1[i] is promoted to int for the shift; the cast keeps the low 16 bits.
a0[i] = (short)(a1[i] << 10);
}
}
/**
 * Recomputes a1[i] << 10 for each of the first ARRLEN elements and
 * compares it with a0[i]; throws RuntimeException on the first mismatch.
 */
static void verify_lshift(short[] a0, short[] a1) {
for (int i = 0; i < ARRLEN; i++) {
if (a0[i] != (short)(a1[i] << 10)) {
throw new RuntimeException("LShift test failed.");
}
}
}
/**
 * Kernel under test: stores a1[i] >>> 10, narrowed to short, into a0[i]
 * for the first ARRLEN elements. a1 is only read.
 */
static void test_urshift(short[] a0, short[] a1) {
for (int i = 0; i < ARRLEN; i++) {
// a1[i] is promoted to int before the unsigned shift; the cast keeps
// the low 16 bits of the shifted value.
a0[i] = (short)(a1[i] >>> 10);
}
}
/**
 * Recomputes a1[i] >>> 10 for each of the first ARRLEN elements and
 * compares it with a0[i]; throws RuntimeException on the first mismatch.
 */
// NOTE(review): with the data written by test_init (a1[i] in 0..999) every
// expected value here is 0, so this check would not notice a kernel that
// wrongly zeroes its output - consider wider input values.
static void verify_urshift(short[] a0, short[] a1) {
for (int i = 0; i < ARRLEN; i++) {
if (a0[i] != (short)(a1[i] >>> 10)) {
throw new RuntimeException("URshift test failed.");
}
}
}
}
Command line:
$qemu-aarch64 -cpu max,sve-max-vq=2 java -XX:UseSVE=2 -Xbatch -XX:-TieredCompilation -XX:CompileCommand=compileonly,TestVectorShiftShorts::test_lshift -XX:+PrintCompilation -XX:+PrintAssembly TestVectorShiftShorts
Take the following SVE pattern as an example:
1504 instruct vlslS_imm(vReg dst, vReg src, immI shift) %{
1505 predicate(UseSVE > 0 && n->as_Vector()->length() >= 8);
1506 match(Set dst (LShiftVS src shift)); <========== Cannot match !!!
1507 ins_cost(SVE_COST);
1508 format %{ "sve_lsl $dst, $src, $shift\t# vector (sve) (H)" %}
1509 ins_encode %{
1510 int con = (int)$shift$$constant;
1511 if (con >= 8) { <========== Should be 16 !!!
1512 __ sve_eor(as_FloatRegister($dst$$reg), as_FloatRegister($src$$reg),
1513 as_FloatRegister($src$$reg));
1514 return;
1515 }
1516 __ sve_lsl(as_FloatRegister($dst$$reg), __ H,
1517 as_FloatRegister($src$$reg), con);
1518 %}
1519 ins_pipe(pipe_slow);
1520 %}
1. before fixing matching rule
329 ;; B13: # out( B13 B14 ) <- in( B12 B13 ) Loop( B13-B13 inner main of N98) Freq: 958.688
330 0x000000551d61b610: sbfiz x10, x17, #1, #32 ;*sastore {reexecute=0 rethrow=0 return_oop=0}
331 ; - TestVectorShiftShorts::test_lshift@18 (line 77)
332 0x000000551d61b614: add x11, x2, x10 ;*saload {reexecute=0 rethrow=0 return_oop=0}
333 ; - TestVectorShiftShorts::test_lshift@13 (line 77)
334 0x000000551d61b618: add x12, x11, #0x10
335 0x000000551d61b61c: ld1h {z20.h}, p7/z, [x12]
336 0x000000551d61b620: lsl z20.h, p7/m, z20.h, z16.h <======================= Correct, but can be improved
337 0x000000551d61b624: add x10, x1, x10
338 0x000000551d61b628: add x12, x10, #0x10
339 0x000000551d61b62c: st1h {z20.h}, p7, [x12]
340 0x000000551d61b630: add x12, x11, #0x30
341 0x000000551d61b634: ld1h {z20.h}, p7/z, [x12]
342 0x000000551d61b638: lsl z20.h, p7/m, z20.h, z17.h
343 0x000000551d61b63c: add x12, x10, #0x30
344 0x000000551d61b640: st1h {z20.h}, p7, [x12]
345 0x000000551d61b644: add x12, x11, #0x50
346 0x000000551d61b648: ld1h {z20.h}, p7/z, [x12]
347 0x000000551d61b64c: lsl z20.h, p7/m, z20.h, z19.h
348 0x000000551d61b650: add x12, x10, #0x50
349 0x000000551d61b654: st1h {z20.h}, p7, [x12]
350 0x000000551d61b658: add x11, x11, #0x70
351 0x000000551d61b65c: ld1h {z20.h}, p7/z, [x11]
352 0x000000551d61b660: lsl z20.h, p7/m, z20.h, z18.h
353 0x000000551d61b664: add x10, x10, #0x70
354 0x000000551d61b668: st1h {z20.h}, p7, [x10] ;*sastore {reexecute=0 rethrow=0 return_oop=0}
355 ; - TestVectorShiftShorts::test_lshift@18 (line 77)
356 0x000000551d61b66c: add w17, w17, #0x40 ;*iinc {reexecute=0 rethrow=0 return_oop=0}
357 ; - TestVectorShiftShorts::test_lshift@19 (line 76)
358 0x000000551d61b670: cmp w17, #0x3a9
359 0x000000551d61b674: b.lt 0x000000551d61b610
2. after fixing matching rule
311 ;; B12: # out( B12 B13 ) <- in( B11 B12 ) Loop( B12-B12 inner main of N98) Freq: 958.688
312 0x000000551d614270: sbfiz x10, x16, #1, #32 ;*sastore {reexecute=0 rethrow=0 return_oop=0}
313 ; - TestVectorShiftShorts::test_lshift@18 (line 77)
314 0x000000551d614274: add x11, x2, x10 ;*saload {reexecute=0 rethrow=0 return_oop=0}
315 ; - TestVectorShiftShorts::test_lshift@13 (line 77)
316 0x000000551d614278: add x12, x11, #0x10
317 0x000000551d61427c: ld1h {z16.h}, p7/z, [x12]
318 0x000000551d614280: eor z16.d, z16.d, z16.d <======================= WRONG !!!
319 0x000000551d614284: add x10, x1, x10
320 0x000000551d614288: add x12, x10, #0x10
321 0x000000551d61428c: st1h {z16.h}, p7, [x12]
322 0x000000551d614290: add x12, x11, #0x30
323 0x000000551d614294: ld1h {z16.h}, p7/z, [x12]
324 0x000000551d614298: eor z16.d, z16.d, z16.d
325 0x000000551d61429c: add x12, x10, #0x30
326 0x000000551d6142a0: st1h {z16.h}, p7, [x12]
327 0x000000551d6142a4: add x12, x11, #0x50
328 0x000000551d6142a8: ld1h {z16.h}, p7/z, [x12]
329 0x000000551d6142ac: eor z16.d, z16.d, z16.d
330 0x000000551d6142b0: add x12, x10, #0x50
331 0x000000551d6142b4: st1h {z16.h}, p7, [x12]
332 0x000000551d6142b8: add x11, x11, #0x70
333 0x000000551d6142bc: ld1h {z16.h}, p7/z, [x11]
334 0x000000551d6142c0: eor z16.d, z16.d, z16.d
335 0x000000551d6142c4: add x10, x10, #0x70
336 0x000000551d6142c8: st1h {z16.h}, p7, [x10] ;*sastore {reexecute=0 rethrow=0 return_oop=0}
337 ; - TestVectorShiftShorts::test_lshift@18 (line 77)
338 0x000000551d6142cc: add w16, w16, #0x40 ;*iinc {reexecute=0 rethrow=0 return_oop=0}
339 ; - TestVectorShiftShorts::test_lshift@19 (line 76)
340 0x000000551d6142d0: cmp w16, #0x3a9
341 0x000000551d6142d4: b.lt 0x000000551d614270
Note: the SVE eor instruction is incorrect here, and it causes the test case to fail.
3. after fixing code gen
311 ;; B12: # out( B12 B13 ) <- in( B11 B12 ) Loop( B12-B12 inner main of N98) Freq: 958.688
312 0x000000551d61b5f0: sbfiz x10, x16, #1, #32 ;*sastore {reexecute=0 rethrow=0 return_oop=0}
313 ; - TestVectorShiftShorts::test_lshift@18 (line 77)
314 0x000000551d61b5f4: add x11, x2, x10 ;*saload {reexecute=0 rethrow=0 return_oop=0}
315 ; - TestVectorShiftShorts::test_lshift@13 (line 77)
316 0x000000551d61b5f8: add x12, x11, #0x10
317 0x000000551d61b5fc: ld1h {z16.h}, p7/z, [x12]
318 0x000000551d61b600: lsl z16.h, z16.h, #10 <======================= Correct and better
319 0x000000551d61b604: add x10, x1, x10
320 0x000000551d61b608: add x12, x10, #0x10
321 0x000000551d61b60c: st1h {z16.h}, p7, [x12]
322 0x000000551d61b610: add x12, x11, #0x30
323 0x000000551d61b614: ld1h {z16.h}, p7/z, [x12]
324 0x000000551d61b618: lsl z16.h, z16.h, #10
325 0x000000551d61b61c: add x12, x10, #0x30
326 0x000000551d61b620: st1h {z16.h}, p7, [x12]
327 0x000000551d61b624: add x12, x11, #0x50
328 0x000000551d61b628: ld1h {z16.h}, p7/z, [x12]
329 0x000000551d61b62c: lsl z16.h, z16.h, #10
330 0x000000551d61b630: add x12, x10, #0x50
331 0x000000551d61b634: st1h {z16.h}, p7, [x12]
332 0x000000551d61b638: add x11, x11, #0x70
333 0x000000551d61b63c: ld1h {z16.h}, p7/z, [x11]
334 0x000000551d61b640: lsl z16.h, z16.h, #10
335 0x000000551d61b644: add x10, x10, #0x70
336 0x000000551d61b648: st1h {z16.h}, p7, [x10] ;*sastore {reexecute=0 rethrow=0 return_oop=0}
337 ; - TestVectorShiftShorts::test_lshift@18 (line 77)
338 0x000000551d61b64c: add w16, w16, #0x40 ;*iinc {reexecute=0 rethrow=0 return_oop=0}
339 ; - TestVectorShiftShorts::test_lshift@19 (line 76)
340 0x000000551d61b650: cmp w16, #0x3a9
341 0x000000551d61b654: b.lt 0x000000551d61b5f0
Please find the proposed patch attached. I will open a PR for review.
- relates to
-
JDK-8231441 AArch64: Initial SVE backend support
- Resolved