-
Enhancement
-
Resolution: Fixed
-
P4
-
9
-
b105
Paul spotted the following small inefficiencies:
for (; wi < l; wi++) {
long bi = ((long) Objects.checkIndex(wi, l, null)) << LOG2_ARRAY_LONG_INDEX_SCALE;
long av = U.getLongUnaligned(a, aOffset + bi);
long bv = U.getLongUnaligned(b, bOffset + bi);
if (av != bv) {
is compiled as:
0b0 B9: # B28 B10 <- B8 B13 Loop: B9-B13 inner main of N130 Freq: 977.661
0b0 movl RDX, RDI # spill
0b2 # castII of RDX
0b2 movq RBX, [R9 + #16 + RDX << #3] # long
0b7 movq RAX, [RSI + #16 + RDX << #3] # long
0bc cmpq RBX, RAX
0bf jne B28 P=0.000000 C=7836.000000
0bf
0c5 B10: # B28 B11 <- B9 Freq: 977.66
0c5 movl RDX, RDI # spill
0c7 incl RDX # int
0c9 # castII of RDX
0c9 movq RBX, [R9 + #16 + RDX << #3] # long
0ce movq RAX, [RSI + #16 + RDX << #3] # long
0d3 cmpq RBX, RAX
0d6 jne B28 P=0.000000 C=7836.000000
0d6
0dc B11: # B28 B12 <- B10 Freq: 977.66
0dc movl RDX, RDI # spill
0de addl RDX, #2 # int
0e1 # castII of RDX
0e1 movq RBX, [R9 + #16 + RDX << #3] # long
0e6 movq RAX, [RSI + #16 + RDX << #3] # long
0eb cmpq RBX, RAX
0ee jne B28 P=0.000000 C=7836.000000
0ee
0f4 B12: # B28 B13 <- B11 Freq: 977.659
0f4 movl RDX, RDI # spill
0f6 addl RDX, #3 # int
0f9 # castII of RDX
0f9 movq RBX, [R9 + #16 + RDX << #3] # long
0fe movq RAX, [RSI + #16 + RDX << #3] # long
103 cmpq RBX, RAX
106 jne B28 P=0.000000 C=7836.000000
106
10c B13: # B9 B14 <- B12 Freq: 977.659
10c addl RDI, #4 # int
10f cmpl RDI, RBP
111 jl,s B9 # loop end P=0.998980 C=7836.000000
But the intermediate increments of the induction variable should be folded into the address computations of the memory accesses.
This loop:
for (; wi < length >> valuesPerWidth; wi++) {
long bi = ((long) wi) << LOG2_ARRAY_LONG_INDEX_SCALE;
long av = U.getLongUnaligned(a, aOffset + bi);
long bv = U.getLongUnaligned(b, bOffset + bi);
if (av != bv) {
where length is an array length, is compiled as:
0b0 B7: # B32 B8 <- B6 B15 Loop: B7-B15 inner main of N123 Freq: 975.843
0b0 movslq R8, RSI # i2l
0b3 movq RAX, [RDX + #16 + R8 << #3] # long
0b8 movq RDI, [RBP + #16 + R8 << #3] # long
0bd cmpq RAX, RDI
0c0 jne B32 P=0.000000 C=7836.000000
0c0
0c6 B8: # B33 B9 <- B7 Freq: 975.842
0c6 movl R8, RSI # spill
0c9 incl R8 # int
0cc movslq RDI, R8 # i2l
0cf movq RAX, [RDX + #16 + RDI << #3] # long
0d4 movq RDI, [RBP + #16 + RDI << #3] # long
0d9 cmpq RAX, RDI
0dc jne B33 P=0.000000 C=7836.000000
0dc
0e2 B9: # B33 B10 <- B8 Freq: 975.842
0e2 movl R8, RSI # spill
0e5 addl R8, #2 # int
0e9 movslq RDI, R8 # i2l
0ec movq RAX, [RDX + #16 + RDI << #3] # long
0f1 movq RDI, [RBP + #16 + RDI << #3] # long
0f6 cmpq RAX, RDI
0f9 jne B33 P=0.000000 C=7836.000000
0f9
0ff B10: # B33 B11 <- B9 Freq: 975.842
0ff movl R8, RSI # spill
102 addl R8, #3 # int
106 movslq RDI, R8 # i2l
109 movq RAX, [RDX + #16 + RDI << #3] # long
10e movq RDI, [RBP + #16 + RDI << #3] # long
113 cmpq RAX, RDI
116 jne B33 P=0.000000 C=7836.000000
116
11c B11: # B33 B12 <- B10 Freq: 975.841
11c movl R8, RSI # spill
11f addl R8, #4 # int
123 movslq RDI, R8 # i2l
126 movq RAX, [RDX + #16 + RDI << #3] # long
12b movq RDI, [RBP + #16 + RDI << #3] # long
130 cmpq RAX, RDI
133 jne B33 P=0.000000 C=7836.000000
133
139 B12: # B33 B13 <- B11 Freq: 975.841
139 movl R8, RSI # spill
13c addl R8, #5 # int
140 movslq RDI, R8 # i2l
143 movq RAX, [RDX + #16 + RDI << #3] # long
148 movq RDI, [RBP + #16 + RDI << #3] # long
14d cmpq RAX, RDI
150 jne B33 P=0.000000 C=7836.000000
150
156 B13: # B33 B14 <- B12 Freq: 975.84
156 movl R8, RSI # spill
159 addl R8, #6 # int
15d movslq RDI, R8 # i2l
160 movq RAX, [RDX + #16 + RDI << #3] # long
165 movq RDI, [RBP + #16 + RDI << #3] # long
16a cmpq RAX, RDI
16d jne B33 P=0.000000 C=7836.000000
16d
173 B14: # B33 B15 <- B13 Freq: 975.84
173 movl R8, RSI # spill
176 addl R8, #7 # int
17a movslq RDI, R8 # i2l
17d movq RAX, [RDX + #16 + RDI << #3] # long
182 movq RDI, [RBP + #16 + RDI << #3] # long
187 cmpq RAX, RDI
18a jne B33 P=0.000000 C=7836.000000
18a
190 B15: # B7 B16 <- B14 Freq: 975.839
190 addl RSI, #8 # int
193 cmpl RSI, R11
196 jl B7 # loop end P=0.998980 C=7836.000000
The i2l conversions are not needed because the loop bounds guarantee that the induction variable is never negative. As above, the intermediate increments of the induction variable should be folded into the address computations.
for (; wi < l; wi++) {
long bi = ((long) Objects.checkIndex(wi, l, null)) << LOG2_ARRAY_LONG_INDEX_SCALE;
long av = U.getLongUnaligned(a, aOffset + bi);
long bv = U.getLongUnaligned(b, bOffset + bi);
if (av != bv) {
is compiled as:
0b0 B9: # B28 B10 <- B8 B13 Loop: B9-B13 inner main of N130 Freq: 977.661
0b0 movl RDX, RDI # spill
0b2 # castII of RDX
0b2 movq RBX, [R9 + #16 + RDX << #3] # long
0b7 movq RAX, [RSI + #16 + RDX << #3] # long
0bc cmpq RBX, RAX
0bf jne B28 P=0.000000 C=7836.000000
0bf
0c5 B10: # B28 B11 <- B9 Freq: 977.66
0c5 movl RDX, RDI # spill
0c7 incl RDX # int
0c9 # castII of RDX
0c9 movq RBX, [R9 + #16 + RDX << #3] # long
0ce movq RAX, [RSI + #16 + RDX << #3] # long
0d3 cmpq RBX, RAX
0d6 jne B28 P=0.000000 C=7836.000000
0d6
0dc B11: # B28 B12 <- B10 Freq: 977.66
0dc movl RDX, RDI # spill
0de addl RDX, #2 # int
0e1 # castII of RDX
0e1 movq RBX, [R9 + #16 + RDX << #3] # long
0e6 movq RAX, [RSI + #16 + RDX << #3] # long
0eb cmpq RBX, RAX
0ee jne B28 P=0.000000 C=7836.000000
0ee
0f4 B12: # B28 B13 <- B11 Freq: 977.659
0f4 movl RDX, RDI # spill
0f6 addl RDX, #3 # int
0f9 # castII of RDX
0f9 movq RBX, [R9 + #16 + RDX << #3] # long
0fe movq RAX, [RSI + #16 + RDX << #3] # long
103 cmpq RBX, RAX
106 jne B28 P=0.000000 C=7836.000000
106
10c B13: # B9 B14 <- B12 Freq: 977.659
10c addl RDI, #4 # int
10f cmpl RDI, RBP
111 jl,s B9 # loop end P=0.998980 C=7836.000000
But the intermediate increments of the induction variable should be folded into the address computations of the memory accesses.
This loop:
for (; wi < length >> valuesPerWidth; wi++) {
long bi = ((long) wi) << LOG2_ARRAY_LONG_INDEX_SCALE;
long av = U.getLongUnaligned(a, aOffset + bi);
long bv = U.getLongUnaligned(b, bOffset + bi);
if (av != bv) {
where length is an array length, is compiled as:
0b0 B7: # B32 B8 <- B6 B15 Loop: B7-B15 inner main of N123 Freq: 975.843
0b0 movslq R8, RSI # i2l
0b3 movq RAX, [RDX + #16 + R8 << #3] # long
0b8 movq RDI, [RBP + #16 + R8 << #3] # long
0bd cmpq RAX, RDI
0c0 jne B32 P=0.000000 C=7836.000000
0c0
0c6 B8: # B33 B9 <- B7 Freq: 975.842
0c6 movl R8, RSI # spill
0c9 incl R8 # int
0cc movslq RDI, R8 # i2l
0cf movq RAX, [RDX + #16 + RDI << #3] # long
0d4 movq RDI, [RBP + #16 + RDI << #3] # long
0d9 cmpq RAX, RDI
0dc jne B33 P=0.000000 C=7836.000000
0dc
0e2 B9: # B33 B10 <- B8 Freq: 975.842
0e2 movl R8, RSI # spill
0e5 addl R8, #2 # int
0e9 movslq RDI, R8 # i2l
0ec movq RAX, [RDX + #16 + RDI << #3] # long
0f1 movq RDI, [RBP + #16 + RDI << #3] # long
0f6 cmpq RAX, RDI
0f9 jne B33 P=0.000000 C=7836.000000
0f9
0ff B10: # B33 B11 <- B9 Freq: 975.842
0ff movl R8, RSI # spill
102 addl R8, #3 # int
106 movslq RDI, R8 # i2l
109 movq RAX, [RDX + #16 + RDI << #3] # long
10e movq RDI, [RBP + #16 + RDI << #3] # long
113 cmpq RAX, RDI
116 jne B33 P=0.000000 C=7836.000000
116
11c B11: # B33 B12 <- B10 Freq: 975.841
11c movl R8, RSI # spill
11f addl R8, #4 # int
123 movslq RDI, R8 # i2l
126 movq RAX, [RDX + #16 + RDI << #3] # long
12b movq RDI, [RBP + #16 + RDI << #3] # long
130 cmpq RAX, RDI
133 jne B33 P=0.000000 C=7836.000000
133
139 B12: # B33 B13 <- B11 Freq: 975.841
139 movl R8, RSI # spill
13c addl R8, #5 # int
140 movslq RDI, R8 # i2l
143 movq RAX, [RDX + #16 + RDI << #3] # long
148 movq RDI, [RBP + #16 + RDI << #3] # long
14d cmpq RAX, RDI
150 jne B33 P=0.000000 C=7836.000000
150
156 B13: # B33 B14 <- B12 Freq: 975.84
156 movl R8, RSI # spill
159 addl R8, #6 # int
15d movslq RDI, R8 # i2l
160 movq RAX, [RDX + #16 + RDI << #3] # long
165 movq RDI, [RBP + #16 + RDI << #3] # long
16a cmpq RAX, RDI
16d jne B33 P=0.000000 C=7836.000000
16d
173 B14: # B33 B15 <- B13 Freq: 975.84
173 movl R8, RSI # spill
176 addl R8, #7 # int
17a movslq RDI, R8 # i2l
17d movq RAX, [RDX + #16 + RDI << #3] # long
182 movq RDI, [RBP + #16 + RDI << #3] # long
187 cmpq RAX, RDI
18a jne B33 P=0.000000 C=7836.000000
18a
190 B15: # B7 B16 <- B14 Freq: 975.839
190 addl RSI, #8 # int
193 cmpl RSI, R11
196 jl B7 # loop end P=0.998980 C=7836.000000
The i2l conversions are not needed because the loop bounds guarantee that the induction variable is never negative. As above, the intermediate increments of the induction variable should be folded into the address computations.
- relates to
-
JDK-8168481 Check possibility of improved performance with the (CastII (AddI x const)) -> (AddI (CastII x) const) transformation
- Open
-
JDK-8147394 CastIINode::Ideal() should be more generic
- Closed
-
JDK-8148786 xml.transform fails on x86-64
- Resolved
-
JDK-8147853 "assert(t->meet(t0) == t) failed: Not monotonic" with sun/util/calendar/zi/TestZoneInfo310.java
- Resolved
-
JDK-8190375 Java Crash in JavaBug.formatPos(I)Ljava/lang/String
- Closed
-
JDK-8074124 Most Unsafe.get*() access shapes are losing vs. the plain Java accesses
- Open
(1 relates to)