-
Enhancement
-
Resolution: Fixed
-
P4
-
11, 17, 19
-
b10
This is seen in just about any profiled -Xint (interpreter-only) run. For example, running a simple JMH benchmark like this one:
http://hg.openjdk.java.net/code-tools/jmh/file/492a000a7aea/jmh-samples/src/main/java/org/openjdk/jmh/samples/JMHSample_08_DeadCode.java
...yields this hot region in the method entry with zerolocals:
....[Hottest Region 1]..............................................................................
[0x7f1c55010d47:0x7f1c55010de2] in <stub: method entry point (kind = zerolocals)>
0x00007f1c55010d18: cmpq $0x1,0x18(%rax,%rcx,8)
0x00007f1c55010d21: je 0x00007f1c55010d3c
0x00007f1c55010d23: xor 0x18(%rax,%rcx,8),%rdx
0x00007f1c55010d28: test $0xfffffffffffffffc,%rdx
0x00007f1c55010d2f: je 0x00007f1c55010d41
0x00007f1c55010d31: orq $0x2,0x18(%rax,%rcx,8)
0x00007f1c55010d3a: jmp 0x00007f1c55010d41
0x00007f1c55010d3c: mov %rdx,0x18(%rax,%rcx,8)
0x00007f1c55010d41: sub $0x2,%rcx
0x00007f1c55010d45: jns 0x00007f1c55010cd3
0.20% 0.30% 0x00007f1c55010d47: mov %eax,-0x1000(%rsp)
0.17% 0.26% 0x00007f1c55010d4e: mov %eax,-0x2000(%rsp)
0.04% 0.30% 0x00007f1c55010d55: mov %eax,-0x3000(%rsp)
0.07% 0x00007f1c55010d5c: mov %eax,-0x4000(%rsp)
0.41% 1.35% 0x00007f1c55010d63: mov %eax,-0x5000(%rsp)
0.02% 0.46% 0x00007f1c55010d6a: mov %eax,-0x6000(%rsp)
0.74% 2.61% 0x00007f1c55010d71: mov %eax,-0x7000(%rsp)
0.41% 0.89% 0x00007f1c55010d78: mov %eax,-0x8000(%rsp)
2.80% 5.21% 0x00007f1c55010d7f: mov %eax,-0x9000(%rsp)
0.22% 0.46% 0x00007f1c55010d86: mov %eax,-0xa000(%rsp)
4.32% 6.76% 0x00007f1c55010d8d: mov %eax,-0xb000(%rsp)
1.63% 0.76% 0x00007f1c55010d94: mov %eax,-0xc000(%rsp)
6.82% 5.56% 0x00007f1c55010d9b: mov %eax,-0xd000(%rsp)
0.28% 0.24% 0x00007f1c55010da2: mov %eax,-0xe000(%rsp)
5.25% 2.72% 0x00007f1c55010da9: mov %eax,-0xf000(%rsp)
0.78% 0.20% 0x00007f1c55010db0: mov %eax,-0x10000(%rsp)
2.13% 0.37% 0x00007f1c55010db7: mov %eax,-0x11000(%rsp)
0.52% 0.04% 0x00007f1c55010dbe: mov %eax,-0x12000(%rsp)
3.76% 0.52% 0x00007f1c55010dc5: mov %eax,-0x13000(%rsp)
0.87% 0.02% 0x00007f1c55010dcc: mov %eax,-0x14000(%rsp)
1.91% 0.35% 0x00007f1c55010dd3: movb $0x0,0x295(%r15)
0.54% 0.13% 0x00007f1c55010ddb: cmpb $0x0,0x168da700(%rip) # 0x00007f1c6b8eb4e2
0.07% 0x00007f1c55010de2: je 0x00007f1c55010e12
0x00007f1c55010de8: mov -0x18(%rbp),%rsi
0x00007f1c55010dec: mov %r15,%rdi
0x00007f1c55010def: test $0xf,%esp
0x00007f1c55010df5: je 0x00007f1c55010e0d
0x00007f1c55010dfb: sub $0x8,%rsp
0x00007f1c55010dff: callq 0x00007f1c6b3080d0
0x00007f1c55010e04: add $0x8,%rsp
0x00007f1c55010e08: jmpq 0x00007f1c55010e12
0x00007f1c55010e0d: callq 0x00007f1c6b3080d0
0.20% 0x00007f1c55010e12: movzbl 0x0(%r13),%ebx
....................................................................................................
33.88% 29.57% <total for region 1>
This seems to be due to TemplateInterpreterGenerator::bang_stack_shadow_pages, which does:
void TemplateInterpreterGenerator::bang_stack_shadow_pages(bool native_call) {
// Quick & dirty stack overflow checking: bang the stack & handle trap.
...
// Bang each page in the shadow zone. We can't assume it's been done for
// an interpreter frame with greater than a page of locals, so each page
// needs to be checked. Only true for non-native.
const int page_size = os::vm_page_size();
const int n_shadow_pages = ((int)StackOverflow::stack_shadow_zone_size()) / page_size;
const int start_page = native_call ? n_shadow_pages : 1;
for (int pages = start_page; pages <= n_shadow_pages; pages++) {
__ bang_stack_with_offset(pages*page_size);
}
}
So every time we enter a method in the interpreter, we bang a lot of stack ahead for the shadow zone checks (20 pages in the profile above, at offsets -0x1000 through -0x14000 from %rsp). I believe this could be made significantly better, e.g. by checking how far away we are from the stack_overflow_limit.
WIP PR: https://github.com/openjdk/jdk/pull/7247
http://hg.openjdk.java.net/code-tools/jmh/file/492a000a7aea/jmh-samples/src/main/java/org/openjdk/jmh/samples/JMHSample_08_DeadCode.java
...yields this hot region in the method entry with zerolocals:
....[Hottest Region 1]..............................................................................
[0x7f1c55010d47:0x7f1c55010de2] in <stub: method entry point (kind = zerolocals)>
0x00007f1c55010d18: cmpq $0x1,0x18(%rax,%rcx,8)
0x00007f1c55010d21: je 0x00007f1c55010d3c
0x00007f1c55010d23: xor 0x18(%rax,%rcx,8),%rdx
0x00007f1c55010d28: test $0xfffffffffffffffc,%rdx
0x00007f1c55010d2f: je 0x00007f1c55010d41
0x00007f1c55010d31: orq $0x2,0x18(%rax,%rcx,8)
0x00007f1c55010d3a: jmp 0x00007f1c55010d41
0x00007f1c55010d3c: mov %rdx,0x18(%rax,%rcx,8)
0x00007f1c55010d41: sub $0x2,%rcx
0x00007f1c55010d45: jns 0x00007f1c55010cd3
0.20% 0.30% 0x00007f1c55010d47: mov %eax,-0x1000(%rsp)
0.17% 0.26% 0x00007f1c55010d4e: mov %eax,-0x2000(%rsp)
0.04% 0.30% 0x00007f1c55010d55: mov %eax,-0x3000(%rsp)
0.07% 0x00007f1c55010d5c: mov %eax,-0x4000(%rsp)
0.41% 1.35% 0x00007f1c55010d63: mov %eax,-0x5000(%rsp)
0.02% 0.46% 0x00007f1c55010d6a: mov %eax,-0x6000(%rsp)
0.74% 2.61% 0x00007f1c55010d71: mov %eax,-0x7000(%rsp)
0.41% 0.89% 0x00007f1c55010d78: mov %eax,-0x8000(%rsp)
2.80% 5.21% 0x00007f1c55010d7f: mov %eax,-0x9000(%rsp)
0.22% 0.46% 0x00007f1c55010d86: mov %eax,-0xa000(%rsp)
4.32% 6.76% 0x00007f1c55010d8d: mov %eax,-0xb000(%rsp)
1.63% 0.76% 0x00007f1c55010d94: mov %eax,-0xc000(%rsp)
6.82% 5.56% 0x00007f1c55010d9b: mov %eax,-0xd000(%rsp)
0.28% 0.24% 0x00007f1c55010da2: mov %eax,-0xe000(%rsp)
5.25% 2.72% 0x00007f1c55010da9: mov %eax,-0xf000(%rsp)
0.78% 0.20% 0x00007f1c55010db0: mov %eax,-0x10000(%rsp)
2.13% 0.37% 0x00007f1c55010db7: mov %eax,-0x11000(%rsp)
0.52% 0.04% 0x00007f1c55010dbe: mov %eax,-0x12000(%rsp)
3.76% 0.52% 0x00007f1c55010dc5: mov %eax,-0x13000(%rsp)
0.87% 0.02% 0x00007f1c55010dcc: mov %eax,-0x14000(%rsp)
1.91% 0.35% 0x00007f1c55010dd3: movb $0x0,0x295(%r15)
0.54% 0.13% 0x00007f1c55010ddb: cmpb $0x0,0x168da700(%rip) # 0x00007f1c6b8eb4e2
0.07% 0x00007f1c55010de2: je 0x00007f1c55010e12
0x00007f1c55010de8: mov -0x18(%rbp),%rsi
0x00007f1c55010dec: mov %r15,%rdi
0x00007f1c55010def: test $0xf,%esp
0x00007f1c55010df5: je 0x00007f1c55010e0d
0x00007f1c55010dfb: sub $0x8,%rsp
0x00007f1c55010dff: callq 0x00007f1c6b3080d0
0x00007f1c55010e04: add $0x8,%rsp
0x00007f1c55010e08: jmpq 0x00007f1c55010e12
0x00007f1c55010e0d: callq 0x00007f1c6b3080d0
0.20% 0x00007f1c55010e12: movzbl 0x0(%r13),%ebx
....................................................................................................
33.88% 29.57% <total for region 1>
This seems to be due to TemplateInterpreterGenerator::bang_stack_shadow_pages, which does:
void TemplateInterpreterGenerator::bang_stack_shadow_pages(bool native_call) {
// Quick & dirty stack overflow checking: bang the stack & handle trap.
...
// Bang each page in the shadow zone. We can't assume it's been done for
// an interpreter frame with greater than a page of locals, so each page
// needs to be checked. Only true for non-native.
const int page_size = os::vm_page_size();
const int n_shadow_pages = ((int)StackOverflow::stack_shadow_zone_size()) / page_size;
const int start_page = native_call ? n_shadow_pages : 1;
for (int pages = start_page; pages <= n_shadow_pages; pages++) {
__ bang_stack_with_offset(pages*page_size);
}
}
So every time we enter a method in the interpreter, we bang a lot of stack ahead for the shadow zone checks (20 pages in the profile above, at offsets -0x1000 through -0x14000 from %rsp). I believe this could be made significantly better, e.g. by checking how far away we are from the stack_overflow_limit.
WIP PR: https://github.com/openjdk/jdk/pull/7247
- relates to
-
JDK-8069196 Stack overflow logic is incomplete in template interpreter
- Closed
-
JDK-8281632 riscv: Improve interpreter stack banging
- Resolved
-
JDK-8282224 Correct TIG::bang_stack_shadow_pages comments
- Resolved
-
JDK-8281309 x86: Select better stack banging instruction
- Closed
-
JDK-8287550 Improve stack bang sp update granularity
- Closed
-
JDK-8281469 aarch64: Improve interpreter stack banging
- Resolved
(1 relates to, 3 links to)