Extracting the debug/trace code under DCUBED_C2_FAST_LOCK_DEBUG so there is a simpler case that can be discussed with other folks. The key piece is here: src/share/vm/runtime/sharedRuntime.cpp > @@ -1880,6 +1880,39 @@ JRT_END > > // Handles the uncommon case in locking, i.e., contention or an inflated lock. > JRT_BLOCK_ENTRY(void, SharedRuntime::complete_monitor_locking_C(oopDesc* _obj, BasicLock* lock, JavaThread* thread)) > +#ifdef DCUBED_C2_FAST_LOCK_DEBUG > +#if 0 > +{ > + int dcubed_C2_fast_lock_result = thread->dcubed_C2_fast_lock_result(); > + static FILE * fp = NULL; > + ThreadCritical tc; > + > + if (fp == NULL) { > + fp = fopen("dcubed.debug.out", "w"); > + guarantee(fp != NULL, "cannot create dcubed.debug.out"); > + } > + fprintf(fp, "0x%x\n", dcubed_C2_fast_lock_result); > + fflush(fp); > +} > +#endif > + // SharedRuntime::complete_monitor_locking_C() is only supposed to be > + // called when MacroAssembler::fast_lock() fails. > + if (thread->dcubed_C2_fast_lock_worked()) { > + tty->print_cr("WARNING: errant call to " > + "SharedRuntime::complete_monitor_locking_C() after " > + "MacroAssembler::fast_lock() worked: _obj=" INTPTR_FORMAT > + ", lock=" INTPTR_FORMAT ", thread=" INTPTR_FORMAT > + ", dcubed_C2_fast_lock_result=0x%x", > + _obj, lock, thread, thread->dcubed_C2_fast_lock_result()); > + if (VerifyC2FastLockAndCompleteMLCMatch) { > + fatal("SharedRuntime::complete_monitor_locking_C() should not be " > + "called since MacroAssembler::fast_lock() worked."); > + } > + if (FixC2FastLockAndCompleteMLCMatch) { > + return; > + } > + } > +#endif > // Disable ObjectSynchronizer::quick_enter() in default config > // until JDK-8077392 is resolved. > if ((SyncFlags & 256) != 0 && !SafepointSynchronize::is_synchronizing()) { The "#if 0" part of the code was used to validate the JavaThread::_dcubed_C2_fast_lock_result values seen during a single run of the failing test. 
In src/share/vm/runtime/thread.hpp: #ifdef DCUBED_C2_FAST_LOCK_DEBUG #define DCUBED_C2_FAST_LOCK_CALLED 0xC2464CC2 /* C2 'F' 'L' C2 */ #define DCUBED_C2_FAST_LOCK_WORKED 0x42424242 #define DCUBED_C2_SYNC_METHOD_CALLED 0xC2534DC2 /* C2 'S' 'M' C2 */ $ sort dcubed.debug.out | uniq -c 232 0x42424242 4172 0xc2464cc2 so most of the slow-path calls saw the expected DCUBED_C2_FAST_LOCK_CALLED (0xC2464CC2), but a couple of hundred saw DCUBED_C2_FAST_LOCK_WORKED (0x42424242). This is much, much higher than the 2-3 failures over 72 hours that is normal for this bug so something is not quite right with the new DCUBED_C2_FAST_LOCK_DEBUG code. Here's the hs_err_pid (doit.copyB2.hs_err_pid.0) snippets from a failure of the new fatal(): # Internal Error (sharedRuntime.cpp:1909), pid=20200, tid=106 # fatal error: SharedRuntime::complete_monitor_locking_C() should not be called since MacroAssembler::fast_lock() worked. # # JRE version: Java(TM) SE Runtime Environment (9.0) (build 1.9.0-internal-dcubed_2016_03_16_19_15-b00) --------------- T H R E A D --------------- Current thread (0x0000000003eee000): JavaThread "ForkJoinPool.commonPool-worker-10" daemon [_thread_in_Java, id=106, stack(0xfffffd7fbf5ce000,0xfffffd7fbf6ce000)] Stack: [0xfffffd7fbf5ce000,0xfffffd7fbf6ce000], sp=0xfffffd7fbf6caa30, free space=1010k Native frames: (J=compiled Java code, j=interpreted, Vv=VM code, C=native code) V [libjvm.so+0x13d72f9] void VMError::report(outputStream*,bool)+0xd59 V [libjvm.so+0x13d8bb6] void VMError::report_and_die(int,const char*,const char*,__va_list_element*,Thread*,unsigned char*,void*,void*,const char*,int,unsigned long)+0x596 V [libjvm.so+0x13d85bf] void VMError::report_and_die(Thread*,const char*,int,const char*,const char*,__va_list_element*)+0x3f V [libjvm.so+0xacd40b] void report_fatal(const char*,int,const char*,...)+0xdb V [libjvm.so+0x12180f5] void SharedRuntime::complete_monitor_locking_C(oopDesc*,BasicLock*,JavaThread*)+0x545 v 
~RuntimeStub::_complete_monitor_locking_Java J 1638 C2 java.util.stream.Nodes$SizedCollectorTask.compute()V (132 bytes) @ 0xfffffd7ff2537b84 [0xfffffd7ff2536a60+0x0000000000001124] J 1229 C2 java.util.concurrent.CountedCompleter.exec()Z (6 bytes) @ 0xfffffd7ff24638ac [0xfffffd7ff2463860+0x000000000000004c] J 1509 C2 java.util.concurrent.ForkJoinPool$WorkQueue.localPopAndExec()V (115 bytes) @ 0xfffffd7ff24d52d8 [0xfffffd7ff24d5120+0x00000000000001b8] J 1636% C2 java.util.concurrent.ForkJoinPool.runWorker(Ljava/util/concurrent/ForkJoinPool$WorkQueue;)V (139 bytes) @ 0xfffffd7ff25349dc [0xfffffd7ff25344a0+0x000000000000053c] j java.util.concurrent.ForkJoinWorkerThread.run()V+24 v ~StubRoutines::call_stub V [libjvm.so+0xd0f4bf] void JavaCalls::call_helper(JavaValue*,const methodHandle&,JavaCallArguments*,Thread*)+0x42f V [libjvm.so+0xd0e036] void JavaCalls::call_virtual(JavaValue*,KlassHandle,Symbol*,Symbol*,JavaCallArguments*,Thread*)+0x296 V [libjvm.so+0xd0e278] void JavaCalls::call_virtual(JavaValue*,Handle,KlassHandle,Symbol*,Symbol*,Thread*)+0x68 V [libjvm.so+0xdecb7e] void thread_entry(JavaThread*,Thread*)+0xbe V [libjvm.so+0x12f93f1] void JavaThread::thread_main_inner()+0xf1 V [libjvm.so+0x12f92e2] void JavaThread::run()+0x232 V [libjvm.so+0x1125cb0] java_start+0x230 C [libc.so.1+0xdd9db] _thr_setup+0x5b C [libc.so.1+0xddc10] _lwp_start+0x0 C 0x0000000000000000 Java frames: (J=compiled Java code, j=interpreted, Vv=VM code) v ~RuntimeStub::_complete_monitor_locking_Java J 1638 C2 java.util.stream.Nodes$SizedCollectorTask.compute()V (132 bytes) @ 0xfffffd7ff2537b84 [0xfffffd7ff2536a60+0x0000000000001124] J 1229 C2 java.util.concurrent.CountedCompleter.exec()Z (6 bytes) @ 0xfffffd7ff24638ac [0xfffffd7ff2463860+0x000000000000004c] J 1509 C2 java.util.concurrent.ForkJoinPool$WorkQueue.localPopAndExec()V (115 bytes) @ 0xfffffd7ff24d52d8 [0xfffffd7ff24d5120+0x00000000000001b8] J 1636% C2 
java.util.concurrent.ForkJoinPool.runWorker(Ljava/util/concurrent/ForkJoinPool$WorkQueue;)V (139 bytes) @ 0xfffffd7ff25349dc [0xfffffd7ff25344a0+0x000000000000053c] j java.util.concurrent.ForkJoinWorkerThread.run()V+24 v ~StubRoutines::call_stub Here's the dbx stack trace (doit.copyB2.threads.log.0) from a failure of the new guarantee(): THREAD t@106 t@106(l@106) stopped in __lwp_kill at 0xfffffd7fff29351a 0xfffffd7fff29351a: __lwp_kill+0x000a: jae __lwp_kill+0x18 [ 0xfffffd7fff293528, .+0xe ] current thread: t@106 [1] __lwp_kill(0x6a, 0x6, 0xfffffeb43c74fbc0, 0xfffffd7fff293e0e, 0xfffffd7fbf6cd010, 0x6), at 0xfffffd7fff29351a [2] _thr_kill(), at 0xfffffd7fff28be13 [3] raise(), at 0xfffffd7fff2381b9 [4] abort(), at 0xfffffd7fff216b80 =>[5] os::abort(dump_core = true, siginfo = , context = ) (optimized), at 0xfffffd7ffe9274d6 (line ~1396) in "os_solaris.cpp" [6] VMError::report_and_die(id = , message = , detail_fmt = , detail_args = , thread = , pc = , siginfo = (nil), context = (nil), filename = 0xfffffd7ffeee9860 "/work/shared/bug_hunt/8077392_for_jdk9_hs_rt/hotspot/src/share/vm/runtime/sharedRuntime.cpp", lineno = 1909, size = 0) (optimized), at 0xfffffd7ffebd94f1 (line ~1152) in "vmError.cpp" [7] VMError::report_and_die(thread = , filename = , lineno = , message = , detail_fmt = , detail_args = ) (optimized), at 0xfffffd7ffebd85bf (line ~931) in "vmError.cpp" [8] report_fatal(file = 0xfffffd7ffeee9860 "/work/shared/bug_hunt/8077392_for_jdk9_hs_rt/hotspot/src/share/vm/runtime/sharedRuntime.cpp", line = 1909, detail_fmt = 0xfffffd7ffeee97f0 "SharedRuntime::complete_monitor_locking_C() should notbe called since MacroAssembler::fast_lock() worked.", ...) 
(optimized), at 0xfffffd7ffe2cd40b (line ~227) in "debug.cpp" [9] SharedRuntime::complete_monitor_locking_C(_obj = 0xfffffd7bf79a9d60, lock = 0xfffffd7fbf6cd530, thread = 0x3eee000) (optimized), at 0xfffffd7ffea180f5 (line ~1909) in "sharedRuntime.cpp" [10] 0xfffffd7feab36d08(), at 0xfffffd7feab36d08 [11] 0xfffffd7feab36d08(), at 0xfffffd7feab36d08 [12] 0xfffffd7ff2537b84(), at 0xfffffd7ff2537b84 Current function is Parker::park (optimized) 228 static int cond_wait(cond_t *cv, mutex_t *mx) { return _cond_wait(cv, mx); } Not sure why frame 10 and 11 are the same address info. So let's take a look at the code from frame 10/11 that got us to SharedRuntime::complete_monitor_locking_C(): (dbx) x 0xfffffd7ff2537b7f,0xfffffd7ff2537b84/i 0xfffffd7ff2537b7f: call 0xfffffd7feab36ce0 [ 0xfffffd7feab36ce0, .-0x7a00e9f ] 0xfffffd7ff2537b84: jmp 0xfffffd7ff253741f [ 0xfffffd7ff253741f, .-0x765 ] So frame 12 called 0xfffffd7feab36ce0 which is really close to our frame 10/11 address: (dbx) x 0xfffffd7feab36ce0,0xfffffd7feab36d08/i 0xfffffd7feab36ce0: subq $0x0000000000000008,%rsp 0xfffffd7feab36ce7: movq %rbp,(%rsp) 0xfffffd7feab36ceb: movq %rsp,0x00000000000001d0(%r15) 0xfffffd7feab36cf2: movq %rsi,%rdi 0xfffffd7feab36cf5: movq %rdx,%rsi 0xfffffd7feab36cf8: movq %r15,%rdx 0xfffffd7feab36cfb: movq $complete_monitor_locking_C,%r10 0xfffffd7feab36d05: call *%r10d 0xfffffd7feab36d08: movq $0x0000000000000000,0x00000000000001d0(%r15) so the code from frame 10/11 is pretty much marshalling code for calling complete_monitor_locking_C which has this signature: // Handles the uncommon case in locking, i.e., contention or an inflated lock. 
JRT_BLOCK_ENTRY(void, SharedRuntime::complete_monitor_locking_C(oopDesc* _obj, BasicLock* lock, JavaThread* thread)) subq $0x0000000000000008,%rsp // make space on the stack movq %rbp,(%rsp) // save %rbp on the stack movq %rsp,0x00000000000001d0(%r15) // save %rsp in a field in %r15 (thread) movq %rsi,%rdi // guessing this is _obj param movq %rdx,%rsi // guessing this is lock param movq %r15,%rdx // this is thread param movq $complete_monitor_locking_C,%r10 call *%r10d // call complete_monitor_locking_C // zero the field in %r15 (thread) movq $0x0000000000000000,0x00000000000001d0(%r15) So here's the regs from frame 10: (dbx) regs current thread: t@106 current frame: [10] r15 0x0000000003eee000 r14 0x00000000000003e8 r13 0xfffffd7bf9100870 r12 0xfffffd7bc0000000 r11 0x0000000000000000 r10 0x0000000000000000 r9 0x0000000000000000 r8 0x0000000000000000 rdi 0x0000000000000000 rsi 0x0000000000000000 rbp 0xfffffd7fbf6cd4d0 rbx 0xfffffd7bafab90e8 rdx 0x0000000000000000 rcx 0x0000000000000000 rax 0x0000000000000000 trapno 0x0000000000000000 err 0x0000000000000000 rip 0xfffffd7feab36d08:0xfffffd7feab36d08 movq $0x0000000000000000,0x00000000000001d0(%r15) cs 0x0000000000000000 eflags 0x0000000000000000 rsp 0xfffffd7fbf6cd4a0 ss 0x0000000000000000 fs 0x0000000000000000 gs 0x0000000000000000 es 0x0000000000000000 ds 0x0000000000000000 fsbase 0x0000000000000000 gsbase 0x0000000000000000 (dbx) x 0xfffffd7fbf6cd4d0/X 0xfffffd7fbf6cd4d0: 0xf79a9d60 Here's the regs from frame 11: (dbx) regs current thread: t@106 current frame: [11] r15 0x0000000000000000 r14 0x0000000000000000 r13 0x0000000000000000 r12 0x0000000000000000 r11 0x0000000000000000 r10 0x0000000000000000 r9 0x0000000000000000 r8 0x0000000000000000 rdi 0x0000000000000000 rsi 0x0000000000000000 rbp 0xfffffd7bf79a9d60 rbx 0x0000000000000000 rdx 0x0000000000000000 rcx 0x0000000000000000 rax 0x0000000000000000 trapno 0x0000000000000000 err 0x0000000000000000 rip 0xfffffd7feab36d08:0xfffffd7feab36d08 movq 
$0x0000000000000000,0x00000000000001d0(%r15) cs 0x0000000000000000 eflags 0x0000000000000000 rsp 0x0000000000000000 ss 0x0000000000000000 fs 0x0000000000000000 gs 0x0000000000000000 es 0x0000000000000000 ds 0x0000000000000000 fsbase 0x0000000000000000 gsbase 0x0000000000000000 (dbx) x 0xfffffd7bf79a9d60/X 0xfffffd7bf79a9d60: 0x03a3e402 Here's the regs from frame 12: (dbx) regs current thread: t@106 current frame: [12] r15 0x0000000000000000 r14 0x0000000000000000 r13 0x0000000000000000 r12 0x0000000000000000 r11 0x0000000000000000 r10 0x0000000000000000 r9 0x0000000000000000 r8 0x0000000000000000 rdi 0x0000000000000000 rsi 0x0000000000000000 rbp 0x0000000003a3e402 rbx 0x0000000000000000 rdx 0x0000000000000000 rcx 0x0000000000000000 rax 0x0000000000000000 trapno 0x0000000000000000 err 0x0000000000000000 rip 0xfffffd7ff2537b84:0xfffffd7ff2537b84 jmp 0xfffffd7ff253741f [ 0xfffffd7ff253741f, .-0x765 ] cs 0x0000000000000000 eflags 0x0000000000000000 rsp 0x0000000000000000 ss 0x0000000000000000 fs 0x0000000000000000 gs 0x0000000000000000 es 0x0000000000000000 ds 0x0000000000000000 fsbase 0x0000000000000000 gsbase 0x0000000000000000 (dbx) x 0x0000000003a3e402/X 0x0000000003a3e402: 0x00000000 The *rbp value of NULL explains why the dbx stack trace stops at frame 12. So without a valid frame 13, it's hard to know where we go into the code in frame 12. 
For now, I'm dumping this big section: (dbx) x 0xfffffd7ff2537b00,0xfffffd7ff2537b84/i 0xfffffd7ff2537b00: pushq %rax 0xfffffd7ff2537b01: pushq %rdx 0xfffffd7ff2537b02: pushq %rcx 0xfffffd7ff2537b03: call breakpoint [ 0xfffffd7ffe925950, .+0xc3ede4d ] 0xfffffd7ff2537b08: popq %rcx 0xfffffd7ff2537b09: popq %rdx 0xfffffd7ff2537b0a: popq %rax 0xfffffd7ff2537b0b: lock cmpxchgq %r10,0x0000000000000000(%rbp) 0xfffffd7ff2537b11: leaq 0x0000000000000040(%rsp),%rbx 0xfffffd7ff2537b16: movq $0xffffffffc2464cc2,0x00000000000003e4(%r15) 0xfffffd7ff2537b21: movq 0x0000000000000000(%rbp),%rax 0xfffffd7ff2537b25: testq $0x0000000000000002,%rax 0xfffffd7ff2537b2b: jne 0xfffffd7ff2537b52 [ 0xfffffd7ff2537b52, .+0x27 ] 0xfffffd7ff2537b2d: orq $0x0000000000000001,%rax 0xfffffd7ff2537b31: movq %rax,(%rbx) 0xfffffd7ff2537b34: lock cmpxchgq %rbx,0x0000000000000000(%rbp) 0xfffffd7ff2537b3a: je 0xfffffd7ff2537b65 [ 0xfffffd7ff2537b65, .+0x2b ] 0xfffffd7ff2537b40: subq %rsp,%rax 0xfffffd7ff2537b43: andq $0xfffffffffffff007,%rax 0xfffffd7ff2537b4a: movq %rax,(%rbx) 0xfffffd7ff2537b4d: jmp 0xfffffd7ff2537b65 [ 0xfffffd7ff2537b65, .+0x18 ] 0xfffffd7ff2537b52: movq %rax,%r10 0xfffffd7ff2537b55: xorq %rax,%rax 0xfffffd7ff2537b58: lock cmpxchgq %r15,0x000000000000007e(%r10) 0xfffffd7ff2537b5e: movq $0x0000000000000003,(%rbx) 0xfffffd7ff2537b65: jne 0xfffffd7ff2537b72 [ 0xfffffd7ff2537b72, .+0xd ] 0xfffffd7ff2537b67: movq $0x0000000042424242,0x00000000000003e4(%r15) 0xfffffd7ff2537b72: je 0xfffffd7ff253741f [ 0xfffffd7ff253741f, .-0x753 ] 0xfffffd7ff2537b78: leaq 0x0000000000000040(%rsp),%rdx 0xfffffd7ff2537b7d: nop 0xfffffd7ff2537b7f: call 0xfffffd7feab36ce0 [ 0xfffffd7feab36ce0, .-0x7a00e9f ] 0xfffffd7ff2537b84: jmp 0xfffffd7ff253741f [ 0xfffffd7ff253741f, .-0x765 ] Start Update: These frames from the hs_err_pid stack trace are relevant: V [libjvm.so+0x12180f5] void SharedRuntime::complete_monitor_locking_C(oopDesc*,BasicLock*,JavaThread*)+0x545 v ~RuntimeStub::_complete_monitor_locking_Java J 
1638 C2 java.util.stream.Nodes$SizedCollectorTask.compute()V (132 bytes) @ 0xfffffd7ff2537b84 [0xfffffd7ff2536a60+0x0000000000001124] The code for java.util.stream.Nodes$SizedCollectorTask.compute()V includes the "x 0xfffffd7ff2537b00,0xfffffd7ff2537b84/i" code block from above. In the code block from "x 0xfffffd7ff2536a60,0xfffffd7ff2537b84/i", I found a branch to "0xfffffd7ff2537b11: leaq 0x0000000000000040(%rsp),%rbx" which is the start of the parameter setup for calling fast_lock(). I did not find a branch to: "0xfffffd7ff2537b0b: lock cmpxchgq %r10,0x0000000000000000(%rbp)" so I'm not sure if it belongs to the strange "breakpoint" block or not. End Update I _think_ the "call breakpoint" section at the top is some barrier code emitted by C2 just in case the previous generated code block runs off the end. Here's the original src/cpu/x86/vm/macroAssembler_x86.cpp: fast_lock() with most comments and optional code not included in the current config elided and the DCUBED_C2_FAST_LOCK_DEBUG addition marked by "NNNN" line numbers: 1658 // obj: object to lock 1659 // box: on-stack box address (displaced header location) - KILLED 1660 // rax,: tmp -- KILLED 1661 // scr: tmp -- KILLED 1662 void MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg, 1663 Register scrReg, Register cx1Reg, Register cx2Reg, 1664 BiasedLockingCounters* counters, 1665 RTMLockingCounters* rtm_counters, 1666 RTMLockingCounters* stack_rtm_counters, 1667 Metadata* method_data, 1668 bool use_rtm, bool profile_rtm) { : NNN1 #ifdef DCUBED_C2_FAST_LOCK_DEBUG NNN2 // Mark that this JavaThread called MacroAssembler::fast_lock() NNN3 movptr(Address(r15_thread, JavaThread::dcubed_C2_fast_lock_result_offset()), (int32_t)DCUBED_C2_FAST_LOCK_CALLED); NNN4 #endif : 1727 movptr(tmpReg, Address(objReg, 0)); // [FETCH] 1728 testptr(tmpReg, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased 1729 jccb(Assembler::notZero, IsInflated); : 1732 orptr (tmpReg, 
markOopDesc::unlocked_value); 1733 movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS 1734 if (os::is_MP()) { 1735 lock(); 1736 } 1737 cmpxchgptr(boxReg, Address(objReg, 0)); // Updates tmpReg : 1742 jcc(Assembler::equal, DONE_LABEL); // Success : 1747 subptr(tmpReg, rsp); : 1749 andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) ); 1750 movptr(Address(boxReg, 0), tmpReg); : 1755 jmp(DONE_LABEL); 1756 1757 bind(IsInflated); : 1876 movq(scrReg, tmpReg); 1877 xorq(tmpReg, tmpReg); 1878 1879 if (os::is_MP()) { 1880 lock(); 1881 } 1882 cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); : 1885 movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark())); : 1897 bind(DONE_LABEL); : NNN5 #ifdef DCUBED_C2_FAST_LOCK_DEBUG NNN6 Label MY_DONE_FAILED; NNN7 // if current state is failure, then there is nothing more to do NNN8 jccb(Assembler::notZero, MY_DONE_FAILED); NNN9 // Mark that this JavaThread's call to MacroAssembler::fast_lock() worked NN10 movptr(Address(r15_thread, JavaThread::dcubed_C2_fast_lock_result_offset()), (int32_t)DCUBED_C2_FAST_LOCK_WORKED); NN11 NN12 bind(MY_DONE_FAILED); NN13 #endif : 1903 } 1904 } The above macroAssembler_x86.cpp: fast_lock() code maps to this code in memory: (trimming off the addresses in brackets and going wide here for annotations after the instructions) (dbx) x 0xfffffd7ff2537b00,0xfffffd7ff2537b84/i // start: C2 barrier/guard code? 0xfffffd7ff2537b00: pushq %rax // save regs for call 0xfffffd7ff2537b01: pushq %rdx // 0xfffffd7ff2537b02: pushq %rcx // 0xfffffd7ff2537b03: call breakpoint // 0xfffffd7ff2537b08: popq %rcx // restore regs after call 0xfffffd7ff2537b09: popq %rdx // 0xfffffd7ff2537b0a: popq %rax // end: C2 barrier/guard code? 
// compare-and-exchange/CAS: // if ((old = 0x0000000000000000(%rbp)) == %rax) { // 0x0000000000000000(%rbp) = %r10; // } // %rax = old; 0xfffffd7ff2537b0b: lock cmpxchgq %r10,0x0000000000000000(%rbp) // cmpxchgptr(%r10, Address(objReg, 0)); // Where did the above cmpxchgq() come from? What's // in %rax for the comparison with object header? // (%rax is fast_lock's tmpReg param) // What's in %r10 for the assignment to the object // header if the 0x0000000000000000(%rbp) == %rax // compare succeeds? // (%r10 is fast_lock's scrReg param) // (fast_lock doesn't expect %rax or %r10 to contain // anything useful on input since they get overwritten) // If the random value in %rax happens to match the // object's header (0x0000000000000000(%rbp)), then // the random value in %r10 will be put into the // object's header. If random value in %r10 happens // to be an ObjectMonitor, then we're going to lock // that ObjectMonitor. // Start update: I found code in // // java.util.stream.Nodes$SizedCollectorTask.compute()V // that jumps to "0xfffffd7ff2537b11: leaq", but I did // not find code that jumps to "0xfffffd7ff2537b0b: // lock cmpxchgq" above. I don't know if the cmpxchgq // above is causing us grief or not. // // End update. 
0xfffffd7ff2537b11: leaq 0x0000000000000040(%rsp),%rbx // set boxReg to BasicLock on local stack // START OF: MacroAssembler::fast_lock() // objReg == %rbp, boxReg == %rbx, tmpReg == %rax, // scrReg == %r10 0xfffffd7ff2537b16: movq $0xffffffffc2464cc2,0x00000000000003e4(%r15) // NNN3: set DCUBED_C2_FAST_LOCK_CALLED in // JavaThread::_dcubed_C2_fast_lock_result // to mark that fast_lock() was called 0xfffffd7ff2537b21: movq 0x0000000000000000(%rbp),%rax // 1727: [FETCH] (object header) 0xfffffd7ff2537b25: testq $0x0000000000000002,%rax // 1728: inflated vs stack-locked|neutral|biased 0xfffffd7ff2537b2b: jne 0xfffffd7ff2537b52 // 1729: if (inflated) then jump 0xfffffd7ff2537b2d: orq $0x0000000000000001,%rax // 1732: 'or' in markOopDesc::unlocked_value 0xfffffd7ff2537b31: movq %rax,(%rbx) // 1733: update BasicLock's saved header // compare-and-exchange/CAS: // if ((old = Address(objReg, 0)) == tmpReg) { // Address(objReg, 0) = boxReg; // } // tmpReg = old; 0xfffffd7ff2537b34: lock cmpxchgq %rbx,0x0000000000000000(%rbp) // 173[5,7]: cmpxchgptr(boxReg, Address(objReg, 0)); // if cmpxchgptr worked we are done 0xfffffd7ff2537b3a: je 0xfffffd7ff2537b65 // 1742: jcc(Assembler::equal, DONE_LABEL); // Stack locked by current thread if difference with // current SP is less than one page. 
0xfffffd7ff2537b40: subq %rsp,%rax // 1747: subptr(tmpReg, rsp); // 1749: andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) 0xfffffd7ff2537b43: andq $0xfffffffffffff007,%rax // LP64_ONLY(7 - os::vm_page_size()))); // always save result in BasicLock's saved header // recursive enter saves a NULL 0xfffffd7ff2537b4a: movq %rax,(%rbx) // 1750: movptr(Address(boxReg, 0), tmpReg); 0xfffffd7ff2537b4d: jmp 0xfffffd7ff2537b65 // 1755: jmp(DONE_LABEL); // 1757: bind(IsInflated); // save ObjectMonitor 0xfffffd7ff2537b52: movq %rax,%r10 // 1876: movq(scrReg, tmpReg); // get ready for NULL _owner CAS 0xfffffd7ff2537b55: xorq %rax,%rax // 1877: xorq(tmpReg, tmpReg); // compare-and-exchange/CAS: // if ((old = Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))) == tmpReg) { // Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG (owner)) = r15_thread; // } // tmpReg = old; 0xfffffd7ff2537b58: lock cmpxchgq %r15,0x000000000000007e(%r10) // 188[0,2]: cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); 0xfffffd7ff2537b5e: movq $0x0000000000000003,(%rbx) // 1885: set BasicLock's saved header to markOopDesc::unused_mark // bind(DONE); 0xfffffd7ff2537b65: jne 0xfffffd7ff2537b72 // NNN8: jccb(Assembler::notZero, MY_DONE_FAILED); 0xfffffd7ff2537b67: movq $0x0000000042424242,0x00000000000003e4(%r15) // NN10: set DCUBED_C2_FAST_LOCK_WORKED in // JavaThread::_dcubed_C2_fast_lock_result // to mark that fast_lock() worked // END OF: MacroAssembler::fast_lock() 0xfffffd7ff2537b72: je 0xfffffd7ff253741f // ZFlag == 1 -> Success // ZFlag == 0 -> Failure - force control through the slow-path 0xfffffd7ff2537b78: leaq 0x0000000000000040(%rsp),%rdx // fetch BasicLock addr on local stack for // complete_monitor_locking_C() call 0xfffffd7ff2537b7d: nop // 0xfffffd7ff2537b7f: call 0xfffffd7feab36ce0 // call complete_monitor_locking_C() 0xfffffd7ff2537b84: jmp 0xfffffd7ff253741f // Start update: The above analysis of the MacroAssembler::fast_lock() code, the prologue code and the 
epilogue code does not reveal any believable smoking gun for this failure mode. The DCUBED_C2_FAST_LOCK_DEBUG debug/tracing code yields similar results to the DCUBED_JME_TRACE/DCUBED_JME_DEBUG debug/tracing code: Sometimes the ZFlag check that controls the call to complete_monitor_locking_C() does not work right. We see evidence of calls to complete_monitor_locking_C() when the fast_lock() code has finished with ZFlag == 1. Just to sanity check the DCUBED_C2_FAST_LOCK_DEBUG debug/tracing code, I've done an experiment where the DCUBED_JME_TRACE/DCUBED_JME_DEBUG debug/tracing is also enabled. The key output: WARNING: errant call to SharedRuntime::complete_monitor_locking_C() after MacroAssembler::fast_lock() worked: _obj=0xfffffd7bfa2154a8, lock=0xfffffd7fbf9d07c0, thread=0x0000000000c58800, dcubed_C2_fast_lock_result=0x42424242 INFO: dcubed_jme_last_trace_points=0x0000000000002862 # # A fatal error has been detected by the Java Runtime Environment: # # Internal Error (sharedRuntime.cpp:1913), pid=18294, tid=103 # fatal error: SharedRuntime::complete_monitor_locking_C() should not be called since MacroAssembler::fast_lock() worked. # # JRE version: Java(TM) SE Runtime Environment (9.0) (build 1.9.0-internal-dcubed_2016_03_18_12_26-b00) The DCUBED_C2_FAST_LOCK_DEBUG output of: WARNING: errant call to SharedRuntime::complete_monitor_locking_C() after MacroAssembler::fast_lock() worked: _obj=0xfffffd7bfa2154a8, lock=0xfffffd7fbf9d07c0, thread=0x0000000000c58800, dcubed_C2_fast_lock_result=0x42424242 # Internal Error (sharedRuntime.cpp:1913), pid=18294, tid=103 # fatal error: SharedRuntime::complete_monitor_locking_C() should not be called since MacroAssembler::fast_lock() worked. shows that SharedRuntime::complete_monitor_locking_C() was called when MacroAssembler::fast_lock() worked. The DCUBED_JME_TRACE output of: INFO: dcubed_jme_last_trace_points=0x0000000000002862 is decoded as follows: // Mark that we came from MacroAssembler::fast_lock(). 
orptr(tracePoints, 0x00000002); // Record that we didn't take the force slow-path branch orptr(tracePoints, 0x00000020); // Record that biased_locking_enter() didn't take the 'DONE' label. orptr(tracePoints, 0x00000040); // Record that we're in the inflated block orptr(tracePoints, 0x00000800); // Record that we returned success from fast_lock orptr(tracePoints, 0x00002000); And it confirms the fast_lock() binary code path that we decoded in gory detail above. Of course, I'm having a serious problem believing that the ZFlag value check done by the 'je' instruction is broken, but we keep coming back to that conclusion. End update: