1. JMH data on AArch64

- The foreach hot loop compiles into 7 instructions.
- The indexed hot loop compiles into 6 instructions.
- The difference is a single extra MOV, which hardly explains a performance gap of this size
  (~0.51 ns/op for foreach vs ~0.30 ns/op for indexed in the measurements below).

  @Benchmark
  @OperationsPerInvocation(SIZE)
  @CompilerControl(CompilerControl.Mode.DONT_INLINE)
  public void list_foreach(Blackhole bh) {
      for (Object o : list) {
          bh.consume(o);
      }
  }

  Code:
     0: aload_0
     1: getfield        #15     // Field list:Ljava/util/ArrayList;
     4: invokevirtual   #23     // Method java/util/ArrayList.iterator:()Ljava/util/Iterator;
     7: astore_2
     8: aload_2
     9: invokeinterface #27, 1  // InterfaceMethod java/util/Iterator.hasNext:()Z
    14: ifeq            32
    17: aload_2
    18: invokeinterface #33, 1  // InterfaceMethod java/util/Iterator.next:()Ljava/lang/Object;
    23: astore_3
    24: aload_1
    25: aload_3
    26: invokevirtual   #37     // Method org/openjdk/jmh/infra/Blackhole.consume:(Ljava/lang/Object;)V
    29: goto            8
    32: return

  @Benchmark
  @OperationsPerInvocation(SIZE)
  @CompilerControl(CompilerControl.Mode.DONT_INLINE)
  public void list_indexed(Blackhole bh) {
      for (int i = 0; i < list.size(); i++) {
          bh.consume(list.get(i));
      }
  }

  Code:
     0: iconst_0
     1: istore_2
     2: iload_2
     3: aload_0
     4: getfield        #15     // Field list:Ljava/util/ArrayList;
     7: invokevirtual   #43     // Method java/util/ArrayList.size:()I
    10: if_icmpge       31
    13: aload_1
    14: aload_0
    15: getfield        #15     // Field list:Ljava/util/ArrayList;
    18: iload_2
    19: invokevirtual   #47     // Method java/util/ArrayList.get:(I)Ljava/lang/Object;
    22: invokevirtual   #37     // Method org/openjdk/jmh/infra/Blackhole.consume:(Ljava/lang/Object;)V
    25: iinc            2, 1
    28: goto            2
    31: return

  ....[Hottest Region 1]..............................................................................
  C2, level 4, org.openjdk.bench.java.util.ArrayListIterate::list_foreach

  > mov  w14, w11                  ; index: w11 -> w14
    add  x11, x10, w14, sxtw #2    ; address = array base + index*4
    ldr  w11, [x11, #0x10]         ; load list[index]
    lsl  x11, x11, #3              ;*invokestatic consumeCompiler Blackhole::consume@7
    add  w11, w14, #0x1            ; index++
    cmp  w11, w12                  ; index < length
    b.lt #-0x18

  Iteration 1: 0.514 ns/op
  Iteration 2: 0.519 ns/op
  Iteration 3: 0.526 ns/op
  Iteration 4: 0.527 ns/op
  Iteration 5: 0.507 ns/op

  ....[Hottest Region 1]..............................................................................
  C2, level 4, org.openjdk.bench.java.util.ArrayListIterate::list_indexed

    add  x13, x11, w12, sxtw #2    ; address = array base + index*4
    ldr  w13, [x13, #0x10]         ; load list[index]
    add  w12, w12, #0x1            ; index++ (does not wait for the load)
    lsl  x13, x13, #3              ;*invokestatic consumeCompiler Blackhole::consume@7
    cmp  w12, w10                  ; index < length
    b.lt #-0x14

  Iteration 1: 0.302 ns/op
  Iteration 2: 0.303 ns/op
  Iteration 3: 0.305 ns/op
  Iteration 4: 0.303 ns/op
  Iteration 5: 0.302 ns/op
  ....................................................................................................

2. Experiment (C++/AArch64, Apple M4)

Reproduced the foreach-style iteration in a standalone microbenchmark (full listing below;
the two variants are the two #if branches inside the asm block).

Result:
- The slowdown is not due to the extra MOV.
- The root cause must be a longer loop-carried dependency chain:
  - Fast (indexed-like): add -> cmp -> b.lt (independent), in parallel with add -> ldr -> lsl.
  - Slow (foreach-like): mov -> add -> ldr -> lsl -> add -> cmp -> b.lt (the branch depends on the load).
- Keeping the same instruction sequence but dedicating a separate register to the address/load:
  - decouples the branch from the load, and
  - speeds up the foreach-like variant from ~0.482 ns/iter to ~0.303 ns/iter (~40% faster).
// g++ -O3 aarch64_asm_loop_bench.cpp -o bench ; ./bench
#include <cstdint>
#include <cstdio>
#include <chrono>
#include <vector>

int main() {
    constexpr uint32_t N = 1'048'576;
    std::vector<uint32_t> arr(N);   // 4-byte elements, matching the compressed-oops layout above

    for (int i = 0; i < 3; i++) {
        auto t0 = std::chrono::high_resolution_clock::now();
        int REPS = 1000;
        for (int r = 0; r < REPS; r++) {
            asm volatile(
                "mov x10, %[base]   \n"
                "mov w12, %w[limit] \n"
                "mov w11, wzr       \n"
                "1:                 \n"
#if 0   // Apple M4: AVG: 0.482 ns/iter (foreach-like register allocation)
                " mov  w14, w11               \n"  // w14: temporary holder for the index (w11)
                " add  x11, x10, w14, sxtw #2 \n"  // x11: element address
                " ldr  w11, [x11, #0x10]      \n"  // w11: element value
                " lsl  x11, x11, #3           \n"  // blackhole
                " add  w11, w14, #1           \n"  // w11: back to holding the index
                " cmp  w11, w12               \n"
                " b.lt 1b                     \n"
#endif
#if 1   // Apple M4: AVG: 0.303 ns/iter (dedicated register for the address/load)
                " mov  w14, w11               \n"
                " add  x14, x10, w11, sxtw #2 \n"  // same sequence, but w11 holds only the index
                " ldr  w14, [x14, #0x10]      \n"  // while x14 is dedicated to the address/load
                " lsl  x14, x14, #3           \n"
                " add  w11, w11, #1           \n"
                " cmp  w11, w12               \n"
                " b.lt 1b                     \n"
#endif
                :
                : [base] "r"(reinterpret_cast<uint64_t>(arr.data())), [limit] "r"(N)
                : "x10", "x11", "x12", "x14", "memory", "cc");
        }
        auto t1 = std::chrono::high_resolution_clock::now();
        double ns = std::chrono::duration<double, std::nano>(t1 - t0).count() / N / REPS;
        std::printf("AVG: %.3f ns/iter \n", ns);
    }
    return 0;
}
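As an additional check (a suggestion only, not something measured here), one could add a third #if
branch to the same asm block that drops the dead "mov w14, w11" from the fast variant, giving exactly
the 6-instruction indexed loop from section 1. If the dependency-chain explanation is right, this
variant should run at roughly the same speed as the second one, which would confirm that the extra
MOV by itself is not what makes the foreach loop slower.

#if 0   // hypothetical third variant (not measured): indexed-like, no MOV at all (6 instructions)
                " add  x14, x10, w11, sxtw #2 \n"  // element address in x14
                " ldr  w14, [x14, #0x10]      \n"  // element value in w14
                " lsl  x14, x14, #3           \n"  // blackhole
                " add  w11, w11, #1           \n"  // index++ (never waits for the load)
                " cmp  w11, w12               \n"
                " b.lt 1b                     \n"
#endif

If this matches the second variant's ~0.303 ns/iter, the cost is attributable to the register reuse
that chains the index/branch onto the load, not to the instruction count.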