1. JMH data on AArch64

- The foreach hot loop compiles into 7 instructions.
- The indexed hot loop compiles into 6 instructions.
- The difference is a single extra MOV, which hardly explains a performance gap of this size
  (~0.51 ns/op for foreach vs ~0.30 ns/op for indexed in the measurements below).

  @Benchmark
  @OperationsPerInvocation(SIZE)
  @CompilerControl(CompilerControl.Mode.DONT_INLINE)
  public void list_foreach(Blackhole bh) {
      for (Object o : list) {
          bh.consume(o);
      }
  }

  Code:
     0: aload_0
     1: getfield        #15     // Field list:Ljava/util/ArrayList;
     4: invokevirtual   #23     // Method java/util/ArrayList.iterator:()Ljava/util/Iterator;
     7: astore_2
     8: aload_2
     9: invokeinterface #27, 1  // InterfaceMethod java/util/Iterator.hasNext:()Z
    14: ifeq            32
    17: aload_2
    18: invokeinterface #33, 1  // InterfaceMethod java/util/Iterator.next:()Ljava/lang/Object;
    23: astore_3
    24: aload_1
    25: aload_3
    26: invokevirtual   #37     // Method org/openjdk/jmh/infra/Blackhole.consume:(Ljava/lang/Object;)V
    29: goto            8
    32: return

  @Benchmark
  @OperationsPerInvocation(SIZE)
  @CompilerControl(CompilerControl.Mode.DONT_INLINE)
  public void list_indexed(Blackhole bh) {
      for (int i = 0; i < list.size(); i++) {
          bh.consume(list.get(i));
      }
  }

  Code:
     0: iconst_0
     1: istore_2
     2: iload_2
     3: aload_0
     4: getfield        #15     // Field list:Ljava/util/ArrayList;
     7: invokevirtual   #43     // Method java/util/ArrayList.size:()I
    10: if_icmpge       31
    13: aload_1
    14: aload_0
    15: getfield        #15     // Field list:Ljava/util/ArrayList;
    18: iload_2
    19: invokevirtual   #47     // Method java/util/ArrayList.get:(I)Ljava/lang/Object;
    22: invokevirtual   #37     // Method org/openjdk/jmh/infra/Blackhole.consume:(Ljava/lang/Object;)V
    25: iinc            2, 1
    28: goto            2
    31: return

  ....[Hottest Region 1]..............................................................................
  C2, level 4, org.openjdk.bench.java.util.ArrayListIterate::list_foreach

  > mov  w14, w11                  ; index: w11 -> w14
    add  x11, x10, w14, sxtw #2    ; address = array base + index*4
    ldr  w11, [x11, #0x10]         ; load list[index]
    lsl  x11, x11, #3              ;*invokestatic consumeCompiler Blackhole::consume@7
    add  w11, w14, #0x1            ; index++
    cmp  w11, w12                  ; index < length
    b.lt #-0x18

  Iteration 1: 0.514 ns/op
  Iteration 2: 0.519 ns/op
  Iteration 3: 0.526 ns/op
  Iteration 4: 0.527 ns/op
  Iteration 5: 0.507 ns/op

  ....[Hottest Region 1]..............................................................................
  C2, level 4, org.openjdk.bench.java.util.ArrayListIterate::list_indexed

    add  x13, x11, w12, sxtw #2    ; address = array base + index*4
    ldr  w13, [x13, #0x10]         ; load list[index]
    add  w12, w12, #0x1            ; index++ (does not wait for the load)
    lsl  x13, x13, #3              ;*invokestatic consumeCompiler Blackhole::consume@7
    cmp  w12, w10                  ; index < length
    b.lt #-0x14

  Iteration 1: 0.302 ns/op
  Iteration 2: 0.303 ns/op
  Iteration 3: 0.305 ns/op
  Iteration 4: 0.303 ns/op
  Iteration 5: 0.302 ns/op
  ....................................................................................................

2. Experiment (C++/AArch64, Apple M4)

Reproduced the foreach-style iteration in a standalone microbenchmark (full listing below;
the two variants are the two #if branches inside the asm block).

Result:
- The slowdown is not due to the extra MOV.
- The root cause must be a longer loop-carried dependency chain:
  - Fast (indexed-like): add -> cmp -> b.lt (independent), in parallel with add -> ldr -> lsl.
  - Slow (foreach-like): mov -> add -> ldr -> lsl -> add -> cmp -> b.lt (the branch depends on the load).
- Keeping the same instruction sequence but dedicating a separate register to the address/load:
  - decouples the branch from the load, and
  - speeds up the foreach-like variant from ~0.482 ns/iter to ~0.303 ns/iter (~40% faster).
// g++ -O3 aarch64_asm_loop_bench.cpp -o bench ; ./bench
#include <cstdint>
#include <cstdio>
#include <chrono>
#include <vector>

int main() {
    constexpr uint32_t N = 1'048'576;
    std::vector<uint32_t> arr(N);   // 4-byte elements, matching the compressed-oops layout above

    for (int i = 0; i < 3; i++) {
        auto t0 = std::chrono::high_resolution_clock::now();
        int REPS = 1000;
        for (int r = 0; r < REPS; r++) {
            asm volatile(
                "mov x10, %[base]   \n"
                "mov w12, %w[limit] \n"
                "mov w11, wzr       \n"
                "1:                 \n"
#if 0   // Apple M4: AVG: 0.482 ns/iter (foreach-like register allocation)
                " mov  w14, w11               \n"  // w14: temporary holder for the index (w11)
                " add  x11, x10, w14, sxtw #2 \n"  // x11: element address
                " ldr  w11, [x11, #0x10]      \n"  // w11: element value
                " lsl  x11, x11, #3           \n"  // blackhole
                " add  w11, w14, #1           \n"  // w11: back to holding the index
                " cmp  w11, w12               \n"
                " b.lt 1b                     \n"
#endif
#if 1   // Apple M4: AVG: 0.303 ns/iter (dedicated register for the address/load)
                " mov  w14, w11               \n"
                " add  x14, x10, w11, sxtw #2 \n"  // same sequence, but w11 holds only the index
                " ldr  w14, [x14, #0x10]      \n"  // while x14 is dedicated to the address/load
                " lsl  x14, x14, #3           \n"
                " add  w11, w11, #1           \n"
                " cmp  w11, w12               \n"
                " b.lt 1b                     \n"
#endif
                :
                : [base] "r"(reinterpret_cast<uint64_t>(arr.data())), [limit] "r"(N)
                : "x10", "x11", "x12", "x14", "memory", "cc");
        }
        auto t1 = std::chrono::high_resolution_clock::now();
        double ns = std::chrono::duration<double, std::nano>(t1 - t0).count() / N / REPS;
        std::printf("AVG: %.3f ns/iter \n", ns);
    }
    return 0;
}
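As an additional check (a suggestion only, not something measured here), one could add a third #if
branch to the same asm block that drops the dead "mov w14, w11" from the fast variant, giving exactly
the 6-instruction indexed loop from section 1. If the dependency-chain explanation is right, this
variant should run at roughly the same speed as the second one, which would confirm that the extra
MOV by itself is not what makes the foreach loop slower.

#if 0   // hypothetical third variant (not measured): indexed-like, no MOV at all (6 instructions)
                " add  x14, x10, w11, sxtw #2 \n"  // element address in x14
                " ldr  w14, [x14, #0x10]      \n"  // element value in w14
                " lsl  x14, x14, #3           \n"  // blackhole
                " add  w11, w11, #1           \n"  // index++ (never waits for the load)
                " cmp  w11, w12               \n"
                " b.lt 1b                     \n"
#endif

If this matches the second variant's ~0.303 ns/iter, the cost is attributable to the register reuse
that chains the index/branch onto the load, not to the instruction count.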