package org.openjdk;

import org.openjdk.jmh.annotations.*;
import org.openjdk.jmh.infra.Blackhole;

import java.util.ArrayList;
import java.util.concurrent.TimeUnit;

@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Warmup(iterations = 3, time = 300, timeUnit = TimeUnit.MILLISECONDS)
@Measurement(iterations = 5, time = 300, timeUnit = TimeUnit.MILLISECONDS)
@Fork(value = 1, jvmArgsAppend = {"-XX:+UseSerialGC", "-XX:+UnlockDiagnosticVMOptions", "-XX:LoopMaxUnroll=1", "-Xmx1g"})
@State(Scope.Benchmark)
public class ArrayListIterate {

    private static final int SIZE = 1048576;

    private ArrayList<Object> list;

    @Setup
    public void setup() {
        list = new ArrayList<>(SIZE);
        for (int i = 0; i < SIZE; i++) {
            list.add(new Object());
        }
    }

    @Benchmark
    @OperationsPerInvocation(SIZE)
    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
    public void list_foreach(Blackhole bh) {
        for (Object o : list) {
            bh.consume(o);
        }
    }

    @Benchmark
    @OperationsPerInvocation(SIZE)
    @CompilerControl(CompilerControl.Mode.DONT_INLINE)
    public void list_indexed(Blackhole bh) {
        for (int i = 0; i < list.size(); i++) {
            bh.consume(list.get(i));
        }
    }

    /*
        On Ryzen 5950X:

        Benchmark                      Mode  Cnt    Score   Error  Units
        ArrayListIterate.list_foreach  avgt    5    0.596 ± 0.003  ns/op ; ~2x slower
        ArrayListIterate.list_indexed  avgt    5    0.298 ± 0.002  ns/op ;

        -prof perfasm shows this:

        Hot loop for list_foreach():
                      0x00007271fc016f..:   mov    $0x1,%r11d
                      0x00007271fc016f..:   cmp    $0x1,%ebx
                  ╭   0x00007271fc016...:   jg     0x00007271fc016fb3
                  |                         ...
                  │↗  0x00007271fc016fb0:   mov    %r9d,%r11d                  ;*getfield cursor {reexecute=0 rethrow=0 return_oop=0}
                  ↘│  0x00007271fc016fb3:   mov    0x10(%rdi,%r11,4),%esi      ; get list.elementData[idx]
          85.63%   │  0x00007271fc016fb8:   mov    %rsi,%r9                    ; "unpack" compressed pointer and blackhole it
                   │  0x00007271fc016fbb:   lea    0x1(%r11),%r9d              ; idx++
                   │  0x00007271fc016fbf:   cmp    %ebx,%r9d                   ; loop check
                   ╰  0x00007271fc016fc2:   jl     0x00007271fc016fb0
                      0x00007271fc016fc4: (hot loop: 20 bytes)

        Hot loop for list_indexed():
                  ↗  0x000070b1cc017770:   mov    0x10(%rbx,%r10,4),%r11d      ; get list.elementData[idx] and blackhole it
          86.42%  │  0x000070b1cc017775:   inc    %r10d                        ; idx++
           0.07%  │  0x000070b1cc017778:   cmp    %r9d,%r10d                   ; loop check
                  ╰  0x000070b1cc01777b:   jl     0x000070b1cc017770
                     0x000070b1cc01777d: (hot loop: 13 bytes)

        Looks like the ArrayList$Itr checks that enter the hot loop through different point force us to
        juggle more registers, which generates more code.

        -prof perfnorm shows:

            Benchmark                                              Mode  Cnt   Score    Error      Units
            ArrayListIterate.list_foreach                          avgt   25   0.603 ±  0.002      ns/op
            ArrayListIterate.list_foreach:CPI                      avgt    5   0.338 ±  0.005  clks/insn
            ArrayListIterate.list_foreach:IPC                      avgt    5   2.959 ±  0.041  insns/clk
            ArrayListIterate.list_foreach:cycles                   avgt    5   2.032 ±  0.019       #/op
            ArrayListIterate.list_foreach:instructions             avgt    5   6.013 ±  0.062       #/op

            ArrayListIterate.list_indexed                          avgt   25   0.304 ±  0.002      ns/op
            ArrayListIterate.list_indexed:CPI                      avgt    5   0.256 ±  0.008  clks/insn
            ArrayListIterate.list_indexed:IPC                      avgt    5   3.912 ±  0.122  insns/clk
            ArrayListIterate.list_indexed:cycles                   avgt    5   1.024 ±  0.031       #/op
            ArrayListIterate.list_indexed:instructions             avgt    5   4.006 ±  0.011       #/op

        That is, executing 13-byte loop with 1 cycle per iteration, and 20-byte loop with 2 cycles per iteration.
        It makes me suspect that OptoLoopAlignment is not enough: https://bugs.openjdk.org/browse/JDK-8281586.
        Bumping -XX:OptoLoopAlignment=32 makes the tests perform the same:

            Benchmark                      Mode  Cnt  Score    Error  Units
            ArrayListIterate.list_foreach  avgt   25  0.299 ±  0.002  ns/op
            ArrayListIterate.list_indexed  avgt   25  0.299 ±  0.001  ns/op

        Still, the generated code inefficiency is what gets us into this situation.

     */


}