// java --add-modules=jdk.incubator.vector -Xbatch -XX:CompileCommand=compileonly,Test*::test -XX:CompileCommand=printcompilation,Test*::test -XX:+PrintIdeal TestOptimizeLoadVector.java

import jdk.incubator.vector.VectorSpecies;
import jdk.incubator.vector.IntVector;

// Stand-alone reproducer for a missed C2 optimization on vector loads: a vector
// LOAD that reads back exactly the location a preceding vector STORE has just
// written should be replaced by the stored value. The result is meant to be
// inspected in the ideal graph printed for test() (see the command line at the
// top of this file); the program itself performs no checks on the array.
public class TestOptimizeLoadVector {

    // Vector shape used by every load and store in test(): 256-bit int vectors.
    // The comment in test() reports the missed optimization for this size on
    // aarch64 SVE.
    static final VectorSpecies<Integer> SPECIES =
                IntVector.SPECIES_256;

    // Copies the first vector-sized chunk of a into the three following chunks.
    //
    // a: array to read from and write to. The last store below writes
    //    SPECIES.length() elements starting at index 3 * SPECIES.length(), so a
    //    must hold at least 4 * SPECIES.length() elements.
    //
    // NOTE: the exact sequence of vector operations below is the subject of this
    // reproducer - do not reorder or restructure it.
    static void test(int[] a) {
        // The LOAD below can be optimized away, and be replaced by the value of v1:
        // LoadVectorNode::Ideal calls LoadNode::Ideal, which looks at the memory
        // input and skips any independent stores, finding a store that matches the
        // exact location. And this store stores the value of v1, so we can replace
        // the LOAD, and just use v1 directly. Hence, the example below should have
        // only a single load, and 3 stores.
        // HOWEVER: if we somehow exit too early in LoadVectorNode::Ideal, we may
        // never reach LoadNode::Ideal and miss the optimization.
        // This happens on aarch64 SVE with 256bits, when we return true for
        // Matcher::vector_needs_partial_operations, but then do nothing when calling
        // VectorNode::try_to_gen_masked_vector. We just return nullptr instantly,
        // rather than trying the other optimizations that LoadNode::Ideal has to
        // offer.
        IntVector v1 = IntVector.fromArray(SPECIES, a, 0 * SPECIES.length()); // the only load that should remain
        v1.intoArray(a, 1 * SPECIES.length()); // STORE of v1
        v1.intoArray(a, 2 * SPECIES.length()); // independent STORE - no overlap with STORE above and LOAD below.
        IntVector v2 = IntVector.fromArray(SPECIES, a, 1 * SPECIES.length()); // LOAD - is it replaced with v1?
        v2.intoArray(a, 3 * SPECIES.length()); // STORE of v2, i.e. of v1 if the LOAD above was optimized away
    }

    // Driver: calls test() 10_000 times on one zero-initialized array of 1000 ints.
    // The repetition is presumably there to make test() hot enough to be JIT
    // compiled; the command line at the top of this file restricts compilation
    // to Test*::test and prints its ideal graph.
    public static void main(String[] args) {
        int[] a = new int[1000];
        for (int i = 0; i < 10_000; i++) {
            test(a);
	}
    }
}
