import sys, os
import random

# Default for PlatformIRRule.generate_negative: when True, each rule also
# emits "failOn" @IR rules for the inverted constraints. Individual rules can
# still turn this off via PlatformIRRule.disable_negative_rules().
GENERATE_NEGATIVE_IR_RULES = False

# License header; Generator.generate() writes it verbatim as the first entry
# of the generated Java file.
COPYRIGHT = \
"""/*
 * Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */"""


# This class is used to simplify the IR rule constraints
class IRRange:
    """Closed integer interval [lo, hi] used to simplify IR rule constraints.

    A bound of ``None`` means "unbounded on that side"; a range with both
    bounds ``None`` is the full integer range (see ``is_int``). A range whose
    lower bound exceeds its upper bound is empty (see ``is_empty``).
    """

    def __init__(self, lo, hi):
        self.lo = lo
        self.hi = hi

    @staticmethod
    def lower_bound(lo):
        """Return the range ``[lo, +inf)``."""
        return IRRange(lo, None)

    @staticmethod
    def upper_bound(hi):
        """Return the range ``(-inf, hi]``."""
        return IRRange(None, hi)

    def __str__(self):
        if self.is_int():
            return "int"
        lo = "[" if self.lo is None else f"[{self.lo}"
        hi = "]" if self.hi is None else f"{self.hi}]"
        return f"{lo}..{hi}"

    def __repr__(self):
        return str(self)

    def intersect(self, other):
        """Return a new range holding the values contained in both ranges."""
        # For each side: an unbounded side yields to the other range's bound,
        # otherwise the tighter of the two bounds wins.
        if self.lo is None:
            lo = other.lo
        elif other.lo is None:
            lo = self.lo
        else:
            lo = max(self.lo, other.lo)
        if self.hi is None:
            hi = other.hi
        elif other.hi is None:
            hi = self.hi
        else:
            hi = min(self.hi, other.hi)
        return IRRange(lo, hi)

    def is_empty(self):
        """Return True if both bounds are set and lo > hi."""
        return self.lo is not None and self.hi is not None and self.lo > self.hi

    def is_int(self):
        """Return True if the range is unbounded on both sides."""
        return self.lo is None and self.hi is None

    def complement(self):
        """Return the complement as a list of ranges.

        The list is empty for the full integer range, holds the full integer
        range for an empty range, and otherwise holds one range per bounded
        side (everything below ``lo``, everything above ``hi``).
        """
        if self.is_empty():
            # Always return a list so callers can iterate the result.
            return [IRRange(None, None)]
        parts = []
        if self.lo is not None:
            parts.append(IRRange.upper_bound(self.lo - 1))
        if self.hi is not None:
            parts.append(IRRange.lower_bound(self.hi + 1))
        return parts

    def get_constraints(self):
        """Return the bounds as condition strings such as ``">= 4"``.

        Must only be called on a range that is neither empty nor the full
        integer range.
        """
        assert not self.is_empty() and not self.is_int()
        constraints = []
        if self.lo is not None:
            constraints.append(f">= {self.lo}")
        if self.hi is not None:
            constraints.append(f"<= {self.hi}")
        return constraints

class IRBool:
    """Set of allowed boolean flag values, with the same interface as IRRange.

    ``t`` says whether ``true`` is allowed, ``f`` whether ``false`` is allowed.
    """

    def __init__(self, t, f):
        self.t = t
        self.f = f

    @staticmethod
    def makeTrue():
        """Return the set that allows only ``true``."""
        return IRBool(True, False)

    @staticmethod
    def makeFalse():
        """Return the set that allows only ``false``."""
        return IRBool(False, True)

    def __str__(self):
        allowed = []
        if self.t:
            allowed.append("true")
        if self.f:
            allowed.append("false")
        return "[" + ", ".join(allowed) + "]"

    def __repr__(self):
        return str(self)

    def intersect(self, other):
        """Return the values allowed by both sets."""
        return IRBool(self.t and other.t, self.f and other.f)

    def is_empty(self):
        """Return True if neither value is allowed."""
        return (not self.t and not self.f)

    def is_int(self):
        """Return a truthy value if both values are allowed (unconstrained).

        The name mirrors IRRange.is_int so both classes can be used alike.
        """
        return (self.t and self.f)

    def complement(self):
        """Return a one-element list holding the set of values not allowed here."""
        return [IRBool(not self.t, not self.f)]

    def get_constraints(self):
        """Return the allowed value as a condition string (``"true"``/``"false"``).

        Must only be called on a set that is neither empty nor unconstrained.
        """
        assert not self.is_empty() and not self.is_int()
        constraints = []
        if self.t:
            constraints.append("true")
        if self.f:
            constraints.append("false")
        return constraints

#r1 = IRRange.lower_bound(4)
#print(r1)
#r2 = IRRange.upper_bound(100)
#print(r2)
#print(r1.intersect(r2))
#print(r2.intersect(r1))
#r3 = IRRange.lower_bound(50)
#print(r3)
#print(r1.intersect(r3))
#print(r3.intersect(r1))
#
#print(r1.complement())
#print(r2.complement())
#print(r3.complement())
#print(r1.intersect(r2).complement())
#
#print([r1, r2, r3])
#
#r4 = IRRange.lower_bound(70)
#r5 = IRRange.upper_bound(30)
#print(r4.intersect(r5).complement())
#print(r4.intersect(r5).complement())

#b1 = IRBool(False, False)
#b2 = IRBool(True, False)
#b3 = IRBool(False, True)
#b4 = IRBool(True, True)
#print(b1, b2, b3, b4)
#print(b1.complement())
#print(b2.complement())
#print(b3.complement())
#print(b4.complement())
#print("i", b1.intersect(b2))
#print("i", b4.intersect(b2))
#print("i", b3.intersect(b4))
#print("i", b4.intersect(b3))

def pow2_factor(i):
    """Return the largest power of two that divides the integer ``i``.

    Zero and negative inputs are not analysed; 1 is returned for them.
    """
    if i <= 0:
        return 1
    # For a positive integer, i & -i isolates the lowest set bit, which is
    # exactly the largest power of two dividing i.
    return i & -i

# Self-check of pow2_factor, executed every time this module is loaded:
# for each i in [0, 1000) the result must be 1 or an even number, and it must
# divide i. Note that the loop variables i and f remain as module globals.
for i in range(1000):
    f = pow2_factor(i)
    assert f == 1 or f % 2 == 0
    assert i % f == 0

class Platform:
    """A CPU configuration for which IR rules are generated.

    Attributes:
        name: human-readable label, used in generated comments.
        cpu_features: flat list of feature-name / "true"|"false" string pairs.
        vector_width: vector width used to derive the IR rule constraints.
    """

    def __init__(self, name, cpu_features, vector_width):
        self.name, self.cpu_features, self.vector_width = (
            name,
            cpu_features,
            vector_width,
        )

class PlatformIRRule:
    """Collects nodes and flag constraints and renders them as @IR annotations.

    A rule is tied to one platform (its CPU features are added to every
    annotation). "Pre-constraints" apply to the positive and to the negative
    rules unchanged; ordinary constraints are inverted for the negative rules.
    Constraint values are IRRange / IRBool style objects providing
    ``intersect``, ``is_empty``, ``is_int``, ``complement`` and
    ``get_constraints``.
    """

    def __init__(self, platform):
        self.platform = platform
        self.nodes = []
        self.pre_constraints = []
        self.constraints = []
        self.generate_negative = GENERATE_NEGATIVE_IR_RULES
        self.positive_forbids_nodes = False

    def disable_negative_rules(self):
        """Do not emit negative rules for this rule."""
        self.generate_negative = False

    def expect_no_nodes(self):
        """Make the positive rule a ``failOn`` rule instead of a ``counts`` rule."""
        self.positive_forbids_nodes = True

    def add_node(self, node):
        """Add an IR node expression (e.g. ``"IRNode.LOAD_VECTOR"``)."""
        self.nodes.append(node)

    def add_constraint(self, variable, ir_range):
        """Add a constraint that is inverted for the negative rules."""
        self.constraints.append((variable, ir_range))

    def add_pre_constraint(self, variable, ir_range):
        """Add a constraint applied as-is to positive and negative rules."""
        self.pre_constraints.append((variable, ir_range))

    def simplify(self, constraints):
        """Merge constraints so that each variable has exactly one range.

        Returns a list of ``(variable, range)`` pairs in first-seen order.
        """
        var_range = {}
        for v, r in constraints:
            if v in var_range:
                r = r.intersect(var_range[v])
            var_range[v] = r
        return list(var_range.items())

    def positive_constraints(self):
        """Return the condition pairs of the positive rule, or None if impossible."""
        return self.collect_constraints(self.pre_constraints + self.constraints)

    def collect_constraints(self, constraints):
        """Turn constraints into ``(variable, condition)`` string pairs.

        Returns None if any variable ends up with an empty range; variables
        that are unconstrained contribute nothing.
        """
        collected = []
        for v, r in self.simplify(constraints):
            if r.is_empty():
                return None  # impossible rule
            if not r.is_int():
                for cc in r.get_constraints():
                    collected.append((v, cc))  # non-trivial constraint
        return collected

    def negative_constraints(self):
        """Return one condition list per negative rule.

        Every returned list is the AND of the pre-constraints plus one
        inverted constraint; the individual lists are alternatives (OR).
        An empty result means that no negative rule can be stated.
        """
        and_list = self.collect_constraints(self.pre_constraints)
        if and_list is None:
            # The pre-constraints contradict each other: nothing can apply.
            return []
        or_list = []
        for v, r in self.simplify(self.constraints):
            if r.is_empty():
                # positive impossible -> negative only needs the pre-constraints
                return [and_list]
            if not r.is_int():
                for comp in r.complement():
                    for c in comp.get_constraints():
                        or_list.append((v, c))
        # No inverted condition at all -> the negative rule is impossible.
        return [and_list + [oo] for oo in or_list]

    def _emit_rule(self, lines, header, conditions):
        """Append one complete @IR annotation to ``lines``.

        ``header`` is the opening part without trailing punctuation. All parts
        but the last are terminated with ``,`` and the last one with ``)``, so
        the annotation is closed even when there are no CPU features.
        """
        parts = [header]
        if conditions:
            is_and = "And" if len(conditions) > 1 else ""
            joined = ", ".join(f'"{a1}", "{a2}"' for (a1, a2) in conditions)
            parts.append(f"        applyIf{is_and} = {{{joined}}}")
        cpu = [f'"{c}"' for c in self.platform.cpu_features]
        if cpu:
            # cpu_features is a flat name/value list, so more than two entries
            # means more than one feature condition.
            is_and = "And" if len(cpu) > 2 else ""
            joined = ", ".join(cpu)
            parts.append(f"        applyIfCPUFeature{is_and} = {{{joined}}}")
        for part in parts[:-1]:
            lines.append(part + ",")
        lines.append(parts[-1] + ")")

    def generate(self, lines):
        """Append the positive and, if enabled, the negative @IR rules to ``lines``."""
        # positive / And
        positive_c = self.positive_constraints()
        if positive_c is None:
            lines.append("    //   No positive IR rule: conditions impossible.")
        else:
            if self.positive_forbids_nodes:
                ns = ", ".join(self.nodes)
                header = f"    @IR(failOn = {{{ns}}}"
            else:
                ns = ", ".join([f'{n}, "> 0"' for n in self.nodes])
                header = f"    @IR(counts = {{{ns}}}"
            self._emit_rule(lines, header, positive_c)

        if self.generate_negative:
            # negative / Or
            negative_list = self.negative_constraints()
            ns = ", ".join(self.nodes)
            for negative_and in negative_list:
                self._emit_rule(lines, f"    @IR(failOn = {{{ns}}}", negative_and)
            if not negative_list:
                lines.append("    //   No negative IR rule: conditions impossible.")

class Type:
    """A Java element type for which test cases are generated.

    Attributes:
        name: Java type name (e.g. ``"int"``).
        size: element size; multiplied with an offset to get a byte offset.
        factor: constant operand used in the generated loop body.
        operator: Java binary operator used in the generated loop body.
        ir_op: IRNode constant name expected for the vectorized operation.
    """

    def __init__(self, name, size, factor, operator, ir_op):
        self.name = name
        self.size = size
        self.factor = factor
        self.operator = operator
        self.ir_op = ir_op

    def platforms(self):
        """Return the list of Platform objects to generate IR rules for.

        The x86 platforms depend on the type; the asimd platform is appended
        for every type.

        Raises:
            AssertionError: if the type name is not one of the handled types.
        """
        if self.name in ["byte", "char", "short"]:
            x86 = [
                ("sse4.1 to avx", ["sse4.1", "true", "avx2", "false"], 16),
                ("avx2 to avx512 without avx512bw", ["avx2", "true", "avx512bw", "false"], 32),
                ("avx512bw", ["avx512bw", "true"], 64),
            ]
        elif self.name in ["float", "double"]:
            x86 = [
                ("sse4.1", ["sse4.1", "true", "avx", "false"], 16),
                ("avx and avx2", ["avx", "true", "avx512", "false"], 32),
                ("avx512", ["avx512", "true"], 64),
            ]
        elif self.name in ["int", "long"]:
            x86 = [
                ("sse4.1 to avx", ["sse4.1", "true", "avx2", "false"], 16),
                ("avx2", ["avx2", "true", "avx512", "false"], 32),
                ("avx512", ["avx512", "true"], 64),
            ]
        else:
            # Raised explicitly (not via ``assert``) so the check also holds
            # when Python runs with -O.
            raise AssertionError(f"type not implemented: {self.name}")
        specs = x86 + [("asimd", ["asimd", "true"], 32)]
        return [Platform(name, features, width) for name, features, width in specs]

class Test:
    """One generated test case: an element type combined with an access offset.

    The case name is the capitalized type name, followed by ``M`` for a
    negative or ``P`` for a non-negative offset, followed by the absolute
    offset (e.g. ``IntM3``). The helper methods derive the Java identifiers.
    """

    def __init__(self, t, offset):
        sign = "M" if offset < 0 else "P"
        self.name = f"{t.name.capitalize()}{sign}{abs(offset)}"
        self.t = t
        self.offset = offset

    def gold(self):
        """Name of the Java array holding the reference result."""
        return "gold" + self.name

    def run(self):
        """Name of the Java @Run method."""
        return "run" + self.name

    def test(self):
        """Name of the Java @Test method."""
        return "test" + self.name

class TestScenario:
    """A named jtreg scenario: VM flags to add plus @requires expressions."""

    def __init__(self, name, flags, requires):
        self.name = name
        self.flags = flags
        self.requires = requires

    def copy(self):
        """Return a scenario with its own copies of name, flags and requires."""
        name = str(self.name)
        flags = [*self.flags]
        requires = [*self.requires]
        return TestScenario(name, flags, requires)

    def format_flags(self):
        """Return the flags as a comma-separated list of double-quoted strings."""
        return ", ".join('"' + str(flag) + '"' for flag in self.flags)

class Generator:
    def __init__(self):
        """Set up element types, access offsets, array length and scenarios.

        Every base scenario is expanded into two entries of ``self.scenarios``:
        ``<name>-A`` with ``-XX:+AlignVector`` and ``<name>-U`` with
        ``-XX:-AlignVector``, in that order.
        """
        self.types = [
            Type("int",    4, -11,      "*", "MUL_V"),
            Type("long",   8, -11,      "+", "ADD_V"), # arm NEON does not support MulVL
            Type("short",  2, -11,      "*", "MUL_V"),
            Type("char",   2, -11,      "*", "MUL_V"), # char behaves like short
            Type("byte",   1, 11,       "*", "MUL_V"),
            Type("float",  4, "1.001f", "*", "MUL_V"),
            Type("double", 8, "1.001",  "*", "MUL_V"),
        ]
        self.offsets = [
            0,
            -1, 1,
            -2, 2,
            -3, 3,
            -4, 4,
            -7, 7,
            -8, 8,
            -14, 14,
            -16, 16,
            -18, 18,
            -20, 20,
            -31, 31,
            -32, 32,
            -63, 63,
            -64, 64,
            -65, 65,
            -128, 128,
            -129, 129,
            -192, 192, # 3 * 64
        ]
        # Length of the Java arrays (emitted as the RANGE constant).
        self.range = 512

        # jtreg @requires expressions
        req_x86 =     '(os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64")'
        req_not_x86 = '(os.arch!="x86" & os.arch!="i386" & os.arch!="amd64" & os.arch!="x86_64")'

        req_sse4     = 'vm.cpu.features ~= ".*sse4.*"'
        req_avx1     = 'vm.cpu.features ~= ".*avx.*"'
        req_avx2     = 'vm.cpu.features ~= ".*avx2.*"'
        req_avx512   = 'vm.cpu.features ~= ".*avx512.*"'
        req_avx512bw = 'vm.cpu.features ~= ".*avx512bw.*"'

        # VM flags
        align_0 = "-XX:-AlignVector"
        align_1 = "-XX:+AlignVector"

        v_002 = "-XX:MaxVectorSize=2"
        v_004 = "-XX:MaxVectorSize=4"
        v_008 = "-XX:MaxVectorSize=8"
        v_016 = "-XX:MaxVectorSize=16"
        v_032 = "-XX:MaxVectorSize=32"
        v_064 = "-XX:MaxVectorSize=64"

        sse4 = "-XX:UseSSE=4"
        avx1 = "-XX:UseAVX=1"
        avx2 = "-XX:UseAVX=2"
        avx3 = "-XX:UseAVX=3"
        knl = "-XX:+UseKNLSetting"

        scenarios = [
            TestScenario("vanilla",   [], []),
            TestScenario("sse4-v016",   [sse4, v_016], [req_x86, req_sse4]),
            TestScenario("sse4-v008",   [sse4, v_008], [req_x86, req_sse4]),
            TestScenario("sse4-v004",   [sse4, v_004], [req_x86, req_sse4]),
            # Uses MaxVectorSize=2 as the scenario name says; it previously
            # passed MaxVectorSize=4 and so duplicated "sse4-v004".
            TestScenario("sse4-v002",   [sse4, v_002], [req_x86, req_sse4]),
            TestScenario("avx1-v032",   [avx1, v_032], [req_x86, req_avx1]),
            TestScenario("avx1-v016",   [avx1, v_016], [req_x86, req_avx1]),
            TestScenario("avx2-v032",   [avx2, v_032], [req_x86, req_avx2]),
            TestScenario("avx2-v016",   [avx2, v_016], [req_x86, req_avx2]),
            TestScenario("avx512-v064", [avx3, knl, v_064], [req_x86, req_avx512]),
            TestScenario("avx512-v032", [avx3, knl, v_032], [req_x86, req_avx512]),
            TestScenario("avx512bw-v064", [avx3, v_064], [req_x86, req_avx512bw]),
            TestScenario("avx512bw-v032", [avx3, v_032], [req_x86, req_avx512bw]),
            TestScenario("vec-v064", [v_064], [req_not_x86]),
            TestScenario("vec-v032", [v_032], [req_not_x86]),
            TestScenario("vec-v016", [v_016], [req_not_x86]),
            TestScenario("vec-v008", [v_008], [req_not_x86]),
            TestScenario("vec-v004", [v_004], [req_not_x86]),
        ]

        # Expand each base scenario into an aligned (-A) and unaligned (-U) one.
        self.scenarios = []
        for base in scenarios:
            for suffix, align_flag in (("-A", align_1), ("-U", align_0)):
                variant = base.copy()
                variant.name += suffix
                variant.flags.append(align_flag)
                self.scenarios.append(variant)

    def get_test_list(self):
        """Return a new list with one Test per (type, offset) pair.

        Types form the outer order and offsets the inner order.
        """
        return [Test(elem_type, off) for elem_type in self.types for off in self.offsets]

    def generate(self, class_name, path_name, bugid, package_name):
        """Build the complete Java source and write it to disk.

        The output consists of the license header, an explanatory comment, one
        jtreg header per scenario, and the test class itself. It is written to
        ``{path_name}/{class_name}.java``, one list entry per line.
        """
        type_names = ", ".join([t.name for t in self.types])
        offset_names = ", ".join([str(o) for o in self.offsets])

        # License header and the explanatory block comment.
        lines = [
            f"{COPYRIGHT}",
            "",
            "/*",
            " * Summary:",
            " *   Test SuperWord vectorization with different access offsets",
            " *   and various MaxVectorSize values, and +- AlignVector.",
            " *",
            " * Note: this test is auto-generated. Please modify / generate with script:",
            " *       https://bugs.openjdk.org/browse/JDK-8308606",
            " *",
            " * Types: " + type_names,
            " * Offsets: " + offset_names,
            " *",
            " * Checking if we should vectorize is a bit complicated. It depends on",
            " * Matcher::vector_width_in_bytes, of the respective platforms (eg. x86.ad)",
            " * This vector_width can be further constrained by MaxVectorSize.",
            " *",
            " * With '-XX:-AlignVector', we vectorize if:",
            " *  - Vectors have at least 4 bytes:    vector_width >= 4",
            " *  - Vectors hold at least 2 elements: vector_width >= 2 * sizeofop(velt_type)",
            " *    -> min_vector_width = max(4, 2 * sizeofop(velt_type))",
            " *    -> simplifies to: vector_width >= min_vector_width",
            " *  - No cyclic dependency:",
            " *    - Access: data[i + offset] = data[i] * fac;",
            " *    - byte_offset = offset * sizeofop(type)",
            " *    - Cyclic dependency if: 0 < byte_offset < vector_width",
            " *",
            " * Note: sizeofop(type) = sizeof(type), except sizeofop(char) = 2",
            " *",
            " * Different types can lead to different vector_width. This depends on",
            " * the CPU-features. Thus, we have a positive and negative IR rule per",
            " * CPU-feature for each test.",
            " *",
            " * Definition:",
            " *     MaxVectorSize: limit through flag",
            " *     vector_width: limit given by specific CPU feature for a specific velt_type",
            " *     actual_vector_width: what is actually vectorized with",
            " *     min_vector_width: what is minimally required for vectorization",
            " *",
            " *     min_vector_width = max(4, 2 * sizeofop(velt_type))",
            " *     MaxVectorSize >= vector_width >= actual_vector_width >= min_vector_width",
            " *",
            " * In general, we cannot easily specify negative IR rules, that require no",
            " * vectorization to happen. We may improve the SuperWord algorithm later,",
            " * or some additional optimization collapses some Loads, and suddenly cyclic",
            " * dependency disappears, and we can vectorize.",
            " *",
            " * With '-XX:+AlignVector', we would like to check that we vectorize exactly iff:",
            " *     byte_offset % actual_vector_width == 0",
            " * Because all vector_widths are powers of 2, this is equivalent to:",
            " *     pow2_factor(byte_offset) >= actual_vector_width",
            " * where pow2_factor computes the largest power of 2 that is a factor of the number.",
            " *",
            " * Under these assumptions, we know there must be vectorization:",
            " *     pow2_factor(byte_offset) >= vector_width",
            " *       implies",
            " *         pow2_factor(byte_offset) >= actual_vector_width",
            " *     MaxVectorSize >= min_vector_size",
            " *       else any vectorization is impossible.",
            " *",
            " * And under the following conditions no vectorization is possible:",
            " *     byte_offset < 0: No cyclic dependency.",
            " *       Cyclic dependency could lead to Load removals, then only the store is vectorized.",
            " *     byte_offset % min_vector_width != 0",
            " *       implies",
            " *         byte_offset % actual_vector_width != 0",
            " *",
            " */",
            "",
        ]

        # One jtreg header per scenario, placed before the package statement.
        self.generate_jtreg_tests(lines, class_name, bugid, package_name)

        # Class prologue.
        lines.extend([
            f"package {package_name};",
            "import compiler.lib.ir_framework.*;",
            "",
            f"public class {class_name} {{",
            f"    static final int RANGE = {self.range};",
            "",
        ])

        # Class body, section by section.
        self.generate_gold_def(lines)
        self.generate_static_block(lines)
        self.generate_main(lines, class_name, package_name)
        self.generate_tests(lines)
        self.generate_inits(lines)
        self.generate_verifys(lines)
        lines.append("}")

        with open(f"{path_name}/{class_name}.java", "w") as f:
            f.writelines(f"{line}\n" for line in lines)

    def generate_jtreg_tests(self, lines, class_name, bugid, package_name):
        """Append one jtreg header comment per scenario to ``lines``.

        Each header carries the scenario name as test id, the bug id(s), the
        scenario's @requires expressions and an @run line that passes the
        scenario name as the single program argument.
        """
        main_class = f"{package_name}.{class_name}"
        for scenario in self.scenarios:
            requires = [f" * @requires {req}" for req in scenario.requires]
            # Disabled alternative for the @run line: "driver/timeout=400".
            lines.extend([
                "/*",
                f" * @test id={scenario.name}",
                f" * @bug {bugid}",
                " * @summary Test SuperWord: vector size, offsets, dependencies, alignment.",
                " * @requires vm.compiler2.enabled",
                *requires,
                " * @library /test/lib /",
                f" * @run driver {main_class} {scenario.name}",
                " */",
                "",
            ])

    def generate_gold_def(self, lines):
        """Append one static reference ("gold") array declaration per test case,
        followed by a blank line."""
        lines.extend(
            f"    static {case.t.name}[] {case.gold()} = new {case.t.name}[RANGE];"
            for case in self.get_test_list()
        )
        lines.append("")

    def generate_main(self, lines, class_name, package_name):
        """Append the Java ``main`` method to ``lines``.

        The generated method adds the common VM flags, requires exactly one
        program argument, and adds the flags of the scenario named by that
        argument before starting the framework.
        """
        qualified = f"{package_name}.{class_name}"
        # Continuation arguments are aligned under the first addFlags argument.
        pad = " " * 27
        lines.extend([
            "    public static void main(String args[]) {",
            f"        TestFramework framework = new TestFramework({class_name}.class);",
            '        framework.addFlags("-XX:-TieredCompilation",',
            f'{pad}"-XX:CompileCommand=compileonly,{qualified}::init",',
            f'{pad}"-XX:CompileCommand=compileonly,{qualified}::test*",',
            f'{pad}"-XX:CompileCommand=compileonly,{qualified}::verify",',
            f'{pad}"-XX:LoopUnrollLimit=250");',
            "",
            "        if (args.length != 1) {",
            '            throw new RuntimeException("Test requires exactly one argument!");',
            "        }",
            "",
            "        switch (args[0]) {",
        ])
        # One switch case per scenario.
        for scenario in self.scenarios:
            lines.extend([
                f'        case "{scenario.name}":',
                f"            framework.addFlags({scenario.format_flags()});",
                "            break;",
            ])
        lines.extend([
            "        default:",
            '            throw new RuntimeException("Test argument not recognized: " + args[0]);',
            "        }",
            "        framework.start();",
            "    }",
            "",
        ])

    def generate_static_block(self, lines):
        """Append the Java static initializer to ``lines``.

        For every test case the initializer first fills the gold array via
        ``init`` and then runs the test method on it.
        """
        lines.append("    static {")
        lines.append("        // compute the gold standard in interpreter mode")
        for case in self.get_test_list():
            gold = case.gold()
            lines.extend([
                f"        init({gold});",
                f"        {case.test()}({gold});",
            ])
        lines.extend(["    }", ""])

    def generate_tests(self, lines):
        """Append the @IR rules, the @Test method and the @Run method of every
        test case to ``lines``.

        Per platform of the element type, one rule is generated for
        ``-XX:-AlignVector`` and at most one for ``-XX:+AlignVector``.
        """
        def make_rule(platform, ir_op, align_vector):
            # All rules share the AlignVector pre-constraint and the same nodes.
            rule = PlatformIRRule(platform)
            align = IRBool.makeTrue() if align_vector else IRBool.makeFalse()
            rule.add_pre_constraint("AlignVector", align)
            for node in ("IRNode.LOAD_VECTOR", f"IRNode.{ir_op}", "IRNode.STORE_VECTOR"):
                rule.add_node(node)
            return rule

        lines.extend(["    // ------------------- Tests -------------------", ""])
        for test in self.get_test_list():
            elem = test.t
            offset = test.offset
            lines.append("    @Test")

            # ---- IR rules, one group per platform
            for p in elem.platforms():
                width = p.vector_width
                lines.append(f"    // CPU: {p.name} -> vector_width: {width} -> elements in vector: {width // elem.size}")
                # a vector needs at least 4 bytes and at least 2 elements
                min_vector_width = max(4, 2 * elem.size)
                byte_offset = offset * elem.size

                # -AlignVector
                rule = make_rule(p, elem.ir_op, False)
                rule.add_constraint("MaxVectorSize", IRRange.lower_bound(min_vector_width))
                if 0 < byte_offset < width:
                    # A positive offset below the vector width can create a
                    # cyclic dependency, so cap MaxVectorSize at the offset.
                    lines.append(f"    //   positive byte_offset {byte_offset} can lead to cyclic dependency")
                    rule.add_constraint("MaxVectorSize", IRRange.upper_bound(byte_offset))
                rule.generate(lines)

                # +AlignVector
                if byte_offset < 0 and byte_offset % min_vector_width != 0:
                    # The offset is not a multiple of even the smallest vector
                    # width, so no vector width can divide it: expect no vector
                    # nodes. Positive offsets are excluded because cyclic
                    # dependencies may still be handled well.
                    lines.append("    //   Strict alignment not possible.")
                    rule = make_rule(p, elem.ir_op, True)
                    rule.disable_negative_rules()
                    rule.expect_no_nodes()
                    rule.generate(lines)
                elif pow2_factor(byte_offset) >= width:
                    # Vectorization is expected when the offset is a multiple
                    # of the actual vector width. That width is unknown but it
                    # is at most the platform width, and all widths are powers
                    # of two, so comparing pow2_factor(byte_offset) against the
                    # platform width is sufficient.
                    lines.append("    //   Vectorize when strict alignment guaranteed.")
                    rule = make_rule(p, elem.ir_op, True)
                    rule.disable_negative_rules()
                    rule.add_constraint("MaxVectorSize", IRRange.lower_bound(min_vector_width))
                    rule.generate(lines)

            # ---- test method
            first = max(0, -offset)
            end_diff = f" - {offset}" if offset > 0 else ""
            lines.extend([
                f"    public static void {test.test()}({elem.name}[] data) {{",
                f"        for (int j = {first}; j < RANGE{end_diff}; j++) {{",
                f"            data[j + {offset}] = ({elem.name})(data[j] {elem.operator} ({elem.name}){elem.factor});",
                "        }",
                "    }",
                "",
            ])

            # ---- run method
            lines.extend([
                f'    @Run(test = "{test.test()}")',
                "    @Warmup(0)",
                f"    public static void {test.run()}() {{",
                f"        {elem.name}[] data = new {elem.name}[RANGE];",
                "        init(data);",
                f"        {test.test()}(data);",
                f'        verify("{test.test()}", data, {test.gold()});',
                "    }",
                "",
            ])

    def generate_inits(self, lines):
        """Append one Java ``init`` overload per element type to ``lines``.

        Each overload fills the array with its index values cast to the
        element type.
        """
        lines.extend(["    // ------------------- Initialization -------------------", ""])
        for elem in self.types:
            lines.extend([
                f"    static void init({elem.name}[] data) {{",
                "        for (int j = 0; j < RANGE; j++) {",
                f"            data[j] = ({elem.name})j;",
                "        }",
                "    }",
                "",
            ])

    def generate_verifys(self, lines):
        """Append one Java ``verify`` overload per element type to ``lines``.

        Each overload compares the data array element-wise against the gold
        array and throws a RuntimeException on the first mismatch. No blank
        line is emitted after the individual overloads.
        """
        lines.extend(["    // ------------------- Verification -------------------", ""])
        for elem in self.types:
            lines.extend([
                f"    static void verify(String context, {elem.name}[] data, {elem.name}[] gold) {{",
                "        for (int i = 0; i < RANGE; i++) {",
                "            if (data[i] != gold[i]) {",
                '                throw new RuntimeException(" Invalid " + context + " result: data[" + i + "]: " + data[i] + " != " + gold[i]);',
                "            }",
                "        }",
                "    }",
            ])
def main(argv=None):
    """Generate TestDependencyOffsets.java.

    Args:
        argv: optional list of command-line arguments (without the program
            name); defaults to ``sys.argv[1:]``. If a first argument is given
            it is used as the output directory, otherwise the original
            hard-coded directory is used.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    default_path = "/home/emanuel/Documents/fork7-jdk/open/test/hotspot/jtreg/compiler/loopopts/superword"
    path_name = args[0] if args else default_path
    g = Generator()
    g.generate("TestDependencyOffsets",
               path_name,
               "8298935 8308606", # Bug ID
               "compiler.loopopts.superword", # package
    )

if __name__ == "__main__":
    main()
