import time
import numpy as np
import migraphx

def pct(a, p):
    """Return the ``p``-th percentile of ``a`` as a plain Python float.

    Args:
        a: Array-like of numeric samples; converted to a float64 array.
        p: Percentile to compute, on the 0-100 scale used by
            ``np.percentile``.

    Returns:
        The percentile value converted with ``float()``.
    """
    samples = np.asarray(a, dtype=np.float64)
    value = np.percentile(samples, p)
    return float(value)

def build_mlp(batch, hidden, layers, seed=0):
    """Build an uncompiled MIGraphX program for a bias-free MLP.

    The program takes one float parameter named ``"x"`` with shape
    ``[batch, hidden]`` and applies ``layers`` rounds of ``dot`` with a
    ``hidden x hidden`` literal weight followed by ``relu``.

    Args:
        batch: First dimension of the input parameter.
        hidden: Second dimension of the input and both dimensions of each
            weight matrix.
        layers: Number of dot+relu stages to append.
        seed: Seed for the NumPy generator that produces the weights.

    Returns:
        The ``migraphx.program`` whose main module returns the final
        activation.
    """
    weight_rng = np.random.default_rng(seed)
    weight_dims = (hidden, hidden)

    prog = migraphx.program()
    main_mod = prog.get_main_module()

    input_shape = migraphx.shape(type="float", lens=[batch, hidden])
    act = main_mod.add_parameter("x", input_shape)

    for _ in range(layers):
        # Weights are drawn in layer order, so a given seed always yields
        # the same sequence of matrices.
        weights = weight_rng.random(weight_dims, dtype=np.float32)
        w_lit = main_mod.add_literal(weights)
        act = main_mod.add_instruction(migraphx.op("dot"), [act, w_lit])
        act = main_mod.add_instruction(migraphx.op("relu"), [act])

    main_mod.add_return([act])
    return prog

def run_case(target, batch, hidden, layers, warmup=20, iters=200):
    """Compile an MLP for ``target`` and time repeated inference runs.

    Args:
        target: Target name passed to ``migraphx.get_target``.
        batch: Number of rows in the random input; also used as the
            per-run sample count for throughput.
        hidden: Width of the input and of every layer.
        layers: Number of dot+relu stages in the model.
        warmup: Number of untimed runs executed before measuring. A value
            of zero or less skips warm-up.
        iters: Number of timed runs. Must be at least 1.

    Returns:
        A tuple ``(avg_ms, p50_ms, p95_ms, throughput)`` where ``avg_ms`` is
        the wall time of the whole timed loop divided by ``iters``,
        ``p50_ms``/``p95_ms`` are percentiles of the per-run latencies, and
        ``throughput`` is samples per second over the whole timed loop.

    Raises:
        ValueError: If ``iters`` is less than 1.
    """
    # Reject a non-positive iteration count before building or compiling
    # anything: the statistics below divide by ``iters`` and take
    # percentiles of the per-run samples, so they are undefined without at
    # least one timed run.
    if iters < 1:
        raise ValueError(f"iters must be >= 1, got {iters}")

    p = build_mlp(batch, hidden, layers)
    p.compile(migraphx.get_target(target))

    x = np.random.rand(batch, hidden).astype(np.float32)
    # The parameter map never changes, so build it once instead of
    # allocating a new dict inside the timed loop.
    params = {"x": x}

    for _ in range(warmup):
        p.run(params)

    # NOTE(review): the timings assume p.run blocks until the result is
    # ready - confirm for the chosen target and compile options.
    times = []
    t0 = time.perf_counter()
    for _ in range(iters):
        s = time.perf_counter()
        p.run(params)
        times.append((time.perf_counter() - s) * 1000)
    t1 = time.perf_counter()

    # avg/throughput use the outer wall-clock span, so they also include the
    # small per-iteration timer and list-append overhead.
    elapsed = t1 - t0
    avg = elapsed * 1000 / iters
    thr = (batch * iters) / elapsed
    return avg, pct(times, 50), pct(times, 95), thr

def main():
    """Run every configured benchmark case and print one summary line each."""
    # Original author's note: this is the target the intended image provides.
    target = "gpu"
    # Each entry is (batch, hidden, layers).
    cases = [
        # Heavier case: delete this line if host or GPU memory is insufficient.
        (128, 8192 * 4, 8),
    ]

    print(f"target={target}\n")
    for batch, hidden, layers in cases:
        stats = run_case(target, batch, hidden, layers, warmup=20, iters=200)
        avg, p50, p95, thr = stats
        summary = (
            f"[batch={batch} hidden={hidden} layers={layers}] "
            f"avg={avg:.3f} ms  p50={p50:.3f} ms  p95={p95:.3f} ms  "
            f"throughput={thr:.1f} samples/s"
        )
        print(summary)

# Run the benchmark only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

