Fix bench script placing input data in L2 cache (#7739)

e34cf6ad · fzyzcjy · GitHub · 62222bd2 · e34cf6ad
Unverified Commit e34cf6ad authored Jul 27, 2025 by fzyzcjy Committed by GitHub Jul 27, 2025
Hide whitespace changes
Inline Side-by-side

Showing with 2 additions and 2 deletions

sgl-kernel/benchmark/bench_per_token_group_quant_8bit.py sgl-kernel/benchmark/bench_per_token_group_quant_8bit.py +2 -2

No files found.
--- a/sgl-kernel/benchmark/bench_per_token_group_quant_8bit.py
+++ b/sgl-kernel/benchmark/bench_per_token_group_quant_8bit.py
@@ -205,9 +205,9 @@ def benchmark(batch_size, seq_len, group_size, dst_dtype, provider):
    quantiles = [0.5, 0.2, 0.8]
    if provider == "triton":
-        fn = lambda: triton_per_token_group_quant_8bit(x.clone(), group_size, dst_dtype)
+        fn = lambda: triton_per_token_group_quant_8bit(x, group_size, dst_dtype)
    elif provider == "sglang":
-        fn = lambda: sglang_per_token_group_quant_8bit(x.clone(), group_size, dst_dtype)
+        fn = lambda: sglang_per_token_group_quant_8bit(x, group_size, dst_dtype)
    ms, min_ms, max_ms = triton.testing.do_bench(fn, quantiles=quantiles)