[Bugfix] Correctly call `cudaProfilerStop` in benchmarks script (#14183)

Signed-off-by: Brayden Zhong <b8zhong@uwaterloo.ca>

[Bugfix] Correctly call `cudaProfilerStop` in benchmarks script (#14183)
Signed-off-by: Brayden Zhong <b8zhong@uwaterloo.ca>
c34eeec5 · Brayden Zhong · GitHub · ad60bbb2 · c34eeec5 · c34eeec5
Unverified commit c34eeec5, authored Mar 06, 2025 by Brayden Zhong; committed by GitHub on Mar 07, 2025.
6 changed files
--- a/benchmarks/kernels/benchmark_layernorm.py
+++ b/benchmarks/kernels/benchmark_layernorm.py
@@ -40,7 +40,7 @@ def main(num_tokens: int,
        end_time = time.perf_counter()
        if profile:
-            torch.cuda.cudart().cudaProfilerStart()
+            torch.cuda.cudart().cudaProfilerStop()
        return (end_time - start_time) / num_iters
    # Warmup.

--- a/benchmarks/kernels/benchmark_lora.py
+++ b/benchmarks/kernels/benchmark_lora.py
@@ -153,7 +153,6 @@ def ref_group_gemm(ref_out: torch.Tensor, input: torch.Tensor,
        result = torch.nn.functional.linear(x, w)
        result *= scaling
        out_list.append(result)
-    torch.cat(out_list, dim=0)
    cat_result = torch.cat(out_list, dim=0)

--- a/benchmarks/kernels/benchmark_machete.py
+++ b/benchmarks/kernels/benchmark_machete.py
@@ -45,7 +45,6 @@ def terse_type_name(dt):
        torch.float16: "fp16",
        torch.int8: "int8",
        torch.float8_e4m3fn: "fp8",
-        torch.bfloat16: "bf16",
        torch.float: "float",
        torch.int: "int",
    }[dt]
@@ -259,7 +258,7 @@ def machete_create_bench_fn(bt: BenchmarkTensors,
    return lambda: ops.machete_mm(
        a=bt.a,
-        b_q=bt.w_q,
+        b_q=w_q,
        b_type=bt.wtype,
        b_group_scales=bt.w_g_s,
        b_group_zeros=w_g_zp,

--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
 # SPDX-License-Identifier: Apache-2.0
 import argparse
+import json
 import time
 from contextlib import nullcontext
 from datetime import datetime

--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@@ -176,7 +176,7 @@ def main(
        end_time = time.perf_counter()
        if profile:
-            torch.cuda.cudart().cudaProfilerStart()
+            torch.cuda.cudart().cudaProfilerStop()
        return (end_time - start_time) / num_iters
    # Warmup.

--- a/benchmarks/kernels/benchmark_quant.py
+++ b/benchmarks/kernels/benchmark_quant.py
@@ -40,7 +40,7 @@ def main(num_tokens: int,
        end_time = time.perf_counter()
        if profile:
-            torch.cuda.cudart().cudaProfilerStart()
+            torch.cuda.cudart().cudaProfilerStop()
        return (end_time - start_time) / num_iters
    # Warmup.