#!/usr/bin/env python3
"""Benchmark the per-call cost of the ``guard_loop`` custom torch op.

The script loads an op library given by ``--lib``, calls
``torch.ops.fastpt_c_overhead_mre.guard_loop(tensor, inner_loops)`` in a timed
loop for each requested ``inner_loops`` value, and writes one CSV row per
value to stdout.

NOTE(review): the semantics of ``guard_loop`` are defined by the loaded
library and are not visible here; the column name ``median_per_guard_us``
suggests ``inner_loops`` is the number of guard evaluations per call --
confirm against the op's implementation.
"""
from __future__ import annotations

import argparse
import csv
import statistics
import sys
import time

import torch


def parse_int_list(value: str) -> list[int]:
    """Parse a comma-separated string of integers into a list.

    Blank or whitespace-only items (e.g. from a trailing comma) are skipped.

    Raises:
        ValueError: if a non-blank item is not a valid integer literal.
    """
    return [int(item) for item in value.split(",") if item.strip()]


def sync() -> None:
    """Block until queued work on the current CUDA device has finished."""
    torch.cuda.synchronize()


def main() -> None:
    """Parse CLI arguments, run the benchmark, and emit CSV on stdout."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--lib", required=True)
    parser.add_argument("--device", type=int, default=0)
    parser.add_argument(
        "--inner-loops",
        type=parse_int_list,
        default=parse_int_list("0,1,2,4,8,16,32,64"),
    )
    parser.add_argument("--steps", type=int, default=10000)
    parser.add_argument("--warmup", type=int, default=1000)
    parser.add_argument("--rounds", type=int, default=5)
    args = parser.parse_args()

    # Reject values that would otherwise crash after the warm-up has already
    # run: steps is a divisor below, and median() needs at least one sample.
    if args.steps < 1:
        parser.error("--steps must be at least 1")
    if args.rounds < 1:
        parser.error("--rounds must be at least 1")

    torch.ops.load_library(args.lib)
    torch.cuda.set_device(args.device)
    # Allocated on the current CUDA device, i.e. the one selected just above.
    tensor = torch.empty(1024, device="cuda")
    op = torch.ops.fastpt_c_overhead_mre.guard_loop

    writer = csv.writer(sys.stdout)
    writer.writerow(
        [
            "section",
            "inner_loops",
            "steps",
            "warmup",
            "rounds",
            "median_step_us",
            "mean_step_us",
            "median_per_guard_us",
        ]
    )

    for inner_loops in args.inner_loops:
        # Untimed warm-up calls before measuring this configuration.
        for _ in range(args.warmup):
            op(tensor, inner_loops)
        sync()

        values = []
        for _ in range(args.rounds):
            # Synchronize on both sides of the timed region so the wall-clock
            # interval covers only work issued inside it.
            sync()
            start = time.perf_counter_ns()
            for _ in range(args.steps):
                op(tensor, inner_loops)
            sync()
            stop = time.perf_counter_ns()
            # Nanoseconds for the whole round -> microseconds per step.
            values.append((stop - start) / args.steps / 1000.0)

        median_step = statistics.median(values)
        writer.writerow(
            [
                "guard_loop",
                inner_loops,
                args.steps,
                args.warmup,
                args.rounds,
                f"{median_step:.6f}",
                f"{statistics.mean(values):.6f}",
                # inner_loops == 0 has no per-guard cost to report; emit zero
                # rather than dividing by zero.
                f"{median_step / inner_loops:.6f}" if inner_loops else "0.000000",
            ]
        )


if __name__ == "__main__":
    main()