"docs/benchmarking/cli.md" did not exist on "67187554dd478ba76e79d7a6f8bf02be01290de3"
Unverified Commit 66a22096 authored by Kunshang Ji's avatar Kunshang Ji Committed by GitHub
Browse files

[Hardware] Replace `torch.cuda.synchronize()` api with `torch.accelerator.synchronize` (#36085)


Signed-off-by: default avatarKunshang Ji <kunshang.ji@intel.com>
parent 0bfa229b
......@@ -701,7 +701,7 @@ def _run_single_benchmark(
# Warmup
for _ in range(config.warmup_iters):
forward_fn()
torch.cuda.synchronize()
torch.accelerator.synchronize()
# Benchmark
times = []
......@@ -714,7 +714,7 @@ def _run_single_benchmark(
forward_fn()
end.record()
torch.cuda.synchronize()
torch.accelerator.synchronize()
elapsed_ms = start.elapsed_time(end)
times.append(elapsed_ms / 1000.0 / config.num_layers)
......
......@@ -391,7 +391,7 @@ def _run_single_benchmark(
attn_metadata,
output=out,
)
torch.cuda.synchronize()
torch.accelerator.synchronize()
# Benchmark
times = []
......@@ -412,7 +412,7 @@ def _run_single_benchmark(
)
end.record()
torch.cuda.synchronize()
torch.accelerator.synchronize()
elapsed_ms = start.elapsed_time(end)
times.append(elapsed_ms / 1000.0 / config.num_layers) # seconds per layer
......
......@@ -94,7 +94,7 @@ def create_logits(
def measure_memory() -> tuple[int, int]:
"""Return (allocated, reserved) memory in bytes."""
torch.cuda.synchronize()
torch.accelerator.synchronize()
return torch.cuda.memory_allocated(), torch.cuda.max_memory_allocated()
......@@ -123,7 +123,7 @@ def benchmark_function(
for _ in range(warmup_iters):
logits_copy = logits.clone()
func(logits_copy, k, p)
torch.cuda.synchronize()
torch.accelerator.synchronize()
# Reset memory stats before benchmark
reset_memory_stats()
......@@ -140,7 +140,7 @@ def benchmark_function(
func(logits_copy, k, p)
end_events[i].record()
torch.cuda.synchronize()
torch.accelerator.synchronize()
# Calculate timing
times = [
......
......@@ -168,7 +168,7 @@ def bench_impl(
# warmup
for kwargs in kwargs_list:
impl_type.get_impl()(**kwargs)
torch.cuda.synchronize()
torch.accelerator.synchronize()
# Merge into a single kwargs and qualify arguments as ArgPool
kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
......
......@@ -171,7 +171,7 @@ def bench_run(
activation=MoEActivation.SILU,
global_num_experts=num_experts,
)
torch.cuda.synchronize()
torch.accelerator.synchronize()
# Create CUDA graphs for Triton (match benchmark_moe.py pattern exactly)
triton_stream = torch.cuda.Stream()
......@@ -187,14 +187,14 @@ def bench_run(
topk_ids,
quant_config=quant_config,
)
torch.cuda.synchronize()
torch.accelerator.synchronize()
def bench_cuda_graph(graph, num_warmup=5, num_iters=100):
"""Benchmark CUDA graph using events like benchmark_moe.py"""
# Warmup
for _ in range(num_warmup):
graph.replay()
torch.cuda.synchronize()
torch.accelerator.synchronize()
# Timing
start_event = torch.Event(enable_timing=True)
......@@ -202,7 +202,7 @@ def bench_run(
latencies = []
for _ in range(num_iters):
torch.cuda.synchronize()
torch.accelerator.synchronize()
start_event.record()
graph.replay()
end_event.record()
......
......@@ -307,7 +307,7 @@ def bench_run(
def replay_graph(graph, num_repeats):
for _ in range(num_repeats):
graph.replay()
torch.cuda.synchronize()
torch.accelerator.synchronize()
cutlass_stream = torch.cuda.Stream()
cutlass_graph = torch.cuda.CUDAGraph()
......@@ -330,7 +330,7 @@ def bench_run(
e=num_experts,
device=device,
)
torch.cuda.synchronize()
torch.accelerator.synchronize()
triton_stream = torch.cuda.Stream()
triton_graph = torch.cuda.CUDAGraph()
......@@ -345,7 +345,7 @@ def bench_run(
w2_fp8scale,
a_fp8_scale,
)
torch.cuda.synchronize()
torch.accelerator.synchronize()
min_run_time = 5
num_warmup = 5
......
......@@ -342,7 +342,7 @@ class CommunicatorBenchmark:
if not should_use_fn(tensor):
return None
torch.cuda.synchronize()
torch.accelerator.synchronize()
stream = torch.cuda.Stream()
with torch.cuda.stream(stream):
graph_input = tensor.clone()
......@@ -360,17 +360,17 @@ class CommunicatorBenchmark:
for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
allreduce_fn(graph_input)
torch.cuda.synchronize()
torch.accelerator.synchronize()
for _ in range(num_warmup):
graph.replay()
torch.cuda.synchronize()
torch.accelerator.synchronize()
torch.cuda.synchronize()
torch.accelerator.synchronize()
start_time = time.perf_counter()
for _ in range(num_trials):
graph.replay()
torch.cuda.synchronize()
torch.accelerator.synchronize()
end_time = time.perf_counter()
......
......@@ -385,7 +385,7 @@ def benchmark_operation(
# Warmup before graph capture
for _ in range(warmup):
operation_func(*args, **kwargs)
torch.cuda.synchronize()
torch.accelerator.synchronize()
# Create CUDA graph
graph = torch.cuda.CUDAGraph()
......@@ -398,19 +398,19 @@ def benchmark_operation(
operation_func(*args, **kwargs)
# Graph warmup
torch.cuda.synchronize()
torch.accelerator.synchronize()
for _ in range(warmup):
graph.replay()
# Benchmark with CUDA graph
torch.cuda.synchronize()
torch.accelerator.synchronize()
start_time = time.perf_counter()
for _ in range(trials // num_op_per_cudagraph):
# operation_func(*args, **kwargs)
graph.replay()
torch.cuda.synchronize()
torch.accelerator.synchronize()
end_time = time.perf_counter()
avg_time_ms = ((end_time - start_time) / trials) * 1000
......
......@@ -224,7 +224,7 @@ def bench_run(
def replay_graph(graph, num_repeats):
for _ in range(num_repeats):
graph.replay()
torch.cuda.synchronize()
torch.accelerator.synchronize()
cutlass_stream = torch.cuda.Stream()
cutlass_graph = torch.cuda.CUDAGraph()
......@@ -239,7 +239,7 @@ def bench_run(
topk_weights,
topk_ids,
)
torch.cuda.synchronize()
torch.accelerator.synchronize()
triton_stream = torch.cuda.Stream()
triton_graph = torch.cuda.CUDAGraph()
......@@ -254,7 +254,7 @@ def bench_run(
w2_scale,
a_scale,
)
torch.cuda.synchronize()
torch.accelerator.synchronize()
min_run_time = 5
num_warmup = 5
......
......@@ -34,14 +34,14 @@ def main(
residual = torch.randn_like(x) * scale if add_residual else None
def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
torch.cuda.synchronize()
torch.accelerator.synchronize()
if profile:
torch.cuda.cudart().cudaProfilerStart()
start_time = time.perf_counter()
for _ in range(num_iters):
layer(x, residual)
torch.cuda.synchronize()
torch.accelerator.synchronize()
end_time = time.perf_counter()
if profile:
......
......@@ -1035,7 +1035,7 @@ def bench_optype(
# Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up
for kwargs in kwargs_list:
op_type.bench_fn()(**kwargs)
torch.cuda.synchronize()
torch.accelerator.synchronize()
# Merge into a single kwargs and qualify arguments as ArgPool
kwargs = {k: ArgPool([]) for k in kwargs_list[0]}
......
......@@ -47,13 +47,13 @@ def benchmark_method(
# Warmup
for _ in range(num_warmup):
_ = method(k_nope, k_pe)
torch.cuda.synchronize()
torch.accelerator.synchronize()
# Benchmark
start = time.perf_counter()
for _ in range(num_iters):
_ = method(k_nope, k_pe)
torch.cuda.synchronize()
torch.accelerator.synchronize()
end = time.perf_counter()
return (end - start) / num_iters * 1000 # Convert to ms
......
......@@ -304,19 +304,19 @@ def benchmark_config(
# JIT compilation & warmup
run()
torch.cuda.synchronize()
torch.accelerator.synchronize()
# Capture 10 invocations with CUDA graph
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
for _ in range(10):
run()
torch.cuda.synchronize()
torch.accelerator.synchronize()
# Warmup
for _ in range(5):
graph.replay()
torch.cuda.synchronize()
torch.accelerator.synchronize()
start_event = torch.Event(enable_timing=True)
end_event = torch.Event(enable_timing=True)
......@@ -324,7 +324,7 @@ def benchmark_config(
latencies: list[float] = []
for i in range(num_iters):
prepare(i)
torch.cuda.synchronize()
torch.accelerator.synchronize()
start_event.record()
graph.replay()
......
......@@ -131,7 +131,7 @@ def benchmark_config(
topk_ids,
quant_config=quant_config,
)
torch.cuda.synchronize()
torch.accelerator.synchronize()
# Benchmark
start = torch.cuda.Event(enable_timing=True)
......@@ -149,7 +149,7 @@ def benchmark_config(
quant_config=quant_config,
)
end.record()
torch.cuda.synchronize()
torch.accelerator.synchronize()
return start.elapsed_time(end) / num_iters * 1000 # ms -> us
......
......@@ -69,19 +69,19 @@ def benchmark_permute(
# JIT compilation & warmup
run()
torch.cuda.synchronize()
torch.accelerator.synchronize()
# Capture 10 invocations with CUDA graph
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
for _ in range(10):
run()
torch.cuda.synchronize()
torch.accelerator.synchronize()
# Warmup
for _ in range(5):
graph.replay()
torch.cuda.synchronize()
torch.accelerator.synchronize()
start_event = torch.Event(enable_timing=True)
end_event = torch.Event(enable_timing=True)
......@@ -89,7 +89,7 @@ def benchmark_permute(
latencies: list[float] = []
for i in range(num_iters):
prepare(i)
torch.cuda.synchronize()
torch.accelerator.synchronize()
start_event.record()
graph.replay()
......@@ -159,26 +159,26 @@ def benchmark_unpermute(
# JIT compilation & warmup
input = prepare()
run(input)
torch.cuda.synchronize()
torch.accelerator.synchronize()
# Capture 10 invocations with CUDA graph
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
for _ in range(10):
run(input)
torch.cuda.synchronize()
torch.accelerator.synchronize()
# Warmup
for _ in range(5):
graph.replay()
torch.cuda.synchronize()
torch.accelerator.synchronize()
start_event = torch.Event(enable_timing=True)
end_event = torch.Event(enable_timing=True)
latencies: list[float] = []
for i in range(num_iters):
torch.cuda.synchronize()
torch.accelerator.synchronize()
start_event.record()
graph.replay()
end_event.record()
......
......@@ -135,14 +135,14 @@ def benchmark_mrope(
key.clone(),
)
torch.cuda.synchronize()
torch.accelerator.synchronize()
# Time reference implementation
torch_times = []
for _ in range(benchmark_iter):
query_clone = query.clone()
key_clone = key.clone()
torch.cuda.synchronize()
torch.accelerator.synchronize()
start_time = time.time()
mrope_helper_class.forward_native(
......@@ -151,7 +151,7 @@ def benchmark_mrope(
key_clone,
)
torch.cuda.synchronize()
torch.accelerator.synchronize()
torch_times.append(time.time() - start_time)
# Time triton kernel implementation
......@@ -159,14 +159,14 @@ def benchmark_mrope(
for _ in range(benchmark_iter):
query_clone = query.clone()
key_clone = key.clone()
torch.cuda.synchronize()
torch.accelerator.synchronize()
start_time = time.time()
mrope_helper_class.forward_cuda(
positions,
query_clone,
key_clone,
)
torch.cuda.synchronize()
torch.accelerator.synchronize()
triton_times.append(time.time() - start_time)
# Calculate statistics
......
......@@ -103,7 +103,7 @@ def main(
max_logits = torch.empty_like(exp_sums)
def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
torch.cuda.synchronize()
torch.accelerator.synchronize()
if profile:
torch.cuda.cudart().cudaProfilerStart()
start_time = time.perf_counter()
......@@ -173,7 +173,7 @@ def main(
)
else:
raise ValueError(f"Invalid version: {version}")
torch.cuda.synchronize()
torch.accelerator.synchronize()
end_time = time.perf_counter()
if profile:
......
......@@ -28,7 +28,7 @@ def _time_cuda(
# warmup
for _ in range(warmup_iters):
fn()
torch.cuda.synchronize()
torch.accelerator.synchronize()
start = torch.Event(enable_timing=True)
end = torch.Event(enable_timing=True)
......@@ -37,7 +37,7 @@ def _time_cuda(
for _ in range(bench_iters):
fn()
end.record()
torch.cuda.synchronize()
torch.accelerator.synchronize()
return start.elapsed_time(end) / bench_iters # ms/iter
......
......@@ -29,7 +29,7 @@ def main(
scale = torch.randn(1, 1, dtype=torch.float32) if static_scale else None
def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
torch.cuda.synchronize()
torch.accelerator.synchronize()
if profile:
torch.cuda.cudart().cudaProfilerStart()
start_time = time.perf_counter()
......@@ -39,7 +39,7 @@ def main(
ops.scaled_int8_quant(x, scale)
else:
ops.scaled_fp8_quant(x, scale)
torch.cuda.synchronize()
torch.accelerator.synchronize()
end_time = time.perf_counter()
if profile:
......
......@@ -84,16 +84,16 @@ def run_benchmark(
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
function_under_test()
torch.cuda.synchronize()
torch.accelerator.synchronize()
function_under_test = lambda: g.replay()
def run_cuda_benchmark(n_iters: int) -> float:
nonlocal key, value, key_cache, value_cache, slot_mapping
torch.cuda.synchronize()
torch.accelerator.synchronize()
start = time.perf_counter()
for _ in range(n_iters):
function_under_test()
torch.cuda.synchronize()
torch.accelerator.synchronize()
end = time.perf_counter()
return (end - start) / n_iters
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment