[Hardware] Replace the `torch.cuda.synchronize()` API with `torch.accelerator.synchronize()` (#36085)

Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>

[Hardware] Replace the `torch.cuda.synchronize()` API with `torch.accelerator.synchronize()` (#36085)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
66a22096 · Kunshang Ji · GitHub · 0bfa229b · 66a22096 · 66a22096
Unverified commit 66a22096, authored Mar 05, 2026 by Kunshang Ji; committed by GitHub on Mar 05, 2026
19 changed files
--- a/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py
+++ b/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py
@@ -34,7 +34,7 @@ def do_profile(
        record_shapes=True,
    ) as tprof:
        fn(**fn_kwargs)
-        torch.cuda.synchronize(torch.cuda.current_device())
+        torch.accelerator.synchronize(torch.cuda.current_device())
    # TODO (varun): Add a descriptive trace file name
    tprof.export_chrome_trace(

--- a/tests/kernels/moe/test_block_fp8.py
+++ b/tests/kernels/moe/test_block_fp8.py
@@ -318,8 +318,8 @@ def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed, monkeypatch)
                out = deep_gemm_moe_fp8_fn(
                    a, w1, w2, w1_s, w2_s, topk_weights, topk_ids
                )
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
            graph.replay()
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
    torch.testing.assert_close(out, ref_out, atol=0.035, rtol=0.035)
--- a/tests/kernels/moe/test_cutlass_moe.py
+++ b/tests/kernels/moe/test_cutlass_moe.py
@@ -399,9 +399,9 @@ def test_cutlass_moe_8_bit_cuda_graph(
                mt, topk_weights, topk_ids, per_act_token, per_out_ch
            )
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        graph.replay()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        torch.testing.assert_close(triton_output, cutlass_output, atol=9e-2, rtol=1e-2)

--- a/tests/kernels/moe/test_moe.py
+++ b/tests/kernels/moe/test_moe.py
@@ -272,9 +272,9 @@ def run_moe_test(
                global_num_experts=global_num_experts,
                expert_map=expert_map,
            )
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        graph.replay()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
    torch.testing.assert_close(test_output, baseline_output, atol=atol, rtol=rtol)
@@ -768,7 +768,7 @@ def test_mixtral_moe(
                F.pad(vllm_moe.experts.w2_weight, (0, 128), "constant", 0)[..., 0:-128],
                requires_grad=False,
            )
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
            torch.accelerator.empty_cache()
        # FIXME (zyongye) fix this after we move self.kernel

--- a/tests/kernels/quantization/test_allspark_gemm.py
+++ b/tests/kernels/quantization/test_allspark_gemm.py
@@ -122,7 +122,7 @@ def test_gptq_allspark_gemm_ampere(mnk_factors, group_size, has_zp, dtype):
    )
    output_ref = torch.matmul(input, w_ref)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    max_diff = compute_max_diff(output, output_ref)
    assert max_diff < 0.04
--- a/tests/kernels/quantization/test_cutlass_w4a8_moe.py
+++ b/tests/kernels/quantization/test_cutlass_w4a8_moe.py
@@ -269,7 +269,7 @@ def test_cutlass_w4a8_moe_mm_end_to_end(shape, random_zero):
        setup.c_strides,
        setup.group_scale_strides,
    )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    out_ref = compute_moe_reference_output(setup)
    torch.testing.assert_close(setup.out, out_ref, rtol=1e-2, atol=1e-2)

--- a/tests/kernels/quantization/test_marlin_gemm.py
+++ b/tests/kernels/quantization/test_marlin_gemm.py
@@ -260,7 +260,7 @@ def test_gptq_marlin_repack(
    marlin_q_w_2 = ops.gptq_marlin_repack(
        q_w_gptq, sort_indices, size_k, size_n, quant_type.size_bits, is_a_8bit
    )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    torch.testing.assert_close(marlin_q_w_1, marlin_q_w_2)
@@ -308,7 +308,7 @@ def test_awq_marlin_repack(k_chunk, n_chunk, quant_type, is_a_8bit, nk_factors):
    marlin_q_w_2 = ops.awq_marlin_repack(
        q_w_awq, size_k, size_n, quant_type.size_bits, is_a_8bit
    )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    torch.testing.assert_close(marlin_q_w_1, marlin_q_w_2)
@@ -564,7 +564,7 @@ def test_marlin_gemm_subset_input():
    )
    output_ref = torch.matmul(a_input, w_ref)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    max_diff = compute_max_diff(output, output_ref)
@@ -613,7 +613,7 @@ def test_marlin_gemm_with_bias(size_m):
    )
    output_ref = torch.matmul(a_input, w_ref) + b_bias.view(1, -1)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    max_diff = compute_max_diff(output, output_ref)

--- a/tests/kernels/test_cache_kernels.py
+++ b/tests/kernels/test_cache_kernels.py
@@ -57,7 +57,7 @@ def test_gather_cache_oob():
        seq_starts,
    )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    assert True

--- a/tests/kernels/test_top_k_per_row.py
+++ b/tests/kernels/test_top_k_per_row.py
@@ -219,7 +219,7 @@ def _run_top_k_per_row_decode_test(
        top_k,
    )
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    # Run reference implementation
    torch_indices = torch.empty((num_rows, top_k), dtype=torch.int32, device="cuda")

--- a/tests/models/language/generation/test_common.py
+++ b/tests/models/language/generation/test_common.py
@@ -195,4 +195,4 @@ def test_models(
        # unit tests. On ROCm, when using AITER
        # the memory might not be deallocated completely
        # before running the next test case
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -196,7 +196,7 @@ def test_compressed_tensors_w8a8_logprobs(
    )
    if current_platform.is_rocm():
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
 def test_compressed_tensors_no_enforce_eager(vllm_runner):

--- a/tools/pre_commit/check_torch_cuda.py
+++ b/tools/pre_commit/check_torch_cuda.py
@@ -9,6 +9,7 @@ import regex as re
 # --------------------------------------------------------------------------- #
 _TORCH_CUDA_PATTERNS = [
    r"\btorch\.cuda\.empty_cache\b",
+    r"\btorch\.cuda\.synchronize\b",
 ]
 ALLOWED_FILES = {"vllm/platforms/", "vllm/device_allocator/"}

--- a/vllm/distributed/elastic_ep/elastic_execute.py
+++ b/vllm/distributed/elastic_ep/elastic_execute.py
@@ -217,7 +217,7 @@ class ElasticEPScalingExecutor:
                dp_group=standby_dp_group,
                expert_weights=model.expert_weights,
            )
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
    def broadcast_expert_mapping(self) -> None:
        standby_dp_group = get_standby_dp_group()
@@ -407,7 +407,7 @@ class ElasticEPScalingExecutor:
            reset_compile_wrapper(self.worker.model_runner.get_model())
        gc.collect()
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        torch.accelerator.empty_cache()
        unlock_workspace()
        self.worker.compile_or_warm_up_model()
@@ -446,7 +446,7 @@ class ElasticEPScalingExecutor:
            eplb_state.rearrange(rank_mapping=rank_mapping)
        # NOTE(yongji): check whether we need to synchronize here
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        # reset expert_rearrangement_step to ensure all ranks are synchronized
        eplb_state.expert_rearrangement_step = 0
        eplb_state.num_valid_physical_experts = (
@@ -491,7 +491,7 @@ class ElasticEPScalingExecutor:
            dp_group=dp_group,
            expert_weights=model.expert_weights,
        )
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
    def receive_expert_mapping(self) -> tuple[torch.Tensor, int, int]:
        dp_group = get_dp_group()

--- a/vllm/distributed/eplb/rebalance_execute.py
+++ b/vllm/distributed/eplb/rebalance_execute.py
@@ -622,7 +622,7 @@ def rearrange_expert_weights_inplace(
    # NOTE(bowen): We need this synchronize to run, but I don't know why.
    # If you figure out the reason, please let me know -- thank you!
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
    old_global_expert_indices_cpu = old_global_expert_indices.cpu().numpy()
    new_global_expert_indices_cpu = new_global_expert_indices.cpu().numpy()

--- a/vllm/model_executor/kernels/linear/mixed_precision/cutlass.py
+++ b/vllm/model_executor/kernels/linear/mixed_precision/cutlass.py
@@ -77,7 +77,7 @@ class CutlassW4A8LinearKernel(MPLinearKernel):
        def transform_w_q(x):
            assert isinstance(x, BasevLLMParameter)
            convert_packed_uint4b8_to_signed_int4_inplace(x.data)
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
            permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=0)
            x.data = ops.cutlass_encode_and_reorder_int4b(x.data.t().contiguous().t())
            return x

--- a/vllm/v1/worker/gpu/model_runner.py
+++ b/vllm/v1/worker/gpu/model_runner.py
@@ -457,7 +457,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
            else:
                self._dummy_pooler_run(hidden_states)
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        del hidden_states, sample_hidden_states
        gc.collect()
@@ -525,7 +525,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
        # to trigger JIT compilation.
        if all("FLASHINFER" in b.get_name() for b in self.attn_backends.values()):
            self._dummy_run(self.max_num_tokens, skip_attn=False)
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
    def finish_requests(self, scheduler_output: SchedulerOutput) -> None:
        finished_req_ids = scheduler_output.finished_req_ids

--- a/vllm/v1/worker/gpu/warmup.py
+++ b/vllm/v1/worker/gpu/warmup.py
@@ -102,4 +102,4 @@ def warmup_kernels(model_runner: GPUModelRunner) -> None:
    cleanup_output.finished_req_ids = set(req_ids)
    model_runner.execute_model(cleanup_output)
    model_runner.kv_connector.set_disabled(False)
-    torch.cuda.synchronize()
+    torch.accelerator.synchronize()
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -928,7 +928,7 @@ class GPUModelRunner(
    # Note: used for model runner override.
    def _sync_device(self) -> None:
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
    def _update_states(self, scheduler_output: "SchedulerOutput") -> None:
        """Update the cached states and the persistent batch with the scheduler
@@ -5345,7 +5345,7 @@ class GPUModelRunner(
                    cudagraph_runtime_mode=runtime_mode,
                )
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
            end_free_gpu_memory = torch.cuda.mem_get_info()[0]
        # Disable cudagraph capturing globally, so any unexpected cudagraph
@@ -6266,13 +6266,13 @@ class GPUModelRunner(
        group_refs = group_lora_refs[current_item_idx : current_item_idx + num_items]
        group_request_ids = {req_id for req_id, _ in group_refs}
-        torch.cuda.synchronize()
+        torch.accelerator.synchronize()
        start_time = time.perf_counter()
        try:
            yield
        finally:
-            torch.cuda.synchronize()
+            torch.accelerator.synchronize()
            elapsed = time.perf_counter() - start_time
            per_request_time = elapsed / max(len(group_request_ids), 1)

--- a/vllm/v1/worker/xpu_model_runner.py
+++ b/vllm/v1/worker/xpu_model_runner.py
@@ -29,9 +29,6 @@ class XPUModelRunner(GPUModelRunner):
        # FIXME: To be verified.
        self.cascade_attn_enabled = False
-    def _sync_device(self) -> None:
-        torch.xpu.synchronize()
 @contextmanager
 def _torch_cuda_wrapper():
@@ -42,7 +39,6 @@ def _torch_cuda_wrapper():
        torch.cuda.current_stream = torch.xpu.current_stream
        torch.cuda.stream = torch.xpu.stream
        torch.cuda.mem_get_info = torch.xpu.mem_get_info
-        torch.cuda.synchronize = torch.xpu.synchronize
        if supports_xpu_graph():
            torch.cuda.graph = torch.xpu.graph
            torch.cuda.CUDAGraph = torch.xpu.XPUGraph