[Hardware] Replace torch.cuda.device_count/current_device/set_device API (#36145)

Signed-off-by: Kunshang Ji <jikunshang95@gmail.com> Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>

[Hardware] Replace torch.cuda.device_count/current_device/set_device API (#36145)
Signed-off-by: Kunshang Ji <jikunshang95@gmail.com> Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
53ec16a7 · Kunshang Ji · GitHub · 2e693f48 · 53ec16a7 · 53ec16a7
Unverified Commit 53ec16a7 authored Mar 12, 2026 by Kunshang Ji Committed by GitHub Mar 12, 2026
20 changed files
--- a/benchmarks/attention_benchmarks/mla_runner.py
+++ b/benchmarks/attention_benchmarks/mla_runner.py
@@ -757,7 +757,7 @@ def _run_mla_benchmark_batched(
    backend_cfg = _get_backend_config(backend)
    device = torch.device(configs_with_params[0][0].device)
-    torch.cuda.set_device(device)
+    torch.accelerator.set_device_index(device)
    # Determine block size
    config_block_size = configs_with_params[0][0].block_size

--- a/benchmarks/attention_benchmarks/runner.py
+++ b/benchmarks/attention_benchmarks/runner.py
@@ -443,7 +443,7 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
        BenchmarkResult with timing and memory statistics
    """
    device = torch.device(config.device)
-    torch.cuda.set_device(device)
+    torch.accelerator.set_device_index(device)
    backend_cfg = _get_backend_config(config.backend)

--- a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
+++ b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py
@@ -64,7 +64,7 @@ def bench_run(
    per_out_ch: bool,
    mkn: tuple[int, int, int],
 ):
-    init_workspace_manager(torch.cuda.current_device())
+    init_workspace_manager(torch.accelerator.current_device_index())
    (m, k, n) = mkn
    dtype = torch.half

--- a/benchmarks/kernels/benchmark_device_communicators.py
+++ b/benchmarks/kernels/benchmark_device_communicators.py
@@ -495,7 +495,7 @@ def main():
    # Set device
    device = torch.device(f"cuda:{rank}")
-    torch.cuda.set_device(device)
+    torch.accelerator.set_device_index(device)
    # Get CPU process group
    cpu_group = dist.new_group(backend="gloo")

--- a/benchmarks/kernels/benchmark_fused_collective.py
+++ b/benchmarks/kernels/benchmark_fused_collective.py
@@ -392,7 +392,7 @@ def benchmark_operation(
    num_op_per_cudagraph = 10
    # Use vLLM's graph_capture to make tensor_model_parallel_all_reduce graph-safe
-    device = torch.device(f"cuda:{torch.cuda.current_device()}")
+    device = torch.device(f"cuda:{torch.accelerator.current_device_index()}")
    with graph_capture(device=device), torch.cuda.graph(graph):
        for _ in range(num_op_per_cudagraph):
            operation_func(*args, **kwargs)
@@ -984,7 +984,7 @@ def main():
    world_size = int(os.environ["WORLD_SIZE"])
    device = torch.device(f"cuda:{rank}")
-    torch.cuda.set_device(device)
+    torch.accelerator.set_device_index(device)
    torch.set_default_device(device)
    init_distributed_environment()

--- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
+++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
@@ -50,7 +50,7 @@ def bench_run(
    per_out_ch: bool,
    mkn: tuple[int, int, int],
 ):
-    init_workspace_manager(torch.cuda.current_device())
+    init_workspace_manager(torch.accelerator.current_device_index())
    label = "Quant Matmul"
    sub_label = (

--- a/benchmarks/kernels/benchmark_w8a8_block_fp8.py
+++ b/benchmarks/kernels/benchmark_w8a8_block_fp8.py
@@ -285,7 +285,7 @@ def tune_on_gpu(args_dict):
    weight_shapes = args_dict["weight_shapes"]
    args = args_dict["args"]
-    torch.cuda.set_device(gpu_id)
+    torch.accelerator.set_device_index(gpu_id)
    print(f"Starting tuning on GPU {gpu_id} with batch sizes {batch_sizes}")
    block_n = args.block_n
@@ -334,7 +334,7 @@ def distribute_batch_sizes(batch_sizes, num_gpus):
 def main(args):
    print(args)
-    num_gpus = torch.cuda.device_count()
+    num_gpus = torch.accelerator.device_count()
    if num_gpus == 0:
        raise RuntimeError("No GPU available for tuning")
    print(f"Found {num_gpus} GPUs for parallel tuning")

--- a/docs/configuration/conserving_memory.md
+++ b/docs/configuration/conserving_memory.md
@@ -15,7 +15,7 @@ llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", tensor_parallel_size=2)
 ```
 !!! warning
-    To ensure that vLLM initializes CUDA correctly, you should avoid calling related functions (e.g. [torch.cuda.set_device][])
+    To ensure that vLLM initializes CUDA correctly, you should avoid calling related functions (e.g. [torch.accelerator.set_device_index][])
    before initializing vLLM. Otherwise, you may run into an error like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`.
    To control which devices are used, please instead set the `CUDA_VISIBLE_DEVICES` environment variable.

--- a/docs/usage/troubleshooting.md
+++ b/docs/usage/troubleshooting.md
@@ -91,8 +91,8 @@ If GPU/CPU communication cannot be established, you can use the following Python
    import torch
    import torch.distributed as dist
    dist.init_process_group(backend="nccl")
-    local_rank = dist.get_rank() % torch.cuda.device_count()
+    local_rank = dist.get_rank() % torch.accelerator.device_count()
-    torch.cuda.set_device(local_rank)
+    torch.accelerator.set_device_index(local_rank)
    data = torch.FloatTensor([1,] * 128).to("cuda")
    dist.all_reduce(data, op=dist.ReduceOp.SUM)
    torch.accelerator.synchronize()
@@ -337,7 +337,7 @@ import vllm
 import torch
 print(f"CUDA available: {torch.cuda.is_available()}")
-print(f"CUDA device count: {torch.cuda.device_count()}")
+print(f"CUDA device count: {torch.accelerator.device_count()}")
 EOF
 ```

--- a/examples/online_serving/new_weight_syncing/rlhf_http_ipc.py
+++ b/examples/online_serving/new_weight_syncing/rlhf_http_ipc.py
@@ -106,7 +106,7 @@ def main():
    # IPC requires the training model to be on the same GPU as the vLLM server
    # The server should be started on GPU 0 with reduced memory utilization
    device = "cuda:0"
-    torch.cuda.set_device(device)
+    torch.accelerator.set_device_index(device)
    # Load the training model on the same GPU as the server
    # Use bfloat16 to reduce memory footprint

--- a/examples/online_serving/new_weight_syncing/rlhf_http_nccl.py
+++ b/examples/online_serving/new_weight_syncing/rlhf_http_nccl.py
@@ -131,7 +131,7 @@ def main():
    inference_world_size = get_world_size(BASE_URL)
    world_size = inference_world_size + 1  # +1 for the trainer
    device = f"cuda:{inference_world_size}"
-    torch.cuda.set_device(device)
+    torch.accelerator.set_device_index(device)
    # Load the training model
    print(f"Loading training model: {MODEL_NAME}")

--- a/tests/compile/passes/distributed/test_async_tp.py
+++ b/tests/compile/passes/distributed/test_async_tp.py
@@ -300,7 +300,7 @@ def async_tp_pass_on_test_model(
    set_random_seed(0)
    device = torch.device(f"cuda:{local_rank}")
-    torch.cuda.set_device(device)
+    torch.accelerator.set_device_index(device)
    torch.set_default_device(device)
    torch.set_default_dtype(dtype)

--- a/tests/compile/passes/distributed/test_fusion_all_reduce.py
+++ b/tests/compile/passes/distributed/test_fusion_all_reduce.py
@@ -262,7 +262,7 @@ def all_reduce_fusion_pass_on_test_model(
    set_random_seed(0)
    device = torch.device(f"cuda:{local_rank}")
-    torch.cuda.set_device(device)
+    torch.accelerator.set_device_index(device)
    torch.set_default_device(device)
    torch.set_default_dtype(dtype)

--- a/tests/compile/passes/distributed/test_sequence_parallelism.py
+++ b/tests/compile/passes/distributed/test_sequence_parallelism.py
@@ -228,7 +228,7 @@ def sequence_parallelism_pass_on_test_model(
    set_random_seed(0)
    device = torch.device(f"cuda:{local_rank}")
-    torch.cuda.set_device(device)
+    torch.accelerator.set_device_index(device)
    torch.set_default_device(device)
    torch.set_default_dtype(dtype)

--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -428,7 +428,7 @@ class HfRunner:
            )
        # don't put this import at the top level
-        # it will call torch.cuda.device_count()
+        # it will call torch.accelerator.device_count()
        from transformers import AutoProcessor
        self.processor = AutoProcessor.from_pretrained(
@@ -1535,7 +1535,7 @@ def clean_gpu_memory_between_tests():
    from tests.utils import wait_for_gpu_memory_to_clear
-    num_gpus = torch.cuda.device_count()
+    num_gpus = torch.accelerator.device_count()
    if num_gpus > 0:
        try:
            wait_for_gpu_memory_to_clear(

--- a/tests/cuda/scripts/check_device_count_respects_env.py
+++ b/tests/cuda/scripts/check_device_count_respects_env.py
@@ -14,7 +14,7 @@ import torch  # noqa: E402
 from vllm.platforms import current_platform  # noqa: F401, E402
 os.environ["CUDA_VISIBLE_DEVICES"] = "0"
-count = torch.cuda.device_count()
+count = torch.accelerator.device_count()
 if count == 0:
    sys.exit(0)  # Skip: no GPUs available

--- a/tests/distributed/eplb_utils.py
+++ b/tests/distributed/eplb_utils.py
@@ -42,7 +42,7 @@ def set_env_vars_and_device(env: dict[str, str]) -> None:
    update_environment_variables(env)
    local_rank = os.environ["LOCAL_RANK"]
    device = torch.device(f"cuda:{local_rank}")
-    torch.cuda.set_device(device)
+    torch.accelerator.set_device_index(device)
    # Create a minimal vllm config for init_distributed_environment
    vllm_config = VllmConfig()

--- a/tests/distributed/test_comm_ops.py
+++ b/tests/distributed/test_comm_ops.py
@@ -43,7 +43,7 @@ def all_reduce_test_worker(
    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
    device = torch.device(f"cuda:{rank}")
-    torch.cuda.set_device(device)
+    torch.accelerator.set_device_index(device)
    init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
    num_elements = 8
    all_tensors = [
@@ -69,7 +69,7 @@ def reduce_scatter_test_worker(
    # they will be able to set the device to the correct GPU
    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
    device = torch.device(f"cuda:{rank}")
-    torch.cuda.set_device(device)
+    torch.accelerator.set_device_index(device)
    init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
    num_elements = 8
@@ -100,7 +100,7 @@ def all_gather_test_worker(
    # they will be able to set the device to the correct GPU
    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
    device = torch.device(f"cuda:{rank}")
-    torch.cuda.set_device(device)
+    torch.accelerator.set_device_index(device)
    init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
    num_dimensions = 3
    tensor_size = list(range(2, num_dimensions + 2))
@@ -134,7 +134,7 @@ def broadcast_tensor_dict_test_worker(
    # they will be able to set the device to the correct GPU
    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
    device = torch.device(f"cuda:{rank}")
-    torch.cuda.set_device(device)
+    torch.accelerator.set_device_index(device)
    init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
    test_dict = {
        # device tensor
@@ -171,7 +171,7 @@ def send_recv_tensor_dict_test_worker(
 ):
    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
    device = torch.device(f"cuda:{rank}")
-    torch.cuda.set_device(device)
+    torch.accelerator.set_device_index(device)
    init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
    test_dict = {
@@ -317,7 +317,7 @@ def send_recv_test_worker(
 ):
    monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
    device = torch.device(f"cuda:{rank}")
-    torch.cuda.set_device(device)
+    torch.accelerator.set_device_index(device)
    init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
    size = 64

--- a/tests/distributed/test_custom_all_reduce.py
+++ b/tests/distributed/test_custom_all_reduce.py
@@ -35,7 +35,7 @@ def graph_allreduce(
        m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
        m.delenv("HIP_VISIBLE_DEVICES", raising=False)
        device = torch.device(f"cuda:{rank}")
-        torch.cuda.set_device(device)
+        torch.accelerator.set_device_index(device)
        init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
        ensure_model_parallel_initialized(tp_size, pp_size)
        group = get_tp_group().device_group
@@ -62,12 +62,10 @@ def graph_allreduce(
            for dtype in [torch.float32, torch.float16, torch.bfloat16]:
                with graph_capture(device=device) as graph_capture_context:
                    # use integers so result matches NCCL exactly
-                    inp1 = torch.randint(
+                    device_idx = torch.accelerator.current_device_index()
-                        1, 16, (sz,), dtype=dtype, device=torch.cuda.current_device()
+                    inp1 = torch.randint(1, 16, (sz,), dtype=dtype, device=device_idx)
-                    )
+                    inp2 = torch.randint(1, 16, (sz,), dtype=dtype, device=device_idx)
-                    inp2 = torch.randint(
-                        1, 16, (sz,), dtype=dtype, device=torch.cuda.current_device()
-                    )
                    torch.accelerator.synchronize()
                    graph = torch.cuda.CUDAGraph()
                    with torch.cuda.graph(graph, stream=graph_capture_context.stream):
@@ -95,7 +93,7 @@ def eager_allreduce(
        m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
        m.delenv("HIP_VISIBLE_DEVICES", raising=False)
        device = torch.device(f"cuda:{rank}")
-        torch.cuda.set_device(device)
+        torch.accelerator.set_device_index(device)
        init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
        # we use the first group to communicate once
@@ -129,6 +127,6 @@ def test_custom_allreduce(
    test_target,
 ):
    world_size = tp_size * pipeline_parallel_size
-    if world_size > torch.cuda.device_count():
+    if world_size > torch.accelerator.device_count():
        pytest.skip("Not enough GPUs to run the test.")
    multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size, test_target)
--- a/tests/distributed/test_eplb_execute.py
+++ b/tests/distributed/test_eplb_execute.py
@@ -442,7 +442,7 @@ def test_rearrange_expert_weights_with_redundancy(
 ):
    """Test the functionality of rearranging expert weights with redundancy."""
-    if torch.cuda.device_count() < world_size:
+    if torch.accelerator.device_count() < world_size:
        pytest.skip(f"Need at least {world_size} GPUs to run the test")
    distributed_run(
        _test_rearrange_expert_weights_with_redundancy,
@@ -528,7 +528,7 @@ def test_async_transfer_layer_without_mtp(
 ):
    """Exercise async EPLB transfer path without MTP/spec decode."""
-    if torch.cuda.device_count() < world_size:
+    if torch.accelerator.device_count() < world_size:
        pytest.skip(f"Need at least {world_size} GPUs to run the test")
    distributed_run(
@@ -547,7 +547,7 @@ def test_rearrange_expert_weights_no_change(world_size):
    unchanged.
    """
-    if torch.cuda.device_count() < world_size:
+    if torch.accelerator.device_count() < world_size:
        pytest.skip(f"Need at least {world_size} GPUs to run the test")
    distributed_run(_test_rearrange_expert_weights_no_change, world_size)
@@ -623,6 +623,6 @@ def _test_rearrange_expert_weights_profile_mode(env, world_size) -> None:
 def test_rearrange_expert_weights_profile_mode(world_size):
    """Test profile mode (should not copy actual weights)"""
-    if torch.cuda.device_count() < world_size:
+    if torch.accelerator.device_count() < world_size:
        pytest.skip(f"Need at least {world_size} GPUs to run the test")
    distributed_run(_test_rearrange_expert_weights_profile_mode, world_size)