Commit 53ec16a7 (unverified), authored by Kunshang Ji, committed by GitHub
Browse files

[Hardware] Replace torch.cuda.device_count/current_device/set_device API (#36145)


Signed-off-by: Kunshang Ji <jikunshang95@gmail.com>
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
parent 2e693f48
......@@ -757,7 +757,7 @@ def _run_mla_benchmark_batched(
backend_cfg = _get_backend_config(backend)
device = torch.device(configs_with_params[0][0].device)
torch.cuda.set_device(device)
torch.accelerator.set_device_index(device)
# Determine block size
config_block_size = configs_with_params[0][0].block_size
......
......@@ -443,7 +443,7 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
BenchmarkResult with timing and memory statistics
"""
device = torch.device(config.device)
torch.cuda.set_device(device)
torch.accelerator.set_device_index(device)
backend_cfg = _get_backend_config(config.backend)
......
......@@ -64,7 +64,7 @@ def bench_run(
per_out_ch: bool,
mkn: tuple[int, int, int],
):
init_workspace_manager(torch.cuda.current_device())
init_workspace_manager(torch.accelerator.current_device_index())
(m, k, n) = mkn
dtype = torch.half
......
......@@ -495,7 +495,7 @@ def main():
# Set device
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
torch.accelerator.set_device_index(device)
# Get CPU process group
cpu_group = dist.new_group(backend="gloo")
......
......@@ -392,7 +392,7 @@ def benchmark_operation(
num_op_per_cudagraph = 10
# Use vLLM's graph_capture to make tensor_model_parallel_all_reduce graph-safe
device = torch.device(f"cuda:{torch.cuda.current_device()}")
device = torch.device(f"cuda:{torch.accelerator.current_device_index()}")
with graph_capture(device=device), torch.cuda.graph(graph):
for _ in range(num_op_per_cudagraph):
operation_func(*args, **kwargs)
......@@ -984,7 +984,7 @@ def main():
world_size = int(os.environ["WORLD_SIZE"])
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
torch.accelerator.set_device_index(device)
torch.set_default_device(device)
init_distributed_environment()
......
......@@ -50,7 +50,7 @@ def bench_run(
per_out_ch: bool,
mkn: tuple[int, int, int],
):
init_workspace_manager(torch.cuda.current_device())
init_workspace_manager(torch.accelerator.current_device_index())
label = "Quant Matmul"
sub_label = (
......
......@@ -285,7 +285,7 @@ def tune_on_gpu(args_dict):
weight_shapes = args_dict["weight_shapes"]
args = args_dict["args"]
torch.cuda.set_device(gpu_id)
torch.accelerator.set_device_index(gpu_id)
print(f"Starting tuning on GPU {gpu_id} with batch sizes {batch_sizes}")
block_n = args.block_n
......@@ -334,7 +334,7 @@ def distribute_batch_sizes(batch_sizes, num_gpus):
def main(args):
print(args)
num_gpus = torch.cuda.device_count()
num_gpus = torch.accelerator.device_count()
if num_gpus == 0:
raise RuntimeError("No GPU available for tuning")
print(f"Found {num_gpus} GPUs for parallel tuning")
......
......@@ -15,7 +15,7 @@ llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", tensor_parallel_size=2)
```
!!! warning
To ensure that vLLM initializes CUDA correctly, you should avoid calling related functions (e.g. [torch.cuda.set_device][])
To ensure that vLLM initializes CUDA correctly, you should avoid calling related functions (e.g. [torch.accelerator.set_device_index][])
before initializing vLLM. Otherwise, you may run into an error like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`.
To control which devices are used, please instead set the `CUDA_VISIBLE_DEVICES` environment variable.
......
......@@ -91,8 +91,8 @@ If GPU/CPU communication cannot be established, you can use the following Python
import torch
import torch.distributed as dist
dist.init_process_group(backend="nccl")
local_rank = dist.get_rank() % torch.cuda.device_count()
torch.cuda.set_device(local_rank)
local_rank = dist.get_rank() % torch.accelerator.device_count()
torch.accelerator.set_device_index(local_rank)
data = torch.FloatTensor([1,] * 128).to("cuda")
dist.all_reduce(data, op=dist.ReduceOp.SUM)
torch.accelerator.synchronize()
......@@ -337,7 +337,7 @@ import vllm
import torch
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA device count: {torch.cuda.device_count()}")
print(f"CUDA device count: {torch.accelerator.device_count()}")
EOF
```
......
......@@ -106,7 +106,7 @@ def main():
# IPC requires the training model to be on the same GPU as the vLLM server
# The server should be started on GPU 0 with reduced memory utilization
device = "cuda:0"
torch.cuda.set_device(device)
torch.accelerator.set_device_index(device)
# Load the training model on the same GPU as the server
# Use bfloat16 to reduce memory footprint
......
......@@ -131,7 +131,7 @@ def main():
inference_world_size = get_world_size(BASE_URL)
world_size = inference_world_size + 1 # +1 for the trainer
device = f"cuda:{inference_world_size}"
torch.cuda.set_device(device)
torch.accelerator.set_device_index(device)
# Load the training model
print(f"Loading training model: {MODEL_NAME}")
......
......@@ -300,7 +300,7 @@ def async_tp_pass_on_test_model(
set_random_seed(0)
device = torch.device(f"cuda:{local_rank}")
torch.cuda.set_device(device)
torch.accelerator.set_device_index(device)
torch.set_default_device(device)
torch.set_default_dtype(dtype)
......
......@@ -262,7 +262,7 @@ def all_reduce_fusion_pass_on_test_model(
set_random_seed(0)
device = torch.device(f"cuda:{local_rank}")
torch.cuda.set_device(device)
torch.accelerator.set_device_index(device)
torch.set_default_device(device)
torch.set_default_dtype(dtype)
......
......@@ -228,7 +228,7 @@ def sequence_parallelism_pass_on_test_model(
set_random_seed(0)
device = torch.device(f"cuda:{local_rank}")
torch.cuda.set_device(device)
torch.accelerator.set_device_index(device)
torch.set_default_device(device)
torch.set_default_dtype(dtype)
......
......@@ -428,7 +428,7 @@ class HfRunner:
)
# don't put this import at the top level
# it will call torch.cuda.device_count()
# it will call torch.accelerator.device_count()
from transformers import AutoProcessor
self.processor = AutoProcessor.from_pretrained(
......@@ -1535,7 +1535,7 @@ def clean_gpu_memory_between_tests():
from tests.utils import wait_for_gpu_memory_to_clear
num_gpus = torch.cuda.device_count()
num_gpus = torch.accelerator.device_count()
if num_gpus > 0:
try:
wait_for_gpu_memory_to_clear(
......
......@@ -14,7 +14,7 @@ import torch # noqa: E402
from vllm.platforms import current_platform # noqa: F401, E402
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
count = torch.cuda.device_count()
count = torch.accelerator.device_count()
if count == 0:
sys.exit(0) # Skip: no GPUs available
......
......@@ -42,7 +42,7 @@ def set_env_vars_and_device(env: dict[str, str]) -> None:
update_environment_variables(env)
local_rank = os.environ["LOCAL_RANK"]
device = torch.device(f"cuda:{local_rank}")
torch.cuda.set_device(device)
torch.accelerator.set_device_index(device)
# Create a minimal vllm config for init_distributed_environment
vllm_config = VllmConfig()
......
......@@ -43,7 +43,7 @@ def all_reduce_test_worker(
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
torch.accelerator.set_device_index(device)
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
num_elements = 8
all_tensors = [
......@@ -69,7 +69,7 @@ def reduce_scatter_test_worker(
# they will be able to set the device to the correct GPU
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
torch.accelerator.set_device_index(device)
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
num_elements = 8
......@@ -100,7 +100,7 @@ def all_gather_test_worker(
# they will be able to set the device to the correct GPU
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
torch.accelerator.set_device_index(device)
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
num_dimensions = 3
tensor_size = list(range(2, num_dimensions + 2))
......@@ -134,7 +134,7 @@ def broadcast_tensor_dict_test_worker(
# they will be able to set the device to the correct GPU
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
torch.accelerator.set_device_index(device)
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
test_dict = {
# device tensor
......@@ -171,7 +171,7 @@ def send_recv_tensor_dict_test_worker(
):
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
torch.accelerator.set_device_index(device)
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
test_dict = {
......@@ -317,7 +317,7 @@ def send_recv_test_worker(
):
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
torch.accelerator.set_device_index(device)
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
size = 64
......
......@@ -35,7 +35,7 @@ def graph_allreduce(
m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
m.delenv("HIP_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
torch.accelerator.set_device_index(device)
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
ensure_model_parallel_initialized(tp_size, pp_size)
group = get_tp_group().device_group
......@@ -62,12 +62,10 @@ def graph_allreduce(
for dtype in [torch.float32, torch.float16, torch.bfloat16]:
with graph_capture(device=device) as graph_capture_context:
# use integers so result matches NCCL exactly
inp1 = torch.randint(
1, 16, (sz,), dtype=dtype, device=torch.cuda.current_device()
)
inp2 = torch.randint(
1, 16, (sz,), dtype=dtype, device=torch.cuda.current_device()
)
device_idx = torch.accelerator.current_device_index()
inp1 = torch.randint(1, 16, (sz,), dtype=dtype, device=device_idx)
inp2 = torch.randint(1, 16, (sz,), dtype=dtype, device=device_idx)
torch.accelerator.synchronize()
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph, stream=graph_capture_context.stream):
......@@ -95,7 +93,7 @@ def eager_allreduce(
m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
m.delenv("HIP_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
torch.accelerator.set_device_index(device)
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
# we use the first group to communicate once
......@@ -129,6 +127,6 @@ def test_custom_allreduce(
test_target,
):
world_size = tp_size * pipeline_parallel_size
if world_size > torch.cuda.device_count():
if world_size > torch.accelerator.device_count():
pytest.skip("Not enough GPUs to run the test.")
multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size, test_target)
......@@ -442,7 +442,7 @@ def test_rearrange_expert_weights_with_redundancy(
):
"""Test the functionality of rearranging expert weights with redundancy."""
if torch.cuda.device_count() < world_size:
if torch.accelerator.device_count() < world_size:
pytest.skip(f"Need at least {world_size} GPUs to run the test")
distributed_run(
_test_rearrange_expert_weights_with_redundancy,
......@@ -528,7 +528,7 @@ def test_async_transfer_layer_without_mtp(
):
"""Exercise async EPLB transfer path without MTP/spec decode."""
if torch.cuda.device_count() < world_size:
if torch.accelerator.device_count() < world_size:
pytest.skip(f"Need at least {world_size} GPUs to run the test")
distributed_run(
......@@ -547,7 +547,7 @@ def test_rearrange_expert_weights_no_change(world_size):
unchanged.
"""
if torch.cuda.device_count() < world_size:
if torch.accelerator.device_count() < world_size:
pytest.skip(f"Need at least {world_size} GPUs to run the test")
distributed_run(_test_rearrange_expert_weights_no_change, world_size)
......@@ -623,6 +623,6 @@ def _test_rearrange_expert_weights_profile_mode(env, world_size) -> None:
def test_rearrange_expert_weights_profile_mode(world_size):
"""Test profile mode (should not copy actual weights)"""
if torch.cuda.device_count() < world_size:
if torch.accelerator.device_count() < world_size:
pytest.skip(f"Need at least {world_size} GPUs to run the test")
distributed_run(_test_rearrange_expert_weights_profile_mode, world_size)
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment