Unverified commit 53ec16a7, authored by Kunshang Ji and committed by GitHub
Browse files

[Hardware] Replace torch.cuda.device_count/current_device/set_device API (#36145)


Signed-off-by: Kunshang Ji <jikunshang95@gmail.com>
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
parent 2e693f48
...@@ -757,7 +757,7 @@ def _run_mla_benchmark_batched( ...@@ -757,7 +757,7 @@ def _run_mla_benchmark_batched(
backend_cfg = _get_backend_config(backend) backend_cfg = _get_backend_config(backend)
device = torch.device(configs_with_params[0][0].device) device = torch.device(configs_with_params[0][0].device)
torch.cuda.set_device(device) torch.accelerator.set_device_index(device)
# Determine block size # Determine block size
config_block_size = configs_with_params[0][0].block_size config_block_size = configs_with_params[0][0].block_size
......
...@@ -443,7 +443,7 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult: ...@@ -443,7 +443,7 @@ def run_attention_benchmark(config: BenchmarkConfig) -> BenchmarkResult:
BenchmarkResult with timing and memory statistics BenchmarkResult with timing and memory statistics
""" """
device = torch.device(config.device) device = torch.device(config.device)
torch.cuda.set_device(device) torch.accelerator.set_device_index(device)
backend_cfg = _get_backend_config(config.backend) backend_cfg = _get_backend_config(config.backend)
......
...@@ -64,7 +64,7 @@ def bench_run( ...@@ -64,7 +64,7 @@ def bench_run(
per_out_ch: bool, per_out_ch: bool,
mkn: tuple[int, int, int], mkn: tuple[int, int, int],
): ):
init_workspace_manager(torch.cuda.current_device()) init_workspace_manager(torch.accelerator.current_device_index())
(m, k, n) = mkn (m, k, n) = mkn
dtype = torch.half dtype = torch.half
......
...@@ -495,7 +495,7 @@ def main(): ...@@ -495,7 +495,7 @@ def main():
# Set device # Set device
device = torch.device(f"cuda:{rank}") device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device) torch.accelerator.set_device_index(device)
# Get CPU process group # Get CPU process group
cpu_group = dist.new_group(backend="gloo") cpu_group = dist.new_group(backend="gloo")
......
...@@ -392,7 +392,7 @@ def benchmark_operation( ...@@ -392,7 +392,7 @@ def benchmark_operation(
num_op_per_cudagraph = 10 num_op_per_cudagraph = 10
# Use vLLM's graph_capture to make tensor_model_parallel_all_reduce graph-safe # Use vLLM's graph_capture to make tensor_model_parallel_all_reduce graph-safe
device = torch.device(f"cuda:{torch.cuda.current_device()}") device = torch.device(f"cuda:{torch.accelerator.current_device_index()}")
with graph_capture(device=device), torch.cuda.graph(graph): with graph_capture(device=device), torch.cuda.graph(graph):
for _ in range(num_op_per_cudagraph): for _ in range(num_op_per_cudagraph):
operation_func(*args, **kwargs) operation_func(*args, **kwargs)
...@@ -984,7 +984,7 @@ def main(): ...@@ -984,7 +984,7 @@ def main():
world_size = int(os.environ["WORLD_SIZE"]) world_size = int(os.environ["WORLD_SIZE"])
device = torch.device(f"cuda:{rank}") device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device) torch.accelerator.set_device_index(device)
torch.set_default_device(device) torch.set_default_device(device)
init_distributed_environment() init_distributed_environment()
......
...@@ -50,7 +50,7 @@ def bench_run( ...@@ -50,7 +50,7 @@ def bench_run(
per_out_ch: bool, per_out_ch: bool,
mkn: tuple[int, int, int], mkn: tuple[int, int, int],
): ):
init_workspace_manager(torch.cuda.current_device()) init_workspace_manager(torch.accelerator.current_device_index())
label = "Quant Matmul" label = "Quant Matmul"
sub_label = ( sub_label = (
......
...@@ -285,7 +285,7 @@ def tune_on_gpu(args_dict): ...@@ -285,7 +285,7 @@ def tune_on_gpu(args_dict):
weight_shapes = args_dict["weight_shapes"] weight_shapes = args_dict["weight_shapes"]
args = args_dict["args"] args = args_dict["args"]
torch.cuda.set_device(gpu_id) torch.accelerator.set_device_index(gpu_id)
print(f"Starting tuning on GPU {gpu_id} with batch sizes {batch_sizes}") print(f"Starting tuning on GPU {gpu_id} with batch sizes {batch_sizes}")
block_n = args.block_n block_n = args.block_n
...@@ -334,7 +334,7 @@ def distribute_batch_sizes(batch_sizes, num_gpus): ...@@ -334,7 +334,7 @@ def distribute_batch_sizes(batch_sizes, num_gpus):
def main(args): def main(args):
print(args) print(args)
num_gpus = torch.cuda.device_count() num_gpus = torch.accelerator.device_count()
if num_gpus == 0: if num_gpus == 0:
raise RuntimeError("No GPU available for tuning") raise RuntimeError("No GPU available for tuning")
print(f"Found {num_gpus} GPUs for parallel tuning") print(f"Found {num_gpus} GPUs for parallel tuning")
......
...@@ -15,7 +15,7 @@ llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", tensor_parallel_size=2) ...@@ -15,7 +15,7 @@ llm = LLM(model="ibm-granite/granite-3.1-8b-instruct", tensor_parallel_size=2)
``` ```
!!! warning !!! warning
To ensure that vLLM initializes CUDA correctly, you should avoid calling related functions (e.g. [torch.cuda.set_device][]) To ensure that vLLM initializes CUDA correctly, you should avoid calling related functions (e.g. [torch.accelerator.set_device_index][])
before initializing vLLM. Otherwise, you may run into an error like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`. before initializing vLLM. Otherwise, you may run into an error like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`.
To control which devices are used, please instead set the `CUDA_VISIBLE_DEVICES` environment variable. To control which devices are used, please instead set the `CUDA_VISIBLE_DEVICES` environment variable.
......
...@@ -91,8 +91,8 @@ If GPU/CPU communication cannot be established, you can use the following Python ...@@ -91,8 +91,8 @@ If GPU/CPU communication cannot be established, you can use the following Python
import torch import torch
import torch.distributed as dist import torch.distributed as dist
dist.init_process_group(backend="nccl") dist.init_process_group(backend="nccl")
local_rank = dist.get_rank() % torch.cuda.device_count() local_rank = dist.get_rank() % torch.accelerator.device_count()
torch.cuda.set_device(local_rank) torch.accelerator.set_device_index(local_rank)
data = torch.FloatTensor([1,] * 128).to("cuda") data = torch.FloatTensor([1,] * 128).to("cuda")
dist.all_reduce(data, op=dist.ReduceOp.SUM) dist.all_reduce(data, op=dist.ReduceOp.SUM)
torch.accelerator.synchronize() torch.accelerator.synchronize()
...@@ -337,7 +337,7 @@ import vllm ...@@ -337,7 +337,7 @@ import vllm
import torch import torch
print(f"CUDA available: {torch.cuda.is_available()}") print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA device count: {torch.cuda.device_count()}") print(f"CUDA device count: {torch.accelerator.device_count()}")
EOF EOF
``` ```
......
...@@ -106,7 +106,7 @@ def main(): ...@@ -106,7 +106,7 @@ def main():
# IPC requires the training model to be on the same GPU as the vLLM server # IPC requires the training model to be on the same GPU as the vLLM server
# The server should be started on GPU 0 with reduced memory utilization # The server should be started on GPU 0 with reduced memory utilization
device = "cuda:0" device = "cuda:0"
torch.cuda.set_device(device) torch.accelerator.set_device_index(device)
# Load the training model on the same GPU as the server # Load the training model on the same GPU as the server
# Use bfloat16 to reduce memory footprint # Use bfloat16 to reduce memory footprint
......
...@@ -131,7 +131,7 @@ def main(): ...@@ -131,7 +131,7 @@ def main():
inference_world_size = get_world_size(BASE_URL) inference_world_size = get_world_size(BASE_URL)
world_size = inference_world_size + 1 # +1 for the trainer world_size = inference_world_size + 1 # +1 for the trainer
device = f"cuda:{inference_world_size}" device = f"cuda:{inference_world_size}"
torch.cuda.set_device(device) torch.accelerator.set_device_index(device)
# Load the training model # Load the training model
print(f"Loading training model: {MODEL_NAME}") print(f"Loading training model: {MODEL_NAME}")
......
...@@ -300,7 +300,7 @@ def async_tp_pass_on_test_model( ...@@ -300,7 +300,7 @@ def async_tp_pass_on_test_model(
set_random_seed(0) set_random_seed(0)
device = torch.device(f"cuda:{local_rank}") device = torch.device(f"cuda:{local_rank}")
torch.cuda.set_device(device) torch.accelerator.set_device_index(device)
torch.set_default_device(device) torch.set_default_device(device)
torch.set_default_dtype(dtype) torch.set_default_dtype(dtype)
......
...@@ -262,7 +262,7 @@ def all_reduce_fusion_pass_on_test_model( ...@@ -262,7 +262,7 @@ def all_reduce_fusion_pass_on_test_model(
set_random_seed(0) set_random_seed(0)
device = torch.device(f"cuda:{local_rank}") device = torch.device(f"cuda:{local_rank}")
torch.cuda.set_device(device) torch.accelerator.set_device_index(device)
torch.set_default_device(device) torch.set_default_device(device)
torch.set_default_dtype(dtype) torch.set_default_dtype(dtype)
......
...@@ -228,7 +228,7 @@ def sequence_parallelism_pass_on_test_model( ...@@ -228,7 +228,7 @@ def sequence_parallelism_pass_on_test_model(
set_random_seed(0) set_random_seed(0)
device = torch.device(f"cuda:{local_rank}") device = torch.device(f"cuda:{local_rank}")
torch.cuda.set_device(device) torch.accelerator.set_device_index(device)
torch.set_default_device(device) torch.set_default_device(device)
torch.set_default_dtype(dtype) torch.set_default_dtype(dtype)
......
...@@ -428,7 +428,7 @@ class HfRunner: ...@@ -428,7 +428,7 @@ class HfRunner:
) )
# don't put this import at the top level # don't put this import at the top level
# it will call torch.cuda.device_count() # it will call torch.accelerator.device_count()
from transformers import AutoProcessor from transformers import AutoProcessor
self.processor = AutoProcessor.from_pretrained( self.processor = AutoProcessor.from_pretrained(
...@@ -1535,7 +1535,7 @@ def clean_gpu_memory_between_tests(): ...@@ -1535,7 +1535,7 @@ def clean_gpu_memory_between_tests():
from tests.utils import wait_for_gpu_memory_to_clear from tests.utils import wait_for_gpu_memory_to_clear
num_gpus = torch.cuda.device_count() num_gpus = torch.accelerator.device_count()
if num_gpus > 0: if num_gpus > 0:
try: try:
wait_for_gpu_memory_to_clear( wait_for_gpu_memory_to_clear(
......
...@@ -14,7 +14,7 @@ import torch # noqa: E402 ...@@ -14,7 +14,7 @@ import torch # noqa: E402
from vllm.platforms import current_platform # noqa: F401, E402 from vllm.platforms import current_platform # noqa: F401, E402
os.environ["CUDA_VISIBLE_DEVICES"] = "0" os.environ["CUDA_VISIBLE_DEVICES"] = "0"
count = torch.cuda.device_count() count = torch.accelerator.device_count()
if count == 0: if count == 0:
sys.exit(0) # Skip: no GPUs available sys.exit(0) # Skip: no GPUs available
......
...@@ -42,7 +42,7 @@ def set_env_vars_and_device(env: dict[str, str]) -> None: ...@@ -42,7 +42,7 @@ def set_env_vars_and_device(env: dict[str, str]) -> None:
update_environment_variables(env) update_environment_variables(env)
local_rank = os.environ["LOCAL_RANK"] local_rank = os.environ["LOCAL_RANK"]
device = torch.device(f"cuda:{local_rank}") device = torch.device(f"cuda:{local_rank}")
torch.cuda.set_device(device) torch.accelerator.set_device_index(device)
# Create a minimal vllm config for init_distributed_environment # Create a minimal vllm config for init_distributed_environment
vllm_config = VllmConfig() vllm_config = VllmConfig()
......
...@@ -43,7 +43,7 @@ def all_reduce_test_worker( ...@@ -43,7 +43,7 @@ def all_reduce_test_worker(
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}") device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device) torch.accelerator.set_device_index(device)
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port) init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
num_elements = 8 num_elements = 8
all_tensors = [ all_tensors = [
...@@ -69,7 +69,7 @@ def reduce_scatter_test_worker( ...@@ -69,7 +69,7 @@ def reduce_scatter_test_worker(
# they will be able to set the device to the correct GPU # they will be able to set the device to the correct GPU
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}") device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device) torch.accelerator.set_device_index(device)
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port) init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
num_elements = 8 num_elements = 8
...@@ -100,7 +100,7 @@ def all_gather_test_worker( ...@@ -100,7 +100,7 @@ def all_gather_test_worker(
# they will be able to set the device to the correct GPU # they will be able to set the device to the correct GPU
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}") device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device) torch.accelerator.set_device_index(device)
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port) init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
num_dimensions = 3 num_dimensions = 3
tensor_size = list(range(2, num_dimensions + 2)) tensor_size = list(range(2, num_dimensions + 2))
...@@ -134,7 +134,7 @@ def broadcast_tensor_dict_test_worker( ...@@ -134,7 +134,7 @@ def broadcast_tensor_dict_test_worker(
# they will be able to set the device to the correct GPU # they will be able to set the device to the correct GPU
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}") device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device) torch.accelerator.set_device_index(device)
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port) init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
test_dict = { test_dict = {
# device tensor # device tensor
...@@ -171,7 +171,7 @@ def send_recv_tensor_dict_test_worker( ...@@ -171,7 +171,7 @@ def send_recv_tensor_dict_test_worker(
): ):
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}") device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device) torch.accelerator.set_device_index(device)
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port) init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
test_dict = { test_dict = {
...@@ -317,7 +317,7 @@ def send_recv_test_worker( ...@@ -317,7 +317,7 @@ def send_recv_test_worker(
): ):
monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False) monkeypatch.delenv("CUDA_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}") device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device) torch.accelerator.set_device_index(device)
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port) init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
size = 64 size = 64
......
...@@ -35,7 +35,7 @@ def graph_allreduce( ...@@ -35,7 +35,7 @@ def graph_allreduce(
m.delenv("CUDA_VISIBLE_DEVICES", raising=False) m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
m.delenv("HIP_VISIBLE_DEVICES", raising=False) m.delenv("HIP_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}") device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device) torch.accelerator.set_device_index(device)
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port) init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
ensure_model_parallel_initialized(tp_size, pp_size) ensure_model_parallel_initialized(tp_size, pp_size)
group = get_tp_group().device_group group = get_tp_group().device_group
...@@ -62,12 +62,10 @@ def graph_allreduce( ...@@ -62,12 +62,10 @@ def graph_allreduce(
for dtype in [torch.float32, torch.float16, torch.bfloat16]: for dtype in [torch.float32, torch.float16, torch.bfloat16]:
with graph_capture(device=device) as graph_capture_context: with graph_capture(device=device) as graph_capture_context:
# use integers so result matches NCCL exactly # use integers so result matches NCCL exactly
inp1 = torch.randint( device_idx = torch.accelerator.current_device_index()
1, 16, (sz,), dtype=dtype, device=torch.cuda.current_device() inp1 = torch.randint(1, 16, (sz,), dtype=dtype, device=device_idx)
) inp2 = torch.randint(1, 16, (sz,), dtype=dtype, device=device_idx)
inp2 = torch.randint(
1, 16, (sz,), dtype=dtype, device=torch.cuda.current_device()
)
torch.accelerator.synchronize() torch.accelerator.synchronize()
graph = torch.cuda.CUDAGraph() graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph, stream=graph_capture_context.stream): with torch.cuda.graph(graph, stream=graph_capture_context.stream):
...@@ -95,7 +93,7 @@ def eager_allreduce( ...@@ -95,7 +93,7 @@ def eager_allreduce(
m.delenv("CUDA_VISIBLE_DEVICES", raising=False) m.delenv("CUDA_VISIBLE_DEVICES", raising=False)
m.delenv("HIP_VISIBLE_DEVICES", raising=False) m.delenv("HIP_VISIBLE_DEVICES", raising=False)
device = torch.device(f"cuda:{rank}") device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device) torch.accelerator.set_device_index(device)
init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port) init_test_distributed_environment(tp_size, pp_size, rank, distributed_init_port)
# we use the first group to communicate once # we use the first group to communicate once
...@@ -129,6 +127,6 @@ def test_custom_allreduce( ...@@ -129,6 +127,6 @@ def test_custom_allreduce(
test_target, test_target,
): ):
world_size = tp_size * pipeline_parallel_size world_size = tp_size * pipeline_parallel_size
if world_size > torch.cuda.device_count(): if world_size > torch.accelerator.device_count():
pytest.skip("Not enough GPUs to run the test.") pytest.skip("Not enough GPUs to run the test.")
multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size, test_target) multi_process_parallel(monkeypatch, tp_size, pipeline_parallel_size, test_target)
...@@ -442,7 +442,7 @@ def test_rearrange_expert_weights_with_redundancy( ...@@ -442,7 +442,7 @@ def test_rearrange_expert_weights_with_redundancy(
): ):
"""Test the functionality of rearranging expert weights with redundancy.""" """Test the functionality of rearranging expert weights with redundancy."""
if torch.cuda.device_count() < world_size: if torch.accelerator.device_count() < world_size:
pytest.skip(f"Need at least {world_size} GPUs to run the test") pytest.skip(f"Need at least {world_size} GPUs to run the test")
distributed_run( distributed_run(
_test_rearrange_expert_weights_with_redundancy, _test_rearrange_expert_weights_with_redundancy,
...@@ -528,7 +528,7 @@ def test_async_transfer_layer_without_mtp( ...@@ -528,7 +528,7 @@ def test_async_transfer_layer_without_mtp(
): ):
"""Exercise async EPLB transfer path without MTP/spec decode.""" """Exercise async EPLB transfer path without MTP/spec decode."""
if torch.cuda.device_count() < world_size: if torch.accelerator.device_count() < world_size:
pytest.skip(f"Need at least {world_size} GPUs to run the test") pytest.skip(f"Need at least {world_size} GPUs to run the test")
distributed_run( distributed_run(
...@@ -547,7 +547,7 @@ def test_rearrange_expert_weights_no_change(world_size): ...@@ -547,7 +547,7 @@ def test_rearrange_expert_weights_no_change(world_size):
unchanged. unchanged.
""" """
if torch.cuda.device_count() < world_size: if torch.accelerator.device_count() < world_size:
pytest.skip(f"Need at least {world_size} GPUs to run the test") pytest.skip(f"Need at least {world_size} GPUs to run the test")
distributed_run(_test_rearrange_expert_weights_no_change, world_size) distributed_run(_test_rearrange_expert_weights_no_change, world_size)
...@@ -623,6 +623,6 @@ def _test_rearrange_expert_weights_profile_mode(env, world_size) -> None: ...@@ -623,6 +623,6 @@ def _test_rearrange_expert_weights_profile_mode(env, world_size) -> None:
def test_rearrange_expert_weights_profile_mode(world_size): def test_rearrange_expert_weights_profile_mode(world_size):
"""Test profile mode (should not copy actual weights)""" """Test profile mode (should not copy actual weights)"""
if torch.cuda.device_count() < world_size: if torch.accelerator.device_count() < world_size:
pytest.skip(f"Need at least {world_size} GPUs to run the test") pytest.skip(f"Need at least {world_size} GPUs to run the test")
distributed_run(_test_rearrange_expert_weights_profile_mode, world_size) distributed_run(_test_rearrange_expert_weights_profile_mode, world_size)
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment