[Test] Refactor hard coded device string in test files under...

[Test] Refactor hard coded device string in test files under compile/quantization/models/model_executor folders (#38901)

Signed-off-by: Liao, Wei <wei.liao@intel.com>

[Test] Refactor hard coded device string in test files under...
[Test] Refactor hard coded device string in test files under compile/quantization/models/model_executor folders (#38901)

Signed-off-by: Liao, Wei <wei.liao@intel.com>
3abf8584 · wliao2 · GitHub · f4b42df0 · 3abf8584 · 3abf8584
Unverified commit 3abf8584, authored Apr 14, 2026 by wliao2; committed by GitHub on Apr 15, 2026.
4 changed files
--- a/tests/quantization/test_torchao.py
+++ b/tests/quantization/test_torchao.py
@@ -8,6 +8,7 @@ import torch
 from vllm.model_executor.model_loader import get_model_loader
 from vllm.platforms import current_platform

+DEVICE_TYPE = current_platform.device_type
 DTYPE = ["bfloat16"]

 TORCHAO_AVAILABLE = importlib.util.find_spec("torchao") is not None
@@ -33,7 +34,7 @@ def test_pre_quantized_model(vllm_runner):
 @pytest.mark.parametrize(
    "pt_load_map_location",
    [
-        "cuda:0",
+        f"{DEVICE_TYPE}:0",
        # {"": "cuda"},
    ],
 )
@@ -60,7 +61,7 @@ def test_qwenvl_int8wo_model_loading_with_params(vllm_runner):
        model_name=model_name,
        quantization="torchao",
        dtype="bfloat16",
-        pt_load_map_location="cuda:0",
+        pt_load_map_location=f"{DEVICE_TYPE}:0",
        enforce_eager=True,
    ) as llm:
        output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
@@ -81,7 +82,7 @@ def test_opt_125m_awq_int4wo_model_loading_with_params(vllm_runner):
        model_name=model_name,
        quantization="torchao",
        dtype="bfloat16",
-        pt_load_map_location="cuda:0",
+        pt_load_map_location=f"{DEVICE_TYPE}:0",
    ) as llm:
        output = llm.generate_greedy(["The capital of France is"], max_tokens=4)

@@ -112,7 +113,7 @@ def test_online_quant_config_dict_json(vllm_runner, enable_pickle):
    with vllm_runner(
        model_name=model_name,
        dtype="bfloat16",
-        pt_load_map_location="cuda:0",
+        pt_load_map_location=f"{DEVICE_TYPE}:0",
        quantization="torchao",
        hf_overrides=hf_overrides,
        enforce_eager=True,
@@ -158,7 +159,7 @@ def test_online_quant_config_file(vllm_runner):
        with vllm_runner(
            model_name=model_name,
            dtype="bfloat16",
-            pt_load_map_location="cuda:0",
+            pt_load_map_location=f"{DEVICE_TYPE}:0",
            quantization="torchao",
            hf_overrides=hf_overrides,
            enforce_eager=True,
@@ -248,7 +249,7 @@ def test_opt_125m_module_fqn_to_config_regex_model(vllm_runner):
    torch._dynamo.reset()
    model_name = "torchao-testing/opt-125m-ModuleFqnToConfig-v1-regex-0.14.0.dev"
    with vllm_runner(
-        model_name=model_name, dtype="bfloat16", pt_load_map_location="cuda:0"
+        model_name=model_name, dtype="bfloat16", pt_load_map_location=f"{DEVICE_TYPE}:0"
    ) as llm:
        output = llm.generate_greedy(["The capital of France is"], max_tokens=4)

@@ -278,7 +279,7 @@ def test_opt_125m_int4wo_model_running_preshuffled_kernel(vllm_runner, monkeypat
        model_name=model_name,
        quantization="torchao",
        dtype="bfloat16",
-        pt_load_map_location="cuda:0",
+        pt_load_map_location=f"{DEVICE_TYPE}:0",
        enforce_eager=True,
    ) as llm:

@@ -357,7 +358,7 @@ def test_opt_125m_int4wo_model_running_preshuffled_kernel_online_quant(
        model_name=model_name,
        quantization="torchao",
        dtype="bfloat16",
-        pt_load_map_location="cuda:0",
+        pt_load_map_location=f"{DEVICE_TYPE}:0",
        hf_overrides=hf_overrides,
        enforce_eager=True,
    ) as llm:

--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -34,6 +34,8 @@ from vllm.config.vllm import (
 )
 from vllm.platforms import current_platform

+DEVICE_TYPE = current_platform.device_type
+

 def test_compile_config_repr_succeeds():
    # setup: VllmBackend mutates the config object
@@ -504,8 +506,8 @@ def test_generation_config_loading():
 @pytest.mark.parametrize(
    "pt_load_map_location",
    [
-        "cuda",
-        {"": "cuda"},
+        DEVICE_TYPE,
+        {"": DEVICE_TYPE},
    ],
 )
 def test_load_config_pt_load_map_location(pt_load_map_location):

--- a/tests/v1/sample/test_topk_topp_sampler.py
+++ b/tests/v1/sample/test_topk_topp_sampler.py
@@ -127,7 +127,7 @@ def test_flashinfer_sampler():
 # =============================================================================


-@pytest.mark.skipif("CPU" in DEVICE_TYPE, reason="CUDA/XPU not available")
+@pytest.mark.skipif("cpu" in DEVICE_TYPE, reason="CUDA/XPU not available")
 class TestTritonTopkTopp:
    """Tests for the Triton top-k/top-p kernel."""


--- a/tests/v1/test_tensor_ipc_queue.py
+++ b/tests/v1/test_tensor_ipc_queue.py
@@ -14,6 +14,7 @@ import pytest
 import torch
 import torch.multiprocessing as torch_mp

+from vllm.platforms import current_platform
 from vllm.v1.engine.tensor_ipc import (
    TensorIpcData,
    TensorIpcReceiver,
@@ -21,6 +22,8 @@ from vllm.v1.engine.tensor_ipc import (
 )
 from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder

+DEVICE_TYPE = current_platform.device_type
+

 @pytest.fixture(scope="module", autouse=True)
 def setup_multiprocessing():
@@ -53,7 +56,7 @@ def encoder_process(
        encoder = MsgpackEncoder(oob_tensor_consumer=sender)

        if torch.cuda.is_available():
-            device = "cuda:0"
+            device = f"{DEVICE_TYPE}:0"
            tensor = torch.randn(
                *tensor_data["shape"], dtype=tensor_data["dtype"], device=device
            )
@@ -384,7 +387,7 @@ def mixed_tensor_encoder_process(

        # Create only CUDA tensor for IPC (CPU will be serialized)
        # But actually, let's just send CUDA tensor directly
-        cuda_tensor = torch.randn(4, 5, device="cuda:0")
+        cuda_tensor = torch.randn(4, 5, device=f"{DEVICE_TYPE}:0")

        # Manually send via IPC to test the mechanism
        cuda_tensor_shared = cuda_tensor.share_memory_()
@@ -651,7 +654,7 @@ def test_ipc_disabled_mode():

    # If CUDA is available, test with CUDA tensor too
    if torch.cuda.is_available():
-        cuda_tensor = torch.randn(4, 5, device="cuda:0")
+        cuda_tensor = torch.randn(4, 5, device=f"{DEVICE_TYPE}:0")
        encoded_cuda = encoder.encode({"cuda_tensor": cuda_tensor})
        assert len(encoded_cuda) > 0
        assert tensor_queues[0].empty(), (