Unverified commit 3abf8584, authored by wliao2, committed by GitHub
Browse files

[Test] Refactor hard coded device string in test files under...


[Test] Refactor hard coded device string in test files under compile/quantization/models/model_executor folders (#38901)
Signed-off-by: Liao, Wei <wei.liao@intel.com>
parent f4b42df0
......@@ -8,6 +8,7 @@ import torch
from vllm.model_executor.model_loader import get_model_loader
from vllm.platforms import current_platform
DEVICE_TYPE = current_platform.device_type
DTYPE = ["bfloat16"]
TORCHAO_AVAILABLE = importlib.util.find_spec("torchao") is not None
......@@ -33,7 +34,7 @@ def test_pre_quantized_model(vllm_runner):
@pytest.mark.parametrize(
"pt_load_map_location",
[
"cuda:0",
f"{DEVICE_TYPE}:0",
# {"": "cuda"},
],
)
......@@ -60,7 +61,7 @@ def test_qwenvl_int8wo_model_loading_with_params(vllm_runner):
model_name=model_name,
quantization="torchao",
dtype="bfloat16",
pt_load_map_location="cuda:0",
pt_load_map_location=f"{DEVICE_TYPE}:0",
enforce_eager=True,
) as llm:
output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
......@@ -81,7 +82,7 @@ def test_opt_125m_awq_int4wo_model_loading_with_params(vllm_runner):
model_name=model_name,
quantization="torchao",
dtype="bfloat16",
pt_load_map_location="cuda:0",
pt_load_map_location=f"{DEVICE_TYPE}:0",
) as llm:
output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
......@@ -112,7 +113,7 @@ def test_online_quant_config_dict_json(vllm_runner, enable_pickle):
with vllm_runner(
model_name=model_name,
dtype="bfloat16",
pt_load_map_location="cuda:0",
pt_load_map_location=f"{DEVICE_TYPE}:0",
quantization="torchao",
hf_overrides=hf_overrides,
enforce_eager=True,
......@@ -158,7 +159,7 @@ def test_online_quant_config_file(vllm_runner):
with vllm_runner(
model_name=model_name,
dtype="bfloat16",
pt_load_map_location="cuda:0",
pt_load_map_location=f"{DEVICE_TYPE}:0",
quantization="torchao",
hf_overrides=hf_overrides,
enforce_eager=True,
......@@ -248,7 +249,7 @@ def test_opt_125m_module_fqn_to_config_regex_model(vllm_runner):
torch._dynamo.reset()
model_name = "torchao-testing/opt-125m-ModuleFqnToConfig-v1-regex-0.14.0.dev"
with vllm_runner(
model_name=model_name, dtype="bfloat16", pt_load_map_location="cuda:0"
model_name=model_name, dtype="bfloat16", pt_load_map_location=f"{DEVICE_TYPE}:0"
) as llm:
output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
......@@ -278,7 +279,7 @@ def test_opt_125m_int4wo_model_running_preshuffled_kernel(vllm_runner, monkeypat
model_name=model_name,
quantization="torchao",
dtype="bfloat16",
pt_load_map_location="cuda:0",
pt_load_map_location=f"{DEVICE_TYPE}:0",
enforce_eager=True,
) as llm:
......@@ -357,7 +358,7 @@ def test_opt_125m_int4wo_model_running_preshuffled_kernel_online_quant(
model_name=model_name,
quantization="torchao",
dtype="bfloat16",
pt_load_map_location="cuda:0",
pt_load_map_location=f"{DEVICE_TYPE}:0",
hf_overrides=hf_overrides,
enforce_eager=True,
) as llm:
......
......@@ -34,6 +34,8 @@ from vllm.config.vllm import (
)
from vllm.platforms import current_platform
DEVICE_TYPE = current_platform.device_type
def test_compile_config_repr_succeeds():
# setup: VllmBackend mutates the config object
......@@ -504,8 +506,8 @@ def test_generation_config_loading():
@pytest.mark.parametrize(
"pt_load_map_location",
[
"cuda",
{"": "cuda"},
DEVICE_TYPE,
{"": DEVICE_TYPE},
],
)
def test_load_config_pt_load_map_location(pt_load_map_location):
......
......@@ -127,7 +127,7 @@ def test_flashinfer_sampler():
# =============================================================================
@pytest.mark.skipif("CPU" in DEVICE_TYPE, reason="CUDA/XPU not available")
@pytest.mark.skipif("cpu" in DEVICE_TYPE, reason="CUDA/XPU not available")
class TestTritonTopkTopp:
"""Tests for the Triton top-k/top-p kernel."""
......
......@@ -14,6 +14,7 @@ import pytest
import torch
import torch.multiprocessing as torch_mp
from vllm.platforms import current_platform
from vllm.v1.engine.tensor_ipc import (
TensorIpcData,
TensorIpcReceiver,
......@@ -21,6 +22,8 @@ from vllm.v1.engine.tensor_ipc import (
)
from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder
DEVICE_TYPE = current_platform.device_type
@pytest.fixture(scope="module", autouse=True)
def setup_multiprocessing():
......@@ -53,7 +56,7 @@ def encoder_process(
encoder = MsgpackEncoder(oob_tensor_consumer=sender)
if torch.cuda.is_available():
device = "cuda:0"
device = f"{DEVICE_TYPE}:0"
tensor = torch.randn(
*tensor_data["shape"], dtype=tensor_data["dtype"], device=device
)
......@@ -384,7 +387,7 @@ def mixed_tensor_encoder_process(
# Create only CUDA tensor for IPC (CPU will be serialized)
# But actually, let's just send CUDA tensor directly
cuda_tensor = torch.randn(4, 5, device="cuda:0")
cuda_tensor = torch.randn(4, 5, device=f"{DEVICE_TYPE}:0")
# Manually send via IPC to test the mechanism
cuda_tensor_shared = cuda_tensor.share_memory_()
......@@ -651,7 +654,7 @@ def test_ipc_disabled_mode():
# If CUDA is available, test with CUDA tensor too
if torch.cuda.is_available():
cuda_tensor = torch.randn(4, 5, device="cuda:0")
cuda_tensor = torch.randn(4, 5, device=f"{DEVICE_TYPE}:0")
encoded_cuda = encoder.encode({"cuda_tensor": cuda_tensor})
assert len(encoded_cuda) > 0
assert tensor_queues[0].empty(), (
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment