Unverified commit 3abf8584, authored by wliao2 and committed by GitHub
Browse files

[Test] Refactor hard coded device string in test files under...


[Test] Refactor hard coded device string in test files under compile/quantization/models/model_executor folders (#38901)
Signed-off-by: Liao, Wei <wei.liao@intel.com>
parent f4b42df0
...@@ -8,6 +8,7 @@ import torch ...@@ -8,6 +8,7 @@ import torch
from vllm.model_executor.model_loader import get_model_loader from vllm.model_executor.model_loader import get_model_loader
from vllm.platforms import current_platform from vllm.platforms import current_platform
DEVICE_TYPE = current_platform.device_type
DTYPE = ["bfloat16"] DTYPE = ["bfloat16"]
TORCHAO_AVAILABLE = importlib.util.find_spec("torchao") is not None TORCHAO_AVAILABLE = importlib.util.find_spec("torchao") is not None
...@@ -33,7 +34,7 @@ def test_pre_quantized_model(vllm_runner): ...@@ -33,7 +34,7 @@ def test_pre_quantized_model(vllm_runner):
@pytest.mark.parametrize( @pytest.mark.parametrize(
"pt_load_map_location", "pt_load_map_location",
[ [
"cuda:0", f"{DEVICE_TYPE}:0",
# {"": "cuda"}, # {"": "cuda"},
], ],
) )
...@@ -60,7 +61,7 @@ def test_qwenvl_int8wo_model_loading_with_params(vllm_runner): ...@@ -60,7 +61,7 @@ def test_qwenvl_int8wo_model_loading_with_params(vllm_runner):
model_name=model_name, model_name=model_name,
quantization="torchao", quantization="torchao",
dtype="bfloat16", dtype="bfloat16",
pt_load_map_location="cuda:0", pt_load_map_location=f"{DEVICE_TYPE}:0",
enforce_eager=True, enforce_eager=True,
) as llm: ) as llm:
output = llm.generate_greedy(["The capital of France is"], max_tokens=4) output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
...@@ -81,7 +82,7 @@ def test_opt_125m_awq_int4wo_model_loading_with_params(vllm_runner): ...@@ -81,7 +82,7 @@ def test_opt_125m_awq_int4wo_model_loading_with_params(vllm_runner):
model_name=model_name, model_name=model_name,
quantization="torchao", quantization="torchao",
dtype="bfloat16", dtype="bfloat16",
pt_load_map_location="cuda:0", pt_load_map_location=f"{DEVICE_TYPE}:0",
) as llm: ) as llm:
output = llm.generate_greedy(["The capital of France is"], max_tokens=4) output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
...@@ -112,7 +113,7 @@ def test_online_quant_config_dict_json(vllm_runner, enable_pickle): ...@@ -112,7 +113,7 @@ def test_online_quant_config_dict_json(vllm_runner, enable_pickle):
with vllm_runner( with vllm_runner(
model_name=model_name, model_name=model_name,
dtype="bfloat16", dtype="bfloat16",
pt_load_map_location="cuda:0", pt_load_map_location=f"{DEVICE_TYPE}:0",
quantization="torchao", quantization="torchao",
hf_overrides=hf_overrides, hf_overrides=hf_overrides,
enforce_eager=True, enforce_eager=True,
...@@ -158,7 +159,7 @@ def test_online_quant_config_file(vllm_runner): ...@@ -158,7 +159,7 @@ def test_online_quant_config_file(vllm_runner):
with vllm_runner( with vllm_runner(
model_name=model_name, model_name=model_name,
dtype="bfloat16", dtype="bfloat16",
pt_load_map_location="cuda:0", pt_load_map_location=f"{DEVICE_TYPE}:0",
quantization="torchao", quantization="torchao",
hf_overrides=hf_overrides, hf_overrides=hf_overrides,
enforce_eager=True, enforce_eager=True,
...@@ -248,7 +249,7 @@ def test_opt_125m_module_fqn_to_config_regex_model(vllm_runner): ...@@ -248,7 +249,7 @@ def test_opt_125m_module_fqn_to_config_regex_model(vllm_runner):
torch._dynamo.reset() torch._dynamo.reset()
model_name = "torchao-testing/opt-125m-ModuleFqnToConfig-v1-regex-0.14.0.dev" model_name = "torchao-testing/opt-125m-ModuleFqnToConfig-v1-regex-0.14.0.dev"
with vllm_runner( with vllm_runner(
model_name=model_name, dtype="bfloat16", pt_load_map_location="cuda:0" model_name=model_name, dtype="bfloat16", pt_load_map_location=f"{DEVICE_TYPE}:0"
) as llm: ) as llm:
output = llm.generate_greedy(["The capital of France is"], max_tokens=4) output = llm.generate_greedy(["The capital of France is"], max_tokens=4)
...@@ -278,7 +279,7 @@ def test_opt_125m_int4wo_model_running_preshuffled_kernel(vllm_runner, monkeypat ...@@ -278,7 +279,7 @@ def test_opt_125m_int4wo_model_running_preshuffled_kernel(vllm_runner, monkeypat
model_name=model_name, model_name=model_name,
quantization="torchao", quantization="torchao",
dtype="bfloat16", dtype="bfloat16",
pt_load_map_location="cuda:0", pt_load_map_location=f"{DEVICE_TYPE}:0",
enforce_eager=True, enforce_eager=True,
) as llm: ) as llm:
...@@ -357,7 +358,7 @@ def test_opt_125m_int4wo_model_running_preshuffled_kernel_online_quant( ...@@ -357,7 +358,7 @@ def test_opt_125m_int4wo_model_running_preshuffled_kernel_online_quant(
model_name=model_name, model_name=model_name,
quantization="torchao", quantization="torchao",
dtype="bfloat16", dtype="bfloat16",
pt_load_map_location="cuda:0", pt_load_map_location=f"{DEVICE_TYPE}:0",
hf_overrides=hf_overrides, hf_overrides=hf_overrides,
enforce_eager=True, enforce_eager=True,
) as llm: ) as llm:
......
...@@ -34,6 +34,8 @@ from vllm.config.vllm import ( ...@@ -34,6 +34,8 @@ from vllm.config.vllm import (
) )
from vllm.platforms import current_platform from vllm.platforms import current_platform
DEVICE_TYPE = current_platform.device_type
def test_compile_config_repr_succeeds(): def test_compile_config_repr_succeeds():
# setup: VllmBackend mutates the config object # setup: VllmBackend mutates the config object
...@@ -504,8 +506,8 @@ def test_generation_config_loading(): ...@@ -504,8 +506,8 @@ def test_generation_config_loading():
@pytest.mark.parametrize( @pytest.mark.parametrize(
"pt_load_map_location", "pt_load_map_location",
[ [
"cuda", DEVICE_TYPE,
{"": "cuda"}, {"": DEVICE_TYPE},
], ],
) )
def test_load_config_pt_load_map_location(pt_load_map_location): def test_load_config_pt_load_map_location(pt_load_map_location):
......
...@@ -127,7 +127,7 @@ def test_flashinfer_sampler(): ...@@ -127,7 +127,7 @@ def test_flashinfer_sampler():
# ============================================================================= # =============================================================================
@pytest.mark.skipif("CPU" in DEVICE_TYPE, reason="CUDA/XPU not available") @pytest.mark.skipif("cpu" in DEVICE_TYPE, reason="CUDA/XPU not available")
class TestTritonTopkTopp: class TestTritonTopkTopp:
"""Tests for the Triton top-k/top-p kernel.""" """Tests for the Triton top-k/top-p kernel."""
......
...@@ -14,6 +14,7 @@ import pytest ...@@ -14,6 +14,7 @@ import pytest
import torch import torch
import torch.multiprocessing as torch_mp import torch.multiprocessing as torch_mp
from vllm.platforms import current_platform
from vllm.v1.engine.tensor_ipc import ( from vllm.v1.engine.tensor_ipc import (
TensorIpcData, TensorIpcData,
TensorIpcReceiver, TensorIpcReceiver,
...@@ -21,6 +22,8 @@ from vllm.v1.engine.tensor_ipc import ( ...@@ -21,6 +22,8 @@ from vllm.v1.engine.tensor_ipc import (
) )
from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder from vllm.v1.serial_utils import MsgpackDecoder, MsgpackEncoder
DEVICE_TYPE = current_platform.device_type
@pytest.fixture(scope="module", autouse=True) @pytest.fixture(scope="module", autouse=True)
def setup_multiprocessing(): def setup_multiprocessing():
...@@ -53,7 +56,7 @@ def encoder_process( ...@@ -53,7 +56,7 @@ def encoder_process(
encoder = MsgpackEncoder(oob_tensor_consumer=sender) encoder = MsgpackEncoder(oob_tensor_consumer=sender)
if torch.cuda.is_available(): if torch.cuda.is_available():
device = "cuda:0" device = f"{DEVICE_TYPE}:0"
tensor = torch.randn( tensor = torch.randn(
*tensor_data["shape"], dtype=tensor_data["dtype"], device=device *tensor_data["shape"], dtype=tensor_data["dtype"], device=device
) )
...@@ -384,7 +387,7 @@ def mixed_tensor_encoder_process( ...@@ -384,7 +387,7 @@ def mixed_tensor_encoder_process(
# Create only CUDA tensor for IPC (CPU will be serialized) # Create only CUDA tensor for IPC (CPU will be serialized)
# But actually, let's just send CUDA tensor directly # But actually, let's just send CUDA tensor directly
cuda_tensor = torch.randn(4, 5, device="cuda:0") cuda_tensor = torch.randn(4, 5, device=f"{DEVICE_TYPE}:0")
# Manually send via IPC to test the mechanism # Manually send via IPC to test the mechanism
cuda_tensor_shared = cuda_tensor.share_memory_() cuda_tensor_shared = cuda_tensor.share_memory_()
...@@ -651,7 +654,7 @@ def test_ipc_disabled_mode(): ...@@ -651,7 +654,7 @@ def test_ipc_disabled_mode():
# If CUDA is available, test with CUDA tensor too # If CUDA is available, test with CUDA tensor too
if torch.cuda.is_available(): if torch.cuda.is_available():
cuda_tensor = torch.randn(4, 5, device="cuda:0") cuda_tensor = torch.randn(4, 5, device=f"{DEVICE_TYPE}:0")
encoded_cuda = encoder.encode({"cuda_tensor": cuda_tensor}) encoded_cuda = encoder.encode({"cuda_tensor": cuda_tensor})
assert len(encoded_cuda) > 0 assert len(encoded_cuda) > 0
assert tensor_queues[0].empty(), ( assert tensor_queues[0].empty(), (
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment