Unverified commit 6cdf015c, authored by Lucas Wilkinson and committed by GitHub
Browse files

[Misc] Fix `Current vLLM config is not set.` warnings, assert to avoid issues...


[Misc] Fix `Current vLLM config is not set.` warnings, assert to avoid issues in the future (#31747)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
parent 5d3b6097
...@@ -15,6 +15,7 @@ from vllm.config import ( ...@@ -15,6 +15,7 @@ from vllm.config import (
ModelConfig, ModelConfig,
PassConfig, PassConfig,
VllmConfig, VllmConfig,
set_current_vllm_config,
) )
from vllm.distributed import ( from vllm.distributed import (
tensor_model_parallel_all_gather, tensor_model_parallel_all_gather,
...@@ -340,6 +341,10 @@ def async_tp_pass_on_test_model( ...@@ -340,6 +341,10 @@ def async_tp_pass_on_test_model(
) )
async_tp_pass = AsyncTPPass(vllm_config) async_tp_pass = AsyncTPPass(vllm_config)
# Set the global vllm_config for TestBackend which calls
# get_current_vllm_config()
with set_current_vllm_config(vllm_config):
backend = TestBackend(async_tp_pass) backend = TestBackend(async_tp_pass)
assert ( assert (
......
...@@ -430,7 +430,7 @@ def test_cudagraph_sizes_post_init( ...@@ -430,7 +430,7 @@ def test_cudagraph_sizes_post_init(
) )
def test_cached_compilation_config(): def test_cached_compilation_config(default_vllm_config):
import torch import torch
from torch._inductor.utils import run_and_get_code from torch._inductor.utils import run_and_get_code
......
...@@ -189,6 +189,17 @@ def dist_init(): ...@@ -189,6 +189,17 @@ def dist_init():
cleanup_dist_env_and_memory() cleanup_dist_env_and_memory()
@pytest.fixture
def default_vllm_config():
    """Install a default ``VllmConfig`` as the current config for one test.

    Meant for tests that exercise CustomOps directly, or other code paths
    that call ``get_current_vllm_config()`` without a full engine context.
    The test body runs while ``set_current_vllm_config`` is active; the
    context manager is exited when the test finishes.
    """
    from vllm.config import VllmConfig, set_current_vllm_config

    config = VllmConfig()
    with set_current_vllm_config(config):
        # No value is provided to the test; only the active config matters.
        yield
@pytest.fixture() @pytest.fixture()
def should_do_global_cleanup_after_test(request) -> bool: def should_do_global_cleanup_after_test(request) -> bool:
"""Allow subdirectories to skip global cleanup by overriding this fixture. """Allow subdirectories to skip global cleanup by overriding this fixture.
......
...@@ -458,7 +458,7 @@ def test_flashinfer_trtllm_prefill_with_baseline( ...@@ -458,7 +458,7 @@ def test_flashinfer_trtllm_prefill_with_baseline(
) )
def test_trtllm_attention_rejects_num_kv_heads_1() -> None: def test_trtllm_attention_rejects_num_kv_heads_1(default_vllm_config) -> None:
"""Test that TRTLLM attention correctly rejects num_kv_heads=1. """Test that TRTLLM attention correctly rejects num_kv_heads=1.
When num_kv_heads=1 (MQA), the KV cache strides become degenerate When num_kv_heads=1 (MQA), the KV cache strides become degenerate
......
...@@ -36,7 +36,7 @@ if current_platform.is_rocm(): ...@@ -36,7 +36,7 @@ if current_platform.is_rocm():
@pytest.mark.parametrize("device", devices) @pytest.mark.parametrize("device", devices)
def test_mha_attn_platform(device: str): def test_mha_attn_platform(default_vllm_config, device: str):
""" """
Test the attention selector between different platform and device. Test the attention selector between different platform and device.
""" """
...@@ -116,6 +116,7 @@ CUDA_DEVICES = ["cuda"] ...@@ -116,6 +116,7 @@ CUDA_DEVICES = ["cuda"]
@pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("device", CUDA_DEVICES)
def test_mha_attn_forward( def test_mha_attn_forward(
default_vllm_config,
batch_size: int, batch_size: int,
seq_len: int, seq_len: int,
num_heads: int, num_heads: int,
...@@ -162,6 +163,7 @@ def test_mha_attn_forward( ...@@ -162,6 +163,7 @@ def test_mha_attn_forward(
@pytest.mark.parametrize("dtype", DTYPES) @pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("device", CUDA_DEVICES)
def test_mha_attn_varlen_forward( def test_mha_attn_varlen_forward(
default_vllm_config,
var_seq_len: list[int], var_seq_len: list[int],
num_heads: int, num_heads: int,
num_kv_heads: int, num_kv_heads: int,
......
...@@ -45,6 +45,7 @@ CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 e ...@@ -45,6 +45,7 @@ CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 e
@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode() @torch.inference_mode()
def test_act_and_mul( def test_act_and_mul(
default_vllm_config,
activation: str, activation: str,
num_tokens: int, num_tokens: int,
d: int, d: int,
...@@ -122,6 +123,7 @@ def test_act_and_mul( ...@@ -122,6 +123,7 @@ def test_act_and_mul(
@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode() @torch.inference_mode()
def test_activation( def test_activation(
default_vllm_config,
activation: type[torch.nn.Module], activation: type[torch.nn.Module],
num_tokens: int, num_tokens: int,
d: int, d: int,
......
...@@ -57,6 +57,7 @@ def _apply_qk_norm_rope( ...@@ -57,6 +57,7 @@ def _apply_qk_norm_rope(
@pytest.mark.parametrize("rotary_ratio", [1.0, 0.5, 0.25]) @pytest.mark.parametrize("rotary_ratio", [1.0, 0.5, 0.25])
@torch.inference_mode() @torch.inference_mode()
def test_fused_qk_norm_rope_matches_reference( def test_fused_qk_norm_rope_matches_reference(
default_vllm_config,
device: str, device: str,
dtype: torch.dtype, dtype: torch.dtype,
is_neox: bool, is_neox: bool,
......
...@@ -147,6 +147,7 @@ def ops_impl( ...@@ -147,6 +147,7 @@ def ops_impl(
@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode() @torch.inference_mode()
def test_rms_norm( def test_rms_norm(
default_vllm_config,
num_tokens: int, num_tokens: int,
hidden_size: int, hidden_size: int,
add_residual: bool, add_residual: bool,
......
...@@ -26,6 +26,7 @@ CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 e ...@@ -26,6 +26,7 @@ CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 e
@pytest.mark.parametrize("strided_input", [False, True]) @pytest.mark.parametrize("strided_input", [False, True])
@torch.inference_mode() @torch.inference_mode()
def test_rms_norm( def test_rms_norm(
default_vllm_config,
num_tokens: int, num_tokens: int,
hidden_size: int, hidden_size: int,
add_residual: bool, add_residual: bool,
......
...@@ -90,6 +90,7 @@ num_tokens_list = [11, 8192] ...@@ -90,6 +90,7 @@ num_tokens_list = [11, 8192]
@pytest.mark.parametrize("dtype", [torch.bfloat16]) @pytest.mark.parametrize("dtype", [torch.bfloat16])
@pytest.mark.parametrize("num_tokens", num_tokens_list) @pytest.mark.parametrize("num_tokens", num_tokens_list)
def test_mrope( def test_mrope(
default_vllm_config,
model_name: str, model_name: str,
model_info: MRoPETestInfo, model_info: MRoPETestInfo,
tp_size: int, tp_size: int,
...@@ -159,6 +160,7 @@ def test_mrope( ...@@ -159,6 +160,7 @@ def test_mrope(
@pytest.mark.parametrize("dtype", [torch.bfloat16]) @pytest.mark.parametrize("dtype", [torch.bfloat16])
@pytest.mark.parametrize("num_tokens", num_tokens_list) @pytest.mark.parametrize("num_tokens", num_tokens_list)
def test_mrope_torch_compile_tracing( def test_mrope_torch_compile_tracing(
default_vllm_config,
model_name: str, model_name: str,
model_info: MRoPETestInfo, model_info: MRoPETestInfo,
tp_size: int, tp_size: int,
......
...@@ -62,6 +62,7 @@ TENSORS_SHAPES_FN = [ ...@@ -62,6 +62,7 @@ TENSORS_SHAPES_FN = [
@pytest.mark.parametrize("use_key", USE_KEY) @pytest.mark.parametrize("use_key", USE_KEY)
@torch.inference_mode() @torch.inference_mode()
def test_rotary_embedding( def test_rotary_embedding(
default_vllm_config,
is_neox_style: bool, is_neox_style: bool,
tensor_shape_fn: Callable[[int, int, int, int], tuple[int, ...]], tensor_shape_fn: Callable[[int, int, int, int], tuple[int, ...]],
batch_size: int, batch_size: int,
...@@ -123,7 +124,7 @@ def test_rotary_embedding( ...@@ -123,7 +124,7 @@ def test_rotary_embedding(
@torch.inference_mode() @torch.inference_mode()
def test_rope_module_cache(): def test_rope_module_cache(default_vllm_config):
MAX_POSITIONS = [123, 1234] MAX_POSITIONS = [123, 1234]
ROPE_THETAS = [10000, 1000000] ROPE_THETAS = [10000, 1000000]
ROPE_PARAMETERS = ( ROPE_PARAMETERS = (
......
...@@ -36,6 +36,7 @@ def rotary_embedding_opcheck( ...@@ -36,6 +36,7 @@ def rotary_embedding_opcheck(
@pytest.mark.parametrize("use_key", [True, False]) @pytest.mark.parametrize("use_key", [True, False])
@pytest.mark.parametrize("head_stride_is_contiguous", [True, False]) @pytest.mark.parametrize("head_stride_is_contiguous", [True, False])
def test_rotary_embedding_opcheck( def test_rotary_embedding_opcheck(
default_vllm_config,
dist_init, dist_init,
device, device,
max_position, max_position,
......
...@@ -6,7 +6,7 @@ import torch ...@@ -6,7 +6,7 @@ import torch
from tests.kernels.allclose_default import get_default_atol, get_default_rtol from tests.kernels.allclose_default import get_default_atol, get_default_rtol
from vllm._custom_ops import cpu_fused_moe, cpu_prepack_moe_weight from vllm._custom_ops import cpu_fused_moe, cpu_prepack_moe_weight
from vllm.model_executor.layers.activation import SiluAndMul, SwigluOAIAndMul from vllm.model_executor.layers.fused_moe.cpu_fused_moe import _CPU_MOE_ACT
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.utils.torch_utils import set_random_seed from vllm.utils.torch_utils import set_random_seed
...@@ -24,11 +24,6 @@ USE_BIAS = [True, False] ...@@ -24,11 +24,6 @@ USE_BIAS = [True, False]
ISA = ["amx", "vec"] if torch._C._cpu._is_amx_tile_supported() else ["vec"] ISA = ["amx", "vec"] if torch._C._cpu._is_amx_tile_supported() else ["vec"]
DTYPE = [torch.bfloat16] DTYPE = [torch.bfloat16]
_CPU_MOE_ACT = {
"silu": SiluAndMul(),
"swigluoai": SwigluOAIAndMul(),
}
def ref_fused_moe( def ref_fused_moe(
input: torch.Tensor, input: torch.Tensor,
...@@ -106,6 +101,7 @@ def ref_fused_moe( ...@@ -106,6 +101,7 @@ def ref_fused_moe(
@pytest.mark.parametrize("act", ACT) @pytest.mark.parametrize("act", ACT)
@pytest.mark.parametrize("isa", ISA) @pytest.mark.parametrize("isa", ISA)
def test_cpu_fused_moe( def test_cpu_fused_moe(
default_vllm_config,
batch_size: int, batch_size: int,
expert_num: int, expert_num: int,
hidden_size: int, hidden_size: int,
......
...@@ -468,7 +468,12 @@ def test_fused_moe_wn16( ...@@ -468,7 +468,12 @@ def test_fused_moe_wn16(
) )
@torch.inference_mode() @torch.inference_mode()
def test_mixtral_moe( def test_mixtral_moe(
dist_init, dtype: torch.dtype, padding: bool, use_rocm_aiter: bool, monkeypatch default_vllm_config,
dist_init,
dtype: torch.dtype,
padding: bool,
use_rocm_aiter: bool,
monkeypatch,
): ):
"""Make sure our Mixtral MoE implementation agrees with the one from """Make sure our Mixtral MoE implementation agrees with the one from
huggingface.""" huggingface."""
......
...@@ -23,7 +23,12 @@ from vllm.utils.torch_utils import set_random_seed ...@@ -23,7 +23,12 @@ from vllm.utils.torch_utils import set_random_seed
@pytest.mark.parametrize("use_ue8m0", [True, False]) @pytest.mark.parametrize("use_ue8m0", [True, False])
@torch.inference_mode() @torch.inference_mode()
def test_quantfp8_group_functionality( def test_quantfp8_group_functionality(
batch_size: int, hidden_dim: int, group_size: int, seed: int, use_ue8m0: bool default_vllm_config,
batch_size: int,
hidden_dim: int,
group_size: int,
seed: int,
use_ue8m0: bool,
) -> None: ) -> None:
"""Test QuantFP8 group quantization with various configurations. """Test QuantFP8 group quantization with various configurations.
...@@ -82,7 +87,9 @@ def test_quantfp8_group_functionality( ...@@ -82,7 +87,9 @@ def test_quantfp8_group_functionality(
@pytest.mark.parametrize("seed", [42]) @pytest.mark.parametrize("seed", [42])
@pytest.mark.parametrize("use_ue8m0", [True, False]) @pytest.mark.parametrize("use_ue8m0", [True, False])
@torch.inference_mode() @torch.inference_mode()
def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None: def test_quantfp8_group_multidimensional(
default_vllm_config, seed: int, use_ue8m0: bool
) -> None:
set_random_seed(seed) set_random_seed(seed)
group_size = 64 group_size = 64
...@@ -135,7 +142,7 @@ def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None: ...@@ -135,7 +142,7 @@ def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None:
@pytest.mark.parametrize("seed", [42]) @pytest.mark.parametrize("seed", [42])
@torch.inference_mode() @torch.inference_mode()
def test_quantfp8_group_edge_cases(seed: int) -> None: def test_quantfp8_group_edge_cases(default_vllm_config, seed: int) -> None:
set_random_seed(seed) set_random_seed(seed)
batch_size = 16 batch_size = 16
......
...@@ -102,7 +102,7 @@ SEEDS = [0] ...@@ -102,7 +102,7 @@ SEEDS = [0]
itertools.product(M, N, K, E, TOP_KS, DTYPES, SEEDS), itertools.product(M, N, K, E, TOP_KS, DTYPES, SEEDS),
) )
@torch.inference_mode() @torch.inference_mode()
def test_w8a8_fp8_fused_moe(M, N, K, E, topk, dtype, seed): def test_w8a8_fp8_fused_moe(default_vllm_config, M, N, K, E, topk, dtype, seed):
torch.manual_seed(seed) torch.manual_seed(seed)
# Initialize int8 quantization parameters # Initialize int8 quantization parameters
factor_for_scale = 1e-2 factor_for_scale = 1e-2
......
...@@ -31,6 +31,7 @@ BLOCK_SIZE = 16 ...@@ -31,6 +31,7 @@ BLOCK_SIZE = 16
@pytest.mark.parametrize("shape", SHAPES) @pytest.mark.parametrize("shape", SHAPES)
@torch.inference_mode() @torch.inference_mode()
def test_silu_mul_nvfp4_quant( def test_silu_mul_nvfp4_quant(
default_vllm_config,
dtype: torch.dtype, dtype: torch.dtype,
shape: tuple[int, int], shape: tuple[int, int],
) -> None: ) -> None:
......
...@@ -39,6 +39,7 @@ def ops_impl(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor: ...@@ -39,6 +39,7 @@ def ops_impl(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
@pytest.mark.parametrize("device", CUDA_DEVICES) @pytest.mark.parametrize("device", CUDA_DEVICES)
@torch.inference_mode() @torch.inference_mode()
def test_silu_and_mul( def test_silu_and_mul(
default_vllm_config,
num_tokens: int, num_tokens: int,
hidden_size: int, hidden_size: int,
dtype: torch.dtype, dtype: torch.dtype,
......
...@@ -82,7 +82,7 @@ class DummyLoRAModel(nn.Sequential, SupportsLoRA): ...@@ -82,7 +82,7 @@ class DummyLoRAModel(nn.Sequential, SupportsLoRA):
@pytest.fixture @pytest.fixture
def dummy_model() -> nn.Module: def dummy_model(default_vllm_config) -> nn.Module:
model = DummyLoRAModel( model = DummyLoRAModel(
OrderedDict( OrderedDict(
[ [
...@@ -115,7 +115,7 @@ def dummy_model() -> nn.Module: ...@@ -115,7 +115,7 @@ def dummy_model() -> nn.Module:
@pytest.fixture @pytest.fixture
def dummy_model_gate_up() -> nn.Module: def dummy_model_gate_up(default_vllm_config) -> nn.Module:
model = DummyLoRAModel( model = DummyLoRAModel(
OrderedDict( OrderedDict(
[ [
......
...@@ -252,7 +252,9 @@ def check_punica_wrapper(punica_wrapper) -> bool: ...@@ -252,7 +252,9 @@ def check_punica_wrapper(punica_wrapper) -> bool:
@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000]) @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000])
@pytest.mark.parametrize("stage", STAGES) @pytest.mark.parametrize("stage", STAGES)
def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None: def test_embeddings(
default_vllm_config, dist_init, num_loras, device, vocab_size, stage
) -> None:
# For multi-GPU testing of Triton kernel, we must explicitly set the CUDA # For multi-GPU testing of Triton kernel, we must explicitly set the CUDA
# device, see: https://github.com/triton-lang/triton/issues/2925 # device, see: https://github.com/triton-lang/triton/issues/2925
# Same below. # Same below.
...@@ -353,7 +355,7 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None: ...@@ -353,7 +355,7 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
@pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 256512]) @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 256512])
@pytest.mark.parametrize("stage", STAGES) @pytest.mark.parametrize("stage", STAGES)
def test_lm_head_logits_processor( def test_lm_head_logits_processor(
dist_init, num_loras, device, vocab_size, stage default_vllm_config, dist_init, num_loras, device, vocab_size, stage
) -> None: ) -> None:
if current_platform.is_cuda_alike(): if current_platform.is_cuda_alike():
torch.cuda.set_device(device) torch.cuda.set_device(device)
...@@ -470,6 +472,7 @@ def test_lm_head_logits_processor( ...@@ -470,6 +472,7 @@ def test_lm_head_logits_processor(
@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("stage", STAGES) @pytest.mark.parametrize("stage", STAGES)
def test_linear_replicated( def test_linear_replicated(
default_vllm_config,
dist_init, dist_init,
num_loras, num_loras,
device, device,
...@@ -580,7 +583,7 @@ def test_linear_replicated( ...@@ -580,7 +583,7 @@ def test_linear_replicated(
@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("stage", STAGES) @pytest.mark.parametrize("stage", STAGES)
def test_linear_parallel( def test_linear_parallel(
dist_init, num_loras, orientation, fully_shard, device, stage default_vllm_config, dist_init, num_loras, orientation, fully_shard, device, stage
) -> None: ) -> None:
if current_platform.is_cuda_alike(): if current_platform.is_cuda_alike():
torch.cuda.set_device(device) torch.cuda.set_device(device)
...@@ -705,7 +708,7 @@ def test_linear_parallel( ...@@ -705,7 +708,7 @@ def test_linear_parallel(
@pytest.mark.parametrize("device", DEVICES) @pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("stage", STAGES) @pytest.mark.parametrize("stage", STAGES)
def test_column_parallel_packed( def test_column_parallel_packed(
dist_init, num_loras, repeats, fully_shard, device, stage default_vllm_config, dist_init, num_loras, repeats, fully_shard, device, stage
) -> None: ) -> None:
if current_platform.is_cuda_alike(): if current_platform.is_cuda_alike():
torch.cuda.set_device(device) torch.cuda.set_device(device)
...@@ -851,7 +854,7 @@ def test_column_parallel_packed( ...@@ -851,7 +854,7 @@ def test_column_parallel_packed(
@pytest.mark.parametrize( @pytest.mark.parametrize(
"seed", list(range(VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS)) "seed", list(range(VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS))
) )
def test_vocab_parallel_embedding_indices(tp_size, seed): def test_vocab_parallel_embedding_indices(tp_size, seed, default_vllm_config):
random.seed(seed) random.seed(seed)
vocab_size = random.randint(4000, 64000) vocab_size = random.randint(4000, 64000)
added_vocab_size = random.randint(0, 1024) added_vocab_size = random.randint(0, 1024)
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment