[Misc] Fix `Current vLLM config is not set.` warnings, assert to avoid issues...

[Misc] Fix `Current vLLM config is not set.` warnings, assert to avoid issues in the future (#31747) Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com> Signed-off-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com> Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>

[Misc] Fix `Current vLLM config is not set.` warnings, assert to avoid issues...
[Misc] Fix `Current vLLM config is not set.` warnings, assert to avoid issues in the future (#31747) Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com> Signed-off-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com> Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
6cdf015c · Lucas Wilkinson · GitHub · 5d3b6097 · 6cdf015c · 6cdf015c
Unverified Commit 6cdf015c authored Jan 08, 2026 by Lucas Wilkinson Committed by GitHub Jan 08, 2026
20 changed files
--- a/tests/compile/distributed/test_async_tp.py
+++ b/tests/compile/distributed/test_async_tp.py
@@ -15,6 +15,7 @@ from vllm.config import (
    ModelConfig,
    PassConfig,
    VllmConfig,
+    set_current_vllm_config,
 )
 from vllm.distributed import (
    tensor_model_parallel_all_gather,
@@ -340,38 +341,42 @@ def async_tp_pass_on_test_model(
    )

    async_tp_pass = AsyncTPPass(vllm_config)
-    backend = TestBackend(async_tp_pass)

-    assert (
-        async_tp_pass.compilation_config.splitting_ops
-        == vllm_config.compilation_config.splitting_ops
-    )
-    assert (
-        async_tp_pass.compilation_config.use_inductor_graph_partition
-        == vllm_config.compilation_config.use_inductor_graph_partition
-    )
+    # Set the global vllm_config for TestBackend which calls
+    # get_current_vllm_config()
+    with set_current_vllm_config(vllm_config):
+        backend = TestBackend(async_tp_pass)

-    model = test_model_cls(hidden_size, dtype)  # Pass dtype to model constructor
+        assert (
+            async_tp_pass.compilation_config.splitting_ops
+            == vllm_config.compilation_config.splitting_ops
+        )
+        assert (
+            async_tp_pass.compilation_config.use_inductor_graph_partition
+            == vllm_config.compilation_config.use_inductor_graph_partition
+        )

-    hidden_states = torch.randn(
-        (batch_size * seq_len, hidden_size), dtype=dtype, requires_grad=False
-    )
+        model = test_model_cls(hidden_size, dtype)  # Pass dtype to model constructor
+
+        hidden_states = torch.randn(
+            (batch_size * seq_len, hidden_size), dtype=dtype, requires_grad=False
+        )

-    if dynamic:
-        torch._dynamo.mark_dynamic(hidden_states, 0)
+        if dynamic:
+            torch._dynamo.mark_dynamic(hidden_states, 0)

-    compiled_model = torch.compile(model, backend=backend)
-    compiled_model(hidden_states)
+        compiled_model = torch.compile(model, backend=backend)
+        compiled_model(hidden_states)

-    assert async_tp_pass.matched_count == 1
+        assert async_tp_pass.matched_count == 1

-    # In pre-nodes, all gather or reduce scatter should exist,
-    # fused_matmul_reduce_scatter or fused_all_gather_matmul should not
-    backend.check_before_ops(model.ops_in_model_before(), fully_replaced=False)
+        # In pre-nodes, all gather or reduce scatter should exist,
+        # fused_matmul_reduce_scatter or fused_all_gather_matmul should not
+        backend.check_before_ops(model.ops_in_model_before(), fully_replaced=False)

-    # In post-nodes, fused_matmul_reduce_scatter or \
-    # fused_all_gather_matmul should exist
-    backend.check_after_ops(model.ops_in_model_after())
+        # In post-nodes, fused_matmul_reduce_scatter or
+        # fused_all_gather_matmul should exist
+        backend.check_after_ops(model.ops_in_model_after())


 @create_new_process_for_each_test()

--- a/tests/compile/test_config.py
+++ b/tests/compile/test_config.py
@@ -430,7 +430,7 @@ def test_cudagraph_sizes_post_init(
        )


-def test_cached_compilation_config():
+def test_cached_compilation_config(default_vllm_config):
    import torch
    from torch._inductor.utils import run_and_get_code


--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -189,6 +189,17 @@ def dist_init():
    cleanup_dist_env_and_memory()


+@pytest.fixture
+def default_vllm_config():
+    """Set a default VllmConfig for tests that directly test CustomOps or pathways
+    that use get_current_vllm_config() outside of a full engine context.
+    """
+    from vllm.config import VllmConfig, set_current_vllm_config
+
+    with set_current_vllm_config(VllmConfig()):
+        yield
+
+
 @pytest.fixture()
 def should_do_global_cleanup_after_test(request) -> bool:
    """Allow subdirectories to skip global cleanup by overriding this fixture.

--- a/tests/kernels/attention/test_flashinfer_trtllm_attention.py
+++ b/tests/kernels/attention/test_flashinfer_trtllm_attention.py
@@ -458,7 +458,7 @@ def test_flashinfer_trtllm_prefill_with_baseline(
    )


-def test_trtllm_attention_rejects_num_kv_heads_1() -> None:
+def test_trtllm_attention_rejects_num_kv_heads_1(default_vllm_config) -> None:
    """Test that TRTLLM attention correctly rejects num_kv_heads=1.

    When num_kv_heads=1 (MQA), the KV cache strides become degenerate

--- a/tests/kernels/attention/test_mha_attn.py
+++ b/tests/kernels/attention/test_mha_attn.py
@@ -36,7 +36,7 @@ if current_platform.is_rocm():


 @pytest.mark.parametrize("device", devices)
-def test_mha_attn_platform(device: str):
+def test_mha_attn_platform(default_vllm_config, device: str):
    """
    Test the attention selector between different platform and device.
    """
@@ -116,6 +116,7 @@ CUDA_DEVICES = ["cuda"]
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 def test_mha_attn_forward(
+    default_vllm_config,
    batch_size: int,
    seq_len: int,
    num_heads: int,
@@ -162,6 +163,7 @@ def test_mha_attn_forward(
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 def test_mha_attn_varlen_forward(
+    default_vllm_config,
    var_seq_len: list[int],
    num_heads: int,
    num_kv_heads: int,

--- a/tests/kernels/core/test_activation.py
+++ b/tests/kernels/core/test_activation.py
@@ -45,6 +45,7 @@ CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 e
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
 def test_act_and_mul(
+    default_vllm_config,
    activation: str,
    num_tokens: int,
    d: int,
@@ -122,6 +123,7 @@ def test_act_and_mul(
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
 def test_activation(
+    default_vllm_config,
    activation: type[torch.nn.Module],
    num_tokens: int,
    d: int,

--- a/tests/kernels/core/test_fused_qk_norm_rope.py
+++ b/tests/kernels/core/test_fused_qk_norm_rope.py
@@ -57,6 +57,7 @@ def _apply_qk_norm_rope(
 @pytest.mark.parametrize("rotary_ratio", [1.0, 0.5, 0.25])
 @torch.inference_mode()
 def test_fused_qk_norm_rope_matches_reference(
+    default_vllm_config,
    device: str,
    dtype: torch.dtype,
    is_neox: bool,

--- a/tests/kernels/core/test_fused_quant_layernorm.py
+++ b/tests/kernels/core/test_fused_quant_layernorm.py
@@ -147,6 +147,7 @@ def ops_impl(
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
 def test_rms_norm(
+    default_vllm_config,
    num_tokens: int,
    hidden_size: int,
    add_residual: bool,

--- a/tests/kernels/core/test_layernorm.py
+++ b/tests/kernels/core/test_layernorm.py
@@ -26,6 +26,7 @@ CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 e
 @pytest.mark.parametrize("strided_input", [False, True])
 @torch.inference_mode()
 def test_rms_norm(
+    default_vllm_config,
    num_tokens: int,
    hidden_size: int,
    add_residual: bool,

--- a/tests/kernels/core/test_mrope.py
+++ b/tests/kernels/core/test_mrope.py
@@ -90,6 +90,7 @@ num_tokens_list = [11, 8192]
 @pytest.mark.parametrize("dtype", [torch.bfloat16])
 @pytest.mark.parametrize("num_tokens", num_tokens_list)
 def test_mrope(
+    default_vllm_config,
    model_name: str,
    model_info: MRoPETestInfo,
    tp_size: int,
@@ -159,6 +160,7 @@ def test_mrope(
 @pytest.mark.parametrize("dtype", [torch.bfloat16])
 @pytest.mark.parametrize("num_tokens", num_tokens_list)
 def test_mrope_torch_compile_tracing(
+    default_vllm_config,
    model_name: str,
    model_info: MRoPETestInfo,
    tp_size: int,

--- a/tests/kernels/core/test_pos_encoding.py
+++ b/tests/kernels/core/test_pos_encoding.py
@@ -62,6 +62,7 @@ TENSORS_SHAPES_FN = [
 @pytest.mark.parametrize("use_key", USE_KEY)
 @torch.inference_mode()
 def test_rotary_embedding(
+    default_vllm_config,
    is_neox_style: bool,
    tensor_shape_fn: Callable[[int, int, int, int], tuple[int, ...]],
    batch_size: int,
@@ -123,7 +124,7 @@ def test_rotary_embedding(


 @torch.inference_mode()
-def test_rope_module_cache():
+def test_rope_module_cache(default_vllm_config):
    MAX_POSITIONS = [123, 1234]
    ROPE_THETAS = [10000, 1000000]
    ROPE_PARAMETERS = (

--- a/tests/kernels/core/test_rotary_embedding.py
+++ b/tests/kernels/core/test_rotary_embedding.py
@@ -36,6 +36,7 @@ def rotary_embedding_opcheck(
 @pytest.mark.parametrize("use_key", [True, False])
 @pytest.mark.parametrize("head_stride_is_contiguous", [True, False])
 def test_rotary_embedding_opcheck(
+    default_vllm_config,
    dist_init,
    device,
    max_position,

--- a/tests/kernels/moe/test_cpu_fused_moe.py
+++ b/tests/kernels/moe/test_cpu_fused_moe.py
@@ -6,7 +6,7 @@ import torch

 from tests.kernels.allclose_default import get_default_atol, get_default_rtol
 from vllm._custom_ops import cpu_fused_moe, cpu_prepack_moe_weight
-from vllm.model_executor.layers.activation import SiluAndMul, SwigluOAIAndMul
+from vllm.model_executor.layers.fused_moe.cpu_fused_moe import _CPU_MOE_ACT
 from vllm.platforms import current_platform
 from vllm.utils.torch_utils import set_random_seed

@@ -24,11 +24,6 @@ USE_BIAS = [True, False]
 ISA = ["amx", "vec"] if torch._C._cpu._is_amx_tile_supported() else ["vec"]
 DTYPE = [torch.bfloat16]

-_CPU_MOE_ACT = {
-    "silu": SiluAndMul(),
-    "swigluoai": SwigluOAIAndMul(),
-}
-

 def ref_fused_moe(
    input: torch.Tensor,
@@ -106,6 +101,7 @@ def ref_fused_moe(
 @pytest.mark.parametrize("act", ACT)
 @pytest.mark.parametrize("isa", ISA)
 def test_cpu_fused_moe(
+    default_vllm_config,
    batch_size: int,
    expert_num: int,
    hidden_size: int,

--- a/tests/kernels/moe/test_moe.py
+++ b/tests/kernels/moe/test_moe.py
@@ -468,7 +468,12 @@ def test_fused_moe_wn16(
 )
 @torch.inference_mode()
 def test_mixtral_moe(
-    dist_init, dtype: torch.dtype, padding: bool, use_rocm_aiter: bool, monkeypatch
+    default_vllm_config,
+    dist_init,
+    dtype: torch.dtype,
+    padding: bool,
+    use_rocm_aiter: bool,
+    monkeypatch,
 ):
    """Make sure our Mixtral MoE implementation agrees with the one from
    huggingface."""

--- a/tests/kernels/quantization/test_fp8_quant_group.py
+++ b/tests/kernels/quantization/test_fp8_quant_group.py
@@ -23,7 +23,12 @@ from vllm.utils.torch_utils import set_random_seed
 @pytest.mark.parametrize("use_ue8m0", [True, False])
 @torch.inference_mode()
 def test_quantfp8_group_functionality(
-    batch_size: int, hidden_dim: int, group_size: int, seed: int, use_ue8m0: bool
+    default_vllm_config,
+    batch_size: int,
+    hidden_dim: int,
+    group_size: int,
+    seed: int,
+    use_ue8m0: bool,
 ) -> None:
    """Test QuantFP8 group quantization with various configurations.

@@ -82,7 +87,9 @@ def test_quantfp8_group_functionality(
 @pytest.mark.parametrize("seed", [42])
 @pytest.mark.parametrize("use_ue8m0", [True, False])
 @torch.inference_mode()
-def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None:
+def test_quantfp8_group_multidimensional(
+    default_vllm_config, seed: int, use_ue8m0: bool
+) -> None:
    set_random_seed(seed)

    group_size = 64
@@ -135,7 +142,7 @@ def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None:

 @pytest.mark.parametrize("seed", [42])
 @torch.inference_mode()
-def test_quantfp8_group_edge_cases(seed: int) -> None:
+def test_quantfp8_group_edge_cases(default_vllm_config, seed: int) -> None:
    set_random_seed(seed)

    batch_size = 16

--- a/tests/kernels/quantization/test_int8_kernel.py
+++ b/tests/kernels/quantization/test_int8_kernel.py
@@ -102,7 +102,7 @@ SEEDS = [0]
    itertools.product(M, N, K, E, TOP_KS, DTYPES, SEEDS),
 )
 @torch.inference_mode()
-def test_w8a8_fp8_fused_moe(M, N, K, E, topk, dtype, seed):
+def test_w8a8_fp8_fused_moe(default_vllm_config, M, N, K, E, topk, dtype, seed):
    torch.manual_seed(seed)
    # Initialize int8 quantization parameters
    factor_for_scale = 1e-2

--- a/tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
+++ b/tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
@@ -31,6 +31,7 @@ BLOCK_SIZE = 16
 @pytest.mark.parametrize("shape", SHAPES)
 @torch.inference_mode()
 def test_silu_mul_nvfp4_quant(
+    default_vllm_config,
    dtype: torch.dtype,
    shape: tuple[int, int],
 ) -> None:

--- a/tests/kernels/test_fused_quant_activation.py
+++ b/tests/kernels/test_fused_quant_activation.py
@@ -39,6 +39,7 @@ def ops_impl(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
 @pytest.mark.parametrize("device", CUDA_DEVICES)
 @torch.inference_mode()
 def test_silu_and_mul(
+    default_vllm_config,
    num_tokens: int,
    hidden_size: int,
    dtype: torch.dtype,

--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -82,7 +82,7 @@ class DummyLoRAModel(nn.Sequential, SupportsLoRA):


 @pytest.fixture
-def dummy_model() -> nn.Module:
+def dummy_model(default_vllm_config) -> nn.Module:
    model = DummyLoRAModel(
        OrderedDict(
            [
@@ -115,7 +115,7 @@ def dummy_model() -> nn.Module:


 @pytest.fixture
-def dummy_model_gate_up() -> nn.Module:
+def dummy_model_gate_up(default_vllm_config) -> nn.Module:
    model = DummyLoRAModel(
        OrderedDict(
            [

--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -252,7 +252,9 @@ def check_punica_wrapper(punica_wrapper) -> bool:
 @pytest.mark.parametrize("device", DEVICES)
 @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 128000])
 @pytest.mark.parametrize("stage", STAGES)
-def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
+def test_embeddings(
+    default_vllm_config, dist_init, num_loras, device, vocab_size, stage
+) -> None:
    # For multi-GPU testing of Triton kernel, we must explicitly set the CUDA
    # device, see: https://github.com/triton-lang/triton/issues/2925
    # Same below.
@@ -353,7 +355,7 @@ def test_embeddings(dist_init, num_loras, device, vocab_size, stage) -> None:
 @pytest.mark.parametrize("vocab_size", [512, 32000, 64000, 256512])
 @pytest.mark.parametrize("stage", STAGES)
 def test_lm_head_logits_processor(
-    dist_init, num_loras, device, vocab_size, stage
+    default_vllm_config, dist_init, num_loras, device, vocab_size, stage
 ) -> None:
    if current_platform.is_cuda_alike():
        torch.cuda.set_device(device)
@@ -470,6 +472,7 @@ def test_lm_head_logits_processor(
 @pytest.mark.parametrize("device", DEVICES)
 @pytest.mark.parametrize("stage", STAGES)
 def test_linear_replicated(
+    default_vllm_config,
    dist_init,
    num_loras,
    device,
@@ -580,7 +583,7 @@ def test_linear_replicated(
 @pytest.mark.parametrize("device", DEVICES)
 @pytest.mark.parametrize("stage", STAGES)
 def test_linear_parallel(
-    dist_init, num_loras, orientation, fully_shard, device, stage
+    default_vllm_config, dist_init, num_loras, orientation, fully_shard, device, stage
 ) -> None:
    if current_platform.is_cuda_alike():
        torch.cuda.set_device(device)
@@ -705,7 +708,7 @@ def test_linear_parallel(
 @pytest.mark.parametrize("device", DEVICES)
 @pytest.mark.parametrize("stage", STAGES)
 def test_column_parallel_packed(
-    dist_init, num_loras, repeats, fully_shard, device, stage
+    default_vllm_config, dist_init, num_loras, repeats, fully_shard, device, stage
 ) -> None:
    if current_platform.is_cuda_alike():
        torch.cuda.set_device(device)
@@ -851,7 +854,7 @@ def test_column_parallel_packed(
 @pytest.mark.parametrize(
    "seed", list(range(VOCAB_PARALLEL_EMBEDDING_TEST_NUM_RANDOM_SEEDS))
 )
-def test_vocab_parallel_embedding_indices(tp_size, seed):
+def test_vocab_parallel_embedding_indices(tp_size, seed, default_vllm_config):
    random.seed(seed)
    vocab_size = random.randint(4000, 64000)
    added_vocab_size = random.randint(0, 1024)