Merge tag 'v0.9.1' into v0.9.1-ori

cc7f22a8 · zhuwenwen · b9ea0c09 · b6553be1 · cc7f22a8 · cc7f22a8
Commit cc7f22a8 authored Jun 11, 2025 by zhuwenwen
20 changed files
--- a/tests/kernels/quantization/test_ggml.py
+++ b/tests/kernels/quantization/test_ggml.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import gguf
 import pytest

--- a/tests/kernels/quantization/test_gguf.py
+++ b/tests/kernels/quantization/test_gguf.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from pathlib import Path

--- a/tests/kernels/quantization/test_gptq.py
+++ b/tests/kernels/quantization/test_gptq.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import torch

--- a/tests/kernels/quantization/test_int8_kernel.py
+++ b/tests/kernels/quantization/test_int8_kernel.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 # Adapted from https://github.com/sgl-project/sglang/blob/main/test/srt/test_int8_kernel.py
 import itertools

--- a/tests/kernels/quantization/test_int8_quant.py
+++ b/tests/kernels/quantization/test_int8_quant.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import torch

--- a/tests/kernels/quantization/test_machete_mm.py
+++ b/tests/kernels/quantization/test_machete_mm.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for the machete kernel.
 Run `pytest tests/kernels/test_machete_mm.py`.

--- a/tests/kernels/quantization/test_marlin_gemm.py
+++ b/tests/kernels/quantization/test_marlin_gemm.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for the marlin kernel.
 Run `pytest tests/kernels/marlin/test_marlin_gemm.py`.

--- a/tests/kernels/quantization/test_nvfp4_quant.py
+++ b/tests/kernels/quantization/test_nvfp4_quant.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import torch

--- a/tests/kernels/quantization/test_nvfp4_scaled_mm.py
+++ b/tests/kernels/quantization/test_nvfp4_scaled_mm.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import torch
 from nvfp4_utils import (FLOAT4_E2M1_MAX, FLOAT8_E4M3_MAX,

--- a/tests/kernels/quantization/test_rocm_skinny_gemms.py
+++ b/tests/kernels/quantization/test_rocm_skinny_gemms.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import torch

--- a/tests/kernels/quantization/test_triton_scaled_mm.py
+++ b/tests/kernels/quantization/test_triton_scaled_mm.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for the triton_scaled_mm kernel
 Run `pytest tests/kernels/test_triton_scaled_mm.py`.

--- a/tests/kernels/test_apply_repetition_penalties.py
+++ b/tests/kernels/test_apply_repetition_penalties.py
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+import torch
+from tests.kernels.utils import opcheck
+from vllm._custom_ops import (apply_repetition_penalties_cuda,
+                              apply_repetition_penalties_torch)
+from vllm.platforms import current_platform
+# Batch sizes (number of rows in the logits tensor) the test is run with.
+NUM_SEQS = [1, 2, 3, 4, 8, 13, 17, 32, 37, 256, 1023, 1024, 1025]
+# Vocabulary sizes (number of columns in the logits tensor), in order:
+# [stress, stress, stress, Qwen, llama 4]
+VOCAB_SIZES = [17, 256, 1019, 151936, 202048]
+# Penalty value that is replicated for every sequence in the batch.
+REPETITION_PENALTY_VALUES = [1.05]
+# Seeds handed to current_platform.seed_everything() before data creation.
+SEEDS = [0]
+# dtypes used for both the logits tensor and the penalties tensor.
+DTYPES = [torch.float32, torch.float16]
+@pytest.mark.parametrize("num_seqs", NUM_SEQS)
+@pytest.mark.parametrize("vocab_size", VOCAB_SIZES)
+@pytest.mark.parametrize("repetition_penalty", REPETITION_PENALTY_VALUES)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.skipif(not current_platform.is_cuda(),
+                    reason="This test for checking CUDA kernel")
+@torch.inference_mode()
+def test_apply_repetition_penalties(
+    num_seqs: int,
+    vocab_size: int,
+    repetition_penalty: float,
+    dtype: torch.dtype,
+    seed: int,
+) -> None:
+    """
+    Check that the CUDA repetition-penalty op matches the torch one.
+
+    Random logits and random prompt/output token masks are built, then
+    ``apply_repetition_penalties_torch`` and
+    ``apply_repetition_penalties_cuda`` are each applied to their own copy
+    of the logits and the two results are compared with
+    ``rtol=1e-3, atol=1e-3``. Finally the registered op
+    ``torch.ops._C.apply_repetition_penalties_`` is passed through the
+    ``opcheck`` test utility.
+
+    Args:
+        num_seqs: Number of rows in the logits and mask tensors.
+        vocab_size: Number of columns in the logits and mask tensors.
+        repetition_penalty: Value used for every entry of the penalties
+            tensor.
+        dtype: dtype of the logits and penalties tensors.
+        seed: Seed passed to ``current_platform.seed_everything``.
+
+    The test is skipped when ``current_platform.is_cuda()`` is false.
+    """
+    current_platform.seed_everything(seed)
+    # Tensors created below without an explicit device land on cuda:0.
+    # NOTE(review): the default device is not restored afterwards, so this
+    # setting persists for whatever runs later in the same process.
+    torch.set_default_device("cuda:0")
+    # Create test data
+    logits = torch.randn(num_seqs, vocab_size, dtype=dtype)
+    # Create masks, initially with no token marked as repeated
+    prompt_mask = torch.zeros(num_seqs, vocab_size, dtype=torch.bool)
+    output_mask = torch.zeros(num_seqs, vocab_size, dtype=torch.bool)
+    # Draw the token ids to mark as repeated in prompt and output. Ids are
+    # sampled with replacement, so each row gets at most
+    # max(1, vocab_size // 200) distinct marked tokens.
+    prompt_indices = torch.randint(0, vocab_size,
+                                   (num_seqs, max(1, vocab_size // 200)))
+    output_indices = torch.randint(0, vocab_size,
+                                   (num_seqs, max(1, vocab_size // 200)))
+    for i in range(num_seqs):
+        prompt_mask[i, prompt_indices[i]] = True
+        output_mask[i, output_indices[i]] = True
+    # Create repetition penalties tensor (one identical value per sequence)
+    repetition_penalties = torch.full((num_seqs, ),
+                                      repetition_penalty,
+                                      dtype=dtype)
+    # Run both implementations, each on its own copy of the logits. Their
+    # return values are ignored, so both are expected to update the logits
+    # argument in place.
+    logits_torch = logits.clone()
+    logits_cuda = logits.clone()
+    apply_repetition_penalties_torch(logits_torch, prompt_mask, output_mask,
+                                     repetition_penalties)
+    apply_repetition_penalties_cuda(logits_cuda, prompt_mask, output_mask,
+                                    repetition_penalties)
+    # Compare the CUDA result against the torch result
+    torch.testing.assert_close(logits_torch, logits_cuda, rtol=1e-3, atol=1e-3)
+    # Test the operator by applying the opcheck utility (on a fresh copy of
+    # the logits, so the original tensor is left untouched)
+    opcheck(torch.ops._C.apply_repetition_penalties_,
+            (logits.clone(), prompt_mask, output_mask, repetition_penalties))
--- a/tests/kernels/test_cutlass_mla_decode.py
+++ b/tests/kernels/test_cutlass_mla_decode.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import torch
 import torch.nn.functional as F
@@ -75,7 +76,9 @@ def test_cutlass_mla_decode(dtype: torch.dtype, mean_seq_len: int, bs: int,
    pack_factor = 128 // block_size
    block_num = ((block_num + pack_factor - 1) // pack_factor) * pack_factor
-    q = torch.randn(bs, h_q, d)
+    # Amplify input values to ensure test coverage of edge cases where CUTLASS
+    # kernel errors occur with split_k settings.
+    q = torch.randn(bs, h_q, d) * 100
    block_table = torch.randint(0,
                                bs * block_num, (bs, block_num),
                                dtype=torch.int32)

--- a/tests/kernels/test_flex_attention.py
+++ b/tests/kernels/test_flex_attention.py
+# SPDX-License-Identifier: Apache-2.0
+"""Integration tests for FlexAttention backend vs default backend"""
+import random
+import numpy as np
+import pytest
+import torch
+from packaging import version
+from vllm import LLM, SamplingParams
+# Installed torch version and the oldest version the FlexAttention test runs
+# on; the test is skipped when TORCH_VERSION < MINIMUM_TORCH_VERSION.
+TORCH_VERSION = version.parse(torch.__version__)
+MINIMUM_TORCH_VERSION = version.parse("2.7.0")
+def set_seed(seed):
+    """Set seeds for reproducibility.
+
+    Seeds Python's ``random`` module, NumPy's global RNG and torch with the
+    same value. When CUDA is available, the CUDA generators of all devices
+    are seeded as well.
+
+    Args:
+        seed: Seed value passed unchanged to every RNG.
+    """
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed_all(seed)
+@pytest.mark.skipif(
+    not torch.cuda.is_available() or TORCH_VERSION < MINIMUM_TORCH_VERSION,
+    reason="CUDA not available or PyTorch version < 2.7",
+)
+def test_flex_attention_vs_default_backend(monkeypatch):
+    """Test that FlexAttention produces the same outputs as the default backend.
+
+    This test compares the outputs from the FlexAttention backend with
+    the default backend, ensuring they are identical when using the same seed.
+
+    The same model, prompts and sampling parameters are used for two
+    separate ``LLM`` instances; the first is created with
+    ``VLLM_ATTENTION_BACKEND=FLEX_ATTENTION`` set, the second without that
+    variable. The generated text of the first completion of every prompt
+    must match exactly.
+
+    Args:
+        monkeypatch: pytest's ``monkeypatch`` fixture, used to set the
+            environment variables for each run.
+    """
+    model_name = "Qwen/Qwen2.5-1.5B-Instruct"
+    seed = 42
+    max_tokens = 32
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+    ]
+    # temperature=0.0 -- presumably selects greedy decoding so both runs are
+    # deterministic; confirm against the SamplingParams documentation.
+    sampling_params = SamplingParams(temperature=0.0,
+                                     top_p=1.0,
+                                     seed=seed,
+                                     max_tokens=max_tokens)
+    # Run with flex attention. The environment changes made through the
+    # monkeypatch context are undone when the ``with`` block exits, so the
+    # backend override does not carry over into the second run.
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1")
+        m.setenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")
+        # NOTE(review): presumably keeps the V1 engine in this process --
+        # confirm against the vLLM environment variable documentation.
+        m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
+        set_seed(seed)
+        llm_flex = LLM(
+            model_name,
+            tensor_parallel_size=1,
+            num_gpu_blocks_override=128,
+            enforce_eager=True,
+        )
+        output_flex = llm_flex.generate(prompts, sampling_params)
+    # Run with default backend: identical setup, but VLLM_ATTENTION_BACKEND
+    # is not set here.
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1")
+        m.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "0")
+        set_seed(seed)
+        llm_default = LLM(
+            model_name,
+            tensor_parallel_size=1,
+            num_gpu_blocks_override=128,
+            enforce_eager=True,
+        )
+        output_default = llm_default.generate(prompts, sampling_params)
+    # Compare outputs from both backends, prompt by prompt.
+    # NOTE(review): zip() stops at the shorter sequence, so a differing
+    # number of results between the two runs would not be detected here.
+    for i, (flex_result,
+            default_result) in enumerate(zip(output_flex, output_default)):
+        prompt = prompts[i]
+        # Only the first completion of each request is compared.
+        flex_text = flex_result.outputs[0].text
+        default_text = default_result.outputs[0].text
+        assert flex_text == default_text, (
+            f"FlexAttention output doesn't match default for: {prompt!r}\n"
+            f"FlexAttention: {flex_text!r}\n"
+            f"Default: {default_text!r}")
+# Allow running this file directly: hands this file's path to pytest.
+if __name__ == "__main__":
+    pytest.main([__file__])
--- a/tests/kernels/test_fused_quant_activation.py
+++ b/tests/kernels/test_fused_quant_activation.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import pytest
 import torch

--- a/tests/kernels/test_triton_flash_attention.py
+++ b/tests/kernels/test_triton_flash_attention.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for the triton_flash_attention kernel
 Run `pytest tests/kernels/test_triton_flash_attention.py`.

--- a/tests/kernels/utils.py
+++ b/tests/kernels/utils.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Kernel test utils"""
 import itertools

--- a/tests/kv_transfer/test_disagg.py
+++ b/tests/kv_transfer/test_disagg.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
 import subprocess

--- a/tests/kv_transfer/test_lookup_buffer.py
+++ b/tests/kv_transfer/test_lookup_buffer.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import os
 import random

--- a/tests/kv_transfer/test_module.py
+++ b/tests/kv_transfer/test_module.py
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import subprocess
 import sys