Merge tag 'v0.7.3' into v0.7.3-dev

ec5e299c · zhuwenwen · 47bd229c · ed6e9075 · ec5e299c · ec5e299c
Commit ec5e299c authored Feb 21, 2025 by zhuwenwen
20 changed files
--- a/tests/entrypoints/offline_mode/test_offline_mode.py
+++ b/tests/entrypoints/offline_mode/test_offline_mode.py
@@ -5,6 +5,7 @@ import sys
 import os
 import pytest
+import urllib3
 from vllm import LLM
 from vllm.distributed import cleanup_dist_env_and_memory
@@ -31,6 +32,15 @@ MODEL_CONFIGS = [
        "tensor_parallel_size": 1,
        "tokenizer_mode": "mistral",
    },
+    {
+        "model": "sentence-transformers/all-MiniLM-L12-v2",
+        "enforce_eager": True,
+        "gpu_memory_utilization": 0.20,
+        "max_model_len": 64,
+        "max_num_batched_tokens": 64,
+        "max_num_seqs": 64,
+        "tensor_parallel_size": 1,
+    },
 ]
@@ -50,6 +60,16 @@ def test_offline_mode(monkeypatch):
    # Set HF to offline mode and ensure we can still construct an LLM
    try:
        monkeypatch.setenv("HF_HUB_OFFLINE", "1")
+        monkeypatch.setenv("VLLM_NO_USAGE_STATS", "1")
+        def disable_connect(*args, **kwargs):
+            raise RuntimeError("No http calls allowed")
+        monkeypatch.setattr(urllib3.connection.HTTPConnection, "connect",
+                            disable_connect)
+        monkeypatch.setattr(urllib3.connection.HTTPSConnection, "connect",
+                            disable_connect)
        # Need to re-import huggingface_hub and friends to setup offline mode
        _re_import_modules()
        # Cached model files should be used in offline mode
@@ -59,6 +79,7 @@ def test_offline_mode(monkeypatch):
        # Reset the environment after the test
        # NB: Assuming tests are run in online mode
        monkeypatch.delenv("HF_HUB_OFFLINE")
+        monkeypatch.delenv("VLLM_NO_USAGE_STATS")
        _re_import_modules()
        pass

--- a/tests/runai_model_streamer/__init__.py
+++ b/tests/runai_model_streamer/__init__.py
--- a/tests/entrypoints/openai/test_accuracy.py
+++ b/tests/entrypoints/openai/test_accuracy.py
@@ -14,7 +14,7 @@ import os
 from vllm.platforms import current_platform
-from ...utils import RemoteOpenAIServer, models_path_prefix
+from ....utils import RemoteOpenAIServer, models_path_prefix
 MODEL_NAME = os.path.join(models_path_prefix, "Qwen/Qwen2-1.5B-Instruct")
 NUM_CONCURRENT = 500
@@ -22,7 +22,7 @@ TASK = "gsm8k"
 FILTER = "exact_match,strict-match"
 RTOL = 0.03
 EXPECTED_VALUE = 0.58
-DEFAULT_ARGS = ["--max-model-len", "2048", "--disable-log-requests"]
+DEFAULT_ARGS = ["--max-model-len", "4096", "--disable-log-requests"]
 MORE_ARGS_LIST = [
    [],  # Default
    ["--enable-chunked-prefill"],  # Chunked
@@ -68,14 +68,21 @@ def run_test(more_args):
                ), f"Expected: {EXPECTED_VALUE} |  Measured: {measured_value}"
-@pytest.mark.skipif(not current_platform.is_cuda(),
+@pytest.mark.skipif(not current_platform.is_cuda()
-                    reason="V1 currently only supported on CUDA")
+                    and not current_platform.is_tpu(),
+                    reason="V1 currently only supported on CUDA and TPU")
 def test_lm_eval_accuracy_v1_engine(monkeypatch):
    """Run with the V1 Engine."""
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")
-        run_test([])
+        more_args = []
+        # Limit compilation time for V1
+        if current_platform.is_tpu():
+            more_args = ["--max-num-seqs", "64"]
+        run_test(more_args)
 @pytest.mark.parametrize("more_args", MORE_ARGS_LIST)

--- a/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
+++ b/tests/entrypoints/openai/correctness/test_transcription_api_correctness.py
+# SPDX-License-Identifier: Apache-2.0
+"""
+Evaluate Transcription API correctness by computing Word Error Rate (WER)
+on a given ASR dataset. When provided, it will also compare the WER against
+a baseline.
+This simulates real work usage of the API and makes sure that the frontend and
+AsyncLLMEngine are working correctly.
+"""
+import asyncio
+import io
+import time
+from statistics import mean, median
+from typing import List
+import librosa
+import pytest
+import soundfile
+import torch
+from datasets import load_dataset
+from evaluate import load
+from transformers import AutoTokenizer
+from ....utils import RemoteOpenAIServer
+def to_bytes(y, sr):
+    buffer = io.BytesIO()
+    soundfile.write(buffer, y, sr, format="WAV")
+    buffer.seek(0)
+    return buffer
+async def transcribe_audio(client, tokenizer, y, sr):
+    # Send loaded audio directly instead of loading from disk,
+    # dont account for that time though
+    with to_bytes(y, sr) as f:
+        start_time = time.perf_counter()
+        transcription = await client.audio.transcriptions.create(
+            file=f,
+            model=tokenizer.name_or_path,
+            language="en",
+            temperature=0.0,
+        )
+        end_time = time.perf_counter()
+        # NOTE there's no streaming in transcriptions, can't measure ttft
+    latency = end_time - start_time
+    num_output_tokens = len(
+        tokenizer(transcription.text, add_special_tokens=False).input_ids)
+    return latency, num_output_tokens, transcription.text
+async def bound_transcribe(model_name, sem, client, audio, reference):
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    # Use semaphore to limit concurrent requests.
+    async with sem:
+        result = await transcribe_audio(client, tokenizer, *audio)
+        # Normalize *english* output/reference for evaluation.
+        out = tokenizer.normalize(result[2])
+        ref = tokenizer.normalize(reference)
+        return result[:2] + (out, ref)
+async def process_dataset(model, client, data, concurrent_request):
+    sem = asyncio.Semaphore(concurrent_request)
+    # Warmup call as the first `librosa.load` server-side is quite slow.
+    audio, sr = data[0]["audio"]["array"], data[0]["audio"]["sampling_rate"]
+    _ = await bound_transcribe(model, sem, client, (audio, sr), "")
+    tasks: List[asyncio.Task] = []
+    for sample in data:
+        audio, sr = sample["audio"]["array"], sample["audio"]["sampling_rate"]
+        task = asyncio.create_task(
+            bound_transcribe(model, sem, client, (audio, sr), sample["text"]))
+        tasks.append(task)
+    return await asyncio.gather(*tasks)
+def print_performance_metrics(results, total_time):
+    latencies = [res[0] for res in results]
+    total_tokens = sum([res[1] for res in results])
+    total = len(results)
+    print(f"Total Requests: {total}")
+    print(f"Successful Requests: {len(latencies)}")
+    print(f"Average Latency: {mean(latencies):.4f} seconds")
+    print(f"Median Latency: {median(latencies):.4f} seconds")
+    perc = sorted(latencies)[int(len(latencies) * 0.95) - 1]
+    print(f"95th Percentile Latency: {perc:.4f} seconds")
+    # Throughput
+    req_throughput = len(latencies) / total_time
+    print(f"Estimated req_Throughput: {req_throughput:.2f} requests/s")
+    throughput = total_tokens / total_time
+    print(f"Estimated Throughput: {throughput:.2f} tok/s")
+def add_duration(sample):
+    y, sr = sample['audio']["array"], sample['audio']["sampling_rate"]
+    sample['duration_ms'] = librosa.get_duration(y=y, sr=sr) * 1000
+    return sample
+def load_hf_dataset(dataset_repo: str, split='validation', **hf_kwargs):
+    ## Load and filter the dataset
+    dataset = load_dataset(dataset_repo, split=split, **hf_kwargs)
+    if 'duration_ms' not in dataset[0]:
+        # compute duration to filter
+        dataset = dataset.map(add_duration)
+    # Whisper max supported duration
+    dataset = dataset.filter(lambda example: example['duration_ms'] < 30000)
+    return dataset
+def run_evaluation(model: str,
+                   client,
+                   dataset,
+                   max_concurrent_reqs: int,
+                   n_examples: int = -1,
+                   print_metrics: bool = True):
+    if n_examples > 0:
+        dataset = dataset.select(range(n_examples))
+    start = time.perf_counter()
+    results = asyncio.run(
+        process_dataset(model, client, dataset, max_concurrent_reqs))
+    end = time.perf_counter()
+    total_time = end - start
+    print(f"Total Test Time: {total_time:.4f} seconds")
+    if print_metrics:
+        print_performance_metrics(results, total_time)
+    # Compute WER
+    predictions = [res[2] for res in results]
+    references = [res[3] for res in results]
+    wer = load("wer")
+    wer_score = 100 * wer.compute(references=references,
+                                  predictions=predictions)
+    print("WER:", wer_score)
+    return wer_score
+# alternatives "openai/whisper-large-v2", "openai/whisper-large-v3-turbo"..
+@pytest.mark.parametrize("model_name", ["openai/whisper-large-v3"])
+# Original dataset is 20GB+ in size, hence we use a pre-filtered slice.
+@pytest.mark.parametrize(
+    "dataset_repo", ["D4nt3/esb-datasets-earnings22-validation-tiny-filtered"])
+# NOTE: Expected WER measured with equivalent hf.transformers args:
+# whisper-large-v3 + esb-datasets-earnings22-validation-tiny-filtered.
+@pytest.mark.parametrize("expected_wer", [12.744980])
+def test_wer_correctness(model_name,
+                         dataset_repo,
+                         expected_wer,
+                         n_examples=-1,
+                         max_concurrent_request=None):
+    with RemoteOpenAIServer(model_name, ['--enforce-eager']) as remote_server:
+        dataset = load_hf_dataset(dataset_repo)
+        if not max_concurrent_request:
+            # No max concurrency
+            max_concurrent_request = n_examples if n_examples > 0\
+                else len(dataset)
+        client = remote_server.get_async_client()
+        wer = run_evaluation(model_name, client, dataset,
+                             max_concurrent_request, n_examples)
+        if expected_wer:
+            torch.testing.assert_close(wer, expected_wer, atol=1e-1, rtol=1e-2)
--- a/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py
+++ b/tests/entrypoints/openai/reasoning_parsers/test_deepseekr1_reasoning_parser.py
@@ -15,32 +15,62 @@ start_token = "<think>"
 end_token = "</think>"
 SIMPLE_REASONING = {
-    "output": "<think>This is a reasoning section</think>This is the rest",
+    "output": "This is a reasoning section</think>This is the rest",
    "reasoning_content": "This is a reasoning section",
    "content": "This is the rest",
 }
 COMPLETE_REASONING = {
-    "output": "<think>This is a reasoning section</think>",
+    "output": "This is a reasoning section</think>",
    "reasoning_content": "This is a reasoning section",
    "content": None,
 }
-NO_REASONING = {
+NO_CONTENT = {
+    "output": "This is content",
+    "reasoning_content": "This is content",
+    "content": None,
+}
+NO_REASONING_STREAMING = {
    "output": "This is a reasoning section",
-    "reasoning_content": None,
+    "reasoning_content": "This is a reasoning section",
-    "content": "This is a reasoning section",
+    "content": None,
 }
 MULTIPLE_LINES = {
-    "output": "<think>This\nThat</think>This is the rest\nThat",
+    "output": "This\nThat</think>This is the rest\nThat",
    "reasoning_content": "This\nThat",
    "content": "This is the rest\nThat",
 }
 SHORTEST_REASONING_NO_STREAMING = {
-    "output": "<think></think>This is the rest",
+    "output": "</think>This is the rest",
    "reasoning_content": "",
    "content": "This is the rest",
 }
 SHORTEST_REASONING = {
-    "output": "<think></think>This is the rest",
+    "output": "</think>This is the rest",
+    "reasoning_content": None,
+    "content": "This is the rest",
+}
+REASONING_WITH_THINK = {
+    "output": "<think>This is a reasoning section</think>This is the rest",
+    "reasoning_content": "This is a reasoning section",
+    "content": "This is the rest",
+}
+COMPLETE_REASONING_WITH_THINK = {
+    "output": "<think>This is a reasoning section</think>",
+    "reasoning_content": "This is a reasoning section",
+    "content": None,
+}
+MULTIPLE_LINES_WITH_THINK = {
+    "output": "<think>This\nThat</think>This is the rest\nThat",
+    "reasoning_content": "This\nThat",
+    "content": "This is the rest\nThat",
+}
+SHORTEST_REASONING_NO_STREAMING_WITH_THINK = {
+    "output": "</think>This is the rest",
+    "reasoning_content": "",
+    "content": "This is the rest",
+}
+SHORTEST_REASONING_WITH_THINK = {
+    "output": "</think>This is the rest",
    "reasoning_content": None,
    "content": "This is the rest",
 }
@@ -49,37 +79,37 @@ TEST_CASES = [
    pytest.param(
        False,
        SIMPLE_REASONING,
-        id="simple_streaming",
+        id="simple_reasoning",
    ),
    pytest.param(
        True,
        SIMPLE_REASONING,
-        id="simple_streaming",
+        id="simple_reasoning_streaming",
    ),
    pytest.param(
        False,
        COMPLETE_REASONING,
-        id="complete_streaming",
+        id="complete_reasoning",
    ),
    pytest.param(
        True,
        COMPLETE_REASONING,
-        id="complete_streaming",
+        id="complete_reasoning_streaming",
    ),
    pytest.param(
        False,
-        NO_REASONING,
+        NO_CONTENT,
-        id="no_streaming",
+        id="no_content_token",
    ),
    pytest.param(
        True,
-        NO_REASONING,
+        NO_REASONING_STREAMING,
-        id="no_streaming",
+        id="no_reasoning_token_streaming",
    ),
    pytest.param(
        False,
        MULTIPLE_LINES,
-        id="multiple_lines_streaming",
+        id="multiple_lines",
    ),
    pytest.param(
        True,
@@ -89,23 +119,65 @@ TEST_CASES = [
    pytest.param(
        True,
        SHORTEST_REASONING,
-        id="shortest_streaming",
+        id="shortest",
    ),
    pytest.param(
        False,
        SHORTEST_REASONING_NO_STREAMING,
        id="shortest_streaming",
    ),
+    pytest.param(
+        False,
+        REASONING_WITH_THINK,
+        id="reasoning_with_think",
+    ),
+    pytest.param(
+        True,
+        REASONING_WITH_THINK,
+        id="reasoning_with_think_streaming",
+    ),
+    pytest.param(
+        False,
+        COMPLETE_REASONING_WITH_THINK,
+        id="complete_reasoning_with_think",
+    ),
+    pytest.param(
+        True,
+        COMPLETE_REASONING_WITH_THINK,
+        id="complete_reasoning_with_think_streaming",
+    ),
+    pytest.param(
+        False,
+        MULTIPLE_LINES_WITH_THINK,
+        id="multiple_lines_with_think",
+    ),
+    pytest.param(
+        True,
+        MULTIPLE_LINES_WITH_THINK,
+        id="multiple_lines_with_think_streaming",
+    ),
+    pytest.param(
+        False,
+        SHORTEST_REASONING_NO_STREAMING_WITH_THINK,
+        id="shortest_with_think",
+    ),
+    pytest.param(
+        True,
+        SHORTEST_REASONING_WITH_THINK,
+        id="shortest_with_think_streaming",
+    ),
 ]
+# Global tokenizer initialization to avoid repeated loading
+tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
+tokenizer.add_tokens([start_token, end_token])
 @pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
 def test_reasoning(
    streaming: bool,
    param_dict: dict,
 ):
-    tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
-    tokenizer.add_tokens([start_token, end_token])
    output = tokenizer.tokenize(param_dict["output"])
    # decode everything to tokens
    output_tokens: List[str] = [

--- a/tests/entrypoints/openai/test_audio.py
+++ b/tests/entrypoints/openai/test_audio.py
@@ -12,10 +12,7 @@ from vllm.multimodal.utils import encode_audio_base64, fetch_audio
 from ...utils import RemoteOpenAIServer, models_path_prefix
-MODEL_NAME = os.path.join(models_path_prefix, "fixie-ai/ultravox-v0_3")
+MODEL_NAME = os.path.join(models_path_prefix, "fixie-ai/ultravox-v0_5-llama-3_2-1b")
-# TEST_AUDIO_URLS = [
-#     AudioAsset("winning_call").url,
-# ]
 TEST_AUDIO_URLS = [
    "http://localhost:8000/winning_call.ogg"
 ]
@@ -86,7 +83,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    assert chat_completion.usage == openai.types.CompletionUsage(
-        completion_tokens=10, prompt_tokens=202, total_tokens=212)
+        completion_tokens=10, prompt_tokens=201, total_tokens=211)
    message = choice.message
    message = chat_completion.choices[0].message
@@ -143,7 +140,7 @@ async def test_single_chat_session_audio_base64encoded(
    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    assert chat_completion.usage == openai.types.CompletionUsage(
-        completion_tokens=10, prompt_tokens=202, total_tokens=212)
+        completion_tokens=10, prompt_tokens=201, total_tokens=211)
    message = choice.message
    message = chat_completion.choices[0].message
@@ -199,7 +196,7 @@ async def test_single_chat_session_input_audio(
    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    assert chat_completion.usage == openai.types.CompletionUsage(
-        completion_tokens=10, prompt_tokens=202, total_tokens=212)
+        completion_tokens=10, prompt_tokens=201, total_tokens=211)
    message = choice.message
    message = chat_completion.choices[0].message

--- a/tests/entrypoints/openai/test_basic.py
+++ b/tests/entrypoints/openai/test_basic.py
@@ -157,3 +157,19 @@ async def test_request_cancellation(server: RemoteOpenAIServer):
                                                    max_tokens=10)
    assert len(response.choices) == 1
+@pytest.mark.asyncio
+async def test_request_wrong_content_type(server: RemoteOpenAIServer):
+    chat_input = [{"role": "user", "content": "Write a long story"}]
+    client = server.get_async_client()
+    with pytest.raises(openai.APIStatusError):
+        await client.chat.completions.create(
+            messages=chat_input,
+            model=MODEL_NAME,
+            max_tokens=10000,
+            extra_headers={
+                "Content-Type": "application/x-www-form-urlencoded"
+            })
--- a/tests/entrypoints/openai/test_metrics.py
+++ b/tests/entrypoints/openai/test_metrics.py
@@ -86,6 +86,10 @@ EXPECTED_VALUES = {
    "vllm:time_per_output_token_seconds":
    [("_count", _NUM_REQUESTS * (_NUM_GENERATION_TOKENS_PER_REQUEST - 1))],
    "vllm:e2e_request_latency_seconds": [("_count", _NUM_REQUESTS)],
+    "vllm:request_queue_time_seconds": [("_count", _NUM_REQUESTS)],
+    "vllm:request_inference_time_seconds": [("_count", _NUM_REQUESTS)],
+    "vllm:request_prefill_time_seconds": [("_count", _NUM_REQUESTS)],
+    "vllm:request_decode_time_seconds": [("_count", _NUM_REQUESTS)],
    "vllm:request_prompt_tokens":
    [("_sum", _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST),
     ("_count", _NUM_REQUESTS)],
@@ -93,9 +97,14 @@ EXPECTED_VALUES = {
    [("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
     ("_count", _NUM_REQUESTS)],
    "vllm:request_params_n": [("_count", _NUM_REQUESTS)],
-    "vllm:request_params_max_tokens":
+    "vllm:request_params_max_tokens": [
-    [("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
+        ("_sum", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST),
-     ("_count", _NUM_REQUESTS)],
+        ("_count", _NUM_REQUESTS)
+    ],
+    "vllm:iteration_tokens_total":
+    [("_sum", _NUM_REQUESTS *
+      (_NUM_PROMPT_TOKENS_PER_REQUEST + _NUM_GENERATION_TOKENS_PER_REQUEST)),
+     ("_count", _NUM_REQUESTS * _NUM_GENERATION_TOKENS_PER_REQUEST)],
    "vllm:prompt_tokens": [("_total",
                            _NUM_REQUESTS * _NUM_PROMPT_TOKENS_PER_REQUEST)],
    "vllm:generation_tokens": [
@@ -170,6 +179,18 @@ EXPECTED_METRICS = [
    "vllm:e2e_request_latency_seconds_sum",
    "vllm:e2e_request_latency_seconds_bucket",
    "vllm:e2e_request_latency_seconds_count",
+    "vllm:request_queue_time_seconds_sum",
+    "vllm:request_queue_time_seconds_bucket",
+    "vllm:request_queue_time_seconds_count",
+    "vllm:request_inference_time_seconds_sum",
+    "vllm:request_inference_time_seconds_bucket",
+    "vllm:request_inference_time_seconds_count",
+    "vllm:request_prefill_time_seconds_sum",
+    "vllm:request_prefill_time_seconds_bucket",
+    "vllm:request_prefill_time_seconds_count",
+    "vllm:request_decode_time_seconds_sum",
+    "vllm:request_decode_time_seconds_bucket",
+    "vllm:request_decode_time_seconds_count",
    "vllm:request_prompt_tokens_sum",
    "vllm:request_prompt_tokens_bucket",
    "vllm:request_prompt_tokens_count",
@@ -182,6 +203,7 @@ EXPECTED_METRICS = [
    "vllm:request_params_max_tokens_sum",
    "vllm:request_params_max_tokens_bucket",
    "vllm:request_params_max_tokens_count",
+    "vllm:iteration_tokens_total",
    "vllm:num_preemptions_total",
    "vllm:prompt_tokens_total",
    "vllm:generation_tokens_total",
@@ -204,8 +226,11 @@ EXPECTED_METRICS_V1 = [
    "vllm:num_requests_running",
    "vllm:num_requests_waiting",
    "vllm:gpu_cache_usage_perc",
+    "vllm:gpu_prefix_cache_queries",
+    "vllm:gpu_prefix_cache_hits",
    "vllm:prompt_tokens_total",
    "vllm:generation_tokens_total",
+    "vllm:iteration_tokens_total",
    "vllm:request_success_total",
    "vllm:request_prompt_tokens_sum",
    "vllm:request_prompt_tokens_bucket",
@@ -219,6 +244,21 @@ EXPECTED_METRICS_V1 = [
    "vllm:time_per_output_token_seconds_sum",
    "vllm:time_per_output_token_seconds_bucket",
    "vllm:time_per_output_token_seconds_count",
+    "vllm:e2e_request_latency_seconds_sum",
+    "vllm:e2e_request_latency_seconds_bucket",
+    "vllm:e2e_request_latency_seconds_count",
+    "vllm:request_queue_time_seconds_sum",
+    "vllm:request_queue_time_seconds_bucket",
+    "vllm:request_queue_time_seconds_count",
+    "vllm:request_inference_time_seconds_sum",
+    "vllm:request_inference_time_seconds_bucket",
+    "vllm:request_inference_time_seconds_count",
+    "vllm:request_prefill_time_seconds_sum",
+    "vllm:request_prefill_time_seconds_bucket",
+    "vllm:request_prefill_time_seconds_count",
+    "vllm:request_decode_time_seconds_sum",
+    "vllm:request_decode_time_seconds_bucket",
+    "vllm:request_decode_time_seconds_count",
 ]

--- a/tests/entrypoints/openai/test_rerank.py
+++ b/tests/entrypoints/openai/test_rerank.py
@@ -86,4 +86,4 @@ def test_rerank_max_model_len(server: RemoteOpenAIServer, model_name: str):
    assert rerank_response.status_code == 400
    # Assert just a small fragments of the response
    assert "Please reduce the length of the input." in \
        rerank_response.text
\ No newline at end of file
--- a/tests/entrypoints/openai/test_serving_models.py
+++ b/tests/entrypoints/openai/test_serving_models.py
@@ -16,7 +16,7 @@ from vllm.entrypoints.openai.serving_models import (BaseModelPath,
 from vllm.lora.request import LoRARequest
 from ...utils import models_path_prefix
-MODEL_NAME = os.path.join(models_path_prefix, "meta-llama/Llama-2-7b")
+MODEL_NAME = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")
 BASE_MODEL_PATHS = [BaseModelPath(name=MODEL_NAME, model_path=MODEL_NAME)]
 LORA_LOADING_SUCCESS_MESSAGE = (
    "Success: LoRA adapter '{lora_name}' added successfully.")

--- a/tests/entrypoints/openai/test_shutdown.py
+++ b/tests/entrypoints/openai/test_shutdown.py
 # SPDX-License-Identifier: Apache-2.0
+import os
 import openai
 import pytest
 from ...utils import RemoteOpenAIServer, models_path_prefix
-MODEL_NAME = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B")
+MODEL_NAME = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-1B-Instruct")
 @pytest.mark.asyncio

--- a/tests/entrypoints/openai/test_sleep.py
+++ b/tests/entrypoints/openai/test_sleep.py
+# SPDX-License-Identifier: Apache-2.0
+import requests
+from ...utils import RemoteOpenAIServer
+MODEL_NAME = "meta-llama/Llama-3.2-1B"
+def test_sleep_mode():
+    # dtype, max-len etc set so that this can run in CI
+    args = [
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "8192",
+        "--max-num-seqs",
+        "128",
+        "--enable-sleep-mode",
+    ]
+    with RemoteOpenAIServer(MODEL_NAME,
+                            args,
+                            env_dict={
+                                "VLLM_SERVER_DEV_MODE": "1",
+                                "CUDA_VISIBLE_DEVICES": "0"
+                            }) as remote_server:
+        response = requests.post(remote_server.url_for("/sleep"),
+                                 data={"level": "1"})
+        assert response.status_code == 200
+        response = requests.post(remote_server.url_for("/wake_up"))
+        assert response.status_code == 200
--- a/tests/entrypoints/openai/test_transcription_validation.py
+++ b/tests/entrypoints/openai/test_transcription_validation.py
+# SPDX-License-Identifier: Apache-2.0
+# imports for guided decoding tests
+import io
+import json
+import librosa
+import numpy as np
+import openai
+import pytest
+import soundfile as sf
+from vllm.assets.audio import AudioAsset
+from ...utils import RemoteOpenAIServer
+@pytest.fixture
+def mary_had_lamb():
+    path = AudioAsset('mary_had_lamb').get_local_path()
+    with open(str(path), "rb") as f:
+        yield f
+@pytest.fixture
+def winning_call():
+    path = AudioAsset('winning_call').get_local_path()
+    with open(str(path), "rb") as f:
+        yield f
+@pytest.mark.asyncio
+async def test_basic_audio(mary_had_lamb):
+    model_name = "openai/whisper-large-v3-turbo"
+    server_args = ["--enforce-eager"]
+    # Based on https://github.com/openai/openai-cookbook/blob/main/examples/Whisper_prompting_guide.ipynb.
+    prompt = "THE FIRST WORDS I SPOKE"
+    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+        client = remote_server.get_async_client()
+        transcription = await client.audio.transcriptions.create(
+            model=model_name,
+            file=mary_had_lamb,
+            language="en",
+            response_format="text",
+            temperature=0.0)
+        out = json.loads(transcription)['text']
+        assert "Mary had a little lamb," in out
+        # This should "force" whisper to continue prompt in all caps
+        transcription_wprompt = await client.audio.transcriptions.create(
+            model=model_name,
+            file=mary_had_lamb,
+            language="en",
+            response_format="text",
+            prompt=prompt,
+            temperature=0.0)
+        out_capital = json.loads(transcription_wprompt)['text']
+        assert prompt not in out_capital
+@pytest.mark.asyncio
+async def test_bad_requests(mary_had_lamb):
+    model_name = "openai/whisper-small"
+    server_args = ["--enforce-eager"]
+    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+        client = remote_server.get_async_client()
+        # invalid language
+        with pytest.raises(openai.BadRequestError):
+            await client.audio.transcriptions.create(model=model_name,
+                                                     file=mary_had_lamb,
+                                                     language="hh",
+                                                     temperature=0.0)
+        # Expect audio too long: repeat the timeseries
+        mary_had_lamb.seek(0)
+        audio, sr = librosa.load(mary_had_lamb)
+        repeated_audio = np.tile(audio, 10)
+        # Repeated audio to buffer
+        buffer = io.BytesIO()
+        sf.write(buffer, repeated_audio, sr, format='WAV')
+        buffer.seek(0)
+        with pytest.raises(openai.BadRequestError):
+            await client.audio.transcriptions.create(model=model_name,
+                                                     file=buffer,
+                                                     language="en",
+                                                     temperature=0.0)
+@pytest.mark.asyncio
+async def test_non_asr_model(winning_call):
+    # text to text model
+    model_name = "JackFram/llama-68m"
+    server_args = ["--enforce-eager"]
+    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+        client = remote_server.get_async_client()
+        res = await client.audio.transcriptions.create(model=model_name,
+                                                       file=winning_call,
+                                                       language="en",
+                                                       temperature=0.0)
+        assert res.code == 400 and not res.text
+        assert res.message == "The model does not support Transcriptions API"
+@pytest.mark.asyncio
+async def test_completion_endpoints():
+    # text to text model
+    model_name = "openai/whisper-small"
+    server_args = ["--enforce-eager"]
+    with RemoteOpenAIServer(model_name, server_args) as remote_server:
+        client = remote_server.get_async_client()
+        res = await client.chat.completions.create(
+            model=model_name,
+            messages=[{
+                "role": "system",
+                "content": "You are a helpful assistant."
+            }])
+        assert res.code == 400
+        assert res.message == "The model does not support Chat Completions API"
+        res = await client.completions.create(model=model_name, prompt="Hello")
+        assert res.code == 400
+        assert res.message == "The model does not support Completions API"
--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@@ -101,7 +101,7 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI,
    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    assert chat_completion.usage == openai.types.CompletionUsage(
-        completion_tokens=10, prompt_tokens=775, total_tokens=785)
+        completion_tokens=10, prompt_tokens=774, total_tokens=784)
    message = choice.message
    message = chat_completion.choices[0].message
@@ -194,7 +194,7 @@ async def test_single_chat_session_image_base64encoded(
    choice = chat_completion.choices[0]
    assert choice.finish_reason == "length"
    assert chat_completion.usage == openai.types.CompletionUsage(
-        completion_tokens=10, prompt_tokens=775, total_tokens=785)
+        completion_tokens=10, prompt_tokens=774, total_tokens=784)
    message = choice.message
    message = chat_completion.choices[0].message

--- a/tests/entrypoints/openai/test_vision_embedding.py
+++ b/tests/entrypoints/openai/test_vision_embedding.py
@@ -101,5 +101,5 @@ async def test_image_embedding(server: RemoteOpenAIServer, model_name: str,
    assert len(embeddings.data) == 1
    assert len(embeddings.data[0].embedding) == 3072
    assert embeddings.usage.completion_tokens == 0
-    assert embeddings.usage.prompt_tokens == 764
+    assert embeddings.usage.prompt_tokens == 763
-    assert embeddings.usage.total_tokens == 764
+    assert embeddings.usage.total_tokens == 763
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -23,7 +23,7 @@ from ..utils import VLLM_PATH
 EXAMPLES_DIR = VLLM_PATH / "examples"
 PHI3V_MODEL_ID = os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct")
-ULTRAVOX_MODEL_ID = os.path.join(models_path_prefix, "fixie-ai/ultravox-v0_3")
+ULTRAVOX_MODEL_ID = os.path.join(models_path_prefix, "fixie-ai/ultravox-v0_5-llama-3_2-1b")
 QWEN2VL_MODEL_ID = os.path.join(models_path_prefix, "Qwen/Qwen2-VL-2B-Instruct")
 MLLAMA_MODEL_ID = os.path.join(models_path_prefix, "meta-llama/Llama-3.2-11B-Vision-Instruct")
 LLAMA_GUARD_MODEL_ID = os.path.join(models_path_prefix, "meta-llama/Llama-Guard-3-1B")

--- a/tests/kernels/test_cutlass_2of4_sparse.py
+++ b/tests/kernels/test_cutlass_2of4_sparse.py
@@ -7,7 +7,6 @@ from typing import Tuple, Type
 import pytest
 import torch
-import torch.nn.functional as F
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
@@ -55,11 +54,39 @@ def prune_to_2_4(tensor):
    return pruned.reshape(original_shape)
+# This function checks that applying an identity matrix multiplication
+# to the compressed weights yields the original uncompressed weights.
+def check_compress_decompress_invariance(dtype: torch.dtype, b: torch.Tensor,
+                                         b_compressed: torch.Tensor,
+                                         b_metadata: torch.Tensor):
+    # For float16 and bfloat16, cutlass_scaled_sparse_mm's output must be the
+    # same dtype as its inputs. This line addresses that constraint while
+    # arbitrarily using bfloat16 for the int8/fp8 cases.
+    out_dtype = torch.float16 if dtype is torch.float16 else torch.bfloat16
+    eye = torch.eye(b.shape[0], device='cuda', dtype=dtype)
+    eye_scale = torch.ones(1, device='cuda', dtype=torch.float32)
+    b_decomp = ops.cutlass_scaled_sparse_mm(eye,
+                                            b_compressed,
+                                            b_metadata,
+                                            eye_scale,
+                                            eye_scale,
+                                            out_dtype=out_dtype)
+    torch.testing.assert_close(b.to(dtype=out_dtype), b_decomp)
 def make_rand_sparse_tensors(
        dtype: torch.dtype, m: int, n: int, k: int
 ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-    a = torch.randn((m, k), device='cuda') * 5
+    a = torch.randn((m, k), device='cuda')
-    b = torch.randn((n, k), device='cuda').t() * 5
+    b = torch.randn((n, k), device='cuda').t()
+    if dtype == torch.int8:
+        # ensure A and B aren't all zeros after rounding
+        a = a * 5.0
+        b = b * 5.0
    b = prune_to_2_4(b.t()).t()
@@ -75,6 +102,7 @@ def make_rand_sparse_tensors(
        raise ValueError("unsupported dtype")
    b_compressed, e = ops.cutlass_sparse_compress(b.t())
+    check_compress_decompress_invariance(dtype, b, b_compressed, e)
    # Compressed B, Metadata, Original A, B
    return b_compressed, e, a, b
@@ -134,27 +162,37 @@ MNK_FACTORS = [
 # Test working with a subset of A and B for sparse matmul
-@pytest.mark.skip(reason="2of4 sparse w16a16 CUTLASS produces bad output.")
 @pytest.mark.skipif(not sparse_cutlass_supported(),
                    reason="Sparse CUTLASS is not supported on this GPU type.")
-@pytest.mark.parametrize("m, k, n", MNK_FACTORS)
+@pytest.mark.parametrize("m, n, k", MNK_FACTORS)
 @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
-def test_cutlass_sparse_gemm(m: int, k: int, n: int, dtype: Type[torch.dtype]):
+@pytest.mark.parametrize("use_bias", [True, False])
+def test_cutlass_sparse_gemm(m: int, k: int, n: int, dtype: Type[torch.dtype],
+                             use_bias: bool):
    # Create tensors
    b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k)
    scale_a = torch.ones((1, 1), device="cuda", dtype=torch.float32)
    scale_b = torch.ones((1, 1), device="cuda", dtype=torch.float32)
+    bias = torch.rand((n, ), device="cuda", dtype=dtype) if use_bias else None
    out = ops.cutlass_scaled_sparse_mm(a,
                                       b_comp,
                                       e,
                                       scale_a,
                                       scale_b,
-                                       out_dtype=dtype)
+                                       out_dtype=dtype,
-    baseline = F.linear(a, b.T)
+                                       bias=bias)
-    torch.testing.assert_close(out, baseline, rtol=1e-2, atol=1e-2)
+    baseline = baseline_scaled_mm(a,
+                                  b,
+                                  scale_a,
+                                  scale_b,
+                                  out_dtype=dtype,
+                                  bias=bias)
+    torch.testing.assert_close(out, baseline, rtol=1e-2, atol=3e-1)
 @pytest.mark.skipif(not sparse_cutlass_supported(),
@@ -162,27 +200,34 @@ def test_cutlass_sparse_gemm(m: int, k: int, n: int, dtype: Type[torch.dtype]):
 @pytest.mark.parametrize("m, k, n", MNK_FACTORS)
 @pytest.mark.skipif(not current_platform.has_device_capability(89),
                    reason="FP8 is not supported on this GPU type.")
-def test_cutlass_sparse_fp8_gemm(m: int, n: int, k: int):
+@pytest.mark.parametrize("use_bias", [True, False])
+def test_cutlass_sparse_fp8_gemm(m: int, n: int, k: int, use_bias: bool):
    # Create tensors
    b_comp, e, a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, m, n, k)
    scale_a = (torch.randn((1, 1), device="cuda", dtype=torch.float32))
    scale_b = (torch.randn((1, 1), device="cuda", dtype=torch.float32))
+    out_dtype = torch.bfloat16
+    bias = torch.rand(
+        (n, ), device="cuda", dtype=out_dtype) * 10 if use_bias else None
    out = ops.cutlass_scaled_sparse_mm(a,
                                       b_comp,
                                       e,
                                       scale_a,
                                       scale_b,
-                                       out_dtype=torch.bfloat16)
+                                       out_dtype=out_dtype,
+                                       bias=bias)
    baseline = baseline_scaled_mm(a,
                                  b,
                                  scale_a,
                                  scale_b,
-                                  out_dtype=torch.bfloat16)
+                                  out_dtype=out_dtype,
+                                  bias=bias)
-    torch.testing.assert_close(out, baseline, rtol=1e0, atol=2e0)
+    torch.testing.assert_close(out, baseline, rtol=1e-2, atol=3e-1)
 @pytest.mark.skipif(not sparse_cutlass_supported(),
@@ -198,18 +243,24 @@ def test_cutlass_sparse_int8_gemm(m: int, n: int, k: int, per_act_token: bool,
    b_comp, e, a, b = make_rand_sparse_tensors(torch.int8, m, n, k)
    scale_a = (torch.randn((1, 1), device="cuda", dtype=torch.float32))
    scale_b = (torch.randn((1, 1), device="cuda", dtype=torch.float32))
+    out_dtype = torch.bfloat16
+    bias = torch.rand(
+        (n, ), device="cuda", dtype=out_dtype) * 10 if use_bias else None
    out = ops.cutlass_scaled_sparse_mm(a,
                                       b_comp,
                                       e,
                                       scale_a,
                                       scale_b,
-                                       out_dtype=torch.bfloat16)
+                                       out_dtype=out_dtype,
+                                       bias=bias)
    baseline = baseline_scaled_mm(a,
                                  b,
                                  scale_a,
                                  scale_b,
-                                  out_dtype=torch.bfloat16)
+                                  out_dtype=out_dtype,
+                                  bias=bias)
    torch.testing.assert_close(out, baseline, rtol=1e0, atol=2e0)
--- a/tests/kernels/test_mamba_mixer2.py
+++ b/tests/kernels/test_mamba_mixer2.py
+# SPDX-License-Identifier: Apache-2.0
+import unittest
+from typing import Tuple
+import pytest
+import torch
+from tests.utils import multi_gpu_test
+from vllm.distributed.parallel_state import (init_distributed_environment,
+                                             initialize_model_parallel)
+from vllm.model_executor.layers.mamba.mamba_mixer2 import Mixer2RMSNormGated
+from vllm.platforms import current_platform
+from vllm.utils import update_environment_variables
+@multi_gpu_test(num_gpus=2)
+@pytest.mark.parametrize("batch_size", [8])
+@pytest.mark.parametrize("seq_len", [128])
+@pytest.mark.parametrize(
+    "hidden_size_n_groups",
+    [
+        (64, 1),
+        (64, 2),
+        (64, 4),  # hidden_size be divisible by num_gpus
+        (100, 5),  # and n_groups must divide hidden_size
+    ])
+@pytest.mark.parametrize("dtype", [torch.float16])
+def test_mixer2_gated_norm_multi_gpu(
+    batch_size: int,
+    seq_len: int,
+    hidden_size_n_groups: Tuple[int, int],
+    dtype: torch.dtype,
+    device: str = 'cuda',
+):
+    hidden_size, n_groups = hidden_size_n_groups
+    num_processes = 2
+    def run_torch_spawn(fn, nprocs):
+        # need to use torch.mp.spawn otherwise will have problems with
+        # torch.distributed and cuda
+        torch.multiprocessing.spawn(fn,
+                                    args=(
+                                        num_processes,
+                                        batch_size,
+                                        seq_len,
+                                        hidden_size,
+                                        n_groups,
+                                        dtype,
+                                        device,
+                                    ),
+                                    nprocs=nprocs)
+    run_torch_spawn(mixer2_gated_norm_tensor_parallel, 2)
+def mixer2_gated_norm_tensor_parallel(
+    local_rank: int,
+    world_size: int,
+    batch_size: int,
+    seq_len: int,
+    hidden_size: int,
+    n_groups: int,
+    dtype: torch.dtype,
+    device: str,
+):
+    current_platform.seed_everything(0)
+    device = torch.device(f"cuda:{local_rank}")
+    torch.cuda.set_device(device)
+    torch.set_default_device(device)
+    torch.set_default_dtype(dtype)
+    update_environment_variables({
+        'RANK': str(local_rank),
+        'LOCAL_RANK': str(local_rank),
+        'WORLD_SIZE': str(world_size),
+        'MASTER_ADDR': 'localhost',
+        'MASTER_PORT': '12345',
+    })
+    # initialize distributed
+    init_distributed_environment()
+    initialize_model_parallel(tensor_model_parallel_size=world_size)
+    # create random weights an inputs
+    weight = torch.rand((hidden_size, ), dtype=dtype, device=device)
+    hidden_states = torch.randn(batch_size, seq_len, hidden_size)
+    gate_states = torch.randn(batch_size, seq_len, hidden_size)
+    # create gated-norm with TP
+    mixer = Mixer2RMSNormGated(
+        full_hidden_size=hidden_size,
+        full_n_groups=n_groups,
+    )
+    mixer.weight.weight_loader(mixer.weight, weight)  # load
+    # create gated-norm without TP to compute reference
+    # - utilize mock patching to disable TP when
+    with (unittest.mock.patch(
+            "vllm.model_executor.layers.mamba.mamba_mixer2."
+            "get_tensor_model_parallel_world_size",
+            return_value=1),
+          unittest.mock.patch(
+              "vllm.model_executor.layers.mamba.mamba_mixer2."
+              "get_tensor_model_parallel_rank",
+              return_value=0)):
+        mixer_single_gpu = Mixer2RMSNormGated(
+            full_hidden_size=hidden_size,
+            full_n_groups=n_groups,
+        )
+    # assign weight to single-gpu mixer
+    mixer_single_gpu.weight.data = weight
+    # generate and compare
+    N = hidden_size // world_size
+    output = mixer(
+        hidden_states[..., local_rank * N:(local_rank + 1) * N],
+        gate_states[..., local_rank * N:(local_rank + 1) * N],
+    )
+    ref_output = mixer_single_gpu(hidden_states, gate_states)
+    torch.allclose(output,
+                   ref_output[..., local_rank * N:(local_rank + 1) * N],
+                   atol=1e-3,
+                   rtol=1e-3)
--- a/tests/kernels/test_mamba_ssm_ssd.py
+++ b/tests/kernels/test_mamba_ssm_ssd.py
+# SPDX-License-Identifier: Apache-2.0
+from typing import Dict, Tuple
+import pytest
+import torch
+import torch.nn.functional as F
+from einops import rearrange, repeat
+from vllm.model_executor.layers.mamba.ops.ssd_combined import (
+    mamba_chunk_scan_combined)
+from vllm.platforms import current_platform
+# Added by the IBM Team, 2024
+# Adapted from https://github.com/state-spaces/mamba/blob/v2.2.4/mamba_ssm/modules/ssd_minimal.py
+# this is the segsum implementation taken from above
+def segsum(x):
+    """Calculates segment sum."""
+    T = x.size(-1)
+    x = repeat(x, "... d -> ... d e", e=T)
+    mask = torch.tril(torch.ones(T, T, device=x.device, dtype=bool),
+                      diagonal=-1)
+    x = x.masked_fill(~mask, 0)
+    x_segsum = torch.cumsum(x, dim=-2)
+    mask = torch.tril(torch.ones(T, T, device=x.device, dtype=bool),
+                      diagonal=0)
+    x_segsum = x_segsum.masked_fill(~mask, -torch.inf)
+    return x_segsum
+def ssd_minimal_discrete(X, A, B, C, block_len, initial_states=None):
+    """
+    Arguments:
+        X: (batch, length, n_heads, d_head)
+        A: (batch, length, n_heads)
+        B: (batch, length, n_heads, d_state)
+        C: (batch, length, n_heads, d_state)
+    Return:
+        Y: (batch, length, n_heads, d_head)
+    """
+    assert X.dtype == A.dtype == B.dtype == C.dtype
+    assert X.shape[1] % block_len == 0
+    # Rearrange into blocks/chunks
+    X, A, B, C = (rearrange(x, "b (c l) ... -> b c l ...", l=block_len)
+                  for x in (X, A, B, C))
+    A = rearrange(A, "b c l h -> b h c l")
+    A_cumsum = torch.cumsum(A, dim=-1)
+    # 1. Compute the output for each intra-chunk (diagonal blocks)
+    L = torch.exp(segsum(A))
+    Y_diag = torch.einsum("bclhn,bcshn,bhcls,bcshp->bclhp", C, B, L, X)
+    # 2. Compute the state for each intra-chunk
+    # (right term of low-rank factorization of off-diagonal blocks; B terms)
+    decay_states = torch.exp(A_cumsum[:, :, :, -1:] - A_cumsum)
+    states = torch.einsum("bclhn,bhcl,bclhp->bchpn", B, decay_states, X)
+    # 3. Compute the inter-chunk SSM recurrence; produces correct SSM states at
+    #    chunk boundaries
+    # (middle term of factorization of off-diag blocks; A terms)
+    if initial_states is None:
+        initial_states = torch.zeros_like(states[:, :1])
+    states = torch.cat([initial_states, states], dim=1)
+    decay_chunk = torch.exp(segsum(F.pad(A_cumsum[:, :, :, -1], (1, 0))))
+    new_states = torch.einsum("bhzc,bchpn->bzhpn", decay_chunk, states)
+    states, final_state = new_states[:, :-1], new_states[:, -1]
+    # 4. Compute state -> output conversion per chunk
+    # (left term of low-rank factorization of off-diagonal blocks; C terms)
+    state_decay_out = torch.exp(A_cumsum)
+    Y_off = torch.einsum('bclhn,bchpn,bhcl->bclhp', C, states, state_decay_out)
+    # Add output of intra-chunk and inter-chunk terms
+    # (diagonal and off-diagonal blocks)
+    Y = rearrange(Y_diag + Y_off, "b c l h p -> b (c l) h p")
+    return Y, final_state
+def generate_random_inputs(batch_size,
+                           seqlen,
+                           n_heads,
+                           d_head,
+                           itype,
+                           device='cuda'):
+    current_platform.seed_everything(0)
+    A = (-torch.exp(torch.rand(n_heads, dtype=itype, device=device)))
+    dt = F.softplus(
+        torch.randn(batch_size, seqlen, n_heads, dtype=itype, device=device) -
+        4)
+    X = torch.randn((batch_size, seqlen, n_heads, d_head),
+                    dtype=itype,
+                    device=device)
+    B = torch.randn((batch_size, seqlen, n_heads, d_head),
+                    dtype=itype,
+                    device=device)
+    C = torch.randn((batch_size, seqlen, n_heads, d_head),
+                    dtype=itype,
+                    device=device)
+    return A, dt, X, B, C
+def generate_continous_batched_examples(example_lens_by_batch,
+                                        num_examples,
+                                        full_length,
+                                        last_taken,
+                                        exhausted,
+                                        n_heads,
+                                        d_head,
+                                        itype,
+                                        device='cuda'):
+    # this function generates a random examples of certain length
+    # and then cut according to "example_lens_by_batch" and feed
+    # them in continuous batches to the kernels
+    # generate the full-length example
+    A, dt, X, B, C = generate_random_inputs(num_examples, full_length, n_heads,
+                                            d_head, itype)
+    Y_min, final_state_min = ssd_minimal_discrete(X * dt.unsqueeze(-1),
+                                                  A * dt,
+                                                  B,
+                                                  C,
+                                                  block_len=full_length // 4)
+    # internal function that outputs a cont batch of examples
+    # given a tuple of lengths for each example in the batch
+    # e.g., example_lens=(8, 4) means take 8 samples from first eg,
+    #       4 examples from second eg, etc
+    def get_continuous_batch(example_lens: Tuple[int, ...]):
+        indices = []
+        for i, x in enumerate(example_lens):
+            c = last_taken.get(i, 0)
+            indices.append((c, c + x))
+            last_taken[i] = (c + x) % full_length
+            exhausted[i] = last_taken[i] == 0
+        return (torch.concat([x[i, s:e] for i, (s, e) in enumerate(indices)
+                              ]).unsqueeze(0) for x in (dt, X, B, C))
+    # internal function that maps "n" to the appropriate right boundary
+    # value when forming continuous batches from examples of length given
+    # by "full_length".
+    # - e.g., when n > full_length, returns n % full_length
+    #         when n == full_length, returns full_length
+    def end_boundary(n: int):
+        return n - ((n - 1) // full_length) * full_length
+    IND_E = None
+    for spec in example_lens_by_batch:
+        # get the (maybe partial) example seen in this cont batch
+        dt2, X2, B2, C2 = get_continuous_batch(spec)
+        # get the metadata
+        cu_seqlens = torch.tensor((0, ) + spec, device=device).cumsum(dim=0)
+        sed_idx = torch.zeros(cu_seqlens[-1],
+                              dtype=torch.int32,
+                              device=cu_seqlens.device)
+        for i, (srt, end) in enumerate(zip(
+                cu_seqlens,
+                cu_seqlens[1:],
+        )):
+            sed_idx[srt:end] = i
+        # for cont batch
+        if IND_E is None:
+            IND_S = [0 for _ in range(len(spec))]
+        else:
+            IND_S = [x % full_length for x in IND_E]
+        IND_E = [end_boundary(x + y) for x, y in zip(IND_S, spec)]
+        yield ([Y_min[s, IND_S[s]:IND_E[s]] for s in range(num_examples)],
+               cu_seqlens, sed_idx.unsqueeze(0), (A, dt2, X2, B2, C2))
+@pytest.mark.parametrize("itype",
+                         [torch.float32, torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("n_heads", [3, 4, 11, 16, 32])
+@pytest.mark.parametrize("d_head", [5, 8, 19, 32, 128])
+@pytest.mark.parametrize("seq_len_chunk_size", [(119, 17), (128, 32)])
+def test_mamba_chunk_scan_single_example(d_head, n_heads, seq_len_chunk_size,
+                                         itype):
+    # this tests the kernels on a single example (no batching)
+    # set seed
+    batch_size = 1  # batch_size
+    # ssd_minimal_discrete requires chunk_size divide seqlen
+    # - this is only required for generating the reference seqs,
+    #   it is not an operational limitation.
+    seqlen, chunk_size = seq_len_chunk_size
+    A, dt, X, B, C = generate_random_inputs(batch_size, seqlen, n_heads,
+                                            d_head, itype)
+    Y_min, final_state_min = ssd_minimal_discrete(X * dt.unsqueeze(-1), A * dt,
+                                                  B, C, chunk_size)
+    Y, final_state = mamba_chunk_scan_combined(X,
+                                               dt,
+                                               A,
+                                               B,
+                                               C,
+                                               chunk_size,
+                                               D=None,
+                                               return_final_states=True)
+    # just test the last in sequence
+    torch.allclose(Y[:, -1], Y_min[:, -1], atol=1e-3, rtol=1e-3)
+    # just test the last head
+    # NOTE, in the kernel we always cast states to fp32
+    torch.allclose(final_state[:, -1],
+                   final_state_min[:, -1].to(torch.float32),
+                   atol=1e-3,
+                   rtol=1e-3)
+@pytest.mark.parametrize("itype", [torch.float32, torch.float16])
+@pytest.mark.parametrize("n_heads", [4, 8, 13])
+@pytest.mark.parametrize("d_head", [5, 16, 21, 32])
+@pytest.mark.parametrize(
+    "seq_len_chunk_size_cases",
+    [
+        # small-ish chunk_size (8)
+        (64, 8, 2, [(64, 32), (64, 32)]),
+        (64, 8, 2, [(32, 32), (32, 32), (32, 32)]),
+        (64, 8, 2, [(8, 8), (8, 8), (8, 8)]),  # chunk size boundary
+        (64, 8, 2, [(4, 4), (4, 4), (4, 4),
+                    (4, 4)]),  # chunk_size larger than cont batches
+        (64, 8, 5, [
+            (64, 32, 16, 8, 8),
+            (8, 16, 32, 16, 8),
+            (8, 8, 16, 32, 16),
+        ]),  # mode examples with varied lengths
+        # odd chunk_size
+        (64, 29, 2, [(11, 4), (13, 23), (19, 22),
+                     (21, 15)]),  # irregular sizes
+        # large-ish chunk_size (256)
+        (64, 256, 1, [(5, ), (1, ), (1, ),
+                      (1, )]),  # irregular sizes with small sequences
+        (64, 256, 2, [(5, 30), (1, 2), (1, 2),
+                      (1, 2)]),  # irregular sizes with small sequences
+    ])
+def test_mamba_chunk_scan_cont_batch(d_head, n_heads, seq_len_chunk_size_cases,
+                                     itype):
+    # this test with multiple examples in a continuous batch
+    # (i.e. chunked prefill)
+    seqlen, chunk_size, num_examples, cases = seq_len_chunk_size_cases
+    # hold state during the cutting process so we know if an
+    # example has been exhausted and needs to cycle
+    last_taken: Dict = {}  # map: eg -> pointer to last taken sample
+    exhausted: Dict = {}  # map: eg -> boolean indicating example is exhausted
+    states = None
+    for Y_min, cu_seqlens, sed_idx, (A, dt, X, B,
+                                     C) in generate_continous_batched_examples(
+                                         cases, num_examples, seqlen,
+                                         last_taken, exhausted, n_heads,
+                                         d_head, itype):
+        Y, new_states = mamba_chunk_scan_combined(
+            X,
+            dt,
+            A,
+            B,
+            C,
+            chunk_size,
+            D=None,
+            cu_seqlens=cu_seqlens,
+            seq_idx=sed_idx,
+            return_varlen_states=True,
+            initial_states=states,
+        )
+        # just test the last in sequence
+        for i in range(num_examples):
+            # just test one dim and dstate
+            Y_eg = Y[0, cu_seqlens[i]:cu_seqlens[i + 1], 0, 0]
+            Y_min_eg = Y_min[i][:, 0, 0]
+            torch.allclose(Y_eg, Y_min_eg, atol=1e-3, rtol=1e-3)
+        # update states
+        states = new_states
+        for i, clear in exhausted.items():
+            if clear:
+                states[i].fill_(0.)
+                exhausted[i] = False
--- a/tests/kernels/test_nvfp4_quant.py
+++ b/tests/kernels/test_nvfp4_quant.py
+# SPDX-License-Identifier: Apache-2.0
+import pytest
+import torch
+from vllm import _custom_ops as ops
+from vllm.platforms import current_platform
+from vllm.scalar_type import scalar_types
+if not current_platform.has_device_capability(100):
+    pytest.skip(reason="Nvfp4 Requires compute capability of 10 or above.",
+                allow_module_level=True)
+DTYPES = [torch.float16, torch.bfloat16]
+SHAPES = [(128, 64), (128, 128), (256, 64), (256, 128)]
+PAD_SHAPES = [(90, 64), (150, 64), (128, 48), (128, 80), (150, 80), (90, 48),
+              (90, 128), (150, 128), (150, 48), (90, 80)]
+SEEDS = [42]
+CUDA_DEVICES = ['cuda:0']
+FLOAT4_E2M1_MAX = scalar_types.float4_e2m1fn.max()
+FLOAT8_E4M3_MAX = torch.finfo(torch.float8_e4m3fn).max
+# E2M1 to float
+# 0111 -> 6
+# 0110 -> 4
+# 0101 -> 3
+# 0100 -> 2
+# 0011 -> 1.5
+# 0010 -> 1
+# 0001 -> 0.5
+# 0000 -> 0
+E2M1_TO_FLOAT32 = [
+    0., 0.5, 1., 1.5, 2., 3., 4., 6., 0., -0.5, -1., -1.5, -2., -3., -4., -6.
+]
+BLOCK_SIZE = 16
+def cast_from_fp4(x, m, n):
+    # The fp4 values are packed in uint8 as [v_1st | v_2nd]
+    v_2nd = x & 0xF
+    v_1st = (x >> 4) & 0xF
+    c = torch.stack((v_2nd, v_1st), dim=-1)
+    out = torch.tensor([E2M1_TO_FLOAT32[x] for x in c.flatten()])
+    out = out.reshape(m, n).to(torch.float32)
+    return out
+def cast_to_fp4(x):
+    sign = torch.sign(x)
+    x = torch.abs(x)
+    x[(x >= 0.0) & (x <= 0.25)] = 0.0
+    x[(x > 0.25) & (x < 0.75)] = 0.5
+    x[(x >= 0.75) & (x <= 1.25)] = 1.0
+    x[(x > 1.25) & (x < 1.75)] = 1.5
+    x[(x >= 1.75) & (x <= 2.5)] = 2.0
+    x[(x > 2.5) & (x < 3.5)] = 3.0
+    x[(x >= 3.5) & (x <= 5.0)] = 4.0
+    x[x > 5.0] = 6.0
+    return x * sign
+def get_reciprocal(x):
+    if isinstance(x, torch.Tensor):
+        return torch.where(x == 0, torch.tensor(0.0, dtype=x.dtype), 1.0 / x)
+    elif isinstance(x, (float, int)):
+        return 0.0 if x == 0 else 1.0 / x
+    else:
+        raise TypeError("Input must be a float, int, or a torch.Tensor.")
+def ref_nvfp4_quant(x, global_scale):
+    assert global_scale.dtype == torch.float32
+    assert x.ndim == 2
+    m, n = x.shape
+    x = torch.reshape(x, (m, n // BLOCK_SIZE, BLOCK_SIZE))
+    vec_max = torch.max(torch.abs(x), dim=-1,
+                        keepdim=True)[0].to(torch.float32)
+    scale = global_scale * (vec_max * get_reciprocal(FLOAT4_E2M1_MAX))
+    scale = scale.to(torch.float8_e4m3fn).to(torch.float32)
+    output_scale = get_reciprocal(scale * get_reciprocal(global_scale))
+    scaled_x = x.to(torch.float32) * output_scale
+    clipped_x = torch.clamp(scaled_x, -6.0, 6.0).reshape(m, n)
+    return cast_to_fp4(clipped_x), scale.squeeze(-1)
+def recover_swizzled_scales(scale, m, n):
+    round_up = lambda x, y: (x + y - 1) // y * y
+    rounded_m = round_up(m, 128)
+    scale_n = n // BLOCK_SIZE
+    rounded_n = round_up(scale_n, 4)
+    # Recover the swizzled scaling factor to linear layout
+    tmp = torch.reshape(scale, (1, rounded_m // 128, rounded_n // 4, 32, 4, 4))
+    tmp = torch.permute(tmp, (0, 1, 4, 3, 2, 5))
+    result = torch.reshape(tmp, (rounded_m, rounded_n)).to(torch.float32)
+    return result[:m, :scale_n]
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("shape", SHAPES)
+@pytest.mark.parametrize("seed", SEEDS)
+@pytest.mark.parametrize("device", CUDA_DEVICES)
+@torch.inference_mode()
+def test_quantize_to_fp4(
+    dtype: torch.dtype,
+    shape: tuple[int, int],
+    seed: int,
+    device: str,
+) -> None:
+    current_platform.seed_everything(seed)
+    torch.set_default_device(device)
+    m, n = shape
+    x = torch.randn((m, n), dtype=dtype)
+    tensor_amax = torch.abs(x).max().to(torch.float32)
+    global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / tensor_amax
+    out_ref, scale_ref = ref_nvfp4_quant(x, global_scale)
+    out, out_scale = ops.scaled_fp4_quant(x, global_scale)
+    scale_ans = recover_swizzled_scales(out_scale, m, n)
+    out_ans = cast_from_fp4(out, m, n)
+    torch.testing.assert_close(out_ans, out_ref)
+    torch.testing.assert_close(scale_ans, scale_ref)
+@pytest.mark.parametrize("pad_shape", PAD_SHAPES)
+@torch.inference_mode()
+def test_quantize_to_fp4_padded(pad_shape: tuple[int, int]) -> None:
+    dtype = torch.float16
+    current_platform.seed_everything(42)
+    torch.set_default_device('cuda:0')
+    m, n = pad_shape
+    x = torch.randn((m, n), dtype=dtype)
+    tensor_amax = torch.abs(x).max().to(torch.float32)
+    global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / tensor_amax
+    out_ref, scale_ref = ref_nvfp4_quant(x, global_scale)
+    out, out_scale = ops.scaled_fp4_quant(x, global_scale)
+    scale_ans = recover_swizzled_scales(out_scale, m, n)
+    out_ans = cast_from_fp4(out, m, n)
+    torch.testing.assert_close(out_ans, out_ref)
+    torch.testing.assert_close(scale_ans, scale_ref)