Merge tag 'v0.6.6.post1' into v0.6.6.post1-dev

96ae75ad · zhuwenwen · f9f4a735 · 2339d59f · 96ae75ad · f9f4a735
Commit 96ae75ad authored Jan 04, 2025 by zhuwenwen
20 changed files
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -11,15 +11,16 @@ protobuf # Required by LlamaTokenizer.
 fastapi >= 0.107.0, < 0.113.0; python_version < '3.9'
 fastapi >= 0.107.0, != 0.113.*, != 0.114.0; python_version >= '3.9'
 aiohttp
-openai >= 1.45.0 # Ensure modern openai package (ensure types module present and max_completion_tokens field support)
+openai >= 1.52.0 # Ensure modern openai package (ensure types module present and max_completion_tokens field support)
 uvicorn[standard]
 pydantic >= 2.9  # Required for fastapi >= 0.113.0
-pillow  # Required for image processing
 prometheus_client >= 0.18.0
+pillow  # Required for image processing
 prometheus-fastapi-instrumentator >= 7.0.0
 tiktoken >= 0.6.0  # Required for DBRX tokenizer
 lm-format-enforcer >= 0.10.9, < 0.11
-outlines == 0.1.11
+outlines == 0.1.11 # Requires PyTorch
+lark == 1.2.2
 xgrammar >= 0.1.6; platform_machine == "x86_64"
 typing_extensions >= 4.10
 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
@@ -33,5 +34,6 @@ pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
-compressed-tensors == 0.8.1 # required for compressed-tensors
-depyf==0.18.0 # required for profiling and debugging torch.compile
+compressed-tensors == 0.8.1 # required for compressed-tensors, requires PyTorch
+depyf==0.18.0 # required for profiling and debugging with compilation config
+cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
--- a/requirements-cuda-arm64.txt
+++ b/requirements-cuda-arm64.txt
--index-url https://download.pytorch.org/whl/nightly/cu124
-torchvision==0.22.0.dev20241215; platform_machine == 'aarch64'
-torch==2.6.0.dev20241210+cu124; platform_machine == 'aarch64'
--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
@@ -2,9 +2,9 @@
 -r requirements-common.txt

 # Dependencies for NVIDIA GPUs
-ray >= 2.9
+ray[default] >= 2.9
 nvidia-ml-py >= 12.560.30 # for pynvml package
-torch == 2.5.1; platform_machine != 'aarch64'
+torch == 2.5.1
 # These must be updated alongside torch
-torchvision == 0.20.1; platform_machine != 'aarch64' # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
+torchvision == 0.20.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
 xformers == 0.0.28.post3; platform_system == 'Linux' and platform_machine == 'x86_64'  # Requires PyTorch 2.5.1
--- a/requirements-openvino.txt
+++ b/requirements-openvino.txt
@@ -4,5 +4,5 @@
 torch == 2.5.1 #  should be aligned with "common" vLLM torch version
 openvino >= 2024.4.0 # since 2024.4.0 both CPU and GPU support Paged Attention

-optimum @ git+https://github.com/huggingface/optimum.git@main # latest optimum is used to support latest transformers version
-optimum-intel[nncf] @ git+https://github.com/huggingface/optimum-intel.git@main # latest optimum-intel is used to support latest transformers version
+optimum @ git+https://github.com/huggingface/optimum.git # latest optimum is used to support latest transformers version
+optimum-intel[nncf] @ git+https://github.com/huggingface/optimum-intel.git # latest optimum-intel is used to support latest transformers version
--- a/requirements-xpu.txt
+++ b/requirements-xpu.txt
@@ -9,8 +9,8 @@ setuptools-scm>=8
 wheel
 jinja2

-torch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/torch-2.5.0a0%2Bgite84e33f-cp310-cp310-linux_x86_64.whl
-intel-extension-for-pytorch @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/intel_extension_for_pytorch-2.5.10%2Bgit9d489a8-cp310-cp310-linux_x86_64.whl
-oneccl_bind_pt @ https://intel-extension-for-pytorch.s3.us-east-1.amazonaws.com/ipex_dev/xpu/oneccl_bind_pt-2.5.0%2Bxpu-cp310-cp310-linux_x86_64.whl
+torch @ https://intel-optimized-pytorch.s3.cn-north-1.amazonaws.com.cn/ipex_dev/xpu/torch-2.5.0a0%2Bgite84e33f-cp310-cp310-linux_x86_64.whl
+intel-extension-for-pytorch @ https://intel-optimized-pytorch.s3.cn-north-1.amazonaws.com.cn/ipex_dev/xpu/intel_extension_for_pytorch-2.5.10%2Bgit9d489a8-cp310-cp310-linux_x86_64.whl
+oneccl_bind_pt @ https://intel-optimized-pytorch.s3.cn-north-1.amazonaws.com.cn/ipex_dev/xpu/oneccl_bind_pt-2.5.0%2Bxpu-cp310-cp310-linux_x86_64.whl

 triton-xpu == 3.0.0b1
--- a/setup.py
+++ b/setup.py
@@ -482,9 +482,9 @@ def get_version_add(sha: Optional[str] = None) -> str:
    
    new_version_content = f"""
 try:
-    __version__ = "0.6.5"
-    __version_tuple__ = (0, 6, 5)
-    __hcu_version__ = f'0.6.5+{version}' 
+    __version__ = "0.6.6.post1"
+    __version_tuple__ = (0, 6, 6)
+    __hcu_version__ = f'0.6.6.post1+{version}' 
    
    from vllm.version import __version__, __version_tuple__, __hcu_version__
 except Exception as e:
@@ -493,6 +493,8 @@ except Exception as e:
    warnings.warn(f"Failed to read commit hash:\\n + str(e)",
                  RuntimeWarning,
                  stacklevel=2)
+    __version__ = "dev"
+    __version_tuple__ = (0, 0, __version__)
 """
    
    with open(add_version_path, encoding="utf-8",mode="w") as file:
@@ -525,10 +527,14 @@ def get_gaudi_sw_version():


 def get_vllm_version() -> str:
-    if not _is_hip():
-        version = get_version(
-            write_to="vllm/_version.py",  # TODO: move this to pyproject.toml
-        )
+    # TODO: Revisit this temporary approach: https://github.com/vllm-project/vllm/issues/9182#issuecomment-2404860236
+    try:
+        if not _is_hip():
+            version = get_version(
+                write_to="vllm/_version.py",  # TODO: move this to pyproject.toml
+            )
+    except LookupError:
+        version = "0.0.0"

        sep = "+" if "+" not in version else "."  # dev versions might contain +

@@ -537,7 +543,7 @@ def get_vllm_version() -> str:
            version += f"{sep}empty"
    elif _is_cuda():
        if envs.VLLM_USE_PRECOMPILED:
-            version += ".precompiled"
+            version += f"{sep}precompiled"
        else:
            cuda_version = str(get_nvcc_cuda_version())
            if cuda_version != MAIN_CUDA_VERSION:
@@ -702,6 +708,7 @@ setup(
    ext_modules=ext_modules,
    extras_require={
        "tensorizer": ["tensorizer>=2.9.0"],
+        "runai": ["runai-model-streamer", "runai-model-streamer-s3", "boto3"],
        "audio": ["librosa", "soundfile"],  # Required for audio processing
        "video": ["decord"]  # Required for video processing
    },

--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -122,7 +122,7 @@ def test_models(
 #     if test_suite != TARGET_TEST_SUITE:
 #         pytest.skip(f"Skip test for {test_suite}")

-#     if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
+#     if model == os.path.join(models_path_prefix, "meta-llama/Llama-2-7b-hf") and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4":  # noqa
 #         # test ray adag
 #         os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
 #         os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
@@ -130,11 +130,6 @@ def test_models(
 #     if attention_backend:
 #         os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend

-#     # Import VLLM_USE_V1 dynamically to handle patching
-#     from vllm.envs import VLLM_USE_V1
-#     if VLLM_USE_V1 and distributed_executor_backend != "mp":
-#         pytest.skip(f"Skip {distributed_executor_backend} for V1")
-
 #     dtype = "half"
 #     max_tokens = 5

@@ -153,11 +148,11 @@ def test_models(
 #         hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

 #     check_outputs_equal(
-#         outputs_0_lst=hf_outputs,
-#         outputs_1_lst=vllm_outputs,
-#         name_0="hf",
-#         name_1="vllm",
-#     )
+        outputs_0_lst=hf_outputs,
+        outputs_1_lst=vllm_outputs,
+        name_0="hf",
+        name_1="vllm",
+    )


 @pytest.mark.skip_v1

--- a/tests/entrypoints/llm/test_guided_generate.py
+++ b/tests/entrypoints/llm/test_guided_generate.py
@@ -13,7 +13,8 @@ from vllm.outputs import RequestOutput
 from vllm.sampling_params import GuidedDecodingParams, SamplingParams
 from ...utils import models_path_prefix

-MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta")
+MODEL_NAME = os.path.join(models_path_prefix, "Qwen/Qwen2.5-7B-Instruct")
+GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]


 @pytest.fixture(scope="module")
@@ -29,11 +30,13 @@ def llm():


 @pytest.mark.skip_global_cleanup
-def test_guided_regex(sample_regex, llm):
-    sampling_params = SamplingParams(
-        temperature=0.8,
-        top_p=0.95,
-        guided_decoding=GuidedDecodingParams(regex=sample_regex))
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
+def test_guided_regex(sample_regex, llm, guided_decoding_backend: str):
+    sampling_params = SamplingParams(temperature=0.8,
+                                     top_p=0.95,
+                                     guided_decoding=GuidedDecodingParams(
+                                         regex=sample_regex,
+                                         backend=guided_decoding_backend))
    outputs = llm.generate(prompts=[
        f"Give an example IPv4 address with this regex: {sample_regex}"
    ] * 2,
@@ -53,11 +56,14 @@ def test_guided_regex(sample_regex, llm):


 @pytest.mark.skip_global_cleanup
-def test_guided_json_completion(sample_json_schema, llm):
-    sampling_params = SamplingParams(
-        temperature=1.0,
-        max_tokens=1000,
-        guided_decoding=GuidedDecodingParams(json=sample_json_schema))
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
+def test_guided_json_completion(sample_json_schema, llm,
+                                guided_decoding_backend: str):
+    sampling_params = SamplingParams(temperature=1.0,
+                                     max_tokens=1000,
+                                     guided_decoding=GuidedDecodingParams(
+                                         json=sample_json_schema,
+                                         backend=guided_decoding_backend))
    outputs = llm.generate(prompts=[
        f"Give an example JSON for an employee profile "
        f"that fits this schema: {sample_json_schema}"
@@ -80,11 +86,14 @@ def test_guided_json_completion(sample_json_schema, llm):


 @pytest.mark.skip_global_cleanup
-def test_guided_complex_json_completion(sample_complex_json_schema, llm):
-    sampling_params = SamplingParams(
-        temperature=1.0,
-        max_tokens=1000,
-        guided_decoding=GuidedDecodingParams(json=sample_complex_json_schema))
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
+def test_guided_complex_json_completion(sample_complex_json_schema, llm,
+                                        guided_decoding_backend: str):
+    sampling_params = SamplingParams(temperature=1.0,
+                                     max_tokens=1000,
+                                     guided_decoding=GuidedDecodingParams(
+                                         json=sample_complex_json_schema,
+                                         backend=guided_decoding_backend))
    outputs = llm.generate(prompts=[
        f"Give an example JSON for an assignment grade "
        f"that fits this schema: {sample_complex_json_schema}"
@@ -108,11 +117,14 @@ def test_guided_complex_json_completion(sample_complex_json_schema, llm):


 @pytest.mark.skip_global_cleanup
-def test_guided_definition_json_completion(sample_definition_json_schema, llm):
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
+def test_guided_definition_json_completion(sample_definition_json_schema, llm,
+                                           guided_decoding_backend: str):
    sampling_params = SamplingParams(temperature=1.0,
                                     max_tokens=1000,
                                     guided_decoding=GuidedDecodingParams(
-                                         json=sample_definition_json_schema))
+                                         json=sample_definition_json_schema,
+                                         backend=guided_decoding_backend))
    outputs = llm.generate(prompts=[
        f"Give an example JSON for solving 8x + 7 = -23 "
        f"that fits this schema: {sample_definition_json_schema}"
@@ -136,11 +148,14 @@ def test_guided_definition_json_completion(sample_definition_json_schema, llm):


 @pytest.mark.skip_global_cleanup
-def test_guided_choice_completion(sample_guided_choice, llm):
-    sampling_params = SamplingParams(
-        temperature=0.8,
-        top_p=0.95,
-        guided_decoding=GuidedDecodingParams(choice=sample_guided_choice))
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
+def test_guided_choice_completion(sample_guided_choice, llm,
+                                  guided_decoding_backend: str):
+    sampling_params = SamplingParams(temperature=0.8,
+                                     top_p=0.95,
+                                     guided_decoding=GuidedDecodingParams(
+                                         choice=sample_guided_choice,
+                                         backend=guided_decoding_backend))
    outputs = llm.generate(
        prompts="The best language for type-safe systems programming is ",
        sampling_params=sampling_params,
@@ -159,13 +174,15 @@ def test_guided_choice_completion(sample_guided_choice, llm):


 @pytest.mark.skip_global_cleanup
-def test_guided_grammar(sample_sql_statements, llm):
-
-    sampling_params = SamplingParams(
-        temperature=0.8,
-        top_p=0.95,
-        max_tokens=1000,
-        guided_decoding=GuidedDecodingParams(grammar=sample_sql_statements))
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
+def test_guided_grammar(sample_sql_statements, llm,
+                        guided_decoding_backend: str):
+    sampling_params = SamplingParams(temperature=0.8,
+                                     top_p=0.95,
+                                     max_tokens=1000,
+                                     guided_decoding=GuidedDecodingParams(
+                                         grammar=sample_sql_statements,
+                                         backend=guided_decoding_backend))
    outputs = llm.generate(
        prompts=("Generate a sql state that select col_1 from "
                 "table_1 where it is equals to 1"),
@@ -221,15 +238,18 @@ def test_validation_against_both_guided_decoding_options(sample_regex, llm):


 @pytest.mark.skip_global_cleanup
-def test_guided_json_object(llm):
-    sampling_params = SamplingParams(
-        temperature=1.0,
-        max_tokens=100,
-        guided_decoding=GuidedDecodingParams(json_object=True))
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
+def test_guided_json_object(llm, guided_decoding_backend: str):
+    sampling_params = SamplingParams(temperature=1.0,
+                                     max_tokens=100,
+                                     n=2,
+                                     guided_decoding=GuidedDecodingParams(
+                                         json_object=True,
+                                         backend=guided_decoding_backend))

    outputs = llm.generate(
-        prompts=("Generate a JSON object describing a person with name "
-                 "and age for John Smith who is 31 years old."),
+        prompts=("Generate a JSON object with curly braces for a person with "
+                 "name and age fields for John Smith who is 31 years old."),
        sampling_params=sampling_params,
        use_tqdm=True)

@@ -238,10 +258,11 @@ def test_guided_json_object(llm):
        assert output is not None
        assert isinstance(output, RequestOutput)

-        generated_text = output.outputs[0].text
-        print(generated_text)
-        assert generated_text is not None
+        for i in range(2):
+            generated_text = output.outputs[i].text
+            print(generated_text)
+            assert generated_text is not None

-        # Parse to verify it is valid JSON
-        parsed_json = json.loads(generated_text)
-        assert isinstance(parsed_json, dict)
+            # Parse to verify it is valid JSON
+            parsed_json = json.loads(generated_text)
+            assert isinstance(parsed_json, dict)
--- a/tests/entrypoints/openai/test_audio.py
+++ b/tests/entrypoints/openai/test_audio.py
@@ -77,6 +77,7 @@ async def test_single_chat_session_audio(client: openai.AsyncOpenAI,
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
+        temperature=0.0,
        top_logprobs=5)
    assert len(chat_completion.choices) == 1

@@ -133,6 +134,7 @@ async def test_single_chat_session_audio_base64encoded(
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
+        temperature=0.0,
        top_logprobs=5)
    assert len(chat_completion.choices) == 1

@@ -153,6 +155,7 @@ async def test_single_chat_session_audio_base64encoded(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
+        temperature=0.0,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0

--- a/tests/entrypoints/openai/test_chat.py
+++ b/tests/entrypoints/openai/test_chat.py
@@ -18,6 +18,8 @@ from .test_completion import zephyr_lora_files  # noqa: F401
 # any model with a chat template should work here
 MODEL_NAME = os.path.join(models_path_prefix, "HuggingFaceH4/zephyr-7b-beta")

+GUIDED_DECODING_BACKENDS = ["outlines", "lm-format-enforcer", "xgrammar"]
+

 @pytest.fixture(scope="module")
 def server(zephyr_lora_files, zephyr_lora_added_tokens_files):  # noqa: F811
@@ -469,8 +471,7 @@ async def test_chat_completion_stream_options(client: openai.AsyncOpenAI,
 # will fail on the second `guided_decoding_backend` even when I swap their order
 # (ref: https://github.com/vllm-project/vllm/pull/5526#issuecomment-2173772256)
 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend",
-                         ["outlines", "lm-format-enforcer"])
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_guided_choice_chat(client: openai.AsyncOpenAI,
                                  guided_decoding_backend: str,
                                  sample_guided_choice):
@@ -487,6 +488,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
        model=MODEL_NAME,
        messages=messages,
        max_completion_tokens=10,
+        temperature=0.7,
        extra_body=dict(guided_choice=sample_guided_choice,
                        guided_decoding_backend=guided_decoding_backend))
    choice1 = chat_completion.choices[0].message.content
@@ -501,6 +503,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,
        model=MODEL_NAME,
        messages=messages,
        max_completion_tokens=10,
+        temperature=0.7,
        extra_body=dict(guided_choice=sample_guided_choice,
                        guided_decoding_backend=guided_decoding_backend))
    choice2 = chat_completion.choices[0].message.content
@@ -509,8 +512,7 @@ async def test_guided_choice_chat(client: openai.AsyncOpenAI,


 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend",
-                         ["outlines", "lm-format-enforcer"])
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_guided_json_chat(client: openai.AsyncOpenAI,
                                guided_decoding_backend: str,
                                sample_json_schema):
@@ -557,8 +559,7 @@ async def test_guided_json_chat(client: openai.AsyncOpenAI,


 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend",
-                         ["outlines", "lm-format-enforcer"])
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_guided_regex_chat(client: openai.AsyncOpenAI,
                                 guided_decoding_backend: str, sample_regex):
    messages = [{
@@ -616,8 +617,7 @@ async def test_guided_decoding_type_error(client: openai.AsyncOpenAI):


 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend",
-                         ["outlines", "lm-format-enforcer"])
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,
                                           guided_decoding_backend: str,
                                           sample_guided_choice):
@@ -649,8 +649,7 @@ async def test_guided_choice_chat_logprobs(client: openai.AsyncOpenAI,


 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend",
-                         ["outlines", "lm-format-enforcer"])
+@pytest.mark.parametrize("guided_decoding_backend", GUIDED_DECODING_BACKENDS)
 async def test_named_tool_use(client: openai.AsyncOpenAI,
                              guided_decoding_backend: str,
                              sample_json_schema):
@@ -684,7 +683,8 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
            "function": {
                "name": "dummy_function_name"
            }
-        })
+        },
+        extra_body=dict(guided_decoding_backend=guided_decoding_backend))
    message = chat_completion.choices[0].message
    assert len(message.content) == 0
    json_string = message.tool_calls[0].function.arguments
@@ -719,6 +719,7 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,
                "name": "dummy_function_name"
            }
        },
+        extra_body=dict(guided_decoding_backend=guided_decoding_backend),
        stream=True)

    output = []
@@ -741,10 +742,8 @@ async def test_named_tool_use(client: openai.AsyncOpenAI,


 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", ["outlines"])
-async def test_required_tool_use_not_yet_supported(
-        client: openai.AsyncOpenAI, guided_decoding_backend: str,
-        sample_json_schema):
+async def test_required_tool_use_not_yet_supported(client: openai.AsyncOpenAI,
+                                                   sample_json_schema):
    messages = [{
        "role": "system",
        "content": "you are a helpful assistant"
@@ -788,9 +787,7 @@ async def test_required_tool_use_not_yet_supported(


 @pytest.mark.asyncio
-@pytest.mark.parametrize("guided_decoding_backend", ["outlines"])
 async def test_inconsistent_tool_choice_and_tools(client: openai.AsyncOpenAI,
-                                                  guided_decoding_backend: str,
                                                  sample_json_schema):
    messages = [{
        "role": "system",

--- a/tests/entrypoints/openai/test_embedding.py
+++ b/tests/entrypoints/openai/test_embedding.py
@@ -7,6 +7,7 @@ import os
 import pytest_asyncio
 import requests

+from vllm.entrypoints.openai.protocol import EmbeddingResponse
 from vllm.transformers_utils.tokenizer import get_tokenizer

 from ...utils import RemoteOpenAIServer, models_path_prefix
@@ -18,6 +19,8 @@ DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' +
 @pytest.fixture(scope="module")
 def server():
    args = [
+        "--task",
+        "embed",
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
@@ -46,11 +49,14 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str):
    ]

    # test single embedding
-    embeddings = await client.embeddings.create(
+    embedding_response = await client.embeddings.create(
        model=model_name,
        input=input_texts,
        encoding_format="float",
    )
+    embeddings = EmbeddingResponse.model_validate(
+        embedding_response.model_dump(mode="json"))
+
    assert embeddings.id is not None
    assert len(embeddings.data) == 1
    assert len(embeddings.data[0].embedding) == 4096
@@ -60,11 +66,14 @@ async def test_single_embedding(client: openai.AsyncOpenAI, model_name: str):

    # test using token IDs
    input_tokens = [1, 1, 1, 1, 1]
-    embeddings = await client.embeddings.create(
+    embedding_response = await client.embeddings.create(
        model=model_name,
        input=input_tokens,
        encoding_format="float",
    )
+    embeddings = EmbeddingResponse.model_validate(
+        embedding_response.model_dump(mode="json"))
+
    assert embeddings.id is not None
    assert len(embeddings.data) == 1
    assert len(embeddings.data[0].embedding) == 4096
@@ -81,11 +90,14 @@ async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str):
        "The cat sat on the mat.", "A feline was resting on a rug.",
        "Stars twinkle brightly in the night sky."
    ]
-    embeddings = await client.embeddings.create(
+    embedding_response = await client.embeddings.create(
        model=model_name,
        input=input_texts,
        encoding_format="float",
    )
+    embeddings = EmbeddingResponse.model_validate(
+        embedding_response.model_dump(mode="json"))
+
    assert embeddings.id is not None
    assert len(embeddings.data) == 3
    assert len(embeddings.data[0].embedding) == 4096
@@ -96,11 +108,14 @@ async def test_batch_embedding(client: openai.AsyncOpenAI, model_name: str):
    # test List[List[int]]
    input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
                    [25, 32, 64, 77]]
-    embeddings = await client.embeddings.create(
+    embedding_response = await client.embeddings.create(
        model=model_name,
        input=input_tokens,
        encoding_format="float",
    )
+    embeddings = EmbeddingResponse.model_validate(
+        embedding_response.model_dump(mode="json"))
+
    assert embeddings.id is not None
    assert len(embeddings.data) == 4
    assert len(embeddings.data[0].embedding) == 4096
@@ -125,14 +140,16 @@ async def test_conversation_embedding(server: RemoteOpenAIServer,
        "content": "Stars twinkle brightly in the night sky.",
    }]

-    chat_response = requests.post(server.url_for("v1/embeddings"),
-                                  json={
-                                      "model": model_name,
-                                      "messages": messages,
-                                      "encoding_format": "float",
-                                  })
+    chat_response = requests.post(
+        server.url_for("v1/embeddings"),
+        json={
+            "model": model_name,
+            "messages": messages,
+            "encoding_format": "float",
+        },
+    )
    chat_response.raise_for_status()
-    chat_embeddings = chat_response.json()
+    chat_embeddings = EmbeddingResponse.model_validate(chat_response.json())

    tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast")
    prompt = tokenizer.apply_chat_template(
@@ -149,13 +166,15 @@ async def test_conversation_embedding(server: RemoteOpenAIServer,
        # To be consistent with chat
        extra_body={"add_special_tokens": False},
    )
-    completion_embeddings = completion_response.model_dump(mode="json")
+    completion_embeddings = EmbeddingResponse.model_validate(
+        completion_response.model_dump(mode="json"))

-    assert chat_embeddings.pop("id") is not None
-    assert completion_embeddings.pop("id") is not None
-    assert chat_embeddings.pop("created") <= completion_embeddings.pop(
-        "created")
-    assert chat_embeddings == completion_embeddings
+    assert chat_embeddings.id is not None
+    assert completion_embeddings.id is not None
+    assert chat_embeddings.created <= completion_embeddings.created
+    assert chat_embeddings.model_dump(
+        exclude={"id", "created"}) == (completion_embeddings.model_dump(
+            exclude={"id", "created"}))


 @pytest.mark.asyncio
@@ -205,10 +224,13 @@ async def test_single_embedding_truncation(client: openai.AsyncOpenAI,
    ]

    # test single embedding
-    embeddings = await client.embeddings.create(
+    embedding_response = await client.embeddings.create(
        model=model_name,
        input=input_texts,
        extra_body={"truncate_prompt_tokens": 10})
+    embeddings = EmbeddingResponse.model_validate(
+        embedding_response.model_dump(mode="json"))
+
    assert embeddings.id is not None
    assert len(embeddings.data) == 1
    assert len(embeddings.data[0].embedding) == 4096
@@ -220,10 +242,12 @@ async def test_single_embedding_truncation(client: openai.AsyncOpenAI,
        1, 24428, 289, 18341, 26165, 285, 19323, 283, 289, 26789, 3871, 28728,
        9901, 340, 2229, 385, 340, 315, 28741, 28804, 2
    ]
-    embeddings = await client.embeddings.create(
+    embedding_response = await client.embeddings.create(
        model=model_name,
        input=input_tokens,
        extra_body={"truncate_prompt_tokens": 10})
+    embeddings = EmbeddingResponse.model_validate(
+        embedding_response.model_dump(mode="json"))

    assert embeddings.id is not None
    assert len(embeddings.data) == 1
@@ -242,10 +266,10 @@ async def test_single_embedding_truncation_invalid(client: openai.AsyncOpenAI,
    ]

    with pytest.raises(openai.BadRequestError):
-        embeddings = await client.embeddings.create(
+        response = await client.embeddings.create(
            model=model_name,
            input=input_texts,
            extra_body={"truncate_prompt_tokens": 8193})
-        assert "error" in embeddings.object
+        assert "error" in response.object
        assert "truncate_prompt_tokens value is greater than max_model_len. "\
-               "Please, select a smaller truncation size." in embeddings.message
+               "Please, select a smaller truncation size." in response.message
--- a/tests/entrypoints/openai/test_pooling.py
+++ b/tests/entrypoints/openai/test_pooling.py
+import base64
+
+import numpy as np
+import pytest
+import requests
+
+from vllm.entrypoints.openai.protocol import PoolingResponse
+from vllm.transformers_utils.tokenizer import get_tokenizer
+
+from ...utils import RemoteOpenAIServer
+
+# Model served by the module-scoped `server` fixture for every test here.
+MODEL_NAME = "jason9693/Qwen2.5-1.5B-apeach"
+# Minimal Jinja chat template: renders each message as "<role>: <content>"
+# followed by a newline.
+DUMMY_CHAT_TEMPLATE = """{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\\n'}}{% endfor %}"""  # noqa: E501
+
+
+@pytest.fixture(scope="module")
+def server():
+    """Start one classification-task server and share it across this module."""
+    # use half precision for speed and memory savings in CI environment
+    dtype_flags = ["--dtype", "bfloat16"]
+    cli_args = [
+        "--task", "classify",
+        *dtype_flags,
+        "--enforce-eager",
+        "--max-model-len", "8192",
+        "--chat-template", DUMMY_CHAT_TEMPLATE,
+    ]
+
+    with RemoteOpenAIServer(MODEL_NAME, cli_args) as running_server:
+        yield running_server
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_single_pooling(server: RemoteOpenAIServer, model_name: str):
+    """Pool a single input, given first as text and then as token IDs.
+
+    Each request is POSTed to the server's "pooling" route and the JSON reply
+    is validated against `PoolingResponse` before the assertions run.
+    """
+    input_texts = [
+        "The chef prepared a delicious meal.",
+    ]
+
+    # test single pooling
+    response = requests.post(
+        server.url_for("pooling"),
+        json={
+            "model": model_name,
+            "input": input_texts,
+            "encoding_format": "float"
+        },
+    )
+    response.raise_for_status()
+    poolings = PoolingResponse.model_validate(response.json())
+
+    assert poolings.id is not None
+    assert len(poolings.data) == 1
+    # The pooled output holds 2 values (presumably one per class of the
+    # classification model -- TODO confirm).
+    assert len(poolings.data[0].data) == 2
+    # Pooling generates nothing, so only prompt tokens are counted.
+    assert poolings.usage.completion_tokens == 0
+    assert poolings.usage.prompt_tokens == 7
+    assert poolings.usage.total_tokens == 7
+
+    # test using token IDs
+    input_tokens = [1, 1, 1, 1, 1]
+    response = requests.post(
+        server.url_for("pooling"),
+        json={
+            "model": model_name,
+            "input": input_tokens,
+            "encoding_format": "float"
+        },
+    )
+    response.raise_for_status()
+    poolings = PoolingResponse.model_validate(response.json())
+
+    assert poolings.id is not None
+    assert len(poolings.data) == 1
+    assert len(poolings.data[0].data) == 2
+    assert poolings.usage.completion_tokens == 0
+    # Usage must equal the number of token IDs that were sent.
+    assert poolings.usage.prompt_tokens == 5
+    assert poolings.usage.total_tokens == 5
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_batch_pooling(server: RemoteOpenAIServer, model_name: str):
+    """Pool a batch of inputs, given as a list of strings and then as a list
+    of token-ID lists, and check item count and token usage for each."""
+    # test List[str]
+    input_texts = [
+        "The cat sat on the mat.", "A feline was resting on a rug.",
+        "Stars twinkle brightly in the night sky."
+    ]
+    response = requests.post(
+        server.url_for("pooling"),
+        json={
+            "model": model_name,
+            "input": input_texts,
+            "encoding_format": "float"
+        },
+    )
+    response.raise_for_status()
+    poolings = PoolingResponse.model_validate(response.json())
+
+    assert poolings.id is not None
+    # One result per input string.
+    assert len(poolings.data) == 3
+    assert len(poolings.data[0].data) == 2
+    assert poolings.usage.completion_tokens == 0
+    assert poolings.usage.prompt_tokens == 25
+    assert poolings.usage.total_tokens == 25
+
+    # test List[List[int]]
+    input_tokens = [[4, 5, 7, 9, 20], [15, 29, 499], [24, 24, 24, 24, 24],
+                    [25, 32, 64, 77]]
+    response = requests.post(
+        server.url_for("pooling"),
+        json={
+            "model": model_name,
+            "input": input_tokens,
+            "encoding_format": "float"
+        },
+    )
+    response.raise_for_status()
+    poolings = PoolingResponse.model_validate(response.json())
+
+    assert poolings.id is not None
+    # One result per token-ID list.
+    assert len(poolings.data) == 4
+    assert len(poolings.data[0].data) == 2
+    assert poolings.usage.completion_tokens == 0
+    # 5 + 3 + 5 + 4 token IDs were sent.
+    assert poolings.usage.prompt_tokens == 17
+    assert poolings.usage.total_tokens == 17
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_conversation_pooling(server: RemoteOpenAIServer,
+                                    model_name: str):
+    """Check that pooling a chat conversation matches pooling the same
+    conversation rendered to a prompt string on the client side."""
+    messages = [{
+        "role": "user",
+        "content": "The cat sat on the mat.",
+    }, {
+        "role": "assistant",
+        "content": "A feline was resting on a rug.",
+    }, {
+        "role": "user",
+        "content": "Stars twinkle brightly in the night sky.",
+    }]
+
+    # Chat-style request: the server receives the raw messages.
+    chat_response = requests.post(
+        server.url_for("pooling"),
+        json={
+            "model": model_name,
+            "messages": messages,
+            "encoding_format": "float",
+        },
+    )
+    chat_response.raise_for_status()
+    chat_poolings = PoolingResponse.model_validate(chat_response.json())
+
+    # Render the same messages locally with the template the server was
+    # started with; tokenize=False keeps the result as a plain string.
+    tokenizer = get_tokenizer(tokenizer_name=model_name, tokenizer_mode="fast")
+    prompt = tokenizer.apply_chat_template(
+        messages,
+        chat_template=DUMMY_CHAT_TEMPLATE,
+        add_generation_prompt=True,
+        continue_final_message=False,
+        tokenize=False,
+    )
+    completions_response = requests.post(
+        server.url_for("pooling"),
+        json={
+            "model": model_name,
+            "input": prompt,
+            "encoding_format": "float",
+            # To be consistent with chat
+            "add_special_tokens": False,
+        },
+    )
+    completions_response.raise_for_status()
+    completion_poolings = PoolingResponse.model_validate(
+        completions_response.json())
+
+    assert chat_poolings.id is not None
+    assert completion_poolings.id is not None
+    # The chat request was issued first, so its timestamp cannot be later.
+    assert chat_poolings.created <= completion_poolings.created
+    # Everything except the per-request id/created fields must be identical.
+    assert chat_poolings.model_dump(
+        exclude={"id", "created"}) == (completion_poolings.model_dump(
+            exclude={"id", "created"}))
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_batch_base64_pooling(server: RemoteOpenAIServer,
+                                    model_name: str):
+    """Check that "float", "base64" and default encodings agree.
+
+    The same batch is pooled three times: with `encoding_format="float"`,
+    with `encoding_format="base64"` (decoded here as float32), and with no
+    `encoding_format` at all. All three must yield the same values.
+    """
+    input_texts = [
+        "Hello my name is",
+        "The best thing about vLLM is that it supports many different models"
+    ]
+
+    float_response = requests.post(
+        server.url_for("pooling"),
+        json={
+            "input": input_texts,
+            "model": model_name,
+            "encoding_format": "float",
+        },
+    )
+    float_response.raise_for_status()
+    responses_float = PoolingResponse.model_validate(float_response.json())
+
+    base64_response = requests.post(
+        server.url_for("pooling"),
+        json={
+            "input": input_texts,
+            "model": model_name,
+            "encoding_format": "base64",
+        },
+    )
+    base64_response.raise_for_status()
+    responses_base64 = PoolingResponse.model_validate(base64_response.json())
+
+    # Each base64 payload is decoded as raw float32 values.
+    decoded_responses_base64_data = [
+        np.frombuffer(base64.b64decode(data.data), dtype="float32").tolist()
+        for data in responses_base64.data
+    ]
+
+    assert responses_float.data[0].data == decoded_responses_base64_data[0]
+    assert responses_float.data[1].data == decoded_responses_base64_data[1]
+
+    # Omitting encoding_format must give the same values as "float".
+    # (This test talks to the server with `requests`, so no client-side
+    # decoding is involved.)
+    default_response = requests.post(
+        server.url_for("pooling"),
+        json={
+            "input": input_texts,
+            "model": model_name,
+        },
+    )
+    default_response.raise_for_status()
+    responses_default = PoolingResponse.model_validate(default_response.json())
+
+    assert responses_float.data[0].data == responses_default.data[0].data
+    assert responses_float.data[1].data == responses_default.data[1].data
--- a/tests/entrypoints/openai/test_serving_chat.py
+++ b/tests/entrypoints/openai/test_serving_chat.py
 import asyncio
 from contextlib import suppress
 from dataclasses import dataclass
+from typing import Optional
 from unittest.mock import MagicMock

 from vllm.config import MultiModalConfig
@@ -32,6 +33,10 @@ class MockModelConfig:
    multimodal_config = MultiModalConfig()
    hf_config = MockHFConfig()
    logits_processor_pattern = None
+    diff_sampling_param: Optional[dict] = None
+
+    def get_diff_sampling_param(self):
+        return self.diff_sampling_param or {}


 @dataclass
@@ -95,3 +100,59 @@ def test_serving_chat_should_set_correct_max_tokens():
        asyncio.run(serving_chat.create_chat_completion(req))

    assert mock_engine.generate.call_args.args[1].max_tokens == 10
+
+
+def test_serving_chat_could_load_correct_generation_config():
+    """Check how model-level default sampling params reach the engine.
+
+    The mocked model config returns `temperature` and `repetition_penalty`
+    from `get_diff_sampling_param()`. They must be used when the request does
+    not set them, and a request-level `temperature` (including 0.0) must
+    override the default.
+    """
+
+    mock_model_config = MockModelConfig()
+    mock_model_config.diff_sampling_param = {
+        "temperature": 0.5,
+        "repetition_penalty": 1.05
+    }
+
+    mock_engine = MagicMock(spec=MQLLMEngineClient)
+    mock_engine.get_tokenizer.return_value = get_tokenizer(MODEL_NAME)
+    mock_engine.errored = False
+
+    # Initialize the serving chat
+    serving_chat = OpenAIServingChat(mock_engine,
+                                     mock_model_config,
+                                     BASE_MODEL_PATHS,
+                                     response_role="assistant",
+                                     chat_template=CHAT_TEMPLATE,
+                                     chat_template_content_format="auto",
+                                     lora_modules=None,
+                                     prompt_adapters=None,
+                                     request_logger=None)
+    req = ChatCompletionRequest(
+        model=MODEL_NAME,
+        messages=[{
+            "role": "user",
+            "content": "what is 1+1?"
+        }],
+        guided_decoding_backend="outlines",
+    )
+
+    # Any exception from the call is swallowed (presumably the mocked engine
+    # cannot complete a real generation -- TODO confirm). Only the arguments
+    # recorded on `mock_engine.generate` are inspected afterwards.
+    with suppress(Exception):
+        asyncio.run(serving_chat.create_chat_completion(req))
+
+    # args[1] is the second positional argument passed to `generate`; the
+    # assertions read the sampling parameters from it.
+    assert mock_engine.generate.call_args.args[1].temperature == 0.5
+    assert mock_engine.generate.call_args.args[1].repetition_penalty == 1.05
+
+    # Test the param when user set it
+    req.temperature = 0.1
+
+    with suppress(Exception):
+        asyncio.run(serving_chat.create_chat_completion(req))
+
+    assert mock_engine.generate.call_args.args[1].temperature == 0.1
+    assert mock_engine.generate.call_args.args[1].repetition_penalty == 1.05
+
+    # Test When temperature==0.0
+    # An explicit 0.0 is falsy but must still win over the 0.5 default.
+    req.temperature = 0.0
+
+    with suppress(Exception):
+        asyncio.run(serving_chat.create_chat_completion(req))
+
+    assert mock_engine.generate.call_args.args[1].temperature == 0.0
+    assert mock_engine.generate.call_args.args[1].repetition_penalty == 1.05
--- a/tests/entrypoints/openai/test_video.py
+++ b/tests/entrypoints/openai/test_video.py
@@ -90,6 +90,7 @@ async def test_single_chat_session_video(client: openai.AsyncOpenAI,
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
+        temperature=0.0,
        top_logprobs=5)
    assert len(chat_completion.choices) == 1

@@ -182,6 +183,7 @@ async def test_single_chat_session_video_base64encoded(
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
+        temperature=0.0,
        top_logprobs=5)
    assert len(chat_completion.choices) == 1

@@ -202,6 +204,7 @@ async def test_single_chat_session_video_base64encoded(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
+        temperature=0.0,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0

--- a/tests/entrypoints/openai/test_vision.py
+++ b/tests/entrypoints/openai/test_vision.py
@@ -92,6 +92,7 @@ async def test_single_chat_session_image(client: openai.AsyncOpenAI,
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
+        temperature=0.0,
        top_logprobs=5)
    assert len(chat_completion.choices) == 1

@@ -184,6 +185,7 @@ async def test_single_chat_session_image_base64encoded(
        messages=messages,
        max_completion_tokens=10,
        logprobs=True,
+        temperature=0.0,
        top_logprobs=5)
    assert len(chat_completion.choices) == 1

@@ -204,6 +206,7 @@ async def test_single_chat_session_image_base64encoded(
        model=model_name,
        messages=messages,
        max_completion_tokens=10,
+        temperature=0.0,
    )
    message = chat_completion.choices[0].message
    assert message.content is not None and len(message.content) >= 0

--- a/tests/entrypoints/openai/test_vision_embedding.py
+++ b/tests/entrypoints/openai/test_vision_embedding.py
@@ -2,9 +2,9 @@ from typing import Dict

 import os
 import pytest
-import pytest_asyncio
 import requests

+from vllm.entrypoints.openai.protocol import EmbeddingResponse
 from vllm.multimodal.utils import encode_image_base64, fetch_image

 from ...utils import VLLM_PATH, RemoteOpenAIServer, models_path_prefix, urls_port
@@ -54,12 +54,6 @@ def server():
        yield remote_server


-@pytest_asyncio.fixture
-async def client(server):
-    async with server.get_async_client() as async_client:
-        yield async_client
-
-
 @pytest.fixture(scope="session")
 def base64_encoded_image() -> Dict[str, str]:
    return {
@@ -90,18 +84,20 @@ async def test_image_embedding(server: RemoteOpenAIServer, model_name: str,
        ],
    }]

-    response = requests.post(server.url_for("v1/embeddings"),
-                             json={
-                                 "model": model_name,
-                                 "messages": messages,
-                                 "encoding_format": "float"
-                             })
+    response = requests.post(
+        server.url_for("v1/embeddings"),
+        json={
+            "model": model_name,
+            "messages": messages,
+            "encoding_format": "float"
+        },
+    )
    response.raise_for_status()
-
-    embeddings = response.json()
-    assert embeddings["id"] is not None
-    assert len(embeddings["data"]) == 1
-    assert len(embeddings["data"][0]["embedding"]) == 3072
-    assert embeddings["usage"]["completion_tokens"] == 0
-    assert embeddings["usage"]["prompt_tokens"] == 765
-    assert embeddings["usage"]["total_tokens"] == 765
+    embeddings = EmbeddingResponse.model_validate(response.json())
+
+    assert embeddings.id is not None
+    assert len(embeddings.data) == 1
+    assert len(embeddings.data[0].embedding) == 3072
+    assert embeddings.usage.completion_tokens == 0
+    assert embeddings.usage.prompt_tokens == 765
+    assert embeddings.usage.total_tokens == 765
--- a/tests/kernels/test_block_fp8.py
+++ b/tests/kernels/test_block_fp8.py
+# Adapted from https://github.com/sgl-project/sglang/pull/2575
+import itertools
+
+import pytest
+import torch
+
+from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.fused_moe import fused_moe
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    per_token_group_quant_fp8, w8a8_block_fp8_matmul)
+from vllm.platforms import current_platform
+
+# The check is on the device's (major, minor) compute capability, not on the
+# CUDA toolkit version.
+if current_platform.get_device_capability() < (9, 0):
+    pytest.skip("FP8 Triton requires compute capability 9.0 or higher",
+                allow_module_level=True)
+
+# Test configurations
+# The trailing comments list wider value sets that are not enabled here.
+DTYPES = [torch.bfloat16]  # [torch.half, torch.bfloat16, torch.float32]
+NUM_TOKENS = [7, 83, 2048]
+D = [512, 4096, 5120, 13824]
+GROUP_SIZE = [64, 128, 256, 512]
+M = [1, 7, 83, 512, 2048]
+N = [128, 512, 1024, 4096, 7748, 13824]
+K = [256, 4096, 5120, 3884, 13824]
+# Deepseek-V3's intermediate size 18432, so N is 18432*2/8=4608 at TP8
+# and its hidden size is 7168.
+M_moe = [1, 7, 83, 512, 2048]
+N_moe = [4608]  # [128, 4608, 13824]
+K_moe = [7168]  # [256, 7168, 13824]
+# Each entry is [block_n, block_k].
+BLOCK_SIZE = [[128, 128]]
+E = [256]  # [8, 24, 128, 256]
+TOP_KS = [1]  # [1, 2, 6]
+OUT_DTYPES = [torch.bfloat16]  # [torch.float32, torch.half, torch.bfloat16]
+SEEDS = [0]
+
+
+def native_per_token_group_quant_fp8(x,
+                                     group_size,
+                                     eps=1e-10,
+                                     dtype=torch.float8_e4m3fn):
+    """Function to perform per-token-group quantization on an input tensor
+    `x` using native torch.
+
+    Every run of `group_size` consecutive elements along the last dimension
+    shares one scale, `max(max(|group|), eps) / finfo(dtype).max`.
+
+    Args:
+        x: Contiguous tensor whose last dimension is a multiple of
+            `group_size`.
+        group_size: Number of elements that share one scale.
+        eps: Lower bound applied to each group's absolute maximum so the
+            scale is never zero.
+        dtype: Target quantized dtype.
+
+    Returns:
+        `(x_q, x_s)`: the quantized tensor with the shape of `x`, and the
+        float32 scales with shape `x.shape[:-1] + (x.shape[-1] // group_size,)`.
+
+    Raises:
+        AssertionError: If the last dimension is not divisible by
+            `group_size` or `x` is not contiguous.
+    """
+    assert x.shape[-1] % group_size == 0, ("the last dimension of `x` must "
+                                           "be divisible by `group_size`")
+    assert x.is_contiguous(), "`x` is not contiguous"
+
+    finfo = torch.finfo(dtype)
+    fp8_min = finfo.min
+    fp8_max = finfo.max
+
+    # One row per quantization group.
+    x_ = x.reshape(x.numel() // group_size, group_size)
+    amax = x_.abs().max(dim=-1,
+                        keepdim=True)[0].clamp(min=eps).to(torch.float32)
+    x_s = amax / fp8_max
+    x_q = (x_ / x_s).clamp(min=fp8_min, max=fp8_max).to(dtype)
+    x_q = x_q.reshape(x.shape)
+    x_s = x_s.reshape(x.shape[:-1] + (x.shape[-1] // group_size, ))
+
+    return x_q, x_s
+
+
+def native_w8a8_block_fp8_matmul(A,
+                                 B,
+                                 As,
+                                 Bs,
+                                 block_size,
+                                 output_dtype=torch.float16):
+    """Matrix multiplication with block-wise quantization using native torch.
+
+    Computes `A @ B.T` tile by tile in float32, scaling each partial product
+    by the matching activation and weight scales.
+
+    Args:
+        A: Activations of shape `(..., K)`; cast to float32 here.
+        B: Contiguous 2-D weights of shape `(N, K)`; cast to float32 here.
+        As: Activation scales of shape `A.shape[:-1] + (ceil(K / block_k),)`.
+        Bs: Weight scales of shape `(ceil(N / block_n), ceil(K / block_k))`.
+        block_size: `[block_n, block_k]`.
+        output_dtype: dtype of the returned tensor.
+
+    Returns:
+        Tensor of shape `A.shape[:-1] + (N,)` in `output_dtype`.
+    """
+    A = A.to(torch.float32)
+    B = B.to(torch.float32)
+    assert A.shape[-1] == B.shape[-1]
+    assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2
+    assert len(block_size) == 2
+    block_n, block_k = block_size[0], block_size[1]
+    assert (A.shape[-1] + block_k - 1) // block_k == As.shape[-1]
+    assert A.shape[:-1] == As.shape[:-1]
+
+    # Flatten all leading dimensions of A into a single row dimension M.
+    M = A.numel() // A.shape[-1]
+    N, K = B.shape
+    origin_C_shape = A.shape[:-1] + (N, )
+    A = A.reshape(M, A.shape[-1])
+    As = As.reshape(M, As.shape[-1])
+    # Ceil-divide so a trailing partial tile is still counted.
+    n_tiles = (N + block_n - 1) // block_n
+    k_tiles = (K + block_k - 1) // block_k
+    assert n_tiles == Bs.shape[0]
+    assert k_tiles == Bs.shape[1]
+
+    C_shape = (M, N)
+    C = torch.zeros(C_shape, dtype=torch.float32, device=A.device)
+
+    # min(..., K) / min(..., N) clip the last tile to the tensor bounds.
+    A_tiles = [
+        A[:, i * block_k:min((i + 1) * block_k, K)] for i in range(k_tiles)
+    ]
+    B_tiles = [[
+        B[j * block_n:min((j + 1) * block_n, N),
+          i * block_k:min((i + 1) * block_k, K), ] for i in range(k_tiles)
+    ] for j in range(n_tiles)]
+    # Column slices of C; the in-place adds below accumulate into C itself.
+    C_tiles = [
+        C[:, j * block_n:min((j + 1) * block_n, N)] for j in range(n_tiles)
+    ]
+    # One (M, 1) column of activation scales per k-tile.
+    As_tiles = [As[:, i:i + 1] for i in range(k_tiles)]
+
+    for i in range(k_tiles):
+        for j in range(n_tiles):
+            a = A_tiles[i]
+            b = B_tiles[j][i]
+            c = C_tiles[j]
+            # Per-row activation scale times the scale of weight block (j, i).
+            s = As_tiles[i] * Bs[j][i]
+            c[:, :] += torch.matmul(a, b.t()) * s
+
+    C = C.reshape(origin_C_shape).to(output_dtype)
+    return C
+
+
+def torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, score, topk, block_shape):
+    """Fused moe with block-wise quantization using native torch.
+
+    Args:
+        a: 2-D activations of shape `(B, D)`.
+        w1: Per-expert first-layer weights; `w1.shape[0]` is the expert count.
+        w2: Per-expert second-layer weights; `w2.shape[1]` is the output width.
+        w1_s: Block scales for `w1`, indexed by expert.
+        w2_s: Block scales for `w2`, indexed by expert.
+        score: Router logits; softmax is taken over the last dimension.
+        topk: Number of experts selected per token.
+        block_shape: `[block_n, block_k]` quantization block size.
+
+    Returns:
+        Tensor of shape `(B, w2.shape[1])`: the expert outputs weighted by
+        their top-k softmax scores and summed per token.
+    """
+    B, D = a.shape
+    # Duplicate every token once per selected expert.
+    a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
+    out = torch.zeros(B * topk, w2.shape[1], dtype=a.dtype, device=a.device)
+    score = torch.softmax(score, dim=-1, dtype=torch.float32)
+    topk_weight, topk_ids = torch.topk(score, topk)
+    topk_weight = topk_weight.view(-1)
+    topk_ids = topk_ids.view(-1)
+
+    # Only the K-dimension block size is needed for activation quantization.
+    block_k = block_shape[1]
+    a_q, a_s = native_per_token_group_quant_fp8(a, block_k)
+    a_q = a_q.to(torch.float32)
+    for i in range(w1.shape[0]):
+        # Rows routed to expert i.
+        mask = topk_ids == i
+        if mask.sum():
+            inter_out = native_w8a8_block_fp8_matmul(a_q[mask],
+                                                     w1[i],
+                                                     a_s[mask],
+                                                     w1_s[i],
+                                                     block_shape,
+                                                     output_dtype=a.dtype)
+            act_out = SiluAndMul().forward_native(inter_out)
+            # Re-quantize the activation output before the second matmul.
+            act_out_q, act_out_s = native_per_token_group_quant_fp8(
+                act_out, block_k)
+            out[mask] = native_w8a8_block_fp8_matmul(act_out_q,
+                                                     w2[i],
+                                                     act_out_s,
+                                                     w2_s[i],
+                                                     block_shape,
+                                                     output_dtype=a.dtype)
+    return (out.view(B, -1, w2.shape[1]) *
+            topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1)
+
+
+# Skip all tests if the torch.cuda module cannot be imported. NOTE: this does
+# not check that a CUDA device is actually present.
+pytest.importorskip("torch.cuda")
+
+
+@pytest.fixture(autouse=True)
+def setup_cuda():
+    """Make "cuda" the default torch device before every test in this module.
+
+    NOTE(review): the previous default device is not restored afterwards.
+    """
+    torch.set_default_device("cuda")
+
+
+@pytest.mark.parametrize("num_tokens,d,dtype,group_size,seed",
+                         itertools.product(NUM_TOKENS, D, DTYPES, GROUP_SIZE,
+                                           SEEDS))
+@torch.inference_mode()
+def test_per_token_group_quant_fp8(num_tokens, d, dtype, group_size, seed):
+    """Compare the per-token-group FP8 quantization kernel with the native
+    torch reference on random input."""
+    torch.manual_seed(seed)
+    inp = torch.rand(num_tokens, d, dtype=dtype)
+
+    expected_q, expected_scale = native_per_token_group_quant_fp8(
+        inp, group_size)
+    actual_q, actual_scale = per_token_group_quant_fp8(inp, group_size)
+
+    # Quantized values are compared in float32 with a loose relative
+    # tolerance; the scales are compared with allclose's default tolerances.
+    quantized_match = torch.allclose(actual_q.to(torch.float32),
+                                     expected_q.to(torch.float32),
+                                     rtol=0.15)
+    assert quantized_match
+    assert torch.allclose(actual_scale, expected_scale)
+
+
+@pytest.mark.parametrize("M,N,K,block_size,out_dtype,seed",
+                         itertools.product(M, N, K, BLOCK_SIZE, OUT_DTYPES,
+                                           SEEDS))
+@torch.inference_mode()
+def test_w8a8_block_fp8_matmul(M, N, K, block_size, out_dtype, seed):
+    """Compare the block-wise FP8 matmul kernel with the native torch
+    reference on random FP8 inputs and random block scales."""
+    torch.manual_seed(seed)
+    factor_for_scale = 1e-2
+    fp8_info = torch.finfo(torch.float8_e4m3fn)
+    fp8_max, fp8_min = fp8_info.max, fp8_info.min
+
+    # Uniform values in [-fp8_max, fp8_max), then clamped and cast to FP8.
+    A_fp32 = (torch.rand(M, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
+    A_fp8 = A_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+    B_fp32 = (torch.rand(N, K, dtype=torch.float32) - 0.5) * 2 * fp8_max
+    B_fp8 = B_fp32.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+
+    # Ceil-divide so partial trailing blocks get their own scale.
+    block_n, block_k = block_size[0], block_size[1]
+    n_tiles = (N + block_n - 1) // block_n
+    k_tiles = (K + block_k - 1) // block_k
+
+    # Random scales in [0, factor_for_scale).
+    As = torch.rand(M, k_tiles, dtype=torch.float32) * factor_for_scale
+    Bs = torch.rand(n_tiles, k_tiles, dtype=torch.float32) * factor_for_scale
+
+    ref_out = native_w8a8_block_fp8_matmul(A_fp8, B_fp8, As, Bs, block_size,
+                                           out_dtype)
+    out = w8a8_block_fp8_matmul(A_fp8, B_fp8, As, Bs, block_size, out_dtype)
+
+    # Mean absolute error relative to the mean absolute reference value.
+    rel_diff = (torch.mean(
+        torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) /
+                torch.mean(torch.abs(ref_out.to(torch.float32))))
+    assert rel_diff < 0.001
+
+
+@pytest.mark.parametrize("M,N,K,E,topk,block_size,dtype,seed",
+                         itertools.product(M_moe, N_moe, K_moe, E, TOP_KS,
+                                           BLOCK_SIZE, DTYPES, SEEDS))
+@torch.inference_mode()
+def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed):
+    """Compare `fused_moe` with block-wise FP8 weights against the native
+    torch reference `torch_w8a8_block_fp8_moe`."""
+    torch.manual_seed(seed)
+    factor_for_scale = 1e-2
+    fp8_info = torch.finfo(torch.float8_e4m3fn)
+    fp8_max, fp8_min = fp8_info.max, fp8_info.min
+
+    a = torch.randn((M, K), dtype=dtype) / 10
+
+    # w1 has shape (E, 2 * N, K); values span [-fp8_max, fp8_max) before the
+    # cast to FP8.
+    w1_bf16 = (torch.rand(
+        (E, 2 * N, K), dtype=torch.bfloat16) - 0.5) * 2 * fp8_max
+    w1 = w1_bf16.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+    # Drop the bf16 staging copy (presumably to lower peak memory).
+    del w1_bf16
+
+    # w2 has shape (E, K, N).
+    w2_bf16 = (torch.rand((E, K, N), dtype=torch.bfloat16) - 0.5) * 2 * fp8_max
+    w2 = w2_bf16.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)
+    del w2_bf16
+
+    # Number of scale blocks along each weight dimension (ceil division).
+    block_n, block_k = block_size[0], block_size[1]
+    n_tiles_w1 = (2 * N + block_n - 1) // block_n
+    n_tiles_w2 = (K + block_n - 1) // block_n
+    k_tiles_w1 = (K + block_k - 1) // block_k
+    k_tiles_w2 = (N + block_k - 1) // block_k
+
+    w1_s = torch.rand(
+        (E, n_tiles_w1, k_tiles_w1), dtype=torch.float32) * factor_for_scale
+    w2_s = torch.rand(
+        (E, n_tiles_w2, k_tiles_w2), dtype=torch.float32) * factor_for_scale
+
+    # Router logits, one row per token and one column per expert.
+    score = torch.randn((M, E), dtype=dtype)
+
+    out = fused_moe(
+        a,
+        w1,
+        w2,
+        score,
+        topk,
+        renormalize=False,
+        use_fp8_w8a8=True,
+        w1_scale=w1_s,
+        w2_scale=w2_s,
+        block_shape=block_size,
+    )
+    ref_out = torch_w8a8_block_fp8_moe(a, w1, w2, w1_s, w2_s, score, topk,
+                                       block_size)
+
+    # Debug output only; visible when pytest runs with -s or on failure.
+    print(f"{out.sum()=}")
+    print(f"{ref_out.sum()=}")
+
+    # Mean absolute error relative to the mean absolute reference value.
+    rel_diff = (torch.mean(
+        torch.abs(out.to(torch.float32) - ref_out.to(torch.float32))) /
+                torch.mean(torch.abs(ref_out.to(torch.float32))))
+    assert rel_diff < 0.03
--- a/tests/kernels/test_semi_structured.py
+++ b/tests/kernels/test_semi_structured.py
+"""Tests for sparse cutlass kernels
+
+Run `pytest tests/kernels/test_semi_structured.py`.
+"""
+from typing import Optional, Tuple, Type
+
+import pytest
+import torch
+
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+    sparse_cutlass_supported)
+from vllm.platforms import current_platform
+
+# Candidate device strings: one entry when exactly one GPU is visible,
+# otherwise two. NOTE(review): a device count of 0 also yields two entries.
+CUDA_DEVICES = [
+    f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)
+]
+
+# Fold the (major, minor) capability into one integer, e.g. (9, 0) -> 90.
+capability = current_platform.get_device_capability()
+capability = capability[0] * 10 + capability[1]
+
+
+def to_fp8(tensor: torch.Tensor):
+    """Clamp to the float8_e4m3fn range, round, and cast to float8_e4m3fn."""
+    fp8 = torch.float8_e4m3fn
+    limits = torch.finfo(fp8)
+    clamped = tensor.clamp(min=limits.min, max=limits.max)
+    return torch.round(clamped).to(dtype=fp8)
+
+
+def to_int8(tensor: torch.Tensor):
+    """Saturate to [-128, 127], round, and cast to int8."""
+    saturated = tensor.clamp(min=-128, max=127)
+    return saturated.round().to(dtype=torch.int8)
+
+
+def rand_int8(shape: tuple, device: str = "cuda"):
+    """Return a random int8 tensor of `shape` on `device`.
+
+    Uniform samples from [0, 1) are mapped to [-128, 127) and passed through
+    `to_int8`.
+    """
+    uniform = torch.rand(shape, device=device)
+    return to_int8(uniform * 255 - 128)
+
+
+def to_bf16(tensor: torch.Tensor) -> torch.Tensor:
+    """Cast `tensor` to bfloat16."""
+    target = torch.bfloat16
+    return tensor.to(dtype=target)
+
+
+def to_fp16(tensor: torch.Tensor) -> torch.Tensor:
+    """Cast `tensor` to float16."""
+    target = torch.float16
+    return tensor.to(dtype=target)
+
+
+def prune_to_2_4(tensor):
+    """Zero all but the two largest-magnitude values in each group of four.
+
+    The tensor is viewed as rows of four consecutive elements, pruned, and
+    returned in its original shape. Negative zeros are normalized to 0.0.
+    """
+    # View as [N, 4], one row per group of four.
+    groups = tensor.reshape(-1, 4)
+
+    # Positions of the two largest absolute values in every group.
+    top2 = torch.topk(torch.abs(groups), k=2, dim=1).indices
+
+    # 1 where a value is kept, 0 where it is pruned.
+    keep = torch.zeros_like(groups)
+    keep.scatter_(dim=1,
+                  index=top2,
+                  src=torch.ones_like(top2, dtype=keep.dtype))
+
+    result = groups * keep
+
+    # Multiplying a negative value by 0 gives -0.0; turn those into 0.0.
+    result[result == -0.0] = 0.0
+
+    return result.reshape(tensor.shape)
+
+
+def make_rand_sparse_tensors(
+        dtype: torch.dtype, m: int, n: int, k: int
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    """Build random operands for a sparse matmul test.
+
+    `a` is an (m, k) tensor and `b` a (k, n) tensor pruned by `prune_to_2_4`
+    along k; both are converted to `dtype`. `b` is then compressed with
+    `ops.cutlass_sparse_compress`.
+
+    Returns:
+        `(b_compressed, e, a, b)`: the two outputs of the compression call,
+        followed by the original `a` and pruned `b`.
+
+    Raises:
+        ValueError: If `dtype` is not int8, float8_e4m3fn, float16 or
+            bfloat16.
+    """
+    dense_a = torch.randn((m, k), device='cuda') * 5
+    dense_b = torch.randn((n, k), device='cuda').t() * 5
+
+    # Prune in (n, k) layout so the groups of four run along k.
+    dense_b = prune_to_2_4(dense_b.t()).t()
+
+    converters = {
+        torch.int8: to_int8,
+        torch.float8_e4m3fn: to_fp8,
+        torch.float16: to_fp16,
+        torch.bfloat16: to_bf16,
+    }
+    convert = converters.get(dtype)
+    if convert is None:
+        raise ValueError("unsupported dtype")
+    dense_a, dense_b = convert(dense_a), convert(dense_b)
+
+    compressed_b, meta = ops.cutlass_sparse_compress(dense_b.t())
+
+    # Compressed B, Metadata, Original A, B
+    return compressed_b, meta, dense_a, dense_b
+
+
+def baseline_scaled_mm(a: torch.Tensor,
+                       b: torch.Tensor,
+                       scale_a: torch.Tensor,
+                       scale_b: torch.Tensor,
+                       out_dtype: torch.dtype,
+                       bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+    """Reference scaled matmul computed in float32 with plain torch ops.
+
+    Args:
+        a: Left operand, cast to float32 for the product.
+        b: Right operand, cast to float32 for the product.
+        scale_a: Scale multiplied onto the product (broadcast by torch).
+        scale_b: Scale multiplied onto the product (broadcast by torch).
+        out_dtype: dtype the scaled product is cast to.
+        bias: Optional tensor added after the cast to `out_dtype`.
+
+    Returns:
+        `(scale_a * (scale_b * (a @ b))).to(out_dtype)`, plus `bias` if given.
+    """
+    product = torch.mm(a.to(dtype=torch.float32), b.to(dtype=torch.float32))
+    output = (scale_a * (scale_b * product)).to(out_dtype)
+    if bias is not None:
+        output = output + bias
+
+    return output
+
+
+@pytest.mark.skipif(not sparse_cutlass_supported(),
+                    reason="Sparse FP8 is not yet supported on this GPU type.")
+# Test working with a subset of A and B for sparse matmul
+def test_cutlass_sparse_subset():
+    """Run the sparse scaled matmul on a slice of a larger A tensor and
+    compare it with the dense float32 baseline."""
+
+    big_m = 1024
+    m, n, k = 512, 512, 512
+
+    # Create tensors
+    b_comp, e, whole_a, b = make_rand_sparse_tensors(torch.float8_e4m3fn,
+                                                     big_m, n, k)
+    # Only the first m rows of the (big_m, k) tensor are used, so `a` is a
+    # slice of a larger allocation.
+    a = whole_a[0:m, 0:k]
+    # Single-element scales.
+    scale_a = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10
+    scale_b = torch.randn((1, 1), device="cuda", dtype=torch.float32) / 10
+
+    out = ops.cutlass_scaled_sparse_mm(a,
+                                       b_comp,
+                                       e,
+                                       scale_a,
+                                       scale_b,
+                                       out_dtype=torch.bfloat16)
+    baseline = baseline_scaled_mm(a,
+                                  b,
+                                  scale_a,
+                                  scale_b,
+                                  out_dtype=torch.bfloat16)
+
+    torch.testing.assert_close(out, baseline, rtol=1e-1, atol=1e0)
--- a/tests/kv_transfer/test_lookup_buffer.py
+++ b/tests/kv_transfer/test_lookup_buffer.py
@@ -48,7 +48,7 @@ def test_run(my_rank, buffer, device):
        assert buffer.buffer_size == 0
        assert len(buffer.buffer) == 0

-    print("Test run passed!")
+    print("My rank: %d, Test run passed!" % (my_rank))


 def stress_test(my_rank, buf, device):
@@ -108,7 +108,7 @@ def stress_test(my_rank, buf, device):
    else:
        torch.distributed.send(torch.tensor([n]), 0)

-    print("Passed stress test!")
+    print("My rank: %d, Passed stress test!" % (my_rank))


 if __name__ == "__main__":

--- a/tests/kv_transfer/test_lookup_buffer.sh
+++ b/tests/kv_transfer/test_lookup_buffer.sh
 #!/bin/bash
-RANK=0 python test_lookup_buffer.py &
-RANK=1 python test_lookup_buffer.py &
\ No newline at end of file
+RANK=0 python3 test_lookup_buffer.py &
+PID0=$!
+RANK=1 python3 test_lookup_buffer.py &
+PID1=$!
+
+wait $PID0
+wait $PID1