Merge tag 'v0.10.2rc2' into v0.10.2rc2-ori

38d80967 · zhuwenwen · 33650733 · 880c741b · 38d80967 · 38d80967
Commit 38d80967 authored Sep 12, 2025 by zhuwenwen
20 changed files
--- a/tests/v1/entrypoints/openai/test_completion.py
+++ b/tests/v1/entrypoints/openai/test_completion.py
@@ -686,7 +686,7 @@ async def test_invalid_grammar(client: openai.AsyncOpenAI, model_name: str):
 async def test_completion_with_empty_prompt_embeds(
        client: openai.AsyncOpenAI) -> None:
    """Test completion with empty prompt embeds."""
-    payload: dict[str, list] = {"prompt_embeds": []}
+    payload: dict[str, object] = {"prompt": "Hello", "prompt_embeds": []}
    headers: dict[str, str] = {"Content-Type": "application/json"}
    # base_url = http://localhost:8000/v1/completions
    response = requests.post(f"{client.base_url}completions",

--- a/tests/v1/executor/test_executor.py
+++ b/tests/v1/executor/test_executor.py
@@ -27,7 +27,7 @@ class CustomMultiprocExecutor(MultiprocExecutor):
                       kwargs: Optional[dict] = None,
                       non_block: bool = False,
                       unique_reply_rank: Optional[int] = None) -> list[Any]:
-        # Drop marker to show that this was ran
+        # Drop marker to show that this was run
        with open(".marker", "w"):
            ...
        return super().collective_rpc(method, timeout, args, kwargs)

--- a/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py
+++ b/tests/v1/kv_connector/unit/test_remote_decode_lifecycle.py
@@ -42,7 +42,7 @@ def test_basic_lifecycle():
    engine_core_outputs = scheduler.update_from_output(scheduler_output,
                                                       model_runner_output)

-    # Ensure the request is finished after 1 tokens.
+    # Ensure the request is finished after 1 token.
    assert request.is_finished()
    assert request.status == RequestStatus.FINISHED_LENGTH_CAPPED
    output = engine_core_outputs[0].outputs[0]
@@ -141,7 +141,7 @@ def test_short_prompt_lifecycle():


 def test_prefix_cache_lifecycle():
-    """Test that remote decode params still works with a prefix cache hit."""
+    """Test that remote decode params still work with a prefix cache hit."""

    vllm_config = create_vllm_config()
    scheduler = create_scheduler(vllm_config)

--- a/tests/v1/kv_connector/unit/test_shared_storage_connector.py
+++ b/tests/v1/kv_connector/unit/test_shared_storage_connector.py
@@ -33,7 +33,7 @@ def _check_path_len(path):


 def _list_path(path):
-    """Return the list of foldername (hashes generatd) under the path"""
+    """Return the list of folders (named by generated hashes) under the path"""
    return list(path.iterdir())


@@ -41,7 +41,7 @@ def run_test(tmp_path, processor, llm: LLM, question: str,
             image_urls: list[Image], expected_len: int, info: str):
    """
    One individual test to process the prompt and output base on 1 set of input
-    Then check if the length in the strorage path matches the expected length
+    Then check if the length in the storage path matches the expected length
    `info` introduces details or purpose of the individual test
    """
    print(f"***info: {info}***")
@@ -115,7 +115,7 @@ def test_shared_storage_connector_hashes(tmp_path):
    """
    Tests that SharedStorageConnector saves KV to the storage locations
    with proper hashes; that are unique for inputs with identical text but 
-    differnt images (same size), or same multiple images but different orders.
+    different images (same size), or same multiple images but different orders.
    """
    # Using tmp_path as the storage path to store KV
    print(f"KV storage path at: {str(tmp_path)}")
@@ -171,12 +171,12 @@ def test_shared_storage_connector_hashes(tmp_path):
                  img=[image_1],
                  expected_len=2,
                  info=("image_1 single input the 2nd time. "
-                        "It should not form aother new hash.")),
+                        "It should not form another new hash.")),
        InputCase(text=TEXT_PROMPTS[0],
                  img=[image_2],
                  expected_len=2,
                  info=("image_2 single input the 2nd time. "
-                        "It should not form aother new hash.")),
+                        "It should not form another new hash.")),
        InputCase(text=TEXT_PROMPTS[0],
                  img=[image_1, image_2],
                  expected_len=3,
@@ -189,12 +189,12 @@ def test_shared_storage_connector_hashes(tmp_path):
                  img=[image_1, image_2],
                  expected_len=4,
                  info=("[image_1, image_2] input the 2nd time. "
-                        "It should not form aother new hash.")),
+                        "It should not form another new hash.")),
        InputCase(text=TEXT_PROMPTS[0],
                  img=[image_2, image_1],
                  expected_len=4,
                  info=("[image_2, image_1] input the 2nd time. "
-                        "It should not form aother new hash.")),
+                        "It should not form another new hash.")),
        InputCase(text=TEXT_PROMPTS[0],
                  img=[],
                  expected_len=5,

--- a/tests/v1/kv_connector/unit/utils.py
+++ b/tests/v1/kv_connector/unit/utils.py
@@ -13,6 +13,7 @@ from vllm.distributed.kv_transfer.kv_connector.factory import (
    KVConnectorFactory)
 from vllm.distributed.kv_transfer.kv_connector.v1.shared_storage_connector import (  # noqa
    SharedStorageConnector)
+from vllm.utils import sha256
 from vllm.v1.core.kv_cache_manager import KVCacheBlocks
 from vllm.v1.core.kv_cache_utils import (get_request_block_hasher,
                                         init_none_hash)
@@ -127,11 +128,11 @@ def create_request(request_id: int,
                   use_all_1s_for_prompt_tokens: bool = False,
                   num_remote_blocks: int = 3,
                   block_size: int = 16,
-                   hash_fn: Callable = hash) -> Request:
+                   hash_fn: Callable = sha256) -> Request:
    """Make dummy request for testing."""
    global _none_hash_initialized
    if not _none_hash_initialized:
-        init_none_hash(hash)
+        init_none_hash(hash_fn)
        _none_hash_initialized = True

    kv_transfer_params: Optional[dict[str, Any]] = None

--- a/tests/v1/logits_processors/test_custom_offline.py
+++ b/tests/v1/logits_processors/test_custom_offline.py
@@ -15,6 +15,7 @@ from tests.v1.logits_processors.utils import (DUMMY_LOGITPROC_ARG,
                                              POOLING_MODEL_NAME, TEMP_GREEDY,
                                              CustomLogitprocSource,
                                              DummyLogitsProcessor,
+                                              WrappedPerReqLogitsProcessor,
                                              dummy_module)
 from tests.v1.logits_processors.utils import entry_points as fake_entry_points
 from tests.v1.logits_processors.utils import prompts
@@ -80,7 +81,7 @@ def _run_test(kwargs: dict, logitproc_loaded: bool) -> None:
            target_token = params.extra_args[DUMMY_LOGITPROC_ARG]
            if not all(x == target_token for x in lp_toks):
                raise AssertionError(
-                    f"Request {bdx} generated {lp_toks}, shoud all be "
+                    f"Request {bdx} generated {lp_toks}, should all be "
                    f"{target_token}")
        else:
            # This request does not exercise custom logitproc (or custom
@@ -161,6 +162,38 @@ def test_custom_logitsprocs(monkeypatch,
    _run_test(kwargs, logitproc_loaded=True)


+@create_new_process_for_each_test()
+def test_custom_logitsprocs_req(monkeypatch):
+    """Test passing request-level logits processor to offline Python interface
+
+    Wrap a request-level logits processor to create a batch-level logits
+    processor that has a well-defined behavior (mask out all tokens except one
+    `target_token`)
+
+    Construct an `LLM` instance which loads the wrapped logits processor. Pass
+    the custom logitproc as a class object.
+
+    Construct a reference `LLM` instance with no custom logitproc
+
+    Pass in a batch of requests, 50% of which pass a `target_token` value
+    in through `SamplingParams.extra_args`, 50% of which do not.
+
+    Validate that
+    * Requests which do not activate the custom logitproc yield the same
+      results for both `LLM` instances
+    * Requests which activate the custom logitproc only output `target_token`
+
+    Args:
+      monkeypatch: for setting env vars
+    """
+
+    # Multiprocessing on: test that logitproc info is passed to workers
+    monkeypatch.setenv("VLLM_ENABLE_V1_MULTIPROCESSING", "1")
+    random.seed(40)  # fixed seed; presumably drives _run_test's request mix
+    _run_test({"logits_processors": [WrappedPerReqLogitsProcessor]},
+              logitproc_loaded=True)
+
+
 @create_new_process_for_each_test()
 @pytest.mark.parametrize("logitproc_source", [
    CustomLogitprocSource.LOGITPROC_SOURCE_ENTRYPOINT,

--- a/tests/v1/logits_processors/utils.py
+++ b/tests/v1/logits_processors/utils.py
@@ -3,15 +3,21 @@

 import types
 from enum import Enum, auto
-from typing import Optional
+from typing import Any, Optional

 import torch

 from vllm.config import VllmConfig
-from vllm.v1.sample.logits_processor import (LOGITSPROCS_GROUP, BatchUpdate,
-                                             LogitsProcessor)
+from vllm.logger import init_logger
+from vllm.sampling_params import SamplingParams
+from vllm.v1.sample.logits_processor import (LOGITSPROCS_GROUP,
+                                             AdapterLogitsProcessor,
+                                             BatchUpdate, LogitsProcessor,
+                                             RequestLogitsProcessor)
 from vllm.v1.sample.logits_processor.builtin import process_dict_updates

+logger = init_logger(__name__)
+
 MODEL_NAME = "facebook/opt-125m"
 POOLING_MODEL_NAME = "BAAI/bge-base-en-v1.5"
 DUMMY_LOGITPROC_ARG = "target_token"
@@ -104,5 +110,60 @@ class EntryPoints(list):
        self.names = [ep.name for ep in eps]


+class DummyPerReqLogitsProcessor:
+    """Request-level logits processor that masks out every logit except the
+    one at index `target_token`, which keeps its original value."""
+
+    def __init__(self, target_token: int) -> None:
+        """Remember `target_token`, the index of the logit to keep."""
+        self.target_token = target_token
+
+    def __call__(
+        self,
+        output_ids: list[int],  # not read by this processor
+        logits: torch.Tensor,  # modified in place and also returned
+    ) -> torch.Tensor:
+        val_to_keep = logits[self.target_token].item()  # save before masking
+        logits[:] = float("-inf")
+        logits[self.target_token] = val_to_keep
+        return logits
+
+
+class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
+    """Example of wrapping a fake request-level logit processor to create a
+    batch-level logits processor"""
+
+    def is_argmax_invariant(self) -> bool:
+        return False  # masking logits to -inf can change which token wins
+
+    def new_req_logits_processor(
+        self,
+        params: SamplingParams,
+    ) -> Optional[RequestLogitsProcessor]:
+        """This method returns a new request-level logits processor, customized
+        to the `target_token` value associated with a particular request.
+
+        Returns None if the logits processor should not be applied to the
+        particular request. To use the logits processor the request must have
+        a "target_token" custom argument with an integer value.
+
+        Args:
+          params: per-request sampling params
+
+        Returns:
+          `Callable` request logits processor, or None
+        """
+        target_token: Optional[Any] = (params.extra_args  # None or {} -> {}
+                                       or {}).get("target_token")
+        if target_token is None:
+            return None
+        if not isinstance(target_token, int):
+            logger.warning(
+                "target_token value %s is not int; not applying logits"
+                " processor to request.", target_token)
+            return None
+        return DummyPerReqLogitsProcessor(target_token)
+
+
 """Fake version of importlib.metadata.entry_points"""
 entry_points = lambda group: EntryPoints(group)
--- a/tests/v1/metrics/test_engine_logger_apis.py
+++ b/tests/v1/metrics/test_engine_logger_apis.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import copy
+
+import pytest
+
+from vllm.v1.engine.async_llm import AsyncEngineArgs, AsyncLLM
+from vllm.v1.metrics.ray_wrappers import RayPrometheusStatLogger
+
+
+class DummyStatLogger:
+    """Stand-in stat logger that only remembers how it was called.
+
+    Provides `record`, `log` and `log_engine_initialized`, described by the
+    original author as the minimal interface expected by StatLoggerManager.
+    """
+
+    def __init__(self, vllm_config, engine_idx):
+        """Keep the constructor arguments and reset all call trackers."""
+        self.vllm_config, self.engine_idx = vllm_config, engine_idx
+        self.recorded = []
+        self.logged = self.engine_initialized = False
+
+    def record(self, scheduler_stats, iteration_stats, engine_idx):
+        """Append the three arguments, as one tuple, to `self.recorded`."""
+        entry = (scheduler_stats, iteration_stats, engine_idx)
+        self.recorded.append(entry)
+
+    def log(self):
+        """Flag that `log` has been invoked."""
+        self.logged = True
+
+    def log_engine_initialized(self):
+        """Flag that `log_engine_initialized` has been invoked."""
+        self.engine_initialized = True
+
+
+@pytest.fixture
+def log_stats_enabled_engine_args():
+    """
+    Build the AsyncEngineArgs shared by the tests in this module:
+    the distilgpt2 model, dtype "half", enforce_eager set, and
+    disable_log_stats left False so stat logging stays on.
+    """
+    engine_kwargs = dict(
+        model="distilbert/distilgpt2",
+        dtype="half",
+        disable_log_stats=False,
+        enforce_eager=True,
+    )
+    return AsyncEngineArgs(**engine_kwargs)
+
+
+@pytest.mark.asyncio
+async def test_async_llm_replace_default_loggers(
+        log_stats_enabled_engine_args):
+    """
+    RayPrometheusStatLogger should replace the default PrometheusStatLogger
+
+    Args:
+      log_stats_enabled_engine_args: engine args fixture with stat logging
+        enabled
+    """
+
+    engine = AsyncLLM.from_engine_args(log_stats_enabled_engine_args,
+                                       stat_loggers=[RayPrometheusStatLogger])
+    try:
+        assert isinstance(engine.logger_manager.prometheus_logger,
+                          RayPrometheusStatLogger)
+    finally:
+        # Shut down even when the assertion fails, so a failing test still
+        # releases the engine it created.
+        engine.shutdown()
+
+
+@pytest.mark.asyncio
+async def test_async_llm_add_to_default_loggers(log_stats_enabled_engine_args):
+    """
+    It's still possible to use custom stat loggers exclusively by passing
+    disable_log_stats=True in addition to a list of custom stat loggers.
+
+    Args:
+      log_stats_enabled_engine_args: engine args fixture with stat logging
+        enabled; a modified copy is used here
+    """
+    # Work on a deep copy so the fixture's object is not mutated
+    disabled_log_engine_args = copy.deepcopy(log_stats_enabled_engine_args)
+    disabled_log_engine_args.disable_log_stats = True
+
+    # Disable default loggers; pass custom stat logger to the constructor
+    engine = AsyncLLM.from_engine_args(disabled_log_engine_args,
+                                       stat_loggers=[DummyStatLogger])
+    try:
+        engine_loggers = engine.logger_manager.per_engine_logger_dict[0]
+        assert len(engine_loggers) == 1
+        assert isinstance(engine_loggers[0], DummyStatLogger)
+
+        # log_stats is still True, since custom stat loggers are used
+        assert engine.log_stats
+    finally:
+        # Shut down even when an assertion fails, so a failing test still
+        # releases the engine it created.
+        engine.shutdown()
--- a/tests/v1/sample/test_logprobs.py
+++ b/tests/v1/sample/test_logprobs.py
@@ -430,7 +430,7 @@ def test_zero_logprobs(vllm_model, example_prompts,


 def test_all_logprobs(example_prompts, monkeypatch: pytest.MonkeyPatch):
-    """Engine should return all vocabulary logprobs
+    """Engine should return all vocabulary logprobs and prompt logprobs

    Args:
      example_prompts: list of example prompts (test fixture)
@@ -444,16 +444,24 @@ def test_all_logprobs(example_prompts, monkeypatch: pytest.MonkeyPatch):
            # 2 other llms alive during whole session
            gpu_memory_utilization=0.15,
            max_model_len=256)
+
        sampling_params_logprobs_all = SamplingParams(max_tokens=5,
-                                                      logprobs=-1)
+                                                      logprobs=-1,
+                                                      prompt_logprobs=-1)
        results_logprobs_all = runner.llm.generate(
            example_prompts, sampling_params=sampling_params_logprobs_all)
        vocab_size = runner.llm.llm_engine.get_model_config().get_vocab_size()
+
        for i in range(len(results_logprobs_all)):
            logprobs = results_logprobs_all[i].outputs[0].logprobs
+            prompt_logprobs = results_logprobs_all[i].prompt_logprobs
            assert logprobs is not None
            for logprob in logprobs:
                assert len(logprob) == vocab_size
+            assert prompt_logprobs is not None
+            assert prompt_logprobs[0] is None
+            for prompt_logprob in prompt_logprobs[1:]:
+                assert len(prompt_logprob) == vocab_size


 @pytest.mark.parametrize("logprobs_mode", list(LogprobsMode))

--- a/tests/v1/spec_decode/test_eagle.py
+++ b/tests/v1/spec_decode/test_eagle.py
@@ -12,9 +12,10 @@ from tests.v1.attention.utils import (BatchSpec, _Backend,
                                      create_common_attn_metadata,
                                      create_standard_kv_cache_spec,
                                      get_attention_backend)
-from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, ModelConfig,
+from vllm.config import (CacheConfig, DeviceConfig, ModelConfig,
                         ParallelConfig, SchedulerConfig, SpeculativeConfig,
                         VllmConfig)
+from vllm.config.load import LoadConfig
 from vllm.model_executor.models.llama import LlamaForCausalLM
 from vllm.platforms import current_platform
 from vllm.v1.spec_decode.eagle import EagleProposer
@@ -183,7 +184,7 @@ def test_load_model(mock_get_model, mock_get_layers, mock_get_pp_group, method,
    mock_pp_group.world_size = pp_size
    mock_get_pp_group.return_value = mock_pp_group

-    # Setup the target model mock with a custom class so that
+    # Set up the target model mock with a custom class so that
    # isinstance() checks match the expected type.
    class _TargetModelStub(LlamaForCausalLM):
        model: mock.MagicMock

--- a/tests/v1/spec_decode/test_tree_attention.py
+++ b/tests/v1/spec_decode/test_tree_attention.py
@@ -187,7 +187,7 @@ def test_tree_attn_correctness() -> None:
                        dtype=torch.bfloat16,
                    )

-                    # Setup the block table and KV cache for paged KV.
+                    # Set up the block table and KV cache for paged KV.
                    assert max_sequence_length % block_size == 0
                    max_blocks_per_batch = max_sequence_length // block_size
                    kv_cache = torch.randn(
@@ -222,7 +222,7 @@ def test_tree_attn_correctness() -> None:
                                num_alloc_blocks_per_batch] = block_ids.view(
                                    -1, num_alloc_blocks_per_batch)

-                    # Setup the slot mapping for the input KVs.
+                    # Set up the slot mapping for the input KVs.
                    tree_positions = sequence_position + torch.arange(
                        0,
                        tree_size_q,

--- a/tests/v1/test_kv_sharing.py
+++ b/tests/v1/test_kv_sharing.py
@@ -30,7 +30,7 @@ def test_initialize_kv_cache_for_kv_sharing_different_attn_groups():
    }

    # Layers 0 and 1 both belong in KV cache group 0
-    # However, if they have have different attention backends, they will be
+    # However, if they have different attention backends, they will be
    # placed in different attention groups for KV cache group 0
    kv_cache_groups = [
        KVCacheGroupSpec(["model.layers.0", "model.layers.1"],

--- a/tests/v1/test_oracle.py
+++ b/tests/v1/test_oracle.py
@@ -10,7 +10,6 @@ from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine

 UNSUPPORTED_MODELS_V1 = [
-    "openai/whisper-large-v3",  # transcription
    "facebook/bart-large-cnn",  # encoder decoder
 ]


--- a/tests/v1/tpu/test_multimodal.py
+++ b/tests/v1/tpu/test_multimodal.py
@@ -4,18 +4,19 @@
 import openai
 import pytest

-from vllm.multimodal.utils import encode_image_base64, fetch_image
+from vllm.multimodal.utils import encode_image_base64
 from vllm.platforms import current_platform

-from ...entrypoints.openai.test_vision import TEST_IMAGE_URLS
+from ...entrypoints.openai.test_vision import TEST_IMAGE_ASSETS
 from ...utils import RemoteOpenAIServer


 @pytest.fixture(scope="session")
-def base64_encoded_image() -> dict[str, str]:
+def base64_encoded_image(local_asset_server) -> dict[str, str]:
    return {
-        image_url: encode_image_base64(fetch_image(image_url))
-        for image_url in TEST_IMAGE_URLS
+        image_asset:
+        encode_image_base64(local_asset_server.get_image_asset(image_asset))
+        for image_asset in TEST_IMAGE_ASSETS
    }


@@ -66,7 +67,7 @@ async def test_basic_vision(model_name: str, base64_encoded_image: dict[str,
        client: openai.AsyncOpenAI = remote_server.get_async_client()

        # Other requests now should be much faster
-        for image_url in TEST_IMAGE_URLS:
+        for image_url in TEST_IMAGE_ASSETS:
            image_base64 = base64_encoded_image[image_url]
            chat_completion_from_base64 = await client.chat.completions\
                .create(

--- a/tests/v1/tpu/test_topk_topp_sampler.py
+++ b/tests/v1/tpu/test_topk_topp_sampler.py
@@ -6,8 +6,12 @@ import pytest
 import torch

 from vllm.platforms import current_platform
-from vllm.v1.sample.ops.topk_topp_sampler import (apply_top_k_top_p,
-                                                  apply_top_k_top_p_tpu)
+from vllm.v1.sample.ops.topk_topp_sampler import apply_top_k_top_p
+
+# isort: off
+from vllm.v1.sample.tpu.sampler import (apply_top_k_top_p as
+                                        apply_top_k_top_p_tpu)
+# isort: on

 if not current_platform.is_tpu():
    pytest.skip("This test needs a TPU.", allow_module_level=True)

--- a/tests/v1/tracing/test_tracing.py
+++ b/tests/v1/tracing/test_tracing.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# ruff: noqa
+# type: ignore
+from __future__ import annotations
+
+import threading
+from collections.abc import Iterable
+from concurrent import futures
+from typing import Callable, Generator, Literal
+
+import grpc
+import pytest
+from opentelemetry.proto.collector.trace.v1.trace_service_pb2 import (
+    ExportTraceServiceResponse)
+from opentelemetry.proto.collector.trace.v1.trace_service_pb2_grpc import (
+    TraceServiceServicer, add_TraceServiceServicer_to_server)
+from opentelemetry.proto.common.v1.common_pb2 import AnyValue, KeyValue
+from opentelemetry.sdk.environment_variables import (
+    OTEL_EXPORTER_OTLP_TRACES_INSECURE)
+
+from vllm import LLM, SamplingParams
+from vllm.tracing import SpanAttributes
+
+FAKE_TRACE_SERVER_ADDRESS = "localhost:4317"
+
+FieldName = Literal['bool_value', 'string_value', 'int_value', 'double_value',
+                    'array_value']
+
+
+def decode_value(value: AnyValue):
+    """Return the Python value held by whichever field of `value` is set.
+
+    Fields are probed with `HasField` in the order bool, string, int,
+    double, array. An array is decoded element by element into a list.
+
+    Raises:
+      ValueError: if none of the probed fields is set
+    """
+    scalar_fields = ("bool_value", "string_value", "int_value",
+                     "double_value")
+    for scalar_field in scalar_fields:
+        if value.HasField(scalar_field):
+            return getattr(value, scalar_field)
+    if value.HasField("array_value"):
+        return [decode_value(item) for item in value.array_value.values]
+    raise ValueError(f"Couldn't decode value: {value}")
+
+
+def decode_attributes(attributes: Iterable[KeyValue]):
+    """Map each attribute's key to its value as decoded by `decode_value`."""
+    decoded = {}
+    for attribute in attributes:
+        decoded[attribute.key] = decode_value(attribute.value)
+    return decoded
+
+
+class FakeTraceService(TraceServiceServicer):
+    """Trace servicer that keeps the latest export request for inspection."""
+
+    def __init__(self):
+        """Start with no stored request and an unset event."""
+        self.request, self.evt = None, threading.Event()
+
+    def Export(self, request, context):
+        """Store `request`, set `evt`, and return a default-constructed
+        ExportTraceServiceResponse. `context` is not used."""
+        self.request = request
+        self.evt.set()
+        reply = ExportTraceServiceResponse()
+        return reply
+
+
+@pytest.fixture
+def trace_service() -> Generator[FakeTraceService, None, None]:
+    """Run a FakeTraceService on a local gRPC server for one test.
+
+    The server listens on FAKE_TRACE_SERVER_ADDRESS with a single worker
+    thread; the service object is yielded so the test can inspect what it
+    received, and the server is stopped once the test is done.
+    """
+    worker_pool = futures.ThreadPoolExecutor(max_workers=1)
+    grpc_server = grpc.server(worker_pool)
+    fake_service = FakeTraceService()
+    add_TraceServiceServicer_to_server(fake_service, grpc_server)
+    grpc_server.add_insecure_port(FAKE_TRACE_SERVER_ADDRESS)
+    grpc_server.start()
+
+    yield fake_service
+
+    # Passing None as the grace argument: stop without a grace period
+    grpc_server.stop(None)
+
+
+def test_traces(
+    monkeypatch: pytest.MonkeyPatch,
+    trace_service: FakeTraceService,
+):
+    """Generate one completion with tracing on and check the exported span.
+
+    Args:
+      monkeypatch: for setting env vars
+      trace_service: fake trace service fixture that receives the export
+    """
+    with monkeypatch.context() as patched_env:
+        patched_env.setenv(OTEL_EXPORTER_OTLP_TRACES_INSECURE, "true")
+        patched_env.setenv("VLLM_USE_V1", "1")
+        sampling_params = SamplingParams(
+            temperature=0.01,
+            top_p=0.1,
+            max_tokens=256,
+        )
+        model = "facebook/opt-125m"
+        llm = LLM(model=model,
+                  otlp_traces_endpoint=FAKE_TRACE_SERVER_ADDRESS,
+                  gpu_memory_utilization=0.3,
+                  disable_log_stats=False)
+        outputs = llm.generate(["This is a short prompt"],
+                               sampling_params=sampling_params)
+        print(f"test_traces outputs is : {outputs}")
+
+        # Block until the fake service has seen an export, or give up
+        timeout = 10
+        trace_received = trace_service.evt.wait(timeout)
+        if not trace_received:
+            raise TimeoutError(
+                f"The fake trace service didn't receive a trace within "
+                f"the {timeout} seconds timeout")
+
+        # Exactly one span is expected, nested one level at a time
+        resource_spans = trace_service.request.resource_spans
+        assert len(resource_spans) == 1, (f"Expected 1 resource span, "
+                                          f"but got {len(resource_spans)}")
+        scope_spans = resource_spans[0].scope_spans
+        assert len(scope_spans) == 1, (f"Expected 1 scope span, "
+                                       f"but got {len(scope_spans)}")
+        spans = scope_spans[0].spans
+        assert len(spans) == 1, (f"Expected 1 span, "
+                                 f"but got {len(spans)}")
+
+        attributes = decode_attributes(spans[0].attributes)
+        first_output = outputs[0]
+        completion_tokens = sum(len(o.token_ids) for o in first_output.outputs)
+        # assert attributes.get(SpanAttributes.GEN_AI_RESPONSE_MODEL) == model
+        expected_attributes = [
+            (SpanAttributes.GEN_AI_REQUEST_ID, first_output.request_id),
+            (SpanAttributes.GEN_AI_REQUEST_TEMPERATURE,
+             sampling_params.temperature),
+            (SpanAttributes.GEN_AI_REQUEST_TOP_P, sampling_params.top_p),
+            (SpanAttributes.GEN_AI_REQUEST_MAX_TOKENS,
+             sampling_params.max_tokens),
+            (SpanAttributes.GEN_AI_REQUEST_N, sampling_params.n),
+            (SpanAttributes.GEN_AI_USAGE_PROMPT_TOKENS,
+             len(first_output.prompt_token_ids)),
+            (SpanAttributes.GEN_AI_USAGE_COMPLETION_TOKENS,
+             completion_tokens),
+        ]
+        for attribute_name, expected_value in expected_attributes:
+            assert attributes.get(attribute_name) == expected_value
+
+        # Each latency attribute must be present and strictly positive
+        latency_attributes = (
+            SpanAttributes.GEN_AI_LATENCY_TIME_IN_QUEUE,
+            SpanAttributes.GEN_AI_LATENCY_TIME_TO_FIRST_TOKEN,
+            SpanAttributes.GEN_AI_LATENCY_E2E,
+        )
+        for latency_attribute in latency_attributes:
+            assert attributes.get(latency_attribute) > 0
--- a/tests/v1/worker/test_gpu_model_runner.py
+++ b/tests/v1/worker/test_gpu_model_runner.py
@@ -702,7 +702,7 @@ def test_hybrid_attention_mamba_tensor_shapes(monkeypatch):
    KVCacheTensors for the attention and mamba layers
    (via _reshape_kv_cache_tensors function). This test verifies
    that the views are compatible: writing a mamba block
-    will not corrupt an attention block and vice-versa
+    will not corrupt an attention block and vice versa
    '''

    current_platform.seed_everything(42)

--- a/tools/install_deepgemm.sh
+++ b/tools/install_deepgemm.sh
@@ -6,7 +6,7 @@ set -e

 # Default values
 DEEPGEMM_GIT_REPO="https://github.com/deepseek-ai/DeepGEMM.git"
-DEEPGEMM_GIT_REF="7b6b5563b9d4c1ae07ffbce7f78ad3ac9204827c"
+DEEPGEMM_GIT_REF="ea9c5d9270226c5dd7a577c212e9ea385f6ef048"

 # Parse command line arguments
 while [[ $# -gt 0 ]]; do
@@ -105,4 +105,4 @@ fi

 popd

-echo "✅ DeepGEMM installation completed successfully"
\ No newline at end of file
+echo "✅ DeepGEMM installation completed successfully"
--- a/tools/mypy.sh
+++ b/tools/mypy.sh
@@ -29,7 +29,7 @@ run_mypy vllm/engine
 run_mypy vllm/executor
 run_mypy vllm/inputs
 run_mypy vllm/lora
-run_mypy vllm/model_executor
+run_mypy --exclude 'vllm/model_executor/layers/fla/ops' vllm/model_executor
 run_mypy vllm/plugins
 run_mypy vllm/worker
 run_mypy vllm/v1
--- a/tools/profiler/visualize_layerwise_profile.py
+++ b/tools/profiler/visualize_layerwise_profile.py
@@ -119,7 +119,7 @@ def attempt_to_make_names_unique(entries_and_traces):
             if not all_the_same(trace_eles)), None)

        if first_trace_difference is None:
-            # can't create a unique name, leave them names as the
+            # can't create a unique name, leave the names as they
            # are they will get aggregated by the pivot_table call
            continue