Merge branch 'v0.9.2-dev' of http://10.16.6.30/dcutoolkit/deeplearing/vllm into v0.9.2-dev

903a588f · zhuwenwen · 64e307c7 · ba0cd35c · 903a588f · 903a588f
Commit 903a588f authored Dec 01, 2025 by zhuwenwen
Showing 3 changed files with 13 additions and 2 deletions

vllm/envs.py +5 -0

vllm/v1/attention/backends/mla/flashmla.py +2 -2

vllm/v1/engine/core.py +6 -0

No files found.
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -189,6 +189,7 @@ if TYPE_CHECKING:
    VLLM_USE_FUSE_SILU_AND_MUL: bool = False
    VLLM_USE_OPT_RESHAPE_AND_CACHE: bool = False
    VLLM_USE_TOPK_RENORM: bool = False
+    VLLM_PP_DEBUG: bool = False
    VLLM_USE_LIGHTOP_RMS_ROPE_CONCAT: bool = False
 def get_default_cache_root():
@@ -1238,6 +1239,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
        lambda:
        (os.environ.get("VLLM_USE_TOPK_RENORM", "True").lower() in
                ("true", "1")),
+    "VLLM_PP_DEBUG":
+        lambda:
+        (os.environ.get("VLLM_PP_DEBUG", "False").lower() in
+         ("true", "1")),
    # vllm will use fused rmsnorm + contiguous + rope(for dpsk-v3) + concat_and_cache_mla
    "VLLM_USE_LIGHTOP_RMS_ROPE_CONCAT":

--- a/vllm/v1/attention/backends/mla/flashmla.py
+++ b/vllm/v1/attention/backends/mla/flashmla.py
@@ -168,7 +168,7 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
        assert kv_c_and_k_pe_cache.numel() > 0
        assert attn_metadata.decode is not None
-        if not envs.VLLM_USE_CAT_MLA:
+        if not envs.VLLM_USE_CAT_MLA or kv_cache_dtype == "fp8_e4m3":
            if envs.VLLM_USE_OPT_CAT:
                if q_nope.shape[0] < 1024:
                    from vllm.v1.attention.backends.mla.test_concat import concat_helper_decode
@@ -181,7 +181,7 @@ class FlashMLAImpl(MLACommonImpl[FlashMLAMetadata]):
                q = torch.cat([q_nope, q_pe], dim=-1)\
                    .unsqueeze(1) # Add seqlen dim of 1 (decode)
-        if not envs.VLLM_USE_CAT_MLA:
+        if not envs.VLLM_USE_CAT_MLA or kv_cache_dtype == "fp8_e4m3":
            o, _ = flash_mla_with_kvcache(
                q=q,
                k_cache=kv_c_and_k_pe_cache.unsqueeze(-2),  # Add head dim of 1

--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -275,6 +275,12 @@ class EngineCore:
                pass
            scheduler_output = self.scheduler.schedule()
            if scheduler_output.total_num_scheduled_tokens > 0:
+                if envs.VLLM_PP_DEBUG:
+                    import sys,os
+                    num_run_reqs = len(scheduler_output.scheduled_new_reqs) + scheduler_output.scheduled_cached_reqs.num_reqs
+                    sys.stderr.write(f"[pid- {os.getpid()}]running requests in micro batch is:{num_run_reqs}, "
+                                     f"total_num_scheduled_tokens is {scheduler_output.total_num_scheduled_tokens}\n")
+                    sys.stderr.flush()
                future = self.model_executor.execute_model(scheduler_output)
                self.batch_queue.put_nowait(
                    (future, scheduler_output))  # type: ignore