创建pcp分支

d1fd831b · 王敏 · 2a79e7d5 · d1fd831b · d1fd831b
Commit d1fd831b authored Apr 01, 2026 by 王敏
Show whitespace changes
Inline Side-by-side

Showing with 15 additions and 2 deletions

vllm/config/vllm.py vllm/config/vllm.py +2 -1

vllm/envs.py vllm/envs.py +13 -1

No files found.
--- a/vllm/config/vllm.py
+++ b/vllm/config/vllm.py
@@ -1186,7 +1186,8 @@ class VllmConfig:
            if (
                self.parallel_config.tensor_parallel_size > 1
-                and self.compilation_config.pass_config.enable_sp
+                and (self.compilation_config.pass_config.enable_sp
+                     or envs.VLLM_MLA_CP)
            ):
                cudagraph_capture_sizes = self.update_sizes_for_sequence_parallelism(
                    cudagraph_capture_sizes

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -323,6 +323,10 @@ if TYPE_CHECKING:
    USE_LIGHTOP_PER_TOKEN_GROUP_QUANT_FP8: bool = False
    USE_LIGHTOP_TOPK: bool = False
    USE_LIGHTOP_CONVERT_REQ_INDEX_TO_GLOBAL_INDEX: bool = False
+    VLLM_MLA_CP: bool = False
+    VLLM_MLA_CPLB: bool = False
 def get_default_cache_root():
    return os.getenv(
        "XDG_CACHE_HOME",
@@ -2003,6 +2007,14 @@ environment_variables: dict[str, Callable[[], Any]] = {
        lambda: (os.environ.get("USE_LIGHTOP_CONVERT_REQ_INDEX_TO_GLOBAL_INDEX", "False").lower() in
                    ("true", "1")), 
+    # If set to 1/True, enable MLA context parallelism.
+    "VLLM_MLA_CP":
+        lambda: (os.environ.get("VLLM_MLA_CP", "False").lower() in
+                    ("true", "1")),
+    "VLLM_MLA_CPLB":
+        lambda: (os.environ.get("VLLM_MLA_CPLB", "False").lower() in
+                    ("true", "1")),         
 }
 # --8<-- [end:env-vars-definition]