update setup.py

cb563bb5 · zhuwenwen · 0c1fa562 · cb563bb5 · cb563bb5
Commit cb563bb5 authored May 23, 2025 by zhuwenwen
Hide whitespace changes
Inline Side-by-side

Showing with 42 additions and 23 deletions

setup.py setup.py +34 -15

vllm/envs.py vllm/envs.py +8 -8

No files found.
--- a/setup.py
+++ b/setup.py
@@ -25,6 +25,10 @@ pwd = os.path.dirname(os.path.abspath(__file__))
 add_git_version = False
 if int(os.environ.get('ADD_GIT_VERSION', '0')) == 1:
    add_git_version = True
+    
+skip_vllm_build = False
+if int(os.environ.get('SKIP_VLLM_BUILD', '0')) == 1:
+    skip_vllm_build = True

 def load_module_from_path(module_name, path):
    spec = importlib.util.spec_from_file_location(module_name, path)
@@ -475,7 +479,8 @@ def _is_xpu() -> bool:


 def _build_custom_ops() -> bool:
-    return _is_cuda() or _is_hip() or _is_cpu()
+    # Always return a real bool (never None): SKIP_VLLM_BUILD=1 disables ops.
+    return not skip_vllm_build and (_is_cuda() or _is_hip() or _is_cpu())


 def get_rocm_version():
@@ -717,8 +722,9 @@ def get_requirements() -> list[str]:

 ext_modules = []

-if _is_cuda() or _is_hip():
-    ext_modules.append(CMakeExtension(name="vllm._moe_C"))
+if not skip_vllm_build:
+    if _is_cuda() or _is_hip():
+        ext_modules.append(CMakeExtension(name="vllm._moe_C"))

 # if _is_hip():
 #     ext_modules.append(CMakeExtension(name="vllm._rocm_C"))
@@ -738,18 +744,31 @@ if _is_cuda():
 if _build_custom_ops():
    ext_modules.append(CMakeExtension(name="vllm._C"))

-package_data = {
-    "vllm": [
-        "py.typed",
-        "model_executor/layers/fused_moe/configs/*.json",
-        "model_executor/layers/quantization/utils/configs/*.json",
-        "perf/*.py",
-        "attention/backends/configs/*.json",
-        "model_executor/layers/quantization/configs/awq/*.json"
-    ]
-}
-
-if _no_device():
+if skip_vllm_build:
+    package_data = {
+        "vllm": [
+            "py.typed",
+            "model_executor/layers/fused_moe/configs/*.json",
+            "model_executor/layers/quantization/utils/configs/*.json",
+            "perf/*.py",
+            "attention/backends/configs/*.json",
+            "model_executor/layers/quantization/configs/awq/*.json",
+            "/opt/dtk/*.so",
+        ]
+    }
+else:
+    package_data = {
+        "vllm": [
+            "py.typed",
+            "model_executor/layers/fused_moe/configs/*.json",
+            "model_executor/layers/quantization/utils/configs/*.json",
+            "perf/*.py",
+            "attention/backends/configs/*.json",
+            "model_executor/layers/quantization/configs/awq/*.json",
+        ]
+    }
+
+if _no_device() or skip_vllm_build:
    ext_modules = []

 if not ext_modules:

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -639,14 +639,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
    # If set, vLLM will disable the MLA attention optimizations.
    "VLLM_MLA_DISABLE":
    lambda: bool(int(os.getenv("VLLM_MLA_DISABLE", "0"))),
-    
-    # If set, vLLM will use optimized MLA attention optimizations.
-    "VLLM_USE_TRITON_OPT_MLA":
-    lambda: bool(int(os.getenv("VLLM_USE_TRITON_OPT_MLA", "0"))),
-    
-    # If set, vLLM will use FLASH MLA attention optimizations.
-    "VLLM_USE_FLASH_MLA":
-    lambda: bool(int(os.getenv("VLLM_USE_FLASH_MLA", "1"))),

    # If set, vLLM will use the Triton implementation of moe_align_block_size,
    # i.e. moe_align_block_size_triton in fused_moe.py.
@@ -770,6 +762,14 @@ environment_variables: dict[str, Callable[[], Any]] = {
    lambda: (os.environ.get("VLLM_USE_TRITON_PREFIX_FLASH_ATTN", "False").lower() in
             ("true", "1")),
    
+    # If set, vLLM will use the optimized MLA attention implementation.
+    "VLLM_USE_TRITON_OPT_MLA":
+    lambda: bool(int(os.getenv("VLLM_USE_TRITON_OPT_MLA", "0"))),
+    
+    # Enabled by default; set to 0 to disable FLASH MLA attention optimizations.
+    "VLLM_USE_FLASH_MLA":
+    lambda: bool(int(os.getenv("VLLM_USE_FLASH_MLA", "1"))),
+    
    # flag to control vllm to use optimized kernels
    "VLLM_USE_OPT_OP":
    lambda: (os.environ.get("VLLM_USE_OPT_OP", "True").lower() in