update setup.py

cb563bb5 · zhuwenwen · 0c1fa562 · cb563bb5 · cb563bb5
Commit cb563bb5 authored May 23, 2025 by zhuwenwen
Show whitespace changes
Inline Side-by-side

Showing with 42 additions and 23 deletions

setup.py setup.py +34 -15

vllm/envs.py vllm/envs.py +8 -8

No files found.
--- a/setup.py
+++ b/setup.py
@@ -26,6 +26,10 @@ add_git_version = False
 if int(os.environ.get('ADD_GIT_VERSION', '0')) == 1:
    add_git_version = True
+skip_vllm_build = False
+if int(os.environ.get('SKIP_VLLM_BUILD', '0')) == 1:
+    skip_vllm_build = True
 def load_module_from_path(module_name, path):
    spec = importlib.util.spec_from_file_location(module_name, path)
    module = importlib.util.module_from_spec(spec)
@@ -475,6 +479,7 @@ def _is_xpu() -> bool:
 def _build_custom_ops() -> bool:
+    if not skip_vllm_build:
        return _is_cuda() or _is_hip() or _is_cpu()
@@ -717,7 +722,8 @@ def get_requirements() -> list[str]:
 ext_modules = []
-if _is_cuda() or _is_hip():
+if not skip_vllm_build:
+    if _is_cuda() or _is_hip():
        ext_modules.append(CMakeExtension(name="vllm._moe_C"))
 # if _is_hip():
@@ -738,18 +744,31 @@ if _is_cuda():
 if _build_custom_ops():
    ext_modules.append(CMakeExtension(name="vllm._C"))
-package_data = {
+if skip_vllm_build:
+    package_data = {
        "vllm": [
            "py.typed",
            "model_executor/layers/fused_moe/configs/*.json",
            "model_executor/layers/quantization/utils/configs/*.json",
            "perf/*.py",
            "attention/backends/configs/*.json",
-        "model_executor/layers/quantization/configs/awq/*.json"
+            "model_executor/layers/quantization/configs/awq/*.json",
+            "/opt/dtk/*.so",
        ]
-}
+    }
+else:
+    package_data = {
+        "vllm": [
+            "py.typed",
+            "model_executor/layers/fused_moe/configs/*.json",
+            "model_executor/layers/quantization/utils/configs/*.json",
+            "perf/*.py",
+            "attention/backends/configs/*.json",
+            "model_executor/layers/quantization/configs/awq/*.json",
+        ]
+    }
-if _no_device():
+if _no_device() or skip_vllm_build:
    ext_modules = []
 if not ext_modules:

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -640,14 +640,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_MLA_DISABLE":
    lambda: bool(int(os.getenv("VLLM_MLA_DISABLE", "0"))),
-    # If set, vLLM will use optimized MLA attention optimizations.
-    "VLLM_USE_TRITON_OPT_MLA":
-    lambda: bool(int(os.getenv("VLLM_USE_TRITON_OPT_MLA", "0"))),
-    # If set, vLLM will use FLASH MLA attention optimizations.
-    "VLLM_USE_FLASH_MLA":
-    lambda: bool(int(os.getenv("VLLM_USE_FLASH_MLA", "1"))),
    # If set, vLLM will use the Triton implementation of moe_align_block_size,
    # i.e. moe_align_block_size_triton in fused_moe.py.
    "VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON":
@@ -770,6 +762,14 @@ environment_variables: dict[str, Callable[[], Any]] = {
    lambda: (os.environ.get("VLLM_USE_TRITON_PREFIX_FLASH_ATTN", "False").lower() in
             ("true", "1")),
+    # If set to 1, vLLM will use optimized MLA attention (default: 0).
+    "VLLM_USE_TRITON_OPT_MLA":
+    lambda: bool(int(os.getenv("VLLM_USE_TRITON_OPT_MLA", "0"))),
+    # If set to 1 (the default), vLLM will use FLASH MLA attention.
+    "VLLM_USE_FLASH_MLA":
+    lambda: bool(int(os.getenv("VLLM_USE_FLASH_MLA", "1"))),
    # flag to control vllm to use optimized kernels
    "VLLM_USE_OPT_OP":
    lambda: (os.environ.get("VLLM_USE_OPT_OP", "True").lower() in