Commit cb563bb5 authored by zhuwenwen
Browse files

update setup.py

parent 0c1fa562
...@@ -26,6 +26,10 @@ add_git_version = False ...@@ -26,6 +26,10 @@ add_git_version = False
if int(os.environ.get('ADD_GIT_VERSION', '0')) == 1: if int(os.environ.get('ADD_GIT_VERSION', '0')) == 1:
add_git_version = True add_git_version = True
skip_vllm_build = False
if int(os.environ.get('SKIP_VLLM_BUILD', '0')) == 1:
skip_vllm_build = True
def load_module_from_path(module_name, path): def load_module_from_path(module_name, path):
spec = importlib.util.spec_from_file_location(module_name, path) spec = importlib.util.spec_from_file_location(module_name, path)
module = importlib.util.module_from_spec(spec) module = importlib.util.module_from_spec(spec)
...@@ -475,6 +479,7 @@ def _is_xpu() -> bool: ...@@ -475,6 +479,7 @@ def _is_xpu() -> bool:
def _build_custom_ops() -> bool: def _build_custom_ops() -> bool:
if not skip_vllm_build:
return _is_cuda() or _is_hip() or _is_cpu() return _is_cuda() or _is_hip() or _is_cpu()
...@@ -717,7 +722,8 @@ def get_requirements() -> list[str]: ...@@ -717,7 +722,8 @@ def get_requirements() -> list[str]:
ext_modules = [] ext_modules = []
if _is_cuda() or _is_hip(): if not skip_vllm_build:
if _is_cuda() or _is_hip():
ext_modules.append(CMakeExtension(name="vllm._moe_C")) ext_modules.append(CMakeExtension(name="vllm._moe_C"))
# if _is_hip(): # if _is_hip():
...@@ -738,18 +744,31 @@ if _is_cuda(): ...@@ -738,18 +744,31 @@ if _is_cuda():
if _build_custom_ops(): if _build_custom_ops():
ext_modules.append(CMakeExtension(name="vllm._C")) ext_modules.append(CMakeExtension(name="vllm._C"))
package_data = { if skip_vllm_build:
package_data = {
"vllm": [ "vllm": [
"py.typed", "py.typed",
"model_executor/layers/fused_moe/configs/*.json", "model_executor/layers/fused_moe/configs/*.json",
"model_executor/layers/quantization/utils/configs/*.json", "model_executor/layers/quantization/utils/configs/*.json",
"perf/*.py", "perf/*.py",
"attention/backends/configs/*.json", "attention/backends/configs/*.json",
"model_executor/layers/quantization/configs/awq/*.json" "model_executor/layers/quantization/configs/awq/*.json",
"/opt/dtk/*.so",
] ]
} }
else:
package_data = {
"vllm": [
"py.typed",
"model_executor/layers/fused_moe/configs/*.json",
"model_executor/layers/quantization/utils/configs/*.json",
"perf/*.py",
"attention/backends/configs/*.json",
"model_executor/layers/quantization/configs/awq/*.json",
]
}
if _no_device(): if _no_device() or skip_vllm_build:
ext_modules = [] ext_modules = []
if not ext_modules: if not ext_modules:
......
...@@ -640,14 +640,6 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -640,14 +640,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_MLA_DISABLE": "VLLM_MLA_DISABLE":
lambda: bool(int(os.getenv("VLLM_MLA_DISABLE", "0"))), lambda: bool(int(os.getenv("VLLM_MLA_DISABLE", "0"))),
# If set, vLLM will use optimized MLA attention optimizations.
"VLLM_USE_TRITON_OPT_MLA":
lambda: bool(int(os.getenv("VLLM_USE_TRITON_OPT_MLA", "0"))),
# If set, vLLM will use FLASH MLA attention optimizations.
"VLLM_USE_FLASH_MLA":
lambda: bool(int(os.getenv("VLLM_USE_FLASH_MLA", "1"))),
# If set, vLLM will use the Triton implementation of moe_align_block_size, # If set, vLLM will use the Triton implementation of moe_align_block_size,
# i.e. moe_align_block_size_triton in fused_moe.py. # i.e. moe_align_block_size_triton in fused_moe.py.
"VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON": "VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON":
...@@ -770,6 +762,14 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -770,6 +762,14 @@ environment_variables: dict[str, Callable[[], Any]] = {
lambda: (os.environ.get("VLLM_USE_TRITON_PREFIX_FLASH_ATTN", "False").lower() in lambda: (os.environ.get("VLLM_USE_TRITON_PREFIX_FLASH_ATTN", "False").lower() in
("true", "1")), ("true", "1")),
# If set, vLLM will use optimized MLA attention optimizations.
"VLLM_USE_TRITON_OPT_MLA":
lambda: bool(int(os.getenv("VLLM_USE_TRITON_OPT_MLA", "0"))),
# If set, vLLM will use FLASH MLA attention optimizations.
"VLLM_USE_FLASH_MLA":
lambda: bool(int(os.getenv("VLLM_USE_FLASH_MLA", "1"))),
# flag to control vllm to use optimized kernels # flag to control vllm to use optimized kernels
"VLLM_USE_OPT_OP": "VLLM_USE_OPT_OP":
lambda: (os.environ.get("VLLM_USE_OPT_OP", "True").lower() in lambda: (os.environ.get("VLLM_USE_OPT_OP", "True").lower() in
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment