Commit cb563bb5 authored by zhuwenwen
Browse files

update setup.py

parent 0c1fa562
...@@ -25,6 +25,10 @@ pwd = os.path.dirname(os.path.abspath(__file__)) ...@@ -25,6 +25,10 @@ pwd = os.path.dirname(os.path.abspath(__file__))
add_git_version = False add_git_version = False
if int(os.environ.get('ADD_GIT_VERSION', '0')) == 1: if int(os.environ.get('ADD_GIT_VERSION', '0')) == 1:
add_git_version = True add_git_version = True
skip_vllm_build = False
if int(os.environ.get('SKIP_VLLM_BUILD', '0')) == 1:
skip_vllm_build = True
def load_module_from_path(module_name, path): def load_module_from_path(module_name, path):
spec = importlib.util.spec_from_file_location(module_name, path) spec = importlib.util.spec_from_file_location(module_name, path)
...@@ -475,7 +479,8 @@ def _is_xpu() -> bool: ...@@ -475,7 +479,8 @@ def _is_xpu() -> bool:
def _build_custom_ops() -> bool: def _build_custom_ops() -> bool:
return _is_cuda() or _is_hip() or _is_cpu() if not skip_vllm_build:
return _is_cuda() or _is_hip() or _is_cpu()
def get_rocm_version(): def get_rocm_version():
...@@ -717,8 +722,9 @@ def get_requirements() -> list[str]: ...@@ -717,8 +722,9 @@ def get_requirements() -> list[str]:
ext_modules = [] ext_modules = []
if _is_cuda() or _is_hip(): if not skip_vllm_build:
ext_modules.append(CMakeExtension(name="vllm._moe_C")) if _is_cuda() or _is_hip():
ext_modules.append(CMakeExtension(name="vllm._moe_C"))
# if _is_hip(): # if _is_hip():
# ext_modules.append(CMakeExtension(name="vllm._rocm_C")) # ext_modules.append(CMakeExtension(name="vllm._rocm_C"))
...@@ -738,18 +744,31 @@ if _is_cuda(): ...@@ -738,18 +744,31 @@ if _is_cuda():
if _build_custom_ops(): if _build_custom_ops():
ext_modules.append(CMakeExtension(name="vllm._C")) ext_modules.append(CMakeExtension(name="vllm._C"))
package_data = { if skip_vllm_build:
"vllm": [ package_data = {
"py.typed", "vllm": [
"model_executor/layers/fused_moe/configs/*.json", "py.typed",
"model_executor/layers/quantization/utils/configs/*.json", "model_executor/layers/fused_moe/configs/*.json",
"perf/*.py", "model_executor/layers/quantization/utils/configs/*.json",
"attention/backends/configs/*.json", "perf/*.py",
"model_executor/layers/quantization/configs/awq/*.json" "attention/backends/configs/*.json",
] "model_executor/layers/quantization/configs/awq/*.json",
} "/opt/dtk/*.so",
]
if _no_device(): }
else:
package_data = {
"vllm": [
"py.typed",
"model_executor/layers/fused_moe/configs/*.json",
"model_executor/layers/quantization/utils/configs/*.json",
"perf/*.py",
"attention/backends/configs/*.json",
"model_executor/layers/quantization/configs/awq/*.json",
]
}
if _no_device() or skip_vllm_build:
ext_modules = [] ext_modules = []
if not ext_modules: if not ext_modules:
......
...@@ -639,14 +639,6 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -639,14 +639,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
# If set, vLLM will disable the MLA attention optimizations. # If set, vLLM will disable the MLA attention optimizations.
"VLLM_MLA_DISABLE": "VLLM_MLA_DISABLE":
lambda: bool(int(os.getenv("VLLM_MLA_DISABLE", "0"))), lambda: bool(int(os.getenv("VLLM_MLA_DISABLE", "0"))),
# If set, vLLM will use optimized MLA attention optimizations.
"VLLM_USE_TRITON_OPT_MLA":
lambda: bool(int(os.getenv("VLLM_USE_TRITON_OPT_MLA", "0"))),
# If set, vLLM will use FLASH MLA attention optimizations.
"VLLM_USE_FLASH_MLA":
lambda: bool(int(os.getenv("VLLM_USE_FLASH_MLA", "1"))),
# If set, vLLM will use the Triton implementation of moe_align_block_size, # If set, vLLM will use the Triton implementation of moe_align_block_size,
# i.e. moe_align_block_size_triton in fused_moe.py. # i.e. moe_align_block_size_triton in fused_moe.py.
...@@ -770,6 +762,14 @@ environment_variables: dict[str, Callable[[], Any]] = { ...@@ -770,6 +762,14 @@ environment_variables: dict[str, Callable[[], Any]] = {
lambda: (os.environ.get("VLLM_USE_TRITON_PREFIX_FLASH_ATTN", "False").lower() in lambda: (os.environ.get("VLLM_USE_TRITON_PREFIX_FLASH_ATTN", "False").lower() in
("true", "1")), ("true", "1")),
# If set, vLLM will use optimized MLA attention optimizations.
"VLLM_USE_TRITON_OPT_MLA":
lambda: bool(int(os.getenv("VLLM_USE_TRITON_OPT_MLA", "0"))),
# If set, vLLM will use FLASH MLA attention optimizations.
"VLLM_USE_FLASH_MLA":
lambda: bool(int(os.getenv("VLLM_USE_FLASH_MLA", "1"))),
# flag to control vllm to use optimized kernels # flag to control vllm to use optimized kernels
"VLLM_USE_OPT_OP": "VLLM_USE_OPT_OP":
lambda: (os.environ.get("VLLM_USE_OPT_OP", "True").lower() in lambda: (os.environ.get("VLLM_USE_OPT_OP", "True").lower() in
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment