Commit cb563bb5 authored by zhuwenwen's avatar zhuwenwen
Browse files

update setup.py

parent 0c1fa562
......@@ -25,6 +25,10 @@ pwd = os.path.dirname(os.path.abspath(__file__))
# Build-time switches taken from the environment. Each flag is True only when
# its variable parses to the integer 1; an unset variable is treated as '0'.
add_git_version = int(os.environ.get('ADD_GIT_VERSION', '0')) == 1
skip_vllm_build = int(os.environ.get('SKIP_VLLM_BUILD', '0')) == 1
def load_module_from_path(module_name, path):
spec = importlib.util.spec_from_file_location(module_name, path)
......@@ -475,7 +479,8 @@ def _is_xpu() -> bool:
def _build_custom_ops() -> bool:
    """Report whether the custom-op extension should be built.

    Returns:
        ``False`` when ``skip_vllm_build`` is set; otherwise the result of
        ``_is_cuda() or _is_hip() or _is_cpu()``.
    """
    # Return an explicit False here: guarding only the return statement let
    # the function fall off the end and yield None when the build is skipped,
    # which contradicts the declared ``bool`` return type.
    if skip_vllm_build:
        return False
    return _is_cuda() or _is_hip() or _is_cpu()
def get_rocm_version():
......@@ -717,8 +722,9 @@ def get_requirements() -> list[str]:
ext_modules = []

# Register the MoE extension only when the native build is not skipped and
# the target is CUDA or HIP. The skip flag is tested first, so the device
# probes are not called at all when the build is skipped.
if not skip_vllm_build and (_is_cuda() or _is_hip()):
    ext_modules.append(CMakeExtension(name="vllm._moe_C"))
# if _is_hip():
# ext_modules.append(CMakeExtension(name="vllm._rocm_C"))
......@@ -738,18 +744,31 @@ if _is_cuda():
if _build_custom_ops():
ext_modules.append(CMakeExtension(name="vllm._C"))
# File patterns registered as package data for the ``vllm`` package. The list
# is defined once so the skip-build and normal configurations cannot drift.
package_data = {
    "vllm": [
        "py.typed",
        "model_executor/layers/fused_moe/configs/*.json",
        "model_executor/layers/quantization/utils/configs/*.json",
        "perf/*.py",
        "attention/backends/configs/*.json",
        "model_executor/layers/quantization/configs/awq/*.json",
    ]
}

if skip_vllm_build:
    # NOTE(review): every other pattern here is relative to the package,
    # while this one is an absolute filesystem path. Confirm that the
    # packaging backend resolves it as intended.
    package_data["vllm"].append("/opt/dtk/*.so")

# Build no extension modules when no device is targeted or when the native
# build is explicitly skipped.
if _no_device() or skip_vllm_build:
    ext_modules = []
if not ext_modules:
......
......@@ -639,14 +639,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
# If set, vLLM will disable the MLA attention optimizations.
"VLLM_MLA_DISABLE":
lambda: bool(int(os.getenv("VLLM_MLA_DISABLE", "0"))),
# If set, vLLM will use optimized MLA attention optimizations.
"VLLM_USE_TRITON_OPT_MLA":
lambda: bool(int(os.getenv("VLLM_USE_TRITON_OPT_MLA", "0"))),
# If set, vLLM will use FLASH MLA attention optimizations.
"VLLM_USE_FLASH_MLA":
lambda: bool(int(os.getenv("VLLM_USE_FLASH_MLA", "1"))),
# If set, vLLM will use the Triton implementation of moe_align_block_size,
# i.e. moe_align_block_size_triton in fused_moe.py.
......@@ -770,6 +762,14 @@ environment_variables: dict[str, Callable[[], Any]] = {
lambda: (os.environ.get("VLLM_USE_TRITON_PREFIX_FLASH_ATTN", "False").lower() in
("true", "1")),
# If set to 1, vLLM will use the optimized MLA attention path (default: 0, disabled).
"VLLM_USE_TRITON_OPT_MLA":
lambda: bool(int(os.getenv("VLLM_USE_TRITON_OPT_MLA", "0"))),
# If set to 1 (the default), vLLM will use the FLASH MLA attention optimizations; set to 0 to disable them.
"VLLM_USE_FLASH_MLA":
lambda: bool(int(os.getenv("VLLM_USE_FLASH_MLA", "1"))),
# Flag controlling whether vLLM uses optimized kernels (default value: "True").
"VLLM_USE_OPT_OP":
lambda: (os.environ.get("VLLM_USE_OPT_OP", "True").lower() in
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment