Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
cb563bb5
Commit
cb563bb5
authored
May 23, 2025
by
zhuwenwen
Browse files
update setup.py
parent
0c1fa562
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
42 additions
and
23 deletions
+42
-23
setup.py
setup.py
+34
-15
vllm/envs.py
vllm/envs.py
+8
-8
No files found.
setup.py
View file @
cb563bb5
...
@@ -26,6 +26,10 @@ add_git_version = False
...
@@ -26,6 +26,10 @@ add_git_version = False
if
int
(
os
.
environ
.
get
(
'ADD_GIT_VERSION'
,
'0'
))
==
1
:
if
int
(
os
.
environ
.
get
(
'ADD_GIT_VERSION'
,
'0'
))
==
1
:
add_git_version
=
True
add_git_version
=
True
skip_vllm_build
=
False
if
int
(
os
.
environ
.
get
(
'SKIP_VLLM_BUILD'
,
'0'
))
==
1
:
skip_vllm_build
=
True
def
load_module_from_path
(
module_name
,
path
):
def
load_module_from_path
(
module_name
,
path
):
spec
=
importlib
.
util
.
spec_from_file_location
(
module_name
,
path
)
spec
=
importlib
.
util
.
spec_from_file_location
(
module_name
,
path
)
module
=
importlib
.
util
.
module_from_spec
(
spec
)
module
=
importlib
.
util
.
module_from_spec
(
spec
)
...
@@ -475,6 +479,7 @@ def _is_xpu() -> bool:
...
@@ -475,6 +479,7 @@ def _is_xpu() -> bool:
def
_build_custom_ops
()
->
bool
:
def
_build_custom_ops
()
->
bool
:
if
not
skip_vllm_build
:
return
_is_cuda
()
or
_is_hip
()
or
_is_cpu
()
return
_is_cuda
()
or
_is_hip
()
or
_is_cpu
()
...
@@ -717,7 +722,8 @@ def get_requirements() -> list[str]:
...
@@ -717,7 +722,8 @@ def get_requirements() -> list[str]:
ext_modules
=
[]
ext_modules
=
[]
if
_is_cuda
()
or
_is_hip
():
if
not
skip_vllm_build
:
if
_is_cuda
()
or
_is_hip
():
ext_modules
.
append
(
CMakeExtension
(
name
=
"vllm._moe_C"
))
ext_modules
.
append
(
CMakeExtension
(
name
=
"vllm._moe_C"
))
# if _is_hip():
# if _is_hip():
...
@@ -738,18 +744,31 @@ if _is_cuda():
...
@@ -738,18 +744,31 @@ if _is_cuda():
if
_build_custom_ops
():
if
_build_custom_ops
():
ext_modules
.
append
(
CMakeExtension
(
name
=
"vllm._C"
))
ext_modules
.
append
(
CMakeExtension
(
name
=
"vllm._C"
))
package_data
=
{
if
skip_vllm_build
:
package_data
=
{
"vllm"
:
[
"vllm"
:
[
"py.typed"
,
"py.typed"
,
"model_executor/layers/fused_moe/configs/*.json"
,
"model_executor/layers/fused_moe/configs/*.json"
,
"model_executor/layers/quantization/utils/configs/*.json"
,
"model_executor/layers/quantization/utils/configs/*.json"
,
"perf/*.py"
,
"perf/*.py"
,
"attention/backends/configs/*.json"
,
"attention/backends/configs/*.json"
,
"model_executor/layers/quantization/configs/awq/*.json"
"model_executor/layers/quantization/configs/awq/*.json"
,
"/opt/dtk/*.so"
,
]
]
}
}
else
:
package_data
=
{
"vllm"
:
[
"py.typed"
,
"model_executor/layers/fused_moe/configs/*.json"
,
"model_executor/layers/quantization/utils/configs/*.json"
,
"perf/*.py"
,
"attention/backends/configs/*.json"
,
"model_executor/layers/quantization/configs/awq/*.json"
,
]
}
if
_no_device
():
if
_no_device
()
or
skip_vllm_build
:
ext_modules
=
[]
ext_modules
=
[]
if
not
ext_modules
:
if
not
ext_modules
:
...
...
vllm/envs.py
View file @
cb563bb5
...
@@ -640,14 +640,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
...
@@ -640,14 +640,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_MLA_DISABLE"
:
"VLLM_MLA_DISABLE"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_MLA_DISABLE"
,
"0"
))),
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_MLA_DISABLE"
,
"0"
))),
# If set, vLLM will use optimized MLA attention optimizations.
"VLLM_USE_TRITON_OPT_MLA"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_USE_TRITON_OPT_MLA"
,
"0"
))),
# If set, vLLM will use FLASH MLA attention optimizations.
"VLLM_USE_FLASH_MLA"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_USE_FLASH_MLA"
,
"1"
))),
# If set, vLLM will use the Triton implementation of moe_align_block_size,
# If set, vLLM will use the Triton implementation of moe_align_block_size,
# i.e. moe_align_block_size_triton in fused_moe.py.
# i.e. moe_align_block_size_triton in fused_moe.py.
"VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON"
:
"VLLM_ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON"
:
...
@@ -770,6 +762,14 @@ environment_variables: dict[str, Callable[[], Any]] = {
...
@@ -770,6 +762,14 @@ environment_variables: dict[str, Callable[[], Any]] = {
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_TRITON_PREFIX_FLASH_ATTN"
,
"False"
).
lower
()
in
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_TRITON_PREFIX_FLASH_ATTN"
,
"False"
).
lower
()
in
(
"true"
,
"1"
)),
(
"true"
,
"1"
)),
# If set, vLLM will use optimized MLA attention optimizations.
"VLLM_USE_TRITON_OPT_MLA"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_USE_TRITON_OPT_MLA"
,
"0"
))),
# If set, vLLM will use FLASH MLA attention optimizations.
"VLLM_USE_FLASH_MLA"
:
lambda
:
bool
(
int
(
os
.
getenv
(
"VLLM_USE_FLASH_MLA"
,
"1"
))),
# flag to control vllm to use optimized kernels
# flag to control vllm to use optimized kernels
"VLLM_USE_OPT_OP"
:
"VLLM_USE_OPT_OP"
:
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_OPT_OP"
,
"True"
).
lower
()
in
lambda
:
(
os
.
environ
.
get
(
"VLLM_USE_OPT_OP"
,
"True"
).
lower
()
in
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment