Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
d1af8b7b
Unverified
Commit
d1af8b7b
authored
Aug 10, 2025
by
Doug Smith
Committed by
GitHub
Aug 10, 2025
Browse files
enable Docker-aware precompiled wheel setup (#22106)
Signed-off-by:
dougbtv
<
dosmith@redhat.com
>
parent
68b254d6
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
116 additions
and
95 deletions
+116
-95
docker/Dockerfile
docker/Dockerfile
+5
-10
setup.py
setup.py
+102
-83
vllm/envs.py
vllm/envs.py
+9
-2
No files found.
docker/Dockerfile
View file @
d1af8b7b
...
...
@@ -210,16 +210,7 @@ ARG SCCACHE_REGION_NAME=us-west-2
ARG
SCCACHE_S3_NO_CREDENTIALS=0
# Flag to control whether to use pre-built vLLM wheels
ARG
VLLM_USE_PRECOMPILED
# TODO: setup.py only checks whether VLLM_USE_PRECOMPILED is non-empty, so even "=0" is treated as true; this should be fixed.
ENV
VLLM_USE_PRECOMPILED=""
RUN if
[
"
${
VLLM_USE_PRECOMPILED
}
"
=
"1"
]
;
then
\
export
VLLM_USE_PRECOMPILED
=
1
&&
\
echo
"Using precompiled wheels"
;
\
else
\
unset
VLLM_USE_PRECOMPILED
&&
\
echo
"Leaving VLLM_USE_PRECOMPILED unset to build wheels from source"
;
\
fi
ARG
VLLM_USE_PRECOMPILED=""
# if USE_SCCACHE is set, use sccache to speed up compilation
RUN
--mount
=
type
=
cache,target
=
/root/.cache/uv
\
...
...
@@ -236,6 +227,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
&&
export
SCCACHE_S3_NO_CREDENTIALS
=
${
SCCACHE_S3_NO_CREDENTIALS
}
\
&&
export
SCCACHE_IDLE_TIMEOUT
=
0
\
&&
export
CMAKE_BUILD_TYPE
=
Release
\
&&
export
VLLM_USE_PRECOMPILED
=
"
${
VLLM_USE_PRECOMPILED
}
"
\
&&
export
VLLM_DOCKER_BUILD_CONTEXT
=
1
\
&&
sccache
--show-stats
\
&&
python3 setup.py bdist_wheel
--dist-dir
=
dist
--py-limited-api
=
cp38
\
&&
sccache
--show-stats
;
\
...
...
@@ -249,6 +242,8 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
# Clean any existing CMake artifacts
rm -rf .deps && \
mkdir -p .deps && \
export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" && \
export VLLM_DOCKER_BUILD_CONTEXT=1 && \
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
fi
...
...
setup.py
View file @
d1af8b7b
...
...
@@ -7,6 +7,7 @@ import json
import
logging
import
os
import
re
import
shutil
import
subprocess
import
sys
from
pathlib
import
Path
...
...
@@ -281,10 +282,81 @@ class cmake_build_ext(build_ext):
self
.
copy_file
(
file
,
dst_file
)
class
repackage_wheel
(
build_ext
):
class
precompiled_build_ext
(
build_ext
):
"""Disables extension building when using precompiled binaries."""
def
run
(
self
)
->
None
:
assert
_is_cuda
(
),
"VLLM_USE_PRECOMPILED is only supported for CUDA builds"
def
build_extensions
(
self
)
->
None
:
print
(
"Skipping build_ext: using precompiled extensions."
)
return
class
precompiled_wheel_utils
:
"""Extracts libraries and other files from an existing wheel."""
def
get_base_commit_in_main_branch
(
self
)
->
str
:
@staticmethod
def extract_precompiled_and_patch_package(wheel_url_or_path: str) -> dict:
    """Copy precompiled artifacts out of a vLLM wheel into the source tree.

    If ``wheel_url_or_path`` names an existing local file it is used as-is.
    Otherwise it is treated as a URL and downloaded into a temporary
    directory, which is removed again before this function returns.

    The extracted members are the known ``*.abi3.so`` extension modules
    plus every non-hidden ``*.py`` file below ``vllm/vllm_flash_attn/``.
    Each member is written relative to the current working directory,
    keeping the path it has inside the wheel.

    Args:
        wheel_url_or_path: Local wheel path, or a URL to download it from.

    Returns:
        A mapping from dotted package name to the basenames of the files
        extracted for that package (shared objects first, then the
        flash-attn Python sources, each in wheel order).

    Raises:
        Nothing is caught here: download and zip errors propagate.
    """
    import tempfile
    import zipfile

    # A set gives constant-time membership tests while scanning the wheel.
    files_to_copy = {
        "vllm/_C.abi3.so",
        "vllm/_moe_C.abi3.so",
        "vllm/_flashmla_C.abi3.so",
        "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
        "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
        "vllm/cumem_allocator.abi3.so",
    }
    # Python sources of vllm_flash_attn, at any depth, skipping dot-files.
    compiled_regex = re.compile(
        r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")

    temp_dir = None
    try:
        if os.path.isfile(wheel_url_or_path):
            wheel_path = wheel_url_or_path
            print(f"Using existing wheel at {wheel_path}")
        else:
            # Assume the wheel file name is the last URL path component.
            wheel_filename = wheel_url_or_path.split("/")[-1]
            temp_dir = tempfile.mkdtemp(prefix="vllm-wheels")
            wheel_path = os.path.join(temp_dir, wheel_filename)
            print(f"Downloading wheel from {wheel_url_or_path} "
                  f"to {wheel_path}")
            from urllib.request import urlretrieve
            urlretrieve(wheel_url_or_path, filename=wheel_path)

        package_data_patch = {}
        with zipfile.ZipFile(wheel_path) as wheel:
            file_members = [
                member for member in wheel.filelist
                if member.filename in files_to_copy
            ]
            # The pattern carries no end anchor, so use fullmatch: a plain
            # match() would also accept names that merely start with a
            # ``*.py`` path, such as ``*.pyi`` or ``*.pyc``.
            file_members += [
                member for member in wheel.filelist
                if compiled_regex.fullmatch(member.filename)
            ]

            for file in file_members:
                print(f"[extract] {file.filename}")
                target_path = os.path.join(".", file.filename)
                os.makedirs(os.path.dirname(target_path), exist_ok=True)
                with wheel.open(file.filename) as src, \
                        open(target_path, "wb") as dst:
                    shutil.copyfileobj(src, dst)

                pkg = os.path.dirname(file.filename).replace("/", ".")
                package_data_patch.setdefault(pkg, []).append(
                    os.path.basename(file.filename))

        return package_data_patch
    finally:
        # Only a downloaded wheel lives in a directory that we own.
        if temp_dir is not None:
            print(f"Removing temporary directory {temp_dir}")
            shutil.rmtree(temp_dir)
@
staticmethod
def
get_base_commit_in_main_branch
()
->
str
:
# Force to use the nightly wheel. This is mainly used for CI testing.
if
envs
.
VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL
:
return
"nightly"
...
...
@@ -297,6 +369,10 @@ class repackage_wheel(build_ext):
]).
decode
(
"utf-8"
)
upstream_main_commit
=
json
.
loads
(
resp_json
)[
"sha"
]
# In Docker build context, .git may be immutable or missing.
if
envs
.
VLLM_DOCKER_BUILD_CONTEXT
:
return
upstream_main_commit
# Check if the upstream_main_commit exists in the local repo
try
:
subprocess
.
check_output
(
...
...
@@ -329,86 +405,6 @@ class repackage_wheel(build_ext):
"wheel may not be compatible with your dev branch: %s"
,
err
)
return
"nightly"
def
run
(
self
)
->
None
:
assert
_is_cuda
(
),
"VLLM_USE_PRECOMPILED is only supported for CUDA builds"
wheel_location
=
os
.
getenv
(
"VLLM_PRECOMPILED_WHEEL_LOCATION"
,
None
)
if
wheel_location
is
None
:
base_commit
=
self
.
get_base_commit_in_main_branch
()
wheel_location
=
f
"https://wheels.vllm.ai/
{
base_commit
}
/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
# Fallback to nightly wheel if latest commit wheel is unavailable,
# in this rare case, the nightly release CI hasn't finished on main.
if
not
is_url_available
(
wheel_location
):
wheel_location
=
"https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
import
zipfile
if
os
.
path
.
isfile
(
wheel_location
):
wheel_path
=
wheel_location
print
(
f
"Using existing wheel=
{
wheel_path
}
"
)
else
:
# Download the wheel from a given URL, assume
# the filename is the last part of the URL
wheel_filename
=
wheel_location
.
split
(
"/"
)[
-
1
]
import
tempfile
# create a temporary directory to store the wheel
temp_dir
=
tempfile
.
mkdtemp
(
prefix
=
"vllm-wheels"
)
wheel_path
=
os
.
path
.
join
(
temp_dir
,
wheel_filename
)
print
(
f
"Downloading wheel from
{
wheel_location
}
to
{
wheel_path
}
"
)
from
urllib.request
import
urlretrieve
try
:
urlretrieve
(
wheel_location
,
filename
=
wheel_path
)
except
Exception
as
e
:
from
setuptools.errors
import
SetupError
raise
SetupError
(
f
"Failed to get vLLM wheel from
{
wheel_location
}
"
)
from
e
with
zipfile
.
ZipFile
(
wheel_path
)
as
wheel
:
files_to_copy
=
[
"vllm/_C.abi3.so"
,
"vllm/_moe_C.abi3.so"
,
"vllm/_flashmla_C.abi3.so"
,
"vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so"
,
"vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so"
,
"vllm/cumem_allocator.abi3.so"
,
# "vllm/_version.py", # not available in nightly wheels yet
]
file_members
=
list
(
filter
(
lambda
x
:
x
.
filename
in
files_to_copy
,
wheel
.
filelist
))
# vllm_flash_attn python code:
# Regex from
# `glob.translate('vllm/vllm_flash_attn/**/*.py', recursive=True)`
compiled_regex
=
re
.
compile
(
r
"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py"
)
file_members
+=
list
(
filter
(
lambda
x
:
compiled_regex
.
match
(
x
.
filename
),
wheel
.
filelist
))
for
file
in
file_members
:
print
(
f
"Extracting and including
{
file
.
filename
}
"
"from existing wheel"
)
package_name
=
os
.
path
.
dirname
(
file
.
filename
).
replace
(
"/"
,
"."
)
file_name
=
os
.
path
.
basename
(
file
.
filename
)
if
package_name
not
in
package_data
:
package_data
[
package_name
]
=
[]
wheel
.
extract
(
file
)
if
file_name
.
endswith
(
".py"
):
# python files shouldn't be added to package_data
continue
package_data
[
package_name
].
append
(
file_name
)
def _no_device() -> bool:
    """Return True when ``VLLM_TARGET_DEVICE`` is the string ``"empty"``."""
    target = VLLM_TARGET_DEVICE
    return target == "empty"
...
...
@@ -639,6 +635,29 @@ package_data = {
]
}
# When precompiled binaries are requested, pull them out of an existing wheel
# and merge the extracted file names into package_data before setup() runs.
if envs.VLLM_USE_PRECOMPILED:
    assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"

    nightly_wheel_url = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"

    wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None)
    if wheel_location is None:
        # No explicit wheel given: derive the URL from the base commit.
        base_commit = precompiled_wheel_utils.get_base_commit_in_main_branch()
        wheel_url = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"

        # NOTE(review): the availability probe is assumed to apply only to
        # the derived per-commit URL; the scraped diff lost indentation, so
        # confirm this nesting against the real file.
        from urllib.request import urlopen
        try:
            with urlopen(wheel_url) as resp:
                reachable = resp.status == 200
            if not reachable:
                wheel_url = nightly_wheel_url
        except Exception as e:
            print(f"[warn] Falling back to nightly wheel: {e}")
            wheel_url = nightly_wheel_url
    else:
        # An explicitly configured location is used verbatim.
        wheel_url = wheel_location

    patch = precompiled_wheel_utils.extract_precompiled_and_patch_package(
        wheel_url)
    for pkg, files in patch.items():
        package_data.setdefault(pkg, []).extend(files)
if
_no_device
():
ext_modules
=
[]
...
...
@@ -647,7 +666,7 @@ if not ext_modules:
else
:
cmdclass
=
{
"build_ext"
:
re
package_wheel
if
envs
.
VLLM_USE_PRECOMPILED
else
cmake_build_ext
p
re
compiled_build_ext
if
envs
.
VLLM_USE_PRECOMPILED
else
cmake_build_ext
}
setup
(
...
...
vllm/envs.py
View file @
d1af8b7b
...
...
@@ -70,6 +70,7 @@ if TYPE_CHECKING:
MAX_JOBS
:
Optional
[
str
]
=
None
NVCC_THREADS
:
Optional
[
str
]
=
None
VLLM_USE_PRECOMPILED
:
bool
=
False
VLLM_DOCKER_BUILD_CONTEXT
:
bool
=
False
VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL
:
bool
=
False
VLLM_KEEP_ALIVE_ON_ENGINE_DEATH
:
bool
=
False
CMAKE_BUILD_TYPE
:
Optional
[
str
]
=
None
...
...
@@ -234,8 +235,14 @@ environment_variables: dict[str, Callable[[], Any]] = {
# If set, vllm will use precompiled binaries (*.so)
"VLLM_USE_PRECOMPILED"
:
lambda
:
bool
(
os
.
environ
.
get
(
"VLLM_USE_PRECOMPILED"
))
or
bool
(
os
.
environ
.
get
(
"VLLM_PRECOMPILED_WHEEL_LOCATION"
)),
lambda
:
os
.
environ
.
get
(
"VLLM_USE_PRECOMPILED"
,
""
).
strip
().
lower
()
in
(
"1"
,
"true"
)
or
bool
(
os
.
environ
.
get
(
"VLLM_PRECOMPILED_WHEEL_LOCATION"
)),
# Marks that setup.py is running in a Docker build context, where .git may be
# immutable or missing, so the precompiled-wheel lookup uses the upstream main commit without checking the local repo.
"VLLM_DOCKER_BUILD_CONTEXT"
:
lambda
:
os
.
environ
.
get
(
"VLLM_DOCKER_BUILD_CONTEXT"
,
""
).
strip
().
lower
()
in
(
"1"
,
"true"
),
# Whether to force using nightly wheel in python build.
# This is used for testing the nightly wheel in python build.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment