Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
d1af8b7b
Unverified
Commit
d1af8b7b
authored
Aug 10, 2025
by
Doug Smith
Committed by
GitHub
Aug 10, 2025
Browse files
enable Docker-aware precompiled wheel setup (#22106)
Signed-off-by: dougbtv <dosmith@redhat.com>
parent
68b254d6
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
116 additions
and
95 deletions
+116
-95
docker/Dockerfile
docker/Dockerfile
+5
-10
setup.py
setup.py
+102
-83
vllm/envs.py
vllm/envs.py
+9
-2
No files found.
docker/Dockerfile
View file @
d1af8b7b
...
@@ -210,16 +210,7 @@ ARG SCCACHE_REGION_NAME=us-west-2
...
@@ -210,16 +210,7 @@ ARG SCCACHE_REGION_NAME=us-west-2
ARG
SCCACHE_S3_NO_CREDENTIALS=0
ARG
SCCACHE_S3_NO_CREDENTIALS=0
# Flag to control whether to use pre-built vLLM wheels
# Flag to control whether to use pre-built vLLM wheels
ARG
VLLM_USE_PRECOMPILED
ARG
VLLM_USE_PRECOMPILED=""
# TODO: in setup.py VLLM_USE_PRECOMPILED is sensitive to truthiness, it will take =0 as "true", this should be fixed
ENV
VLLM_USE_PRECOMPILED=""
RUN if
[
"
${
VLLM_USE_PRECOMPILED
}
"
=
"1"
]
;
then
\
export
VLLM_USE_PRECOMPILED
=
1
&&
\
echo
"Using precompiled wheels"
;
\
else
\
unset
VLLM_USE_PRECOMPILED
&&
\
echo
"Leaving VLLM_USE_PRECOMPILED unset to build wheels from source"
;
\
fi
# if USE_SCCACHE is set, use sccache to speed up compilation
# if USE_SCCACHE is set, use sccache to speed up compilation
RUN
--mount
=
type
=
cache,target
=
/root/.cache/uv
\
RUN
--mount
=
type
=
cache,target
=
/root/.cache/uv
\
...
@@ -236,6 +227,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
...
@@ -236,6 +227,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
&&
export
SCCACHE_S3_NO_CREDENTIALS
=
${
SCCACHE_S3_NO_CREDENTIALS
}
\
&&
export
SCCACHE_S3_NO_CREDENTIALS
=
${
SCCACHE_S3_NO_CREDENTIALS
}
\
&&
export
SCCACHE_IDLE_TIMEOUT
=
0
\
&&
export
SCCACHE_IDLE_TIMEOUT
=
0
\
&&
export
CMAKE_BUILD_TYPE
=
Release
\
&&
export
CMAKE_BUILD_TYPE
=
Release
\
&&
export
VLLM_USE_PRECOMPILED
=
"
${
VLLM_USE_PRECOMPILED
}
"
\
&&
export
VLLM_DOCKER_BUILD_CONTEXT
=
1
\
&&
sccache
--show-stats
\
&&
sccache
--show-stats
\
&&
python3 setup.py bdist_wheel
--dist-dir
=
dist
--py-limited-api
=
cp38
\
&&
python3 setup.py bdist_wheel
--dist-dir
=
dist
--py-limited-api
=
cp38
\
&&
sccache
--show-stats
;
\
&&
sccache
--show-stats
;
\
...
@@ -249,6 +242,8 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
...
@@ -249,6 +242,8 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
# Clean any existing CMake artifacts
# Clean any existing CMake artifacts
rm -rf .deps && \
rm -rf .deps && \
mkdir -p .deps && \
mkdir -p .deps && \
export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" && \
export VLLM_DOCKER_BUILD_CONTEXT=1 && \
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
fi
fi
...
...
setup.py
View file @
d1af8b7b
...
@@ -7,6 +7,7 @@ import json
...
@@ -7,6 +7,7 @@ import json
import
logging
import
logging
import
os
import
os
import
re
import
re
import
shutil
import
subprocess
import
subprocess
import
sys
import
sys
from
pathlib
import
Path
from
pathlib
import
Path
...
@@ -281,10 +282,81 @@ class cmake_build_ext(build_ext):
...
@@ -281,10 +282,81 @@ class cmake_build_ext(build_ext):
self
.
copy_file
(
file
,
dst_file
)
self
.
copy_file
(
file
,
dst_file
)
class
repackage_wheel
(
build_ext
):
class precompiled_build_ext(build_ext):
    """``build_ext`` replacement that compiles nothing.

    Selected as the ``build_ext`` command when ``envs.VLLM_USE_PRECOMPILED``
    is set, so the extension binaries are expected to come from elsewhere.
    """

    def run(self) -> None:
        """Only verify that this is a CUDA build; no build steps are run."""
        cuda_only_msg = "VLLM_USE_PRECOMPILED is only supported for CUDA builds"
        assert _is_cuda(), cuda_only_msg

    def build_extensions(self) -> None:
        """Report that extension compilation is skipped."""
        print("Skipping build_ext: using precompiled extensions.")
class
precompiled_wheel_utils
:
"""Extracts libraries and other files from an existing wheel."""
"""Extracts libraries and other files from an existing wheel."""
def
get_base_commit_in_main_branch
(
self
)
->
str
:
@staticmethod
def extract_precompiled_and_patch_package(
        wheel_url_or_path: str) -> dict[str, list[str]]:
    """Copy precompiled artifacts from a wheel into the current directory.

    Args:
        wheel_url_or_path: Path of an existing wheel file. Anything that is
            not an existing file is treated as a URL and downloaded into a
            temporary directory first (the last ``/``-separated component
            is used as the file name).

    Returns:
        Mapping of dotted package name (e.g. ``"vllm.vllm_flash_attn"``) to
        the base names of the files extracted for that package.

    Errors raised by the download or by opening the archive propagate
    unchanged. The temporary download directory, if one was created, is
    always removed.
    """
    import tempfile
    import zipfile

    temp_dir = None
    try:
        if not os.path.isfile(wheel_url_or_path):
            wheel_filename = wheel_url_or_path.split("/")[-1]
            temp_dir = tempfile.mkdtemp(prefix="vllm-wheels")
            wheel_path = os.path.join(temp_dir, wheel_filename)
            print(f"Downloading wheel from {wheel_url_or_path} "
                  f"to {wheel_path}")
            from urllib.request import urlretrieve
            urlretrieve(wheel_url_or_path, filename=wheel_path)
        else:
            wheel_path = wheel_url_or_path
            print(f"Using existing wheel at {wheel_path}")

        package_data_patch: dict[str, list[str]] = {}

        with zipfile.ZipFile(wheel_path) as wheel:
            files_to_copy = [
                "vllm/_C.abi3.so",
                "vllm/_moe_C.abi3.so",
                "vllm/_flashmla_C.abi3.so",
                "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
                "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
                "vllm/cumem_allocator.abi3.so",
            ]

            # Python sources of vllm_flash_attn, at any depth, skipping
            # dot-prefixed path components.
            compiled_regex = re.compile(
                r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")

            # Binaries first, then the Python sources, each in archive order.
            file_members = [
                member for member in wheel.filelist
                if member.filename in files_to_copy
            ]
            # fullmatch, not match: the pattern carries no end anchor, so
            # match() would also accept "*.pyc", "*.pyi" and similar names.
            file_members += [
                member for member in wheel.filelist
                if compiled_regex.fullmatch(member.filename)
            ]

            for member in file_members:
                print(f"[extract] {member.filename}")
                target_path = os.path.join(".", member.filename)
                os.makedirs(os.path.dirname(target_path), exist_ok=True)
                with wheel.open(member.filename) as src, \
                        open(target_path, "wb") as dst:
                    shutil.copyfileobj(src, dst)

                pkg = os.path.dirname(member.filename).replace("/", ".")
                package_data_patch.setdefault(pkg, []).append(
                    os.path.basename(member.filename))

        return package_data_patch
    finally:
        if temp_dir is not None:
            print(f"Removing temporary directory {temp_dir}")
            shutil.rmtree(temp_dir)
@
staticmethod
def
get_base_commit_in_main_branch
()
->
str
:
# Force to use the nightly wheel. This is mainly used for CI testing.
# Force to use the nightly wheel. This is mainly used for CI testing.
if
envs
.
VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL
:
if
envs
.
VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL
:
return
"nightly"
return
"nightly"
...
@@ -297,6 +369,10 @@ class repackage_wheel(build_ext):
...
@@ -297,6 +369,10 @@ class repackage_wheel(build_ext):
]).
decode
(
"utf-8"
)
]).
decode
(
"utf-8"
)
upstream_main_commit
=
json
.
loads
(
resp_json
)[
"sha"
]
upstream_main_commit
=
json
.
loads
(
resp_json
)[
"sha"
]
# In Docker build context, .git may be immutable or missing.
if
envs
.
VLLM_DOCKER_BUILD_CONTEXT
:
return
upstream_main_commit
# Check if the upstream_main_commit exists in the local repo
# Check if the upstream_main_commit exists in the local repo
try
:
try
:
subprocess
.
check_output
(
subprocess
.
check_output
(
...
@@ -329,86 +405,6 @@ class repackage_wheel(build_ext):
...
@@ -329,86 +405,6 @@ class repackage_wheel(build_ext):
"wheel may not be compatible with your dev branch: %s"
,
err
)
"wheel may not be compatible with your dev branch: %s"
,
err
)
return
"nightly"
return
"nightly"
def
run
(
self
)
->
None
:
assert
_is_cuda
(
),
"VLLM_USE_PRECOMPILED is only supported for CUDA builds"
wheel_location
=
os
.
getenv
(
"VLLM_PRECOMPILED_WHEEL_LOCATION"
,
None
)
if
wheel_location
is
None
:
base_commit
=
self
.
get_base_commit_in_main_branch
()
wheel_location
=
f
"https://wheels.vllm.ai/
{
base_commit
}
/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
# Fallback to nightly wheel if latest commit wheel is unavailable,
# in this rare case, the nightly release CI hasn't finished on main.
if
not
is_url_available
(
wheel_location
):
wheel_location
=
"https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
import
zipfile
if
os
.
path
.
isfile
(
wheel_location
):
wheel_path
=
wheel_location
print
(
f
"Using existing wheel=
{
wheel_path
}
"
)
else
:
# Download the wheel from a given URL, assume
# the filename is the last part of the URL
wheel_filename
=
wheel_location
.
split
(
"/"
)[
-
1
]
import
tempfile
# create a temporary directory to store the wheel
temp_dir
=
tempfile
.
mkdtemp
(
prefix
=
"vllm-wheels"
)
wheel_path
=
os
.
path
.
join
(
temp_dir
,
wheel_filename
)
print
(
f
"Downloading wheel from
{
wheel_location
}
to
{
wheel_path
}
"
)
from
urllib.request
import
urlretrieve
try
:
urlretrieve
(
wheel_location
,
filename
=
wheel_path
)
except
Exception
as
e
:
from
setuptools.errors
import
SetupError
raise
SetupError
(
f
"Failed to get vLLM wheel from
{
wheel_location
}
"
)
from
e
with
zipfile
.
ZipFile
(
wheel_path
)
as
wheel
:
files_to_copy
=
[
"vllm/_C.abi3.so"
,
"vllm/_moe_C.abi3.so"
,
"vllm/_flashmla_C.abi3.so"
,
"vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so"
,
"vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so"
,
"vllm/cumem_allocator.abi3.so"
,
# "vllm/_version.py", # not available in nightly wheels yet
]
file_members
=
list
(
filter
(
lambda
x
:
x
.
filename
in
files_to_copy
,
wheel
.
filelist
))
# vllm_flash_attn python code:
# Regex from
# `glob.translate('vllm/vllm_flash_attn/**/*.py', recursive=True)`
compiled_regex
=
re
.
compile
(
r
"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py"
)
file_members
+=
list
(
filter
(
lambda
x
:
compiled_regex
.
match
(
x
.
filename
),
wheel
.
filelist
))
for
file
in
file_members
:
print
(
f
"Extracting and including
{
file
.
filename
}
"
"from existing wheel"
)
package_name
=
os
.
path
.
dirname
(
file
.
filename
).
replace
(
"/"
,
"."
)
file_name
=
os
.
path
.
basename
(
file
.
filename
)
if
package_name
not
in
package_data
:
package_data
[
package_name
]
=
[]
wheel
.
extract
(
file
)
if
file_name
.
endswith
(
".py"
):
# python files shouldn't be added to package_data
continue
package_data
[
package_name
].
append
(
file_name
)
def _no_device() -> bool:
    """Return True when ``VLLM_TARGET_DEVICE`` is the string ``"empty"``."""
    targets_empty_device = VLLM_TARGET_DEVICE == "empty"
    return targets_empty_device
...
@@ -639,6 +635,29 @@ package_data = {
...
@@ -639,6 +635,29 @@ package_data = {
]
]
}
}
# When precompiled binaries are requested, pull them out of a wheel and merge
# the extracted file names into package_data before setup() is called.
if envs.VLLM_USE_PRECOMPILED:
    assert _is_cuda(), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"

    # An explicitly configured wheel location is used as-is, without probing.
    wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None)
    if wheel_location is None:
        base_commit = precompiled_wheel_utils.get_base_commit_in_main_branch()
        wheel_url = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
        from urllib.request import urlopen

        # Probe the per-commit wheel; switch to the nightly wheel when the
        # probe does not answer 200 or fails with any exception.
        try:
            with urlopen(wheel_url) as resp:
                if resp.status != 200:
                    wheel_url = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
        except Exception as e:
            print(f"[warn] Falling back to nightly wheel: {e}")
            wheel_url = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
    else:
        wheel_url = wheel_location

    patch = precompiled_wheel_utils.extract_precompiled_and_patch_package(
        wheel_url)
    # Append the extracted files to whatever each package already lists.
    for pkg, files in patch.items():
        package_data.setdefault(pkg, []).extend(files)
if
_no_device
():
if
_no_device
():
ext_modules
=
[]
ext_modules
=
[]
...
@@ -647,7 +666,7 @@ if not ext_modules:
...
@@ -647,7 +666,7 @@ if not ext_modules:
else
:
else
:
cmdclass
=
{
cmdclass
=
{
"build_ext"
:
"build_ext"
:
re
package_wheel
if
envs
.
VLLM_USE_PRECOMPILED
else
cmake_build_ext
p
re
compiled_build_ext
if
envs
.
VLLM_USE_PRECOMPILED
else
cmake_build_ext
}
}
setup
(
setup
(
...
...
vllm/envs.py
View file @
d1af8b7b
...
@@ -70,6 +70,7 @@ if TYPE_CHECKING:
...
@@ -70,6 +70,7 @@ if TYPE_CHECKING:
MAX_JOBS
:
Optional
[
str
]
=
None
MAX_JOBS
:
Optional
[
str
]
=
None
NVCC_THREADS
:
Optional
[
str
]
=
None
NVCC_THREADS
:
Optional
[
str
]
=
None
VLLM_USE_PRECOMPILED
:
bool
=
False
VLLM_USE_PRECOMPILED
:
bool
=
False
VLLM_DOCKER_BUILD_CONTEXT
:
bool
=
False
VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL
:
bool
=
False
VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL
:
bool
=
False
VLLM_KEEP_ALIVE_ON_ENGINE_DEATH
:
bool
=
False
VLLM_KEEP_ALIVE_ON_ENGINE_DEATH
:
bool
=
False
CMAKE_BUILD_TYPE
:
Optional
[
str
]
=
None
CMAKE_BUILD_TYPE
:
Optional
[
str
]
=
None
...
@@ -234,8 +235,14 @@ environment_variables: dict[str, Callable[[], Any]] = {
...
@@ -234,8 +235,14 @@ environment_variables: dict[str, Callable[[], Any]] = {
# If set, vllm will use precompiled binaries (*.so)
# If set, vllm will use precompiled binaries (*.so)
"VLLM_USE_PRECOMPILED"
:
"VLLM_USE_PRECOMPILED"
:
lambda
:
bool
(
os
.
environ
.
get
(
"VLLM_USE_PRECOMPILED"
))
or
bool
(
lambda
:
os
.
environ
.
get
(
"VLLM_USE_PRECOMPILED"
,
""
).
strip
().
lower
()
in
os
.
environ
.
get
(
"VLLM_PRECOMPILED_WHEEL_LOCATION"
)),
(
"1"
,
"true"
)
or
bool
(
os
.
environ
.
get
(
"VLLM_PRECOMPILED_WHEEL_LOCATION"
)),
# Used to mark that setup.py is running in a Docker build context,
# in order to force the use of precompiled binaries.
"VLLM_DOCKER_BUILD_CONTEXT"
:
lambda
:
os
.
environ
.
get
(
"VLLM_DOCKER_BUILD_CONTEXT"
,
""
).
strip
().
lower
()
in
(
"1"
,
"true"
),
# Whether to force using nightly wheel in python build.
# Whether to force using nightly wheel in python build.
# This is used for testing the nightly wheel in python build.
# This is used for testing the nightly wheel in python build.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment