Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
da31f6ad
Unverified
Commit
da31f6ad
authored
Aug 01, 2025
by
Simon Mo
Committed by
GitHub
Aug 01, 2025
Browse files
Revert precompile wheel changes (#22055)
parent
98df153a
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
107 additions
and
137 deletions
+107
-137
docker/Dockerfile
docker/Dockerfile
+10
-17
requirements/test.txt
requirements/test.txt
+8
-16
setup.py
setup.py
+87
-95
vllm/envs.py
vllm/envs.py
+2
-9
No files found.
docker/Dockerfile
View file @
da31f6ad
...
...
@@ -206,7 +206,16 @@ ARG SCCACHE_REGION_NAME=us-west-2
ARG
SCCACHE_S3_NO_CREDENTIALS=0
# Flag to control whether to use pre-built vLLM wheels
ARG
VLLM_USE_PRECOMPILED=""
ARG
VLLM_USE_PRECOMPILED
# TODO: setup.py only checks VLLM_USE_PRECOMPILED for truthiness, so any non-empty value (even "0") is treated as true; this should be fixed.
ENV
VLLM_USE_PRECOMPILED=""
RUN if
[
"
${
VLLM_USE_PRECOMPILED
}
"
=
"1"
]
;
then
\
export
VLLM_USE_PRECOMPILED
=
1
&&
\
echo
"Using precompiled wheels"
;
\
else
\
unset
VLLM_USE_PRECOMPILED
&&
\
echo
"Leaving VLLM_USE_PRECOMPILED unset to build wheels from source"
;
\
fi
# if USE_SCCACHE is set, use sccache to speed up compilation
RUN
--mount
=
type
=
cache,target
=
/root/.cache/uv
\
...
...
@@ -223,8 +232,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
&&
export
SCCACHE_S3_NO_CREDENTIALS
=
${
SCCACHE_S3_NO_CREDENTIALS
}
\
&&
export
SCCACHE_IDLE_TIMEOUT
=
0
\
&&
export
CMAKE_BUILD_TYPE
=
Release
\
&&
export
VLLM_USE_PRECOMPILED
=
"
${
VLLM_USE_PRECOMPILED
}
"
\
&&
export
VLLM_DOCKER_BUILD_CONTEXT
=
1
\
&&
sccache
--show-stats
\
&&
python3 setup.py bdist_wheel
--dist-dir
=
dist
--py-limited-api
=
cp38
\
&&
sccache
--show-stats
;
\
...
...
@@ -238,22 +245,9 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
# Clean any existing CMake artifacts
rm -rf .deps && \
mkdir -p .deps && \
export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" && \
export VLLM_DOCKER_BUILD_CONTEXT=1 && \
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
fi
# When using precompiled wheels, keep only the newest manylinux1 wheel and delete others
RUN if
[
"
$VLLM_USE_PRECOMPILED
"
=
"1"
]
;
then
\
echo
"Cleaning up extra wheels in dist/..."
&&
\
# Identify the most recent manylinux1_x86_64 wheel
KEEP_WHEEL=$(ls -t dist/*manylinux1_x86_64.whl 2>/dev/null | head -n1) && \
if [ -n "$KEEP_WHEEL" ]; then \
echo "Keeping wheel: $KEEP_WHEEL"; \
find dist/ -type f -name "*.whl" ! -path "${KEEP_WHEEL}" -delete; \
fi; \
fi
# Check the size of the wheel if RUN_WHEEL_CHECK is true
COPY
.buildkite/check-wheel-size.py check-wheel-size.py
# sync the default value with .buildkite/check-wheel-size.py
...
...
@@ -369,7 +363,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
fi
# Install vllm wheel first, so that torch etc will be installed.
# !bang
RUN
--mount
=
type
=
bind
,from
=
build,src
=
/workspace/dist,target
=
/vllm-workspace/dist
\
--mount
=
type
=
cache,target
=
/root/.cache/uv
\
uv pip
install
--system
dist/
*
.whl
--verbose
\
...
...
requirements/test.txt
View file @
da31f6ad
...
...
@@ -22,7 +22,9 @@ aiohttp==3.10.11
aiohttp-cors==0.8.1
# via ray
aiosignal==1.3.1
# via aiohttp
# via
# aiohttp
# ray
albucore==0.0.16
# via terratorch
albumentations==1.4.6
...
...
@@ -137,7 +139,7 @@ contourpy==1.3.0
# via matplotlib
cramjam==2.9.0
# via fastparquet
cupy-cuda12x==13.
5.1
cupy-cuda12x==13.
3.0
# via ray
cycler==0.12.1
# via matplotlib
...
...
@@ -224,6 +226,7 @@ frozenlist==1.5.0
# via
# aiohttp
# aiosignal
# ray
fsspec==2024.9.0
# via
# datasets
...
...
@@ -600,18 +603,10 @@ opencv-python-headless==4.11.0.86
opentelemetry-api==1.35.0
# via
# mlflow-skinny
# opentelemetry-exporter-prometheus
# opentelemetry-sdk
# opentelemetry-semantic-conventions
opentelemetry-exporter-prometheus==0.56b0
# via ray
opentelemetry-proto==1.36.0
# via ray
opentelemetry-sdk==1.35.0
# via
# mlflow-skinny
# opentelemetry-exporter-prometheus
# ray
# via mlflow-skinny
opentelemetry-semantic-conventions==0.56b0
# via opentelemetry-sdk
packaging==24.2
...
...
@@ -702,9 +697,7 @@ pqdm==0.2.0
pretrainedmodels==0.7.4
# via segmentation-models-pytorch
prometheus-client==0.22.0
# via
# opentelemetry-exporter-prometheus
# ray
# via ray
propcache==0.2.0
# via yarl
proto-plus==1.26.1
...
...
@@ -714,7 +707,6 @@ protobuf==5.28.3
# google-api-core
# googleapis-common-protos
# mlflow-skinny
# opentelemetry-proto
# proto-plus
# ray
# tensorboardx
...
...
@@ -862,7 +854,7 @@ rasterio==1.4.3
# rioxarray
# terratorch
# torchgeo
ray==2.4
8
.0
ray==2.4
3
.0
# via -r requirements/test.in
redis==5.2.0
# via tensorizer
...
...
setup.py
View file @
da31f6ad
...
...
@@ -7,7 +7,6 @@ import json
import
logging
import
os
import
re
import
shutil
import
subprocess
import
sys
from
pathlib
import
Path
...
...
@@ -282,69 +281,10 @@ class cmake_build_ext(build_ext):
self
.
copy_file
(
file
,
dst_file
)
class
p
re
compiled_wheel_utils
:
class
re
package_wheel
(
build_ext
)
:
"""Extracts libraries and other files from an existing wheel."""
@
staticmethod
def
extract_precompiled_and_patch_package
(
wheel_url_or_path
:
str
)
->
dict
:
import
tempfile
import
zipfile
temp_dir
=
None
try
:
if
not
os
.
path
.
isfile
(
wheel_url_or_path
):
wheel_filename
=
wheel_url_or_path
.
split
(
"/"
)[
-
1
]
temp_dir
=
tempfile
.
mkdtemp
(
prefix
=
"vllm-wheels"
)
wheel_path
=
os
.
path
.
join
(
temp_dir
,
wheel_filename
)
print
(
f
"Downloading wheel from
{
wheel_url_or_path
}
"
f
"to
{
wheel_path
}
"
)
from
urllib.request
import
urlretrieve
urlretrieve
(
wheel_url_or_path
,
filename
=
wheel_path
)
else
:
wheel_path
=
wheel_url_or_path
print
(
f
"Using existing wheel at
{
wheel_path
}
"
)
package_data_patch
=
{}
with
zipfile
.
ZipFile
(
wheel_path
)
as
wheel
:
files_to_copy
=
[
"vllm/_C.abi3.so"
,
"vllm/_moe_C.abi3.so"
,
"vllm/_flashmla_C.abi3.so"
,
"vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so"
,
"vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so"
,
"vllm/cumem_allocator.abi3.so"
,
]
compiled_regex
=
re
.
compile
(
r
"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py"
)
file_members
=
list
(
filter
(
lambda
x
:
x
.
filename
in
files_to_copy
,
wheel
.
filelist
))
file_members
+=
list
(
filter
(
lambda
x
:
compiled_regex
.
match
(
x
.
filename
),
wheel
.
filelist
))
for
file
in
file_members
:
print
(
f
"[extract]
{
file
.
filename
}
"
)
target_path
=
os
.
path
.
join
(
"."
,
file
.
filename
)
os
.
makedirs
(
os
.
path
.
dirname
(
target_path
),
exist_ok
=
True
)
with
wheel
.
open
(
file
.
filename
)
as
src
,
open
(
target_path
,
"wb"
)
as
dst
:
shutil
.
copyfileobj
(
src
,
dst
)
pkg
=
os
.
path
.
dirname
(
file
.
filename
).
replace
(
"/"
,
"."
)
package_data_patch
.
setdefault
(
pkg
,
[]).
append
(
os
.
path
.
basename
(
file
.
filename
))
return
package_data_patch
finally
:
if
temp_dir
is
not
None
:
print
(
f
"Removing temporary directory
{
temp_dir
}
"
)
shutil
.
rmtree
(
temp_dir
)
@
staticmethod
def
get_base_commit_in_main_branch
()
->
str
:
def
get_base_commit_in_main_branch
(
self
)
->
str
:
# Force to use the nightly wheel. This is mainly used for CI testing.
if
envs
.
VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL
:
return
"nightly"
...
...
@@ -357,10 +297,6 @@ class precompiled_wheel_utils:
]).
decode
(
"utf-8"
)
upstream_main_commit
=
json
.
loads
(
resp_json
)[
"sha"
]
# In Docker build context, .git may be immutable or missing.
if
envs
.
VLLM_DOCKER_BUILD_CONTEXT
:
return
upstream_main_commit
# Check if the upstream_main_commit exists in the local repo
try
:
subprocess
.
check_output
(
...
...
@@ -393,15 +329,92 @@ class precompiled_wheel_utils:
"wheel may not be compatible with your dev branch: %s"
,
err
)
return
"nightly"
def
run
(
self
)
->
None
:
assert
_is_cuda
(
),
"VLLM_USE_PRECOMPILED is only supported for CUDA builds"
wheel_location
=
os
.
getenv
(
"VLLM_PRECOMPILED_WHEEL_LOCATION"
,
None
)
if
wheel_location
is
None
:
base_commit
=
self
.
get_base_commit_in_main_branch
()
wheel_location
=
f
"https://wheels.vllm.ai/
{
base_commit
}
/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
# Fall back to the nightly wheel if the wheel for the latest commit is unavailable;
# in this rare case, the nightly release CI hasn't finished on main yet.
if
not
is_url_available
(
wheel_location
):
wheel_location
=
"https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
import
zipfile
if
os
.
path
.
isfile
(
wheel_location
):
wheel_path
=
wheel_location
print
(
f
"Using existing wheel=
{
wheel_path
}
"
)
else
:
# Download the wheel from the given URL, assuming
# the filename is the last part of the URL.
wheel_filename
=
wheel_location
.
split
(
"/"
)[
-
1
]
import
tempfile
# create a temporary directory to store the wheel
temp_dir
=
tempfile
.
mkdtemp
(
prefix
=
"vllm-wheels"
)
wheel_path
=
os
.
path
.
join
(
temp_dir
,
wheel_filename
)
print
(
f
"Downloading wheel from
{
wheel_location
}
to
{
wheel_path
}
"
)
from
urllib.request
import
urlretrieve
try
:
urlretrieve
(
wheel_location
,
filename
=
wheel_path
)
except
Exception
as
e
:
from
setuptools.errors
import
SetupError
raise
SetupError
(
f
"Failed to get vLLM wheel from
{
wheel_location
}
"
)
from
e
with
zipfile
.
ZipFile
(
wheel_path
)
as
wheel
:
files_to_copy
=
[
"vllm/_C.abi3.so"
,
"vllm/_moe_C.abi3.so"
,
"vllm/_flashmla_C.abi3.so"
,
"vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so"
,
"vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so"
,
"vllm/cumem_allocator.abi3.so"
,
# "vllm/_version.py", # not available in nightly wheels yet
]
file_members
=
list
(
filter
(
lambda
x
:
x
.
filename
in
files_to_copy
,
wheel
.
filelist
))
# vllm_flash_attn python code:
# Regex from
# `glob.translate('vllm/vllm_flash_attn/**/*.py', recursive=True)`
compiled_regex
=
re
.
compile
(
r
"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py"
)
file_members
+=
list
(
filter
(
lambda
x
:
compiled_regex
.
match
(
x
.
filename
),
wheel
.
filelist
))
for
file
in
file_members
:
print
(
f
"Extracting and including
{
file
.
filename
}
"
"from existing wheel"
)
package_name
=
os
.
path
.
dirname
(
file
.
filename
).
replace
(
"/"
,
"."
)
file_name
=
os
.
path
.
basename
(
file
.
filename
)
if
package_name
not
in
package_data
:
package_data
[
package_name
]
=
[]
wheel
.
extract
(
file
)
if
file_name
.
endswith
(
".py"
):
# python files shouldn't be added to package_data
continue
package_data
[
package_name
].
append
(
file_name
)
def _no_device() -> bool:
    """Return True when ``VLLM_TARGET_DEVICE`` equals ``"empty"``."""
    targets_nothing = VLLM_TARGET_DEVICE == "empty"
    return targets_nothing
def
_is_cuda
()
->
bool
:
# Allow forced CUDA in Docker/precompiled builds, even without torch.cuda
if
envs
.
VLLM_USE_PRECOMPILED
and
envs
.
VLLM_DOCKER_BUILD_CONTEXT
:
return
True
has_cuda
=
torch
.
version
.
cuda
is
not
None
return
(
VLLM_TARGET_DEVICE
==
"cuda"
and
has_cuda
and
not
(
_is_neuron
()
or
_is_tpu
()))
...
...
@@ -626,37 +639,16 @@ package_data = {
]
}
# If using precompiled, extract and patch package_data (in advance of setup)
if
envs
.
VLLM_USE_PRECOMPILED
:
assert
_is_cuda
(),
"VLLM_USE_PRECOMPILED is only supported for CUDA builds"
wheel_location
=
os
.
getenv
(
"VLLM_PRECOMPILED_WHEEL_LOCATION"
,
None
)
if
wheel_location
is
not
None
:
wheel_url
=
wheel_location
else
:
base_commit
=
precompiled_wheel_utils
.
get_base_commit_in_main_branch
()
wheel_url
=
f
"https://wheels.vllm.ai/
{
base_commit
}
/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
from
urllib.request
import
urlopen
try
:
with
urlopen
(
wheel_url
)
as
resp
:
if
resp
.
status
!=
200
:
wheel_url
=
"https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
except
Exception
as
e
:
print
(
f
"[warn] Falling back to nightly wheel:
{
e
}
"
)
wheel_url
=
"https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
patch
=
precompiled_wheel_utils
.
extract_precompiled_and_patch_package
(
wheel_url
)
for
pkg
,
files
in
patch
.
items
():
package_data
.
setdefault
(
pkg
,
[]).
extend
(
files
)
if
_no_device
():
ext_modules
=
[]
if
not
ext_modules
or
envs
.
VLLM_USE_PRECOMPILED
:
# Disable build_ext when using precompiled wheel
if
not
ext_modules
:
cmdclass
=
{}
else
:
cmdclass
=
{
"build_ext"
:
cmake_build_ext
}
cmdclass
=
{
"build_ext"
:
repackage_wheel
if
envs
.
VLLM_USE_PRECOMPILED
else
cmake_build_ext
}
setup
(
# static metadata should rather go in pyproject.toml
...
...
vllm/envs.py
View file @
da31f6ad
...
...
@@ -68,7 +68,6 @@ if TYPE_CHECKING:
MAX_JOBS
:
Optional
[
str
]
=
None
NVCC_THREADS
:
Optional
[
str
]
=
None
VLLM_USE_PRECOMPILED
:
bool
=
False
VLLM_DOCKER_BUILD_CONTEXT
:
bool
=
False
VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL
:
bool
=
False
VLLM_NO_DEPRECATION_WARNING
:
bool
=
False
VLLM_KEEP_ALIVE_ON_ENGINE_DEATH
:
bool
=
False
...
...
@@ -228,14 +227,8 @@ environment_variables: dict[str, Callable[[], Any]] = {
# If set, vllm will use precompiled binaries (*.so)
"VLLM_USE_PRECOMPILED"
:
lambda
:
os
.
environ
.
get
(
"VLLM_USE_PRECOMPILED"
,
""
).
strip
().
lower
()
in
(
"1"
,
"true"
)
or
bool
(
os
.
environ
.
get
(
"VLLM_PRECOMPILED_WHEEL_LOCATION"
)),
# Used to mark that setup.py is running in a Docker build context,
# in order to force the use of precompiled binaries.
"VLLM_DOCKER_BUILD_CONTEXT"
:
lambda
:
os
.
environ
.
get
(
"VLLM_DOCKER_BUILD_CONTEXT"
,
""
).
strip
().
lower
()
in
(
"1"
,
"true"
),
lambda
:
bool
(
os
.
environ
.
get
(
"VLLM_USE_PRECOMPILED"
))
or
bool
(
os
.
environ
.
get
(
"VLLM_PRECOMPILED_WHEEL_LOCATION"
)),
# Whether to force using nightly wheel in python build.
# This is used for testing the nightly wheel in python build.
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment