Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
3fb4b5fa
Commit
3fb4b5fa
authored
Mar 23, 2026
by
zhuwenwen
Browse files
Merge tag 'v0.18.0' into v0.18.0-ori
parents
bcf25339
89138b21
Changes
488
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
954 additions
and
431 deletions
+954
-431
examples/pooling/token_embed/colqwen3_token_embed_online.py
examples/pooling/token_embed/colqwen3_token_embed_online.py
+198
-0
mkdocs.yaml
mkdocs.yaml
+7
-3
pyproject.toml
pyproject.toml
+30
-171
requirements/build.txt
requirements/build.txt
+0
-1
requirements/common.txt
requirements/common.txt
+7
-5
requirements/cpu.txt
requirements/cpu.txt
+3
-3
requirements/cuda.txt
requirements/cuda.txt
+8
-2
requirements/docs.txt
requirements/docs.txt
+2
-1
requirements/kv_connectors.txt
requirements/kv_connectors.txt
+2
-1
requirements/lint.txt
requirements/lint.txt
+1
-1
requirements/nightly_torch_test.txt
requirements/nightly_torch_test.txt
+6
-5
requirements/rocm-build.txt
requirements/rocm-build.txt
+2
-2
requirements/rocm-test.txt
requirements/rocm-test.txt
+21
-5
requirements/rocm.txt
requirements/rocm.txt
+8
-4
requirements/test.in
requirements/test.in
+22
-6
requirements/test.txt
requirements/test.txt
+153
-125
requirements/xpu.txt
requirements/xpu.txt
+1
-1
scripts/autotune_helion_kernels.py
scripts/autotune_helion_kernels.py
+435
-0
setup.py
setup.py
+37
-93
tests/basic_correctness/test_basic_correctness.py
tests/basic_correctness/test_basic_correctness.py
+11
-2
No files found.
Too many changes to show.
To preserve performance only
488 of 488+
files are displayed.
Plain diff
Email patch
examples/pooling/token_embed/colqwen3_token_embed_online.py
0 → 100644
View file @
3fb4b5fa
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
"""
Example online usage of Pooling API for ColQwen3 multi-vector retrieval.
ColQwen3 is a multi-modal late interaction model based on Qwen3-VL that
produces per-token embeddings (320-dim, L2-normalized) for both text and
image inputs. Similarity is computed via MaxSim scoring.
This example mirrors the official TomoroAI inference code
(https://huggingface.co/TomoroAI/tomoro-colqwen3-embed-4b) but uses the
vLLM serving API instead of local HuggingFace model loading.
Start the server with:
vllm serve TomoroAI/tomoro-colqwen3-embed-4b --max-model-len 4096
Then run this script:
python colqwen3_token_embed_online.py
"""
import
argparse
import
base64
from
io
import
BytesIO
import
numpy
as
np
import
requests
from
PIL
import
Image
# ── Helpers ─────────────────────────────────────────────────
def post_http_request(
    payload: dict, api_url: str, timeout: float = 120.0
) -> requests.Response:
    """POST a JSON payload to the server and return the raw response.

    Args:
        payload: JSON-serialisable request body.
        api_url: Full URL of the endpoint to call.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a stalled server
            would hang the example.

    Returns:
        The ``requests.Response``; the status code is NOT checked here, so
        callers decide how to handle error responses.

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            the timeout expires.
    """
    headers = {"User-Agent": "Test Client"}
    return requests.post(api_url, headers=headers, json=payload, timeout=timeout)
def load_image(url: str) -> Image.Image:
    """Fetch ``url`` and return it as an RGB PIL image.

    The download is tried first with no extra headers and then with a
    browser-like User-Agent; an attempt answered with HTTP 403 moves on to
    the next header set (the case the original docstring attributes to
    Wikimedia).

    Raises:
        requests.HTTPError: If an attempt fails with a status other than 403.
        RuntimeError: If every attempt was answered with 403.
    """
    header_candidates = (
        {},
        {"User-Agent": "Mozilla/5.0 (compatible; ColQwen3-demo/1.0)"},
    )
    for request_headers in header_candidates:
        response = requests.get(url, headers=request_headers, timeout=10)
        if response.status_code != 403:
            # Any other error status is fatal; only 403 triggers a retry.
            response.raise_for_status()
            raw = BytesIO(response.content)
            return Image.open(raw).convert("RGB")
    raise RuntimeError(f"Could not fetch image from {url}")
def encode_image_base64(image: Image.Image) -> str:
    """Serialise a PIL image as PNG and return it as a base64 ``data:`` URI."""
    png_buffer = BytesIO()
    image.save(png_buffer, format="PNG")
    encoded = base64.b64encode(png_buffer.getvalue()).decode()
    return f"data:image/png;base64,{encoded}"
def compute_maxsim(q_emb: np.ndarray, d_emb: np.ndarray) -> float:
    """Compute the ColBERT-style MaxSim score between a query and a document.

    For every query vector the largest dot product against any document
    vector is taken, and those maxima are summed.

    Args:
        q_emb: Query embeddings, one vector per row. Any array-like accepted
            by ``np.asarray`` works (e.g. nested lists from a JSON response).
        d_emb: Document embeddings, one vector per row, same vector width.

    Returns:
        The MaxSim score as a Python ``float``.

    Raises:
        ValueError: If ``d_emb`` is empty (there is nothing to take a maximum
            over) or if the vector widths do not match.
    """
    query = np.asarray(q_emb)
    doc = np.asarray(d_emb)
    if doc.size == 0:
        # NumPy would raise an opaque "zero-size array to reduction" error
        # here; fail with a message that names the actual problem.
        raise ValueError("compute_maxsim: document embedding is empty")
    sim = query @ doc.T
    return float(sim.max(axis=-1).sum())
# ── Encode functions ────────────────────────────────────────
def encode_queries(texts: list[str], model: str, api_url: str) -> list[np.ndarray]:
    """Encode text queries → list of multi-vector embeddings.

    All texts are sent in a single request via the ``input`` field.

    Args:
        texts: Query strings to embed.
        model: Model name to put in the request body.
        api_url: URL of the pooling endpoint.

    Returns:
        One array per entry of the response's ``data`` list, built from that
        entry's ``data`` field.

    Raises:
        requests.HTTPError: If the server answers with an error status. This
            is checked before the body is indexed, so a server-side failure
            is reported as such rather than as a ``KeyError`` on "data".
    """
    resp = post_http_request({"model": model, "input": texts}, api_url)
    resp.raise_for_status()
    return [np.array(item["data"]) for item in resp.json()["data"]]
def encode_images(
    image_urls: list[str], model: str, api_url: str
) -> list[np.ndarray]:
    """Encode image documents → list of multi-vector embeddings.

    Images are sent via the chat-style `messages` field so that the
    vLLM multimodal processor handles them correctly.

    Each image is downloaded, converted to a base64 PNG data URI and sent in
    its own request. A request that fails is reported on stdout and skipped,
    so the returned list can be shorter than ``image_urls`` and its indices
    then no longer line up with the input list.

    Args:
        image_urls: URLs of the images to embed.
        model: Model name to put in the request body.
        api_url: URL of the pooling endpoint.

    Returns:
        The embeddings of the images that were encoded successfully, in
        input order.
    """
    embeddings = []
    for url in image_urls:
        print(f"  Loading: {url.split('/')[-1]} ...")
        image_uri = encode_image_base64(load_image(url))
        payload = {
            "model": model,
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "image_url", "image_url": {"url": image_uri}},
                        {"type": "text", "text": "Describe the image."},
                    ],
                }
            ],
        }
        resp = post_http_request(payload, api_url)
        try:
            result = resp.json()
        except ValueError:
            # Error responses are not guaranteed to be JSON (e.g. an HTML
            # page from a proxy); fall back to the raw text so the failure
            # is still reported and skipped instead of raising here.
            result = resp.text
        if (
            resp.status_code != 200
            or not isinstance(result, dict)
            or "data" not in result
        ):
            print(f"  Error ({resp.status_code}): {str(result)[:200]}")
            continue
        embeddings.append(np.array(result["data"][0]["data"]))
    return embeddings
# ── Main ────────────────────────────────────────────────────
def parse_args(argv=None):
    """Parse the command-line options of the example.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse reads ``sys.argv[1:]``, which is the original behaviour.

    Returns:
        An ``argparse.Namespace`` with ``host``, ``port`` and ``model``.
    """
    parser = argparse.ArgumentParser(
        description="Online ColQwen3 multi-vector retrieval example "
        "(vLLM Pooling API)."
    )
    parser.add_argument(
        "--host",
        type=str,
        default="localhost",
        help="Host name of the running vLLM server.",
    )
    parser.add_argument(
        "--port",
        type=int,
        default=8000,
        help="Port of the running vLLM server.",
    )
    parser.add_argument(
        "--model",
        type=str,
        default="TomoroAI/tomoro-colqwen3-embed-4b",
        help="Model name sent in every request.",
    )
    return parser.parse_args(argv)
def main(args):
    """Run the four demo steps against a running vLLM server.

    1. Embed the sample text queries through ``/pooling``.
    2. Embed the sample images through ``/pooling``.
    3. Print the query x image MaxSim score matrix (computed client-side).
    4. Score one text query against text documents through ``/score``.

    Args:
        args: Namespace providing ``host``, ``port`` and ``model``.
    """
    pooling_url = f"http://{args.host}:{args.port}/pooling"
    score_url = f"http://{args.host}:{args.port}/score"
    model = args.model

    # Same sample data as the official TomoroAI example
    queries = [
        "Retrieve the city of Singapore",
        "Retrieve the city of Beijing",
        "Retrieve the city of London",
    ]
    image_urls = [
        "https://upload.wikimedia.org/wikipedia/commons/2/27/Singapore_skyline_2022.jpg",
        "https://upload.wikimedia.org/wikipedia/commons/6/61/Beijing_skyline_at_night.JPG",
        "https://upload.wikimedia.org/wikipedia/commons/4/49/London_skyline.jpg",
    ]

    # ── 1) Text query embeddings ────────────────────────────
    print("=" * 60)
    print("1. Encode text queries (multi-vector)")
    print("=" * 60)
    query_embeddings = encode_queries(queries, model, pooling_url)
    for i, (query, emb) in enumerate(zip(queries, query_embeddings)):
        # Norm of the first token vector only, shown as a quick sanity check.
        norm = float(np.linalg.norm(emb[0]))
        print(f'  Query {i}: {emb.shape}  (L2 norm: {norm:.4f})  "{query}"')

    # ── 2) Image document embeddings ────────────────────────
    print()
    print("=" * 60)
    print("2. Encode image documents (multi-vector)")
    print("=" * 60)
    doc_embeddings = encode_images(image_urls, model, pooling_url)
    if len(doc_embeddings) != len(image_urls):
        # encode_images drops images whose request failed, so positions in
        # doc_embeddings no longer correspond to positions in image_urls.
        print(
            f"  Warning: only {len(doc_embeddings)} of {len(image_urls)} "
            "images were encoded; Doc labels below may not match their files."
        )
    for i, (url, emb) in enumerate(zip(image_urls, doc_embeddings)):
        print(f"  Doc {i}: {emb.shape}  {url.split('/')[-1]}")

    # ── 3) Cross-modal MaxSim scoring ───────────────────────
    if doc_embeddings:
        print()
        print("=" * 60)
        print("3. Cross-modal MaxSim scores (text queries × image docs)")
        print("=" * 60)

        # Header: 35 blank columns line up with the 2 + 33 wide query label.
        print(f"{'':>35s}", end="")
        for j, _ in enumerate(doc_embeddings):
            print(f"  Doc {j:>2d}", end="")
        print()

        # Score matrix
        for query, q_emb in zip(queries, query_embeddings):
            print(f"  {query:<33s}", end="")
            for d_emb in doc_embeddings:
                score = compute_maxsim(q_emb, d_emb)
                print(f"  {score:6.2f}", end="")
            print()

    # ── 4) Text-only /score endpoint ────────────────────────
    print()
    print("=" * 60)
    print("4. Text-only late interaction scoring (/score endpoint)")
    print("=" * 60)

    text_query = "What is the capital of France?"
    text_docs = [
        "The capital of France is Paris.",
        "Berlin is the capital of Germany.",
        "Python is a programming language.",
    ]

    resp = post_http_request(
        {"model": model, "text_1": text_query, "text_2": text_docs},
        score_url,
    )
    print(f'  Query: "{text_query}"\n')
    if resp.status_code != 200:
        # Report a server-side failure instead of failing on the "data" key.
        print(f"  Error ({resp.status_code}): {resp.text[:200]}")
        return
    for item in resp.json()["data"]:
        idx = item["index"]
        print(f"  Doc {idx} (score={item['score']:.4f}): {text_docs[idx]}")
if __name__ == "__main__":
    # Script entry point: parse the CLI options and run the demo.
    main(parse_args())
mkdocs.yaml
View file @
3fb4b5fa
...
...
@@ -42,6 +42,7 @@ theme:
-
navigation.sections
-
navigation.indexes
-
navigation.top
-
navigation.path
-
search.highlight
-
search.share
-
toc.follow
...
...
@@ -63,8 +64,9 @@ plugins:
-
git-revision-date-localized
:
# exclude autogenerated files
exclude
:
-
a
rgparse
/*
-
a
pi
/*
-
examples/*
-
generated/*
-
minify
:
minify_html
:
true
minify_js
:
true
...
...
@@ -92,7 +94,6 @@ plugins:
-
"
!.*_pb2_grpc"
# Exclude auto-generated gRPC stubs
summary
:
modules
:
true
show_if_no_docstring
:
true
show_signature_annotations
:
true
separate_signature
:
true
show_overloads
:
true
...
...
@@ -105,6 +106,10 @@ plugins:
-
https://numpy.org/doc/stable/objects.inv
-
https://pytorch.org/docs/stable/objects.inv
-
https://psutil.readthedocs.io/en/stable/objects.inv
-
redirects
:
redirect_maps
:
features/spec_decode/README.md
:
features/speculative_decoding/README.md
features/spec_decode/speculators.md
:
features/speculative_decoding/speculators.md
markdown_extensions
:
-
attr_list
...
...
@@ -141,7 +146,6 @@ extra_css:
-
mkdocs/stylesheets/extra.css
extra_javascript
:
-
mkdocs/javascript/reo.js
-
mkdocs/javascript/run_llm_widget.js
-
mkdocs/javascript/mathjax.js
-
https://unpkg.com/mathjax@3.2.2/es5/tex-mml-chtml.js
...
...
pyproject.toml
View file @
3fb4b5fa
...
...
@@ -9,7 +9,6 @@ requires = [
"torch == 2.10.0"
,
"wheel"
,
"jinja2"
,
"grpcio-tools==1.78.0"
,
]
build-backend
=
"setuptools.build_meta"
...
...
@@ -56,10 +55,6 @@ include = ["vllm*"]
"vllm/third_party/**"
=
["ALL"]
"vllm/version.py"
=
["F401"]
"vllm/_version.py"
=
["ALL"]
# Exclude generated protobuf files
"vllm/grpc/*_pb2.py"
=
["ALL"]
"vllm/grpc/*_pb2_grpc.py"
=
["ALL"]
"vllm/grpc/*_pb2.pyi"
=
["ALL"]
[tool.ruff.lint]
select
=
[
...
...
@@ -112,12 +107,10 @@ markers = [
"cpu_test: mark test as CPU-only test"
,
"split: run this test as part of a split"
,
"distributed: run this test only in distributed GPU tests"
,
"skip_v1: do not run this test with v1"
,
"optional: optional tests that are automatically skipped, include --optional to run them"
,
]
[tool.ty.src]
root
=
"./vllm"
respect-ignore-files
=
true
[tool.ty.environment]
...
...
@@ -125,190 +118,56 @@ python = "./.venv"
[tool.typos.files]
# these files may be written in non english words
extend-exclude
=
[
"tests/models/fixtures/*"
,
"tests/prompts/*"
,
"benchmarks/sonnet.txt"
,
"tests/lora/data/*"
,
"build/*"
,
"vllm/third_party/*"
,
"vllm/entrypoints/serve/instrumentator/static/*"
,
"docs/governance/process.md"
]
ignore-hidden
=
true
ignore-files
=
true
ignore-dot
=
true
ignore-vcs
=
true
ignore-global
=
true
ignore-parent
=
true
extend-exclude
=
[
"tests/models/fixtures/*"
,
"tests/prompts/*"
,
"tests/tokenizers_/*"
,
"benchmarks/sonnet.txt"
,
"tests/lora/data/*"
,
"examples/pooling/token_embed/*"
,
"build/*"
,
"vllm/third_party/*"
,
"vllm/entrypoints/serve/instrumentator/static/*"
,
"tests/entrypoints/openai/test_transcription_validation.py"
,
"docs/governance/process.md"
,
"tests/v1/engine/test_fast_incdec_prefix_err.py"
,
".git/*"
]
ignore-hidden
=
false
[tool.typos.default]
binary
=
false
check-filename
=
false
check-file
=
true
unicode
=
true
ignore-hex
=
true
identifier-leading-digits
=
false
locale
=
"en"
extend-ignore-identifiers-re
=
[
"NVML_*"
,
".*Unc.*"
,
".*_thw"
,
".*UE8M0.*"
,
".*[UE4M3|ue4m3].*"
,
".*eles.*"
,
".*[Tt]h[rR].*"
]
extend-ignore-words-re
=
[]
extend-ignore-re
=
[]
extend-ignore-identifiers-re
=
[".*[Uu][Ee][0-9][Mm][0-9].*"]
[tool.typos.default.extend-identifiers]
bbc5b7ede
=
"bbc5b7ede"
womens_doubles
=
"womens_doubles"
v_2nd
=
"v_2nd"
# splitted_input = "splitted_input"
NOOPs
=
"NOOPs"
typ
=
"typ"
nin_shortcut
=
"nin_shortcut"
UperNetDecoder
=
"UperNetDecoder"
subtile
=
"subtile"
cudaDevAttrMaxSharedMemoryPerBlockOptin
=
"cudaDevAttrMaxSharedMemoryPerBlockOptin"
SFOuput
=
"SFOuput"
# huggingface transformers repo uses these words
depthwise_seperable_out_channel
=
"depthwise_seperable_out_channel"
DepthWiseSeperableConv1d
=
"DepthWiseSeperableConv1d"
depthwise_seperable_CNN
=
"depthwise_seperable_CNN"
pard_token
=
"pard_token"
ptd_token_id
=
"ptd_token_id"
ser_de
=
"ser_de"
shared_memory_per_block_optin
=
"shared_memory_per_block_optin"
FoPE
=
"FoPE"
k_ot
=
"k_ot"
view_seperator
=
"view_seperator"
inverse_std_variences
=
"inverse_std_variences"
[tool.typos.default.extend-words]
iy
=
"iy"
tendencias
=
"tendencias"
indx
=
"indx"
# intel cpu features
tme
=
"tme"
dout
=
"dout"
Pn
=
"Pn"
arange
=
"arange"
[tool.typos.type.py]
extend-glob
=
[]
extend-ignore-identifiers-re
=
[]
extend-ignore-words-re
=
[]
extend-ignore-re
=
[]
[tool.typos.type.py.extend-identifiers]
arange
=
"arange"
NDArray
=
"NDArray"
EOFError
=
"EOFError"
fo
=
"fo"
ba
=
"ba"
[tool.typos.type.py.extend-words]
thw
=
"thw"
subtile
=
"subtile"
HSA
=
"HSA"
setp
=
"setp"
CPY
=
"CPY"
thr
=
"thr"
Thr
=
"Thr"
PARD
=
"PARD"
pard
=
"pard"
AKS
=
"AKS"
ba
=
"ba"
fo
=
"fo"
nd
=
"nd"
[tool.typos.type.cpp]
extend-glob
=
["*.cu"]
extend-ignore-identifiers-re
=
[]
extend-ignore-words-re
=
[]
extend-ignore-re
=
[]
[tool.typos.type.cpp.extend-identifiers]
countr_one
=
"countr_one"
k_ot
=
"k_ot"
ot
=
"ot"
[tool.typos.type.cpp.extend-words]
[tool.typos.type.rust]
extend-glob
=
[]
extend-ignore-identifiers-re
=
[]
extend-ignore-words-re
=
[]
extend-ignore-re
=
[]
[tool.typos.type.rust.extend-identifiers]
flate2
=
"flate2"
[tool.typos.type.rust.extend-words]
eles
=
"eles"
datas
=
"datas"
ser
=
"ser"
[tool.typos.type.lock]
extend-glob
=
[]
check-file
=
false
extend-ignore-identifiers-re
=
[]
extend-ignore-words-re
=
[]
extend-ignore-re
=
[]
[tool.typos.type.lock.extend-identifiers]
[tool.typos.type.lock.extend-words]
[tool.typos.type.jl]
extend-glob
=
[]
extend-ignore-identifiers-re
=
[]
extend-ignore-words-re
=
[]
extend-ignore-re
=
[]
[tool.typos.type.jl.extend-identifiers]
[tool.typos.type.jl.extend-words]
modul
=
"modul"
egals
=
"egals"
usig
=
"usig"
egal
=
"egal"
[tool.typos.type.go]
extend-glob
=
[]
extend-ignore-identifiers-re
=
[]
extend-ignore-words-re
=
[]
extend-ignore-re
=
[]
[tool.typos.type.go.extend-identifiers]
flate
=
"flate"
[tool.typos.type.go.extend-words]
[tool.typos.type.css]
extend-glob
=
[]
extend-ignore-identifiers-re
=
[]
extend-ignore-words-re
=
[]
extend-ignore-re
=
[]
[tool.typos.type.css.extend-identifiers]
nd
=
"nd"
[tool.typos.type.css.extend-words]
[tool.typos.type.man]
extend-glob
=
[]
extend-ignore-identifiers-re
=
[]
extend-ignore-words-re
=
[]
extend-ignore-re
=
[]
[tool.typos.type.man.extend-identifiers]
Nd
=
"Nd"
[tool.typos.type.man.extend-words]
[tool.typos.type.cert]
extend-glob
=
[]
check-file
=
false
extend-ignore-identifiers-re
=
[]
extend-ignore-words-re
=
[]
extend-ignore-re
=
[]
[tool.typos.type.cert.extend-identifiers]
[tool.typos.type.cert.extend-words]
[tool.typos.type.sh]
extend-glob
=
[]
extend-ignore-identifiers-re
=
[]
extend-ignore-words-re
=
[]
extend-ignore-re
=
[]
[tool.typos.type.sh.extend-identifiers]
ot
=
"ot"
[tool.typos.type.sh.extend-words]
[tool.typos.type.vimscript]
extend-glob
=
[]
extend-ignore-identifiers-re
=
[]
extend-ignore-words-re
=
[]
extend-ignore-re
=
[]
[tool.typos.type.vimscript.extend-identifiers]
windo
=
"windo"
[tool.typos.type.vimscript.extend-words]
ure
=
"ure"
[tool.uv]
no-build-isolation-package
=
["torch"]
\ No newline at end of file
no-build-isolation-package
=
["torch"]
requirements/build.txt
View file @
3fb4b5fa
...
...
@@ -10,4 +10,3 @@ jinja2>=3.1.6
regex
build
protobuf >= 5.29.6, !=6.30.*, !=6.31.*, !=6.32.*, !=6.33.0.*, !=6.33.1.*, !=6.33.2.*, !=6.33.3.*, !=6.33.4.*
grpcio-tools==1.78.0 # Required for grpc entrypoints
requirements/common.txt
View file @
3fb4b5fa
...
...
@@ -12,7 +12,7 @@ tokenizers >= 0.21.1 # Required for fast incremental detokenization.
protobuf >= 5.29.6, !=6.30.*, !=6.31.*, !=6.32.*, !=6.33.0.*, !=6.33.1.*, !=6.33.2.*, !=6.33.3.*, !=6.33.4.* # Required by LlamaTokenizer, gRPC. CVE-2026-0994
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
aiohttp >= 3.13.3
openai >= 1.99.1 # For Responses API with reasoning content
openai >= 1.99.1
, < 2.25.0
# For Responses API with reasoning content
pydantic >= 2.12.0
prometheus_client >= 0.18.0
pillow # Required for image processing
...
...
@@ -24,14 +24,14 @@ outlines_core == 0.2.11
# required for outlines backend disk cache
diskcache == 5.6.3
lark == 1.2.2
xgrammar
=
= 0.1.
29
; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" or platform_machine == "s390x" or platform_machine == "ppc64le"
xgrammar
>
= 0.1.
32, < 1.0.0
; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" or platform_machine == "s390x" or platform_machine == "ppc64le"
typing_extensions >= 4.10
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
partial-json-parser # used for parsing partial JSON outputs
pyzmq >= 25.0.0
msgspec
gguf >= 0.17.0
mistral_common[image] >= 1.
9
.0
mistral_common[image] >= 1.
10
.0
opencv-python-headless >= 4.13.0 # required for video IO
pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
...
...
@@ -51,5 +51,7 @@ openai-harmony >= 0.0.3 # Required for gpt-oss
anthropic >= 0.71.0
model-hosting-container-standards >= 0.1.13, < 1.0.0
mcp
grpcio
grpcio-reflection
\ No newline at end of file
opentelemetry-sdk >= 1.27.0
opentelemetry-api >= 1.27.0
opentelemetry-exporter-otlp >= 1.27.0
opentelemetry-semantic-conventions-ai >= 0.4.1
requirements/cpu.txt
View file @
3fb4b5fa
...
...
@@ -7,13 +7,13 @@ numba == 0.61.2; platform_machine != "s390x" # Required for N-gram speculative d
# Dependencies for CPUs
torch==2.10.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x"
torch==2.10.0; platform_machine == "aarch64" or platform_system == "Darwin" or platform_machine == "ppc64le"
torch==2.10.0; platform_machine == "aarch64" or platform_system == "Darwin" or platform_machine == "ppc64le"
or platform_machine == "riscv64"
# required for the image processor of minicpm-o-2_6, this must be updated alongside torch
torchaudio; platform_machine != "s390x"
torchaudio; platform_machine != "s390x"
and platform_machine != "riscv64"
# required for the image processor of phi3v, this must be updated alongside torch
torchvision; platform_machine != "s390x"
torchvision; platform_machine != "s390x"
and platform_machine != "riscv64"
# Intel Extension for PyTorch, only for x86_64 CPUs
intel-openmp==2024.2.1; platform_machine == "x86_64"
...
...
requirements/cuda.txt
View file @
3fb4b5fa
...
...
@@ -4,10 +4,16 @@
numba == 0.61.2 # Required for N-gram speculative decoding
# Dependencies for NVIDIA GPUs
ray[cgraph]>=2.48.0
torch==2.10.0
torchaudio==2.10.0
# These must be updated alongside torch
torchvision==0.25.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
# FlashInfer should be updated together with the Dockerfile
flashinfer-python==0.6.3
flashinfer-python==0.6.6
# Cap nvidia-cudnn-frontend (transitive dep of flashinfer) due to
# breaking changes in 1.19.0
nvidia-cudnn-frontend>=1.13.0,<1.19.0
# QuACK and Cutlass DSL for FA4 (cute-DSL implementation)
nvidia-cutlass-dsl>=4.4.0.dev1
quack-kernels>=0.2.7
requirements/docs.txt
View file @
3fb4b5fa
mkdocs
mkdocs
<2.0.0
mkdocs-api-autonav
mkdocs-material
mkdocstrings-python
...
...
@@ -7,6 +7,7 @@ mkdocs-awesome-nav
mkdocs-glightbox
mkdocs-git-revision-date-localized-plugin
mkdocs-minify-plugin
mkdocs-redirects
regex
ruff
pydantic
...
...
requirements/kv_connectors.txt
View file @
3fb4b5fa
lmcache >= 0.3.9
nixl >= 0.7.1 # Required for disaggregated prefill
nixl >= 0.7.1, < 0.10.0 # Required for disaggregated prefill
mooncake-transfer-engine >= 0.3.8
requirements/lint.txt
View file @
3fb4b5fa
# formatting
pre-commit
=
=4.
0
.1
pre-commit
>
=4.
5
.1
requirements/nightly_torch_test.txt
View file @
3fb4b5fa
...
...
@@ -23,17 +23,17 @@ jiwer # required for audio tests
timm # required for internvl test
transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test
mistral_common[image,audio] >= 1.9.
0
# required for voxtral test
mistral_common[image,audio] >= 1.9.
1
# required for voxtral test
num2words # required for smolvlm test
opencv-python-headless >= 4.13.0 # required for video test
datamodel_code_generator # required for minicpm3 test
lm-eval[api]>=0.4.
9.2
# required for model evaluation test
mteb
>=1.38.11
, <
2
# required for mteb test
lm-eval[api]>=0.4.
11
# required for model evaluation test
mteb
[bm25s]>=2
, <
3
# required for mteb test
transformers==4.57.5
tokenizers==0.22.0
schemathesis>=3.39.15 # Required for openai schema test.
# quantization
bitsandbytes>=0.4
6.1
bitsandbytes>=0.4
9.2
buildkite-test-collector==0.1.9
...
...
@@ -42,6 +42,7 @@ tritonclient>=2.51.0
numba == 0.61.2 # Required for N-gram speculative decoding
numpy
runai-model-streamer[s3,gcs]==0.15.
3
runai-model-streamer[s3,gcs
,azure
]==0.15.
7
fastsafetensors>=0.2.2
instanttensor>=0.1.5
pydantic>=2.12 # 2.11 leads to error on python 3.13
requirements/rocm-build.txt
View file @
3fb4b5fa
# Common dependencies
-r common.txt
--extra-index-url https://download.pytorch.org/whl/
test/
rocm7.
0
--extra-index-url https://download.pytorch.org/whl/rocm7.
1
torch==2.10.0
torchvision==0.25.0
torchaudio==2.10.0
...
...
@@ -12,5 +12,5 @@ setuptools>=77.0.3,<80.0.0
setuptools-scm>=8
wheel
jinja2>=3.1.6
amdsmi==
6.4.3
amdsmi==
7.0.2
timm>=1.0.17
requirements/rocm-test.txt
View file @
3fb4b5fa
...
...
@@ -45,6 +45,8 @@ pystemmer==3.0.0
# via mteb
# Multi-modal processing
av==16.1.0
# required for audio_in_video tests
blobfile==3.0.0
# Multi-Modal Models Test
decord==0.6.0
...
...
@@ -58,7 +60,7 @@ schemathesis==3.39.15
# OpenAI schema test
# Evaluation and benchmarking
lm-eval[api]==0.4.
9.2
lm-eval[api]==0.4.
11
jiwer==4.0.0
# Required for multiprocessed tests that use spawn method, Datasets and Evaluate Test
...
...
@@ -67,12 +69,10 @@ multiprocess==0.70.16
# Required for v1/metrics/test_engine_logger_apis.py
ray[cgraph,default]>=2.48.0
# Plugins test
terratorch @ git+https://github.com/IBM/terratorch.git@07184fcf91a1324f831ff521dd238d97fe350e3e
torchgeo==0.7.0
# via terratorch
# MTEB Benchmark Test
mteb
==2.1.2
mteb
[bm25s]>=2, <3
# Utilities
num2words==0.5.14
...
...
@@ -93,6 +93,22 @@ timm==1.0.17
# Required for plugins test
albumentations==1.4.6
# Pin transformers version
transformers==4.57.
3
transformers==4.57.
5
# Pin HF Hub version
huggingface-hub==0.36.2
# Pin Mistral Common
mistral-common[image,audio]==1.10.0
# Required for Prithvi tests
terratorch==1.2.2
# Required for Prithvi tests
segmentation-models-pytorch==0.5.0
# Required for Prithvi tests
imagehash==4.3.2
# Required for bitsandbytes quantization test
bitsandbytes==0.49.2
# Examples (tensorizer) tests
tensorizer==2.10.1
# Multi-modal models test (`allendou/FireRedASR2-LLM-vllm`)
kaldi-native-fbank==1.22.3
# Pinning numpy version
numpy==2.2.6
requirements/rocm.txt
View file @
3fb4b5fa
# Common dependencies
-r common.txt
# The version of gRPC libraries should be consistent with each other
grpcio==1.78.0
grpcio-reflection==1.78.0
numba == 0.61.2 # Required for N-gram speculative decoding
# Dependencies for AMD GPUs
datasets
ray[cgraph]>=2.48.0
peft
pytest-asyncio
tensorizer==2.10.1
packaging>=24.2
setuptools>=77.0.3,<80.0.0
setuptools-scm>=8
runai-model-streamer[s3,gcs]==0.15.3
runai-model-streamer[s3,gcs,azure]==0.15.7
# conch-triton-kernels==1.2.1
timm>=1.0.17
grpcio-tools==1.78.0 # Should match `build.txt`
\ No newline at end of file
# amd-quark: required for Quark quantization on ROCm
# To be consistent with test_quark.py
amd-quark>=0.8.99
\ No newline at end of file
requirements/test.in
View file @
3fb4b5fa
...
...
@@ -10,6 +10,7 @@ pytest-cov
# testing utils
albumentations # required for Nemotron Parse in test_common.py
av # required for audio_in_video tests
backoff # required for phi4mm test
blobfile # required for kimi-vl test
einops # required for MPT, qwen-vl
...
...
@@ -30,33 +31,48 @@ torchaudio==2.10.0
torchvision==0.25.0
transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test
mistral_common[image,audio] >= 1.9.
0
# required for voxtral test
mistral_common[image,audio] >= 1.9.
1
# required for voxtral test
num2words # required for smolvlm test
open_clip_torch==2.32.0 # Required for nemotron_vl test, Nemotron Parse in test_common.py
opencv-python-headless >= 4.13.0 # required for video test
datamodel_code_generator # required for minicpm3 test
lm-eval[api]>=0.4.
9.2
# required for model evaluation test
lm-eval[api]>=0.4.
11
# required for model evaluation test
mteb[bm25s]>=2, <3 # required for mteb test
transformers==4.57.5
tokenizers==0.22.0
schemathesis>=3.39.15 # Required for openai schema test.
# quantization
bitsandbytes==0.4
6.1
bitsandbytes==0.4
9.2
buildkite-test-collector==0.1.9
genai_perf>=0.0.8
tritonclient>=2.51.0
grpcio-tools==1.78.0 # Should match `build.txt`
# The version of gRPC libraries should be consistent with each other
grpcio==1.78.0
grpcio-reflection==1.78.0
arctic-inference == 0.1.1 # Required for suffix decoding test
numba == 0.61.2 # Required for N-gram speculative decoding
numpy
runai-model-streamer[s3,gcs]==0.15.
3
runai-model-streamer[s3,gcs
,azure
]==0.15.
7
fastsafetensors>=0.2.2 # 0.2.2 contains important fixes for multi-GPU mem usage
instanttensor>=0.1.5
pydantic>=2.12 # 2.11 leads to error on python 3.13
decord==0.6.0
terratorch @ git+https://github.com/IBM/terratorch.git@1.1.rc3 # required for PrithviMAE test
terratorch >= 1.2.2 # Required for Prithvi tests
imagehash # Required for Prithvi tests
segmentation-models-pytorch > 0.4.0 # Required for Prithvi tests
gpt-oss >= 0.0.7; python_version > '3.11'
perceptron # required for isaac test
kaldi-native-fbank >= 1.18.7 # required for fireredasr2 test
# Newer versions of datasets require torchcodec, which makes the tests fail in CI because of a missing library.
# Older versions conflict with terratorch requirements.
datasets>=3.3.0,<=3.6.0
openpyxl # required for perf comparison excel report
plotly # required for perf comparison html report
requirements/test.txt
View file @
3fb4b5fa
# This file was autogenerated by uv via the following command:
# uv pip compile requirements/test.in -o requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu129 --python-platform x86_64-manylinux_2_28 --python-version 3.12
absl-py==2.1.0
# via rouge-score
accelerate==1.0.1
# via
# lm-eval
# peft
# rouge-score
# tensorboard
accelerate==1.0.1
# via peft
aenum==3.1.16
# via lightly
affine==2.4.0
...
...
@@ -31,9 +31,7 @@ albumentations==1.4.6
# -r requirements/test.in
# terratorch
alembic==1.16.4
# via
# mlflow
# optuna
# via optuna
annotated-doc==0.0.4
# via fastapi
annotated-types==0.7.0
...
...
@@ -64,18 +62,26 @@ attrs==24.2.0
# referencing
audioread==3.0.1
# via librosa
av==16.1.0
# via -r requirements/test.in
azure-core==1.38.2
# via
# azure-identity
# azure-storage-blob
azure-identity==1.25.2
# via runai-model-streamer-azure
azure-storage-blob==12.28.0
# via runai-model-streamer-azure
backoff==2.2.1
# via
# -r requirements/test.in
# schemathesis
bitsandbytes==0.4
6.1
bitsandbytes==0.4
9.2
# via
# -r requirements/test.in
# lightning
black==24.10.0
# via datamodel-code-generator
blinker==1.9.0
# via flask
blobfile==3.0.0
# via -r requirements/test.in
bm25s==0.2.13
...
...
@@ -93,9 +99,7 @@ bounded-pool-executor==0.0.3
buildkite-test-collector==0.1.9
# via -r requirements/test.in
cachetools==5.5.2
# via
# google-auth
# mlflow-skinny
# via google-auth
certifi==2024.8.30
# via
# fiona
...
...
@@ -106,8 +110,11 @@ certifi==2024.8.30
# pyproj
# rasterio
# requests
cffi==1.17.1
# via soundfile
# sentry-sdk
cffi==2.0.0
# via
# cryptography
# soundfile
chardet==5.2.0
# via mbstrdecoder
charset-normalizer==3.4.0
...
...
@@ -120,15 +127,14 @@ click==8.1.7
# click-plugins
# cligj
# fiona
# flask
# jiwer
# mlflow-skinny
# nltk
# rasterio
# ray
# schemathesis
# typer
# uvicorn
# wandb
click-plugins==1.1.1.2
# via
# fiona
...
...
@@ -137,14 +143,11 @@ cligj==0.7.2
# via
# fiona
# rasterio
cloudpickle==3.1.1
# via mlflow-skinny
colorama==0.4.6
# via
# perceptron
# sacrebleu
# schemathesis
# tqdm-multiprocess
colorful==0.5.6
# via ray
colorlog==6.10.1
...
...
@@ -155,6 +158,12 @@ coverage==7.10.6
# via pytest-cov
cramjam==2.9.0
# via fastparquet
cryptography==46.0.5
# via
# azure-identity
# azure-storage-blob
# msal
# pyjwt
cuda-bindings==12.9.4
# via torch
cuda-pathfinder==1.3.3
...
...
@@ -163,16 +172,15 @@ cupy-cuda12x==13.6.0
# via ray
cycler==0.12.1
# via matplotlib
databricks-sdk==0.59.0
# via mlflow-skinny
datamodel-code-generator==0.26.3
# via -r requirements/test.in
dataproperty==1.0.1
# via
# pytablewriter
# tabledata
datasets==3.
0.2
datasets==3.
3.0
# via
# -r requirements/test.in
# evaluate
# lm-eval
# mteb
...
...
@@ -180,6 +188,8 @@ decorator==5.1.1
# via librosa
decord==0.6.0
# via -r requirements/test.in
diffusers==0.36.0
# via terratorch
dill==0.3.8
# via
# datasets
...
...
@@ -191,15 +201,11 @@ distlib==0.3.9
dnspython==2.7.0
# via email-validator
docker==7.1.0
# via
# gpt-oss
# mlflow
# via gpt-oss
docopt==0.6.2
# via num2words
docstring-parser==0.17.0
# via jsonargparse
efficientnet-pytorch==0.7.1
# via segmentation-models-pytorch
einops==0.8.1
# via
# -r requirements/test.in
...
...
@@ -214,12 +220,12 @@ email-validator==2.2.0
# via pydantic
encodec==0.1.1
# via vocos
et-xmlfile==2.0.0
# via openpyxl
evaluate==0.4.3
# via lm-eval
fastapi==0.128.0
# via
# gpt-oss
# mlflow-skinny
# via gpt-oss
fastparquet==2024.11.0
# via genai-perf
fastrlock==0.8.2
...
...
@@ -230,6 +236,7 @@ filelock==3.16.1
# via
# blobfile
# datasets
# diffusers
# huggingface-hub
# ray
# torch
...
...
@@ -237,8 +244,6 @@ filelock==3.16.1
# virtualenv
fiona==1.10.1
# via torchgeo
flask==3.1.1
# via mlflow
fonttools==4.55.0
# via matplotlib
fqdn==1.5.1
...
...
@@ -249,7 +254,7 @@ frozenlist==1.5.0
# via
# aiohttp
# aiosignal
fsspec==2024.
9
.0
fsspec==2024.
12
.0
# via
# datasets
# evaluate
...
...
@@ -257,6 +262,7 @@ fsspec==2024.9.0
# huggingface-hub
# lightning
# pytorch-lightning
# tacoreader
# torch
ftfy==6.3.1
# via open-clip-torch
...
...
@@ -269,7 +275,7 @@ geopandas==1.0.1
gitdb==4.0.12
# via gitpython
gitpython==3.1.44
# via
mlflow-skinny
# via
wandb
google-api-core==2.24.2
# via
# google-cloud-core
...
...
@@ -277,7 +283,6 @@ google-api-core==2.24.2
# opencensus
google-auth==2.40.2
# via
# databricks-sdk
# google-api-core
# google-cloud-core
# google-cloud-storage
...
...
@@ -296,25 +301,18 @@ googleapis-common-protos==1.70.0
# via google-api-core
gpt-oss==0.0.8
# via -r requirements/test.in
graphene==3.4.3
# via mlflow
graphql-core==3.2.6
# via
# graphene
# graphql-relay
# hypothesis-graphql
graphql-relay==3.2.0
# via graphene
# via hypothesis-graphql
greenlet==3.2.3
# via sqlalchemy
grpcio==1.78.0
# via
# grpcio-tools
# -r requirements/test.in
# grpcio-reflection
# ray
grpcio-tools==1.78.0
# tensorboard
grpcio-reflection==1.78.0
# via -r requirements/test.in
gunicorn==23.0.0
# via mlflow
h11==0.14.0
# via
# httpcore
...
...
@@ -338,12 +336,14 @@ httpcore==1.0.6
httpx==0.27.2
# via
# -r requirements/test.in
# diffusers
# perceptron
# schemathesis
huggingface-hub==0.36.2
# via
# accelerate
# datasets
# diffusers
# evaluate
# open-clip-torch
# peft
...
...
@@ -379,11 +379,13 @@ idna==3.10
# jsonschema
# requests
# yarl
imagehash==4.3.2
# via -r requirements/test.in
imageio==2.37.0
# via scikit-image
importlib-metadata==8.7.0
# via
#
mlflow-skinny
#
diffusers
# opentelemetry-api
importlib-resources==6.5.2
# via typeshed-client
...
...
@@ -391,18 +393,19 @@ inflect==5.6.2
# via datamodel-code-generator
iniconfig==2.0.0
# via pytest
instanttensor==0.1.5
# via -r requirements/test.in
isodate==0.7.2
# via azure-storage-blob
isoduration==20.11.0
# via jsonschema
isort==5.13.2
# via datamodel-code-generator
itsdangerous==2.2.0
# via flask
jinja2==3.1.6
# via
# datamodel-code-generator
# flask
# genai-perf
#
mlflow
#
lm-eval
# torch
jiwer==3.0.5
# via -r requirements/test.in
...
...
@@ -415,12 +418,14 @@ joblib==1.4.2
# librosa
# nltk
# scikit-learn
jsonargparse==4.
35
.0
jsonargparse==4.
46
.0
# via
# lightning
# terratorch
jsonlines==4.0.0
# via lm-eval
jsonnet==0.21.0
# via jsonargparse
jsonpointer==3.0.0
# via jsonschema
jsonschema==4.23.0
...
...
@@ -433,6 +438,8 @@ jsonschema-specifications==2024.10.1
# via jsonschema
junit-xml==1.9
# via schemathesis
kaldi-native-fbank==1.22.3
# via -r requirements/test.in
kaleido==0.2.1
# via genai-perf
kiwisolver==1.4.7
...
...
@@ -449,13 +456,13 @@ libnacl==2.1.0
# via tensorizer
librosa==0.10.2.post1
# via -r requirements/test.in
lightly==1.5.2
0
lightly==1.5.2
2
# via
# terratorch
# torchgeo
lightly-utils==0.0.2
# via lightly
lightning==2.
5
.1
.post0
lightning==2.
6
.1
# via
# terratorch
# torchgeo
...
...
@@ -466,7 +473,7 @@ lightning-utilities==0.14.3
# torchmetrics
llvmlite==0.44.0
# via numba
lm-eval==0.4.
9.2
lm-eval==0.4.
11
# via -r requirements/test.in
lxml==5.3.0
# via
...
...
@@ -476,12 +483,11 @@ lxml==5.3.0
mako==1.3.10
# via alembic
markdown==3.8.2
# via
mlflow
# via
tensorboard
markdown-it-py==3.0.0
# via rich
markupsafe==3.0.1
# via
# flask
# jinja2
# mako
# werkzeug
...
...
@@ -489,7 +495,6 @@ matplotlib==3.9.2
# via
# -r requirements/test.in
# lightning
# mlflow
# pycocotools
# torchgeo
mbstrdecoder==1.1.3
...
...
@@ -499,21 +504,23 @@ mbstrdecoder==1.1.3
# typepy
mdurl==0.1.2
# via markdown-it-py
mistral-common==1.
9
.0
mistral-common==1.
10
.0
# via -r requirements/test.in
mlflow==2.22.0
# via terratorch
mlflow-skinny==2.22.0
# via mlflow
more-itertools==10.5.0
# via lm-eval
mpmath==1.3.0
# via sympy
msal==1.34.0
# via
# azure-identity
# msal-extensions
msal-extensions==1.3.1
# via azure-identity
msgpack==1.1.0
# via
# librosa
# ray
mteb==2.
1.2
mteb==2.
8.3
# via -r requirements/test.in
multidict==6.1.0
# via
...
...
@@ -523,8 +530,6 @@ multiprocess==0.70.16
# via
# datasets
# evaluate
munch==4.0.0
# via pretrainedmodels
mypy-extensions==1.0.0
# via black
networkx==3.2.1
...
...
@@ -539,8 +544,6 @@ numba==0.61.2
# via
# -r requirements/test.in
# librosa
numexpr==2.10.1
# via lm-eval
numpy==2.2.6
# via
# -r requirements/test.in
...
...
@@ -553,6 +556,7 @@ numpy==2.2.6
# cupy-cuda12x
# datasets
# decord
# diffusers
# einx
# encodec
# evaluate
...
...
@@ -560,16 +564,16 @@ numpy==2.2.6
# genai-perf
# geopandas
# h5py
# imagehash
# imageio
# librosa
# lightly
# lightly-utils
# lm-eval
# matplotlib
# mistral-common
# mlflow
# mteb
# numba
# numexpr
# opencv-python-headless
# optuna
# pandas
...
...
@@ -578,6 +582,7 @@ numpy==2.2.6
# perceptron
# pycocotools
# pyogrio
# pywavelets
# rasterio
# rioxarray
# rouge-score
...
...
@@ -590,8 +595,10 @@ numpy==2.2.6
# shapely
# soxr
# statsmodels
# tensorboard
# tensorboardx
# tensorizer
# terratorch
# tifffile
# torchgeo
# torchmetrics
...
...
@@ -657,9 +664,10 @@ opencv-python-headless==4.13.0.90
# albucore
# albumentations
# mistral-common
openpyxl==3.1.5
# via -r requirements/test.in
opentelemetry-api==1.35.0
# via
# mlflow-skinny
# opentelemetry-exporter-prometheus
# opentelemetry-sdk
# opentelemetry-semantic-conventions
...
...
@@ -669,7 +677,6 @@ opentelemetry-proto==1.36.0
# via ray
opentelemetry-sdk==1.35.0
# via
# mlflow-skinny
# opentelemetry-exporter-prometheus
# ray
opentelemetry-semantic-conventions==0.56b0
...
...
@@ -681,13 +688,13 @@ orjson==3.11.5
packaging==24.2
# via
# accelerate
# bitsandbytes
# black
# datamodel-code-generator
# datasets
# evaluate
# fastparquet
# geopandas
# gunicorn
# huggingface-hub
# hydra-core
# kornia
...
...
@@ -695,7 +702,6 @@ packaging==24.2
# lightning
# lightning-utilities
# matplotlib
# mlflow-skinny
# optuna
# peft
# plotly
...
...
@@ -708,10 +714,12 @@ packaging==24.2
# rioxarray
# scikit-image
# statsmodels
# tensorboard
# tensorboardx
# torchmetrics
# transformers
# typepy
# wandb
# xarray
pandas==2.2.3
# via
...
...
@@ -720,8 +728,8 @@ pandas==2.2.3
# fastparquet
# genai-perf
# geopandas
# mlflow
# statsmodels
# tacoreader
# torchgeo
# xarray
pathspec==0.12.1
...
...
@@ -731,16 +739,16 @@ pathvalidate==3.2.1
patsy==1.0.1
# via statsmodels
peft==0.16.0
# via
# -r requirements/test.in
# lm-eval
# via -r requirements/test.in
perceptron==0.1.4
# via -r requirements/test.in
perf-analyzer==0.1.0
# via genai-perf
pillow==10.4.0
# via
# diffusers
# genai-perf
# imagehash
# imageio
# lightly-utils
# matplotlib
...
...
@@ -748,6 +756,7 @@ pillow==10.4.0
# perceptron
# scikit-image
# segmentation-models-pytorch
# tensorboard
# torchgeo
# torchvision
platformdirs==4.3.6
...
...
@@ -755,8 +764,11 @@ platformdirs==4.3.6
# black
# pooch
# virtualenv
# wandb
plotly==5.24.1
# via genai-perf
# via
# -r requirements/test.in
# genai-perf
pluggy==1.5.0
# via
# pytest
...
...
@@ -769,8 +781,6 @@ portalocker==2.10.1
# via sacrebleu
pqdm==0.2.0
# via -r requirements/test.in
pretrainedmodels==0.7.4
# via segmentation-models-pytorch
prometheus-client==0.22.0
# via
# opentelemetry-exporter-prometheus
...
...
@@ -785,13 +795,14 @@ protobuf==6.33.2
# via
# google-api-core
# googleapis-common-protos
# grpcio-tools
# mlflow-skinny
# grpcio-reflection
# opentelemetry-proto
# proto-plus
# ray
# tensorboard
# tensorboardx
# tensorizer
# wandb
psutil==6.1.0
# via
# accelerate
...
...
@@ -801,19 +812,18 @@ py==1.11.0
# via pytest-forked
py-spy==0.4.0
# via ray
pyarrow==
18
.0.0
pyarrow==
23
.0.0
# via
# datasets
# genai-perf
# mlflow
# tacoreader
# terratorch
pyasn1==0.6.1
# via
# pyasn1-modules
# rsa
pyasn1-modules==0.4.2
# via google-auth
pybind11==2.13.6
# via lm-eval
pycocotools==2.0.8
# via terratorch
pycountry==24.6.1
...
...
@@ -831,17 +841,19 @@ pydantic==2.12.0
# gpt-oss
# lightly
# mistral-common
# mlflow-skinny
# mteb
# openai-harmony
# pydantic-extra-types
# ray
# wandb
pydantic-core==2.41.1
# via pydantic
pydantic-extra-types==2.10.5
# via mistral-common
pygments==2.18.0
# via rich
pyjwt==2.11.0
# via msal
pyogrio==0.11.0
# via geopandas
pyparsing==3.2.0
...
...
@@ -873,7 +885,6 @@ pytest==8.3.5
# pytest-subtests
# pytest-timeout
# schemathesis
# terratorch
pytest-asyncio==0.24.0
# via -r requirements/test.in
pytest-cov==6.3.0
...
...
@@ -896,7 +907,6 @@ python-dateutil==2.9.0.post0
# via
# arrow
# botocore
# graphene
# lightly
# matplotlib
# pandas
...
...
@@ -913,6 +923,8 @@ pytz==2024.2
# via
# pandas
# typepy
pywavelets==1.9.0
# via imagehash
pyyaml==6.0.2
# via
# accelerate
...
...
@@ -923,7 +935,6 @@ pyyaml==6.0.2
# huggingface-hub
# jsonargparse
# lightning
# mlflow-skinny
# omegaconf
# optuna
# peft
...
...
@@ -934,6 +945,7 @@ pyyaml==6.0.2
# timm
# transformers
# vocos
# wandb
rapidfuzz==3.12.1
# via jiwer
rasterio==1.4.3
...
...
@@ -951,6 +963,7 @@ referencing==0.35.1
# jsonschema-specifications
regex==2024.9.11
# via
# diffusers
# nltk
# open-clip-torch
# sacrebleu
...
...
@@ -958,9 +971,10 @@ regex==2024.9.11
# transformers
requests==2.32.3
# via
# azure-core
# buildkite-test-collector
# databricks-sdk
# datasets
# diffusers
# docker
# evaluate
# google-api-core
...
...
@@ -970,15 +984,17 @@ requests==2.32.3
# lightly
# lm-eval
# mistral-common
# m
lflow-skinny
# m
sal
# mteb
# pooch
# ray
# responses
# schemathesis
# starlette-testclient
# tacoreader
# tiktoken
# transformers
# wandb
responses==0.25.3
# via genai-perf
rfc3339-validator==0.1.4
...
...
@@ -991,6 +1007,7 @@ rich==13.9.4
# lightning
# mteb
# perceptron
# terratorch
# typer
rioxarray==0.19.0
# via terratorch
...
...
@@ -1004,11 +1021,13 @@ rsa==4.9.1
# via google-auth
rtree==1.4.0
# via torchgeo
runai-model-streamer==0.15.
3
runai-model-streamer==0.15.
7
# via -r requirements/test.in
runai-model-streamer-gcs==0.15.3
runai-model-streamer-azure==0.15.7
# via runai-model-streamer
runai-model-streamer-gcs==0.15.7
# via runai-model-streamer
runai-model-streamer-s3==0.15.
3
runai-model-streamer-s3==0.15.
7
# via runai-model-streamer
s3transfer==0.10.3
# via boto3
...
...
@@ -1017,47 +1036,54 @@ sacrebleu==2.4.3
safetensors==0.4.5
# via
# accelerate
# diffusers
# open-clip-torch
# peft
# segmentation-models-pytorch
# timm
# transformers
schemathesis==3.39.15
# via -r requirements/test.in
scikit-image==0.25.2
# via albumentations
# via
# albumentations
# terratorch
scikit-learn==1.5.2
# via
# albumentations
# librosa
# lm-eval
# mlflow
# mteb
# sentence-transformers
# terratorch
scipy==1.13.1
# via
# albumentations
# bm25s
# imagehash
# librosa
# mlflow
# mteb
# scikit-image
# scikit-learn
# sentence-transformers
# statsmodels
# vocos
segmentation-models-pytorch==0.
4
.0
segmentation-models-pytorch==0.
5
.0
# via
# -r requirements/test.in
# terratorch
# torchgeo
sentence-transformers==5.2.0
# via
# -r requirements/test.in
# mteb
sentry-sdk==2.52.0
# via wandb
setuptools==77.0.3
# via
# grpcio-tools
# lightning-utilities
# pytablewriter
# tensorboard
# torch
shapely==2.1.1
# via
...
...
@@ -1075,7 +1101,6 @@ six==1.16.0
# python-dateutil
# rfc3339-validator
# rouge-score
# segmentation-models-pytorch
smart-open==7.1.0
# via ray
smmap==5.0.2
...
...
@@ -1099,12 +1124,9 @@ soxr==0.5.0.post1
sqlalchemy==2.0.41
# via
# alembic
# mlflow
# optuna
sqlitedict==2.1.0
# via lm-eval
sqlparse==0.5.3
# via mlflow-skinny
starlette==0.50.0
# via
# fastapi
...
...
@@ -1124,6 +1146,8 @@ tabledata==1.3.3
# via pytablewriter
tabulate==0.9.0
# via sacrebleu
tacoreader==0.5.6
# via terratorch
tblib==3.1.0
# via -r requirements/test.in
tcolorpy==0.1.6
...
...
@@ -1133,13 +1157,19 @@ tenacity==9.1.2
# gpt-oss
# lm-eval
# plotly
tensorboard==2.20.0
# via terratorch
tensorboard-data-server==0.7.2
# via tensorboard
tensorboardx==2.6.4
# via lightning
tensorizer==2.10.1
# via -r requirements/test.in
termcolor==3.1.0
# via gpt-oss
terratorch @ git+https://github.com/IBM/terratorch.git@07184fcf91a1324f831ff521dd238d97fe350e3e
# via
# gpt-oss
# terratorch
terratorch==1.2.2
# via -r requirements/test.in
threadpoolctl==3.5.0
# via scikit-learn
...
...
@@ -1172,16 +1202,14 @@ torch==2.10.0+cu129
# -r requirements/test.in
# accelerate
# bitsandbytes
# efficientnet-pytorch
# encodec
# instanttensor
# kornia
# lightly
# lightning
# lm-eval
# mteb
# open-clip-torch
# peft
# pretrainedmodels
# pytorch-lightning
# runai-model-streamer
# segmentation-models-pytorch
...
...
@@ -1213,12 +1241,11 @@ torchvision==0.25.0+cu129
# -r requirements/test.in
# lightly
# open-clip-torch
# pretrainedmodels
# segmentation-models-pytorch
# terratorch
# timm
# torchgeo
tqdm==4.6
6.6
tqdm==4.6
7.3
# via
# datasets
# evaluate
...
...
@@ -1232,19 +1259,16 @@ tqdm==4.66.6
# optuna
# peft
# pqdm
# pretrainedmodels
# pytorch-lightning
# segmentation-models-pytorch
# sentence-transformers
# tqdm-multiprocess
# tacoreader
# terratorch
# transformers
tqdm-multiprocess==0.0.11
# via lm-eval
transformers==4.57.5
# via
# -r requirements/test.in
# genai-perf
# lm-eval
# peft
# sentence-transformers
# transformers-stream-generator
...
...
@@ -1272,16 +1296,18 @@ typing-extensions==4.15.0
# aiosignal
# albumentations
# alembic
# azure-core
# azure-identity
# azure-storage-blob
# chz
# fastapi
# graphene
# grpcio
# huggingface-hub
# librosa
# lightning
# lightning-utilities
# lm-eval
# mistral-common
# mlflow-skinny
# mteb
# opentelemetry-api
# opentelemetry-sdk
...
...
@@ -1299,6 +1325,7 @@ typing-extensions==4.15.0
# typer
# typeshed-client
# typing-inspection
# wandb
typing-inspection==0.4.2
# via pydantic
tzdata==2024.2
...
...
@@ -1313,25 +1340,26 @@ urllib3==2.2.3
# lightly
# requests
# responses
# sentry-sdk
# tritonclient
uvicorn==0.35.0
# via
# gpt-oss
# mlflow-skinny
# via gpt-oss
vector-quantize-pytorch==1.21.2
# via -r requirements/test.in
virtualenv==20.31.2
# via ray
vocos==0.1.0
# via -r requirements/test.in
wandb==0.24.2
# via terratorch
wcwidth==0.2.13
# via ftfy
webcolors==24.11.1
# via jsonschema
werkzeug==3.1.3
# via
# flask
# schemathesis
# tensorboard
word2number==1.1
# via lm-eval
wrapt==1.17.2
...
...
requirements/xpu.txt
View file @
3fb4b5fa
...
...
@@ -15,4 +15,4 @@ torch==2.10.0+xpu
torchaudio
torchvision
vllm_xpu_kernels @ https://github.com/vllm-project/vllm-xpu-kernels/releases/download/v0.1.
2
/vllm_xpu_kernels-0.1.
2
-cp3
12-cp312
-linux_x86_64.whl
vllm_xpu_kernels @ https://github.com/vllm-project/vllm-xpu-kernels/releases/download/v0.1.
3
/vllm_xpu_kernels-0.1.
3
-cp3
8-abi3
-linux_x86_64.whl
scripts/autotune_helion_kernels.py
0 → 100644
View file @
3fb4b5fa
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Autotune registered Helion kernels for optimal configurations.
Usage:
# Autotune all registered kernels
python scripts/autotune_helion_kernels.py
# Autotune specific kernel
python scripts/autotune_helion_kernels.py --kernels silu_mul_fp8
# Autotune multiple kernels
python scripts/autotune_helion_kernels.py --kernels silu_mul_fp8 rms_norm_fp8
# Force re-autotuning
python scripts/autotune_helion_kernels.py --force
# List available kernels
python scripts/autotune_helion_kernels.py --list
"""
import
argparse
import
sys
import
time
from
dataclasses
import
dataclass
import
torch
from
torch._subclasses.fake_tensor
import
FakeTensorMode
try
:
import
helion
from
vllm.kernels.helion
import
(
ConfigManager
,
get_kernel_by_name
,
get_registered_kernels
,
)
from
vllm.kernels.helion.utils
import
get_canonical_gpu_name
from
vllm.logger
import
init_logger
from
vllm.utils.import_utils
import
has_helion
except
ImportError
as
e
:
print
(
f
"Error importing vLLM:
{
e
}
"
)
print
(
"Please ensure vLLM is installed and in your Python path"
)
sys
.
exit
(
1
)
# Module-level logger shared by every function in this script; it is created
# through vLLM's `init_logger` helper under a "vllm."-prefixed name.
logger = init_logger("vllm.scripts.autotune_helion_kernels")
@dataclass
class AutotuneResult:
    """Outcome of autotuning a single kernel.

    Instances are produced per kernel and later aggregated into the final
    summary, which reads ``status``, ``successful``, ``failed`` and
    ``message``.
    """

    # One of: "success" | "partial" | "error" | "skipped"
    status: str
    # Number of config keys that were autotuned successfully.
    successful: int
    # Number of config keys whose autotuning raised an error.
    failed: int
    # Mapping from config key to the tuned Helion config object.
    configs: dict[str, "helion.Config"]
    # Optional human-readable detail (used for error / skipped results).
    message: str = ""
def list_kernels() -> None:
    """Print the sorted names of all registered Helion kernels to stdout.

    Prints a short notice instead when the registry is empty.
    """
    registry = get_registered_kernels()
    if not registry:
        print("No Helion kernels found in registry.")
        return

    print("Available Helion kernels:")
    print("=" * 50)

    # NOTE(review): any leading indentation inside this f-string may have been
    # lost when the page was extracted -- confirm against the repository.
    for kernel_name in sorted(registry):
        print(f"{kernel_name}")

    print(f"\nTotal: {len(registry)} kernels")
def check_requirements() -> bool:
    """Return True when both CUDA and Helion are available.

    CUDA is checked first; Helion is only probed when CUDA is present. The
    first missing requirement is logged as an error and False is returned.
    """
    if torch.cuda.is_available():
        if has_helion():
            return True
        logger.error("Helion is not installed. Please install Helion package.")
        return False

    logger.error("CUDA is not available. Helion autotuning requires GPU.")
    return False
def _error_result(message: str) -> AutotuneResult:
    """Build the result returned for every whole-kernel failure."""
    return AutotuneResult(
        status="error",
        message=message,
        successful=0,
        failed=0,
        configs={},
    )


def autotune_kernel(
    kernel_name: str,
    platform: str,
    config_manager: ConfigManager,
    force: bool = False,
    autotune_effort: str = "quick",
) -> AutotuneResult:
    """Autotune the configs of one registered kernel and save each result.

    Args:
        kernel_name: Name used to look the kernel up in the registry.
        platform: Platform name the configs are stored under.
        config_manager: Used to read existing configs and to save new ones.
        force: When True, re-autotune every config key; otherwise keys that
            already have a saved config for ``platform`` are skipped.
        autotune_effort: Effort level forwarded unchanged to ``run_autotune``.

    Returns:
        An ``AutotuneResult`` whose status is "error" (kernel missing, no
        input generator, or an unexpected failure), "skipped" (nothing left to
        tune), "success" (no config failed) or "partial" (at least one config
        failed).
    """
    logger.debug(
        "Starting autotune for kernel '%s' with effort='%s'",
        kernel_name,
        autotune_effort,
    )

    kernel_wrapper = get_kernel_by_name(kernel_name)
    if kernel_wrapper is None:
        error_msg = f"Kernel '{kernel_name}' not found in registry"
        logger.error(error_msg)
        return _error_result(error_msg)

    try:
        # Only the keys are needed at this point, so the inputs are generated
        # under FakeTensorMode (presumably to avoid allocating real tensors).
        with FakeTensorMode():
            all_config_keys = list(kernel_wrapper.get_inputs().keys())
    except NotImplementedError:
        error_msg = f"Kernel '{kernel_name}' has no input generator registered"
        logger.error(error_msg)
        return _error_result(error_msg)

    try:
        logger.info(
            "Autotuning kernel '%s' for platform '%s' with %d configs",
            kernel_name,
            platform,
            len(all_config_keys),
        )

        if not force:
            existing_configs = config_manager.get_platform_configs(
                kernel_name, platform
            )
            keys_to_autotune = []
            for config_key in all_config_keys:
                if config_key in existing_configs:
                    logger.debug(
                        "Config '%s' already exists for platform '%s', skipping",
                        config_key,
                        platform,
                    )
                else:
                    keys_to_autotune.append(config_key)
        else:
            logger.debug("Force mode enabled, will re-autotune all configs")
            keys_to_autotune = all_config_keys

        if not keys_to_autotune:
            logger.info(
                "All configs already exist for kernel '%s' on platform '%s'. "
                "Use --force to re-autotune.",
                kernel_name,
                platform,
            )
            return AutotuneResult(
                status="skipped",
                message="All configs already exist",
                successful=0,
                failed=0,
                configs={},
            )

        # Second call, outside FakeTensorMode, to obtain the inputs that are
        # actually passed to the autotuner.
        inputs_dict = kernel_wrapper.get_inputs()
        configs_to_autotune = {k: inputs_dict[k] for k in keys_to_autotune}

        # perf_counter() is monotonic, so the reported durations cannot be
        # distorted by system clock adjustments during a long autotune run.
        total_start_time = time.perf_counter()
        autotuned_configs = {}
        failed_configs = []

        for config_key, inputs in configs_to_autotune.items():
            logger.info("Autotuning config: %s", config_key)
            logger.debug(
                "Input shapes: %s",
                [getattr(inp, "shape", type(inp).__name__) for inp in inputs],
            )

            try:
                config_start_time = time.perf_counter()
                config = kernel_wrapper.run_autotune(inputs, autotune_effort)
                config_duration = time.perf_counter() - config_start_time

                # Save immediately for checkpointing
                config_manager.save_configs(
                    kernel_name, platform, {config_key: config}
                )
                autotuned_configs[config_key] = config
                logger.debug("Config details: %s", config)
                logger.info(
                    "✓ Autotuned and saved config '%s' (%.2fs)",
                    config_key,
                    config_duration,
                )
            except (RuntimeError, ValueError, OSError) as e:
                # A failing config is recorded and the loop continues with
                # the remaining keys.
                logger.exception(
                    "Failed to autotune config '%s': %s",
                    config_key,
                    e,
                )
                failed_configs.append(config_key)

        total_duration = time.perf_counter() - total_start_time
        successful = len(autotuned_configs)
        failed = len(failed_configs)
        logger.info(
            "Completed autotuning for kernel '%s': %d successful, %d failed (%.2fs)",
            kernel_name,
            successful,
            failed,
            total_duration,
        )

        status = "success" if failed == 0 else "partial"
        return AutotuneResult(
            status=status,
            successful=successful,
            failed=failed,
            configs=autotuned_configs,
        )

    except (KeyError, RuntimeError, ValueError, OSError) as e:
        error_msg = f"Unexpected error: {e}"
        logger.exception("Failed to autotune kernel '%s': %s", kernel_name, e)
        return _error_result(error_msg)
def summarize_results(results: dict[str, AutotuneResult]) -> bool:
    """Log a per-kernel and aggregate summary of an autotuning run.

    Args:
        results: Mapping from kernel name to its ``AutotuneResult``.

    Returns:
        True when no kernel finished with status "error" or "partial".
    """
    rule = "=" * 50
    logger.info(rule)
    logger.info("Autotuning Results Summary")
    logger.info(rule)

    configs_ok = 0
    configs_failed = 0
    # Kernel descriptions grouped by the four status values handled below.
    buckets: dict[str, list[str]] = {
        "success": [],
        "partial": [],
        "error": [],
        "skipped": [],
    }

    for kernel_name, result in results.items():
        configs_ok += result.successful
        configs_failed += result.failed
        outcome = result.status

        if outcome == "success":
            buckets["success"].append(
                f"{kernel_name} ({result.successful} configs)"
            )
            logger.info(
                "✓ %s: %d configs successful", kernel_name, result.successful
            )
        elif outcome == "partial":
            buckets["partial"].append(
                f"{kernel_name} ({result.successful} ok, {result.failed} failed)"
            )
            logger.warning(
                "⚠ %s: %d successful, %d failed",
                kernel_name,
                result.successful,
                result.failed,
            )
        elif outcome == "error":
            reason = result.message or "Unknown error"
            buckets["error"].append(f"{kernel_name}: {reason}")
            logger.error("✗ %s: %s", kernel_name, reason)
        elif outcome == "skipped":
            reason = result.message or "Skipped"
            buckets["skipped"].append(f"{kernel_name}: {reason}")
            logger.info("- %s: %s", kernel_name, reason)

    logger.info(rule)
    logger.info(
        "Summary: %d total configs (%d successful, %d failed)",
        configs_ok + configs_failed,
        configs_ok,
        configs_failed,
    )
    logger.info(
        "Kernels: %d success, %d partial, %d error, %d skipped",
        len(buckets["success"]),
        len(buckets["partial"]),
        len(buckets["error"]),
        len(buckets["skipped"]),
    )

    all_clean = not (buckets["error"] or buckets["partial"])
    if all_clean:
        if configs_ok > 0:
            logger.info("All configs autotuned successfully!")
        else:
            logger.info("No new configs were generated (all may already exist)")

    return all_clean
def get_kernels_to_autotune(requested_kernels: list[str] | None) -> list[str]:
    """Resolve the ``--kernels`` selection against the kernel registry.

    Args:
        requested_kernels: Kernel names given on the command line, or
            None / empty to select every registered kernel.

    Returns:
        The kernel names to autotune, in the order they were requested (or in
        registry order when nothing was requested).

    Exits the process with status 1 when the registry is empty, when a name is
    requested more than once, or when a requested name is not registered.
    """
    all_kernels = get_registered_kernels()
    if not all_kernels:
        logger.error("No Helion kernels found in registry")
        sys.exit(1)

    if not requested_kernels:
        return list(all_kernels.keys())

    # Single ordered pass: duplicates are reported in the order in which they
    # are first repeated, so the error message is the same on every run
    # (iterating a set of strings is not order-stable across interpreters).
    seen: set[str] = set()
    duplicates: list[str] = []
    for kernel_name in requested_kernels:
        if kernel_name not in seen:
            seen.add(kernel_name)
        elif kernel_name not in duplicates:
            duplicates.append(kernel_name)

    if duplicates:
        logger.error("Duplicate kernel names in --kernels flag: %s", duplicates)
        sys.exit(1)

    kernels_to_autotune = [k for k in requested_kernels if k in all_kernels]
    missing_kernels = [k for k in requested_kernels if k not in all_kernels]

    if missing_kernels:
        logger.error("Kernel(s) not found: %s", missing_kernels)
        logger.error("Available kernels: %s", list(all_kernels.keys()))
        sys.exit(1)

    return kernels_to_autotune
def main() -> None:
    """Command-line entry point: parse arguments and autotune the kernels.

    With ``--list`` the registered kernels are printed and the function
    returns. Otherwise the requirements are checked, the config directory is
    verified to be writable, each selected kernel is autotuned, and the
    process exits with status 0 when the summary reports no failures and 1
    otherwise.
    """
    # __doc__ is None when docstrings are stripped (python -OO); fall back to
    # an empty string so building the parser does not raise TypeError.
    module_doc = __doc__ or ""
    parser = argparse.ArgumentParser(
        description="Autotune Helion kernels",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=module_doc.split("Usage:")[1] if "Usage:" in module_doc else "",
    )
    parser.add_argument(
        "--kernels",
        nargs="+",
        help="Kernel(s) to autotune (default: all kernels)",
    )
    parser.add_argument(
        "--config-dir",
        type=str,
        help="Config directory for config files (default: vLLM helion configs dir)",
    )
    parser.add_argument(
        "--list",
        action="store_true",
        help="List available Helion kernels and exit",
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help=(
            "Force re-autotuning even if configs already exist for the "
            "platform and config keys"
        ),
    )
    parser.add_argument(
        "--autotune-effort",
        type=str,
        default="quick",
        help=(
            "Helion autotune effort level: 'quick' (smaller search) or "
            "'full' (full search budget) (default: quick)"
        ),
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Enable verbose logging",
    )

    args = parser.parse_args()

    import logging

    # The level is set on the parent "vllm" logger.
    if args.verbose:
        logging.getLogger("vllm").setLevel(logging.DEBUG)
        logger.debug("Verbose mode enabled")
        logger.debug("Arguments: %s", vars(args))
    else:
        logging.getLogger("vllm").setLevel(logging.INFO)

    if args.list:
        list_kernels()
        return

    if not check_requirements():
        sys.exit(1)

    platform = get_canonical_gpu_name()
    logger.info("Detected GPU platform: %s", platform)

    config_manager = (
        ConfigManager(args.config_dir) if args.config_dir else ConfigManager()
    )

    # Fail before any autotuning starts if results could not be saved.
    try:
        config_manager.ensure_base_dir_writable()
    except OSError as e:
        logger.error("Failed to access config directory: %s", e)
        sys.exit(1)

    kernels_to_autotune = get_kernels_to_autotune(args.kernels)
    logger.info(
        "Will autotune %d kernel(s) for platform '%s': %s",
        len(kernels_to_autotune),
        platform,
        kernels_to_autotune,
    )

    results = {}
    for kernel_name in kernels_to_autotune:
        result = autotune_kernel(
            kernel_name, platform, config_manager, args.force, args.autotune_effort
        )
        results[kernel_name] = result

    success = summarize_results(results)
    sys.exit(0 if success else 1)
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
setup.py
View file @
3fb4b5fa
...
...
@@ -18,8 +18,6 @@ import torch
from
packaging.version
import
Version
,
parse
from
setuptools
import
Extension
,
setup
from
setuptools.command.build_ext
import
build_ext
from
setuptools.command.build_py
import
build_py
from
setuptools.command.develop
import
develop
from
setuptools_scm
import
get_version
from
torch.utils.cpp_extension
import
CUDA_HOME
,
ROCM_HOME
...
...
@@ -81,81 +79,6 @@ def is_freethreaded():
return
bool
(
sysconfig
.
get_config_var
(
"Py_GIL_DISABLED"
))
def compile_grpc_protos():
    """Generate the Python gRPC stubs for ``vllm_engine.proto``.

    Runs ``grpc_tools.protoc`` to produce the ``*_pb2.py``, ``*_pb2_grpc.py``
    and ``*_pb2.pyi`` files, then prepends an SPDX / mypy header to each
    generated file that does not already start with one.

    Returns:
        True on success; False when grpcio-tools is not importable, the proto
        file is missing, or protoc returns a non-zero exit code.
    """
    try:
        from grpc_tools import protoc
    except ImportError:
        logger.warning(
            "grpcio-tools not installed, skipping gRPC proto compilation. "
            "gRPC server functionality will not be available."
        )
        return False

    grpc_dir = ROOT_DIR / "vllm" / "grpc"
    proto_file = grpc_dir / "vllm_engine.proto"
    if not proto_file.exists():
        logger.warning("Proto file not found at %s, skipping compilation", proto_file)
        return False

    logger.info("Compiling gRPC protobuf: %s", proto_file)

    protoc_args = [
        "grpc_tools.protoc",
        f"--proto_path={ROOT_DIR}",
        f"--python_out={ROOT_DIR}",
        f"--grpc_python_out={ROOT_DIR}",
        f"--pyi_out={ROOT_DIR}",
        str(proto_file),
    ]
    exit_code = protoc.main(protoc_args)
    if exit_code != 0:
        logger.error("protoc failed with exit code %s", exit_code)
        return False

    # Add SPDX headers and mypy ignore to generated files
    spdx_header = (
        "# SPDX-License-Identifier: Apache-2.0\n"
        "# SPDX-FileCopyrightText: Copyright contributors to the vLLM project\n"
        "# mypy: ignore-errors\n"
    )

    stub_names = (
        "vllm_engine_pb2.py",
        "vllm_engine_pb2_grpc.py",
        "vllm_engine_pb2.pyi",
    )
    for stub_name in stub_names:
        stub_path = grpc_dir / stub_name
        if not stub_path.exists():
            continue
        body = stub_path.read_text()
        # Skip files that already carry the header so reruns do not stack it.
        if body.startswith("# SPDX-License-Identifier"):
            continue
        stub_path.write_text(spdx_header + body)

    logger.info("gRPC protobuf compilation successful")
    return True
class BuildPyAndGenerateGrpc(build_py):
    """``build_py`` command that attempts gRPC stub generation first."""

    def run(self):
        # The boolean result is ignored: the regular build proceeds whether or
        # not the stubs were generated.
        compile_grpc_protos()
        super().run()
class DevelopAndGenerateGrpc(develop):
    """``develop`` command that attempts gRPC stub generation first."""

    def run(self):
        # The boolean result is ignored: the develop install proceeds whether
        # or not the stubs were generated.
        compile_grpc_protos()
        super().run()
class
CMakeExtension
(
Extension
):
def
__init__
(
self
,
name
:
str
,
cmake_lists_dir
:
str
=
"."
,
**
kwa
)
->
None
:
super
().
__init__
(
name
,
sources
=
[],
py_limited_api
=
not
is_freethreaded
(),
**
kwa
)
...
...
@@ -734,13 +657,18 @@ class precompiled_wheel_utils:
def
get_base_commit_in_main_branch
()
->
str
:
try
:
# Get the latest commit hash of the upstream main branch.
resp_json
=
subprocess
.
check_output
(
[
"curl"
,
"-s"
,
"https://api.github.com/repos/vllm-project/vllm/commits/main"
,
curl_cmd
=
[
"curl"
,
"-s"
,
"https://api.github.com/repos/vllm-project/vllm/commits/main"
,
]
github_token
=
os
.
getenv
(
"GH_TOKEN"
,
os
.
getenv
(
"GITHUB_TOKEN"
))
if
github_token
:
curl_cmd
+=
[
"-H"
,
f
"Authorization: token
{
github_token
}
"
,
]
).
decode
(
"utf-8"
)
resp_json
=
subprocess
.
check_output
(
curl_cmd
).
decode
(
"utf-8"
)
upstream_main_commit
=
json
.
loads
(
resp_json
)[
"sha"
]
print
(
f
"Upstream main branch latest commit:
{
upstream_main_commit
}
"
)
...
...
@@ -818,7 +746,7 @@ def _is_xpu() -> bool:
def
_build_custom_ops
()
->
bool
:
return
_is_cuda
()
or
_is_hip
()
or
_is_cpu
()
return
_is_cuda
()
or
_is_hip
()
def
get_rocm_version
():
...
...
@@ -976,6 +904,11 @@ if _is_cuda():
):
# FA3 requires CUDA 12.3 or later
ext_modules
.
append
(
CMakeExtension
(
name
=
"vllm.vllm_flash_attn._vllm_fa3_C"
))
# FA4 CuteDSL - Python-only component for FA4's cute DSL support
# Optional since this doesn't produce a .so file, just copies Python files
ext_modules
.
append
(
CMakeExtension
(
name
=
"vllm.vllm_flash_attn._vllm_fa4_cutedsl_C"
,
optional
=
True
)
)
if
envs
.
VLLM_USE_PRECOMPILED
or
(
CUDA_HOME
and
get_nvcc_cuda_version
()
>=
Version
(
"12.9"
)
):
...
...
@@ -987,6 +920,16 @@ if _is_cuda():
CMakeExtension
(
name
=
"vllm._flashmla_extension_C"
,
optional
=
True
)
)
if
_is_cpu
():
import
platform
if
platform
.
machine
()
in
(
"x86_64"
,
"AMD64"
):
ext_modules
.
append
(
CMakeExtension
(
name
=
"vllm._C"
))
ext_modules
.
append
(
CMakeExtension
(
name
=
"vllm._C_AVX512"
))
ext_modules
.
append
(
CMakeExtension
(
name
=
"vllm._C_AVX2"
))
else
:
ext_modules
.
append
(
CMakeExtension
(
name
=
"vllm._C"
))
if
_build_custom_ops
():
ext_modules
.
append
(
CMakeExtension
(
name
=
"vllm._C"
))
...
...
@@ -1014,17 +957,12 @@ if _no_device():
ext_modules
=
[]
if
not
ext_modules
:
cmdclass
=
{
"build_py"
:
BuildPyAndGenerateGrpc
,
"develop"
:
DevelopAndGenerateGrpc
,
}
cmdclass
=
{}
else
:
cmdclass
=
{
"build_ext"
:
precompiled_build_ext
if
envs
.
VLLM_USE_PRECOMPILED
else
cmake_build_ext
,
"build_py"
:
BuildPyAndGenerateGrpc
,
"develop"
:
DevelopAndGenerateGrpc
,
}
setup
(
...
...
@@ -1033,22 +971,28 @@ setup(
ext_modules
=
ext_modules
,
install_requires
=
get_requirements
(),
extras_require
=
{
"bench"
:
[
"pandas"
,
"matplotlib"
,
"seaborn"
,
"datasets"
,
"scipy"
],
# AMD Zen CPU optimizations via zentorch
"zen"
:
[
"zentorch"
],
"bench"
:
[
"pandas"
,
"matplotlib"
,
"seaborn"
,
"datasets"
,
"scipy"
,
"plotly"
],
"tensorizer"
:
[
"tensorizer==2.10.1"
],
"fastsafetensors"
:
[
"fastsafetensors >= 0.2.2"
],
"runai"
:
[
"runai-model-streamer[s3,gcs] >= 0.15.3"
],
"instanttensor"
:
[
"instanttensor >= 0.1.5"
],
"runai"
:
[
"runai-model-streamer[s3,gcs,azure] >= 0.15.7"
],
"audio"
:
[
"librosa"
,
"scipy"
,
"soundfile"
,
"mistral_common[audio]"
,
"av"
,
],
# Required for audio processing
"video"
:
[],
# Kept for backwards compatibility
"flashinfer"
:
[],
# Kept for backwards compatibility
# Optional deps for AMD FP4 quantization support
"petit-kernel"
:
[
"petit-kernel"
],
# Optional deps for Helion kernel development
"helion"
:
[
"helion"
],
"helion"
:
[
"helion==0.3.2"
],
# Optional deps for gRPC server (vllm serve --grpc)
"grpc"
:
[
"smg-grpc-servicer[vllm] >= 0.5.0"
],
# Optional deps for OpenTelemetry tracing
"otel"
:
[
"opentelemetry-sdk>=1.26.0"
,
...
...
tests/basic_correctness/test_basic_correctness.py
View file @
3fb4b5fa
...
...
@@ -11,6 +11,8 @@ from unittest.mock import Mock
import
pytest
import
torch
from
packaging.version
import
Version
from
transformers
import
__version__
as
TRANSFORMERS_VERSION
from
vllm
import
LLM
from
vllm.platforms
import
current_platform
...
...
@@ -91,6 +93,15 @@ def test_models(
if
enable_prompt_embeds
:
with
torch
.
no_grad
():
prompt_embeds
=
hf_model
.
get_prompt_embeddings
(
example_prompts
)
if
model
==
"hmellor/tiny-random-Gemma2ForCausalLM"
and
(
Version
(
TRANSFORMERS_VERSION
)
<
Version
(
"5.3.0.dev0"
)
):
# For Gemma 1/2 models with Transformers 5.4.0+, the prompt embeddings
# are normalised in `get_prompt_embeddings`, like Gemma 3.
# For older versions, we need to manually normalise.
embed_scale
=
hf_model
.
config
.
hidden_size
**
0.5
normalizer
=
torch
.
tensor
(
embed_scale
,
dtype
=
prompt_embeds
[
0
].
dtype
)
prompt_embeds
=
[
p_e
*
normalizer
for
p_e
in
prompt_embeds
]
with
VllmRunner
(
model
,
...
...
@@ -124,8 +135,6 @@ def test_models(
[
(
"facebook/opt-125m"
,
"ray"
,
""
,
"L4"
,
{}),
(
"facebook/opt-125m"
,
"mp"
,
""
,
"L4"
,
{}),
(
"facebook/opt-125m"
,
"ray"
,
""
,
"L4"
,
{
"VLLM_SLEEP_WHEN_IDLE"
:
"1"
}),
(
"facebook/opt-125m"
,
"mp"
,
""
,
"L4"
,
{
"VLLM_SLEEP_WHEN_IDLE"
:
"1"
}),
(
"meta-llama/Llama-3.2-1B-Instruct"
,
"ray"
,
""
,
"L4"
,
{}),
(
"meta-llama/Llama-3.2-1B-Instruct"
,
"mp"
,
""
,
"L4"
,
{}),
(
"facebook/opt-125m"
,
"ray"
,
""
,
"A100"
,
{}),
...
...
Prev
1
…
16
17
18
19
20
21
22
23
24
25
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment