Commit 3fb4b5fa authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.18.0' into v0.18.0-ori

parents bcf25339 89138b21
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
"""
Example online usage of Pooling API for ColQwen3 multi-vector retrieval.
ColQwen3 is a multi-modal late interaction model based on Qwen3-VL that
produces per-token embeddings (320-dim, L2-normalized) for both text and
image inputs. Similarity is computed via MaxSim scoring.
This example mirrors the official TomoroAI inference code
(https://huggingface.co/TomoroAI/tomoro-colqwen3-embed-4b) but uses the
vLLM serving API instead of local HuggingFace model loading.
Start the server with:
vllm serve TomoroAI/tomoro-colqwen3-embed-4b --max-model-len 4096
Then run this script:
python colqwen3_token_embed_online.py
"""
import argparse
import base64
from io import BytesIO
import numpy as np
import requests
from PIL import Image
# ── Helpers ─────────────────────────────────────────────────
def post_http_request(payload: dict, api_url: str) -> requests.Response:
    """POST ``payload`` as a JSON body to ``api_url`` and return the response.

    The response is returned untouched: the status code is not checked and
    no timeout is applied, so callers decide how to handle failures.
    """
    return requests.post(
        api_url,
        headers={"User-Agent": "Test Client"},
        json=payload,
    )
def load_image(url: str) -> Image.Image:
    """Fetch ``url`` and return its content as an RGB PIL image.

    The request is first sent with no extra headers; if the server replies
    403 it is retried once with a browser-like ``User-Agent``. Any other
    error status is raised via ``raise_for_status``.

    Raises:
        RuntimeError: if every attempt was answered with HTTP 403.
    """
    browser_like = {"User-Agent": "Mozilla/5.0 (compatible; ColQwen3-demo/1.0)"}
    for request_headers in ({}, browser_like):
        response = requests.get(url, headers=request_headers, timeout=10)
        if response.status_code != 403:
            # Anything other than 403 is final: either an error or the image.
            response.raise_for_status()
            raw_bytes = BytesIO(response.content)
            return Image.open(raw_bytes).convert("RGB")
    raise RuntimeError(f"Could not fetch image from {url}")
def encode_image_base64(image: Image.Image) -> str:
    """Serialize ``image`` as PNG and return it as a base64 ``data:`` URI."""
    with BytesIO() as png_buffer:
        image.save(png_buffer, format="PNG")
        png_bytes = png_buffer.getvalue()
    encoded = base64.b64encode(png_bytes).decode()
    return f"data:image/png;base64,{encoded}"
def compute_maxsim(q_emb: np.ndarray, d_emb: np.ndarray) -> float:
    """Return the ColBERT-style MaxSim score of a query against a document.

    The dot product of every query row with every document row is taken,
    the maximum over the last axis (document rows) is kept for each query
    row, and those maxima are summed.
    """
    similarity = np.matmul(q_emb, d_emb.T)
    best_per_query_row = np.max(similarity, axis=-1)
    return float(np.sum(best_per_query_row))
# ── Encode functions ────────────────────────────────────────
def encode_queries(texts: list[str], model: str, api_url: str) -> list[np.ndarray]:
    """Embed ``texts`` in a single request and return one array per result item.

    The texts are sent under the ``input`` key; each entry of the response's
    ``data`` list has its own ``data`` field converted to a numpy array.
    """
    request_body = {"model": model, "input": texts}
    response_items = post_http_request(request_body, api_url).json()["data"]
    vectors = []
    for entry in response_items:
        vectors.append(np.array(entry["data"]))
    return vectors
def encode_images(image_urls: list[str], model: str, api_url: str) -> list[np.ndarray]:
    """Embed each image URL and return the embeddings that succeeded.

    Every image is downloaded, re-encoded as a base64 data URI and sent in
    its own request through the chat-style `messages` field so that the
    vLLM multimodal processor handles it.

    A request that does not return HTTP 200 with a ``data`` key is reported
    on stdout and skipped, so the returned list may be shorter than
    ``image_urls``.
    """
    collected = []
    for image_url in image_urls:
        filename = image_url.split("/")[-1]
        print(f" Loading: {filename}...")
        data_uri = encode_image_base64(load_image(image_url))

        user_message = {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": data_uri}},
                {"type": "text", "text": "Describe the image."},
            ],
        }
        response = post_http_request(
            {"model": model, "messages": [user_message]},
            api_url,
        )
        body = response.json()

        if response.status_code == 200 and "data" in body:
            collected.append(np.array(body["data"][0]["data"]))
        else:
            # Best effort: report the failure and move on to the next image.
            print(f" Error ({response.status_code}): {str(body)[:200]}")
    return collected
# ── Main ────────────────────────────────────────────────────
def parse_args():
    """Parse the command line and return the resulting namespace.

    Recognized flags are ``--host``, ``--port`` and ``--model``; each has a
    default, so the script runs without arguments.
    """
    cli = argparse.ArgumentParser()
    flag_specs = (
        ("--host", str, "localhost"),
        ("--port", int, 8000),
        ("--model", str, "TomoroAI/tomoro-colqwen3-embed-4b"),
    )
    for flag, value_type, fallback in flag_specs:
        cli.add_argument(flag, type=value_type, default=fallback)
    return cli.parse_args()
def main(args):
    """Run the four-step ColQwen3 demo against a running vLLM server.

    Steps: (1) embed the text queries via ``/pooling``, (2) embed the image
    documents via ``/pooling``, (3) print the query x document MaxSim matrix,
    and (4) score a text query against text documents via ``/score``.

    Args:
        args: namespace providing ``host``, ``port`` and ``model``.
    """
    pooling_url = f"http://{args.host}:{args.port}/pooling"
    score_url = f"http://{args.host}:{args.port}/score"
    model = args.model

    # Same sample data as the official TomoroAI example
    queries = [
        "Retrieve the city of Singapore",
        "Retrieve the city of Beijing",
        "Retrieve the city of London",
    ]
    image_urls = [
        "https://upload.wikimedia.org/wikipedia/commons/2/27/Singapore_skyline_2022.jpg",
        "https://upload.wikimedia.org/wikipedia/commons/6/61/Beijing_skyline_at_night.JPG",
        "https://upload.wikimedia.org/wikipedia/commons/4/49/London_skyline.jpg",
    ]

    # ── 1) Text query embeddings ────────────────────────────
    print("=" * 60)
    print("1. Encode text queries (multi-vector)")
    print("=" * 60)
    query_embeddings = encode_queries(queries, model, pooling_url)
    for i, emb in enumerate(query_embeddings):
        norm = float(np.linalg.norm(emb[0]))
        print(f' Query {i}: {emb.shape} (L2 norm: {norm:.4f}) "{queries[i]}"')

    # ── 2) Image document embeddings ────────────────────────
    print()
    print("=" * 60)
    print("2. Encode image documents (multi-vector)")
    print("=" * 60)
    # encode_images() drops images whose request failed, so its result cannot
    # be matched back to image_urls by position. Encode one URL at a time and
    # record the file name of each success so "Doc i" is labelled correctly.
    doc_embeddings = []
    doc_names = []
    for url in image_urls:
        encoded = encode_images([url], model, pooling_url)
        if encoded:
            doc_embeddings.append(encoded[0])
            doc_names.append(url.split("/")[-1])
    for i, emb in enumerate(doc_embeddings):
        print(f" Doc {i}: {emb.shape} {doc_names[i]}")

    # ── 3) Cross-modal MaxSim scoring ───────────────────────
    if doc_embeddings:
        print()
        print("=" * 60)
        print("3. Cross-modal MaxSim scores (text queries × image docs)")
        print("=" * 60)
        # Header
        print(f"{'':>35s}", end="")
        for j in range(len(doc_embeddings)):
            print(f" Doc {j:>2d}", end="")
        print()
        # Score matrix
        for i, q_emb in enumerate(query_embeddings):
            print(f" {queries[i]:<33s}", end="")
            for d_emb in doc_embeddings:
                score = compute_maxsim(q_emb, d_emb)
                print(f" {score:6.2f}", end="")
            print()

    # ── 4) Text-only /score endpoint ────────────────────────
    print()
    print("=" * 60)
    print("4. Text-only late interaction scoring (/score endpoint)")
    print("=" * 60)
    text_query = "What is the capital of France?"
    text_docs = [
        "The capital of France is Paris.",
        "Berlin is the capital of Germany.",
        "Python is a programming language.",
    ]
    resp = post_http_request(
        {"model": model, "text_1": text_query, "text_2": text_docs},
        score_url,
    )
    result = resp.json()
    # Report a failed scoring request the same way encode_images() does
    # instead of failing with a KeyError on the missing "data" field.
    if resp.status_code != 200 or "data" not in result:
        print(f" Error ({resp.status_code}): {str(result)[:200]}")
        return
    print(f' Query: "{text_query}"\n')
    for item in result["data"]:
        idx = item["index"]
        print(f" Doc {idx} (score={item['score']:.4f}): {text_docs[idx]}")
if __name__ == "__main__":
    # Parse the command line before doing any work, so a bad flag or --help
    # exits without sending a request to the server.
    args = parse_args()
    main(args)
......@@ -42,6 +42,7 @@ theme:
- navigation.sections
- navigation.indexes
- navigation.top
- navigation.path
- search.highlight
- search.share
- toc.follow
......@@ -63,8 +64,9 @@ plugins:
- git-revision-date-localized:
# exclude autogenerated files
exclude:
- argparse/*
- api/*
- examples/*
- generated/*
- minify:
minify_html: true
minify_js: true
......@@ -92,7 +94,6 @@ plugins:
- "!.*_pb2_grpc" # Exclude auto-generated gRPC stubs
summary:
modules: true
show_if_no_docstring: true
show_signature_annotations: true
separate_signature: true
show_overloads: true
......@@ -105,6 +106,10 @@ plugins:
- https://numpy.org/doc/stable/objects.inv
- https://pytorch.org/docs/stable/objects.inv
- https://psutil.readthedocs.io/en/stable/objects.inv
- redirects:
redirect_maps:
features/spec_decode/README.md: features/speculative_decoding/README.md
features/spec_decode/speculators.md: features/speculative_decoding/speculators.md
markdown_extensions:
- attr_list
......@@ -141,7 +146,6 @@ extra_css:
- mkdocs/stylesheets/extra.css
extra_javascript:
- mkdocs/javascript/reo.js
- mkdocs/javascript/run_llm_widget.js
- mkdocs/javascript/mathjax.js
- https://unpkg.com/mathjax@3.2.2/es5/tex-mml-chtml.js
......
......@@ -9,7 +9,6 @@ requires = [
"torch == 2.10.0",
"wheel",
"jinja2",
"grpcio-tools==1.78.0",
]
build-backend = "setuptools.build_meta"
......@@ -56,10 +55,6 @@ include = ["vllm*"]
"vllm/third_party/**" = ["ALL"]
"vllm/version.py" = ["F401"]
"vllm/_version.py" = ["ALL"]
# Exclude generated protobuf files
"vllm/grpc/*_pb2.py" = ["ALL"]
"vllm/grpc/*_pb2_grpc.py" = ["ALL"]
"vllm/grpc/*_pb2.pyi" = ["ALL"]
[tool.ruff.lint]
select = [
......@@ -112,12 +107,10 @@ markers = [
"cpu_test: mark test as CPU-only test",
"split: run this test as part of a split",
"distributed: run this test only in distributed GPU tests",
"skip_v1: do not run this test with v1",
"optional: optional tests that are automatically skipped, include --optional to run them",
]
[tool.ty.src]
root = "./vllm"
respect-ignore-files = true
[tool.ty.environment]
......@@ -125,190 +118,56 @@ python = "./.venv"
[tool.typos.files]
# these files may be written in non english words
extend-exclude = ["tests/models/fixtures/*", "tests/prompts/*",
"benchmarks/sonnet.txt", "tests/lora/data/*", "build/*",
"vllm/third_party/*", "vllm/entrypoints/serve/instrumentator/static/*",
"docs/governance/process.md"]
ignore-hidden = true
ignore-files = true
ignore-dot = true
ignore-vcs = true
ignore-global = true
ignore-parent = true
extend-exclude = ["tests/models/fixtures/*", "tests/prompts/*", "tests/tokenizers_/*",
"benchmarks/sonnet.txt", "tests/lora/data/*", "examples/pooling/token_embed/*", "build/*",
"vllm/third_party/*", "vllm/entrypoints/serve/instrumentator/static/*", "tests/entrypoints/openai/test_transcription_validation.py",
"docs/governance/process.md", "tests/v1/engine/test_fast_incdec_prefix_err.py", ".git/*"]
ignore-hidden = false
[tool.typos.default]
binary = false
check-filename = false
check-file = true
unicode = true
ignore-hex = true
identifier-leading-digits = false
locale = "en"
extend-ignore-identifiers-re = ["NVML_*", ".*Unc.*", ".*_thw",
".*UE8M0.*", ".*[UE4M3|ue4m3].*", ".*eles.*",
".*[Tt]h[rR].*"]
extend-ignore-words-re = []
extend-ignore-re = []
extend-ignore-identifiers-re = [".*[Uu][Ee][0-9][Mm][0-9].*"]
[tool.typos.default.extend-identifiers]
bbc5b7ede = "bbc5b7ede"
womens_doubles = "womens_doubles"
v_2nd = "v_2nd"
# splitted_input = "splitted_input"
NOOPs = "NOOPs"
typ = "typ"
nin_shortcut = "nin_shortcut"
UperNetDecoder = "UperNetDecoder"
subtile = "subtile"
cudaDevAttrMaxSharedMemoryPerBlockOptin = "cudaDevAttrMaxSharedMemoryPerBlockOptin"
SFOuput = "SFOuput"
# huggingface transformers repo uses these words
depthwise_seperable_out_channel = "depthwise_seperable_out_channel"
DepthWiseSeperableConv1d = "DepthWiseSeperableConv1d"
depthwise_seperable_CNN = "depthwise_seperable_CNN"
pard_token = "pard_token"
ptd_token_id = "ptd_token_id"
ser_de = "ser_de"
shared_memory_per_block_optin = "shared_memory_per_block_optin"
FoPE = "FoPE"
k_ot = "k_ot"
view_seperator = "view_seperator"
inverse_std_variences = "inverse_std_variences"
[tool.typos.default.extend-words]
iy = "iy"
tendencias = "tendencias"
indx = "indx"
# intel cpu features
tme = "tme"
dout = "dout"
Pn = "Pn"
arange = "arange"
[tool.typos.type.py]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.py.extend-identifiers]
arange = "arange"
NDArray = "NDArray"
EOFError = "EOFError"
fo = "fo"
ba = "ba"
[tool.typos.type.py.extend-words]
thw = "thw"
subtile = "subtile"
HSA = "HSA"
setp = "setp"
CPY = "CPY"
thr = "thr"
Thr = "Thr"
PARD = "PARD"
pard = "pard"
AKS = "AKS"
ba = "ba"
fo = "fo"
nd = "nd"
[tool.typos.type.cpp]
extend-glob = ["*.cu"]
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.cpp.extend-identifiers]
countr_one = "countr_one"
k_ot = "k_ot"
ot = "ot"
[tool.typos.type.cpp.extend-words]
[tool.typos.type.rust]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.rust.extend-identifiers]
flate2 = "flate2"
[tool.typos.type.rust.extend-words]
eles = "eles"
datas = "datas"
ser = "ser"
[tool.typos.type.lock]
extend-glob = []
check-file = false
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.lock.extend-identifiers]
[tool.typos.type.lock.extend-words]
[tool.typos.type.jl]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.jl.extend-identifiers]
[tool.typos.type.jl.extend-words]
modul = "modul"
egals = "egals"
usig = "usig"
egal = "egal"
[tool.typos.type.go]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.go.extend-identifiers]
flate = "flate"
[tool.typos.type.go.extend-words]
[tool.typos.type.css]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.css.extend-identifiers]
nd = "nd"
[tool.typos.type.css.extend-words]
[tool.typos.type.man]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.man.extend-identifiers]
Nd = "Nd"
[tool.typos.type.man.extend-words]
[tool.typos.type.cert]
extend-glob = []
check-file = false
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.cert.extend-identifiers]
[tool.typos.type.cert.extend-words]
[tool.typos.type.sh]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.sh.extend-identifiers]
ot = "ot"
[tool.typos.type.sh.extend-words]
[tool.typos.type.vimscript]
extend-glob = []
extend-ignore-identifiers-re = []
extend-ignore-words-re = []
extend-ignore-re = []
[tool.typos.type.vimscript.extend-identifiers]
windo = "windo"
[tool.typos.type.vimscript.extend-words]
ure = "ure"
[tool.uv]
no-build-isolation-package = ["torch"]
\ No newline at end of file
no-build-isolation-package = ["torch"]
......@@ -10,4 +10,3 @@ jinja2>=3.1.6
regex
build
protobuf >= 5.29.6, !=6.30.*, !=6.31.*, !=6.32.*, !=6.33.0.*, !=6.33.1.*, !=6.33.2.*, !=6.33.3.*, !=6.33.4.*
grpcio-tools==1.78.0 # Required for grpc entrypoints
......@@ -12,7 +12,7 @@ tokenizers >= 0.21.1 # Required for fast incremental detokenization.
protobuf >= 5.29.6, !=6.30.*, !=6.31.*, !=6.32.*, !=6.33.0.*, !=6.33.1.*, !=6.33.2.*, !=6.33.3.*, !=6.33.4.* # Required by LlamaTokenizer, gRPC. CVE-2026-0994
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
aiohttp >= 3.13.3
openai >= 1.99.1 # For Responses API with reasoning content
openai >= 1.99.1, < 2.25.0 # For Responses API with reasoning content
pydantic >= 2.12.0
prometheus_client >= 0.18.0
pillow # Required for image processing
......@@ -24,14 +24,14 @@ outlines_core == 0.2.11
# required for outlines backend disk cache
diskcache == 5.6.3
lark == 1.2.2
xgrammar == 0.1.29; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" or platform_machine == "s390x" or platform_machine == "ppc64le"
xgrammar >= 0.1.32, < 1.0.0; platform_machine == "x86_64" or platform_machine == "aarch64" or platform_machine == "arm64" or platform_machine == "s390x" or platform_machine == "ppc64le"
typing_extensions >= 4.10
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
partial-json-parser # used for parsing partial JSON outputs
pyzmq >= 25.0.0
msgspec
gguf >= 0.17.0
mistral_common[image] >= 1.9.0
mistral_common[image] >= 1.10.0
opencv-python-headless >= 4.13.0 # required for video IO
pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
......@@ -51,5 +51,7 @@ openai-harmony >= 0.0.3 # Required for gpt-oss
anthropic >= 0.71.0
model-hosting-container-standards >= 0.1.13, < 1.0.0
mcp
grpcio
grpcio-reflection
\ No newline at end of file
opentelemetry-sdk >= 1.27.0
opentelemetry-api >= 1.27.0
opentelemetry-exporter-otlp >= 1.27.0
opentelemetry-semantic-conventions-ai >= 0.4.1
......@@ -7,13 +7,13 @@ numba == 0.61.2; platform_machine != "s390x" # Required for N-gram speculative d
# Dependencies for CPUs
torch==2.10.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x"
torch==2.10.0; platform_machine == "aarch64" or platform_system == "Darwin" or platform_machine == "ppc64le"
torch==2.10.0; platform_machine == "aarch64" or platform_system == "Darwin" or platform_machine == "ppc64le" or platform_machine == "riscv64"
# required for the image processor of minicpm-o-2_6, this must be updated alongside torch
torchaudio; platform_machine != "s390x"
torchaudio; platform_machine != "s390x" and platform_machine != "riscv64"
# required for the image processor of phi3v, this must be updated alongside torch
torchvision; platform_machine != "s390x"
torchvision; platform_machine != "s390x" and platform_machine != "riscv64"
# Intel Extension for PyTorch, only for x86_64 CPUs
intel-openmp==2024.2.1; platform_machine == "x86_64"
......
......@@ -4,10 +4,16 @@
numba == 0.61.2 # Required for N-gram speculative decoding
# Dependencies for NVIDIA GPUs
ray[cgraph]>=2.48.0
torch==2.10.0
torchaudio==2.10.0
# These must be updated alongside torch
torchvision==0.25.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
# FlashInfer should be updated together with the Dockerfile
flashinfer-python==0.6.3
flashinfer-python==0.6.6
# Cap nvidia-cudnn-frontend (transitive dep of flashinfer) due to
# breaking changes in 1.19.0
nvidia-cudnn-frontend>=1.13.0,<1.19.0
# QuACK and Cutlass DSL for FA4 (cute-DSL implementation)
nvidia-cutlass-dsl>=4.4.0.dev1
quack-kernels>=0.2.7
mkdocs
mkdocs<2.0.0
mkdocs-api-autonav
mkdocs-material
mkdocstrings-python
......@@ -7,6 +7,7 @@ mkdocs-awesome-nav
mkdocs-glightbox
mkdocs-git-revision-date-localized-plugin
mkdocs-minify-plugin
mkdocs-redirects
regex
ruff
pydantic
......
lmcache >= 0.3.9
nixl >= 0.7.1 # Required for disaggregated prefill
nixl >= 0.7.1, < 0.10.0 # Required for disaggregated prefill
mooncake-transfer-engine >= 0.3.8
# formatting
pre-commit==4.0.1
pre-commit>=4.5.1
......@@ -23,17 +23,17 @@ jiwer # required for audio tests
timm # required for internvl test
transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test
mistral_common[image,audio] >= 1.9.0 # required for voxtral test
mistral_common[image,audio] >= 1.9.1 # required for voxtral test
num2words # required for smolvlm test
opencv-python-headless >= 4.13.0 # required for video test
datamodel_code_generator # required for minicpm3 test
lm-eval[api]>=0.4.9.2 # required for model evaluation test
mteb>=1.38.11, <2 # required for mteb test
lm-eval[api]>=0.4.11 # required for model evaluation test
mteb[bm25s]>=2, <3 # required for mteb test
transformers==4.57.5
tokenizers==0.22.0
schemathesis>=3.39.15 # Required for openai schema test.
# quantization
bitsandbytes>=0.46.1
bitsandbytes>=0.49.2
buildkite-test-collector==0.1.9
......@@ -42,6 +42,7 @@ tritonclient>=2.51.0
numba == 0.61.2 # Required for N-gram speculative decoding
numpy
runai-model-streamer[s3,gcs]==0.15.3
runai-model-streamer[s3,gcs,azure]==0.15.7
fastsafetensors>=0.2.2
instanttensor>=0.1.5
pydantic>=2.12 # 2.11 leads to error on python 3.13
# Common dependencies
-r common.txt
--extra-index-url https://download.pytorch.org/whl/test/rocm7.0
--extra-index-url https://download.pytorch.org/whl/rocm7.1
torch==2.10.0
torchvision==0.25.0
torchaudio==2.10.0
......@@ -12,5 +12,5 @@ setuptools>=77.0.3,<80.0.0
setuptools-scm>=8
wheel
jinja2>=3.1.6
amdsmi==6.4.3
amdsmi==7.0.2
timm>=1.0.17
......@@ -45,6 +45,8 @@ pystemmer==3.0.0
# via mteb
# Multi-modal processing
av==16.1.0
# required for audio_in_video tests
blobfile==3.0.0
# Multi-Modal Models Test
decord==0.6.0
......@@ -58,7 +60,7 @@ schemathesis==3.39.15
# OpenAI schema test
# Evaluation and benchmarking
lm-eval[api]==0.4.9.2
lm-eval[api]==0.4.11
jiwer==4.0.0
# Required for multiprocessed tests that use spawn method, Datasets and Evaluate Test
......@@ -67,12 +69,10 @@ multiprocess==0.70.16
# Required for v1/metrics/test_engine_logger_apis.py
ray[cgraph,default]>=2.48.0
# Plugins test
terratorch @ git+https://github.com/IBM/terratorch.git@07184fcf91a1324f831ff521dd238d97fe350e3e
torchgeo==0.7.0
# via terratorch
# MTEB Benchmark Test
mteb==2.1.2
mteb[bm25s]>=2, <3
# Utilities
num2words==0.5.14
......@@ -93,6 +93,22 @@ timm==1.0.17
# Required for plugins test
albumentations==1.4.6
# Pin transformers version
transformers==4.57.3
transformers==4.57.5
# Pin HF Hub version
huggingface-hub==0.36.2
# Pin Mistral Common
mistral-common[image,audio]==1.10.0
# Required for Prithvi tests
terratorch==1.2.2
# Required for Prithvi tests
segmentation-models-pytorch==0.5.0
# Required for Prithvi tests
imagehash==4.3.2
# Required for bitsandbytes quantization test
bitsandbytes==0.49.2
# Examples (tensorizer) tests
tensorizer==2.10.1
# Multi-modal models test (`allendou/FireRedASR2-LLM-vllm`)
kaldi-native-fbank==1.22.3
# Pinning numpy version
numpy==2.2.6
# Common dependencies
-r common.txt
# The version of gRPC libraries should be consistent with each other
grpcio==1.78.0
grpcio-reflection==1.78.0
numba == 0.61.2 # Required for N-gram speculative decoding
# Dependencies for AMD GPUs
datasets
ray[cgraph]>=2.48.0
peft
pytest-asyncio
tensorizer==2.10.1
packaging>=24.2
setuptools>=77.0.3,<80.0.0
setuptools-scm>=8
runai-model-streamer[s3,gcs]==0.15.3
runai-model-streamer[s3,gcs,azure]==0.15.7
# conch-triton-kernels==1.2.1
timm>=1.0.17
grpcio-tools==1.78.0 # Should match `build.txt`
\ No newline at end of file
# amd-quark: required for Quark quantization on ROCm
# To be consistent with test_quark.py
amd-quark>=0.8.99
\ No newline at end of file
......@@ -10,6 +10,7 @@ pytest-cov
# testing utils
albumentations # required for Nemotron Parse in test_common.py
av # required for audio_in_video tests
backoff # required for phi4mm test
blobfile # required for kimi-vl test
einops # required for MPT, qwen-vl
......@@ -30,33 +31,48 @@ torchaudio==2.10.0
torchvision==0.25.0
transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test
mistral_common[image,audio] >= 1.9.0 # required for voxtral test
mistral_common[image,audio] >= 1.9.1 # required for voxtral test
num2words # required for smolvlm test
open_clip_torch==2.32.0 # Required for nemotron_vl test, Nemotron Parse in test_common.py
opencv-python-headless >= 4.13.0 # required for video test
datamodel_code_generator # required for minicpm3 test
lm-eval[api]>=0.4.9.2 # required for model evaluation test
lm-eval[api]>=0.4.11 # required for model evaluation test
mteb[bm25s]>=2, <3 # required for mteb test
transformers==4.57.5
tokenizers==0.22.0
schemathesis>=3.39.15 # Required for openai schema test.
# quantization
bitsandbytes==0.46.1
bitsandbytes==0.49.2
buildkite-test-collector==0.1.9
genai_perf>=0.0.8
tritonclient>=2.51.0
grpcio-tools==1.78.0 # Should match `build.txt`
# The version of gRPC libraries should be consistent with each other
grpcio==1.78.0
grpcio-reflection==1.78.0
arctic-inference == 0.1.1 # Required for suffix decoding test
numba == 0.61.2 # Required for N-gram speculative decoding
numpy
runai-model-streamer[s3,gcs]==0.15.3
runai-model-streamer[s3,gcs,azure]==0.15.7
fastsafetensors>=0.2.2 # 0.2.2 contains important fixes for multi-GPU mem usage
instanttensor>=0.1.5
pydantic>=2.12 # 2.11 leads to error on python 3.13
decord==0.6.0
terratorch @ git+https://github.com/IBM/terratorch.git@1.1.rc3 # required for PrithviMAE test
terratorch >= 1.2.2 # Required for Prithvi tests
imagehash # Required for Prithvi tests
segmentation-models-pytorch > 0.4.0 # Required for Prithvi tests
gpt-oss >= 0.0.7; python_version > '3.11'
perceptron # required for isaac test
kaldi-native-fbank >= 1.18.7 # required for fireredasr2 test
# Newer versions of datasets require torchcodec, which makes the tests fail in CI because of a missing library.
# Older versions conflict with the terratorch requirements.
datasets>=3.3.0,<=3.6.0
openpyxl # required for perf comparison excel report
plotly # required for perf comparison html report
# This file was autogenerated by uv via the following command:
# uv pip compile requirements/test.in -o requirements/test.txt --index-strategy unsafe-best-match --torch-backend cu129 --python-platform x86_64-manylinux_2_28 --python-version 3.12
absl-py==2.1.0
# via rouge-score
accelerate==1.0.1
# via
# lm-eval
# peft
# rouge-score
# tensorboard
accelerate==1.0.1
# via peft
aenum==3.1.16
# via lightly
affine==2.4.0
......@@ -31,9 +31,7 @@ albumentations==1.4.6
# -r requirements/test.in
# terratorch
alembic==1.16.4
# via
# mlflow
# optuna
# via optuna
annotated-doc==0.0.4
# via fastapi
annotated-types==0.7.0
......@@ -64,18 +62,26 @@ attrs==24.2.0
# referencing
audioread==3.0.1
# via librosa
av==16.1.0
# via -r requirements/test.in
azure-core==1.38.2
# via
# azure-identity
# azure-storage-blob
azure-identity==1.25.2
# via runai-model-streamer-azure
azure-storage-blob==12.28.0
# via runai-model-streamer-azure
backoff==2.2.1
# via
# -r requirements/test.in
# schemathesis
bitsandbytes==0.46.1
bitsandbytes==0.49.2
# via
# -r requirements/test.in
# lightning
black==24.10.0
# via datamodel-code-generator
blinker==1.9.0
# via flask
blobfile==3.0.0
# via -r requirements/test.in
bm25s==0.2.13
......@@ -93,9 +99,7 @@ bounded-pool-executor==0.0.3
buildkite-test-collector==0.1.9
# via -r requirements/test.in
cachetools==5.5.2
# via
# google-auth
# mlflow-skinny
# via google-auth
certifi==2024.8.30
# via
# fiona
......@@ -106,8 +110,11 @@ certifi==2024.8.30
# pyproj
# rasterio
# requests
cffi==1.17.1
# via soundfile
# sentry-sdk
cffi==2.0.0
# via
# cryptography
# soundfile
chardet==5.2.0
# via mbstrdecoder
charset-normalizer==3.4.0
......@@ -120,15 +127,14 @@ click==8.1.7
# click-plugins
# cligj
# fiona
# flask
# jiwer
# mlflow-skinny
# nltk
# rasterio
# ray
# schemathesis
# typer
# uvicorn
# wandb
click-plugins==1.1.1.2
# via
# fiona
......@@ -137,14 +143,11 @@ cligj==0.7.2
# via
# fiona
# rasterio
cloudpickle==3.1.1
# via mlflow-skinny
colorama==0.4.6
# via
# perceptron
# sacrebleu
# schemathesis
# tqdm-multiprocess
colorful==0.5.6
# via ray
colorlog==6.10.1
......@@ -155,6 +158,12 @@ coverage==7.10.6
# via pytest-cov
cramjam==2.9.0
# via fastparquet
cryptography==46.0.5
# via
# azure-identity
# azure-storage-blob
# msal
# pyjwt
cuda-bindings==12.9.4
# via torch
cuda-pathfinder==1.3.3
......@@ -163,16 +172,15 @@ cupy-cuda12x==13.6.0
# via ray
cycler==0.12.1
# via matplotlib
databricks-sdk==0.59.0
# via mlflow-skinny
datamodel-code-generator==0.26.3
# via -r requirements/test.in
dataproperty==1.0.1
# via
# pytablewriter
# tabledata
datasets==3.0.2
datasets==3.3.0
# via
# -r requirements/test.in
# evaluate
# lm-eval
# mteb
......@@ -180,6 +188,8 @@ decorator==5.1.1
# via librosa
decord==0.6.0
# via -r requirements/test.in
diffusers==0.36.0
# via terratorch
dill==0.3.8
# via
# datasets
......@@ -191,15 +201,11 @@ distlib==0.3.9
dnspython==2.7.0
# via email-validator
docker==7.1.0
# via
# gpt-oss
# mlflow
# via gpt-oss
docopt==0.6.2
# via num2words
docstring-parser==0.17.0
# via jsonargparse
efficientnet-pytorch==0.7.1
# via segmentation-models-pytorch
einops==0.8.1
# via
# -r requirements/test.in
......@@ -214,12 +220,12 @@ email-validator==2.2.0
# via pydantic
encodec==0.1.1
# via vocos
et-xmlfile==2.0.0
# via openpyxl
evaluate==0.4.3
# via lm-eval
fastapi==0.128.0
# via
# gpt-oss
# mlflow-skinny
# via gpt-oss
fastparquet==2024.11.0
# via genai-perf
fastrlock==0.8.2
......@@ -230,6 +236,7 @@ filelock==3.16.1
# via
# blobfile
# datasets
# diffusers
# huggingface-hub
# ray
# torch
......@@ -237,8 +244,6 @@ filelock==3.16.1
# virtualenv
fiona==1.10.1
# via torchgeo
flask==3.1.1
# via mlflow
fonttools==4.55.0
# via matplotlib
fqdn==1.5.1
......@@ -249,7 +254,7 @@ frozenlist==1.5.0
# via
# aiohttp
# aiosignal
fsspec==2024.9.0
fsspec==2024.12.0
# via
# datasets
# evaluate
......@@ -257,6 +262,7 @@ fsspec==2024.9.0
# huggingface-hub
# lightning
# pytorch-lightning
# tacoreader
# torch
ftfy==6.3.1
# via open-clip-torch
......@@ -269,7 +275,7 @@ geopandas==1.0.1
gitdb==4.0.12
# via gitpython
gitpython==3.1.44
# via mlflow-skinny
# via wandb
google-api-core==2.24.2
# via
# google-cloud-core
......@@ -277,7 +283,6 @@ google-api-core==2.24.2
# opencensus
google-auth==2.40.2
# via
# databricks-sdk
# google-api-core
# google-cloud-core
# google-cloud-storage
......@@ -296,25 +301,18 @@ googleapis-common-protos==1.70.0
# via google-api-core
gpt-oss==0.0.8
# via -r requirements/test.in
graphene==3.4.3
# via mlflow
graphql-core==3.2.6
# via
# graphene
# graphql-relay
# hypothesis-graphql
graphql-relay==3.2.0
# via graphene
# via hypothesis-graphql
greenlet==3.2.3
# via sqlalchemy
grpcio==1.78.0
# via
# grpcio-tools
# -r requirements/test.in
# grpcio-reflection
# ray
grpcio-tools==1.78.0
# tensorboard
grpcio-reflection==1.78.0
# via -r requirements/test.in
gunicorn==23.0.0
# via mlflow
h11==0.14.0
# via
# httpcore
......@@ -338,12 +336,14 @@ httpcore==1.0.6
httpx==0.27.2
# via
# -r requirements/test.in
# diffusers
# perceptron
# schemathesis
huggingface-hub==0.36.2
# via
# accelerate
# datasets
# diffusers
# evaluate
# open-clip-torch
# peft
......@@ -379,11 +379,13 @@ idna==3.10
# jsonschema
# requests
# yarl
imagehash==4.3.2
# via -r requirements/test.in
imageio==2.37.0
# via scikit-image
importlib-metadata==8.7.0
# via
# mlflow-skinny
# diffusers
# opentelemetry-api
importlib-resources==6.5.2
# via typeshed-client
......@@ -391,18 +393,19 @@ inflect==5.6.2
# via datamodel-code-generator
iniconfig==2.0.0
# via pytest
instanttensor==0.1.5
# via -r requirements/test.in
isodate==0.7.2
# via azure-storage-blob
isoduration==20.11.0
# via jsonschema
isort==5.13.2
# via datamodel-code-generator
itsdangerous==2.2.0
# via flask
jinja2==3.1.6
# via
# datamodel-code-generator
# flask
# genai-perf
# mlflow
# lm-eval
# torch
jiwer==3.0.5
# via -r requirements/test.in
......@@ -415,12 +418,14 @@ joblib==1.4.2
# librosa
# nltk
# scikit-learn
jsonargparse==4.35.0
jsonargparse==4.46.0
# via
# lightning
# terratorch
jsonlines==4.0.0
# via lm-eval
jsonnet==0.21.0
# via jsonargparse
jsonpointer==3.0.0
# via jsonschema
jsonschema==4.23.0
......@@ -433,6 +438,8 @@ jsonschema-specifications==2024.10.1
# via jsonschema
junit-xml==1.9
# via schemathesis
kaldi-native-fbank==1.22.3
# via -r requirements/test.in
kaleido==0.2.1
# via genai-perf
kiwisolver==1.4.7
......@@ -449,13 +456,13 @@ libnacl==2.1.0
# via tensorizer
librosa==0.10.2.post1
# via -r requirements/test.in
lightly==1.5.20
lightly==1.5.22
# via
# terratorch
# torchgeo
lightly-utils==0.0.2
# via lightly
lightning==2.5.1.post0
lightning==2.6.1
# via
# terratorch
# torchgeo
......@@ -466,7 +473,7 @@ lightning-utilities==0.14.3
# torchmetrics
llvmlite==0.44.0
# via numba
lm-eval==0.4.9.2
lm-eval==0.4.11
# via -r requirements/test.in
lxml==5.3.0
# via
......@@ -476,12 +483,11 @@ lxml==5.3.0
mako==1.3.10
# via alembic
markdown==3.8.2
# via mlflow
# via tensorboard
markdown-it-py==3.0.0
# via rich
markupsafe==3.0.1
# via
# flask
# jinja2
# mako
# werkzeug
......@@ -489,7 +495,6 @@ matplotlib==3.9.2
# via
# -r requirements/test.in
# lightning
# mlflow
# pycocotools
# torchgeo
mbstrdecoder==1.1.3
......@@ -499,21 +504,23 @@ mbstrdecoder==1.1.3
# typepy
mdurl==0.1.2
# via markdown-it-py
mistral-common==1.9.0
mistral-common==1.10.0
# via -r requirements/test.in
mlflow==2.22.0
# via terratorch
mlflow-skinny==2.22.0
# via mlflow
more-itertools==10.5.0
# via lm-eval
mpmath==1.3.0
# via sympy
msal==1.34.0
# via
# azure-identity
# msal-extensions
msal-extensions==1.3.1
# via azure-identity
msgpack==1.1.0
# via
# librosa
# ray
mteb==2.1.2
mteb==2.8.3
# via -r requirements/test.in
multidict==6.1.0
# via
......@@ -523,8 +530,6 @@ multiprocess==0.70.16
# via
# datasets
# evaluate
munch==4.0.0
# via pretrainedmodels
mypy-extensions==1.0.0
# via black
networkx==3.2.1
......@@ -539,8 +544,6 @@ numba==0.61.2
# via
# -r requirements/test.in
# librosa
numexpr==2.10.1
# via lm-eval
numpy==2.2.6
# via
# -r requirements/test.in
......@@ -553,6 +556,7 @@ numpy==2.2.6
# cupy-cuda12x
# datasets
# decord
# diffusers
# einx
# encodec
# evaluate
......@@ -560,16 +564,16 @@ numpy==2.2.6
# genai-perf
# geopandas
# h5py
# imagehash
# imageio
# librosa
# lightly
# lightly-utils
# lm-eval
# matplotlib
# mistral-common
# mlflow
# mteb
# numba
# numexpr
# opencv-python-headless
# optuna
# pandas
......@@ -578,6 +582,7 @@ numpy==2.2.6
# perceptron
# pycocotools
# pyogrio
# pywavelets
# rasterio
# rioxarray
# rouge-score
......@@ -590,8 +595,10 @@ numpy==2.2.6
# shapely
# soxr
# statsmodels
# tensorboard
# tensorboardx
# tensorizer
# terratorch
# tifffile
# torchgeo
# torchmetrics
......@@ -657,9 +664,10 @@ opencv-python-headless==4.13.0.90
# albucore
# albumentations
# mistral-common
openpyxl==3.1.5
# via -r requirements/test.in
opentelemetry-api==1.35.0
# via
# mlflow-skinny
# opentelemetry-exporter-prometheus
# opentelemetry-sdk
# opentelemetry-semantic-conventions
......@@ -669,7 +677,6 @@ opentelemetry-proto==1.36.0
# via ray
opentelemetry-sdk==1.35.0
# via
# mlflow-skinny
# opentelemetry-exporter-prometheus
# ray
opentelemetry-semantic-conventions==0.56b0
......@@ -681,13 +688,13 @@ orjson==3.11.5
packaging==24.2
# via
# accelerate
# bitsandbytes
# black
# datamodel-code-generator
# datasets
# evaluate
# fastparquet
# geopandas
# gunicorn
# huggingface-hub
# hydra-core
# kornia
......@@ -695,7 +702,6 @@ packaging==24.2
# lightning
# lightning-utilities
# matplotlib
# mlflow-skinny
# optuna
# peft
# plotly
......@@ -708,10 +714,12 @@ packaging==24.2
# rioxarray
# scikit-image
# statsmodels
# tensorboard
# tensorboardx
# torchmetrics
# transformers
# typepy
# wandb
# xarray
pandas==2.2.3
# via
......@@ -720,8 +728,8 @@ pandas==2.2.3
# fastparquet
# genai-perf
# geopandas
# mlflow
# statsmodels
# tacoreader
# torchgeo
# xarray
pathspec==0.12.1
......@@ -731,16 +739,16 @@ pathvalidate==3.2.1
patsy==1.0.1
# via statsmodels
peft==0.16.0
# via
# -r requirements/test.in
# lm-eval
# via -r requirements/test.in
perceptron==0.1.4
# via -r requirements/test.in
perf-analyzer==0.1.0
# via genai-perf
pillow==10.4.0
# via
# diffusers
# genai-perf
# imagehash
# imageio
# lightly-utils
# matplotlib
......@@ -748,6 +756,7 @@ pillow==10.4.0
# perceptron
# scikit-image
# segmentation-models-pytorch
# tensorboard
# torchgeo
# torchvision
platformdirs==4.3.6
......@@ -755,8 +764,11 @@ platformdirs==4.3.6
# black
# pooch
# virtualenv
# wandb
plotly==5.24.1
# via genai-perf
# via
# -r requirements/test.in
# genai-perf
pluggy==1.5.0
# via
# pytest
......@@ -769,8 +781,6 @@ portalocker==2.10.1
# via sacrebleu
pqdm==0.2.0
# via -r requirements/test.in
pretrainedmodels==0.7.4
# via segmentation-models-pytorch
prometheus-client==0.22.0
# via
# opentelemetry-exporter-prometheus
......@@ -785,13 +795,14 @@ protobuf==6.33.2
# via
# google-api-core
# googleapis-common-protos
# grpcio-tools
# mlflow-skinny
# grpcio-reflection
# opentelemetry-proto
# proto-plus
# ray
# tensorboard
# tensorboardx
# tensorizer
# wandb
psutil==6.1.0
# via
# accelerate
......@@ -801,19 +812,18 @@ py==1.11.0
# via pytest-forked
py-spy==0.4.0
# via ray
pyarrow==18.0.0
pyarrow==23.0.0
# via
# datasets
# genai-perf
# mlflow
# tacoreader
# terratorch
pyasn1==0.6.1
# via
# pyasn1-modules
# rsa
pyasn1-modules==0.4.2
# via google-auth
pybind11==2.13.6
# via lm-eval
pycocotools==2.0.8
# via terratorch
pycountry==24.6.1
......@@ -831,17 +841,19 @@ pydantic==2.12.0
# gpt-oss
# lightly
# mistral-common
# mlflow-skinny
# mteb
# openai-harmony
# pydantic-extra-types
# ray
# wandb
pydantic-core==2.41.1
# via pydantic
pydantic-extra-types==2.10.5
# via mistral-common
pygments==2.18.0
# via rich
pyjwt==2.11.0
# via msal
pyogrio==0.11.0
# via geopandas
pyparsing==3.2.0
......@@ -873,7 +885,6 @@ pytest==8.3.5
# pytest-subtests
# pytest-timeout
# schemathesis
# terratorch
pytest-asyncio==0.24.0
# via -r requirements/test.in
pytest-cov==6.3.0
......@@ -896,7 +907,6 @@ python-dateutil==2.9.0.post0
# via
# arrow
# botocore
# graphene
# lightly
# matplotlib
# pandas
......@@ -913,6 +923,8 @@ pytz==2024.2
# via
# pandas
# typepy
pywavelets==1.9.0
# via imagehash
pyyaml==6.0.2
# via
# accelerate
......@@ -923,7 +935,6 @@ pyyaml==6.0.2
# huggingface-hub
# jsonargparse
# lightning
# mlflow-skinny
# omegaconf
# optuna
# peft
......@@ -934,6 +945,7 @@ pyyaml==6.0.2
# timm
# transformers
# vocos
# wandb
rapidfuzz==3.12.1
# via jiwer
rasterio==1.4.3
......@@ -951,6 +963,7 @@ referencing==0.35.1
# jsonschema-specifications
regex==2024.9.11
# via
# diffusers
# nltk
# open-clip-torch
# sacrebleu
......@@ -958,9 +971,10 @@ regex==2024.9.11
# transformers
requests==2.32.3
# via
# azure-core
# buildkite-test-collector
# databricks-sdk
# datasets
# diffusers
# docker
# evaluate
# google-api-core
......@@ -970,15 +984,17 @@ requests==2.32.3
# lightly
# lm-eval
# mistral-common
# mlflow-skinny
# msal
# mteb
# pooch
# ray
# responses
# schemathesis
# starlette-testclient
# tacoreader
# tiktoken
# transformers
# wandb
responses==0.25.3
# via genai-perf
rfc3339-validator==0.1.4
......@@ -991,6 +1007,7 @@ rich==13.9.4
# lightning
# mteb
# perceptron
# terratorch
# typer
rioxarray==0.19.0
# via terratorch
......@@ -1004,11 +1021,13 @@ rsa==4.9.1
# via google-auth
rtree==1.4.0
# via torchgeo
runai-model-streamer==0.15.3
runai-model-streamer==0.15.7
# via -r requirements/test.in
runai-model-streamer-gcs==0.15.3
runai-model-streamer-azure==0.15.7
# via runai-model-streamer
runai-model-streamer-gcs==0.15.7
# via runai-model-streamer
runai-model-streamer-s3==0.15.3
runai-model-streamer-s3==0.15.7
# via runai-model-streamer
s3transfer==0.10.3
# via boto3
......@@ -1017,47 +1036,54 @@ sacrebleu==2.4.3
safetensors==0.4.5
# via
# accelerate
# diffusers
# open-clip-torch
# peft
# segmentation-models-pytorch
# timm
# transformers
schemathesis==3.39.15
# via -r requirements/test.in
scikit-image==0.25.2
# via albumentations
# via
# albumentations
# terratorch
scikit-learn==1.5.2
# via
# albumentations
# librosa
# lm-eval
# mlflow
# mteb
# sentence-transformers
# terratorch
scipy==1.13.1
# via
# albumentations
# bm25s
# imagehash
# librosa
# mlflow
# mteb
# scikit-image
# scikit-learn
# sentence-transformers
# statsmodels
# vocos
segmentation-models-pytorch==0.4.0
segmentation-models-pytorch==0.5.0
# via
# -r requirements/test.in
# terratorch
# torchgeo
sentence-transformers==5.2.0
# via
# -r requirements/test.in
# mteb
sentry-sdk==2.52.0
# via wandb
setuptools==77.0.3
# via
# grpcio-tools
# lightning-utilities
# pytablewriter
# tensorboard
# torch
shapely==2.1.1
# via
......@@ -1075,7 +1101,6 @@ six==1.16.0
# python-dateutil
# rfc3339-validator
# rouge-score
# segmentation-models-pytorch
smart-open==7.1.0
# via ray
smmap==5.0.2
......@@ -1099,12 +1124,9 @@ soxr==0.5.0.post1
sqlalchemy==2.0.41
# via
# alembic
# mlflow
# optuna
sqlitedict==2.1.0
# via lm-eval
sqlparse==0.5.3
# via mlflow-skinny
starlette==0.50.0
# via
# fastapi
......@@ -1124,6 +1146,8 @@ tabledata==1.3.3
# via pytablewriter
tabulate==0.9.0
# via sacrebleu
tacoreader==0.5.6
# via terratorch
tblib==3.1.0
# via -r requirements/test.in
tcolorpy==0.1.6
......@@ -1133,13 +1157,19 @@ tenacity==9.1.2
# gpt-oss
# lm-eval
# plotly
tensorboard==2.20.0
# via terratorch
tensorboard-data-server==0.7.2
# via tensorboard
tensorboardx==2.6.4
# via lightning
tensorizer==2.10.1
# via -r requirements/test.in
termcolor==3.1.0
# via gpt-oss
terratorch @ git+https://github.com/IBM/terratorch.git@07184fcf91a1324f831ff521dd238d97fe350e3e
# via
# gpt-oss
# terratorch
terratorch==1.2.2
# via -r requirements/test.in
threadpoolctl==3.5.0
# via scikit-learn
......@@ -1172,16 +1202,14 @@ torch==2.10.0+cu129
# -r requirements/test.in
# accelerate
# bitsandbytes
# efficientnet-pytorch
# encodec
# instanttensor
# kornia
# lightly
# lightning
# lm-eval
# mteb
# open-clip-torch
# peft
# pretrainedmodels
# pytorch-lightning
# runai-model-streamer
# segmentation-models-pytorch
......@@ -1213,12 +1241,11 @@ torchvision==0.25.0+cu129
# -r requirements/test.in
# lightly
# open-clip-torch
# pretrainedmodels
# segmentation-models-pytorch
# terratorch
# timm
# torchgeo
tqdm==4.66.6
tqdm==4.67.3
# via
# datasets
# evaluate
......@@ -1232,19 +1259,16 @@ tqdm==4.66.6
# optuna
# peft
# pqdm
# pretrainedmodels
# pytorch-lightning
# segmentation-models-pytorch
# sentence-transformers
# tqdm-multiprocess
# tacoreader
# terratorch
# transformers
tqdm-multiprocess==0.0.11
# via lm-eval
transformers==4.57.5
# via
# -r requirements/test.in
# genai-perf
# lm-eval
# peft
# sentence-transformers
# transformers-stream-generator
......@@ -1272,16 +1296,18 @@ typing-extensions==4.15.0
# aiosignal
# albumentations
# alembic
# azure-core
# azure-identity
# azure-storage-blob
# chz
# fastapi
# graphene
# grpcio
# huggingface-hub
# librosa
# lightning
# lightning-utilities
# lm-eval
# mistral-common
# mlflow-skinny
# mteb
# opentelemetry-api
# opentelemetry-sdk
......@@ -1299,6 +1325,7 @@ typing-extensions==4.15.0
# typer
# typeshed-client
# typing-inspection
# wandb
typing-inspection==0.4.2
# via pydantic
tzdata==2024.2
......@@ -1313,25 +1340,26 @@ urllib3==2.2.3
# lightly
# requests
# responses
# sentry-sdk
# tritonclient
uvicorn==0.35.0
# via
# gpt-oss
# mlflow-skinny
# via gpt-oss
vector-quantize-pytorch==1.21.2
# via -r requirements/test.in
virtualenv==20.31.2
# via ray
vocos==0.1.0
# via -r requirements/test.in
wandb==0.24.2
# via terratorch
wcwidth==0.2.13
# via ftfy
webcolors==24.11.1
# via jsonschema
werkzeug==3.1.3
# via
# flask
# schemathesis
# tensorboard
word2number==1.1
# via lm-eval
wrapt==1.17.2
......
......@@ -15,4 +15,4 @@ torch==2.10.0+xpu
torchaudio
torchvision
vllm_xpu_kernels @ https://github.com/vllm-project/vllm-xpu-kernels/releases/download/v0.1.2/vllm_xpu_kernels-0.1.2-cp312-cp312-linux_x86_64.whl
vllm_xpu_kernels @ https://github.com/vllm-project/vllm-xpu-kernels/releases/download/v0.1.3/vllm_xpu_kernels-0.1.3-cp38-abi3-linux_x86_64.whl
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Autotune registered Helion kernels for optimal configurations.
Usage:
# Autotune all registered kernels
python scripts/autotune_helion_kernels.py
# Autotune specific kernel
python scripts/autotune_helion_kernels.py --kernels silu_mul_fp8
# Autotune multiple kernels
python scripts/autotune_helion_kernels.py --kernels silu_mul_fp8 rms_norm_fp8
# Force re-autotuning
python scripts/autotune_helion_kernels.py --force
# List available kernels
python scripts/autotune_helion_kernels.py --list
"""
import argparse
import sys
import time
from dataclasses import dataclass
import torch
from torch._subclasses.fake_tensor import FakeTensorMode
try:
import helion
from vllm.kernels.helion import (
ConfigManager,
get_kernel_by_name,
get_registered_kernels,
)
from vllm.kernels.helion.utils import get_canonical_gpu_name
from vllm.logger import init_logger
from vllm.utils.import_utils import has_helion
except ImportError as e:
print(f"Error importing vLLM: {e}")
print("Please ensure vLLM is installed and in your Python path")
sys.exit(1)
logger = init_logger("vllm.scripts.autotune_helion_kernels")
@dataclass
class AutotuneResult:
    """Outcome of autotuning a single Helion kernel."""

    # One of "success", "partial", "error" or "skipped".
    status: str
    # Number of configs that were autotuned in this run.
    successful: int
    # Number of configs whose autotune attempt failed.
    failed: int
    # Newly autotuned configs, keyed by config key.
    configs: dict[str, "helion.Config"]
    # Optional human-readable detail; empty when there is nothing to report.
    message: str = ""
def list_kernels() -> None:
    """Print the names of all registered Helion kernels in sorted order."""
    registry = get_registered_kernels()
    if not registry:
        print("No Helion kernels found in registry.")
        return

    print("Available Helion kernels:")
    print("=" * 50)
    for kernel_name in sorted(registry):
        print(f" {kernel_name}")
    print(f"\nTotal: {len(registry)} kernels")
def check_requirements() -> bool:
    """Check that the prerequisites for autotuning are met.

    Returns:
        True when CUDA is available and Helion is installed; otherwise logs
        the first unmet requirement and returns False.
    """
    cuda_ready = torch.cuda.is_available()
    if not cuda_ready:
        logger.error("CUDA is not available. Helion autotuning requires GPU.")
        return False

    if has_helion():
        return True

    logger.error("Helion is not installed. Please install Helion package.")
    return False
def _autotune_error(message: str) -> AutotuneResult:
    """Build the result for a kernel that could not be autotuned at all."""
    return AutotuneResult(
        status="error",
        message=message,
        successful=0,
        failed=0,
        configs={},
    )


def autotune_kernel(
    kernel_name: str,
    platform: str,
    config_manager: ConfigManager,
    force: bool = False,
    autotune_effort: str = "quick",
) -> AutotuneResult:
    """Autotune the configs of one registered Helion kernel for ``platform``.

    Each config is saved through ``config_manager`` as soon as it has been
    autotuned, so an interrupted run keeps the configs finished so far.

    Args:
        kernel_name: Name of the kernel to look up in the registry.
        platform: Platform identifier the configs are stored under.
        config_manager: Used to read existing configs and save new ones.
        force: When True, re-autotune configs that already exist for
            ``platform``; otherwise only missing config keys are autotuned.
        autotune_effort: Effort level forwarded to the kernel's
            ``run_autotune``.

    Returns:
        An ``AutotuneResult`` whose status is:
        - "error" if the kernel is not registered, has no input generator,
          or a KeyError/RuntimeError/ValueError/OSError escapes the
          per-config handling;
        - "skipped" if there is nothing left to autotune;
        - "success" if every attempted config succeeded;
        - "partial" if at least one attempted config failed.
    """
    logger.debug(
        "Starting autotune for kernel '%s' with effort='%s'",
        kernel_name,
        autotune_effort,
    )

    kernel_wrapper = get_kernel_by_name(kernel_name)
    if kernel_wrapper is None:
        error_msg = f"Kernel '{kernel_name}' not found in registry"
        logger.error(error_msg)
        return _autotune_error(error_msg)

    try:
        # Only the config keys are needed at this point; the inputs are
        # generated under FakeTensorMode (presumably so that no real tensors
        # are materialized just to enumerate the keys).
        with FakeTensorMode():
            all_config_keys = list(kernel_wrapper.get_inputs().keys())
    except NotImplementedError:
        error_msg = f"Kernel '{kernel_name}' has no input generator registered"
        logger.error(error_msg)
        return _autotune_error(error_msg)

    try:
        logger.info(
            "Autotuning kernel '%s' for platform '%s' with %d configs",
            kernel_name,
            platform,
            len(all_config_keys),
        )

        if not force:
            existing_configs = config_manager.get_platform_configs(
                kernel_name, platform
            )
            keys_to_autotune = []
            for config_key in all_config_keys:
                if config_key in existing_configs:
                    logger.debug(
                        "Config '%s' already exists for platform '%s', skipping",
                        config_key,
                        platform,
                    )
                else:
                    keys_to_autotune.append(config_key)
        else:
            logger.debug("Force mode enabled, will re-autotune all configs")
            keys_to_autotune = all_config_keys

        if not keys_to_autotune:
            logger.info(
                "All configs already exist for kernel '%s' on platform '%s'. "
                "Use --force to re-autotune.",
                kernel_name,
                platform,
            )
            return AutotuneResult(
                status="skipped",
                message="All configs already exist",
                successful=0,
                failed=0,
                configs={},
            )

        # Second call, outside FakeTensorMode, to obtain the inputs that are
        # actually passed to the autotuner.
        inputs_dict = kernel_wrapper.get_inputs()
        configs_to_autotune = {k: inputs_dict[k] for k in keys_to_autotune}

        # perf_counter() is monotonic, so the reported durations cannot be
        # skewed by system clock adjustments during a long autotune run.
        total_start_time = time.perf_counter()
        autotuned_configs = {}
        failed_configs = []

        for config_key, inputs in configs_to_autotune.items():
            logger.info("Autotuning config: %s", config_key)
            logger.debug(
                "Input shapes: %s",
                [getattr(inp, "shape", type(inp).__name__) for inp in inputs],
            )

            try:
                config_start_time = time.perf_counter()
                config = kernel_wrapper.run_autotune(inputs, autotune_effort)
                config_duration = time.perf_counter() - config_start_time

                # Save immediately for checkpointing
                config_manager.save_configs(kernel_name, platform, {config_key: config})
                autotuned_configs[config_key] = config

                logger.debug("Config details: %s", config)
                logger.info(
                    "✓ Autotuned and saved config '%s' (%.2fs)",
                    config_key,
                    config_duration,
                )
            except (RuntimeError, ValueError, OSError) as e:
                # A single failing config must not abort the remaining ones.
                logger.exception(
                    "Failed to autotune config '%s': %s",
                    config_key,
                    e,
                )
                failed_configs.append(config_key)

        total_duration = time.perf_counter() - total_start_time
        successful = len(autotuned_configs)
        failed = len(failed_configs)

        logger.info(
            "Completed autotuning for kernel '%s': %d successful, %d failed (%.2fs)",
            kernel_name,
            successful,
            failed,
            total_duration,
        )

        status = "success" if failed == 0 else "partial"
        return AutotuneResult(
            status=status,
            successful=successful,
            failed=failed,
            configs=autotuned_configs,
        )
    except (KeyError, RuntimeError, ValueError, OSError) as e:
        error_msg = f"Unexpected error: {e}"
        logger.exception("Failed to autotune kernel '%s': %s", kernel_name, e)
        return _autotune_error(error_msg)
def summarize_results(results: dict[str, AutotuneResult]) -> bool:
    """Log a per-kernel report and aggregate totals for an autotune run.

    Args:
        results: Mapping of kernel name to its ``AutotuneResult``.

    Returns:
        True when no kernel ended with status "error" or "partial".
    """
    separator = "=" * 50
    logger.info(separator)
    logger.info("Autotuning Results Summary")
    logger.info(separator)

    configs_ok = 0
    configs_failed = 0
    kernels_by_status = {"success": 0, "partial": 0, "error": 0, "skipped": 0}

    for kernel_name, outcome in results.items():
        configs_ok += outcome.successful
        configs_failed += outcome.failed

        status = outcome.status
        if status not in kernels_by_status:
            # Unrecognized statuses only contribute to the config totals.
            continue
        kernels_by_status[status] += 1

        if status == "success":
            logger.info("✓ %s: %d configs successful", kernel_name, outcome.successful)
        elif status == "partial":
            logger.warning(
                "⚠ %s: %d successful, %d failed",
                kernel_name,
                outcome.successful,
                outcome.failed,
            )
        elif status == "error":
            logger.error("✗ %s: %s", kernel_name, outcome.message or "Unknown error")
        else:  # "skipped"
            logger.info("- %s: %s", kernel_name, outcome.message or "Skipped")

    logger.info(separator)
    logger.info(
        "Summary: %d total configs (%d successful, %d failed)",
        configs_ok + configs_failed,
        configs_ok,
        configs_failed,
    )
    logger.info(
        "Kernels: %d success, %d partial, %d error, %d skipped",
        kernels_by_status["success"],
        kernels_by_status["partial"],
        kernels_by_status["error"],
        kernels_by_status["skipped"],
    )

    if kernels_by_status["error"] or kernels_by_status["partial"]:
        return False

    if configs_ok > 0:
        logger.info("All configs autotuned successfully!")
    else:
        logger.info("No new configs were generated (all may already exist)")
    return True
def get_kernels_to_autotune(requested_kernels: list[str] | None) -> list[str]:
    """Resolve the ``--kernels`` selection against the kernel registry.

    Args:
        requested_kernels: Kernel names given on the command line, or
            None/empty to select every registered kernel.

    Returns:
        The kernel names to autotune, in the order they were requested (or
        in registry order when nothing was requested).

    Exits the process with status 1 when the registry is empty, when a name
    is requested more than once, or when a requested name is not registered.
    """
    all_kernels = get_registered_kernels()
    if not all_kernels:
        logger.error("No Helion kernels found in registry")
        sys.exit(1)

    if not requested_kernels:
        return list(all_kernels.keys())

    # Single ordered pass: duplicates are reported in first-repeat order
    # rather than in arbitrary set-iteration order.
    seen: set[str] = set()
    duplicates: list[str] = []
    for kernel_name in requested_kernels:
        if kernel_name in seen:
            if kernel_name not in duplicates:
                duplicates.append(kernel_name)
        else:
            seen.add(kernel_name)

    if duplicates:
        logger.error("Duplicate kernel names in --kernels flag: %s", duplicates)
        sys.exit(1)

    missing_kernels = [name for name in requested_kernels if name not in all_kernels]
    if missing_kernels:
        logger.error("Kernel(s) not found: %s", missing_kernels)
        logger.error("Available kernels: %s", list(all_kernels.keys()))
        sys.exit(1)

    # Every requested name is registered and unique at this point.
    return list(requested_kernels)
def main():
    """Command-line entry point: parse arguments and autotune Helion kernels.

    Exits with status 0 when every selected kernel was autotuned (or skipped)
    without failures, and with status 1 otherwise. With ``--list`` the
    registered kernels are printed and the function returns without
    autotuning.
    """
    import logging

    # __doc__ is None when docstrings are stripped (python -OO); fall back to
    # an empty epilog instead of raising TypeError before parsing arguments.
    module_doc = __doc__ or ""
    usage_epilog = module_doc.split("Usage:")[1] if "Usage:" in module_doc else ""

    parser = argparse.ArgumentParser(
        description="Autotune Helion kernels",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_epilog,
    )
    parser.add_argument(
        "--kernels",
        nargs="+",
        help="Kernel(s) to autotune (default: all kernels)",
    )
    parser.add_argument(
        "--config-dir",
        type=str,
        help="Config directory for config files (default: vLLM helion configs dir)",
    )
    parser.add_argument(
        "--list",
        action="store_true",
        help="List available Helion kernels and exit",
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help=(
            "Force re-autotuning even if configs already exist for the "
            "platform and config keys"
        ),
    )
    parser.add_argument(
        "--autotune-effort",
        type=str,
        default="quick",
        help=(
            "Helion autotune effort level: 'quick' (smaller search) or "
            "'full' (full search budget) (default: quick)"
        ),
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Enable verbose logging",
    )
    args = parser.parse_args()

    vllm_logger = logging.getLogger("vllm")
    if args.verbose:
        vllm_logger.setLevel(logging.DEBUG)
        logger.debug("Verbose mode enabled")
        logger.debug("Arguments: %s", vars(args))
    else:
        vllm_logger.setLevel(logging.INFO)

    # Listing is handled before the requirement checks.
    if args.list:
        list_kernels()
        return

    if not check_requirements():
        sys.exit(1)

    platform = get_canonical_gpu_name()
    logger.info("Detected GPU platform: %s", platform)

    config_manager = (
        ConfigManager(args.config_dir) if args.config_dir else ConfigManager()
    )
    try:
        config_manager.ensure_base_dir_writable()
    except OSError as e:
        logger.error("Failed to access config directory: %s", e)
        sys.exit(1)

    kernels_to_autotune = get_kernels_to_autotune(args.kernels)
    logger.info(
        "Will autotune %d kernel(s) for platform '%s': %s",
        len(kernels_to_autotune),
        platform,
        kernels_to_autotune,
    )

    results = {}
    for kernel_name in kernels_to_autotune:
        results[kernel_name] = autotune_kernel(
            kernel_name, platform, config_manager, args.force, args.autotune_effort
        )

    success = summarize_results(results)
    sys.exit(0 if success else 1)


if __name__ == "__main__":
    main()
......@@ -18,8 +18,6 @@ import torch
from packaging.version import Version, parse
from setuptools import Extension, setup
from setuptools.command.build_ext import build_ext
from setuptools.command.build_py import build_py
from setuptools.command.develop import develop
from setuptools_scm import get_version
from torch.utils.cpp_extension import CUDA_HOME, ROCM_HOME
......@@ -81,81 +79,6 @@ def is_freethreaded():
return bool(sysconfig.get_config_var("Py_GIL_DISABLED"))
def compile_grpc_protos():
    """Generate the gRPC Python stubs from ``vllm/grpc/vllm_engine.proto``.

    Runs ``grpc_tools.protoc`` to produce the ``*_pb2.py``, ``*_pb2_grpc.py``
    and ``*_pb2.pyi`` files, then prepends the SPDX/mypy header to each
    generated file that does not already start with an SPDX identifier.

    Returns:
        True when compilation succeeded; False when grpcio-tools is not
        installed, the proto file is missing, or protoc reported a failure.
    """
    try:
        from grpc_tools import protoc
    except ImportError:
        logger.warning(
            "grpcio-tools not installed, skipping gRPC proto compilation. "
            "gRPC server functionality will not be available."
        )
        return False

    grpc_dir = ROOT_DIR / "vllm" / "grpc"
    proto_file = grpc_dir / "vllm_engine.proto"
    if not proto_file.exists():
        logger.warning("Proto file not found at %s, skipping compilation", proto_file)
        return False

    logger.info("Compiling gRPC protobuf: %s", proto_file)

    protoc_args = [
        "grpc_tools.protoc",
        f"--proto_path={ROOT_DIR}",
        f"--python_out={ROOT_DIR}",
        f"--grpc_python_out={ROOT_DIR}",
        f"--pyi_out={ROOT_DIR}",
        str(proto_file),
    ]
    exit_code = protoc.main(protoc_args)
    if exit_code != 0:
        logger.error("protoc failed with exit code %s", exit_code)
        return False

    # Add SPDX headers and mypy ignore to generated files
    spdx_header = (
        "# SPDX-License-Identifier: Apache-2.0\n"
        "# SPDX-FileCopyrightText: Copyright contributors to the vLLM project\n"
        "# mypy: ignore-errors\n"
    )
    for suffix in ("_pb2.py", "_pb2_grpc.py", "_pb2.pyi"):
        generated_file = grpc_dir / f"vllm_engine{suffix}"
        if not generated_file.exists():
            continue
        content = generated_file.read_text()
        if content.startswith("# SPDX-License-Identifier"):
            # Header already present; leave the file untouched.
            continue
        generated_file.write_text(spdx_header + content)

    logger.info("gRPC protobuf compilation successful")
    return True
class BuildPyAndGenerateGrpc(build_py):
    """``build_py`` command that first generates the gRPC stubs."""

    def run(self):
        # The return value is ignored, so a skipped or failed stub
        # generation does not stop the regular build_py step.
        compile_grpc_protos()
        super().run()
class DevelopAndGenerateGrpc(develop):
    """``develop`` command that first generates the gRPC stubs."""

    def run(self):
        # The return value is ignored, so a skipped or failed stub
        # generation does not stop the regular develop step.
        compile_grpc_protos()
        super().run()
class CMakeExtension(Extension):
def __init__(self, name: str, cmake_lists_dir: str = ".", **kwa) -> None:
super().__init__(name, sources=[], py_limited_api=not is_freethreaded(), **kwa)
......@@ -734,13 +657,18 @@ class precompiled_wheel_utils:
def get_base_commit_in_main_branch() -> str:
try:
# Get the latest commit hash of the upstream main branch.
resp_json = subprocess.check_output(
[
"curl",
"-s",
"https://api.github.com/repos/vllm-project/vllm/commits/main",
curl_cmd = [
"curl",
"-s",
"https://api.github.com/repos/vllm-project/vllm/commits/main",
]
github_token = os.getenv("GH_TOKEN", os.getenv("GITHUB_TOKEN"))
if github_token:
curl_cmd += [
"-H",
f"Authorization: token {github_token}",
]
).decode("utf-8")
resp_json = subprocess.check_output(curl_cmd).decode("utf-8")
upstream_main_commit = json.loads(resp_json)["sha"]
print(f"Upstream main branch latest commit: {upstream_main_commit}")
......@@ -818,7 +746,7 @@ def _is_xpu() -> bool:
def _build_custom_ops() -> bool:
return _is_cuda() or _is_hip() or _is_cpu()
return _is_cuda() or _is_hip()
def get_rocm_version():
......@@ -976,6 +904,11 @@ if _is_cuda():
):
# FA3 requires CUDA 12.3 or later
ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"))
# FA4 CuteDSL - Python-only component for FA4's cute DSL support
# Optional since this doesn't produce a .so file, just copies Python files
ext_modules.append(
CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa4_cutedsl_C", optional=True)
)
if envs.VLLM_USE_PRECOMPILED or (
CUDA_HOME and get_nvcc_cuda_version() >= Version("12.9")
):
......@@ -987,6 +920,16 @@ if _is_cuda():
CMakeExtension(name="vllm._flashmla_extension_C", optional=True)
)
if _is_cpu():
import platform
if platform.machine() in ("x86_64", "AMD64"):
ext_modules.append(CMakeExtension(name="vllm._C"))
ext_modules.append(CMakeExtension(name="vllm._C_AVX512"))
ext_modules.append(CMakeExtension(name="vllm._C_AVX2"))
else:
ext_modules.append(CMakeExtension(name="vllm._C"))
if _build_custom_ops():
ext_modules.append(CMakeExtension(name="vllm._C"))
......@@ -1014,17 +957,12 @@ if _no_device():
ext_modules = []
if not ext_modules:
cmdclass = {
"build_py": BuildPyAndGenerateGrpc,
"develop": DevelopAndGenerateGrpc,
}
cmdclass = {}
else:
cmdclass = {
"build_ext": precompiled_build_ext
if envs.VLLM_USE_PRECOMPILED
else cmake_build_ext,
"build_py": BuildPyAndGenerateGrpc,
"develop": DevelopAndGenerateGrpc,
}
setup(
......@@ -1033,22 +971,28 @@ setup(
ext_modules=ext_modules,
install_requires=get_requirements(),
extras_require={
"bench": ["pandas", "matplotlib", "seaborn", "datasets", "scipy"],
# AMD Zen CPU optimizations via zentorch
"zen": ["zentorch"],
"bench": ["pandas", "matplotlib", "seaborn", "datasets", "scipy", "plotly"],
"tensorizer": ["tensorizer==2.10.1"],
"fastsafetensors": ["fastsafetensors >= 0.2.2"],
"runai": ["runai-model-streamer[s3,gcs] >= 0.15.3"],
"instanttensor": ["instanttensor >= 0.1.5"],
"runai": ["runai-model-streamer[s3,gcs,azure] >= 0.15.7"],
"audio": [
"librosa",
"scipy",
"soundfile",
"mistral_common[audio]",
"av",
], # Required for audio processing
"video": [], # Kept for backwards compatibility
"flashinfer": [], # Kept for backwards compatibility
# Optional deps for AMD FP4 quantization support
"petit-kernel": ["petit-kernel"],
# Optional deps for Helion kernel development
"helion": ["helion"],
"helion": ["helion==0.3.2"],
# Optional deps for gRPC server (vllm serve --grpc)
"grpc": ["smg-grpc-servicer[vllm] >= 0.5.0"],
# Optional deps for OpenTelemetry tracing
"otel": [
"opentelemetry-sdk>=1.26.0",
......
......@@ -11,6 +11,8 @@ from unittest.mock import Mock
import pytest
import torch
from packaging.version import Version
from transformers import __version__ as TRANSFORMERS_VERSION
from vllm import LLM
from vllm.platforms import current_platform
......@@ -91,6 +93,15 @@ def test_models(
if enable_prompt_embeds:
with torch.no_grad():
prompt_embeds = hf_model.get_prompt_embeddings(example_prompts)
if model == "hmellor/tiny-random-Gemma2ForCausalLM" and (
Version(TRANSFORMERS_VERSION) < Version("5.3.0.dev0")
):
# For Gemma 1/2 models with Transformers 5.3.0+, the prompt embeddings
# are normalised in `get_prompt_embeddings`, like Gemma 3.
# For older versions, we need to manually normalise.
embed_scale = hf_model.config.hidden_size**0.5
normalizer = torch.tensor(embed_scale, dtype=prompt_embeds[0].dtype)
prompt_embeds = [p_e * normalizer for p_e in prompt_embeds]
with VllmRunner(
model,
......@@ -124,8 +135,6 @@ def test_models(
[
("facebook/opt-125m", "ray", "", "L4", {}),
("facebook/opt-125m", "mp", "", "L4", {}),
("facebook/opt-125m", "ray", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
("facebook/opt-125m", "mp", "", "L4", {"VLLM_SLEEP_WHEN_IDLE": "1"}),
("meta-llama/Llama-3.2-1B-Instruct", "ray", "", "L4", {}),
("meta-llama/Llama-3.2-1B-Instruct", "mp", "", "L4", {}),
("facebook/opt-125m", "ray", "", "A100", {}),
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment