Commit 8d75f22e authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.13.0rc1' into v0.13.0rc1-ori

parents ce888aa4 7d80c73d
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
from vllm import LLM
from vllm.inputs.data import TextPrompt
from vllm.multimodal.utils import fetch_image
# Initialize model
model = LLM(
model="jinaai/jina-embeddings-v4-vllm-text-matching",
runner="pooling",
max_model_len=1024,
gpu_memory_utilization=0.8,
)
# Create text prompts
text1 = "Ein wunderschöner Sonnenuntergang am Strand"
text1_prompt = TextPrompt(prompt=f"Query: {text1}")
text2 = "浜辺に沈む美しい夕日"
text2_prompt = TextPrompt(prompt=f"Query: {text2}")
# Create image prompt
image = fetch_image(
"https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/eskimo.jpg" # noqa: E501
)
image_prompt = TextPrompt(
prompt="<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe the image.<|im_end|>\n", # noqa: E501
multi_modal_data={"image": image},
)
# Encode all prompts
prompts = [text1_prompt, text2_prompt, image_prompt]
outputs = model.encode(prompts, pooling_task="token_embed")
def get_embeddings(outputs):
VISION_START_TOKEN_ID, VISION_END_TOKEN_ID = 151652, 151653
embeddings = []
for output in outputs:
if VISION_START_TOKEN_ID in output.prompt_token_ids:
# Gather only vision tokens
img_start_pos = torch.where(
torch.tensor(output.prompt_token_ids) == VISION_START_TOKEN_ID
)[0][0]
img_end_pos = torch.where(
torch.tensor(output.prompt_token_ids) == VISION_END_TOKEN_ID
)[0][0]
embeddings_tensor = output.outputs.data.detach().clone()[
img_start_pos : img_end_pos + 1
]
else:
# Use all tokens for text-only prompts
embeddings_tensor = output.outputs.data.detach().clone()
# Pool and normalize embeddings
pooled_output = (
embeddings_tensor.sum(dim=0, dtype=torch.float32)
/ embeddings_tensor.shape[0]
)
embeddings.append(torch.nn.functional.normalize(pooled_output, dim=-1))
return embeddings
embeddings = get_embeddings(outputs)
for embedding in embeddings:
print(embedding.shape)
......@@ -46,6 +46,7 @@ scipy # Required for phi-4-multimodal-instruct
ninja # Required for xgrammar, rocm, tpu, xpu
pybase64 # fast base64 implementation
cbor2 # Required for cross-language serialization of hashable objects
ijson # Required for mistral streaming tool parser
setproctitle # Used to set process names for better debugging and monitoring
openai-harmony >= 0.0.3 # Required for gpt-oss
anthropic == 0.71.0
......
......@@ -3,7 +3,6 @@ ninja
packaging>=24.2
setuptools>=77.0.3,<81.0.0
setuptools-scm>=8
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.9.1+cpu; platform_machine == "x86_64" or platform_machine == "s390x"
torch==2.9.1; platform_system == "Darwin" or platform_machine == "ppc64le" or platform_machine == "aarch64"
scons; platform_machine == "aarch64" # needed to build Arm Compute Library (ACL)
......
......@@ -4,7 +4,6 @@
numba == 0.61.2; platform_machine != "s390x" # Required for N-gram speculative decoding
# Dependencies for CPUs
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.9.1+cpu; platform_machine == "x86_64" or platform_machine == "s390x"
torch==2.9.1; platform_system == "Darwin" or platform_machine == "ppc64le" or platform_machine == "aarch64"
......
......@@ -42,6 +42,6 @@ tritonclient==2.51.0
numba == 0.61.2 # Required for N-gram speculative decoding
numpy
runai-model-streamer[s3,gcs]==0.15.0
runai-model-streamer[s3,gcs]==0.15.3
fastsafetensors>=0.1.10
pydantic>=2.12 # 2.11 leads to error on python 3.13
......@@ -49,6 +49,7 @@ blobfile==3.0.0
# Multi-Modal Models Test
decord==0.6.0
# video processing, required by entrypoints/openai/test_video.py
rapidfuzz==3.12.1
# OpenAI compatibility and testing
gpt-oss==0.0.8
......@@ -58,10 +59,14 @@ schemathesis==3.39.15
# Evaluation and benchmarking
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d
jiwer==4.0.0
# Required for multiprocessed tests that use spawn method, Datasets and Evaluate Test
multiprocess==0.70.16
# Required for v1/metrics/test_engine_logger_apis.py
ray[cgraph,default]>=2.48.0
# Plugins test
terratorch @ git+https://github.com/IBM/terratorch.git@07184fcf91a1324f831ff521dd238d97fe350e3e
torchgeo==0.7.0
......@@ -70,8 +75,8 @@ torchgeo==0.7.0
mteb==2.1.2
# Data processing
xgrammar @ git+https://github.com/mlc-ai/xgrammar.git@eafd4db51b78acc64b3f0764ef27dfd206c28628
# Test async scheduling
xgrammar==0.1.27
# Test async scheduling
# Utilities
num2words==0.5.14
......
......@@ -12,7 +12,7 @@ tensorizer==2.10.1
packaging>=24.2
setuptools>=77.0.3,<80.0.0
setuptools-scm>=8
runai-model-streamer[s3,gcs]==0.15.0
runai-model-streamer[s3,gcs]==0.15.3
# conch-triton-kernels==1.2.1
timm>=1.0.17
fastsafetensors @ git+https://github.com/foundation-model-stack/fastsafetensors.git@d6f998a03432b2452f8de2bb5cefb5af9795d459
......@@ -51,7 +51,7 @@ tritonclient==2.51.0
arctic-inference == 0.1.1 # Required for suffix decoding test
numba == 0.61.2 # Required for N-gram speculative decoding
numpy
runai-model-streamer[s3,gcs]==0.15.0
runai-model-streamer[s3,gcs]==0.15.3
fastsafetensors>=0.1.10
pydantic>=2.12 # 2.11 leads to error on python 3.13
decord==0.6.0
......
......@@ -965,11 +965,11 @@ rsa==4.9.1
# via google-auth
rtree==1.4.0
# via torchgeo
runai-model-streamer==0.15.0
runai-model-streamer==0.15.3
# via -r requirements/test.in
runai-model-streamer-gcs==0.15.0
runai-model-streamer-gcs==0.15.3
# via runai-model-streamer
runai-model-streamer-s3==0.15.0
runai-model-streamer-s3==0.15.3
# via runai-model-streamer
s3transfer==0.10.3
# via boto3
......
......@@ -11,5 +11,4 @@ ray[default]
ray[data]
setuptools==78.1.0
nixl==0.3.0
tpu_info==0.4.0
tpu-inference==0.11.1
tpu-inference==0.12.0
......@@ -346,10 +346,13 @@ class precompiled_wheel_utils:
The order of preference is:
1. user-specified wheel location (can be either local or remote, via
VLLM_PRECOMPILED_WHEEL_LOCATION)
2. user-specified variant from nightly repo (current main commit via
VLLM_PRECOMPILED_WHEEL_VARIANT)
2. user-specified variant (VLLM_PRECOMPILED_WHEEL_VARIANT) from nightly repo
3. the variant corresponding to VLLM_MAIN_CUDA_VERSION from nightly repo
4. the default variant from nightly repo (current main commit)
4. the default variant from nightly repo
If downloading from the nightly repo, the commit can be specified via
VLLM_PRECOMPILED_WHEEL_COMMIT; otherwise, the head commit in the main branch
is used.
"""
wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None)
if wheel_location is not None:
......@@ -362,10 +365,13 @@ class precompiled_wheel_utils:
# try to fetch the wheel metadata from the nightly wheel repo
main_variant = "cu" + envs.VLLM_MAIN_CUDA_VERSION.replace(".", "")
variant = os.getenv("VLLM_PRECOMPILED_WHEEL_VARIANT", main_variant)
commit = os.getenv(
"VLLM_PRECOMPILED_WHEEL_COMMIT",
precompiled_wheel_utils.get_base_commit_in_main_branch(),
)
commit = os.getenv("VLLM_PRECOMPILED_WHEEL_COMMIT", "").lower()
if not commit or len(commit) != 40:
print(
f"VLLM_PRECOMPILED_WHEEL_COMMIT not valid: {commit}"
", trying to fetch base commit in main branch"
)
commit = precompiled_wheel_utils.get_base_commit_in_main_branch()
print(f"Using precompiled wheel commit {commit} with variant {variant}")
try_default = False
wheels, repo_url, download_filename = None, None, None
......@@ -461,14 +467,22 @@ class precompiled_wheel_utils:
"vllm/cumem_allocator.abi3.so",
]
compiled_regex = re.compile(
flash_attn_regex = re.compile(
r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py"
)
triton_kernels_regex = re.compile(
r"vllm/third_party/triton_kernels/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py"
)
file_members = list(
filter(lambda x: x.filename in files_to_copy, wheel.filelist)
)
file_members += list(
filter(lambda x: compiled_regex.match(x.filename), wheel.filelist)
filter(lambda x: flash_attn_regex.match(x.filename), wheel.filelist)
)
file_members += list(
filter(
lambda x: triton_kernels_regex.match(x.filename), wheel.filelist
)
)
for file in file_members:
......@@ -494,10 +508,6 @@ class precompiled_wheel_utils:
@staticmethod
def get_base_commit_in_main_branch() -> str:
# Force to use the nightly wheel. This is mainly used for CI testing.
if envs.VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL:
return "nightly"
try:
# Get the latest commit hash of the upstream main branch.
resp_json = subprocess.check_output(
......@@ -508,6 +518,7 @@ class precompiled_wheel_utils:
]
).decode("utf-8")
upstream_main_commit = json.loads(resp_json)["sha"]
print(f"Upstream main branch latest commit: {upstream_main_commit}")
# In Docker build context, .git may be immutable or missing.
if envs.VLLM_DOCKER_BUILD_CONTEXT:
......@@ -648,7 +659,7 @@ def get_vllm_version() -> str:
if envs.VLLM_TARGET_DEVICE == "empty":
version += f"{sep}empty"
elif _is_cuda():
if envs.VLLM_USE_PRECOMPILED:
if envs.VLLM_USE_PRECOMPILED and not envs.VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX:
version += f"{sep}precompiled"
else:
cuda_version = str(get_nvcc_cuda_version())
......@@ -786,7 +797,7 @@ setup(
"bench": ["pandas", "matplotlib", "seaborn", "datasets"],
"tensorizer": ["tensorizer==2.10.1"],
"fastsafetensors": ["fastsafetensors >= 0.1.10"],
"runai": ["runai-model-streamer[s3,gcs] >= 0.15.0"],
"runai": ["runai-model-streamer[s3,gcs] >= 0.15.3"],
"audio": [
"librosa",
"soundfile",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment