Merge tag 'v0.13.0rc1' into v0.13.0rc1-ori

8d75f22e · zhuwenwen · ce888aa4 · 7d80c73d · 8d75f22e · 8d75f22e
Commit 8d75f22e authored Dec 13, 2025 by zhuwenwen
20 changed files
--- a/examples/offline_inference/pooling/convert_model_to_seq_cls.py
+++ b/examples/offline_inference/pooling/convert_model_to_seq_cls.py
--- a/examples/online_serving/pooling/jinaai_rerank_client.py
+++ b/examples/online_serving/pooling/jinaai_rerank_client.py
--- a/examples/online_serving/pooling/openai_cross_encoder_score.py
+++ b/examples/online_serving/pooling/openai_cross_encoder_score.py
--- a/examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py
+++ b/examples/online_serving/pooling/openai_cross_encoder_score_for_multimodal.py
--- a/examples/offline_inference/pooling/qwen3_reranker.py
+++ b/examples/offline_inference/pooling/qwen3_reranker.py
--- a/examples/offline_inference/pooling/ner.py
+++ b/examples/offline_inference/pooling/ner.py
--- a/examples/online_serving/pooling/ner_client.py
+++ b/examples/online_serving/pooling/ner_client.py
--- a/examples/pooling/token_embed/jina_embeddings_v4.py
+++ b/examples/pooling/token_embed/jina_embeddings_v4.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import torch
+
+from vllm import LLM
+from vllm.inputs.data import TextPrompt
+from vllm.multimodal.utils import fetch_image
+
+# Load the Jina v4 text-matching checkpoint with the pooling runner. The
+# context length is capped at 1024 tokens and GPU memory utilization at 0.8.
+model = LLM(
+    model="jinaai/jina-embeddings-v4-vllm-text-matching",
+    runner="pooling",
+    max_model_len=1024,
+    gpu_memory_utilization=0.8,
+)
+
+# Build two text prompts (one German, one Japanese sentence), each prefixed
+# with "Query: ".
+text1 = "Ein wunderschöner Sonnenuntergang am Strand"
+text1_prompt = TextPrompt(prompt=f"Query: {text1}")
+
+text2 = "浜辺に沈む美しい夕日"
+text2_prompt = TextPrompt(prompt=f"Query: {text2}")
+
+# Build an image prompt: fetch the image from a public URL and attach it via
+# multi_modal_data. The prompt text wraps an <|image_pad|> placeholder in
+# <|vision_start|> / <|vision_end|> markers.
+image = fetch_image(
+    "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/eskimo.jpg"  # noqa: E501
+)
+image_prompt = TextPrompt(
+    prompt="<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe the image.<|im_end|>\n",  # noqa: E501
+    multi_modal_data={"image": image},
+)
+
+# Encode all three prompts in one call using the "token_embed" pooling task.
+# NOTE(review): get_embeddings() below indexes outputs.data along dim 0 by
+# token position, so one row per prompt token is assumed -- confirm.
+prompts = [text1_prompt, text2_prompt, image_prompt]
+outputs = model.encode(prompts, pooling_task="token_embed")
+
+
+def get_embeddings(outputs):
+    """Reduce each output's token embeddings to one normalized vector.
+
+    If an output's prompt contains the vision-start token, only the rows from
+    the first vision-start token through the first vision-end token
+    (inclusive) are pooled. Otherwise every row is pooled.
+
+    Args:
+        outputs: Iterable of objects exposing ``prompt_token_ids`` and
+            ``outputs.data``. ``data`` is assumed to be a tensor with one row
+            per prompt token -- TODO confirm against the ``encode()`` result.
+
+    Returns:
+        A list with one float32 tensor per output: the mean of the selected
+        rows, normalized along the last dimension.
+    """
+    # Presumably the tokenizer IDs of <|vision_start|> and <|vision_end|> --
+    # verify against the model's tokenizer.
+    VISION_START_TOKEN_ID, VISION_END_TOKEN_ID = 151652, 151653
+
+    embeddings = []
+    for output in outputs:
+        if VISION_START_TOKEN_ID in output.prompt_token_ids:
+            # Gather only vision tokens
+            # torch.where(...)[0][0] is the index of the first match; it
+            # fails if the token does not occur in the prompt.
+            img_start_pos = torch.where(
+                torch.tensor(output.prompt_token_ids) == VISION_START_TOKEN_ID
+            )[0][0]
+            img_end_pos = torch.where(
+                torch.tensor(output.prompt_token_ids) == VISION_END_TOKEN_ID
+            )[0][0]
+            # "+ 1" makes the slice include the vision-end row itself.
+            embeddings_tensor = output.outputs.data.detach().clone()[
+                img_start_pos : img_end_pos + 1
+            ]
+        else:
+            # Use all tokens for text-only prompts
+            embeddings_tensor = output.outputs.data.detach().clone()
+
+        # Pool and normalize embeddings
+        # Summing in float32 and dividing by the row count gives the mean
+        # over the selected rows.
+        pooled_output = (
+            embeddings_tensor.sum(dim=0, dtype=torch.float32)
+            / embeddings_tensor.shape[0]
+        )
+        embeddings.append(torch.nn.functional.normalize(pooled_output, dim=-1))
+    return embeddings
+
+
+# Pool the encoded outputs into one normalized vector per prompt.
+embeddings = get_embeddings(outputs)
+
+# Print the shape of each pooled vector.
+for embedding in embeddings:
+    print(embedding.shape)
--- a/examples/offline_inference/pooling/multi_vector_retrieval.py
+++ b/examples/offline_inference/pooling/multi_vector_retrieval.py
--- a/examples/online_serving/pooling/multi_vector_retrieval_client.py
+++ b/examples/online_serving/pooling/multi_vector_retrieval_client.py
--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -46,6 +46,7 @@ scipy # Required for phi-4-multimodal-instruct
 ninja # Required for xgrammar, rocm, tpu, xpu
 pybase64 # fast base64 implementation
 cbor2 # Required for cross-language serialization of hashable objects
+ijson # Required for mistral streaming tool parser
 setproctitle # Used to set process names for better debugging and monitoring
 openai-harmony >= 0.0.3  # Required for gpt-oss
 anthropic == 0.71.0

--- a/requirements/cpu-build.txt
+++ b/requirements/cpu-build.txt
@@ -3,7 +3,6 @@ ninja
 packaging>=24.2
 setuptools>=77.0.3,<81.0.0
 setuptools-scm>=8
--extra-index-url https://download.pytorch.org/whl/cpu
 torch==2.9.1+cpu; platform_machine == "x86_64" or platform_machine == "s390x"
 torch==2.9.1; platform_system == "Darwin" or platform_machine == "ppc64le" or platform_machine == "aarch64"
 scons; platform_machine == "aarch64"    # needed to build Arm Compute Library (ACL)

--- a/requirements/cpu.txt
+++ b/requirements/cpu.txt
@@ -4,7 +4,6 @@
 numba == 0.61.2; platform_machine != "s390x" # Required for N-gram speculative decoding

 # Dependencies for CPUs
--extra-index-url https://download.pytorch.org/whl/cpu
 torch==2.9.1+cpu; platform_machine == "x86_64" or platform_machine == "s390x"
 torch==2.9.1; platform_system == "Darwin" or platform_machine == "ppc64le" or platform_machine == "aarch64"


--- a/requirements/nightly_torch_test.txt
+++ b/requirements/nightly_torch_test.txt
@@ -42,6 +42,6 @@ tritonclient==2.51.0

 numba == 0.61.2 # Required for N-gram speculative decoding
 numpy
-runai-model-streamer[s3,gcs]==0.15.0
+runai-model-streamer[s3,gcs]==0.15.3
 fastsafetensors>=0.1.10
 pydantic>=2.12 # 2.11 leads to error on python 3.13
--- a/requirements/rocm-test.txt
+++ b/requirements/rocm-test.txt
@@ -49,6 +49,7 @@ blobfile==3.0.0
    # Multi-Modal Models Test
 decord==0.6.0
    # video processing, required by entrypoints/openai/test_video.py
+rapidfuzz==3.12.1

 # OpenAI compatibility and testing
 gpt-oss==0.0.8
@@ -58,10 +59,14 @@ schemathesis==3.39.15

 # Evaluation and benchmarking
 lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d
+jiwer==4.0.0

 # Required for multiprocessed tests that use spawn method, Datasets and Evaluate Test
 multiprocess==0.70.16

+# Required for v1/metrics/test_engine_logger_apis.py
+ray[cgraph,default]>=2.48.0
+
 # Plugins test
 terratorch @ git+https://github.com/IBM/terratorch.git@07184fcf91a1324f831ff521dd238d97fe350e3e
 torchgeo==0.7.0
@@ -70,8 +75,8 @@ torchgeo==0.7.0
 mteb==2.1.2

 # Data processing
-xgrammar @ git+https://github.com/mlc-ai/xgrammar.git@eafd4db51b78acc64b3f0764ef27dfd206c28628
-    # Test async scheduling
+xgrammar==0.1.27
+# Test async scheduling

 # Utilities
 num2words==0.5.14

--- a/requirements/rocm.txt
+++ b/requirements/rocm.txt
@@ -12,7 +12,7 @@ tensorizer==2.10.1
 packaging>=24.2
 setuptools>=77.0.3,<80.0.0
 setuptools-scm>=8
-runai-model-streamer[s3,gcs]==0.15.0
+
+runai-model-streamer[s3,gcs]==0.15.3
 # conch-triton-kernels==1.2.1
-timm>=1.0.17
 fastsafetensors @ git+https://github.com/foundation-model-stack/fastsafetensors.git@d6f998a03432b2452f8de2bb5cefb5af9795d459
--- a/requirements/test.in
+++ b/requirements/test.in
@@ -51,7 +51,7 @@ tritonclient==2.51.0
 arctic-inference == 0.1.1 # Required for suffix decoding test
 numba == 0.61.2 # Required for N-gram speculative decoding
 numpy
-runai-model-streamer[s3,gcs]==0.15.0
+runai-model-streamer[s3,gcs]==0.15.3
 fastsafetensors>=0.1.10
 pydantic>=2.12 # 2.11 leads to error on python 3.13
 decord==0.6.0

--- a/requirements/test.txt
+++ b/requirements/test.txt
@@ -965,11 +965,11 @@ rsa==4.9.1
    # via google-auth
 rtree==1.4.0
    # via torchgeo
-runai-model-streamer==0.15.0
+runai-model-streamer==0.15.3
    # via -r requirements/test.in
-runai-model-streamer-gcs==0.15.0
+runai-model-streamer-gcs==0.15.3
    # via runai-model-streamer
-runai-model-streamer-s3==0.15.0
+runai-model-streamer-s3==0.15.3
    # via runai-model-streamer
 s3transfer==0.10.3
    # via boto3

--- a/requirements/tpu.txt
+++ b/requirements/tpu.txt
@@ -11,5 +11,4 @@ ray[default]
 ray[data]
 setuptools==78.1.0
 nixl==0.3.0
-tpu_info==0.4.0
-tpu-inference==0.11.1
+tpu-inference==0.12.0
--- a/setup.py
+++ b/setup.py
@@ -346,10 +346,13 @@ class precompiled_wheel_utils:
        The order of preference is:
        1. user-specified wheel location (can be either local or remote, via
           VLLM_PRECOMPILED_WHEEL_LOCATION)
-        2. user-specified variant from nightly repo (current main commit via
-           VLLM_PRECOMPILED_WHEEL_VARIANT)
+        2. user-specified variant (VLLM_PRECOMPILED_WHEEL_VARIANT) from nightly repo
        3. the variant corresponding to VLLM_MAIN_CUDA_VERSION from nightly repo
-        4. the default variant from nightly repo (current main commit)
+        4. the default variant from nightly repo
+
+        If downloading from the nightly repo, the commit can be specified via
+        VLLM_PRECOMPILED_WHEEL_COMMIT; otherwise, the head commit in the main branch
+        is used.
        """
        wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None)
        if wheel_location is not None:
@@ -362,10 +365,13 @@ class precompiled_wheel_utils:
            # try to fetch the wheel metadata from the nightly wheel repo
            main_variant = "cu" + envs.VLLM_MAIN_CUDA_VERSION.replace(".", "")
            variant = os.getenv("VLLM_PRECOMPILED_WHEEL_VARIANT", main_variant)
-            commit = os.getenv(
-                "VLLM_PRECOMPILED_WHEEL_COMMIT",
-                precompiled_wheel_utils.get_base_commit_in_main_branch(),
-            )
+            commit = os.getenv("VLLM_PRECOMPILED_WHEEL_COMMIT", "").lower()
+            if not commit or len(commit) != 40:
+                print(
+                    f"VLLM_PRECOMPILED_WHEEL_COMMIT not valid: {commit}"
+                    ", trying to fetch base commit in main branch"
+                )
+                commit = precompiled_wheel_utils.get_base_commit_in_main_branch()
            print(f"Using precompiled wheel commit {commit} with variant {variant}")
            try_default = False
            wheels, repo_url, download_filename = None, None, None
@@ -461,14 +467,22 @@ class precompiled_wheel_utils:
                    "vllm/cumem_allocator.abi3.so",
                ]

-                compiled_regex = re.compile(
+                flash_attn_regex = re.compile(
                    r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py"
                )
+                triton_kernels_regex = re.compile(
+                    r"vllm/third_party/triton_kernels/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py"
+                )
                file_members = list(
                    filter(lambda x: x.filename in files_to_copy, wheel.filelist)
                )
                file_members += list(
-                    filter(lambda x: compiled_regex.match(x.filename), wheel.filelist)
+                    filter(lambda x: flash_attn_regex.match(x.filename), wheel.filelist)
+                )
+                file_members += list(
+                    filter(
+                        lambda x: triton_kernels_regex.match(x.filename), wheel.filelist
+                    )
                )

                for file in file_members:
@@ -494,10 +508,6 @@ class precompiled_wheel_utils:

    @staticmethod
    def get_base_commit_in_main_branch() -> str:
-        # Force to use the nightly wheel. This is mainly used for CI testing.
-        if envs.VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL:
-            return "nightly"
-
        try:
            # Get the latest commit hash of the upstream main branch.
            resp_json = subprocess.check_output(
@@ -508,6 +518,7 @@ class precompiled_wheel_utils:
                ]
            ).decode("utf-8")
            upstream_main_commit = json.loads(resp_json)["sha"]
+            print(f"Upstream main branch latest commit: {upstream_main_commit}")

            # In Docker build context, .git may be immutable or missing.
            if envs.VLLM_DOCKER_BUILD_CONTEXT:
@@ -648,7 +659,7 @@ def get_vllm_version() -> str:
        if envs.VLLM_TARGET_DEVICE == "empty":
            version += f"{sep}empty"
    elif _is_cuda():
-        if envs.VLLM_USE_PRECOMPILED:
+        if envs.VLLM_USE_PRECOMPILED and not envs.VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX:
            version += f"{sep}precompiled"
        else:
            cuda_version = str(get_nvcc_cuda_version())
@@ -786,7 +797,7 @@ setup(
        "bench": ["pandas", "matplotlib", "seaborn", "datasets"],
        "tensorizer": ["tensorizer==2.10.1"],
        "fastsafetensors": ["fastsafetensors >= 0.1.10"],
-        "runai": ["runai-model-streamer[s3,gcs] >= 0.15.0"],
+        "runai": ["runai-model-streamer[s3,gcs] >= 0.15.3"],
        "audio": [
            "librosa",
            "soundfile",