Merge tag 'v0.10.0' into v0.10.0-dev

711aa9d5 · zhuwenwen · 751c492c · 6d8d0a24 · 711aa9d5 · 711aa9d5
Commit 711aa9d5 authored Jul 30, 2025 by zhuwenwen
20 changed files
--- a/examples/offline_inference/basic/embed.py
+++ b/examples/offline_inference/basic/embed.py
@@ -31,10 +31,10 @@ def main(args: Namespace):
    # Create an LLM.
    # You should pass task="embed" for embedding models
-    model = LLM(**vars(args))
+    llm = LLM(**vars(args))
    # Generate embedding. The output is a list of EmbeddingRequestOutputs.
-    outputs = model.embed(prompts)
+    outputs = llm.embed(prompts)
    # Print the outputs.
    print("\nGenerated Outputs:\n" + "-" * 60)

--- a/examples/offline_inference/basic/score.py
+++ b/examples/offline_inference/basic/score.py
@@ -27,10 +27,10 @@ def main(args: Namespace):
    # Create an LLM.
    # You should pass task="score" for cross-encoder models
-    model = LLM(**vars(args))
+    llm = LLM(**vars(args))
    # Generate scores. The output is a list of ScoringRequestOutputs.
-    outputs = model.score(text_1, texts_2)
+    outputs = llm.score(text_1, texts_2)
    # Print the outputs.
    print("\nGenerated Outputs:\n" + "-" * 60)

--- a/examples/offline_inference/batch_llm_inference.py
+++ b/examples/offline_inference/batch_llm_inference.py
@@ -3,17 +3,19 @@
 """
 This example shows how to use Ray Data for data parallel batch inference.
-Ray Data is a data processing framework that can handle large datasets
+Ray Data is a data processing framework that can process very large datasets
-and integrates tightly with vLLM for data-parallel inference.
+with first-class support for vLLM.
-As of Ray 2.44, Ray Data has a native integration with
-vLLM (under ray.data.llm).
 Ray Data provides functionality for:
-* Reading and writing to cloud storage (S3, GCS, etc.)
+* Reading and writing to most popular file formats and cloud object storage.
-* Automatic sharding and load-balancing across a cluster
+* Streaming execution, so you can run inference on datasets that far exceed
-* Optimized configuration of vLLM using continuous batching
+  the aggregate RAM of the cluster.
-* Compatible with tensor/pipeline parallel inference as well.
+* Scaling up the workload without code changes.
+* Automatic sharding, load-balancing, and autoscaling across a Ray cluster,
+  with built-in fault-tolerance and retry semantics.
+* Continuous batching that keeps vLLM replicas saturated and maximizes GPU
+  utilization.
+* Compatible with tensor/pipeline parallel inference.
 Learn more about Ray Data's LLM integration:
 https://docs.ray.io/en/latest/data/working-with-llms.html

--- a/examples/offline_inference/convert_model_to_seq_cls.py
+++ b/examples/offline_inference/convert_model_to_seq_cls.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# ruff: noqa: E501
+import argparse
+import json
+import torch
+import transformers
+# Usage:
+# for BAAI/bge-reranker-v2-gemma
+# Caution: "Yes" and "yes" are two different tokens
+# python convert_model_to_seq_cls.py --model_name BAAI/bge-reranker-v2-gemma --classifier_from_tokens '["Yes"]' --method no_post_processing --path ./bge-reranker-v2-gemma-seq-cls
+# for mxbai-rerank-v2
+# python convert_model_to_seq_cls.py --model_name mixedbread-ai/mxbai-rerank-base-v2 --classifier_from_tokens '["0", "1"]' --method from_2_way_softmax --path ./mxbai-rerank-base-v2-seq-cls
+# for Qwen3-Reranker
+# python convert_model_to_seq_cls.py --model_name Qwen/Qwen3-Reranker-0.6B --classifier_from_tokens '["no", "yes"]' --method from_2_way_softmax --path ./Qwen3-Reranker-0.6B-seq-cls
+def from_2_way_softmax(causal_lm, seq_cls_model, tokenizer, tokens, device):
+    # refer to https://huggingface.co/Qwen/Qwen3-Reranker-0.6B/discussions/3
+    assert len(tokens) == 2
+    lm_head_weights = causal_lm.lm_head.weight
+    false_id = tokenizer.convert_tokens_to_ids(tokens[0])
+    true_id = tokenizer.convert_tokens_to_ids(tokens[1])
+    score_weight = lm_head_weights[true_id].to(device).to(
+        torch.float32
+    ) - lm_head_weights[false_id].to(device).to(torch.float32)
+    with torch.no_grad():
+        seq_cls_model.score.weight.copy_(score_weight.unsqueeze(0))
+        if seq_cls_model.score.bias is not None:
+            seq_cls_model.score.bias.zero_()
+def no_post_processing(causal_lm, seq_cls_model, tokenizer, tokens, device):
+    lm_head_weights = causal_lm.lm_head.weight
+    token_ids = [tokenizer.convert_tokens_to_ids(t) for t in tokens]
+    score_weight = lm_head_weights[token_ids].to(device)
+    with torch.no_grad():
+        seq_cls_model.score.weight.copy_(score_weight)
+        if seq_cls_model.score.bias is not None:
+            seq_cls_model.score.bias.zero_()
+method_map = {
+    function.__name__: function for function in [from_2_way_softmax, no_post_processing]
+}
+def converting(
+    model_name, classifier_from_tokens, path, method, use_pad_token=False, device="cpu"
+):
+    assert method in method_map
+    if method == "from_2_way_softmax":
+        assert len(classifier_from_tokens) == 2
+        num_labels = 1
+    else:
+        num_labels = len(classifier_from_tokens)
+    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
+    causal_lm = transformers.AutoModelForCausalLM.from_pretrained(
+        model_name, device_map=device
+    )
+    seq_cls_model = transformers.AutoModelForSequenceClassification.from_pretrained(
+        model_name,
+        num_labels=num_labels,
+        ignore_mismatched_sizes=True,
+        device_map=device,
+    )
+    method_map[method](
+        causal_lm, seq_cls_model, tokenizer, classifier_from_tokens, device
+    )
+    # `llm as reranker` defaults to not using pad_token
+    seq_cls_model.config.use_pad_token = use_pad_token
+    seq_cls_model.config.pad_token_id = tokenizer.pad_token_id
+    seq_cls_model.save_pretrained(path)
+    tokenizer.save_pretrained(path)
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Converting *ForCausalLM models to "
+        "*ForSequenceClassification models."
+    )
+    parser.add_argument(
+        "--model_name",
+        type=str,
+        default="BAAI/bge-reranker-v2-gemma",
+        help="Model name",
+    )
+    parser.add_argument(
+        "--classifier_from_tokens",
+        type=str,
+        default='["Yes"]',
+        help="classifier from tokens",
+    )
+    parser.add_argument(
+        "--method", type=str, default="no_post_processing", help="Converting converting"
+    )
+    parser.add_argument(
+        "--use-pad-token", action="store_true", help="Whether to use pad_token"
+    )
+    parser.add_argument(
+        "--path",
+        type=str,
+        default="./bge-reranker-v2-gemma-seq-cls",
+        help="Path to save converted model",
+    )
+    return parser.parse_args()
+if __name__ == "__main__":
+    args = parse_args()
+    converting(
+        model_name=args.model_name,
+        classifier_from_tokens=json.loads(args.classifier_from_tokens),
+        method=args.method,
+        use_pad_token=args.use_pad_token,
+        path=args.path,
+    )
--- a/examples/offline_inference/embed_jina_embeddings_v3.py
+++ b/examples/offline_inference/embed_jina_embeddings_v3.py
@@ -30,11 +30,11 @@ def main(args: Namespace):
    # Create an LLM.
    # You should pass task="embed" for embedding models
-    model = LLM(**vars(args))
+    llm = LLM(**vars(args))
    # Generate embedding. The output is a list of EmbeddingRequestOutputs.
    # Only text matching task is supported for now. See #16120
-    outputs = model.embed(prompts)
+    outputs = llm.embed(prompts)
    # Print the outputs.
    print("\nGenerated Outputs:")

--- a/examples/offline_inference/embed_matryoshka_fy.py
+++ b/examples/offline_inference/embed_matryoshka_fy.py
@@ -30,10 +30,10 @@ def main(args: Namespace):
    # Create an LLM.
    # You should pass task="embed" for embedding models
-    model = LLM(**vars(args))
+    llm = LLM(**vars(args))
    # Generate embedding. The output is a list of EmbeddingRequestOutputs.
-    outputs = model.embed(prompts, pooling_params=PoolingParams(dimensions=32))
+    outputs = llm.embed(prompts, pooling_params=PoolingParams(dimensions=32))
    # Print the outputs.
    print("\nGenerated Outputs:")

--- a/examples/offline_inference/neuron_eagle.py
+++ b/examples/offline_inference/neuron_eagle.py
@@ -54,7 +54,7 @@ def main():
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, \n\n\n\ Generated text: {generated_text!r}")
+        print(f"Prompt: {prompt!r}, \n\n\n Generated text: {generated_text!r}")
 if __name__ == "__main__":

--- a/examples/offline_inference/neuron_speculation.py
+++ b/examples/offline_inference/neuron_speculation.py
@@ -25,7 +25,7 @@ def config_buckets():
    os.environ["NEURON_TOKEN_GEN_BUCKETS"] = "128,512,1024,2048"
-def initialize_model():
+def initialize_llm():
    """Create an LLM with speculative decoding."""
    return LLM(
        model="openlm-research/open_llama_7b",
@@ -37,15 +37,14 @@ def initialize_model():
        max_num_seqs=4,
        max_model_len=2048,
        block_size=2048,
-        use_v2_block_manager=True,
        device="neuron",
        tensor_parallel_size=32,
    )
-def process_requests(model: LLM, sampling_params: SamplingParams):
+def process_requests(llm: LLM, sampling_params: SamplingParams):
    """Generate texts from prompts and print them."""
-    outputs = model.generate(prompts, sampling_params)
+    outputs = llm.generate(prompts, sampling_params)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
@@ -53,12 +52,12 @@ def process_requests(model: LLM, sampling_params: SamplingParams):
 def main():
-    """Main function that sets up the model and processes prompts."""
+    """Main function that sets up the llm and processes prompts."""
    config_buckets()
-    model = initialize_model()
+    llm = initialize_llm()
    # Create a sampling params object.
    sampling_params = SamplingParams(max_tokens=100, top_k=1)
-    process_requests(model, sampling_params)
+    process_requests(llm, sampling_params)
 if __name__ == "__main__":

--- a/examples/offline_inference/prithvi_geospatial_mae.py
+++ b/examples/offline_inference/prithvi_geospatial_mae.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-This is a demo script showing how to use the
-PrithviGeospatialMAE model with vLLM
-This script is based on: https://huggingface.co/ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11/blob/main/inference.py # noqa
-Target model weights: https://huggingface.co/ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11/resolve/main/Prithvi-EO-V2-300M-TL-Sen1Floods11.pt # noqa
-The requirements for running this script are:
- Installing [terratorch, albumentations, rasterio] in your python environment
- downloading the model weights in a 'model' folder local to the script
-  (temporary measure until the proper config.json file is uploaded to HF)
- download an input example image (India_900498_S2Hand.tif) and place it in
-  the same folder with the script (or specify with the --data_file argument)
-Run the example:
-python prithvi_geospatial_mae.py
-"""  # noqa: E501
 import argparse
 import datetime
 import os
+import re
 from typing import Union
 import albumentations
 import numpy as np
 import rasterio
-import regex as re
 import torch
 from einops import rearrange
 from terratorch.datamodules import Sen1Floods11NonGeoDataModule
 from vllm import LLM
+torch.set_default_dtype(torch.float16)
 NO_DATA = -9999
 NO_DATA_FLOAT = 0.0001
 OFFSET = 0
 PERCENTILE = 99
-model_config = """{
-  "architectures": ["PrithviGeoSpatialMAE"],
-  "num_classes": 0,
-  "pretrained_cfg": {
-    "task_args": {
-      "task": "SemanticSegmentationTask",
-      "model_factory": "EncoderDecoderFactory",
-      "loss": "ce",
-      "ignore_index": -1,
-      "lr": 0.001,
-      "freeze_backbone": false,
-      "freeze_decoder": false,
-      "plot_on_val": 10,
-      "optimizer": "AdamW",
-      "scheduler": "CosineAnnealingLR"
-    },
-    "model_args": {
-      "backbone_pretrained": false,
-      "backbone": "prithvi_eo_v2_300_tl",
-      "decoder": "UperNetDecoder",
-      "decoder_channels": 256,
-      "decoder_scale_modules": true,
-      "num_classes": 2,
-      "rescale": true,
-      "backbone_bands": [
-        "BLUE",
-        "GREEN",
-        "RED",
-        "NIR_NARROW",
-        "SWIR_1",
-        "SWIR_2"
-      ],
-      "head_dropout": 0.1,
-      "necks": [
-        {
-          "name": "SelectIndices",
-          "indices": [
-            5,
-            11,
-            17,
-            23
-          ]
-        },
-        {
-          "name": "ReshapeTokensToImage"
-        }
-      ]
-    },
-    "optimizer_params" : {
-      "lr": 5.0e-05,
-      "betas": [0.9, 0.999],
-      "eps": [1.0e-08],
-      "weight_decay": 0.05,
-      "amsgrad": false,
-      "maximize": false,
-      "capturable": false,
-      "differentiable": false
-    },
-    "scheduler_params" : {
-        "T_max": 50,
-        "eta_min": 0,
-        "last_epoch": -1,
-        "verbose": "deprecated"
-    }
-  },
-  "torch_dtype": "float32"
-}
-"""
-# Temporarily creating the "config.json" for the model.
-# This is going to disappear once the correct config.json is available on HF
-with open(
-    os.path.join(os.path.dirname(__file__), "./model/config.json"), "w"
-) as config_file:
-    config_file.write(model_config)
 datamodule_config = {
    "bands": ["BLUE", "GREEN", "RED", "NIR_NARROW", "SWIR_1", "SWIR_2"],
    "batch_size": 16,
@@ -138,28 +43,24 @@ datamodule_config = {
 class PrithviMAE:
-    def __init__(self):
+    def __init__(self, model):
-        print("Initializing PrithviMAE model")
        self.model = LLM(
-            model=os.path.join(os.path.dirname(__file__), "./model"),
+            model=model, skip_tokenizer_init=True, dtype="float16", enforce_eager=True
-            skip_tokenizer_init=True,
-            dtype="float32",
        )
    def run(self, input_data, location_coords):
-        print("################ Running inference on vLLM ##############")
        # merge the inputs into one data structure
+        if input_data is not None and input_data.dtype == torch.float32:
+            input_data = input_data.to(torch.float16)
+            input_data = input_data[0]
        mm_data = {
-            "pixel_values": torch.empty(0) if input_data is None else input_data,
+            "pixel_values": input_data,
-            "location_coords": torch.empty(0)
+            "location_coords": location_coords,
-            if location_coords is None
-            else location_coords,
        }
        prompt = {"prompt_token_ids": [1], "multi_modal_data": mm_data}
        outputs = self.model.encode(prompt, use_tqdm=False)
-        print("################ Inference done (it took seconds)  ##############")
        return outputs[0].outputs.data
@@ -181,11 +82,12 @@ def process_channel_group(orig_img, channels):
    """
    Args:
        orig_img: torch.Tensor representing original image (reference)
-                  with shape = (bands, H, W).
+        with shape = (bands, H, W).
        channels: list of indices representing RGB channels.
    Returns:
-        torch.Tensor with shape (num_channels, height, width) for original image
+        torch.Tensor with shape (num_channels, height, width)
+        for original image
    """
    orig_img = orig_img[channels, ...]
@@ -260,10 +162,10 @@ def load_example(
    Args:
        file_paths: list of file paths .
-        mean: list containing mean values for each band in the images
+        mean: list containing mean values for each band in the
-              in *file_paths*.
+              images in *file_paths*.
-        std: list containing std values for each band in the images
+        std: list containing std values for each band in the
-             in *file_paths*.
+             images in *file_paths*.
    Returns:
        np.array containing created example
@@ -308,7 +210,7 @@ def load_example(
            print(f"Could not extract timestamp for {file} ({e})")
    imgs = np.stack(imgs, axis=0)  # num_frames, H, W, C
-    imgs = np.moveaxis(imgs, -1, 0).astype("float32")
+    imgs = np.moveaxis(imgs, -1, 0).astype("float32")  # C, num_frames, H, W
    imgs = np.expand_dims(imgs, axis=0)  # add batch di
    return imgs, temporal_coords, location_coords, metas
@@ -332,8 +234,10 @@ def run_model(
    )
    # Build sliding window
    batch_size = 1
-    batch = torch.tensor(input_data, device="cpu")
+    # batch = torch.tensor(input_data, device="cpu")
+    batch = torch.tensor(input_data)
    windows = batch.unfold(3, img_size, img_size).unfold(4, img_size, img_size)
    h1, w1 = windows.shape[3:5]
    windows = rearrange(
@@ -344,18 +248,16 @@ def run_model(
    num_batches = windows.shape[0] // batch_size if windows.shape[0] > batch_size else 1
    windows = torch.tensor_split(windows, num_batches, dim=0)
-    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    if temporal_coords:
-        temporal_coords = torch.tensor(temporal_coords, device=device).unsqueeze(0)
+        temporal_coords = torch.tensor(temporal_coords).unsqueeze(0)
    else:
        temporal_coords = None
    if location_coords:
-        location_coords = torch.tensor(location_coords[0], device=device).unsqueeze(0)
+        location_coords = torch.tensor(location_coords[0]).unsqueeze(0)
    else:
        location_coords = None
-    # Run model
+    # Run Prithvi-EO-V2-300M-TL-Sen1Floods11
    pred_imgs = []
    for x in windows:
        # Apply standardization
@@ -363,15 +265,7 @@ def run_model(
        x = datamodule.aug(x)["image"]
        with torch.no_grad():
-            x = x.to(device)
            pred = model.run(x, location_coords=location_coords)
-            if lightning_model:
-                pred_lightning = lightning_model(
-                    x, temporal_coords=temporal_coords, location_coords=location_coords
-                )
-                pred_lightning = pred_lightning.output.detach().cpu()
-                if not torch.equal(pred, pred_lightning):
-                    print("Inference output is not equal")
        y_hat = pred.argmax(dim=1)
        y_hat = torch.nn.functional.interpolate(
@@ -403,52 +297,18 @@ def run_model(
    return pred_imgs
-def parse_args():
-    parser = argparse.ArgumentParser("MAE run inference", add_help=False)
-    parser.add_argument(
-        "--data_file",
-        type=str,
-        default="./India_900498_S2Hand.tif",
-        help="Path to the file.",
-    )
-    parser.add_argument(
-        "--output_dir",
-        type=str,
-        default="output",
-        help="Path to the directory where to save outputs.",
-    )
-    parser.add_argument(
-        "--input_indices",
-        default=[1, 2, 3, 8, 11, 12],
-        type=int,
-        nargs="+",
-        help="0-based indices of the six Prithvi channels to be selected from the  "
-        "input. By default selects [1,2,3,8,11,12] for S2L1C data.",
-    )
-    parser.add_argument(
-        "--rgb_outputs",
-        action="store_true",
-        help="If present, output files will only contain RGB channels. "
-        "Otherwise, all bands will be saved.",
-    )
 def main(
    data_file: str,
+    model: str,
    output_dir: str,
    rgb_outputs: bool,
    input_indices: list[int] = None,
 ):
    os.makedirs(output_dir, exist_ok=True)
-    # Load model ---------------------------------------------------------------
+    model_obj = PrithviMAE(model=model)
-    model_obj = PrithviMAE()
    datamodule = generate_datamodule()
-    img_size = 256  # Size of Sen1Floods11
+    img_size = 512  # Size of Sen1Floods11
-    # Loading data -------------------------------------------------------------
    input_data, temporal_coords, location_coords, meta_data = load_example(
        file_paths=[data_file],
@@ -460,8 +320,6 @@ def main(
    if input_data.mean() > 1:
        input_data = input_data / 10000  # Convert to range 0-1
-    # Running model ------------------------------------------------------------
    channels = [
        datamodule_config["bands"].index(b) for b in ["RED", "GREEN", "BLUE"]
    ]  # BGR -> RGB
@@ -469,7 +327,6 @@ def main(
    pred = run_model(
        input_data, temporal_coords, location_coords, model_obj, datamodule, img_size
    )
    # Save pred
    meta_data.update(count=1, dtype="uint8", compress="lzw", nodata=0)
    pred_file = os.path.join(
@@ -487,6 +344,7 @@ def main(
        orig_img=torch.Tensor(input_data[0, :, 0, ...]),
        channels=channels,
    )
+    rgb_orig = rgb_orig.to(torch.float32)
    pred[pred == 0.0] = np.nan
    img_pred = rgb_orig * 0.7 + pred * 0.3
@@ -503,9 +361,10 @@ def main(
    # Save image rgb
    if rgb_outputs:
+        name_suffix = os.path.splitext(os.path.basename(data_file))[0]
        rgb_file = os.path.join(
            output_dir,
-            f"original_rgb_{os.path.splitext(os.path.basename(data_file))[0]}.tiff",
+            f"original_rgb_{name_suffix}.tiff",
        )
        save_geotiff(
            image=_convert_np_uint8(rgb_orig),
@@ -515,6 +374,42 @@ def main(
 if __name__ == "__main__":
-    args = parse_args()
+    parser = argparse.ArgumentParser("MAE run inference", add_help=False)
+    parser.add_argument(
+        "--data_file",
+        type=str,
+        default="./India_900498_S2Hand.tif",
+        help="Path to the file.",
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM",
+        help="Path to a checkpoint file to load from.",
+    )
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="output",
+        help="Path to the directory where to save outputs.",
+    )
+    parser.add_argument(
+        "--input_indices",
+        default=[1, 2, 3, 8, 11, 12],
+        type=int,
+        nargs="+",
+        help="""
+        0-based indices of the six Prithvi channels to be selected from the input.
+        By default selects [1,2,3,8,11,12] for S2L1C data.
+        """,
+    )
+    parser.add_argument(
+        "--rgb_outputs",
+        action="store_true",
+        help="If present, output files will only contain RGB channels. "
+        "Otherwise, all bands will be saved.",
+    )
+    args = parser.parse_args()
    main(**vars(args))
--- a/examples/offline_inference/qwen3_reranker.py
+++ b/examples/offline_inference/qwen3_reranker.py
@@ -17,13 +17,13 @@ model_name = "Qwen/Qwen3-Reranker-0.6B"
 # Models converted offline using this method can not only be more efficient
 # and support the vllm score API, but also make the init parameters more
 # concise, for example.
-# model = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", task="score")
+# llm = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", task="score")
 # If you want to load the official original version, the init parameters are
 # as follows.
-def get_model() -> LLM:
+def get_llm() -> LLM:
    """Initializes and returns the LLM model for Qwen3-Reranker."""
    return LLM(
        model=model_name,
@@ -76,8 +76,8 @@ def main() -> None:
    ]
    documents = [document_template.format(doc=doc, suffix=suffix) for doc in documents]
-    model = get_model()
+    llm = get_llm()
-    outputs = model.score(queries, documents)
+    outputs = llm.score(queries, documents)
    print("-" * 30)
    print([output.outputs.score for output in outputs])

--- a/examples/offline_inference/rlhf.py
+++ b/examples/offline_inference/rlhf.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
-a simple demonstration of RLHF with vLLM, inspired by
+Demonstrates reinforcement learning from human feedback (RLHF) using vLLM and Ray.
-the OpenRLHF framework https://github.com/OpenRLHF/OpenRLHF .
-It follows the design that, training processes and inference processes
+The script separates training and inference workloads onto distinct GPUs
-are different, and they live on different GPUs.
+so that Ray can manage process placement and inter-process communication.
-Training processes send prompts to inference processes to generate data,
+A Hugging Face Transformer model occupies GPU 0 for training, whereas a
-and also synchronize the weights of the model by broadcasting the weights
+tensor-parallel vLLM inference engine occupies GPUs 1–2.
-from the training process to the inference process.
-Note that this is a simple demonstration of one training instance and one
+The example performs the following steps:
-inference instance. In practice, there could be multiple training instances
-and multiple inference instances. For the full implementation, please refer
+* Load the training model on GPU 0.
-to the OpenRLHF framework.
+* Split the inference model across GPUs 1–2 using vLLM's tensor parallelism
+  and Ray placement groups.
+* Generate text from a list of prompts using the inference engine.
+* Update the weights of the training model and broadcast the updated weights
+  to the inference engine by using a Ray collective RPC group. Note that
+  for demonstration purposes we simply zero out the weights.
+For a production-ready implementation that supports multiple training and
+inference replicas, see the OpenRLHF framework:
+https://github.com/OpenRLHF/OpenRLHF
+This example assumes a single-node cluster with three GPUs, but Ray
+supports multi-node clusters. vLLM expects the GPUs to be used only for vLLM
+workloads. Residual GPU activity interferes with vLLM memory profiling and
+causes unexpected behavior.
 """
 import os
@@ -28,29 +42,27 @@ from vllm.utils import get_ip, get_open_port
 class MyLLM(LLM):
+    """Configure the vLLM worker for Ray placement group execution."""
    def __init__(self, *args, **kwargs):
-        # a hack to make the script work.
+        # Remove the top-level CUDA_VISIBLE_DEVICES variable set by Ray
-        # stop ray from manipulating CUDA_VISIBLE_DEVICES
+        # so that vLLM can manage its own device placement within the worker.
-        # at the top-level
        os.environ.pop("CUDA_VISIBLE_DEVICES", None)
        super().__init__(*args, **kwargs)
-"""
+# Load the OPT-125M model onto GPU 0 for the training workload.
-Start the training process, here we use huggingface transformers 
-as an example to hold a model on GPU 0.
-"""
 train_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
 train_model.to("cuda:0")
-"""
-Start the inference process, here we use vLLM to hold a model on GPU 1 and 
+# Initialize Ray and set the visible devices. The vLLM engine will
-GPU 2. For the details on how to use ray, please refer to the ray 
+# be placed on GPUs 1 and 2.
-documentation https://docs.ray.io/en/latest/ .
-"""
 os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"
 ray.init()
+# Create a placement group that reserves GPUs 1–2 for the vLLM inference engine.
+# Learn more about Ray placement groups:
+# https://docs.ray.io/en/latest/placement-groups.html
 pg_inference = placement_group([{"GPU": 1, "CPU": 0}] * 2)
 ray.get(pg_inference.ready())
 scheduling_inference = PlacementGroupSchedulingStrategy(
@@ -58,10 +70,9 @@ scheduling_inference = PlacementGroupSchedulingStrategy(
    placement_group_capture_child_tasks=True,
    placement_group_bundle_index=0,
 )
-"""
-launch the vLLM inference engine.
+# Launch the vLLM inference engine. The `enforce_eager` flag reduces
-here we use `enforce_eager` to reduce the start time.
+# start-up latency.
-"""
 llm = ray.remote(
    num_cpus=0,
    num_gpus=0,
@@ -74,7 +85,7 @@ llm = ray.remote(
    distributed_executor_backend="ray",
 )
-# Generate texts from the prompts.
+# Generate text from the prompts.
 prompts = [
    "Hello, my name is",
    "The president of the United States is",
@@ -93,8 +104,8 @@ for output in outputs:
    print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
    print("-" * 50)
-# set up the communication between the training process
+# Set up the communication channel between the training process and the
-# and the inference engine.
+# inference engine.
 master_address = get_ip()
 master_port = get_open_port()
@@ -107,21 +118,23 @@ model_update_group = stateless_init_process_group(
 )
 ray.get(handle)
-# simulate training, modify the weights of the model.
+# Simulate a training step by zeroing out all model weights.
+# In a real RLHF training loop the weights would be updated using the gradient
+# from an RL objective such as PPO on a reward model.
 for name, p in train_model.named_parameters():
    p.data.zero_()
-# sync weight from the training process to the inference engine.
+# Synchronize the updated weights to the inference engine.
 for name, p in train_model.named_parameters():
    handle = llm.collective_rpc.remote("update_weight", args=(name, p.dtype, p.shape))
    model_update_group.broadcast(p, src=0, stream=torch.cuda.current_stream())
    ray.get(handle)
-# check if the weights are updated.
+# Verify that the inference weights have been updated.
 assert all(ray.get(llm.collective_rpc.remote("check_weights_changed")))
-# use the updated model to generate texts, they will be nonsense
+# Generate text with the updated model. The output is expected to be nonsense
-# because the weights are all zeros.
+# because the weights are zero.
 outputs_updated = ray.get(llm.generate.remote(prompts, sampling_params))
 print("-" * 50)
 for output in outputs_updated:

--- a/examples/offline_inference/rlhf_colocate.py
+++ b/examples/offline_inference/rlhf_colocate.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
-a simple demonstration to show how to co-locate
+Demonstrates how to co-locate a vLLM inference worker and training
-vLLM worker with training actors on the same GPUs,
+actors on the same set of GPUs for reinforcement learning from human feedback
-for RLHF-like applications.
+(RLHF) workloads.
-The key points:
- Control the placement of the vLLM workers with Ray, by setting
+Ray serves as the distributed execution framework in this example. Ray
-    VLLM_RAY_PER_WORKER_GPUS and VLLM_RAY_BUNDLE_INDICES properly.
+placement groups allocate both training actors and vLLM workers to the
- Use cuda-ipc to pass tensors, since NCCL does not work when we have
+same GPU bundles, enabling fast, in-GPU communication between the two
-    multiple processes on the same GPU.
+components.
+The script shows how to do the following:
+* Configure environment variables (`VLLM_RAY_PER_WORKER_GPUS` and
+  `VLLM_RAY_BUNDLE_INDICES`) so that vLLM workers land on the desired
+  devices.
+* Exchange tensors between processes by means of CUDA inter-process
+  communication (IPC). CUDA IPC sidesteps NCCL limitations that occur
+  when multiple processes share a single GPU.
+Note that this example assumes a single-node cluster with four GPUs, but Ray
+supports multi-node clusters. vLLM expects exclusive use of the GPUs during
+its initialization for memory profiling. Residual GPU activity interferes
+with vLLM memory profiling and causes unexpected behavior.
+Learn more about Ray placement groups:
+https://docs.ray.io/en/latest/placement-groups.html
 """
 import os
@@ -22,13 +39,24 @@ from vllm import LLM
 class MyLLM(LLM):
-    def __init__(self, *args, bundle_indices: list, **kwargs):
+    """Configure the vLLM worker for Ray placement group execution.
-        # a hack to make the script work.
-        # stop ray from manipulating CUDA_VISIBLE_DEVICES
+    The constructor sets environment variables that allow multiple vLLM
-        # at the top-level
+    workers to share a single physical GPU and that encode the bundle
+    indices assigned by the placement group.
+    Args:
+        *args: Positional arguments forwarded to `vllm.LLM`.
+        bundle_indices (list[int]): Placement-group bundle indices
+            assigned to this worker.
+        **kwargs: Keyword arguments forwarded to `vllm.LLM`.
+    """
+    def __init__(self, *args, bundle_indices: list[int], **kwargs):
+        # Prevent Ray from manipulating the top-level CUDA_VISIBLE_DEVICES variable
+        # so that vLLM can manage its own device placement inside the worker.
        os.environ.pop("CUDA_VISIBLE_DEVICES", None)
-        # every worker will use 0.4 GPU, so that we can schedule
+        # Each worker uses 0.4 GPU so that two instances fit on the same GPUs.
-        # 2 instances on the same GPUs.
        os.environ["VLLM_RAY_PER_WORKER_GPUS"] = "0.4"
        os.environ["VLLM_RAY_BUNDLE_INDICES"] = ",".join(map(str, bundle_indices))
        print(f"creating LLM with bundle_indices={bundle_indices}")
@@ -36,17 +64,25 @@ class MyLLM(LLM):
 class RayTrainingActor:
+    """Training actor that hosts a Facebook OPT-125M model from Hugging Face.
+    The model is loaded onto the first GPU assigned to this actor, and the
+    actor exposes CUDA IPC handles so that colocated vLLM workers can map
+    tensors directly.
+    """
    def __init__(self):
-        # ray will set CUDA_VISIBLE_DEVICES to the assigned GPUs
+        # Ray sets CUDA_VISIBLE_DEVICES to the GPUs assigned to this actor.
        from transformers import AutoModelForCausalLM
        self.model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
        self.model.to("cuda:0")
+        # Zero out all the parameters.
        for name, p in self.model.named_parameters():
            p.data.zero_()
        torch.cuda.synchronize()
-        # the argument for get_device_uuid is the index
+        # The argument for `get_device_uuid` is the index of the GPU in the
-        # of the GPU in the visible devices.
+        # list of visible devices.
        from vllm.platforms import current_platform
        self.device_uuid = current_platform.get_device_uuid(0)
@@ -59,23 +95,23 @@ class RayTrainingActor:
        data = {}
        for name, p in self.model.named_parameters():
-            # the training actor might only have a subset of the weights
+            # A training actor might hold only a subset of the weights and may
-            # and need to all-gather the weights from all the actors.
+            # need to gather weights from other actors. For demonstration
-            # for demonstration, here we assume all training actors have
+            # purposes, each training actor owns the full weight set.
-            # the full weights.
            data[name] = reduce_tensor(p.detach())
        return {self.device_uuid: data}
-# ray manages 4 GPUs
+# Ray manages four GPUs.
 os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
 ray.init()
-# we want to co-locate vLLM instance and the training actor
+# Co-locate vLLM instances and training actors on the same set of GPUs:
-# on the same set of GPUs.
+#   * GPU 0 and 1: training actor 0, training actor 1, and vLLM instance 0
-# the placement plan is as follows:
+#     (tensor parallelism = 2).
-# GPU 0 and 1: training actor 0, 1, and vLLM instance 0 (with TP=2)
+#   * GPU 2 and 3: training actor 2, training actor 3, and vLLM instance 1
-# GPU 2 and 3: training actor 2, 3, and vLLM instance 1 (with TP=2)
+#     (tensor parallelism = 2).
 pg = placement_group([{"GPU": 1, "CPU": 0}] * 4)
 ray.get(pg.ready())
@@ -104,10 +140,8 @@ for bundle_index, training_actor in enumerate(training_actors):
    training_actor_device_ids.append(device_id)
 for i, bundle_indices in enumerate([[0, 1], [2, 3]]):
-    # IMPORTANT: when creating vLLM instances, we need to
+    # Use the following syntax instead of the @ray.remote decorator so that
-    # make sure there are no GPU activities on the target GPUs,
+    # the placement group is customized for each bundle.
-    # otherwise, they will interfere with the vLLM memory profiling,
-    # and cause unexpected behaviors.
    llm = ray.remote(
        num_cpus=0,
        num_gpus=0,
@@ -125,8 +159,8 @@ for i, bundle_indices in enumerate([[0, 1], [2, 3]]):
        bundle_indices=bundle_indices,
    )
    inference_engines.append(llm)
-    # don't call any method on the inference engine here,
+    # Do not call any method on the inference engine at this point; the call
-    # otherwise it will block until the vLLM instance is created.
+    # blocks until the vLLM instance finishes initialization.
 for i, llm in enumerate(inference_engines):
    inference_engine_device_ids.append(
@@ -134,26 +168,25 @@ for i, llm in enumerate(inference_engines):
    )
    print(f"inference engine {i} is on {inference_engine_device_ids[-1]}")
-# check the placement
+# Verify placement: the first two training actors share the same GPUs as
-# the first two training actors should be
+# the first inference engine.
-# on the same GPUs as the first inference engine
 assert training_actor_device_ids[:2] == inference_engine_device_ids[0]
-# the last two training actors should be
+# Verify placement: the last two training actors share the same GPUs as
-# on the same GPUs as the second inference engine
+# the second inference engine.
 assert training_actor_device_ids[2:] == inference_engine_device_ids[1]
-print("gather all the IPC handles from the training actors")
+print("Gather all the IPC handles from the training actors.")
 ipc_handles = {}
 for actor in training_actors:
    ipc_handles.update(ray.get(actor.get_weight_ipc_handles.remote()))
-print("update the weights of the inference engines")
+print("Update the weights of the inference engines.")
 for llm in inference_engines:
    ray.get(
        llm.collective_rpc.remote(
            "update_weights_from_ipc_handles", args=(ipc_handles,)
        )
    )
-print("check if the weights are updated")
+print("Check if the weights are updated.")
 for llm in inference_engines:
    assert ray.get(llm.collective_rpc.remote("check_weights_changed", args=tuple()))
--- a/examples/offline_inference/skip_loading_weights_in_engine_init.py
+++ b/examples/offline_inference/skip_loading_weights_in_engine_init.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from vllm import LLM, RequestOutput, SamplingParams
+# Sample prompts.
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+# Create a sampling params object.
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+def print_prompts_and_outputs(outputs: list[RequestOutput]) -> None:
+    """Print every prompt with its first completion, framed by rule lines."""
+    rule = "-" * 60
+    print(rule)
+    for request_output in outputs:
+        prompt_text = request_output.prompt
+        completion_text = request_output.outputs[0].text
+        print(f"Prompt:    {prompt_text!r}")
+        print(f"Output:    {completion_text!r}")
+        print(rule)
+def main():
+    """Generate once with dummy weights, then reload real weights and rerun.
+    The engine is built with `load_format="dummy"`, so the first generation
+    is expected to be meaningless. The load format is then switched to
+    `auto` on the workers, the weights are reloaded in place, and the same
+    prompts are generated again for comparison.
+    """
+    # Create an LLM without loading real weights
+    llm = LLM(
+        model="Qwen/Qwen3-0.6B",
+        load_format="dummy",
+        enforce_eager=True,
+        tensor_parallel_size=4,
+    )
+    outputs = llm.generate(prompts, sampling_params)
+    print("\nOutputs do not make sense:")
+    print_prompts_and_outputs(outputs)
+    # Update load format from `dummy` to `auto`
+    llm.collective_rpc(
+        "update_config", args=({"load_config": {"load_format": "auto"}},)
+    )
+    # Now reload real weights inplace
+    llm.collective_rpc("reload_weights")
+    # Check outputs make sense
+    outputs = llm.generate(prompts, sampling_params)
+    print("\nOutputs make sense after loading real weights:")
+    print_prompts_and_outputs(outputs)
+if __name__ == "__main__":
+    main()
--- a/examples/offline_inference/spec_decode.py
+++ b/examples/offline_inference/spec_decode.py
@@ -84,6 +84,7 @@ def main():
        gpu_memory_utilization=0.8,
        speculative_config=speculative_config,
        disable_log_stats=False,
+        max_model_len=16384,
    )
    sampling_params = SamplingParams(temperature=args.temp, max_tokens=args.output_len)

--- a/examples/offline_inference/vision_language.py
+++ b/examples/offline_inference/vision_language.py
@@ -429,6 +429,44 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
    )
+# Nemotron_VL
+def run_nemotron_vl(questions: list[str], modality: str) -> ModelRequestData:
+    """Build engine arguments and chat-formatted prompts for Nemotron Nano VL.
+    Only the "image" modality is accepted (enforced by an assert). Each
+    question is prefixed with the `<image>` placeholder and rendered through
+    the tokenizer's chat template.
+    """
+    model_name = "nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=8192,
+        limit_mm_per_prompt={modality: 1},
+    )
+    assert modality == "image"
+    placeholder = "<image>"
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+    # One single-turn conversation per question.
+    messages = [
+        [{"role": "user", "content": f"{placeholder}\n{question}"}]
+        for question in questions
+    ]
+    prompts = tokenizer.apply_chat_template(
+        messages, tokenize=False, add_generation_prompt=True
+    )
+    # Stop tokens for InternVL.
+    # Model variants may have different stop tokens;
+    # please refer to the model card for the correct "stop words":
+    # https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
+    # NOTE(review): this comment and link refer to InternVL; confirm the
+    # list below against the Nemotron model card.
+    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
+    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
+    # Drop entries for which no token ID was returned (None).
+    stop_token_ids = [token_id for token_id in stop_token_ids if token_id is not None]
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompts=prompts,
+        stop_token_ids=stop_token_ids,
+    )
 # Keye-VL
 def run_keye_vl(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "Kwai-Keye/Keye-VL-8B-Preview"
@@ -1186,6 +1224,7 @@ model_example_map = {
    "h2ovl_chat": run_h2ovl,
    "idefics3": run_idefics3,
    "internvl_chat": run_internvl,
+    "nemotron_vl": run_nemotron_vl,
    "keye_vl": run_keye_vl,
    "kimi_vl": run_kimi_vl,
    "llava": run_llava,

--- a/examples/offline_inference/vision_language_embedding.py
+++ b/examples/offline_inference/vision_language_embedding.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 This example shows how to use vLLM for running offline inference with
-the correct prompt format on vision language models for multimodal embedding.
+the correct prompt format on vision language models for multimodal pooling.
 For most models, the prompt format should follow corresponding examples
 on HuggingFace model repository.
@@ -15,6 +15,7 @@ from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args
 from PIL.Image import Image
 from vllm import LLM, EngineArgs
+from vllm.entrypoints.score_utils import ScoreMultiModalParam
 from vllm.multimodal.utils import fetch_image
 from vllm.utils import FlexibleArgumentParser
@@ -35,14 +36,22 @@ class TextImageQuery(TypedDict):
    image: Image
-QueryModality = Literal["text", "image", "text+image"]
+class TextImagesQuery(TypedDict):
-Query = Union[TextQuery, ImageQuery, TextImageQuery]
+    modality: Literal["text+images"]
+    text: str
+    image: ScoreMultiModalParam
+QueryModality = Literal["text", "image", "text+image", "text+images"]
+Query = Union[TextQuery, ImageQuery, TextImageQuery, TextImagesQuery]
 class ModelRequestData(NamedTuple):
    engine_args: EngineArgs
-    prompt: str
+    prompt: Optional[str] = None
-    image: Optional[Image]
+    image: Optional[Image] = None
+    query: Optional[str] = None
+    documents: Optional[ScoreMultiModalParam] = None
 def run_e5_v(query: Query) -> ModelRequestData:
@@ -107,6 +116,29 @@ def run_vlm2vec(query: Query) -> ModelRequestData:
    )
+def run_jinavl_reranker(query: Query) -> ModelRequestData:
+    """Build the scoring request for the jina-reranker-m0 example.
+    Only "text+images" queries are accepted; any other modality raises
+    ValueError. The query text becomes the scoring query and the image
+    payload becomes the documents.
+    """
+    modality = query["modality"]
+    if modality != "text+images":
+        raise ValueError(f"Unsupported query modality: '{modality}'")
+    pixel_bounds = {
+        "min_pixels": 3136,
+        "max_pixels": 602112,
+    }
+    return ModelRequestData(
+        engine_args=EngineArgs(
+            model="jinaai/jina-reranker-m0",
+            task="score",
+            max_model_len=32768,
+            trust_remote_code=True,
+            mm_processor_kwargs=pixel_bounds,
+            limit_mm_per_prompt={"image": 1},
+        ),
+        query=query["text"],
+        documents=query["image"],
+    )
 def get_query(modality: QueryModality):
    if modality == "text":
        return TextQuery(modality="text", text="A dog sitting in the grass")
@@ -128,6 +160,28 @@ def get_query(modality: QueryModality):
            ),
        )
+    if modality == "text+images":
+        return TextImagesQuery(
+            modality="text+images",
+            text="slm markdown",
+            image={
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
+                        },
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
+                        },
+                    },
+                ]
+            },
+        )
    msg = f"Modality {modality} is not supported."
    raise ValueError(msg)
@@ -162,16 +216,31 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
        print("-" * 50)
+def run_score(model: str, modality: QueryModality, seed: Optional[int]):
+    """Score the example query against its documents and print the scores."""
+    example_query = get_query(modality)
+    req_data = model_example_map[model](example_query)
+    # The seed is layered on top of the example's engine arguments.
+    llm_kwargs = {**asdict(req_data.engine_args), "seed": seed}
+    llm = LLM(**llm_kwargs)
+    results = llm.score(req_data.query, req_data.documents)
+    rule = "-" * 30
+    print(rule)
+    print([result.outputs.score for result in results])
+    print(rule)
 model_example_map = {
    "e5_v": run_e5_v,
    "vlm2vec": run_vlm2vec,
+    "jinavl_reranker": run_jinavl_reranker,
 }
 def parse_args():
    parser = FlexibleArgumentParser(
        description="Demo on using vLLM for offline inference with "
-        "vision language models for multimodal embedding"
+        "vision language models for multimodal pooling tasks."
    )
    parser.add_argument(
        "--model-name",
@@ -181,6 +250,14 @@ def parse_args():
        choices=model_example_map.keys(),
        help="The name of the embedding model.",
    )
+    parser.add_argument(
+        "--task",
+        "-t",
+        type=str,
+        default="embedding",
+        choices=["embedding", "scoring"],
+        help="The task type.",
+    )
    parser.add_argument(
        "--modality",
        type=str,
@@ -198,7 +275,12 @@ def parse_args():
 def main(args: Namespace):
-    run_encode(args.model_name, args.modality, args.seed)
+    if args.task == "embedding":
+        run_encode(args.model_name, args.modality, args.seed)
+    elif args.task == "scoring":
+        run_score(args.model_name, args.modality, args.seed)
+    else:
+        raise ValueError(f"Unsupported task: {args.task}")
 if __name__ == "__main__":

--- a/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh
+++ b/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_example_p2p_nccl_xpyd.sh
@@ -93,6 +93,7 @@ ensure_python_library_installed() {
 cleanup() {
    echo "Stopping everything…"
    trap - INT TERM        # prevent re-entrancy
+    pkill -9 -f "disagg_proxy_p2p_nccl_xpyd.py"
    kill -- -$$            # negative PID  ==  "this whole process-group"
    wait                   # reap children so we don't leave zombies
    exit 0

--- a/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py
+++ b/examples/online_serving/disaggregated_serving_p2p_nccl_xpyd/disagg_proxy_p2p_nccl_xpyd.py
@@ -4,7 +4,9 @@
 import os
 import socket
 import threading
+import time
 import uuid
+from typing import Any
 import aiohttp
 import msgpack
@@ -12,12 +14,25 @@ import zmq
 from quart import Quart, make_response, request
 count = 0
-prefill_instances: dict[str, str] = {}  # http_address: zmq_address
+prefill_instances: dict[str, Any] = {}  # http_address: (zmq_address, stamp)
-decode_instances: dict[str, str] = {}  # http_address: zmq_address
+decode_instances: dict[str, Any] = {}  # http_address: (zmq_address, stamp)
 prefill_cv = threading.Condition()
 decode_cv = threading.Condition()
+DEFAULT_PING_SECONDS = 5
+def _remove_oldest_instances(instances: dict[str, Any]) -> None:
+    """Drop expired entries from the front of `instances`, in place.
+    Each value is indexed as (zmq_address, stamp). Entries are examined in
+    the dict's iteration order and removal stops at the first entry whose
+    stamp is still later than the current time.
+    """
+    while instances:
+        head_key = next(iter(instances))
+        entry = instances[head_key]
+        if entry[1] > time.time():
+            return
+        print(f"🔴Remove [HTTP:{head_key}, ZMQ:{entry[0]}, stamp:{entry[1]}]")
+        del instances[head_key]
 def _listen_for_register(poller, router_socket):
    while True:
@@ -31,12 +46,23 @@ def _listen_for_register(poller, router_socket):
                global prefill_instances
                global prefill_cv
                with prefill_cv:
-                    prefill_instances[data["http_address"]] = data["zmq_address"]
+                    node = prefill_instances.pop(data["http_address"], None)
+                    prefill_instances[data["http_address"]] = (
+                        data["zmq_address"],
+                        time.time() + DEFAULT_PING_SECONDS,
+                    )
+                    _remove_oldest_instances(prefill_instances)
            elif data["type"] == "D":
                global decode_instances
                global decode_cv
                with decode_cv:
-                    decode_instances[data["http_address"]] = data["zmq_address"]
+                    node = decode_instances.pop(data["http_address"], None)
+                    decode_instances[data["http_address"]] = (
+                        data["zmq_address"],
+                        time.time() + DEFAULT_PING_SECONDS,
+                    )
+                    _remove_oldest_instances(decode_instances)
            else:
                print(
                    "Unexpected, Received message from %s, data: %s",
@@ -44,6 +70,9 @@ def _listen_for_register(poller, router_socket):
                    data,
                )
+            if node is None:
+                print(f"🔵Add [HTTP:{data['http_address']}, ZMQ:{data['zmq_address']}]")
 def start_service_discovery(hostname, port):
    if not hostname:
@@ -105,12 +134,14 @@ async def handle_request():
        with prefill_cv:
            prefill_list = list(prefill_instances.items())
            prefill_addr, prefill_zmq_addr = prefill_list[count % len(prefill_list)]
+            prefill_zmq_addr = prefill_zmq_addr[0]
        global decode_instances
        global decode_cv
        with decode_cv:
            decode_list = list(decode_instances.items())
            decode_addr, decode_zmq_addr = decode_list[count % len(decode_list)]
+            decode_zmq_addr = decode_zmq_addr[0]
        print(
            f"handle_request count: {count}, [HTTP:{prefill_addr}, "

--- a/examples/online_serving/elastic_ep/bench.sh
+++ b/examples/online_serving/elastic_ep/bench.sh
+#!/bin/bash
+# Run `vllm bench serve` against a server, with defaults that can be
+# overridden from the command line.
+MODEL_NAME="deepseek-ai/DeepSeek-V2-Lite"
+LOCAL_MODEL_PATH="/models/models--deepseek-ai--DeepSeek-V2-Lite/snapshots/604d5664dddd88a0433dbae533b7fe9472482de0"
+HOST="localhost"
+PORT=8006
+NUM_PROMPTS=20
+REQUEST_RATE=5
+# Abort when a value-taking option is the last argument. Without this check
+# `shift 2` fails without shifting and the parsing loop never terminates.
+require_value() {
+    if [[ $# -lt 2 ]]; then
+        echo "Missing value for option: $1"
+        echo "Use -h or --help for usage information"
+        exit 1
+    fi
+}
+# Parse command line arguments
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --model)
+            require_value "$@"
+            MODEL_NAME="$2"
+            shift 2
+            ;;
+        --local-model)
+            MODEL_NAME="$LOCAL_MODEL_PATH"
+            shift
+            ;;
+        --host)
+            require_value "$@"
+            HOST="$2"
+            shift 2
+            ;;
+        --port)
+            require_value "$@"
+            PORT="$2"
+            shift 2
+            ;;
+        --num-prompts)
+            require_value "$@"
+            NUM_PROMPTS="$2"
+            shift 2
+            ;;
+        --request-rate)
+            require_value "$@"
+            REQUEST_RATE="$2"
+            shift 2
+            ;;
+        -h|--help)
+            echo "Usage: $0 [OPTIONS]"
+            echo "Options:"
+            echo "  --model MODEL_NAME           Set model name or path (default: deepseek-ai/DeepSeek-V2-Lite)"
+            echo "  --local-model                Use local model path (convenience option)"
+            echo "  --host HOST                  Set server host (default: localhost)"
+            echo "  --port PORT                  Set server port (default: 8006)"
+            echo "  --num-prompts N              Set number of prompts (default: 20)"
+            echo "  --request-rate RATE          Set request rate (default: 5)"
+            echo "  -h, --help                   Show this help message"
+            exit 0
+            ;;
+        *)
+            echo "Unknown option: $1"
+            echo "Use -h or --help for usage information"
+            exit 1
+            ;;
+    esac
+done
+# Quote every expansion so values containing spaces stay single arguments.
+vllm bench serve \
+    --model "$MODEL_NAME" \
+    --host "$HOST" \
+    --port "$PORT" \
+    --num-prompts "$NUM_PROMPTS" \
+    --request-rate "$REQUEST_RATE"
--- a/examples/online_serving/elastic_ep/scale.py
+++ b/examples/online_serving/elastic_ep/scale.py
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import argparse
+import json
+import sys
+import requests
+def scale(host, port, new_dp_size):
+    """POST a new data-parallel size to the server's /scale_elastic_ep endpoint.
+    Returns True when the server answers with HTTP 200, and False for any
+    other status code or when the request raises a RequestException.
+    """
+    url = f"http://{host}:{port}/scale_elastic_ep"
+    payload = {"new_data_parallel_size": new_dp_size}
+    print(f"Sending scale request to {url}")
+    print(f"Payload: {json.dumps(payload, indent=2)}")
+    try:
+        response = requests.post(
+            url,
+            json=payload,
+            headers={"Content-Type": "application/json"},
+            timeout=300,
+        )
+        print(f"Status Code: {response.status_code}")
+        print(f"Response: {response.text}")
+        succeeded = response.status_code == 200
+        if succeeded:
+            print("Scale up/down request successful!")
+        else:
+            print("Scale up/down request failed!")
+        return succeeded
+    except requests.exceptions.RequestException as e:
+        print(f"Request failed: {e}")
+        return False
+def main():
+    """Parse the CLI options, send the scale request, and set the exit code.
+    The process exits with status 0 when `scale` returns True and 1 otherwise.
+    """
+    parser = argparse.ArgumentParser(description="Test scale up/down functionality")
+    parser.add_argument("--host", default="localhost", help="API server host")
+    parser.add_argument("--port", type=int, default=8006, help="API server port")
+    parser.add_argument(
+        "--new-dp-size", type=int, default=2, help="New data parallel size"
+    )
+    args = parser.parse_args()
+    success = scale(args.host, args.port, args.new_dp_size)
+    sys.exit(0 if success else 1)
+if __name__ == "__main__":
+    main()