Merge tag 'v0.10.2rc2' into v0.10.2rc2-ori

38d80967 · zhuwenwen · 33650733 · 880c741b · 38d80967 · 38d80967
Commit 38d80967 authored Sep 12, 2025 by zhuwenwen
20 changed files
--- a/docs/usage/v1_guide.md
+++ b/docs/usage/v1_guide.md
@@ -83,7 +83,7 @@ based on assigned priority, with FCFS as a tie-breaker), configurable via the
 | Model Type                  | Status                                                                             |
 |-----------------------------|------------------------------------------------------------------------------------|
 | **Decoder-only Models**     | <nobr>🚀 Optimized</nobr>                                                          |
-| **Encoder-Decoder Models**  | <nobr>🟠 Delayed</nobr>                                                            |
+| **Encoder-Decoder Models**  | <nobr>🟢 Whisper only</nobr>                                                       |
 | **Embedding Models**        | <nobr>🟢 Functional</nobr>                                                         |
 | **Mamba Models**            | <nobr>🟢 (Mamba-2), 🟢 (Mamba-1)</nobr>                                            |
 | **Multimodal Models**       | <nobr>🟢 Functional</nobr>                                                         |
@@ -110,7 +110,7 @@ Models using selective state-space mechanisms instead of standard transformer at
 Models that use Mamba-2 and Mamba-1 layers (e.g., `Mamba2ForCausalLM`, `MambaForCausalLM`,`FalconMambaForCausalLM`) are supported.

 Hybrid models that combine Mamba-2 and Mamba-1 layers with standard attention layers are also supported (e.g., `BambaForCausalLM`,
-`Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM` and `GraniteMoeHybridForCausalLM`, `JambaForCausalLM`).
+`Zamba2ForCausalLM`, `NemotronHForCausalLM`, `FalconH1ForCausalLM`, `GraniteMoeHybridForCausalLM`, `JambaForCausalLM` and `Plamo2ForCausalLM`).

 Hybrid models with mechanisms different to Mamba are also supported (e.g, `MiniMaxText01ForCausalLM`, `MiniMaxM1ForCausalLM`, `Lfm2ForCausalLM`).

@@ -118,8 +118,9 @@ Please note that prefix caching is not yet supported for any of the above models

 #### Encoder-Decoder Models

-Models requiring cross-attention between separate encoder and decoder (e.g., `BartForConditionalGeneration`, `MllamaForConditionalGeneration`)
-are not yet supported.
+Whisper is supported. Other models requiring cross-attention between separate
+encoder and decoder (e.g., `BartForConditionalGeneration`,
+`MllamaForConditionalGeneration`) are not yet supported.

 ### Features


--- a/examples/offline_inference/audio_language.py
+++ b/examples/offline_inference/audio_language.py
@@ -117,7 +117,7 @@ def run_gemma3n(question: str, audio_count: int) -> ModelRequestData:

 # Granite Speech
 def run_granite_speech(question: str, audio_count: int) -> ModelRequestData:
-    # NOTE - the setting in this example are somehat different than what is
+    # NOTE - the settings in this example are somewhat different from what is
    # optimal for granite speech, and it is generally recommended to use beam
    # search. Check the model README for suggested settings.
    # https://huggingface.co/ibm-granite/granite-speech-3.3-8b
@@ -146,6 +146,36 @@ def run_granite_speech(question: str, audio_count: int) -> ModelRequestData:
    )


+# MiDashengLM
+def run_midashenglm(question: str, audio_count: int) -> ModelRequestData:
+    """Build the engine arguments and chat prompt for MiDashengLM."""
+    model_name = "mispeech/midashenglm-7b"
+
+    engine_args = EngineArgs(
+        model=model_name,
+        trust_remote_code=True,
+        max_model_len=4096,
+        max_num_seqs=5,
+        limit_mm_per_prompt={"audio": audio_count},
+    )
+
+    # One <|audio_bos|><|AUDIO|><|audio_eos|> placeholder per audio input.
+    audio_in_prompt = "<|audio_bos|><|AUDIO|><|audio_eos|>" * audio_count
+
+    default_system = "You are a helpful language and speech assistant."
+
+    prompt = (
+        f"<|im_start|>system\n{default_system}<|im_end|>\n"
+        "<|im_start|>user\n"
+        f"{audio_in_prompt}{question}<|im_end|>\n"
+        "<|im_start|>assistant\n"
+    )
+    return ModelRequestData(
+        engine_args=engine_args,
+        prompt=prompt,
+    )
+
+
 # MiniCPM-O
 def run_minicpmo(question: str, audio_count: int) -> ModelRequestData:
    model_name = "openbmb/MiniCPM-o-2_6"
@@ -352,6 +382,7 @@ model_example_map = {
    "voxtral": run_voxtral,
    "gemma3n": run_gemma3n,
    "granite_speech": run_granite_speech,
+    "midashenglm": run_midashenglm,
    "minicpmo": run_minicpmo,
    "phi4_mm": run_phi4mm,
    "phi4_multimodal": run_phi4_multimodal,

--- a/examples/offline_inference/chat_with_tools.py
+++ b/examples/offline_inference/chat_with_tools.py
@@ -143,5 +143,5 @@ outputs = llm.chat(messages, sampling_params, tools=tools)

 print(outputs[0].outputs[0].text.strip())
 # yields
-#   'The weather in Dallas, TX is 85 degrees fahrenheit. '
+#   'The weather in Dallas, TX is 85 degrees Fahrenheit. '
 #   'It is partly cloudly, with highs in the 90's.'
--- a/examples/offline_inference/data_parallel.py
+++ b/examples/offline_inference/data_parallel.py
@@ -87,6 +87,11 @@ def parse_args():
        default=0.8,
        help=("Fraction of GPU memory vLLM is allowed to allocate (0.0, 1.0]."),
    )
+    parser.add_argument(
+        "--compilation-config",
+        type=int,
+        help=("Compilation optimization (O) level 0-3."),
+    )
    parser.add_argument(
        "--quantization",
        type=str,
@@ -106,6 +111,7 @@ def main(
    trust_remote_code,
    max_num_seqs,
    max_model_len,
+    compilation_config,
    gpu_memory_utilization,
    quantization,
 ):
@@ -162,6 +168,7 @@ def main(
        max_model_len=max_model_len,
        gpu_memory_utilization=gpu_memory_utilization,
        quantization=quantization,
+        compilation_config=compilation_config,
    )
    outputs = llm.generate(prompts, sampling_params)
    # Print the outputs.
@@ -218,6 +225,7 @@ if __name__ == "__main__":
                args.trust_remote_code,
                args.max_num_seqs,
                args.max_model_len,
+                args.compilation_config,
                args.gpu_memory_utilization,
                args.quantization,
            ),

--- a/examples/offline_inference/disaggregated_prefill.py
+++ b/examples/offline_inference/disaggregated_prefill.py
@@ -30,12 +30,12 @@ def run_prefill(prefill_done):
    ]
    sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)

-    # Using PyNcclConnector to transmit KV caches between vLLM instances.
+    # Using P2pNcclConnector to transmit KV caches between vLLM instances.
    # This instance is the prefill node (kv_producer, rank 0).
    # The number of parallel instances for KV cache transfer is set to 2,
-    # as required for PyNcclConnector.
+    # as required for P2pNcclConnector.
    ktc = KVTransferConfig(
-        kv_connector="PyNcclConnector",
+        kv_connector="P2pNcclConnector",
        kv_role="kv_producer",
        kv_rank=0,
        kv_parallel_size=2,
@@ -74,12 +74,12 @@ def run_decode(prefill_done):
    ]
    sampling_params = SamplingParams(temperature=0, top_p=0.95)

-    # Using PyNcclConnector to transmit KV caches between vLLM instances.
+    # Using P2pNcclConnector to transmit KV caches between vLLM instances.
    # This instance is the decode node (kv_consumer, rank 1).
    # The number of parallel instances for KV cache transfer is set to 2,
-    # as required for PyNcclConnector.
+    # as required for P2pNcclConnector.
    ktc = KVTransferConfig(
-        kv_connector="PyNcclConnector",
+        kv_connector="P2pNcclConnector",
        kv_role="kv_consumer",
        kv_rank=1,
        kv_parallel_size=2,

--- a/examples/offline_inference/encoder_decoder.py
+++ b/examples/offline_inference/encoder_decoder.py
@@ -5,6 +5,8 @@ Demonstrate prompting of text-to-text
 encoder/decoder models, specifically BART and mBART.

 This script is refactored to allow model selection via command-line arguments.
+
+NOTE: This example is not yet supported in V1.
 """

 import argparse

--- a/examples/offline_inference/encoder_decoder_multimodal.py
+++ b/examples/offline_inference/encoder_decoder_multimodal.py
@@ -5,6 +5,7 @@ This example shows how to use vLLM for running offline inference with
 the explicit/implicit prompt format on enc-dec LMMs for text generation.
 """

+import os
 import time
 from collections.abc import Sequence
 from dataclasses import asdict
@@ -130,6 +131,8 @@ def run_mllama():


 def run_whisper():
+    os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
    engine_args = EngineArgs(
        model="openai/whisper-large-v3-turbo",
        max_model_len=448,

--- a/examples/offline_inference/logits_processor.py
+++ b/examples/offline_inference/logits_processor.py
--- a/examples/offline_inference/logits_processor/custom_req.py
+++ b/examples/offline_inference/logits_processor/custom_req.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""This example demonstrates wrapping a request-level logits processor to be
+compatible with vLLM's batch-level logits processing.
+
+For demo purposes, a dummy logits processor is employed which, if
+`target_token` is passed as a keyword argument to `SamplingParams.extra_args`,
+will mask out all tokens except `target_token`. This logits processor can be
+applied to a vector of logits associated with a single decode step for a single
+request. The logits processor cannot be applied to a request which does not
+pass in a `target_token` custom argument.
+
+The request-level dummy logits processor is wrapped to create a batch-level
+logits processor, which can apply the logits processor to output logits from
+all requests in the persistent batch in a given decode step. For requests which
+do not provide a `target_token` argument, the corresponding row of `logits`
+will not be modified.
+
+A batch is constructed with `temperature=0.0` and 50% of requests specifying
+`target_token`, and for these requests - and *only* these requests - we
+expect the `target_token` to be decoded in each step, yielding an output
+similar to that shown below:
+
+Generated Outputs:
+------------------------------------------------------------
+Prompt:    'Hello, my name is'
+Output:    " ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' '"
+------------------------------------------------------------
+Prompt:    'The president of the United States is'
+Output:    " not a racist. He is a racist.\nHe's a racist because he"
+------------------------------------------------------------
+Prompt:    'The capital of France is'
+Output:    ' also also also also also also also also also also also also also
+             also also also'
+------------------------------------------------------------
+Prompt:    'The future of AI is'
+Output:    ' in the hands of the people.\n\nThe future of AI is in the'
+------------------------------------------------------------
+"""
+
+from typing import Any, Optional
+
+import torch
+
+from vllm import LLM, SamplingParams
+from vllm.logger import init_logger
+from vllm.v1.sample.logits_processor import (
+    AdapterLogitsProcessor,
+    RequestLogitsProcessor,
+)
+
+logger = init_logger(__name__)
+
+
+class DummyPerReqLogitsProcessor:
+    """The request-level logits processor masks out all logits except the
+    token id identified by `target_token`"""
+
+    def __init__(self, target_token: int) -> None:
+        """Specify `target_token`"""
+        self.target_token = target_token
+
+    def __call__(
+        self,
+        output_ids: list[int],
+        logits: torch.Tensor,
+    ) -> torch.Tensor:
+        """Mask `logits` in place so that only `target_token` stays finite.
+
+        Args:
+          output_ids: not used by this processor
+          logits: logits tensor indexed by token id; the module docstring
+            describes it as the logits vector of a single request for a
+            single decode step
+
+        Returns:
+          The same `logits` tensor object, modified in place
+        """
+        # Save the target logit before the whole tensor is overwritten.
+        # `.item()` requires the indexed value to be a single element.
+        val_to_keep = logits[self.target_token].item()
+        logits[:] = float("-inf")
+        logits[self.target_token] = val_to_keep
+        return logits
+
+
+class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
+    """Example of wrapping a fake request-level logit processor to create a
+    batch-level logits processor"""
+
+    def is_argmax_invariant(self) -> bool:
+        # The wrapped processor masks out all tokens but one, so it can
+        # change which token has the highest logit.
+        return False
+
+    def new_req_logits_processor(
+        self,
+        params: SamplingParams,
+    ) -> Optional[RequestLogitsProcessor]:
+        """This method returns a new request-level logits processor, customized
+        to the `target_token` value associated with a particular request.
+
+        Returns None if the logits processor should not be applied to the
+        particular request. To use the logits processor the request must have
+        a "target_token" custom argument with an integer value. A request
+        whose `extra_args` is None or empty is skipped silently; a
+        "target_token" that is present but not an int is skipped with a
+        warning.
+
+        Args:
+          params: per-request sampling params
+
+        Returns:
+          `Callable` request logits processor, or None
+        """
+        # Normalize a missing/empty `extra_args` to an empty dict before the
+        # lookup. A bare `extra_args and extra_args.get(...)` would evaluate
+        # to `{}` for an empty dict and trigger the non-int warning below for
+        # a request that never asked for the processor.
+        extra_args = params.extra_args or {}
+        target_token: Optional[Any] = extra_args.get("target_token")
+        if target_token is None:
+            return None
+        if not isinstance(target_token, int):
+            logger.warning(
+                "target_token value %s is not int; not applying logits"
+                " processor to request.",
+                target_token,
+            )
+            return None
+        return DummyPerReqLogitsProcessor(target_token)
+
+
+# Sample prompts.
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+# Create a mix of requests that do and don't use the dummy logits processor
+sampling_params_list = [
+    SamplingParams(temperature=0.0, extra_args={"target_token": 128}),
+    SamplingParams(temperature=0.0),
+    SamplingParams(temperature=0.0, extra_args={"target_token": 67}),
+    SamplingParams(temperature=0.0),
+]
+
+
+def main():
+    """Generate completions for the sample prompts with the wrapped logits
+    processor enabled, then print each prompt with its generated text."""
+    # Create an LLM.
+    llm = LLM(
+        model="facebook/opt-125m",
+        # The processor is passed as a class, not as an instance.
+        logits_processors=[WrappedPerReqLogitsProcessor],
+    )
+    # Generate texts from the prompts.
+    # The output is a list of RequestOutput objects
+    # that contain the prompt, generated text, and other information.
+    outputs = llm.generate(prompts, sampling_params_list)
+    # Print the outputs.
+    print("\nGenerated Outputs:\n" + "-" * 60)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt:    {prompt!r}")
+        print(f"Output:    {generated_text!r}")
+        print("-" * 60)
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/offline_inference/logits_processor/custom_req_init.py
+++ b/examples/offline_inference/logits_processor/custom_req_init.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+"""This example demonstrates a special case of wrapping a request-level logits
+processor, namely the case where it is necessary to utilize engine config or
+environment info passed to the constructor. The subclass must override the
+wrapper base class `__init__()` method to access the engine config, the device
+identifier, or the flag which indicates whether pinned memory is available.
+
+For demo purposes, a request-level dummy logits processor is employed which
+causes the same token (`target_token`) to be decoded in each step. The
+request-level dummy logits processor is wrapped to create a batch-level logits
+processor, which can apply the logits processor to output logits from all
+requests in the persistent batch in a given decode step.
+
+The wrapped dummy logits processor below models a scenario where we must
+disable the logits processor on non-"cuda" platforms. The wrapper base class
+`__init__()` is overridden in order to check this condition and set a flag.
+
+A batch is constructed with `temperature=0.0` and 50% of requests specifying
+`target_token`, and for these requests - and *only* these requests - we
+expect that on a "cuda" device the output will look something like:
+
+Generated Outputs:
+------------------------------------------------------------
+Prompt:    'Hello, my name is'
+Output:    " ' ' ' ' ' ' ' ' ' ' ' ' ' ' ' '"
+------------------------------------------------------------
+Prompt:    'The president of the United States is'
+Output:    " not a racist. He is a racist.\nHe's a racist because he"
+------------------------------------------------------------
+Prompt:    'The capital of France is'
+Output:    ' also also also also also also also also also also also also also
+             also also also'
+------------------------------------------------------------
+Prompt:    'The future of AI is'
+Output:    ' in the hands of the people.\n\nThe future of AI is in the'
+------------------------------------------------------------
+
+which indicates that the logits processor is running. However, on a non-"cuda"
+device, the first and third requests would not repeat the same token.
+"""
+
+from typing import Optional
+
+import torch
+
+from vllm import LLM, SamplingParams
+from vllm.config import VllmConfig
+from vllm.logger import init_logger
+from vllm.v1.sample.logits_processor import (
+    AdapterLogitsProcessor,
+    RequestLogitsProcessor,
+)
+
+logger = init_logger(__name__)
+
+
+class DummyPerReqLogitsProcessor:
+    """The request-level logits processor masks out all logits except the
+    token id identified by `target_token`"""
+
+    def __init__(self, target_token: int) -> None:
+        """Specify `target_token`"""
+        self.target_token = target_token
+
+    def __call__(
+        self,
+        output_ids: list[int],
+        logits: torch.Tensor,
+    ) -> torch.Tensor:
+        """Mask `logits` in place so that only `target_token` stays finite.
+
+        Args:
+          output_ids: not used by this processor
+          logits: logits tensor indexed by token id (presumably the 1-D
+            logits of one request for one decode step - confirm against
+            the caller)
+
+        Returns:
+          The same `logits` tensor object, modified in place
+        """
+        # Save the target logit before the whole tensor is overwritten.
+        # `.item()` requires the indexed value to be a single element.
+        val_to_keep = logits[self.target_token].item()
+        logits[:] = float("-inf")
+        logits[self.target_token] = val_to_keep
+        return logits
+
+
+class WrappedPerReqLogitsProcessor(AdapterLogitsProcessor):
+    """Example of overriding the wrapper class `__init__()` in order to utilize
+    info about the device type"""
+
+    def __init__(
+        self, vllm_config: VllmConfig, device: torch.device, is_pin_memory: bool
+    ):
+        """Forward all arguments to the wrapper base class and record whether
+        `device` is a "cuda"-type device."""
+        super().__init__(vllm_config, device, is_pin_memory)
+        self.is_cuda = device.type == "cuda"
+
+    def is_argmax_invariant(self) -> bool:
+        # The wrapped processor masks out all tokens but one, so it can
+        # change which token has the highest logit.
+        return False
+
+    def new_req_logits_processor(
+        self,
+        params: SamplingParams,
+    ) -> Optional[RequestLogitsProcessor]:
+        """This method returns a new request-level logits processor, customized
+        to the `target_token` value associated with a particular request.
+
+        Returns None if the logits processor should not be applied to the
+        particular request. To use the logits processor the request must have
+        a "target_token" custom argument with an integer value, and the device
+        must be "cuda"-type. A request whose `extra_args` is None or empty is
+        skipped silently; a "target_token" that is present but not an int is
+        skipped with a warning.
+
+        Args:
+          params: per-request sampling params
+
+        Returns:
+          `Callable` request logits processor, or None
+        """
+        # The processor is disabled entirely on non-"cuda" devices.
+        if not self.is_cuda:
+            return None
+        # Normalize a missing/empty `extra_args` to an empty dict before the
+        # lookup. A bare `extra_args and extra_args.get(...)` would evaluate
+        # to `{}` for an empty dict and trigger the non-int warning below for
+        # a request that never asked for the processor.
+        extra_args = params.extra_args or {}
+        target_token = extra_args.get("target_token")
+        if target_token is None:
+            return None
+        if not isinstance(target_token, int):
+            logger.warning(
+                "target_token value %s is not int; not applying logits"
+                " processor to request.",
+                target_token,
+            )
+            return None
+        return DummyPerReqLogitsProcessor(target_token)
+
+
+# Sample prompts.
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+# Create a mix of requests that do and don't use the dummy logits processor
+sampling_params_list = [
+    SamplingParams(temperature=0.0, extra_args={"target_token": 128}),
+    SamplingParams(temperature=0.0),
+    SamplingParams(temperature=0.0, extra_args={"target_token": 67}),
+    SamplingParams(temperature=0.0),
+]
+
+
+def main():
+    """Generate completions for the sample prompts with the wrapped logits
+    processor enabled, then print each prompt with its generated text."""
+    # Create an LLM.
+    llm = LLM(
+        model="facebook/opt-125m",
+        # The processor is passed as a class, not as an instance.
+        logits_processors=[WrappedPerReqLogitsProcessor],
+    )
+    # Generate texts from the prompts.
+    # The output is a list of RequestOutput objects
+    # that contain the prompt, generated text, and other information.
+    outputs = llm.generate(prompts, sampling_params_list)
+    # Print the outputs.
+    print("\nGenerated Outputs:\n" + "-" * 60)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt:    {prompt!r}")
+        print(f"Output:    {generated_text!r}")
+        print("-" * 60)
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/offline_inference/multilora_inference.py
+++ b/examples/offline_inference/multilora_inference.py
@@ -23,7 +23,7 @@ def create_test_prompts(
    2 requests for base model, 4 requests for the LoRA. We define 2
    different LoRA adapters (using the same model for demo purposes).
    Since we also set `max_loras=1`, the expectation is that the requests
-    with the second LoRA adapter will be ran after all requests with the
+    with the second LoRA adapter will be run after all requests with the
    first adapter have finished.
    """
    return [

--- a/examples/offline_inference/neuron.py
+++ b/examples/offline_inference/neuron.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from vllm import LLM, SamplingParams
-
-# Sample prompts.
-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-]
-# Create a sampling params object.
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-
-
-def main():
-    # Create an LLM.
-    llm = LLM(
-        model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-        max_num_seqs=8,
-        # The max_model_len and block_size arguments are required to be same as
-        # max sequence length when targeting neuron device.
-        # Currently, this is a known limitation in continuous batching support
-        # in transformers-neuronx.
-        # TODO(liangfu): Support paged-attention in transformers-neuronx.
-        max_model_len=1024,
-        block_size=1024,
-        # ruff: noqa: E501
-        # The device can be automatically detected when AWS Neuron SDK is installed.
-        # The device argument can be either unspecified for automated detection,
-        # or explicitly assigned.
-        device="neuron",
-        tensor_parallel_size=2,
-    )
-    # Generate texts from the prompts. The output is a list of RequestOutput objects
-    # that contain the prompt, generated text, and other information.
-    outputs = llm.generate(prompts, sampling_params)
-    # Print the outputs.
-    print("-" * 50)
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
-        print("-" * 50)
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/offline_inference/neuron_eagle.py
+++ b/examples/offline_inference/neuron_eagle.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-This example shows how to run offline inference with an EAGLE speculative
-decoding model on neuron. To use EAGLE speculative decoding, you must use
-a draft model that is specifically fine-tuned for EAGLE speculation.
-Additionally, to use EAGLE with NxD Inference, the draft model must include
-the LM head weights from the target model. These weights are shared between
-the draft and target model.
-"""
-
-from vllm import LLM, SamplingParams
-
-# Sample prompts.
-prompts = [
-    "What is annapurna labs?",
-]
-
-
-def main():
-    # Create a sampling params object.
-    sampling_params = SamplingParams(top_k=1, max_tokens=500, ignore_eos=True)
-
-    # Create an LLM.
-    llm = LLM(
-        model="/home/ubuntu/model_hf/Meta-Llama-3.1-70B-Instruct",
-        speculative_config={
-            "model": "/home/ubuntu/model_hf/Llama-3.1-70B-Instruct-EAGLE-Draft",
-            "num_speculative_tokens": 5,
-            "max_model_len": 2048,
-        },
-        max_num_seqs=4,
-        # The max_model_len and block_size arguments are required to be same as
-        # max sequence length when targeting neuron device.
-        # Currently, this is a known limitation in continuous batching support
-        # in neuronx-distributed-inference.
-        max_model_len=2048,
-        block_size=2048,
-        # The device can be automatically detected when AWS Neuron SDK is installed.
-        # The device argument can be either unspecified for automated detection,
-        # or explicitly assigned.
-        device="neuron",
-        tensor_parallel_size=32,
-        override_neuron_config={
-            "enable_eagle_speculation": True,
-            "enable_fused_speculation": True,
-        },
-    )
-
-    # Generate texts from the prompts. The output is a list of RequestOutput objects
-    # that contain the prompt, generated text, and other information.
-    outputs = llm.generate(prompts, sampling_params)
-    # Print the outputs.
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, \n\n\n Generated text: {generated_text!r}")
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/offline_inference/neuron_int8_quantization.py
+++ b/examples/offline_inference/neuron_int8_quantization.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import os
-
-from vllm import LLM, SamplingParams
-
-# creates XLA hlo graphs for all the context length buckets.
-os.environ["NEURON_CONTEXT_LENGTH_BUCKETS"] = "128,512,1024,2048"
-# creates XLA hlo graphs for all the token gen buckets.
-os.environ["NEURON_TOKEN_GEN_BUCKETS"] = "128,512,1024,2048"
-# Quantizes neuron model weight to int8 ,
-# The default config for quantization is int8 dtype.
-os.environ["NEURON_QUANT_DTYPE"] = "s8"
-
-# Sample prompts.
-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-]
-# Create a sampling params object.
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-
-
-def main():
-    # Create an LLM.
-    llm = LLM(
-        model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-        max_num_seqs=8,
-        # The max_model_len and block_size arguments are required to be same as
-        # max sequence length when targeting neuron device.
-        # Currently, this is a known limitation in continuous batching support
-        # in transformers-neuronx.
-        # TODO(liangfu): Support paged-attention in transformers-neuronx.
-        max_model_len=2048,
-        block_size=2048,
-        # ruff: noqa: E501
-        # The device can be automatically detected when AWS Neuron SDK is installed.
-        # The device argument can be either unspecified for automated detection,
-        # or explicitly assigned.
-        device="neuron",
-        quantization="neuron_quant",
-        override_neuron_config={
-            "cast_logits_dtype": "bfloat16",
-        },
-        tensor_parallel_size=2,
-    )
-    # Generate texts from the prompts. The output is a list of RequestOutput objects
-    # that contain the prompt, generated text, and other information.
-    outputs = llm.generate(prompts, sampling_params)
-    # Print the outputs.
-    print("-" * 50)
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
-        print("-" * 50)
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/offline_inference/neuron_multimodal.py
+++ b/examples/offline_inference/neuron_multimodal.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import requests
-import torch
-from neuronx_distributed_inference.models.mllama.utils import add_instruct
-from PIL import Image
-
-from vllm import LLM, SamplingParams, TextPrompt
-
-
-def get_image(image_url):
-    image = Image.open(requests.get(image_url, stream=True).raw)
-    return image
-
-
-# Model Inputs
-PROMPTS = [
-    "What is in this image? Tell me a story",
-    "What is the recipe of mayonnaise in two sentences?",
-    "Describe this image",
-    "What is the capital of Italy famous for?",
-]
-IMAGES = [
-    get_image(
-        "https://images.pexels.com/photos/1108099/pexels-photo-1108099.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500"
-    ),
-    None,
-    get_image(
-        "https://images.pexels.com/photos/1108099/pexels-photo-1108099.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500"
-    ),
-    None,
-]
-SAMPLING_PARAMS = [
-    dict(top_k=1, temperature=1.0, top_p=1.0, max_tokens=16)
-    for _ in range(len(PROMPTS))
-]
-
-
-def get_VLLM_mllama_model_inputs(prompt, single_image, sampling_params):
-    # Prepare all inputs for mllama generation, including:
-    # 1. put text prompt into instruct chat template
-    # 2. compose single text and single image prompt into Vllm's prompt class
-    # 3. prepare sampling parameters
-    input_image = single_image
-    has_image = torch.tensor([1])
-    if isinstance(single_image, torch.Tensor) and single_image.numel() == 0:
-        has_image = torch.tensor([0])
-
-    instruct_prompt = add_instruct(prompt, has_image)
-    inputs = TextPrompt(prompt=instruct_prompt)
-
-    if input_image is not None:
-        inputs["multi_modal_data"] = {"image": input_image}
-
-    sampling_params = SamplingParams(**sampling_params)
-    return inputs, sampling_params
-
-
-def print_outputs(outputs):
-    # Print the outputs.
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-
-def main():
-    assert (
-        len(PROMPTS) == len(IMAGES) == len(SAMPLING_PARAMS)
-    ), f"""Text, image prompts and sampling parameters should have the 
-            same batch size; but got {len(PROMPTS)}, {len(IMAGES)}, 
-            and {len(SAMPLING_PARAMS)}"""
-
-    # Create an LLM.
-    llm = LLM(
-        model="meta-llama/Llama-3.2-11B-Vision-Instruct",
-        max_num_seqs=1,
-        max_model_len=4096,
-        block_size=4096,
-        device="neuron",
-        tensor_parallel_size=32,
-        override_neuron_config={
-            "sequence_parallel_enabled": False,
-            "skip_warmup": True,
-            "save_sharded_checkpoint": True,
-            "on_device_sampling_config": {
-                "global_topk": 1,
-                "dynamic": False,
-                "deterministic": False,
-            },
-        },
-    )
-
-    batched_inputs = []
-    batched_sample_params = []
-    for pmpt, img, params in zip(PROMPTS, IMAGES, SAMPLING_PARAMS):
-        inputs, sampling_params = get_VLLM_mllama_model_inputs(pmpt, img, params)
-        # test batch-size = 1
-        outputs = llm.generate(inputs, sampling_params)
-        print_outputs(outputs)
-        batched_inputs.append(inputs)
-        batched_sample_params.append(sampling_params)
-
-    # test batch-size = 4
-    outputs = llm.generate(batched_inputs, batched_sample_params)
-    print_outputs(outputs)
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/offline_inference/neuron_speculation.py
+++ b/examples/offline_inference/neuron_speculation.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""
-This example shows how to run offline inference with a speculative
-decoding model on neuron.
-"""
-
-import os
-
-from vllm import LLM, SamplingParams
-
-# Sample prompts.
-prompts = [
-    "Hello, I am a language model and I can help",
-    "The president of the United States is",
-    "The capital of France is",
-]
-
-
-def config_buckets():
-    """Configure context length and token gen buckets."""
-    # creates XLA hlo graphs for all the context length buckets.
-    os.environ["NEURON_CONTEXT_LENGTH_BUCKETS"] = "128,512,1024,2048"
-    # creates XLA hlo graphs for all the token gen buckets.
-    os.environ["NEURON_TOKEN_GEN_BUCKETS"] = "128,512,1024,2048"
-
-
-def initialize_llm():
-    """Create an LLM with speculative decoding."""
-    return LLM(
-        model="openlm-research/open_llama_7b",
-        speculative_config={
-            "model": "openlm-research/open_llama_3b",
-            "num_speculative_tokens": 4,
-            "max_model_len": 2048,
-        },
-        max_num_seqs=4,
-        max_model_len=2048,
-        block_size=2048,
-        device="neuron",
-        tensor_parallel_size=32,
-    )
-
-
-def process_requests(llm: LLM, sampling_params: SamplingParams):
-    """Generate texts from prompts and print them."""
-    outputs = llm.generate(prompts, sampling_params)
-    for output in outputs:
-        prompt = output.prompt
-        generated_text = output.outputs[0].text
-        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-
-
-def main():
-    """Main function that sets up the llm and processes prompts."""
-    config_buckets()
-    llm = initialize_llm()
-    # Create a sampling params object.
-    sampling_params = SamplingParams(max_tokens=100, top_k=1)
-    process_requests(llm, sampling_params)
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/offline_inference/prithvi_geospatial_mae.py
+++ b/examples/offline_inference/prithvi_geospatial_mae.py
@@ -45,7 +45,11 @@ datamodule_config = {
 class PrithviMAE:
    def __init__(self, model):
        self.model = LLM(
-            model=model, skip_tokenizer_init=True, dtype="float16", enforce_eager=True
+            model=model,
+            skip_tokenizer_init=True,
+            dtype="float16",
+            enforce_eager=True,
+            model_impl="terratorch",
        )

    def run(self, input_data, location_coords):

--- a/examples/offline_inference/prithvi_geospatial_mae_io_processor.py
+++ b/examples/offline_inference/prithvi_geospatial_mae_io_processor.py
@@ -12,13 +12,13 @@ from vllm.pooling_params import PoolingParams
 # multimodal data. In this specific case this example will take a geotiff
 # image as input, process it using the multimodal data processor, and
 # perform inference.
-# Reuirement - install plugin at:
+# Requirement - install plugin at:
 #   https://github.com/christian-pinto/prithvi_io_processor_plugin


 def main():
    torch.set_default_dtype(torch.float16)
-    image_url = "https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/India_900498_S2Hand.tif"  # noqa: E501
+    image_url = "https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/valencia_example_2024-10-26.tiff"  # noqa: E501

    img_prompt = dict(
        data=image_url,
@@ -36,7 +36,8 @@ def main():
        # to avoid the model going OOM.
        # The maximum number depends on the available GPU memory
        max_num_seqs=32,
-        io_processor_plugin="prithvi_to_tiff_india",
+        io_processor_plugin="prithvi_to_tiff",
+        model_impl="terratorch",
    )

    pooling_params = PoolingParams(task="encode", softmax=False)

--- a/examples/offline_inference/rlhf_colocate.py
+++ b/examples/offline_inference/rlhf_colocate.py
@@ -28,12 +28,15 @@ Learn more about Ray placement groups:
 https://docs.ray.io/en/latest/placement-groups.html
 """

+import gc
 import os

 import ray
 import torch
+import zmq
 from ray.util.placement_group import placement_group
 from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
+from torch.multiprocessing.reductions import reduce_tensor

 from vllm import LLM

@@ -86,20 +89,72 @@ class RayTrainingActor:
        from vllm.platforms import current_platform

        self.device_uuid = current_platform.get_device_uuid(0)
+        self.zmq_context = zmq.Context()
+        self.zmq_address_counter = 0
+        self.zmq_handle = None

    def report_device_id(self) -> str:
        return self.device_uuid

-    def get_weight_ipc_handles(self):
-        from torch.multiprocessing.reductions import reduce_tensor
+    def get_zmq_handles(self) -> dict[str, str]:
+        suffix = f"{self.device_uuid}-{self.zmq_address_counter}"  # new per call
+        self.zmq_handle = f"ipc:///tmp/rl-colocate-zmq-{suffix}.sock"
+        self.zmq_address_counter += 1  # so the next call builds a new address
+        return {self.device_uuid: self.zmq_handle}  # device UUID -> endpoint

-        data = {}
-        for name, p in self.model.named_parameters():
-            # A training actor might hold only a subset of the weights and may
-            # need to gather weights from other actors. For demonstration
-            # purposes, each training actor owns the full weight set.
-            data[name] = reduce_tensor(p.detach())
-        return {self.device_uuid: data}
+    def update_weights(self):
+        """Stream every model parameter to the ZMQ peer via a shared CUDA buffer."""
+        align_size = 256  # align slot sizes to avoid misaligned addresses
+
+        def get_size(p: torch.Tensor) -> int:
+            return (p.nbytes + align_size - 1) // align_size * align_size
+
+        named_parameters = dict(self.model.named_parameters())
+        max_tensor_size = max(get_size(p) for p in named_parameters.values())
+        # use max_tensor_size * 2 as buffer size
+        buffer = torch.empty(max_tensor_size * 2, dtype=torch.uint8, device="cuda:0")
+
+        # Plan the buckets: slot metadata plus the tensors that fill each bucket.
+        offset = 0
+        buckets: list[tuple[list[dict], list[torch.Tensor]]] = []
+        named_tensors, real_tensors = [], []
+        for name, p in named_parameters.items():
+            size = get_size(p)
+            if offset + size > buffer.numel():
+                buckets.append((named_tensors, real_tensors))
+                named_tensors, real_tensors = [], []
+                offset = 0
+            # assume tensors are contiguous
+            named_tensors.append(
+                {"name": name, "dtype": p.dtype, "shape": p.shape, "offset": offset}
+            )
+            real_tensors.append(p)
+            offset += size
+        if named_tensors:
+            buckets.append((named_tensors, real_tensors))
+
+        s = self.zmq_context.socket(zmq.REQ)
+        try:
+            s.bind(self.zmq_handle)
+            s.send_pyobj(reduce_tensor(buffer))  # IPC handle of the staging buffer
+            s.recv()
+            for named_tensors, real_tensors in buckets:
+                offset = 0
+                for p in real_tensors:
+                    buffer[offset : offset + p.nbytes].data.copy_(
+                        p.data.view(-1).view(dtype=torch.uint8), non_blocking=True
+                    )
+                    offset += get_size(p)
+                torch.cuda.synchronize()  # copies must land before the peer reads
+                s.send_pyobj(named_tensors)
+                s.recv()
+            s.send_pyobj(None)  # sentinel: no more buckets
+            s.recv()
+        finally:  # release the socket and buffer even if the transfer fails
+            s.close()
+            del buffer
+            gc.collect()
+            torch.cuda.empty_cache()


 # Ray manages four GPUs.
@@ -175,18 +230,22 @@ assert training_actor_device_ids[:2] == inference_engine_device_ids[0]
 # the second inference engine.
 assert training_actor_device_ids[2:] == inference_engine_device_ids[1]

-print("Gather all the IPC handles from the training actors.")
-ipc_handles = {}
+print("Gather all the ZMQ handles from the training actors.")
+zmq_handles = {}
 for actor in training_actors:
-    ipc_handles.update(ray.get(actor.get_weight_ipc_handles.remote()))
+    zmq_handles.update(ray.get(actor.get_zmq_handles.remote()))
+
+print(f"ZMQ handles: {zmq_handles}")

 print("Update the weights of the inference engines.")
-for llm in inference_engines:
-    ray.get(
-        llm.collective_rpc.remote(
-            "update_weights_from_ipc_handles", args=(ipc_handles,)
-        )
-    )
+ray.get(
+    [actor.update_weights.remote() for actor in training_actors]
+    + [
+        llm.collective_rpc.remote("update_weights_from_ipc", args=(zmq_handles,))
+        for llm in inference_engines
+    ]
+)
+
 print("Check if the weights are updated.")
 for llm in inference_engines:
    assert ray.get(llm.collective_rpc.remote("check_weights_changed", args=tuple()))
--- a/examples/offline_inference/rlhf_utils.py
+++ b/examples/offline_inference/rlhf_utils.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import gc
+from typing import Callable, Optional, TypedDict
+
 import torch
+import zmq


 def stateless_init_process_group(master_address, master_port, rank, world_size, device):
@@ -66,6 +70,27 @@ class WorkerExtension:
        return weights_updated


+def rebuild_ipc(
+    handle: tuple[Callable, tuple], device_id: Optional[int] = None
+) -> torch.Tensor:
+    """Call the handle's rebuild function, optionally overriding the device id."""
+    rebuild_fn, rebuild_args = handle
+    patched_args = list(rebuild_args)
+    if device_id is not None:
+        # args[6] is the device id; swap in the current device's id in case
+        # the two processes have different CUDA_VISIBLE_DEVICES
+        patched_args[6] = device_id
+    return rebuild_fn(*patched_args)
+
+
+class FlattenedTensorMetadata(TypedDict):  # one tensor's slot in the ipc buffer
+    name: str  # weight name handed to `load_weights`
+    shape: torch.Size
+    dtype: torch.dtype
+    # start offset of this tensor within the shared (uint8) ipc_buffer tensor
+    offset: int
+
+
 class ColocateWorkerExtension:
    """
    The class for vLLM's worker to inherit from, in the colocate setting.
@@ -76,27 +101,62 @@ class ColocateWorkerExtension:
    should pass the full qualified name as `worker_extension_cls` argument.
    """

+    def update_weights_from_ipc(self, zmq_handles: dict[str, str]):
+        """Receive weights from the ZMQ peer and load them into the model.
+
+        Protocol: an IPC handle for the buffer, metadata lists, then None.
+        """
+        from vllm.model_executor.model_loader.utils import process_weights_after_loading
+
+        assert self.device is not None
+        if getattr(self, "_zmq_ctx", None) is None:
+            self._zmq_ctx = zmq.Context()
+        socket = self._zmq_ctx.socket(zmq.REP)
+        buffer: Optional[torch.Tensor] = None
+        try:
+            socket.connect(zmq_handles[self.report_device_id()])
+            while (payload := socket.recv_pyobj()) is not None:
+                if isinstance(payload, tuple):
+                    # an ipc handle: `func, args = handle` then `func(*args)`
+                    # rebuilds the GPU tensor (with this worker's device index).
+                    buffer = rebuild_ipc(payload, self.device.index)
+                    assert buffer.dtype == torch.uint8
+                    socket.send(b"")
+                    continue
+                assert isinstance(payload, list)
+                assert buffer is not None
+                weights = []
+                for item in payload:
+                    shape = item["shape"]
+                    if isinstance(shape, (list, tuple)):
+                        shape = torch.Size(shape)
+                    assert isinstance(shape, torch.Size)
+                    dtype, offset = item["dtype"], item["offset"]
+                    size = dtype.itemsize * shape.numel()
+                    flat = buffer[offset : offset + size].view(dtype=dtype)
+                    weights.append((item["name"], flat.view(shape)))
+                self.model_runner.model.load_weights(weights=weights)
+                del weights
+                torch.cuda.synchronize()
+                socket.send(b"")
+            # None means the update is done
+            process_weights_after_loading(
+                self.model_runner.model, self.model_config, self.device
+            )
+            torch.cuda.synchronize()
+            socket.send(b"")
+        finally:  # always release the socket and the rebuilt buffer
+            socket.close()
+            del buffer
+            gc.collect()
+            torch.cuda.empty_cache()
+
    def report_device_id(self) -> str:
        from vllm.platforms import current_platform

        self.device_uuid = current_platform.get_device_uuid(self.device.index)
        return self.device_uuid

-    def update_weights_from_ipc_handles(self, ipc_handles):
-        handles = ipc_handles[self.device_uuid]
-        device_id = self.device.index
-        weights = []
-        for name, handle in handles.items():
-            func, args = handle
-            list_args = list(args)
-            # the key is to change device id to the current device id
-            # in case two processes have different CUDA_VISIBLE_DEVICES
-            list_args[6] = device_id
-            tensor = func(*list_args)
-            weights.append((name, tensor))
-        self.model_runner.model.load_weights(weights=weights)
-        torch.cuda.synchronize()
-
    def check_weights_changed(self):
        """
        Check if the weights are updated to 0.