Merge tag 'v0.9.0' into v0.9.0-ori

7a985548 · zhuwenwen · 45d3785c · dc1440cf · 7a985548 · 7a985548
Commit 7a985548 authored May 22, 2025 by zhuwenwen
20 changed files
--- a/docs/source/serving/serve_args.md
+++ b/docs/source/serving/serve_args.md
+(serve-args)=
+# Server Arguments
+The `vllm serve` command is used to launch the OpenAI-compatible server.
+## CLI Arguments
+The following are all arguments available from the `vllm serve` command:
+<!--- pyml disable-num-lines 7 no-space-in-emphasis -->
+```{eval-rst}
+.. argparse::
+    :module: vllm.entrypoints.openai.cli_args
+    :func: create_parser_for_docs
+    :prog: vllm serve
+    :nodefaultconst:
+    :markdownhelp:
+```
+## Configuration file
+You can load CLI arguments via a [YAML](https://yaml.org/) config file.
+The argument names must be the long form of those outlined [above](#serve-args).
+For example:
+```yaml
+# config.yaml
+model: meta-llama/Llama-3.1-8B-Instruct
+host: "127.0.0.1"
+port: 6379
+uvicorn-log-level: "info"
+```
+To use the above config file:
+```bash
+vllm serve --config config.yaml
+```
+:::{note}
+If an argument is supplied both on the command line and in the config file, the value from the command line takes precedence.
+The order of priorities is `command line > config file values > defaults`.
+For example, in `vllm serve SOME_MODEL --config config.yaml`, `SOME_MODEL` takes precedence over the `model` value in the config file.
+:::
--- a/examples/lmcache/README.md
+++ b/examples/lmcache/README.md
@@ -44,8 +44,8 @@ The main script generates several log files:
 ## 2. CPU Offload Examples
- `cpu_offload_lmcache_v0.py` - CPU offloading implementation for vLLM v0
+- `python cpu_offload_lmcache.py -v v0` - CPU offloading implementation for vLLM v0
- `cpu_offload_lmcache_v1.py` - CPU offloading implementation for vLLM v1
+- `python cpu_offload_lmcache.py -v v1` - CPU offloading implementation for vLLM v1
 ## 3. KV Cache Sharing

--- a/examples/lmcache/cpu_offload_lmcache_v0.py
+++ b/examples/lmcache/cpu_offload_lmcache_v0.py
 # SPDX-License-Identifier: Apache-2.0
 """
 This file demonstrates the example usage of cpu offloading
-with LMCache.
+with LMCache in vLLM v1 or v0.
+Usage:
+    Specify vLLM version
+    -v v0 : Use LMCacheConnector
+            model = mistralai/Mistral-7B-Instruct-v0.2
+            (Includes enable_chunked_prefill = True)
+    -v v1 : Use LMCacheConnectorV1 (default)
+            model = meta-llama/Meta-Llama-3.1-8B-Instruct
+            (Without enable_chunked_prefill)
 Note that `lmcache` is needed to run this example.
 Requirements: Linux, Python: 3.10 or higher, CUDA: 12.1
 Learn more about LMCache environment setup, please refer to:
 https://docs.lmcache.ai/getting_started/installation.html
 """
+import argparse
 import contextlib
 import os
 import time
+from dataclasses import asdict
 from lmcache.experimental.cache_engine import LMCacheEngineBuilder
 from lmcache.integration.vllm.utils import ENGINE_NAME
 from vllm import LLM, SamplingParams
 from vllm.config import KVTransferConfig
+from vllm.engine.arg_utils import EngineArgs
-def setup_environment_variables():
+def setup_environment_variables(vllm_version: str):
    # LMCache-related environment variables
    # Use experimental features in LMCache
    os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
@@ -29,21 +44,37 @@ def setup_environment_variables():
    os.environ["LMCACHE_LOCAL_CPU"] = "True"
    # Set local CPU memory limit to 5.0 GB
    os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0"
+    if vllm_version == "v0":
+        os.environ["VLLM_USE_V1"] = "0"
 @contextlib.contextmanager
-def build_llm_with_lmcache():
+def build_llm_with_lmcache(lmcache_connector: str, model: str,
-    ktc = KVTransferConfig.from_cli(
+                           vllm_version: str):
-        '{"kv_connector":"LMCacheConnector", "kv_role":"kv_both"}')
+    ktc = KVTransferConfig(
+        kv_connector=lmcache_connector,
+        kv_role="kv_both",
+    )
    # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
    # memory. Reduce the value if your GPU has less memory.
    # Note: LMCache supports chunked prefill (see vLLM#14505, LMCache#392).
-    llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
+    if vllm_version == "v0":
-              kv_transfer_config=ktc,
+        llm_args = EngineArgs(
-              max_model_len=8000,
+            model=model,
-              enable_chunked_prefill=True,
+            kv_transfer_config=ktc,
-              gpu_memory_utilization=0.8)
+            max_model_len=8000,
+            gpu_memory_utilization=0.8,
+            enable_chunked_prefill=True,  # Only in v0
+        )
+    else:
+        llm_args = EngineArgs(
+            model=model,
+            kv_transfer_config=ktc,
+            max_model_len=8000,
+            gpu_memory_utilization=0.8,
+        )
+    llm = LLM(**asdict(llm_args))
    try:
        yield llm
    finally:
@@ -57,6 +88,9 @@ def print_output(
    sampling_params: SamplingParams,
    req_str: str,
 ):
+    # Should be able to see logs like the following:
+    # `LMCache INFO: Storing KV cache for 6006 out of 6006 tokens for request 0`
+    # This indicates that the KV cache has been stored in LMCache.
    start = time.time()
    outputs = llm.generate(prompt, sampling_params)
    print("-" * 50)
@@ -68,10 +102,29 @@ def print_output(
    print("-" * 50)
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-v",
+                        "--version",
+                        choices=["v0", "v1"],
+                        default="v1",
+                        help="Specify vLLM version (default: v1)")
+    return parser.parse_args()
 def main():
-    setup_environment_variables()
+    args = parse_args()
+    if args.version == "v0":
+        lmcache_connector = "LMCacheConnector"
+        model = "mistralai/Mistral-7B-Instruct-v0.2"
+    else:
+        lmcache_connector = "LMCacheConnectorV1"
+        model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
+    setup_environment_variables(args.version)
-    with build_llm_with_lmcache() as llm:
+    with build_llm_with_lmcache(lmcache_connector, model, args.version) as llm:
        # This example script runs two requests with a shared prefix.
        # Define the shared prompt and specific prompts

--- a/examples/lmcache/cpu_offload_lmcache_v1.py
+++ b/examples/lmcache/cpu_offload_lmcache_v1.py
-# SPDX-License-Identifier: Apache-2.0
-"""
-This file demonstrates the example usage of cpu offloading
-with LMCache in vLLM v1.
-Note that lmcache needs to be installed to run this example.
-Learn more about LMCache in https://github.com/LMCache/LMCache.
-"""
-import os
-from lmcache.experimental.cache_engine import LMCacheEngineBuilder
-from lmcache.integration.vllm.utils import ENGINE_NAME
-from vllm import LLM, SamplingParams
-from vllm.config import KVTransferConfig
-# LMCache-related environment variables
-# Use experimental features in LMCache
-os.environ["LMCACHE_USE_EXPERIMENTAL"] = "True"
-# LMCache is set to use 256 tokens per chunk
-os.environ["LMCACHE_CHUNK_SIZE"] = "256"
-# Enable local CPU backend in LMCache
-os.environ["LMCACHE_LOCAL_CPU"] = "True"
-# Set local CPU memory limit to 5.0 GB
-os.environ["LMCACHE_MAX_LOCAL_CPU_SIZE"] = "5.0"
-# This example script runs two requests with a shared prefix.
-shared_prompt = "Hello, how are you?" * 1000
-first_prompt = [
-    shared_prompt + "Hello, my name is",
-]
-second_prompt = [
-    shared_prompt + "Tell me a very long story",
-]
-sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
-ktc = KVTransferConfig.from_cli(
-    '{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}')
-# Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
-# memory. Reduce the value if your GPU has less memory.
-# Note that LMCache is not compatible with chunked prefill for now.
-llm = LLM(model="meta-llama/Meta-Llama-3.1-8B-Instruct",
-          kv_transfer_config=ktc,
-          max_model_len=8000,
-          gpu_memory_utilization=0.8)
-# Should be able to see logs like the following:
-# `LMCache INFO: Storing KV cache for 6006 out of 6006 tokens for request 0`
-# This indicates that the KV cache has been stored in LMCache.
-outputs = llm.generate(first_prompt, sampling_params)
-for output in outputs:
-    generated_text = output.outputs[0].text
-    print(f"Generated text: {generated_text!r}")
-# Clean up lmcache backend
-LMCacheEngineBuilder.destroy(ENGINE_NAME)
--- a/examples/lmcache/disagg_prefill_lmcache_v0.py
+++ b/examples/lmcache/disagg_prefill_lmcache_v0.py
@@ -49,9 +49,10 @@ def run_prefill(prefill_done, prompts):
    sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)
-    ktc = KVTransferConfig.from_cli(
+    ktc = KVTransferConfig(kv_connector="LMCacheConnector",
-        '{"kv_connector":"LMCacheConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}'
+                           kv_role="kv_producer",
-    )
+                           kv_rank=0,
+                           kv_parallel_size=2)
    # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
    # memory. Reduce the value if your GPU has less memory.
    llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
@@ -78,9 +79,10 @@ def run_decode(prefill_done, prompts, timeout=1):
    sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
-    ktc = KVTransferConfig.from_cli(
+    ktc = KVTransferConfig(kv_connector="LMCacheConnector",
-        '{"kv_connector":"LMCacheConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}'
+                           kv_role="kv_consumer",
-    )
+                           kv_rank=1,
+                           kv_parallel_size=2)
    # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
    # of memory. Reduce the value if your GPU has less memory.
    llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",

--- a/examples/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh
+++ b/examples/lmcache/disagg_prefill_lmcache_v1/disagg_vllm_launcher.sh
@@ -54,6 +54,6 @@ elif [[ $1 == "decoder" ]]; then
 else
    echo "Invalid role: $1"
-    echo "Should be either prefill, decode"
+    echo "Should be either prefiller, decoder"
    exit 1
 fi
--- a/examples/lmcache/kv_cache_sharing_lmcache_v1.py
+++ b/examples/lmcache/kv_cache_sharing_lmcache_v1.py
@@ -49,8 +49,8 @@ def run_store(store_done, prompts):
    sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
-    ktc = KVTransferConfig.from_cli(
+    ktc = KVTransferConfig(kv_connector="LMCacheConnectorV1",
-        '{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}')
+                           kv_role="kv_both")
    # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
    # memory. Reduce the value if your GPU has less memory.
    llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",
@@ -76,8 +76,8 @@ def run_retrieve(store_done, prompts, timeout=1):
    sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
-    ktc = KVTransferConfig.from_cli(
+    ktc = KVTransferConfig(kv_connector="LMCacheConnectorV1",
-        '{"kv_connector":"LMCacheConnectorV1", "kv_role":"kv_both"}')
+                           kv_role="kv_both")
    # Set GPU memory utilization to 0.8 for an A40 GPU with 40GB
    # of memory. Reduce the value if your GPU has less memory.
    llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.2",

--- a/examples/offline_inference/basic/chat.py
+++ b/examples/offline_inference/basic/chat.py
@@ -7,9 +7,8 @@ from vllm.utils import FlexibleArgumentParser
 def create_parser():
    parser = FlexibleArgumentParser()
    # Add engine args
-    engine_group = parser.add_argument_group("Engine arguments")
+    EngineArgs.add_cli_args(parser)
-    EngineArgs.add_cli_args(engine_group)
+    parser.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct")
-    engine_group.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct")
    # Add sampling params
    sampling_group = parser.add_argument_group("Sampling parameters")
    sampling_group.add_argument("--max-tokens", type=int)

--- a/examples/offline_inference/basic/generate.py
+++ b/examples/offline_inference/basic/generate.py
@@ -7,9 +7,8 @@ from vllm.utils import FlexibleArgumentParser
 def create_parser():
    parser = FlexibleArgumentParser()
    # Add engine args
-    engine_group = parser.add_argument_group("Engine arguments")
+    EngineArgs.add_cli_args(parser)
-    EngineArgs.add_cli_args(engine_group)
+    parser.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct")
-    engine_group.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct")
    # Add sampling params
    sampling_group = parser.add_argument_group("Sampling parameters")
    sampling_group.add_argument("--max-tokens", type=int)

--- a/examples/offline_inference/chat_with_tools.py
+++ b/examples/offline_inference/chat_with_tools.py
@@ -68,7 +68,7 @@ def get_current_weather(city: str, state: str, unit: 'str'):
            "partly cloudly, with highs in the 90's.")
-tool_funtions = {"get_current_weather": get_current_weather}
+tool_functions = {"get_current_weather": get_current_weather}
 tools = [{
    "type": "function",
@@ -122,7 +122,7 @@ messages.append({
 # above defined function
 tool_calls = json.loads(output)
 tool_answers = [
-    tool_funtions[call['name']](**call['arguments']) for call in tool_calls
+    tool_functions[call['name']](**call['arguments']) for call in tool_calls
 ]
 # append the answer as a tool message and let the LLM give you an answer

--- a/examples/offline_inference/data_parallel.py
+++ b/examples/offline_inference/data_parallel.py
@@ -65,11 +65,17 @@ def parse_args():
                        type=int,
                        default=0,
                        help="Master node port")
+    parser.add_argument("--enforce-eager",
+                        action='store_true',
+                        help="Enforce eager mode execution.")
+    parser.add_argument("--trust-remote-code",
+                        action='store_true',
+                        help="Trust remote code.")
    return parser.parse_args()
 def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip,
-         dp_master_port, GPUs_per_dp_rank):
+         dp_master_port, GPUs_per_dp_rank, enforce_eager, trust_remote_code):
    os.environ["VLLM_DP_RANK"] = str(global_dp_rank)
    os.environ["VLLM_DP_RANK_LOCAL"] = str(local_dp_rank)
    os.environ["VLLM_DP_SIZE"] = str(dp_size)
@@ -109,10 +115,13 @@ def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip,
                                     max_tokens=[16, 20][global_dp_rank % 2])
    # Create an LLM.
-    llm = LLM(model=model,
+    llm = LLM(
-              tensor_parallel_size=GPUs_per_dp_rank,
+        model=model,
-              enforce_eager=True,
+        tensor_parallel_size=GPUs_per_dp_rank,
-              enable_expert_parallel=True)
+        enforce_eager=enforce_eager,
+        enable_expert_parallel=True,
+        trust_remote_code=trust_remote_code,
+    )
    outputs = llm.generate(prompts, sampling_params)
    # Print the outputs.
    for i, output in enumerate(outputs):
@@ -155,7 +164,8 @@ if __name__ == "__main__":
        proc = Process(target=main,
                       args=(args.model, dp_size, local_dp_rank,
                             global_dp_rank, dp_master_ip, dp_master_port,
-                             tp_size))
+                             tp_size, args.enforce_eager,
+                             args.trust_remote_code))
        proc.start()
        procs.append(proc)
    exit_code = 0

--- a/examples/offline_inference/disaggregated-prefill-v1/README.md
+++ b/examples/offline_inference/disaggregated-prefill-v1/README.md
+# Disaggregated Prefill V1
+This example contains scripts that demonstrate disaggregated prefill in the offline setting of vLLM.
+## Files
+- `run.sh` - A helper script that will run `prefill_example.py` and `decode_example.py` sequentially.
+- `prefill_example.py` - A script which performs prefill only, saving the KV state to the `local_storage` directory and the prompts to `output.txt`.
+- `decode_example.py` - A script which performs decode only, loading the KV state from the `local_storage` directory and the prompts from `output.txt`.
--- a/examples/offline_inference/disaggregated-prefill-v1/decode_example.py
+++ b/examples/offline_inference/disaggregated-prefill-v1/decode_example.py
@@ -16,16 +16,17 @@ except FileNotFoundError:
 sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)
-llm = LLM(
+llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
-    model="meta-llama/Llama-3.2-1B-Instruct",
+          enforce_eager=True,
-    enforce_eager=True,
+          gpu_memory_utilization=0.8,
-    gpu_memory_utilization=0.8,
+          max_num_batched_tokens=64,
-    max_num_batched_tokens=64,
+          max_num_seqs=16,
-    max_num_seqs=16,
+          kv_transfer_config=KVTransferConfig(
-    kv_transfer_config=KVTransferConfig.from_cli(
+              kv_connector="SharedStorageConnector",
-        '{"kv_connector":"SharedStorageConnector","kv_role":"kv_both",'
+              kv_role="kv_both",
-        '"kv_connector_extra_config": {"shared_storage_path": "local_storage"}}'
+              kv_connector_extra_config={
-    ))  #, max_model_len=2048, max_num_batched_tokens=2048)
+                  "shared_storage_path": "local_storage"
+              }))  #, max_model_len=2048, max_num_batched_tokens=2048)
 # 1ST generation (prefill instance)
 outputs = llm.generate(prompts, sampling_params)

--- a/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py
+++ b/examples/offline_inference/disaggregated-prefill-v1/prefill_example.py
@@ -17,11 +17,12 @@ sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)
 llm = LLM(model="meta-llama/Llama-3.2-1B-Instruct",
          enforce_eager=True,
          gpu_memory_utilization=0.8,
-          kv_transfer_config=KVTransferConfig.from_cli(
+          kv_transfer_config=KVTransferConfig(
-              '{"kv_connector":"SharedStorageConnector","kv_role":"kv_both", '
+              kv_connector="SharedStorageConnector",
-              '"kv_connector_extra_config": '
+              kv_role="kv_both",
-              '{"shared_storage_path": "local_storage"}}')
+              kv_connector_extra_config={
-          )  #, max_model_len=2048, max_num_batched_tokens=2048)
+                  "shared_storage_path": "local_storage"
+              }))  #, max_model_len=2048, max_num_batched_tokens=2048)
 # 1ST generation (prefill instance)
 outputs = llm.generate(

--- a/examples/offline_inference/disaggregated_prefill.py
+++ b/examples/offline_inference/disaggregated_prefill.py
@@ -32,9 +32,10 @@ def run_prefill(prefill_done):
    # This instance is the prefill node (kv_producer, rank 0).
    # The number of parallel instances for KV cache transfer is set to 2,
    # as required for PyNcclConnector.
-    ktc = KVTransferConfig.from_cli(
+    ktc = KVTransferConfig(kv_connector="PyNcclConnector",
-        '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}'
+                           kv_role="kv_producer",
-    )
+                           kv_rank=0,
+                           kv_parallel_size=2)
    # Set GPU memory utilization to 0.8 for an A6000 GPU with 40GB
    # memory. You may need to adjust the value to fit your GPU.
@@ -71,9 +72,10 @@ def run_decode(prefill_done):
    # This instance is the decode node (kv_consumer, rank 1).
    # The number of parallel instances for KV cache transfer is set to 2,
    # as required for PyNcclConnector.
-    ktc = KVTransferConfig.from_cli(
+    ktc = KVTransferConfig(kv_connector="PyNcclConnector",
-        '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}'
+                           kv_role="kv_consumer",
-    )
+                           kv_rank=1,
+                           kv_parallel_size=2)
    # Set GPU memory utilization to 0.8 for an A6000 GPU with 40GB
    # memory. You may need to adjust the value to fit your GPU.

--- a/examples/offline_inference/eagle.py
+++ b/examples/offline_inference/eagle.py
@@ -36,6 +36,10 @@ def parse_args():
        help="downloaded from the eagle repo " \
        "https://github.com/SafeAILab/EAGLE/blob/main/eagle/data/"
    )
+    parser.add_argument("--method",
+                        type=str,
+                        default='eagle',
+                        choices=['eagle', 'eagle3'])
    parser.add_argument("--max_num_seqs", type=int, default=8)
    parser.add_argument("--num_prompts", type=int, default=80)
    parser.add_argument("--num_spec_tokens", type=int, default=2)
@@ -53,7 +57,13 @@ def main():
    args = parse_args()
    model_dir = "meta-llama/Llama-3.1-8B-Instruct"
-    eagle_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
+    if args.method == 'eagle':
+        eagle_dir = "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
+    elif args.method == 'eagle3':
+        eagle_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
+    else:
+        raise ValueError(f"unknown method: {args.method}")
    max_model_len = 2048
@@ -81,7 +91,7 @@ def main():
        max_num_seqs=args.max_num_seqs,
        gpu_memory_utilization=0.8,
        speculative_config={
-            "method": "eagle3" if "eagle3" in eagle_dir.lower() else "eagle",
+            "method": args.method,
            "model": eagle_dir,
            "num_speculative_tokens": args.num_spec_tokens,
            "draft_tensor_parallel_size": args.draft_tp,
@@ -95,6 +105,13 @@ def main():
    outputs = llm.generate(prompt_token_ids=prompt_ids,
                           sampling_params=sampling_params)
+    # print the generated text
+    for output in outputs:
+        print("-" * 50)
+        print(f"prompt: {output.prompt}")
+        print(f"generated text: {output.outputs[0].text}")
+        print("-" * 50)
    if not hasattr(outputs, "metrics") or outputs.metrics is None:
        return
@@ -108,8 +125,8 @@ def main():
            acceptance_counts[step] += count
    print("-" * 50)
-    print(f"mean acceptance length: \
+    print(f"mean acceptance length (including bonus tokens): \
-        {sum(acceptance_counts) / acceptance_counts[0]:.2f}")
+        {1 + (sum(acceptance_counts) / acceptance_counts[0]):.2f}")
    print("-" * 50)
    # print acceptance at each token position

--- a/examples/offline_inference/lora_with_quantization_inference.py
+++ b/examples/offline_inference/lora_with_quantization_inference.py
@@ -75,43 +75,38 @@ def initialize_engine(model: str, quantization: str,
                      lora_repo: Optional[str]) -> LLMEngine:
    """Initialize the LLMEngine."""
-    if quantization == "bitsandbytes":
+    engine_args = EngineArgs(model=model,
-        # QLoRA (https://arxiv.org/abs/2305.14314) is a quantization technique.
+                             quantization=quantization,
-        # It quantizes the model when loading, with some config info from the
+                             enable_lora=True,
-        # LoRA adapter repo. So need to set the parameter of load_format and
+                             max_lora_rank=64,
-        # qlora_adapter_name_or_path as below.
+                             max_loras=4)
-        engine_args = EngineArgs(model=model,
-                                 quantization=quantization,
-                                 qlora_adapter_name_or_path=lora_repo,
-                                 enable_lora=True,
-                                 max_lora_rank=64)
-    else:
-        engine_args = EngineArgs(model=model,
-                                 quantization=quantization,
-                                 enable_lora=True,
-                                 max_loras=4)
    return LLMEngine.from_engine_args(engine_args)
 def main():
    """Main function that sets up and runs the prompt processing."""
-    test_configs = [{
+    test_configs = [
-        "name": "qlora_inference_example",
+        # QLoRA (https://arxiv.org/abs/2305.14314)
-        'model': "huggyllama/llama-7b",
+        {
-        'quantization': "bitsandbytes",
+            "name": "qlora_inference_example",
-        'lora_repo': 'timdettmers/qlora-flan-7b'
+            'model': "huggyllama/llama-7b",
-    }, {
+            'quantization': "bitsandbytes",
-        "name": "AWQ_inference_with_lora_example",
+            'lora_repo': 'timdettmers/qlora-flan-7b'
-        'model': 'TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ',
+        },
-        'quantization': "awq",
+        {
-        'lora_repo': 'jashing/tinyllama-colorist-lora'
+            "name": "AWQ_inference_with_lora_example",
-    }, {
+            'model': 'TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ',
-        "name": "GPTQ_inference_with_lora_example",
+            'quantization': "awq",
-        'model': 'TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ',
+            'lora_repo': 'jashing/tinyllama-colorist-lora'
-        'quantization': "gptq",
+        },
-        'lora_repo': 'jashing/tinyllama-colorist-lora'
+        {
-    }]
+            "name": "GPTQ_inference_with_lora_example",
+            'model': 'TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ',
+            'quantization': "gptq",
+            'lora_repo': 'jashing/tinyllama-colorist-lora'
+        }
+    ]
    for test_config in test_configs:
        print(

--- a/examples/offline_inference/neuron_eagle.py
+++ b/examples/offline_inference/neuron_eagle.py
+# SPDX-License-Identifier: Apache-2.0
+"""
+This example shows how to run offline inference with an EAGLE speculative 
+decoding model on neuron. To use EAGLE speculative decoding, you must use
+a draft model that is specifically fine-tuned for EAGLE speculation.
+Additionally, to use EAGLE with NxD Inference, the draft model must include
+the LM head weights from the target model. These weights are shared between
+the draft and target model.
+"""
+from vllm import LLM, SamplingParams
+# Sample prompts.
+prompts = [
+    "What is annapurna labs?",
+]
+# Create a sampling params object.
+sampling_params = SamplingParams(top_k=1, max_tokens=500, ignore_eos=True)
+# Create an LLM.
+llm = LLM(
+    model="/home/ubuntu/model_hf/Meta-Llama-3.1-70B-Instruct",
+    speculative_config={
+        "model": "/home/ubuntu/model_hf/Llama-3.1-70B-Instruct-EAGLE-Draft",
+        "num_speculative_tokens": 5,
+        "max_model_len": 2048
+    },
+    max_num_seqs=4,
+    # The max_model_len and block_size arguments are required to be same as
+    # max sequence length when targeting neuron device.
+    # Currently, this is a known limitation in continuous batching support
+    # in neuronx-distributed-inference.
+    max_model_len=2048,
+    block_size=2048,
+    # The device can be automatically detected when AWS Neuron SDK is installed.
+    # The device argument can be either unspecified for automated detection,
+    # or explicitly assigned.
+    device="neuron",
+    tensor_parallel_size=32,
+    override_neuron_config={
+        "enable_eagle_speculation": True,
+        "enable_fused_speculation": True
+    },
+)
+# Generate texts from the prompts. The output is a list of RequestOutput objects
+# that contain the prompt, generated text, and other information.
+outputs = llm.generate(prompts, sampling_params)
+# Print the outputs.
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, \n\n\n\ Generated text: {generated_text!r}")
--- a/examples/offline_inference/neuron_speculation.py
+++ b/examples/offline_inference/neuron_speculation.py
+# SPDX-License-Identifier: Apache-2.0
+"""
+This example shows how to run offline inference with a speculative 
+decoding model on neuron.
+"""
+import os
+from vllm import LLM, SamplingParams
+# Sample prompts.
+prompts = [
+    "Hello, I am a language model and I can help",
+    "The president of the United States is",
+    "The capital of France is",
+]
+def config_buckets():
+    """Configure context length and token gen buckets."""
+    # creates XLA hlo graphs for all the context length buckets.
+    os.environ['NEURON_CONTEXT_LENGTH_BUCKETS'] = "128,512,1024,2048"
+    # creates XLA hlo graphs for all the token gen buckets.
+    os.environ['NEURON_TOKEN_GEN_BUCKETS'] = "128,512,1024,2048"
+def initialize_model():
+    """Create an LLM with speculative decoding."""
+    return LLM(
+        model="openlm-research/open_llama_7b",
+        speculative_config={
+            "model": "openlm-research/open_llama_3b",
+            "num_speculative_tokens": 4,
+            "max_model_len": 2048
+        },
+        max_num_seqs=4,
+        max_model_len=2048,
+        block_size=2048,
+        use_v2_block_manager=True,
+        device="neuron",
+        tensor_parallel_size=32,
+    )
+def process_requests(model: LLM, sampling_params: SamplingParams):
+    """Generate texts from prompts and print them."""
+    outputs = model.generate(prompts, sampling_params)
+    for output in outputs:
+        prompt = output.prompt
+        generated_text = output.outputs[0].text
+        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+def main():
+    """Main function that sets up the model and processes prompts."""
+    config_buckets()
+    model = initialize_model()
+    # Create a sampling params object.
+    sampling_params = SamplingParams(max_tokens=100, top_k=1)
+    process_requests(model, sampling_params)
+if __name__ == '__main__':
+    main()
--- a/examples/offline_inference/openai/openai_batch.md
+++ b/examples/offline_inference/openai/openai_batch.md
@@ -8,7 +8,7 @@ This is a guide to performing batch inference using the OpenAI batch file format
 The OpenAI batch file format consists of a series of JSON objects, one per line.
-[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/openai/openai_example_batch.jsonl)
+[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl)
 Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details.
@@ -30,13 +30,13 @@ We currently support `/v1/chat/completions`, `/v1/embeddings`, and `/v1/score` e
 To follow along with this example, you can download the example batch, or create your own batch file in your working directory.
 ```console
-wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl
+wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl
 ```
 Once you've created your batch file, it should look like this:
 ```console
-$ cat offline_inference/openai/openai_example_batch.jsonl
+$ cat offline_inference/openai_batch/openai_example_batch.jsonl
 {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
 {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
 ```
@@ -48,7 +48,7 @@ The batch running tool is designed to be used from the command line.
 You can run the batch with the following command, which will write its results to a file called `results.jsonl`
 ```console
-python -m vllm.entrypoints.openai.run_batch -i offline_inference/openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
+python -m vllm.entrypoints.openai.run_batch -i offline_inference/openai_batch/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
 ```
 ### Step 3: Check your results
@@ -65,10 +65,10 @@ $ cat results.jsonl
 The batch runner supports remote input and output URLs that are accessible via HTTP/HTTPS.
-For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl`, you can run
+For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl`, you can run
 ```console
-python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
+python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
 ```
 ## Example 3: Integrating with AWS S3
@@ -89,13 +89,13 @@ To integrate with cloud blob storage, we recommend using presigned urls.
 To follow along with this example, you can download the example batch, or create your own batch file in your working directory.
 ```console
-wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl
+wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl
 ```
 Once you've created your batch file, it should look like this:
 ```console
-$ cat offline_inference/openai/openai_example_batch.jsonl
+$ cat offline_inference/openai_batch/openai_example_batch.jsonl
 {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
 {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
 ```
@@ -103,7 +103,7 @@ $ cat offline_inference/openai/openai_example_batch.jsonl
 Now upload your batch file to your S3 bucket.
 ```console
-aws s3 cp offline_inference/openai/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl
+aws s3 cp offline_inference/openai_batch/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl
 ```
 ### Step 2: Generate your presigned urls