Merge branch 'v0.5.4-dtk24.04.1'

e7c1b7f3 · zhuwenwen · 7462218e · 04c62b93 · e7c1b7f3 · e7c1b7f3
Commit e7c1b7f3 authored Sep 06, 2024 by zhuwenwen
20 changed files
--- a/examples/logging_configuration.md
+++ b/examples/logging_configuration.md
@@ -95,9 +95,7 @@ to the path of the custom logging configuration JSON file:

 ```bash
 VLLM_LOGGING_CONFIG_PATH=/path/to/logging_config.json \
-    python3 -m vllm.entrypoints.openai.api_server \
-    --max-model-len 2048 \
-    --model mistralai/Mistral-7B-v0.1
+    vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048
 ```


@@ -152,9 +150,7 @@ to the path of the custom logging configuration JSON file:

 ```bash
 VLLM_LOGGING_CONFIG_PATH=/path/to/logging_config.json \
-    python3 -m vllm.entrypoints.openai.api_server \
-    --max-model-len 2048 \
-    --model mistralai/Mistral-7B-v0.1
+    vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048
 ```


@@ -167,9 +163,7 @@ loggers.

 ```bash
 VLLM_CONFIGURE_LOGGING=0 \
-    python3 -m vllm.entrypoints.openai.api_server \
-    --max-model-len 2048 \
-    --model mistralai/Mistral-7B-v0.1
+    vllm serve mistralai/Mistral-7B-v0.1 --max-model-len 2048
 ```



--- a/examples/offline_inference_distributed.py
+++ b/examples/offline_inference_distributed.py
@@ -5,7 +5,7 @@ distributively on a multi-nodes cluster.
 Learn more about Ray Data in https://docs.ray.io/en/latest/data/data.html
 """

-from typing import Dict
+from typing import Any, Dict, List

 import numpy as np
 import ray
@@ -40,8 +40,8 @@ class LLMPredictor:
        # The output is a list of RequestOutput objects that contain the prompt,
        # generated text, and other information.
        outputs = self.llm.generate(batch["text"], sampling_params)
-        prompt = []
-        generated_text = []
+        prompt: List[str] = []
+        generated_text: List[str] = []
        for output in outputs:
            prompt.append(output.prompt)
            generated_text.append(' '.join([o.text for o in output.outputs]))
@@ -71,7 +71,7 @@ def scheduling_strategy_fn():
        pg, placement_group_capture_child_tasks=True))


-resources_kwarg = {}
+resources_kwarg: Dict[str, Any] = {}
 if tensor_parallel_size == 1:
    # For tensor_parallel_size == 1, we simply set num_gpus=1.
    resources_kwarg["num_gpus"] = 1

--- a/examples/offline_inference_mlpspeculator.py
+++ b/examples/offline_inference_mlpspeculator.py
+import gc
+import time
+from typing import List
+
+from vllm import LLM, SamplingParams
+
+
+def time_generation(llm: LLM, prompts: List[str],
+                    sampling_params: SamplingParams):
+    # Generate texts from the prompts. The output is a list of RequestOutput
+    # objects that contain the prompt, generated text, and other information.
+    # Warmup first
+    llm.generate(prompts, sampling_params)
+    llm.generate(prompts, sampling_params)
+    start = time.time()
+    outputs = llm.generate(prompts, sampling_params)
+    end = time.time()
+    print((end - start) / sum([len(o.outputs[0].token_ids) for o in outputs]))
+    # Print the outputs.
+    for output in outputs:
+        generated_text = output.outputs[0].text
+        print(f"text: {generated_text!r}")
+
+
+if __name__ == "__main__":
+
+    template = (
+        "Below is an instruction that describes a task. Write a response "
+        "that appropriately completes the request.\n\n### Instruction:\n{}"
+        "\n\n### Response:\n")
+
+    # Sample prompts.
+    prompts = [
+        "Write about the president of the United States.",
+    ]
+    prompts = [template.format(prompt) for prompt in prompts]
+    # Create a sampling params object.
+    sampling_params = SamplingParams(temperature=0.0, max_tokens=200)
+
+    # Create an LLM without spec decoding
+    llm = LLM(model="meta-llama/Llama-2-13b-chat-hf")
+
+    print("Without speculation")
+    time_generation(llm, prompts, sampling_params)
+
+    del llm
+    gc.collect()
+
+    # Create an LLM with spec decoding
+    llm = LLM(
+        model="meta-llama/Llama-2-13b-chat-hf",
+        speculative_model="ibm-fms/llama-13b-accelerator",
+        # This is currently required for MLPSpeculator decoding
+        use_v2_block_manager=True,
+    )
+
+    print("With speculation")
+    time_generation(llm, prompts, sampling_params)
--- a/examples/offline_inference_neuron.py
+++ b/examples/offline_inference_neuron.py
--- a/examples/offline_inference_tpu.py
+++ b/examples/offline_inference_tpu.py
+from vllm import LLM, SamplingParams
+
+prompts = [
+    "A robot may not injure a human being",
+    "It is only with the heart that one can see rightly;",
+    "The greatest glory in living lies not in never falling,",
+]
+answers = [
+    " or, through inaction, allow a human being to come to harm.",
+    " what is essential is invisible to the eye.",
+    " but in rising every time we fall.",
+]
+N = 1
+# Currently, top-p sampling is disabled. `top_p` should be 1.0.
+sampling_params = SamplingParams(temperature=0.7,
+                                 top_p=1.0,
+                                 n=N,
+                                 max_tokens=16)
+
+# Set `enforce_eager=True` to avoid ahead-of-time compilation.
+# In real workloads, `enforce_eager` should be `False`.
+llm = LLM(model="google/gemma-2b", enforce_eager=True)
+outputs = llm.generate(prompts, sampling_params)
+for output, answer in zip(outputs, answers):
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+    assert generated_text.startswith(answer)
--- a/examples/offline_inference_vision_language.py
+++ b/examples/offline_inference_vision_language.py
+"""
+This example shows how to use vLLM for running offline inference 
+with the correct prompt format on vision language models.
+
+For most models, the prompt format should follow corresponding examples
+on HuggingFace model repository.
+"""
+from transformers import AutoTokenizer
+
+from vllm import LLM, SamplingParams
+from vllm.assets.image import ImageAsset
+from vllm.utils import FlexibleArgumentParser
+
+# Input image and question
+image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
+question = "What is the content of this image?"
+
+
+# LLaVA-1.5
+def run_llava(question):
+
+    prompt = f"USER: <image>\n{question}\nASSISTANT:"
+
+    llm = LLM(model="llava-hf/llava-1.5-7b-hf")
+
+    return llm, prompt
+
+
+# LLaVA-1.6/LLaVA-NeXT
+def run_llava_next(question):
+
+    prompt = f"[INST] <image>\n{question} [/INST]"
+    llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf")
+
+    return llm, prompt
+
+
+# Fuyu
+def run_fuyu(question):
+
+    prompt = f"{question}\n"
+    llm = LLM(model="adept/fuyu-8b")
+
+    return llm, prompt
+
+
+# Phi-3-Vision
+def run_phi3v(question):
+
+    prompt = f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n"  # noqa: E501
+    # Note: The default setting of max_num_seqs (256) and
+    # max_model_len (128k) for this model may cause OOM.
+    # You may lower either to run this example on lower-end GPUs.
+
+    # In this example, we override max_num_seqs to 5 while
+    # keeping the original context length of 128k.
+    llm = LLM(
+        model="microsoft/Phi-3-vision-128k-instruct",
+        trust_remote_code=True,
+        max_num_seqs=5,
+    )
+    return llm, prompt
+
+
+# PaliGemma
+def run_paligemma(question):
+
+    # PaliGemma has special prompt format for VQA
+    prompt = "caption en"
+    llm = LLM(model="google/paligemma-3b-mix-224")
+
+    return llm, prompt
+
+
+# Chameleon
+def run_chameleon(question):
+
+    prompt = f"{question}<image>"
+    llm = LLM(model="facebook/chameleon-7b")
+    return llm, prompt
+
+
+# MiniCPM-V
+def run_minicpmv(question):
+
+    # 2.0
+    # The official repo doesn't work yet, so we need to use a fork for now
+    # For more details, please see: See: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 # noqa
+    # model_name = "HwwwH/MiniCPM-V-2"
+
+    # 2.5
+    model_name = "openbmb/MiniCPM-Llama3-V-2_5"
+    tokenizer = AutoTokenizer.from_pretrained(model_name,
+                                              trust_remote_code=True)
+    llm = LLM(
+        model=model_name,
+        trust_remote_code=True,
+    )
+
+    messages = [{
+        'role': 'user',
+        'content': f'(<image>./</image>)\n{question}'
+    }]
+    prompt = tokenizer.apply_chat_template(messages,
+                                           tokenize=False,
+                                           add_generation_prompt=True)
+    return llm, prompt
+
+
+# InternVL
+def run_internvl(question):
+    # Generally, InternVL can use chatml template for conversation
+    TEMPLATE = "<|im_start|>User\n{prompt}<|im_end|>\n<|im_start|>Assistant\n"
+    prompt = f"<image>\n{question}\n"
+    prompt = TEMPLATE.format(prompt=prompt)
+    llm = LLM(
+        model="OpenGVLab/InternVL2-4B",
+        trust_remote_code=True,
+        max_num_seqs=5,
+    )
+    return llm, prompt
+
+
+# BLIP-2
+def run_blip2(question):
+
+    # BLIP-2 prompt format is inaccurate on HuggingFace model repository.
+    # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
+    prompt = f"Question: {question} Answer:"
+    llm = LLM(model="Salesforce/blip2-opt-2.7b")
+    return llm, prompt
+
+
+model_example_map = {
+    "llava": run_llava,
+    "llava-next": run_llava_next,
+    "fuyu": run_fuyu,
+    "phi3_v": run_phi3v,
+    "paligemma": run_paligemma,
+    "chameleon": run_chameleon,
+    "minicpmv": run_minicpmv,
+    "blip-2": run_blip2,
+    "internvl_chat": run_internvl,
+}
+
+
+def main(args):
+    model = args.model_type
+    if model not in model_example_map:
+        raise ValueError(f"Model type {model} is not supported.")
+
+    llm, prompt = model_example_map[model](question)
+
+    # We set temperature to 0.2 so that outputs can be different
+    # even when all prompts are identical when running batch inference.
+    sampling_params = SamplingParams(temperature=0.2, max_tokens=64)
+
+    assert args.num_prompts > 0
+    if args.num_prompts == 1:
+        # Single inference
+        inputs = {
+            "prompt": prompt,
+            "multi_modal_data": {
+                "image": image
+            },
+        }
+
+    else:
+        # Batch inference
+        inputs = [{
+            "prompt": prompt,
+            "multi_modal_data": {
+                "image": image
+            },
+        } for _ in range(args.num_prompts)]
+
+    outputs = llm.generate(inputs, sampling_params=sampling_params)
+
+    for o in outputs:
+        generated_text = o.outputs[0].text
+        print(generated_text)
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser(
+        description='Demo on using vLLM for offline inference with '
+        'vision language models')
+    parser.add_argument('--model-type',
+                        '-m',
+                        type=str,
+                        default="llava",
+                        choices=model_example_map.keys(),
+                        help='Huggingface "model_type".')
+    parser.add_argument('--num-prompts',
+                        type=int,
+                        default=1,
+                        help='Number of prompts to run.')
+
+    args = parser.parse_args()
+    main(args)
--- a/examples/openai_embedding_client.py
+++ b/examples/openai_embedding_client.py
@@ -13,11 +13,14 @@ client = OpenAI(
 models = client.models.list()
 model = models.data[0].id

-responses = client.embeddings.create(input=[
-    "Hello my name is",
-    "The best thing about vLLM is that it supports many different models"
-],
-                                     model=model)
+responses = client.embeddings.create(
+    input=[
+        "Hello my name is",
+        "The best thing about vLLM is that it supports many different models"
+    ],
+    model=model,
+    encoding_format="float",
+)

 for data in responses.data:
    print(data.embedding)  # list of float of len 4096
--- a/examples/openai_vision_api_client.py
+++ b/examples/openai_vision_api_client.py
+"""An example showing how to use vLLM to serve VLMs.
+
+Launch the vLLM server with the following command:
+vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja
+"""
+import base64
+
+import requests
+from openai import OpenAI
+
+# Modify OpenAI's API key and API base to use vLLM's API server.
+openai_api_key = "EMPTY"
+openai_api_base = "http://localhost:8000/v1"
+
+client = OpenAI(
+    # defaults to os.environ.get("OPENAI_API_KEY")
+    api_key=openai_api_key,
+    base_url=openai_api_base,
+)
+
+models = client.models.list()
+model = models.data[0].id
+
+image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+
+# Use image url in the payload
+chat_completion_from_url = client.chat.completions.create(
+    messages=[{
+        "role":
+        "user",
+        "content": [
+            {
+                "type": "text",
+                "text": "What’s in this image?"
+            },
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": image_url
+                },
+            },
+        ],
+    }],
+    model=model,
+    max_tokens=64,
+)
+
+result = chat_completion_from_url.choices[0].message.content
+print(f"Chat completion output:{result}")
+
+
+# Use base64 encoded image in the payload
+def encode_image_base64_from_url(image_url: str) -> str:
+    """Encode an image retrieved from a remote url to base64 format."""
+
+    with requests.get(image_url) as response:
+        response.raise_for_status()
+        result = base64.b64encode(response.content).decode('utf-8')
+
+    return result
+
+
+image_base64 = encode_image_base64_from_url(image_url=image_url)
+chat_completion_from_base64 = client.chat.completions.create(
+    messages=[{
+        "role":
+        "user",
+        "content": [
+            {
+                "type": "text",
+                "text": "What’s in this image?"
+            },
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": f"data:image/jpeg;base64,{image_base64}"
+                },
+            },
+        ],
+    }],
+    model=model,
+    max_tokens=64,
+)
+
+result = chat_completion_from_base64.choices[0].message.content
+print(f"Chat completion output:{result}")
--- a/examples/production_monitoring/Otel.md
+++ b/examples/production_monitoring/Otel.md
+# Setup OpenTelemetry POC
+
+1. Install OpenTelemetry packages:
+    ```
+    pip install \
+        opentelemetry-sdk \
+        opentelemetry-api \
+        opentelemetry-exporter-otlp \
+        opentelemetry-semantic-conventions-ai
+    ```
+
+1. Start Jaeger in a docker container:
+    ```
+    # From: https://www.jaegertracing.io/docs/1.57/getting-started/
+    docker run --rm --name jaeger \
+        -e COLLECTOR_ZIPKIN_HOST_PORT=:9411 \
+        -p 6831:6831/udp \
+        -p 6832:6832/udp \
+        -p 5778:5778 \
+        -p 16686:16686 \
+        -p 4317:4317 \
+        -p 4318:4318 \
+        -p 14250:14250 \
+        -p 14268:14268 \
+        -p 14269:14269 \
+        -p 9411:9411 \
+        jaegertracing/all-in-one:1.57
+    ```
+
+1. In a new shell, export Jaeger IP:
+    ```
+    export JAEGER_IP=$(docker inspect   --format '{{ .NetworkSettings.IPAddress }}' jaeger)
+    export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
+    ```
+    Then set vLLM's service name for OpenTelemetry, enable insecure connections to Jaeger and run vLLM:
+    ```
+    export OTEL_SERVICE_NAME="vllm-server"
+    export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
+    vllm serve facebook/opt-125m --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"
+    ```
+
+1. In a new shell, send requests with trace context from a dummy client:
+    ```
+    export JAEGER_IP=$(docker inspect --format '{{ .NetworkSettings.IPAddress }}' jaeger)
+    export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
+    export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
+    export OTEL_SERVICE_NAME="client-service"
+    python dummy_client.py
+    ```
+
+1. Open Jaeger webui: http://localhost:16686/
+
+    In the search pane, select `vllm-server` service and hit `Find Traces`. You should get a list of traces, one for each request.
+    ![Traces](https://i.imgur.com/GYHhFjo.png)
+
+1. Clicking on a trace will show its spans and their tags. In this demo, each trace has 2 spans. One from the dummy client containing the prompt text and one from vLLM containing metadata about the request.
+    ![Spans details](https://i.imgur.com/OPf6CBL.png)
+
+## Exporter Protocol
+OpenTelemetry supports either `grpc` or `http/protobuf` as the transport protocol for trace data in the exporter.
+By default, `grpc` is used. To set `http/protobuf` as the protocol, configure the `OTEL_EXPORTER_OTLP_TRACES_PROTOCOL` environment variable as follows:
+```
+export OTEL_EXPORTER_OTLP_TRACES_PROTOCOL=http/protobuf
+export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://$JAEGER_IP:4318/v1/traces
+vllm serve facebook/opt-125m --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"
+```
+
+## Instrumentation of FastAPI
+OpenTelemetry allows automatic instrumentation of FastAPI.
+1. Install the instrumentation library
+    ```
+    pip install opentelemetry-instrumentation-fastapi
+    ```
+
+1. Run vLLM with `opentelemetry-instrument`
+    ```
+    opentelemetry-instrument vllm serve facebook/opt-125m
+    ```
+
+1. Send a request to vLLM and find its trace in Jaeger. It should contain spans from FastAPI.
+
+![FastAPI Spans](https://i.imgur.com/hywvoOJ.png)
\ No newline at end of file
--- a/examples/production_monitoring/README.md
+++ b/examples/production_monitoring/README.md
@@ -10,8 +10,7 @@ Install:

 Prometheus metric logging is enabled by default in the OpenAI-compatible server. Launch via the entrypoint:
 ```bash
-python3 -m vllm.entrypoints.openai.api_server \
-    --model mistralai/Mistral-7B-v0.1 \
+vllm serve mistralai/Mistral-7B-v0.1 \
    --max-model-len 2048 \
    --disable-log-requests
 ```

--- a/examples/production_monitoring/dummy_client.py
+++ b/examples/production_monitoring/dummy_client.py
+import requests
+from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import (
+    OTLPSpanExporter)
+from opentelemetry.sdk.trace import TracerProvider
+from opentelemetry.sdk.trace.export import (BatchSpanProcessor,
+                                            ConsoleSpanExporter)
+from opentelemetry.trace import SpanKind, set_tracer_provider
+from opentelemetry.trace.propagation.tracecontext import (
+    TraceContextTextMapPropagator)
+
+# Create a tracer provider and install it as the global one for this process.
+trace_provider = TracerProvider()
+set_tracer_provider(trace_provider)
+
+# Every span is handed to two batch processors: one exporting over OTLP/gRPC
+# and one writing to the console.
+# NOTE(review): OTLPSpanExporter() is built without arguments, so it
+# presumably reads its endpoint from the OTEL_EXPORTER_OTLP_* environment
+# variables described in Otel.md - confirm.
+trace_provider.add_span_processor(BatchSpanProcessor(OTLPSpanExporter()))
+trace_provider.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter()))
+
+tracer = trace_provider.get_tracer("dummy-client")
+
+url = "http://localhost:8000/v1/completions"
+# Send one completion request inside a client span so the request carries
+# this span's trace context.
+with tracer.start_as_current_span("client-span", kind=SpanKind.CLIENT) as span:
+    prompt = "San Francisco is a"
+    # Record the prompt text on the client span.
+    span.set_attribute("prompt", prompt)
+    # The propagator writes the current trace context into `headers`, which
+    # are then sent along with the HTTP request below.
+    headers = {}
+    TraceContextTextMapPropagator().inject(headers)
+    # NOTE(review): "use_beam_search" is sent as the string "true" rather
+    # than a JSON boolean - confirm the server accepts/coerces it.
+    payload = {
+        "model": "facebook/opt-125m",
+        "prompt": prompt,
+        "max_tokens": 10,
+        "best_of": 20,
+        "n": 3,
+        "use_beam_search": "true",
+        "temperature": 0.0,
+        # "stream": True,
+    }
+    # The response is not inspected; the request is sent only to produce a
+    # trace.
+    response = requests.post(url, headers=headers, json=payload)
--- a/examples/production_monitoring/grafana.json
+++ b/examples/production_monitoring/grafana.json
 {
  "__inputs": [
-    {
-      "name": "DS_PROMETHEUS",
-      "label": "prometheus",
-      "description": "",
-      "type": "datasource",
-      "pluginId": "prometheus",
-      "pluginName": "Prometheus"
-    }
  ],
  "__elements": {},
  "__requires": [
@@ -1215,11 +1207,21 @@
  "templating": {
    "list": [
      {
+        "type": "datasource",
+        "name": "DS_PROMETHEUS",
+        "label": "datasource",
        "current": {},
-        "datasource": {
-          "type": "prometheus",
-          "uid": "${DS_PROMETHEUS}"
-        },
+        "hide": 0,
+        "includeAll": false,
+        "multi": false,
+        "options": [],
+        "query": "prometheus",
+        "queryValue": "",
+        "refresh": 1,
+        "regex": "",
+        "skipUrlSync": false
+      },
+      {
        "definition": "label_values(model_name)",
        "hide": 0,
        "includeAll": false,
@@ -1250,3 +1252,4 @@
  "version": 1,
  "weekStart": ""
 }
+
--- a/examples/run_cluster.sh
+++ b/examples/run_cluster.sh
+#!/bin/bash
+
+# Start a Ray head or worker node inside a Docker container named "node".
+#
+# Usage:
+#   run_cluster.sh docker_image head_node_address --head|--worker \
+#       path_to_hf_home [additional docker run args...]
+#
+# The container is stopped and removed when this script exits.
+
+# Check for minimum number of required arguments
+if [ $# -lt 4 ]; then
+    echo "Usage: $0 docker_image head_node_address --head|--worker path_to_hf_home [additional_args...]"
+    exit 1
+fi
+
+# Assign the first four arguments and shift them away
+DOCKER_IMAGE="$1"
+HEAD_NODE_ADDRESS="$2"
+NODE_TYPE="$3"  # Should be --head or --worker
+PATH_TO_HF_HOME="$4"
+shift 4
+
+# Additional arguments are passed directly to the Docker command.
+# Keep them in an array so arguments containing spaces or glob characters
+# reach `docker run` exactly as the caller quoted them.
+ADDITIONAL_ARGS=("$@")
+
+# Validate node type
+if [ "${NODE_TYPE}" != "--head" ] && [ "${NODE_TYPE}" != "--worker" ]; then
+    echo "Error: Node type must be --head or --worker"
+    exit 1
+fi
+
+# Define a function to cleanup on EXIT signal
+cleanup() {
+    docker stop node
+    docker rm node
+}
+trap cleanup EXIT
+
+# Command setup for head or worker node
+RAY_START_CMD="ray start --block"
+if [ "${NODE_TYPE}" == "--head" ]; then
+    RAY_START_CMD+=" --head --port=6379"
+else
+    RAY_START_CMD+=" --address=${HEAD_NODE_ADDRESS}:6379"
+fi
+
+# Run the docker command with the user specified parameters and additional arguments
+docker run \
+    --entrypoint /bin/bash \
+    --network host \
+    --name node \
+    --shm-size 10.24g \
+    --gpus all \
+    -v "${PATH_TO_HF_HOME}:/root/.cache/huggingface" \
+    "${ADDITIONAL_ARGS[@]}" \
+    "${DOCKER_IMAGE}" -c "${RAY_START_CMD}"
--- a/examples/save_sharded_state.py
+++ b/examples/save_sharded_state.py
@@ -20,15 +20,15 @@ llm = LLM(
    tensor_parallel_size=8,
 )
 """
-import argparse
 import dataclasses
 import os
 import shutil
 from pathlib import Path

 from vllm import LLM, EngineArgs
+from vllm.utils import FlexibleArgumentParser

-parser = argparse.ArgumentParser()
+parser = FlexibleArgumentParser()
 EngineArgs.add_cli_args(parser)
 parser.add_argument("--output",
                    "-o",

--- a/examples/template_blip2.jinja
+++ b/examples/template_blip2.jinja
+{%- for message in messages -%}
+    {%- if message['role'] == 'user' -%}
+        {{- 'Question: ' + message['content'] + ' ' -}}
+    {%- elif message['role'] == 'assistant' -%}
+        {{- 'Answer: ' + message['content'] + ' ' -}}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%}
+    {{- 'Answer:' -}}
+{% endif %}
--- a/examples/tensorize_vllm_model.py
+++ b/examples/tensorize_vllm_model.py
@@ -9,6 +9,7 @@ from vllm.engine.arg_utils import EngineArgs
 from vllm.model_executor.model_loader.tensorizer import (TensorizerArgs,
                                                         TensorizerConfig,
                                                         tensorize_vllm_model)
+from vllm.utils import FlexibleArgumentParser

 # yapf conflicts with isort for this docstring
 # yapf: disable
@@ -96,7 +97,7 @@ deserialization in this example script, although `--tensorizer-uri` and


 def parse_args():
-    parser = argparse.ArgumentParser(
+    parser = FlexibleArgumentParser(
        description="An example script that can be used to serialize and "
        "deserialize vLLM models. These models "
        "can be loaded using tensorizer directly to the GPU "

--- a/format.sh
+++ b/format.sh
@@ -96,22 +96,20 @@ echo 'vLLM yapf: Done'

 # Run mypy
 echo 'vLLM mypy:'
-mypy vllm/attention --config-file pyproject.toml
-mypy vllm/core --config-file pyproject.toml
-mypy vllm/distributed --config-file pyproject.toml
-mypy vllm/entrypoints --config-file pyproject.toml
-mypy vllm/executor --config-file pyproject.toml
-mypy vllm/multimodal --config-file pyproject.toml
-mypy vllm/usage --config-file pyproject.toml
-mypy vllm/*.py --config-file pyproject.toml
-mypy vllm/transformers_utils --config-file pyproject.toml
-mypy vllm/engine  --config-file pyproject.toml
-mypy vllm/worker --config-file pyproject.toml
-mypy vllm/spec_decode --config-file pyproject.toml
-mypy vllm/model_executor  --config-file pyproject.toml
-mypy vllm/lora --config-file pyproject.toml
-mypy vllm/logging --config-file pyproject.toml
-mypy vllm/model_executor --config-file pyproject.toml
+mypy --follow-imports skip  # Note that this is less strict than CI
+mypy tests --follow-imports skip
+mypy vllm/attention --follow-imports skip
+mypy vllm/core --follow-imports skip
+mypy vllm/distributed --follow-imports skip
+mypy vllm/engine  --follow-imports skip
+mypy vllm/entrypoints --follow-imports skip
+mypy vllm/executor --follow-imports skip
+mypy vllm/lora --follow-imports skip
+mypy vllm/model_executor  --follow-imports skip
+mypy vllm/prompt_adapter --follow-imports skip
+mypy vllm/spec_decode --follow-imports skip
+mypy vllm/worker --follow-imports skip
+echo 'vLLM mypy: Done'


 # If git diff returns a file that is in the skip list, the file may be checked anyway:
@@ -130,7 +128,7 @@ spell_check_all(){
  codespell --toml pyproject.toml "${CODESPELL_EXCLUDES[@]}"
 }

-# Spelling  check of files that differ from main branch.
+# Spelling check of files that differ from main branch.
 spell_check_changed() {
    # The `if` guard ensures that the list of filenames is not empty, which
    # could cause ruff to receive 0 positional arguments, making it hang
@@ -244,12 +242,6 @@ echo 'vLLM isort: Done'
 # NOTE: Keep up to date with .github/workflows/clang-format.yml
 CLANG_FORMAT_EXCLUDES=(
    'csrc/moe/topk_softmax_kernels.cu'
-    'csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu'
-    'csrc/punica/bgmv/bgmv_config.h'
-    'csrc/punica/bgmv/bgmv_impl.cuh'
-    'csrc/punica/bgmv/vec_dtypes.cuh'
-    'csrc/punica/punica_ops.cu'
-    'csrc/punica/type_convert.h'
 )

 # Format specified files with clang-format

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -5,7 +5,7 @@ requires = [
    "ninja",
    "packaging",
    "setuptools >= 49.4.0",
-    "torch == 2.3.0",
+    "torch == 2.4.0",
    "wheel",
 ]
 build-backend = "setuptools.build_meta"
@@ -48,9 +48,22 @@ python_version = "3.8"

 ignore_missing_imports = true
 check_untyped_defs = true
-follow_imports = "skip"
+follow_imports = "silent"

-files = "vllm"
+# After fixing type errors resulting from follow_imports: "skip" -> "silent",
+# move the directory here and remove it from format.sh and mypy.yaml
+files = [
+    "vllm/*.py",
+    "vllm/adapter_commons",
+    "vllm/assets",
+    "vllm/inputs",
+    "vllm/logging",
+    "vllm/multimodal",
+    "vllm/platforms",
+    "vllm/transformers_utils",
+    "vllm/triton_utils",
+    "vllm/usage",
+]
 # TODO(woosuk): Include the code from Megatron and HuggingFace.
 exclude = [
    "vllm/model_executor/parallel_utils/|vllm/model_executor/models/",
@@ -69,7 +82,5 @@ skip_gitignore = true
 [tool.pytest.ini_options]
 markers = [
    "skip_global_cleanup",
-    "llm: run tests for vLLM API only",
-    "openai: run tests for OpenAI API only",
-    "llava: run tests for LLaVA models only",
+    "vlm: run tests for vision language models only",
 ]
--- a/requirements-adag.txt
+++ b/requirements-adag.txt
+# Dependencies for Ray accelerated DAG
+cupy-cuda12x
+ray >= 2.32
\ No newline at end of file
--- a/requirements-build.txt
+++ b/requirements-build.txt
@@ -3,5 +3,5 @@ cmake>=3.21
 ninja
 packaging
 setuptools>=49.4.0
-torch==2.3.0
+torch==2.4.0
 wheel