Commit 8228a79e authored by laibao

update vllm0.6.2

parent d77d1901
"""An example showing how to use vLLM to serve VLMs.
Launch the vLLM server with the following command:
(single image inference with Llava)
vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja
(multi-image inference with Phi-3.5-vision-instruct)
vllm serve microsoft/Phi-3.5-vision-instruct --max-model-len 4096 \
--trust-remote-code --limit-mm-per-prompt image=2
"""
import base64
import requests
from openai import OpenAI
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
# Single-image input inference
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
## Use image url in the payload
chat_completion_from_url = client.chat.completions.create(
messages=[{
"role":
"user",
"content": [
{
"type": "text",
"text": "What's in this image?"
},
{
"type": "image_url",
"image_url": {
"url": image_url
},
},
],
}],
model=model,
max_tokens=64,
)
result = chat_completion_from_url.choices[0].message.content
print("Chat completion output:", result)
## Use base64 encoded image in the payload
def encode_image_base64_from_url(image_url: str) -> str:
"""Encode an image retrieved from a remote url to base64 format."""
with requests.get(image_url) as response:
response.raise_for_status()
result = base64.b64encode(response.content).decode('utf-8')
return result
image_base64 = encode_image_base64_from_url(image_url=image_url)
chat_completion_from_base64 = client.chat.completions.create(
messages=[{
"role":
"user",
"content": [
{
"type": "text",
"text": "What's in this image?"
},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{image_base64}"
},
},
],
}],
model=model,
max_tokens=64,
)
result = chat_completion_from_base64.choices[0].message.content
print(f"Chat completion output:{result}")
# Multi-image input inference
image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
chat_completion_from_url = client.chat.completions.create(
messages=[{
"role":
"user",
"content": [
{
"type": "text",
"text": "What are the animals in these images?"
},
{
"type": "image_url",
"image_url": {
"url": image_url_duck
},
},
{
"type": "image_url",
"image_url": {
"url": image_url_lion
},
},
],
}],
model=model,
max_tokens=64,
)
result = chat_completion_from_url.choices[0].message.content
print("Chat completion output:", result)
# Setup OpenTelemetry POC
1. Install OpenTelemetry packages:
```
pip install \
'opentelemetry-sdk>=1.26.0,<1.27.0' \
'opentelemetry-api>=1.26.0,<1.27.0' \
'opentelemetry-exporter-otlp>=1.26.0,<1.27.0' \
'opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0'
```
1. Start Jaeger in a docker container:
```
# From: https://www.jaegertracing.io/docs/1.57/getting-started/
docker run --rm --name jaeger \
-e COLLECTOR_ZIPKIN_HOST_PORT=:9411 \
-p 6831:6831/udp \
-p 6832:6832/udp \
-p 5778:5778 \
-p 16686:16686 \
-p 4317:4317 \
-p 4318:4318 \
-p 14250:14250 \
-p 14268:14268 \
-p 14269:14269 \
-p 9411:9411 \
jaegertracing/all-in-one:1.57
```
1. In a new shell, export Jaeger IP:
```
export JAEGER_IP=$(docker inspect --format '{{ .NetworkSettings.IPAddress }}' jaeger)
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
```
Then set vLLM's service name for OpenTelemetry, enable insecure connections to Jaeger, and run vLLM:
```
export OTEL_SERVICE_NAME="vllm-server"
export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
vllm serve facebook/opt-125m --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"
```
1. In a new shell, send requests with trace context from a dummy client:
```
export JAEGER_IP=$(docker inspect --format '{{ .NetworkSettings.IPAddress }}' jaeger)
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
export OTEL_SERVICE_NAME="client-service"
python dummy_client.py
```
1. Open the Jaeger web UI: http://localhost:16686/
In the search pane, select the `vllm-server` service and click `Find Traces`. You should see a list of traces, one per request.
![Traces](https://i.imgur.com/GYHhFjo.png)
1. Clicking on a trace shows its spans and their tags. In this demo, each trace has two spans: one from the dummy client containing the prompt text, and one from vLLM containing metadata about the request.
![Spans details](https://i.imgur.com/OPf6CBL.png)
## Exporter Protocol
OpenTelemetry supports either `grpc` or `http/protobuf` as the transport protocol for trace data in the exporter.
By default, `grpc` is used. To set `http/protobuf` as the protocol, configure the `OTEL_EXPORTER_OTLP_TRACES_PROTOCOL` environment variable as follows:
```
export OTEL_EXPORTER_OTLP_TRACES_PROTOCOL=http/protobuf
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://$JAEGER_IP:4318/v1/traces
vllm serve facebook/opt-125m --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"
```
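The same switch can also be made in code when you build a tracing client by hand. Below is a minimal sketch (not part of this repo) that constructs the OTLP exporter explicitly for either transport; the `localhost` endpoints and ports assume the Jaeger container started above:
```
# Minimal sketch: pick the OTLP transport explicitly instead of via env vars.
# Endpoints assume the Jaeger all-in-one container from the steps above.
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor

# http/protobuf transport -> port 4318, path /v1/traces
from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
    OTLPSpanExporter as HTTPSpanExporter)
# grpc transport -> port 4317
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import (
    OTLPSpanExporter as GRPCSpanExporter)

provider = TracerProvider()
provider.add_span_processor(
    BatchSpanProcessor(
        HTTPSpanExporter(endpoint="http://localhost:4318/v1/traces")))
# Or, for gRPC:
# provider.add_span_processor(
#     BatchSpanProcessor(
#         GRPCSpanExporter(endpoint="http://localhost:4317", insecure=True)))
```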
## Instrumentation of FastAPI
OpenTelemetry allows automatic instrumentation of FastAPI.
1. Install the instrumentation library
```
pip install opentelemetry-instrumentation-fastapi
```
1. Run vLLM with `opentelemetry-instrument`
```
opentelemetry-instrument vllm serve facebook/opt-125m
```
1. Send a request to vLLM and find its trace in Jaeger. It should contain spans from FastAPI.
![FastAPI Spans](https://i.imgur.com/hywvoOJ.png)
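If you run the server from your own Python entrypoint rather than through the `opentelemetry-instrument` wrapper, the same library can be attached to a FastAPI app explicitly. A minimal sketch with a stand-in app (not vLLM's actual server), just to show the call involved:
```
# Minimal sketch: explicit instrumentation of a FastAPI app.
# The app below is a stand-in, not vLLM's API server.
from fastapi import FastAPI
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor

app = FastAPI()

@app.get("/health")
def health():
    return {"status": "ok"}

# Emits a span for every request handled by this app.
FastAPIInstrumentor.instrument_app(app)
```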
# vLLM + Prometheus/Grafana
This is a simple example that shows how to connect vLLM metric logging to the Prometheus/Grafana stack. For this example, we launch Prometheus and Grafana via Docker. You can check out other deployment methods on the [Prometheus](https://prometheus.io/) and [Grafana](https://grafana.com/) websites.
Install:
- [`docker`](https://docs.docker.com/engine/install/)
- [`docker compose`](https://docs.docker.com/compose/install/linux/#install-using-the-repository)
### Launch
Prometheus metric logging is enabled by default in the OpenAI-compatible server. Launch via the entrypoint:
```bash
python3 -m vllm.entrypoints.openai.api_server \
--model mistralai/Mistral-7B-v0.1 \
--max-model-len 2048 \
--disable-log-requests
```
Launch Prometheus and Grafana servers with `docker compose`:
```bash
docker compose up
```
Submit some sample requests to the server:
```bash
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
python3 ../../benchmarks/benchmark_serving.py \
--model mistralai/Mistral-7B-v0.1 \
--tokenizer mistralai/Mistral-7B-v0.1 \
--endpoint /v1/completions \
--dataset-name sharegpt \
--dataset-path ShareGPT_V3_unfiltered_cleaned_split.json \
--request-rate 3.0
```
Navigating to [`http://localhost:8000/metrics`](http://localhost:8000/metrics) will show the raw Prometheus metrics being exposed by vLLM.
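To inspect the same data programmatically, here is a small sketch that pulls the endpoint and prints only the vLLM series (it assumes the server launched above is still listening on port 8000):
```python
# Sketch: print only the vLLM-specific Prometheus series from the server.
import requests

metrics = requests.get("http://localhost:8000/metrics", timeout=5).text
for line in metrics.splitlines():
    if line.startswith("vllm:"):
        print(line)
```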
### Grafana Dashboard
Navigate to [`http://localhost:3000`](http://localhost:3000). Log in with the default username (`admin`) and password (`admin`).
#### Add Prometheus Data Source
Navigate to [`http://localhost:3000/connections/datasources/new`](http://localhost:3000/connections/datasources/new) and select Prometheus.
On the Prometheus configuration page, add the `Prometheus Server URL` under `Connection`. In this setup, Grafana and Prometheus run in separate containers, but Docker creates a DNS name for each container, so you can simply use `http://prometheus:9090`.
Click `Save & Test`. You should get a green check saying "Successfully queried the Prometheus API.".
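If you prefer to script this step, Grafana also exposes an HTTP API. A sketch using the default `admin`/`admin` credentials and the container DNS name from the compose file (adjust if you changed either):
```python
# Sketch: create the Prometheus data source via Grafana's HTTP API.
import requests

resp = requests.post(
    "http://localhost:3000/api/datasources",
    auth=("admin", "admin"),
    json={
        "name": "prometheus",
        "type": "prometheus",
        "url": "http://prometheus:9090",  # container DNS name from docker compose
        "access": "proxy",
    },
    timeout=5,
)
resp.raise_for_status()
```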
#### Import Dashboard
Navigate to [`http://localhost:3000/dashboard/import`](http://localhost:3000/dashboard/import), upload `grafana.json`, and select the `prometheus` datasource. You should see a screen that looks like the following:
![Grafana Dashboard Image](https://i.imgur.com/R2vH9VW.png)
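The import can also be scripted against the same HTTP API. A sketch that uploads `grafana.json` and binds its `DS_PROMETHEUS` input to the data source created above:
```python
# Sketch: import grafana.json via the HTTP API and bind DS_PROMETHEUS
# to the "prometheus" data source (default admin/admin credentials assumed).
import json
import requests

with open("grafana.json") as f:
    dashboard = json.load(f)

resp = requests.post(
    "http://localhost:3000/api/dashboards/import",
    auth=("admin", "admin"),
    json={
        "dashboard": dashboard,
        "overwrite": True,
        "inputs": [{
            "name": "DS_PROMETHEUS",
            "type": "datasource",
            "pluginId": "prometheus",
            "value": "prometheus",
        }],
    },
    timeout=5,
)
resp.raise_for_status()
```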
# docker-compose.yaml
version: "3"
services:
  prometheus:
    image: prom/prometheus:latest
    extra_hosts:
      - "host.docker.internal:host-gateway" # allow a direct connection from container to the local machine
    ports:
      - "9090:9090" # the default port used by Prometheus
    volumes:
      - ${PWD}/prometheus.yaml:/etc/prometheus/prometheus.yml # mount Prometheus config file
  grafana:
    image: grafana/grafana:latest
    depends_on:
      - prometheus
    ports:
      - "3000:3000" # the default port used by Grafana
import requests
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import (
OTLPSpanExporter)
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import (BatchSpanProcessor,
ConsoleSpanExporter)
from opentelemetry.trace import SpanKind, set_tracer_provider
from opentelemetry.trace.propagation.tracecontext import (
TraceContextTextMapPropagator)
trace_provider = TracerProvider()
set_tracer_provider(trace_provider)
trace_provider.add_span_processor(BatchSpanProcessor(OTLPSpanExporter()))
trace_provider.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter()))
tracer = trace_provider.get_tracer("dummy-client")
url = "http://localhost:8000/v1/completions"
with tracer.start_as_current_span("client-span", kind=SpanKind.CLIENT) as span:
prompt = "San Francisco is a"
span.set_attribute("prompt", prompt)
headers = {}
TraceContextTextMapPropagator().inject(headers)
payload = {
"model": "facebook/opt-125m",
"prompt": prompt,
"max_tokens": 10,
"best_of": 20,
"n": 3,
"use_beam_search": "true",
"temperature": 0.0,
# "stream": True,
}
response = requests.post(url, headers=headers, json=payload)
{
"__inputs": [
{
"name": "DS_PROMETHEUS",
"label": "prometheus",
"description": "",
"type": "datasource",
"pluginId": "prometheus",
"pluginName": "Prometheus"
}
],
"__elements": {},
"__requires": [
{
"type": "grafana",
"id": "grafana",
"name": "Grafana",
"version": "10.4.2"
},
{
"type": "panel",
"id": "heatmap",
"name": "Heatmap",
"version": ""
},
{
"type": "datasource",
"id": "prometheus",
"name": "Prometheus",
"version": "1.0.0"
},
{
"type": "panel",
"id": "timeseries",
"name": "Time series",
"version": ""
}
],
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"target": {
"limit": 100,
"matchAny": false,
"tags": [],
"type": "dashboard"
},
"type": "dashboard"
}
]
},
"description": "Monitoring vLLM Inference Server",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [],
"liveNow": false,
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "End to end request latency measured in seconds.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
},
"id": 9,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
"fullMetaSearch": false,
"includeNullMetadata": false,
"instant": false,
"legendFormat": "P99",
"range": true,
"refId": "A",
"useBackend": false
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
"fullMetaSearch": false,
"hide": false,
"includeNullMetadata": false,
"instant": false,
"legendFormat": "P95",
"range": true,
"refId": "B",
"useBackend": false
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
"fullMetaSearch": false,
"hide": false,
"includeNullMetadata": false,
"instant": false,
"legendFormat": "P90",
"range": true,
"refId": "C",
"useBackend": false
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
"fullMetaSearch": false,
"hide": false,
"includeNullMetadata": false,
"instant": false,
"legendFormat": "P50",
"range": true,
"refId": "D",
"useBackend": false
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "rate(vllm:e2e_request_latency_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:e2e_request_latency_seconds_count{model_name=\"$model_name\"}[$__rate_interval])",
"hide": false,
"instant": false,
"legendFormat": "Average",
"range": true,
"refId": "E"
}
],
"title": "E2E Request Latency",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Number of tokens processed per second",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
},
"id": 8,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "rate(vllm:prompt_tokens_total{model_name=\"$model_name\"}[$__rate_interval])",
"fullMetaSearch": false,
"includeNullMetadata": false,
"instant": false,
"legendFormat": "Prompt Tokens/Sec",
"range": true,
"refId": "A",
"useBackend": false
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "rate(vllm:generation_tokens_total{model_name=\"$model_name\"}[$__rate_interval])",
"fullMetaSearch": false,
"hide": false,
"includeNullMetadata": false,
"instant": false,
"legendFormat": "Generation Tokens/Sec",
"range": true,
"refId": "B",
"useBackend": false
}
],
"title": "Token Throughput",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Inter token latency in seconds.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 8
},
"id": 10,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
"fullMetaSearch": false,
"includeNullMetadata": false,
"instant": false,
"legendFormat": "P99",
"range": true,
"refId": "A",
"useBackend": false
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
"fullMetaSearch": false,
"hide": false,
"includeNullMetadata": false,
"instant": false,
"legendFormat": "P95",
"range": true,
"refId": "B",
"useBackend": false
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
"fullMetaSearch": false,
"hide": false,
"includeNullMetadata": false,
"instant": false,
"legendFormat": "P90",
"range": true,
"refId": "C",
"useBackend": false
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
"fullMetaSearch": false,
"hide": false,
"includeNullMetadata": false,
"instant": false,
"legendFormat": "P50",
"range": true,
"refId": "D",
"useBackend": false
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "rate(vllm:time_per_output_token_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:time_per_output_token_seconds_count{model_name=\"$model_name\"}[$__rate_interval])",
"hide": false,
"instant": false,
"legendFormat": "Mean",
"range": true,
"refId": "E"
}
],
"title": "Time Per Output Token Latency",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Number of requests in RUNNING, WAITING, and SWAPPED state",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 8
},
"id": 3,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "vllm:num_requests_running{model_name=\"$model_name\"}",
"fullMetaSearch": false,
"includeNullMetadata": true,
"instant": false,
"legendFormat": "Num Running",
"range": true,
"refId": "A",
"useBackend": false
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "vllm:num_requests_swapped{model_name=\"$model_name\"}",
"fullMetaSearch": false,
"hide": false,
"includeNullMetadata": true,
"instant": false,
"legendFormat": "Num Swapped",
"range": true,
"refId": "B",
"useBackend": false
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "vllm:num_requests_waiting{model_name=\"$model_name\"}",
"fullMetaSearch": false,
"hide": false,
"includeNullMetadata": true,
"instant": false,
"legendFormat": "Num Waiting",
"range": true,
"refId": "C",
"useBackend": false
}
],
"title": "Scheduler State",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "P50, P90, P95, and P99 TTFT latency in seconds.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 16
},
"id": 5,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
"fullMetaSearch": false,
"hide": false,
"includeNullMetadata": false,
"instant": false,
"legendFormat": "P99",
"range": true,
"refId": "A",
"useBackend": false
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
"fullMetaSearch": false,
"includeNullMetadata": false,
"instant": false,
"legendFormat": "P95",
"range": true,
"refId": "B",
"useBackend": false
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
"fullMetaSearch": false,
"hide": false,
"includeNullMetadata": false,
"instant": false,
"legendFormat": "P90",
"range": true,
"refId": "C",
"useBackend": false
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket{model_name=\"$model_name\"}[$__rate_interval])))",
"fullMetaSearch": false,
"hide": false,
"includeNullMetadata": false,
"instant": false,
"legendFormat": "P50",
"range": true,
"refId": "D",
"useBackend": false
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "rate(vllm:time_to_first_token_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:time_to_first_token_seconds_count{model_name=\"$model_name\"}[$__rate_interval])",
"hide": false,
"instant": false,
"legendFormat": "Average",
"range": true,
"refId": "E"
}
],
"title": "Time To First Token Latency",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Percentage of used cache blocks by vLLM.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "percentunit"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 16
},
"id": 4,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "vllm:gpu_cache_usage_perc{model_name=\"$model_name\"}",
"instant": false,
"legendFormat": "GPU Cache Usage",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "vllm:cpu_cache_usage_perc{model_name=\"$model_name\"}",
"hide": false,
"instant": false,
"legendFormat": "CPU Cache Usage",
"range": true,
"refId": "B"
}
],
"title": "Cache Utilization",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Heatmap of request prompt length",
"fieldConfig": {
"defaults": {
"custom": {
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"scaleDistribution": {
"type": "linear"
}
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 24
},
"id": 12,
"options": {
"calculate": false,
"cellGap": 1,
"cellValues": {
"unit": "none"
},
"color": {
"exponent": 0.5,
"fill": "dark-orange",
"min": 0,
"mode": "scheme",
"reverse": false,
"scale": "exponential",
"scheme": "Spectral",
"steps": 64
},
"exemplars": {
"color": "rgba(255,0,255,0.7)"
},
"filterValues": {
"le": 1e-9
},
"legend": {
"show": true
},
"rowsFrame": {
"layout": "auto",
"value": "Request count"
},
"tooltip": {
"mode": "single",
"showColorScale": false,
"yHistogram": true
},
"yAxis": {
"axisLabel": "Prompt Length",
"axisPlacement": "left",
"reverse": false,
"unit": "none"
}
},
"pluginVersion": "10.4.2",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "sum by(le) (increase(vllm:request_prompt_tokens_bucket{model_name=\"$model_name\"}[$__rate_interval]))",
"format": "heatmap",
"fullMetaSearch": false,
"includeNullMetadata": true,
"instant": false,
"legendFormat": "{{le}}",
"range": true,
"refId": "A",
"useBackend": false
}
],
"title": "Request Prompt Length",
"type": "heatmap"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Heatmap of request generation length",
"fieldConfig": {
"defaults": {
"custom": {
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"scaleDistribution": {
"type": "linear"
}
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 24
},
"id": 13,
"options": {
"calculate": false,
"cellGap": 1,
"cellValues": {
"unit": "none"
},
"color": {
"exponent": 0.5,
"fill": "dark-orange",
"min": 0,
"mode": "scheme",
"reverse": false,
"scale": "exponential",
"scheme": "Spectral",
"steps": 64
},
"exemplars": {
"color": "rgba(255,0,255,0.7)"
},
"filterValues": {
"le": 1e-9
},
"legend": {
"show": true
},
"rowsFrame": {
"layout": "auto",
"value": "Request count"
},
"tooltip": {
"mode": "single",
"showColorScale": false,
"yHistogram": true
},
"yAxis": {
"axisLabel": "Generation Length",
"axisPlacement": "left",
"reverse": false,
"unit": "none"
}
},
"pluginVersion": "10.4.2",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "sum by(le) (increase(vllm:request_generation_tokens_bucket{model_name=\"$model_name\"}[$__rate_interval]))",
"format": "heatmap",
"fullMetaSearch": false,
"includeNullMetadata": true,
"instant": false,
"legendFormat": "{{le}}",
"range": true,
"refId": "A",
"useBackend": false
}
],
"title": "Request Generation Length",
"type": "heatmap"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Number of finished requests by their finish reason: either an EOS token was generated or the max sequence length was reached.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 32
},
"id": 11,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "sum by(finished_reason) (increase(vllm:request_success_total{model_name=\"$model_name\"}[$__rate_interval]))",
"fullMetaSearch": false,
"includeNullMetadata": true,
"instant": false,
"interval": "",
"legendFormat": "__auto",
"range": true,
"refId": "A",
"useBackend": false
}
],
"title": "Finish Reason",
"type": "timeseries"
}
],
"refresh": "",
"schemaVersion": 39,
"tags": [],
"templating": {
"list": [
{
"current": {},
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"definition": "label_values(model_name)",
"hide": 0,
"includeAll": false,
"label": "model_name",
"multi": false,
"name": "model_name",
"options": [],
"query": {
"query": "label_values(model_name)",
"refId": "StandardVariableQuery"
},
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"type": "query"
}
]
},
"time": {
"from": "now-5m",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "vLLM",
"uid": "b281712d-8bff-41ef-9f3f-71ad43c05e9b",
"version": 1,
"weekStart": ""
}
# prometheus.yaml
global:
  scrape_interval: 5s
  evaluation_interval: 30s

scrape_configs:
  - job_name: vllm
    static_configs:
      - targets:
          - 'host.docker.internal:8000'
#!/bin/bash
# Check for minimum number of required arguments
if [ $# -lt 4 ]; then
echo "Usage: $0 docker_image head_node_address --head|--worker path_to_hf_home [additional_args...]"
exit 1
fi
# Assign the first three arguments and shift them away
DOCKER_IMAGE="$1"
HEAD_NODE_ADDRESS="$2"
NODE_TYPE="$3" # Should be --head or --worker
PATH_TO_HF_HOME="$4"
shift 4
# Additional arguments are passed directly to the Docker command
ADDITIONAL_ARGS="$@"
# Validate node type
if [ "${NODE_TYPE}" != "--head" ] && [ "${NODE_TYPE}" != "--worker" ]; then
echo "Error: Node type must be --head or --worker"
exit 1
fi
# Define a function to cleanup on EXIT signal
cleanup() {
docker stop node
docker rm node
}
trap cleanup EXIT
# Command setup for head or worker node
RAY_START_CMD="ray start --block"
if [ "${NODE_TYPE}" == "--head" ]; then
RAY_START_CMD+=" --head --port=6379"
else
RAY_START_CMD+=" --address=${HEAD_NODE_ADDRESS}:6379"
fi
# Run the docker command with the user specified parameters and additional arguments
docker run \
--entrypoint /bin/bash \
--network host \
--name node \
--shm-size 10.24g \
--gpus all \
-v "${PATH_TO_HF_HOME}:/root/.cache/huggingface" \
${ADDITIONAL_ARGS} \
"${DOCKER_IMAGE}" -c "${RAY_START_CMD}"
"""
Saves each worker's model state dict directly to a checkpoint, which enables a
fast load path for large tensor-parallel models where each worker only needs to
read its own shard rather than the entire checkpoint.
Example usage:
python save_sharded_state.py \
--model /path/to/load \
--quantization deepspeedfp \
--tensor-parallel-size 8 \
--output /path/to/save
Then, the model can be loaded with
llm = LLM(
model="/path/to/save",
load_format="sharded_state",
quantization="deepspeedfp",
tensor_parallel_size=8,
)
"""
import argparse
import dataclasses
import os
import shutil
from pathlib import Path
from vllm import LLM, EngineArgs
parser = argparse.ArgumentParser()
EngineArgs.add_cli_args(parser)
parser.add_argument("--output",
"-o",
required=True,
type=str,
help="path to output checkpoint")
parser.add_argument("--file-pattern",
type=str,
help="string pattern of saved filenames")
parser.add_argument("--max-file-size",
type=int,
default=5 * 1024**3,
help="max size (in bytes) of each safetensors file")
def main(args):
    engine_args = EngineArgs.from_cli_args(args)
    if engine_args.enable_lora:
        raise ValueError("Saving with enable_lora=True is not supported!")
    model_path = engine_args.model
    if not Path(model_path).is_dir():
        raise ValueError("model path must be a local directory")
    # Create LLM instance from arguments
    llm = LLM(**dataclasses.asdict(engine_args))
    # Prepare output directory
    Path(args.output).mkdir(exist_ok=True)
    # Dump worker states to output directory
    model_executor = llm.llm_engine.model_executor
    model_executor.save_sharded_state(path=args.output,
                                      pattern=args.file_pattern,
                                      max_size=args.max_file_size)
    # Copy metadata files to output directory
    for file in os.listdir(model_path):
        if os.path.splitext(file)[1] not in (".bin", ".pt", ".safetensors"):
            if os.path.isdir(os.path.join(model_path, file)):
                shutil.copytree(os.path.join(model_path, file),
                                os.path.join(args.output, file))
            else:
                shutil.copy(os.path.join(model_path, file), args.output)


if __name__ == "__main__":
    args = parser.parse_args()
    main(args)
{{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}
{% for message in messages %}
{% if message['role'] == 'user' %}
### Instruction:
{{ message['content']|trim -}}
{% if not loop.last %}
{% endif %}
{% elif message['role'] == 'assistant' %}
### Response:
{{ message['content']|trim -}}
{% if not loop.last %}
{% endif %}
{% elif message['role'] == 'user_context' %}
### Input:
{{ message['content']|trim -}}
{% if not loop.last %}
{% endif %}
{% endif %}
{% endfor %}
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}
### Response:
{% endif %}
{%- for message in messages -%}
{%- if message['role'] == 'user' -%}
{{- 'Question: ' + message['content'] + ' ' -}}
{%- elif message['role'] == 'assistant' -%}
{{- 'Answer: ' + message['content'] + ' ' -}}
{%- endif -%}
{%- endfor -%}
{%- if add_generation_prompt -%}
{{- 'Answer:' -}}
{% endif %}
{%- set counter = namespace(index=0) -%}
{%- for message in messages -%}
{%- if message['role'] == 'user' -%}
{{- '[Round ' + counter.index|string + ']\n问:' + message['content'] -}}
{%- set counter.index = counter.index + 1 -%}
{%- endif -%}
{%- if message['role'] == 'assistant' -%}
{{- '\n答:' + message['content'] -}}
{%- if (loop.last and add_generation_prompt) or not loop.last -%}
{{- '\n' -}}
{%- endif -%}
{%- endif -%}
{%- endfor -%}
{%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
{{- '\n答:' -}}
{%- endif -%}
{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
{%- for message in messages -%}
{%- if message['role'] == 'system' -%}
{{- 'System: ' + message['content'] -}}
{%- elif message['role'] == 'user' -%}
{{- 'User: ' + message['content'] -}}
{%- elif message['role'] == 'assistant' -%}
{{- 'Falcon: ' + message['content'] -}}
{%- endif -%}
{%- if (loop.last and add_generation_prompt) or not loop.last -%}
{{- '\n' -}}
{%- endif -%}
{%- endfor -%}
{%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
{{- 'Falcon:' -}}
{% endif %}
<#meta#>
- Date: {{ (messages|selectattr('role', 'equalto', 'meta-current_date')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'meta-current_date')|list) else '' }}
- Task: {{ (messages|selectattr('role', 'equalto', 'meta-task_name')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'meta-task_name')|list) else '' }}
<#system#>
{{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}
<#chat#>
{% for message in messages %}
{% if message['role'] == 'user' %}
<#user#>
{{ message['content']|trim -}}
{% if not loop.last %}
{% endif %}
{% elif message['role'] == 'assistant' %}
<#bot#>
{{ message['content']|trim -}}
{% if not loop.last %}
{% endif %}
{% elif message['role'] == 'user_context' %}
<#user_context#>
{{ message['content']|trim -}}
{% if not loop.last %}
{% endif %}
{% endif %}
{% endfor %}
{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}
<#bot#>
{% endif %}
{% if messages[0]['role'] == 'system' %}
{% set system_message = '<<SYS>>\n' + messages[0]['content'] | trim + '\n<</SYS>>\n\n' %}
{% set messages = messages[1:] %}
{% else %}
{% set system_message = '' %}
{% endif %}
{% for message in messages %}
{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
{% endif %}
{% if loop.index0 == 0 %}
{% set content = system_message + message['content'] %}
{% else %}
{% set content = message['content'] %}
{% endif %}
{% if message['role'] == 'user' %}
{{ bos_token + '[INST] ' + content | trim + ' [/INST]' }}
{% elif message['role'] == 'assistant' %}
{{ ' ' + content | trim + ' ' + eos_token }}
{% endif %}
{% endfor %}
{%- if messages[0]['role'] == 'system' -%}
{%- set system_message = messages[0]['content'] -%}
{%- set messages = messages[1:] -%}
{%- else -%}
{% set system_message = '' -%}
{%- endif -%}
{{ bos_token + system_message }}
{%- for message in messages -%}
{%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
{%- endif -%}
{%- if message['role'] == 'user' -%}
{{ 'USER: ' + message['content'] + '\n' }}
{%- elif message['role'] == 'assistant' -%}
{{ 'ASSISTANT: ' + message['content'] + eos_token + '\n' }}
{%- endif -%}
{%- endfor -%}
{%- if add_generation_prompt -%}
{{ 'ASSISTANT:' }}
{% endif %}
{%- for message in messages -%}
{%- if message['role'] == 'user' -%}
{{- '<reserved_106>' + message['content'] -}}
{%- elif message['role'] == 'assistant' -%}
{{- '<reserved_107>' + message['content'] -}}
{%- endif -%}
{%- endfor -%}
{%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%}
{{- '<reserved_107>' -}}
{% endif %}
import argparse
import dataclasses
import json
import os
import uuid
from functools import partial
from tensorizer import stream_io
from vllm import LLM
from vllm.distributed import (init_distributed_environment,
initialize_model_parallel)
from vllm.engine.arg_utils import EngineArgs
from vllm.engine.llm_engine import LLMEngine
from vllm.model_executor.model_loader.tensorizer import (TensorizerArgs,
TensorizerConfig,
serialize_vllm_model)
# yapf conflicts with isort for this docstring
# yapf: disable
"""
tensorize_vllm_model.py is a script that can be used to serialize and
deserialize vLLM models. These models can be loaded using tensorizer
to the GPU extremely quickly over an HTTP/HTTPS endpoint, an S3 endpoint,
or locally. Tensor encryption and decryption is also supported, although
libsodium must be installed to use it. Install vllm with tensorizer support
using `pip install vllm[tensorizer]`. To learn more about tensorizer, visit
https://github.com/coreweave/tensorizer
To serialize a model, install vLLM from source, then run something
like this from the root level of this repository:
python -m examples.tensorize_vllm_model \
--model facebook/opt-125m \
serialize \
--serialized-directory s3://my-bucket \
--suffix v1
Which downloads the model from HuggingFace, loads it into vLLM, serializes it,
and saves it to your S3 bucket. A local directory can also be used. This
assumes your S3 credentials are specified as environment variables
in the form of `S3_ACCESS_KEY_ID`, `S3_SECRET_ACCESS_KEY`, and
`S3_ENDPOINT_URL`. To provide S3 credentials directly, you can provide
`--s3-access-key-id` and `--s3-secret-access-key`, as well as `--s3-endpoint`
as CLI args to this script.
You can also encrypt the model weights with a randomly-generated key by
providing a `--keyfile` argument.
To deserialize a model, you can run something like this from the root
level of this repository:
python -m examples.tensorize_vllm_model \
--model EleutherAI/gpt-j-6B \
--dtype float16 \
deserialize \
--path-to-tensors s3://my-bucket/vllm/EleutherAI/gpt-j-6B/v1/model.tensors
Which downloads the model tensors from your S3 bucket and deserializes them.
You can also provide a `--keyfile` argument to decrypt the model weights if
they were serialized with encryption.
For more information on the available arguments for serializing, run
`python -m examples.tensorize_vllm_model serialize --help`.
Or for deserializing:
`python -m examples.tensorize_vllm_model deserialize --help`.
Once a model is serialized, tensorizer can be invoked with the `LLM` class
directly to load models:
llm = LLM(model="facebook/opt-125m",
load_format="tensorizer",
model_loader_extra_config=TensorizerConfig(
tensorizer_uri = path_to_tensors,
num_readers=3,
)
)
A serialized model can be used during model loading for the vLLM OpenAI
inference server. `model_loader_extra_config` is exposed as the CLI arg
`--model-loader-extra-config`, and accepts a JSON string literal of the
TensorizerConfig arguments desired.
In order to see all of the available arguments usable to configure
loading with tensorizer that are given to `TensorizerConfig`, run:
`python -m examples.tensorize_vllm_model deserialize --help`
under the `tensorizer options` section. These can also be used for
deserialization in this example script, although `--tensorizer-uri` and
`--path-to-tensors` are functionally the same in this case.
"""
def parse_args():
    parser = argparse.ArgumentParser(
        description="An example script that can be used to serialize and "
        "deserialize vLLM models. These models "
        "can be loaded using tensorizer directly to the GPU "
        "extremely quickly. Tensor encryption and decryption is "
        "also supported, although libsodium must be installed to "
        "use it.")
    parser = EngineArgs.add_cli_args(parser)
    subparsers = parser.add_subparsers(dest='command')

    serialize_parser = subparsers.add_parser(
        'serialize', help="Serialize a model to `--serialized-directory`")

    serialize_parser.add_argument(
        "--suffix",
        type=str,
        required=False,
        help=(
            "The suffix to append to the serialized model directory, which is "
            "used to construct the location of the serialized model tensors, "
            "e.g. if `--serialized-directory` is `s3://my-bucket/` and "
            "`--suffix` is `v1`, the serialized model tensors will be "
            "saved to "
            "`s3://my-bucket/vllm/EleutherAI/gpt-j-6B/v1/model.tensors`. "
            "If none is provided, a random UUID will be used."))

    serialize_parser.add_argument(
        "--serialized-directory",
        type=str,
        required=True,
        help="The directory to serialize the model to. "
        "This can be a local directory or S3 URI. The path to where the "
        "tensors are saved is a combination of the supplied `dir` and model "
        "reference ID. For instance, if `dir` is the serialized directory, "
        "and the model HuggingFace ID is `EleutherAI/gpt-j-6B`, tensors will "
        "be saved to `dir/vllm/EleutherAI/gpt-j-6B/suffix/model.tensors`, "
        "where `suffix` is given by `--suffix` or a random UUID if not "
        "provided.")

    serialize_parser.add_argument(
        "--keyfile",
        type=str,
        required=False,
        help=("Encrypt the model weights with a randomly-generated binary key,"
              " and save the key at this path"))

    deserialize_parser = subparsers.add_parser(
        'deserialize',
        help=("Deserialize a model from `--path-to-tensors`"
              " to verify it can be loaded and used."))

    deserialize_parser.add_argument(
        "--path-to-tensors",
        type=str,
        required=True,
        help="The local path or S3 URI to the model tensors to deserialize. ")

    deserialize_parser.add_argument(
        "--keyfile",
        type=str,
        required=False,
        help=("Path to a binary key to use to decrypt the model weights,"
              " if the model was serialized with encryption"))

    TensorizerArgs.add_cli_args(deserialize_parser)

    return parser.parse_args()


def deserialize():
    llm = LLM(model=args.model,
              load_format="tensorizer",
              model_loader_extra_config=tensorizer_config)
    return llm
args = parse_args()
s3_access_key_id = (getattr(args, 's3_access_key_id', None)
or os.environ.get("S3_ACCESS_KEY_ID", None))
s3_secret_access_key = (getattr(args, 's3_secret_access_key', None)
or os.environ.get("S3_SECRET_ACCESS_KEY", None))
s3_endpoint = (getattr(args, 's3_endpoint', None)
or os.environ.get("S3_ENDPOINT_URL", None))
credentials = {
"s3_access_key_id": s3_access_key_id,
"s3_secret_access_key": s3_secret_access_key,
"s3_endpoint": s3_endpoint
}
_read_stream, _write_stream = (partial(
stream_io.open_stream,
mode=mode,
s3_access_key_id=s3_access_key_id,
s3_secret_access_key=s3_secret_access_key,
s3_endpoint=s3_endpoint,
) for mode in ("rb", "wb+"))
model_ref = args.model
model_name = model_ref.split("/")[1]
os.environ["MASTER_ADDR"] = "127.0.0.1"
os.environ["MASTER_PORT"] = "8080"
init_distributed_environment(world_size=1, rank=0, local_rank=0)
initialize_model_parallel()
keyfile = args.keyfile if args.keyfile else None
if args.model_loader_extra_config:
    config = json.loads(args.model_loader_extra_config)
    tensorizer_args = TensorizerConfig(**config)._construct_tensorizer_args()
    tensorizer_args.tensorizer_uri = args.path_to_tensors
else:
    tensorizer_args = None

if args.command == "serialize":
    eng_args_dict = {f.name: getattr(args, f.name) for f in
                     dataclasses.fields(EngineArgs)}

    engine_args = EngineArgs.from_cli_args(argparse.Namespace(**eng_args_dict))
    engine = LLMEngine.from_engine_args(engine_args)

    input_dir = args.serialized_directory.rstrip('/')
    suffix = args.suffix if args.suffix else uuid.uuid4().hex
    base_path = f"{input_dir}/vllm/{model_ref}/{suffix}"
    model_path = f"{base_path}/model.tensors"

    tensorizer_config = TensorizerConfig(
        tensorizer_uri=model_path,
        **credentials)

    serialize_vllm_model(engine, tensorizer_config, keyfile)
elif args.command == "deserialize":
    if not tensorizer_args:
        tensorizer_config = TensorizerConfig(
            tensorizer_uri=args.path_to_tensors,
            encryption_keyfile=keyfile,
            **credentials
        )
    deserialize()
else:
    raise ValueError("Either serialize or deserialize must be specified.")
{%- macro json_to_python_type(json_spec) %}
{%- set basic_type_map = {
"string": "str",
"number": "float",
"integer": "int",
"boolean": "bool"
} %}
{%- if basic_type_map[json_spec.type] is defined %}
{{- basic_type_map[json_spec.type] }}
{%- elif json_spec.type == "array" %}
{{- "list[" + json_to_python_type(json_spec|items) + "]" }}
{%- elif json_spec.type == "object" %}
{%- if json_spec.additionalProperties is defined %}
{{- "dict[str, " + json_to_python_type(json_spec.additionalProperties) + ']' }}
{%- else %}
{{- "dict" }}
{%- endif %}
{%- elif json_spec.type is iterable %}
{{- "Union[" }}
{%- for t in json_spec.type %}
{{- json_to_python_type({"type": t}) }}
{%- if not loop.last %}
{{- "," }}
{%- endif %}
{%- endfor %}
{{- "]" }}
{%- else %}
{{- "Any" }}
{%- endif %}
{%- endmacro %}
{{- bos_token }}
{{- "<|im_start|>system\nYou are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools: <tools> " }}
{%- if tools is iterable and tools | length > 0 %}
{%- for tool in tools %}
{%- if tool.function is defined %}
{%- set tool = tool.function %}
{%- endif %}
{{- '{"type": "function", "function": ' }}
{{- '{"name": "' + tool.name + '", ' }}
{{- '"description": "' + tool.name + '(' }}
{%- for param_name, param_fields in tool.parameters.properties|items %}
{{- param_name + ": " + json_to_python_type(param_fields) }}
{%- if not loop.last %}
{{- ", " }}
{%- endif %}
{%- endfor %}
{{- ")" }}
{%- if tool.return is defined %}
{{- " -> " + json_to_python_type(tool.return) }}
{%- endif %}
{{- " - " + tool.description + "\n\n" }}
{%- for param_name, param_fields in tool.parameters.properties|items %}
{%- if loop.first %}
{{- " Args:\n" }}
{%- endif %}
{{- " " + param_name + "(" + json_to_python_type(param_fields) + "): " + param_fields.description|trim }}
{%- endfor %}
{%- if tool.return is defined and tool.return.description is defined %}
{{- "\n Returns:\n " + tool.return.description }}
{%- endif %}
{{- '"' }}
{{- ', "parameters": ' }}
{%- if tool.parameters.properties | length == 0 %}
{{- "{}" }}
{%- else %}
{{- tool.parameters|tojson }}
{%- endif %}
{{- "}" }}
{%- if not loop.last %}
{{- "\n" }}
{%- endif %}
{%- endfor %}
{%- endif %}
{{- " </tools>" }}
{{- 'Use the following pydantic model json schema for each tool call you will make: {"properties": {"name": {"title": "Name", "type": "string"}, "arguments": {"title": "Arguments", "type": "object"}}, "required": ["name", "arguments"], "title": "FunctionCall", "type": "object"}}
' }}
{{- "For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
" }}
{{- "<tool_call>
" }}
{{- '{"name": <function-name>, "arguments": <args-dict>}
' }}
{{- '</tool_call><|im_end|>' }}
{%- for message in messages %}
{%- if message.role == "user" or message.role == "system" or (message.role == "assistant" and message.tool_calls is not defined) %}
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
{%- elif message.role == "assistant" and message.tool_calls is defined %}
{{- '<|im_start|>' + message.role }}
{%- for tool_call in message.tool_calls %}
{{- '\n<tool_call>\n' }}
{%- if tool_call.function is defined %}
{%- set tool_call = tool_call.function %}
{%- endif %}
{{- '{' }}
{{- '"name": "' }}
{{- tool_call.name }}
{{- '"' }}
{%- if tool_call.arguments is defined %}
{{- ', ' }}
{{- '"arguments": ' }}
{{- tool_call.arguments|tojson }}
{%- endif %}
{{- '}' }}
{{- '\n</tool_call>' }}
{%- endfor %}
{{- '<|im_end|>\n' }}
{%- elif message.role == "tool" %}
{%- if loop.previtem and loop.previtem.role != "tool" %}
{{- '<|im_start|>tool\n' }}
{%- endif %}
{{- '<tool_response>\n' }}
{{- message.content }}
{%- if not loop.last %}
{{- '\n</tool_response>\n' }}
{%- else %}
{{- '\n</tool_response>' }}
{%- endif %}
{%- if not loop.last and loop.nextitem.role != "tool" %}
{{- '<|im_end|>' }}
{%- elif loop.last %}
{{- '<|im_end|>' }}
{%- endif %}
{%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
{{- '<|im_start|>assistant\n' }}
{%- endif %}
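The Jinja chat templates above are passed to the server with `--chat-template` (as in the Llava example at the top of this commit). To preview what a template produces before serving, you can render it locally with `transformers`; a minimal sketch, where the template file name and the tokenizer are illustrative assumptions:
```python
# Sketch: render one of the chat templates above with transformers to see
# the exact prompt string the server would build. File name and tokenizer
# are placeholders, not part of this commit.
from transformers import AutoTokenizer

messages = [{"role": "user", "content": "What is the capital of France?"}]

with open("template_chatml.jinja") as f:
    chat_template = f.read()

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
prompt = tokenizer.apply_chat_template(
    messages,
    chat_template=chat_template,
    tokenize=False,
    add_generation_prompt=True,
)
print(prompt)
```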