merge v0.4.3

b9e12416 · zhuwenwen · e5d707db · e9d3aa04 · b9e12416 · b9e12416
Commit b9e12416 authored May 31, 2024 by zhuwenwen
20 changed files
--- a/examples/llava_example.py
+++ b/examples/llava_example.py
@@ -23,11 +23,15 @@ def run_llava_pixel_values():
        "\nUSER: What is the content of this image?\nASSISTANT:")

    # This should be provided by another online or offline component.
-    images = torch.load("images/stop_sign_pixel_values.pt")
+    image = torch.load("images/stop_sign_pixel_values.pt")
+
+    outputs = llm.generate({
+        "prompt":
+        prompt,
+        "multi_modal_data":
+        MultiModalData(type=MultiModalData.Type.IMAGE, data=image),
+    })

-    outputs = llm.generate(prompt,
-                           multi_modal_data=MultiModalData(
-                               type=MultiModalData.Type.IMAGE, data=images))
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)
@@ -46,11 +50,14 @@ def run_llava_image_features():
        "\nUSER: What is the content of this image?\nASSISTANT:")

    # This should be provided by another online or offline component.
-    images = torch.load("images/stop_sign_image_features.pt")
-
-    outputs = llm.generate(prompt,
-                           multi_modal_data=MultiModalData(
-                               type=MultiModalData.Type.IMAGE, data=images))
+    image = torch.load("images/stop_sign_image_features.pt")
+
+    outputs = llm.generate({
+        "prompt":
+        prompt,
+        "multi_modal_data":
+        MultiModalData(type=MultiModalData.Type.IMAGE, data=image),
+    })
    for o in outputs:
        generated_text = o.outputs[0].text
        print(generated_text)

--- a/examples/offline_inference_arctic.py
+++ b/examples/offline_inference_arctic.py
+from vllm import LLM, SamplingParams
+
+# Sample prompts.
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+# Create a sampling params object.
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+# Create an LLM.
+llm = LLM(model="snowflake/snowflake-arctic-instruct",
+          quantization="deepspeedfp",
+          tensor_parallel_size=8,
+          trust_remote_code=True)
+# Generate texts from the prompts. The output is a list of RequestOutput objects
+# that contain the prompt, generated text, and other information.
+
+outputs = llm.generate(prompts, sampling_params)
+# Print the outputs.
+for output in outputs:
+    prompt = output.prompt
+    generated_text = output.outputs[0].text
+    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
--- a/examples/offline_inference_distributed.py
+++ b/examples/offline_inference_distributed.py
@@ -9,19 +9,31 @@ from typing import Dict

 import numpy as np
 import ray
+from packaging.version import Version
+from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy

 from vllm import LLM, SamplingParams

+assert Version(ray.__version__) >= Version(
+    "2.22.0"), "Ray version must be at least 2.22.0"
+
 # Create a sampling params object.
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

+# Set tensor parallelism per instance.
+tensor_parallel_size = 1
+
+# Set number of instances. Each instance will use tensor_parallel_size GPUs.
+num_instances = 1
+

 # Create a class to do batch inference.
 class LLMPredictor:

    def __init__(self):
        # Create an LLM.
-        self.llm = LLM(model="meta-llama/Llama-2-7b-chat-hf")
+        self.llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
+                       tensor_parallel_size=tensor_parallel_size)

    def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, list]:
        # Generate texts from the prompts.
@@ -43,17 +55,41 @@ class LLMPredictor:
 # from cloud storage (such as JSONL, Parquet, CSV, binary format).
 ds = ray.data.read_text("s3://anonymous@air-example-data/prompts.txt")

+
+# For tensor_parallel_size > 1, we need to create placement groups for vLLM
+# to use. Every actor has to have its own placement group.
+def scheduling_strategy_fn():
+    # One bundle per tensor parallel worker
+    pg = ray.util.placement_group(
+        [{
+            "GPU": 1,
+            "CPU": 1
+        }] * tensor_parallel_size,
+        strategy="STRICT_PACK",
+    )
+    return dict(scheduling_strategy=PlacementGroupSchedulingStrategy(
+        pg, placement_group_capture_child_tasks=True))
+
+
+resources_kwarg = {}
+if tensor_parallel_size == 1:
+    # For tensor_parallel_size == 1, we simply set num_gpus=1.
+    resources_kwarg["num_gpus"] = 1
+else:
+    # Otherwise, we have to set num_gpus=0 and provide
+    # a function that will create a placement group for
+    # each instance.
+    resources_kwarg["num_gpus"] = 0
+    resources_kwarg["ray_remote_args_fn"] = scheduling_strategy_fn
+
 # Apply batch inference for all input data.
 ds = ds.map_batches(
    LLMPredictor,
    # Set the concurrency to the number of LLM instances.
-    concurrency=10,
-    # Specify the number of GPUs required per LLM instance.
-    # NOTE: Do NOT set `num_gpus` when using vLLM with tensor-parallelism
-    # (i.e., `tensor_parallel_size`).
-    num_gpus=1,
+    concurrency=num_instances,
    # Specify the batch size for inference.
    batch_size=32,
+    **resources_kwarg,
 )

 # Peek first 10 results.

--- a/examples/offline_inference_embedding.py
+++ b/examples/offline_inference_embedding.py
+from vllm import LLM
+
+# Sample prompts.
+prompts = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+
+# Create an LLM.
+model = LLM(model="intfloat/e5-mistral-7b-instruct", enforce_eager=True)
+# Generate embedding. The output is a list of EmbeddingRequestOutputs.
+outputs = model.encode(prompts)
+# Print the outputs.
+for output in outputs:
+    print(output.outputs.embedding)  # list of 4096 floats
--- a/examples/offline_inference_openai.md
+++ b/examples/offline_inference_openai.md
+# Offline Inference with the OpenAI Batch file format
+
+ **NOTE:** This is a guide to performing batch inference using the OpenAI batch file format, **NOT** the complete Batch (REST) API.
+ 
+ ## File Format
+ 
+ The OpenAI batch file format consists of a series of json objects on new lines.
+ 
+ [See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/openai_example_batch.jsonl)
+ 
+ Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details.
+ 
+ **NOTE:** We currently only support the `/v1/chat/completions` endpoint (embeddings and completions coming soon).
+ 
+ ## Pre-requisites
+ 
+* Ensure you are using `vllm >= 0.4.3`. You can check by running `python -c "import vllm; print(vllm.__version__)"`.
+* The examples in this document use `meta-llama/Meta-Llama-3-8B-Instruct`.
+  - Create a [user access token](https://huggingface.co/docs/hub/en/security-tokens)
+  - Install the token on your machine (Run `huggingface-cli login`).
+  - Get access to the gated model by [visiting the model card](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) and agreeing to the terms and conditions.
+ 
+ 
+ ## Example: Running with a local file
+ 
+ ### Step 1: Create your batch file
+ 
+ To follow along with this example, you can download the example batch, or create your own batch file in your working directory.
+ 
+ ```
+ wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl
+ ```
+ 
+ Once you've created your batch file it should look like this
+ 
+ ```
+ $ cat openai_example_batch.jsonl
+{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
+{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
+ ```
+ 
+ ### Step 2: Run the batch
+ 
+The batch running tool is designed to be used from the command line.
+
+You can run the batch with the following command, which will write its results to a file called `results.jsonl`
+
+```
+python -m vllm.entrypoints.openai.run_batch -i openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
+```
+
+### Step 3: Check your results
+
+You should now have your results at `results.jsonl`. You can check your results by running `cat results.jsonl`
+
+```
+$ cat results.jsonl
+{"id":"vllm-383d1c59835645aeb2e07d004d62a826","custom_id":"request-1","response":{"id":"cmpl-61c020e54b964d5a98fa7527bfcdd378","object":"chat.completion","created":1715633336,"model":"meta-llama/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"message":{"role":"assistant","content":"Hello! It's great to meet you! I'm here to help with any questions or tasks you may have. What's on your mind today?"},"logprobs":null,"finish_reason":"stop","stop_reason":null}],"usage":{"prompt_tokens":25,"total_tokens":56,"completion_tokens":31}},"error":null}
+{"id":"vllm-42e3d09b14b04568afa3f1797751a267","custom_id":"request-2","response":{"id":"cmpl-f44d049f6b3a42d4b2d7850bb1e31bcc","object":"chat.completion","created":1715633336,"model":"meta-llama/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"message":{"role":"assistant","content":"*silence*"},"logprobs":null,"finish_reason":"stop","stop_reason":null}],"usage":{"prompt_tokens":27,"total_tokens":32,"completion_tokens":5}},"error":null}
+```
+
+## Example 2: Using remote files
+
+The batch runner supports remote input and output urls that are accessible via http/https.
+
+For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl`, you can run
+
+```
+python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
+```
+
+## Example 3: Integrating with AWS S3
+
+To integrate with cloud blob storage, we recommend using presigned urls.
+
+[Learn more about S3 presigned urls here](https://docs.aws.amazon.com/AmazonS3/latest/userguide/using-presigned-url.html)
+
+### Additional prerequisites
+
+* [Create an S3 bucket](https://docs.aws.amazon.com/AmazonS3/latest/userguide/creating-bucket.html). 
+* The `awscli` package (Run `pip install awscli`) to configure your credentials and interactively use s3.
+  - [Configure your credentials](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-quickstart.html).
+* The `boto3` python package (Run `pip install boto3`) to generate presigned urls.
+
+### Step 1: Upload your input file
+
+To follow along with this example, you can download the example batch, or create your own batch file in your working directory.
+ 
+ ```
+ wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl
+ ```
+ 
+ Once you've created your batch file it should look like this
+ 
+ ```
+ $ cat openai_example_batch.jsonl
+{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
+{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
+ ```
+
+Now upload your batch file to your S3 bucket.
+
+```
+aws s3 cp openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl
+```
+
+  
+### Step 2: Generate your presigned urls
+
+Presigned put urls can only be generated via the SDK. You can run the following python script to generate your presigned urls. Be sure to replace the `MY_BUCKET`, `MY_INPUT_FILE.jsonl`, and `MY_OUTPUT_FILE.jsonl` placeholders with your bucket and file names.
+
+(The script is adapted from https://github.com/awsdocs/aws-doc-sdk-examples/blob/main/python/example_code/s3/s3_basics/presigned_url.py)
+
+```
+import boto3
+from botocore.exceptions import ClientError
+
+def generate_presigned_url(s3_client, client_method, method_parameters, expires_in):
+    """
+    Generate a presigned Amazon S3 URL that can be used to perform an action.
+
+    :param s3_client: A Boto3 Amazon S3 client.
+    :param client_method: The name of the client method that the URL performs.
+    :param method_parameters: The parameters of the specified client method.
+    :param expires_in: The number of seconds the presigned URL is valid for.
+    :return: The presigned URL.
+    """
+    try:
+        url = s3_client.generate_presigned_url(
+            ClientMethod=client_method, Params=method_parameters, ExpiresIn=expires_in
+        )
+    except ClientError:
+        raise
+    return url
+
+
+s3_client = boto3.client("s3")
+input_url = generate_presigned_url(
+    s3_client, "get_object", {"Bucket": "MY_BUCKET", "Key": "MY_INPUT_FILE.jsonl"}, 3600
+)
+output_url = generate_presigned_url(
+    s3_client, "put_object", {"Bucket": "MY_BUCKET", "Key": "MY_OUTPUT_FILE.jsonl"}, 3600
+)
+print(f"{input_url=}")
+print(f"{output_url=}")
+```
+
+This script should output
+
+```
+input_url='https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_INPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091'
+output_url='https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_OUTPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091'
+```
+
+### Step 3: Run the batch runner using your presigned urls
+
+You can now run the batch runner, using the urls generated in the previous section.
+
+```
+python -m vllm.entrypoints.openai.run_batch \
+    -i "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_INPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \
+    -o "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_OUTPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \
+    --model meta-llama/Meta-Llama-3-8B-Instruct
+```
+
+### Step 4: View your results
+
+Your results are now on S3. You can view them in your terminal by running
+
+```
+aws s3 cp s3://MY_BUCKET/MY_OUTPUT_FILE.jsonl -
+```
--- a/examples/openai_embedding_client.py
+++ b/examples/openai_embedding_client.py
+from openai import OpenAI
+
+# Modify OpenAI's API key and API base to use vLLM's API server.
+openai_api_key = "EMPTY"
+openai_api_base = "http://localhost:8000/v1"
+
+client = OpenAI(
+    # defaults to os.environ.get("OPENAI_API_KEY")
+    api_key=openai_api_key,
+    base_url=openai_api_base,
+)
+
+models = client.models.list()
+model = models.data[0].id
+
+responses = client.embeddings.create(input=[
+    "Hello my name is",
+    "The best thing about vLLM is that it supports many different models"
+],
+                                     model=model)
+
+for data in responses.data:
+    print(data.embedding)  # list of float of len 4096
--- a/examples/openai_example_batch.jsonl
+++ b/examples/openai_example_batch.jsonl
+{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
+{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}}
--- a/examples/production_monitoring/README.md
+++ b/examples/production_monitoring/README.md
@@ -29,7 +29,8 @@ python3 ../../benchmarks/benchmark_serving.py \
    --model mistralai/Mistral-7B-v0.1 \
    --tokenizer mistralai/Mistral-7B-v0.1 \
    --endpoint /v1/completions \
-    --dataset ShareGPT_V3_unfiltered_cleaned_split.json \
+    --dataset-name sharegpt \
+    --dataset-path ShareGPT_V3_unfiltered_cleaned_split.json \
    --request-rate 3.0
 ```


--- a/examples/production_monitoring/grafana.json
+++ b/examples/production_monitoring/grafana.json
 {
+  "__inputs": [
+    {
+      "name": "DS_PROMETHEUS",
+      "label": "prometheus",
+      "description": "",
+      "type": "datasource",
+      "pluginId": "prometheus",
+      "pluginName": "Prometheus"
+    }
+  ],
+  "__elements": {},
+  "__requires": [
+    {
+      "type": "grafana",
+      "id": "grafana",
+      "name": "Grafana",
+      "version": "10.4.2"
+    },
+    {
+      "type": "panel",
+      "id": "heatmap",
+      "name": "Heatmap",
+      "version": ""
+    },
+    {
+      "type": "datasource",
+      "id": "prometheus",
+      "name": "Prometheus",
+      "version": "1.0.0"
+    },
+    {
+      "type": "panel",
+      "id": "timeseries",
+      "name": "Time series",
+      "version": ""
+    }
+  ],
  "annotations": {
    "list": [
      {
@@ -25,14 +62,14 @@
  "editable": true,
  "fiscalYearStartMonth": 0,
  "graphTooltip": 0,
-  "id": 29,
+  "id": null,
  "links": [],
  "liveNow": false,
  "panels": [
    {
      "datasource": {
        "type": "prometheus",
-        "uid": "prometheus"
+        "uid": "${DS_PROMETHEUS}"
      },
      "description": "End to end request latency measured in seconds.",
      "fieldConfig": {
@@ -41,6 +78,7 @@
            "mode": "palette-classic"
          },
          "custom": {
+            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
@@ -54,6 +92,7 @@
              "tooltip": false,
              "viz": false
            },
+            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
@@ -111,7 +150,7 @@
        {
          "datasource": {
            "type": "prometheus",
-            "uid": "prometheus"
+            "uid": "${DS_PROMETHEUS}"
          },
          "disableTextWrap": false,
          "editorMode": "builder",
@@ -127,7 +166,7 @@
        {
          "datasource": {
            "type": "prometheus",
-            "uid": "prometheus"
+            "uid": "${DS_PROMETHEUS}"
          },
          "disableTextWrap": false,
          "editorMode": "builder",
@@ -144,7 +183,7 @@
        {
          "datasource": {
            "type": "prometheus",
-            "uid": "prometheus"
+            "uid": "${DS_PROMETHEUS}"
          },
          "disableTextWrap": false,
          "editorMode": "builder",
@@ -161,7 +200,7 @@
        {
          "datasource": {
            "type": "prometheus",
-            "uid": "prometheus"
+            "uid": "${DS_PROMETHEUS}"
          },
          "disableTextWrap": false,
          "editorMode": "builder",
@@ -178,7 +217,7 @@
        {
          "datasource": {
            "type": "prometheus",
-            "uid": "prometheus"
+            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
          "expr": "rate(vllm:e2e_request_latency_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:e2e_request_latency_seconds_count{model_name=\"$model_name\"}[$__rate_interval])",
@@ -195,7 +234,7 @@
    {
      "datasource": {
        "type": "prometheus",
-        "uid": "prometheus"
+        "uid": "${DS_PROMETHEUS}"
      },
      "description": "Number of tokens processed per second",
      "fieldConfig": {
@@ -204,6 +243,7 @@
            "mode": "palette-classic"
          },
          "custom": {
+            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
@@ -217,6 +257,7 @@
              "tooltip": false,
              "viz": false
            },
+            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
@@ -273,7 +314,7 @@
        {
          "datasource": {
            "type": "prometheus",
-            "uid": "prometheus"
+            "uid": "${DS_PROMETHEUS}"
          },
          "disableTextWrap": false,
          "editorMode": "builder",
@@ -289,7 +330,7 @@
        {
          "datasource": {
            "type": "prometheus",
-            "uid": "prometheus"
+            "uid": "${DS_PROMETHEUS}"
          },
          "disableTextWrap": false,
          "editorMode": "builder",
@@ -310,7 +351,7 @@
    {
      "datasource": {
        "type": "prometheus",
-        "uid": "prometheus"
+        "uid": "${DS_PROMETHEUS}"
      },
      "description": "Inter token latency in seconds.",
      "fieldConfig": {
@@ -319,6 +360,7 @@
            "mode": "palette-classic"
          },
          "custom": {
+            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
@@ -332,6 +374,7 @@
              "tooltip": false,
              "viz": false
            },
+            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
@@ -389,7 +432,7 @@
        {
          "datasource": {
            "type": "prometheus",
-            "uid": "prometheus"
+            "uid": "${DS_PROMETHEUS}"
          },
          "disableTextWrap": false,
          "editorMode": "builder",
@@ -405,7 +448,7 @@
        {
          "datasource": {
            "type": "prometheus",
-            "uid": "prometheus"
+            "uid": "${DS_PROMETHEUS}"
          },
          "disableTextWrap": false,
          "editorMode": "builder",
@@ -422,7 +465,7 @@
        {
          "datasource": {
            "type": "prometheus",
-            "uid": "prometheus"
+            "uid": "${DS_PROMETHEUS}"
          },
          "disableTextWrap": false,
          "editorMode": "builder",
@@ -439,7 +482,7 @@
        {
          "datasource": {
            "type": "prometheus",
-            "uid": "prometheus"
+            "uid": "${DS_PROMETHEUS}"
          },
          "disableTextWrap": false,
          "editorMode": "builder",
@@ -456,7 +499,7 @@
        {
          "datasource": {
            "type": "prometheus",
-            "uid": "prometheus"
+            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
          "expr": "rate(vllm:time_per_output_token_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:time_per_output_token_seconds_count{model_name=\"$model_name\"}[$__rate_interval])",
@@ -473,7 +516,7 @@
    {
      "datasource": {
        "type": "prometheus",
-        "uid": "prometheus"
+        "uid": "${DS_PROMETHEUS}"
      },
      "description": "Number of requests in RUNNING, WAITING, and SWAPPED state",
      "fieldConfig": {
@@ -482,6 +525,7 @@
            "mode": "palette-classic"
          },
          "custom": {
+            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
@@ -495,6 +539,7 @@
              "tooltip": false,
              "viz": false
            },
+            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
@@ -552,7 +597,7 @@
        {
          "datasource": {
            "type": "prometheus",
-            "uid": "prometheus"
+            "uid": "${DS_PROMETHEUS}"
          },
          "disableTextWrap": false,
          "editorMode": "builder",
@@ -568,7 +613,7 @@
        {
          "datasource": {
            "type": "prometheus",
-            "uid": "prometheus"
+            "uid": "${DS_PROMETHEUS}"
          },
          "disableTextWrap": false,
          "editorMode": "builder",
@@ -585,7 +630,7 @@
        {
          "datasource": {
            "type": "prometheus",
-            "uid": "prometheus"
+            "uid": "${DS_PROMETHEUS}"
          },
          "disableTextWrap": false,
          "editorMode": "builder",
@@ -606,7 +651,7 @@
    {
      "datasource": {
        "type": "prometheus",
-        "uid": "prometheus"
+        "uid": "${DS_PROMETHEUS}"
      },
      "description": "P50, P90, P95, and P99 TTFT latency in seconds.",
      "fieldConfig": {
@@ -615,6 +660,7 @@
            "mode": "palette-classic"
          },
          "custom": {
+            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
@@ -628,6 +674,7 @@
              "tooltip": false,
              "viz": false
            },
+            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
@@ -685,7 +732,7 @@
        {
          "datasource": {
            "type": "prometheus",
-            "uid": "prometheus"
+            "uid": "${DS_PROMETHEUS}"
          },
          "disableTextWrap": false,
          "editorMode": "builder",
@@ -702,7 +749,7 @@
        {
          "datasource": {
            "type": "prometheus",
-            "uid": "prometheus"
+            "uid": "${DS_PROMETHEUS}"
          },
          "disableTextWrap": false,
          "editorMode": "builder",
@@ -718,7 +765,7 @@
        {
          "datasource": {
            "type": "prometheus",
-            "uid": "prometheus"
+            "uid": "${DS_PROMETHEUS}"
          },
          "disableTextWrap": false,
          "editorMode": "builder",
@@ -735,7 +782,7 @@
        {
          "datasource": {
            "type": "prometheus",
-            "uid": "prometheus"
+            "uid": "${DS_PROMETHEUS}"
          },
          "disableTextWrap": false,
          "editorMode": "builder",
@@ -752,7 +799,7 @@
        {
          "datasource": {
            "type": "prometheus",
-            "uid": "prometheus"
+            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
          "expr": "rate(vllm:time_to_first_token_seconds_sum{model_name=\"$model_name\"}[$__rate_interval])\n/\nrate(vllm:time_to_first_token_seconds_count{model_name=\"$model_name\"}[$__rate_interval])",
@@ -769,7 +816,7 @@
    {
      "datasource": {
        "type": "prometheus",
-        "uid": "prometheus"
+        "uid": "${DS_PROMETHEUS}"
      },
      "description": "Percentage of used cache blocks by vLLM.",
      "fieldConfig": {
@@ -778,6 +825,7 @@
            "mode": "palette-classic"
          },
          "custom": {
+            "axisBorderShow": false,
            "axisCenteredZero": false,
            "axisColorMode": "text",
            "axisLabel": "",
@@ -791,6 +839,7 @@
              "tooltip": false,
              "viz": false
            },
+            "insertNulls": false,
            "lineInterpolation": "linear",
            "lineWidth": 1,
            "pointSize": 5,
@@ -848,7 +897,7 @@
        {
          "datasource": {
            "type": "prometheus",
-            "uid": "prometheus"
+            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
          "expr": "vllm:gpu_cache_usage_perc{model_name=\"$model_name\"}",
@@ -860,7 +909,7 @@
        {
          "datasource": {
            "type": "prometheus",
-            "uid": "prometheus"
+            "uid": "${DS_PROMETHEUS}"
          },
          "editorMode": "code",
          "expr": "vllm:cpu_cache_usage_perc{model_name=\"$model_name\"}",
@@ -875,229 +924,232 @@
      "type": "timeseries"
    },
    {
-      "type": "heatmap",
-      "title": "Request Prompt Length",
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
      "description": "Heatmap of request prompt length",
+      "fieldConfig": {
+        "defaults": {
+          "custom": {
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
+            "scaleDistribution": {
+              "type": "linear"
+            }
+          }
+        },
+        "overrides": []
+      },
      "gridPos": {
-        "x": 0,
-        "y": 24,
+        "h": 8,
        "w": 12,
-        "h": 8
-      },
-      "datasource": {
-        "uid": "prometheus",
-        "type": "prometheus"
+        "x": 0,
+        "y": 24
      },
      "id": 12,
-      "targets": [
-        {
-          "datasource": {
-            "type": "prometheus",
-            "uid": "prometheus"
-          },
-          "refId": "A",
-          "expr": "sum by(le) (increase(vllm:request_prompt_tokens_bucket{model_name=\"$model_name\"}[$__rate_interval]))",
-          "range": true,
-          "instant": false,
-          "editorMode": "builder",
-          "legendFormat": "{{le}}",
-          "useBackend": false,
-          "disableTextWrap": false,
-          "fullMetaSearch": false,
-          "includeNullMetadata": true,
-          "format": "heatmap"
-        }
-      ],
      "options": {
        "calculate": false,
-        "yAxis": {
-          "axisPlacement": "left",
-          "reverse": false,
-          "unit": "none",
-          "axisLabel": "Prompt Length"
-        },
-        "rowsFrame": {
-          "layout": "auto",
-          "value": "Request count"
+        "cellGap": 1,
+        "cellValues": {
+          "unit": "none"
        },
        "color": {
-          "mode": "scheme",
+          "exponent": 0.5,
          "fill": "dark-orange",
+          "min": 0,
+          "mode": "scheme",
+          "reverse": false,
          "scale": "exponential",
-          "exponent": 0.5,
          "scheme": "Spectral",
-          "steps": 64,
-          "reverse": false,
-          "min": 0
+          "steps": 64
+        },
+        "exemplars": {
+          "color": "rgba(255,0,255,0.7)"
        },
-        "cellGap": 1,
        "filterValues": {
          "le": 1e-9
        },
-        "tooltip": {
-          "show": true,
-          "yHistogram": true
-        },
        "legend": {
          "show": true
        },
-        "exemplars": {
-          "color": "rgba(255,0,255,0.7)"
+        "rowsFrame": {
+          "layout": "auto",
+          "value": "Request count"
        },
-        "cellValues": {
+        "tooltip": {
+          "mode": "single",
+          "showColorScale": false,
+          "yHistogram": true
+        },
+        "yAxis": {
+          "axisLabel": "Prompt Length",
+          "axisPlacement": "left",
+          "reverse": false,
          "unit": "none"
        }
      },
+      "pluginVersion": "10.4.2",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "sum by(le) (increase(vllm:request_prompt_tokens_bucket{model_name=\"$model_name\"}[$__rate_interval]))",
+          "format": "heatmap",
+          "fullMetaSearch": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "legendFormat": "{{le}}",
+          "range": true,
+          "refId": "A",
+          "useBackend": false
+        }
+      ],
+      "title": "Request Prompt Length",
+      "type": "heatmap"
+    },
+    {
+      "datasource": {
+        "type": "prometheus",
+        "uid": "${DS_PROMETHEUS}"
+      },
+      "description": "Heatmap of request generation length",
      "fieldConfig": {
        "defaults": {
          "custom": {
-            "scaleDistribution": {
-              "type": "linear"
-            },
            "hideFrom": {
+              "legend": false,
              "tooltip": false,
-              "viz": false,
-              "legend": false
+              "viz": false
+            },
+            "scaleDistribution": {
+              "type": "linear"
            }
          }
        },
        "overrides": []
      },
-      "pluginVersion": "10.2.0"
-    },
-    {
-      "datasource": {
-        "uid": "prometheus",
-        "type": "prometheus"
-      },
-      "type": "heatmap",
-      "title": "Request Generation Length",
-      "description": "Heatmap of request generation length",
      "gridPos": {
-        "x": 12,
-        "y": 24,
+        "h": 8,
        "w": 12,
-        "h": 8
+        "x": 12,
+        "y": 24
      },
      "id": 13,
-      "targets": [
-        {
-          "datasource": {
-            "type": "prometheus",
-            "uid": "prometheus"
-          },
-          "refId": "A",
-          "expr": "sum by(le) (increase(vllm:request_generation_tokens_bucket{model_name=\"$model_name\"}[$__rate_interval]))",
-          "range": true,
-          "instant": false,
-          "editorMode": "builder",
-          "legendFormat": "{{le}}",
-          "useBackend": false,
-          "disableTextWrap": false,
-          "fullMetaSearch": false,
-          "includeNullMetadata": true,
-          "format": "heatmap"
-        }
-      ],
      "options": {
        "calculate": false,
-        "yAxis": {
-          "axisPlacement": "left",
-          "reverse": false,
-          "unit": "none",
-          "axisLabel": "Generation Length"
-        },
-        "rowsFrame": {
-          "layout": "auto",
-          "value": "Request count"
+        "cellGap": 1,
+        "cellValues": {
+          "unit": "none"
        },
        "color": {
-          "mode": "scheme",
+          "exponent": 0.5,
          "fill": "dark-orange",
+          "min": 0,
+          "mode": "scheme",
+          "reverse": false,
          "scale": "exponential",
-          "exponent": 0.5,
          "scheme": "Spectral",
-          "steps": 64,
-          "reverse": false,
-          "min": 0
+          "steps": 64
+        },
+        "exemplars": {
+          "color": "rgba(255,0,255,0.7)"
        },
-        "cellGap": 1,
        "filterValues": {
          "le": 1e-9
        },
-        "tooltip": {
-          "show": true,
-          "yHistogram": true
-        },
        "legend": {
          "show": true
        },
-        "exemplars": {
-          "color": "rgba(255,0,255,0.7)"
+        "rowsFrame": {
+          "layout": "auto",
+          "value": "Request count"
        },
-        "cellValues": {
+        "tooltip": {
+          "mode": "single",
+          "showColorScale": false,
+          "yHistogram": true
+        },
+        "yAxis": {
+          "axisLabel": "Generation Length",
+          "axisPlacement": "left",
+          "reverse": false,
          "unit": "none"
        }
      },
-      "fieldConfig": {
-        "defaults": {
-          "custom": {
-            "scaleDistribution": {
-              "type": "linear"
-            },
-            "hideFrom": {
-              "tooltip": false,
-              "viz": false,
-              "legend": false
-            }
-          }
-        },
-        "overrides": []
-      },
-      "pluginVersion": "10.2.0"
+      "pluginVersion": "10.4.2",
+      "targets": [
+        {
+          "datasource": {
+            "type": "prometheus",
+            "uid": "${DS_PROMETHEUS}"
+          },
+          "disableTextWrap": false,
+          "editorMode": "builder",
+          "expr": "sum by(le) (increase(vllm:request_generation_tokens_bucket{model_name=\"$model_name\"}[$__rate_interval]))",
+          "format": "heatmap",
+          "fullMetaSearch": false,
+          "includeNullMetadata": true,
+          "instant": false,
+          "legendFormat": "{{le}}",
+          "range": true,
+          "refId": "A",
+          "useBackend": false
+        }
+      ],
+      "title": "Request Generation Length",
+      "type": "heatmap"
    },
    {
      "datasource": {
        "type": "prometheus",
-        "uid": "prometheus"
+        "uid": "${DS_PROMETHEUS}"
      },
+      "description": "Number of finished requests by their finish reason: either an EOS token was generated or the max sequence length was reached.",
      "fieldConfig": {
        "defaults": {
+          "color": {
+            "mode": "palette-classic"
+          },
          "custom": {
-            "drawStyle": "line",
-            "lineInterpolation": "linear",
+            "axisBorderShow": false,
+            "axisCenteredZero": false,
+            "axisColorMode": "text",
+            "axisLabel": "",
+            "axisPlacement": "auto",
            "barAlignment": 0,
-            "lineWidth": 1,
+            "drawStyle": "line",
            "fillOpacity": 0,
            "gradientMode": "none",
-            "spanNulls": false,
+            "hideFrom": {
+              "legend": false,
+              "tooltip": false,
+              "viz": false
+            },
            "insertNulls": false,
-            "showPoints": "auto",
+            "lineInterpolation": "linear",
+            "lineWidth": 1,
            "pointSize": 5,
-            "stacking": {
-              "mode": "none",
-              "group": "A"
-            },
-            "axisPlacement": "auto",
-            "axisLabel": "",
-            "axisColorMode": "text",
-            "axisBorderShow": false,
            "scaleDistribution": {
              "type": "linear"
            },
-            "axisCenteredZero": false,
-            "hideFrom": {
-              "tooltip": false,
-              "viz": false,
-              "legend": false
+            "showPoints": "auto",
+            "spanNulls": false,
+            "stacking": {
+              "group": "A",
+              "mode": "none"
            },
            "thresholdsStyle": {
              "mode": "off"
            }
          },
-          "color": {
-            "mode": "palette-classic"
-          },
          "mappings": [],
          "thresholds": {
            "mode": "absolute",
@@ -1123,22 +1175,22 @@
      },
      "id": 11,
      "options": {
-        "tooltip": {
-          "mode": "single",
-          "sort": "none"
-        },
        "legend": {
-          "showLegend": true,
+          "calcs": [],
          "displayMode": "list",
          "placement": "bottom",
-          "calcs": []
+          "showLegend": true
+        },
+        "tooltip": {
+          "mode": "single",
+          "sort": "none"
        }
      },
      "targets": [
        {
          "datasource": {
            "type": "prometheus",
-            "uid": "prometheus"
+            "uid": "${DS_PROMETHEUS}"
          },
          "disableTextWrap": false,
          "editorMode": "builder",
@@ -1154,25 +1206,19 @@
        }
      ],
      "title": "Finish Reason",
-      "description": "Number of finished requests by their finish reason: either an EOS token was generated or the max sequence length was reached.",
      "type": "timeseries"
    }
  ],
  "refresh": "",
-  "schemaVersion": 37,
-  "style": "dark",
+  "schemaVersion": 39,
  "tags": [],
  "templating": {
    "list": [
      {
-        "current": {
-          "selected": false,
-          "text": "vllm",
-          "value": "vllm"
-        },
+        "current": {},
        "datasource": {
          "type": "prometheus",
-          "uid": "prometheus"
+          "uid": "${DS_PROMETHEUS}"
        },
        "definition": "label_values(model_name)",
        "hide": 0,
@@ -1201,6 +1247,6 @@
  "timezone": "",
  "title": "vLLM",
  "uid": "b281712d-8bff-41ef-9f3f-71ad43c05e9b",
-  "version": 2,
+  "version": 1,
  "weekStart": ""
 }
--- a/examples/save_sharded_state.py
+++ b/examples/save_sharded_state.py
+"""
+Saves each worker's model state dict directly to a checkpoint, which enables a
+fast load path for large tensor-parallel models where each worker only needs to
+read its own shard rather than the entire checkpoint.
+
+Example usage:
+
+python save_sharded_state.py \
+    --model /path/to/load \
+    --quantization deepspeedfp \
+    --tensor-parallel-size 8 \
+    --output /path/to/save
+
+Then, the model can be loaded with
+
+llm = LLM(
+    model="/path/to/save",
+    load_format="sharded_state",
+    quantization="deepspeedfp",
+    tensor_parallel_size=8,
+)
+"""
+import argparse
+import dataclasses
+import os
+import shutil
+from pathlib import Path
+
+from vllm import LLM, EngineArgs
+
+# Command-line interface: every EngineArgs option plus the output settings
+# specific to this script.
+parser = argparse.ArgumentParser()
+EngineArgs.add_cli_args(parser)
+parser.add_argument("--output",
+                    "-o",
+                    required=True,
+                    type=str,
+                    help="path to output checkpoint")
+parser.add_argument("--file-pattern",
+                    type=str,
+                    help="string pattern of saved filenames")
+# The size is a byte count: parse it as an int so a value given on the
+# command line has the same type as the integer default.
+parser.add_argument("--max-file-size",
+                    type=int,
+                    default=5 * 1024**3,
+                    help="max size (in bytes) of each safetensors file")
+
+
+def main(args):
+    """Save each worker's model state to ``args.output`` as a sharded checkpoint.
+
+    Args:
+        args: Parsed command-line namespace holding the ``EngineArgs`` options
+            plus ``output``, ``file_pattern`` and ``max_file_size``.
+
+    Raises:
+        ValueError: If LoRA is enabled, or if the model path is not a local
+            directory.
+    """
+    engine_args = EngineArgs.from_cli_args(args)
+    if engine_args.enable_lora:
+        raise ValueError("Saving with enable_lora=True is not supported!")
+    model_path = engine_args.model
+    if not Path(model_path).is_dir():
+        raise ValueError("model path must be a local directory")
+    # Create LLM instance from arguments
+    llm = LLM(**dataclasses.asdict(engine_args))
+    # Prepare output directory (parents=True so a nested output path works)
+    Path(args.output).mkdir(parents=True, exist_ok=True)
+    # Dump worker states to output directory
+    model_executor = llm.llm_engine.model_executor
+    model_executor.save_sharded_state(path=args.output,
+                                      pattern=args.file_pattern,
+                                      max_size=args.max_file_size)
+    # Copy metadata files to output directory, skipping the weight files
+    for file in os.listdir(model_path):
+        if os.path.splitext(file)[1] in (".bin", ".pt", ".safetensors"):
+            continue
+        source = os.path.join(model_path, file)
+        if os.path.isdir(source):
+            # dirs_exist_ok=True keeps a re-run into an existing output
+            # directory from failing, matching mkdir(exist_ok=True) above.
+            shutil.copytree(source,
+                            os.path.join(args.output, file),
+                            dirs_exist_ok=True)
+        else:
+            shutil.copy(source, args.output)
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    main(args)
--- a/examples/tensorize_vllm_model.py
+++ b/examples/tensorize_vllm_model.py
 import argparse
 import dataclasses
+import json
 import os
-import time
 import uuid
 from functools import partial
-from typing import Type

-import torch
-import torch.nn as nn
-from tensorizer import (DecryptionParams, EncryptionParams, TensorDeserializer,
-                        TensorSerializer, stream_io)
-from tensorizer.utils import convert_bytes, get_mem_usage, no_init_or_tensor
-from transformers import AutoConfig, PretrainedConfig
+from tensorizer import stream_io

-from vllm.distributed import initialize_model_parallel
+from vllm import LLM
+from vllm.distributed import (init_distributed_environment,
+                              initialize_model_parallel)
 from vllm.engine.arg_utils import EngineArgs
 from vllm.engine.llm_engine import LLMEngine
-from vllm.model_executor.model_loader.tensorizer import TensorizerArgs
-from vllm.model_executor.models import ModelRegistry
+from vllm.model_executor.model_loader.tensorizer import (TensorizerArgs,
+                                                         TensorizerConfig,
+                                                         serialize_vllm_model)

 # yapf conflicts with isort for this docstring
 # yapf: disable
@@ -27,25 +24,25 @@ deserialize vLLM models. These models can be loaded using tensorizer
 to the GPU extremely quickly over an HTTP/HTTPS endpoint, an S3 endpoint,
 or locally. Tensor encryption and decryption is also supported, although 
 libsodium must be installed to use it. Install vllm with tensorizer support 
-using `pip install vllm[tensorizer]`.
+using `pip install vllm[tensorizer]`. To learn more about tensorizer, visit
+https://github.com/coreweave/tensorizer

 To serialize a model, install vLLM from source, then run something 
 like this from the root level of this repository:

 python -m examples.tensorize_vllm_model \
-   --model EleutherAI/gpt-j-6B \
-   --dtype float16 \
+   --model facebook/opt-125m \
   serialize \
-   --serialized-directory s3://my-bucket/ \
-   --suffix vllm
+   --serialized-directory s3://my-bucket \
+   --suffix v1
   
 Which downloads the model from HuggingFace, loads it into vLLM, serializes it,
 and saves it to your S3 bucket. A local directory can also be used. This
 assumes your S3 credentials are specified as environment variables
-in the form of `S3_ACCESS_KEY_ID`, `S3_SECRET_ACCESS_KEY`, and `S3_ENDPOINT`.
-To provide S3 credentials directly, you can provide `--s3-access-key-id` and 
-`--s3-secret-access-key`, as well as `--s3-endpoint` as CLI args to this 
-script.
+in the form of `S3_ACCESS_KEY_ID`, `S3_SECRET_ACCESS_KEY`, and 
+`S3_ENDPOINT_URL`. To provide S3 credentials directly, you can provide 
+`--s3-access-key-id` and `--s3-secret-access-key`, as well as `--s3-endpoint` 
+as CLI args to this script.

 You can also encrypt the model weights with a randomly-generated key by 
 providing a `--keyfile` argument.
@@ -57,7 +54,7 @@ python -m examples.tensorize_vllm_model \
   --model EleutherAI/gpt-j-6B \
   --dtype float16 \
   deserialize \
-   --path-to-tensors s3://my-bucket/vllm/EleutherAI/gpt-j-6B/vllm/model.tensors
+   --path-to-tensors s3://my-bucket/vllm/EleutherAI/gpt-j-6B/v1/model.tensors

 Which downloads the model tensors from your S3 bucket and deserializes them.

@@ -71,26 +68,30 @@ Or for deserializing:

 `python -m examples.tensorize_vllm_model deserialize --help`.

-Once a model is serialized, it can be used to load the model when running the
-OpenAI inference client at `vllm/entrypoints/openai/api_server.py` by providing
-the `--tensorizer-uri` CLI argument that is functionally the same as the
-`--path-to-tensors` argument in this script, along with `--vllm-tensorized`, to
-signify that the model to be deserialized is a vLLM model, rather than a 
-HuggingFace `PreTrainedModel`, which can also be deserialized using tensorizer
-in the same inference server, albeit without the speed optimizations. To
-deserialize an encrypted file, the `--encryption-keyfile` argument can be used
-to provide the path to the keyfile used to encrypt the model weights. For
-information on all the arguments that can be used to configure tensorizer's
-deserialization, check out the tensorizer options argument group in the
-`vllm/entrypoints/openai/api_server.py` script with `--help`.
-
-Tensorizer can also be invoked with the `LLM` class directly to load models:
+Once a model is serialized, tensorizer can be invoked with the `LLM` class 
+directly to load models:

    llm = LLM(model="facebook/opt-125m",
              load_format="tensorizer",
-              tensorizer_uri=path_to_opt_tensors,
-              num_readers=3,
-              vllm_tensorized=True)
+              model_loader_extra_config=TensorizerConfig(
+                    tensorizer_uri = path_to_tensors,
+                    num_readers=3,
+                    )
+              )
+            
+A serialized model can be used during model loading for the vLLM OpenAI
+inference server. `model_loader_extra_config` is exposed as the CLI arg
+`--model-loader-extra-config`, and accepts a JSON string literal of the
+TensorizerConfig arguments desired.
+
+To see all of the available arguments that can be given to `TensorizerConfig`
+to configure loading with tensorizer, run:
+
+`python -m examples.tensorize_vllm_model deserialize --help`
+
+and look under the `tensorizer options` section. These can also be used for
+deserialization in this example script, although `--tensorizer-uri` and
+`--path-to-tensors` are functionally the same in this case.
 """


@@ -158,95 +159,35 @@ def parse_args():
        help=("Path to a binary key to use to decrypt the model weights,"
              " if the model was serialized with encryption"))

-    return parser.parse_args()
-
-
-def make_model_contiguous(model):
-    # Ensure tensors are saved in memory contiguously
-    for param in model.parameters():
-        param.data = param.data.contiguous()
-
-
-def _get_vllm_model_architecture(config: PretrainedConfig) -> Type[nn.Module]:
-    architectures = getattr(config, "architectures", [])
-    for arch in architectures:
-        model_cls = ModelRegistry.load_model_cls(arch)
-        if model_cls is not None:
-            return model_cls
-    raise ValueError(
-        f"Model architectures {architectures} are not supported for now. "
-        f"Supported architectures: {ModelRegistry.get_supported_archs()}")
-
-
-def serialize():
-
-    eng_args_dict = {f.name: getattr(args, f.name) for f in
-                     dataclasses.fields(EngineArgs)}
-    engine_args = EngineArgs.from_cli_args(argparse.Namespace(**eng_args_dict))
-    engine = LLMEngine.from_engine_args(engine_args)
+    TensorizerArgs.add_cli_args(deserialize_parser)

-    model = (engine.model_executor.driver_worker.
-             model_runner.model)
-
-    encryption_params = EncryptionParams.random() if keyfile else None
-    if keyfile:
-        with _write_stream(keyfile) as stream:
-            stream.write(encryption_params.key)
-
-    with _write_stream(model_path) as stream:
-        serializer = TensorSerializer(stream, encryption=encryption_params)
-        serializer.write_module(model)
-        serializer.close()
+    return parser.parse_args()

-    print("Serialization complete. Model tensors saved to", model_path)
-    if keyfile:
-        print("Key saved to", keyfile)


 def deserialize():
+    """Load the model through ``LLM`` with ``load_format="tensorizer"``.
+
+    Reads the module-level ``args.model`` and ``tensorizer_config`` (assigned
+    in the command dispatch at the bottom of this script) and returns the
+    constructed ``LLM`` instance.
+    """
-    config = AutoConfig.from_pretrained(model_ref)
-
-    with no_init_or_tensor():
-        model_class = _get_vllm_model_architecture(config)
-        model = model_class(config)
-
-    before_mem = get_mem_usage()
-    start = time.time()
-
-    if keyfile:
-        with _read_stream(keyfile) as stream:
-            key = stream.read()
-            decryption_params = DecryptionParams.from_key(key)
-            tensorizer_args.deserializer_params['encryption'] = \
-                decryption_params
-
-    with (_read_stream(model_path)) as stream, TensorDeserializer(
-            stream, **tensorizer_args.deserializer_params) as deserializer:
-        deserializer.load_into_module(model)
-        end = time.time()
-
-    # Brag about how fast we are.
-    total_bytes_str = convert_bytes(deserializer.total_tensor_bytes)
-    duration = end - start
-    per_second = convert_bytes(deserializer.total_tensor_bytes / duration)
-    after_mem = get_mem_usage()
-    print(
-        f"Deserialized {total_bytes_str} in {end - start:0.2f}s, {per_second}/s"
+    llm = LLM(model=args.model,
+              load_format="tensorizer",
+              model_loader_extra_config=tensorizer_config
     )
-    print(f"Memory usage before: {before_mem}")
-    print(f"Memory usage after: {after_mem}")
+    return llm

-    return model


 args = parse_args()

-s3_access_key_id = (args.s3_access_key_id or os.environ.get("S3_ACCESS_KEY_ID")
-                    or None)
-s3_secret_access_key = (args.s3_secret_access_key
-                        or os.environ.get("S3_SECRET_ACCESS_KEY") or None)
+s3_access_key_id = (getattr(args, 's3_access_key_id', None)
+                    or os.environ.get("S3_ACCESS_KEY_ID", None))
+s3_secret_access_key = (getattr(args, 's3_secret_access_key', None)
+                        or os.environ.get("S3_SECRET_ACCESS_KEY", None))
+s3_endpoint = (getattr(args, 's3_endpoint', None)
+               or os.environ.get("S3_ENDPOINT_URL", None))

-s3_endpoint = (args.s3_endpoint or os.environ.get("S3_ENDPOINT_URL") or None)
+credentials = {
+    "s3_access_key_id": s3_access_key_id,
+    "s3_secret_access_key": s3_secret_access_key,
+    "s3_endpoint": s3_endpoint
+}

 _read_stream, _write_stream = (partial(
    stream_io.open_stream,
@@ -263,20 +204,41 @@ model_name = model_ref.split("/")[1]
 os.environ["MASTER_ADDR"] = "127.0.0.1"
 os.environ["MASTER_PORT"] = "8080"

-torch.distributed.init_process_group(world_size=1, rank=0)
+init_distributed_environment(world_size=1, rank=0, local_rank=0)
 initialize_model_parallel()

 keyfile = args.keyfile if args.keyfile else None

+
+if args.model_loader_extra_config:
+    config = json.loads(args.model_loader_extra_config)
+    # `path_to_tensors` may be absent depending on the subcommand, so read it
+    # defensively (same pattern as the S3 credential lookups above) and let
+    # it override any `tensorizer_uri` given in the JSON config.
+    path_to_tensors = getattr(args, "path_to_tensors", None)
+    if path_to_tensors:
+        config["tensorizer_uri"] = path_to_tensors
+    # deserialize() reads the module-level `tensorizer_config`; define it here
+    # so the deserialize command does not hit a NameError when the extra
+    # config is supplied on the command line.
+    tensorizer_config = TensorizerConfig(**config)
+    tensorizer_args = tensorizer_config._construct_tensorizer_args()
+else:
+    tensorizer_args = None
+
 if args.command == "serialize":
+    eng_args_dict = {f.name: getattr(args, f.name) for f in
+                     dataclasses.fields(EngineArgs)}
+
+    engine_args = EngineArgs.from_cli_args(argparse.Namespace(**eng_args_dict))
+    engine = LLMEngine.from_engine_args(engine_args)
+
    input_dir = args.serialized_directory.rstrip('/')
    suffix = args.suffix if args.suffix else uuid.uuid4().hex
    base_path = f"{input_dir}/vllm/{model_ref}/{suffix}"
    model_path = f"{base_path}/model.tensors"
-    serialize()
+    tensorizer_config = TensorizerConfig(
+        tensorizer_uri=model_path,
+        **credentials)
+    serialize_vllm_model(engine, tensorizer_config, keyfile)
 elif args.command == "deserialize":
-    tensorizer_args = TensorizerArgs.from_cli_args(args)
-    model_path = args.path_to_tensors
+    if not tensorizer_args:
+        tensorizer_config = TensorizerConfig(
+            tensorizer_uri=args.path_to_tensors,
+            encryption_keyfile = keyfile,
+            **credentials
+        )
    deserialize()
 else:
    raise ValueError("Either serialize or deserialize must be specified.")
--- a/format.sh
+++ b/format.sh
@@ -26,6 +26,7 @@ RUFF_VERSION=$(ruff --version | awk '{print $2}')
 MYPY_VERSION=$(mypy --version | awk '{print $2}')
 CODESPELL_VERSION=$(codespell --version)
 ISORT_VERSION=$(isort --vn)
+CLANGFORMAT_VERSION=$(clang-format --version | awk '{print $3}')

 # # params: tool name, tool version, required version
 tool_version_check() {
@@ -40,6 +41,7 @@ tool_version_check "ruff" $RUFF_VERSION "$(grep "ruff==" requirements-dev.txt |
 tool_version_check "mypy" "$MYPY_VERSION" "$(grep mypy requirements-dev.txt | cut -d'=' -f3)"
 tool_version_check "isort" "$ISORT_VERSION" "$(grep isort requirements-dev.txt | cut -d'=' -f3)"
 tool_version_check "codespell" "$CODESPELL_VERSION" "$(grep codespell requirements-dev.txt | cut -d'=' -f3)"
+tool_version_check "clang-format" "$CLANGFORMAT_VERSION" "$(grep clang-format requirements-dev.txt | cut -d'=' -f3)"

 YAPF_FLAGS=(
    '--recursive'
@@ -111,8 +113,11 @@ mypy vllm/logging --config-file pyproject.toml
 mypy vllm/model_executor --config-file pyproject.toml


+# If git diff returns a file that is in the skip list, the file may be checked anyway:
+# https://github.com/codespell-project/codespell/issues/1915
+# Avoiding the "./" prefix and using "/**" globs for directories appears to solve the problem
 CODESPELL_EXCLUDES=(
-    '--skip' '*docs/source/_build/**'
+    '--skip' 'tests/prompts/**,./benchmarks/sonnet.txt,tests/lora/data/**,build/**'
 )

 # check spelling of specified files
@@ -133,10 +138,9 @@ spell_check_changed() {
    # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that
    # exist on both branches.
    MERGEBASE="$(git merge-base origin/main HEAD)"
-
    if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then
        git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \
-             codespell "${CODESPELL_EXCLUDES[@]}"
+            codespell "${CODESPELL_EXCLUDES[@]}"
    fi
 }

@@ -180,7 +184,6 @@ lint_changed() {
 }

 # Run Ruff
-echo 'vLLM ruff:'
 ### This flag lints individual files. --files *must* be the first command line
 ### arg to use this option.
 if [[ "$1" == '--files' ]]; then
@@ -193,6 +196,7 @@ else
   # Format only the files that changed in last commit.
   lint_changed
 fi
+echo 'vLLM ruff: Done'

 # check spelling of specified files
 isort_check() {
@@ -234,6 +238,59 @@ else
 fi
 echo 'vLLM isort: Done'

+# Clang-format section
+# Exclude some files for formatting because they are vendored
+# NOTE: Keep up to date with .github/workflows/clang-format.yml
+CLANG_FORMAT_EXCLUDES=(
+    'csrc/moe/topk_softmax_kernels.cu'
+    'csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu'
+    'csrc/punica/bgmv/bgmv_config.h'
+    'csrc/punica/bgmv/bgmv_impl.cuh'
+    'csrc/punica/bgmv/vec_dtypes.cuh'
+    'csrc/punica/punica_ops.cu'
+    'csrc/punica/type_convert.h'
+)
+
+# Format specified files with clang-format
+clang_format() {
+    clang-format -i "$@"
+}
+
+# Format files that differ from main branch with clang-format.
+clang_format_changed() {
+    # The `if` guard ensures that the list of filenames is not empty, which
+    # could cause clang-format to receive 0 positional arguments, making it hang
+    # waiting for STDIN.
+    #
+    # `diff-filter=ACM` and $MERGEBASE is to ensure we only format files that
+    # exist on both branches.
+    MERGEBASE="$(git merge-base origin/main HEAD)"
+
+    # Get the list of changed files, excluding the specified ones
+    changed_files=$(git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.h' '*.cpp' '*.cu' '*.cuh' | grep -vFf <(printf "%s\n" "${CLANG_FORMAT_EXCLUDES[@]}"))
+    if [ -n "$changed_files" ]; then
+        echo "$changed_files" | xargs -P 5 clang-format -i
+    fi
+}
+
+# Format all files with clang-format
+clang_format_all() {
+    # Collect the file list first and skip the run when it is empty: with 0
+    # positional arguments clang-format would wait on STDIN (the same guard
+    # clang_format_changed uses).
+    local all_files
+    all_files=$(find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \
+        | grep -vFf <(printf "%s\n" "${CLANG_FORMAT_EXCLUDES[@]}"))
+    if [ -n "$all_files" ]; then
+        echo "$all_files" | xargs clang-format -i
+    fi
+}
+
+# Run clang-format
+if [[ "$1" == '--files' ]]; then
+   clang_format "${@:2}"
+elif [[ "$1" == '--all' ]]; then
+   clang_format_all
+else
+   clang_format_changed
+fi
+echo 'vLLM clang-format: Done'
+
+
 if ! git diff --quiet &>/dev/null; then
    echo 'Reformatted files. Please review and stage the changes.'
    echo 'Changes not staged for commit:'

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -60,8 +60,15 @@ exclude = [

 [tool.codespell]
 ignore-words-list = "dout, te, indicies"
-skip = "./tests/prompts,./benchmarks/sonnet.txt"
+skip = "./tests/prompts,./benchmarks/sonnet.txt,./tests/lora/data,./build"

 [tool.isort]
 use_parentheses = true
 skip_gitignore = true
+
+[tool.pytest.ini_options]
+markers = [
+    "skip_global_cleanup",
+    "llm: run tests for vLLM API only",
+    "openai: run tests for OpenAI API only",
+]
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -8,13 +8,14 @@ py-cpuinfo
 transformers >= 4.40.0  # Required for StarCoder2 & Llava, Llama 3.
 tokenizers >= 0.19.1  # Required for Llama 3.
 fastapi
+aiohttp
 openai
 uvicorn[standard]
 pydantic >= 2.0  # Required for OpenAI server.
 prometheus_client >= 0.18.0
 prometheus-fastapi-instrumentator >= 7.0.0
-tiktoken == 0.6.0  # Required for DBRX tokenizer
-lm-format-enforcer == 0.9.8
+tiktoken >= 0.6.0  # Required for DBRX tokenizer
+lm-format-enforcer == 0.10.1
 outlines == 0.0.34 # Requires torch >= 2.1.0
 typing_extensions
 filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
@@ -4,6 +4,6 @@
 # Dependencies for NVIDIA GPUs
 ray >= 2.9
 nvidia-ml-py # for pynvml package
-vllm-nccl-cu12>=2.18,<2.19  # for downloading nccl library
 torch == 2.3.0
 xformers == 0.0.26.post1  # Requires PyTorch 2.3.0
+vllm-flash-attn == 2.5.8.post2  # Requires PyTorch 2.3.0
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -5,6 +5,7 @@ tomli==2.0.1
 ruff==0.1.5
 codespell==2.2.6
 isort==5.13.2
+clang-format==18.1.5

 # type checking
 mypy==1.9.0
@@ -14,17 +15,20 @@ types-setuptools

 # testing
 pytest
-tensorizer==2.9.0
+tensorizer>=2.9.0
 pytest-forked
 pytest-asyncio
 pytest-rerunfailures
 pytest-shard
-httpx
+
+# testing utils
+awscli
 einops # required for MPT
+httpx
+peft
 requests
 ray
-peft
-awscli
+sentence-transformers # required for embedding

 # Benchmarking
 aiohttp

--- a/requirements-rocm.txt
+++ b/requirements-rocm.txt
@@ -3,3 +3,5 @@

 # Dependencies for AMD GPUs
 ray == 2.9.1
+# ray >= 2.10.0
+pytest-asyncio
--- a/setup.py
+++ b/setup.py
@@ -414,14 +414,15 @@ def get_requirements() -> List[str]:

    if _is_cuda():
        requirements = _read_requirements("requirements-cuda.txt")
-        cuda_major = torch.version.cuda.split(".")[0]
+        cuda_major, cuda_minor = torch.version.cuda.split(".")
        modified_requirements = []
        for req in requirements:
-            if "vllm-nccl-cu12" in req:
-                modified_requirements.append(
-                    req.replace("vllm-nccl-cu12", f"vllm-nccl-cu{cuda_major}"))
-            else:
-                modified_requirements.append(req)
+            if ("vllm-flash-attn" in req
+                    and not (cuda_major == "12" and cuda_minor == "1")):
+                # vllm-flash-attn is built only for CUDA 12.1.
+                # Skip for other versions.
+                continue
+            modified_requirements.append(req)
        requirements = modified_requirements
    elif _is_hip():
        requirements = _read_requirements("requirements-rocm.txt")
@@ -440,12 +441,12 @@ ext_modules = []
 if _is_cuda():
    ext_modules.append(CMakeExtension(name="vllm._moe_C"))

-    if _install_punica():
-        ext_modules.append(CMakeExtension(name="vllm._punica_C"))
-
 if not _is_neuron():
    ext_modules.append(CMakeExtension(name="vllm._C"))

+    if _install_punica():
+        ext_modules.append(CMakeExtension(name="vllm._punica_C"))
+
 package_data = {
    "vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json"]
 }
@@ -481,7 +482,7 @@ setup(
    install_requires=get_requirements(),
    ext_modules=ext_modules,
    extras_require={
-        "tensorizer": ["tensorizer==2.9.0"],
+        "tensorizer": ["tensorizer>=2.9.0"],
    },
    cmdclass={"build_ext": cmake_build_ext} if not _is_neuron() else {},
    package_data=package_data,

--- a/tests/async_engine/__init__.py
+++ b/tests/async_engine/__init__.py
--- a/tests/async_engine/test_async_llm_engine.py
+++ b/tests/async_engine/test_async_llm_engine.py
@@ -25,7 +25,7 @@ class MockEngine:
        return [RequestOutput(
            request_id=self.request_id)] if self.request_id else []

-    async def encode_request_async(self, *args, **kwargs):
+    async def process_model_inputs_async(self, *args, **kwargs):
        pass

    def generate(self, request_id):