Commit 99324e25 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.9.2' into v0.9.2-ori

parents cc7f22a8 a5dd03c1
...@@ -18,7 +18,7 @@ And LiteLLM supports all models on VLLM. ...@@ -18,7 +18,7 @@ And LiteLLM supports all models on VLLM.
- Setup vLLM and litellm environment - Setup vLLM and litellm environment
```console ```bash
pip install vllm litellm pip install vllm litellm
``` ```
...@@ -28,33 +28,35 @@ pip install vllm litellm ...@@ -28,33 +28,35 @@ pip install vllm litellm
- Start the vLLM server with the supported chat completion model, e.g. - Start the vLLM server with the supported chat completion model, e.g.
```console ```bash
vllm serve qwen/Qwen1.5-0.5B-Chat vllm serve qwen/Qwen1.5-0.5B-Chat
``` ```
- Call it with litellm: - Call it with litellm:
```python ??? Code
import litellm
messages = [{ "content": "Hello, how are you?","role": "user"}] ```python
import litellm
# hosted_vllm is prefix key word and necessary messages = [{ "content": "Hello, how are you?","role": "user"}]
response = litellm.completion(
model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat", # pass the vllm model name # hosted_vllm is prefix key word and necessary
messages=messages, response = litellm.completion(
api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1", model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat", # pass the vllm model name
temperature=0.2, messages=messages,
max_tokens=80) api_base="http://{your-vllm-server-host}:{your-vllm-server-port}/v1",
temperature=0.2,
print(response) max_tokens=80)
```
print(response)
```
### Embeddings ### Embeddings
- Start the vLLM server with the supported embedding model, e.g. - Start the vLLM server with the supported embedding model, e.g.
```console ```bash
vllm serve BAAI/bge-base-en-v1.5 vllm serve BAAI/bge-base-en-v1.5
``` ```
......
...@@ -17,99 +17,101 @@ vLLM can be deployed with [LWS](https://github.com/kubernetes-sigs/lws) on Kuber ...@@ -17,99 +17,101 @@ vLLM can be deployed with [LWS](https://github.com/kubernetes-sigs/lws) on Kuber
Deploy the following yaml file `lws.yaml` Deploy the following yaml file `lws.yaml`
```yaml ??? Yaml
apiVersion: leaderworkerset.x-k8s.io/v1
kind: LeaderWorkerSet ```yaml
metadata: apiVersion: leaderworkerset.x-k8s.io/v1
name: vllm kind: LeaderWorkerSet
spec: metadata:
replicas: 2 name: vllm
leaderWorkerTemplate: spec:
size: 2 replicas: 2
restartPolicy: RecreateGroupOnPodRestart leaderWorkerTemplate:
leaderTemplate: size: 2
metadata: restartPolicy: RecreateGroupOnPodRestart
labels: leaderTemplate:
role: leader metadata:
spec: labels:
containers: role: leader
- name: vllm-leader spec:
image: docker.io/vllm/vllm-openai:latest containers:
env: - name: vllm-leader
- name: HUGGING_FACE_HUB_TOKEN image: docker.io/vllm/vllm-openai:latest
value: <your-hf-token> env:
command: - name: HUGGING_FACE_HUB_TOKEN
- sh value: <your-hf-token>
- -c command:
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE); - sh
python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2" - -c
resources: - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
limits: python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2"
nvidia.com/gpu: "8" resources:
memory: 1124Gi limits:
ephemeral-storage: 800Gi nvidia.com/gpu: "8"
requests: memory: 1124Gi
ephemeral-storage: 800Gi ephemeral-storage: 800Gi
cpu: 125 requests:
ports: ephemeral-storage: 800Gi
- containerPort: 8080 cpu: 125
readinessProbe: ports:
tcpSocket: - containerPort: 8080
port: 8080 readinessProbe:
initialDelaySeconds: 15 tcpSocket:
periodSeconds: 10 port: 8080
volumeMounts: initialDelaySeconds: 15
- mountPath: /dev/shm periodSeconds: 10
name: dshm volumeMounts:
volumes: - mountPath: /dev/shm
- name: dshm name: dshm
emptyDir: volumes:
medium: Memory - name: dshm
sizeLimit: 15Gi emptyDir:
workerTemplate: medium: Memory
spec: sizeLimit: 15Gi
containers: workerTemplate:
- name: vllm-worker spec:
image: docker.io/vllm/vllm-openai:latest containers:
command: - name: vllm-worker
- sh image: docker.io/vllm/vllm-openai:latest
- -c command:
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)" - sh
resources: - -c
limits: - "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)"
nvidia.com/gpu: "8" resources:
memory: 1124Gi limits:
ephemeral-storage: 800Gi nvidia.com/gpu: "8"
requests: memory: 1124Gi
ephemeral-storage: 800Gi ephemeral-storage: 800Gi
cpu: 125 requests:
env: ephemeral-storage: 800Gi
- name: HUGGING_FACE_HUB_TOKEN cpu: 125
value: <your-hf-token> env:
volumeMounts: - name: HUGGING_FACE_HUB_TOKEN
- mountPath: /dev/shm value: <your-hf-token>
name: dshm volumeMounts:
volumes: - mountPath: /dev/shm
- name: dshm name: dshm
emptyDir: volumes:
medium: Memory - name: dshm
sizeLimit: 15Gi emptyDir:
--- medium: Memory
apiVersion: v1 sizeLimit: 15Gi
kind: Service ---
metadata: apiVersion: v1
name: vllm-leader kind: Service
spec: metadata:
ports: name: vllm-leader
- name: http spec:
port: 8080 ports:
protocol: TCP - name: http
targetPort: 8080 port: 8080
selector: protocol: TCP
leaderworkerset.sigs.k8s.io/name: vllm targetPort: 8080
role: leader selector:
type: ClusterIP leaderworkerset.sigs.k8s.io/name: vllm
``` role: leader
type: ClusterIP
```
```bash ```bash
kubectl apply -f lws.yaml kubectl apply -f lws.yaml
...@@ -175,25 +177,27 @@ curl http://localhost:8080/v1/completions \ ...@@ -175,25 +177,27 @@ curl http://localhost:8080/v1/completions \
The output should be similar to the following The output should be similar to the following
```text ??? Output
{
"id": "cmpl-1bb34faba88b43f9862cfbfb2200949d", ```text
"object": "text_completion",
"created": 1715138766,
"model": "meta-llama/Meta-Llama-3.1-405B-Instruct",
"choices": [
{ {
"index": 0, "id": "cmpl-1bb34faba88b43f9862cfbfb2200949d",
"text": " top destination for foodies, with", "object": "text_completion",
"logprobs": null, "created": 1715138766,
"finish_reason": "length", "model": "meta-llama/Meta-Llama-3.1-405B-Instruct",
"stop_reason": null "choices": [
{
"index": 0,
"text": " top destination for foodies, with",
"logprobs": null,
"finish_reason": "length",
"stop_reason": null
}
],
"usage": {
"prompt_tokens": 5,
"total_tokens": 12,
"completion_tokens": 7
}
} }
], ```
"usage": {
"prompt_tokens": 5,
"total_tokens": 12,
"completion_tokens": 7
}
}
```
...@@ -7,13 +7,13 @@ title: Open WebUI ...@@ -7,13 +7,13 @@ title: Open WebUI
2. Start the vLLM server with the supported chat completion model, e.g. 2. Start the vLLM server with the supported chat completion model, e.g.
```console ```bash
vllm serve qwen/Qwen1.5-0.5B-Chat vllm serve qwen/Qwen1.5-0.5B-Chat
``` ```
1. Start the [Open WebUI](https://github.com/open-webui/open-webui) docker container (replace the vllm serve host and vllm serve port): 1. Start the [Open WebUI](https://github.com/open-webui/open-webui) docker container (replace the vllm serve host and vllm serve port):
```console ```bash
docker run -d -p 3000:8080 \ docker run -d -p 3000:8080 \
--name open-webui \ --name open-webui \
-v open-webui:/app/backend/data \ -v open-webui:/app/backend/data \
......
...@@ -15,7 +15,7 @@ Here are the integrations: ...@@ -15,7 +15,7 @@ Here are the integrations:
- Setup vLLM and langchain environment - Setup vLLM and langchain environment
```console ```bash
pip install -U vllm \ pip install -U vllm \
langchain_milvus langchain_openai \ langchain_milvus langchain_openai \
langchain_community beautifulsoup4 \ langchain_community beautifulsoup4 \
...@@ -26,14 +26,14 @@ pip install -U vllm \ ...@@ -26,14 +26,14 @@ pip install -U vllm \
- Start the vLLM server with the supported embedding model, e.g. - Start the vLLM server with the supported embedding model, e.g.
```console ```bash
# Start embedding service (port 8000) # Start embedding service (port 8000)
vllm serve ssmits/Qwen2-7B-Instruct-embed-base vllm serve ssmits/Qwen2-7B-Instruct-embed-base
``` ```
- Start the vLLM server with the supported chat completion model, e.g. - Start the vLLM server with the supported chat completion model, e.g.
```console ```bash
# Start chat service (port 8001) # Start chat service (port 8001)
vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001 vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
``` ```
...@@ -52,7 +52,7 @@ python retrieval_augmented_generation_with_langchain.py ...@@ -52,7 +52,7 @@ python retrieval_augmented_generation_with_langchain.py
- Setup vLLM and llamaindex environment - Setup vLLM and llamaindex environment
```console ```bash
pip install vllm \ pip install vllm \
llama-index llama-index-readers-web \ llama-index llama-index-readers-web \
llama-index-llms-openai-like \ llama-index-llms-openai-like \
...@@ -64,14 +64,14 @@ pip install vllm \ ...@@ -64,14 +64,14 @@ pip install vllm \
- Start the vLLM server with the supported embedding model, e.g. - Start the vLLM server with the supported embedding model, e.g.
```console ```bash
# Start embedding service (port 8000) # Start embedding service (port 8000)
vllm serve ssmits/Qwen2-7B-Instruct-embed-base vllm serve ssmits/Qwen2-7B-Instruct-embed-base
``` ```
- Start the vLLM server with the supported chat completion model, e.g. - Start the vLLM server with the supported chat completion model, e.g.
```console ```bash
# Start chat service (port 8001) # Start chat service (port 8001)
vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001 vllm serve qwen/Qwen1.5-0.5B-Chat --port 8001
``` ```
......
...@@ -15,7 +15,7 @@ vLLM can be **run and scaled to multiple service replicas on clouds and Kubernet ...@@ -15,7 +15,7 @@ vLLM can be **run and scaled to multiple service replicas on clouds and Kubernet
- Check that you have installed SkyPilot ([docs](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html)). - Check that you have installed SkyPilot ([docs](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html)).
- Check that `sky check` shows clouds or Kubernetes are enabled. - Check that `sky check` shows clouds or Kubernetes are enabled.
```console ```bash
pip install skypilot-nightly pip install skypilot-nightly
sky check sky check
``` ```
...@@ -24,52 +24,54 @@ sky check ...@@ -24,52 +24,54 @@ sky check
See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm/serve.yaml). See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm/serve.yaml).
```yaml ??? Yaml
resources:
accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. ```yaml
use_spot: True resources:
disk_size: 512 # Ensure model checkpoints can fit. accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
disk_tier: best use_spot: True
ports: 8081 # Expose to internet traffic. disk_size: 512 # Ensure model checkpoints can fit.
disk_tier: best
envs: ports: 8081 # Expose to internet traffic.
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass. envs:
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
setup: | HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
conda create -n vllm python=3.10 -y
conda activate vllm setup: |
conda create -n vllm python=3.10 -y
pip install vllm==0.4.0.post1 conda activate vllm
# Install Gradio for web UI.
pip install gradio openai pip install vllm==0.4.0.post1
pip install flash-attn==2.5.7 # Install Gradio for web UI.
pip install gradio openai
run: | pip install flash-attn==2.5.7
conda activate vllm
echo 'Starting vllm api server...' run: |
python -u -m vllm.entrypoints.openai.api_server \ conda activate vllm
--port 8081 \ echo 'Starting vllm api server...'
--model $MODEL_NAME \ python -u -m vllm.entrypoints.openai.api_server \
--trust-remote-code \ --port 8081 \
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ --model $MODEL_NAME \
2>&1 | tee api_server.log & --trust-remote-code \
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
echo 'Waiting for vllm api server to start...' 2>&1 | tee api_server.log &
while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done
echo 'Waiting for vllm api server to start...'
echo 'Starting gradio server...' while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done
git clone https://github.com/vllm-project/vllm.git || true
python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \ echo 'Starting gradio server...'
-m $MODEL_NAME \ git clone https://github.com/vllm-project/vllm.git || true
--port 8811 \ python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \
--model-url http://localhost:8081/v1 \ -m $MODEL_NAME \
--stop-token-ids 128009,128001 --port 8811 \
``` --model-url http://localhost:8081/v1 \
--stop-token-ids 128009,128001
```
Start serving the Llama-3 8B model on any of the candidate GPUs listed (L4, A10g, ...): Start serving the Llama-3 8B model on any of the candidate GPUs listed (L4, A10g, ...):
```console ```bash
HF_TOKEN="your-huggingface-token" sky launch serving.yaml --env HF_TOKEN HF_TOKEN="your-huggingface-token" sky launch serving.yaml --env HF_TOKEN
``` ```
...@@ -81,7 +83,7 @@ Check the output of the command. There will be a shareable gradio link (like the ...@@ -81,7 +83,7 @@ Check the output of the command. There will be a shareable gradio link (like the
**Optional**: Serve the 70B model instead of the default 8B and use more GPU: **Optional**: Serve the 70B model instead of the default 8B and use more GPU:
```console ```bash
HF_TOKEN="your-huggingface-token" \ HF_TOKEN="your-huggingface-token" \
sky launch serving.yaml \ sky launch serving.yaml \
--gpus A100:8 \ --gpus A100:8 \
...@@ -93,72 +95,71 @@ HF_TOKEN="your-huggingface-token" \ ...@@ -93,72 +95,71 @@ HF_TOKEN="your-huggingface-token" \
SkyPilot can scale up the service to multiple service replicas with built-in autoscaling, load-balancing and fault-tolerance. You can do it by adding a services section to the YAML file. SkyPilot can scale up the service to multiple service replicas with built-in autoscaling, load-balancing and fault-tolerance. You can do it by adding a services section to the YAML file.
```yaml ??? Yaml
service:
replicas: 2 ```yaml
# An actual request for readiness probe. service:
readiness_probe: replicas: 2
path: /v1/chat/completions # An actual request for readiness probe.
post_data: readiness_probe:
model: $MODEL_NAME path: /v1/chat/completions
messages: post_data:
- role: user model: $MODEL_NAME
content: Hello! What is your name? messages:
max_completion_tokens: 1 - role: user
``` content: Hello! What is your name?
<details>
<summary>Click to see the full recipe YAML</summary>
```yaml
service:
replicas: 2
# An actual request for readiness probe.
readiness_probe:
path: /v1/chat/completions
post_data:
model: $MODEL_NAME
messages:
- role: user
content: Hello! What is your name?
max_completion_tokens: 1 max_completion_tokens: 1
```
resources: ??? Yaml
accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
use_spot: True ```yaml
disk_size: 512 # Ensure model checkpoints can fit. service:
disk_tier: best replicas: 2
ports: 8081 # Expose to internet traffic. # An actual request for readiness probe.
readiness_probe:
envs: path: /v1/chat/completions
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct post_data:
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass. model: $MODEL_NAME
messages:
setup: | - role: user
conda create -n vllm python=3.10 -y content: Hello! What is your name?
conda activate vllm max_completion_tokens: 1
pip install vllm==0.4.0.post1 resources:
# Install Gradio for web UI. accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
pip install gradio openai use_spot: True
pip install flash-attn==2.5.7 disk_size: 512 # Ensure model checkpoints can fit.
disk_tier: best
run: | ports: 8081 # Expose to internet traffic.
conda activate vllm
echo 'Starting vllm api server...' envs:
python -u -m vllm.entrypoints.openai.api_server \ MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
--port 8081 \ HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
--model $MODEL_NAME \
--trust-remote-code \ setup: |
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ conda create -n vllm python=3.10 -y
2>&1 | tee api_server.log conda activate vllm
```
pip install vllm==0.4.0.post1
</details> # Install Gradio for web UI.
pip install gradio openai
pip install flash-attn==2.5.7
run: |
conda activate vllm
echo 'Starting vllm api server...'
python -u -m vllm.entrypoints.openai.api_server \
--port 8081 \
--model $MODEL_NAME \
--trust-remote-code \
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
2>&1 | tee api_server.log
```
Start serving the Llama-3 8B model on multiple replicas: Start serving the Llama-3 8B model on multiple replicas:
```console ```bash
HF_TOKEN="your-huggingface-token" \ HF_TOKEN="your-huggingface-token" \
sky serve up -n vllm serving.yaml \ sky serve up -n vllm serving.yaml \
--env HF_TOKEN --env HF_TOKEN
...@@ -166,12 +167,11 @@ HF_TOKEN="your-huggingface-token" \ ...@@ -166,12 +167,11 @@ HF_TOKEN="your-huggingface-token" \
Wait until the service is ready: Wait until the service is ready:
```console ```bash
watch -n10 sky serve status vllm watch -n10 sky serve status vllm
``` ```
<details> Example outputs:
<summary>Example outputs:</summary>
```console ```console
Services Services
...@@ -184,29 +184,29 @@ vllm 1 1 xx.yy.zz.121 18 mins ago 1x GCP([Spot]{'L4': 1}) R ...@@ -184,29 +184,29 @@ vllm 1 1 xx.yy.zz.121 18 mins ago 1x GCP([Spot]{'L4': 1}) R
vllm 2 1 xx.yy.zz.245 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4 vllm 2 1 xx.yy.zz.245 18 mins ago 1x GCP([Spot]{'L4': 1}) READY us-east4
``` ```
</details>
After the service is READY, you can find a single endpoint for the service and access the service with the endpoint: After the service is READY, you can find a single endpoint for the service and access the service with the endpoint:
```console ??? Commands
ENDPOINT=$(sky serve status --endpoint 8081 vllm)
curl -L http://$ENDPOINT/v1/chat/completions \ ```bash
-H "Content-Type: application/json" \ ENDPOINT=$(sky serve status --endpoint 8081 vllm)
-d '{ curl -L http://$ENDPOINT/v1/chat/completions \
"model": "meta-llama/Meta-Llama-3-8B-Instruct", -H "Content-Type: application/json" \
"messages": [ -d '{
{ "model": "meta-llama/Meta-Llama-3-8B-Instruct",
"role": "system", "messages": [
"content": "You are a helpful assistant." {
}, "role": "system",
{ "content": "You are a helpful assistant."
"role": "user", },
"content": "Who are you?" {
} "role": "user",
], "content": "Who are you?"
"stop_token_ids": [128009, 128001] }
}' ],
``` "stop_token_ids": [128009, 128001]
}'
```
To enable autoscaling, you could replace the `replicas` with the following configs in `service`: To enable autoscaling, you could replace the `replicas` with the following configs in `service`:
...@@ -220,67 +220,64 @@ service: ...@@ -220,67 +220,64 @@ service:
This will scale the service up when the QPS exceeds 2 for each replica. This will scale the service up when the QPS exceeds 2 for each replica.
<details> ??? Yaml
<summary>Click to see the full recipe YAML</summary>
```yaml
```yaml service:
service: replica_policy:
replica_policy: min_replicas: 2
min_replicas: 2 max_replicas: 4
max_replicas: 4 target_qps_per_replica: 2
target_qps_per_replica: 2 # An actual request for readiness probe.
# An actual request for readiness probe. readiness_probe:
readiness_probe: path: /v1/chat/completions
path: /v1/chat/completions post_data:
post_data: model: $MODEL_NAME
model: $MODEL_NAME messages:
messages: - role: user
- role: user content: Hello! What is your name?
content: Hello! What is your name? max_completion_tokens: 1
max_completion_tokens: 1
resources:
resources: accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model.
accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for 8B model. use_spot: True
use_spot: True disk_size: 512 # Ensure model checkpoints can fit.
disk_size: 512 # Ensure model checkpoints can fit. disk_tier: best
disk_tier: best ports: 8081 # Expose to internet traffic.
ports: 8081 # Expose to internet traffic.
envs:
envs: MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
HF_TOKEN: <your-huggingface-token> # Change to your own huggingface token, or use --env to pass.
setup: |
setup: | conda create -n vllm python=3.10 -y
conda create -n vllm python=3.10 -y conda activate vllm
conda activate vllm
pip install vllm==0.4.0.post1
pip install vllm==0.4.0.post1 # Install Gradio for web UI.
# Install Gradio for web UI. pip install gradio openai
pip install gradio openai pip install flash-attn==2.5.7
pip install flash-attn==2.5.7
run: |
run: | conda activate vllm
conda activate vllm echo 'Starting vllm api server...'
echo 'Starting vllm api server...' python -u -m vllm.entrypoints.openai.api_server \
python -u -m vllm.entrypoints.openai.api_server \ --port 8081 \
--port 8081 \ --model $MODEL_NAME \
--model $MODEL_NAME \ --trust-remote-code \
--trust-remote-code \ --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
--tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ 2>&1 | tee api_server.log
2>&1 | tee api_server.log ```
```
</details>
To update the service with the new config: To update the service with the new config:
```console ```bash
HF_TOKEN="your-huggingface-token" sky serve update vllm serving.yaml --env HF_TOKEN HF_TOKEN="your-huggingface-token" sky serve update vllm serving.yaml --env HF_TOKEN
``` ```
To stop the service: To stop the service:
```console ```bash
sky serve down vllm sky serve down vllm
``` ```
...@@ -288,42 +285,39 @@ sky serve down vllm ...@@ -288,42 +285,39 @@ sky serve down vllm
It is also possible to access the Llama-3 service with a separate GUI frontend, so that user requests sent to the GUI will be load-balanced across replicas. It is also possible to access the Llama-3 service with a separate GUI frontend, so that user requests sent to the GUI will be load-balanced across replicas.
<details> ??? Yaml
<summary>Click to see the full GUI YAML</summary>
```yaml ```yaml
envs: envs:
MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
ENDPOINT: x.x.x.x:3031 # Address of the API server running vllm. ENDPOINT: x.x.x.x:3031 # Address of the API server running vllm.
resources: resources:
cpus: 2 cpus: 2
setup: |
conda create -n vllm python=3.10 -y
conda activate vllm
# Install Gradio for web UI.
pip install gradio openai
run: |
conda activate vllm
export PATH=$PATH:/sbin
echo 'Starting gradio server...'
git clone https://github.com/vllm-project/vllm.git || true
python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \
-m $MODEL_NAME \
--port 8811 \
--model-url http://$ENDPOINT/v1 \
--stop-token-ids 128009,128001 | tee ~/gradio.log
```
</details> setup: |
conda create -n vllm python=3.10 -y
conda activate vllm
# Install Gradio for web UI.
pip install gradio openai
run: |
conda activate vllm
export PATH=$PATH:/sbin
echo 'Starting gradio server...'
git clone https://github.com/vllm-project/vllm.git || true
python vllm/examples/online_serving/gradio_openai_chatbot_webserver.py \
-m $MODEL_NAME \
--port 8811 \
--model-url http://$ENDPOINT/v1 \
--stop-token-ids 128009,128001 | tee ~/gradio.log
```
1. Start the chat web UI: 1. Start the chat web UI:
```console ```bash
sky launch \ sky launch \
-c gui ./gui.yaml \ -c gui ./gui.yaml \
--env ENDPOINT=$(sky serve status --endpoint vllm) --env ENDPOINT=$(sky serve status --endpoint vllm)
......
...@@ -15,13 +15,13 @@ It can be quickly integrated with vLLM as a backend API server, enabling powerfu ...@@ -15,13 +15,13 @@ It can be quickly integrated with vLLM as a backend API server, enabling powerfu
- Start the vLLM server with the supported chat completion model, e.g. - Start the vLLM server with the supported chat completion model, e.g.
```console ```bash
vllm serve qwen/Qwen1.5-0.5B-Chat vllm serve qwen/Qwen1.5-0.5B-Chat
``` ```
- Install streamlit and openai: - Install streamlit and openai:
```console ```bash
pip install streamlit openai pip install streamlit openai
``` ```
...@@ -29,7 +29,7 @@ pip install streamlit openai ...@@ -29,7 +29,7 @@ pip install streamlit openai
- Start the streamlit web UI and start to chat: - Start the streamlit web UI and start to chat:
```console ```bash
streamlit run streamlit_openai_chatbot_webserver.py streamlit run streamlit_openai_chatbot_webserver.py
# or specify the VLLM_API_BASE or VLLM_API_KEY # or specify the VLLM_API_BASE or VLLM_API_KEY
......
...@@ -7,7 +7,7 @@ vLLM is also available via [Llama Stack](https://github.com/meta-llama/llama-sta ...@@ -7,7 +7,7 @@ vLLM is also available via [Llama Stack](https://github.com/meta-llama/llama-sta
To install Llama Stack, run To install Llama Stack, run
```console ```bash
pip install llama-stack -q pip install llama-stack -q
``` ```
......
...@@ -60,22 +60,22 @@ And then you can send out a query to the OpenAI-compatible API to check the avai ...@@ -60,22 +60,22 @@ And then you can send out a query to the OpenAI-compatible API to check the avai
curl -o- http://localhost:30080/models curl -o- http://localhost:30080/models
``` ```
Expected output: ??? Output
```json ```json
{
"object": "list",
"data": [
{ {
"id": "facebook/opt-125m", "object": "list",
"object": "model", "data": [
"created": 1737428424, {
"owned_by": "vllm", "id": "facebook/opt-125m",
"root": null "object": "model",
"created": 1737428424,
"owned_by": "vllm",
"root": null
}
]
} }
] ```
}
```
To send an actual completion request, you can issue a curl request to the OpenAI-compatible `/completions` endpoint: To send an actual completion request, you can issue a curl request to the OpenAI-compatible `/completions` endpoint:
...@@ -89,23 +89,23 @@ curl -X POST http://localhost:30080/completions \ ...@@ -89,23 +89,23 @@ curl -X POST http://localhost:30080/completions \
}' }'
``` ```
Expected output: ??? Output
```json ```json
{
"id": "completion-id",
"object": "text_completion",
"created": 1737428424,
"model": "facebook/opt-125m",
"choices": [
{ {
"text": " there was a brave knight who...", "id": "completion-id",
"index": 0, "object": "text_completion",
"finish_reason": "length" "created": 1737428424,
"model": "facebook/opt-125m",
"choices": [
{
"text": " there was a brave knight who...",
"index": 0,
"finish_reason": "length"
}
]
} }
] ```
}
```
### Uninstall ### Uninstall
...@@ -121,23 +121,25 @@ sudo helm uninstall vllm ...@@ -121,23 +121,25 @@ sudo helm uninstall vllm
The core vLLM production stack configuration is managed with YAML. Here is the example configuration used in the installation above: The core vLLM production stack configuration is managed with YAML. Here is the example configuration used in the installation above:
```yaml ??? Yaml
servingEngineSpec:
runtimeClassName: ""
modelSpec:
- name: "opt125m"
repository: "vllm/vllm-openai"
tag: "latest"
modelURL: "facebook/opt-125m"
replicaCount: 1 ```yaml
servingEngineSpec:
runtimeClassName: ""
modelSpec:
- name: "opt125m"
repository: "vllm/vllm-openai"
tag: "latest"
modelURL: "facebook/opt-125m"
requestCPU: 6 replicaCount: 1
requestMemory: "16Gi"
requestGPU: 1
pvcStorage: "10Gi" requestCPU: 6
``` requestMemory: "16Gi"
requestGPU: 1
pvcStorage: "10Gi"
```
In this YAML configuration: In this YAML configuration:
* **`modelSpec`** includes: * **`modelSpec`** includes:
......
...@@ -5,19 +5,22 @@ title: Using Kubernetes ...@@ -5,19 +5,22 @@ title: Using Kubernetes
Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine learning models. This guide walks you through deploying vLLM using native Kubernetes. Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine learning models. This guide walks you through deploying vLLM using native Kubernetes.
* [Deployment with CPUs](#deployment-with-cpus) - [Deployment with CPUs](#deployment-with-cpus)
* [Deployment with GPUs](#deployment-with-gpus) - [Deployment with GPUs](#deployment-with-gpus)
- [Troubleshooting](#troubleshooting)
- [Startup Probe or Readiness Probe Failure, container log contains "KeyboardInterrupt: terminated"](#startup-probe-or-readiness-probe-failure-container-log-contains-keyboardinterrupt-terminated)
- [Conclusion](#conclusion)
Alternatively, you can deploy vLLM to Kubernetes using any of the following: Alternatively, you can deploy vLLM to Kubernetes using any of the following:
* [Helm](frameworks/helm.md) - [Helm](frameworks/helm.md)
* [InftyAI/llmaz](integrations/llmaz.md) - [InftyAI/llmaz](integrations/llmaz.md)
* [KServe](integrations/kserve.md) - [KServe](integrations/kserve.md)
* [kubernetes-sigs/lws](frameworks/lws.md) - [kubernetes-sigs/lws](frameworks/lws.md)
* [meta-llama/llama-stack](integrations/llamastack.md) - [meta-llama/llama-stack](integrations/llamastack.md)
* [substratusai/kubeai](integrations/kubeai.md) - [substratusai/kubeai](integrations/kubeai.md)
* [vllm-project/aibrix](https://github.com/vllm-project/aibrix) - [vllm-project/aibrix](https://github.com/vllm-project/aibrix)
* [vllm-project/production-stack](integrations/production-stack.md) - [vllm-project/production-stack](integrations/production-stack.md)
## Deployment with CPUs ## Deployment with CPUs
...@@ -26,89 +29,93 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following: ...@@ -26,89 +29,93 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following:
First, create a Kubernetes PVC and Secret for downloading and storing the Hugging Face model: First, create a Kubernetes PVC and Secret for downloading and storing the Hugging Face model:
```bash ??? Config
cat <<EOF |kubectl apply -f -
apiVersion: v1 ```bash
kind: PersistentVolumeClaim cat <<EOF |kubectl apply -f -
metadata: apiVersion: v1
name: vllm-models kind: PersistentVolumeClaim
spec: metadata:
accessModes: name: vllm-models
- ReadWriteOnce spec:
volumeMode: Filesystem accessModes:
resources: - ReadWriteOnce
requests: volumeMode: Filesystem
storage: 50Gi resources:
--- requests:
apiVersion: v1 storage: 50Gi
kind: Secret ---
metadata: apiVersion: v1
name: hf-token-secret kind: Secret
type: Opaque metadata:
data: name: hf-token-secret
token: $(HF_TOKEN) type: Opaque
EOF data:
``` token: $(HF_TOKEN)
EOF
```
Next, start the vLLM server as a Kubernetes Deployment and Service: Next, start the vLLM server as a Kubernetes Deployment and Service:
```bash ??? Config
cat <<EOF |kubectl apply -f -
apiVersion: apps/v1 ```bash
kind: Deployment cat <<EOF |kubectl apply -f -
metadata: apiVersion: apps/v1
name: vllm-server kind: Deployment
spec:
replicas: 1
selector:
matchLabels:
app.kubernetes.io/name: vllm
template:
metadata: metadata:
labels: name: vllm-server
app.kubernetes.io/name: vllm
spec: spec:
containers: replicas: 1
- name: vllm selector:
image: vllm/vllm-openai:latest matchLabels:
command: ["/bin/sh", "-c"] app.kubernetes.io/name: vllm
args: [ template:
"vllm serve meta-llama/Llama-3.2-1B-Instruct" metadata:
] labels:
env: app.kubernetes.io/name: vllm
- name: HUGGING_FACE_HUB_TOKEN spec:
valueFrom: containers:
secretKeyRef: - name: vllm
name: hf-token-secret image: vllm/vllm-openai:latest
key: token command: ["/bin/sh", "-c"]
ports: args: [
- containerPort: 8000 "vllm serve meta-llama/Llama-3.2-1B-Instruct"
volumeMounts: ]
env:
- name: HUGGING_FACE_HUB_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
ports:
- containerPort: 8000
volumeMounts:
- name: llama-storage
mountPath: /root/.cache/huggingface
volumes:
- name: llama-storage - name: llama-storage
mountPath: /root/.cache/huggingface persistentVolumeClaim:
volumes: claimName: vllm-models
- name: llama-storage ---
persistentVolumeClaim: apiVersion: v1
claimName: vllm-models kind: Service
--- metadata:
apiVersion: v1 name: vllm-server
kind: Service spec:
metadata: selector:
name: vllm-server app.kubernetes.io/name: vllm
spec: ports:
selector: - protocol: TCP
app.kubernetes.io/name: vllm port: 8000
ports: targetPort: 8000
- protocol: TCP type: ClusterIP
port: 8000 EOF
targetPort: 8000 ```
type: ClusterIP
EOF
```
We can verify that the vLLM server has started successfully via the logs (this might take a couple of minutes to download the model): We can verify that the vLLM server has started successfully via the logs (this might take a couple of minutes to download the model):
```console ```bash
kubectl logs -l app.kubernetes.io/name=vllm kubectl logs -l app.kubernetes.io/name=vllm
... ...
INFO: Started server process [1] INFO: Started server process [1]
...@@ -125,6 +132,9 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) ...@@ -125,6 +132,9 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
PVC is used to store the model cache and it is optional, you can use hostPath or other storage options PVC is used to store the model cache and it is optional, you can use hostPath or other storage options
<details>
<summary>Yaml</summary>
```yaml ```yaml
apiVersion: v1 apiVersion: v1
kind: PersistentVolumeClaim kind: PersistentVolumeClaim
...@@ -141,6 +151,8 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) ...@@ -141,6 +151,8 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
volumeMode: Filesystem volumeMode: Filesystem
``` ```
</details>
Secret is optional and only required for accessing gated models, you can skip this step if you are not using gated models Secret is optional and only required for accessing gated models, you can skip this step if you are not using gated models
```yaml ```yaml
...@@ -153,13 +165,16 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) ...@@ -153,13 +165,16 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
stringData: stringData:
token: "REPLACE_WITH_TOKEN" token: "REPLACE_WITH_TOKEN"
``` ```
Next to create the deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model. Next to create the deployment file for vLLM to run the model server. The following example deploys the `Mistral-7B-Instruct-v0.3` model.
Here are two examples for using NVIDIA GPU and AMD GPU. Here are two examples for using NVIDIA GPU and AMD GPU.
NVIDIA GPU: NVIDIA GPU:
<details>
<summary>Yaml</summary>
```yaml ```yaml
apiVersion: apps/v1 apiVersion: apps/v1
kind: Deployment kind: Deployment
...@@ -230,10 +245,15 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) ...@@ -230,10 +245,15 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
periodSeconds: 5 periodSeconds: 5
``` ```
</details>
AMD GPU: AMD GPU:
You can refer to the `deployment.yaml` below if using AMD ROCm GPU like MI300X. You can refer to the `deployment.yaml` below if using AMD ROCm GPU like MI300X.
<details>
<summary>Yaml</summary>
```yaml ```yaml
apiVersion: apps/v1 apiVersion: apps/v1
kind: Deployment kind: Deployment
...@@ -302,12 +322,17 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) ...@@ -302,12 +322,17 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
mountPath: /dev/shm mountPath: /dev/shm
``` ```
</details>
You can get the full example with steps and sample yaml files from <https://github.com/ROCm/k8s-device-plugin/tree/master/example/vllm-serve>. You can get the full example with steps and sample yaml files from <https://github.com/ROCm/k8s-device-plugin/tree/master/example/vllm-serve>.
2. Create a Kubernetes Service for vLLM 2. Create a Kubernetes Service for vLLM
Next, create a Kubernetes Service file to expose the `mistral-7b` deployment: Next, create a Kubernetes Service file to expose the `mistral-7b` deployment:
<details>
<summary>Yaml</summary>
```yaml ```yaml
apiVersion: v1 apiVersion: v1
kind: Service kind: Service
...@@ -327,18 +352,20 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) ...@@ -327,18 +352,20 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
type: ClusterIP type: ClusterIP
``` ```
</details>
3. Deploy and Test 3. Deploy and Test
Apply the deployment and service configurations using `kubectl apply -f <filename>`: Apply the deployment and service configurations using `kubectl apply -f <filename>`:
```console ```bash
kubectl apply -f deployment.yaml kubectl apply -f deployment.yaml
kubectl apply -f service.yaml kubectl apply -f service.yaml
``` ```
To test the deployment, run the following `curl` command: To test the deployment, run the following `curl` command:
```console ```bash
curl http://mistral-7b.default.svc.cluster.local/v1/completions \ curl http://mistral-7b.default.svc.cluster.local/v1/completions \
-H "Content-Type: application/json" \ -H "Content-Type: application/json" \
-d '{ -d '{
...@@ -351,6 +378,17 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit) ...@@ -351,6 +378,17 @@ INFO: Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
If the service is correctly deployed, you should receive a response from the vLLM model. If the service is correctly deployed, you should receive a response from the vLLM model.
## Troubleshooting
### Startup Probe or Readiness Probe Failure, container log contains "KeyboardInterrupt: terminated"
If the startup or readiness probe failureThreshold is too low for the time needed to start up the server, the kubelet will kill the container. A couple of indications that this has happened:
1. container log contains "KeyboardInterrupt: terminated"
2. `kubectl get events` shows message `Container $NAME failed startup probe, will be restarted`
To mitigate, increase the failureThreshold to allow more time for the model server to start serving. You can identify an ideal failureThreshold by removing the probes from the manifest and measuring how much time it takes for the model server to show it's ready to serve.
## Conclusion ## Conclusion
Deploying vLLM with Kubernetes allows for efficient scaling and management of ML models leveraging GPU resources. By following the steps outlined above, you should be able to set up and test a vLLM deployment within your Kubernetes cluster. If you encounter any issues or have suggestions, please feel free to contribute to the documentation. Deploying vLLM with Kubernetes allows for efficient scaling and management of ML models leveraging GPU resources. By following the steps outlined above, you should be able to set up and test a vLLM deployment within your Kubernetes cluster. If you encounter any issues or have suggestions, please feel free to contribute to the documentation.
...@@ -11,13 +11,13 @@ This document shows how to launch multiple vLLM serving containers and use Nginx ...@@ -11,13 +11,13 @@ This document shows how to launch multiple vLLM serving containers and use Nginx
This guide assumes that you have just cloned the vLLM project and you're currently in the vllm root directory. This guide assumes that you have just cloned the vLLM project and you're currently in the vllm root directory.
```console ```bash
export vllm_root=`pwd` export vllm_root=`pwd`
``` ```
Create a file named `Dockerfile.nginx`: Create a file named `Dockerfile.nginx`:
```console ```dockerfile
FROM nginx:latest FROM nginx:latest
RUN rm /etc/nginx/conf.d/default.conf RUN rm /etc/nginx/conf.d/default.conf
EXPOSE 80 EXPOSE 80
...@@ -26,7 +26,7 @@ CMD ["nginx", "-g", "daemon off;"] ...@@ -26,7 +26,7 @@ CMD ["nginx", "-g", "daemon off;"]
Build the container: Build the container:
```console ```bash
docker build . -f Dockerfile.nginx --tag nginx-lb docker build . -f Dockerfile.nginx --tag nginx-lb
``` ```
...@@ -36,36 +36,38 @@ docker build . -f Dockerfile.nginx --tag nginx-lb ...@@ -36,36 +36,38 @@ docker build . -f Dockerfile.nginx --tag nginx-lb
Create a file named `nginx_conf/nginx.conf`. Note that you can add as many servers as you'd like. In the below example we'll start with two. To add more, add another `server vllmN:8000 max_fails=3 fail_timeout=10000s;` entry to `upstream backend`. Create a file named `nginx_conf/nginx.conf`. Note that you can add as many servers as you'd like. In the below example we'll start with two. To add more, add another `server vllmN:8000 max_fails=3 fail_timeout=10000s;` entry to `upstream backend`.
```console ??? Config
upstream backend {
least_conn; ```console
server vllm0:8000 max_fails=3 fail_timeout=10000s; upstream backend {
server vllm1:8000 max_fails=3 fail_timeout=10000s; least_conn;
} server vllm0:8000 max_fails=3 fail_timeout=10000s;
server { server vllm1:8000 max_fails=3 fail_timeout=10000s;
listen 80;
location / {
proxy_pass http://backend;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
} }
} server {
``` listen 80;
location / {
proxy_pass http://backend;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
proxy_set_header X-Forwarded-Proto $scheme;
}
}
```
[](){ #nginxloadbalancer-nginx-vllm-container } [](){ #nginxloadbalancer-nginx-vllm-container }
## Build vLLM Container ## Build vLLM Container
```console ```bash
cd $vllm_root cd $vllm_root
docker build -f docker/Dockerfile . --tag vllm docker build -f docker/Dockerfile . --tag vllm
``` ```
If you are behind proxy, you can pass the proxy settings to the docker build command as shown below: If you are behind proxy, you can pass the proxy settings to the docker build command as shown below:
```console ```bash
cd $vllm_root cd $vllm_root
docker build \ docker build \
-f docker/Dockerfile . \ -f docker/Dockerfile . \
...@@ -78,7 +80,7 @@ docker build \ ...@@ -78,7 +80,7 @@ docker build \
## Create Docker Network ## Create Docker Network
```console ```bash
docker network create vllm_nginx docker network create vllm_nginx
``` ```
...@@ -93,30 +95,32 @@ Notes: ...@@ -93,30 +95,32 @@ Notes:
- The below example assumes GPU backend used. If you are using CPU backend, remove `--gpus device=ID`, add `VLLM_CPU_KVCACHE_SPACE` and `VLLM_CPU_OMP_THREADS_BIND` environment variables to the docker run command. - The below example assumes GPU backend used. If you are using CPU backend, remove `--gpus device=ID`, add `VLLM_CPU_KVCACHE_SPACE` and `VLLM_CPU_OMP_THREADS_BIND` environment variables to the docker run command.
- Adjust the model name that you want to use in your vLLM servers if you don't want to use `Llama-2-7b-chat-hf`. - Adjust the model name that you want to use in your vLLM servers if you don't want to use `Llama-2-7b-chat-hf`.
```console ??? Commands
mkdir -p ~/.cache/huggingface/hub/
hf_cache_dir=~/.cache/huggingface/ ```console
docker run \ mkdir -p ~/.cache/huggingface/hub/
-itd \ hf_cache_dir=~/.cache/huggingface/
--ipc host \ docker run \
--network vllm_nginx \ -itd \
--gpus device=0 \ --ipc host \
--shm-size=10.24gb \ --network vllm_nginx \
-v $hf_cache_dir:/root/.cache/huggingface/ \ --gpus device=0 \
-p 8081:8000 \ --shm-size=10.24gb \
--name vllm0 vllm \ -v $hf_cache_dir:/root/.cache/huggingface/ \
--model meta-llama/Llama-2-7b-chat-hf -p 8081:8000 \
docker run \ --name vllm0 vllm \
-itd \ --model meta-llama/Llama-2-7b-chat-hf
--ipc host \ docker run \
--network vllm_nginx \ -itd \
--gpus device=1 \ --ipc host \
--shm-size=10.24gb \ --network vllm_nginx \
-v $hf_cache_dir:/root/.cache/huggingface/ \ --gpus device=1 \
-p 8082:8000 \ --shm-size=10.24gb \
--name vllm1 vllm \ -v $hf_cache_dir:/root/.cache/huggingface/ \
--model meta-llama/Llama-2-7b-chat-hf -p 8082:8000 \
``` --name vllm1 vllm \
--model meta-llama/Llama-2-7b-chat-hf
```
!!! note !!! note
If you are behind proxy, you can pass the proxy settings to the docker run command via `-e http_proxy=$http_proxy -e https_proxy=$https_proxy`. If you are behind proxy, you can pass the proxy settings to the docker run command via `-e http_proxy=$http_proxy -e https_proxy=$https_proxy`.
...@@ -125,7 +129,7 @@ docker run \ ...@@ -125,7 +129,7 @@ docker run \
## Launch Nginx ## Launch Nginx
```console ```bash
docker run \ docker run \
-itd \ -itd \
-p 8000:80 \ -p 8000:80 \
...@@ -138,7 +142,7 @@ docker run \ ...@@ -138,7 +142,7 @@ docker run \
## Verify That vLLM Servers Are Ready ## Verify That vLLM Servers Are Ready
```console ```bash
docker logs vllm0 | grep Uvicorn docker logs vllm0 | grep Uvicorn
docker logs vllm1 | grep Uvicorn docker logs vllm1 | grep Uvicorn
``` ```
......
...@@ -22,31 +22,33 @@ server. ...@@ -22,31 +22,33 @@ server.
Here is a sample of `LLM` class usage: Here is a sample of `LLM` class usage:
```python ??? Code
from vllm import LLM, SamplingParams
```python
# Define a list of input prompts from vllm import LLM, SamplingParams
prompts = [
"Hello, my name is", # Define a list of input prompts
"The capital of France is", prompts = [
"The largest ocean is", "Hello, my name is",
] "The capital of France is",
"The largest ocean is",
# Define sampling parameters ]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Define sampling parameters
# Initialize the LLM engine with the OPT-125M model sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
llm = LLM(model="facebook/opt-125m")
# Initialize the LLM engine with the OPT-125M model
# Generate outputs for the input prompts llm = LLM(model="facebook/opt-125m")
outputs = llm.generate(prompts, sampling_params)
# Generate outputs for the input prompts
# Print the generated outputs outputs = llm.generate(prompts, sampling_params)
for output in outputs:
prompt = output.prompt # Print the generated outputs
generated_text = output.outputs[0].text for output in outputs:
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") prompt = output.prompt
``` generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
More API details can be found in the [Offline Inference](#offline-inference-api) section of the API docs. More API details can be found in the [Offline Inference](#offline-inference-api) section of the API docs.
...@@ -72,7 +74,7 @@ python -m vllm.entrypoints.openai.api_server --model <model> ...@@ -72,7 +74,7 @@ python -m vllm.entrypoints.openai.api_server --model <model>
That code can be found in <gh-file:vllm/entrypoints/openai/api_server.py>. That code can be found in <gh-file:vllm/entrypoints/openai/api_server.py>.
More details on the API server can be found in the [OpenAI-Compatible Server][openai-compatible-server] document. More details on the API server can be found in the [OpenAI-Compatible Server][serving-openai-compatible-server] document.
## LLM Engine ## LLM Engine
...@@ -178,32 +180,34 @@ vision-language model. ...@@ -178,32 +180,34 @@ vision-language model.
To avoid accidentally passing incorrect arguments, the constructor is now keyword-only. This ensures that the constructor will raise an error if old configurations are passed. vLLM developers have already made this change for all models within vLLM. For out-of-tree registered models, developers need to update their models, for example by adding shim code to adapt the old constructor signature to the new one: To avoid accidentally passing incorrect arguments, the constructor is now keyword-only. This ensures that the constructor will raise an error if old configurations are passed. vLLM developers have already made this change for all models within vLLM. For out-of-tree registered models, developers need to update their models, for example by adding shim code to adapt the old constructor signature to the new one:
```python ??? Code
class MyOldModel(nn.Module):
def __init__( ```python
self, class MyOldModel(nn.Module):
config, def __init__(
cache_config: Optional[CacheConfig] = None, self,
quant_config: Optional[QuantizationConfig] = None, config,
lora_config: Optional[LoRAConfig] = None, cache_config: Optional[CacheConfig] = None,
prefix: str = "", quant_config: Optional[QuantizationConfig] = None,
) -> None: lora_config: Optional[LoRAConfig] = None,
... prefix: str = "",
) -> None:
from vllm.config import VllmConfig ...
class MyNewModel(MyOldModel):
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): from vllm.config import VllmConfig
config = vllm_config.model_config.hf_config class MyNewModel(MyOldModel):
cache_config = vllm_config.cache_config def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
quant_config = vllm_config.quant_config config = vllm_config.model_config.hf_config
lora_config = vllm_config.lora_config cache_config = vllm_config.cache_config
super().__init__(config, cache_config, quant_config, lora_config, prefix) quant_config = vllm_config.quant_config
lora_config = vllm_config.lora_config
if __version__ >= "0.6.4": super().__init__(config, cache_config, quant_config, lora_config, prefix)
MyModel = MyNewModel
else: if __version__ >= "0.6.4":
MyModel = MyOldModel MyModel = MyNewModel
``` else:
MyModel = MyOldModel
```
This way, the model can work with both old and new versions of vLLM. This way, the model can work with both old and new versions of vLLM.
......
...@@ -448,27 +448,29 @@ elements of the entire head for all context tokens. However, overall, ...@@ -448,27 +448,29 @@ elements of the entire head for all context tokens. However, overall,
all results for output have been calculated but are just stored in all results for output have been calculated but are just stored in
different thread register memory. different thread register memory.
```cpp ??? Code
float* out_smem = reinterpret_cast<float*>(shared_mem);
for (int i = NUM_WARPS; i > 1; i /= 2) {
// Upper warps write to shared memory.
...
float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE];
for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
...
dst[row_idx] = accs[i];
}
// Lower warps update the output. ```cpp
const float* src = &out_smem[warp_idx * HEAD_SIZE]; float* out_smem = reinterpret_cast<float*>(shared_mem);
for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) { for (int i = NUM_WARPS; i > 1; i /= 2) {
// Upper warps write to shared memory.
... ...
accs[i] += src[row_idx]; float* dst = &out_smem[(warp_idx - mid) * HEAD_SIZE];
for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
...
dst[row_idx] = accs[i];
}
// Lower warps update the output.
const float* src = &out_smem[warp_idx * HEAD_SIZE];
for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
...
accs[i] += src[row_idx];
}
// Write out the accs.
} }
```
// Write out the accs.
}
```
## Output ## Output
......
...@@ -13,28 +13,30 @@ Plugins are user-registered code that vLLM executes. Given vLLM's architecture ( ...@@ -13,28 +13,30 @@ Plugins are user-registered code that vLLM executes. Given vLLM's architecture (
vLLM's plugin system uses the standard Python `entry_points` mechanism. This mechanism allows developers to register functions in their Python packages for use by other packages. An example of a plugin: vLLM's plugin system uses the standard Python `entry_points` mechanism. This mechanism allows developers to register functions in their Python packages for use by other packages. An example of a plugin:
```python ??? Code
# inside `setup.py` file
from setuptools import setup ```python
# inside `setup.py` file
setup(name='vllm_add_dummy_model', from setuptools import setup
version='0.1',
packages=['vllm_add_dummy_model'], setup(name='vllm_add_dummy_model',
entry_points={ version='0.1',
'vllm.general_plugins': packages=['vllm_add_dummy_model'],
["register_dummy_model = vllm_add_dummy_model:register"] entry_points={
}) 'vllm.general_plugins':
["register_dummy_model = vllm_add_dummy_model:register"]
# inside `vllm_add_dummy_model.py` file })
def register():
from vllm import ModelRegistry # inside `vllm_add_dummy_model.py` file
def register():
if "MyLlava" not in ModelRegistry.get_supported_archs(): from vllm import ModelRegistry
ModelRegistry.register_model(
"MyLlava", if "MyLlava" not in ModelRegistry.get_supported_archs():
"vllm_add_dummy_model.my_llava:MyLlava", ModelRegistry.register_model(
) "MyLlava",
``` "vllm_add_dummy_model.my_llava:MyLlava",
)
```
For more information on adding entry points to your package, please check the [official documentation](https://setuptools.pypa.io/en/latest/userguide/entry_point.html). For more information on adding entry points to your package, please check the [official documentation](https://setuptools.pypa.io/en/latest/userguide/entry_point.html).
......
...@@ -7,7 +7,7 @@ page for information on known issues and how to solve them. ...@@ -7,7 +7,7 @@ page for information on known issues and how to solve them.
## Introduction ## Introduction
!!! warning !!! important
The source code references are to the state of the code at the time of writing in December, 2024. The source code references are to the state of the code at the time of writing in December, 2024.
The use of Python multiprocessing in vLLM is complicated by: The use of Python multiprocessing in vLLM is complicated by:
...@@ -123,7 +123,7 @@ what is happening. First, a log message from vLLM: ...@@ -123,7 +123,7 @@ what is happening. First, a log message from vLLM:
WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously WARNING 12-11 14:50:37 multiproc_worker_utils.py:281] CUDA was previously
initialized. We must use the `spawn` multiprocessing start method. Setting initialized. We must use the `spawn` multiprocessing start method. Setting
VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See
https://docs.vllm.ai/en/latest/usage/debugging.html#python-multiprocessing https://docs.vllm.ai/en/latest/usage/troubleshooting.html#python-multiprocessing
for more information. for more information.
``` ```
......
An implementation of xPyD with dynamic scaling based on point-to-point communication, partly inspired by Dynamo.
# Detailed Design
## Overall Process
As shown in Figure 1, the overall process of this **PD disaggregation** solution is described through a request flow:
1. The client sends an HTTP request to the Proxy/Router's `/v1/completions` interface.
2. The Proxy/Router selects a **1P1D (1 Prefill instance + 1 Decode instance)** through either round-robin or random selection, generates a `request_id` (rules to be introduced later), modifies the `max_tokens` in the HTTP request message to **1**, and then forwards the request to the **P instance**.
3. Immediately afterward, the Proxy/Router forwards the **original HTTP request** to the **D instance**.
4. The **P instance** performs **Prefill** and then **actively sends the generated KV cache** to the D instance (using **PUT_ASYNC** mode). The D instance's `zmq_addr` can be resolved through the `request_id`.
5. The **D instance** has a **dedicated thread** for receiving the KV cache (to avoid blocking the main process). The received KV cache is saved into the **GPU memory buffer**, the size of which is determined by the vLLM startup parameter `kv_buffer_size`. When the GPU buffer is full, the KV cache is stored in the **local Tensor memory pool**.
6. During the **Decode**, the D instance's main process retrieves the KV cache (transmitted by the P instance) from either the **GPU buffer** or the **memory pool**, thereby **skipping Prefill**.
7. After completing **Decode**, the D instance returns the result to the **Proxy/Router**, which then forwards it to the **client**.
![image1](https://github.com/user-attachments/assets/fb01bde6-755b-49f7-ad45-48a94b1e10a7)
## Proxy/Router (Demo)
A simple HTTP service acts as the entry point for client requests and starts a background thread to listen for P/D instances reporting their HTTP IP and PORT, as well as ZMQ IP and PORT. It maintains a dictionary of `http_addr -> zmq_addr`. The `http_addr` is the IP:PORT for the vLLM instance's request, while the `zmq_addr` is the address for KV cache handshake and metadata reception.
The Proxy/Router is responsible for selecting 1P1D based on the characteristics of the client request, such as the prompt, and generating a corresponding `request_id`, for example:
```
cmpl-___prefill_addr_10.0.1.2:21001___decode_addr_10.0.1.3:22001_93923d63113b4b338973f24d19d4bf11-0
```
Currently, to quickly verify whether xPyD can work, a round-robin selection of 1P1D is used. In the future, it is planned to use a trie combined with the load status of instances to select appropriate P and D.
Each P/D instance periodically sends a heartbeat packet to the Proxy/Router (currently every 3 seconds) to register (i.e., report `http_addr -> zmq_addr`) and keep the connection alive. If an instance crashes and fails to send a ping for a certain period of time, the Proxy/Router will remove the timed-out instance (this feature has not yet been developed).
## KV Cache Transfer Methods
There are three methods for KVcache transfer: PUT, GET, and PUT_ASYNC. These methods can be specified using the `--kv-transfer-config` and `kv_connector_extra_config` parameters, specifically through the `send_type` field. Both PUT and PUT_ASYNC involve the P instance actively sending KVcache to the D instance. The difference is that PUT is a synchronous transfer method that blocks the main process, while PUT_ASYNC is an asynchronous transfer method. PUT_ASYNC uses a dedicated thread for sending KVcache, which means it does not block the main process. In contrast, the GET method involves the P instance saving the KVcache to the memory buffer after computing the prefill. The D instance then actively retrieves the computed KVcache from the P instance once it has allocated space for the KVcache.
Experimental results have shown that the performance of these methods, from highest to lowest, is as follows: PUT_ASYNC → GET → PUT.
## P2P Communication via ZMQ & NCCL
As long as the address of the counterpart is known, point-to-point KV cache transfer (using NCCL) can be performed, without being constrained by rank and world size. This supports dynamic scaling (expansion and contraction) of instances with PD disaggregation, meaning that adding or removing P/D instances does not require a full system restart.
Each P/D instance only needs to create a single `P2pNcclEngine` instance. This instance maintains a ZMQ Server, which runs a dedicated thread to listen on the `zmq_addr` address and receive control flow requests from other instances. These requests include requests to establish an NCCL connection and requests to send KVcache metadata (such as tensor shapes and data types). However, it does not actually transmit the KVcache data itself.
When a P instance and a D instance transmit KVcache for the first time, they need to establish a ZMQ connection and an NCCL group. For subsequent KVcache transmissions, this ZMQ connection and NCCL group are reused. The NCCL group consists of only two ranks, meaning the world size is equal to 2. This design is intended to support dynamic scaling, which means that adding or removing P/D instances does not require a full system restart. As long as the address of the counterpart is known, point-to-point KVcache transmission can be performed, without being restricted by rank or world size.
## NCCL Group Topology
Currently, only symmetric TP (Tensor Parallelism) methods are supported for KVcache transmission. Asymmetric TP and PP (Pipeline Parallelism) methods will be supported in the future. Figure 2 illustrates the 1P2D setup, where each instance has a TP (Tensor Parallelism) degree of 2. There are a total of 7 NCCL groups: three vLLM instances each have one NCCL group with TP=2. Additionally, the 0th GPU card of the P instance establishes an NCCL group with the 0th GPU card of each D instance. Similarly, the 1st GPU card of the P instance establishes an NCCL group with the 1st GPU card of each D instance.
![image2](https://github.com/user-attachments/assets/837e61d6-365e-4cbf-8640-6dd7ab295b36)
Each NCCL group occupies a certain amount of GPU memory buffer for communication, the size of which is primarily influenced by the `NCCL_MAX_NCHANNELS` environment variable. When `NCCL_MAX_NCHANNELS=16`, an NCCL group typically occupies 100MB, while when `NCCL_MAX_NCHANNELS=8`, it usually takes up 52MB. For large-scale xPyD configurations—such as DeepSeek's 96P144D—this implementation is currently not feasible. Moving forward, we are considering using RDMA for point-to-point communication and are also keeping an eye on UCCL.
## GPU Memory Buffer and Tensor Memory Pool
The trade-off in the size of the memory buffer is as follows: For P instances, the memory buffer is not required in PUT and PUT_ASYNC modes, but it is necessary in GET mode. For D instances, a memory buffer is needed in all three modes. The memory buffer for D instances should not be too large. Similarly, for P instances in GET mode, the memory buffer should also not be too large. The memory buffer of D instances is used to temporarily store KVcache sent by P instances. If it is too large, it will reduce the KVcache space available for normal inference by D instances, thereby decreasing the inference batch size and ultimately leading to a reduction in output throughput. The size of the memory buffer is configured by the parameter `kv_buffer_size`, measured in bytes, and is typically set to 5%~10% of the memory size.
If the `--max-num-seqs` parameter for P instances is set to a large value, due to the large batch size, P instances will generate a large amount of KVcache simultaneously. This may exceed the capacity of the memory buffer of D instances, resulting in KVcache loss. Once KVcache is lost, D instances need to recompute Prefill, which is equivalent to performing Prefill twice. Consequently, the time-to-first-token (TTFT) will significantly increase, leading to degraded performance.
To address the above issues, I have designed and developed a local Tensor memory pool for storing KVcache, inspired by the buddy system used in Linux memory management. Since the memory is sufficiently large, typically in the TB range on servers, there is no need to consider prefix caching or using block-based designs to reuse memory, thereby saving space. When the memory buffer is insufficient, KVcache can be directly stored in the Tensor memory pool, and D instances can subsequently retrieve KVcache from it. The read and write speed is that of PCIe, with PCIe 4.0 having a speed of approximately 21 GB/s, which is usually faster than the Prefill speed. Otherwise, solutions like Mooncake and lmcache would not be necessary. The Tensor memory pool acts as a flood diversion area, typically unused except during sudden traffic surges. In the worst-case scenario, my solution performs no worse than the normal situation with a Cache store.
# Install vLLM
??? Commands
```shell
# Enter the home directory or your working directory.
cd /home
# Download the installation package, and I will update the commit-id in time. You can directly copy the command.
wget https://vllm-wheels.s3.us-west-2.amazonaws.com/9112b443a042d8d815880b8780633882ad32b183/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
# Download the code repository.
git clone -b xpyd-v1 https://github.com/Abatom/vllm.git
cd vllm
# Set the installation package path.
export VLLM_PRECOMPILED_WHEEL_LOCATION=/home/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
# installation
pip install -e . -v
```
# Run xPyD
## Instructions
- The following examples are run on an A800 (80GB) device, using the Meta-Llama-3.1-8B-Instruct model.
- Pay attention to the setting of the `kv_buffer_size` (in bytes). The empirical value is 10% of the GPU memory size. This is related to the kvcache size. If it is too small, the GPU memory buffer for temporarily storing the received kvcache will overflow, causing the kvcache to be stored in the tensor memory pool, which increases latency. If it is too large, the kvcache available for inference will be reduced, leading to a smaller batch size and decreased throughput.
- For Prefill instances, when using non-GET mode, the `kv_buffer_size` can be set to a minimal value such as 1 (the examples below use `1e1`), as Prefill currently does not need to receive kvcache. However, when using GET mode, a larger `kv_buffer_size` is required because it needs to store the kvcache sent to the D instance.
- You may need to modify the `kv_buffer_size` and `port` in the following commands (if there is a conflict).
- `PUT_ASYNC` offers the best performance and should be prioritized.
- The `--port` must be consistent with the `http_port` in the `--kv-transfer-config`.
- The `disagg_prefill_proxy_xpyd.py` script will use port 10001 (for receiving client requests) and port 30001 (for receiving service discovery from P and D instances).
- The node running the proxy must have `quart` installed.
- Multi-node deployment is supported; you only need to modify the `proxy_ip` and `proxy_port` in `--kv-transfer-config`.
- In the following examples, it is assumed that **the proxy's IP is 10.0.1.1**.
## Run 1P3D
### Proxy (e.g. 10.0.1.1)
```shell
cd {your vllm directory}/examples/online_serving/disagg_xpyd/
python3 disagg_prefill_proxy_xpyd.py &
```
### Prefill1 (e.g. 10.0.1.2 or 10.0.1.1)
??? Command
```shell
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
--host 0.0.0.0 \
--port 20005 \
--tensor-parallel-size 1 \
--seed 1024 \
--served-model-name base_model \
--dtype float16 \
--max-model-len 10000 \
--max-num-batched-tokens 10000 \
--max-num-seqs 256 \
--trust-remote-code \
--gpu-memory-utilization 0.9 \
--disable-log-request \
--kv-transfer-config \
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20005","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
```
### Decode1 (e.g. 10.0.1.3 or 10.0.1.1)
??? Command
```shell
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
--host 0.0.0.0 \
--port 20009 \
--tensor-parallel-size 1 \
--seed 1024 \
--served-model-name base_model \
--dtype float16 \
--max-model-len 10000 \
--max-num-batched-tokens 10000 \
--max-num-seqs 256 \
--trust-remote-code \
--gpu-memory-utilization 0.7 \
--disable-log-request \
--kv-transfer-config \
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20009","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
```
### Decode2 (e.g. 10.0.1.4 or 10.0.1.1)
??? Command
```shell
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
--host 0.0.0.0 \
--port 20003 \
--tensor-parallel-size 1 \
--seed 1024 \
--served-model-name base_model \
--dtype float16 \
--max-model-len 10000 \
--max-num-batched-tokens 10000 \
--max-num-seqs 256 \
--trust-remote-code \
--gpu-memory-utilization 0.7 \
--disable-log-request \
--kv-transfer-config \
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
```
### Decode3 (e.g. 10.0.1.5 or 10.0.1.1)
??? Command
```shell
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
--host 0.0.0.0 \
--port 20008 \
--tensor-parallel-size 1 \
--seed 1024 \
--served-model-name base_model \
--dtype float16 \
--max-model-len 10000 \
--max-num-batched-tokens 10000 \
--max-num-seqs 256 \
--trust-remote-code \
--gpu-memory-utilization 0.7 \
--disable-log-request \
--kv-transfer-config \
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20008","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
```
## Run 3P1D
### Proxy (e.g. 10.0.1.1)
```shell
cd {your vllm directory}/examples/online_serving/disagg_xpyd/
python3 disagg_prefill_proxy_xpyd.py &
```
### Prefill1 (e.g. 10.0.1.2 or 10.0.1.1)
??? Command
```shell
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \
--host 0.0.0.0 \
--port 20005 \
--tensor-parallel-size 1 \
--seed 1024 \
--served-model-name base_model \
--dtype float16 \
--max-model-len 10000 \
--max-num-batched-tokens 10000 \
--max-num-seqs 256 \
--trust-remote-code \
--gpu-memory-utilization 0.9 \
--disable-log-request \
--kv-transfer-config \
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"21001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20005","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
```
### Prefill2 (e.g. 10.0.1.3 or 10.0.1.1)
??? Command
```shell
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \
--host 0.0.0.0 \
--port 20009 \
--tensor-parallel-size 1 \
--seed 1024 \
--served-model-name base_model \
--dtype float16 \
--max-model-len 10000 \
--max-num-batched-tokens 10000 \
--max-num-seqs 256 \
--trust-remote-code \
--gpu-memory-utilization 0.9 \
--disable-log-request \
--kv-transfer-config \
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"22001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20009","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
```
### Prefill3 (e.g. 10.0.1.4 or 10.0.1.1)
??? Command
```shell
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \
--host 0.0.0.0 \
--port 20003 \
--tensor-parallel-size 1 \
--seed 1024 \
--served-model-name base_model \
--dtype float16 \
--max-model-len 10000 \
--max-num-batched-tokens 10000 \
--max-num-seqs 256 \
--trust-remote-code \
--gpu-memory-utilization 0.9 \
--disable-log-request \
--kv-transfer-config \
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_buffer_size":"1e1","kv_port":"23001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20003","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
```
### Decode1 (e.g. 10.0.1.5 or 10.0.1.1)
??? Command
```shell
VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \
--host 0.0.0.0 \
--port 20008 \
--tensor-parallel-size 1 \
--seed 1024 \
--served-model-name base_model \
--dtype float16 \
--max-model-len 10000 \
--max-num-batched-tokens 10000 \
--max-num-seqs 256 \
--trust-remote-code \
--gpu-memory-utilization 0.7 \
--disable-log-request \
--kv-transfer-config \
'{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_buffer_size":"8e9","kv_port":"24001","kv_connector_extra_config":{"proxy_ip":"10.0.1.1","proxy_port":"30001","http_port":"20008","send_type":"PUT_ASYNC","nccl_num_channels":"16"}}' > /var/vllm.log 2>&1 &
```
# Single request
```shell
curl -X POST -s http://10.0.1.1:10001/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "base_model",
"prompt": "San Francisco is a",
"max_tokens": 10,
"temperature": 0
}'
```
# Benchmark
??? Command
```shell
python3 benchmark_serving.py \
--backend vllm \
--model base_model \
--tokenizer meta-llama/Llama-3.1-8B-Instruct \
--dataset-name "random" \
--host 10.0.1.1 \
--port 10001 \
--random-input-len 1024 \
--random-output-len 1024 \
--ignore-eos \
--burstiness 100 \
--percentile-metrics "ttft,tpot,itl,e2el" \
--metric-percentiles "90,95,99" \
--seed $(date +%s) \
--trust-remote-code \
--request-rate 3 \
--num-prompts 1000
```
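# Shut down

Note: the command below force-kills every Python process on the node (anything whose name or command line matches `python`), not only the vLLM instances and the proxy, so run it only on machines dedicated to this deployment.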
```shell
pgrep python | xargs kill -9 && pkill -f python
```
# Test data
## **Scenario 1**: 1K input & 1K output tokens, E2E P99 latency ~20s
- **1P5D (6×A800) vs vLLM (1×A800)**:
- Throughput per GPU ↑7.2% (1085 → 6979/6 ≈ 1163)
- ITL (P99) ↓81.3% (120ms → 22.9ms)
- TTFT (P99) ↑26.8% (175ms → 222ms)
- TPOT: No change
- **1P6D (7×A800) vs vLLM (1×A800)**:
- Throughput per GPU ↑9.6% (1085 → 8329/7 ≈ 1190)
- ITL (P99) ↓81.0% (120ms → 22.7ms)
- TTFT (P99) ↑210% (175ms → 543ms)
- TPOT: No change
## **Scenario 2**: 1K input & 200 output tokens, E2E P99 latency ~4s
- **1P1D (2×A800) vs vLLM (1×A800)**:
- Throughput per GPU ↑37.4% (537 → 1476/2 = 738)
- ITL (P99) ↓81.8% (127ms → 23.1ms)
- TTFT (P99) ↑41.8% (160ms → 227ms)
- TPOT: No change
![testdata](https://github.com/user-attachments/assets/f791bfc7-9f3d-4e5c-9171-a42f9f4da627)
...@@ -117,8 +117,8 @@ There are two design points to highlight: ...@@ -117,8 +117,8 @@ There are two design points to highlight:
1. We allocate all KVCacheBlock when initializing the KV cache manager to be a block pool. This avoids Python object creation overheads and can easily track all blocks all the time. 1. We allocate all KVCacheBlock when initializing the KV cache manager to be a block pool. This avoids Python object creation overheads and can easily track all blocks all the time.
2. We introduce doubly linked list pointers directly in the KVCacheBlock, so that we could construct a free queue directly. This gives us two benefits: 2. We introduce doubly linked list pointers directly in the KVCacheBlock, so that we could construct a free queue directly. This gives us two benefits:
1. We could have O(1) complexity moving elements in the middle to the tail. 1. We could have O(1) complexity moving elements in the middle to the tail.
2. We could avoid introducing another Python queue (e.g., `deque`) which has a wrapper to the elements. 2. We could avoid introducing another Python queue (e.g., `deque`) which has a wrapper to the elements.
As a result, we will have the following components when the KV cache manager is initialized: As a result, we will have the following components when the KV cache manager is initialized:
...@@ -135,19 +135,19 @@ As a result, we will have the following components when the KV cache manager is ...@@ -135,19 +135,19 @@ As a result, we will have the following components when the KV cache manager is
**New request:** Workflow for the scheduler to schedule a new request with KV cache block allocation: **New request:** Workflow for the scheduler to schedule a new request with KV cache block allocation:
1. The scheduler calls `kv_cache_manager.get_computed_blocks()` to get a sequence of blocks that have already been computed. This is done by hashing the prompt tokens in the request and looking up Cache Blocks. 1. The scheduler calls `kv_cache_manager.get_computed_blocks()` to get a sequence of blocks that have already been computed. This is done by hashing the prompt tokens in the request and looking up cache blocks.
2. The scheduler calls `kv_cache_manager.allocate_slots()`. It does the following steps: 2. The scheduler calls `kv_cache_manager.allocate_slots()`. It does the following steps:
1. Compute the number of new required blocks, and return if there are no sufficient blocks to allocate. 1. Compute the number of new required blocks, and return if there are no sufficient blocks to allocate.
2. “Touch” the computed blocks. It increases the reference count of the computed block by one, and removes the block from the free queue if the block wasn’t used by other requests. This is to avoid these computed blocks being evicted. See the example in the next section for illustration. 2. “Touch” the computed blocks. It increases the reference count of the computed block by one, and removes the block from the free queue if the block wasn’t used by other requests. This is to avoid these computed blocks being evicted. See the example in the next section for illustration.
3. Allocate new blocks by popping the heads of the free queue. If the head block is a cached block, this also “evicts” the block so that no other requests can reuse it anymore from now on. 3. Allocate new blocks by popping the heads of the free queue. If the head block is a cached block, this also “evicts” the block so that no other requests can reuse it anymore from now on.
4. If an allocated block is already full of tokens, we immediately add it to the Cache Block, so that the block can be reused by other requests in the same batch. 4. If an allocated block is already full of tokens, we immediately add it to the cache block, so that the block can be reused by other requests in the same batch.
**Running request:** Workflow for the scheduler to schedule a running request with KV cache block allocation: **Running request:** Workflow for the scheduler to schedule a running request with KV cache block allocation:
1. The scheduler calls `kv_cache_manager.allocate_slots()`. It does the following steps: 1. The scheduler calls `kv_cache_manager.allocate_slots()`. It does the following steps:
1. Compute the number of new required blocks, and return if there are no sufficient blocks to allocate. 1. Compute the number of new required blocks, and return if there are no sufficient blocks to allocate.
2. Allocate new blocks by popping the heads of the free queue. If the head block is a cached block, this also “evicts” the block so that no other requests can reuse it anymore from now on. 2. Allocate new blocks by popping the heads of the free queue. If the head block is a cached block, this also “evicts” the block so that no other requests can reuse it anymore from now on.
3. Append token IDs to the slots in existing blocks as well as the new blocks. If a block is full, we add it to the Cache Block to cache it. 3. Append token IDs to the slots in existing blocks as well as the new blocks. If a block is full, we add it to the cache block to cache it.
**Duplicated blocks** **Duplicated blocks**
Assuming block size is 4 and you send a request (Request 1\) with prompt ABCDEF and decoding length 3: Assuming block size is 4 and you send a request (Request 1\) with prompt ABCDEF and decoding length 3:
...@@ -199,7 +199,7 @@ When a request is finished, we free all its blocks if no other requests are usin ...@@ -199,7 +199,7 @@ When a request is finished, we free all its blocks if no other requests are usin
When the head block (least recently used block) of the free queue is cached, we have to evict the block to prevent it from being used by other requests. Specifically, eviction involves the following steps: When the head block (least recently used block) of the free queue is cached, we have to evict the block to prevent it from being used by other requests. Specifically, eviction involves the following steps:
1. Pop the block from the head of the free queue. This is the LRU block to be evicted. 1. Pop the block from the head of the free queue. This is the LRU block to be evicted.
2. Remove the block ID from the Cache Block. 2. Remove the block ID from the cache block.
3. Remove the block hash. 3. Remove the block hash.
## Example ## Example
......
...@@ -28,27 +28,29 @@ A unique aspect of vLLM's `torch.compile` integration, is that we guarantee all ...@@ -28,27 +28,29 @@ A unique aspect of vLLM's `torch.compile` integration, is that we guarantee all
In the very verbose logs, we can see: In the very verbose logs, we can see:
``` ??? Logs
DEBUG 03-07 03:06:52 [decorators.py:203] Start compiling function <code object forward at 0x7f08acf40c90, file "xxx/vllm/model_executor/models/llama.py", line 339>
```text
DEBUG 03-07 03:06:54 [backends.py:370] Traced files (to be considered for compilation cache): DEBUG 03-07 03:06:52 [decorators.py:203] Start compiling function <code object forward at 0x7f08acf40c90, file "xxx/vllm/model_executor/models/llama.py", line 339>
DEBUG 03-07 03:06:54 [backends.py:370] xxx/torch/_dynamo/polyfills/builtins.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/torch/nn/modules/container.py DEBUG 03-07 03:06:54 [backends.py:370] Traced files (to be considered for compilation cache):
DEBUG 03-07 03:06:54 [backends.py:370] xxx/torch/nn/modules/module.py DEBUG 03-07 03:06:54 [backends.py:370] xxx/torch/_dynamo/polyfills/builtins.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/attention/layer.py DEBUG 03-07 03:06:54 [backends.py:370] xxx/torch/nn/modules/container.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/distributed/communication_op.py DEBUG 03-07 03:06:54 [backends.py:370] xxx/torch/nn/modules/module.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/distributed/parallel_state.py DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/attention/layer.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/custom_op.py DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/distributed/communication_op.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/activation.py DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/distributed/parallel_state.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/layernorm.py DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/custom_op.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/linear.py DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/activation.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/rotary_embedding.py DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/layernorm.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/vocab_parallel_embedding.py DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/linear.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/models/llama.py DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/rotary_embedding.py
DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/layers/vocab_parallel_embedding.py
DEBUG 03-07 03:07:07 [backends.py:462] Computation graph saved to ~/.cache/vllm/torch_compile_cache/1517964802/rank_0_0/computation_graph.py DEBUG 03-07 03:06:54 [backends.py:370] xxx/vllm/model_executor/models/llama.py
DEBUG 03-07 03:07:07 [wrapper.py:105] Dynamo transformed code saved to ~/.cache/vllm/torch_compile_cache/1517964802/rank_0_0/transformed_code.py
``` DEBUG 03-07 03:07:07 [backends.py:462] Computation graph saved to ~/.cache/vllm/torch_compile_cache/1517964802/rank_0_0/computation_graph.py
DEBUG 03-07 03:07:07 [wrapper.py:105] Dynamo transformed code saved to ~/.cache/vllm/torch_compile_cache/1517964802/rank_0_0/transformed_code.py
```
This is about the Python code compilation, i.e. graph capture by Dynamo. It tries to trace the function with code `xxx/vllm/model_executor/models/llama.py:339`, which is the `forward` function of the model we compile. During the forward pass, there are also other functions called and inlined by Dynamo, as shown by the logs, including some PyTorch functions from `xxx/torch/nn/modules/module.py` (used by PyTorch `nn.Module`, because module attribute access will trigger a function call), some communication / attention / activation functions from vLLM. All the traced files will be considered when we decide the cache directory to use. This way, any code change in the above files will trigger compilation cache miss, and therefore recompilation. This is about the Python code compilation, i.e. graph capture by Dynamo. It tries to trace the function with code `xxx/vllm/model_executor/models/llama.py:339`, which is the `forward` function of the model we compile. During the forward pass, there are also other functions called and inlined by Dynamo, as shown by the logs, including some PyTorch functions from `xxx/torch/nn/modules/module.py` (used by PyTorch `nn.Module`, because module attribute access will trigger a function call), some communication / attention / activation functions from vLLM. All the traced files will be considered when we decide the cache directory to use. This way, any code change in the above files will trigger compilation cache miss, and therefore recompilation.
...@@ -99,28 +101,31 @@ This time, Inductor compilation is completely bypassed, and we will load from di ...@@ -99,28 +101,31 @@ This time, Inductor compilation is completely bypassed, and we will load from di
The above example just uses Inductor to compile for a general shape (i.e. symbolic shape). We can also use Inductor to compile for some of the specific shapes, for example: The above example just uses Inductor to compile for a general shape (i.e. symbolic shape). We can also use Inductor to compile for some of the specific shapes, for example:
``` ```bash
vllm serve meta-llama/Llama-3.2-1B --compilation_config '{"compile_sizes": [1, 2, 4, 8]}' vllm serve meta-llama/Llama-3.2-1B \
--compilation_config '{"compile_sizes": [1, 2, 4, 8]}'
``` ```
Then it will also compile a specific kernel just for batch size `1, 2, 4, 8`. At this time, all of the shapes in the computation graph are static and known, and we will turn on auto-tuning to tune for max performance. This can be slow when you run it for the first time, but the next time you run it, we can directly bypass the tuning and run the tuned kernel. Then it will also compile a specific kernel just for batch size `1, 2, 4, 8`. At this time, all of the shapes in the computation graph are static and known, and we will turn on auto-tuning to tune for max performance. This can be slow when you run it for the first time, but the next time you run it, we can directly bypass the tuning and run the tuned kernel.
When all the shapes are known, `torch.compile` can compare different configs, and often find some better configs to run the kernel. For example, we can see the following log: When all the shapes are known, `torch.compile` can compare different configs, and often find some better configs to run the kernel. For example, we can see the following log:
``` ??? Logs
AUTOTUNE mm(8x2048, 2048x3072)
triton_mm_4 0.0130 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2 ```
triton_mm_8 0.0134 ms 97.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 AUTOTUNE mm(8x2048, 2048x3072)
triton_mm_12 0.0148 ms 87.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4 triton_mm_4 0.0130 ms 100.0% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2
mm 0.0160 ms 81.6% triton_mm_8 0.0134 ms 97.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4
triton_mm_16 0.0165 ms 78.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8 triton_mm_12 0.0148 ms 87.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=4, num_warps=4
triton_mm_3 0.0199 ms 65.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2 mm 0.0160 ms 81.6%
triton_mm_1 0.0203 ms 64.2% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=2 triton_mm_16 0.0165 ms 78.7% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=8
triton_mm_7 0.0203 ms 64.1% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=16, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 triton_mm_3 0.0199 ms 65.4% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=2
triton_mm_2 0.0208 ms 62.5% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=16, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4 triton_mm_1 0.0203 ms 64.2% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=128, BLOCK_M=16, BLOCK_N=32, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=2, num_warps=2
triton_mm_11 0.0215 ms 60.5% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4 triton_mm_7 0.0203 ms 64.1% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=16, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
SingleProcess AUTOTUNE benchmarking takes 2.0428 seconds and 7.5727 seconds precompiling triton_mm_2 0.0208 ms 62.5% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=32, BLOCK_M=16, BLOCK_N=64, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=5, num_warps=4
``` triton_mm_11 0.0215 ms 60.5% ACC_TYPE='tl.float32', ALLOW_TF32=False, BLOCK_K=64, BLOCK_M=16, BLOCK_N=128, B_PROLOGUE_CAST_TYPE=None, EVEN_K=True, GROUP_M=8, num_stages=3, num_warps=4
SingleProcess AUTOTUNE benchmarking takes 2.0428 seconds and 7.5727 seconds precompiling
```
It means, for a matrix multiplication with shape `8x2048x3072`, `torch.compile` tries triton template with various configs, and it is much faster than the default code (which dispatches to cublas library). It means, for a matrix multiplication with shape `8x2048x3072`, `torch.compile` tries triton template with various configs, and it is much faster than the default code (which dispatches to cublas library).
...@@ -136,8 +141,9 @@ The cudagraphs are captured and managed by the compiler backend, and replayed wh ...@@ -136,8 +141,9 @@ The cudagraphs are captured and managed by the compiler backend, and replayed wh
By default, vLLM will try to determine a set of sizes to capture cudagraph. You can also override it using the config `cudagraph_capture_sizes`: By default, vLLM will try to determine a set of sizes to capture cudagraph. You can also override it using the config `cudagraph_capture_sizes`:
``` ```bash
vllm serve meta-llama/Llama-3.2-1B --compilation-config '{"cudagraph_capture_sizes": [1, 2, 4, 8]}' vllm serve meta-llama/Llama-3.2-1B \
--compilation-config '{"cudagraph_capture_sizes": [1, 2, 4, 8]}'
``` ```
Then it will only capture cudagraph for the specified sizes. It can be useful to have fine-grained control over the cudagraph capture. Then it will only capture cudagraph for the specified sizes. It can be useful to have fine-grained control over the cudagraph capture.
......
...@@ -59,23 +59,23 @@ th:not(:first-child) { ...@@ -59,23 +59,23 @@ th:not(:first-child) {
## Feature x Hardware ## Feature x Hardware
| Feature | Volta | Turing | Ampere | Ada | Hopper | CPU | AMD | | Feature | Volta | Turing | Ampere | Ada | Hopper | CPU | AMD | TPU |
|-----------------------------------------------------------|--------------------|----------|----------|-------|----------|--------------------|-------| |-----------------------------------------------------------|---------------------|-----------|-----------|--------|------------|--------------------|--------|-----|
| [CP][chunked-prefill] | [](gh-issue:2729) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | [CP][chunked-prefill] | [](gh-issue:2729) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [APC][automatic-prefix-caching] | [](gh-issue:3687) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | [APC][automatic-prefix-caching] | [](gh-issue:3687) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [LoRA][lora-adapter] | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | [LoRA][lora-adapter] | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| <abbr title="Prompt Adapter">prmpt adptr</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | [](gh-issue:8475) | ✅ | | <abbr title="Prompt Adapter">prmpt adptr</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | [](gh-issue:8475) | ✅ | ❌ |
| [SD][spec-decode] | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | [SD][spec-decode] | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | | CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ |
| <abbr title="Pooling Models">pooling</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | | <abbr title="Pooling Models">pooling</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ❌ |
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
| <abbr title="Multimodal Inputs">mm</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | <abbr title="Multimodal Inputs">mm</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| <abbr title="Async Output Processing">async output</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | | <abbr title="Async Output Processing">async output</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
| multi-step | ✅ | ✅ | ✅ | ✅ | ✅ | [](gh-issue:8477) | ✅ | | multi-step | ✅ | ✅ | ✅ | ✅ | ✅ | [](gh-issue:8477) | ✅ | ❌ |
| best-of | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | best-of | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| beam-search | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | beam-search | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
!!! note !!! note
Please refer to [Feature support through NxD Inference backend][feature-support-through-nxd-inference-backend] for features supported on AWS Neuron hardware Please refer to [Feature support through NxD Inference backend][feature-support-through-nxd-inference-backend] for features supported on AWS Neuron hardware
...@@ -29,24 +29,26 @@ We can now submit the prompts and call `llm.generate` with the `lora_request` pa ...@@ -29,24 +29,26 @@ We can now submit the prompts and call `llm.generate` with the `lora_request` pa
of `LoRARequest` is a human identifiable name, the second parameter is a globally unique ID for the adapter and of `LoRARequest` is a human identifiable name, the second parameter is a globally unique ID for the adapter and
the third parameter is the path to the LoRA adapter. the third parameter is the path to the LoRA adapter.
```python ??? Code
sampling_params = SamplingParams(
temperature=0, ```python
max_tokens=256, sampling_params = SamplingParams(
stop=["[/assistant]"] temperature=0,
) max_tokens=256,
stop=["[/assistant]"]
prompts = [ )
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", prompts = [
] "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",
outputs = llm.generate( ]
prompts,
sampling_params, outputs = llm.generate(
lora_request=LoRARequest("sql_adapter", 1, sql_lora_path) prompts,
) sampling_params,
``` lora_request=LoRARequest("sql_adapter", 1, sql_lora_path)
)
```
Check out <gh-file:examples/offline_inference/multilora_inference.py> for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options. Check out <gh-file:examples/offline_inference/multilora_inference.py> for an example of how to use LoRA adapters with the async engine and how to use more advanced configuration options.
...@@ -68,24 +70,26 @@ The server entrypoint accepts all other LoRA configuration parameters (`max_lora ...@@ -68,24 +70,26 @@ The server entrypoint accepts all other LoRA configuration parameters (`max_lora
etc.), which will apply to all forthcoming requests. Upon querying the `/models` endpoint, we should see our LoRA along etc.), which will apply to all forthcoming requests. Upon querying the `/models` endpoint, we should see our LoRA along
with its base model (if `jq` is not installed, you can follow [this guide](https://jqlang.org/download/) to install it.): with its base model (if `jq` is not installed, you can follow [this guide](https://jqlang.org/download/) to install it.):
```bash ??? Command
curl localhost:8000/v1/models | jq .
{ ```bash
"object": "list", curl localhost:8000/v1/models | jq .
"data": [ {
{ "object": "list",
"id": "meta-llama/Llama-2-7b-hf", "data": [
"object": "model", {
... "id": "meta-llama/Llama-2-7b-hf",
}, "object": "model",
{ ...
"id": "sql-lora", },
"object": "model", {
... "id": "sql-lora",
} "object": "model",
] ...
} }
``` ]
}
```
Requests can specify the LoRA adapter as if it were any other model via the `model` request parameter. The requests will be Requests can specify the LoRA adapter as if it were any other model via the `model` request parameter. The requests will be
processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other processed according to the server-wide LoRA configuration (i.e. in parallel with base model requests, and potentially other
...@@ -168,36 +172,36 @@ Alternatively, follow these example steps to implement your own plugin: ...@@ -168,36 +172,36 @@ Alternatively, follow these example steps to implement your own plugin:
1. Implement the LoRAResolver interface. 1. Implement the LoRAResolver interface.
Example of a simple S3 LoRAResolver implementation: ??? Example of a simple S3 LoRAResolver implementation
```python ```python
import os import os
import s3fs import s3fs
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.lora.resolver import LoRAResolver from vllm.lora.resolver import LoRAResolver
class S3LoRAResolver(LoRAResolver): class S3LoRAResolver(LoRAResolver):
def __init__(self): def __init__(self):
self.s3 = s3fs.S3FileSystem() self.s3 = s3fs.S3FileSystem()
self.s3_path_format = os.getenv("S3_PATH_TEMPLATE") self.s3_path_format = os.getenv("S3_PATH_TEMPLATE")
self.local_path_format = os.getenv("LOCAL_PATH_TEMPLATE") self.local_path_format = os.getenv("LOCAL_PATH_TEMPLATE")
async def resolve_lora(self, base_model_name, lora_name): async def resolve_lora(self, base_model_name, lora_name):
s3_path = self.s3_path_format.format(base_model_name=base_model_name, lora_name=lora_name) s3_path = self.s3_path_format.format(base_model_name=base_model_name, lora_name=lora_name)
local_path = self.local_path_format.format(base_model_name=base_model_name, lora_name=lora_name) local_path = self.local_path_format.format(base_model_name=base_model_name, lora_name=lora_name)
# Download the LoRA from S3 to the local path # Download the LoRA from S3 to the local path
await self.s3._get( await self.s3._get(
s3_path, local_path, recursive=True, maxdepth=1 s3_path, local_path, recursive=True, maxdepth=1
) )
lora_request = LoRARequest( lora_request = LoRARequest(
lora_name=lora_name, lora_name=lora_name,
lora_path=local_path, lora_path=local_path,
lora_int_id=abs(hash(lora_name)) lora_int_id=abs(hash(lora_name))
) )
return lora_request return lora_request
``` ```
2. Register `LoRAResolver` plugin. 2. Register `LoRAResolver` plugin.
...@@ -234,38 +238,40 @@ The new format of `--lora-modules` is mainly to support the display of parent mo ...@@ -234,38 +238,40 @@ The new format of `--lora-modules` is mainly to support the display of parent mo
- The `parent` field of LoRA model `sql-lora` now links to its base model `meta-llama/Llama-2-7b-hf`. This correctly reflects the hierarchical relationship between the base model and the LoRA adapter. - The `parent` field of LoRA model `sql-lora` now links to its base model `meta-llama/Llama-2-7b-hf`. This correctly reflects the hierarchical relationship between the base model and the LoRA adapter.
- The `root` field points to the artifact location of the LoRA adapter. - The `root` field points to the artifact location of the LoRA adapter.
```bash ??? Command output
$ curl http://localhost:8000/v1/models
```bash
{ $ curl http://localhost:8000/v1/models
"object": "list",
"data": [ {
{ "object": "list",
"id": "meta-llama/Llama-2-7b-hf", "data": [
"object": "model",
"created": 1715644056,
"owned_by": "vllm",
"root": "~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/",
"parent": null,
"permission": [
{ {
..... "id": "meta-llama/Llama-2-7b-hf",
} "object": "model",
] "created": 1715644056,
}, "owned_by": "vllm",
{ "root": "~/.cache/huggingface/hub/models--meta-llama--Llama-2-7b-hf/snapshots/01c7f73d771dfac7d292323805ebc428287df4f9/",
"id": "sql-lora", "parent": null,
"object": "model", "permission": [
"created": 1715644056, {
"owned_by": "vllm", .....
"root": "~/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/", }
"parent": "meta-llama/Llama-2-7b-hf", "permission": [
"permission": [ },
{ {
.... "id": "sql-lora",
"object": "model",
"created": 1715644056,
"owned_by": "vllm",
"root": "~/.cache/huggingface/hub/models--yard1--llama-2-7b-sql-lora-test/snapshots/0dfa347e8877a4d4ed19ee56c140fa518470028c/",
"parent": "meta-llama/Llama-2-7b-hf",
"permission": [
{
....
}
]
} }
] ]
} }
] ```
}
```
...@@ -20,112 +20,161 @@ To input multi-modal data, follow this schema in [vllm.inputs.PromptType][]: ...@@ -20,112 +20,161 @@ To input multi-modal data, follow this schema in [vllm.inputs.PromptType][]:
You can pass a single image to the `'image'` field of the multi-modal dictionary, as shown in the following examples: You can pass a single image to the `'image'` field of the multi-modal dictionary, as shown in the following examples:
```python ??? Code
from vllm import LLM
llm = LLM(model="llava-hf/llava-1.5-7b-hf") ```python
from vllm import LLM
# Refer to the HuggingFace repo for the correct format to use llm = LLM(model="llava-hf/llava-1.5-7b-hf")
prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
# Load the image using PIL.Image # Refer to the HuggingFace repo for the correct format to use
image = PIL.Image.open(...) prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
# Single prompt inference # Load the image using PIL.Image
outputs = llm.generate({ image = PIL.Image.open(...)
"prompt": prompt,
"multi_modal_data": {"image": image},
})
for o in outputs: # Single prompt inference
generated_text = o.outputs[0].text outputs = llm.generate({
print(generated_text) "prompt": prompt,
"multi_modal_data": {"image": image},
})
# Batch inference for o in outputs:
image_1 = PIL.Image.open(...) generated_text = o.outputs[0].text
image_2 = PIL.Image.open(...) print(generated_text)
outputs = llm.generate(
[
{
"prompt": "USER: <image>\nWhat is the content of this image?\nASSISTANT:",
"multi_modal_data": {"image": image_1},
},
{
"prompt": "USER: <image>\nWhat's the color of this image?\nASSISTANT:",
"multi_modal_data": {"image": image_2},
}
]
)
for o in outputs: # Batch inference
generated_text = o.outputs[0].text image_1 = PIL.Image.open(...)
print(generated_text) image_2 = PIL.Image.open(...)
``` outputs = llm.generate(
[
{
"prompt": "USER: <image>\nWhat is the content of this image?\nASSISTANT:",
"multi_modal_data": {"image": image_1},
},
{
"prompt": "USER: <image>\nWhat's the color of this image?\nASSISTANT:",
"multi_modal_data": {"image": image_2},
}
]
)
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
```
Full example: <gh-file:examples/offline_inference/vision_language.py> Full example: <gh-file:examples/offline_inference/vision_language.py>
To substitute multiple images inside the same text prompt, you can pass in a list of images instead: To substitute multiple images inside the same text prompt, you can pass in a list of images instead:
```python ??? Code
from vllm import LLM
llm = LLM( ```python
model="microsoft/Phi-3.5-vision-instruct", from vllm import LLM
trust_remote_code=True, # Required to load Phi-3.5-vision
max_model_len=4096, # Otherwise, it may not fit in smaller GPUs
limit_mm_per_prompt={"image": 2}, # The maximum number to accept
)
# Refer to the HuggingFace repo for the correct format to use llm = LLM(
prompt = "<|user|>\n<|image_1|>\n<|image_2|>\nWhat is the content of each image?<|end|>\n<|assistant|>\n" model="microsoft/Phi-3.5-vision-instruct",
trust_remote_code=True, # Required to load Phi-3.5-vision
max_model_len=4096, # Otherwise, it may not fit in smaller GPUs
limit_mm_per_prompt={"image": 2}, # The maximum number to accept
)
# Load the images using PIL.Image # Refer to the HuggingFace repo for the correct format to use
image1 = PIL.Image.open(...) prompt = "<|user|>\n<|image_1|>\n<|image_2|>\nWhat is the content of each image?<|end|>\n<|assistant|>\n"
image2 = PIL.Image.open(...)
outputs = llm.generate({ # Load the images using PIL.Image
"prompt": prompt, image1 = PIL.Image.open(...)
"multi_modal_data": { image2 = PIL.Image.open(...)
"image": [image1, image2]
},
})
for o in outputs: outputs = llm.generate({
generated_text = o.outputs[0].text "prompt": prompt,
print(generated_text) "multi_modal_data": {
``` "image": [image1, image2]
},
})
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
```
Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py> Full example: <gh-file:examples/offline_inference/vision_language_multi_image.py>
Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos: If using the [LLM.chat](https://docs.vllm.ai/en/stable/models/generative_models.html#llmchat) method, you can pass images directly in the message content using various formats: image URLs, PIL Image objects, or pre-computed embeddings:
```python ```python
from vllm import LLM from vllm import LLM
from vllm.assets.image import ImageAsset
# Specify the maximum number of frames per video to be 4. This can be changed. llm = LLM(model="llava-hf/llava-1.5-7b-hf")
llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4}) image_url = "https://picsum.photos/id/32/512/512"
image_pil = ImageAsset('cherry_blossom').pil_image
image_embeds = torch.load(...)
# Create the request payload. conversation = [
video_frames = ... # load your video making sure it only has the number of frames specified earlier. {"role": "system", "content": "You are a helpful assistant"},
message = { {"role": "user", "content": "Hello"},
"role": "user", {"role": "assistant", "content": "Hello! How can I assist you today?"},
"content": [ {
{"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."}, "role": "user",
], "content": [{
} "type": "image_url",
for i in range(len(video_frames)): "image_url": {
base64_image = encode_image(video_frames[i]) # base64 encoding. "url": image_url
new_image = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}} }
message["content"].append(new_image) },{
"type": "image_pil",
"image_pil": image_pil
}, {
"type": "image_embeds",
"image_embeds": image_embeds
}, {
"type": "text",
"text": "What's in these images?"
}],
},
]
# Perform inference and log output. # Perform inference and log output.
outputs = llm.chat([message]) outputs = llm.chat(conversation)
for o in outputs: for o in outputs:
generated_text = o.outputs[0].text generated_text = o.outputs[0].text
print(generated_text) print(generated_text)
``` ```
Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:
??? Code
```python
from vllm import LLM
# Specify the maximum number of frames per video to be 4. This can be changed.
llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})
# Create the request payload.
video_frames = ... # load your video making sure it only has the number of frames specified earlier.
message = {
"role": "user",
"content": [
{"type": "text", "text": "Describe this set of frames. Consider the frames to be a part of the same video."},
],
}
for i in range(len(video_frames)):
base64_image = encode_image(video_frames[i]) # base64 encoding.
new_image = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
message["content"].append(new_image)
# Perform inference and log output.
outputs = llm.chat([message])
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
```
### Video Inputs ### Video Inputs
You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary You can pass a list of NumPy arrays directly to the `'video'` field of the multi-modal dictionary
...@@ -144,81 +193,85 @@ Full example: <gh-file:examples/offline_inference/audio_language.py> ...@@ -144,81 +193,85 @@ Full example: <gh-file:examples/offline_inference/audio_language.py>
To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model, To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model,
pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary. pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.
```python ??? Code
from vllm import LLM
# Inference with image embeddings as input ```python
llm = LLM(model="llava-hf/llava-1.5-7b-hf") from vllm import LLM
# Refer to the HuggingFace repo for the correct format to use # Inference with image embeddings as input
prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:" llm = LLM(model="llava-hf/llava-1.5-7b-hf")
# Embeddings for single image # Refer to the HuggingFace repo for the correct format to use
# torch.Tensor of shape (1, image_feature_size, hidden_size of LM) prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
image_embeds = torch.load(...)
outputs = llm.generate({ # Embeddings for single image
"prompt": prompt, # torch.Tensor of shape (1, image_feature_size, hidden_size of LM)
"multi_modal_data": {"image": image_embeds}, image_embeds = torch.load(...)
})
for o in outputs: outputs = llm.generate({
generated_text = o.outputs[0].text "prompt": prompt,
print(generated_text) "multi_modal_data": {"image": image_embeds},
``` })
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
```
For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embeddings: For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embeddings:
```python ??? Code
# Construct the prompt based on your model
prompt = ...
# Embeddings for multiple images ```python
# torch.Tensor of shape (num_images, image_feature_size, hidden_size of LM) # Construct the prompt based on your model
image_embeds = torch.load(...) prompt = ...
# Embeddings for multiple images
# torch.Tensor of shape (num_images, image_feature_size, hidden_size of LM)
image_embeds = torch.load(...)
# Qwen2-VL # Qwen2-VL
llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4}) llm = LLM("Qwen/Qwen2-VL-2B-Instruct", limit_mm_per_prompt={"image": 4})
mm_data = { mm_data = {
"image": { "image": {
"image_embeds": image_embeds, "image_embeds": image_embeds,
# image_grid_thw is needed to calculate positional encoding. # image_grid_thw is needed to calculate positional encoding.
"image_grid_thw": torch.load(...), # torch.Tensor of shape (1, 3), "image_grid_thw": torch.load(...), # torch.Tensor of shape (1, 3),
}
} }
}
# MiniCPM-V
# MiniCPM-V llm = LLM("openbmb/MiniCPM-V-2_6", trust_remote_code=True, limit_mm_per_prompt={"image": 4})
llm = LLM("openbmb/MiniCPM-V-2_6", trust_remote_code=True, limit_mm_per_prompt={"image": 4}) mm_data = {
mm_data = { "image": {
"image": { "image_embeds": image_embeds,
"image_embeds": image_embeds, # image_sizes is needed to calculate details of the sliced image.
# image_sizes is needed to calculate details of the sliced image. "image_sizes": [image.size for image in images], # list of image sizes
"image_sizes": [image.size for image in images], # list of image sizes }
} }
}
outputs = llm.generate({ outputs = llm.generate({
"prompt": prompt, "prompt": prompt,
"multi_modal_data": mm_data, "multi_modal_data": mm_data,
}) })
for o in outputs: for o in outputs:
generated_text = o.outputs[0].text generated_text = o.outputs[0].text
print(generated_text) print(generated_text)
``` ```
## Online Serving ## Online Serving
Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat). Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions API](https://platform.openai.com/docs/api-reference/chat).
!!! warning !!! important
A chat template is **required** to use the Chat Completions API. A chat template is **required** to use the Chat Completions API.
For HF format models, the default chat template is defined inside `chat_template.json` or `tokenizer_config.json`. For HF format models, the default chat template is defined inside `chat_template.json` or `tokenizer_config.json`.
If no default chat template is available, we will first look for a built-in fallback in <gh-file:vllm/transformers_utils/chat_templates/registry.py>. If no default chat template is available, we will first look for a built-in fallback in <gh-file:vllm/transformers_utils/chat_templates/registry.py>.
If no fallback is available, an error is raised and you have to provide the chat template manually via the `--chat-template` argument. If no fallback is available, an error is raised and you have to provide the chat template manually via the `--chat-template` argument.
For certain models, we provide alternative chat templates inside <gh-dir:vllm/examples>. For certain models, we provide alternative chat templates inside <gh-dir:examples>.
For example, VLM2Vec uses <gh-file:examples/template_vlm2vec.jinja> which is different from the default one for Phi-3-Vision. For example, VLM2Vec uses <gh-file:examples/template_vlm2vec.jinja> which is different from the default one for Phi-3-Vision.
### Image Inputs ### Image Inputs
...@@ -235,51 +288,53 @@ vllm serve microsoft/Phi-3.5-vision-instruct --task generate \ ...@@ -235,51 +288,53 @@ vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
Then, you can use the OpenAI client as follows: Then, you can use the OpenAI client as follows:
```python ??? Code
from openai import OpenAI
```python
openai_api_key = "EMPTY" from openai import OpenAI
openai_api_base = "http://localhost:8000/v1"
openai_api_key = "EMPTY"
client = OpenAI( openai_api_base = "http://localhost:8000/v1"
api_key=openai_api_key,
base_url=openai_api_base, client = OpenAI(
) api_key=openai_api_key,
base_url=openai_api_base,
# Single-image input inference )
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
# Single-image input inference
chat_response = client.chat.completions.create( image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
model="microsoft/Phi-3.5-vision-instruct",
messages=[{ chat_response = client.chat.completions.create(
"role": "user", model="microsoft/Phi-3.5-vision-instruct",
"content": [ messages=[{
# NOTE: The prompt formatting with the image token `<image>` is not needed "role": "user",
# since the prompt will be processed automatically by the API server. "content": [
{"type": "text", "text": "What’s in this image?"}, # NOTE: The prompt formatting with the image token `<image>` is not needed
{"type": "image_url", "image_url": {"url": image_url}}, # since the prompt will be processed automatically by the API server.
], {"type": "text", "text": "What’s in this image?"},
}], {"type": "image_url", "image_url": {"url": image_url}},
) ],
print("Chat completion output:", chat_response.choices[0].message.content) }],
)
# Multi-image input inference print("Chat completion output:", chat_response.choices[0].message.content)
image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg" # Multi-image input inference
image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
chat_response = client.chat.completions.create( image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
model="microsoft/Phi-3.5-vision-instruct",
messages=[{ chat_response = client.chat.completions.create(
"role": "user", model="microsoft/Phi-3.5-vision-instruct",
"content": [ messages=[{
{"type": "text", "text": "What are the animals in these images?"}, "role": "user",
{"type": "image_url", "image_url": {"url": image_url_duck}}, "content": [
{"type": "image_url", "image_url": {"url": image_url_lion}}, {"type": "text", "text": "What are the animals in these images?"},
], {"type": "image_url", "image_url": {"url": image_url_duck}},
}], {"type": "image_url", "image_url": {"url": image_url_lion}},
) ],
print("Chat completion output:", chat_response.choices[0].message.content) }],
``` )
print("Chat completion output:", chat_response.choices[0].message.content)
```
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py> Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
...@@ -295,7 +350,7 @@ Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for ...@@ -295,7 +350,7 @@ Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for
By default, the timeout for fetching images through HTTP URL is `5` seconds. By default, the timeout for fetching images through HTTP URL is `5` seconds.
You can override this by setting the environment variable: You can override this by setting the environment variable:
```console ```bash
export VLLM_IMAGE_FETCH_TIMEOUT=<timeout> export VLLM_IMAGE_FETCH_TIMEOUT=<timeout>
``` ```
...@@ -311,44 +366,46 @@ vllm serve llava-hf/llava-onevision-qwen2-0.5b-ov-hf --task generate --max-model ...@@ -311,44 +366,46 @@ vllm serve llava-hf/llava-onevision-qwen2-0.5b-ov-hf --task generate --max-model
Then, you can use the OpenAI client as follows: Then, you can use the OpenAI client as follows:
```python ??? Code
from openai import OpenAI
openai_api_key = "EMPTY" ```python
openai_api_base = "http://localhost:8000/v1" from openai import OpenAI
client = OpenAI( openai_api_key = "EMPTY"
api_key=openai_api_key, openai_api_base = "http://localhost:8000/v1"
base_url=openai_api_base,
)
video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4" client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
## Use video url in the payload video_url = "http://commondatastorage.googleapis.com/gtv-videos-bucket/sample/ForBiggerFun.mp4"
chat_completion_from_url = client.chat.completions.create(
messages=[{
"role":
"user",
"content": [
{
"type": "text",
"text": "What's in this video?"
},
{
"type": "video_url",
"video_url": {
"url": video_url
},
},
],
}],
model=model,
max_completion_tokens=64,
)
result = chat_completion_from_url.choices[0].message.content ## Use video url in the payload
print("Chat completion output from video url:", result) chat_completion_from_url = client.chat.completions.create(
``` messages=[{
"role":
"user",
"content": [
{
"type": "text",
"text": "What's in this video?"
},
{
"type": "video_url",
"video_url": {
"url": video_url
},
},
],
}],
model=model,
max_completion_tokens=64,
)
result = chat_completion_from_url.choices[0].message.content
print("Chat completion output from video url:", result)
```
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py> Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
...@@ -356,7 +413,7 @@ Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for ...@@ -356,7 +413,7 @@ Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for
By default, the timeout for fetching videos through HTTP URL is `30` seconds. By default, the timeout for fetching videos through HTTP URL is `30` seconds.
You can override this by setting the environment variable: You can override this by setting the environment variable:
```console ```bash
export VLLM_VIDEO_FETCH_TIMEOUT=<timeout> export VLLM_VIDEO_FETCH_TIMEOUT=<timeout>
``` ```
...@@ -373,84 +430,88 @@ vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b ...@@ -373,84 +430,88 @@ vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b
Then, you can use the OpenAI client as follows: Then, you can use the OpenAI client as follows:
```python ??? Code
import base64
import requests
from openai import OpenAI
from vllm.assets.audio import AudioAsset
def encode_base64_content_from_url(content_url: str) -> str: ```python
"""Encode a content retrieved from a remote url to base64 format.""" import base64
import requests
from openai import OpenAI
from vllm.assets.audio import AudioAsset
with requests.get(content_url) as response: def encode_base64_content_from_url(content_url: str) -> str:
response.raise_for_status() """Encode a content retrieved from a remote url to base64 format."""
result = base64.b64encode(response.content).decode('utf-8')
return result with requests.get(content_url) as response:
response.raise_for_status()
result = base64.b64encode(response.content).decode('utf-8')
openai_api_key = "EMPTY" return result
openai_api_base = "http://localhost:8000/v1"
client = OpenAI( openai_api_key = "EMPTY"
api_key=openai_api_key, openai_api_base = "http://localhost:8000/v1"
base_url=openai_api_base,
)
# Any format supported by librosa is supported client = OpenAI(
audio_url = AudioAsset("winning_call").url api_key=openai_api_key,
audio_base64 = encode_base64_content_from_url(audio_url) base_url=openai_api_base,
)
chat_completion_from_base64 = client.chat.completions.create( # Any format supported by librosa is supported
messages=[{ audio_url = AudioAsset("winning_call").url
"role": "user", audio_base64 = encode_base64_content_from_url(audio_url)
"content": [
{
"type": "text",
"text": "What's in this audio?"
},
{
"type": "input_audio",
"input_audio": {
"data": audio_base64,
"format": "wav"
},
},
],
}],
model=model,
max_completion_tokens=64,
)
result = chat_completion_from_base64.choices[0].message.content chat_completion_from_base64 = client.chat.completions.create(
print("Chat completion output from input audio:", result) messages=[{
``` "role": "user",
"content": [
{
"type": "text",
"text": "What's in this audio?"
},
{
"type": "input_audio",
"input_audio": {
"data": audio_base64,
"format": "wav"
},
},
],
}],
model=model,
max_completion_tokens=64,
)
result = chat_completion_from_base64.choices[0].message.content
print("Chat completion output from input audio:", result)
```
Alternatively, you can pass `audio_url`, which is the audio counterpart of `image_url` for image input: Alternatively, you can pass `audio_url`, which is the audio counterpart of `image_url` for image input:
```python ??? Code
chat_completion_from_url = client.chat.completions.create(
messages=[{
"role": "user",
"content": [
{
"type": "text",
"text": "What's in this audio?"
},
{
"type": "audio_url",
"audio_url": {
"url": audio_url
},
},
],
}],
model=model,
max_completion_tokens=64,
)
result = chat_completion_from_url.choices[0].message.content ```python
print("Chat completion output from audio url:", result) chat_completion_from_url = client.chat.completions.create(
``` messages=[{
"role": "user",
"content": [
{
"type": "text",
"text": "What's in this audio?"
},
{
"type": "audio_url",
"audio_url": {
"url": audio_url
},
},
],
}],
model=model,
max_completion_tokens=64,
)
result = chat_completion_from_url.choices[0].message.content
print("Chat completion output from audio url:", result)
```
Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py> Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for_multimodal.py>
...@@ -458,7 +519,7 @@ Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for ...@@ -458,7 +519,7 @@ Full example: <gh-file:examples/online_serving/openai_chat_completion_client_for
By default, the timeout for fetching audio files through HTTP URL is `10` seconds. By default, the timeout for fetching audio files through HTTP URL is `10` seconds.
You can override this by setting the environment variable: You can override this by setting the environment variable:
```console ```bash
export VLLM_AUDIO_FETCH_TIMEOUT=<timeout> export VLLM_AUDIO_FETCH_TIMEOUT=<timeout>
``` ```
...@@ -470,61 +531,63 @@ pass a tensor of shape to the corresponding field of the multi-modal dictionary. ...@@ -470,61 +531,63 @@ pass a tensor of shape to the corresponding field of the multi-modal dictionary.
For image embeddings, you can pass the base64-encoded tensor to the `image_embeds` field. For image embeddings, you can pass the base64-encoded tensor to the `image_embeds` field.
The following example demonstrates how to pass image embeddings to the OpenAI server: The following example demonstrates how to pass image embeddings to the OpenAI server:
```python ??? Code
image_embedding = torch.load(...)
grid_thw = torch.load(...) # Required by Qwen/Qwen2-VL-2B-Instruct ```python
image_embedding = torch.load(...)
buffer = io.BytesIO() grid_thw = torch.load(...) # Required by Qwen/Qwen2-VL-2B-Instruct
torch.save(image_embedding, buffer)
buffer.seek(0) buffer = io.BytesIO()
binary_data = buffer.read() torch.save(image_embedding, buffer)
base64_image_embedding = base64.b64encode(binary_data).decode('utf-8') buffer.seek(0)
binary_data = buffer.read()
client = OpenAI( base64_image_embedding = base64.b64encode(binary_data).decode('utf-8')
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key, client = OpenAI(
base_url=openai_api_base, # defaults to os.environ.get("OPENAI_API_KEY")
) api_key=openai_api_key,
base_url=openai_api_base,
# Basic usage - this is equivalent to the LLaVA example for offline inference )
model = "llava-hf/llava-1.5-7b-hf"
embeds = { # Basic usage - this is equivalent to the LLaVA example for offline inference
"type": "image_embeds", model = "llava-hf/llava-1.5-7b-hf"
"image_embeds": f"{base64_image_embedding}" embeds = {
} "type": "image_embeds",
"image_embeds": f"{base64_image_embedding}"
# Pass additional parameters (available to Qwen2-VL and MiniCPM-V) }
model = "Qwen/Qwen2-VL-2B-Instruct"
embeds = { # Pass additional parameters (available to Qwen2-VL and MiniCPM-V)
"type": "image_embeds", model = "Qwen/Qwen2-VL-2B-Instruct"
"image_embeds": { embeds = {
"image_embeds": f"{base64_image_embedding}" , # Required "type": "image_embeds",
"image_grid_thw": f"{base64_image_grid_thw}" # Required by Qwen/Qwen2-VL-2B-Instruct "image_embeds": {
}, "image_embeds": f"{base64_image_embedding}" , # Required
} "image_grid_thw": f"{base64_image_grid_thw}" # Required by Qwen/Qwen2-VL-2B-Instruct
model = "openbmb/MiniCPM-V-2_6"
embeds = {
"type": "image_embeds",
"image_embeds": {
"image_embeds": f"{base64_image_embedding}" , # Required
"image_sizes": f"{base64_image_sizes}" # Required by openbmb/MiniCPM-V-2_6
},
}
chat_completion = client.chat.completions.create(
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": [
{
"type": "text",
"text": "What's in this image?",
}, },
embeds, }
], model = "openbmb/MiniCPM-V-2_6"
}, embeds = {
], "type": "image_embeds",
model=model, "image_embeds": {
) "image_embeds": f"{base64_image_embedding}" , # Required
``` "image_sizes": f"{base64_image_sizes}" # Required by openbmb/MiniCPM-V-2_6
},
}
chat_completion = client.chat.completions.create(
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": [
{
"type": "text",
"text": "What's in this image?",
},
embeds,
],
},
],
model=model,
)
```
!!! note !!! note
Only one message can contain `{"type": "image_embeds"}`. Only one message can contain `{"type": "image_embeds"}`.
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment