Commit e00b0a19 authored by zhuwenwen's avatar zhuwenwen
Browse files

merge v0.3.3

parents ead94d93 3f1166ab
"""
This example shows how to use the multi-LoRA functionality for offline inference.
Requires HuggingFace credentials for access to Llama2.
"""
from typing import Optional, List, Tuple
from huggingface_hub import snapshot_download
from vllm import EngineArgs, LLMEngine, SamplingParams, RequestOutput
from vllm.lora.request import LoRARequest
def create_test_prompts(
lora_path: str
) -> List[Tuple[str, SamplingParams, Optional[LoRARequest]]]:
"""Create a list of test prompts with their sampling parameters.
2 requests for base model, 4 requests for the LoRA. We define 2
different LoRA adapters (using the same model for demo purposes).
Since we also set `max_loras=1`, the expectation is that the requests
with the second LoRA adapter will be ran after all requests with the
first adapter have finished.
"""
return [
("A robot may not injure a human being",
SamplingParams(temperature=0.0,
logprobs=1,
prompt_logprobs=1,
max_tokens=128), None),
("To be or not to be,",
SamplingParams(temperature=0.8,
top_k=5,
presence_penalty=0.2,
max_tokens=128), None),
("[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",
SamplingParams(temperature=0.0,
logprobs=1,
prompt_logprobs=1,
max_tokens=128,
stop_token_ids=[32003]),
LoRARequest("sql-lora", 1, lora_path)),
("[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",
SamplingParams(n=3,
best_of=3,
use_beam_search=True,
temperature=0,
max_tokens=128,
stop_token_ids=[32003]),
LoRARequest("sql-lora", 1, lora_path)),
("[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",
SamplingParams(temperature=0.0,
logprobs=1,
prompt_logprobs=1,
max_tokens=128,
stop_token_ids=[32003]),
LoRARequest("sql-lora2", 2, lora_path)),
("[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",
SamplingParams(n=3,
best_of=3,
use_beam_search=True,
temperature=0,
max_tokens=128,
stop_token_ids=[32003]),
LoRARequest("sql-lora", 1, lora_path)),
]
def process_requests(engine: LLMEngine,
test_prompts: List[Tuple[str, SamplingParams,
Optional[LoRARequest]]]):
"""Continuously process a list of prompts and handle the outputs."""
request_id = 0
while test_prompts or engine.has_unfinished_requests():
if test_prompts:
prompt, sampling_params, lora_request = test_prompts.pop(0)
engine.add_request(str(request_id),
prompt,
sampling_params,
lora_request=lora_request)
request_id += 1
request_outputs: List[RequestOutput] = engine.step()
for request_output in request_outputs:
if request_output.finished:
print(request_output)
def initialize_engine() -> LLMEngine:
"""Initialize the LLMEngine."""
# max_loras: controls the number of LoRAs that can be used in the same
# batch. Larger numbers will cause higher memory usage, as each LoRA
# slot requires its own preallocated tensor.
# max_lora_rank: controls the maximum supported rank of all LoRAs. Larger
# numbers will cause higher memory usage. If you know that all LoRAs will
# use the same rank, it is recommended to set this as low as possible.
# max_cpu_loras: controls the size of the CPU LoRA cache.
engine_args = EngineArgs(model="meta-llama/Llama-2-7b-hf",
enable_lora=True,
max_loras=1,
max_lora_rank=8,
max_cpu_loras=2,
max_num_seqs=256)
return LLMEngine.from_engine_args(engine_args)
def main():
"""Main function that sets up and runs the prompt processing."""
engine = initialize_engine()
lora_path = snapshot_download(repo_id="yard1/llama-2-7b-sql-lora-test")
test_prompts = create_test_prompts(lora_path)
process_requests(engine, test_prompts)
if __name__ == '__main__':
main()
"""
This example shows how to use Ray Data for running offline batch inference
distributively on a multi-nodes cluster.
Learn more about Ray Data in https://docs.ray.io/en/latest/data/data.html
"""
from vllm import LLM, SamplingParams
from typing import Dict
import numpy as np
import ray
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create a class to do batch inference.
class LLMPredictor:
def __init__(self):
# Create an LLM.
self.llm = LLM(model="meta-llama/Llama-2-7b-chat-hf")
def __call__(self, batch: Dict[str, np.ndarray]) -> Dict[str, list]:
# Generate texts from the prompts.
# The output is a list of RequestOutput objects that contain the prompt,
# generated text, and other information.
outputs = self.llm.generate(batch["text"], sampling_params)
prompt = []
generated_text = []
for output in outputs:
prompt.append(output.prompt)
generated_text.append(' '.join([o.text for o in output.outputs]))
return {
"prompt": prompt,
"generated_text": generated_text,
}
# Read one text file from S3. Ray Data supports reading multiple files
# from cloud storage (such as JSONL, Parquet, CSV, binary format).
ds = ray.data.read_text("s3://anonymous@air-example-data/prompts.txt")
# Apply batch inference for all input data.
ds = ds.map_batches(
LLMPredictor,
# Set the concurrency to the number of LLM instances.
concurrency=10,
# Specify the number of GPUs required per LLM instance.
# NOTE: Do NOT set `num_gpus` when using vLLM with tensor-parallelism
# (i.e., `tensor_parallel_size`).
num_gpus=1,
# Specify the batch size for inference.
batch_size=32,
)
# Peek first 10 results.
# NOTE: This is for local testing and debugging. For production use case,
# one should write full result out as shown below.
outputs = ds.take(limit=10)
for output in outputs:
prompt = output["prompt"]
generated_text = output["generated_text"]
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
# Write inference output data out as Parquet files to S3.
# Multiple files would be written to the output destination,
# and each task would write one or more files separately.
#
# ds.write_parquet("s3://<your-output-bucket>")
from vllm import LLM, SamplingParams
# Sample prompts.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create an LLM.
llm = LLM(
model="openlm-research/open_llama_3b",
max_num_seqs=8,
# The max_model_len and block_size arguments are required to be same as max sequence length,
# when targeting neuron device. Currently, this is a known limitation in continuous batching
# support in transformers-neuronx.
# TODO(liangfu): Support paged-attention in transformers-neuronx.
max_model_len=128,
block_size=128,
# The device can be automatically detected when AWS Neuron SDK is installed.
# The device argument can be either unspecified for automated detection, or explicitly assigned.
device="neuron")
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
from vllm import LLM, SamplingParams
prefix = (
"You are an expert school principal, skilled in effectively managing "
"faculty and staff. Draft 10-15 questions for a potential first grade "
"Head Teacher for my K-12, all-girls', independent school that emphasizes "
"community, joyful discovery, and life-long learning. The candidate is "
"coming in for a first-round panel interview for a 8th grade Math "
"teaching role. They have 5 years of previous teaching experience "
"as an assistant teacher at a co-ed, public school with experience "
"in middle school math teaching. Based on these information, fulfill "
"the following paragraph: ")
# Sample prompts.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.0)
# Create an LLM.
llm = LLM(model="facebook/opt-125m")
generating_prompts = [prefix + prompt for prompt in prompts]
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(generating_prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
print("-" * 80)
# -1 since the last token can change when concatenating prompts.
prefix_pos = len(llm.llm_engine.tokenizer.encode(prefix)) - 1
# The llm.generate call will batch all prompts and send the batch at once if resources allow.
# The prefix will only be cached after the first batch is processed, so we need to call generate once
# to calculate the prefix and cache it.
outputs = llm.generate(generating_prompts[0],
sampling_params,
prefix_pos=[prefix_pos])
# Subsequent batches can leverage the cached prefix
outputs = llm.generate(generating_prompts,
sampling_params,
prefix_pos=[prefix_pos] * len(generating_prompts))
# Print the outputs. You should see the same outputs as before
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
...@@ -32,6 +32,5 @@ chat_completion = client.chat.completions.create( ...@@ -32,6 +32,5 @@ chat_completion = client.chat.completions.create(
model=model, model=model,
) )
print("Chat completion results:") print("Chat completion results:")
print(chat_completion) print(chat_completion)
...@@ -21,8 +21,7 @@ completion = client.completions.create( ...@@ -21,8 +21,7 @@ completion = client.completions.create(
echo=False, echo=False,
n=2, n=2,
stream=stream, stream=stream,
logprobs=3 logprobs=3)
)
print("Completion results:") print("Completion results:")
if stream: if stream:
......
# vLLM + Prometheus/Grafana
This is a simple example that shows you how to connect vLLM metric logging to the Prometheus/Grafana stack. For this example, we launch Prometheus and Grafana via Docker. You can checkout other methods through [Prometheus](https://prometheus.io/) and [Grafana](https://grafana.com/) websites.
Install:
- [`docker`](https://docs.docker.com/engine/install/)
- [`docker compose`](https://docs.docker.com/compose/install/linux/#install-using-the-repository)
### Launch
Prometheus metric logging is enabled by default in the OpenAI-compatible server. Launch via the entrypoint:
```bash
python3 -m vllm.entrypoints.openai.api_server \
--model mistralai/Mistral-7B-v0.1 \
--max-model-len 2048 \
--disable-log-requests
```
Launch Prometheus and Grafana servers with `docker compose`:
```bash
docker compose up
```
Submit some sample requests to the server:
```bash
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
python3 ../../benchmarks/benchmark_serving.py \
--model mistralai/Mistral-7B-v0.1 \
--tokenizer mistralai/Mistral-7B-v0.1 \
--endpoint /v1/completions \
--dataset ShareGPT_V3_unfiltered_cleaned_split.json \
--request-rate 3.0
```
Navigating to [`http://localhost:8000/metrics`](http://localhost:8000/metrics) will show the raw Prometheus metrics being exposed by vLLM.
### Grafana Dashboard
Navigate to [`http://localhost:3000`](http://localhost:3000). Log in with the default username (`admin`) and password (`admin`).
#### Add Prometheus Data Source
Navigate to [`http://localhost:3000/connections/datasources/new`](http://localhost:3000/connections/datasources/new) and select Prometheus.
On Prometheus configuration page, we need to add the `Prometheus Server URL` in `Connection`. For this setup, Grafana and Prometheus are running in separate containers, but Docker creates DNS name for each containers. You can just use `http://prometheus:9090`.
Click `Save & Test`. You should get a green check saying "Successfully queried the Prometheus API.".
#### Import Dashboard
Navigate to [`http://localhost:3000/dashboard/import`](http://localhost:3000/dashboard/import), upload `grafana.json`, and select the `prometheus` datasource. You should see a screen that looks like the following:
![Grafana Dashboard Image](https://i.imgur.com/R2vH9VW.png)
# docker-compose.yaml
version: "3"
services:
prometheus:
image: prom/prometheus:latest
extra_hosts:
- "host.docker.internal:host-gateway" # allow a direct connection from container to the local machine
ports:
- "9090:9090" # the default port used by Prometheus
volumes:
- ${PWD}/prometheus.yaml:/etc/prometheus/prometheus.yml # mount Prometheus config file
grafana:
image: grafana/grafana:latest
depends_on:
- prometheus
ports:
- "3000:3000" # the default port used by Grafana
{
"__inputs": [
{
"name": "DS_PROMETHEUS",
"label": "prometheus",
"description": "",
"type": "datasource",
"pluginId": "prometheus",
"pluginName": "Prometheus"
}
],
"__elements": {},
"__requires": [
{
"type": "grafana",
"id": "grafana",
"name": "Grafana",
"version": "10.2.3"
},
{
"type": "datasource",
"id": "prometheus",
"name": "Prometheus",
"version": "1.0.0"
},
{
"type": "panel",
"id": "timeseries",
"name": "Time series",
"version": ""
}
],
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "Monitoring vLLM Inference Server",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [],
"liveNow": false,
"panels": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "End to end request latency measured in seconds.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 0
},
"id": 9,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))",
"fullMetaSearch": false,
"includeNullMetadata": false,
"instant": false,
"legendFormat": "P99",
"range": true,
"refId": "A",
"useBackend": false
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))",
"fullMetaSearch": false,
"hide": false,
"includeNullMetadata": false,
"instant": false,
"legendFormat": "P95",
"range": true,
"refId": "B",
"useBackend": false
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))",
"fullMetaSearch": false,
"hide": false,
"includeNullMetadata": false,
"instant": false,
"legendFormat": "P90",
"range": true,
"refId": "C",
"useBackend": false
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:e2e_request_latency_seconds_bucket[$__rate_interval])))",
"fullMetaSearch": false,
"hide": false,
"includeNullMetadata": false,
"instant": false,
"legendFormat": "P50",
"range": true,
"refId": "D",
"useBackend": false
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "rate(vllm:e2e_request_latency_seconds_sum[$__rate_interval])\n/\nrate(vllm:e2e_request_latency_seconds_count[$__rate_interval])",
"hide": false,
"instant": false,
"legendFormat": "Average",
"range": true,
"refId": "E"
}
],
"title": "E2E Request Latency",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Number of tokens processed per second",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 0
},
"id": 8,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "rate(vllm:prompt_tokens_total[$__rate_interval])",
"fullMetaSearch": false,
"includeNullMetadata": false,
"instant": false,
"legendFormat": "Prompt Tokens/Sec",
"range": true,
"refId": "A",
"useBackend": false
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "rate(vllm:generation_tokens_total[$__rate_interval])",
"fullMetaSearch": false,
"hide": false,
"includeNullMetadata": false,
"instant": false,
"legendFormat": "Generation Tokens/Sec",
"range": true,
"refId": "B",
"useBackend": false
}
],
"title": "Token Throughput",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Inter token latency in seconds.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 8
},
"id": 10,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))",
"fullMetaSearch": false,
"includeNullMetadata": false,
"instant": false,
"legendFormat": "P99",
"range": true,
"refId": "A",
"useBackend": false
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))",
"fullMetaSearch": false,
"hide": false,
"includeNullMetadata": false,
"instant": false,
"legendFormat": "P95",
"range": true,
"refId": "B",
"useBackend": false
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))",
"fullMetaSearch": false,
"hide": false,
"includeNullMetadata": false,
"instant": false,
"legendFormat": "P90",
"range": true,
"refId": "C",
"useBackend": false
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_per_output_token_seconds_bucket[$__rate_interval])))",
"fullMetaSearch": false,
"hide": false,
"includeNullMetadata": false,
"instant": false,
"legendFormat": "P50",
"range": true,
"refId": "D",
"useBackend": false
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "rate(vllm:time_per_output_token_seconds_sum[$__rate_interval])\n/\nrate(vllm:time_per_output_token_seconds_count[$__rate_interval])",
"hide": false,
"instant": false,
"legendFormat": "Mean",
"range": true,
"refId": "E"
}
],
"title": "Time Per Output Token Latency",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Number of requests in RUNNING, WAITING, and SWAPPED state",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 8
},
"id": 3,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "vllm:num_requests_running",
"fullMetaSearch": false,
"includeNullMetadata": true,
"instant": false,
"legendFormat": "Num Running",
"range": true,
"refId": "A",
"useBackend": false
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "vllm:num_requests_swapped",
"fullMetaSearch": false,
"hide": false,
"includeNullMetadata": true,
"instant": false,
"legendFormat": "Num Swapped",
"range": true,
"refId": "B",
"useBackend": false
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "vllm:num_requests_waiting",
"fullMetaSearch": false,
"hide": false,
"includeNullMetadata": true,
"instant": false,
"legendFormat": "Num Waiting",
"range": true,
"refId": "C",
"useBackend": false
}
],
"title": "Scheduler State",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "P50, P90, P95, and P99 TTFT latency in seconds.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "s"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 16
},
"id": 5,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "histogram_quantile(0.99, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))",
"fullMetaSearch": false,
"hide": false,
"includeNullMetadata": false,
"instant": false,
"legendFormat": "P99",
"range": true,
"refId": "A",
"useBackend": false
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "histogram_quantile(0.95, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))",
"fullMetaSearch": false,
"includeNullMetadata": false,
"instant": false,
"legendFormat": "P95",
"range": true,
"refId": "B",
"useBackend": false
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "histogram_quantile(0.9, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))",
"fullMetaSearch": false,
"hide": false,
"includeNullMetadata": false,
"instant": false,
"legendFormat": "P90",
"range": true,
"refId": "C",
"useBackend": false
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "histogram_quantile(0.5, sum by(le) (rate(vllm:time_to_first_token_seconds_bucket[$__rate_interval])))",
"fullMetaSearch": false,
"hide": false,
"includeNullMetadata": false,
"instant": false,
"legendFormat": "P50",
"range": true,
"refId": "D",
"useBackend": false
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "rate(vllm:time_to_first_token_seconds_sum[$__rate_interval])\n/\nrate(vllm:time_to_first_token_seconds_count[$__rate_interval])",
"hide": false,
"instant": false,
"legendFormat": "Average",
"range": true,
"refId": "E"
}
],
"title": "Time To First Token Latency",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"description": "Percentage of used cache blocks by vLLM.",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "percentunit"
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 16
},
"id": 4,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": true
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "vllm:gpu_cache_usage_perc",
"instant": false,
"legendFormat": "GPU Cache Usage",
"range": true,
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "${DS_PROMETHEUS}"
},
"editorMode": "code",
"expr": "vllm:cpu_cache_usage_perc",
"hide": false,
"instant": false,
"legendFormat": "CPU Cache Usage",
"range": true,
"refId": "B"
}
],
"title": "Cache Utilization",
"type": "timeseries"
}
],
"refresh": "",
"schemaVersion": 39,
"tags": [],
"templating": {
"list": []
},
"time": {
"from": "now-5m",
"to": "now"
},
"timepicker": {},
"timezone": "",
"title": "vLLM",
"uid": "b281712d-8bff-41ef-9f3f-71ad43c05e9b",
"version": 2,
"weekStart": ""
}
# prometheus.yaml
global:
scrape_interval: 5s
evaluation_interval: 30s
scrape_configs:
- job_name: vllm
static_configs:
- targets:
- 'host.docker.internal:8000'
...@@ -24,6 +24,7 @@ builtin cd "$ROOT" || exit 1 ...@@ -24,6 +24,7 @@ builtin cd "$ROOT" || exit 1
YAPF_VERSION=$(yapf --version | awk '{print $2}') YAPF_VERSION=$(yapf --version | awk '{print $2}')
RUFF_VERSION=$(ruff --version | awk '{print $2}') RUFF_VERSION=$(ruff --version | awk '{print $2}')
MYPY_VERSION=$(mypy --version | awk '{print $2}') MYPY_VERSION=$(mypy --version | awk '{print $2}')
CODESPELL_VERSION=$(codespell --version)
# # params: tool name, tool version, required version # # params: tool name, tool version, required version
tool_version_check() { tool_version_check() {
...@@ -36,6 +37,7 @@ tool_version_check() { ...@@ -36,6 +37,7 @@ tool_version_check() {
tool_version_check "yapf" $YAPF_VERSION "$(grep yapf requirements-dev.txt | cut -d'=' -f3)" tool_version_check "yapf" $YAPF_VERSION "$(grep yapf requirements-dev.txt | cut -d'=' -f3)"
tool_version_check "ruff" $RUFF_VERSION "$(grep "ruff==" requirements-dev.txt | cut -d'=' -f3)" tool_version_check "ruff" $RUFF_VERSION "$(grep "ruff==" requirements-dev.txt | cut -d'=' -f3)"
tool_version_check "mypy" "$MYPY_VERSION" "$(grep mypy requirements-dev.txt | cut -d'=' -f3)" tool_version_check "mypy" "$MYPY_VERSION" "$(grep mypy requirements-dev.txt | cut -d'=' -f3)"
tool_version_check "codespell" "$CODESPELL_VERSION" "$(grep codespell requirements-dev.txt | cut -d'=' -f3)"
YAPF_FLAGS=( YAPF_FLAGS=(
'--recursive' '--recursive'
...@@ -71,7 +73,7 @@ format_changed() { ...@@ -71,7 +73,7 @@ format_changed() {
# Format all files # Format all files
format_all() { format_all() {
yapf --in-place "${YAPF_FLAGS[@]}" "${YAPF_EXCLUDES[@]}" vllm tests yapf --in-place "${YAPF_FLAGS[@]}" "${YAPF_EXCLUDES[@]}" .
} }
## This flag formats individual files. --files *must* be the first command line ## This flag formats individual files. --files *must* be the first command line
...@@ -93,6 +95,47 @@ echo 'vLLM yapf: Done' ...@@ -93,6 +95,47 @@ echo 'vLLM yapf: Done'
# echo 'vLLM mypy:' # echo 'vLLM mypy:'
# mypy # mypy
# check spelling of specified files
spell_check() {
codespell "$@"
}
spell_check_all(){
codespell --toml pyproject.toml
}
# Spelling check of files that differ from main branch.
spell_check_changed() {
# The `if` guard ensures that the list of filenames is not empty, which
# could cause ruff to receive 0 positional arguments, making it hang
# waiting for STDIN.
#
# `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that
# exist on both branches.
MERGEBASE="$(git merge-base origin/main HEAD)"
if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then
git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \
codespell
fi
}
# Run Codespell
## This flag runs spell check of individual files. --files *must* be the first command line
## arg to use this option.
if [[ "$1" == '--files' ]]; then
spell_check "${@:2}"
# If `--all` is passed, then any further arguments are ignored and the
# entire python directory is linted.
elif [[ "$1" == '--all' ]]; then
spell_check_all
else
# Check spelling only of the files that changed in last commit.
spell_check_changed
fi
echo 'vLLM codespell: Done'
# Lint specified files # Lint specified files
lint() { lint() {
ruff "$@" ruff "$@"
...@@ -117,9 +160,9 @@ lint_changed() { ...@@ -117,9 +160,9 @@ lint_changed() {
} }
# Run Ruff # Run Ruff
echo 'vLLM Ruff:' echo 'vLLM ruff:'
## This flag lints individual files. --files *must* be the first command line ### This flag lints individual files. --files *must* be the first command line
## arg to use this option. ### arg to use this option.
if [[ "$1" == '--files' ]]; then if [[ "$1" == '--files' ]]; then
lint "${@:2}" lint "${@:2}"
# If `--all` is passed, then any further arguments are ignored and the # If `--all` is passed, then any further arguments are ignored and the
...@@ -139,3 +182,5 @@ if ! git diff --quiet &>/dev/null; then ...@@ -139,3 +182,5 @@ if ! git diff --quiet &>/dev/null; then
exit 1 exit 1
fi fi
[mypy]
python_version = 3.8
ignore_missing_imports = True
files = vllm
# TODO(woosuk): Include the code from Megatron and HuggingFace.
exclude = vllm/model_executor/parallel_utils/|vllm/model_executor/models/
...@@ -31,4 +31,22 @@ ignore = [ ...@@ -31,4 +31,22 @@ ignore = [
"E731", "E731",
# line too long, handled by black formatting # line too long, handled by black formatting
"E501", "E501",
# .strip() with multi-character strings
"B005",
# Loop control variable not used within loop body
"B007",
] ]
[tool.mypy]
python_version = "3.8"
ignore_missing_imports = true
files = "vllm"
# TODO(woosuk): Include the code from Megatron and HuggingFace.
exclude = "vllm/model_executor/parallel_utils/|vllm/model_executor/models/"
[tool.codespell]
ignore-words-list = "dout, te, indicies"
skip = "./tests/prompts"
# formatting # formatting
yapf==0.32.0 yapf==0.32.0
toml==0.10.2 toml==0.10.2
tomli==2.0.1
ruff==0.1.5 ruff==0.1.5
codespell==2.2.6
# type checking # type checking
mypy==0.991 mypy==0.991
...@@ -13,6 +15,9 @@ types-setuptools ...@@ -13,6 +15,9 @@ types-setuptools
pytest pytest
pytest-forked pytest-forked
pytest-asyncio pytest-asyncio
pytest-rerunfailures
httpx httpx
einops # required for MPT einops # required for MPT
flash_attn # required for HuggingFace's llama implementation openai
requests
ray
sentencepiece # Required for LLaMA tokenizer.
numpy
transformers-neuronx >= 0.9.0
torch-neuronx >= 2.1.0
neuronx-cc
fastapi
uvicorn[standard]
pydantic >= 2.0 # Required for OpenAI server.
prometheus_client >= 0.18.0
...@@ -2,12 +2,12 @@ ninja # For faster builds. ...@@ -2,12 +2,12 @@ ninja # For faster builds.
typing-extensions>=4.8.0 typing-extensions>=4.8.0
starlette starlette
psutil psutil
ray >= 2.5.1 ray >= 2.9
sentencepiece # Required for LLaMA tokenizer. sentencepiece # Required for LLaMA tokenizer.
numpy numpy
tokenizers>=0.15.0 tokenizers>=0.15.0
transformers >= 4.36.0 # Required for Mixtral. transformers >= 4.38.0 # Required for Gemma.
fastapi fastapi
uvicorn[standard] uvicorn[standard]
pydantic == 1.10.13 # Required for OpenAI server. pydantic >= 2.0 # Required for OpenAI server.
aioprometheus[starlette] prometheus_client >= 0.18.0
ninja # For faster builds. ninja # For faster builds.
psutil psutil
ray >= 2.5.1 ray >= 2.9
sentencepiece # Required for LLaMA tokenizer. sentencepiece # Required for LLaMA tokenizer.
numpy numpy
torch == 2.1.2 torch == 2.1.2
transformers >= 4.36.0 # Required for Mixtral. transformers >= 4.38.0 # Required for Gemma.
xformers == 0.0.23.post1 # Required for CUDA 12.1. xformers == 0.0.23.post1 # Required for CUDA 12.1.
fastapi fastapi
uvicorn[standard] uvicorn[standard]
pydantic == 1.10.13 # Required for OpenAI server. pydantic >= 2.0 # Required for OpenAI server.
aioprometheus[starlette] prometheus_client >= 0.18.0
pynvml == 11.5.0
triton >= 2.1.0
outlines >= 0.0.27
cupy-cuda12x == 12.1.0 # Required for CUDA graphs. CUDA 11.8 users should install cupy-cuda11x instead.
--- amd_hip_bf16.h 2024-02-06 18:28:58.268699142 +0000
+++ amd_hip_bf16.h.new 2024-02-06 18:28:31.988647133 +0000
@@ -90,10 +90,10 @@
#include "math_fwd.h" // ocml device functions
#if defined(__HIPCC_RTC__)
-#define __HOST_DEVICE__ __device__
+#define __HOST_DEVICE__ __device__ static
#else
#include <climits>
-#define __HOST_DEVICE__ __host__ __device__
+#define __HOST_DEVICE__ __host__ __device__ static inline
#endif
// Since we are using unsigned short to represent data in bfloat16, it can be of different sizes on
import contextlib
import io import io
import os import os
import re import re
import subprocess import subprocess
from typing import List, Set
import warnings import warnings
from pathlib import Path
from typing import List, Set
from packaging.version import parse, Version from packaging.version import parse, Version
import setuptools import setuptools
import torch import torch
import torch.utils.cpp_extension as torch_cpp_ext
from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME, ROCM_HOME from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME, ROCM_HOME
from typing import Optional, Union from typing import Optional, Union
...@@ -16,11 +19,16 @@ from pathlib import Path ...@@ -16,11 +19,16 @@ from pathlib import Path
ROOT_DIR = os.path.dirname(__file__) ROOT_DIR = os.path.dirname(__file__)
# If you are developing the C++ backend of vLLM, consider building vLLM with
# `python setup.py develop` since it will give you incremental builds.
# The downside is that this method is deprecated, see
# https://github.com/pypa/setuptools/issues/917
MAIN_CUDA_VERSION = "12.1" MAIN_CUDA_VERSION = "12.1"
# Supported NVIDIA GPU architectures. # Supported NVIDIA GPU architectures.
NVIDIA_SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"} NVIDIA_SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"}
ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx908", "gfx906", "gfx926", "gfx1030", "gfx1100"} ROCM_SUPPORTED_ARCHS = {"gfx908", "gfx90a", "gfx906", "gfx926", "gfx928", "gfx936","gfx942", "gfx1100"}
# SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS) # SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS)
...@@ -28,8 +36,17 @@ def _is_hip() -> bool: ...@@ -28,8 +36,17 @@ def _is_hip() -> bool:
return torch.version.hip is not None return torch.version.hip is not None
def _is_neuron() -> bool:
torch_neuronx_installed = True
try:
subprocess.run(["neuron-ls"], capture_output=True, check=True)
except (FileNotFoundError, PermissionError):
torch_neuronx_installed = False
return torch_neuronx_installed
def _is_cuda() -> bool: def _is_cuda() -> bool:
return torch.version.cuda is not None return (torch.version.cuda is not None) and not _is_neuron()
# Compiler flags. # Compiler flags.
...@@ -43,6 +60,8 @@ if _is_hip(): ...@@ -43,6 +60,8 @@ if _is_hip():
"Cannot find ROCM_HOME. ROCm must be available to build the package." "Cannot find ROCM_HOME. ROCm must be available to build the package."
) )
NVCC_FLAGS += ["-DUSE_ROCM"] NVCC_FLAGS += ["-DUSE_ROCM"]
NVCC_FLAGS += ["-U__HIP_NO_HALF_CONVERSIONS__"]
NVCC_FLAGS += ["-U__HIP_NO_HALF_OPERATORS__"]
if _is_cuda() and CUDA_HOME is None: if _is_cuda() and CUDA_HOME is None:
raise RuntimeError( raise RuntimeError(
...@@ -53,22 +72,6 @@ CXX_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] ...@@ -53,22 +72,6 @@ CXX_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"]
NVCC_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] NVCC_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"]
# def get_amdgpu_offload_arch():
# command = "/opt/rocm/llvm/bin/amdgpu-offload-arch"
# try:
# output = subprocess.check_output([command])
# return output.decode('utf-8').strip()
# except subprocess.CalledProcessError as e:
# error_message = f"Error: {e}"
# raise RuntimeError(error_message) from e
# except FileNotFoundError as e:
# # If the command is not found, print an error message
# error_message = f"The command {command} was not found."
# raise RuntimeError(error_message) from e
# return None
def get_hipcc_rocm_version(): def get_hipcc_rocm_version():
# Run the hipcc --version command # Run the hipcc --version command
result = subprocess.run(['hipcc', '--version'], result = subprocess.run(['hipcc', '--version'],
...@@ -91,6 +94,30 @@ def get_hipcc_rocm_version(): ...@@ -91,6 +94,30 @@ def get_hipcc_rocm_version():
return None return None
def glob(pattern: str):
root = Path(__name__).parent
return [str(p) for p in root.glob(pattern)]
def get_neuronxcc_version():
import sysconfig
site_dir = sysconfig.get_paths()["purelib"]
version_file = os.path.join(site_dir, "neuronxcc", "version",
"__init__.py")
# Check if the command was executed successfully
with open(version_file, "rt") as fp:
content = fp.read()
# Extract the version using a regular expression
match = re.search(r"__version__ = '(\S+)'", content)
if match:
# Return the version string
return match.group(1)
else:
raise RuntimeError("Could not find HIP version in the output")
def get_nvcc_cuda_version(cuda_dir: str) -> Version: def get_nvcc_cuda_version(cuda_dir: str) -> Version:
"""Get the CUDA version from nvcc. """Get the CUDA version from nvcc.
...@@ -104,6 +131,50 @@ def get_nvcc_cuda_version(cuda_dir: str) -> Version: ...@@ -104,6 +131,50 @@ def get_nvcc_cuda_version(cuda_dir: str) -> Version:
return nvcc_cuda_version return nvcc_cuda_version
def get_pytorch_rocm_arch() -> Set[str]:
"""Get the cross section of Pytorch,and vllm supported gfx arches
ROCM can get the supported gfx architectures in one of two ways
Either through the PYTORCH_ROCM_ARCH env var, or output from
rocm_agent_enumerator.
In either case we can generate a list of supported arch's and
cross reference with VLLM's own ROCM_SUPPORTED_ARCHs.
"""
env_arch_list = os.environ.get("PYTORCH_ROCM_ARCH", None)
# If we don't have PYTORCH_ROCM_ARCH specified pull the list from rocm_agent_enumerator
if env_arch_list is None:
command = "rocm_agent_enumerator"
env_arch_list = subprocess.check_output([command]).decode('utf-8')\
.strip().replace("\n", ";")
arch_source_str = "rocm_agent_enumerator"
else:
arch_source_str = "PYTORCH_ROCM_ARCH env variable"
# List are separated by ; or space.
pytorch_rocm_arch = set(env_arch_list.replace(" ", ";").split(";"))
# Filter out the invalid architectures and print a warning.
arch_list = pytorch_rocm_arch.intersection(ROCM_SUPPORTED_ARCHS)
# If none of the specified architectures are valid, raise an error.
if not arch_list:
raise RuntimeError(
f"None of the ROCM architectures in {arch_source_str} "
f"({env_arch_list}) is supported. "
f"Supported ROCM architectures are: {ROCM_SUPPORTED_ARCHS}.")
invalid_arch_list = pytorch_rocm_arch - ROCM_SUPPORTED_ARCHS
if invalid_arch_list:
warnings.warn(
f"Unsupported ROCM architectures ({invalid_arch_list}) are "
f"excluded from the {arch_source_str} output "
f"({env_arch_list}). Supported ROCM architectures are: "
f"{ROCM_SUPPORTED_ARCHS}.",
stacklevel=2)
return arch_list
def get_torch_arch_list() -> Set[str]: def get_torch_arch_list() -> Set[str]:
# TORCH_CUDA_ARCH_LIST can have one or more architectures, # TORCH_CUDA_ARCH_LIST can have one or more architectures,
# e.g. "8.0" or "7.5,8.0,8.6+PTX". Here, the "8.6+PTX" option asks the # e.g. "8.0" or "7.5,8.0,8.6+PTX". Here, the "8.6+PTX" option asks the
...@@ -128,22 +199,27 @@ def get_torch_arch_list() -> Set[str]: ...@@ -128,22 +199,27 @@ def get_torch_arch_list() -> Set[str]:
# If none of the specified architectures are valid, raise an error. # If none of the specified architectures are valid, raise an error.
if not arch_list: if not arch_list:
raise RuntimeError( raise RuntimeError(
"None of the CUDA/ROCM architectures in `TORCH_CUDA_ARCH_LIST` env " "None of the CUDA architectures in `TORCH_CUDA_ARCH_LIST` env "
f"variable ({env_arch_list}) is supported. " f"variable ({env_arch_list}) is supported. "
f"Supported CUDA/ROCM architectures are: {valid_archs}.") f"Supported CUDA architectures are: {valid_archs}.")
invalid_arch_list = torch_arch_list - valid_archs invalid_arch_list = torch_arch_list - valid_archs
if invalid_arch_list: if invalid_arch_list:
warnings.warn( warnings.warn(
f"Unsupported CUDA/ROCM architectures ({invalid_arch_list}) are " f"Unsupported CUDA architectures ({invalid_arch_list}) are "
"excluded from the `TORCH_CUDA_ARCH_LIST` env variable " "excluded from the `TORCH_CUDA_ARCH_LIST` env variable "
f"({env_arch_list}). Supported CUDA/ROCM architectures are: " f"({env_arch_list}). Supported CUDA architectures are: "
f"{valid_archs}.", f"{valid_archs}.",
stacklevel=2) stacklevel=2)
return arch_list return arch_list
# First, check the TORCH_CUDA_ARCH_LIST environment variable. if _is_hip():
compute_capabilities = get_torch_arch_list() rocm_arches = get_pytorch_rocm_arch()
NVCC_FLAGS += ["--offload-arch=" + arch for arch in rocm_arches]
else:
# First, check the TORCH_CUDA_ARCH_LIST environment variable.
compute_capabilities = get_torch_arch_list()
if _is_cuda() and not compute_capabilities: if _is_cuda() and not compute_capabilities:
# If TORCH_CUDA_ARCH_LIST is not defined or empty, target all available # If TORCH_CUDA_ARCH_LIST is not defined or empty, target all available
# GPUs on the current machine. # GPUs on the current machine.
...@@ -155,6 +231,8 @@ if _is_cuda() and not compute_capabilities: ...@@ -155,6 +231,8 @@ if _is_cuda() and not compute_capabilities:
"GPUs with compute capability below 7.0 are not supported.") "GPUs with compute capability below 7.0 are not supported.")
compute_capabilities.add(f"{major}.{minor}") compute_capabilities.add(f"{major}.{minor}")
ext_modules = []
if _is_cuda(): if _is_cuda():
nvcc_cuda_version = get_nvcc_cuda_version(CUDA_HOME) nvcc_cuda_version = get_nvcc_cuda_version(CUDA_HOME)
if not compute_capabilities: if not compute_capabilities:
...@@ -192,6 +270,8 @@ if _is_cuda(): ...@@ -192,6 +270,8 @@ if _is_cuda():
raise RuntimeError( raise RuntimeError(
"CUDA 11.8 or higher is required for compute capability 9.0.") "CUDA 11.8 or higher is required for compute capability 9.0.")
NVCC_FLAGS_PUNICA = NVCC_FLAGS.copy()
# Add target compute capabilities to NVCC flags. # Add target compute capabilities to NVCC flags.
for capability in compute_capabilities: for capability in compute_capabilities:
num = capability[0] + capability[2] num = capability[0] + capability[2]
...@@ -200,6 +280,14 @@ if _is_cuda(): ...@@ -200,6 +280,14 @@ if _is_cuda():
NVCC_FLAGS += [ NVCC_FLAGS += [
"-gencode", f"arch=compute_{num},code=compute_{num}" "-gencode", f"arch=compute_{num},code=compute_{num}"
] ]
if int(capability[0]) >= 8:
NVCC_FLAGS_PUNICA += [
"-gencode", f"arch=compute_{num},code=sm_{num}"
]
if capability.endswith("+PTX"):
NVCC_FLAGS_PUNICA += [
"-gencode", f"arch=compute_{num},code=compute_{num}"
]
# Use NVCC threads to parallelize the build. # Use NVCC threads to parallelize the build.
if nvcc_cuda_version >= Version("11.2"): if nvcc_cuda_version >= Version("11.2"):
...@@ -207,14 +295,41 @@ if _is_cuda(): ...@@ -207,14 +295,41 @@ if _is_cuda():
num_threads = min(os.cpu_count(), nvcc_threads) num_threads = min(os.cpu_count(), nvcc_threads)
NVCC_FLAGS += ["--threads", str(num_threads)] NVCC_FLAGS += ["--threads", str(num_threads)]
# elif _is_hip(): if nvcc_cuda_version >= Version("11.8"):
# amd_arch = get_amdgpu_offload_arch() NVCC_FLAGS += ["-DENABLE_FP8_E5M2"]
# if amd_arch not in ROCM_SUPPORTED_ARCHS:
# raise RuntimeError( # changes for punica kernels
# f"Only the following arch is supported: {ROCM_SUPPORTED_ARCHS}" NVCC_FLAGS += torch_cpp_ext.COMMON_NVCC_FLAGS
# f"amdgpu_arch_found: {amd_arch}") REMOVE_NVCC_FLAGS = [
'-D__CUDA_NO_HALF_OPERATORS__',
ext_modules = [] '-D__CUDA_NO_HALF_CONVERSIONS__',
'-D__CUDA_NO_BFLOAT16_CONVERSIONS__',
'-D__CUDA_NO_HALF2_OPERATORS__',
]
for flag in REMOVE_NVCC_FLAGS:
with contextlib.suppress(ValueError):
torch_cpp_ext.COMMON_NVCC_FLAGS.remove(flag)
install_punica = bool(int(os.getenv("VLLM_INSTALL_PUNICA_KERNELS", "0")))
device_count = torch.cuda.device_count()
for i in range(device_count):
major, minor = torch.cuda.get_device_capability(i)
if major < 8:
install_punica = False
break
if install_punica:
ext_modules.append(
CUDAExtension(
name="vllm._punica_C",
sources=["csrc/punica/punica_ops.cc"] +
glob("csrc/punica/bgmv/*.cu"),
extra_compile_args={
"cxx": CXX_FLAGS,
"nvcc": NVCC_FLAGS_PUNICA,
},
))
elif _is_neuron():
neuronxcc_version = get_neuronxcc_version()
vllm_extension_sources = [ vllm_extension_sources = [
"csrc/cache_kernels.cu", "csrc/cache_kernels.cu",
...@@ -225,21 +340,38 @@ vllm_extension_sources = [ ...@@ -225,21 +340,38 @@ vllm_extension_sources = [
"csrc/quantization/squeezellm/quant_cuda_kernel.cu", "csrc/quantization/squeezellm/quant_cuda_kernel.cu",
"csrc/quantization/gptq/q_gemm.cu", "csrc/quantization/gptq/q_gemm.cu",
"csrc/cuda_utils_kernels.cu", "csrc/cuda_utils_kernels.cu",
"csrc/moe_align_block_size_kernels.cu",
"csrc/pybind.cpp", "csrc/pybind.cpp",
] ]
if _is_cuda(): if _is_cuda():
vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu") vllm_extension_sources.append("csrc/quantization/awq/gemm_kernels.cu")
vllm_extension_sources.append(
vllm_extension = CUDAExtension( "csrc/quantization/marlin/marlin_cuda_kernel.cu")
name="vllm._C", vllm_extension_sources.append("csrc/custom_all_reduce.cu")
sources=vllm_extension_sources,
extra_compile_args={ # Add MoE kernels.
"cxx": CXX_FLAGS, ext_modules.append(
"nvcc": NVCC_FLAGS, CUDAExtension(
}, name="vllm._moe_C",
) sources=glob("csrc/moe/*.cu") + glob("csrc/moe/*.cpp"),
ext_modules.append(vllm_extension) extra_compile_args={
"cxx": CXX_FLAGS,
"nvcc": NVCC_FLAGS,
},
))
if not _is_neuron():
vllm_extension = CUDAExtension(
name="vllm._C",
sources=vllm_extension_sources,
extra_compile_args={
"cxx": CXX_FLAGS,
"nvcc": NVCC_FLAGS,
},
libraries=["cuda"] if _is_cuda() else [],
)
ext_modules.append(vllm_extension)
def get_path(*filepath) -> str: def get_path(*filepath) -> str:
...@@ -300,8 +432,8 @@ def get_version_add(sha: Optional[str] = None) -> str: ...@@ -300,8 +432,8 @@ def get_version_add(sha: Optional[str] = None) -> str:
version += ".torch" + torch.__version__[:3] version += ".torch" + torch.__version__[:3]
with open(add_version_path, encoding="utf-8",mode="w") as file: with open(add_version_path, encoding="utf-8",mode="w") as file:
file.write("__version__='0.2.7'\n") file.write("__version__='0.3.3'\n")
file.write("__dcu_version__='0.2.7+{}'\n".format(version)) file.write("__dcu_version__='0.3.3+{}'\n".format(version))
file.close() file.close()
...@@ -318,11 +450,17 @@ def get_vllm_version() -> str: ...@@ -318,11 +450,17 @@ def get_vllm_version() -> str:
if _is_hip(): if _is_hip():
# Get the HIP version # Get the HIP version
# hipcc_version = get_hipcc_rocm_version() hipcc_version = get_hipcc_rocm_version()
# if hipcc_version != MAIN_CUDA_VERSION: if hipcc_version != MAIN_CUDA_VERSION:
# rocm_version_str = hipcc_version.replace(".", "")[:3] rocm_version_str = hipcc_version.replace(".", "")[:3]
# version += f"+rocm{rocm_version_str}" # version += f"+rocm{rocm_version_str}"
version = get_version() version = get_version()
elif _is_neuron():
# Get the Neuron version
neuron_version = str(neuronxcc_version)
if neuron_version != MAIN_CUDA_VERSION:
neuron_version_str = neuron_version.replace(".", "")[:3]
version += f"+neuron{neuron_version_str}"
else: else:
cuda_version = str(nvcc_cuda_version) cuda_version = str(nvcc_cuda_version)
if cuda_version != MAIN_CUDA_VERSION: if cuda_version != MAIN_CUDA_VERSION:
...@@ -346,13 +484,18 @@ def get_requirements() -> List[str]: ...@@ -346,13 +484,18 @@ def get_requirements() -> List[str]:
if _is_hip(): if _is_hip():
with open(get_path("requirements-rocm.txt")) as f: with open(get_path("requirements-rocm.txt")) as f:
requirements = f.read().strip().split("\n") requirements = f.read().strip().split("\n")
elif _is_neuron():
with open(get_path("requirements-neuron.txt")) as f:
requirements = f.read().strip().split("\n")
else: else:
with open(get_path("requirements.txt")) as f: with open(get_path("requirements.txt")) as f:
requirements = f.read().strip().split("\n") requirements = f.read().strip().split("\n")
return requirements return requirements
package_data = {"vllm": ["py.typed"]} package_data = {
"vllm": ["py.typed", "model_executor/layers/fused_moe/configs/*.json"]
}
if os.environ.get("VLLM_USE_PRECOMPILED"): if os.environ.get("VLLM_USE_PRECOMPILED"):
ext_modules = [] ext_modules = []
package_data["vllm"].append("*.so") package_data["vllm"].append("*.so")
...@@ -384,6 +527,6 @@ setuptools.setup( ...@@ -384,6 +527,6 @@ setuptools.setup(
python_requires=">=3.8", python_requires=">=3.8",
install_requires=get_requirements(), install_requires=get_requirements(),
ext_modules=ext_modules, ext_modules=ext_modules,
cmdclass={"build_ext": BuildExtension}, cmdclass={"build_ext": BuildExtension} if not _is_neuron() else {},
package_data=package_data, package_data=package_data,
) )
...@@ -25,6 +25,13 @@ class MockEngine: ...@@ -25,6 +25,13 @@ class MockEngine:
return [RequestOutput( return [RequestOutput(
request_id=self.request_id)] if self.request_id else [] request_id=self.request_id)] if self.request_id else []
async def encode_request_async(
self,
*args,
**kwargs,
):
return [1]
def generate(self, request_id): def generate(self, request_id):
self.request_id = request_id self.request_id = request_id
...@@ -35,6 +42,10 @@ class MockEngine: ...@@ -35,6 +42,10 @@ class MockEngine:
del kwargs # Unused del kwargs # Unused
self.add_request_calls += 1 self.add_request_calls += 1
async def add_request_async(self, **kwargs):
del kwargs # Unused
self.add_request_calls += 1
def abort_request(self, request_id): def abort_request(self, request_id):
del request_id # Unused del request_id # Unused
self.abort_request_calls += 1 self.abort_request_calls += 1
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment