Commit 4d3a2c28 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.6.5' into v0.6.5-dev

parents 92ec5d8e 2d1b9baa
...@@ -118,7 +118,7 @@ configuration for the root vLLM logger and for the logger you wish to silence: ...@@ -118,7 +118,7 @@ configuration for the root vLLM logger and for the logger you wish to silence:
{ {
"formatters": { "formatters": {
"vllm": { "vllm": {
"class": "vllm.logging.NewLineFormatter", "class": "vllm.logging_utils.NewLineFormatter",
"datefmt": "%m-%d %H:%M:%S", "datefmt": "%m-%d %H:%M:%S",
"format": "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s" "format": "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s"
} }
......
...@@ -43,15 +43,6 @@ def create_test_prompts( ...@@ -43,15 +43,6 @@ def create_test_prompts(
max_tokens=128, max_tokens=128,
stop_token_ids=[32003]), stop_token_ids=[32003]),
LoRARequest("sql-lora", 1, lora_path)), LoRARequest("sql-lora", 1, lora_path)),
(
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501
SamplingParams(n=3,
best_of=3,
use_beam_search=True,
temperature=0,
max_tokens=128,
stop_token_ids=[32003]),
LoRARequest("sql-lora", 1, lora_path)),
( (
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501 "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]", # noqa: E501
SamplingParams(temperature=0.0, SamplingParams(temperature=0.0,
...@@ -60,15 +51,6 @@ def create_test_prompts( ...@@ -60,15 +51,6 @@ def create_test_prompts(
max_tokens=128, max_tokens=128,
stop_token_ids=[32003]), stop_token_ids=[32003]),
LoRARequest("sql-lora2", 2, lora_path)), LoRARequest("sql-lora2", 2, lora_path)),
(
"[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]", # noqa: E501
SamplingParams(n=3,
best_of=3,
use_beam_search=True,
temperature=0,
max_tokens=128,
stop_token_ids=[32003]),
LoRARequest("sql-lora", 1, lora_path)),
] ]
......
...@@ -12,7 +12,7 @@ if __name__ == '__main__': ...@@ -12,7 +12,7 @@ if __name__ == '__main__':
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=16) sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=16)
# Create an LLM. # Create an LLM.
llm = LLM(model="facebook/opt-125m",tensor_parallel_size=1, distributed_executor_backend="ray", dtype="float16",trust_remote_code=True, enforce_eager=True) llm = LLM(model="facebook/opt-125m",tensor_parallel_size=1, dtype="float16",trust_remote_code=True, enforce_eager=True)
# Generate texts from the prompts. The output is a list of RequestOutput objects # Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information. # that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params) outputs = llm.generate(prompts, sampling_params)
...@@ -20,4 +20,4 @@ if __name__ == '__main__': ...@@ -20,4 +20,4 @@ if __name__ == '__main__':
for output in outputs: for output in outputs:
prompt = output.prompt prompt = output.prompt
generated_text = output.outputs[0].text generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
\ No newline at end of file
...@@ -25,19 +25,15 @@ def run_ultravox(question: str, audio_count: int): ...@@ -25,19 +25,15 @@ def run_ultravox(question: str, audio_count: int):
tokenizer = AutoTokenizer.from_pretrained(model_name) tokenizer = AutoTokenizer.from_pretrained(model_name)
messages = [{ messages = [{
'role': 'role': 'user',
'user', 'content': "<|audio|>\n" * audio_count + question
'content':
"<|reserved_special_token_0|>\n" * audio_count + question
}] }]
prompt = tokenizer.apply_chat_template(messages, prompt = tokenizer.apply_chat_template(messages,
tokenize=False, tokenize=False,
add_generation_prompt=True) add_generation_prompt=True)
llm = LLM(model=model_name, llm = LLM(model=model_name,
enforce_eager=True, trust_remote_code=True,
enable_chunked_prefill=False,
max_model_len=8192,
limit_mm_per_prompt={"audio": audio_count}) limit_mm_per_prompt={"audio": audio_count})
stop_token_ids = None stop_token_ids = None
return llm, prompt, stop_token_ids return llm, prompt, stop_token_ids
...@@ -91,10 +87,9 @@ def main(args): ...@@ -91,10 +87,9 @@ def main(args):
for asset in audio_assets[:audio_count] for asset in audio_assets[:audio_count]
] ]
} }
assert args.num_prompts > 0 assert args.num_prompts > 0
inputs = {"prompt": prompt, "multi_modal_data": mm_data} inputs = {"prompt": prompt, "multi_modal_data": mm_data}
if args.num_prompts > 1: if args.num_prompts > 1:
# Batch inference # Batch inference
inputs = [inputs] * args.num_prompts inputs = [inputs] * args.num_prompts
......
from vllm import LLM
# Sample prompts.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create an LLM.
# You should pass task="classify" for classification models
model = LLM(
model="jason9693/Qwen2.5-1.5B-apeach",
task="classify",
enforce_eager=True,
)
# Generate logits. The output is a list of ClassificationRequestOutputs.
outputs = model.classify(prompts)
# Print the outputs.
for prompt, output in zip(prompts, outputs):
probs = output.outputs.probs
probs_trimmed = ((str(probs[:16])[:-1] +
", ...]") if len(probs) > 16 else probs)
print(f"Prompt: {prompt!r} | "
f"Class Probabilities: {probs_trimmed} (size={len(probs)})")
from dataclasses import asdict
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser
def get_prompts(num_prompts: int):
# The default sample prompts.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
if num_prompts != len(prompts):
prompts = (prompts * ((num_prompts // len(prompts)) + 1))[:num_prompts]
return prompts
def main(args):
# Create prompts
prompts = get_prompts(args.num_prompts)
# Create a sampling params object.
sampling_params = SamplingParams(n=args.n,
temperature=args.temperature,
top_p=args.top_p,
top_k=args.top_k,
max_tokens=args.max_tokens)
# Create an LLM.
# The default model is 'facebook/opt-125m'
engine_args = EngineArgs.from_cli_args(args)
llm = LLM(**asdict(engine_args))
# Generate texts from the prompts.
# The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
if __name__ == '__main__':
parser = FlexibleArgumentParser()
parser = EngineArgs.add_cli_args(parser)
group = parser.add_argument_group("SamplingParams options")
group.add_argument("--num-prompts",
type=int,
default=4,
help="Number of prompts used for inference")
group.add_argument("--max-tokens",
type=int,
default=16,
help="Generated output length for sampling")
group.add_argument('--n',
type=int,
default=1,
help='Number of generated sequences per prompt')
group.add_argument('--temperature',
type=float,
default=0.8,
help='Temperature for text generation')
group.add_argument('--top-p',
type=float,
default=0.95,
help='top_p for text generation')
group.add_argument('--top-k',
type=int,
default=-1,
help='top_k for text generation')
args = parser.parse_args()
main(args)
...@@ -9,9 +9,20 @@ prompts = [ ...@@ -9,9 +9,20 @@ prompts = [
] ]
# Create an LLM. # Create an LLM.
model = LLM(model="intfloat/e5-mistral-7b-instruct", enforce_eager=True) # You should pass task="embed" for embedding models
model = LLM(
model="intfloat/e5-mistral-7b-instruct",
task="embed",
enforce_eager=True,
)
# Generate embedding. The output is a list of EmbeddingRequestOutputs. # Generate embedding. The output is a list of EmbeddingRequestOutputs.
outputs = model.encode(prompts) outputs = model.embed(prompts)
# Print the outputs. # Print the outputs.
for output in outputs: for prompt, output in zip(prompts, outputs):
print(output.outputs.embedding) # list of 4096 floats embeds = output.outputs.embedding
embeds_trimmed = ((str(embeds[:16])[:-1] +
", ...]") if len(embeds) > 16 else embeds)
print(f"Prompt: {prompt!r} | "
f"Embeddings: {embeds_trimmed} (size={len(embeds)})")
...@@ -50,8 +50,6 @@ if __name__ == "__main__": ...@@ -50,8 +50,6 @@ if __name__ == "__main__":
llm = LLM( llm = LLM(
model="meta-llama/Llama-2-13b-chat-hf", model="meta-llama/Llama-2-13b-chat-hf",
speculative_model="ibm-fms/llama-13b-accelerator", speculative_model="ibm-fms/llama-13b-accelerator",
# These are currently required for MLPSpeculator decoding
use_v2_block_manager=True,
) )
print("With speculation") print("With speculation")
......
# Offline Inference with the OpenAI Batch file format # Offline Inference with the OpenAI Batch file format
**NOTE:** This is a guide to performing batch inference using the OpenAI batch file format, **NOT** the complete Batch (REST) API. ```{important}
This is a guide to performing batch inference using the OpenAI batch file format, **not** the complete Batch (REST) API.
## File Format ```
The OpenAI batch file format consists of a series of json objects on new lines. ## File Format
[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/openai_example_batch.jsonl) The OpenAI batch file format consists of a series of json objects on new lines.
Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details. [See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/openai_example_batch.jsonl)
**NOTE:** We currently only support `/v1/chat/completions` and `/v1/embeddings` endpoints (completions coming soon). Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details.
## Pre-requisites ```{note}
We currently only support `/v1/chat/completions` and `/v1/embeddings` endpoints (completions coming soon).
```
* Ensure you are using `vllm >= 0.4.3`. You can check by running `python -c "import vllm; print(vllm.__version__)"`. ## Pre-requisites
* The examples in this document use `meta-llama/Meta-Llama-3-8B-Instruct`. * The examples in this document use `meta-llama/Meta-Llama-3-8B-Instruct`.
- Create a [user access token](https://huggingface.co/docs/hub/en/security-tokens) - Create a [user access token](https://huggingface.co/docs/hub/en/security-tokens)
- Install the token on your machine (Run `huggingface-cli login`). - Install the token on your machine (Run `huggingface-cli login`).
- Get access to the gated model by [visiting the model card](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) and agreeing to the terms and conditions. - Get access to the gated model by [visiting the model card](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) and agreeing to the terms and conditions.
## Example 1: Running with a local file ## Example 1: Running with a local file
### Step 1: Create your batch file ### Step 1: Create your batch file
To follow along with this example, you can download the example batch, or create your own batch file in your working directory. To follow along with this example, you can download the example batch, or create your own batch file in your working directory.
``` ```
wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl
``` ```
Once you've created your batch file it should look like this Once you've created your batch file it should look like this
``` ```
$ cat openai_example_batch.jsonl $ cat openai_example_batch.jsonl
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
``` ```
### Step 2: Run the batch ### Step 2: Run the batch
The batch running tool is designed to be used from the command line. The batch running tool is designed to be used from the command line.
...@@ -85,18 +88,18 @@ To integrate with cloud blob storage, we recommend using presigned urls. ...@@ -85,18 +88,18 @@ To integrate with cloud blob storage, we recommend using presigned urls.
### Step 1: Upload your input script ### Step 1: Upload your input script
To follow along with this example, you can download the example batch, or create your own batch file in your working directory. To follow along with this example, you can download the example batch, or create your own batch file in your working directory.
``` ```
wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/openai_example_batch.jsonl
``` ```
Once you've created your batch file it should look like this Once you've created your batch file it should look like this
``` ```
$ cat openai_example_batch.jsonl $ cat openai_example_batch.jsonl
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_tokens": 1000}} {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
``` ```
Now upload your batch file to your S3 bucket. Now upload your batch file to your S3 bucket.
...@@ -104,7 +107,6 @@ Now upload your batch file to your S3 bucket. ...@@ -104,7 +107,6 @@ Now upload your batch file to your S3 bucket.
aws s3 cp openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl aws s3 cp openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl
``` ```
### Step 2: Generate your presigned urls ### Step 2: Generate your presigned urls
Presigned urls can only be generated via the SDK. You can run the following python script to generate your presigned urls. Be sure to replace the `MY_BUCKET`, `MY_INPUT_FILE.jsonl`, and `MY_OUTPUT_FILE.jsonl` placeholders with your bucket and file names. Presigned urls can only be generated via the SDK. You can run the following python script to generate your presigned urls. Be sure to replace the `MY_BUCKET`, `MY_INPUT_FILE.jsonl`, and `MY_OUTPUT_FILE.jsonl` placeholders with your bucket and file names.
...@@ -179,21 +181,19 @@ aws s3 cp s3://MY_BUCKET/MY_OUTPUT_FILE.jsonl - ...@@ -179,21 +181,19 @@ aws s3 cp s3://MY_BUCKET/MY_OUTPUT_FILE.jsonl -
### Step 1: Create your batch file ### Step 1: Create your batch file
Add embedding requests to your batch file. The following is an example: Add embedding requests to your batch file. The following is an example:
``` ```
{"custom_id": "request-1", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}} {"custom_id": "request-1", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are an unhelpful assistant."}} {"custom_id": "request-2", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are an unhelpful assistant."}}
``` ```
You can even mix chat completion and embedding requests in the batch file, as long as the model you are using supports both chat completion and embeddings (note that all requests must use the same model).
You can even mix chat completion and embedding requests in the batch file, as long as the model you are using supports both chat completion and embeddings (note that all requests must use the same model).
### Step 2: Run the batch ### Step 2: Run the batch
You can run the batch using the same command as in earlier examples. You can run the batch using the same command as in earlier examples.
### Step 3: Check your results ### Step 3: Check your results
You can check your results by running `cat results.jsonl` You can check your results by running `cat results.jsonl`
...@@ -201,5 +201,5 @@ You can check your results by running `cat results.jsonl` ...@@ -201,5 +201,5 @@ You can check your results by running `cat results.jsonl`
``` ```
$ cat results.jsonl $ cat results.jsonl
{"id":"vllm-db0f71f7dec244e6bce530e0b4ef908b","custom_id":"request-1","response":{"status_code":200,"request_id":"vllm-batch-3580bf4d4ae54d52b67eee266a6eab20","body":{"id":"embd-33ac2efa7996430184461f2e38529746","object":"list","created":444647,"model":"intfloat/e5-mistral-7b-instruct","data":[{"index":0,"object":"embedding","embedding":[0.016204833984375,0.0092010498046875,0.0018358230590820312,-0.0028228759765625,0.001422882080078125,-0.0031147003173828125,...]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0}}},"error":null} {"id":"vllm-db0f71f7dec244e6bce530e0b4ef908b","custom_id":"request-1","response":{"status_code":200,"request_id":"vllm-batch-3580bf4d4ae54d52b67eee266a6eab20","body":{"id":"embd-33ac2efa7996430184461f2e38529746","object":"list","created":444647,"model":"intfloat/e5-mistral-7b-instruct","data":[{"index":0,"object":"embedding","embedding":[0.016204833984375,0.0092010498046875,0.0018358230590820312,-0.0028228759765625,0.001422882080078125,-0.0031147003173828125,...]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0}}},"error":null}
...``` ...
``` ```
from vllm import LLM
# Sample prompts.
text_1 = "What is the capital of France?"
texts_2 = [
"The capital of Brazil is Brasilia.", "The capital of France is Paris."
]
# Create an LLM.
# You should pass task="score" for cross-encoder models
model = LLM(
model="BAAI/bge-reranker-v2-m3",
task="score",
enforce_eager=True,
)
# Generate scores. The output is a list of ScoringRequestOutputs.
outputs = model.score(text_1, texts_2)
# Print the outputs.
for text_2, output in zip(texts_2, outputs):
score = output.outputs.score
print(f"Pair: {[text_1, text_2]!r} | Score: {score}")
from enum import Enum
from pydantic import BaseModel
from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams
llm = LLM(model="Qwen/Qwen2.5-3B-Instruct", max_model_len=100)
# Guided decoding by Choice (list of possible options)
guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"])
sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
outputs = llm.generate(
prompts="Classify this sentiment: vLLM is wonderful!",
sampling_params=sampling_params,
)
print(outputs[0].outputs[0].text)
# Guided decoding by Regex
guided_decoding_params = GuidedDecodingParams(regex="\w+@\w+\.com\n")
sampling_params = SamplingParams(guided_decoding=guided_decoding_params,
stop=["\n"])
prompt = ("Generate an email address for Alan Turing, who works in Enigma."
"End in .com and new line. Example result:"
"alan.turing@enigma.com\n")
outputs = llm.generate(prompts=prompt, sampling_params=sampling_params)
print(outputs[0].outputs[0].text)
# Guided decoding by JSON using Pydantic schema
class CarType(str, Enum):
sedan = "sedan"
suv = "SUV"
truck = "Truck"
coupe = "Coupe"
class CarDescription(BaseModel):
brand: str
model: str
car_type: CarType
json_schema = CarDescription.model_json_schema()
guided_decoding_params = GuidedDecodingParams(json=json_schema)
sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
prompt = ("Generate a JSON with the brand, model and car_type of"
"the most iconic car from the 90's")
outputs = llm.generate(
prompts=prompt,
sampling_params=sampling_params,
)
print(outputs[0].outputs[0].text)
# Guided decoding by Grammar
simplified_sql_grammar = """
?start: select_statement
?select_statement: "SELECT " column_list " FROM " table_name
?column_list: column_name ("," column_name)*
?table_name: identifier
?column_name: identifier
?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
"""
guided_decoding_params = GuidedDecodingParams(grammar=simplified_sql_grammar)
sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
prompt = ("Generate an SQL query to show the 'username' and 'email'"
"from the 'users' table.")
outputs = llm.generate(
prompts=prompt,
sampling_params=sampling_params,
)
print(outputs[0].outputs[0].text)
""" """
This example shows how to use vLLM for running offline inference This example shows how to use vLLM for running offline inference with
with the correct prompt format on vision language models. the correct prompt format on vision language models for text generation.
For most models, the prompt format should follow corresponding examples For most models, the prompt format should follow corresponding examples
on HuggingFace model repository. on HuggingFace model repository.
""" """
import random
from transformers import AutoTokenizer from transformers import AutoTokenizer
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
...@@ -12,123 +14,239 @@ from vllm.assets.image import ImageAsset ...@@ -12,123 +14,239 @@ from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset from vllm.assets.video import VideoAsset
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.
# LLaVA-1.5
def run_llava(question, modality): # Aria
def run_aria(question: str, modality: str):
assert modality == "image" assert modality == "image"
model_name = "rhymes-ai/Aria"
prompt = f"USER: <image>\n{question}\nASSISTANT:" llm = LLM(model=model_name,
tokenizer_mode="slow",
trust_remote_code=True,
dtype="bfloat16",
mm_cache_preprocessor=args.mm_cache_preprocessor)
prompt = (f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>\n{question}"
"<|im_end|>\n<|im_start|>assistant\n")
stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]
return llm, prompt, stop_token_ids
llm = LLM(model="llava-hf/llava-1.5-7b-hf")
# BLIP-2
def run_blip2(question: str, modality: str):
assert modality == "image"
# BLIP-2 prompt format is inaccurate on HuggingFace model repository.
# See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
prompt = f"Question: {question} Answer:"
llm = LLM(model="Salesforce/blip2-opt-2.7b",
mm_cache_preprocessor=args.mm_cache_preprocessor)
stop_token_ids = None stop_token_ids = None
return llm, prompt, stop_token_ids return llm, prompt, stop_token_ids
# LLaVA-1.6/LLaVA-NeXT # Chameleon
def run_llava_next(question, modality): def run_chameleon(question: str, modality: str):
assert modality == "image" assert modality == "image"
prompt = f"[INST] <image>\n{question} [/INST]" prompt = f"{question}<image>"
llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=8192) llm = LLM(model="facebook/chameleon-7b",
max_model_len=4096,
mm_cache_preprocessor=args.mm_cache_preprocessor)
stop_token_ids = None stop_token_ids = None
return llm, prompt, stop_token_ids return llm, prompt, stop_token_ids
# LlaVA-NeXT-Video # Fuyu
# Currently only support for video input def run_fuyu(question: str, modality: str):
def run_llava_next_video(question, modality): assert modality == "image"
assert modality == "video"
prompt = f"USER: <video>\n{question} ASSISTANT:" prompt = f"{question}\n"
llm = LLM(model="llava-hf/LLaVA-NeXT-Video-7B-hf", max_model_len=8192) llm = LLM(model="adept/fuyu-8b",
max_model_len=2048,
max_num_seqs=2,
mm_cache_preprocessor=args.mm_cache_preprocessor)
stop_token_ids = None stop_token_ids = None
return llm, prompt, stop_token_ids return llm, prompt, stop_token_ids
# LLaVA-OneVision # GLM-4v
def run_llava_onevision(question, modality): def run_glm4v(question: str, modality: str):
assert modality == "image"
model_name = "THUDM/glm-4v-9b"
if modality == "video": llm = LLM(model=model_name,
prompt = f"<|im_start|>user <video>\n{question}<|im_end|> \ max_model_len=2048,
<|im_start|>assistant\n" max_num_seqs=2,
trust_remote_code=True,
enforce_eager=True,
mm_cache_preprocessor=args.mm_cache_preprocessor)
prompt = question
stop_token_ids = [151329, 151336, 151338]
return llm, prompt, stop_token_ids
elif modality == "image":
prompt = f"<|im_start|>user <image>\n{question}<|im_end|> \
<|im_start|>assistant\n"
llm = LLM(model="llava-hf/llava-onevision-qwen2-7b-ov-hf", # H2OVL-Mississippi
max_model_len=32768) def run_h2ovl(question: str, modality: str):
stop_token_ids = None assert modality == "image"
model_name = "h2oai/h2ovl-mississippi-2b"
llm = LLM(
model=model_name,
trust_remote_code=True,
max_model_len=8192,
mm_cache_preprocessor=args.mm_cache_preprocessor,
)
tokenizer = AutoTokenizer.from_pretrained(model_name,
trust_remote_code=True)
messages = [{'role': 'user', 'content': f"<image>\n{question}"}]
prompt = tokenizer.apply_chat_template(messages,
tokenize=False,
add_generation_prompt=True)
# Stop tokens for H2OVL-Mississippi
# https://huggingface.co/h2oai/h2ovl-mississippi-2b
stop_token_ids = [tokenizer.eos_token_id]
return llm, prompt, stop_token_ids return llm, prompt, stop_token_ids
# Fuyu # Idefics3-8B-Llama3
def run_fuyu(question, modality): def run_idefics3(question: str, modality: str):
assert modality == "image" assert modality == "image"
model_name = "HuggingFaceM4/Idefics3-8B-Llama3"
prompt = f"{question}\n" llm = LLM(
llm = LLM(model="adept/fuyu-8b") model=model_name,
max_model_len=8192,
max_num_seqs=2,
enforce_eager=True,
# if you are running out of memory, you can reduce the "longest_edge".
# see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
mm_processor_kwargs={
"size": {
"longest_edge": 3 * 364
},
},
mm_cache_preprocessor=args.mm_cache_preprocessor,
)
prompt = (
f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:"
)
stop_token_ids = None stop_token_ids = None
return llm, prompt, stop_token_ids return llm, prompt, stop_token_ids
# Phi-3-Vision # InternVL
def run_phi3v(question, modality): def run_internvl(question: str, modality: str):
assert modality == "image" assert modality == "image"
prompt = f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n" # noqa: E501 model_name = "OpenGVLab/InternVL2-2B"
# Note: The default setting of max_num_seqs (256) and
# max_model_len (128k) for this model may cause OOM.
# You may lower either to run this example on lower-end GPUs.
# In this example, we override max_num_seqs to 5 while
# keeping the original context length of 128k.
# num_crops is an override kwarg to the multimodal image processor;
# For some models, e.g., Phi-3.5-vision-instruct, it is recommended
# to use 16 for single frame scenarios, and 4 for multi-frame.
#
# Generally speaking, a larger value for num_crops results in more
# tokens per image instance, because it may scale the image more in
# the image preprocessing. Some references in the model docs and the
# formula for image tokens after the preprocessing
# transform can be found below.
#
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
llm = LLM( llm = LLM(
model="microsoft/Phi-3-vision-128k-instruct", model=model_name,
trust_remote_code=True, trust_remote_code=True,
max_num_seqs=5, max_model_len=4096,
mm_processor_kwargs={"num_crops": 16}, mm_cache_preprocessor=args.mm_cache_preprocessor,
) )
stop_token_ids = None
tokenizer = AutoTokenizer.from_pretrained(model_name,
trust_remote_code=True)
messages = [{'role': 'user', 'content': f"<image>\n{question}"}]
prompt = tokenizer.apply_chat_template(messages,
tokenize=False,
add_generation_prompt=True)
# Stop tokens for InternVL
# models variants may have different stop tokens
# please refer to the model card for the correct "stop words":
# https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
return llm, prompt, stop_token_ids return llm, prompt, stop_token_ids
# PaliGemma # LLaVA-1.5
def run_paligemma(question, modality): def run_llava(question: str, modality: str):
assert modality == "image" assert modality == "image"
# PaliGemma has special prompt format for VQA prompt = f"USER: <image>\n{question}\nASSISTANT:"
prompt = "caption en"
llm = LLM(model="google/paligemma-3b-mix-224") llm = LLM(model="llava-hf/llava-1.5-7b-hf",
max_model_len=4096,
mm_cache_preprocessor=args.mm_cache_preprocessor)
stop_token_ids = None stop_token_ids = None
return llm, prompt, stop_token_ids return llm, prompt, stop_token_ids
# Chameleon # LLaVA-1.6/LLaVA-NeXT
def run_chameleon(question, modality): def run_llava_next(question: str, modality: str):
assert modality == "image" assert modality == "image"
prompt = f"{question}<image>" prompt = f"[INST] <image>\n{question} [/INST]"
llm = LLM(model="facebook/chameleon-7b") llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf",
max_model_len=8192,
mm_cache_preprocessor=args.mm_cache_preprocessor)
stop_token_ids = None
return llm, prompt, stop_token_ids
# LlaVA-NeXT-Video
# Currently only support for video input
def run_llava_next_video(question: str, modality: str):
assert modality == "video"
prompt = f"USER: <video>\n{question} ASSISTANT:"
llm = LLM(model="llava-hf/LLaVA-NeXT-Video-7B-hf",
max_model_len=8192,
mm_cache_preprocessor=args.mm_cache_preprocessor)
stop_token_ids = None stop_token_ids = None
return llm, prompt, stop_token_ids return llm, prompt, stop_token_ids
# LLaVA-OneVision
def run_llava_onevision(question: str, modality: str):
if modality == "video":
prompt = f"<|im_start|>user <video>\n{question}<|im_end|> \
<|im_start|>assistant\n"
elif modality == "image":
prompt = f"<|im_start|>user <image>\n{question}<|im_end|> \
<|im_start|>assistant\n"
llm = LLM(model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
max_model_len=16384,
mm_cache_preprocessor=args.mm_cache_preprocessor)
stop_token_ids = None
return llm, prompt, stop_token_ids
# Mantis
def run_mantis(question: str, modality: str):
assert modality == "image"
llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n' # noqa: E501
prompt = llama3_template.format(f"{question}\n<image>")
llm = LLM(
model="TIGER-Lab/Mantis-8B-siglip-llama3",
max_model_len=4096,
hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
mm_cache_preprocessor=args.mm_cache_preprocessor,
)
stop_token_ids = [128009]
return llm, prompt, stop_token_ids
# MiniCPM-V # MiniCPM-V
def run_minicpmv(question, modality): def run_minicpmv(question: str, modality: str):
assert modality == "image" assert modality == "image"
# 2.0 # 2.0
...@@ -145,7 +263,10 @@ def run_minicpmv(question, modality): ...@@ -145,7 +263,10 @@ def run_minicpmv(question, modality):
trust_remote_code=True) trust_remote_code=True)
llm = LLM( llm = LLM(
model=model_name, model=model_name,
max_model_len=4096,
max_num_seqs=2,
trust_remote_code=True, trust_remote_code=True,
mm_cache_preprocessor=args.mm_cache_preprocessor,
) )
# NOTE The stop_token_ids are different for various versions of MiniCPM-V # NOTE The stop_token_ids are different for various versions of MiniCPM-V
# 2.0 # 2.0
...@@ -168,16 +289,61 @@ def run_minicpmv(question, modality): ...@@ -168,16 +289,61 @@ def run_minicpmv(question, modality):
return llm, prompt, stop_token_ids return llm, prompt, stop_token_ids
# InternVL # LLama 3.2
def run_internvl(question, modality): def run_mllama(question: str, modality: str):
assert modality == "image" assert modality == "image"
model_name = "OpenGVLab/InternVL2-2B" model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
# Note: The default setting of max_num_seqs (256) and
# max_model_len (131072) for this model may cause OOM.
# You may lower either to run this example on lower-end GPUs.
# The configuration below has been confirmed to launch on a single L40 GPU.
llm = LLM(
model=model_name,
max_model_len=4096,
max_num_seqs=16,
enforce_eager=True,
mm_cache_preprocessor=args.mm_cache_preprocessor,
)
prompt = f"<|image|><|begin_of_text|>{question}"
stop_token_ids = None
return llm, prompt, stop_token_ids
# Molmo
def run_molmo(question, modality):
assert modality == "image"
model_name = "allenai/Molmo-7B-D-0924"
llm = LLM( llm = LLM(
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
max_num_seqs=5, dtype="bfloat16",
mm_cache_preprocessor=args.mm_cache_preprocessor,
)
prompt = question
stop_token_ids = None
return llm, prompt, stop_token_ids
# NVLM-D
def run_nvlm_d(question: str, modality: str):
assert modality == "image"
model_name = "nvidia/NVLM-D-72B"
# Adjust this as necessary to fit in GPU
llm = LLM(
model=model_name,
trust_remote_code=True,
max_model_len=4096,
tensor_parallel_size=4,
mm_cache_preprocessor=args.mm_cache_preprocessor,
) )
tokenizer = AutoTokenizer.from_pretrained(model_name, tokenizer = AutoTokenizer.from_pretrained(model_name,
...@@ -186,81 +352,121 @@ def run_internvl(question, modality): ...@@ -186,81 +352,121 @@ def run_internvl(question, modality):
prompt = tokenizer.apply_chat_template(messages, prompt = tokenizer.apply_chat_template(messages,
tokenize=False, tokenize=False,
add_generation_prompt=True) add_generation_prompt=True)
stop_token_ids = None
return llm, prompt, stop_token_ids
# Stop tokens for InternVL
# models variants may have different stop tokens # PaliGemma
# please refer to the model card for the correct "stop words": def run_paligemma(question: str, modality: str):
# https://huggingface.co/OpenGVLab/InternVL2-2B#service assert modality == "image"
stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] # PaliGemma has special prompt format for VQA
prompt = "caption en"
llm = LLM(model="google/paligemma-3b-mix-224",
mm_cache_preprocessor=args.mm_cache_preprocessor)
stop_token_ids = None
return llm, prompt, stop_token_ids return llm, prompt, stop_token_ids
# BLIP-2 # PaliGemma 2
def run_blip2(question, modality): def run_paligemma2(question: str, modality: str):
assert modality == "image" assert modality == "image"
# BLIP-2 prompt format is inaccurate on HuggingFace model repository. # PaliGemma 2 has special prompt format for VQA
# See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa prompt = "caption en"
prompt = f"Question: {question} Answer:" llm = LLM(model="google/paligemma2-3b-ft-docci-448",
llm = LLM(model="Salesforce/blip2-opt-2.7b") mm_cache_preprocessor=args.mm_cache_preprocessor)
stop_token_ids = None stop_token_ids = None
return llm, prompt, stop_token_ids return llm, prompt, stop_token_ids
# Qwen # Phi-3-Vision
def run_qwen_vl(question, modality): def run_phi3v(question: str, modality: str):
assert modality == "image" assert modality == "image"
prompt = f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n"
# num_crops is an override kwarg to the multimodal image processor;
# For some models, e.g., Phi-3.5-vision-instruct, it is recommended
# to use 16 for single frame scenarios, and 4 for multi-frame.
#
# Generally speaking, a larger value for num_crops results in more
# tokens per image instance, because it may scale the image more in
# the image preprocessing. Some references in the model docs and the
# formula for image tokens after the preprocessing
# transform can be found below.
#
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct#loading-the-model-locally
# https://huggingface.co/microsoft/Phi-3.5-vision-instruct/blob/main/processing_phi3_v.py#L194
llm = LLM( llm = LLM(
model="Qwen/Qwen-VL", model="microsoft/Phi-3.5-vision-instruct",
trust_remote_code=True, trust_remote_code=True,
max_num_seqs=5, max_model_len=4096,
max_num_seqs=2,
# Note - mm_processor_kwargs can also be passed to generate/chat calls
mm_processor_kwargs={"num_crops": 16},
mm_cache_preprocessor=args.mm_cache_preprocessor,
) )
prompt = f"{question}Picture 1: <img></img>\n"
stop_token_ids = None stop_token_ids = None
return llm, prompt, stop_token_ids return llm, prompt, stop_token_ids
# Qwen2-VL # Pixtral HF-format
def run_qwen2_vl(question, modality): def run_pixtral_hf(question: str, modality: str):
assert modality == "image" assert modality == "image"
model_name = "Qwen/Qwen2-VL-7B-Instruct" model_name = "mistral-community/pixtral-12b"
llm = LLM( llm = LLM(
model=model_name, model=model_name,
max_num_seqs=5, max_model_len=8192,
mm_cache_preprocessor=args.mm_cache_preprocessor,
) )
prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" prompt = f"<s>[INST]{question}\n[IMG][/INST]"
"<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
f"{question}<|im_end|>\n"
"<|im_start|>assistant\n")
stop_token_ids = None stop_token_ids = None
return llm, prompt, stop_token_ids return llm, prompt, stop_token_ids
# LLama # Qwen
def run_mllama(question, modality): def run_qwen_vl(question: str, modality: str):
assert modality == "image" assert modality == "image"
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct" llm = LLM(
model="Qwen/Qwen-VL",
trust_remote_code=True,
max_model_len=1024,
max_num_seqs=2,
mm_cache_preprocessor=args.mm_cache_preprocessor,
)
prompt = f"{question}Picture 1: <img></img>\n"
stop_token_ids = None
return llm, prompt, stop_token_ids
# Note: The default setting of max_num_seqs (256) and
# max_model_len (131072) for this model may cause OOM.
# You may lower either to run this example on lower-end GPUs.
# The configuration below has been confirmed to launch on a # Qwen2-VL
# single H100 GPU. def run_qwen2_vl(question: str, modality: str):
assert modality == "image"
model_name = "Qwen/Qwen2-VL-7B-Instruct"
llm = LLM( llm = LLM(
model=model_name, model=model_name,
max_num_seqs=16, max_model_len=4096,
enforce_eager=True, max_num_seqs=5,
# Note - mm_processor_kwargs can also be passed to generate/chat calls
mm_processor_kwargs={
"min_pixels": 28 * 28,
"max_pixels": 1280 * 28 * 28,
},
mm_cache_preprocessor=args.mm_cache_preprocessor,
) )
prompt = f"<|image|><|begin_of_text|>{question}" prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
"<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
f"{question}<|im_end|>\n"
"<|im_start|>assistant\n")
stop_token_ids = None stop_token_ids = None
return llm, prompt, stop_token_ids return llm, prompt, stop_token_ids
...@@ -281,21 +487,29 @@ def run_glm4v(question: str, modality: str): ...@@ -281,21 +487,29 @@ def run_glm4v(question: str, modality: str):
model_example_map = { model_example_map = {
"aria": run_aria,
"blip-2": run_blip2,
"chameleon": run_chameleon,
"fuyu": run_fuyu,
"glm4v": run_glm4v,
"h2ovl_chat": run_h2ovl,
"idefics3": run_idefics3,
"internvl_chat": run_internvl,
"llava": run_llava, "llava": run_llava,
"llava-next": run_llava_next, "llava-next": run_llava_next,
"llava-next-video": run_llava_next_video, "llava-next-video": run_llava_next_video,
"llava-onevision": run_llava_onevision, "llava-onevision": run_llava_onevision,
"fuyu": run_fuyu, "mantis": run_mantis,
"phi3_v": run_phi3v,
"paligemma": run_paligemma,
"chameleon": run_chameleon,
"minicpmv": run_minicpmv, "minicpmv": run_minicpmv,
"blip-2": run_blip2, "mllama": run_mllama,
"internvl_chat": run_internvl, "molmo": run_molmo,
"NVLM_D": run_nvlm_d,
"paligemma": run_paligemma,
"paligemma2": run_paligemma2,
"phi3_v": run_phi3v,
"pixtral_hf": run_pixtral_hf,
"qwen_vl": run_qwen_vl, "qwen_vl": run_qwen_vl,
"qwen2_vl": run_qwen2_vl, "qwen2_vl": run_qwen2_vl,
"mllama": run_mllama,
"glm4v": run_glm4v,
} }
...@@ -332,6 +546,35 @@ def get_multi_modal_input(args): ...@@ -332,6 +546,35 @@ def get_multi_modal_input(args):
raise ValueError(msg) raise ValueError(msg)
def apply_image_repeat(image_repeat_prob, num_prompts, data, prompt, modality):
"""Repeats images with provided probability of "image_repeat_prob".
Used to simulate hit/miss for the MM preprocessor cache.
"""
assert (image_repeat_prob <= 1.0 and image_repeat_prob >= 0)
no_yes = [0, 1]
probs = [1.0 - image_repeat_prob, image_repeat_prob]
inputs = []
cur_image = data
for i in range(num_prompts):
if image_repeat_prob is not None:
res = random.choices(no_yes, probs)[0]
if res == 0:
# No repeat => Modify one pixel
cur_image = cur_image.copy()
new_val = (i // 256 // 256, i // 256, i % 256)
cur_image.putpixel((0, 0), new_val)
inputs.append({
"prompt": prompt,
"multi_modal_data": {
modality: cur_image
}
})
return inputs
def main(args): def main(args):
model = args.model_type model = args.model_type
if model not in model_example_map: if model not in model_example_map:
...@@ -362,14 +605,29 @@ def main(args): ...@@ -362,14 +605,29 @@ def main(args):
else: else:
# Batch inference # Batch inference
inputs = [{ if args.image_repeat_prob is not None:
"prompt": prompt, # Repeat images with specified probability of "image_repeat_prob"
"multi_modal_data": { inputs = apply_image_repeat(args.image_repeat_prob,
modality: data args.num_prompts, data, prompt,
}, modality)
} for _ in range(args.num_prompts)] else:
# Use the same image for all prompts
inputs = [{
"prompt": prompt,
"multi_modal_data": {
modality: data
},
} for _ in range(args.num_prompts)]
if args.time_generate:
import time
start_time = time.time()
outputs = llm.generate(inputs, sampling_params=sampling_params)
elapsed_time = time.time() - start_time
print("-- generate time = {}".format(elapsed_time))
outputs = llm.generate(inputs, sampling_params=sampling_params) else:
outputs = llm.generate(inputs, sampling_params=sampling_params)
for o in outputs: for o in outputs:
generated_text = o.outputs[0].text generated_text = o.outputs[0].text
...@@ -379,7 +637,7 @@ def main(args): ...@@ -379,7 +637,7 @@ def main(args):
if __name__ == "__main__": if __name__ == "__main__":
parser = FlexibleArgumentParser( parser = FlexibleArgumentParser(
description='Demo on using vLLM for offline inference with ' description='Demo on using vLLM for offline inference with '
'vision language models') 'vision language models for text generation')
parser.add_argument('--model-type', parser.add_argument('--model-type',
'-m', '-m',
type=str, type=str,
...@@ -399,5 +657,23 @@ if __name__ == "__main__": ...@@ -399,5 +657,23 @@ if __name__ == "__main__":
type=int, type=int,
default=16, default=16,
help='Number of frames to extract from the video.') help='Number of frames to extract from the video.')
parser.add_argument(
'--image-repeat-prob',
type=float,
default=None,
help='Simulates the hit-ratio for multi-modal preprocessor cache'
' (if enabled)')
parser.add_argument(
'--mm-cache-preprocessor',
action='store_true',
help='If True, enable caching of multi-modal preprocessor/mapper.')
parser.add_argument(
'--time-generate',
action='store_true',
help='If True, then print the total generate() call time')
args = parser.parse_args() args = parser.parse_args()
main(args) main(args)
"""
This example shows how to use vLLM for running offline inference with
the correct prompt format on vision language models for multimodal embedding.
For most models, the prompt format should follow corresponding examples
on HuggingFace model repository.
"""
from argparse import Namespace
from typing import Literal, NamedTuple, Optional, TypedDict, Union, get_args
from PIL.Image import Image
from vllm import LLM
from vllm.multimodal.utils import fetch_image
from vllm.utils import FlexibleArgumentParser
class TextQuery(TypedDict):
modality: Literal["text"]
text: str
class ImageQuery(TypedDict):
modality: Literal["image"]
image: Image
class TextImageQuery(TypedDict):
modality: Literal["text+image"]
text: str
image: Image
QueryModality = Literal["text", "image", "text+image"]
Query = Union[TextQuery, ImageQuery, TextImageQuery]
class ModelRequestData(NamedTuple):
llm: LLM
prompt: str
image: Optional[Image]
def run_e5_v(query: Query):
llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n' # noqa: E501
if query["modality"] == "text":
text = query["text"]
prompt = llama3_template.format(
f"{text}\nSummary above sentence in one word: ")
image = None
elif query["modality"] == "image":
prompt = llama3_template.format(
"<image>\nSummary above image in one word: ")
image = query["image"]
else:
modality = query['modality']
raise ValueError(f"Unsupported query modality: '{modality}'")
llm = LLM(
model="royokong/e5-v",
task="embed",
max_model_len=4096,
)
return ModelRequestData(
llm=llm,
prompt=prompt,
image=image,
)
def run_vlm2vec(query: Query):
if query["modality"] == "text":
text = query["text"]
prompt = f"Find me an everyday image that matches the given caption: {text}" # noqa: E501
image = None
elif query["modality"] == "image":
prompt = "<|image_1|> Find a day-to-day image that looks similar to the provided image." # noqa: E501
image = query["image"]
elif query["modality"] == "text+image":
text = query["text"]
prompt = f"<|image_1|> Represent the given image with the following question: {text}" # noqa: E501
image = query["image"]
else:
modality = query['modality']
raise ValueError(f"Unsupported query modality: '{modality}'")
llm = LLM(
model="TIGER-Lab/VLM2Vec-Full",
task="embed",
trust_remote_code=True,
mm_processor_kwargs={"num_crops": 4},
)
return ModelRequestData(
llm=llm,
prompt=prompt,
image=image,
)
def get_query(modality: QueryModality):
if modality == "text":
return TextQuery(modality="text", text="A dog sitting in the grass")
if modality == "image":
return ImageQuery(
modality="image",
image=fetch_image(
"https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/American_Eskimo_Dog.jpg/360px-American_Eskimo_Dog.jpg" # noqa: E501
),
)
if modality == "text+image":
return TextImageQuery(
modality="text+image",
text="A cat standing in the snow.",
image=fetch_image(
"https://upload.wikimedia.org/wikipedia/commons/thumb/b/b6/Felis_catus-cat_on_snow.jpg/179px-Felis_catus-cat_on_snow.jpg" # noqa: E501
),
)
msg = f"Modality {modality} is not supported."
raise ValueError(msg)
def run_encode(model: str, modality: QueryModality):
query = get_query(modality)
req_data = model_example_map[model](query)
mm_data = {}
if req_data.image is not None:
mm_data["image"] = req_data.image
outputs = req_data.llm.embed({
"prompt": req_data.prompt,
"multi_modal_data": mm_data,
})
for output in outputs:
print(output.outputs.embedding)
def main(args: Namespace):
run_encode(args.model_name, args.modality)
model_example_map = {
"e5_v": run_e5_v,
"vlm2vec": run_vlm2vec,
}
if __name__ == "__main__":
parser = FlexibleArgumentParser(
description='Demo on using vLLM for offline inference with '
'vision language models for multimodal embedding')
parser.add_argument('--model-name',
'-m',
type=str,
default="vlm2vec",
choices=model_example_map.keys(),
help='The name of the embedding model.')
parser.add_argument('--modality',
type=str,
default="image",
choices=get_args(QueryModality),
help='Modality of the input.')
args = parser.parse_args()
main(args)
""" """
This example shows how to use vLLM for running offline inference with This example shows how to use vLLM for running offline inference with
multi-image input on vision language models, using the chat template defined multi-image input on vision language models for text generation,
by the model. using the chat template defined by the model.
""" """
from argparse import Namespace from argparse import Namespace
from typing import List, NamedTuple, Optional from typing import List, NamedTuple, Optional
...@@ -28,41 +28,184 @@ class ModelRequestData(NamedTuple): ...@@ -28,41 +28,184 @@ class ModelRequestData(NamedTuple):
chat_template: Optional[str] chat_template: Optional[str]
def load_qwenvl_chat(question: str, image_urls: List[str]) -> ModelRequestData: # NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
model_name = "Qwen/Qwen-VL-Chat" # lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.
def load_aria(question, image_urls: List[str]) -> ModelRequestData:
model_name = "rhymes-ai/Aria"
llm = LLM(model=model_name,
tokenizer_mode="slow",
trust_remote_code=True,
dtype="bfloat16",
limit_mm_per_prompt={"image": len(image_urls)})
placeholders = "<fim_prefix><|img|><fim_suffix>\n" * len(image_urls)
prompt = (f"<|im_start|>user\n{placeholders}{question}<|im_end|>\n"
"<|im_start|>assistant\n")
stop_token_ids = [93532, 93653, 944, 93421, 1019, 93653, 93519]
return ModelRequestData(
llm=llm,
prompt=prompt,
stop_token_ids=stop_token_ids,
image_data=[fetch_image(url) for url in image_urls],
chat_template=None)
def load_h2onvl(question: str, image_urls: List[str]) -> ModelRequestData:
model_name = "h2oai/h2ovl-mississippi-2b"
llm = LLM( llm = LLM(
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
max_num_seqs=5, max_model_len=8192,
limit_mm_per_prompt={"image": len(image_urls)}, limit_mm_per_prompt={"image": len(image_urls)},
mm_processor_kwargs={"max_dynamic_patch": 4},
) )
placeholders = "".join(f"Picture {i}: <img></img>\n"
for i, _ in enumerate(image_urls, start=1))
# This model does not have a chat_template attribute on its tokenizer, placeholders = "\n".join(f"Image-{i}: <image>\n"
# so we need to explicitly pass it. We use ChatML since it's used in the for i, _ in enumerate(image_urls, start=1))
# generation utils of the model: messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}]
# https://huggingface.co/Qwen/Qwen-VL-Chat/blob/main/qwen_generation_utils.py#L265
tokenizer = AutoTokenizer.from_pretrained(model_name, tokenizer = AutoTokenizer.from_pretrained(model_name,
trust_remote_code=True) trust_remote_code=True)
prompt = tokenizer.apply_chat_template(messages,
tokenize=False,
add_generation_prompt=True)
# Copied from: https://huggingface.co/docs/transformers/main/en/chat_templating # Stop tokens for H2OVL-Mississippi
chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" # noqa: E501 # https://huggingface.co/h2oai/h2ovl-mississippi-2b
stop_token_ids = [tokenizer.eos_token_id]
return ModelRequestData(
llm=llm,
prompt=prompt,
stop_token_ids=stop_token_ids,
image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
)
def load_idefics3(question, image_urls: List[str]) -> ModelRequestData:
model_name = "HuggingFaceM4/Idefics3-8B-Llama3"
# The configuration below has been confirmed to launch on a single L40 GPU.
llm = LLM(
model=model_name,
max_model_len=8192,
max_num_seqs=16,
enforce_eager=True,
limit_mm_per_prompt={"image": len(image_urls)},
# if you are running out of memory, you can reduce the "longest_edge".
# see: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3#model-optimizations
mm_processor_kwargs={
"size": {
"longest_edge": 2 * 364
},
},
)
placeholders = "\n".join(f"Image-{i}: <image>\n"
for i, _ in enumerate(image_urls, start=1))
prompt = f"<|begin_of_text|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:" # noqa: E501
return ModelRequestData(
llm=llm,
prompt=prompt,
stop_token_ids=None,
image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
)
def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData:
model_name = "OpenGVLab/InternVL2-2B"
llm = LLM(
model=model_name,
trust_remote_code=True,
max_model_len=4096,
limit_mm_per_prompt={"image": len(image_urls)},
mm_processor_kwargs={"max_dynamic_patch": 4},
)
placeholders = "\n".join(f"Image-{i}: <image>\n"
for i, _ in enumerate(image_urls, start=1))
messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}] messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}]
tokenizer = AutoTokenizer.from_pretrained(model_name,
trust_remote_code=True)
prompt = tokenizer.apply_chat_template(messages, prompt = tokenizer.apply_chat_template(messages,
tokenize=False, tokenize=False,
add_generation_prompt=True, add_generation_prompt=True)
chat_template=chat_template)
stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>"] # Stop tokens for InternVL
# models variants may have different stop tokens
# please refer to the model card for the correct "stop words":
# https://huggingface.co/OpenGVLab/InternVL2-2B/blob/main/conversation.py
stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
return ModelRequestData( return ModelRequestData(
llm=llm, llm=llm,
prompt=prompt, prompt=prompt,
stop_token_ids=stop_token_ids, stop_token_ids=stop_token_ids,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
chat_template=chat_template, chat_template=None,
)
def load_mllama(question, image_urls: List[str]) -> ModelRequestData:
model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
# The configuration below has been confirmed to launch on a single L40 GPU.
llm = LLM(
model=model_name,
max_model_len=4096,
max_num_seqs=16,
enforce_eager=True,
limit_mm_per_prompt={"image": len(image_urls)},
)
prompt = f"<|image|><|image|><|begin_of_text|>{question}"
return ModelRequestData(
llm=llm,
prompt=prompt,
stop_token_ids=None,
image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
)
def load_nvlm_d(question: str, image_urls: List[str]):
model_name = "nvidia/NVLM-D-72B"
# Adjust this as necessary to fit in GPU
llm = LLM(
model=model_name,
trust_remote_code=True,
max_model_len=8192,
tensor_parallel_size=4,
limit_mm_per_prompt={"image": len(image_urls)},
mm_processor_kwargs={"max_dynamic_patch": 4},
)
placeholders = "\n".join(f"Image-{i}: <image>\n"
for i, _ in enumerate(image_urls, start=1))
messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}]
tokenizer = AutoTokenizer.from_pretrained(model_name,
trust_remote_code=True)
prompt = tokenizer.apply_chat_template(messages,
tokenize=False,
add_generation_prompt=True)
stop_token_ids = None
return ModelRequestData(
llm=llm,
prompt=prompt,
stop_token_ids=stop_token_ids,
image_data=[fetch_image(url) for url in image_urls],
chat_template=None,
) )
...@@ -83,6 +226,7 @@ def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData: ...@@ -83,6 +226,7 @@ def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData:
model="microsoft/Phi-3.5-vision-instruct", model="microsoft/Phi-3.5-vision-instruct",
trust_remote_code=True, trust_remote_code=True,
max_model_len=4096, max_model_len=4096,
max_num_seqs=2,
limit_mm_per_prompt={"image": len(image_urls)}, limit_mm_per_prompt={"image": len(image_urls)},
mm_processor_kwargs={"num_crops": 4}, mm_processor_kwargs={"num_crops": 4},
) )
...@@ -100,40 +244,42 @@ def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData: ...@@ -100,40 +244,42 @@ def load_phi3v(question: str, image_urls: List[str]) -> ModelRequestData:
) )
def load_internvl(question: str, image_urls: List[str]) -> ModelRequestData: def load_qwenvl_chat(question: str, image_urls: List[str]) -> ModelRequestData:
model_name = "OpenGVLab/InternVL2-2B" model_name = "Qwen/Qwen-VL-Chat"
llm = LLM( llm = LLM(
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
max_num_seqs=5, max_model_len=1024,
max_model_len=4096, max_num_seqs=2,
limit_mm_per_prompt={"image": len(image_urls)}, limit_mm_per_prompt={"image": len(image_urls)},
) )
placeholders = "".join(f"Picture {i}: <img></img>\n"
for i, _ in enumerate(image_urls, start=1))
placeholders = "\n".join(f"Image-{i}: <image>\n" # This model does not have a chat_template attribute on its tokenizer,
for i, _ in enumerate(image_urls, start=1)) # so we need to explicitly pass it. We use ChatML since it's used in the
messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}] # generation utils of the model:
# https://huggingface.co/Qwen/Qwen-VL-Chat/blob/main/qwen_generation_utils.py#L265
tokenizer = AutoTokenizer.from_pretrained(model_name, tokenizer = AutoTokenizer.from_pretrained(model_name,
trust_remote_code=True) trust_remote_code=True)
# Copied from: https://huggingface.co/docs/transformers/main/en/chat_templating
chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" # noqa: E501
messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}]
prompt = tokenizer.apply_chat_template(messages, prompt = tokenizer.apply_chat_template(messages,
tokenize=False, tokenize=False,
add_generation_prompt=True) add_generation_prompt=True,
chat_template=chat_template)
# Stop tokens for InternVL stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>"]
# models variants may have different stop tokens
# please refer to the model card for the correct "stop words":
# https://huggingface.co/OpenGVLab/InternVL2-2B#service
stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens] stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
return ModelRequestData( return ModelRequestData(
llm=llm, llm=llm,
prompt=prompt, prompt=prompt,
stop_token_ids=stop_token_ids, stop_token_ids=stop_token_ids,
image_data=[fetch_image(url) for url in image_urls], image_data=[fetch_image(url) for url in image_urls],
chat_template=None, chat_template=chat_template,
) )
...@@ -148,10 +294,11 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData: ...@@ -148,10 +294,11 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData:
model_name = "Qwen/Qwen2-VL-7B-Instruct" model_name = "Qwen/Qwen2-VL-7B-Instruct"
# Tested on L40
llm = LLM( llm = LLM(
model=model_name, model=model_name,
max_num_seqs=5,
max_model_len=32768 if process_vision_info is None else 4096, max_model_len=32768 if process_vision_info is None else 4096,
max_num_seqs=5,
limit_mm_per_prompt={"image": len(image_urls)}, limit_mm_per_prompt={"image": len(image_urls)},
) )
...@@ -194,10 +341,15 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData: ...@@ -194,10 +341,15 @@ def load_qwen2_vl(question, image_urls: List[str]) -> ModelRequestData:
model_example_map = { model_example_map = {
"phi3_v": load_phi3v, "aria": load_aria,
"h2ovl_chat": load_h2onvl,
"idefics3": load_idefics3,
"internvl_chat": load_internvl, "internvl_chat": load_internvl,
"qwen2_vl": load_qwen2_vl, "mllama": load_mllama,
"NVLM_D": load_nvlm_d,
"phi3_v": load_phi3v,
"qwen_vl_chat": load_qwenvl_chat, "qwen_vl_chat": load_qwenvl_chat,
"qwen2_vl": load_qwen2_vl,
} }
...@@ -269,7 +421,8 @@ def main(args: Namespace): ...@@ -269,7 +421,8 @@ def main(args: Namespace):
if __name__ == "__main__": if __name__ == "__main__":
parser = FlexibleArgumentParser( parser = FlexibleArgumentParser(
description='Demo on using vLLM for offline inference with ' description='Demo on using vLLM for offline inference with '
'vision language models that support multi-image input') 'vision language models that support multi-image input for text '
'generation')
parser.add_argument('--model-type', parser.add_argument('--model-type',
'-m', '-m',
type=str, type=str,
......
from time import time
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.distributed import cleanup_dist_env_and_memory
# NOTE: This is just a running example. For benchmarking purpose,
# please see benchmarks/benchmark_prefix_caching.py
# Common prefix. # Common prefix.
prefix = ( prefix = (
...@@ -27,19 +29,14 @@ generating_prompts = [prefix + prompt for prompt in prompts] ...@@ -27,19 +29,14 @@ generating_prompts = [prefix + prompt for prompt in prompts]
# Create a sampling params object. # Create a sampling params object.
sampling_params = SamplingParams(temperature=0.0) sampling_params = SamplingParams(temperature=0.0)
# Create an LLM. # Create an LLM without prefix caching as a baseline.
regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.4) regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.4)
prefix_cached_llm = LLM(model="facebook/opt-125m",
enable_prefix_caching=True,
gpu_memory_utilization=0.4)
print("Results without `enable_prefix_caching`") print("Results without `enable_prefix_caching`")
# Generate texts from the prompts. The output is a list of RequestOutput objects # Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information. # that contain the prompt, generated text, and other information.
start_time_regular = time()
outputs = regular_llm.generate(generating_prompts, sampling_params) outputs = regular_llm.generate(generating_prompts, sampling_params)
duration_regular = time() - start_time_regular
regular_generated_texts = [] regular_generated_texts = []
# Print the outputs. # Print the outputs.
...@@ -51,13 +48,20 @@ for output in outputs: ...@@ -51,13 +48,20 @@ for output in outputs:
print("-" * 80) print("-" * 80)
# Destroy the LLM object and free up the GPU memory.
del regular_llm
cleanup_dist_env_and_memory()
# Create an LLM with prefix caching enabled.
prefix_cached_llm = LLM(model="facebook/opt-125m",
enable_prefix_caching=True,
gpu_memory_utilization=0.4)
# Warmup so that the shared prompt's KV cache is computed. # Warmup so that the shared prompt's KV cache is computed.
prefix_cached_llm.generate(generating_prompts[0], sampling_params) prefix_cached_llm.generate(generating_prompts[0], sampling_params)
# Generate with prefix caching. # Generate with prefix caching.
start_time_cached = time()
outputs = prefix_cached_llm.generate(generating_prompts, sampling_params) outputs = prefix_cached_llm.generate(generating_prompts, sampling_params)
duration_cached = time() - start_time_cached
print("Results with `enable_prefix_caching`") print("Results with `enable_prefix_caching`")
...@@ -77,6 +81,3 @@ generated_same = all([ ...@@ -77,6 +81,3 @@ generated_same = all([
for i in range(len(prompts)) for i in range(len(prompts))
]) ])
print(f"Generated answers are the same: {generated_same}") print(f"Generated answers are the same: {generated_same}")
speedup = round(duration_regular / duration_cached, 2)
print(f"Speed up of cached generation compared to the regular is: {speedup}")
import os import os
import time
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
...@@ -15,19 +16,25 @@ prompts = [ ...@@ -15,19 +16,25 @@ prompts = [
# Create a sampling params object. # Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95) sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create an LLM. if __name__ == "__main__":
llm = LLM(model="facebook/opt-125m", tensor_parallel_size=1)
llm.start_profile() # Create an LLM.
llm = LLM(model="facebook/opt-125m", tensor_parallel_size=1)
# Generate texts from the prompts. The output is a list of RequestOutput objects llm.start_profile()
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
llm.stop_profile() # Generate texts from the prompts. The output is a list of RequestOutput
# objects that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs. llm.stop_profile()
for output in outputs:
prompt = output.prompt # Print the outputs.
generated_text = output.outputs[0].text for output in outputs:
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
# Add a buffer to wait for profiler in the background process
# (in case MP is on) to finish writing profiling output.
time.sleep(10)
import inspect
import json
import os
import sys
from argparse import RawTextHelpFormatter
from dataclasses import asdict, dataclass
from typing import Any, Dict, Generator, List, Optional, TypeAlias
import torch
import tqdm
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.profiler import layerwise_profile
from vllm.utils import FlexibleArgumentParser
BATCH_SIZE_DEFAULT = 1
PROMPT_LEN_DEFAULT = 256
@dataclass
class ProfileContext:
engine_args: EngineArgs
prompt_len: int
batch_size: int
# The profiler can run in 2 modes,
# 1. Run profiler for user specified num_steps
num_steps: Optional[int] = None
# 2. Run profiler until all requests complete
complete_num_requests_per_step: Optional[int] = None
save_chrome_traces_folder: Optional[str] = None
def get_dtype(dtype: str):
if dtype == "torch.float":
return torch.float
else:
return dtype
OutputLen_NumReqs_Map: TypeAlias = Dict[int, int]
def compute_request_output_lengths(batch_size: int, step_requests: List[int]) \
-> OutputLen_NumReqs_Map:
"""
Given the number of requests, batch_size, and the number of requests
that each engine-step should process, step_requests, determine the
output lengths of the requests such that step_request is honoured.
Example:
if batch size = 128 and step_request = [128, 128, 96, 64, 32, 1]
then return,
{2 : 32, 3 : 32, 4 : 32, 5 : 31, 6 : 1}, meaning,
32 requests should have output length 2,
32 requests should have output length 3,
32 requests should have output length 4,
31 requests should have output length 5,
1 request should have output length 6.
Args:
batch_size (int): Number of requests submitted for profile. This is
args.batch_size.
step_requests (List[int]): step_requests[i] is the number of requests
that the ith engine step should process.
Returns:
OutputLen_NumReqs_Map : A dictionary with output-length as keys and the
number of requests required to have that output-length as values.
"""
ol_nr: OutputLen_NumReqs_Map = {}
# Number of request that are assigned an output-length
num_reqs_assigned: int = 0
num_steps: int = len(step_requests)
# sanity check. The first step (prefill-step), must process all requests.
assert step_requests[0] == batch_size
# Begin assignments from the last step.
output_length: int = num_steps
for num_requests_at_step in reversed(step_requests):
if num_reqs_assigned == batch_size:
break
assert num_reqs_assigned < batch_size
# Remove the number of requests that have been determined
# to participate in this step and beyond.
num_reqs_unassigned_at_step = num_requests_at_step - num_reqs_assigned
assert num_reqs_unassigned_at_step >= 0
if num_reqs_unassigned_at_step > 0:
ol_nr[output_length] = num_reqs_unassigned_at_step
num_reqs_assigned += num_reqs_unassigned_at_step
output_length -= 1
# sanity checks.
assert sum(ol_nr.values()) == batch_size, \
("Number of requests in output-length assignment does not match "
f"batch-size.\n batch size {batch_size} - "
f"step requests {step_requests} - assignments {ol_nr}")
# Check that the output-length is in [1, num-steps]. Output length must be
# at least 1 as all requests must participate in the prefill-step.
assert all(ol >= 1 and ol <= num_steps for ol in ol_nr), \
("Output lengths of requests should be in range "
f"[1, num-engine-steps].\n batch size {batch_size} - "
f"step requests {step_requests} - assignments {ol_nr}")
return ol_nr
def determine_requests_per_step(context: ProfileContext) -> List[int]:
"""
Determine number of requests each engine step should process.
If context.num_steps is set, then all engine steps process the
same number of requests and the output list is of length
context.num_steps.
If context.complete_num_requests_per_step is set, then each decode step
processes fewer and fewer requests until there are no requests to process.
In this case, the output list is as big as the number of steps
required to process all requests.
Args:
context: ProfileContext object.
Returns:
List[int]: Number of requests to process for all engine-steps.
output[i], contains the number of requests that the ith step
should process.
"""
if context.num_steps:
# All requests must run until num_engine_steps. This implies
# that their output lengths must be equal to num_engine_steps.
return [context.batch_size] * context.num_steps
assert context.complete_num_requests_per_step and \
context.complete_num_requests_per_step > 0, \
(f"Expected a positive complete_num_requests_per_step argument."
f"Instead got {context.complete_num_requests_per_step}")
# We start dropping after the first decode step.
step_requests = [
context.batch_size, # prefill
context.batch_size, # decode
]
num_running_requests = context.batch_size
num_running_requests -= context.complete_num_requests_per_step
while num_running_requests > 0:
step_requests.append(num_running_requests)
num_running_requests -= context.complete_num_requests_per_step
if step_requests[-1] != 1:
# have 1 request running at the last step. This is often
# useful
step_requests.append(1)
return step_requests
def run_profile(context: ProfileContext, csv_output: Optional[str],
json_output: Optional[str]):
print("Run profile with:")
for key, value in asdict(context).items():
print(f" {key} = {value}")
requests_per_step: List[int] = determine_requests_per_step(context)
ol_nr: OutputLen_NumReqs_Map = compute_request_output_lengths(
context.batch_size, requests_per_step)
num_steps_to_profile: int = len(requests_per_step)
max_output_len: int = max(ol_nr.keys())
assert max_output_len >= 1
# Create sampling params
sampling_params = SamplingParams(
temperature=0.8,
top_p=0.95,
# max_tokens is set on a per-request basis.
max_tokens=None,
ignore_eos=True)
# Create LLM
llm = LLM(**asdict(context.engine_args))
batch_size = context.batch_size
prompt_len = context.prompt_len
scheduler_config = llm.llm_engine.scheduler_config
max_model_len = llm.llm_engine.model_config.max_model_len
max_num_batched_tokens = scheduler_config.max_num_batched_tokens
max_num_seqs = scheduler_config.max_num_seqs
if batch_size * prompt_len > max_num_batched_tokens:
print(f"ERROR: chosen batch_size * prompt_len "
f"({batch_size} * {prompt_len} = {batch_size * prompt_len}) is "
f"larger than max_num_batched_tokens ({max_num_batched_tokens}) "
f"and therefore cannot be run in a single profile step, please "
f"choose a smaller batch size or prompt length, or increase "
f"--max-num-batched-tokens")
sys.exit(-1)
if batch_size > max_num_seqs:
print(
f"ERROR: chosen batch_size ({batch_size}) is larger than "
f"max_num_seqs ({max_num_seqs}) and therefore cannot be run in a "
f"single profile step, please choose a smaller batch size")
sys.exit(-1)
print("llm.llm_engine.model_config.max_model_len: ",
llm.llm_engine.model_config.max_model_len)
if prompt_len + max_output_len > llm.llm_engine.model_config.max_model_len:
print(f"ERROR: chosen prompt_len + max_output_len ({prompt_len} + "
f"{max_output_len} = {prompt_len + max_output_len}) is larger "
f"than the model's max_model_len ({max_model_len}), please "
f"choose a smaller prompt_len or max_output_len, or increase "
f"--max-model-len")
sys.exit(-1)
def add_requests():
def get_output_len_generator() -> Generator[int, Any, Any]:
for output_len, num_reqs in ol_nr.items():
for _ in range(num_reqs):
yield output_len
output_len_generator = get_output_len_generator()
for i in range(batch_size):
sampling_params.max_tokens = next(output_len_generator)
assert isinstance(sampling_params.max_tokens, int)
prompt_token_ids = torch.randint(
llm.llm_engine.model_config.get_vocab_size(),
size=(prompt_len, )).tolist()
llm.llm_engine.add_request(
request_id=f"seq{i}",
prompt={'prompt_token_ids': prompt_token_ids},
params=sampling_params)
def abort_requests():
for i in range(batch_size):
llm.llm_engine.abort_request(f"seq{i}")
# Warm up run
print("Warm up run ...")
add_requests()
llm.llm_engine.step() # Prefill
llm.llm_engine.step() # Decode
abort_requests()
print("Profile run ...")
add_requests()
with layerwise_profile() as prefill_prof:
llm.llm_engine.step() # First step is prefill
decode_profs = []
for _ in tqdm.tqdm(range(num_steps_to_profile - 1)):
num_running_seqs = llm.llm_engine.scheduler[
0].get_num_unfinished_seq_groups()
with layerwise_profile(
num_running_seqs=num_running_seqs) as decode_prof:
llm.llm_engine.step()
decode_profs.append(decode_prof)
decode_results_list = [prof.results for prof in decode_profs]
prefill_results = prefill_prof.results
has_decode = len(decode_results_list) > 0
LINE_WIDTH = 80
print("=" * LINE_WIDTH)
print(f"= Prefill Model Table "
f"(prompt_len={prompt_len}, batch_size={batch_size})")
print("=" * LINE_WIDTH)
print()
prefill_results.print_model_table()
if has_decode:
print()
print("=" * LINE_WIDTH)
print(f"= First Decode Step Model Table "
f"(prompt_len={prompt_len}, batch_size={batch_size})")
print("=" * LINE_WIDTH)
print()
decode_results_list[0].print_model_table()
print()
print("=" * LINE_WIDTH)
print(f"= Prefill Summary Table "
f"(prompt_len={prompt_len}, batch_size={batch_size})")
print("=" * LINE_WIDTH)
print()
prefill_results.print_summary_table()
if has_decode:
print()
print("=" * LINE_WIDTH)
print(f"= First Decode Step Summary Table "
f"(prompt_len={prompt_len}, batch_size={batch_size})")
print("=" * LINE_WIDTH)
print()
decode_results_list[0].print_summary_table()
if csv_output:
csv_filename_base = csv_output[:-4] \
if csv_output.endswith('.csv') else csv_output
prefill_results.export_model_stats_table_csv(
csv_filename_base + "_prefill_model_table.csv")
prefill_results.export_summary_stats_table_csv(
csv_filename_base + "_prefill_summary_table.csv")
if has_decode:
decode_results_list[0].export_model_stats_table_csv(\
csv_filename_base + "_decode_model_table.csv")
decode_results_list[0].export_summary_stats_table_csv(
csv_filename_base + "_decode_summary_table.csv")
if json_output:
cuda_devices = [
torch.cuda.get_device_properties(dev_idx)
for dev_idx in range(torch.cuda.device_count())
]
json_dict = {
"context": {
"python_version": f"{sys.version}",
"torch_version": f"{torch.__version__}",
"torch_cuda_version": f"{torch.version.cuda}",
"cuda_devices": f"{cuda_devices}",
**asdict(context)
},
"prefill": prefill_results.convert_stats_to_dict(),
}
if has_decode:
for idx, dr in enumerate(decode_results_list):
json_dict[f"decode_{idx + 1}"] = dr.convert_stats_to_dict()
# Add .json to json_output filename if it doesn't exist already.
json_output_file = json_output if json_output.endswith(
'.json') else json_output + '.json'
with open(json_output_file, "w+") as f:
json.dump(json_dict, f, indent=2)
pass
if context.save_chrome_traces_folder is not None:
os.makedirs(context.save_chrome_traces_folder, exist_ok=True)
prefill_prof.profiler.export_chrome_trace(
context.save_chrome_traces_folder + "/prefill.json")
for idx, decode_prof in enumerate(decode_profs):
decode_prof.profiler.export_chrome_trace(
context.save_chrome_traces_folder + f"/decode_{idx + 1}.json")
print("Traces saved as prefill.json and decode_1.json, etc."
f" in folder {context.save_chrome_traces_folder}")
if __name__ == "__main__":
parser = FlexibleArgumentParser(description="""
Profile a model
example:
```
python examples/offline_profile.py \\
--model neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 --batch-size 4 \\
--prompt-len 512 --max-num-batched-tokens 8196 --json Llama31-8b-FP8 \\
--enforce-eager run_num_steps -n 2
```
then you can use various tools to analyze the json output
terminal ascii tables:
```
python tools/profiler/print_layerwise_table.py \\
--json-trace Llama31-8b-FP8.json --phase prefill --table summary
```
or create matplotlib stacked bar charts:
```
python tools/profiler/visualize_layerwise_profile.py \\
--json-trace Llama31-8b-FP8.json \\
--output-directory profile_breakdown --plot-metric pct_cuda_time
```
""",
formatter_class=RawTextHelpFormatter)
parser.add_argument(
"--csv",
type=str,
default=None,
help="Export the results as multiple csv file. This should be the root "
"filename, will create <filename>_prefill_model_table.csv, "
"<filename>_prefill_summary_table.csv, "
"<filename>_decode_model_table.csv, and "
"<filename>_decode_summary_table.csv")
parser.add_argument(
"--json",
type=str,
default=None,
help="Export the results as a json file. This should be the filename")
parser.add_argument("--save-chrome-traces-folder",
type=str,
help="Save chrome traces for the prefill and decode "
"will save traces as prefill.json and decode_1.json, "
"etc. inside this folder")
parser.add_argument(
"--prompt-len",
type=int,
default=PROMPT_LEN_DEFAULT,
help=f"Length of the random prompt to use when profiling, all batched "
f"requests use the same prompt_len, default={PROMPT_LEN_DEFAULT}")
parser.add_argument("--batch-size",
type=int,
default=BATCH_SIZE_DEFAULT,
help=f"Number of requests to run as a single batch, "
f"default={BATCH_SIZE_DEFAULT}")
subparsers = parser.add_subparsers(dest="cmd")
run_num_steps_parser = subparsers.add_parser(
"run_num_steps",
help="This variation profiles n engine.step() invocations.")
run_num_steps_parser.add_argument(
'-n',
'--num-steps',
type=int,
help="Number of engine steps to profile.\n"
"Setting it to 1, profiles only the prefill step.\n"
"Setting it to 2, profiles the prefill and first decode step\n"
"Setting it to 3, profiles the prefill, 1st and 2nd decode steps\n"
"and so on ...")
run_to_completion_parser = subparsers.add_parser(
"run_to_completion",
help="This variation profiles all the engine.step() invocations"
"until the engine exhausts all submitted requests.")
run_to_completion_parser.add_argument(
'-n',
'--complete-num-requests-per-step',
type=int,
help=
"Complete complete_num_requests_per_step requests every decode step."
"For e.g., with batch_size 128 and complete_num_requests_per_step 32,"
"the profiler is run for 6 engine steps, with the steps processing, "
"128, 128, 96, 64, 32, 1 requests respectively.\n"
"Note that we tack-on a one-request step at the end as it is often "
"useful.")
EngineArgs.add_cli_args(parser)
args = parser.parse_args()
context = ProfileContext(
engine_args=EngineArgs.from_cli_args(args),
**{
k: v
for k, v in vars(args).items()
if k in inspect.signature(ProfileContext).parameters
})
run_profile(context, csv_output=args.csv, json_output=args.json)
"""An example showing how to use vLLM to serve VLMs.
Launch the vLLM server with the following command:
vllm serve fixie-ai/ultravox-v0_3
"""
import base64
import requests
from openai import OpenAI
from vllm.assets.audio import AudioAsset
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
# Any format supported by librosa is supported
audio_url = AudioAsset("winning_call").url
# Use audio url in the payload
chat_completion_from_url = client.chat.completions.create(
messages=[{
"role":
"user",
"content": [
{
"type": "text",
"text": "What's in this audio?"
},
{
"type": "audio_url",
"audio_url": {
"url": audio_url
},
},
],
}],
model=model,
max_tokens=64,
)
result = chat_completion_from_url.choices[0].message.content
print(f"Chat completion output:{result}")
# Use base64 encoded audio in the payload
def encode_audio_base64_from_url(audio_url: str) -> str:
"""Encode an audio retrieved from a remote url to base64 format."""
with requests.get(audio_url) as response:
response.raise_for_status()
result = base64.b64encode(response.content).decode('utf-8')
return result
audio_base64 = encode_audio_base64_from_url(audio_url=audio_url)
chat_completion_from_base64 = client.chat.completions.create(
messages=[{
"role":
"user",
"content": [
{
"type": "text",
"text": "What's in this audio?"
},
{
"type": "audio_url",
"audio_url": {
# Any format supported by librosa is supported
"url": f"data:audio/ogg;base64,{audio_base64}"
},
},
],
}],
model=model,
max_tokens=64,
)
result = chat_completion_from_base64.choices[0].message.content
print(f"Chat completion output:{result}")
"""An example showing how to use vLLM to serve multimodal models
and run online inference with OpenAI client.
Launch the vLLM server with the following command:
(single image inference with Llava)
vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja
(multi-image inference with Phi-3.5-vision-instruct)
vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
--trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2
(audio inference with Ultravox)
vllm serve fixie-ai/ultravox-v0_3 --max-model-len 4096
"""
import base64
import requests
from openai import OpenAI
from vllm.assets.audio import AudioAsset
from vllm.utils import FlexibleArgumentParser
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
# defaults to os.environ.get("OPENAI_API_KEY")
api_key=openai_api_key,
base_url=openai_api_base,
)
models = client.models.list()
model = models.data[0].id
def encode_base64_content_from_url(content_url: str) -> str:
"""Encode a content retrieved from a remote url to base64 format."""
with requests.get(content_url) as response:
response.raise_for_status()
result = base64.b64encode(response.content).decode('utf-8')
return result
# Text-only inference
def run_text_only() -> None:
chat_completion = client.chat.completions.create(
messages=[{
"role": "user",
"content": "What's the capital of France?"
}],
model=model,
max_completion_tokens=64,
)
result = chat_completion.choices[0].message.content
print("Chat completion output:", result)
# Single-image input inference
def run_single_image() -> None:
## Use image url in the payload
image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
chat_completion_from_url = client.chat.completions.create(
messages=[{
"role":
"user",
"content": [
{
"type": "text",
"text": "What's in this image?"
},
{
"type": "image_url",
"image_url": {
"url": image_url
},
},
],
}],
model=model,
max_completion_tokens=64,
)
result = chat_completion_from_url.choices[0].message.content
print("Chat completion output from image url:", result)
## Use base64 encoded image in the payload
image_base64 = encode_base64_content_from_url(image_url)
chat_completion_from_base64 = client.chat.completions.create(
messages=[{
"role":
"user",
"content": [
{
"type": "text",
"text": "What's in this image?"
},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{image_base64}"
},
},
],
}],
model=model,
max_completion_tokens=64,
)
result = chat_completion_from_base64.choices[0].message.content
print("Chat completion output from base64 encoded image:", result)
# Multi-image input inference
def run_multi_image() -> None:
image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg"
image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg"
chat_completion_from_url = client.chat.completions.create(
messages=[{
"role":
"user",
"content": [
{
"type": "text",
"text": "What are the animals in these images?"
},
{
"type": "image_url",
"image_url": {
"url": image_url_duck
},
},
{
"type": "image_url",
"image_url": {
"url": image_url_lion
},
},
],
}],
model=model,
max_completion_tokens=64,
)
result = chat_completion_from_url.choices[0].message.content
print("Chat completion output:", result)
# Audio input inference
def run_audio() -> None:
audio_url = AudioAsset("winning_call").url
audio_base64 = encode_base64_content_from_url(audio_url)
# OpenAI-compatible schema (`input_audio`)
chat_completion_from_base64 = client.chat.completions.create(
messages=[{
"role":
"user",
"content": [
{
"type": "text",
"text": "What's in this audio?"
},
{
"type": "input_audio",
"input_audio": {
# Any format supported by librosa is supported
"data": audio_base64,
"format": "wav"
},
},
],
}],
model=model,
max_completion_tokens=64,
)
result = chat_completion_from_base64.choices[0].message.content
print("Chat completion output from input audio:", result)
# HTTP URL
chat_completion_from_url = client.chat.completions.create(
messages=[{
"role":
"user",
"content": [
{
"type": "text",
"text": "What's in this audio?"
},
{
"type": "audio_url",
"audio_url": {
# Any format supported by librosa is supported
"url": audio_url
},
},
],
}],
model=model,
max_completion_tokens=64,
)
result = chat_completion_from_url.choices[0].message.content
print("Chat completion output from audio url:", result)
# base64 URL
chat_completion_from_base64 = client.chat.completions.create(
messages=[{
"role":
"user",
"content": [
{
"type": "text",
"text": "What's in this audio?"
},
{
"type": "audio_url",
"audio_url": {
# Any format supported by librosa is supported
"url": f"data:audio/ogg;base64,{audio_base64}"
},
},
],
}],
model=model,
max_completion_tokens=64,
)
result = chat_completion_from_base64.choices[0].message.content
print("Chat completion output from base64 encoded audio:", result)
example_function_map = {
"text-only": run_text_only,
"single-image": run_single_image,
"multi-image": run_multi_image,
"audio": run_audio,
}
def main(args) -> None:
chat_type = args.chat_type
example_function_map[chat_type]()
if __name__ == "__main__":
parser = FlexibleArgumentParser(
description='Demo on using OpenAI client for online inference with '
'multimodal language models served with vLLM.')
parser.add_argument(
'--chat-type',
'-c',
type=str,
default="single-image",
choices=["text-only", "single-image", "multi-image", "audio"],
help='Conversation type with multimodal data.')
args = parser.parse_args()
main(args)
from enum import Enum
from openai import OpenAI
from pydantic import BaseModel
client = OpenAI(
base_url="http://localhost:8000/v1",
api_key="-",
)
# Guided decoding by Choice (list of possible options)
completion = client.chat.completions.create(
model="Qwen/Qwen2.5-3B-Instruct",
messages=[{
"role": "user",
"content": "Classify this sentiment: vLLM is wonderful!"
}],
extra_body={"guided_choice": ["positive", "negative"]},
)
print(completion.choices[0].message.content)
# Guided decoding by Regex
prompt = ("Generate an email address for Alan Turing, who works in Enigma."
"End in .com and new line. Example result:"
"alan.turing@enigma.com\n")
completion = client.chat.completions.create(
model="Qwen/Qwen2.5-3B-Instruct",
messages=[{
"role": "user",
"content": prompt,
}],
extra_body={
"guided_regex": "\w+@\w+\.com\n",
"stop": ["\n"]
},
)
print(completion.choices[0].message.content)
# Guided decoding by JSON using Pydantic schema
class CarType(str, Enum):
sedan = "sedan"
suv = "SUV"
truck = "Truck"
coupe = "Coupe"
class CarDescription(BaseModel):
brand: str
model: str
car_type: CarType
json_schema = CarDescription.model_json_schema()
prompt = ("Generate a JSON with the brand, model and car_type of"
"the most iconic car from the 90's")
completion = client.chat.completions.create(
model="Qwen/Qwen2.5-3B-Instruct",
messages=[{
"role": "user",
"content": prompt,
}],
extra_body={"guided_json": json_schema},
)
print(completion.choices[0].message.content)
# Guided decoding by Grammar
simplified_sql_grammar = """
?start: select_statement
?select_statement: "SELECT " column_list " FROM " table_name
?column_list: column_name ("," column_name)*
?table_name: identifier
?column_name: identifier
?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
"""
prompt = ("Generate an SQL query to show the 'username' and 'email'"
"from the 'users' table.")
completion = client.chat.completions.create(
model="Qwen/Qwen2.5-3B-Instruct",
messages=[{
"role": "user",
"content": prompt,
}],
extra_body={"guided_grammar": simplified_sql_grammar},
)
print(completion.choices[0].message.content)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment