Commit 52f97799 authored by laibao

Update README.md: change the Docker image version, update the deep learning library versions, adjust the inference example commands, and remove example files that are no longer used.

parent 35cc3501
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to run offline inference with an EAGLE speculative
decoding model on neuron. To use EAGLE speculative decoding, you must use
a draft model that is specifically fine-tuned for EAGLE speculation.
Additionally, to use EAGLE with NxD Inference, the draft model must include
the LM head weights from the target model. These weights are shared between
the draft and target model.
"""
from vllm import LLM, SamplingParams
# Sample prompts.
prompts = [
"What is annapurna labs?",
]
def main():
# Create a sampling params object.
sampling_params = SamplingParams(top_k=1, max_tokens=500, ignore_eos=True)
# Create an LLM.
llm = LLM(
model="/home/ubuntu/model_hf/Meta-Llama-3.1-70B-Instruct",
speculative_config={
"model": "/home/ubuntu/model_hf/Llama-3.1-70B-Instruct-EAGLE-Draft",
"num_speculative_tokens": 5,
"max_model_len": 2048,
},
max_num_seqs=4,
# The max_model_len and block_size arguments are required to be the same as
# the max sequence length when targeting the neuron device.
# Currently, this is a known limitation in continuous batching support
# in neuronx-distributed-inference.
max_model_len=2048,
block_size=2048,
# The device can be automatically detected when AWS Neuron SDK is installed.
# The device argument can be either unspecified for automated detection,
# or explicitly assigned.
device="neuron",
tensor_parallel_size=32,
override_neuron_config={
"enable_eagle_speculation": True,
"enable_fused_speculation": True,
},
)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, \n\n\n\ Generated text: {generated_text!r}")
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from vllm import LLM, SamplingParams
# creates XLA hlo graphs for all the context length buckets.
os.environ["NEURON_CONTEXT_LENGTH_BUCKETS"] = "128,512,1024,2048"
# creates XLA hlo graphs for all the token gen buckets.
os.environ["NEURON_TOKEN_GEN_BUCKETS"] = "128,512,1024,2048"
# Quantize the neuron model weights to int8.
# The default quantization dtype is int8.
os.environ["NEURON_QUANT_DTYPE"] = "s8"
# Sample prompts.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
def main():
# Create an LLM.
llm = LLM(
model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
max_num_seqs=8,
# The max_model_len and block_size arguments are required to be the same as
# the max sequence length when targeting the neuron device.
# Currently, this is a known limitation in continuous batching support
# in transformers-neuronx.
# TODO(liangfu): Support paged-attention in transformers-neuronx.
max_model_len=2048,
block_size=2048,
# ruff: noqa: E501
# The device can be automatically detected when AWS Neuron SDK is installed.
# The device argument can be either unspecified for automated detection,
# or explicitly assigned.
device="neuron",
quantization="neuron_quant",
override_neuron_config={
"cast_logits_dtype": "bfloat16",
},
tensor_parallel_size=2,
)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
print("-" * 50)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-" * 50)
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import requests
import torch
from neuronx_distributed_inference.models.mllama.utils import add_instruct
from PIL import Image
from vllm import LLM, SamplingParams, TextPrompt
def get_image(image_url):
image = Image.open(requests.get(image_url, stream=True).raw)
return image
# Model Inputs
PROMPTS = [
"What is in this image? Tell me a story",
"What is the recipe of mayonnaise in two sentences?",
"Describe this image",
"What is the capital of Italy famous for?",
]
IMAGES = [
get_image(
"https://images.pexels.com/photos/1108099/pexels-photo-1108099.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500"
),
None,
get_image(
"https://images.pexels.com/photos/1108099/pexels-photo-1108099.jpeg?auto=compress&cs=tinysrgb&dpr=1&w=500"
),
None,
]
SAMPLING_PARAMS = [
dict(top_k=1, temperature=1.0, top_p=1.0, max_tokens=16)
for _ in range(len(PROMPTS))
]
def get_VLLM_mllama_model_inputs(prompt, single_image, sampling_params):
# Prepare all inputs for mllama generation, including:
# 1. put text prompt into instruct chat template
# 2. compose single text and single image prompt into vLLM's prompt class
# 3. prepare sampling parameters
input_image = single_image
has_image = torch.tensor([1])
if isinstance(single_image, torch.Tensor) and single_image.numel() == 0:
has_image = torch.tensor([0])
instruct_prompt = add_instruct(prompt, has_image)
inputs = TextPrompt(prompt=instruct_prompt)
if input_image is not None:
inputs["multi_modal_data"] = {"image": input_image}
sampling_params = SamplingParams(**sampling_params)
return inputs, sampling_params
def print_outputs(outputs):
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
def main():
assert (
len(PROMPTS) == len(IMAGES) == len(SAMPLING_PARAMS)
), f"""Text, image prompts and sampling parameters should have the
same batch size; but got {len(PROMPTS)}, {len(IMAGES)},
and {len(SAMPLING_PARAMS)}"""
# Create an LLM.
llm = LLM(
model="meta-llama/Llama-3.2-11B-Vision-Instruct",
max_num_seqs=1,
max_model_len=4096,
block_size=4096,
device="neuron",
tensor_parallel_size=32,
override_neuron_config={
"sequence_parallel_enabled": False,
"skip_warmup": True,
"save_sharded_checkpoint": True,
"on_device_sampling_config": {
"global_topk": 1,
"dynamic": False,
"deterministic": False,
},
},
)
batched_inputs = []
batched_sample_params = []
for pmpt, img, params in zip(PROMPTS, IMAGES, SAMPLING_PARAMS):
inputs, sampling_params = get_VLLM_mllama_model_inputs(pmpt, img, params)
# test batch-size = 1
outputs = llm.generate(inputs, sampling_params)
print_outputs(outputs)
batched_inputs.append(inputs)
batched_sample_params.append(sampling_params)
# test batch-size = 4
outputs = llm.generate(batched_inputs, batched_sample_params)
print_outputs(outputs)
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to run offline inference with a speculative
decoding model on neuron.
"""
import os
from vllm import LLM, SamplingParams
# Sample prompts.
prompts = [
"Hello, I am a language model and I can help",
"The president of the United States is",
"The capital of France is",
]
def config_buckets():
"""Configure context length and token gen buckets."""
# creates XLA hlo graphs for all the context length buckets.
os.environ["NEURON_CONTEXT_LENGTH_BUCKETS"] = "128,512,1024,2048"
# creates XLA hlo graphs for all the token gen buckets.
os.environ["NEURON_TOKEN_GEN_BUCKETS"] = "128,512,1024,2048"
def initialize_model():
"""Create an LLM with speculative decoding."""
return LLM(
model="openlm-research/open_llama_7b",
speculative_config={
"model": "openlm-research/open_llama_3b",
"num_speculative_tokens": 4,
"max_model_len": 2048,
},
max_num_seqs=4,
max_model_len=2048,
block_size=2048,
use_v2_block_manager=True,
device="neuron",
tensor_parallel_size=32,
)
def process_requests(model: LLM, sampling_params: SamplingParams):
"""Generate texts from prompts and print them."""
outputs = model.generate(prompts, sampling_params)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
def main():
"""Main function that sets up the model and processes prompts."""
config_buckets()
model = initialize_model()
# Create a sampling params object.
sampling_params = SamplingParams(max_tokens=100, top_k=1)
process_requests(model, sampling_params)
if __name__ == "__main__":
main()
# Offline Inference with the OpenAI Batch file format
```{important}
This is a guide to performing batch inference using the OpenAI batch file format, **not** the complete Batch (REST) API.
```
## File Format
The OpenAI batch file format consists of a series of JSON objects, one per line.
[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl)
Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details.
```{note}
We currently support `/v1/chat/completions`, `/v1/embeddings`, and `/v1/score` endpoints (completions coming soon).
```
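If you prefer to build the batch file programmatically, a minimal sketch like the one below (the file name and prompts are placeholders) writes one request object per line in the format shown above:

```python
import json

# Placeholder prompts; each entry becomes one line of the batch file.
prompts = ["Hello world!", "What is vLLM?"]

with open("my_batch.jsonl", "w") as f:
    for i, prompt in enumerate(prompts, start=1):
        request = {
            "custom_id": f"request-{i}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": "meta-llama/Meta-Llama-3-8B-Instruct",
                "messages": [
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt},
                ],
                "max_completion_tokens": 1000,
            },
        }
        f.write(json.dumps(request) + "\n")
```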
## Pre-requisites
* The examples in this document use `meta-llama/Meta-Llama-3-8B-Instruct`.
- Create a [user access token](https://huggingface.co/docs/hub/en/security-tokens)
- Install the token on your machine (Run `huggingface-cli login`).
- Get access to the gated model by [visiting the model card](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) and agreeing to the terms and conditions.
## Example 1: Running with a local file
### Step 1: Create your batch file
To follow along with this example, you can download the example batch file, or create your own batch file in your working directory.
```bash
wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl
```
Once you've created your batch file, it should look like this:
```bash
cat offline_inference/openai_batch/openai_example_batch.jsonl
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
```
### Step 2: Run the batch
The batch running tool is designed to be used from the command line.
You can run the batch with the following command, which will write its results to a file called `results.jsonl`
```bash
python -m vllm.entrypoints.openai.run_batch \
-i offline_inference/openai_batch/openai_example_batch.jsonl \
-o results.jsonl \
--model meta-llama/Meta-Llama-3-8B-Instruct
```
Or use the `vllm` CLI:
```bash
vllm run-batch \
-i offline_inference/openai_batch/openai_example_batch.jsonl \
-o results.jsonl \
--model meta-llama/Meta-Llama-3-8B-Instruct
```
### Step 3: Check your results
You should now have your results at `results.jsonl`. You can check your results by running `cat results.jsonl`
```bash
cat results.jsonl
{"id":"vllm-383d1c59835645aeb2e07d004d62a826","custom_id":"request-1","response":{"id":"cmpl-61c020e54b964d5a98fa7527bfcdd378","object":"chat.completion","created":1715633336,"model":"meta-llama/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"message":{"role":"assistant","content":"Hello! It's great to meet you! I'm here to help with any questions or tasks you may have. What's on your mind today?"},"logprobs":null,"finish_reason":"stop","stop_reason":null}],"usage":{"prompt_tokens":25,"total_tokens":56,"completion_tokens":31}},"error":null}
{"id":"vllm-42e3d09b14b04568afa3f1797751a267","custom_id":"request-2","response":{"id":"cmpl-f44d049f6b3a42d4b2d7850bb1e31bcc","object":"chat.completion","created":1715633336,"model":"meta-llama/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"message":{"role":"assistant","content":"*silence*"},"logprobs":null,"finish_reason":"stop","stop_reason":null}],"usage":{"prompt_tokens":27,"total_tokens":32,"completion_tokens":5}},"error":null}
```
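If you'd rather consume the results in Python than `cat` them, a small sketch along these lines (assuming the `results.jsonl` produced above) extracts each generated message:

```python
import json

with open("results.jsonl") as f:
    for line in f:
        result = json.loads(line)
        # For chat completion requests, the generated text is in the first
        # choice's message content.
        message = result["response"]["choices"][0]["message"]["content"]
        print(f"{result['custom_id']}: {message}")
```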
## Example 2: Using remote files
The batch runner supports remote input and output urls that are accessible via http/https.
For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl`, you can run
```bash
python -m vllm.entrypoints.openai.run_batch \
-i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \
-o results.jsonl \
--model meta-llama/Meta-Llama-3-8B-Instruct
```
Or use the `vllm` CLI:
```bash
vllm run-batch \
-i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \
-o results.jsonl \
--model meta-llama/Meta-Llama-3-8B-Instruct
```
## Example 3: Integrating with AWS S3
To integrate with cloud blob storage, we recommend using presigned urls.
See the AWS documentation to learn more about S3 presigned urls.
### Additional prerequisites
* [Create an S3 bucket](https://docs.aws.amazon.com/AmazonS3/latest/userguide/creating-bucket.html).
* The `awscli` package (Run `pip install awscli`) to configure your credentials and interactively use s3.
- [Configure your credentials](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-quickstart.html).
* The `boto3` python package (Run `pip install boto3`) to generate presigned urls.
### Step 1: Upload your input file
To follow along with this example, you can download the example batch file, or create your own batch file in your working directory.
```bash
wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl
```
Once you've created your batch file, it should look like this:
```bash
cat offline_inference/openai_batch/openai_example_batch.jsonl
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
```
Now upload your batch file to your S3 bucket.
```bash
aws s3 cp offline_inference/openai_batch/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl
```
### Step 2: Generate your presigned urls
Presigned urls can only be generated via the SDK. You can run the following python script to generate your presigned urls. Be sure to replace the `MY_BUCKET`, `MY_INPUT_FILE.jsonl`, and `MY_OUTPUT_FILE.jsonl` placeholders with your bucket and file names.
(The script is adapted from <https://github.com/awsdocs/aws-doc-sdk-examples/blob/main/python/example_code/s3/s3_basics/presigned_url.py>)
```python
import boto3
from botocore.exceptions import ClientError
def generate_presigned_url(s3_client, client_method, method_parameters, expires_in):
"""
Generate a presigned Amazon S3 URL that can be used to perform an action.
:param s3_client: A Boto3 Amazon S3 client.
:param client_method: The name of the client method that the URL performs.
:param method_parameters: The parameters of the specified client method.
:param expires_in: The number of seconds the presigned URL is valid for.
:return: The presigned URL.
"""
try:
url = s3_client.generate_presigned_url(
ClientMethod=client_method, Params=method_parameters, ExpiresIn=expires_in
)
except ClientError:
raise
return url
s3_client = boto3.client("s3")
input_url = generate_presigned_url(
s3_client, "get_object", {"Bucket": "MY_BUCKET", "Key": "MY_INPUT_FILE.jsonl"}, 3600
)
output_url = generate_presigned_url(
s3_client, "put_object", {"Bucket": "MY_BUCKET", "Key": "MY_OUTPUT_FILE.jsonl"}, 3600
)
print(f"{input_url=}")
print(f"{output_url=}")
```
This script should output
```text
input_url='https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_INPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091'
output_url='https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_OUTPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091'
```
### Step 3: Run the batch runner using your presigned urls
You can now run the batch runner, using the urls generated in the previous section.
```bash
python -m vllm.entrypoints.openai.run_batch \
-i "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_INPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \
-o "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_OUTPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \
--model meta-llama/Meta-Llama-3-8B-Instruct
```
Or use the `vllm` CLI:
```bash
vllm run-batch \
-i "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_INPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \
-o "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_OUTPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \
--model meta-llama/Meta-Llama-3-8B-Instruct
```
### Step 4: View your results
Your results are now on S3. You can view them in your terminal by running
```bash
aws s3 cp s3://MY_BUCKET/MY_OUTPUT_FILE.jsonl -
```
## Example 4: Using embeddings endpoint
### Additional prerequisites
* Ensure you are using `vllm >= 0.5.5`.
### Step 1: Create your batch file
Add embedding requests to your batch file. The following is an example:
```text
{"custom_id": "request-1", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are an unhelpful assistant."}}
```
You can even mix chat completion and embedding requests in the batch file, as long as the model you are using supports both chat completion and embeddings (note that all requests must use the same model).
### Step 2: Run the batch
You can run the batch using the same command as in earlier examples.
### Step 3: Check your results
You can check your results by running `cat results.jsonl`
```bash
cat results.jsonl
{"id":"vllm-db0f71f7dec244e6bce530e0b4ef908b","custom_id":"request-1","response":{"status_code":200,"request_id":"vllm-batch-3580bf4d4ae54d52b67eee266a6eab20","body":{"id":"embd-33ac2efa7996430184461f2e38529746","object":"list","created":444647,"model":"intfloat/e5-mistral-7b-instruct","data":[{"index":0,"object":"embedding","embedding":[0.016204833984375,0.0092010498046875,0.0018358230590820312,-0.0028228759765625,0.001422882080078125,-0.0031147003173828125,...]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0}}},"error":null}
...
```
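Each embedding response carries the vector under `response.body.data[*].embedding`. A short sketch for collecting the vectors into a NumPy array (assuming the same `results.jsonl` output file) could look like:

```python
import json

import numpy as np

embeddings = []
with open("results.jsonl") as f:
    for line in f:
        result = json.loads(line)
        # Embedding responses wrap the OpenAI-style body under "body".
        for item in result["response"]["body"]["data"]:
            embeddings.append(item["embedding"])

embeddings = np.array(embeddings)
print(embeddings.shape)
```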
## Example 5: Using score endpoint
### Additional prerequisites
* Ensure you are using `vllm >= 0.7.0`.
### Step 1: Create your batch file
Add score requests to your batch file. The following is an example:
```text
{"custom_id": "request-1", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
```
You can mix chat completion, embedding, and score requests in the batch file, as long as the model you are using supports them all (note that all requests must use the same model).
### Step 2: Run the batch
You can run the batch using the same command as in earlier examples.
### Step 3: Check your results
You can check your results by running `cat results.jsonl`
```bash
cat results.jsonl
{"id":"vllm-f87c5c4539184f618e555744a2965987","custom_id":"request-1","response":{"status_code":200,"request_id":"vllm-batch-806ab64512e44071b37d3f7ccd291413","body":{"id":"score-4ee45236897b4d29907d49b01298cdb1","object":"list","created":1737847944,"model":"BAAI/bge-reranker-v2-m3","data":[{"index":0,"object":"score","score":0.0010900497436523438},{"index":1,"object":"score","score":1.0}],"usage":{"prompt_tokens":37,"total_tokens":37,"completion_tokens":0,"prompt_tokens_details":null}}},"error":null}
{"id":"vllm-41990c51a26d4fac8419077f12871099","custom_id":"request-2","response":{"status_code":200,"request_id":"vllm-batch-73ce66379026482699f81974e14e1e99","body":{"id":"score-13f2ffe6ba40460fbf9f7f00ad667d75","object":"list","created":1737847944,"model":"BAAI/bge-reranker-v2-m3","data":[{"index":0,"object":"score","score":0.001094818115234375},{"index":1,"object":"score","score":1.0}],"usage":{"prompt_tokens":37,"total_tokens":37,"completion_tokens":0,"prompt_tokens_details":null}}},"error":null}
```
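To pair each score with the candidate text it refers to, you can read the input batch and the results side by side. The sketch below assumes placeholder file names (`score_batch.jsonl` for the input you created in Step 1 and `results.jsonl` for the output):

```python
import json

# Placeholder file names; use whatever you passed to -i / -o when running the batch.
requests = {}
with open("score_batch.jsonl") as f:
    for line in f:
        request = json.loads(line)
        requests[request["custom_id"]] = request

with open("results.jsonl") as f:
    for line in f:
        result = json.loads(line)
        candidates = requests[result["custom_id"]]["body"]["text_2"]
        # Each score entry carries the index of the candidate it scores.
        for entry in result["response"]["body"]["data"]:
            print(result["custom_id"], entry["score"], candidates[entry["index"]])
```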
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm import LLM, SamplingParams
from vllm.distributed import cleanup_dist_env_and_memory
# NOTE: This is just a running example. For benchmarking purposes,
# please see benchmarks/benchmark_prefix_caching.py
# Common prefix.
prefix = (
"You are an expert school principal, skilled in effectively managing "
"faculty and staff. Draft 10-15 questions for a potential first grade "
"Head Teacher for my K-12, all-girls', independent school that emphasizes "
"community, joyful discovery, and life-long learning. The candidate is "
"coming in for a first-round panel interview for a 8th grade Math "
"teaching role. They have 5 years of previous teaching experience "
"as an assistant teacher at a co-ed, public school with experience "
"in middle school math teaching. Based on these information, fulfill "
"the following paragraph: "
)
# Sample prompts.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
generating_prompts = [prefix + prompt for prompt in prompts]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.0)
def main():
# Create an LLM without prefix caching as a baseline.
regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.4)
print("Results without `enable_prefix_caching`")
# ruff: noqa: E501
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = regular_llm.generate(generating_prompts, sampling_params)
regular_generated_texts = []
# Print the outputs.
print("-" * 50)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
regular_generated_texts.append(generated_text)
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-" * 50)
# Destroy the LLM object and free up the GPU memory.
del regular_llm
cleanup_dist_env_and_memory()
# Create an LLM with prefix caching enabled.
prefix_cached_llm = LLM(
model="facebook/opt-125m",
enable_prefix_caching=True,
gpu_memory_utilization=0.4,
)
# Warmup so that the shared prompt's KV cache is computed.
prefix_cached_llm.generate(generating_prompts[0], sampling_params)
# Generate with prefix caching.
outputs = prefix_cached_llm.generate(generating_prompts, sampling_params)
print("Results with `enable_prefix_caching`")
cached_generated_texts = []
# Print the outputs. You should see the same outputs as before.
print("-" * 50)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
cached_generated_texts.append(generated_text)
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-" * 50)
# Check that both runs produced the same generated texts.
generated_same = all(
[
regular_generated_texts[i] == cached_generated_texts[i]
for i in range(len(prompts))
]
)
print(f"Generated answers are the same: {generated_same}")
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This is a demo script showing how to use the
PrithviGeospatialMAE model with vLLM
This script is based on: https://huggingface.co/ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11/blob/main/inference.py # noqa
Target model weights: https://huggingface.co/ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11/resolve/main/Prithvi-EO-V2-300M-TL-Sen1Floods11.pt # noqa
The requirements for running this script are:
- Installing [terratorch, albumentations, rasterio] in your python environment
- Downloading the model weights into a 'model' folder local to the script
(temporary measure until the proper config.json file is uploaded to HF)
- Downloading an example input image (India_900498_S2Hand.tif) and placing it in
the same folder as the script (or specifying it with the --data_file argument)
Run the example:
python prithvi_geospatial_mae.py
""" # noqa: E501
import argparse
import datetime
import os
from typing import Union
import albumentations
import numpy as np
import rasterio
import regex as re
import torch
from einops import rearrange
from terratorch.datamodules import Sen1Floods11NonGeoDataModule
from vllm import LLM
NO_DATA = -9999
NO_DATA_FLOAT = 0.0001
OFFSET = 0
PERCENTILE = 99
model_config = """{
"architectures": ["PrithviGeoSpatialMAE"],
"num_classes": 0,
"pretrained_cfg": {
"task_args": {
"task": "SemanticSegmentationTask",
"model_factory": "EncoderDecoderFactory",
"loss": "ce",
"ignore_index": -1,
"lr": 0.001,
"freeze_backbone": false,
"freeze_decoder": false,
"plot_on_val": 10,
"optimizer": "AdamW",
"scheduler": "CosineAnnealingLR"
},
"model_args": {
"backbone_pretrained": false,
"backbone": "prithvi_eo_v2_300_tl",
"decoder": "UperNetDecoder",
"decoder_channels": 256,
"decoder_scale_modules": true,
"num_classes": 2,
"rescale": true,
"backbone_bands": [
"BLUE",
"GREEN",
"RED",
"NIR_NARROW",
"SWIR_1",
"SWIR_2"
],
"head_dropout": 0.1,
"necks": [
{
"name": "SelectIndices",
"indices": [
5,
11,
17,
23
]
},
{
"name": "ReshapeTokensToImage"
}
]
},
"optimizer_params" : {
"lr": 5.0e-05,
"betas": [0.9, 0.999],
"eps": [1.0e-08],
"weight_decay": 0.05,
"amsgrad": false,
"maximize": false,
"capturable": false,
"differentiable": false
},
"scheduler_params" : {
"T_max": 50,
"eta_min": 0,
"last_epoch": -1,
"verbose": "deprecated"
}
},
"torch_dtype": "float32"
}
"""
# Temporarily creating the "config.json" for the model.
# This is going to disappear once the correct config.json is available on HF
with open(
os.path.join(os.path.dirname(__file__), "./model/config.json"), "w"
) as config_file:
config_file.write(model_config)
datamodule_config = {
"bands": ["BLUE", "GREEN", "RED", "NIR_NARROW", "SWIR_1", "SWIR_2"],
"batch_size": 16,
"constant_scale": 0.0001,
"data_root": "/dccstor/geofm-finetuning/datasets/sen1floods11",
"drop_last": True,
"no_data_replace": 0.0,
"no_label_replace": -1,
"num_workers": 8,
"test_transform": [
albumentations.Resize(
always_apply=False, height=448, interpolation=1, p=1, width=448
),
albumentations.pytorch.ToTensorV2(
transpose_mask=False, always_apply=True, p=1.0
),
],
}
class PrithviMAE:
def __init__(self):
print("Initializing PrithviMAE model")
self.model = LLM(
model=os.path.join(os.path.dirname(__file__), "./model"),
skip_tokenizer_init=True,
dtype="float32",
)
def run(self, input_data, location_coords):
print("################ Running inference on vLLM ##############")
# merge the inputs into one data structure
mm_data = {
"pixel_values": torch.empty(0) if input_data is None else input_data,
"location_coords": torch.empty(0)
if location_coords is None
else location_coords,
}
prompt = {"prompt_token_ids": [1], "multi_modal_data": mm_data}
outputs = self.model.encode(prompt, use_tqdm=False)
print("################ Inference done (it took seconds) ##############")
return outputs[0].outputs.data
def generate_datamodule():
datamodule = Sen1Floods11NonGeoDataModule(
data_root=datamodule_config["data_root"],
batch_size=datamodule_config["batch_size"],
num_workers=datamodule_config["num_workers"],
bands=datamodule_config["bands"],
drop_last=datamodule_config["drop_last"],
test_transform=datamodule_config["test_transform"],
)
return datamodule
def process_channel_group(orig_img, channels):
"""
Args:
orig_img: torch.Tensor representing original image (reference)
with shape = (bands, H, W).
channels: list of indices representing RGB channels.
Returns:
torch.Tensor with shape (num_channels, height, width) for original image
"""
orig_img = orig_img[channels, ...]
valid_mask = torch.ones_like(orig_img, dtype=torch.bool)
valid_mask[orig_img == NO_DATA_FLOAT] = False
# Rescale (enhancing contrast)
max_value = max(3000, np.percentile(orig_img[valid_mask], PERCENTILE))
min_value = OFFSET
orig_img = torch.clamp((orig_img - min_value) / (max_value - min_value), 0, 1)
# No data as zeros
orig_img[~valid_mask] = 0
return orig_img
def read_geotiff(file_path: str):
"""Read all bands from *file_path* and return image + meta info.
Args:
file_path: path to image file.
Returns:
np.ndarray with shape (bands, height, width)
meta info dict
"""
with rasterio.open(file_path) as src:
img = src.read()
meta = src.meta
try:
coords = src.lnglat()
except Exception:
# Cannot read coords
coords = None
return img, meta, coords
def save_geotiff(image, output_path: str, meta: dict):
"""Save multi-band image in Geotiff file.
Args:
image: np.ndarray with shape (bands, height, width)
output_path: path where to save the image
meta: dict with meta info.
"""
with rasterio.open(output_path, "w", **meta) as dest:
for i in range(image.shape[0]):
dest.write(image[i, :, :], i + 1)
return
def _convert_np_uint8(float_image: torch.Tensor):
image = float_image.numpy() * 255.0
image = image.astype(dtype=np.uint8)
return image
def load_example(
file_paths: list[str],
mean: list[float] = None,
std: list[float] = None,
indices: Union[list[int], None] = None,
):
"""Build an input example by loading images in *file_paths*.
Args:
file_paths: list of file paths.
mean: list containing mean values for each band in the images
in *file_paths*.
std: list containing std values for each band in the images
in *file_paths*.
Returns:
np.array containing created example
list of meta info for each image in *file_paths*
"""
imgs = []
metas = []
temporal_coords = []
location_coords = []
for file in file_paths:
img, meta, coords = read_geotiff(file)
# Rescaling (don't normalize on nodata)
img = np.moveaxis(img, 0, -1) # channels last for rescaling
if indices is not None:
img = img[..., indices]
if mean is not None and std is not None:
img = np.where(img == NO_DATA, NO_DATA_FLOAT, (img - mean) / std)
imgs.append(img)
metas.append(meta)
if coords is not None:
location_coords.append(coords)
try:
match = re.search(r"(\d{7,8}T\d{6})", file)
if match:
year = int(match.group(1)[:4])
julian_day = match.group(1).split("T")[0][4:]
if len(julian_day) == 3:
julian_day = int(julian_day)
else:
julian_day = (
datetime.datetime.strptime(julian_day, "%m%d")
.timetuple()
.tm_yday
)
temporal_coords.append([year, julian_day])
except Exception as e:
print(f"Could not extract timestamp for {file} ({e})")
imgs = np.stack(imgs, axis=0) # num_frames, H, W, C
imgs = np.moveaxis(imgs, -1, 0).astype("float32")
imgs = np.expand_dims(imgs, axis=0) # add batch dim
return imgs, temporal_coords, location_coords, metas
def run_model(
input_data,
temporal_coords,
location_coords,
model,
datamodule,
img_size,
lightning_model=None,
):
# Reflect pad if not divisible by img_size
original_h, original_w = input_data.shape[-2:]
pad_h = (img_size - (original_h % img_size)) % img_size
pad_w = (img_size - (original_w % img_size)) % img_size
input_data = np.pad(
input_data, ((0, 0), (0, 0), (0, 0), (0, pad_h), (0, pad_w)), mode="reflect"
)
# Build sliding window
batch_size = 1
batch = torch.tensor(input_data, device="cpu")
windows = batch.unfold(3, img_size, img_size).unfold(4, img_size, img_size)
h1, w1 = windows.shape[3:5]
windows = rearrange(
windows, "b c t h1 w1 h w -> (b h1 w1) c t h w", h=img_size, w=img_size
)
# Split into batches if number of windows > batch_size
num_batches = windows.shape[0] // batch_size if windows.shape[0] > batch_size else 1
windows = torch.tensor_split(windows, num_batches, dim=0)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
if temporal_coords:
temporal_coords = torch.tensor(temporal_coords, device=device).unsqueeze(0)
else:
temporal_coords = None
if location_coords:
location_coords = torch.tensor(location_coords[0], device=device).unsqueeze(0)
else:
location_coords = None
# Run model
pred_imgs = []
for x in windows:
# Apply standardization
x = datamodule.test_transform(image=x.squeeze().numpy().transpose(1, 2, 0))
x = datamodule.aug(x)["image"]
with torch.no_grad():
x = x.to(device)
pred = model.run(x, location_coords=location_coords)
if lightning_model:
pred_lightning = lightning_model(
x, temporal_coords=temporal_coords, location_coords=location_coords
)
pred_lightning = pred_lightning.output.detach().cpu()
if not torch.equal(pred, pred_lightning):
print("Inference output is not equal")
y_hat = pred.argmax(dim=1)
y_hat = torch.nn.functional.interpolate(
y_hat.unsqueeze(1).float(), size=img_size, mode="nearest"
)
pred_imgs.append(y_hat)
pred_imgs = torch.concat(pred_imgs, dim=0)
# Build images from patches
pred_imgs = rearrange(
pred_imgs,
"(b h1 w1) c h w -> b c (h1 h) (w1 w)",
h=img_size,
w=img_size,
b=1,
c=1,
h1=h1,
w1=w1,
)
# Cut padded area back to original size
pred_imgs = pred_imgs[..., :original_h, :original_w]
# Squeeze (batch size 1)
pred_imgs = pred_imgs[0]
return pred_imgs
def parse_args():
parser = argparse.ArgumentParser("MAE run inference", add_help=False)
parser.add_argument(
"--data_file",
type=str,
default="./India_900498_S2Hand.tif",
help="Path to the file.",
)
parser.add_argument(
"--output_dir",
type=str,
default="output",
help="Path to the directory where to save outputs.",
)
parser.add_argument(
"--input_indices",
default=[1, 2, 3, 8, 11, 12],
type=int,
nargs="+",
help="0-based indices of the six Prithvi channels to be selected from the "
"input. By default selects [1,2,3,8,11,12] for S2L1C data.",
)
parser.add_argument(
"--rgb_outputs",
action="store_true",
help="If present, output files will only contain RGB channels. "
"Otherwise, all bands will be saved.",
)
return parser.parse_args()
def main(
data_file: str,
output_dir: str,
rgb_outputs: bool,
input_indices: list[int] = None,
):
os.makedirs(output_dir, exist_ok=True)
# Load model ---------------------------------------------------------------
model_obj = PrithviMAE()
datamodule = generate_datamodule()
img_size = 256 # Size of Sen1Floods11
# Loading data -------------------------------------------------------------
input_data, temporal_coords, location_coords, meta_data = load_example(
file_paths=[data_file],
indices=input_indices,
)
meta_data = meta_data[0] # only one image
if input_data.mean() > 1:
input_data = input_data / 10000 # Convert to range 0-1
# Running model ------------------------------------------------------------
channels = [
datamodule_config["bands"].index(b) for b in ["RED", "GREEN", "BLUE"]
] # BGR -> RGB
pred = run_model(
input_data, temporal_coords, location_coords, model_obj, datamodule, img_size
)
# Save pred
meta_data.update(count=1, dtype="uint8", compress="lzw", nodata=0)
pred_file = os.path.join(
output_dir, f"pred_{os.path.splitext(os.path.basename(data_file))[0]}.tiff"
)
save_geotiff(_convert_np_uint8(pred), pred_file, meta_data)
# Save image + pred
meta_data.update(count=3, dtype="uint8", compress="lzw", nodata=0)
if input_data.mean() < 1:
input_data = input_data * 10000 # Scale to 0-10000
rgb_orig = process_channel_group(
orig_img=torch.Tensor(input_data[0, :, 0, ...]),
channels=channels,
)
pred[pred == 0.0] = np.nan
img_pred = rgb_orig * 0.7 + pred * 0.3
img_pred[img_pred.isnan()] = rgb_orig[img_pred.isnan()]
img_pred_file = os.path.join(
output_dir, f"rgb_pred_{os.path.splitext(os.path.basename(data_file))[0]}.tiff"
)
save_geotiff(
image=_convert_np_uint8(img_pred),
output_path=img_pred_file,
meta=meta_data,
)
# Save image rgb
if rgb_outputs:
rgb_file = os.path.join(
output_dir,
f"original_rgb_{os.path.splitext(os.path.basename(data_file))[0]}.tiff",
)
save_geotiff(
image=_convert_np_uint8(rgb_orig),
output_path=rgb_file,
meta=meta_data,
)
if __name__ == "__main__":
args = parse_args()
main(**vars(args))
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import inspect
import json
import os
import sys
from argparse import RawTextHelpFormatter
from collections.abc import Generator
from dataclasses import asdict, dataclass
from typing import Any, Optional, TypeAlias
import torch
import tqdm
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.profiler.layerwise_profile import layerwise_profile
from vllm.utils import FlexibleArgumentParser
BATCH_SIZE_DEFAULT = 1
PROMPT_LEN_DEFAULT = 256
@dataclass
class ProfileContext:
engine_args: EngineArgs
prompt_len: int
batch_size: int
# The profiler can run in 2 modes,
# 1. Run profiler for user specified num_steps
num_steps: Optional[int] = None
# 2. Run profiler until all requests complete
complete_num_requests_per_step: Optional[int] = None
save_chrome_traces_folder: Optional[str] = None
def get_dtype(dtype: str):
if dtype == "torch.float":
return torch.float
else:
return dtype
OutputLen_NumReqs_Map: TypeAlias = dict[int, int]
def compute_request_output_lengths(
batch_size: int, step_requests: list[int]
) -> OutputLen_NumReqs_Map:
"""
Given the number of requests, batch_size, and the number of requests
that each engine-step should process, step_requests, determine the
output lengths of the requests such that step_requests is honoured.
Example:
if batch size = 128 and step_requests = [128, 128, 96, 64, 32, 1]
then return,
{2 : 32, 3 : 32, 4 : 32, 5 : 31, 6 : 1}, meaning,
32 requests should have output length 2,
32 requests should have output length 3,
32 requests should have output length 4,
31 requests should have output length 5,
1 request should have output length 6.
Args:
batch_size (int): Number of requests submitted for profile. This is
args.batch_size.
step_requests (list[int]): step_requests[i] is the number of requests
that the ith engine step should process.
Returns:
OutputLen_NumReqs_Map : A dictionary with output-length as keys and the
number of requests required to have that output-length as values.
"""
ol_nr: OutputLen_NumReqs_Map = {}
# Number of requests that have been assigned an output-length
num_reqs_assigned: int = 0
num_steps: int = len(step_requests)
# sanity check. The first step (prefill-step), must process all requests.
assert step_requests[0] == batch_size
# Begin assignments from the last step.
output_length: int = num_steps
for num_requests_at_step in reversed(step_requests):
if num_reqs_assigned == batch_size:
break
assert num_reqs_assigned < batch_size
# Remove the number of requests that have been determined
# to participate in this step and beyond.
num_reqs_unassigned_at_step = num_requests_at_step - num_reqs_assigned
assert num_reqs_unassigned_at_step >= 0
if num_reqs_unassigned_at_step > 0:
ol_nr[output_length] = num_reqs_unassigned_at_step
num_reqs_assigned += num_reqs_unassigned_at_step
output_length -= 1
# sanity checks.
assert sum(ol_nr.values()) == batch_size, (
"Number of requests in output-length assignment does not match "
f"batch-size.\n batch size {batch_size} - "
f"step requests {step_requests} - assignments {ol_nr}"
)
# Check that the output-length is in [1, num-steps]. Output length must be
# at least 1 as all requests must participate in the prefill-step.
assert all(ol >= 1 and ol <= num_steps for ol in ol_nr), (
"Output lengths of requests should be in range "
f"[1, num-engine-steps].\n batch size {batch_size} - "
f"step requests {step_requests} - assignments {ol_nr}"
)
return ol_nr
def determine_requests_per_step(context: ProfileContext) -> list[int]:
"""
Determine number of requests each engine step should process.
If context.num_steps is set, then all engine steps process the
same number of requests and the output list is of length
context.num_steps.
If context.complete_num_requests_per_step is set, then each decode step
processes fewer and fewer requests until there are no requests to process.
In this case, the output list is as big as the number of steps
required to process all requests.
Args:
context: ProfileContext object.
Returns:
list[int]: Number of requests to process for all engine-steps.
output[i], contains the number of requests that the ith step
should process.
"""
if context.num_steps:
# All requests must run until num_engine_steps. This implies
# that their output lengths must be equal to num_engine_steps.
return [context.batch_size] * context.num_steps
assert (
context.complete_num_requests_per_step
and context.complete_num_requests_per_step > 0
), (
f"Expected a positive complete_num_requests_per_step argument."
f"Instead got {context.complete_num_requests_per_step}"
)
# We start dropping after the first decode step.
step_requests = [
context.batch_size, # prefill
context.batch_size, # decode
]
num_running_requests = context.batch_size
num_running_requests -= context.complete_num_requests_per_step
while num_running_requests > 0:
step_requests.append(num_running_requests)
num_running_requests -= context.complete_num_requests_per_step
if step_requests[-1] != 1:
# have 1 request running at the last step. This is often
# useful
step_requests.append(1)
return step_requests
def run_profile(
context: ProfileContext, csv_output: Optional[str], json_output: Optional[str]
):
print("Run profile with:")
for key, value in asdict(context).items():
print(f" {key} = {value}")
requests_per_step: list[int] = determine_requests_per_step(context)
ol_nr: OutputLen_NumReqs_Map = compute_request_output_lengths(
context.batch_size, requests_per_step
)
num_steps_to_profile: int = len(requests_per_step)
max_output_len: int = max(ol_nr.keys())
assert max_output_len >= 1
# Create sampling params
sampling_params = SamplingParams(
temperature=0.8,
top_p=0.95,
# max_tokens is set on a per-request basis.
max_tokens=None,
ignore_eos=True,
)
# Create LLM
llm = LLM(**asdict(context.engine_args))
batch_size = context.batch_size
prompt_len = context.prompt_len
scheduler_config = llm.llm_engine.vllm_config.scheduler_config
max_model_len = llm.llm_engine.model_config.max_model_len
max_num_batched_tokens = scheduler_config.max_num_batched_tokens
max_num_seqs = scheduler_config.max_num_seqs
if batch_size * prompt_len > max_num_batched_tokens:
print(
f"ERROR: chosen batch_size * prompt_len "
f"({batch_size} * {prompt_len} = {batch_size * prompt_len}) is "
f"larger than max_num_batched_tokens ({max_num_batched_tokens}) "
f"and therefore cannot be run in a single profile step, please "
f"choose a smaller batch size or prompt length, or increase "
f"--max-num-batched-tokens"
)
sys.exit(-1)
if batch_size > max_num_seqs:
print(
f"ERROR: chosen batch_size ({batch_size}) is larger than "
f"max_num_seqs ({max_num_seqs}) and therefore cannot be run in a "
f"single profile step, please choose a smaller batch size"
)
sys.exit(-1)
print(
"llm.llm_engine.model_config.max_model_len: ",
llm.llm_engine.model_config.max_model_len,
)
if prompt_len + max_output_len > llm.llm_engine.model_config.max_model_len:
print(
f"ERROR: chosen prompt_len + max_output_len ({prompt_len} + "
f"{max_output_len} = {prompt_len + max_output_len}) is larger "
f"than the model's max_model_len ({max_model_len}), please "
f"choose a smaller prompt_len or max_output_len, or increase "
f"--max-model-len"
)
sys.exit(-1)
def add_requests():
def get_output_len_generator() -> Generator[int, Any, Any]:
for output_len, num_reqs in ol_nr.items():
for _ in range(num_reqs):
yield output_len
output_len_generator = get_output_len_generator()
for i in range(batch_size):
sampling_params.max_tokens = next(output_len_generator)
assert isinstance(sampling_params.max_tokens, int)
prompt_token_ids = torch.randint(
llm.get_tokenizer().vocab_size, size=(prompt_len,)
).tolist()
llm.llm_engine.add_request(
request_id=f"seq{i}",
prompt={"prompt_token_ids": prompt_token_ids},
params=sampling_params,
)
def abort_requests():
for i in range(batch_size):
llm.llm_engine.abort_request(f"seq{i}")
# Warm up run
print("Warm up run ...")
add_requests()
llm.llm_engine.step() # Prefill
llm.llm_engine.step() # Decode
abort_requests()
print("Profile run ...")
add_requests()
with layerwise_profile() as prefill_prof:
llm.llm_engine.step() # First step is prefill
decode_profs = []
for _ in tqdm.tqdm(range(num_steps_to_profile - 1)):
num_running_seqs = llm.llm_engine.scheduler[0].get_num_unfinished_seq_groups()
with layerwise_profile(num_running_seqs=num_running_seqs) as decode_prof:
llm.llm_engine.step()
decode_profs.append(decode_prof)
decode_results_list = [prof.results for prof in decode_profs]
prefill_results = prefill_prof.results
has_decode = len(decode_results_list) > 0
LINE_WIDTH = 80
print("=" * LINE_WIDTH)
print(f"= Prefill Model Table (prompt_len={prompt_len}, batch_size={batch_size})")
print("=" * LINE_WIDTH)
print()
prefill_results.print_model_table()
if has_decode:
print()
print("=" * LINE_WIDTH)
print(
f"= First Decode Step Model Table "
f"(prompt_len={prompt_len}, batch_size={batch_size})"
)
print("=" * LINE_WIDTH)
print()
decode_results_list[0].print_model_table()
print()
print("=" * LINE_WIDTH)
print(f"= Prefill Summary Table (prompt_len={prompt_len}, batch_size={batch_size})")
print("=" * LINE_WIDTH)
print()
prefill_results.print_summary_table()
if has_decode:
print()
print("=" * LINE_WIDTH)
print(
f"= First Decode Step Summary Table "
f"(prompt_len={prompt_len}, batch_size={batch_size})"
)
print("=" * LINE_WIDTH)
print()
decode_results_list[0].print_summary_table()
if csv_output:
csv_filename_base = (
csv_output[:-4] if csv_output.endswith(".csv") else csv_output
)
prefill_results.export_model_stats_table_csv(
csv_filename_base + "_prefill_model_table.csv"
)
prefill_results.export_summary_stats_table_csv(
csv_filename_base + "_prefill_summary_table.csv"
)
if has_decode:
decode_results_list[0].export_model_stats_table_csv(
csv_filename_base + "_decode_model_table.csv"
)
decode_results_list[0].export_summary_stats_table_csv(
csv_filename_base + "_decode_summary_table.csv"
)
if json_output:
cuda_devices = [
torch.cuda.get_device_properties(dev_idx)
for dev_idx in range(torch.cuda.device_count())
]
json_dict = {
"context": {
"python_version": f"{sys.version}",
"torch_version": f"{torch.__version__}",
"torch_cuda_version": f"{torch.version.cuda}",
"cuda_devices": f"{cuda_devices}",
**asdict(context),
},
"prefill": prefill_results.convert_stats_to_dict(),
}
if has_decode:
for idx, dr in enumerate(decode_results_list):
json_dict[f"decode_{idx + 1}"] = dr.convert_stats_to_dict()
# Add .json to json_output filename if it doesn't exist already.
json_output_file = (
json_output if json_output.endswith(".json") else json_output + ".json"
)
with open(json_output_file, "w+") as f:
json.dump(json_dict, f, indent=2)
if context.save_chrome_traces_folder is not None:
os.makedirs(context.save_chrome_traces_folder, exist_ok=True)
prefill_prof.profiler.export_chrome_trace(
context.save_chrome_traces_folder + "/prefill.json"
)
for idx, decode_prof in enumerate(decode_profs):
decode_prof.profiler.export_chrome_trace(
context.save_chrome_traces_folder + f"/decode_{idx + 1}.json"
)
print(
"Traces saved as prefill.json and decode_1.json, etc."
f" in folder {context.save_chrome_traces_folder}"
)
def parse_args():
parser = FlexibleArgumentParser(
description="""
Profile a model
example:
```
python examples/offline_inference/profiling.py \\
--model neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8 --batch-size 4 \\
--prompt-len 512 --max-num-batched-tokens 8196 --json Llama31-8b-FP8 \\
--enforce-eager run_num_steps -n 2
```
then you can use various tools to analyze the json output
terminal ascii tables:
```
python tools/profiler/print_layerwise_table.py \\
--json-trace Llama31-8b-FP8.json --phase prefill --table summary
```
or create matplotlib stacked bar charts:
```
python tools/profiler/visualize_layerwise_profile.py \\
--json-trace Llama31-8b-FP8.json \\
--output-directory profile_breakdown --plot-metric pct_cuda_time
```
""",
formatter_class=RawTextHelpFormatter,
)
parser.add_argument(
"--csv",
type=str,
default=None,
help="Export the results as multiple csv file. This should be the root "
"filename, will create <filename>_prefill_model_table.csv, "
"<filename>_prefill_summary_table.csv, "
"<filename>_decode_model_table.csv, and "
"<filename>_decode_summary_table.csv",
)
parser.add_argument(
"--json",
type=str,
default=None,
help="Export the results as a json file. This should be the filename",
)
parser.add_argument(
"--save-chrome-traces-folder",
type=str,
help="Save chrome traces for the prefill and decode "
"will save traces as prefill.json and decode_1.json, "
"etc. inside this folder",
)
parser.add_argument(
"--prompt-len",
type=int,
default=PROMPT_LEN_DEFAULT,
help=f"Length of the random prompt to use when profiling, all batched "
f"requests use the same prompt_len, default={PROMPT_LEN_DEFAULT}",
)
parser.add_argument(
"--batch-size",
type=int,
default=BATCH_SIZE_DEFAULT,
help=f"Number of requests to run as a single batch, "
f"default={BATCH_SIZE_DEFAULT}",
)
subparsers = parser.add_subparsers(dest="cmd")
run_num_steps_parser = subparsers.add_parser(
"run_num_steps", help="This variation profiles n engine.step() invocations."
)
run_num_steps_parser.add_argument(
"-n",
"--num-steps",
type=int,
help="Number of engine steps to profile.\n"
"Setting it to 1, profiles only the prefill step.\n"
"Setting it to 2, profiles the prefill and first decode step\n"
"Setting it to 3, profiles the prefill, 1st and 2nd decode steps\n"
"and so on ...",
)
run_to_completion_parser = subparsers.add_parser(
"run_to_completion",
help="This variation profiles all the engine.step() invocations"
"until the engine exhausts all submitted requests.",
)
run_to_completion_parser.add_argument(
"-n",
"--complete-num-requests-per-step",
type=int,
help="Complete complete_num_requests_per_step requests every decode step."
"For e.g., with batch_size 128 and complete_num_requests_per_step 32,"
"the profiler is run for 6 engine steps, with the steps processing, "
"128, 128, 96, 64, 32, 1 requests respectively.\n"
"Note that we tack-on a one-request step at the end as it is often "
"useful.",
)
EngineArgs.add_cli_args(parser)
return parser.parse_args()
def main(args):
context = ProfileContext(
engine_args=EngineArgs.from_cli_args(args),
**{
k: v
for k, v in vars(args).items()
if k in inspect.signature(ProfileContext).parameters
},
)
run_profile(context, csv_output=args.csv, json_output=args.json)
if __name__ == "__main__":
args = parse_args()
main(args)
# vLLM TPU Profiling
This script is used to profile the TPU performance of vLLM for specific prefill or decode token shapes.
Note: an actual running server handles a mix of both prefills and decodes of many different shapes.
We assume you are on a TPU already (this was tested on TPU v6e) and have installed vLLM according to the [Google TPU installation guide](https://docs.vllm.ai/en/latest/getting_started/installation/google_tpu.html).
> In all the examples below, we run several warmup iterations first, so using `--enforce-eager` is fine.
## Profile Examples
### Generate Prefill Trace
This example runs Qwen/Qwen2.5-7B-Instruct with a single request of 1024 input tokens. This is set up in an attempt to profile just the prefill time and operations.
```bash
export XLA_HLO_DEBUG=1
export MODEL=Qwen/Qwen2.5-7B-Instruct
export VLLM_TPU_PROFILE_DURATION_MS=3000
export VLLM_TPU_PROFILE_DELAY_MS=0
python3 profiling.py \
--model $MODEL \
--input-len 1024 --output-len 1 \
--batch-size 1 --enforce-eager \
--max-model-len 2048 \
--tensor-parallel-size 1 \
--profile-result-dir profiles
```
### Generate Decode Trace
This example runs Llama 3.1 70B with a batch of 32 requests, where each has 1 input token and 128 output tokens. This is set up in an attempt to profile just the 32 decodes running in parallel, by using an extremely small prefill of 1 token and setting `VLLM_TPU_PROFILE_DELAY_MS=1000` to skip the first second of inference (which should cover the prefill).
```bash
export XLA_HLO_DEBUG=1
export MODEL=meta-llama/Llama-3.1-70B-Instruct
export VLLM_TPU_PROFILE_DURATION_MS=2000
export VLLM_TPU_PROFILE_DELAY_MS=1000
rm -rf ~/.cache/vllm/xla_cache
python3 profiling.py \
--model $MODEL \
--input-len 1 \
--output-len 128 \
--batch-size 32 \
--enforce-eager \
--profile-result-dir profiles \
--max-model-len 2048 --tensor-parallel-size 8
```
## Visualizing the profiles
Once you have collected your profiles with this script, you can visualize them using [TensorBoard](https://cloud.google.com/tpu/docs/pytorch-xla-performance-profiling-tpu-vm).
You will most likely need to install the following dependencies:
```bash
pip install tensorflow-cpu \
tensorboard-plugin-profile \
etils \
importlib_resources
```
Then you just need to point TensorBoard to the directory where you saved the profiles and visit `http://localhost:6006/` in your browser:
```bash
tensorboard --logdir profiles/ --port 6006
```
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import dataclasses
import os
import time
import numpy as np
import torch_xla.debug.profiler as xp
from tqdm import tqdm
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.inputs import PromptType
from vllm.utils import FlexibleArgumentParser
DURATION_MS = int(os.getenv("VLLM_TPU_PROFILE_DURATION_MS", 3000))
DELAY_MS = int(os.getenv("VLLM_TPU_PROFILE_DELAY_MS", 0))
def main(args: argparse.Namespace):
print(args)
engine_args = EngineArgs.from_cli_args(args)
llm = LLM(**dataclasses.asdict(engine_args))
server = xp.start_server(9012) # noqa: F841
sampling_params = SamplingParams(
temperature=0.0,
ignore_eos=True,
max_tokens=args.output_len,
)
print(sampling_params)
dummy_prompt_token_ids = np.random.randint(
10000, size=(args.batch_size, args.input_len)
)
dummy_prompts: list[PromptType] = [
{"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist()
]
def run_to_completion():
start_time = time.perf_counter()
llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False)
end_time = time.perf_counter()
latency = end_time - start_time
return latency
# Warmup
print("Warming up...")
warmup_latencies = []
for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
warmup_latencies.append(run_to_completion())
print(f"Average warmup latency: {np.mean(warmup_latencies):.4f}s")
# Profile
profile_dir = args.profile_result_dir
print(f"Profiling (results will be saved to '{profile_dir}')...")
# Enable tracing on server
xp.trace_detached(
"localhost:9012", profile_dir, delay_ms=DELAY_MS, duration_ms=DURATION_MS
)
if DELAY_MS == 0:
time.sleep(1.0)
profile_latencies = []
for _ in tqdm(range(args.num_iters), desc="Profile iterations"):
profile_latencies.append(run_to_completion())
print(f"Average profile latency: {np.mean(profile_latencies):.4f}s")
return
def parse_args():
parser = FlexibleArgumentParser(
description="Benchmark the latency of processing a single batch of "
"requests till completion."
)
parser.add_argument("--input-len", type=int, default=32)
parser.add_argument("--output-len", type=int, default=128)
parser.add_argument("--batch-size", type=int, default=8)
parser.add_argument(
"--num-iters-warmup",
type=int,
default=5,
help="Number of iterations to run for warmup.",
)
parser.add_argument(
"--num-iters",
type=int,
default=1,
help="Number of iterations to run for profiling.",
)
parser.add_argument(
"--profile-result-dir",
type=str,
default="profiles",
help=(
"path to save the pytorch profiler output. Can be visualized "
"with ui.perfetto.dev or Tensorboard "
"(https://cloud.google.com/tpu/docs/pytorch-xla-performance-profiling-tpu-vm)."
),
)
parser = EngineArgs.add_cli_args(parser)
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
main(args)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Demonstrates how to generate prompt embeddings using
Hugging Face Transformers and use them as input to vLLM
for both single and batch inference.
Model: meta-llama/Llama-3.2-1B-Instruct
Note: This model is gated on Hugging Face Hub.
You must request access to use it:
https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct
Requirements:
- vLLM
- transformers
Run:
python examples/offline_inference/prompt_embed_inference.py
"""
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizer
from vllm import LLM
def init_tokenizer_and_llm(model_name: str):
tokenizer = AutoTokenizer.from_pretrained(model_name)
transformers_model = AutoModelForCausalLM.from_pretrained(model_name)
embedding_layer = transformers_model.get_input_embeddings()
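# enable_prompt_embeds=True lets vLLM accept precomputed embeddings via the
# "prompt_embeds" field instead of a text prompt.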
llm = LLM(model=model_name, enable_prompt_embeds=True)
return tokenizer, embedding_layer, llm
def get_prompt_embeds(
chat: list[dict[str, str]],
tokenizer: PreTrainedTokenizer,
embedding_layer: torch.nn.Module,
):
token_ids = tokenizer.apply_chat_template(
chat, add_generation_prompt=True, return_tensors="pt"
)
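# Map token ids to input embeddings and drop the batch dimension, yielding
# a (seq_len, hidden_size) tensor.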
prompt_embeds = embedding_layer(token_ids).squeeze(0)
return prompt_embeds
def single_prompt_inference(
llm: LLM, tokenizer: PreTrainedTokenizer, embedding_layer: torch.nn.Module
):
chat = [{"role": "user", "content": "Please tell me about the capital of France."}]
prompt_embeds = get_prompt_embeds(chat, tokenizer, embedding_layer)
outputs = llm.generate(
{
"prompt_embeds": prompt_embeds,
}
)
print("\n[Single Inference Output]")
print("-" * 30)
for o in outputs:
print(o.outputs[0].text)
print("-" * 30)
def batch_prompt_inference(
llm: LLM, tokenizer: PreTrainedTokenizer, embedding_layer: torch.nn.Module
):
chats = [
[{"role": "user", "content": "Please tell me about the capital of France."}],
[{"role": "user", "content": "When is the day longest during the year?"}],
[{"role": "user", "content": "Where is bigger, the moon or the sun?"}],
]
prompt_embeds_list = [
get_prompt_embeds(chat, tokenizer, embedding_layer) for chat in chats
]
outputs = llm.generate([{"prompt_embeds": embeds} for embeds in prompt_embeds_list])
print("\n[Batch Inference Outputs]")
print("-" * 30)
for i, o in enumerate(outputs):
print(f"Q{i + 1}: {chats[i][0]['content']}")
print(f"A{i + 1}: {o.outputs[0].text}\n")
print("-" * 30)
def main():
model_name = "meta-llama/Llama-3.2-1B-Instruct"
tokenizer, embedding_layer, llm = init_tokenizer_and_llm(model_name)
single_prompt_inference(llm, tokenizer, embedding_layer)
batch_prompt_inference(llm, tokenizer, embedding_layer)
if __name__ == "__main__":
main()
# Qwen2.5-Omni Offline Inference Examples
This folder provides several example scripts showing how to run offline inference with Qwen2.5-Omni.
## Thinker Only
```bash
# Audio + image + video
python examples/offline_inference/qwen2_5_omni/only_thinker.py \
-q mixed_modalities
# Read vision and audio inputs from a single video file
# NOTE: V1 engine does not support interleaved modalities yet.
VLLM_USE_V1=0 \
python examples/offline_inference/qwen2_5_omni/only_thinker.py \
-q use_audio_in_video
# Multiple audios
VLLM_USE_V1=0 \
python examples/offline_inference/qwen2_5_omni/only_thinker.py \
-q multi_audios
```
This script runs the thinker part of Qwen2.5-Omni and generates a text response.
You can also test Qwen2.5-Omni on a single modality:
```bash
# Process audio inputs
python examples/offline_inference/audio_language.py \
--model-type qwen2_5_omni
# Process image inputs
python examples/offline_inference/vision_language.py \
--modality image \
--model-type qwen2_5_omni
# Process video inputs
python examples/offline_inference/vision_language.py \
--modality video \
--model-type qwen2_5_omni
```
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to use vLLM for running offline inference
with the correct prompt format on Qwen2.5-Omni (thinker only).
"""
from typing import NamedTuple
import vllm.envs as envs
from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.multimodal.image import convert_image_mode
from vllm.utils import FlexibleArgumentParser
class QueryResult(NamedTuple):
inputs: dict
limit_mm_per_prompt: dict[str, int]
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.
default_system = (
"You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
"Group, capable of perceiving auditory and visual inputs, as well as "
"generating text and speech."
)
def get_mixed_modalities_query() -> QueryResult:
question = (
"What is recited in the audio? "
"What is the content of this image? Why is this video funny?"
)
prompt = (
f"<|im_start|>system\n{default_system}<|im_end|>\n"
"<|im_start|>user\n<|audio_bos|><|AUDIO|><|audio_eos|>"
"<|vision_bos|><|IMAGE|><|vision_eos|>"
"<|vision_bos|><|VIDEO|><|vision_eos|>"
f"{question}<|im_end|>\n"
f"<|im_start|>assistant\n"
)
return QueryResult(
inputs={
"prompt": prompt,
"multi_modal_data": {
"audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
"image": convert_image_mode(
ImageAsset("cherry_blossom").pil_image, "RGB"
),
"video": VideoAsset(name="baby_reading", num_frames=16).np_ndarrays,
},
},
limit_mm_per_prompt={"audio": 1, "image": 1, "video": 1},
)
def get_use_audio_in_video_query() -> QueryResult:
question = (
"Describe the content of the video, then convert what the baby say into text."
)
prompt = (
f"<|im_start|>system\n{default_system}<|im_end|>\n"
"<|im_start|>user\n<|vision_bos|><|VIDEO|><|vision_eos|>"
f"{question}<|im_end|>\n"
f"<|im_start|>assistant\n"
)
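# The audio track is extracted from the same video asset, so both the audio
# and the vision inputs come from a single video file.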
asset = VideoAsset(name="baby_reading", num_frames=16)
audio = asset.get_audio(sampling_rate=16000)
assert not envs.VLLM_USE_V1, (
"V1 does not support use_audio_in_video. "
"Please launch this example with "
"`VLLM_USE_V1=0`."
)
return QueryResult(
inputs={
"prompt": prompt,
"multi_modal_data": {
"video": asset.np_ndarrays,
"audio": audio,
},
"mm_processor_kwargs": {
"use_audio_in_video": True,
},
},
limit_mm_per_prompt={"audio": 1, "video": 1},
)
def get_multi_audios_query() -> QueryResult:
question = "Are these two audio clips the same?"
prompt = (
f"<|im_start|>system\n{default_system}<|im_end|>\n"
"<|im_start|>user\n<|audio_bos|><|AUDIO|><|audio_eos|>"
"<|audio_bos|><|AUDIO|><|audio_eos|>"
f"{question}<|im_end|>\n"
f"<|im_start|>assistant\n"
)
return QueryResult(
inputs={
"prompt": prompt,
"multi_modal_data": {
"audio": [
AudioAsset("winning_call").audio_and_sample_rate,
AudioAsset("mary_had_lamb").audio_and_sample_rate,
],
},
},
limit_mm_per_prompt={
"audio": 2,
},
)
query_map = {
"mixed_modalities": get_mixed_modalities_query,
"use_audio_in_video": get_use_audio_in_video_query,
"multi_audios": get_multi_audios_query,
}
def main(args):
model_name = "Qwen/Qwen2.5-Omni-7B"
query_result = query_map[args.query_type]()
llm = LLM(
model=model_name,
max_model_len=5632,
max_num_seqs=5,
limit_mm_per_prompt=query_result.limit_mm_per_prompt,
seed=args.seed,
)
# We set temperature to 0.2 so that outputs can be different
# even when all prompts are identical when running batch inference.
sampling_params = SamplingParams(temperature=0.2, max_tokens=64)
outputs = llm.generate(query_result.inputs, sampling_params=sampling_params)
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
def parse_args():
parser = FlexibleArgumentParser(
description="Demo on using vLLM for offline inference with "
"audio language models"
)
parser.add_argument(
"--query-type",
"-q",
type=str,
default="mixed_modalities",
choices=query_map.keys(),
help="Query type.",
)
parser.add_argument(
"--seed",
type=int,
default=None,
help="Set the seed when initializing `vllm.LLM`.",
)
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
main(args)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
from vllm import LLM
model_name = "Qwen/Qwen3-Reranker-0.6B"
# What is the difference between the official original version and one
# that has been converted into a sequence classification model?
# Qwen3-Reranker is a language model that performs reranking using the
# logits of the "no" and "yes" tokens.
# This requires computing logits over all 151669 vocabulary tokens, which is
# extremely inefficient, not to mention incompatible with the vLLM score API.
# A method for converting the original model into a sequence classification
# model was proposed. See: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B/discussions/3
# Models converted offline with this method are not only more efficient and
# compatible with the vLLM score API, but also simpler to initialize, e.g.:
# model = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", task="score")
# If you want to load the official original version, initialize it as follows.
def get_model() -> LLM:
"""Initializes and returns the LLM model for Qwen3-Reranker."""
return LLM(
model=model_name,
task="score",
hf_overrides={
"architectures": ["Qwen3ForSequenceClassification"],
"classifier_from_token": ["no", "yes"],
"is_original_qwen3_reranker": True,
},
)
# Why do we need hf_overrides for the official original version:
# vLLM converts it to Qwen3ForSequenceClassification when loaded, for
# better performance.
# - First, `"architectures": ["Qwen3ForSequenceClassification"]` manually
#   routes the model to Qwen3ForSequenceClassification.
# - Then, `"classifier_from_token": ["no", "yes"]` extracts the vectors
#   corresponding to those tokens from lm_head.
# - Finally, the two vectors are converted into a single classifier vector;
#   this conversion logic is enabled by `"is_original_qwen3_reranker": True`.
# Use the query_template and document_template to format the query and
# document for better reranking results.
prefix = '<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|>\n<|im_start|>user\n'
suffix = "<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\n"
query_template = "{prefix}<Instruct>: {instruction}\n<Query>: {query}\n"
document_template = "<Document>: {doc}{suffix}"
def main() -> None:
instruction = (
"Given a web search query, retrieve relevant passages that answer the query"
)
queries = [
"What is the capital of China?",
"Explain gravity",
]
documents = [
"The capital of China is Beijing.",
"Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.",
]
queries = [
query_template.format(prefix=prefix, instruction=instruction, query=query)
for query in queries
]
documents = [document_template.format(doc=doc, suffix=suffix) for doc in documents]
model = get_model()
outputs = model.score(queries, documents)
print("-" * 30)
print([output.outputs.score for output in outputs])
print("-" * 30)
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
from urllib.request import urlopen
from vllm import LLM, SamplingParams
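# The DUAL_CHUNK_FLASH_ATTN backend enables the dual chunk attention used by
# the Qwen2.5-1M long-context models, and VLLM_ALLOW_LONG_MAX_MODEL_LEN
# permits a max_model_len beyond the length derived from the model config.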
os.environ["VLLM_ATTENTION_BACKEND"] = "DUAL_CHUNK_FLASH_ATTN"
os.environ["VLLM_ALLOW_LONG_MAX_MODEL_LEN"] = "1"
def load_prompt() -> str:
# Test cases with various lengths can be found at:
#
# https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/64k.txt
# https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/200k.txt
# https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/600k.txt
# https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/1m.txt
with urlopen(
"https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2.5-1M/test-data/600k.txt",
timeout=5,
) as response:
prompt = response.read().decode("utf-8")
return prompt
# Process the prompts.
def process_requests(llm: LLM, prompts: list[str]) -> None:
# Create a sampling params object.
sampling_params = SamplingParams(
temperature=0.7,
top_p=0.8,
top_k=20,
repetition_penalty=1.05,
detokenize=True,
max_tokens=256,
)
# Generate texts from the prompts.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt_token_ids = output.prompt_token_ids
generated_text = output.outputs[0].text
print(
f"Prompt length: {len(prompt_token_ids)}, "
f"Generated text: {generated_text!r}"
)
# Create an LLM.
def initialize_engine() -> LLM:
llm = LLM(
model="Qwen/Qwen2.5-7B-Instruct-1M",
max_model_len=1048576,
tensor_parallel_size=4,
enforce_eager=True,
enable_chunked_prefill=True,
max_num_batched_tokens=131072,
)
return llm
def main():
llm = initialize_engine()
prompt = load_prompt()
process_requests(llm, [prompt])
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Demonstrates how to achieve reproducibility in vLLM.
Main article: https://docs.vllm.ai/en/latest/usage/reproducibility.html
"""
import os
import random
from vllm import LLM, SamplingParams
# V1 only: Turn off multiprocessing to make the scheduling deterministic.
os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
# V0 only: Set the global seed. The default seed is None, which is
# not reproducible.
SEED = 42
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
def main():
llm = LLM(model="facebook/opt-125m", seed=SEED)
outputs = llm.generate(prompts, sampling_params)
print("-" * 50)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-" * 50)
# Try generating random numbers outside vLLM
# The same number is output across runs, meaning that the random state
# in the user code has been updated by vLLM
print(random.randint(0, 100))
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
A simple demonstration of RLHF with vLLM, inspired by
the OpenRLHF framework: https://github.com/OpenRLHF/OpenRLHF .
It follows the design in which training processes and inference processes
are separate and live on different GPUs.
Training processes send prompts to inference processes to generate data,
and synchronize the model weights by broadcasting them
from the training process to the inference process.
Note that this is a simple demonstration with one training instance and one
inference instance. In practice, there can be multiple training instances
and multiple inference instances. For the full implementation, please refer
to the OpenRLHF framework.
"""
import os
import ray
import torch
from ray.util.placement_group import placement_group
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
from rlhf_utils import stateless_init_process_group
from transformers import AutoModelForCausalLM
from vllm import LLM, SamplingParams
from vllm.utils import get_ip, get_open_port
class MyLLM(LLM):
def __init__(self, *args, **kwargs):
# a hack to make the script work.
# stop ray from manipulating CUDA_VISIBLE_DEVICES
# at the top-level
os.environ.pop("CUDA_VISIBLE_DEVICES", None)
super().__init__(*args, **kwargs)
"""
Start the training process; here we use Hugging Face Transformers
as an example to hold a model on GPU 0.
"""
train_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
train_model.to("cuda:0")
"""
Start the inference process; here we use vLLM to hold a model on GPU 1 and
GPU 2. For details on how to use Ray, please refer to the Ray
documentation: https://docs.ray.io/en/latest/ .
"""
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"
ray.init()
pg_inference = placement_group([{"GPU": 1, "CPU": 0}] * 2)
ray.get(pg_inference.ready())
scheduling_inference = PlacementGroupSchedulingStrategy(
placement_group=pg_inference,
placement_group_capture_child_tasks=True,
placement_group_bundle_index=0,
)
"""
Launch the vLLM inference engine.
Here we use `enforce_eager` to reduce the startup time.
"""
llm = ray.remote(
num_cpus=0,
num_gpus=0,
scheduling_strategy=scheduling_inference,
)(MyLLM).remote(
model="facebook/opt-125m",
enforce_eager=True,
worker_extension_cls="rlhf_utils.WorkerExtension",
tensor_parallel_size=2,
distributed_executor_backend="ray",
)
# Generate texts from the prompts.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = SamplingParams(temperature=0)
outputs = ray.get(llm.generate.remote(prompts, sampling_params))
print("-" * 50)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-" * 50)
# set up the communication between the training process
# and the inference engine.
master_address = get_ip()
master_port = get_open_port()
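# The weight-update group spans 3 ranks: rank 0 is the training process and
# ranks 1-2 are the two vLLM workers (tensor_parallel_size=2), hence
# rank_offset=1 and world_size=3.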
handle = llm.collective_rpc.remote(
"init_weight_update_group", args=(master_address, master_port, 1, 3)
)
model_update_group = stateless_init_process_group(
master_address, master_port, 0, 3, torch.device("cuda:0")
)
ray.get(handle)
# Simulate training by modifying the model weights.
for name, p in train_model.named_parameters():
p.data.zero_()
# sync weight from the training process to the inference engine.
for name, p in train_model.named_parameters():
handle = llm.collective_rpc.remote("update_weight", args=(name, p.dtype, p.shape))
model_update_group.broadcast(p, src=0, stream=torch.cuda.current_stream())
ray.get(handle)
# check if the weights are updated.
assert all(ray.get(llm.collective_rpc.remote("check_weights_changed")))
# Use the updated model to generate texts; they will be nonsense
# because the weights are all zeros.
outputs_updated = ray.get(llm.generate.remote(prompts, sampling_params))
print("-" * 50)
for output in outputs_updated:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-" * 50)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
A simple demonstration of how to co-locate vLLM workers with training
actors on the same GPUs, for RLHF-like applications.
The key points:
- Control the placement of the vLLM workers with Ray by setting
  VLLM_RAY_PER_WORKER_GPUS and VLLM_RAY_BUNDLE_INDICES appropriately.
- Use CUDA IPC to pass tensors, since NCCL does not work when multiple
  processes share the same GPU.
"""
import os
import ray
import torch
from ray.util.placement_group import placement_group
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
from vllm import LLM
class MyLLM(LLM):
def __init__(self, *args, bundle_indices: list, **kwargs):
# a hack to make the script work.
# stop ray from manipulating CUDA_VISIBLE_DEVICES
# at the top-level
os.environ.pop("CUDA_VISIBLE_DEVICES", None)
# every worker will use 0.4 GPU, so that we can schedule
# 2 instances on the same GPUs.
os.environ["VLLM_RAY_PER_WORKER_GPUS"] = "0.4"
os.environ["VLLM_RAY_BUNDLE_INDICES"] = ",".join(map(str, bundle_indices))
print(f"creating LLM with bundle_indices={bundle_indices}")
super().__init__(*args, **kwargs)
class RayTrainingActor:
def __init__(self):
# ray will set CUDA_VISIBLE_DEVICES to the assigned GPUs
from transformers import AutoModelForCausalLM
self.model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
self.model.to("cuda:0")
for name, p in self.model.named_parameters():
p.data.zero_()
torch.cuda.synchronize()
# the argument for get_device_uuid is the index
# of the GPU in the visible devices.
from vllm.platforms import current_platform
self.device_uuid = current_platform.get_device_uuid(0)
def report_device_id(self) -> str:
return self.device_uuid
def get_weight_ipc_handles(self):
from torch.multiprocessing.reductions import reduce_tensor
data = {}
for name, p in self.model.named_parameters():
# the training actor might hold only a subset of the weights and would
# need to all-gather them from all the actors.
# for demonstration, here we assume all training actors have
# the full weights.
data[name] = reduce_tensor(p.detach())
return {self.device_uuid: data}
# ray manages 4 GPUs
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
ray.init()
# we want to co-locate vLLM instance and the training actor
# on the same set of GPUs.
# the placement plan is as follows:
# GPU 0 and 1: training actor 0, 1, and vLLM instance 0 (with TP=2)
# GPU 2 and 3: training actor 2, 3, and vLLM instance 1 (with TP=2)
pg = placement_group([{"GPU": 1, "CPU": 0}] * 4)
ray.get(pg.ready())
print(f"placement group has bundles {pg.bundle_specs=}")
training_actors = []
training_actor_device_ids = []
inference_engines = []
inference_engine_device_ids = []
for bundle_index in [0, 1, 2, 3]:
training_actor = ray.remote(
num_cpus=0,
num_gpus=0.4,
scheduling_strategy=PlacementGroupSchedulingStrategy(
placement_group=pg,
placement_group_capture_child_tasks=True,
placement_group_bundle_index=bundle_index,
),
)(RayTrainingActor).remote()
training_actors.append(training_actor)
for bundle_index, training_actor in enumerate(training_actors):
device_id = ray.get(training_actor.report_device_id.remote())
print(f"training actor {bundle_index} is on {device_id}")
training_actor_device_ids.append(device_id)
for i, bundle_indices in enumerate([[0, 1], [2, 3]]):
# IMPORTANT: when creating vLLM instances, we need to
# make sure there are no GPU activities on the target GPUs,
# otherwise, they will interfere with the vLLM memory profiling,
# and cause unexpected behaviors.
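# Each vLLM worker claims 0.4 GPU (VLLM_RAY_PER_WORKER_GPUS, set in MyLLM)
# and caps its memory with gpu_memory_utilization=0.4, so it can share a GPU
# with a training actor that also reserves 0.4 GPU.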
llm = ray.remote(
num_cpus=0,
num_gpus=0,
scheduling_strategy=PlacementGroupSchedulingStrategy(
placement_group=pg,
placement_group_capture_child_tasks=True,
),
)(MyLLM).remote(
model="facebook/opt-125m",
enforce_eager=True,
worker_extension_cls="rlhf_utils.ColocateWorkerExtension",
tensor_parallel_size=2,
distributed_executor_backend="ray",
gpu_memory_utilization=0.4,
bundle_indices=bundle_indices,
)
inference_engines.append(llm)
# don't call any method on the inference engine here,
# otherwise it will block until the vLLM instance is created.
for i, llm in enumerate(inference_engines):
inference_engine_device_ids.append(
ray.get(llm.collective_rpc.remote("report_device_id", args=tuple()))
)
print(f"inference engine {i} is on {inference_engine_device_ids[-1]}")
# check the placement
# the first two training actors should be
# on the same GPUs as the first inference engine
assert training_actor_device_ids[:2] == inference_engine_device_ids[0]
# the last two training actors should be
# on the same GPUs as the second inference engine
assert training_actor_device_ids[2:] == inference_engine_device_ids[1]
print("gather all the IPC handles from the training actors")
ipc_handles = {}
for actor in training_actors:
ipc_handles.update(ray.get(actor.get_weight_ipc_handles.remote()))
print("update the weights of the inference engines")
for llm in inference_engines:
ray.get(
llm.collective_rpc.remote(
"update_weights_from_ipc_handles", args=(ipc_handles,)
)
)
print("check if the weights are updated")
for llm in inference_engines:
assert ray.get(llm.collective_rpc.remote("check_weights_changed", args=tuple()))
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
def stateless_init_process_group(master_address, master_port, rank, world_size, device):
"""
vLLM provides `StatelessProcessGroup` to create a process group
without considering the global process group in torch.distributed.
It is recommended to create a `StatelessProcessGroup` first, and then
initialize the data-plane communication (NCCL) between the external
(training) processes and the vLLM workers.
"""
from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
from vllm.distributed.utils import StatelessProcessGroup
pg = StatelessProcessGroup.create(
host=master_address, port=master_port, rank=rank, world_size=world_size
)
pynccl = PyNcclCommunicator(pg, device=device)
return pynccl
class WorkerExtension:
"""
The class for vLLM's worker to inherit from.
By defining an extension class, the code works regardless of the
underlying worker class, which keeps it compatible with both vLLM V0
and V1.
NOTE: we define this class in a separate module, and the main module
should pass its fully qualified name as the `worker_extension_cls` argument.
"""
def init_weight_update_group(
self, master_address, master_port, rank_offset, world_size
):
from vllm.distributed.parallel_state import get_world_group
rank = get_world_group().rank + rank_offset
self.model_update_group = stateless_init_process_group(
master_address,
master_port,
rank,
world_size,
self.device,
)
def update_weight(self, name, dtype, shape):
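# Receive the weight broadcast from rank 0 (the training process) and load
# it into the model served by this worker.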
weight = torch.empty(shape, dtype=dtype, device="cuda")
self.model_update_group.broadcast(
weight, src=0, stream=torch.cuda.current_stream()
)
self.model_runner.model.load_weights(weights=[(name, weight)])
del weight
def check_weights_changed(self):
"""
Check if the weights are updated to 0.
"""
weights_updated = True
for name, p in self.model_runner.model.named_parameters():
weights_updated = weights_updated and torch.allclose(p, torch.zeros_like(p))
return weights_updated
class ColocateWorkerExtension:
"""
The class for vLLM's worker to inherit from, in the colocate setting.
By defining an extension class, the code works regardless of the
underlying worker class, which keeps it compatible with both vLLM V0
and V1.
NOTE: we define this class in a separate module, and the main module
should pass its fully qualified name as the `worker_extension_cls` argument.
"""
def report_device_id(self) -> str:
from vllm.platforms import current_platform
self.device_uuid = current_platform.get_device_uuid(self.device.index)
return self.device_uuid
def update_weights_from_ipc_handles(self, ipc_handles):
handles = ipc_handles[self.device_uuid]
device_id = self.device.index
weights = []
for name, handle in handles.items():
func, args = handle
list_args = list(args)
# the key is to change device id to the current device id
# in case two processes have different CUDA_VISIBLE_DEVICES
list_args[6] = device_id
tensor = func(*list_args)
weights.append((name, tensor))
self.model_runner.model.load_weights(weights=weights)
torch.cuda.synchronize()
def check_weights_changed(self):
"""
Check if the weights are updated to 0.
"""
weights_updated = True
for name, p in self.model_runner.model.named_parameters():
weights_updated = weights_updated and torch.allclose(p, torch.zeros_like(p))
return weights_updated