Commit ec5e299c authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.7.3' into v0.7.3-dev

parents 47bd229c ed6e9075
# SPDX-License-Identifier: Apache-2.0
from dataclasses import asdict
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser
def get_prompts(num_prompts: int):
# The default sample prompts.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
if num_prompts != len(prompts):
prompts = (prompts * ((num_prompts // len(prompts)) + 1))[:num_prompts]
return prompts
def main(args):
# Create prompts
prompts = get_prompts(args.num_prompts)
# Create a sampling params object.
sampling_params = SamplingParams(n=args.n,
temperature=args.temperature,
top_p=args.top_p,
top_k=args.top_k,
max_tokens=args.max_tokens)
# Create an LLM.
# The default model is 'facebook/opt-125m'
engine_args = EngineArgs.from_cli_args(args)
llm = LLM(**asdict(engine_args))
# Generate texts from the prompts.
# The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
if __name__ == '__main__':
parser = FlexibleArgumentParser()
parser = EngineArgs.add_cli_args(parser)
group = parser.add_argument_group("SamplingParams options")
group.add_argument("--num-prompts",
type=int,
default=4,
help="Number of prompts used for inference")
group.add_argument("--max-tokens",
type=int,
default=16,
help="Generated output length for sampling")
group.add_argument('--n',
type=int,
default=1,
help='Number of generated sequences per prompt')
group.add_argument('--temperature',
type=float,
default=0.8,
help='Temperature for text generation')
group.add_argument('--top-p',
type=float,
default=0.95,
help='top_p for text generation')
group.add_argument('--top-k',
type=int,
default=-1,
help='top_k for text generation')
args = parser.parse_args()
main(args)
# SPDX-License-Identifier: Apache-2.0
from vllm import LLM, SamplingParams
# Sample prompts.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create an LLM.
llm = LLM(model="meta-llama/Llama-2-13b-chat-hf", cpu_offload_gb=10)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
# SPDX-License-Identifier: Apache-2.0
"""
This file demonstrates the example usage of disaggregated prefilling
We will launch 2 vllm instances (GPU 0 for prefill and GPU 1 for decode),
and then transfer the KV cache between them.
"""
import os
import time
from multiprocessing import Event, Process
from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig
def run_prefill(prefill_done):
# We use GPU 0 for prefill node.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# The prefill node receives two requests, while the decode node receives
# three requests. So the decode node will only receive the KV Cache for
# requests 1 and 3. The decode node will use the KV Cache of requests 1
# and 3 and do prefilling on request 2.
prompts = [
"Hello, my name is",
"Hi, your name is",
# The decode node will actually "prefill" this request.
"Tell me a very long story",
]
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)
# Using PyNcclConnector to transmit KV caches between vLLM instances.
# This instance is the prefill node (kv_producer, rank 0).
# The number of parallel instances for KV cache transfer is set to 2,
# as required for PyNcclConnector.
ktc = KVTransferConfig.from_cli(
'{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2}'
)
# Set GPU memory utilization to 0.8 for an A6000 GPU with 40GB
# memory. You may need to adjust the value to fit your GPU.
llm = LLM(model="meta-llama/Meta-Llama-3.1-8B-Instruct",
kv_transfer_config=ktc,
max_model_len=2000,
gpu_memory_utilization=0.8)
llm.generate(prompts, sampling_params)
print("Prefill node is finished.")
prefill_done.set()
# To keep the prefill node running in case the decode node is not done;
# otherwise, the script might exit prematurely, causing incomplete decoding.
try:
while True:
time.sleep(1)
except KeyboardInterrupt:
print("Script stopped by user.")
def run_decode(prefill_done):
# We use GPU 1 for decode node.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
prompts = [
"Hello, my name is",
"Hi, your name is",
"Tell me a very long story",
]
sampling_params = SamplingParams(temperature=0, top_p=0.95)
# Using PyNcclConnector to transmit KV caches between vLLM instances.
# This instance is the decode node (kv_consumer, rank 1).
# The number of parallel instances for KV cache transfer is set to 2,
# as required for PyNcclConnector.
ktc = KVTransferConfig.from_cli(
'{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2}'
)
# Set GPU memory utilization to 0.8 for an A6000 GPU with 40GB
# memory. You may need to adjust the value to fit your GPU.
llm = LLM(model="meta-llama/Meta-Llama-3.1-8B-Instruct",
kv_transfer_config=ktc,
max_model_len=2000,
gpu_memory_utilization=0.8)
# Wait for the producer to start the pipe
print("Waiting for prefill node to finish...")
prefill_done.wait()
# At this point when the prefill_done is set, the kv-cache should have been
# transferred to this decode node, so we can start decoding.
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
if __name__ == "__main__":
prefill_done = Event()
prefill_process = Process(target=run_prefill, args=(prefill_done, ))
decode_process = Process(target=run_decode, args=(prefill_done, ))
# Start prefill node
prefill_process.start()
# Start decode node
decode_process.start()
# Terminate the prefill node when decode is finished
decode_process.join()
prefill_process.terminate()
# SPDX-License-Identifier: Apache-2.0
from vllm import LLM
# Sample prompts.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
# Create an LLM.
# You should pass task="embed" for embedding models
model = LLM(
model="intfloat/e5-mistral-7b-instruct",
task="embed",
enforce_eager=True,
)
# Generate embedding. The output is a list of EmbeddingRequestOutputs.
outputs = model.embed(prompts)
# Print the outputs.
for prompt, output in zip(prompts, outputs):
embeds = output.outputs.embedding
embeds_trimmed = ((str(embeds[:16])[:-1] +
", ...]") if len(embeds) > 16 else embeds)
print(f"Prompt: {prompt!r} | "
f"Embeddings: {embeds_trimmed} (size={len(embeds)})")
# SPDX-License-Identifier: Apache-2.0
from huggingface_hub import hf_hub_download
from vllm import LLM, SamplingParams
def run_gguf_inference(model_path, tokenizer):
# Sample prompts.
prompts = [
"How many helicopters can a human eat in one sitting?",
"What's the future of AI?",
]
prompts = [[{"role": "user", "content": prompt}] for prompt in prompts]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0, max_tokens=128)
# Create an LLM.
llm = LLM(model=model_path, tokenizer=tokenizer)
outputs = llm.chat(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
if __name__ == "__main__":
repo_id = "bartowski/Phi-3-medium-4k-instruct-GGUF"
filename = "Phi-3-medium-4k-instruct-IQ2_M.gguf"
tokenizer = "microsoft/Phi-3-medium-4k-instruct"
model = hf_hub_download(repo_id, filename=filename)
run_gguf_inference(model, tokenizer)
...@@ -5,50 +5,49 @@ This is a guide to performing batch inference using the OpenAI batch file format ...@@ -5,50 +5,49 @@ This is a guide to performing batch inference using the OpenAI batch file format
``` ```
## File Format ## File Format
The OpenAI batch file format consists of a series of json objects on new lines. The OpenAI batch file format consists of a series of json objects on new lines.
[See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/openai/openai_example_batch.jsonl) [See here for an example file.](https://github.com/vllm-project/vllm/blob/main/examples/offline_inference/openai/openai_example_batch.jsonl)
Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details. Each line represents a separate request. See the [OpenAI package reference](https://platform.openai.com/docs/api-reference/batch/requestInput) for more details.
```{note} ```{note}
We currently support `/v1/chat/completions`, `/v1/embeddings`, and `/v1/score` endpoints (completions coming soon). We currently support `/v1/chat/completions`, `/v1/embeddings`, and `/v1/score` endpoints (completions coming soon).
``` ```
## Pre-requisites ## Pre-requisites
* The examples in this document use `meta-llama/Meta-Llama-3-8B-Instruct`. * The examples in this document use `meta-llama/Meta-Llama-3-8B-Instruct`.
- Create a [user access token](https://huggingface.co/docs/hub/en/security-tokens) - Create a [user access token](https://huggingface.co/docs/hub/en/security-tokens)
- Install the token on your machine (Run `huggingface-cli login`). - Install the token on your machine (Run `huggingface-cli login`).
- Get access to the gated model by [visiting the model card](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) and agreeing to the terms and conditions. - Get access to the gated model by [visiting the model card](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) and agreeing to the terms and conditions.
## Example 1: Running with a local file ## Example 1: Running with a local file
### Step 1: Create your batch file ### Step 1: Create your batch file
To follow along with this example, you can download the example batch, or create your own batch file in your working directory. To follow along with this example, you can download the example batch, or create your own batch file in your working directory.
``` ```console
wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl
``` ```
Once you've created your batch file it should look like this Once you've created your batch file it should look like this
``` ```console
$ cat offline_inference/openai/openai_example_batch.jsonl $ cat offline_inference/openai/openai_example_batch.jsonl
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
``` ```
### Step 2: Run the batch ### Step 2: Run the batch
The batch running tool is designed to be used from the command line. The batch running tool is designed to be used from the command line.
You can run the batch with the following command, which will write its results to a file called `results.jsonl` You can run the batch with the following command, which will write its results to a file called `results.jsonl`
``` ```console
python -m vllm.entrypoints.openai.run_batch -i offline_inference/openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct python -m vllm.entrypoints.openai.run_batch -i offline_inference/openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
``` ```
...@@ -56,7 +55,7 @@ python -m vllm.entrypoints.openai.run_batch -i offline_inference/openai/openai_e ...@@ -56,7 +55,7 @@ python -m vllm.entrypoints.openai.run_batch -i offline_inference/openai/openai_e
You should now have your results at `results.jsonl`. You can check your results by running `cat results.jsonl` You should now have your results at `results.jsonl`. You can check your results by running `cat results.jsonl`
``` ```console
$ cat results.jsonl $ cat results.jsonl
{"id":"vllm-383d1c59835645aeb2e07d004d62a826","custom_id":"request-1","response":{"id":"cmpl-61c020e54b964d5a98fa7527bfcdd378","object":"chat.completion","created":1715633336,"model":"meta-llama/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"message":{"role":"assistant","content":"Hello! It's great to meet you! I'm here to help with any questions or tasks you may have. What's on your mind today?"},"logprobs":null,"finish_reason":"stop","stop_reason":null}],"usage":{"prompt_tokens":25,"total_tokens":56,"completion_tokens":31}},"error":null} {"id":"vllm-383d1c59835645aeb2e07d004d62a826","custom_id":"request-1","response":{"id":"cmpl-61c020e54b964d5a98fa7527bfcdd378","object":"chat.completion","created":1715633336,"model":"meta-llama/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"message":{"role":"assistant","content":"Hello! It's great to meet you! I'm here to help with any questions or tasks you may have. What's on your mind today?"},"logprobs":null,"finish_reason":"stop","stop_reason":null}],"usage":{"prompt_tokens":25,"total_tokens":56,"completion_tokens":31}},"error":null}
{"id":"vllm-42e3d09b14b04568afa3f1797751a267","custom_id":"request-2","response":{"id":"cmpl-f44d049f6b3a42d4b2d7850bb1e31bcc","object":"chat.completion","created":1715633336,"model":"meta-llama/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"message":{"role":"assistant","content":"*silence*"},"logprobs":null,"finish_reason":"stop","stop_reason":null}],"usage":{"prompt_tokens":27,"total_tokens":32,"completion_tokens":5}},"error":null} {"id":"vllm-42e3d09b14b04568afa3f1797751a267","custom_id":"request-2","response":{"id":"cmpl-f44d049f6b3a42d4b2d7850bb1e31bcc","object":"chat.completion","created":1715633336,"model":"meta-llama/Meta-Llama-3-8B-Instruct","choices":[{"index":0,"message":{"role":"assistant","content":"*silence*"},"logprobs":null,"finish_reason":"stop","stop_reason":null}],"usage":{"prompt_tokens":27,"total_tokens":32,"completion_tokens":5}},"error":null}
...@@ -68,7 +67,7 @@ The batch runner supports remote input and output urls that are accessible via h ...@@ -68,7 +67,7 @@ The batch runner supports remote input and output urls that are accessible via h
For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl`, you can run For example, to run against our example input file located at `https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl`, you can run
``` ```console
python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct python -m vllm.entrypoints.openai.run_batch -i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl -o results.jsonl --model meta-llama/Meta-Llama-3-8B-Instruct
``` ```
...@@ -80,7 +79,7 @@ To integrate with cloud blob storage, we recommend using presigned urls. ...@@ -80,7 +79,7 @@ To integrate with cloud blob storage, we recommend using presigned urls.
### Additional prerequisites ### Additional prerequisites
* [Create an S3 bucket](https://docs.aws.amazon.com/AmazonS3/latest/userguide/creating-bucket.html). * [Create an S3 bucket](https://docs.aws.amazon.com/AmazonS3/latest/userguide/creating-bucket.html).
* The `awscli` package (Run `pip install awscli`) to configure your credentials and interactively use s3. * The `awscli` package (Run `pip install awscli`) to configure your credentials and interactively use s3.
- [Configure your credentials](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-quickstart.html). - [Configure your credentials](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-quickstart.html).
* The `boto3` python package (Run `pip install boto3`) to generate presigned urls. * The `boto3` python package (Run `pip install boto3`) to generate presigned urls.
...@@ -89,13 +88,13 @@ To integrate with cloud blob storage, we recommend using presigned urls. ...@@ -89,13 +88,13 @@ To integrate with cloud blob storage, we recommend using presigned urls.
To follow along with this example, you can download the example batch, or create your own batch file in your working directory. To follow along with this example, you can download the example batch, or create your own batch file in your working directory.
``` ```console
wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl wget https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai/openai_example_batch.jsonl
``` ```
Once you've created your batch file it should look like this Once you've created your batch file it should look like this
``` ```console
$ cat offline_inference/openai/openai_example_batch.jsonl $ cat offline_inference/openai/openai_example_batch.jsonl
{"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} {"custom_id": "request-1", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are a helpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}} {"custom_id": "request-2", "method": "POST", "url": "/v1/chat/completions", "body": {"model": "meta-llama/Meta-Llama-3-8B-Instruct", "messages": [{"role": "system", "content": "You are an unhelpful assistant."},{"role": "user", "content": "Hello world!"}],"max_completion_tokens": 1000}}
...@@ -103,7 +102,7 @@ $ cat offline_inference/openai/openai_example_batch.jsonl ...@@ -103,7 +102,7 @@ $ cat offline_inference/openai/openai_example_batch.jsonl
Now upload your batch file to your S3 bucket. Now upload your batch file to your S3 bucket.
``` ```console
aws s3 cp offline_inference/openai/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl aws s3 cp offline_inference/openai/openai_example_batch.jsonl s3://MY_BUCKET/MY_INPUT_FILE.jsonl
``` ```
...@@ -111,9 +110,9 @@ aws s3 cp offline_inference/openai/openai_example_batch.jsonl s3://MY_BUCKET/MY_ ...@@ -111,9 +110,9 @@ aws s3 cp offline_inference/openai/openai_example_batch.jsonl s3://MY_BUCKET/MY_
Presigned urls can only be generated via the SDK. You can run the following python script to generate your presigned urls. Be sure to replace the `MY_BUCKET`, `MY_INPUT_FILE.jsonl`, and `MY_OUTPUT_FILE.jsonl` placeholders with your bucket and file names. Presigned urls can only be generated via the SDK. You can run the following python script to generate your presigned urls. Be sure to replace the `MY_BUCKET`, `MY_INPUT_FILE.jsonl`, and `MY_OUTPUT_FILE.jsonl` placeholders with your bucket and file names.
(The script is adapted from https://github.com/awsdocs/aws-doc-sdk-examples/blob/main/python/example_code/s3/s3_basics/presigned_url.py) (The script is adapted from <https://github.com/awsdocs/aws-doc-sdk-examples/blob/main/python/example_code/s3/s3_basics/presigned_url.py>)
``` ```python
import boto3 import boto3
from botocore.exceptions import ClientError from botocore.exceptions import ClientError
...@@ -149,7 +148,7 @@ print(f"{output_url=}") ...@@ -149,7 +148,7 @@ print(f"{output_url=}")
This script should output This script should output
``` ```text
input_url='https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_INPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091' input_url='https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_INPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091'
output_url='https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_OUTPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091' output_url='https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_OUTPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091'
``` ```
...@@ -158,7 +157,7 @@ output_url='https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_OUTPUT_FILE.jsonl?AW ...@@ -158,7 +157,7 @@ output_url='https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_OUTPUT_FILE.jsonl?AW
You can now run the batch runner, using the urls generated in the previous section. You can now run the batch runner, using the urls generated in the previous section.
``` ```console
python -m vllm.entrypoints.openai.run_batch \ python -m vllm.entrypoints.openai.run_batch \
-i "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_INPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \ -i "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_INPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \
-o "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_OUTPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \ -o "https://s3.us-west-2.amazonaws.com/MY_BUCKET/MY_OUTPUT_FILE.jsonl?AWSAccessKeyId=ABCDEFGHIJKLMNOPQRST&Signature=abcdefghijklmnopqrstuvwxyz12345&Expires=1715800091" \
...@@ -169,7 +168,7 @@ python -m vllm.entrypoints.openai.run_batch \ ...@@ -169,7 +168,7 @@ python -m vllm.entrypoints.openai.run_batch \
Your results are now on S3. You can view them in your terminal by running Your results are now on S3. You can view them in your terminal by running
``` ```console
aws s3 cp s3://MY_BUCKET/MY_OUTPUT_FILE.jsonl - aws s3 cp s3://MY_BUCKET/MY_OUTPUT_FILE.jsonl -
``` ```
...@@ -180,10 +179,10 @@ aws s3 cp s3://MY_BUCKET/MY_OUTPUT_FILE.jsonl - ...@@ -180,10 +179,10 @@ aws s3 cp s3://MY_BUCKET/MY_OUTPUT_FILE.jsonl -
* Ensure you are using `vllm >= 0.5.5`. * Ensure you are using `vllm >= 0.5.5`.
### Step 1: Create your batch file ### Step 1: Create your batch file
Add embedding requests to your batch file. The following is an example: Add embedding requests to your batch file. The following is an example:
``` ```text
{"custom_id": "request-1", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}} {"custom_id": "request-1", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are a helpful assistant."}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are an unhelpful assistant."}} {"custom_id": "request-2", "method": "POST", "url": "/v1/embeddings", "body": {"model": "intfloat/e5-mistral-7b-instruct", "input": "You are an unhelpful assistant."}}
``` ```
...@@ -198,7 +197,7 @@ You can run the batch using the same command as in earlier examples. ...@@ -198,7 +197,7 @@ You can run the batch using the same command as in earlier examples.
You can check your results by running `cat results.jsonl` You can check your results by running `cat results.jsonl`
``` ```console
$ cat results.jsonl $ cat results.jsonl
{"id":"vllm-db0f71f7dec244e6bce530e0b4ef908b","custom_id":"request-1","response":{"status_code":200,"request_id":"vllm-batch-3580bf4d4ae54d52b67eee266a6eab20","body":{"id":"embd-33ac2efa7996430184461f2e38529746","object":"list","created":444647,"model":"intfloat/e5-mistral-7b-instruct","data":[{"index":0,"object":"embedding","embedding":[0.016204833984375,0.0092010498046875,0.0018358230590820312,-0.0028228759765625,0.001422882080078125,-0.0031147003173828125,...]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0}}},"error":null} {"id":"vllm-db0f71f7dec244e6bce530e0b4ef908b","custom_id":"request-1","response":{"status_code":200,"request_id":"vllm-batch-3580bf4d4ae54d52b67eee266a6eab20","body":{"id":"embd-33ac2efa7996430184461f2e38529746","object":"list","created":444647,"model":"intfloat/e5-mistral-7b-instruct","data":[{"index":0,"object":"embedding","embedding":[0.016204833984375,0.0092010498046875,0.0018358230590820312,-0.0028228759765625,0.001422882080078125,-0.0031147003173828125,...]}],"usage":{"prompt_tokens":8,"total_tokens":8,"completion_tokens":0}}},"error":null}
... ...
...@@ -211,10 +210,10 @@ $ cat results.jsonl ...@@ -211,10 +210,10 @@ $ cat results.jsonl
* Ensure you are using `vllm >= 0.7.0`. * Ensure you are using `vllm >= 0.7.0`.
### Step 1: Create your batch file ### Step 1: Create your batch file
Add score requests to your batch file. The following is an example: Add score requests to your batch file. The following is an example:
``` ```text
{"custom_id": "request-1", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}} {"custom_id": "request-1", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
{"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}} {"custom_id": "request-2", "method": "POST", "url": "/v1/score", "body": {"model": "BAAI/bge-reranker-v2-m3", "text_1": "What is the capital of France?", "text_2": ["The capital of Brazil is Brasilia.", "The capital of France is Paris."]}}
``` ```
...@@ -229,7 +228,7 @@ You can run the batch using the same command as in earlier examples. ...@@ -229,7 +228,7 @@ You can run the batch using the same command as in earlier examples.
You can check your results by running `cat results.jsonl` You can check your results by running `cat results.jsonl`
``` ```console
$ cat results.jsonl $ cat results.jsonl
{"id":"vllm-f87c5c4539184f618e555744a2965987","custom_id":"request-1","response":{"status_code":200,"request_id":"vllm-batch-806ab64512e44071b37d3f7ccd291413","body":{"id":"score-4ee45236897b4d29907d49b01298cdb1","object":"list","created":1737847944,"model":"BAAI/bge-reranker-v2-m3","data":[{"index":0,"object":"score","score":0.0010900497436523438},{"index":1,"object":"score","score":1.0}],"usage":{"prompt_tokens":37,"total_tokens":37,"completion_tokens":0,"prompt_tokens_details":null}}},"error":null} {"id":"vllm-f87c5c4539184f618e555744a2965987","custom_id":"request-1","response":{"status_code":200,"request_id":"vllm-batch-806ab64512e44071b37d3f7ccd291413","body":{"id":"score-4ee45236897b4d29907d49b01298cdb1","object":"list","created":1737847944,"model":"BAAI/bge-reranker-v2-m3","data":[{"index":0,"object":"score","score":0.0010900497436523438},{"index":1,"object":"score","score":1.0}],"usage":{"prompt_tokens":37,"total_tokens":37,"completion_tokens":0,"prompt_tokens_details":null}}},"error":null}
{"id":"vllm-41990c51a26d4fac8419077f12871099","custom_id":"request-2","response":{"status_code":200,"request_id":"vllm-batch-73ce66379026482699f81974e14e1e99","body":{"id":"score-13f2ffe6ba40460fbf9f7f00ad667d75","object":"list","created":1737847944,"model":"BAAI/bge-reranker-v2-m3","data":[{"index":0,"object":"score","score":0.001094818115234375},{"index":1,"object":"score","score":1.0}],"usage":{"prompt_tokens":37,"total_tokens":37,"completion_tokens":0,"prompt_tokens_details":null}}},"error":null} {"id":"vllm-41990c51a26d4fac8419077f12871099","custom_id":"request-2","response":{"status_code":200,"request_id":"vllm-batch-73ce66379026482699f81974e14e1e99","body":{"id":"score-13f2ffe6ba40460fbf9f7f00ad667d75","object":"list","created":1737847944,"model":"BAAI/bge-reranker-v2-m3","data":[{"index":0,"object":"score","score":0.001094818115234375},{"index":1,"object":"score","score":1.0}],"usage":{"prompt_tokens":37,"total_tokens":37,"completion_tokens":0,"prompt_tokens_details":null}}},"error":null}
......
# SPDX-License-Identifier: Apache-2.0
"""
This is a demo script showing how to use the
PrithviGeospatialMAE model with vLLM
This script is based on: https://huggingface.co/ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11/blob/main/inference.py # noqa
Target model weights: https://huggingface.co/ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11/resolve/main/Prithvi-EO-V2-300M-TL-Sen1Floods11.pt # noqa
The requirements for running this script are:
- Installing [terratorch, albumentations, rasterio] in your python environment
- downloading the model weights in a 'model' folder local to the script
(temporary measure until the proper config.json file is uploaded to HF)
- download an input example image (India_900498_S2Hand.tif) and place it in
the same folder with the script (or specify with the --data_file argument)
Run the example:
python prithvi_geospatial_mae.py
""" # noqa: E501
import argparse
import datetime
import os
import re
from typing import List, Union
import albumentations
import numpy as np
import rasterio
import torch
from einops import rearrange
from terratorch.datamodules import Sen1Floods11NonGeoDataModule
from vllm import LLM
NO_DATA = -9999
NO_DATA_FLOAT = 0.0001
OFFSET = 0
PERCENTILE = 99
model_config = """{
"architectures": ["PrithviGeoSpatialMAE"],
"num_classes": 0,
"pretrained_cfg": {
"task_args": {
"task": "SemanticSegmentationTask",
"model_factory": "EncoderDecoderFactory",
"loss": "ce",
"ignore_index": -1,
"lr": 0.001,
"freeze_backbone": false,
"freeze_decoder": false,
"plot_on_val": 10,
"optimizer": "AdamW",
"scheduler": "CosineAnnealingLR"
},
"model_args": {
"backbone_pretrained": false,
"backbone": "prithvi_eo_v2_300_tl",
"decoder": "UperNetDecoder",
"decoder_channels": 256,
"decoder_scale_modules": true,
"num_classes": 2,
"rescale": true,
"backbone_bands": [
"BLUE",
"GREEN",
"RED",
"NIR_NARROW",
"SWIR_1",
"SWIR_2"
],
"head_dropout": 0.1,
"necks": [
{
"name": "SelectIndices",
"indices": [
5,
11,
17,
23
]
},
{
"name": "ReshapeTokensToImage"
}
]
},
"optimizer_params" : {
"lr": 5.0e-05,
"betas": [0.9, 0.999],
"eps": [1.0e-08],
"weight_decay": 0.05,
"amsgrad": false,
"maximize": false,
"capturable": false,
"differentiable": false
},
"scheduler_params" : {
"T_max": 50,
"eta_min": 0,
"last_epoch": -1,
"verbose": "deprecated"
}
},
"torch_dtype": "float32"
}
"""
# Temporarily creating the "config.json" for the model.
# This is going to disappear once the correct config.json is available on HF
with open(os.path.join(os.path.dirname(__file__), "./model/config.json"),
'w') as config_file:
config_file.write(model_config)
datamodule_config = {
'bands': ['BLUE', 'GREEN', 'RED', 'NIR_NARROW', 'SWIR_1', 'SWIR_2'],
'batch_size':
16,
'constant_scale':
0.0001,
'data_root':
'/dccstor/geofm-finetuning/datasets/sen1floods11',
'drop_last':
True,
'no_data_replace':
0.0,
'no_label_replace':
-1,
'num_workers':
8,
'test_transform': [
albumentations.Resize(always_apply=False,
height=448,
interpolation=1,
p=1,
width=448),
albumentations.pytorch.ToTensorV2(transpose_mask=False,
always_apply=True,
p=1.0)
],
}
class PrithviMAE:
def __init__(self):
print("Initializing PrithviMAE model")
self.model = LLM(model=os.path.join(os.path.dirname(__file__),
"./model"),
skip_tokenizer_init=True,
dtype="float32")
def run(self, input_data, location_coords):
print("################ Running inference on vLLM ##############")
# merge the inputs into one data structure
mm_data = {
"pixel_values":
torch.empty(0) if input_data is None else input_data,
"location_coords":
torch.empty(0) if location_coords is None else location_coords
}
prompt = {"prompt_token_ids": [1], "multi_modal_data": mm_data}
outputs = self.model.encode(prompt, use_tqdm=False)
print(
"################ Inference done (it took seconds) ##############"
)
return outputs[0].outputs.data
def generate_datamodule():
datamodule = Sen1Floods11NonGeoDataModule(
data_root=datamodule_config['data_root'],
batch_size=datamodule_config["batch_size"],
num_workers=datamodule_config["num_workers"],
bands=datamodule_config["bands"],
drop_last=datamodule_config["drop_last"],
test_transform=datamodule_config["test_transform"
""])
return datamodule
def process_channel_group(orig_img, channels):
"""
Args:
orig_img: torch.Tensor representing original image (reference)
with shape = (bands, H, W).
channels: list of indices representing RGB channels.
Returns:
torch.Tensor with shape (num_channels, height, width) for original image
"""
orig_img = orig_img[channels, ...]
valid_mask = torch.ones_like(orig_img, dtype=torch.bool)
valid_mask[orig_img == NO_DATA_FLOAT] = False
# Rescale (enhancing contrast)
max_value = max(3000, np.percentile(orig_img[valid_mask], PERCENTILE))
min_value = OFFSET
orig_img = torch.clamp((orig_img - min_value) / (max_value - min_value), 0,
1)
# No data as zeros
orig_img[~valid_mask] = 0
return orig_img
def read_geotiff(file_path: str):
"""Read all bands from *file_path* and return image + meta info.
Args:
file_path: path to image file.
Returns:
np.ndarray with shape (bands, height, width)
meta info dict
"""
with rasterio.open(file_path) as src:
img = src.read()
meta = src.meta
try:
coords = src.lnglat()
except Exception:
# Cannot read coords
coords = None
return img, meta, coords
def save_geotiff(image, output_path: str, meta: dict):
"""Save multi-band image in Geotiff file.
Args:
image: np.ndarray with shape (bands, height, width)
output_path: path where to save the image
meta: dict with meta info.
"""
with rasterio.open(output_path, "w", **meta) as dest:
for i in range(image.shape[0]):
dest.write(image[i, :, :], i + 1)
return
def _convert_np_uint8(float_image: torch.Tensor):
image = float_image.numpy() * 255.0
image = image.astype(dtype=np.uint8)
return image
def load_example(
file_paths: List[str],
mean: List[float] = None,
std: List[float] = None,
indices: Union[list[int], None] = None,
):
"""Build an input example by loading images in *file_paths*.
Args:
file_paths: list of file paths .
mean: list containing mean values for each band in the images
in *file_paths*.
std: list containing std values for each band in the images
in *file_paths*.
Returns:
np.array containing created example
list of meta info for each image in *file_paths*
"""
imgs = []
metas = []
temporal_coords = []
location_coords = []
for file in file_paths:
img, meta, coords = read_geotiff(file)
# Rescaling (don't normalize on nodata)
img = np.moveaxis(img, 0, -1) # channels last for rescaling
if indices is not None:
img = img[..., indices]
if mean is not None and std is not None:
img = np.where(img == NO_DATA, NO_DATA_FLOAT, (img - mean) / std)
imgs.append(img)
metas.append(meta)
if coords is not None:
location_coords.append(coords)
try:
match = re.search(r'(\d{7,8}T\d{6})', file)
if match:
year = int(match.group(1)[:4])
julian_day = match.group(1).split('T')[0][4:]
if len(julian_day) == 3:
julian_day = int(julian_day)
else:
julian_day = datetime.datetime.strptime(
julian_day, '%m%d').timetuple().tm_yday
temporal_coords.append([year, julian_day])
except Exception as e:
print(f'Could not extract timestamp for {file} ({e})')
imgs = np.stack(imgs, axis=0) # num_frames, H, W, C
imgs = np.moveaxis(imgs, -1, 0).astype("float32")
imgs = np.expand_dims(imgs, axis=0) # add batch di
return imgs, temporal_coords, location_coords, metas
def run_model(input_data,
temporal_coords,
location_coords,
model,
datamodule,
img_size,
lightning_model=None):
# Reflect pad if not divisible by img_size
original_h, original_w = input_data.shape[-2:]
pad_h = (img_size - (original_h % img_size)) % img_size
pad_w = (img_size - (original_w % img_size)) % img_size
input_data = np.pad(input_data,
((0, 0), (0, 0), (0, 0), (0, pad_h), (0, pad_w)),
mode="reflect")
# Build sliding window
batch_size = 1
batch = torch.tensor(input_data, device="cpu")
windows = (batch.unfold(3, img_size,
img_size).unfold(4, img_size, img_size))
h1, w1 = windows.shape[3:5]
windows = rearrange(windows,
"b c t h1 w1 h w -> (b h1 w1) c t h w",
h=img_size,
w=img_size)
# Split into batches if number of windows > batch_size
num_batches = windows.shape[0] // batch_size if windows.shape[
0] > batch_size else 1
windows = torch.tensor_split(windows, num_batches, dim=0)
if torch.cuda.is_available():
device = torch.device('cuda')
else:
device = torch.device('cpu')
if temporal_coords:
temporal_coords = torch.tensor(temporal_coords,
device=device).unsqueeze(0)
else:
temporal_coords = None
if location_coords:
location_coords = torch.tensor(location_coords[0],
device=device).unsqueeze(0)
else:
location_coords = None
# Run model
pred_imgs = []
for x in windows:
# Apply standardization
x = datamodule.test_transform(
image=x.squeeze().numpy().transpose(1, 2, 0))
x = datamodule.aug(x)['image']
with torch.no_grad():
x = x.to(device)
pred = model.run(x, location_coords=location_coords)
if lightning_model:
pred_lightning = lightning_model(
x,
temporal_coords=temporal_coords,
location_coords=location_coords)
pred_lightning = pred_lightning.output.detach().cpu()
if not torch.equal(pred, pred_lightning):
print("Inference output is not equal")
y_hat = pred.argmax(dim=1)
y_hat = torch.nn.functional.interpolate(y_hat.unsqueeze(1).float(),
size=img_size,
mode="nearest")
pred_imgs.append(y_hat)
pred_imgs = torch.concat(pred_imgs, dim=0)
# Build images from patches
pred_imgs = rearrange(
pred_imgs,
"(b h1 w1) c h w -> b c (h1 h) (w1 w)",
h=img_size,
w=img_size,
b=1,
c=1,
h1=h1,
w1=w1,
)
# Cut padded area back to original size
pred_imgs = pred_imgs[..., :original_h, :original_w]
# Squeeze (batch size 1)
pred_imgs = pred_imgs[0]
return pred_imgs
def main(
data_file: str,
output_dir: str,
rgb_outputs: bool,
input_indices: list[int] = None,
):
os.makedirs(output_dir, exist_ok=True)
# Load model ---------------------------------------------------------------
model_obj = PrithviMAE()
datamodule = generate_datamodule()
img_size = 256 # Size of Sen1Floods11
# Loading data -------------------------------------------------------------
input_data, temporal_coords, location_coords, meta_data = load_example(
file_paths=[data_file],
indices=input_indices,
)
meta_data = meta_data[0] # only one image
if input_data.mean() > 1:
input_data = input_data / 10000 # Convert to range 0-1
# Running model ------------------------------------------------------------
channels = [
datamodule_config['bands'].index(b) for b in ["RED", "GREEN", "BLUE"]
] # BGR -> RGB
pred = run_model(input_data, temporal_coords, location_coords, model_obj,
datamodule, img_size)
# Save pred
meta_data.update(count=1, dtype="uint8", compress="lzw", nodata=0)
pred_file = os.path.join(
output_dir,
f"pred_{os.path.splitext(os.path.basename(data_file))[0]}.tiff")
save_geotiff(_convert_np_uint8(pred), pred_file, meta_data)
# Save image + pred
meta_data.update(count=3, dtype="uint8", compress="lzw", nodata=0)
if input_data.mean() < 1:
input_data = input_data * 10000 # Scale to 0-10000
rgb_orig = process_channel_group(
orig_img=torch.Tensor(input_data[0, :, 0, ...]),
channels=channels,
)
pred[pred == 0.] = np.nan
img_pred = rgb_orig * 0.7 + pred * 0.3
img_pred[img_pred.isnan()] = rgb_orig[img_pred.isnan()]
img_pred_file = os.path.join(
output_dir,
f"rgb_pred_{os.path.splitext(os.path.basename(data_file))[0]}.tiff")
save_geotiff(
image=_convert_np_uint8(img_pred),
output_path=img_pred_file,
meta=meta_data,
)
# Save image rgb
if rgb_outputs:
rgb_file = os.path.join(
output_dir, "original_rgb_"
f"{os.path.splitext(os.path.basename(data_file))[0]}.tiff")
save_geotiff(
image=_convert_np_uint8(rgb_orig),
output_path=rgb_file,
meta=meta_data,
)
if __name__ == "__main__":
parser = argparse.ArgumentParser("MAE run inference", add_help=False)
parser.add_argument(
"--data_file",
type=str,
default="./India_900498_S2Hand.tif",
help="Path to the file.",
)
parser.add_argument(
"--output_dir",
type=str,
default="output",
help="Path to the directory where to save outputs.",
)
parser.add_argument(
"--input_indices",
default=[1, 2, 3, 8, 11, 12],
type=int,
nargs="+",
help=
"0-based indices of the six Prithvi channels to be selected from the "
"input. By default selects [1,2,3,8,11,12] for S2L1C data.",
)
parser.add_argument(
"--rgb_outputs",
action="store_true",
help="If present, output files will only contain RGB channels. "
"Otherwise, all bands will be saved.",
)
args = parser.parse_args()
main(**vars(args))
...@@ -29,7 +29,6 @@ python3 profiling.py \ ...@@ -29,7 +29,6 @@ python3 profiling.py \
--profile-result-dir profiles --profile-result-dir profiles
``` ```
### Generate Decode Trace ### Generate Decode Trace
This example runs Llama 3.1 70B with a batch of 32 requests where each has 1 input token and 128 output tokens. This is set up in attempt to profile just the 32 decodes running in parallel by having an extremely small prefill of 1 token and setting `VLLM_TPU_PROFILE_DELAY_MS=1000` to skip the first second of inference (hopefully prefill). This example runs Llama 3.1 70B with a batch of 32 requests where each has 1 input token and 128 output tokens. This is set up in attempt to profile just the 32 decodes running in parallel by having an extremely small prefill of 1 token and setting `VLLM_TPU_PROFILE_DELAY_MS=1000` to skip the first second of inference (hopefully prefill).
...@@ -51,17 +50,18 @@ python3 profiling.py \ ...@@ -51,17 +50,18 @@ python3 profiling.py \
--max-model-len 2048 --tensor-parallel-size 8 --max-model-len 2048 --tensor-parallel-size 8
``` ```
## Visualizing the profiles ## Visualizing the profiles
Once you have collected your profiles with this script, you can visualize them using [TensorBoard](https://cloud.google.com/tpu/docs/pytorch-xla-performance-profiling-tpu-vm). Once you have collected your profiles with this script, you can visualize them using [TensorBoard](https://cloud.google.com/tpu/docs/pytorch-xla-performance-profiling-tpu-vm).
Here are most likely the dependencies you need to install: Here are most likely the dependencies you need to install:
```bash ```bash
pip install tensorflow-cpu tensorboard-plugin-profile etils importlib_resources pip install tensorflow-cpu tensorboard-plugin-profile etils importlib_resources
``` ```
Then you just need to point TensorBoard to the directory where you saved the profiles and visit `http://localhost:6006/` in your browser: Then you just need to point TensorBoard to the directory where you saved the profiles and visit `http://localhost:6006/` in your browser:
```bash ```bash
tensorboard --logdir profiles/ --port 6006 tensorboard --logdir profiles/ --port 6006
``` ```
\ No newline at end of file
...@@ -24,7 +24,7 @@ def main(args: argparse.Namespace): ...@@ -24,7 +24,7 @@ def main(args: argparse.Namespace):
engine_args = EngineArgs.from_cli_args(args) engine_args = EngineArgs.from_cli_args(args)
llm = LLM(**dataclasses.asdict(engine_args)) llm = LLM(**dataclasses.asdict(engine_args))
_ = xp.start_server(9012) server = xp.start_server(9012) # noqa: F841
sampling_params = SamplingParams( sampling_params = SamplingParams(
temperature=0.0, temperature=0.0,
......
...@@ -92,7 +92,7 @@ class MyLLM(LLM): ...@@ -92,7 +92,7 @@ class MyLLM(LLM):
# a hack to make the script work. # a hack to make the script work.
# stop ray from manipulating CUDA_VISIBLE_DEVICES # stop ray from manipulating CUDA_VISIBLE_DEVICES
# at the top-level # at the top-level
del os.environ["CUDA_VISIBLE_DEVICES"] os.environ.pop("CUDA_VISIBLE_DEVICES", None)
super().__init__(*args, **kwargs) super().__init__(*args, **kwargs)
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
""" """
a simple demonstration to show how to control a simple demonstration to show how to co-locate
the placement of the vLLM workers with Ray. vLLM worker with training actors on the same GPUs,
The key is to set VLLM_RAY_PER_WORKER_GPUS and for RLHF-like applications.
VLLM_RAY_BUNDLE_INDICES properly. The key points:
- Control the placement of the vLLM workers with Ray, by setting
VLLM_RAY_PER_WORKER_GPUS and VLLM_RAY_BUNDLE_INDICES properly.
- Use cuda-ipc to pass tensors, since NCCL does not work when we have
multiple processes on the same GPU.
""" """
import os import os
import ray import ray
import torch
from ray.util.placement_group import placement_group from ray.util.placement_group import placement_group
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
...@@ -19,7 +24,33 @@ class MyWorker(Worker): ...@@ -19,7 +24,33 @@ class MyWorker(Worker):
def report_device_id(self) -> str: def report_device_id(self) -> str:
from vllm.platforms import current_platform from vllm.platforms import current_platform
return current_platform.get_device_uuid(self.device.index) self.device_uuid = current_platform.get_device_uuid(self.device.index)
return self.device_uuid
def update_weights_from_ipc_handles(self, ipc_handles):
handles = ipc_handles[self.device_uuid]
device_id = self.device.index
weights = []
for name, handle in handles.items():
func, args = handle
list_args = list(args)
# the key is to change device id to the current device id
# in case two processes have different CUDA_VISIBLE_DEVICES
list_args[6] = device_id
tensor = func(*list_args)
weights.append((name, tensor))
self.model_runner.model.load_weights(weights=weights)
torch.cuda.synchronize()
def check_weights_changed(self):
"""
Check if the weights are updated to 0.
"""
weights_updated = True
for name, p in self.model_runner.model.named_parameters():
weights_updated = weights_updated and torch.allclose(
p, torch.zeros_like(p))
return weights_updated
class MyLLM(LLM): class MyLLM(LLM):
...@@ -28,7 +59,7 @@ class MyLLM(LLM): ...@@ -28,7 +59,7 @@ class MyLLM(LLM):
# a hack to make the script work. # a hack to make the script work.
# stop ray from manipulating CUDA_VISIBLE_DEVICES # stop ray from manipulating CUDA_VISIBLE_DEVICES
# at the top-level # at the top-level
del os.environ["CUDA_VISIBLE_DEVICES"] os.environ.pop("CUDA_VISIBLE_DEVICES", None)
# every worker will use 0.4 GPU, so that we can schedule # every worker will use 0.4 GPU, so that we can schedule
# 2 instances on the same GPUs. # 2 instances on the same GPUs.
os.environ["VLLM_RAY_PER_WORKER_GPUS"] = "0.4" os.environ["VLLM_RAY_PER_WORKER_GPUS"] = "0.4"
...@@ -40,12 +71,32 @@ class MyLLM(LLM): ...@@ -40,12 +71,32 @@ class MyLLM(LLM):
class RayTrainingActor: class RayTrainingActor:
def report_device_id(self) -> str: def __init__(self):
# ray will set CUDA_VISIBLE_DEVICES to the assigned GPUs
from transformers import AutoModelForCausalLM
self.model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
self.model.to("cuda:0")
for name, p in self.model.named_parameters():
p.data.zero_()
torch.cuda.synchronize()
# the argument for get_device_uuid is the index # the argument for get_device_uuid is the index
# of the GPU in the visible devices. # of the GPU in the visible devices.
# ray will set CUDA_VISIBLE_DEVICES to the assigned GPUs
from vllm.platforms import current_platform from vllm.platforms import current_platform
return current_platform.get_device_uuid(0) self.device_uuid = current_platform.get_device_uuid(0)
def report_device_id(self) -> str:
return self.device_uuid
def get_weight_ipc_handles(self):
from torch.multiprocessing.reductions import reduce_tensor
data = {}
for name, p in self.model.named_parameters():
# the training actor might only have a subset of the weights
# and need to all-gather the weights from all the actors.
# for demonstration, here we assume all training actors have
# the full weights.
data[name] = reduce_tensor(p.detach())
return {self.device_uuid: data}
# ray manages 4 GPUs # ray manages 4 GPUs
...@@ -78,6 +129,8 @@ for bundle_index in [0, 1, 2, 3]: ...@@ -78,6 +129,8 @@ for bundle_index in [0, 1, 2, 3]:
), ),
)(RayTrainingActor).remote() )(RayTrainingActor).remote()
training_actors.append(training_actor) training_actors.append(training_actor)
for bundle_index, training_actor in enumerate(training_actors):
device_id = ray.get(training_actor.report_device_id.remote()) device_id = ray.get(training_actor.report_device_id.remote())
print(f"training actor {bundle_index} is on {device_id}") print(f"training actor {bundle_index} is on {device_id}")
training_actor_device_ids.append(device_id) training_actor_device_ids.append(device_id)
...@@ -119,3 +172,18 @@ assert training_actor_device_ids[:2] == inference_engine_device_ids[0] ...@@ -119,3 +172,18 @@ assert training_actor_device_ids[:2] == inference_engine_device_ids[0]
# the last two training actors should be # the last two training actors should be
# on the same GPUs as the second inference engine # on the same GPUs as the second inference engine
assert training_actor_device_ids[2:] == inference_engine_device_ids[1] assert training_actor_device_ids[2:] == inference_engine_device_ids[1]
print("gather all the IPC handles from the training actors")
ipc_handles = {}
for actor in training_actors:
ipc_handles.update(ray.get(actor.get_weight_ipc_handles.remote()))
print("update the weights of the inference engines")
for llm in inference_engines:
ray.get(
llm.collective_rpc.remote("update_weights_from_ipc_handles",
args=(ipc_handles, )))
print("check if the weights are updated")
for llm in inference_engines:
assert ray.get(
llm.collective_rpc.remote("check_weights_changed", args=tuple()))
# SPDX-License-Identifier: Apache-2.0
from vllm import LLM
# Sample prompts.
text_1 = "What is the capital of France?"
texts_2 = [
"The capital of Brazil is Brasilia.", "The capital of France is Paris."
]
# Create an LLM.
# You should pass task="score" for cross-encoder models
model = LLM(
model="BAAI/bge-reranker-v2-m3",
task="score",
enforce_eager=True,
)
# Generate scores. The output is a list of ScoringRequestOutputs.
outputs = model.score(text_1, texts_2)
# Print the outputs.
for text_2, output in zip(texts_2, outputs):
score = output.outputs.score
print(f"Pair: {[text_1, text_2]!r} | Score: {score}")
...@@ -105,8 +105,12 @@ def run_glm4v(question: str, modality: str): ...@@ -105,8 +105,12 @@ def run_glm4v(question: str, modality: str):
max_num_seqs=2, max_num_seqs=2,
trust_remote_code=True, trust_remote_code=True,
enforce_eager=True, enforce_eager=True,
hf_overrides={"architectures": ["GLM4VForCausalLM"]},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
prompt = question
prompt = f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\
{question}<|assistant|>"
stop_token_ids = [151329, 151336, 151338] stop_token_ids = [151329, 151336, 151338]
return llm, prompt, stop_token_ids return llm, prompt, stop_token_ids
...@@ -115,7 +119,7 @@ def run_glm4v(question: str, modality: str): ...@@ -115,7 +119,7 @@ def run_glm4v(question: str, modality: str):
def run_h2ovl(question: str, modality: str): def run_h2ovl(question: str, modality: str):
assert modality == "image" assert modality == "image"
model_name = "h2oai/h2ovl-mississippi-2b" model_name = "h2oai/h2ovl-mississippi-800m"
llm = LLM( llm = LLM(
model=model_name, model=model_name,
...@@ -132,7 +136,7 @@ def run_h2ovl(question: str, modality: str): ...@@ -132,7 +136,7 @@ def run_h2ovl(question: str, modality: str):
add_generation_prompt=True) add_generation_prompt=True)
# Stop tokens for H2OVL-Mississippi # Stop tokens for H2OVL-Mississippi
# https://huggingface.co/h2oai/h2ovl-mississippi-2b # https://huggingface.co/h2oai/h2ovl-mississippi-800m
stop_token_ids = [tokenizer.eos_token_id] stop_token_ids = [tokenizer.eos_token_id]
return llm, prompt, stop_token_ids return llm, prompt, stop_token_ids
...@@ -493,6 +497,7 @@ def run_qwen_vl(question: str, modality: str): ...@@ -493,6 +497,7 @@ def run_qwen_vl(question: str, modality: str):
trust_remote_code=True, trust_remote_code=True,
max_model_len=1024, max_model_len=1024,
max_num_seqs=2, max_num_seqs=2,
hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
) )
......
...@@ -77,8 +77,8 @@ def load_deepseek_vl2(question: str, image_urls: List[str]): ...@@ -77,8 +77,8 @@ def load_deepseek_vl2(question: str, image_urls: List[str]):
) )
def load_h2onvl(question: str, image_urls: List[str]) -> ModelRequestData: def load_h2ovl(question: str, image_urls: List[str]) -> ModelRequestData:
model_name = "h2oai/h2ovl-mississippi-2b" model_name = "h2oai/h2ovl-mississippi-800m"
llm = LLM( llm = LLM(
model=model_name, model=model_name,
...@@ -99,7 +99,7 @@ def load_h2onvl(question: str, image_urls: List[str]) -> ModelRequestData: ...@@ -99,7 +99,7 @@ def load_h2onvl(question: str, image_urls: List[str]) -> ModelRequestData:
add_generation_prompt=True) add_generation_prompt=True)
# Stop tokens for H2OVL-Mississippi # Stop tokens for H2OVL-Mississippi
# https://huggingface.co/h2oai/h2ovl-mississippi-2b # https://huggingface.co/h2oai/h2ovl-mississippi-800m
stop_token_ids = [tokenizer.eos_token_id] stop_token_ids = [tokenizer.eos_token_id]
return ModelRequestData( return ModelRequestData(
...@@ -302,6 +302,7 @@ def load_qwen_vl_chat(question: str, ...@@ -302,6 +302,7 @@ def load_qwen_vl_chat(question: str,
trust_remote_code=True, trust_remote_code=True,
max_model_len=1024, max_model_len=1024,
max_num_seqs=2, max_num_seqs=2,
hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
limit_mm_per_prompt={"image": len(image_urls)}, limit_mm_per_prompt={"image": len(image_urls)},
) )
placeholders = "".join(f"Picture {i}: <img></img>\n" placeholders = "".join(f"Picture {i}: <img></img>\n"
...@@ -452,7 +453,7 @@ def load_qwen2_5_vl(question, image_urls: List[str]) -> ModelRequestData: ...@@ -452,7 +453,7 @@ def load_qwen2_5_vl(question, image_urls: List[str]) -> ModelRequestData:
model_example_map = { model_example_map = {
"aria": load_aria, "aria": load_aria,
"deepseek_vl_v2": load_deepseek_vl2, "deepseek_vl_v2": load_deepseek_vl2,
"h2ovl_chat": load_h2onvl, "h2ovl_chat": load_h2ovl,
"idefics3": load_idefics3, "idefics3": load_idefics3,
"internvl_chat": load_internvl, "internvl_chat": load_internvl,
"mllama": load_mllama, "mllama": load_mllama,
......
...@@ -18,4 +18,4 @@ This directory contains a Helm chart for deploying the vllm application. The cha ...@@ -18,4 +18,4 @@ This directory contains a Helm chart for deploying the vllm application. The cha
- templates/poddisruptionbudget.yaml: Template for Pod Disruption Budget. - templates/poddisruptionbudget.yaml: Template for Pod Disruption Budget.
- templates/pvc.yaml: Template for Persistent Volume Claims. - templates/pvc.yaml: Template for Persistent Volume Claims.
- templates/secrets.yaml: Template for Kubernetes Secrets. - templates/secrets.yaml: Template for Kubernetes Secrets.
- templates/service.yaml: Template for creating Services. - templates/service.yaml: Template for creating Services.
\ No newline at end of file
...@@ -12,7 +12,7 @@ vllm serve microsoft/Phi-3.5-vision-instruct --task generate \ ...@@ -12,7 +12,7 @@ vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
--trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2 --trust-remote-code --max-model-len 4096 --limit-mm-per-prompt image=2
(audio inference with Ultravox) (audio inference with Ultravox)
vllm serve fixie-ai/ultravox-v0_3 --max-model-len 4096 vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b --max-model-len 4096
""" """
import base64 import base64
......
...@@ -36,8 +36,8 @@ response = client.chat.completions.create(model=model, messages=messages) ...@@ -36,8 +36,8 @@ response = client.chat.completions.create(model=model, messages=messages)
reasoning_content = response.choices[0].message.reasoning_content reasoning_content = response.choices[0].message.reasoning_content
content = response.choices[0].message.content content = response.choices[0].message.content
print("reasoning_content:", reasoning_content) print("reasoning_content for Round 1:", reasoning_content)
print("content:", content) print("content for Round 1:", content)
# Round 2 # Round 2
messages.append({"role": "assistant", "content": content}) messages.append({"role": "assistant", "content": content})
...@@ -50,5 +50,5 @@ response = client.chat.completions.create(model=model, messages=messages) ...@@ -50,5 +50,5 @@ response = client.chat.completions.create(model=model, messages=messages)
reasoning_content = response.choices[0].message.reasoning_content reasoning_content = response.choices[0].message.reasoning_content
content = response.choices[0].message.content content = response.choices[0].message.content
print("reasoning_content:", reasoning_content) print("reasoning_content for Round 2:", reasoning_content)
print("content:", content) print("content for Round 2:", content)
...@@ -44,7 +44,7 @@ def vlm2vec(): ...@@ -44,7 +44,7 @@ def vlm2vec():
def dse_qwen2_vl(inp: dict): def dse_qwen2_vl(inp: dict):
# Embedding an Image # Embedding an Image
if inp["dtype"] == "image": if inp["type"] == "image":
messages = [{ messages = [{
"role": "role":
"user", "user",
...@@ -113,10 +113,10 @@ if __name__ == '__main__': ...@@ -113,10 +113,10 @@ if __name__ == '__main__':
vlm2vec() vlm2vec()
elif args.model == "dse_qwen2_vl": elif args.model == "dse_qwen2_vl":
dse_qwen2_vl({ dse_qwen2_vl({
"dtye": "image", "type": "image",
"image_url": image_url, "image_url": image_url,
}) })
dse_qwen2_vl({ dse_qwen2_vl({
"dtype": "text", "type": "text",
"content": "What is the weather like today?", "content": "What is the weather like today?",
}) })
# SPDX-License-Identifier: Apache-2.0
from openai import OpenAI
from vllm.assets.audio import AudioAsset
mary_had_lamb = AudioAsset('mary_had_lamb').get_local_path()
winning_call = AudioAsset('winning_call').get_local_path()
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
client = OpenAI(
api_key=openai_api_key,
base_url=openai_api_base,
)
with open(str(mary_had_lamb), "rb") as f:
transcription = client.audio.transcriptions.create(
file=f,
model="openai/whisper-large-v3",
language="en",
response_format="text",
temperature=0.0)
print("transcription result:", transcription)
# Setup OpenTelemetry POC # Setup OpenTelemetry POC
1. Install OpenTelemetry packages: 1. Install OpenTelemetry packages:
```
```console
pip install \ pip install \
'opentelemetry-sdk>=1.26.0,<1.27.0' \ 'opentelemetry-sdk>=1.26.0,<1.27.0' \
'opentelemetry-api>=1.26.0,<1.27.0' \ 'opentelemetry-api>=1.26.0,<1.27.0' \
...@@ -10,7 +11,8 @@ ...@@ -10,7 +11,8 @@
``` ```
1. Start Jaeger in a docker container: 1. Start Jaeger in a docker container:
```
```console
# From: https://www.jaegertracing.io/docs/1.57/getting-started/ # From: https://www.jaegertracing.io/docs/1.57/getting-started/
docker run --rm --name jaeger \ docker run --rm --name jaeger \
-e COLLECTOR_ZIPKIN_HOST_PORT=:9411 \ -e COLLECTOR_ZIPKIN_HOST_PORT=:9411 \
...@@ -28,19 +30,23 @@ ...@@ -28,19 +30,23 @@
``` ```
1. In a new shell, export Jaeger IP: 1. In a new shell, export Jaeger IP:
```
```console
export JAEGER_IP=$(docker inspect --format '{{ .NetworkSettings.IPAddress }}' jaeger) export JAEGER_IP=$(docker inspect --format '{{ .NetworkSettings.IPAddress }}' jaeger)
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317 export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
``` ```
Then set vLLM's service name for OpenTelemetry, enable insecure connections to Jaeger and run vLLM: Then set vLLM's service name for OpenTelemetry, enable insecure connections to Jaeger and run vLLM:
```
```console
export OTEL_SERVICE_NAME="vllm-server" export OTEL_SERVICE_NAME="vllm-server"
export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
vllm serve facebook/opt-125m --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT" vllm serve facebook/opt-125m --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"
``` ```
1. In a new shell, send requests with trace context from a dummy client 1. In a new shell, send requests with trace context from a dummy client
```
```console
export JAEGER_IP=$(docker inspect --format '{{ .NetworkSettings.IPAddress }}' jaeger) export JAEGER_IP=$(docker inspect --format '{{ .NetworkSettings.IPAddress }}' jaeger)
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317 export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=grpc://$JAEGER_IP:4317
export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true export OTEL_EXPORTER_OTLP_TRACES_INSECURE=true
...@@ -48,7 +54,7 @@ ...@@ -48,7 +54,7 @@
python dummy_client.py python dummy_client.py
``` ```
1. Open Jaeger webui: http://localhost:16686/ 1. Open Jaeger webui: <http://localhost:16686/>
In the search pane, select `vllm-server` service and hit `Find Traces`. You should get a list of traces, one for each request. In the search pane, select `vllm-server` service and hit `Find Traces`. You should get a list of traces, one for each request.
![Traces](https://i.imgur.com/GYHhFjo.png) ![Traces](https://i.imgur.com/GYHhFjo.png)
...@@ -57,26 +63,32 @@ ...@@ -57,26 +63,32 @@
![Spans details](https://i.imgur.com/OPf6CBL.png) ![Spans details](https://i.imgur.com/OPf6CBL.png)
## Exporter Protocol ## Exporter Protocol
OpenTelemetry supports either `grpc` or `http/protobuf` as the transport protocol for trace data in the exporter. OpenTelemetry supports either `grpc` or `http/protobuf` as the transport protocol for trace data in the exporter.
By default, `grpc` is used. To set `http/protobuf` as the protocol, configure the `OTEL_EXPORTER_OTLP_TRACES_PROTOCOL` environment variable as follows: By default, `grpc` is used. To set `http/protobuf` as the protocol, configure the `OTEL_EXPORTER_OTLP_TRACES_PROTOCOL` environment variable as follows:
```
```console
export OTEL_EXPORTER_OTLP_TRACES_PROTOCOL=http/protobuf export OTEL_EXPORTER_OTLP_TRACES_PROTOCOL=http/protobuf
export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://$JAEGER_IP:4318/v1/traces export OTEL_EXPORTER_OTLP_TRACES_ENDPOINT=http://$JAEGER_IP:4318/v1/traces
vllm serve facebook/opt-125m --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT" vllm serve facebook/opt-125m --otlp-traces-endpoint="$OTEL_EXPORTER_OTLP_TRACES_ENDPOINT"
``` ```
## Instrumentation of FastAPI ## Instrumentation of FastAPI
OpenTelemetry allows automatic instrumentation of FastAPI. OpenTelemetry allows automatic instrumentation of FastAPI.
1. Install the instrumentation library 1. Install the instrumentation library
```
```console
pip install opentelemetry-instrumentation-fastapi pip install opentelemetry-instrumentation-fastapi
``` ```
1. Run vLLM with `opentelemetry-instrument` 1. Run vLLM with `opentelemetry-instrument`
```
```console
opentelemetry-instrument vllm serve facebook/opt-125m opentelemetry-instrument vllm serve facebook/opt-125m
``` ```
1. Send a request to vLLM and find its trace in Jaeger. It should contain spans from FastAPI. 1. Send a request to vLLM and find its trace in Jaeger. It should contain spans from FastAPI.
![FastAPI Spans](https://i.imgur.com/hywvoOJ.png) ![FastAPI Spans](https://i.imgur.com/hywvoOJ.png)
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment