Commit dcb5624a authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.8.5' into v0.8.5-dev

parents 55880ca2 ba41cc90
# SPDX-License-Identifier: Apache-2.0
"""
This example shows how to use Ray Data for data parallel batch inference.
Ray Data is a data processing framework that can handle large datasets
and integrates tightly with vLLM for data-parallel inference.
As of Ray 2.44, Ray Data has a native integration with
vLLM (under ray.data.llm).
Ray Data provides functionality for:
* Reading and writing to cloud storage (S3, GCS, etc.)
* Automatic sharding and load-balancing across a cluster
* Optimized configuration of vLLM using continuous batching
* Compatible with tensor/pipeline parallel inference as well.
Learn more about Ray Data's LLM integration:
https://docs.ray.io/en/latest/data/working-with-llms.html
"""
import ray
from packaging.version import Version
from ray.data.llm import build_llm_processor, vLLMEngineProcessorConfig
# The example relies on the ray.data.llm integration imported above, so stop
# right away when the installed Ray release is older than the required one.
installed_ray_version = Version(ray.__version__)
assert installed_ray_version >= Version("2.44.1"), (
    "Ray version must be at least 2.44.1")

# Uncomment to reduce clutter in stdout
# ray.init(log_to_driver=False)
# ray.data.DataContext.get_current().enable_progress_bars = False

# Read one text file from S3. Ray Data supports reading multiple files
# from cloud storage (such as JSONL, Parquet, CSV, binary format).
ds = ray.data.read_text("s3://anonymous@air-example-data/prompts.txt")
print(ds.schema())
print(f"Size of dataset: {ds.count()} prompts")

# Configure vLLM engine.
engine_kwargs = {
    "enable_chunked_prefill": True,
    "max_num_batched_tokens": 4096,
    "max_model_len": 16384,
}
config = vLLMEngineProcessorConfig(
    model_source="unsloth/Llama-3.1-8B-Instruct",
    engine_kwargs=engine_kwargs,
    concurrency=1,  # set the number of parallel vLLM replicas
    batch_size=64,
)


def build_chat_request(row):
    """Map one dataset row to chat messages plus sampling settings."""
    system_message = {
        "role": "system",
        "content": "You are a bot that responds with haikus."
    }
    user_message = {"role": "user", "content": row["text"]}
    return {
        "messages": [system_message, user_message],
        "sampling_params": {
            "temperature": 0.3,
            "max_tokens": 250,
        },
    }


def attach_answer(row):
    """Expose the generated text as ``answer`` next to all row columns."""
    # Spreading the row keeps every column it already carries.
    return dict(answer=row["generated_text"], **row)


# Create a Processor object, which will be used to
# do batch inference on the dataset
vllm_processor = build_llm_processor(
    config,
    preprocess=build_chat_request,
    postprocess=attach_answer,
)
ds = vllm_processor(ds)

# Peek first 10 results.
# NOTE: This is for local testing and debugging. For production use case,
# one should write full result out as shown below.
outputs = ds.take(limit=10)
for output in outputs:
    prompt, generated_text = output["prompt"], output["generated_text"]
    print(f"Prompt: {prompt!r}")
    print(f"Generated text: {generated_text!r}")

# Write inference output data out as Parquet files to S3.
# Multiple files would be written to the output destination,
# and each task would write one or more files separately.
#
# ds.write_parquet("s3://<your-output-bucket>")
...@@ -34,6 +34,40 @@ from vllm import LLM, SamplingParams ...@@ -34,6 +34,40 @@ from vllm import LLM, SamplingParams
from vllm.utils import get_open_port from vllm.utils import get_open_port
def parse_args(argv=None):
    """Parse the options of the data-parallel inference example.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``, so existing
            ``parse_args()`` callers behave exactly as before.

    Returns:
        argparse.Namespace with ``model``, ``dp_size``, ``tp_size``,
        ``node_size``, ``node_rank``, ``master_addr`` and ``master_port``.
    """
    # Imported locally, as in the original, so the function does not depend
    # on a module-level import.
    import argparse

    parser = argparse.ArgumentParser(description="Data Parallel Inference")
    parser.add_argument("--model",
                        type=str,
                        default="ibm-research/PowerMoE-3b",
                        help="Model name or path")
    parser.add_argument("--dp-size",
                        type=int,
                        default=2,
                        help="Data parallel size")
    parser.add_argument("--tp-size",
                        type=int,
                        default=2,
                        help="Tensor parallel size")
    parser.add_argument("--node-size",
                        type=int,
                        default=1,
                        help="Total number of nodes")
    parser.add_argument("--node-rank",
                        type=int,
                        default=0,
                        help="Rank of the current node")
    parser.add_argument("--master-addr",
                        type=str,
                        default="",
                        help="Master node IP address")
    parser.add_argument("--master-port",
                        type=int,
                        default=0,
                        help="Master node port")
    return parser.parse_args(argv)
def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip, def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip,
dp_master_port, GPUs_per_dp_rank): dp_master_port, GPUs_per_dp_rank):
os.environ["VLLM_DP_RANK"] = str(global_dp_rank) os.environ["VLLM_DP_RANK"] = str(global_dp_rank)
...@@ -95,37 +129,8 @@ def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip, ...@@ -95,37 +129,8 @@ def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip,
if __name__ == "__main__": if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description="Data Parallel Inference") args = parse_args()
parser.add_argument("--model",
type=str,
default="ibm-research/PowerMoE-3b",
help="Model name or path")
parser.add_argument("--dp-size",
type=int,
default=2,
help="Data parallel size")
parser.add_argument("--tp-size",
type=int,
default=2,
help="Tensor parallel size")
parser.add_argument("--node-size",
type=int,
default=1,
help="Total number of nodes")
parser.add_argument("--node-rank",
type=int,
default=0,
help="Rank of the current node")
parser.add_argument("--master-addr",
type=str,
default="",
help="Master node IP address")
parser.add_argument("--master-port",
type=int,
default=0,
help="Master node port")
args = parser.parse_args()
dp_size = args.dp_size dp_size = args.dp_size
tp_size = args.tp_size tp_size = args.tp_size
......
# SPDX-License-Identifier: Apache-2.0
from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig
# Read prompts from output.txt, one prompt per line with surrounding
# whitespace removed. Only the file read sits inside the try block so that
# an unrelated error is never reported as a missing file.
try:
    with open("output.txt") as f:
        prompts = [line.strip() for line in f]
except FileNotFoundError:
    print("Error: output.txt file not found")
    # Raise SystemExit directly: the bare exit() helper is installed by the
    # site module for interactive use and is not guaranteed to exist.
    raise SystemExit(-1)
print(f"Loaded {len(prompts)} prompts from output.txt")

sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10)

# The connector settings (including the "local_storage" path) are passed as
# one JSON string.
llm = LLM(
    model="meta-llama/Llama-3.2-1B-Instruct",
    enforce_eager=True,
    gpu_memory_utilization=0.8,
    max_num_batched_tokens=64,
    max_num_seqs=16,
    kv_transfer_config=KVTransferConfig.from_cli(
        '{"kv_connector":"SharedStorageConnector","kv_role":"kv_both",'
        '"kv_connector_extra_config": {"shared_storage_path": "local_storage"}}'
    ))

# Generate from the prompts loaded above.
# NOTE(review): this looks like the decode side of the prefill/decode pair,
# since it consumes output.txt rather than producing it - confirm.
outputs = llm.generate(prompts, sampling_params)

for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
# SPDX-License-Identifier: Apache-2.0
from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig
# Two long shared prefixes; each one is reused by two prompts.
long_prefix = "Hi " * 1000
short_prefix = "Hey " * 500
prompts = [
    long_prefix + "Hello, my name is",
    long_prefix + "The capital of France is",
    short_prefix + "Your name is",
    short_prefix + "The capital of China is",
]

# A single new token per prompt is requested.
sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=1)

kv_transfer_config = KVTransferConfig.from_cli(
    '{"kv_connector":"SharedStorageConnector","kv_role":"kv_both", '
    '"kv_connector_extra_config": '
    '{"shared_storage_path": "local_storage"}}')
llm = LLM(
    model="meta-llama/Llama-3.2-1B-Instruct",
    enforce_eager=True,
    gpu_memory_utilization=0.8,
    kv_transfer_config=kv_transfer_config,
)

# 1ST generation (prefill instance)
outputs = llm.generate(prompts, sampling_params)

# Each saved prompt is the original prompt followed by its generated text.
new_prompts = []
for request_output in outputs:
    prompt = request_output.prompt
    generated_text = request_output.outputs[0].text
    new_prompts.append(prompt + generated_text)
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

# Write new_prompts to output.txt, one prompt per line.
with open("output.txt", "w") as f:
    f.writelines(saved_prompt + "\n" for saved_prompt in new_prompts)
print(f"Saved {len(new_prompts)} prompts to output.txt")
# Start from a clean slate: remove the shared-storage directory and the prompt
# hand-off file left by a previous run. -f keeps the very first run quiet,
# when output.txt does not exist yet.
rm -rf local_storage/
rm -f output.txt

# Run the prefill script first (it writes output.txt), then the decode script
# (it reads output.txt). Both are pinned to GPU 0 with V1 multiprocessing
# disabled.
VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 prefill_example.py
VLLM_ENABLE_V1_MULTIPROCESSING=0 CUDA_VISIBLE_DEVICES=0 python3 decode_example.py
...@@ -95,7 +95,7 @@ def run_decode(prefill_done): ...@@ -95,7 +95,7 @@ def run_decode(prefill_done):
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
if __name__ == "__main__": def main():
prefill_done = Event() prefill_done = Event()
prefill_process = Process(target=run_prefill, args=(prefill_done, )) prefill_process = Process(target=run_prefill, args=(prefill_done, ))
decode_process = Process(target=run_decode, args=(prefill_done, )) decode_process = Process(target=run_decode, args=(prefill_done, ))
...@@ -109,3 +109,7 @@ if __name__ == "__main__": ...@@ -109,3 +109,7 @@ if __name__ == "__main__":
# Terminate the prefill node when decode is finished # Terminate the prefill node when decode is finished
decode_process.join() decode_process.join()
prefill_process.terminate() prefill_process.terminate()
if __name__ == "__main__":
main()
# SPDX-License-Identifier: Apache-2.0
"""
This example shows how to use Ray Data for running offline batch inference
distributively on a multi-nodes cluster.
Learn more about Ray Data in https://docs.ray.io/en/latest/data/data.html
"""
from typing import Any
import numpy as np
import ray
from packaging.version import Version
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
from vllm import LLM, SamplingParams
# Stop early when the installed Ray release is older than the required one.
ray_is_recent_enough = Version(ray.__version__) >= Version("2.22.0")
assert ray_is_recent_enough, "Ray version must be at least 2.22.0"

# Sampling settings shared by every generate() call in this example.
sampling_params = SamplingParams(top_p=0.95, temperature=0.8)

# Tensor parallelism per instance.
tensor_parallel_size = 1

# Number of instances. Each instance will use tensor_parallel_size GPUs.
num_instances = 1
# Callable class used to do batch inference.
class LLMPredictor:
    """Hold one vLLM engine and run it over batches of prompts."""

    def __init__(self):
        # Build the engine once per instance; the model is fixed and the
        # tensor-parallel degree comes from the module-level setting.
        self.llm = LLM(
            model="meta-llama/Llama-2-7b-chat-hf",
            tensor_parallel_size=tensor_parallel_size,
        )

    def __call__(self, batch: dict[str, np.ndarray]) -> dict[str, list]:
        """Generate text for every prompt in ``batch["text"]``.

        Returns a dict with two parallel lists: ``"prompt"`` and
        ``"generated_text"``. The texts of all outputs of one request are
        joined with a single space.
        """
        # The result is a list of RequestOutput objects that contain the
        # prompt, generated text, and other information.
        request_outputs = self.llm.generate(batch["text"], sampling_params)
        return {
            "prompt": [request.prompt for request in request_outputs],
            "generated_text": [
                ' '.join(candidate.text for candidate in request.outputs)
                for request in request_outputs
            ],
        }
# Load the prompts from a single text file on S3. Ray Data can also read
# multiple files from cloud storage (such as JSONL, Parquet, CSV, binary
# format).
prompts_uri = "s3://anonymous@air-example-data/prompts.txt"
ds = ray.data.read_text(prompts_uri)
# For tensor_parallel_size > 1, we need to create placement groups for vLLM
# to use. Every actor has to have its own placement group.
def scheduling_strategy_fn():
    """Create a placement group and return the matching remote arguments.

    Returns a dict with a single ``scheduling_strategy`` entry.
    """
    # One bundle (1 GPU + 1 CPU) per tensor parallel worker.
    worker_bundle = {"GPU": 1, "CPU": 1}
    bundles = [worker_bundle] * tensor_parallel_size
    pg = ray.util.placement_group(bundles, strategy="STRICT_PACK")
    strategy = PlacementGroupSchedulingStrategy(
        pg, placement_group_capture_child_tasks=True)
    return dict(scheduling_strategy=strategy)
# Resource arguments forwarded to map_batches.
resources_kwarg: dict[str, Any]
if tensor_parallel_size == 1:
    # For tensor_parallel_size == 1, we simply set num_gpus=1.
    resources_kwarg = {"num_gpus": 1}
else:
    # Otherwise, we have to set num_gpus=0 and provide
    # a function that will create a placement group for
    # each instance.
    resources_kwarg = {
        "num_gpus": 0,
        "ray_remote_args_fn": scheduling_strategy_fn,
    }

# Apply batch inference for all input data.
ds = ds.map_batches(
    LLMPredictor,
    # One concurrent LLMPredictor per LLM instance.
    concurrency=num_instances,
    # Number of rows handed to each inference call.
    batch_size=32,
    **resources_kwarg,
)

# Peek first 10 results.
# NOTE: This is for local testing and debugging. For production use case,
# one should write full result out as shown below.
outputs = ds.take(limit=10)
for row in outputs:
    prompt, generated_text = row["prompt"], row["generated_text"]
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

# Write inference output data out as Parquet files to S3.
# Multiple files would be written to the output destination,
# and each task would write one or more files separately.
#
# ds.write_parquet("s3://<your-output-bucket>")
...@@ -27,7 +27,7 @@ def load_prompts(dataset_path, num_prompts): ...@@ -27,7 +27,7 @@ def load_prompts(dataset_path, num_prompts):
return prompts[:num_prompts] return prompts[:num_prompts]
def main(): def parse_args():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument( parser.add_argument(
"--dataset", "--dataset",
...@@ -45,10 +45,15 @@ def main(): ...@@ -45,10 +45,15 @@ def main():
parser.add_argument("--enable_chunked_prefill", action='store_true') parser.add_argument("--enable_chunked_prefill", action='store_true')
parser.add_argument("--max_num_batched_tokens", type=int, default=2048) parser.add_argument("--max_num_batched_tokens", type=int, default=2048)
parser.add_argument("--temp", type=float, default=0) parser.add_argument("--temp", type=float, default=0)
args = parser.parse_args() return parser.parse_args()
def main():
model_dir = "meta-llama/Meta-Llama-3-8B-Instruct" args = parse_args()
eagle_dir = "abhigoyal/EAGLE-LLaMA3-Instruct-8B-vllm"
model_dir = "meta-llama/Llama-3.1-8B-Instruct"
eagle_dir = "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
max_model_len = 2048 max_model_len = 2048
...@@ -76,7 +81,7 @@ def main(): ...@@ -76,7 +81,7 @@ def main():
max_num_seqs=args.max_num_seqs, max_num_seqs=args.max_num_seqs,
gpu_memory_utilization=0.8, gpu_memory_utilization=0.8,
speculative_config={ speculative_config={
"method": "eagle", "method": "eagle3" if "eagle3" in eagle_dir.lower() else "eagle",
"model": eagle_dir, "model": eagle_dir,
"num_speculative_tokens": args.num_spec_tokens, "num_speculative_tokens": args.num_spec_tokens,
"draft_tensor_parallel_size": args.draft_tp, "draft_tensor_parallel_size": args.draft_tp,
...@@ -90,6 +95,9 @@ def main(): ...@@ -90,6 +95,9 @@ def main():
outputs = llm.generate(prompt_token_ids=prompt_ids, outputs = llm.generate(prompt_token_ids=prompt_ids,
sampling_params=sampling_params) sampling_params=sampling_params)
if not hasattr(outputs, "metrics") or outputs.metrics is None:
return
# calculate the average number of accepted tokens per forward pass, +1 is # calculate the average number of accepted tokens per forward pass, +1 is
# to account for the token from the target model that's always going to be # to account for the token from the target model that's always going to be
# accepted # accepted
...@@ -104,6 +112,11 @@ def main(): ...@@ -104,6 +112,11 @@ def main():
{sum(acceptance_counts) / acceptance_counts[0]:.2f}") {sum(acceptance_counts) / acceptance_counts[0]:.2f}")
print("-" * 50) print("-" * 50)
# print acceptance at each token position
for i in range(len(acceptance_counts)):
print(f"acceptance at token {i}:"
f"{acceptance_counts[i] / (acceptance_counts[0]):.2f}")
if __name__ == "__main__": if __name__ == "__main__":
main() main()
...@@ -6,6 +6,16 @@ from vllm import LLM, EngineArgs ...@@ -6,6 +6,16 @@ from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
def parse_args():
    """Parse the engine CLI arguments with this example's defaults.

    The parser gets every option that ``EngineArgs.add_cli_args`` registers;
    the defaults for ``model``, ``task`` and ``trust_remote_code`` are then
    overridden for this example.
    """
    # Example specific defaults.
    example_defaults = {
        "model": "jinaai/jina-embeddings-v3",
        "task": "embed",
        "trust_remote_code": True,
    }
    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
    parser.set_defaults(**example_defaults)
    return parser.parse_args()
def main(args: Namespace): def main(args: Namespace):
# Sample prompts. # Sample prompts.
prompts = [ prompts = [
...@@ -40,11 +50,5 @@ def main(args: Namespace): ...@@ -40,11 +50,5 @@ def main(args: Namespace):
if __name__ == "__main__": if __name__ == "__main__":
parser = FlexibleArgumentParser() args = parse_args()
parser = EngineArgs.add_cli_args(parser)
# Set example specific arguments
parser.set_defaults(model="jinaai/jina-embeddings-v3",
task="embed",
trust_remote_code=True)
args = parser.parse_args()
main(args) main(args)
...@@ -6,6 +6,16 @@ from vllm import LLM, EngineArgs, PoolingParams ...@@ -6,6 +6,16 @@ from vllm import LLM, EngineArgs, PoolingParams
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
def parse_args():
    """Parse the engine CLI arguments with this example's defaults.

    The parser gets every option that ``EngineArgs.add_cli_args`` registers;
    the defaults for ``model``, ``task`` and ``trust_remote_code`` are then
    overridden for this example.
    """
    # Example specific defaults.
    example_defaults = {
        "model": "jinaai/jina-embeddings-v3",
        "task": "embed",
        "trust_remote_code": True,
    }
    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
    parser.set_defaults(**example_defaults)
    return parser.parse_args()
def main(args: Namespace): def main(args: Namespace):
# Sample prompts. # Sample prompts.
prompts = [ prompts = [
...@@ -38,11 +48,5 @@ def main(args: Namespace): ...@@ -38,11 +48,5 @@ def main(args: Namespace):
if __name__ == "__main__": if __name__ == "__main__":
parser = FlexibleArgumentParser() args = parse_args()
parser = EngineArgs.add_cli_args(parser)
# Set example specific arguments
parser.set_defaults(model="jinaai/jina-embeddings-v3",
task="embed",
trust_remote_code=True)
args = parser.parse_args()
main(args) main(args)
...@@ -8,94 +8,112 @@ from vllm import LLM, SamplingParams ...@@ -8,94 +8,112 @@ from vllm import LLM, SamplingParams
from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt, from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
TokensPrompt, zip_enc_dec_prompts) TokensPrompt, zip_enc_dec_prompts)
dtype = "float"
def create_prompts(tokenizer):
# Create a BART encoder/decoder model instance # Test prompts
llm = LLM( #
model="facebook/bart-large-cnn", # This section shows all of the valid ways to prompt an
dtype=dtype, # encoder/decoder model.
) #
# - Helpers for building prompts
# Get BART tokenizer text_prompt_raw = "Hello, my name is"
tokenizer = llm.llm_engine.get_tokenizer_group() text_prompt = TextPrompt(prompt="The president of the United States is")
tokens_prompt = TokensPrompt(prompt_token_ids=tokenizer.encode(
# Test prompts prompt="The capital of France is"))
# # - Pass a single prompt to encoder/decoder model
# This section shows all of the valid ways to prompt an # (implicitly encoder input prompt);
# encoder/decoder model. # decoder input prompt is assumed to be None
#
# - Helpers for building prompts single_text_prompt_raw = text_prompt_raw # Pass a string directly
text_prompt_raw = "Hello, my name is" single_text_prompt = text_prompt # Pass a TextPrompt
text_prompt = TextPrompt(prompt="The president of the United States is") single_tokens_prompt = tokens_prompt # Pass a TokensPrompt
tokens_prompt = TokensPrompt(prompt_token_ids=tokenizer.encode(
prompt="The capital of France is")) # ruff: noqa: E501
# - Pass a single prompt to encoder/decoder model # - Pass explicit encoder and decoder input prompts within one data structure.
# (implicitly encoder input prompt); # Encoder and decoder prompts can both independently be text or tokens, with
# decoder input prompt is assumed to be None # no requirement that they be the same prompt type. Some example prompt-type
# combinations are shown below, note that these are not exhaustive.
single_text_prompt_raw = text_prompt_raw # Pass a string directly
single_text_prompt = text_prompt # Pass a TextPrompt enc_dec_prompt1 = ExplicitEncoderDecoderPrompt(
single_tokens_prompt = tokens_prompt # Pass a TokensPrompt # Pass encoder prompt string directly, &
# pass decoder prompt tokens
# - Pass explicit encoder and decoder input prompts within one data structure. encoder_prompt=single_text_prompt_raw,
# Encoder and decoder prompts can both independently be text or tokens, with decoder_prompt=single_tokens_prompt,
# no requirement that they be the same prompt type. Some example prompt-type )
# combinations are shown below, note that these are not exhaustive. enc_dec_prompt2 = ExplicitEncoderDecoderPrompt(
# Pass TextPrompt to encoder, and
enc_dec_prompt1 = ExplicitEncoderDecoderPrompt( # pass decoder prompt string directly
# Pass encoder prompt string directly, & encoder_prompt=single_text_prompt,
# pass decoder prompt tokens decoder_prompt=single_text_prompt_raw,
encoder_prompt=single_text_prompt_raw, )
decoder_prompt=single_tokens_prompt, enc_dec_prompt3 = ExplicitEncoderDecoderPrompt(
) # Pass encoder prompt tokens directly, and
enc_dec_prompt2 = ExplicitEncoderDecoderPrompt( # pass TextPrompt to decoder
# Pass TextPrompt to encoder, and encoder_prompt=single_tokens_prompt,
# pass decoder prompt string directly decoder_prompt=single_text_prompt,
encoder_prompt=single_text_prompt, )
decoder_prompt=single_text_prompt_raw,
) # - Finally, here's a useful helper function for zipping encoder and
enc_dec_prompt3 = ExplicitEncoderDecoderPrompt( # decoder prompts together into a list of ExplicitEncoderDecoderPrompt
# Pass encoder prompt tokens directly, and # instances
# pass TextPrompt to decoder zipped_prompt_list = zip_enc_dec_prompts(
encoder_prompt=single_tokens_prompt, ['An encoder prompt', 'Another encoder prompt'],
decoder_prompt=single_text_prompt, ['A decoder prompt', 'Another decoder prompt'])
)
# - Let's put all of the above example prompts together into one list
# - Finally, here's a useful helper function for zipping encoder and # which we will pass to the encoder/decoder LLM.
# decoder prompts together into a list of ExplicitEncoderDecoderPrompt return [
# instances single_text_prompt_raw, single_text_prompt, single_tokens_prompt,
zipped_prompt_list = zip_enc_dec_prompts( enc_dec_prompt1, enc_dec_prompt2, enc_dec_prompt3
['An encoder prompt', 'Another encoder prompt'], ] + zipped_prompt_list
['A decoder prompt', 'Another decoder prompt'])
# - Let's put all of the above example prompts together into one list
# which we will pass to the encoder/decoder LLM.
prompts = [
single_text_prompt_raw, single_text_prompt, single_tokens_prompt,
enc_dec_prompt1, enc_dec_prompt2, enc_dec_prompt3
] + zipped_prompt_list
# Create a sampling params object. # Create a sampling params object.
sampling_params = SamplingParams( def create_sampling_params():
temperature=0, return SamplingParams(
top_p=1.0, temperature=0,
min_tokens=0, top_p=1.0,
max_tokens=20, min_tokens=0,
) max_tokens=20,
)
# Generate output tokens from the prompts. The output is a list of
# RequestOutput objects that contain the prompt, generated
# text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs. # Print the outputs.
print("-" * 50) def print_outputs(outputs):
for i, output in enumerate(outputs):
prompt = output.prompt
encoder_prompt = output.encoder_prompt
generated_text = output.outputs[0].text
print(f"Output {i+1}:")
print(f"Encoder prompt: {encoder_prompt!r}\n"
f"Decoder prompt: {prompt!r}\n"
f"Generated text: {generated_text!r}")
print("-" * 50) print("-" * 50)
for i, output in enumerate(outputs):
prompt = output.prompt
encoder_prompt = output.encoder_prompt
generated_text = output.outputs[0].text
print(f"Output {i+1}:")
print(f"Encoder prompt: {encoder_prompt!r}\n"
f"Decoder prompt: {prompt!r}\n"
f"Generated text: {generated_text!r}")
print("-" * 50)
def main():
    """Run the encoder/decoder example end to end.

    Creates the BART model, builds the prompts and sampling parameters,
    generates, and prints the outputs.
    """
    # Create a BART encoder/decoder model instance
    llm = LLM(model="facebook/bart-large-cnn", dtype="float")

    # Get BART tokenizer; it is needed to build the token-based prompts.
    tokenizer = llm.llm_engine.get_tokenizer_group()

    # Generate output tokens from the prompts. The output is a list of
    # RequestOutput objects that contain the prompt, generated
    # text, and other information.
    outputs = llm.generate(
        create_prompts(tokenizer),
        create_sampling_params(),
    )

    print_outputs(outputs)
if __name__ == "__main__":
main()
...@@ -22,7 +22,7 @@ class ModelRequestData(NamedTuple): ...@@ -22,7 +22,7 @@ class ModelRequestData(NamedTuple):
def run_florence2(): def run_florence2():
engine_args = EngineArgs( engine_args = EngineArgs(
model="microsoft/Florence-2-large", model="microsoft/Florence-2-large",
tokenizer="facebook/bart-large", tokenizer="Isotr0py/Florence-2-tokenizer",
max_num_seqs=8, max_num_seqs=8,
trust_remote_code=True, trust_remote_code=True,
limit_mm_per_prompt={"image": 1}, limit_mm_per_prompt={"image": 1},
...@@ -126,6 +126,23 @@ model_example_map = { ...@@ -126,6 +126,23 @@ model_example_map = {
} }
def parse_args():
    """Parse the command line of the vision-language example.

    Returns the parsed namespace with ``model_type`` (one of the keys of
    ``model_example_map``, default ``"mllama"``) and ``seed`` (int or None).
    """
    parser = FlexibleArgumentParser(
        description='Demo on using vLLM for offline inference with '
        'vision language models for text generation')

    # Which entry of model_example_map to run.
    parser.add_argument(
        '--model-type',
        '-m',
        type=str,
        default="mllama",
        choices=model_example_map.keys(),
        help='Huggingface "model_type".',
    )
    # Optional seed; left as None when not given.
    parser.add_argument(
        "--seed",
        type=int,
        default=None,
        help="Set the seed when initializing `vllm.LLM`.",
    )

    parsed = parser.parse_args()
    return parsed
def main(args): def main(args):
model = args.model_type model = args.model_type
if model not in model_example_map: if model not in model_example_map:
...@@ -148,6 +165,7 @@ def main(args): ...@@ -148,6 +165,7 @@ def main(args):
temperature=0, temperature=0,
top_p=1.0, top_p=1.0,
max_tokens=64, max_tokens=64,
skip_special_tokens=False,
) )
start = time.time() start = time.time()
...@@ -171,19 +189,5 @@ def main(args): ...@@ -171,19 +189,5 @@ def main(args):
if __name__ == "__main__": if __name__ == "__main__":
parser = FlexibleArgumentParser( args = parse_args()
description='Demo on using vLLM for offline inference with '
'vision language models for text generation')
parser.add_argument('--model-type',
'-m',
type=str,
default="mllama",
choices=model_example_map.keys(),
help='Huggingface "model_type".')
parser.add_argument("--seed",
type=int,
default=None,
help="Set the seed when initializing `vllm.LLM`.")
args = parser.parse_args()
main(args) main(args)
...@@ -50,6 +50,13 @@ def initialize_engine(args: argparse.Namespace) -> LLMEngine: ...@@ -50,6 +50,13 @@ def initialize_engine(args: argparse.Namespace) -> LLMEngine:
return LLMEngine.from_engine_args(engine_args) return LLMEngine.from_engine_args(engine_args)
def parse_args():
    """Parse the engine CLI arguments for the LLMEngine demo.

    Every option comes from ``EngineArgs.add_cli_args``; no example-specific
    option is added.
    """
    base_parser = FlexibleArgumentParser(
        description='Demo on using the LLMEngine class directly')
    engine_parser = EngineArgs.add_cli_args(base_parser)
    return engine_parser.parse_args()
def main(args: argparse.Namespace): def main(args: argparse.Namespace):
"""Main function that sets up and runs the prompt processing.""" """Main function that sets up and runs the prompt processing."""
engine = initialize_engine(args) engine = initialize_engine(args)
...@@ -58,8 +65,5 @@ def main(args: argparse.Namespace): ...@@ -58,8 +65,5 @@ def main(args: argparse.Namespace):
if __name__ == '__main__': if __name__ == '__main__':
parser = FlexibleArgumentParser( args = parse_args()
description='Demo on using the LLMEngine class directly')
parser = EngineArgs.add_cli_args(parser)
args = parser.parse_args()
main(args) main(args)
...@@ -16,11 +16,11 @@ from vllm.sampling_params import SamplingParams ...@@ -16,11 +16,11 @@ from vllm.sampling_params import SamplingParams
# # Mistral format # # Mistral format
# vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \ # vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \
# --tokenizer-mode mistral --config-format mistral --load-format mistral \ # --tokenizer-mode mistral --config-format mistral --load-format mistral \
# --limit-mm-per-prompt 'image=4' --max-model-len 16384 # --limit-mm-per-prompt '{"image":4}' --max-model-len 16384
# #
# # HF format # # HF format
# vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \ # vllm serve mistralai/Mistral-Small-3.1-24B-Instruct-2503 \
# --limit-mm-per-prompt 'image=4' --max-model-len 16384 # --limit-mm-per-prompt '{"image":4}' --max-model-len 16384
# ``` # ```
# #
# - Client: # - Client:
...@@ -62,6 +62,7 @@ def run_simple_demo(args: argparse.Namespace): ...@@ -62,6 +62,7 @@ def run_simple_demo(args: argparse.Namespace):
tokenizer_mode="mistral" if args.format == "mistral" else "auto", tokenizer_mode="mistral" if args.format == "mistral" else "auto",
config_format="mistral" if args.format == "mistral" else "auto", config_format="mistral" if args.format == "mistral" else "auto",
load_format="mistral" if args.format == "mistral" else "auto", load_format="mistral" if args.format == "mistral" else "auto",
limit_mm_per_prompt={"image": 1},
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
tensor_parallel_size=2, tensor_parallel_size=2,
...@@ -168,7 +169,7 @@ def run_advanced_demo(args: argparse.Namespace): ...@@ -168,7 +169,7 @@ def run_advanced_demo(args: argparse.Namespace):
print("-" * 50) print("-" * 50)
def main(): def parse_args():
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="Run a demo in simple or advanced mode.") description="Run a demo in simple or advanced mode.")
...@@ -187,8 +188,11 @@ def main(): ...@@ -187,8 +188,11 @@ def main():
'--disable-mm-preprocessor-cache', '--disable-mm-preprocessor-cache',
action='store_true', action='store_true',
help='If True, disables caching of multi-modal preprocessor/mapper.') help='If True, disables caching of multi-modal preprocessor/mapper.')
return parser.parse_args()
args = parser.parse_args() def main():
args = parse_args()
if args.mode == "simple": if args.mode == "simple":
print("Running simple demo...") print("Running simple demo...")
......
...@@ -34,8 +34,7 @@ def time_generation(llm: LLM, prompts: list[str], ...@@ -34,8 +34,7 @@ def time_generation(llm: LLM, prompts: list[str],
print("-" * 50) print("-" * 50)
if __name__ == "__main__": def main():
template = ( template = (
"Below is an instruction that describes a task. Write a response " "Below is an instruction that describes a task. Write a response "
"that appropriately completes the request.\n\n### Instruction:\n{}" "that appropriately completes the request.\n\n### Instruction:\n{}"
...@@ -66,3 +65,7 @@ if __name__ == "__main__": ...@@ -66,3 +65,7 @@ if __name__ == "__main__":
) )
time_generation(llm, prompts, sampling_params, "With speculation") time_generation(llm, prompts, sampling_params, "With speculation")
if __name__ == "__main__":
main()
...@@ -417,6 +417,38 @@ def run_model(input_data, ...@@ -417,6 +417,38 @@ def run_model(input_data,
return pred_imgs return pred_imgs
def parse_args(argv=None):
    """Parse the command-line options of the Prithvi inference example.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``, so existing
            ``parse_args()`` callers are unaffected.

    Returns:
        argparse.Namespace with ``data_file``, ``output_dir``,
        ``input_indices`` and ``rgb_outputs``.
    """
    parser = argparse.ArgumentParser("MAE run inference", add_help=False)

    parser.add_argument(
        "--data_file",
        type=str,
        default="./India_900498_S2Hand.tif",
        help="Path to the file.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="output",
        help="Path to the directory where to save outputs.",
    )
    parser.add_argument(
        "--input_indices",
        default=[1, 2, 3, 8, 11, 12],
        type=int,
        nargs="+",
        help=
        "0-based indices of the six Prithvi channels to be selected from the "
        "input. By default selects [1,2,3,8,11,12] for S2L1C data.",
    )
    parser.add_argument(
        "--rgb_outputs",
        action="store_true",
        help="If present, output files will only contain RGB channels. "
        "Otherwise, all bands will be saved.",
    )

    # The parsed namespace must be returned: the script entry point expands
    # it with vars(), which fails on None.
    return parser.parse_args(argv)
def main( def main(
data_file: str, data_file: str,
output_dir: str, output_dir: str,
...@@ -496,35 +528,7 @@ def main( ...@@ -496,35 +528,7 @@ def main(
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser("MAE run inference", add_help=False)
parser.add_argument( args = parse_args()
"--data_file",
type=str,
default="./India_900498_S2Hand.tif",
help="Path to the file.",
)
parser.add_argument(
"--output_dir",
type=str,
default="output",
help="Path to the directory where to save outputs.",
)
parser.add_argument(
"--input_indices",
default=[1, 2, 3, 8, 11, 12],
type=int,
nargs="+",
help=
"0-based indices of the six Prithvi channels to be selected from the "
"input. By default selects [1,2,3,8,11,12] for S2L1C data.",
)
parser.add_argument(
"--rgb_outputs",
action="store_true",
help="If present, output files will only contain RGB channels. "
"Otherwise, all bands will be saved.",
)
args = parser.parse_args()
main(**vars(args)) main(**vars(args))
...@@ -359,7 +359,7 @@ def run_profile(context: ProfileContext, csv_output: Optional[str], ...@@ -359,7 +359,7 @@ def run_profile(context: ProfileContext, csv_output: Optional[str],
f" in folder {context.save_chrome_traces_folder}") f" in folder {context.save_chrome_traces_folder}")
if __name__ == "__main__": def parse_args():
parser = FlexibleArgumentParser(description=""" parser = FlexibleArgumentParser(description="""
Profile a model Profile a model
...@@ -449,7 +449,10 @@ Profile a model ...@@ -449,7 +449,10 @@ Profile a model
EngineArgs.add_cli_args(parser) EngineArgs.add_cli_args(parser)
args = parser.parse_args() return parser.parse_args()
def main(args):
context = ProfileContext( context = ProfileContext(
engine_args=EngineArgs.from_cli_args(args), engine_args=EngineArgs.from_cli_args(args),
**{ **{
...@@ -458,3 +461,8 @@ Profile a model ...@@ -458,3 +461,8 @@ Profile a model
if k in inspect.signature(ProfileContext).parameters if k in inspect.signature(ProfileContext).parameters
}) })
run_profile(context, csv_output=args.csv, json_output=args.json) run_profile(context, csv_output=args.csv, json_output=args.json)
if __name__ == "__main__":
args = parse_args()
main(args)
# Qwen2.5-Omni Offline Inference Examples
This folder provides several example scripts showing how to run offline inference with Qwen2.5-Omni.
## Thinker Only
```bash
# Audio + image + video
python examples/offline_inference/qwen2_5_omni/only_thinker.py -q mixed_modalities
# Read vision and audio inputs from a single video file
# NOTE: V1 engine does not support interleaved modalities yet.
VLLM_USE_V1=0 python examples/offline_inference/qwen2_5_omni/only_thinker.py -q use_audio_in_video
# Multiple audios
VLLM_USE_V1=0 python examples/offline_inference/qwen2_5_omni/only_thinker.py -q multi_audios
```
This script runs the thinker part of Qwen2.5-Omni and generates a text response.
You can also test Qwen2.5-Omni on a single modality:
```bash
# Process audio inputs
python examples/offline_inference/audio_language.py --model-type qwen2_5_omni
# Process image inputs
python examples/offline_inference/vision_language.py --modality image --model-type qwen2_5_omni
# Process video inputs
python examples/offline_inference/vision_language.py --modality video --model-type qwen2_5_omni
```
# SPDX-License-Identifier: Apache-2.0
"""
This example shows how to use vLLM for running offline inference
with the correct prompt format on Qwen2.5-Omni (thinker only).
"""
from typing import NamedTuple
import vllm.envs as envs
from vllm import LLM, SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.utils import FlexibleArgumentParser
class QueryResult(NamedTuple):
    """Bundle of everything one example query needs.

    ``main`` passes ``inputs`` to ``LLM.generate`` and
    ``limit_mm_per_prompt`` to the ``LLM`` constructor.
    """

    # Request payload: the prompt plus its multi-modal data.
    inputs: dict
    # Per-modality item cap for a single prompt, e.g. {"audio": 1}.
    limit_mm_per_prompt: dict[str, int]
# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on
# lower-end GPUs.
# Unless specified, these settings have been tested to work on a single L4.
default_system = (
"You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
"Group, capable of perceiving auditory and visual inputs, as well as "
"generating text and speech.")
def get_mixed_modalities_query() -> QueryResult:
    """Build a query that combines one audio clip, one image and one video.

    Returns:
        A ``QueryResult`` whose prompt carries one placeholder per modality
        and whose per-prompt limits allow exactly one item of each.
    """
    question = ("What is recited in the audio? "
                "What is the content of this image? Why is this video funny?")

    # User turn: placeholders in the order audio -> image -> video, followed
    # by the question text.
    user_turn = "".join([
        "<|audio_bos|><|AUDIO|><|audio_eos|>",
        "<|vision_bos|><|IMAGE|><|vision_eos|>",
        "<|vision_bos|><|VIDEO|><|vision_eos|>",
        question,
    ])
    prompt = (f"<|im_start|>system\n{default_system}<|im_end|>\n"
              f"<|im_start|>user\n{user_turn}<|im_end|>\n"
              "<|im_start|>assistant\n")

    # Assets are resolved in the same order as the placeholders above.
    audio = AudioAsset("mary_had_lamb").audio_and_sample_rate
    image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
    video = VideoAsset(name="sample_demo_1.mp4", num_frames=16).np_ndarrays

    return QueryResult(
        inputs={
            "prompt": prompt,
            "multi_modal_data": {
                "audio": audio,
                "image": image,
                "video": video,
            },
        },
        limit_mm_per_prompt={
            "audio": 1,
            "image": 1,
            "video": 1,
        },
    )
def get_use_audio_in_video_query() -> QueryResult:
    """Build a query that pairs a video with audio taken from the same asset.

    The request sets ``use_audio_in_video`` in ``mm_processor_kwargs`` and
    passes both the video frames and the audio obtained from the same
    ``VideoAsset``.

    Returns:
        A ``QueryResult`` limited to one audio and one video item per prompt.

    Raises:
        AssertionError: If ``envs.VLLM_USE_V1`` is truthy; this query type
            must be launched with ``VLLM_USE_V1=0``.
    """
    # Fail fast: verify engine compatibility before the video asset is
    # constructed and its audio extracted, so an unsupported configuration
    # is reported without doing that work first.
    assert not envs.VLLM_USE_V1, ("V1 does not support use_audio_in_video. "
                                  "Please launch this example with "
                                  "`VLLM_USE_V1=0`.")

    question = ("Describe the content of the video, "
                "then convert what the baby say into text.")
    # Only a video placeholder appears in the prompt; the audio is supplied
    # alongside it in the multi-modal data below.
    prompt = (f"<|im_start|>system\n{default_system}<|im_end|>\n"
              "<|im_start|>user\n<|vision_bos|><|VIDEO|><|vision_eos|>"
              f"{question}<|im_end|>\n"
              "<|im_start|>assistant\n")

    asset = VideoAsset(name="sample_demo_1.mp4", num_frames=16)
    audio = asset.get_audio(sampling_rate=16000)

    return QueryResult(
        inputs={
            "prompt": prompt,
            "multi_modal_data": {
                "video": asset.np_ndarrays,
                "audio": audio,
            },
            "mm_processor_kwargs": {
                "use_audio_in_video": True,
            },
        },
        limit_mm_per_prompt={
            "audio": 1,
            "video": 1,
        },
    )
def get_multi_audios_query() -> QueryResult:
    """Build a query that asks the model to compare two audio clips.

    Returns:
        A ``QueryResult`` whose prompt holds two audio placeholders and whose
        per-prompt limit allows two audio items.
    """
    question = "Are these two audio clips the same?"

    # One placeholder per clip, matching the two assets loaded below.
    audio_slot = "<|audio_bos|><|AUDIO|><|audio_eos|>"
    prompt = (f"<|im_start|>system\n{default_system}<|im_end|>\n"
              f"<|im_start|>user\n{audio_slot}{audio_slot}"
              f"{question}<|im_end|>\n"
              "<|im_start|>assistant\n")

    clips = [
        AudioAsset("winning_call").audio_and_sample_rate,
        AudioAsset("mary_had_lamb").audio_and_sample_rate,
    ]

    return QueryResult(
        inputs={
            "prompt": prompt,
            "multi_modal_data": {
                "audio": clips,
            },
        },
        limit_mm_per_prompt={
            "audio": 2,
        },
    )
# Maps each `--query-type` choice to the function that builds its query.
query_map = dict(
    mixed_modalities=get_mixed_modalities_query,
    use_audio_in_video=get_use_audio_in_video_query,
    multi_audios=get_multi_audios_query,
)
def main(args):
    """Run the selected example query and print each generated text.

    Args:
        args: Parsed command-line namespace; ``args.query_type`` selects the
            builder in ``query_map`` and ``args.seed`` is forwarded to
            ``LLM``.
    """
    build_query = query_map[args.query_type]
    query = build_query()

    engine = LLM(
        model="Qwen/Qwen2.5-Omni-7B",
        max_model_len=5632,
        max_num_seqs=5,
        limit_mm_per_prompt=query.limit_mm_per_prompt,
        seed=args.seed,
    )

    # We set temperature to 0.2 so that outputs can be different
    # even when all prompts are identical when running batch inference.
    params = SamplingParams(temperature=0.2, max_tokens=64)

    for request_output in engine.generate(query.inputs,
                                          sampling_params=params):
        # Only the first candidate of each request is printed.
        print(request_output.outputs[0].text)
def parse_args():
    """Parse the command-line options of this example.

    Kept as a separate function, mirroring the ``parse_args()`` +
    ``main(args)`` layout of the other example scripts, so the parser is
    built only on demand and can be reused.

    Returns:
        The parsed namespace, with ``query_type`` (one of the keys of
        ``query_map``) and ``seed`` (``None`` unless given).
    """
    parser = FlexibleArgumentParser(
        description='Demo on using vLLM for offline inference with '
        'audio language models')
    parser.add_argument('--query-type',
                        '-q',
                        type=str,
                        default="mixed_modalities",
                        choices=query_map.keys(),
                        help='Query type.')
    parser.add_argument("--seed",
                        type=int,
                        default=None,
                        help="Set the seed when initializing `vllm.LLM`.")
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    main(args)
...@@ -29,20 +29,23 @@ from pathlib import Path ...@@ -29,20 +29,23 @@ from pathlib import Path
from vllm import LLM, EngineArgs from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
parser = FlexibleArgumentParser()
EngineArgs.add_cli_args(parser) def parse_args():
parser.add_argument("--output", parser = FlexibleArgumentParser()
"-o", EngineArgs.add_cli_args(parser)
required=True, parser.add_argument("--output",
type=str, "-o",
help="path to output checkpoint") required=True,
parser.add_argument("--file-pattern", type=str,
type=str, help="path to output checkpoint")
help="string pattern of saved filenames") parser.add_argument("--file-pattern",
parser.add_argument("--max-file-size", type=str,
type=str, help="string pattern of saved filenames")
default=5 * 1024**3, parser.add_argument("--max-file-size",
help="max size (in bytes) of each safetensors file") type=str,
default=5 * 1024**3,
help="max size (in bytes) of each safetensors file")
return parser.parse_args()
def main(args): def main(args):
...@@ -87,5 +90,5 @@ def main(args): ...@@ -87,5 +90,5 @@ def main(args):
if __name__ == "__main__": if __name__ == "__main__":
args = parser.parse_args() args = parse_args()
main(args) main(args)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment