Unverified commit 6ae996a8, authored by Reid, committed by GitHub
Browse files

[Misc] refactor argument parsing in examples (#16635)


Signed-off-by: reidliu41 <reid201711@gmail.com>
Co-authored-by: reidliu41 <reid201711@gmail.com>
parent b590adfd
...@@ -187,6 +187,33 @@ model_example_map = { ...@@ -187,6 +187,33 @@ model_example_map = {
} }
def parse_args():
    """Parse the command line for the audio-language offline inference demo.

    Registers ``--model-type``/``-m``, ``--num-prompts``, ``--num-audios``
    and ``--seed`` on a ``FlexibleArgumentParser`` and returns the result of
    calling ``parse_args()`` on it.
    """
    cli = FlexibleArgumentParser(
        description='Demo on using vLLM for offline inference with '
        'audio language models')

    # Each entry is (option strings, keyword arguments for add_argument).
    option_table = (
        (('--model-type', '-m'),
         dict(type=str,
              default="ultravox",
              choices=model_example_map.keys(),
              help='Huggingface "model_type".')),
        (('--num-prompts', ),
         dict(type=int, default=1, help='Number of prompts to run.')),
        (("--num-audios", ),
         dict(type=int,
              default=1,
              choices=[0, 1, 2],
              help="Number of audio items per prompt.")),
        (("--seed", ),
         dict(type=int,
              default=None,
              help="Set the seed when initializing `vllm.LLM`.")),
    )
    for flags, settings in option_table:
        cli.add_argument(*flags, **settings)

    return cli.parse_args()
def main(args): def main(args):
model = args.model_type model = args.model_type
if model not in model_example_map: if model not in model_example_map:
...@@ -240,28 +267,5 @@ def main(args): ...@@ -240,28 +267,5 @@ def main(args):
if __name__ == "__main__": if __name__ == "__main__":
parser = FlexibleArgumentParser( args = parse_args()
description='Demo on using vLLM for offline inference with '
'audio language models')
parser.add_argument('--model-type',
'-m',
type=str,
default="ultravox",
choices=model_example_map.keys(),
help='Huggingface "model_type".')
parser.add_argument('--num-prompts',
type=int,
default=1,
help='Number of prompts to run.')
parser.add_argument("--num-audios",
type=int,
default=1,
choices=[0, 1, 2],
help="Number of audio items per prompt.")
parser.add_argument("--seed",
type=int,
default=None,
help="Set the seed when initializing `vllm.LLM`.")
args = parser.parse_args()
main(args) main(args)
...@@ -12,16 +12,23 @@ prompts = [ ...@@ -12,16 +12,23 @@ prompts = [
# Create a sampling params object. # Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95) sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create an LLM.
llm = LLM(model="facebook/opt-125m") def main():
# Generate texts from the prompts. The output is a list of RequestOutput objects # Create an LLM.
# that contain the prompt, generated text, and other information. llm = LLM(model="facebook/opt-125m")
outputs = llm.generate(prompts, sampling_params) # Generate texts from the prompts.
# Print the outputs. # The output is a list of RequestOutput objects
print("\nGenerated Outputs:\n" + "-" * 60) # that contain the prompt, generated text, and other information.
for output in outputs: outputs = llm.generate(prompts, sampling_params)
prompt = output.prompt # Print the outputs.
generated_text = output.outputs[0].text print("\nGenerated Outputs:\n" + "-" * 60)
print(f"Prompt: {prompt!r}") for output in outputs:
print(f"Output: {generated_text!r}") prompt = output.prompt
print("-" * 60) generated_text = output.outputs[0].text
\ No newline at end of file print(f"Prompt: {prompt!r}")
print(f"Output: {generated_text!r}")
print("-" * 60)
if __name__ == "__main__":
main()
...@@ -4,6 +4,24 @@ from vllm import LLM, EngineArgs ...@@ -4,6 +4,24 @@ from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
def create_parser():
    """Build (but do not run) the argument parser for this example.

    The parser has three sets of options: the engine arguments added by
    ``EngineArgs.add_cli_args`` (with ``model`` defaulting to
    ``meta-llama/Llama-3.2-1B-Instruct``), four optional sampling
    parameters, and the example-specific ``--chat-template-path``.

    Returns:
        The configured ``FlexibleArgumentParser``; the caller invokes
        ``parse_args()`` on it.
    """
    cli = FlexibleArgumentParser()

    # Engine arguments live in their own group so they are listed together.
    engine_opts = cli.add_argument_group("Engine arguments")
    EngineArgs.add_cli_args(engine_opts)
    engine_opts.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct")

    # Sampling parameters: no defaults are given, so each is None when omitted.
    sampling_opts = cli.add_argument_group("Sampling parameters")
    for flag, kind in (
        ("--max-tokens", int),
        ("--temperature", float),
        ("--top-p", float),
        ("--top-k", int),
    ):
        sampling_opts.add_argument(flag, type=kind)

    # Option specific to this example.
    cli.add_argument("--chat-template-path", type=str)

    return cli
def main(args: dict): def main(args: dict):
# Pop arguments not used by LLM # Pop arguments not used by LLM
max_tokens = args.pop("max_tokens") max_tokens = args.pop("max_tokens")
...@@ -82,18 +100,6 @@ def main(args: dict): ...@@ -82,18 +100,6 @@ def main(args: dict):
if __name__ == "__main__": if __name__ == "__main__":
parser = FlexibleArgumentParser() parser = create_parser()
# Add engine args
engine_group = parser.add_argument_group("Engine arguments")
EngineArgs.add_cli_args(engine_group)
engine_group.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct")
# Add sampling params
sampling_group = parser.add_argument_group("Sampling parameters")
sampling_group.add_argument("--max-tokens", type=int)
sampling_group.add_argument("--temperature", type=float)
sampling_group.add_argument("--top-p", type=float)
sampling_group.add_argument("--top-k", type=int)
# Add example params
parser.add_argument("--chat-template-path", type=str)
args: dict = vars(parser.parse_args()) args: dict = vars(parser.parse_args())
main(args) main(args)
...@@ -6,6 +6,16 @@ from vllm import LLM, EngineArgs ...@@ -6,6 +6,16 @@ from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
def parse_args():
    """Parse engine arguments with defaults suited to the classify example.

    All options come from ``EngineArgs.add_cli_args``; only the defaults for
    ``model``, ``task`` and ``enforce_eager`` are overridden here.
    """
    # add_cli_args hands the parser back; keep using the returned object.
    cli = EngineArgs.add_cli_args(FlexibleArgumentParser())

    # Example-specific defaults (still overridable from the command line).
    example_defaults = {
        "model": "jason9693/Qwen2.5-1.5B-apeach",
        "task": "classify",
        "enforce_eager": True,
    }
    cli.set_defaults(**example_defaults)

    return cli.parse_args()
def main(args: Namespace): def main(args: Namespace):
# Sample prompts. # Sample prompts.
prompts = [ prompts = [
...@@ -34,11 +44,5 @@ def main(args: Namespace): ...@@ -34,11 +44,5 @@ def main(args: Namespace):
if __name__ == "__main__": if __name__ == "__main__":
parser = FlexibleArgumentParser() args = parse_args()
parser = EngineArgs.add_cli_args(parser)
# Set example specific arguments
parser.set_defaults(model="jason9693/Qwen2.5-1.5B-apeach",
task="classify",
enforce_eager=True)
args = parser.parse_args()
main(args) main(args)
...@@ -6,6 +6,16 @@ from vllm import LLM, EngineArgs ...@@ -6,6 +6,16 @@ from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
def parse_args():
    """Parse engine arguments with defaults suited to the embedding example.

    All options come from ``EngineArgs.add_cli_args``; only the defaults for
    ``model``, ``task`` and ``enforce_eager`` are overridden here.
    """
    # add_cli_args hands the parser back; keep using the returned object.
    cli = EngineArgs.add_cli_args(FlexibleArgumentParser())

    # Example-specific defaults (still overridable from the command line).
    example_defaults = {
        "model": "intfloat/e5-mistral-7b-instruct",
        "task": "embed",
        "enforce_eager": True,
    }
    cli.set_defaults(**example_defaults)

    return cli.parse_args()
def main(args: Namespace): def main(args: Namespace):
# Sample prompts. # Sample prompts.
prompts = [ prompts = [
...@@ -34,11 +44,5 @@ def main(args: Namespace): ...@@ -34,11 +44,5 @@ def main(args: Namespace):
if __name__ == "__main__": if __name__ == "__main__":
parser = FlexibleArgumentParser() args = parse_args()
parser = EngineArgs.add_cli_args(parser)
# Set example specific arguments
parser.set_defaults(model="intfloat/e5-mistral-7b-instruct",
task="embed",
enforce_eager=True)
args = parser.parse_args()
main(args) main(args)
...@@ -4,6 +4,22 @@ from vllm import LLM, EngineArgs ...@@ -4,6 +4,22 @@ from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
def create_parser():
    """Build (but do not run) the argument parser for this example.

    The parser has the engine arguments added by
    ``EngineArgs.add_cli_args`` (with ``model`` defaulting to
    ``meta-llama/Llama-3.2-1B-Instruct``) and four optional sampling
    parameters.

    Returns:
        The configured ``FlexibleArgumentParser``; the caller invokes
        ``parse_args()`` on it.
    """
    cli = FlexibleArgumentParser()

    # Engine arguments live in their own group so they are listed together.
    engine_opts = cli.add_argument_group("Engine arguments")
    EngineArgs.add_cli_args(engine_opts)
    engine_opts.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct")

    # Sampling parameters: no defaults are given, so each is None when omitted.
    sampling_opts = cli.add_argument_group("Sampling parameters")
    for flag, kind in (
        ("--max-tokens", int),
        ("--temperature", float),
        ("--top-p", float),
        ("--top-k", int),
    ):
        sampling_opts.add_argument(flag, type=kind)

    return cli
def main(args: dict): def main(args: dict):
# Pop arguments not used by LLM # Pop arguments not used by LLM
max_tokens = args.pop("max_tokens") max_tokens = args.pop("max_tokens")
...@@ -35,23 +51,15 @@ def main(args: dict): ...@@ -35,23 +51,15 @@ def main(args: dict):
] ]
outputs = llm.generate(prompts, sampling_params) outputs = llm.generate(prompts, sampling_params)
# Print the outputs. # Print the outputs.
print("-" * 50)
for output in outputs: for output in outputs:
prompt = output.prompt prompt = output.prompt
generated_text = output.outputs[0].text generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-" * 50)
if __name__ == "__main__": if __name__ == "__main__":
parser = FlexibleArgumentParser() parser = create_parser()
# Add engine args
engine_group = parser.add_argument_group("Engine arguments")
EngineArgs.add_cli_args(engine_group)
engine_group.set_defaults(model="meta-llama/Llama-3.2-1B-Instruct")
# Add sampling params
sampling_group = parser.add_argument_group("Sampling parameters")
sampling_group.add_argument("--max-tokens", type=int)
sampling_group.add_argument("--temperature", type=float)
sampling_group.add_argument("--top-p", type=float)
sampling_group.add_argument("--top-k", type=int)
args: dict = vars(parser.parse_args()) args: dict = vars(parser.parse_args())
main(args) main(args)
...@@ -6,6 +6,16 @@ from vllm import LLM, EngineArgs ...@@ -6,6 +6,16 @@ from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
def parse_args():
    """Parse engine arguments with defaults suited to the scoring example.

    All options come from ``EngineArgs.add_cli_args``; only the defaults for
    ``model``, ``task`` and ``enforce_eager`` are overridden here.
    """
    # add_cli_args hands the parser back; keep using the returned object.
    cli = EngineArgs.add_cli_args(FlexibleArgumentParser())

    # Example-specific defaults (still overridable from the command line).
    example_defaults = {
        "model": "BAAI/bge-reranker-v2-m3",
        "task": "score",
        "enforce_eager": True,
    }
    cli.set_defaults(**example_defaults)

    return cli.parse_args()
def main(args: Namespace): def main(args: Namespace):
# Sample prompts. # Sample prompts.
text_1 = "What is the capital of France?" text_1 = "What is the capital of France?"
...@@ -30,11 +40,5 @@ def main(args: Namespace): ...@@ -30,11 +40,5 @@ def main(args: Namespace):
if __name__ == "__main__": if __name__ == "__main__":
parser = FlexibleArgumentParser() args = parse_args()
parser = EngineArgs.add_cli_args(parser)
# Set example specific arguments
parser.set_defaults(model="BAAI/bge-reranker-v2-m3",
task="score",
enforce_eager=True)
args = parser.parse_args()
main(args) main(args)
...@@ -34,6 +34,40 @@ from vllm import LLM, SamplingParams ...@@ -34,6 +34,40 @@ from vllm import LLM, SamplingParams
from vllm.utils import get_open_port from vllm.utils import get_open_port
def parse_args():
    """Parse the command line for the data-parallel inference example.

    Returns:
        An ``argparse.Namespace`` with ``model``, ``dp_size``, ``tp_size``,
        ``node_size``, ``node_rank``, ``master_addr`` and ``master_port``.
    """
    import argparse

    # (flag, type, default, help text) for every supported option.
    option_specs = (
        ("--model", str, "ibm-research/PowerMoE-3b", "Model name or path"),
        ("--dp-size", int, 2, "Data parallel size"),
        ("--tp-size", int, 2, "Tensor parallel size"),
        ("--node-size", int, 1, "Total number of nodes"),
        ("--node-rank", int, 0, "Rank of the current node"),
        ("--master-addr", str, "", "Master node IP address"),
        ("--master-port", int, 0, "Master node port"),
    )

    cli = argparse.ArgumentParser(description="Data Parallel Inference")
    for flag, kind, fallback, text in option_specs:
        cli.add_argument(flag, type=kind, default=fallback, help=text)

    return cli.parse_args()
def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip, def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip,
dp_master_port, GPUs_per_dp_rank): dp_master_port, GPUs_per_dp_rank):
os.environ["VLLM_DP_RANK"] = str(global_dp_rank) os.environ["VLLM_DP_RANK"] = str(global_dp_rank)
...@@ -95,37 +129,8 @@ def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip, ...@@ -95,37 +129,8 @@ def main(model, dp_size, local_dp_rank, global_dp_rank, dp_master_ip,
if __name__ == "__main__": if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description="Data Parallel Inference") args = parse_args()
parser.add_argument("--model",
type=str,
default="ibm-research/PowerMoE-3b",
help="Model name or path")
parser.add_argument("--dp-size",
type=int,
default=2,
help="Data parallel size")
parser.add_argument("--tp-size",
type=int,
default=2,
help="Tensor parallel size")
parser.add_argument("--node-size",
type=int,
default=1,
help="Total number of nodes")
parser.add_argument("--node-rank",
type=int,
default=0,
help="Rank of the current node")
parser.add_argument("--master-addr",
type=str,
default="",
help="Master node IP address")
parser.add_argument("--master-port",
type=int,
default=0,
help="Master node port")
args = parser.parse_args()
dp_size = args.dp_size dp_size = args.dp_size
tp_size = args.tp_size tp_size = args.tp_size
......
...@@ -27,7 +27,7 @@ def load_prompts(dataset_path, num_prompts): ...@@ -27,7 +27,7 @@ def load_prompts(dataset_path, num_prompts):
return prompts[:num_prompts] return prompts[:num_prompts]
def main(): def parse_args():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument( parser.add_argument(
"--dataset", "--dataset",
...@@ -45,7 +45,12 @@ def main(): ...@@ -45,7 +45,12 @@ def main():
parser.add_argument("--enable_chunked_prefill", action='store_true') parser.add_argument("--enable_chunked_prefill", action='store_true')
parser.add_argument("--max_num_batched_tokens", type=int, default=2048) parser.add_argument("--max_num_batched_tokens", type=int, default=2048)
parser.add_argument("--temp", type=float, default=0) parser.add_argument("--temp", type=float, default=0)
args = parser.parse_args() return parser.parse_args()
def main():
args = parse_args()
model_dir = "meta-llama/Meta-Llama-3-8B-Instruct" model_dir = "meta-llama/Meta-Llama-3-8B-Instruct"
eagle_dir = "abhigoyal/EAGLE-LLaMA3-Instruct-8B-vllm" eagle_dir = "abhigoyal/EAGLE-LLaMA3-Instruct-8B-vllm"
......
...@@ -6,6 +6,16 @@ from vllm import LLM, EngineArgs ...@@ -6,6 +6,16 @@ from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
def parse_args():
    """Parse engine arguments with defaults suited to this embedding example.

    All options come from ``EngineArgs.add_cli_args``; only the defaults for
    ``model``, ``task`` and ``trust_remote_code`` are overridden here.
    """
    # add_cli_args hands the parser back; keep using the returned object.
    cli = EngineArgs.add_cli_args(FlexibleArgumentParser())

    # Example-specific defaults (still overridable from the command line).
    example_defaults = {
        "model": "jinaai/jina-embeddings-v3",
        "task": "embed",
        "trust_remote_code": True,
    }
    cli.set_defaults(**example_defaults)

    return cli.parse_args()
def main(args: Namespace): def main(args: Namespace):
# Sample prompts. # Sample prompts.
prompts = [ prompts = [
...@@ -40,11 +50,5 @@ def main(args: Namespace): ...@@ -40,11 +50,5 @@ def main(args: Namespace):
if __name__ == "__main__": if __name__ == "__main__":
parser = FlexibleArgumentParser() args = parse_args()
parser = EngineArgs.add_cli_args(parser)
# Set example specific arguments
parser.set_defaults(model="jinaai/jina-embeddings-v3",
task="embed",
trust_remote_code=True)
args = parser.parse_args()
main(args) main(args)
...@@ -6,6 +6,16 @@ from vllm import LLM, EngineArgs, PoolingParams ...@@ -6,6 +6,16 @@ from vllm import LLM, EngineArgs, PoolingParams
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
def parse_args():
    """Parse engine arguments with defaults suited to this embedding example.

    All options come from ``EngineArgs.add_cli_args``; only the defaults for
    ``model``, ``task`` and ``trust_remote_code`` are overridden here.
    """
    # add_cli_args hands the parser back; keep using the returned object.
    cli = EngineArgs.add_cli_args(FlexibleArgumentParser())

    # Example-specific defaults (still overridable from the command line).
    example_defaults = {
        "model": "jinaai/jina-embeddings-v3",
        "task": "embed",
        "trust_remote_code": True,
    }
    cli.set_defaults(**example_defaults)

    return cli.parse_args()
def main(args: Namespace): def main(args: Namespace):
# Sample prompts. # Sample prompts.
prompts = [ prompts = [
...@@ -38,11 +48,5 @@ def main(args: Namespace): ...@@ -38,11 +48,5 @@ def main(args: Namespace):
if __name__ == "__main__": if __name__ == "__main__":
parser = FlexibleArgumentParser() args = parse_args()
parser = EngineArgs.add_cli_args(parser)
# Set example specific arguments
parser.set_defaults(model="jinaai/jina-embeddings-v3",
task="embed",
trust_remote_code=True)
args = parser.parse_args()
main(args) main(args)
...@@ -8,94 +8,112 @@ from vllm import LLM, SamplingParams ...@@ -8,94 +8,112 @@ from vllm import LLM, SamplingParams
from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt, from vllm.inputs import (ExplicitEncoderDecoderPrompt, TextPrompt,
TokensPrompt, zip_enc_dec_prompts) TokensPrompt, zip_enc_dec_prompts)
dtype = "float"
def create_prompts(tokenizer):
# Create a BART encoder/decoder model instance # Test prompts
llm = LLM( #
model="facebook/bart-large-cnn", # This section shows all of the valid ways to prompt an
dtype=dtype, # encoder/decoder model.
) #
# - Helpers for building prompts
# Get BART tokenizer text_prompt_raw = "Hello, my name is"
tokenizer = llm.llm_engine.get_tokenizer_group() text_prompt = TextPrompt(prompt="The president of the United States is")
tokens_prompt = TokensPrompt(prompt_token_ids=tokenizer.encode(
# Test prompts prompt="The capital of France is"))
# # - Pass a single prompt to encoder/decoder model
# This section shows all of the valid ways to prompt an # (implicitly encoder input prompt);
# encoder/decoder model. # decoder input prompt is assumed to be None
#
# - Helpers for building prompts single_text_prompt_raw = text_prompt_raw # Pass a string directly
text_prompt_raw = "Hello, my name is" single_text_prompt = text_prompt # Pass a TextPrompt
text_prompt = TextPrompt(prompt="The president of the United States is") single_tokens_prompt = tokens_prompt # Pass a TokensPrompt
tokens_prompt = TokensPrompt(prompt_token_ids=tokenizer.encode(
prompt="The capital of France is")) # ruff: noqa: E501
# - Pass a single prompt to encoder/decoder model # - Pass explicit encoder and decoder input prompts within one data structure.
# (implicitly encoder input prompt); # Encoder and decoder prompts can both independently be text or tokens, with
# decoder input prompt is assumed to be None # no requirement that they be the same prompt type. Some example prompt-type
# combinations are shown below, note that these are not exhaustive.
single_text_prompt_raw = text_prompt_raw # Pass a string directly
single_text_prompt = text_prompt # Pass a TextPrompt enc_dec_prompt1 = ExplicitEncoderDecoderPrompt(
single_tokens_prompt = tokens_prompt # Pass a TokensPrompt # Pass encoder prompt string directly, &
# pass decoder prompt tokens
# - Pass explicit encoder and decoder input prompts within one data structure. encoder_prompt=single_text_prompt_raw,
# Encoder and decoder prompts can both independently be text or tokens, with decoder_prompt=single_tokens_prompt,
# no requirement that they be the same prompt type. Some example prompt-type )
# combinations are shown below, note that these are not exhaustive. enc_dec_prompt2 = ExplicitEncoderDecoderPrompt(
# Pass TextPrompt to encoder, and
enc_dec_prompt1 = ExplicitEncoderDecoderPrompt( # pass decoder prompt string directly
# Pass encoder prompt string directly, & encoder_prompt=single_text_prompt,
# pass decoder prompt tokens decoder_prompt=single_text_prompt_raw,
encoder_prompt=single_text_prompt_raw, )
decoder_prompt=single_tokens_prompt, enc_dec_prompt3 = ExplicitEncoderDecoderPrompt(
) # Pass encoder prompt tokens directly, and
enc_dec_prompt2 = ExplicitEncoderDecoderPrompt( # pass TextPrompt to decoder
# Pass TextPrompt to encoder, and encoder_prompt=single_tokens_prompt,
# pass decoder prompt string directly decoder_prompt=single_text_prompt,
encoder_prompt=single_text_prompt, )
decoder_prompt=single_text_prompt_raw,
) # - Finally, here's a useful helper function for zipping encoder and
enc_dec_prompt3 = ExplicitEncoderDecoderPrompt( # decoder prompts together into a list of ExplicitEncoderDecoderPrompt
# Pass encoder prompt tokens directly, and # instances
# pass TextPrompt to decoder zipped_prompt_list = zip_enc_dec_prompts(
encoder_prompt=single_tokens_prompt, ['An encoder prompt', 'Another encoder prompt'],
decoder_prompt=single_text_prompt, ['A decoder prompt', 'Another decoder prompt'])
)
# - Let's put all of the above example prompts together into one list
# - Finally, here's a useful helper function for zipping encoder and # which we will pass to the encoder/decoder LLM.
# decoder prompts together into a list of ExplicitEncoderDecoderPrompt return [
# instances single_text_prompt_raw, single_text_prompt, single_tokens_prompt,
zipped_prompt_list = zip_enc_dec_prompts( enc_dec_prompt1, enc_dec_prompt2, enc_dec_prompt3
['An encoder prompt', 'Another encoder prompt'], ] + zipped_prompt_list
['A decoder prompt', 'Another decoder prompt'])
# - Let's put all of the above example prompts together into one list
# which we will pass to the encoder/decoder LLM.
prompts = [
single_text_prompt_raw, single_text_prompt, single_tokens_prompt,
enc_dec_prompt1, enc_dec_prompt2, enc_dec_prompt3
] + zipped_prompt_list
# Create a sampling params object. # Create a sampling params object.
sampling_params = SamplingParams( def create_sampling_params():
temperature=0, return SamplingParams(
top_p=1.0, temperature=0,
min_tokens=0, top_p=1.0,
max_tokens=20, min_tokens=0,
) max_tokens=20,
)
# Generate output tokens from the prompts. The output is a list of
# RequestOutput objects that contain the prompt, generated
# text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs. # Print the outputs.
print("-" * 50) def print_outputs(outputs):
for i, output in enumerate(outputs):
prompt = output.prompt
encoder_prompt = output.encoder_prompt
generated_text = output.outputs[0].text
print(f"Output {i+1}:")
print(f"Encoder prompt: {encoder_prompt!r}\n"
f"Decoder prompt: {prompt!r}\n"
f"Generated text: {generated_text!r}")
print("-" * 50) print("-" * 50)
for i, output in enumerate(outputs):
prompt = output.prompt
encoder_prompt = output.encoder_prompt
generated_text = output.outputs[0].text
print(f"Output {i+1}:")
print(f"Encoder prompt: {encoder_prompt!r}\n"
f"Decoder prompt: {prompt!r}\n"
f"Generated text: {generated_text!r}")
print("-" * 50)
def main():
dtype = "float"
# Create a BART encoder/decoder model instance
llm = LLM(
model="facebook/bart-large-cnn",
dtype=dtype,
)
# Get BART tokenizer
tokenizer = llm.llm_engine.get_tokenizer_group()
prompts = create_prompts(tokenizer)
sampling_params = create_sampling_params()
# Generate output tokens from the prompts. The output is a list of
# RequestOutput objects that contain the prompt, generated
# text, and other information.
outputs = llm.generate(prompts, sampling_params)
print_outputs(outputs)
if __name__ == "__main__":
main()
...@@ -126,6 +126,23 @@ model_example_map = { ...@@ -126,6 +126,23 @@ model_example_map = {
} }
def parse_args():
    """Parse the command line for the vision-language text generation demo.

    Registers ``--model-type``/``-m`` (restricted to the keys of
    ``model_example_map``) and ``--seed``, then returns the result of
    calling ``parse_args()`` on the parser.
    """
    cli = FlexibleArgumentParser(
        description='Demo on using vLLM for offline inference with '
        'vision language models for text generation')

    model_type_opts = dict(type=str,
                           default="mllama",
                           choices=model_example_map.keys(),
                           help='Huggingface "model_type".')
    cli.add_argument('--model-type', '-m', **model_type_opts)

    seed_opts = dict(type=int,
                     default=None,
                     help="Set the seed when initializing `vllm.LLM`.")
    cli.add_argument("--seed", **seed_opts)

    return cli.parse_args()
def main(args): def main(args):
model = args.model_type model = args.model_type
if model not in model_example_map: if model not in model_example_map:
...@@ -171,19 +188,5 @@ def main(args): ...@@ -171,19 +188,5 @@ def main(args):
if __name__ == "__main__": if __name__ == "__main__":
parser = FlexibleArgumentParser( args = parse_args()
description='Demo on using vLLM for offline inference with '
'vision language models for text generation')
parser.add_argument('--model-type',
'-m',
type=str,
default="mllama",
choices=model_example_map.keys(),
help='Huggingface "model_type".')
parser.add_argument("--seed",
type=int,
default=None,
help="Set the seed when initializing `vllm.LLM`.")
args = parser.parse_args()
main(args) main(args)
...@@ -168,7 +168,7 @@ def run_advanced_demo(args: argparse.Namespace): ...@@ -168,7 +168,7 @@ def run_advanced_demo(args: argparse.Namespace):
print("-" * 50) print("-" * 50)
def main(): def parse_args():
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="Run a demo in simple or advanced mode.") description="Run a demo in simple or advanced mode.")
...@@ -187,8 +187,11 @@ def main(): ...@@ -187,8 +187,11 @@ def main():
'--disable-mm-preprocessor-cache', '--disable-mm-preprocessor-cache',
action='store_true', action='store_true',
help='If True, disables caching of multi-modal preprocessor/mapper.') help='If True, disables caching of multi-modal preprocessor/mapper.')
return parser.parse_args()
args = parser.parse_args() def main():
args = parse_args()
if args.mode == "simple": if args.mode == "simple":
print("Running simple demo...") print("Running simple demo...")
......
...@@ -34,8 +34,7 @@ def time_generation(llm: LLM, prompts: list[str], ...@@ -34,8 +34,7 @@ def time_generation(llm: LLM, prompts: list[str],
print("-" * 50) print("-" * 50)
if __name__ == "__main__": def main():
template = ( template = (
"Below is an instruction that describes a task. Write a response " "Below is an instruction that describes a task. Write a response "
"that appropriately completes the request.\n\n### Instruction:\n{}" "that appropriately completes the request.\n\n### Instruction:\n{}"
...@@ -66,3 +65,7 @@ if __name__ == "__main__": ...@@ -66,3 +65,7 @@ if __name__ == "__main__":
) )
time_generation(llm, prompts, sampling_params, "With speculation") time_generation(llm, prompts, sampling_params, "With speculation")
if __name__ == "__main__":
main()
...@@ -417,6 +417,38 @@ def run_model(input_data, ...@@ -417,6 +417,38 @@ def run_model(input_data,
return pred_imgs return pred_imgs
def parse_args():
    """Parse the command line for the "MAE run inference" script.

    Returns:
        An ``argparse.Namespace`` with ``data_file``, ``output_dir``,
        ``input_indices`` and ``rgb_outputs``. The entry point expands it
        with ``main(**vars(args))``, so the attribute names must stay in
        sync with ``main``'s keyword parameters.
    """
    parser = argparse.ArgumentParser("MAE run inference", add_help=False)

    parser.add_argument(
        "--data_file",
        type=str,
        default="./India_900498_S2Hand.tif",
        help="Path to the file.",
    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="output",
        help="Path to the directory where to save outputs.",
    )
    parser.add_argument(
        "--input_indices",
        default=[1, 2, 3, 8, 11, 12],
        type=int,
        nargs="+",
        help=
        "0-based indices of the six Prithvi channels to be selected from the "
        "input. By default selects [1,2,3,8,11,12] for S2L1C data.",
    )
    parser.add_argument(
        "--rgb_outputs",
        action="store_true",
        help="If present, output files will only contain RGB channels. "
        "Otherwise, all bands will be saved.",
    )

    # Without this return the caller receives None and ``vars(args)`` fails.
    return parser.parse_args()
def main( def main(
data_file: str, data_file: str,
output_dir: str, output_dir: str,
...@@ -496,35 +528,7 @@ def main( ...@@ -496,35 +528,7 @@ def main(
if __name__ == "__main__": if __name__ == "__main__":
parser = argparse.ArgumentParser("MAE run inference", add_help=False)
parser.add_argument( args = parse_args()
"--data_file",
type=str,
default="./India_900498_S2Hand.tif",
help="Path to the file.",
)
parser.add_argument(
"--output_dir",
type=str,
default="output",
help="Path to the directory where to save outputs.",
)
parser.add_argument(
"--input_indices",
default=[1, 2, 3, 8, 11, 12],
type=int,
nargs="+",
help=
"0-based indices of the six Prithvi channels to be selected from the "
"input. By default selects [1,2,3,8,11,12] for S2L1C data.",
)
parser.add_argument(
"--rgb_outputs",
action="store_true",
help="If present, output files will only contain RGB channels. "
"Otherwise, all bands will be saved.",
)
args = parser.parse_args()
main(**vars(args)) main(**vars(args))
...@@ -359,7 +359,7 @@ def run_profile(context: ProfileContext, csv_output: Optional[str], ...@@ -359,7 +359,7 @@ def run_profile(context: ProfileContext, csv_output: Optional[str],
f" in folder {context.save_chrome_traces_folder}") f" in folder {context.save_chrome_traces_folder}")
if __name__ == "__main__": def parse_args():
parser = FlexibleArgumentParser(description=""" parser = FlexibleArgumentParser(description="""
Profile a model Profile a model
...@@ -449,7 +449,10 @@ Profile a model ...@@ -449,7 +449,10 @@ Profile a model
EngineArgs.add_cli_args(parser) EngineArgs.add_cli_args(parser)
args = parser.parse_args() return parser.parse_args()
def main(args):
context = ProfileContext( context = ProfileContext(
engine_args=EngineArgs.from_cli_args(args), engine_args=EngineArgs.from_cli_args(args),
**{ **{
...@@ -458,3 +461,8 @@ Profile a model ...@@ -458,3 +461,8 @@ Profile a model
if k in inspect.signature(ProfileContext).parameters if k in inspect.signature(ProfileContext).parameters
}) })
run_profile(context, csv_output=args.csv, json_output=args.json) run_profile(context, csv_output=args.csv, json_output=args.json)
if __name__ == "__main__":
args = parse_args()
main(args)
...@@ -29,20 +29,23 @@ from pathlib import Path ...@@ -29,20 +29,23 @@ from pathlib import Path
from vllm import LLM, EngineArgs from vllm import LLM, EngineArgs
from vllm.utils import FlexibleArgumentParser from vllm.utils import FlexibleArgumentParser
parser = FlexibleArgumentParser()
EngineArgs.add_cli_args(parser) def parse_args():
parser.add_argument("--output", parser = FlexibleArgumentParser()
"-o", EngineArgs.add_cli_args(parser)
required=True, parser.add_argument("--output",
type=str, "-o",
help="path to output checkpoint") required=True,
parser.add_argument("--file-pattern", type=str,
type=str, help="path to output checkpoint")
help="string pattern of saved filenames") parser.add_argument("--file-pattern",
parser.add_argument("--max-file-size", type=str,
type=str, help="string pattern of saved filenames")
default=5 * 1024**3, parser.add_argument("--max-file-size",
help="max size (in bytes) of each safetensors file") type=str,
default=5 * 1024**3,
help="max size (in bytes) of each safetensors file")
return parser.parse_args()
def main(args): def main(args):
...@@ -87,5 +90,5 @@ def main(args): ...@@ -87,5 +90,5 @@ def main(args):
if __name__ == "__main__": if __name__ == "__main__":
args = parser.parse_args() args = parse_args()
main(args) main(args)
...@@ -18,8 +18,8 @@ prompts = [ ...@@ -18,8 +18,8 @@ prompts = [
# Create a sampling params object. # Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95) sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
if __name__ == "__main__":
def main():
# Create an LLM. # Create an LLM.
llm = LLM(model="facebook/opt-125m", tensor_parallel_size=1) llm = LLM(model="facebook/opt-125m", tensor_parallel_size=1)
...@@ -42,3 +42,7 @@ if __name__ == "__main__": ...@@ -42,3 +42,7 @@ if __name__ == "__main__":
# Add a buffer to wait for profiler in the background process # Add a buffer to wait for profiler in the background process
# (in case MP is on) to finish writing profiling output. # (in case MP is on) to finish writing profiling output.
time.sleep(10) time.sleep(10)
if __name__ == "__main__":
main()
...@@ -1097,6 +1097,59 @@ def time_counter(enable: bool): ...@@ -1097,6 +1097,59 @@ def time_counter(enable: bool):
yield yield
def parse_args():
    """Parse the command line for the vision-language offline inference demo.

    Registers the model/modality selection options, the seed, and the
    flags controlling the multi-modal preprocessor cache, timing and
    per-request prompts, then returns the result of calling
    ``parse_args()`` on the parser.
    """
    cli = FlexibleArgumentParser(
        description='Demo on using vLLM for offline inference with '
        'vision language models for text generation')

    # Options that take a value: (option strings, add_argument kwargs).
    valued_options = (
        (('--model-type', '-m'),
         dict(type=str,
              default="llava",
              choices=model_example_map.keys(),
              help='Huggingface "model_type".')),
        (('--num-prompts', ),
         dict(type=int, default=4, help='Number of prompts to run.')),
        (('--modality', ),
         dict(type=str,
              default="image",
              choices=['image', 'video'],
              help='Modality of the input.')),
        (('--num-frames', ),
         dict(type=int,
              default=16,
              help='Number of frames to extract from the video.')),
        (("--seed", ),
         dict(type=int,
              default=None,
              help="Set the seed when initializing `vllm.LLM`.")),
        (('--image-repeat-prob', ),
         dict(type=float,
              default=None,
              help='Simulates the hit-ratio for multi-modal preprocessor cache'
              ' (if enabled)')),
    )
    for flags, settings in valued_options:
        cli.add_argument(*flags, **settings)

    # Boolean switches: (option string, help text); all are store_true.
    switches = (
        ('--disable-mm-preprocessor-cache',
         'If True, disables caching of multi-modal preprocessor/mapper.'),
        ('--time-generate',
         'If True, then print the total generate() call time'),
        ('--use-different-prompt-per-request',
         'If True, then use different prompt (with the same multi-modal '
         'data) for each request.'),
    )
    for flag, text in switches:
        cli.add_argument(flag, action='store_true', help=text)

    return cli.parse_args()
def main(args): def main(args):
model = args.model_type model = args.model_type
if model not in model_example_map: if model not in model_example_map:
...@@ -1175,55 +1228,5 @@ def main(args): ...@@ -1175,55 +1228,5 @@ def main(args):
if __name__ == "__main__": if __name__ == "__main__":
parser = FlexibleArgumentParser( args = parse_args()
description='Demo on using vLLM for offline inference with '
'vision language models for text generation')
parser.add_argument('--model-type',
'-m',
type=str,
default="llava",
choices=model_example_map.keys(),
help='Huggingface "model_type".')
parser.add_argument('--num-prompts',
type=int,
default=4,
help='Number of prompts to run.')
parser.add_argument('--modality',
type=str,
default="image",
choices=['image', 'video'],
help='Modality of the input.')
parser.add_argument('--num-frames',
type=int,
default=16,
help='Number of frames to extract from the video.')
parser.add_argument("--seed",
type=int,
default=None,
help="Set the seed when initializing `vllm.LLM`.")
parser.add_argument(
'--image-repeat-prob',
type=float,
default=None,
help='Simulates the hit-ratio for multi-modal preprocessor cache'
' (if enabled)')
parser.add_argument(
'--disable-mm-preprocessor-cache',
action='store_true',
help='If True, disables caching of multi-modal preprocessor/mapper.')
parser.add_argument(
'--time-generate',
action='store_true',
help='If True, then print the total generate() call time')
parser.add_argument(
'--use-different-prompt-per-request',
action='store_true',
help='If True, then use different prompt (with the same multi-modal '
'data) for each request.')
args = parser.parse_args()
main(args) main(args)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment