Commit a21256c4 (unverified), authored by Fanli Lin, committed by GitHub
Browse files

Add TP CLI argument to multimodal inference examples (#29301)


Signed-off-by: Lin, Fanli <fanli.lin@intel.com>
parent 316c8492
...@@ -425,6 +425,13 @@ def parse_args(): ...@@ -425,6 +425,13 @@ def parse_args():
default=None, default=None,
help="Set the seed when initializing `vllm.LLM`.", help="Set the seed when initializing `vllm.LLM`.",
) )
parser.add_argument(
"--tensor-parallel-size",
"-tp",
type=int,
default=None,
help="Tensor parallel size to override the model's default setting. ",
)
return parser.parse_args() return parser.parse_args()
...@@ -434,6 +441,12 @@ def main(args): ...@@ -434,6 +441,12 @@ def main(args):
if model not in model_example_map: if model not in model_example_map:
raise ValueError(f"Model type {model} is not supported.") raise ValueError(f"Model type {model} is not supported.")
if args.tensor_parallel_size is not None and args.tensor_parallel_size < 1:
raise ValueError(
f"tensor_parallel_size must be a positive integer, "
f"got {args.tensor_parallel_size}"
)
audio_count = args.num_audios audio_count = args.num_audios
req_data = model_example_map[model]( req_data = model_example_map[model](
question_per_audio_count[audio_count], audio_count question_per_audio_count[audio_count], audio_count
...@@ -446,6 +459,8 @@ def main(args): ...@@ -446,6 +459,8 @@ def main(args):
) )
engine_args = asdict(req_data.engine_args) | {"seed": args.seed} engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
if args.tensor_parallel_size is not None:
engine_args["tensor_parallel_size"] = args.tensor_parallel_size
llm = LLM(**engine_args) llm = LLM(**engine_args)
# We set temperature to 0.2 so that outputs can be different # We set temperature to 0.2 so that outputs can be different
......
...@@ -2064,6 +2064,13 @@ def parse_args(): ...@@ -2064,6 +2064,13 @@ def parse_args():
help="If True, will send all requests in a second batch with empty mm " help="If True, will send all requests in a second batch with empty mm "
"data to verify cache hits with UUIDs.", "data to verify cache hits with UUIDs.",
) )
parser.add_argument(
"--tensor-parallel-size",
"-tp",
type=int,
default=None,
help="Tensor parallel size to override the model's default setting. ",
)
return parser.parse_args() return parser.parse_args()
...@@ -2072,6 +2079,12 @@ def main(args): ...@@ -2072,6 +2079,12 @@ def main(args):
if model not in model_example_map: if model not in model_example_map:
raise ValueError(f"Model type {model} is not supported.") raise ValueError(f"Model type {model} is not supported.")
if args.tensor_parallel_size is not None and args.tensor_parallel_size < 1:
raise ValueError(
f"tensor_parallel_size must be a positive integer, "
f"got {args.tensor_parallel_size}"
)
modality = args.modality modality = args.modality
mm_input = get_multi_modal_input(args) mm_input = get_multi_modal_input(args)
data = mm_input["data"] data = mm_input["data"]
...@@ -2089,6 +2102,8 @@ def main(args): ...@@ -2089,6 +2102,8 @@ def main(args):
"seed": args.seed, "seed": args.seed,
"mm_processor_cache_gb": 0 if args.disable_mm_processor_cache else 4, "mm_processor_cache_gb": 0 if args.disable_mm_processor_cache else 4,
} }
if args.tensor_parallel_size is not None:
engine_args["tensor_parallel_size"] = args.tensor_parallel_size
llm = LLM(**engine_args) llm = LLM(**engine_args)
# Don't want to check the flag multiple times, so just hijack `prompts`. # Don't want to check the flag multiple times, so just hijack `prompts`.
......
...@@ -1352,10 +1352,18 @@ model_example_map = { ...@@ -1352,10 +1352,18 @@ model_example_map = {
} }
def run_generate(model, question: str, image_urls: list[str], seed: int | None): def run_generate(
model,
question: str,
image_urls: list[str],
seed: int | None,
tensor_parallel_size: int | None,
):
req_data = model_example_map[model](question, image_urls) req_data = model_example_map[model](question, image_urls)
engine_args = asdict(req_data.engine_args) | {"seed": args.seed} engine_args = asdict(req_data.engine_args) | {"seed": seed}
if tensor_parallel_size is not None:
engine_args["tensor_parallel_size"] = tensor_parallel_size
llm = LLM(**engine_args) llm = LLM(**engine_args)
sampling_params = SamplingParams( sampling_params = SamplingParams(
...@@ -1378,7 +1386,13 @@ def run_generate(model, question: str, image_urls: list[str], seed: int | None): ...@@ -1378,7 +1386,13 @@ def run_generate(model, question: str, image_urls: list[str], seed: int | None):
print("-" * 50) print("-" * 50)
def run_chat(model: str, question: str, image_urls: list[str], seed: int | None): def run_chat(
model: str,
question: str,
image_urls: list[str],
seed: int | None,
tensor_parallel_size: int | None,
):
req_data = model_example_map[model](question, image_urls) req_data = model_example_map[model](question, image_urls)
# Disable other modalities to save memory # Disable other modalities to save memory
...@@ -1388,6 +1402,8 @@ def run_chat(model: str, question: str, image_urls: list[str], seed: int | None) ...@@ -1388,6 +1402,8 @@ def run_chat(model: str, question: str, image_urls: list[str], seed: int | None)
) )
engine_args = asdict(req_data.engine_args) | {"seed": seed} engine_args = asdict(req_data.engine_args) | {"seed": seed}
if tensor_parallel_size is not None:
engine_args["tensor_parallel_size"] = tensor_parallel_size
llm = LLM(**engine_args) llm = LLM(**engine_args)
sampling_params = ( sampling_params = (
...@@ -1463,6 +1479,13 @@ def parse_args(): ...@@ -1463,6 +1479,13 @@ def parse_args():
default=2, default=2,
help="Number of images to use for the demo.", help="Number of images to use for the demo.",
) )
parser.add_argument(
"--tensor-parallel-size",
"-tp",
type=int,
default=None,
help="Tensor parallel size to override the model's default setting. ",
)
return parser.parse_args() return parser.parse_args()
...@@ -1470,13 +1493,20 @@ def main(args: Namespace): ...@@ -1470,13 +1493,20 @@ def main(args: Namespace):
model = args.model_type model = args.model_type
method = args.method method = args.method
seed = args.seed seed = args.seed
tensor_parallel_size = args.tensor_parallel_size
if tensor_parallel_size is not None and tensor_parallel_size < 1:
raise ValueError(
f"tensor_parallel_size must be a positive integer, "
f"got {tensor_parallel_size}"
)
image_urls = IMAGE_URLS[: args.num_images] image_urls = IMAGE_URLS[: args.num_images]
if method == "generate": if method == "generate":
run_generate(model, QUESTION, image_urls, seed) run_generate(model, QUESTION, image_urls, seed, tensor_parallel_size)
elif method == "chat": elif method == "chat":
run_chat(model, QUESTION, image_urls, seed) run_chat(model, QUESTION, image_urls, seed, tensor_parallel_size)
else: else:
raise ValueError(f"Invalid method: {method}") raise ValueError(f"Invalid method: {method}")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment