"vscode:/vscode.git/clone" did not exist on "8bddb735123204872788a8ffe117321de7550e6c"
Commit 31330101 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.8.4' into v0.8.4-dev

parents e8933c34 dc1b4a6f
...@@ -22,31 +22,40 @@ prompts = [ ...@@ -22,31 +22,40 @@ prompts = [
# Create a sampling params object. # Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95) sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create an LLM.
llm = LLM( def main():
model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", # Create an LLM.
max_num_seqs=8, llm = LLM(
# The max_model_len and block_size arguments are required to be same as model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
# max sequence length when targeting neuron device. max_num_seqs=8,
# Currently, this is a known limitation in continuous batching support # The max_model_len and block_size arguments are required to be same as
# in transformers-neuronx. # max sequence length when targeting neuron device.
# TODO(liangfu): Support paged-attention in transformers-neuronx. # Currently, this is a known limitation in continuous batching support
max_model_len=2048, # in transformers-neuronx.
block_size=2048, # TODO(liangfu): Support paged-attention in transformers-neuronx.
# The device can be automatically detected when AWS Neuron SDK is installed. max_model_len=2048,
# The device argument can be either unspecified for automated detection, block_size=2048,
# or explicitly assigned. # ruff: noqa: E501
device="neuron", # The device can be automatically detected when AWS Neuron SDK is installed.
quantization="neuron_quant", # The device argument can be either unspecified for automated detection,
override_neuron_config={ # or explicitly assigned.
"cast_logits_dtype": "bfloat16", device="neuron",
}, quantization="neuron_quant",
tensor_parallel_size=2) override_neuron_config={
# Generate texts from the prompts. The output is a list of RequestOutput objects "cast_logits_dtype": "bfloat16",
# that contain the prompt, generated text, and other information. },
outputs = llm.generate(prompts, sampling_params) tensor_parallel_size=2)
# Print the outputs. # Generate texts from the prompts. The output is a list of RequestOutput objects
for output in outputs: # that contain the prompt, generated text, and other information.
prompt = output.prompt outputs = llm.generate(prompts, sampling_params)
generated_text = output.outputs[0].text # Print the outputs.
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") print("-" * 50)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-" * 50)
if __name__ == "__main__":
main()
...@@ -31,55 +31,62 @@ generating_prompts = [prefix + prompt for prompt in prompts] ...@@ -31,55 +31,62 @@ generating_prompts = [prefix + prompt for prompt in prompts]
# Create a sampling params object. # Create a sampling params object.
sampling_params = SamplingParams(temperature=0.0) sampling_params = SamplingParams(temperature=0.0)
# Create an LLM without prefix caching as a baseline.
regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.4) def main():
# Create an LLM without prefix caching as a baseline.
print("Results without `enable_prefix_caching`") regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.4)
# Generate texts from the prompts. The output is a list of RequestOutput objects print("Results without `enable_prefix_caching`")
# that contain the prompt, generated text, and other information.
outputs = regular_llm.generate(generating_prompts, sampling_params) # ruff: noqa: E501
# Generate texts from the prompts. The output is a list of RequestOutput objects
regular_generated_texts = [] # that contain the prompt, generated text, and other information.
# Print the outputs. outputs = regular_llm.generate(generating_prompts, sampling_params)
for output in outputs:
prompt = output.prompt regular_generated_texts = []
generated_text = output.outputs[0].text # Print the outputs.
regular_generated_texts.append(generated_text) print("-" * 50)
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") for output in outputs:
prompt = output.prompt
print("-" * 80) generated_text = output.outputs[0].text
regular_generated_texts.append(generated_text)
# Destroy the LLM object and free up the GPU memory. print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
del regular_llm print("-" * 50)
cleanup_dist_env_and_memory()
# Destroy the LLM object and free up the GPU memory.
# Create an LLM with prefix caching enabled. del regular_llm
prefix_cached_llm = LLM(model="facebook/opt-125m", cleanup_dist_env_and_memory()
enable_prefix_caching=True,
gpu_memory_utilization=0.4) # Create an LLM with prefix caching enabled.
prefix_cached_llm = LLM(model="facebook/opt-125m",
# Warmup so that the shared prompt's KV cache is computed. enable_prefix_caching=True,
prefix_cached_llm.generate(generating_prompts[0], sampling_params) gpu_memory_utilization=0.4)
# Generate with prefix caching. # Warmup so that the shared prompt's KV cache is computed.
outputs = prefix_cached_llm.generate(generating_prompts, sampling_params) prefix_cached_llm.generate(generating_prompts[0], sampling_params)
print("Results with `enable_prefix_caching`") # Generate with prefix caching.
outputs = prefix_cached_llm.generate(generating_prompts, sampling_params)
cached_generated_texts = []
# Print the outputs. You should see the same outputs as before. print("Results with `enable_prefix_caching`")
for output in outputs:
prompt = output.prompt cached_generated_texts = []
generated_text = output.outputs[0].text # Print the outputs. You should see the same outputs as before.
cached_generated_texts.append(generated_text) print("-" * 50)
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") for output in outputs:
prompt = output.prompt
print("-" * 80) generated_text = output.outputs[0].text
cached_generated_texts.append(generated_text)
# Compare the results and display the speedup print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
generated_same = all([ print("-" * 50)
regular_generated_texts[i] == cached_generated_texts[i]
for i in range(len(prompts)) # Compare the results and display the speedup
]) generated_same = all([
print(f"Generated answers are the same: {generated_same}") regular_generated_texts[i] == cached_generated_texts[i]
for i in range(len(prompts))
])
print(f"Generated answers are the same: {generated_same}")
if __name__ == "__main__":
main()
...@@ -234,9 +234,8 @@ def run_profile(context: ProfileContext, csv_output: Optional[str], ...@@ -234,9 +234,8 @@ def run_profile(context: ProfileContext, csv_output: Optional[str],
sampling_params.max_tokens = next(output_len_generator) sampling_params.max_tokens = next(output_len_generator)
assert isinstance(sampling_params.max_tokens, int) assert isinstance(sampling_params.max_tokens, int)
prompt_token_ids = torch.randint( prompt_token_ids = torch.randint(llm.get_tokenizer().vocab_size,
llm.llm_engine.model_config.get_vocab_size(), size=(prompt_len, )).tolist()
size=(prompt_len, )).tolist()
llm.llm_engine.add_request( llm.llm_engine.add_request(
request_id=f"seq{i}", request_id=f"seq{i}",
......
...@@ -19,8 +19,6 @@ SEED = 42 ...@@ -19,8 +19,6 @@ SEED = 42
# because it is almost impossible to make the scheduling deterministic in the # because it is almost impossible to make the scheduling deterministic in the
# online serving setting. # online serving setting.
llm = LLM(model="facebook/opt-125m", seed=SEED)
prompts = [ prompts = [
"Hello, my name is", "Hello, my name is",
"The president of the United States is", "The president of the United States is",
...@@ -29,8 +27,17 @@ prompts = [ ...@@ -29,8 +27,17 @@ prompts = [
] ]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95) sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
outputs = llm.generate(prompts, sampling_params)
for output in outputs: def main():
prompt = output.prompt llm = LLM(model="facebook/opt-125m", seed=SEED)
generated_text = output.outputs[0].text outputs = llm.generate(prompts, sampling_params)
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") print("-" * 50)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-" * 50)
if __name__ == "__main__":
main()
...@@ -85,11 +85,13 @@ sampling_params = SamplingParams(temperature=0) ...@@ -85,11 +85,13 @@ sampling_params = SamplingParams(temperature=0)
outputs = ray.get(llm.generate.remote(prompts, sampling_params)) outputs = ray.get(llm.generate.remote(prompts, sampling_params))
print("-" * 50)
for output in outputs: for output in outputs:
prompt = output.prompt prompt = output.prompt
generated_text = output.outputs[0].text generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, " print(f"Prompt: {prompt!r}\n"
f"Generated text: {generated_text!r}") f"Generated text: {generated_text!r}")
print("-" * 50)
# set up the communication between the training process # set up the communication between the training process
# and the inference engine. # and the inference engine.
...@@ -120,8 +122,10 @@ assert all(ray.get(llm.collective_rpc.remote("check_weights_changed"))) ...@@ -120,8 +122,10 @@ assert all(ray.get(llm.collective_rpc.remote("check_weights_changed")))
# use the updated model to generate texts, they will be nonsense # use the updated model to generate texts, they will be nonsense
# because the weights are all zeros. # because the weights are all zeros.
outputs_updated = ray.get(llm.generate.remote(prompts, sampling_params)) outputs_updated = ray.get(llm.generate.remote(prompts, sampling_params))
print("-" * 50)
for output in outputs_updated: for output in outputs_updated:
prompt = output.prompt prompt = output.prompt
generated_text = output.outputs[0].text generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, " print(f"Prompt: {prompt!r}\n"
f"Generated text: {generated_text!r}") f"Generated text: {generated_text!r}")
print("-" * 50)
...@@ -32,10 +32,12 @@ if __name__ == "__main__": ...@@ -32,10 +32,12 @@ if __name__ == "__main__":
llm.stop_profile() llm.stop_profile()
# Print the outputs. # Print the outputs.
print("-" * 50)
for output in outputs: for output in outputs:
prompt = output.prompt prompt = output.prompt
generated_text = output.outputs[0].text generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-" * 50)
# Add a buffer to wait for profiler in the background process # Add a buffer to wait for profiler in the background process
# (in case MP is on) to finish writing profiling output. # (in case MP is on) to finish writing profiling output.
......
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
"""
This file demonstrates the example usage of guided decoding
to generate structured outputs using vLLM. It shows how to apply
different guided decoding techniques such as Choice, Regex, JSON schema,
and Grammar to produce structured and formatted results
based on specific prompts.
"""
from enum import Enum from enum import Enum
...@@ -7,26 +14,21 @@ from pydantic import BaseModel ...@@ -7,26 +14,21 @@ from pydantic import BaseModel
from vllm import LLM, SamplingParams from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams from vllm.sampling_params import GuidedDecodingParams
llm = LLM(model="Qwen/Qwen2.5-3B-Instruct", max_model_len=100)
# Guided decoding by Choice (list of possible options) # Guided decoding by Choice (list of possible options)
guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"]) guided_decoding_params_choice = GuidedDecodingParams(
sampling_params = SamplingParams(guided_decoding=guided_decoding_params) choice=["Positive", "Negative"])
outputs = llm.generate( sampling_params_choice = SamplingParams(
prompts="Classify this sentiment: vLLM is wonderful!", guided_decoding=guided_decoding_params_choice)
sampling_params=sampling_params, prompt_choice = "Classify this sentiment: vLLM is wonderful!"
)
print(outputs[0].outputs[0].text)
# Guided decoding by Regex # Guided decoding by Regex
guided_decoding_params = GuidedDecodingParams(regex="\w+@\w+\.com\n") guided_decoding_params_regex = GuidedDecodingParams(regex=r"\w+@\w+\.com\n")
sampling_params = SamplingParams(guided_decoding=guided_decoding_params, sampling_params_regex = SamplingParams(
stop=["\n"]) guided_decoding=guided_decoding_params_regex, stop=["\n"])
prompt = ("Generate an email address for Alan Turing, who works in Enigma." prompt_regex = (
"End in .com and new line. Example result:" "Generate an email address for Alan Turing, who works in Enigma."
"alan.turing@enigma.com\n") "End in .com and new line. Example result:"
outputs = llm.generate(prompts=prompt, sampling_params=sampling_params) "alan.turing@enigma.com\n")
print(outputs[0].outputs[0].text)
# Guided decoding by JSON using Pydantic schema # Guided decoding by JSON using Pydantic schema
...@@ -44,37 +46,54 @@ class CarDescription(BaseModel): ...@@ -44,37 +46,54 @@ class CarDescription(BaseModel):
json_schema = CarDescription.model_json_schema() json_schema = CarDescription.model_json_schema()
guided_decoding_params_json = GuidedDecodingParams(json=json_schema)
guided_decoding_params = GuidedDecodingParams(json=json_schema) sampling_params_json = SamplingParams(
sampling_params = SamplingParams(guided_decoding=guided_decoding_params) guided_decoding=guided_decoding_params_json)
prompt = ("Generate a JSON with the brand, model and car_type of" prompt_json = ("Generate a JSON with the brand, model and car_type of"
"the most iconic car from the 90's") "the most iconic car from the 90's")
outputs = llm.generate(
prompts=prompt,
sampling_params=sampling_params,
)
print(outputs[0].outputs[0].text)
# Guided decoding by Grammar # Guided decoding by Grammar
simplified_sql_grammar = """ simplified_sql_grammar = """
?start: select_statement root ::= select_statement
select_statement ::= "SELECT " column " from " table " where " condition
column ::= "col_1 " | "col_2 "
table ::= "table_1 " | "table_2 "
condition ::= column "= " number
number ::= "1 " | "2 "
"""
guided_decoding_params_grammar = GuidedDecodingParams(
grammar=simplified_sql_grammar)
sampling_params_grammar = SamplingParams(
guided_decoding=guided_decoding_params_grammar)
prompt_grammar = ("Generate an SQL query to show the 'username' and 'email'"
"from the 'users' table.")
?select_statement: "SELECT " column_list " FROM " table_name
?column_list: column_name ("," column_name)* def format_output(title: str, output: str):
print(f"{'-' * 50}\n{title}: {output}\n{'-' * 50}")
?table_name: identifier
?column_name: identifier def generate_output(prompt: str, sampling_params: SamplingParams, llm: LLM):
outputs = llm.generate(prompts=prompt, sampling_params=sampling_params)
return outputs[0].outputs[0].text
?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
""" def main():
guided_decoding_params = GuidedDecodingParams(grammar=simplified_sql_grammar) llm = LLM(model="Qwen/Qwen2.5-3B-Instruct", max_model_len=100)
sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
prompt = ("Generate an SQL query to show the 'username' and 'email'" choice_output = generate_output(prompt_choice, sampling_params_choice, llm)
"from the 'users' table.") format_output("Guided decoding by Choice", choice_output)
outputs = llm.generate(
prompts=prompt, regex_output = generate_output(prompt_regex, sampling_params_regex, llm)
sampling_params=sampling_params, format_output("Guided decoding by Regex", regex_output)
)
print(outputs[0].outputs[0].text) json_output = generate_output(prompt_json, sampling_params_json, llm)
format_output("Guided decoding by JSON", json_output)
grammar_output = generate_output(prompt_grammar, sampling_params_grammar,
llm)
format_output("Guided decoding by Grammar", grammar_output)
if __name__ == "__main__":
main()
...@@ -36,11 +36,13 @@ llm = LLM( ...@@ -36,11 +36,13 @@ llm = LLM(
outputs = llm.generate(prompts, sampling_params) outputs = llm.generate(prompts, sampling_params)
# all ranks will have the same outputs # all ranks will have the same outputs
print("-" * 50)
for output in outputs: for output in outputs:
prompt = output.prompt prompt = output.prompt
generated_text = output.outputs[0].text generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, " print(f"Prompt: {prompt!r}\n"
f"Generated text: {generated_text!r}") f"Generated text: {generated_text!r}")
print("-" * 50)
""" """
Further tips: Further tips:
......
...@@ -16,14 +16,22 @@ N = 1 ...@@ -16,14 +16,22 @@ N = 1
# Currently, top-p sampling is disabled. `top_p` should be 1.0. # Currently, top-p sampling is disabled. `top_p` should be 1.0.
sampling_params = SamplingParams(temperature=0, top_p=1.0, n=N, max_tokens=16) sampling_params = SamplingParams(temperature=0, top_p=1.0, n=N, max_tokens=16)
# Set `enforce_eager=True` to avoid ahead-of-time compilation.
# In real workloads, `enforce_eager` should be `False`. def main():
llm = LLM(model="Qwen/Qwen2-1.5B-Instruct", # Set `enforce_eager=True` to avoid ahead-of-time compilation.
max_num_batched_tokens=64, # In real workloads, `enforce_eager` should be `False`.
max_num_seqs=4) llm = LLM(model="Qwen/Qwen2-1.5B-Instruct",
outputs = llm.generate(prompts, sampling_params) max_num_batched_tokens=64,
for output, answer in zip(outputs, answers): max_num_seqs=4)
prompt = output.prompt outputs = llm.generate(prompts, sampling_params)
generated_text = output.outputs[0].text print("-" * 50)
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") for output, answer in zip(outputs, answers):
assert generated_text.startswith(answer) prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
assert generated_text.startswith(answer)
print("-" * 50)
if __name__ == "__main__":
main()
...@@ -8,6 +8,7 @@ on HuggingFace model repository. ...@@ -8,6 +8,7 @@ on HuggingFace model repository.
""" """
import os import os
import random import random
from contextlib import contextmanager
from dataclasses import asdict from dataclasses import asdict
from typing import NamedTuple, Optional from typing import NamedTuple, Optional
...@@ -44,7 +45,7 @@ def run_aria(questions: list[str], modality: str) -> ModelRequestData: ...@@ -44,7 +45,7 @@ def run_aria(questions: list[str], modality: str) -> ModelRequestData:
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
dtype="bfloat16", dtype="bfloat16",
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, limit_mm_per_prompt={"image": 1},
) )
prompts = [(f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}" prompts = [(f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}"
...@@ -70,7 +71,7 @@ def run_aya_vision(questions: list[str], modality: str) -> ModelRequestData: ...@@ -70,7 +71,7 @@ def run_aya_vision(questions: list[str], modality: str) -> ModelRequestData:
max_model_len=2048, max_model_len=2048,
max_num_seqs=2, max_num_seqs=2,
mm_processor_kwargs={"crop_to_patches": True}, mm_processor_kwargs={"crop_to_patches": True},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, limit_mm_per_prompt={"image": 1},
) )
prompts = [ prompts = [
f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|><image>{question}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>" f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|><image>{question}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
...@@ -91,7 +92,7 @@ def run_blip2(questions: list[str], modality: str) -> ModelRequestData: ...@@ -91,7 +92,7 @@ def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
prompts = [f"Question: {question} Answer:" for question in questions] prompts = [f"Question: {question} Answer:" for question in questions]
engine_args = EngineArgs( engine_args = EngineArgs(
model="Salesforce/blip2-opt-6.7b", model="Salesforce/blip2-opt-6.7b",
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, limit_mm_per_prompt={"image": 1},
) )
return ModelRequestData( return ModelRequestData(
...@@ -109,7 +110,7 @@ def run_chameleon(questions: list[str], modality: str) -> ModelRequestData: ...@@ -109,7 +110,7 @@ def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
model="facebook/chameleon-7b", model="facebook/chameleon-7b",
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, limit_mm_per_prompt={"image": 1},
) )
return ModelRequestData( return ModelRequestData(
...@@ -128,8 +129,8 @@ def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData: ...@@ -128,8 +129,8 @@ def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
model=model_name, model=model_name,
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]}, hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
limit_mm_per_prompt={"image": 1},
) )
prompts = [ prompts = [
...@@ -154,7 +155,7 @@ def run_florence2(questions: list[str], modality: str) -> ModelRequestData: ...@@ -154,7 +155,7 @@ def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
max_num_seqs=2, max_num_seqs=2,
trust_remote_code=True, trust_remote_code=True,
dtype="bfloat16", dtype="bfloat16",
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, limit_mm_per_prompt={"image": 1},
) )
prompts = ["<MORE_DETAILED_CAPTION>" for _ in questions] prompts = ["<MORE_DETAILED_CAPTION>" for _ in questions]
...@@ -174,7 +175,7 @@ def run_fuyu(questions: list[str], modality: str) -> ModelRequestData: ...@@ -174,7 +175,7 @@ def run_fuyu(questions: list[str], modality: str) -> ModelRequestData:
model="adept/fuyu-8b", model="adept/fuyu-8b",
max_model_len=2048, max_model_len=2048,
max_num_seqs=2, max_num_seqs=2,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, limit_mm_per_prompt={"image": 1},
) )
return ModelRequestData( return ModelRequestData(
...@@ -193,7 +194,7 @@ def run_gemma3(questions: list[str], modality: str) -> ModelRequestData: ...@@ -193,7 +194,7 @@ def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
max_model_len=2048, max_model_len=2048,
max_num_seqs=2, max_num_seqs=2,
mm_processor_kwargs={"do_pan_and_scan": True}, mm_processor_kwargs={"do_pan_and_scan": True},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, limit_mm_per_prompt={"image": 1},
) )
prompts = [("<bos><start_of_turn>user\n" prompts = [("<bos><start_of_turn>user\n"
...@@ -218,7 +219,7 @@ def run_glm4v(questions: list[str], modality: str) -> ModelRequestData: ...@@ -218,7 +219,7 @@ def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
trust_remote_code=True, trust_remote_code=True,
enforce_eager=True, enforce_eager=True,
hf_overrides={"architectures": ["GLM4VForCausalLM"]}, hf_overrides={"architectures": ["GLM4VForCausalLM"]},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, limit_mm_per_prompt={"image": 1},
) )
prompts = [ prompts = [
...@@ -245,7 +246,7 @@ def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData: ...@@ -245,7 +246,7 @@ def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
max_model_len=8192, max_model_len=8192,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, limit_mm_per_prompt={"image": 1},
) )
tokenizer = AutoTokenizer.from_pretrained(model_name, tokenizer = AutoTokenizer.from_pretrained(model_name,
...@@ -286,7 +287,7 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData: ...@@ -286,7 +287,7 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
"longest_edge": 3 * 364 "longest_edge": 3 * 364
}, },
}, },
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, limit_mm_per_prompt={"image": 1},
) )
prompts = [( prompts = [(
f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:" f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:"
...@@ -298,6 +299,34 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData: ...@@ -298,6 +299,34 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
) )
# SmolVLM2-2.2B-Instruct
def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine arguments and prompts for SmolVLM2-2.2B-Instruct.

    Only the ``"image"`` modality is accepted; any other value trips the
    assertion below. One prompt is produced per entry in ``questions``,
    each embedding the question after a single ``<image>`` placeholder.

    Args:
        questions: Question strings to embed into the prompts.
        modality: Must be ``"image"``.

    Returns:
        A ``ModelRequestData`` carrying the engine arguments and the
        generated prompts.
    """
    assert modality == "image"

    # Settings forwarded as-is to the multimodal processor.
    # NOTE(review): presumably limits the longest image edge to 384 --
    # confirm against the model's processor documentation.
    processor_kwargs = {"max_image_size": {"longest_edge": 384}}

    engine_args = EngineArgs(
        model="HuggingFaceTB/SmolVLM2-2.2B-Instruct",
        max_model_len=8192,
        max_num_seqs=2,
        enforce_eager=True,
        mm_processor_kwargs=processor_kwargs,
        limit_mm_per_prompt={"image": 1},
    )

    prompts = []
    for question in questions:
        prompts.append(
            f"<|im_start|>User:<image>{question}<end_of_utterance>\nAssistant:"
        )

    return ModelRequestData(engine_args=engine_args, prompts=prompts)
# InternVL # InternVL
def run_internvl(questions: list[str], modality: str) -> ModelRequestData: def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image" assert modality == "image"
...@@ -308,7 +337,7 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData: ...@@ -308,7 +337,7 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
max_model_len=4096, max_model_len=4096,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, limit_mm_per_prompt={"image": 1},
) )
tokenizer = AutoTokenizer.from_pretrained(model_name, tokenizer = AutoTokenizer.from_pretrained(model_name,
...@@ -346,7 +375,7 @@ def run_llava(questions: list[str], modality: str) -> ModelRequestData: ...@@ -346,7 +375,7 @@ def run_llava(questions: list[str], modality: str) -> ModelRequestData:
engine_args = EngineArgs( engine_args = EngineArgs(
model="llava-hf/llava-1.5-7b-hf", model="llava-hf/llava-1.5-7b-hf",
max_model_len=4096, max_model_len=4096,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, limit_mm_per_prompt={"image": 1},
) )
return ModelRequestData( return ModelRequestData(
...@@ -363,7 +392,7 @@ def run_llava_next(questions: list[str], modality: str) -> ModelRequestData: ...@@ -363,7 +392,7 @@ def run_llava_next(questions: list[str], modality: str) -> ModelRequestData:
engine_args = EngineArgs( engine_args = EngineArgs(
model="llava-hf/llava-v1.6-mistral-7b-hf", model="llava-hf/llava-v1.6-mistral-7b-hf",
max_model_len=8192, max_model_len=8192,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, limit_mm_per_prompt={"image": 1},
) )
return ModelRequestData( return ModelRequestData(
...@@ -385,7 +414,7 @@ def run_llava_next_video(questions: list[str], ...@@ -385,7 +414,7 @@ def run_llava_next_video(questions: list[str],
model="llava-hf/LLaVA-NeXT-Video-7B-hf", model="llava-hf/LLaVA-NeXT-Video-7B-hf",
max_model_len=8192, max_model_len=8192,
max_num_seqs=2, max_num_seqs=2,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, limit_mm_per_prompt={"image": 1},
) )
return ModelRequestData( return ModelRequestData(
...@@ -413,7 +442,7 @@ def run_llava_onevision(questions: list[str], ...@@ -413,7 +442,7 @@ def run_llava_onevision(questions: list[str],
engine_args = EngineArgs( engine_args = EngineArgs(
model="llava-hf/llava-onevision-qwen2-7b-ov-hf", model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
max_model_len=16384, max_model_len=16384,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, limit_mm_per_prompt={"image": 1},
) )
return ModelRequestData( return ModelRequestData(
...@@ -436,7 +465,7 @@ def run_mantis(questions: list[str], modality: str) -> ModelRequestData: ...@@ -436,7 +465,7 @@ def run_mantis(questions: list[str], modality: str) -> ModelRequestData:
model="TIGER-Lab/Mantis-8B-siglip-llama3", model="TIGER-Lab/Mantis-8B-siglip-llama3",
max_model_len=4096, max_model_len=4096,
hf_overrides={"architectures": ["MantisForConditionalGeneration"]}, hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, limit_mm_per_prompt={"image": 1},
) )
stop_token_ids = [128009] stop_token_ids = [128009]
...@@ -477,7 +506,7 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name): ...@@ -477,7 +506,7 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name):
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
trust_remote_code=True, trust_remote_code=True,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, limit_mm_per_prompt={"image": 1},
) )
# NOTE The stop_token_ids are different for various versions of MiniCPM-V # NOTE The stop_token_ids are different for various versions of MiniCPM-V
# 2.0 # 2.0
...@@ -532,7 +561,7 @@ def run_mistral3(questions: list[str], modality: str) -> ModelRequestData: ...@@ -532,7 +561,7 @@ def run_mistral3(questions: list[str], modality: str) -> ModelRequestData:
max_model_len=8192, max_model_len=8192,
max_num_seqs=2, max_num_seqs=2,
tensor_parallel_size=2, tensor_parallel_size=2,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, limit_mm_per_prompt={"image": 1},
) )
prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions] prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
...@@ -556,9 +585,9 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData: ...@@ -556,9 +585,9 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
# The configuration below has been confirmed to launch on a single L40 GPU. # The configuration below has been confirmed to launch on a single L40 GPU.
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=4096, max_model_len=8192,
max_num_seqs=2, max_num_seqs=2,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, limit_mm_per_prompt={"image": 1},
) )
tokenizer = AutoTokenizer.from_pretrained(model_name) tokenizer = AutoTokenizer.from_pretrained(model_name)
...@@ -582,7 +611,7 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData: ...@@ -582,7 +611,7 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
) )
def run_llama4(questions: list[str], modality: str): def run_llama4(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image" assert modality == "image"
model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct" model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
...@@ -592,8 +621,8 @@ def run_llama4(questions: list[str], modality: str): ...@@ -592,8 +621,8 @@ def run_llama4(questions: list[str], modality: str):
max_model_len=8192, max_model_len=8192,
max_num_seqs=4, max_num_seqs=4,
tensor_parallel_size=8, tensor_parallel_size=8,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
gpu_memory_utilization=0.4, gpu_memory_utilization=0.4,
limit_mm_per_prompt={"image": 1},
) )
tokenizer = AutoTokenizer.from_pretrained(model_name) tokenizer = AutoTokenizer.from_pretrained(model_name)
...@@ -628,7 +657,7 @@ def run_molmo(questions: list[str], modality: str) -> ModelRequestData: ...@@ -628,7 +657,7 @@ def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
dtype="bfloat16", dtype="bfloat16",
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, limit_mm_per_prompt={"image": 1},
) )
prompts = [ prompts = [
...@@ -654,7 +683,7 @@ def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData: ...@@ -654,7 +683,7 @@ def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
trust_remote_code=True, trust_remote_code=True,
max_model_len=4096, max_model_len=4096,
tensor_parallel_size=4, tensor_parallel_size=4,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, limit_mm_per_prompt={"image": 1},
) )
tokenizer = AutoTokenizer.from_pretrained(model_name, tokenizer = AutoTokenizer.from_pretrained(model_name,
...@@ -681,7 +710,8 @@ def run_paligemma(questions: list[str], modality: str) -> ModelRequestData: ...@@ -681,7 +710,8 @@ def run_paligemma(questions: list[str], modality: str) -> ModelRequestData:
prompts = ["caption en" for _ in questions] prompts = ["caption en" for _ in questions]
engine_args = EngineArgs( engine_args = EngineArgs(
model="google/paligemma-3b-mix-224", model="google/paligemma-3b-mix-224",
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) limit_mm_per_prompt={"image": 1},
)
return ModelRequestData( return ModelRequestData(
engine_args=engine_args, engine_args=engine_args,
...@@ -697,7 +727,8 @@ def run_paligemma2(questions: list[str], modality: str) -> ModelRequestData: ...@@ -697,7 +727,8 @@ def run_paligemma2(questions: list[str], modality: str) -> ModelRequestData:
prompts = ["caption en" for _ in questions] prompts = ["caption en" for _ in questions]
engine_args = EngineArgs( engine_args = EngineArgs(
model="google/paligemma2-3b-ft-docci-448", model="google/paligemma2-3b-ft-docci-448",
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache) limit_mm_per_prompt={"image": 1},
)
return ModelRequestData( return ModelRequestData(
engine_args=engine_args, engine_args=engine_args,
...@@ -733,7 +764,7 @@ def run_phi3v(questions: list[str], modality: str) -> ModelRequestData: ...@@ -733,7 +764,7 @@ def run_phi3v(questions: list[str], modality: str) -> ModelRequestData:
max_num_seqs=2, max_num_seqs=2,
# Note - mm_processor_kwargs can also be passed to generate/chat calls # Note - mm_processor_kwargs can also be passed to generate/chat calls
mm_processor_kwargs={"num_crops": 16}, mm_processor_kwargs={"num_crops": 16},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, limit_mm_per_prompt={"image": 1},
) )
return ModelRequestData( return ModelRequestData(
...@@ -764,6 +795,7 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData: ...@@ -764,6 +795,7 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
max_num_seqs=2, max_num_seqs=2,
enable_lora=True, enable_lora=True,
max_lora_rank=320, max_lora_rank=320,
limit_mm_per_prompt={"image": 1},
) )
return ModelRequestData( return ModelRequestData(
...@@ -784,7 +816,7 @@ def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData: ...@@ -784,7 +816,7 @@ def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
model=model_name, model=model_name,
max_model_len=6144, max_model_len=6144,
max_num_seqs=2, max_num_seqs=2,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, limit_mm_per_prompt={"image": 1},
) )
prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions] prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
...@@ -805,7 +837,7 @@ def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData: ...@@ -805,7 +837,7 @@ def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
max_model_len=1024, max_model_len=1024,
max_num_seqs=2, max_num_seqs=2,
hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]}, hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, limit_mm_per_prompt={"image": 1},
) )
prompts = [f"{question}Picture 1: <img></img>\n" for question in questions] prompts = [f"{question}Picture 1: <img></img>\n" for question in questions]
...@@ -830,7 +862,7 @@ def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData: ...@@ -830,7 +862,7 @@ def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:
"min_pixels": 28 * 28, "min_pixels": 28 * 28,
"max_pixels": 1280 * 28 * 28, "max_pixels": 1280 * 28 * 28,
}, },
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, limit_mm_per_prompt={"image": 1},
) )
if modality == "image": if modality == "image":
...@@ -865,7 +897,7 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData: ...@@ -865,7 +897,7 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
"max_pixels": 1280 * 28 * 28, "max_pixels": 1280 * 28 * 28,
"fps": 1, "fps": 1,
}, },
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, limit_mm_per_prompt={"image": 1},
) )
if modality == "image": if modality == "image":
...@@ -896,7 +928,7 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData: ...@@ -896,7 +928,7 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
model=model_name, model=model_name,
trust_remote_code=True, trust_remote_code=True,
max_model_len=4096, max_model_len=4096,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache, limit_mm_per_prompt={"image": 1},
) )
tokenizer = AutoTokenizer.from_pretrained(model_name, tokenizer = AutoTokenizer.from_pretrained(model_name,
...@@ -955,6 +987,7 @@ model_example_map = { ...@@ -955,6 +987,7 @@ model_example_map = {
"qwen2_vl": run_qwen2_vl, "qwen2_vl": run_qwen2_vl,
"qwen2_5_vl": run_qwen2_5_vl, "qwen2_5_vl": run_qwen2_5_vl,
"skywork_chat": run_skyworkr1v, "skywork_chat": run_skyworkr1v,
"smolvlm": run_smolvlm,
} }
...@@ -1026,6 +1059,20 @@ def apply_image_repeat(image_repeat_prob, num_prompts, data, ...@@ -1026,6 +1059,20 @@ def apply_image_repeat(image_repeat_prob, num_prompts, data,
return inputs return inputs
@contextmanager
def time_counter(enable: bool):
    """Context manager that optionally reports how long its body took.

    Args:
        enable: When true, the time spent inside the ``with`` body is printed
            between two 50-dash separator lines once the body finishes
            normally. When false, the body runs without timing or output.

    Yields:
        None.
    """
    if not enable:
        yield
        return

    # Local import: the module is only needed when timing is enabled.
    import time

    # perf_counter() is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way a time.time() difference can.
    start_time = time.perf_counter()
    yield
    elapsed_time = time.perf_counter() - start_time
    print("-" * 50)
    print(f"-- generate time = {elapsed_time}")
    print("-" * 50)
def main(args): def main(args):
model = args.model_type model = args.model_type
if model not in model_example_map: if model not in model_example_map:
...@@ -1038,15 +1085,16 @@ def main(args): ...@@ -1038,15 +1085,16 @@ def main(args):
req_data = model_example_map[model](questions, modality) req_data = model_example_map[model](questions, modality)
engine_args = asdict(req_data.engine_args) | {"seed": args.seed} # Disable other modalities to save memory
llm = LLM(**engine_args) default_limits = {"image": 0, "video": 0, "audio": 0}
req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
req_data.engine_args.limit_mm_per_prompt or {})
# To maintain code compatibility in this script, we add LoRA here. engine_args = asdict(req_data.engine_args) | {
# You can also add LoRA using: "seed": args.seed,
# llm.generate(prompts, lora_request=lora_request,...) "disable_mm_preprocessor_cache": args.disable_mm_preprocessor_cache,
if req_data.lora_requests: }
for lora_request in req_data.lora_requests: llm = LLM(**engine_args)
llm.llm_engine.add_lora(lora_request=lora_request)
# Don't want to check the flag multiple times, so just hijack `prompts`. # Don't want to check the flag multiple times, so just hijack `prompts`.
prompts = req_data.prompts if args.use_different_prompt_per_request else [ prompts = req_data.prompts if args.use_different_prompt_per_request else [
...@@ -1084,19 +1132,22 @@ def main(args): ...@@ -1084,19 +1132,22 @@ def main(args):
}, },
} for i in range(args.num_prompts)] } for i in range(args.num_prompts)]
if args.time_generate: # Add LoRA request if applicable
import time lora_request = (req_data.lora_requests *
start_time = time.time() args.num_prompts if req_data.lora_requests else None)
outputs = llm.generate(inputs, sampling_params=sampling_params)
elapsed_time = time.time() - start_time
print("-- generate time = {}".format(elapsed_time))
else: with time_counter(args.time_generate):
outputs = llm.generate(inputs, sampling_params=sampling_params) outputs = llm.generate(
inputs,
sampling_params=sampling_params,
lora_request=lora_request,
)
print("-" * 50)
for o in outputs: for o in outputs:
generated_text = o.outputs[0].text generated_text = o.outputs[0].text
print(generated_text) print(generated_text)
print("-" * 50)
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -63,6 +63,7 @@ def run_e5_v(query: Query) -> ModelRequestData: ...@@ -63,6 +63,7 @@ def run_e5_v(query: Query) -> ModelRequestData:
model="royokong/e5-v", model="royokong/e5-v",
task="embed", task="embed",
max_model_len=4096, max_model_len=4096,
limit_mm_per_prompt={"image": 1},
) )
return ModelRequestData( return ModelRequestData(
...@@ -93,6 +94,7 @@ def run_vlm2vec(query: Query) -> ModelRequestData: ...@@ -93,6 +94,7 @@ def run_vlm2vec(query: Query) -> ModelRequestData:
task="embed", task="embed",
trust_remote_code=True, trust_remote_code=True,
mm_processor_kwargs={"num_crops": 4}, mm_processor_kwargs={"num_crops": 4},
limit_mm_per_prompt={"image": 1},
) )
return ModelRequestData( return ModelRequestData(
...@@ -131,6 +133,11 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]): ...@@ -131,6 +133,11 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
query = get_query(modality) query = get_query(modality)
req_data = model_example_map[model](query) req_data = model_example_map[model](query)
# Disable other modalities to save memory
default_limits = {"image": 0, "video": 0, "audio": 0}
req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
req_data.engine_args.limit_mm_per_prompt or {})
engine_args = asdict(req_data.engine_args) | {"seed": seed} engine_args = asdict(req_data.engine_args) | {"seed": seed}
llm = LLM(**engine_args) llm = LLM(**engine_args)
...@@ -143,8 +150,10 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]): ...@@ -143,8 +150,10 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
"multi_modal_data": mm_data, "multi_modal_data": mm_data,
}) })
print("-" * 50)
for output in outputs: for output in outputs:
print(output.outputs.embedding) print(output.outputs.embedding)
print("-" * 50)
def main(args: Namespace): def main(args: Namespace):
......
...@@ -22,6 +22,16 @@ QUESTION = "What is the content of each image?" ...@@ -22,6 +22,16 @@ QUESTION = "What is the content of each image?"
IMAGE_URLS = [ IMAGE_URLS = [
"https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg", "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg",
"https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg", "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg",
"https://upload.wikimedia.org/wikipedia/commons/2/26/Ultramarine_Flycatcher_%28Ficedula_superciliaris%29_Naggar%2C_Himachal_Pradesh%2C_2013_%28cropped%29.JPG",
"https://upload.wikimedia.org/wikipedia/commons/thumb/e/e5/Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg/2560px-Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg",
"https://upload.wikimedia.org/wikipedia/commons/d/d4/Starfish%2C_Caswell_Bay_-_geograph.org.uk_-_409413.jpg",
"https://upload.wikimedia.org/wikipedia/commons/6/69/Grapevinesnail_01.jpg",
"https://upload.wikimedia.org/wikipedia/commons/thumb/0/0b/Texas_invasive_Musk_Thistle_1.jpg/1920px-Texas_invasive_Musk_Thistle_1.jpg",
"https://upload.wikimedia.org/wikipedia/commons/thumb/7/7a/Huskiesatrest.jpg/2880px-Huskiesatrest.jpg",
"https://upload.wikimedia.org/wikipedia/commons/thumb/6/68/Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg/1920px-Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg",
"https://upload.wikimedia.org/wikipedia/commons/3/30/George_the_amazing_guinea_pig.jpg",
"https://upload.wikimedia.org/wikipedia/commons/thumb/1/1f/Oryctolagus_cuniculus_Rcdo.jpg/1920px-Oryctolagus_cuniculus_Rcdo.jpg",
"https://upload.wikimedia.org/wikipedia/commons/9/98/Horse-and-pony.jpg",
] ]
...@@ -217,6 +227,33 @@ def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData: ...@@ -217,6 +227,33 @@ def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData:
) )
def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build the multi-image request data for the SmolVLM2 example.

    Args:
        question: Text question placed after the image placeholders.
        image_urls: URLs of the images to fetch. One numbered ``<image>``
            placeholder is emitted per URL, and the per-prompt image limit
            is set to the number of URLs.

    Returns:
        A ``ModelRequestData`` holding the engine arguments, the formatted
        prompt, and the fetched images.
    """
    num_images = len(image_urls)

    # The configuration below has been confirmed to launch on a single L40 GPU.
    engine_args = EngineArgs(
        model="HuggingFaceTB/SmolVLM2-2.2B-Instruct",
        max_model_len=8192,
        max_num_seqs=16,
        enforce_eager=True,
        limit_mm_per_prompt={"image": num_images},
        mm_processor_kwargs={"max_image_size": {"longest_edge": 384}},
    )

    # One placeholder line per image, numbered starting at 1.
    image_tags = [
        f"Image-{idx}: <image>\n" for idx in range(1, num_images + 1)
    ]
    placeholders = "\n".join(image_tags)
    prompt = f"<|im_start|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:"  # noqa: E501

    images = list(map(fetch_image, image_urls))
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )
def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData: def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "OpenGVLab/InternVL2-2B" model_name = "OpenGVLab/InternVL2-2B"
...@@ -258,8 +295,7 @@ def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData: ...@@ -258,8 +295,7 @@ def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=8192, max_model_len=131072,
max_num_seqs=4,
tensor_parallel_size=8, tensor_parallel_size=8,
limit_mm_per_prompt={"image": len(image_urls)}, limit_mm_per_prompt={"image": len(image_urls)},
) )
...@@ -318,8 +354,8 @@ def load_mllama(question: str, image_urls: list[str]) -> ModelRequestData: ...@@ -318,8 +354,8 @@ def load_mllama(question: str, image_urls: list[str]) -> ModelRequestData:
# The configuration below has been confirmed to launch on a single L40 GPU. # The configuration below has been confirmed to launch on a single L40 GPU.
engine_args = EngineArgs( engine_args = EngineArgs(
model=model_name, model=model_name,
max_model_len=4096, max_model_len=8192,
max_num_seqs=16, max_num_seqs=2,
limit_mm_per_prompt={"image": len(image_urls)}, limit_mm_per_prompt={"image": len(image_urls)},
) )
...@@ -614,6 +650,7 @@ model_example_map = { ...@@ -614,6 +650,7 @@ model_example_map = {
"qwen_vl_chat": load_qwen_vl_chat, "qwen_vl_chat": load_qwen_vl_chat,
"qwen2_vl": load_qwen2_vl, "qwen2_vl": load_qwen2_vl,
"qwen2_5_vl": load_qwen2_5_vl, "qwen2_5_vl": load_qwen2_5_vl,
"smolvlm": load_smolvlm,
} }
...@@ -624,15 +661,8 @@ def run_generate(model, question: str, image_urls: list[str], ...@@ -624,15 +661,8 @@ def run_generate(model, question: str, image_urls: list[str],
engine_args = asdict(req_data.engine_args) | {"seed": args.seed} engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
llm = LLM(**engine_args) llm = LLM(**engine_args)
# To maintain code compatibility in this script, we add LoRA here.
# You can also add LoRA using:
# llm.generate(prompts, lora_request=lora_request,...)
if req_data.lora_requests:
for lora_request in req_data.lora_requests:
llm.llm_engine.add_lora(lora_request=lora_request)
sampling_params = SamplingParams(temperature=0.0, sampling_params = SamplingParams(temperature=0.0,
max_tokens=128, max_tokens=256,
stop_token_ids=req_data.stop_token_ids) stop_token_ids=req_data.stop_token_ids)
outputs = llm.generate( outputs = llm.generate(
...@@ -642,29 +672,31 @@ def run_generate(model, question: str, image_urls: list[str], ...@@ -642,29 +672,31 @@ def run_generate(model, question: str, image_urls: list[str],
"image": req_data.image_data "image": req_data.image_data
}, },
}, },
sampling_params=sampling_params) sampling_params=sampling_params,
lora_request=req_data.lora_requests,
)
print("-" * 50)
for o in outputs: for o in outputs:
generated_text = o.outputs[0].text generated_text = o.outputs[0].text
print(generated_text) print(generated_text)
print("-" * 50)
def run_chat(model: str, question: str, image_urls: list[str], def run_chat(model: str, question: str, image_urls: list[str],
seed: Optional[int]): seed: Optional[int]):
req_data = model_example_map[model](question, image_urls) req_data = model_example_map[model](question, image_urls)
# Disable other modalities to save memory
default_limits = {"image": 0, "video": 0, "audio": 0}
req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
req_data.engine_args.limit_mm_per_prompt or {})
engine_args = asdict(req_data.engine_args) | {"seed": seed} engine_args = asdict(req_data.engine_args) | {"seed": seed}
llm = LLM(**engine_args) llm = LLM(**engine_args)
# To maintain code compatibility in this script, we add LoRA here.
# You can also add LoRA using:
# llm.generate(prompts, lora_request=lora_request,...)
if req_data.lora_requests:
for lora_request in req_data.lora_requests:
llm.llm_engine.add_lora(lora_request=lora_request)
sampling_params = SamplingParams(temperature=0.0, sampling_params = SamplingParams(temperature=0.0,
max_tokens=128, max_tokens=256,
stop_token_ids=req_data.stop_token_ids) stop_token_ids=req_data.stop_token_ids)
outputs = llm.chat( outputs = llm.chat(
[{ [{
...@@ -685,11 +717,14 @@ def run_chat(model: str, question: str, image_urls: list[str], ...@@ -685,11 +717,14 @@ def run_chat(model: str, question: str, image_urls: list[str],
}], }],
sampling_params=sampling_params, sampling_params=sampling_params,
chat_template=req_data.chat_template, chat_template=req_data.chat_template,
lora_request=req_data.lora_requests,
) )
print("-" * 50)
for o in outputs: for o in outputs:
generated_text = o.outputs[0].text generated_text = o.outputs[0].text
print(generated_text) print(generated_text)
print("-" * 50)
def main(args: Namespace): def main(args: Namespace):
...@@ -697,10 +732,12 @@ def main(args: Namespace): ...@@ -697,10 +732,12 @@ def main(args: Namespace):
method = args.method method = args.method
seed = args.seed seed = args.seed
image_urls = IMAGE_URLS[:args.num_images]
if method == "generate": if method == "generate":
run_generate(model, QUESTION, IMAGE_URLS, seed) run_generate(model, QUESTION, image_urls, seed)
elif method == "chat": elif method == "chat":
run_chat(model, QUESTION, IMAGE_URLS, seed) run_chat(model, QUESTION, image_urls, seed)
else: else:
raise ValueError(f"Invalid method: {method}") raise ValueError(f"Invalid method: {method}")
...@@ -725,6 +762,12 @@ if __name__ == "__main__": ...@@ -725,6 +762,12 @@ if __name__ == "__main__":
type=int, type=int,
default=None, default=None,
help="Set the seed when initializing `vllm.LLM`.") help="Set the seed when initializing `vllm.LLM`.")
parser.add_argument(
    "--num-images",
    "-n",
    # argparse checks `choices` after applying `type`; without type=int a
    # value given on the command line stays a string, never matches the
    # integer choices, and could not be used to slice the image list.
    type=int,
    choices=list(range(1, 13)),  # 12 is the max number of images
    default=2,
    help="Number of images to use for the demo.")
args = parser.parse_args() args = parser.parse_args()
main(args) main(args)
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
"""Example Python client for `vllm.entrypoints.api_server` """Example Python client for `vllm.entrypoints.api_server`
Start the demo server:
python -m vllm.entrypoints.api_server --model <model_name>
NOTE: The API server is used only for demonstration and simple performance NOTE: The API server is used only for demonstration and simple performance
benchmarks. It is not intended for production use. benchmarks. It is not intended for production use.
For production use, we recommend `vllm serve` and the OpenAI client API. For production use, we recommend `vllm serve` and the OpenAI client API.
...@@ -7,6 +10,7 @@ For production use, we recommend `vllm serve` and the OpenAI client API. ...@@ -7,6 +10,7 @@ For production use, we recommend `vllm serve` and the OpenAI client API.
import argparse import argparse
import json import json
from argparse import Namespace
from collections.abc import Iterable from collections.abc import Iterable
import requests import requests
...@@ -27,7 +31,6 @@ def post_http_request(prompt: str, ...@@ -27,7 +31,6 @@ def post_http_request(prompt: str,
pload = { pload = {
"prompt": prompt, "prompt": prompt,
"n": n, "n": n,
"use_beam_search": True,
"temperature": 0.0, "temperature": 0.0,
"max_tokens": 16, "max_tokens": 16,
"stream": stream, "stream": stream,
...@@ -55,14 +58,7 @@ def get_response(response: requests.Response) -> list[str]: ...@@ -55,14 +58,7 @@ def get_response(response: requests.Response) -> list[str]:
return output return output
if __name__ == "__main__": def main(args: Namespace):
parser = argparse.ArgumentParser()
parser.add_argument("--host", type=str, default="localhost")
parser.add_argument("--port", type=int, default=8000)
parser.add_argument("--n", type=int, default=4)
parser.add_argument("--prompt", type=str, default="San Francisco is a")
parser.add_argument("--stream", action="store_true")
args = parser.parse_args()
prompt = args.prompt prompt = args.prompt
api_url = f"http://{args.host}:{args.port}/generate" api_url = f"http://{args.host}:{args.port}/generate"
n = args.n n = args.n
...@@ -83,3 +79,14 @@ if __name__ == "__main__": ...@@ -83,3 +79,14 @@ if __name__ == "__main__":
output = get_response(response) output = get_response(response)
for i, line in enumerate(output): for i, line in enumerate(output):
print(f"Beam candidate {i}: {line!r}", flush=True) print(f"Beam candidate {i}: {line!r}", flush=True)
if __name__ == "__main__":
    # Command-line entry point: collect the server address and request
    # settings, then hand the parsed namespace to main().
    parser = argparse.ArgumentParser()
    option_specs = (
        ("--host", {"type": str, "default": "localhost"}),
        ("--port", {"type": int, "default": 8000}),
        ("--n", {"type": int, "default": 1}),
        ("--prompt", {"type": str, "default": "San Francisco is a"}),
        ("--stream", {"action": "store_true"}),
    )
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    args = parser.parse_args()
    main(args)
...@@ -23,7 +23,7 @@ def sync_openai(): ...@@ -23,7 +23,7 @@ def sync_openai():
with open(str(mary_had_lamb), "rb") as f: with open(str(mary_had_lamb), "rb") as f:
transcription = client.audio.transcriptions.create( transcription = client.audio.transcriptions.create(
file=f, file=f,
model="openai/whisper-small", model="openai/whisper-large-v3",
language="en", language="en",
response_format="json", response_format="json",
temperature=0.0) temperature=0.0)
......
{%- for message in messages -%}
{%- if message['role'] == 'user' -%}
{{- message['content'] -}}
{%- elif message['role'] == 'assistant' -%}
{{- message['content'] -}}
{%- endif -%}
{%- endfor -%}
...@@ -76,7 +76,7 @@ ...@@ -76,7 +76,7 @@
{{- tool_call.name + '(' -}} {{- tool_call.name + '(' -}}
{%- for param in tool_call.arguments %} {%- for param in tool_call.arguments %}
{{- param + '=' -}} {{- param + '=' -}}
{{- "%sr" | format(tool_call.arguments[param]) -}} {{- "%s" | format(tool_call.arguments[param]) -}}
{% if not loop.last %}, {% endif %} {% if not loop.last %}, {% endif %}
{%- endfor %} {%- endfor %}
{{- ')' -}} {{- ')' -}}
......
{{- bos_token }}
{%- if custom_tools is defined %}
{%- set tools = custom_tools %}
{%- endif %}
{%- if not tools_in_user_message is defined %}
{%- set tools_in_user_message = false %}
{%- endif %}
{%- if not tools is defined %}
{%- set tools = none %}
{%- endif %}
{#- This block extracts the system message, so we can slot it into the right place. #}
{%- if messages[0]['role'] == 'system' %}
{%- if messages[0]['content'] is string %}
{%- set system_message = messages[0]['content']|trim %}
{%- else %}
{%- set system_message = messages[0]['content'][0]['text']|trim %}
{%- endif %}
{%- set messages = messages[1:] %}
{%- else %}
{%- if tools is not none %}
{#- Add default tool system message when tools are provided #}
{%- set system_message = "You are a helpful assistant with tool calling "
"capabilities. Only reply with a tool call if the function exists in the "
"library provided by the user. If it doesn't exist, just reply directly in "
"natural language. When you receive a tool call response, use the output to "
"format an answer to the original user question." %}
{%- else %}
{%- set system_message = "" %}
{%- endif %}
{%- endif %}
{#- System message if the user supplied one, or if tools are used (default tool system message) #}
{%- if system_message %}
{#- always use user provided system message to override default tool system message #}
{{- "<|header_start|>system<|header_end|>\n\n" }}
{{- system_message }}
{%- if tools is not none and not tools_in_user_message %}
{{- "Tools: You have access to the following tools. You might need to use one "
"or more function/tool calls to fulfill the task. \n"
"If none are needed, then proceed to the response.\n\n"
"Tool Call Syntax: You can call tools using the following syntax:\n"
"[func_name1(params_name1=params_value1, params_name2=params_value2, ...), ...]\n"
"Do not include anything else when calling the tools with the syntax above.\n\n"
"Here is a list of functions in JSON format that you can invoke.\n " }}
{%- for t in tools %}
{{- t | tojson(indent=4) }}
{{- "\n\n" }}
{%- endfor %}
{%- endif %}
{{- "<|eot|>" }}
{%- endif %}
{#- Custom tools are passed in a user message with some extra guidance #}
{%- if tools_in_user_message and tools is not none %}
{#- Extract the first user message so we can plug it in here #}
{%- if messages | length != 0 %}
{%- if messages[0]['content'] is string %}
{%- set first_user_message = messages[0]['content']|trim %}
{%- else %}
{%- set first_user_message = messages[0]['content'] | selectattr('type', 'equalto', 'text') | map(attribute='text') | map('trim') | join('\n') %}
{%- endif %}
{%- set messages = messages[1:] %}
{%- else %}
{{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
{%- endif %}
{{- '<|header_start|>user<|header_end|>\n\n' -}}
{{- first_user_message}}
{{- "\nHere is a list of functions in JSON format that you can invoke:"}}
{%- for t in tools %}
{{- t | tojson(indent=4) }}
{{- "\n\n" }}
{%- endfor %}
{{- "Should you decide to return the function call(s), put them in the format "
"of [func_name1(params_name1=params_value1, params_name2=params_value2, "
"...), ...]\nDo not include anything else when calling the tools with the "
"syntax above." }}
{%- endif %}
{%- for message in messages %}
{%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
{{- '<|header_start|>' + message['role'] + '<|header_end|>\n\n' }}
{%- if message['content'] is string %}
{{- message['content'] }}
{%- else %}
{%- for content in message['content'] %}
{%- if content['type'] == 'image' %}
{{- '<|image|>' }}
{%- elif content['type'] == 'text' %}
{{- content['text'] | trim }}
{%- endif %}
{%- endfor %}
{%- endif %}
{{- "<|eot|>" }}
{%- elif 'tool_calls' in message and message.tool_calls|length > 0 %}
{%- set tool_call = message.tool_calls[0].function %}
{{- '<|header_start|>assistant<|header_end|>\n\n' -}}
{%- if message['content'] is string %}
{{- message['content'] }}
{%- else %}
{%- for content in message['content'] %}
{%- if content['type'] == 'image' %}
{{- '<|image|>' }}
{%- elif content['type'] == 'text' %}
{{- content['text'] }}
{%- endif %}
{%- endfor %}
{%- endif %}
{%- for tool_call in message.tool_calls %}
{%- if tool_call.function is defined %}
{%- set tool_call = tool_call.function %}
{%- endif %}
{{- tool_call.name + '(' -}}
{%- for param in tool_call.arguments %}
{{- param + '=' -}}
{{- "%s" | format(tool_call.arguments[param]) -}}
{% if not loop.last %}, {% endif %}
{%- endfor %}
{{- ')' -}}
{% if not loop.last %}, {% endif %}
{%- endfor %}
{{- "<|eom|>" }}
{%- elif message.role == "tool" or message.role == "ipython" %}
{{- "<|header_start|>ipython<|header_end|>\n\n" }}
{%- if message.content is string %}
{{- message.content | tojson }}
{%- else %}
{%- for content in message['content'] %}
{%- if content['type'] == 'text' %}
{{- content['text'] | tojson }}
{%- endif %}
{%- endfor %}
{%- endif %}
{{- "<|eom|>" }}
{%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
{{- '<|header_start|>assistant<|header_end|>\n\n' }}
{%- endif %}
...@@ -44,7 +44,7 @@ ...@@ -44,7 +44,7 @@
{{- tool_call.name + '(' -}} {{- tool_call.name + '(' -}}
{%- for param in tool_call.arguments %} {%- for param in tool_call.arguments %}
{{- param + '=' -}} {{- param + '=' -}}
{{- "%sr" | format(tool_call.arguments[param]) -}} {{- "%s" | format(tool_call.arguments[param]) -}}
{% if not loop.last %}, {% endif %} {% if not loop.last %}, {% endif %}
{%- endfor %} {%- endfor %}
{{- ')' -}} {{- ')' -}}
......
...@@ -30,7 +30,7 @@ classifiers = [ ...@@ -30,7 +30,7 @@ classifiers = [
"Topic :: Scientific/Engineering :: Artificial Intelligence", "Topic :: Scientific/Engineering :: Artificial Intelligence",
"Topic :: Scientific/Engineering :: Information Analysis", "Topic :: Scientific/Engineering :: Information Analysis",
] ]
requires-python = ">=3.9" requires-python = ">=3.9,<3.13"
dynamic = [ "version", "dependencies", "optional-dependencies"] dynamic = [ "version", "dependencies", "optional-dependencies"]
[project.urls] [project.urls]
......
...@@ -6,7 +6,7 @@ requests >= 2.26.0 ...@@ -6,7 +6,7 @@ requests >= 2.26.0
tqdm tqdm
blake3 blake3
py-cpuinfo py-cpuinfo
transformers >= 4.51.0 transformers >= 4.51.1
huggingface-hub[hf_xet] >= 0.30.0 # Required for Xet downloads. huggingface-hub[hf_xet] >= 0.30.0 # Required for Xet downloads.
tokenizers >= 0.19.1 # Required for Llama 3. tokenizers >= 0.19.1 # Required for Llama 3.
protobuf # Required by LlamaTokenizer. protobuf # Required by LlamaTokenizer.
...@@ -22,13 +22,13 @@ lm-format-enforcer >= 0.10.11, < 0.11 ...@@ -22,13 +22,13 @@ lm-format-enforcer >= 0.10.11, < 0.11
llguidance >= 0.7.9, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64" llguidance >= 0.7.9, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64"
outlines == 0.1.11 outlines == 0.1.11
lark == 1.2.2 lark == 1.2.2
xgrammar == 0.1.17; platform_machine == "x86_64" or platform_machine == "aarch64" xgrammar == 0.1.18; platform_machine == "x86_64" or platform_machine == "aarch64"
typing_extensions >= 4.10 typing_extensions >= 4.10
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317 filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
partial-json-parser # used for parsing partial JSON outputs partial-json-parser # used for parsing partial JSON outputs
pyzmq pyzmq
msgspec msgspec
gguf == 0.10.0 gguf >= 0.13.0
importlib_metadata importlib_metadata
mistral_common[opencv] >= 1.5.4 # requires numpy>=1.25 mistral_common[opencv] >= 1.5.4 # requires numpy>=1.25
opencv-python-headless >= 4.11.0 # required for video IO opencv-python-headless >= 4.11.0 # required for video IO
...@@ -36,10 +36,14 @@ pyyaml ...@@ -36,10 +36,14 @@ pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
einops # Required for Qwen2-VL. einops # Required for Qwen2-VL.
compressed-tensors == 0.9.2 # required for compressed-tensors compressed-tensors == 0.9.3 # required for compressed-tensors
depyf==0.18.0 # required for profiling and debugging with compilation config depyf==0.18.0 # required for profiling and debugging with compilation config
cloudpickle # allows pickling lambda functions in model_executor/models/registry.py cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
watchfiles # required for http server to monitor the updates of TLS files watchfiles # required for http server to monitor the updates of TLS files
python-json-logger # Used by logging as per examples/other/logging_configuration.md python-json-logger # Used by logging as per examples/other/logging_configuration.md
scipy # Required for phi-4-multimodal-instruct scipy # Required for phi-4-multimodal-instruct
ninja # Required for xgrammar, rocm, tpu, xpu ninja # Required for xgrammar, rocm, tpu, xpu
opentelemetry-sdk>=1.26.0,<1.27.0 # vllm.tracing
opentelemetry-api>=1.26.0,<1.27.0 # vllm.tracing
opentelemetry-exporter-otlp>=1.26.0,<1.27.0 # vllm.tracing
opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0 # vllm.tracing
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment