Commit 31330101 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.8.4' into v0.8.4-dev

parents e8933c34 dc1b4a6f
......@@ -22,31 +22,40 @@ prompts = [
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create an LLM.
llm = LLM(
model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
max_num_seqs=8,
# The max_model_len and block_size arguments are required to be same as
# max sequence length when targeting neuron device.
# Currently, this is a known limitation in continuous batching support
# in transformers-neuronx.
# TODO(liangfu): Support paged-attention in transformers-neuronx.
max_model_len=2048,
block_size=2048,
# The device can be automatically detected when AWS Neuron SDK is installed.
# The device argument can be either unspecified for automated detection,
# or explicitly assigned.
device="neuron",
quantization="neuron_quant",
override_neuron_config={
"cast_logits_dtype": "bfloat16",
},
tensor_parallel_size=2)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
def main():
    """Generate completions for the sample prompts on a Neuron device.

    Builds a TinyLlama engine with ``device="neuron"``, runs the module-level
    ``prompts`` with ``sampling_params`` and prints every prompt/completion
    pair, separated by dashed lines.
    """
    # ruff: noqa: E501
    engine_options = dict(
        model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        max_num_seqs=8,
        # The max_model_len and block_size arguments are required to be same as
        # max sequence length when targeting neuron device.
        # Currently, this is a known limitation in continuous batching support
        # in transformers-neuronx.
        # TODO(liangfu): Support paged-attention in transformers-neuronx.
        max_model_len=2048,
        block_size=2048,
        # The device can be automatically detected when AWS Neuron SDK is installed.
        # The device argument can be either unspecified for automated detection,
        # or explicitly assigned.
        device="neuron",
        quantization="neuron_quant",
        override_neuron_config={
            "cast_logits_dtype": "bfloat16",
        },
        tensor_parallel_size=2,
    )
    # Create an LLM.
    llm = LLM(**engine_options)

    # Generate texts from the prompts. The result is a list of RequestOutput
    # objects that contain the prompt, generated text, and other information.
    request_outputs = llm.generate(prompts, sampling_params)

    # Print the outputs, one dashed separator after each result.
    separator = "-" * 50
    print(separator)
    for request_output in request_outputs:
        prompt = request_output.prompt
        generated_text = request_output.outputs[0].text
        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
        print(separator)


if __name__ == "__main__":
    main()
......@@ -31,55 +31,62 @@ generating_prompts = [prefix + prompt for prompt in prompts]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.0)
# Create an LLM without prefix caching as a baseline.
regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.4)
print("Results without `enable_prefix_caching`")
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = regular_llm.generate(generating_prompts, sampling_params)
regular_generated_texts = []
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
regular_generated_texts.append(generated_text)
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
print("-" * 80)
# Destroy the LLM object and free up the GPU memory.
del regular_llm
cleanup_dist_env_and_memory()
# Create an LLM with prefix caching enabled.
prefix_cached_llm = LLM(model="facebook/opt-125m",
enable_prefix_caching=True,
gpu_memory_utilization=0.4)
# Warmup so that the shared prompt's KV cache is computed.
prefix_cached_llm.generate(generating_prompts[0], sampling_params)
# Generate with prefix caching.
outputs = prefix_cached_llm.generate(generating_prompts, sampling_params)
print("Results with `enable_prefix_caching`")
cached_generated_texts = []
# Print the outputs. You should see the same outputs as before.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
cached_generated_texts.append(generated_text)
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
print("-" * 80)
# Compare the results and display the speedup
generated_same = all([
regular_generated_texts[i] == cached_generated_texts[i]
for i in range(len(prompts))
])
print(f"Generated answers are the same: {generated_same}")
def _print_and_collect(outputs):
    """Print each prompt/completion pair and return the completions in order."""
    generated_texts = []
    print("-" * 50)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        generated_texts.append(generated_text)
        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
        print("-" * 50)
    return generated_texts


def main():
    """Compare generations with and without automatic prefix caching.

    Runs ``generating_prompts`` through a baseline engine, frees it, runs the
    same prompts through an engine created with ``enable_prefix_caching=True``
    (after one warmup request), and reports whether both runs produced
    identical text.
    """
    # ruff: noqa: E501
    # Create an LLM without prefix caching as a baseline.
    regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.4)

    print("Results without `enable_prefix_caching`")

    # Generate texts from the prompts. The output is a list of RequestOutput
    # objects that contain the prompt, generated text, and other information.
    outputs = regular_llm.generate(generating_prompts, sampling_params)
    regular_generated_texts = _print_and_collect(outputs)

    # Destroy the LLM object and free up the GPU memory.
    del regular_llm
    cleanup_dist_env_and_memory()

    # Create an LLM with prefix caching enabled.
    prefix_cached_llm = LLM(model="facebook/opt-125m",
                            enable_prefix_caching=True,
                            gpu_memory_utilization=0.4)

    # Warmup so that the shared prompt's KV cache is computed.
    prefix_cached_llm.generate(generating_prompts[0], sampling_params)

    # Generate with prefix caching.
    outputs = prefix_cached_llm.generate(generating_prompts, sampling_params)

    print("Results with `enable_prefix_caching`")

    # Print the outputs. You should see the same outputs as before.
    cached_generated_texts = _print_and_collect(outputs)

    # Compare the two runs pairwise. Only equality of the generated text is
    # checked here; no timing or speedup is measured.
    generated_same = all(
        regular_text == cached_text for regular_text, cached_text in zip(
            regular_generated_texts, cached_generated_texts))
    print(f"Generated answers are the same: {generated_same}")


if __name__ == "__main__":
    main()
......@@ -234,9 +234,8 @@ def run_profile(context: ProfileContext, csv_output: Optional[str],
sampling_params.max_tokens = next(output_len_generator)
assert isinstance(sampling_params.max_tokens, int)
prompt_token_ids = torch.randint(
llm.llm_engine.model_config.get_vocab_size(),
size=(prompt_len, )).tolist()
prompt_token_ids = torch.randint(llm.get_tokenizer().vocab_size,
size=(prompt_len, )).tolist()
llm.llm_engine.add_request(
request_id=f"seq{i}",
......
......@@ -19,8 +19,6 @@ SEED = 42
# because it is almost impossible to make the scheduling deterministic in the
# online serving setting.
llm = LLM(model="facebook/opt-125m", seed=SEED)
prompts = [
"Hello, my name is",
"The president of the United States is",
......@@ -29,8 +27,17 @@ prompts = [
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
def main():
    """Generate from the sample prompts with a fixed seed and print the results."""
    # The engine is seeded with the module-level SEED.
    engine = LLM(model="facebook/opt-125m", seed=SEED)
    request_outputs = engine.generate(prompts, sampling_params)

    separator = "-" * 50
    print(separator)
    for request_output in request_outputs:
        prompt = request_output.prompt
        generated_text = request_output.outputs[0].text
        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
        print(separator)


if __name__ == "__main__":
    main()
......@@ -85,11 +85,13 @@ sampling_params = SamplingParams(temperature=0)
outputs = ray.get(llm.generate.remote(prompts, sampling_params))
print("-" * 50)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, "
print(f"Prompt: {prompt!r}\n"
f"Generated text: {generated_text!r}")
print("-" * 50)
# set up the communication between the training process
# and the inference engine.
......@@ -120,8 +122,10 @@ assert all(ray.get(llm.collective_rpc.remote("check_weights_changed")))
# use the updated model to generate texts, they will be nonsense
# because the weights are all zeros.
outputs_updated = ray.get(llm.generate.remote(prompts, sampling_params))
print("-" * 50)
for output in outputs_updated:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, "
print(f"Prompt: {prompt!r}\n"
f"Generated text: {generated_text!r}")
print("-" * 50)
......@@ -32,10 +32,12 @@ if __name__ == "__main__":
llm.stop_profile()
# Print the outputs.
print("-" * 50)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
print("-" * 50)
# Add a buffer to wait for profiler in the background process
# (in case MP is on) to finish writing profiling output.
......
# SPDX-License-Identifier: Apache-2.0
"""
This file demonstrates the example usage of guided decoding
to generate structured outputs using vLLM. It shows how to apply
different guided decoding techniques such as Choice, Regex, JSON schema,
and Grammar to produce structured and formatted results
based on specific prompts.
"""
from enum import Enum
......@@ -7,26 +14,21 @@ from pydantic import BaseModel
from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams
llm = LLM(model="Qwen/Qwen2.5-3B-Instruct", max_model_len=100)
# Guided decoding by Choice (list of possible options)
guided_decoding_params = GuidedDecodingParams(choice=["Positive", "Negative"])
sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
outputs = llm.generate(
prompts="Classify this sentiment: vLLM is wonderful!",
sampling_params=sampling_params,
)
print(outputs[0].outputs[0].text)
guided_decoding_params_choice = GuidedDecodingParams(
choice=["Positive", "Negative"])
sampling_params_choice = SamplingParams(
guided_decoding=guided_decoding_params_choice)
prompt_choice = "Classify this sentiment: vLLM is wonderful!"
# Guided decoding by Regex
guided_decoding_params = GuidedDecodingParams(regex="\w+@\w+\.com\n")
sampling_params = SamplingParams(guided_decoding=guided_decoding_params,
stop=["\n"])
prompt = ("Generate an email address for Alan Turing, who works in Enigma."
"End in .com and new line. Example result:"
"alan.turing@enigma.com\n")
outputs = llm.generate(prompts=prompt, sampling_params=sampling_params)
print(outputs[0].outputs[0].text)
guided_decoding_params_regex = GuidedDecodingParams(regex=r"\w+@\w+\.com\n")
sampling_params_regex = SamplingParams(
guided_decoding=guided_decoding_params_regex, stop=["\n"])
prompt_regex = (
"Generate an email address for Alan Turing, who works in Enigma."
"End in .com and new line. Example result:"
"alan.turing@enigma.com\n")
# Guided decoding by JSON using Pydantic schema
......@@ -44,37 +46,54 @@ class CarDescription(BaseModel):
json_schema = CarDescription.model_json_schema()
guided_decoding_params = GuidedDecodingParams(json=json_schema)
sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
prompt = ("Generate a JSON with the brand, model and car_type of"
"the most iconic car from the 90's")
outputs = llm.generate(
prompts=prompt,
sampling_params=sampling_params,
)
print(outputs[0].outputs[0].text)
guided_decoding_params_json = GuidedDecodingParams(json=json_schema)
sampling_params_json = SamplingParams(
guided_decoding=guided_decoding_params_json)
prompt_json = ("Generate a JSON with the brand, model and car_type of"
"the most iconic car from the 90's")
# Guided decoding by Grammar
simplified_sql_grammar = """
?start: select_statement
root ::= select_statement
select_statement ::= "SELECT " column " from " table " where " condition
column ::= "col_1 " | "col_2 "
table ::= "table_1 " | "table_2 "
condition ::= column "= " number
number ::= "1 " | "2 "
"""
guided_decoding_params_grammar = GuidedDecodingParams(
grammar=simplified_sql_grammar)
sampling_params_grammar = SamplingParams(
guided_decoding=guided_decoding_params_grammar)
prompt_grammar = ("Generate an SQL query to show the 'username' and 'email'"
"from the 'users' table.")
?select_statement: "SELECT " column_list " FROM " table_name
?column_list: column_name ("," column_name)*
def format_output(title: str, output: str):
    """Print ``title: output`` framed above and below by a 50-dash line."""
    separator = "-" * 50
    print(f"{separator}\n{title}: {output}\n{separator}")
?table_name: identifier
?column_name: identifier
def generate_output(prompt: str, sampling_params: SamplingParams, llm: LLM):
    """Run one prompt through ``llm`` and return the text of its first completion."""
    first_request = llm.generate(prompts=prompt,
                                 sampling_params=sampling_params)[0]
    return first_request.outputs[0].text
?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
"""
guided_decoding_params = GuidedDecodingParams(grammar=simplified_sql_grammar)
sampling_params = SamplingParams(guided_decoding=guided_decoding_params)
prompt = ("Generate an SQL query to show the 'username' and 'email'"
"from the 'users' table.")
outputs = llm.generate(
prompts=prompt,
sampling_params=sampling_params,
)
print(outputs[0].outputs[0].text)
def main():
    """Run each guided-decoding example against one shared model instance."""
    llm = LLM(model="Qwen/Qwen2.5-3B-Instruct", max_model_len=100)

    # (title, prompt, sampling params) for every technique, in display order.
    examples = (
        ("Guided decoding by Choice", prompt_choice, sampling_params_choice),
        ("Guided decoding by Regex", prompt_regex, sampling_params_regex),
        ("Guided decoding by JSON", prompt_json, sampling_params_json),
        ("Guided decoding by Grammar", prompt_grammar,
         sampling_params_grammar),
    )
    for title, prompt, params in examples:
        generated = generate_output(prompt, params, llm)
        format_output(title, generated)


if __name__ == "__main__":
    main()
......@@ -36,11 +36,13 @@ llm = LLM(
outputs = llm.generate(prompts, sampling_params)
# all ranks will have the same outputs
print("-" * 50)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, "
print(f"Prompt: {prompt!r}\n"
f"Generated text: {generated_text!r}")
print("-" * 50)
"""
Further tips:
......
......@@ -16,14 +16,22 @@ N = 1
# Currently, top-p sampling is disabled. `top_p` should be 1.0.
sampling_params = SamplingParams(temperature=0, top_p=1.0, n=N, max_tokens=16)
# Set `enforce_eager=True` to avoid ahead-of-time compilation.
# In real workloads, `enforce_eager` should be `False`.
llm = LLM(model="Qwen/Qwen2-1.5B-Instruct",
max_num_batched_tokens=64,
max_num_seqs=4)
outputs = llm.generate(prompts, sampling_params)
for output, answer in zip(outputs, answers):
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
assert generated_text.startswith(answer)
def main():
    """Generate from the sample prompts and check each completion's prefix.

    Every generated text is printed and asserted to start with the matching
    entry of the module-level ``answers``.
    """
    # Set `enforce_eager=True` to avoid ahead-of-time compilation.
    # In real workloads, `enforce_eager` should be `False`.
    engine = LLM(model="Qwen/Qwen2-1.5B-Instruct",
                 max_num_batched_tokens=64,
                 max_num_seqs=4)
    request_outputs = engine.generate(prompts, sampling_params)

    separator = "-" * 50
    print(separator)
    for request_output, answer in zip(request_outputs, answers):
        prompt = request_output.prompt
        generated_text = request_output.outputs[0].text
        print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}")
        assert generated_text.startswith(answer)
        print(separator)


if __name__ == "__main__":
    main()
......@@ -8,6 +8,7 @@ on HuggingFace model repository.
"""
import os
import random
from contextlib import contextmanager
from dataclasses import asdict
from typing import NamedTuple, Optional
......@@ -44,7 +45,7 @@ def run_aria(questions: list[str], modality: str) -> ModelRequestData:
max_model_len=4096,
max_num_seqs=2,
dtype="bfloat16",
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)
prompts = [(f"<|im_start|>user\n<fim_prefix><|img|><fim_suffix>{question}"
......@@ -70,7 +71,7 @@ def run_aya_vision(questions: list[str], modality: str) -> ModelRequestData:
max_model_len=2048,
max_num_seqs=2,
mm_processor_kwargs={"crop_to_patches": True},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)
prompts = [
f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|><image>{question}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
......@@ -91,7 +92,7 @@ def run_blip2(questions: list[str], modality: str) -> ModelRequestData:
prompts = [f"Question: {question} Answer:" for question in questions]
engine_args = EngineArgs(
model="Salesforce/blip2-opt-6.7b",
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)
return ModelRequestData(
......@@ -109,7 +110,7 @@ def run_chameleon(questions: list[str], modality: str) -> ModelRequestData:
model="facebook/chameleon-7b",
max_model_len=4096,
max_num_seqs=2,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)
return ModelRequestData(
......@@ -128,8 +129,8 @@ def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
model=model_name,
max_model_len=4096,
max_num_seqs=2,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
hf_overrides={"architectures": ["DeepseekVLV2ForCausalLM"]},
limit_mm_per_prompt={"image": 1},
)
prompts = [
......@@ -154,7 +155,7 @@ def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
max_num_seqs=2,
trust_remote_code=True,
dtype="bfloat16",
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)
prompts = ["<MORE_DETAILED_CAPTION>" for _ in questions]
......@@ -174,7 +175,7 @@ def run_fuyu(questions: list[str], modality: str) -> ModelRequestData:
model="adept/fuyu-8b",
max_model_len=2048,
max_num_seqs=2,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)
return ModelRequestData(
......@@ -193,7 +194,7 @@ def run_gemma3(questions: list[str], modality: str) -> ModelRequestData:
max_model_len=2048,
max_num_seqs=2,
mm_processor_kwargs={"do_pan_and_scan": True},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)
prompts = [("<bos><start_of_turn>user\n"
......@@ -218,7 +219,7 @@ def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
trust_remote_code=True,
enforce_eager=True,
hf_overrides={"architectures": ["GLM4VForCausalLM"]},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)
prompts = [
......@@ -245,7 +246,7 @@ def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
model=model_name,
trust_remote_code=True,
max_model_len=8192,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)
tokenizer = AutoTokenizer.from_pretrained(model_name,
......@@ -286,7 +287,7 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
"longest_edge": 3 * 364
},
},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)
prompts = [(
f"<|begin_of_text|>User:<image>{question}<end_of_utterance>\nAssistant:"
......@@ -298,6 +299,34 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
)
# SmolVLM2-2.2B-Instruct
def run_smolvlm(questions: list[str], modality: str) -> ModelRequestData:
    """Build engine arguments and single-image prompts for SmolVLM2.

    Args:
        questions: Question strings; one prompt is produced per question.
        modality: Must be ``"image"``; anything else fails the assertion.

    Returns:
        A ``ModelRequestData`` carrying the engine arguments and the prompts.
    """
    assert modality == "image"

    model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
    image_size_limit = {"longest_edge": 384}

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        max_num_seqs=2,
        enforce_eager=True,
        mm_processor_kwargs={"max_image_size": image_size_limit},
        limit_mm_per_prompt={"image": 1},
    )

    # Wrap every question in the chat template with one image placeholder.
    prompts = []
    for question in questions:
        prompts.append(
            f"<|im_start|>User:<image>{question}<end_of_utterance>\nAssistant:"
        )

    return ModelRequestData(engine_args=engine_args, prompts=prompts)
# InternVL
def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
......@@ -308,7 +337,7 @@ def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
model=model_name,
trust_remote_code=True,
max_model_len=4096,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)
tokenizer = AutoTokenizer.from_pretrained(model_name,
......@@ -346,7 +375,7 @@ def run_llava(questions: list[str], modality: str) -> ModelRequestData:
engine_args = EngineArgs(
model="llava-hf/llava-1.5-7b-hf",
max_model_len=4096,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)
return ModelRequestData(
......@@ -363,7 +392,7 @@ def run_llava_next(questions: list[str], modality: str) -> ModelRequestData:
engine_args = EngineArgs(
model="llava-hf/llava-v1.6-mistral-7b-hf",
max_model_len=8192,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)
return ModelRequestData(
......@@ -385,7 +414,7 @@ def run_llava_next_video(questions: list[str],
model="llava-hf/LLaVA-NeXT-Video-7B-hf",
max_model_len=8192,
max_num_seqs=2,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)
return ModelRequestData(
......@@ -413,7 +442,7 @@ def run_llava_onevision(questions: list[str],
engine_args = EngineArgs(
model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
max_model_len=16384,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)
return ModelRequestData(
......@@ -436,7 +465,7 @@ def run_mantis(questions: list[str], modality: str) -> ModelRequestData:
model="TIGER-Lab/Mantis-8B-siglip-llama3",
max_model_len=4096,
hf_overrides={"architectures": ["MantisForConditionalGeneration"]},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)
stop_token_ids = [128009]
......@@ -477,7 +506,7 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name):
max_model_len=4096,
max_num_seqs=2,
trust_remote_code=True,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)
# NOTE The stop_token_ids are different for various versions of MiniCPM-V
# 2.0
......@@ -532,7 +561,7 @@ def run_mistral3(questions: list[str], modality: str) -> ModelRequestData:
max_model_len=8192,
max_num_seqs=2,
tensor_parallel_size=2,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)
prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
......@@ -556,9 +585,9 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
# The configuration below has been confirmed to launch on a single L40 GPU.
engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
max_model_len=8192,
max_num_seqs=2,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
......@@ -582,7 +611,7 @@ def run_mllama(questions: list[str], modality: str) -> ModelRequestData:
)
def run_llama4(questions: list[str], modality: str):
def run_llama4(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
model_name = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
......@@ -592,8 +621,8 @@ def run_llama4(questions: list[str], modality: str):
max_model_len=8192,
max_num_seqs=4,
tensor_parallel_size=8,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
gpu_memory_utilization=0.4,
limit_mm_per_prompt={"image": 1},
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
......@@ -628,7 +657,7 @@ def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
model=model_name,
trust_remote_code=True,
dtype="bfloat16",
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)
prompts = [
......@@ -654,7 +683,7 @@ def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
trust_remote_code=True,
max_model_len=4096,
tensor_parallel_size=4,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)
tokenizer = AutoTokenizer.from_pretrained(model_name,
......@@ -681,7 +710,8 @@ def run_paligemma(questions: list[str], modality: str) -> ModelRequestData:
prompts = ["caption en" for _ in questions]
engine_args = EngineArgs(
model="google/paligemma-3b-mix-224",
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
limit_mm_per_prompt={"image": 1},
)
return ModelRequestData(
engine_args=engine_args,
......@@ -697,7 +727,8 @@ def run_paligemma2(questions: list[str], modality: str) -> ModelRequestData:
prompts = ["caption en" for _ in questions]
engine_args = EngineArgs(
model="google/paligemma2-3b-ft-docci-448",
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache)
limit_mm_per_prompt={"image": 1},
)
return ModelRequestData(
engine_args=engine_args,
......@@ -733,7 +764,7 @@ def run_phi3v(questions: list[str], modality: str) -> ModelRequestData:
max_num_seqs=2,
# Note - mm_processor_kwargs can also be passed to generate/chat calls
mm_processor_kwargs={"num_crops": 16},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)
return ModelRequestData(
......@@ -764,6 +795,7 @@ def run_phi4mm(questions: list[str], modality: str) -> ModelRequestData:
max_num_seqs=2,
enable_lora=True,
max_lora_rank=320,
limit_mm_per_prompt={"image": 1},
)
return ModelRequestData(
......@@ -784,7 +816,7 @@ def run_pixtral_hf(questions: list[str], modality: str) -> ModelRequestData:
model=model_name,
max_model_len=6144,
max_num_seqs=2,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)
prompts = [f"<s>[INST]{question}\n[IMG][/INST]" for question in questions]
......@@ -805,7 +837,7 @@ def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
max_model_len=1024,
max_num_seqs=2,
hf_overrides={"architectures": ["QwenVLForConditionalGeneration"]},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)
prompts = [f"{question}Picture 1: <img></img>\n" for question in questions]
......@@ -830,7 +862,7 @@ def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:
"min_pixels": 28 * 28,
"max_pixels": 1280 * 28 * 28,
},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)
if modality == "image":
......@@ -865,7 +897,7 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
"max_pixels": 1280 * 28 * 28,
"fps": 1,
},
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)
if modality == "image":
......@@ -896,7 +928,7 @@ def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
model=model_name,
trust_remote_code=True,
max_model_len=4096,
disable_mm_preprocessor_cache=args.disable_mm_preprocessor_cache,
limit_mm_per_prompt={"image": 1},
)
tokenizer = AutoTokenizer.from_pretrained(model_name,
......@@ -955,6 +987,7 @@ model_example_map = {
"qwen2_vl": run_qwen2_vl,
"qwen2_5_vl": run_qwen2_5_vl,
"skywork_chat": run_skyworkr1v,
"smolvlm": run_smolvlm,
}
......@@ -1026,6 +1059,20 @@ def apply_image_repeat(image_repeat_prob, num_prompts, data,
return inputs
@contextmanager
def time_counter(enable: bool):
    """Context manager that optionally reports how long its body took.

    Args:
        enable: When true, the elapsed time of the ``with`` body, in seconds,
            is printed between two dashed separator lines once the body
            finishes normally. When false, the body runs with no timing and
            no output.
    """
    if not enable:
        yield
        return

    import time

    # perf_counter() is monotonic and high-resolution, so the measured
    # interval cannot be skewed by system clock adjustments the way a
    # time.time() difference can.
    start_time = time.perf_counter()
    yield
    elapsed_time = time.perf_counter() - start_time
    print("-" * 50)
    print(f"-- generate time = {elapsed_time}")
    print("-" * 50)
def main(args):
model = args.model_type
if model not in model_example_map:
......@@ -1038,15 +1085,16 @@ def main(args):
req_data = model_example_map[model](questions, modality)
engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
llm = LLM(**engine_args)
# Disable other modalities to save memory
default_limits = {"image": 0, "video": 0, "audio": 0}
req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
req_data.engine_args.limit_mm_per_prompt or {})
# To maintain code compatibility in this script, we add LoRA here.
# You can also add LoRA using:
# llm.generate(prompts, lora_request=lora_request,...)
if req_data.lora_requests:
for lora_request in req_data.lora_requests:
llm.llm_engine.add_lora(lora_request=lora_request)
engine_args = asdict(req_data.engine_args) | {
"seed": args.seed,
"disable_mm_preprocessor_cache": args.disable_mm_preprocessor_cache,
}
llm = LLM(**engine_args)
# Don't want to check the flag multiple times, so just hijack `prompts`.
prompts = req_data.prompts if args.use_different_prompt_per_request else [
......@@ -1084,19 +1132,22 @@ def main(args):
},
} for i in range(args.num_prompts)]
if args.time_generate:
import time
start_time = time.time()
outputs = llm.generate(inputs, sampling_params=sampling_params)
elapsed_time = time.time() - start_time
print("-- generate time = {}".format(elapsed_time))
# Add LoRA request if applicable
lora_request = (req_data.lora_requests *
args.num_prompts if req_data.lora_requests else None)
else:
outputs = llm.generate(inputs, sampling_params=sampling_params)
with time_counter(args.time_generate):
outputs = llm.generate(
inputs,
sampling_params=sampling_params,
lora_request=lora_request,
)
print("-" * 50)
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
print("-" * 50)
if __name__ == "__main__":
......
......@@ -63,6 +63,7 @@ def run_e5_v(query: Query) -> ModelRequestData:
model="royokong/e5-v",
task="embed",
max_model_len=4096,
limit_mm_per_prompt={"image": 1},
)
return ModelRequestData(
......@@ -93,6 +94,7 @@ def run_vlm2vec(query: Query) -> ModelRequestData:
task="embed",
trust_remote_code=True,
mm_processor_kwargs={"num_crops": 4},
limit_mm_per_prompt={"image": 1},
)
return ModelRequestData(
......@@ -131,6 +133,11 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
query = get_query(modality)
req_data = model_example_map[model](query)
# Disable other modalities to save memory
default_limits = {"image": 0, "video": 0, "audio": 0}
req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
req_data.engine_args.limit_mm_per_prompt or {})
engine_args = asdict(req_data.engine_args) | {"seed": seed}
llm = LLM(**engine_args)
......@@ -143,8 +150,10 @@ def run_encode(model: str, modality: QueryModality, seed: Optional[int]):
"multi_modal_data": mm_data,
})
print("-" * 50)
for output in outputs:
print(output.outputs.embedding)
print("-" * 50)
def main(args: Namespace):
......
......@@ -22,6 +22,16 @@ QUESTION = "What is the content of each image?"
IMAGE_URLS = [
"https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg",
"https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg",
"https://upload.wikimedia.org/wikipedia/commons/2/26/Ultramarine_Flycatcher_%28Ficedula_superciliaris%29_Naggar%2C_Himachal_Pradesh%2C_2013_%28cropped%29.JPG",
"https://upload.wikimedia.org/wikipedia/commons/thumb/e/e5/Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg/2560px-Anim1754_-_Flickr_-_NOAA_Photo_Library_%281%29.jpg",
"https://upload.wikimedia.org/wikipedia/commons/d/d4/Starfish%2C_Caswell_Bay_-_geograph.org.uk_-_409413.jpg",
"https://upload.wikimedia.org/wikipedia/commons/6/69/Grapevinesnail_01.jpg",
"https://upload.wikimedia.org/wikipedia/commons/thumb/0/0b/Texas_invasive_Musk_Thistle_1.jpg/1920px-Texas_invasive_Musk_Thistle_1.jpg",
"https://upload.wikimedia.org/wikipedia/commons/thumb/7/7a/Huskiesatrest.jpg/2880px-Huskiesatrest.jpg",
"https://upload.wikimedia.org/wikipedia/commons/thumb/6/68/Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg/1920px-Orange_tabby_cat_sitting_on_fallen_leaves-Hisashi-01A.jpg",
"https://upload.wikimedia.org/wikipedia/commons/3/30/George_the_amazing_guinea_pig.jpg",
"https://upload.wikimedia.org/wikipedia/commons/thumb/1/1f/Oryctolagus_cuniculus_Rcdo.jpg/1920px-Oryctolagus_cuniculus_Rcdo.jpg",
"https://upload.wikimedia.org/wikipedia/commons/9/98/Horse-and-pony.jpg",
]
......@@ -217,6 +227,33 @@ def load_idefics3(question: str, image_urls: list[str]) -> ModelRequestData:
)
def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData:
    """Assemble the SmolVLM2 multi-image request (engine args, prompt, images).

    Args:
        question: Text placed after the image placeholders in the user turn.
        image_urls: URLs of the images to fetch. One numbered ``<image>``
            placeholder is emitted per URL, and the per-prompt image limit is
            set to the number of URLs.

    Returns:
        A ``ModelRequestData`` carrying the engine arguments, the formatted
        prompt and the fetched images.
    """
    num_images = len(image_urls)

    # The configuration below has been confirmed to launch on a single L40 GPU.
    engine_args = EngineArgs(
        model="HuggingFaceTB/SmolVLM2-2.2B-Instruct",
        max_model_len=8192,
        max_num_seqs=16,
        enforce_eager=True,
        limit_mm_per_prompt={"image": num_images},
        mm_processor_kwargs={"max_image_size": {"longest_edge": 384}},
    )

    # Placeholders are numbered from 1. Each entry already ends with a newline
    # and the entries are additionally joined with one.
    image_lines = [
        f"Image-{idx}: <image>\n" for idx in range(1, num_images + 1)
    ]
    placeholders = "\n".join(image_lines)
    prompt = f"<|im_start|>User:{placeholders}\n{question}<end_of_utterance>\nAssistant:"  # noqa: E501

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )
def load_internvl(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "OpenGVLab/InternVL2-2B"
......@@ -258,8 +295,7 @@ def load_llama4(question: str, image_urls: list[str]) -> ModelRequestData:
engine_args = EngineArgs(
model=model_name,
max_model_len=8192,
max_num_seqs=4,
max_model_len=131072,
tensor_parallel_size=8,
limit_mm_per_prompt={"image": len(image_urls)},
)
......@@ -318,8 +354,8 @@ def load_mllama(question: str, image_urls: list[str]) -> ModelRequestData:
# The configuration below has been confirmed to launch on a single L40 GPU.
engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
max_num_seqs=16,
max_model_len=8192,
max_num_seqs=2,
limit_mm_per_prompt={"image": len(image_urls)},
)
......@@ -614,6 +650,7 @@ model_example_map = {
"qwen_vl_chat": load_qwen_vl_chat,
"qwen2_vl": load_qwen2_vl,
"qwen2_5_vl": load_qwen2_5_vl,
"smolvlm": load_smolvlm,
}
......@@ -624,15 +661,8 @@ def run_generate(model, question: str, image_urls: list[str],
engine_args = asdict(req_data.engine_args) | {"seed": args.seed}
llm = LLM(**engine_args)
# To maintain code compatibility in this script, we add LoRA here.
# You can also add LoRA using:
# llm.generate(prompts, lora_request=lora_request,...)
if req_data.lora_requests:
for lora_request in req_data.lora_requests:
llm.llm_engine.add_lora(lora_request=lora_request)
sampling_params = SamplingParams(temperature=0.0,
max_tokens=128,
max_tokens=256,
stop_token_ids=req_data.stop_token_ids)
outputs = llm.generate(
......@@ -642,29 +672,31 @@ def run_generate(model, question: str, image_urls: list[str],
"image": req_data.image_data
},
},
sampling_params=sampling_params)
sampling_params=sampling_params,
lora_request=req_data.lora_requests,
)
print("-" * 50)
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
print("-" * 50)
def run_chat(model: str, question: str, image_urls: list[str],
seed: Optional[int]):
req_data = model_example_map[model](question, image_urls)
# Disable other modalities to save memory
default_limits = {"image": 0, "video": 0, "audio": 0}
req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
req_data.engine_args.limit_mm_per_prompt or {})
engine_args = asdict(req_data.engine_args) | {"seed": seed}
llm = LLM(**engine_args)
# To maintain code compatibility in this script, we add LoRA here.
# You can also add LoRA using:
# llm.generate(prompts, lora_request=lora_request,...)
if req_data.lora_requests:
for lora_request in req_data.lora_requests:
llm.llm_engine.add_lora(lora_request=lora_request)
sampling_params = SamplingParams(temperature=0.0,
max_tokens=128,
max_tokens=256,
stop_token_ids=req_data.stop_token_ids)
outputs = llm.chat(
[{
......@@ -685,11 +717,14 @@ def run_chat(model: str, question: str, image_urls: list[str],
}],
sampling_params=sampling_params,
chat_template=req_data.chat_template,
lora_request=req_data.lora_requests,
)
print("-" * 50)
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)
print("-" * 50)
def main(args: Namespace):
......@@ -697,10 +732,12 @@ def main(args: Namespace):
method = args.method
seed = args.seed
image_urls = IMAGE_URLS[:args.num_images]
if method == "generate":
run_generate(model, QUESTION, IMAGE_URLS, seed)
run_generate(model, QUESTION, image_urls, seed)
elif method == "chat":
run_chat(model, QUESTION, IMAGE_URLS, seed)
run_chat(model, QUESTION, image_urls, seed)
else:
raise ValueError(f"Invalid method: {method}")
......@@ -725,6 +762,12 @@ if __name__ == "__main__":
type=int,
default=None,
help="Set the seed when initializing `vllm.LLM`.")
parser.add_argument(
    "--num-images",
    "-n",
    # argparse checks `choices` after applying `type`. Without an explicit
    # integer conversion the raw command-line string is compared against the
    # integer choices, so every user-supplied value is rejected as an invalid
    # choice. The value is also used as a slice bound, which requires an int.
    type=int,
    choices=list(range(1, 13)),  # 12 is the max number of images
    default=2,
    help="Number of images to use for the demo.")
args = parser.parse_args()
main(args)
# SPDX-License-Identifier: Apache-2.0
"""Example Python client for `vllm.entrypoints.api_server`
Start the demo server:
python -m vllm.entrypoints.api_server --model <model_name>
NOTE: The API server is used only for demonstration and simple performance
benchmarks. It is not intended for production use.
For production use, we recommend `vllm serve` and the OpenAI client API.
......@@ -7,6 +10,7 @@ For production use, we recommend `vllm serve` and the OpenAI client API.
import argparse
import json
from argparse import Namespace
from collections.abc import Iterable
import requests
......@@ -27,7 +31,6 @@ def post_http_request(prompt: str,
pload = {
"prompt": prompt,
"n": n,
"use_beam_search": True,
"temperature": 0.0,
"max_tokens": 16,
"stream": stream,
......@@ -55,14 +58,7 @@ def get_response(response: requests.Response) -> list[str]:
return output
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--host", type=str, default="localhost")
parser.add_argument("--port", type=int, default=8000)
parser.add_argument("--n", type=int, default=4)
parser.add_argument("--prompt", type=str, default="San Francisco is a")
parser.add_argument("--stream", action="store_true")
args = parser.parse_args()
def main(args: Namespace):
prompt = args.prompt
api_url = f"http://{args.host}:{args.port}/generate"
n = args.n
......@@ -83,3 +79,14 @@ if __name__ == "__main__":
output = get_response(response)
for i, line in enumerate(output):
print(f"Beam candidate {i}: {line!r}", flush=True)
if __name__ == "__main__":
    # Command-line options as (flag, add_argument keyword arguments) pairs,
    # registered in this order.
    cli_options = (
        ("--host", {"type": str, "default": "localhost"}),
        ("--port", {"type": int, "default": 8000}),
        ("--n", {"type": int, "default": 1}),
        ("--prompt", {"type": str, "default": "San Francisco is a"}),
        ("--stream", {"action": "store_true"}),
    )

    parser = argparse.ArgumentParser()
    for flag, options in cli_options:
        parser.add_argument(flag, **options)

    args = parser.parse_args()
    main(args)
......@@ -23,7 +23,7 @@ def sync_openai():
with open(str(mary_had_lamb), "rb") as f:
transcription = client.audio.transcriptions.create(
file=f,
model="openai/whisper-small",
model="openai/whisper-large-v3",
language="en",
response_format="json",
temperature=0.0)
......
{#- Emits the content of every user and assistant message back to back; messages with any other role are skipped. Every tag trims surrounding whitespace, so nothing is inserted between messages. #}
{%- for message in messages -%}
{%- if message['role'] == 'user' -%}
{{- message['content'] -}}
{%- elif message['role'] == 'assistant' -%}
{{- message['content'] -}}
{%- endif -%}
{%- endfor -%}
......@@ -76,7 +76,7 @@
{{- tool_call.name + '(' -}}
{%- for param in tool_call.arguments %}
{{- param + '=' -}}
{{- "%sr" | format(tool_call.arguments[param]) -}}
{{- "%s" | format(tool_call.arguments[param]) -}}
{% if not loop.last %}, {% endif %}
{%- endfor %}
{{- ')' -}}
......
{#- Preamble: emit the BOS token, then normalise the optional template inputs. `custom_tools`, when defined, replaces `tools`; `tools_in_user_message` defaults to false; `tools` defaults to none. #}
{{- bos_token }}
{%- if custom_tools is defined %}
{%- set tools = custom_tools %}
{%- endif %}
{%- if not tools_in_user_message is defined %}
{%- set tools_in_user_message = false %}
{%- endif %}
{%- if not tools is defined %}
{%- set tools = none %}
{%- endif %}
{#- This block extracts the system message, so we can slot it into the right place. #}
{%- if messages[0]['role'] == 'system' %}
{%- if messages[0]['content'] is string %}
{%- set system_message = messages[0]['content']|trim %}
{%- else %}
{%- set system_message = messages[0]['content'][0]['text']|trim %}
{%- endif %}
{%- set messages = messages[1:] %}
{%- else %}
{%- if tools is not none %}
{#- Add default tool system message when tools are provided #}
{%- set system_message = "You are a helpful assistant with tool calling "
"capabilities. Only reply with a tool call if the function exists in the "
"library provided by the user. If it doesn't exist, just reply directly in "
"natural language. When you receive a tool call response, use the output to "
"format an answer to the original user question." %}
{%- else %}
{%- set system_message = "" %}
{%- endif %}
{%- endif %}
{#- System message if the user supplied one, or if tools are used (default tool system message) #}
{%- if system_message %}
{#- always use user provided system message to override default tool system message #}
{{- "<|header_start|>system<|header_end|>\n\n" }}
{{- system_message }}
{%- if tools is not none and not tools_in_user_message %}
{{- "Tools: You have access to the following tools. You might need to use one "
"or more function/tool calls to fulfill the task. \n"
"If none are needed, then proceed to the response.\n\n"
"Tool Call Syntax: You can call tools using the following syntax:\n"
"[func_name1(params_name1=params_value1, params_name2=params_value2, ...), ...]\n"
"Do not include anything else when calling the tools with the syntax above.\n\n"
"Here is a list of functions in JSON format that you can invoke.\n " }}
{%- for t in tools %}
{{- t | tojson(indent=4) }}
{{- "\n\n" }}
{%- endfor %}
{%- endif %}
{{- "<|eot|>" }}
{%- endif %}
{#- Custom tools are passed in a user message with some extra guidance #}
{%- if tools_in_user_message and tools is not none %}
{#- Extract the first user message so we can plug it in here #}
{%- if messages | length != 0 %}
{%- if messages[0]['content'] is string %}
{%- set first_user_message = messages[0]['content']|trim %}
{%- else %}
{%- set first_user_message = messages[0]['content'] | selectattr('type', 'equalto', 'text') | map(attribute='text') | map('trim') | join('\n') %}
{%- endif %}
{%- set messages = messages[1:] %}
{%- else %}
{{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
{%- endif %}
{{- '<|header_start|>user<|header_end|>\n\n' -}}
{{- first_user_message}}
{{- "\nHere is a list of functions in JSON format that you can invoke:"}}
{%- for t in tools %}
{{- t | tojson(indent=4) }}
{{- "\n\n" }}
{%- endfor %}
{{- "Should you decide to return the function call(s), put them in the format "
"of [func_name1(params_name1=params_value1, params_name2=params_value2, "
"...), ...]\nDo not include anything else when calling the tools with the "
"syntax above." }}
{%- endif %}
{#- Render the remaining messages one turn at a time. #}
{%- for message in messages %}
{#- Case 1: ordinary turn (role is neither ipython nor tool, and the message has no 'tool_calls' key). Rendered under its own role header and closed with <|eot|>. String content is emitted as is; for list content, image parts become <|image|> and text parts are trimmed. #}
{%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
{{- '<|header_start|>' + message['role'] + '<|header_end|>\n\n' }}
{%- if message['content'] is string %}
{{- message['content'] }}
{%- else %}
{%- for content in message['content'] %}
{%- if content['type'] == 'image' %}
{{- '<|image|>' }}
{%- elif content['type'] == 'text' %}
{{- content['text'] | trim }}
{%- endif %}
{%- endfor %}
{%- endif %}
{{- "<|eot|>" }}
{#- Case 2: turn carrying at least one tool call. Always rendered under the assistant header and closed with <|eom|>. A message that has a 'tool_calls' key with an empty list, and whose role is not tool/ipython, matches none of the branches and renders nothing. #}
{%- elif 'tool_calls' in message and message.tool_calls|length > 0 %}
{#- NOTE(review): this binding does not appear to be read before the loop below rebinds `tool_call`; confirm whether it can be dropped. #}
{%- set tool_call = message.tool_calls[0].function %}
{{- '<|header_start|>assistant<|header_end|>\n\n' -}}
{%- if message['content'] is string %}
{{- message['content'] }}
{%- else %}
{%- for content in message['content'] %}
{%- if content['type'] == 'image' %}
{{- '<|image|>' }}
{%- elif content['type'] == 'text' %}
{{- content['text'] }}
{%- endif %}
{%- endfor %}
{%- endif %}
{#- Each call is written as name(param=value, ...), with values formatted through "%s"; calls and parameters are separated by ", ". When an entry has a `function` attribute, that attribute is used as the call. #}
{%- for tool_call in message.tool_calls %}
{%- if tool_call.function is defined %}
{%- set tool_call = tool_call.function %}
{%- endif %}
{{- tool_call.name + '(' -}}
{%- for param in tool_call.arguments %}
{{- param + '=' -}}
{{- "%s" | format(tool_call.arguments[param]) -}}
{% if not loop.last %}, {% endif %}
{%- endfor %}
{{- ')' -}}
{% if not loop.last %}, {% endif %}
{%- endfor %}
{{- "<|eom|>" }}
{%- elif message.role == "tool" or message.role == "ipython" %}
{#- Case 3: tool result. Both the tool and ipython roles are rendered under the ipython header, with the content JSON-encoded via tojson (only text parts for list content), and closed with <|eom|>. #}
{{- "<|header_start|>ipython<|header_end|>\n\n" }}
{%- if message.content is string %}
{{- message.content | tojson }}
{%- else %}
{%- for content in message['content'] %}
{%- if content['type'] == 'text' %}
{{- content['text'] | tojson }}
{%- endif %}
{%- endfor %}
{%- endif %}
{{- "<|eom|>" }}
{%- endif %}
{%- endfor %}
{#- When add_generation_prompt is set, open an assistant turn for the model to complete. #}
{%- if add_generation_prompt %}
{{- '<|header_start|>assistant<|header_end|>\n\n' }}
{%- endif %}
......@@ -44,7 +44,7 @@
{{- tool_call.name + '(' -}}
{%- for param in tool_call.arguments %}
{{- param + '=' -}}
{{- "%sr" | format(tool_call.arguments[param]) -}}
{{- "%s" | format(tool_call.arguments[param]) -}}
{% if not loop.last %}, {% endif %}
{%- endfor %}
{{- ')' -}}
......
......@@ -30,7 +30,7 @@ classifiers = [
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Topic :: Scientific/Engineering :: Information Analysis",
]
requires-python = ">=3.9"
requires-python = ">=3.9,<3.13"
dynamic = [ "version", "dependencies", "optional-dependencies"]
[project.urls]
......
......@@ -6,7 +6,7 @@ requests >= 2.26.0
tqdm
blake3
py-cpuinfo
transformers >= 4.51.0
transformers >= 4.51.1
huggingface-hub[hf_xet] >= 0.30.0 # Required for Xet downloads.
tokenizers >= 0.19.1 # Required for Llama 3.
protobuf # Required by LlamaTokenizer.
......@@ -22,13 +22,13 @@ lm-format-enforcer >= 0.10.11, < 0.11
llguidance >= 0.7.9, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64"
outlines == 0.1.11
lark == 1.2.2
xgrammar == 0.1.17; platform_machine == "x86_64" or platform_machine == "aarch64"
xgrammar == 0.1.18; platform_machine == "x86_64" or platform_machine == "aarch64"
typing_extensions >= 4.10
filelock >= 3.16.1 # need to contain https://github.com/tox-dev/filelock/pull/317
partial-json-parser # used for parsing partial JSON outputs
pyzmq
msgspec
gguf == 0.10.0
gguf >= 0.13.0
importlib_metadata
mistral_common[opencv] >= 1.5.4 # requires numpy>=1.25
opencv-python-headless >= 4.11.0 # required for video IO
......@@ -36,10 +36,14 @@ pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
setuptools>=74.1.1; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
einops # Required for Qwen2-VL.
compressed-tensors == 0.9.2 # required for compressed-tensors
compressed-tensors == 0.9.3 # required for compressed-tensors
depyf==0.18.0 # required for profiling and debugging with compilation config
cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
watchfiles # required for http server to monitor the updates of TLS files
python-json-logger # Used by logging as per examples/other/logging_configuration.md
scipy # Required for phi-4-multimodal-instruct
ninja # Required for xgrammar, rocm, tpu, xpu
opentelemetry-sdk>=1.26.0,<1.27.0 # vllm.tracing
opentelemetry-api>=1.26.0,<1.27.0 # vllm.tracing
opentelemetry-exporter-otlp>=1.26.0,<1.27.0 # vllm.tracing
opentelemetry-semantic-conventions-ai>=0.4.1,<0.5.0 # vllm.tracing
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment