Commit d2b52805 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.10.2rc1' into v0.10.2rc1-ori

parents 9a521c23 5438967f
......@@ -5,6 +5,7 @@ from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
from vllm.benchmarks.datasets import add_dataset_parser, get_samples
from vllm.inputs import TokensPrompt
from vllm.v1.metrics.reader import Counter, Vector
try:
......@@ -137,7 +138,8 @@ def main():
sampling_params = SamplingParams(temperature=args.temp, max_tokens=args.output_len)
if not args.custom_mm_prompts:
outputs = llm.generate(
prompt_token_ids=prompt_ids, sampling_params=sampling_params
[TokensPrompt(prompt_token_ids=x) for x in prompt_ids],
sampling_params=sampling_params,
)
else:
outputs = llm.chat(prompts, sampling_params=sampling_params)
......
......@@ -85,7 +85,7 @@ def format_output(title: str, output: str):
def generate_output(prompt: str, sampling_params: SamplingParams, llm: LLM):
    """Run one generation request and return the text of its first completion.

    Args:
        prompt: Prompt passed positionally to ``llm.generate``.
        sampling_params: Sampling configuration forwarded to ``llm.generate``.
        llm: Engine object whose ``generate`` method is invoked.

    Returns:
        ``outputs[0].outputs[0].text`` of the value returned by
        ``llm.generate``.
    """
    # Issue exactly one request, with the prompt as the first positional
    # argument. A second call using the ``prompts=`` keyword would only repeat
    # the work and have its result overwritten.
    outputs = llm.generate(prompt, sampling_params=sampling_params)
    return outputs[0].outputs[0].text
......
......@@ -173,6 +173,37 @@ def run_deepseek_vl2(questions: list[str], modality: str) -> ModelRequestData:
)
# Ernie4.5-VL
def run_ernie45_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine arguments and prompts for ERNIE-4.5-VL.

    Args:
        questions: Question texts; one prompt is produced per question.
        modality: ``"image"`` or ``"video"``; selects the placeholder that is
            appended after the question and is used as the key of
            ``limit_mm_per_prompt``.

    Returns:
        A ``ModelRequestData`` holding the engine arguments and the prompts.

    Raises:
        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.
    """
    placeholder_by_modality = {
        "image": "Picture 1:<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>",
        "video": "Video 1:<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>",
    }
    # Reject unknown modalities before any engine configuration is built;
    # otherwise ``placeholder`` would be unbound when the prompts are rendered.
    if modality not in placeholder_by_modality:
        raise ValueError(
            f"Unsupported modality for ERNIE-4.5-VL: {modality!r}; "
            "expected 'image' or 'video'"
        )
    placeholder = placeholder_by_modality[modality]

    model_name = "baidu/ERNIE-4.5-VL-28B-A3B-PT"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
        limit_mm_per_prompt={modality: 1},
        trust_remote_code=True,
    )

    prompts = [
        (
            f"<|begin_of_sentence|>User: {question}{placeholder}\n"
            "Assistant: <think></think>"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
# Florence2
def run_florence2(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
......@@ -283,8 +314,10 @@ def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
)
prompts = [
f"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>\
{question}<|assistant|>"
(
"<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>"
f"{question}<|assistant|>"
)
for question in questions
]
......@@ -333,6 +366,80 @@ def run_glm4_1v(questions: list[str], modality: str) -> ModelRequestData:
)
# GLM-4.5V
def _glm4_5v_request_data(
    model_name: str, questions: list[str], modality: str
) -> ModelRequestData:
    """Build the request data shared by the GLM-4.5V checkpoints.

    Args:
        model_name: Checkpoint identifier passed to ``EngineArgs``.
        questions: Question texts; one prompt is produced per question.
        modality: ``"image"`` or ``"video"``.

    Returns:
        A ``ModelRequestData`` holding the engine arguments and the prompts.

    Raises:
        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.
    """
    placeholder_by_modality = {
        "image": "<|begin_of_image|><|image|><|end_of_image|>",
        "video": "<|begin_of_video|><|video|><|end_of_video|>",
    }
    # Reject unknown modalities before any engine configuration is built;
    # otherwise ``placeholder`` would be unbound when the prompts are rendered.
    if modality not in placeholder_by_modality:
        raise ValueError(
            f"Unsupported modality for {model_name}: {modality!r}; "
            "expected 'image' or 'video'"
        )
    placeholder = placeholder_by_modality[modality]

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        mm_processor_kwargs={
            "size": {"shortest_edge": 12544, "longest_edge": 47040000},
            "fps": 1,
        },
        limit_mm_per_prompt={modality: 1},
        enforce_eager=True,
        tensor_parallel_size=4,
    )

    prompts = [
        (
            "[gMASK]<sop><|system|>\nYou are a helpful assistant.<|user|>\n"
            f"{placeholder}"
            f"{question}<|assistant|>assistant\n"
        )
        for question in questions
    ]

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )


def run_glm4_5v(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine arguments and prompts for ``zai-org/GLM-4.5V``.

    Args:
        questions: Question texts; one prompt is produced per question.
        modality: ``"image"`` or ``"video"``.

    Returns:
        A ``ModelRequestData`` holding the engine arguments and the prompts.

    Raises:
        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.
    """
    return _glm4_5v_request_data("zai-org/GLM-4.5V", questions, modality)


# GLM-4.5V-FP8
def run_glm4_5v_fp8(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine arguments and prompts for ``zai-org/GLM-4.5V-FP8``.

    Identical to ``run_glm4_5v`` except for the checkpoint name.

    Args:
        questions: Question texts; one prompt is produced per question.
        modality: ``"image"`` or ``"video"``.

    Returns:
        A ``ModelRequestData`` holding the engine arguments and the prompts.

    Raises:
        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.
    """
    return _glm4_5v_request_data("zai-org/GLM-4.5V-FP8", questions, modality)
# H2OVL-Mississippi
def run_h2ovl(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
......@@ -693,15 +800,13 @@ def run_llava_next_video(questions: list[str], modality: str) -> ModelRequestDat
def run_llava_onevision(questions: list[str], modality: str) -> ModelRequestData:
if modality == "video":
prompts = [
f"<|im_start|>user <video>\n{question}<|im_end|> \
<|im_start|>assistant\n"
f"<|im_start|>user <video>\n{question}<|im_end|><|im_start|>assistant\n"
for question in questions
]
elif modality == "image":
prompts = [
f"<|im_start|>user <image>\n{question}<|im_end|> \
<|im_start|>assistant\n"
f"<|im_start|>user <image>\n{question}<|im_end|><|im_start|>assistant\n"
for question in questions
]
......@@ -815,6 +920,39 @@ def run_minicpmv(questions: list[str], modality: str) -> ModelRequestData:
return run_minicpmv_base(questions, modality, "openbmb/MiniCPM-V-2_6")
def run_minimax_vl_01(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine arguments and chat-formatted prompts for MiniMax-VL-01.

    Args:
        questions: Question texts; one single-turn conversation is rendered
            per question.
        modality: Must be ``"image"``; anything else fails the assertion.

    Returns:
        A ``ModelRequestData`` holding the engine arguments and the prompts
        rendered by the checkpoint tokenizer's chat template.
    """
    assert modality == "image"

    checkpoint = "MiniMaxAI/MiniMax-VL-01"
    engine_args = EngineArgs(
        model=checkpoint,
        max_num_seqs=2,
        limit_mm_per_prompt={modality: 1},
        trust_remote_code=True,
        tensor_parallel_size=8,
    )

    chat_tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    # One conversation per question: a single user turn whose content is an
    # image item followed by the question text.
    conversations = []
    for question in questions:
        user_turn = {
            "role": "user",
            "content": [{"type": "image"}, {"type": "text", "text": question}],
        }
        conversations.append([user_turn])

    # tokenize=False makes the template return strings rather than token ids.
    rendered_prompts = chat_tokenizer.apply_chat_template(
        conversations, add_generation_prompt=True, tokenize=False
    )

    return ModelRequestData(engine_args=engine_args, prompts=rendered_prompts)
# Mistral-3 HF-format
def run_mistral3(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
......@@ -891,8 +1029,7 @@ def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
)
prompts = [
f"<|im_start|>user <image>\n{question}<|im_end|> \
<|im_start|>assistant\n"
f"<|im_start|>user <image>\n{question}<|im_end|><|im_start|>assistant\n"
for question in questions
]
......@@ -998,6 +1135,38 @@ def run_ovis(questions: list[str], modality: str) -> ModelRequestData:
)
# Ovis2_5
def run_ovis2_5(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine arguments and chat-formatted prompts for Ovis2.5.

    Args:
        questions: Question texts; one single-turn conversation is rendered
            per question.
        modality: ``"image"`` or ``"video"``; selects the placeholder placed
            on the line before the question.

    Returns:
        A ``ModelRequestData`` holding the engine arguments and the prompts
        rendered by the checkpoint tokenizer's chat template.

    Raises:
        ValueError: If ``modality`` is neither ``"image"`` nor ``"video"``.
    """
    placeholder_by_modality = {"image": "<image>", "video": "<video>"}
    # Reject unknown modalities before any engine configuration is built;
    # otherwise ``placeholder`` would be unbound when the messages are built.
    if modality not in placeholder_by_modality:
        raise ValueError(
            f"Unsupported modality for Ovis2.5: {modality!r}; "
            "expected 'image' or 'video'"
        )
    placeholder = placeholder_by_modality[modality]

    model_name = "AIDC-AI/Ovis2.5-2B"

    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        trust_remote_code=True,
        dtype="half",
        limit_mm_per_prompt={modality: 1},
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    messages = [
        [{"role": "user", "content": f"{placeholder}\n{question}"}]
        for question in questions
    ]
    # tokenize=False makes the template return strings rather than token ids.
    prompts = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompts=prompts,
    )
# PaliGemma
def run_paligemma(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
......@@ -1297,6 +1466,28 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
)
# R-4B
def run_r_vl(questions: list[str], modality: str) -> ModelRequestData:
    """Build the engine arguments and prompts for the R-4B checkpoint.

    Args:
        questions: Question texts; one prompt is produced per question.
        modality: Must be ``"image"``; anything else fails the assertion.

    Returns:
        A ``ModelRequestData`` holding the engine arguments and the prompts.
    """
    assert modality == "image"

    # Each prompt is a user turn with an ``<image>`` tag followed by the
    # question, then the opening of the assistant turn.
    rendered_prompts = []
    for question in questions:
        rendered_prompts.append(
            f"<|im_start|>user <image>\n{question}<|im_end|><|im_start|>assistant\n"
        )

    request = ModelRequestData(
        engine_args=EngineArgs(
            model="YannQi/R-4B",
            max_model_len=16384,
            limit_mm_per_prompt={modality: 1},
        ),
        prompts=rendered_prompts,
    )
    return request
# SkyworkR1V
def run_skyworkr1v(questions: list[str], modality: str) -> ModelRequestData:
assert modality == "image"
......@@ -1442,12 +1633,15 @@ model_example_map = {
"chameleon": run_chameleon,
"command_a_vision": run_command_a_vision,
"deepseek_vl_v2": run_deepseek_vl2,
"ernie45_vl": run_ernie45_vl,
"florence2": run_florence2,
"fuyu": run_fuyu,
"gemma3": run_gemma3,
"gemma3n": run_gemma3n,
"glm4v": run_glm4v,
"glm4_1v": run_glm4_1v,
"glm4_5v": run_glm4_5v,
"glm4_5v_fp8": run_glm4_5v_fp8,
"h2ovl_chat": run_h2ovl,
"hyperclovax_seed_vision": run_hyperclovax_seed_vision,
"idefics3": run_idefics3,
......@@ -1463,12 +1657,14 @@ model_example_map = {
"mantis": run_mantis,
"minicpmo": run_minicpmo,
"minicpmv": run_minicpmv,
"minimax_vl_01": run_minimax_vl_01,
"mistral3": run_mistral3,
"mllama": run_mllama,
"molmo": run_molmo,
"nemotron_vl": run_nemotron_vl,
"NVLM_D": run_nvlm_d,
"ovis": run_ovis,
"ovis2_5": run_ovis2_5,
"paligemma": run_paligemma,
"paligemma2": run_paligemma2,
"phi3_v": run_phi3v,
......@@ -1479,6 +1675,7 @@ model_example_map = {
"qwen2_vl": run_qwen2_vl,
"qwen2_5_vl": run_qwen2_5_vl,
"qwen2_5_omni": run_qwen2_5_omni,
"rvl": run_r_vl,
"skywork_chat": run_skyworkr1v,
"smolvlm": run_smolvlm,
"step3": run_step3,
......
......@@ -680,6 +680,36 @@ def load_ovis(question: str, image_urls: list[str]) -> ModelRequestData:
)
# ovis2_5
def load_ovis2_5(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for the Ovis2.5 checkpoint.

    Args:
        question: Question text placed after the image placeholders.
        image_urls: URLs of the images; each one is fetched with
            ``fetch_image`` and gets one numbered ``<image>`` placeholder.

    Returns:
        A ``ModelRequestData`` holding the engine arguments, the prompt
        rendered by the tokenizer's chat template, and the fetched images.
    """
    checkpoint = "AIDC-AI/Ovis2.5-2B"
    image_count = len(image_urls)

    engine_args = EngineArgs(
        model=checkpoint,
        max_model_len=8192,
        max_num_seqs=2,
        trust_remote_code=True,
        dtype="half",
        limit_mm_per_prompt={"image": image_count},
    )

    # Placeholders are numbered from 1. Every entry already ends with a
    # newline and the entries are additionally joined with one.
    image_tags = [f"Image-{idx}: <image>\n" for idx in range(1, image_count + 1)]
    placeholders = "\n".join(image_tags)
    conversation = [{"role": "user", "content": f"{placeholders}\n{question}"}]

    chat_tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
    prompt = chat_tokenizer.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )
def load_pixtral_hf(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "mistral-community/pixtral-12b"
......@@ -962,6 +992,39 @@ def load_qwen2_5_vl(question: str, image_urls: list[str]) -> ModelRequestData:
)
def load_r_vl(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for the R-4B checkpoint.

    Args:
        question: Question text appended after the image entries.
        image_urls: URLs of the images; each one becomes an image entry in
            the user message and is fetched with ``fetch_image``.

    Returns:
        A ``ModelRequestData`` holding the engine arguments, the prompt
        rendered by the processor's chat template, and the fetched images.
    """
    checkpoint = "YannQi/R-4B"

    engine_args = EngineArgs(
        model=checkpoint,
        max_model_len=16384,
        max_num_seqs=16,
        limit_mm_per_prompt={"image": len(image_urls)},
    )

    # User content: one image entry per URL, followed by the question text.
    user_content = []
    for url in image_urls:
        user_content.append({"type": "image", "image": url})
    user_content.append({"type": "text", "text": question})
    conversation = [{"role": "user", "content": user_content}]

    chat_processor = AutoProcessor.from_pretrained(checkpoint, trust_remote_code=True)
    prompt = chat_processor.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )

    images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=images,
    )
def load_smolvlm(question: str, image_urls: list[str]) -> ModelRequestData:
model_name = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
......@@ -1064,6 +1127,76 @@ def load_tarsier2(question: str, image_urls: list[str]) -> ModelRequestData:
)
# GLM-4.5V
def load_glm4_5v(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for ``zai-org/GLM-4.5V``.

    Args:
        question: Question text appended after the image entries.
        image_urls: URLs of the images; each one becomes an image entry in
            the user message and is fetched with ``fetch_image``.

    Returns:
        A ``ModelRequestData`` holding the engine arguments, the prompt
        rendered by the processor's chat template, and the fetched images.
    """
    checkpoint = "zai-org/GLM-4.5V"

    engine_args = EngineArgs(
        model=checkpoint,
        max_model_len=32768,
        max_num_seqs=2,
        limit_mm_per_prompt={"image": len(image_urls)},
        enforce_eager=True,
        tensor_parallel_size=4,
    )

    # User content: one image entry per URL, followed by the question text.
    image_entries = [{"type": "image", "image": url} for url in image_urls]
    question_entry = {"type": "text", "text": question}
    conversation = [
        {"role": "user", "content": image_entries + [question_entry]},
    ]

    chat_processor = AutoProcessor.from_pretrained(checkpoint)
    prompt = chat_processor.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )

    fetched_images = []
    for url in image_urls:
        fetched_images.append(fetch_image(url))

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=fetched_images,
    )
# GLM-4.5V-FP8
def load_glm4_5v_fp8(question: str, image_urls: list[str]) -> ModelRequestData:
    """Build a multi-image request for ``zai-org/GLM-4.5V-FP8``.

    Args:
        question: Question text appended after the image entries.
        image_urls: URLs of the images; each one becomes an image entry in
            the user message and is fetched with ``fetch_image``.

    Returns:
        A ``ModelRequestData`` holding the engine arguments, the prompt
        rendered by the processor's chat template, and the fetched images.
    """
    checkpoint = "zai-org/GLM-4.5V-FP8"
    image_limit = {"image": len(image_urls)}

    engine_args = EngineArgs(
        model=checkpoint,
        max_model_len=32768,
        max_num_seqs=2,
        limit_mm_per_prompt=image_limit,
        enforce_eager=True,
        tensor_parallel_size=4,
    )

    # Single user turn: every image entry first, then the question text.
    user_turn = {
        "role": "user",
        "content": [
            *({"type": "image", "image": url} for url in image_urls),
            {"type": "text", "text": question},
        ],
    }

    chat_processor = AutoProcessor.from_pretrained(checkpoint)
    prompt = chat_processor.apply_chat_template(
        [user_turn], tokenize=False, add_generation_prompt=True
    )

    fetched_images = [fetch_image(url) for url in image_urls]
    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image_data=fetched_images,
    )
model_example_map = {
"aria": load_aria,
"aya_vision": load_aya_vision,
......@@ -1085,6 +1218,7 @@ model_example_map = {
"mllama": load_mllama,
"NVLM_D": load_nvlm_d,
"ovis": load_ovis,
"ovis2_5": load_ovis2_5,
"phi3_v": load_phi3v,
"phi4_mm": load_phi4mm,
"phi4_multimodal": load_phi4_multimodal,
......@@ -1092,10 +1226,13 @@ model_example_map = {
"qwen_vl_chat": load_qwen_vl_chat,
"qwen2_vl": load_qwen2_vl,
"qwen2_5_vl": load_qwen2_5_vl,
"rvl": load_r_vl,
"smolvlm": load_smolvlm,
"step3": load_step3,
"tarsier": load_tarsier,
"tarsier2": load_tarsier2,
"glm4_5v": load_glm4_5v,
"glm4_5v_fp8": load_glm4_5v_fp8,
}
......
......@@ -27,10 +27,12 @@ class BlockStored(KVCacheEvent):
token_ids: list[int]
block_size: int
lora_id: Optional[int]
medium: Optional[str]
class BlockRemoved(KVCacheEvent):
    # KVCacheEvent subclass with two fields; going by the names, it reports
    # blocks that were removed from the KV cache (emitter not visible here).

    # Integer hashes, presumably one per removed block — confirm with emitter.
    block_hashes: list[int]
    # NOTE(review): presumably names the storage medium the blocks lived on;
    # may be None. Confirm the accepted values against the event emitter.
    medium: Optional[str]
class AllBlocksCleared(KVCacheEvent):
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import os
import requests
# This example shows how to perform an online inference that generates
# multimodal data. In this specific case this example will take a geotiff
# image as input, process it using the multimodal data processor, and
# perform inference.
# Requirements:
# - install plugin at:
# https://github.com/christian-pinto/prithvi_io_processor_plugin
# - start vllm in serving mode with the below args
# --model='christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM'
# --task embed --trust-remote-code
# --skip-tokenizer-init --enforce-eager
# --io-processor-plugin prithvi_to_tiff_india
def main():
    """Send one pooling request for a GeoTIFF URL and save the decoded result.

    Posts a JSON payload referencing the image URL to
    ``http://localhost:8000/pooling``, prints the HTTP status code and reason,
    base64-decodes ``response["data"]["data"]`` and writes the bytes to
    ``online_prediction.tiff`` in the current working directory.

    Raises:
        requests.HTTPError: If the server answers with a 4xx or 5xx status.
    """
    image_url = "https://huggingface.co/christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM/resolve/main/India_900498_S2Hand.tif"  # noqa: E501
    server_endpoint = "http://localhost:8000/pooling"

    request_payload_url = {
        "data": {
            "data": image_url,
            "data_format": "url",
            "image_format": "tiff",
            "out_data_format": "b64_json",
        },
        "priority": 0,
        "model": "christian-pinto/Prithvi-EO-2.0-300M-TL-VLLM",
    }

    ret = requests.post(server_endpoint, json=request_payload_url)

    print(f"response.status_code: {ret.status_code}")
    print(f"response.reason:{ret.reason}")

    # Stop here on an error status: an error body has no ["data"]["data"]
    # entry, so continuing would only surface as an unrelated JSON/KeyError.
    ret.raise_for_status()

    response = ret.json()

    decoded_image = base64.b64decode(response["data"]["data"])

    out_path = os.path.join(os.getcwd(), "online_prediction.tiff")

    with open(out_path, "wb") as f:
        f.write(decoded_image)
if __name__ == "__main__":
main()
{% if not add_generation_prompt is defined %}
{% set add_generation_prompt = false %}
{% endif %}
{% if not thinking is defined %}
{% set thinking = false %}
{% endif %}
{% set ns = namespace(is_first=false, is_tool=false, system_prompt='', is_first_sp=true, is_last_user=false) %}
{%- for message in messages %}
{%- if message['role'] == 'system' %}
{%- if ns.is_first_sp %}
{% set ns.system_prompt = ns.system_prompt + message['content'] %}
{% set ns.is_first_sp = false %}
{%- else %}
{% set ns.system_prompt = ns.system_prompt + '\n\n' + message['content'] %}
{%- endif %}
{%- endif %}
{%- endfor %}
{% if tools is defined and tools is not none %}
{% set tool_ns = namespace(text='## Tools\nYou have access to the following tools:\n') %}
{% for tool in tools %}
{% set tool_ns.text = tool_ns.text + '\n### ' + tool.function.name + '\nDescription: ' + tool.function.description + '\n\nParameters: ' + (tool.function.parameters | tojson) + '\n' %}
{% endfor %}
{% set tool_ns.text = tool_ns.text + "\nIMPORTANT: ALWAYS adhere to this exact format for tool use:\n<|tool▁calls▁begin|><|tool▁call▁begin|>tool_call_name<|tool▁sep|>tool_call_arguments<|tool▁call▁end|>{{additional_tool_calls}}<|tool▁calls▁end|>\n\nWhere:\n\n- `tool_call_name` must be an exact match to one of the available tools\n- `tool_call_arguments` must be valid JSON that strictly follows the tool's Parameters Schema\n- For multiple tool calls, chain them directly without separators or spaces\n" %}
{% set ns.system_prompt = ns.system_prompt + '\n\n' + tool_ns.text %}
{% endif %}
{{ bos_token }}{{ ns.system_prompt }}
{%- for message in messages %}
{%- if message['role'] == 'user' %}
{%- set ns.is_tool = false -%}
{%- set ns.is_first = false -%}
{%- set ns.is_last_user = true -%}
{{'<|User|>' + message['content']}}
{%- endif %}
{%- if message['role'] == 'assistant' and message['tool_calls'] is defined and message['tool_calls'] is not none %}
{%- if ns.is_last_user %}
{{'<|Assistant|></think>'}}
{%- endif %}
{%- set ns.is_last_user = false -%}
{%- set ns.is_first = false %}
{%- set ns.is_tool = false -%}
{%- for tool in message['tool_calls'] %}
{%- if not ns.is_first %}
{%- if message['content'] is none %}
{{'<|tool▁calls▁begin|><|tool▁call▁begin|>'+ tool['function']['name'] + '<|tool▁sep|>' + tool['function']['arguments']|tojson + '<|tool▁call▁end|>'}}
{%- else %}
{{message['content'] + '<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['function']['name'] + '<|tool▁sep|>' + tool['function']['arguments']|tojson + '<|tool▁call▁end|>'}}
{%- endif %}
{%- set ns.is_first = true -%}
{%- else %}
{{'<|tool▁call▁begin|>'+ tool['function']['name'] + '<|tool▁sep|>' + tool['function']['arguments']|tojson + '<|tool▁call▁end|>'}}
{%- endif %}
{%- endfor %}
{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}
{%- endif %}
{%- if message['role'] == 'assistant' and (message['tool_calls'] is not defined or message['tool_calls'] is none) %}
{%- if ns.is_last_user %}
{{'<|Assistant|>'}}
{%- if message['prefix'] is defined and message['prefix'] and thinking %}
{{'<think>'}}
{%- else %}
{{'</think>'}}
{%- endif %}
{%- endif %}
{%- set ns.is_last_user = false -%}
{%- if ns.is_tool %}
{{message['content'] + '<|end▁of▁sentence|>'}}
{%- set ns.is_tool = false -%}
{%- else %}
{%- set content = message['content'] -%}
{%- if '</think>' in content %}
{%- set content = content.split('</think>', 1)[1] -%}
{%- endif %}
{{content + '<|end▁of▁sentence|>'}}
{%- endif %}
{%- endif %}
{%- if message['role'] == 'tool' %}
{%- set ns.is_last_user = false -%}
{%- set ns.is_tool = true -%}
{{'<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}
{%- endif %}
{%- endfor -%}
{%- if add_generation_prompt and ns.is_last_user and not ns.is_tool %}
{{'<|Assistant|>'}}
{%- if not thinking %}
{{'</think>'}}
{%- else %}
{{'<think>'}}
{%- endif %}
{% endif %}
{#- Begin-of-sequence token to start the model prompt -#}
{{ bos_token }}
{#- Extracts the system message. Gemma does not support system messages so it will be prepended to first user message. -#}
{%- if messages[0]['role'] == 'system' -%}
{%- if messages[0]['content'] is string -%}
{%- set first_user_prefix = messages[0]['content'] + '\n\n' -%}
{%- else -%}
{%- set first_user_prefix = messages[0]['content'][0]['text'] + '\n\n' -%}
{%- endif -%}
{%- set loop_messages = messages[1:] -%}
{%- else -%}
{%- set first_user_prefix = "" -%}
{%- set loop_messages = messages -%}
{%- endif -%}
{#- Set tools to none if not defined for this ChatCompletion request (helps avoid errors later) -#}
{%- if not tools is defined %}
{%- set tools = none %}
{%- endif %}
{#- Validate alternating user/assistant messages (excluding 'tool' messages and ones with tool_calls) -#}
{%- for message in loop_messages | rejectattr("role", "equalto", "tool") | selectattr("tool_calls", "undefined") -%}
{%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
{{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
{%- endif -%}
{%- endfor -%}
{#- Main loop over all messages in the conversation history -#}
{%- for message in loop_messages -%}
{#- Normalize roles for model prompt formatting -#}
{%- if (message['role'] == 'assistant') -%}
{%- set role = "model" -%}
{%- elif (message['role'] == 'tool') -%}
{%- set role = "user" -%}
{%- else -%}
{%- set role = message['role'] -%}
{%- endif -%}
{#- Mark the start of a message block with the appropriate role -#}
{{ '<start_of_turn>' + role + '\n' -}}
{#- Insert system message content (if present) at the beginning of the first message. -#}
{%- if loop.first -%}
{{ first_user_prefix }}
{#- Append system message with tool information if using tools in message request. -#}
{%- if tools is not none -%}
{{- "Tools (functions) are available. If you decide to invoke one or more of the tools, you must respond with a python list of the function calls.\n" -}}
{{- "Example Format: [func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)] \n" -}}
{{- "Do not use variables. DO NOT USE MARKDOWN SYNTAX. You SHOULD NOT include any other text in the response if you call a function. If none of the functions can be used, point it out. If you lack the parameters required by the function, also point it out.\n" -}}
{{- "Here is a list of functions in JSON format that you can invoke.\n" -}}
{{- tools | tojson(indent=4) -}}
{{- "\n\n" -}}
{%- endif -%}
{%- endif -%}
{#- Format model tool calls (turns where model indicates they want to call a tool) -#}
{%- if 'tool_calls' in message -%}
{#- Opening bracket for tool call list. -#}
{{- '[' -}}
{#- For each tool call -#}
{%- for tool_call in message.tool_calls -%}
{#- Get tool call function. -#}
{%- if tool_call.function is defined -%}
{%- set tool_call = tool_call.function -%}
{%- endif -%}
{#- Function name & opening parenthesis. -#}
{{- tool_call.name + '(' -}}
{#-- Handle arguments as list (positional) or dict (named) --#}
{#-- Named arguments (dict) --#}
{%- if tool_call.arguments is iterable and tool_call.arguments is mapping -%}
{%- set first = true -%}
{%- for key, val in tool_call.arguments.items() -%}
{%- if not first %}, {% endif -%}
{{ key }}={{ val | tojson }}
{%- set first = false -%}
{%- endfor -%}
{#-- Positional arguments (list) --#}
{%- elif tool_call.arguments is iterable -%}
{{- tool_call.arguments | map('tojson') | join(', ') -}}
{#-- Fallback: single positional value --#}
{%- else -%}
{{- tool_call.arguments | tojson -}}
{#-- Closing parenthesis. --#}
{%- endif -%}
{{- ')' -}}
{#-- If more than one tool call, place comma and move to formatting next tool call --#}
{%- if not loop.last -%}, {% endif -%}
{%- endfor -%}
{#- Closing bracket for tool call list. -#}
{{- ']' -}}
{%- endif -%}
{#- Tool response start tag (for messages from a tool) -#}
{%- if (message['role'] == 'tool') -%}
{{ '<tool_response>\n' -}}
{%- endif -%}
{#- Render the message content: handle plain string or multimodal content like image/text -#}
{%- if message['content'] is string -%}
{{ message['content'] | trim }}
{%- elif message['content'] is iterable -%}
{%- for item in message['content'] -%}
{%- if item['type'] == 'image' -%}
{{ '<start_of_image>' }}
{%- elif item['type'] == 'text' -%}
{{ item['text'] | trim }}
{%- endif -%}
{%- endfor -%}
{%- else -%}
{{ raise_exception("Invalid content type") }}
{%- endif -%}
{#- Tool response end tag -#}
{%- if (message['role'] == 'tool') -%}
{{ '</tool_response>' -}}
{%- endif -%}
{#- Mark end of a single turn -#}
{{ '<end_of_turn>\n' }}
{%- endfor -%}
{#- If generation is to be triggered, add model prompt prefix -#}
{%- if add_generation_prompt -%}
{{'<start_of_turn>model\n'}}
{%- endif -%}
\ No newline at end of file
{%- if messages and messages[0]['role'] == 'system' %}
{%- set system_message = messages[0]['content']|trim %}
{%- set messages = messages[1:] %}
{%- else %}
{%- set system_message = "You are a helpful assistant." %}
{%- endif %}
{%- if messages %}
{%- if system_message or tools %}
<|system|>
{%- if system_message %}
{{ system_message }}
{%- endif %}
{%- if tools %}
In addition to plain text responses, you can chose to call one or more of the provided functions.
Use the following rule to decide when to call a function:
......@@ -19,13 +23,11 @@ If you decide to call functions:
* make sure you pick the right functions that match the user intent
{%- if tools %}
{%- for t in tools %}
{{- t | tojson(indent=4) }}
{{- "\n\n" }}
{%- endfor %}
{%- endif %}<|end|>
{%- endif %}
{%- for message in messages %}
{%- if message.role != "system" %}
......
{% macro render_extra_keys(json_dict, handled_keys) %}
{%- if json_dict is mapping %}
{%- for json_key in json_dict if json_key not in handled_keys %}
{%- if json_dict[json_key] is mapping or (json_dict[json_key] is sequence and json_dict[json_key] is not string) %}
{{- '\n<' ~ json_key ~ '>' ~ (json_dict[json_key] | tojson | safe) ~ '</' ~ json_key ~ '>' }}
{%- else %}
{{-'\n<' ~ json_key ~ '>' ~ (json_dict[json_key] | string) ~ '</' ~ json_key ~ '>' }}
{%- endif %}
{%- endfor %}
{%- endif %}
{% endmacro %}
{%- if messages[0]["role"] == "system" %}
{%- set system_message = messages[0]["content"] %}
{%- set loop_messages = messages[1:] %}
{%- else %}
{%- set loop_messages = messages %}
{%- endif %}
{%- if not tools is defined %}
{%- set tools = [] %}
{%- endif %}
{%- if system_message is defined %}
{{- "<|im_start|>system\n" + system_message }}
{%- else %}
{%- if tools is iterable and tools | length > 0 %}
{{- "<|im_start|>system\nYou are Qwen, a helpful AI assistant that can interact with a computer to solve tasks." }}
{%- endif %}
{%- endif %}
{%- if tools is iterable and tools | length > 0 %}
{{- "\n\n# Tools\n\nYou have access to the following functions:\n\n" }}
{{- "<tools>" }}
{%- for tool in tools %}
{%- if tool.function is defined %}
{%- set tool = tool.function %}
{%- endif %}
{{- "\n<function>\n<name>" ~ tool.name ~ "</name>" }}
{%- if tool.description is defined %}
{{- '\n<description>' ~ (tool.description | trim) ~ '</description>' }}
{%- endif %}
{{- '\n<parameters>' }}
{%- if tool.parameters is defined and tool.parameters is mapping and tool.parameters.properties is defined and tool.parameters.properties is mapping %}
{%- for param_name, param_fields in tool.parameters.properties|items %}
{{- '\n<parameter>' }}
{{- '\n<name>' ~ param_name ~ '</name>' }}
{%- if param_fields.type is defined %}
{{- '\n<type>' ~ (param_fields.type | string) ~ '</type>' }}
{%- endif %}
{%- if param_fields.description is defined %}
{{- '\n<description>' ~ (param_fields.description | trim) ~ '</description>' }}
{%- endif %}
{%- set handled_keys = ['name', 'type', 'description'] %}
{{- render_extra_keys(param_fields, handled_keys) }}
{{- '\n</parameter>' }}
{%- endfor %}
{%- endif %}
{% set handled_keys = ['type', 'properties'] %}
{{- render_extra_keys(tool.parameters, handled_keys) }}
{{- '\n</parameters>' }}
{%- set handled_keys = ['type', 'name', 'description', 'parameters'] %}
{{- render_extra_keys(tool, handled_keys) }}
{{- '\n</function>' }}
{%- endfor %}
{{- "\n</tools>" }}
{{- '\n\nIf you choose to call a function ONLY reply in the following format with NO suffix:\n\n<tool_call>\n<function=example_function_name>\n<parameter=example_parameter_1>\nvalue_1\n</parameter>\n<parameter=example_parameter_2>\nThis is the value for the second parameter\nthat can span\nmultiple lines\n</parameter>\n</function>\n</tool_call>\n\n<IMPORTANT>\nReminder:\n- Function calls MUST follow the specified format: an inner <function=...></function> block must be nested within <tool_call></tool_call> XML tags\n- Required parameters MUST be specified\n- You may provide optional reasoning for your function call in natural language BEFORE the function call, but NOT after\n- If there is no function call available, answer the question like normal with your current knowledge and do not tell the user about function calls\n</IMPORTANT>' }}
{%- endif %}
{%- if system_message is defined %}
{{- '<|im_end|>\n' }}
{%- else %}
{%- if tools is iterable and tools | length > 0 %}
{{- '<|im_end|>\n' }}
{%- endif %}
{%- endif %}
{%- for message in loop_messages %}
{%- if message.role == "assistant" and message.tool_calls is defined and message.tool_calls is iterable and message.tool_calls | length > 0 %}
{{- '<|im_start|>' + message.role }}
{%- if message.content is defined and message.content is string and message.content | trim | length > 0 %}
{{- '\n' + message.content | trim + '\n' }}
{%- endif %}
{%- for tool_call in message.tool_calls %}
{%- if tool_call.function is defined %}
{%- set tool_call = tool_call.function %}
{%- endif %}
{{- '\n<tool_call>\n<function=' + tool_call.name + '>\n' }}
{%- if tool_call.arguments is defined %}
{%- for args_name, args_value in tool_call.arguments|items %}
{{- '<parameter=' + args_name + '>\n' }}
{%- set args_value = args_value | tojson | safe if args_value is mapping or (args_value is sequence and args_value is not string) else args_value | string %}
{{- args_value }}
{{- '\n</parameter>\n' }}
{%- endfor %}
{%- endif %}
{{- '</function>\n</tool_call>' }}
{%- endfor %}
{{- '<|im_end|>\n' }}
{%- elif message.role == "user" or message.role == "system" or message.role == "assistant" %}
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
{%- elif message.role == "tool" %}
{%- if loop.previtem and loop.previtem.role != "tool" %}
{{- '<|im_start|>user\n' }}
{%- endif %}
{{- '<tool_response>\n' }}
{{- message.content }}
{{- '\n</tool_response>\n' }}
{%- if not loop.last and loop.nextitem.role != "tool" %}
{{- '<|im_end|>\n' }}
{%- elif loop.last %}
{{- '<|im_end|>\n' }}
{%- endif %}
{%- else %}
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>\n' }}
{%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
{{- '<|im_start|>assistant\n' }}
{%- endif %}
......@@ -129,15 +129,16 @@ markdown_extensions:
- toc:
permalink: true
# For math rendering
- mdx_math:
enable_dollar_delimiter: true
- pymdownx.arithmatex:
generic: true
extra_css:
- mkdocs/stylesheets/extra.css
extra_javascript:
- mkdocs/javascript/run_llm_widget.js
- https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS_HTML
- mkdocs/javascript/mathjax.js
- https://unpkg.com/mathjax@3.2.2/es5/tex-mml-chtml.js
- mkdocs/javascript/edit_and_feedback.js
- mkdocs/javascript/slack_and_forum.js
......
......@@ -6,7 +6,7 @@ requires = [
"packaging>=24.2",
"setuptools>=77.0.3,<80.0.0",
"setuptools-scm>=8.0",
"torch == 2.7.1",
"torch == 2.8.0",
"wheel",
"jinja2",
]
......@@ -24,13 +24,14 @@ classifiers = [
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Programming Language :: Python :: 3.13",
"Intended Audience :: Developers",
"Intended Audience :: Information Technology",
"Intended Audience :: Science/Research",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
"Topic :: Scientific/Engineering :: Information Analysis",
]
requires-python = ">=3.9,<3.13"
requires-python = ">=3.9,<3.14"
dynamic = [ "version", "dependencies", "optional-dependencies"]
[project.urls]
......
......@@ -4,7 +4,8 @@ ninja
packaging>=24.2
setuptools>=77.0.3,<80.0.0
setuptools-scm>=8
torch==2.7.1
torch==2.8.0
wheel
jinja2>=3.1.6
regex
build
......@@ -7,20 +7,21 @@ requests >= 2.26.0
tqdm
blake3
py-cpuinfo
transformers >= 4.55.0
transformers >= 4.55.2
tokenizers >= 0.21.1 # Required for fast incremental detokenization.
protobuf # Required by LlamaTokenizer.
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
aiohttp
openai >= 1.99.1 # For Responses API with reasoning content
pydantic >= 2.10
pydantic >= 2.11.7
prometheus_client >= 0.18.0
pillow # Required for image processing
prometheus-fastapi-instrumentator >= 7.0.0
tiktoken >= 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer >= 0.10.11, < 0.11
lm-format-enforcer == 0.11.3
llguidance >= 0.7.11, < 0.8.0; platform_machine == "x86_64" or platform_machine == "arm64" or platform_machine == "aarch64"
outlines_core == 0.2.10
outlines_core == 0.2.10 ; platform_machine != "s390x"
outlines == 0.1.11 ; platform_machine == "s390x"
# required for outlines backend disk cache
diskcache == 5.6.3
lark == 1.2.2
......@@ -38,7 +39,7 @@ pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
setuptools>=77.0.3,<80; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
einops # Required for Qwen2-VL.
compressed-tensors == 0.10.2 # required for compressed-tensors
compressed-tensors == 0.11.0 # required for compressed-tensors
depyf==0.19.0 # required for profiling and debugging with compilation config
cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
watchfiles # required for http server to monitor the updates of TLS files
......
# Common dependencies
-r common.txt
numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
numba == 0.61.2; python_version > '3.9'
numba == 0.60.0; python_version == '3.9' and platform_machine != "s390x" # v0.61 doesn't support Python 3.9. Required for N-gram speculative decoding
numba == 0.61.2; python_version > '3.9' and platform_machine != "s390x"
# Dependencies for CPUs
packaging>=24.2
setuptools>=77.0.3,<80.0.0
--extra-index-url https://download.pytorch.org/whl/cpu
torch==2.6.0+cpu; platform_machine == "x86_64" # torch>2.6.0+cpu has performance regression on x86 platform, see https://github.com/pytorch/pytorch/pull/151218
torch==2.7.0; platform_system == "Darwin"
torch==2.7.0; platform_machine == "ppc64le"
torch==2.6.0; platform_machine == "aarch64" # for arm64 CPUs, torch 2.7.0 has an issue: https://github.com/vllm-project/vllm/issues/17960
torch==2.8.0; platform_system == "Darwin"
torch==2.8.0; platform_machine == "ppc64le" or platform_machine == "aarch64"
# required for the image processor of minicpm-o-2_6, this must be updated alongside torch
torchaudio; platform_machine != "ppc64le" and platform_machine != "s390x"
torchaudio==2.7.0; platform_machine == "ppc64le"
torchaudio==2.8.0; platform_machine == "ppc64le"
# required for the image processor of phi3v, this must be updated alongside torch
torchvision; platform_machine != "ppc64le" and platform_machine != "s390x"
torchvision==0.22.0; platform_machine == "ppc64le"
torchvision==0.23.0; platform_machine == "ppc64le"
datasets # for benchmark scripts
# Intel Extension for PyTorch, only for x86_64 CPUs
......
......@@ -6,9 +6,9 @@ numba == 0.61.2; python_version > '3.9'
# Dependencies for NVIDIA GPUs
ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
torch==2.7.1
torchaudio==2.7.1
torch==2.8.0
torchaudio==2.8.0
# These must be updated alongside torch
torchvision==0.22.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
# https://github.com/facebookresearch/xformers/releases/tag/v0.0.31
xformers==0.0.31; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.7
\ No newline at end of file
torchvision==0.23.0 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
# https://github.com/facebookresearch/xformers/releases/tag/v0.0.32.post1
xformers==0.0.32.post1; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.8
......@@ -7,27 +7,12 @@ mkdocs-awesome-nav
mkdocs-glightbox
mkdocs-git-revision-date-localized-plugin
mkdocs-minify-plugin
python-markdown-math
regex
ruff
# Required for argparse hook only
-f https://download.pytorch.org/whl/cpu
cachetools
cbor2
cloudpickle
fastapi
msgspec
openai
openai-harmony
partial-json-parser
pillow
psutil
pybase64
pydantic
setproctitle
torch
transformers
zmq
uvloop
prometheus-client
......@@ -27,7 +27,7 @@ mistral_common[image,audio] >= 1.8.2 # required for voxtral test
num2words # required for smolvlm test
opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test
lm-eval[api]==0.4.8 # required for model evaluation test
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
mteb>=1.38.11, <2 # required for mteb test
transformers==4.52.4
tokenizers==0.21.1
......
# Common dependencies
-r common.txt
--extra-index-url https://download.pytorch.org/whl/rocm6.2.4
torch==2.7.0
torchvision==0.22.0
torchaudio==2.7.0
--extra-index-url https://download.pytorch.org/whl/rocm6.3
torch==2.8.0
torchvision==0.23.0
torchaudio==2.8.0
triton==3.2
triton==3.3.0
cmake>=3.26.1,<4
packaging>=24.2
setuptools>=77.0.3,<80.0.0
......
......@@ -22,9 +22,9 @@ sentence-transformers # required for embedding tests
soundfile # required for audio tests
jiwer # required for audio tests
timm >=1.0.17 # required for internvl and gemma3n-mm test
torch==2.7.1
torchaudio==2.7.1
torchvision==0.22.1
torch==2.8.0
torchaudio==2.8.0
torchvision==0.23.0
transformers_stream_generator # required for qwen-vl test
matplotlib # required for qwen-vl test
mistral_common[image,audio] >= 1.8.2 # required for voxtral test
......@@ -32,9 +32,10 @@ num2words # required for smolvlm test
open_clip_torch==2.32.0 # Required for nemotron_vl test
opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test
lm-eval[api]==0.4.8 # required for model evaluation test
# TODO: Use lm-eval[api]==0.4.10 once released
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d # required for model evaluation test
mteb[bm25s]>=1.38.11, <2 # required for mteb test
transformers==4.55.0
transformers==4.55.2
tokenizers==0.21.1
schemathesis>=3.39.15 # Required for openai schema test.
# quantization
......@@ -53,3 +54,4 @@ runai-model-streamer-s3==0.11.0
fastsafetensors>=0.1.10
pydantic>=2.10 # 2.9 leads to error on python 3.10
terratorch==1.1rc2 # required for PrithviMAE test
decord==0.6.0
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment