"vllm/utils/argparse_utils.py" did not exist on "b893d661b1b9dd3953a1cc95bb52d010c4683dcf"
Unverified commit 6c046382, authored by Harry Mellor, committed by GitHub
Browse files

Fix per file ruff ignores related to line length (#26262)


Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
parent 91ac7f76
...@@ -947,7 +947,8 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt( ...@@ -947,7 +947,8 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt(
{"type": "image_url", "image_url": {"url": image_url}}, {"type": "image_url", "image_url": {"url": image_url}},
{ {
"type": "text", "type": "text",
"text": "What's in <|image_1|> and how does it compare to the other one?", # noqa: E501 "text": "What's in <|image_1|> and how does it compare to "
"the other one?",
}, },
], ],
} }
...@@ -960,8 +961,8 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt( ...@@ -960,8 +961,8 @@ def test_parse_chat_messages_placeholder_one_already_in_prompt(
assert conversation == [ assert conversation == [
{ {
"role": "user", "role": "user",
"content": "<|image_2|>\nWhat's in <|image_1|> and how does it compare to the " "content": "<|image_2|>\nWhat's in <|image_1|> and how does it compare to "
"other one?", "the other one?",
} }
] ]
_assert_mm_data_is_image_input(mm_data, 2) _assert_mm_data_is_image_input(mm_data, 2)
...@@ -1364,7 +1365,7 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave( ...@@ -1364,7 +1365,7 @@ def test_parse_chat_messages_multiple_images_multiple_messages_interleave(
_assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None]) _assert_mm_uuids(mm_uuids, 2, expected_uuids=[None, None])
def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interleave( # noqa: E501 def test_parse_chat_messages_multiple_images_with_uuids_multiple_messages_interleave(
phi3v_model_config_mm_interleaved, phi3v_model_config_mm_interleaved,
phi3v_tokenizer, phi3v_tokenizer,
image_url, image_url,
...@@ -1451,14 +1452,14 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave( ...@@ -1451,14 +1452,14 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
assert conversation == [ assert conversation == [
{ {
"role": "user", "role": "user",
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
"Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>", # noqa: E501 "\nNow listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",
}, },
{"role": "assistant", "content": "Some stuff."}, {"role": "assistant", "content": "Some stuff."},
{ {
"role": "user", "role": "user",
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
"And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>", "\nAnd what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
}, },
] ]
...@@ -1468,7 +1469,7 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave( ...@@ -1468,7 +1469,7 @@ def test_parse_chat_messages_multiple_modals_multiple_messages_interleave(
_assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[None]) _assert_mm_uuids(mm_uuids, 1, modality="audio", expected_uuids=[None])
def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interleave( # noqa: E501 def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interleave(
qwen25omni_model_config_mm_interleaved, qwen25omni_model_config_mm_interleaved,
qwen25omni_tokenizer, qwen25omni_tokenizer,
image_url, image_url,
...@@ -1521,14 +1522,14 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interl ...@@ -1521,14 +1522,14 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_messages_interl
assert conversation == [ assert conversation == [
{ {
"role": "user", "role": "user",
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
"Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>", # noqa: E501 "\nNow listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",
}, },
{"role": "assistant", "content": "Some stuff."}, {"role": "assistant", "content": "Some stuff."},
{ {
"role": "user", "role": "user",
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
"And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>", "\nAnd what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
}, },
] ]
...@@ -1593,14 +1594,14 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_mes ...@@ -1593,14 +1594,14 @@ def test_parse_chat_messages_multiple_modals_with_uuids_multiple_empty_media_mes
assert conversation == [ assert conversation == [
{ {
"role": "user", "role": "user",
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
"Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>", # noqa: E501 "\nNow listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",
}, },
{"role": "assistant", "content": "Some stuff."}, {"role": "assistant", "content": "Some stuff."},
{ {
"role": "user", "role": "user",
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
"And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>", "\nAnd what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
}, },
] ]
...@@ -1661,14 +1662,14 @@ def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_message ...@@ -1661,14 +1662,14 @@ def test_parse_chat_messages_multiple_modals_with_partial_uuids_multiple_message
assert conversation == [ assert conversation == [
{ {
"role": "user", "role": "user",
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
"Now listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>", # noqa: E501 "\nNow listen to this audio\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>",
}, },
{"role": "assistant", "content": "Some stuff."}, {"role": "assistant", "content": "Some stuff."},
{ {
"role": "user", "role": "user",
"content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>\n" "content": "What's on this image?\n<|vision_start|><|IMAGE|><|vision_end|>"
"And what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>", "\nAnd what's in the video?\n<|vision_start|><|VIDEO|><|vision_end|>",
}, },
] ]
...@@ -2193,7 +2194,8 @@ def test_parse_chat_messages_single_empty_audio_with_uuid( ...@@ -2193,7 +2194,8 @@ def test_parse_chat_messages_single_empty_audio_with_uuid(
assert conversation == [ assert conversation == [
{ {
"role": "user", "role": "user",
"content": "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?", "content": "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the "
"audio say?",
} }
] ]
_assert_mm_data_inputs(mm_data, {"audio": 1}) _assert_mm_data_inputs(mm_data, {"audio": 1})
...@@ -2228,7 +2230,8 @@ async def test_parse_chat_messages_single_empty_audio_with_uuid_async( ...@@ -2228,7 +2230,8 @@ async def test_parse_chat_messages_single_empty_audio_with_uuid_async(
assert conversation == [ assert conversation == [
{ {
"role": "user", "role": "user",
"content": "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the audio say?", "content": "Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat does the "
"audio say?",
} }
] ]
_assert_mm_data_inputs(await mm_future, {"audio": 1}) _assert_mm_data_inputs(await mm_future, {"audio": 1})
......
...@@ -165,7 +165,7 @@ def test_env( ...@@ -165,7 +165,7 @@ def test_env(
# FlashMLA only supports block_size == 64 # FlashMLA only supports block_size == 64
pytest.skip("FlashMLA only supports block_size 64") pytest.skip("FlashMLA only supports block_size 64")
else: else:
from vllm.v1.attention.backends.mla.flashmla import ( # noqa: E501 from vllm.v1.attention.backends.mla.flashmla import (
is_flashmla_supported, is_flashmla_supported,
) )
......
...@@ -331,7 +331,8 @@ class WeightTensors: ...@@ -331,7 +331,8 @@ class WeightTensors:
in_dtype=config.dtype, in_dtype=config.dtype,
quant_dtype=config.quant_dtype, quant_dtype=config.quant_dtype,
block_shape=config.quant_block_shape, block_shape=config.quant_block_shape,
per_out_ch_quant=config.is_per_act_token_quant, # or config.is_per_out_ch_quant # or config.is_per_out_ch_quant
per_out_ch_quant=config.is_per_act_token_quant,
) )
return WeightTensors( return WeightTensors(
w1=w1, w2=w2, w1_scale=w1_scale, w2_scale=w2_scale, w1_gs=w1_gs, w2_gs=w2_gs w1=w1, w2=w2, w1_scale=w1_scale, w2_scale=w2_scale, w1_gs=w1_gs, w2_gs=w2_gs
......
...@@ -124,7 +124,7 @@ def make_feature_matrix(csv_file_path: str): ...@@ -124,7 +124,7 @@ def make_feature_matrix(csv_file_path: str):
results_df: Optional[pd.DataFrame] = None results_df: Optional[pd.DataFrame] = None
for m, k, n, e, topks, dtype, pf_type, experts_type, quant_config in tqdm( for m, k, n, e, topks, dtype, pf_type, experts_type, quant_config in tqdm(
combinations combinations
): # noqa: E501 ):
config = Config( config = Config(
Ms=[m], Ms=[m],
K=k, K=k,
......
...@@ -10,7 +10,7 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk ...@@ -10,7 +10,7 @@ import vllm.model_executor.layers.fused_moe.modular_kernel as mk
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import ( from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
BatchedDeepGemmExperts, BatchedDeepGemmExperts,
) )
from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import ( # noqa: E501 from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import (
BatchedTritonOrDeepGemmExperts, BatchedTritonOrDeepGemmExperts,
) )
from vllm.model_executor.layers.fused_moe.config import ( from vllm.model_executor.layers.fused_moe.config import (
...@@ -196,10 +196,10 @@ register_experts( ...@@ -196,10 +196,10 @@ register_experts(
# Disable on blackwell for now # Disable on blackwell for now
if has_deep_ep() and not current_platform.has_device_capability(100): if has_deep_ep() and not current_platform.has_device_capability(100):
from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import ( # noqa: E501 from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (
DeepEPHTPrepareAndFinalize, DeepEPHTPrepareAndFinalize,
) )
from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import ( # noqa: E501 from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (
DeepEPLLPrepareAndFinalize, DeepEPLLPrepareAndFinalize,
) )
...@@ -233,7 +233,7 @@ if has_pplx(): ...@@ -233,7 +233,7 @@ if has_pplx():
) )
if has_flashinfer_cutlass_fused_moe() and current_platform.has_device_capability(100): if has_flashinfer_cutlass_fused_moe() and current_platform.has_device_capability(100):
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( # noqa: E501 from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import (
FlashInferExperts, FlashInferExperts,
) )
from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa: E501 from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa: E501
......
...@@ -17,10 +17,10 @@ from typing_extensions import Concatenate, ParamSpec ...@@ -17,10 +17,10 @@ from typing_extensions import Concatenate, ParamSpec
from vllm.utils import get_open_port, has_deep_ep from vllm.utils import get_open_port, has_deep_ep
if has_deep_ep(): if has_deep_ep():
from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import ( # noqa: E501 from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (
DeepEPHTPrepareAndFinalize, DeepEPHTPrepareAndFinalize,
) )
from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import ( # noqa: E501 from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (
DeepEPLLPrepareAndFinalize, DeepEPLLPrepareAndFinalize,
) )
......
...@@ -30,10 +30,10 @@ from .parallel_utils import ProcessGroupInfo, parallel_launch ...@@ -30,10 +30,10 @@ from .parallel_utils import ProcessGroupInfo, parallel_launch
from .utils import make_test_weights from .utils import make_test_weights
if has_deep_ep(): if has_deep_ep():
from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import ( # noqa: E501 from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (
DeepEPHTPrepareAndFinalize, DeepEPHTPrepareAndFinalize,
) )
from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import ( # noqa: E501 from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (
DeepEPLLPrepareAndFinalize, DeepEPLLPrepareAndFinalize,
) )
......
...@@ -28,10 +28,10 @@ from ...utils import multi_gpu_test ...@@ -28,10 +28,10 @@ from ...utils import multi_gpu_test
from .parallel_utils import ProcessGroupInfo, parallel_launch from .parallel_utils import ProcessGroupInfo, parallel_launch
if has_deep_ep(): if has_deep_ep():
from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import ( # noqa: E501 from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (
DeepEPHTPrepareAndFinalize, DeepEPHTPrepareAndFinalize,
) )
from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import ( # noqa: E501 from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (
DeepEPLLPrepareAndFinalize, DeepEPLLPrepareAndFinalize,
) )
......
...@@ -271,7 +271,7 @@ if __name__ == "__main__": ...@@ -271,7 +271,7 @@ if __name__ == "__main__":
parser = make_config_arg_parser( parser = make_config_arg_parser(
description=( description=(
"Run single prepare-finalize & fused-experts combination test" "Run single prepare-finalize & fused-experts combination test"
"Example : python3 -m tests.kernels.moe.test_modular_kernel_combinations " # noqa: E501 "Example : python3 -m tests.kernels.moe.test_modular_kernel_combinations "
"--pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts" "--pf-type PplxPrepareAndFinalize --experts-type BatchedTritonExperts"
) )
) )
......
...@@ -483,8 +483,8 @@ def test_mixtral_moe( ...@@ -483,8 +483,8 @@ def test_mixtral_moe(
} }
if use_rocm_aiter: if use_rocm_aiter:
# The values of rtol and atol are set based on the tests in ROCM AITER package. # noqa: E501 # The values of rtol and atol are set based on the tests in ROCM AITER package.
# https://github.com/ROCm/aiter/blob/dfed377f4be7da96ca2d75ac0761f569676f7240/op_tests/test_moe.py#L174 # noqa: E501 # https://github.com/ROCm/aiter/blob/dfed377f4be7da96ca2d75ac0761f569676f7240/op_tests/test_moe.py#L174
torch.testing.assert_close( torch.testing.assert_close(
hf_states.flatten(0, 1), vllm_states, rtol=0.01, atol=100 hf_states.flatten(0, 1), vllm_states, rtol=0.01, atol=100
) )
......
...@@ -10,11 +10,11 @@ import pytest ...@@ -10,11 +10,11 @@ import pytest
import torch import torch
from packaging import version from packaging import version
from vllm.model_executor.layers.quantization.quark.quark import ( # noqa: E501 from vllm.model_executor.layers.quantization.quark.quark import (
QuarkLinearMethod, QuarkLinearMethod,
QuarkW4A4MXFP4, QuarkW4A4MXFP4,
) )
from vllm.model_executor.layers.quantization.quark.quark_moe import ( # noqa: E501 from vllm.model_executor.layers.quantization.quark.quark_moe import (
QuarkW4A4MXFp4MoEMethod, QuarkW4A4MXFp4MoEMethod,
) )
from vllm.platforms import current_platform from vllm.platforms import current_platform
......
...@@ -12,7 +12,7 @@ PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example ...@@ -12,7 +12,7 @@ PROMPT_TEMPLATE = """I want you to act as a SQL terminal in front of an example
EXPECTED_LORA_OUTPUT = [ EXPECTED_LORA_OUTPUT = [
"SELECT count(*) FROM singer", "SELECT count(*) FROM singer",
"SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", # noqa: E501 "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'",
"SELECT name , country , age FROM singer ORDER BY age", "SELECT name , country , age FROM singer ORDER BY age",
] ]
...@@ -21,10 +21,16 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]: ...@@ -21,10 +21,16 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> list[str]:
prompts = [ prompts = [
PROMPT_TEMPLATE.format(query="How many singers do we have?"), PROMPT_TEMPLATE.format(query="How many singers do we have?"),
PROMPT_TEMPLATE.format( PROMPT_TEMPLATE.format(
query="What is the average, minimum, and maximum age of all singers from France?" # noqa: E501 query=(
"What is the average, minimum, and maximum "
"age of all singers from France?"
)
), ),
PROMPT_TEMPLATE.format( PROMPT_TEMPLATE.format(
query="Show name, country, age for all singers ordered by age from the oldest to the youngest." # noqa: E501 query=(
"Show name, country, age for all singers ordered "
"by age from the oldest to the youngest."
)
), ),
] ]
sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32) sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32)
......
...@@ -15,10 +15,10 @@ MODEL_PATH = "meta-llama/Llama-2-7b-hf" ...@@ -15,10 +15,10 @@ MODEL_PATH = "meta-llama/Llama-2-7b-hf"
EXPECTED_LORA_OUTPUT = [ EXPECTED_LORA_OUTPUT = [
" SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501 " SELECT icao FROM table_name_74 WHERE airport = 'lilongwe international airport' ", # noqa: E501
" SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ", # noqa: E501 " SELECT nationality FROM table_name_11 WHERE elector = 'anchero pantaleone' ",
" SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ", # noqa: E501 " SELECT one_mora FROM table_name_95 WHERE gloss = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] AND accented_mora = 'low tone mora with a gloss of /˩okiru/' [òkìɽɯ́] ", # noqa: E501
" SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ", # noqa: E501 " SELECT sex FROM people WHERE people_id IN (SELECT people_id FROM candidate GROUP BY sex ORDER BY COUNT(people_id) DESC LIMIT 1) ", # noqa: E501
" SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ", # noqa: E501 " SELECT pick FROM table_name_60 WHERE former_wnba_team = 'Minnesota Lynx' ",
" SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' ", # noqa: E501 " SELECT womens_doubles FROM table_28138035_4 WHERE mens_singles = 'Werner Schlager' ", # noqa: E501
] ]
......
...@@ -26,7 +26,7 @@ LORA_RANK = 8 ...@@ -26,7 +26,7 @@ LORA_RANK = 8
LORA_TEST_PROMPTS = ["What is GitHub?", "Hi, tell me about you"] LORA_TEST_PROMPTS = ["What is GitHub?", "Hi, tell me about you"]
LORA_TEST_EXPECTED = [ LORA_TEST_EXPECTED = [
"GitHub is an open-source platform that provides a way to manage and develop software projects. It allows developers to store and manage code, collaborate on projects, and automate tasks.", # noqa: E501 "GitHub is an open-source platform that provides a way to manage and develop software projects. It allows developers to store and manage code, collaborate on projects, and automate tasks.", # noqa: E501
"I am Alice, an AI assistant developed by GitHub/Charent.", # noqa: E501 "I am Alice, an AI assistant developed by GitHub/Charent.",
] ]
......
...@@ -16,7 +16,7 @@ def test_dummy_loader(vllm_runner, monkeypatch, model: str) -> None: ...@@ -16,7 +16,7 @@ def test_dummy_loader(vllm_runner, monkeypatch, model: str) -> None:
) as llm: ) as llm:
if model == "google/gemma-3-4b-it": if model == "google/gemma-3-4b-it":
normalizers = llm.llm.collective_rpc( normalizers = llm.llm.collective_rpc(
lambda self: self.model_runner.model.language_model.model.normalizer.cpu().item() lambda self: self.model_runner.model.language_model.model.normalizer.cpu().item() # noqa: E501
) )
config = llm.llm.llm_engine.model_config.hf_config.text_config config = llm.llm.llm_engine.model_config.hf_config.text_config
else: else:
......
...@@ -46,12 +46,13 @@ TOOLS = [ ...@@ -46,12 +46,13 @@ TOOLS = [
"properties": { "properties": {
"city": { "city": {
"type": "string", "type": "string",
"description": "The city to find the weather for, e.g. 'San Francisco'", "description": "The city to find the weather for, e.g. "
"'San Francisco'",
}, },
"state": { "state": {
"type": "string", "type": "string",
"description": "the two-letter abbreviation for the state that the city is" "description": "the two-letter abbreviation for the state that "
" in, e.g. 'CA' which would mean 'California'", "the city is in, e.g. 'CA' which would mean 'California'",
}, },
"unit": { "unit": {
"type": "string", "type": "string",
...@@ -85,7 +86,8 @@ MSGS = [ ...@@ -85,7 +86,8 @@ MSGS = [
{"role": "system", "content": "You are an assistant."}, {"role": "system", "content": "You are an assistant."},
{ {
"role": "user", "role": "user",
"content": "Could you please rewrite the below article? \n\n My English needs improvving, maybe I make errors.", # noqa "content": "Could you please rewrite the below article? \n\n My English needs "
"improvving, maybe I make errors.",
}, },
{ {
"role": "assistant", "role": "assistant",
...@@ -96,14 +98,16 @@ MSGS = [ ...@@ -96,14 +98,16 @@ MSGS = [
"type": "function", "type": "function",
"function": { "function": {
"name": "rewrite", "name": "rewrite",
"arguments": '{"text":"My English needs improvving, maybe I make errors."}', # noqa "arguments": '{"text":"My English needs improvving, maybe '
'I make errors."}',
}, },
} }
], ],
}, },
{ {
"role": "tool", "role": "tool",
"content": '{"action":"rewrite","outcome":"My English needs improving, maybe I make errors."}', # noqa "content": '{"action":"rewrite","outcome":"My English needs improving, maybe '
'I make errors."}',
"tool_call_id": "bbc5b7ede", "tool_call_id": "bbc5b7ede",
"name": "rewrite", "name": "rewrite",
}, },
......
...@@ -130,14 +130,14 @@ VLM_TEST_SETTINGS = { ...@@ -130,14 +130,14 @@ VLM_TEST_SETTINGS = {
dtype="bfloat16", dtype="bfloat16",
marks=[ marks=[
pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask") pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")
], # noqa: E501 ],
), ),
"qwen2_5_vl": VLMTestInfo( "qwen2_5_vl": VLMTestInfo(
models=["Qwen/Qwen2.5-VL-3B-Instruct"], models=["Qwen/Qwen2.5-VL-3B-Instruct"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501 img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501 video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>",
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
...@@ -149,8 +149,8 @@ VLM_TEST_SETTINGS = { ...@@ -149,8 +149,8 @@ VLM_TEST_SETTINGS = {
models=["Qwen/Qwen2.5-Omni-3B"], models=["Qwen/Qwen2.5-Omni-3B"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_bos|><|IMAGE|><|vision_eos|>", # noqa: E501 img_idx_to_prompt=lambda idx: "<|vision_bos|><|IMAGE|><|vision_eos|>",
video_idx_to_prompt=lambda idx: "<|vision_bos|><|VIDEO|><|vision_eos|>", # noqa: E501 video_idx_to_prompt=lambda idx: "<|vision_bos|><|VIDEO|><|vision_eos|>",
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
num_logprobs=6 if current_platform.is_cpu() else 5, num_logprobs=6 if current_platform.is_cpu() else 5,
...@@ -181,7 +181,7 @@ VLM_TEST_SETTINGS = { ...@@ -181,7 +181,7 @@ VLM_TEST_SETTINGS = {
max_model_len=16384, max_model_len=16384,
hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs( hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf" "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
), # noqa: E501 ),
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output, vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
image_size_factors=[(0.25, 0.5, 1.0)], image_size_factors=[(0.25, 0.5, 1.0)],
...@@ -213,7 +213,7 @@ VLM_TEST_SETTINGS = { ...@@ -213,7 +213,7 @@ VLM_TEST_SETTINGS = {
models=["Qwen/Qwen2.5-VL-3B-Instruct"], models=["Qwen/Qwen2.5-VL-3B-Instruct"],
test_type=VLMTestType.IMAGE, test_type=VLMTestType.IMAGE,
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501 img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
...@@ -237,10 +237,10 @@ VLM_TEST_SETTINGS = { ...@@ -237,10 +237,10 @@ VLM_TEST_SETTINGS = {
single_image_prompts=IMAGE_ASSETS.prompts( single_image_prompts=IMAGE_ASSETS.prompts(
{ {
"stop_sign": "<vlm_image>Please describe the image shortly.", "stop_sign": "<vlm_image>Please describe the image shortly.",
"cherry_blossom": "<vlm_image>Please infer the season with reason.", # noqa: E501 "cherry_blossom": "<vlm_image>Please infer the season with reason.",
} }
), ),
multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.", # noqa: E501 multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.",
stop_str=["<|im_end|>"], stop_str=["<|im_end|>"],
image_size_factors=[(0.10, 0.15)], image_size_factors=[(0.10, 0.15)],
max_tokens=64, max_tokens=64,
...@@ -252,11 +252,11 @@ VLM_TEST_SETTINGS = { ...@@ -252,11 +252,11 @@ VLM_TEST_SETTINGS = {
prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts( single_image_prompts=IMAGE_ASSETS.prompts(
{ {
"stop_sign": "<image>What's the content in the center of the image?", # noqa: E501 "stop_sign": "<image>What's the content in the center of the image?",
"cherry_blossom": "<image>What is the season?", # noqa: E501 "cherry_blossom": "<image>What is the season?",
} }
), ),
multi_image_prompt="<image><image>Describe the two images in detail.", # noqa: E501 multi_image_prompt="<image><image>Describe the two images in detail.",
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
...@@ -268,11 +268,11 @@ VLM_TEST_SETTINGS = { ...@@ -268,11 +268,11 @@ VLM_TEST_SETTINGS = {
prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts( single_image_prompts=IMAGE_ASSETS.prompts(
{ {
"stop_sign": "<image>What's the content in the center of the image?", # noqa: E501 "stop_sign": "<image>What's the content in the center of the image?",
"cherry_blossom": "<image>What is the season?", # noqa: E501 "cherry_blossom": "<image>What is the season?",
} }
), ),
multi_image_prompt="<image><image>Describe the two images in detail.", # noqa: E501 multi_image_prompt="<image><image>Describe the two images in detail.",
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
...@@ -311,14 +311,14 @@ VLM_TEST_SETTINGS = { ...@@ -311,14 +311,14 @@ VLM_TEST_SETTINGS = {
max_num_seqs=2, max_num_seqs=2,
single_image_prompts=IMAGE_ASSETS.prompts( single_image_prompts=IMAGE_ASSETS.prompts(
{ {
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501 "stop_sign": "<image>\nWhat's the content in the center of the image?",
"cherry_blossom": "<image>\nPlease infer the season with reason in details.", # noqa: E501 "cherry_blossom": "<image>\nPlease infer the season with reason in details.", # noqa: E501
} }
), ),
multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?", # noqa: E501 multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?", # noqa: E501
patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner, patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output, hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
stop_str=["<|end▁of▁sentence|>", "<|begin▁of▁sentence|>"], # noqa: E501 stop_str=["<|end▁of▁sentence|>", "<|begin▁of▁sentence|>"],
image_size_factors=[(), (1.0,), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)], image_size_factors=[(), (1.0,), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
), ),
"fuyu": VLMTestInfo( "fuyu": VLMTestInfo(
...@@ -342,7 +342,7 @@ VLM_TEST_SETTINGS = { ...@@ -342,7 +342,7 @@ VLM_TEST_SETTINGS = {
single_image_prompts=IMAGE_ASSETS.prompts( single_image_prompts=IMAGE_ASSETS.prompts(
{ {
"stop_sign": "<start_of_image>What's the content in the center of the image?", # noqa: E501 "stop_sign": "<start_of_image>What's the content in the center of the image?", # noqa: E501
"cherry_blossom": "<start_of_image>What is the season?", # noqa: E501 "cherry_blossom": "<start_of_image>What is the season?",
} }
), ),
multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501 multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.", # noqa: E501
...@@ -356,7 +356,7 @@ VLM_TEST_SETTINGS = { ...@@ -356,7 +356,7 @@ VLM_TEST_SETTINGS = {
"glm4v": VLMTestInfo( "glm4v": VLMTestInfo(
models=["zai-org/glm-4v-9b"], models=["zai-org/glm-4v-9b"],
test_type=VLMTestType.IMAGE, test_type=VLMTestType.IMAGE,
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>",
single_image_prompts=IMAGE_ASSETS.prompts( single_image_prompts=IMAGE_ASSETS.prompts(
{ {
"stop_sign": "<|begin_of_image|><|endoftext|><|end_of_image|>What's the content in the center of the image?", # noqa: E501 "stop_sign": "<|begin_of_image|><|endoftext|><|end_of_image|>What's the content in the center of the image?", # noqa: E501
...@@ -377,9 +377,9 @@ VLM_TEST_SETTINGS = { ...@@ -377,9 +377,9 @@ VLM_TEST_SETTINGS = {
"glm4_1v": VLMTestInfo( "glm4_1v": VLMTestInfo(
models=["zai-org/GLM-4.1V-9B-Thinking"], models=["zai-org/GLM-4.1V-9B-Thinking"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>",
img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>", # noqa: E501 img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>",
video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>", # noqa: E501 video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>",
max_model_len=2048, max_model_len=2048,
max_num_seqs=2, max_num_seqs=2,
get_stop_token_ids=lambda tok: [151329, 151336, 151338], get_stop_token_ids=lambda tok: [151329, 151336, 151338],
...@@ -410,10 +410,10 @@ VLM_TEST_SETTINGS = { ...@@ -410,10 +410,10 @@ VLM_TEST_SETTINGS = {
"h2oai/h2ovl-mississippi-2b", "h2oai/h2ovl-mississippi-2b",
], ],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>",
single_image_prompts=IMAGE_ASSETS.prompts( single_image_prompts=IMAGE_ASSETS.prompts(
{ {
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501 "stop_sign": "<image>\nWhat's the content in the center of the image?",
"cherry_blossom": "<image>\nWhat is the season?", "cherry_blossom": "<image>\nWhat is the season?",
} }
), ),
...@@ -444,7 +444,7 @@ VLM_TEST_SETTINGS = { ...@@ -444,7 +444,7 @@ VLM_TEST_SETTINGS = {
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts( single_image_prompts=IMAGE_ASSETS.prompts(
{ {
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501 "stop_sign": "<image>\nWhat's the content in the center of the image?",
"cherry_blossom": "<image>\nWhat is the season?", "cherry_blossom": "<image>\nWhat is the season?",
} }
), ),
...@@ -529,7 +529,7 @@ VLM_TEST_SETTINGS = { ...@@ -529,7 +529,7 @@ VLM_TEST_SETTINGS = {
max_model_len=16384, max_model_len=16384,
hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs( hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf" "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
), # noqa: E501 ),
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output, vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
custom_test_opts=[ custom_test_opts=[
...@@ -583,7 +583,7 @@ VLM_TEST_SETTINGS = { ...@@ -583,7 +583,7 @@ VLM_TEST_SETTINGS = {
max_num_seqs=2, max_num_seqs=2,
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids( get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(
["<|im_end|>", "<|endoftext|>"] ["<|im_end|>", "<|endoftext|>"]
), # noqa: E501 ),
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner, patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
# FIXME: https://huggingface.co/openbmb/MiniCPM-o-2_6/discussions/49 # FIXME: https://huggingface.co/openbmb/MiniCPM-o-2_6/discussions/49
...@@ -598,7 +598,7 @@ VLM_TEST_SETTINGS = { ...@@ -598,7 +598,7 @@ VLM_TEST_SETTINGS = {
max_num_seqs=2, max_num_seqs=2,
get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids( get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(
["<|im_end|>", "<|endoftext|>"] ["<|im_end|>", "<|endoftext|>"]
), # noqa: E501 ),
hf_output_post_proc=model_utils.minicpmv_trunc_hf_output, hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner, patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
), ),
...@@ -627,7 +627,7 @@ VLM_TEST_SETTINGS = { ...@@ -627,7 +627,7 @@ VLM_TEST_SETTINGS = {
models=["AIDC-AI/Ovis1.6-Gemma2-9B"], models=["AIDC-AI/Ovis1.6-Gemma2-9B"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501 img_idx_to_prompt=lambda idx: "<image>\n",
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
dtype="half", dtype="half",
...@@ -640,7 +640,7 @@ VLM_TEST_SETTINGS = { ...@@ -640,7 +640,7 @@ VLM_TEST_SETTINGS = {
models=["AIDC-AI/Ovis2-1B"], models=["AIDC-AI/Ovis2-1B"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501 img_idx_to_prompt=lambda idx: "<image>\n",
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
dtype="half", dtype="half",
...@@ -652,7 +652,7 @@ VLM_TEST_SETTINGS = { ...@@ -652,7 +652,7 @@ VLM_TEST_SETTINGS = {
models=["AIDC-AI/Ovis2.5-2B"], models=["AIDC-AI/Ovis2.5-2B"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<image>\n", # noqa: E501 img_idx_to_prompt=lambda idx: "<image>\n",
video_idx_to_prompt=lambda idx: "<video>\n", video_idx_to_prompt=lambda idx: "<video>\n",
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
...@@ -701,8 +701,8 @@ VLM_TEST_SETTINGS = { ...@@ -701,8 +701,8 @@ VLM_TEST_SETTINGS = {
models=["Qwen/Qwen2-VL-2B-Instruct"], models=["Qwen/Qwen2-VL-2B-Instruct"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501 img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501 video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>",
multi_image_prompt="Picture 1: <vlm_image>\nPicture 2: <vlm_image>\nDescribe these two images with one paragraph respectively.", # noqa: E501 multi_image_prompt="Picture 1: <vlm_image>\nPicture 2: <vlm_image>\nDescribe these two images with one paragraph respectively.", # noqa: E501
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
...@@ -717,11 +717,11 @@ VLM_TEST_SETTINGS = { ...@@ -717,11 +717,11 @@ VLM_TEST_SETTINGS = {
prompt_formatter=lambda img_prompt: f"<|begin▁of▁sentence|><|User|>\n{img_prompt}<|Assistant|><think>\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|begin▁of▁sentence|><|User|>\n{img_prompt}<|Assistant|><think>\n", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts( single_image_prompts=IMAGE_ASSETS.prompts(
{ {
"stop_sign": "<image>\nWhat's the content in the center of the image?", # noqa: E501 "stop_sign": "<image>\nWhat's the content in the center of the image?",
"cherry_blossom": "<image>\nWhat is the season?", "cherry_blossom": "<image>\nWhat is the season?",
} }
), ),
multi_image_prompt="<image>\n<image>\nDescribe the two images in short.", # noqa: E501 multi_image_prompt="<image>\n<image>\nDescribe the two images in short.",
max_model_len=4096, max_model_len=4096,
use_tokenizer_eos=True, use_tokenizer_eos=True,
patch_hf_runner=model_utils.skyworkr1v_patch_hf_runner, patch_hf_runner=model_utils.skyworkr1v_patch_hf_runner,
...@@ -754,8 +754,8 @@ VLM_TEST_SETTINGS = { ...@@ -754,8 +754,8 @@ VLM_TEST_SETTINGS = {
VLMTestType.VIDEO, VLMTestType.VIDEO,
), ),
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", # noqa: E501 img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",
video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>", # noqa: E501 video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>",
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
...@@ -816,7 +816,7 @@ VLM_TEST_SETTINGS = { ...@@ -816,7 +816,7 @@ VLM_TEST_SETTINGS = {
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs( hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf" "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
), # noqa: E501 ),
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output, vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
custom_test_opts=[ custom_test_opts=[
CustomTestOptions( CustomTestOptions(
......
...@@ -170,7 +170,7 @@ async def test_online_serving(client, audio_assets: AudioTestAssets): ...@@ -170,7 +170,7 @@ async def test_online_serving(client, audio_assets: AudioTestAssets):
], ],
{ {
"type": "text", "type": "text",
"text": f"What's happening in these {len(audio_assets)} audio clips?", "text": f"What's happening in these {len(audio_assets)} audio clips?", # noqa: E501
}, },
], ],
} }
......
...@@ -101,16 +101,11 @@ async def test_online_serving(client, audio_assets: AudioTestAssets): ...@@ -101,16 +101,11 @@ async def test_online_serving(client, audio_assets: AudioTestAssets):
return audio_dict return audio_dict
audio_chunks = [asset_to_chunk(asset) for asset in audio_assets] audio_chunks = [asset_to_chunk(asset) for asset in audio_assets]
text = f"What's happening in these {len(audio_assets)} audio clips?"
messages = [ messages = [
{ {
"role": "user", "role": "user",
"content": [ "content": [*audio_chunks, {"type": "text", "text": text}],
*audio_chunks,
{
"type": "text",
"text": f"What's happening in these {len(audio_assets)} audio clips?",
},
],
} }
] ]
......
...@@ -102,8 +102,8 @@ def multi_video_multi_aspect_ratio_inputs( ...@@ -102,8 +102,8 @@ def multi_video_multi_aspect_ratio_inputs(
def different_patch_input_cases_internvl(): def different_patch_input_cases_internvl():
images = [asset.pil_image.resize((896, 896)) for asset in IMAGE_ASSETS] images = [asset.pil_image.resize((896, 896)) for asset in IMAGE_ASSETS]
formatter = ( formatter = (
lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n" lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n" # noqa: E501
) # noqa: E501 )
single_img_prompts = [ single_img_prompts = [
"<image>\nWhat's the content in the center of the image?", "<image>\nWhat's the content in the center of the image?",
"<image>\nWhat is the season?", "<image>\nWhat is the season?",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment