Commit 78c1f9e5 authored by zhuwenwen's avatar zhuwenwen
Browse files

sync v0.15.1(tests)

parent 86a65417
...@@ -311,7 +311,7 @@ VLM_TEST_SETTINGS = { ...@@ -311,7 +311,7 @@ VLM_TEST_SETTINGS = {
vllm_runner_kwargs={"mm_processor_kwargs": {"crop_to_patches": True}}, vllm_runner_kwargs={"mm_processor_kwargs": {"crop_to_patches": True}},
), ),
"aya_vision-multi_image": VLMTestInfo( "aya_vision-multi_image": VLMTestInfo(
models=["CohereLabs/aya-vision-8b"], models=[os.path.join(models_path_prefix, "CohereLabs/aya-vision-8b")],
test_type=(VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts( single_image_prompts=IMAGE_ASSETS.prompts(
...@@ -328,7 +328,7 @@ VLM_TEST_SETTINGS = { ...@@ -328,7 +328,7 @@ VLM_TEST_SETTINGS = {
marks=[large_gpu_mark(min_gb=32)], marks=[large_gpu_mark(min_gb=32)],
), ),
"blip2": VLMTestInfo( "blip2": VLMTestInfo(
models=[os.path.join(models_path_prefix,"Salesforce/blip2-opt-2.7b")], models=[os.path.join(models_path_prefix, "Salesforce/blip2-opt-2.7b")],
test_type=VLMTestType.IMAGE, test_type=VLMTestType.IMAGE,
prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:", prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:",
img_idx_to_prompt=lambda idx: "", img_idx_to_prompt=lambda idx: "",
...@@ -352,7 +352,7 @@ VLM_TEST_SETTINGS = { ...@@ -352,7 +352,7 @@ VLM_TEST_SETTINGS = {
dtype="bfloat16", dtype="bfloat16",
), ),
"deepseek_vl_v2": VLMTestInfo( "deepseek_vl_v2": VLMTestInfo(
models=["Isotr0py/deepseek-vl2-tiny"], # model repo using dynamic module models=[os.path.join(models_path_prefix, "Isotr0py/deepseek-vl2-tiny")], # model repo using dynamic module
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: ", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: ", # noqa: E501
max_model_len=4096, max_model_len=4096,
...@@ -401,7 +401,7 @@ VLM_TEST_SETTINGS = { ...@@ -401,7 +401,7 @@ VLM_TEST_SETTINGS = {
patch_hf_runner=model_utils.gemma3_patch_hf_runner, patch_hf_runner=model_utils.gemma3_patch_hf_runner,
), ),
"granite_vision": VLMTestInfo( "granite_vision": VLMTestInfo(
models=["ibm-granite/granite-vision-3.3-2b"], models=[os.path.join(models_path_prefix, "ibm-granite/granite-vision-3.3-2b")],
test_type=(VLMTestType.IMAGE), test_type=(VLMTestType.IMAGE),
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}\n<|assistant|>\n", prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}\n<|assistant|>\n",
max_model_len=8192, max_model_len=8192,
...@@ -445,7 +445,7 @@ VLM_TEST_SETTINGS = { ...@@ -445,7 +445,7 @@ VLM_TEST_SETTINGS = {
marks=[large_gpu_mark(min_gb=32)], marks=[large_gpu_mark(min_gb=32)],
), ),
"glm4_1v-video": VLMTestInfo( "glm4_1v-video": VLMTestInfo(
models=["zai-org/GLM-4.1V-9B-Thinking"], models=[os.path.join(models_path_prefix, "zai-org/GLM-4.1V-9B-Thinking")],
# GLM4.1V require include video metadata for input # GLM4.1V require include video metadata for input
test_type=VLMTestType.CUSTOM_INPUTS, test_type=VLMTestType.CUSTOM_INPUTS,
prompt_formatter=lambda vid_prompt: f"[gMASK]<|user|>\n{vid_prompt}<|assistant|>\n", # noqa: E501 prompt_formatter=lambda vid_prompt: f"[gMASK]<|user|>\n{vid_prompt}<|assistant|>\n", # noqa: E501
...@@ -461,20 +461,6 @@ VLM_TEST_SETTINGS = { ...@@ -461,20 +461,6 @@ VLM_TEST_SETTINGS = {
], ],
marks=[large_gpu_mark(min_gb=32)], marks=[large_gpu_mark(min_gb=32)],
), ),
"glm_ocr": VLMTestInfo(
models=["zai-org/GLM-OCR"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"[gMASK]<|user|>\n{img_prompt}<|assistant|>\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>",
video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>",
max_model_len=2048,
max_num_seqs=2,
get_stop_token_ids=lambda tok: [151329, 151336, 151338],
num_logprobs=10,
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
auto_cls=AutoModelForImageTextToText,
marks=[large_gpu_mark(min_gb=32)],
),
"h2ovl": VLMTestInfo( "h2ovl": VLMTestInfo(
models=[ models=[
os.path.join(models_path_prefix,"h2oai/h2ovl-mississippi-800m"), os.path.join(models_path_prefix,"h2oai/h2ovl-mississippi-800m"),
...@@ -526,7 +512,7 @@ VLM_TEST_SETTINGS = { ...@@ -526,7 +512,7 @@ VLM_TEST_SETTINGS = {
), ),
"intern_vl-video": VLMTestInfo( "intern_vl-video": VLMTestInfo(
models=[ models=[
"OpenGVLab/InternVL3-1B", os.path.join(models_path_prefix, "OpenGVLab/InternVL3-1B"),
], ],
test_type=VLMTestType.VIDEO, test_type=VLMTestType.VIDEO,
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
...@@ -537,7 +523,7 @@ VLM_TEST_SETTINGS = { ...@@ -537,7 +523,7 @@ VLM_TEST_SETTINGS = {
num_logprobs=10 if current_platform.is_rocm() else 5, num_logprobs=10 if current_platform.is_rocm() else 5,
), ),
"intern_vl-hf": VLMTestInfo( "intern_vl-hf": VLMTestInfo(
models=["OpenGVLab/InternVL3-1B-hf"], models=[os.path.join(models_path_prefix, "OpenGVLab/InternVL3-1B-hf")],
test_type=( test_type=(
VLMTestType.IMAGE, VLMTestType.IMAGE,
VLMTestType.MULTI_IMAGE, VLMTestType.MULTI_IMAGE,
...@@ -552,8 +538,8 @@ VLM_TEST_SETTINGS = { ...@@ -552,8 +538,8 @@ VLM_TEST_SETTINGS = {
), ),
"isaac": VLMTestInfo( "isaac": VLMTestInfo(
models=[ models=[
"PerceptronAI/Isaac-0.1", os.path.join(models_path_prefix, "PerceptronAI/Isaac-0.1"),
"PerceptronAI/Isaac-0.2-2B-Preview", os.path.join(models_path_prefix, "PerceptronAI/Isaac-0.2-2B-Preview"),
], ],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: ( prompt_formatter=lambda img_prompt: (
...@@ -579,7 +565,7 @@ VLM_TEST_SETTINGS = { ...@@ -579,7 +565,7 @@ VLM_TEST_SETTINGS = {
image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
), ),
"kimi_vl": VLMTestInfo( "kimi_vl": VLMTestInfo(
models=["moonshotai/Kimi-VL-A3B-Instruct"], models=[os.path.join(models_path_prefix, "moonshotai/Kimi-VL-A3B-Instruct")],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|im_user|>user<|im_middle|>{img_prompt}<|im_end|><|im_assistant|>assistant<|im_middle|>", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|im_user|>user<|im_middle|>{img_prompt}<|im_end|><|im_assistant|>assistant<|im_middle|>", # noqa: E501
img_idx_to_prompt=lambda _: "<|media_start|>image<|media_content|><|media_pad|><|media_end|>", # noqa: E501 img_idx_to_prompt=lambda _: "<|media_start|>image<|media_content|><|media_pad|><|media_end|>", # noqa: E501
...@@ -590,21 +576,6 @@ VLM_TEST_SETTINGS = { ...@@ -590,21 +576,6 @@ VLM_TEST_SETTINGS = {
vllm_output_post_proc=model_utils.kimiv_vl_vllm_to_hf_output, vllm_output_post_proc=model_utils.kimiv_vl_vllm_to_hf_output,
marks=[large_gpu_mark(min_gb=48)], marks=[large_gpu_mark(min_gb=48)],
), ),
"llama4": VLMTestInfo(
models=["meta-llama/Llama-4-Scout-17B-16E-Instruct"],
prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|header_start|>user<|header_end|>\n\n{img_prompt}<|eot|><|header_start|>assistant<|header_end|>\n\n", # noqa: E501
img_idx_to_prompt=lambda _: "<|image|>",
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
distributed_executor_backend="mp",
image_size_factors=[(0.25, 0.5, 1.0)],
hf_model_kwargs={"device_map": "auto"},
max_model_len=8192,
max_num_seqs=4,
dtype="bfloat16",
auto_cls=AutoModelForImageTextToText,
tensor_parallel_size=4,
marks=multi_gpu_marks(num_gpus=4),
),
"llava_next": VLMTestInfo( "llava_next": VLMTestInfo(
models=[os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf")], models=[os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf")],
test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS), test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
...@@ -675,7 +646,7 @@ VLM_TEST_SETTINGS = { ...@@ -675,7 +646,7 @@ VLM_TEST_SETTINGS = {
marks=[pytest.mark.skip("HF import fails")], marks=[pytest.mark.skip("HF import fails")],
), ),
"minicpmo_26": VLMTestInfo( "minicpmo_26": VLMTestInfo(
models=["openbmb/MiniCPM-o-2_6"], models=[os.path.join(models_path_prefix, "openbmb/MiniCPM-o-2_6")],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
img_idx_to_prompt=lambda idx: "(<image>./</image>)\n", img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
...@@ -703,7 +674,7 @@ VLM_TEST_SETTINGS = { ...@@ -703,7 +674,7 @@ VLM_TEST_SETTINGS = {
patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner, patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
), ),
"minimax_vl_01": VLMTestInfo( "minimax_vl_01": VLMTestInfo(
models=["MiniMaxAI/MiniMax-VL-01"], models=[os.path.join(models_path_prefix, "MiniMaxAI/MiniMax-VL-01")],
prompt_formatter=lambda img_prompt: f"<beginning_of_sentence>user: {img_prompt} assistant:<end_of_sentence>", # noqa: E501 prompt_formatter=lambda img_prompt: f"<beginning_of_sentence>user: {img_prompt} assistant:<end_of_sentence>", # noqa: E501
img_idx_to_prompt=lambda _: "<image>", img_idx_to_prompt=lambda _: "<image>",
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
...@@ -726,7 +697,7 @@ VLM_TEST_SETTINGS = { ...@@ -726,7 +697,7 @@ VLM_TEST_SETTINGS = {
], ],
), ),
"molmo": VLMTestInfo( "molmo": VLMTestInfo(
models=["allenai/Molmo-7B-D-0924"], models=[os.path.join(models_path_prefix, "allenai/Molmo-7B-D-0924")],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=identity, prompt_formatter=identity,
max_model_len=4096, max_model_len=4096,
...@@ -734,7 +705,7 @@ VLM_TEST_SETTINGS = { ...@@ -734,7 +705,7 @@ VLM_TEST_SETTINGS = {
patch_hf_runner=model_utils.molmo_patch_hf_runner, patch_hf_runner=model_utils.molmo_patch_hf_runner,
), ),
"ovis1_6-gemma2": VLMTestInfo( "ovis1_6-gemma2": VLMTestInfo(
models=["AIDC-AI/Ovis1.6-Gemma2-9B"], models=[os.path.join(models_path_prefix, "AIDC-AI/Ovis1.6-Gemma2-9B")],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<image>\n", img_idx_to_prompt=lambda idx: "<image>\n",
...@@ -747,7 +718,7 @@ VLM_TEST_SETTINGS = { ...@@ -747,7 +718,7 @@ VLM_TEST_SETTINGS = {
marks=[large_gpu_mark(min_gb=32)], marks=[large_gpu_mark(min_gb=32)],
), ),
"ovis2": VLMTestInfo( "ovis2": VLMTestInfo(
models=["AIDC-AI/Ovis2-1B"], models=[os.path.join(models_path_prefix, "AIDC-AI/Ovis2-1B")],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<image>\n", img_idx_to_prompt=lambda idx: "<image>\n",
...@@ -759,7 +730,7 @@ VLM_TEST_SETTINGS = { ...@@ -759,7 +730,7 @@ VLM_TEST_SETTINGS = {
patch_hf_runner=model_utils.ovis_patch_hf_runner, patch_hf_runner=model_utils.ovis_patch_hf_runner,
), ),
"ovis2_5": VLMTestInfo( "ovis2_5": VLMTestInfo(
models=["AIDC-AI/Ovis2.5-2B"], models=[os.path.join(models_path_prefix, "AIDC-AI/Ovis2.5-2B")],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<image>\n", img_idx_to_prompt=lambda idx: "<image>\n",
...@@ -772,7 +743,7 @@ VLM_TEST_SETTINGS = { ...@@ -772,7 +743,7 @@ VLM_TEST_SETTINGS = {
hf_model_kwargs={"revision": "refs/pr/5"}, hf_model_kwargs={"revision": "refs/pr/5"},
), ),
"paddleocr_vl": VLMTestInfo( "paddleocr_vl": VLMTestInfo(
models=["PaddlePaddle/PaddleOCR-VL"], models=[os.path.join(models_path_prefix, "PaddlePaddle/PaddleOCR-VL")],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
img_idx_to_prompt=lambda idx: ( img_idx_to_prompt=lambda idx: (
...@@ -795,7 +766,7 @@ VLM_TEST_SETTINGS = { ...@@ -795,7 +766,7 @@ VLM_TEST_SETTINGS = {
], ],
), ),
"phi3v": VLMTestInfo( "phi3v": VLMTestInfo(
models=["microsoft/Phi-3.5-vision-instruct"], models=[os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct")],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|end|>\n<|assistant|>\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|end|>\n<|assistant|>\n", # noqa: E501
img_idx_to_prompt=lambda idx: f"<|image_{idx}|>\n", img_idx_to_prompt=lambda idx: f"<|image_{idx}|>\n",
...@@ -835,7 +806,7 @@ VLM_TEST_SETTINGS = { ...@@ -835,7 +806,7 @@ VLM_TEST_SETTINGS = {
prompt_path_encoder=model_utils.qwen_prompt_path_encoder, prompt_path_encoder=model_utils.qwen_prompt_path_encoder,
), ),
"qwen2_vl": VLMTestInfo( "qwen2_vl": VLMTestInfo(
models=["Qwen/Qwen2-VL-2B-Instruct"], models=[os.path.join(models_path_prefix, "Qwen/Qwen2-VL-2B-Instruct")],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",
...@@ -849,7 +820,7 @@ VLM_TEST_SETTINGS = { ...@@ -849,7 +820,7 @@ VLM_TEST_SETTINGS = {
marks=[pytest.mark.cpu_model], marks=[pytest.mark.cpu_model],
), ),
"skywork_r1v": VLMTestInfo( "skywork_r1v": VLMTestInfo(
models=["Skywork/Skywork-R1V-38B"], models=[os.path.join(models_path_prefix, "Skywork/Skywork-R1V-38B")],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|begin▁of▁sentence|><|User|>\n{img_prompt}<|Assistant|><think>\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|begin▁of▁sentence|><|User|>\n{img_prompt}<|Assistant|><think>\n", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts( single_image_prompts=IMAGE_ASSETS.prompts(
...@@ -865,7 +836,7 @@ VLM_TEST_SETTINGS = { ...@@ -865,7 +836,7 @@ VLM_TEST_SETTINGS = {
marks=[large_gpu_mark(min_gb=80)], marks=[large_gpu_mark(min_gb=80)],
), ),
"smolvlm": VLMTestInfo( "smolvlm": VLMTestInfo(
models=["HuggingFaceTB/SmolVLM2-2.2B-Instruct"], models=[os.path.join(models_path_prefix, "HuggingFaceTB/SmolVLM2-2.2B-Instruct")],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|im_start|>User:{img_prompt}<end_of_utterance>\nAssistant:", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|im_start|>User:{img_prompt}<end_of_utterance>\nAssistant:", # noqa: E501
img_idx_to_prompt=lambda idx: "<image>", img_idx_to_prompt=lambda idx: "<image>",
...@@ -876,7 +847,7 @@ VLM_TEST_SETTINGS = { ...@@ -876,7 +847,7 @@ VLM_TEST_SETTINGS = {
num_logprobs=10, num_logprobs=10,
), ),
"tarsier": VLMTestInfo( "tarsier": VLMTestInfo(
models=["omni-research/Tarsier-7b"], models=[os.path.join(models_path_prefix, "omni-research/Tarsier-7b")],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"USER: {img_prompt} ASSISTANT:", prompt_formatter=lambda img_prompt: f"USER: {img_prompt} ASSISTANT:",
max_model_len=4096, max_model_len=4096,
...@@ -885,7 +856,7 @@ VLM_TEST_SETTINGS = { ...@@ -885,7 +856,7 @@ VLM_TEST_SETTINGS = {
patch_hf_runner=model_utils.tarsier_patch_hf_runner, patch_hf_runner=model_utils.tarsier_patch_hf_runner,
), ),
"tarsier2": VLMTestInfo( "tarsier2": VLMTestInfo(
models=["omni-research/Tarsier2-Recap-7b"], models=[os.path.join(models_path_prefix, "omni-research/Tarsier2-Recap-7b")],
test_type=( test_type=(
VLMTestType.IMAGE, VLMTestType.IMAGE,
VLMTestType.MULTI_IMAGE, VLMTestType.MULTI_IMAGE,
...@@ -953,7 +924,7 @@ VLM_TEST_SETTINGS = { ...@@ -953,7 +924,7 @@ VLM_TEST_SETTINGS = {
max_num_seqs=2, max_num_seqs=2,
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs( hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf" os.path.join(models_path_prefix, "llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
), ),
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output, vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
custom_test_opts=[ custom_test_opts=[
...@@ -973,7 +944,7 @@ VLM_TEST_SETTINGS = { ...@@ -973,7 +944,7 @@ VLM_TEST_SETTINGS = {
), ),
# regression test for https://github.com/vllm-project/vllm/issues/15122 # regression test for https://github.com/vllm-project/vllm/issues/15122
"qwen2_5_vl-windows-attention": VLMTestInfo( "qwen2_5_vl-windows-attention": VLMTestInfo(
models=["Qwen/Qwen2.5-VL-3B-Instruct"], models=[os.path.join(models_path_prefix, "Qwen/Qwen2.5-VL-3B-Instruct")],
test_type=VLMTestType.CUSTOM_INPUTS, test_type=VLMTestType.CUSTOM_INPUTS,
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
......
...@@ -7,6 +7,7 @@ This test validates that each multimodal model can successfully generate outputs ...@@ -7,6 +7,7 @@ This test validates that each multimodal model can successfully generate outputs
using different ViT attention backends. Tests are parametrized by model and backend. using different ViT attention backends. Tests are parametrized by model and backend.
""" """
import os
from dataclasses import asdict from dataclasses import asdict
from typing import Any from typing import Any
...@@ -19,7 +20,7 @@ from vllm.multimodal.video import sample_frames_from_video ...@@ -19,7 +20,7 @@ from vllm.multimodal.video import sample_frames_from_video
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.v1.attention.backends.registry import AttentionBackendEnum from vllm.v1.attention.backends.registry import AttentionBackendEnum
from ....utils import create_new_process_for_each_test from ....utils import create_new_process_for_each_test, models_path_prefix
from ...utils import dummy_hf_overrides from ...utils import dummy_hf_overrides
# Dots.OCR prompt from official repository # Dots.OCR prompt from official repository
...@@ -50,7 +51,7 @@ VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>" ...@@ -50,7 +51,7 @@ VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>"
# Model configurations # Model configurations
MODEL_CONFIGS: dict[str, dict[str, Any]] = { MODEL_CONFIGS: dict[str, dict[str, Any]] = {
"dots_ocr": { "dots_ocr": {
"model_name": "rednote-hilab/dots.ocr", "model_name": os.path.join(models_path_prefix, "rednote-hilab/dots.ocr"),
"interface": "llm_chat", "interface": "llm_chat",
"max_model_len": 32768, "max_model_len": 32768,
"max_num_seqs": 1, "max_num_seqs": 1,
...@@ -66,7 +67,7 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = { ...@@ -66,7 +67,7 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = {
"output_validator": lambda x: len(x) > 10 and "stop" in x.lower(), "output_validator": lambda x: len(x) > 10 and "stop" in x.lower(),
}, },
"ernie45_vl": { "ernie45_vl": {
"model_name": "baidu/ERNIE-4.5-VL-28B-A3B-PT", "model_name": os.path.join(models_path_prefix, "baidu/ERNIE-4.5-VL-28B-A3B-PT"),
"interface": "llm_generate", "interface": "llm_generate",
"max_model_len": 16384, "max_model_len": 16384,
"max_num_seqs": 2, "max_num_seqs": 2,
...@@ -79,7 +80,7 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = { ...@@ -79,7 +80,7 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = {
"question": "What is the content of each image?", "question": "What is the content of each image?",
}, },
"glm4_1v": { "glm4_1v": {
"model_name": "zai-org/GLM-4.1V-9B-Thinking", "model_name": os.path.join(models_path_prefix, "zai-org/GLM-4.1V-9B-Thinking"),
"interface": "llm_generate", "interface": "llm_generate",
"max_model_len": 32768, "max_model_len": 32768,
"max_num_seqs": 2, "max_num_seqs": 2,
...@@ -91,21 +92,8 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = { ...@@ -91,21 +92,8 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = {
"use_processor": True, "use_processor": True,
"question": "What is the content of each image?", "question": "What is the content of each image?",
}, },
"glm_ocr": {
"model_name": "zai-org/GLM-OCR",
"interface": "llm_generate",
"max_model_len": 131072,
"max_num_seqs": 2,
"sampling_params": {
"temperature": 0.0,
"max_tokens": 256,
"stop_token_ids": None,
},
"use_processor": True,
"question": "Text Recognition:",
},
"keye_vl": { "keye_vl": {
"model_name": "Kwai-Keye/Keye-VL-8B-Preview", "model_name": os.path.join(models_path_prefix, "Kwai-Keye/Keye-VL-8B-Preview"),
"interface": "llm_generate", "interface": "llm_generate",
"max_model_len": 8192, "max_model_len": 8192,
"max_num_seqs": 5, "max_num_seqs": 5,
...@@ -122,7 +110,7 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = { ...@@ -122,7 +110,7 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = {
"question": "What is the content of each image?", "question": "What is the content of each image?",
}, },
"ovis2_5": { "ovis2_5": {
"model_name": "AIDC-AI/Ovis2.5-2B", "model_name": os.path.join(models_path_prefix, "AIDC-AI/Ovis2.5-2B"),
"interface": "llm_generate", "interface": "llm_generate",
"max_model_len": 8192, "max_model_len": 8192,
"max_num_seqs": 2, "max_num_seqs": 2,
...@@ -135,7 +123,7 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = { ...@@ -135,7 +123,7 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = {
"question": "What is the content of each image?", "question": "What is the content of each image?",
}, },
"qwen2_5_vl": { "qwen2_5_vl": {
"model_name": "Qwen/Qwen2.5-VL-3B-Instruct", "model_name": os.path.join(models_path_prefix, "Qwen/Qwen2.5-VL-3B-Instruct"),
"interface": "vllm_runner", "interface": "vllm_runner",
"media_type": "video", "media_type": "video",
"max_model_len": 4000, "max_model_len": 4000,
...@@ -154,7 +142,7 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = { ...@@ -154,7 +142,7 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = {
}, },
}, },
"qwen2_5_omni": { "qwen2_5_omni": {
"model_name": "Qwen/Qwen2.5-Omni-3B", "model_name": os.path.join(models_path_prefix, "Qwen/Qwen2.5-Omni-3B"),
"interface": "llm_generate", "interface": "llm_generate",
"max_model_len": 32768, "max_model_len": 32768,
"max_num_seqs": 2, "max_num_seqs": 2,
...@@ -169,7 +157,7 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = { ...@@ -169,7 +157,7 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = {
"question": "What is the content of each image?", "question": "What is the content of each image?",
}, },
"qwen3_omni": { "qwen3_omni": {
"model_name": "Qwen/Qwen3-Omni-30B-A3B-Instruct", "model_name": os.path.join(models_path_prefix, "Qwen/Qwen3-Omni-30B-A3B-Instruct"),
"interface": "llm_generate", "interface": "llm_generate",
"max_model_len": 32768, "max_model_len": 32768,
"max_num_seqs": 2, "max_num_seqs": 2,
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import pytest
from huggingface_hub.constants import HF_HUB_CACHE
from vllm.plugins.lora_resolvers.hf_hub_resolver import HfHubResolver
# Model name passed as the first argument to resolve_lora in the tests that
# use the multi-adapter repo below.
LORA_LIB_MODEL_NAME = "ibm-granite/granite-3.3-8b-instruct"
# Repo with multiple LoRAs contained in it
LORA_LIB = "ibm-granite/granite-3.3-8b-rag-agent-lib"
# Adapter name made of the repo id plus a subpath inside that repo.
LORA_NAME = "ibm-granite/granite-3.3-8b-rag-agent-lib/answerability_prediction_lora" # noqa: E501
# Subpath inside LORA_LIB that names the README file; test_nonlora_adapter
# expects it to resolve to None.
NON_LORA_SUBPATH = "ibm-granite/granite-3.3-8b-rag-agent-lib/README.md"
# Directory under the Hugging Face hub cache; the tests assert that it is a
# substring of the resolved lora_path for LORA_LIB adapters.
LIB_DOWNLOAD_DIR = os.path.join(
HF_HUB_CACHE, "models--ibm-granite--granite-3.3-8b-rag-agent-lib"
)
# Repo name used by test_invalid_repo, which expects resolution to yield None.
INVALID_REPO_NAME = "thisrepodoesnotexist"
# Repo with only one LoRA in the root dir
LORA_REPO_MODEL_NAME = "meta-llama/Llama-2-7b-hf"
LORA_REPO = "yard1/llama-2-7b-sql-lora-test"
# Directory under the Hugging Face hub cache; test_hf_resolver_with_direct_path
# asserts that it is a substring of the resolved lora_path for LORA_REPO.
REPO_DOWNLOAD_DIR = os.path.join(
HF_HUB_CACHE, "models--yard1--llama-2-7b-sql-lora-test"
)
@pytest.mark.asyncio
async def test_hf_resolver_with_direct_path():
    """Resolve a LoRA whose adapter files sit at the root of its repo.

    The requested LoRA name is the repo id itself (``LORA_REPO``). The
    resolved path must contain the hub-cache directory for that repo and
    must hold an ``adapter_config.json`` file.
    """
    hf_resolver = HfHubResolver([LORA_REPO])
    assert hf_resolver is not None

    lora_request = await hf_resolver.resolve_lora(LORA_REPO_MODEL_NAME, LORA_REPO)
    # resolve_lora may return None (the negative tests in this file rely on
    # that), so check it explicitly: a failed resolution should fail this
    # assertion rather than raise AttributeError on the lines below.
    assert lora_request is not None
    assert lora_request.lora_name == LORA_REPO
    assert REPO_DOWNLOAD_DIR in lora_request.lora_path
    assert "adapter_config.json" in os.listdir(lora_request.lora_path)
@pytest.mark.asyncio
async def test_hf_resolver_with_nested_paths():
    """Resolve an adapter addressed as ``<repo id>/<subpath>`` (``LORA_NAME``)."""
    resolver = HfHubResolver([LORA_LIB])
    assert resolver is not None

    request = await resolver.resolve_lora(LORA_LIB_MODEL_NAME, LORA_NAME)
    assert request is not None
    assert request.lora_name == LORA_NAME
    assert LIB_DOWNLOAD_DIR in request.lora_path

    # The resolved directory must hold the adapter configuration file.
    adapter_files = os.listdir(request.lora_path)
    assert "adapter_config.json" in adapter_files
@pytest.mark.asyncio
async def test_hf_resolver_with_multiple_repos():
    """Resolve ``LORA_NAME`` when the resolver is built with two repos."""
    repos = [LORA_LIB, LORA_REPO]
    resolver = HfHubResolver(repos)
    assert resolver is not None

    request = await resolver.resolve_lora(LORA_LIB_MODEL_NAME, LORA_NAME)
    assert request is not None
    assert request.lora_name == LORA_NAME
    assert LIB_DOWNLOAD_DIR in request.lora_path

    # The resolved directory must hold the adapter configuration file.
    adapter_files = os.listdir(request.lora_path)
    assert "adapter_config.json" in adapter_files
@pytest.mark.asyncio
async def test_missing_adapter():
    """Resolving the LoRA name ``"foobar"`` must yield ``None``."""
    resolver = HfHubResolver([LORA_LIB])
    assert resolver is not None

    result = await resolver.resolve_lora(LORA_LIB_MODEL_NAME, "foobar")
    assert result is None
@pytest.mark.asyncio
async def test_nonlora_adapter():
    """Resolving ``NON_LORA_SUBPATH`` (the repo's README) must yield ``None``."""
    resolver = HfHubResolver([LORA_LIB])
    assert resolver is not None

    result = await resolver.resolve_lora(LORA_LIB_MODEL_NAME, NON_LORA_SUBPATH)
    assert result is None
@pytest.mark.asyncio
async def test_invalid_repo():
    """Resolving a name under ``INVALID_REPO_NAME`` must yield ``None``."""
    resolver = HfHubResolver([LORA_LIB])
    assert resolver is not None

    requested_name = f"{INVALID_REPO_NAME}/foo"
    result = await resolver.resolve_lora(INVALID_REPO_NAME, requested_name)
    assert result is None
@pytest.mark.asyncio
async def test_trailing_slash():
    """Resolve ``LORA_NAME`` with a trailing ``/`` appended.

    The returned request is expected to keep the name exactly as it was
    passed in, including the trailing slash.
    """
    resolver = HfHubResolver([LORA_LIB])
    assert resolver is not None

    name_with_slash = f"{LORA_NAME}/"
    request = await resolver.resolve_lora(LORA_LIB_MODEL_NAME, name_with_slash)
    assert request is not None
    assert request.lora_name == f"{LORA_NAME}/"
    assert LIB_DOWNLOAD_DIR in request.lora_path

    # The resolved directory must hold the adapter configuration file.
    adapter_files = os.listdir(request.lora_path)
    assert "adapter_config.json" in adapter_files
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import os
from vllm import SamplingParams
from ..utils import models_path_prefix
MODELS = [os.path.join(models_path_prefix, "distilbert/distilgpt2")]
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_ranks(
    vllm_runner,
    model,
    dtype,
    example_prompts,
):
    """Check the ``rank`` attached to each chosen token's logprob entry.

    The same prompts are generated twice: once greedily (``temperature=0.0``),
    where every chosen token must have rank 1, and once with sampling
    (``temperature=1.0``), where the rank only has to be at least 1.
    """
    max_tokens = 5
    num_top_logprobs = 5
    num_prompt_logprobs = 5

    # Settings common to both generations; only the temperature differs.
    shared_kwargs = {
        "top_p": 1.0,
        "max_tokens": max_tokens,
        "logprobs": num_top_logprobs,
        "prompt_logprobs": num_prompt_logprobs,
    }

    with vllm_runner(model, dtype=dtype, max_logprobs=num_top_logprobs) as runner:
        # Greedy generation.
        greedy_params = SamplingParams(temperature=0.0, **shared_kwargs)
        greedy_results = runner.generate_w_logprobs(example_prompts, greedy_params)

        # Non-greedy generation.
        sampled_params = SamplingParams(temperature=1.0, **shared_kwargs)
        sampled_results = runner.generate_w_logprobs(example_prompts, sampled_params)

    for result in greedy_results:
        assert result[2] is not None
        assert len(result[2]) == len(result[0])
        # Every greedily chosen token must be present with rank 1.
        for position, token in enumerate(result[0]):
            entry = result[2][position]
            assert token in entry
            assert entry[token].rank == 1

    for result in sampled_results:
        assert result[2] is not None
        assert len(result[2]) == len(result[0])
        # Sampled tokens only need to carry a valid rank.
        for position, token in enumerate(result[0]):
            assert result[2][position][token].rank >= 1
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Tests for the UvicornAccessLogFilter class.
"""
import logging
from vllm.logging_utils.access_log_filter import (
UvicornAccessLogFilter,
create_uvicorn_log_config,
)
class TestUvicornAccessLogFilter:
"""Test cases for UvicornAccessLogFilter."""
def test_filter_allows_all_when_no_excluded_paths(self):
    """An empty ``excluded_paths`` list lets the access-log record through."""
    access_filter = UvicornAccessLogFilter(excluded_paths=[])
    request_args = ("127.0.0.1:12345", "GET", "/v1/completions", "1.1", 200)
    record = logging.LogRecord(
        name="uvicorn.access",
        level=logging.INFO,
        pathname="",
        lineno=0,
        msg='%s - "%s %s HTTP/%s" %d',
        args=request_args,
        exc_info=None,
    )
    kept = access_filter.filter(record)
    assert kept is True
def test_filter_allows_all_when_excluded_paths_is_none(self):
    """``excluded_paths=None`` lets the access-log record through."""
    access_filter = UvicornAccessLogFilter(excluded_paths=None)
    request_args = ("127.0.0.1:12345", "GET", "/health", "1.1", 200)
    record = logging.LogRecord(
        name="uvicorn.access",
        level=logging.INFO,
        pathname="",
        lineno=0,
        msg='%s - "%s %s HTTP/%s" %d',
        args=request_args,
        exc_info=None,
    )
    kept = access_filter.filter(record)
    assert kept is True
def test_filter_excludes_health_endpoint(self):
    """A ``/health`` request is dropped when ``/health`` is excluded."""
    access_filter = UvicornAccessLogFilter(excluded_paths=["/health"])
    request_args = ("127.0.0.1:12345", "GET", "/health", "1.1", 200)
    record = logging.LogRecord(
        name="uvicorn.access",
        level=logging.INFO,
        pathname="",
        lineno=0,
        msg='%s - "%s %s HTTP/%s" %d',
        args=request_args,
        exc_info=None,
    )
    kept = access_filter.filter(record)
    assert kept is False
def test_filter_excludes_metrics_endpoint(self):
    """A ``/metrics`` request is dropped when ``/metrics`` is excluded."""
    access_filter = UvicornAccessLogFilter(excluded_paths=["/metrics"])
    request_args = ("127.0.0.1:12345", "GET", "/metrics", "1.1", 200)
    record = logging.LogRecord(
        name="uvicorn.access",
        level=logging.INFO,
        pathname="",
        lineno=0,
        msg='%s - "%s %s HTTP/%s" %d',
        args=request_args,
        exc_info=None,
    )
    kept = access_filter.filter(record)
    assert kept is False
def test_filter_allows_non_excluded_endpoints(self):
"""Filter should allow endpoints not in the excluded list."""
filter = UvicornAccessLogFilter(excluded_paths=["/health", "/metrics"])
record = logging.LogRecord(
name="uvicorn.access",
level=logging.INFO,
pathname="",
lineno=0,
msg='%s - "%s %s HTTP/%s" %d',
args=("127.0.0.1:12345", "POST", "/v1/completions", "1.1", 200),
exc_info=None,
)
assert filter.filter(record) is True
def test_filter_excludes_multiple_endpoints(self):
"""Filter should exclude multiple configured endpoints."""
filter = UvicornAccessLogFilter(excluded_paths=["/health", "/metrics", "/ping"])
# Test /health
record_health = logging.LogRecord(
name="uvicorn.access",
level=logging.INFO,
pathname="",
lineno=0,
msg='%s - "%s %s HTTP/%s" %d',
args=("127.0.0.1:12345", "GET", "/health", "1.1", 200),
exc_info=None,
)
assert filter.filter(record_health) is False
# Test /metrics
record_metrics = logging.LogRecord(
name="uvicorn.access",
level=logging.INFO,
pathname="",
lineno=0,
msg='%s - "%s %s HTTP/%s" %d',
args=("127.0.0.1:12345", "GET", "/metrics", "1.1", 200),
exc_info=None,
)
assert filter.filter(record_metrics) is False
# Test /ping
record_ping = logging.LogRecord(
name="uvicorn.access",
level=logging.INFO,
pathname="",
lineno=0,
msg='%s - "%s %s HTTP/%s" %d',
args=("127.0.0.1:12345", "GET", "/ping", "1.1", 200),
exc_info=None,
)
assert filter.filter(record_ping) is False
def test_filter_with_query_parameters(self):
"""Filter should exclude endpoints even with query parameters."""
filter = UvicornAccessLogFilter(excluded_paths=["/health"])
record = logging.LogRecord(
name="uvicorn.access",
level=logging.INFO,
pathname="",
lineno=0,
msg='%s - "%s %s HTTP/%s" %d',
args=("127.0.0.1:12345", "GET", "/health?verbose=true", "1.1", 200),
exc_info=None,
)
assert filter.filter(record) is False
def test_filter_different_http_methods(self):
"""Filter should exclude endpoints regardless of HTTP method."""
filter = UvicornAccessLogFilter(excluded_paths=["/ping"])
# Test GET
record_get = logging.LogRecord(
name="uvicorn.access",
level=logging.INFO,
pathname="",
lineno=0,
msg='%s - "%s %s HTTP/%s" %d',
args=("127.0.0.1:12345", "GET", "/ping", "1.1", 200),
exc_info=None,
)
assert filter.filter(record_get) is False
# Test POST
record_post = logging.LogRecord(
name="uvicorn.access",
level=logging.INFO,
pathname="",
lineno=0,
msg='%s - "%s %s HTTP/%s" %d',
args=("127.0.0.1:12345", "POST", "/ping", "1.1", 200),
exc_info=None,
)
assert filter.filter(record_post) is False
def test_filter_with_different_status_codes(self):
"""Filter should exclude endpoints regardless of status code."""
filter = UvicornAccessLogFilter(excluded_paths=["/health"])
for status_code in [200, 500, 503]:
record = logging.LogRecord(
name="uvicorn.access",
level=logging.INFO,
pathname="",
lineno=0,
msg='%s - "%s %s HTTP/%s" %d',
args=("127.0.0.1:12345", "GET", "/health", "1.1", status_code),
exc_info=None,
)
assert filter.filter(record) is False
class TestCreateUvicornLogConfig:
    """Checks on the dictionary returned by ``create_uvicorn_log_config``."""

    def test_creates_valid_config_structure(self):
        """The result carries the expected top-level logging-config keys."""
        log_config = create_uvicorn_log_config(excluded_paths=["/health"])
        assert "version" in log_config
        assert log_config["version"] == 1
        for required_key in (
            "disable_existing_loggers",
            "formatters",
            "handlers",
            "loggers",
            "filters",
        ):
            assert required_key in log_config

    def test_config_includes_access_log_filter(self):
        """The filter entry names the filter class and the given paths."""
        log_config = create_uvicorn_log_config(excluded_paths=["/health", "/metrics"])
        registered_filters = log_config["filters"]
        assert "access_log_filter" in registered_filters
        filter_entry = registered_filters["access_log_filter"]
        assert filter_entry["()"] == UvicornAccessLogFilter
        assert filter_entry["excluded_paths"] == ["/health", "/metrics"]

    def test_config_applies_filter_to_access_handler(self):
        """The ``access`` handler references the access log filter."""
        log_config = create_uvicorn_log_config(excluded_paths=["/health"])
        handlers = log_config["handlers"]
        assert "access" in handlers
        assert "filters" in handlers["access"]
        assert "access_log_filter" in handlers["access"]["filters"]

    def test_config_with_custom_log_level(self):
        """A lower-case ``log_level`` shows up upper-cased on all three loggers."""
        log_config = create_uvicorn_log_config(
            excluded_paths=["/health"], log_level="debug"
        )
        for logger_name in ("uvicorn", "uvicorn.access", "uvicorn.error"):
            assert log_config["loggers"][logger_name]["level"] == "DEBUG"

    def test_config_with_empty_excluded_paths(self):
        """An empty path list is stored as an empty list."""
        log_config = create_uvicorn_log_config(excluded_paths=[])
        assert log_config["filters"]["access_log_filter"]["excluded_paths"] == []

    def test_config_with_none_excluded_paths(self):
        """``None`` is normalised to an empty list in the config."""
        log_config = create_uvicorn_log_config(excluded_paths=None)
        assert log_config["filters"]["access_log_filter"]["excluded_paths"] == []
class TestIntegration:
    """Integration tests for the access log filter."""

    def test_filter_with_real_logger(self):
        """Attach the filter to a handler on the real ``uvicorn.access`` logger.

        The logger returned by ``logging.getLogger`` is shared by the whole
        process, so its level and handler list are saved up front and put back
        in ``finally``; otherwise the tracking handler installed here would
        stay attached for every test that runs afterwards.
        """
        logger = logging.getLogger("uvicorn.access")
        saved_level = logger.level
        saved_handlers = logger.handlers

        # Collects the formatted message of every record the handler emits.
        logged_messages: list[str] = []

        class TrackingHandler(logging.Handler):
            def emit(self, record):
                logged_messages.append(record.getMessage())

        handler = TrackingHandler()
        handler.setLevel(logging.INFO)
        handler.addFilter(
            UvicornAccessLogFilter(excluded_paths=["/health", "/metrics"])
        )

        try:
            logger.setLevel(logging.INFO)
            # Start from a fresh handler list so only the tracking handler
            # is attached directly to this logger.
            logger.handlers = []
            logger.addHandler(handler)

            # Same format and args tuple as the hand-built access records:
            # '%s - "%s %s HTTP/%s" %d'
            requests = (
                ("GET", "/health"),
                ("GET", "/v1/completions"),
                ("GET", "/metrics"),
                ("POST", "/v1/chat/completions"),
            )
            for method, path in requests:
                logger.info(
                    '%s - "%s %s HTTP/%s" %d',
                    "127.0.0.1:12345",
                    method,
                    path,
                    "1.1",
                    200,
                )
        finally:
            logger.handlers = saved_handlers
            logger.setLevel(saved_level)

        # Verify only non-excluded endpoints were logged
        assert len(logged_messages) == 2
        assert "/v1/completions" in logged_messages[0]
        assert "/v1/chat/completions" in logged_messages[1]

    def test_filter_allows_non_uvicorn_access_logs(self):
        """Records from a logger other than ``uvicorn.access`` pass through."""
        log_filter = UvicornAccessLogFilter(excluded_paths=["/health"])
        # The message mentions /health, but the record comes from uvicorn.error.
        record = logging.LogRecord(
            name="uvicorn.error",
            level=logging.INFO,
            pathname="",
            lineno=0,
            msg="Some error message about /health",
            args=(),
            exc_info=None,
        )
        assert log_filter.filter(record) is True

    def test_filter_handles_malformed_args(self):
        """Records whose args tuple is too short are allowed through."""
        log_filter = UvicornAccessLogFilter(excluded_paths=["/health"])
        # Only two args instead of the five an access record carries.
        record = logging.LogRecord(
            name="uvicorn.access",
            level=logging.INFO,
            pathname="",
            lineno=0,
            msg="Some message",
            args=("only", "two"),
            exc_info=None,
        )
        assert log_filter.filter(record) is True

    def test_filter_handles_non_tuple_args(self):
        """Records created with ``args=None`` are allowed through."""
        log_filter = UvicornAccessLogFilter(excluded_paths=["/health"])
        record = logging.LogRecord(
            name="uvicorn.access",
            level=logging.INFO,
            pathname="",
            lineno=0,
            msg="Some message without args",
            args=None,
            exc_info=None,
        )
        assert log_filter.filter(record) is True
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import os
from transformers import PreTrainedTokenizerBase
from vllm.transformers_utils.tokenizer import get_tokenizer
from ..utils import models_path_prefix
# Tokenizers are referenced by plain model name. The disabled alternative
# resolved the same two names under the local model directory:
#     os.path.join(models_path_prefix, "facebook/opt-125m")
#     os.path.join(models_path_prefix, "gpt2")
# Note kept from the original file (presumably to use a download mirror):
#     export HF_ENDPOINT=https://hf-mirror.com
TOKENIZER_NAMES = ["facebook/opt-125m", "gpt2"]
@pytest.mark.parametrize("tokenizer_name", TOKENIZER_NAMES)
def test_tokenizer_revision(tokenizer_name: str):
    """Load a tokenizer normally, then check an unknown revision is rejected."""
    # The explicit revision="main" lookup is currently disabled:
    # tokenizer = get_tokenizer(tokenizer_name, revision="main")
    loaded = get_tokenizer(tokenizer_name)
    assert isinstance(loaded, PreTrainedTokenizerBase)

    # A branch called "never" is assumed not to exist, so this must raise.
    expect_bad_revision = pytest.raises(OSError, match="not a valid git identifier")
    with expect_bad_revision:
        get_tokenizer(tokenizer_name, revision="never")
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501 # ruff: noqa: E501
import os
import json import json
import pytest import pytest
...@@ -11,10 +12,11 @@ from vllm.tokenizers import get_tokenizer ...@@ -11,10 +12,11 @@ from vllm.tokenizers import get_tokenizer
from vllm.tool_parsers.glm4_moe_tool_parser import ( from vllm.tool_parsers.glm4_moe_tool_parser import (
Glm4MoeModelToolParser, Glm4MoeModelToolParser,
) )
from ..utils import models_path_prefix
pytest.skip("skip glm4_moe parser test", allow_module_level=True) pytest.skip("skip glm4_moe parser test", allow_module_level=True)
# Use a common model that is likely to be available # Use a common model that is likely to be available
MODEL = "zai-org/GLM-4.5" MODEL = os.path.join(models_path_prefix, "zai-org/GLM-4.5")
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
...@@ -225,6 +227,7 @@ def test_extract_tool_calls( ...@@ -225,6 +227,7 @@ def test_extract_tool_calls(
def test_extract_tool_calls_with_thinking_tags(glm4_moe_tool_parser): def test_extract_tool_calls_with_thinking_tags(glm4_moe_tool_parser):
"""Test tool extraction when thinking tags are present.""" """Test tool extraction when thinking tags are present."""
model_output = """<think>I want to get the weather.</think> model_output = """<think>I want to get the weather.</think>
I will help you get the weather. I will help you get the weather.
<tool_call>get_weather <tool_call>get_weather
<arg_key>city</arg_key> <arg_key>city</arg_key>
...@@ -242,6 +245,7 @@ I will help you get the weather. ...@@ -242,6 +245,7 @@ I will help you get the weather.
assert extracted_tool_calls.tool_calls[0].function.name == "get_weather" assert extracted_tool_calls.tool_calls[0].function.name == "get_weather"
expected_content = """<think>I want to get the weather.</think> expected_content = """<think>I want to get the weather.</think>
I will help you get the weather.""" I will help you get the weather."""
assert extracted_tool_calls.content == expected_content assert extracted_tool_calls.content == expected_content
...@@ -285,6 +289,7 @@ def test_extract_tool_calls_empty_arguments(glm4_moe_tool_parser): ...@@ -285,6 +289,7 @@ def test_extract_tool_calls_empty_arguments(glm4_moe_tool_parser):
def test_extract_tool_calls_mixed_content(glm4_moe_tool_parser): def test_extract_tool_calls_mixed_content(glm4_moe_tool_parser):
"""Test extraction with mixed content and multiple tool calls.""" """Test extraction with mixed content and multiple tool calls."""
model_output = """I will help you get the weather info. model_output = """I will help you get the weather info.
<tool_call>get_weather <tool_call>get_weather
<arg_key>city</arg_key> <arg_key>city</arg_key>
<arg_value>Beijing</arg_value> <arg_value>Beijing</arg_value>
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from torch_xla._internal import tpu
import vllm
from vllm.lora.request import LoRARequest
# This file contains tests to ensure that LoRA works correctly on the TPU
# backend. We use a series of custom trained adapters for Qwen2.5-3B-Instruct
# for this. The adapters are:
# Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_x_adapter, where x ranges
# from 1 to 4.
# These adapters are trained using a standard huggingface peft training script,
# where all the inputs are "What is 1+1? \n" and all the outputs are "x". We run
# 100 training iterations with a training batch size of 100.
def setup_vllm(num_loras: int, tp: int) -> vllm.LLM:
    """Build the LoRA-enabled Qwen2.5-3B-Instruct engine used by these tests.

    Args:
        num_loras: Forwarded as ``max_loras``.
        tp: Forwarded as ``tensor_parallel_size``.
    """
    engine_options = {
        "model": "Qwen/Qwen2.5-3B-Instruct",
        "max_model_len": 256,
        "max_num_seqs": 8,
        "tensor_parallel_size": tp,
        "enable_lora": True,
        "max_loras": num_loras,
        "max_lora_rank": 8,
    }
    return vllm.LLM(**engine_options)
# tp=1 is always exercised; when more than one chip is reported, a second
# run uses all of them.
TPU_TENSOR_PARALLEL_SIZES = [1]
if tpu.num_available_chips() > 1:
    TPU_TENSOR_PARALLEL_SIZES.append(tpu.num_available_chips())
@pytest.mark.parametrize("tp", TPU_TENSOR_PARALLEL_SIZES)
def test_single_lora(tp: int):
    """
    Run one LoRA adapter on the TPU backend.

    The adapter "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_1_adapter"
    is expected to make Qwen2.5-3B-Instruct answer 1+1 with 1.
    """
    llm = setup_vllm(1, tp)
    adapter = LoRARequest(
        "lora_adapter_1",
        1,
        "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_1_adapter",
    )

    completions = llm.generate(
        "What is 1+1? \n",
        sampling_params=vllm.SamplingParams(max_tokens=256, temperature=0),
        lora_request=adapter,
    )
    reply = completions[0].outputs[0].text

    # Only the first non-whitespace character of the reply is checked.
    leading = reply.strip()[0]
    assert leading.isdigit()
    assert int(leading) == 1
@pytest.mark.parametrize("tp", TPU_TENSOR_PARALLEL_SIZES)
def test_lora_hotswapping(tp: int):
    """
    Run four LoRA adapters one after another with room for a single adapter.

    The engine is created with ``max_loras=1``. Adapter
    "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_x_adapter" is expected to
    make Qwen2.5-3B-Instruct answer 1+1 with x, for x from 1 to 4.
    """
    adapter_repo = "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_{}_adapter"
    adapters = [
        LoRARequest(f"lora_adapter_{x}", x, adapter_repo.format(x))
        for x in range(1, 5)
    ]

    llm = setup_vllm(1, tp)

    # Adapter ids start at 1, so the enumeration index is the expected answer.
    for expected, adapter in enumerate(adapters, start=1):
        completions = llm.generate(
            "What is 1+1? \n",
            sampling_params=vllm.SamplingParams(max_tokens=256, temperature=0),
            lora_request=adapter,
        )
        reply = completions[0].outputs[0].text

        leading = reply.strip()[0]
        assert leading.isdigit()
        assert int(leading) == expected
@pytest.mark.parametrize("tp", TPU_TENSOR_PARALLEL_SIZES)
def test_multi_lora(tp: int):
    """
    Run four LoRA adapters with room to hold all of them at once.

    The engine is created with ``max_loras=4``. Adapter
    "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_x_adapter" is expected to
    make Qwen2.5-3B-Instruct answer 1+1 with x, for x from 1 to 4.
    """
    adapter_repo = "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_{}_adapter"
    adapters = [
        LoRARequest(f"lora_adapter_{x}", x, adapter_repo.format(x))
        for x in range(1, 5)
    ]

    llm = setup_vllm(4, tp)

    # Adapter ids start at 1, so the enumeration index is the expected answer.
    for expected, adapter in enumerate(adapters, start=1):
        completions = llm.generate(
            "What is 1+1? \n",
            sampling_params=vllm.SamplingParams(max_tokens=256, temperature=0),
            lora_request=adapter,
        )
        reply = completions[0].outputs[0].text

        leading = reply.strip()[0]
        assert leading.isdigit()
        assert int(leading) == expected
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import glob
import os
import tempfile
import depyf
def test_tpu_compilation():
    """Generate under ``depyf.prepare_debug`` and inspect the dumped files.

    Generation runs inside the ``prepare_debug`` context pointed at a fresh
    temporary directory. Afterwards the test expects exactly two
    ``__transformed_code*for_forward.py`` files there, and checks that the first
    ``__compiled_fn*Forward_graph*.py`` file does not mention ``kv_cache`` while
    the second mentions both ``kv_cache`` and ``ragged_paged_attention``.
    """
    dump_dir = tempfile.mkdtemp()
    with depyf.prepare_debug(dump_dir):
        from vllm import LLM, SamplingParams

        prompts = [
            "A robot may not injure a human being",
            "It is only with the heart that one can see rightly;",
            "The greatest glory in living lies not in never falling,",
        ]
        expected_prefixes = [
            " or, through inaction",
            " what is essential ",
            " but in rising ",
        ]

        # Currently, top-p sampling is disabled. `top_p` should be 1.0.
        num_samples = 1
        params = SamplingParams(
            temperature=0.7, top_p=1.0, n=num_samples, max_tokens=16
        )

        engine = LLM(
            model="Qwen/Qwen2-1.5B-Instruct",
            max_num_batched_tokens=256,
            max_model_len=256,
            max_num_seqs=32,
            enforce_eager=False,
        )

        results = engine.generate(prompts, params)
        for result, prefix in zip(results, expected_prefixes):
            text = result.outputs[0].text
            print(f"Prompt: {result.prompt!r}, Generated text: {text!r}")
            assert text.startswith(prefix)

    transformed_pattern = os.path.join(dump_dir, "__transformed_code*for_forward.py")
    transformed_files = sorted(glob.glob(transformed_pattern))
    for position, path in enumerate(transformed_files, start=1):
        print("{} file: {}".format(position, path))

    # We should only trigger Dynamo compilation 2 times:
    # 1. Forward pass without kv_caches
    # 2. Forward pass with kv_caches
    assert len(transformed_files) == 2

    def first_number_in(path):
        # Split on both "." and "_" and return the first all-digit piece.
        pieces = path.replace(".", "_").split("_")
        found = [int(piece) for piece in pieces if piece.isdigit()]
        return found[0]

    # The dump files include the captured graph for the forward function of
    # the nn.Module; order them by the first number found in their path.
    graph_pattern = os.path.join(dump_dir, "__compiled_fn*Forward_graph*.py")
    graph_files = sorted(glob.glob(graph_pattern), key=first_number_in)
    for position, path in enumerate(graph_files, start=1):
        print("{} file: {}".format(position, path))

    # The first compilation should not have any kv_caches
    with open(graph_files[0]) as dump:
        first_graph = dump.read()
    assert "kv_cache" not in first_graph

    # The second compilation should have kv_caches and the
    # ragged_paged_attention
    with open(graph_files[1]) as dump:
        second_graph = dump.read()
    assert "kv_cache" in second_graph and "ragged_paged_attention" in second_graph
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import pytest
from vllm.config import CompilationMode
from ..utils import compare_two_settings, models_path_prefix
# --enforce-eager on TPU causes graph compilation, which times out the default
# health check in the MQLLMEngine, so VLLM_RPC_TIMEOUT is raised to "30000"
# (30s per the original note) for the duration of the test.
def test_custom_dispatcher(monkeypatch: pytest.MonkeyPatch):
    """Compare two compilation modes on the same model and server flags."""
    model_path = os.path.join(models_path_prefix, "Qwen/Qwen2.5-1.5B-Instruct")
    # Flags common to both runs; only the -O value differs between them.
    shared_flags = [
        "--max-model-len=256",
        "--max-num-seqs=32",
        "--enforce-eager",
    ]

    with monkeypatch.context() as patched_env:
        patched_env.setenv("VLLM_RPC_TIMEOUT", "30000")
        compare_two_settings(
            model_path,
            arg1=[*shared_flags, f"-O{CompilationMode.DYNAMO_TRACE_ONCE}"],
            arg2=[*shared_flags, f"-O{CompilationMode.STOCK_TORCH_COMPILE}"],
            env1={},
            env2={},
        )
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for the Pallas MOE implementation.
Run `pytest tests/kernels/moe/test_moe_pallas.py`.
"""
import pytest
import torch
import torch_xla
from vllm.model_executor.layers.fused_moe.moe_pallas import fused_moe as pallas_moe
from vllm.model_executor.layers.fused_moe.moe_torch_iterative import (
fused_moe as torch_moe,
)
from vllm.platforms import current_platform
# Skip the whole module at import time unless the current platform is a TPU.
if not current_platform.is_tpu():
    pytest.skip("This test needs a TPU.", allow_module_level=True)
# Parameter grids consumed by the parametrize decorators of test_pallas_moe.
NUM_EXPERTS = [8, 64]  # values for "e"
EP_SIZE = [1]  # values for "ep_size"; the test skips anything above 1
TOP_KS = [2, 6]  # values for "topk"
# The Pallas GMM kernel requires num_tokens * topk to be a multiple of 16
@pytest.mark.parametrize("m", [8, 16, 64, 2048])
@pytest.mark.parametrize("n", [128, 1024, 2048])
@pytest.mark.parametrize("k", [128, 511, 1024])
@pytest.mark.parametrize("e", NUM_EXPERTS)
@pytest.mark.parametrize("topk", TOP_KS)
@pytest.mark.parametrize("ep_size", EP_SIZE)
@pytest.mark.parametrize("dtype", [torch.bfloat16])
def test_pallas_moe(
    m: int,
    n: int,
    k: int,
    e: int,
    topk: int,
    ep_size: int,
    dtype: torch.dtype,
):
    """Compare the Pallas fused MoE against the iterative torch version.

    Random inputs are created on the XLA device: activations of shape
    ``(m, k)``, ``w1`` of shape ``(e, 2 * n, k)``, ``w2`` of shape ``(e, k, n)``
    and gating scores of shape ``(m, e)``. Both implementations receive the
    same keyword arguments and their outputs must agree within ``atol=2e-2``.
    """
    import torch_xla.core.xla_model as xm

    with torch.device(xm.xla_device()):
        activations = torch.randn((m, k), dtype=dtype) / 10
        weights_1 = torch.randn((e, 2 * n, k), dtype=dtype) / 10
        weights_2 = torch.randn((e, k, n), dtype=dtype) / 10
        gate_scores = torch.randn((m, e), dtype=dtype)

        # TODO: Support ep
        if ep_size > 1:
            pytest.skip("No support for ep_size > 1 yet")
        expert_map = None

        # Both implementations get exactly the same keyword arguments.
        moe_kwargs = {
            "hidden_states": activations,
            "w1": weights_1,
            "w2": weights_2,
            "gating_output": gate_scores,
            "topk": topk,
            "global_num_experts": e,
            "expert_map": expert_map,
            "renormalize": False,
        }
        reference_out = torch_moe(**moe_kwargs)
        pallas_out = pallas_moe(**moe_kwargs)
        torch_xla.sync(wait=False)

        # Compare on CPU with an absolute tolerance only.
        torch.testing.assert_close(
            pallas_out.cpu(),
            reference_out.cpu(),
            atol=2e-2,
            rtol=0,
        )
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass
import lm_eval
import pytest
# Key of the task inside the evaluation results.
TASK = "gsm8k"
# Key of the metric read for that task.
FILTER = "exact_match,strict-match"
# Allowed distance between measured and expected score; the test applies it
# as an absolute +/- band around the expected value.
RTOL = 0.03
@dataclass
class GSM8KAccuracyTestConfig:
    """One model to evaluate together with the score it is expected to reach."""

    # Model identifier inserted after "pretrained=" in the model-args string.
    model_name: str
    # Score the measured value is compared against.
    expected_value: float

    def get_model_args(self) -> str:
        """Return the comma-separated model-args string for this model."""
        fields = (
            f"pretrained={self.model_name}",
            "max_model_len=4096",
            "max_num_seqs=32",
        )
        return ",".join(fields)
# NOTE: Accuracy scores measured on GPUs.
ACCURACY_CONFIGS = [
    # no bias
    GSM8KAccuracyTestConfig(
        model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
        expected_value=0.76,
    ),
    # NOTE(rob): We cannot re-initialize vLLM in the same process for TPU,
    # so only one of these tests can run in a single call to pytest. As
    # a follow-up, move this into the LM-EVAL section of the CI.
    # Disabled second entry (bias in QKV layers):
    # GSM8KAccuracyTestConfig(
    #     model_name="neuralmagic/Qwen2-7B-Instruct-quantized.w8a8",
    #     expected_value=0.66),
]
@pytest.mark.parametrize("config", ACCURACY_CONFIGS)
def test_gsm8k_correctness(config: GSM8KAccuracyTestConfig):
    """Evaluate the configured model and compare its score to the expectation.

    The task name passed to ``lm_eval.simple_evaluate`` and the key used to
    read the result both come from ``TASK``, so the two cannot drift apart.
    The measured ``FILTER`` metric must lie strictly within ``RTOL`` (applied
    as an absolute tolerance) of ``config.expected_value``.
    """
    results = lm_eval.simple_evaluate(
        model="vllm",
        model_args=config.get_model_args(),
        tasks=TASK,
        batch_size="auto",
    )

    expected_value = config.expected_value
    measured_value = results["results"][TASK][FILTER]
    assert abs(measured_value - expected_value) < RTOL, (
        f"Expected: {expected_value} | Measured: {measured_value}"
    )
This diff is collapsed.
...@@ -1478,7 +1478,6 @@ def test_kv_connector_handles_preemption(is_async, use_ec_connector, ec_role): ...@@ -1478,7 +1478,6 @@ def test_kv_connector_handles_preemption(is_async, use_ec_connector, ec_role):
assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() == NUM_BLOCKS - 1 assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() == NUM_BLOCKS - 1
def make_output(scheduler: Scheduler): def make_output(scheduler: Scheduler):
return ModelRunnerOutput( return ModelRunnerOutput(
req_ids=[req.request_id for req in scheduler.running], req_ids=[req.request_id for req in scheduler.running],
......
...@@ -112,13 +112,6 @@ def create_vllm_config( ...@@ -112,13 +112,6 @@ def create_vllm_config(
enable_chunked_prefill=enable_chunked_prefill, enable_chunked_prefill=enable_chunked_prefill,
is_encoder_decoder=model_config.is_encoder_decoder, is_encoder_decoder=model_config.is_encoder_decoder,
) )
scheduler_config = SchedulerConfig(
max_num_seqs=max_num_seqs,
max_num_batched_tokens=max_num_batched_tokens,
max_model_len=max_model_len,
enable_chunked_prefill=enable_chunked_prefill,
is_encoder_decoder=model_config.is_encoder_decoder,
)
# Cache config, optionally force APC # Cache config, optionally force APC
cache_config = CacheConfig( cache_config = CacheConfig(
block_size=block_size, block_size=block_size,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment