Commit 78c1f9e5 authored by zhuwenwen's avatar zhuwenwen
Browse files

sync v0.15.1(tests)

parent 86a65417
...@@ -311,7 +311,7 @@ VLM_TEST_SETTINGS = { ...@@ -311,7 +311,7 @@ VLM_TEST_SETTINGS = {
vllm_runner_kwargs={"mm_processor_kwargs": {"crop_to_patches": True}}, vllm_runner_kwargs={"mm_processor_kwargs": {"crop_to_patches": True}},
), ),
"aya_vision-multi_image": VLMTestInfo( "aya_vision-multi_image": VLMTestInfo(
models=["CohereLabs/aya-vision-8b"], models=[os.path.join(models_path_prefix, "CohereLabs/aya-vision-8b")],
test_type=(VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts( single_image_prompts=IMAGE_ASSETS.prompts(
...@@ -328,7 +328,7 @@ VLM_TEST_SETTINGS = { ...@@ -328,7 +328,7 @@ VLM_TEST_SETTINGS = {
marks=[large_gpu_mark(min_gb=32)], marks=[large_gpu_mark(min_gb=32)],
), ),
"blip2": VLMTestInfo( "blip2": VLMTestInfo(
models=[os.path.join(models_path_prefix,"Salesforce/blip2-opt-2.7b")], models=[os.path.join(models_path_prefix, "Salesforce/blip2-opt-2.7b")],
test_type=VLMTestType.IMAGE, test_type=VLMTestType.IMAGE,
prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:", prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:",
img_idx_to_prompt=lambda idx: "", img_idx_to_prompt=lambda idx: "",
...@@ -352,7 +352,7 @@ VLM_TEST_SETTINGS = { ...@@ -352,7 +352,7 @@ VLM_TEST_SETTINGS = {
dtype="bfloat16", dtype="bfloat16",
), ),
"deepseek_vl_v2": VLMTestInfo( "deepseek_vl_v2": VLMTestInfo(
models=["Isotr0py/deepseek-vl2-tiny"], # model repo using dynamic module models=[os.path.join(models_path_prefix, "Isotr0py/deepseek-vl2-tiny")], # model repo using dynamic module
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: ", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>: ", # noqa: E501
max_model_len=4096, max_model_len=4096,
...@@ -401,7 +401,7 @@ VLM_TEST_SETTINGS = { ...@@ -401,7 +401,7 @@ VLM_TEST_SETTINGS = {
patch_hf_runner=model_utils.gemma3_patch_hf_runner, patch_hf_runner=model_utils.gemma3_patch_hf_runner,
), ),
"granite_vision": VLMTestInfo( "granite_vision": VLMTestInfo(
models=["ibm-granite/granite-vision-3.3-2b"], models=[os.path.join(models_path_prefix, "ibm-granite/granite-vision-3.3-2b")],
test_type=(VLMTestType.IMAGE), test_type=(VLMTestType.IMAGE),
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}\n<|assistant|>\n", prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}\n<|assistant|>\n",
max_model_len=8192, max_model_len=8192,
...@@ -445,7 +445,7 @@ VLM_TEST_SETTINGS = { ...@@ -445,7 +445,7 @@ VLM_TEST_SETTINGS = {
marks=[large_gpu_mark(min_gb=32)], marks=[large_gpu_mark(min_gb=32)],
), ),
"glm4_1v-video": VLMTestInfo( "glm4_1v-video": VLMTestInfo(
models=["zai-org/GLM-4.1V-9B-Thinking"], models=[os.path.join(models_path_prefix, "zai-org/GLM-4.1V-9B-Thinking")],
# GLM4.1V require include video metadata for input # GLM4.1V require include video metadata for input
test_type=VLMTestType.CUSTOM_INPUTS, test_type=VLMTestType.CUSTOM_INPUTS,
prompt_formatter=lambda vid_prompt: f"[gMASK]<|user|>\n{vid_prompt}<|assistant|>\n", # noqa: E501 prompt_formatter=lambda vid_prompt: f"[gMASK]<|user|>\n{vid_prompt}<|assistant|>\n", # noqa: E501
...@@ -461,20 +461,6 @@ VLM_TEST_SETTINGS = { ...@@ -461,20 +461,6 @@ VLM_TEST_SETTINGS = {
], ],
marks=[large_gpu_mark(min_gb=32)], marks=[large_gpu_mark(min_gb=32)],
), ),
"glm_ocr": VLMTestInfo(
models=["zai-org/GLM-OCR"],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"[gMASK]<|user|>\n{img_prompt}<|assistant|>\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>",
video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>",
max_model_len=2048,
max_num_seqs=2,
get_stop_token_ids=lambda tok: [151329, 151336, 151338],
num_logprobs=10,
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
auto_cls=AutoModelForImageTextToText,
marks=[large_gpu_mark(min_gb=32)],
),
"h2ovl": VLMTestInfo( "h2ovl": VLMTestInfo(
models=[ models=[
os.path.join(models_path_prefix,"h2oai/h2ovl-mississippi-800m"), os.path.join(models_path_prefix,"h2oai/h2ovl-mississippi-800m"),
...@@ -526,7 +512,7 @@ VLM_TEST_SETTINGS = { ...@@ -526,7 +512,7 @@ VLM_TEST_SETTINGS = {
), ),
"intern_vl-video": VLMTestInfo( "intern_vl-video": VLMTestInfo(
models=[ models=[
"OpenGVLab/InternVL3-1B", os.path.join(models_path_prefix, "OpenGVLab/InternVL3-1B"),
], ],
test_type=VLMTestType.VIDEO, test_type=VLMTestType.VIDEO,
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n", # noqa: E501
...@@ -537,7 +523,7 @@ VLM_TEST_SETTINGS = { ...@@ -537,7 +523,7 @@ VLM_TEST_SETTINGS = {
num_logprobs=10 if current_platform.is_rocm() else 5, num_logprobs=10 if current_platform.is_rocm() else 5,
), ),
"intern_vl-hf": VLMTestInfo( "intern_vl-hf": VLMTestInfo(
models=["OpenGVLab/InternVL3-1B-hf"], models=[os.path.join(models_path_prefix, "OpenGVLab/InternVL3-1B-hf")],
test_type=( test_type=(
VLMTestType.IMAGE, VLMTestType.IMAGE,
VLMTestType.MULTI_IMAGE, VLMTestType.MULTI_IMAGE,
...@@ -552,8 +538,8 @@ VLM_TEST_SETTINGS = { ...@@ -552,8 +538,8 @@ VLM_TEST_SETTINGS = {
), ),
"isaac": VLMTestInfo( "isaac": VLMTestInfo(
models=[ models=[
"PerceptronAI/Isaac-0.1", os.path.join(models_path_prefix, "PerceptronAI/Isaac-0.1"),
"PerceptronAI/Isaac-0.2-2B-Preview", os.path.join(models_path_prefix, "PerceptronAI/Isaac-0.2-2B-Preview"),
], ],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: ( prompt_formatter=lambda img_prompt: (
...@@ -579,7 +565,7 @@ VLM_TEST_SETTINGS = { ...@@ -579,7 +565,7 @@ VLM_TEST_SETTINGS = {
image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)], image_size_factors=[(0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
), ),
"kimi_vl": VLMTestInfo( "kimi_vl": VLMTestInfo(
models=["moonshotai/Kimi-VL-A3B-Instruct"], models=[os.path.join(models_path_prefix, "moonshotai/Kimi-VL-A3B-Instruct")],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|im_user|>user<|im_middle|>{img_prompt}<|im_end|><|im_assistant|>assistant<|im_middle|>", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|im_user|>user<|im_middle|>{img_prompt}<|im_end|><|im_assistant|>assistant<|im_middle|>", # noqa: E501
img_idx_to_prompt=lambda _: "<|media_start|>image<|media_content|><|media_pad|><|media_end|>", # noqa: E501 img_idx_to_prompt=lambda _: "<|media_start|>image<|media_content|><|media_pad|><|media_end|>", # noqa: E501
...@@ -590,21 +576,6 @@ VLM_TEST_SETTINGS = { ...@@ -590,21 +576,6 @@ VLM_TEST_SETTINGS = {
vllm_output_post_proc=model_utils.kimiv_vl_vllm_to_hf_output, vllm_output_post_proc=model_utils.kimiv_vl_vllm_to_hf_output,
marks=[large_gpu_mark(min_gb=48)], marks=[large_gpu_mark(min_gb=48)],
), ),
"llama4": VLMTestInfo(
models=["meta-llama/Llama-4-Scout-17B-16E-Instruct"],
prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|header_start|>user<|header_end|>\n\n{img_prompt}<|eot|><|header_start|>assistant<|header_end|>\n\n", # noqa: E501
img_idx_to_prompt=lambda _: "<|image|>",
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
distributed_executor_backend="mp",
image_size_factors=[(0.25, 0.5, 1.0)],
hf_model_kwargs={"device_map": "auto"},
max_model_len=8192,
max_num_seqs=4,
dtype="bfloat16",
auto_cls=AutoModelForImageTextToText,
tensor_parallel_size=4,
marks=multi_gpu_marks(num_gpus=4),
),
"llava_next": VLMTestInfo( "llava_next": VLMTestInfo(
models=[os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf")], models=[os.path.join(models_path_prefix, "llava-hf/llava-v1.6-mistral-7b-hf")],
test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS), test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
...@@ -675,7 +646,7 @@ VLM_TEST_SETTINGS = { ...@@ -675,7 +646,7 @@ VLM_TEST_SETTINGS = {
marks=[pytest.mark.skip("HF import fails")], marks=[pytest.mark.skip("HF import fails")],
), ),
"minicpmo_26": VLMTestInfo( "minicpmo_26": VLMTestInfo(
models=["openbmb/MiniCPM-o-2_6"], models=[os.path.join(models_path_prefix, "openbmb/MiniCPM-o-2_6")],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", # noqa: E501
img_idx_to_prompt=lambda idx: "(<image>./</image>)\n", img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
...@@ -703,7 +674,7 @@ VLM_TEST_SETTINGS = { ...@@ -703,7 +674,7 @@ VLM_TEST_SETTINGS = {
patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner, patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
), ),
"minimax_vl_01": VLMTestInfo( "minimax_vl_01": VLMTestInfo(
models=["MiniMaxAI/MiniMax-VL-01"], models=[os.path.join(models_path_prefix, "MiniMaxAI/MiniMax-VL-01")],
prompt_formatter=lambda img_prompt: f"<beginning_of_sentence>user: {img_prompt} assistant:<end_of_sentence>", # noqa: E501 prompt_formatter=lambda img_prompt: f"<beginning_of_sentence>user: {img_prompt} assistant:<end_of_sentence>", # noqa: E501
img_idx_to_prompt=lambda _: "<image>", img_idx_to_prompt=lambda _: "<image>",
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
...@@ -726,7 +697,7 @@ VLM_TEST_SETTINGS = { ...@@ -726,7 +697,7 @@ VLM_TEST_SETTINGS = {
], ],
), ),
"molmo": VLMTestInfo( "molmo": VLMTestInfo(
models=["allenai/Molmo-7B-D-0924"], models=[os.path.join(models_path_prefix, "allenai/Molmo-7B-D-0924")],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=identity, prompt_formatter=identity,
max_model_len=4096, max_model_len=4096,
...@@ -734,7 +705,7 @@ VLM_TEST_SETTINGS = { ...@@ -734,7 +705,7 @@ VLM_TEST_SETTINGS = {
patch_hf_runner=model_utils.molmo_patch_hf_runner, patch_hf_runner=model_utils.molmo_patch_hf_runner,
), ),
"ovis1_6-gemma2": VLMTestInfo( "ovis1_6-gemma2": VLMTestInfo(
models=["AIDC-AI/Ovis1.6-Gemma2-9B"], models=[os.path.join(models_path_prefix, "AIDC-AI/Ovis1.6-Gemma2-9B")],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<image>\n", img_idx_to_prompt=lambda idx: "<image>\n",
...@@ -747,7 +718,7 @@ VLM_TEST_SETTINGS = { ...@@ -747,7 +718,7 @@ VLM_TEST_SETTINGS = {
marks=[large_gpu_mark(min_gb=32)], marks=[large_gpu_mark(min_gb=32)],
), ),
"ovis2": VLMTestInfo( "ovis2": VLMTestInfo(
models=["AIDC-AI/Ovis2-1B"], models=[os.path.join(models_path_prefix, "AIDC-AI/Ovis2-1B")],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<image>\n", img_idx_to_prompt=lambda idx: "<image>\n",
...@@ -759,7 +730,7 @@ VLM_TEST_SETTINGS = { ...@@ -759,7 +730,7 @@ VLM_TEST_SETTINGS = {
patch_hf_runner=model_utils.ovis_patch_hf_runner, patch_hf_runner=model_utils.ovis_patch_hf_runner,
), ),
"ovis2_5": VLMTestInfo( "ovis2_5": VLMTestInfo(
models=["AIDC-AI/Ovis2.5-2B"], models=[os.path.join(models_path_prefix, "AIDC-AI/Ovis2.5-2B")],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<image>\n", img_idx_to_prompt=lambda idx: "<image>\n",
...@@ -772,7 +743,7 @@ VLM_TEST_SETTINGS = { ...@@ -772,7 +743,7 @@ VLM_TEST_SETTINGS = {
hf_model_kwargs={"revision": "refs/pr/5"}, hf_model_kwargs={"revision": "refs/pr/5"},
), ),
"paddleocr_vl": VLMTestInfo( "paddleocr_vl": VLMTestInfo(
models=["PaddlePaddle/PaddleOCR-VL"], models=[os.path.join(models_path_prefix, "PaddlePaddle/PaddleOCR-VL")],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:", prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
img_idx_to_prompt=lambda idx: ( img_idx_to_prompt=lambda idx: (
...@@ -795,7 +766,7 @@ VLM_TEST_SETTINGS = { ...@@ -795,7 +766,7 @@ VLM_TEST_SETTINGS = {
], ],
), ),
"phi3v": VLMTestInfo( "phi3v": VLMTestInfo(
models=["microsoft/Phi-3.5-vision-instruct"], models=[os.path.join(models_path_prefix, "microsoft/Phi-3.5-vision-instruct")],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|end|>\n<|assistant|>\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|end|>\n<|assistant|>\n", # noqa: E501
img_idx_to_prompt=lambda idx: f"<|image_{idx}|>\n", img_idx_to_prompt=lambda idx: f"<|image_{idx}|>\n",
...@@ -835,7 +806,7 @@ VLM_TEST_SETTINGS = { ...@@ -835,7 +806,7 @@ VLM_TEST_SETTINGS = {
prompt_path_encoder=model_utils.qwen_prompt_path_encoder, prompt_path_encoder=model_utils.qwen_prompt_path_encoder,
), ),
"qwen2_vl": VLMTestInfo( "qwen2_vl": VLMTestInfo(
models=["Qwen/Qwen2-VL-2B-Instruct"], models=[os.path.join(models_path_prefix, "Qwen/Qwen2-VL-2B-Instruct")],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501
img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>", img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",
...@@ -849,7 +820,7 @@ VLM_TEST_SETTINGS = { ...@@ -849,7 +820,7 @@ VLM_TEST_SETTINGS = {
marks=[pytest.mark.cpu_model], marks=[pytest.mark.cpu_model],
), ),
"skywork_r1v": VLMTestInfo( "skywork_r1v": VLMTestInfo(
models=["Skywork/Skywork-R1V-38B"], models=[os.path.join(models_path_prefix, "Skywork/Skywork-R1V-38B")],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|begin▁of▁sentence|><|User|>\n{img_prompt}<|Assistant|><think>\n", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|begin▁of▁sentence|><|User|>\n{img_prompt}<|Assistant|><think>\n", # noqa: E501
single_image_prompts=IMAGE_ASSETS.prompts( single_image_prompts=IMAGE_ASSETS.prompts(
...@@ -865,7 +836,7 @@ VLM_TEST_SETTINGS = { ...@@ -865,7 +836,7 @@ VLM_TEST_SETTINGS = {
marks=[large_gpu_mark(min_gb=80)], marks=[large_gpu_mark(min_gb=80)],
), ),
"smolvlm": VLMTestInfo( "smolvlm": VLMTestInfo(
models=["HuggingFaceTB/SmolVLM2-2.2B-Instruct"], models=[os.path.join(models_path_prefix, "HuggingFaceTB/SmolVLM2-2.2B-Instruct")],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"<|im_start|>User:{img_prompt}<end_of_utterance>\nAssistant:", # noqa: E501 prompt_formatter=lambda img_prompt: f"<|im_start|>User:{img_prompt}<end_of_utterance>\nAssistant:", # noqa: E501
img_idx_to_prompt=lambda idx: "<image>", img_idx_to_prompt=lambda idx: "<image>",
...@@ -876,7 +847,7 @@ VLM_TEST_SETTINGS = { ...@@ -876,7 +847,7 @@ VLM_TEST_SETTINGS = {
num_logprobs=10, num_logprobs=10,
), ),
"tarsier": VLMTestInfo( "tarsier": VLMTestInfo(
models=["omni-research/Tarsier-7b"], models=[os.path.join(models_path_prefix, "omni-research/Tarsier-7b")],
test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE), test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
prompt_formatter=lambda img_prompt: f"USER: {img_prompt} ASSISTANT:", prompt_formatter=lambda img_prompt: f"USER: {img_prompt} ASSISTANT:",
max_model_len=4096, max_model_len=4096,
...@@ -885,7 +856,7 @@ VLM_TEST_SETTINGS = { ...@@ -885,7 +856,7 @@ VLM_TEST_SETTINGS = {
patch_hf_runner=model_utils.tarsier_patch_hf_runner, patch_hf_runner=model_utils.tarsier_patch_hf_runner,
), ),
"tarsier2": VLMTestInfo( "tarsier2": VLMTestInfo(
models=["omni-research/Tarsier2-Recap-7b"], models=[os.path.join(models_path_prefix, "omni-research/Tarsier2-Recap-7b")],
test_type=( test_type=(
VLMTestType.IMAGE, VLMTestType.IMAGE,
VLMTestType.MULTI_IMAGE, VLMTestType.MULTI_IMAGE,
...@@ -953,7 +924,7 @@ VLM_TEST_SETTINGS = { ...@@ -953,7 +924,7 @@ VLM_TEST_SETTINGS = {
max_num_seqs=2, max_num_seqs=2,
auto_cls=AutoModelForImageTextToText, auto_cls=AutoModelForImageTextToText,
hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs( hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
"llava-hf/llava-onevision-qwen2-0.5b-ov-hf" os.path.join(models_path_prefix, "llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
), ),
vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output, vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
custom_test_opts=[ custom_test_opts=[
...@@ -973,7 +944,7 @@ VLM_TEST_SETTINGS = { ...@@ -973,7 +944,7 @@ VLM_TEST_SETTINGS = {
), ),
# regression test for https://github.com/vllm-project/vllm/issues/15122 # regression test for https://github.com/vllm-project/vllm/issues/15122
"qwen2_5_vl-windows-attention": VLMTestInfo( "qwen2_5_vl-windows-attention": VLMTestInfo(
models=["Qwen/Qwen2.5-VL-3B-Instruct"], models=[os.path.join(models_path_prefix, "Qwen/Qwen2.5-VL-3B-Instruct")],
test_type=VLMTestType.CUSTOM_INPUTS, test_type=VLMTestType.CUSTOM_INPUTS,
max_model_len=4096, max_model_len=4096,
max_num_seqs=2, max_num_seqs=2,
......
...@@ -7,6 +7,7 @@ This test validates that each multimodal model can successfully generate outputs ...@@ -7,6 +7,7 @@ This test validates that each multimodal model can successfully generate outputs
using different ViT attention backends. Tests are parametrized by model and backend. using different ViT attention backends. Tests are parametrized by model and backend.
""" """
import os
from dataclasses import asdict from dataclasses import asdict
from typing import Any from typing import Any
...@@ -19,7 +20,7 @@ from vllm.multimodal.video import sample_frames_from_video ...@@ -19,7 +20,7 @@ from vllm.multimodal.video import sample_frames_from_video
from vllm.platforms import current_platform from vllm.platforms import current_platform
from vllm.v1.attention.backends.registry import AttentionBackendEnum from vllm.v1.attention.backends.registry import AttentionBackendEnum
from ....utils import create_new_process_for_each_test from ....utils import create_new_process_for_each_test, models_path_prefix
from ...utils import dummy_hf_overrides from ...utils import dummy_hf_overrides
# Dots.OCR prompt from official repository # Dots.OCR prompt from official repository
...@@ -50,7 +51,7 @@ VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>" ...@@ -50,7 +51,7 @@ VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>"
# Model configurations # Model configurations
MODEL_CONFIGS: dict[str, dict[str, Any]] = { MODEL_CONFIGS: dict[str, dict[str, Any]] = {
"dots_ocr": { "dots_ocr": {
"model_name": "rednote-hilab/dots.ocr", "model_name": os.path.join(models_path_prefix, "rednote-hilab/dots.ocr"),
"interface": "llm_chat", "interface": "llm_chat",
"max_model_len": 32768, "max_model_len": 32768,
"max_num_seqs": 1, "max_num_seqs": 1,
...@@ -66,7 +67,7 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = { ...@@ -66,7 +67,7 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = {
"output_validator": lambda x: len(x) > 10 and "stop" in x.lower(), "output_validator": lambda x: len(x) > 10 and "stop" in x.lower(),
}, },
"ernie45_vl": { "ernie45_vl": {
"model_name": "baidu/ERNIE-4.5-VL-28B-A3B-PT", "model_name": os.path.join(models_path_prefix, "baidu/ERNIE-4.5-VL-28B-A3B-PT"),
"interface": "llm_generate", "interface": "llm_generate",
"max_model_len": 16384, "max_model_len": 16384,
"max_num_seqs": 2, "max_num_seqs": 2,
...@@ -79,7 +80,7 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = { ...@@ -79,7 +80,7 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = {
"question": "What is the content of each image?", "question": "What is the content of each image?",
}, },
"glm4_1v": { "glm4_1v": {
"model_name": "zai-org/GLM-4.1V-9B-Thinking", "model_name": os.path.join(models_path_prefix, "zai-org/GLM-4.1V-9B-Thinking"),
"interface": "llm_generate", "interface": "llm_generate",
"max_model_len": 32768, "max_model_len": 32768,
"max_num_seqs": 2, "max_num_seqs": 2,
...@@ -91,21 +92,8 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = { ...@@ -91,21 +92,8 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = {
"use_processor": True, "use_processor": True,
"question": "What is the content of each image?", "question": "What is the content of each image?",
}, },
"glm_ocr": {
"model_name": "zai-org/GLM-OCR",
"interface": "llm_generate",
"max_model_len": 131072,
"max_num_seqs": 2,
"sampling_params": {
"temperature": 0.0,
"max_tokens": 256,
"stop_token_ids": None,
},
"use_processor": True,
"question": "Text Recognition:",
},
"keye_vl": { "keye_vl": {
"model_name": "Kwai-Keye/Keye-VL-8B-Preview", "model_name": os.path.join(models_path_prefix, "Kwai-Keye/Keye-VL-8B-Preview"),
"interface": "llm_generate", "interface": "llm_generate",
"max_model_len": 8192, "max_model_len": 8192,
"max_num_seqs": 5, "max_num_seqs": 5,
...@@ -122,7 +110,7 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = { ...@@ -122,7 +110,7 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = {
"question": "What is the content of each image?", "question": "What is the content of each image?",
}, },
"ovis2_5": { "ovis2_5": {
"model_name": "AIDC-AI/Ovis2.5-2B", "model_name": os.path.join(models_path_prefix, "AIDC-AI/Ovis2.5-2B"),
"interface": "llm_generate", "interface": "llm_generate",
"max_model_len": 8192, "max_model_len": 8192,
"max_num_seqs": 2, "max_num_seqs": 2,
...@@ -135,7 +123,7 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = { ...@@ -135,7 +123,7 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = {
"question": "What is the content of each image?", "question": "What is the content of each image?",
}, },
"qwen2_5_vl": { "qwen2_5_vl": {
"model_name": "Qwen/Qwen2.5-VL-3B-Instruct", "model_name": os.path.join(models_path_prefix, "Qwen/Qwen2.5-VL-3B-Instruct"),
"interface": "vllm_runner", "interface": "vllm_runner",
"media_type": "video", "media_type": "video",
"max_model_len": 4000, "max_model_len": 4000,
...@@ -154,7 +142,7 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = { ...@@ -154,7 +142,7 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = {
}, },
}, },
"qwen2_5_omni": { "qwen2_5_omni": {
"model_name": "Qwen/Qwen2.5-Omni-3B", "model_name": os.path.join(models_path_prefix, "Qwen/Qwen2.5-Omni-3B"),
"interface": "llm_generate", "interface": "llm_generate",
"max_model_len": 32768, "max_model_len": 32768,
"max_num_seqs": 2, "max_num_seqs": 2,
...@@ -169,7 +157,7 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = { ...@@ -169,7 +157,7 @@ MODEL_CONFIGS: dict[str, dict[str, Any]] = {
"question": "What is the content of each image?", "question": "What is the content of each image?",
}, },
"qwen3_omni": { "qwen3_omni": {
"model_name": "Qwen/Qwen3-Omni-30B-A3B-Instruct", "model_name": os.path.join(models_path_prefix, "Qwen/Qwen3-Omni-30B-A3B-Instruct"),
"interface": "llm_generate", "interface": "llm_generate",
"max_model_len": 32768, "max_model_len": 32768,
"max_num_seqs": 2, "max_num_seqs": 2,
......
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import pytest
from huggingface_hub.constants import HF_HUB_CACHE
from vllm.plugins.lora_resolvers.hf_hub_resolver import HfHubResolver
LORA_LIB_MODEL_NAME = "ibm-granite/granite-3.3-8b-instruct"
# Repo with multiple LoRAs contained in it
LORA_LIB = "ibm-granite/granite-3.3-8b-rag-agent-lib"
LORA_NAME = "ibm-granite/granite-3.3-8b-rag-agent-lib/answerability_prediction_lora" # noqa: E501
NON_LORA_SUBPATH = "ibm-granite/granite-3.3-8b-rag-agent-lib/README.md"
LIB_DOWNLOAD_DIR = os.path.join(
HF_HUB_CACHE, "models--ibm-granite--granite-3.3-8b-rag-agent-lib"
)
INVALID_REPO_NAME = "thisrepodoesnotexist"
# Repo with only one LoRA in the root dir
LORA_REPO_MODEL_NAME = "meta-llama/Llama-2-7b-hf"
LORA_REPO = "yard1/llama-2-7b-sql-lora-test"
REPO_DOWNLOAD_DIR = os.path.join(
HF_HUB_CACHE, "models--yard1--llama-2-7b-sql-lora-test"
)
@pytest.mark.asyncio
async def test_hf_resolver_with_direct_path():
hf_resolver = HfHubResolver([LORA_REPO])
assert hf_resolver is not None
lora_request = await hf_resolver.resolve_lora(LORA_REPO_MODEL_NAME, LORA_REPO)
assert lora_request.lora_name == LORA_REPO
assert REPO_DOWNLOAD_DIR in lora_request.lora_path
assert "adapter_config.json" in os.listdir(lora_request.lora_path)
@pytest.mark.asyncio
async def test_hf_resolver_with_nested_paths():
hf_resolver = HfHubResolver([LORA_LIB])
assert hf_resolver is not None
lora_request = await hf_resolver.resolve_lora(LORA_LIB_MODEL_NAME, LORA_NAME)
assert lora_request is not None
assert lora_request.lora_name == LORA_NAME
assert LIB_DOWNLOAD_DIR in lora_request.lora_path
assert "adapter_config.json" in os.listdir(lora_request.lora_path)
@pytest.mark.asyncio
async def test_hf_resolver_with_multiple_repos():
hf_resolver = HfHubResolver([LORA_LIB, LORA_REPO])
assert hf_resolver is not None
lora_request = await hf_resolver.resolve_lora(LORA_LIB_MODEL_NAME, LORA_NAME)
assert lora_request is not None
assert lora_request.lora_name == LORA_NAME
assert LIB_DOWNLOAD_DIR in lora_request.lora_path
assert "adapter_config.json" in os.listdir(lora_request.lora_path)
@pytest.mark.asyncio
async def test_missing_adapter():
hf_resolver = HfHubResolver([LORA_LIB])
assert hf_resolver is not None
missing_lora_request = await hf_resolver.resolve_lora(LORA_LIB_MODEL_NAME, "foobar")
assert missing_lora_request is None
@pytest.mark.asyncio
async def test_nonlora_adapter():
hf_resolver = HfHubResolver([LORA_LIB])
assert hf_resolver is not None
readme_request = await hf_resolver.resolve_lora(
LORA_LIB_MODEL_NAME, NON_LORA_SUBPATH
)
assert readme_request is None
@pytest.mark.asyncio
async def test_invalid_repo():
hf_resolver = HfHubResolver([LORA_LIB])
assert hf_resolver is not None
invalid_repo_req = await hf_resolver.resolve_lora(
INVALID_REPO_NAME,
f"{INVALID_REPO_NAME}/foo",
)
assert invalid_repo_req is None
@pytest.mark.asyncio
async def test_trailing_slash():
hf_resolver = HfHubResolver([LORA_LIB])
assert hf_resolver is not None
lora_request = await hf_resolver.resolve_lora(
LORA_LIB_MODEL_NAME,
f"{LORA_NAME}/",
)
assert lora_request is not None
assert lora_request.lora_name == f"{LORA_NAME}/"
assert LIB_DOWNLOAD_DIR in lora_request.lora_path
assert "adapter_config.json" in os.listdir(lora_request.lora_path)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import os
from vllm import SamplingParams
from ..utils import models_path_prefix
MODELS = [os.path.join(models_path_prefix, "distilbert/distilgpt2")]
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["half"])
def test_ranks(
vllm_runner,
model,
dtype,
example_prompts,
):
max_tokens = 5
num_top_logprobs = 5
num_prompt_logprobs = 5
with vllm_runner(model, dtype=dtype,
max_logprobs=num_top_logprobs) as vllm_model:
## Test greedy logprobs ranks
vllm_sampling_params = SamplingParams(
temperature=0.0,
top_p=1.0,
max_tokens=max_tokens,
logprobs=num_top_logprobs,
prompt_logprobs=num_prompt_logprobs)
vllm_results = vllm_model.generate_w_logprobs(example_prompts,
vllm_sampling_params)
## Test non-greedy logprobs ranks
sampling_params = SamplingParams(temperature=1.0,
top_p=1.0,
max_tokens=max_tokens,
logprobs=num_top_logprobs,
prompt_logprobs=num_prompt_logprobs)
res = vllm_model.generate_w_logprobs(example_prompts, sampling_params)
for result in vllm_results:
assert result[2] is not None
assert len(result[2]) == len(result[0])
# check whether all chosen tokens have ranks = 1
for token, logprobs in zip(result[0], result[2]):
assert token in logprobs
assert logprobs[token].rank == 1
for result in res:
assert result[2] is not None
assert len(result[2]) == len(result[0])
# check whether all chosen tokens have ranks
for token, logprobs in zip(result[0], result[2]):
assert logprobs[token].rank >= 1
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Tests for the UvicornAccessLogFilter class.
"""
import logging
from vllm.logging_utils.access_log_filter import (
UvicornAccessLogFilter,
create_uvicorn_log_config,
)
class TestUvicornAccessLogFilter:
"""Test cases for UvicornAccessLogFilter."""
def test_filter_allows_all_when_no_excluded_paths(self):
"""Filter should allow all logs when no paths are excluded."""
filter = UvicornAccessLogFilter(excluded_paths=[])
record = logging.LogRecord(
name="uvicorn.access",
level=logging.INFO,
pathname="",
lineno=0,
msg='%s - "%s %s HTTP/%s" %d',
args=("127.0.0.1:12345", "GET", "/v1/completions", "1.1", 200),
exc_info=None,
)
assert filter.filter(record) is True
def test_filter_allows_all_when_excluded_paths_is_none(self):
"""Filter should allow all logs when excluded_paths is None."""
filter = UvicornAccessLogFilter(excluded_paths=None)
record = logging.LogRecord(
name="uvicorn.access",
level=logging.INFO,
pathname="",
lineno=0,
msg='%s - "%s %s HTTP/%s" %d',
args=("127.0.0.1:12345", "GET", "/health", "1.1", 200),
exc_info=None,
)
assert filter.filter(record) is True
def test_filter_excludes_health_endpoint(self):
"""Filter should exclude /health endpoint when configured."""
filter = UvicornAccessLogFilter(excluded_paths=["/health"])
record = logging.LogRecord(
name="uvicorn.access",
level=logging.INFO,
pathname="",
lineno=0,
msg='%s - "%s %s HTTP/%s" %d',
args=("127.0.0.1:12345", "GET", "/health", "1.1", 200),
exc_info=None,
)
assert filter.filter(record) is False
def test_filter_excludes_metrics_endpoint(self):
"""Filter should exclude /metrics endpoint when configured."""
filter = UvicornAccessLogFilter(excluded_paths=["/metrics"])
record = logging.LogRecord(
name="uvicorn.access",
level=logging.INFO,
pathname="",
lineno=0,
msg='%s - "%s %s HTTP/%s" %d',
args=("127.0.0.1:12345", "GET", "/metrics", "1.1", 200),
exc_info=None,
)
assert filter.filter(record) is False
def test_filter_allows_non_excluded_endpoints(self):
"""Filter should allow endpoints not in the excluded list."""
filter = UvicornAccessLogFilter(excluded_paths=["/health", "/metrics"])
record = logging.LogRecord(
name="uvicorn.access",
level=logging.INFO,
pathname="",
lineno=0,
msg='%s - "%s %s HTTP/%s" %d',
args=("127.0.0.1:12345", "POST", "/v1/completions", "1.1", 200),
exc_info=None,
)
assert filter.filter(record) is True
def test_filter_excludes_multiple_endpoints(self):
"""Filter should exclude multiple configured endpoints."""
filter = UvicornAccessLogFilter(excluded_paths=["/health", "/metrics", "/ping"])
# Test /health
record_health = logging.LogRecord(
name="uvicorn.access",
level=logging.INFO,
pathname="",
lineno=0,
msg='%s - "%s %s HTTP/%s" %d',
args=("127.0.0.1:12345", "GET", "/health", "1.1", 200),
exc_info=None,
)
assert filter.filter(record_health) is False
# Test /metrics
record_metrics = logging.LogRecord(
name="uvicorn.access",
level=logging.INFO,
pathname="",
lineno=0,
msg='%s - "%s %s HTTP/%s" %d',
args=("127.0.0.1:12345", "GET", "/metrics", "1.1", 200),
exc_info=None,
)
assert filter.filter(record_metrics) is False
# Test /ping
record_ping = logging.LogRecord(
name="uvicorn.access",
level=logging.INFO,
pathname="",
lineno=0,
msg='%s - "%s %s HTTP/%s" %d',
args=("127.0.0.1:12345", "GET", "/ping", "1.1", 200),
exc_info=None,
)
assert filter.filter(record_ping) is False
def test_filter_with_query_parameters(self):
"""Filter should exclude endpoints even with query parameters."""
filter = UvicornAccessLogFilter(excluded_paths=["/health"])
record = logging.LogRecord(
name="uvicorn.access",
level=logging.INFO,
pathname="",
lineno=0,
msg='%s - "%s %s HTTP/%s" %d',
args=("127.0.0.1:12345", "GET", "/health?verbose=true", "1.1", 200),
exc_info=None,
)
assert filter.filter(record) is False
def test_filter_different_http_methods(self):
"""Filter should exclude endpoints regardless of HTTP method."""
filter = UvicornAccessLogFilter(excluded_paths=["/ping"])
# Test GET
record_get = logging.LogRecord(
name="uvicorn.access",
level=logging.INFO,
pathname="",
lineno=0,
msg='%s - "%s %s HTTP/%s" %d',
args=("127.0.0.1:12345", "GET", "/ping", "1.1", 200),
exc_info=None,
)
assert filter.filter(record_get) is False
# Test POST
record_post = logging.LogRecord(
name="uvicorn.access",
level=logging.INFO,
pathname="",
lineno=0,
msg='%s - "%s %s HTTP/%s" %d',
args=("127.0.0.1:12345", "POST", "/ping", "1.1", 200),
exc_info=None,
)
assert filter.filter(record_post) is False
def test_filter_with_different_status_codes(self):
"""Filter should exclude endpoints regardless of status code."""
filter = UvicornAccessLogFilter(excluded_paths=["/health"])
for status_code in [200, 500, 503]:
record = logging.LogRecord(
name="uvicorn.access",
level=logging.INFO,
pathname="",
lineno=0,
msg='%s - "%s %s HTTP/%s" %d',
args=("127.0.0.1:12345", "GET", "/health", "1.1", status_code),
exc_info=None,
)
assert filter.filter(record) is False
class TestCreateUvicornLogConfig:
"""Test cases for create_uvicorn_log_config function."""
def test_creates_valid_config_structure(self):
"""Config should have required logging configuration keys."""
config = create_uvicorn_log_config(excluded_paths=["/health"])
assert "version" in config
assert config["version"] == 1
assert "disable_existing_loggers" in config
assert "formatters" in config
assert "handlers" in config
assert "loggers" in config
assert "filters" in config
def test_config_includes_access_log_filter(self):
"""Config should include the access log filter."""
config = create_uvicorn_log_config(excluded_paths=["/health", "/metrics"])
assert "access_log_filter" in config["filters"]
filter_config = config["filters"]["access_log_filter"]
assert filter_config["()"] == UvicornAccessLogFilter
assert filter_config["excluded_paths"] == ["/health", "/metrics"]
def test_config_applies_filter_to_access_handler(self):
"""Config should apply the filter to the access handler."""
config = create_uvicorn_log_config(excluded_paths=["/health"])
assert "access" in config["handlers"]
assert "filters" in config["handlers"]["access"]
assert "access_log_filter" in config["handlers"]["access"]["filters"]
def test_config_with_custom_log_level(self):
"""Config should respect custom log level."""
config = create_uvicorn_log_config(
excluded_paths=["/health"], log_level="debug"
)
assert config["loggers"]["uvicorn"]["level"] == "DEBUG"
assert config["loggers"]["uvicorn.access"]["level"] == "DEBUG"
assert config["loggers"]["uvicorn.error"]["level"] == "DEBUG"
def test_config_with_empty_excluded_paths(self):
"""Config should work with empty excluded paths."""
config = create_uvicorn_log_config(excluded_paths=[])
assert config["filters"]["access_log_filter"]["excluded_paths"] == []
def test_config_with_none_excluded_paths(self):
"""Config should work with None excluded paths."""
config = create_uvicorn_log_config(excluded_paths=None)
assert config["filters"]["access_log_filter"]["excluded_paths"] == []
class TestIntegration:
"""Integration tests for the access log filter."""
def test_filter_with_real_logger(self):
"""Test filter works with a real Python logger simulating uvicorn."""
# Create a logger with our filter (simulating uvicorn.access)
logger = logging.getLogger("uvicorn.access")
logger.setLevel(logging.INFO)
# Clear any existing handlers
logger.handlers = []
# Create a custom handler that tracks messages
logged_messages: list[str] = []
class TrackingHandler(logging.Handler):
def emit(self, record):
logged_messages.append(record.getMessage())
handler = TrackingHandler()
handler.setLevel(logging.INFO)
filter = UvicornAccessLogFilter(excluded_paths=["/health", "/metrics"])
handler.addFilter(filter)
logger.addHandler(handler)
# Log using uvicorn's format with args tuple
# Format: '%s - "%s %s HTTP/%s" %d'
logger.info(
'%s - "%s %s HTTP/%s" %d',
"127.0.0.1:12345",
"GET",
"/health",
"1.1",
200,
)
logger.info(
'%s - "%s %s HTTP/%s" %d',
"127.0.0.1:12345",
"GET",
"/v1/completions",
"1.1",
200,
)
logger.info(
'%s - "%s %s HTTP/%s" %d',
"127.0.0.1:12345",
"GET",
"/metrics",
"1.1",
200,
)
logger.info(
'%s - "%s %s HTTP/%s" %d',
"127.0.0.1:12345",
"POST",
"/v1/chat/completions",
"1.1",
200,
)
# Verify only non-excluded endpoints were logged
assert len(logged_messages) == 2
assert "/v1/completions" in logged_messages[0]
assert "/v1/chat/completions" in logged_messages[1]
def test_filter_allows_non_uvicorn_access_logs(self):
"""Test filter allows logs from non-uvicorn.access loggers."""
filter = UvicornAccessLogFilter(excluded_paths=["/health"])
# Log record from a different logger name
record = logging.LogRecord(
name="uvicorn.error",
level=logging.INFO,
pathname="",
lineno=0,
msg="Some error message about /health",
args=(),
exc_info=None,
)
# Should allow because it's not from uvicorn.access
assert filter.filter(record) is True
def test_filter_handles_malformed_args(self):
"""Test filter handles log records with unexpected args format."""
filter = UvicornAccessLogFilter(excluded_paths=["/health"])
# Log record with insufficient args
record = logging.LogRecord(
name="uvicorn.access",
level=logging.INFO,
pathname="",
lineno=0,
msg="Some message",
args=("only", "two"),
exc_info=None,
)
# Should allow because args doesn't have expected format
assert filter.filter(record) is True
def test_filter_handles_non_tuple_args(self):
"""Test filter handles log records with non-tuple args."""
filter = UvicornAccessLogFilter(excluded_paths=["/health"])
# Log record with None args
record = logging.LogRecord(
name="uvicorn.access",
level=logging.INFO,
pathname="",
lineno=0,
msg="Some message without args",
args=None,
exc_info=None,
)
# Should allow because args is None
assert filter.filter(record) is True
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import os
from transformers import PreTrainedTokenizerBase
from vllm.transformers_utils.tokenizer import get_tokenizer
from ..utils import models_path_prefix
# TOKENIZER_NAMES = [
# os.path.join(models_path_prefix, "facebook/opt-125m"),
# os.path.join(models_path_prefix, "gpt2"),
# ]
# export HF_ENDPOINT=https://hf-mirror.com
TOKENIZER_NAMES = [
"facebook/opt-125m",
"gpt2",
]
@pytest.mark.parametrize("tokenizer_name", TOKENIZER_NAMES)
def test_tokenizer_revision(tokenizer_name: str):
# Assume that "main" branch always exists
# tokenizer = get_tokenizer(tokenizer_name, revision="main")
tokenizer = get_tokenizer(tokenizer_name)
assert isinstance(tokenizer, PreTrainedTokenizerBase)
# Assume that "never" branch always does not exist
with pytest.raises(OSError, match='not a valid git identifier'):
get_tokenizer(tokenizer_name, revision="never")
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501 # ruff: noqa: E501
import os
import json import json
import pytest import pytest
...@@ -11,10 +12,11 @@ from vllm.tokenizers import get_tokenizer ...@@ -11,10 +12,11 @@ from vllm.tokenizers import get_tokenizer
from vllm.tool_parsers.glm4_moe_tool_parser import ( from vllm.tool_parsers.glm4_moe_tool_parser import (
Glm4MoeModelToolParser, Glm4MoeModelToolParser,
) )
from ..utils import models_path_prefix
pytest.skip("skip glm4_moe parser test", allow_module_level=True) pytest.skip("skip glm4_moe parser test", allow_module_level=True)
# Use a common model that is likely to be available # Use a common model that is likely to be available
MODEL = "zai-org/GLM-4.5" MODEL = os.path.join(models_path_prefix, "zai-org/GLM-4.5")
@pytest.fixture(scope="module") @pytest.fixture(scope="module")
...@@ -225,6 +227,7 @@ def test_extract_tool_calls( ...@@ -225,6 +227,7 @@ def test_extract_tool_calls(
def test_extract_tool_calls_with_thinking_tags(glm4_moe_tool_parser): def test_extract_tool_calls_with_thinking_tags(glm4_moe_tool_parser):
"""Test tool extraction when thinking tags are present.""" """Test tool extraction when thinking tags are present."""
model_output = """<think>I want to get the weather.</think> model_output = """<think>I want to get the weather.</think>
I will help you get the weather. I will help you get the weather.
<tool_call>get_weather <tool_call>get_weather
<arg_key>city</arg_key> <arg_key>city</arg_key>
...@@ -242,6 +245,7 @@ I will help you get the weather. ...@@ -242,6 +245,7 @@ I will help you get the weather.
assert extracted_tool_calls.tool_calls[0].function.name == "get_weather" assert extracted_tool_calls.tool_calls[0].function.name == "get_weather"
expected_content = """<think>I want to get the weather.</think> expected_content = """<think>I want to get the weather.</think>
I will help you get the weather.""" I will help you get the weather."""
assert extracted_tool_calls.content == expected_content assert extracted_tool_calls.content == expected_content
...@@ -285,6 +289,7 @@ def test_extract_tool_calls_empty_arguments(glm4_moe_tool_parser): ...@@ -285,6 +289,7 @@ def test_extract_tool_calls_empty_arguments(glm4_moe_tool_parser):
def test_extract_tool_calls_mixed_content(glm4_moe_tool_parser): def test_extract_tool_calls_mixed_content(glm4_moe_tool_parser):
"""Test extraction with mixed content and multiple tool calls.""" """Test extraction with mixed content and multiple tool calls."""
model_output = """I will help you get the weather info. model_output = """I will help you get the weather info.
<tool_call>get_weather <tool_call>get_weather
<arg_key>city</arg_key> <arg_key>city</arg_key>
<arg_value>Beijing</arg_value> <arg_value>Beijing</arg_value>
...@@ -443,4 +448,4 @@ def test_extract_tool_calls_incomplete_tool_call(glm4_moe_tool_parser): ...@@ -443,4 +448,4 @@ def test_extract_tool_calls_incomplete_tool_call(glm4_moe_tool_parser):
# Incomplete tool calls should not be extracted # Incomplete tool calls should not be extracted
assert not extracted_tool_calls.tools_called assert not extracted_tool_calls.tools_called
assert extracted_tool_calls.tool_calls == [] assert extracted_tool_calls.tool_calls == []
assert extracted_tool_calls.content == model_output assert extracted_tool_calls.content == model_output
\ No newline at end of file
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from torch_xla._internal import tpu
import vllm
from vllm.lora.request import LoRARequest
# This file contains tests to ensure that LoRA works correctly on the TPU
# backend. We use a series of custom trained adapters for Qwen2.5-3B-Instruct
# for this. The adapters are:
# Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_x_adapter, where x ranges
# from 1 to 4.
# These adapters are trained using a standard huggingface peft training script,
# where all the inputs are "What is 1+1? \n" and all the outputs are "x". We run
# 100 training iterations with a training batch size of 100.
def setup_vllm(num_loras: int, tp: int) -> vllm.LLM:
return vllm.LLM(
model="Qwen/Qwen2.5-3B-Instruct",
max_model_len=256,
max_num_seqs=8,
tensor_parallel_size=tp,
enable_lora=True,
max_loras=num_loras,
max_lora_rank=8,
)
TPU_TENSOR_PARALLEL_SIZES = (
[1, tpu.num_available_chips()] if tpu.num_available_chips() > 1 else [1]
)
@pytest.mark.parametrize("tp", TPU_TENSOR_PARALLEL_SIZES)
def test_single_lora(tp: int):
"""
This test ensures we can run a single LoRA adapter on the TPU backend.
We run "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_1_adapter" which
will force Qwen2.5-3B-Instruct to claim 1+1=1.
"""
llm = setup_vllm(1, tp)
prompt = "What is 1+1? \n"
lora_request = LoRARequest(
"lora_adapter_1",
1,
"Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_1_adapter",
)
output = (
llm.generate(
prompt,
sampling_params=vllm.SamplingParams(max_tokens=256, temperature=0),
lora_request=lora_request,
)[0]
.outputs[0]
.text
)
answer = output.strip()[0]
assert answer.isdigit()
assert int(answer) == 1
@pytest.mark.parametrize("tp", TPU_TENSOR_PARALLEL_SIZES)
def test_lora_hotswapping(tp: int):
"""
This test ensures we can run multiple LoRA adapters on the TPU backend, even
if we only have space to store 1.
We run "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_x_adapter" which
will force Qwen2.5-3B-Instruct to claim 1+1=x, for a range of x.
"""
lora_name_template = "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_{}_adapter"
lora_requests = [
LoRARequest(f"lora_adapter_{i}", i, lora_name_template.format(i))
for i in range(1, 5)
]
llm = setup_vllm(1, tp)
prompt = "What is 1+1? \n"
for i, req in enumerate(lora_requests):
output = (
llm.generate(
prompt,
sampling_params=vllm.SamplingParams(max_tokens=256, temperature=0),
lora_request=req,
)[0]
.outputs[0]
.text
)
answer = output.strip()[0]
assert answer.isdigit()
assert int(answer) == i + 1
@pytest.mark.parametrize("tp", TPU_TENSOR_PARALLEL_SIZES)
def test_multi_lora(tp: int):
"""
This test ensures we can run multiple LoRA adapters on the TPU backend, when
we have enough space to store all of them.
We run "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_x_adapter" which
will force Qwen2.5-3B-Instruct to claim 1+1=x, for a range of x.
"""
lora_name_template = "Username6568/Qwen2.5-3B-Instruct-1_plus_1_equals_{}_adapter"
lora_requests = [
LoRARequest(f"lora_adapter_{i}", i, lora_name_template.format(i))
for i in range(1, 5)
]
llm = setup_vllm(4, tp)
prompt = "What is 1+1? \n"
for i, req in enumerate(lora_requests):
output = (
llm.generate(
prompt,
sampling_params=vllm.SamplingParams(max_tokens=256, temperature=0),
lora_request=req,
)[0]
.outputs[0]
.text
)
answer = output.strip()[0]
assert answer.isdigit()
assert int(output.strip()[0]) == i + 1
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import glob
import os
import tempfile
import depyf
def test_tpu_compilation():
temp_dir = tempfile.mkdtemp()
with depyf.prepare_debug(temp_dir):
from vllm import LLM, SamplingParams
prompts = [
"A robot may not injure a human being",
"It is only with the heart that one can see rightly;",
"The greatest glory in living lies not in never falling,",
]
answers = [
" or, through inaction",
" what is essential ",
" but in rising ",
]
# Currently, top-p sampling is disabled. `top_p` should be 1.0.
N = 1
sampling_params = SamplingParams(temperature=0.7, top_p=1.0, n=N, max_tokens=16)
llm = LLM(
model="Qwen/Qwen2-1.5B-Instruct",
max_num_batched_tokens=256,
max_model_len=256,
max_num_seqs=32,
enforce_eager=False,
)
outputs = llm.generate(prompts, sampling_params)
for output, answer in zip(outputs, answers):
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
assert generated_text.startswith(answer)
compiled_codes = sorted(
glob.glob(os.path.join(temp_dir, "__transformed_code*for_forward.py"))
)
for i, compiled_code in enumerate(compiled_codes):
print("{} file: {}".format(i + 1, compiled_code))
# We should only trigger Dynamo compilation 2 times:
# 1. Forward pass without kv_caches
# 2. Forward pass with kv_caches
# Check we have 2 compiled codes
assert len(compiled_codes) == 2
kv_cache_prefix = "kv_cache"
attn_prefix = "ragged_paged_attention"
def extract_compiled_index(s):
parts = s.replace(".", "_").split("_")
numbers = [int(part) for part in parts if part.isdigit()]
return numbers[0]
# Check all the compilations are as expected. The dump files include the
# captured graph for the forward function of the nn.Module.
compiled_fns = sorted(
glob.glob(os.path.join(temp_dir, "__compiled_fn*Forward_graph*.py")),
key=lambda s: extract_compiled_index(s),
)
for i, compiled_fn in enumerate(compiled_fns):
print("{} file: {}".format(i + 1, compiled_fn))
# The first compilation should not have any kv_caches
with open(compiled_fns[0]) as f:
content = f.read()
assert kv_cache_prefix not in content
# The second compilation should have kv_caches and the
# ragged_paged_attention
with open(compiled_fns[1]) as f:
content = f.read()
assert kv_cache_prefix in content and attn_prefix in content
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import pytest
from vllm.config import CompilationMode
from ..utils import compare_two_settings, models_path_prefix
# --enforce-eager on TPU causes graph compilation
# this times out default Health Check in the MQLLMEngine,
# so we set the timeout here to 30s
def test_custom_dispatcher(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m:
m.setenv("VLLM_RPC_TIMEOUT", "30000")
compare_two_settings(
os.path.join(models_path_prefix, "Qwen/Qwen2.5-1.5B-Instruct"),
arg1=[
"--max-model-len=256",
"--max-num-seqs=32",
"--enforce-eager",
f"-O{CompilationMode.DYNAMO_TRACE_ONCE}",
],
arg2=[
"--max-model-len=256",
"--max-num-seqs=32",
"--enforce-eager",
f"-O{CompilationMode.STOCK_TORCH_COMPILE}",
],
env1={},
env2={},
)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for the Pallas MOE implementation.
Run `pytest tests/kernels/moe/test_moe_pallas.py`.
"""
import pytest
import torch
import torch_xla
from vllm.model_executor.layers.fused_moe.moe_pallas import fused_moe as pallas_moe
from vllm.model_executor.layers.fused_moe.moe_torch_iterative import (
fused_moe as torch_moe,
)
from vllm.platforms import current_platform
if not current_platform.is_tpu():
pytest.skip("This test needs a TPU.", allow_module_level=True)
NUM_EXPERTS = [8, 64]
EP_SIZE = [1]
TOP_KS = [2, 6]
# The Pallas GMM kernel requires num_tokens * topk to be a multiple of 16
@pytest.mark.parametrize("m", [8, 16, 64, 2048])
@pytest.mark.parametrize("n", [128, 1024, 2048])
@pytest.mark.parametrize("k", [128, 511, 1024])
@pytest.mark.parametrize("e", NUM_EXPERTS)
@pytest.mark.parametrize("topk", TOP_KS)
@pytest.mark.parametrize("ep_size", EP_SIZE)
@pytest.mark.parametrize("dtype", [torch.bfloat16])
def test_pallas_moe(
m: int,
n: int,
k: int,
e: int,
topk: int,
ep_size: int,
dtype: torch.dtype,
):
import torch_xla.core.xla_model as xm
with torch.device(xm.xla_device()):
a = torch.randn((m, k), dtype=dtype) / 10
w1 = torch.randn((e, 2 * n, k), dtype=dtype) / 10
w2 = torch.randn((e, k, n), dtype=dtype) / 10
score = torch.randn((m, e), dtype=dtype)
# TODO: Support ep
if ep_size > 1:
pytest.skip("No support for ep_size > 1 yet")
else:
e_map = None
# Run both implementations
torch_output = torch_moe(
hidden_states=a,
w1=w1,
w2=w2,
gating_output=score,
topk=topk,
global_num_experts=e,
expert_map=e_map,
renormalize=False,
)
pallas_output = pallas_moe(
hidden_states=a,
w1=w1,
w2=w2,
gating_output=score,
topk=topk,
global_num_experts=e,
expert_map=e_map,
renormalize=False,
)
torch_xla.sync(wait=False)
# Compare outputs
torch.testing.assert_close(
pallas_output.cpu(),
torch_output.cpu(),
atol=2e-2,
rtol=0,
)
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass
import lm_eval
import pytest
TASK = "gsm8k"
FILTER = "exact_match,strict-match"
RTOL = 0.03
@dataclass
class GSM8KAccuracyTestConfig:
model_name: str
expected_value: float
def get_model_args(self) -> str:
return f"pretrained={self.model_name},max_model_len=4096,max_num_seqs=32"
# NOTE: Accuracy scores measured on GPUs.
ACCURACY_CONFIGS = [
GSM8KAccuracyTestConfig(
model_name="neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
expected_value=0.76,
), # no bias
# NOTE(rob): We cannot re-initialize vLLM in the same process for TPU,
# so only one of these tests can run in a single call to pytest. As
# a follow-up, move this into the LM-EVAL section of the CI.
# GSM8KAccuracyTestConfig(
# model_name="neuralmagic/Qwen2-7B-Instruct-quantized.w8a8",
# expected_value=0.66), # bias in QKV layers
]
@pytest.mark.parametrize("config", ACCURACY_CONFIGS)
def test_gsm8k_correctness(config: GSM8KAccuracyTestConfig):
results = lm_eval.simple_evaluate(
model="vllm",
model_args=config.get_model_args(),
tasks="gsm8k",
batch_size="auto",
)
EXPECTED_VALUE = config.expected_value
measured_value = results["results"][TASK][FILTER]
assert (
measured_value - RTOL < EXPECTED_VALUE
and measured_value + RTOL > EXPECTED_VALUE
), f"Expected: {EXPECTED_VALUE} | Measured: {measured_value}"
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa
import asyncio
import hashlib
import json
import os
import pickle
import socket
import tempfile
from collections.abc import AsyncIterator
from pathlib import Path
from unittest.mock import patch
import os
import pytest
import torch
import yaml
import zmq
from transformers import AutoTokenizer
from vllm_test_utils.monitor import monitor
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
from vllm.transformers_utils.detokenizer_utils import (
convert_ids_list_to_tokens)
# isort: off
from vllm.utils import (
CacheInfo, FlexibleArgumentParser, LRUCache, MemorySnapshot,
PlaceholderModule, bind_kv_cache, common_broadcastable_dtype,
current_stream, deprecate_kwargs, get_open_port, get_tcp_uri,
is_lossless_cast, join_host_port, make_zmq_path, make_zmq_socket,
memory_profiling, merge_async_iterators, sha256, split_host_port,
split_zmq_path, supports_kw, swap_dict_values, unique_filepath)
# isort: on
from ..utils import create_new_process_for_each_test, error_on_warning, models_path_prefix
@pytest.mark.asyncio
async def test_merge_async_iterators():
async def mock_async_iterator(idx: int):
try:
while True:
yield f"item from iterator {idx}"
await asyncio.sleep(0.1)
except asyncio.CancelledError:
print(f"iterator {idx} cancelled")
iterators = [mock_async_iterator(i) for i in range(3)]
merged_iterator = merge_async_iterators(*iterators)
async def stream_output(generator: AsyncIterator[tuple[int, str]]):
async for idx, output in generator:
print(f"idx: {idx}, output: {output}")
task = asyncio.create_task(stream_output(merged_iterator))
await asyncio.sleep(0.5)
task.cancel()
with pytest.raises(asyncio.CancelledError):
await task
for iterator in iterators:
try:
# Can use anext() in python >= 3.10
await asyncio.wait_for(iterator.__anext__(), 1)
except StopAsyncIteration:
# All iterators should be cancelled and print this message.
print("Iterator was cancelled normally")
except (Exception, asyncio.CancelledError) as e:
raise AssertionError() from e
def test_deprecate_kwargs_always():
@deprecate_kwargs("old_arg", is_deprecated=True)
def dummy(*, old_arg: object = None, new_arg: object = None):
pass
with pytest.warns(DeprecationWarning, match="'old_arg'"):
dummy(old_arg=1)
with error_on_warning(DeprecationWarning):
dummy(new_arg=1)
def test_deprecate_kwargs_never():
@deprecate_kwargs("old_arg", is_deprecated=False)
def dummy(*, old_arg: object = None, new_arg: object = None):
pass
with error_on_warning(DeprecationWarning):
dummy(old_arg=1)
with error_on_warning(DeprecationWarning):
dummy(new_arg=1)
def test_deprecate_kwargs_dynamic():
is_deprecated = True
@deprecate_kwargs("old_arg", is_deprecated=lambda: is_deprecated)
def dummy(*, old_arg: object = None, new_arg: object = None):
pass
with pytest.warns(DeprecationWarning, match="'old_arg'"):
dummy(old_arg=1)
with error_on_warning(DeprecationWarning):
dummy(new_arg=1)
is_deprecated = False
with error_on_warning(DeprecationWarning):
dummy(old_arg=1)
with error_on_warning(DeprecationWarning):
dummy(new_arg=1)
def test_deprecate_kwargs_additional_message():
@deprecate_kwargs("old_arg", is_deprecated=True, additional_message="abcd")
def dummy(*, old_arg: object = None, new_arg: object = None):
pass
with pytest.warns(DeprecationWarning, match="abcd"):
dummy(old_arg=1)
def test_get_open_port(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m:
m.setenv("VLLM_PORT", "5678")
# make sure we can get multiple ports, even if the env var is set
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s1:
s1.bind(("localhost", get_open_port()))
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s2:
s2.bind(("localhost", get_open_port()))
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s3:
s3.bind(("localhost", get_open_port()))
# Tests for FlexibleArgumentParser
@pytest.fixture
def parser():
parser = FlexibleArgumentParser()
parser.add_argument('--image-input-type',
choices=['pixel_values', 'image_features'])
parser.add_argument('--model-name')
parser.add_argument('--batch-size', type=int)
parser.add_argument('--enable-feature', action='store_true')
parser.add_argument('--hf-overrides', type=json.loads)
parser.add_argument('-O', '--compilation-config', type=json.loads)
return parser
@pytest.fixture
def parser_with_config():
parser = FlexibleArgumentParser()
parser.add_argument('serve')
parser.add_argument('model_tag', nargs='?')
parser.add_argument('--model', type=str)
parser.add_argument('--served-model-name', type=str)
parser.add_argument('--config', type=str)
parser.add_argument('--port', type=int)
parser.add_argument('--tensor-parallel-size', type=int)
parser.add_argument('--trust-remote-code', action='store_true')
return parser
def test_underscore_to_dash(parser):
args = parser.parse_args(['--image_input_type', 'pixel_values'])
assert args.image_input_type == 'pixel_values'
def test_mixed_usage(parser):
args = parser.parse_args([
'--image_input_type', 'image_features', '--model-name',
os.path.join(models_path_prefix, 'facebook/opt-125m')
])
assert args.image_input_type == 'image_features'
assert args.model_name == os.path.join(models_path_prefix, 'facebook/opt-125m')
def test_with_equals_sign(parser):
model_name_with_path = os.path.join(models_path_prefix, 'facebook/opt-125m')
args = parser.parse_args(
['--image_input_type=pixel_values', f'--model-name={model_name_with_path}'])
assert args.image_input_type == 'pixel_values'
assert args.model_name == os.path.join(models_path_prefix, 'facebook/opt-125m')
def test_with_int_value(parser):
args = parser.parse_args(['--batch_size', '32'])
assert args.batch_size == 32
args = parser.parse_args(['--batch-size', '32'])
assert args.batch_size == 32
def test_with_bool_flag(parser):
args = parser.parse_args(['--enable_feature'])
assert args.enable_feature is True
args = parser.parse_args(['--enable-feature'])
assert args.enable_feature is True
def test_invalid_choice(parser):
with pytest.raises(SystemExit):
parser.parse_args(['--image_input_type', 'invalid_choice'])
def test_missing_required_argument(parser):
parser.add_argument('--required-arg', required=True)
with pytest.raises(SystemExit):
parser.parse_args([])
def test_cli_override_to_config(parser_with_config, cli_config_file):
args = parser_with_config.parse_args([
'serve', 'mymodel', '--config', cli_config_file,
'--tensor-parallel-size', '3'
])
assert args.tensor_parallel_size == 3
args = parser_with_config.parse_args([
'serve', 'mymodel', '--tensor-parallel-size', '3', '--config',
cli_config_file
])
assert args.tensor_parallel_size == 3
assert args.port == 12312
args = parser_with_config.parse_args([
'serve', 'mymodel', '--tensor-parallel-size', '3', '--config',
cli_config_file, '--port', '666'
])
assert args.tensor_parallel_size == 3
assert args.port == 666
def test_config_args(parser_with_config, cli_config_file):
args = parser_with_config.parse_args(
['serve', 'mymodel', '--config', cli_config_file])
assert args.tensor_parallel_size == 2
assert args.trust_remote_code
def test_config_file(parser_with_config):
with pytest.raises(FileNotFoundError):
parser_with_config.parse_args(
['serve', 'mymodel', '--config', 'test_config.yml'])
with pytest.raises(ValueError):
parser_with_config.parse_args(
['serve', 'mymodel', '--config', './data/test_config.json'])
with pytest.raises(ValueError):
parser_with_config.parse_args([
'serve', 'mymodel', '--tensor-parallel-size', '3', '--config',
'--batch-size', '32'
])
def test_no_model_tag(parser_with_config, cli_config_file):
with pytest.raises(ValueError):
parser_with_config.parse_args(['serve', '--config', cli_config_file])
def test_dict_args(parser):
args = [
"--model-name=something.something",
"--hf-overrides.key1",
"val1",
# Test nesting
"--hf-overrides.key2.key3",
"val2",
"--hf-overrides.key2.key4",
"val3",
# Test compile config and compilation level
"-O.use_inductor=true",
"-O.backend",
"custom",
"-O1",
# Test = sign
"--hf-overrides.key5=val4",
# Test underscore to dash conversion
"--hf_overrides.key_6",
"val5",
"--hf_overrides.key-7.key_8",
"val6",
# Test data type detection
"--hf_overrides.key9",
"100",
"--hf_overrides.key10",
"100.0",
"--hf_overrides.key11",
"true",
"--hf_overrides.key12.key13",
"null",
# Test '-' and '.' in value
"--hf_overrides.key14.key15",
"-minus.and.dot",
# Test array values
"-O.custom_ops+",
"-quant_fp8",
"-O.custom_ops+=+silu_mul,-rms_norm",
]
parsed_args = parser.parse_args(args)
assert parsed_args.model_name == "something.something"
assert parsed_args.hf_overrides == {
"key1": "val1",
"key2": {
"key3": "val2",
"key4": "val3",
},
"key5": "val4",
"key_6": "val5",
"key-7": {
"key_8": "val6",
},
"key9": 100,
"key10": 100.0,
"key11": True,
"key12": {
"key13": None,
},
"key14": {
"key15": "-minus.and.dot",
}
}
assert parsed_args.compilation_config == {
"level": 1,
"use_inductor": True,
"backend": "custom",
"custom_ops": ["-quant_fp8", "+silu_mul", "-rms_norm"],
}
def test_duplicate_dict_args(caplog_vllm, parser):
args = [
"--model-name=something.something",
"--hf-overrides.key1",
"val1",
"--hf-overrides.key1",
"val2",
"-O1",
"-O.level",
"2",
"-O3",
]
parsed_args = parser.parse_args(args)
# Should be the last value
assert parsed_args.hf_overrides == {"key1": "val2"}
assert parsed_args.compilation_config == {"level": 3}
assert len(caplog_vllm.records) == 1
assert "duplicate" in caplog_vllm.text
assert "--hf-overrides.key1" in caplog_vllm.text
assert "-O.level" in caplog_vllm.text
# yapf: enable
@pytest.mark.parametrize(
"callable,kw_name,requires_kw_only,allow_var_kwargs,is_supported",
[
# Tests for positional argument support
(lambda foo: None, "foo", True, True, False),
(lambda foo: None, "foo", False, True, True),
# Tests for positional or keyword / keyword only
(lambda foo=100: None, "foo", True, True, False),
(lambda *, foo: None, "foo", False, True, True),
# Tests to make sure the names of variadic params are NOT supported
(lambda *args: None, "args", False, True, False),
(lambda **kwargs: None, "kwargs", False, True, False),
# Tests for if we allow var kwargs to add support
(lambda foo: None, "something_else", False, True, False),
(lambda foo, **kwargs: None, "something_else", False, True, True),
(lambda foo, **kwargs: None, "kwargs", True, True, False),
(lambda foo, **kwargs: None, "foo", True, True, False),
])
# yapf: disable
def test_supports_kw(callable,kw_name,requires_kw_only,
allow_var_kwargs,is_supported):
assert supports_kw(
callable=callable,
kw_name=kw_name,
requires_kw_only=requires_kw_only,
allow_var_kwargs=allow_var_kwargs
) == is_supported
@create_new_process_for_each_test()
def test_memory_profiling():
# Fake out some model loading + inference memory usage to test profiling
# Memory used by other processes will show up as cuda usage outside of torch
from vllm.distributed.device_communicators.cuda_wrapper import (
CudaRTLibrary)
lib = CudaRTLibrary()
# 512 MiB allocation outside of this instance
handle1 = lib.cudaMalloc(512 * 1024 * 1024)
baseline_snapshot = MemorySnapshot()
# load weights
weights = torch.randn(128, 1024, 1024, device='cuda', dtype=torch.float32)
weights_memory = 128 * 1024 * 1024 * 4 # 512 MiB
def measure_current_non_torch():
free, total = torch.cuda.mem_get_info()
current_used = total - free
current_torch = torch.cuda.memory_reserved()
current_non_torch = current_used - current_torch
return current_non_torch
with memory_profiling(baseline_snapshot=baseline_snapshot,
weights_memory=weights_memory) as result, \
monitor(measure_current_non_torch) as monitored_values:
# make a memory spike, 1 GiB
spike = torch.randn(256, 1024, 1024, device='cuda', dtype=torch.float32)
del spike
# Add some extra non-torch memory 256 MiB (simulate NCCL)
handle2 = lib.cudaMalloc(256 * 1024 * 1024)
# this is an analytic value, it is exact,
# we only have 256 MiB non-torch memory increase
measured_diff = monitored_values.values[-1] - monitored_values.values[0]
assert measured_diff == 256 * 1024 * 1024
# Check that the memory usage is within 5% of the expected values
# 5% tolerance is caused by cuda runtime.
# we cannot control cuda runtime in the granularity of bytes,
# which causes a small error (<10 MiB in practice)
non_torch_ratio = result.non_torch_increase / (256 * 1024 * 1024) # noqa
assert abs(non_torch_ratio - 1) <= 0.05
assert result.torch_peak_increase == 1024 * 1024 * 1024
del weights
lib.cudaFree(handle1)
lib.cudaFree(handle2)
def test_bind_kv_cache():
from vllm.attention import Attention
ctx = {
'layers.0.self_attn': Attention(32, 128, 0.1),
'layers.1.self_attn': Attention(32, 128, 0.1),
'layers.2.self_attn': Attention(32, 128, 0.1),
'layers.3.self_attn': Attention(32, 128, 0.1),
}
kv_cache = [
torch.zeros((1, )),
torch.zeros((1, )),
torch.zeros((1, )),
torch.zeros((1, )),
]
bind_kv_cache(ctx, [kv_cache])
assert ctx['layers.0.self_attn'].kv_cache[0] is kv_cache[0]
assert ctx['layers.1.self_attn'].kv_cache[0] is kv_cache[1]
assert ctx['layers.2.self_attn'].kv_cache[0] is kv_cache[2]
assert ctx['layers.3.self_attn'].kv_cache[0] is kv_cache[3]
def test_bind_kv_cache_kv_sharing():
from vllm.attention import Attention
ctx = {
'layers.0.self_attn': Attention(32, 128, 0.1),
'layers.1.self_attn': Attention(32, 128, 0.1),
'layers.2.self_attn': Attention(32, 128, 0.1),
'layers.3.self_attn': Attention(32, 128, 0.1),
}
kv_cache = [
torch.zeros((1, )),
torch.zeros((1, )),
torch.zeros((1, )),
torch.zeros((1, )),
]
shared_kv_cache_layers = {
'layers.2.self_attn': 'layers.1.self_attn',
'layers.3.self_attn': 'layers.0.self_attn'
}
bind_kv_cache(ctx, [kv_cache], shared_kv_cache_layers)
assert ctx['layers.0.self_attn'].kv_cache[0] is kv_cache[0]
assert ctx['layers.1.self_attn'].kv_cache[0] is kv_cache[1]
assert ctx['layers.2.self_attn'].kv_cache[0] is kv_cache[1]
assert ctx['layers.3.self_attn'].kv_cache[0] is kv_cache[0]
def test_bind_kv_cache_non_attention():
from vllm.attention import Attention
# example from Jamba PP=2
ctx = {
'model.layers.20.attn': Attention(32, 128, 0.1),
'model.layers.28.attn': Attention(32, 128, 0.1),
}
kv_cache = [
torch.zeros((1, )),
torch.zeros((1, )),
]
bind_kv_cache(ctx, [kv_cache])
assert ctx['model.layers.20.attn'].kv_cache[0] is kv_cache[0]
assert ctx['model.layers.28.attn'].kv_cache[0] is kv_cache[1]
def test_bind_kv_cache_pp():
with patch("vllm.utils.cuda_device_count_stateless", lambda: 2):
# this test runs with 1 GPU, but we simulate 2 GPUs
cfg = VllmConfig(
parallel_config=ParallelConfig(pipeline_parallel_size=2))
with set_current_vllm_config(cfg):
from vllm.attention import Attention
ctx = {
'layers.0.self_attn': Attention(32, 128, 0.1),
}
kv_cache = [
[torch.zeros((1, ))],
[torch.zeros((1, ))]
]
bind_kv_cache(ctx, kv_cache)
assert ctx['layers.0.self_attn'].kv_cache[0] is kv_cache[0][0]
assert ctx['layers.0.self_attn'].kv_cache[1] is kv_cache[1][0]
class TestLRUCache(LRUCache):
def _on_remove(self, key, value):
if not hasattr(self, "_remove_counter"):
self._remove_counter = 0
self._remove_counter += 1
def test_lru_cache():
cache = TestLRUCache(3)
assert cache.stat() == CacheInfo(hits=0, total=0)
assert cache.stat(delta=True) == CacheInfo(hits=0, total=0)
cache.put(1, 1)
assert len(cache) == 1
cache.put(1, 1)
assert len(cache) == 1
cache.put(2, 2)
assert len(cache) == 2
cache.put(3, 3)
assert len(cache) == 3
assert set(cache.cache) == {1, 2, 3}
cache.put(4, 4)
assert len(cache) == 3
assert set(cache.cache) == {2, 3, 4}
assert cache._remove_counter == 1
assert cache.get(2) == 2
assert cache.stat() == CacheInfo(hits=1, total=1)
assert cache.stat(delta=True) == CacheInfo(hits=1, total=1)
assert cache[2] == 2
assert cache.stat() == CacheInfo(hits=2, total=2)
assert cache.stat(delta=True) == CacheInfo(hits=1, total=1)
cache.put(5, 5)
assert set(cache.cache) == {2, 4, 5}
assert cache._remove_counter == 2
assert cache.pop(5) == 5
assert len(cache) == 2
assert set(cache.cache) == {2, 4}
assert cache._remove_counter == 3
assert cache.get(-1) is None
assert cache.stat() == CacheInfo(hits=2, total=3)
assert cache.stat(delta=True) == CacheInfo(hits=0, total=1)
cache.pop(10)
assert len(cache) == 2
assert set(cache.cache) == {2, 4}
assert cache._remove_counter == 3
cache.get(10)
assert len(cache) == 2
assert set(cache.cache) == {2, 4}
assert cache._remove_counter == 3
cache.put(6, 6)
assert len(cache) == 3
assert set(cache.cache) == {2, 4, 6}
assert 2 in cache
assert 4 in cache
assert 6 in cache
cache.remove_oldest()
assert len(cache) == 2
assert set(cache.cache) == {2, 6}
assert cache._remove_counter == 4
cache.clear()
assert len(cache) == 0
assert cache._remove_counter == 6
assert cache.stat() == CacheInfo(hits=0, total=0)
assert cache.stat(delta=True) == CacheInfo(hits=0, total=0)
cache._remove_counter = 0
cache[1] = 1
assert len(cache) == 1
cache[1] = 1
assert len(cache) == 1
cache[2] = 2
assert len(cache) == 2
cache[3] = 3
assert len(cache) == 3
assert set(cache.cache) == {1, 2, 3}
cache[4] = 4
assert len(cache) == 3
assert set(cache.cache) == {2, 3, 4}
assert cache._remove_counter == 1
assert cache[2] == 2
cache[5] = 5
assert set(cache.cache) == {2, 4, 5}
assert cache._remove_counter == 2
del cache[5]
assert len(cache) == 2
assert set(cache.cache) == {2, 4}
assert cache._remove_counter == 3
cache.pop(10)
assert len(cache) == 2
assert set(cache.cache) == {2, 4}
assert cache._remove_counter == 3
cache[6] = 6
assert len(cache) == 3
assert set(cache.cache) == {2, 4, 6}
assert 2 in cache
assert 4 in cache
assert 6 in cache
# yapf: disable
@pytest.mark.parametrize(
("src_dtype", "tgt_dtype", "expected_result"),
[
# Different precision_levels
(torch.bool, torch.int8, True),
(torch.bool, torch.float16, True),
(torch.bool, torch.complex32, True),
(torch.int64, torch.bool, False),
(torch.int64, torch.float16, True),
(torch.int64, torch.complex32, True),
(torch.float64, torch.bool, False),
(torch.float64, torch.int8, False),
(torch.float64, torch.complex32, True),
(torch.complex128, torch.bool, False),
(torch.complex128, torch.int8, False),
(torch.complex128, torch.float16, False),
# precision_level=0
(torch.bool, torch.bool, True),
# precision_level=1
(torch.int8, torch.int16, True),
(torch.int16, torch.int8, False),
(torch.uint8, torch.int8, False),
(torch.int8, torch.uint8, False),
# precision_level=2
(torch.float16, torch.float32, True),
(torch.float32, torch.float16, False),
(torch.bfloat16, torch.float32, True),
(torch.float32, torch.bfloat16, False),
# precision_level=3
(torch.complex32, torch.complex64, True),
(torch.complex64, torch.complex32, False),
],
)
# yapf: enable
def test_is_lossless_cast(src_dtype, tgt_dtype, expected_result):
assert is_lossless_cast(src_dtype, tgt_dtype) == expected_result
# yapf: disable
@pytest.mark.parametrize(
("dtypes", "expected_result"),
[
([torch.bool], torch.bool),
([torch.bool, torch.int8], torch.int8),
([torch.bool, torch.int8, torch.float16], torch.float16),
([torch.bool, torch.int8, torch.float16, torch.complex32], torch.complex32), # noqa: E501
],
)
# yapf: enable
def test_common_broadcastable_dtype(dtypes, expected_result):
assert common_broadcastable_dtype(dtypes) == expected_result
def test_placeholder_module_error_handling():
placeholder = PlaceholderModule("placeholder_1234")
def build_ctx():
return pytest.raises(ModuleNotFoundError, match="No module named")
with build_ctx():
int(placeholder)
with build_ctx():
placeholder()
with build_ctx():
_ = placeholder.some_attr
with build_ctx():
# Test conflict with internal __name attribute
_ = placeholder.name
# OK to print the placeholder or use it in a f-string
_ = repr(placeholder)
_ = str(placeholder)
# No error yet; only error when it is used downstream
placeholder_attr = placeholder.placeholder_attr("attr")
with build_ctx():
int(placeholder_attr)
with build_ctx():
placeholder_attr()
with build_ctx():
_ = placeholder_attr.some_attr
with build_ctx():
# Test conflict with internal __module attribute
_ = placeholder_attr.module
# yapf: disable
@pytest.mark.parametrize(
"obj,key1,key2",
[
# Tests for both keys exist
({1: "a", 2: "b"}, 1, 2),
# Tests for one key does not exist
({1: "a", 2: "b"}, 1, 3),
# Tests for both keys do not exist
({1: "a", 2: "b"}, 3, 4),
])
# yapf: enable
def test_swap_dict_values(obj, key1, key2):
original_obj = obj.copy()
swap_dict_values(obj, key1, key2)
if key1 in original_obj:
assert obj[key2] == original_obj[key1]
else:
assert key2 not in obj
if key2 in original_obj:
assert obj[key1] == original_obj[key2]
else:
assert key1 not in obj
def test_model_specification(parser_with_config, cli_config_file,
cli_config_file_with_model):
# Test model in CLI takes precedence over config
args = parser_with_config.parse_args(
['serve', 'cli-model', '--config', cli_config_file_with_model])
assert args.model_tag == 'cli-model'
assert args.served_model_name == 'mymodel'
# Test model from config file works
args = parser_with_config.parse_args([
'serve',
'--config',
cli_config_file_with_model,
])
assert args.model == 'config-model'
assert args.served_model_name == 'mymodel'
# Test no model specified anywhere raises error
with pytest.raises(ValueError, match="No model specified!"):
parser_with_config.parse_args(['serve', '--config', cli_config_file])
# Test using --model option raises error
with pytest.raises(
ValueError,
match=
("With `vllm serve`, you should provide the model as a positional "
"argument or in a config file instead of via the `--model` option."),
):
parser_with_config.parse_args(['serve', '--model', 'my-model'])
# Test other config values are preserved
args = parser_with_config.parse_args([
'serve',
'cli-model',
'--config',
cli_config_file_with_model,
])
assert args.tensor_parallel_size == 2
assert args.trust_remote_code is True
assert args.port == 12312
@pytest.mark.parametrize("input", [(), ("abc", ), (None, ),
(None, bool, [1, 2, 3])])
def test_sha256(input: tuple):
digest = sha256(input)
assert digest is not None
assert isinstance(digest, bytes)
assert digest != b""
input_bytes = pickle.dumps(input, protocol=pickle.HIGHEST_PROTOCOL)
assert digest == hashlib.sha256(input_bytes).digest()
# hashing again, returns the same value
assert digest == sha256(input)
# hashing different input, returns different value
assert digest != sha256(input + (1, ))
@pytest.mark.parametrize(
"path,expected",
[
("ipc://some_path", ("ipc", "some_path", "")),
("tcp://127.0.0.1:5555", ("tcp", "127.0.0.1", "5555")),
("tcp://[::1]:5555", ("tcp", "::1", "5555")), # IPv6 address
("inproc://some_identifier", ("inproc", "some_identifier", "")),
])
def test_split_zmq_path(path, expected):
assert split_zmq_path(path) == expected
@pytest.mark.parametrize(
"invalid_path",
[
"invalid_path", # Missing scheme
"tcp://127.0.0.1", # Missing port
"tcp://[::1]", # Missing port for IPv6
"tcp://:5555", # Missing host
])
def test_split_zmq_path_invalid(invalid_path):
with pytest.raises(ValueError):
split_zmq_path(invalid_path)
def test_make_zmq_socket_ipv6():
# Check if IPv6 is supported by trying to create an IPv6 socket
try:
sock = socket.socket(socket.AF_INET6, socket.SOCK_STREAM)
sock.close()
except socket.error:
pytest.skip("IPv6 is not supported on this system")
ctx = zmq.Context()
ipv6_path = "tcp://[::]:5555" # IPv6 loopback address
socket_type = zmq.REP # Example socket type
# Create the socket
zsock: zmq.Socket = make_zmq_socket(ctx, ipv6_path, socket_type)
# Verify that the IPV6 option is set
assert zsock.getsockopt(
zmq.IPV6) == 1, "IPV6 option should be enabled for IPv6 addresses"
# Clean up
zsock.close()
ctx.term()
def test_make_zmq_path():
assert make_zmq_path("tcp", "127.0.0.1", "5555") == "tcp://127.0.0.1:5555"
assert make_zmq_path("tcp", "::1", "5555") == "tcp://[::1]:5555"
def test_get_tcp_uri():
assert get_tcp_uri("127.0.0.1", 5555) == "tcp://127.0.0.1:5555"
assert get_tcp_uri("::1", 5555) == "tcp://[::1]:5555"
def test_split_host_port():
# valid ipv4
assert split_host_port("127.0.0.1:5555") == ("127.0.0.1", 5555)
# invalid ipv4
with pytest.raises(ValueError):
# multi colon
assert split_host_port("127.0.0.1::5555")
with pytest.raises(ValueError):
# tailing colon
assert split_host_port("127.0.0.1:5555:")
with pytest.raises(ValueError):
# no colon
assert split_host_port("127.0.0.15555")
with pytest.raises(ValueError):
# none int port
assert split_host_port("127.0.0.1:5555a")
# valid ipv6
assert split_host_port("[::1]:5555") == ("::1", 5555)
# invalid ipv6
with pytest.raises(ValueError):
# multi colon
assert split_host_port("[::1]::5555")
with pytest.raises(IndexError):
# no colon
assert split_host_port("[::1]5555")
with pytest.raises(ValueError):
# none int port
assert split_host_port("[::1]:5555a")
def test_join_host_port():
assert join_host_port("127.0.0.1", 5555) == "127.0.0.1:5555"
assert join_host_port("::1", 5555) == "[::1]:5555"
def test_json_count_leaves():
"""Test json_count_leaves function from jsontree utility."""
from vllm.utils.jsontree import json_count_leaves
# Single leaf values
assert json_count_leaves(42) == 1
assert json_count_leaves("hello") == 1
assert json_count_leaves(None) == 1
# Empty containers
assert json_count_leaves([]) == 0
assert json_count_leaves({}) == 0
assert json_count_leaves(()) == 0
# Flat structures
assert json_count_leaves([1, 2, 3]) == 3
assert json_count_leaves({"a": 1, "b": 2}) == 2
assert json_count_leaves((1, 2, 3)) == 3
# Nested structures
nested_dict = {"a": 1, "b": {"c": 2, "d": 3}}
assert json_count_leaves(nested_dict) == 3
nested_list = [1, [2, 3], 4]
assert json_count_leaves(nested_list) == 4
mixed_nested = {"list": [1, 2], "dict": {"x": 3}, "value": 4}
assert json_count_leaves(mixed_nested) == 4
def test_convert_ids_list_to_tokens():
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
token_ids = tokenizer.encode("Hello, world!")
# token_ids = [9707, 11, 1879, 0]
assert tokenizer.convert_ids_to_tokens(token_ids) == [
'Hello', ',', 'Ġworld', '!'
]
tokens = convert_ids_list_to_tokens(tokenizer, token_ids)
assert tokens == ['Hello', ',', ' world', '!']
def test_current_stream_multithread():
import threading
if not torch.cuda.is_available():
pytest.skip("CUDA not available")
main_default_stream = torch.cuda.current_stream()
child_stream = torch.cuda.Stream()
thread_stream_ready = threading.Event()
thread_can_exit = threading.Event()
def child_thread_func():
with torch.cuda.stream(child_stream):
thread_stream_ready.set()
thread_can_exit.wait(timeout=10)
child_thread = threading.Thread(target=child_thread_func)
child_thread.start()
try:
assert thread_stream_ready.wait(
timeout=5), "Child thread failed to enter stream context in time"
main_current_stream = current_stream()
assert main_current_stream != child_stream, "Main thread's current_stream was contaminated by child thread"
assert main_current_stream == main_default_stream, "Main thread's current_stream is not the default stream"
# Notify child thread it can exit
thread_can_exit.set()
finally:
# Ensure child thread exits properly
child_thread.join(timeout=5)
if child_thread.is_alive():
pytest.fail("Child thread failed to exit properly")
def test_load_config_file(tmp_path):
# Define the configuration data
config_data = {
"enable-logging": True,
"list-arg": ["item1", "item2"],
"port": 12323,
"tensor-parallel-size": 4
}
# Write the configuration data to a temporary YAML file
config_file_path = tmp_path / "config.yaml"
with open(config_file_path, "w") as config_file:
yaml.dump(config_data, config_file)
# Initialize the parser
parser = FlexibleArgumentParser()
# Call the function with the temporary file path
processed_args = parser.load_config_file(str(config_file_path))
# Expected output
expected_args = [
"--enable-logging",
"--list-arg",
"item1",
"item2",
"--port",
"12323",
"--tensor-parallel-size",
"4",
]
# Assert that the processed arguments match the expected output
assert processed_args == expected_args
os.remove(str(config_file_path))
def test_unique_filepath():
temp_dir = tempfile.mkdtemp()
path_fn = lambda i: Path(temp_dir) / f"file_{i}.txt"
paths = set()
for i in range(10):
path = unique_filepath(path_fn)
path.write_text("test")
paths.add(path)
assert len(paths) == 10
assert len(list(Path(temp_dir).glob("*.txt"))) == 10
...@@ -1478,7 +1478,6 @@ def test_kv_connector_handles_preemption(is_async, use_ec_connector, ec_role): ...@@ -1478,7 +1478,6 @@ def test_kv_connector_handles_preemption(is_async, use_ec_connector, ec_role):
assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() == NUM_BLOCKS - 1 assert scheduler.kv_cache_manager.block_pool.get_num_free_blocks() == NUM_BLOCKS - 1
def make_output(scheduler: Scheduler): def make_output(scheduler: Scheduler):
return ModelRunnerOutput( return ModelRunnerOutput(
req_ids=[req.request_id for req in scheduler.running], req_ids=[req.request_id for req in scheduler.running],
......
...@@ -112,13 +112,6 @@ def create_vllm_config( ...@@ -112,13 +112,6 @@ def create_vllm_config(
enable_chunked_prefill=enable_chunked_prefill, enable_chunked_prefill=enable_chunked_prefill,
is_encoder_decoder=model_config.is_encoder_decoder, is_encoder_decoder=model_config.is_encoder_decoder,
) )
scheduler_config = SchedulerConfig(
max_num_seqs=max_num_seqs,
max_num_batched_tokens=max_num_batched_tokens,
max_model_len=max_model_len,
enable_chunked_prefill=enable_chunked_prefill,
is_encoder_decoder=model_config.is_encoder_decoder,
)
# Cache config, optionally force APC # Cache config, optionally force APC
cache_config = CacheConfig( cache_config = CacheConfig(
block_size=block_size, block_size=block_size,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment