"tests/cpp/socket_communicator_test.cc" did not exist on "7bdc16197f7fe7890fc2b91d020744a82a024984"
Unverified Commit 7e831efe authored by Xinyuan Tong, committed by GitHub
Browse files

Fix chat template handling for OpenAI serving (#8635)


Signed-off-by: Xinyuan Tong <justinning0323@outlook.com>
Signed-off-by: Xinyuan Tong <xinyuantong.cs@gmail.com>
parent 20b5563e
...@@ -954,20 +954,6 @@ register_conv_template( ...@@ -954,20 +954,6 @@ register_conv_template(
) )
) )
register_conv_template(
Conversation(
name="mimo-vl",
system_message="You are MiMo, an AI assistant developed by Xiaomi.",
system_template="<|im_start|>system\n{system_message}",
roles=("<|im_start|>user", "<|im_start|>assistant"),
sep="<|im_end|>\n",
sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
stop_str=["<|im_end|>"],
image_token="<|vision_start|><|image_pad|><|vision_end|>",
)
)
register_conv_template( register_conv_template(
Conversation( Conversation(
name="qwen2-audio", name="qwen2-audio",
...@@ -981,51 +967,11 @@ register_conv_template( ...@@ -981,51 +967,11 @@ register_conv_template(
) )
) )
register_conv_template(
Conversation(
name="llama_4_vision",
system_message="You are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.",
system_template="<|header_start|>system<|header_end|>\n\n{system_message}<|eot|>",
roles=("user", "assistant"),
sep_style=SeparatorStyle.LLAMA4,
sep="",
stop_str="<|eot|>",
image_token="<|image|>",
)
)
register_conv_template(
Conversation(
name="step3-vl",
system_message="<|begin▁of▁sentence|>You are a helpful assistant",
system_template="{system_message}\n",
roles=(
"<|BOT|>user\n",
"<|BOT|>assistant\n<think>\n",
),
sep="<|EOT|>",
sep_style=SeparatorStyle.NO_COLON_SINGLE,
stop_str="<|EOT|>",
image_token="<im_patch>",
# add_bos=True,
)
)
@register_conv_template_matching_function @register_conv_template_matching_function
def match_internvl(model_path: str): def match_internvl(model_path: str):
if re.search(r"internvl", model_path, re.IGNORECASE): if re.search(r"internvl", model_path, re.IGNORECASE):
return "internvl-2-5" return "internvl-2-5"
if re.search(r"intern.*s1", model_path, re.IGNORECASE):
return "interns1"
@register_conv_template_matching_function
def match_llama_vision(model_path: str):
if re.search(r"llama.*3\.2.*vision", model_path, re.IGNORECASE):
return "llama_3_vision"
if re.search(r"llama.*4.*", model_path, re.IGNORECASE):
return "llama_4_vision"
@register_conv_template_matching_function @register_conv_template_matching_function
...@@ -1040,22 +986,6 @@ def match_vicuna(model_path: str): ...@@ -1040,22 +986,6 @@ def match_vicuna(model_path: str):
return "vicuna_v1.1" return "vicuna_v1.1"
@register_conv_template_matching_function
def match_llama2_chat(model_path: str):
if re.search(
r"llama-2.*chat|codellama.*instruct",
model_path,
re.IGNORECASE,
):
return "llama-2"
@register_conv_template_matching_function
def match_mistral(model_path: str):
if re.search(r"pixtral|(mistral|mixtral).*instruct", model_path, re.IGNORECASE):
return "mistral"
@register_conv_template_matching_function @register_conv_template_matching_function
def match_deepseek_vl(model_path: str): def match_deepseek_vl(model_path: str):
if re.search(r"deepseek.*vl2", model_path, re.IGNORECASE): if re.search(r"deepseek.*vl2", model_path, re.IGNORECASE):
...@@ -1064,12 +994,6 @@ def match_deepseek_vl(model_path: str): ...@@ -1064,12 +994,6 @@ def match_deepseek_vl(model_path: str):
@register_conv_template_matching_function @register_conv_template_matching_function
def match_qwen_chat_ml(model_path: str): def match_qwen_chat_ml(model_path: str):
if re.search(r"gme.*qwen.*vl", model_path, re.IGNORECASE):
return "gme-qwen2-vl"
if re.search(r"qwen.*vl", model_path, re.IGNORECASE):
return "qwen2-vl"
if re.search(r"qwen.*audio", model_path, re.IGNORECASE):
return "qwen2-audio"
if re.search( if re.search(
r"llava-v1\.6-34b|llava-v1\.6-yi-34b|llava-next-video-34b|llava-onevision-qwen2", r"llava-v1\.6-34b|llava-v1\.6-yi-34b|llava-next-video-34b|llava-onevision-qwen2",
model_path, model_path,
...@@ -1078,12 +1002,6 @@ def match_qwen_chat_ml(model_path: str): ...@@ -1078,12 +1002,6 @@ def match_qwen_chat_ml(model_path: str):
return "chatml-llava" return "chatml-llava"
@register_conv_template_matching_function
def match_gemma3_instruct(model_path: str):
if re.search(r"gemma-3.*it", model_path, re.IGNORECASE):
return "gemma-it"
@register_conv_template_matching_function @register_conv_template_matching_function
def match_openbmb_minicpm(model_path: str): def match_openbmb_minicpm(model_path: str):
if re.search(r"minicpm-v", model_path, re.IGNORECASE): if re.search(r"minicpm-v", model_path, re.IGNORECASE):
...@@ -1092,37 +1010,7 @@ def match_openbmb_minicpm(model_path: str): ...@@ -1092,37 +1010,7 @@ def match_openbmb_minicpm(model_path: str):
return "minicpmo" return "minicpmo"
@register_conv_template_matching_function
def match_moonshot_kimivl(model_path: str):
if re.search(r"kimi.*vl", model_path, re.IGNORECASE):
return "kimi-vl"
@register_conv_template_matching_function
def match_devstral(model_path: str):
if re.search(r"devstral", model_path, re.IGNORECASE):
return "devstral"
@register_conv_template_matching_function @register_conv_template_matching_function
def match_phi_4_mm(model_path: str): def match_phi_4_mm(model_path: str):
if "phi-4-multimodal" in model_path.lower(): if "phi-4-multimodal" in model_path.lower():
return "phi-4-mm" return "phi-4-mm"
@register_conv_template_matching_function
def match_vila(model_path: str):
if re.search(r"vila", model_path, re.IGNORECASE):
return "chatml"
@register_conv_template_matching_function
def match_mimo_vl(model_path: str):
if re.search(r"mimo.*vl", model_path, re.IGNORECASE):
return "mimo-vl"
# @register_conv_template_matching_function
# def match_step3(model_path: str):
# if re.search(r"step3", model_path, re.IGNORECASE):
# return "step3-vl"
...@@ -84,26 +84,27 @@ class TemplateManager: ...@@ -84,26 +84,27 @@ class TemplateManager:
if chat_template_arg: if chat_template_arg:
self._load_explicit_chat_template(tokenizer_manager, chat_template_arg) self._load_explicit_chat_template(tokenizer_manager, chat_template_arg)
else: else:
# Try HuggingFace template first # Guess chat template from model path
hf_template = self._resolve_hf_chat_template(tokenizer_manager)
if hf_template:
self._jinja_template_content_format = (
detect_jinja_template_content_format(hf_template)
)
logger.info(
f"Using default HuggingFace chat template with detected content format: {self._jinja_template_content_format}"
)
return
# Fallback to SGLang template guessing
self.guess_chat_template_from_model_path(model_path) self.guess_chat_template_from_model_path(model_path)
# Set default format if no template was found # If no pre-defined template was found, fallback to HuggingFace template
if self._chat_template_name is None: if self._chat_template_name is None:
self._jinja_template_content_format = "string" # Try HuggingFace template first
logger.info( hf_template = self._resolve_hf_chat_template(tokenizer_manager)
"No chat template found, defaulting to 'string' content format" if hf_template:
) # override the chat template
tokenizer_manager.tokenizer.chat_template = hf_template
self._jinja_template_content_format = (
detect_jinja_template_content_format(hf_template)
)
logger.info(
f"Using default HuggingFace chat template with detected content format: {self._jinja_template_content_format}"
)
return
# Default to string content format if no template was found
self._jinja_template_content_format = "string"
logger.info("No chat template found, defaulting to 'string' content format")
def _load_explicit_chat_template( def _load_explicit_chat_template(
self, tokenizer_manager, chat_template_arg: str self, tokenizer_manager, chat_template_arg: str
...@@ -257,13 +258,15 @@ class TemplateManager: ...@@ -257,13 +258,15 @@ class TemplateManager:
Returns the chat template string if found, None otherwise. Returns the chat template string if found, None otherwise.
""" """
tokenizer = tokenizer_manager.tokenizer
# Try to get AutoTokenizer chat template
try: try:
return tokenizer.get_chat_template() if processor := tokenizer_manager.processor:
if hasattr(processor, "chat_template") and processor.chat_template:
return processor.chat_template
if tokenizer := tokenizer_manager.tokenizer:
if hasattr(tokenizer, "chat_template") and tokenizer.chat_template:
return tokenizer.chat_template
except Exception as e: except Exception as e:
logger.debug(f"Error getting chat template via get_chat_template(): {e}") logger.debug(f"Error getting chat template: {e}")
logger.debug("No HuggingFace chat template found") logger.debug("No HuggingFace chat template found")
return None return None
...@@ -225,10 +225,10 @@ class TokenizerManager: ...@@ -225,10 +225,10 @@ class TokenizerManager:
self.tokenizer = get_tokenizer_from_processor(self.processor) self.tokenizer = get_tokenizer_from_processor(self.processor)
os.environ["TOKENIZERS_PARALLELISM"] = "false" os.environ["TOKENIZERS_PARALLELISM"] = "false"
else: else:
self.mm_processor = None self.mm_processor = self.processor = None
if server_args.skip_tokenizer_init: if server_args.skip_tokenizer_init:
self.tokenizer = self.processor = None self.tokenizer = None
else: else:
self.tokenizer = get_tokenizer( self.tokenizer = get_tokenizer(
server_args.tokenizer_path, server_args.tokenizer_path,
......
...@@ -12,7 +12,6 @@ ...@@ -12,7 +12,6 @@
# limitations under the License. # limitations under the License.
# ============================================================================== # ==============================================================================
import re
from typing import Dict, List, Optional, Union from typing import Dict, List, Optional, Union
from sglang.srt.managers.multimodal_processor import ( from sglang.srt.managers.multimodal_processor import (
...@@ -38,14 +37,8 @@ class Gemma3nSGLangProcessor(SGLangBaseProcessor): ...@@ -38,14 +37,8 @@ class Gemma3nSGLangProcessor(SGLangBaseProcessor):
self.mm_tokens = MultimodalSpecialTokens( self.mm_tokens = MultimodalSpecialTokens(
image_token="<image_soft_token>", image_token="<image_soft_token>",
image_token_id=hf_config.image_token_id, image_token_id=hf_config.image_token_id,
image_token_regex=re.compile(
r"<start_of_image>(?:(?:<image_soft_token>)*<end_of_image>)?"
),
audio_token="<audio_soft_token>", audio_token="<audio_soft_token>",
audio_token_id=hf_config.audio_token_id, audio_token_id=hf_config.audio_token_id,
audio_token_regex=re.compile(
r"<start_of_audio>(?:(?:<audio_soft_token>)*<end_of_audio>)?"
),
).build(_processor) ).build(_processor)
async def process_mm_data_async( async def process_mm_data_async(
......
...@@ -31,6 +31,8 @@ class TestQwen2VLServer(TestOpenAIVisionServer): ...@@ -31,6 +31,8 @@ class TestQwen2VLServer(TestOpenAIVisionServer):
other_args=[ other_args=[
"--mem-fraction-static", "--mem-fraction-static",
"0.35", "0.35",
"--cuda-graph-max-bs",
"4",
], ],
) )
cls.base_url += "/v1" cls.base_url += "/v1"
...@@ -53,6 +55,8 @@ class TestQwen2_5_VLServer(TestOpenAIVisionServer): ...@@ -53,6 +55,8 @@ class TestQwen2_5_VLServer(TestOpenAIVisionServer):
other_args=[ other_args=[
"--mem-fraction-static", "--mem-fraction-static",
"0.35", "0.35",
"--cuda-graph-max-bs",
"4",
], ],
) )
cls.base_url += "/v1" cls.base_url += "/v1"
...@@ -76,6 +80,8 @@ class TestVLMContextLengthIssue(CustomTestCase): ...@@ -76,6 +80,8 @@ class TestVLMContextLengthIssue(CustomTestCase):
"--context-length", "--context-length",
"300", "300",
"--mem-fraction-static=0.75", "--mem-fraction-static=0.75",
"--cuda-graph-max-bs",
"4",
], ],
) )
cls.base_url += "/v1" cls.base_url += "/v1"
...@@ -149,6 +155,8 @@ class TestMinicpmvServer(TestOpenAIVisionServer): ...@@ -149,6 +155,8 @@ class TestMinicpmvServer(TestOpenAIVisionServer):
"--trust-remote-code", "--trust-remote-code",
"--mem-fraction-static", "--mem-fraction-static",
"0.35", "0.35",
"--cuda-graph-max-bs",
"4",
], ],
) )
cls.base_url += "/v1" cls.base_url += "/v1"
...@@ -164,7 +172,11 @@ class TestInternVL2_5Server(TestOpenAIVisionServer): ...@@ -164,7 +172,11 @@ class TestInternVL2_5Server(TestOpenAIVisionServer):
cls.model, cls.model,
cls.base_url, cls.base_url,
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
other_args=["--trust-remote-code"], other_args=[
"--trust-remote-code",
"--cuda-graph-max-bs",
"4",
],
) )
cls.base_url += "/v1" cls.base_url += "/v1"
...@@ -183,6 +195,8 @@ class TestMinicpmoServer(TestOpenAIVisionServer): ...@@ -183,6 +195,8 @@ class TestMinicpmoServer(TestOpenAIVisionServer):
"--trust-remote-code", "--trust-remote-code",
"--mem-fraction-static", "--mem-fraction-static",
"0.65", "0.65",
"--cuda-graph-max-bs",
"4",
], ],
) )
cls.base_url += "/v1" cls.base_url += "/v1"
...@@ -207,10 +221,13 @@ class TestMimoVLServer(TestOpenAIVisionServer): ...@@ -207,10 +221,13 @@ class TestMimoVLServer(TestOpenAIVisionServer):
"--trust-remote-code", "--trust-remote-code",
"--mem-fraction-static", "--mem-fraction-static",
"0.6", "0.6",
"--cuda-graph-max-bs",
"4",
], ],
) )
cls.base_url += "/v1" cls.base_url += "/v1"
if __name__ == "__main__": if __name__ == "__main__":
del TestOpenAIVisionServer
unittest.main() unittest.main()
...@@ -23,6 +23,8 @@ class TestPixtralServer(TestOpenAIVisionServer): ...@@ -23,6 +23,8 @@ class TestPixtralServer(TestOpenAIVisionServer):
"--trust-remote-code", "--trust-remote-code",
"--mem-fraction-static", "--mem-fraction-static",
"0.70", "0.70",
"--cuda-graph-max-bs",
"4",
], ],
) )
cls.base_url += "/v1" cls.base_url += "/v1"
...@@ -45,6 +47,8 @@ class TestMistral3_1Server(TestOpenAIVisionServer): ...@@ -45,6 +47,8 @@ class TestMistral3_1Server(TestOpenAIVisionServer):
"--trust-remote-code", "--trust-remote-code",
"--mem-fraction-static", "--mem-fraction-static",
"0.75", "0.75",
"--cuda-graph-max-bs",
"4",
], ],
) )
cls.base_url += "/v1" cls.base_url += "/v1"
...@@ -67,7 +71,8 @@ class TestDeepseekVL2Server(TestOpenAIVisionServer): ...@@ -67,7 +71,8 @@ class TestDeepseekVL2Server(TestOpenAIVisionServer):
"--trust-remote-code", "--trust-remote-code",
"--context-length", "--context-length",
"4096", "4096",
"--disable-cuda-graph", "--cuda-graph-max-bs",
"4",
], ],
) )
cls.base_url += "/v1" cls.base_url += "/v1"
...@@ -90,6 +95,8 @@ class TestJanusProServer(TestOpenAIVisionServer): ...@@ -90,6 +95,8 @@ class TestJanusProServer(TestOpenAIVisionServer):
"--trust-remote-code", "--trust-remote-code",
"--mem-fraction-static", "--mem-fraction-static",
"0.35", "0.35",
"--cuda-graph-max-bs",
"4",
], ],
) )
cls.base_url += "/v1" cls.base_url += "/v1"
...@@ -120,6 +127,10 @@ class TestJanusProServer(TestOpenAIVisionServer): ...@@ -120,6 +127,10 @@ class TestJanusProServer(TestOpenAIVisionServer):
# "0.8", # "0.8",
# "--tp-size=8", # "--tp-size=8",
# "--context-length=8192", # "--context-length=8192",
# "--mm-attention-backend",
# "fa3",
# "--cuda-graph-max-bs",
# "4",
# ], # ],
# ) # )
# cls.base_url += "/v1" # cls.base_url += "/v1"
...@@ -143,6 +154,8 @@ class TestGemma3itServer(TestOpenAIVisionServer): ...@@ -143,6 +154,8 @@ class TestGemma3itServer(TestOpenAIVisionServer):
"--mem-fraction-static", "--mem-fraction-static",
"0.70", "0.70",
"--enable-multimodal", "--enable-multimodal",
"--cuda-graph-max-bs",
"4",
], ],
) )
cls.base_url += "/v1" cls.base_url += "/v1"
...@@ -154,7 +167,7 @@ class TestGemma3itServer(TestOpenAIVisionServer): ...@@ -154,7 +167,7 @@ class TestGemma3itServer(TestOpenAIVisionServer):
class TestGemma3nServer(TestOpenAIVisionServer): class TestGemma3nServer(TestOpenAIVisionServer):
@classmethod @classmethod
def setUpClass(cls): def setUpClass(cls):
cls.model = "google/gemma-3n-E2B-it" cls.model = "google/gemma-3n-E4B-it"
cls.base_url = DEFAULT_URL_FOR_TEST cls.base_url = DEFAULT_URL_FOR_TEST
cls.api_key = "sk-123456" cls.api_key = "sk-123456"
cls.process = popen_launch_server( cls.process = popen_launch_server(
...@@ -166,7 +179,7 @@ class TestGemma3nServer(TestOpenAIVisionServer): ...@@ -166,7 +179,7 @@ class TestGemma3nServer(TestOpenAIVisionServer):
"--mem-fraction-static", "--mem-fraction-static",
"0.70", "0.70",
"--cuda-graph-max-bs", "--cuda-graph-max-bs",
"1", "4",
], ],
) )
cls.base_url += "/v1" cls.base_url += "/v1"
...@@ -193,6 +206,8 @@ class TestKimiVLServer(TestOpenAIVisionServer): ...@@ -193,6 +206,8 @@ class TestKimiVLServer(TestOpenAIVisionServer):
"4096", "4096",
"--dtype", "--dtype",
"bfloat16", "bfloat16",
"--cuda-graph-max-bs",
"4",
], ],
) )
cls.base_url += "/v1" cls.base_url += "/v1"
...@@ -233,6 +248,8 @@ class TestPhi4MMServer(TestOpenAIVisionServer): ...@@ -233,6 +248,8 @@ class TestPhi4MMServer(TestOpenAIVisionServer):
"--lora-paths", "--lora-paths",
f"vision={constants.HF_HUB_CACHE}/models--microsoft--Phi-4-multimodal-instruct/snapshots/{revision}/vision-lora", f"vision={constants.HF_HUB_CACHE}/models--microsoft--Phi-4-multimodal-instruct/snapshots/{revision}/vision-lora",
f"speech={constants.HF_HUB_CACHE}/models--microsoft--Phi-4-multimodal-instruct/snapshots/{revision}/speech-lora", f"speech={constants.HF_HUB_CACHE}/models--microsoft--Phi-4-multimodal-instruct/snapshots/{revision}/speech-lora",
"--cuda-graph-max-bs",
"4",
], ],
) )
cls.base_url += "/v1" cls.base_url += "/v1"
...@@ -277,10 +294,13 @@ class TestVILAServer(TestOpenAIVisionServer): ...@@ -277,10 +294,13 @@ class TestVILAServer(TestOpenAIVisionServer):
"--trust-remote-code", "--trust-remote-code",
"--context-length=65536", "--context-length=65536",
f"--revision={cls.revision}", f"--revision={cls.revision}",
"--cuda-graph-max-bs",
"4",
], ],
) )
cls.base_url += "/v1" cls.base_url += "/v1"
if __name__ == "__main__": if __name__ == "__main__":
del TestOpenAIVisionServer
unittest.main() unittest.main()
...@@ -71,7 +71,7 @@ class TestOpenAIVisionServer(CustomTestCase): ...@@ -71,7 +71,7 @@ class TestOpenAIVisionServer(CustomTestCase):
}, },
{ {
"type": "text", "type": "text",
"text": "Describe this image in a very short sentence.", "text": "Describe this image in a sentence.",
}, },
], ],
}, },
...@@ -119,7 +119,7 @@ class TestOpenAIVisionServer(CustomTestCase): ...@@ -119,7 +119,7 @@ class TestOpenAIVisionServer(CustomTestCase):
}, },
{ {
"type": "text", "type": "text",
"text": "Describe this image in a very short sentence.", "text": "Describe this image in a sentence.",
}, },
], ],
}, },
...@@ -455,7 +455,7 @@ class TestOpenAIVisionServer(CustomTestCase): ...@@ -455,7 +455,7 @@ class TestOpenAIVisionServer(CustomTestCase):
content.append( content.append(
{ {
"type": "text", "type": "text",
"text": "Describe this image in a very short sentence.", "text": "Describe this image in a sentence.",
} }
) )
...@@ -528,14 +528,20 @@ class TestOpenAIVisionServer(CustomTestCase): ...@@ -528,14 +528,20 @@ class TestOpenAIVisionServer(CustomTestCase):
# a fragment of Trump's speech # a fragment of Trump's speech
audio_response = self.get_audio_response( audio_response = self.get_audio_response(
AUDIO_TRUMP_SPEECH_URL, AUDIO_TRUMP_SPEECH_URL,
"I have an audio sample. Please repeat the person's words", "Listen to this audio and write down the audio transcription in English.",
category="speech", category="speech",
) )
assert "thank you" in audio_response check_list = [
assert "it's a privilege to be here" in audio_response "thank you",
assert "leader" in audio_response "it's a privilege to be here",
assert "science" in audio_response "leader",
assert "art" in audio_response "science",
"art",
]
for check_word in check_list:
assert (
check_word in audio_response
), f"audio_response: |{audio_response}| should contain |{check_word}|"
def _test_audio_ambient_completion(self): def _test_audio_ambient_completion(self):
# bird song # bird song
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment