Unverified Commit 7711ac6e authored by Mick, committed by GitHub

doc: emphasize and notify the usage of chat_template (#3589)


Co-authored-by: Chayenne <zhaochen20@outlook.com>
parent 7443197a
......@@ -24,7 +24,8 @@
"\n",
"Launch the server in your terminal and wait for it to initialize.\n",
"\n",
"Remember to add `--chat-template llama_3_vision` to specify the vision chat template, otherwise the server only supports text.\n",
"**Remember to add `--chat-template llama_3_vision` to specify the vision chat template, otherwise the server only supports text, and performance degradation may occur.**\n",
"\n",
"We need to specify `--chat-template` for vision language models because the chat template provided in Hugging Face tokenizer only supports text."
]
},
......
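For reference, a minimal sketch of launching the server with the vision chat template from Python. The model path below is only an illustrative assumption, not part of this commit:

```python
import subprocess

# Illustrative model path; substitute the vision checkpoint you actually serve.
model_path = "meta-llama/Llama-3.2-11B-Vision-Instruct"

# Without --chat-template llama_3_vision the server falls back to the
# text-only template from the Hugging Face tokenizer.
server = subprocess.Popen(
    [
        "python", "-m", "sglang.launch_server",
        "--model-path", model_path,
        "--chat-template", "llama_3_vision",
        "--port", "30000",
    ]
)
```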
......@@ -56,6 +56,9 @@ Please consult the documentation below to learn more about the parameters you ma
* `json_model_override_args`: Override model config with the provided JSON.
* `delete_ckpt_after_loading`: Delete the model checkpoint after loading the model.
> [!IMPORTANT]
> **Make sure the correct `chat_template` is passed, or performance degradation may occur.**
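To see which template SGLang would match for your model before launching, here is a small sketch using the `get_chat_template_by_model_path` helper referenced later in this commit; the model path is only an example:

```python
from sglang.lang.chat_template import get_chat_template_by_model_path

# Ask SGLang's matchers which built-in chat template they associate with a
# model path, then pass that name via --chat-template when launching.
template = get_chat_template_by_model_path("Qwen/Qwen2-VL-7B-Instruct")
if template is not None:
    print(template.name)  # e.g. "qwen2-vl" for Qwen2-VL checkpoints
```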
## Serving: HTTP & API
### HTTP Server configuration
......
......@@ -353,7 +353,6 @@ register_chat_template(
)
)
register_chat_template(
ChatTemplate(
name="deepseek-v3",
......@@ -428,12 +427,13 @@ def match_chat_ml(model_path: str):
if "tinyllama" in model_path:
return get_chat_template("chatml")
# Now the suffix for qwen2 chat model is "instruct"
if (
"qwen" in model_path
and ("chat" in model_path or "instruct" in model_path)
and ("llava" not in model_path)
):
return get_chat_template("qwen")
if "qwen" in model_path:
if "vl" in model_path:
return get_chat_template("qwen2-vl")
if ("chat" in model_path or "instruct" in model_path) and (
"llava" not in model_path
):
return get_chat_template("qwen")
if (
"llava-v1.6-34b" in model_path
or "llava-v1.6-yi-34b" in model_path
......@@ -459,6 +459,13 @@ def match_gemma_it(model_path: str):
return get_chat_template("gemma-it")
@register_chat_template_matching_function
def match_openbmb_minicpm(model_path: str):
model_path = model_path.lower()
if "minicpm" in model_path:
return get_chat_template("minicpmv")
@register_chat_template_matching_function
def match_c4ai_command_r(model_path: str):
model_path = model_path.lower()
......
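Custom models can be wired into the same matching mechanism. A hedged sketch following the pattern of `match_openbmb_minicpm` above; the matcher name and the "my-model" substring are hypothetical:

```python
from sglang.lang.chat_template import (
    get_chat_template,
    register_chat_template_matching_function,
)


# Hypothetical matcher: map any model path containing "my-model"
# to an already registered chat template.
@register_chat_template_matching_function
def match_my_model(model_path: str):
    model_path = model_path.lower()
    if "my-model" in model_path:
        return get_chat_template("chatml")
```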
......@@ -438,7 +438,9 @@ def _launch_subprocesses(server_args: ServerArgs) -> Tuple[TokenizerManager, Dic
# Launch tokenizer process
tokenizer_manager = TokenizerManager(server_args, port_args)
if server_args.chat_template:
load_chat_template_for_openai_api(tokenizer_manager, server_args.chat_template)
load_chat_template_for_openai_api(
tokenizer_manager, server_args.chat_template, server_args.model_path
)
# Wait for the model to finish loading
scheduler_infos = []
......
......@@ -449,7 +449,8 @@ class LlavaBaseForCausalLM(nn.Module):
projector_weights = {
"model.mm_projector.0": "multi_modal_projector.linear_1",
"model.mm_projector.2": "multi_modal_projector.linear_2",
"model.vision_tower.vision_tower": "vision_tower", # Update the vision tower weights if we find them in the checkpoint (it may be finetuned).
"model.vision_tower.vision_tower": "vision_tower",
# Update the vision tower weights if we find them in the checkpoint (it may be finetuned).
"model.image_newline": "language_model.model.image_newline",
}
params_dict = dict(self.named_parameters())
......
......@@ -20,12 +20,14 @@ import os
import time
import uuid
from http import HTTPStatus
from typing import Dict, List, Optional
from typing import Dict, List
from fastapi import HTTPException, Request, UploadFile
from fastapi.responses import ORJSONResponse, StreamingResponse
from pydantic import ValidationError
from sglang.lang.chat_template import get_chat_template_by_model_path
try:
from outlines.fsm.json_schema import convert_json_schema_to_str
except ImportError:
......@@ -92,7 +94,6 @@ file_id_response: Dict[str, FileResponse] = {}
# map file id to file path in SGLang backend
file_id_storage: Dict[str, str] = {}
# backend storage directory
storage_dir = None
......@@ -116,12 +117,13 @@ def create_streaming_error_response(
return json_str
def load_chat_template_for_openai_api(tokenizer_manager, chat_template_arg):
def load_chat_template_for_openai_api(tokenizer_manager, chat_template_arg, model_path):
global chat_template_name
logger.info(
f"Use chat template for the OpenAI-compatible API server: {chat_template_arg}"
)
if not chat_template_exists(chat_template_arg):
if not os.path.exists(chat_template_arg):
raise RuntimeError(
......@@ -163,6 +165,18 @@ def load_chat_template_for_openai_api(tokenizer_manager, chat_template_arg):
else:
chat_template_name = chat_template_arg
# Check that the chat template passed by the user matches the model's official template
chat_template = get_chat_template_by_model_path(model_path)
if chat_template is not None:
official_chat_template = chat_template.name
used_chat_template = chat_template_name
if official_chat_template != used_chat_template:
logger.warning(
f"Using a chat_template: '{used_chat_template}', "
f"which is different from official chat template: '{official_chat_template}', "
f"This discrepancy may lead to performance degradation."
)
async def v1_files_create(file: UploadFile, purpose: str, file_storage_pth: str = None):
try:
......
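For clarity, the mismatch check added above can be read as a standalone helper. This sketch only restates the logic of the diff; the function name is hypothetical:

```python
import logging

from sglang.lang.chat_template import get_chat_template_by_model_path

logger = logging.getLogger(__name__)


def warn_on_chat_template_mismatch(used_chat_template: str, model_path: str) -> None:
    # Compare the template the user passed with the one SGLang's matchers
    # would pick for this model path, and warn on any mismatch.
    matched = get_chat_template_by_model_path(model_path)
    if matched is not None and matched.name != used_chat_template:
        logger.warning(
            "Using a chat template '%s', which is different from the official "
            "chat template '%s'. This discrepancy may lead to performance degradation.",
            used_chat_template,
            matched.name,
        )
```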