"vscode:/vscode.git/clone" did not exist on "1c710c195a19970d9683ed94ea32d538a49a5177"
Commit 505f8f66 authored by Ceng23333
Browse files

resolve comments


Signed-off-by: Ceng23333 <441651826@qq.com>
parent 1be6559f
...@@ -251,7 +251,10 @@ class LLMEngine: ...@@ -251,7 +251,10 @@ class LLMEngine:
) )
# vLLM-style: hold back only if we are not on the final chunk. # vLLM-style: hold back only if we are not on the final chunk.
if holds_back_incomplete_utf8 and not finished_now: # Also suppress output when finish reason is LENGTH to avoid replacement issues.
if (holds_back_incomplete_utf8 and not finished_now) or (
finished_now and req.finish_reason == FinishReason.LENGTH
):
token_text = "" token_text = ""
else: else:
last_len = getattr(req, "_stream_last_yielded_length", 0) last_len = getattr(req, "_stream_last_yielded_length", 0)
...@@ -297,6 +300,12 @@ class LLMEngine: ...@@ -297,6 +300,12 @@ class LLMEngine:
req.finish_reason = FinishReason.STOP_STRING req.finish_reason = FinishReason.STOP_STRING
return True return True
# Check stop token IDs
stop_token_ids = req.sampling_params.stop_token_ids or []
if stop_token_ids and token_id in stop_token_ids:
req.finish_reason = FinishReason.STOP_STRING
return True
return False return False
def tokenize(self, text: str) -> List[int]: def tokenize(self, text: str) -> List[int]:
......
...@@ -249,7 +249,6 @@ class InferenceServer: ...@@ -249,7 +249,6 @@ class InferenceServer:
async def _stream_chat(self, request_id: str, data: dict, http_request: Request): async def _stream_chat(self, request_id: str, data: dict, http_request: Request):
"""Handle streaming chat request.""" """Handle streaming chat request."""
req = None req = None
start_time = time.time()
try: try:
messages = data.get("messages", []) messages = data.get("messages", [])
...@@ -348,7 +347,6 @@ class InferenceServer: ...@@ -348,7 +347,6 @@ class InferenceServer:
async def _chat(self, request_id: str, data: dict, http_request: Request): async def _chat(self, request_id: str, data: dict, http_request: Request):
"""Handle non-streaming chat request.""" """Handle non-streaming chat request."""
req = None req = None
start_time = time.time()
try: try:
messages = data.get("messages", []) messages = data.get("messages", [])
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment