resolve comments

Signed-off-by: Ceng23333 <441651826@qq.com>

resolve comments
Signed-off-by: Ceng23333 <441651826@qq.com>
505f8f66 · Ceng23333 · 1be6559f · 505f8f66 · 505f8f66
Commit 505f8f66 authored Feb 04, 2026 by Ceng23333
Hide whitespace changes
Inline Side-by-side

Showing with 10 additions and 3 deletions

python/infinilm/llm/llm.py python/infinilm/llm/llm.py +10 -1

python/infinilm/server/inference_server.py python/infinilm/server/inference_server.py +0 -2

No files found.
--- a/python/infinilm/llm/llm.py
+++ b/python/infinilm/llm/llm.py
@@ -251,7 +251,10 @@ class LLMEngine:
                )
                # vLLM-style: hold back only if we are not on the final chunk.
-                if holds_back_incomplete_utf8 and not finished_now:
+                # Also suppress output when the finish reason is LENGTH, to avoid emitting replacement characters (presumably U+FFFD from a UTF-8 sequence truncated at the length limit).
+                if (holds_back_incomplete_utf8 and not finished_now) or (
+                    finished_now and req.finish_reason == FinishReason.LENGTH
+                ):
                    token_text = ""
                else:
                    last_len = getattr(req, "_stream_last_yielded_length", 0)
@@ -297,6 +300,12 @@ class LLMEngine:
                req.finish_reason = FinishReason.STOP_STRING
                return True
+        # Check stop token IDs
+        stop_token_ids = req.sampling_params.stop_token_ids or []
+        if stop_token_ids and token_id in stop_token_ids:
+            req.finish_reason = FinishReason.STOP_STRING
+            return True
        return False
    def tokenize(self, text: str) -> List[int]:

--- a/python/infinilm/server/inference_server.py
+++ b/python/infinilm/server/inference_server.py
@@ -249,7 +249,6 @@ class InferenceServer:
    async def _stream_chat(self, request_id: str, data: dict, http_request: Request):
        """Handle streaming chat request."""
        req = None
-        start_time = time.time()
        try:
            messages = data.get("messages", [])
@@ -348,7 +347,6 @@ class InferenceServer:
    async def _chat(self, request_id: str, data: dict, http_request: Request):
        """Handle non-streaming chat request."""
        req = None
-        start_time = time.time()
        try:
            messages = data.get("messages", [])