Unverified commit 66bfd282, authored by thatPepe, committed by GitHub
Browse files

Merge pull request #266 from InfiniTensor/issue/265

issue/265 perf(llm): replace O(n²) full-sequence detokenize with incremental detokenization
parents 5fb56f97 8f1dc2a4
......@@ -269,11 +269,15 @@ class LLMEngine:
req.is_prefill = False
req.generated_token_ids.append(token_id)
decoded_text = self.detokenize(req.generated_token_ids)
req.generated_text = decoded_text
holds_back_incomplete_utf8 = bool(decoded_text) and decoded_text.endswith(
"\ufffd"
)
pending_tokens = req.generated_token_ids[req._pending_token_offset :]
delta = self.tokenizer.decode(pending_tokens)
holds_back = bool(delta) and delta.endswith("\ufffd")
last_committed_text = req.generated_text
if not holds_back:
req.generated_text = last_committed_text + delta
req._pending_token_offset = len(req.generated_token_ids)
is_finished = self._check_request_finished(req, token_id)
......@@ -281,25 +285,28 @@ class LLMEngine:
# For offline generation (no output queue), keep the fast incremental path.
if req._output_queue is None:
if is_finished:
if holds_back_incomplete_utf8:
req.generated_text = decoded_text[:-1]
req.mark_finished(req.finish_reason)
else:
if (holds_back_incomplete_utf8 and not is_finished) or (
is_finished
and req.finish_reason
in (FinishReason.LENGTH, FinishReason.STOP_STRING)
if holds_back and not is_finished:
token_text = ""
else:
if is_finished and req.finish_reason in (
FinishReason.EOS_TOKEN,
FinishReason.LENGTH,
FinishReason.STOP_STRING,
):
token_text = ""
else:
last_len = getattr(req, "_stream_last_yielded_length", 0)
token_text = decoded_text[last_len:]
token_text = req.generated_text[
req._stream_last_yielded_length :
]
if token_text:
req._stream_last_yielded_length = len(decoded_text)
req._stream_last_yielded_length = len(req.generated_text)
if is_finished:
req.mark_finished(req.finish_reason)
output = TokenOutput(
request_id=req.request_id,
token_id=token_id,
......
......@@ -152,6 +152,7 @@ class InferenceRequest:
# Streaming helpers (vLLM-style UTF-8 buffering at the chunking layer)
# Used by the engine to compute "delta" text chunks from a full decode.
self._stream_last_yielded_length: int = 0
self._pending_token_offset: int = 0
@property
def output_queue(self) -> janus.Queue:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.