ModelZoo / Qwen_lmdeploy · Commits

Commit 68296844 (unverified)
Authored Aug 14, 2023 by Lyu Han; committed by GitHub, Aug 14, 2023
Fix TIS client got-no-space-result side effect brought by PR #197 (#222)
* rollback
* rollback chatbot.py
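Background, as the diff itself shows: PR #197 had the Triton Inference Server (TIS) client detokenize only the newly generated token ids of each streaming step (tracked by `Session.sequence_offset`). SentencePiece-style detokenizers strip the word-boundary space at the start of whatever chunk they are handed, so concatenating per-chunk results loses the spaces between words — the "got-no-space-result" side effect this commit rolls back. A minimal sketch of the failure mode; the `decode` below is a toy stand-in for a SentencePiece-style detokenizer, not lmdeploy code:

# Toy stand-in for a SentencePiece-style detokenizer: '\u2581' ("▁") marks a
# word boundary, and decoding strips the space it would emit at position 0.
def decode(pieces):
    text = ''.join(pieces).replace('\u2581', ' ')
    return text[1:] if text.startswith(' ') else text

tokens = ['\u2581Hello', '\u2581world']

# Incremental detokenization (the approach being rolled back): each new chunk
# is decoded on its own, so its leading boundary space is stripped.
response = decode(tokens[:1])   # 'Hello'
response += decode(tokens[1:])  # 'Helloworld'  <- space lost

# Restored approach: decode the whole generated sequence every step and take
# the increment by diffing against the previously decoded text.
prev = decode(tokens[:1])       # 'Hello'
text = decode(tokens)           # 'Hello world'
new_text = text[len(prev):]     # ' world'      <- space preserved

The rollback in chatbot.py below restores exactly this pre-#197 behavior: decode everything generated so far on every step, keep the full text in `session.response`, and derive the increment by string slicing.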
Parent: af517a4a
Changes: 2 changed files with 14 additions and 17 deletions

* lmdeploy/serve/turbomind/chatbot.py (+13 −16)
* lmdeploy/serve/turbomind/triton_models/postprocessing/config.pbtxt (+1 −1)
lmdeploy/serve/turbomind/chatbot.py
@@ -26,7 +26,6 @@ class Session:
     request_id: str = ''
     histories: str = ''  # history conversations of the session
     sequence_length: int = 0  # the total generated token number in the session
-    sequence_offset: int = 0  # the new generated token offset in the session
     prompt: str = ''
     response: str = ''
     status: int = None  # status of the session
@@ -599,15 +598,14 @@ class Chatbot:
         Yields:
             tuple: status, text, generated token number
         """
-        session.sequence_offset = n_input_token + preseq_length
-        sentinel = n_input_token + preseq_length
+        offset = n_input_token + preseq_length
         status, res, n_token = None, '', 0
         while True:
             result = res_queue.get()
             if result is None:
                 status = StatusCode.TRITON_STREAM_END
                 res = session.response
-                n_token = session.sequence_length - sentinel
+                n_token = session.sequence_length - offset
                 session.status = StatusCode.TRITON_STREAM_END
                 break
             if 'errcode' in result:
@@ -630,31 +628,30 @@ class Chatbot:
                 sequence_length = result.as_numpy('sequence_length')
                 output_ids = result.as_numpy('output_ids')
                 session.sequence_length = sequence_length.squeeze()
-                new_token_length = sequence_length - session.sequence_offset
+                sequence_length = sequence_length - offset
                 last_token_id = output_ids[-1][-1][session.sequence_length - 1]
                 if last_token_id == eos_id:
                     session.sequence_length = session.sequence_length - 1
-                    new_token_length = new_token_length - 1
+                    sequence_length = sequence_length - 1

                 output_ids = output_ids.reshape((1, 1, output_ids.shape[-1]))
-                new_token_length = new_token_length.reshape(
-                    (1, new_token_length.shape[-1]))
+                sequence_length = sequence_length.reshape(
+                    (1, sequence_length.shape[-1]))

                 if profile_generation:
                     yield (StatusCode.TRITON_STREAM_ING,
                            'postprocessing is ignored during profiling '
-                           'token generation', new_token_length.squeeze())
+                           'token generation', sequence_length.squeeze())
                     continue
-                output_str = postprocess(
-                    output_ids[:, :, session.sequence_offset:],
-                    new_token_length)
-                session.sequence_offset = session.sequence_length
+                output_str = postprocess(output_ids[:, :, offset:],
+                                         sequence_length)
                 text = output_str[0].decode()
                 if display:
-                    print(text, end='', flush=True)
-                session.response += text
+                    new_text = text[len(session.response):]
+                    print(new_text, end='', flush=True)
+                session.response = text
                 yield (StatusCode.TRITON_STREAM_ING, session.response,
-                       session.sequence_offset - sentinel)
+                       sequence_length.squeeze())
         except Exception as e:
             logger.error(f'catch exception: {e}')
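Net effect in chatbot.py: `Session.sequence_offset` is removed, `stream_consumer` re-decodes the full `output_ids[:, :, offset:]` slice on every iteration, the display path prints only the fresh suffix `text[len(session.response):]`, and the yielded token count reverts to `sequence_length.squeeze()`. Re-decoding from `offset` each step costs more postprocessing work, but whitespace is resolved over the whole generated sequence instead of per chunk.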
lmdeploy/serve/turbomind/triton_models/postprocessing/config.pbtxt
@@ -23,7 +23,7 @@ output [

 instance_group [
   {
-    count: 1
+    count: 16
     kind: KIND_CPU
   }
 ]
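In Triton's model configuration, raising `count` in `instance_group` from 1 to 16 makes the server run sixteen CPU instances of the postprocessing model in parallel; presumably this offsets the extra cost of re-detokenizing the full sequence on every streaming step, which the chatbot.py rollback reinstates.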