"git@developer.sourcefind.cn:one/TransferBench.git" did not exist on "bbd72a6c979d387a9c5670c3dcbb157434f5cb5b"
Unverified Commit ffe4ba9c authored by Chen Xin, committed by GitHub

Fix crash and remove `sys_instruct` from `chat.py` and `client.py` (#591)

* fix crash

* update profile_generation.py

* format

* use self.bos_id

* remove sys_instruct
parent af2f072e
@@ -30,7 +30,7 @@ pip install nvidia-ml-py
 ```bash
 python profile_generation.py \
  --model-path /path/to/your/model \
- --concurrency 1 8 --prompt-tokens 0 512 --completion-tokens 2048 512
+ --concurrency 1 8 --prompt-tokens 1 512 --completion-tokens 2048 512
 ```
 ## profile serving
...
@@ -90,7 +90,7 @@ def warmup(model,
 def profile_throughput(model_path: str,
                        concurrency: int = 1,
-                       input_seqlen: int = 0,
+                       input_seqlen: int = 1,
                        output_seqlen: int = 512,
                        test_round: int = 10,
                        tp: int = 1):
@@ -99,8 +99,10 @@ def profile_throughput(model_path: str,
     tm_model = TurboMind(model_path=model_path, tp=tp)
     # make up a prompt that can be tokenized into {input_seqlen} tokens
-    prompt = '' if input_seqlen == 0 else 'hi' + ' hi' * (input_seqlen - 1)
+    assert input_seqlen > 0, 'input_seqlen should > 0'
+    prompt = 'hi'
     input_ids = tokenizer.encode(prompt)
+    input_ids = input_ids * input_seqlen
     warmup(tm_model, concurrency, input_ids, output_seqlen)
...
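The prompt is now built by tiling token ids rather than concatenating words in text, and a zero-length prompt is rejected up front. A minimal sketch of the idea, with `encode` as a hypothetical stand-in for the real tokenizer (if a real tokenizer prepends special tokens such as BOS, those get tiled too, so the resulting length is a multiple of `len(encode('hi'))` rather than exactly `input_seqlen`):

```python
# Sketch of the new prompt construction in profile_throughput.
def encode(prompt: str) -> list[int]:
    # hypothetical stand-in: pretend 'hi' maps to a single token id
    return [9086]

input_seqlen = 512
assert input_seqlen > 0, 'input_seqlen should > 0'

input_ids = encode('hi')
input_ids = input_ids * input_seqlen  # tile ids instead of repeating text
assert len(input_ids) == input_seqlen
```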
@@ -20,7 +20,6 @@ def input_prompt(model_name):
 def main(tritonserver_addr: str,
          session_id: int = 1,
          cap: str = 'chat',
-         sys_instruct: str = None,
          stream_output: bool = True,
          **kwargs):
     """An example to communicate with inference server through the command line
@@ -32,13 +31,11 @@ def main(tritonserver_addr: str,
         session_id (int): the identical id of a session
         cap (str): the capability of a model. For example, codellama has
             the ability among ['completion', 'infill', 'instruct', 'python']
-        sys_instruct (str): the content of 'system' role, which is used by
-            conversational model
         stream_output (bool): indicator for streaming output or not
         **kwargs (dict): other arguments for initializing model's chat template
     """
     log_level = os.environ.get('SERVICE_LOG_LEVEL', 'WARNING')
-    kwargs.update(capability=cap, system=sys_instruct)
+    kwargs.update(capability=cap)
     chatbot = Chatbot(tritonserver_addr,
                       log_level=log_level,
                       display=stream_output,
...
@@ -459,6 +459,10 @@ class Chatbot:
             session.sequence_length = 0
         input_ids, input_lengths = self.preprocess(prompt)
+        # will crash if last_token_id == eos_id and send empty input_ids
+        if sequence_end and request_output_len == 0:
+            input_ids = np.array([[self.bos_id]], dtype=np.uint32)
+            input_lengths = np.array([[1]], dtype=np.uint32)
         input_tokens = input_lengths.squeeze()
         if self.profile_generation:
            yield StatusCode.TRITON_STREAM_ING, \
...
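This hunk is the actual crash fix: per the added comment, the server crashes when it is sent an empty `input_ids` tensor, which can happen when a request only ends a session (`sequence_end=True` with `request_output_len == 0`). The patch substitutes a single BOS token in that case. A runnable sketch of the guard in isolation, with `bos_id` as an illustrative value (the real id comes from the model's tokenizer):

```python
import numpy as np

bos_id = 1  # illustrative; actual value is model-dependent

def guard_inputs(input_ids, input_lengths, sequence_end, request_output_len):
    """Mimic the added check: never send an empty input_ids tensor."""
    if sequence_end and request_output_len == 0:
        input_ids = np.array([[bos_id]], dtype=np.uint32)
        input_lengths = np.array([[1]], dtype=np.uint32)
    return input_ids, input_lengths

# An end-of-session request with nothing to generate gets one BOS token
# instead of an empty array.
ids, lens = guard_inputs(np.empty((1, 0), dtype=np.uint32),
                         np.array([[0]], dtype=np.uint32),
                         sequence_end=True, request_output_len=0)
print(ids, lens)  # [[1]] [[1]]
```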
@@ -73,7 +73,6 @@ def get_gen_param(cap,
 def main(model_path,
          session_id: int = 1,
          cap: str = 'chat',
-         sys_instruct: str = None,
          tp=1,
          stream_output=True,
          **kwargs):
@@ -85,8 +84,6 @@ def main(model_path,
         session_id (int): the identical id of a session
         cap (str): the capability of a model. For example, codellama has
             the ability among ['completion', 'infilling', 'chat', 'python']
-        sys_instruct (str): the content of 'system' role, which is used by
-            conversational model
         tp (int): GPU number used in tensor parallelism
         stream_output (bool): indicator for streaming output or not
         **kwarg (dict): other arguments for initializing model's chat template
@@ -100,9 +97,7 @@ def main(model_path,
     step = 0
     seed = random.getrandbits(64)
     model_name = tm_model.model_name
-    model = MODELS.get(model_name)(capability=cap, **kwargs) \
-        if sys_instruct is None else MODELS.get(model_name)(
-            capability=cap, system=sys_instruct, **kwargs)
+    model = MODELS.get(model_name)(capability=cap, **kwargs)
     print(f'session {session_id}')
     while True:
...
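With `sys_instruct` removed, the two-way conditional collapses: every extra keyword flows through `**kwargs` into the chat-template constructor, so a system prompt can presumably still reach the template if its constructor accepts a `system` keyword, as the removed branch implied. A self-contained sketch of that forwarding pattern (`ChatTemplate` below is a hypothetical stand-in for `MODELS.get(model_name)`):

```python
# Sketch of the simplified forwarding: no special-casing for `system`.
class ChatTemplate:
    def __init__(self, capability='chat', system=None, **kwargs):
        self.capability = capability
        self.system = system

def build_model(cap='chat', **kwargs):
    # mirrors: model = MODELS.get(model_name)(capability=cap, **kwargs)
    return ChatTemplate(capability=cap, **kwargs)

m = build_model(cap='chat', system='You are a helpful assistant')
print(m.system)  # the system prompt still reaches the template via **kwargs
```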