Commit fe851fbc authored by zhouxiang

Add supplementary new files for version 0.2.6

parent e2d98ddc
import os
import allure
import pytest
from utils.config_utils import get_cuda_prefix_by_workerid
from utils.quantization_utils import quantization
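# W4A16 (AWQ) quantization regression: each model below is quantized via
# `lmdeploy lite auto_awq` (see utils.quantization_utils.quantization) and the
# resulting log is attached to the Allure report.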
model_list = [
'meta-llama/Llama-2-7b-chat', 'internlm/internlm-chat-20b',
'Qwen/Qwen-7B-Chat', 'Qwen/Qwen-14B-Chat', 'Qwen/Qwen-VL',
'internlm/internlm2-chat-20b', 'internlm/internlm2-20b',
'baichuan-inc/Baichuan2-7B-Chat'
]
@pytest.mark.order(3)
@pytest.mark.quantization_w4a16
@pytest.mark.timeout(900)
@pytest.mark.parametrize('model', model_list)
def test_quantization_w4a16(config, model, worker_id):
quantization_w4a16(config, model + '-inner-w4a16', model,
get_cuda_prefix_by_workerid(worker_id))
@pytest.mark.order(3)
@pytest.mark.quantization_w4a16
@pytest.mark.pr_test
@pytest.mark.flaky(reruns=0)
@pytest.mark.timeout(900)
@pytest.mark.parametrize(
'model, prefix',
[('internlm/internlm2-chat-20b', 'CUDA_VISIBLE_DEVICES=5')])
def test_quantization_w4a16_pr(config, model, prefix):
quantization_w4a16(config, model + '-inner-w4a16', model, prefix)
def quantization_w4a16(config, quantization_model_name, origin_model_name,
cuda_prefix):
quantization_type = 'w4a16'
result, msg = quantization(config, quantization_model_name,
origin_model_name, quantization_type,
cuda_prefix)
log_path = config.get('log_path')
quantization_log = os.path.join(
log_path, '_'.join([
'quantization', quantization_type,
quantization_model_name.split('/')[1]
]) + '.log')
allure.attach.file(quantization_log,
attachment_type=allure.attachment_type.TEXT)
assert result, msg
import os
import allure
import pytest
from utils.config_utils import get_cuda_prefix_by_workerid
from utils.quantization_utils import quantization
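# W8A8 quantization regression: each model below is quantized via
# `lmdeploy lite smooth_quant` and the resulting log is attached to the Allure report.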
model_list = [
'meta-llama/Llama-2-7b-chat', 'internlm/internlm-chat-20b',
'internlm/internlm2-chat-20b', 'internlm/internlm2-chat-7b',
'01-ai/Yi-6B-Chat', 'internlm/internlm2-20b'
]
@pytest.mark.order(2)
@pytest.mark.quantization_w8a8
@pytest.mark.timeout(900)
@pytest.mark.parametrize('model', model_list)
def test_quantization_w8a8(config, model, worker_id):
quantization_w8a8(config, model + '-inner-w8a8', model,
get_cuda_prefix_by_workerid(worker_id))
def quantization_w8a8(config, quantization_model_name, origin_model_name,
cuda_prefix):
quantization_type = 'w8a8'
result, msg = quantization(config, quantization_model_name,
origin_model_name, quantization_type,
cuda_prefix)
log_path = config.get('log_path')
quantization_log = os.path.join(
log_path, '_'.join([
'quantization', quantization_type,
quantization_model_name.split('/')[1]
]) + '.log')
allure.attach.file(quantization_log,
attachment_type=allure.attachment_type.TEXT)
assert result, msg
import os
import subprocess
from time import sleep, time
import allure
import pytest
from pytest import assume
from utils.config_utils import (get_cuda_prefix_by_workerid,
get_torch_model_list, get_workerid)
from utils.get_run_config import get_command_with_extra
from utils.run_client_chat import command_line_test
from utils.run_restful_chat import (get_model, health_check, interactive_test,
open_chat_test)
BASE_HTTP_URL = 'http://localhost'
DEFAULT_PORT = 23333
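# Launch an api_server with the PyTorch backend for every parametrized model,
# wait until health_check passes (up to about two minutes), run the chat cases,
# and kill the server process on teardown.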
@pytest.fixture(scope='function', autouse=True)
def prepare_environment(request, config, worker_id):
model_path = config.get('model_path')
log_path = config.get('log_path')
param = request.param
model = param['model']
cuda_prefix = param['cuda_prefix']
tp_num = param['tp_num']
if cuda_prefix is None:
cuda_prefix = get_cuda_prefix_by_workerid(worker_id, tp_num=tp_num)
worker_num = get_workerid(worker_id)
if worker_num is None:
port = DEFAULT_PORT
else:
port = DEFAULT_PORT + worker_num
    cmd = get_command_with_extra('lmdeploy serve api_server ' + model_path +
                                 '/' + model + ' --backend pytorch' +
                                 ' --server-port ' + str(port),
                                 config,
                                 model,
                                 need_tp=True,
                                 cuda_prefix=cuda_prefix)
print('reproduce command restful: ' + cmd)
start_log = os.path.join(log_path,
'start_restful_' + model.split('/')[1] + '.log')
with open(start_log, 'w') as f:
f.writelines('reproduce command restful: ' + cmd + '\n')
        # start the api_server
convertRes = subprocess.Popen([cmd],
stdout=f,
stderr=f,
shell=True,
text=True,
encoding='utf-8')
pid = convertRes.pid
allure.attach.file(start_log, attachment_type=allure.attachment_type.TEXT)
http_url = BASE_HTTP_URL + ':' + str(port)
start_time = int(time())
sleep(5)
for i in range(120):
sleep(1)
end_time = int(time())
total_time = end_time - start_time
result = health_check(http_url)
if result or total_time >= 120:
break
yield
if pid > 0:
kill_log = os.path.join(log_path,
'kill_' + model.split('/')[1] + '.log')
with open(kill_log, 'w') as f:
convertRes.kill()
allure.attach.file(kill_log, attachment_type=allure.attachment_type.TEXT)
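# Build parametrize entries (model, cuda_prefix, tp_num) for every chat-capable
# model returned by get_torch_model_list with the given tp size.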
def getModelList(tp_num):
return [{
'model': item,
'cuda_prefix': None,
'tp_num': tp_num
} for item in get_torch_model_list(tp_num) if 'chat' in item.lower()]
@pytest.mark.order(7)
@pytest.mark.usefixtures('common_case_config')
@pytest.mark.restful_api_pytorch
@pytest.mark.gpu_num_1
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('prepare_environment',
getModelList(tp_num=1),
indirect=True)
def test_restful_chat_tp1(config, common_case_config, worker_id):
if get_workerid(worker_id) is None:
run_all_step(config, common_case_config)
else:
run_all_step(config,
common_case_config,
worker_id=worker_id,
port=DEFAULT_PORT + get_workerid(worker_id))
@pytest.mark.order(7)
@pytest.mark.usefixtures('common_case_config')
@pytest.mark.restful_api_pytorch
@pytest.mark.gpu_num_2
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('prepare_environment',
getModelList(tp_num=2),
indirect=True)
def test_restful_chat_tp2(config, common_case_config, worker_id):
if get_workerid(worker_id) is None:
run_all_step(config, common_case_config)
else:
run_all_step(config,
common_case_config,
worker_id=worker_id,
port=DEFAULT_PORT + get_workerid(worker_id))
def run_all_step(config,
cases_info,
worker_id: str = 'default',
port: int = DEFAULT_PORT):
http_url = BASE_HTTP_URL + ':' + str(port)
model = get_model(http_url)
if model is None:
        assert False, 'server did not start correctly'
for case in cases_info.keys():
if (case == 'memory_test'
or case == 'emoji_case') and 'chat' not in model.lower():
continue
case_info = cases_info.get(case)
with allure.step(case + ' step1 - command chat regression'):
chat_result, chat_log, msg = command_line_test(
config, case, case_info, model + worker_id, 'api_client',
http_url)
if chat_log is not None:
allure.attach.file(chat_log,
attachment_type=allure.attachment_type.TEXT)
with assume:
assert chat_result, msg
with allure.step(case + ' step2 - restful_test - openai chat'):
restful_result, restful_log, msg = open_chat_test(
config, case_info, model, http_url, worker_id)
allure.attach.file(restful_log,
attachment_type=allure.attachment_type.TEXT)
with assume:
assert restful_result, msg
with allure.step(case + ' step3 - restful_test - interactive chat'):
active_result, interactive_log, msg = interactive_test(
config, case_info, model, http_url, worker_id)
allure.attach.file(interactive_log,
attachment_type=allure.attachment_type.TEXT)
with assume:
assert active_result, msg
import os
import subprocess
from time import sleep, time
import allure
import pytest
from pytest import assume
from utils.config_utils import (get_all_model_list,
get_cuda_prefix_by_workerid, get_workerid)
from utils.get_run_config import get_command_with_extra
from utils.run_client_chat import command_line_test
from utils.run_restful_chat import (get_model, health_check, interactive_test,
open_chat_test)
BASE_HTTP_URL = 'http://localhost'
DEFAULT_PORT = 23333
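# Same flow as the PyTorch restful test above, but the api_server is launched with
# the default (TurboMind) backend and extra --quant-policy / --model-format flags
# for kvint8 and w4a16/4bits models.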
@pytest.fixture(scope='function', autouse=True)
def prepare_environment(request, config, worker_id):
model_path = config.get('model_path')
log_path = config.get('log_path')
param = request.param
model = param['model']
cuda_prefix = param['cuda_prefix']
tp_num = param['tp_num']
if cuda_prefix is None:
cuda_prefix = get_cuda_prefix_by_workerid(worker_id, tp_num=tp_num)
worker_num = get_workerid(worker_id)
if worker_num is None:
port = DEFAULT_PORT
else:
port = DEFAULT_PORT + worker_num
cmd = get_command_with_extra('lmdeploy serve api_server ' + model_path +
'/' + model + ' --server-port ' + str(port),
config,
model,
need_tp=True,
cuda_prefix=cuda_prefix)
if 'kvint8' in model:
cmd += ' --quant-policy 4'
if 'w4' in model or '4bits' in model:
cmd += ' --model-format awq'
else:
cmd += ' --model-format hf'
    elif 'w4' in model or '4bits' in model:
cmd += ' --model-format awq'
start_log = os.path.join(log_path,
'start_restful_' + model.split('/')[1] + '.log')
print('reproduce command restful: ' + cmd)
with open(start_log, 'w') as f:
f.writelines('reproduce command restful: ' + cmd + '\n')
        # start the api_server
convertRes = subprocess.Popen([cmd],
stdout=f,
stderr=f,
shell=True,
text=True,
encoding='utf-8')
pid = convertRes.pid
allure.attach.file(start_log, attachment_type=allure.attachment_type.TEXT)
http_url = BASE_HTTP_URL + ':' + str(port)
start_time = int(time())
sleep(5)
for i in range(120):
sleep(1)
end_time = int(time())
total_time = end_time - start_time
result = health_check(http_url)
if result or total_time >= 120:
break
yield
if pid > 0:
kill_log = os.path.join(log_path,
'kill_' + model.split('/')[1] + '.log')
with open(kill_log, 'w') as f:
convertRes.kill()
allure.attach.file(kill_log, attachment_type=allure.attachment_type.TEXT)
def getModelList(tp_num):
return [{
'model': item,
'cuda_prefix': None,
'tp_num': tp_num
} for item in get_all_model_list(tp_num) if 'chat' in item.lower()]
@pytest.mark.order(7)
@pytest.mark.usefixtures('common_case_config')
@pytest.mark.restful_api
@pytest.mark.gpu_num_1
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('prepare_environment',
getModelList(tp_num=1),
indirect=True)
def test_restful_chat_tp1(request, config, common_case_config, worker_id):
if get_workerid(worker_id) is None:
run_all_step(config, common_case_config)
else:
run_all_step(config,
common_case_config,
worker_id=worker_id,
port=DEFAULT_PORT + get_workerid(worker_id))
@pytest.mark.order(7)
@pytest.mark.usefixtures('common_case_config')
@pytest.mark.restful_api
@pytest.mark.gpu_num_2
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('prepare_environment',
getModelList(tp_num=2),
indirect=True)
def test_restful_chat_tp2(config, common_case_config, worker_id):
if get_workerid(worker_id) is None:
run_all_step(config, common_case_config)
else:
run_all_step(config,
common_case_config,
worker_id=worker_id,
port=DEFAULT_PORT + get_workerid(worker_id))
@pytest.mark.order(7)
@pytest.mark.usefixtures('common_case_config')
@pytest.mark.restful_api
@pytest.mark.flaky(reruns=0)
@pytest.mark.pr_test
@pytest.mark.parametrize('prepare_environment', [{
'model': 'internlm/internlm2-chat-20b',
'cuda_prefix': 'CUDA_VISIBLE_DEVICES=5,6',
'tp_num': 2
}, {
'model': 'internlm/internlm2-chat-20b-inner-w4a16',
'cuda_prefix': 'CUDA_VISIBLE_DEVICES=5,6',
'tp_num': 2
}],
indirect=True)
def test_restful_chat_pr(config, common_case_config):
run_all_step(config, common_case_config)
def run_all_step(config,
cases_info,
worker_id: str = 'default',
port: int = DEFAULT_PORT):
http_url = BASE_HTTP_URL + ':' + str(port)
model = get_model(http_url)
if model is None:
        assert False, 'server did not start correctly'
for case in cases_info.keys():
if (case == 'memory_test'
or case == 'emoji_case') and 'chat' not in model.lower():
continue
case_info = cases_info.get(case)
with allure.step(case + ' step1 - command chat regression'):
chat_result, chat_log, msg = command_line_test(
config, case, case_info, model, 'api_client', http_url)
allure.attach.file(chat_log,
attachment_type=allure.attachment_type.TEXT)
with assume:
assert chat_result, msg
with allure.step(case + ' step2 - restful_test - openai chat'):
restful_result, restful_log, msg = open_chat_test(
config, case_info, model, http_url)
allure.attach.file(restful_log,
attachment_type=allure.attachment_type.TEXT)
with assume:
assert restful_result, msg
with allure.step(case + ' step3 - restful_test - interactive chat'):
active_result, interactive_log, msg = interactive_test(
config, case_info, model, http_url)
allure.attach.file(interactive_log,
attachment_type=allure.attachment_type.TEXT)
with assume:
assert active_result, msg
import os
import yaml
from utils.get_run_config import get_tp_num
def get_turbomind_model_list(tp_num: int = None):
config_path = os.path.join('autotest/config.yaml')
with open(config_path) as f:
config = yaml.load(f.read(), Loader=yaml.SafeLoader)
case_list = config.get('turbomind_model')
quatization_case_config = config.get('quatization_case_config')
for key in quatization_case_config.get('w4a16'):
case_list.append(key + '-inner-w4a16')
for key in quatization_case_config.get('kvint8'):
case_list.append(key + '-inner-kvint8')
for key in quatization_case_config.get('kvint8_w4a16'):
case_list.append(key + '-inner-kvint8-w4a16')
if tp_num is not None:
return [
item for item in case_list if get_tp_num(config, item) == tp_num
]
return case_list
def get_torch_model_list(tp_num: int = None):
config_path = os.path.join('autotest/config.yaml')
with open(config_path) as f:
config = yaml.load(f.read(), Loader=yaml.SafeLoader)
case_list = config.get('pytorch_model')
quatization_case_config = config.get('quatization_case_config')
for key in quatization_case_config.get('w8a8'):
case_list.append(key + '-inner-w8a8')
if tp_num is not None:
return [
item for item in case_list if get_tp_num(config, item) == tp_num
]
return case_list
def get_all_model_list(tp_num: int = None):
config_path = os.path.join('autotest/config.yaml')
with open(config_path) as f:
config = yaml.load(f.read(), Loader=yaml.SafeLoader)
case_list = config.get('turbomind_model')
for key in config.get('pytorch_model'):
if key not in case_list:
case_list.append(key)
quatization_case_config = config.get('quatization_case_config')
for key in quatization_case_config.get('w4a16'):
case_list.append(key + '-inner-w4a16')
for key in quatization_case_config.get('kvint8'):
case_list.append(key + '-inner-kvint8')
for key in quatization_case_config.get('kvint8_w4a16'):
case_list.append(key + '-inner-kvint8-w4a16')
if tp_num is not None:
return [
item for item in case_list if get_tp_num(config, item) == tp_num
]
return case_list
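# Map a pytest-xdist worker id such as 'gw3' to a CUDA_VISIBLE_DEVICES prefix,
# allocating tp_num consecutive GPU ids per worker (only tp 1 and 2 are handled).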
def get_cuda_prefix_by_workerid(worker_id, tp_num: int = 1):
if worker_id is None or 'gw' not in worker_id:
return None
else:
if tp_num == 1:
return 'CUDA_VISIBLE_DEVICES=' + worker_id.replace('gw', '')
elif tp_num == 2:
cuda_num = int(worker_id.replace('gw', '')) * 2
return 'CUDA_VISIBLE_DEVICES=' + ','.join(
[str(cuda_num), str(cuda_num + 1)])
def get_cuda_id_by_workerid(worker_id, tp_num: int = 1):
if worker_id is None or 'gw' not in worker_id:
return None
else:
if tp_num == 1:
return worker_id.replace('gw', '')
elif tp_num == 2:
cuda_num = int(worker_id.replace('gw', '')) * 2
return ','.join([str(cuda_num), str(cuda_num + 1)])
def get_workerid(worker_id):
if worker_id is None or 'gw' not in worker_id:
return None
else:
return int(worker_id.replace('gw', ''))
import random
from time import sleep
import torch
from lmdeploy.model import MODELS
def get_conda_allcate_prefix(config, model):
cuda_prefix = ''
tp_num = get_tp_num(config, model)
if tp_num is None:
return cuda_prefix
    available_cuda = _get_available_cuda()
if len(available_cuda) < tp_num:
raise torch.cuda.OutOfMemoryError
cuda_prefix = 'CUDA_VISIBLE_DEVICES=' + ','.join(
random.sample(available_cuda, tp_num))
torch.cuda.empty_cache()
return cuda_prefix
def get_tp_config(config, model, need_tp):
    tp_num = get_tp_num(config, model)
    tp_info = ''
    if need_tp and tp_num is not None:
        tp_info = '--tp ' + str(tp_num)
    return tp_info
def get_tp_num(config, model):
tp_config = config.get('tp_config')
tp_num = 1
if tp_config is None:
return None
model_name = _simple_model_name(model)
if model_name in tp_config.keys():
tp_num = tp_config.get(model_name)
return tp_num
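# Prepend a CUDA_VISIBLE_DEVICES prefix and append --tp to an lmdeploy command.
# A short random sleep staggers concurrent workers before the free-GPU lookup.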
def get_command_with_extra(cmd,
config,
model,
need_tp: bool = False,
cuda_prefix: str = None,
need_sleep: bool = True):
if need_sleep:
sleep(random.uniform(0, 5))
if cuda_prefix is None:
cuda_prefix = get_conda_allcate_prefix(config, model)
tp_config = get_tp_config(config, model, need_tp)
if cuda_prefix is not None and len(cuda_prefix) > 0:
cmd = ' '.join([cuda_prefix, cmd])
if tp_config is not None and len(tp_config) > 0:
cmd = ' '.join([cmd, tp_config])
torch.cuda.empty_cache()
return cmd
def get_model_name(model):
model_names = [
'llama', 'llama2', 'internlm', 'internlm2', 'baichuan2', 'chatglm2',
'falcon', 'yi', 'qwen1.5'
]
model_names += list(MODELS.module_dict.keys())
model_names.sort()
model_name = _simple_model_name(model)
model_name = model_name.lower()
if model_name in model_names:
return model_name
model_name = model_name.replace('-chat', '')
model_name = model_name.replace('-v0.1', '')
if model_name in model_names:
return model_name
if (model_name == 'qwen-vl'):
return 'qwen-7b'
if ('llama-2' in model_name):
return 'llama-2'
return model_name.split('-')[0]
def _get_available_cuda():
devices = torch.cuda.device_count()
available_cuda = []
for i in range(devices):
if (torch.cuda.utilization(i) > 5):
continue
if ('no processes are running'
not in torch.cuda.list_gpu_processes(i)):
continue
available_cuda.append(str(i))
return available_cuda
def _simple_model_name(model):
if '/' in model:
model_name = model.split('/')[1]
else:
model_name = model
model_name = model_name.replace('-inner-w4a16', '')
model_name = model_name.replace('-inner-w8a8', '')
model_name = model_name.replace('-inner-kvint8', '')
model_name = model_name.replace('-w4a16', '')
return model_name
def _split_model_name(model):
model_name = model.split('/')[1]
return model_name
if __name__ == '__main__':
print(_simple_model_name('baichuan-inc/Baichuan2-7B-Chat-inner-w4a16'))
import os
import allure
from pytest import assume
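# Write (or append) a 'result:..., reason:...' line to a per-case log file under
# log_path; assert_log later re-reads that file and asserts on the outcome.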
def write_log(config,
result,
msg,
is_new: bool = True,
case_path_tag: str = 'default'):
try:
log_path = os.path.join(config.get('log_path'), case_path_tag)
if is_new:
file = open(log_path, 'w')
else:
file = open(log_path, 'a')
        file.writelines('result:' + str(result) + ', reason:' + msg + '\n')
file.close()
except Exception as e:
return False, None, f'Unknown error: {e}'
def assert_log(config, case_path_tag: str = 'default'):
    log_path = os.path.join(config.get('log_path'), case_path_tag)
    result = False
    msg = ''
    with open(log_path, 'r') as f:
        lines = f.readlines()
        for line in lines:
            if 'result:False, reason:' in line:
                result = False
                msg = line
                break
            if 'result:True, reason:' in line and result is False:
                result = True
allure.attach.file(log_path, attachment_type=allure.attachment_type.TEXT)
with assume:
assert result, msg
import os
import allure
import torch
from pytest import assume
from utils.get_run_config import get_model_name, get_tp_num
from utils.rule_condition_assert import assert_result
from lmdeploy import pipeline
from lmdeploy.messages import (GenerationConfig, PytorchEngineConfig,
TurbomindEngineConfig)
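# Run every case through lmdeploy.pipeline with the backend selected by `type`
# ('pytorch' uses PytorchEngineConfig, anything else TurbomindEngineConfig), writing
# prompts, outputs and per-case verdicts to a log that assert_pipeline_chat_log
# re-reads and asserts on.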
def run_pipeline_chat_test(config, cases_info, model_case, type):
log_path = config.get('log_path')
tp = get_tp_num(config, model_case)
    model_name = get_model_name(model_case)
model_path = config.get('model_path')
hf_path = model_path + '/' + model_case
print(' '.join([
'reproduce command:', 'python',
'autotest/tools/pipeline/pipeline_chat_script.py', type, model_case,
str(tp)
]))
if 'pytorch' == type:
backend_config = PytorchEngineConfig(tp=tp)
else:
if 'kvint8' in model_case and ('w4' in model_case
or '4bits' in model_case):
backend_config = TurbomindEngineConfig(tp=tp,
model_format='awq',
quant_policy=4)
elif 'kvint8' in model_case:
backend_config = TurbomindEngineConfig(tp=tp,
model_format='hf',
quant_policy=4)
elif 'w4' in model_case or '4bits' in model_case:
backend_config = TurbomindEngineConfig(tp=tp, model_format='awq')
else:
backend_config = TurbomindEngineConfig(tp=tp)
pipe = pipeline(hf_path, backend_config=backend_config)
# run testcases
    gen_config = GenerationConfig(temperature=0.01)
for case in cases_info.keys():
if (case == 'memory_test'
or case == 'emoji_case') and 'chat' not in model_case.lower():
continue
case_info = cases_info.get(case)
pipeline_chat_log = os.path.join(
log_path,
'pipeline_chat_' + model_case.split('/')[1] + '_' + case + '.log')
file = open(pipeline_chat_log, 'w')
prompts = []
for prompt_detail in case_info:
prompt = list(prompt_detail.keys())[0]
if 'chat' not in model_case.lower(): # base model
prompts.append(prompt)
else: # chat model
prompts.append({'role': 'user', 'content': prompt})
file.writelines('prompt:' + prompt + '\n')
if 'chat' not in model_case.lower(): # base model
response = pipe(prompts, gen_config=gen_config)[-1].text
else: # chat model
response = pipe([prompts], gen_config=gen_config)[0].text
case_result, reason = assert_result(response,
prompt_detail.values(),
model_name)
if 'chat' in model_case.lower():
prompts.append({'role': 'assistant', 'content': response})
file.writelines('output:' + response + '\n')
file.writelines('result:' + str(case_result) + ', reason:' +
reason + '\n')
file.close()
del pipe
torch.cuda.empty_cache()
def assert_pipeline_chat_log(config, cases_info, model_case):
log_path = config.get('log_path')
for case in cases_info.keys():
if (case == 'memory_test'
or case == 'emoji_case') and 'chat' not in model_case.lower():
continue
msg = ''
result = False
with allure.step('case - ' + case):
pipeline_chat_log = os.path.join(
log_path, 'pipeline_chat_' + model_case.split('/')[1] + '_' +
case + '.log')
with open(pipeline_chat_log, 'r') as f:
lines = f.readlines()
for line in lines:
if 'result:False, reason:' in line:
result = False
msg = line
break
if 'result:True, reason:' in line and result is False:
result = True
allure.attach.file(pipeline_chat_log,
attachment_type=allure.attachment_type.TEXT)
with assume:
assert result, msg
import os
import subprocess
from subprocess import PIPE
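# Run the lmdeploy lite command matching quantization_type (auto_awq, smooth_quant
# or calibrate) on origin_model_name after removing any stale work dir, and write
# the full output to a dedicated log file.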
def quantization(config,
quantization_model_name,
origin_model_name,
quantization_type: str = 'w4a16',
cuda_prefix: str = 'CUDA_VISIBLE_DEVICES=0'):
model_path = config.get('model_path')
log_path = config.get('log_path')
origin_model_path = config.get('model_path') + '/' + origin_model_name
quantization_model_path = model_path + '/' + quantization_model_name
quantization_log = os.path.join(
log_path, '_'.join([
'quantization', quantization_type,
quantization_model_name.split('/')[1]
]) + '.log')
if quantization_type == 'w4a16':
quantization_cmd = ' '.join([
cuda_prefix, 'lmdeploy lite auto_awq', origin_model_path,
'--work-dir', quantization_model_path
])
elif quantization_type == 'w8a8':
quantization_cmd = ' '.join([
cuda_prefix, 'lmdeploy lite smooth_quant', origin_model_path,
'--work-dir', quantization_model_path
])
elif quantization_type == 'kvint8':
quantization_cmd = ' '.join([
cuda_prefix, 'lmdeploy lite calibrate', origin_model_path,
'--work-dir', quantization_model_path
])
else:
        return False, ('quantization type should be in [w4a16, w8a8, kvint8], '
                       'now the type is ' + quantization_type)
with open(quantization_log, 'w') as f:
# remove existing folder
subprocess.run([' '.join(['rm -rf', quantization_model_path])],
stdout=f,
stderr=f,
shell=True,
text=True,
encoding='utf-8')
if quantization_type == 'kvint8':
cp_cmd = ' '.join(
['cp -r', origin_model_path, quantization_model_path])
f.writelines('reproduce command quantization_cmd: ' + cp_cmd +
'\n')
print('reproduce command quantization_cmd: ' + cp_cmd)
subprocess.run([cp_cmd],
stdout=f,
stderr=f,
shell=True,
text=True,
encoding='utf-8')
f.writelines('reproduce command quantization_cmd: ' +
quantization_cmd + '\n')
print('reproduce command quantization_cmd: ' + quantization_cmd)
# quantization
quantizationRes = subprocess.run([quantization_cmd],
stdout=f,
stderr=PIPE,
shell=True,
text=True,
encoding='utf-8')
f.writelines(quantizationRes.stderr)
result = quantizationRes.returncode == 0
return result, quantizationRes.stderr
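# Assertions on the payloads returned by the api_server chat-completion and
# interactive-chat endpoints, covering both batch and streaming responses.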
def assert_chat_completions_batch_return(output, model_name):
assert output.get('usage').get('prompt_tokens') > 0
assert output.get('usage').get('total_tokens') > 0
assert output.get('usage').get('completion_tokens') > 0
assert output.get('usage').get('completion_tokens') + output.get(
'usage').get('prompt_tokens') == output.get('usage').get(
'total_tokens')
assert output.get('id') is not None
assert output.get('object') == 'chat.completion'
assert output.get('model') == model_name
output_message = output.get('choices')
assert len(output_message) == 1
for message in output_message:
assert message.get('finish_reason') in ['stop', 'length']
assert message.get('index') == 0
assert len(message.get('message').get('content')) > 0
assert message.get('message').get('role') == 'assistant'
def assert_chat_completions_stream_return(output,
model_name,
is_first: bool = False,
is_last: bool = False):
assert output.get('id') is not None
if is_first is False:
assert output.get('object') == 'chat.completion.chunk'
assert output.get('model') == model_name
output_message = output.get('choices')
assert len(output_message) == 1
for message in output_message:
assert message.get('delta').get('role') == 'assistant'
assert message.get('index') == 0
if is_last is False:
assert message.get('finish_reason') is None
if is_first is False and is_last is False:
assert len(message.get('delta').get('content')) >= 0
if is_last is True:
assert len(message.get('delta').get('content')) == 0
assert message.get('finish_reason') in ['stop', 'length']
def assert_chat_interactive_batch_return(output):
assert output.get('input_tokens') > 0
assert output.get('tokens') > 0
assert output.get('history_tokens') >= 0
assert output.get('finish_reason') in ['stop', 'length']
assert len(output.get('text')) > 0
def assert_chat_interactive_stream_return(output,
is_last: bool = False,
is_text_empty: bool = False,
index: int = None):
assert output.get('input_tokens') > 0
if index is not None:
assert output.get('tokens') >= index and output.get(
'tokens') <= index + 6
assert output.get('tokens') > 0
assert output.get('history_tokens') >= 0
if is_last:
assert len(output.get('text')) >= 0
assert output.get('finish_reason') in ['stop', 'length']
elif is_text_empty:
assert len(output.get('text')) == 0
assert output.get('finish_reason') is None
else:
assert len(output.get('text')) >= 0
assert output.get('finish_reason') is None
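# Check a model response against a list of rule dicts ('contain', 'not_contain',
# 'len_g'); a dict keyed by the model name substitutes model-specific rules.
# Returns (passed, reason).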
def assert_result(input, rule_condition, model_name):
input = input.replace('\n', '\\n')
input_lower = input.lower()
for dict in rule_condition:
if dict is None:
return True, ''
for rule in dict:
operator = list(rule.keys())[0]
value = list(rule.values())[0]
if model_name is not None and model_name == operator:
dict = value
for rule in dict:
operator = list(rule.keys())[0]
value = list(rule.values())[0]
if operator == 'contain':
if isinstance(value, list):
tmpResult = False
for word in value:
if word.lower() in input_lower:
tmpResult = True
if tmpResult is False:
return False, ','.join(
value) + " doesn't exist in " + input
else:
if value.lower() not in input_lower:
msg = value + " doesn't exist in:" + input
return False, msg
if operator == 'not_contain':
if isinstance(value, list):
for word in value:
if word.lower() in input_lower:
msg = word + " shouldn't exist in:" + input
return False, msg
else:
if value.lower() in input_lower:
msg = value + " shouldn't exist in " + input
return False, msg
if operator == 'len_g':
if len(input) < int(value):
return False, input + ' length: ' + str(
len(input)) + ', should greater than ' + str(value)
return True, ''
if __name__ == '__main__':
input = '成都的景点hot potdddd'
condition = ([[{
'contain': ['hot pot']
}, {
'contain': ['。']
}, {
        'len_g': 10
}]])
    print(assert_result(input, condition, None))
import os
from subprocess import PIPE, Popen
from utils.get_run_config import get_command_with_extra, get_model_name
from utils.rule_condition_assert import assert_result
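# Build the lmdeploy chat / serve client command for a case and drive it through
# stdin (prompts joined with the model's turn separator, ending with 'exit'),
# validating each answer with assert_result.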
def command_line_test(config,
case,
case_info,
model_case,
type,
extra: str = None,
cuda_prefix: str = None):
dst_path = config.get('dst_path')
if type == 'api_client':
cmd = 'lmdeploy serve api_client ' + extra
elif type == 'triton_client':
cmd = 'lmdeploy serve triton_client ' + extra
else:
cmd = get_command_with_extra('lmdeploy chat turbomind ' + dst_path +
'/workspace_' + model_case,
config,
model_case,
cuda_prefix=cuda_prefix)
if 'kvint8' in model_case:
cmd += ' --quant-policy 4'
if 'w4' in model_case or '4bits' in model_case:
cmd += ' --model-format awq'
else:
cmd += ' --model-format hf'
elif 'w4' in model_case or '4bits' in model_case:
cmd += ' --model-format awq'
if 'chat' not in model_case.lower():
cmd += ' --cap completion'
return command_test(config, [cmd], model_case, case, case_info,
type == 'turbomind')
def hf_command_line_test(config,
case,
case_info,
model_case,
type,
cuda_prefix: str = None):
model_path = config.get('model_path') + '/' + model_case
cmd = get_command_with_extra(' '.join(['lmdeploy chat', type, model_path]),
config,
model_case,
need_tp=True,
cuda_prefix=cuda_prefix)
if 'kvint8' in model_case:
cmd += ' --quant-policy 4'
if 'w4' in model_case or '4bits' in model_case:
cmd += ' --model-format awq'
else:
cmd += ' --model-format hf'
elif 'w4' in model_case or '4bits' in model_case:
cmd += ' --model-format awq'
return command_test(config, [cmd], model_case,
'_'.join(['hf', type, case]), case_info, True)
def command_test(config, cmd, model, case, case_info, need_extract_output):
if 'memory_test' in case and 'chat' not in model.lower():
return True, None, 'memory case skipped for base model'
try:
log_path = config.get('log_path')
model_name = get_model_name(model)
if '/' in model:
chat_log = os.path.join(
log_path, 'chat_' + model.split('/')[1] + '_' + case + '.log')
else:
chat_log = os.path.join(log_path,
'chat_' + model + '_' + case + '.log')
file = open(chat_log, 'w')
returncode = -1
result = True
print('reproduce command chat: ' + ' '.join(cmd) + '\n')
file.writelines('reproduce command chat: ' + ' '.join(cmd) + '\n')
spliter = '\n\n'
if 'CodeLlama-7b-Instruct-hf' in model:
spliter = '\n!!\n'
# join prompt together
prompt = ''
for item in case_info:
prompt += list(item.keys())[0] + spliter
prompt += 'exit' + spliter
msg = ''
with Popen(cmd,
stdin=PIPE,
stdout=PIPE,
stderr=PIPE,
shell=True,
text=True,
encoding='utf-8') as proc:
# file.writelines('prompt:' + prompt + '\n')
outputs, errors = proc.communicate(input=prompt)
returncode = proc.returncode
if returncode != 0:
file.writelines('error:' + errors + '\n')
result = False
return result, chat_log, errors
outputDialogs = parse_dialogue(outputs, model)
file.writelines('answersize:' + str(len(outputDialogs)) + '\n')
            # check each answer against the case rules
index = 0
for prompt_detail in case_info:
if need_extract_output:
output = extract_output(outputDialogs[index], model)
else:
output = outputDialogs[index]
case_result, reason = assert_result(output,
prompt_detail.values(),
model_name)
file.writelines('prompt:' + list(prompt_detail.keys())[0] +
'\n')
file.writelines('output:' + output + '\n')
file.writelines('result:' + str(case_result) + ',reason:' +
reason + '\n')
index += 1
if case_result is False:
msg = reason
result = result & case_result
file.close()
return result, chat_log, msg
except Exception as e:
return False, None, f'Unknown error: {e}'
# parse the dialogue turns produced by the model from the raw CLI output
def parse_dialogue(inputs: str, model: str):
dialogues = inputs.strip()
if 'CodeLlama-7b-Instruct-hf' in model:
sep = 'enter !! to end the input >>>'
else:
sep = 'double enter to end input >>>'
dialogues = dialogues.strip()
dialogues = dialogues.split(sep)
dialogues = [d.strip() for d in dialogues]
    return dialogues[1:-1]  # drop the leading banner and the trailing exit segment
def extract_output(output: str, model: str):
if 'Qwen' in model or 'internlm2' in model:
if len(output.split('<|im_start|>assistant')) >= 2:
return output.split('<|im_start|>assistant')[1]
if 'Baichuan2' in model:
if len(output.split('<reserved_107>')) >= 2:
return output.split('<reserved_107>')[1]
if 'internlm' in model:
if len(output.split('<|Bot|>: ')) >= 2:
return output.split('<|Bot|>: ')[1]
if 'llama' in model or 'Llama' in model:
if len(output.split('[/INST]')) >= 2:
return output.split('[/INST]')[1]
return output
import os
import random
import string
from utils.rule_condition_assert import assert_result
from lmdeploy.serve.openai.api_client import APIClient
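# Multi-turn chat against a running api_server through APIClient.chat_completions_v1;
# each answer is checked with assert_result and written to a per-worker log.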
def open_chat_test(config, case_info, model, url, worker_id: str = 'default'):
log_path = config.get('log_path')
restful_log = os.path.join(log_path,
'restful_' + model + '_' + worker_id + '.log')
file = open(restful_log, 'w')
result = True
api_client = APIClient(url)
model_name = api_client.available_models[0]
messages = []
msg = ''
for prompt_detail in case_info:
if result is False:
break
prompt = list(prompt_detail.keys())[0]
messages.append({'role': 'user', 'content': prompt})
file.writelines('prompt:' + prompt + '\n')
for output in api_client.chat_completions_v1(model=model_name,
messages=messages,
temperature=0.01):
output_message = output.get('choices')[0].get('message')
messages.append(output_message)
output_content = output_message.get('content')
file.writelines('output:' + output_content + '\n')
case_result, reason = assert_result(output_content,
prompt_detail.values(),
model_name)
file.writelines('result:' + str(case_result) + ',reason:' +
reason + '\n')
            if case_result is False:
                msg += reason
            result = result & case_result
file.close()
return result, restful_log, msg
def interactive_test(config,
case_info,
model,
url,
worker_id: str = 'default'):
log_path = config.get('log_path')
interactive_log = os.path.join(
log_path, 'interactive_' + model + '_' + worker_id + '.log')
file = open(interactive_log, 'w')
result = True
api_client = APIClient(url)
file.writelines('available_models:' +
','.join(api_client.available_models) + '\n')
# Randomly generate 6 characters and concatenate them into a string.
characters = string.digits
random_chars = ''.join(random.choice(characters) for i in range(6))
messages = []
msg = ''
for prompt_detail in case_info:
prompt = list(prompt_detail.keys())[0]
new_prompt = {'role': 'user', 'content': prompt}
messages.append(new_prompt)
file.writelines('prompt:' + prompt + '\n')
for output in api_client.chat_interactive_v1(prompt=prompt,
interactive_mode=True,
session_id=random_chars,
temperature=0.01):
output_content = output.get('text')
file.writelines('output:' + output_content + '\n')
case_result, reason = assert_result(output_content,
prompt_detail.values(), model)
file.writelines('result:' + str(case_result) + ',reason:' +
reason + '\n')
            if case_result is False:
                msg += reason
            result = result & case_result
file.close()
return result, interactive_log, msg
def health_check(url):
try:
api_client = APIClient(url)
model_name = api_client.available_models[0]
messages = []
messages.append({'role': 'user', 'content': '你好'})
for output in api_client.chat_completions_v1(model=model_name,
messages=messages,
temperature=0.01):
if output.get('code') is not None and output.get('code') != 0:
return False
return True
except Exception:
return False
def get_model(url):
try:
api_client = APIClient(url)
model_name = api_client.available_models[0]
return model_name
except Exception:
return None
#!/bin/bash
dataset_path="benchmark/ShareGPT_V3_unfiltered_cleaned_split.json"
########################################## PyTorch engine: fp16 or bf16 ##########################################
## 7B
tp=1
max_batch_size=256
cache_max_entry_count=0.95
model_path="/workspace/models-140/llama2/huggingface/llama-2-7b-chat"
CUDA_VISIBLE_DEVICES="6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --backend pytorch --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --csv llama2_pt_7b_thr.csv
## 13B
tp=1
max_batch_size=256
cache_max_entry_count=0.9
model_path="/workspace/models-140/llama2/huggingface/llama-2-13b-chat"
CUDA_VISIBLE_DEVICES="6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --backend pytorch --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --csv llama2_pt_7b_thr.csv
# 20B
tp=2
max_batch_size=256
cache_max_entry_count=0.9
model_path="/workspace/models-140/InternLM/internlm-chat-20b"
CUDA_VISIBLE_DEVICES="5,6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --backend pytorch --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --csv llama2_pt_7b_thr.csv
# 70B
tp=4
max_batch_size=256
cache_max_entry_count=0.9
model_path="/workspace/models-140/llama2/huggingface/llama-2-70b-chat-hf"
CUDA_VISIBLE_DEVICES="4,5,6,7" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --backend pytorch --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --csv llama2_pt_7b_thr.csv
########################################## PyTorch engine: w8a8 ##########################################
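# No w8a8 benchmark run is scripted here yet. As a rough sketch only (the model path
# and csv name below are assumptions, not part of this script), a smooth_quant-ed
# model directory could be profiled the same way as the fp16 runs above:
# tp=1
# max_batch_size=256
# cache_max_entry_count=0.95
# model_path="/workspace/models/quantization/llama-2-7b-chat-w8a8"
# CUDA_VISIBLE_DEVICES="6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --backend pytorch --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --csv llama2_pt_7b_w8a8_thr.csv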
#!/bin/bash
dataset_path="benchmark/ShareGPT_V3_unfiltered_cleaned_split.json"
########################################## TurboMind engine: fp16 or bf16 ##########################################
# 7B. gemm_tune -> profile_throughput
tp=1
max_batch_size=256
cache_max_entry_count=0.95
model_path="/workspace/models-140/llama2/huggingface/llama-2-7b-chat"
CUDA_VISIBLE_DEVICES="6" python3 -m lmdeploy.turbomind.generate_gemm_config --tensor-para-size ${tp} --max-batch-size ${max_batch_size} --model-path ${model_path}
CUDA_VISIBLE_DEVICES="6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --csv llama2_tb_7b_thr.csv
rm gemm_config.in
# 13B. gemm_tune -> profile_throughput
tp=1
max_batch_size=256
cache_max_entry_count=0.9
model_path="/workspace/models-140/llama2/huggingface/llama-2-13b-chat"
CUDA_VISIBLE_DEVICES="6" python3 -m lmdeploy.turbomind.generate_gemm_config --tensor-para-size ${tp} --max-batch-size ${max_batch_size} --model-path ${model_path}
CUDA_VISIBLE_DEVICES="6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --csv llama2_tb_13b_thr.csv
rm gemm_config.in
# 20B. gemm_tune -> profile_throughput
tp=2
max_batch_size=256
cache_max_entry_count=0.9
model_path="/workspace/models-140/InternLM/internlm-chat-20b"
CUDA_VISIBLE_DEVICES="5,6" python3 -m lmdeploy.turbomind.generate_gemm_config --tensor-para-size ${tp} --max-batch-size ${max_batch_size} --model-path ${model_path}
CUDA_VISIBLE_DEVICES="5,6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --csv internlm_tb_20b_thr.csv
rm gemm_config.in
# 70B
tp=4
max_batch_size=256
cache_max_entry_count=0.9
model_path="/workspace/models-140/llama2/huggingface/llama-2-70b-chat-hf"
CUDA_VISIBLE_DEVICES="4,5,6,7" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --csv llama2_tb_70b_thr.csv
# ########################################## TurboMind engine: w4a16 ##########################################
# 7B
tp=1
max_batch_size=256
cache_max_entry_count=0.95
model_path="/workspace/models/quantization/llama-2-7b-chat-4bit"
CUDA_VISIBLE_DEVICES="6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --model-format awq --num-prompts 10000 --csv llama2_tb_7b_4bit_thr.csv
# 13B
tp=1
max_batch_size=256
cache_max_entry_count=0.9
model_path="/workspace/models/quantization/llama-2-13b-chat-4bit"
CUDA_VISIBLE_DEVICES="6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --model-format awq --num-prompts 10000 --csv llama2_tb_13b_4bit_thr.csv
# 20B
tp=2
max_batch_size=256
cache_max_entry_count=0.9
model_path="/workspace/models/quantization/internlm-chat-20b-4bit"
CUDA_VISIBLE_DEVICES="5,6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --model-format awq --num-prompts 10000 --csv internlm_tb_20b_4bit_thr.csv
# 70B
tp=4
max_batch_size=256
cache_max_entry_count=0.9
model_path="/workspace/models/quantization/llama-2-70b-chat-hf-4bit"
CUDA_VISIBLE_DEVICES="4,5,6,7" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --model-format awq --num-prompts 10000 --csv llama2_tb_70b_4bit_thr.csv
version: 2
formats: all
build:
os: "ubuntu-22.04"
tools:
python: "3.8"
sphinx:
configuration: docs/en/conf.py
python:
install:
- requirements: requirements/docs.txt
- requirements: requirements/readthedocs.txt
# Customized chat template
The effect of the applied chat template can be observed by **setting the log level** to `INFO`.
LMDeploy supports two methods of adding chat templates:
- One approach is to utilize an existing conversation template by directly configuring a JSON file like the following.
```json
{
"model_name": "your awesome chat template name",
"system": "<|im_start|>system\n",
"meta_instruction": "You are a robot developed by LMDeploy.",
"eosys": "<|im_end|>\n",
"user": "<|im_start|>user\n",
"eoh": "<|im_end|>\n",
"assistant": "<|im_start|>assistant\n",
"eoa": "<|im_end|>",
"separator": "\n",
"capability": "chat",
"stop_words": ["<|im_end|>"]
}
```
`model_name` is a required field and can be either the name of an LMDeploy built-in chat template (which can be viewed through `lmdeploy list`), or a new name. Other fields are optional.
1. When `model_name` is the name of a built-in chat template, the non-null fields in the JSON file will override the corresponding attributes of the original chat template.
2. However, when `model_name` is a new name, it will register `BaseChatTemplate` directly as a new chat template. The specific definition can be referred to [BaseChatTemplate](https://github.com/InternLM/lmdeploy/blob/24bd4b9ab6a15b3952e62bcfc72eaba03bce9dcb/lmdeploy/model.py#L113-L188).
The new chat template would be like this:
```
{system}{meta_instruction}{eosys}{user}{user_content}{eoh}{assistant}{assistant_content}{eoa}{separator}{user}...
```
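For instance, with the JSON file above, a single-turn conversation in which the user says "hi" would be rendered roughly as follows (the assistant line is only a placeholder for the generated reply):

```
<|im_start|>system
You are a robot developed by LMDeploy.<|im_end|>
<|im_start|>user
hi<|im_end|>
<|im_start|>assistant
I am a robot developed by LMDeploy.<|im_end|>
```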
When using the CLI tool, you can pass in a custom chat template with `--chat-template`, for example:
```shell
lmdeploy serve api_server internlm/internlm2-chat-7b --chat-template ${JSON_FILE}
```
You can also pass it in through the interface function, for example:
```python
from lmdeploy import ChatTemplateConfig, serve
serve('internlm/internlm2-chat-7b',
chat_template_config=ChatTemplateConfig.from_json('${JSON_FILE}'))
```
- Another approach is to customize a Python chat template class like the existing LMDeploy chat templates. It can be used directly after successful registration. The advantages are a high degree of customization and strong controllability. Below is an example of registering an LMDeploy chat template.
```python
from lmdeploy.model import MODELS, BaseChatTemplate
@MODELS.register_module(name='customized_model')
class CustomizedModel(BaseChatTemplate):
"""A customized chat template."""
def __init__(self,
system='<|im_start|>system\n',
meta_instruction='You are a robot developed by LMDeploy.',
user='<|im_start|>user\n',
assistant='<|im_start|>assistant\n',
eosys='<|im_end|>\n',
eoh='<|im_end|>\n',
eoa='<|im_end|>',
separator='\n',
stop_words=['<|im_end|>', '<|action_end|>']):
super().__init__(system=system,
meta_instruction=meta_instruction,
eosys=eosys,
user=user,
eoh=eoh,
assistant=assistant,
eoa=eoa,
separator=separator,
stop_words=stop_words)
from lmdeploy import ChatTemplateConfig, pipeline
messages = [{'role': 'user', 'content': 'who are you?'}]
pipe = pipeline('internlm/internlm2-chat-7b',
chat_template_config=ChatTemplateConfig('customized_model'))
for response in pipe.stream_infer(messages):
print(response.text, end='')
```
In this example, we register an LMDeploy chat template whose meta instruction says the model was developed by LMDeploy, so when the user asks who the model is, it will answer that it was created by LMDeploy.
# How to debug Turbomind
Turbomind is implemented in C++, which is not as easy to debug as Python. This document provides basic methods for debugging Turbomind.
## Prerequisite
First, complete the local compilation according to the commands in [Build in localhost](../build.md).
## Configure Python debug environment
Since many large companies still use CentOS 7 in their online production environments, we will use CentOS 7 as an example to illustrate the process.
### Obtain `glibc` and `python3` versions
```bash
rpm -qa | grep glibc
rpm -qa | grep python3
```
The result should be similar to this:
```
[username@hostname workdir]# rpm -qa | grep glibc
glibc-2.17-325.el7_9.x86_64
glibc-common-2.17-325.el7_9.x86_64
glibc-headers-2.17-325.el7_9.x86_64
glibc-devel-2.17-325.el7_9.x86_64
[username@hostname workdir]# rpm -qa | grep python3
python3-pip-9.0.3-8.el7.noarch
python3-rpm-macros-3-34.el7.noarch
python3-rpm-generators-6-2.el7.noarch
python3-setuptools-39.2.0-10.el7.noarch
python3-3.6.8-21.el7_9.x86_64
python3-devel-3.6.8-21.el7_9.x86_64
python3.6.4-sre-1.el6.x86_64
```
Based on the information above, we can see that the version of `glibc` is `2.17-325.el7_9.x86_64` and the version of `python3` is `3.6.8-21.el7_9.x86_64`.
### Download and install `debuginfo` library
Download `glibc-debuginfo-common-2.17-325.el7.x86_64.rpm`, `glibc-debuginfo-2.17-325.el7.x86_64.rpm`, and `python3-debuginfo-3.6.8-21.el7.x86_64.rpm` from http://debuginfo.centos.org/7/x86_64.
```bash
rpm -ivh glibc-debuginfo-common-2.17-325.el7.x86_64.rpm
rpm -ivh glibc-debuginfo-2.17-325.el7.x86_64.rpm
rpm -ivh python3-debuginfo-3.6.8-21.el7.x86_64.rpm
```
### Upgrade GDB
```bash
sudo yum install devtoolset-10 -y
echo "source scl_source enable devtoolset-10" >> ~/.bashrc
source ~/.bashrc
```
### Verification
```bash
gdb python3
```
The output should be similar to this:
```
[username@hostname workdir]# gdb python3
GNU gdb (GDB) Red Hat Enterprise Linux 9.2-10.el7
Copyright (C) 2020 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.
Type "show copying" and "show warranty" for details.
This GDB was configured as "x86_64-redhat-linux-gnu".
Type "show configuration" for configuration details.
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>.
Find the GDB manual and other documentation resources online at:
<http://www.gnu.org/software/gdb/documentation/>.
For help, type "help".
Type "apropos word" to search for commands related to "word"...
Reading symbols from python3...
(gdb)
```
If it shows `Reading symbols from python3`, the configuration has been successful.
For other operating systems, please refer to [DebuggingWithGdb](https://wiki.python.org/moin/DebuggingWithGdb).
## Set up symbolic links
After setting up symbolic links, there is no need to install it locally with `pip` every time.
```bash
# Change directory to lmdeploy, e.g.
cd /workdir/lmdeploy
# Since it has been built in the build directory
# Link the lib directory
cd lmdeploy && ln -s ../build/lib . && cd ..
# (Optional) Link compile_commands.json for clangd index
ln -s build/compile_commands.json .
```
## Start debugging
````bash
# Use gdb to start the API server with Llama-2-13b-chat-hf, e.g.
gdb --args python3 -m lmdeploy serve api_server /workdir/Llama-2-13b-chat-hf
# Set directories in gdb
Reading symbols from python3...
(gdb) set directories /workdir/lmdeploy
# Set a breakpoint using the relative path, e.g.
(gdb) b src/turbomind/models/llama/BlockManager.cc:104
# When it shows
# ```
# No source file named src/turbomind/models/llama/BlockManager.cc.
# Make breakpoint pending on future shared library load? (y or [n])
# ```
# Just type `y` and press enter
# Run
(gdb) r
# (Optional) Use https://github.com/InternLM/lmdeploy/blob/main/benchmark/profile_restful_api.py to send a request
python3 profile_restful_api.py --server_addr 127.0.0.1:23333 --tokenizer_path /workdir/Llama-2-13b-chat-hf --dataset /workdir/ShareGPT_V3_unfiltered_cleaned_split.json --concurrency 1 --num_prompts 1
````
## Using GDB
Refer to [GDB Execution Commands](https://lldb.llvm.org/use/map.html) and happy debugging.
# Context length extrapolation
Long-text extrapolation refers to the ability of an LLM to handle, at inference time, input longer than the text it was trained on. The TurboMind engine now supports [LlamaDynamicNTKScalingRotaryEmbedding](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L178), and the implementation is consistent with HuggingFace.
## Usage
You can enable the context length extrapolation ability by modifying `TurbomindEngineConfig`: set `session_len` to the expected length and set `rope_scaling_factor` to a number no less than 1.0.
Here is an example:
```python
from lmdeploy import pipeline, GenerationConfig, TurbomindEngineConfig
backend_config = TurbomindEngineConfig(rope_scaling_factor=2.0, session_len=160000)
pipe = pipeline('internlm/internlm2-chat-7b', backend_config=backend_config)
prompt = 'Use a long prompt to replace this sentence'
gen_config = GenerationConfig(top_p=0.8,
top_k=40,
temperature=0.8,
max_new_tokens=1024)
response = pipe(prompt, gen_config=gen_config)
print(response)
```
## Evaluation
We use several methods to evaluate the long-context inference ability of LMDeploy, including [passkey retrieval](#passkey-retrieval), [needle in a haystack](#needle-in-a-haystack) and computing [perplexity](#perplexity).
### Passkey Retrieval
You can try the following code to test whether LMDeploy can retrieve the pass key across several rounds.
```python
import numpy as np
from lmdeploy import pipeline
from lmdeploy import TurbomindEngineConfig
session_len = 160000
backend_config = TurbomindEngineConfig(rope_scaling_factor=2.0, session_len=session_len)
pipe = pipeline('internlm/internlm2-chat-7b', backend_config=backend_config)
def passkey_retrieval(session_len, n_round=5):
# create long context input
tok = pipe.tokenizer
task_description = 'There is an important info hidden inside a lot of irrelevant text. Find it and memorize them. I will quiz you about the important information there.'
garbage = 'The grass is green. The sky is blue. The sun is yellow. Here we go. There and back again.'
for _ in range(n_round):
n_times = (session_len - 1000) // len(tok.encode(garbage))
n_garbage_prefix = np.random.randint(0, n_times)
n_garbage_suffix = n_times - n_garbage_prefix
garbage_prefix = ' '.join([garbage] * n_garbage_prefix)
garbage_suffix = ' '.join([garbage] * n_garbage_suffix)
pass_key = np.random.randint(1, 50000)
information_line = f'The pass key is {pass_key}. Remember it. {pass_key} is the pass key.' # noqa: E501
final_question = 'What is the pass key? The pass key is'
lines = [
task_description,
garbage_prefix,
information_line,
garbage_suffix,
final_question,
]
# inference
prompt = ' '.join(lines)
response = pipe([prompt])
print(pass_key, response)
passkey_retrieval(session_len, 5)
```
### Needle In A Haystack
[OpenCompass](https://github.com/open-compass/opencompass) offers very useful tools to perform needle-in-a-haystack evaluation. For specific instructions, please refer to the [guide](https://github.com/open-compass/opencompass/blob/main/docs/en/advanced_guides/needleinahaystack_eval.md).
### Perplexity
The following codes demonstrate how to use LMDeploy to calculate perplexity.
```python
from datasets import load_dataset
from lmdeploy import TurbomindEngineConfig
from lmdeploy.turbomind import TurboMind
import numpy as np
# load model and tokenizer
engine_config = TurbomindEngineConfig(rope_scaling_factor=2.0, session_len=160000)
engine = TurboMind.from_pretrained('internlm/internlm2-chat-7b', engine_config)
tokenizer = engine.tokenizer
generator = engine.create_instance()
# get perplexity
text = 'Use a long prompt to replace this sentence'
input_ids = tokenizer.encode(text)
loss = generator.get_ppl(input_ids)[0]
ppl = np.exp(loss)
```