Commit fe851fbc authored by zhouxiang

Supplementary new files for the 0.2.6 release

parent e2d98ddc
import os
import allure
import pytest
from utils.config_utils import get_cuda_prefix_by_workerid
from utils.quantization_utils import quantization
model_list = [
'meta-llama/Llama-2-7b-chat', 'internlm/internlm-chat-20b',
'Qwen/Qwen-7B-Chat', 'Qwen/Qwen-14B-Chat', 'Qwen/Qwen-VL',
'internlm/internlm2-chat-20b', 'internlm/internlm2-20b',
'baichuan-inc/Baichuan2-7B-Chat'
]
@pytest.mark.order(3)
@pytest.mark.quantization_w4a16
@pytest.mark.timeout(900)
@pytest.mark.parametrize('model', model_list)
def test_quantization_w4a16(config, model, worker_id):
quantization_w4a16(config, model + '-inner-w4a16', model,
get_cuda_prefix_by_workerid(worker_id))
@pytest.mark.order(3)
@pytest.mark.quantization_w4a16
@pytest.mark.pr_test
@pytest.mark.flaky(reruns=0)
@pytest.mark.timeout(900)
@pytest.mark.parametrize(
'model, prefix',
[('internlm/internlm2-chat-20b', 'CUDA_VISIBLE_DEVICES=5')])
def test_quantization_w4a16_pr(config, model, prefix):
quantization_w4a16(config, model + '-inner-w4a16', model, prefix)
def quantization_w4a16(config, quantization_model_name, origin_model_name,
cuda_prefix):
quantization_type = 'w4a16'
result, msg = quantization(config, quantization_model_name,
origin_model_name, quantization_type,
cuda_prefix)
log_path = config.get('log_path')
quantization_log = os.path.join(
log_path, '_'.join([
'quantization', quantization_type,
quantization_model_name.split('/')[1]
]) + '.log')
allure.attach.file(quantization_log,
attachment_type=allure.attachment_type.TEXT)
assert result, msg
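# Illustrative log naming (log_path comes from config): for model
# 'internlm/internlm2-chat-20b' the quantized name becomes
# 'internlm/internlm2-chat-20b-inner-w4a16' and the step writes
# <log_path>/quantization_w4a16_internlm2-chat-20b-inner-w4a16.log,
# which is attached to the allure report before the final assertion.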
import os
import allure
import pytest
from utils.config_utils import get_cuda_prefix_by_workerid
from utils.quantization_utils import quantization
model_list = [
'meta-llama/Llama-2-7b-chat', 'internlm/internlm-chat-20b',
'internlm/internlm2-chat-20b', 'internlm/internlm2-chat-7b',
'01-ai/Yi-6B-Chat', 'internlm/internlm2-20b'
]
@pytest.mark.order(2)
@pytest.mark.quantization_w8a8
@pytest.mark.timeout(900)
@pytest.mark.parametrize('model', model_list)
def test_quantization_w8a8(config, model, worker_id):
quantization_w8a8(config, model + '-inner-w8a8', model,
get_cuda_prefix_by_workerid(worker_id))
def quantization_w8a8(config, quantization_model_name, origin_model_name,
cuda_prefix):
quantization_type = 'w8a8'
result, msg = quantization(config, quantization_model_name,
origin_model_name, quantization_type,
cuda_prefix)
log_path = config.get('log_path')
quantization_log = os.path.join(
log_path, '_'.join([
'quantization', quantization_type,
quantization_model_name.split('/')[1]
]) + '.log')
allure.attach.file(quantization_log,
attachment_type=allure.attachment_type.TEXT)
assert result, msg
import os
import subprocess
from time import sleep, time
import allure
import pytest
from pytest import assume
from utils.config_utils import (get_cuda_prefix_by_workerid,
get_torch_model_list, get_workerid)
from utils.get_run_config import get_command_with_extra
from utils.run_client_chat import command_line_test
from utils.run_restful_chat import (get_model, health_check, interactive_test,
open_chat_test)
BASE_HTTP_URL = 'http://localhost'
DEFAULT_PORT = 23333
@pytest.fixture(scope='function', autouse=True)
def prepare_environment(request, config, worker_id):
model_path = config.get('model_path')
log_path = config.get('log_path')
param = request.param
model = param['model']
cuda_prefix = param['cuda_prefix']
tp_num = param['tp_num']
if cuda_prefix is None:
cuda_prefix = get_cuda_prefix_by_workerid(worker_id, tp_num=tp_num)
worker_num = get_workerid(worker_id)
if worker_num is None:
port = DEFAULT_PORT
else:
port = DEFAULT_PORT + worker_num
cmd = get_command_with_extra('lmdeploy serve api_server ' + model_path +
'/' + model + ' --backend pytorch' +
' --server-port ' + str(port),
config,
model,
need_tp=True,
cuda_prefix=cuda_prefix)
print('reproduce command restful: ' + cmd)
start_log = os.path.join(log_path,
'start_restful_' + model.split('/')[1] + '.log')
with open(start_log, 'w') as f:
f.writelines('reproduce command restful: ' + cmd + '\n')
# start the api_server in the background
convertRes = subprocess.Popen([cmd],
stdout=f,
stderr=f,
shell=True,
text=True,
encoding='utf-8')
pid = convertRes.pid
allure.attach.file(start_log, attachment_type=allure.attachment_type.TEXT)
http_url = BASE_HTTP_URL + ':' + str(port)
start_time = int(time())
sleep(5)
for i in range(120):
sleep(1)
end_time = int(time())
total_time = end_time - start_time
result = health_check(http_url)
if result or total_time >= 120:
break
yield
if pid > 0:
kill_log = os.path.join(log_path,
'kill_' + model.split('/')[1] + '.log')
with open(kill_log, 'w') as f:
convertRes.kill()
allure.attach.file(kill_log, attachment_type=allure.attachment_type.TEXT)
def getModelList(tp_num):
return [{
'model': item,
'cuda_prefix': None,
'tp_num': tp_num
} for item in get_torch_model_list(tp_num) if 'chat' in item.lower()]
@pytest.mark.order(7)
@pytest.mark.usefixtures('common_case_config')
@pytest.mark.restful_api_pytorch
@pytest.mark.gpu_num_1
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('prepare_environment',
getModelList(tp_num=1),
indirect=True)
def test_restful_chat_tp1(config, common_case_config, worker_id):
if get_workerid(worker_id) is None:
run_all_step(config, common_case_config)
else:
run_all_step(config,
common_case_config,
worker_id=worker_id,
port=DEFAULT_PORT + get_workerid(worker_id))
@pytest.mark.order(7)
@pytest.mark.usefixtures('common_case_config')
@pytest.mark.restful_api_pytorch
@pytest.mark.gpu_num_2
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('prepare_environment',
getModelList(tp_num=2),
indirect=True)
def test_restful_chat_tp2(config, common_case_config, worker_id):
if get_workerid(worker_id) is None:
run_all_step(config, common_case_config)
else:
run_all_step(config,
common_case_config,
worker_id=worker_id,
port=DEFAULT_PORT + get_workerid(worker_id))
def run_all_step(config,
cases_info,
worker_id: str = 'default',
port: int = DEFAULT_PORT):
http_url = BASE_HTTP_URL + ':' + str(port)
model = get_model(http_url)
if model is None:
assert False, 'server did not start correctly'
for case in cases_info.keys():
if (case == 'memory_test'
or case == 'emoji_case') and 'chat' not in model.lower():
continue
case_info = cases_info.get(case)
with allure.step(case + ' step1 - command chat regression'):
chat_result, chat_log, msg = command_line_test(
config, case, case_info, model + worker_id, 'api_client',
http_url)
if chat_log is not None:
allure.attach.file(chat_log,
attachment_type=allure.attachment_type.TEXT)
with assume:
assert chat_result, msg
with allure.step(case + ' step2 - restful_test - openai chat'):
restful_result, restful_log, msg = open_chat_test(
config, case_info, model, http_url, worker_id)
allure.attach.file(restful_log,
attachment_type=allure.attachment_type.TEXT)
with assume:
assert restful_result, msg
with allure.step(case + ' step3 - restful_test - interactive chat'):
active_result, interactive_log, msg = interactive_test(
config, case_info, model, http_url, worker_id)
allure.attach.file(interactive_log,
attachment_type=allure.attachment_type.TEXT)
with assume:
assert active_result, msg
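# Illustrative worker-to-port mapping (pytest-xdist worker ids are hypothetical):
# 'gw0' -> port 23333, 'gw2' -> port 23335; a non-distributed run ('master')
# has no 'gw' prefix, so get_workerid() returns None and DEFAULT_PORT is used.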
import os
import subprocess
from time import sleep, time
import allure
import pytest
from pytest import assume
from utils.config_utils import (get_all_model_list,
get_cuda_prefix_by_workerid, get_workerid)
from utils.get_run_config import get_command_with_extra
from utils.run_client_chat import command_line_test
from utils.run_restful_chat import (get_model, health_check, interactive_test,
open_chat_test)
BASE_HTTP_URL = 'http://localhost'
DEFAULT_PORT = 23333
@pytest.fixture(scope='function', autouse=True)
def prepare_environment(request, config, worker_id):
model_path = config.get('model_path')
log_path = config.get('log_path')
param = request.param
model = param['model']
cuda_prefix = param['cuda_prefix']
tp_num = param['tp_num']
if cuda_prefix is None:
cuda_prefix = get_cuda_prefix_by_workerid(worker_id, tp_num=tp_num)
worker_num = get_workerid(worker_id)
if worker_num is None:
port = DEFAULT_PORT
else:
port = DEFAULT_PORT + worker_num
cmd = get_command_with_extra('lmdeploy serve api_server ' + model_path +
'/' + model + ' --server-port ' + str(port),
config,
model,
need_tp=True,
cuda_prefix=cuda_prefix)
if 'kvint8' in model:
cmd += ' --quant-policy 4'
if 'w4' in model or '4bits' in model:
cmd += ' --model-format awq'
else:
cmd += ' --model-format hf'
elif 'w4' in model or '4bits' in model:
cmd += ' --model-format awq'
start_log = os.path.join(log_path,
'start_restful_' + model.split('/')[1] + '.log')
print('reproduce command restful: ' + cmd)
with open(start_log, 'w') as f:
f.writelines('reproduce command restful: ' + cmd + '\n')
# start the api_server in the background
convertRes = subprocess.Popen([cmd],
stdout=f,
stderr=f,
shell=True,
text=True,
encoding='utf-8')
pid = convertRes.pid
allure.attach.file(start_log, attachment_type=allure.attachment_type.TEXT)
http_url = BASE_HTTP_URL + ':' + str(port)
start_time = int(time())
sleep(5)
for i in range(120):
sleep(1)
end_time = int(time())
total_time = end_time - start_time
result = health_check(http_url)
if result or total_time >= 120:
break
yield
if pid > 0:
kill_log = os.path.join(log_path,
'kill_' + model.split('/')[1] + '.log')
with open(kill_log, 'w') as f:
convertRes.kill()
allure.attach.file(kill_log, attachment_type=allure.attachment_type.TEXT)
def getModelList(tp_num):
return [{
'model': item,
'cuda_prefix': None,
'tp_num': tp_num
} for item in get_all_model_list(tp_num) if 'chat' in item.lower()]
@pytest.mark.order(7)
@pytest.mark.usefixtures('common_case_config')
@pytest.mark.restful_api
@pytest.mark.gpu_num_1
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('prepare_environment',
getModelList(tp_num=1),
indirect=True)
def test_restful_chat_tp1(request, config, common_case_config, worker_id):
if get_workerid(worker_id) is None:
run_all_step(config, common_case_config)
else:
run_all_step(config,
common_case_config,
worker_id=worker_id,
port=DEFAULT_PORT + get_workerid(worker_id))
@pytest.mark.order(7)
@pytest.mark.usefixtures('common_case_config')
@pytest.mark.restful_api
@pytest.mark.gpu_num_2
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('prepare_environment',
getModelList(tp_num=2),
indirect=True)
def test_restful_chat_tp2(config, common_case_config, worker_id):
if get_workerid(worker_id) is None:
run_all_step(config, common_case_config)
else:
run_all_step(config,
common_case_config,
worker_id=worker_id,
port=DEFAULT_PORT + get_workerid(worker_id))
@pytest.mark.order(7)
@pytest.mark.usefixtures('common_case_config')
@pytest.mark.restful_api
@pytest.mark.flaky(reruns=0)
@pytest.mark.pr_test
@pytest.mark.parametrize('prepare_environment', [{
'model': 'internlm/internlm2-chat-20b',
'cuda_prefix': 'CUDA_VISIBLE_DEVICES=5,6',
'tp_num': 2
}, {
'model': 'internlm/internlm2-chat-20b-inner-w4a16',
'cuda_prefix': 'CUDA_VISIBLE_DEVICES=5,6',
'tp_num': 2
}],
indirect=True)
def test_restful_chat_pr(config, common_case_config):
run_all_step(config, common_case_config)
def run_all_step(config,
cases_info,
worker_id: str = 'default',
port: int = DEFAULT_PORT):
http_url = BASE_HTTP_URL + ':' + str(port)
model = get_model(http_url)
if model is None:
assert False, 'server did not start correctly'
for case in cases_info.keys():
if (case == 'memory_test'
or case == 'emoji_case') and 'chat' not in model.lower():
continue
case_info = cases_info.get(case)
with allure.step(case + ' step1 - command chat regression'):
chat_result, chat_log, msg = command_line_test(
config, case, case_info, model, 'api_client', http_url)
allure.attach.file(chat_log,
attachment_type=allure.attachment_type.TEXT)
with assume:
assert chat_result, msg
with allure.step(case + ' step2 - restful_test - openai chat'):
restful_result, restful_log, msg = open_chat_test(
config, case_info, model, http_url)
allure.attach.file(restful_log,
attachment_type=allure.attachment_type.TEXT)
with assume:
assert restful_result, msg
with allure.step(case + ' step3 - restful_test - interactive chat'):
active_result, interactive_log, msg = interactive_test(
config, case_info, model, http_url)
allure.attach.file(interactive_log,
attachment_type=allure.attachment_type.TEXT)
with assume:
assert active_result, msg
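# Illustrative command built by prepare_environment (model name, tp_config entry
# and paths are assumptions) for a w4a16 model with tp=2 on worker 'gw0':
#   CUDA_VISIBLE_DEVICES=0,1 lmdeploy serve api_server \
#       <model_path>/internlm/internlm2-chat-20b-inner-w4a16 \
#       --server-port 23333 --tp 2 --model-format awq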
import os
import yaml
from utils.get_run_config import get_tp_num
def get_turbomind_model_list(tp_num: int = None):
config_path = os.path.join('autotest/config.yaml')
with open(config_path) as f:
config = yaml.load(f.read(), Loader=yaml.SafeLoader)
case_list = config.get('turbomind_model')
quatization_case_config = config.get('quatization_case_config')
for key in quatization_case_config.get('w4a16'):
case_list.append(key + '-inner-w4a16')
for key in quatization_case_config.get('kvint8'):
case_list.append(key + '-inner-kvint8')
for key in quatization_case_config.get('kvint8_w4a16'):
case_list.append(key + '-inner-kvint8-w4a16')
if tp_num is not None:
return [
item for item in case_list if get_tp_num(config, item) == tp_num
]
return case_list
def get_torch_model_list(tp_num: int = None):
config_path = os.path.join('autotest/config.yaml')
with open(config_path) as f:
config = yaml.load(f.read(), Loader=yaml.SafeLoader)
case_list = config.get('pytorch_model')
quatization_case_config = config.get('quatization_case_config')
for key in quatization_case_config.get('w8a8'):
case_list.append(key + '-inner-w8a8')
if tp_num is not None:
return [
item for item in case_list if get_tp_num(config, item) == tp_num
]
return case_list
def get_all_model_list(tp_num: int = None):
config_path = os.path.join('autotest/config.yaml')
with open(config_path) as f:
config = yaml.load(f.read(), Loader=yaml.SafeLoader)
case_list = config.get('turbomind_model')
for key in config.get('pytorch_model'):
if key not in case_list:
case_list.append(key)
quatization_case_config = config.get('quatization_case_config')
for key in quatization_case_config.get('w4a16'):
case_list.append(key + '-inner-w4a16')
for key in quatization_case_config.get('kvint8'):
case_list.append(key + '-inner-kvint8')
for key in quatization_case_config.get('kvint8_w4a16'):
case_list.append(key + '-inner-kvint8-w4a16')
if tp_num is not None:
return [
item for item in case_list if get_tp_num(config, item) == tp_num
]
return case_list
def get_cuda_prefix_by_workerid(worker_id, tp_num: int = 1):
if worker_id is None or 'gw' not in worker_id:
return None
else:
if tp_num == 1:
return 'CUDA_VISIBLE_DEVICES=' + worker_id.replace('gw', '')
elif tp_num == 2:
cuda_num = int(worker_id.replace('gw', '')) * 2
return 'CUDA_VISIBLE_DEVICES=' + ','.join(
[str(cuda_num), str(cuda_num + 1)])
def get_cuda_id_by_workerid(worker_id, tp_num: int = 1):
if worker_id is None or 'gw' not in worker_id:
return None
else:
if tp_num == 1:
return worker_id.replace('gw', '')
elif tp_num == 2:
cuda_num = int(worker_id.replace('gw', '')) * 2
return ','.join([str(cuda_num), str(cuda_num + 1)])
def get_workerid(worker_id):
if worker_id is None or 'gw' not in worker_id:
return None
else:
return int(worker_id.replace('gw', ''))
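# Illustrative mappings derived from the helpers above (pytest-xdist worker ids):
#   get_workerid('gw3')                   -> 3
#   get_cuda_prefix_by_workerid('gw1')    -> 'CUDA_VISIBLE_DEVICES=1'
#   get_cuda_prefix_by_workerid('gw1', 2) -> 'CUDA_VISIBLE_DEVICES=2,3'
#   get_cuda_prefix_by_workerid('master') -> None  (single-process run)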
import random
from time import sleep
import torch
from lmdeploy.model import MODELS
def get_conda_allcate_prefix(config, model):
cuda_prefix = ''
tp_num = get_tp_num(config, model)
if tp_num is None:
return cuda_prefix
available_cuda = _get_available_cude()
if len(available_cuda) < tp_num:
raise torch.cuda.OutOfMemoryError
cuda_prefix = 'CUDA_VISIBLE_DEVICES=' + ','.join(
random.sample(available_cuda, tp_num))
torch.cuda.empty_cache()
return cuda_prefix
def get_tp_config(config, model, need_tp):
tp_num = get_tp_num(config, model)
tp_info = ''
if need_tp and tp_num is not None:
tp_info = '--tp ' + str(tp_num)
return tp_info
def get_tp_num(config, model):
tp_config = config.get('tp_config')
tp_num = 1
if tp_config is None:
return None
model_name = _simple_model_name(model)
if model_name in tp_config.keys():
tp_num = tp_config.get(model_name)
return tp_num
def get_command_with_extra(cmd,
config,
model,
need_tp: bool = False,
cuda_prefix: str = None,
need_sleep: bool = True):
if need_sleep:
sleep(random.uniform(0, 5))
if cuda_prefix is None:
cuda_prefix = get_conda_allcate_prefix(config, model)
tp_config = get_tp_config(config, model, need_tp)
if cuda_prefix is not None and len(cuda_prefix) > 0:
cmd = ' '.join([cuda_prefix, cmd])
if tp_config is not None and len(tp_config) > 0:
cmd = ' '.join([cmd, tp_config])
torch.cuda.empty_cache()
return cmd
def get_model_name(model):
model_names = [
'llama', 'llama2', 'internlm', 'internlm2', 'baichuan2', 'chatglm2',
'falcon', 'yi', 'qwen1.5'
]
model_names += list(MODELS.module_dict.keys())
model_names.sort()
model_name = _simple_model_name(model)
model_name = model_name.lower()
if model_name in model_names:
return model_name
model_name = model_name.replace('-chat', '')
model_name = model_name.replace('-v0.1', '')
if model_name in model_names:
return model_name
if (model_name == 'qwen-vl'):
return 'qwen-7b'
if ('llama-2' in model_name):
return 'llama-2'
return model_name.split('-')[0]
def _get_available_cude():
devices = torch.cuda.device_count()
available_cuda = []
for i in range(devices):
if (torch.cuda.utilization(i) > 5):
continue
if ('no processes are running'
not in torch.cuda.list_gpu_processes(i)):
continue
available_cuda.append(str(i))
return available_cuda
def _simple_model_name(model):
if '/' in model:
model_name = model.split('/')[1]
else:
model_name = model
model_name = model_name.replace('-inner-w4a16', '')
model_name = model_name.replace('-inner-w8a8', '')
model_name = model_name.replace('-inner-kvint8', '')
model_name = model_name.replace('-w4a16', '')
return model_name
def _split_model_name(model):
model_name = model.split('/')[1]
return model_name
if __name__ == '__main__':
print(_simple_model_name('baichuan-inc/Baichuan2-7B-Chat-inner-w4a16'))
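# Illustrative composition by get_command_with_extra (the tp_config entry, free
# GPUs and paths are assumptions): with tp_config = {'internlm2-chat-20b': 2}
# and GPUs 4,5 idle,
#   get_command_with_extra('lmdeploy chat turbomind /models/internlm2-chat-20b',
#                          config, 'internlm/internlm2-chat-20b', need_tp=True)
# may return
#   'CUDA_VISIBLE_DEVICES=4,5 lmdeploy chat turbomind /models/internlm2-chat-20b --tp 2'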
import os
import allure
from pytest import assume
def write_log(config,
result,
msg,
is_new: bool = True,
case_path_tag: str = 'default'):
try:
log_path = os.path.join(config.get('log_path'), case_path_tag)
if is_new:
file = open(log_path, 'w')
else:
file = open(log_path, 'a')
file.writelines('result:' + str(result) + ', reason:' + msg + '\n')
file.close()
except Exception as e:
return False, None, f'Unknown error: {e}'
def assert_log(config, case_path_tag: str = 'default'):
log_path = os.path.join(config.get('log_path'), case_path_tag)
msg = ''
result = False
with open(log_path, 'r') as f:
lines = f.readlines()
for line in lines:
if 'result:False, reason:' in line:
result = False
msg = line
break
if 'result:True, reason:' in line and result is False:
result = True
allure.attach.file(log_path, attachment_type=allure.attachment_type.TEXT)
with assume:
assert result, msg
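# Log line contract shared by write_log and assert_log (content is illustrative):
#   result:True, reason:
#   result:False, reason:Shanghai doesn't exist in:...
# assert_log attaches the file to the allure report and fails on the first
# 'result:False, reason:' line it finds.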
import os
import allure
import torch
from pytest import assume
from utils.get_run_config import get_model_name, get_tp_num
from utils.rule_condition_assert import assert_result
from lmdeploy import pipeline
from lmdeploy.messages import (GenerationConfig, PytorchEngineConfig,
TurbomindEngineConfig)
def run_pipeline_chat_test(config, cases_info, model_case, type):
log_path = config.get('log_path')
tp = get_tp_num(config, model_case)
model_name = get_model_name(model_case)
model_path = config.get('model_path')
hf_path = model_path + '/' + model_case
print(' '.join([
'reproduce command:', 'python',
'autotest/tools/pipeline/pipeline_chat_script.py', type, model_case,
str(tp)
]))
if 'pytorch' == type:
backend_config = PytorchEngineConfig(tp=tp)
else:
if 'kvint8' in model_case and ('w4' in model_case
or '4bits' in model_case):
backend_config = TurbomindEngineConfig(tp=tp,
model_format='awq',
quant_policy=4)
elif 'kvint8' in model_case:
backend_config = TurbomindEngineConfig(tp=tp,
model_format='hf',
quant_policy=4)
elif 'w4' in model_case or '4bits' in model_case:
backend_config = TurbomindEngineConfig(tp=tp, model_format='awq')
else:
backend_config = TurbomindEngineConfig(tp=tp)
pipe = pipeline(hf_path, backend_config=backend_config)
# run testcases
gen_config = GenerationConfig(temperature=0.01)
for case in cases_info.keys():
if (case == 'memory_test'
or case == 'emoji_case') and 'chat' not in model_case.lower():
continue
case_info = cases_info.get(case)
pipeline_chat_log = os.path.join(
log_path,
'pipeline_chat_' + model_case.split('/')[1] + '_' + case + '.log')
file = open(pipeline_chat_log, 'w')
prompts = []
for prompt_detail in case_info:
prompt = list(prompt_detail.keys())[0]
if 'chat' not in model_case.lower(): # base model
prompts.append(prompt)
else: # chat model
prompts.append({'role': 'user', 'content': prompt})
file.writelines('prompt:' + prompt + '\n')
if 'chat' not in model_case.lower(): # base model
response = pipe(prompts, gen_config=gen_config)[-1].text
else: # chat model
response = pipe([prompts], gen_config=gen_config)[0].text
case_result, reason = assert_result(response,
prompt_detail.values(),
model_name)
if 'chat' in model_case.lower():
prompts.append({'role': 'assistant', 'content': response})
file.writelines('output:' + response + '\n')
file.writelines('result:' + str(case_result) + ', reason:' +
reason + '\n')
file.close()
del pipe
torch.cuda.empty_cache()
def assert_pipeline_chat_log(config, cases_info, model_case):
log_path = config.get('log_path')
for case in cases_info.keys():
if (case == 'memory_test'
or case == 'emoji_case') and 'chat' not in model_case.lower():
continue
msg = ''
result = False
with allure.step('case - ' + case):
pipeline_chat_log = os.path.join(
log_path, 'pipeline_chat_' + model_case.split('/')[1] + '_' +
case + '.log')
with open(pipeline_chat_log, 'r') as f:
lines = f.readlines()
for line in lines:
if 'result:False, reason:' in line:
result = False
msg = line
break
if 'result:True, reason:' in line and result is False:
result = True
allure.attach.file(pipeline_chat_log,
attachment_type=allure.attachment_type.TEXT)
with assume:
assert result, msg
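# Standalone sketch of what run_pipeline_chat_test drives per case (the model
# path is hypothetical; a 4-bit turbomind chat model with tp=2):
#   from lmdeploy import pipeline
#   from lmdeploy.messages import GenerationConfig, TurbomindEngineConfig
#   pipe = pipeline('/models/internlm2-chat-20b-inner-w4a16',
#                   backend_config=TurbomindEngineConfig(tp=2, model_format='awq'))
#   resp = pipe([[{'role': 'user', 'content': 'Hello'}]],
#               gen_config=GenerationConfig())[0].text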
import os
import subprocess
from subprocess import PIPE
def quantization(config,
quantization_model_name,
origin_model_name,
quantization_type: str = 'w4a16',
cuda_prefix: str = 'CUDA_VISIBLE_DEVICES=0'):
model_path = config.get('model_path')
log_path = config.get('log_path')
origin_model_path = config.get('model_path') + '/' + origin_model_name
quantization_model_path = model_path + '/' + quantization_model_name
quantization_log = os.path.join(
log_path, '_'.join([
'quantization', quantization_type,
quantization_model_name.split('/')[1]
]) + '.log')
if quantization_type == 'w4a16':
quantization_cmd = ' '.join([
cuda_prefix, 'lmdeploy lite auto_awq', origin_model_path,
'--work-dir', quantization_model_path
])
elif quantization_type == 'w8a8':
quantization_cmd = ' '.join([
cuda_prefix, 'lmdeploy lite smooth_quant', origin_model_path,
'--work-dir', quantization_model_path
])
elif quantization_type == 'kvint8':
quantization_cmd = ' '.join([
cuda_prefix, 'lmdeploy lite calibrate', origin_model_path,
'--work-dir', quantization_model_path
])
else:
return False, ('quantization type should be in [w4a16, w8a8, kvint8], '
'now the type is ' + quantization_type)
with open(quantization_log, 'w') as f:
# remove existing folder
subprocess.run([' '.join(['rm -rf', quantization_model_path])],
stdout=f,
stderr=f,
shell=True,
text=True,
encoding='utf-8')
if quantization_type == 'kvint8':
cp_cmd = ' '.join(
['cp -r', origin_model_path, quantization_model_path])
f.writelines('reproduce command quantization_cmd: ' + cp_cmd +
'\n')
print('reproduce command quantization_cmd: ' + cp_cmd)
subprocess.run([cp_cmd],
stdout=f,
stderr=f,
shell=True,
text=True,
encoding='utf-8')
f.writelines('reproduce command quantization_cmd: ' +
quantization_cmd + '\n')
print('reproduce command quantization_cmd: ' + quantization_cmd)
# quantization
quantizationRes = subprocess.run([quantization_cmd],
stdout=f,
stderr=PIPE,
shell=True,
text=True,
encoding='utf-8')
f.writelines(quantizationRes.stderr)
result = quantizationRes.returncode == 0
return result, quantizationRes.stderr
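# Illustrative w4a16 command produced above (model_path is an assumption):
#   CUDA_VISIBLE_DEVICES=0 lmdeploy lite auto_awq \
#       /nvme/models/internlm/internlm2-chat-20b \
#       --work-dir /nvme/models/internlm/internlm2-chat-20b-inner-w4a16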
def assert_chat_completions_batch_return(output, model_name):
assert output.get('usage').get('prompt_tokens') > 0
assert output.get('usage').get('total_tokens') > 0
assert output.get('usage').get('completion_tokens') > 0
assert output.get('usage').get('completion_tokens') + output.get(
'usage').get('prompt_tokens') == output.get('usage').get(
'total_tokens')
assert output.get('id') is not None
assert output.get('object') == 'chat.completion'
assert output.get('model') == model_name
output_message = output.get('choices')
assert len(output_message) == 1
for message in output_message:
assert message.get('finish_reason') in ['stop', 'length']
assert message.get('index') == 0
assert len(message.get('message').get('content')) > 0
assert message.get('message').get('role') == 'assistant'
def assert_chat_completions_stream_return(output,
model_name,
is_first: bool = False,
is_last: bool = False):
assert output.get('id') is not None
if is_first is False:
assert output.get('object') == 'chat.completion.chunk'
assert output.get('model') == model_name
output_message = output.get('choices')
assert len(output_message) == 1
for message in output_message:
assert message.get('delta').get('role') == 'assistant'
assert message.get('index') == 0
if is_last is False:
assert message.get('finish_reason') is None
if is_first is False and is_last is False:
assert len(message.get('delta').get('content')) >= 0
if is_last is True:
assert len(message.get('delta').get('content')) == 0
assert message.get('finish_reason') in ['stop', 'length']
def assert_chat_interactive_batch_return(output):
assert output.get('input_tokens') > 0
assert output.get('tokens') > 0
assert output.get('history_tokens') >= 0
assert output.get('finish_reason') in ['stop', 'length']
assert len(output.get('text')) > 0
def assert_chat_interactive_stream_return(output,
is_last: bool = False,
is_text_empty: bool = False,
index: int = None):
assert output.get('input_tokens') > 0
if index is not None:
assert output.get('tokens') >= index and output.get(
'tokens') <= index + 6
assert output.get('tokens') > 0
assert output.get('history_tokens') >= 0
if is_last:
assert len(output.get('text')) >= 0
assert output.get('finish_reason') in ['stop', 'length']
elif is_text_empty:
assert len(output.get('text')) == 0
assert output.get('finish_reason') is None
else:
assert len(output.get('text')) >= 0
assert output.get('finish_reason') is None
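# Response shape the helpers above expect (field values are illustrative):
#   {'id': '1', 'object': 'chat.completion', 'model': 'internlm2-chat-20b',
#    'usage': {'prompt_tokens': 5, 'completion_tokens': 7, 'total_tokens': 12},
#    'choices': [{'index': 0, 'finish_reason': 'stop',
#                 'message': {'role': 'assistant', 'content': '...'}}]}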
def assert_result(input, rule_condition, model_name):
input = input.replace('\n', '\\n')
input_lower = input.lower()
for dict in rule_condition:
if dict is None:
return True, ''
for rule in dict:
operator = list(rule.keys())[0]
value = list(rule.values())[0]
if model_name is not None and model_name == operator:
dict = value
for rule in dict:
operator = list(rule.keys())[0]
value = list(rule.values())[0]
if operator == 'contain':
if isinstance(value, list):
tmpResult = False
for word in value:
if word.lower() in input_lower:
tmpResult = True
if tmpResult is False:
return False, ','.join(
value) + " doesn't exist in " + input
else:
if value.lower() not in input_lower:
msg = value + " doesn't exist in:" + input
return False, msg
if operator == 'not_contain':
if isinstance(value, list):
for word in value:
if word.lower() in input_lower:
msg = word + " shouldn't exist in:" + input
return False, msg
else:
if value.lower() in input_lower:
msg = value + " shouldn't exist in " + input
return False, msg
if operator == 'len_g':
if len(input) < int(value):
return False, input + ' length: ' + str(
len(input)) + ', should greater than ' + str(value)
return True, ''
if __name__ == '__main__':
input = '成都的景点hot potdddd'
condition = ([[{
'contain': ['hot pot']
}, {
'contain': ['。']
}, {
'len_g': 10
}]])
print(assert_result(input, condition, None))
import os
from subprocess import PIPE, Popen
from utils.get_run_config import get_command_with_extra, get_model_name
from utils.rule_condition_assert import assert_result
def command_line_test(config,
case,
case_info,
model_case,
type,
extra: str = None,
cuda_prefix: str = None):
dst_path = config.get('dst_path')
if type == 'api_client':
cmd = 'lmdeploy serve api_client ' + extra
elif type == 'triton_client':
cmd = 'lmdeploy serve triton_client ' + extra
else:
cmd = get_command_with_extra('lmdeploy chat turbomind ' + dst_path +
'/workspace_' + model_case,
config,
model_case,
cuda_prefix=cuda_prefix)
if 'kvint8' in model_case:
cmd += ' --quant-policy 4'
if 'w4' in model_case or '4bits' in model_case:
cmd += ' --model-format awq'
else:
cmd += ' --model-format hf'
elif 'w4' in model_case or '4bits' in model_case:
cmd += ' --model-format awq'
if 'chat' not in model_case.lower():
cmd += ' --cap completion'
return command_test(config, [cmd], model_case, case, case_info,
type == 'turbomind')
def hf_command_line_test(config,
case,
case_info,
model_case,
type,
cuda_prefix: str = None):
model_path = config.get('model_path') + '/' + model_case
cmd = get_command_with_extra(' '.join(['lmdeploy chat', type, model_path]),
config,
model_case,
need_tp=True,
cuda_prefix=cuda_prefix)
if 'kvint8' in model_case:
cmd += ' --quant-policy 4'
if 'w4' in model_case or '4bits' in model_case:
cmd += ' --model-format awq'
else:
cmd += ' --model-format hf'
elif 'w4' in model_case or '4bits' in model_case:
cmd += ' --model-format awq'
return command_test(config, [cmd], model_case,
'_'.join(['hf', type, case]), case_info, True)
def command_test(config, cmd, model, case, case_info, need_extract_output):
if 'memory_test' in case and 'chat' not in model.lower():
return True, None, 'memory case skipped for base model'
try:
log_path = config.get('log_path')
model_name = get_model_name(model)
if '/' in model:
chat_log = os.path.join(
log_path, 'chat_' + model.split('/')[1] + '_' + case + '.log')
else:
chat_log = os.path.join(log_path,
'chat_' + model + '_' + case + '.log')
file = open(chat_log, 'w')
returncode = -1
result = True
print('reproduce command chat: ' + ' '.join(cmd) + '\n')
file.writelines('reproduce command chat: ' + ' '.join(cmd) + '\n')
spliter = '\n\n'
if 'CodeLlama-7b-Instruct-hf' in model:
spliter = '\n!!\n'
# join prompt together
prompt = ''
for item in case_info:
prompt += list(item.keys())[0] + spliter
prompt += 'exit' + spliter
msg = ''
with Popen(cmd,
stdin=PIPE,
stdout=PIPE,
stderr=PIPE,
shell=True,
text=True,
encoding='utf-8') as proc:
# file.writelines('prompt:' + prompt + '\n')
outputs, errors = proc.communicate(input=prompt)
returncode = proc.returncode
if returncode != 0:
file.writelines('error:' + errors + '\n')
result = False
return result, chat_log, errors
outputDialogs = parse_dialogue(outputs, model)
file.writelines('answersize:' + str(len(outputDialogs)) + '\n')
# check each answer against its expected rules
index = 0
for prompt_detail in case_info:
if need_extract_output:
output = extract_output(outputDialogs[index], model)
else:
output = outputDialogs[index]
case_result, reason = assert_result(output,
prompt_detail.values(),
model_name)
file.writelines('prompt:' + list(prompt_detail.keys())[0] +
'\n')
file.writelines('output:' + output + '\n')
file.writelines('result:' + str(case_result) + ',reason:' +
reason + '\n')
index += 1
if case_result is False:
msg = reason
result = result & case_result
file.close()
return result, chat_log, msg
except Exception as e:
return False, None, f'Unknown error: {e}'
# parse the model's replies out of the raw CLI output
def parse_dialogue(inputs: str, model: str):
dialogues = inputs.strip()
if 'CodeLlama-7b-Instruct-hf' in model:
sep = 'enter !! to end the input >>>'
else:
sep = 'double enter to end input >>>'
dialogues = dialogues.strip()
dialogues = dialogues.split(sep)
dialogues = [d.strip() for d in dialogues]
return dialogues[1:-1]  # drop the useless leading and trailing segments
def extract_output(output: str, model: str):
if 'Qwen' in model or 'internlm2' in model:
if len(output.split('<|im_start|>assistant')) >= 2:
return output.split('<|im_start|>assistant')[1]
if 'Baichuan2' in model:
if len(output.split('<reserved_107>')) >= 2:
return output.split('<reserved_107>')[1]
if 'internlm' in model:
if len(output.split('<|Bot|>: ')) >= 2:
return output.split('<|Bot|>: ')[1]
if 'llama' in model or 'Llama' in model:
if len(output.split('[/INST]')) >= 2:
return output.split('[/INST]')[1]
return output
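# Illustrative stdin/stdout contract driven by command_test (prompts are hypothetical):
#   stdin : 'Hi there\n\nName three cities\n\nexit\n\n'
#   stdout: '... double enter to end input >>> <answer 1>
#            ... double enter to end input >>> <answer 2>
#            ... double enter to end input >>>'
# parse_dialogue() splits on the prompt marker and keeps only the answers;
# extract_output() then strips model-specific role tags such as '[/INST]'.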
import os
import random
import string
from utils.rule_condition_assert import assert_result
from lmdeploy.serve.openai.api_client import APIClient
def open_chat_test(config, case_info, model, url, worker_id: str = 'default'):
log_path = config.get('log_path')
restful_log = os.path.join(log_path,
'restful_' + model + '_' + worker_id + '.log')
file = open(restful_log, 'w')
result = True
api_client = APIClient(url)
model_name = api_client.available_models[0]
messages = []
msg = ''
for prompt_detail in case_info:
if result is False:
break
prompt = list(prompt_detail.keys())[0]
messages.append({'role': 'user', 'content': prompt})
file.writelines('prompt:' + prompt + '\n')
for output in api_client.chat_completions_v1(model=model_name,
messages=messages,
temperature=0.01):
output_message = output.get('choices')[0].get('message')
messages.append(output_message)
output_content = output_message.get('content')
file.writelines('output:' + output_content + '\n')
case_result, reason = assert_result(output_content,
prompt_detail.values(),
model_name)
file.writelines('result:' + str(case_result) + ',reason:' +
reason + '\n')
if case_result is False:
msg += reason
result = result & case_result
file.close()
return result, restful_log, msg
def interactive_test(config,
case_info,
model,
url,
worker_id: str = 'default'):
log_path = config.get('log_path')
interactive_log = os.path.join(
log_path, 'interactive_' + model + '_' + worker_id + '.log')
file = open(interactive_log, 'w')
result = True
api_client = APIClient(url)
file.writelines('available_models:' +
','.join(api_client.available_models) + '\n')
# Randomly generate 6 characters and concatenate them into a string.
characters = string.digits
random_chars = ''.join(random.choice(characters) for i in range(6))
messages = []
msg = ''
for prompt_detail in case_info:
prompt = list(prompt_detail.keys())[0]
new_prompt = {'role': 'user', 'content': prompt}
messages.append(new_prompt)
file.writelines('prompt:' + prompt + '\n')
for output in api_client.chat_interactive_v1(prompt=prompt,
interactive_mode=True,
session_id=random_chars,
temperature=0.01):
output_content = output.get('text')
file.writelines('output:' + output_content + '\n')
case_result, reason = assert_result(output_content,
prompt_detail.values(), model)
file.writelines('result:' + str(case_result) + ',reason:' +
reason + '\n')
if case_result is False:
msg += reason
result = result & case_result
file.close()
return result, interactive_log, msg
def health_check(url):
try:
api_client = APIClient(url)
model_name = api_client.available_models[0]
messages = []
messages.append({'role': 'user', 'content': '你好'})
for output in api_client.chat_completions_v1(model=model_name,
messages=messages,
temperature=0.01):
if output.get('code') is not None and output.get('code') != 0:
return False
return True
except Exception:
return False
def get_model(url):
try:
api_client = APIClient(url)
model_name = api_client.available_models[0]
return model_name
except Exception:
return None
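# Usage sketch (assumes an api_server is already listening locally):
#   url = 'http://localhost:23333'
#   if health_check(url):
#       print(get_model(url))  # name of the served model, or None on failure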
#!/bin/bash
dataset_path="benchmark/ShareGPT_V3_unfiltered_cleaned_split.json"
########################################## PyTorch engine: fp16 or bf16 ##########################################
## 7B
tp=1
max_batch_size=256
cache_max_entry_count=0.95
model_path="/workspace/models-140/llama2/huggingface/llama-2-7b-chat"
CUDA_VISIBLE_DEVICES="6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --backend pytorch --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --csv llama2_pt_7b_thr.csv
## 13B
tp=1
max_batch_size=256
cache_max_entry_count=0.9
model_path="/workspace/models-140/llama2/huggingface/llama-2-13b-chat"
CUDA_VISIBLE_DEVICES="6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --backend pytorch --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --csv llama2_pt_7b_thr.csv
# 20B
tp=2
max_batch_size=256
cache_max_entry_count=0.9
model_path="/workspace/models-140/InternLM/internlm-chat-20b"
CUDA_VISIBLE_DEVICES="5,6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --backend pytorch --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --csv llama2_pt_7b_thr.csv
# 70B
tp=4
max_batch_size=256
cache_max_entry_count=0.9
model_path="/workspace/models-140/llama2/huggingface/llama-2-70b-chat-hf"
CUDA_VISIBLE_DEVICES="4,5,6,7" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --backend pytorch --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --csv llama2_pt_7b_thr.csv
########################################## PyTorch engine: w8a8 ##########################################
# #!/bin/bash
dataset_path="benchmark/ShareGPT_V3_unfiltered_cleaned_split.json"
########################################## TurboMind engine: fp16 or bf16 ##########################################
# 7B. gemm_tune -> profile_throughput
tp=1
max_batch_size=256
cache_max_entry_count=0.95
model_path="/workspace/models-140/llama2/huggingface/llama-2-7b-chat"
CUDA_VISIBLE_DEVICES="6" python3 -m lmdeploy.turbomind.generate_gemm_config --tensor-para-size ${tp} --max-batch-size ${max_batch_size} --model-path ${model_path}
CUDA_VISIBLE_DEVICES="6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --csv llama2_tb_7b_thr.csv
rm gemm_config.in
# 13B. gemm_tune -> profile_throughput
tp=1
max_batch_size=256
cache_max_entry_count=0.9
model_path="/workspace/models-140/llama2/huggingface/llama-2-13b-chat"
CUDA_VISIBLE_DEVICES="6" python3 -m lmdeploy.turbomind.generate_gemm_config --tensor-para-size ${tp} --max-batch-size ${max_batch_size} --model-path ${model_path}
CUDA_VISIBLE_DEVICES="6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --csv llama2_tb_13b_thr.csv
rm gemm_config.in
# 20B. gemm_tune -> profile_throughput
tp=2
max_batch_size=256
cache_max_entry_count=0.9
model_path="/workspace/models-140/InternLM/internlm-chat-20b"
CUDA_VISIBLE_DEVICES="5,6" python3 -m lmdeploy.turbomind.generate_gemm_config --tensor-para-size ${tp} --max-batch-size ${max_batch_size} --model-path ${model_path}
CUDA_VISIBLE_DEVICES="5,6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --csv internlm_tb_20b_thr.csv
rm gemm_config.in
# 70B
tp=4
max_batch_size=256
cache_max_entry_count=0.9
model_path="/workspace/models-140/llama2/huggingface/llama-2-70b-chat-hf"
CUDA_VISIBLE_DEVICES="4,5,6,7" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --csv llama2_tb_70b_thr.csv
########################################## TurboMind engine: w4a16 ##########################################
# 7B
tp=1
max_batch_size=256
cache_max_entry_count=0.95
model_path="/workspace/models/quantization/llama-2-7b-chat-4bit"
CUDA_VISIBLE_DEVICES="6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --model-format awq --num-prompts 10000 --csv llama2_tb_7b_4bit_thr.csv
# 13B
tp=1
max_batch_size=256
cache_max_entry_count=0.9
model_path="/workspace/models/quantization/llama-2-13b-chat-4bit"
CUDA_VISIBLE_DEVICES="6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --model-format awq --num-prompts 10000 --csv llama2_tb_13b_4bit_thr.csv
# 20B
tp=2
max_batch_size=256
cache_max_entry_count=0.9
model_path="/workspace/models/quantization/internlm-chat-20b-4bit"
CUDA_VISIBLE_DEVICES="5,6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --model-format awq --num-prompts 10000 --csv internlm_tb_20b_4bit_thr.csv
# 70B
tp=4
max_batch_size=256
cache_max_entry_count=0.9
model_path="/workspace/models/quantization/llama-2-70b-chat-hf-4bit"
CUDA_VISIBLE_DEVICES="4,5,6,7" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --model-format awq --num-prompts 10000 --csv llama2_tb_70b_4bit_thr.csv
version: 2
formats: all
build:
os: "ubuntu-22.04"
tools:
python: "3.8"
sphinx:
configuration: docs/en/conf.py
python:
install:
- requirements: requirements/docs.txt
- requirements: requirements/readthedocs.txt
<svg width="724" height="169" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xml:space="preserve" overflow="hidden"><defs><clipPath id="clip0"><rect x="290" y="255" width="724" height="169"/></clipPath><linearGradient x1="515.209" y1="187.434" x2="675.945" y2="480.272" gradientUnits="userSpaceOnUse" spreadMethod="pad" id="fill1"><stop offset="0" stop-color="#9C8BFE"/><stop offset="1" stop-color="#2B50FF"/></linearGradient><linearGradient x1="366.983" y1="280.208" x2="358.966" y2="161.282" gradientUnits="userSpaceOnUse" spreadMethod="pad" id="fill2"><stop offset="0" stop-color="#E3AFFE"/><stop offset="1" stop-color="#2B50FF"/></linearGradient><linearGradient x1="339.833" y1="251.78" x2="336.655" y2="198.744" gradientUnits="userSpaceOnUse" spreadMethod="pad" id="fill3"><stop offset="0" stop-color="#748DFA"/><stop offset="1" stop-color="#C1B8FF"/></linearGradient><linearGradient x1="366.61" y1="199.406" x2="331.082" y2="291.3" gradientUnits="userSpaceOnUse" spreadMethod="pad" id="fill4"><stop offset="0" stop-color="#DBABFE"/><stop offset="1" stop-color="#C8F2FF"/></linearGradient><linearGradient x1="369.17" y1="198.557" x2="335.983" y2="245.993" gradientUnits="userSpaceOnUse" spreadMethod="pad" id="stroke5"><stop offset="0" stop-color="#FFFFFF"/><stop offset="0.46875" stop-color="#FFFFFF" stop-opacity="0"/><stop offset="1" stop-color="#FFFFFF" stop-opacity="0"/></linearGradient><linearGradient x1="378.752" y1="221.569" x2="411.083" y2="175.73" gradientUnits="userSpaceOnUse" spreadMethod="pad" id="stroke6"><stop offset="0" stop-color="#FFFFFF"/><stop offset="0.46875" stop-color="#FFFFFF" stop-opacity="0"/><stop offset="1" stop-color="#FFFFFF" stop-opacity="0"/></linearGradient><linearGradient x1="405.519" y1="173.592" x2="409.26" y2="222.227" gradientUnits="userSpaceOnUse" spreadMethod="pad" id="fill7"><stop offset="0" stop-color="#DBABFE"/><stop offset="1" stop-color="#B1E8FA"/></linearGradient><linearGradient x1="356.715" y1="253.912" x2="350.448" y2="271.193" gradientUnits="userSpaceOnUse" spreadMethod="pad" id="stroke8"><stop offset="0" stop-color="#AA5FE6" stop-opacity="0"/><stop offset="1" stop-color="#2E75FE"/></linearGradient><linearGradient x1="350.864" y1="235.329" x2="339.765" y2="259.744" gradientUnits="userSpaceOnUse" spreadMethod="pad" id="stroke9"><stop offset="0" stop-color="#AA5FE6" stop-opacity="0"/><stop offset="1" stop-color="#2E75FE"/></linearGradient><linearGradient x1="353.774" y1="211.139" x2="340.952" y2="235.597" gradientUnits="userSpaceOnUse" spreadMethod="pad" id="stroke10"><stop offset="0" stop-color="#AA5FE6" stop-opacity="0"/><stop offset="1" stop-color="#2E75FE"/></linearGradient></defs><g clip-path="url(#clip0)" transform="translate(-290 -255)"><path d="M0 0 1280.24 0 1280.24 463.908 0 463.908Z" fill="none" transform="matrix(1 0 0 1.00081 -0.255482 128.069)"/><path d="M589.722 261.071 569.151 213.627C567.428 209.675 565.705 205.513 563.982 201.142 563.034 198.739 562.087 196.272 561.14 193.743L560.908 193.122 560.765 192.739 560.606 192.309 560.117 192.309 560.127 192.486 560.156 193.058 560.166 193.275C560.704 203.942 560.972 213.736 560.972 222.652L560.972 261.071 551.023 261.071 551.023 181.62 565.367 181.62 584.594 226.572C586.654 231.396 588.911 237.144 591.365 243.812L591.858 245.158 592.163 245.158 592.21 245.03 592.302 244.777 592.408 244.486C595.227 236.778 597.568 230.803 599.427 226.572L618.654 181.62 632.998 181.62 632.998 261.071 623.049 261.071 623.049 222.652C623.049 214.228 623.299 204.812 623.8 194.403L623.855 
193.272 623.866 193.058 623.894 192.486 623.904 192.309 623.415 192.309 623.146 193.032 623.114 193.119C620.214 200.895 617.465 207.736 614.869 213.627L594.3 261.071 589.722 261.071ZM718.209 234.053C719.389 229.975 719.979 225.622 719.979 220.99 719.979 216.398 719.389 212.121 718.209 208.164 717.07 204.165 715.32 200.582 712.96 197.415 710.64 194.209 707.69 191.439 704.109 189.102 700.569 186.766 696.418 184.945 691.658 183.639 688.93 182.886 685.981 182.371 682.807 182.095 681.362 181.951 679.864 181.839 678.31 181.761 676.45 181.667 674.509 181.62 672.491 181.62L653.813 181.62 653.813 261.071 672.308 261.071C674.622 261.071 676.833 261.017 678.941 260.909L679.099 260.902 679.955 260.853C680.864 260.798 681.754 260.732 682.623 260.656 685.797 260.339 688.748 259.806 691.474 259.053 696.235 257.747 700.386 255.905 703.926 253.53 707.507 251.156 710.477 248.344 712.838 245.098 715.239 241.813 717.029 238.132 718.209 234.053ZM704.354 202.403C707.772 207.154 709.481 213.349 709.481 220.99 709.481 228.708 707.772 235.044 704.354 239.992 700.935 244.94 696.032 248.404 689.643 250.383 688.628 250.695 687.564 250.967 686.451 251.198 684.886 251.521 683.223 251.765 681.464 251.927 678.452 252.205 675.054 252.342 671.27 252.342L663.762 252.342 663.762 190.349 671.27 190.349C675.054 190.349 678.452 190.487 681.464 190.764 684.474 191.042 687.202 191.557 689.643 192.309 696.032 194.288 700.935 197.652 704.354 202.403ZM842.824 232.331C842.824 229.046 842.458 226.097 841.724 223.484 841.036 220.871 839.974 218.654 838.552 216.833 837.13 214.973 835.379 213.548 833.299 212.557 832.611 212.23 831.893 211.955 831.129 211.736 830.875 211.663 830.621 211.595 830.359 211.534 829.042 211.227 827.62 211.074 826.101 211.074 824.148 211.074 822.353 211.331 820.729 211.845 819.143 212.359 817.698 213.013 816.396 213.805 815.095 214.596 813.972 215.447 813.037 216.358 812.102 217.269 811.346 218.08 810.777 218.793L810.777 248.305C812.775 250.285 815.027 251.848 817.556 252.995 818.813 253.56 820.115 253.986 821.47 254.274 822.001 254.388 822.547 254.48 823.101 254.55 823.976 254.662 824.867 254.719 825.779 254.719L825.794 254.719C826.752 254.719 827.74 254.628 828.757 254.447 829.64 254.289 830.546 254.062 831.474 253.768 833.464 253.095 835.297 251.927 836.966 250.264 838.672 248.561 840.078 246.266 841.178 243.377 842.271 240.486 842.824 236.804 842.824 232.331ZM852.417 237.663C852.215 239.443 851.908 241.11 851.489 242.663 850.718 245.672 849.678 248.305 848.376 250.561 847.074 252.817 845.548 254.719 843.797 256.262 842.794 257.129 841.769 257.902 840.722 258.584 839.944 259.087 839.165 259.54 838.365 259.943 836.494 260.853 834.586 261.507 832.633 261.902 830.718 262.338 828.907 262.556 827.201 262.556 823.654 262.556 820.587 262.002 817.983 260.893 815.701 259.889 813.628 258.476 811.75 256.654 811.608 256.511 811.466 256.369 811.323 256.223 811.196 256.098 811.077 255.974 810.964 255.847L810.777 255.847 810.777 288.684 801.319 288.684 801.319 204.659 810.471 204.659 810.471 211.074 810.658 211.074C810.83 210.813 811.017 210.556 811.219 210.302 811.638 209.779 812.124 209.263 812.67 208.757 813.523 208.005 814.646 207.213 816.03 206.382 816.703 205.97 817.437 205.586 818.222 205.226 819.105 204.826 820.063 204.459 821.095 204.125 821.904 203.869 822.734 203.664 823.595 203.512 824.859 203.288 826.184 203.175 827.56 203.175 830.823 203.175 833.95 203.71 836.966 204.778 840.018 205.847 842.705 207.55 845.024 209.886 847.344 212.221 849.192 215.23 850.576 218.911 851.998 222.553 852.709 226.987 852.709 232.213 
852.709 234.135 852.611 235.951 852.417 237.663ZM935.995 232.925C935.995 229.442 935.546 226.354 934.656 223.662 933.803 220.931 932.583 218.633 930.989 216.773 929.448 214.873 927.592 213.449 925.437 212.498 923.282 211.508 920.918 211.014 918.359 211.014 915.793 211.014 913.436 211.508 911.273 212.498 909.118 213.449 907.248 214.873 905.661 216.773 904.794 217.818 904.03 219.001 903.364 220.32 902.848 221.35 902.392 222.464 902.003 223.662 901.142 226.354 900.716 229.442 900.716 232.925 900.716 236.369 901.142 239.457 902.003 242.189 902.893 244.88 904.135 247.157 905.721 249.018 907.308 250.878 909.178 252.303 911.341 253.293 913.496 254.244 915.853 254.719 918.419 254.719 920.985 254.719 923.32 254.244 925.437 253.293 926.979 252.588 928.356 251.661 929.583 250.514 930.077 250.051 930.548 249.553 930.989 249.018 932.583 247.157 933.803 244.88 934.656 242.189 935.546 239.457 935.995 236.369 935.995 232.925ZM945.887 232.925C945.887 237.359 945.236 241.396 943.934 245.039 942.632 248.681 940.776 251.809 938.382 254.422 936.018 256.994 933.152 258.993 929.77 260.418 926.395 261.844 922.609 262.556 918.419 262.556 914.102 262.556 910.241 261.844 906.821 260.418 904.921 259.617 903.177 258.633 901.584 257.469 901.411 257.339 901.232 257.209 901.06 257.074 900.02 256.27 899.055 255.386 898.157 254.422 895.792 251.809 893.982 248.681 892.717 245.039 892.014 242.991 891.505 240.816 891.191 238.517L891.153 238.216C890.936 236.52 890.831 234.756 890.831 232.925 890.831 228.451 891.482 224.394 892.784 220.752 893.518 218.704 894.415 216.818 895.478 215.096L895.538 215.007C896.353 213.7 897.266 212.488 898.276 211.37 900.678 208.757 903.566 206.738 906.941 205.313 910.36 203.888 914.169 203.175 918.359 203.175 922.632 203.175 926.477 203.888 929.897 205.313 933.309 206.738 936.205 208.757 938.562 211.37 940.919 213.983 942.729 217.111 943.994 220.752 944.458 222.092 944.839 223.489 945.131 224.939L945.191 225.231C945.655 227.641 945.887 230.206 945.887 232.925ZM976.587 259.943 964.196 288.684 973.602 288.684 1009.79 204.659 999.842 204.659 981.593 248.899 981.226 248.899 963.224 204.659 953.519 204.659 976.587 259.943ZM787.896 235.787C785.352 249.711 773.073 260.247 758.79 260.247 742.478 260.247 729.206 246.852 729.206 230.387 729.206 220.419 734.071 211.575 741.531 206.149 746.423 202.538 752.452 200.404 758.962 200.404 771.248 200.404 782.067 207.869 786.527 219.398L788.24 223.81 788.046 223.884 788.068 223.934 742.402 241.925C742.482 242.04 742.561 242.152 742.643 242.264L742.821 242.506C743.147 242.943 743.49 243.366 743.849 243.774 747.55 247.979 752.961 250.636 758.962 250.636 763.361 250.636 767.477 249.173 770.836 246.695 775.326 243.309 778.416 238.091 778.91 232.132L779.067 232.146C779.067 232.138 779.067 232.132 779.067 232.126L779.074 232.008 788.442 232.811C788.367 233.768 788.24 234.712 788.068 235.642L787.896 235.787ZM747.047 213.806C742.027 217.517 738.759 223.504 738.759 230.246 738.759 230.51 738.765 230.775 738.776 231.039L738.782 231.207 738.787 231.304C738.801 231.567 738.82 231.83 738.843 232.09L738.86 232.267 738.901 232.641 738.949 233.037 775.356 218.692C771.637 213.294 765.531 209.998 758.775 209.998 754.405 210.001 750.357 211.413 747.047 213.806ZM535.763 252.342 535.763 261.071 485.955 261.071 485.955 181.62 495.904 181.62 495.904 252.342 535.763 252.342ZM865.743 175.088 875.201 175.088 875.201 261.071 865.743 261.071 865.743 175.088Z" fill="none" fill-rule="evenodd" transform="matrix(1 0 0 1.00081 -0.255482 128.069)"/><path d="M589.722 261.071 569.151 213.627C567.428 
209.675 565.705 205.513 563.982 201.142 563.034 198.739 562.087 196.272 561.14 193.743L560.908 193.122 560.765 192.739 560.606 192.309 560.117 192.309 560.127 192.486 560.156 193.058 560.166 193.275C560.704 203.942 560.972 213.736 560.972 222.652L560.972 261.071 551.023 261.071 551.023 181.62 565.367 181.62 584.594 226.572C586.654 231.396 588.911 237.144 591.365 243.812L591.858 245.158 592.163 245.158 592.21 245.03 592.302 244.777 592.408 244.486C595.227 236.778 597.568 230.803 599.427 226.572L618.654 181.62 632.998 181.62 632.998 261.071 623.049 261.071 623.049 222.652C623.049 214.228 623.299 204.812 623.8 194.403L623.855 193.272 623.866 193.058 623.894 192.486 623.904 192.309 623.415 192.309 623.146 193.032 623.114 193.119C620.214 200.895 617.465 207.736 614.869 213.627L594.3 261.071 589.722 261.071ZM718.209 234.053C719.389 229.975 719.979 225.622 719.979 220.99 719.979 216.398 719.389 212.121 718.209 208.164 717.07 204.165 715.32 200.582 712.96 197.415 710.64 194.209 707.69 191.439 704.109 189.102 700.569 186.766 696.418 184.945 691.658 183.639 688.93 182.886 685.981 182.371 682.807 182.095 681.362 181.951 679.864 181.839 678.31 181.761 676.45 181.667 674.509 181.62 672.491 181.62L653.813 181.62 653.813 261.071 672.308 261.071C674.622 261.071 676.833 261.017 678.941 260.909L679.099 260.902 679.955 260.853C680.864 260.798 681.754 260.732 682.623 260.656 685.797 260.339 688.748 259.806 691.474 259.053 696.235 257.747 700.386 255.905 703.926 253.53 707.507 251.156 710.477 248.344 712.838 245.098 715.239 241.813 717.029 238.132 718.209 234.053ZM704.354 202.403C707.772 207.154 709.481 213.349 709.481 220.99 709.481 228.708 707.772 235.044 704.354 239.992 700.935 244.94 696.032 248.404 689.643 250.383 688.628 250.695 687.564 250.967 686.451 251.198 684.886 251.521 683.223 251.765 681.464 251.927 678.452 252.205 675.054 252.342 671.27 252.342L663.762 252.342 663.762 190.349 671.27 190.349C675.054 190.349 678.452 190.487 681.464 190.764 684.474 191.042 687.202 191.557 689.643 192.309 696.032 194.288 700.935 197.652 704.354 202.403ZM842.824 232.331C842.824 229.046 842.458 226.097 841.724 223.484 841.036 220.871 839.974 218.654 838.552 216.833 837.13 214.973 835.379 213.548 833.299 212.557 832.611 212.23 831.893 211.955 831.129 211.736 830.875 211.663 830.621 211.595 830.359 211.534 829.042 211.227 827.62 211.074 826.101 211.074 824.148 211.074 822.353 211.331 820.729 211.845 819.143 212.359 817.698 213.013 816.396 213.805 815.095 214.596 813.972 215.447 813.037 216.358 812.102 217.269 811.346 218.08 810.777 218.793L810.777 248.305C812.775 250.285 815.027 251.848 817.556 252.995 818.813 253.56 820.115 253.986 821.47 254.274 822.001 254.388 822.547 254.48 823.101 254.55 823.976 254.662 824.867 254.719 825.779 254.719L825.794 254.719C826.752 254.719 827.74 254.628 828.757 254.447 829.64 254.289 830.546 254.062 831.474 253.768 833.464 253.095 835.297 251.927 836.966 250.264 838.672 248.561 840.078 246.266 841.178 243.377 842.271 240.486 842.824 236.804 842.824 232.331ZM852.417 237.663C852.215 239.443 851.908 241.11 851.489 242.663 850.718 245.672 849.678 248.305 848.376 250.561 847.074 252.817 845.548 254.719 843.797 256.262 842.794 257.129 841.769 257.902 840.722 258.584 839.944 259.087 839.165 259.54 838.365 259.943 836.494 260.853 834.586 261.507 832.633 261.902 830.718 262.338 828.907 262.556 827.201 262.556 823.654 262.556 820.587 262.002 817.983 260.893 815.701 259.889 813.628 258.476 811.75 256.654 811.608 256.511 811.466 256.369 811.323 256.223 811.196 256.098 811.077 255.974 810.964 
255.847L810.777 255.847 810.777 288.684 801.319 288.684 801.319 204.659 810.471 204.659 810.471 211.074 810.658 211.074C810.83 210.813 811.017 210.556 811.219 210.302 811.638 209.779 812.124 209.263 812.67 208.757 813.523 208.005 814.646 207.213 816.03 206.382 816.703 205.97 817.437 205.586 818.222 205.226 819.105 204.826 820.063 204.459 821.095 204.125 821.904 203.869 822.734 203.664 823.595 203.512 824.859 203.288 826.184 203.175 827.56 203.175 830.823 203.175 833.95 203.71 836.966 204.778 840.018 205.847 842.705 207.55 845.024 209.886 847.344 212.221 849.192 215.23 850.576 218.911 851.998 222.553 852.709 226.987 852.709 232.213 852.709 234.135 852.611 235.951 852.417 237.663ZM935.995 232.925C935.995 229.442 935.546 226.354 934.656 223.662 933.803 220.931 932.583 218.633 930.989 216.773 929.448 214.873 927.592 213.449 925.437 212.498 923.282 211.508 920.918 211.014 918.359 211.014 915.793 211.014 913.436 211.508 911.273 212.498 909.118 213.449 907.248 214.873 905.661 216.773 904.794 217.818 904.03 219.001 903.364 220.32 902.848 221.35 902.392 222.464 902.003 223.662 901.142 226.354 900.716 229.442 900.716 232.925 900.716 236.369 901.142 239.457 902.003 242.189 902.893 244.88 904.135 247.157 905.721 249.018 907.308 250.878 909.178 252.303 911.341 253.293 913.496 254.244 915.853 254.719 918.419 254.719 920.985 254.719 923.32 254.244 925.437 253.293 926.979 252.588 928.356 251.661 929.583 250.514 930.077 250.051 930.548 249.553 930.989 249.018 932.583 247.157 933.803 244.88 934.656 242.189 935.546 239.457 935.995 236.369 935.995 232.925ZM945.887 232.925C945.887 237.359 945.236 241.396 943.934 245.039 942.632 248.681 940.776 251.809 938.382 254.422 936.018 256.994 933.152 258.993 929.77 260.418 926.395 261.844 922.609 262.556 918.419 262.556 914.102 262.556 910.241 261.844 906.821 260.418 904.921 259.617 903.177 258.633 901.584 257.469 901.411 257.339 901.232 257.209 901.06 257.074 900.02 256.27 899.055 255.386 898.157 254.422 895.792 251.809 893.982 248.681 892.717 245.039 892.014 242.991 891.505 240.816 891.191 238.517L891.153 238.216C890.936 236.52 890.831 234.756 890.831 232.925 890.831 228.451 891.482 224.394 892.784 220.752 893.518 218.704 894.415 216.818 895.478 215.096L895.538 215.007C896.353 213.7 897.266 212.488 898.276 211.37 900.678 208.757 903.566 206.738 906.941 205.313 910.36 203.888 914.169 203.175 918.359 203.175 922.632 203.175 926.477 203.888 929.897 205.313 933.309 206.738 936.205 208.757 938.562 211.37 940.919 213.983 942.729 217.111 943.994 220.752 944.458 222.092 944.839 223.489 945.131 224.939L945.191 225.231C945.655 227.641 945.887 230.206 945.887 232.925ZM976.587 259.943 964.196 288.684 973.602 288.684 1009.79 204.659 999.842 204.659 981.593 248.899 981.226 248.899 963.224 204.659 953.519 204.659 976.587 259.943ZM787.896 235.787C785.352 249.711 773.073 260.247 758.79 260.247 742.478 260.247 729.206 246.852 729.206 230.387 729.206 220.419 734.071 211.575 741.531 206.149 746.423 202.538 752.452 200.404 758.962 200.404 771.248 200.404 782.067 207.869 786.527 219.398L788.24 223.81 788.046 223.884 788.068 223.934 742.402 241.925C742.482 242.04 742.561 242.152 742.643 242.264L742.821 242.506C743.147 242.943 743.49 243.366 743.849 243.774 747.55 247.979 752.961 250.636 758.962 250.636 763.361 250.636 767.477 249.173 770.836 246.695 775.326 243.309 778.416 238.091 778.91 232.132L779.067 232.146C779.067 232.138 779.067 232.132 779.067 232.126L779.074 232.008 788.442 232.811C788.367 233.768 788.24 234.712 788.068 235.642L787.896 235.787ZM747.047 213.806C742.027 217.517 
738.759 223.504 738.759 230.246 738.759 230.51 738.765 230.775 738.776 231.039L738.782 231.207 738.787 231.304C738.801 231.567 738.82 231.83 738.843 232.09L738.86 232.267 738.901 232.641 738.949 233.037 775.356 218.692C771.637 213.294 765.531 209.998 758.775 209.998 754.405 210.001 750.357 211.413 747.047 213.806ZM535.763 252.342 535.763 261.071 485.955 261.071 485.955 181.62 495.904 181.62 495.904 252.342 535.763 252.342ZM865.743 175.088 875.201 175.088 875.201 261.071 865.743 261.071 865.743 175.088Z" fill="url(#fill1)" fill-rule="evenodd" transform="matrix(1 0 0 1.00081 -0.255482 128.069)"/><path d="M417.928 210.759 332.03 292.638 356.588 212.584 329.253 211.565 415.752 129.412 390.657 209.626 417.928 210.759Z" fill="url(#fill2)" fill-rule="evenodd" transform="matrix(1 0 0 1.00081 -0.255482 128.069)"/><path d="M352.974 215.897 331.46 292.898 370.665 200.078C370.913 199.492 370.362 198.884 369.754 199.072L328.536 211.86 352.35 214.954C352.802 215.013 353.097 215.459 352.974 215.897Z" fill="url(#fill3)" transform="matrix(1 0 0 1.00081 -0.255482 128.069)"/><path d="M352.974 215.897 331.46 292.898 370.665 200.078C370.913 199.492 370.362 198.884 369.754 199.072L328.536 211.86 352.35 214.954C352.802 215.013 353.097 215.459 352.974 215.897Z" fill="url(#fill4)" transform="matrix(1 0 0 1.00081 -0.255482 128.069)"/><path d="M352.974 215.897 331.46 292.898 370.665 200.078C370.913 199.492 370.362 198.884 369.754 199.072L328.536 211.86 352.35 214.954C352.802 215.013 353.097 215.459 352.974 215.897Z" stroke="url(#stroke5)" stroke-width="0.748239" fill="none" transform="matrix(1 0 0 1.00081 -0.255482 128.069)"/><path d="M394.247 202.173 415.328 129.974 377.297 220.145C377.057 220.715 377.573 221.314 378.172 221.161L417.509 211.1 394.716 203.089C394.342 202.957 394.135 202.554 394.247 202.173Z" stroke="url(#stroke6)" stroke-width="0.748239" fill="url(#fill7)" transform="matrix(1 0 0 1.00081 -0.255482 128.069)"/><path d="M400.69 240.126C415.788 244.356 425.536 251.018 425.453 258.426 425.315 270.82 397.71 280.608 363.797 280.288 329.883 279.969 302.503 269.662 302.641 257.268 302.735 248.864 315.458 241.657 334.215 237.989" stroke="url(#stroke8)" stroke-width="5.23768" fill="none" transform="matrix(1 0 0 1.00081 -0.255482 128.069)"/><path d="M403.693 233.437C417.578 241.42 425.394 250.68 423.145 258.396 419.383 271.306 388.87 275.007 354.995 266.662 321.119 258.317 296.707 241.086 300.47 228.176 303.021 219.421 317.873 214.902 337.734 215.501" stroke="url(#stroke9)" stroke-width="5.23768" fill="none" transform="matrix(1 0 0 1.00081 -0.255482 128.069)"/><path d="M403.498 232.586C414.855 243.273 420.115 253.555 416.138 259.89 409.483 270.487 379.485 266.019 349.137 249.91 318.787 233.801 299.58 212.151 306.236 201.553 310.748 194.367 325.995 194.108 344.807 199.71" stroke="url(#stroke10)" stroke-width="5.23768" fill="none" transform="matrix(1 0 0 1.00081 -0.255482 128.069)"/></g></svg>
# Customized chat template
The effect of the applied chat template can be observed by setting the log level to `INFO`.
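For example, the following is a minimal sketch that assumes the `log_level` argument of `pipeline` is available in your LMDeploy version; with `INFO` logging enabled, the prompt rendered by the chat template shows up in the logs.
```python
from lmdeploy import pipeline

# With log_level='INFO', the prompt produced by the chat template is logged,
# which lets you verify that a customized template is actually applied.
pipe = pipeline('internlm/internlm2-chat-7b', log_level='INFO')
print(pipe(['who are you?']))
```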
LMDeploy supports two methods of adding chat templates:
- One approach is to configure a chat template with a JSON file like the following.
```json
{
"model_name": "your awesome chat template name",
"system": "<|im_start|>system\n",
"meta_instruction": "You are a robot developed by LMDeploy.",
"eosys": "<|im_end|>\n",
"user": "<|im_start|>user\n",
"eoh": "<|im_end|>\n",
"assistant": "<|im_start|>assistant\n",
"eoa": "<|im_end|>",
"separator": "\n",
"capability": "chat",
"stop_words": ["<|im_end|>"]
}
```
`model_name` is a required field. It can be either the name of an LMDeploy built-in chat template (which can be viewed through `lmdeploy list`) or a new name. The other fields are optional.
1. When `model_name` is the name of a built-in chat template, the non-null fields in the JSON file override the corresponding attributes of that template.
2. When `model_name` is a new name, the configuration registers a new `BaseChatTemplate`-based chat template directly. Refer to [BaseChatTemplate](https://github.com/InternLM/lmdeploy/blob/24bd4b9ab6a15b3952e62bcfc72eaba03bce9dcb/lmdeploy/model.py#L113-L188) for its definition.
The resulting prompt is assembled as follows (an illustrative sketch follows the format string):
```
{system}{meta_instruction}{eosys}{user}{user_content}{eoh}{assistant}{assistant_content}{eoa}{separator}{user}...
```
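To make the assembly concrete, here is a minimal, illustrative sketch of how these fields could be concatenated into a single-turn prompt. It is not LMDeploy's actual implementation; the field values are copied from the JSON example above.
```python
def render_single_turn(user_content: str,
                       system='<|im_start|>system\n',
                       meta_instruction='You are a robot developed by LMDeploy.',
                       eosys='<|im_end|>\n',
                       user='<|im_start|>user\n',
                       eoh='<|im_end|>\n',
                       assistant='<|im_start|>assistant\n') -> str:
    """Concatenate the template fields into a prompt for one user turn."""
    return (system + meta_instruction + eosys +
            user + user_content + eoh +
            assistant)


print(render_single_turn('who are you?'))
# <|im_start|>system
# You are a robot developed by LMDeploy.<|im_end|>
# <|im_start|>user
# who are you?<|im_end|>
# <|im_start|>assistant
```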
When using the CLI tool, you can pass in a custom chat template with `--chat-template`. For example:
```shell
lmdeploy serve api_server internlm/internlm2-chat-7b --chat-template ${JSON_FILE}
```
You can also pass it in through the Python API, for example:
```python
from lmdeploy import ChatTemplateConfig, serve
serve('internlm/internlm2-chat-7b',
      chat_template_config=ChatTemplateConfig.from_json('${JSON_FILE}'))
```
- The other approach is to customize a Python chat template class in the same way as the existing LMDeploy chat templates. Once registered, it can be used directly. This offers a high degree of customization and strong control. Below is an example of registering an LMDeploy chat template.
```python
from lmdeploy import ChatTemplateConfig, pipeline
from lmdeploy.model import MODELS, BaseChatTemplate


@MODELS.register_module(name='customized_model')
class CustomizedModel(BaseChatTemplate):
    """A customized chat template."""

    def __init__(self,
                 system='<|im_start|>system\n',
                 meta_instruction='You are a robot developed by LMDeploy.',
                 user='<|im_start|>user\n',
                 assistant='<|im_start|>assistant\n',
                 eosys='<|im_end|>\n',
                 eoh='<|im_end|>\n',
                 eoa='<|im_end|>',
                 separator='\n',
                 stop_words=['<|im_end|>', '<|action_end|>']):
        super().__init__(system=system,
                         meta_instruction=meta_instruction,
                         eosys=eosys,
                         user=user,
                         eoh=eoh,
                         assistant=assistant,
                         eoa=eoa,
                         separator=separator,
                         stop_words=stop_words)


messages = [{'role': 'user', 'content': 'who are you?'}]
pipe = pipeline('internlm/internlm2-chat-7b',
                chat_template_config=ChatTemplateConfig('customized_model'))

for response in pipe.stream_infer(messages):
    print(response.text, end='')
```
In this example, we register an LMDeploy chat template whose meta instruction states that the model was developed by LMDeploy, so when the user asks who the model is, it will answer that it was created by LMDeploy.
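If you only want to inspect what the registered template produces, you can render a prompt without loading any model weights. This is a sketch that assumes the `messages2prompt` method of `BaseChatTemplate` (see the linked `model.py`) is available in your version; it continues from the `CustomizedModel` class defined above.
```python
# Render the prompt locally, continuing from the CustomizedModel example above.
template = CustomizedModel()
print(template.messages2prompt([{'role': 'user', 'content': 'who are you?'}]))
```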
# How to debug Turbomind
Turbomind is implemented in C++, which is not as easy to debug as Python. This document provides basic methods for debugging Turbomind.
## Prerequisite
First, complete the local compilation according to the commands in [Build in localhost](../build.md).
## Configure Python debug environment
Since many large companies currently use CentOS 7 in their online production environments, we use CentOS 7 as an example to illustrate the process.
### Obtain `glibc` and `python3` versions
```bash
rpm -qa | grep glibc
rpm -qa | grep python3
```
The result should be similar to this:
```
[username@hostname workdir]# rpm -qa | grep glibc
glibc-2.17-325.el7_9.x86_64
glibc-common-2.17-325.el7_9.x86_64
glibc-headers-2.17-325.el7_9.x86_64
glibc-devel-2.17-325.el7_9.x86_64
[username@hostname workdir]# rpm -qa | grep python3
python3-pip-9.0.3-8.el7.noarch
python3-rpm-macros-3-34.el7.noarch
python3-rpm-generators-6-2.el7.noarch
python3-setuptools-39.2.0-10.el7.noarch
python3-3.6.8-21.el7_9.x86_64
python3-devel-3.6.8-21.el7_9.x86_64
python3.6.4-sre-1.el6.x86_64
```
Based on the information above, we can see that the version of `glibc` is `2.17-325.el7_9.x86_64` and the version of `python3` is `3.6.8-21.el7_9.x86_64`.
### Download and install `debuginfo` library
Download `glibc-debuginfo-common-2.17-325.el7.x86_64.rpm`, `glibc-debuginfo-2.17-325.el7.x86_64.rpm`, and `python3-debuginfo-3.6.8-21.el7.x86_64.rpm` from http://debuginfo.centos.org/7/x86_64.
```bash
rpm -ivh glibc-debuginfo-common-2.17-325.el7.x86_64.rpm
rpm -ivh glibc-debuginfo-2.17-325.el7.x86_64.rpm
rpm -ivh python3-debuginfo-3.6.8-21.el7.x86_64.rpm
```
### Upgrade GDB
```bash
sudo yum install devtoolset-10 -y
echo "source scl_source enable devtoolset-10" >> ~/.bashrc
source ~/.bashrc
```
### Verification
```bash
gdb python3
```
The output should be similar to this:
```
[username@hostname workdir]# gdb python3
GNU gdb (GDB) Red Hat Enterprise Linux 9.2-10.el7
Copyright (C) 2020 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.
Type "show copying" and "show warranty" for details.
This GDB was configured as "x86_64-redhat-linux-gnu".
Type "show configuration" for configuration details.
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>.
Find the GDB manual and other documentation resources online at:
<http://www.gnu.org/software/gdb/documentation/>.
For help, type "help".
Type "apropos word" to search for commands related to "word"...
Reading symbols from python3...
(gdb)
```
If it shows `Reading symbols from python3`, the configuration has been successful.
For other operating systems, please refer to [DebuggingWithGdb](https://wiki.python.org/moin/DebuggingWithGdb).
## Set up symbolic links
After setting up symbolic links, there is no need to reinstall lmdeploy with `pip` after every build.
```bash
# Change directory to lmdeploy, e.g.
cd /workdir/lmdeploy
# Since it has been built in the build directory
# Link the lib directory
cd lmdeploy && ln -s ../build/lib . && cd ..
# (Optional) Link compile_commands.json for clangd index
ln -s build/compile_commands.json .
```
## Start debugging
````bash
# Use gdb to start the API server with Llama-2-13b-chat-hf, e.g.
gdb --args python3 -m lmdeploy serve api_server /workdir/Llama-2-13b-chat-hf
# Set directories in gdb
Reading symbols from python3...
(gdb) set directories /workdir/lmdeploy
# Set a breakpoint using the relative path, e.g.
(gdb) b src/turbomind/models/llama/BlockManager.cc:104
# When it shows
# ```
# No source file named src/turbomind/models/llama/BlockManager.cc.
# Make breakpoint pending on future shared library load? (y or [n])
# ```
# Just type `y` and press enter
# Run
(gdb) r
# (Optional) In another terminal, use https://github.com/InternLM/lmdeploy/blob/main/benchmark/profile_restful_api.py to send a request
python3 profile_restful_api.py --server_addr 127.0.0.1:23333 --tokenizer_path /workdir/Llama-2-13b-chat-hf --dataset /workdir/ShareGPT_V3_unfiltered_cleaned_split.json --concurrency 1 --num_prompts 1
````
## Using GDB
Refer to [GDB Execution Commands](https://lldb.llvm.org/use/map.html) and happy debugging.
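For reference, these are a few standard GDB commands that come in handy once a breakpoint in TurboMind is hit (plain GDB, nothing LMDeploy-specific; `some_variable` is a placeholder):
```bash
(gdb) bt                 # print the C++ backtrace of the current thread
(gdb) info threads       # list all threads
(gdb) thread 2           # switch to thread 2
(gdb) p some_variable    # print a variable in the current frame
(gdb) n                  # step over the next line
(gdb) s                  # step into a function call
(gdb) c                  # continue until the next breakpoint
```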
# Context length extrapolation
Long context extrapolation refers to the ability of an LLM to handle, at inference time, text longer than the sequences it was trained on. The TurboMind engine now supports [LlamaDynamicNTKScalingRotaryEmbedding](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L178), and the implementation is consistent with that of huggingface transformers.
## Usage
You can enable the context length extrapolation ability by modifying `TurbomindEngineConfig`: set `session_len` to the expected length and set `rope_scaling_factor` to a number no less than 1.0.
Here is an example:
```python
from lmdeploy import pipeline, GenerationConfig, TurbomindEngineConfig
backend_config = TurbomindEngineConfig(rope_scaling_factor=2.0, session_len=160000)
pipe = pipeline('internlm/internlm2-chat-7b', backend_config=backend_config)
prompt = 'Use a long prompt to replace this sentence'
gen_config = GenerationConfig(top_p=0.8,
                              top_k=40,
                              temperature=0.8,
                              max_new_tokens=1024)
response = pipe(prompt, gen_config=gen_config)
print(response)
```
## Evaluation
We use several methods to evaluate the long-context inference ability of LMDeploy, including [passkey retrieval](#passkey-retrieval), [needle in a haystack](#needle-in-a-haystack) and computing [perplexity](#perplexity).
### Passkey Retrieval
You can run the following code to test how many times LMDeploy can retrieve the pass key over several rounds.
```python
import numpy as np

from lmdeploy import TurbomindEngineConfig, pipeline

session_len = 160000
backend_config = TurbomindEngineConfig(rope_scaling_factor=2.0,
                                       session_len=session_len)
pipe = pipeline('internlm/internlm2-chat-7b', backend_config=backend_config)


def passkey_retrieval(session_len, n_round=5):
    # create long context input
    tok = pipe.tokenizer
    task_description = 'There is an important info hidden inside a lot of irrelevant text. Find it and memorize them. I will quiz you about the important information there.'
    garbage = 'The grass is green. The sky is blue. The sun is yellow. Here we go. There and back again.'

    for _ in range(n_round):
        # fill the context with garbage text and hide the pass key at a random position
        n_times = (session_len - 1000) // len(tok.encode(garbage))
        n_garbage_prefix = np.random.randint(0, n_times)
        n_garbage_suffix = n_times - n_garbage_prefix
        garbage_prefix = ' '.join([garbage] * n_garbage_prefix)
        garbage_suffix = ' '.join([garbage] * n_garbage_suffix)
        pass_key = np.random.randint(1, 50000)
        information_line = f'The pass key is {pass_key}. Remember it. {pass_key} is the pass key.'  # noqa: E501
        final_question = 'What is the pass key? The pass key is'
        lines = [
            task_description,
            garbage_prefix,
            information_line,
            garbage_suffix,
            final_question,
        ]
        # inference
        prompt = ' '.join(lines)
        response = pipe([prompt])
        print(pass_key, response)


passkey_retrieval(session_len, 5)
```
### Needle In A Haystack
[OpenCompass](https://github.com/open-compass/opencompass) offers very useful tools to perform needle-in-a-haystack evaluation. For specific instructions, please refer to the [guide](https://github.com/open-compass/opencompass/blob/main/docs/en/advanced_guides/needleinahaystack_eval.md).
### Perplexity
The following code demonstrates how to use LMDeploy to calculate perplexity.
```python
import numpy as np

from lmdeploy import TurbomindEngineConfig
from lmdeploy.turbomind import TurboMind
# load model and tokenizer
engine_config = TurbomindEngineConfig(rope_scaling_factor=2.0, session_len=160000)
engine = TurboMind.from_pretrained('internlm/internlm2-chat-7b', engine_config)
tokenizer = engine.tokenizer
generator = engine.create_instance()
# get perplexity
text = 'Use a long prompt to replace this sentence'
input_ids = tokenizer.encode(text)
loss = generator.get_ppl(input_ids)[0]
ppl = np.exp(loss)
```
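To evaluate more than a single sentence, the sketch below computes perplexity over a long document assembled with the HuggingFace `datasets` library. The dataset choice (`wikitext-2-raw-v1`) and the 100k-token truncation are illustrative assumptions, not LMDeploy recommendations.
```python
import numpy as np
from datasets import load_dataset

from lmdeploy import TurbomindEngineConfig
from lmdeploy.turbomind import TurboMind

engine_config = TurbomindEngineConfig(rope_scaling_factor=2.0, session_len=160000)
engine = TurboMind.from_pretrained('internlm/internlm2-chat-7b', engine_config)
tokenizer = engine.tokenizer
generator = engine.create_instance()

# Concatenate the test split into one long document and truncate it so that it
# fits within session_len. Both the dataset and the cut-off are assumptions.
dataset = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')
text = '\n\n'.join(row['text'] for row in dataset if row['text'].strip())
input_ids = tokenizer.encode(text)[:100000]

loss = generator.get_ppl(input_ids)[0]
print('perplexity:', np.exp(loss))
```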