Commit fe851fbc authored by zhouxiang

Add supplementary new files for version 0.2.6

parent e2d98ddc
import os
import allure
import pytest
from utils.config_utils import get_cuda_prefix_by_workerid
from utils.quantization_utils import quantization
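# W4A16 (AWQ) quantization regression: each model below is quantized via
# `lmdeploy lite auto_awq` (see utils.quantization_utils.quantization) and the
# resulting log is attached to the Allure report.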
model_list = [
'meta-llama/Llama-2-7b-chat', 'internlm/internlm-chat-20b',
'Qwen/Qwen-7B-Chat', 'Qwen/Qwen-14B-Chat', 'Qwen/Qwen-VL',
'internlm/internlm2-chat-20b', 'internlm/internlm2-20b',
'baichuan-inc/Baichuan2-7B-Chat'
]
@pytest.mark.order(3)
@pytest.mark.quantization_w4a16
@pytest.mark.timeout(900)
@pytest.mark.parametrize('model', model_list)
def test_quantization_w4a16(config, model, worker_id):
quantization_w4a16(config, model + '-inner-w4a16', model,
get_cuda_prefix_by_workerid(worker_id))
@pytest.mark.order(3)
@pytest.mark.quantization_w4a16
@pytest.mark.pr_test
@pytest.mark.flaky(reruns=0)
@pytest.mark.timeout(900)
@pytest.mark.parametrize(
'model, prefix',
[('internlm/internlm2-chat-20b', 'CUDA_VISIBLE_DEVICES=5')])
def test_quantization_w4a16_pr(config, model, prefix):
quantization_w4a16(config, model + '-inner-w4a16', model, prefix)
def quantization_w4a16(config, quantization_model_name, origin_model_name,
cuda_prefix):
quantization_type = 'w4a16'
result, msg = quantization(config, quantization_model_name,
origin_model_name, quantization_type,
cuda_prefix)
log_path = config.get('log_path')
quantization_log = os.path.join(
log_path, '_'.join([
'quantization', quantization_type,
quantization_model_name.split('/')[1]
]) + '.log')
allure.attach.file(quantization_log,
attachment_type=allure.attachment_type.TEXT)
assert result, msg
import os
import allure
import pytest
from utils.config_utils import get_cuda_prefix_by_workerid
from utils.quantization_utils import quantization
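# W8A8 quantization regression: each model below is quantized via
# `lmdeploy lite smooth_quant` and the resulting log is attached to the Allure report.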
model_list = [
'meta-llama/Llama-2-7b-chat', 'internlm/internlm-chat-20b',
'internlm/internlm2-chat-20b', 'internlm/internlm2-chat-7b',
'01-ai/Yi-6B-Chat', 'internlm/internlm2-20b'
]
@pytest.mark.order(2)
@pytest.mark.quantization_w8a8
@pytest.mark.timeout(900)
@pytest.mark.parametrize('model', model_list)
def test_quantization_w8a8(config, model, worker_id):
quantization_w8a8(config, model + '-inner-w8a8', model,
get_cuda_prefix_by_workerid(worker_id))
def quantization_w8a8(config, quantization_model_name, origin_model_name,
cuda_prefix):
quantization_type = 'w8a8'
result, msg = quantization(config, quantization_model_name,
origin_model_name, quantization_type,
cuda_prefix)
log_path = config.get('log_path')
quantization_log = os.path.join(
log_path, '_'.join([
'quantization', quantization_type,
quantization_model_name.split('/')[1]
]) + '.log')
allure.attach.file(quantization_log,
attachment_type=allure.attachment_type.TEXT)
assert result, msg
import os
import subprocess
from time import sleep, time
import allure
import pytest
from pytest import assume
from utils.config_utils import (get_cuda_prefix_by_workerid,
get_torch_model_list, get_workerid)
from utils.get_run_config import get_command_with_extra
from utils.run_client_chat import command_line_test
from utils.run_restful_chat import (get_model, health_check, interactive_test,
open_chat_test)
BASE_HTTP_URL = 'http://localhost'
DEFAULT_PORT = 23333
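# Launch an api_server with the PyTorch backend for every parametrized model,
# wait until health_check passes (up to about two minutes), run the chat cases,
# and kill the server process on teardown.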
@pytest.fixture(scope='function', autouse=True)
def prepare_environment(request, config, worker_id):
model_path = config.get('model_path')
log_path = config.get('log_path')
param = request.param
model = param['model']
cuda_prefix = param['cuda_prefix']
tp_num = param['tp_num']
if cuda_prefix is None:
cuda_prefix = get_cuda_prefix_by_workerid(worker_id, tp_num=tp_num)
worker_num = get_workerid(worker_id)
if worker_num is None:
port = DEFAULT_PORT
else:
port = DEFAULT_PORT + worker_num
    cmd = get_command_with_extra('lmdeploy serve api_server ' + model_path +
                                 '/' + model + ' --backend pytorch' +
                                 ' --server-port ' + str(port),
                                 config,
                                 model,
                                 need_tp=True,
                                 cuda_prefix=cuda_prefix)
print('reproduce command restful: ' + cmd)
start_log = os.path.join(log_path,
'start_restful_' + model.split('/')[1] + '.log')
with open(start_log, 'w') as f:
f.writelines('reproduce command restful: ' + cmd + '\n')
        # start the api_server
convertRes = subprocess.Popen([cmd],
stdout=f,
stderr=f,
shell=True,
text=True,
encoding='utf-8')
pid = convertRes.pid
allure.attach.file(start_log, attachment_type=allure.attachment_type.TEXT)
http_url = BASE_HTTP_URL + ':' + str(port)
start_time = int(time())
sleep(5)
for i in range(120):
sleep(1)
end_time = int(time())
total_time = end_time - start_time
result = health_check(http_url)
if result or total_time >= 120:
break
yield
if pid > 0:
kill_log = os.path.join(log_path,
'kill_' + model.split('/')[1] + '.log')
with open(kill_log, 'w') as f:
convertRes.kill()
allure.attach.file(kill_log, attachment_type=allure.attachment_type.TEXT)
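# Build parametrize entries (model, cuda_prefix, tp_num) for every chat-capable
# model returned by get_torch_model_list with the given tp size.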
def getModelList(tp_num):
return [{
'model': item,
'cuda_prefix': None,
'tp_num': tp_num
} for item in get_torch_model_list(tp_num) if 'chat' in item.lower()]
@pytest.mark.order(7)
@pytest.mark.usefixtures('common_case_config')
@pytest.mark.restful_api_pytorch
@pytest.mark.gpu_num_1
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('prepare_environment',
getModelList(tp_num=1),
indirect=True)
def test_restful_chat_tp1(config, common_case_config, worker_id):
if get_workerid(worker_id) is None:
run_all_step(config, common_case_config)
else:
run_all_step(config,
common_case_config,
worker_id=worker_id,
port=DEFAULT_PORT + get_workerid(worker_id))
@pytest.mark.order(7)
@pytest.mark.usefixtures('common_case_config')
@pytest.mark.restful_api_pytorch
@pytest.mark.gpu_num_2
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('prepare_environment',
getModelList(tp_num=2),
indirect=True)
def test_restful_chat_tp2(config, common_case_config, worker_id):
if get_workerid(worker_id) is None:
run_all_step(config, common_case_config)
else:
run_all_step(config,
common_case_config,
worker_id=worker_id,
port=DEFAULT_PORT + get_workerid(worker_id))
def run_all_step(config,
cases_info,
worker_id: str = 'default',
port: int = DEFAULT_PORT):
http_url = BASE_HTTP_URL + ':' + str(port)
model = get_model(http_url)
if model is None:
        assert False, 'server did not start correctly'
for case in cases_info.keys():
if (case == 'memory_test'
or case == 'emoji_case') and 'chat' not in model.lower():
continue
case_info = cases_info.get(case)
with allure.step(case + ' step1 - command chat regression'):
chat_result, chat_log, msg = command_line_test(
config, case, case_info, model + worker_id, 'api_client',
http_url)
if chat_log is not None:
allure.attach.file(chat_log,
attachment_type=allure.attachment_type.TEXT)
with assume:
assert chat_result, msg
with allure.step(case + ' step2 - restful_test - openai chat'):
restful_result, restful_log, msg = open_chat_test(
config, case_info, model, http_url, worker_id)
allure.attach.file(restful_log,
attachment_type=allure.attachment_type.TEXT)
with assume:
assert restful_result, msg
with allure.step(case + ' step3 - restful_test - interactive chat'):
active_result, interactive_log, msg = interactive_test(
config, case_info, model, http_url, worker_id)
allure.attach.file(interactive_log,
attachment_type=allure.attachment_type.TEXT)
with assume:
assert active_result, msg
import os
import subprocess
from time import sleep, time
import allure
import pytest
from pytest import assume
from utils.config_utils import (get_all_model_list,
get_cuda_prefix_by_workerid, get_workerid)
from utils.get_run_config import get_command_with_extra
from utils.run_client_chat import command_line_test
from utils.run_restful_chat import (get_model, health_check, interactive_test,
open_chat_test)
BASE_HTTP_URL = 'http://localhost'
DEFAULT_PORT = 23333
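# Same flow as the PyTorch restful test above, but the api_server is launched with
# the default (TurboMind) backend and extra --quant-policy / --model-format flags
# for kvint8 and w4a16/4bits models.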
@pytest.fixture(scope='function', autouse=True)
def prepare_environment(request, config, worker_id):
model_path = config.get('model_path')
log_path = config.get('log_path')
param = request.param
model = param['model']
cuda_prefix = param['cuda_prefix']
tp_num = param['tp_num']
if cuda_prefix is None:
cuda_prefix = get_cuda_prefix_by_workerid(worker_id, tp_num=tp_num)
worker_num = get_workerid(worker_id)
if worker_num is None:
port = DEFAULT_PORT
else:
port = DEFAULT_PORT + worker_num
cmd = get_command_with_extra('lmdeploy serve api_server ' + model_path +
'/' + model + ' --server-port ' + str(port),
config,
model,
need_tp=True,
cuda_prefix=cuda_prefix)
if 'kvint8' in model:
cmd += ' --quant-policy 4'
if 'w4' in model or '4bits' in model:
cmd += ' --model-format awq'
else:
cmd += ' --model-format hf'
    elif 'w4' in model or '4bits' in model:
cmd += ' --model-format awq'
start_log = os.path.join(log_path,
'start_restful_' + model.split('/')[1] + '.log')
print('reproduce command restful: ' + cmd)
with open(start_log, 'w') as f:
f.writelines('reproduce command restful: ' + cmd + '\n')
        # start the api_server
convertRes = subprocess.Popen([cmd],
stdout=f,
stderr=f,
shell=True,
text=True,
encoding='utf-8')
pid = convertRes.pid
allure.attach.file(start_log, attachment_type=allure.attachment_type.TEXT)
http_url = BASE_HTTP_URL + ':' + str(port)
start_time = int(time())
sleep(5)
for i in range(120):
sleep(1)
end_time = int(time())
total_time = end_time - start_time
result = health_check(http_url)
if result or total_time >= 120:
break
yield
if pid > 0:
kill_log = os.path.join(log_path,
'kill_' + model.split('/')[1] + '.log')
with open(kill_log, 'w') as f:
convertRes.kill()
allure.attach.file(kill_log, attachment_type=allure.attachment_type.TEXT)
def getModelList(tp_num):
return [{
'model': item,
'cuda_prefix': None,
'tp_num': tp_num
} for item in get_all_model_list(tp_num) if 'chat' in item.lower()]
@pytest.mark.order(7)
@pytest.mark.usefixtures('common_case_config')
@pytest.mark.restful_api
@pytest.mark.gpu_num_1
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('prepare_environment',
getModelList(tp_num=1),
indirect=True)
def test_restful_chat_tp1(request, config, common_case_config, worker_id):
if get_workerid(worker_id) is None:
run_all_step(config, common_case_config)
else:
run_all_step(config,
common_case_config,
worker_id=worker_id,
port=DEFAULT_PORT + get_workerid(worker_id))
@pytest.mark.order(7)
@pytest.mark.usefixtures('common_case_config')
@pytest.mark.restful_api
@pytest.mark.gpu_num_2
@pytest.mark.flaky(reruns=0)
@pytest.mark.parametrize('prepare_environment',
getModelList(tp_num=2),
indirect=True)
def test_restful_chat_tp2(config, common_case_config, worker_id):
if get_workerid(worker_id) is None:
run_all_step(config, common_case_config)
else:
run_all_step(config,
common_case_config,
worker_id=worker_id,
port=DEFAULT_PORT + get_workerid(worker_id))
@pytest.mark.order(7)
@pytest.mark.usefixtures('common_case_config')
@pytest.mark.restful_api
@pytest.mark.flaky(reruns=0)
@pytest.mark.pr_test
@pytest.mark.parametrize('prepare_environment', [{
'model': 'internlm/internlm2-chat-20b',
'cuda_prefix': 'CUDA_VISIBLE_DEVICES=5,6',
'tp_num': 2
}, {
'model': 'internlm/internlm2-chat-20b-inner-w4a16',
'cuda_prefix': 'CUDA_VISIBLE_DEVICES=5,6',
'tp_num': 2
}],
indirect=True)
def test_restful_chat_pr(config, common_case_config):
run_all_step(config, common_case_config)
def run_all_step(config,
cases_info,
worker_id: str = 'default',
port: int = DEFAULT_PORT):
http_url = BASE_HTTP_URL + ':' + str(port)
model = get_model(http_url)
if model is None:
        assert False, 'server did not start correctly'
for case in cases_info.keys():
if (case == 'memory_test'
or case == 'emoji_case') and 'chat' not in model.lower():
continue
case_info = cases_info.get(case)
with allure.step(case + ' step1 - command chat regression'):
chat_result, chat_log, msg = command_line_test(
config, case, case_info, model, 'api_client', http_url)
allure.attach.file(chat_log,
attachment_type=allure.attachment_type.TEXT)
with assume:
assert chat_result, msg
with allure.step(case + ' step2 - restful_test - openai chat'):
restful_result, restful_log, msg = open_chat_test(
config, case_info, model, http_url)
allure.attach.file(restful_log,
attachment_type=allure.attachment_type.TEXT)
with assume:
assert restful_result, msg
with allure.step(case + ' step3 - restful_test - interactive chat'):
active_result, interactive_log, msg = interactive_test(
config, case_info, model, http_url)
allure.attach.file(interactive_log,
attachment_type=allure.attachment_type.TEXT)
with assume:
assert active_result, msg
import os
import yaml
from utils.get_run_config import get_tp_num
def get_turbomind_model_list(tp_num: int = None):
config_path = os.path.join('autotest/config.yaml')
with open(config_path) as f:
config = yaml.load(f.read(), Loader=yaml.SafeLoader)
case_list = config.get('turbomind_model')
quatization_case_config = config.get('quatization_case_config')
for key in quatization_case_config.get('w4a16'):
case_list.append(key + '-inner-w4a16')
for key in quatization_case_config.get('kvint8'):
case_list.append(key + '-inner-kvint8')
for key in quatization_case_config.get('kvint8_w4a16'):
case_list.append(key + '-inner-kvint8-w4a16')
if tp_num is not None:
return [
item for item in case_list if get_tp_num(config, item) == tp_num
]
return case_list
def get_torch_model_list(tp_num: int = None):
config_path = os.path.join('autotest/config.yaml')
with open(config_path) as f:
config = yaml.load(f.read(), Loader=yaml.SafeLoader)
case_list = config.get('pytorch_model')
quatization_case_config = config.get('quatization_case_config')
for key in quatization_case_config.get('w8a8'):
case_list.append(key + '-inner-w8a8')
if tp_num is not None:
return [
item for item in case_list if get_tp_num(config, item) == tp_num
]
return case_list
def get_all_model_list(tp_num: int = None):
config_path = os.path.join('autotest/config.yaml')
with open(config_path) as f:
config = yaml.load(f.read(), Loader=yaml.SafeLoader)
case_list = config.get('turbomind_model')
for key in config.get('pytorch_model'):
if key not in case_list:
case_list.append(key)
quatization_case_config = config.get('quatization_case_config')
for key in quatization_case_config.get('w4a16'):
case_list.append(key + '-inner-w4a16')
for key in quatization_case_config.get('kvint8'):
case_list.append(key + '-inner-kvint8')
for key in quatization_case_config.get('kvint8_w4a16'):
case_list.append(key + '-inner-kvint8-w4a16')
if tp_num is not None:
return [
item for item in case_list if get_tp_num(config, item) == tp_num
]
return case_list
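# Map a pytest-xdist worker id such as 'gw3' to a CUDA_VISIBLE_DEVICES prefix,
# allocating tp_num consecutive GPU ids per worker (only tp 1 and 2 are handled).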
def get_cuda_prefix_by_workerid(worker_id, tp_num: int = 1):
if worker_id is None or 'gw' not in worker_id:
return None
else:
if tp_num == 1:
return 'CUDA_VISIBLE_DEVICES=' + worker_id.replace('gw', '')
elif tp_num == 2:
cuda_num = int(worker_id.replace('gw', '')) * 2
return 'CUDA_VISIBLE_DEVICES=' + ','.join(
[str(cuda_num), str(cuda_num + 1)])
def get_cuda_id_by_workerid(worker_id, tp_num: int = 1):
if worker_id is None or 'gw' not in worker_id:
return None
else:
if tp_num == 1:
return worker_id.replace('gw', '')
elif tp_num == 2:
cuda_num = int(worker_id.replace('gw', '')) * 2
return ','.join([str(cuda_num), str(cuda_num + 1)])
def get_workerid(worker_id):
if worker_id is None or 'gw' not in worker_id:
return None
else:
return int(worker_id.replace('gw', ''))
import random
from time import sleep
import torch
from lmdeploy.model import MODELS
def get_conda_allcate_prefix(config, model):
cuda_prefix = ''
tp_num = get_tp_num(config, model)
if tp_num is None:
return cuda_prefix
    available_cuda = _get_available_cuda()
if len(available_cuda) < tp_num:
raise torch.cuda.OutOfMemoryError
cuda_prefix = 'CUDA_VISIBLE_DEVICES=' + ','.join(
random.sample(available_cuda, tp_num))
torch.cuda.empty_cache()
return cuda_prefix
def get_tp_config(config, model, need_tp):
    tp_num = get_tp_num(config, model)
    tp_info = ''
    if need_tp and tp_num is not None:
        tp_info = '--tp ' + str(tp_num)
    return tp_info
def get_tp_num(config, model):
tp_config = config.get('tp_config')
tp_num = 1
if tp_config is None:
return None
model_name = _simple_model_name(model)
if model_name in tp_config.keys():
tp_num = tp_config.get(model_name)
return tp_num
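# Prepend a CUDA_VISIBLE_DEVICES prefix and append --tp to an lmdeploy command.
# A short random sleep staggers concurrent workers before the free-GPU lookup.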
def get_command_with_extra(cmd,
config,
model,
need_tp: bool = False,
cuda_prefix: str = None,
need_sleep: bool = True):
if need_sleep:
sleep(random.uniform(0, 5))
if cuda_prefix is None:
cuda_prefix = get_conda_allcate_prefix(config, model)
tp_config = get_tp_config(config, model, need_tp)
if cuda_prefix is not None and len(cuda_prefix) > 0:
cmd = ' '.join([cuda_prefix, cmd])
if tp_config is not None and len(tp_config) > 0:
cmd = ' '.join([cmd, tp_config])
torch.cuda.empty_cache()
return cmd
def get_model_name(model):
model_names = [
'llama', 'llama2', 'internlm', 'internlm2', 'baichuan2', 'chatglm2',
'falcon', 'yi', 'qwen1.5'
]
model_names += list(MODELS.module_dict.keys())
model_names.sort()
model_name = _simple_model_name(model)
model_name = model_name.lower()
if model_name in model_names:
return model_name
model_name = model_name.replace('-chat', '')
model_name = model_name.replace('-v0.1', '')
if model_name in model_names:
return model_name
if (model_name == 'qwen-vl'):
return 'qwen-7b'
if ('llama-2' in model_name):
return 'llama-2'
return model_name.split('-')[0]
def _get_available_cuda():
devices = torch.cuda.device_count()
available_cuda = []
for i in range(devices):
if (torch.cuda.utilization(i) > 5):
continue
if ('no processes are running'
not in torch.cuda.list_gpu_processes(i)):
continue
available_cuda.append(str(i))
return available_cuda
def _simple_model_name(model):
if '/' in model:
model_name = model.split('/')[1]
else:
model_name = model
model_name = model_name.replace('-inner-w4a16', '')
model_name = model_name.replace('-inner-w8a8', '')
model_name = model_name.replace('-inner-kvint8', '')
model_name = model_name.replace('-w4a16', '')
return model_name
def _split_model_name(model):
model_name = model.split('/')[1]
return model_name
if __name__ == '__main__':
print(_simple_model_name('baichuan-inc/Baichuan2-7B-Chat-inner-w4a16'))
import os
import allure
from pytest import assume
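# Write (or append) a 'result:..., reason:...' line to a per-case log file under
# log_path; assert_log later re-reads that file and asserts on the outcome.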
def write_log(config,
result,
msg,
is_new: bool = True,
case_path_tag: str = 'default'):
try:
log_path = os.path.join(config.get('log_path'), case_path_tag)
if is_new:
file = open(log_path, 'w')
else:
file = open(log_path, 'a')
        file.writelines('result:' + str(result) + ', reason:' + msg + '\n')
file.close()
except Exception as e:
return False, None, f'Unknown error: {e}'
def assert_log(config, case_path_tag: str = 'default'):
    log_path = os.path.join(config.get('log_path'), case_path_tag)
    result = False
    msg = ''
    with open(log_path, 'r') as f:
        lines = f.readlines()
        for line in lines:
            if 'result:False, reason:' in line:
                result = False
                msg = line
                break
            if 'result:True, reason:' in line and result is False:
                result = True
allure.attach.file(log_path, attachment_type=allure.attachment_type.TEXT)
with assume:
assert result, msg
import os
import allure
import torch
from pytest import assume
from utils.get_run_config import get_model_name, get_tp_num
from utils.rule_condition_assert import assert_result
from lmdeploy import pipeline
from lmdeploy.messages import (GenerationConfig, PytorchEngineConfig,
TurbomindEngineConfig)
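# Run every case through lmdeploy.pipeline with the backend selected by `type`
# ('pytorch' uses PytorchEngineConfig, anything else TurbomindEngineConfig), writing
# prompts, outputs and per-case verdicts to a log that assert_pipeline_chat_log
# re-reads and asserts on.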
def run_pipeline_chat_test(config, cases_info, model_case, type):
log_path = config.get('log_path')
tp = get_tp_num(config, model_case)
    model_name = get_model_name(model_case)
model_path = config.get('model_path')
hf_path = model_path + '/' + model_case
print(' '.join([
'reproduce command:', 'python',
'autotest/tools/pipeline/pipeline_chat_script.py', type, model_case,
str(tp)
]))
if 'pytorch' == type:
backend_config = PytorchEngineConfig(tp=tp)
else:
if 'kvint8' in model_case and ('w4' in model_case
or '4bits' in model_case):
backend_config = TurbomindEngineConfig(tp=tp,
model_format='awq',
quant_policy=4)
elif 'kvint8' in model_case:
backend_config = TurbomindEngineConfig(tp=tp,
model_format='hf',
quant_policy=4)
elif 'w4' in model_case or '4bits' in model_case:
backend_config = TurbomindEngineConfig(tp=tp, model_format='awq')
else:
backend_config = TurbomindEngineConfig(tp=tp)
pipe = pipeline(hf_path, backend_config=backend_config)
# run testcases
    gen_config = GenerationConfig(temperature=0.01)
for case in cases_info.keys():
if (case == 'memory_test'
or case == 'emoji_case') and 'chat' not in model_case.lower():
continue
case_info = cases_info.get(case)
pipeline_chat_log = os.path.join(
log_path,
'pipeline_chat_' + model_case.split('/')[1] + '_' + case + '.log')
file = open(pipeline_chat_log, 'w')
prompts = []
for prompt_detail in case_info:
prompt = list(prompt_detail.keys())[0]
if 'chat' not in model_case.lower(): # base model
prompts.append(prompt)
else: # chat model
prompts.append({'role': 'user', 'content': prompt})
file.writelines('prompt:' + prompt + '\n')
if 'chat' not in model_case.lower(): # base model
response = pipe(prompts, gen_config=gen_config)[-1].text
else: # chat model
response = pipe([prompts], gen_config=gen_config)[0].text
case_result, reason = assert_result(response,
prompt_detail.values(),
model_name)
if 'chat' in model_case.lower():
prompts.append({'role': 'assistant', 'content': response})
file.writelines('output:' + response + '\n')
file.writelines('result:' + str(case_result) + ', reason:' +
reason + '\n')
file.close()
del pipe
torch.cuda.empty_cache()
def assert_pipeline_chat_log(config, cases_info, model_case):
log_path = config.get('log_path')
for case in cases_info.keys():
if (case == 'memory_test'
or case == 'emoji_case') and 'chat' not in model_case.lower():
continue
msg = ''
result = False
with allure.step('case - ' + case):
pipeline_chat_log = os.path.join(
log_path, 'pipeline_chat_' + model_case.split('/')[1] + '_' +
case + '.log')
with open(pipeline_chat_log, 'r') as f:
lines = f.readlines()
for line in lines:
if 'result:False, reason:' in line:
result = False
msg = line
break
if 'result:True, reason:' in line and result is False:
result = True
allure.attach.file(pipeline_chat_log,
attachment_type=allure.attachment_type.TEXT)
with assume:
assert result, msg
import os
import subprocess
from subprocess import PIPE
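# Run the lmdeploy lite command matching quantization_type (auto_awq, smooth_quant
# or calibrate) on origin_model_name after removing any stale work dir, and write
# the full output to a dedicated log file.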
def quantization(config,
quantization_model_name,
origin_model_name,
quantization_type: str = 'w4a16',
cuda_prefix: str = 'CUDA_VISIBLE_DEVICES=0'):
model_path = config.get('model_path')
log_path = config.get('log_path')
origin_model_path = config.get('model_path') + '/' + origin_model_name
quantization_model_path = model_path + '/' + quantization_model_name
quantization_log = os.path.join(
log_path, '_'.join([
'quantization', quantization_type,
quantization_model_name.split('/')[1]
]) + '.log')
if quantization_type == 'w4a16':
quantization_cmd = ' '.join([
cuda_prefix, 'lmdeploy lite auto_awq', origin_model_path,
'--work-dir', quantization_model_path
])
elif quantization_type == 'w8a8':
quantization_cmd = ' '.join([
cuda_prefix, 'lmdeploy lite smooth_quant', origin_model_path,
'--work-dir', quantization_model_path
])
elif quantization_type == 'kvint8':
quantization_cmd = ' '.join([
cuda_prefix, 'lmdeploy lite calibrate', origin_model_path,
'--work-dir', quantization_model_path
])
else:
        return False, ('quantization type should be in [w4a16, w8a8, kvint8], '
                       'now the type is ' + quantization_type)
with open(quantization_log, 'w') as f:
# remove existing folder
subprocess.run([' '.join(['rm -rf', quantization_model_path])],
stdout=f,
stderr=f,
shell=True,
text=True,
encoding='utf-8')
if quantization_type == 'kvint8':
cp_cmd = ' '.join(
['cp -r', origin_model_path, quantization_model_path])
f.writelines('reproduce command quantization_cmd: ' + cp_cmd +
'\n')
print('reproduce command quantization_cmd: ' + cp_cmd)
subprocess.run([cp_cmd],
stdout=f,
stderr=f,
shell=True,
text=True,
encoding='utf-8')
f.writelines('reproduce command quantization_cmd: ' +
quantization_cmd + '\n')
print('reproduce command quantization_cmd: ' + quantization_cmd)
# quantization
quantizationRes = subprocess.run([quantization_cmd],
stdout=f,
stderr=PIPE,
shell=True,
text=True,
encoding='utf-8')
f.writelines(quantizationRes.stderr)
result = quantizationRes.returncode == 0
return result, quantizationRes.stderr
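# Assertions on the payloads returned by the api_server chat-completion and
# interactive-chat endpoints, covering both batch and streaming responses.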
def assert_chat_completions_batch_return(output, model_name):
assert output.get('usage').get('prompt_tokens') > 0
assert output.get('usage').get('total_tokens') > 0
assert output.get('usage').get('completion_tokens') > 0
assert output.get('usage').get('completion_tokens') + output.get(
'usage').get('prompt_tokens') == output.get('usage').get(
'total_tokens')
assert output.get('id') is not None
assert output.get('object') == 'chat.completion'
assert output.get('model') == model_name
output_message = output.get('choices')
assert len(output_message) == 1
for message in output_message:
assert message.get('finish_reason') in ['stop', 'length']
assert message.get('index') == 0
assert len(message.get('message').get('content')) > 0
assert message.get('message').get('role') == 'assistant'
def assert_chat_completions_stream_return(output,
model_name,
is_first: bool = False,
is_last: bool = False):
assert output.get('id') is not None
if is_first is False:
assert output.get('object') == 'chat.completion.chunk'
assert output.get('model') == model_name
output_message = output.get('choices')
assert len(output_message) == 1
for message in output_message:
assert message.get('delta').get('role') == 'assistant'
assert message.get('index') == 0
if is_last is False:
assert message.get('finish_reason') is None
if is_first is False and is_last is False:
assert len(message.get('delta').get('content')) >= 0
if is_last is True:
assert len(message.get('delta').get('content')) == 0
assert message.get('finish_reason') in ['stop', 'length']
def assert_chat_interactive_batch_return(output):
assert output.get('input_tokens') > 0
assert output.get('tokens') > 0
assert output.get('history_tokens') >= 0
assert output.get('finish_reason') in ['stop', 'length']
assert len(output.get('text')) > 0
def assert_chat_interactive_stream_return(output,
is_last: bool = False,
is_text_empty: bool = False,
index: int = None):
assert output.get('input_tokens') > 0
if index is not None:
assert output.get('tokens') >= index and output.get(
'tokens') <= index + 6
assert output.get('tokens') > 0
assert output.get('history_tokens') >= 0
if is_last:
assert len(output.get('text')) >= 0
assert output.get('finish_reason') in ['stop', 'length']
elif is_text_empty:
assert len(output.get('text')) == 0
assert output.get('finish_reason') is None
else:
assert len(output.get('text')) >= 0
assert output.get('finish_reason') is None
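# Check a model response against a list of rule dicts ('contain', 'not_contain',
# 'len_g'); a dict keyed by the model name substitutes model-specific rules.
# Returns (passed, reason).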
def assert_result(input, rule_condition, model_name):
input = input.replace('\n', '\\n')
input_lower = input.lower()
for dict in rule_condition:
if dict is None:
return True, ''
for rule in dict:
operator = list(rule.keys())[0]
value = list(rule.values())[0]
if model_name is not None and model_name == operator:
dict = value
for rule in dict:
operator = list(rule.keys())[0]
value = list(rule.values())[0]
if operator == 'contain':
if isinstance(value, list):
tmpResult = False
for word in value:
if word.lower() in input_lower:
tmpResult = True
if tmpResult is False:
return False, ','.join(
value) + " doesn't exist in " + input
else:
if value.lower() not in input_lower:
msg = value + " doesn't exist in:" + input
return False, msg
if operator == 'not_contain':
if isinstance(value, list):
for word in value:
if word.lower() in input_lower:
msg = word + " shouldn't exist in:" + input
return False, msg
else:
if value.lower() in input_lower:
msg = value + " shouldn't exist in " + input
return False, msg
if operator == 'len_g':
if len(input) < int(value):
return False, input + ' length: ' + str(
len(input)) + ', should greater than ' + str(value)
return True, ''
if __name__ == '__main__':
input = '成都的景点hot potdddd'
condition = ([[{
'contain': ['hot pot']
}, {
'contain': ['。']
}, {
        'len_g': 10
}]])
    print(assert_result(input, condition, None))
import os
from subprocess import PIPE, Popen
from utils.get_run_config import get_command_with_extra, get_model_name
from utils.rule_condition_assert import assert_result
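# Build the lmdeploy chat / serve client command for a case and drive it through
# stdin (prompts joined with the model's turn separator, ending with 'exit'),
# validating each answer with assert_result.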
def command_line_test(config,
case,
case_info,
model_case,
type,
extra: str = None,
cuda_prefix: str = None):
dst_path = config.get('dst_path')
if type == 'api_client':
cmd = 'lmdeploy serve api_client ' + extra
elif type == 'triton_client':
cmd = 'lmdeploy serve triton_client ' + extra
else:
cmd = get_command_with_extra('lmdeploy chat turbomind ' + dst_path +
'/workspace_' + model_case,
config,
model_case,
cuda_prefix=cuda_prefix)
if 'kvint8' in model_case:
cmd += ' --quant-policy 4'
if 'w4' in model_case or '4bits' in model_case:
cmd += ' --model-format awq'
else:
cmd += ' --model-format hf'
elif 'w4' in model_case or '4bits' in model_case:
cmd += ' --model-format awq'
if 'chat' not in model_case.lower():
cmd += ' --cap completion'
return command_test(config, [cmd], model_case, case, case_info,
type == 'turbomind')
def hf_command_line_test(config,
case,
case_info,
model_case,
type,
cuda_prefix: str = None):
model_path = config.get('model_path') + '/' + model_case
cmd = get_command_with_extra(' '.join(['lmdeploy chat', type, model_path]),
config,
model_case,
need_tp=True,
cuda_prefix=cuda_prefix)
if 'kvint8' in model_case:
cmd += ' --quant-policy 4'
if 'w4' in model_case or '4bits' in model_case:
cmd += ' --model-format awq'
else:
cmd += ' --model-format hf'
elif 'w4' in model_case or '4bits' in model_case:
cmd += ' --model-format awq'
return command_test(config, [cmd], model_case,
'_'.join(['hf', type, case]), case_info, True)
def command_test(config, cmd, model, case, case_info, need_extract_output):
if 'memory_test' in case and 'chat' not in model.lower():
return True, None, 'memory case skipped for base model'
try:
log_path = config.get('log_path')
model_name = get_model_name(model)
if '/' in model:
chat_log = os.path.join(
log_path, 'chat_' + model.split('/')[1] + '_' + case + '.log')
else:
chat_log = os.path.join(log_path,
'chat_' + model + '_' + case + '.log')
file = open(chat_log, 'w')
returncode = -1
result = True
print('reproduce command chat: ' + ' '.join(cmd) + '\n')
file.writelines('reproduce command chat: ' + ' '.join(cmd) + '\n')
spliter = '\n\n'
if 'CodeLlama-7b-Instruct-hf' in model:
spliter = '\n!!\n'
# join prompt together
prompt = ''
for item in case_info:
prompt += list(item.keys())[0] + spliter
prompt += 'exit' + spliter
msg = ''
with Popen(cmd,
stdin=PIPE,
stdout=PIPE,
stderr=PIPE,
shell=True,
text=True,
encoding='utf-8') as proc:
# file.writelines('prompt:' + prompt + '\n')
outputs, errors = proc.communicate(input=prompt)
returncode = proc.returncode
if returncode != 0:
file.writelines('error:' + errors + '\n')
result = False
return result, chat_log, errors
outputDialogs = parse_dialogue(outputs, model)
file.writelines('answersize:' + str(len(outputDialogs)) + '\n')
            # check each answer against the case rules
index = 0
for prompt_detail in case_info:
if need_extract_output:
output = extract_output(outputDialogs[index], model)
else:
output = outputDialogs[index]
case_result, reason = assert_result(output,
prompt_detail.values(),
model_name)
file.writelines('prompt:' + list(prompt_detail.keys())[0] +
'\n')
file.writelines('output:' + output + '\n')
file.writelines('result:' + str(case_result) + ',reason:' +
reason + '\n')
index += 1
if case_result is False:
msg = reason
result = result & case_result
file.close()
return result, chat_log, msg
except Exception as e:
return False, None, f'Unknown error: {e}'
# parse the dialogue turns produced by the model from the raw CLI output
def parse_dialogue(inputs: str, model: str):
dialogues = inputs.strip()
if 'CodeLlama-7b-Instruct-hf' in model:
sep = 'enter !! to end the input >>>'
else:
sep = 'double enter to end input >>>'
dialogues = dialogues.strip()
dialogues = dialogues.split(sep)
dialogues = [d.strip() for d in dialogues]
    return dialogues[1:-1]  # drop the leading banner and the trailing exit segment
def extract_output(output: str, model: str):
if 'Qwen' in model or 'internlm2' in model:
if len(output.split('<|im_start|>assistant')) >= 2:
return output.split('<|im_start|>assistant')[1]
if 'Baichuan2' in model:
if len(output.split('<reserved_107>')) >= 2:
return output.split('<reserved_107>')[1]
if 'internlm' in model:
if len(output.split('<|Bot|>: ')) >= 2:
return output.split('<|Bot|>: ')[1]
if 'llama' in model or 'Llama' in model:
if len(output.split('[/INST]')) >= 2:
return output.split('[/INST]')[1]
return output
import os
import random
import string
from utils.rule_condition_assert import assert_result
from lmdeploy.serve.openai.api_client import APIClient
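# Multi-turn chat against a running api_server through APIClient.chat_completions_v1;
# each answer is checked with assert_result and written to a per-worker log.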
def open_chat_test(config, case_info, model, url, worker_id: str = 'default'):
log_path = config.get('log_path')
restful_log = os.path.join(log_path,
'restful_' + model + '_' + worker_id + '.log')
file = open(restful_log, 'w')
result = True
api_client = APIClient(url)
model_name = api_client.available_models[0]
messages = []
msg = ''
for prompt_detail in case_info:
if result is False:
break
prompt = list(prompt_detail.keys())[0]
messages.append({'role': 'user', 'content': prompt})
file.writelines('prompt:' + prompt + '\n')
for output in api_client.chat_completions_v1(model=model_name,
messages=messages,
temperature=0.01):
output_message = output.get('choices')[0].get('message')
messages.append(output_message)
output_content = output_message.get('content')
file.writelines('output:' + output_content + '\n')
case_result, reason = assert_result(output_content,
prompt_detail.values(),
model_name)
file.writelines('result:' + str(case_result) + ',reason:' +
reason + '\n')
            if case_result is False:
                msg += reason
            result = result & case_result
file.close()
return result, restful_log, msg
def interactive_test(config,
case_info,
model,
url,
worker_id: str = 'default'):
log_path = config.get('log_path')
interactive_log = os.path.join(
log_path, 'interactive_' + model + '_' + worker_id + '.log')
file = open(interactive_log, 'w')
result = True
api_client = APIClient(url)
file.writelines('available_models:' +
','.join(api_client.available_models) + '\n')
# Randomly generate 6 characters and concatenate them into a string.
characters = string.digits
random_chars = ''.join(random.choice(characters) for i in range(6))
messages = []
msg = ''
for prompt_detail in case_info:
prompt = list(prompt_detail.keys())[0]
new_prompt = {'role': 'user', 'content': prompt}
messages.append(new_prompt)
file.writelines('prompt:' + prompt + '\n')
for output in api_client.chat_interactive_v1(prompt=prompt,
interactive_mode=True,
session_id=random_chars,
temperature=0.01):
output_content = output.get('text')
file.writelines('output:' + output_content + '\n')
case_result, reason = assert_result(output_content,
prompt_detail.values(), model)
file.writelines('result:' + str(case_result) + ',reason:' +
reason + '\n')
            if case_result is False:
                msg += reason
            result = result & case_result
file.close()
return result, interactive_log, msg
def health_check(url):
try:
api_client = APIClient(url)
model_name = api_client.available_models[0]
messages = []
messages.append({'role': 'user', 'content': '你好'})
for output in api_client.chat_completions_v1(model=model_name,
messages=messages,
temperature=0.01):
if output.get('code') is not None and output.get('code') != 0:
return False
return True
except Exception:
return False
def get_model(url):
try:
api_client = APIClient(url)
model_name = api_client.available_models[0]
return model_name
except Exception:
return None
#!/bin/bash
dataset_path="benchmark/ShareGPT_V3_unfiltered_cleaned_split.json"
########################################## PyTorch engine: fp16 or bf16 ##########################################
## 7B
tp=1
max_batch_size=256
cache_max_entry_count=0.95
model_path="/workspace/models-140/llama2/huggingface/llama-2-7b-chat"
CUDA_VISIBLE_DEVICES="6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --backend pytorch --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --csv llama2_pt_7b_thr.csv
## 13B
tp=1
max_batch_size=256
cache_max_entry_count=0.9
model_path="/workspace/models-140/llama2/huggingface/llama-2-13b-chat"
CUDA_VISIBLE_DEVICES="6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --backend pytorch --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --csv llama2_pt_7b_thr.csv
# 20B
tp=2
max_batch_size=256
cache_max_entry_count=0.9
model_path="/workspace/models-140/InternLM/internlm-chat-20b"
CUDA_VISIBLE_DEVICES="5,6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --backend pytorch --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --csv llama2_pt_7b_thr.csv
# 70B
tp=4
max_batch_size=256
cache_max_entry_count=0.9
model_path="/workspace/models-140/llama2/huggingface/llama-2-70b-chat-hf"
CUDA_VISIBLE_DEVICES="4,5,6,7" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --backend pytorch --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --csv llama2_pt_7b_thr.csv
########################################## PyTorch engine: w8a8 ##########################################
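# No w8a8 benchmark run is scripted here yet. As a rough sketch only (the model path
# and csv name below are assumptions, not part of this script), a smooth_quant-ed
# model directory could be profiled the same way as the fp16 runs above:
# tp=1
# max_batch_size=256
# cache_max_entry_count=0.95
# model_path="/workspace/models/quantization/llama-2-7b-chat-w8a8"
# CUDA_VISIBLE_DEVICES="6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --backend pytorch --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --csv llama2_pt_7b_w8a8_thr.csv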
#!/bin/bash
dataset_path="benchmark/ShareGPT_V3_unfiltered_cleaned_split.json"
########################################## TurboMind engine: fp16 or bf16 ##########################################
# 7B. gemm_tune -> profile_throughput
tp=1
max_batch_size=256
cache_max_entry_count=0.95
model_path="/workspace/models-140/llama2/huggingface/llama-2-7b-chat"
CUDA_VISIBLE_DEVICES="6" python3 -m lmdeploy.turbomind.generate_gemm_config --tensor-para-size ${tp} --max-batch-size ${max_batch_size} --model-path ${model_path}
CUDA_VISIBLE_DEVICES="6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --csv llama2_tb_7b_thr.csv
rm gemm_config.in
# 13B. gemm_tune -> profile_throughput
tp=1
max_batch_size=256
cache_max_entry_count=0.9
model_path="/workspace/models-140/llama2/huggingface/llama-2-13b-chat"
CUDA_VISIBLE_DEVICES="6" python3 -m lmdeploy.turbomind.generate_gemm_config --tensor-para-size ${tp} --max-batch-size ${max_batch_size} --model-path ${model_path}
CUDA_VISIBLE_DEVICES="6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --csv llama2_tb_13b_thr.csv
rm gemm_config.in
# 20B. gemm_tune -> profile_throughput
tp=2
max_batch_size=256
cache_max_entry_count=0.9
model_path="/workspace/models-140/InternLM/internlm-chat-20b"
CUDA_VISIBLE_DEVICES="5,6" python3 -m lmdeploy.turbomind.generate_gemm_config --tensor-para-size ${tp} --max-batch-size ${max_batch_size} --model-path ${model_path}
CUDA_VISIBLE_DEVICES="5,6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --csv internlm_tb_20b_thr.csv
rm gemm_config.in
# 70B
tp=4
max_batch_size=256
cache_max_entry_count=0.9
model_path="/workspace/models-140/llama2/huggingface/llama-2-70b-chat-hf"
CUDA_VISIBLE_DEVICES="4,5,6,7" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --csv llama2_tb_70b_thr.csv
# ########################################## TurboMind engine: w4a16 ##########################################
# 7B
tp=1
max_batch_size=256
cache_max_entry_count=0.95
model_path="/workspace/models/quantization/llama-2-7b-chat-4bit"
CUDA_VISIBLE_DEVICES="6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --model-format awq --num-prompts 10000 --csv llama2_tb_7b_4bit_thr.csv
# 13B
tp=1
max_batch_size=256
cache_max_entry_count=0.9
model_path="/workspace/models/quantization/llama-2-13b-chat-4bit"
CUDA_VISIBLE_DEVICES="6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --model-format awq --num-prompts 10000 --csv llama2_tb_13b_4bit_thr.csv
# 20B
tp=2
max_batch_size=256
cache_max_entry_count=0.9
model_path="/workspace/models/quantization/internlm-chat-20b-4bit"
CUDA_VISIBLE_DEVICES="5,6" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --model-format awq --num-prompts 10000 --csv internlm_tb_20b_4bit_thr.csv
# 70B
tp=4
max_batch_size=256
cache_max_entry_count=0.9
model_path="/workspace/models/quantization/llama-2-70b-chat-hf-4bit"
CUDA_VISIBLE_DEVICES="4,5,6,7" python3 benchmark/profile_throughput.py ${dataset_path} ${model_path} --tp ${tp} --concurrency ${max_batch_size} --cache-max-entry-count ${cache_max_entry_count} --model-format awq --num-prompts 10000 --csv llama2_tb_70b_4bit_thr.csv
version: 2
formats: all
build:
os: "ubuntu-22.04"
tools:
python: "3.8"
sphinx:
configuration: docs/en/conf.py
python:
install:
- requirements: requirements/docs.txt
- requirements: requirements/readthedocs.txt
# Customized chat template
The effect of the applied chat template can be observed by **setting the log level** to `INFO`.
LMDeploy supports two methods of adding chat templates:
- One approach is to utilize an existing conversation template by directly configuring a JSON file like the following.
```json
{
"model_name": "your awesome chat template name",
"system": "<|im_start|>system\n",
"meta_instruction": "You are a robot developed by LMDeploy.",
"eosys": "<|im_end|>\n",
"user": "<|im_start|>user\n",
"eoh": "<|im_end|>\n",
"assistant": "<|im_start|>assistant\n",
"eoa": "<|im_end|>",
"separator": "\n",
"capability": "chat",
"stop_words": ["<|im_end|>"]
}
```
`model_name` is a required field and can be either the name of an LMDeploy built-in chat template (which can be viewed through `lmdeploy list`), or a new name. Other fields are optional.
1. When `model_name` is the name of a built-in chat template, the non-null fields in the JSON file will override the corresponding attributes of the original chat template.
2. However, when `model_name` is a new name, it will register `BaseChatTemplate` directly as a new chat template. The specific definition can be referred to [BaseChatTemplate](https://github.com/InternLM/lmdeploy/blob/24bd4b9ab6a15b3952e62bcfc72eaba03bce9dcb/lmdeploy/model.py#L113-L188).
The new chat template would be like this:
```
{system}{meta_instruction}{eosys}{user}{user_content}{eoh}{assistant}{assistant_content}{eoa}{separator}{user}...
```
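For instance, with the JSON file above, a single-turn conversation in which the user says "hi" would be rendered roughly as follows (the assistant line is only a placeholder for the generated reply):

```
<|im_start|>system
You are a robot developed by LMDeploy.<|im_end|>
<|im_start|>user
hi<|im_end|>
<|im_start|>assistant
I am a robot developed by LMDeploy.<|im_end|>
```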
When using the CLI tool, you can pass in a custom chat template with `--chat-template`, for example:
```shell
lmdeploy serve api_server internlm/internlm2-chat-7b --chat-template ${JSON_FILE}
```
You can also pass it in through the interface function, for example:
```python
from lmdeploy import ChatTemplateConfig, serve
serve('internlm/internlm2-chat-7b',
chat_template_config=ChatTemplateConfig.from_json('${JSON_FILE}'))
```
- Another approach is to customize a Python chat template class like the existing LMDeploy chat templates. It can be used directly after successful registration. The advantages are a high degree of customization and strong controllability. Below is an example of registering an LMDeploy chat template.
```python
from lmdeploy.model import MODELS, BaseChatTemplate
@MODELS.register_module(name='customized_model')
class CustomizedModel(BaseChatTemplate):
"""A customized chat template."""
def __init__(self,
system='<|im_start|>system\n',
meta_instruction='You are a robot developed by LMDeploy.',
user='<|im_start|>user\n',
assistant='<|im_start|>assistant\n',
eosys='<|im_end|>\n',
eoh='<|im_end|>\n',
eoa='<|im_end|>',
separator='\n',
stop_words=['<|im_end|>', '<|action_end|>']):
super().__init__(system=system,
meta_instruction=meta_instruction,
eosys=eosys,
user=user,
eoh=eoh,
assistant=assistant,
eoa=eoa,
separator=separator,
stop_words=stop_words)
from lmdeploy import ChatTemplateConfig, pipeline
messages = [{'role': 'user', 'content': 'who are you?'}]
pipe = pipeline('internlm/internlm2-chat-7b',
chat_template_config=ChatTemplateConfig('customized_model'))
for response in pipe.stream_infer(messages):
print(response.text, end='')
```
In this example, we register an LMDeploy chat template whose meta instruction says the model was developed by LMDeploy, so when the user asks who the model is, it will answer that it was created by LMDeploy.
# How to debug Turbomind
Turbomind is implemented in C++, which is not as easy to debug as Python. This document provides basic methods for debugging Turbomind.
## Prerequisite
First, complete the local compilation according to the commands in [Build in localhost](../build.md).
## Configure Python debug environment
Since many large companies still use CentOS 7 in their online production environments, we will use CentOS 7 as an example to illustrate the process.
### Obtain `glibc` and `python3` versions
```bash
rpm -qa | grep glibc
rpm -qa | grep python3
```
The result should be similar to this:
```
[username@hostname workdir]# rpm -qa | grep glibc
glibc-2.17-325.el7_9.x86_64
glibc-common-2.17-325.el7_9.x86_64
glibc-headers-2.17-325.el7_9.x86_64
glibc-devel-2.17-325.el7_9.x86_64
[username@hostname workdir]# rpm -qa | grep python3
python3-pip-9.0.3-8.el7.noarch
python3-rpm-macros-3-34.el7.noarch
python3-rpm-generators-6-2.el7.noarch
python3-setuptools-39.2.0-10.el7.noarch
python3-3.6.8-21.el7_9.x86_64
python3-devel-3.6.8-21.el7_9.x86_64
python3.6.4-sre-1.el6.x86_64
```
Based on the information above, we can see that the version of `glibc` is `2.17-325.el7_9.x86_64` and the version of `python3` is `3.6.8-21.el7_9.x86_64`.
### Download and install `debuginfo` library
Download `glibc-debuginfo-common-2.17-325.el7.x86_64.rpm`, `glibc-debuginfo-2.17-325.el7.x86_64.rpm`, and `python3-debuginfo-3.6.8-21.el7.x86_64.rpm` from http://debuginfo.centos.org/7/x86_64.
```bash
rpm -ivh glibc-debuginfo-common-2.17-325.el7.x86_64.rpm
rpm -ivh glibc-debuginfo-2.17-325.el7.x86_64.rpm
rpm -ivh python3-debuginfo-3.6.8-21.el7.x86_64.rpm
```
### Upgrade GDB
```bash
sudo yum install devtoolset-10 -y
echo "source scl_source enable devtoolset-10" >> ~/.bashrc
source ~/.bashrc
```
### Verification
```bash
gdb python3
```
The output should be similar to this:
```
[username@hostname workdir]# gdb python3
GNU gdb (GDB) Red Hat Enterprise Linux 9.2-10.el7
Copyright (C) 2020 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.
Type "show copying" and "show warranty" for details.
This GDB was configured as "x86_64-redhat-linux-gnu".
Type "show configuration" for configuration details.
For bug reporting instructions, please see:
<http://www.gnu.org/software/gdb/bugs/>.
Find the GDB manual and other documentation resources online at:
<http://www.gnu.org/software/gdb/documentation/>.
For help, type "help".
Type "apropos word" to search for commands related to "word"...
Reading symbols from python3...
(gdb)
```
If it shows `Reading symbols from python3`, the configuration has been successful.
For other operating systems, please refer to [DebuggingWithGdb](https://wiki.python.org/moin/DebuggingWithGdb).
## Set up symbolic links
After setting up symbolic links, there is no need to install it locally with `pip` every time.
```bash
# Change directory to lmdeploy, e.g.
cd /workdir/lmdeploy
# Since it has been built in the build directory
# Link the lib directory
cd lmdeploy && ln -s ../build/lib . && cd ..
# (Optional) Link compile_commands.json for clangd index
ln -s build/compile_commands.json .
```
## Start debugging
````bash
# Use gdb to start the API server with Llama-2-13b-chat-hf, e.g.
gdb --args python3 -m lmdeploy serve api_server /workdir/Llama-2-13b-chat-hf
# Set directories in gdb
Reading symbols from python3...
(gdb) set directories /workdir/lmdeploy
# Set a breakpoint using the relative path, e.g.
(gdb) b src/turbomind/models/llama/BlockManager.cc:104
# When it shows
# ```
# No source file named src/turbomind/models/llama/BlockManager.cc.
# Make breakpoint pending on future shared library load? (y or [n])
# ```
# Just type `y` and press enter
# Run
(gdb) r
# (Optional) Use https://github.com/InternLM/lmdeploy/blob/main/benchmark/profile_restful_api.py to send a request
python3 profile_restful_api.py --server_addr 127.0.0.1:23333 --tokenizer_path /workdir/Llama-2-13b-chat-hf --dataset /workdir/ShareGPT_V3_unfiltered_cleaned_split.json --concurrency 1 --num_prompts 1
````
## Using GDB
Refer to [GDB Execution Commands](https://lldb.llvm.org/use/map.html) and happy debugging.
# Context length extrapolation
Long-text extrapolation refers to the ability of an LLM to handle, at inference time, input longer than the text it was trained on. The TurboMind engine now supports [LlamaDynamicNTKScalingRotaryEmbedding](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L178), and the implementation is consistent with HuggingFace.
## Usage
You can enable the context length extrapolation ability by modifying `TurbomindEngineConfig`: set `session_len` to the expected length and set `rope_scaling_factor` to a number no less than 1.0.
Here is an example:
```python
from lmdeploy import pipeline, GenerationConfig, TurbomindEngineConfig
backend_config = TurbomindEngineConfig(rope_scaling_factor=2.0, session_len=160000)
pipe = pipeline('internlm/internlm2-chat-7b', backend_config=backend_config)
prompt = 'Use a long prompt to replace this sentence'
gen_config = GenerationConfig(top_p=0.8,
top_k=40,
temperature=0.8,
max_new_tokens=1024)
response = pipe(prompt, gen_config=gen_config)
print(response)
```
## Evaluation
We use several methods to evaluate the long-context inference ability of LMDeploy, including [passkey retrieval](#passkey-retrieval), [needle in a haystack](#needle-in-a-haystack) and computing [perplexity](#perplexity).
### Passkey Retrieval
You can try the following code to test whether LMDeploy can retrieve the pass key across several rounds.
```python
import numpy as np
from lmdeploy import pipeline
from lmdeploy import TurbomindEngineConfig
session_len = 160000
backend_config = TurbomindEngineConfig(rope_scaling_factor=2.0, session_len=session_len)
pipe = pipeline('internlm/internlm2-chat-7b', backend_config=backend_config)
def passkey_retrieval(session_len, n_round=5):
# create long context input
tok = pipe.tokenizer
task_description = 'There is an important info hidden inside a lot of irrelevant text. Find it and memorize them. I will quiz you about the important information there.'
garbage = 'The grass is green. The sky is blue. The sun is yellow. Here we go. There and back again.'
for _ in range(n_round):
n_times = (session_len - 1000) // len(tok.encode(garbage))
n_garbage_prefix = np.random.randint(0, n_times)
n_garbage_suffix = n_times - n_garbage_prefix
garbage_prefix = ' '.join([garbage] * n_garbage_prefix)
garbage_suffix = ' '.join([garbage] * n_garbage_suffix)
pass_key = np.random.randint(1, 50000)
information_line = f'The pass key is {pass_key}. Remember it. {pass_key} is the pass key.' # noqa: E501
final_question = 'What is the pass key? The pass key is'
lines = [
task_description,
garbage_prefix,
information_line,
garbage_suffix,
final_question,
]
# inference
prompt = ' '.join(lines)
response = pipe([prompt])
print(pass_key, response)
passkey_retrieval(session_len, 5)
```
### Needle In A Haystack
[OpenCompass](https://github.com/open-compass/opencompass) offers very useful tools to perform needle-in-a-haystack evaluation. For specific instructions, please refer to the [guide](https://github.com/open-compass/opencompass/blob/main/docs/en/advanced_guides/needleinahaystack_eval.md).
### Perplexity
The following codes demonstrate how to use LMDeploy to calculate perplexity.
```python
from datasets import load_dataset
from lmdeploy import TurbomindEngineConfig
from lmdeploy.turbomind import TurboMind
import numpy as np
# load model and tokenizer
engine_config = TurbomindEngineConfig(rope_scaling_factor=2.0, session_len=160000)
engine = TurboMind.from_pretrained('internlm/internlm2-chat-7b', engine_config)
tokenizer = engine.tokenizer
generator = engine.create_instance()
# get perplexity
text = 'Use a long prompt to replace this sentence'
input_ids = tokenizer.encode(text)
loss = generator.get_ppl(input_ids)[0]
ppl = np.exp(loss)
```