Commit d7117b95 authored by zhouxiang

Sync code with v0.2.6

parent 5f83e392
# Copyright (c) OpenMMLab. All rights reserved.
import os
from typing import Optional
from typing import List, Literal, Optional, Union
from .archs import autoget_backend_config, get_task
from .messages import PytorchEngineConfig, TurbomindEngineConfig
from .model import ChatTemplateConfig
def pipeline(model_path: str,
model_name: Optional[str] = None,
instance_num: int = 32,
tp: int = 1,
backend_config: Optional[Union[TurbomindEngineConfig,
PytorchEngineConfig]] = None,
chat_template_config: Optional[ChatTemplateConfig] = None,
log_level='ERROR',
**kwargs):
"""
......@@ -21,38 +26,83 @@ def pipeline(model_path: str,
"InternLM/internlm-chat-20b-4bit",
"lmdeploy/llama2-chat-70b-4bit", etc.
- iii) The model_id of a model hosted inside a model repo
on huggingface.co, such as "InternLM/internlm-chat-7b",
on huggingface.co, such as "internlm/internlm-chat-7b",
"Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat"
and so on.
model_name (str): needed when model_path is a pytorch model on
huggingface.co, such as "InternLM/internlm-chat-7b",
huggingface.co, such as "internlm/internlm-chat-7b",
"Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat" and so on.
instance_num (int): the number of instances to be created
tp (int): tensor parallel
backend_config (TurbomindEngineConfig | PytorchEngineConfig): backend
config instance. Default to None.
chat_template_config (ChatTemplateConfig): chat template configuration.
Default to None.
log_level (str): set the log level; the value should be one of [CRITICAL, ERROR, WARNING, INFO, DEBUG]
Examples:
>>> # LLM
>>> import lmdeploy
>>> pipe = lmdeploy.pipeline('InternLM/internlm-chat-7b-v1_1', 'internlm-chat-7b')
>>> pipe = lmdeploy.pipeline('internlm/internlm-chat-7b')
>>> response = pipe(['hi','say this is a test'])
>>> print(response)
>>>
>>> # VLM
>>> from lmdeploy.vl import load_image
>>> from lmdeploy import pipeline, TurbomindEngineConfig, ChatTemplateConfig
>>> pipe = pipeline('liuhaotian/llava-v1.5-7b',
... backend_config=TurbomindEngineConfig(session_len=8192),
... chat_template_config=ChatTemplateConfig(model_name='vicuna'))
>>> im = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg')
>>> response = pipe([('describe this image', [im])])
>>> print(response)
""" # noqa E501
from lmdeploy.serve.async_engine import AsyncEngine
os.environ['TM_LOG_LEVEL'] = log_level
return AsyncEngine(model_path,
model_name=model_name,
instance_num=instance_num,
tp=tp,
**kwargs)
if os.getenv('TM_LOG_LEVEL') is None:
os.environ['TM_LOG_LEVEL'] = log_level
from lmdeploy.utils import get_logger
logger = get_logger('lmdeploy')
logger.setLevel(log_level)
pipeline_type, pipeline_class = get_task(model_path)
if pipeline_type == 'vlm':
assert (type(backend_config) is TurbomindEngineConfig) or \
(backend_config is None), \
f'{pipeline_type} model only support turbomind backend.'
if pipeline_type == 'llm' and type(
backend_config) is not PytorchEngineConfig:
# set auto backend mode
backend_config = autoget_backend_config(model_path, backend_config)
backend = 'pytorch' if type(
backend_config) is PytorchEngineConfig else 'turbomind'
logger.info(f'Using {backend} engine')
if 'tp' in kwargs:
logger.warning(
'The argument "tp" is deprecated and will be removed soon. '
'Please set "tp" in "backend_config"')
tp = kwargs['tp']
kwargs.pop('tp')
else:
tp = 1 if backend_config is None else backend_config.tp
return pipeline_class(model_path,
model_name=model_name,
backend=backend,
backend_config=backend_config,
chat_template_config=chat_template_config,
tp=tp,
**kwargs)
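# Usage sketch for the `pipeline` API above (the model id and config values
# are illustrative, not prescriptive): the deprecated `tp` keyword is replaced
# by setting `tp` on the engine config, matching the deprecation warning
# emitted in the function body.
def _pipeline_usage_example():
    """Illustrative sketch; never invoked by the library itself."""
    engine_cfg = TurbomindEngineConfig(tp=2, session_len=4096)
    pipe = pipeline('internlm/internlm-chat-7b', backend_config=engine_cfg)
    return pipe(['hi', 'say this is a test'])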
def serve(model_path: str,
model_name: Optional[str] = None,
backend: Literal['turbomind', 'pytorch'] = 'turbomind',
backend_config: Optional[Union[TurbomindEngineConfig,
PytorchEngineConfig]] = None,
chat_template_config: Optional[ChatTemplateConfig] = None,
server_name: str = '0.0.0.0',
server_port: int = 23333,
instance_num: int = 64,
tp: int = 1,
log_level: str = 'ERROR',
api_keys: Optional[Union[List[str], str]] = None,
ssl: bool = False,
**kwargs):
"""This will run the api_server in a subprocess.
......@@ -67,24 +117,31 @@ def serve(model_path: str,
"InternLM/internlm-chat-20b-4bit",
"lmdeploy/llama2-chat-70b-4bit", etc.
- iii) The model_id of a model hosted inside a model repo
on huggingface.co, such as "InternLM/internlm-chat-7b",
on huggingface.co, such as "internlm/internlm-chat-7b",
"Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat"
and so on.
model_name (str): needed when model_path is a pytorch model on
huggingface.co, such as "InternLM/internlm-chat-7b",
huggingface.co, such as "internlm/internlm-chat-7b",
"Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat" and so on.
backend (str): either `turbomind` or `pytorch` backend. Default to
`turbomind` backend.
backend_config (TurbomindEngineConfig | PytorchEngineConfig): backend
config instance. Default to none.
chat_template_config (ChatTemplateConfig): chat template configuration.
Default to None.
server_name (str): host ip for serving
server_port (int): server port
instance_num (int): number of instances of turbomind model
tp (int): tensor parallel
log_level (str): set the log level; the value should be one of [CRITICAL, ERROR, WARNING, INFO, DEBUG]
api_keys (List[str] | str | None): Optional list of API keys. A string is
accepted as a single api key. Default to None, which means no api key is applied.
ssl (bool): Enable SSL. Requires OS Environment variables 'SSL_KEYFILE' and 'SSL_CERTFILE'.
Return:
APIClient: A client chatbot for LLaMA series models.
Examples:
>>> import lmdeploy
>>> client = lmdeploy.serve('InternLM/internlm-chat-7b-v1_1', 'internlm-chat-7b')
>>> client = lmdeploy.serve('internlm/internlm-chat-7b', 'internlm-chat-7b')
>>> for output in client.chat('hi', 1):
... print(output)
""" # noqa E501
......@@ -93,33 +150,57 @@ def serve(model_path: str,
from lmdeploy.serve.openai.api_client import APIClient
from lmdeploy.serve.openai.api_server import serve
if type(backend_config) is not PytorchEngineConfig:
# set auto backend mode
backend_config = autoget_backend_config(model_path, backend_config)
backend = 'pytorch' if type(
backend_config) is PytorchEngineConfig else 'turbomind'
if 'tp' in kwargs:
tp = kwargs['tp']
kwargs.pop('tp')
else:
tp = 1 if backend_config is None else backend_config.tp
task = Process(target=serve,
args=(model_path, ),
kwargs=dict(model_name=model_name,
backend=backend,
backend_config=backend_config,
chat_template_config=chat_template_config,
server_name=server_name,
server_port=server_port,
instance_num=instance_num,
tp=tp,
log_level=log_level,
**kwargs))
api_keys=api_keys,
ssl=ssl,
**kwargs),
daemon=True)
task.start()
client = APIClient(f'http://{server_name}:{server_port}')
while True:
time.sleep(1)
try:
client.available_models
print(
f'Launched the api_server in process {task.pid}, user can '
f'kill the server by:\nimport os,signal\nos.kill({task.pid}, '
'signal.SIGKILL)')
return client
except: # noqa
pass
def client(api_server_url: str = 'http://0.0.0.0:23333', **kwargs):
def client(api_server_url: str = 'http://0.0.0.0:23333',
api_key: Optional[str] = None,
**kwargs):
"""
Args:
api_server_url (str): communicating address 'http://<ip>:<port>' of
api_server
api_key (str | None): api key. Default to None, which means no
api key will be used.
Return:
Chatbot for LLaMA series models with turbomind as inference engine.
"""
from lmdeploy.serve.openai.api_client import APIClient
return APIClient(api_server_url, **kwargs)
return APIClient(api_server_url, api_key, **kwargs)
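# Usage sketch tying `serve` and `client` together (model id, port and api
# key are placeholders): the api_server runs in a daemon subprocess, `serve`
# returns an APIClient once the server answers, and an equivalent client can
# later be built from any other process with `client(...)`.
def _serve_and_client_example():
    """Illustrative sketch; never invoked by the library itself."""
    api_client = serve('internlm/internlm-chat-7b',
                       server_name='0.0.0.0',
                       server_port=23333,
                       api_keys='sk-demo-key')
    same_client = client('http://0.0.0.0:23333', api_key='sk-demo-key')
    return api_client, same_client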
# Copyright (c) OpenMMLab. All rights reserved.
from .cli import run
from .entrypoint import run
__all__ = ['run']
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Optional
from .cli import CLI
from .utils import (ArgumentHelper, DefaultsAndTypesHelpFormatter,
convert_args, get_lora_adapters)
class SubCliChat(object):
"""Chat through terminal with pytorch or turbomind model."""
_help = 'Chat with pytorch or turbomind engine.'
_desc = _help
parser = CLI.subparsers.add_parser('chat', help=_help, description=_desc)
subparsers = parser.add_subparsers(
title='Commands', description='This group has the following commands:')
def torch(self,
model_path: str,
tokenizer_path: Optional[str] = None,
accel: Optional[str] = None,
max_new_tokens: int = 128,
temperature: float = 0.8,
top_p: float = 0.95,
seed: int = 0,
use_fast_tokenizer: bool = True,
max_alloc: int = 2048,
max_session_len: int = None,
log_file: Optional[str] = None,
debug: bool = False,
adapter: Optional[str] = None):
"""Chat with pytorch model through terminal.
@staticmethod
def add_parser_torch():
"""Add parser for torch command."""
parser = SubCliChat.subparsers.add_parser(
'torch',
formatter_class=DefaultsAndTypesHelpFormatter,
help=SubCliChat.torch.__doc__,
description=SubCliChat.torch.__doc__,
)
parser.set_defaults(run=SubCliChat.torch)
parser.add_argument('model_path',
type=str,
help='The huggingface model path')
# engine args
engine_group = parser.add_argument_group('Engine arguments')
ArgumentHelper.model_name(engine_group)
ArgumentHelper.tp(engine_group)
ArgumentHelper.session_len(engine_group)
ArgumentHelper.adapters(engine_group)
ArgumentHelper.cache_max_entry_count(engine_group)
Args:
model_path (str): Path to pytorch model.
tokenizer_path (str): Path to tokenizer.
accel (str): Model accelerator.
max_new_tokens (int): Maximum number of tokens to generate.
temperature (float): Temperature for sampling.
top_p (float): Top p for sampling.
seed (int): Random seed.
use_fast_tokenizer (bool): Whether to use fast tokenizer.
This argument is passed directly to transformers'
``AutoTokenizer.from_pretrained``.
Generally, users should choose fast tokenizers.
But if the fast tokenizer raises errors, try forcing a slow one.
max_alloc (int): Maximum memory to allocate (for deepspeed).
max_session_len (int): Maximum number of tokens allowed for all chat sessions.
This includes both the history and the current session.
log_file (str): Path to log file.
debug (bool): Whether to enable debug mode.
adapter (str): Force the use of a specific adapter.
Generally, users should not set this argument, because the adapter is
selected automatically based on the model type. It is only required when
automatic selection is impossible, e.g. when llama 1/2 cannot be
distinguished from the `LlamaForCausalLM` class.
Currently, only "llama1" is accepted for llama1 models.
""" # noqa: E501
from lmdeploy.pytorch.chat import main as run_torch_model
# other args
parser.add_argument('--trust-remote-code',
action='store_false',
default=True,
help='Trust remote code')
run_torch_model(model_path,
tokenizer_path=tokenizer_path,
accel=accel,
max_new_tokens=max_new_tokens,
temperature=temperature,
top_p=top_p,
seed=seed,
use_fast_tokenizer=use_fast_tokenizer,
max_alloc=max_alloc,
max_session_len=max_session_len,
log_file=log_file,
debug=debug,
adapter=adapter)
@staticmethod
def add_parser_turbomind():
"""Add parser for turbomind command."""
parser = SubCliChat.subparsers.add_parser(
'turbomind',
formatter_class=DefaultsAndTypesHelpFormatter,
help=SubCliChat.turbomind.__doc__,
description=SubCliChat.turbomind.__doc__,
)
parser.set_defaults(run=SubCliChat.turbomind)
parser.add_argument(
'model_path',
type=str,
help='The path of the deployed model. '
'It can be in huggingface or turbomind format. '
'When it is a turbomind model, all engine config '
'arguments will be ignored, so you need to modify the `config.ini`')
# engine arguments
engine_group = parser.add_argument_group('Engine arguments')
ArgumentHelper.tp(engine_group)
ArgumentHelper.model_format(engine_group)
ArgumentHelper.quant_policy(engine_group)
ArgumentHelper.model_name(engine_group)
ArgumentHelper.cache_max_entry_count(engine_group)
ArgumentHelper.rope_scaling_factor(engine_group)
ArgumentHelper.session_len(engine_group)
# other arguments
ArgumentHelper.cap(parser)
ArgumentHelper.meta_instruction(parser) # TODO remove
ArgumentHelper.chat_template(parser)
def turbomind(self,
model_path,
session_id: int = 1,
cap: str = 'chat',
tp=1,
stream_output=True,
**kwargs):
"""Chat with turbomind model through terminal.
@staticmethod
def torch(args):
"""Chat with PyTorch inference engine through terminal."""
from lmdeploy.messages import PytorchEngineConfig
from lmdeploy.pytorch.chat import run_chat
adapters = get_lora_adapters(args.adapters)
engine_config = PytorchEngineConfig(
model_name=args.model_name,
tp=args.tp,
session_len=args.session_len,
cache_max_entry_count=args.cache_max_entry_count,
adapters=adapters)
run_chat(args.model_path,
engine_config,
trust_remote_code=args.trust_remote_code)
Args:
model_path (str): the path of the deployed model
session_id (int): the unique id of a session
cap (str): the capability of a model. For example, codellama supports
the capabilities ['completion', 'infilling', 'chat', 'python']
tp (int): GPU number used in tensor parallelism
stream_output (bool): indicator for streaming output or not
**kwargs (dict): other arguments for initializing model's chat
template
"""
from lmdeploy.turbomind.chat import main as run_turbomind_model
@staticmethod
def turbomind(args):
"""Chat with TurboMind inference engine through terminal."""
from lmdeploy.turbomind.chat import main
kwargs = convert_args(args)
from lmdeploy.model import ChatTemplateConfig
chat_template_config = ChatTemplateConfig(
model_name=args.model_name,
meta_instruction=args.meta_instruction,
capability=args.cap)
if args.chat_template:
chat_template_config = ChatTemplateConfig.from_json(
args.chat_template)
kwargs.update(dict(chat_template_cfg=chat_template_config))
kwargs.pop('chat_template', None)
main(**kwargs)
run_turbomind_model(model_path,
session_id=session_id,
cap=cap,
tp=tp,
stream_output=stream_output,
**kwargs)
@staticmethod
def add_parsers():
"""Add all parsers."""
SubCliChat.add_parser_torch()
SubCliChat.add_parser_turbomind()
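# Every handler above converts the parsed argparse Namespace into keyword
# arguments with `convert_args(args)` before calling the underlying API. Its
# implementation lives in `lmdeploy.cli.utils` and is not shown in this diff;
# the helper below is only a hypothetical reading of that pattern (drop the
# CLI bookkeeping entries, forward the rest), not the actual code.
def _convert_args_sketch(args):
    """Hypothetical stand-in for convert_args: Namespace -> kwargs dict."""
    kwargs = dict(vars(args))
    for cli_only_key in ('run', 'command'):  # assumed bookkeeping entries
        kwargs.pop(cli_only_key, None)
    return kwargs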
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import os
import fire
from .chat import SubCliChat
from .lite import SubCliLite
from .serve import SubCliServe
from ..version import __version__
from .utils import ArgumentHelper, DefaultsAndTypesHelpFormatter, convert_args
class CLI(object):
"""LMDeploy Command Line Interface.
The CLI provides a unified API for converting, compressing and deploying
large language models.
"""
def convert(self,
model_name: str,
model_path: str,
model_format: str = None,
tokenizer_path: str = None,
dst_path: str = './workspace',
tp: int = 1,
quant_path: str = None,
group_size: int = 0,
**kwargs):
"""Convert LLMs to lmdeploy format.
Args:
model_name (str): The name of the to-be-deployed model, such as
llama-7b, llama-13b, vicuna-7b, etc.
model_path (str): The directory path of the model or huggingface
repo_id like 'internlm/internlm-chat-20b'
model_format (str): the format of the model, should choose from
['llama', 'hf', 'awq', None]. 'llama' stands for META's llama
format, 'hf' means huggingface llama format, and 'awq' means
llama(hf) model quantized by lmdeploy/lite/quantization/awq.py.
The default value is None, which means the model_format will be
inferred based on model_name
tokenizer_path (str): The path of tokenizer model.
dst_path (str): The destination path that saves outputs.
tp (int): The number of GPUs used for tensor parallelism, which
should be 2^n.
quant_path (str): Path of the quantized model, which can be None.
group_size (int): A parameter used in AWQ to quantize fp16 weights
to 4 bits.
kwargs (dict): other params for convert
"""
from lmdeploy.turbomind.deploy.converter import main as convert
convert(model_name,
model_path,
model_format=model_format,
tokenizer_path=tokenizer_path,
dst_path=dst_path,
tp=tp,
quant_path=quant_path,
group_size=group_size,
**kwargs)
def list(self, engine: str = 'turbomind'):
"""List supported model names.
Example 1:
lmdeploy list
Example 2:
lmdeploy list --engine pytorch
Args:
engine (str): The backend for the model to run. Choice from
['turbomind', 'pytorch'].
"""
assert engine in ['turbomind', 'pytorch']
if engine == 'pytorch':
model_names = ['llama', 'llama2', 'internlm-7b']
elif engine == 'turbomind':
from lmdeploy.model import MODELS
model_names = list(MODELS.module_dict.keys())
model_names = [n for n in model_names if n.lower() not in ['base']]
_desc = 'The CLI provides a unified API for converting, ' \
'compressing and deploying large language models.'
parser = argparse.ArgumentParser(prog='lmdeploy',
description=_desc,
add_help=True)
parser.add_argument('-v',
'--version',
action='version',
version=__version__)
subparsers = parser.add_subparsers(
title='Commands',
description='lmdeploy has following commands:',
dest='command')
@staticmethod
def add_parser_convert():
"""Add parser for convert command."""
parser = CLI.subparsers.add_parser(
'convert',
formatter_class=DefaultsAndTypesHelpFormatter,
description=CLI.convert.__doc__,
help=CLI.convert.__doc__)
# define arguments
parser.add_argument(
'model_name',
type=str,
help='The name of the to-be-deployed model, such as llama-7b, '
'llama-13b, vicuna-7b, etc. You can run `lmdeploy list` to '
'get the supported model names')
parser.add_argument('model_path',
type=str,
help='The directory path of the model')
ArgumentHelper.model_format(parser)
ArgumentHelper.tp(parser)
# other args
parser.add_argument('--tokenizer-path',
type=str,
default=None,
help='The path of tokenizer model')
parser.add_argument('--dst-path',
type=str,
default='workspace',
help='The destination path that saves outputs')
parser.add_argument(
'--quant-path',
type=str,
default=None,
help='Path of the quantized model, which can be None')
parser.add_argument(
'--group-size',
type=int,
default=0,
help='A parameter used in awq to quantize fp16 weights '
'to 4 bits')
parser.set_defaults(run=CLI.convert)
@staticmethod
def add_parser_list():
"""Add parser for list command."""
parser = CLI.subparsers.add_parser(
'list',
formatter_class=DefaultsAndTypesHelpFormatter,
description=CLI.list.__doc__,
help=CLI.list.__doc__)
parser.set_defaults(run=CLI.list)
# define arguments
ArgumentHelper.engine(parser)
@staticmethod
def add_parser_checkenv():
"""Add parser for check_env command."""
parser = CLI.subparsers.add_parser(
'check_env',
formatter_class=DefaultsAndTypesHelpFormatter,
description=CLI.check_env.__doc__,
help=CLI.check_env.__doc__)
parser.set_defaults(run=CLI.check_env)
parser.add_argument('--dump-file',
type=str,
default=None,
help='The file path to save env info. Only '
'the `json`, `yml` and `pkl` formats '
'are supported')
@staticmethod
def convert(args):
"""Convert LLMs to turbomind format."""
from lmdeploy.turbomind.deploy.converter import main
kwargs = convert_args(args)
main(**kwargs)
@staticmethod
def list(args):
"""List the supported model names."""
from lmdeploy.model import MODELS
model_names = list(MODELS.module_dict.keys())
deprecate_names = [
'baichuan-7b', 'baichuan2-7b', 'chatglm2-6b', 'internlm-chat-20b',
'internlm-chat-7b', 'internlm-chat-7b-8k', 'internlm2-1_8b',
'internlm-20b', 'internlm2-20b', 'internlm2-7b', 'internlm2-chat',
'internlm2-chat-1_8b', 'internlm2-chat-20b', 'internlm2-chat-7b',
'llama-2-chat', 'llama-2', 'qwen-14b', 'qwen-7b', 'solar-70b',
'yi-200k', 'yi-34b', 'yi-chat', 'Mistral-7B-Instruct',
'Mixtral-8x7B-Instruct', 'baichuan-base', 'deepseek-chat',
'internlm-chat'
]
model_names = [
n for n in model_names if n not in deprecate_names + ['base']
]
deprecate_names.sort()
model_names.sort()
print('Supported model names:')
print('The older chat template names like "internlm2-7b", "qwen-7b"'
' and so on are deprecated and will be removed in the future.'
' The supported chat template names are:')
print('\n'.join(model_names))
def check_env(self, dump_file: str = None):
"""Check env information.
Args:
dump_file (str): Output file to save env info.
"""
@staticmethod
def check_env(args):
"""Check the environmental information."""
import importlib
import mmengine
......@@ -121,19 +158,16 @@ class CLI(object):
print(f'{k}: {v}')
# dump to local file
dump_file = args.dump_file
if dump_file is not None:
work_dir, _ = os.path.split(dump_file)
if work_dir:
os.makedirs(work_dir, exist_ok=True)
mmengine.dump(env_info, dump_file)
def run():
"""The entry point of running LMDeploy CLI."""
cli = CLI()
cli.lite = SubCliLite()
cli.chat = SubCliChat()
cli.serve = SubCliServe()
fire.Fire(cli, name='lmdeploy')
@staticmethod
def add_parsers():
"""Add all parsers."""
CLI.add_parser_convert()
CLI.add_parser_list()
CLI.add_parser_checkenv()
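# The sub-commands above register their handlers via
# `parser.set_defaults(run=...)`, so the CLI entrypoint only needs to parse
# the arguments and invoke the stored handler. The sketch below illustrates
# that dispatch pattern; it is a hedged example, not the actual
# `lmdeploy.cli.entrypoint` implementation, and registers only CLI's own
# sub-parsers.
def _dispatch_example(argv=None):
    """Illustrative sketch of the set_defaults(run=...) dispatch pattern."""
    CLI.add_parsers()
    args = CLI.parser.parse_args(argv)
    if hasattr(args, 'run'):
        return args.run(args)
    return CLI.parser.print_help()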
# Copyright (c) OpenMMLab. All rights reserved.
from mmengine.config import DictAction
from .cli import CLI
from .utils import ArgumentHelper, DefaultsAndTypesHelpFormatter, convert_args
class SubCliLite(object):
"""CLI for compressing LLMs."""
_help = 'Compressing and accelerating LLMs with lmdeploy.lite module'
_desc = _help
parser = CLI.subparsers.add_parser(
'lite',
help=_help,
description=_desc,
)
subparsers = parser.add_subparsers(
title='Commands', description='This group has the following commands:')
def auto_awq(self,
model: str,
work_dir: str,
w_bits: int = 4,
w_sym: bool = False,
w_group_size: int = 128,
device: str = 'cuda'):
"""Perform weight quantization using AWQ algorithm.
@staticmethod
def add_parser_auto_awq():
"""Add parser for auto_awq command."""
parser = SubCliLite.subparsers.add_parser(
'auto_awq',
formatter_class=DefaultsAndTypesHelpFormatter,
description=SubCliLite.auto_awq.__doc__,
help=SubCliLite.auto_awq.__doc__)
parser.set_defaults(run=SubCliLite.auto_awq)
parser.add_argument('model',
type=str,
help='The path of model in hf format')
ArgumentHelper.work_dir(parser)
ArgumentHelper.calib_dataset(parser)
ArgumentHelper.calib_samples(parser)
ArgumentHelper.calib_seqlen(parser)
ArgumentHelper.device(parser)
parser.add_argument('--w-bits',
type=int,
default=4,
help='Bit number for weight quantization')
parser.add_argument('--w-sym',
action='store_true',
help='Whether to do symmetric quantization')
parser.add_argument(
'--w-group-size',
type=int,
default=128,
help='Group size for weight quantization statistics')
Args:
model (str): The path of model in hf format.
work_dir (str): The working directory to save results.
w_bits (int): Bit number for weight quantization.
w_sym (bool): Whether to do symmetric quantization.
w_group_size (int): Group size for weight quantization statistics.
device (str): Device type of running.
"""
from lmdeploy.lite.apis.auto_awq import auto_awq
@staticmethod
def add_parser_calibrate():
"""Add parser for calibrate command."""
parser = SubCliLite.subparsers.add_parser(
'calibrate',
formatter_class=DefaultsAndTypesHelpFormatter,
description=SubCliLite.calibrate.__doc__,
help=SubCliLite.calibrate.__doc__)
parser.set_defaults(run=SubCliLite.calibrate)
parser.add_argument('model',
type=str,
help='The name or path of the model to be loaded')
ArgumentHelper.work_dir(parser)
ArgumentHelper.calib_dataset(parser)
ArgumentHelper.calib_samples(parser)
ArgumentHelper.calib_seqlen(parser)
ArgumentHelper.device(parser)
auto_awq(model,
work_dir,
w_bits=w_bits,
w_sym=w_sym,
w_group_size=w_group_size,
device=device)
@staticmethod
def add_parser_smooth_quant():
"""Add parser for smooth_quant command."""
parser = SubCliLite.subparsers.add_parser(
'smooth_quant',
formatter_class=DefaultsAndTypesHelpFormatter,
description=SubCliLite.smooth_quant.__doc__,
help=SubCliLite.smooth_quant.__doc__)
parser.set_defaults(run=SubCliLite.smooth_quant)
parser.add_argument('model',
type=str,
help='The name or path of the model to be loaded')
parser.add_argument(
'--work-dir',
type=str,
default='./work_dir',
help='The working directory for outputs. defaults to "./work_dir"')
ArgumentHelper.calib_dataset(parser)
ArgumentHelper.calib_samples(parser)
ArgumentHelper.calib_seqlen(parser)
ArgumentHelper.device(parser)
def calibrate(self,
model: str,
calib_dataset: str = 'c4',
calib_samples: int = 128,
calib_seqlen: int = 2048,
work_dir: str = './work_dir',
device: str = 'cuda') -> None:
"""Perform calibration on a given dataset.
@staticmethod
def add_parser_kv_qparams():
"""Add parser for kv_qparams command."""
parser = SubCliLite.subparsers.add_parser(
'kv_qparams',
formatter_class=DefaultsAndTypesHelpFormatter,
description=SubCliLite.kv_qparams.__doc__,
help=SubCliLite.kv_qparams.__doc__)
parser.set_defaults(run=SubCliLite.kv_qparams)
Args:
model (str): The model to be loaded.
calib_dataset (str, optional): The calibration dataset name.
Defaults to 'c4'.
calib_samples (int, optional): The number of samples for
calibration. Defaults to 128.
calib_seqlen (int, optional): The sequence length for calibration.
Defaults to 2048.
work_dir (str): The working directory for outputs.
Defaults to './work_dir'.
device (str, optional): The device to be used for calculation.
Defaults to 'cuda'.
"""
from lmdeploy.lite.apis.calibrate import calibrate
parser.add_argument('work_dir',
type=str,
help='Directory path where the stats are saved')
parser.add_argument('turbomind_dir',
type=str,
help='Directory path where to save the results')
parser.add_argument('--kv-bits',
type=int,
default=8,
help='Number of bits for quantization')
parser.add_argument('--kv-sym',
action='store_true',
help='Whether to use symmetric quantization')
parser.add_argument(
'--num-tp',
type=int,
default=None,
help='GPU number used in tensor parallelism. Should be 2^n')
parser.add_argument('--tm-params',
nargs='*',
default=None,
action=DictAction,
help='Key-value pairs in xxx=yyy format used'
' to update the turbomind model weights'
' config')
calibrate(model,
calib_dataset=calib_dataset,
calib_samples=calib_samples,
calib_seqlen=calib_seqlen,
work_dir=work_dir,
device=device)
@staticmethod
def auto_awq(args):
"""Perform weight quantization using AWQ algorithm."""
from lmdeploy.lite.apis.auto_awq import auto_awq
kwargs = convert_args(args)
auto_awq(**kwargs)
def kv_qparams(self,
work_dir: str,
turbomind_dir: str,
kv_bits: int = 8,
kv_sym: bool = False,
num_tp: int = 1) -> None:
"""Export key and value stats.
@staticmethod
def calibrate(args):
"""Perform calibration on a given dataset."""
from lmdeploy.lite.apis.calibrate import calibrate
kwargs = convert_args(args)
calibrate(**kwargs)
Args:
work_dir (str): Directory path where the stats
are saved.
turbomind_dir (str): Directory path where to
save the results.
kv_bits (int, optional): Number of bits for quantization.
Defaults to 8.
kv_sym (bool, optional): Whether to use symmetric quantization.
Defaults to False.
num_tp (int, optional): Number of tensor parallelism.
Defaults to 1.
"""
@staticmethod
def kv_qparams(args):
"""Export key and value stats."""
from lmdeploy.lite.apis.kv_qparams import main as run_kv_qparams
kwargs = convert_args(args)
run_kv_qparams(**kwargs)
run_kv_qparams(work_dir,
turbomind_dir,
kv_bits=kv_bits,
kv_sym=kv_sym,
num_tp=num_tp)
def get_small_sharded_hf(self, src_dir: str, dst_dir: str):
"""Convert a hugging face model to the smallest sharded one.
@staticmethod
def smooth_quant(args):
"""Perform w8a8 quantization using SmoothQuant."""
from lmdeploy.lite.apis.smooth_quant import smooth_quant
kwargs = convert_args(args)
smooth_quant(**kwargs)
Args:
src_dir (str): The directory of the input HF model.
dst_dir (str): The directory to save new model.
"""
from lmdeploy.lite.apis.get_small_sharded_hf import main as run_sharded
run_sharded(src_dir, dst_dir)
@staticmethod
def add_parsers():
"""Add all parsers."""
SubCliLite.add_parser_auto_awq()
SubCliLite.add_parser_calibrate()
SubCliLite.add_parser_kv_qparams()
SubCliLite.add_parser_smooth_quant()
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Optional
from .cli import CLI
from .utils import ArgumentHelper, DefaultsAndTypesHelpFormatter, convert_args
class SubCliServe(object):
class SubCliServe:
"""Serve LLMs and interact on terminal or web UI."""
_help = 'Serve LLMs with gradio, openai API or triton server.'
_desc = _help
parser = CLI.subparsers.add_parser(
'serve',
help=_help,
description=_desc,
)
subparsers = parser.add_subparsers(
title='Commands', description='This group has the following commands:')
def gradio(self,
model_path_or_server: str,
server_name: str = '0.0.0.0',
server_port: int = 6006,
batch_size: int = 32,
tp: int = 1,
**kwargs):
"""Serve LLMs with web ui using gradio.
Example 1:
lmdeploy serve gradio ./workspace
Example 2:
lmdeploy serve gradio http://0.0.0.0:23333
--server_name 0.0.0.0
--server_port 6006
Example 3:
lmdeploy serve gradio ${triton_server_ip_addresss}:33337
Args:
model_path_or_server (str): the path of the deployed model or the
tritonserver URL or restful api URL. The former serves the model
directly with gradio, while the latter makes gradio a front end to an
already running tritonserver or restful api server.
server_name (str): the ip address of gradio server
server_port (int): the port of gradio server
batch_size (int): batch size for running Turbomind directly
tp (int): tensor parallel for Turbomind
kwargs (dict): extra params to init
"""
@staticmethod
def add_parser_gradio():
"""Add parser for gradio command."""
parser = SubCliServe.subparsers.add_parser(
'gradio',
formatter_class=DefaultsAndTypesHelpFormatter,
description=SubCliServe.gradio.__doc__,
help=SubCliServe.gradio.__doc__)
parser.set_defaults(run=SubCliServe.gradio)
parser.add_argument(
'model_path_or_server',
type=str,
help='The path of the deployed model or the tritonserver url or '
'restful api url. For example: - ./workspace - 0.0.0.0:23333'
' - http://0.0.0.0:23333')
parser.add_argument('--server-name',
type=str,
default='0.0.0.0',
help='The ip address of gradio server')
parser.add_argument('--server-port',
type=int,
default=6006,
help='The port of gradio server')
# common args
ArgumentHelper.backend(parser)
# chat template args
ArgumentHelper.meta_instruction(parser) # TODO remove
ArgumentHelper.chat_template(parser)
ArgumentHelper.cap(parser)
# pytorch engine args
pt_group = parser.add_argument_group('PyTorch engine arguments')
# common engine args
tp_act = ArgumentHelper.tp(pt_group)
model_name_act = ArgumentHelper.model_name(pt_group)
session_len_act = ArgumentHelper.session_len(pt_group)
max_batch_size_act = ArgumentHelper.max_batch_size(pt_group)
cache_max_entry_act = ArgumentHelper.cache_max_entry_count(pt_group)
# turbomind args
tb_group = parser.add_argument_group('TurboMind engine arguments')
# common engine args
tb_group._group_actions.append(tp_act)
tb_group._group_actions.append(model_name_act)
tb_group._group_actions.append(session_len_act)
tb_group._group_actions.append(max_batch_size_act)
tb_group._group_actions.append(cache_max_entry_act)
ArgumentHelper.model_format(tb_group)
ArgumentHelper.quant_policy(tb_group)
ArgumentHelper.rope_scaling_factor(tb_group)
@staticmethod
def add_parser_api_server():
"""Add parser for api_server command."""
parser = SubCliServe.subparsers.add_parser(
'api_server',
formatter_class=DefaultsAndTypesHelpFormatter,
description=SubCliServe.api_server.__doc__,
help=SubCliServe.api_server.__doc__)
parser.set_defaults(run=SubCliServe.api_server)
parser.add_argument(
'model_path',
type=str,
help='The path of a model. It could be one of the following '
'options: - i) a local directory path of a turbomind model'
' which is converted by `lmdeploy convert` command or '
'download from ii) and iii). - ii) the model_id of a '
'lmdeploy-quantized model hosted inside a model repo on '
'huggingface.co, such as "internlm/internlm-chat-20b-4bit",'
' "lmdeploy/llama2-chat-70b-4bit", etc. - iii) the model_id'
' of a model hosted inside a model repo on huggingface.co,'
' such as "internlm/internlm-chat-7b", "qwen/qwen-7b-chat "'
', "baichuan-inc/baichuan2-7b-chat" and so on')
parser.add_argument('--server-name',
type=str,
default='0.0.0.0',
help='Host ip for serving')
parser.add_argument('--server-port',
type=int,
default=23333,
help='Server port')
parser.add_argument('--allow-origins',
nargs='+',
type=str,
default=['*'],
help='A list of allowed origins for cors')
parser.add_argument('--allow-credentials',
action='store_true',
help='Whether to allow credentials for cors')
parser.add_argument('--allow-methods',
nargs='+',
type=str,
default=['*'],
help='A list of allowed http methods for cors')
parser.add_argument('--allow-headers',
nargs='+',
type=str,
default=['*'],
help='A list of allowed http headers for cors')
parser.add_argument('--qos-config-path',
type=str,
default='',
help='Qos policy config path')
# common args
ArgumentHelper.backend(parser)
ArgumentHelper.log_level(parser)
ArgumentHelper.api_keys(parser)
ArgumentHelper.ssl(parser)
# chat template args
ArgumentHelper.meta_instruction(parser) # TODO remove
ArgumentHelper.chat_template(parser)
ArgumentHelper.cap(parser)
# pytorch engine args
pt_group = parser.add_argument_group('PyTorch engine arguments')
# common engine args
tp_act = ArgumentHelper.tp(pt_group)
model_name_act = ArgumentHelper.model_name(pt_group)
session_len_act = ArgumentHelper.session_len(pt_group)
max_batch_size_act = ArgumentHelper.max_batch_size(pt_group)
cache_max_entry_act = ArgumentHelper.cache_max_entry_count(pt_group)
# turbomind args
tb_group = parser.add_argument_group('TurboMind engine arguments')
# common engine args
tb_group._group_actions.append(tp_act)
tb_group._group_actions.append(model_name_act)
tb_group._group_actions.append(session_len_act)
tb_group._group_actions.append(max_batch_size_act)
tb_group._group_actions.append(cache_max_entry_act)
ArgumentHelper.model_format(tb_group)
ArgumentHelper.quant_policy(tb_group)
ArgumentHelper.rope_scaling_factor(tb_group)
@staticmethod
def add_parser_api_client():
"""Add parser for api_client command."""
parser = SubCliServe.subparsers.add_parser(
'api_client',
formatter_class=DefaultsAndTypesHelpFormatter,
description=SubCliServe.api_client.__doc__,
help=SubCliServe.api_client.__doc__)
parser.set_defaults(run=SubCliServe.api_client)
parser.add_argument('api_server_url',
type=str,
help='The URL of api server')
parser.add_argument('--api-key',
type=str,
default=None,
help='api key. Default to None, which means no '
'api key will be used')
ArgumentHelper.session_id(parser)
@staticmethod
def add_parser_triton_client():
"""Add parser for triton_client command."""
parser = SubCliServe.subparsers.add_parser(
'triton_client',
formatter_class=DefaultsAndTypesHelpFormatter,
description=SubCliServe.triton_client.__doc__,
help=SubCliServe.triton_client.__doc__)
parser.set_defaults(run=SubCliServe.triton_client)
parser.add_argument(
'tritonserver_addr',
type=str,
help='The address in format "ip:port" of triton inference server')
ArgumentHelper.session_id(parser)
ArgumentHelper.cap(parser)
ArgumentHelper.stream_output(parser)
@staticmethod
def gradio(args):
"""Serve LLMs with web UI using gradio."""
from lmdeploy.archs import autoget_backend
from lmdeploy.messages import (PytorchEngineConfig,
TurbomindEngineConfig)
from lmdeploy.model import ChatTemplateConfig
from lmdeploy.serve.gradio.app import run
run(model_path_or_server,
server_name=server_name,
server_port=server_port,
batch_size=batch_size,
tp=tp,
**kwargs)
def api_server(self,
model_path: str,
model_name: Optional[str] = None,
server_name: str = '0.0.0.0',
server_port: int = 23333,
instance_num: int = 64,
tp: int = 1,
allow_origins: List[str] = ['*'],
allow_credentials: bool = True,
allow_methods: List[str] = ['*'],
allow_headers: List[str] = ['*'],
**kwargs):
"""Serve LLMs with restful api using fastapi.
Args:
model_path (str): the path of a model.
It could be one of the following options:
- i) A local directory path of a turbomind model which is
converted by `lmdeploy convert` command or
download from ii) and iii).
- ii) The model_id of a lmdeploy-quantized model hosted
inside a model repo on huggingface.co, such as
"InternLM/internlm-chat-20b-4bit",
"lmdeploy/llama2-chat-70b-4bit", etc.
- iii) The model_id of a model hosted inside a model repo
on huggingface.co, such as "InternLM/internlm-chat-7b",
"Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat"
and so on.
model_name (str): needed when model_path is a pytorch model on
huggingface.co, such as "InternLM/internlm-chat-7b"
server_name (str): host ip for serving
server_port (int): server port
instance_num (int): number of instances of turbomind model
tp (int): tensor parallel
allow_origins (List[str]): a list of allowed origins for CORS
allow_credentials (bool): whether to allow credentials for CORS
allow_methods (List[str]): a list of allowed HTTP methods for CORS
allow_headers (List[str]): a list of allowed HTTP headers for CORS
kwargs (dict): extra params to init api server
"""
backend = args.backend
if backend != 'pytorch' and ':' not in args.model_path_or_server:
# set auto backend mode
backend = autoget_backend(args.model_path_or_server)
if backend == 'pytorch':
backend_config = PytorchEngineConfig(
tp=args.tp,
model_name=args.model_name,
max_batch_size=args.max_batch_size,
cache_max_entry_count=args.cache_max_entry_count,
session_len=args.session_len)
else:
backend_config = TurbomindEngineConfig(
model_name=args.model_name,
tp=args.tp,
max_batch_size=args.max_batch_size,
session_len=args.session_len,
model_format=args.model_format,
quant_policy=args.quant_policy,
rope_scaling_factor=args.rope_scaling_factor,
cache_max_entry_count=args.cache_max_entry_count)
chat_template_config = ChatTemplateConfig(
model_name=args.model_name,
meta_instruction=args.meta_instruction,
capability=args.cap)
if args.chat_template:
chat_template_config = ChatTemplateConfig.from_json(
args.chat_template)
run(args.model_path_or_server,
server_name=args.server_name,
server_port=args.server_port,
backend=backend,
backend_config=backend_config,
chat_template_config=chat_template_config)
@staticmethod
def api_server(args):
"""Serve LLMs with restful api using fastapi."""
from lmdeploy.archs import autoget_backend
from lmdeploy.model import ChatTemplateConfig
from lmdeploy.serve.openai.api_server import serve as run_api_server
backend = args.backend
if backend != 'pytorch':
# set auto backend mode
backend = autoget_backend(args.model_path)
if backend == 'pytorch':
from lmdeploy.messages import PytorchEngineConfig
backend_config = PytorchEngineConfig(
tp=args.tp,
model_name=args.model_name,
max_batch_size=args.max_batch_size,
cache_max_entry_count=args.cache_max_entry_count,
session_len=args.session_len)
else:
from lmdeploy.messages import TurbomindEngineConfig
backend_config = TurbomindEngineConfig(
model_name=args.model_name,
tp=args.tp,
max_batch_size=args.max_batch_size,
session_len=args.session_len,
model_format=args.model_format,
quant_policy=args.quant_policy,
rope_scaling_factor=args.rope_scaling_factor,
cache_max_entry_count=args.cache_max_entry_count)
chat_template_config = ChatTemplateConfig(
model_name=args.model_name,
meta_instruction=args.meta_instruction,
capability=args.cap)
if args.chat_template:
chat_template_config = ChatTemplateConfig.from_json(
args.chat_template)
run_api_server(args.model_path,
backend=backend,
backend_config=backend_config,
chat_template_config=chat_template_config,
server_name=args.server_name,
server_port=args.server_port,
allow_origins=args.allow_origins,
allow_credentials=args.allow_credentials,
allow_methods=args.allow_methods,
allow_headers=args.allow_headers,
log_level=args.log_level.upper(),
api_keys=args.api_keys,
ssl=args.ssl,
qos_config_path=args.qos_config_path)
run_api_server(model_path,
model_name=model_name,
server_name=server_name,
server_port=server_port,
instance_num=instance_num,
tp=tp,
allow_origins=allow_origins,
allow_credentials=allow_credentials,
allow_methods=allow_methods,
allow_headers=allow_headers,
**kwargs)
def api_client(self, restful_api_url: str, session_id: int = 0):
"""Interact with restful api server in terminal.
Args:
restful_api_url: The restful api URL.
session_id: The unique id of a session.
"""
@staticmethod
def api_client(args):
"""Interact with restful api server in terminal."""
from lmdeploy.serve.openai.api_client import main as run_api_client
run_api_client(restful_api_url, session_id=session_id)
def triton_client(self,
tritonserver_addr: str,
session_id: int = 1,
cap: str = 'chat',
stream_output: bool = True,
**kwargs):
"""Interact with Triton Server using gRPC protocol.
Args:
tritonserver_addr (str): the address in format "ip:port" of
triton inference server
session_id (int): the unique id of a session
cap (str): the capability of a model. For example, codellama
supports the capabilities ['completion', 'infill', 'instruct',
'python']
stream_output (bool): indicator for streaming output or not
**kwargs (dict): other arguments for initializing model's
chat template
"""
kwargs = convert_args(args)
run_api_client(**kwargs)
@staticmethod
def triton_client(args):
"""Interact with Triton Server using gRPC protocol."""
from lmdeploy.serve.client import main as run_triton_client
kwargs = convert_args(args)
run_triton_client(**kwargs)
run_triton_client(
tritonserver_addr,
session_id=session_id,
cap=cap,
stream_output=stream_output,
**kwargs,
)
@staticmethod
def add_parsers():
SubCliServe.add_parser_gradio()
SubCliServe.add_parser_api_server()
SubCliServe.add_parser_api_client()
SubCliServe.add_parser_triton_client()
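# The gradio and api_server handlers above pick the engine automatically:
# unless the user forces `--backend pytorch`, `autoget_backend(model_path)`
# decides between turbomind and pytorch, and the matching engine config is
# built. A condensed sketch of that decision follows; the model path and the
# default tp value are placeholders.
def _backend_selection_example(model_path='internlm/internlm-chat-7b',
                               requested_backend='turbomind'):
    """Illustrative sketch of the auto-backend branch used above."""
    from lmdeploy.archs import autoget_backend
    from lmdeploy.messages import PytorchEngineConfig, TurbomindEngineConfig
    backend = requested_backend
    if backend != 'pytorch':
        backend = autoget_backend(model_path)
    if backend == 'pytorch':
        return backend, PytorchEngineConfig(tp=1)
    return backend, TurbomindEngineConfig(tp=1)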
# Copyright (c) OpenMMLab. All rights reserved.
from pathlib import Path
import torch
from torch import nn
from transformers import AutoTokenizer
from lmdeploy.lite.quantization.awq import (FC_FCS_MAP, NORM_FCS_MAP,
quant_weights, smooth_layers)
from lmdeploy.lite.utils import collect_target_modules, load_hf_from_pretrained
from lmdeploy.lite.utils import collect_target_modules
from .calibrate import calibrate
# from lmdeploy.lite.utils.export_turbomind import export_turbomind_config
LAYER_TYPE_MAP = {
'InternLMForCausalLM': 'InternLMDecoderLayer',
'InternLM2ForCausalLM': 'InternLM2DecoderLayer',
'QWenLMHeadModel': 'QWenBlock',
'BaiChuanForCausalLM': 'DecoderLayer', # Baichuan 7B
'BaichuanForCausalLM': 'DecoderLayer', # Baichuan2 7B
......@@ -21,6 +21,7 @@ LAYER_TYPE_MAP = {
}
NORM_TYPE_MAP = {
'InternLMForCausalLM': 'InternLMRMSNorm',
'InternLM2ForCausalLM': 'InternLM2RMSNorm',
'QWenLMHeadModel': 'RMSNorm',
'BaiChuanForCausalLM': 'RMSNorm', # Baichuan 7B
'BaichuanForCausalLM': 'RMSNorm', # Baichuan2 7B
......@@ -29,30 +30,33 @@ NORM_TYPE_MAP = {
def auto_awq(model: str,
work_dir: str,
work_dir: str = './work_dir',
calib_dataset: str = 'ptb',
calib_samples: int = 128,
calib_seqlen: int = 2048,
w_bits: int = 4,
w_sym: bool = False,
w_group_size: int = 128,
device: str = 'cuda'):
assert model != work_dir, '$WORK_DIR and $HF_MODEL should be different'
model_path = model # noqa
# Load tokenizer and configuration
tokenizer = AutoTokenizer.from_pretrained(model,
use_fast=False,
trust_remote_code=True)
model = load_hf_from_pretrained(model,
torch_dtype=torch.float16,
trust_remote_code=True)
"""Perform weight quantization using AWQ algorithm.
Args:
model (str): The path of model in hf format.
work_dir (str): The working directory to save results.
calib_dataset (str): The calibration dataset name.
calib_samples (int): The number of samples for calibration.
calib_seqlen (int): The sequence length for calibration.
w_bits (int): Bit number for weight quantization.
w_sym (bool): Whether to do symmetric quantization.
w_group_size (int): Group size for weight quantization statistics.
device (str): Device type of running.
"""
model, tokenizer, work_dir = calibrate(model, calib_dataset, calib_samples,
calib_seqlen, work_dir, device)
layer_type = LAYER_TYPE_MAP[type(model).__name__]
fc2fcs = FC_FCS_MAP[layer_type]
norm2fcs = NORM_FCS_MAP[layer_type]
work_dir = Path(work_dir)
act_scales = torch.load(work_dir / 'inputs_stats.pth')['absmax']
layers = collect_target_modules(model, layer_type)
fcs = {}
......@@ -68,11 +72,6 @@ def auto_awq(model: str,
safe_serialization=False)
tokenizer.save_pretrained(work_dir)
# export_turbomind_config(model_name,
# model_path,
# work_dir,
# group_size=w_group_size)
if __name__ == '__main__':
import fire
......
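# Usage sketch for the `auto_awq` API above (the model path and work_dir are
# placeholders): calibration statistics are gathered first via `calibrate`,
# then the target linear layers are smoothed and quantized to `w_bits` with
# per-group scales.
def _auto_awq_usage_example():
    """Illustrative sketch; never invoked by the library itself."""
    auto_awq('internlm/internlm-chat-7b',
             work_dir='./internlm-chat-7b-4bit',
             calib_dataset='ptb',
             calib_samples=128,
             calib_seqlen=2048,
             w_bits=4,
             w_group_size=128,
             device='cuda')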
......@@ -13,19 +13,31 @@ from lmdeploy.lite.utils import (collect_target_modules, get_calib_loaders,
LAYER_TYPE_MAP = {
'InternLMForCausalLM': 'InternLMDecoderLayer',
'InternLM2ForCausalLM': 'InternLM2DecoderLayer',
'QWenLMHeadModel': 'QWenBlock',
'BaiChuanForCausalLM': 'DecoderLayer', # Baichuan 7B
'BaichuanForCausalLM': 'DecoderLayer', # Baichuan2 7B
'LlamaForCausalLM': 'LlamaDecoderLayer',
}
NORM_TYPE_MAP = {
'InternLMForCausalLM': 'InternLMRMSNorm',
'InternLM2ForCausalLM': 'InternLM2RMSNorm',
'QWenLMHeadModel': 'RMSNorm',
'BaiChuanForCausalLM': 'RMSNorm', # Baichuan 7B
'BaichuanForCausalLM': 'RMSNorm', # Baichuan2 7B
'LlamaForCausalLM': 'LlamaRMSNorm',
}
HEAD_NAME_MAP = {
'InternLMForCausalLM': 'lm_head',
'InternLM2ForCausalLM': 'output',
'QWenLMHeadModel': 'lm_head',
'BaiChuanForCausalLM': 'lm_head', # Baichuan 7B
'BaichuanForCausalLM': 'lm_head', # Baichuan2 7B
'LlamaForCausalLM': 'lm_head',
}
def _prepare_for_calibrate(model: nn.Module,
layer_type: Union[str, type],
......@@ -99,7 +111,7 @@ def _prepare_for_calibrate(model: nn.Module,
def calibrate(model: str,
calib_dataset: str = 'c4',
calib_dataset: str = 'ptb',
calib_samples: int = 128,
calib_seqlen: int = 2048,
work_dir: str = './work_dir',
......@@ -110,7 +122,7 @@ def calibrate(model: str,
Args:
model (str): The name or path of the model to be loaded.
calib_dataset (str, optional): The calibration dataset name.
Defaults to 'c4'.
Defaults to 'ptb'.
calib_samples (int, optional): The number of samples for calibration.
Defaults to 128.
calib_seqlen (int, optional): The sequence length for calibration.
......@@ -119,6 +131,11 @@ def calibrate(model: str,
Defaults to './work_dir'.
device (str, optional): The device to be used for calculation.
Defaults to 'cuda'.
Returns:
model (nn.Module): The loaded huggingface model.
tokenizer: The loaded huggingface tokenizer.
work_dir (str): The working directory for outputs.
"""
assert calib_dataset in ['c4', 'ptb', 'wikitext2', 'pileval'], \
......@@ -152,7 +169,8 @@ def calibrate(model: str,
layer_type = LAYER_TYPE_MAP[type(model).__name__]
norm_type = NORM_TYPE_MAP[type(model).__name__]
_prepare_for_calibrate(model, layer_type, 'lm_head', device)
_prepare_for_calibrate(model, layer_type,
HEAD_NAME_MAP[type(model).__name__], device)
print('Loading calibrate dataset ...')
calib_loader, _ = get_calib_loaders(calib_dataset,
......@@ -179,6 +197,8 @@ def calibrate(model: str,
work_dir.mkdir(parents=True, exist_ok=True)
calib_ctx.export(work_dir)
return model, tokenizer, work_dir
if __name__ == '__main__':
import fire
......
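# Usage sketch for `calibrate` above (model path and work_dir are
# placeholders): it now returns the loaded model, the tokenizer and the
# work_dir, and exports the activation statistics into `work_dir` (e.g.
# `inputs_stats.pth` with an 'absmax' entry, as consumed by auto_awq).
def _calibrate_usage_example():
    """Illustrative sketch; never invoked by the library itself."""
    model, tokenizer, work_dir = calibrate('internlm/internlm-chat-7b',
                                           calib_dataset='ptb',
                                           calib_samples=128,
                                           calib_seqlen=2048,
                                           work_dir='./work_dir',
                                           device='cuda')
    return model, tokenizer, work_dir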
......@@ -15,6 +15,10 @@ NORM_FCS_MAP = {
['self_attn.k_proj', 'self_attn.q_proj', 'self_attn.v_proj'],
'post_attention_layernorm': ['mlp.gate_proj', 'mlp.up_proj']
},
'InternLM2DecoderLayer': {
'attention_norm': ['attention.wqkv'],
'ffn_norm': ['feed_forward.w1', 'feed_forward.w3']
},
'QWenBlock': {
'ln_1': ['attn.c_attn'],
'ln_2': ['mlp.w1', 'mlp.w2']
......@@ -34,6 +38,9 @@ FC_FCS_MAP = {
'self_attn.v_proj': ['self_attn.o_proj'],
'mlp.up_proj': ['mlp.down_proj']
},
'InternLM2DecoderLayer': {
'feed_forward.w3': ['feed_forward.w2']
},
'QWenBlock': {
'attn.c_attn': ['attn.c_proj'],
'mlp.w1': ['mlp.c_proj']
......@@ -71,6 +78,13 @@ def smooth_ln_fcs(ln: torch.nn.Module,
:return: Scales
"""
device, dtype = fcs[0].weight.device, fcs[0].weight.dtype
# If zeros exist within the weight of the layer norm, it becomes
# unnecessary to perform smooth quantization at the positions where
# these zeros occur.
zero_positions = (ln.weight == 0).nonzero(as_tuple=True)[0]
nonzero_positions = (ln.weight != 0).nonzero(as_tuple=True)[0]
act_scales = act_scales.to(device=device, dtype=dtype)
concat_w = torch.cat([fc.weight for fc in fcs], dim=0)
......@@ -78,7 +92,11 @@ def smooth_ln_fcs(ln: torch.nn.Module,
scales = (act_scales.pow(alpha) /
w_scales.pow(1 - alpha)).to(device).to(dtype)
scales = scales / (scales.max() * scales.min()).sqrt()
scales = scales / (scales[nonzero_positions].max() *
scales[nonzero_positions].min()).sqrt()
scales[zero_positions] = 1
ln.weight.div_(scales)
if hasattr(ln, 'bias'):
......@@ -182,8 +200,8 @@ def check_awq_supported(layer_type):
def quant_weights(model, fcs, bits, symmetry, group_size=-1, device='cuda'):
"""Quantize the weights of the target model's linear layers."""
from lmdeploy.legacy.pytorch.modules import WeightOnlyQLinear
from lmdeploy.lite.quantization import WeightQuantizer
from lmdeploy.pytorch.modules import WeightOnlyQLinear
for name, fc in fcs.items():
fc.to(device)
quantizer = WeightQuantizer(bits, symmetry, 'per_group', group_size)
......
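# Condensed numeric sketch of the zero-aware scale normalization introduced in
# `smooth_ln_fcs` above: positions where the layer-norm weight is exactly zero
# are excluded from the max/min normalization and their scale is pinned to 1.
# The tensors below are toy values, not real calibration statistics.
def _smooth_scale_sketch():
    """Illustrative sketch of the scale computation with zero masking."""
    import torch
    ln_weight = torch.tensor([1.0, 0.0, 0.5, 2.0])
    act_scales = torch.tensor([4.0, 3.0, 2.0, 1.0])
    w_scales = torch.tensor([1.0, 1.0, 1.0, 1.0])
    alpha = 0.5
    zero_positions = (ln_weight == 0).nonzero(as_tuple=True)[0]
    nonzero_positions = (ln_weight != 0).nonzero(as_tuple=True)[0]
    scales = act_scales.pow(alpha) / w_scales.pow(1 - alpha)
    scales = scales / (scales[nonzero_positions].max() *
                       scales[nonzero_positions].min()).sqrt()
    scales[zero_positions] = 1
    return scales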
......@@ -3,6 +3,8 @@ from functools import partial
from typing import Union
import torch
import transformers
from mmengine import digit_version
from torch import nn
from transformers import PreTrainedTokenizer
......@@ -53,7 +55,6 @@ class CalibrationContext():
self.num_kv_heads = num_kv_heads
self.head_dim = model.config.hidden_size // num_attn_heads
self.model = model
del self.model.lm_head
self.tokenizer = tokenizer
......@@ -163,12 +164,36 @@ class CalibrationContext():
if k_obs and v_obs:
batch_kwargs[i]['use_cache'] = True
out = self._ori_forwards[mod](*batch_args[i],
**batch_kwargs[i])
out = list(out)
key, value = out.pop(-1)
k_obs.observe(key)
v_obs.observe(value)
version = digit_version(transformers.__version__)
use_new_cache = type(mod).__name__ == 'LlamaDecoderLayer'
if version > digit_version('4.36.0') and use_new_cache:
from transformers.cache_utils import DynamicCache
batch_kwargs[i]['past_key_value'] = DynamicCache()
ori_idx = mod.self_attn.layer_idx
mod.self_attn.layer_idx = 0
out = self._ori_forwards[mod](*batch_args[i],
**batch_kwargs[i])
mod.self_attn.layer_idx = ori_idx
out = list(out)
cache = out.pop(-1)
key = cache.key_cache.pop(-1)
value = cache.value_cache.pop(-1)
k_obs.observe(key)
v_obs.observe(value)
else:
out = self._ori_forwards[mod](*batch_args[i],
**batch_kwargs[i])
out = list(out)
key, value = out.pop(-1)
k_obs.observe(key)
v_obs.observe(value)
del key, value
torch.cuda.empty_cache()
......
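# Condensed sketch of the branch added above: newer transformers (> 4.36)
# return K/V through a DynamicCache object instead of a (key, value) tuple,
# so the calibration context reads the last entry of key_cache/value_cache.
# The helper below only illustrates that unwrapping; `out` is assumed to be a
# decoder-layer output whose last element is either a populated
# transformers.cache_utils.DynamicCache or a legacy (key, value) tuple.
def _unwrap_kv_sketch(out, use_new_cache):
    """Illustrative sketch of how key/value tensors are recovered above."""
    out = list(out)
    if use_new_cache:
        cache = out.pop(-1)          # DynamicCache appended by the layer
        key = cache.key_cache.pop(-1)
        value = cache.value_cache.pop(-1)
    else:
        key, value = out.pop(-1)     # legacy (key, value) tuple
    return key, value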
......@@ -3,7 +3,7 @@
import torch
from transformers import AutoConfig, AutoModelForCausalLM
from lmdeploy.pytorch.model import LoadWoInit
from lmdeploy.pytorch.accel import LoadNoInit
def load_hf_from_pretrained(pretrained_model_name_or_path,
......@@ -26,7 +26,7 @@ def load_hf_from_pretrained(pretrained_model_name_or_path,
elif dtype == torch.bfloat16:
hf_config.bf16 = True
with LoadWoInit():
with LoadNoInit():
# Load model
model = AutoModelForCausalLM.from_pretrained(
pretrained_model_name_or_path, config=hf_config, **kwargs)
......
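# Usage sketch of the renamed context manager above (the model id is a
# placeholder): `LoadNoInit` from `lmdeploy.pytorch.accel` replaces the old
# `LoadWoInit`; loading inside the context presumably skips the costly random
# weight initialization before the checkpoint weights are assigned.
def _load_no_init_example():
    """Illustrative sketch mirroring the `with LoadNoInit():` usage above."""
    from lmdeploy.pytorch.accel import LoadNoInit
    from transformers import AutoModelForCausalLM
    with LoadNoInit():
        model = AutoModelForCausalLM.from_pretrained(
            'internlm/internlm-chat-7b', trust_remote_code=True)
    return model.eval()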
# Copyright (c) OpenMMLab. All rights reserved.
import dataclasses
import json
from abc import abstractmethod
from typing import List
from typing import List, Literal, Optional
from mmengine import Registry
from lmdeploy.utils import get_logger
logger = get_logger('lmdeploy')
MODELS = Registry('model', locations=['lmdeploy.model'])
@dataclasses.dataclass
class SamplingParam:
top_p: float = 0.8
top_k: float = None
temperature: float = 0.8
repetition_penalty: float = 1.0
class ChatTemplateConfig:
"""Parameters for chat template.
Args:
model_name (str): the name of the deployed model. It determines which chat template will be applied.
Run `lmdeploy list` to show all the chat template names.
system (str | None): begin of the system prompt
meta_instruction (str | None): system prompt
eosys (str | None): end of the system prompt
user (str | None): begin of the user prompt
eoh (str | None): end of the user prompt
assistant (str | None): begin of the assistant prompt
eoa (str | None): end of the assistant prompt
capability: ('completion' | 'infilling' | 'chat' | 'python') = None
""" # noqa: E501
model_name: str
system: Optional[str] = None
meta_instruction: Optional[str] = None
eosys: Optional[str] = None
user: Optional[str] = None
eoh: Optional[str] = None
assistant: Optional[str] = None
eoa: Optional[str] = None
separator: Optional[str] = None
capability: Optional[Literal['completion', 'infilling', 'chat',
'python']] = None
stop_words: Optional[List[str]] = None
@property
def chat_template(self):
attrs = {
key: value
for key, value in dataclasses.asdict(self).items()
if value is not None
}
attrs.pop('model_name', None)
if self.model_name in MODELS.module_dict.keys():
model: BaseModel = MODELS.get(self.model_name)(**attrs)
else:
logger.warning(
f'Could not find {self.model_name} in registered models. '
f'Registering {self.model_name} using the BaseChatTemplate.')
model = BaseChatTemplate(**attrs)
return model
def to_json(self, file_path=None):
"""Convert the dataclass instance to a JSON formatted string and
optionally save to a file."""
json_str = json.dumps(dataclasses.asdict(self),
ensure_ascii=False,
indent=4)
if file_path:
with open(file_path, 'w', encoding='utf-8') as file:
file.write(json_str)
return json_str
@classmethod
def from_json(cls, file_or_string):
"""Construct a dataclass instance from a JSON file or JSON string."""
try:
# Try to open the input_data as a file path
with open(file_or_string, 'r', encoding='utf-8') as file:
json_data = file.read()
except FileNotFoundError:
# If it's not a file path, assume it's a JSON string
json_data = file_or_string
except IOError:
# If it's not a file path and not a valid JSON string, raise error
raise ValueError(
'Invalid input. Must be a file path or a valid JSON string.')
json_data = json.loads(json_data)
assert json_data.get('model_name', None) is not None, \
'model_name is required for a json chat template.'
if json_data['model_name'] not in MODELS.module_dict.keys():
MODELS.register_module(json_data['model_name'],
module=BaseChatTemplate)
return cls(**json_data)
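# Usage sketch of the ChatTemplateConfig dataclass above (field values and
# the template name are illustrative): unknown model names fall back to
# BaseChatTemplate, and the config can be round-tripped through a JSON file
# via to_json()/from_json().
def _chat_template_config_example():
    """Illustrative sketch; 'my-vicuna-style' is a made-up template name."""
    cfg = ChatTemplateConfig(model_name='my-vicuna-style',
                             meta_instruction='You are a helpful assistant.',
                             user='USER: ',
                             eoh=' ',
                             assistant='ASSISTANT: ',
                             eoa='</s>',
                             stop_words=['</s>'])
    cfg.to_json('chat_template.json')  # also returns the JSON string
    restored = ChatTemplateConfig.from_json('chat_template.json')
    return restored.chat_template  # a BaseChatTemplate instance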
@MODELS.register_module(name='internlm')
@MODELS.register_module(name='llama')
@MODELS.register_module(name='base')
class BaseModel:
......@@ -24,18 +100,10 @@ class BaseModel:
def __init__(self,
session_len=2048,
top_p=0.8,
top_k=None,
temperature=0.8,
repetition_penalty=1.0,
capability='chat',
stop_words=None,
**kwargs):
self.session_len = session_len
self.top_p = top_p
self.top_k = top_k
self.temperature = temperature
self.repetition_penalty = repetition_penalty
self.stop_words = stop_words
self.capability = capability
......@@ -50,43 +118,8 @@ class BaseModel:
Returns:
str: the concatenated prompt
"""
if self.capability == 'completion':
return prompt
else:
return self.decorate_prompt(prompt, sequence_start)
@abstractmethod
def decorate_prompt(self, prompt, sequence_start):
return prompt
@staticmethod
def _translate_messages(messages: List):
"""Translate messages into system, user speaking list, assistant
speaking list.
Args:
messages (List): chat history
Returns:
Tuple: consists of system (str), users (List[str]),
assistants (List[str])
"""
system = None
users = []
assistants = []
assert isinstance(messages, List)
for message in messages:
msg_role = message['role']
if msg_role == 'system':
system = message['content']
elif msg_role == 'user':
users.append(message['content'])
elif msg_role == 'assistant':
assistants.append(message['content'])
else:
raise ValueError(f'Unknown role: {msg_role}')
assistants.append(None)
return system, users, assistants
@abstractmethod
def messages2prompt(self, messages, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
......@@ -103,31 +136,40 @@ class BaseModel:
return self.get_prompt(messages)
# chat history processing in derived classes
@property
def sampling_param(self):
return SamplingParam(top_p=self.top_p,
top_k=self.top_k,
temperature=self.temperature,
repetition_penalty=self.repetition_penalty)
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
model_path (str): the model path used for matching.
"""
return None
@MODELS.register_module(name='wizardlM')
@MODELS.register_module(name='vicuna')
class Vicuna(BaseModel):
"""Chat template of vicuna model."""
def __init__(
self,
system="""A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. """, # noqa: E501
user='USER',
assistant='ASSISTANT',
**kwargs):
class BaseChatTemplate(BaseModel):
"""Base Chat template."""
def __init__(self,
system='',
meta_instruction='',
eosys='',
user='',
eoh='',
assistant='',
eoa='',
separator='',
**kwargs):
super().__init__(**kwargs)
self.system = system
self.meta_instruction = meta_instruction
self.user = user
self.eoh = eoh
self.eoa = eoa
self.separator = separator
self.eosys = eosys
self.assistant = assistant
def decorate_prompt(self, prompt, sequence_start=True):
def get_prompt(self, prompt, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
chat template.
......@@ -138,12 +180,20 @@ class Vicuna(BaseModel):
Returns:
str: the concatenated prompt
"""
assert self.capability == 'chat', \
f'{type(self).__name__} has no capability of {self.capability}'
if self.capability == 'completion':
return prompt
if sequence_start:
return f'{self.system} {self.user}: {prompt} {self.assistant}: '
# None is different from ''
if self.meta_instruction is not None:
return f'{self.system}{self.meta_instruction}{self.eosys}' \
f'{self.user}{prompt}{self.eoh}' \
f'{self.assistant}'
else:
return f'{self.user}{prompt}{self.eoh}' \
f'{self.assistant}'
else:
return f'</s>{self.user}: {prompt} {self.assistant}: '
return f'{self.separator}{self.user}{prompt}{self.eoh}' \
f'{self.assistant}'
def messages2prompt(self, messages, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
......@@ -156,20 +206,65 @@ class Vicuna(BaseModel):
"""
if isinstance(messages, str):
return self.get_prompt(messages, sequence_start)
system, users, assistants = self._translate_messages(messages)
system = self.system if not system else system
ret = system + ' '
for user, assistant in zip(users, assistants):
if assistant:
ret += f'{self.user}: {user} {self.assistant}: {assistant}</s>'
else:
ret += f'{self.user}: {user} {self.assistant}: '
box_map = dict(user=self.user,
assistant=self.assistant,
system=self.system)
eox_map = dict(user=self.eoh,
assistant=self.eoa + self.separator,
system=self.eosys)
ret = ''
if self.meta_instruction is not None:
if len(messages) and messages[0]['role'] != 'system':
ret += f'{self.system}{self.meta_instruction}{self.eosys}'
for message in messages:
role = message['role']
content = message['content']
ret += f'{box_map[role]}{content}{eox_map[role]}'
ret += f'{self.assistant}'
return ret
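# Illustrative sketch, not part of this commit: how BaseChatTemplate assembles
# prompts from the pieces above. The marker strings and the helper name
# `_demo_base_chat_template` are made up for demonstration only.
def _demo_base_chat_template():
    demo = BaseChatTemplate(system='<sys>', meta_instruction='be helpful',
                            eosys='\n', user='<usr>', eoh='\n',
                            assistant='<bot>', eoa='</s>', separator='\n')
    # a plain string is decorated by get_prompt()
    assert demo.get_prompt('hi') == '<sys>be helpful\n<usr>hi\n<bot>'
    # an OpenAI-style message list is rendered by messages2prompt(); when the
    # first message is not a system message, meta_instruction is prepended
    msgs = [dict(role='user', content='hi'),
            dict(role='assistant', content='hello'),
            dict(role='user', content='bye')]
    assert demo.messages2prompt(msgs) == \
        '<sys>be helpful\n<usr>hi\n<bot>hello</s>\n<usr>bye\n<bot>'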
@MODELS.register_module(name='wizardlm')
@MODELS.register_module(name='vicuna')
class Vicuna(BaseChatTemplate):
"""Chat template of vicuna model."""
def __init__(
self,
meta_instruction="""A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.""", # noqa: E501
eosys=' ',
user='USER: ',
eoh=' ',
assistant='ASSISTANT: ',
eoa='</s>',
stop_words=['</s>'],
**kwargs):
super().__init__(meta_instruction=meta_instruction,
eosys=eosys,
user=user,
eoh=eoh,
assistant=assistant,
eoa=eoa,
stop_words=stop_words,
**kwargs)
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
model_path (str): the model path used for matching.
"""
if 'vicuna' in model_path.lower():
return 'vicuna'
if 'wizardlm' in model_path.lower():
return 'wizardlm'
@MODELS.register_module(name='internlm-chat')
@MODELS.register_module(name='internlm-chat-7b')
class InternLMChat7B(BaseModel):
@MODELS.register_module(name='internlm')
class InternLMChat7B(BaseChatTemplate):
"""Chat template of InternLM model."""
def __init__(
......@@ -179,67 +274,36 @@ class InternLMChat7B(BaseModel):
- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.
- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文.
""", # noqa: E501
eosys='\n',
user='<|User|>:',
eoh='\n',
eoa='<eoa>\n',
eosys='\n',
assistant='<|Bot|>:',
eoa='<eoa>',
separator='\n',
stop_words=['<eoa>'],
**kwargs):
super().__init__(**kwargs)
self.system = system
self.meta_instruction = meta_instruction
self.user = user
self.eoh = eoh
self.eoa = eoa
self.eosys = eosys
self.assistant = assistant
self.stop_words = stop_words
def decorate_prompt(self, prompt, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
chat template.
Args:
prompt (str): user's input prompt
sequence_start (bool): indicator for the first round chat of a
session sequence
Returns:
str: the concatenated prompt
"""
assert self.capability == 'chat', \
f'{type(self).__name__} has no capability of {self.capability}'
if sequence_start:
return f'{self.system}{self.meta_instruction}{self.eosys}' \
f'{self.user}{prompt}{self.eoh}' \
f'{self.assistant}'
else:
return f'\n{self.user}{prompt}{self.eoh}' \
f'{self.assistant}'
super().__init__(system=system,
meta_instruction=meta_instruction,
eosys=eosys,
user=user,
eoh=eoh,
assistant=assistant,
eoa=eoa,
separator=separator,
stop_words=stop_words,
**kwargs)
def messages2prompt(self, messages, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
chat template.
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
messages (str | List): user's input prompt
Returns:
str: the concatenated prompt
model_path (str): the model path used for matching.
"""
if isinstance(messages, str):
return self.get_prompt(messages, sequence_start)
eox_map = dict(user=self.eoh, assistant=self.eoa, system=self.eosys)
ret = ''
if self.meta_instruction:
ret += f'{self.system}:{self.meta_instruction}{self.eosys}'
for message in messages:
role = message['role']
content = message['content']
ret += f'{eval(f"self.{role}")}{content}{eox_map[role]}'
ret += f'{self.assistant}'
return ret
path = model_path.lower()
if all([c not in path for c in ['internlm2', '8k']]) and \
all([c in path for c in ['internlm', 'chat']]):
return 'internlm'
@MODELS.register_module(name='internlm-chat-20b')
......@@ -254,7 +318,7 @@ class InternLMChat7B8K(InternLMChat7B):
@MODELS.register_module(name='internlm-20b')
class InternLMBaseModel20B(BaseModel):
class InternLMBaseModel20B(BaseChatTemplate):
"""Generation parameters of InternLM-20B-Base model."""
def __init__(self, session_len=4096, capability='completion', **kwargs):
......@@ -263,71 +327,94 @@ class InternLMBaseModel20B(BaseModel):
**kwargs)
@MODELS.register_module(
name=['internlm2-1_8b', 'internlm2-7b', 'internlm2-20b'])
class InternLM2BaseModel7B(BaseChatTemplate):
"""Generation parameters of InternLM2-7B-Base model."""
def __init__(self, session_len=32768, capability='completion', **kwargs):
super().__init__(session_len=session_len,
capability=capability,
**kwargs)
@MODELS.register_module(name=[
'internlm2-chat', 'internlm2-chat-1_8b', 'internlm2-chat-7b',
'internlm2-chat-20b'
])
@MODELS.register_module(name='internlm2')
class InternLM2Chat7B(InternLMChat7B):
"""Chat template and generation parameters of InternLM2-Chat-7B."""
def __init__(self,
session_len=32768,
system='<|im_start|>system\n',
user='<|im_start|>user\n',
assistant='<|im_start|>assistant\n',
eosys='<|im_end|>\n',
eoh='<|im_end|>\n',
eoa='<|im_end|>',
separator='\n',
stop_words=['<|im_end|>', '<|action_end|>'],
**kwargs):
super(InternLM2Chat7B, self).__init__(session_len=session_len,
system=system,
user=user,
assistant=assistant,
eosys=eosys,
eoh=eoh,
eoa=eoa,
separator=separator,
stop_words=stop_words,
**kwargs)
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
model_path (str): the model path used for matching.
"""
path = model_path.lower()
if 'internlm2' in path and ('chat' in path or 'math' in path):
return 'internlm2'
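# Illustrative sketch, not part of this commit: the first-turn prompt produced
# by the internlm2 template registered above. `_demo_internlm2_prompt` is a
# hypothetical helper for demonstration only.
def _demo_internlm2_prompt():
    chat = MODELS.get('internlm2')()
    # inherits InternLMChat7B's meta_instruction; the result is shaped like
    # '<|im_start|>system\n{meta_instruction}<|im_end|>\n'
    # '<|im_start|>user\nhi<|im_end|>\n<|im_start|>assistant\n'
    return chat.get_prompt('hi')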
@MODELS.register_module(name='baichuan-7b')
class Baichuan7B(BaseModel):
@MODELS.register_module(name='baichuan-base')
class Baichuan7B(BaseChatTemplate):
"""Generation parameters of Baichuan-7B base model."""
def __init__(self, repetition_penalty=1.1, **kwargs):
def __init__(self, **kwargs):
super().__init__(**kwargs)
self.repetition_penalty = repetition_penalty
@MODELS.register_module(name='baichuan2-7b')
class Baichuan2_7B(BaseModel):
@MODELS.register_module(name='baichuan2')
class Baichuan2_7B(BaseChatTemplate):
"""Chat template and generation parameters of Baichuan2-7B-Base and
Baichuan2-7B-Chat models."""
def __init__(self,
temperature=0.3,
top_k=5,
top_p=0.85,
repetition_penalty=1.05,
user='<reserved_106>',
assistant='<reserved_107>',
**kwargs):
super().__init__(temperature=temperature,
top_k=top_k,
top_p=top_p,
repetition_penalty=repetition_penalty,
**kwargs)
self.user_token = '<reserved_106>' # id = 195
self.assistant_token = '<reserved_107>' # id = 196
super().__init__(user=user, assistant=assistant, **kwargs)
def decorate_prompt(self, prompt, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
chat template.
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
prompt (str): user's input prompt
sequence_start (bool): indicator for the first round chat of a
session sequence
Returns:
str: the concatenated prompt
model_path (str): the model path used for matching.
"""
assert self.capability == 'chat', \
f'{type(self).__name__} has no capability of {self.capability}'
return f'{self.user_token}{prompt}{self.assistant_token}'
def messages2prompt(self, messages, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
chat template.
Args:
messages (str | List): user's input prompt
Returns:
str: the concatenated prompt
"""
if isinstance(messages, str):
return self.get_prompt(messages, sequence_start)
system, users, assistants = self._translate_messages(messages)
ret = ''
for user, assistant in zip(users, assistants):
ret += f'{self.user_token}{user}{self.assistant_token}'
if assistant:
ret += f'{assistant}'
return ret
path = model_path.lower()
if 'baichuan2' in path and 'chat' in path:
return 'baichuan2'
@MODELS.register_module(name='puyu')
class Puyu(BaseModel):
class Puyu(BaseChatTemplate):
"""Chat template of puyu model.This is only for internal usage in Shanghai
AI Laboratory."""
......@@ -341,217 +428,136 @@ class Puyu(BaseModel):
eoa='',
stop_words=None,
**kwargs):
super().__init__(**kwargs)
self.meta_instruction = meta_instruction
self.system = system
self.user = user
self.assistant = assistant
self.stop_words = stop_words
self.eosys = eosys
self.eoh = eoh
self.eoa = eoa
def decorate_prompt(self, prompt, sequence_start=True):
assert self.capability == 'chat', \
f'{type(self).__name__} has no capability of {self.capability}'
if sequence_start:
return f'{self.system}{self.meta_instruction}{self.eosys}' \
f'{self.user}{prompt}{self.eoh}' \
f'{self.assistant}'
else:
return f'{self.eoa}{self.user}{prompt}{self.eoh}{self.assistant}'
super().__init__(meta_instruction=meta_instruction,
system=system,
eosys=eosys,
user=user,
eoh=eoh,
assistant=assistant,
eoa=eoa,
stop_words=stop_words,
**kwargs)
def messages2prompt(self, messages, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
chat template.
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
messages (str | List): user's input prompt
sequence_start (bool): flag to start the sequence
Returns:
str: the concatenated prompt
model_path (str): the model path used for matching.
"""
if isinstance(messages, str):
return self.get_prompt(messages, sequence_start)
eox_map = dict(user=self.eoh, assistant=self.eoa, system=self.eosys)
ret = ''
if self.meta_instruction:
ret += f'{self.system}{self.meta_instruction}{self.eosys}'
for message in messages:
role = message['role']
content = message['content']
ret += f'{eval(f"self.{role}")}{content}{eox_map[role]}'
ret += f'{self.assistant}'
return ret
if 'puyu' in model_path.lower():
return 'puyu'
@MODELS.register_module(name='llama2')
class Llama2(BaseModel):
@MODELS.register_module(name=['llama2', 'llama-2', 'llama-2-chat'])
class Llama2(BaseChatTemplate):
"""Chat template of LLaMA2 model."""
def __init__(
self,
b_inst='[INST]',
e_inst='[/INST]',
b_sys='<<SYS>>\n',
e_sys='\n<</SYS>>\n\n',
system="""\
system='[INST] <<SYS>>\n',
meta_instruction="""\
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""", # noqa: E501
eosys='\n<</SYS>>\n\n',
assistant=' [/INST] ',
eoa='</s>',
separator='<s>[INST] ',
session_len=4096,
**kwargs):
super().__init__(**kwargs)
self.b_inst = b_inst
self.e_inst = e_inst
self.b_sys = b_sys
self.e_sys = e_sys
self.default_sys_prompt = system
self.session_len = session_len
super().__init__(system=system,
meta_instruction=meta_instruction,
eosys=eosys,
assistant=assistant,
eoa=eoa,
separator=separator,
session_len=session_len,
**kwargs)
def decorate_prompt(self, prompt, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
chat template.
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
prompt (str): user's input prompt
sequence_start (bool): indicator for the first round chat of a
session sequence
Returns:
str: the concatenated prompt
model_path (str): the model path used for matching.
"""
assert self.capability == 'chat', \
f'{type(self).__name__} has no capability of {self.capability}'
if sequence_start:
return f'{self.b_inst} ' \
f'{self.b_sys} {self.default_sys_prompt} {self.e_sys}' \
f'{prompt} {self.e_inst} '
return f'{self.b_inst} {prompt} {self.e_inst} '
if 'llama-2' in model_path.lower() or 'llama2' in model_path.lower():
return 'llama2'
def messages2prompt(self, messages, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
chat template.
Args:
messages (str | List): user's input prompt
Returns:
str: the concatenated prompt
"""
if isinstance(messages, str):
return self.get_prompt(messages, sequence_start)
system, users, assistants = self._translate_messages(messages)
system = self.default_sys_prompt if not system else system
ret = f'{self.b_inst} {self.b_sys} {system} {self.e_sys}'
for i, (user, assistant) in enumerate(zip(users, assistants)):
if i != 0:
ret += f'{self.b_inst} '
if assistant:
ret += f'{user} {self.e_inst} {assistant}'
else:
ret += f'{user} {self.e_inst} '
return ret
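# Illustrative sketch, not part of this commit: with the BaseChatTemplate-based
# llama2 template above, a first-turn prompt is rendered as
# '[INST] <<SYS>>\n{meta_instruction}\n<</SYS>>\n\n{prompt} [/INST] '.
# `_demo_llama2_prompt` is a hypothetical helper for demonstration only.
def _demo_llama2_prompt():
    chat = MODELS.get('llama2')()
    return chat.get_prompt("What's the capital of France?")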
@MODELS.register_module(name='qwen-72b')
@MODELS.register_module(name='qwen-14b')
@MODELS.register_module(name='qwen-7b')
class Qwen7BChat(BaseModel):
@MODELS.register_module(name='qwen')
class Qwen7BChat(BaseChatTemplate):
"""Chat template for Qwen-7B-Chat."""
def __init__(self,
session_len=8192,
top_p=0.5,
top_k=40,
temperature=1.0,
im_start='<|im_start|>',
im_end='<|im_end|>',
system='You are a helpful assistant.',
system='<|im_start|>system\n',
meta_instruction='You are a helpful assistant.',
eosys='<|im_end|>\n',
user='<|im_start|>user\n',
eoh='<|im_end|>\n',
assistant='<|im_start|>assistant\n',
eoa='<|im_end|>',
separator='\n',
stop_words=['<|im_end|>'],
**kwargs):
super().__init__(**kwargs)
self.session_len = session_len
self.top_p = top_p
self.top_k = top_k
self.temperature = temperature
self.im_start = im_start
self.im_end = im_end
self.system = system
self.stop_words = stop_words
def decorate_prompt(self, prompt, sequence_start=True):
assert self.capability == 'chat', \
f'{type(self).__name__} has no capability of {self.capability}'
if sequence_start:
return f'{self.im_start}system\n{self.system}{self.im_end}' \
f'\n{self.im_start}user\n{prompt}{self.im_end}' \
f'\n{self.im_start}assistant\n'
return f'\n{self.im_start}user\n{prompt}{self.im_end}' \
f'\n{self.im_start}assistant\n'
super().__init__(system=system,
meta_instruction=meta_instruction,
eosys=eosys,
user=user,
eoh=eoh,
assistant=assistant,
eoa=eoa,
separator=separator,
stop_words=stop_words,
session_len=session_len,
**kwargs)
def messages2prompt(self, messages, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
chat template.
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
messages (str | List): user's input prompt
Returns:
str: the concatenated prompt
model_path (str): the model path used for matching.
"""
if isinstance(messages, str):
return self.get_prompt(messages, sequence_start)
system, users, assistants = self._translate_messages(messages)
system = self.system if not system else system
ret = f'{self.im_start}system\n{system}{self.im_end}'
for user, assistant in zip(users, assistants):
if assistant:
ret += f'\n{self.im_start}user\n{user}{self.im_end}' \
f'\n{self.im_start}assistant\n{assistant}'
else:
ret += f'\n{self.im_start}user\n{user}{self.im_end}' \
f'\n{self.im_start}assistant\n'
return ret
if 'qwen' in model_path.lower():
return 'qwen'
@MODELS.register_module(name='codellama')
class CodeLlama(Llama2):
def __init__(self,
system='',
meta_instruction='',
session_len=4096,
suffix_first=False,
stop_words=None,
**kwargs):
super().__init__(**kwargs)
super().__init__(meta_instruction=meta_instruction,
session_len=session_len,
stop_words=stop_words,
**kwargs)
caps = ['completion', 'infilling', 'chat', 'python']
assert self.capability in caps, \
f'{self.capability} is not supported. ' \
f'The supported capabilities are: {caps}'
self.default_sys_prompt = system
self.meta_instruction = meta_instruction
self.session_len = session_len
self.suffix_first = suffix_first
self.stop_words = stop_words
# The following sampling parameters refers to https://github.com/facebookresearch/codellama # noqa: E501
if self.capability == 'completion' or self.capability == 'python':
self.top_p = kwargs.get('top_p', 0.9)
self.temperature = kwargs.get('temperature', 0.2)
if self.capability == 'chat':
self.top_p = kwargs.get('top_p', 0.95)
self.temperature = kwargs.get('temperature', 0.2)
elif self.capability == 'infilling':
self.top_p = kwargs.get('top_p', 0.9)
self.temperature = kwargs.get('temperature', 0.0)
if self.capability == 'infilling':
if self.stop_words is None:
self.stop_words = ['<EOT>']
def decorate_prompt(self, prompt, sequence_start=True):
def get_prompt(self, prompt, sequence_start=True):
if self.capability == 'infilling':
return self._infill_prompt(prompt)
elif self.capability == 'chat':
return self._get_prompt(prompt, sequence_start)
return super().get_prompt(prompt, sequence_start)
else:  # python specialist
return prompt
......@@ -565,92 +571,130 @@ class CodeLlama(Llama2):
prompt = f'<PRE> {prefix} <SUF>{suffix} <MID>'
return prompt
def _get_prompt(self, prompt, sequence_start):
prompt = prompt.strip()
if sequence_start:
return f'{self.b_inst} ' \
f'{self.b_sys}{self.default_sys_prompt}{self.e_sys}' \
f'{prompt} {self.e_inst}'
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
model_path (str): the model path used for matching.
"""
if 'codellama' in model_path.lower():
return 'codellama'
@MODELS.register_module(name='falcon')
class Falcon(BaseModel):
def __init__(self, **kwargs):
super().__init__(**kwargs)
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
model_path (str): the model path used for matching.
"""
if 'falcon' in model_path.lower():
return 'falcon'
@MODELS.register_module(name='chatglm2-6b')
@MODELS.register_module(name='chatglm')
class ChatGLM2(BaseModel):
def __init__(self,
user='问:',
eoh='\n\n',
assistant='答:',
eoa='\n\n',
**kwargs):
super().__init__(**kwargs)
self._user = user
self._assistant = assistant
self._eoh = eoh
self._eoa = eoa
self.count = 0
return f'{self.b_inst} {prompt} {self.e_inst}'
def get_prompt(self, prompt, sequence_start=True):
"""get prompt."""
# need more check
# https://github.com/THUDM/ChatGLM2-6B/issues/48
# [64790, 64792] to be prepended
self.count += 1
ret = f'[Round {self.count}]\n\n'
ret += f'{self._user}{prompt}{self._eoh}'
ret += f'{self._assistant}'
return ret
def messages2prompt(self, messages, sequence_start=True):
assert self.capability == 'chat', \
f'codellama messages2prompt only supports chat mode ' \
f'but got {self.capability} mode'
return super().messages2prompt(messages, sequence_start)
"""message to prompt."""
if isinstance(messages, str):
return self.get_prompt(messages, sequence_start)
ret = ''
count = 0
for message in messages:
role = message['role']
content = message['content']
if role == 'user':
count += 1
ret += f'[Round {count}]\n\n'
ret += f'{self._user}{content}{self._eoh}'
ret += f'{self._assistant}'
if role == 'assistant':
ret += f'{content}'
return ret
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
model_path (str): the model path used for matching.
"""
if 'chatglm' in model_path.lower():
return 'chatglm'
@MODELS.register_module(name='solar')
class SOLAR(BaseModel):
@MODELS.register_module(name=['solar', 'solar-70b'])
class SOLAR(BaseChatTemplate):
"""Chat template of SOLAR model.
`https://huggingface.co/upstage/SOLAR-0-70b-16bit`
"""
def __init__(self,
b_sys='### System:\n',
e_sys='\n\n',
system='### System:\n',
eosys='\n\n',
user='### User:\n',
eoh='\n\n',
assistant='### Assistant:\n',
eoa='\n\n',
system='',
meta_instruction='',
session_len=2048,
**kwargs):
super().__init__(**kwargs)
self.b_sys = b_sys
self.e_sys = e_sys
self.system = system
self.eosys = eosys
self.user = user
self.eoh = eoh
self.assistant = assistant
self.eoa = eoa
self.system = system
self.meta_instruction = meta_instruction
self.session_len = session_len
def decorate_prompt(self, prompt, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
chat template.
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
prompt (str): user's input prompt
sequence_start (bool): indicator for the first round chat of a
session sequence
Returns:
str: the concatenated prompt
model_path (str): the model path used for matching.
"""
assert self.capability == 'chat', \
f'{type(self).__name__} has no capability of {self.capability}'
if sequence_start:
return f'{self.b_sys}{self.system}{self.e_sys}' \
f'{self.user}{prompt}{self.eoh}{self.assistant}'
return f'{self.user}{prompt}{self.eoh}{self.assistant}'
def messages2prompt(self, messages, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
chat template.
Args:
messages (str | List): user's input prompt
Returns:
str: the concatenated prompt
"""
if isinstance(messages, str):
return self.get_prompt(messages, sequence_start)
system, users, assistants = self._translate_messages(messages)
system = self.system if not system else system
ret = f'{self.b_sys}{system}{self.e_sys}'
for i, (user, assistant) in enumerate(zip(users, assistants)):
ret += f'{self.user}{user}{self.eoh}{self.assistant}'
if assistant:
ret += f'{assistant}{self.eoa}'
return ret
if 'solar' in model_path.lower():
return 'solar'
@MODELS.register_module(name='ultracm')
@MODELS.register_module(name='ultralm')
class UltraChat(BaseModel):
class UltraChat(BaseChatTemplate):
"""Template of UltraCM and UltraLM models.
`https://huggingface.co/openbmb/UltraCM-13b`
......@@ -659,147 +703,222 @@ class UltraChat(BaseModel):
def __init__(
self,
system="""User: A one-turn chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, very detailed, and polite answers to the user's questions.</s>""", # noqa: E501
eos='</s>',
system='User: ',
meta_instruction="""A one-turn chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, very detailed, and polite answers to the user's questions.""", # noqa: E501
eosys='</s>\n',
user='User: ',
eoh='</s>\n',
assistant='Assistant: ',
eoa='</s>',
separator='\n',
stop_words=['</s>'],
session_len=2048,
**kwargs):
super().__init__(**kwargs)
self.system = system
self.eos = eos
self.session_len = session_len
self.user = user
self.assistant = assistant
def decorate_prompt(self, prompt, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
chat template.
Args:
prompt (str): the input prompt
sequence_start (bool): indicator for the first round chat of a
session sequence
Returns:
str: the concatenated prompt
"""
assert self.capability == 'chat', \
f'{type(self).__name__} has no capability of {self.capability}'
if sequence_start:
return f'{self.system}\n{self.user}{prompt}{self.eos}' \
f'\n{self.assistant}'
return f'\n{self.user}{prompt}{self.eos}' \
f'\n{self.assistant}'
super().__init__(system=system,
meta_instruction=meta_instruction,
eosys=eosys,
user=user,
eoh=eoh,
assistant=assistant,
eoa=eoa,
separator=separator,
stop_words=stop_words,
session_len=session_len,
**kwargs)
def messages2prompt(self, messages, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
chat template. Only evaluate the last instruction completion pair.
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
messages (str | List): user's input prompt
Returns:
str: the concatenated prompt
model_path (str): the model path used for matching.
"""
if isinstance(messages, str):
return self.get_prompt(messages, sequence_start)
system, users, assistants = self._translate_messages(messages)
system = self.system if not system else system
ret = f'{system}'
for user, assistant in zip(users, assistants):
if assistant:
ret += f'\n{self.user}{user}{self.eos}' \
f'\n{self.assistant}{assistant}{self.eos}'
else:
ret += f'\n{self.user}{user}{self.eos}' \
f'\n{self.assistant}'
return ret
if 'ultracm' in model_path.lower():
return 'ultracm'
if 'ultralm' in model_path.lower():
return 'ultralm'
@MODELS.register_module(name='yi')
class Yi(BaseModel):
@MODELS.register_module(name=['yi', 'yi-chat', 'yi-200k', 'yi-34b'])
class Yi(BaseChatTemplate):
"""Chat template of Yi model."""
def __init__(self,
system='<|im_start|>system\n',
meta_instruction=None,
eosys='<|im_end|>\n',
user='<|im_start|>user\n',
eoh='<|im_end|>\n',
eoa='<|im_end|>\n',
eosys='<|im_end|>\n',
assistant='<|im_start|>assistant\n',
eoa='<|im_end|>',
separator='\n',
stop_words=['<|im_end|>', '<|endoftext|>'],
**kwargs):
super().__init__(**kwargs)
self.system = system
self.meta_instruction = meta_instruction
self.user = user
self.eoh = eoh
self.eoa = eoa
self.eosys = eosys
self.assistant = assistant
self.stop_words = stop_words
super().__init__(system=system,
meta_instruction=meta_instruction,
eosys=eosys,
user=user,
eoh=eoh,
assistant=assistant,
eoa=eoa,
separator=separator,
stop_words=stop_words,
**kwargs)
def decorate_prompt(self, prompt, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
chat template.
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
prompt (str): user's input prompt
sequence_start (bool): indicator for the first round chat of a
session sequence
Returns:
str: the concatenated prompt
model_path (str): the model path used for matching.
"""
assert self.capability == 'chat', \
f'{type(self).__name__} has no capability of {self.capability}'
if sequence_start:
if self.meta_instruction is None:
return f'{self.user}{prompt}{self.eoh}' \
f'{self.assistant}'
return f'{self.system}{self.meta_instruction}{self.eosys}' \
f'{self.user}{prompt}{self.eoh}' \
f'{self.assistant}'
else:
return f'{self.user}{prompt}{self.eoh}' \
f'{self.assistant}'
path = model_path.lower()
if 'yi' in path and 'vl' not in path:
return 'yi'
def messages2prompt(self, messages, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
chat template.
@MODELS.register_module(name=['mistral', 'mixtral'])
@MODELS.register_module(name=['Mistral-7B-Instruct', 'Mixtral-8x7B-Instruct'])
class MistralChat(BaseChatTemplate):
"""Template of Mistral and Mixtral Instruct models.
`https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1`
`https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1`
"""
def __init__(self,
user='[INST] ',
eoh=' [/INST]',
eoa='</s>',
session_len=2048,
**kwargs):
super().__init__(user=user,
eoh=eoh,
eoa=eoa,
session_len=session_len,
**kwargs)
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
messages (str | List): user's input prompt
Returns:
str: the concatenated prompt
model_path (str): the model path used for matching.
"""
if 'instruct' in model_path.lower():
if 'mistral' in model_path.lower():
return 'mistral'
if 'mixtral' in model_path.lower():
return 'mixtral'
if isinstance(messages, str):
return self.get_prompt(messages, sequence_start)
eox_map = dict(user=self.eoh, assistant=self.eoa, system=self.eosys)
ret = ''
if self.meta_instruction:
ret += f'{self.system}:{self.meta_instruction}{self.eosys}'
for message in messages:
role = message['role']
content = message['content']
ret += f'{eval(f"self.{role}")}{content}{eox_map[role]}'
ret += f'{self.assistant}'
return ret
@MODELS.register_module(name=['gemma'])
class Gemma(BaseChatTemplate):
"""Template of Gemma models.
`https://huggingface.co/google/gemma-7b-it`
"""
def __init__(self,
user='<start_of_turn>user\n',
eoh='<end_of_turn>\n',
assistant='<start_of_turn>model\n',
eoa='<end_of_turn>\n',
**kwargs):
super().__init__(user=user,
eoh=eoh,
assistant=assistant,
eoa=eoa,
**kwargs)
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
model_path (str): the model path used for matching.
"""
if 'gemma' in model_path.lower():
return 'gemma'
def main(model_name: str = 'test'):
assert model_name in MODELS.module_dict.keys(), \
f"'{model_name}' is not supported. " \
f'The supported models are: {MODELS.module_dict.keys()}'
model = MODELS.get(model_name)()
prompt = model.get_prompt(prompt='hi')
print(prompt)
print(f'session_len: {model.session_len}')
@MODELS.register_module(name=['deepseek-chat'])
@MODELS.register_module(name=['deepseek'])
class Deepseek(BaseChatTemplate):
def __init__(self,
user='User: ',
eoh='\n\n',
assistant='Assistant: ',
eoa='<|end▁of▁sentence|>',
**kwargs):
super().__init__(user=user,
eoh=eoh,
assistant=assistant,
eoa=eoa,
**kwargs)
if __name__ == '__main__':
import fire
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
fire.Fire(main)
Args:
model_path (str): the model path used for matching.
"""
path = model_path.lower()
if 'deepseek' in path and 'chat' in path:
return 'deepseek'
@MODELS.register_module(name=['yi-vl'])
class YiVL(BaseChatTemplate):
def __init__(
self,
meta_instruction="""This is a chat between an inquisitive human and an AI assistant. Assume the role of the AI assistant. Read all the images carefully, and respond to the human's questions with informative, helpful, detailed and polite answers. 这是一个好奇的人类和一个人工智能助手之间的对话。假设你扮演这个AI助手的角色。仔细阅读所有的图像,并对人类的问题做出信息丰富、有帮助、详细的和礼貌的回答。\n\n""", # noqa: E501
user='### Human: ',
eoh='\n',
assistant='### Assistant:',
eoa='\n',
stop_words=['###'],
**kwargs):
super().__init__(meta_instruction=meta_instruction,
user=user,
eoh=eoh,
assistant=assistant,
eoa=eoa,
stop_words=stop_words,
**kwargs)
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
model_path (str): the model path used for matching.
"""
path = model_path.lower()
if 'yi-vl' in path:
return 'yi-vl'
def best_match_model(query: str) -> Optional[str]:
"""Get the model that matches the query.
Args:
query (str): the input query. Could be a model path.
Return:
str | None: the possible model name or none.
"""
for name, model in MODELS.module_dict.items():
if model.match(query):
return model.match(query)
try:
from transformers import AutoTokenizer
tokenizer_config = AutoTokenizer.from_pretrained(
query, trust_remote_code=True)
if tokenizer_config.chat_template is None:
return 'base'
except Exception as e:
assert type(e) == OSError
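# Illustrative sketch, not part of this commit: best_match_model maps a model
# path to a registered template name through the per-class match() hooks. The
# paths below are examples only and nothing is downloaded by match().
def _demo_best_match_model():
    assert best_match_model('./models/internlm2-chat-7b') == 'internlm2'
    assert best_match_model('./models/vicuna-13b-v1.5') == 'vicuna'
    assert best_match_model('./models/llama-2-7b-chat') == 'llama2'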
# Copyright (c) OpenMMLab. All rights reserved.
"""Chat with torch models."""
......@@ -13,6 +13,7 @@ class LoadNoInit:
self.normal_ = torch.nn.init.normal_
self.kaiming_uniform_ = torch.nn.init.kaiming_uniform_
self.kaiming_normal_ = torch.nn.init.kaiming_normal_
self.tensor_normal_ = torch.Tensor.normal_
def __enter__(self, *args, **kwargs):
"""Replace initializers with no-op."""
......@@ -24,6 +25,7 @@ class LoadNoInit:
torch.nn.init.normal_ = lambda *args, **kwargs: None
torch.nn.init.kaiming_uniform_ = lambda *args, **kwargs: None
torch.nn.init.kaiming_normal_ = lambda *args, **kwargs: None
torch.Tensor.normal_ = lambda *args, **kwargs: None
def __exit__(self, *args, **kwargs):
"""Recover."""
......@@ -35,3 +37,4 @@ class LoadNoInit:
torch.nn.init.normal_ = self.normal_
torch.nn.init.kaiming_uniform_ = self.kaiming_uniform_
torch.nn.init.kaiming_normal_ = self.kaiming_normal_
torch.Tensor.normal_ = self.tensor_normal_
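# Illustrative sketch, not part of this commit: LoadNoInit is meant to wrap
# model construction so that the random weight-initialization kernels become
# no-ops when the weights are immediately overwritten by a checkpoint. The
# helper name and model path below are made up for demonstration only.
def _demo_load_no_init(model_path='path/to/hf/model'):
    import torch
    from transformers import AutoModelForCausalLM
    with LoadNoInit():
        # inside the context, torch.nn.init.* and Tensor.normal_ do nothing,
        # so building the model skips the costly random initialization
        model = AutoModelForCausalLM.from_pretrained(model_path,
                                                     torch_dtype=torch.float16)
    return model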
# Copyright (c) OpenMMLab. All rights reserved.
import logging
import torch.nn as nn
from .base import BasicAdapter, BasicAdapterFast
from .internlm import InternLMAdapter
from .llama2 import Llama2Adapter
logger = logging.getLogger(__name__)
def _get_default_adapter(tokenizer):
if tokenizer.is_fast:
return BasicAdapterFast
else:
return BasicAdapter
def init_adapter(model: nn.Module, tokenizer, adapter=None):
if adapter is None:
for v in model.modules():
if 'InternLMModel' in v.__class__.__name__:
Adapter = InternLMAdapter
break
elif 'LlamaModel' in v.__class__.__name__:
Adapter = Llama2Adapter
break
else:
Adapter = _get_default_adapter(tokenizer)
elif adapter == 'llama1':
Adapter = _get_default_adapter(tokenizer)
else:
raise ValueError(f'Adapter {adapter} is not allowed.')
logger.info(f'Using adapter {Adapter.__name__}')
return Adapter(tokenizer)
# Copyright (c) OpenMMLab. All rights reserved.
"""Basic adapter suitable for general HuggingFace models."""
import logging
import re
from transformers import (PreTrainedTokenizer, PreTrainedTokenizerBase,
PreTrainedTokenizerFast)
logger = logging.getLogger(__name__)
class BaseAdapter:
"""Base class for all adapters.
Note:
Adapters coordinate with the session manager to prepare input_ids.
The full sequence fed to the model is as follows:
```
adapter.start_ids
adapter.encode_and_decorate(user_input_1)
output_1_generated_by_model
adapter.sep_ids
adapter.encode_and_decorate(user_input_2)
output_2_generated_by_model
adapter.sep_ids
adapter.encode_and_decorate(user_input_3)
```
Thus the adapter is responsible for providing model-specific
``start_ids``, ``sep_ids``, and a method to encode a single prompt.
"""
def __init__(self, tokenizer: PreTrainedTokenizerBase):
self.tokenizer = tokenizer
def encode_and_decorate(self, prompt, add_special_tokens=False):
"""Model specific method to encode and decorate prompt."""
raise NotImplementedError
def decode(self, value):
"""Model specific method to decode single value to string."""
raise NotImplementedError
@property
def stopping_criteria(self):
"""Model specific stopping criteria for generation."""
return None
@property
def start_ids(self):
"""Model specific start ids."""
return [self.tokenizer.bos_token_id]
@property
def sep_ids(self):
"""Model specific separation ids."""
return [self.tokenizer.bos_token_id]
class BasicAdapter(BaseAdapter):
"""Basic adapter for slow tokenizers."""
def encode_and_decorate(self, prompt, add_special_tokens=False):
"""Encode prompt.
Note:
we leave <bos> for the session manager to add.
"""
input_ids = self.tokenizer.encode(
prompt,
add_special_tokens=add_special_tokens,
return_tensors='pt',
)
logger.debug(f'Encode {prompt} to {input_ids}')
return input_ids
def decode(self, value):
"""Fallback when tokenizer is not fast."""
self.tokenizer: PreTrainedTokenizer
tok = self.tokenizer.decode(value)
return tok + ' '
class BasicAdapterFast(BaseAdapter):
"""Basic adapter for slow tokenizers."""
hex_regex = re.compile(r'^<0x([0-9ABCDEF]+)>$')
def encode_and_decorate(self, prompt, add_special_tokens=False):
"""Encode prompt.
Note:
we leave <bos> for the session manager to add.
"""
input_ids = self.tokenizer.encode(
prompt,
add_special_tokens=add_special_tokens,
return_tensors='pt',
)
logger.debug(f'Encode {prompt} to {input_ids}')
return input_ids
def decode(self, value):
"""Decode with fast tokenizers."""
self.tokenizer: PreTrainedTokenizerFast
tok = self.tokenizer._convert_id_to_token(value)
if tok.startswith('▁'): # sentencepiece
space = ' '
tok = tok[1:]
else:
space = ''
if res := self.hex_regex.match(tok):
tok = chr(int(res.group(1), 16))
if tok == '</s>' or tok == '\r':
tok = '\n'
tok = space + tok
logger.debug(f'Decode {value} to {repr(tok)}')
return tok
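# Illustrative sketch, not part of this commit: driving a basic adapter with a
# HuggingFace tokenizer. The tokenizer path is an example only.
def _demo_basic_adapter(tokenizer_path='path/to/hf/model'):
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    adapter = BasicAdapterFast(tokenizer) if tokenizer.is_fast \
        else BasicAdapter(tokenizer)
    input_ids = adapter.encode_and_decorate('hello world')  # shape [1, n]
    # during streaming, generated ids are decoded one at a time
    piece = adapter.decode(int(input_ids[0, 0]))
    return input_ids, piece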
# Copyright (c) OpenMMLab. All rights reserved.
import logging
import re
import torch
from transformers import (PreTrainedTokenizerFast, StoppingCriteria,
StoppingCriteriaList)
from .base import BaseAdapter
logger = logging.getLogger(__name__)
class InternLMStoppingCriteria(StoppingCriteria):
"""Stopping criteria for HF version of InternLM."""
def __call__(self, input_ids, *args, **kwargs) -> bool:
return input_ids[0, -1] in [2, 103028]
class InternLMAdapter(BaseAdapter):
"""Adapter for InternLM.
InternLM uses the following template, and '\n' should be encoded to token id 13.
<bos> (no actual newline here, just for better readability)
<|User|>:{prompt}<eoh>\n
<|Bot|>:{model_output}<eoa>\n
<|User|>:{prompt}<eoh>\n
<|Bot|>:{model_output}<eoa>\n
...
<eos>
"""
hex_regex = re.compile(r'^<0x([0-9ABCDEF]+)>$')
# ids of '<|User|>:'
B_USER_ID = torch.tensor([[333, 352, 1621, 352, 27232]])
# ids of '<eoh>\n<|Bot|>:'
E_USER_ID = torch.tensor([[103027, 13, 333, 352, 23845, 352, 27232]])
# ids of '<bos>'
start_ids = [1]
# ids of '\n'
sep_ids = [13]
def __init__(self, tokenizer: PreTrainedTokenizerFast):
self.tokenizer = tokenizer
def encode_and_decorate(self, prompt):
r"""Encode prompt and decorate with template.
Note:
we leave <bos> and chat history for session manager to add,
so we will decorate input_ids to '<|User|>:{prompt}<eoh>\n<|Bot|>:'
"""
input_ids = self.tokenizer.encode(
prompt,
add_special_tokens=False,
return_tensors='pt',
)
# This is f'<|User|>:{prompt}<eoh>\n<|Bot|>:'
# but force \n to 13 instead of 364
input_ids = torch.cat([self.B_USER_ID, input_ids, self.E_USER_ID],
dim=1)
return input_ids
def decode(self, value):
"""Decode generated tokens for InternLM."""
tok = self.tokenizer.decode(value)
if res := self.hex_regex.match(tok):
tok = chr(int(res.group(1), 16))
if tok == '</s>' or tok == '<eoa>' or tok == '\r':
tok = '\n'
logger.debug(f'Decode {value} to {repr(tok)}')
return tok
@property
def stopping_criteria(self):
return StoppingCriteriaList([InternLMStoppingCriteria()])
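# Illustrative sketch, not part of this commit: the adapter turns a raw prompt
# into the token ids of '<|User|>:{prompt}<eoh>\n<|Bot|>:'; <bos> and the chat
# history are prepended later by the session manager. The path is an example.
def _demo_internlm_adapter(tokenizer_path='path/to/internlm-chat-7b'):
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path,
                                              trust_remote_code=True)
    adapter = InternLMAdapter(tokenizer)
    # equals torch.cat([B_USER_ID, ids('hi'), E_USER_ID], dim=1)
    return adapter.encode_and_decorate('hi')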
# Copyright (c) OpenMMLab. All rights reserved.
import logging
import re
from transformers import PreTrainedTokenizerFast
from .base import BasicAdapterFast
logger = logging.getLogger(__name__)
B_INST, E_INST = '[INST]', '[/INST]'
B_SYS, E_SYS = '<<SYS>>\n', '\n<</SYS>>\n\n'
DEFAULT_SYSTEM_PROMPT = """\
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""" # noqa: E501
class Llama2Adapter(BasicAdapterFast):
"""Adapter for llama2.
Llama2 uses the following template, and the first user prompt
should contain a system prompt.
The user can specify the system prompt with a <<SYS>> tag; otherwise
the default system prompt is prepended to the user's input.
<bos>
[INST]<space>
<<SYS>>\n
SYSTEM_PROMPT\n
<</SYS>>\n\n
{user_prompt_1}<space>
[/INST]<space>
{answer_1}<space>
<eos>
<bos>
[INST]<space>
{user_prompt_2}<space>
[/INST]<space>
{answer_2}<space>
<eos>
<bos>
[INST]<space>
{user_prompt_2}(no space here)
...
"""
start_ids = []
sep_ids = []
def __init__(self, tokenizer: PreTrainedTokenizerFast):
super().__init__(tokenizer)
self.prev_round = 0
def encode_and_decorate(self, prompt):
r"""Encode prompt and decorate with template."""
if self.prev_round == 0:
res = re.search(r'<<SYS>>(.*?)<</SYS>>(.*)', prompt)
if res:
prompt = B_SYS + res.group(1).strip() + \
E_SYS + res.group(2).strip()
else:
prompt = B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + prompt
prompt = f'{B_INST} {prompt.strip()} {E_INST}'
logger.debug(f'decorated prompt: {repr(prompt)}')
input_ids = self.tokenizer.encode(
prompt,
add_special_tokens=True,
return_tensors='pt',
)
self.prev_round += 1
return input_ids
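# Illustrative sketch, not part of this commit: on the first round the default
# (or a user-supplied <<SYS>>...<</SYS>>) system prompt is injected; later
# rounds are wrapped with [INST] ... [/INST] only. The path is an example.
def _demo_llama2_adapter(tokenizer_path='path/to/llama-2-7b-chat'):
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    adapter = Llama2Adapter(tokenizer)
    first = adapter.encode_and_decorate('hello')   # contains the system prompt
    second = adapter.encode_and_decorate('again')  # plain '[INST] again [/INST]'
    return first, second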
# Copyright (c) OpenMMLab. All rights reserved.
"""Chat through command line.
This submodule allows the user to chat with a language model through the
command line, and optionally accelerate the model using backends like deepspeed.
Example 1: Chat with default setting
```python
python -m lmdeploy.pytorch.chat $PATH_TO_HF_MODEL
```
Example 2: Disable sampling
```python
python -m lmdeploy.pytorch.chat \
$PATH_TO_LLAMA_MODEL_IN_HF_FORMAT \
--temperature 0
```
Example 3: Accelerate with deepspeed inference
```python
python -m lmdeploy.pytorch.chat \
$PATH_TO_LLAMA_MODEL_IN_HF_FORMAT \
--accel deepspeed
```
Note: to use deepspeed, you need to install deepspeed,
and if you hope to accelerate InternLM, you need the customized version
https://github.com/wangruohui/DeepSpeed/tree/support_internlm_0.10.0
Example 4: Tensor parallel the model on 2 GPUs
```python
deepspeed --module --num_gpus 2 lmdeploy.pytorch.chat \
$PATH_TO_LLAMA_MODEL_IN_HF_FORMAT \
--accel deepspeed \
```
This module also allows the following control commands to change
generation behaviors during chat.
- `exit`: terminate and exit chat
- `config set key=value`: change generation config `key` to `value`,
e.g. config temperature=0 disables sampling for the following chats
- `clear`: clear chat history
"""
import itertools
import logging
from typing import Optional
import torch
from transformers import GenerationConfig, PreTrainedModel
from .adapters import init_adapter
from .dist import get_local_rank, get_rank, get_world_size
from .model import accel_model, init_model
from .session import BasicSessionManagerWithHistory
from .utils import BasicStreamer, TerminalIO, control
logger = logging.getLogger(__name__)
def set_logging(log_file: str, debug: bool):
torch.set_printoptions(linewidth=120)
level = logging.DEBUG if debug else logging.INFO
log_file = log_file or 'chat.log'
if (r := get_rank()) != 0:
log_file = log_file + f'.{r}'
logging.basicConfig(level=level,
format=('%(filename)s: '
'%(levelname)s: '
'%(funcName)s(): '
'%(lineno)d:\t'
'%(message)s'),
filename=log_file,
filemode='w')
print(f'Worker {get_rank()} logging to {log_file}')
def main(
model_path: str,
tokenizer_path: Optional[str] = None,
accel: Optional[str] = None,
max_new_tokens: int = 128,
temperature: float = 0.8,
top_p: float = 0.95,
seed: int = 0,
use_fast_tokenizer: bool = True,
max_alloc: int = 2048,
max_session_len: int = None,
log_file: Optional[str] = None,
debug: bool = False,
adapter: Optional[str] = None,
):
"""Chat with model through terminal.
import os
import random
from typing import List
from lmdeploy.messages import EngineGenerationConfig, PytorchEngineConfig
from lmdeploy.model import MODELS, best_match_model
from lmdeploy.tokenizer import DetokenizeState, Tokenizer
os.environ['TM_LOG_LEVEL'] = 'ERROR'
def input_prompt(model_name):
"""Input a prompt in the consolo interface."""
if model_name == 'codellama':
print('\nenter !! to end the input >>>\n', end='')
sentinel = '!!'
else:
print('\ndouble enter to end input >>> ', end='')
sentinel = '' # ends when this string is seen
return '\n'.join(iter(input, sentinel))
def valid_str(string, coding='utf-8'):
"""decode text according to its encoding type."""
invalid_chars = [b'\xef\xbf\xbd']
bstr = bytes(string, coding)
for invalid_char in invalid_chars:
bstr = bstr.replace(invalid_char, b'')
ret = bstr.decode(encoding=coding, errors='ignore')
return ret
def _stop_words(stop_words: List[str], tokenizer: Tokenizer):
"""Return a list of token ids corresponding to stop-words."""
if stop_words is None:
return None
assert isinstance(stop_words, List) and \
all(isinstance(elem, str) for elem in stop_words), \
f'stop_words must be a list but got {type(stop_words)}'
stop_words = [
tokenizer.encode(stop_word, False)[-1] for stop_word in stop_words
]
assert isinstance(stop_words, List) and all(
isinstance(elem, int) for elem in stop_words), 'invalid stop_words'
return stop_words
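# Illustrative sketch, not part of this commit: stop words are reduced to the
# last token id of each word, which is how the engine compares generated ids.
# The model path and helper name are examples only.
def _demo_stop_words(model_path='path/to/hf/model'):
    tokenizer = Tokenizer(model_path)
    model_name = best_match_model(model_path) or 'base'
    model = MODELS.get(model_name)()
    return _stop_words(model.stop_words, tokenizer)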
def run_chat(model_path: str,
engine_config: PytorchEngineConfig,
gen_config: EngineGenerationConfig = None,
session_id: int = 1,
trust_remote_code: bool = True):
"""An example to perform model inference through the command line
interface.
Args:
model_path (str): Path to model.
tokenizer_path (str): Path to tokenizer.
accel (str): Model accelerator.
max_new_tokens (int): Maximum number of tokens to generate.
temperature (float): Temperature for sampling.
top_p (float): Top p for sampling.
seed (int): Random seed.
use_fast_tokenizer (bool): Whether to use fast tokenizer.
This argument is passed directly to transformers' ``AutoTokenizer.from_pretrained``.
Generally, users should choose fast tokenizers,
but if the fast tokenizer raises an error, try forcing a slow one.
max_alloc (int): Maximum memory to allocate (for deepspeed).
max_session_len (int): Maximum number of tokens allowed for all chat sessions.
This includes both the history and the current session.
log_file (str): Path to log file.
debug (bool): Whether to enable debug mode.
adapter (str): Force to use an adapter.
Generally, users should not use this argument because the adapter is selected
based on the type of model. It is only required when that is impossible,
e.g. distinguishing llama 1/2 based on the `LlamaForCausalLM` class.
Currently, only "llama1" is acceptable for llama1 models.
""" # noqa: E501
set_logging(log_file, debug)
# workers should sync in sampling
torch.manual_seed(seed)
local_rank = get_local_rank()
world_size = get_world_size()
# Init model and tokenizer
if not tokenizer_path:
tokenizer_path = model_path
model, tokenizer = init_model(
model_path,
tokenizer_path,
use_fast_tokenizer=use_fast_tokenizer,
)
# Init adapter based on model and tokenizer
adapter = init_adapter(model, tokenizer, adapter)
# Accelerate model
model: PreTrainedModel = accel_model(model,
accel,
max_alloc=max_alloc,
tp_size=world_size)
# warmup
warmup_config = GenerationConfig(
max_new_tokens=1,
do_sample=temperature > 0,
temperature=temperature,
top_p=top_p,
)
model.generate(torch.tensor([[6]], device=get_local_rank()), warmup_config)
gen_config = GenerationConfig(
max_new_tokens=max_new_tokens,
do_sample=temperature > 0,
temperature=temperature,
top_p=top_p,
)
# Session manager handling history
max_session_len = max_alloc if max_session_len is None else max_session_len
sm = BasicSessionManagerWithHistory(max_session_len=max_session_len,
start_ids=adapter.start_ids,
sep_ids=adapter.sep_ids)
io = TerminalIO()
streamer = BasicStreamer(adapter.decode, io.output)
for r in itertools.count(1):
# User input from IO
logger.info(f'Round {r}')
prompt: str = io.input()
logger.info(f'User input: {prompt}')
# Allow user to change config during runtime or exit
if control(prompt, gen_config, sm):
continue
# Tokenize and apply model specific templates
input_ids = adapter.encode_and_decorate(prompt)
logger.info(f'Input ids:\n{input_ids}')
# Prepend chat history (tensor concatenation)
input_ids = sm.prepend_history(input_ids)
logger.info(f'Input ids with history:\n{input_ids}')
# Generate
input_ids = input_ids.cuda(local_rank)
# returned tensor including input and generated output
output = model.generate(input_ids,
gen_config,
streamer=streamer,
stopping_criteria=adapter.stopping_criteria)
logger.info(f'Output:\n{output}')
# Save output into session manager and maybe trim some history
sm.add_to_history(output)
def cli():
import fire
model_path (str): the huggingface model path.
engine_config (PytorchEngineConfig): Config of engine.
gen_config (EngineGenerationConfig): Config of generation.
session_id (int): the identical id of a session.
trust_remote_code (bool): trust remote code.
"""
from lmdeploy.pytorch.engine import Engine
tm_model = Engine.from_pretrained(model_path,
engine_config=engine_config,
trust_remote_code=trust_remote_code)
tokenizer = tm_model.tokenizer
generator = tm_model.create_instance()
adapter_name = None
if engine_config.adapters is not None:
adapter_name = next(iter(engine_config.adapters.keys()))
if gen_config is None:
gen_config = EngineGenerationConfig()
nth_round = 1
step = 0
seed = random.getrandbits(64)
model_name = engine_config.model_name
if model_name is None:
model_name = best_match_model(model_path)
assert model_name is not None, 'Cannot find a matching model template'
print(f'match template: <{model_name}>')
model = MODELS.get(model_name)()
stop_words = _stop_words(model.stop_words, tokenizer)
while True:
prompt = input_prompt(model_name)
if prompt == 'exit':
exit(0)
elif prompt == 'end':
generator.end(session_id)
nth_round = 1
step = 0
seed = random.getrandbits(64)
else:
prompt = model.get_prompt(prompt, nth_round == 1)
input_ids = tokenizer.encode(prompt, nth_round == 1)
session_len = model.session_len
if session_len is None:
session_len = tm_model.session_len
if step >= session_len:
print('WARNING: exceed session max length.'
' Please end the session.')
continue
print(f'{prompt} ', end='', flush=True)
state = DetokenizeState()
gen_config.random_seed = seed
gen_config.stop_words = stop_words
for outputs in generator.stream_infer(session_id=session_id,
input_ids=input_ids,
gen_config=gen_config,
adapter_name=adapter_name):
status, res, tokens = outputs
# decode res
response, state = tokenizer.detokenize_incrementally(
res, state)
response = valid_str(response)
print(f'{response}', end='', flush=True)
# update step
step += len(input_ids) + tokens
print()
nth_round += 1
def main(model_path: str,
model_name: str = None,
session_id: int = 1,
top_k: float = 40,
top_p: float = 0.8,
temperature: float = 0.8,
repetition_penalty: float = 1.0,
tp: int = 1,
stream_output: bool = True,
adapter: str = None,
trust_remote_code: bool = True):
"""An example to perform model inference through the command line
interface.
fire.Fire(main)
Args:
model_path (str): the huggingface model path
model_name (str): name of the model.
session_id (int): the identical id of a session
top_k (int): sampling top k.
top_p (int): sampling top p.
temperature (float): sampling temperature.
repetition_penalty (float): parameter to penalize repetition
tp (int): GPU number used in tensor parallelism
stream_output (bool): indicator for streaming output or not
adapter (str): path to lora adapter.
trust_remote_code (bool): Trust remote code.
"""
adapters = None
if adapter is not None:
adapters = dict(default=adapter)
engine_config = PytorchEngineConfig(model_name=model_name,
tp=tp,
adapters=adapters)
gen_config = EngineGenerationConfig(max_new_tokens=512,
top_k=top_k,
top_p=top_p,
temperature=temperature,
repetition_penalty=repetition_penalty,
ignore_eos=False)
return run_chat(model_path,
engine_config,
gen_config,
session_id=session_id,
trust_remote_code=trust_remote_code)
if __name__ == '__main__':
cli()
import fire
fire.Fire(main)
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import logging
import queue
import warnings
from typing import List, Optional
import pynvml
import torch
import torch.multiprocessing as mp
from torch.nn.utils.rnn import pad_sequence
from transformers import (AutoTokenizer, PreTrainedModel,
PreTrainedTokenizerBase)
from .model import accel_model, init_model
def safe_numel(free_mem, model_size, max_intermediate):
"""Number of elements without out-of-memory."""
return int(free_mem - model_size) // max_intermediate
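# Illustrative worked example, not part of this commit: with 40e9 bytes free,
# a 14e9-byte model and ~2e6 bytes of intermediate memory per token, roughly
# 13k tokens can be decoded at once. The helper name is made up.
def _demo_safe_numel():
    assert safe_numel(40e9, 14e9, 2e6) == 13000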
def avail_gpus(percentage=0.96):
"""Detect available gpus.
Args:
percentage (float): The minimum percentage of free memory to be
considered as available.
Return:
A list of gpu ids.
average free memory on single gpu.
"""
gpus = []
mems = []
pynvml.nvmlInit()
for i in range(torch.cuda.device_count()):
handle = pynvml.nvmlDeviceGetHandleByIndex(int(i))
mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
free, total = int(mem_info.free), int(mem_info.total)
if free / total > percentage:
gpus.append(i)
mems.append(free)
pynvml.nvmlShutdown()
if len(gpus) == 0:
raise RuntimeError('No GPU available.')
return gpus, sum(mems) / len(mems)
@torch.no_grad()
def decode_single(model: PreTrainedModel,
input_ids: torch.Tensor,
attention_mask: torch.Tensor = None,
return_logits=True):
"""Decode a single batch.
Args:
model (PreTrainedModel): Pretrained model.
input_ids (torch.Tensor): A batch of input ids.
attention_mask (torch.Tensor): A batch of attention masks.
Returns:
torch.Tensor: A batch of probabilities (on CPU).
Note:
This function assumes input_ids[i] = [bos, x1, x2, ..., xn]
and return prob = [p(x1|bos), p(x2|bos,x1), ..., p(xn|bos..xn-1)]
So prob is shorter than input_ids by 1.
"""
# Call Causal LM forward
outputs = model(input_ids=input_ids,
attention_mask=attention_mask,
output_hidden_states=False,
output_attentions=False,
use_cache=False,
return_dict=True)
# fp32, [bs, seq_len, vocab_size]
logits = outputs.logits
if not return_logits:
# inplace softmax to get probs
torch.softmax(logits, dim=-1, out=logits)
# Shift to fetch probabilities
shift_labels = input_ids[..., 1:].contiguous()
shift_probs = logits[..., :-1, :].contiguous()
logits = torch.gather(shift_probs, -1, shift_labels.unsqueeze(-1))
if attention_mask is not None:
logits *= attention_mask[..., None]
logits = logits.cpu()
return logits
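# Illustrative sketch, not part of this commit: scoring one short sequence with
# decode_single. The model path and helper name are examples only.
def _demo_decode_single(model_path='path/to/hf/model'):
    from transformers import AutoModelForCausalLM, AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path).eval().cuda()
    input_ids = tokenizer('hello world', return_tensors='pt').input_ids.cuda()
    probs = decode_single(model, input_ids, return_logits=False)
    # probs[0, t, 0] = p(x_{t+1} | x_0..x_t); one step shorter than input_ids
    return probs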
def worker_fn(model_path: str,
inq: mp.Queue,
outq: mp.Queue,
accel: Optional[str] = None,
gpu_id=0):
# torch.set_default_device(gpu_id)
model, _ = init_model(model_path)
model = model.eval()
model = accel_model(model, accel, gpu_id=gpu_id)
while True:
try:
idx, args = inq.get(timeout=1)
except queue.Empty:
continue
if idx is None:
print(f'Worker {gpu_id} received exit signal.')
break
# print(args)
input_ids, input_lens, *args = args
input_ids = input_ids.cuda(gpu_id)
max_len = max(input_lens)
assert max_len == input_ids.size(-1), \
f'input_ids.shape = {input_ids.shape}, max_len = {max_len}'
input_lens = torch.tensor(input_lens, device=gpu_id)
attention_mask = \
torch.arange(max_len, device=gpu_id)[None, :] < input_lens[:, None]
assert attention_mask.shape == input_ids.shape, \
f'attention_mask.shape = {attention_mask.shape}'
try:
probs = decode_single(model, input_ids, attention_mask, *args)
except torch.cuda.OutOfMemoryError:
warnings.warn(
f'OOM on GPU {gpu_id}, discard prompts at indices {idx}.')
probs = torch.empty((input_ids.size(0), 0),
dtype=torch.float32,
device='cpu')
outq.put((idx, probs))
print(f'Exiting worker {gpu_id} ...')
inq.close()
outq.close()
print(f'Worker {gpu_id} finished.')
class Engine:
"""Multi-GPU deciding engine.
Args:
model_path (str): Path to the pretrained model.
tokenizer_path (str, optional): Path to the pretrained tokenizer.
Defaults to None.
Either tokenizer_path or tokenizer should be provided.
tokenizer (PreTrainedTokenizerBase, optional): Pre-configured tokenizer.
Defaults to None.
Either tokenizer_path or tokenizer should be provided.
accel (str, optional): Acceleration method.
Defaults to None. 'deepspeed' is not tested.
gpu_mem_percentage (float, optional): GPUs whose free-memory ratio is larger
than this value are considered available and used as decode devices.
Defaults to 0.96.
model_size_byte (float, optional): (Approximate) model size in bytes.
Defaults to 14e9 (7B model in FP16).
bytes_per_token (float, optional): (Approximate) memory cost per token in bytes.
Defaults to 2e6 (2MB).
``bytes_per_token`` and ``model_size_byte`` are used to compute
the maximum batch size for given seq_length
""" # noqa: E501
def __init__(self,
model_path: str,
tokenizer_path: Optional[str] = None,
tokenizer: Optional[PreTrainedTokenizerBase] = None,
accel: Optional[str] = None,
gpu_mem_percentage: float = 0.96,
model_size_byte=14e9,
bytes_per_token=2e6):
gpu_ids, mem = avail_gpus(gpu_mem_percentage)
print(f'Available GPUs are: {gpu_ids}, ', end='')
print(f'with {mem/2**30:.2f} GiB free.')
ctx = mp.get_context('spawn')
inq = ctx.Queue()
outq = ctx.Queue()
ps = []
for id in gpu_ids:
p = ctx.Process(target=worker_fn,
args=(model_path, inq, outq, accel, id))
p.start()
ps.append(p)
if tokenizer is None:
if tokenizer_path is None:
tokenizer_path = model_path
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
self.gpu_ids = gpu_ids
self.inq = inq
self.outq = outq
self.ps = ps
self.tokenizer = tokenizer
self.safe_numel = safe_numel(mem, model_size_byte, bytes_per_token)
def clear_queue(self):
for q in self.inq, self.outq:
while not q.empty():
q.get()
def decode(self,
token_ids: List[List[int]],
sort=True,
max_bs: int = 1024,
pad=True,
pad_token_id=2,
return_logits=True):
"""Inference the model to compute probabilities.
Args:
token_ids (List[List[int]]): List of list of token ids.
sort (bool, optional): Internally sort the prompts by length to achieve better efficiency.
Defaults to True.
Note: the order of returned probabilities is always the same as the input.
max_bs (int, optional): Maximum batch size.
Defaults to 1024.
            pad (bool, optional): Pad the prompts in every mini-batch to the
                same length. Defaults to True. Set to False to save memory.
            pad_token_id (int, optional): Token id used for padding.
                Defaults to 2.
            return_logits (bool, optional): Return logits instead of
                probabilities. Defaults to True.

        Returns:
            numpy.ndarray: Array of logits of shape [bsz, seqlen, vocab_size],
                padded with zeros, if pad is True.
            List[numpy.ndarray]: List of un-padded logits, if pad is False.

        Note:
            This function accepts token_ids = [x0(=bos), x1, x2, ..., xn]
            and computes prob = [p(x1|x0), p(x2|x0,x1), ..., p(xn|x0..xn-1)],
            so prob is one element shorter than input_ids.
""" # noqa: E501
self.clear_queue()
# sort to achieve better efficiency
if sort:
            pids_and_indices = sorted(enumerate(token_ids),
                                      key=lambda i_and_x: len(i_and_x[1]))
        else:
            pids_and_indices = list(enumerate(token_ids))
left = 0
bs = max_bs
while left < len(token_ids):
if not sort:
bs = max_bs
right = min(left + bs, len(token_ids))
# batch of prompts
            sub_p_and_i = pids_and_indices[left:right]
idx, sub_p = zip(*sub_p_and_i)
# batch of input_ids and attn_masks
# inputs = self.tokenizer(sub_p, return_tensors='pt', padding=True)
input_ids = [torch.tensor(p) for p in sub_p]
input_ids = pad_sequence(input_ids,
batch_first=True,
padding_value=pad_token_id)
input_lens = [len(p) for p in sub_p]
# Dynamic batch size based on safe memory
            while input_ids[:bs].numel() > self.safe_numel:
if bs == 1:
break
bs = max(1, round(bs / 1.5))
print(f'\nReduce bs to {bs} when seq len reaches '
f'{input_ids.shape[-1]}')
idx = idx[:bs]
input_lens = input_lens[:bs]
input_ids = input_ids[:bs, :max(input_lens)]
# Send to worker
self.inq.put((idx, (input_ids, input_lens)))
left += bs
print(
                f'Distributing prompts {left}/{len(token_ids)},'
                f' {left/len(token_ids):.0%}',
end='\r')
print()
# Collect outputs from workers
all_probs = [None] * len(token_ids)
count = 0
while count < len(token_ids):
idx, probs = self.outq.get()
for i, p in zip(idx, probs):
assert all_probs[i] is None
all_probs[i] = p
count += len(idx)
print(
f'Decoding and collecting outputs '
f'{count}/{len(token_ids)}, '
f'{count/len(token_ids):.0%}',
end='\r')
print()
if pad:
all_probs = pad_sequence(all_probs, batch_first=True)
all_probs = all_probs.cpu().numpy()
else:
all_probs = [p.cpu().numpy() for p in all_probs]
return all_probs
def __del__(self):
print('Exiting engine ...')
for _ in self.ps:
self.inq.put((None, None))
for p in self.ps:
p.join(timeout=1)
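

# Example programmatic usage (paths and token ids are illustrative):
#   engine = Engine('llama2/huggingface/llama-2-7b')
#   logits = engine.decode([[1, 306, 4658, 278, 6593]])
#   print(logits.shape)   # (1, 4, vocab_size)
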
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--model_path',
default='llama2/huggingface/llama-2-7b',
                        help='Path to HuggingFace model and tokenizer.')
parser.add_argument(
'--test_path',
default='',
help='Path to text file, with each line containing a prompt.')
parser.add_argument(
'-p',
'--prompts',
nargs='*',
default=[
'I believe the meaning of life is to find your gift.',
'Simply put, the theory of relativity states that',
'Building a website can be done in 10 simple steps:'
],
help="Prompt in command line, please quote \"\" every sentences, "
'surpassed by --test_path')
    parser.add_argument('--min_len',
                        type=int,
                        default=1,
                        help='Minimum length of prompts in tokens.')
parser.add_argument('--save-to',
default='decode.out',
help='Save results to this file.')
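
    # Example invocation (script name and paths are illustrative):
    #   python decode.py --model_path llama2/huggingface/llama-2-7b \
    #       --test_path prompts.txt --save-to decode.out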
args = parser.parse_args()
model_path = args.model_path
test_path = args.test_path
prompts = args.prompts
logger = logging.getLogger(__name__)
# logging.basicConfig(level=logging.DEBUG)
# Use test file preferentially
if test_path:
with open(test_path, 'r') as f:
prompts = f.readlines()
prompts = [p.strip() for p in prompts]
# Output infos
print(f'Model path: {model_path}')
def _format(ts, start, end):
if start < 0:
start += len(ts)
if end <= 0:
end += len(ts)
return '\n'.join(
(f'{i}\t{t}' for i, t in zip(range(start, end), ts[start:end])))
if len(prompts) > 10:
print('Prompts:\n' + _format(prompts, 0, 5) + '\n......\n' +
_format(prompts, -5, 0))
else:
print('Prompts:\n' + _format(prompts, 0, 0))
# Init Engine in backend
engine = Engine(model_path)
# Tokenize
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = 'right'
input_ids = tokenizer(prompts, padding=False)
input_ids: List[List[int]] = input_ids.input_ids
# Filter out too short prompts
input_ids = [i for i in input_ids if len(i) >= args.min_len]
if len(input_ids) < len(prompts):
logger.warning(
f'Filtered out {len(prompts) - len(input_ids)} prompts, '
f'because they are shorter than {args.min_len}.')
# Decode
logits = engine.decode(input_ids)
print(f'logits.shape = {logits.shape}')
# Save to pth
print(f'Dumping results to = {args.save_to}')
torch.save(logits, args.save_to, pickle_protocol=4)
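    # The array can be reloaded later with torch.load(args.save_to); newer
    # torch versions may require weights_only=False, since it is a pickled
    # numpy array rather than a tensor state dict.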
del engine