Commit d7117b95 authored by zhouxiang

Sync the 0.2.6 code

parent 5f83e392
# Copyright (c) OpenMMLab. All rights reserved.
import os
from typing import List, Literal, Optional, Union

from .archs import autoget_backend_config, get_task
from .messages import PytorchEngineConfig, TurbomindEngineConfig
from .model import ChatTemplateConfig


def pipeline(model_path: str,
             model_name: Optional[str] = None,
             backend_config: Optional[Union[TurbomindEngineConfig,
                                            PytorchEngineConfig]] = None,
             chat_template_config: Optional[ChatTemplateConfig] = None,
             log_level='ERROR',
             **kwargs):
    """
@@ -21,38 +26,83 @@ def pipeline(model_path: str,
                "InternLM/internlm-chat-20b-4bit",
                "lmdeploy/llama2-chat-70b-4bit", etc.
            - iii) The model_id of a model hosted inside a model repo
                on huggingface.co, such as "internlm/internlm-chat-7b",
                "Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat"
                and so on.
        model_name (str): needed when model_path is a pytorch model on
            huggingface.co, such as "internlm/internlm-chat-7b",
            "Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat" and so on.
        backend_config (TurbomindEngineConfig | PytorchEngineConfig): backend
            config instance. Default to None.
        chat_template_config (ChatTemplateConfig): chat template configuration.
            Default to None.
        log_level (str): set log level whose value among
            [CRITICAL, ERROR, WARNING, INFO, DEBUG]

    Examples:
        >>> # LLM
        >>> import lmdeploy
        >>> pipe = lmdeploy.pipeline('internlm/internlm-chat-7b')
        >>> response = pipe(['hi', 'say this is a test'])
        >>> print(response)
        >>>
        >>> # VLM
        >>> from lmdeploy.vl import load_image
        >>> from lmdeploy import pipeline, TurbomindEngineConfig, ChatTemplateConfig
        >>> pipe = pipeline('liuhaotian/llava-v1.5-7b',
        ...                 backend_config=TurbomindEngineConfig(session_len=8192),
        ...                 chat_template_config=ChatTemplateConfig(model_name='vicuna'))
        >>> im = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg')
        >>> response = pipe([('describe this image', [im])])
        >>> print(response)
    """  # noqa E501
    if os.getenv('TM_LOG_LEVEL') is None:
        os.environ['TM_LOG_LEVEL'] = log_level
    from lmdeploy.utils import get_logger
    logger = get_logger('lmdeploy')
    logger.setLevel(log_level)

    pipeline_type, pipeline_class = get_task(model_path)
    if pipeline_type == 'vlm':
        assert (type(backend_config) is TurbomindEngineConfig) or \
            (backend_config is None), \
            f'{pipeline_type} model only support turbomind backend.'
    if pipeline_type == 'llm' and type(
            backend_config) is not PytorchEngineConfig:
        # set auto backend mode
        backend_config = autoget_backend_config(model_path, backend_config)
    backend = 'pytorch' if type(
        backend_config) is PytorchEngineConfig else 'turbomind'
    logger.info(f'Using {backend} engine')
    if 'tp' in kwargs:
        logger.warning(
            'The argument "tp" is deprecated and will be removed soon. '
            'Please set "tp" in "backend_config"')
        tp = kwargs['tp']
        kwargs.pop('tp')
    else:
        tp = 1 if backend_config is None else backend_config.tp

    return pipeline_class(model_path,
                          model_name=model_name,
                          backend=backend,
                          backend_config=backend_config,
                          chat_template_config=chat_template_config,
                          tp=tp,
                          **kwargs)


def serve(model_path: str,
          model_name: Optional[str] = None,
          backend: Literal['turbomind', 'pytorch'] = 'turbomind',
          backend_config: Optional[Union[TurbomindEngineConfig,
                                         PytorchEngineConfig]] = None,
          chat_template_config: Optional[ChatTemplateConfig] = None,
          server_name: str = '0.0.0.0',
          server_port: int = 23333,
          log_level: str = 'ERROR',
          api_keys: Optional[Union[List[str], str]] = None,
          ssl: bool = False,
          **kwargs):
    """This will run the api_server in a subprocess.
@@ -67,24 +117,31 @@ def serve(model_path: str,
                "InternLM/internlm-chat-20b-4bit",
                "lmdeploy/llama2-chat-70b-4bit", etc.
            - iii) The model_id of a model hosted inside a model repo
                on huggingface.co, such as "internlm/internlm-chat-7b",
                "Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat"
                and so on.
        model_name (str): needed when model_path is a pytorch model on
            huggingface.co, such as "internlm/internlm-chat-7b",
            "Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat" and so on.
        backend (str): either `turbomind` or `pytorch` backend. Default to
            `turbomind` backend.
        backend_config (TurbomindEngineConfig | PytorchEngineConfig): backend
            config instance. Default to None.
        chat_template_config (ChatTemplateConfig): chat template configuration.
            Default to None.
        server_name (str): host ip for serving
        server_port (int): server port
        log_level (str): set log level whose value among
            [CRITICAL, ERROR, WARNING, INFO, DEBUG]
        api_keys (List[str] | str | None): Optional list of API keys. Accepts
            string type as a single api_key. Default to None, which means no
            api key applied.
        ssl (bool): Enable SSL. Requires OS Environment variables
            'SSL_KEYFILE' and 'SSL_CERTFILE'.

    Return:
        APIClient: A client chatbot for LLaMA series models.

    Examples:
        >>> import lmdeploy
        >>> client = lmdeploy.serve('internlm/internlm-chat-7b', 'internlm-chat-7b')
        >>> for output in client.chat('hi', 1):
        ...     print(output)
    """  # noqa E501
@@ -93,33 +150,57 @@ def serve(model_path: str,
    from lmdeploy.serve.openai.api_client import APIClient
    from lmdeploy.serve.openai.api_server import serve

    if type(backend_config) is not PytorchEngineConfig:
        # set auto backend mode
        backend_config = autoget_backend_config(model_path, backend_config)
    backend = 'pytorch' if type(
        backend_config) is PytorchEngineConfig else 'turbomind'
    if 'tp' in kwargs:
        tp = kwargs['tp']
        kwargs.pop('tp')
    else:
        tp = 1 if backend_config is None else backend_config.tp

    task = Process(target=serve,
                   args=(model_path, ),
                   kwargs=dict(model_name=model_name,
                               backend=backend,
                               backend_config=backend_config,
                               chat_template_config=chat_template_config,
                               server_name=server_name,
                               server_port=server_port,
                               tp=tp,
                               log_level=log_level,
                               api_keys=api_keys,
                               ssl=ssl,
                               **kwargs),
                   daemon=True)
    task.start()
    client = APIClient(f'http://{server_name}:{server_port}')
    while True:
        time.sleep(1)
        try:
            client.available_models
            print(
                f'Launched the api_server in process {task.pid}, user can '
                f'kill the server by:\nimport os,signal\nos.kill({task.pid}, '
                'signal.SIGKILL)')
            return client
        except:  # noqa
            pass


def client(api_server_url: str = 'http://0.0.0.0:23333',
           api_key: Optional[str] = None,
           **kwargs):
    """
    Args:
        api_server_url (str): communicating address 'http://<ip>:<port>' of
            api_server
        api_key (str | None): api key. Default to None, which means no
            api key will be used.
    Return:
        Chatbot for LLaMA series models with turbomind as inference engine.
    """
    from lmdeploy.serve.openai.api_client import APIClient
    return APIClient(api_server_url, api_key, **kwargs)
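
# Usage sketch for the three helpers above (illustrative only, not part of the
# diff; it assumes 'internlm/internlm-chat-7b' is reachable on huggingface.co
# and a CUDA device is available; any other supported model id works the same):
if __name__ == '__main__':
    import lmdeploy

    # offline batched inference through the high-level pipeline
    pipe = lmdeploy.pipeline('internlm/internlm-chat-7b')
    print(pipe(['hi', 'say this is a test']))

    # launch the OpenAI-compatible api_server in a subprocess and query it
    api_client = lmdeploy.serve('internlm/internlm-chat-7b',
                                server_name='0.0.0.0',
                                server_port=23333)
    for output in api_client.chat('hi', 1):
        print(output)

    # or attach to an api_server that is already running elsewhere
    api_client = lmdeploy.client('http://0.0.0.0:23333')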

# Copyright (c) OpenMMLab. All rights reserved.
from .entrypoint import run

__all__ = ['run']

# Copyright (c) OpenMMLab. All rights reserved.
from .cli import CLI
from .utils import (ArgumentHelper, DefaultsAndTypesHelpFormatter,
                    convert_args, get_lora_adapters)


class SubCliChat(object):
    _help = 'Chat with pytorch or turbomind engine.'
    _desc = _help
    parser = CLI.subparsers.add_parser('chat', help=_help, description=_desc)
    subparsers = parser.add_subparsers(
        title='Commands', description='This group has the following commands:')

    @staticmethod
    def add_parser_torch():
        """Add parser for torch command."""
        parser = SubCliChat.subparsers.add_parser(
            'torch',
            formatter_class=DefaultsAndTypesHelpFormatter,
            help=SubCliChat.torch.__doc__,
            description=SubCliChat.torch.__doc__,
        )
        parser.set_defaults(run=SubCliChat.torch)
        parser.add_argument('model_path',
                            type=str,
                            help='The huggingface model path')
        # engine args
        engine_group = parser.add_argument_group('Engine arguments')
        ArgumentHelper.model_name(engine_group)
        ArgumentHelper.tp(engine_group)
        ArgumentHelper.session_len(engine_group)
        ArgumentHelper.adapters(engine_group)
        ArgumentHelper.cache_max_entry_count(engine_group)
        # other args
        parser.add_argument('--trust-remote-code',
                            action='store_false',
                            default=True,
                            help='Trust remote code')

    @staticmethod
    def add_parser_turbomind():
        """Add parser for turbomind command."""
        parser = SubCliChat.subparsers.add_parser(
            'turbomind',
            formatter_class=DefaultsAndTypesHelpFormatter,
            help=SubCliChat.turbomind.__doc__,
            description=SubCliChat.turbomind.__doc__,
        )
        parser.set_defaults(run=SubCliChat.turbomind)
        parser.add_argument(
            'model_path',
            type=str,
            help='The path of the deployed model. '
            'It can be in format of huggingface or turbomind. '
            'When it is turbomind model, all arguments for engine'
            'config would be ignored, so you need to change the `config.ini`')
        # engine arguments
        engine_group = parser.add_argument_group('Engine arguments')
        ArgumentHelper.tp(engine_group)
        ArgumentHelper.model_format(engine_group)
        ArgumentHelper.quant_policy(engine_group)
        ArgumentHelper.model_name(engine_group)
        ArgumentHelper.cache_max_entry_count(engine_group)
        ArgumentHelper.rope_scaling_factor(engine_group)
        ArgumentHelper.session_len(engine_group)
        # other arguments
        ArgumentHelper.cap(parser)
        ArgumentHelper.meta_instruction(parser)  # TODO remove
        ArgumentHelper.chat_template(parser)

    @staticmethod
    def torch(args):
        """Chat with PyTorch inference engine through terminal."""
        from lmdeploy.messages import PytorchEngineConfig
        from lmdeploy.pytorch.chat import run_chat

        adapters = get_lora_adapters(args.adapters)
        engine_config = PytorchEngineConfig(
            model_name=args.model_name,
            tp=args.tp,
            session_len=args.session_len,
            cache_max_entry_count=args.cache_max_entry_count,
            adapters=adapters)
        run_chat(args.model_path,
                 engine_config,
                 trust_remote_code=args.trust_remote_code)

    @staticmethod
    def turbomind(args):
        """Chat with TurboMind inference engine through terminal."""
        from lmdeploy.turbomind.chat import main
        kwargs = convert_args(args)
        from lmdeploy.model import ChatTemplateConfig
        chat_template_config = ChatTemplateConfig(
            model_name=args.model_name,
            meta_instruction=args.meta_instruction,
            capability=args.cap)
        if args.chat_template:
            chat_template_config = ChatTemplateConfig.from_json(
                args.chat_template)
        kwargs.update(dict(chat_template_cfg=chat_template_config))
        kwargs.pop('chat_template', None)
        main(**kwargs)

    @staticmethod
    def add_parsers():
        """Add all parsers."""
        SubCliChat.add_parser_torch()
        SubCliChat.add_parser_turbomind()

# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import os

from ..version import __version__
from .utils import ArgumentHelper, DefaultsAndTypesHelpFormatter, convert_args


class CLI(object):
    _desc = 'The CLI provides a unified API for converting, ' \
            'compressing and deploying large language models.'
    parser = argparse.ArgumentParser(prog='lmdeploy',
                                     description=_desc,
                                     add_help=True)
    parser.add_argument('-v',
                        '--version',
                        action='version',
                        version=__version__)
    subparsers = parser.add_subparsers(
        title='Commands',
        description='lmdeploy has following commands:',
        dest='command')

    @staticmethod
    def add_parser_convert():
        """Add parser for convert command."""
        parser = CLI.subparsers.add_parser(
            'convert',
            formatter_class=DefaultsAndTypesHelpFormatter,
            description=CLI.convert.__doc__,
            help=CLI.convert.__doc__)
        # define arguments
        parser.add_argument(
            'model_name',
            type=str,
            help='The name of the to-be-deployed model, such as llama-7b, '
            'llama-13b, vicuna-7b and etc. You can run `lmdeploy list` to '
            'get the supported model names')
        parser.add_argument('model_path',
                            type=str,
                            help='The directory path of the model')
        ArgumentHelper.model_format(parser)
        ArgumentHelper.tp(parser)
        # other args
        parser.add_argument('--tokenizer-path',
                            type=str,
                            default=None,
                            help='The path of tokenizer model')
        parser.add_argument('--dst-path',
                            type=str,
                            default='workspace',
                            help='The destination path that saves outputs')
        parser.add_argument(
            '--quant-path',
            type=str,
            default=None,
            help='Path of the quantized model, which can be none')
        parser.add_argument(
            '--group-size',
            type=int,
            default=0,
            help='A parameter used in awq to quantize fp16 weights '
            'to 4 bits')

        parser.set_defaults(run=CLI.convert)

    @staticmethod
    def add_parser_list():
        """Add parser for list command."""
        parser = CLI.subparsers.add_parser(
            'list',
            formatter_class=DefaultsAndTypesHelpFormatter,
            description=CLI.list.__doc__,
            help=CLI.list.__doc__)
        parser.set_defaults(run=CLI.list)
        # define arguments
        ArgumentHelper.engine(parser)

    @staticmethod
    def add_parser_checkenv():
        """Add parser for check_env command."""
        parser = CLI.subparsers.add_parser(
            'check_env',
            formatter_class=DefaultsAndTypesHelpFormatter,
            description=CLI.check_env.__doc__,
            help=CLI.check_env.__doc__)
        parser.set_defaults(run=CLI.check_env)
        parser.add_argument('--dump-file',
                            type=str,
                            default=None,
                            help='The file path to save env info. Only '
                            'support file format in `json`, `yml`,'
                            ' `pkl`')

    @staticmethod
    def convert(args):
        """Convert LLMs to turbomind format."""
        from lmdeploy.turbomind.deploy.converter import main
        kwargs = convert_args(args)
        main(**kwargs)

    @staticmethod
    def list(args):
        """List the supported model names."""
        from lmdeploy.model import MODELS
        model_names = list(MODELS.module_dict.keys())
        deprecate_names = [
            'baichuan-7b', 'baichuan2-7b', 'chatglm2-6b', 'internlm-chat-20b',
            'internlm-chat-7b', 'internlm-chat-7b-8k', 'internlm2-1_8b',
            'internlm-20b', 'internlm2-20b', 'internlm2-7b', 'internlm2-chat',
            'internlm2-chat-1_8b', 'internlm2-chat-20b', 'internlm2-chat-7b',
            'llama-2-chat', 'llama-2', 'qwen-14b', 'qwen-7b', 'solar-70b',
            'yi-200k', 'yi-34b', 'yi-chat', 'Mistral-7B-Instruct',
            'Mixtral-8x7B-Instruct', 'baichuan-base', 'deepseek-chat',
            'internlm-chat'
        ]
        model_names = [
            n for n in model_names if n not in deprecate_names + ['base']
        ]
        deprecate_names.sort()
        model_names.sort()
        print('The older chat template name like "internlm2-7b", "qwen-7b"'
              ' and so on are deprecated and will be removed in the future.'
              ' The supported chat template names are:')
        print('\n'.join(model_names))

    @staticmethod
    def check_env(args):
        """Check the environmental information."""
        import importlib

        import mmengine
@@ -121,19 +158,16 @@ class CLI(object):
            print(f'{k}: {v}')

        # dump to local file
        dump_file = args.dump_file
        if dump_file is not None:
            work_dir, _ = os.path.split(dump_file)
            if work_dir:
                os.makedirs(work_dir, exist_ok=True)
            mmengine.dump(env_info, dump_file)

    @staticmethod
    def add_parsers():
        """Add all parsers."""
        CLI.add_parser_convert()
        CLI.add_parser_list()
        CLI.add_parser_checkenv()
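
# A minimal, self-contained sketch (not part of this commit) of the dispatch
# pattern the CLI above relies on: every subcommand registers its handler via
# `parser.set_defaults(run=...)`, so an entrypoint only needs to parse the
# arguments once and call `args.run(args)`.
import argparse

toy_parser = argparse.ArgumentParser(prog='toy')
toy_subparsers = toy_parser.add_subparsers(title='Commands', dest='command')

hello = toy_subparsers.add_parser('hello', help='Print a greeting')
hello.add_argument('name', type=str)
hello.set_defaults(run=lambda args: print(f'hello, {args.name}'))

toy_args = toy_parser.parse_args(['hello', 'lmdeploy'])
toy_args.run(toy_args)  # prints: hello, lmdeploy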

# Copyright (c) OpenMMLab. All rights reserved.
from mmengine.config import DictAction

from .cli import CLI
from .utils import ArgumentHelper, DefaultsAndTypesHelpFormatter, convert_args


class SubCliLite(object):
    """CLI for compressing LLMs."""
    _help = 'Compressing and accelerating LLMs with lmdeploy.lite module'
    _desc = _help
    parser = CLI.subparsers.add_parser(
        'lite',
        help=_help,
        description=_desc,
    )
    subparsers = parser.add_subparsers(
        title='Commands', description='This group has the following commands:')

    @staticmethod
    def add_parser_auto_awq():
        """Add parser for auto_awq command."""
        parser = SubCliLite.subparsers.add_parser(
            'auto_awq',
            formatter_class=DefaultsAndTypesHelpFormatter,
            description=SubCliLite.auto_awq.__doc__,
            help=SubCliLite.auto_awq.__doc__)
        parser.set_defaults(run=SubCliLite.auto_awq)
        parser.add_argument('model',
                            type=str,
                            help='The path of model in hf format')
        ArgumentHelper.work_dir(parser)
        ArgumentHelper.calib_dataset(parser)
        ArgumentHelper.calib_samples(parser)
        ArgumentHelper.calib_seqlen(parser)
        ArgumentHelper.device(parser)
        parser.add_argument('--w-bits',
                            type=int,
                            default=4,
                            help='Bit number for weight quantization')
        parser.add_argument('--w-sym',
                            action='store_true',
                            help='Whether to do symmetric quantization')
        parser.add_argument(
            '--w-group-size',
            type=int,
            default=128,
            help='Group size for weight quantization statistics')

    @staticmethod
    def add_parser_calibrate():
        """Add parser for calibrate command."""
        parser = SubCliLite.subparsers.add_parser(
            'calibrate',
            formatter_class=DefaultsAndTypesHelpFormatter,
            description=SubCliLite.calibrate.__doc__,
            help=SubCliLite.calibrate.__doc__)
        parser.set_defaults(run=SubCliLite.calibrate)
        parser.add_argument('model',
                            type=str,
                            help='The name or path of the model to be loaded')
        ArgumentHelper.work_dir(parser)
        ArgumentHelper.calib_dataset(parser)
        ArgumentHelper.calib_samples(parser)
        ArgumentHelper.calib_seqlen(parser)
        ArgumentHelper.device(parser)

    @staticmethod
    def add_parser_smooth_quant():
        """Add parser for smooth_quant command."""
        parser = SubCliLite.subparsers.add_parser(
            'smooth_quant',
            formatter_class=DefaultsAndTypesHelpFormatter,
            description=SubCliLite.smooth_quant.__doc__,
            help=SubCliLite.smooth_quant.__doc__)
        parser.set_defaults(run=SubCliLite.smooth_quant)
        parser.add_argument('model',
                            type=str,
                            help='The name or path of the model to be loaded')
        parser.add_argument(
            '--work-dir',
            type=str,
            default='./work_dir',
            help='The working directory for outputs. Defaults to "./work_dir"')
        ArgumentHelper.calib_dataset(parser)
        ArgumentHelper.calib_samples(parser)
        ArgumentHelper.calib_seqlen(parser)
        ArgumentHelper.device(parser)

    @staticmethod
    def add_parser_kv_qparams():
        """Add parser for kv_qparams command."""
        parser = SubCliLite.subparsers.add_parser(
            'kv_qparams',
            formatter_class=DefaultsAndTypesHelpFormatter,
            description=SubCliLite.kv_qparams.__doc__,
            help=SubCliLite.kv_qparams.__doc__)
        parser.set_defaults(run=SubCliLite.kv_qparams)
        parser.add_argument('work_dir',
                            type=str,
                            help='Directory path where the stats are saved')
        parser.add_argument('turbomind_dir',
                            type=str,
                            help='Directory path where to save the results')
        parser.add_argument('--kv-bits',
                            type=int,
                            default=8,
                            help='Number of bits for quantization')
        parser.add_argument('--kv-sym',
                            action='store_true',
                            help='Whether to use symmetric quantization')
        parser.add_argument(
            '--num-tp',
            type=int,
            default=None,
            help='GPU number used in tensor parallelism. Should be 2^n')
        parser.add_argument('--tm-params',
                            nargs='*',
                            default=None,
                            action=DictAction,
                            help='Used key-values pairs in xxx=yyy format'
                            ' to update the turbomind model weights'
                            ' config')

    @staticmethod
    def auto_awq(args):
        """Perform weight quantization using AWQ algorithm."""
        from lmdeploy.lite.apis.auto_awq import auto_awq
        kwargs = convert_args(args)
        auto_awq(**kwargs)

    @staticmethod
    def calibrate(args):
        """Perform calibration on a given dataset."""
        from lmdeploy.lite.apis.calibrate import calibrate
        kwargs = convert_args(args)
        calibrate(**kwargs)

    @staticmethod
    def kv_qparams(args):
        """Export key and value stats."""
        from lmdeploy.lite.apis.kv_qparams import main as run_kv_qparams
        kwargs = convert_args(args)
        run_kv_qparams(**kwargs)

    @staticmethod
    def smooth_quant(args):
        """Perform w8a8 quantization using SmoothQuant."""
        from lmdeploy.lite.apis.smooth_quant import smooth_quant
        kwargs = convert_args(args)
        smooth_quant(**kwargs)

    @staticmethod
    def add_parsers():
        """Add all parsers."""
        SubCliLite.add_parser_auto_awq()
        SubCliLite.add_parser_calibrate()
        SubCliLite.add_parser_kv_qparams()
        SubCliLite.add_parser_smooth_quant()
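
# Hedged command-line sketch for the subcommands registered above. The
# explicit flags (--w-bits, --w-sym, --w-group-size, --kv-bits, --kv-sym,
# --num-tp, --work-dir) come from this file; the calibration flags are assumed
# to be the hyphenated forms added by ArgumentHelper, and the paths are
# placeholders:
#
#   lmdeploy lite calibrate $HF_MODEL --work-dir ./work_dir \
#       --calib-dataset ptb --calib-samples 128 --calib-seqlen 2048
#   lmdeploy lite auto_awq $HF_MODEL --work-dir ./work_dir \
#       --w-bits 4 --w-group-size 128
#   lmdeploy lite kv_qparams ./work_dir $TURBOMIND_DIR --kv-bits 8 --kv-sym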

# Copyright (c) OpenMMLab. All rights reserved.
from .cli import CLI
from .utils import ArgumentHelper, DefaultsAndTypesHelpFormatter, convert_args


class SubCliServe:
    """Serve LLMs and interact on terminal or web UI."""
    _help = 'Serve LLMs with gradio, openai API or triton server.'
    _desc = _help
    parser = CLI.subparsers.add_parser(
        'serve',
        help=_help,
        description=_desc,
    )
    subparsers = parser.add_subparsers(
        title='Commands', description='This group has the following commands:')

    @staticmethod
    def add_parser_gradio():
        """Add parser for gradio command."""
        parser = SubCliServe.subparsers.add_parser(
            'gradio',
            formatter_class=DefaultsAndTypesHelpFormatter,
            description=SubCliServe.gradio.__doc__,
            help=SubCliServe.gradio.__doc__)
        parser.set_defaults(run=SubCliServe.gradio)
        parser.add_argument(
            'model_path_or_server',
            type=str,
            help='The path of the deployed model or the tritonserver url or '
            'restful api url. for example: - ./workspace - 0.0.0.0:23333'
            ' - http://0.0.0.0:23333')
        parser.add_argument('--server-name',
                            type=str,
                            default='0.0.0.0',
                            help='The ip address of gradio server')
        parser.add_argument('--server-port',
                            type=int,
                            default=6006,
                            help='The port of gradio server')

        # common args
        ArgumentHelper.backend(parser)

        # chat template args
        ArgumentHelper.meta_instruction(parser)  # TODO remove
        ArgumentHelper.chat_template(parser)
        ArgumentHelper.cap(parser)

        # pytorch engine args
        pt_group = parser.add_argument_group('PyTorch engine arguments')
        # common engine args
        tp_act = ArgumentHelper.tp(pt_group)
        model_name_act = ArgumentHelper.model_name(pt_group)
        session_len_act = ArgumentHelper.session_len(pt_group)
        max_batch_size_act = ArgumentHelper.max_batch_size(pt_group)
        cache_max_entry_act = ArgumentHelper.cache_max_entry_count(pt_group)

        # turbomind args
        tb_group = parser.add_argument_group('TurboMind engine arguments')
        # common engine args
        tb_group._group_actions.append(tp_act)
        tb_group._group_actions.append(model_name_act)
        tb_group._group_actions.append(session_len_act)
        tb_group._group_actions.append(max_batch_size_act)
        tb_group._group_actions.append(cache_max_entry_act)
        ArgumentHelper.model_format(tb_group)
        ArgumentHelper.quant_policy(tb_group)
        ArgumentHelper.rope_scaling_factor(tb_group)

    @staticmethod
    def add_parser_api_server():
        """Add parser for api_server command."""
        parser = SubCliServe.subparsers.add_parser(
            'api_server',
            formatter_class=DefaultsAndTypesHelpFormatter,
            description=SubCliServe.api_server.__doc__,
            help=SubCliServe.api_server.__doc__)
        parser.set_defaults(run=SubCliServe.api_server)
        parser.add_argument(
            'model_path',
            type=str,
            help='The path of a model. it could be one of the following '
            'options: - i) a local directory path of a turbomind model'
            ' which is converted by `lmdeploy convert` command or '
            'download from ii) and iii). - ii) the model_id of a '
            'lmdeploy-quantized model hosted inside a model repo on '
            'huggingface.co, such as "internlm/internlm-chat-20b-4bit",'
            ' "lmdeploy/llama2-chat-70b-4bit", etc. - iii) the model_id'
            ' of a model hosted inside a model repo on huggingface.co,'
            ' such as "internlm/internlm-chat-7b", "qwen/qwen-7b-chat "'
            ', "baichuan-inc/baichuan2-7b-chat" and so on')
        parser.add_argument('--server-name',
                            type=str,
                            default='0.0.0.0',
                            help='Host ip for serving')
        parser.add_argument('--server-port',
                            type=int,
                            default=23333,
                            help='Server port')
        parser.add_argument('--allow-origins',
                            nargs='+',
                            type=str,
                            default=['*'],
                            help='A list of allowed origins for cors')
        parser.add_argument('--allow-credentials',
                            action='store_true',
                            help='Whether to allow credentials for cors')
        parser.add_argument('--allow-methods',
                            nargs='+',
                            type=str,
                            default=['*'],
                            help='A list of allowed http methods for cors')
        parser.add_argument('--allow-headers',
                            nargs='+',
                            type=str,
                            default=['*'],
                            help='A list of allowed http headers for cors')
        parser.add_argument('--qos-config-path',
                            type=str,
                            default='',
                            help='Qos policy config path')
        # common args
        ArgumentHelper.backend(parser)
        ArgumentHelper.log_level(parser)
        ArgumentHelper.api_keys(parser)
        ArgumentHelper.ssl(parser)

        # chat template args
        ArgumentHelper.meta_instruction(parser)  # TODO remove
        ArgumentHelper.chat_template(parser)
        ArgumentHelper.cap(parser)

        # pytorch engine args
        pt_group = parser.add_argument_group('PyTorch engine arguments')
        # common engine args
        tp_act = ArgumentHelper.tp(pt_group)
        model_name_act = ArgumentHelper.model_name(pt_group)
        session_len_act = ArgumentHelper.session_len(pt_group)
        max_batch_size_act = ArgumentHelper.max_batch_size(pt_group)
        cache_max_entry_act = ArgumentHelper.cache_max_entry_count(pt_group)

        # turbomind args
        tb_group = parser.add_argument_group('TurboMind engine arguments')
        # common engine args
        tb_group._group_actions.append(tp_act)
        tb_group._group_actions.append(model_name_act)
        tb_group._group_actions.append(session_len_act)
        tb_group._group_actions.append(max_batch_size_act)
        tb_group._group_actions.append(cache_max_entry_act)
        ArgumentHelper.model_format(tb_group)
        ArgumentHelper.quant_policy(tb_group)
        ArgumentHelper.rope_scaling_factor(tb_group)

    @staticmethod
    def add_parser_api_client():
        """Add parser for api_client command."""
        parser = SubCliServe.subparsers.add_parser(
            'api_client',
            formatter_class=DefaultsAndTypesHelpFormatter,
            description=SubCliServe.api_client.__doc__,
            help=SubCliServe.api_client.__doc__)
        parser.set_defaults(run=SubCliServe.api_client)
        parser.add_argument('api_server_url',
                            type=str,
                            help='The URL of api server')
        parser.add_argument('--api-key',
                            type=str,
                            default=None,
                            help='api key. Default to None, which means no '
                            'api key will be used')
        ArgumentHelper.session_id(parser)

    @staticmethod
    def add_parser_triton_client():
        """Add parser for triton_client command."""
        parser = SubCliServe.subparsers.add_parser(
            'triton_client',
            formatter_class=DefaultsAndTypesHelpFormatter,
            description=SubCliServe.triton_client.__doc__,
            help=SubCliServe.triton_client.__doc__)
        parser.set_defaults(run=SubCliServe.triton_client)
        parser.add_argument(
            'tritonserver_addr',
            type=str,
            help='The address in format "ip:port" of triton inference server')
        ArgumentHelper.session_id(parser)
        ArgumentHelper.cap(parser)
        ArgumentHelper.stream_output(parser)

    @staticmethod
    def gradio(args):
        """Serve LLMs with web UI using gradio."""
        from lmdeploy.archs import autoget_backend
        from lmdeploy.messages import (PytorchEngineConfig,
                                       TurbomindEngineConfig)
        from lmdeploy.model import ChatTemplateConfig
        from lmdeploy.serve.gradio.app import run
        backend = args.backend

        if backend != 'pytorch' and ':' not in args.model_path_or_server:
            # set auto backend mode
            backend = autoget_backend(args.model_path_or_server)
        if backend == 'pytorch':
            backend_config = PytorchEngineConfig(
                tp=args.tp,
                model_name=args.model_name,
                max_batch_size=args.max_batch_size,
                cache_max_entry_count=args.cache_max_entry_count,
                session_len=args.session_len)
        else:
            backend_config = TurbomindEngineConfig(
                model_name=args.model_name,
                tp=args.tp,
                max_batch_size=args.max_batch_size,
                session_len=args.session_len,
                model_format=args.model_format,
                quant_policy=args.quant_policy,
                rope_scaling_factor=args.rope_scaling_factor,
                cache_max_entry_count=args.cache_max_entry_count)
        chat_template_config = ChatTemplateConfig(
            model_name=args.model_name,
            meta_instruction=args.meta_instruction,
            capability=args.cap)
        if args.chat_template:
            chat_template_config = ChatTemplateConfig.from_json(
                args.chat_template)
        run(args.model_path_or_server,
            server_name=args.server_name,
            server_port=args.server_port,
            backend=backend,
            backend_config=backend_config,
            chat_template_config=chat_template_config)

    @staticmethod
    def api_server(args):
        """Serve LLMs with restful api using fastapi."""
        from lmdeploy.archs import autoget_backend
        from lmdeploy.model import ChatTemplateConfig
        from lmdeploy.serve.openai.api_server import serve as run_api_server
        backend = args.backend
        if backend != 'pytorch':
            # set auto backend mode
            backend = autoget_backend(args.model_path)
        if backend == 'pytorch':
            from lmdeploy.messages import PytorchEngineConfig
            backend_config = PytorchEngineConfig(
                tp=args.tp,
                model_name=args.model_name,
                max_batch_size=args.max_batch_size,
                cache_max_entry_count=args.cache_max_entry_count,
                session_len=args.session_len)
        else:
            from lmdeploy.messages import TurbomindEngineConfig
            backend_config = TurbomindEngineConfig(
                model_name=args.model_name,
                tp=args.tp,
                max_batch_size=args.max_batch_size,
                session_len=args.session_len,
                model_format=args.model_format,
                quant_policy=args.quant_policy,
                rope_scaling_factor=args.rope_scaling_factor,
                cache_max_entry_count=args.cache_max_entry_count)
        chat_template_config = ChatTemplateConfig(
            model_name=args.model_name,
            meta_instruction=args.meta_instruction,
            capability=args.cap)
        if args.chat_template:
            chat_template_config = ChatTemplateConfig.from_json(
                args.chat_template)
        run_api_server(args.model_path,
                       backend=backend,
                       backend_config=backend_config,
                       chat_template_config=chat_template_config,
                       server_name=args.server_name,
                       server_port=args.server_port,
                       allow_origins=args.allow_origins,
                       allow_credentials=args.allow_credentials,
                       allow_methods=args.allow_methods,
                       allow_headers=args.allow_headers,
                       log_level=args.log_level.upper(),
                       api_keys=args.api_keys,
                       ssl=args.ssl,
                       qos_config_path=args.qos_config_path)

    @staticmethod
    def api_client(args):
        """Interact with restful api server in terminal."""
        from lmdeploy.serve.openai.api_client import main as run_api_client
        kwargs = convert_args(args)
        run_api_client(**kwargs)

    @staticmethod
    def triton_client(args):
        """Interact with Triton Server using gRPC protocol."""
        from lmdeploy.serve.client import main as run_triton_client
        kwargs = convert_args(args)
        run_triton_client(**kwargs)

    @staticmethod
    def add_parsers():
        SubCliServe.add_parser_gradio()
        SubCliServe.add_parser_api_server()
        SubCliServe.add_parser_api_client()
        SubCliServe.add_parser_triton_client()
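
# Usage sketch (illustrative, not from this commit): after starting the server
# with `lmdeploy serve api_server <model_path> --server-port 23333`, the
# OpenAI-compatible endpoint can be queried from Python; the prompt and the
# address below are placeholders.
if __name__ == '__main__':
    from lmdeploy.serve.openai.api_client import APIClient

    api_client = APIClient('http://0.0.0.0:23333')
    print(api_client.available_models)
    for output in api_client.chat('hi', 1):
        print(output)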

# Copyright (c) OpenMMLab. All rights reserved.
import torch
from torch import nn

from lmdeploy.lite.quantization.awq import (FC_FCS_MAP, NORM_FCS_MAP,
                                            quant_weights, smooth_layers)
from lmdeploy.lite.utils import collect_target_modules

from .calibrate import calibrate

# from lmdeploy.lite.utils.export_turbomind import export_turbomind_config

LAYER_TYPE_MAP = {
    'InternLMForCausalLM': 'InternLMDecoderLayer',
    'InternLM2ForCausalLM': 'InternLM2DecoderLayer',
    'QWenLMHeadModel': 'QWenBlock',
    'BaiChuanForCausalLM': 'DecoderLayer',  # Baichuan 7B
    'BaichuanForCausalLM': 'DecoderLayer',  # Baichuan2 7B
@@ -21,6 +21,7 @@ LAYER_TYPE_MAP = {
}
NORM_TYPE_MAP = {
    'InternLMForCausalLM': 'InternLMRMSNorm',
    'InternLM2ForCausalLM': 'InternLM2RMSNorm',
    'QWenLMHeadModel': 'RMSNorm',
    'BaiChuanForCausalLM': 'RMSNorm',  # Baichuan 7B
    'BaichuanForCausalLM': 'RMSNorm',  # Baichuan2 7B
@@ -29,30 +30,33 @@ NORM_TYPE_MAP = {


def auto_awq(model: str,
             work_dir: str = './work_dir',
             calib_dataset: str = 'ptb',
             calib_samples: int = 128,
             calib_seqlen: int = 2048,
             w_bits: int = 4,
             w_sym: bool = False,
             w_group_size: int = 128,
             device: str = 'cuda'):
    """Perform weight quantization using AWQ algorithm.

    Args:
        model (str): The path of model in hf format.
        work_dir (str): The working directory to save results.
        calib_dataset (str): The calibration dataset name.
        calib_samples (int): The number of samples for calibration.
        calib_seqlen (int): The sequence length for calibration.
        w_bits (int): Bit number for weight quantization.
        w_sym (bool): Whether to do symmetric quantization.
        w_group_size (int): Group size for weight quantization statistics.
        device (str): Device type of running.
    """
    model, tokenizer, work_dir = calibrate(model, calib_dataset, calib_samples,
                                           calib_seqlen, work_dir, device)

    layer_type = LAYER_TYPE_MAP[type(model).__name__]
    fc2fcs = FC_FCS_MAP[layer_type]
    norm2fcs = NORM_FCS_MAP[layer_type]

    act_scales = torch.load(work_dir / 'inputs_stats.pth')['absmax']
    layers = collect_target_modules(model, layer_type)
    fcs = {}
@@ -68,11 +72,6 @@ def auto_awq(model: str,
                          safe_serialization=False)
    tokenizer.save_pretrained(work_dir)


if __name__ == '__main__':
    import fire
...
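
# Programmatic usage sketch for auto_awq above (illustrative; the checkpoint
# path is a placeholder, and an fp16 HF model plus a CUDA device are assumed):
#
#   from lmdeploy.lite.apis.auto_awq import auto_awq
#   auto_awq('./internlm-chat-7b', work_dir='./work_dir',
#            calib_dataset='ptb', calib_samples=128, calib_seqlen=2048,
#            w_bits=4, w_group_size=128, device='cuda')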

@@ -13,19 +13,31 @@ from lmdeploy.lite.utils import (collect_target_modules, get_calib_loaders,

LAYER_TYPE_MAP = {
    'InternLMForCausalLM': 'InternLMDecoderLayer',
    'InternLM2ForCausalLM': 'InternLM2DecoderLayer',
    'QWenLMHeadModel': 'QWenBlock',
    'BaiChuanForCausalLM': 'DecoderLayer',  # Baichuan 7B
    'BaichuanForCausalLM': 'DecoderLayer',  # Baichuan2 7B
    'LlamaForCausalLM': 'LlamaDecoderLayer',
}

NORM_TYPE_MAP = {
    'InternLMForCausalLM': 'InternLMRMSNorm',
    'InternLM2ForCausalLM': 'InternLM2RMSNorm',
    'QWenLMHeadModel': 'RMSNorm',
    'BaiChuanForCausalLM': 'RMSNorm',  # Baichuan 7B
    'BaichuanForCausalLM': 'RMSNorm',  # Baichuan2 7B
    'LlamaForCausalLM': 'LlamaRMSNorm',
}

HEAD_NAME_MAP = {
    'InternLMForCausalLM': 'lm_head',
    'InternLM2ForCausalLM': 'output',
    'QWenLMHeadModel': 'lm_head',
    'BaiChuanForCausalLM': 'lm_head',  # Baichuan 7B
    'BaichuanForCausalLM': 'lm_head',  # Baichuan2 7B
    'LlamaForCausalLM': 'lm_head',
}


def _prepare_for_calibrate(model: nn.Module,
                           layer_type: Union[str, type],
@@ -99,7 +111,7 @@ def _prepare_for_calibrate(model: nn.Module,

def calibrate(model: str,
              calib_dataset: str = 'ptb',
              calib_samples: int = 128,
              calib_seqlen: int = 2048,
              work_dir: str = './work_dir',
@@ -110,7 +122,7 @@ def calibrate(model: str,
    Args:
        model (str): The name or path of the model to be loaded.
        calib_dataset (str, optional): The calibration dataset name.
            Defaults to 'ptb'.
        calib_samples (int, optional): The number of samples for calibration.
            Defaults to 128.
        calib_seqlen (int, optional): The sequence length for calibration.
@@ -119,6 +131,11 @@ def calibrate(model: str,
            Defaults to './work_dir'.
        device (str, optional): The device to be used for calculation.
            Defaults to 'cuda'.

    Returns:
        model (nn.Module): The loaded huggingface model.
        tokenizer: The loaded huggingface tokenizer.
        work_dir (str): The working directory for outputs.
    """
    assert calib_dataset in ['c4', 'ptb', 'wikitext2', 'pileval'], \
@@ -152,7 +169,8 @@ def calibrate(model: str,
    layer_type = LAYER_TYPE_MAP[type(model).__name__]
    norm_type = NORM_TYPE_MAP[type(model).__name__]

    _prepare_for_calibrate(model, layer_type,
                           HEAD_NAME_MAP[type(model).__name__], device)

    print('Loading calibrate dataset ...')
    calib_loader, _ = get_calib_loaders(calib_dataset,
@@ -179,6 +197,8 @@ def calibrate(model: str,
    work_dir.mkdir(parents=True, exist_ok=True)
    calib_ctx.export(work_dir)

    return model, tokenizer, work_dir


if __name__ == '__main__':
    import fire
...
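
# Usage sketch for calibrate above (illustrative; the model id is a
# placeholder). Note the return value added in this commit, which auto_awq
# now consumes directly:
#
#   model, tokenizer, work_dir = calibrate('internlm/internlm-chat-7b',
#                                          calib_dataset='ptb',
#                                          calib_samples=128,
#                                          calib_seqlen=2048,
#                                          work_dir='./work_dir',
#                                          device='cuda')
#   act_scales = torch.load(work_dir / 'inputs_stats.pth')['absmax']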

@@ -15,6 +15,10 @@ NORM_FCS_MAP = {
        ['self_attn.k_proj', 'self_attn.q_proj', 'self_attn.v_proj'],
        'post_attention_layernorm': ['mlp.gate_proj', 'mlp.up_proj']
    },
    'InternLM2DecoderLayer': {
        'attention_norm': ['attention.wqkv'],
        'ffn_norm': ['feed_forward.w1', 'feed_forward.w3']
    },
    'QWenBlock': {
        'ln_1': ['attn.c_attn'],
        'ln_2': ['mlp.w1', 'mlp.w2']
@@ -34,6 +38,9 @@ FC_FCS_MAP = {
        'self_attn.v_proj': ['self_attn.o_proj'],
        'mlp.up_proj': ['mlp.down_proj']
    },
    'InternLM2DecoderLayer': {
        'feed_forward.w3': ['feed_forward.w2']
    },
    'QWenBlock': {
        'attn.c_attn': ['attn.c_proj'],
        'mlp.w1': ['mlp.c_proj']
@@ -71,6 +78,13 @@ def smooth_ln_fcs(ln: torch.nn.Module,
    :return: Scales
    """
    device, dtype = fcs[0].weight.device, fcs[0].weight.dtype

    # If zeros exist within the weight of the layer norm, it becomes
    # unnecessary to perform smooth quantization at the positions where
    # these zeros occur.
    zero_positions = (ln.weight == 0).nonzero(as_tuple=True)[0]
    nonzero_positions = (ln.weight != 0).nonzero(as_tuple=True)[0]

    act_scales = act_scales.to(device=device, dtype=dtype)
    concat_w = torch.cat([fc.weight for fc in fcs], dim=0)
@@ -78,7 +92,11 @@ def smooth_ln_fcs(ln: torch.nn.Module,
    scales = (act_scales.pow(alpha) /
              w_scales.pow(1 - alpha)).to(device).to(dtype)

    scales = scales / (scales[nonzero_positions].max() *
                       scales[nonzero_positions].min()).sqrt()
    scales[zero_positions] = 1

    ln.weight.div_(scales)
    if hasattr(ln, 'bias'):
@@ -182,8 +200,8 @@ def check_awq_supported(layer_type):

def quant_weights(model, fcs, bits, symmetry, group_size=-1, device='cuda'):
    """Quantize the weights of the target model's linear layers."""
    from lmdeploy.lite.quantization import WeightQuantizer
    from lmdeploy.pytorch.modules import WeightOnlyQLinear
    for name, fc in fcs.items():
        fc.to(device)
        quantizer = WeightQuantizer(bits, symmetry, 'per_group', group_size)
...
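
# A small numeric sketch (illustrative, not from this commit) of the
# zero-aware scale computation introduced in smooth_ln_fcs above: channels
# whose layer-norm weight is exactly zero are excluded from the max/min
# normalization and their scale is pinned to 1, so they stay untouched.
import torch

alpha = 0.5
ln_weight = torch.tensor([1.0, 0.0, 2.0, 0.5])   # toy norm weight with a zero
act_scales = torch.tensor([4.0, 3.0, 2.0, 1.0])  # toy activation absmax stats
w_scales = torch.tensor([0.5, 0.25, 1.0, 2.0])   # toy per-channel weight absmax

zero_positions = (ln_weight == 0).nonzero(as_tuple=True)[0]
nonzero_positions = (ln_weight != 0).nonzero(as_tuple=True)[0]

scales = act_scales.pow(alpha) / w_scales.pow(1 - alpha)
scales = scales / (scales[nonzero_positions].max() *
                   scales[nonzero_positions].min()).sqrt()
scales[zero_positions] = 1
print(scales)  # the second channel keeps a scale of exactly 1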
...@@ -3,6 +3,8 @@ from functools import partial ...@@ -3,6 +3,8 @@ from functools import partial
from typing import Union from typing import Union
import torch import torch
import transformers
from mmengine import digit_version
from torch import nn from torch import nn
from transformers import PreTrainedTokenizer from transformers import PreTrainedTokenizer
...@@ -53,7 +55,6 @@ class CalibrationContext(): ...@@ -53,7 +55,6 @@ class CalibrationContext():
self.num_kv_heads = num_kv_heads self.num_kv_heads = num_kv_heads
self.head_dim = model.config.hidden_size // num_attn_heads self.head_dim = model.config.hidden_size // num_attn_heads
self.model = model self.model = model
del self.model.lm_head
self.tokenizer = tokenizer self.tokenizer = tokenizer
...@@ -163,12 +164,36 @@ class CalibrationContext(): ...@@ -163,12 +164,36 @@ class CalibrationContext():
if k_obs and v_obs: if k_obs and v_obs:
batch_kwargs[i]['use_cache'] = True batch_kwargs[i]['use_cache'] = True
out = self._ori_forwards[mod](*batch_args[i], version = digit_version(transformers.__version__)
**batch_kwargs[i]) use_new_cache = type(mod).__name__ == 'LlamaDecoderLayer'
out = list(out) if version > digit_version('4.36.0') and use_new_cache:
key, value = out.pop(-1) from transformers.cache_utils import DynamicCache
k_obs.observe(key) batch_kwargs[i]['past_key_value'] = DynamicCache()
v_obs.observe(value)
ori_idx = mod.self_attn.layer_idx
mod.self_attn.layer_idx = 0
out = self._ori_forwards[mod](*batch_args[i],
**batch_kwargs[i])
mod.self_attn.layer_idx = ori_idx
out = list(out)
cache = out.pop(-1)
key = cache.key_cache.pop(-1)
value = cache.value_cache.pop(-1)
k_obs.observe(key)
v_obs.observe(value)
else:
out = self._ori_forwards[mod](*batch_args[i],
**batch_kwargs[i])
out = list(out)
key, value = out.pop(-1)
k_obs.observe(key)
v_obs.observe(value)
del key, value del key, value
torch.cuda.empty_cache() torch.cuda.empty_cache()
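As a side note on the hunk above, the branch is selected with mmengine's digit_version, which turns a version string into a comparable tuple; a minimal sketch (the version numbers are illustrative):

```python
# Sketch of the version gate used above: only transformers releases newer than
# 4.36.0 take the DynamicCache path for LlamaDecoderLayer.
from mmengine import digit_version

print(digit_version('4.37.2') > digit_version('4.36.0'))  # True  -> DynamicCache branch
print(digit_version('4.33.0') > digit_version('4.36.0'))  # False -> legacy (key, value) tuple branch
```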
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
import torch import torch
from transformers import AutoConfig, AutoModelForCausalLM from transformers import AutoConfig, AutoModelForCausalLM
from lmdeploy.pytorch.model import LoadWoInit from lmdeploy.pytorch.accel import LoadNoInit
def load_hf_from_pretrained(pretrained_model_name_or_path, def load_hf_from_pretrained(pretrained_model_name_or_path,
...@@ -26,7 +26,7 @@ def load_hf_from_pretrained(pretrained_model_name_or_path, ...@@ -26,7 +26,7 @@ def load_hf_from_pretrained(pretrained_model_name_or_path,
elif dtype == torch.bfloat16: elif dtype == torch.bfloat16:
hf_config.bf16 = True hf_config.bf16 = True
with LoadWoInit(): with LoadNoInit():
# Load model # Load model
model = AutoModelForCausalLM.from_pretrained( model = AutoModelForCausalLM.from_pretrained(
pretrained_model_name_or_path, config=hf_config, **kwargs) pretrained_model_name_or_path, config=hf_config, **kwargs)
......
# Copyright (c) OpenMMLab. All rights reserved. # Copyright (c) OpenMMLab. All rights reserved.
import dataclasses import dataclasses
import json
from abc import abstractmethod from abc import abstractmethod
from typing import List from typing import List, Literal, Optional
from mmengine import Registry from mmengine import Registry
from lmdeploy.utils import get_logger
logger = get_logger('lmdeploy')
MODELS = Registry('model', locations=['lmdeploy.model']) MODELS = Registry('model', locations=['lmdeploy.model'])
@dataclasses.dataclass @dataclasses.dataclass
class SamplingParam: class ChatTemplateConfig:
top_p: float = 0.8 """Parameters for chat template.
top_k: float = None
temperature: float = 0.8 Args:
    repetition_penalty: float = 1.0        model_name (str): the name of the deployed model. Determines which chat template will be applied.
All the chat template names: `lmdeploy list`
system (str | None): begin of the system prompt
meta_instruction (str | None): system prompt
eosys (str | None): end of the system prompt
user (str | None): begin of the user prompt
eoh (str | None): end of the user prompt
assistant (str | None): begin of the assistant prompt
eoa (str | None): end of the assistant prompt
capability: ('completion' | 'infilling' | 'chat' | 'python') = None
""" # noqa: E501
model_name: str
system: Optional[str] = None
meta_instruction: Optional[str] = None
eosys: Optional[str] = None
user: Optional[str] = None
eoh: Optional[str] = None
assistant: Optional[str] = None
eoa: Optional[str] = None
separator: Optional[str] = None
capability: Optional[Literal['completion', 'infilling', 'chat',
'python']] = None
stop_words: Optional[List[str]] = None
@property
def chat_template(self):
attrs = {
key: value
for key, value in dataclasses.asdict(self).items()
if value is not None
}
attrs.pop('model_name', None)
if self.model_name in MODELS.module_dict.keys():
model: BaseModel = MODELS.get(self.model_name)(**attrs)
else:
logger.warning(
f'Could not find {self.model_name} in registered models. '
                f'Falling back to the BaseChatTemplate for {self.model_name}.')
model = BaseChatTemplate(**attrs)
return model
def to_json(self, file_path=None):
"""Convert the dataclass instance to a JSON formatted string and
optionally save to a file."""
json_str = json.dumps(dataclasses.asdict(self),
ensure_ascii=False,
indent=4)
if file_path:
with open(file_path, 'w', encoding='utf-8') as file:
file.write(json_str)
return json_str
@classmethod
def from_json(cls, file_or_string):
"""Construct a dataclass instance from a JSON file or JSON string."""
try:
# Try to open the input_data as a file path
with open(file_or_string, 'r', encoding='utf-8') as file:
json_data = file.read()
except FileNotFoundError:
# If it's not a file path, assume it's a JSON string
json_data = file_or_string
except IOError:
# If it's not a file path and not a valid JSON string, raise error
raise ValueError(
'Invalid input. Must be a file path or a valid JSON string.')
json_data = json.loads(json_data)
assert json_data.get('model_name', None) is not None, \
'model_name is a must for json chat template.'
if json_data['model_name'] not in MODELS.module_dict.keys():
MODELS.register_module(json_data['model_name'],
module=BaseChatTemplate)
return cls(**json_data)
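A minimal sketch of how the config above can be round-tripped; the model name and delimiter strings are made-up placeholders, not a real chat format:

```python
# Illustrative only: serialize a custom template, reload it, and materialize it.
cfg = ChatTemplateConfig(model_name='my-chat-model',
                         meta_instruction='Be concise.',
                         eosys='\n', user='<|user|>', eoh='\n',
                         assistant='<|bot|>', eoa='\n')
cfg.to_json('my_chat_template.json')                  # dumps and writes the file
restored = ChatTemplateConfig.from_json('my_chat_template.json')
template = restored.chat_template                     # a BaseChatTemplate built from the stored fields
```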
@MODELS.register_module(name='internlm')
@MODELS.register_module(name='llama') @MODELS.register_module(name='llama')
@MODELS.register_module(name='base') @MODELS.register_module(name='base')
class BaseModel: class BaseModel:
...@@ -24,18 +100,10 @@ class BaseModel: ...@@ -24,18 +100,10 @@ class BaseModel:
def __init__(self, def __init__(self,
session_len=2048, session_len=2048,
top_p=0.8,
top_k=None,
temperature=0.8,
repetition_penalty=1.0,
capability='chat', capability='chat',
stop_words=None, stop_words=None,
**kwargs): **kwargs):
self.session_len = session_len self.session_len = session_len
self.top_p = top_p
self.top_k = top_k
self.temperature = temperature
self.repetition_penalty = repetition_penalty
self.stop_words = stop_words self.stop_words = stop_words
self.capability = capability self.capability = capability
...@@ -50,43 +118,8 @@ class BaseModel: ...@@ -50,43 +118,8 @@ class BaseModel:
Returns: Returns:
str: the concatenated prompt str: the concatenated prompt
""" """
if self.capability == 'completion':
return prompt
else:
return self.decorate_prompt(prompt, sequence_start)
@abstractmethod
def decorate_prompt(self, prompt, sequence_start):
return prompt return prompt
@staticmethod
def _translate_messages(messages: List):
"""Translate messages into system, user speaking list, assistant
speaking list.
Args:
messages (List): chat history
Returns:
            Tuple: consists of system (str), users (List[str]),
assistants (List[str])
"""
system = None
users = []
assistants = []
assert isinstance(messages, List)
for message in messages:
msg_role = message['role']
if msg_role == 'system':
system = message['content']
elif msg_role == 'user':
users.append(message['content'])
elif msg_role == 'assistant':
assistants.append(message['content'])
else:
raise ValueError(f'Unknown role: {msg_role}')
assistants.append(None)
return system, users, assistants
@abstractmethod @abstractmethod
def messages2prompt(self, messages, sequence_start=True): def messages2prompt(self, messages, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the """Return the prompt that is concatenated with other elements in the
...@@ -103,31 +136,40 @@ class BaseModel: ...@@ -103,31 +136,40 @@ class BaseModel:
return self.get_prompt(messages) return self.get_prompt(messages)
# chat history processing in derived classes # chat history processing in derived classes
@property @classmethod
def sampling_param(self): def match(cls, model_path: str) -> Optional[str]:
return SamplingParam(top_p=self.top_p, """Return the model_name that was registered to MODELS.
top_k=self.top_k,
temperature=self.temperature,
repetition_penalty=self.repetition_penalty)
Args:
model_path (str): the model path used for matching.
"""
return None
@MODELS.register_module(name='wizardlM')
@MODELS.register_module(name='vicuna')
class Vicuna(BaseModel):
"""Chat template of vicuna model."""
def __init__( class BaseChatTemplate(BaseModel):
self, """Base Chat template."""
system="""A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. """, # noqa: E501
user='USER', def __init__(self,
assistant='ASSISTANT', system='',
**kwargs): meta_instruction='',
eosys='',
user='',
eoh='',
assistant='',
eoa='',
separator='',
**kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.system = system self.system = system
self.meta_instruction = meta_instruction
self.user = user self.user = user
self.eoh = eoh
self.eoa = eoa
self.separator = separator
self.eosys = eosys
self.assistant = assistant self.assistant = assistant
def decorate_prompt(self, prompt, sequence_start=True): def get_prompt(self, prompt, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the """Return the prompt that is concatenated with other elements in the
chat template. chat template.
...@@ -138,12 +180,20 @@ class Vicuna(BaseModel): ...@@ -138,12 +180,20 @@ class Vicuna(BaseModel):
Returns: Returns:
str: the concatenated prompt str: the concatenated prompt
""" """
assert self.capability == 'chat', \ if self.capability == 'completion':
f'{type(self).__name__} has no capability of {self.capability}' return prompt
if sequence_start: if sequence_start:
return f'{self.system} {self.user}: {prompt} {self.assistant}: ' # None is different from ''
if self.meta_instruction is not None:
return f'{self.system}{self.meta_instruction}{self.eosys}' \
f'{self.user}{prompt}{self.eoh}' \
f'{self.assistant}'
else:
return f'{self.user}{prompt}{self.eoh}' \
f'{self.assistant}'
else: else:
return f'</s>{self.user}: {prompt} {self.assistant}: ' return f'{self.separator}{self.user}{prompt}{self.eoh}' \
f'{self.assistant}'
def messages2prompt(self, messages, sequence_start=True): def messages2prompt(self, messages, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the """Return the prompt that is concatenated with other elements in the
...@@ -156,20 +206,65 @@ class Vicuna(BaseModel): ...@@ -156,20 +206,65 @@ class Vicuna(BaseModel):
""" """
if isinstance(messages, str): if isinstance(messages, str):
return self.get_prompt(messages, sequence_start) return self.get_prompt(messages, sequence_start)
system, users, assistants = self._translate_messages(messages) box_map = dict(user=self.user,
system = self.system if not system else system assistant=self.assistant,
ret = system + ' ' system=self.system)
for user, assistant in zip(users, assistants): eox_map = dict(user=self.eoh,
if assistant: assistant=self.eoa + self.separator,
ret += f'{self.user}: {user} {self.assistant}: {assistant}</s>' system=self.eosys)
else: ret = ''
ret += f'{self.user}: {user} {self.assistant}: ' if self.meta_instruction is not None:
if len(messages) and messages[0]['role'] != 'system':
ret += f'{self.system}{self.meta_instruction}{self.eosys}'
for message in messages:
role = message['role']
content = message['content']
ret += f'{box_map[role]}{content}{eox_map[role]}'
ret += f'{self.assistant}'
return ret return ret
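To make the concatenation rules above concrete, a hedged example with placeholder delimiters (not any real model's format):

```python
# The expected string below was worked out by hand from messages2prompt above.
tmpl = BaseChatTemplate(system='<SYS>', meta_instruction='Be helpful.', eosys='\n',
                        user='<USER>', eoh='\n',
                        assistant='<BOT>', eoa='</s>', separator='\n')
messages = [{'role': 'user', 'content': 'hi'},
            {'role': 'assistant', 'content': 'hello'},
            {'role': 'user', 'content': 'who are you?'}]
tmpl.messages2prompt(messages)
# -> '<SYS>Be helpful.\n<USER>hi\n<BOT>hello</s>\n<USER>who are you?\n<BOT>'
```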
@MODELS.register_module(name='wizardlm')
@MODELS.register_module(name='vicuna')
class Vicuna(BaseChatTemplate):
"""Chat template of vicuna model."""
def __init__(
self,
meta_instruction="""A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.""", # noqa: E501
eosys=' ',
user='USER: ',
eoh=' ',
assistant='ASSISTANT: ',
eoa='</s>',
stop_words=['</s>'],
**kwargs):
super().__init__(meta_instruction=meta_instruction,
eosys=eosys,
user=user,
eoh=eoh,
assistant=assistant,
eoa=eoa,
stop_words=stop_words,
**kwargs)
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
model_path (str): the model path used for matching.
"""
if 'vicuna' in model_path.lower():
return 'vicuna'
if 'wizardlm' in model_path.lower():
return 'wizardlm'
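For reference, the single-turn prompt this template renders (the output is reproduced by hand, so treat it as illustrative):

```python
vicuna = Vicuna()
vicuna.get_prompt('Hello!')
# -> "A chat between a curious user and an artificial intelligence assistant. "
#    "... to the user's questions. USER: Hello! ASSISTANT: "
```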
@MODELS.register_module(name='internlm-chat') @MODELS.register_module(name='internlm-chat')
@MODELS.register_module(name='internlm-chat-7b') @MODELS.register_module(name='internlm-chat-7b')
class InternLMChat7B(BaseModel): @MODELS.register_module(name='internlm')
class InternLMChat7B(BaseChatTemplate):
"""Chat template of InternLM model.""" """Chat template of InternLM model."""
def __init__( def __init__(
...@@ -179,67 +274,36 @@ class InternLMChat7B(BaseModel): ...@@ -179,67 +274,36 @@ class InternLMChat7B(BaseModel):
- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless. - InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). It is designed to be helpful, honest, and harmless.
- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文. - InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文.
""", # noqa: E501 """, # noqa: E501
eosys='\n',
user='<|User|>:', user='<|User|>:',
eoh='\n', eoh='\n',
eoa='<eoa>\n',
eosys='\n',
assistant='<|Bot|>:', assistant='<|Bot|>:',
eoa='<eoa>',
separator='\n',
stop_words=['<eoa>'], stop_words=['<eoa>'],
**kwargs): **kwargs):
super().__init__(**kwargs) super().__init__(system=system,
self.system = system meta_instruction=meta_instruction,
self.meta_instruction = meta_instruction eosys=eosys,
self.user = user user=user,
self.eoh = eoh eoh=eoh,
self.eoa = eoa assistant=assistant,
self.eosys = eosys eoa=eoa,
self.assistant = assistant separator=separator,
self.stop_words = stop_words stop_words=stop_words,
**kwargs)
def decorate_prompt(self, prompt, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
chat template.
Args:
prompt (str): user's input prompt
sequence_start (bool): indicator for the first round chat of a
session sequence
Returns:
str: the concatenated prompt
"""
assert self.capability == 'chat', \
f'{type(self).__name__} has no capability of {self.capability}'
if sequence_start:
return f'{self.system}{self.meta_instruction}{self.eosys}' \
f'{self.user}{prompt}{self.eoh}' \
f'{self.assistant}'
else:
return f'\n{self.user}{prompt}{self.eoh}' \
f'{self.assistant}'
def messages2prompt(self, messages, sequence_start=True): @classmethod
"""Return the prompt that is concatenated with other elements in the def match(cls, model_path: str) -> Optional[str]:
chat template. """Return the model_name that was registered to MODELS.
Args: Args:
messages (str | List): user's input prompt model_path (str): the model path used for matching.
Returns:
str: the concatenated prompt
""" """
path = model_path.lower()
if isinstance(messages, str): if all([c not in path for c in ['internlm2', '8k']]) and \
return self.get_prompt(messages, sequence_start) all([c in path for c in ['internlm', 'chat']]):
eox_map = dict(user=self.eoh, assistant=self.eoa, system=self.eosys) return 'internlm'
ret = ''
if self.meta_instruction:
ret += f'{self.system}:{self.meta_instruction}{self.eosys}'
for message in messages:
role = message['role']
content = message['content']
ret += f'{eval(f"self.{role}")}{content}{eox_map[role]}'
ret += f'{self.assistant}'
return ret
@MODELS.register_module(name='internlm-chat-20b') @MODELS.register_module(name='internlm-chat-20b')
...@@ -254,7 +318,7 @@ class InternLMChat7B8K(InternLMChat7B): ...@@ -254,7 +318,7 @@ class InternLMChat7B8K(InternLMChat7B):
@MODELS.register_module(name='internlm-20b') @MODELS.register_module(name='internlm-20b')
class InternLMBaseModel20B(BaseModel): class InternLMBaseModel20B(BaseChatTemplate):
"""Generation parameters of InternLM-20B-Base model.""" """Generation parameters of InternLM-20B-Base model."""
def __init__(self, session_len=4096, capability='completion', **kwargs): def __init__(self, session_len=4096, capability='completion', **kwargs):
...@@ -263,71 +327,94 @@ class InternLMBaseModel20B(BaseModel): ...@@ -263,71 +327,94 @@ class InternLMBaseModel20B(BaseModel):
**kwargs) **kwargs)
@MODELS.register_module(
name=['internlm2-1_8b', 'internlm2-7b', 'internlm2-20b'])
class InternLM2BaseModel7B(BaseChatTemplate):
"""Generation parameters of InternLM2-7B-Base model."""
def __init__(self, session_len=32768, capability='completion', **kwargs):
super().__init__(session_len=session_len,
capability=capability,
**kwargs)
@MODELS.register_module(name=[
'internlm2-chat', 'internlm2-chat-1_8b', 'internlm2-chat-7b',
'internlm2-chat-20b'
])
@MODELS.register_module(name='internlm2')
class InternLM2Chat7B(InternLMChat7B):
"""Chat template and generation parameters of InternLM2-Chat-7B."""
def __init__(self,
session_len=32768,
system='<|im_start|>system\n',
user='<|im_start|>user\n',
assistant='<|im_start|>assistant\n',
eosys='<|im_end|>\n',
eoh='<|im_end|>\n',
eoa='<|im_end|>',
separator='\n',
stop_words=['<|im_end|>', '<|action_end|>'],
**kwargs):
super(InternLM2Chat7B, self).__init__(session_len=session_len,
system=system,
user=user,
assistant=assistant,
eosys=eosys,
eoh=eoh,
eoa=eoa,
separator=separator,
stop_words=stop_words,
**kwargs)
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
model_path (str): the model path used for matching.
"""
path = model_path.lower()
if 'internlm2' in path and ('chat' in path or 'math' in path):
return 'internlm2'
@MODELS.register_module(name='baichuan-7b') @MODELS.register_module(name='baichuan-7b')
class Baichuan7B(BaseModel): @MODELS.register_module(name='baichuan-base')
class Baichuan7B(BaseChatTemplate):
"""Generation parameters of Baichuan-7B base model.""" """Generation parameters of Baichuan-7B base model."""
def __init__(self, repetition_penalty=1.1, **kwargs): def __init__(self, **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.repetition_penalty = repetition_penalty
@MODELS.register_module(name='baichuan2-7b') @MODELS.register_module(name='baichuan2-7b')
class Baichuan2_7B(BaseModel): @MODELS.register_module(name='baichuan2')
class Baichuan2_7B(BaseChatTemplate):
"""Chat template and generation parameters of Baichuan2-7B-Base and """Chat template and generation parameters of Baichuan2-7B-Base and
Baichuan2-7B-Chat models.""" Baichuan2-7B-Chat models."""
def __init__(self, def __init__(self,
temperature=0.3, user='<reserved_106>',
top_k=5, assistant='<reserved_107>',
top_p=0.85,
repetition_penalty=1.05,
**kwargs): **kwargs):
super().__init__(temperature=temperature, super().__init__(user=user, assistant=assistant, **kwargs)
top_k=top_k,
top_p=top_p,
repetition_penalty=repetition_penalty,
**kwargs)
self.user_token = '<reserved_106>' # id = 195
self.assistant_token = '<reserved_107>' # id = 196
def decorate_prompt(self, prompt, sequence_start=True): @classmethod
"""Return the prompt that is concatenated with other elements in the def match(cls, model_path: str) -> Optional[str]:
chat template. """Return the model_name that was registered to MODELS.
Args: Args:
prompt (str): user's input prompt model_path (str): the model path used for matching.
sequence_start (bool): indicator for the first round chat of a
session sequence
Returns:
str: the concatenated prompt
""" """
assert self.capability == 'chat', \ path = model_path.lower()
f'{type(self).__name__} has no capability of {self.capability}' if 'baichuan2' in path and 'chat' in path:
return f'{self.user_token}{prompt}{self.assistant_token}' return 'baichuan2'
def messages2prompt(self, messages, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
chat template.
Args:
messages (str | List): user's input prompt
Returns:
str: the concatenated prompt
"""
if isinstance(messages, str):
return self.get_prompt(messages, sequence_start)
system, users, assistants = self._translate_messages(messages)
ret = ''
for user, assistant in zip(users, assistants):
ret += f'{self.user_token}{user}{self.assistant_token}'
if assistant:
ret += f'{assistant}'
return ret
@MODELS.register_module(name='puyu') @MODELS.register_module(name='puyu')
class Puyu(BaseModel): class Puyu(BaseChatTemplate):
"""Chat template of puyu model.This is only for internal usage in Shanghai """Chat template of puyu model.This is only for internal usage in Shanghai
AI Laboratory.""" AI Laboratory."""
...@@ -341,217 +428,136 @@ class Puyu(BaseModel): ...@@ -341,217 +428,136 @@ class Puyu(BaseModel):
eoa='', eoa='',
stop_words=None, stop_words=None,
**kwargs): **kwargs):
super().__init__(**kwargs) super().__init__(meta_instruction=meta_instruction,
self.meta_instruction = meta_instruction system=system,
self.system = system eosys=eosys,
self.user = user user=user,
self.assistant = assistant eoh=eoh,
self.stop_words = stop_words assistant=assistant,
self.eosys = eosys eoa=eoa,
self.eoh = eoh stop_words=stop_words,
self.eoa = eoa **kwargs)
def decorate_prompt(self, prompt, sequence_start=True):
assert self.capability == 'chat', \
f'{type(self).__name__} has no capability of {self.capability}'
if sequence_start:
return f'{self.system}{self.meta_instruction}{self.eosys}' \
f'{self.user}{prompt}{self.eoh}' \
f'{self.assistant}'
else:
return f'{self.eoa}{self.user}{prompt}{self.eoh}{self.assistant}'
def messages2prompt(self, messages, sequence_start=True): @classmethod
"""Return the prompt that is concatenated with other elements in the def match(cls, model_path: str) -> Optional[str]:
chat template. """Return the model_name that was registered to MODELS.
Args: Args:
messages (str | List): user's input prompt model_path (str): the model path used for matching.
sequence_start (bool): flag to start the sequence
Returns:
str: the concatenated prompt
""" """
if isinstance(messages, str): if 'puyu' in model_path.lower():
return self.get_prompt(messages, sequence_start) return 'puyu'
eox_map = dict(user=self.eoh, assistant=self.eoa, system=self.eosys)
ret = ''
if self.meta_instruction:
ret += f'{self.system}{self.meta_instruction}{self.eosys}'
for message in messages:
role = message['role']
content = message['content']
ret += f'{eval(f"self.{role}")}{content}{eox_map[role]}'
ret += f'{self.assistant}'
return ret
@MODELS.register_module(name='llama2') @MODELS.register_module(name=['llama2', 'llama-2', 'llama-2-chat'])
class Llama2(BaseModel): class Llama2(BaseChatTemplate):
"""Chat template of LLaMA2 model.""" """Chat template of LLaMA2 model."""
def __init__( def __init__(
self, self,
b_inst='[INST]', system='[INST] <<SYS>>\n',
e_inst='[/INST]', meta_instruction="""\
b_sys='<<SYS>>\n',
e_sys='\n<</SYS>>\n\n',
system="""\
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""", # noqa: E501 If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""", # noqa: E501
eosys='\n<</SYS>>\n\n',
assistant=' [/INST] ',
eoa='</s>',
separator='<s>[INST] ',
session_len=4096, session_len=4096,
**kwargs): **kwargs):
super().__init__(**kwargs) super().__init__(system=system,
self.b_inst = b_inst meta_instruction=meta_instruction,
self.e_inst = e_inst eosys=eosys,
self.b_sys = b_sys assistant=assistant,
self.e_sys = e_sys eoa=eoa,
self.default_sys_prompt = system separator=separator,
self.session_len = session_len session_len=session_len,
**kwargs)
def decorate_prompt(self, prompt, sequence_start=True): @classmethod
"""Return the prompt that is concatenated with other elements in the def match(cls, model_path: str) -> Optional[str]:
chat template. """Return the model_name that was registered to MODELS.
Args: Args:
prompt (str): user's input prompt model_path (str): the model path used for matching.
sequence_start (bool): indicator for the first round chat of a
session sequence
Returns:
str: the concatenated prompt
""" """
assert self.capability == 'chat', \ if 'llama-2' in model_path.lower() or 'llama2' in model_path.lower():
f'{type(self).__name__} has no capability of {self.capability}' return 'llama2'
if sequence_start:
return f'{self.b_inst} ' \
f'{self.b_sys} {self.default_sys_prompt} {self.e_sys}' \
f'{prompt} {self.e_inst} '
return f'{self.b_inst} {prompt} {self.e_inst} '
def messages2prompt(self, messages, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
chat template.
Args:
messages (str | List): user's input prompt
Returns:
str: the concatenated prompt
"""
if isinstance(messages, str):
return self.get_prompt(messages, sequence_start)
system, users, assistants = self._translate_messages(messages)
system = self.default_sys_prompt if not system else system
ret = f'{self.b_inst} {self.b_sys} {system} {self.e_sys}'
for i, (user, assistant) in enumerate(zip(users, assistants)):
if i != 0:
ret += f'{self.b_inst} '
if assistant:
ret += f'{user} {self.e_inst} {assistant}'
else:
ret += f'{user} {self.e_inst} '
return ret
@MODELS.register_module(name='qwen-72b') @MODELS.register_module(name='qwen-72b')
@MODELS.register_module(name='qwen-14b') @MODELS.register_module(name='qwen-14b')
@MODELS.register_module(name='qwen-7b') @MODELS.register_module(name='qwen-7b')
class Qwen7BChat(BaseModel): @MODELS.register_module(name='qwen')
class Qwen7BChat(BaseChatTemplate):
"""Chat template for Qwen-7B-Chat.""" """Chat template for Qwen-7B-Chat."""
def __init__(self, def __init__(self,
session_len=8192, session_len=8192,
top_p=0.5, system='<|im_start|>system\n',
top_k=40, meta_instruction='You are a helpful assistant.',
temperature=1.0, eosys='<|im_end|>\n',
im_start='<|im_start|>', user='<|im_start|>user\n',
im_end='<|im_end|>', eoh='<|im_end|>\n',
system='You are a helpful assistant.', assistant='<|im_start|>assistant\n',
eoa='<|im_end|>',
separator='\n',
stop_words=['<|im_end|>'], stop_words=['<|im_end|>'],
**kwargs): **kwargs):
super().__init__(**kwargs) super().__init__(system=system,
self.session_len = session_len meta_instruction=meta_instruction,
self.top_p = top_p eosys=eosys,
self.top_k = top_k user=user,
self.temperature = temperature eoh=eoh,
assistant=assistant,
self.im_start = im_start eoa=eoa,
self.im_end = im_end separator=separator,
self.system = system stop_words=stop_words,
self.stop_words = stop_words session_len=session_len,
**kwargs)
def decorate_prompt(self, prompt, sequence_start=True):
assert self.capability == 'chat', \
f'{type(self).__name__} has no capability of {self.capability}'
if sequence_start:
return f'{self.im_start}system\n{self.system}{self.im_end}' \
f'\n{self.im_start}user\n{prompt}{self.im_end}' \
f'\n{self.im_start}assistant\n'
return f'\n{self.im_start}user\n{prompt}{self.im_end}' \
f'\n{self.im_start}assistant\n'
def messages2prompt(self, messages, sequence_start=True): @classmethod
"""Return the prompt that is concatenated with other elements in the def match(cls, model_path: str) -> Optional[str]:
chat template. """Return the model_name that was registered to MODELS.
Args: Args:
messages (str | List): user's input prompt model_path (str): the model path used for matching.
Returns:
str: the concatenated prompt
""" """
if isinstance(messages, str): if 'qwen' in model_path.lower():
return self.get_prompt(messages, sequence_start) return 'qwen'
system, users, assistants = self._translate_messages(messages)
system = self.system if not system else system
ret = f'{self.im_start}system\n{system}{self.im_end}'
for user, assistant in zip(users, assistants):
if assistant:
ret += f'\n{self.im_start}user\n{user}{self.im_end}' \
f'\n{self.im_start}assistant\n{assistant}'
else:
ret += f'\n{self.im_start}user\n{user}{self.im_end}' \
f'\n{self.im_start}assistant\n'
return ret
@MODELS.register_module(name='codellama') @MODELS.register_module(name='codellama')
class CodeLlama(Llama2): class CodeLlama(Llama2):
def __init__(self, def __init__(self,
system='', meta_instruction='',
session_len=4096, session_len=4096,
suffix_first=False, suffix_first=False,
stop_words=None, stop_words=None,
**kwargs): **kwargs):
super().__init__(**kwargs) super().__init__(meta_instruction=meta_instruction,
session_len=session_len,
stop_words=stop_words,
**kwargs)
caps = ['completion', 'infilling', 'chat', 'python'] caps = ['completion', 'infilling', 'chat', 'python']
assert self.capability in caps, \ assert self.capability in caps, \
f'{self.capability} is not supported. ' \ f'{self.capability} is not supported. ' \
f'The supported capabilities are: {caps}' f'The supported capabilities are: {caps}'
self.default_sys_prompt = system self.meta_instruction = meta_instruction
self.session_len = session_len self.session_len = session_len
self.suffix_first = suffix_first self.suffix_first = suffix_first
self.stop_words = stop_words self.stop_words = stop_words
if self.capability == 'infilling':
# The following sampling parameters refers to https://github.com/facebookresearch/codellama # noqa: E501
if self.capability == 'completion' or self.capability == 'python':
self.top_p = kwargs.get('top_p', 0.9)
self.temperature = kwargs.get('temperature', 0.2)
if self.capability == 'chat':
self.top_p = kwargs.get('top_p', 0.95)
self.temperature = kwargs.get('temperature', 0.2)
elif self.capability == 'infilling':
self.top_p = kwargs.get('top_p', 0.9)
self.temperature = kwargs.get('temperature', 0.0)
if self.stop_words is None: if self.stop_words is None:
self.stop_words = ['<EOT>'] self.stop_words = ['<EOT>']
def decorate_prompt(self, prompt, sequence_start=True): def get_prompt(self, prompt, sequence_start=True):
if self.capability == 'infilling': if self.capability == 'infilling':
return self._infill_prompt(prompt) return self._infill_prompt(prompt)
elif self.capability == 'chat': elif self.capability == 'chat':
return self._get_prompt(prompt, sequence_start) return super().get_prompt(prompt, sequence_start)
        else: # python specialist        else: # python specialist
return prompt return prompt
...@@ -565,92 +571,130 @@ class CodeLlama(Llama2): ...@@ -565,92 +571,130 @@ class CodeLlama(Llama2):
prompt = f'<PRE> {prefix} <SUF>{suffix} <MID>' prompt = f'<PRE> {prefix} <SUF>{suffix} <MID>'
return prompt return prompt
def _get_prompt(self, prompt, sequence_start): @classmethod
prompt = prompt.strip() def match(cls, model_path: str) -> Optional[str]:
if sequence_start: """Return the model_name that was registered to MODELS.
return f'{self.b_inst} ' \
f'{self.b_sys}{self.default_sys_prompt}{self.e_sys}' \ Args:
f'{prompt} {self.e_inst}' model_path (str): the model path used for matching.
"""
if 'codellama' in model_path.lower():
return 'codellama'
@MODELS.register_module(name='falcon')
class Falcon(BaseModel):
def __init__(self, **kwargs):
super().__init__(**kwargs)
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
model_path (str): the model path used for matching.
"""
if 'falcon' in model_path.lower():
return 'falcon'
@MODELS.register_module(name='chatglm2-6b')
@MODELS.register_module(name='chatglm')
class ChatGLM2(BaseModel):
def __init__(self,
user='问:',
eoh='\n\n',
assistant='答:',
eoa='\n\n',
**kwargs):
super().__init__(**kwargs)
self._user = user
self._assistant = assistant
self._eoh = eoh
self._eoa = eoa
self.count = 0
return f'{self.b_inst} {prompt} {self.e_inst}' def get_prompt(self, prompt, sequence_start=True):
"""get prompt."""
        # needs more checking
# https://github.com/THUDM/ChatGLM2-6B/issues/48
# [64790, 64792] to be prepended
self.count += 1
ret = f'[Round {self.count}]\n\n'
ret += f'{self._user}{prompt}{self._eoh}'
ret += f'{self._assistant}'
return ret
def messages2prompt(self, messages, sequence_start=True): def messages2prompt(self, messages, sequence_start=True):
assert self.capability == 'chat', \ """message to prompt."""
f'codellama message2prompt only supports chat mode ' \ if isinstance(messages, str):
f'but got {self.cap} mode' return self.get_prompt(messages, sequence_start)
return super().messages2prompt(messages, sequence_start) ret = ''
count = 0
for message in messages:
role = message['role']
content = message['content']
if role == 'user':
count += 1
ret += f'[Round {count}]\n\n'
ret += f'{self._user}{content}{self._eoh}'
ret += f'{self._assistant}'
if role == 'assistant':
ret += f'{content}'
return ret
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
model_path (str): the model path used for matching.
"""
if 'chatglm' in model_path.lower():
return 'chatglm'
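Unlike the role-tag templates above, ChatGLM2 counts dialogue rounds; a hedged rendering example, with the expected string worked out by hand from the code above:

```python
glm = ChatGLM2()
history = [{'role': 'user', 'content': '你好'},
           {'role': 'assistant', 'content': '你好,请问有什么可以帮助你?'},
           {'role': 'user', 'content': '介绍一下你自己'}]
glm.messages2prompt(history)
# -> '[Round 1]\n\n问:你好\n\n答:你好,请问有什么可以帮助你?'
#    '[Round 2]\n\n问:介绍一下你自己\n\n答:'
```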
@MODELS.register_module(name='solar') @MODELS.register_module(name=['solar', 'solar-70b'])
class SOLAR(BaseModel): class SOLAR(BaseChatTemplate):
"""Chat template of SOLAR model. """Chat template of SOLAR model.
`https://huggingface.co/upstage/SOLAR-0-70b-16bit` `https://huggingface.co/upstage/SOLAR-0-70b-16bit`
""" """
def __init__(self, def __init__(self,
b_sys='### System:\n', system='### System:\n',
e_sys='\n\n', eosys='\n\n',
user='### User:\n', user='### User:\n',
eoh='\n\n', eoh='\n\n',
assistant='### Assistant:\n', assistant='### Assistant:\n',
eoa='\n\n', meta_instruction='',
system='',
session_len=2048, session_len=2048,
**kwargs): **kwargs):
super().__init__(**kwargs) super().__init__(**kwargs)
self.b_sys = b_sys self.system = system
self.e_sys = e_sys self.eosys = eosys
self.user = user self.user = user
self.eoh = eoh self.eoh = eoh
self.assistant = assistant self.assistant = assistant
self.eoa = eoa self.meta_instruction = meta_instruction
self.system = system
self.session_len = session_len self.session_len = session_len
def decorate_prompt(self, prompt, sequence_start=True): @classmethod
"""Return the prompt that is concatenated with other elements in the def match(cls, model_path: str) -> Optional[str]:
chat template. """Return the model_name that was registered to MODELS.
Args: Args:
prompt (str): user's input prompt model_path (str): the model path used for matching.
sequence_start (bool): indicator for the first round chat of a
session sequence
Returns:
str: the concatenated prompt
""" """
assert self.capability == 'chat', \ if 'solar' in model_path.lower():
f'{type(self).__name__} has no capability of {self.capability}' return 'solar'
if sequence_start:
return f'{self.b_sys}{self.system}{self.e_sys}' \
f'{self.user}{prompt}{self.eoh}{self.assistant}'
return f'{self.user}{prompt}{self.eoh}{self.assistant}'
def messages2prompt(self, messages, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the
chat template.
Args:
messages (str | List): user's input prompt
Returns:
str: the concatenated prompt
"""
if isinstance(messages, str):
return self.get_prompt(messages, sequence_start)
system, users, assistants = self._translate_messages(messages)
system = self.system if not system else system
ret = f'{self.b_sys}{system}{self.e_sys}'
for i, (user, assistant) in enumerate(zip(users, assistants)):
ret += f'{self.user}{user}{self.eoh}{self.assistant}'
if assistant:
ret += f'{assistant}{self.eoa}'
return ret
@MODELS.register_module(name='ultracm') @MODELS.register_module(name='ultracm')
@MODELS.register_module(name='ultralm') @MODELS.register_module(name='ultralm')
class UltraChat(BaseModel): class UltraChat(BaseChatTemplate):
"""Template of UltraCM and UltraLM models. """Template of UltraCM and UltraLM models.
`https://huggingface.co/openbmb/UltraCM-13b` `https://huggingface.co/openbmb/UltraCM-13b`
...@@ -659,147 +703,222 @@ class UltraChat(BaseModel): ...@@ -659,147 +703,222 @@ class UltraChat(BaseModel):
def __init__( def __init__(
self, self,
system="""User: A one-turn chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, very detailed, and polite answers to the user's questions.</s>""", # noqa: E501 system='User: ',
eos='</s>', meta_instruction="""A one-turn chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, very detailed, and polite answers to the user's questions.""", # noqa: E501
eosys='</s>\n',
user='User: ', user='User: ',
eoh='</s>\n',
assistant='Assistant: ', assistant='Assistant: ',
eoa='</s>',
separator='\n',
stop_words=['</s>'],
session_len=2048, session_len=2048,
**kwargs): **kwargs):
super().__init__(**kwargs) super().__init__(system=system,
self.system = system meta_instruction=meta_instruction,
self.eos = eos eosys=eosys,
self.session_len = session_len user=user,
self.user = user eoh=eoh,
self.assistant = assistant assistant=assistant,
eoa=eoa,
def decorate_prompt(self, prompt, sequence_start=True): separator=separator,
"""Return the prompt that is concatenated with other elements in the stop_words=stop_words,
chat template. session_len=session_len,
**kwargs)
Args:
prompt (str): the input prompt
sequence_start (bool): indicator for the first round chat of a
session sequence
Returns:
str: the concatenated prompt
"""
assert self.capability == 'chat', \
f'{type(self).__name__} has no capability of {self.capability}'
if sequence_start:
return f'{self.system}\n{self.user}{prompt}{self.eos}' \
f'\n{self.assistant}'
return f'\n{self.user}{prompt}{self.eos}' \
f'\n{self.assistant}'
def messages2prompt(self, messages, sequence_start=True): @classmethod
"""Return the prompt that is concatenated with other elements in the def match(cls, model_path: str) -> Optional[str]:
chat template. Only evaluate the last instruction completion pair. """Return the model_name that was registered to MODELS.
Args: Args:
messages (str | List): user's input prompt model_path (str): the model path used for matching.
Returns:
str: the concatenated prompt
""" """
if isinstance(messages, str): if 'ultracm' in model_path.lower():
return self.get_prompt(messages, sequence_start) return 'ultracm'
system, users, assistants = self._translate_messages(messages) if 'ultralm' in model_path.lower():
system = self.system if not system else system return 'ultralm'
ret = f'{system}'
for user, assistant in zip(users, assistants):
if assistant:
ret += f'\n{self.user}{user}{self.eos}' \
f'\n{self.assistant}{assistant}{self.eos}'
else:
ret += f'\n{self.user}{user}{self.eos}' \
f'\n{self.assistant}'
return ret
@MODELS.register_module(name='yi') @MODELS.register_module(name=['yi', 'yi-chat', 'yi-200k', 'yi-34b'])
class Yi(BaseModel): class Yi(BaseChatTemplate):
"""Chat template of Yi model.""" """Chat template of Yi model."""
def __init__(self, def __init__(self,
system='<|im_start|>system\n', system='<|im_start|>system\n',
meta_instruction=None, meta_instruction=None,
eosys='<|im_end|>\n',
user='<|im_start|>user\n', user='<|im_start|>user\n',
eoh='<|im_end|>\n', eoh='<|im_end|>\n',
eoa='<|im_end|>\n',
eosys='<|im_end|>\n',
assistant='<|im_start|>assistant\n', assistant='<|im_start|>assistant\n',
eoa='<|im_end|>',
separator='\n',
stop_words=['<|im_end|>', '<|endoftext|>'], stop_words=['<|im_end|>', '<|endoftext|>'],
**kwargs): **kwargs):
super().__init__(**kwargs) super().__init__(system=system,
self.system = system meta_instruction=meta_instruction,
self.meta_instruction = meta_instruction eosys=eosys,
self.user = user user=user,
self.eoh = eoh eoh=eoh,
self.eoa = eoa assistant=assistant,
self.eosys = eosys eoa=eoa,
self.assistant = assistant separator=separator,
self.stop_words = stop_words stop_words=stop_words,
**kwargs)
def decorate_prompt(self, prompt, sequence_start=True): @classmethod
"""Return the prompt that is concatenated with other elements in the def match(cls, model_path: str) -> Optional[str]:
chat template. """Return the model_name that was registered to MODELS.
Args: Args:
prompt (str): user's input prompt model_path (str): the model path used for matching.
sequence_start (bool): indicator for the first round chat of a
session sequence
Returns:
str: the concatenated prompt
""" """
assert self.capability == 'chat', \ path = model_path.lower()
f'{type(self).__name__} has no capability of {self.capability}' if 'yi' in path and 'vl' not in path:
if sequence_start: return 'yi'
if self.meta_instruction is None:
return f'{self.user}{prompt}{self.eoh}' \
f'{self.assistant}'
return f'{self.system}{self.meta_instruction}{self.eosys}' \
f'{self.user}{prompt}{self.eoh}' \
f'{self.assistant}'
else:
return f'{self.user}{prompt}{self.eoh}' \
f'{self.assistant}'
def messages2prompt(self, messages, sequence_start=True):
"""Return the prompt that is concatenated with other elements in the @MODELS.register_module(name=['mistral', 'mixtral'])
chat template. @MODELS.register_module(name=['Mistral-7B-Instruct', 'Mixtral-8x7B-Instruct'])
class MistralChat(BaseChatTemplate):
"""Template of Mistral and Mixtral Instruct models.
`https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1`
`https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1`
"""
def __init__(self,
user='[INST] ',
eoh=' [/INST]',
eoa='</s>',
session_len=2048,
**kwargs):
super().__init__(user=user,
eoh=eoh,
eoa=eoa,
session_len=session_len,
**kwargs)
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args: Args:
messages (str | List): user's input prompt model_path (str): the model path used for matching.
Returns:
str: the concatenated prompt
""" """
if 'instruct' in model_path.lower():
if 'mistral' in model_path.lower():
return 'mistral'
if 'mixtral' in model_path.lower():
return 'mixtral'
if isinstance(messages, str):
return self.get_prompt(messages, sequence_start)
eox_map = dict(user=self.eoh, assistant=self.eoa, system=self.eosys)
ret = ''
if self.meta_instruction:
ret += f'{self.system}:{self.meta_instruction}{self.eosys}'
for message in messages: @MODELS.register_module(name=['gemma'])
role = message['role'] class Gemma(BaseChatTemplate):
content = message['content'] """Template of Gemma models.
ret += f'{eval(f"self.{role}")}{content}{eox_map[role]}'
ret += f'{self.assistant}' `https://huggingface.co/google/gemma-7b-it`
return ret """
def __init__(self,
user='<start_of_turn>user\n',
eoh='<end_of_turn>\n',
assistant='<start_of_turn>model\n',
eoa='<end_of_turn>\n',
**kwargs):
super().__init__(user=user,
eoh=eoh,
assistant=assistant,
eoa=eoa,
**kwargs)
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
model_path (str): the model path used for matching.
"""
if 'gemma' in model_path.lower():
return 'gemma'
def main(model_name: str = 'test'): @MODELS.register_module(name=['deepseek-chat'])
assert model_name in MODELS.module_dict.keys(), \ @MODELS.register_module(name=['deepseek'])
f"'{model_name}' is not supported. " \ class Deepseek(BaseChatTemplate):
f'The supported models are: {MODELS.module_dict.keys()}'
model = MODELS.get(model_name)()
prompt = model.get_prompt(prompt='hi')
print(prompt)
print(f'session_len: {model.session_len}')
def __init__(self,
user='User: ',
eoh='\n\n',
assistant='Assistant: ',
eoa='<|end▁of▁sentence|>',
**kwargs):
super().__init__(user=user,
eoh=eoh,
assistant=assistant,
eoa=eoa,
**kwargs)
if __name__ == '__main__': @classmethod
import fire def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
fire.Fire(main) Args:
model_path (str): the model path used for matching.
"""
path = model_path.lower()
if 'deepseek' in path and 'chat' in path:
return 'deepseek'
@MODELS.register_module(name=['yi-vl'])
class YiVL(BaseChatTemplate):
def __init__(
self,
meta_instruction="""This is a chat between an inquisitive human and an AI assistant. Assume the role of the AI assistant. Read all the images carefully, and respond to the human's questions with informative, helpful, detailed and polite answers. 这是一个好奇的人类和一个人工智能助手之间的对话。假设你扮演这个AI助手的角色。仔细阅读所有的图像,并对人类的问题做出信息丰富、有帮助、详细的和礼貌的回答。\n\n""", # noqa: E501
user='### Human: ',
eoh='\n',
assistant='### Assistant:',
eoa='\n',
stop_words=['###'],
**kwargs):
super().__init__(meta_instruction=meta_instruction,
user=user,
eoh=eoh,
assistant=assistant,
eoa=eoa,
stop_words=stop_words,
**kwargs)
@classmethod
def match(cls, model_path: str) -> Optional[str]:
"""Return the model_name that was registered to MODELS.
Args:
model_path (str): the model path used for matching.
"""
path = model_path.lower()
if 'yi-vl' in path:
return 'yi-vl'
def best_match_model(query: str) -> Optional[str]:
"""Get the model that matches the query.
Args:
query (str): the input query. Could be a model path.
Return:
str | None: the possible model name or none.
"""
for name, model in MODELS.module_dict.items():
if model.match(query):
return model.match(query)
try:
from transformers import AutoTokenizer
tokenizer_config = AutoTokenizer.from_pretrained(
query, trust_remote_code=True)
if tokenizer_config.chat_template is None:
return 'base'
except Exception as e:
assert type(e) == OSError
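A few hedged examples of how the matcher above resolves paths; the repo ids are illustrative:

```python
best_match_model('internlm/internlm2-chat-7b')           # -> 'internlm2'
best_match_model('lmdeploy/llama2-chat-70b-4bit')         # -> 'llama2'
best_match_model('mistralai/Mistral-7B-Instruct-v0.1')    # -> 'mistral'
# Paths with no registered match fall back to the tokenizer:
# 'base' is returned when the tokenizer ships no chat_template.
```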
# Copyright (c) OpenMMLab. All rights reserved. # Copyright (c) OpenMMLab. All rights reserved.
"""Chat with torch models."""
...@@ -13,6 +13,7 @@ class LoadNoInit: ...@@ -13,6 +13,7 @@ class LoadNoInit:
self.normal_ = torch.nn.init.normal_ self.normal_ = torch.nn.init.normal_
self.kaiming_uniform_ = torch.nn.init.kaiming_uniform_ self.kaiming_uniform_ = torch.nn.init.kaiming_uniform_
self.kaiming_normal_ = torch.nn.init.kaiming_normal_ self.kaiming_normal_ = torch.nn.init.kaiming_normal_
self.tensor_normal_ = torch.Tensor.normal_
def __enter__(self, *args, **kwargs): def __enter__(self, *args, **kwargs):
"""Replace initializers with no-op.""" """Replace initializers with no-op."""
...@@ -24,6 +25,7 @@ class LoadNoInit: ...@@ -24,6 +25,7 @@ class LoadNoInit:
torch.nn.init.normal_ = lambda *args, **kwargs: None torch.nn.init.normal_ = lambda *args, **kwargs: None
torch.nn.init.kaiming_uniform_ = lambda *args, **kwargs: None torch.nn.init.kaiming_uniform_ = lambda *args, **kwargs: None
torch.nn.init.kaiming_normal_ = lambda *args, **kwargs: None torch.nn.init.kaiming_normal_ = lambda *args, **kwargs: None
torch.Tensor.normal_ = lambda *args, **kwargs: None
def __exit__(self, *args, **kwargs): def __exit__(self, *args, **kwargs):
"""Recover.""" """Recover."""
...@@ -35,3 +37,4 @@ class LoadNoInit: ...@@ -35,3 +37,4 @@ class LoadNoInit:
torch.nn.init.normal_ = self.normal_ torch.nn.init.normal_ = self.normal_
torch.nn.init.kaiming_uniform_ = self.kaiming_uniform_ torch.nn.init.kaiming_uniform_ = self.kaiming_uniform_
torch.nn.init.kaiming_normal_ = self.kaiming_normal_ torch.nn.init.kaiming_normal_ = self.kaiming_normal_
torch.Tensor.normal_ = self.tensor_normal_
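A sketch of the intended usage, mirroring load_hf_from_pretrained earlier in this diff; the checkpoint path is illustrative:

```python
import torch
from transformers import AutoModelForCausalLM

# Inside the context, weight initializers are no-ops, so from_pretrained does not
# waste time randomly initializing tensors that the checkpoint overwrites anyway.
with LoadNoInit():
    model = AutoModelForCausalLM.from_pretrained('internlm/internlm-chat-7b',
                                                 torch_dtype=torch.float16)
```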
# Copyright (c) OpenMMLab. All rights reserved.
import logging
import torch.nn as nn
from .base import BasicAdapter, BasicAdapterFast
from .internlm import InternLMAdapter
from .llama2 import Llama2Adapter
logger = logging.getLogger(__name__)
def _get_default_adapter(tokenizer):
if tokenizer.is_fast:
return BasicAdapterFast
else:
return BasicAdapter
def init_adapter(model: nn.Module, tokenizer, adapter=None):
if adapter is None:
for v in model.modules():
if 'InternLMModel' in v.__class__.__name__:
Adapter = InternLMAdapter
break
elif 'LlamaModel' in v.__class__.__name__:
Adapter = Llama2Adapter
break
else:
Adapter = _get_default_adapter(tokenizer)
elif adapter == 'llama1':
Adapter = _get_default_adapter(tokenizer)
else:
raise ValueError(f'Adapter {adapter} is not allowed.')
logger.info(f'Using adapter {Adapter.__name__}')
return Adapter(tokenizer)
# Copyright (c) OpenMMLab. All rights reserved.
"""Basic adapter suitable for general HuggingFace models."""
import logging
import re
from transformers import (PreTrainedTokenizer, PreTrainedTokenizerBase,
PreTrainedTokenizerFast)
logger = logging.getLogger(__name__)
class BaseAdapter:
"""Base class for all adapters.
Note:
Adapters coordinate with the session manager to prepare input_ids.
The full sequence fed to the model is as follows:
```
adapter.start_ids
adapter.encode_and_decorate(user_input_1)
output_1_generated_by_model
adapter.sep_ids
adapter.encode_and_decorate(user_input_2)
output_2_generated_by_model
adapter.sep_ids
adapter.encode_and_decorate(user_input_3)
```
        Thus the adapter is responsible for providing the model-specific
        ``start_ids``, ``sep_ids``, and a method to encode a single prompt.
"""
def __init__(self, tokenizer: PreTrainedTokenizerBase):
self.tokenizer = tokenizer
def encode_and_decorate(self, prompt, add_special_tokens=False):
"""Model specific method to encode and decorate prompt."""
raise NotImplementedError
def decode(self, value):
"""Model specific method to decode single value to string."""
raise NotImplementedError
@property
def stopping_criteria(self):
"""Model specific stopping criteria for generation."""
return None
@property
def start_ids(self):
"""Model specific start ids."""
return [self.tokenizer.bos_token_id]
@property
def sep_ids(self):
"""Model specific separation ids."""
return [self.tokenizer.bos_token_id]
class BasicAdapter(BaseAdapter):
"""Basic adapter for slow tokenizers."""
def encode_and_decorate(self, prompt, add_special_tokens=False):
"""Encode prompt.
Note:
we leave <bos> to session manager to add.
"""
input_ids = self.tokenizer.encode(
prompt,
add_special_tokens=add_special_tokens,
return_tensors='pt',
)
logger.debug(f'Encode {prompt} to {input_ids}')
return input_ids
def decode(self, value):
"""Fallback when tokenizer is not fast."""
self.tokenizer: PreTrainedTokenizer
tok = self.tokenizer.decode(value)
return tok + ' '
class BasicAdapterFast(BaseAdapter):
"""Basic adapter for slow tokenizers."""
hex_regex = re.compile(r'^<0x([0-9ABCDEF]+)>$')
def encode_and_decorate(self, prompt, add_special_tokens=False):
"""Encode prompt.
Note:
we leave <bos> to session manager to add.
"""
input_ids = self.tokenizer.encode(
prompt,
add_special_tokens=add_special_tokens,
return_tensors='pt',
)
logger.debug(f'Encode {prompt} to {input_ids}')
return input_ids
def decode(self, value):
"""Decode with fast tokenizers."""
self.tokenizer: PreTrainedTokenizerFast
tok = self.tokenizer._convert_id_to_token(value)
if tok.startswith('▁'): # sentencepiece
space = ' '
tok = tok[1:]
else:
space = ''
if res := self.hex_regex.match(tok):
tok = chr(int(res.group(1), 16))
if tok == '</s>' or tok == '\r':
tok = '\n'
tok = space + tok
logger.debug(f'Decode {value} to {repr(tok)}')
return tok
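A hedged illustration of the decode rules above, assuming a sentencepiece-backed fast tokenizer; the checkpoint path is a placeholder:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained('path/to/llama-style-model', use_fast=True)
adapter = BasicAdapterFast(tok)
adapter.decode(tok.convert_tokens_to_ids('▁world'))   # -> ' world' (leading space restored)
adapter.decode(tok.convert_tokens_to_ids('<0x0A>'))   # -> '\n'     (raw-byte token 0x0A)
```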
# Copyright (c) OpenMMLab. All rights reserved.
import logging
import re
import torch
from transformers import (PreTrainedTokenizerFast, StoppingCriteria,
StoppingCriteriaList)
from .base import BaseAdapter
logger = logging.getLogger(__name__)
class InternLMStoppingCriteria(StoppingCriteria):
"""Stopping criteria for HF version of InternLM."""
def __call__(self, input_ids, *args, **kwargs) -> bool:
return input_ids[0, -1] in [2, 103028]
class InternLMAdapter(BaseAdapter):
"""Adapter for InternLM.
    InternLM uses the following template, and \n should be token id 13.
<bos> (no actual newline here, just for better readability)
<|User|>:{prompt}<eoh>\n
<|Bot|>:{model_output}<eoa>\n
<|User|>:{prompt}<eoh>\n
<|Bot|>:{model_output}<eoa>\n
...
<eos>
"""
hex_regex = re.compile(r'^<0x([0-9ABCDEF]+)>$')
# ids of '<|User|>:'
B_USER_ID = torch.tensor([[333, 352, 1621, 352, 27232]])
# ids of '<eoh>\n<|Bot|>:'
E_USER_ID = torch.tensor([[103027, 13, 333, 352, 23845, 352, 27232]])
# ids of '<bos>'
start_ids = [1]
# ids of '\n'
sep_ids = [13]
def __init__(self, tokenizer: PreTrainedTokenizerFast):
self.tokenizer = tokenizer
def encode_and_decorate(self, prompt):
r"""Encode prompt and decorate with template.
Note:
we leave <bos> and chat history for session manager to add,
so we will decorate input_ids to '<|User|>:{prompt}<eoh>\n<|Bot|>:'
"""
input_ids = self.tokenizer.encode(
prompt,
add_special_tokens=False,
return_tensors='pt',
)
# This is f'<|User|>:{prompt}<eoh>\n<|Bot|>:'
# but force \n to 13 instead of 364
input_ids = torch.cat([self.B_USER_ID, input_ids, self.E_USER_ID],
dim=1)
return input_ids
def decode(self, value):
"""Decode generated tokens for InternLM."""
tok = self.tokenizer.decode(value)
if res := self.hex_regex.match(tok):
tok = chr(int(res.group(1), 16))
if tok == '</s>' or tok == '<eoa>' or tok == '\r':
tok = '\n'
logger.debug(f'Decode {value} to {repr(tok)}')
return tok
@property
def stopping_criteria(self):
return StoppingCriteriaList([InternLMStoppingCriteria()])
# Copyright (c) OpenMMLab. All rights reserved.
import logging
import re
from transformers import PreTrainedTokenizerFast
from .base import BasicAdapterFast
logger = logging.getLogger(__name__)
B_INST, E_INST = '[INST]', '[/INST]'
B_SYS, E_SYS = '<<SYS>>\n', '\n<</SYS>>\n\n'
DEFAULT_SYSTEM_PROMPT = """\
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""" # noqa: E501
class Llama2Adapter(BasicAdapterFast):
"""Adapter for llama2.
    Llama2 uses the following template, and the first user prompt
    should contain a system prompt.
    The user can specify the system prompt with a <<SYS>> tag; otherwise
    the default system prompt is prepended to the user's input.
<bos>
[INST]<space>
<<SYS>>\n
SYSTEM_PROMPT\n
<</SYS>>\n\n
{user_prompt_1}<space>
[/INST]<space>
{answer_1}<space>
<eos>
<bos>
[INST]<space>
{user_prompt_2}<space>
[/INST]<space>
{answer_2}<space>
<eos>
<bos>
[INST]<space>
    {user_prompt_3}(no space here)
...
"""
start_ids = []
sep_ids = []
def __init__(self, tokenizer: PreTrainedTokenizerFast):
super().__init__(tokenizer)
self.prev_round = 0
def encode_and_decorate(self, prompt):
r"""Encode prompt and decorate with template."""
if self.prev_round == 0:
res = re.search(r'<<SYS>>(.*?)<</SYS>>(.*)', prompt)
if res:
prompt = B_SYS + res.group(1).strip() + \
E_SYS + res.group(2).strip()
else:
prompt = B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + prompt
prompt = f'{B_INST} {prompt.strip()} {E_INST}'
logger.debug(f'decorated prompt: {repr(prompt)}')
input_ids = self.tokenizer.encode(
prompt,
add_special_tokens=True,
return_tensors='pt',
)
self.prev_round += 1
return input_ids
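A hedged sketch of the first-round decoration described in the docstring; the tokenizer path is a placeholder for a Llama-2 style fast tokenizer:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained('path/to/llama-2-model', use_fast=True)
adapter = Llama2Adapter(tok)
ids = adapter.encode_and_decorate('<<SYS>>You are a pirate.<</SYS>>Ahoy!')
# The decorated text that gets encoded is
# '[INST] <<SYS>>\nYou are a pirate.\n<</SYS>>\n\nAhoy! [/INST]';
# later rounds skip the system block and become '[INST] {prompt} [/INST]'.
```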
# Copyright (c) OpenMMLab. All rights reserved. # Copyright (c) OpenMMLab. All rights reserved.
"""Chat through command line.
import os
This submodule allows the user to chat with a language model through the command line, import random
and optionally accelerate the model using backends like deepspeed. from typing import List
Example 1: Chat with default setting from lmdeploy.messages import EngineGenerationConfig, PytorchEngineConfig
from lmdeploy.model import MODELS, best_match_model
```python from lmdeploy.tokenizer import DetokenizeState, Tokenizer
python -m lmdeploy.pytorch.chat $PATH_TO_HF_MODEL
``` os.environ['TM_LOG_LEVEL'] = 'ERROR'
Example 2: Disable sampling
def input_prompt(model_name):
```python """Input a prompt in the console interface."""
python -m lmdeploy.pytorch.chat \ if model_name == 'codellama':
$PATH_TO_LLAMA_MODEL_IN_HF_FORMAT \ print('\nenter !! to end the input >>>\n', end='')
--temperature 0 sentinel = '!!'
``` else:
print('\ndouble enter to end input >>> ', end='')
Example 3: Accelerate with deepspeed inference sentinel = '' # ends when this string is seen
return '\n'.join(iter(input, sentinel))
```python
python -m lmdeploy.pytorch.chat \
$PATH_TO_LLAMA_MODEL_IN_HF_FORMAT \ def valid_str(string, coding='utf-8'):
--accel deepspeed """decode text according to its encoding type."""
``` invalid_chars = [b'\xef\xbf\xbd']
bstr = bytes(string, coding)
Note: to use deepspeed, you need to install deepspeed, for invalid_char in invalid_chars:
and if you hope to accelerate InternLM, you need a customized version bstr = bstr.replace(invalid_char, b'')
https://github.com/wangruohui/DeepSpeed/tree/support_internlm_0.10.0 ret = bstr.decode(encoding=coding, errors='ignore')
return ret
Example 4: Tensor parallel the model on 2 GPUs
```python def _stop_words(stop_words: List[str], tokenizer: Tokenizer):
deepspeed --module --num_gpus 2 lmdeploy.pytorch.chat \ """Return a list of token ids corresponding to stop-words."""
$PATH_TO_LLAMA_MODEL_IN_HF_FORMAT \ if stop_words is None:
--accel deepspeed \ return None
``` assert isinstance(stop_words, List) and \
all(isinstance(elem, str) for elem in stop_words), \
This module also allow the following control commands to change f'stop_words must be a list but got {type(stop_words)}'
generation behaviors during chat. stop_words = [
tokenizer.encode(stop_word, False)[-1] for stop_word in stop_words
- `exit`: terminate and exit chat ]
- `config set key=value`: change generation config `key` to `value`, assert isinstance(stop_words, List) and all(
e.g. config temperature=0 disable sampling for following chats isinstance(elem, int) for elem in stop_words), 'invalid stop_words'
- `clear`: clear chat history return stop_words
"""
import itertools def run_chat(model_path: str,
import logging engine_config: PytorchEngineConfig,
from typing import Optional gen_config: EngineGenerationConfig = None,
session_id: int = 1,
import torch trust_remote_code: bool = True):
from transformers import GenerationConfig, PreTrainedModel """An example to perform model inference through the command line
interface.
from .adapters import init_adapter
from .dist import get_local_rank, get_rank, get_world_size
from .model import accel_model, init_model
from .session import BasicSessionManagerWithHistory
from .utils import BasicStreamer, TerminalIO, control
logger = logging.getLogger(__name__)
def set_logging(log_file: str, debug: bool):
torch.set_printoptions(linewidth=120)
level = logging.DEBUG if debug else logging.INFO
log_file = log_file or 'chat.log'
if r := get_rank() != 0:
log_file = log_file + f'.{r}'
logging.basicConfig(level=level,
format=('%(filename)s: '
'%(levelname)s: '
'%(funcName)s(): '
'%(lineno)d:\t'
'%(message)s'),
filename=log_file,
filemode='w')
print(f'Worker {get_rank()} logging to {log_file}')
def main(
model_path: str,
tokenizer_path: Optional[str] = None,
accel: Optional[str] = None,
max_new_tokens: int = 128,
temperature: float = 0.8,
top_p: float = 0.95,
seed: int = 0,
use_fast_tokenizer: bool = True,
max_alloc: int = 2048,
max_session_len: int = None,
log_file: Optional[str] = None,
debug: bool = False,
adapter: Optional[str] = None,
):
"""Chat with model through terminal.
Args: Args:
model_path (str): Path to model. model_path (str): the huggingface model path.
tokenizer_path (str): Path to tokenizer. engine_config (PytorchEngineConfig): Config of engine.
accel (str): Model accelerator. gen_config (EngineGenerationConfig): Config of generation.
max_new_tokens (int): Maximum number of tokens to generate. session_id (int): the identical id of a session.
temperature (float): Temperature for sampling. trust_remote_code (bool): trust remote code.
top_p (float): Top p for sampling. """
seed (int): Random seed. from lmdeploy.pytorch.engine import Engine
use_fast_tokenizer (bool): Whether to use fast tokenizer. tm_model = Engine.from_pretrained(model_path,
This argument is directly pass to transformer's ``AutoTokenizer.from_pretrained``. engine_config=engine_config,
Generally, user should choose to use fast tokenizers. trust_remote_code=trust_remote_code)
But if using fast raise some error, try to force using a slow one. tokenizer = tm_model.tokenizer
max_alloc (int): Maximum memory to allocate (for deepspeed). generator = tm_model.create_instance()
max_session_len (int): Maximum number of tokens allowed for all chat sessions. adapter_name = None
This include both history and current session. if engine_config.adapters is not None:
log_file (str): Path to log file. adapter_name = next(iter(engine_config.adapters.keys()))
debug (bool): Whether to enable debug mode.
adapter (str): Force to use an adapter. if gen_config is None:
Generally user should not use this argument because adapter is selected based gen_config = EngineGenerationConfig()
on the type of model. Only when it is impossible, e.g. distinguishing llama 1/2
based on `LlamaforCausalLM` class, this argument is required. nth_round = 1
Currently, only "llama1" is acceptable for llama1 models. step = 0
""" # noqa: E501 seed = random.getrandbits(64)
set_logging(log_file, debug) model_name = engine_config.model_name
if model_name is None:
# workers should sync in sampling model_name = best_match_model(model_path)
torch.manual_seed(seed) assert model_name is not None, 'Can not find match model template'
print(f'match template: <{model_name}>')
local_rank = get_local_rank() model = MODELS.get(model_name)()
world_size = get_world_size() stop_words = _stop_words(model.stop_words, tokenizer)
# Init model and tokenizer while True:
if not tokenizer_path: prompt = input_prompt(model_name)
tokenizer_path = model_path if prompt == 'exit':
exit(0)
model, tokenizer = init_model( elif prompt == 'end':
model_path, generator.end(session_id)
tokenizer_path, nth_round = 1
use_fast_tokenizer=use_fast_tokenizer, step = 0
) seed = random.getrandbits(64)
else:
# Init adapter based on model and tokenizer prompt = model.get_prompt(prompt, nth_round == 1)
adapter = init_adapter(model, tokenizer, adapter) input_ids = tokenizer.encode(prompt, nth_round == 1)
session_len = model.session_len
# Accelerate model if session_len is None:
model: PreTrainedModel = accel_model(model, session_len = tm_model.session_len
accel, if step >= session_len:
max_alloc=max_alloc, print('WARNING: exceed session max length.'
tp_size=world_size) ' Please end the session.')
continue
# warmup
warmup_config = GenerationConfig( print(f'{prompt} ', end='', flush=True)
max_new_tokens=1, state = DetokenizeState()
do_sample=temperature > 0, gen_config.random_seed = seed
temperature=temperature, gen_config.stop_words = stop_words
top_p=top_p, for outputs in generator.stream_infer(session_id=session_id,
) input_ids=input_ids,
model.generate(torch.tensor([[6]], device=get_local_rank()), warmup_config) gen_config=gen_config,
adapter_name=adapter_name):
gen_config = GenerationConfig( status, res, tokens = outputs
max_new_tokens=max_new_tokens, # decode res
do_sample=temperature > 0, response, state = tokenizer.detokenize_incrementally(
temperature=temperature, res, state)
top_p=top_p, response = valid_str(response)
) print(f'{response}', end='', flush=True)
# Session manager handling history # update step
max_session_len = max_alloc if max_session_len is None else max_session_len step += len(input_ids) + tokens
sm = BasicSessionManagerWithHistory(max_session_len=max_session_len, print()
start_ids=adapter.start_ids,
sep_ids=adapter.sep_ids) nth_round += 1
io = TerminalIO()
streamer = BasicStreamer(adapter.decode, io.output)
def main(model_path: str,
for r in itertools.count(1): model_name: str = None,
# User input from IO session_id: int = 1,
logger.info(f'Round {r}') top_k: float = 40,
top_p: float = 0.8,
prompt: str = io.input() temperature: float = 0.8,
logger.info(f'User input: {prompt}') repetition_penalty: float = 1.0,
tp: int = 1,
# Allow user to change config during runtime or exit stream_output: bool = True,
if control(prompt, gen_config, sm): adapter: str = None,
continue trust_remote_code: bool = True):
"""An example to perform model inference through the command line
# Tokenize and apply model specific templates interface.
input_ids = adapter.encode_and_decorate(prompt)
logger.info(f'Input ids:\n{input_ids}')
# Prepend chat history (tensor concatenation)
input_ids = sm.prepend_history(input_ids)
logger.info(f'Input ids with history:\n{input_ids}')
# Generate
input_ids = input_ids.cuda(local_rank)
# returned tensor including input and generated output
output = model.generate(input_ids,
gen_config,
streamer=streamer,
stopping_criteria=adapter.stopping_criteria)
logger.info(f'Output:\n{output}')
# Save output into session manager and maybe trim some history
sm.add_to_history(output)
def cli():
import fire
fire.Fire(main) Args:
model_path (str): the huggingface model path
model_name (str): name of the model.
session_id (int): the identical id of a session
top_k (int): sampling top k.
top_p (int): sampling top p.
temperature (float): sampling temperature.
repetition_penalty (float): parameter to penalize repetition
tp (int): GPU number used in tensor parallelism
stream_output (bool): indicator for streaming output or not
adapter (str): path to lora adapter.
trust_remote_code (bool): Trust remote code.
"""
adapters = None
if adapter is not None:
adapters = dict(default=adapter)
engine_config = PytorchEngineConfig(model_name=model_name,
tp=tp,
adapters=adapters)
gen_config = EngineGenerationConfig(max_new_tokens=512,
top_k=top_k,
top_p=top_p,
temperature=temperature,
repetition_penalty=repetition_penalty,
ignore_eos=False)
return run_chat(model_path,
engine_config,
gen_config,
session_id=session_id,
trust_remote_code=trust_remote_code)
if __name__ == '__main__': if __name__ == '__main__':
cli() import fire
fire.Fire(main)
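# A minimal sketch (not part of this commit) of driving the PyTorch-engine
# chat loop above programmatically instead of through `fire`. The model path
# is a placeholder and `_chat_example` is a hypothetical helper name.
from lmdeploy.messages import EngineGenerationConfig, PytorchEngineConfig
from lmdeploy.pytorch.chat import run_chat


def _chat_example(model_path: str = 'internlm/internlm-chat-7b'):
    # build engine/generation configs and start the interactive loop
    engine_config = PytorchEngineConfig(tp=1)
    gen_config = EngineGenerationConfig(max_new_tokens=256,
                                        top_k=40,
                                        top_p=0.8,
                                        temperature=0.7)
    run_chat(model_path, engine_config, gen_config=gen_config, session_id=1)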
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import logging
import queue
import warnings
from typing import List, Optional
import pynvml
import torch
import torch.multiprocessing as mp
from torch.nn.utils.rnn import pad_sequence
from transformers import (AutoTokenizer, PreTrainedModel,
PreTrainedTokenizerBase)
from .model import accel_model, init_model
def safe_numel(free_mem, model_size, max_intermediate):
"""Number of elements without out-of-memory."""
return int(free_mem - model_size) // max_intermediate
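# Worked example (illustrative numbers, not part of the original module):
# with ~40 GiB free, a 14e9-byte model and ~2e6 bytes of activations per
# token, safe_numel(40 * 2**30, 14e9, 2e6) is about 14474, i.e. roughly 14k
# input elements (batch_size * seq_len) fit in a single forward pass.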
def avail_gpus(percentage=0.96):
"""Detect available gpus.
Args:
percentage (float): The minimum fraction of free memory for a GPU to be
considered available.
Return:
A list of available gpu ids.
The average free memory (in bytes) across those gpus.
"""
gpus = []
mems = []
pynvml.nvmlInit()
for i in range(torch.cuda.device_count()):
handle = pynvml.nvmlDeviceGetHandleByIndex(int(i))
mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
free, total = int(mem_info.free), int(mem_info.total)
if free / total > percentage:
gpus.append(i)
mems.append(free)
pynvml.nvmlShutdown()
if len(gpus) == 0:
raise RuntimeError('No GPU available.')
return gpus, sum(mems) / len(mems)
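# Usage sketch (illustrative values, not from this commit): on a node where
# GPUs 0 and 1 are idle, avail_gpus() would return something like
#     ([0, 1], 4.2e10)    # ids of free GPUs, average free bytes per GPU
# and it raises RuntimeError when no GPU has enough free memory.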
@torch.no_grad()
def decode_single(model: PreTrainedModel,
input_ids: torch.Tensor,
attention_mask: torch.Tensor = None,
return_logits=True):
"""Decode a single batch.
Args:
model (PreTrainedModel): Pretrained model.
input_ids (torch.Tensor): A batch of input ids.
attention_mask (torch.Tensor): A batch of attention masks.
return_logits (bool): Return raw logits instead of probabilities.
Returns:
torch.Tensor: A batch of per-token logits or probabilities (on CPU).
Note:
This function assumes input_ids[i] = [bos, x1, x2, ..., xn]
and returns prob = [p(x1|bos), p(x2|bos,x1), ..., p(xn|bos..xn-1)],
so prob is shorter than input_ids by 1.
"""
# Call Causal LM forward
outputs = model(input_ids=input_ids,
attention_mask=attention_mask,
output_hidden_states=False,
output_attentions=False,
use_cache=False,
return_dict=True)
# fp32, [bs, seq_len, vocab_size]
logits = outputs.logits
if not return_logits:
# inplace softmax to get probs
torch.softmax(logits, dim=-1, out=logits)
# Shift to fetch probabilities
shift_labels = input_ids[..., 1:].contiguous()
shift_probs = logits[..., :-1, :].contiguous()
logits = torch.gather(shift_probs, -1, shift_labels.unsqueeze(-1))
if attention_mask is not None:
logits *= attention_mask[..., None]
logits = logits.cpu()
return logits
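# Illustration (not from this commit) of the shift-and-gather step used in
# decode_single above: the score of token x_t is read from the distribution
# predicted at position t-1, so the result is one step shorter than the input.
# Shapes are toy values.
import torch

example_input_ids = torch.tensor([[1, 5, 2, 7]])   # [bs=1, seq_len=4]
example_logits = torch.randn(1, 4, 10)             # [bs, seq_len, vocab=10]
example_probs = torch.softmax(example_logits, dim=-1)
shift_labels = example_input_ids[..., 1:]          # targets x_1 .. x_3
shift_probs = example_probs[..., :-1, :]           # predictions at positions 0..2
per_token = torch.gather(shift_probs, -1, shift_labels.unsqueeze(-1))
assert per_token.shape == (1, 3, 1)                # one shorter than the input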
def worker_fn(model_path: str,
inq: mp.Queue,
outq: mp.Queue,
accel: Optional[str] = None,
gpu_id=0):
# torch.set_default_device(gpu_id)
model, _ = init_model(model_path)
model = model.eval()
model = accel_model(model, accel, gpu_id=gpu_id)
while True:
try:
idx, args = inq.get(timeout=1)
except queue.Empty:
continue
if idx is None:
print(f'Worker {gpu_id} received exit signal.')
break
# print(args)
input_ids, input_lens, *args = args
input_ids = input_ids.cuda(gpu_id)
max_len = max(input_lens)
assert max_len == input_ids.size(-1), \
f'input_ids.shape = {input_ids.shape}, max_len = {max_len}'
input_lens = torch.tensor(input_lens, device=gpu_id)
attention_mask = \
torch.arange(max_len, device=gpu_id)[None, :] < input_lens[:, None]
assert attention_mask.shape == input_ids.shape, \
f'attention_mask.shape = {attention_mask.shape}'
try:
probs = decode_single(model, input_ids, attention_mask, *args)
except torch.cuda.OutOfMemoryError:
warnings.warn(
f'OOM on GPU {gpu_id}, discard prompts at indices {idx}.')
probs = torch.empty((input_ids.size(0), 0),
dtype=torch.float32,
device='cpu')
outq.put((idx, probs))
print(f'Exiting worker {gpu_id} ...')
inq.close()
outq.close()
print(f'Worker {gpu_id} finished.')
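# Queue protocol implemented by worker_fn above (descriptive comment, not code
# from this commit): the parent process puts work items
#     (batch_indices, (input_ids, input_lens))
# on the input queue, plus one (None, None) sentinel per worker to request
# shutdown; each worker replies on the output queue with
#     (batch_indices, per_token_scores_on_cpu)
# so the Engine below can restore the original prompt order.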
class Engine:
"""Multi-GPU deciding engine.
Args:
model_path (str): Path to the pretrained model.
tokenizer_path (str, optional): Path to the pretrained tokenizer.
Defaults to None.
Either tokenizer_path or tokenizer should be provided.
tokenizer (PreTrainedTokenizerBase, optional): Pre-configured tokenizer.
Defaults to None.
Either tokenizer_path or tokenizer should be provided.
accel (str, optional): Acceleration method.
Defaults to None. 'deepspeed' is not tested.
gpu_mem_percentage (float, optional): GPUs whose free-memory fraction is
larger than this value are considered available and used as decode devices.
Defaults to 0.96.
model_size_byte (float, optional): (Approximate) model size in bytes.
Defaults to 14e9 (7B model in FP16).
bytes_per_token (float, optional): (Approximate) memory cost per token in bytes.
Defaults to 2e6 (2MB).
``bytes_per_token`` and ``model_size_byte`` are used to compute
the maximum batch size for given seq_length
""" # noqa: E501
def __init__(self,
model_path: str,
tokenizer_path: Optional[str] = None,
tokenizer: Optional[PreTrainedTokenizerBase] = None,
accel: Optional[str] = None,
gpu_mem_percentage: float = 0.96,
model_size_byte=14e9,
bytes_per_token=2e6):
gpu_ids, mem = avail_gpus(gpu_mem_percentage)
print(f'Available GPUs are: {gpu_ids}, ', end='')
print(f'with {mem/2**30:.2f} GiB free.')
ctx = mp.get_context('spawn')
inq = ctx.Queue()
outq = ctx.Queue()
ps = []
for id in gpu_ids:
p = ctx.Process(target=worker_fn,
args=(model_path, inq, outq, accel, id))
p.start()
ps.append(p)
if tokenizer is None:
if tokenizer_path is None:
tokenizer_path = model_path
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
self.gpu_ids = gpu_ids
self.inq = inq
self.outq = outq
self.ps = ps
self.tokenizer = tokenizer
self.safe_numel = safe_numel(mem, model_size_byte, bytes_per_token)
def clear_queue(self):
for q in self.inq, self.outq:
while not q.empty():
q.get()
def decode(self,
token_ids: List[List[int]],
sort=True,
max_bs: int = 1024,
pad=True,
pad_token_id=2,
return_logits=True):
"""Inference the model to compute probabilities.
Args:
token_ids (List[List[int]]): List of list of token ids.
sort (bool, optional): Internally sort the prompts by length to achieve better efficiency.
Defaults to True.
Note: orders of returned probabilities are always the same as the input.
max_bs (int, optional): Maximum batch size.
Defaults to 1024.
pad (bool, optional): Pad the prompts in every mini batch to the same length.
Defaults to True. Set to False to save memory.
return_logits (bool, optional): Return logits instead of probabilities.
Returns:
numpy.ndarray: Array of per-token scores of shape [bsz, seqlen-1, 1],
zero-padded, if pad is True
List[numpy.ndarray]: List of per-token scores without padding, if pad is False.
Note:
This function will accept input token_ids = [x0(=bos), x1, x2, ..., xn]
and compute prob = [p(x1|x0), p(x2|x0,x1), ..., p(xn|x0..xn-1)]
So prob is shorter than input_ids by 1.
""" # noqa: E501
self.clear_queue()
# sort to achieve better efficiency
if sort:
pids_and_indicis = sorted(enumerate(token_ids),
key=lambda i_and_x: len(i_and_x[1]))
else:
pids_and_indicis = list(enumerate(token_ids))
left = 0
bs = max_bs
while left < len(token_ids):
if not sort:
bs = max_bs
right = min(left + bs, len(token_ids))
# batch of prompts
sub_p_and_i = pids_and_indicis[left:right]
idx, sub_p = zip(*sub_p_and_i)
# batch of input_ids and attn_masks
# inputs = self.tokenizer(sub_p, return_tensors='pt', padding=True)
input_ids = [torch.tensor(p) for p in sub_p]
input_ids = pad_sequence(input_ids,
batch_first=True,
padding_value=pad_token_id)
input_lens = [len(p) for p in sub_p]
# Dynamic batch size based on safe memory
while input_ids.numel() > self.safe_numel:
if bs == 1:
break
bs = max(1, round(bs / 1.5))
print(f'\nReduce bs to {bs} when seq len reaches '
f'{input_ids.shape[-1]}')
idx = idx[:bs]
input_lens = input_lens[:bs]
input_ids = input_ids[:bs, :max(input_lens)]
# Send to worker
self.inq.put((idx, (input_ids, input_lens)))
left += bs
print(
f'Distributing prompts {right}/{len(token_ids)},'
f' {right/len(token_ids):.0%}',
end='\r')
print()
# Collect outputs from workers
all_probs = [None] * len(token_ids)
count = 0
while count < len(token_ids):
idx, probs = self.outq.get()
for i, p in zip(idx, probs):
assert all_probs[i] is None
all_probs[i] = p
count += len(idx)
print(
f'Decoding and collecting outputs '
f'{count}/{len(token_ids)}, '
f'{count/len(token_ids):.0%}',
end='\r')
print()
if pad:
all_probs = pad_sequence(all_probs, batch_first=True)
all_probs = all_probs.cpu().numpy()
else:
all_probs = [p.cpu().numpy() for p in all_probs]
return all_probs
def __del__(self):
print('Exiting engine ...')
for _ in self.ps:
self.inq.put((None, None))
for p in self.ps:
p.join(timeout=1)
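# Minimal usage sketch (illustrative; the path and token ids are placeholders,
# and the __main__ block below is the full example):
#     engine = Engine('llama2/huggingface/llama-2-7b')
#     scores = engine.decode([[1, 306, 4658], [1, 3439, 17632]])
#     # scores.shape == (2, 2, 1): one score per token after the first
#     del engine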
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--model_path',
default='llama2/huggingface/llama-2-7b',
help='Path to HuggingFace model and tokenizer.')
parser.add_argument(
'--test_path',
default='',
help='Path to text file, with each line containing a prompt.')
parser.add_argument(
'-p',
'--prompts',
nargs='*',
default=[
'I believe the meaning of life is to find your gift.',
'Simply put, the theory of relativity states that',
'Building a website can be done in 10 simple steps:'
],
help="Prompt in command line, please quote \"\" every sentences, "
'surpassed by --test_path')
parser.add_argument('--min_len',
default=1,
help='Minimum length of prompts')
parser.add_argument('--save-to',
default='decode.out',
help='Save results to this file.')
args = parser.parse_args()
model_path = args.model_path
test_path = args.test_path
prompts = args.prompts
logger = logging.getLogger(__name__)
# logging.basicConfig(level=logging.DEBUG)
# Use test file preferentially
if test_path:
with open(test_path, 'r') as f:
prompts = f.readlines()
prompts = [p.strip() for p in prompts]
# Output infos
print(f'Model path: {model_path}')
def _format(ts, start, end):
if start < 0:
start += len(ts)
if end <= 0:
end += len(ts)
return '\n'.join(
(f'{i}\t{t}' for i, t in zip(range(start, end), ts[start:end])))
if len(prompts) > 10:
print('Prompts:\n' + _format(prompts, 0, 5) + '\n......\n' +
_format(prompts, -5, 0))
else:
print('Prompts:\n' + _format(prompts, 0, 0))
# Init Engine in backend
engine = Engine(model_path)
# Tokenize
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = 'right'
input_ids = tokenizer(prompts, padding=False)
input_ids: List[List[int]] = input_ids.input_ids
# Filter out too short prompts
input_ids = [i for i in input_ids if len(i) >= args.min_len]
if len(input_ids) < len(prompts):
logger.warning(
f'Filtered out {len(prompts) - len(input_ids)} prompts, '
f'because they are shorter than {args.min_len}.')
# Decode
logits = engine.decode(input_ids)
print(f'logits.shape = {logits.shape}')
# Save to pth
print(f'Dumping results to = {args.save_to}')
torch.save(logits, args.save_to, pickle_protocol=4)
del engine