Commit d7117b95 authored by zhouxiang

Sync 0.2.6 code

parent 5f83e392
# Copyright (c) OpenMMLab. All rights reserved.
from .turbomind import TurboMind
def bootstrap():
import os
import sys
has_turbomind = False
pwd = os.path.dirname(__file__)
if os.path.exists(os.path.join(pwd, 'lib')):
has_turbomind = True
if os.name == 'nt' and has_turbomind:
if sys.version_info[:2] >= (3, 8):
CUDA_PATH = os.getenv('CUDA_PATH')
assert CUDA_PATH is not None, 'Can not find $env:CUDA_PATH'
dll_path = os.path.join(CUDA_PATH, 'bin')
print(f'Add dll path {dll_path}, please note cuda version '
'should be >= 11.3 when compiled with cuda 11')
os.add_dll_directory(dll_path)
bootstrap()
from .turbomind import TurboMind # noqa: E402
__all__ = ['TurboMind']
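For reference, a minimal sketch of how the Windows bootstrap above is exercised; the CUDA install path below is only a placeholder, and CUDA_PATH must point at the toolkit before the package is imported.

import os

# Sketch only: set CUDA_PATH before importing, so bootstrap() can register the
# CUDA bin directory via os.add_dll_directory() on Windows (Python >= 3.8).
os.environ.setdefault(
    'CUDA_PATH',
    r'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8')  # placeholder path

from lmdeploy.turbomind import TurboMind  # importing the package runs bootstrap()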
# Copyright (c) OpenMMLab. All rights reserved.
import dataclasses
import os
import random
from lmdeploy.turbomind.utils import get_gen_param
from lmdeploy.messages import EngineGenerationConfig
from lmdeploy.model import ChatTemplateConfig
from lmdeploy.tokenizer import DetokenizeState
os.environ['TM_LOG_LEVEL'] = 'ERROR'
......@@ -29,32 +30,51 @@ def valid_str(string, coding='utf-8'):
return ret
def main(model_path,
def main(model_path: str,
model_name: str = None,
session_id: int = 1,
cap: str = 'chat',
tp: int = 1,
stream_output: bool = True,
request_output_len: int = 512,
request_output_len: int = 1024,
chat_template_cfg: ChatTemplateConfig = None,
**kwargs):
"""An example to perform model inference through the command line
interface.
Args:
model_path (str): the path of the deployed model
model_name (str): the name of deployed model
session_id (int): the identical id of a session
cap (str): the capability of a model. For example, codellama has
the ability among ['completion', 'infilling', 'chat', 'python']
tp (int): GPU number used in tensor parallelism
stream_output (bool): indicator for streaming output or not
request_output_len (int): output token nums
chat_template_cfg (ChatTemplateConfig): Chat template config
**kwargs (dict): other arguments for initializing the model's chat template
"""
from lmdeploy import turbomind as tm
tm_model = tm.TurboMind.from_pretrained(model_path,
tp=tp,
capability=cap,
**kwargs)
if chat_template_cfg is None:
chat_template_cfg = ChatTemplateConfig(model_name=model_name,
capability=cap)
new_kwargs = {}
for k, v in kwargs.items():
if hasattr(chat_template_cfg, k):
setattr(chat_template_cfg, k, v)
else:
new_kwargs[k] = v
kwargs = new_kwargs
tm_model = tm.TurboMind.from_pretrained(
model_path,
model_name=model_name,
tp=tp,
capability=cap,
chat_template_config=chat_template_cfg,
**kwargs)
tokenizer = tm_model.tokenizer
generator = tm_model.create_instance()
gen_config = EngineGenerationConfig(top_k=40)
nth_round = 1
step = 0
......@@ -90,29 +110,30 @@ def main(model_path,
' Please end the session.')
continue
gen_param = get_gen_param(cap, model.sampling_param, nth_round,
step, request_output_len, **kwargs)
sequence_start = (nth_round == 1)
sequence_end = False
if cap != 'chat': # not interactive for other capability
sequence_start, sequence_end = True, True
step = 0
print(f'{prompt} ', end='', flush=True)
response_size = 0
state = DetokenizeState()
for outputs in generator.stream_infer(
session_id=session_id,
input_ids=[input_ids],
sequence_start=sequence_start,
sequence_end=sequence_end,
step=step,
stream_output=stream_output,
**dataclasses.asdict(gen_param),
gen_config=gen_config,
ignore_eos=False,
random_seed=seed if nth_round == 1 else None):
res, tokens = outputs[0]
_, res, tokens = outputs
# decode res
response = tokenizer.decode(res.tolist(), offset=response_size)
# utf-8 char at the end means it's a potential unfinished
# byte sequence, continue to concatenate it with the next
# sequence and decode them together
if response.endswith('�'):
continue
response, state = tokenizer.detokenize_incrementally(
res, state=state)
response = valid_str(response)
print(f'{response}', end='', flush=True)
response_size = tokens
# update step
step += len(input_ids) + tokens
......
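A hedged usage sketch for the interactive main() above, assuming the excerpt lives in lmdeploy/turbomind/chat.py; the model id is a placeholder.

# Sketch: start the turbomind chat loop programmatically. Any model path accepted
# by TurboMind.from_pretrained should work in place of the hub id below.
from lmdeploy.model import ChatTemplateConfig
from lmdeploy.turbomind.chat import main

main('internlm/internlm-chat-7b',
     tp=1,
     stream_output=True,
     request_output_len=1024,
     chat_template_cfg=ChatTemplateConfig(model_name='internlm-chat-7b'))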
......@@ -7,10 +7,9 @@ from pathlib import Path
import fire
import torch
from huggingface_hub import snapshot_download
from lmdeploy.model import MODELS
from lmdeploy.turbomind.utils import create_hf_download_args
from lmdeploy.utils import get_model
from .source_model.base import INPUT_MODELS
from .target_model.base import OUTPUT_MODELS, TurbomindModelConfig
......@@ -19,7 +18,8 @@ supported_formats = ['llama', 'hf', 'awq', None]
special_input_model_map = {
'qwen': 'qwen',
'baichuan': 'baichuan',
'baichuan2': 'baichuan2'
'baichuan2': 'baichuan2',
'internlm2': 'internlm2'
}
......@@ -241,8 +241,7 @@ def main(model_name: str,
if not os.path.exists(model_path):
print(f'can\'t find model from local_path {model_path}, '
'try to download from huggingface')
download_kwargs = create_hf_download_args(**kwargs)
model_path = snapshot_download(model_path, **download_kwargs)
model_path = get_model(model_path)
print(f'load model from {model_path}')
# get tokenizer path
......
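The converter's main() (partially shown above) can also be driven directly. Only model_name and model_path appear in the excerpt; the remaining keyword names (dst_path, tp) are assumptions about the rest of its signature.

# Sketch: convert a HF checkpoint into a turbomind workspace.
from lmdeploy.turbomind.deploy.converter import main as convert

convert('internlm2-chat-7b',           # a model name registered in lmdeploy
        '/path/to/internlm2-chat-7b',  # local HF dir or hub model id
        dst_path='./workspace',        # assumed keyword
        tp=1)                          # assumed keyword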
# Copyright (c) OpenMMLab. All rights reserved.
from .baichuan import Baichuan2Model, BaichuanModel # noqa: F401
from .baichuan_awq import Baichuan2AwqModel, BaichuanAwqModel # noqa: F401
from .internlm2 import InternLM2AwqModel, InternLM2Model # noqa: F401
from .llama import LlamaModel # noqa: F401
from .llama_awq import LlamaAwqModel # noqa: F401
from .meta_llama import MetaLlamaModel # noqa: F401
......
......@@ -9,8 +9,9 @@ from .llama import LlamaModel, LlamaReader
class BaichuanReader(LlamaReader):
"""BaichuanReader."""
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
super().__init__(new_params, unused_params, last_bin)
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,
model_cfg: dict):
super().__init__(new_params, unused_params, last_bin, model_cfg)
def _attn(self, i: int, kind: str, size_dim: int, dim: int = 0):
"""Get q, k, v, o kind for layer i."""
......@@ -34,8 +35,9 @@ class BaichuanReader(LlamaReader):
class Baichuan2Reader(BaichuanReader):
"""Baichuan2Reader."""
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
super().__init__(new_params, unused_params, last_bin)
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,
model_cfg: dict):
super().__init__(new_params, unused_params, last_bin, model_cfg)
def output_weight(self):
"""Get output."""
......
......@@ -9,8 +9,9 @@ from .llama_awq import ensure_fp16orint32
class BaichuanAwqReader(BaichuanReader):
"""BaichuanAwqReader."""
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
super().__init__(new_params, unused_params, last_bin)
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,
model_cfg: dict):
super().__init__(new_params, unused_params, last_bin, model_cfg)
def attn(self, i: int):
"""Get q, k, v, o qweight for layer i."""
......@@ -40,8 +41,9 @@ class BaichuanAwqReader(BaichuanReader):
class Baichuan2AwqReader(BaichuanAwqReader):
"""Baichuan2AwqReader."""
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
super().__init__(new_params, unused_params, last_bin)
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,
model_cfg: dict):
super().__init__(new_params, unused_params, last_bin, model_cfg)
def output_weight(self):
"""Get output."""
......
......@@ -2,6 +2,7 @@
import json
import os
import os.path as osp
from glob import glob
import torch
from safetensors.torch import load_file
......@@ -19,11 +20,13 @@ class LlamaReader(BaseReader):
norm_weight_key = 'model.norm.weight'
output_weight_key = 'lm_head.weight'
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,
model_cfg: dict):
super().__init__()
self.params = unused_params
self.params.update(new_params)
self.last_bin = last_bin
self.model_cfg = model_cfg
self.init_layer_id()
def init_layer_id(self):
......@@ -128,13 +131,11 @@ class LlamaModel(BaseInputModel):
def get_ckpt(self):
"""Get weight files."""
suffixes = ['.safetensors', '.bin']
patterns = ['*.safetensors', 'pytorch_model*.bin']
files = []
for suffix in suffixes:
files = [
file for file in os.listdir(self.ckpt_path)
if file.endswith(suffix)
]
for pattern in patterns:
files = glob(os.path.join(self.ckpt_path, pattern))
files = [os.path.basename(file) for file in files]
if len(files) > 0:
break
files = sorted(files)
......@@ -159,7 +160,7 @@ class LlamaModel(BaseInputModel):
else:
new_params = load_file(osp.join(self.ckpt_path, ckpt))
ret = self.Reader(new_params, unused_params,
i == self.nmgrs - 1)
i == self.nmgrs - 1, self.model_info())
yield ret
ret.clean_up(is_last_bin)
except GeneratorExit:
......@@ -181,6 +182,7 @@ class LlamaModel(BaseInputModel):
model_arg = json.load(f)
num_layer = model_arg['num_hidden_layers']
norm_eps = model_arg['rms_norm_eps']
attn_head_num = model_arg['num_attention_heads']
if 'num_key_value_heads' in model_arg:
kv_head_num = model_arg['num_key_value_heads']
else:
......@@ -192,6 +194,7 @@ class LlamaModel(BaseInputModel):
return dict(num_layer=num_layer,
norm_eps=norm_eps,
attn_head_num=attn_head_num,
kv_head_num=kv_head_num,
rope_theta=rope_theta,
max_position_embeddings=max_position_embeddings,
......
......@@ -23,8 +23,9 @@ def ensure_fp16orint32(tensors: torch.Tensor):
class LlamaAwqReader(LlamaReader):
"""LlamaAwqReader."""
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
super().__init__(new_params, unused_params, last_bin)
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,
model_cfg: dict):
super().__init__(new_params, unused_params, last_bin, model_cfg)
def attn(self, i: int):
"""Get q, k, v, o qweight for layer i."""
......
......@@ -16,8 +16,9 @@ class QwenReader(LlamaReader):
norm_weight_key = 'transformer.ln_f.weight'
output_weight_key = 'lm_head.weight'
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
super().__init__(new_params, unused_params, last_bin)
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,
model_cfg: dict):
super().__init__(new_params, unused_params, last_bin, model_cfg)
def _attn(self, i: int, kind: str, size_dim: int, dim: int = 0):
"""Get q, k, v, o kind for layer i."""
......
......@@ -7,8 +7,9 @@ from .qwen import QwenModel, QwenReader
class QwenAwqReader(QwenReader):
"""QwenAwqReader."""
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool):
super().__init__(new_params, unused_params, last_bin)
def __init__(self, new_params: dict, unused_params: dict, last_bin: bool,
model_cfg: dict):
super().__init__(new_params, unused_params, last_bin, model_cfg)
def attn(self, i: int):
"""Get q, k, v, o qweight for layer i."""
......
# Copyright (c) OpenMMLab. All rights reserved.
import configparser
import copy
import inspect
import io
import json
import os.path as osp
from abc import ABC, abstractmethod
from dataclasses import dataclass
from configparser import ConfigParser
import torch
import tqdm
from mmengine import Registry
from pydantic.dataclasses import dataclass
from lmdeploy.messages import TurbomindEngineConfig
from lmdeploy.model import MODELS
from ..source_model.base import BaseInputModel, BaseReader
......@@ -30,18 +35,18 @@ def tprint(*args, **kwargs):
@dataclass
class TurbomindModelConfig:
"""Config for turbomind model."""
model_name: str
tensor_para_size: int
head_num: int
kv_head_num: int
vocab_size: int
num_layer: int
inter_size: int
norm_eps: float
attn_bias: int
start_id: int
end_id: int
session_len: int
model_name: str = None
tensor_para_size: int = None
head_num: int = None
kv_head_num: int = None
vocab_size: int = None
num_layer: int = None
inter_size: int = None
norm_eps: float = None
attn_bias: int = None
start_id: int = None
end_id: int = None
session_len: int = None
weight_type: str = 'fp16'
rotary_embedding: int = 128
rope_theta: float = 10000.0
......@@ -50,9 +55,12 @@ class TurbomindModelConfig:
max_batch_size: int = 64
max_context_token_num: int = 1
step_length: int = 1
cache_max_entry_count: float = 0.5
cache_max_entry_count: float = 0.8
cache_block_seq_len: int = 128
cache_chunk_size: int = 1
cache_chunk_size: int = -1
num_tokens_per_iter: int = 0
max_prefill_iters: int = 1
extra_tokens_per_iter: int = 0
use_context_fmha: int = 1
quant_policy: int = 0
max_position_embeddings: int = 0
......@@ -74,6 +82,34 @@ class TurbomindModelConfig:
default.update(used)
return cls(**default)
@classmethod
def from_engine_config(cls, config: TurbomindEngineConfig):
env = copy.deepcopy(config.__dict__)
env['tensor_para_size'] = env['tp']
ret = TurbomindModelConfig.from_dict(env, allow_none=True)
ret.rotary_embedding = ret.size_per_head
# workaround to support `max_prefill_token_num` in the turbomind engine
if config.max_prefill_token_num is not None and \
config.session_len is not None:
ret.num_tokens_per_iter = config.max_prefill_token_num
ret.max_prefill_iters = (config.session_len +
config.max_prefill_token_num -
1) // config.max_prefill_token_num
return ret
def toini(self):
config = copy.deepcopy(self.__dict__)
parser = ConfigParser()
parser['llama'] = config
with io.StringIO() as ss:
parser.write(ss)
ss.seek(0)
ini = ss.read()
return ini
def __str__(self):
return json.dumps(self.__dict__, indent=2)
@property
def valid(self):
"""Check if cfg is valid."""
......
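A small sketch of how the new from_engine_config/toini pair fits together; the values are illustrative and only fields visible in this diff are used.

# Sketch: derive a turbomind model config from an engine config and render the
# INI text that is handed to the C++ engine.
from lmdeploy.messages import TurbomindEngineConfig
from lmdeploy.turbomind.deploy.target_model.base import TurbomindModelConfig

engine_cfg = TurbomindEngineConfig(tp=2,
                                   session_len=4096,
                                   max_prefill_token_num=1024)
cfg = TurbomindModelConfig.from_engine_config(engine_cfg)
print(cfg.tensor_para_size)                            # 2
print(cfg.num_tokens_per_iter, cfg.max_prefill_iters)  # 1024, 4
print(cfg.toini())                                     # '[llama]' section text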
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
import subprocess
def get_llama_gemm():
"""get the executable binary llama_gemm."""
import os.path as osp
import lmdeploy
......@@ -13,12 +15,52 @@ def get_llama_gemm():
return bin_path
def read_config(ini_path: str):
"""read turbomind config from turbomind.
Args:
ini_path (str): the path of `config.ini` file in turbomind model
"""
from configparser import ConfigParser
from lmdeploy.turbomind.deploy.target_model.base import \
TurbomindModelConfig
with open(ini_path, 'r') as f:
parser = ConfigParser()
parser.read_file(f)
section_name = 'llama'
_cfg = parser._sections[section_name]
cfg = TurbomindModelConfig.from_dict(_cfg)
return cfg.head_num, cfg.size_per_head, cfg.inter_size, \
cfg.vocab_size, cfg.tensor_para_size
def main(head_num: int = 32,
size_per_head: int = 128,
vocab_size: int = 32000,
inter_size: int = 11008,
tensor_para_size: int = 1,
max_batch_size: int = 64):
max_batch_size: int = 64,
model_path: str = None):
if model_path is not None:
from lmdeploy.turbomind.turbomind import get_model_source
from lmdeploy.turbomind.utils import ModelSource
model_source = get_model_source(model_path)
if model_source == ModelSource.WORKSPACE:
head_num, size_per_head, inter_size, vocab_size, \
tensor_para_size = read_config(
osp.join(model_path,
'triton_models', 'weights', 'config.ini'))
else:
from transformers import AutoConfig
config = AutoConfig.from_pretrained(model_path,
trust_remote_code=True)
head_num = config.num_attention_heads
size_per_head = config.hidden_size // head_num
inter_size = config.intermediate_size
vocab_size = config.vocab_size
for bsz in range(1, max_batch_size + 1):
subprocess.call(
f'{get_llama_gemm()} {bsz} 1 1 {head_num} {size_per_head}'
......
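A hedged invocation sketch for the gemm-tuning entry point above; the module path is an assumption about where this main() lives, and the workspace path is a placeholder.

# Sketch: tune GEMM algorithms for a converted workspace or a HF checkpoint.
from lmdeploy.turbomind.generate_gemm_config import main as tune_gemm  # assumed module path

tune_gemm(max_batch_size=64, model_path='./workspace')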
{
"architectures": [
"LMDeployForCausalLM"
],
"auto_map": {
"AutoConfig": "configuration_lmdeploy.LMDeployConfig",
"AutoModel": "modeling_lmdeploy.LMDeployForCausalLM",
"AutoModelForCausalLM": "modeling_lmdeploy.LMDeployForCausalLM"
},
"turbomind": {}
}
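This config.json wires the lmdeploy classes into transformers' auto classes, so a repo shipping it can be loaded through the standard API. The repo id below is taken from the examples cited later in this diff.

# Sketch: auto_map above resolves LMDeployForCausalLM when trust_remote_code is set.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained('lmdeploy/llama2-chat-70b-4bit',
                                             trust_remote_code=True)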
# Copyright (c) OpenMMLab. All rights reserved.
import copy
from transformers import PretrainedConfig
from lmdeploy.turbomind.deploy.target_model.base import TurbomindModelConfig
from lmdeploy.version import __version__ as lm_version
class LMDeployConfig(PretrainedConfig):
"""Lmdeploy config."""
def __init__(self, turbomind: dict = None, **kwargs):
default_tm_cfg = copy.deepcopy(
TurbomindModelConfig.from_dict({}, allow_none=True).__dict__)
if turbomind is not None:
default_tm_cfg.update(turbomind)
self.turbomind = default_tm_cfg
self.lmdeploy_version = lm_version
super().__init__(**kwargs)
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
return_unused_kwargs = kwargs.pop('return_unused_kwargs', False)
config, kwargs = super().from_pretrained(pretrained_model_name_or_path,
return_unused_kwargs=True,
**kwargs)
for k, v in kwargs.items():
if k in config.turbomind.keys():
config.turbomind[k] = v
if 'tp' in kwargs:
config.turbomind['tensor_para_size'] = kwargs['tp']
if return_unused_kwargs:
return config, kwargs
else:
return config
# Copyright (c) OpenMMLab. All rights reserved.
import dataclasses
import os
from contextlib import contextmanager
from dataclasses import dataclass, field
from itertools import count
from queue import Queue
from typing import List, Optional, Tuple, Union
from huggingface_hub import snapshot_download
from transformers import PretrainedConfig
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import logging
from lmdeploy.turbomind import TurboMind
from lmdeploy.turbomind.utils import get_gen_param
from .configuration_lmdeploy import LMDeployConfig
logger = logging.get_logger(__name__)
@dataclass
class Session:
_count = count()
_session_id: int = None
_message: List[Tuple[str, str]] = field(default_factory=list)
_step: int = 0
_nth_round: int = 0
_error: int = 0
def __init__(self):
self._session_id = next(Session._count)
self._message = []
self._step = 0
self._nth_round = 0
@property
def session_id(self):
return self._session_id
@property
def message(self):
return self._message
@property
def step(self):
return self._step
@property
def nth_round(self):
return self._nth_round
@property
def error(self):
return self._error
class LMDeployForCausalLM(PreTrainedModel):
config_class = LMDeployConfig
def __init__(self,
config: LMDeployConfig,
*inputs,
model_path: str = None,
**kwargs):
super().__init__(config)
self.tm_model = TurboMind.from_pretrained(model_path, **kwargs)
que = Queue()
for _ in range(config.turbomind['max_batch_size']):
que.put(self.tm_model.create_instance())
self.que = que
@classmethod
def from_pretrained(cls,
pretrained_model_name_or_path,
*model_args,
config: Optional[Union[PretrainedConfig, str,
os.PathLike]] = None,
cache_dir: Optional[Union[str, os.PathLike]] = None,
force_download: bool = False,
local_files_only: bool = False,
token: Optional[Union[str, bool]] = None,
revision: str = 'main',
**kwargs):
"""Instantiate a LM model with turbomind backend."""
resume_download = kwargs.pop('resume_download', True)
proxies = kwargs.pop('proxies', None)
if os.path.isdir(pretrained_model_name_or_path):
local_folder = pretrained_model_name_or_path
else:
local_folder = snapshot_download(
pretrained_model_name_or_path,
revision=revision,
cache_dir=cache_dir,
proxies=proxies,
resume_download=resume_download,
force_download=force_download,
token=token,
local_files_only=local_files_only,
)
if not isinstance(config, PretrainedConfig):
config_path = config if config is not None else local_folder
kwargs.pop('return_unused_kwargs')
config, model_kwargs = cls.config_class.from_pretrained(
config_path, return_unused_kwargs=True, **kwargs)
else:
model_kwargs = kwargs
model = cls(config,
*model_args,
model_path=local_folder,
**model_kwargs)
generation_config = model.tm_model.model.sampling_param
for k, v in dataclasses.asdict(generation_config).items():
if hasattr(model.generation_config, k):
base_value = getattr(model.generation_config, k)
setattr(generation_config, k, base_value)
if k in kwargs:
setattr(generation_config, k, v)
model.generation_config = generation_config
return model
@contextmanager
def managed_generator(self, session: Session):
generator = self.que.get()
try:
yield generator
except: # noqa E722
for _ in generator.stream_infer(session.session_id, [0],
request_output_len=0,
sequence_start=False,
sequence_end=False,
stop=True):
pass
session._error = 1
finally:
self.que.put(generator)
def generate(
self,
input_ids: List[int],
session: Session,
**kwargs,
):
"""Generates sequences of token ids for models with a language modeling
head.
Args:
input_ids (List(int)): list of input token ids
session (Session): session information
kwargs (dict): ad hoc parametrization of generation
"""
with self.managed_generator(session) as generator:
for outputs in generator.stream_infer(
session_id=session.session_id,
input_ids=[input_ids],
**kwargs,
):
res, tokens = outputs[0]
yield res, tokens
def chat(
self,
query: str,
session: Optional[Session] = None,
cap: str = 'chat',
request_output_len: int = 512,
stream_output: bool = False,
ignore_eos=False,
random_seed: Optional[int] = None,
**kwargs,
) -> Tuple[str, Session]:
"""chat."""
if session is None:
session = Session()
assert session._error == 0, 'An error occurred before, ' \
'please start a new session.'
session._message.append([query, ''])
prompt = self.tm_model.model.get_prompt(query, session.nth_round == 0)
input_ids = self.tm_model.tokenizer.encode(prompt)
if len(
input_ids
) + session.step + request_output_len >= self.tm_model.session_len:
logger.error(
f'session_length exceeded {self.tm_model.session_len}')
session._error = 1
yield '', session
else:
gen_param = get_gen_param(cap, self.generation_config,
session.nth_round + 1, session.step,
request_output_len, **kwargs)
gen_kwargs = dataclasses.asdict(gen_param)
gen_kwargs.update(
random_seed=random_seed if session.nth_round == 0 else None,
stream_output=stream_output,
ignore_eos=ignore_eos,
**kwargs)
_step = session._step
_nth_round = session._nth_round
response_size = 0
for res, tokens in self.generate(input_ids,
session=session,
**gen_kwargs):
response = self.tm_model.tokenizer.decode(res.tolist(),
offset=response_size)
if response.endswith('�'):
continue
response_size = tokens
session._message[-1][-1] += response
session._nth_round = _nth_round + 1
session._step = _step + response_size
yield response, session
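A hedged sketch of multi-round chat through the wrapper above, loaded via the auto classes registered earlier; the repo id is again one of the lmdeploy-quantized examples cited in this diff.

# Sketch: stream one chat turn; the Session object carries nth_round and the
# k/v-cache step across rounds.
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained('lmdeploy/llama2-chat-70b-4bit',
                                             trust_remote_code=True)
session = None
for response, session in model.chat('Hello!', session=session, stream_output=True):
    print(response, end='', flush=True)
print(f'\nround={session.nth_round}, step={session.step}')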
# Copyright (c) OpenMMLab. All rights reserved.
import asyncio
import copy
import io
import json
import logging
import os.path as osp
import sys
from configparser import ConfigParser
from contextlib import contextmanager
from queue import Queue
from queue import LifoQueue, Queue
from threading import Thread
from typing import Iterable, List, Optional
from typing import Iterable, List, Optional, Union
import numpy as np
import torch
from huggingface_hub import snapshot_download
from torch.nn.utils.rnn import pad_sequence
import lmdeploy
from lmdeploy.model import MODELS, BaseModel
from lmdeploy.messages import (EngineGenerationConfig, ResponseType,
TurbomindEngineConfig)
from lmdeploy.model import (MODELS, BaseModel, ChatTemplateConfig,
best_match_model)
from lmdeploy.tokenizer import Tokenizer
from lmdeploy.utils import get_logger
from lmdeploy.utils import _stop_words, get_logger, get_model
from .deploy.converter import (get_model_format, supported_formats,
update_config_weight_type, update_output_format)
from .deploy.source_model.base import INPUT_MODELS
from .deploy.target_model.base import OUTPUT_MODELS, TurbomindModelConfig
from .utils import (ModelSource, check_tm_model_input, create_hf_download_args,
get_hf_config_content, get_model_source)
from .utils import ModelSource, get_model_from_config, get_model_source
# TODO: find another way import _turbomind
lmdeploy_dir = osp.split(lmdeploy.__file__)[0]
sys.path.append(osp.join(lmdeploy_dir, 'lib'))
import _turbomind as _tm # noqa: E402
logger = logging.getLogger(__name__)
logger = get_logger('lmdeploy')
def _stop_words(stop_words: List[str], tokenizer: Tokenizer):
"""return list of stop-words to numpy.ndarray."""
if stop_words is None:
def _construct_stop_or_bad_words(words: List[int] = None):
if words is None or len(words) == 0:
return None
assert isinstance(stop_words, List) and \
all(isinstance(elem, str) for elem in stop_words), \
f'stop_words must be a list but got {type(stop_words)}'
stop_words = [
tokenizer.encode(stop_word, False)[-1] for stop_word in stop_words
]
assert isinstance(stop_words, List) and all(
isinstance(elem, int) for elem in stop_words), 'invalid stop_words'
# each id in stop_words represents a stop word
# refer to https://github.com/fauxpilot/fauxpilot/discussions/165 for
# detailed explanation about fastertransformer's stop_words
stop_word_offsets = range(1, len(stop_words) + 1)
stop_words = np.array([[stop_words, stop_word_offsets]]).astype(np.int32)
return stop_words
offsets = range(1, len(words) + 1)
combined = np.array([[words, offsets]]).astype(np.int32)
return combined
def _np_dict_to_tm_dict(np_dict: dict):
......@@ -77,6 +64,59 @@ def _tm_dict_to_torch_dict(tm_dict: _tm.TensorMap):
return ret
def _update_engine_config(config: TurbomindEngineConfig, **kwargs):
if config is None:
config = TurbomindEngineConfig()
for k, v in kwargs.items():
if v and hasattr(config, k):
setattr(config, k, v)
logger.warning(f'kwargs {k} is deprecated to initialize model, '
'use TurbomindEngineConfig instead.')
if config.model_name is not None:
logger.warning('model_name is deprecated in TurbomindEngineConfig '
'and has no effect')
return config
def _update_tm_config(dst: TurbomindModelConfig, src: TurbomindEngineConfig):
# A workaround to support max token number of each iteration in prefill
if src.max_prefill_token_num is not None and src.session_len is not None:
dst.num_tokens_per_iter = src.max_prefill_token_num
dst.max_prefill_iters = (src.session_len + src.max_prefill_token_num -
1) // src.max_prefill_token_num
dst_dict = copy.deepcopy(dst.__dict__)
src_dict = copy.deepcopy(src.__dict__)
src_dict['tensor_para_size'] = src_dict['tp']
for k, v in src_dict.items():
if v is not None and k in dst_dict:
dst_dict[k] = v
return TurbomindModelConfig.from_dict(dst_dict)
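A worked example of the prefill split above, with illustrative numbers.

# A 4096-token context prefilled in chunks of at most 1024 tokens needs 4 iterations.
session_len, max_prefill_token_num = 4096, 1024
num_tokens_per_iter = max_prefill_token_num
max_prefill_iters = (session_len + max_prefill_token_num - 1) // max_prefill_token_num
assert (num_tokens_per_iter, max_prefill_iters) == (1024, 4)  # ceil(4096 / 1024) == 4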
def _compare_individual_gpu_memory(tp: int):
logger.setLevel(level=logging.INFO)
try:
total_mem = []
free_mem = []
for i in range(tp):
torch.cuda.set_device(i)
free, total = torch.cuda.mem_get_info()
total_mem.append(total / (1024**2))
free_mem.append(free / (1024**2))
all_total_equal = all(total == total_mem[0] for total in total_mem)
all_free_equal = all(free == free_mem[0] for free in free_mem)
if not all_total_equal or not all_free_equal:
logger.warning(
f'Memory discrepancy detected: Total Memory={total_mem} MB, \
Free Memory={free_mem} MB')
except Exception as e:
logger.error(f'An exception occurred: {e}')
@contextmanager
def cuda_ctx(device_id):
old_device = torch.cuda.current_device()
......@@ -102,34 +142,75 @@ class TurboMind:
def __init__(self,
model_path: str,
engine_config: TurbomindEngineConfig = None,
model_source: ModelSource = ModelSource.WORKSPACE,
model_name: Optional[str] = None,
model_format: Optional[str] = None,
group_size: Optional[int] = None,
tp: Optional[int] = None,
chat_template_config: Optional[ChatTemplateConfig] = None,
**kwargs):
# check memory equality when tp
if tp is not None:
assert ((tp & (tp - 1) == 0) and tp != 0), 'tp should be 2^n'
self.gpu_count = tp if tp is not None else 1
if tp > 1:
_compare_individual_gpu_memory(tp)
elif engine_config is not None and engine_config.tp is not None:
if engine_config.tp > 1:
_compare_individual_gpu_memory(engine_config.tp)
# if loading from workspace and engine_config is None, use config.ini
# and ignore passed args like model_format, tp, etc.
if model_source == ModelSource.WORKSPACE and engine_config is None:
def _catch_args(**kwargs):
args = []
for k, v in kwargs.items():
if v and hasattr(TurbomindEngineConfig, k):
args.append(k)
return args
args = _catch_args(**kwargs, model_format=model_format, tp=tp)
if len(args) > 0:
logger.warning(
f'loading from workspace, ignore args {args} '
'please use TurbomindEngineConfig or modify config.ini')
else:
engine_config = _update_engine_config(engine_config,
model_format=model_format,
group_size=group_size,
tp=tp,
**kwargs)
tp = engine_config.tp if engine_config is not None else 1
assert ((tp & (tp - 1) == 0) and tp != 0), 'tp should be 2^n'
self.gpu_count = tp
if model_source == ModelSource.WORKSPACE:
tokenizer_model_path = osp.join(model_path, 'triton_models',
'tokenizer')
self.tokenizer = Tokenizer(tokenizer_model_path)
self.model_comm = self._from_workspace(model_path)
self.model_comm = self._from_workspace(model_path=model_path,
engine_config=engine_config)
else:
if not osp.exists(model_path):
model_path = get_model(model_path, engine_config.download_dir,
engine_config.revision)
self.tokenizer = Tokenizer(model_path)
self.model_comm = self._from_hf(model_source=model_source,
model_path=model_path,
model_name=model_name,
model_format=model_format,
group_size=group_size,
tp=tp,
**kwargs)
engine_config=engine_config)
if chat_template_config:
if chat_template_config.model_name is None:
chat_template_config.model_name = self.model_name
logger.warning(f'Input chat template with model_name is None. '
f'Forcing to use {self.model_name}')
self.model = chat_template_config.chat_template
else:
self.model: BaseModel = MODELS.get(self.model_name)(**kwargs)
self.session_len = self.config.session_len
self.eos_id = self.tokenizer.eos_token_id
self.model: BaseModel = MODELS.get(self.model_name)(**kwargs)
self.session_len = self.model.session_len
self.stop_words = _stop_words(self.model.stop_words, self.tokenizer)
def _create_weight(self, model_comm):
......@@ -194,88 +275,61 @@ class TurboMind:
tm_params[k] = []
tm_params[k].append(v)
def _from_hf(self,
model_source: ModelSource,
model_path: str,
model_name: Optional[str] = None,
model_format: Optional[str] = None,
group_size: Optional[int] = None,
tp: Optional[int] = None,
**kwargs):
def _from_hf(self, model_source: ModelSource, model_path: str,
engine_config: TurbomindEngineConfig):
"""Load model which is in hf format."""
# get model_name, group_size if is lmdeploy managed.
if model_source == ModelSource.HF_LMDEPLOY:
config = get_hf_config_content(model_path, local_files_only=True)
tm_config = config['turbomind']
tm_config.update(kwargs)
var_shoud_be_none = dict(model_name=model_name,
model_format=model_format,
group_size=group_size)
for key, value in var_shoud_be_none.items():
assert value is None, f'{key} should be None when model is '\
f'from {model_source}'
model_name = tm_config['model_name']
group_size = tm_config['group_size']
if tm_config['weight_type'] == 'int4':
model_format = 'awq'
else:
assert model_name is not None, 'please supply model_name when ' \
f'model is form {model_source}'
if osp.exists(osp.join(model_path, 'outputs_stats.pth')):
model_format = 'awq' if model_format is None else model_format
group_size = 128 if group_size is None else group_size
tm_config = kwargs
assert model_name in MODELS.module_dict.keys(), \
f"'{model_name}' is not supported. " \
f'The supported models are: {MODELS.module_dict.keys()}'
assert model_format in supported_formats, 'the model format ' \
f'should be in {supported_formats}'
assert model_source == ModelSource.HF_MODEL, \
f'{model_source} is not supported'
assert engine_config.model_format in supported_formats, \
f'The model format should be in {supported_formats}'
# update model_format if not supplied and outputs_stats.pth exists
if osp.exists(osp.join(model_path, 'outputs_stats.pth')) and \
engine_config.model_format is None:
engine_config.model_format = 'awq'
# when convert model, use architectures in config.json
model_arch = get_model_from_config(model_path)
data_type = 'fp16'
output_format = 'fp16'
inferred_model_format = get_model_format(model_name, model_format)
cfg = TurbomindModelConfig.from_dict(tm_config, allow_none=True)
# overwrite with input params
cfg.model_name = model_name
cfg.tensor_para_size = 1 if tp is None else tp
cfg.rotary_embedding = cfg.size_per_head
cfg.group_size = group_size
inferred_model_format = get_model_format(model_arch,
engine_config.model_format)
cfg = TurbomindModelConfig.from_engine_config(engine_config)
match_name = best_match_model(model_path)
# for session len
cfg.model_name = match_name \
if match_name is not None else 'base'
if inferred_model_format.find('awq') != -1:
cfg.weight_type = 'int4'
output_format = 'w4'
data_type = 'int4'
assert group_size > 0, f'group_size: {group_size} should > 0'
cfg.group_size = 128
else:
output_format = update_output_format(model_name,
output_format = update_output_format(cfg.model_name,
inferred_model_format,
model_path, output_format)
data_type = output_format
update_config_weight_type(output_format, cfg)
self.config = cfg
self.model_name = model_name
self.data_type = data_type
input_model = INPUT_MODELS.get(inferred_model_format)(
model_path=model_path, tokenizer_path=model_path, ckpt_path=None)
output_model = OUTPUT_MODELS.get(output_format)(
input_model=input_model, cfg=cfg, to_file=False, out_dir='')
config = copy.deepcopy(output_model.cfg.__dict__)
logger.warning(f'model_config:\n{json.dumps(config, indent=2)}')
parser = ConfigParser()
parser['llama'] = config
with io.StringIO() as ss:
parser.write(ss)
ss.seek(0)
config = ss.read()
cfg = output_model.cfg
if engine_config.session_len is not None:
cfg.session_len = engine_config.session_len
self.model_name = cfg.model_name
self.config = cfg
self.data_type = data_type
logger.warning(f'model_config:\n\n{cfg.toini()}')
model_comm = _tm.AbstractTransformerModel.create_llama_model(
model_dir='',
config=config,
config=cfg.toini(),
tensor_para_size=self.gpu_count,
data_type=data_type)
......@@ -289,35 +343,48 @@ class TurboMind:
output_model.export()
# load kv qparams
self._load_kv_qparams(model_path, tm_params, **kwargs)
self._load_kv_qparams(model_path, tm_params, kv_sym=False, kv_bits=8)
assert len(tm_params) == 0, f'missing {tm_params.keys()}'
return model_comm
def _from_workspace(self, model_path: str):
def _from_workspace(self, model_path: str,
engine_config: TurbomindEngineConfig):
"""Load model which is converted by `lmdeploy convert`"""
ini_path = osp.join(model_path, 'triton_models', 'weights',
'config.ini')
# load cfg
with open(ini_path, 'r') as f:
parser = ConfigParser()
parser.read_file(f)
section_name = 'llama'
tp_cfg = parser.getint(section_name, 'tensor_para_size')
if tp_cfg != 1 and tp_cfg != self.gpu_count:
get_logger('turbomind').info(
f'found tp={tp_cfg} in config.ini.')
self.gpu_count = tp_cfg
self.model_name = parser.get(section_name, 'model_name')
self.data_type = parser.get(section_name, 'weight_type')
cfg = parser._sections[section_name]
cfg = TurbomindModelConfig.from_dict(cfg)
self.config = cfg
section_name = 'llama'
_cfg = parser._sections[section_name]
cfg = TurbomindModelConfig.from_dict(_cfg)
# check whether input tp is valid
if cfg.tensor_para_size != 1 and \
self.gpu_count != cfg.tensor_para_size:
logger.info(f'found tp={cfg.tensor_para_size} in config.ini.')
self.gpu_count = cfg.tensor_para_size
# update cfg
if engine_config is not None:
engine_config.tp = cfg.tensor_para_size
cfg = _update_tm_config(cfg, engine_config)
if engine_config.session_len is not None:
cfg.session_len = engine_config.session_len
# update cls
self.config = cfg
self.model_name = cfg.model_name
self.data_type = cfg.weight_type
# create model
logger.warning(f'model_config:\n\n{cfg.toini()}')
weight_dir = osp.join(model_path, 'triton_models', 'weights')
model_comm = _tm.AbstractTransformerModel.create_llama_model(
weight_dir,
model_dir=weight_dir,
config=cfg.toini(),
tensor_para_size=self.gpu_count,
data_type=self.data_type)
......@@ -326,13 +393,16 @@ class TurboMind:
return model_comm
@classmethod
def from_pretrained(cls,
pretrained_model_name_or_path: str,
model_name: Optional[str] = None,
model_format: Optional[str] = None,
group_size: Optional[int] = None,
tp: Optional[int] = None,
**kwargs):
def from_pretrained(
cls,
pretrained_model_name_or_path: str,
engine_config: TurbomindEngineConfig = None,
model_name: Optional[str] = None,
model_format: Optional[str] = None,
group_size: Optional[int] = None,
tp: Optional[int] = None,
chat_template_config: Optional[ChatTemplateConfig] = None,
**kwargs):
"""LMDeploy's turbomind inference engine.
Args:
......@@ -346,7 +416,7 @@ class TurboMind:
"InternLM/internlm-chat-20b-4bit",
"lmdeploy/llama2-chat-70b-4bit", etc.
- iii) The model_id of a model hosted inside a model repo
on huggingface.co, such as "InternLM/internlm-chat-7b",
on huggingface.co, such as "internlm/internlm-chat-7b",
"Qwen/Qwen-7B-Chat ", "baichuan-inc/Baichuan2-7B-Chat"
and so on.
model_name (str): needed when pretrained_model_name_or_path is iii)
......@@ -357,26 +427,14 @@ class TurboMind:
Can be used to update configuration when initialize the engine.
"""
model_source = get_model_source(pretrained_model_name_or_path)
if model_source == ModelSource.WORKSPACE:
local_path = pretrained_model_name_or_path
else:
check_tm_model_input(pretrained_model_name_or_path,
model_name=model_name,
**kwargs)
if not osp.exists(pretrained_model_name_or_path):
download_kwargs = create_hf_download_args(**kwargs)
local_path = snapshot_download(pretrained_model_name_or_path,
**download_kwargs)
else:
local_path = pretrained_model_name_or_path
logger.warning(f'model_source: {model_source}')
return cls(model_source=model_source,
model_path=local_path,
model_name=model_name,
return cls(model_path=pretrained_model_name_or_path,
engine_config=engine_config,
model_source=model_source,
model_format=model_format,
group_size=group_size,
tp=tp,
chat_template_config=chat_template_config,
**kwargs)
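An end-to-end sketch of the refactored entry point above; the model id and generation settings are placeholders, and only parameters visible in this diff are used.

# Sketch: build the engine from a HF model with TurbomindEngineConfig, then stream
# one prompt through a generator instance.
from lmdeploy.messages import EngineGenerationConfig, TurbomindEngineConfig
from lmdeploy.turbomind import TurboMind

tm_model = TurboMind.from_pretrained(
    'internlm/internlm-chat-7b',  # placeholder hub id or local path
    engine_config=TurbomindEngineConfig(tp=1, session_len=4096))
generator = tm_model.create_instance()
input_ids = tm_model.tokenizer.encode('Hello')
for status, res, tokens in generator.stream_infer(
        session_id=1,
        input_ids=[input_ids],
        gen_config=EngineGenerationConfig(max_new_tokens=128, top_k=40)):
    pass
print(tm_model.tokenizer.decode(res))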
def create_instance(self, cuda_stream_id=0):
......@@ -406,8 +464,6 @@ class TurboMindInstance:
self.gpu_count = tm_model.gpu_count
self.stop_words = tm_model.stop_words
self.stop_tokens = [] if self.stop_words is None else \
self.stop_words.flatten().tolist()
self.eos_id = tm_model.eos_id
self.session_len = tm_model.session_len
......@@ -456,23 +512,92 @@ class TurboMindInstance:
t.start()
self.threads[device_id] = t
def _async_forward_callback(self, result, ctx, que: LifoQueue):
que.put((False, result))
def _async_forward_thread(self, inputs, que: LifoQueue):
instance_comm = self.tm_model.model_comm.create_instance_comm(
self.gpu_count)
def _func(device_id, enque_output):
with cuda_ctx(device_id):
output = self.model_insts[device_id].forward(
inputs, instance_comm)
if enque_output:
que.put((True, output))
for device_id in range(self.gpu_count):
t = Thread(target=_func,
args=(device_id, device_id == 0),
daemon=True)
t.start()
self.threads[device_id] = t
def _update_generation_config(self, config: EngineGenerationConfig,
**kwargs: dict):
if config is None:
config = EngineGenerationConfig()
# backward compatibility
# if stop words are not supplied, use the default
if config.stop_words is None and self.stop_words is not None:
config.stop_words = self.stop_words[0][0].tolist()
deprecated_kwargs = []
for k, v in kwargs.items():
if k in config.__dict__:
config.__dict__[k] = v
deprecated_kwargs.append(k)
if 'request_output_len' in kwargs:
config.max_new_tokens = kwargs['request_output_len']
deprecated_kwargs.append('request_output_len')
for k in deprecated_kwargs:
logger.warning(f'kwargs {k} is deprecated for inference, '
'use GenerationConfig instead.')
return config
def end(self, session_id: int):
"""End the given session."""
input_ids = [self.tm_model.tokenizer.eos_token_id]
end_generator = self.tm_model.create_instance()
for outputs in end_generator.stream_infer(session_id,
input_ids,
request_output_len=0,
sequence_start=False,
sequence_end=True):
pass
async def async_end(self, session_id: int):
"""End the given session."""
self.end(session_id)
await asyncio.sleep(0.002)
def cancel(self, session_id: int):
"""Stop current streaming inference."""
input_ids = [self.tm_model.tokenizer.eos_token_id]
stop_generator = self.tm_model.create_instance()
for outputs in stop_generator.stream_infer(session_id,
input_ids,
request_output_len=0,
sequence_start=False,
sequence_end=False,
stop=True):
pass
async def async_cancel(self, session_id: int):
"""End the given session."""
self.cancel(session_id)
await asyncio.sleep(0.002)
def prepare_inputs(self,
session_id,
input_ids,
gen_config: EngineGenerationConfig,
input_embeddings=None,
input_embedding_ranges=None,
request_output_len: int = 512,
sequence_start: bool = True,
sequence_end: bool = False,
step=0,
stop=False,
top_p=0.8,
top_k=40,
temperature=0.8,
repetition_penalty=1.0,
ignore_eos=False,
random_seed=None,
stream_output=False):
stop=False):
"""Convert inputs format."""
if len(input_ids) == 0:
input_ids = [[]]
......@@ -504,19 +629,16 @@ class TurboMindInstance:
input_ids=input_ids,
input_lengths=input_lengths,
request_output_len=np.full(input_lengths.shape,
request_output_len,
gen_config.max_new_tokens,
dtype=np.uint32),
runtime_top_k=_broadcast_np(top_k, np.uint32),
runtime_top_p=_broadcast_np(top_p, np.float32),
temperature=_broadcast_np(temperature, np.float32),
repetition_penalty=_broadcast_np(repetition_penalty, np.float32),
runtime_top_k=_broadcast_np(gen_config.top_k, np.uint32),
runtime_top_p=_broadcast_np(gen_config.top_p, np.float32),
temperature=_broadcast_np(gen_config.temperature, np.float32),
repetition_penalty=_broadcast_np(gen_config.repetition_penalty,
np.float32),
step=step,
# session input
session_len=self.session_len *
np.ones([
batch_size,
], dtype=np.uint32),
START=_broadcast_np((1 if sequence_start else 0), np.int32),
END=_broadcast_np((1 if sequence_end else 0), np.int32),
CORRID=np.array(session_id, dtype=np.uint64),
......@@ -560,20 +682,29 @@ class TurboMindInstance:
inputs['input_embeddings'] = input_embeddings
inputs['input_embedding_ranges'] = input_embedding_ranges
if ignore_eos:
if gen_config.min_new_tokens is not None:
inputs['min_length'] = _broadcast_np(gen_config.min_new_tokens,
np.int32)
bad_words = []
if gen_config.bad_words is not None:
bad_words.extend(gen_config.bad_words)
if gen_config.ignore_eos:
stop_words = None
bad_words = torch.tensor([[[self.eos_id], [1]]], dtype=torch.int32)
bad_words.append(self.eos_id)
else:
stop_words = self.stop_words
bad_words = None
stop_words = gen_config.stop_words
stop_words = _construct_stop_or_bad_words(stop_words)
bad_words = _construct_stop_or_bad_words(bad_words)
if stop_words is not None:
inputs['stop_words_list'] = stop_words
if bad_words is not None:
inputs['bad_words_list'] = bad_words
if random_seed is not None:
inputs['random_seed'] = _broadcast_np(random_seed, np.uint64)
if gen_config.random_seed is not None:
inputs['random_seed'] = _broadcast_np(gen_config.random_seed,
np.uint64)
return inputs, input_lengths
async def async_stream_infer(self,
......@@ -581,18 +712,13 @@ class TurboMindInstance:
input_ids,
input_embeddings=None,
input_embedding_ranges=None,
request_output_len: int = 512,
sequence_start: bool = True,
sequence_end: bool = False,
step=0,
stop=False,
top_p=0.8,
top_k=40,
temperature=0.8,
repetition_penalty=1.0,
ignore_eos=False,
random_seed=None,
stream_output=False):
gen_config: EngineGenerationConfig = None,
stream_output=False,
**kwargs):
"""Perform model inference.
Args:
......@@ -601,60 +727,46 @@ class TurboMindInstance:
input_embeddings (List[numpy.ndarray]): embeddings features
input_embedding_ranges (List[Tuple[int,int]]): the begin/end
offsets of input_embeddings to input_ids
request_output_len (int): the max number of to-be-generated tokens
sequence_start (bool): indicator for starting a sequence
sequence_end (bool): indicator for ending a sequence
step (int): the offset of the k/v cache
stop (bool): indicator for cancelling the session
top_p (float): If set to float < 1, only the smallest set of most
probable tokens with probabilities that add up to top_p or higher
are kept for generation.
top_k (int): The number of the highest probability vocabulary
tokens to keep for top-k-filtering
temperature (float): to modulate the next token probability
repetition_penalty (float): The parameter for repetition penalty.
1.0 means no penalty
ignore_eos (bool): indicator for ignoring eos
random_seed (int): seed used by sampling
gen_config (EngineGenerationConfig): generation config
stream_output (bool): indicator for stream output
kwargs (dict): kwargs for backward compatibility
"""
# start forward thread
que = LifoQueue()
from functools import partial
_forward_callback = partial(self._async_forward_callback, que=que)
_forward_thread = partial(self._async_forward_thread, que=que)
if stream_output and not stop:
self.model_insts[0].register_callback(self._forward_callback)
self.model_insts[0].register_callback(_forward_callback)
gen_config = self._update_generation_config(gen_config, **kwargs)
inputs, input_lengths = self.prepare_inputs(
session_id=session_id,
input_ids=input_ids,
input_embeddings=input_embeddings,
input_embedding_ranges=input_embedding_ranges,
request_output_len=request_output_len,
sequence_start=sequence_start,
sequence_end=sequence_end,
step=step,
stop=stop,
top_p=top_p,
top_k=top_k,
temperature=temperature,
repetition_penalty=repetition_penalty,
ignore_eos=ignore_eos,
random_seed=random_seed,
stream_output=stream_output)
gen_config=gen_config)
tm_inputs = _np_dict_to_tm_dict(inputs)
# start forward thread
self.que = Queue()
self._forward_thread(tm_inputs)
_forward_thread(tm_inputs)
seq_start = input_lengths + input_lengths.new_tensor(step)
prev_len = 0
# generator
while True:
# Thanks for https://github.com/frankxyy and his issue
# https://github.com/InternLM/lmdeploy/issues/832
while self.que.qsize() == 0:
await asyncio.sleep(0)
while self.que.qsize() > 1:
self.que.get()
while que.qsize() == 0: # let other requests in
await asyncio.sleep(0.002)
finish, tm_outputs = self.que.get()
finish, tm_outputs = que.get()
outputs = _tm_dict_to_torch_dict(tm_outputs)
......@@ -667,22 +779,27 @@ class TurboMindInstance:
sequence_length -= seq_start.to(sequence_length.device)
outputs = []
status = ResponseType.FINISH if finish else ResponseType.SUCCESS
for output, len_ in zip(output_ids, sequence_length):
output, len_ = output, len_.item()
if len(output) > 0 and output[-1].item(
) == self.eos_id and not ignore_eos:
outputs.append((output[:-1], len_ - 1))
elif len(output) > 0 and output[-1].item() in self.stop_tokens:
outputs.append((output[:-1], len_))
if len(output) > 0 and output[-1].item() == self.eos_id \
and not gen_config.ignore_eos:
outputs = (status, output[:-1].tolist(), len_ - 1)
elif len(output) > 0 and \
gen_config.stop_words is not None and \
output[-1].item() in gen_config.stop_words:
outputs = (status, output[:-1].tolist(), len_)
else:
outputs.append((output, len_))
outputs = (status, output.tolist(), len_)
if outputs[-1] < prev_len and not finish:
continue
else:
prev_len = outputs[-1]
yield outputs
if finish:
for t in self.threads:
t.join()
while self.que.qsize() > 0:
self.que.get()
break
if stream_output and not stop:
......@@ -693,18 +810,13 @@ class TurboMindInstance:
input_ids,
input_embeddings=None,
input_embedding_ranges=None,
request_output_len: int = 512,
sequence_start: bool = True,
sequence_end: bool = False,
step=0,
stop=False,
top_p=0.8,
top_k=40,
temperature=0.8,
repetition_penalty=1.0,
ignore_eos=False,
random_seed=None,
stream_output=False):
gen_config: EngineGenerationConfig = None,
stream_output=False,
**kwargs):
"""Perform model inference.
Args:
......@@ -713,42 +825,28 @@ class TurboMindInstance:
input_embeddings (List[numpy.ndarray]): embeddings features
input_embedding_ranges (List[Tuple[int,int]]): the begin/end
offsets of input_embeddings to input_ids
request_output_len (int): the max number of to-be-generated tokens
sequence_start (bool): indicator for starting a sequence
sequence_end (bool): indicator for ending a sequence
step (int): the offset of the k/v cache
stop (bool): indicator for cancelling the session
top_p (float): If set to float < 1, only the smallest set of most
probable tokens with probabilities that add up to top_p or higher
are kept for generation.
top_k (int): The number of the highest probability vocabulary
tokens to keep for top-k-filtering
temperature (float): to modulate the next token probability
repetition_penalty (float): The parameter for repetition penalty.
1.0 means no penalty
ignore_eos (bool): indicator for ignoring eos
random_seed (int): seed used by sampling
gen_config (EngineGenerationConfig): generation config
stream_output (bool): indicator for stream output
kwargs (dict): kwargs for backward compatibility
"""
if stream_output and not stop:
self.model_insts[0].register_callback(self._forward_callback)
gen_config = self._update_generation_config(gen_config, **kwargs)
inputs, input_lengths = self.prepare_inputs(
session_id=session_id,
input_ids=input_ids,
input_embeddings=input_embeddings,
input_embedding_ranges=input_embedding_ranges,
request_output_len=request_output_len,
sequence_start=sequence_start,
sequence_end=sequence_end,
step=step,
stop=stop,
top_p=top_p,
top_k=top_k,
temperature=temperature,
repetition_penalty=repetition_penalty,
ignore_eos=ignore_eos,
random_seed=random_seed,
stream_output=stream_output)
gen_config=gen_config)
tm_inputs = _np_dict_to_tm_dict(inputs)
# start forward thread
......@@ -775,15 +873,18 @@ class TurboMindInstance:
sequence_length -= seq_start.to(sequence_length.device)
outputs = []
status = ResponseType.FINISH if finish else ResponseType.SUCCESS
for output, len_ in zip(output_ids, sequence_length):
output, len_ = output, len_.item()
if len(output) > 0 and output[-1].item(
) == self.eos_id and not ignore_eos:
outputs.append((output[:-1], len_ - 1))
elif len(output) > 0 and output[-1].item() in self.stop_tokens:
outputs.append((output[:-1], len_))
if len(output) > 0 and output[-1].item() == self.eos_id \
and not gen_config.ignore_eos:
outputs = (status, output[:-1].tolist(), len_ - 1)
elif len(output) > 0 and \
gen_config.stop_words is not None and \
output[-1].item() in gen_config.stop_words:
outputs = (status, output[:-1].tolist(), len_)
else:
outputs.append((output, len_))
outputs = (status, output.tolist(), len_)
yield outputs
if finish:
......@@ -796,17 +897,27 @@ class TurboMindInstance:
if stream_output and not stop:
self.model_insts[0].unregister_callback()
def decode(self, input_ids):
def decode(self,
input_ids,
steps: List[int] = None,
sequence_start: bool = True,
sequence_end: bool = True):
"""Perform context decode on input tokens.
Args:
input_ids (numpy.ndarray): the batch of input token ids
steps (List[int]): the offset of the k/v cache
sequence_start (bool): indicator for starting a sequence
sequence_end (bool): indicator for ending a sequence
"""
if len(input_ids) == 0:
input_ids = [[]]
if isinstance(input_ids[0], int):
input_ids = [input_ids]
if steps is None:
steps = [0] * len(input_ids)
assert isinstance(steps, List) and len(steps) == len(input_ids)
# append an extra token since input_len-1 tokens will be
# decoded by context decoder
......@@ -827,11 +938,16 @@ class TurboMindInstance:
input_ids = pad_sequence(input_ids,
batch_first=True,
padding_value=self.eos_id)
steps = torch.IntTensor([step for step in steps])
inputs = dict(input_ids=input_ids,
input_lengths=input_lengths,
request_output_len=_broadcast_np(0, dtype=np.uint32),
is_return_logits=_broadcast_np(1, np.uint32))
is_return_logits=_broadcast_np(1, np.uint32),
START=_broadcast_np((1 if sequence_start else 0),
np.int32),
END=_broadcast_np((1 if sequence_end else 0), np.int32),
step=steps)
tm_inputs = _np_dict_to_tm_dict(inputs)
......@@ -844,3 +960,83 @@ class TurboMindInstance:
logits = outputs['logits']
return logits[:, :-1, :]
def get_ppl(self, input_ids: Union[List[int], List[List[int]]]):
"""Get perplexity scores given a list of input tokens.
Args:
input_ids (Union[List[int], List[List[int]]]): the batch of input token ids
""" # noqa 501
if len(input_ids) == 0:
input_ids = [[]]
if isinstance(input_ids[0], int):
input_ids = [input_ids]
max_input_len = 16 * 1024
# max_input_len = 16
n_max_iter = np.ceil(
max([len(input_id)
for input_id in input_ids]) / max_input_len).astype(int)
device = 'cpu' if n_max_iter > 1 else 'cuda'
index_range_starts = []
index_range_ends = []
for input_id in input_ids:
index_range_start = np.array(
[i * max_input_len for i in range(n_max_iter)])
index_range_end = index_range_start + max_input_len
index_range_start[index_range_start >= len(input_id)] = len(
input_id)
index_range_end[index_range_end >= len(input_id)] = len(input_id)
index_range_starts.append(index_range_start)
index_range_ends.append(index_range_end)
logits = []
for i in range(n_max_iter):
steps = [start[i] for start in index_range_starts]
_input_ids = [
input_id[start[i]:end[i]] for input_id, start, end in zip(
input_ids, index_range_starts, index_range_ends)
]
_logits = self.decode(_input_ids,
steps,
sequence_start=(i == 0),
sequence_end=(i == n_max_iter - 1))
_logits = _logits.to(device=device)
logits.append(_logits)
# concat logits. Shape is [bsz, seq_len, vocab_size]
logits = torch.cat(logits, dim=1)
# get target ids
padding_token_id = -100
target_ids = [(_input_ids + [padding_token_id])[1:]
for _input_ids in input_ids]
target_ids = [
torch.Tensor(torch.LongTensor(_target_ids))
for _target_ids in target_ids
]
target_ids = pad_sequence(target_ids,
batch_first=True,
padding_value=padding_token_id)
target_ids = target_ids.to(logits.device)
target_mask = target_ids != padding_token_id
target_count = torch.sum(target_mask, dim=-1)
# compute cross entropy loss
bsz, seq_len, vocab_size = logits.shape
flat_logits = logits.contiguous().view(-1, vocab_size)
flat_target_ids = target_ids.contiguous().view(-1)
flat_loss_matrix = torch.nn.functional.cross_entropy(
flat_logits,
flat_target_ids,
reduction='none',
ignore_index=padding_token_id)
loss_matrix = flat_loss_matrix.view(bsz, seq_len)
loss_sum = torch.sum(loss_matrix * target_mask, dim=1)
loss_avg = loss_sum / target_count
loss_avg = loss_avg.cpu().numpy()
return loss_avg
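A short usage sketch for get_ppl above; the model id is a placeholder.

# Sketch: perplexity of a piece of text. get_ppl returns the mean token NLL per
# sequence, so exp() of it gives the perplexity.
import numpy as np

from lmdeploy.turbomind import TurboMind

tm_model = TurboMind.from_pretrained('internlm/internlm-chat-7b')  # placeholder
generator = tm_model.create_instance()
token_ids = tm_model.tokenizer.encode('The quick brown fox jumps over the lazy dog.')
loss_avg = generator.get_ppl([token_ids])
print('ppl:', float(np.exp(loss_avg[0])))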
# Copyright (c) OpenMMLab. All rights reserved.
import dataclasses
import json
import logging
import os
from huggingface_hub import hf_hub_download
from transformers.utils import ExplicitEnum
logger = logging.getLogger(__name__)
from lmdeploy.utils import get_logger
logger = get_logger('lmdeploy')
class ModelSource(ExplicitEnum):
"""Turbomind model source."""
WORKSPACE = 'workspace'
HF_MODEL = 'hf_model'
HF_LMDEPLOY = 'hf_lmdeploy'
def create_hf_download_args(**kwargs) -> dict:
download_kwargs = {
'revision': None,
'cache_dir': None,
'proxies': None,
'resume_download': True,
'force_download': False,
'token': None,
'local_files_only': False
}
for k in download_kwargs.keys():
if k in kwargs:
download_kwargs[k] = kwargs[k]
return download_kwargs
def get_hf_config_path(pretrained_model_name_or_path, **kwargs) -> str:
"""Get local hf config local file path."""
if os.path.exists(pretrained_model_name_or_path):
config_path = os.path.join(pretrained_model_name_or_path,
'config.json')
else:
download_kwargs = create_hf_download_args(**kwargs)
config_path = hf_hub_download(pretrained_model_name_or_path,
'config.json', **download_kwargs)
return config_path
def get_hf_config_content(pretrained_model_name_or_path, **kwargs) -> dict:
"""Get config content of a hf model."""
config_path = get_hf_config_path(pretrained_model_name_or_path, **kwargs)
with open(config_path, 'r') as f:
config = json.load(f)
return config
def get_model_source(pretrained_model_name_or_path: str,
......@@ -60,61 +21,33 @@ def get_model_source(pretrained_model_name_or_path: str,
'triton_models')
if os.path.exists(triton_model_path):
return ModelSource.WORKSPACE
config = get_hf_config_content(pretrained_model_name_or_path, **kwargs)
model_source = ModelSource.HF_LMDEPLOY if 'turbomind' in config \
else ModelSource.HF_MODEL
return model_source
def check_tm_model_input(pretrained_model_name_or_path, **kwargs):
"""Check if single input pretrained_model_name_or_path is enough to use."""
if kwargs.get('model_name', None):
return
model_source = get_model_source(pretrained_model_name_or_path, **kwargs)
if model_source == ModelSource.WORKSPACE:
return
return ModelSource.HF_MODEL
config = get_hf_config_content(pretrained_model_name_or_path, **kwargs)
if 'turbomind' in config and config['turbomind']['model_name'] != '':
return
assert (0), '\nCan not get model name from input model, '\
'please supply model name with arg --model-name,' \
'you can list supported models by `lmdeploy list`'
@dataclasses.dataclass
class GenParam:
    top_p: float
    top_k: float
    temperature: float
    repetition_penalty: float
    sequence_start: bool = False
    sequence_end: bool = False
    step: int = 0
    request_output_len: int = 512


def get_gen_param(cap,
                  sampling_param,
                  nth_round,
                  step,
                  request_output_len=512,
                  **kwargs):
    """return parameters used by token generation."""
    gen_param = GenParam(**dataclasses.asdict(sampling_param),
                         request_output_len=request_output_len)
    # Fix me later. turbomind.py doesn't support None top_k
    if gen_param.top_k is None:
        gen_param.top_k = 40
    if cap == 'chat':
        gen_param.sequence_start = (nth_round == 1)
        gen_param.sequence_end = False
        gen_param.step = step
    else:
        gen_param.sequence_start = True
        gen_param.sequence_end = True
        gen_param.step = 0
    return gen_param


def get_model_from_config(model_dir: str):
    import json
    config_file = os.path.join(model_dir, 'config.json')
    default = 'llama'
    if not os.path.exists(config_file):
        return default

    with open(config_file) as f:
        config = json.load(f)

    ARCH_MAP = {
        'LlavaLlamaForCausalLM': default,
        'LlamaForCausalLM': default,
        'InternLM2ForCausalLM': 'internlm2',
        'InternLMForCausalLM': default,
        'BaiChuanForCausalLM': 'baichuan',  # Baichuan-7B
        'BaichuanForCausalLM': 'baichuan2',  # not right for Baichuan-13B-Chat
        'QWenLMHeadModel': 'qwen',
    }

    arch = 'LlamaForCausalLM'
    if 'auto_map' in config:
        arch = config['auto_map']['AutoModelForCausalLM'].split('.')[-1]
    elif 'architectures' in config:
        arch = config['architectures'][0]

    return ARCH_MAP[arch]
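# A minimal sketch of resolving the chat-template name for a checkpoint; the
# directory below is a hypothetical example, and the lookup falls back to
# 'llama' when no config.json is present.
def _example_resolve_model_name(
        model_dir: str = './models/internlm2-chat-7b') -> str:
    name = get_model_from_config(model_dir)
    # e.g. 'internlm2' for an InternLM2ForCausalLM checkpoint
    return name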
# Copyright (c) OpenMMLab. All rights reserved.
import asyncio
import functools
import json
import logging
import os
import sys
import time
from contextlib import contextmanager
from logging import Logger, LogRecord
from typing import List, Optional
from huggingface_hub import hf_hub_download
logger_initialized = {}
def get_logger(name: str,
log_file: Optional[str] = None,
log_level: int = logging.INFO,
file_mode: str = 'w'):
class _ANSI_COLOR:
BRIGHT_RED = '\033[91m'
RED = '\033[31m'
YELLOW = '\033[33m'
WHITE = '\033[37m'
GREEN = '\033[32m'
class ColorFormatter(logging.Formatter):
    _LEVELNAME_COLOR_MAP = dict(CRITICAL=_ANSI_COLOR.BRIGHT_RED,
                                ERROR=_ANSI_COLOR.RED,
                                WARN=_ANSI_COLOR.YELLOW,
                                WARNING=_ANSI_COLOR.YELLOW,
                                INFO=_ANSI_COLOR.WHITE,
                                DEBUG=_ANSI_COLOR.GREEN)
_RESET_COLOR = '\033[0m'
def format(self, record: LogRecord):
"""format."""
if sys.platform == 'win32':
            # Windows does not support ANSI color
return super().format(record)
levelname = record.levelname
level_color = self._LEVELNAME_COLOR_MAP.get(levelname,
self._RESET_COLOR)
levelname = f'{level_color}{levelname}{self._RESET_COLOR}'
record.levelname = levelname
return super().format(record)
class FilterDuplicateWarning(logging.Filter):
"""Filter the repeated warning message.
Args:
name (str): name of the filter.
"""
def __init__(self, name: str = 'lmdeploy'):
super().__init__(name)
self.seen: set = set()
def filter(self, record: LogRecord) -> bool:
"""Filter the repeated warning message.
Args:
record (LogRecord): The log record.
Returns:
bool: Whether to output the log record.
"""
if record.levelno != logging.WARNING:
return True
if record.msg not in self.seen:
self.seen.add(record.msg)
return True
return False
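# A minimal sketch of the de-duplication behaviour, using a throw-away logger
# name 'lmdeploy.demo' (an assumed example, not a logger the library creates):
# the same warning text is only emitted once per process.
def _example_duplicate_warning_filter():
    demo_logger = logging.getLogger('lmdeploy.demo')
    demo_logger.propagate = False
    handler = logging.StreamHandler(stream=sys.stdout)
    handler.addFilter(FilterDuplicateWarning('lmdeploy.demo'))
    demo_logger.addHandler(handler)
    demo_logger.warning('flash attention is not available')  # printed
    demo_logger.warning('flash attention is not available')  # filtered out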
def get_logger(
name: Optional[str] = None,
log_file: Optional[str] = None,
log_level: int = logging.INFO,
file_mode: str = 'w',
log_formatter: str = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
) -> Logger:
"""Initialize and get a logger by name.
If the logger has not been initialized, this method will initialize the
......@@ -22,25 +96,10 @@ def get_logger(name: str,
log_level (int): The logger level.
file_mode (str): The file mode used in opening log file.
Defaults to 'w'.
log_formatter (str): The logger output format.
Returns:
logging.Logger: The expected logger.
"""
# use logger in mmengine if exists.
try:
from mmengine.logging import MMLogger
if MMLogger.check_instance_created(name):
logger = MMLogger.get_instance(name)
else:
logger = MMLogger.get_instance(name,
logger_name=name,
log_file=log_file,
log_level=log_level,
file_mode=file_mode)
return logger
except Exception:
pass
logger = logging.getLogger(name)
if name in logger_initialized:
return logger
......@@ -56,7 +115,7 @@ def get_logger(name: str,
if type(handler) is logging.StreamHandler:
handler.setLevel(logging.ERROR)
stream_handler = logging.StreamHandler()
stream_handler = logging.StreamHandler(stream=sys.stdout)
handlers = [stream_handler]
if log_file is not None:
......@@ -66,14 +125,15 @@ def get_logger(name: str,
file_handler = logging.FileHandler(log_file, file_mode)
handlers.append(file_handler)
formatter = logging.Formatter(
'%(asctime)s - %(name)s - %(levelname)s - %(message)s')
formatter = ColorFormatter(log_formatter)
for handler in handlers:
handler.setFormatter(formatter)
handler.setLevel(log_level)
handler.addFilter(FilterDuplicateWarning(name))
logger.addHandler(handler)
logger.setLevel(log_level)
logger.propagate = False
logger_initialized[name] = True
return logger
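# A usage sketch, assuming a hypothetical log file path './lmdeploy_demo.log';
# get_logger attaches a stdout handler (and a file handler when log_file is
# given) the first time a name is requested and returns the cached logger on
# later calls.
def _example_get_logger():
    demo = get_logger('lmdeploy', log_file='./lmdeploy_demo.log',
                      log_level=logging.DEBUG)
    demo.debug('engine warming up')
    demo.warning('session length is close to the configured limit')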
......@@ -95,3 +155,103 @@ def filter_suffix(response: str, suffixes: Optional[List[str]] = None) -> str:
if response.endswith(item):
response = response[:len(response) - len(item)]
return response
# TODO remove stop_word_offsets stuff and make it clean
def _stop_words(stop_words: List[str], tokenizer: object):
"""return list of stop-words to numpy.ndarray."""
import numpy as np
if stop_words is None:
return None
assert isinstance(stop_words, List) and \
all(isinstance(elem, str) for elem in stop_words), \
f'stop_words must be a list but got {type(stop_words)}'
stop_indexes = []
for stop_word in stop_words:
stop_indexes += tokenizer.indexes_containing_token(stop_word)
assert isinstance(stop_indexes, List) and all(
isinstance(elem, int) for elem in stop_indexes), 'invalid stop_words'
# each id in stop_indexes represents a stop word
# refer to https://github.com/fauxpilot/fauxpilot/discussions/165 for
# detailed explanation about fastertransformer's stop_indexes
stop_word_offsets = range(1, len(stop_indexes) + 1)
stop_words = np.array([[stop_indexes, stop_word_offsets]]).astype(np.int32)
return stop_words
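# A shape-only sketch with a stub tokenizer; real lmdeploy tokenizers expose
# indexes_containing_token, and the token ids below are made-up placeholders.
def _example_stop_words():

    class _StubTokenizer:

        def indexes_containing_token(self, token: str):
            # hypothetical ids: a real tokenizer returns every vocab id whose
            # decoded text contains the given token
            return {'<eoa>': [92542], '\n': [364]}[token]

    words = _stop_words(['<eoa>', '\n'], _StubTokenizer())
    # words.shape == (1, 2, 2): words[0, 0] holds the ids, words[0, 1] the
    # cumulative offsets
    return words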
def get_hf_config_content(pretrained_model_name_or_path: str,
**kwargs) -> dict:
"""Get config content of a hf model."""
if os.path.exists(pretrained_model_name_or_path):
config_path = os.path.join(pretrained_model_name_or_path,
'config.json')
else:
config_path = hf_hub_download(pretrained_model_name_or_path,
'config.json')
with open(config_path, 'r') as f:
config = json.load(f)
return config
def get_model(pretrained_model_name_or_path: str,
download_dir: str = None,
revision: str = None):
"""Get model from huggingface or modelscope."""
import os
if os.getenv('LMDEPLOY_USE_MODELSCOPE', 'False').lower() == 'true':
from modelscope import snapshot_download
else:
from huggingface_hub import snapshot_download
download_kwargs = {}
if download_dir is not None:
download_kwargs['cache_dir'] = download_dir
if revision is not None:
download_kwargs['revision'] = revision
model_path = snapshot_download(pretrained_model_name_or_path,
**download_kwargs)
return model_path
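# A download sketch; the repo id and cache directory below are illustrative
# placeholders. Setting LMDEPLOY_USE_MODELSCOPE=True routes the same call
# through modelscope's snapshot_download instead of huggingface_hub's.
def _example_download(repo_id: str = 'internlm/internlm2-chat-7b') -> str:
    local_path = get_model(repo_id, download_dir='./hf_cache', revision='main')
    return local_path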
def logging_timer(op_name: str, logger: Logger, level: int = logging.DEBUG):
"""logging timer."""
@contextmanager
def __timer():
"""timer."""
start = time.perf_counter()
yield
end = time.perf_counter()
duration = (end - start) * 1000
logger.log(level, f'<{op_name}> take time: {duration:.2f} ms')
def __inner(func):
"""inner."""
@functools.wraps(func)
        def __func_wrapper(*args, **kwargs):
            """func wrapper."""
if logger.level > level:
return func(*args, **kwargs)
with __timer():
return func(*args, **kwargs)
@functools.wraps(func)
        def __async_wrapper(*args, **kwargs):
            """async wrapper."""
async def __tmp():
if logger.level > level:
return (await func(*args, **kwargs))
with __timer():
return (await func(*args, **kwargs))
return __tmp()
if asyncio.iscoroutinefunction(func):
            return __async_wrapper
else:
            return __func_wrapper
return __inner
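# A usage sketch of the decorator; 'demo_step' is an arbitrary operation name,
# and it assumes the 'lmdeploy' logger is first initialised at DEBUG so the
# timing line is not skipped.
@logging_timer('demo_step', get_logger('lmdeploy', log_level=logging.DEBUG))
def _example_timed_step():
    """Stand-in for real work; logs '<demo_step> take time: ... ms' at DEBUG."""
    time.sleep(0.01)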
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Tuple
__dcu_version__ = '0.1.0'
__version__ = '0.1.0'
__dcu_version__ = '0.2.6'
__version__ = '0.2.6'
short_version = __version__
......
......@@ -3,9 +3,10 @@ m2r==0.2.1
markdown>=3.4.0
mistune==0.8.4
myst-parser
-e git+https://github.com/open-mmlab/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
-e git+https://github.com/InternLM/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
recommonmark
sphinx==4.0.2
sphinx-copybutton
sphinx-tabs
sphinx_markdown_tables>=0.0.16
sphinxcontrib-mermaid