"vscode:/vscode.git/clone" did not exist on "a5cedce215130d0d7a5a1ad27ab2254cbc864881"
Commit fe851fbc authored by zhouxiang

Supplement newly added files for version 0.2.6

parent e2d98ddc
# Copyright (c) OpenMMLab. All rights reserved.
import re
from transformers import PreTrainedTokenizerFast
from lmdeploy.utils import get_logger
from .base import BasicAdapterFast
logger = get_logger(__name__)
B_INST, E_INST = '[INST]', '[/INST]'
B_SYS, E_SYS = '<<SYS>>\n', '\n<</SYS>>\n\n'
DEFAULT_SYSTEM_PROMPT = """\
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""" # noqa: E501
class Llama2Adapter(BasicAdapterFast):
"""Adapter for llama2.
Llama2 use the following template and the first user prompt
should contain a system prompt.
User can specify the system prompt using a <<SYS>> tag otherwise
the default system prompt is prepended to user's input.
<bos>
[INST]<space>
<<SYS>>\n
SYSTEM_PROMPT\n
<</SYS>>\n\n
{user_prompt_1}<space>
[/INST]<space>
{answer_1}<space>
<eos>
<bos>
[INST]<space>
{user_prompt_2}<space>
[/INST]<space>
{answer_2}<space>
<eos>
<bos>
[INST]<space>
{user_prompt_3}(no space here)
...
"""
start_ids = []
sep_ids = []
def __init__(self, tokenizer: PreTrainedTokenizerFast):
super().__init__(tokenizer)
self.prev_round = 0
def encode_and_decorate(self, prompt):
r"""Encode prompt and decorate with template."""
if self.prev_round == 0:
res = re.search(r'<<SYS>>(.*?)<</SYS>>(.*)', prompt)
if res:
prompt = B_SYS + res.group(1).strip() + \
E_SYS + res.group(2).strip()
else:
prompt = B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + prompt
prompt = f'{B_INST} {prompt.strip()} {E_INST}'
logger.debug(f'decorated prompt: {repr(prompt)}')
input_ids = self.tokenizer.encode(
prompt,
add_special_tokens=True,
return_tensors='pt',
)
self.prev_round += 1
return input_ids
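# --- Illustrative sketch (not from the original file) -----------------------
# How a first-round prompt is decorated when the user message contains no
# explicit <<SYS>> tag: the default system prompt is prepended, then the whole
# string is wrapped with [INST] ... [/INST] before tokenization.
#
#   prompt = B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + 'Hello!'
#   prompt = f'{B_INST} {prompt.strip()} {E_INST}'
#   # -> '[INST] <<SYS>>\nYou are a helpful, ... <</SYS>>\n\nHello! [/INST]'
#
# Later rounds skip the system block and are wrapped as '[INST] ... [/INST]'.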
# Copyright (c) OpenMMLab. All rights reserved.
"""Chat through command line.
This submodule allows users to chat with a language model through the command
line, and optionally accelerate the model using backends like deepspeed.
Example 1: Chat with default setting
```python
python -m lmdeploy.legacy.pytorch.chat $PATH_TO_HF_MODEL
```
Example 2: Disable sampling
```python
python -m lmdeploy.legacy.pytorch.chat \
$PATH_TO_LLAMA_MODEL_IN_HF_FORMAT \
--temperature 0
```
Example 3: Accelerate with deepspeed inference
```python
python -m lmdeploy.legacy.pytorch.chat \
$PATH_TO_LLAMA_MODEL_IN_HF_FORMAT \
--accel deepspeed
```
Note: to use deepspeed, you need to install deepspeed,
and if you hope to accelerate InternLM, you need a customized version
https://github.com/wangruohui/DeepSpeed/tree/support_internlm_0.10.0
Example 4: Tensor parallel the model on 2 GPUs
```python
deepspeed --module --num_gpus 2 lmdeploy.legacy.pytorch.chat \
$PATH_TO_LLAMA_MODEL_IN_HF_FORMAT \
--accel deepspeed
```
This module also allows the following control commands to change
generation behaviors during chat.
- `exit`: terminate and exit chat
- `config set key=value`: change generation config `key` to `value`,
e.g. `config set temperature=0` disables sampling for the following chats
- `clear`: clear chat history
"""
import itertools
import logging
from typing import Optional
import torch
from transformers import GenerationConfig, PreTrainedModel
from lmdeploy.utils import get_logger
from .adapters import init_adapter
from .dist import get_local_rank, get_rank, get_world_size
from .model import accel_model, init_model
from .session import BasicSessionManagerWithHistory
from .utils import BasicStreamer, TerminalIO, control
def set_logging(log_file: str, debug: bool):
torch.set_printoptions(linewidth=120)
level = logging.DEBUG if debug else logging.INFO
log_file = log_file or 'chat.log'
if (r := get_rank()) != 0:
log_file = log_file + f'.{r}'
format = '%(filename)s: \
%(levelname)s: \
%(funcName)s(): \
%(lineno)d:\t \
%(message)s'
logger = get_logger(__name__,
log_file=log_file,
log_level=level,
file_mode='w',
log_formatter=format)
print(f'Worker {get_rank()} logging to {log_file}')
return logger
def main(
model_path: str,
tokenizer_path: Optional[str] = None,
accel: Optional[str] = None,
max_new_tokens: int = 128,
temperature: float = 0.8,
top_p: float = 0.95,
seed: int = 0,
use_fast_tokenizer: bool = True,
max_alloc: int = 2048,
max_session_len: int = None,
log_file: Optional[str] = None,
debug: bool = False,
adapter: Optional[str] = None,
):
"""Chat with model through terminal.
Args:
model_path (str): Path to model.
tokenizer_path (str): Path to tokenizer.
accel (str): Model accelerator.
max_new_tokens (int): Maximum number of tokens to generate.
temperature (float): Temperature for sampling.
top_p (float): Top p for sampling.
seed (int): Random seed.
use_fast_tokenizer (bool): Whether to use fast tokenizer.
This argument is passed directly to transformers' ``AutoTokenizer.from_pretrained``.
Generally, users should choose fast tokenizers.
But if the fast tokenizer raises errors, try forcing a slow one.
max_alloc (int): Maximum memory to allocate (for deepspeed).
max_session_len (int): Maximum number of tokens allowed for all chat sessions.
This includes both the history and the current session.
log_file (str): Path to log file.
debug (bool): Whether to enable debug mode.
adapter (str): Force the use of a specific adapter.
Generally users should not set this argument because the adapter is selected
based on the model type. It is only required when automatic selection fails,
e.g. llama 1 and llama 2 cannot be distinguished by the `LlamaForCausalLM`
class alone. Currently, only "llama1" is accepted, for llama1 models.
""" # noqa: E501
logger = set_logging(log_file, debug)
# workers should sync in sampling
torch.manual_seed(seed)
local_rank = get_local_rank()
world_size = get_world_size()
# Init model and tokenizer
if not tokenizer_path:
tokenizer_path = model_path
model, tokenizer = init_model(
model_path,
tokenizer_path,
use_fast_tokenizer=use_fast_tokenizer,
)
# Init adapter based on model and tokenizer
adapter = init_adapter(model, tokenizer, adapter)
# Accelerate model
model: PreTrainedModel = accel_model(model,
accel,
max_alloc=max_alloc,
tp_size=world_size)
# warmup
warmup_config = GenerationConfig(
max_new_tokens=1,
do_sample=temperature > 0,
temperature=temperature,
top_p=top_p,
)
model.generate(torch.tensor([[6]], device=get_local_rank()), warmup_config)
gen_config = GenerationConfig(
max_new_tokens=max_new_tokens,
do_sample=temperature > 0,
temperature=temperature,
top_p=top_p,
)
# Session manager handling history
max_session_len = max_alloc if max_session_len is None else max_session_len
sm = BasicSessionManagerWithHistory(max_session_len=max_session_len,
start_ids=adapter.start_ids,
sep_ids=adapter.sep_ids)
io = TerminalIO()
streamer = BasicStreamer(adapter.decode, io.output)
for r in itertools.count(1):
# User input from IO
logger.info(f'Round {r}')
prompt: str = io.input()
logger.info(f'User input: {prompt}')
# Allow user to change config during runtime or exit
if control(prompt, gen_config, sm):
continue
# Tokenize and apply model specific templates
input_ids = adapter.encode_and_decorate(prompt)
logger.info(f'Input ids:\n{input_ids}')
# Prepend chat history (tensor concatenation)
input_ids = sm.prepend_history(input_ids)
logger.info(f'Input ids with history:\n{input_ids}')
# Generate
input_ids = input_ids.cuda(local_rank)
# returned tensor including input and generated output
output = model.generate(input_ids,
gen_config,
streamer=streamer,
stopping_criteria=adapter.stopping_criteria)
logger.info(f'Output:\n{output}')
# Save output into session manager and maybe trim some history
sm.add_to_history(output)
def cli():
import fire
fire.Fire(main)
if __name__ == '__main__':
cli()
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import queue
import warnings
from typing import List, Optional
import pynvml
import torch
import torch.multiprocessing as mp
from torch.nn.utils.rnn import pad_sequence
from transformers import (AutoTokenizer, PreTrainedModel,
PreTrainedTokenizerBase)
from lmdeploy.utils import get_logger
from .model import accel_model, init_model
def safe_numel(free_mem, model_size, max_intermediate):
"""Number of elements without out-of-memory."""
return int(free_mem - model_size) // max_intermediate
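# Worked example (illustrative numbers): with ~78e9 bytes free on a GPU, a
# 7B fp16 model of ~14e9 bytes and ~2e6 bytes of intermediate memory per
# token, safe_numel(78e9, 14e9, 2e6) -> 32000.0, i.e. roughly 32k tokens
# (batch_size * seq_len) can be decoded in one forward pass.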
def avail_gpus(percentage=0.96):
"""Detect available gpus.
Args:
percentage (float): The minimum percentage of free memory to be
considered as available.
Returns:
A list of available gpu ids and the average free memory (in bytes)
across those gpus.
"""
gpus = []
mems = []
pynvml.nvmlInit()
for i in range(torch.cuda.device_count()):
handle = pynvml.nvmlDeviceGetHandleByIndex(int(i))
mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
free, total = int(mem_info.free), int(mem_info.total)
if free / total > percentage:
gpus.append(i)
mems.append(free)
pynvml.nvmlShutdown()
if len(gpus) == 0:
raise RuntimeError('No GPU available.')
return gpus, sum(mems) / len(mems)
@torch.no_grad()
def decode_single(model: PreTrainedModel,
input_ids: torch.Tensor,
attention_mask: torch.Tensor = None,
return_logits=True):
"""Decode a single batch.
Args:
model (PreTrainedModel): Pretrained model.
input_ids (torch.Tensor): A batch of input ids.
attention_mask (torch.Tensor): A batch of attention masks.
Returns:
torch.Tensor: A batch of logits or probabilities (on CPU).
Note:
This function assumes input_ids[i] = [bos, x1, x2, ..., xn]
and return prob = [p(x1|bos), p(x2|bos,x1), ..., p(xn|bos..xn-1)]
So prob is shorter than input_ids by 1.
"""
# Call Causal LM forward
outputs = model(input_ids=input_ids,
attention_mask=attention_mask,
output_hidden_states=False,
output_attentions=False,
use_cache=False,
return_dict=True)
# fp32, [bs, seq_len, vocab_size]
logits = outputs.logits
if not return_logits:
# inplace softmax to get probs
torch.softmax(logits, dim=-1, out=logits)
# Shift to fetch probabilities
shift_labels = input_ids[..., 1:].contiguous()
shift_probs = logits[..., :-1, :].contiguous()
logits = torch.gather(shift_probs, -1, shift_labels.unsqueeze(-1))
if attention_mask is not None:
logits *= attention_mask[..., None]
logits = logits.cpu()
return logits
def worker_fn(model_path: str,
inq: mp.Queue,
outq: mp.Queue,
accel: Optional[str] = None,
gpu_id=0):
# torch.set_default_device(gpu_id)
model, _ = init_model(model_path)
model = model.eval()
model = accel_model(model, accel, gpu_id=gpu_id)
while True:
try:
idx, args = inq.get(timeout=1)
except queue.Empty:
continue
if idx is None:
print(f'Worker {gpu_id} received exit signal.')
break
# print(args)
input_ids, input_lens, *args = args
input_ids = input_ids.cuda(gpu_id)
max_len = max(input_lens)
assert max_len == input_ids.size(-1), \
f'input_ids.shape = {input_ids.shape}, max_len = {max_len}'
input_lens = torch.tensor(input_lens, device=gpu_id)
attention_mask = \
torch.arange(max_len, device=gpu_id)[None, :] < input_lens[:, None]
assert attention_mask.shape == input_ids.shape, \
f'attention_mask.shape = {attention_mask.shape}'
try:
probs = decode_single(model, input_ids, attention_mask, *args)
except torch.cuda.OutOfMemoryError:
warnings.warn(
f'OOM on GPU {gpu_id}, discarding prompts at indices {idx}.')
probs = torch.empty((input_ids.size(0), 0),
dtype=torch.float32,
device='cpu')
outq.put((idx, probs))
print(f'Exiting worker {gpu_id} ...')
inq.close()
outq.close()
print(f'Worker {gpu_id} finished.')
class Engine:
"""Multi-GPU deciding engine.
Args:
model_path (str): Path to the pretrained model.
tokenizer_path (str, optional): Path to the pretrained tokenizer.
Defaults to None.
Either tokenizer_path or tokenizer should be provided.
tokenizer (PreTrainedTokenizerBase, optional): Pre-configured tokenizer.
Defaults to None.
Either tokenizer_path or tokenizer should be provided.
accel (str, optional): Acceleration method.
Defaults to None. 'deepspeed' is not tested.
gpu_mem_percentage (float, optional): GPUs whose free-memory fraction is
larger than this value are considered available and used as decode devices.
Defaults to 0.96.
model_size_byte (float, optional): (Approximate) model size in bytes.
Defaults to 14e9 (7B model in FP16).
bytes_per_token (float, optional): (Approximate) memory cost per token in bytes.
Defaults to 2e6 (2MB).
``bytes_per_token`` and ``model_size_byte`` are used to compute
the maximum batch size for a given sequence length.
""" # noqa: E501
def __init__(self,
model_path: str,
tokenizer_path: Optional[str] = None,
tokenizer: Optional[PreTrainedTokenizerBase] = None,
accel: Optional[str] = None,
gpu_mem_percentage: float = 0.96,
model_size_byte=14e9,
bytes_per_token=2e6):
gpu_ids, mem = avail_gpus(gpu_mem_percentage)
print(f'Available GPUs are: {gpu_ids}, ', end='')
print(f'with {mem/2**30:.2f} GiB free.')
ctx = mp.get_context('spawn')
inq = ctx.Queue()
outq = ctx.Queue()
ps = []
for id in gpu_ids:
p = ctx.Process(target=worker_fn,
args=(model_path, inq, outq, accel, id))
p.start()
ps.append(p)
if tokenizer is None:
if tokenizer_path is None:
tokenizer_path = model_path
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
self.gpu_ids = gpu_ids
self.inq = inq
self.outq = outq
self.ps = ps
self.tokenizer = tokenizer
self.safe_numel = safe_numel(mem, model_size_byte, bytes_per_token)
def clear_queue(self):
for q in self.inq, self.outq:
while not q.empty():
q.get()
def decode(self,
token_ids: List[List[int]],
sort=True,
max_bs: int = 1024,
pad=True,
pad_token_id=2,
return_logits=True):
"""Inference the model to compute probabilities.
Args:
token_ids (List[List[int]]): List of list of token ids.
sort (bool, optional): Internally sort the prompts by length to achieve better efficiency.
Defaults to True.
Note: the order of returned probabilities always matches the input order.
max_bs (int, optional): Maximum batch size.
Defaults to 1024.
pad (bool, optional): Pad the prompts in every mini batch to the same length.
Defaults to True. Set to False to save memory.
return_logits (bool, optional): Return logits instead of probabilities.
Returns:
numpy.ndarray: Array of logits of shape [bsz, seqlen, vocab_size],
with prob=0 padded, if pad is True
List[numpy.ndarray]: List of logits without padding, if pad is False.
Note:
This function will accept input token_ids = [x0(=bos), x1, x2, ..., xn]
and compute prob = [p(x1|x0), p(x2|x0,x1), ..., p(xn|x0..xn-1)]
So prob is shorter than input_ids by 1.
""" # noqa: E501
self.clear_queue()
# sort to achieve better efficiency
if sort:
pids_and_indicis = sorted(enumerate(token_ids),
key=lambda i_and_x: len(i_and_x[1]))
else:
pids_and_indicis = list(enumerate(token_ids))
left = 0
bs = max_bs
while left < len(token_ids):
if not sort:
bs = max_bs
right = min(left + bs, len(token_ids))
# batch of prompts
sub_p_and_i = pids_and_indicis[left:right]
idx, sub_p = zip(*sub_p_and_i)
# batch of input_ids and attn_masks
# inputs = self.tokenizer(sub_p, return_tensors='pt', padding=True)
input_ids = [torch.tensor(p) for p in sub_p]
input_ids = pad_sequence(input_ids,
batch_first=True,
padding_value=pad_token_id)
input_lens = [len(p) for p in sub_p]
# Dynamic batch size based on safe memory
while input_ids.numel() > self.safe_numel:
if bs == 1:
break
bs = max(1, round(bs / 1.5))
print(f'\nReduce bs to {bs} when seq len reaches '
f'{input_ids.shape[-1]}')
idx = idx[:bs]
input_lens = input_lens[:bs]
input_ids = input_ids[:bs, :max(input_lens)]
# Send to worker
self.inq.put((idx, (input_ids, input_lens)))
left += bs
print(
f'Distributing prompts {right}/{len(token_ids)},'
f' {right/len(token_ids):.0%}',
end='\r')
print()
# Collect outputs from workers
all_probs = [None] * len(token_ids)
count = 0
while count < len(token_ids):
idx, probs = self.outq.get()
for i, p in zip(idx, probs):
assert all_probs[i] is None
all_probs[i] = p
count += len(idx)
print(
f'Decoding and collecting outputs '
f'{count}/{len(token_ids)}, '
f'{count/len(token_ids):.0%}',
end='\r')
print()
if pad:
all_probs = pad_sequence(all_probs, batch_first=True)
all_probs = all_probs.cpu().numpy()
else:
all_probs = [p.cpu().numpy() for p in all_probs]
return all_probs
def __del__(self):
print('Exiting engine ...')
for _ in self.ps:
self.inq.put((None, None))
for p in self.ps:
p.join(timeout=1)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--model_path',
default='llama2/huggingface/llama-2-7b',
help='Path to HuggingFace model and tokenizer.')
parser.add_argument(
'--test_path',
default='',
help='Path to text file, with each line containing a prompt.')
parser.add_argument(
'-p',
'--prompts',
nargs='*',
default=[
'I believe the meaning of life is to find your gift.',
'Simply put, the theory of relativity states that',
'Building a website can be done in 10 simple steps:'
],
help="Prompt in command line, please quote \"\" every sentences, "
'surpassed by --test_path')
parser.add_argument('--min_len',
default=1,
help='Minimum length of prompts')
parser.add_argument('--save-to',
default='decode.out',
help='Save results to this file.')
args = parser.parse_args()
model_path = args.model_path
test_path = args.test_path
prompts = args.prompts
logger = get_logger(__name__)
# logging.basicConfig(level=logging.DEBUG)
# Use test file preferentially
if test_path:
with open(test_path, 'r') as f:
prompts = f.readlines()
prompts = [p.strip() for p in prompts]
# Output infos
print(f'Model path: {model_path}')
def _format(ts, start, end):
if start < 0:
start += len(ts)
if end <= 0:
end += len(ts)
return '\n'.join(
(f'{i}\t{t}' for i, t in zip(range(start, end), ts[start:end])))
if len(prompts) > 10:
print('Prompts:\n' + _format(prompts, 0, 5) + '\n......\n' +
_format(prompts, -5, 0))
else:
print('Prompts:\n' + _format(prompts, 0, 0))
# Init Engine in backend
engine = Engine(model_path)
# Tokenize
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.padding_side = 'right'
input_ids = tokenizer(prompts, padding=False)
input_ids: List[List[int]] = input_ids.input_ids
# Filter out too short prompts
input_ids = [i for i in input_ids if len(i) >= args.min_len]
if len(input_ids) < len(prompts):
logger.warning(
f'Filtered out {len(prompts) - len(input_ids)} prompts, '
f'because they are shorter than {args.min_len}.')
# Decode
logits = engine.decode(input_ids)
print(f'logits.shape = {logits.shape}')
# Save to pth
print(f'Dumping results to = {args.save_to}')
torch.save(logits, args.save_to, pickle_protocol=4)
del engine
# Copyright (c) OpenMMLab. All rights reserved.
"""Helpers for parallel and distributed inference."""
import functools
import os
import torch
from torch.distributed import broadcast, broadcast_object_list, is_initialized
def get_local_rank():
"""Get local rank of current process.
Assume environment variable ``LOCAL_RANK`` is properly set by some launcher.
See: https://pytorch.org/docs/stable/elastic/run.html#environment-variables
""" # noqa: E501
return int(os.getenv('LOCAL_RANK', '0'))
def get_rank():
"""Get rank of current process.
Assume environment variable ``RANK`` is properly set by some launcher.
See: https://pytorch.org/docs/stable/elastic/run.html#environment-variables
""" # noqa: E501
return int(os.getenv('RANK', '0'))
def get_world_size():
"""Get rank of current process.
Assume environment variable ``WORLD_SIZE`` is properly set by some launcher.
See: https://pytorch.org/docs/stable/elastic/run.html#environment-variables
""" # noqa: E501
return int(os.getenv('WORLD_SIZE', '1'))
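# Illustrative note (assuming a torchrun-style launcher): e.g.
#   torchrun --nproc_per_node=2 your_script.py
# sets LOCAL_RANK/RANK to 0 or 1 and WORLD_SIZE to 2 for each worker, which
# is what the three helpers above read. Without a launcher they fall back to
# rank 0 and world size 1.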
def master_only(func):
"""Decorator to run a function only on the master process."""
@functools.wraps(func)
def wrapper(*args, **kwargs):
if is_initialized():
if get_rank() != 0:
return None
return func(*args, **kwargs)
return wrapper
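# Usage sketch (illustrative, hypothetical function): once the process group
# is initialized, only rank 0 runs the body; other ranks get None back.
#
#   @master_only
#   def save_checkpoint(state, path):
#       torch.save(state, path)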
def master_only_and_broadcast_general(func):
"""Decorator to run a function only on the master process and broadcast the
result to all processes."""
@functools.wraps(func)
def wrapper(*args, **kwargs):
if is_initialized():
if get_rank() == 0:
result = [func(*args, **kwargs)]
else:
result = [None]
broadcast_object_list(result, src=0)
result = result[0]
else:
result = func(*args, **kwargs)
return result
return wrapper
def master_only_and_broadcast_tensor(func):
"""Decorator to run a function only on the master process and broadcast the
result to all processes.
Note: Requires a CUDA tensor.
Note: Not really usable because the shape is not known beforehand;
for cpu tensors, use master_only_and_broadcast_general
"""
@functools.wraps(func)
def wrapper(*args, size, dtype, **kwargs):
if is_initialized():
if get_rank() == 0:
result = func(*args, **kwargs)
else:
result = torch.empty(size=size,
dtype=dtype,
device=get_local_rank())
broadcast(result, src=0)
# print(f'rank {get_rank()} received {result}')
else:
result = func(*args, **kwargs)
return result
return wrapper
# Copyright (c) OpenMMLab. All rights reserved.
import time
import warnings
from typing import Optional
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from lmdeploy.utils import get_logger
from .dist import get_local_rank
logger = get_logger(__name__)
class LoadWoInit:
"""Context manager that disable parameter initialization."""
def __init__(self):
self.constant_ = torch.nn.init.constant_
self.zeros_ = torch.nn.init.zeros_
self.ones_ = torch.nn.init.ones_
self.uniform_ = torch.nn.init.uniform_
self.normal_ = torch.nn.init.normal_
self.kaiming_uniform_ = torch.nn.init.kaiming_uniform_
self.kaiming_normal_ = torch.nn.init.kaiming_normal_
def __enter__(self, *args, **kwargs):
torch.nn.init.constant_ = lambda *args, **kwargs: None
torch.nn.init.zeros_ = lambda *args, **kwargs: None
torch.nn.init.ones_ = lambda *args, **kwargs: None
torch.nn.init.uniform_ = lambda *args, **kwargs: None
torch.nn.init.normal_ = lambda *args, **kwargs: None
torch.nn.init.kaiming_uniform_ = lambda *args, **kwargs: None
torch.nn.init.kaiming_normal_ = lambda *args, **kwargs: None
def __exit__(self, *args, **kwargs):
torch.nn.init.constant_ = self.constant_
torch.nn.init.zeros_ = self.zeros_
torch.nn.init.ones_ = self.ones_
torch.nn.init.uniform_ = self.uniform_
torch.nn.init.normal_ = self.normal_
torch.nn.init.kaiming_uniform_ = self.kaiming_uniform_
torch.nn.init.kaiming_normal_ = self.kaiming_normal_
def init_model(model_path: str,
tokenizer_path: Optional[str] = None,
use_fast_tokenizer=True):
"""Initialize model and tokenizer from given model path.
Args:
model_path (str): Path to model.
tokenizer_path (str): Path to tokenizer.
use_fast_tokenizer (bool): Whether to use fast tokenizer.
Note:
If the model is converted from a newer version of transformers,
use_fast_tokenizer should be True.
If using decapoda-research/llama-xb-hf, use_fast_tokenizer should be False.
"""
start = time.monotonic()
if not tokenizer_path:
tokenizer_path = model_path
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path,
use_fast=use_fast_tokenizer,
trust_remote_code=True)
with LoadWoInit():
model = AutoModelForCausalLM.from_pretrained(model_path,
torch_dtype=torch.float16,
trust_remote_code=True)
logger.info(f'Model loaded in {time.monotonic() - start:.1f} seconds')
logger.info(f'Model loaded from {model_path}')
logger.debug(model)
return model, tokenizer
def accel_model(model,
accel: Optional[str] = None,
gpu_id=None,
max_alloc=2048,
tp_size=1):
"""Accelerate model with given accelerator.
Note:
Currently we support only deepspeed or just no acceleration.
"""
logger.info(f'Accelerate model with {accel}')
if accel is None:
# No acceleration, just to cuda
# assume single gpu single process
# user is responsible to assign the gpu id via CUDA_VISIBLE_DEVICES # noqa: E501
gpu_id = gpu_id if gpu_id is not None else get_local_rank()
model = model.cuda(gpu_id)
elif accel.lower() == 'deepspeed':
# Use deepspeed-inference to inject fast kernels and/or tensor parallelism
try:
import deepspeed
except ImportError as e:
raise ImportError('--accel=deepspeed is specified but '
'deepspeed is not installed.\n'
'Install with `pip install deepspeed`.') from e
config = dict(
tensor_parallel=dict(tp_size=tp_size), # Use world size in general
dtype=torch.float16,
replace_with_kernel_inject=True,
max_out_tokens=max_alloc,
)
if 'InternLM' in model.__class__.__name__:
try:
# Use customized deepspeed supporting InternLM
# https://github.com/wangruohui/DeepSpeed/tree/support_internlm_0.10.0 (commit cdef2ce) # noqa: E501
from deepspeed.module_inject.containers.internlm import \
InternLMLayerPolicy # noqa: E501
except ImportError:
# InternLM is not officially supported by DeepSpeed
# Set replace_with_kernel_inject=False to use AutoTP
config.update({'replace_with_kernel_inject': False})
warnings.warn(
'\033[0;93m'
'Current installation of deepspeed does not '
'support InternLM. Disable kernel injection. '
'To support InternLM, install customized deepspeed with '
'`pip install git+https://github.com/wangruohui/DeepSpeed@support_internlm_0.10.0`' # noqa: E501
'\033[0m')
else:
for module in model.modules():
# Since remote code is dynamically located,
# we need to do this dynamically
if module.__class__.__name__ == 'InternLMDecoderLayer':
InternLMLayerPolicy._orig_layer_class = module.__class__ # noqa: E501
break
logger.debug(f'Using deepspeed config\n{config}')
model = deepspeed.init_inference(
model=model, # Transformers models
config=config,
)
# for k, v in model.named_parameters():
# logger.debug(f"{k}: v.device")
else:
raise ValueError(f'Unsupported accelerator {accel}.')
logger.debug(model)
return model
# Copyright (c) OpenMMLab. All rights reserved.
from .linear import WeightOnlyQLinear
__all__ = ['WeightOnlyQLinear']
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Type, TypeVar
import torch
from torch import nn
try:
import awq_inference_engine
except ModuleNotFoundError:
awq_inference_engine = None
class WeightOnlyQLinear(nn.Module):
"""This class implements weight only quantization linear.
Args:
w_bit (int): number of bits for quantization.
symmetry (bool): If true, use symmetric quantization,
otherwise use asymmetric quantization.
group_size (int): size of the quantization group.
in_features (int): size of each input sample.
out_features (int): size of each output sample.
bias (bool, optional): Whether to allocate a bias buffer. Defaults to True.
"""
def __init__(
self,
in_features: int,
out_features: int,
bias: bool = True,
w_bit: int = 4,
symmetry: bool = False,
group_size: int = 128,
) -> None:
super().__init__()
if w_bit not in [2, 4, 8]:
raise NotImplementedError('Only 2,4,8 bit are supported for now.')
self.in_features = in_features
self.out_features = out_features
self.w_bit = w_bit
self.group_size = group_size if group_size != -1 else in_features
assert self.in_features % self.group_size == 0
assert out_features % (32 // self.w_bit) == 0
w_pack_oc = out_features // (32 // self.w_bit)
w_inc = in_features
weight = torch.zeros((w_inc, w_pack_oc), dtype=torch.int32)
self.register_buffer('qweight', weight)
if bias:
self.register_buffer('bias', torch.zeros(out_features))
else:
self.bias = None
s_inc = in_features // self.group_size
s_oc = out_features
scales = torch.zeros((s_inc, s_oc), dtype=torch.float16)
self.register_buffer('scales', scales)
if not symmetry:
z_inc = in_features // self.group_size
z_oc = out_features // (32 // self.w_bit)
zeros = torch.zeros((z_inc, z_oc), dtype=torch.int32)
self.register_buffer('qzeros', zeros)
else:
self.qzeros = None
@classmethod
def from_linear(cls: Type['WeightOnlyQLinear'],
linear: nn.Linear,
quantizer: TypeVar('Quantizer'),
awq_layout: bool = True) -> 'WeightOnlyQLinear':
"""Create a WeightOnlyQLinear object from a PyTorch Linear object.
Args:
linear (nn.Linear): PyTorch Linear object.
quantizer (Quantizer): Object that handles quantization.
awq_layout (bool): AWQ layout. Defaults to True.
Returns:
WeightOnlyQLinear: A WeightOnlyQLinear object.
"""
device = linear.weight.device
w_bit = quantizer.bits
pack_num = 32 // w_bit
if awq_layout:
assert w_bit == 4
pack_order = [0, 2, 4, 6, 1, 3, 5, 7]
else:
pack_order = torch.arange(pack_num)
group_size = quantizer.group_size
symmetry = quantizer.symmetry
in_features = linear.in_features
out_features = linear.out_features
bias = linear.bias is not None
qlinear = cls(in_features, out_features, bias, w_bit, symmetry,
group_size)
qlinear.bias = linear.bias
qparams = quantizer.calculate_qparams(linear.weight)
i32_w = quantizer.quant(linear.weight, qparams, real=True)
i32_w = i32_w.t().contiguous()
pack_int_w = torch.zeros_like(qlinear.qweight).to(device)
for col in range(pack_int_w.shape[1]):
for i in range(pack_num):
pack_int_w_col = i32_w[:, col * pack_num + pack_order[i]]
pack_int_w[:, col] |= pack_int_w_col << (i * w_bit)
qlinear.qweight = pack_int_w
qlinear.scales = qparams.scales.squeeze(-1).t().contiguous()
if qparams.zero_points is not None:
zeros = qparams.zero_points.to(torch.int32).to(device)
zeros = zeros.squeeze(-1).t().contiguous()
pack_int_zeros = torch.zeros_like(qlinear.qzeros).to(device)
for col in range(pack_int_zeros.shape[1]):
for i in range(pack_num):
qzero_col = zeros[:, col * pack_num + pack_order[i]]
pack_int_zeros[:, col] |= qzero_col << (i * w_bit)
qlinear.qzeros = pack_int_zeros
qlinear.to('cpu')
return qlinear
@torch.no_grad()
def forward(self, x):
if awq_inference_engine is None:
raise RuntimeError(
'Run the following command to install '
'the kernel for 4bit inference\n\n'
'git clone https://github.com/mit-han-lab/llm-awq.git\n'
'cd awq/kernels\n'
'python setup.py install\n')
out_shape = x.shape[:-1] + (self.out_features, )
inputs = x.reshape(-1, x.shape[-1])
out = awq_inference_engine.gemm_forward_cuda(inputs.half(),
self.qweight,
self.scales.half(),
self.qzeros,
self.group_size)
out = out + self.bias if self.bias is not None else out
return out.reshape(out_shape)
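# Shape sketch (illustrative numbers): for in_features=4096, out_features=4096,
# w_bit=4 and group_size=128, each int32 packs 32 // 4 = 8 quantized weights:
#   qweight: (4096, 4096 // 8)   == (4096, 512)
#   scales : (4096 // 128, 4096) == (32, 4096)
#   qzeros : (32, 512)           (asymmetric quantization only)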
# Copyright (c) OpenMMLab. All rights reserved.
import torch
from transformers.generation.utils import ModelOutput
from lmdeploy.utils import get_logger
logger = get_logger(__name__)
class BasicSessionManager:
"""Basic session manager without history."""
def prepend_history(self, input_ids):
return input_ids
def add_to_history(self, output):
pass
class BasicSessionManagerWithHistory:
"""Basic session manager with chat history.
Args:
max_session_len (int): Maximum number of tokens allowed for all chat sessions.
reduce_size (int): Number of tokens to be trimmed when reaching maximum
session length. Default: 256.
start_ids (list[int]): Sequences of ids at the start of the chat session.
sep_ids (list[int]): Sequences of ids separating chat sessions.
""" # noqa: E501
bs = 1
def __init__(self,
max_session_len=2048,
reduce_size=256,
start_ids=[1],
sep_ids=[13]) -> None:
self.start_ids = torch.tensor(start_ids, dtype=torch.long)
self.sep_ids = torch.tensor(sep_ids, dtype=torch.long)
assert self.start_ids.ndim == 1
assert self.sep_ids.ndim == 1
self.max_session_len = max(len(start_ids), max_session_len)
self.reduce_size = min(reduce_size, max_session_len - len(start_ids))
assert self.max_session_len > self.reduce_size
self.new_session()
def new_session(self):
self.history_ids = self.start_ids.repeat(self.bs, 1)
def prepend_history(self, input_ids: torch.Tensor):
"""Prepend history ids to input ids and trim if over-length."""
input_ids = input_ids.to(self.history_ids.device).long()
sep_ids = self.sep_ids.to(self.history_ids.device).long().repeat(1, 1)
input_ids = torch.cat([self.history_ids, sep_ids, input_ids], dim=1)
if input_ids.shape[1] > self.max_session_len:
input_ids = input_ids[:,
(self.reduce_size - self.max_session_len):]
input_ids[:, :len(self.start_ids)] = self.start_ids.repeat(
self.bs, 1)
return input_ids
def add_to_history(self, output):
"""Save history output ids.
Note:
Output returned by HuggingFace generator contains both input
and output ids.
"""
if isinstance(output, ModelOutput):
self.history_ids = output.sequences
elif isinstance(output, torch.Tensor):
self.history_ids = output
else:
raise ValueError(f'Unknown output type {type(output)}')
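# Trimming sketch (illustrative numbers): with the defaults
# max_session_len=2048 and reduce_size=256, once history + separator + prompt
# exceeds 2048 tokens, prepend_history keeps only the last 2048 - 256 = 1792
# tokens and rewrites the leading positions with start_ids so the session
# still begins with the start-of-sequence ids.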
# Copyright (c) OpenMMLab. All rights reserved.
from transformers.generation.streamers import BaseStreamer
from lmdeploy.utils import get_logger
from .dist import get_rank, master_only, master_only_and_broadcast_general
try:
import readline # To support command line history # noqa: F401
except ImportError: # readline not available
pass
logger = get_logger(__name__)
class TerminalIO:
"""Terminal input and output."""
end_of_output = '\n'
@master_only_and_broadcast_general
def input(self):
"""Read input from terminal."""
print('\npress Enter twice to end input >>> ', end='')
sentinel = '' # ends when this string is seen
try:
return '\n'.join(iter(input, sentinel))
except EOFError:
print('Detect EOF, exit')
exit()
@master_only
def output(self, string):
"""Output to terminal with flush."""
print(string, end='', flush=True)
class BasicStreamer(BaseStreamer):
"""Basic streamer for HuggingFace models."""
def __init__(self,
decode_func,
output_func,
end_of_output='\n',
skip_prompt=True):
self.decode = decode_func
self.output = output_func
self.end_of_output = end_of_output
self.skip_prompt = skip_prompt
self.gen_len = 0
def put(self, value):
"""Callback before forwarding current token id to model."""
if self.gen_len == 0 and self.skip_prompt:
pass
else:
token = self.decode(value)
self.output(token)
self.gen_len += 1
def end(self):
"""Callback at the end of generation."""
self.output(self.end_of_output)
self.gen_len = 0
def control(prompt, gen_config, sm):
"""Allow user to control generation config and session manager.
Return:
True if control command applied, False otherwise.
"""
if prompt == 'exit':
exit(0)
if prompt == 'clear':
sm.new_session()
logger.info('Session cleared')
return True
# Re-config during runtime
if prompt.startswith('config set'):
try:
keqv = prompt.split()[-1]
k, v = keqv.split('=')
v = eval(v)
gen_config.__setattr__(k, v)
logger.info(f'Worker {get_rank()} set {k} to {repr(v)}')
logger.info(f'Generator config changed to: {gen_config}')
return True
except: # noqa
logger.info(
'invalid config command, treated as a normal conversation turn.')
return False
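# Examples (illustrative) of control commands typed at the chat prompt:
#   exit                           -> terminate the program
#   clear                          -> start a new session, dropping history
#   config set max_new_tokens=256  -> update one field of the GenerationConfig
# Anything else is treated as a normal conversation turn.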
# Copyright (c) OpenMMLab. All rights reserved.
import os.path as osp
import shutil
import fire
import torch
from torch import nn
import lmdeploy
from lmdeploy.lite.apis.calibrate import calibrate
from lmdeploy.lite.quantization.awq import (FC_FCS_MAP, NORM_FCS_MAP,
smooth_layers)
from lmdeploy.lite.utils import collect_target_modules
from lmdeploy.pytorch.models import QLinear, QRMSNorm
LAYER_TYPE_MAP = {
'InternLMForCausalLM': 'InternLMDecoderLayer',
'InternLM2ForCausalLM': 'InternLM2DecoderLayer',
'QWenLMHeadModel': 'QWenBlock',
'BaiChuanForCausalLM': 'DecoderLayer',
'LlamaForCausalLM': 'LlamaDecoderLayer',
}
NORM_TYPE_MAP = {
'InternLMForCausalLM': 'InternLMRMSNorm',
'InternLM2ForCausalLM': 'InternLM2RMSNorm',
'QWenLMHeadModel': 'RMSNorm',
'BaiChuanForCausalLM': 'RMSNorm',
'LlamaForCausalLM': 'LlamaRMSNorm',
}
LMDEPLOY_ROOT = lmdeploy.__path__[0]
MODEL_PATH_MAP = {
'InternLMForCausalLM':
osp.join(LMDEPLOY_ROOT, 'pytorch/modeling/modeling_internlm.py'),
'InternLM2ForCausalLM':
osp.join(LMDEPLOY_ROOT, 'pytorch/modeling/modeling_internlm2.py'),
'LlamaForCausalLM':
osp.join(LMDEPLOY_ROOT, 'pytorch/modeling/modeling_llama.py'),
'BaiChuanForCausalLM':
osp.join(LMDEPLOY_ROOT, 'pytorch/modeling/modeling_baichuan.py')
}
AUTO_MAP = {
'InternLMForCausalLM': {
'AutoConfig': 'configuration_internlm.InternLMConfig',
'AutoModel': 'modeling_internlm.InternLMForCausalLM',
'AutoModelForCausalLM': 'modeling_internlm.InternLMForCausalLM'
},
'InternLM2ForCausalLM': {
'AutoConfig': 'configuration_internlm2.InternLMConfig',
'AutoModelForCausalLM': 'modeling_internlm2.InternLM2ForCausalLM',
'AutoModel': 'modeling_internlm2.InternLM2ForCausalLM'
},
'LlamaForCausalLM': {
'AutoModel': 'modeling_llama.LlamaForCausalLM',
'AutoModelForCausalLM': 'modeling_llama.LlamaForCausalLM'
},
'BaiChuanForCausalLM': {
'AutoConfig': 'configuration_baichuan.BaiChuanConfig',
'AutoModelForCausalLM': 'modeling_baichuan.BaiChuanForCausalLM'
}
}
def smooth_quant(model: str,
work_dir: str = './work_dir',
calib_dataset: str = 'ptb',
calib_samples: int = 128,
calib_seqlen: int = 2048,
device: str = 'cuda'):
model, tokenizer, work_dir = calibrate(model, calib_dataset, calib_samples,
calib_seqlen, work_dir, device)
# calibrate function exports the calibration statistics
# (inputs, outputs, keys and values) to `work_dir`.
inp_stats = torch.load(work_dir / 'inputs_stats.pth')
act_scales = inp_stats['absmax']
model_type = type(model).__name__
if model_type not in LAYER_TYPE_MAP or model_type not in NORM_TYPE_MAP:
raise RuntimeError(
f'Currently, quantization and calibration of {model_type} are '
f'not supported. The supported model types are '
f"{', '.join(LAYER_TYPE_MAP.keys())}.")
if model_type == 'QWenLMHeadModel':
try:
import flash_attn # noqa: F401
except ImportError:
raise RuntimeError(
'When using Qwen, you need to `pip install flash-attn` first, '
'otherwise calibration and quantization will not work '
'properly.')
layer_type = LAYER_TYPE_MAP[type(model).__name__]
norm_type = NORM_TYPE_MAP[type(model).__name__]
fc2fcs = FC_FCS_MAP[layer_type]
norm2fcs = NORM_FCS_MAP[layer_type]
layers = collect_target_modules(model, layer_type)
fcs = {}
for l_name, layer in layers.items():
name2fc = collect_target_modules(layer, nn.Linear, prefix=l_name)
fcs.update(name2fc)
smooth_layers(layers, fc2fcs, norm2fcs, act_scales, -1, device)
rmsnorms = collect_target_modules(model, norm_type)
for name, linear in fcs.items():
linear.to(device)
q_linear = QLinear.from_float(linear)
parent_name, _, child_name = name.rpartition('.')
parent = model.get_submodule(parent_name)
setattr(parent, child_name, q_linear)
linear.to('cpu')
for name, norm in rmsnorms.items():
norm.to(device)
q_norm = QRMSNorm.from_float(norm)
parent_name, _, child_name = name.rpartition('.')
parent = model.get_submodule(parent_name)
setattr(parent, child_name, q_norm)
norm.to('cpu')
if hasattr(model.config, 'auto_map'):
model.config.auto_map.update(AUTO_MAP[type(model).__name__])
else:
model.config.auto_map = AUTO_MAP[type(model).__name__]
model.save_pretrained(work_dir,
max_shard_size='2GB',
safe_serialization=False)
tokenizer.save_pretrained(work_dir)
shutil.copy(MODEL_PATH_MAP[type(model).__name__], work_dir)
if __name__ == '__main__':
fire.Fire(smooth_quant)
# Copyright (c) OpenMMLab. All rights reserved.
import enum
from dataclasses import dataclass
from typing import Dict, List, Literal, Optional
from pydantic.dataclasses import dataclass as pydantic_dataclass
from .tokenizer import Tokenizer
@dataclass
class GenerationConfig:
"""generation parameters used by inference engines.
Args:
n (int): Define how many chat completion choices to generate for each
input message
max_new_tokens (int): The maximum number of tokens that can be
generated in the chat completion
top_p (float): An alternative to sampling with temperature, called
nucleus sampling, where the model considers the results of the
tokens with top_p probability mass
top_k (int): An alternative to sampling with temperature, where
the model considers the top_k tokens with the highest probability
temperature (float): Sampling temperature
repetition_penalty (float): Penalty to prevent the model from
generating repeated words or phrases. A value larger than
1 discourages repetition
ignore_eos (bool): Indicator to ignore the eos_token_id or not
random_seed (int): Seed used when sampling a token
stop_words (List[str]): Words that stop generating further tokens
bad_words (List[str]): Words that the engine will never generate
min_new_tokens (int): The minimum numbers of tokens to generate,
ignoring the number of tokens in the prompt.
skip_special_tokens (bool): Whether or not to remove special tokens
in the decoding. Default to be True.
"""
n: int = 1
max_new_tokens: int = 512
top_p: float = 1.0
top_k: int = 1
temperature: float = 0.8
repetition_penalty: float = 1.0
ignore_eos: bool = False
random_seed: int = None
stop_words: List[str] = None
bad_words: List[str] = None
min_new_tokens: int = None
skip_special_tokens: bool = True
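# Example (illustrative): a mostly-greedy configuration with a mild repetition
# penalty; unspecified fields keep the defaults above.
#
#   gen_config = GenerationConfig(max_new_tokens=256,
#                                 top_k=1,
#                                 repetition_penalty=1.02)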
@dataclass
class EngineGenerationConfig(GenerationConfig):
"""generation parameter used by the inference engines."""
stop_words: List[int] = None
bad_words: List[int] = None
@staticmethod
def From(gen_config: GenerationConfig, tokenizer: Tokenizer):
"""convert `GenerationConfig` to `EngineGenerationConfig`
Args:
gen_config (GenerationConfig): an instance of class `GenerationConfig`
tokenizer (Tokenizer): a tokenizer to encode the `stop_words` and `bad_words` in `gen_config`
Returns:
EngineGenerationConfig: the generation config used by inference engines
Examples:
>>> from lmdeploy import Tokenizer, GenerationConfig, EngineGenerationConfig
>>> tokenizer = Tokenizer('internlm/internlm-chat-7b')
>>> gen_config = GenerationConfig(stop_words=['<eoa>'])
>>> gen_config = EngineGenerationConfig.From(gen_config, tokenizer)
""" # noqa E501
def special_word_token_ids(words):
if words is not None:
assert isinstance(words, List) and \
all(isinstance(elem, str) for elem in words), \
f'words must be a list of str but got {type(words)}'
indexes = []
for word in words:
indexes += tokenizer.indexes_containing_token(word)
return indexes
return None
return EngineGenerationConfig(
n=gen_config.n,
max_new_tokens=gen_config.max_new_tokens,
min_new_tokens=gen_config.min_new_tokens,
top_p=gen_config.top_p,
top_k=gen_config.top_k,
temperature=gen_config.temperature,
repetition_penalty=gen_config.repetition_penalty,
ignore_eos=gen_config.ignore_eos,
random_seed=gen_config.random_seed,
skip_special_tokens=gen_config.skip_special_tokens,
stop_words=special_word_token_ids(gen_config.stop_words),
bad_words=special_word_token_ids(gen_config.bad_words))
@pydantic_dataclass
class TurbomindEngineConfig:
"""TurboMind Engine config.
Args:
model_name (str): the name of the deployed model, deprecated and has no effect when version > 0.2.1
model_format (str): the layout of the deployed model. It can be one of the following values [hf, llama, awq], `hf` meaning `hf_llama`, `llama` meaning `meta_llama`, `awq` meaning the quantized model by AWQ.
tp (int): the number of GPU cards used in tensor parallelism, default to 1
session_len (int): the max session length of a sequence, default to None
max_batch_size (int): the max batch size during inference, default to 128
cache_max_entry_count (float): the percentage of gpu memory occupied by the k/v cache.
For versions of lmdeploy between `v0.2.0` and `v0.2.1`, it defaults to 0.5, depicting the percentage of TOTAL GPU memory to be allocated to the k/v cache.
For lmdeploy versions greater than `v0.2.1`, it defaults to 0.8, signifying the percentage of FREE GPU memory to be reserved for the k/v cache
quant_policy (int): default to 0. When k/v is quantized into 8 bit, set it to 4
rope_scaling_factor (int): scaling factor used for dynamic ntk, default to 0. TurboMind follows the implementation of transformer LlamaAttention
use_logn_attn (bool): whether or not to use logn attention, default to False
download_dir (str): Directory to download and load the weights, default to the default cache directory of huggingface.
revision (str): The specific model version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.
max_prefill_token_num(int): the number of tokens each iteration during prefill, default to 8192
""" # noqa: E501
model_name: Optional[str] = None
model_format: Optional[str] = None
tp: int = 1
session_len: Optional[int] = None
max_batch_size: int = 128
cache_max_entry_count: float = 0.8
quant_policy: int = 0
rope_scaling_factor: float = 0.0
use_logn_attn: bool = False
download_dir: Optional[str] = None
revision: Optional[str] = None
max_prefill_token_num: int = 8192
@dataclass
class PytorchEngineConfig:
"""PyTorch Engine Config.
Args:
model_name (str): name of the given model.
tp (int): Tensor Parallelism. default 1.
session_len (int): Max session length. Default None.
max_batch_size (int): Max batch size. Default 128.
cache_max_entry_count (float): the percentage of gpu memory occupied
by the k/v cache. For lmdeploy versions greater than `v0.2.1`,
it defaults to 0.8, signifying the percentage of FREE GPU memory
to be reserved for the k/v cache
eviction_type (str): What action to perform when kv cache
is full, ['recompute', 'copy'], Default 'recompute'.
prefill_interval (int): Interval to perform prefill,
Default 16.
block_size (int): paging cache block size, default 64.
num_cpu_blocks (int): Num cpu blocks. If num is 0, cache
would be allocated according to the current environment.
num_gpu_blocks (int): Num gpu blocks. If num is 0, cache
would be allocated according to the current environment.
adapters (dict): The path configs to lora adapters.
max_prefill_token_num (int): tokens per iteration.
thread_safe (bool): thread safe engine instance.
download_dir (str): Directory to download and load the weights,
default to the default cache directory of huggingface.
revision (str): The specific model version to use.
It can be a branch name, a tag name, or a commit id.
If unspecified, will use the default version.
"""
model_name: str = ''
tp: int = 1
session_len: int = None
max_batch_size: int = 128
cache_max_entry_count: float = 0.8
eviction_type: str = 'recompute'
prefill_interval: int = 16
block_size: int = 64
num_cpu_blocks: int = 0
num_gpu_blocks: int = 0
adapters: Dict[str, str] = None
max_prefill_token_num: int = 4096
thread_safe: bool = False
download_dir: str = None
revision: str = None
class ResponseType(enum.Enum):
"""Response type."""
SUCCESS = enum.auto()
FINISH = enum.auto()
ENGINE_STOP_ERROR = enum.auto()
SESSION_REPEAT = enum.auto()
SESSION_NOT_EXIST = enum.auto()
HANDLER_NOT_EXIST = enum.auto()
@dataclass
class Response:
"""Pack all response information together.
Args:
text (str): the response text from the server. If the output text is
an empty str and the finish_reason is length, it means the session
length is reached.
generate_token_len (int): the response token length.
input_token_len (int): the input prompt token length. Note that it may
contain the chat template part.
session_id (int): the id for running the session. Basically, it refers
to the position index of the input request batch.
finish_reason ('stop' | 'length' | None): the reason the model stopped
generating tokens. This will be 'stop' if the model hit a natural
stop point or a provided stop sequence, 'length' if the maximum
number of tokens specified in the request was reached.
"""
text: str
generate_token_len: int
input_token_len: int
session_id: int
finish_reason: Optional[Literal['stop', 'length']] = None
# Copyright (c) OpenMMLab. All rights reserved.
# Copyright (c) OpenMMLab. All rights reserved.
import re
from dataclasses import dataclass
from typing import Any, Dict, List
import torch
from torch import Tensor
from ..block import LogicalTokenBlocks
def _cache_weight(cache: Tensor, weight: Tensor, block_table: Tensor):
"""cache weight."""
assert cache.dim() == 2
assert weight.dim() == 2
assert block_table.dim() == 1
rank, feat_size = weight.size()
assert cache.size(-1) >= feat_size, ('cache.size(-1) >= feat_size failed.')
assert rank <= block_table.size(0), ('rank <= block_table.size(0) failed.')
block_table = block_table[:rank]
cache[block_table, :feat_size] = weight.to(device=cache.device,
dtype=cache.dtype)
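# Shape sketch (illustrative numbers): caching a rank-8 LoRA-A weight of shape
# (8, hidden) into a cache of shape (num_blocks, >= hidden) writes its 8 rows
# into the cache rows selected by block_table[:8]; cache_lora_b below
# transposes the LoRA-B weight first so it is stored in the same
# (rank, features) layout.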
def _get_named_loralinears(model: torch.nn.Module):
"""get all named loralinear."""
from peft.tuners.lora import Linear as LoRALinear
named_loralinear: Dict[str, torch.nn.Module] = dict()
for name, module in model.named_modules():
if isinstance(module, LoRALinear):
named_loralinear[name] = module
return named_loralinear
def _get_layer_index(key: str, config: Any):
"""get layer index of the lora linear."""
layers_pattern = getattr(config, 'layers_pattern', None)
if isinstance(layers_pattern, str):
layers_pattern = [layers_pattern]
if layers_pattern is None or len(layers_pattern) == 0:
layer_index = re.match(r'.*\.[^.]*\.(\d+)\.', key)
return int(layer_index[1])
else:
for pattern in layers_pattern:
layer_index = re.match(f'.*.{pattern}\\.(\\d+)\\.*', key)
if layer_index is not None:
return int(layer_index[1])
def get_indexed_lora_linears(model: torch.nn.Module):
"""get indexed lora linear."""
named_linears = _get_named_loralinears(model)
config = None
peft_config = getattr(model, 'peft_config', dict())
if len(peft_config) > 0:
config = next(iter(peft_config.values()))
indexed_linears = dict()
for name, layer in named_linears.items():
index = _get_layer_index(name, config)
target = name.split('.')[-1]
indexed_linears.setdefault(index, dict())
indexed_linears[index][target] = layer
return indexed_linears
def update_lora_linears(lora_linears: Dict,
weight_maps: List['AdapterWeightMap'],
device: str = 'cuda'):
"""update lora linears."""
def __get_targets():
"""get targets."""
all_targets = set()
for weight_map in weight_maps:
targets = weight_map.target_modules.keys()
all_targets.update(targets)
return all_targets
def __get_linear_meta(target_names):
"""get rank and start."""
rank_map = dict()
start_map = dict()
scaling_map = dict()
for target in target_names:
ranks = [0] + [
weight_map.target_modules[target].rank
for weight_map in weight_maps
]
block_starts = [0] + [
weight_map.target_modules[target].block_start
for weight_map in weight_maps
]
scaling = [0] + [
weight_map.target_modules[target].scaling
for weight_map in weight_maps
]
rank_map[target] = torch.tensor(ranks)
start_map[target] = torch.tensor(block_starts)
scaling_map[target] = torch.tensor(scaling)
return rank_map, start_map, scaling_map
def __update_linear(linear, idx, rank_map, start_map, scaling_map,
adapter_names):
"""update linear."""
linear.layer_idx = idx
linear.ranks = rank_map[target].to(device)
linear.block_starts = start_map[target].to(device)
linear.scaling = scaling_map[target].to(device)
for name in adapter_names:
if name in linear.lora_A:
linear.lora_A.pop(name)
linear.lora_B.pop(name)
adapter_names = [weight_map.adapter_name for weight_map in weight_maps]
all_targets = __get_targets()
for weight_map in weight_maps:
weight_map.expand_targets(all_targets)
rank_map, start_map, scaling_map = __get_linear_meta(all_targets)
for idx, lora_linear in lora_linears.items():
for target, linear in lora_linear.items():
__update_linear(linear,
idx,
rank_map=rank_map,
start_map=start_map,
scaling_map=scaling_map,
adapter_names=adapter_names)
def get_max_lora_weight_size(model: torch.nn.Module):
"""Get max weight size."""
from peft.tuners.lora import Linear as LoRALinear
ret = 0
for _, mod in model.named_modules():
if isinstance(mod, LoRALinear):
weight = mod.base_layer.weight
ret = max(ret, max(weight.shape))
return ret
@dataclass
class TargetMeta:
rank: int
block_start: int
scaling: float
@dataclass
class AdapterWeightMap:
adapter_name: str
block_table: Tensor
target_modules: Dict[str, TargetMeta]
@classmethod
def new(cls, adapter_name: str, rank: int, target_names: List[str],
block_table: Tensor, scaling: float):
"""create new weightmap."""
block_start = 0
target_modules: Dict[str, TargetMeta] = dict()
for name in target_names:
target_modules[name] = TargetMeta(rank, block_start, scaling)
block_start += rank
return AdapterWeightMap(adapter_name,
block_table=block_table,
target_modules=target_modules)
def expand_targets(self,
target_names: List[str],
ignore_exists: bool = True):
for name in target_names:
if name in self.target_modules:
if ignore_exists:
continue
else:
raise RuntimeError(f'target {name} exists.')
self.target_modules[name] = TargetMeta(0, 0, 0.0)
@classmethod
def cache_lora_a(cls, cache: Tensor, weight: Tensor, block_table: Tensor):
"""cache lora a weight."""
return _cache_weight(cache, weight, block_table)
@classmethod
def cache_lora_b(cls, cache: Tensor, weight: Tensor, block_table: Tensor):
"""cache lora b weight."""
return _cache_weight(cache, weight.t(), block_table)
def cache_lora_linear(self, lora_linear: torch.nn.Module, cache_a: Tensor,
cache_b: Tensor):
"""cache lora linear."""
name = self.adapter_name
target_modules = self.target_modules
block_table = self.block_table
block_start = 0
for target, target_meta in target_modules.items():
linear = lora_linear[target]
if not (name in linear.lora_A and name in linear.lora_B):
continue
linear_a = linear.lora_A[name]
linear_b = linear.lora_B[name]
weight_a = linear_a.weight
weight_b = linear_b.weight
assert weight_a is not None
assert weight_b is not None
rank = target_meta.rank
block_offset = block_table[block_start:block_start + rank]
block_start += rank
self.cache_lora_a(cache_a, weight_a, block_offset)
self.cache_lora_b(cache_b, weight_b, block_offset)
def cache_adapter(self, lora_linears: Dict, caches: List[List[Tensor]]):
"""cache all linear."""
assert len(lora_linears) == len(caches), (
'len(lora_linears) == len(caches)')
for idx, lora_linear in lora_linears.items():
assert idx < len(caches), 'idx < len(caches)'
cache_a, cache_b = caches[idx]
self.cache_lora_linear(lora_linear, cache_a, cache_b)
@dataclass
class SchedulerAdapter:
"""lora adapter."""
idx: int
adapter_path: str
adapter_name: str
config: Any
target_modules: List[str]
logical_blocks: LogicalTokenBlocks
adapter_manager: 'AdapterManager'
_active: bool = False
@classmethod
def from_pretrained(cls, adapter_path: str, adapter_name: str, idx: int,
manager: 'AdapterManager'):
"""from_pretrained."""
from peft import PeftConfig
config = PeftConfig.from_pretrained(adapter_path)
return cls.from_config(config,
adapter_name=adapter_name,
idx=idx,
manager=manager)
@classmethod
def from_config(cls, config: Any, adapter_name: str, idx: int,
manager: 'AdapterManager'):
"""from config."""
new_adapter = SchedulerAdapter(
idx,
adapter_path=config.base_model_name_or_path,
adapter_name=adapter_name,
config=config,
target_modules=list(config.target_modules),
logical_blocks=LogicalTokenBlocks(),
adapter_manager=manager)
new_adapter._active = False
return new_adapter
@property
def name(self):
"""get adapter name."""
return self.adapter_name
@property
def rank(self):
"""get rank."""
return self.config.r
@property
def scaling(self):
"""get scaling."""
return self.config.lora_alpha / self.rank
def is_actived(self):
"""check if adapter is active."""
return self._active
def active(self, flag: bool = True):
"""active adapter."""
self.adapter_manager._on_active(self, flag)
self._active = flag
def build_weight_map(self, block_table: Tensor):
return AdapterWeightMap.new(self.name,
rank=self.rank,
target_names=self.target_modules,
block_table=block_table,
scaling=self.scaling)
class AdapterManager:
"""Adapter manager."""
def __init__(self) -> None:
self._adapters: Dict[str, SchedulerAdapter] = dict()
self._adapter_count = 0
self._active_count = 0
self._add_non_adapter()
def _add_non_adapter(self):
"""add non adapter."""
from peft import LoraConfig
adapter_name = None
config = LoraConfig(r=0, target_modules=[])
adapter = self.add_adapter_from_config(config,
adapter_name=adapter_name)
adapter.active()
def _on_active(self, adapter: SchedulerAdapter, flag: bool):
"""on active."""
if adapter._active != flag:
if flag:
self._active_count += 1
else:
self._active_count -= 1
def _add_adapter(self, adapter: SchedulerAdapter):
"""add adapter."""
assert adapter.adapter_name not in self._adapters
self._adapters[adapter.adapter_name] = adapter
self._adapter_count += 1
return adapter
def add_adapter_from_config(self, config: Any, adapter_name: str):
"""add adapter from config."""
adapter = SchedulerAdapter.from_config(config,
adapter_name=adapter_name,
idx=self._adapter_count,
manager=self)
return self._add_adapter(adapter)
def add_adapter_from_pretrained(self, adapter_path: str,
adapter_name: str):
"""add adapter by path and name."""
adapter = SchedulerAdapter.from_pretrained(adapter_path,
adapter_name=adapter_name,
idx=self._adapter_count,
manager=self)
return self._add_adapter(adapter)
def get_adapter(self, name: str, default=None):
"""get adapter."""
return self._adapters.get(name, default)
def num_adapters(self):
"""get num adapters."""
return len(self._adapters)
ADAPTER_MANAGER = AdapterManager()
# Copyright (c) OpenMMLab. All rights reserved.
# modify from: https://github.com/vllm-project/vllm
from dataclasses import dataclass
import numpy as np
def _div_up(x, n):
"""perform div up."""
return (x + n - 1) // n
def _round_up(x, n):
"""perform round up."""
return _div_up(x, n) * n
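# Worked example (illustrative): _div_up(130, 128) == 2 and
# _round_up(130, 128) == 256, so reserving 130 slots on a 128-slot buffer
# pads it by one ALLOC_SIZE chunk to 256 entries (see reserve() below).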
class LogicalTokenBlocks:
"""Logical blocks."""
ALLOC_SIZE = 128
def __init__(self, blocks: np.ndarray = None):
if blocks is None:
self._blocks = np.zeros((self.ALLOC_SIZE, ), dtype=np.int64)
self._num_real = 0
else:
assert blocks.ndim == 1
self._blocks = blocks
self._num_real = len(blocks)
def reserve(self, size: int):
"""reserve cache size."""
num_blocks = self._blocks.size
if num_blocks >= size:
return
reserve_size = _round_up(size - num_blocks, self.ALLOC_SIZE)
self._blocks = np.pad(self._blocks, (0, reserve_size))
def __setitem__(self, *args, **kwargs):
"""set values."""
return self.get_real_blocks().__setitem__(*args, **kwargs)
def __getitem__(self, *args, **kwargs):
"""get values."""
return self.get_real_blocks().__getitem__(*args, **kwargs)
def get_real_blocks(self):
"""get logical blocks."""
return self._blocks[:self._num_real]
def append(self, blocks: np.ndarray):
"""append blocks."""
num_blocks = len(blocks)
self.reserve(num_blocks + self._num_real)
slice_start = self._num_real
slice_end = slice_start + num_blocks
self._num_real += num_blocks
self.__setitem__(slice(slice_start, slice_end), blocks)
def __len__(self):
"""get length."""
return self._num_real
def resize(self, num_blocks: int):
"""resize logical blocks."""
assert num_blocks <= len(self)
self._num_real = num_blocks
def reset(self):
"""reset."""
self.resize(0)
def clone(self):
"""clone logical blocks."""
ret = LogicalTokenBlocks()
ret.append(self[:])
return ret
@dataclass
class PhysicalTokenBlock:
"""Physical block used to schedule key value cache."""
device: str
block_id: int
block_size: int
ref_count: int = 0
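# A short self-contained sketch of how the logical/physical block types above
# are used; the block ids and block size are arbitrary illustrative values.
if __name__ == '__main__':
    blocks = LogicalTokenBlocks()
    blocks.append(np.array([0, 1, 2], dtype=np.int64))
    blocks.append(np.array([5], dtype=np.int64))
    assert len(blocks) == 4
    assert list(blocks[:]) == [0, 1, 2, 5]
    copied = blocks.clone()
    blocks.resize(2)  # keep only the first two blocks
    assert list(blocks[:]) == [0, 1]
    assert len(copied) == 4  # the clone keeps its own copy of the block ids
    phys = PhysicalTokenBlock(device='gpu', block_id=0, block_size=64)
    phys.ref_count += 1  # physical blocks are shared via reference counting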
# Copyright (c) OpenMMLab. All rights reserved.
from logging import Logger
from typing import List
from lmdeploy.utils import get_logger
def _handle_exception(e: Exception,
mod_name: str,
logger: Logger,
message: str = None):
red_color = '\033[31m'
reset_color = '\033[0m'
if message is None:
message = 'Please ensure it has been installed correctly.'
logger.debug('Exception', exc_info=1)
logger.error(f'{type(e).__name__}: {e}')
logger.error(f'{red_color}'
f'<{mod_name}> test failed!\n'
f'{message}'
f'{reset_color}')
exit(1)
def check_env_torch():
"""check PyTorch environment."""
logger = get_logger('lmdeploy')
try:
logger.debug('Checking <PyTorch> environment.')
import torch
a = torch.tensor([1, 2], device='cuda')
b = a.new_tensor([3, 4], device='cuda')
c = a + b
torch.testing.assert_close(c, a.new_tensor([4, 6]))
except Exception as e:
_handle_exception(e, 'PyTorch', logger)
def check_env_triton():
"""check OpenAI Triton environment."""
logger = get_logger('lmdeploy')
try:
logger.debug('Checking <Triton> environment.')
import torch
from .triton_custom_add import custom_add
a = torch.tensor([1, 2], device='cuda')
b = a.new_tensor([3, 4], device='cuda')
c = custom_add(a, b)
torch.testing.assert_close(c, a + b)
except Exception as e:
_handle_exception(e, 'Triton', logger)
def check_env():
"""check all environment."""
logger = get_logger('lmdeploy')
logger.info('Checking environment for PyTorch Engine.')
check_env_torch()
check_env_triton()
def check_transformers_version(model_path: str,
trust_remote_code: bool = True):
"""check transformers version."""
from packaging import version
logger = get_logger('lmdeploy')
def __check_transformers_version():
"""check transformers version."""
logger.debug('Checking <transformers> version.')
trans_version = None
try:
import transformers
trans_version = version.parse(transformers.__version__)
except Exception as e:
_handle_exception(e, 'transformers', logger)
return transformers, trans_version
def __check_config(trans_version):
"""check config."""
logger.debug('Checking <Model> AutoConfig.from_pretrained.')
try:
from transformers import AutoConfig
config = AutoConfig.from_pretrained(
model_path, trust_remote_code=trust_remote_code)
except Exception as e:
message = (
f'Load model config with transformers=={trans_version}'
' failed. '
'Please make sure model can be loaded with transformers API.')
_handle_exception(e, 'transformers', logger, message=message)
return config
def __check_model_transformers_version(config, trans_version):
"""check model transformers version."""
logger.debug('Checking <Model> required transformers version.')
try:
model_trans_version = getattr(config, 'transformers_version')
model_trans_version = version.parse(model_trans_version)
assert trans_version >= model_trans_version, 'Version mismatch.'
except Exception as e:
message = (f'model `{model_path}` requires '
f'transformers version {model_trans_version} '
f'but transformers {trans_version} is installed.')
_handle_exception(e, 'transformers', logger, message=message)
def __check_model_dtype_support(config):
"""Checking model dtype support."""
logger.debug('Checking <Model> dtype support.')
import torch
from lmdeploy.pytorch.config import ModelConfig
try:
model_config = ModelConfig.from_hf_config(config,
model_path=model_path)
if model_config.dtype == torch.bfloat16:
assert torch.cuda.is_bf16_supported(), (
'bf16 is not supported on your device')
except AssertionError as e:
message = (f'Your device does not support `{model_config.dtype}`. '
'Try edit `torch_dtype` in `config.json`.\n'
'Note that this might have negative effect!')
_handle_exception(e, 'Model', logger, message=message)
except Exception as e:
            message = (f'Checking failed with error {e}. '
                       'Please send issue to LMDeploy with error logs.')
_handle_exception(e, 'Model', logger, message=message)
return model_config
_, trans_version = __check_transformers_version()
config = __check_config(trans_version)
__check_model_transformers_version(config, trans_version)
__check_model_dtype_support(config)
def check_model(model_path: str, trust_remote_code: bool = True):
"""check model requirements."""
logger = get_logger('lmdeploy')
logger.info('Checking model.')
check_transformers_version(model_path, trust_remote_code)
def check_adapter(path: str):
"""check adapter."""
logger = get_logger('lmdeploy')
logger.debug(f'Checking <Adapter>: {path}.')
try:
from peft import PeftConfig
PeftConfig.from_pretrained(path)
except Exception as e:
message = ('Please make sure the adapter can be loaded with '
'`peft.PeftConfig.from_pretrained`\n')
err_msg = '' if len(e.args) == 0 else e.args[0]
if 'got an unexpected keyword argument' in err_msg:
            message += ('Or try removing the unexpected keywords '
                        'from `adapter_config.json`.')
_handle_exception(e, 'Model', logger, message=message)
def check_adapters(adapter_paths: List[str]):
"""check adapters."""
if len(adapter_paths) <= 0:
return
logger = get_logger('lmdeploy')
logger.info('Checking adapters.')
for path in adapter_paths:
check_adapter(path)
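# Usage sketch: these checks are meant to run before starting the engine.
# The model and adapter paths below are placeholders, not real checkpoints.
if __name__ == '__main__':
    check_env()  # verifies CUDA-enabled PyTorch and the Triton kernel
    check_model('/path/to/hf_model')  # placeholder path
    check_adapters(['/path/to/lora_adapter'])  # placeholder path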
# Copyright (c) OpenMMLab. All rights reserved.
import torch
import triton
import triton.language as tl
@triton.jit
def _add_kernel(A, B, C, size, BLOCK: tl.constexpr):
"""add kernel."""
prog_id = tl.program_id(0)
offs = prog_id * BLOCK + tl.arange(0, BLOCK)
a = tl.load(A + offs, mask=offs < size)
b = tl.load(B + offs, mask=offs < size)
tl.store(C + offs, a + b, mask=offs < size)
def custom_add(a, b):
"""custom add one."""
c = torch.empty_like(a)
size = c.size(0)
BLOCK = 16
grid = [triton.cdiv(size, BLOCK)]
_add_kernel[grid](a, b, c, size, BLOCK=BLOCK)
return c
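# Quick sanity sketch for the kernel above; it requires a CUDA device and
# mirrors the check performed in `check_env_triton`.
if __name__ == '__main__':
    x = torch.arange(32, device='cuda')
    y = torch.ones(32, dtype=torch.int64, device='cuda')
    torch.testing.assert_close(custom_add(x, y), x + y)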
# Copyright (c) OpenMMLab. All rights reserved.
from dataclasses import dataclass, field
from typing import Any, Dict
import torch
def _get_torch_dtype(config: Any, default: str = 'float16'):
"""Get the torch dtype from the model config.
Args:
config: Config of the hf model.
        default (str): default data type.
"""
torch_dtype = getattr(config, 'torch_dtype', default)
    # torch_dtype in config could be None or already a torch.dtype instance
    torch_dtype = torch_dtype or default
    if isinstance(torch_dtype, str):
        torch_dtype = getattr(torch, torch_dtype)
    return torch_dtype
@dataclass
class SchedulerConfig:
"""Config of scheduler."""
max_batches: int
max_session_len: int
max_request_output_len: int = 512
eviction_type: str = 'recompute'
prefill_interval: int = 16
max_active_adapters: int = 64
@dataclass
class CacheConfig:
"""Config of key value cache."""
block_size: int
num_cpu_blocks: int
num_gpu_blocks: int
window_size: int = -1
cache_max_entry_count: float = 0.8
max_prefill_token_num: int = 4096
@dataclass
class ModelConfig:
"""Config of model."""
hidden_size: int
num_layers: int
num_attention_heads: int
num_key_value_heads: int
bos_token_id: int
eos_token_id: int
head_dim: int
sliding_window: int = -1
dtype: torch.dtype = torch.float16
multi_query_attention: bool = False
vocab_size: int = 40000
json_config: dict = field(default_factory=dict)
hf_config: Any = None
init_kwargs: Dict[str, Any] = field(default_factory=dict)
def get_head_size(self):
"""get head size."""
return self.head_dim
@classmethod
def from_pretrained(cls,
pretrained_model_name_or_path: str,
trust_remote_code: bool = True):
"""build ModelConfig from model path or name."""
from transformers import AutoConfig
hf_config = AutoConfig.from_pretrained(
pretrained_model_name_or_path, trust_remote_code=trust_remote_code)
return cls.from_hf_config(hf_config, pretrained_model_name_or_path)
@classmethod
def from_hf_config(cls, hf_config: Any, model_path: str = None):
"""from huggingface config."""
if model_path is None:
model_path = ''
def __build_falcon():
"""build falcon."""
num_attention_heads = hf_config.num_attention_heads
if hf_config.new_decoder_architecture:
# 40b-instruct, GQA
kv_head = hf_config.num_kv_heads
            elif hf_config.multi_query:
# 7b-instruct, MQA
kv_head = 1
else:
# rw-1b, MHA
kv_head = num_attention_heads
head_dim = hf_config.hidden_size // num_attention_heads
return ModelConfig(
hidden_size=hf_config.hidden_size,
num_layers=hf_config.num_hidden_layers,
num_attention_heads=num_attention_heads,
num_key_value_heads=kv_head,
bos_token_id=hf_config.bos_token_id,
eos_token_id=hf_config.eos_token_id,
head_dim=head_dim,
multi_query_attention=hf_config.multi_query,
vocab_size=hf_config.vocab_size,
)
def __build_chatglm():
"""build chatglm."""
head_dim = hf_config.hidden_size // hf_config.num_attention_heads
bos_token_id = hf_config.bos_token_id
if bos_token_id is None:
bos_token_id = hf_config.pad_token_id
init_kwargs = dict(empty_init=False)
return ModelConfig(
hidden_size=hf_config.hidden_size,
num_layers=hf_config.num_layers,
num_attention_heads=hf_config.num_attention_heads,
num_key_value_heads=hf_config.multi_query_group_num,
bos_token_id=bos_token_id,
eos_token_id=hf_config.eos_token_id,
head_dim=head_dim,
vocab_size=hf_config.padded_vocab_size,
init_kwargs=init_kwargs)
def __build_gemma():
return ModelConfig(
hidden_size=hf_config.hidden_size,
num_layers=hf_config.num_hidden_layers,
num_attention_heads=hf_config.num_attention_heads,
num_key_value_heads=hf_config.num_key_value_heads,
bos_token_id=hf_config.bos_token_id,
eos_token_id=hf_config.eos_token_id,
head_dim=hf_config.head_dim,
vocab_size=hf_config.vocab_size)
def __build_default():
head_dim = hf_config.hidden_size // hf_config.num_attention_heads
num_attention_heads = hf_config.num_attention_heads
num_key_value_heads = getattr(hf_config, 'num_key_value_heads',
num_attention_heads)
use_sliding_window = getattr(hf_config, 'use_sliding_window', True)
sliding_window = -1
if use_sliding_window:
sliding_window = getattr(hf_config, 'sliding_window',
sliding_window) or -1
return ModelConfig(
hidden_size=hf_config.hidden_size,
num_layers=hf_config.num_hidden_layers,
num_attention_heads=hf_config.num_attention_heads,
num_key_value_heads=num_key_value_heads,
bos_token_id=hf_config.bos_token_id,
eos_token_id=hf_config.eos_token_id,
sliding_window=sliding_window,
head_dim=head_dim,
vocab_size=hf_config.vocab_size)
if 'falcon' in model_path:
model_config = __build_falcon()
elif 'chatglm' in model_path:
model_config = __build_chatglm()
elif hf_config.model_type == 'gemma':
model_config = __build_gemma()
else:
model_config = __build_default()
model_config.dtype = _get_torch_dtype(hf_config)
model_config.hf_config = hf_config
model_config.json_config = hf_config.to_dict()
return model_config
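# Sketch: building a ModelConfig from a locally constructed config instead of
# a checkpoint on disk; the llama-style shapes below are illustrative only.
if __name__ == '__main__':
    from transformers import LlamaConfig
    llama_cfg = LlamaConfig(hidden_size=4096,
                            num_hidden_layers=32,
                            num_attention_heads=32,
                            num_key_value_heads=8,
                            vocab_size=32000)
    model_config = ModelConfig.from_hf_config(llama_cfg, model_path='llama')
    print(model_config.head_dim,
          model_config.num_key_value_heads,
          model_config.dtype)  # 128 8 torch.float16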
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Callable, Union
import torch
from torch import Tensor, nn
from torch.distributed._tensor import (DeviceMesh, DTensor, Replicate, Shard,
distribute_tensor)
from lmdeploy.pytorch.models.q_modules import QLinear
try:
from peft.tuners.lora import Linear as LoRALinear
except ImportError:
class LoRALinear:
pass
def try_to_local(tensor: Union[Tensor, DTensor]):
"""Try to convert DTensor to Tensor.
Args:
tensor (Tensor|DTensor): Tensor to convert.
"""
if isinstance(tensor, DTensor):
tensor = tensor.to_local()
return tensor
def module_to_local(module: nn.Module):
"""convert all DTensor parameters to Tensor parameters in module.
Args:
module (Module): Module to convert.
"""
for name, mod in module.named_children():
module_to_local(mod)
for name, param in module.named_parameters(recurse=False):
module.register_parameter(name, nn.Parameter(try_to_local(param)))
for name, buf in module.named_buffers(recurse=False):
module.register_buffer(name, try_to_local(buf))
def rowwise_parallelize_linear(module: nn.Module,
device_mesh: DeviceMesh,
to_local: bool = False) -> None:
"""
This function parallelizes the input :class:`nn.Linear` module in
:class:`RowwiseParallel` style.
Args:
module (:class:`nn.Module`):
The :class:`nn.Linear` module to be parallelized.
device_mesh (:class:`DeviceMesh`):
Object which describes the mesh topology of devices.
Returns:
None
"""
for name, param in module.named_parameters():
dist_spec = ([Shard(1)] if name == 'weight' else
[Replicate()] # type: ignore[list-item]
)
dist_tensor = distribute_tensor(param, device_mesh, dist_spec)
if to_local:
dist_tensor = try_to_local(dist_tensor)
if name == 'bias':
                # rowwise linear would add bias more than once.
dist_tensor /= device_mesh.size()
dist_param = torch.nn.Parameter(dist_tensor)
module.register_parameter(name, dist_param)
# Weight, bias and scale are registered as buffer in QLinear
for name, buffer in module.named_buffers():
dist_spec = ([Shard(1)] if name == 'weight' else
[Replicate()] # type: ignore[list-item]
)
dist_tensor = distribute_tensor(buffer, device_mesh, dist_spec)
if to_local:
dist_tensor = try_to_local(dist_tensor)
if name == 'bias':
                # rowwise linear would add bias more than once.
dist_tensor /= device_mesh.size()
module.register_buffer(name, dist_tensor)
def rowwise_parallelize_loralinear(module: LoRALinear,
device_mesh: DeviceMesh,
to_local: bool = False) -> None:
"""rowwize parallelize lora linear.
Read S-LoRA for more detail.
"""
rowwise_parallelize_linear(module.base_layer,
device_mesh=device_mesh,
to_local=to_local)
for mod in module.lora_A.values():
rowwise_parallelize_linear(mod,
device_mesh=device_mesh,
to_local=to_local)
for mod in module.lora_B.values():
colwise_parallelize_linear(mod,
device_mesh=device_mesh,
to_local=to_local)
module._tp_mode = 'rowwise'
def rowwise_parallelize_linear_fn(module: nn.Module,
device_mesh: DeviceMesh,
to_local: bool = False) -> None:
"""
    This function parallelizes the input :class:`nn.Linear` module in
:class:`RowwiseParallel` style.
Args:
module (:class:`nn.Module`):
The :class:`nn.Linear` module to be parallelized.
device_mesh (:class:`DeviceMesh`):
Object which describes the mesh topology of devices.
Returns:
None
"""
if isinstance(module, (torch.nn.Linear, QLinear)):
return rowwise_parallelize_linear(module,
device_mesh=device_mesh,
to_local=to_local)
elif isinstance(module, LoRALinear):
return rowwise_parallelize_loralinear(module,
device_mesh=device_mesh,
to_local=to_local)
else:
raise TypeError(f'Unsupported module: {type(module)}')
def colwise_parallelize_linear(module: nn.Module,
device_mesh: DeviceMesh,
to_local: bool = False) -> None:
"""
This function parallelizes the input :class:`nn.Linear` module in
:class:`ColwiseParallel` style.
Args:
module (:class:`nn.Module`):
The :class:`nn.Linear` module to be parallelized.
device_mesh (:class:`DeviceMesh`):
Object which describes the mesh topology of devices.
Returns:
None
"""
for name, param in module.named_parameters():
dist_tensor = distribute_tensor(param, device_mesh, [Shard(0)])
if to_local:
dist_tensor = try_to_local(dist_tensor)
dist_param = torch.nn.Parameter(dist_tensor)
module.register_parameter(name, dist_param)
# Weight, bias and scale are registered as buffer in QLinear
for name, buffer in module.named_buffers():
dist_tensor = distribute_tensor(buffer, device_mesh, [Shard(0)])
if to_local:
dist_tensor = try_to_local(dist_tensor)
module.register_buffer(name, dist_tensor)
def colwise_parallelize_loralinear(module: nn.Module,
device_mesh: DeviceMesh,
to_local: bool = False) -> None:
"""colwise parallelize lora linear."""
colwise_parallelize_linear(module.base_layer,
device_mesh=device_mesh,
to_local=to_local)
for mod in module.lora_A.values():
colwise_parallelize_linear(mod,
device_mesh=device_mesh,
to_local=to_local)
for mod in module.lora_B.values():
colwise_parallelize_linear(mod,
device_mesh=device_mesh,
to_local=to_local)
module._tp_mode = 'colwise'
def colwise_parallelize_linear_fn(module: nn.Module,
device_mesh: DeviceMesh,
to_local: bool = False) -> None:
"""
    This function parallelizes the input :class:`nn.Linear` module in
:class:`ColwiseParallel` style.
Args:
module (:class:`nn.Module`):
The :class:`nn.Linear` module to be parallelized.
device_mesh (:class:`DeviceMesh`):
Object which describes the mesh topology of devices.
Returns:
None
"""
if isinstance(module, (torch.nn.Linear, QLinear)):
return colwise_parallelize_linear(module,
device_mesh=device_mesh,
to_local=to_local)
elif isinstance(module, LoRALinear):
return colwise_parallelize_loralinear(module,
device_mesh=device_mesh,
to_local=to_local)
else:
raise TypeError(f'Unsupported module: {type(module)}')
def _partition_module(
mod_name: str,
prefix: str,
module: nn.Module,
device_mesh: DeviceMesh,
func: Callable,
):
"""partition module.
    Parameters in the module won't be forced to be replicated.
Args:
mod_name (str): module name.
prefix (str): Parameter prefix.
module (Module): Module to be partitioned.
device_mesh (DeviceMesh): The device mesh.
func (Callable): partition callback
"""
for name, mod in module.named_children():
child_name = f'{prefix}{name}'
_partition_module(child_name,
child_name + '.',
module=mod,
device_mesh=device_mesh,
func=func)
func(mod_name, module, device_mesh)
def partition_module(module: nn.Module,
device_mesh: DeviceMesh,
func: Callable,
to_local: bool = False):
"""partition module.
    Parameters in the module won't be forced to be replicated.
Args:
module (Module): Module to be partitioned.
device_mesh (DeviceMesh): The device mesh.
func (Callable): partition callback.
to_local (bool): Convert all DTensor parameters to Tensor parameters.
"""
_partition_module('',
'',
module=module,
device_mesh=device_mesh,
func=func)
if to_local:
module_to_local(module)
def replicate_module(model: nn.Module, device_mesh: DeviceMesh):
"""Replicate all parameters in module.
Args:
model (Module): Module to perform replicate.
device_mesh (DeviceMesh): The distribution device mesh.
"""
for name, param in model.named_parameters(recurse=False):
param = distribute_tensor(param,
device_mesh=device_mesh,
placements=[Replicate()]).to_local()
param = nn.Parameter(param)
model.register_parameter(name, param)
for name, buf in model.named_buffers(recurse=False):
buf = distribute_tensor(buf,
device_mesh=device_mesh,
placements=[Replicate()]).to_local()
model.register_buffer(name, buf)
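# Sketch of how the helpers above compose for tensor parallelism. It assumes
# the process group has already been initialised (e.g. via `torchrun`) and
# that each rank owns one CUDA device; the two-layer MLP is a stand-in for a
# real transformer block.
def _demo_tensor_parallel(world_size: int) -> nn.Module:
    """Column-parallelize the first linear and row-parallelize the second."""
    import torch.distributed as dist
    assert dist.is_initialized(), 'launch with torchrun/deepspeed first'
    device_mesh = DeviceMesh('cuda', list(range(world_size)))
    block = nn.Sequential(nn.Linear(8, 16), nn.Linear(16, 8))
    def _dist_fn(mod_name: str, mod: nn.Module, mesh: DeviceMesh):
        # the Sequential container itself (empty name) is left untouched
        if mod_name.endswith('0'):
            colwise_parallelize_linear_fn(mod, mesh, to_local=True)
        elif mod_name.endswith('1'):
            rowwise_parallelize_linear_fn(mod, mesh, to_local=True)
    partition_module(block, device_mesh, func=_dist_fn, to_local=True)
    return block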
# Copyright (c) OpenMMLab. All rights reserved.
from .engine import Engine
__all__ = ['Engine']
# Copyright (c) OpenMMLab. All rights reserved.
# modify from: https://github.com/vllm-project/vllm
from typing import Dict, List, Tuple
import torch
from torch.distributed._tensor import DeviceMesh
from lmdeploy.utils import get_logger
from ..config import CacheConfig, ModelConfig
KVCache = Tuple[torch.Tensor, torch.Tensor]
logger = get_logger('lmdeploy')
class CacheEngine:
"""Host and Device memory maintainer.
Args:
cache_config (CacheConfig): config of the cache information.
model_config (ModelConfig): config of the model.
rank (int): distribution rank, 0 on non-distributed environment.
world_size (int): distribution world size, 1 on non-distributed
environment.
device_mesh (DeviceMesh): distribution device mesh.
"""
def __init__(
self,
cache_config: CacheConfig,
model_config: ModelConfig,
rank: int = 0,
world_size: int = 1,
device_mesh: DeviceMesh = None,
) -> None:
if rank == 0:
logger.info(f'build CacheEngine with config:{cache_config}')
self.rank = rank
self.world_size = world_size
if device_mesh is None and self.world_size > 1:
device_mesh = DeviceMesh('cuda', list(range(self.world_size)))
self.device_mesh = device_mesh
self.cache_config = cache_config
self.model_config = model_config
self.block_size = cache_config.block_size
self.head_size = model_config.get_head_size()
self.num_layers = model_config.num_layers
self.num_heads = model_config.num_key_value_heads
if 'kv_cache_dtype' in model_config.json_config:
self.kv_cache_dtype = eval(
model_config.json_config['kv_cache_dtype'])
else:
self.kv_cache_dtype = model_config.dtype
# Initialize the cache.
self.local_gpu_cache = self.allocate_gpu_cache()
self.local_cpu_cache = self.allocate_cpu_cache()
# Initialize the stream for caching operations.
self.cache_stream = torch.cuda.Stream()
assert self.cache_stream != torch.cuda.current_stream()
# Initialize the events for stream synchronization.
self.events = [torch.cuda.Event() for _ in range(self.num_layers)]
logger.debug(
f'Initialize cache engine with {cache_config.num_gpu_blocks}'
f' gpu blocks and {cache_config.num_cpu_blocks} cpu blocks.')
@property
def cpu_cache(self):
"""gpu cache."""
return self.local_cpu_cache
@property
def gpu_cache(self):
"""gpu cache."""
return self.local_gpu_cache
@property
def num_gpu_blocks(self):
"""num gpu blocks."""
return self.cache_config.num_gpu_blocks
@property
def num_cpu_blocks(self):
"""num gpu blocks."""
return self.cache_config.num_cpu_blocks
def get_key_block_shape(self, local: bool = False) -> Tuple[int, int, int]:
"""get shape of key block."""
num_heads = self.num_heads
if local and not self.model_config.multi_query_attention:
assert self.num_heads % self.world_size == 0, \
f'num_heads: {self.num_heads}, world_size: {self.world_size}'
num_heads = self.num_heads // self.world_size
return (
self.block_size,
num_heads,
self.head_size,
)
def get_value_block_shape(self,
local: bool = False) -> Tuple[int, int, int]:
"""get shape of value block."""
num_heads = self.num_heads
if local and not self.model_config.multi_query_attention:
assert self.num_heads % self.world_size == 0, \
f'num_heads: {self.num_heads}, world_size: {self.world_size}'
num_heads = self.num_heads // self.world_size
return (
self.block_size,
num_heads,
self.head_size,
)
def allocate_gpu_cache(self):
"""allocate caches on GPU."""
gpu_cache: List[KVCache] = []
key_block_shape = self.get_key_block_shape(local=True)
value_block_shape = self.get_value_block_shape(local=True)
for _ in range(self.num_layers):
key_blocks = torch.empty(
size=(self.num_gpu_blocks, *key_block_shape),
dtype=self.kv_cache_dtype,
device='cuda',
)
value_blocks = torch.empty(
size=(self.num_gpu_blocks, *value_block_shape),
dtype=self.kv_cache_dtype,
device='cuda',
)
gpu_cache.append((key_blocks, value_blocks))
return gpu_cache
def allocate_cpu_cache(self):
"""allocate caches on Host."""
cpu_cache: List[KVCache] = []
key_block_shape = self.get_key_block_shape(local=True)
value_block_shape = self.get_value_block_shape(local=True)
        # TODO: pin memory might need to be disabled on WSL
pin_memory = True
for _ in range(self.num_layers):
key_blocks = torch.empty(
size=(self.num_cpu_blocks, *key_block_shape),
dtype=self.kv_cache_dtype,
pin_memory=pin_memory,
)
value_blocks = torch.empty(
size=(self.num_cpu_blocks, *value_block_shape),
dtype=self.kv_cache_dtype,
pin_memory=pin_memory,
)
cpu_cache.append((key_blocks, value_blocks))
return cpu_cache
def _swap(self, src: List[KVCache], dst: List[KVCache],
src_to_dst: Dict[int, int]):
"""Move caches from src memory to dst memory.
Args:
src (List[KVCache]): Source cache.
dst (List[KVCache]): Destination cache.
src_to_dst (Dict[int, int]): Map between src and dst.
"""
with torch.cuda.stream(self.cache_stream):
for i in range(self.num_layers):
src_key_cache, src_value_cache = src[i]
dst_key_cache, dst_value_cache = dst[i]
for src_id, dst_id in src_to_dst.items():
dst_key_cache[dst_id].copy_(src_key_cache[src_id])
dst_value_cache[dst_id].copy_(src_value_cache[src_id])
event = self.events[i]
event.record(stream=self.cache_stream)
def swap_in(self, src_to_dst: Dict[int, int]) -> None:
"""Move cache from Host to Device.
Args:
src_to_dst (Dict[int, int]): Map between src and dst.
"""
self._swap(self.local_cpu_cache, self.local_gpu_cache, src_to_dst)
def swap_out(self, src_to_dst: Dict[int, int]) -> None:
"""Move cache from Device to Host.
Args:
src_to_dst (Dict[int, int]): Map between src and dst.
"""
self._swap(self.local_gpu_cache, self.local_cpu_cache, src_to_dst)
@staticmethod
def get_cache_block_size(block_size: int,
model_config: ModelConfig,
world_size: int = 1) -> int:
"""Get the required cache size of the model.
        Args:
            block_size (int): The token numbers of the block.
            model_config (ModelConfig): The config of the model.
            world_size (int): The distribution world size.
Return:
int: Required memory size in bytes.
"""
head_size = model_config.get_head_size()
num_layers = model_config.num_layers
num_heads = model_config.num_key_value_heads
if not model_config.multi_query_attention:
num_heads = num_heads // world_size
key_cache_block = block_size * num_heads * head_size
value_cache_block = key_cache_block
total = num_layers * (key_cache_block + value_cache_block)
dtype_size = _get_dtype_size(model_config.dtype)
return dtype_size * total
def _get_dtype_size(dtype: torch.dtype) -> int:
"""get size of the given dtype.
Args:
dtype (torch.dtype): Data type.
Return:
int: size in bytes.
"""
return torch.tensor([], dtype=dtype).element_size()
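# Sketch: sizing the kv cache without allocating anything on the GPU; the
# llama-7B-like shapes below are illustrative values, not a loaded model.
if __name__ == '__main__':
    demo_config = ModelConfig(hidden_size=4096,
                              num_layers=32,
                              num_attention_heads=32,
                              num_key_value_heads=32,
                              bos_token_id=1,
                              eos_token_id=2,
                              head_dim=128)
    block_bytes = CacheEngine.get_cache_block_size(
        block_size=64, model_config=demo_config)
    # 64 tokens * 32 heads * 128 dims * 2 (key + value) * 32 layers * 2 bytes
    print(f'{block_bytes / (1 << 20):.1f} MiB per block')  # 32.0 MiB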