Commit 56215723 authored by zhouxiang's avatar zhouxiang

1. Sync with the latest upstream version; 2. Add a batch inference interface; 3. Fix a memory leak; 4. Fix choppy streaming output for the llama-family models

parent 44be91d3
......@@ -4,39 +4,9 @@ import sys
import struct
import numpy as np
import argparse
from .utils import convert
HF_INSTALLED = False
try:
import torch
from transformers import AutoTokenizer, AutoModel # chatglm
from transformers import LlamaTokenizer, LlamaForCausalLM # alpaca
from transformers import AutoModelForCausalLM, AutoTokenizer # baichuan, moss
from peft import PeftModel
HF_INSTALLED = True
except Exception as e:
logging.error("Make sure that you installed transformers and peft!!!")
sys.exit(1)
MODEL_DICT = {
"alpaca":{
"tokenizer": "minlik/chinese-alpaca-33b-merged",
"model": "minlik/chinese-alpaca-33b-merged"
},
"baichuan7B":{
"model": "baichuan-inc/baichuan-7B",
"tokenizer": "baichuan-inc/baichuan-7B",
"peft": "hiyouga/baichuan-7b-sft",
},
"chatglm6B":{
"tokenizer": "THUDM/chatglm-6b",
"model": "THUDM/chatglm-6b"
},
"moss":{
"model": "fnlp/moss-moon-003-sft",
"tokenizer": "fnlp/moss-moon-003-sft",
}
}
from .utils import convert
from .utils.converter import QuantType
def parse_args():
# -p: local model path or Hugging Face model id
......@@ -51,67 +21,24 @@ def parse_args():
help='lora model path')
parser.add_argument('-m', dest='model', default='chatglm6B',
help='model name with(alpaca, baichuan7B, chatglm6B, moss)')
parser.add_argument('-q', dest='qbit', type=int,
parser.add_argument('-q', dest='q_bit', type=int,
help='model quantization bit')
args = parser.parse_args()
return args
def alpaca(model_path):
tokenizer = LlamaTokenizer.from_pretrained(model_path)
model = LlamaForCausalLM.from_pretrained(model_path).float()
return model, tokenizer
def baichuan7B(model_path, peft_path):
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", trust_remote_code=True)
model = PeftModel.from_pretrained(model, peft_path).float()
layers = model.model.model.layers
for i in range(len(layers)):
layers[i].self_attn.W_pack.weight += torch.mm(layers[i].self_attn.W_pack.lora_B.default.weight, layers[i].self_attn.W_pack.lora_A.default.weight) * layers[i].self_attn.W_pack.scaling["default"]
return model, tokenizer
def chatglm6B(model_path, ):
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(model_path, trust_remote_code=True).float()
model = model.eval()
return model, tokenizer
def moss(model_path, ):
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).float()
model = model.eval()
return model, tokenizer
def main(args=None):
assert HF_INSTALLED, "Make sure that you installed transformers and peft before convert!!!"
if not args:
args = parse_args()
if args.model not in MODEL_DICT:
assert f"Not Support {args.model} Yet!!!"
model_args = {}
model_args["model_path"] = MODEL_DICT[args.model].get("model")
if MODEL_DICT[args.model].has_key("peft"):
model_args["peft_path"] = MODEL_DICT[args.model].get("peft")
if args.model_path:
model_args["model_path"] = args.model_path[0]
if len(args.model_path) > 2:
model_args["peft_path"] = args.model_path[2]
model, tokenizer = globals().get(args.model)(**model_args)
export_path = args.export_path or f"{args.model}-fp32.bin"
convert(export_path, model.model, tokenizer)
if not args: args = parse_args()
if args.qbit:
import pyfastllm as fastllm
export_name, export_ext = export_path.split('.')
q_export_path = export_name + f"-q{args.qbit}." + export_ext
flm_model = fastllm.create_llm(export_path)
flm_model.save_lowbit_model(q_export_path, args.qbit)
quant_type_to_qbit = {
QuantType.FP32: 32,
QuantType.FP16: 16,
QuantType.INT8: 8,
QuantType.INT4: 4,
}
qbit_to_quant_type = {v: k for k, v in quant_type_to_qbit.items()}
q_type = qbit_to_quant_type[args.q_bit]
convert(args.model_path, args.export_path, q_type=q_type)
if __name__ == "__main__":
args = parse_args()
......
import pyfastllm
def embedding(data: pyfastllm.Tensor, ):
# some check
return pyfastllm.embedding(data, )
def rms_norm(input:pyfastllm.Tensor, weight: pyfastllm.Tensor, eps: float, output: pyfastllm.Tensor=None):
output = pyfastllm.rms_norm(input, weight, eps)
return output
def layer_norm(input: pyfastllm.Tensor,
gamma: pyfastllm.Tensor,
beta: pyfastllm.Tensor,
axis:int=-1 ):
output = pyfastllm.layer_norm(input, gamma, beta,axis)
return output
def linear(input: pyfastllm.Tensor,
weight: pyfastllm.Tensor,
bias: pyfastllm.Tensor):
output = pyfastllm.linear(input, weight, bias)
return output
def matmul(input0: pyfastllm.Tensor,
input1: pyfastllm.Tensor,
alpha: pyfastllm.Tensor):
output = pyfastllm.matmul(input0, input1, alpha)
return output
def attention(q: pyfastllm.Tensor,
k: pyfastllm.Tensor,
v: pyfastllm.Tensor,
mask: pyfastllm.Tensor,
group: int,
scale: float,
attentionType: int):
output = pyfastllm.attention(q, k, v, mask, group, scale, attentionType)
return output
def activation(input: pyfastllm.Tensor, axis=-1, activate_type="silu"):
assert activate_type in ("softmax", "silu", "gelu", "swiglu")
func = getattr(pyfastllm, activate_type)
if activate_type == "softmax":
return func(input, axis)
return func(input)
def mul(input: pyfastllm.Tensor, v: int):
output = pyfastllm.mul(input, v)
return output
def matmul_transB():
pass
def add(input0: pyfastllm.Tensor, input1: pyfastllm.Tensor):
output = pyfastllm.add(input0, input1)
return output
def AttentionMask():
pass
def AlibiMask():
pass
def topk():
pass
def RotatePosition2D():
pass
def NearlyRotatePosition2D():
pass
def LlamaRotatePosition2D():
pass
def RepeatPenalty():
pass
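# Illustrative sketch (not part of the diff): composing the wrappers above into a
# pre-norm MLP block. All arguments are assumed to be pyfastllm.Tensor objects
# created elsewhere; the function and parameter names are hypothetical.
def mlp_block(x, norm_weight, w_up, b_up, w_down, b_down, eps=1e-6):
    h = rms_norm(x, norm_weight, eps)           # pre-normalization
    h = linear(h, w_up, b_up)                   # up projection
    h = activation(h, activate_type="silu")     # SiLU non-linearity
    return linear(h, w_down, b_down)            # down projection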
#!encoding=utf8
import os
import tempfile
from typing import List, Tuple
import re
import pyfastllm
from . import utils
from .utils.quantizer import QuantType
class InferConfig():
def __init__(self,
max_length:int=2048,
top_p:float=0.7,
temperature:float=0.95,
**kwargs) -> None:
configs = {
"max_length": max_length,
"top_p": top_p,
"temperature": temperature
}
configs.update(kwargs)
self.from_dict(configs)
def from_dict(self, configs):
self.configs = configs
for key, val in configs.items():
setattr(self, key, val)
def to_dict(self, ):
return self.configs
@property
def flm_config(self, ):
flm_config = pyfastllm.GenerationConfig()
for attr, val in self.configs.items():
setattr(flm_config, attr, val)
return flm_config
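# Illustrative helper (not part of the diff): InferConfig mirrors its keyword
# arguments onto a pyfastllm.GenerationConfig through the flm_config property;
# only pass fields that GenerationConfig actually exposes.
def _example_build_config():
    cfg = InferConfig(max_length=512, top_p=0.8, temperature=0.7)
    return cfg.flm_config  # pyfastllm.GenerationConfig carrying the same values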
class BaseModel():
def __init__(self, model_path:str) -> None:
if model_path.endswith('flm'):
print("loading model:", pyfastllm.get_llm_type(model_path))
self.model = pyfastllm.create_llm(model_path)
elif os.path.isdir(model_path):
_, save_path = tempfile.mkstemp()
utils.convert(model_path, save_path, q_type=QuantType.INT4)
self.model = pyfastllm.create_llm(save_path)
else:
raise NotImplementedError(f"unsupport model type!")
def build_input(self, query, history):
raise NotImplementedError
def is_stop(self, token_id):
raise NotImplementedError
def process_response(self, response):
raise NotImplementedError
def stream_chat(self,
tokenizer=None,
query:str='',
history=None,
max_length:int=2048,
top_p:float=0.7,
temperature:float=0.95,
*args, **kwargs):
model = self.model
infer_config = InferConfig(max_length=max_length, top_p=top_p, temperature=temperature, **kwargs)
if not tokenizer: tokenizer = model.weight.tokenizer
if not history: history = []
prompt = self.build_input(query,history)
input_ids = tokenizer.encode(prompt)
handle = model.launch_response(input_ids, infer_config.flm_config)
outputs = []
ret_str = ""
while len(outputs) < max_length:
resp_token = model.fetch_response(handle)
if self.is_stop(resp_token):
break
outputs.append(resp_token)
content = tokenizer.decode(outputs)
ret_str = self.process_response(content)
yield ret_str, history + [(query, ret_str)]
def chat(self,
tokenizer=None,
query:str='',
history=None,
max_length:int=2048,
top_p:float=0.7,
temperature:float=0.95,
*args, **kwargs):
model = self.model
infer_config = InferConfig(max_length=max_length, top_p=top_p, temperature=temperature, **kwargs)
if not tokenizer: tokenizer = model.weight.tokenizer
if not history: history = []
prompt = self.build_input(query, history=history)
input_ids = tokenizer.encode(prompt)
handle = model.launch_response(input_ids, infer_config.flm_config)
outputs = []
ret_str = ""
while len(outputs) < max_length:
resp_token = model.fetch_response(handle)
if self.is_stop(resp_token):
break
outputs.append(resp_token)
content = tokenizer.decode(outputs)
ret_str = self.process_response(content)
history.append((query, ret_str))
return ret_str, history
class ChatglmModel(BaseModel):
def process_response(self, response):
response = response.strip()
response = response.replace("[[训练时间]]", "2023年")
return response
def is_stop(self, token_id):
return token_id <= 2
def build_input(self, query, history=None):
if not history: history = []
prompt = ""
for i, (old_query, response) in enumerate(history):
prompt += "[Round {}]\n问:{}\n答:{}\n".format(i, old_query, response)
prompt += "[Round {}]\n问:{}\n答:".format(len(history), query)
return prompt
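# Example: with history = [("Hi", "Hello!")] and query = "How are you?",
# build_input() above produces:
#   "[Round 0]\n问:Hi\n答:Hello!\n[Round 1]\n问:How are you?\n答:"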
class QwenModel(BaseModel):
def process_response(self, response):
return response
def is_stop(self, token_id):
chat_format = self.model.get("chat_format", "chatml")
if chat_format == "raw":
stop_words_ids = [151643]
elif chat_format == "chatml":
stop_words_ids = [151645, 151644]
return token_id in stop_words_ids
def build_input(self, query, history=None):
prompt = ""
chat_format = self.model.get("chat_format", "chatml")
if chat_format == "chatml":
if history is None: history = []
prompt = f"{self.model.im_start} system \n {self.model.pre_prompt} + {self.model.im_end}"
for i, (old_query, response) in enumerate(history):
prompt += old_query + response
prompt += f"\n {self.model.im_start + self.model.user_role} \n {query + self.model.im_end} \n {self.model.im_start + self.model.bot_role} \n"
elif chat_format == "raw":
prompt = query
else:
raise NotImplementedError(f"Unknown char_format for QWen: {chat_format}")
return prompt
class BaichuanModel(BaseModel):
def process_response(self, response):
return response
def is_stop(self, token_id):
return token_id == 2
def build_input(self, query, history=None):
prompt = ""
round = 0
# TODO: truncate the history to the maximum length
for i, (role, content) in enumerate(history):
if role == "system" and i == 0:
prompt += content
elif role == "user":
round += 1
prompt += f"<reserved_102>{content}"
elif role == "assistant":
prompt += f"<reserved_103>{content}"
return prompt
class MossModel(BaseModel):
def process_response(self, response):
return response
def is_stop(self, token_id):
return token_id == 106068
def build_input(self, query, history=None):
prompt = self.model.pre_prompt
if not history: history = []
for i, (old_query, response) in enumerate(history):
prompt += old_query + response
return prompt + f"{self.model.user_role} {query} {self.model.bot_role}"
class AutoFlmModel:
def __init__(self) -> None:
raise NotImplementedError
@classmethod
def from_pretrained(cls, model_path:str):
# hf_model
if os.path.isdir(model_path):
_, save_path = tempfile.mkstemp(suffix='flm')
utils.convert(model_path, save_path, q_type=QuantType.INT4)
model_path = save_path
if model_path.endswith('flm'):
model_type = pyfastllm.get_llm_type(model_path)
else:
raise NotImplementedError(f"unsupport model type!")
if model_type == "chatglm":
model = ChatglmModel(model_path)
elif model_type == "qwen":
model = QwenModel(model_path)
elif model_type == "baichuan":
model = BaichuanModel(model_path)
elif model_type == "moss":
model = MossModel(model_path)
else:
raise NotImplementedError(f"unsupport model: {model_type}!")
return model
\ No newline at end of file
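# Illustrative usage (not part of the diff); the model path is a placeholder.
def _example_stream_chat():
    model = AutoFlmModel.from_pretrained("chatglm-6b-int4.flm")
    for reply, history in model.stream_chat(query="Hello"):
        print(reply)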
from typing import Any
class Module():
def __init__(self) -> None:
pass
def __call__(self, *args: Any, **kwds: Any) -> Any:
return self.forward(*args, **kwds)
def forward(self, *args, **kwds):
pass
def _init_weight(self, ):
pass
from BaseModule import Module
from . import torch2flm
from transformers import AutoConfig, AutoModel, AutoModelForCausalLM, AutoTokenizer
def convert(model, tokenizer, output_path, **args):
torch2flm.tofile(output_path, model, tokenizer, **args)
from .quantizer import QuantType
from .converter import ChatglmConverter, BaichuanConverter, QwenConverter, MossConverter
def convert(hf_model_name_or_path:str, save_path:str, q_type=QuantType.INT4):
config = AutoConfig.from_pretrained(hf_model_name_or_path, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(hf_model_name_or_path, trust_remote_code=True)
if "Baichuan" in config.architectures:
model = AutoModelForCausalLM.from_pretrained(hf_model_name_or_path, trust_remote_code=True).cpu().eval()
converter = BaichuanConverter(model=model, tokenizer=tokenizer, q_type=q_type)
elif "ChatGLM" in config.architectures:
model = AutoModel.from_pretrained(hf_model_name_or_path, trust_remote_code=True).cpu().eval()
converter = ChatglmConverter(model=model, tokenizer=tokenizer, q_type=q_type)
elif "Qwen" in config.architectures:
model = AutoModelForCausalLM.from_pretrained(hf_model_name_or_path, trust_remote_code=True, fp16=True).cpu().eval()
converter = QwenConverter(model=model, tokenizer=tokenizer, q_type=q_type)
elif "Moss" in config.architectures:
model = AutoModelForCausalLM.from_pretrained(hf_model_name_or_path, trust_remote_code=True).cpu().eval()
converter = MossConverter(model=model, tokenizer=tokenizer, q_type=q_type)
else:
raise NotImplementedError(f"Unsupport model: {config.architectures}")
converter.dump(save_path)
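# Illustrative usage (not part of the diff); the HF model id and output path are placeholders.
def _example_convert():
    convert("THUDM/chatglm-6b", "chatglm-6b-int4.flm", q_type=QuantType.INT4)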
import struct
from typing import Any
import numpy as np
import torch
from .writer import Writer
from .quantizer import QuantType
class BaseConverter():
def __init__(self, model, tokenizer, q_type=0) -> None:
self.model = model
self.tokenizer = tokenizer
self.q_type = q_type
def get_model_info(self):
model_info = self.model.config.__dict__
if self.model.generation_config is not None:
model_info.update(self.model.generation_config.__dict__)
model_info["tokenizer_use_score"] = "1"
return model_info
def get_vocab(self, ):
raise NotImplementedError
def get_weights(self):
state_dict = self.model.state_dict()
if hasattr(self.model, "peft_config"):
state_dict = {key.replace('base_model.model.', ''): val for key, val in state_dict.items()}
state_dict = {key: val.numpy().astype(np.float32) for key, val in state_dict.items()}
for name, m in self.model.named_modules():
if isinstance(m, torch.nn.Linear):
if self.q_type == QuantType.FP16:
state_dict[name+".weight.fp16"] = state_dict[name+".weight"].astype(np.float16)
state_dict.pop(name+".weight")
elif self.q_type == QuantType.INT8:
state_dict[name+".weight.int8"] = state_dict[name+".weight"]
state_dict.pop(name+".weight")
elif self.q_type == QuantType.INT4:
state_dict[name+".weight.int4"] = state_dict[name+".weight"]
state_dict.pop(name+".weight")
return state_dict
def convert_model_info(self, wt:Writer):
model_info = self.get_model_info()
model_info = {
str(key): str(val)
for key, val in model_info.items()
}
wt.write(model_info)
def convert_tokenizer(self, wt:Writer):
vocab = self.get_vocab()
vocab_len = len(vocab)
wt.write(int(vocab_len))
for i, key in enumerate(vocab):
# wt.write(len(key))
# for c in key: wt.write(int(c))
wt.write(key)
wt.write(int(i))
wt.write(float(vocab[key]))
def convert_weights(self, wt:Writer):
state_dict = self.get_weights()
wt.write(len(state_dict))
tot = 0
for name, tensor in state_dict.items():
print(f"{name} : {tensor.shape}")
if name.endswith("int4") or name.endswith("int8") or name.endswith("fp16"):
wt.write(str(name[:-5]))
wt.write_tensor(tensor, self.q_type)
else:
wt.write(str(name))
wt.write(tensor)
print("output (", tot, "/", len(state_dict), end = " )\r")
tot += 1
print("\nfinish.")
def forward(self, wt:Writer, *args: Any, **kwds: Any) -> Any:
self.convert_model_info(wt)
self.convert_tokenizer(wt)
self.convert_weights(wt)
def __call__(self, wt:Writer, *args: Any, **kwds: Any) -> Any:
return self.forward(wt, *args, **kwds)
def dump(self, outpath:str):
wt = Writer(outpath=outpath)
# version id
wt.write(int(2))
self.forward(wt=wt)
class ChatglmConverter(BaseConverter):
def get_vocab(self):
tokenizer = self.tokenizer.tokenizer
piece_size = tokenizer.sp_model.piece_size()
vocab = {
tokenizer.sp_model.id_to_piece(i).encode(): float(tokenizer.sp_model.get_score(i)) for i in range(piece_size)
}
return vocab
class BaichuanConverter(BaseConverter):
def get_model_info(self, ):
model_info = super().get_model_info()
if hasattr(self.model, "model") and hasattr(self.model.model, "get_alibi_mask"):
model_info.update({
"use_alibi": "1",
"pre_prompt": "",
"user_role": "<FLM_FIX_TOKEN_" + str(self.model.generation_config.user_token_id) + "> ",
"bot_role": "<FLM_FIX_TOKEN_" + str(self.model.generation_config.assistant_token_id) + ">",
"history_sep": ""
})
return model_info
def get_vocab(self,):
vocab = self.tokenizer.get_vocab()
vocab = {
key.encode(): vocab[key] for key in vocab
}
return vocab
class QwenConverter(BaseConverter):
def get_model_info(self,):
model_info = super().get_model_info()
if model_info["chat_format"] == "chatml":
model_info.update({
"im_end_id": self.tokenizer.im_end_id,
"im_start_id": self.tokenizer.im_start_id
})
return model_info
def get_vocab(self, ):
vocab = self.tokenizer.get_vocab()
vocab = {
key: 1.0 for key in vocab.keys()
}
return vocab
class MossConverter(BaseConverter):
def get_vocab(self, ):
tokenizer = self.tokenizer.tokenizer
vocab = tokenizer.get_vocab()
vocab = {
bytes([tokenizer.byte_decoder.get(c, ord(c)) for c in v]): 1.0
for v in vocab
}
return vocab
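# Illustrative usage (not part of the diff): a converter can also be driven directly;
# `model` and `tokenizer` are transformers objects, the output path is a placeholder.
def _example_dump(model, tokenizer):
    converter = ChatglmConverter(model=model, tokenizer=tokenizer, q_type=QuantType.INT4)
    converter.dump("chatglm-6b-int4.flm")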
import numpy as np
from enum import Enum
from .writer import Writer
class QuantType(Enum):
FP32 = 0
FP16 = 7
INT8 = 3
INT4 = 8
class Quantizer():
quant_bit = {QuantType.FP16: 16, QuantType.INT8: 8, QuantType.INT4: 4}
def __init__(self, quant_type:QuantType, symmetry=True) -> None:
self.quant_type = quant_type
self.q_bit = self.quant_bit[quant_type]
self.up_bound = (2**(self.q_bit-1)) -1
self.low_bound = -(2 ** (self.q_bit-1))
self.symmetry = symmetry
# Narrower range, higher per-value precision; suited to tightly concentrated distributions.
def asymquantize(self, data:np.ndarray):
c_min = np.expand_dims(data.min(axis=-1), -1)
c_max = np.expand_dims(data.max(axis=-1), -1)
c_scale = (c_max - c_min) / (self.up_bound - self.low_bound)
c_zero = np.round(0.0 - c_min / c_scale).clip(0, self.up_bound - self.low_bound)
c_min = -c_scale * c_zero
q_data = (data - c_min)/ c_scale
if self.quant_type == QuantType.FP32:
q_data = data.astype(np.float32)
elif self.quant_type == QuantType.FP16:
q_data = data.astype(np.float16)
elif self.quant_type == QuantType.INT8:
q_data = (q_data + 0.5).clip(0, 255).astype(np.uint8)
elif self.quant_type == QuantType.INT4:
q_data = (q_data + 0.5).astype(np.int8).clip(0, 15).astype(np.uint8)
q_data = q_data[:, 0::2] * 16 + q_data[:, 1::2]
else:
raise NotImplementedError(f"unsupport quant type")
self.c_min = c_min
self.c_max = c_max
self.c_scale = c_scale
self.c_zero = c_zero
self.quant_data = q_data
return q_data
# Wider range, lower per-value precision; suited to more dispersed distributions.
def symquantize(self, data:np.ndarray):
c_min = np.expand_dims(-np.abs(data).max(axis = -1), -1)
c_max = np.expand_dims(np.abs(data).max(axis = -1), -1)
c_scale = c_max / self.up_bound
c_min = c_scale * self.low_bound
q_data = (data - c_min) / c_scale
if self.quant_type == QuantType.FP32:
q_data = data.astype(np.float32)
elif self.quant_type == QuantType.FP16:
q_data = data.astype(np.float16)
elif self.quant_type == QuantType.INT8:
q_data = (q_data + 0.5).clip(1, 255).astype(np.uint8)
elif self.quant_type == QuantType.INT4:
q_data = (q_data + 0.5).astype(np.int8).clip(0, 15).astype(np.uint8)
q_data = q_data[:, 0::2] * 16 + q_data[:, 1::2]
else:
raise NotImplementedError(f"unsupport quant type")
self.c_min = c_min
self.c_max = c_max
self.c_scale = c_scale
self.quant_data = q_data
return q_data
def quantize(self, data:np.ndarray):
if self.symmetry:
return self.symquantize(data)
else:
return self.asymquantize(data)
def dequantize(self, ):
if getattr(self, "c_scale", None) is None:
    raise ValueError("quantize() must be called before dequantize()")
data = self.quant_data * self.c_scale + self.c_min
data = data.astype(np.float32)
return data
def dump(self, wt:Writer):
wt.write(self.quant_type.value)
if self.quant_type in (QuantType.INT4, QuantType.INT8):
wt.write(0)
for i in range(self.c_min.shape[0]):
wt.write(float(self.c_min[i][0]))
wt.write(float(self.c_max[i][0]))
wt.fd.write(self.quant_data.data)
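# Illustrative round trip (not part of the diff) using the Quantizer above; the
# reconstruction error is on the order of one quantization step per value.
def _example_roundtrip():
    w = np.random.randn(4, 64).astype(np.float32)
    quant = Quantizer(QuantType.INT8, symmetry=True)
    q_codes = quant.quantize(w)    # per-row uint8 codes
    w_hat = quant.dequantize()     # float32 approximation of w
    print(np.abs(w_hat - w).max())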
import numpy as np
import struct
from enum import Enum
class QuantType(Enum):
FP32 = 0
FP16 = 7
INT8 = 3
INT4 = 8
def write_int8(fo, v):
c_max = np.expand_dims(np.abs(v).max(axis = -1), -1).clip(0.1, 1e100)
c_scale = c_max / 127.0
v = (v / c_scale + 128.5).clip(1, 255).astype(np.uint8)
fo.write(struct.pack('i', 3))
fo.write(struct.pack('i', 0))
for i in range(c_max.shape[0]):
fo.write(struct.pack('f', -c_max[i][0]))
fo.write(struct.pack('f', c_max[i][0]))
fo.write(v.data)
def write_int4(fo, v):
c_min = np.expand_dims(-np.abs(v).max(axis = -1), -1)
c_max = np.expand_dims(np.abs(v).max(axis = -1), -1)
c_scale = c_max / 7.0
c_min = c_scale * -8.0
v = (v - c_min) / c_scale
v = (v + 0.5).astype(np.int8).clip(0, 15).astype(np.uint8)
v = v[:, 0::2] * 16 + v[:, 1::2]
fo.write(struct.pack('i', 8))
fo.write(struct.pack('i', 0))
for i in range(c_min.shape[0]):
fo.write(struct.pack('f', c_min[i][0]))
fo.write(struct.pack('f', c_max[i][0]))
fo.write(v.data)
class Writer():
def __init__(self, outpath) -> None:
self.fd = open(outpath, 'wb')
def __del__(self, ):
if not self.fd.closed:
self.fd.close()
def write(self, value):
if isinstance(value, int):
self.fd.write(struct.pack('i', value))
elif isinstance(value, float):
self.fd.write(struct.pack('f', value))
elif isinstance(value, str):
self.write_str(value)
elif isinstance(value, bytes):
self.write_bytes(value)
elif isinstance(value, list):
self.write_list(value)
elif isinstance(value, dict):
self.write_dict(value)
elif isinstance(value, np.ndarray):
self.write_tensor(value)
else:
raise NotImplementedError(f"Unsupport data type: {type(value)}")
def write_str(self, s):
self.write(len(s))
self.fd.write(s.encode())
def write_bytes(self, s):
self.write(len(s))
for c in s: self.write(int(c))
def write_list(self, data):
self.write(len(data))
for d in data: self.write(d)
def write_dict(self, data):
self.write(len(data))
for key in data:
self.write_str(key)
self.write(data[key])
def write_tensor(self, data, data_type:QuantType=QuantType.FP32):
self.write(list(data.shape))
if data_type == QuantType.INT4:
write_int4(self.fd, data)
elif data_type == QuantType.INT8:
write_int8(self.fd, data)
else:
self.write(int(data_type.value))
self.fd.write(data.data)
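# Illustrative reader (not part of the diff) for the header that BaseConverter.dump()
# emits through this Writer: a version int, then the model-info dict written as
# length-prefixed strings. The file path is a placeholder.
def _example_read_header(path="model.flm"):
    def read_int(fd):
        return struct.unpack('i', fd.read(4))[0]
    def read_str(fd):
        return fd.read(read_int(fd)).decode(errors="replace")
    with open(path, "rb") as fd:
        version = read_int(fd)  # BaseConverter.dump() writes 2
        n_info = read_int(fd)   # number of model-info entries
        return version, {read_str(fd): read_str(fd) for _ in range(n_info)}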
rm -rf build/ && rm -rf dist/
python3 setup.py sdist bdist_wheel
pip install dist/*.whl --force-reinstall
# python3 examples/test_ops.py # coredump when run with cuda backend
\ No newline at end of file
import glob
import os.path
from setuptools import setup, Extension
from setuptools import find_packages
# reference: https://github.com/pybind/cmake_example
import os
import re
import shutil
import subprocess
import sys
from pathlib import Path
import glob
import platform
import argparse
parser = argparse.ArgumentParser(description='build pyfastllm wheel')
parser.add_argument('--cuda', dest='cuda', action='store_true', default=False,
help='build with cuda support')
args, unknown = parser.parse_known_args()
sys.argv = [sys.argv[0]] + unknown
__VERSION__ = "'0.1.3'"
BASE_DIR = os.path.dirname(os.path.dirname(__file__))
ext_modules = []
try:
from pybind11.setup_helpers import Pybind11Extension
source_files = glob.glob(os.path.join(BASE_DIR, "src/**/*.cpp"), recursive=True)
source_files = [f for f in source_files if not f.endswith("cudadevice.cpp")]  # exclude the CUDA-only source from the CPU build
extra_compile_args = ["-w", "-DPY_API"]
# If any libraries are used, e.g. libabc.so
include_dirs = [os.path.join(BASE_DIR, "include/"), os.path.join(BASE_DIR, "include/devices/cpu/"), os.path.join(BASE_DIR, "include/models"), os.path.join(BASE_DIR, "include/utils")]
library_dirs = []
# (optional) if the library is not in the dir like `/usr/lib/`
# either to add its dir to `runtime_library_dirs` or to the env variable "LD_LIBRARY_PATH"
# MUST be absolute path
runtime_library_dirs = []
libraries = []
if args.cuda:
assert False, "Not Implement Yet!"
extra_compile_args.append("-DUSE_CUDA -Wl,-rpath,$ORIGIN/")
source_files.append(os.path.join(BASE_DIR, "src/devices/cuda/cudadevice.cpp"))
include_dirs.append(os.path.join(BASE_DIR, "include/devices/cuda/"))
library_dirs.append("/usr/local/cuda/lib64/")
library_dirs.append(os.path.join(BASE_DIR, "pyfastllm/"))
libraries.append("fastllm_cuda")
ext_modules = [
Pybind11Extension(
"pyfastllm",
source_files,
define_macros=[('VERSION_INFO', __VERSION__)],
include_dirs=include_dirs,
library_dirs=library_dirs,
runtime_library_dirs=runtime_library_dirs,
libraries=libraries,
extra_compile_args=extra_compile_args,
cxx_std=17,
language='c++'
),
]
except Exception as e:
print(f"some errors happened: ")
print(e)
sys.exit(1)
cmdclass = {}
from setuptools import Extension, find_packages, setup
from setuptools.command.build_ext import build_ext
# Convert distutils Windows platform specifiers to CMake -A arguments
PLAT_TO_CMAKE = {
"win32": "Win32",
"win-amd64": "x64",
"win-arm32": "ARM",
"win-arm64": "ARM64",
}
# A CMakeExtension needs a sourcedir instead of a file list.
# The name must be the _single_ output extension from the CMake build.
# If you need multiple extensions, see scikit-build.
class CMakeExtension(Extension):
def __init__(self, name: str, sourcedir: str = "") -> None:
super().__init__(name, sources=[])
self.sourcedir = os.fspath(Path(sourcedir).resolve())
class CMakeBuild(build_ext):
def build_extension(self, ext: CMakeExtension) -> None:
# Must be in this form due to bug in .resolve() only fixed in Python 3.10+
ext_fullpath = Path.cwd() / self.get_ext_fullpath(ext.name)
extdir = ext_fullpath.parent.resolve()
# Using this requires trailing slash for auto-detection & inclusion of
# auxiliary "native" libs
debug = int(os.environ.get("DEBUG", 0)) if self.debug is None else self.debug
cfg = "Debug" if debug else "Release"
# CMake lets you override the generator - we need to check this.
# Can be set with Conda-Build, for example.
cmake_generator = os.environ.get("CMAKE_GENERATOR", "")
use_cuda = os.environ.get("USE_CUDA", "ON")
# Set Python_EXECUTABLE instead if you use PYBIND11_FINDPYTHON
# EXAMPLE_VERSION_INFO shows you how to pass a value into the C++ code
# from Python.
cmake_args = [
f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}{os.sep}",
f"-DPYTHON_EXECUTABLE={sys.executable}",
f"-DCMAKE_BUILD_TYPE={cfg}", # not used on MSVC, but no harm
f"-DPY_API=ON",
f"-DUSE_CUDA={use_cuda}",
]
build_args = []
# Adding CMake arguments set as environment variable
# (needed e.g. to build for ARM OSx on conda-forge)
if "CMAKE_ARGS" in os.environ:
cmake_args += [item for item in os.environ["CMAKE_ARGS"].split(" ") if item]
if self.compiler.compiler_type != "msvc":
# Using Ninja-build since it a) is available as a wheel and b)
# multithreads automatically. MSVC would require all variables be
# exported for Ninja to pick it up, which is a little tricky to do.
# Users can override the generator with CMAKE_GENERATOR in CMake
# 3.15+.
if not cmake_generator or cmake_generator == "Ninja":
try:
import ninja
ninja_executable_path = Path(ninja.BIN_DIR) / "ninja"
cmake_args += [
"-GNinja",
f"-DCMAKE_MAKE_PROGRAM:FILEPATH={ninja_executable_path}",
]
except ImportError:
pass
else:
# Single config generators are handled "normally"
single_config = any(x in cmake_generator for x in {"NMake", "Ninja"})
# CMake allows an arch-in-generator style for backward compatibility
contains_arch = any(x in cmake_generator for x in {"ARM", "Win64"})
# Specify the arch if using MSVC generator, but only if it doesn't
# contain a backward-compatibility arch spec already in the
# generator name.
if not single_config and not contains_arch:
cmake_args += ["-A", PLAT_TO_CMAKE[self.plat_name]]
# Multi-config generators have a different way to specify configs
if not single_config:
cmake_args += [f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{cfg.upper()}={extdir}"]
build_args += ["--config", cfg]
if sys.platform.startswith("darwin"):
# Cross-compile support for macOS - respect ARCHFLAGS if set
archs = re.findall(r"-arch (\S+)", os.environ.get("ARCHFLAGS", ""))
if archs:
cmake_args += ["-DCMAKE_OSX_ARCHITECTURES={}".format(";".join(archs))]
# Set CMAKE_BUILD_PARALLEL_LEVEL to control the parallel build level
# across all generators.
# if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ:
# # self.parallel is a Python 3 only way to set parallel jobs by hand
# # using -j in the build_ext call, not supported by pip or PyPA-build.
# if hasattr(self, "parallel") and self.parallel:
# # CMake 3.12+ only.
# build_args += [f"-j{self.parallel}"]
# Compile in parallel by default
build_args += [f"-j"]
build_temp = Path(self.build_temp) / ext.name
if not build_temp.exists():
build_temp.mkdir(parents=True)
subprocess.run(["cmake", ext.sourcedir, *cmake_args], cwd=build_temp, check=True)
subprocess.run(["cmake", "--build", ".", *build_args], cwd=build_temp, check=True)
HERE = Path(__file__).resolve().parent
VERSION = re.search(r'__version__ = "(.*?)"', (HERE / "fastllm/__init__.py").read_text(encoding="utf-8")).group(1)
setup(
name='fastllm',
version=eval(__VERSION__),
version=VERSION,
description='python api for fastllm',
author='wildkid1024',
author_email='wildkid1024@outlook.com',
......@@ -77,14 +140,13 @@ setup(
maintainer_email='',
url='',
long_description='',
ext_modules=ext_modules,
ext_modules=[CMakeExtension(name="pyfasltllm", sourcedir="..")],
cmdclass={"build_ext": CMakeBuild},
packages = find_packages(),
cmdclass=cmdclass,
setup_requires=["pybind11"],
setup_requires=[""],
install_requires=[""],
python_requires='>=3.6',
# data_files = [('', ['libfastllm_cuda.so'])],
include_package_data=False,
include_package_data=True,
entry_points={
'console_scripts':[
'fastllm-convert = fastllm.convert:main'
......@@ -98,4 +160,4 @@ setup(
'LLM::Moss',
'LLM::LLama'
]
)
\ No newline at end of file
)
pip uninstall -y fastllm
rm -rf fastllm/pyfastllm.cpython-310-x86_64-linux-gnu.so
rm -rf build/
python3 build_libs.py
python3 setup.py sdist bdist_wheel
pip install dist/fastllm-0.1.4-py3-none-any.whl
python3 demo/test_ops.py
\ No newline at end of file
......@@ -275,7 +275,6 @@ namespace fastllm {
Data &output = *(datas.find("output")->second);
int group = intParams.find("group") != intParams.end() ? intParams.find("group")->second : 1;
float scale = floatParams.find("scale") != floatParams.end() ? floatParams.find("scale")->second : 1.0;
output.Allocate();
int q0 = q.dims[0], q1 = q.dims[1], q2 = q.dims[2], k0 = k.dims[0], k1 = k.dims[1], v2 = v.dims[2];
float *qd = (float*)q.cpuData;
......@@ -283,13 +282,16 @@ namespace fastllm {
float *vd = (float*)v.cpuData;
float *maskd = (datas.find("mask")->second && mask.dims.size() > 0) ? (float*)mask.cpuData : nullptr;
float *od = (float*)output.cpuData;
int batch = (maskd != nullptr && mask.dims.size() == 3) ? mask.dims[0] : 1;
batch = intParams.find("mask___batch") != intParams.end() ? intParams.find("mask___batch")->second : batch;
int maskStride = (maskd != nullptr) ? (mask.dims.size() == 3 ? mask.strides[0] : mask.Count(0)) : 0;
std::fill(od, od + output.Count(0), 0.0f);
auto pool = GetPool();
std::vector<std::future<void> > futures;
for (int o = 0; o < q0; o++) {
futures.push_back(pool->Submit(SingleAttention,
qd + o * q.strides[0], kd + (o / group) * k.strides[0], vd + (o / group) * v.strides[0],
maskd ? (maskd + o / (q0 / mask.dims[0])) : maskd, od + o * output.strides[0], scale,
maskd + (o / (q0 / batch)) * maskStride, od + o * output.strides[0], scale,
q1, q2, k1, v2));
}
for (int o = 0; o < futures.size(); o++) {
......
......@@ -169,6 +169,12 @@ namespace fastllm {
FastllmCudaLayerNorm(input, gamma, beta, output, axis);
}
// CudaLinearOp::CudaLinearOp() {
// printf("CudaLinearOp\n");
// const int numStreams = 4; // assume four CUDA streams
// streams_handle = FastllmCreateStreams(numStreams);
// }
void CudaLinearOp::Reshape(const std::string &opType, const fastllm::DataDict &datas,
const fastllm::FloatDict &floatParams, const fastllm::IntDict &intParams) {
Data &input = *(datas.find("input")->second);
......@@ -207,6 +213,7 @@ namespace fastllm {
FastllmCudaMatMulFloat32(input, weight, bias, output, n, m, k);
} else if (weight.dataType == DataType::FLOAT16) {
FastllmCudaMatMulFloat16(input, weight, bias, output, n, m, k);
// FastllmCudaMatMulFloat16(input, weight, bias, output, n, m, k, streams_handle);
} else if (weight.dataType == DataType::INT8) {
FastllmCudaMatMulFloatInt8(input, weight, bias, output, n, m, k);
} else if (weight.dataType == DataType::INT4) {
......
......@@ -34,6 +34,11 @@
#include "fastllm-cuda.cuh"
#endif
#ifdef PY_API
#include <pybind11/embed.h>
namespace py = pybind11;
#endif
namespace fastllm {
std::map <std::string, int> defaultDeviceMap;
Executor defaultExecutor;
......@@ -41,7 +46,7 @@ namespace fastllm {
static std::mutex globalLocker;
static int threads = 4;
static ThreadPool *fastllmThreadPool = new ThreadPool(threads);
static ThreadPool *fastllmThreadPool = nullptr;
static bool lowMemMode = false;
static bool kvCacheInCPU = false;
......@@ -74,6 +79,9 @@ namespace fastllm {
}
void SetThreads(int t) {
#ifdef PY_API
py::gil_scoped_release release;
#endif
globalLocker.lock();
threads = t;
if (fastllmThreadPool != nullptr) {
......@@ -82,6 +90,9 @@ namespace fastllm {
}
fastllmThreadPool = new ThreadPool(t);
globalLocker.unlock();
#ifdef PY_API
py::gil_scoped_acquire acquire;
#endif
}
void SetLowMemMode(bool m) {
......@@ -101,6 +112,8 @@ namespace fastllm {
}
ThreadPool *GetPool() {
if (fastllmThreadPool == nullptr)
SetThreads(threads);
return fastllmThreadPool;
}
#ifdef USE_MMAP
......@@ -247,6 +260,7 @@ namespace fastllm {
}
Data::Data(fastllm::DataType type, const std::vector<int> &dims, const std::vector<float> &data) : Data::Data(type, dims) {
// std::cout<<"调用数值构造"<<std::endl;
this->Allocate();
if (type == DataType::FLOAT32) {
std::memcpy(this->cpuData, data.data(), this->GetBytes());
......@@ -258,6 +272,7 @@ namespace fastllm {
}
void Data::CopyFrom(const Data &ori) {
// std::cout<<"调用拷贝构造"<<std::endl;
if (ori.dims != this->dims || this->cpuData == nullptr) {
if (ori.dims.size() == 0) {
delete[] this->cpuData;
......@@ -515,6 +530,10 @@ namespace fastllm {
printf("\n");
}
std::vector<int> Data::Shape() const{
return this->dims;
}
void Data::Print() const {
printf("shape: ");
for (int i : this->dims) {
......@@ -538,7 +557,7 @@ namespace fastllm {
}
printf("\n");
*/
// // To print data that lives in CUDA memory, move it to the CPU first. xzhou 20230728
// To print data that lives in CUDA memory, move it to the CPU first. xzhou 20230728
// if (dataDevice == DataDevice::CUDA) {
// ToDevice(DataDevice::CPU);
// }
......@@ -682,11 +701,20 @@ namespace fastllm {
#ifdef USE_CUDA
if (this->dataDevice == DataDevice::CPU) {
if (device == DataDevice::CUDA) {
uint8_t *cpuData = this->cpuData;
#ifdef USE_MMAP
cpuData = new uint8_t[expansionBytes];
memcpy(cpuData, this->cpuData, expansionBytes);
#endif
FastllmCudaSetDevice(deviceIds.size() == 0 ? 0 : deviceIds[0]);
this->cudaData = FastllmCudaMalloc(expansionBytes);
FastllmCudaCopyFromHostToDevice(this->cudaData, this->cpuData, expansionBytes);
FastllmCudaCopyFromHostToDevice(this->cudaData, cpuData, expansionBytes);
#ifdef USE_MMAP
delete[] cpuData;
#else
delete[] this->cpuData;
this->cpuData = nullptr;
#endif
}
} else if (this->dataDevice == DataDevice::CUDA) {
if (device == DataDevice::CPU) {
......@@ -695,16 +723,16 @@ namespace fastllm {
FastllmCudaFree(this->cudaData);
this->cudaData = nullptr;
} else if (device == DataDevice::CUDA) {
FastllmCudaSetDevice(this->dataDeviceIds.size() == 0 ? 0 : this->dataDeviceIds[0]);
uint8_t *cpuData = new uint8_t[expansionBytes];
FastllmCudaCopyFromDeviceToHost(cpuData, this->cudaData, expansionBytes);
FastllmCudaFree(this->cudaData);
int sourceDevice = this->dataDeviceIds.size() == 0 ? 0 : this->dataDeviceIds[0];
int destDevice = deviceIds.size() == 0 ? 0 : deviceIds[0];
FastllmCudaSetDevice(destDevice);
void *newCudaData = FastllmCudaMalloc(expansionBytes);
FastllmCudaSetDevice(deviceIds.size() == 0 ? 0 : deviceIds[0]);
this->cudaData = FastllmCudaMalloc(expansionBytes);
FastllmCudaCopyFromHostToDevice(this->cudaData, cpuData, expansionBytes);
delete[] cpuData;
FastllmCudaMemcpyBetweenDevices(destDevice, newCudaData, sourceDevice, this->cudaData, expansionBytes);
FastllmCudaSetDevice(sourceDevice);
FastllmCudaFree(this->cudaData);
this->cudaData = newCudaData;
FastllmCudaSetDevice(destDevice);
}
}
#endif
......@@ -790,6 +818,7 @@ namespace fastllm {
now->tokenId = tokenId;
now->score = score;
tokenToStringDict[tokenId] = s;
tokenToScoreDict[tokenId] = score;
stringToTokenDict[s] = tokenId;
}
......@@ -835,9 +864,10 @@ namespace fastllm {
}
for (int i = 0; i < ori.size(); i++) {
if (ori[i] == ' ') {
if (i != 0 && ori[i - 1] != ' ') {
s += blank;
}
// if (i != 0 && ori[i - 1] != ' ') {
// s += blank;
// }
s += blank;
} else {
s += ori[i];
}
......@@ -931,6 +961,132 @@ namespace fastllm {
}
}
return Data (DataType::FLOAT32, {1, (int)v.size()}, v);
} else if (this->type == TokenizerType::GLM) {
const std::map<std::string, int> specialTokens = {{"[MASK]", 50003}, {"[sMASK]", 50008}, {"[gMASK]", 50009}};
std::string blank = "";
blank += 226, blank += 150, blank += 129;
std::string s = blank;
for (int i = 0; i < ori.size(); i++) {
if (ori[i] == ' ') {
if (i != 0 && ori[i - 1] != ' ') {
s += blank;
}
} else {
s += ori[i];
}
}
std::vector<float> v;
int findPos = 0;
while (findPos < s.length()) {
    int nextSpecialToken = -1;
    int nextSpecialTokenPos = -1;
    int nextSpecialTokenLen = -1;
    for (auto p : specialTokens) {
        int ind = s.find(p.first, findPos);
        if (ind >= 0 && (nextSpecialTokenPos < 0 || ind < nextSpecialTokenPos)) {
            nextSpecialTokenPos = ind;
            nextSpecialToken = p.second;
            nextSpecialTokenLen = p.first.length();
        }
    }
    std::string subStr;
    if (nextSpecialTokenPos < 0) {
        subStr = s.substr(findPos);
        findPos = s.length();
    } else {
        subStr = s.substr(findPos, nextSpecialTokenPos - findPos);
        findPos = nextSpecialTokenPos + nextSpecialTokenLen;
    }
    if (subStr.length() > 0) {
#ifdef USE_SENTENCEPIECE
        if (spProcessor != nullptr) {
            std::vector<int> ids;
            spProcessor->Encode(subStr, &ids);
            for (int id : ids) {
                v.push_back(id);
            }
        } else {
#endif
std::vector<Symbol> symbols;
for (int i = 0; i < subStr.size(); i++) {
int tokenId = -999999, pos = i - 1;
TrieNode *now = this->root;
for (int j = i; j < subStr.size(); j++) {
if (now->next.find(subStr[j]) != now->next.end()) {
now = now->next[subStr[j]];
if (now->tokenId != -999999) {
tokenId = now->tokenId;
pos = j;
break;
}
} else {
break;
}
}
if (pos >= i) {
symbols.push_back(Symbol(now, (char *) subStr.data(), i, pos - i + 1, (int) symbols.size() - 1,
(int) symbols.size() + 1, -999999));
i = pos;
} else {
symbols.push_back(Symbol(nullptr, (char *) subStr.data(), i, 0, (int) symbols.size() - 1,
(int) symbols.size() + 1, -999999));
}
}
symbols.back().next = -1;
std::priority_queue<SymbolPairs> workQueue;
for (int i = 1; i < symbols.size(); i++) {
TryMergePairs(symbols, i - 1, i, workQueue);
}
while (!workQueue.empty()) {
auto top = workQueue.top();
workQueue.pop();
if (symbols[top.l].len == 0 || symbols[top.r].len == 0 ||
symbols[top.l].len + symbols[top.r].len != top.size) {
continue;
}
for (int i = symbols[top.r].pos; i < symbols[top.r].pos + symbols[top.r].len; i++) {
symbols[top.l].node = symbols[top.l].node->next[symbols[top.r].s[i]];
}
symbols[top.l].len += symbols[top.r].len;
symbols[top.r].len = 0;
symbols[top.l].next = symbols[top.r].next;
if (symbols[top.r].next >= 0) {
symbols[symbols[top.r].next].prev = top.l;
}
TryMergePairs(symbols, symbols[top.l].prev, top.l, workQueue);
TryMergePairs(symbols, top.l, symbols[top.l].next, workQueue);
}
for (int i = 0; i < symbols.size(); i++) {
if (symbols[i].len > 0) {
v.push_back(symbols[i].node->tokenId);
} else if (symbols[i].node == nullptr) {
if (symbols[i].fixId != -999999) {
v.push_back(symbols[i].fixId);
} else {
// unrecognized character
uint8_t c = (uint8_t) (symbols[i].s[symbols[i].pos]);
std::string now = "<0x00>";
now[3] = (c / 16 > 9 ? ('A' + c / 16 - 10) : ('0' + c / 16));
now[4] = (c % 16 > 9 ? ('A' + c % 16 - 10) : ('0' + c % 16));
if (stringToTokenDict.find(now) != stringToTokenDict.end()) {
v.push_back(stringToTokenDict[now]);
}
}
}
}
#ifdef USE_SENTENCEPIECE
}
#endif
}
if(nextSpecialTokenPos>=0){
v.push_back(nextSpecialToken);
}
}
return Data (DataType::FLOAT32, {1, (int)v.size()}, v);
} else if (this->type == TokenizerType::QWEN) {
std::map<std::string, int> specialTokens = {{"<|im_start|>", 151644}, {"<|im_end|>", 151645}, {"<|endoftext|>", 151643}};
......@@ -1246,7 +1402,8 @@ namespace fastllm {
}
} else {
#ifdef USE_MMAP
weight[name].set_file(mapped_file);
weight[name].SetMapFile(mapped_file);
weight[name].expansionBytes = (weight[name].Count(0) * weight[name].unitSize - 1) / weight[name].unitSizeDiv + 1;
#else
weight[name].Allocate();
#endif
......@@ -1594,6 +1751,21 @@ namespace fastllm {
}
}
void WeightMap::ReleaseWeight() {
for (auto &w : this->weight) {
#ifndef USE_MMAP
delete[] w.second.cpuData;
w.second.cpuData = nullptr;
#endif
#ifdef USE_CUDA
if (w.second.cudaData != nullptr) {
FastllmCudaDirectFree(w.second.cudaData);
w.second.cudaData = nullptr;
}
#endif
}
}
Data &WeightMap::operator[](const std::string &key) {
return weight[key];
}
......@@ -1958,4 +2130,4 @@ namespace fastllm {
std::map <std::string, int> GetDeviceMap() {
return defaultDeviceMap;
}
}
\ No newline at end of file
}
......@@ -7,6 +7,7 @@
#include "moss.h"
#include "llama.h"
#include "qwen.h"
#include "glm.h"
namespace fastllm {
void basellm::LoadFromFile(const std::string &fileName) {
......@@ -16,8 +17,12 @@ namespace fastllm {
void basellm::InitParams() {
if (this->weight.dicts.find("bos_token_id") != this->weight.dicts.end()) {
this->bos_token_id = atoi(this->weight.dicts["bos_token_id"].c_str());
this->eos_token_id = atoi(this->weight.dicts["eos_token_id"].c_str());
if(this->weight.dicts["bos_token_id"]!="None"){
this->bos_token_id = atoi(this->weight.dicts["bos_token_id"].c_str());
}
if(this->weight.dicts["eos_token_id"]!="None"){
this->eos_token_id = atoi(this->weight.dicts["eos_token_id"].c_str());
}
}
if (this->weight.dicts.find("im_start_id") != this->weight.dicts.end()) {
this->bos_token_id = atoi(this->weight.dicts["im_start_id"].c_str());
......@@ -25,6 +30,8 @@ namespace fastllm {
}
if (this->weight.dicts.find("num_hidden_layers") != this->weight.dicts.end()) {
block_cnt = atoi(this->weight.dicts["num_hidden_layers"].c_str());
}else if (this->weight.dicts.find("num_layers") != this->weight.dicts.end()) {
block_cnt = atoi(this->weight.dicts["num_layers"].c_str());
}
if (this->weight.dicts.find("hidden_size") != this->weight.dicts.end()) {
embed_dim = atoi(this->weight.dicts["hidden_size"].c_str());
......@@ -81,6 +88,11 @@ namespace fastllm {
} else if (modelType == "qwen") {
model = (basellm *) (new QWenModel());
model->weight.tokenizer.type = Tokenizer::TokenizerType::QWEN;
} else if (modelType == "glm") {
model = (basellm*)(new GLMModel());
} else if (modelType == "chatglm3") {
model = (basellm*)(new ChatGLMModel());
model->model_type = "chatglm3";
} else {
ErrorInFastLLM("Unkown model type: " + modelType);
}
......
......@@ -61,8 +61,8 @@ namespace fastllm {
#endif
std::string prompt = input;
#ifdef PY_API
size_t pos = input.find_last_of("time_stamp:");
prompt = (generationConfig.enable_hash_id && pos != std::string::npos) ? input.substr(0, pos - 10) : input;
size_t pos = input.rfind("time_stamp:");
prompt = (generationConfig.enable_hash_id && pos != -1) ? input.substr(0, pos) : input;
size_t hash_id = std::hash<std::string>{}(input);
#endif
Data inputIds, attentionMask, positionIds;
......@@ -151,8 +151,8 @@ namespace fastllm {
size_t hash_id = std::hash<std::string>{}(_input);
hash_ids.push_back(hash_id);
size_t pos = _input.find_last_of("time_stamp:");
std::string prompt = (generationConfig.enable_hash_id && pos != std::string::npos) ? _input.substr(0, pos - 10) : _input;
size_t pos = _input.rfind("time_stamp:");
std::string prompt = (generationConfig.enable_hash_id && pos != -1) ? _input.substr(0, pos) : _input;
prompts.push_back(prompt);
}
#else
......@@ -208,6 +208,11 @@ namespace fastllm {
inputTokens[i] = std::vector <float> {(float)ret[i]};
if (ret[i] == eos_token_id) {
isEnding[i] = true;
} else {
auto itStopTk = generationConfig.stop_token_ids.find(ret[i]);
if (itStopTk != generationConfig.stop_token_ids.end()) {
isEnding[i] = true;
}
}
if (isEnding[i]) {
curStrings.push_back("");
......@@ -284,6 +289,127 @@ namespace fastllm {
#endif
}
void basellm::ResponseBatch(std::vector<std::vector<float>> &inputTokens, std::vector<std::string> &outputs,
RuntimeResultBatch retCb, const fastllm::GenerationConfig &generationConfig) {
#ifdef USE_CUDA
FastllmCudaClearBigBuffer();
#endif
// 1. first
Data inputIds, attentionMask, positionIds;
int batch = inputTokens.size();
outputs.clear();
outputs.resize(batch, "");
std::vector <std::pair <Data, Data> > pastKeyValues;
for (int i = 0; i < block_cnt; i++) {
pastKeyValues.push_back(std::make_pair(Data(DataType::FLOAT32),
Data(DataType::FLOAT32)));
}
std::vector <std::map <std::string, int> > params;
params.resize(batch);
for (int i = 0; i < batch; i++) {
params[i]["promptLen"] = (int)inputTokens[i].size();
}
params[0]["index"] = 0;
int index = 0;
LastTokensManager tokensManager (batch, generationConfig.last_n);
std::vector <bool> isEnding = std::vector <bool> (batch, false);
FillLLMInputsBatch(inputTokens, params, inputIds, attentionMask, positionIds);
while (true) {
auto st = std::chrono::system_clock::now();
std::vector <int> ret = ForwardBatch(batch, inputIds, attentionMask, positionIds, pastKeyValues,
generationConfig, tokensManager);
for (int i = 0; i < batch; i++) {
tokensManager.units[i].Push(ret[i]);
}
std::vector <float> fret;
std::vector <float> results;
int endingCount = 0;
std::vector <std::string> curStrings;
for (int i = 0; i < batch; i++) {
fret.push_back(ret[i]);
inputTokens[i] = std::vector <float> {(float)ret[i]};
if (ret[i] == eos_token_id) {
isEnding[i] = true;
}
if (isEnding[i]) {
curStrings.push_back("");
endingCount++;
continue;
}
results.push_back(ret[i]);
std::string curString = weight.tokenizer.Decode(
Data(DataType::FLOAT32, {(int) results.size()}, results)).c_str();
outputs[i] += curString;
curStrings.push_back(curString);
results.clear();
}
if (endingCount == batch) {
break;
}
if (retCb)
#ifdef PY_API
{
if (generationConfig.enable_hash_id) {
std::vector<pybind11::bytes> rtnStrings;
for (size_t i=0; i<batch; i++){
std::stringstream ss;
ss << curStrings[i] << "hash_id:" << hash_ids[i];
rtnStrings.push_back(pybind11::bytes(ss.str()));
}
retCb(index, rtnStrings);
} else {
std::vector<pybind11::bytes> rtnStrings;
for (size_t i=0; i<batch; i++){
std::stringstream ss;
ss << curStrings[i];
rtnStrings.push_back(pybind11::bytes(ss.str()));
}
retCb(index, rtnStrings);
}
}
#else
retCb(index, curStrings);
#endif
index++;
params[0]["index"] = index;
FillLLMInputsBatch(inputTokens, params, inputIds, attentionMask, positionIds);
// printf("len = %d, spend %f s.\n", len, GetSpan(st, std::chrono::system_clock::now()));
if (index == generationConfig.output_token_limit) {
break;
}
}
if (retCb)
#ifdef PY_API
{
if (generationConfig.enable_hash_id) {
std::vector<pybind11::bytes> rtnStrings;
for (size_t i=0; i<batch; i++){
std::stringstream ss;
ss << outputs[i] << "hash_id:" << hash_ids[i];
rtnStrings.push_back(pybind11::bytes(ss.str()));
}
retCb(-1, rtnStrings);
} else {
std::vector<pybind11::bytes> rtnStrings;
for (size_t i=0; i<batch; i++){
std::stringstream ss;
ss << outputs[i];
rtnStrings.push_back(pybind11::bytes(ss.str()));
}
retCb(-1, rtnStrings);
}
}
#else
retCb(-1, outputs);
#endif
}
std::vector<int> basellm::ForwardBatch(int batch, const fastllm::Data &inputIds, const fastllm::Data &attentionMask,
const fastllm::Data &positionIds,
std::vector<std::pair<Data, Data>> &pastKeyValues,
......@@ -464,6 +590,12 @@ printf("tot = %d\n", tot);
if (curRet == model->eos_token_id) {
it.second->isEnding = true;
} else {
auto itStopTk = it.second->generationConfig.stop_token_ids.find(curRet);
if (itStopTk != it.second->generationConfig.stop_token_ids.end()) {
it.second->isEnding = true;
}
}
if (it.second->isEnding == false) {
it.second->currentTokens = std::vector<int>{curRet};
it.second->resultTokenQueue.push(curRet);
it.second->tokens.Push(curRet);
......@@ -484,6 +616,13 @@ printf("tot = %d\n", tot);
model->dictLocker.unlock();
MySleep(0);
// If constant CPU usage is a concern, replace the line above with the code below:
/*if (seqLens.size() > 0) {
MySleep(0);
}
else{
std::this_thread::sleep_for(std::chrono::milliseconds(10));
}*/
}
}, this);
}
......@@ -523,6 +662,8 @@ printf("tot = %d\n", tot);
}
dictLocker.unlock();
MySleep(0);
// If constant CPU usage is a concern, replace the line above with the code below:
// std::this_thread::sleep_for(std::chrono::milliseconds(10));
dictLocker.lock();
}
}
......@@ -555,6 +696,8 @@ printf("tot = %d\n", tot);
}
dictLocker.unlock();
MySleep(0);
// If constant CPU usage is a concern, replace the line above with the code below:
// std::this_thread::sleep_for(std::chrono::milliseconds(10));
dictLocker.lock();
}
}
......