Commit 56215723 authored by zhouxiang's avatar zhouxiang

1. Sync with the latest upstream version; 2. Add a batch inference interface; 3. Fix a memory leak; 4. Fix choppy streaming output for the llama-family models

parent 44be91d3
......@@ -4,39 +4,9 @@ import sys
import struct
import numpy as np
import argparse
from .utils import convert
HF_INSTALLED = False
try:
import torch
from transformers import AutoTokenizer, AutoModel # chatglm
from transformers import LlamaTokenizer, LlamaForCausalLM # alpaca
from transformers import AutoModelForCausalLM, AutoTokenizer # baichuan, moss
from peft import PeftModel
HF_INSTALLED = True
except Exception as e:
logging.error("Make sure that you installed transformers and peft!!!")
sys.exit(1)
MODEL_DICT = {
"alpaca":{
"tokenizer": "minlik/chinese-alpaca-33b-merged",
"model": "minlik/chinese-alpaca-33b-merged"
},
"baichuan7B":{
"model": "baichuan-inc/baichuan-7B",
"tokenizer": "baichuan-inc/baichuan-7B",
"peft": "hiyouga/baichuan-7b-sft",
},
"chatglm6B":{
"tokenizer": "THUDM/chatglm-6b",
"model": "THUDM/chatglm-6b"
},
"moss":{
"model": "fnlp/moss-moon-003-sft",
"tokenizer": "fnlp/moss-moon-003-sft",
}
}
from .utils import convert
from .utils.converter import QuantType
def parse_args():
# -p: local model path or Hugging Face model id
......@@ -51,67 +21,24 @@ def parse_args():
help='lora model path')
parser.add_argument('-m', dest='model', default='chatglm6B',
help='model name with(alpaca, baichuan7B, chatglm6B, moss)')
parser.add_argument('-q', dest='qbit', type=int,
parser.add_argument('-q', dest='q_bit', type=int,
help='model quantization bit')
args = parser.parse_args()
return args
def alpaca(model_path):
tokenizer = LlamaTokenizer.from_pretrained(model_path)
model = LlamaForCausalLM.from_pretrained(model_path).float()
return model, tokenizer
def baichuan7B(model_path, peft_path):
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", trust_remote_code=True)
model = PeftModel.from_pretrained(model, peft_path).float()
layers = model.model.model.layers
for i in range(len(layers)):
layers[i].self_attn.W_pack.weight += torch.mm(layers[i].self_attn.W_pack.lora_B.default.weight, layers[i].self_attn.W_pack.lora_A.default.weight) * layers[i].self_attn.W_pack.scaling["default"]
return model, tokenizer
def chatglm6B(model_path, ):
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(model_path, trust_remote_code=True).float()
model = model.eval()
return model, tokenizer
def moss(model_path, ):
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True).float()
model = model.eval()
return model, tokenizer
def main(args=None):
assert HF_INSTALLED, "Make sure that you installed transformers and peft before convert!!!"
if not args:
args = parse_args()
if args.model not in MODEL_DICT:
assert f"Not Support {args.model} Yet!!!"
model_args = {}
model_args["model_path"] = MODEL_DICT[args.model].get("model")
if MODEL_DICT[args.model].has_key("peft"):
model_args["peft_path"] = MODEL_DICT[args.model].get("peft")
if args.model_path:
model_args["model_path"] = args.model_path[0]
if len(args.model_path) > 2:
model_args["peft_path"] = args.model_path[2]
model, tokenizer = globals().get(args.model)(**model_args)
export_path = args.export_path or f"{args.model}-fp32.bin"
convert(export_path, model.model, tokenizer)
if not args: args = parse_args()
if args.qbit:
import pyfastllm as fastllm
export_name, export_ext = export_path.split('.')
q_export_path = export_name + f"-q{args.qbit}." + export_ext
flm_model = fastllm.create_llm(export_path)
flm_model.save_lowbit_model(q_export_path, args.qbit)
quant_type_to_qbit = {
QuantType.FP32: 32,
QuantType.FP16: 16,
QuantType.INT8: 8,
QuantType.INT4: 4,
}
qbit_to_quant_type = {v: k for k, v in quant_type_to_qbit.items()}
q_type = qbit_to_quant_type[args.q_bit]
convert(args.model_path, args.export_path, q_type=q_type)
if __name__ == "__main__":
args = parse_args()
......
import pyfastllm
def embedding(data: pyfastllm.Tensor, ):
# some check
return pyfastllm.embedding(data, )
def rms_norm(input:pyfastllm.Tensor, weight: pyfastllm.Tensor, eps: float, output: pyfastllm.Tensor=None):
output = pyfastllm.rms_norm(input, weight, eps)
return output
def layer_norm(input: pyfastllm.Tensor,
gamma: pyfastllm.Tensor,
beta: pyfastllm.Tensor,
axis:int=-1 ):
output = pyfastllm.layer_norm(input, gamma, beta,axis)
return output
def linear(input: pyfastllm.Tensor,
weight: pyfastllm.Tensor,
bias: pyfastllm.Tensor):
output = pyfastllm.linear(input, weight, bias)
return output
def matmul(input0: pyfastllm.Tensor,
input1: pyfastllm.Tensor,
alpha: pyfastllm.Tensor):
output = pyfastllm.matmul(input0, input1, alpha)
return output
def attention(q: pyfastllm.Tensor,
k: pyfastllm.Tensor,
v: pyfastllm.Tensor,
mask: pyfastllm.Tensor,
group: int,
scale: float,
attentionType: int):
output = pyfastllm.attention(q, k, v, mask, group, scale, attentionType)
return output
def activation(input: pyfastllm.Tensor, axis=-1, activate_type="silu"):
assert activate_type in ("softmax", "silu", "gelu", "swiglu")
func = getattr(pyfastllm, activate_type)
if activate_type == "softmax":
return func(input, axis)
return func(input)
def mul(input: pyfastllm.Tensor, v: int):
output = pyfastllm.mul(input, v)
return output
def matmul_transB():
pass
def add(input0: pyfastllm.Tensor, input1: pyfastllm.Tensor):
output = pyfastllm.add(input0, input1)
return output
def AttentionMask():
pass
def AlibiMask():
pass
def topk():
pass
def RotatePosition2D():
pass
def NearlyRotatePosition2D():
pass
def LlamaRotatePosition2D():
pass
def RepeatPenalty():
pass
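# Illustrative sketch (not part of the diff): composing the wrappers above into a
# pre-norm MLP block. All arguments are assumed to be pyfastllm.Tensor objects
# created elsewhere; the function and parameter names are hypothetical.
def mlp_block(x, norm_weight, w_up, b_up, w_down, b_down, eps=1e-6):
    h = rms_norm(x, norm_weight, eps)           # pre-normalization
    h = linear(h, w_up, b_up)                   # up projection
    h = activation(h, activate_type="silu")     # SiLU non-linearity
    return linear(h, w_down, b_down)            # down projection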
#!encoding=utf8
import os
import tempfile
from typing import List, Tuple
import re
import pyfastllm
from . import utils
from .utils.quantizer import QuantType
class InferConfig():
def __init__(self,
max_length:int=2048,
top_p:float=0.7,
temperature:float=0.95,
**kwargs) -> None:
configs = {
"max_length": max_length,
"top_p": top_p,
"temperature": temperature
}
configs.update(kwargs)
self.from_dict(configs)
def from_dict(self, configs):
self.configs = configs
for key, val in configs.items():
setattr(self, key, val)
def to_dict(self, ):
return self.configs
@property
def flm_config(self, ):
flm_config = pyfastllm.GenerationConfig()
for attr, val in self.configs.items():
setattr(flm_config, attr, val)
return flm_config
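# Illustrative helper (not part of the diff): InferConfig mirrors its keyword
# arguments onto a pyfastllm.GenerationConfig through the flm_config property;
# only pass fields that GenerationConfig actually exposes.
def _example_build_config():
    cfg = InferConfig(max_length=512, top_p=0.8, temperature=0.7)
    return cfg.flm_config  # pyfastllm.GenerationConfig carrying the same values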
class BaseModel():
def __init__(self, model_path:str) -> None:
if model_path.endswith('flm'):
print("loading model:", pyfastllm.get_llm_type(model_path))
self.model = pyfastllm.create_llm(model_path)
elif os.path.isdir(model_path):
_, save_path = tempfile.mkstemp()
utils.convert(model_path, save_path, q_type=QuantType.INT4)
self.model = pyfastllm.create_llm(save_path)
else:
raise NotImplementedError(f"unsupport model type!")
def build_input(self, query, history):
raise NotImplementedError
def is_stop(self, token_id):
raise NotImplementedError
def process_response(self, response):
raise NotImplementedError
def stream_chat(self,
tokenizer=None,
query:str='',
history=None,
max_length:int=2048,
top_p:float=0.7,
temperature:float=0.95,
*args, **kwargs):
model = self.model
infer_config = InferConfig(max_length=max_length, top_p=top_p, temperature=temperature, **kwargs)
if not tokenizer: tokenizer = model.weight.tokenizer
if not history: history = []
prompt = self.build_input(query,history)
input_ids = tokenizer.encode(prompt)
handle = model.launch_response(input_ids, infer_config.flm_config)
outputs = []
ret_str = ""
while len(outputs) < max_length:
resp_token = model.fetch_response(handle)
if self.is_stop(resp_token):
break
outputs.append(resp_token)
content = tokenizer.decode(outputs)
ret_str = self.process_response(content)
yield ret_str, history + [(query, ret_str)]
def chat(self,
tokenizer=None,
query:str='',
history=None,
max_length:int=2048,
top_p:float=0.7,
temperature:float=0.95,
*args, **kwargs):
model = self.model
infer_config = InferConfig(max_length=max_length, top_p=top_p, temperature=temperature, **kwargs)
if not tokenizer: tokenizer = model.weight.tokenizer
if not history: history = []
prompt = self.build_input(query, history=history)
input_ids = tokenizer.encode(prompt)
handle = model.launch_response(input_ids, infer_config.flm_config)
outputs = []
ret_str = ""
while len(outputs) < max_length:
resp_token = model.fetch_response(handle)
if self.is_stop(resp_token):
break
outputs.append(resp_token)
content = tokenizer.decode(outputs)
ret_str = self.process_response(content)
history.append((query, ret_str))
return ret_str, history
class ChatglmModel(BaseModel):
def process_response(self, response):
response = response.strip()
response = response.replace("[[训练时间]]", "2023年")
return response
def is_stop(self, token_id):
return token_id <= 2
def build_input(self, query, history=None):
if not history: history = []
prompt = ""
for i, (old_query, response) in enumerate(history):
prompt += "[Round {}]\n问:{}\n答:{}\n".format(i, old_query, response)
prompt += "[Round {}]\n问:{}\n答:".format(len(history), query)
return prompt
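# Example: with history = [("Hi", "Hello!")] and query = "How are you?",
# build_input() above produces:
#   "[Round 0]\n问:Hi\n答:Hello!\n[Round 1]\n问:How are you?\n答:"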
class QwenModel(BaseModel):
def process_response(self, response):
return response
def is_stop(self, token_id):
chat_format = self.model.get("chat_format", "chatml")
if chat_format == "raw":
stop_words_ids = [151643]
elif chat_format == "chatml":
stop_words_ids = [151645, 151644]
return token_id in stop_words_ids
def build_input(self, query, history=None):
prompt = ""
chat_format = self.model.get("chat_format", "chatml")
if chat_format == "chatml":
if history is None: history = []
prompt = f"{self.model.im_start} system \n {self.model.pre_prompt} + {self.model.im_end}"
for i, (old_query, response) in enumerate(history):
prompt += old_query + response
prompt += f"\n {self.model.im_start + self.model.user_role} \n {query + self.model.im_end} \n {self.model.im_start + self.model.bot_role} \n"
elif chat_format == "raw":
prompt = query
else:
raise NotImplementedError(f"Unknown char_format for QWen: {chat_format}")
return prompt
class BaichuanModel(BaseModel):
def process_response(self, response):
return response
def is_stop(self, token_id):
return token_id == 2
def build_input(self, query, history=None):
prompt = ""
round = 0
# TODO: truncate the history to the maximum length
for i, (role, content) in enumerate(history):
if role == "system" and i == 0:
prompt += content
elif role == "user":
round += 1
prompt += f"<reserved_102>{content}"
elif role == "assistant":
prompt += f"<reserved_103>{content}"
return prompt
class MossModel(BaseModel):
def process_response(self, response):
return response
def is_stop(self, token_id):
return token_id == 106068
def build_input(self, query, history=None):
prompt = self.model.pre_prompt
if not history: history = []
for i, (old_query, response) in enumerate(history):
prompt += old_query + response
return prompt + f"{self.model.user_role} {query} {self.model.bot_role}"
class AutoFlmModel:
def __init__(self) -> None:
raise NotImplementedError
@classmethod
def from_pretrained(cls, model_path:str):
# hf_model
if os.path.isdir(model_path):
_, save_path = tempfile.mkstemp(suffix='flm')
utils.convert(model_path, save_path, q_type=QuantType.INT4)
model_path = save_path
if model_path.endswith('flm'):
model_type = pyfastllm.get_llm_type(model_path)
else:
raise NotImplementedError(f"unsupport model type!")
if model_type == "chatglm":
model = ChatglmModel(model_path)
elif model_type == "qwen":
model = QwenModel(model_path)
elif model_type == "baichuan":
model = BaichuanModel(model_path)
elif model_type == "moss":
model = MossModel(model_path)
else:
raise NotImplementedError(f"unsupport model: {model_type}!")
return model
\ No newline at end of file
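# Illustrative usage (not part of the diff); the model path is a placeholder.
def _example_stream_chat():
    model = AutoFlmModel.from_pretrained("chatglm-6b-int4.flm")
    for reply, history in model.stream_chat(query="Hello"):
        print(reply)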
from typing import Any
class Module():
def __init__(self) -> None:
pass
def __call__(self, *args: Any, **kwds: Any) -> Any:
return self.forward(*args, **kwds)
def forward(self, *args, **kwds):
pass
def _init_weight(self, ):
pass
from BaseModule import Module
from . import torch2flm
from transformers import AutoConfig, AutoModel, AutoModelForCausalLM, AutoTokenizer
def convert(model, tokenizer, output_path, **args):
torch2flm.tofile(output_path, model, tokenizer, **args)
from .quantizer import QuantType
from .converter import ChatglmConverter, BaichuanConverter, QwenConverter, MossConverter
def convert(hf_model_name_or_path:str, save_path:str, q_type=QuantType.INT4):
config = AutoConfig.from_pretrained(hf_model_name_or_path, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(hf_model_name_or_path, trust_remote_code=True)
if "Baichuan" in config.architectures:
model = AutoModelForCausalLM.from_pretrained(hf_model_name_or_path, trust_remote_code=True).cpu().eval()
converter = BaichuanConverter(model=model, tokenizer=tokenizer, q_type=q_type)
elif "ChatGLM" in config.architectures:
model = AutoModel.from_pretrained(hf_model_name_or_path, trust_remote_code=True).cpu().eval()
converter = ChatglmConverter(model=model, tokenizer=tokenizer, q_type=q_type)
elif "Qwen" in config.architectures:
model = AutoModelForCausalLM.from_pretrained(hf_model_name_or_path, trust_remote_code=True, fp16=True).cpu().eval()
converter = QwenConverter(model=model, tokenizer=tokenizer, q_type=q_type)
elif "Moss" in config.architectures:
model = AutoModelForCausalLM.from_pretrained(hf_model_name_or_path, trust_remote_code=True).cpu().eval()
converter = MossConverter(model=model, tokenizer=tokenizer, q_type=q_type)
else:
raise NotImplementedError(f"Unsupport model: {config.architectures}")
converter.dump(save_path)
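# Illustrative usage (not part of the diff); the HF model id and output path are placeholders.
def _example_convert():
    convert("THUDM/chatglm-6b", "chatglm-6b-int4.flm", q_type=QuantType.INT4)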
import struct
from typing import Any
import numpy as np
import torch
from .writer import Writer
from .quantizer import QuantType
class BaseConverter():
def __init__(self, model, tokenizer, q_type=0) -> None:
self.model = model
self.tokenizer = tokenizer
self.q_type = q_type
def get_model_info(self):
model_info = self.model.config.__dict__
if self.model.generation_config is not None:
model_info.update(self.model.generation_config.__dict__)
model_info["tokenizer_use_score"] = "1"
return model_info
def get_vocab(self, ):
raise NotImplementedError
def get_weights(self):
state_dict = self.model.state_dict()
if hasattr(self.model, "peft_config"):
state_dict = {key.replace('base_model.model.', ''): val for key, val in state_dict.items()}
state_dict = {key: val.numpy().astype(np.float32) for key, val in state_dict.items()}
for name, m in self.model.named_modules():
if isinstance(m, torch.nn.Linear):
if self.q_type == QuantType.FP16:
state_dict[name+".weight.fp16"] = state_dict[name+".weight"].astype(np.float16)
state_dict.pop(name+".weight")
elif self.q_type == QuantType.INT8:
state_dict[name+".weight.int8"] = state_dict[name+".weight"]
state_dict.pop(name+".weight")
elif self.q_type == QuantType.INT4:
state_dict[name+".weight.int4"] = state_dict[name+".weight"]
state_dict.pop(name+".weight")
return state_dict
def convert_model_info(self, wt:Writer):
model_info = self.get_model_info()
model_info = {
str(key): str(val)
for key, val in model_info.items()
}
wt.write(model_info)
def convert_tokenizer(self, wt:Writer):
vocab = self.get_vocab()
vocab_len = len(vocab)
wt.write(int(vocab_len))
for i, key in enumerate(vocab):
# wt.write(len(key))
# for c in key: wt.write(int(c))
wt.write(key)
wt.write(int(i))
wt.write(float(vocab[key]))
def convert_weights(self, wt:Writer):
state_dict = self.get_weights()
wt.write(len(state_dict))
tot = 0
for name, tensor in state_dict.items():
print(f"{name} : {tensor.shape}")
if name.endswith("int4") or name.endswith("int8") or name.endswith("fp16"):
wt.write(str(name[:-5]))
wt.write_tensor(tensor, self.q_type)
else:
wt.write(str(name))
wt.write(tensor)
print("output (", tot, "/", len(state_dict), end = " )\r")
tot += 1
print("\nfinish.")
def forward(self, wt:Writer, *args: Any, **kwds: Any) -> Any:
self.convert_model_info(wt)
self.convert_tokenizer(wt)
self.convert_weights(wt)
def __call__(self, wt:Writer, *args: Any, **kwds: Any) -> Any:
return self.forward(wt, *args, **kwds)
def dump(self, outpath:str):
wt = Writer(outpath=outpath)
# version id
wt.write(int(2))
self.forward(wt=wt)
class ChatglmConverter(BaseConverter):
def get_vocab(self):
tokenizer = self.tokenizer.tokenizer
piece_size = tokenizer.sp_model.piece_size()
vocab = {
tokenizer.sp_model.id_to_piece(i).encode(): float(tokenizer.sp_model.get_score(i)) for i in range(piece_size)
}
return vocab
class BaichuanConverter(BaseConverter):
def get_model_info(self, ):
model_info = super().get_model_info()
if hasattr(self.model, "model") and hasattr(self.model.model, "get_alibi_mask"):
model_info.update({
"use_alibi": "1",
"pre_prompt": "",
"user_role": "<FLM_FIX_TOKEN_" + str(self.model.generation_config.user_token_id) + "> ",
"bot_role": "<FLM_FIX_TOKEN_" + str(self.model.generation_config.assistant_token_id) + ">",
"history_sep": ""
})
return model_info
def get_vocab(self,):
vocab = self.tokenizer.get_vocab()
vocab = {
key.encode(): vocab[key] for key in vocab
}
return vocab
class QwenConverter(BaseConverter):
def get_model_info(self,):
model_info = super().get_model_info()
if model_info["chat_format"] == "chatml":
model_info.update({
"im_end_id": self.tokenizer.im_end_id,
"im_start_id": self.tokenizer.im_start_id
})
return model_info
def get_vocab(self, ):
vocab = self.tokenizer.get_vocab()
vocab = {
key: 1.0 for key in vocab.keys()
}
return vocab
class MossConverter(BaseConverter):
def get_vocab(self, ):
tokenizer = self.tokenizer.tokenizer
vocab = tokenizer.get_vocab()
vocab = {
bytes([tokenizer.byte_decoder.get(c, ord(c)) for c in v]): 1.0
for v in vocab
}
return vocab
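# Illustrative usage (not part of the diff): a converter can also be driven directly;
# `model` and `tokenizer` are transformers objects, the output path is a placeholder.
def _example_dump(model, tokenizer):
    converter = ChatglmConverter(model=model, tokenizer=tokenizer, q_type=QuantType.INT4)
    converter.dump("chatglm-6b-int4.flm")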
import numpy as np
from enum import Enum
from .writer import Writer
class QuantType(Enum):
FP32 = 0
FP16 = 7
INT8 = 3
INT4 = 8
class Quantizer():
quant_bit = {QuantType.FP16: 16, QuantType.INT8: 8, QuantType.INT4: 4}
def __init__(self, quant_type:QuantType, symmetry=True) -> None:
self.quant_type = quant_type
self.q_bit = self.quant_bit[quant_type]
self.up_bound = (2**(self.q_bit-1)) -1
self.low_bound = -(2 ** (self.q_bit-1))
self.symmetry = symmetry
# Narrower range, higher per-value precision; suited to tightly concentrated distributions.
def asymquantize(self, data:np.ndarray):
c_min = np.expand_dims(data.min(axis=-1), -1)
c_max = np.expand_dims(data.max(axis=-1), -1)
c_scale = (c_max - c_min) / (self.up_bound - self.low_bound)
c_zero = np.round(0.0 - c_min / c_scale).clip(0, self.up_bound - self.low_bound)
c_min = -c_scale * c_zero
q_data = (data - c_min)/ c_scale
if self.quant_type == QuantType.FP32:
q_data = data.astype(np.float32)
elif self.quant_type == QuantType.FP16:
q_data = data.astype(np.float16)
elif self.quant_type == QuantType.INT8:
q_data = (q_data + 0.5).clip(0, 255).astype(np.uint8)
elif self.quant_type == QuantType.INT4:
q_data = (q_data + 0.5).astype(np.int8).clip(0, 15).astype(np.uint8)
q_data = q_data[:, 0::2] * 16 + q_data[:, 1::2]
else:
raise NotImplementedError(f"unsupport quant type")
self.c_min = c_min
self.c_max = c_max
self.c_scale = c_scale
self.c_zero = c_zero
self.quant_data = q_data
return q_data
# Wider range, lower per-value precision; suited to more dispersed distributions.
def symquantize(self, data:np.ndarray):
c_min = np.expand_dims(-np.abs(data).max(axis = -1), -1)
c_max = np.expand_dims(np.abs(data).max(axis = -1), -1)
c_scale = c_max / self.up_bound
c_min = c_scale * self.low_bound
q_data = (data - c_min) / c_scale
if self.quant_type == QuantType.FP32:
q_data = data.astype(np.float32)
elif self.quant_type == QuantType.FP16:
q_data = data.astype(np.float16)
elif self.quant_type == QuantType.INT8:
q_data = (q_data + 0.5).clip(1, 255).astype(np.uint8)
elif self.quant_type == QuantType.INT4:
q_data = (q_data + 0.5).astype(np.int8).clip(0, 15).astype(np.uint8)
q_data = q_data[:, 0::2] * 16 + q_data[:, 1::2]
else:
raise NotImplementedError(f"unsupport quant type")
self.c_min = c_min
self.c_max = c_max
self.c_scale = c_scale
self.quant_data = q_data
return q_data
def quantize(self, data:np.ndarray):
if self.symmetry:
return self.symquantize(data)
else:
return self.asymquantize(data)
def dequantize(self, ):
if getattr(self, "c_scale", None) is None:
    raise ValueError("quantize() must be called before dequantize()")
data = self.quant_data * self.c_scale + self.c_min
data = data.astype(np.float32)
return data
def dump(self, wt:Writer):
wt.write(self.quant_type.value)
if self.quant_type in (QuantType.INT4, QuantType.INT8):
wt.write(0)
for i in range(self.c_min.shape[0]):
wt.write(float(self.c_min[i][0]))
wt.write(float(self.c_max[i][0]))
wt.fd.write(self.quant_data.data)
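# Illustrative round trip (not part of the diff) using the Quantizer above; the
# reconstruction error is on the order of one quantization step per value.
def _example_roundtrip():
    w = np.random.randn(4, 64).astype(np.float32)
    quant = Quantizer(QuantType.INT8, symmetry=True)
    q_codes = quant.quantize(w)    # per-row uint8 codes
    w_hat = quant.dequantize()     # float32 approximation of w
    print(np.abs(w_hat - w).max())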
import numpy as np
import struct
from enum import Enum
class QuantType(Enum):
FP32 = 0
FP16 = 7
INT8 = 3
INT4 = 8
def write_int8(fo, v):
c_max = np.expand_dims(np.abs(v).max(axis = -1), -1).clip(0.1, 1e100)
c_scale = c_max / 127.0
v = (v / c_scale + 128.5).clip(1, 255).astype(np.uint8)
fo.write(struct.pack('i', 3))
fo.write(struct.pack('i', 0))
for i in range(c_max.shape[0]):
fo.write(struct.pack('f', -c_max[i][0]))
fo.write(struct.pack('f', c_max[i][0]))
fo.write(v.data)
def write_int4(fo, v):
c_min = np.expand_dims(-np.abs(v).max(axis = -1), -1)
c_max = np.expand_dims(np.abs(v).max(axis = -1), -1)
c_scale = c_max / 7.0
c_min = c_scale * -8.0
v = (v - c_min) / c_scale
v = (v + 0.5).astype(np.int8).clip(0, 15).astype(np.uint8)
v = v[:, 0::2] * 16 + v[:, 1::2]
fo.write(struct.pack('i', 8))
fo.write(struct.pack('i', 0))
for i in range(c_min.shape[0]):
fo.write(struct.pack('f', c_min[i][0]))
fo.write(struct.pack('f', c_max[i][0]))
fo.write(v.data)
class Writer():
def __init__(self, outpath) -> None:
self.fd = open(outpath, 'wb')
def __del__(self, ):
if not self.fd.closed:
self.fd.close()
def write(self, value):
if isinstance(value, int):
self.fd.write(struct.pack('i', value))
elif isinstance(value, float):
self.fd.write(struct.pack('f', value))
elif isinstance(value, str):
self.write_str(value)
elif isinstance(value, bytes):
self.write_bytes(value)
elif isinstance(value, list):
self.write_list(value)
elif isinstance(value, dict):
self.write_dict(value)
elif isinstance(value, np.ndarray):
self.write_tensor(value)
else:
raise NotImplementedError(f"Unsupport data type: {type(value)}")
def write_str(self, s):
self.write(len(s))
self.fd.write(s.encode())
def write_bytes(self, s):
self.write(len(s))
for c in s: self.write(int(c))
def write_list(self, data):
self.write(len(data))
for d in data: self.write(d)
def write_dict(self, data):
self.write(len(data))
for key in data:
self.write_str(key)
self.write(data[key])
def write_tensor(self, data, data_type:QuantType=QuantType.FP32):
self.write(list(data.shape))
if data_type == QuantType.INT4:
write_int4(self.fd, data)
elif data_type == QuantType.INT8:
write_int8(self.fd, data)
else:
self.write(int(data_type.value))
self.fd.write(data.data)
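# Illustrative reader (not part of the diff) for the header that BaseConverter.dump()
# emits through this Writer: a version int, then the model-info dict written as
# length-prefixed strings. The file path is a placeholder.
def _example_read_header(path="model.flm"):
    def read_int(fd):
        return struct.unpack('i', fd.read(4))[0]
    def read_str(fd):
        return fd.read(read_int(fd)).decode(errors="replace")
    with open(path, "rb") as fd:
        version = read_int(fd)  # BaseConverter.dump() writes 2
        n_info = read_int(fd)   # number of model-info entries
        return version, {read_str(fd): read_str(fd) for _ in range(n_info)}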
rm -rf build/ && rm -rf dist/
python3 setup.py sdist bdist_wheel
pip install dist/*.whl --force-reinstall
# python3 examples/test_ops.py # coredump when run with cuda backend
\ No newline at end of file
import glob
import os.path
from setuptools import setup, Extension
from setuptools import find_packages
# reference: https://github.com/pybind/cmake_example
import os
import re
import shutil
import subprocess
import sys
from pathlib import Path
import glob
import platform
import argparse
parser = argparse.ArgumentParser(description='build pyfastllm wheel')
parser.add_argument('--cuda', dest='cuda', action='store_true', default=False,
help='build with cuda support')
args, unknown = parser.parse_known_args()
sys.argv = [sys.argv[0]] + unknown
__VERSION__ = "'0.1.3'"
BASE_DIR = os.path.dirname(os.path.dirname(__file__))
ext_modules = []
try:
from pybind11.setup_helpers import Pybind11Extension
source_files = glob.glob(os.path.join(BASE_DIR, "src/**/*.cpp"), recursive=True)
source_files = [f for f in source_files if not f.endswith("cudadevice.cpp")]  # exclude the CUDA-only source from the CPU build
extra_compile_args = ["-w", "-DPY_API"]
# If any libraries are used, e.g. libabc.so
include_dirs = [os.path.join(BASE_DIR, "include/"), os.path.join(BASE_DIR, "include/devices/cpu/"), os.path.join(BASE_DIR, "include/models"), os.path.join(BASE_DIR, "include/utils")]
library_dirs = []
# (optional) if the library is not in the dir like `/usr/lib/`
# either to add its dir to `runtime_library_dirs` or to the env variable "LD_LIBRARY_PATH"
# MUST be absolute path
runtime_library_dirs = []
libraries = []
if args.cuda:
assert False, "Not Implement Yet!"
extra_compile_args.append("-DUSE_CUDA -Wl,-rpath,$ORIGIN/")
source_files.append(os.path.join(BASE_DIR, "src/devices/cuda/cudadevice.cpp"))
include_dirs.append(os.path.join(BASE_DIR, "include/devices/cuda/"))
library_dirs.append("/usr/local/cuda/lib64/")
library_dirs.append(os.path.join(BASE_DIR, "pyfastllm/"))
libraries.append("fastllm_cuda")
ext_modules = [
Pybind11Extension(
"pyfastllm",
source_files,
define_macros=[('VERSION_INFO', __VERSION__)],
include_dirs=include_dirs,
library_dirs=library_dirs,
runtime_library_dirs=runtime_library_dirs,
libraries=libraries,
extra_compile_args=extra_compile_args,
cxx_std=17,
language='c++'
),
]
except Exception as e:
print(f"some errors happened: ")
print(e)
sys.exit(1)
cmdclass = {}
from setuptools import Extension, find_packages, setup
from setuptools.command.build_ext import build_ext
# Convert distutils Windows platform specifiers to CMake -A arguments
PLAT_TO_CMAKE = {
"win32": "Win32",
"win-amd64": "x64",
"win-arm32": "ARM",
"win-arm64": "ARM64",
}
# A CMakeExtension needs a sourcedir instead of a file list.
# The name must be the _single_ output extension from the CMake build.
# If you need multiple extensions, see scikit-build.
class CMakeExtension(Extension):
def __init__(self, name: str, sourcedir: str = "") -> None:
super().__init__(name, sources=[])
self.sourcedir = os.fspath(Path(sourcedir).resolve())
class CMakeBuild(build_ext):
def build_extension(self, ext: CMakeExtension) -> None:
# Must be in this form due to bug in .resolve() only fixed in Python 3.10+
ext_fullpath = Path.cwd() / self.get_ext_fullpath(ext.name)
extdir = ext_fullpath.parent.resolve()
# Using this requires trailing slash for auto-detection & inclusion of
# auxiliary "native" libs
debug = int(os.environ.get("DEBUG", 0)) if self.debug is None else self.debug
cfg = "Debug" if debug else "Release"
# CMake lets you override the generator - we need to check this.
# Can be set with Conda-Build, for example.
cmake_generator = os.environ.get("CMAKE_GENERATOR", "")
use_cuda = os.environ.get("USE_CUDA", "ON")
# Set Python_EXECUTABLE instead if you use PYBIND11_FINDPYTHON
# EXAMPLE_VERSION_INFO shows you how to pass a value into the C++ code
# from Python.
cmake_args = [
f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}{os.sep}",
f"-DPYTHON_EXECUTABLE={sys.executable}",
f"-DCMAKE_BUILD_TYPE={cfg}", # not used on MSVC, but no harm
f"-DPY_API=ON",
f"-DUSE_CUDA={use_cuda}",
]
build_args = []
# Adding CMake arguments set as environment variable
# (needed e.g. to build for ARM OSx on conda-forge)
if "CMAKE_ARGS" in os.environ:
cmake_args += [item for item in os.environ["CMAKE_ARGS"].split(" ") if item]
if self.compiler.compiler_type != "msvc":
# Using Ninja-build since it a) is available as a wheel and b)
# multithreads automatically. MSVC would require all variables be
# exported for Ninja to pick it up, which is a little tricky to do.
# Users can override the generator with CMAKE_GENERATOR in CMake
# 3.15+.
if not cmake_generator or cmake_generator == "Ninja":
try:
import ninja
ninja_executable_path = Path(ninja.BIN_DIR) / "ninja"
cmake_args += [
"-GNinja",
f"-DCMAKE_MAKE_PROGRAM:FILEPATH={ninja_executable_path}",
]
except ImportError:
pass
else:
# Single config generators are handled "normally"
single_config = any(x in cmake_generator for x in {"NMake", "Ninja"})
# CMake allows an arch-in-generator style for backward compatibility
contains_arch = any(x in cmake_generator for x in {"ARM", "Win64"})
# Specify the arch if using MSVC generator, but only if it doesn't
# contain a backward-compatibility arch spec already in the
# generator name.
if not single_config and not contains_arch:
cmake_args += ["-A", PLAT_TO_CMAKE[self.plat_name]]
# Multi-config generators have a different way to specify configs
if not single_config:
cmake_args += [f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{cfg.upper()}={extdir}"]
build_args += ["--config", cfg]
if sys.platform.startswith("darwin"):
# Cross-compile support for macOS - respect ARCHFLAGS if set
archs = re.findall(r"-arch (\S+)", os.environ.get("ARCHFLAGS", ""))
if archs:
cmake_args += ["-DCMAKE_OSX_ARCHITECTURES={}".format(";".join(archs))]
# Set CMAKE_BUILD_PARALLEL_LEVEL to control the parallel build level
# across all generators.
# if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ:
# # self.parallel is a Python 3 only way to set parallel jobs by hand
# # using -j in the build_ext call, not supported by pip or PyPA-build.
# if hasattr(self, "parallel") and self.parallel:
# # CMake 3.12+ only.
# build_args += [f"-j{self.parallel}"]
# Compile in parallel by default
build_args += [f"-j"]
build_temp = Path(self.build_temp) / ext.name
if not build_temp.exists():
build_temp.mkdir(parents=True)
subprocess.run(["cmake", ext.sourcedir, *cmake_args], cwd=build_temp, check=True)
subprocess.run(["cmake", "--build", ".", *build_args], cwd=build_temp, check=True)
HERE = Path(__file__).resolve().parent
VERSION = re.search(r'__version__ = "(.*?)"', (HERE / "fastllm/__init__.py").read_text(encoding="utf-8")).group(1)
setup(
name='fastllm',
version=eval(__VERSION__),
version=VERSION,
description='python api for fastllm',
author='wildkid1024',
author_email='wildkid1024@outlook.com',
......@@ -77,14 +140,13 @@ setup(
maintainer_email='',
url='',
long_description='',
ext_modules=ext_modules,
ext_modules=[CMakeExtension(name="pyfasltllm", sourcedir="..")],
cmdclass={"build_ext": CMakeBuild},
packages = find_packages(),
cmdclass=cmdclass,
setup_requires=["pybind11"],
setup_requires=[""],
install_requires=[""],
python_requires='>=3.6',
# data_files = [('', ['libfastllm_cuda.so'])],
include_package_data=False,
include_package_data=True,
entry_points={
'console_scripts':[
'fastllm-convert = fastllm.convert:main'
......@@ -98,4 +160,4 @@ setup(
'LLM::Moss',
'LLM::LLama'
]
)
\ No newline at end of file
)
pip uninstall -y fastllm
rm -rf fastllm/pyfastllm.cpython-310-x86_64-linux-gnu.so
rm -rf build/
python3 build_libs.py
python3 setup.py sdist bdist_wheel
pip install dist/fastllm-0.1.4-py3-none-any.whl
python3 demo/test_ops.py
\ No newline at end of file
......@@ -275,7 +275,6 @@ namespace fastllm {
Data &output = *(datas.find("output")->second);
int group = intParams.find("group") != intParams.end() ? intParams.find("group")->second : 1;
float scale = floatParams.find("scale") != floatParams.end() ? floatParams.find("scale")->second : 1.0;
output.Allocate();
int q0 = q.dims[0], q1 = q.dims[1], q2 = q.dims[2], k0 = k.dims[0], k1 = k.dims[1], v2 = v.dims[2];
float *qd = (float*)q.cpuData;
......@@ -283,13 +282,16 @@ namespace fastllm {
float *vd = (float*)v.cpuData;
float *maskd = (datas.find("mask")->second && mask.dims.size() > 0) ? (float*)mask.cpuData : nullptr;
float *od = (float*)output.cpuData;
int batch = (maskd != nullptr && mask.dims.size() == 3) ? mask.dims[0] : 1;
batch = intParams.find("mask___batch") != intParams.end() ? intParams.find("mask___batch")->second : batch;
int maskStride = (maskd != nullptr) ? (mask.dims.size() == 3 ? mask.strides[0] : mask.Count(0)) : 0;
std::fill(od, od + output.Count(0), 0.0f);
auto pool = GetPool();
std::vector<std::future<void> > futures;
for (int o = 0; o < q0; o++) {
futures.push_back(pool->Submit(SingleAttention,
qd + o * q.strides[0], kd + (o / group) * k.strides[0], vd + (o / group) * v.strides[0],
maskd ? (maskd + o / (q0 / mask.dims[0])) : maskd, od + o * output.strides[0], scale,
maskd + (o / (q0 / batch)) * maskStride, od + o * output.strides[0], scale,
q1, q2, k1, v2));
}
for (int o = 0; o < futures.size(); o++) {
......
......@@ -169,6 +169,12 @@ namespace fastllm {
FastllmCudaLayerNorm(input, gamma, beta, output, axis);
}
// CudaLinearOp::CudaLinearOp() {
// printf("CudaLinearOp\n");
// const int numStreams = 4; // assume four CUDA streams
// streams_handle = FastllmCreateStreams(numStreams);
// }
void CudaLinearOp::Reshape(const std::string &opType, const fastllm::DataDict &datas,
const fastllm::FloatDict &floatParams, const fastllm::IntDict &intParams) {
Data &input = *(datas.find("input")->second);
......@@ -207,6 +213,7 @@ namespace fastllm {
FastllmCudaMatMulFloat32(input, weight, bias, output, n, m, k);
} else if (weight.dataType == DataType::FLOAT16) {
FastllmCudaMatMulFloat16(input, weight, bias, output, n, m, k);
// FastllmCudaMatMulFloat16(input, weight, bias, output, n, m, k, streams_handle);
} else if (weight.dataType == DataType::INT8) {
FastllmCudaMatMulFloatInt8(input, weight, bias, output, n, m, k);
} else if (weight.dataType == DataType::INT4) {
......
......@@ -34,6 +34,11 @@
#include "fastllm-cuda.cuh"
#endif
#ifdef PY_API
#include <pybind11/embed.h>
namespace py = pybind11;
#endif
namespace fastllm {
std::map <std::string, int> defaultDeviceMap;
Executor defaultExecutor;
......@@ -41,7 +46,7 @@ namespace fastllm {
static std::mutex globalLocker;
static int threads = 4;
static ThreadPool *fastllmThreadPool = new ThreadPool(threads);
static ThreadPool *fastllmThreadPool = nullptr;
static bool lowMemMode = false;
static bool kvCacheInCPU = false;
......@@ -74,6 +79,9 @@ namespace fastllm {
}
void SetThreads(int t) {
#ifdef PY_API
py::gil_scoped_release release;
#endif
globalLocker.lock();
threads = t;
if (fastllmThreadPool != nullptr) {
......@@ -82,6 +90,9 @@ namespace fastllm {
}
fastllmThreadPool = new ThreadPool(t);
globalLocker.unlock();
#ifdef PY_API
py::gil_scoped_acquire acquire;
#endif
}
void SetLowMemMode(bool m) {
......@@ -101,6 +112,8 @@ namespace fastllm {
}
ThreadPool *GetPool() {
if (fastllmThreadPool == nullptr)
SetThreads(threads);
return fastllmThreadPool;
}
#ifdef USE_MMAP
......@@ -247,6 +260,7 @@ namespace fastllm {
}
Data::Data(fastllm::DataType type, const std::vector<int> &dims, const std::vector<float> &data) : Data::Data(type, dims) {
// std::cout<<"调用数值构造"<<std::endl;
this->Allocate();
if (type == DataType::FLOAT32) {
std::memcpy(this->cpuData, data.data(), this->GetBytes());
......@@ -258,6 +272,7 @@ namespace fastllm {
}
void Data::CopyFrom(const Data &ori) {
// std::cout<<"调用拷贝构造"<<std::endl;
if (ori.dims != this->dims || this->cpuData == nullptr) {
if (ori.dims.size() == 0) {
delete[] this->cpuData;
......@@ -515,6 +530,10 @@ namespace fastllm {
printf("\n");
}
std::vector<int> Data::Shape() const{
return this->dims;
}
void Data::Print() const {
printf("shape: ");
for (int i : this->dims) {
......@@ -538,7 +557,7 @@ namespace fastllm {
}
printf("\n");
*/
// // To print data that lives in CUDA memory, move it to the CPU first. xzhou 20230728
// To print data that lives in CUDA memory, move it to the CPU first. xzhou 20230728
// if (dataDevice == DataDevice::CUDA) {
// ToDevice(DataDevice::CPU);
// }
......@@ -682,11 +701,20 @@ namespace fastllm {
#ifdef USE_CUDA
if (this->dataDevice == DataDevice::CPU) {
if (device == DataDevice::CUDA) {
uint8_t *cpuData = this->cpuData;
#ifdef USE_MMAP
cpuData = new uint8_t[expansionBytes];
memcpy(cpuData, this->cpuData, expansionBytes);
#endif
FastllmCudaSetDevice(deviceIds.size() == 0 ? 0 : deviceIds[0]);
this->cudaData = FastllmCudaMalloc(expansionBytes);
FastllmCudaCopyFromHostToDevice(this->cudaData, this->cpuData, expansionBytes);
FastllmCudaCopyFromHostToDevice(this->cudaData, cpuData, expansionBytes);
#ifdef USE_MMAP
delete[] cpuData;
#else
delete[] this->cpuData;
this->cpuData = nullptr;
#endif
}
} else if (this->dataDevice == DataDevice::CUDA) {
if (device == DataDevice::CPU) {
......@@ -695,16 +723,16 @@ namespace fastllm {
FastllmCudaFree(this->cudaData);
this->cudaData = nullptr;
} else if (device == DataDevice::CUDA) {
FastllmCudaSetDevice(this->dataDeviceIds.size() == 0 ? 0 : this->dataDeviceIds[0]);
uint8_t *cpuData = new uint8_t[expansionBytes];
FastllmCudaCopyFromDeviceToHost(cpuData, this->cudaData, expansionBytes);
FastllmCudaFree(this->cudaData);
int sourceDevice = this->dataDeviceIds.size() == 0 ? 0 : this->dataDeviceIds[0];
int destDevice = deviceIds.size() == 0 ? 0 : deviceIds[0];
FastllmCudaSetDevice(destDevice);
void *newCudaData = FastllmCudaMalloc(expansionBytes);
FastllmCudaSetDevice(deviceIds.size() == 0 ? 0 : deviceIds[0]);
this->cudaData = FastllmCudaMalloc(expansionBytes);
FastllmCudaCopyFromHostToDevice(this->cudaData, cpuData, expansionBytes);
delete[] cpuData;
FastllmCudaMemcpyBetweenDevices(destDevice, newCudaData, sourceDevice, this->cudaData, expansionBytes);
FastllmCudaSetDevice(sourceDevice);
FastllmCudaFree(this->cudaData);
this->cudaData = newCudaData;
FastllmCudaSetDevice(destDevice);
}
}
#endif
......@@ -790,6 +818,7 @@ namespace fastllm {
now->tokenId = tokenId;
now->score = score;
tokenToStringDict[tokenId] = s;
tokenToScoreDict[tokenId] = score;
stringToTokenDict[s] = tokenId;
}
......@@ -835,9 +864,10 @@ namespace fastllm {
}
for (int i = 0; i < ori.size(); i++) {
if (ori[i] == ' ') {
if (i != 0 && ori[i - 1] != ' ') {
s += blank;
}
// if (i != 0 && ori[i - 1] != ' ') {
// s += blank;
// }
s += blank;
} else {
s += ori[i];
}
......@@ -931,6 +961,132 @@ namespace fastllm {
}
}
return Data (DataType::FLOAT32, {1, (int)v.size()}, v);
} else if (this->type == TokenizerType::GLM) {
const std::map<std::string, int> specialTokens = {{"[MASK]", 50003}, {"[sMASK]", 50008}, {"[gMASK]", 50009}};
std::string blank = "";
blank += 226, blank += 150, blank += 129;
std::string s = blank;
for (int i = 0; i < ori.size(); i++) {
if (ori[i] == ' ') {
if (i != 0 && ori[i - 1] != ' ') {
s += blank;
}
} else {
s += ori[i];
}
}
std::vector<float> v;
int findPos = 0;
while (findPos < s.length()) {
    int nextSpecialToken = -1;
    int nextSpecialTokenPos = -1;
    int nextSpecialTokenLen = -1;
    for (auto p : specialTokens) {
        int ind = s.find(p.first, findPos);
        if (ind >= 0 && (nextSpecialTokenPos < 0 || ind < nextSpecialTokenPos)) {
            nextSpecialTokenPos = ind;
            nextSpecialToken = p.second;
            nextSpecialTokenLen = p.first.length();
        }
    }
    std::string subStr;
    if (nextSpecialTokenPos < 0) {
        subStr = s.substr(findPos);
        findPos = s.length();
    } else {
        subStr = s.substr(findPos, nextSpecialTokenPos - findPos);
        findPos = nextSpecialTokenPos + nextSpecialTokenLen;
    }
    if (subStr.length() > 0) {
#ifdef USE_SENTENCEPIECE
        if (spProcessor != nullptr) {
            std::vector<int> ids;
            spProcessor->Encode(subStr, &ids);
            for (int id : ids) {
                v.push_back(id);
            }
        } else {
#endif
std::vector<Symbol> symbols;
for (int i = 0; i < subStr.size(); i++) {
int tokenId = -999999, pos = i - 1;
TrieNode *now = this->root;
for (int j = i; j < subStr.size(); j++) {
if (now->next.find(subStr[j]) != now->next.end()) {
now = now->next[subStr[j]];
if (now->tokenId != -999999) {
tokenId = now->tokenId;
pos = j;
break;
}
} else {
break;
}
}
if (pos >= i) {
symbols.push_back(Symbol(now, (char *) subStr.data(), i, pos - i + 1, (int) symbols.size() - 1,
(int) symbols.size() + 1, -999999));
i = pos;
} else {
symbols.push_back(Symbol(nullptr, (char *) subStr.data(), i, 0, (int) symbols.size() - 1,
(int) symbols.size() + 1, -999999));
}
}
symbols.back().next = -1;
std::priority_queue<SymbolPairs> workQueue;
for (int i = 1; i < symbols.size(); i++) {
TryMergePairs(symbols, i - 1, i, workQueue);
}
while (!workQueue.empty()) {
auto top = workQueue.top();
workQueue.pop();
if (symbols[top.l].len == 0 || symbols[top.r].len == 0 ||
symbols[top.l].len + symbols[top.r].len != top.size) {
continue;
}
for (int i = symbols[top.r].pos; i < symbols[top.r].pos + symbols[top.r].len; i++) {
symbols[top.l].node = symbols[top.l].node->next[symbols[top.r].s[i]];
}
symbols[top.l].len += symbols[top.r].len;
symbols[top.r].len = 0;
symbols[top.l].next = symbols[top.r].next;
if (symbols[top.r].next >= 0) {
symbols[symbols[top.r].next].prev = top.l;
}
TryMergePairs(symbols, symbols[top.l].prev, top.l, workQueue);
TryMergePairs(symbols, top.l, symbols[top.l].next, workQueue);
}
for (int i = 0; i < symbols.size(); i++) {
if (symbols[i].len > 0) {
v.push_back(symbols[i].node->tokenId);
} else if (symbols[i].node == nullptr) {
if (symbols[i].fixId != -999999) {
v.push_back(symbols[i].fixId);
} else {
// unrecognized character
uint8_t c = (uint8_t) (symbols[i].s[symbols[i].pos]);
std::string now = "<0x00>";
now[3] = (c / 16 > 9 ? ('A' + c / 16 - 10) : ('0' + c / 16));
now[4] = (c % 16 > 9 ? ('A' + c % 16 - 10) : ('0' + c % 16));
if (stringToTokenDict.find(now) != stringToTokenDict.end()) {
v.push_back(stringToTokenDict[now]);
}
}
}
}
#ifdef USE_SENTENCEPIECE
}
#endif
}
if(nextSpecialTokenPos>=0){
v.push_back(nextSpecialToken);
}
}
return Data (DataType::FLOAT32, {1, (int)v.size()}, v);
} else if (this->type == TokenizerType::QWEN) {
std::map<std::string, int> specialTokens = {{"<|im_start|>", 151644}, {"<|im_end|>", 151645}, {"<|endoftext|>", 151643}};
......@@ -1246,7 +1402,8 @@ namespace fastllm {
}
} else {
#ifdef USE_MMAP
weight[name].set_file(mapped_file);
weight[name].SetMapFile(mapped_file);
weight[name].expansionBytes = (weight[name].Count(0) * weight[name].unitSize - 1) / weight[name].unitSizeDiv + 1;
#else
weight[name].Allocate();
#endif
......@@ -1594,6 +1751,21 @@ namespace fastllm {
}
}
void WeightMap::ReleaseWeight() {
for (auto &w : this->weight) {
#ifndef USE_MMAP
delete[] w.second.cpuData;
w.second.cpuData = nullptr;
#endif
#ifdef USE_CUDA
if (w.second.cudaData != nullptr) {
FastllmCudaDirectFree(w.second.cudaData);
w.second.cudaData = nullptr;
}
#endif
}
}
Data &WeightMap::operator[](const std::string &key) {
return weight[key];
}
......@@ -1958,4 +2130,4 @@ namespace fastllm {
std::map <std::string, int> GetDeviceMap() {
return defaultDeviceMap;
}
}
\ No newline at end of file
}
......@@ -7,6 +7,7 @@
#include "moss.h"
#include "llama.h"
#include "qwen.h"
#include "glm.h"
namespace fastllm {
void basellm::LoadFromFile(const std::string &fileName) {
......@@ -16,8 +17,12 @@ namespace fastllm {
void basellm::InitParams() {
if (this->weight.dicts.find("bos_token_id") != this->weight.dicts.end()) {
this->bos_token_id = atoi(this->weight.dicts["bos_token_id"].c_str());
this->eos_token_id = atoi(this->weight.dicts["eos_token_id"].c_str());
if(this->weight.dicts["bos_token_id"]!="None"){
this->bos_token_id = atoi(this->weight.dicts["bos_token_id"].c_str());
}
if(this->weight.dicts["eos_token_id"]!="None"){
this->eos_token_id = atoi(this->weight.dicts["eos_token_id"].c_str());
}
}
if (this->weight.dicts.find("im_start_id") != this->weight.dicts.end()) {
this->bos_token_id = atoi(this->weight.dicts["im_start_id"].c_str());
......@@ -25,6 +30,8 @@ namespace fastllm {
}
if (this->weight.dicts.find("num_hidden_layers") != this->weight.dicts.end()) {
block_cnt = atoi(this->weight.dicts["num_hidden_layers"].c_str());
}else if (this->weight.dicts.find("num_layers") != this->weight.dicts.end()) {
block_cnt = atoi(this->weight.dicts["num_layers"].c_str());
}
if (this->weight.dicts.find("hidden_size") != this->weight.dicts.end()) {
embed_dim = atoi(this->weight.dicts["hidden_size"].c_str());
......@@ -81,6 +88,11 @@ namespace fastllm {
} else if (modelType == "qwen") {
model = (basellm *) (new QWenModel());
model->weight.tokenizer.type = Tokenizer::TokenizerType::QWEN;
} else if (modelType == "glm") {
model = (basellm*)(new GLMModel());
} else if (modelType == "chatglm3") {
model = (basellm*)(new ChatGLMModel());
model->model_type = "chatglm3";
} else {
ErrorInFastLLM("Unkown model type: " + modelType);
}
......
......@@ -61,8 +61,8 @@ namespace fastllm {
#endif
std::string prompt = input;
#ifdef PY_API
size_t pos = input.find_last_of("time_stamp:");
prompt = (generationConfig.enable_hash_id && pos != std::string::npos) ? input.substr(0, pos - 10) : input;
size_t pos = input.rfind("time_stamp:");
prompt = (generationConfig.enable_hash_id && pos != -1) ? input.substr(0, pos) : input;
size_t hash_id = std::hash<std::string>{}(input);
#endif
Data inputIds, attentionMask, positionIds;
......@@ -151,8 +151,8 @@ namespace fastllm {
size_t hash_id = std::hash<std::string>{}(_input);
hash_ids.push_back(hash_id);
size_t pos = _input.find_last_of("time_stamp:");
std::string prompt = (generationConfig.enable_hash_id && pos != std::string::npos) ? _input.substr(0, pos - 10) : _input;
size_t pos = _input.rfind("time_stamp:");
std::string prompt = (generationConfig.enable_hash_id && pos != -1) ? _input.substr(0, pos) : _input;
prompts.push_back(prompt);
}
#else
......@@ -208,6 +208,11 @@ namespace fastllm {
inputTokens[i] = std::vector <float> {(float)ret[i]};
if (ret[i] == eos_token_id) {
isEnding[i] = true;
} else {
auto itStopTk = generationConfig.stop_token_ids.find(ret[i]);
if (itStopTk != generationConfig.stop_token_ids.end()) {
isEnding[i] = true;
}
}
if (isEnding[i]) {
curStrings.push_back("");
......@@ -284,6 +289,127 @@ namespace fastllm {
#endif
}
void basellm::ResponseBatch(std::vector<std::vector<float>> &inputTokens, std::vector<std::string> &outputs,
RuntimeResultBatch retCb, const fastllm::GenerationConfig &generationConfig) {
#ifdef USE_CUDA
FastllmCudaClearBigBuffer();
#endif
// 1. first
Data inputIds, attentionMask, positionIds;
int batch = inputTokens.size();
outputs.clear();
outputs.resize(batch, "");
std::vector <std::pair <Data, Data> > pastKeyValues;
for (int i = 0; i < block_cnt; i++) {
pastKeyValues.push_back(std::make_pair(Data(DataType::FLOAT32),
Data(DataType::FLOAT32)));
}
std::vector <std::map <std::string, int> > params;
params.resize(batch);
for (int i = 0; i < batch; i++) {
params[i]["promptLen"] = (int)inputTokens[i].size();
}
params[0]["index"] = 0;
int index = 0;
LastTokensManager tokensManager (batch, generationConfig.last_n);
std::vector <bool> isEnding = std::vector <bool> (batch, false);
FillLLMInputsBatch(inputTokens, params, inputIds, attentionMask, positionIds);
while (true) {
auto st = std::chrono::system_clock::now();
std::vector <int> ret = ForwardBatch(batch, inputIds, attentionMask, positionIds, pastKeyValues,
generationConfig, tokensManager);
for (int i = 0; i < batch; i++) {
tokensManager.units[i].Push(ret[i]);
}
std::vector <float> fret;
std::vector <float> results;
int endingCount = 0;
std::vector <std::string> curStrings;
for (int i = 0; i < batch; i++) {
fret.push_back(ret[i]);
inputTokens[i] = std::vector <float> {(float)ret[i]};
if (ret[i] == eos_token_id) {
isEnding[i] = true;
}
if (isEnding[i]) {
curStrings.push_back("");
endingCount++;
continue;
}
results.push_back(ret[i]);
std::string curString = weight.tokenizer.Decode(
Data(DataType::FLOAT32, {(int) results.size()}, results)).c_str();
outputs[i] += curString;
curStrings.push_back(curString);
results.clear();
}
if (endingCount == batch) {
break;
}
if (retCb)
#ifdef PY_API
{
if (generationConfig.enable_hash_id) {
std::vector<pybind11::bytes> rtnStrings;
for (size_t i=0; i<batch; i++){
std::stringstream ss;
ss << curStrings[i] << "hash_id:" << hash_ids[i];
rtnStrings.push_back(pybind11::bytes(ss.str()));
}
retCb(index, rtnStrings);
} else {
std::vector<pybind11::bytes> rtnStrings;
for (size_t i=0; i<batch; i++){
std::stringstream ss;
ss << curStrings[i];
rtnStrings.push_back(pybind11::bytes(ss.str()));
}
retCb(index, rtnStrings);
}
}
#else
retCb(index, curStrings);
#endif
index++;
params[0]["index"] = index;
FillLLMInputsBatch(inputTokens, params, inputIds, attentionMask, positionIds);
// printf("len = %d, spend %f s.\n", len, GetSpan(st, std::chrono::system_clock::now()));
if (index == generationConfig.output_token_limit) {
break;
}
}
if (retCb)
#ifdef PY_API
{
if (generationConfig.enable_hash_id) {
std::vector<pybind11::bytes> rtnStrings;
for (size_t i=0; i<batch; i++){
std::stringstream ss;
ss << outputs[i] << "hash_id:" << hash_ids[i];
rtnStrings.push_back(pybind11::bytes(ss.str()));
}
retCb(-1, rtnStrings);
} else {
std::vector<pybind11::bytes> rtnStrings;
for (size_t i=0; i<batch; i++){
std::stringstream ss;
ss << outputs[i];
rtnStrings.push_back(pybind11::bytes(ss.str()));
}
retCb(-1, rtnStrings);
}
}
#else
retCb(-1, outputs);
#endif
}
std::vector<int> basellm::ForwardBatch(int batch, const fastllm::Data &inputIds, const fastllm::Data &attentionMask,
const fastllm::Data &positionIds,
std::vector<std::pair<Data, Data>> &pastKeyValues,
......@@ -464,6 +590,12 @@ printf("tot = %d\n", tot);
if (curRet == model->eos_token_id) {
it.second->isEnding = true;
} else {
auto itStopTk = it.second->generationConfig.stop_token_ids.find(curRet);
if (itStopTk != it.second->generationConfig.stop_token_ids.end()) {
it.second->isEnding = true;
}
}
if (it.second->isEnding == false) {
it.second->currentTokens = std::vector<int>{curRet};
it.second->resultTokenQueue.push(curRet);
it.second->tokens.Push(curRet);
......@@ -484,6 +616,13 @@ printf("tot = %d\n", tot);
model->dictLocker.unlock();
MySleep(0);
// If constant CPU usage is a concern, replace the line above with the code below:
/*if (seqLens.size() > 0) {
MySleep(0);
}
else{
std::this_thread::sleep_for(std::chrono::milliseconds(10));
}*/
}
}, this);
}
......@@ -523,6 +662,8 @@ printf("tot = %d\n", tot);
}
dictLocker.unlock();
MySleep(0);
// If constant CPU usage is a concern, replace the line above with the code below:
// std::this_thread::sleep_for(std::chrono::milliseconds(10));
dictLocker.lock();
}
}
......@@ -555,6 +696,8 @@ printf("tot = %d\n", tot);
}
dictLocker.unlock();
MySleep(0);
// If constant CPU usage is a concern, replace the line above with the code below:
// std::this_thread::sleep_for(std::chrono::milliseconds(10));
dictLocker.lock();
}
}
......