Commit b856c4ec authored by zhaoying1

added llama_inference_pytorch

# Copied from:
# https://github.com/tloen/llama-int8/blob/ce74669c767e42b5082391dd0cfcb621ba40c7f9/llama/tokenizer.py
from sentencepiece import SentencePieceProcessor
from logging import getLogger
from typing import List
import os
logger = getLogger()
class Tokenizer:
    def __init__(self, model_path: str):
        # reload tokenizer
        assert os.path.isfile(model_path), model_path
        self.sp_model = SentencePieceProcessor(model_file=model_path)
        logger.info(f"Reloaded SentencePiece model from {model_path}")

        # BOS / EOS token IDs
        self.n_words: int = self.sp_model.vocab_size()
        self.bos_id: int = self.sp_model.bos_id()
        self.eos_id: int = self.sp_model.eos_id()
        self.pad_id: int = self.sp_model.pad_id()
        logger.info(
            f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
        )
        assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()

    def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
        assert type(s) is str
        t = self.sp_model.encode(s)
        if bos:
            t = [self.bos_id] + t
        if eos:
            t = t + [self.eos_id]
        return t

    def decode(self, t: List[int]) -> str:
        return self.sp_model.decode(t)
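For context, a minimal usage sketch of the Tokenizer class above (the model path matches the run scripts further down; the token IDs in the comments are illustrative, not real output):

# Hypothetical usage example; the IDs shown are illustrative only.
tokenizer = Tokenizer(model_path="/public/LLAMA/model/tokenizer.model")
ids = tokenizer.encode("Hello world", bos=True, eos=False)  # e.g. [1, 15043, 3186]
text = tokenizer.decode(ids)  # BOS is a control token and is dropped when decoding -> "Hello world"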
What do you think of OpenAI organization?
What do you think of OpenAI organization? I think it's doing a fantastic job in advancing research in artificial intelligence and pushing the field forward.
export HIP_VISIBLE_DEVICES=0,1,2,3
LOAD_MODEL=/public/LLAMA/model/chatflow_7b.bin
SPM_PATH=/public/LLAMA/model/tokenizer.model

python llama_dialogue.py --load_model_path $LOAD_MODEL \
                         --config_path config/llama_7b_config.json \
                         --spm_model_path $SPM_PATH \
                         --world_size 4 --seq_length 1024

export HIP_VISIBLE_DEVICES=0,1,2,3
LOAD_MODEL=/public/LLAMA/model/chatflow_7b.bin
SPM_PATH=/public/LLAMA/model/tokenizer.model

python llama_infer.py --test_path ./prompts.txt --prediction_path ./result.txt \
                      --load_model_path $LOAD_MODEL \
                      --config_path config/llama_7b_config.json \
                      --spm_model_path $SPM_PATH \
                      --world_size 4 --seq_length 1024

export HIP_VISIBLE_DEVICES=0
LOAD_MODEL=/public/LLAMA/model/chatflow_7b.bin
SPM_PATH=/public/LLAMA/model/tokenizer.model

python llama_infer.py --test_path ./prompts.txt --prediction_path ./result.txt \
                      --load_model_path $LOAD_MODEL \
                      --config_path config/llama_7b_config.json \
                      --spm_model_path $SPM_PATH \
                      --seq_length 1024
import argparse
import numpy as np
import torch
from time import perf_counter
from utils import load_hyperparam, convert_normal_parameter_to_int8, load_model
from model.tokenize import Tokenizer
from model.llama import *
from generate import LmGeneration
if __name__ == '__main__':
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--load_model_path", default=None, type=str,
                        help="Path of the input model.")
    parser.add_argument("--test_path", type=str, required=True,
                        help="Path of the testset.")
    parser.add_argument("--prediction_path", type=str, required=True,
                        help="Path of the prediction file.")
    parser.add_argument("--config_path", type=str, required=True,
                        help="Path of the config file.")
    parser.add_argument("--batch_size", type=int, default=1,
                        help="Batch size.")
    parser.add_argument("--world_size", type=int, default=1,
                        help="The number of GPUs.")
    parser.add_argument("--seq_length", type=int, default=128,
                        help="Sequence length.")
    parser.add_argument("--use_int8", action="store_true")
    parser.add_argument("--top_k", type=int, default=10)
    parser.add_argument("--top_p", type=float, default=1)
    parser.add_argument("--temperature", type=float, default=0.85)
    parser.add_argument("--repetition_penalty_range", type=int, default=1024)
    parser.add_argument("--repetition_penalty_slope", type=float, default=0)
    parser.add_argument("--repetition_penalty", type=float, default=1.15)
    parser.add_argument("--spm_model_path", default=None, type=str,
                        help="Path of the sentence piece model.")

    args = parser.parse_args()
    args = load_hyperparam(args)

    args.tokenizer = Tokenizer(model_path=args.spm_model_path)
    args.vocab_size = args.tokenizer.sp_model.vocab_size()

    # Build the model with half-precision parameters, then restore the default dtype.
    torch.set_default_tensor_type(torch.HalfTensor)
    model = LLaMa(args)
    torch.set_default_tensor_type(torch.FloatTensor)
    model = load_model(model, args.load_model_path)
    model.eval()

    # Use multi-GPU tensor parallelism when more than one GPU is requested.
    if args.world_size > 1:
        import tensor_parallel as tp
        gpus = ["cuda:" + str(i) for i in range(args.world_size)]
        if args.use_int8:
            model = tp.tensor_parallel(model, gpus, delay_init=True)
            model = convert_normal_parameter_to_int8(model)
        else:
            model = tp.tensor_parallel(model, gpus)
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)

    lm_generation = LmGeneration(model, args.tokenizer)

    prompts = []
    with open(args.test_path, 'r', encoding='utf-8') as f:
        for line in f:
            prompts.append(line)
    prompt_tokens = [args.tokenizer.encode(x, bos=True, eos=False) for x in prompts]

    with torch.no_grad():
        latencies = []
        # Warm-up runs (not timed).
        for _ in range(2):
            _ = lm_generation.generate(args, prompt_tokens)
        # Timed runs.
        for _ in range(10):
            start_time = perf_counter()
            tokens = lm_generation.generate(args, prompt_tokens)
            latency = perf_counter() - start_time
            latencies.append(latency)

        decoder = []
        for i, t in enumerate(tokens.tolist()):
            t = t[: args.seq_length]
            # Truncate at the first PAD and EOS tokens, if present.
            try:
                t = t[: t.index(args.tokenizer.pad_id)]
            except ValueError:
                pass
            try:
                t = t[: t.index(args.tokenizer.eos_id)]
            except ValueError:
                pass
            decoder.append(args.tokenizer.decode(t))

        time_avg_ms = 1000 * np.mean(latencies)            # mean latency
        time_std_ms = 1000 * np.std(latencies)             # standard deviation of latency
        time_p95_ms = 1000 * np.percentile(latencies, 95)  # 95th-percentile latency
        print(f"P95 latency (ms) - {time_p95_ms}; average latency (ms) - {time_avg_ms:.2f} +/- {time_std_ms:.2f};")
        print(f"Average latency per character (ms) - {time_avg_ms / (len(decoder[0]) - len(prompts[0]))}; ")
        print(decoder[0])
import json
import sys
from argparse import Namespace
import torch
from model.llama import NormalLinear
import os
def load_hyperparam(default_args):
    """
    Load arguments from argparse and the config file.
    Priority: default options < config file < command line args.
    """
    with open(default_args.config_path, mode="r", encoding="utf-8") as f:
        config_args_dict = json.load(f)

    default_args_dict = vars(default_args)
    command_line_args_dict = {k: default_args_dict[k] for k in [
        a[2:] for a in sys.argv if (a[:2] == "--" and "local_rank" not in a)
    ]}
    default_args_dict.update(config_args_dict)
    default_args_dict.update(command_line_args_dict)
    args = Namespace(**default_args_dict)
    return args
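To illustrate the precedence rule documented in the docstring above, here is a minimal standalone sketch of the merge order with hypothetical values (not part of this commit):

# Standalone illustration of the merge order used by load_hyperparam (hypothetical values).
from argparse import Namespace

default_args_dict = {"seq_length": 128, "batch_size": 1}   # argparse defaults
config_args_dict = {"seq_length": 2048, "emb_size": 4096}  # values read from the JSON config
command_line_args_dict = {"seq_length": 1024}              # flags passed explicitly on the command line

default_args_dict.update(config_args_dict)        # config file overrides defaults
default_args_dict.update(command_line_args_dict)  # explicit flags override the config file
print(Namespace(**default_args_dict))
# Namespace(batch_size=1, emb_size=4096, seq_length=1024)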
def _load_state_dict_into_model(model_to_load, model_path, start_prefix=""):
    # Convert the old format to the new format if needed from a PyTorch state_dict;
    # copy state_dict so _load_from_state_dict can modify it.
    state_dict = torch.load(model_path, map_location="cpu")
    metadata = getattr(state_dict, "_metadata", None)
    state_dict = state_dict.copy()

    # Rename the output-layer and embedding weights to the names used by this model.
    state_dict['target.lm.weight'] = state_dict['target.lm.output_layer.weight']
    del state_dict['target.lm.output_layer.weight']
    state_dict['embedding.embedding.weight'] = state_dict['embedding.word.embedding.weight']
    del state_dict['embedding.word.embedding.weight']

    if metadata is not None:
        metadata['embedding.embedding'] = metadata['embedding.word.embedding']
        metadata['target.lm'] = metadata['target.lm.output_layer']
        if metadata.get('embedding.dropout', None) is not None:
            del metadata['embedding.dropout']
        del metadata['embedding.word']
        del metadata['embedding.word.embedding']
        del metadata['target.lm.output_layer']
        del metadata['target.lm.softmax']
        del metadata['target.lm.criterion']
        state_dict._metadata = metadata

    error_msgs = []

    # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants,
    # so we need to apply the function recursively.
    def load(module, state_dict, prefix=""):
        local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
        args = (state_dict, prefix, local_metadata, True, [], [], error_msgs)
        # Parameters of the module and its children will start with prefix. We can exit
        # early if there are none in this state_dict.
        if len([key for key in state_dict if key.startswith(prefix)]) > 0:
            import deepspeed
            # In sharded models, each shard has only part of the full state_dict, so only
            # gather parameters that are in the current state_dict.
            named_parameters = dict(module.named_parameters(prefix=prefix[:-1], recurse=False))
            params_to_gather = [named_parameters[k] for k in state_dict.keys() if k in named_parameters]
            if len(params_to_gather) > 0:
                # Because ZeRO-3 puts placeholders in model params, this context manager
                # gathers (unpartitions) the params of the current layer, loads them from
                # the state dict and then re-partitions them again.
                with deepspeed.zero.GatheredParameters(params_to_gather, modifier_rank=0):
                    if torch.distributed.get_rank() == 0:
                        module._load_from_state_dict(*args)

        for name, child in module._modules.items():
            if child is not None:
                load(child, state_dict, prefix + name + ".")

    load(model_to_load, state_dict, prefix=start_prefix)
    # Delete `state_dict` so it can be collected by the GC earlier. Note that `state_dict`
    # is a copy of the argument, so it is safe to delete it.
    del state_dict
    return model_to_load
def convert_normal_parameter_to_int8(model, threshold=6.0, modules_to_not_convert=None, current_key_name=None):
    import bitsandbytes as bnb
    modules_to_not_convert = ["lm"] if modules_to_not_convert is None else modules_to_not_convert
    for name, module in model.named_children():
        if current_key_name is None:
            current_key_name = []
        current_key_name.append(name)

        if len(list(module.children())) > 0:
            convert_normal_parameter_to_int8(module, threshold, modules_to_not_convert, current_key_name)

        if isinstance(module, bnb.nn.Linear8bitLt) and name not in modules_to_not_convert:
            # Check that the current key is not in `modules_to_not_convert`.
            if not any(key in ".".join(current_key_name) for key in modules_to_not_convert):
                model._modules[name].weight = bnb.nn.Int8Params(
                    module.weight.data,
                    requires_grad=False,
                    has_fp16_weights=False
                )
                # Force requires_grad to False to avoid unexpected errors.
                model._modules[name].requires_grad_(False)
        # Remove the last key for recursion.
        current_key_name.pop(-1)
    return model
def load_model(model, model_path):
    if os.path.isdir(model_path):
        # Sharded checkpoint: read the index file and load shard by shard.
        index_filename = os.path.join(model_path, 'pytorch_model.bin.index.json')
        with open(index_filename, "r") as f:
            index = json.loads(f.read())
        shard_filenames = sorted(set(index["weight_map"].values()))
        shard_filenames = [os.path.join(model_path, f) for f in shard_filenames]
        for shard_file in shard_filenames:
            shard_checkpoint = torch.load(shard_file, map_location='cpu')
            for name, parameter in model.named_parameters():
                if shard_checkpoint.get(name, None) is not None:
                    if 'target' in name:
                        parameter.data = shard_checkpoint['target.lm.output_layer.weight']
                    elif 'embedding' in name:
                        parameter.data = shard_checkpoint['embedding.word.embedding.weight']
                    else:
                        parameter.data = shard_checkpoint[name]
                    parameter.requires_grad = False
            del shard_checkpoint
    else:
        # Single-file checkpoint.
        checkpoint = torch.load(model_path, map_location='cpu')
        for parameter_name, parameter in model.named_parameters():
            if 'target' in parameter_name:
                parameter.data = checkpoint['target.lm.output_layer.weight']
            elif 'embedding' in parameter_name:
                parameter.data = checkpoint['embedding.word.embedding.weight']
            else:
                parameter.data = checkpoint[parameter_name]
            parameter.requires_grad = False
        del checkpoint
    return model
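For reference, the only field load_model reads from pytorch_model.bin.index.json is "weight_map", which appears to follow the Hugging Face sharded-checkpoint layout. A minimal sketch of its shape, with illustrative file names and sizes:

# Expected shape of pytorch_model.bin.index.json (illustrative values only).
index = {
    "metadata": {"total_size": 13476839424},
    "weight_map": {
        "embedding.word.embedding.weight": "pytorch_model-00001-of-00002.bin",
        "target.lm.output_layer.weight": "pytorch_model-00002-of-00002.bin",
    },
}
shard_filenames = sorted(set(index["weight_map"].values()))
# ['pytorch_model-00001-of-00002.bin', 'pytorch_model-00002-of-00002.bin']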