Commit e4114c03 authored by pengcheng888

issue/83 - Add the AutoLlama class to support creating models with different backends

parent 5d182420
......@@ -49,5 +49,9 @@ python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MA
- Single inference test
- llama example
```bash
python examples/llama.py [--cpu | --nvidia] --model_path=<path/to/model_dir>
python examples/llama.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=<path/to/model_dir>
```
For example:
```bash
python examples/llama.py --nvidia --model_path=~/TinyLlama-1.1B-Chat-v1.0
```
\ No newline at end of file
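This commit also adds a `--backend` option to `examples/llama.py` (default `python`, with `cpp` as the alternative) for selecting the model backend on the command line; a usage sketch, assuming the flag is combined with the device switches shown above:
```bash
python examples/llama.py --nvidia --model_path=~/TinyLlama-1.1B-Chat-v1.0 --backend=cpp
```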
......@@ -53,20 +53,28 @@ def get_args():
default=100,
help="max_new_tokens",
)
parser.add_argument(
"--backend",
type=str,
default="python",
help="python or cpp model",
)
return parser.parse_args()
def test(model_path, device_str="cuda", max_new_tokens=100):
def test(
prompt,
model_path,
max_new_tokens=100,
infini_dtype=infinicore.bfloat16,
infini_device=infinicore.device("cpu", 0),
backend="python",
):
# ---------------------------------------------------------------------------- #
# Create the model
# ---------------------------------------------------------------------------- #
infini_device = infinicore.device(device_str, 0)
infini_dtype = infinicore.bfloat16
model = infinilm.LlamaForCausalLM.from_pretrained(
model_path,
device=infini_device,
dtype=infini_dtype,
model = infinilm.AutoLlamaModel.from_pretrained(
model_path, device=infini_device, dtype=infini_dtype, backend=backend
)
# ---------------------------------------------------------------------------- #
......@@ -85,7 +93,6 @@ def test(model_path, device_str="cuda", max_new_tokens=100):
# ---------------------------------------------------------------------------- #
# Create the tokenizer
# ---------------------------------------------------------------------------- #
tokenizer = AutoTokenizer.from_pretrained(model_path)
if "llama" == config.model_type:
......@@ -109,7 +116,7 @@ def test(model_path, device_str="cuda", max_new_tokens=100):
# ---------------------------------------------------------------------------- #
# Token encoding
# ---------------------------------------------------------------------------- #
prompt = "山东最高的山是?"
# prompt = "山东最高的山是?"
input_content = tokenizer.apply_chat_template(
conversation=[{"role": "user", "content": prompt}],
add_generation_prompt=True,
......@@ -144,24 +151,37 @@ if __name__ == "__main__":
print(args)
# Parse command line arguments
device_type = "cpu"
device_str = "cpu"
if args.cpu:
device_type = "cpu"
device_str = "cpu"
elif args.nvidia:
device_type = "cuda"
device_str = "cuda"
elif args.metax:
device_type = "cuda"
device_str = "cuda"
elif args.moore:
device_type = "musa"
device_str = "musa"
elif args.iluvatar:
device_type = "cuda"
device_str = "cuda"
else:
print(
"Usage: python examples/llama.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=<path/to/model_dir>"
"Usage: python examples/llama.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=<path/to/model_dir>\n"
"such as, python examples/llama.py --nvidia --model_path=~/TinyLlama-1.1B-Chat-v1.0"
)
sys.exit(1)
prompt = "山东最高的山是?"
model_path = args.model_path
max_new_tokens = args.max_new_tokens
backend = args.backend
infini_device = infinicore.device(device_str, 0)
infini_dtype = infinicore.bfloat16
test(model_path, device_type, max_new_tokens)
test(
prompt,
model_path,
max_new_tokens,
infini_device=infini_device,
infini_dtype=infini_dtype,
backend=backend,
)
from .models import *
from .models import AutoLlamaModel
__all__ = ["AutoLlamaModel"]
......@@ -43,17 +43,17 @@ infinicore.Tensor.to_numpy = infini_to_numpy
class GenerationMixin:
def _get_initial_cache_position(
def _get_initial_position_ids(
self,
bs: int,
seq_length: int,
device: infinicore.device,
) -> infinicore.Tensor:
"""Calculates `cache_position` for the pre-fill stage"""
cache_position_list = [list(range(0, seq_length)) for i in range(bs)]
"""Calculates `position_ids` for the pre-fill stage"""
position_ids_list = [list(range(0, seq_length)) for i in range(bs)]
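# e.g. bs=2, seq_length=3 -> [[0, 1, 2], [0, 1, 2]], one row of positions per sequence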
return infinicore.from_list(
cache_position_list, dtype=infinicore.int64, device=device
position_ids_list, dtype=infinicore.int64, device=device
)
def prepare_inputs_for_generation(
......@@ -73,29 +73,29 @@ class GenerationMixin:
model_inputs["past_key_values"] = past_key_values
# -------------------------------------------------------------------------- #
# Compute the required cache_position
# Compute the required position_ids
# -------------------------------------------------------------------------- #
current_cache_position = kwargs.get("cache_position", None)
if current_cache_position is None:
current_position_ids = kwargs.get("position_ids", None)
if current_position_ids is None:
# prefill stage
bs, seq_len = kwargs["input_ids"].shape[0:2]
model_inputs["cache_position"] = self._get_initial_cache_position(
model_inputs["position_ids"] = self._get_initial_position_ids(
bs, seq_len, device
)
else:
# decode stage
bs, seq_len = current_cache_position.shape
last_position = current_cache_position.narrow(1, seq_len - 1, 1)
bs, seq_len = current_position_ids.shape
last_position = current_position_ids.narrow(1, seq_len - 1, 1)
one_value = infinicore.from_list(
[1],
[1] * bs,
dtype=last_position.dtype,
device=last_position.device,
).view((bs, 1))
next_position = one_value + last_position
model_inputs["cache_position"] = next_position
model_inputs["position_ids"] = next_position
# -------------------------------------------------------------------- #
# Required: the input_ids of the tokens
......@@ -127,8 +127,12 @@ class GenerationMixin:
# -------------------------------------------------------------------- #
# Create the cache #
# -------------------------------------------------------------------- #
model_kwargs["use_cache"] = True
model_kwargs["past_key_values"] = DynamicCache(config=self.config)
if self.use_cache:
model_kwargs["use_cache"] = True
model_kwargs["past_key_values"] = DynamicCache(config=self.config)
else:
model_kwargs["use_cache"] = False
model_kwargs["past_key_values"] = None
# -------------------------------------------------------------------- #
# The _sample function #
......@@ -170,12 +174,12 @@ class GenerationMixin:
)
# -------------------------------------------------------------------------- #
# Initialize cache_position
# Initialize position_ids
# -------------------------------------------------------------------------- #
output_tokens_list = []
model_kwargs["input_ids"] = input_ids
model_kwargs["cache_position"] = None
model_kwargs["position_ids"] = None
output_content = ""
print()
......@@ -186,13 +190,13 @@ class GenerationMixin:
# -------------------------------------------------------------------------- #
model_inputs = self.prepare_inputs_for_generation(device, **model_kwargs)
model_kwargs["cache_position"] = model_inputs["cache_position"]
model_kwargs["position_ids"] = model_inputs["position_ids"]
# -------------------------------------------------------------------------- #
# Run one forward step
# -------------------------------------------------------------------------- #
start_time = time.time()
logits = self.forward(**model_inputs, return_dict=True)
logits = self(**model_inputs)
# -------------------------------------------------------------------------- #
# Process the output
......@@ -237,8 +241,12 @@ class GenerationMixin:
if token_id in eos_token_id_list:
break
print("\n</s>")
print(
f"\n\n\n Time per step: prefill {round(time_list[0], 2)} token/ms\n",
)
print(
f"\n\n Time per step: {round(sum(time_list) / len(time_list), 2)} ms\n",
f" Time per step: decoder {round(sum(time_list[1:]) / (len(time_list) - 1), 2)} token/ms \n",
)
return output_tokens_list, output_content
......@@ -3,9 +3,8 @@ from typing import Dict, Optional, Union
import torch
from safetensors import safe_open
import glob
# from safetensors.torch import load_file as safe_load_file
# from safetensors.torch import save_file as safe_save_file
import infinicore
str_to_torch_dtype = {
......@@ -76,9 +75,19 @@ def get_model_state_dict(
"""
Load the model weights.
"""
path = os.path.join(model_path, "model.safetensors")
model_param = load_state_dict(path)
# --------------------------------------------------------- #
# Load the weights from the *.safetensors files
# --------------------------------------------------------- #
model_param = {}
for file_path in glob.glob(os.path.join(model_path, "*.safetensors")):
model_param.update(load_state_dict(file_path))
if model_param.get("lm_head.weight", None) is None:
model_param["lm_head.weight"] = model_param["model.embed_tokens.weight"]
# --------------------------------------------------------- #
# Adjust the device and dtype of the weights
# --------------------------------------------------------- #
torch_device = device.type
torch_dtype = infinicore.utils.to_torch_dtype(dtype)
......@@ -86,6 +95,9 @@ def get_model_state_dict(
for key, value in model_param.items():
model_param[key] = value.to(device=torch_device, dtype=torch_dtype)
# --------------------------------------------------------- #
# model_param_infini holds references to the torch.Tensor data
# --------------------------------------------------------- #
for key, value in model_param.items():
model_param_infini[key] = infinicore.from_torch(model_param[key])
......
from .llama import *
from .llama import AutoLlamaModel
__all__ = ["AutoLlamaModel"]
from .configuration_llama import * # noqa: F403
from .modeling_llama import * # noqa: F403
import os
from typing import Optional, Union
import infinicore
__all__ = ["AutoLlamaModel"]
class AutoLlamaModel:
@classmethod
def from_pretrained(
cls,
model_path: Optional[Union[str, os.PathLike]],
device: infinicore.device,
dtype=infinicore.dtype,
backend="python",
):
if backend == "python":
from . import modeling_llama
return modeling_llama.LlamaForCausalLM.from_pretrained(
model_path,
device=device,
dtype=dtype,
)
elif backend == "cpp":
from .backends import cpp
return cpp.LlamaForCausalLM.from_pretrained(
model_path,
device=device,
dtype=dtype,
)
raise KeyError("invalid backend")
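A minimal usage sketch of the dispatcher above, assuming a local TinyLlama checkpoint directory as in the README; `AutoLlamaModel` simply forwards to the Python `modeling_llama` implementation or to the C++ backend wrapper depending on `backend`:
```python
import infinicore
import infinilm

# Create a model with the desired backend; all arguments other than backend
# are passed straight through to the selected implementation's from_pretrained.
model = infinilm.AutoLlamaModel.from_pretrained(
    "~/TinyLlama-1.1B-Chat-v1.0",        # model directory, as in the README example
    device=infinicore.device("cpu", 0),  # or e.g. infinicore.device("cuda", 0)
    dtype=infinicore.bfloat16,
    backend="python",                    # "python" (default) or "cpp"
)
```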
from ....generation.utils import GenerationMixin
import infinicore
import os
from typing import Optional, Union
class LlamaForCausalLM(GenerationMixin):
def __init__(self):
super().__init__()
self.use_cache = False
self._model = None
raise NotImplementedError("The cpp backend is not implemented yet")
def forward(self, input_ids, position_ids, *args, **kwargs):
kv_caches = None
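# No Python-side KV cache is passed to the C++ model in this stub; use_cache is False, so the generation loop does not create one either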
return infinicore.Tensor(
self._model.forward(input_ids, position_ids, kv_caches)
)
def __call__(self, input_ids, position_ids, *args, **kwargs):
return self.forward(input_ids=input_ids, position_ids=position_ids)
@classmethod
def from_pretrained(
cls,
model_path: Union[str, os.PathLike],
device: infinicore.device,
dtype=infinicore.dtype,
):
"""
Load a pretrained LlamaForCausalLM model from a directory.
Args:
model_path: Path to the model directory containing config.json
device: infinicore.device instance
Returns:
LlamaForCausalLM instance
"""
raise NotImplementedError("Loading the cpp backend is not implemented yet")
......@@ -228,6 +228,3 @@ class LlamaConfig(PretrainedConfig):
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
__all__ = ["LlamaConfig"]
......@@ -196,14 +196,14 @@ class LlamaAttention(infinicore.nn.Module):
# --------------------------------------------------------------------------------------- #
# 对 Q和K, 加上 rope
# --------------------------------------------------------------------------------------- #
cache_position = kwargs.pop("cache_position", None)
if cache_position is None:
raise KeyError("cache_position error")
position_ids = kwargs.pop("position_ids", None)
if position_ids is None:
raise KeyError("position_ids error")
if rope_instance is None:
raise KeyError("rope_instance error")
query_states = rope_instance(query_states, cache_position)
key_states = rope_instance(key_states, cache_position)
query_states = rope_instance(query_states, position_ids)
key_states = rope_instance(key_states, position_ids)
# --------------------------------------------------------------------------------------- #
# kv cache
......@@ -338,7 +338,7 @@ class LlamaModel(infinicore.nn.Module):
def forward(
self,
input_ids,
cache_position,
position_ids,
past_key_values: Optional[Cache] = None,
use_cache: Optional[bool] = None, # True
**kwargs,
......@@ -364,7 +364,7 @@ class LlamaModel(infinicore.nn.Module):
hidden_states = decoder_layer(
hidden_states,
past_key_values=past_key_values,
cache_position=cache_position,
position_ids=position_ids,
rope_instance=self.rope_instance,
**kwargs,
)
......@@ -384,6 +384,8 @@ class LlamaForCausalLM(infinicore.nn.Module, GenerationMixin):
def __init__(self, config, **kwargs):
super().__init__()
self.config = config
self.use_cache = True
self.model = LlamaModel(config, **kwargs)
self.lm_head = infinicore.nn.Linear(
config.hidden_size,
......@@ -395,14 +397,14 @@ class LlamaForCausalLM(infinicore.nn.Module, GenerationMixin):
def forward(
self,
input_ids,
cache_position,
position_ids,
past_key_values: Optional[Cache] = None,
use_cache: Optional[bool] = None,
**kwargs,
):
last_token = self.model(
input_ids,
cache_position,
position_ids,
past_key_values=past_key_values,
use_cache=use_cache,
**kwargs,
......@@ -425,9 +427,3 @@ class LlamaForCausalLM(infinicore.nn.Module, GenerationMixin):
config = LlamaConfig(**config_dict)
return LlamaForCausalLM(config, device=device, dtype=dtype)
__all__ = [
"LlamaModel",
"LlamaForCausalLM",
]