Commit e4114c03 authored by pengcheng888's avatar pengcheng888
Browse files

issue/83 - 添加AutoLlama类,支持创建不同backend的模型

parent 5d182420
...@@ -49,5 +49,9 @@ python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MA ...@@ -49,5 +49,9 @@ python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MA
- 单次推理测试 - 单次推理测试
- llama示例 - llama示例
```bash ```bash
python examples/llama.py [--cpu | --nvidia] --model_path=<path/to/model_dir> python examples/llama.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=<path/to/model_dir>
```
例如:
```bash
python examples/llama.py --nvidia --model_path=~/TinyLlama-1.1B-Chat-v1.0
``` ```
\ No newline at end of file
...@@ -53,20 +53,28 @@ def get_args(): ...@@ -53,20 +53,28 @@ def get_args():
default=100, default=100,
help="max_new_tokens", help="max_new_tokens",
) )
parser.add_argument(
"--backend",
type=str,
default="python",
help="python or cpp model",
)
return parser.parse_args() return parser.parse_args()
def test(model_path, device_str="cuda", max_new_tokens=100): def test(
prompt,
model_path,
max_new_tokens=100,
infini_dtype=infinicore.bfloat16,
infini_device=infinicore.device("cpu", 0),
backend="python",
):
# ---------------------------------------------------------------------------- # # ---------------------------------------------------------------------------- #
# 创建模型, # 创建模型,
# ---------------------------------------------------------------------------- # # ---------------------------------------------------------------------------- #
infini_device = infinicore.device(device_str, 0) model = infinilm.AutoLlamaModel.from_pretrained(
infini_dtype = infinicore.bfloat16 model_path, device=infini_device, dtype=infini_dtype, backend=backend
model = infinilm.LlamaForCausalLM.from_pretrained(
model_path,
device=infini_device,
dtype=infini_dtype,
) )
# ---------------------------------------------------------------------------- # # ---------------------------------------------------------------------------- #
...@@ -85,7 +93,6 @@ def test(model_path, device_str="cuda", max_new_tokens=100): ...@@ -85,7 +93,6 @@ def test(model_path, device_str="cuda", max_new_tokens=100):
# ---------------------------------------------------------------------------- # # ---------------------------------------------------------------------------- #
# 创建 tokenizer # 创建 tokenizer
# ---------------------------------------------------------------------------- # # ---------------------------------------------------------------------------- #
tokenizer = AutoTokenizer.from_pretrained(model_path) tokenizer = AutoTokenizer.from_pretrained(model_path)
if "llama" == config.model_type: if "llama" == config.model_type:
...@@ -109,7 +116,7 @@ def test(model_path, device_str="cuda", max_new_tokens=100): ...@@ -109,7 +116,7 @@ def test(model_path, device_str="cuda", max_new_tokens=100):
# ---------------------------------------------------------------------------- # # ---------------------------------------------------------------------------- #
# token编码 # token编码
# ---------------------------------------------------------------------------- # # ---------------------------------------------------------------------------- #
prompt = "山东最高的山是?" # prompt = "山东最高的山是?"
input_content = tokenizer.apply_chat_template( input_content = tokenizer.apply_chat_template(
conversation=[{"role": "user", "content": prompt}], conversation=[{"role": "user", "content": prompt}],
add_generation_prompt=True, add_generation_prompt=True,
...@@ -144,24 +151,37 @@ if __name__ == "__main__": ...@@ -144,24 +151,37 @@ if __name__ == "__main__":
print(args) print(args)
# Parse command line arguments # Parse command line arguments
device_type = "cpu" device_str = "cpu"
if args.cpu: if args.cpu:
device_type = "cpu" device_str = "cpu"
elif args.nvidia: elif args.nvidia:
device_type = "cuda" device_str = "cuda"
elif args.metax: elif args.metax:
device_type = "cuda" device_str = "cuda"
elif args.moore: elif args.moore:
device_type = "musa" device_str = "musa"
elif args.iluvatar: elif args.iluvatar:
device_type = "cuda" device_str = "cuda"
else: else:
print( print(
"Usage: python examples/llama.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=<path/to/model_dir>" "Usage: python examples/llama.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=<path/to/model_dir>\n"
"such as, python examples/llama.py --nvidia --model_path=~/TinyLlama-1.1B-Chat-v1.0"
) )
sys.exit(1) sys.exit(1)
prompt = "山东最高的山是?"
model_path = args.model_path model_path = args.model_path
max_new_tokens = args.max_new_tokens max_new_tokens = args.max_new_tokens
backend = args.backend
infini_device = infinicore.device(device_str, 0)
infini_dtype = infinicore.bfloat16
test(model_path, device_type, max_new_tokens) test(
prompt,
model_path,
max_new_tokens,
infini_device=infini_device,
infini_dtype=infini_dtype,
backend=backend,
)
from .models import * from .models import AutoLlamaModel
__all__ = ["AutoLlamaModel"]
...@@ -43,17 +43,17 @@ infinicore.Tensor.to_numpy = infini_to_numpy ...@@ -43,17 +43,17 @@ infinicore.Tensor.to_numpy = infini_to_numpy
class GenerationMixin: class GenerationMixin:
def _get_initial_cache_position( def _get_initial_position_ids(
self, self,
bs: int, bs: int,
seq_length: int, seq_length: int,
device: infinicore.device, device: infinicore.device,
) -> infinicore.Tensor: ) -> infinicore.Tensor:
"""Calculates `cache_position` for the pre-fill stage""" """Calculates `position_ids` for the pre-fill stage"""
cache_position_list = [list(range(0, seq_length)) for i in range(bs)] position_ids_list = [list(range(0, seq_length)) for i in range(bs)]
return infinicore.from_list( return infinicore.from_list(
cache_position_list, dtype=infinicore.int64, device=device position_ids_list, dtype=infinicore.int64, device=device
) )
def prepare_inputs_for_generation( def prepare_inputs_for_generation(
...@@ -73,29 +73,29 @@ class GenerationMixin: ...@@ -73,29 +73,29 @@ class GenerationMixin:
model_inputs["past_key_values"] = past_key_values model_inputs["past_key_values"] = past_key_values
# -------------------------------------------------------------------------- # # -------------------------------------------------------------------------- #
# 计算所需的,cache_position # 计算所需的,position_ids
# -------------------------------------------------------------------------- # # -------------------------------------------------------------------------- #
current_cache_position = kwargs.get("cache_position", None) current_position_ids = kwargs.get("position_ids", None)
if current_cache_position is None: if current_position_ids is None:
# prill阶段 # prill阶段
bs, seq_len = kwargs["input_ids"].shape[0:2] bs, seq_len = kwargs["input_ids"].shape[0:2]
model_inputs["cache_position"] = self._get_initial_cache_position( model_inputs["position_ids"] = self._get_initial_position_ids(
bs, seq_len, device bs, seq_len, device
) )
else: else:
# decoder 阶段 # decoder 阶段
bs, seq_len = current_cache_position.shape bs, seq_len = current_position_ids.shape
last_position = current_cache_position.narrow(1, seq_len - 1, 1) last_position = current_position_ids.narrow(1, seq_len - 1, 1)
one_value = infinicore.from_list( one_value = infinicore.from_list(
[1], [1] * bs,
dtype=last_position.dtype, dtype=last_position.dtype,
device=last_position.device, device=last_position.device,
).view((bs, 1)) ).view((bs, 1))
next_position = one_value + last_position next_position = one_value + last_position
model_inputs["cache_position"] = next_position model_inputs["position_ids"] = next_position
# -------------------------------------------------------------------- # # -------------------------------------------------------------------- #
# 所需的: token的input_ids # 所需的: token的input_ids
...@@ -127,8 +127,12 @@ class GenerationMixin: ...@@ -127,8 +127,12 @@ class GenerationMixin:
# -------------------------------------------------------------------- # # -------------------------------------------------------------------- #
# 创建 cache # # 创建 cache #
# -------------------------------------------------------------------- # # -------------------------------------------------------------------- #
model_kwargs["use_cache"] = True if self.use_cache:
model_kwargs["past_key_values"] = DynamicCache(config=self.config) model_kwargs["use_cache"] = True
model_kwargs["past_key_values"] = DynamicCache(config=self.config)
else:
model_kwargs["use_cache"] = False
model_kwargs["past_key_values"] = None
# -------------------------------------------------------------------- # # -------------------------------------------------------------------- #
# _sample函数 # # _sample函数 #
...@@ -170,12 +174,12 @@ class GenerationMixin: ...@@ -170,12 +174,12 @@ class GenerationMixin:
) )
# -------------------------------------------------------------------------- # # -------------------------------------------------------------------------- #
# 初始化 cache_position # 初始化 position_ids
# -------------------------------------------------------------------------- # # -------------------------------------------------------------------------- #
output_tokens_list = [] output_tokens_list = []
model_kwargs["input_ids"] = input_ids model_kwargs["input_ids"] = input_ids
model_kwargs["cache_position"] = None model_kwargs["position_ids"] = None
output_content = "" output_content = ""
print() print()
...@@ -186,13 +190,13 @@ class GenerationMixin: ...@@ -186,13 +190,13 @@ class GenerationMixin:
# -------------------------------------------------------------------------- # # -------------------------------------------------------------------------- #
model_inputs = self.prepare_inputs_for_generation(device, **model_kwargs) model_inputs = self.prepare_inputs_for_generation(device, **model_kwargs)
model_kwargs["cache_position"] = model_inputs["cache_position"] model_kwargs["position_ids"] = model_inputs["position_ids"]
# -------------------------------------------------------------------------- # # -------------------------------------------------------------------------- #
# 计算一次 # 计算一次
# -------------------------------------------------------------------------- # # -------------------------------------------------------------------------- #
start_time = time.time() start_time = time.time()
logits = self.forward(**model_inputs, return_dict=True) logits = self(**model_inputs)
# -------------------------------------------------------------------------- # # -------------------------------------------------------------------------- #
# 处理输出 # 处理输出
...@@ -237,8 +241,12 @@ class GenerationMixin: ...@@ -237,8 +241,12 @@ class GenerationMixin:
if token_id in eos_token_id_list: if token_id in eos_token_id_list:
break break
print("\n</s>")
print(
f"\n\n\n Time per step: prefill {round(time_list[0], 2)} token/ms\n",
)
print( print(
f"\n\n Time per step: {round(sum(time_list) / len(time_list), 2)} ms\n", f" Time per step: decoder {round(sum(time_list[1:]) / (len(time_list) - 1), 2)} token/ms \n",
) )
return output_tokens_list, output_content return output_tokens_list, output_content
...@@ -3,9 +3,8 @@ from typing import Dict, Optional, Union ...@@ -3,9 +3,8 @@ from typing import Dict, Optional, Union
import torch import torch
from safetensors import safe_open from safetensors import safe_open
import glob
# from safetensors.torch import load_file as safe_load_file
# from safetensors.torch import save_file as safe_save_file
import infinicore import infinicore
str_to_torch_dtype = { str_to_torch_dtype = {
...@@ -76,9 +75,19 @@ def get_model_state_dict( ...@@ -76,9 +75,19 @@ def get_model_state_dict(
""" """
Load the model weights. Load the model weights.
""" """
path = os.path.join(model_path, "model.safetensors") # --------------------------------------------------------- #
model_param = load_state_dict(path) # 使用从 *.safetensors文件中加载权重
# --------------------------------------------------------- #
model_param = {}
for file_path in glob.glob(os.path.join(model_path, "*.safetensors")):
model_param.update(load_state_dict(file_path))
if model_param.get("lm_head.weight", None) is None:
model_param["lm_head.weight"] = model_param["model.embed_tokens.weight"]
# --------------------------------------------------------- #
# 调整权重的device和dtype
# --------------------------------------------------------- #
torch_device = device.type torch_device = device.type
torch_dtype = infinicore.utils.to_torch_dtype(dtype) torch_dtype = infinicore.utils.to_torch_dtype(dtype)
...@@ -86,6 +95,9 @@ def get_model_state_dict( ...@@ -86,6 +95,9 @@ def get_model_state_dict(
for key, value in model_param.items(): for key, value in model_param.items():
model_param[key] = value.to(device=torch_device, dtype=torch_dtype) model_param[key] = value.to(device=torch_device, dtype=torch_dtype)
# --------------------------------------------------------- #
# model_param_infini 引用torch.Tensor
# --------------------------------------------------------- #
for key, value in model_param.items(): for key, value in model_param.items():
model_param_infini[key] = infinicore.from_torch(model_param[key]) model_param_infini[key] = infinicore.from_torch(model_param[key])
......
from .llama import * from .llama import AutoLlamaModel
__all__ = ["AutoLlamaModel"]
from .configuration_llama import * # noqa: F403 import os
from .modeling_llama import * # noqa: F403 from typing import Optional, Union
import infinicore
__all__ = ["AutoLlamaModel"]
class AutoLlamaModel:
    """Factory that builds a ``LlamaForCausalLM`` for the requested backend.

    Mirrors the Hugging Face ``AutoModel`` pattern: callers invoke
    ``AutoLlamaModel.from_pretrained(...)`` and receive a model instance
    backed either by the pure-Python implementation or by the C++ binding.
    """

    @classmethod
    def from_pretrained(
        cls,
        model_path: Optional[Union[str, os.PathLike]],
        device: infinicore.device,
        # NOTE(review): the default is the dtype *class* ``infinicore.dtype``,
        # not a concrete dtype value — looks unintended; confirm callers
        # always pass ``dtype`` explicitly before relying on the default.
        dtype=infinicore.dtype,
        backend="python",
    ):
        """Load a pretrained Llama model with the selected backend.

        Args:
            model_path: Directory containing ``config.json`` and weights.
            device: Target ``infinicore.device`` for the parameters.
            dtype: Desired parameter dtype.
            backend: ``"python"`` for the infinicore implementation,
                ``"cpp"`` for the C++ backend.

        Returns:
            A backend-specific ``LlamaForCausalLM`` instance.

        Raises:
            KeyError: If *backend* is not one of the supported values.
        """
        # Import the chosen backend lazily so that selecting one backend
        # never pays the import cost (or import-time failures) of the other.
        if backend == "python":
            from . import modeling_llama

            return modeling_llama.LlamaForCausalLM.from_pretrained(
                model_path,
                device=device,
                dtype=dtype,
            )

        if backend == "cpp":
            from .backends import cpp

            return cpp.LlamaForCausalLM.from_pretrained(
                model_path,
                device=device,
                dtype=dtype,
            )

        # KeyError is kept for backward compatibility with existing callers,
        # but the message now names the rejected value for easier debugging.
        raise KeyError(f"invalid backend: {backend!r} (expected 'python' or 'cpp')")
from ....generation.utils import GenerationMixin
import infinicore
import os
from typing import Optional, Union
class LlamaForCausalLM(GenerationMixin):
    """Placeholder for the C++-backend Llama model.

    Every construction path currently raises ``NotImplementedError``; the
    class only pins down the interface (``forward`` / ``__call__`` /
    ``from_pretrained``) that the real C++ binding is expected to provide,
    so that ``AutoLlamaModel`` can dispatch to it uniformly.
    """

    def __init__(self):
        super().__init__()
        # Python-side generation cache is off here — presumably the C++
        # backend manages its own KV cache; TODO confirm once implemented.
        self.use_cache = False

        # Handle to the underlying C++ model object; expected to be set by
        # a future implementation of ``from_pretrained``.
        self._model = None

        # Stub: direct construction is not supported yet.
        raise NotImplementedError("NotImplementedError!!")

    def forward(self, input_ids, position_ids, *args, **kwargs):
        # Extra positional/keyword arguments are accepted for interface
        # compatibility with GenerationMixin but are currently ignored.
        # No KV cache is passed to the C++ side yet.
        kv_caches = None
        # Wrap the raw C++ result in an infinicore.Tensor for the Python side.
        return infinicore.Tensor(
            self._model.forward(input_ids, position_ids, kv_caches)
        )

    def __call__(self, input_ids, position_ids, *args, **kwargs):
        # Mirror infinicore.nn.Module's call convention used by the Python
        # backend: calling the model delegates to forward().
        return self.forward(input_ids=input_ids, position_ids=position_ids)

    @classmethod
    def from_pretrained(
        cls,
        model_path: Union[str, os.PathLike],
        device: infinicore.device,
        dtype=infinicore.dtype,
    ):
        """
        Load a pretrained LlamaForCausalLM model from a directory.

        Args:
            model_path: Path to the model directory containing config.json
            device: Device instance (defaults to CPU)

        Returns:
            LlamaForCausalLM instance
        """
        # Stub: loading via the C++ backend is not implemented yet.
        raise NotImplementedError("NotImplementedError!!")
...@@ -228,6 +228,3 @@ class LlamaConfig(PretrainedConfig): ...@@ -228,6 +228,3 @@ class LlamaConfig(PretrainedConfig):
tie_word_embeddings=tie_word_embeddings, tie_word_embeddings=tie_word_embeddings,
**kwargs, **kwargs,
) )
__all__ = ["LlamaConfig"]
...@@ -196,14 +196,14 @@ class LlamaAttention(infinicore.nn.Module): ...@@ -196,14 +196,14 @@ class LlamaAttention(infinicore.nn.Module):
# --------------------------------------------------------------------------------------- # # --------------------------------------------------------------------------------------- #
# 对 Q和K, 加上 rope # 对 Q和K, 加上 rope
# --------------------------------------------------------------------------------------- # # --------------------------------------------------------------------------------------- #
cache_position = kwargs.pop("cache_position", None) position_ids = kwargs.pop("position_ids", None)
if cache_position is None: if position_ids is None:
raise KeyError("cache_position error") raise KeyError("position_ids error")
if rope_instance is None: if rope_instance is None:
raise KeyError("rope_instance error") raise KeyError("rope_instance error")
query_states = rope_instance(query_states, cache_position) query_states = rope_instance(query_states, position_ids)
key_states = rope_instance(key_states, cache_position) key_states = rope_instance(key_states, position_ids)
# --------------------------------------------------------------------------------------- # # --------------------------------------------------------------------------------------- #
# kv cache # kv cache
...@@ -338,7 +338,7 @@ class LlamaModel(infinicore.nn.Module): ...@@ -338,7 +338,7 @@ class LlamaModel(infinicore.nn.Module):
def forward( def forward(
self, self,
input_ids, input_ids,
cache_position, position_ids,
past_key_values: Optional[Cache] = None, past_key_values: Optional[Cache] = None,
use_cache: Optional[bool] = None, # True use_cache: Optional[bool] = None, # True
**kwargs, **kwargs,
...@@ -364,7 +364,7 @@ class LlamaModel(infinicore.nn.Module): ...@@ -364,7 +364,7 @@ class LlamaModel(infinicore.nn.Module):
hidden_states = decoder_layer( hidden_states = decoder_layer(
hidden_states, hidden_states,
past_key_values=past_key_values, past_key_values=past_key_values,
cache_position=cache_position, position_ids=position_ids,
rope_instance=self.rope_instance, rope_instance=self.rope_instance,
**kwargs, **kwargs,
) )
...@@ -384,6 +384,8 @@ class LlamaForCausalLM(infinicore.nn.Module, GenerationMixin): ...@@ -384,6 +384,8 @@ class LlamaForCausalLM(infinicore.nn.Module, GenerationMixin):
def __init__(self, config, **kwargs): def __init__(self, config, **kwargs):
super().__init__() super().__init__()
self.config = config self.config = config
self.use_cache = True
self.model = LlamaModel(config, **kwargs) self.model = LlamaModel(config, **kwargs)
self.lm_head = infinicore.nn.Linear( self.lm_head = infinicore.nn.Linear(
config.hidden_size, config.hidden_size,
...@@ -395,14 +397,14 @@ class LlamaForCausalLM(infinicore.nn.Module, GenerationMixin): ...@@ -395,14 +397,14 @@ class LlamaForCausalLM(infinicore.nn.Module, GenerationMixin):
def forward( def forward(
self, self,
input_ids, input_ids,
cache_position, position_ids,
past_key_values: Optional[Cache] = None, past_key_values: Optional[Cache] = None,
use_cache: Optional[bool] = None, use_cache: Optional[bool] = None,
**kwargs, **kwargs,
): ):
last_token = self.model( last_token = self.model(
input_ids, input_ids,
cache_position, position_ids,
past_key_values=past_key_values, past_key_values=past_key_values,
use_cache=use_cache, use_cache=use_cache,
**kwargs, **kwargs,
...@@ -425,9 +427,3 @@ class LlamaForCausalLM(infinicore.nn.Module, GenerationMixin): ...@@ -425,9 +427,3 @@ class LlamaForCausalLM(infinicore.nn.Module, GenerationMixin):
config = LlamaConfig(**config_dict) config = LlamaConfig(**config_dict)
return LlamaForCausalLM(config, device=device, dtype=dtype) return LlamaForCausalLM(config, device=device, dtype=dtype)
__all__ = [
"LlamaModel",
"LlamaForCausalLM",
]
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment