Commit 3ddffe8d authored by pengcheng888

issue/76 - Add a Python implementation of the Llama model

parent 4fd9d490
@@ -35,3 +35,19 @@ python scripts/test_perf.py
```bash
python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MAX_BATCH] [--max-tokens MAX_TOKENS]
```
## Usage (new)
- Build and install `InfiniCore`; for details, see the InfiniCore [`README`](https://github.com/InfiniTensor/InfiniCore):
  - Make sure to set the `INFINI_ROOT` environment variable as prompted (defaults to `$HOME/.infini`)
  - Pick the xmake build configuration for your hardware platform
  - Build and install InfiniCore
    - Install the C++ libraries
    - Install the Python package
- Single-shot inference test
  - llama example (a concrete invocation is shown below the command template)
```bash
python examples/llama.py [--cpu | --nvidia] --model_path=<path/to/model_dir>
```
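For example, running on an NVIDIA GPU (the model directory below is a placeholder; `--max_new_tokens` is optional and defaults to 100):

```bash
python examples/llama.py --nvidia --model_path=/path/to/llama_model_dir --max_new_tokens=128
```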
import sys
import time
import os
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../python"))
import argparse
import infinilm
from infinilm.modeling_utils import get_model_state_dict
from tokenizers import decoders as _dec
from transformers import AutoTokenizer
import infinicore
def get_args():
parser = argparse.ArgumentParser(description="run Llama args")
parser.add_argument(
"--cpu",
action="store_true",
help="Run cpu test",
)
parser.add_argument(
"--nvidia",
action="store_true",
help="Run nvidia test",
)
parser.add_argument(
"--metax",
action="store_true",
help="Run metax test",
)
parser.add_argument(
"--model_path",
type=str,
required=True,
help="model_path",
)
parser.add_argument(
"--max_new_tokens",
type=int,
default=100,
help="max_new_tokens",
)
return parser.parse_args()
def test(model_path, device_str="cuda", max_new_tokens=100):
# ---------------------------------------------------------------------------- #
# Create the model
# ---------------------------------------------------------------------------- #
infini_device = infinicore.device(device_str, 0)
infini_dtype = infinicore.bfloat16
model = infinilm.LlamaForCausalLM.from_pretrained(
model_path,
device=infini_device,
dtype=infini_dtype,
)
# ---------------------------------------------------------------------------- #
# Load the model weights
# ---------------------------------------------------------------------------- #
model_param_infini = get_model_state_dict(
model_path,
device=infini_device,
dtype=infini_dtype,
)
model.load_state_dict(model_param_infini)
config = model.config
# ---------------------------------------------------------------------------- #
# Create the tokenizer
# ---------------------------------------------------------------------------- #
tokenizer = AutoTokenizer.from_pretrained(model_path)
if "llama" == config.model_type:
backend = getattr(tokenizer, "backend_tokenizer", None)
target = getattr(backend, "_tokenizer", backend)
norm = getattr(target, "normalizer", None)
dec = getattr(target, "decoder", None)
sn = repr(norm)[:800] if norm is not None else ""
sd = repr(dec)[:800] if dec is not None else ""
has_prepend = "Prepend" in sn
has_strip = "Strip" in sd
if has_prepend and has_strip:
target.decoder = _dec.Sequence(
[
_dec.Replace("▁", " "),
_dec.ByteFallback(),
_dec.Fuse(),
]
)
# ---------------------------------------------------------------------------- #
# Encode the prompt into token ids
# ---------------------------------------------------------------------------- #
prompt = "山东最高的山是?"
input_content = tokenizer.apply_chat_template(
conversation=[{"role": "user", "content": prompt}],
add_generation_prompt=True,
tokenize=False,
)
print(input_content, end="", flush=True)
input_ids = tokenizer.encode(input_content)
# ---------------------------------------------------------------------------- #
# Autoregressive generation
# ---------------------------------------------------------------------------- #
input_ids_list = [input_ids] # List: [[1, 1128, 526, 366, 29892]]
input_ids_infini = infinicore.from_list(input_ids_list)
t1 = time.time()
model.generate(
input_ids_infini,
max_new_tokens=max_new_tokens,
device=infini_device,
tokenizer=tokenizer,
config=config,
)
t2 = time.time()
print(
f"total_time: {round((t2 - t1) * 1000, 2)} ms",
)
if __name__ == "__main__":
args = get_args()
print(args)
# Parse command line arguments
device_type = "cpu"
if args.cpu:
device_type = "cpu"
elif args.nvidia:
device_type = "cuda"
elif args.metax:
device_type = "cuda"
else:
print(
"Usage: python examples/llama.py [--cpu | --nvidia] --model_path=<path/to/model_dir>"
)
sys.exit(1)
model_path = args.model_path
max_new_tokens = args.max_new_tokens
test(model_path, device_type, max_new_tokens)
from .models import *
# Copyright (c) 2025, InfiniCore
#
# This file contains modified code derived from transformers
# implementation, which is licensed under the BSD 3-Clause License.
#
# The modifications include adaptations for the InfiniCore framework.
#
# Original transformers source:
# https://github.com/huggingface/transformers
#
# Referencing transformers v4.57.0
#
# The use of this file is governed by the BSD 3-Clause License.
from abc import ABC, abstractmethod
from typing import Any, Optional
import transformers.utils.logging as logging
import infinicore
logger = logging.get_logger(__name__)
class CacheLayerMixin(ABC):
"""Base, abstract class for a single layer's cache."""
def __init__(self):
self.keys, self.values = None, None
def __repr__(self):
return f"{self.__class__.__name__}"
@abstractmethod
def lazy_initialization(self, key_states: infinicore.Tensor): ...
@abstractmethod
def update(
self,
key_states: infinicore.Tensor,
value_states: infinicore.Tensor,
cache_kwargs: Optional[dict[str, Any]] = None,
) -> tuple[infinicore.Tensor, infinicore.Tensor]: ...
class DynamicLayer(CacheLayerMixin):
"""
A cache layer that grows dynamically as more tokens are generated.
It stores the key and value states as tensors of shape `[batch_size, seq_len, num_heads, head_dim]`.
"""
def __init__(self, max_position_embeddings):
super().__init__()
self.max_position_embeddings = max_position_embeddings
self.cache_position = 0
def lazy_initialization(self, key_states: infinicore.Tensor):
batch_size, seq_len, num_heads, head_dim = key_states.shape
if self.keys is None:
dtype, device = key_states.dtype, key_states.device
self.cache_position = 0
self.max_seq_len = max(self.max_position_embeddings, seq_len)
self.keys = infinicore.empty(
[batch_size, self.max_seq_len, num_heads, head_dim],
dtype=dtype,
device=device,
)
self.values = infinicore.empty(
[batch_size, self.max_seq_len, num_heads, head_dim],
dtype=dtype,
device=device,
)
elif self.cache_position + seq_len >= self.max_seq_len:
dtype, device = key_states.dtype, key_states.device
self.max_seq_len = max(self.max_seq_len * 2, self.cache_position + seq_len)
keys_new = infinicore.empty(
[batch_size, self.max_seq_len, num_heads, head_dim],
dtype=dtype,
device=device,
)
values_new = infinicore.empty(
[batch_size, self.max_seq_len, num_heads, head_dim],
dtype=dtype,
device=device,
)
keys_new.narrow(1, 0, self.cache_position).copy_(
self.keys.narrow(1, 0, self.cache_position)
)
values_new.narrow(1, 0, self.cache_position).copy_(
self.values.narrow(1, 0, self.cache_position)
)
self.keys, self.values = keys_new, values_new
def update(
self,
key_states: infinicore.Tensor,
value_states: infinicore.Tensor,
cache_kwargs: Optional[dict[str, Any]] = None,
):
# Lazy initialization
self.lazy_initialization(key_states)
seq_len = key_states.shape[1]
index = self.cache_position
# Update the cache
self.keys.narrow(1, index, seq_len).copy_(key_states)
self.values.narrow(1, index, seq_len).copy_(value_states)
self.cache_position += seq_len
return self.keys.narrow(1, 0, self.cache_position), self.values.narrow(
1, 0, self.cache_position
)
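# A minimal usage sketch for DynamicLayer (the shapes, dtype and device below are
# illustrative assumptions, not values taken from a real model):
#
#   layer = DynamicLayer(max_position_embeddings=2048)
#   k = infinicore.empty([1, 5, 8, 64], dtype=infinicore.bfloat16, device=infinicore.device("cpu", 0))
#   v = infinicore.empty([1, 5, 8, 64], dtype=infinicore.bfloat16, device=infinicore.device("cpu", 0))
#   keys, values = layer.update(k, v)  # prefill: the cache now covers positions 0..4
#   keys, values = layer.update(k, v)  # a further call appends after position 4 (decode step)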
class Cache:
"""
A `Cache` is mostly a list of `CacheLayerMixin` objects, one per model layer. It serves as a container for the Cache of each layer.
Args:
layers (`list[CacheLayerMixin]`, *optional*): A list of pre-created `CacheLayerMixin` objects.
"""
def __init__(
self,
layers: Optional[list[CacheLayerMixin]] = None,
):
self.layers = layers if layers is not None else []
def update(
self,
key_states: infinicore.Tensor,
value_states: infinicore.Tensor,
layer_idx: int,
cache_kwargs: Optional[dict[str, Any]] = None,
) -> tuple[infinicore.Tensor, infinicore.Tensor]:
"""
Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
Parameters:
key_states (`infinicore.Tensor`):
The new key states to cache.
value_states (`infinicore.Tensor`):
The new value states to cache.
layer_idx (`int`):
The index of the layer to cache the states for.
cache_kwargs (`dict[str, Any]`, *optional*):
Additional arguments for the cache subclass.
Return:
A tuple containing the updated key and value states.
"""
keys, values = self.layers[layer_idx].update(
key_states, value_states, cache_kwargs
)
return keys.contiguous(), values.contiguous()
class DynamicCache(Cache):
"""
A cache that grows dynamically as more tokens are generated. This is the default for generative models.
It stores the key and value states as a list of `CacheLayer`, one for each layer.
Args:
config (`PretrainedConfig`, *optional*):
The config of the model for which this Cache will be used.
"""
def __init__(
self,
config=None,
):
layers = []
# If a config is passed, use it to infer the layer types and initialize accordingly
if config is not None:
config = config.get_text_config()
max_position_embeddings = config.max_position_embeddings
layer_types = ["full_attention" for _ in range(config.num_hidden_layers)]
for _ in layer_types:
layers.append(DynamicLayer(max_position_embeddings))
super().__init__(
layers=layers,
)
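# Usage sketch: in this commit a DynamicCache is created once per generation call
# (see GenerationMixin.generate, which passes config=self.config), and every
# attention layer then calls past_key_values.update(key_states, value_states,
# layer_idx) to append its new KV states and read back the full cache for that layer.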
# Copyright (c) 2025, InfiniCore
#
# This file contains modified code derived from transformers
# implementation, which is licensed under the BSD 3-Clause License.
#
# The modifications include adaptations for the InfiniCore framework.
#
# Original transformers source:
# https://github.com/huggingface/transformers
#
# Referencing transformers v4.57.0
#
# The use of this file is governed by the BSD 3-Clause License.
import copy
from typing import Any
class PretrainedConfig:
def __init__(self, *args, **kwargs):
pass
def to_dict(self) -> dict[str, Any]:
"""
Serializes this instance to a Python dictionary.
Returns:
`dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
"""
output = copy.deepcopy(self.__dict__)
if hasattr(self.__class__, "model_type"):
output["model_type"] = self.__class__.model_type
# Transformers version when serializing the model
output["transformers_version"] = "None"
for key, value in output.items():
# Deal with nested configs like CLIP
if isinstance(value, PretrainedConfig):
value = value.to_dict()
del value["transformers_version"]
output[key] = value
self.dict_dtype_to_str(output)
return output
@property
def is_encoder_decoder(self):
return False
def dict_dtype_to_str(self, d: dict[str, Any]) -> None:
"""
Checks whether the passed dictionary and its nested dicts have a *dtype* key and if it's not None,
converts torch.dtype to a string of just the type. For example, `torch.float32` get converted into *"float32"*
string, which can then be stored in the json format.
"""
if d.get("dtype") is not None and not isinstance(d["dtype"], str):
d["dtype"] = str(d["dtype"]).split(".")[1]
for value in d.values():
if isinstance(value, dict):
self.dict_dtype_to_str(value)
def get_text_config(self, decoder=None, encoder=None):
return_both = (
decoder == encoder
) # both unset or both set -> search all possible names
decoder_possible_text_config_names = ("decoder", "generator", "text_config")
encoder_possible_text_config_names = ("text_encoder",)
if return_both:
possible_text_config_names = (
encoder_possible_text_config_names + decoder_possible_text_config_names
)
elif decoder:
possible_text_config_names = decoder_possible_text_config_names
else:
possible_text_config_names = encoder_possible_text_config_names
valid_text_config_names = []
for text_config_name in possible_text_config_names:
if hasattr(self, text_config_name):
text_config = getattr(self, text_config_name, None)
if text_config is not None:
valid_text_config_names += [text_config_name]
if len(valid_text_config_names) > 1:
raise ValueError(
f"Multiple valid text configs were found in the model config: {valid_text_config_names}. In this "
"case, using `get_text_config()` would be ambiguous. Please specify the desired text config directly, "
"e.g. `text_config = config.sub_config_name`"
)
elif len(valid_text_config_names) == 1:
config_to_return = getattr(self, valid_text_config_names[0])
else:
config_to_return = self
# handle legacy models with flat config structure, when we only want one of the configs
if (
not return_both
and len(valid_text_config_names) == 0
and config_to_return.is_encoder_decoder
):
config_to_return = copy.deepcopy(config_to_return)
prefix_to_discard = "encoder" if decoder else "decoder"
for key in config_to_return.to_dict():
if key.startswith(prefix_to_discard):
delattr(config_to_return, key)
# old encoder/decoder models may use "encoder_layers"/"decoder_layers" instead of "num_hidden_layers"
if decoder and hasattr(config_to_return, "decoder_layers"):
config_to_return.num_hidden_layers = config_to_return.decoder_layers
elif encoder and hasattr(config_to_return, "encoder_layers"):
config_to_return.num_hidden_layers = config_to_return.encoder_layers
return config_to_return
import time
from typing import Optional
import infinicore
from ..cache_utils import Cache, DynamicCache
import numpy as np
def infini_to_ctype_dtype(infini_dtype):
"""Convert PyTorch data type to infinicore data type"""
import ctypes
if infini_dtype == infinicore.int32:
return ctypes.c_int32
elif infini_dtype == infinicore.float32:
return ctypes.c_float
else:
raise ValueError(f"Unsupported py_dtype: {infini_dtype}")
def infini_to_numpy(infini_tensor: infinicore.Tensor):
if infini_tensor.device.type != "cpu":
infini_tensor_cpu = infini_tensor.to(infinicore.device("cpu", 0))
else:
infini_tensor_cpu = infini_tensor
# Get the data pointer and shape information
data_ptr = infini_tensor_cpu.data_ptr()
num_elements = infini_tensor_cpu.numel()
original_shape = infini_tensor_cpu.shape
# Create a flat NumPy array that shares the tensor's memory
ArrayType = infini_to_ctype_dtype(infini_tensor_cpu.dtype) * num_elements
array = ArrayType.from_address(data_ptr)
np_flat = np.ctypeslib.as_array(array)
# Reshape to the original shape
np_array = np_flat.reshape(original_shape)
return np.copy(np_array)
infinicore.Tensor.to_numpy = infini_to_numpy
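# Usage sketch: with the monkey-patch above, any infinicore tensor can be copied
# out to NumPy, e.g. the sampled token ids in _sample below:
#
#   token_id = next_tokens.to_numpy()[0]
#
# Note that infini_to_ctype_dtype only handles int32 and float32, so to_numpy is
# limited to those dtypes.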
class GenerationMixin:
def _get_initial_cache_position(
self,
bs: int,
seq_length: int,
device: infinicore.device,
) -> infinicore.Tensor:
"""Calculates `cache_position` for the pre-fill stage"""
cache_position_list = [list(range(0, seq_length)) for i in range(bs)]
return infinicore.from_list(
cache_position_list, dtype=infinicore.int64, device=device
)
def prepare_inputs_for_generation(
self,
device: infinicore.device,
past_key_values: Optional[Cache] = None,
**kwargs,
):
"""Prepare the model inputs for generation."""
model_inputs = {}
# -------------------------------------------------------------------- #
# Required: the KV cache
# -------------------------------------------------------------------- #
if past_key_values is not None:
model_inputs["past_key_values"] = past_key_values
# -------------------------------------------------------------------------- #
# Compute the cache_position needed for this step
# -------------------------------------------------------------------------- #
current_cache_position = kwargs.get("cache_position", None)
if current_cache_position is None:
# Prefill stage
bs, seq_len = kwargs["input_ids"].shape[0:2]
model_inputs["cache_position"] = self._get_initial_cache_position(
bs, seq_len, device
)
else:
# Decode stage
bs, seq_len = current_cache_position.shape
last_position = current_cache_position.narrow(1, seq_len - 1, 1)
one_value = infinicore.from_list(
[1],
dtype=last_position.dtype,
device=last_position.device,
).view((bs, 1))
next_position = one_value + last_position
model_inputs["cache_position"] = next_position
# -------------------------------------------------------------------- #
# Required: input_ids of the next token
# -------------------------------------------------------------------- #
if kwargs.get("next_token_id", None) is not None:
next_token_id = kwargs["next_token_id"]
model_inputs["input_ids"] = infinicore.from_list([[next_token_id]])
# -------------------------------------------------------------------- #
# Other kwargs
# -------------------------------------------------------------------- #
for key, value in kwargs.items():
if key not in model_inputs:
model_inputs[key] = value
return model_inputs
def generate(
self,
input_ids: infinicore.Tensor,
max_new_tokens: int,
device: infinicore.device,
tokenizer,
config,
**kwargs,
):
model_kwargs = kwargs
# -------------------------------------------------------------------- #
# Create the KV cache #
# -------------------------------------------------------------------- #
model_kwargs["use_cache"] = True
model_kwargs["past_key_values"] = DynamicCache(config=self.config)
# -------------------------------------------------------------------- #
# Run _sample #
# -------------------------------------------------------------------- #
result = self._sample(
input_ids,
max_new_tokens=max_new_tokens,
device=device,
tokenizer=tokenizer,
config=config,
**model_kwargs,
)
return result
def _sample(
self,
input_ids: infinicore.Tensor,
max_new_tokens: int,
device: infinicore.device,
tokenizer,
config,
**model_kwargs,
):
r"""
Generates sequences of token ids for models with a language modeling head.
Parameters:
input_ids (batch_size, seq_len): The sequence used as a prompt for the generation.
max_new_tokens: Maximum number of new tokens.
device: infinicore.device.
tokenizer: used to decode generated token ids into text.
"""
batch_size, seq_len = input_ids.shape[:2]
eos_token_id = config.eos_token_id
eos_token_id_list = (
[eos_token_id] if isinstance(eos_token_id, int) else eos_token_id
)
# -------------------------------------------------------------------------- #
# Initialize cache_position
# -------------------------------------------------------------------------- #
output_tokens_list = []
model_kwargs["input_ids"] = input_ids
model_kwargs["cache_position"] = None
output_content = ""
print()
time_list = []
for i in range(0, max_new_tokens):
# -------------------------------------------------------------------------- #
# prepare model inputs
# -------------------------------------------------------------------------- #
model_inputs = self.prepare_inputs_for_generation(device, **model_kwargs)
model_kwargs["cache_position"] = model_inputs["cache_position"]
# -------------------------------------------------------------------------- #
# Run one forward pass
# -------------------------------------------------------------------------- #
start_time = time.time()
logits = self.forward(**model_inputs, return_dict=True)
# -------------------------------------------------------------------------- #
# Process the output
# -------------------------------------------------------------------------- #
token_scores = logits
# -------------------------------------------------------------------------- #
# random_sample
# -------------------------------------------------------------------------- #
batch_size, _, vocab_size = token_scores.shape
next_tokens = infinicore.empty(
(batch_size,),
dtype=infinicore.int32,
device=token_scores.device,
)
for b in range(batch_size):  # separate index so the generation-step counter `i` is not shadowed
score = token_scores.narrow(0, b, 1).view([vocab_size])
out = next_tokens.narrow(0, b, 1).view([])
infinicore.nn.functional.random_sample(
score,
0.8,
0.1,
1,
1.0,
out=out,
)
end_time = time.time()
time_list.append((end_time - start_time) * 1000)
# ----------------------------------------------------------------- #
# Get the next token id and decode it to text
# ----------------------------------------------------------------- #
token_id = next_tokens.to_numpy()[0]
output_str = tokenizer.decode([token_id], skip_special_tokens=True)
model_kwargs["next_token_id"] = token_id
output_tokens_list.append(token_id)
output_content += output_str
print(output_str, end="", flush=True)
if token_id in eos_token_id_list:
break
print(
f"\n\n Time per step: {round(sum(time_list) / len(time_list), 2)} ms\n",
)
return output_tokens_list, output_content
import os
from typing import Dict, Optional, Union
import torch
from safetensors import safe_open
# from safetensors.torch import load_file as safe_load_file
# from safetensors.torch import save_file as safe_save_file
import infinicore
str_to_torch_dtype = {
"BOOL": torch.bool,
"U8": torch.uint8,
"I8": torch.int8,
"I16": torch.int16,
"F16": torch.float16,
"BF16": torch.bfloat16,
"I32": torch.int32,
"F32": torch.float32,
"F64": torch.float64,
"I64": torch.int64,
"F8_E4M3": torch.float8_e4m3fn,
"F8_E5M2": torch.float8_e5m2,
}
def load_state_dict(
checkpoint_file: Union[str, os.PathLike],
map_location: Optional[Union[str, torch.device]] = "cpu",
weights_only: bool = True,
) -> Dict[str, torch.Tensor]:
"""
Reads a `safetensor` checkpoint file. We load the checkpoint on "cpu" by default.
"""
# Use safetensors if possible
if not checkpoint_file.endswith(".safetensors"):
return {}
state_dict = {}
with safe_open(checkpoint_file, framework="pt") as f:
metadata = f.metadata()
if metadata is not None and metadata.get("format") not in [
"pt",
"tf",
"flax",
"mlx",
]:
raise OSError(
f"The safetensors archive passed at {checkpoint_file} does not contain the valid metadata."
)
for k in f.keys():
if map_location == "meta":
_slice = f.get_slice(k)
k_dtype = _slice.get_dtype()
if k_dtype in str_to_torch_dtype:
dtype = str_to_torch_dtype[k_dtype]
else:
raise ValueError(
f"Cannot load safetensors of unknown dtype {k_dtype}"
)
state_dict[k] = torch.empty(
size=_slice.get_shape(), dtype=dtype, device="meta"
)
else:
state_dict[k] = f.get_tensor(k)
return state_dict
def get_model_state_dict(
model_path: str,
device: infinicore.device,
dtype=infinicore.dtype,
) -> Dict[str, infinicore.Tensor]:
"""
Load the model weights.
"""
path = os.path.join(model_path, "model.safetensors")
model_param = load_state_dict(path)
torch_device = device.type
torch_dtype = infinicore.utils.to_torch_dtype(dtype)
model_param_infini = {}
for key, value in model_param.items():
model_param[key] = value.to(device=torch_device, dtype=torch_dtype)
for key, value in model_param.items():
model_param_infini[key] = infinicore.from_torch(model_param[key])
return model_param_infini
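# Usage sketch (mirroring examples/llama.py): load the weights of a model directory
# that contains a single model.safetensors file and hand them to the model:
#
#   state_dict = get_model_state_dict(model_path, device=infinicore.device("cuda", 0), dtype=infinicore.bfloat16)
#   model.load_state_dict(state_dict)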
from .configuration_llama import * # noqa: F403
from .modeling_llama import * # noqa: F403
# coding=utf-8
# Copyright (c) 2025, InfiniCore
#
# This file contains modified code derived from transformers
# implementation, which is licensed under the BSD 3-Clause License.
#
# The modifications include adaptations for the InfiniCore framework.
#
# Original transformers source:
# https://github.com/huggingface/transformers
#
# Referencing transformers v4.57.0
#
# The use of this file is governed by the BSD 3-Clause License.
"""LLaMA model configuration"""
from ...configuration_utils import PretrainedConfig
class LlamaConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the LLaMA-7B.
e.g. [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf)
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 32000):
Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`LlamaModel`]
hidden_size (`int`, *optional*, defaults to 4096):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 11008):
Dimension of the MLP representations.
num_hidden_layers (`int`, *optional*, defaults to 32):
Number of hidden layers in the Transformer decoder.
num_attention_heads (`int`, *optional*, defaults to 32):
Number of attention heads for each attention layer in the Transformer decoder.
num_key_value_heads (`int`, *optional*):
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
by meanpooling all the original heads within that group. For more details, check out [this
paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
`num_attention_heads`.
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
The non-linear activation function (function or string) in the decoder.
max_position_embeddings (`int`, *optional*, defaults to 2048):
The maximum sequence length that this model might ever be used with. Llama 1 supports up to 2048 tokens,
Llama 2 up to 4096, CodeLlama up to 16384.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the rms normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
pad_token_id (`int`, *optional*):
Padding token id.
bos_token_id (`int`, *optional*, defaults to 1):
Beginning of stream token id.
eos_token_id (`int`, *optional*, defaults to 2):
End of stream token id.
pretraining_tp (`int`, *optional*, defaults to 1):
Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to
understand more about it. This value is necessary to ensure exact reproducibility of the pretraining
results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232).
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether to tie weight embeddings
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
rope_scaling (`Dict`, *optional*):
Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
accordingly.
Expected contents:
`rope_type` (`str`):
The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
'llama3'], with 'default' being the original RoPE implementation.
`factor` (`float`, *optional*):
Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
most scaling types, a `factor` of x will enable the model to handle sequences of length x *
original maximum pre-trained length.
`original_max_position_embeddings` (`int`, *optional*):
Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
pretraining.
`attention_factor` (`float`, *optional*):
Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
computation. If unspecified, it defaults to value recommended by the implementation, using the
`factor` field to infer the suggested value.
`beta_fast` (`float`, *optional*):
Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
ramp function. If unspecified, it defaults to 32.
`beta_slow` (`float`, *optional*):
Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
ramp function. If unspecified, it defaults to 1.
`short_factor` (`list[float]`, *optional*):
Only used with 'longrope'. The scaling factor to be applied to short contexts (<
`original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
size divided by the number of attention heads divided by 2
`long_factor` (`list[float]`, *optional*):
Only used with 'longrope'. The scaling factor to be applied to long contexts (<
`original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
size divided by the number of attention heads divided by 2
`low_freq_factor` (`float`, *optional*):
Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
`high_freq_factor` (`float`, *optional*):
Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
attention_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in the query, key, value and output projection layers during self-attention.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
mlp_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
head_dim (`int`, *optional*):
The attention head dimension. If None, it will default to hidden_size // num_attention_heads
```python
>>> from transformers import LlamaModel, LlamaConfig
>>> # Initializing a LLaMA llama-7b style configuration
>>> configuration = LlamaConfig()
>>> # Initializing a model from the llama-7b style configuration
>>> model = LlamaModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "llama"
keys_to_ignore_at_inference = ["past_key_values"]
# Default tensor parallel plan for base model `LlamaModel`
base_model_tp_plan = {
"layers.*.self_attn.q_proj": "colwise",
"layers.*.self_attn.k_proj": "colwise",
"layers.*.self_attn.v_proj": "colwise",
"layers.*.self_attn.o_proj": "rowwise",
"layers.*.mlp.gate_proj": "colwise",
"layers.*.mlp.up_proj": "colwise",
"layers.*.mlp.down_proj": "rowwise",
}
base_model_pp_plan = {
"embed_tokens": (["input_ids"], ["inputs_embeds"]),
"layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
"norm": (["hidden_states"], ["hidden_states"]),
}
def __init__(
self,
vocab_size=32000,
hidden_size=4096,
intermediate_size=11008,
num_hidden_layers=32,
num_attention_heads=32,
num_key_value_heads=None,
hidden_act="silu",
max_position_embeddings=2048,
initializer_range=0.02,
rms_norm_eps=1e-6,
use_cache=True,
pad_token_id=None,
bos_token_id=1,
eos_token_id=2,
pretraining_tp=1,
tie_word_embeddings=False,
rope_theta=10000.0,
rope_scaling=None,
attention_bias=False,
attention_dropout=0.0,
mlp_bias=False,
head_dim=None,
**kwargs,
):
# ---
self.model_type = "llama"
self.name_or_path = ""
self.pad_token_id = pad_token_id
self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id
# ---
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
# for backward compatibility
if num_key_value_heads is None:
num_key_value_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.rms_norm_eps = rms_norm_eps
self.pretraining_tp = pretraining_tp
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
self.mlp_bias = mlp_bias
self.head_dim = (
head_dim
if head_dim is not None
else self.hidden_size // self.num_attention_heads
)
# Validate the correctness of rotary position embeddings parameters
# BC: if there is a 'type' field, copy it to 'rope_type'.
if self.rope_scaling is not None and "type" in self.rope_scaling:
self.rope_scaling["rope_type"] = self.rope_scaling["type"]
# rope_config_validation(self)
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
__all__ = ["LlamaConfig"]
# Copyright (c) 2025, InfiniCore
#
# This file contains modified code derived from transformers
# implementation, which is licensed under the BSD 3-Clause License.
#
# The modifications include adaptations for the InfiniCore framework.
#
# Original transformers source:
# https://github.com/huggingface/transformers
#
# Referencing transformers v4.57.0
#
# The use of this file is governed by the BSD 3-Clause License.
import json
import os
from typing import Optional, Union
from transformers.utils import logging
import infinicore
from ...cache_utils import Cache, DynamicCache
from ...generation.utils import GenerationMixin
from .configuration_llama import LlamaConfig
logger = logging.get_logger(__name__)
def repeat_kv(keys: infinicore.Tensor, values: infinicore.Tensor, ngroup: int):
total_seq_len, num_key_value_heads, head_dim = keys.shape
keys_repeat = infinicore.empty(
(total_seq_len, num_key_value_heads, ngroup, head_dim),
dtype=keys.dtype,
device=keys.device,
)
values_repeat = infinicore.empty(
(total_seq_len, num_key_value_heads, ngroup, head_dim),
dtype=values.dtype,
device=values.device,
)
for i in range(ngroup):
keys_repeat.narrow(2, i, 1).copy_(
keys.view((total_seq_len, num_key_value_heads, 1, head_dim))
)
values_repeat.narrow(2, i, 1).copy_(
values.view((total_seq_len, num_key_value_heads, 1, head_dim))
)
keys_new = keys_repeat.view((total_seq_len, num_key_value_heads * ngroup, head_dim))
values_new = values_repeat.view(
(total_seq_len, num_key_value_heads * ngroup, head_dim)
)
return keys_new, values_new
def multi_head_attention(
querys: infinicore.Tensor, # [seq_len, num_heads, head_dim]
keys: infinicore.Tensor, # [total_seq_len, num_heads, head_dim]
values: infinicore.Tensor, # [total_seq_len, num_heads, head_dim]
scaling: float,
):
# => [ num_heads, seq_len, head_dim]
Q = querys.permute((1, 0, 2))
# kept as [total_seq_len, num_heads, head_dim]; permuted to [num_heads, head_dim, total_seq_len] below
K = keys
# => [ num_heads, total_seq_len, head_dim]
V = values.permute((1, 0, 2))
# [num_heads, seq_len, head_dim] @ [ num_heads, head_dim, total_seq_len]
# => [ num_heads, seq_len, total_seq_len]
attn_weight = Q @ K.permute((1, 2, 0))
scaling = infinicore.from_list(
[scaling], dtype=attn_weight.dtype, device=attn_weight.device
).as_strided(attn_weight.shape, [0, 0, 0])
attn_weight = attn_weight * scaling
infinicore.nn.functional.causal_softmax(attn_weight, out=attn_weight)
# [ num_heads, seq_len, total_seq_len] @ [num_heads, total_seq_len, head_dim]
# => [ num_heads,seq_len,head_dim]
out = attn_weight @ V
# => [seq_len, num_heads, head_dim]
return out.permute((1, 0, 2)).contiguous()
def grouped_query_attention(
querys: infinicore.Tensor, # [seq_len, num_attention_heads, head_dim]
keys: infinicore.Tensor, # [total_seq_len, num_key_value_heads, head_dim]
values: infinicore.Tensor, # [total_seq_len, num_key_value_heads, head_dim]
scaling: float,
):
num_attention_heads = querys.shape[1]
num_key_value_heads = keys.shape[1]
ngroup = num_attention_heads // num_key_value_heads
if ngroup > 1:
keys, values = repeat_kv(keys, values, ngroup)
return multi_head_attention(querys, keys, values, scaling=scaling)
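# Shape sketch (illustrative numbers): with num_attention_heads=32 and
# num_key_value_heads=8, ngroup is 4; querys of shape [seq_len, 32, head_dim] and
# keys/values of shape [total_seq_len, 8, head_dim] are expanded by repeat_kv to
# 32 KV heads, and the returned attention output has the same shape as querys.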
LlamaRMSNorm = infinicore.nn.RMSNorm
class LlamaMLP(infinicore.nn.Module):
def __init__(self, config, **kwargs):
super().__init__()
hidden_size = config.hidden_size
intermediate_size = config.intermediate_size
mlp_bias = config.mlp_bias
self.gate_proj = infinicore.nn.Linear(
hidden_size, intermediate_size, bias=mlp_bias, **kwargs
)
self.up_proj = infinicore.nn.Linear(
hidden_size, intermediate_size, bias=mlp_bias, **kwargs
)
self.down_proj = infinicore.nn.Linear(
intermediate_size, hidden_size, bias=mlp_bias, **kwargs
)
self.act_fn = infinicore.nn.functional.silu
def forward(self, x: infinicore.Tensor) -> infinicore.Tensor:
down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
return down_proj
class LlamaAttention(infinicore.nn.Module):
def __init__(self, config: LlamaConfig, layer_idx: int, **kwargs):
super().__init__()
self.config = config
self.layer_idx = layer_idx
self.hidden_size = config.hidden_size
self.num_attention_heads = config.num_attention_heads
self.num_key_value_heads = config.num_key_value_heads
self.num_key_value_groups = self.num_attention_heads // self.num_key_value_heads
attention_bias = config.attention_bias
self.head_dim = getattr(
config, "head_dim", self.hidden_size // self.num_attention_heads
)
self.scaling = self.head_dim**-0.5
self.q_proj = infinicore.nn.Linear(
self.hidden_size,
self.num_attention_heads * self.head_dim,
bias=attention_bias,
**kwargs,
)
self.k_proj = infinicore.nn.Linear(
self.hidden_size,
self.num_key_value_heads * self.head_dim,
bias=attention_bias,
**kwargs,
)
self.v_proj = infinicore.nn.Linear(
self.hidden_size,
self.num_key_value_heads * self.head_dim,
bias=attention_bias,
**kwargs,
)
self.o_proj = infinicore.nn.Linear(
self.num_attention_heads * self.head_dim,
self.hidden_size,
bias=attention_bias,
**kwargs,
)
def forward(
self,
hidden_states: infinicore.Tensor,
past_key_values: Optional[Cache] = None,
rope_instance: infinicore.nn.RoPE = None,
**kwargs,
) -> infinicore.Tensor:
hidden_states_shape = hidden_states.shape # [bs, seq_len, hidden_size]
bs, seq_len = hidden_states_shape[:-1] # [bs, seq_len]
querys_shape = (bs, seq_len, self.num_attention_heads, self.head_dim)
keys_shape = (bs, seq_len, self.num_key_value_heads, self.head_dim)
values_shape = (bs, seq_len, self.num_key_value_heads, self.head_dim)
# --------------------------------------------------------------------------------------- #
# Project Q, K, V
# --------------------------------------------------------------------------------------- #
# => [bs, seq_len, num_attention_heads, head_dim]
query_states = self.q_proj(hidden_states).view(querys_shape)
# => [bs, seq_len, num_key_value_heads, head_dim]
key_states = self.k_proj(hidden_states).view(keys_shape)
# => [bs, seq_len, nkvh, head_dim]
value_states = self.v_proj(hidden_states).view(values_shape)
# --------------------------------------------------------------------------------------- #
# Apply RoPE to Q and K
# --------------------------------------------------------------------------------------- #
cache_position = kwargs.pop("cache_position", None)
if cache_position is None:
raise KeyError("cache_position error")
if rope_instance is None:
raise KeyError("rope_instance error")
query_states = rope_instance(query_states, cache_position)
key_states = rope_instance(key_states, cache_position)
# --------------------------------------------------------------------------------------- #
# kv cache
# --------------------------------------------------------------------------------------- #
if past_key_values is not None:
cache_kwargs = {}
key_states_total, value_states_total = past_key_values.update(
key_states, # [bs, seq_len, num_key_value_heads, head_dim]
value_states, # [bs, seq_len, num_key_value_heads, head_dim]
self.layer_idx,
cache_kwargs,
)
else:
# No cache: attend only over the current key/value states
key_states_total, value_states_total = key_states, value_states
# --------------------------------------------------------------------------------------- #
# Attention computation
# --------------------------------------------------------------------------------------- #
total_seq_len = key_states_total.shape[1]
attn_output = infinicore.empty_like(query_states)
for i in range(0, bs):
query_states_i = query_states.narrow(0, i, 1).view(
(seq_len, self.num_attention_heads, self.head_dim)
)
key_states_i = key_states_total.narrow(0, i, 1).view(
(total_seq_len, self.num_key_value_heads, self.head_dim)
)
value_states_i = value_states_total.narrow(0, i, 1).view(
(total_seq_len, self.num_key_value_heads, self.head_dim)
)
attn_output_i = attn_output.narrow(0, i, 1).view(
(seq_len, self.num_attention_heads, self.head_dim)
)
attention_i = grouped_query_attention(
query_states_i, key_states_i, value_states_i, scaling=self.scaling
)
attn_output_i.copy_(attention_i)
# --------------------------------------------------------------------------------------- #
# Output projection
# --------------------------------------------------------------------------------------- #
# ([bs, seq_len, num_attention_heads, head_dim]) ==> [bs, seq_len, hidden_size ]
attn_output = attn_output.view(hidden_states_shape)
# o_proj
return self.o_proj(attn_output)
class LlamaDecoderLayer(infinicore.nn.Module):
def __init__(self, config: LlamaConfig, layer_idx: int, **kwargs):
super().__init__()
hidden_size = config.hidden_size
rms_norm_eps = config.rms_norm_eps
self.self_attn = LlamaAttention(config=config, layer_idx=layer_idx, **kwargs)
self.mlp = LlamaMLP(config=config, **kwargs)
self.input_layernorm = LlamaRMSNorm(hidden_size, eps=rms_norm_eps, **kwargs)
self.post_attention_layernorm = LlamaRMSNorm(
hidden_size, eps=rms_norm_eps, **kwargs
)
def forward(
self,
hidden_states: infinicore.Tensor, # [bs, seq_len, hidden_size]
past_key_values: Optional[Cache] = None,
use_cache: Optional[bool] = False,
rope_instance=None,
**kwargs,
) -> infinicore.Tensor:
# ------------------------------------------------ #
# Self Attention
# ------------------------------------------------ #
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
hidden_states = self.self_attn(
hidden_states=hidden_states,
past_key_values=past_key_values,
use_cache=use_cache,
rope_instance=rope_instance,
**kwargs,
)
hidden_states = residual + hidden_states
# ------------------------------------------------ #
# Fully Connected
# ------------------------------------------------ #
residual = hidden_states
hidden_states = self.post_attention_layernorm(hidden_states)
hidden_states = self.mlp(hidden_states)
hidden_states = residual + hidden_states
return hidden_states
class LlamaModel(infinicore.nn.Module):
def __init__(self, config: LlamaConfig, **kwargs):
super().__init__()
self.config = config
self.padding_idx = config.pad_token_id
head_dim = getattr(
config, "head_dim", config.hidden_size // config.num_attention_heads
)
self.embed_tokens = infinicore.nn.Embedding(
config.vocab_size, config.hidden_size, **kwargs
)
self.layers = infinicore.nn.ModuleList(
[
LlamaDecoderLayer(config, layer_idx, **kwargs)
for layer_idx in range(config.num_hidden_layers)
]
)
self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps, **kwargs)
self.rope_instance = infinicore.nn.RoPE(
max_position_embeddings=config.max_position_embeddings,
rope_theta=config.rope_theta,
head_dim=head_dim,
**kwargs,
)
def forward(
self,
input_ids,
cache_position,
past_key_values: Optional[Cache] = None,
use_cache: Optional[bool] = None, # True
**kwargs,
):
if use_cache and past_key_values is None:
past_key_values = DynamicCache(config=self.config)
# --------------------------------------------------------- #
# Token embedding
# --------------------------------------------------------- #
# input_ids : {1,5} tensor([[ 1, 1128, 526, 366, 29892]])
# inputs_embeds : {1,5,2048} tensor([[[...]]])
inputs_embeds = self.embed_tokens(input_ids)
# --------------------------------------------------------- #
# decoder_layer
# --------------------------------------------------------- #
hidden_states = inputs_embeds
for decoder_layer in self.layers[: self.config.num_hidden_layers]:
hidden_states = decoder_layer(
hidden_states,
past_key_values=past_key_values,
cache_position=cache_position,
rope_instance=self.rope_instance,
**kwargs,
)
# --------------------------------------------------------- #
# norm
# --------------------------------------------------------- #
seq_len = hidden_states.shape[1]
last_token = hidden_states.narrow(1, seq_len - 1, 1)
return self.norm(last_token)
class LlamaForCausalLM(infinicore.nn.Module, GenerationMixin):
config: LlamaConfig
def __init__(self, config, **kwargs):
super().__init__()
self.config = config
self.model = LlamaModel(config, **kwargs)
self.lm_head = infinicore.nn.Linear(
config.hidden_size,
config.vocab_size,
bias=False,
**kwargs,
)
def forward(
self,
input_ids,
cache_position,
past_key_values: Optional[Cache] = None,
use_cache: Optional[bool] = None,
**kwargs,
):
last_token = self.model(
input_ids,
cache_position,
past_key_values=past_key_values,
use_cache=use_cache,
**kwargs,
)
return self.lm_head(last_token)
@classmethod
def from_pretrained(
cls,
model_path: Optional[Union[str, os.PathLike]],
device: infinicore.device,
dtype=infinicore.dtype,
):
def load_config_json(dir_path_: str):
with open(os.path.join(dir_path_, "config.json"), "r") as f:
config = json.load(f)
return config
config_dict = load_config_json(os.path.join(model_path))
config = LlamaConfig(**config_dict)
return LlamaForCausalLM(config, device=device, dtype=dtype)
__all__ = [
"LlamaModel",
"LlamaForCausalLM",
]