jerrrrry / infinilm / Commits / 3ddffe8d

Commit 3ddffe8d authored Nov 20, 2025 by pengcheng888

issue/76 - Add a Python implementation of the Llama model

parent 4fd9d490
Showing 11 changed files with 1499 additions and 0 deletions (+1499, -0)
README.md                                             +16  -0
examples/llama.py                                     +153 -0
python/infinilm/__init__.py                           +1   -0
python/infinilm/cache_utils.py                        +199 -0
python/infinilm/configuration_utils.py                +115 -0
python/infinilm/generation/utils.py                   +244 -0
python/infinilm/modeling_utils.py                     +92  -0
python/infinilm/models/__init__.py                    +1   -0
python/infinilm/models/llama/__init__.py              +2   -0
python/infinilm/models/llama/configuration_llama.py   +233 -0
python/infinilm/models/llama/modeling_llama.py        +443 -0
README.md

@@ -35,3 +35,19 @@ python scripts/test_perf.py
```bash
python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MAX_BATCH] [--max-tokens MAX_TOKENS]
```

## Usage (new)

- Build and install `InfiniCore`; see the InfiniCore [`README`](https://github.com/InfiniTensor/InfiniCore) for details:
  - Set the `INFINI_ROOT` environment variable as prompted (defaults to `$HOME/.infini`)
  - Pick the xmake build configuration that matches your hardware platform
  - Build and install InfiniCore
    - Install the C++ library
    - Install the Python package
- Single-inference test
  - Llama example

```bash
python examples/llama.py [--cpu | --nvidia] --model_path=<path/to/model_dir>
```
\ No newline at end of file
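The "Single-inference test" bullet above is exercised by examples/llama.py, added in full below. Distilled to its essentials, the flow is the following sketch; it assumes the repository's python/ directory is importable, the model directory path is a placeholder, and the device/dtype choices simply mirror the example script:

import infinicore
import infinilm
from infinilm.modeling_utils import get_model_state_dict
from transformers import AutoTokenizer

device = infinicore.device("cuda", 0)  # or infinicore.device("cpu", 0)
dtype = infinicore.bfloat16

# Build the model from config.json, then load the safetensors weights separately.
model = infinilm.LlamaForCausalLM.from_pretrained("/path/to/model_dir", device=device, dtype=dtype)
model.load_state_dict(get_model_state_dict("/path/to/model_dir", device=device, dtype=dtype))

tokenizer = AutoTokenizer.from_pretrained("/path/to/model_dir")
input_ids = infinicore.from_list([tokenizer.encode("Hello")])
model.generate(input_ids, max_new_tokens=32, device=device, tokenizer=tokenizer, config=model.config)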
examples/llama.py 0 → 100644

import sys
import time
import os

sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../python"))

import argparse

import infinilm
from infinilm.modeling_utils import get_model_state_dict
from tokenizers import decoders as _dec
from transformers import AutoTokenizer

import infinicore


def get_args():
    parser = argparse.ArgumentParser(description="run Llama args")
    parser.add_argument("--cpu", action="store_true", help="Run cpu test")
    parser.add_argument("--nvidia", action="store_true", help="Run nvidia test")
    parser.add_argument("--metax", action="store_true", help="Run metax test")
    parser.add_argument("--model_path", type=str, required=True, help="model_path")
    parser.add_argument("--max_new_tokens", type=int, default=100, help="max_new_tokens")
    return parser.parse_args()


def test(model_path, device_str="cuda", max_new_tokens=100):
    # ---------------------------------------------------------------------------- #
    # Create the model
    # ---------------------------------------------------------------------------- #
    infini_device = infinicore.device(device_str, 0)
    infini_dtype = infinicore.bfloat16

    model = infinilm.LlamaForCausalLM.from_pretrained(
        model_path,
        device=infini_device,
        dtype=infini_dtype,
    )

    # ---------------------------------------------------------------------------- #
    # Load the weights
    # ---------------------------------------------------------------------------- #
    model_param_infini = get_model_state_dict(
        model_path,
        device=infini_device,
        dtype=infini_dtype,
    )
    model.load_state_dict(model_param_infini)
    config = model.config

    # ---------------------------------------------------------------------------- #
    # Create the tokenizer
    # ---------------------------------------------------------------------------- #
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    if "llama" == config.model_type:
        # Llama (SentencePiece) tokenizers that prepend "▁" and strip it again on decode
        # lose leading spaces when decoding token by token; swap in a decoder without Strip.
        backend = getattr(tokenizer, "backend_tokenizer", None)
        target = getattr(backend, "_tokenizer", backend)
        norm = getattr(target, "normalizer", None)
        dec = getattr(target, "decoder", None)
        sn = repr(norm)[:800] if norm is not None else ""
        sd = repr(dec)[:800] if dec is not None else ""
        has_prepend = "Prepend" in sn
        has_strip = "Strip" in sd
        if has_prepend and has_strip:
            target.decoder = _dec.Sequence(
                [
                    _dec.Replace("▁", " "),
                    _dec.ByteFallback(),
                    _dec.Fuse(),
                ]
            )

    # ---------------------------------------------------------------------------- #
    # Encode the prompt into tokens
    # ---------------------------------------------------------------------------- #
    prompt = "山东最高的山是?"
    input_content = tokenizer.apply_chat_template(
        conversation=[{"role": "user", "content": prompt}],
        add_generation_prompt=True,
        tokenize=False,
    )
    print(input_content, end="", flush=True)
    input_ids = tokenizer.encode(input_content)

    # ---------------------------------------------------------------------------- #
    # Autoregressive generation
    # ---------------------------------------------------------------------------- #
    input_ids_list = [input_ids]  # List: [[1, 1128, 526, 366, 29892]]
    input_ids_infini = infinicore.from_list(input_ids_list)

    t1 = time.time()
    model.generate(
        input_ids_infini,
        max_new_tokens=max_new_tokens,
        device=infini_device,
        tokenizer=tokenizer,
        config=config,
    )
    t2 = time.time()
    print(f"total_time: {round((t2 - t1) * 1000, 2)} ms")


if __name__ == "__main__":
    args = get_args()
    print(args)

    # Parse command line arguments
    device_type = "cpu"
    if args.cpu:
        device_type = "cpu"
    elif args.nvidia:
        device_type = "cuda"
    elif args.metax:
        device_type = "cuda"
    else:
        print("Usage: python examples/llama.py [--cpu | --nvidia] --model_path=<path/to/model_dir>")
        sys.exit(1)

    model_path = args.model_path
    max_new_tokens = args.max_new_tokens

    test(model_path, device_type, max_new_tokens)
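Per the script's own usage string and argument parser, typical invocations look like the following; the model directory is a placeholder, and --metax is also accepted (it currently maps to the "cuda" device, like --nvidia):

python examples/llama.py --nvidia --model_path=<path/to/model_dir>
python examples/llama.py --cpu --model_path=<path/to/model_dir> --max_new_tokens=50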
python/infinilm/__init__.py 0 → 100644

from .models import *
python/infinilm/cache_utils.py 0 → 100644

# Copyright (c) 2025, InfiniCore
#
# This file contains modified code derived from transformers
# implementation, which is licensed under the BSD 3-Clause License.
#
# The modifications include adaptations for the InfiniCore framework.
#
# Original transformers source:
# https://github.com/huggingface/transformers
#
# Referencing PyTorch v4.57.0
#
# The use of this file is governed by the BSD 3-Clause License.

from abc import ABC, abstractmethod
from typing import Any, Optional

import transformers.utils.logging as logging

import infinicore

logger = logging.get_logger(__name__)


class CacheLayerMixin(ABC):
    """Base, abstract class for a single layer's cache."""

    def __init__(self):
        self.keys, self.values = None, None

    def __repr__(self):
        return f"{self.__class__.__name__}"

    @abstractmethod
    def lazy_initialization(self, key_states: infinicore.Tensor): ...

    @abstractmethod
    def update(
        self,
        key_states: infinicore.Tensor,
        value_states: infinicore.Tensor,
        cache_kwargs: Optional[dict[str, Any]] = None,
    ) -> tuple[infinicore.Tensor, infinicore.Tensor]: ...


class DynamicLayer(CacheLayerMixin):
    """
    A cache layer that grows dynamically as more tokens are generated.
    It stores the key and value states as tensors of shape `[batch_size, seq_len, num_heads, head_dim]`.
    """

    def __init__(self, max_position_embeddings):
        super().__init__()
        self.max_position_embeddings = max_position_embeddings
        self.cache_position = 0

    def lazy_initialization(self, key_states: infinicore.Tensor):
        batch_size, seq_len, num_heads, head_dim = key_states.shape

        if self.keys is None:
            # First call: allocate buffers large enough for max_position_embeddings tokens.
            dtype, device = key_states.dtype, key_states.device
            self.cache_position = 0
            self.max_seq_len = max(self.max_position_embeddings, seq_len)
            self.keys = infinicore.empty(
                [batch_size, self.max_seq_len, num_heads, head_dim],
                dtype=dtype,
                device=device,
            )
            self.values = infinicore.empty(
                [batch_size, self.max_seq_len, num_heads, head_dim],
                dtype=dtype,
                device=device,
            )
        elif self.cache_position + seq_len >= self.max_seq_len:
            # Buffers are full: grow them (at least doubling) and copy the cached states over.
            dtype, device = key_states.dtype, key_states.device
            self.max_seq_len = max(self.max_seq_len * 2, self.cache_position + seq_len)
            keys_new = infinicore.empty(
                [batch_size, self.max_seq_len, num_heads, head_dim],
                dtype=dtype,
                device=device,
            )
            values_new = infinicore.empty(
                [batch_size, self.max_seq_len, num_heads, head_dim],
                dtype=dtype,
                device=device,
            )
            keys_new.narrow(1, 0, self.cache_position).copy_(
                self.keys.narrow(1, 0, self.cache_position)
            )
            values_new.narrow(1, 0, self.cache_position).copy_(
                self.values.narrow(1, 0, self.cache_position)
            )
            self.keys, self.values = keys_new, values_new

    def update(
        self,
        key_states: infinicore.Tensor,
        value_states: infinicore.Tensor,
        cache_kwargs: Optional[dict[str, Any]] = None,
    ):
        # Lazy initialization
        self.lazy_initialization(key_states)

        seq_len = key_states.shape[1]
        index = self.cache_position

        # Update the cache
        self.keys.narrow(1, index, seq_len).copy_(key_states)
        self.values.narrow(1, index, seq_len).copy_(value_states)
        self.cache_position += seq_len

        return (
            self.keys.narrow(1, 0, self.cache_position),
            self.values.narrow(1, 0, self.cache_position),
        )


class Cache:
    """
    A `Cache` is mostly a list of `CacheLayerMixin` objects, one per model layer. It serves as a container for the
    Cache of each layer.

    Args:
        layers (`Optional`, *optional*): A list of pre-created `CacheLayerMixin`.
    """

    def __init__(self, layers: Optional[list[CacheLayerMixin]] = None):
        self.layers = layers if layers is not None else []

    def update(
        self,
        key_states: infinicore.Tensor,
        value_states: infinicore.Tensor,
        layer_idx: int,
        cache_kwargs: Optional[dict[str, Any]] = None,
    ) -> tuple[infinicore.Tensor, infinicore.Tensor]:
        """
        Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.

        Parameters:
            key_states (`infinicore.Tensor`):
                The new key states to cache.
            value_states (`infinicore.Tensor`):
                The new value states to cache.
            layer_idx (`int`):
                The index of the layer to cache the states for.
            cache_kwargs (`dict[str, Any]`, *optional*):
                Additional arguments for the cache subclass.

        Return:
            A tuple containing the updated key and value states.
        """
        keys, values = self.layers[layer_idx].update(key_states, value_states, cache_kwargs)
        return keys.contiguous(), values.contiguous()


class DynamicCache(Cache):
    """
    A cache that grows dynamically as more tokens are generated. This is the default for generative models.
    It stores the key and value states as a list of `CacheLayer`, one for each layer.

    Args:
        config (`PretrainedConfig`, *optional*):
            The config of the model for which this Cache will be used.
    """

    def __init__(self, config=None):
        max_position_embeddings = config.max_position_embeddings

        layers = []
        # If a config is passed, use it to infer the layer types and initialize accordingly
        if config is not None:
            config = config.get_text_config()
            layer_types = None
            if layer_types is None:
                layer_types = ["full_attention" for _ in range(config.num_hidden_layers)]
            for layer_type in layer_types:
                layers.append(DynamicLayer(max_position_embeddings))

        super().__init__(layers=layers)
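The cache protocol these classes implement is small: each attention layer hands its freshly projected key/value states to Cache.update together with its layer index and gets back everything cached so far for that layer. A minimal sketch of that interaction, assuming the repository's python/ directory is importable; the shapes and config values are arbitrary illustrations:

import infinicore
from infinilm.cache_utils import DynamicCache
from infinilm.models.llama import LlamaConfig

config = LlamaConfig(num_hidden_layers=2, max_position_embeddings=64)
cache = DynamicCache(config=config)  # one DynamicLayer per hidden layer

# One prefill step for layer 0; layout is [batch_size, seq_len, num_key_value_heads, head_dim].
cpu = infinicore.device("cpu", 0)
key_states = infinicore.empty([1, 5, 4, 8], dtype=infinicore.float32, device=cpu)
value_states = infinicore.empty([1, 5, 4, 8], dtype=infinicore.float32, device=cpu)

keys_total, values_total = cache.update(key_states, value_states, layer_idx=0)
# keys_total / values_total now span every token cached so far for this layer: [1, 5, 4, 8]

On later decoding steps the same call appends one token per layer, and DynamicLayer.lazy_initialization grows the backing buffers (at least doubling max_seq_len) whenever the cached length would overflow them.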
python/infinilm/configuration_utils.py 0 → 100644

# Copyright (c) 2025, InfiniCore
#
# This file contains modified code derived from transformers
# implementation, which is licensed under the BSD 3-Clause License.
#
# The modifications include adaptations for the InfiniCore framework.
#
# Original transformers source:
# https://github.com/huggingface/transformers
#
# Referencing PyTorch v4.57.0
#
# The use of this file is governed by the BSD 3-Clause License.

import copy
from typing import Any


class PretrainedConfig:
    def __init__(self, *args, **kwargs):
        pass

    def to_dict(self) -> dict[str, Any]:
        """
        Serializes this instance to a Python dictionary.

        Returns:
            `dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
        """
        output = copy.deepcopy(self.__dict__)

        if hasattr(self.__class__, "model_type"):
            output["model_type"] = self.__class__.model_type

        # Transformers version when serializing the model
        output["transformers_version"] = "None"

        for key, value in output.items():
            # Deal with nested configs like CLIP
            if isinstance(value, PretrainedConfig):
                value = value.to_dict()
                del value["transformers_version"]

            output[key] = value

        self.dict_dtype_to_str(output)

        return output

    def is_encoder_decoder(self):
        return False

    def dict_dtype_to_str(self, d: dict[str, Any]) -> None:
        """
        Checks whether the passed dictionary and its nested dicts have a *dtype* key and if it's not None,
        converts torch.dtype to a string of just the type. For example, `torch.float32` gets converted into the
        *"float32"* string, which can then be stored in the json format.
        """
        if d.get("dtype") is not None and not isinstance(d["dtype"], str):
            d["dtype"] = str(d["dtype"]).split(".")[1]
        for value in d.values():
            if isinstance(value, dict):
                self.dict_dtype_to_str(value)

    def get_text_config(self, decoder=None, encoder=None):
        return_both = decoder == encoder  # both unset or both set -> search all possible names
        decoder_possible_text_config_names = ("decoder", "generator", "text_config")
        encoder_possible_text_config_names = ("text_encoder",)
        if return_both:
            possible_text_config_names = (
                encoder_possible_text_config_names + decoder_possible_text_config_names
            )
        elif decoder:
            possible_text_config_names = decoder_possible_text_config_names
        else:
            possible_text_config_names = encoder_possible_text_config_names

        valid_text_config_names = []
        for text_config_name in possible_text_config_names:
            if hasattr(self, text_config_name):
                text_config = getattr(self, text_config_name, None)
                if text_config is not None:
                    valid_text_config_names += [text_config_name]

        if len(valid_text_config_names) > 1:
            raise ValueError(
                f"Multiple valid text configs were found in the model config: {valid_text_config_names}. In this "
                "case, using `get_text_config()` would be ambiguous. Please specify the desired text config directly, "
                "e.g. `text_config = config.sub_config_name`"
            )
        elif len(valid_text_config_names) == 1:
            config_to_return = getattr(self, valid_text_config_names[0])
        else:
            config_to_return = self

        # handle legacy models with flat config structure, when we only want one of the configs
        if (
            not return_both
            and len(valid_text_config_names) == 0
            and config_to_return.is_encoder_decoder
        ):
            config_to_return = copy.deepcopy(config_to_return)
            prefix_to_discard = "encoder" if decoder else "decoder"
            for key in config_to_return.to_dict():
                if key.startswith(prefix_to_discard):
                    delattr(config_to_return, key)

            # old encoder/decoder models may use "encoder_layers"/"decoder_layers" instead of "num_hidden_layers"
            if decoder and hasattr(config_to_return, "decoder_layers"):
                config_to_return.num_hidden_layers = config_to_return.decoder_layers
            elif encoder and hasattr(config_to_return, "encoder_layers"):
                config_to_return.num_hidden_layers = config_to_return.encoder_layers

        return config_to_return
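For a flat, decoder-only configuration such as LlamaConfig below, none of the nested names (text_config, decoder, generator, text_encoder) exist as attributes, so get_text_config() simply returns the config itself; that is the path DynamicCache relies on. A small illustration, assuming the repository's python/ directory is importable and using arbitrary values:

from infinilm.models.llama import LlamaConfig

config = LlamaConfig(num_hidden_layers=4)
assert config.get_text_config() is config  # flat config: returned unchanged
print(config.to_dict()["model_type"])  # "llama", injected by to_dict from the class attribute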
python/infinilm/generation/utils.py 0 → 100644

import time
from typing import Optional

import infinicore

from ..cache_utils import Cache, DynamicCache

import numpy as np


def infini_to_ctype_dtype(infini_dtype):
    """Convert an infinicore data type to the corresponding ctypes type."""
    import ctypes

    if infini_dtype == infinicore.int32:
        return ctypes.c_int32
    elif infini_dtype == infinicore.float32:
        return ctypes.c_float
    else:
        raise ValueError(f"Unsupported py_dtype: {infini_dtype}")


def infini_to_numpy(infini_tensor: infinicore.Tensor):
    if infini_tensor.device.type != "cpu":
        infini_tensor_cpu = infini_tensor.to(infinicore.device("cpu", 0))
    else:
        infini_tensor_cpu = infini_tensor

    # Get the data pointer and shape information
    data_ptr = infini_tensor_cpu.data_ptr()
    num_elements = infini_tensor_cpu.numel()
    original_shape = infini_tensor_cpu.shape

    # Create a 1D NumPy array (sharing the tensor's memory)
    ArrayType = infini_to_ctype_dtype(infini_tensor_cpu.dtype) * num_elements
    array = ArrayType.from_address(data_ptr)
    np_flat = np.ctypeslib.as_array(array)

    # Reshape to the original shape
    np_array = np_flat.reshape(original_shape)

    return np.copy(np_array)


infinicore.Tensor.to_numpy = infini_to_numpy


class GenerationMixin:
    def _get_initial_cache_position(
        self,
        bs: int,
        seq_length: int,
        device: infinicore.device,
    ) -> infinicore.Tensor:
        """Calculates `cache_position` for the pre-fill stage."""
        cache_position_list = [list(range(0, seq_length)) for i in range(bs)]
        return infinicore.from_list(cache_position_list, dtype=infinicore.int64, device=device)

    def prepare_inputs_for_generation(
        self,
        device: infinicore.device,
        past_key_values: Optional[Cache] = None,
        **kwargs,
    ):
        """Prepare the model inputs for generation."""
        # 1. Handle BC:
        model_inputs = {}

        # -------------------------------------------------------------------- #
        # Required: the KV cache
        # -------------------------------------------------------------------- #
        if past_key_values is not None:
            model_inputs["past_key_values"] = past_key_values

        # -------------------------------------------------------------------- #
        # Required: cache_position
        # -------------------------------------------------------------------- #
        current_cache_position = kwargs.get("cache_position", None)
        if current_cache_position is None:
            # prefill stage
            bs, seq_len = kwargs["input_ids"].shape[0:2]
            model_inputs["cache_position"] = self._get_initial_cache_position(bs, seq_len, device)
        else:
            # decode stage
            bs, seq_len = current_cache_position.shape
            last_position = current_cache_position.narrow(1, seq_len - 1, 1)
            one_value = infinicore.from_list(
                [1],
                dtype=last_position.dtype,
                device=last_position.device,
            ).view((bs, 1))
            next_position = one_value + last_position
            model_inputs["cache_position"] = next_position

        # -------------------------------------------------------------------- #
        # Required: the input_ids of the next token
        # -------------------------------------------------------------------- #
        if kwargs.get("next_token_id", None) is not None:
            next_token_id = kwargs["next_token_id"]
            model_inputs["input_ids"] = infinicore.from_list([[next_token_id]])

        # -------------------------------------------------------------------- #
        # Everything else
        # -------------------------------------------------------------------- #
        for key, value in kwargs.items():
            if key not in model_inputs:
                model_inputs[key] = value

        return model_inputs

    def generate(
        self,
        input_ids: infinicore.Tensor,
        max_new_tokens: int,
        device: infinicore.device,
        tokenizer,
        config,
        **kwargs,
    ):
        model_kwargs = kwargs

        # -------------------------------------------------------------------- #
        # Create the cache                                                      #
        # -------------------------------------------------------------------- #
        model_kwargs["use_cache"] = True
        model_kwargs["past_key_values"] = DynamicCache(config=self.config)

        # -------------------------------------------------------------------- #
        # The _sample function                                                  #
        # -------------------------------------------------------------------- #
        result = self._sample(
            input_ids,
            max_new_tokens=max_new_tokens,
            device=device,
            tokenizer=tokenizer,
            config=config,
            **model_kwargs,
        )

        return result

    def _sample(
        self,
        input_ids: infinicore.Tensor,
        max_new_tokens: int,
        device: infinicore.device,
        tokenizer,
        config,
        **model_kwargs,
    ):
        r"""
        Generates sequences of token ids for models with a language modeling head.

        Parameters:
            input_ids (batch_size, seq_len): The sequence used as a prompt for the generation.
            max_new_tokens: Maximum number of new tokens.
            device: infinicore.device.
            tokenizer: translates token ids back into raw text.
        """
        batch_size, seq_len = input_ids.shape[:2]
        eos_token_id = config.eos_token_id
        eos_token_id_list = [eos_token_id] if isinstance(eos_token_id, int) else eos_token_id

        # -------------------------------------------------------------------------- #
        # Initialize cache_position
        # -------------------------------------------------------------------------- #
        output_tokens_list = []
        model_kwargs["input_ids"] = input_ids
        model_kwargs["cache_position"] = None
        output_content = ""
        print()

        time_list = []
        for i in range(0, max_new_tokens):
            # ---------------------------------------------------------------------- #
            # Prepare model inputs
            # ---------------------------------------------------------------------- #
            model_inputs = self.prepare_inputs_for_generation(device, **model_kwargs)
            model_kwargs["cache_position"] = model_inputs["cache_position"]

            # ---------------------------------------------------------------------- #
            # Run one forward pass
            # ---------------------------------------------------------------------- #
            start_time = time.time()
            logits = self.forward(**model_inputs, return_dict=True)

            # ---------------------------------------------------------------------- #
            # Process the output
            # ---------------------------------------------------------------------- #
            token_scores = logits

            # ---------------------------------------------------------------------- #
            # random_sample
            # ---------------------------------------------------------------------- #
            batch_size, _, vocab_size = token_scores.shape
            next_tokens = infinicore.empty(
                (batch_size,),
                dtype=infinicore.int32,
                device=token_scores.device,
            )
            for i in range(0, batch_size):
                score = token_scores.narrow(0, i, 1).view([vocab_size])
                out = next_tokens.narrow(0, i, 1).view([])
                infinicore.nn.functional.random_sample(
                    score,
                    0.8,
                    0.1,
                    1,
                    1.0,
                    out=out,
                )
            end_time = time.time()
            time_list.append((end_time - start_time) * 1000)

            # ----------------------------------------------------------------- #
            # Get the id of the next token and decode it into text
            # ----------------------------------------------------------------- #
            token_id = next_tokens.to_numpy()[0]
            output_str = tokenizer.decode([token_id], skip_special_tokens=True)

            model_kwargs["next_token_id"] = token_id
            output_tokens_list.append(token_id)
            output_content += output_str

            print(output_str, end="", flush=True)

            if token_id in eos_token_id_list:
                break

        print(f"\n\nTime per step: {round(sum(time_list) / len(time_list), 2)} ms\n")

        return output_tokens_list, output_content
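The prefill/decode split in prepare_inputs_for_generation comes down to how cache_position is built: on the first call it is the full range of prompt positions, and on every later call it is the previous last position plus one. A plain-Python illustration of the values involved (just the numbers the infinicore tensors would hold), assuming a batch of one and a 5-token prompt:

# Prefill: cache_position is None, so it becomes one row of range(seq_len) per batch element.
prefill_cache_position = [list(range(0, 5))]  # [[0, 1, 2, 3, 4]]

# Decode: take the last position of each row and add 1.
last_position = [row[-1] for row in prefill_cache_position]  # [4]
decode_cache_position = [[p + 1] for p in last_position]  # [[5]]

# Each subsequent step repeats the decode branch: [[6]], [[7]], ...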
python/infinilm/modeling_utils.py 0 → 100644

import os
from typing import Dict, Optional, Union

import torch
from safetensors import safe_open

# from safetensors.torch import load_file as safe_load_file
# from safetensors.torch import save_file as safe_save_file

import infinicore

str_to_torch_dtype = {
    "BOOL": torch.bool,
    "U8": torch.uint8,
    "I8": torch.int8,
    "I16": torch.int16,
    "F16": torch.float16,
    "BF16": torch.bfloat16,
    "I32": torch.int32,
    "F32": torch.float32,
    "F64": torch.float64,
    "I64": torch.int64,
    "F8_E4M3": torch.float8_e4m3fn,
    "F8_E5M2": torch.float8_e5m2,
}


def load_state_dict(
    checkpoint_file: Union[str, os.PathLike],
    map_location: Optional[Union[str, torch.device]] = "cpu",
    weights_only: bool = True,
) -> Dict[str, torch.Tensor]:
    """
    Reads a `safetensors` checkpoint file. We load the checkpoint on "cpu" by default.
    """
    # Use safetensors if possible
    if not checkpoint_file.endswith(".safetensors"):
        return {}

    state_dict = {}
    with safe_open(checkpoint_file, framework="pt") as f:
        metadata = f.metadata()

        if metadata is not None and metadata.get("format") not in ["pt", "tf", "flax", "mlx"]:
            raise OSError(
                f"The safetensors archive passed at {checkpoint_file} does not contain the valid metadata."
            )

        for k in f.keys():
            if map_location == "meta":
                _slice = f.get_slice(k)
                k_dtype = _slice.get_dtype()
                if k_dtype in str_to_torch_dtype:
                    dtype = str_to_torch_dtype[k_dtype]
                else:
                    raise ValueError(f"Cannot load safetensors of unknown dtype {k_dtype}")
                state_dict[k] = torch.empty(size=_slice.get_shape(), dtype=dtype, device="meta")
            else:
                state_dict[k] = f.get_tensor(k)

    return state_dict


def get_model_state_dict(
    model_path: str,
    device: infinicore.device,
    dtype=infinicore.dtype,
) -> Dict[str, infinicore.Tensor]:
    """
    Load the model weights.
    """
    path = os.path.join(model_path, "model.safetensors")
    model_param = load_state_dict(path)

    torch_device = device.type
    torch_dtype = infinicore.utils.to_torch_dtype(dtype)

    model_param_infini = {}

    # Move the torch tensors to the target device/dtype, then wrap them as infinicore tensors.
    for key, value in model_param.items():
        model_param[key] = value.to(device=torch_device, dtype=torch_dtype)

    for key, value in model_param.items():
        model_param_infini[key] = infinicore.from_torch(model_param[key])

    return model_param_infini
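Note that get_model_state_dict hard-codes the single-file name model.safetensors, so sharded checkpoints are not picked up by this helper. The lower-level load_state_dict can also be pointed at a checkpoint with map_location="meta" to inspect tensor names, shapes and dtypes without materializing any data, which is a cheap sanity check before a full load; a short sketch with the path as a placeholder:

from infinilm.modeling_utils import load_state_dict

meta_state = load_state_dict("/path/to/model_dir/model.safetensors", map_location="meta")
for name, tensor in list(meta_state.items())[:5]:
    print(name, tuple(tensor.shape), tensor.dtype)  # meta tensors carry shapes/dtypes only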
python/infinilm/models/__init__.py 0 → 100644

from .llama import *
python/infinilm/models/llama/__init__.py 0 → 100644

from .configuration_llama import *  # noqa: F403
from .modeling_llama import *  # noqa: F403
python/infinilm/models/llama/configuration_llama.py 0 → 100644

# coding=utf-8
# Copyright (c) 2025, InfiniCore
#
# This file contains modified code derived from transformers
# implementation, which is licensed under the BSD 3-Clause License.
#
# The modifications include adaptations for the InfiniCore framework.
#
# Original transformers source:
# https://github.com/huggingface/transformers
#
# Referencing PyTorch v4.57.0
#
# The use of this file is governed by the BSD 3-Clause License.

"""LLaMA model configuration"""

from ...configuration_utils import PretrainedConfig


class LlamaConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the LLaMA-7B.

    e.g. [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`LlamaModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with. Llama 1 supports up to 2048 tokens,
            Llama 2 up to 4096, CodeLlama up to 16384.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 1):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            End of stream token id.
        pretraining_tp (`int`, *optional*, defaults to 1):
            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
            document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to
            understand more about it. This value is necessary to ensure exact reproducibility of the pretraining
            results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232).
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
            accordingly.
            Expected contents:
                `rope_type` (`str`):
                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                    'llama3'], with 'default' being the original RoPE implementation.
                `factor` (`float`, *optional*):
                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                    original maximum pre-trained length.
                `original_max_position_embeddings` (`int`, *optional*):
                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                    pretraining.
                `attention_factor` (`float`, *optional*):
                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                    computation. If unspecified, it defaults to value recommended by the implementation, using the
                    `factor` field to infer the suggested value.
                `beta_fast` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                    ramp function. If unspecified, it defaults to 32.
                `beta_slow` (`float`, *optional*):
                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                    ramp function. If unspecified, it defaults to 1.
                `short_factor` (`list[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `long_factor` (`list[float]`, *optional*):
                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                    size divided by the number of attention heads divided by 2
                `low_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
                `high_freq_factor` (`float`, *optional*):
                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
        head_dim (`int`, *optional*):
            The attention head dimension. If None, it will default to hidden_size // num_attention_heads

    ```python
    >>> from transformers import LlamaModel, LlamaConfig

    >>> # Initializing a LLaMA llama-7b style configuration
    >>> configuration = LlamaConfig()

    >>> # Initializing a model from the llama-7b style configuration
    >>> model = LlamaModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "llama"
    keys_to_ignore_at_inference = ["past_key_values"]

    # Default tensor parallel plan for base model `LlamaModel`
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=4096,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        max_position_embeddings=2048,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        mlp_bias=False,
        head_dim=None,
        **kwargs,
    ):
        # ---
        self.model_type = "llama"
        self.name_or_path = ""
        self.pad_token_id = pad_token_id
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id

        # ---
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.mlp_bias = mlp_bias
        self.head_dim = (
            head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
        )

        # Validate the correctness of rotary position embeddings parameters
        # BC: if there is a 'type' field, copy it to 'rope_type'.
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
        # rope_config_validation(self)

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


__all__ = ["LlamaConfig"]
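LlamaForCausalLM.from_pretrained in the next file builds this config simply by splatting the model directory's config.json into the constructor; keys the constructor does not recognize end up in **kwargs and are discarded by the stub PretrainedConfig.__init__. The equivalent stand-alone snippet, with the path as a placeholder:

import json

from infinilm.models.llama import LlamaConfig

with open("/path/to/model_dir/config.json") as f:
    config = LlamaConfig(**json.load(f))

print(config.num_hidden_layers, config.num_key_value_heads, config.head_dim)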
python/infinilm/models/llama/modeling_llama.py 0 → 100644

# Copyright (c) 2025, InfiniCore
#
# This file contains modified code derived from transformers
# implementation, which is licensed under the BSD 3-Clause License.
#
# The modifications include adaptations for the InfiniCore framework.
#
# Original transformers source:
# https://github.com/huggingface/transformers
#
# Referencing PyTorch v4.57.0
#
# The use of this file is governed by the BSD 3-Clause License.

import json
import os
from typing import Optional, Union

from transformers.utils import logging

import infinicore

from ...cache_utils import Cache, DynamicCache
from ...generation.utils import GenerationMixin
from .configuration_llama import LlamaConfig

logger = logging.get_logger(__name__)


def repeat_kv(keys: infinicore.Tensor, values: infinicore.Tensor, ngroup: int):
    total_seq_len, num_key_value_heads, head_dim = keys.shape

    keys_repeat = infinicore.empty(
        (total_seq_len, num_key_value_heads, ngroup, head_dim),
        dtype=keys.dtype,
        device=keys.device,
    )
    values_repeat = infinicore.empty(
        (total_seq_len, num_key_value_heads, ngroup, head_dim),
        dtype=values.dtype,
        device=values.device,
    )

    for i in range(ngroup):
        keys_repeat.narrow(2, i, 1).copy_(
            keys.view((total_seq_len, num_key_value_heads, 1, head_dim))
        )
        values_repeat.narrow(2, i, 1).copy_(
            values.view((total_seq_len, num_key_value_heads, 1, head_dim))
        )

    keys_new = keys_repeat.view((total_seq_len, num_key_value_heads * ngroup, head_dim))
    values_new = values_repeat.view((total_seq_len, num_key_value_heads * ngroup, head_dim))

    return keys_new, values_new


def multi_head_attention(
    querys: infinicore.Tensor,  # [seq_len, num_heads, head_dim]
    keys: infinicore.Tensor,  # [total_seq_len, num_heads, head_dim]
    values: infinicore.Tensor,  # [total_seq_len, num_heads, head_dim]
    scaling: float,
):
    # => [num_heads, seq_len, head_dim]
    Q = querys.permute((1, 0, 2))
    # => [num_heads, total_seq_len, head_dim]
    K = keys
    # => [num_heads, total_seq_len, head_dim]
    V = values.permute((1, 0, 2))

    # [num_heads, seq_len, head_dim] @ [num_heads, head_dim, total_seq_len]
    # => [num_heads, seq_len, total_seq_len]
    attn_weight = Q @ K.permute((1, 2, 0))
    scaling = infinicore.from_list(
        [scaling], dtype=attn_weight.dtype, device=attn_weight.device
    ).as_strided(attn_weight.shape, [0, 0, 0])
    attn_weight = attn_weight * scaling
    infinicore.nn.functional.causal_softmax(attn_weight, out=attn_weight)

    # [num_heads, seq_len, total_seq_len] @ [num_heads, total_seq_len, head_dim]
    # => [num_heads, seq_len, head_dim]
    out = attn_weight @ V

    # => [seq_len, num_heads, head_dim]
    return out.permute((1, 0, 2)).contiguous()


def grouped_query_attention(
    querys: infinicore.Tensor,  # [seq_len, num_attention_heads, head_dim]
    keys: infinicore.Tensor,  # [total_seq_len, num_key_value_heads, head_dim]
    values: infinicore.Tensor,  # [total_seq_len, num_key_value_heads, head_dim]
    scaling: float,
):
    num_attention_heads = querys.shape[1]
    num_key_value_heads = keys.shape[1]
    ngroup = num_attention_heads // num_key_value_heads

    if ngroup > 1:
        keys, values = repeat_kv(keys, values, ngroup)

    return multi_head_attention(querys, keys, values, scaling=scaling)


LlamaRMSNorm = infinicore.nn.RMSNorm


class LlamaMLP(infinicore.nn.Module):
    def __init__(self, config, **kwargs):
        super().__init__()
        hidden_size = config.hidden_size
        intermediate_size = config.intermediate_size
        mlp_bias = config.mlp_bias

        self.gate_proj = infinicore.nn.Linear(hidden_size, intermediate_size, bias=mlp_bias, **kwargs)
        self.up_proj = infinicore.nn.Linear(hidden_size, intermediate_size, bias=mlp_bias, **kwargs)
        self.down_proj = infinicore.nn.Linear(intermediate_size, hidden_size, bias=mlp_bias, **kwargs)
        self.act_fn = infinicore.nn.functional.silu

    def forward(self, x: infinicore.Tensor) -> infinicore.Tensor:
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
        return down_proj


class LlamaAttention(infinicore.nn.Module):
    def __init__(self, config: LlamaConfig, layer_idx: int, **kwargs):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx

        self.hidden_size = config.hidden_size
        self.num_attention_heads = config.num_attention_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_attention_heads // self.num_key_value_heads
        attention_bias = config.attention_bias
        self.head_dim = getattr(config, "head_dim", self.hidden_size // self.num_attention_heads)
        self.scaling = self.head_dim**-0.5

        self.q_proj = infinicore.nn.Linear(
            self.hidden_size,
            self.num_attention_heads * self.head_dim,
            bias=attention_bias,
            **kwargs,
        )
        self.k_proj = infinicore.nn.Linear(
            self.hidden_size,
            self.num_key_value_heads * self.head_dim,
            bias=attention_bias,
            **kwargs,
        )
        self.v_proj = infinicore.nn.Linear(
            self.hidden_size,
            self.num_key_value_heads * self.head_dim,
            bias=attention_bias,
            **kwargs,
        )
        self.o_proj = infinicore.nn.Linear(
            self.num_attention_heads * self.head_dim,
            self.hidden_size,
            bias=attention_bias,
            **kwargs,
        )

    def forward(
        self,
        hidden_states: infinicore.Tensor,
        past_key_values: Optional[Cache] = None,
        rope_instance: infinicore.nn.RoPE = None,
        **kwargs,
    ) -> infinicore.Tensor:
        hidden_states_shape = hidden_states.shape  # [bs, seq_len, hidden_size]
        bs, seq_len = hidden_states_shape[:-1]  # [bs, seq_len]

        querys_shape = (bs, seq_len, self.num_attention_heads, self.head_dim)
        keys_shape = (bs, seq_len, self.num_key_value_heads, self.head_dim)
        values_shape = (bs, seq_len, self.num_key_value_heads, self.head_dim)

        # --------------------------------------------------------------------------------------- #
        # Project Q, K and V
        # --------------------------------------------------------------------------------------- #
        # => [bs, seq_len, num_attention_heads, head_dim]
        query_states = self.q_proj(hidden_states).view(querys_shape)
        # => [bs, seq_len, num_key_value_heads, head_dim]
        key_states = self.k_proj(hidden_states).view(keys_shape)
        # => [bs, seq_len, nkvh, head_dim]
        value_states = self.v_proj(hidden_states).view(values_shape)

        # --------------------------------------------------------------------------------------- #
        # Apply RoPE to Q and K
        # --------------------------------------------------------------------------------------- #
        cache_position = kwargs.pop("cache_position", None)
        if cache_position is None:
            raise KeyError("cache_position error")
        if rope_instance is None:
            raise KeyError("rope_instance error")

        query_states = rope_instance(query_states, cache_position)
        key_states = rope_instance(key_states, cache_position)

        # --------------------------------------------------------------------------------------- #
        # KV cache
        # --------------------------------------------------------------------------------------- #
        if past_key_values is not None:
            cache_kwargs = {}
            key_states_total, value_states_total = past_key_values.update(
                key_states,  # [bs, seq_len, num_key_value_heads, head_dim]
                value_states,  # [bs, seq_len, num_key_value_heads, head_dim]
                self.layer_idx,
                cache_kwargs,
            )

        # --------------------------------------------------------------------------------------- #
        # Attention computation
        # --------------------------------------------------------------------------------------- #
        total_seq_len = key_states_total.shape[1]
        attn_output = infinicore.empty_like(query_states)
        for i in range(0, bs):
            query_states_i = query_states.narrow(0, i, 1).view(
                (seq_len, self.num_attention_heads, self.head_dim)
            )
            key_states_i = key_states_total.narrow(0, i, 1).view(
                (total_seq_len, self.num_key_value_heads, self.head_dim)
            )
            value_states_i = value_states_total.narrow(0, i, 1).view(
                (total_seq_len, self.num_key_value_heads, self.head_dim)
            )
            attn_output_i = attn_output.narrow(0, i, 1).view(
                (seq_len, self.num_attention_heads, self.head_dim)
            )

            attention_i = grouped_query_attention(
                query_states_i, key_states_i, value_states_i, scaling=self.scaling
            )
            attn_output_i.copy_(attention_i)

        # --------------------------------------------------------------------------------------- #
        # Output projection
        # --------------------------------------------------------------------------------------- #
        # [bs, seq_len, num_attention_heads, head_dim] ==> [bs, seq_len, hidden_size]
        attn_output = attn_output.view(hidden_states_shape)

        # o_proj
        return self.o_proj(attn_output)


class LlamaDecoderLayer(infinicore.nn.Module):
    def __init__(self, config: LlamaConfig, layer_idx: int, **kwargs):
        super().__init__()
        hidden_size = config.hidden_size
        rms_norm_eps = config.rms_norm_eps

        self.self_attn = LlamaAttention(config=config, layer_idx=layer_idx, **kwargs)
        self.mlp = LlamaMLP(config=config, **kwargs)
        self.input_layernorm = LlamaRMSNorm(hidden_size, eps=rms_norm_eps, **kwargs)
        self.post_attention_layernorm = LlamaRMSNorm(hidden_size, eps=rms_norm_eps, **kwargs)

    def forward(
        self,
        hidden_states: infinicore.Tensor,  # [bs, seq_len, hidden_size]
        past_key_values: Optional[Cache] = None,
        use_cache: Optional[bool] = False,
        rope_instance=None,
        **kwargs,
    ) -> infinicore.Tensor:
        # ------------------------------------------------ #
        # Self Attention
        # ------------------------------------------------ #
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        hidden_states = self.self_attn(
            hidden_states=hidden_states,
            past_key_values=past_key_values,
            use_cache=use_cache,
            rope_instance=rope_instance,
            **kwargs,
        )
        hidden_states = residual + hidden_states

        # ------------------------------------------------ #
        # Fully Connected
        # ------------------------------------------------ #
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        return hidden_states


class LlamaModel(infinicore.nn.Module):
    def __init__(self, config: LlamaConfig, **kwargs):
        super().__init__()
        self.config = config
        self.padding_idx = config.pad_token_id
        head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)

        self.embed_tokens = infinicore.nn.Embedding(config.vocab_size, config.hidden_size, **kwargs)
        self.layers = infinicore.nn.ModuleList(
            [LlamaDecoderLayer(config, layer_idx, **kwargs) for layer_idx in range(config.num_hidden_layers)]
        )
        self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps, **kwargs)
        self.rope_instance = infinicore.nn.RoPE(
            max_position_embeddings=config.max_position_embeddings,
            rope_theta=config.rope_theta,
            head_dim=head_dim,
            **kwargs,
        )

    def forward(
        self,
        input_ids,
        cache_position,
        past_key_values: Optional[Cache] = None,
        use_cache: Optional[bool] = None,  # True
        **kwargs,
    ):
        if use_cache and past_key_values is None:
            past_key_values = DynamicCache(config=self.config)

        # --------------------------------------------------------- #
        # Token embedding
        # --------------------------------------------------------- #
        # input_ids     : {1,5} tensor([[ 1, 1128, 526, 366, 29892]])
        # inputs_embeds : {1,5,2048} tensor([[[...]]])
        inputs_embeds = self.embed_tokens(input_ids)

        # --------------------------------------------------------- #
        # decoder_layer
        # --------------------------------------------------------- #
        ilayer = 0  # noqa: F841
        hidden_states = inputs_embeds
        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            # print("ilayer: ", ilayer)
            # ilayer += 1
            hidden_states = decoder_layer(
                hidden_states,
                past_key_values=past_key_values,
                cache_position=cache_position,
                rope_instance=self.rope_instance,
                **kwargs,
            )

        # --------------------------------------------------------- #
        # norm
        # --------------------------------------------------------- #
        seq_len = hidden_states.shape[1]
        last_token = hidden_states.narrow(1, seq_len - 1, 1)

        return self.norm(last_token)


class LlamaForCausalLM(infinicore.nn.Module, GenerationMixin):
    config: LlamaConfig

    def __init__(self, config, **kwargs):
        super().__init__()
        self.config = config
        self.model = LlamaModel(config, **kwargs)
        self.lm_head = infinicore.nn.Linear(
            config.hidden_size,
            config.vocab_size,
            bias=False,
            **kwargs,
        )

    def forward(
        self,
        input_ids,
        cache_position,
        past_key_values: Optional[Cache] = None,
        use_cache: Optional[bool] = None,
        **kwargs,
    ):
        last_token = self.model(
            input_ids,
            cache_position,
            past_key_values=past_key_values,
            use_cache=use_cache,
            **kwargs,
        )
        return self.lm_head(last_token)

    @classmethod
    def from_pretrained(
        cls,
        model_path: Optional[Union[str, os.PathLike]],
        device: infinicore.device,
        dtype=infinicore.dtype,
    ):
        def load_config_json(dir_path_: str):
            with open(os.path.join(dir_path_, "config.json"), "r") as f:
                config = json.load(f)
            return config

        config_dict = load_config_json(os.path.join(model_path))
        config = LlamaConfig(**config_dict)

        return LlamaForCausalLM(config, device=device, dtype=dtype)


__all__ = [
    "LlamaModel",
    "LlamaForCausalLM",
]
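For the grouped-query attention path above, the only bookkeeping is the group count ngroup = num_attention_heads // num_key_value_heads: repeat_kv tiles every key/value head ngroup times so the downstream matmuls in multi_head_attention see matching head counts. A plain-number illustration with hypothetical head counts:

num_attention_heads = 32
num_key_value_heads = 8
head_dim = 128
total_seq_len = 10

ngroup = num_attention_heads // num_key_value_heads  # 4
# keys/values enter repeat_kv as [total_seq_len, num_key_value_heads, head_dim] = [10, 8, 128]
# and come back as [total_seq_len, num_key_value_heads * ngroup, head_dim] = [10, 32, 128]
assert num_key_value_heads * ngroup == num_attention_heads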