jerrrrry / infinilm · Commits

Commit e4114c03, authored Nov 26, 2025 by pengcheng888

issue/83 - Add an AutoLlama class to support creating models with different backends

parent 5d182420
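In brief: the commit introduces an `AutoLlamaModel` factory that dispatches `from_pretrained` to either the existing pure-Python Llama implementation or a new (still stubbed) C++ backend, renames `cache_position` to `position_ids` throughout, and generalizes weight loading to sharded `*.safetensors` checkpoints. A minimal sketch of the new entry point, assembled from the diffs below (the model path is a placeholder):

```python
import infinicore
import infinilm

# Pick a backend at load time: "python" (default) or "cpp" (currently a stub).
model = infinilm.AutoLlamaModel.from_pretrained(
    "/path/to/TinyLlama-1.1B-Chat-v1.0",  # placeholder path
    device=infinicore.device("cpu", 0),
    dtype=infinicore.bfloat16,
    backend="python",
)
```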
Showing 10 changed files with 176 additions and 64 deletions.
README.md                                              +5   -1
examples/llama.py                                      +38  -18
python/infinilm/__init__.py                            +3   -1
python/infinilm/generation/utils.py                    +27  -19
python/infinilm/modeling_utils.py                      +16  -4
python/infinilm/models/__init__.py                     +3   -1
python/infinilm/models/llama/__init__.py               +35  -2
python/infinilm/models/llama/backends/cpp.py           +38  -0
python/infinilm/models/llama/configuration_llama.py    +0   -3
python/infinilm/models/llama/modeling_llama.py         +11  -15
README.md

@@ -49,5 +49,9 @@ python scripts/test_ppl.py --model-path MODEL_PATH [--ndev NDEV] [--max-batch MA
 - Single-run inference test
 - llama example
 ```bash
-python examples/llama.py [--cpu | --nvidia] --model_path=<path/to/model_dir>
+python examples/llama.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=<path/to/model_dir>
 ```
+For example:
+```bash
+python examples/llama.py --nvidia --model_path=~/TinyLlama-1.1B-Chat-v1.0
+```
\ No newline at end of file
examples/llama.py

@@ -53,20 +53,28 @@ def get_args():
         default=100,
         help="max_new_tokens",
     )
+    parser.add_argument(
+        "--backend",
+        type=str,
+        default="python",
+        help="python or cpp model",
+    )
     return parser.parse_args()


-def test(model_path, device_str="cuda", max_new_tokens=100):
+def test(
+    prompt,
+    model_path,
+    max_new_tokens=100,
+    infini_dtype=infinicore.bfloat16,
+    infini_device=infinicore.device("cpu", 0),
+    backend="python",
+):
     # ---------------------------------------------------------------------------- #
     # Create the model
     # ---------------------------------------------------------------------------- #
-    infini_device = infinicore.device(device_str, 0)
-    infini_dtype = infinicore.bfloat16
-    model = infinilm.LlamaForCausalLM.from_pretrained(
-        model_path, device=infini_device, dtype=infini_dtype
+    model = infinilm.AutoLlamaModel.from_pretrained(
+        model_path, device=infini_device, dtype=infini_dtype, backend=backend
     )

     # ---------------------------------------------------------------------------- #

@@ -85,7 +93,6 @@ def test(model_path, device_str="cuda", max_new_tokens=100):
     # ---------------------------------------------------------------------------- #
     # Create the tokenizer
     # ---------------------------------------------------------------------------- #
     tokenizer = AutoTokenizer.from_pretrained(model_path)
     if "llama" == config.model_type:

@@ -109,7 +116,7 @@ def test(model_path, device_str="cuda", max_new_tokens=100):
     # ---------------------------------------------------------------------------- #
     # Token encoding
     # ---------------------------------------------------------------------------- #
-    prompt = "山东最高的山是?"
+    # prompt = "山东最高的山是?"
     input_content = tokenizer.apply_chat_template(
         conversation=[{"role": "user", "content": prompt}],
         add_generation_prompt=True,

@@ -144,24 +151,37 @@ if __name__ == "__main__":
     print(args)

     # Parse command line arguments
-    device_type = "cpu"
+    device_str = "cpu"
     if args.cpu:
-        device_type = "cpu"
+        device_str = "cpu"
     elif args.nvidia:
-        device_type = "cuda"
+        device_str = "cuda"
     elif args.metax:
-        device_type = "cuda"
+        device_str = "cuda"
    elif args.moore:
-        device_type = "musa"
+        device_str = "musa"
     elif args.iluvatar:
-        device_type = "cuda"
+        device_str = "cuda"
     else:
         print(
-            "Usage: python examples/llama.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=<path/to/model_dir>"
+            "Usage: python examples/llama.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=<path/to/model_dir>\n"
             "such as, python examples/llama.py --nvidia --model_path=~/TinyLlama-1.1B-Chat-v1.0"
         )
         sys.exit(1)

+    prompt = "山东最高的山是?"
     model_path = args.model_path
     max_new_tokens = args.max_new_tokens
+    backend = args.backend
+    infini_device = infinicore.device(device_str, 0)
+    infini_dtype = infinicore.bfloat16

-    test(model_path, device_type, max_new_tokens)
+    test(
+        prompt,
+        model_path,
+        max_new_tokens,
+        infini_device=infini_device,
+        infini_dtype=infini_dtype,
+        backend=backend,
+    )
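With the new flag, a run can select the backend explicitly, e.g. `python examples/llama.py --nvidia --backend=cpp --model_path=~/TinyLlama-1.1B-Chat-v1.0`; note that the `cpp` backend is still a stub (see `backends/cpp.py` below) and will raise `NotImplementedError`.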
python/infinilm/__init__.py

-from .models import *
+from .models import AutoLlamaModel
+
+__all__ = ["AutoLlamaModel"]
python/infinilm/generation/utils.py

@@ -43,17 +43,17 @@ infinicore.Tensor.to_numpy = infini_to_numpy
 class GenerationMixin:
-    def _get_initial_cache_position(
+    def _get_initial_position_ids(
         self,
         bs: int,
         seq_length: int,
         device: infinicore.device,
     ) -> infinicore.Tensor:
-        """Calculates `cache_position` for the pre-fill stage"""
-        cache_position_list = [list(range(0, seq_length)) for i in range(bs)]
+        """Calculates `position_ids` for the pre-fill stage"""
+        position_ids_list = [list(range(0, seq_length)) for i in range(bs)]
         return infinicore.from_list(
-            cache_position_list, dtype=infinicore.int64, device=device
+            position_ids_list, dtype=infinicore.int64, device=device
         )

     def prepare_inputs_for_generation(

@@ -73,29 +73,29 @@ class GenerationMixin:
         model_inputs["past_key_values"] = past_key_values

         # -------------------------------------------------------------------------- #
-        # Compute the required cache_position
+        # Compute the required position_ids
         # -------------------------------------------------------------------------- #
-        current_cache_position = kwargs.get("cache_position", None)
-        if current_cache_position is None:
+        current_position_ids = kwargs.get("position_ids", None)
+        if current_position_ids is None:
             # prefill stage
             bs, seq_len = kwargs["input_ids"].shape[0:2]
-            model_inputs["cache_position"] = self._get_initial_cache_position(
+            model_inputs["position_ids"] = self._get_initial_position_ids(
                 bs, seq_len, device
             )
         else:
             # decode stage
-            bs, seq_len = current_cache_position.shape
-            last_position = current_cache_position.narrow(1, seq_len - 1, 1)
+            bs, seq_len = current_position_ids.shape
+            last_position = current_position_ids.narrow(1, seq_len - 1, 1)
             one_value = infinicore.from_list(
-                [1],
+                [1] * bs,
                 dtype=last_position.dtype,
                 device=last_position.device,
             ).view((bs, 1))
             next_position = one_value + last_position
-            model_inputs["cache_position"] = next_position
+            model_inputs["position_ids"] = next_position

         # -------------------------------------------------------------------- #
         # Required: the tokens' input_ids

@@ -127,8 +127,12 @@ class GenerationMixin:
         # -------------------------------------------------------------------- #
         # Create the cache                                                     #
         # -------------------------------------------------------------------- #
-        model_kwargs["use_cache"] = True
-        model_kwargs["past_key_values"] = DynamicCache(config=self.config)
+        if self.use_cache:
+            model_kwargs["use_cache"] = True
+            model_kwargs["past_key_values"] = DynamicCache(config=self.config)
+        else:
+            model_kwargs["use_cache"] = False
+            model_kwargs["past_key_values"] = None

         # -------------------------------------------------------------------- #
         # The _sample function                                                 #

@@ -170,12 +174,12 @@ class GenerationMixin:
         )

         # -------------------------------------------------------------------------- #
-        # Initialize cache_position
+        # Initialize position_ids
         # -------------------------------------------------------------------------- #
         output_tokens_list = []
         model_kwargs["input_ids"] = input_ids
-        model_kwargs["cache_position"] = None
+        model_kwargs["position_ids"] = None
         output_content = ""

         print()

@@ -186,13 +190,13 @@ class GenerationMixin:
             # -------------------------------------------------------------------------- #
             model_inputs = self.prepare_inputs_for_generation(device, **model_kwargs)
-            model_kwargs["cache_position"] = model_inputs["cache_position"]
+            model_kwargs["position_ids"] = model_inputs["position_ids"]

             # -------------------------------------------------------------------------- #
             # Run the forward pass once
             # -------------------------------------------------------------------------- #
             start_time = time.time()
-            logits = self.forward(**model_inputs, return_dict=True)
+            logits = self(**model_inputs)

             # -------------------------------------------------------------------------- #
             # Process the output

@@ -237,8 +241,12 @@ class GenerationMixin:
             if token_id in eos_token_id_list:
                 break

         print("\n</s>")
+        print(
+            f"\n\n\nTime per step: prefill {round(time_list[0], 2)} token/ms\n",
+        )
         print(
-            f"\n\nTime per step: {round(sum(time_list) / len(time_list), 2)} ms\n",
+            f" Time per step: decoder {round(sum(time_list[1:]) / (len(time_list) - 1), 2)} token/ms\n",
         )

         return output_tokens_list, output_content
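The position bookkeeping this diff implements is simple: prefill assigns positions `0..seq_len-1` to every row of the batch, and each decode step advances the last position by one. A framework-free sketch of the same logic (plain lists; the helper names are hypothetical):

```python
def initial_position_ids(bs: int, seq_len: int) -> list[list[int]]:
    # Prefill: every sequence in the batch gets positions 0..seq_len-1.
    return [list(range(seq_len)) for _ in range(bs)]


def next_position_ids(current: list[list[int]]) -> list[list[int]]:
    # Decode: the next token's position is the last position + 1, per row.
    return [[row[-1] + 1] for row in current]


assert initial_position_ids(2, 3) == [[0, 1, 2], [0, 1, 2]]
assert next_position_ids([[0, 1, 2], [0, 1, 2]]) == [[3], [3]]
```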
python/infinilm/modeling_utils.py

@@ -3,9 +3,8 @@ from typing import Dict, Optional, Union
 import torch
 from safetensors import safe_open
+import glob

-# from safetensors.torch import load_file as safe_load_file
-# from safetensors.torch import save_file as safe_save_file
 import infinicore

 str_to_torch_dtype = {

@@ -76,9 +75,19 @@ def get_model_state_dict(
     """
     Load the model weights.
     """
-    path = os.path.join(model_path, "model.safetensors")
-    model_param = load_state_dict(path)
+    # --------------------------------------------------------- #
+    # Load the weights from the *.safetensors files
+    # --------------------------------------------------------- #
+    model_param = {}
+    for file_path in glob.glob(os.path.join(model_path, "*.safetensors")):
+        model_param.update(load_state_dict(file_path))
+
+    if model_param.get("lm_head.weight", None) is None:
+        model_param["lm_head.weight"] = model_param["model.embed_tokens.weight"]

+    # --------------------------------------------------------- #
+    # Adjust the device and dtype of the weights
+    # --------------------------------------------------------- #
     torch_device = device.type
     torch_dtype = infinicore.utils.to_torch_dtype(dtype)

@@ -86,6 +95,9 @@ def get_model_state_dict(
     for key, value in model_param.items():
         model_param[key] = value.to(device=torch_device, dtype=torch_dtype)

+    # --------------------------------------------------------- #
+    # model_param_infini references the torch.Tensor objects
+    # --------------------------------------------------------- #
     for key, value in model_param.items():
         model_param_infini[key] = infinicore.from_torch(model_param[key])
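The loader now collects every `*.safetensors` shard in the directory instead of assuming a single `model.safetensors`, and falls back to tying `lm_head.weight` to the input embedding when the checkpoint omits it. A self-contained sketch of the same idea using the stock `safetensors` API (independent of infinicore; the path is a placeholder):

```python
import glob
import os

from safetensors.torch import load_file


def load_sharded_state_dict(model_path: str) -> dict:
    # Merge all *.safetensors shards found in the model directory.
    state_dict = {}
    for shard in sorted(glob.glob(os.path.join(model_path, "*.safetensors"))):
        state_dict.update(load_file(shard))
    # Tied embeddings: reuse the input embedding as the LM head if absent.
    if state_dict.get("lm_head.weight") is None:
        state_dict["lm_head.weight"] = state_dict["model.embed_tokens.weight"]
    return state_dict
```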
python/infinilm/models/__init__.py

-from .llama import *
+from .llama import AutoLlamaModel
+
+__all__ = ["AutoLlamaModel"]
python/infinilm/models/llama/__init__.py

 from .configuration_llama import *  # noqa: F403
 from .modeling_llama import *  # noqa: F403

+import os
+from typing import Optional, Union
+
+import infinicore
+
+__all__ = ["AutoLlamaModel"]
+
+
+class AutoLlamaModel:
+    @classmethod
+    def from_pretrained(
+        cls,
+        model_path: Optional[Union[str, os.PathLike]],
+        device: infinicore.device,
+        dtype=infinicore.dtype,
+        backend="python",
+    ):
+        if backend == "python":
+            from . import modeling_llama
+
+            return modeling_llama.LlamaForCausalLM.from_pretrained(
+                model_path,
+                device=device,
+                dtype=dtype,
+            )
+        elif backend == "cpp":
+            from .backends import cpp
+
+            return cpp.LlamaForCausalLM.from_pretrained(
+                model_path,
+                device=device,
+                dtype=dtype,
+            )
+
+        raise KeyError("invalid backend")
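The factory imports each backend lazily, so the C++ extension is only touched when `backend="cpp"` is actually requested; any unrecognized backend string falls through to the `KeyError`. Call sites that previously used `infinilm.LlamaForCausalLM.from_pretrained(...)` directly (as `examples/llama.py` did) now go through this single dispatch point.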
python/infinilm/models/llama/backends/cpp.py  (new file, 0 → 100644)

from ....generation.utils import GenerationMixin

import infinicore

import os
from typing import Optional, Union


class LlamaForCausalLM(GenerationMixin):
    def __init__(self):
        super().__init__()
        self.use_cache = False
        self._model = None
        raise NotImplementedError("NotImplementedError!!")

    def forward(self, input_ids, position_ids, *args, **kwargs):
        kv_caches = None
        return infinicore.Tensor(
            self._model.forward(input_ids, position_ids, kv_caches)
        )

    def __call__(self, input_ids, position_ids, *args, **kwargs):
        return self.forward(input_ids=input_ids, position_ids=position_ids)

    @classmethod
    def from_pretrained(
        cls,
        model_path: Union[str, os.PathLike],
        device: infinicore.device,
        dtype=infinicore.dtype,
    ):
        """
        Load a pretrained LlamaForCausalLM model from a directory.

        Args:
            model_path: Path to the model directory containing config.json
            device: Device instance (defaults to CPU)

        Returns:
            LlamaForCausalLM instance
        """
        raise NotImplementedError("NotImplementedError!!")
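Both `__init__` and `from_pretrained` in this new file raise `NotImplementedError`, so for now the module only fixes the interface a C++ backend must satisfy: an underlying `self._model.forward(input_ids, position_ids, kv_caches)` whose result is wrapped back into an `infinicore.Tensor`, plus `use_cache = False` so that `GenerationMixin` skips `DynamicCache` creation for this backend.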
python/infinilm/models/llama/configuration_llama.py

@@ -228,6 +228,3 @@ class LlamaConfig(PretrainedConfig):
             tie_word_embeddings=tie_word_embeddings,
             **kwargs,
         )
-
-
-__all__ = ["LlamaConfig"]
python/infinilm/models/llama/modeling_llama.py

@@ -196,14 +196,14 @@ class LlamaAttention(infinicore.nn.Module):
         # --------------------------------------------------------------------------------------- #
         # Apply RoPE to Q and K
         # --------------------------------------------------------------------------------------- #
-        cache_position = kwargs.pop("cache_position", None)
-        if cache_position is None:
-            raise KeyError("cache_position error")
+        position_ids = kwargs.pop("position_ids", None)
+        if position_ids is None:
+            raise KeyError("position_ids error")

         if rope_instance is None:
             raise KeyError("rope_instance error")

-        query_states = rope_instance(query_states, cache_position)
-        key_states = rope_instance(key_states, cache_position)
+        query_states = rope_instance(query_states, position_ids)
+        key_states = rope_instance(key_states, position_ids)

         # --------------------------------------------------------------------------------------- #
         # kv cache

@@ -338,7 +338,7 @@ class LlamaModel(infinicore.nn.Module):
     def forward(
         self,
         input_ids,
-        cache_position,
+        position_ids,
         past_key_values: Optional[Cache] = None,
         use_cache: Optional[bool] = None,  # True
         **kwargs,

@@ -364,7 +364,7 @@ class LlamaModel(infinicore.nn.Module):
             hidden_states = decoder_layer(
                 hidden_states,
                 past_key_values=past_key_values,
-                cache_position=cache_position,
+                position_ids=position_ids,
                 rope_instance=self.rope_instance,
                 **kwargs,
             )

@@ -384,6 +384,8 @@ class LlamaForCausalLM(infinicore.nn.Module, GenerationMixin):
     def __init__(self, config, **kwargs):
         super().__init__()
         self.config = config
+        self.use_cache = True
+
         self.model = LlamaModel(config, **kwargs)
         self.lm_head = infinicore.nn.Linear(
             config.hidden_size,

@@ -395,14 +397,14 @@ class LlamaForCausalLM(infinicore.nn.Module, GenerationMixin):
     def forward(
         self,
         input_ids,
-        cache_position,
+        position_ids,
         past_key_values: Optional[Cache] = None,
         use_cache: Optional[bool] = None,
         **kwargs,
     ):
         last_token = self.model(
             input_ids,
-            cache_position,
+            position_ids,
             past_key_values=past_key_values,
             use_cache=use_cache,
             **kwargs,

@@ -425,9 +427,3 @@ class LlamaForCausalLM(infinicore.nn.Module, GenerationMixin):
         config = LlamaConfig(**config_dict)
         return LlamaForCausalLM(config, device=device, dtype=dtype)
-
-
-__all__ = [
-    "LlamaModel",
-    "LlamaForCausalLM",
-]
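The `cache_position` → `position_ids` rename keeps the modeling code consistent with the generation utilities above, and the new `self.use_cache = True` default on the Python `LlamaForCausalLM` pairs with the conditional cache creation in `GenerationMixin`: the Python backend keeps a `DynamicCache`, while the C++ stub opts out with `use_cache = False`.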