jerrrrry / infinilm · Commits

Commit 1b5d1ea7, authored Dec 11, 2025 by pengcheng888

issue/115 完善bench.py文件 (issue/115: improve the bench.py file)

parent 6498332e

Showing 6 changed files with 252 additions and 90 deletions (+252 −90)
csrc/cache/cache_config.hpp                     +1   −0
examples/bench.py                               +238 −84
examples/jiuge.py                               +3   −4
examples/llama.py                               +0   −1
python/infinilm/generation/utils.py             +0   −1
python/infinilm/models/llama/backends/cpp.py    +10  −0
csrc/cache/cache_config.hpp

@@ -2,6 +2,7 @@
 #include <cstddef>
 #include <string>
+#include <cstdint>

 namespace infinilm::cache {
 ...
examples/bench.py

import infinicore
from transformers import AutoTokenizer
from tokenizers import decoders as _dec
from infinilm.modeling_utils import load_model_state_dict_by_file
import infinilm
from infinilm.distributed import DistConfig
...
@@ -8,10 +7,121 @@ import argparse
import sys
import time
import os
import json
from collections import OrderedDict
from tqdm import tqdm

sys.path.insert(0, os.path.join(os.path.dirname(__file__), "../python"))

DATA_TYPE_BYTES = {
    "bfloat16": 2,
    "float16": 2,
    "float32": 4,
}

# BATCH_SIZES = [1, 4, 8, 16, 32, 64, 128]
# INPUT_LENS = [32, 256, 1024, 4096]
# OUTPUT_LENS = [256, 1024, 4096]


def read_json_file(file_path):
    """Load and return JSON content from file_path."""
    with open(file_path, "r") as file:
        return json.load(file)


def parse_list(value: str):
    """Parse a list-style argument: either a single int or a list of ints.

    Examples:
        "1"       -> 1
        "[1,2,4]" -> [1, 2, 4]
        "1,2,4"   -> [1, 2, 4]
    """
    value = value.strip()

    # Try to parse as a JSON list first
    if value.startswith("[") and value.endswith("]"):
        try:
            result = json.loads(value)
            if isinstance(result, list):
                return [int(x) for x in result]
            return int(result)
        except (json.JSONDecodeError, ValueError):
            pass

    # Try to parse as comma-separated values
    if "," in value:
        try:
            return [int(x.strip()) for x in value.split(",")]
        except ValueError:
            pass

    # Try to parse as a single integer
    try:
        return int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(
            f"batch-size must be an int or list[int], got: {value}"
        )
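A quick sketch of what the new parse_list helper returns for the three accepted formats (behaviour read directly from the code above; the printed values are only illustrative):

    # Accepted --batch-size / --input-len / --output-len spellings
    print(parse_list("1"))        # 1
    print(parse_list("[1,2,4]"))  # [1, 2, 4]
    print(parse_list("1,2,4"))    # [1, 2, 4]
    # parse_list("abc") raises argparse.ArgumentTypeError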
def get_test_cases(
    model_path: str,
    batch_size_list: list[int],
    input_len_list: list[int],
    output_len_list: list[int],
):
    """Generate test cases ordered by ascending KV cache memory usage."""
    model_path = os.path.expanduser(model_path)

    # Load model config to derive attention dimensions
    config = read_json_file(os.path.join(model_path, "config.json"))
    head_dim = config.get(
        "head_dim", config.get("hidden_size") // config.get("num_attention_heads")
    )
    # KV heads and layers drive cache size
    num_key_value_heads = config.get("num_key_value_heads")
    num_hidden_layers = config.get("num_hidden_layers")

    # Enumerate all batch/input/output combinations and compute the KV cache size
    case_list = []
    for batch_size in batch_size_list:
        for input_len in input_len_list:
            for output_len in output_len_list:
                for data_type in ["bfloat16"]:
                    data_type_bytes = DATA_TYPE_BYTES[data_type]
                    total_seq_len = input_len + output_len
                    kvcache_memory_bytes = (
                        data_type_bytes
                        * (batch_size * total_seq_len * num_key_value_heads * head_dim)
                        * num_hidden_layers
                    )
                    kvcache_memory_gb = kvcache_memory_bytes / (1024 * 1024 * 1024)
                    case_list.append(
                        {
                            "idx": len(case_list),
                            "batch_size": batch_size,
                            "input_len": input_len,
                            "output_len": output_len,
                            "data_type": data_type,
                            "kvcache_memory": round(kvcache_memory_gb, 3),
                        }
                    )

    # Sort by KV cache size and wrap in an OrderedDict keyed by index
    case_dict = OrderedDict(
        (idx, case)
        for idx, case in enumerate(
            sorted(case_list, key=lambda case: case["kvcache_memory"])
        )
    )
    return case_dict


def get_args():
    parser = argparse.ArgumentParser(description="run Llama args")
    ...
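As a sanity check on the sizing formula in get_test_cases, a hedged hand-worked example; the config values below are assumed (roughly TinyLlama-1.1B-like), not taken from this commit:

    # Assumed, illustrative config values:
    #   num_key_value_heads = 4, head_dim = 64, num_hidden_layers = 22, bfloat16 (2 bytes)
    #   batch_size = 2, input_len = 1024, output_len = 256  ->  total_seq_len = 1280
    #
    # kvcache_memory_bytes = 2 * (2 * 1280 * 4 * 64) * 22 = 28,835,840 bytes ≈ 0.027 GB
    #
    # Note: the formula counts one cache tensor per layer; if both K and V are stored,
    # the actual footprint would be roughly twice this estimate.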
@@ -41,9 +151,9 @@ def get_args():
     parser.add_argument(
         "--batch-size",
-        type=int,
+        type=parse_list,
         default=1,
-        help="number of prompts in a batch",
+        help="number of prompts in a batch (can be an int or a list of ints, e.g., '1' or '[1,2,4]' or '1,2,4')",
     )
     parser.add_argument(
         "--tensor-parallel-size",
 ...
@@ -54,15 +164,15 @@ def get_args():
     )
     parser.add_argument(
         "--input-len",
-        type=int,
-        default=1,
+        type=parse_list,
+        default=10,
         help="output tokens",
     )
     parser.add_argument(
         "--output-len",
-        type=int,
-        default=10,
+        type=parse_list,
+        default=20,
         help="output tokens",
     )
     return parser.parse_args()
 ...
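With parse_list wired into get_args, the sweep parameters can be given either as single values or as lists. A hedged invocation sketch (flag names follow the argparse definitions and usage strings in this file; the model path and values are illustrative):

    python examples/bench.py --nvidia --model=~/TinyLlama-1.1B-Chat-v1.0/ \
        --tensor-parallel-size=1 --batch-size=1,2,4 --input-len=32,256 --output-len=64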
@@ -77,77 +187,85 @@ def repeat_prompt(input_ids: list[int], target_length: int):
     return (input_ids * repeat_times)[:target_length]


-def test(
-    model_path,
-    infini_dtype=infinicore.bfloat16,
-    infini_device=infinicore.device("cpu", 0),
-    batch_size=1,
-    tp=1,
-    input_len=10,
-    output_len=10,
-):
-    model_path = os.path.expanduser(model_path)
+class TestModel:
+    model: infinicore.nn.Module
+    tokenizer: AutoTokenizer
+    input_ids_list: list[int]
+
+    def __init__(
+        self,
+        model_path,
+        infini_dtype=infinicore.bfloat16,
+        infini_device=infinicore.device("cpu", 0),
+        tp=1,
+    ) -> None:
+        model_path = os.path.expanduser(model_path)
+
+        # ---------------------------------------------------------------- #
+        # 创建模型 (create the model)
+        # ---------------------------------------------------------------- #
+        model = infinilm.AutoLlamaModel.from_pretrained(
+            model_path,
+            device=infini_device,
+            dtype=infini_dtype,
+            backend="cpp",
+            distributed_config=DistConfig(tp),
+        )
+
+        # ---------------------------------------------------------------- #
+        # 加载权重 (load the weights)
+        # ---------------------------------------------------------------- #
+        load_model_state_dict_by_file(model, model_path, dtype=infini_dtype)
+
+        # ---------------------------------------------------------------- #
+        # 创建 tokenizer (create the tokenizer)
+        # ---------------------------------------------------------------- #
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+
+        # ---------------------------------------------------------------- #
+        # token编码 (encode the prompt)
+        # ---------------------------------------------------------------- #
+        input_content = [
+            tokenizer.apply_chat_template(
+                conversation=[{"role": "user", "content": prompt}],
+                add_generation_prompt=True,
+                tokenize=False,
+            )
+        ]
+        # print(input_content, end="", flush=True)
+        input_ids_list = tokenizer.batch_encode_plus(input_content)["input_ids"]
+
+        self.model = model
+        self.tokenizer = tokenizer
+        self.input_ids_list = input_ids_list
+
+    def run(
+        self,
+        batch_size: int,
+        input_len: int,
+        output_len: int,
+    ):
+        # List: [[1, 1128, 526, 366, 29892]]
+        input_ids = repeat_prompt(self.input_ids_list[0], target_length=input_len)
+        input_ids_list = [input_ids] * batch_size
+        # print(input_ids_list)
+
+        # ---------------------------------------------------------------- #
+        # 自回归生成 (autoregressive generation)
+        # ---------------------------------------------------------------- #
+        input_ids_infini = infinicore.from_list(input_ids_list)
+
+        t1 = time.time()
+        print("=================== start generate ====================")
-    model.generate(
-        input_ids_infini,
-        max_new_tokens=output_len,
-        device=infini_device,
-        tokenizer=tokenizer,
-        stop_on_eos=False,
-    )
+        self.model.generate(
+            input_ids_infini,
+            max_new_tokens=output_len,
+            tokenizer=self.tokenizer,
+            stop_on_eos=False,
+        )
+        t2 = time.time()
+        print(
+            f"total_time: {round((t2 - t1) * 1000, 2)} ms",
+        )


 if __name__ == "__main__":
 ...
@@ -162,15 +280,13 @@ if __name__ == "__main__":
         device_str = "cuda"
     else:
         print(
-            "python examples/bench.py --nvidia --model=~/TinyLlama-1.1B-Chat-v1.0/ --batch-size=2 --tensor-parallel-size=1 --input-len=50 --output-len=50"
+            "python examples/bench.py --nvidia --model=~/TinyLlama-1.1B-Chat-v1.0/ --batch-size=2 --tp=1 --input-len=50 --output-len=50"
         )
         sys.exit(1)

     # -------------------------------------------------------- #
     # 解析参数 (parse the arguments)
     # -------------------------------------------------------- #
     model_path = args.model
     batch_size = args.batch_size
     tp = args.tensor_parallel_size
     output_len = args.output_len
     input_len = args.input_len
     infini_device = infinicore.device(device_str, 0)

     if args.dtype == "float32":
 ...
@@ -182,12 +298,50 @@ if __name__ == "__main__":
     else:
         raise ValueError(f"Unsupported dtype: {args.dtype}")

-    test(
-        model_path,
-        infini_dtype=infini_dtype,
-        batch_size=batch_size,
-        infini_device=infini_device,
-        tp=tp,
-        input_len=input_len,
-        output_len=output_len,
-    )
+    tp = args.tensor_parallel_size
+    batch_size = args.batch_size
+    input_len = args.input_len
+    output_len = args.output_len
+
+    if isinstance(batch_size, int):
+        batch_size = [batch_size]
+    if isinstance(input_len, int):
+        input_len = [input_len]
+    if isinstance(output_len, int):
+        output_len = [output_len]
+
+    cases_dict = get_test_cases(model_path, batch_size, input_len, output_len)
+
+    # -------------------------------------------------------- #
+    # 测试 (run the test cases)
+    # -------------------------------------------------------- #
+    # print("=================== start test ====================", type(batch_size))
+    test = TestModel(
+        model_path,
+        infini_device=infini_device,
+        infini_dtype=infini_dtype,
+        tp=tp,
+    )
+
+    for idx, case in tqdm(cases_dict.items(), desc="Processing cases"):
+        tqdm.write(f"\033[92mProcessing : {case}\033[0m")
+        batch_size = case["batch_size"]
+        input_len = case["input_len"]
+        output_len = case["output_len"]
+
+        # reset cache for each case
+        initial_capacity = input_len + output_len + 100
+        test.model.reset_cache(
+            batch_size=batch_size, pos=0, initial_capacity=initial_capacity
+        )
+
+        # run one test case
+        test.run(
+            batch_size=batch_size,
+            input_len=input_len,
+            output_len=output_len,
+        )
examples/jiuge.py

@@ -65,7 +65,7 @@ def get_args():
         help="float32, float16, bfloat16",
     )
     parser.add_argument(
-        "--batch_size",
+        "--batch-size",
         type=int,
         default=1,
         help="number of prompts in a batch",
 ...
@@ -164,7 +164,6 @@ def test(
     model.generate(
         input_ids_infini,
         max_new_tokens=max_new_tokens,
-        device=infini_device,
         tokenizer=tokenizer,
     )
     t2 = time.time()
 ...
@@ -192,8 +191,8 @@ if __name__ == "__main__":
         device_str = "cuda"
     else:
         print(
-            "Usage: python examples/llama.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=<path/to/model_dir>\n"
-            "such as, python examples/llama.py --nvidia --model_path=~/TinyLlama-1.1B-Chat-v1.0"
+            "Usage: python examples/jiuge.py [--cpu | --nvidia | --metax | --moore | --iluvatar] --model_path=<path/to/model_dir>\n"
+            "such as, python examples/jiuge.py --nvidia --model_path=~/TinyLlama-1.1B-Chat-v1.0"
         )
         sys.exit(1)

     prompts = [args.prompt for _ in range(args.batch_size)]
 ...
examples/llama.py

@@ -163,7 +163,6 @@ def test(
     model.generate(
         input_ids_infini,
         max_new_tokens=max_new_tokens,
-        device=infini_device,
         tokenizer=tokenizer,
     )
     t2 = time.time()
 ...
python/infinilm/generation/utils.py

@@ -169,7 +169,6 @@ class GenerationMixin:
         Parameters:
             input_ids (batch_size, seq_len): The sequence used as a prompt for the generation.
             max_new_tokens: Maximum number of new tokens.
-            device: infinicore.device.
             tokenizer: translating data into raw text.
         """
 ...
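Since the device parameter is dropped from the GenerationMixin.generate docstring (and from the call sites above), callers now pass only the input ids, the token budget, and the tokenizer; the device presumably comes from the model itself. A hedged sketch of the updated call, mirroring bench.py (variable names are those used there):

    # Post-change generate() call: no device argument.
    model.generate(
        input_ids_infini,           # infinicore tensor built with infinicore.from_list(...)
        max_new_tokens=output_len,  # token budget for this benchmark case
        tokenizer=tokenizer,        # decodes generated ids back to text
        stop_on_eos=False,          # keep generating for the full output_len
    )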
python/infinilm/models/llama/backends/cpp.py

@@ -189,6 +189,16 @@ class LlamaForCausalLM(GenerationMixin):
             config._underlying,
             distributed_config._underlying,
             device._underlying.type,
         )

+    def reset_cache(self, batch_size: int, pos: int = 0, initial_capacity: int = 1024):
+        """Reset the cache for the model."""
+        infinicore.sync_device()
+        cache_config = self._model.get_cache_config()
+        cache_config.initial_batch_size = batch_size
+        cache_config.initial_capacity = initial_capacity
+        self._model.reset_cache(cache_config, pos)

     def state_dict_keyname(self):
         """Get model key names."""
         return self._model.state_dict()[0].keys()
 ...
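The new reset_cache hook is what lets bench.py reuse one loaded model across many (batch_size, input_len, output_len) cases instead of reloading weights. A hedged usage sketch, assuming `model` is a LlamaForCausalLM built through the cpp backend (the case sizes are illustrative; the capacity headroom matches what bench.py uses):

    # Size the KV cache for the next benchmark case, then run it.
    batch_size, input_len, output_len = 4, 256, 64       # illustrative case
    model.reset_cache(
        batch_size=batch_size,                            # becomes cache_config.initial_batch_size
        pos=0,                                            # restart the decoding position
        initial_capacity=input_len + output_len + 100,    # headroom used by bench.py
    )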