sunzhq2 / bytemlperf-dcu · Commits

Commit 24b257f1, authored Nov 19, 2024 by sunzhq2
Commit message: init
Parent: 920b3c0f
Changes: 330

Showing 20 changed files with 1894 additions and 0 deletions (+1894, -0)
- ByteMLPerf/byte_infer_perf/llm_perf/benchmark/tensorrt-llm/bench_engine.py (+256, -0)
- ByteMLPerf/byte_infer_perf/llm_perf/core/ckpt_loader.py (+474, -0)
- ByteMLPerf/byte_infer_perf/llm_perf/core/generation.py (+49, -0)
- ByteMLPerf/byte_infer_perf/llm_perf/core/inferencer.py (+85, -0)
- ByteMLPerf/byte_infer_perf/llm_perf/core/mp_engine.py (+108, -0)
- ByteMLPerf/byte_infer_perf/llm_perf/core/sampler.py (+50, -0)
- ByteMLPerf/byte_infer_perf/llm_perf/core/scheduler.py (+164, -0)
- ByteMLPerf/byte_infer_perf/llm_perf/datasets/merged_52_test.csv (+64, -0)
- ByteMLPerf/byte_infer_perf/llm_perf/datasets/test_mini.csv (+2, -0)
- ByteMLPerf/byte_infer_perf/llm_perf/launch.py (+339, -0)
- ByteMLPerf/byte_infer_perf/llm_perf/model_zoo/README.md (+15, -0)
- ByteMLPerf/byte_infer_perf/llm_perf/model_zoo/chatglm2-torch-fp16-6b.json (+51, -0)
- ByteMLPerf/byte_infer_perf/llm_perf/model_zoo/falcon-torch-bf16-180b.json (+33, -0)
- ByteMLPerf/byte_infer_perf/llm_perf/model_zoo/llama3-torch-bf16-70b.json (+37, -0)
- ByteMLPerf/byte_infer_perf/llm_perf/model_zoo/mixtral-torch-bf16-8x22b.json (+38, -0)
- ByteMLPerf/byte_infer_perf/llm_perf/prepare_model.py (+82, -0)
- ByteMLPerf/byte_infer_perf/llm_perf/requirements.txt (+14, -0)
- ByteMLPerf/byte_infer_perf/llm_perf/script/extra_datasets.py (+26, -0)
- ByteMLPerf/byte_infer_perf/llm_perf/script/lint.sh (+4, -0)
- ByteMLPerf/byte_infer_perf/llm_perf/script/proto.sh (+3, -0)
ByteMLPerf/byte_infer_perf/llm_perf/benchmark/tensorrt-llm/bench_engine.py
import os
import sys
import pathlib
import argparse
import logging
import json
import subprocess

CUR_DIR = pathlib.Path.cwd()
FILE_DIR = pathlib.Path(__file__).parent.absolute()

logger = logging.getLogger("bench_trtllm")


def setup_logger(loglevel: str):
    fmt = logging.Formatter(
        fmt="%(asctime)s.%(msecs)03d %(filename)s:%(lineno)d [%(levelname)s]: %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
    handler = logging.StreamHandler(stream=sys.stdout)
    handler.setFormatter(fmt)
    logger.addHandler(handler)
    logger.setLevel(loglevel.upper())
    logger.propagate = False


def parse_args():
    parser = argparse.ArgumentParser()

    # tensorrt-llm project path
    parser.add_argument("--trtllm_dir", type=str)

    # model engine
    parser.add_argument("--engine_dir", type=str, required=True)
    parser.add_argument("--model_dir", type=str, required=True)

    # perf config
    parser.add_argument("--batch_size_list", type=str, help="batch_size list, split by comma, \"1,2,4,8,16,32\"")
    parser.add_argument("--seq_len_list", type=str, help="seq_len list, split by comma, \"1024,2048,4096,8192\"")

    # workspace
    parser.add_argument("--workspace", type=str, default=str(CUR_DIR.joinpath("workspace")))

    # logging
    parser.add_argument("--loglevel", type=str, default="INFO")

    args = parser.parse_args()

    setup_logger(args.loglevel)

    # check trtllm
    if args.trtllm_dir is None and os.getenv("TRTLLM_PATH") is None:
        logger.error("trtllm_dir is None, please set trtllm_dir or set TRTLLM_PATH in env")
        sys.exit(-1)
    trtllm_dir = pathlib.Path(args.trtllm_dir) if args.trtllm_dir is not None else pathlib.Path(os.getenv("TRTLLM_PATH")).absolute()

    benchmark_build_dir = trtllm_dir.joinpath("cpp", "build", "benchmarks")
    session_benchmark = benchmark_build_dir.joinpath("gptSessionBenchmark")
    manager_benchmark = benchmark_build_dir.joinpath("gptManagerBenchmark")
    if not benchmark_build_dir.exists() or not session_benchmark.exists() or not manager_benchmark.exists():
        logger.error(f"benchmark_build_dir: {benchmark_build_dir} not exists, please build benchmark first, cd cpp/build/benchmarks && make")
        sys.exit(-1)

    benchmark_dir = trtllm_dir.joinpath("benchmarks", "cpp")
    prepare_dataset_script = benchmark_dir.joinpath("prepare_dataset.py")
    if not benchmark_dir.exists() or not prepare_dataset_script.exists():
        logger.error(f"{prepare_dataset_script} not exists")
        sys.exit(-1)

    # check engine
    engine_dir = pathlib.Path(args.engine_dir).absolute()
    if not engine_dir.exists():
        logger.error(f"engine_dir: {engine_dir} not exists")
        sys.exit(-1)

    # check model
    model_dir = pathlib.Path(args.model_dir).absolute()
    if not model_dir.exists():
        logger.error(f"model_dir: {model_dir} not exists")
        sys.exit(-1)

    # check batch_size_list
    if args.batch_size_list is None:
        logger.error("batch_size_list is None")
        sys.exit(-1)
    batch_size_list = [int(batch_size) for batch_size in args.batch_size_list.split(",")]

    # check seq_len_list
    if args.seq_len_list is None:
        logger.error("seq_len_list is None")
        sys.exit(-1)
    seq_len_list = [int(seq_len) for seq_len in args.seq_len_list.split(",")]

    # workspace
    workspace = pathlib.Path(args.workspace).absolute()
    if not workspace.exists():
        workspace.mkdir(parents=True)

    return (
        workspace,
        session_benchmark,
        manager_benchmark,
        prepare_dataset_script,
        engine_dir,
        model_dir,
        batch_size_list,
        seq_len_list
    )


def context_perf(session_benchmark, engine_dir, seq_len_list):
    print("")

    engine_config = engine_dir.joinpath("config.json")
    config_data = json.loads(engine_config.read_text())

    max_batch_size = config_data["build_config"]["max_batch_size"]
    max_input_len = config_data["build_config"]["max_input_len"]
    max_seq_len = config_data["build_config"]["max_seq_len"]
    max_num_tokens = config_data["build_config"]["max_num_tokens"]

    tp_size = config_data["build_config"]["auto_parallel_config"]["gpus_per_node"]
    device_name = config_data["build_config"]["auto_parallel_config"]["cluster_key"]
    device_info = config_data["build_config"]["auto_parallel_config"]["cluster_info"]

    for seq_len in seq_len_list:
        if seq_len > max_num_tokens:
            logger.warning(f"seq_len: {seq_len} > max_num_tokens: {max_num_tokens}, skip")
            continue

        run_cmd = f"mpirun --allow-run-as-root -n {tp_size} {session_benchmark}"
        run_cmd += f" --engine_dir {engine_dir}"
        run_cmd += f" --batch_size 1"
        run_cmd += f" --warm_up 2 --num_runs 20"
        run_cmd += f" --input_output_len \"{seq_len},1\""
        results = subprocess.run(run_cmd, shell=True, capture_output=True, text=True)
        if results.returncode != 0:
            logger.error(f"run cmd: {run_cmd} failed, returncode: {results.returncode}, stderr: {results.stderr}")
            sys.exit(-1)
        for line in results.stdout.splitlines():
            if line.startswith("[BENCHMARK]"):
                try:
                    data_items = line.split()
                    batch_size = int(data_items[2])
                    seq_len = int(data_items[4])
                    output_len = int(data_items[6])
                    latency = float(data_items[8])
                except Exception as e:
                    logger.error(f"parse line: {line} failed, error: {e}")
                    sys.exit(-1)
                logger.info(f"prefill, batch_size: {batch_size}, seq_len: {seq_len}, latency: {latency} ms")


def decode_perf(workspace, manager_benchmark, prepare_dataset_script, engine_dir, model_path, batch_size_list, seq_len_list):
    print("")

    engine_config = engine_dir.joinpath("config.json")
    config_data = json.loads(engine_config.read_text())

    max_batch_size = config_data["build_config"]["max_batch_size"]
    max_input_len = config_data["build_config"]["max_input_len"]
    max_seq_len = config_data["build_config"]["max_seq_len"]
    max_num_tokens = config_data["build_config"]["max_num_tokens"]

    tp_size = config_data["build_config"]["auto_parallel_config"]["gpus_per_node"]
    device_name = config_data["build_config"]["auto_parallel_config"]["cluster_key"]
    device_info = config_data["build_config"]["auto_parallel_config"]["cluster_info"]

    for seq_len in seq_len_list:
        if seq_len > max_num_tokens:
            logger.warning(f"seq_len: {seq_len} > max_num_tokens: {max_num_tokens}, skip")
            continue

        seq_workspace = workspace.joinpath(f"seq_{seq_len}")
        seq_workspace.mkdir(parents=True, exist_ok=True)

        context_generate_tokens = 1
        decode_generate_tokens = 101

        context_dataset = seq_workspace.joinpath(f"context_{seq_len}_{context_generate_tokens}.json")
        decode_dataset = seq_workspace.joinpath(f"decode_{seq_len}_{decode_generate_tokens}.json")

        prepare_dataset_cmd = f"python3 {prepare_dataset_script}"
        prepare_dataset_cmd += f" --output {context_dataset}"
        prepare_dataset_cmd += f" --tokenizer {model_path}"
        prepare_dataset_cmd += f" token-norm-dist --num-requests {max_batch_size}"
        prepare_dataset_cmd += f" --input-mean {seq_len} --input-stdev 0"
        prepare_dataset_cmd += f" --output-mean {context_generate_tokens} --output-stdev 0"
        subprocess.run(prepare_dataset_cmd, shell=True, capture_output=True, text=True)

        prepare_dataset_cmd = f"python3 {prepare_dataset_script}"
        prepare_dataset_cmd += f" --output {decode_dataset}"
        prepare_dataset_cmd += f" --tokenizer {model_path}"
        prepare_dataset_cmd += f" token-norm-dist --num-requests {max_batch_size}"
        prepare_dataset_cmd += f" --input-mean {seq_len} --input-stdev 0"
        prepare_dataset_cmd += f" --output-mean {decode_generate_tokens} --output-stdev 0"
        subprocess.run(prepare_dataset_cmd, shell=True, capture_output=True, text=True)

        for batch_size in batch_size_list:
            if batch_size > max_batch_size:
                logger.warning(f"batch_size: {batch_size} > max_batch_size: {max_batch_size}, skip")
                continue

            context_csv = seq_workspace.joinpath(f"context_batch{batch_size}.csv")
            decode_csv = seq_workspace.joinpath(f"decode_batch{batch_size}.csv")

            # context
            run_cmd = f"mpirun --allow-run-as-root -n 8 {manager_benchmark}"
            run_cmd += f" --engine_dir {engine_dir}"
            run_cmd += f" --type IFB"
            run_cmd += f" --max_num_tokens {min(int(seq_len * 1.5), int(max_num_tokens))}"
            run_cmd += f" --max_num_samples {batch_size}"
            run_cmd += f" --static_emulated_batch_size {batch_size}"
            run_cmd += f" --enable_kv_cache_reuse false"
            run_cmd += f" --dataset {context_dataset}"
            run_cmd += f" --output_csv {context_csv}"
            subprocess.run(run_cmd, shell=True, capture_output=True, text=True)

            # decode
            run_cmd = f"mpirun --allow-run-as-root -n 8 {manager_benchmark}"
            run_cmd += f" --engine_dir {engine_dir}"
            run_cmd += f" --type IFB"
            run_cmd += f" --max_num_tokens {min(int(seq_len * 1.5), int(max_num_tokens))}"
            run_cmd += f" --max_num_samples {batch_size}"
            run_cmd += f" --static_emulated_batch_size {batch_size}"
            run_cmd += f" --enable_kv_cache_reuse false"
            run_cmd += f" --dataset {decode_dataset}"
            run_cmd += f" --output_csv {decode_csv}"
            subprocess.run(run_cmd, shell=True, capture_output=True, text=True)

            if context_csv.exists() and decode_csv.exists():
                try:
                    context_latency = float(context_csv.read_text().splitlines()[1].split(",")[2])
                    decode_latency = float(decode_csv.read_text().splitlines()[1].split(",")[2])
                except Exception as e:
                    logger.error(f"parse context_csv: {context_csv} and decode_csv: {decode_csv} failed, error: {e}")
                    continue
                per_token_latency = round((decode_latency - context_latency) / (decode_generate_tokens - context_generate_tokens), 3)
                logger.info(f"decode, batch_size: {batch_size}, seq_len: {seq_len}, latency: {per_token_latency} ms")
        break


if __name__ == "__main__":
    workspace, session_benchmark, manager_benchmark, prepare_dataset_script, engine_dir, model_dir, batch_size_list, seq_len_list = parse_args()

    logger.info(f"session_benchmark: {session_benchmark}")
    logger.info(f"manager_benchmark: {manager_benchmark}")
    logger.info(f"engine_dir: {engine_dir}")
    logger.info(f"batch_size_list: {batch_size_list}")
    logger.info(f"seq_len_list: {seq_len_list}")

    context_perf(session_benchmark, engine_dir, seq_len_list)
    decode_perf(workspace, manager_benchmark, prepare_dataset_script, engine_dir, model_dir, batch_size_list, seq_len_list)
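The decode measurement above runs the manager benchmark twice per configuration (1 generated token vs. 101), so the shared prefill cost cancels out of the difference. A minimal worked sketch of that arithmetic, with made-up latencies:

```python
# Illustrative only: how bench_engine.py derives per-token decode latency.
# The latencies below are placeholders; real values come from the gptManagerBenchmark CSVs.
context_generate_tokens = 1     # run A: prefill + 1 generated token
decode_generate_tokens = 101    # run B: prefill + 101 generated tokens

context_latency = 120.0         # ms, hypothetical end-to-end latency of run A
decode_latency = 1620.0         # ms, hypothetical end-to-end latency of run B

# the prefill cost cancels, leaving 100 pure decode steps
per_token_latency = round(
    (decode_latency - context_latency) / (decode_generate_tokens - context_generate_tokens), 3
)
print(per_token_latency)        # 15.0 ms per generated token
```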
ByteMLPerf/byte_infer_perf/llm_perf/core/ckpt_loader.py
import json
import pathlib

from tqdm import tqdm
from safetensors import safe_open

import torch
import torch.nn as nn
import torch.distributed as dist

from llm_perf.utils.logger import logger

from abc import ABC, abstractmethod
from concurrent.futures import ThreadPoolExecutor
from typing import Union, List, Dict


class CoreCkptLoader(ABC):
    def __init__(self, prefix, model, mp_size=1, mp_rank=0, ckpt_path: str = ""):
        self.prefix = prefix
        self.model = model
        self.mp_size = mp_size
        self.mp_rank = mp_rank
        self.ckpt_path = ckpt_path

        self.state_dict = None

    def to_parameter(self, data: torch.Tensor, dtype: torch.dtype = None):
        if dtype is not None:
            data = data.to(dtype)
        return nn.Parameter(data, requires_grad=False)

    def to_contiguous(self, num_layers, param_suffixes, prefix, state_dict):
        result = {}
        with ThreadPoolExecutor() as executor:
            for i in range(num_layers):
                for suffix in param_suffixes:
                    # for example:
                    # "transformer.encoder.layers.0.mlp.dense_4h_to_h.weight"
                    name = f"{prefix}.{i}.{suffix}"
                    if name in state_dict:
                        result[name] = executor.submit(lambda t: t.contiguous(), state_dict[name])
            for i in range(num_layers):
                for suffix in param_suffixes:
                    name = f"{prefix}.{i}.{suffix}"
                    if name in state_dict:
                        # Future.result() returns the contiguous tensor
                        state_dict[name] = result[name].result()

    def gqa_split(self, src, dim):
        qkv_head_num = src.shape[dim] // self.head_dim
        src_split = src.chunk(qkv_head_num, dim=dim)
        qkv_cat = []
        for i in range(self.mp_size):
            qkv_cat.append(
                torch.cat(
                    [src_split[i * self.mp_size + self.mp_rank] for i in range(qkv_head_num // self.mp_size)],
                    axis=dim,
                )
            )
        return qkv_cat

    def qkv_split(self, src, dim):
        src_split = torch.split(src.data, src.shape[dim] // 3, dim=dim)
        qkv_split = [torch.split(src_s, src_s.shape[dim] // self.mp_size, dim=dim) for src_s in src_split]
        qkv_cat = [torch.cat([qkv_s[i] for qkv_s in qkv_split], axis=dim) for i in range(len(qkv_split[0]))]
        return qkv_cat

    def with_outter_split(self, src: torch.Tensor, dim: int, outter: int):
        src_split = torch.split(src.data, src.shape[dim] // outter, dim=dim)
        output_split = [torch.split(src_s, src_s.shape[dim] // self.mp_size, dim=dim) for src_s in src_split]
        output_tensors = [
            torch.cat([output_s[i] for output_s in output_split], axis=dim)
            for i in range(len(output_split[0]))
        ]
        output_tensors = [tensor.contiguous() for tensor in output_tensors]
        return output_tensors

    def split(self, src: torch.Tensor, dim: int, chunks: List[int] = []):
        if len(chunks) == 0:
            split_arg = src.shape[dim] // self.mp_size
            output_tensors = torch.split(src, split_arg, dim=dim)
        else:
            # for example
            # chunks = [32, 2, 2], sum_chunks = 36, src.shape[dim] = (32 + 2 + 2) * 128, other_dim = 128
            # mp_size = 8
            # new_chunks = [4, 1, 1]
            sum_chunks = sum(chunks)
            other_dim_size = src.shape[dim] // sum_chunks

            split_arg = [i * other_dim_size for i in chunks]
            split_tensors = torch.split(src, split_arg, dim=dim)

            output_split = []
            for i, tensor in enumerate(split_tensors):
                if self.mp_size > chunks[i]:
                    tensor_shape = tensor.size()[:dim] + (chunks[i], 1, other_dim_size) + tensor.size()[dim + 1:]
                    new_tensor_shape = tensor.size()[:dim] + (chunks[i], self.mp_size // chunks[i], other_dim_size) + tensor.size()[dim + 1:]
                    output_tensor_shape = tensor.size()[:dim] + (self.mp_size * other_dim_size,) + tensor.size()[dim + 1:]

                    tensor = tensor.view(tensor_shape)
                    tensor = tensor.expand(*new_tensor_shape)
                    tensor = tensor.contiguous()
                    tensor = tensor.view(output_tensor_shape)

                cur_split = torch.split(tensor, tensor.shape[dim] // self.mp_size, dim=dim)
                output_split.append(cur_split)

            output_tensors = []
            for i in range(self.mp_size):
                temp_tensors = [output_split[j][i] for j in range(len(chunks))]
                tp_tensors = torch.concat(temp_tensors, dim=dim)
                output_tensors.append(tp_tensors)

        output_tensors = [tensor.contiguous() for tensor in output_tensors]
        return output_tensors

    def broadcast_meta(self):
        meta = [
            {k: {"shape": v.shape, "dtype": v.dtype} for k, v in self.state_dict.items()}
        ] if self.mp_rank == 0 else [None]
        dist.broadcast_object_list(meta, src=0)
        if self.mp_rank != 0:
            self.state_dict = meta[0]

    @abstractmethod
    def broadcast_weight(self, key, device='cpu', non_blocking=False):
        raise NotImplementedError

    # split_mode
    #   default
    #   with_outter
    #   split_outter
    @abstractmethod
    def scatter_weight(self, key, dim, split_mode='default', outter=1, non_blocking=False):
        raise NotImplementedError

    @abstractmethod
    def parallel_loader(self):
        raise NotImplementedError

    @abstractmethod
    def infusion_to_model(self):
        raise NotImplementedError


class ModelLoader():
    def __init__(self, model_dir: pathlib.Path, total_size: int, weight_map: Dict[str, str]) -> None:
        self.model_dir = model_dir
        self.total_size = total_size

        # {tensor_name: file_name} map
        self.weight_map = weight_map

        weight_set = set()
        for weight_name in weight_map:
            weight_set.add(weight_map[weight_name])
        self.file_num = len(weight_set)

        # loaded bytes
        self.loaded_bytes = 0
        # {tensor_name: tensor} map
        self.weight_dict = {}
        # {file_name: {tensor_name: tensor}} map
        self.file_cache = {}

    def load_tensor(self, tensor_name: str):
        if not tensor_name in self.weight_map:
            logger.error(f"tensor_name {tensor_name} not in weight_map")
            return

        if not self.file_cache:
            self.p_bar = tqdm(total=self.file_num, desc="loading model")

        file_name = self.weight_map[tensor_name]
        if not file_name in self.file_cache:
            if file_name.endswith(".safetensors"):
                with safe_open(self.model_dir.joinpath(file_name), framework="pt", device="cpu") as f:
                    self.file_cache[file_name] = {}
                    for key in f.keys():
                        self.file_cache[file_name][key] = f.get_tensor(key)
                        self.loaded_bytes += self.file_cache[file_name][key].numel() * self.file_cache[file_name][key].element_size()
            elif file_name.endswith(".bin"):
                self.file_cache[file_name] = torch.load(self.model_dir.joinpath(file_name), map_location="cpu")
                for key in self.file_cache[file_name]:
                    self.loaded_bytes += self.file_cache[file_name][key].numel() * self.file_cache[file_name][key].element_size()
            else:
                logger.error(f"file_name {file_name} not supported")
                return
            self.p_bar.update(1)
            if self.p_bar.n == self.file_num:
                self.p_bar.close()
                self.p_bar = None

        self.weight_dict[tensor_name] = self.file_cache[file_name][tensor_name]


class ChatGLM2_ModelLoader(ModelLoader):
    def __init__(self, model_dir: pathlib.Path, model_config, weight_index_config: Dict) -> None:
        # parent class
        super().__init__(
            model_dir,
            weight_index_config["metadata"]["total_size"],
            weight_index_config["weight_map"]
        )
        self.model_config = model_config

    def load_weight(self):
        self.loaded_bytes = 0
        self.weight_dict = {}

        self.load_tensor("transformer.embedding.word_embeddings.weight")
        self.load_tensor("transformer.rotary_pos_emb.inv_freq")
        for i in range(self.model_config.num_layers):
            self.load_tensor(f"transformer.encoder.layers.{i}.input_layernorm.weight")
            self.load_tensor(f"transformer.encoder.layers.{i}.mlp.dense_4h_to_h.weight")
            self.load_tensor(f"transformer.encoder.layers.{i}.mlp.dense_h_to_4h.weight")
            self.load_tensor(f"transformer.encoder.layers.{i}.post_attention_layernorm.weight")
            self.load_tensor(f"transformer.encoder.layers.{i}.self_attention.dense.weight")
            self.load_tensor(f"transformer.encoder.layers.{i}.self_attention.query_key_value.bias")
            self.load_tensor(f"transformer.encoder.layers.{i}.self_attention.query_key_value.weight")
        self.load_tensor("transformer.encoder.final_layernorm.weight")
        self.load_tensor("transformer.output_layer.weight")

        weight_bytes = 0
        for tensor_name in self.weight_dict:
            tensor = self.weight_dict[tensor_name]
            weight_bytes += tensor.numel() * tensor.element_size()
        logger.info(f"total_size: {self.total_size}, loaded_bytes: {self.loaded_bytes}, weight_bytes: {weight_bytes}")
        assert self.loaded_bytes == self.total_size
        assert weight_bytes == self.total_size

        return self.weight_dict


from transformers import LlamaConfig

class Llama_ModelLoader(ModelLoader):
    def __init__(self, model_dir: pathlib.Path):
        model_config = LlamaConfig.from_pretrained(model_dir)

        weight_index_config = {}
        for child in model_dir.iterdir():
            if child.name.endswith(".index.json"):
                with open(child, "r") as f:
                    weight_index_config = json.load(f)
                break

        self.layer_num = model_config.num_hidden_layers

        super().__init__(
            model_dir,
            weight_index_config["metadata"]["total_size"],
            weight_index_config["weight_map"]
        )

    def load_weight(self):
        self.loaded_bytes = 0
        self.weight_dict = {}

        self.load_tensor("model.embed_tokens.weight")
        for i in range(self.layer_num):
            self.load_tensor(f"model.layers.{i}.input_layernorm.weight")
            self.load_tensor(f"model.layers.{i}.self_attn.q_proj.weight")
            self.load_tensor(f"model.layers.{i}.self_attn.k_proj.weight")
            self.load_tensor(f"model.layers.{i}.self_attn.v_proj.weight")
            self.load_tensor(f"model.layers.{i}.self_attn.o_proj.weight")
            self.load_tensor(f"model.layers.{i}.post_attention_layernorm.weight")
            self.load_tensor(f"model.layers.{i}.mlp.gate_proj.weight")
            self.load_tensor(f"model.layers.{i}.mlp.up_proj.weight")
            self.load_tensor(f"model.layers.{i}.mlp.down_proj.weight")
        self.load_tensor("model.norm.weight")
        self.load_tensor("lm_head.weight")

        weight_bytes = 0
        for tensor_name in self.weight_dict:
            tensor = self.weight_dict[tensor_name]
            weight_bytes += tensor.numel() * tensor.element_size()
        logger.info(f"total_size: {self.total_size}, loaded_bytes: {self.loaded_bytes}, weight_bytes: {weight_bytes}")
        assert self.loaded_bytes == self.total_size
        assert weight_bytes == self.total_size

        return self.weight_dict


from transformers import MixtralConfig

class Mixtral_ModelLoader(ModelLoader):
    def __init__(self, model_dir: pathlib.Path) -> None:
        model_config = MixtralConfig.from_pretrained(model_dir)

        weight_index_config = {}
        for child in model_dir.iterdir():
            if child.name.endswith(".index.json"):
                with open(child, "r") as f:
                    weight_index_config = json.load(f)
                break

        self.layer_num = model_config.num_hidden_layers
        self.expert_num = model_config.num_local_experts

        # parent class
        super().__init__(
            model_dir,
            weight_index_config["metadata"]["total_size"],
            weight_index_config["weight_map"]
        )

    def load_weight(self):
        self.loaded_bytes = 0
        self.weight_dict = {}

        self.load_tensor("model.embed_tokens.weight")
        for i in range(self.layer_num):
            self.load_tensor(f"model.layers.{i}.self_attn.q_proj.weight")
            self.load_tensor(f"model.layers.{i}.self_attn.k_proj.weight")
            self.load_tensor(f"model.layers.{i}.self_attn.v_proj.weight")
            self.load_tensor(f"model.layers.{i}.self_attn.o_proj.weight")
            self.load_tensor(f"model.layers.{i}.block_sparse_moe.gate.weight")
            for j in range(self.expert_num):
                self.load_tensor(f"model.layers.{i}.block_sparse_moe.experts.{j}.w1.weight")
                self.load_tensor(f"model.layers.{i}.block_sparse_moe.experts.{j}.w2.weight")
                self.load_tensor(f"model.layers.{i}.block_sparse_moe.experts.{j}.w3.weight")
            self.load_tensor(f"model.layers.{i}.input_layernorm.weight")
            self.load_tensor(f"model.layers.{i}.post_attention_layernorm.weight")
        self.load_tensor("model.norm.weight")
        self.load_tensor("lm_head.weight")

        weight_bytes = 0
        for tensor_name in self.weight_dict:
            tensor = self.weight_dict[tensor_name]
            weight_bytes += tensor.numel() * tensor.element_size()
        logger.info(f"total_size: {self.total_size}, loaded_bytes: {self.loaded_bytes}, weight_bytes: {weight_bytes}")
        assert self.loaded_bytes == self.total_size
        assert weight_bytes == self.total_size

        return self.weight_dict


from transformers import FalconConfig

class Falcon_ModelLoader(ModelLoader):
    def __init__(self, model_dir: pathlib.Path) -> None:
        model_config = FalconConfig.from_pretrained(model_dir)

        weight_index_config = {}
        for child in model_dir.iterdir():
            if child.name.endswith(".index.json"):
                with open(child, "r") as f:
                    weight_index_config = json.load(f)
                break

        # model config
        self.layer_num = model_config.num_hidden_layers

        super().__init__(
            model_dir,
            weight_index_config["metadata"]["total_size"],
            weight_index_config["weight_map"]
        )

    def load_weight(self):
        self.loaded_bytes = 0
        self.weight_dict = {}

        self.load_tensor("transformer.word_embeddings.weight")
        for i in range(self.layer_num):
            self.load_tensor(f"transformer.h.{i}.self_attention.query_key_value.weight")
            self.load_tensor(f"transformer.h.{i}.self_attention.dense.weight")
            self.load_tensor(f"transformer.h.{i}.mlp.dense_h_to_4h.weight")
            self.load_tensor(f"transformer.h.{i}.mlp.dense_4h_to_h.weight")
            self.load_tensor(f"transformer.h.{i}.ln_attn.weight")
            self.load_tensor(f"transformer.h.{i}.ln_attn.bias")
            self.load_tensor(f"transformer.h.{i}.ln_mlp.weight")
            self.load_tensor(f"transformer.h.{i}.ln_mlp.bias")
        self.load_tensor("transformer.ln_f.weight")
        self.load_tensor("transformer.ln_f.bias")
        self.load_tensor("lm_head.weight")

        weight_bytes = 0
        for tensor_name in self.weight_dict:
            tensor = self.weight_dict[tensor_name]
            weight_bytes += tensor.numel() * tensor.element_size()
        logger.info(f"total_size: {self.total_size}, loaded_bytes: {self.loaded_bytes}, weight_bytes: {weight_bytes}")
        assert self.loaded_bytes == self.total_size
        assert weight_bytes == self.total_size

        return self.weight_dict
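A minimal usage sketch of the loaders above, assuming a local Hugging Face-style Llama checkpoint directory that contains the usual *.index.json and weight shards (the path is illustrative):

```python
import pathlib

from llm_perf.core.ckpt_loader import Llama_ModelLoader

# hypothetical checkpoint directory; any HF-style Llama dump with an index file works
model_dir = pathlib.Path("llm_perf/model_zoo/sota/llama3-70b")

loader = Llama_ModelLoader(model_dir)
state_dict = loader.load_weight()   # {tensor_name: torch.Tensor}, validated against metadata.total_size
print(len(state_dict), "tensors loaded")
```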
ByteMLPerf/byte_infer_perf/llm_perf/core/generation.py
import torch
import asyncio

from dataclasses import dataclass, field
from typing import List


@dataclass
class GenerateConfig:
    min_new_tokens: int = 0
    max_new_tokens: int = 0
    top_k: int = 0
    top_p: float = 1.0
    temperature: float = 1.0
    presence_penalty: float = 1.0
    eos_token_id: int = -1
    pad_token_id: int = -1
    get_input_logits: bool = False


@dataclass
class GenerateRequest:
    input_ids: List[int]
    generate_config: GenerateConfig


@dataclass
class GenerateResult:
    token_id: int
    finish_reason: str
    wait_time: float
    model_time: float
    post_process_time: float
    logits: torch.Tensor
    last_logits: torch.Tensor


class ResultQueue:
    def __init__(self):
        self._q = asyncio.Queue()
        try:
            self._loop = self._q._get_loop()
        except:
            self._loop = asyncio.get_running_loop()

    def put(self, item):
        self._loop.call_soon_threadsafe(self._q.put_nowait, item)

    async def get(self):
        return await self._q.get()
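A small sketch of building a request from these dataclasses; the token ids and config values are placeholders:

```python
from llm_perf.core.generation import GenerateConfig, GenerateRequest

config = GenerateConfig(
    max_new_tokens=512,
    top_k=1,                  # greedy-style sampling
    eos_token_id=2,
    pad_token_id=0,
    get_input_logits=False,
)
request = GenerateRequest(input_ids=[64790, 64792, 30910], generate_config=config)  # placeholder ids
```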
ByteMLPerf/byte_infer_perf/llm_perf/core/inferencer.py
import time
from abc import ABC, abstractmethod
from dataclasses import dataclass
from enum import Enum
from typing import Any, Dict, Iterable, List

from llm_perf.core.generation import GenerateConfig, GenerateRequest, GenerateResult
from llm_perf.core.generation import ResultQueue
from llm_perf.utils.logger import logger


class PacketStatus(Enum):
    ERROR = -1
    PENDING = 0
    RUNNING = 1
    FINISH = 2


class CoreInferencer(ABC):
    """
    Inference class
    """
    @dataclass
    class Task:
        request: GenerateRequest
        state: PacketStatus
        generate_ids: List[int]

        def __init__(self, request: GenerateRequest):
            self.request = request
            self.result_queue = ResultQueue()

            self.state = PacketStatus.PENDING
            self.generate_ids = []
            self.exception = None

            self.create_st = time.perf_counter_ns()
            self.last_model_start_st = time.perf_counter_ns()
            self.last_model_end_st = time.perf_counter_ns()
            self.last_process_st = time.perf_counter_ns()

            self.wait_time = []
            self.model_time = []
            self.post_process_time = []

        def update_st(self, st_name):
            if st_name == "model_start":
                self.last_model_start_st = time.perf_counter_ns()
                self.wait_time.append((self.last_model_start_st - self.last_process_st) / 1e6)
            elif st_name == "model_end":
                self.last_model_end_st = time.perf_counter_ns()
                self.model_time.append((self.last_model_end_st - self.last_model_start_st) / 1e6)
            elif st_name == "process_end":
                self.last_process_st = time.perf_counter_ns()
                self.post_process_time.append((self.last_process_st - self.last_model_end_st) / 1e6)

        def add_result(self, res: GenerateResult):
            self.generate_ids.append(res.token_id)
            self.result_queue.put(res)

        async def get_result(self) -> GenerateResult:
            return await self.result_queue.get()

        def finish(self) -> None:
            self.state = PacketStatus.FINISH
            self.result_queue.put(None)

        def error(self) -> None:
            self.state = PacketStatus.ERROR

        def is_finished(self) -> bool:
            return self.state == PacketStatus.FINISH

        def return_q_empty(self) -> bool:
            return self.result_queue.empty()

    def __init__(self) -> None:
        super().__init__()

    @abstractmethod
    def infer(self, tasks: List["CoreInferencer.Task"], **kwargs):
        raise NotImplementedError
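A short sketch of how a backend might drive a Task's timers while producing one token; the token id and result fields are placeholders, and the whole thing runs inside an event loop because ResultQueue binds to the running loop:

```python
import asyncio

from llm_perf.core.generation import GenerateConfig, GenerateRequest, GenerateResult
from llm_perf.core.inferencer import CoreInferencer

async def demo():
    # Task must be created while an event loop is running (its ResultQueue binds to it)
    task = CoreInferencer.Task(GenerateRequest(input_ids=[1, 2, 3], generate_config=GenerateConfig()))

    task.update_st("model_start")    # records wait time since the last post-process step
    # ... a real backend would run one forward pass here ...
    task.update_st("model_end")      # records model time for this step

    result = GenerateResult(token_id=42, finish_reason="", wait_time=task.wait_time[-1],
                            model_time=task.model_time[-1], post_process_time=0.0,
                            logits=None, last_logits=None)   # placeholder values
    task.add_result(result)          # appends the token id and feeds the async result queue
    task.update_st("process_end")    # records post-process time for this step
    task.finish()                    # FINISH state + trailing None sentinel

    print((await task.get_result()).token_id)   # 42

asyncio.run(demo())
```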
ByteMLPerf/byte_infer_perf/llm_perf/core/mp_engine.py
import os
import sys
import signal
from abc import ABC, abstractmethod

import torch.nn as nn
import torch.multiprocessing as mp

from llm_perf.utils.logger import logger


class CoreMpEngine(ABC):
    def __init__(self, world_size: int, model_impl: nn.Module, xpu_cfg) -> None:
        self.world_size = world_size
        self.model_impl = model_impl
        self.xpu_cfg = xpu_cfg

        # https://github.com/pytorch/pytorch/issues/32322
        # https://stackoverflow.com/questions/61939952/mp-set-start-methodspawn-triggered-an-error-saying-the-context-is-already-be
        try:
            mp.set_start_method("spawn", force=True)
        except Exception as e:
            logger.exception(f"failed to set spawn context: {e}")
            sys.exit(-1)

        self._subprocesses = None

        def signal_handler(signum, frame):
            logger.info(f"Received signal {signum}, exiting...")
            self.clean_subprocess()
            os._exit(0)

        signal.signal(signal.SIGINT, signal_handler)
        signal.signal(signal.SIGTERM, signal_handler)

        if os.getenv("MASTER_PORT", "") == "":
            os.environ["MASTER_PORT"] = str(self.find_free_port())
        if os.getenv("MASTER_ADDR", "") == "":
            os.environ["MASTER_ADDR"] = "localhost"

        self._input_queues = mp.Queue(maxsize=self.world_size)
        self._output_queues = mp.Queue(maxsize=1)

        self._subprocesses = mp.spawn(
            fn=self.mp_loop_worker,
            args=(
                world_size,
                self._input_queues,
                self._output_queues,
                model_impl,
                xpu_cfg,
            ),
            nprocs=world_size,
            join=False,
            daemon=False,
        )
        self._subprocess_pids = self._subprocesses.pids()
        logger.info(f"subprocesses created: {self._subprocess_pids}")

        logger.info("waiting for ranks to be ready")
        for _ in range(world_size):
            assert "ready" == self._output_queues.get(block=True)
        logger.info("all ranks are ready and listening, init done")

    def __del__(self):
        self.clean_subprocess()

    def clean_subprocess(self):
        try:
            if self._subprocesses is not None:
                for p in self._subprocesses.processes:
                    if p.is_alive():
                        logger.info(f"terminate subprocess: {p.pid}")
                        p.terminate()
        except Exception as e:
            logger.exception(f"{e}, failed to terminate torch mp, which may cause mem leak; ignored...")

    def find_free_port(self):
        import socket
        from contextlib import closing

        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
            s.bind(("", 0))
            s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            return s.getsockname()[1]

    @staticmethod
    @abstractmethod
    def mp_loop_worker(
        local_rank: int,
        world_size: int,
        input_queue: mp.Queue,
        output_queue: mp.Queue,
        model_impl,
        xpu_config
    ):
        raise NotImplementedError

    @abstractmethod
    def mp_forward(self, *args):
        raise NotImplementedError
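A minimal sketch of what a concrete engine could look like on top of CoreMpEngine; the GpuMpEngine name, the echoed payload, and the forward logic are hypothetical and only illustrate the queue handshake the base class expects (one "ready" per rank, then request/response over the shared queues):

```python
from llm_perf.core.mp_engine import CoreMpEngine

class GpuMpEngine(CoreMpEngine):                      # hypothetical subclass name
    @staticmethod
    def mp_loop_worker(local_rank, world_size, input_queue, output_queue, model_impl, xpu_config):
        # each rank reports readiness once; the base class waits for world_size "ready" messages
        output_queue.put("ready")
        while True:
            forward_args = input_queue.get(block=True)
            # ... run model_impl forward on this rank here ...
            if local_rank == 0:
                output_queue.put(forward_args)        # placeholder result from rank 0

    def mp_forward(self, *args):
        # fan the request out to every rank, then collect rank 0's reply
        for _ in range(self.world_size):
            self._input_queues.put(args, block=True)
        return self._output_queues.get(block=True)
```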
ByteMLPerf/byte_infer_perf/llm_perf/core/sampler.py
from abc import ABC, abstractmethod
from typing import Dict, List

import torch

from llm_perf.core.generation import GenerateResult
from llm_perf.core.inferencer import CoreInferencer


class CoreSampler(ABC):
    def __init__(self) -> None:
        super().__init__()

    @abstractmethod
    def sample(self, tasks: List[CoreInferencer.Task], logits: torch.FloatTensor) -> List[int]:
        """Sample next tokens

        Args:
            packets: sample batch packets
            logits: model inference outputs, shape is (sum(len(input_ids) of each packet), vocab_size)

        Return:
            next_tokens: next token list of each request
        """
        raise NotImplementedError

    @abstractmethod
    def postprocess(
        self,
        packets: List[CoreInferencer.Task],
        infer_outputs: Dict[str, torch.FloatTensor],
        next_tokens: List[int],
    ) -> List[GenerateResult]:
        """Postprocess sample result tokens

        Args:
            packets: sample batch packets
            infer_output: inference outputs, contain 'input_logits' and 'last_logits' `{"input_logits": tensor, "last_logits": tensor}`
                input_logits: model inference output input logits
                last_logits: model inference outputs last logits, shape is (sum(len(input_ids) of each packet), vocab_size)
            next_tokens: sample packets next token list

        Return:
            GenerateResult list of packets
        """
        raise NotImplementedError
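A minimal greedy sampler sketch against this interface; GreedySampler is hypothetical, the "stop" finish reason is a placeholder, and it assumes one logits row per task (as in the decode phase):

```python
from typing import Dict, List

import torch

from llm_perf.core.generation import GenerateResult
from llm_perf.core.inferencer import CoreInferencer
from llm_perf.core.sampler import CoreSampler

class GreedySampler(CoreSampler):                      # hypothetical implementation
    def sample(self, tasks: List[CoreInferencer.Task], logits: torch.FloatTensor) -> List[int]:
        # assuming one logits row per task: argmax over the vocab dimension
        return torch.argmax(logits, dim=-1).tolist()

    def postprocess(self, packets, infer_outputs: Dict[str, torch.FloatTensor], next_tokens: List[int]) -> List[GenerateResult]:
        results = []
        for i, packet in enumerate(packets):
            eos_id = packet.request.generate_config.eos_token_id
            finish_reason = "stop" if next_tokens[i] == eos_id else ""   # placeholder reason strings
            results.append(GenerateResult(
                token_id=next_tokens[i], finish_reason=finish_reason,
                wait_time=0.0, model_time=0.0, post_process_time=0.0,
                logits=infer_outputs.get("input_logits"),
                last_logits=infer_outputs.get("last_logits"),
            ))
        return results
```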
ByteMLPerf/byte_infer_perf/llm_perf/core/scheduler.py
import os
import time
import random
import threading
import signal
from abc import ABC, abstractmethod
from dataclasses import dataclass
from enum import Enum
from queue import Queue
from typing import Any, AsyncIterable, Dict, Iterable, List, Tuple, Union

import torch
import torch.multiprocessing as mp
from torch import distributed as dist

from llm_perf.core.generation import (
    GenerateConfig,
    GenerateRequest,
    GenerateResult
)
from llm_perf.core.inferencer import CoreInferencer
from llm_perf.core.sampler import CoreSampler
from llm_perf.utils.logger import logger
from llm_perf.utils.reporter import calc_perplexity


class CoreScheduler(ABC):
    def __init__(
        self,
        inferencer: CoreInferencer,
        sampler: CoreSampler,
        task_cls=CoreInferencer.Task
    ) -> None:
        super().__init__()
        self.inferencer: CoreInferencer = inferencer
        self.sampler: CoreSampler = sampler
        self.Task = task_cls
        self.task_queue: Queue[self.Task] = Queue()

        self.started = False
        self.scheduler_thread = None

    def start(self):
        if not self.started:
            logger.info("start scheduler thread")
            self.started = True
            self.scheduler_thread = threading.Thread(target=self.scheduler_loop)
            self.scheduler_thread.start()

    def stop(self):
        if self.started:
            logger.info("stop scheduler thread")
            self.started = False
            if self.scheduler_thread and self.scheduler_thread.is_alive():
                self.scheduler_thread.join(timeout=1.)

    @abstractmethod
    @torch.no_grad()
    def scheduler_loop(self):
        raise NotImplementedError

    async def generate(
        self,
        req: GenerateRequest
    ) -> Union[
        AsyncIterable[GenerateResult],
        Tuple[AsyncIterable[GenerateResult], float, str]
    ]:
        task = self.Task(request=req)
        self.submit(task)
        async for result in self.get_packet_results(req.generate_config.get_input_logits, task):
            yield result

    def submit(self, task):
        self.task_queue.put_nowait(task)

    async def get_packet_results(self, get_input_logits: bool, task: CoreInferencer.Task):
        gen_index = 0
        while True:
            result = await task.get_result()
            if result is None:
                if task.exception:
                    raise task.exception
                break

            cur_input_tokens = task.request.input_ids + task.generate_ids[:gen_index]
            gen_index += 1

            task_results = {
                "result": result,
            }
            if get_input_logits:
                await self.update_logits(result, task)

                cur_labels_tensor = torch.tensor(cur_input_tokens, dtype=torch.int64, device='cpu')
                input_logits_len = len(cur_input_tokens) - 1
                input_logits = task.all_logits[:input_logits_len]
                perplexity = calc_perplexity(input_logits, cur_labels_tensor)

                task_results["dump_file"] = ""
                task_results["perplexity"] = perplexity

            yield task_results

        task_results = {
            "result": None,
            "perplexity": -1,
        }
        if get_input_logits:
            dump_file = await self.dump_last_logits(task)
            task_results["dump_file"] = dump_file
        yield task_results
        return

    async def update_logits(self, result, task):
        # [8, num_vocab]
        if not hasattr(task, "all_logits"):
            task.all_logits = result.logits
        # [1, num_vocab]
        else:
            task.all_logits = torch.cat([task.all_logits, result.logits], dim=0)

    async def dump_last_logits(self, task: CoreInferencer.Task):
        tmp_dir = ".tmp_logits"
        if not os.path.exists(tmp_dir):
            os.mkdir(tmp_dir)

        import numpy as np
        dump_file = (
            tmp_dir
            + "/"
            + str(random.randint(0, 100))
            + "_"
            + str(int(time.time_ns()))
            + ".npy"
        )

        input_tokens_len = len(task.request.input_ids)
        gen_tokens_len = len(task.generate_ids)
        generate_logits = task.all_logits[-gen_tokens_len:].unsqueeze(0)
        np.save(dump_file, generate_logits.numpy())

        return dump_file
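A short sketch of how a caller consumes the scheduler's async generator, assuming a concrete scheduler instance has already been built from backend-specific inferencer and sampler implementations (my_scheduler and the token ids are placeholders):

```python
import asyncio

from llm_perf.core.generation import GenerateConfig, GenerateRequest

async def run_one_request(scheduler, input_ids):
    scheduler.start()                                   # spins up the scheduler thread
    req = GenerateRequest(input_ids=input_ids, generate_config=GenerateConfig(max_new_tokens=16))
    try:
        async for packet in scheduler.generate(req):
            result = packet["result"]
            if result is None:                          # trailing summary packet
                break
            print("new token:", result.token_id)
    finally:
        scheduler.stop()

# asyncio.run(run_one_request(my_scheduler, [1, 2, 3]))  # my_scheduler is hypothetical
```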
ByteMLPerf/byte_infer_perf/llm_perf/datasets/merged_52_test.csv
id,question,A,B,C,D
0,最早向中国介绍西方进化论的是____,严复,梁启超,康有为,谭嗣同
0,“大将筹边尚未还,湖湘子弟满天山。新栽杨柳三千里,引得春风度玉关。”这首诗颂扬了一位清代名将率军收复新疆、治理边疆的业绩。这位名将是____。,林则徐,左宗棠,邓世昌,丁汝昌
0,巴黎公社为国际社会主义运动提供的最主要的经验是:____,同强大的敌人勇于斗争,联合农民建立工农联盟,用暴力手段推翻资产阶级统治,建立无产阶级专政,清除败类,纯洁革命队伍
0,“五证合一、一照一码”是指企业进行登记后,由市场监督管理部门核发一个加载法人和其他组织____营业执照的登记制度。,组织机构代码,统一社会信用代码,纳税人识别号,社会保险代码
0,下列不属于动作要素的内容是____,动作轨迹,动作时间,动作速度,动作方向
0,下列关于资本结构理论的说法中,不正确的是____。,代理理论、权衡理论、有企业所得税条件下的MM理论,都认为企业价值与资本结构有关,按照优序融资理论的观点,考虑信息不对称和逆向选择的影响,管理者偏好首选留存收益筹资,然后是发行新股筹资,最后是债务筹资,权衡理论是对有企业所得税条件下的MM理论的扩展,代理理论是对权衡理论的扩展
0,《三滴血》是____新戏。,京剧,昆曲,秦腔,河北梆子
0,"对于以下结构定义,++p->str中的++加在____
struct{
int len;
char*str;
}*P;",指针 p 上,指针 str 上,str 指的内容上,语法错误
0,实施互利共赢的开放战略符合世界各国同舟共济的客观要求。确立互利共赢的思想基础是____,全球化意识,现代化意识,历史意识,“命运共同体”意识
0,圆锥的底面半径为2,高为4.一个圆柱的下底面在圆锥的底面上,上底面的圆周在圆锥的侧面上,当圆柱侧面积为$4 \pi$时,该圆柱的体积为____,$\pi$,$2\pi$,$3\pi$,$4\pi$
0,下列制定城镇体系规划的基本程序中,叙述不正确的是____。,经审批机关批准同意修编,开展规划编制的组织工作,规划草案公告20日以上,组织编制单位采取论证会、听证会,组织编制机关委托具有相应资质等级的单位承担具体编制工作,在政府审查基础上,报请本级人民代表大会常务委员会审议
0,果戈理是俄国批判现实主义文学奠基人,其讽刺艺术被称为“含泪的笑”。他在其作品《死魂灵》中塑造的吝啬鬼形象是____,夏洛克,阿巴贡,葛朗台,泼留希金
0,宋真宗赵恒御笔作“劝学篇”,有“书中自有黄金屋”流布天下几近千年。黄金榜求龙头望,成了书生实现人生价值的“自古华山一条道”。当“黄金屋”成为读书的唯一价值取向时,很容易将一个民族的思维纳入功利和实用的框架,看似加速了现代化的进程,实则开了历史的倒车。借书籍端正人生要义,“正其谊不谋其利,明其道不计其功”才是书的王道。这段文字要表达的意思是____。,人们对于读书的理解,不能让功利的目的占了上风,“书中自有黄金屋”是许多人读书的目的和追求,读书就是读书,何必附加上那么多的东西,读书可以启智明理修身养性,也可以蒙蔽心智唯利是图
0,通才教育是一种____,注重理智的培养和情感的陶冶。,通识教育,全才教育,素质教育,英才教育
0,求不定积分:$\int{\frac{x^2+1}{x^4+1}}\mathrm{d}x=$____,$\dfrac{\sqrt{3}}{2}\arctan\left(\dfrac{x-\frac{1}{x}}{\sqrt{2}}\right)+C$,$\dfrac{\sqrt{3}}{3}\arctan\left(\dfrac{x-\frac{1}{x}}{\sqrt{3}}\right)+C$,$\dfrac{\sqrt{2}}{2}\arctan\left(\dfrac{x+\frac{1}{x}}{\sqrt{3}}\right)+C$,$\dfrac{\sqrt{2}}{2}\arctan\left(\dfrac{x-\frac{1}{x}}{\sqrt{2}}\right)+C$
0,有关涉外仲裁协议的效力问题,下列表述不正确的是:____,约定的仲裁事项超出法律规定的范围的,仲裁协议无效,如果仲裁协议对仲裁事项和仲裁委员会约定不明确,当事人不能达成补充协议的,该仲裁协议无效,当事人约定两个或两个以上的仲裁机构进行仲裁的,该仲裁协议无效,当事人达成的仲裁协议只规定了仲裁地点,未约定仲裁机构,双方当事人在补充协议中选定了在该地点依法重新组建的仲裁机构的,仲裁协议有效
0,急性肝淤血的病理变化有____,肝细胞脂肪变性,肝小叶中央静脉和肝窦扩张,肝细胞胞质可见多个脂肪空泡,槟榔肝
0,当a1≠a2时,CES生产函数的替代弹性为____。,1-ρ,1/(1-ρ),ρ/(1-ρ),1
0,计算机网络的资源主要是指____。,服务器、路由器、通信线路与用户计算机,计算机操作系统、数据库与应用软件,计算机硬件、软件与数据,Web服务器、数据库服务器与文件服务器
0,校准测量能力是指通常提供给用户的____,它用包含因子k=2的扩展不确定度表示。,测量方法的误差,标准器具的误差,最高的校准测量水平,以上都不对
0,我们现阶段的奋斗纲领是____,建设中国特色社会主义,建立社会主义和谐社会,向共产主义社会过渡,全面建设小康社会
0,有下列 5 种物质:①$C_2H_4$ 和 $H_2$,②$H_2$ 和 $Cl_2$,③$CH_4$ 和 $Cl_2$,④Agl,⑤浓 $HNO_3$。在光照下能发生反应的是____,①②③④,②③④⑤,①③④⑤,①②④⑤
0,____被誉为加拿大的“西部天堂”。,温哥华,魁北克市,多伦多,渥太华
0,“一山有四季,十里不同天”说明气温的分布深受什么的影响____,纬度位置,海陆位置,人类活动,海拔高度
0,理性认识的三种形式是____,感觉、知觉、表象,概念、判断、推理,实践、认识、再实践,抽象、具体、再抽象
0,甲(女)与乙(男)婚后购买住房一套,并签订协议:“乙应忠诚于甲,如因其婚外情离婚,该住房归甲所有。”后甲以乙与第三者的QQ聊天记录为证据,诉其违反忠诚协议。法官认为,该协议系双方自愿签订,不违反法律禁止性规定,故合法有效。经调解,两人离婚,住房归甲。对此,下列表述正确的是____。,该协议仅具有道德上的约束力,当事人的意思表示不能仅被看作是一种内心活动,而应首先被视为可能在法律上产生后果的行为,法官对协议的解释具有法律约束力,甲、乙签订的忠诚协议并非法律,遵守该约定不属于守法行为
0,下列四个判断中,不正确的是____,0既不是正数也不是负数,0的倒数是0,0的相反数是 0,零是绝对值最小的有理数
0,用于确定字符串模式的一个规则集称为____。,字符串匹配,正则表达式,文件名匹配,过滤器
0,"春花、夏雨、秋叶、冬雪,配上红墙琉璃瓦,紫禁城的四时皆为诗意美景。近日“二十四节气时光皮肤”【故宫·岁时】上线,该产品将故宫的一年四季与键盘背景绑定,键盘不仅能跟随时间变化,还可展示故宫节气的文化、礼俗、习俗等。这种别具一格的键盘皮肤完全符合年轻人的个性化表达方式,为用户带来更加极致化的输入体验与文化加持。总之,紫禁城的寒来暑往在用户每一次敲击键盘下更加深入于心,可见____
①文化美学必须依托科技创新
②文化传播可以用新的形式呈现
③文化传承以满足个性化的文化体验为目的
④文化情怀的营造不仅关注内容也要兼顾形式",①②,①③,②④,③④
0,山西能源资源丰富,但经济在全国中的地位并不领先,其中原因有____,能源开发规模不大,我国能源需求量不大,能源丰富难以发挥出优势,石油取代煤成为我国主要能源,煤炭开采开工不足,结构单一,生产链简短,能源的综合利用程度和附加价值不高
0,设$X$服从几何分布,$P(X=1)=0.6$,则$P(X=4\mid X>2)=$____,0.5,0.24,0.36,0.16
0,下列关于DNA的双螺旋二级结构稳定的因素,不正确的是____。,"3 "" ,5 "" -磷酸二酯键",互补碱基对之间的氢键,碱基堆积力,磷酸基团上的负电荷与介质中的阳离子之间形成的离子键
0,梅毒的病原体是____,病毒,细菌,螺旋体,支原体
0,不对称运行时负序电流在气隙中产生反转的旋转磁场,使转子带来了额外的损耗,造成转子温度____。,升高,降低,不变,先升高后降低
0,钛与热浓盐酸反应,产物之一为____,Ti(Ⅰ),Ti(Ⅱ),Ti(Ⅲ),Ti(Ⅳ)
0,在代数系统中,整环和域的关系是____,整环一定是域,域不一定是整环,域一定是整环,域一定不是整环
0,当照射光的波长从400nm变到300nm时,对同一金属,在光电效应实验中测得的遏止电压将(普朗克常量h=6.63×10^{-34}J·s,基本电荷e=1.60×10^{-19}C)____,减小0.56V,减小0.34V,增大0.165V,增大1.035V
0,根据《社会生活环境噪声排放标准》(GB22337—2008),在社会生活环境噪声排放源测点布设时,当边界无法测量到声源的实际排放状况时(如声源位于高空、边界设有声屏障等),除按一般规定设置测点外,同时在受影响的噪声敏感建筑物____处另设测点。,户内1m处,户外1.5m处,户外2m处,户外1m处
0,最近一项研究发现,海水颜色能够让飓风改变方向,也就是说,如果海水变色,飓风的移动路径也会变向。这也就意味着科学家可以根据海水的“脸色”判断哪些地方将被飓风袭击,哪些地区会幸免于难。值得关注的是,全球气候变暖可能已经让海水变色。以下哪项最可能是科学家作出判断所依赖的前提?____,海水颜色与飓风移动路径之间存在某种相对确定的联系。,海水温度升高会导致生成的飓风数量增加。,海水温度变化与海水颜色变化之间的联系尚不明确。,全球气候变暖是最近几年飓风频发的重要原因之一。
0,宋代在“十二经”的基础上加入了____,成了后人所说的“十三经”。,《孟子》,《孝经》,《论语》,《尚书》
0,企业缴纳的各种税金中,不影响企业当期损益的有____。,消费税,印花税,增值税,所得税
0,下列各句中,没有语病的一项是____,产业是会展的基础,会展是产业的舞台。杭州文博会不仅成为推动社会和经济效益“双丰收”的重要平台,更成为杭州文化的“秀场”搭建了世界各地多元文化交流的桥梁。,专家表示,中国与东盟国家间紧密的贸易往来,对于推进区域可持续发展起着重要的作用。期待双方紧抓机遇,顺利推动2020年双向贸易额突破1万亿美元等愿景的实现。,围棋作为文明交流的使者,五大洲越来越多的爱好者汇聚在一起。围棋蕴含着中华文明的智慧,为世界不同文明交流互鉴提供了生动的内容,它正以包容的姿态联接起世界。,针对近期出现的游客乱扔垃圾、乱敲熔岩、乱涂乱画的不文明行为,双龙洞景区采取有效的办法,雇用了多名退休的阿姨做景区管理员,让她们阻止游客不文明行为的影响。
0,小麦粒色受独立遗传的三对基因A/a、B/b、C/c-控制.A、B和C决定红色,每个基因对粒色增加效应相同且具叠加性,a、b和c决定白色.将粒色最浅和最深的植株杂交得到F1.Fl的自交后代中,与基因型为Aabbcc的个体表现型相同的概率是____,$1/64$,$3/32$,$15/64$,$5/16$
0,垃圾分类投放已成为新时尚。下列四种生活垃圾不能投放到可回收垃圾桶内的是____,玻璃啤酒瓶,废弃水果皮,塑料饮料瓶,废旧报纸
0,"近日,一场扫黑除恶专项斗争在泰州市全面展开。依法严厉打击黑恶势力____
①有利于维护社会公平正义 ②有利于保障人民安居乐业
③事关社会稳定和国家安全 ④能彻底根除社会巨大毒瘤",①②③,②③④,①③④,①②④
0,组成一个运算器需要多个部件,但下面所列____不是组成运算器的部件。,通用寄存器组,数据总线,ALU,地址寄存器
0,关于信息的传递,下列说法正确的是____,北斗卫星定位系统可提供全天候即时定位服务,5G网络通信主要是利用光导纤维传递信息的,手机话筒的主要作用是把声音信号变成恒定电流,电磁波只能传递声音信号,不能传递图像信号
0,在平坦的垒球运动场上,击球手挥动球棒将垒球水平击出,垒球飞行一段时间后落地。若不计空气阻力,则____,垒球落地时瞬间速度的大小仅由初速度决定,垒球落地时瞬时速度的方向仅由击球点离地面的高度决定,垒球在空中运动的水平位移仅由初速度决定,垒球在空中运动的时间仅由击球点离地面的高度决定
0,按照抗菌药物药代动力学和药效动力学理论(PK/PD),下列哪类属于浓度依赖性抗菌药物____,青霉素类,大环内酯类,碳青霉烯类,氟喹诺酮类
0,堆放时间较长的蔬菜会发热,那么与蔬菜细胞产热直接相关的主要作用是____,蒸腾作用,呼吸作用,光合作用,运输作用
0,某企业发生一起火灾事故,导致3人死亡,10人轻伤,则该起火灾为____火灾。,一般,较大,重大,特别重大
0,生物体内影响衰老的自由基如羟自由基(·OH)、烃氧基(RO·)、超氧阴离子自由基(O2-)、超氧物自由基(HOO·和ROO·)、单线态氧($1O_2·$)等,由于它们均含氧,且比氧更活泼,又被称为____。,氧化态,氧化物,氧离子,活性氧
ByteMLPerf/byte_infer_perf/llm_perf/datasets/test_mini.csv
id,question,A,B,C,D
0,最早向中国介绍西方进化论的是____,严复,梁启超,康有为,谭嗣同
ByteMLPerf/byte_infer_perf/llm_perf/launch.py
# Copyright 2023 ByteDance and/or its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import random
import argparse
import subprocess
import json
import pathlib

import multiprocessing as mp
import signal
from typing import Any, Dict, Iterable, List

import traceback

# ${prj_root}/
BYTE_MLPERF_ROOT = pathlib.Path(__file__).parents[1]
LLM_PERF_ROOT = BYTE_MLPERF_ROOT.joinpath("llm_perf")

os.chdir(BYTE_MLPERF_ROOT)
sys.path.insert(0, BYTE_MLPERF_ROOT.__str__())

from llm_perf.benchmark.bench import benchmark
from llm_perf.utils.logger import logger, setup_logger
from llm_perf.utils.reporter import Reporter, ReportType


class PerfEngine:
    def __init__(self, hardware, task, host, port) -> None:
        super().__init__()

        self.backend_type = hardware
        self.task = task
        self.host = host
        self.port = port

        self.result_queue = mp.Queue()
        self.jobs: List[mp.Process] = []
        self.server_process = None
        self.version = self.get_version()

    def __del__(self):
        self.stop_server()

    def get_version(self):
        version = ""
        try:
            version_file = os.path.join(str(BYTE_MLPERF_ROOT), "../VERSION")
            with open(version_file) as f:
                _version = f.read().splitlines()
            version = '.'.join(v.split('=')[1] for v in _version)
        except Exception as e:
            traceback.print_exc()
            logger.warning(f"get bytemlperf version failed, error msg: {e}")
        return version

    def start_engine(self) -> None:
        # load workload
        workload = load_workload(self.task)

        model_name = workload["model"]
        min_tp_size = int(workload["min_tp_size"])

        test_accuracy = bool(workload["test_accuracy"]) if "test_accuracy" in workload else False
        test_perf = bool(workload["test_perf"]) if "test_perf" in workload else False

        if not any([test_perf, test_accuracy]):
            logger.info(f"End of the llm_perf, enable at least one test item")
            return

        # download model parameter and golden outputs
        download_cmd = f"python3 llm_perf/prepare_model.py --task {self.task} --download_model"
        if test_accuracy:
            download_cmd += " --download_baseline"
        subprocess.run(download_cmd, shell=True)

        # create and start reporter
        self.reporter = Reporter(
            task=self.task,
            backend=self.backend_type,
            tp_size=min_tp_size,
            batch_size=1,
            input_tokens=1024,
            min_new_tokens=1,
            max_new_tokens=512,
            test_perf=test_perf,
            test_accuracy=test_accuracy,
            version=self.version,
        )
        self.reporter.start()

        if test_accuracy:
            accuracy_config = workload["accuracy_config"]

            logger.info("start test accuracy.")
            logger.info(f"using tp_size={min_tp_size}")
            logger.info(f"using batch_size=1")

            self.run_perf(accuracy_config, min_tp_size, 1, 1024, ReportType.ACCURACY)

        if test_perf:
            perf_config = workload["perf_config"]

            test_tp_sizes = []
            for tp_size in perf_config["tp_sizes"]:
                if tp_size >= min_tp_size:
                    test_tp_sizes.append(tp_size)

            test_batch_sizes = perf_config["batch_sizes"]
            test_input_tokens = perf_config["input_tokens"]

            logger.info("start test performance.")
            logger.info(f"tp_sizes list: {test_tp_sizes}")
            logger.info(f"batch_sizes list: {test_batch_sizes}")
            logger.info(f"input_tokens list: {test_input_tokens}")

            for tp_size in test_tp_sizes:
                for batch_size in test_batch_sizes:
                    for input_tokens in test_input_tokens:
                        print("*" * 150)
                        print(f"using tp_size={tp_size}, batch_size={batch_size}, input_tokens={input_tokens}")
                        print("*" * 150)
                        self.run_perf(
                            perf_config,
                            tp_size,
                            batch_size,
                            input_tokens,
                            ReportType.PERFORMANCE,
                        )
                        print("\n\n\n")

        self.reporter.stop()
        self.reporter.summary()

    def run_perf(
        self,
        workload: Dict[str, Any],
        tp_size: int,
        batch_size: int,
        input_tokens: int,
        report_type: ReportType,
    ) -> None:
        # 1. Start server
        self.start_server(tp_size, batch_size)

        # 2. Benchmark clients
        self.start_benchmark(workload, batch_size, input_tokens, report_type)

        # 3. Get result
        alive_clients = batch_size if report_type == ReportType.PERFORMANCE else 1
        started: bool = False
        while alive_clients:
            result = self.result_queue.get()
            if isinstance(result, str) and result == "@start":
                if not started:
                    # Reset reporter mate information
                    self.reporter.update_meta(tp_size, batch_size, input_tokens)
                    started = True
                continue
            elif result is None:
                alive_clients = alive_clients - 1
                continue
            self.reporter.submit(result, report_type)

        # 4. Join benchmark client process
        for p in self.jobs:
            p.join()

        # 5. Kill server process
        self.stop_server()

    def start_server(self, tp_size: int, batch_size: int):
        fifo_name = "./server_fifo"
        try:
            os.mkfifo(fifo_name)
        except FileExistsError:
            logger.debug(f"{fifo_name} already exist")

        # create server
        command = [
            "python3", "llm_perf/server/launch_server.py",
            "--model_config", "llm_perf/model_zoo/" + self.task + ".json",
            "--hardware_type", self.backend_type,
            "--tp_size", str(tp_size),
            "--max_batch_size", str(batch_size),
            "--port", str(self.port)
        ]
        logger.info(f"Start Server: {' '.join(command)}")
        self.server_process = subprocess.Popen(command, start_new_session=True)

        # wait until server is ready
        with open(fifo_name, "r") as fifo_fd:
            while True:
                data = fifo_fd.readline().strip()
                if data == "Server Ready":
                    break
        os.remove(fifo_name)
        logger.info("Server Ready")

    def stop_server(self):
        if self.server_process and self.server_process.poll() is None:
            logger.info("stopping server process")
            os.killpg(os.getpgid(self.server_process.pid), signal.SIGTERM)
            try:
                self.server_process.wait(timeout=5)
                logger.info("server process has stopped")
            except subprocess.TimeoutExpired:
                os.killpg(os.getpgid(self.server_process.pid), signal.SIGKILL)
                logger.info("server process force killing")
        else:
            # logger already exit
            print(f"server process already exit with {self.server_process.poll()}")

    # launch clients threads
    def start_benchmark(
        self,
        workload: Dict[str, Any],
        batch_size: int,
        input_tokens: int,
        report_type: ReportType,
    ):
        clients = 1 if report_type == ReportType.ACCURACY else batch_size
        sleep_units = [i for i in range(batch_size)]
        random.shuffle(sleep_units)
        for i in range(clients):
            p = mp.Process(
                target=benchmark,
                args=(
                    i,
                    sleep_units[i],
                    workload,
                    report_type,
                    input_tokens,
                    self.result_queue,
                    self.host,
                    self.port
                ),
            )
            self.jobs.append(p)
            p.start()


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--hardware_type",
        type=str,
        default="GPU",
        help="The backend going to be evaluted, refs to backends/",
    )
    parser.add_argument(
        "--task",
        type=str,
        default="chatglm2-torch-fp16-6b",
        help="The task going to be evaluted, refs to workloads/",
    )
    parser.add_argument("--host", type=str, default="127.0.0.1", help="Host for the gRPC server")
    parser.add_argument("--port", type=int, default=51000, help="port of the server")
    parser.add_argument("--log_level", type=str, default=os.environ.get("LOG_LEVEL", "info"), help="log level")
    args = parser.parse_args()
    return args


def load_workload(task: str) -> Dict[str, Any]:
    """
    Return a list of dictionary with model Configuration

    Args: List[str]

    Returns: List[dic]
    """
    modules_dir = LLM_PERF_ROOT.joinpath("workloads")

    workload_dict = None
    for filepath in modules_dir.iterdir():
        if filepath.suffix == ".json" and filepath.stem == task:
            with open(filepath) as file:
                workload_dict = json.load(file)
            break
    if workload_dict is None:
        logger.error(f"Task name: {task} was not found, please check your task name")
        exit(-1)
    return workload_dict


if __name__ == "__main__":
    args = parse_args()

    hardware = args.hardware_type
    task = args.task
    host = args.host
    port = args.port

    setup_logger(args.log_level)

    logger.info(f"hardware: {hardware}")
    logger.info(f"task: {task}")
    logger.info(f"host: {host}")
    logger.info(f"port: {port}")

    instance = PerfEngine(hardware, task, host, port)
    instance.start_engine()
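Based on the argument parser above, a typical invocation (run from the byte_infer_perf directory, since the launcher chdir's to the project root itself) would be something like `python3 llm_perf/launch.py --hardware_type GPU --task chatglm2-torch-fp16-6b --port 51000`; the engine then downloads the model via prepare_model.py, starts the inference server, and spawns the benchmark client processes.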
ByteMLPerf/byte_infer_perf/llm_perf/model_zoo/README.md
### Meaning of the json file name

Take mixtral-torch-bf16-8x22b as an example:
- **mixtral** is the model type; other variants of the model type may exist.
- **torch** is the model framework; we only use torch as the inference framework and prefer the Hugging Face model format.
- **bf16** is the source data type of the model weights.
- **8x22b** is the model size: 8 is the number of experts, 22b is the size of each expert.

### How to create a new json file

Every json file defines an individual LLM, and the name of the json file serves as its unique identifier.

Each json file contains the following info (see the skeleton below):
- model_name: **mixtral**, for example.
- model_path: path to the model, defaults to "llm_perf/model_zoo/sota/`repo_name`".
- model_interface: model module name.
- tokenizer: tokenizer info.
- network: model config.
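A minimal skeleton of such a json file, following the fields listed above; the values are placeholders taken from the mixtral config in this commit, and `<repo_name>` must be replaced with a real repository directory:

```json
{
    "model_name": "mixtral",
    "model_path": "llm_perf/model_zoo/sota/<repo_name>",
    "model_interface": "MixtralForCausalLM",
    "tokenizer": {
        "path": "llm_perf/model_zoo/sota/<repo_name>"
    },
    "network": {
        "model_type": "mixtral",
        "hidden_size": 6144,
        "num_hidden_layers": 56,
        "num_attention_heads": 48,
        "torch_dtype": "bfloat16"
    }
}
```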
ByteMLPerf/byte_infer_perf/llm_perf/model_zoo/chatglm2-torch-fp16-6b.json
{
    "model_name": "chatglm2",
    "model_path": "llm_perf/model_zoo/sota/chatglm2-6b",
    "model_interface": "ChatGLMForConditionalGeneration",
    "tokenizer": {
        "path": "llm_perf/model_zoo/sota/chatglm2-6b",
        "support_chn": true
    },
    "network": {
        "_name_or_path": "THUDM/chatglm2-6b",
        "model_type": "chatglm",
        "architectures": ["ChatGLMModel"],
        "auto_map": {
            "AutoConfig": "configuration_chatglm.ChatGLMConfig",
            "AutoModel": "modeling_chatglm.ChatGLMForConditionalGeneration",
            "AutoModelForCausalLM": "modeling_chatglm.ChatGLMForConditionalGeneration",
            "AutoModelForSeq2SeqLM": "modeling_chatglm.ChatGLMForConditionalGeneration",
            "AutoModelForSequenceClassification": "modeling_chatglm.ChatGLMForSequenceClassification"
        },
        "add_bias_linear": false,
        "add_qkv_bias": true,
        "apply_query_key_layer_scaling": true,
        "apply_residual_connection_post_layernorm": false,
        "attention_dropout": 0,
        "attention_softmax_in_fp32": true,
        "bias_dropout_fusion": true,
        "ffn_hidden_size": 13696,
        "fp32_residual_connection": false,
        "hidden_dropout": 0,
        "hidden_size": 4096,
        "kv_channels": 128,
        "layernorm_epsilon": 0.00001,
        "multi_query_attention": true,
        "multi_query_group_num": 2,
        "num_attention_heads": 32,
        "num_layers": 28,
        "original_rope": true,
        "padded_vocab_size": 65024,
        "post_layer_norm": true,
        "rmsnorm": true,
        "seq_length": 32768,
        "use_cache": true,
        "torch_dtype": "float16",
        "transformers_version": "4.27.1",
        "tie_word_embeddings": false,
        "eos_token_id": 2,
        "pad_token_id": 0
    }
}
ByteMLPerf/byte_infer_perf/llm_perf/model_zoo/falcon-torch-bf16-180b.json
0 → 100644
View file @
24b257f1
{
    "model_name": "falcon",
    "model_path": "llm_perf/model_zoo/sota/falcon-180b",
    "model_interface": "FalconForCausalLM",
    "tokenizer": {
        "path": "llm_perf/model_zoo/sota/falcon-180b"
    },
    "network": {
        "alibi": false,
        "architectures": [
            "FalconForCausalLM"
        ],
        "attention_dropout": 0.0,
        "bias": false,
        "bos_token_id": 11,
        "eos_token_id": 11,
        "hidden_dropout": 0.0,
        "hidden_size": 14848,
        "initializer_range": 0.02,
        "layer_norm_epsilon": 1e-05,
        "model_type": "falcon",
        "multi_query": true,
        "new_decoder_architecture": true,
        "num_attention_heads": 232,
        "num_hidden_layers": 80,
        "num_kv_heads": 8,
        "parallel_attn": true,
        "torch_dtype": "bfloat16",
        "transformers_version": "4.32.0",
        "use_cache": true,
        "vocab_size": 65024
    }
}
\ No newline at end of file
ByteMLPerf/byte_infer_perf/llm_perf/model_zoo/llama3-torch-bf16-70b.json
0 → 100644
View file @
24b257f1
{
    "model_name": "llama3",
    "model_path": "llm_perf/model_zoo/sota/llama3-70b",
    "model_interface": "LlamaForCausalLM",
    "tokenizer": {
        "path": "llm_perf/model_zoo/sota/llama3-70b",
        "support_chn": true,
        "apply_chat_template": true
    },
    "network": {
        "architectures": [
            "LlamaForCausalLM"
        ],
        "attention_bias": false,
        "attention_dropout": 0.0,
        "bos_token_id": 128000,
        "eos_token_id": 128009,
        "hidden_act": "silu",
        "hidden_size": 8192,
        "initializer_range": 0.02,
        "intermediate_size": 28672,
        "max_position_embeddings": 8192,
        "model_type": "llama",
        "num_attention_heads": 64,
        "num_hidden_layers": 80,
        "num_key_value_heads": 8,
        "pretraining_tp": 1,
        "rms_norm_eps": 1e-05,
        "rope_scaling": null,
        "rope_theta": 500000.0,
        "tie_word_embeddings": false,
        "torch_dtype": "bfloat16",
        "transformers_version": "4.40.0",
        "use_cache": true,
        "vocab_size": 128256
    }
}
\ No newline at end of file
ByteMLPerf/byte_infer_perf/llm_perf/model_zoo/mixtral-torch-bf16-8x22b.json
0 → 100644
View file @
24b257f1
{
    "model_name": "mixtral",
    "model_path": "llm_perf/model_zoo/sota/mixtral-8x22b-instruct",
    "model_interface": "MixtralForCausalLM",
    "tokenizer": {
        "path": "llm_perf/model_zoo/sota/mixtral-8x22b-instruct",
        "apply_chat_template": true
    },
    "network": {
        "architectures": [
            "MixtralForCausalLM"
        ],
        "attention_dropout": 0.0,
        "bos_token_id": 1,
        "eos_token_id": 2,
        "hidden_act": "silu",
        "hidden_size": 6144,
        "initializer_range": 0.02,
        "intermediate_size": 16384,
        "max_position_embeddings": 65536,
        "model_type": "mixtral",
        "num_attention_heads": 48,
        "num_experts_per_tok": 2,
        "num_hidden_layers": 56,
        "num_key_value_heads": 8,
        "num_local_experts": 8,
        "output_router_logits": false,
        "rms_norm_eps": 1e-05,
        "rope_theta": 1000000.0,
        "router_aux_loss_coef": 0.001,
        "sliding_window": null,
        "tie_word_embeddings": false,
        "torch_dtype": "bfloat16",
        "transformers_version": "4.38.0",
        "use_cache": true,
        "vocab_size": 32768
    }
}
\ No newline at end of file
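The four JSON files above follow the layout described in model_zoo/README.md. A minimal, illustrative reader (not the loader used by llm_perf itself) that pulls out the fields a benchmark run needs:

import json
import pathlib

# Illustrative only; the path assumes the repository root as the working directory.
cfg = json.loads(pathlib.Path("llm_perf/model_zoo/chatglm2-torch-fp16-6b.json").read_text())
print(cfg["model_name"])                 # "chatglm2"
print(cfg["model_path"])                 # where prepare_model.py places the downloaded weights
print(cfg["network"]["num_layers"])      # 28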
ByteMLPerf/byte_infer_perf/llm_perf/prepare_model.py
0 → 100644
View file @
24b257f1
import os
import sys
import pathlib
import argparse
import subprocess

# ${prj_root}/
BYTE_MLPERF_ROOT = pathlib.Path(__file__).parents[1].absolute()
LLM_PERF_ROOT = BYTE_MLPERF_ROOT.joinpath("llm_perf")

# task name -> (local model dir name, Hugging Face repo id)
task_map = {
    "chatglm2-torch-fp16-6b": ("chatglm2-6b", "THUDM/chatglm2-6b"),
    "llama3-torch-bf16-70b": ("llama3-70b", "shenzhi-wang/Llama3-70B-Chinese-Chat"),
    "falcon-torch-bf16-180b": ("falcon-180b", "tiiuae/falcon-180B"),
    "mixtral-torch-bf16-8x22b": ("mixtral-8x22b-instruct", "mistralai/Mixtral-8x22B-Instruct-v0.1"),
}

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--task", type=str, default="chatglm2-torch-fp16-6b")
    parser.add_argument("--download_model", action="store_true")
    parser.add_argument("--download_baseline", action="store_true")
    args = parser.parse_args()

    os.chdir(LLM_PERF_ROOT)

    task_name = args.task
    if task_name not in task_map:
        print(f"task {task_name} not found, please check your task name")
        sys.exit(-1)
    model_name = task_map[task_name][0]
    model_repo_name = task_map[task_name][1]

    download_path = LLM_PERF_ROOT.joinpath("download")
    download_path.mkdir(parents=True, exist_ok=True)

    # Download model weights from Hugging Face into model_zoo/sota/<model_name>.
    if args.download_model:
        sota_model_path = LLM_PERF_ROOT.joinpath("model_zoo", "sota")
        sota_model_path.mkdir(parents=True, exist_ok=True)
        model_path = sota_model_path.joinpath(model_name)
        if model_path.exists():
            print(f"model {model_name} already exists, skip downloading model.")
        else:
            print(f"downloading model {model_name}")
            subprocess.run(
                f"huggingface-cli download --local-dir {model_path} {model_repo_name}",
                shell=True,
                check=True,
            )

    # Download and extract the GPU baseline reports into reports/base/<task_name>.
    if args.download_baseline:
        gpu_baseline_path = LLM_PERF_ROOT.joinpath("reports", "base")
        gpu_baseline_path.mkdir(parents=True, exist_ok=True)

        tar_file_name = f"reports_gpu_{task_name}.tar.gz"
        src_path = f"https://lf-bytemlperf.17mh.cn/obj/bytemlperf-zoo/llm/{tar_file_name}"
        dst_path = download_path.joinpath(tar_file_name)
        if dst_path.exists():
            print(f"baseline {model_name} already exists, skip downloading baseline.")
        else:
            print(f"downloading baseline {model_name}")
            subprocess.run(f"wget -O {dst_path} {src_path}", shell=True, check=True)

        base_path = gpu_baseline_path.joinpath(task_name)
        if base_path.exists():
            print(f"baseline {model_name} already exists, skip extracting baseline.")
        else:
            print(f"extracting baseline {model_name}")
            subprocess.run(f"tar -xzvf {dst_path} -C {gpu_baseline_path}", shell=True, check=True)
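prepare_model.py shells out to huggingface-cli and wget. As a hedged alternative for the model-download branch, the same result can usually be obtained with the huggingface_hub Python API (assumed to be installed; it is not listed in requirements.txt):

from huggingface_hub import snapshot_download

# Mirrors the --download_model branch for a single task; values are illustrative.
model_name, repo_id = "chatglm2-6b", "THUDM/chatglm2-6b"
snapshot_download(repo_id=repo_id, local_dir=f"llm_perf/model_zoo/sota/{model_name}")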
ByteMLPerf/byte_infer_perf/llm_perf/requirements.txt
0 → 100644
View file @
24b257f1
grpcio
protobuf>=3.20.3,<4.0.0
black==23.1.0
isort
sentencepiece
pandas
google-api-python-client
transformers==4.40.0
tqdm
matplotlib
backoff
psutil
accelerate
prettytable
\ No newline at end of file
ByteMLPerf/byte_infer_perf/llm_perf/script/extra_datasets.py
0 → 100644
View file @
24b257f1
import os

import pandas as pd

"""
Take the first question from each of the 52 per-project CSV files under
llm_perf/datasets/test/ and merge them into a single CSV.
"""

dir = "llm_perf/datasets/test/"
filenames = [
    os.path.join(dir, f)
    for f in os.listdir(dir)
    if os.path.isfile(os.path.join(dir, f)) and f.endswith(".csv")
]
# print(filenames)

rows = []
for filename in filenames:
    df = pd.read_csv(filename)
    rows.append(list(df.iloc[0]))
# print(rows)

result = pd.DataFrame(rows, columns=df.columns)
save_dir = "llm_perf/datasets/"
result.to_csv(f"{save_dir}/merged_52_test.csv", index=False)
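A quick sanity check on the merged output (illustrative; assumes the script above has been run from the repository root):

import pandas as pd

# Each source CSV contributes exactly its first row, so the merged file should
# contain one row per input CSV.
merged = pd.read_csv("llm_perf/datasets/merged_52_test.csv")
print(len(merged), list(merged.columns))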
ByteMLPerf/byte_infer_perf/llm_perf/script/lint.sh
0 → 100644
View file @
24b257f1
#!/bin/bash

python3 -m isort llm_perf -s model_impl -s model_zoo
python3 -m black llm_perf --extend-exclude "model_impl|model_zoo"
\ No newline at end of file
ByteMLPerf/byte_infer_perf/llm_perf/script/proto.sh
0 → 100644
View file @
24b257f1
#!/bin/bash

python3 -m grpc_tools.protoc -I ./llm_perf --python_out=./llm_perf --grpc_python_out=./llm_perf ./llm_perf/server.proto
\ No newline at end of file
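For context, this protoc invocation writes the generated Python modules into ./llm_perf; by protoc's standard naming convention they can then be imported as sketched below (assuming the default module names for server.proto):

# Generated by script/proto.sh; module names follow protoc's <name>_pb2 convention.
from llm_perf import server_pb2, server_pb2_grpc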