Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinilm
Commits
1c710c19
Unverified
Commit
1c710c19
authored
Oct 29, 2025
by
PanZezhong1725
Committed by
GitHub
Oct 29, 2025
Browse files
Merge pull request #65 from InfiniTensor/issue/64
issue/64 - jiuge.py verbose output
parents
80909bee
7a412203
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
148 additions
and
16 deletions
+148
-16
scripts/jiuge.py
scripts/jiuge.py
+148
-16
No files found.
scripts/jiuge.py
View file @
1c710c19
...
@@ -123,7 +123,11 @@ class JiugeMetaFromLlama(JiugeMetaCStruct):
...
@@ -123,7 +123,11 @@ class JiugeMetaFromLlama(JiugeMetaCStruct):
if
"num_key_value_heads"
in
config
if
"num_key_value_heads"
in
config
else
config
[
"num_attention_heads"
]
else
config
[
"num_attention_heads"
]
),
),
dh
=
config
[
"head_dim"
]
if
"head_dim"
in
config
else
config
[
"hidden_size"
]
//
config
[
"num_attention_heads"
],
dh
=
(
config
[
"head_dim"
]
if
"head_dim"
in
config
else
config
[
"hidden_size"
]
//
config
[
"num_attention_heads"
]
),
di
=
config
[
"intermediate_size"
],
di
=
config
[
"intermediate_size"
],
dctx
=
(
dctx
=
(
config
[
"max_position_embeddings"
]
if
max_tokens
is
None
else
max_tokens
config
[
"max_position_embeddings"
]
if
max_tokens
is
None
else
max_tokens
...
@@ -533,9 +537,9 @@ class JiugeForCauslLM:
...
@@ -533,9 +537,9 @@ class JiugeForCauslLM:
else
:
else
:
raise
ValueError
(
"Unsupported model architecture"
)
raise
ValueError
(
"Unsupported model architecture"
)
if
"llama"
==
config
[
"model_type"
]:
if
"llama"
==
config
[
"model_type"
]:
from
tokenizers
import
decoders
as
_dec
from
tokenizers
import
decoders
as
_dec
backend
=
getattr
(
self
.
tokenizer
,
"backend_tokenizer"
,
None
)
backend
=
getattr
(
self
.
tokenizer
,
"backend_tokenizer"
,
None
)
target
=
getattr
(
backend
,
"_tokenizer"
,
backend
)
target
=
getattr
(
backend
,
"_tokenizer"
,
backend
)
norm
=
getattr
(
target
,
"normalizer"
,
None
)
norm
=
getattr
(
target
,
"normalizer"
,
None
)
...
@@ -545,11 +549,13 @@ class JiugeForCauslLM:
...
@@ -545,11 +549,13 @@ class JiugeForCauslLM:
has_prepend
=
"Prepend"
in
sn
has_prepend
=
"Prepend"
in
sn
has_strip
=
"Strip"
in
sd
has_strip
=
"Strip"
in
sd
if
has_prepend
and
has_strip
:
if
has_prepend
and
has_strip
:
target
.
decoder
=
_dec
.
Sequence
([
target
.
decoder
=
_dec
.
Sequence
(
[
_dec
.
Replace
(
"▁"
,
" "
),
_dec
.
Replace
(
"▁"
,
" "
),
_dec
.
ByteFallback
(),
_dec
.
ByteFallback
(),
_dec
.
Fuse
(),
_dec
.
Fuse
(),
])
]
)
load_end_time
=
time
.
time
()
load_end_time
=
time
.
time
()
print
(
f
"Time used:
{
load_end_time
-
load_start_time
:.
3
f
}
s"
)
print
(
f
"Time used:
{
load_end_time
-
load_start_time
:.
3
f
}
s"
)
...
@@ -599,7 +605,15 @@ class JiugeForCauslLM:
...
@@ -599,7 +605,15 @@ class JiugeForCauslLM:
)
)
return
list
(
output
)
return
list
(
output
)
def
generate
(
self
,
input_content
,
max_steps
,
topp_
=
1.0
,
topk_
=
1
,
temperature_
=
1.0
):
def
generate
(
self
,
input_content
,
max_steps
,
topp_
=
1.0
,
topk_
=
1
,
temperature_
=
1.0
,
verbose
=
False
,
):
input_content
=
self
.
tokenizer
.
apply_chat_template
(
input_content
=
self
.
tokenizer
.
apply_chat_template
(
conversation
=
[{
"role"
:
"user"
,
"content"
:
input_content
}],
conversation
=
[{
"role"
:
"user"
,
"content"
:
input_content
}],
add_generation_prompt
=
True
,
add_generation_prompt
=
True
,
...
@@ -620,9 +634,61 @@ class JiugeForCauslLM:
...
@@ -620,9 +634,61 @@ class JiugeForCauslLM:
steps
=
0
steps
=
0
total_time
=
0
total_time
=
0
prefill_time
=
0
decode_time
=
0
output_content
=
""
output_content
=
""
for
step_i
in
range
(
max_steps
):
# Prefill phase - process initial prompt
prefill_start_time
=
time
.
time
()
output_tokens
=
self
.
batch_infer_one_round
([
infer_task
])
prefill_end_time
=
time
.
time
()
prefill_time
=
prefill_end_time
-
prefill_start_time
steps
+=
1
output_str
=
self
.
tokenizer
.
decode
(
output_tokens
[
0
])
output_content
+=
output_str
print
(
output_str
,
end
=
""
,
flush
=
True
)
if
output_tokens
[
0
]
in
self
.
eos_token_id
:
# If generation ends after prefill, calculate metrics
total_time
=
prefill_time
total_tokens
=
len
(
tokens
)
+
1
# input tokens + first output token
print
(
"
\n
"
)
print
(
f
"Time per step:
{
total_time
*
1000
:.
3
f
}
ms"
)
if
verbose
:
overall_throughput
=
total_tokens
/
total_time
prefill_throughput
=
len
(
tokens
)
/
prefill_time
decode_throughput
=
1
/
0.001
# Avoid division by zero, use small value
print
(
"="
*
50
)
print
(
"PERFORMANCE METRICS"
)
print
(
"="
*
50
)
print
(
f
"Input tokens:
{
len
(
tokens
)
}
"
)
print
(
f
"Generated tokens: 1"
)
print
(
f
"Total tokens:
{
total_tokens
}
"
)
print
(
f
"Total time:
{
total_time
*
1000
:.
3
f
}
ms"
)
print
(
f
"Prefill time:
{
prefill_time
*
1000
:.
3
f
}
ms"
)
print
(
f
"Decode time: 0.000ms"
)
print
(
"-"
*
50
)
print
(
f
"Time per step:
{
total_time
*
1000
:.
3
f
}
ms"
)
print
(
f
"Avg prefill time per token:
{
prefill_time
*
1000
/
len
(
tokens
):.
3
f
}
ms"
)
print
(
f
"Avg decode time per token: N/A"
)
print
(
"-"
*
50
)
print
(
f
"Overall throughput:
{
overall_throughput
:.
2
f
}
tokens/s"
)
print
(
f
"Prefill throughput:
{
prefill_throughput
:.
2
f
}
tokens/s"
)
print
(
f
"Decode throughput: N/A"
)
print
(
"="
*
50
)
return
output_content
,
total_time
*
1000
infer_task
.
next
(
output_tokens
[
0
])
# Decode phase - generate subsequent tokens
decode_start_time
=
time
.
time
()
for
step_i
in
range
(
1
,
max_steps
):
start_time
=
time
.
time
()
start_time
=
time
.
time
()
output_tokens
=
self
.
batch_infer_one_round
([
infer_task
])
output_tokens
=
self
.
batch_infer_one_round
([
infer_task
])
end_time
=
time
.
time
()
end_time
=
time
.
time
()
...
@@ -638,12 +704,65 @@ class JiugeForCauslLM:
...
@@ -638,12 +704,65 @@ class JiugeForCauslLM:
if
step_i
>
0
:
if
step_i
>
0
:
total_time
+=
end_time
-
start_time
total_time
+=
end_time
-
start_time
decode_end_time
=
time
.
time
()
decode_time
=
decode_end_time
-
decode_start_time
print
(
"
\n
"
)
print
(
"
\n
"
)
avg_time
=
total_time
*
1000
/
(
steps
-
1
)
print
(
f
"Time per step:
{
avg_time
:.
3
f
}
ms"
)
# Calculate performance metrics
total_time
=
prefill_time
+
decode_time
input_tokens
=
len
(
tokens
)
generated_tokens
=
steps
# including first token from prefill
# Time per token calculations
avg_time_per_step
=
(
total_time
*
1000
/
(
steps
-
1
)
if
steps
>
1
else
total_time
*
1000
)
print
(
f
"Time per step:
{
avg_time_per_step
:.
3
f
}
ms"
)
# Only print detailed metrics if verbose flag is set
if
verbose
:
total_tokens
=
input_tokens
+
generated_tokens
# Throughput calculations
overall_throughput
=
total_tokens
/
total_time
# tokens per second
prefill_throughput
=
input_tokens
/
prefill_time
if
prefill_time
>
0
else
0
decode_throughput
=
(
(
generated_tokens
-
1
)
/
decode_time
if
decode_time
>
0
else
0
)
# exclude first token from prefill
# Time per token calculations
avg_prefill_time_per_token
=
(
prefill_time
*
1000
/
input_tokens
if
input_tokens
>
0
else
0
)
avg_decode_time_per_token
=
(
decode_time
*
1000
/
(
generated_tokens
-
1
)
if
generated_tokens
>
1
else
0
)
print
(
"="
*
50
)
print
(
"PERFORMANCE METRICS"
)
print
(
"="
*
50
)
print
(
f
"Input tokens:
{
input_tokens
}
"
)
print
(
f
"Generated tokens:
{
generated_tokens
}
"
)
print
(
f
"Total tokens:
{
total_tokens
}
"
)
print
(
f
"Total time:
{
total_time
*
1000
:.
3
f
}
ms"
)
print
(
f
"Prefill time:
{
prefill_time
*
1000
:.
3
f
}
ms"
)
print
(
f
"Decode time:
{
decode_time
*
1000
:.
3
f
}
ms"
)
print
(
"-"
*
50
)
print
(
f
"Time per step:
{
avg_time_per_step
:.
3
f
}
ms"
)
print
(
f
"Avg prefill time per token:
{
avg_prefill_time_per_token
:.
3
f
}
ms"
)
print
(
f
"Avg decode time per token:
{
avg_decode_time_per_token
:.
3
f
}
ms"
)
print
(
"-"
*
50
)
print
(
f
"Overall throughput:
{
overall_throughput
:.
2
f
}
tokens/s"
)
print
(
f
"Prefill throughput:
{
prefill_throughput
:.
2
f
}
tokens/s"
)
print
(
f
"Decode throughput:
{
decode_throughput
:.
2
f
}
tokens/s"
)
print
(
"="
*
50
)
infer_task
.
_kv_cache
.
drop
(
self
)
infer_task
.
_kv_cache
.
drop
(
self
)
return
output_content
,
avg_time
return
output_content
,
avg_time
_per_step
def
perplexity
(
self
,
test_sequences
:
List
[
Sequence
[
int
]],
batch_size
=
10
):
def
perplexity
(
self
,
test_sequences
:
List
[
Sequence
[
int
]],
batch_size
=
10
):
tasks
=
[
tasks
=
[
...
@@ -706,11 +825,21 @@ class JiugeForCauslLM:
...
@@ -706,11 +825,21 @@ class JiugeForCauslLM:
def
test
():
def
test
():
if
len
(
sys
.
argv
)
<
3
:
if
len
(
sys
.
argv
)
<
3
:
print
(
print
(
"Usage: python jiuge.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] <path/to/model_dir> [n_device]"
"Usage: python jiuge.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] <path/to/model_dir> [n_device]
[--verbose]
"
)
)
sys
.
exit
(
1
)
sys
.
exit
(
1
)
# Parse command line arguments
model_path
=
sys
.
argv
[
2
]
model_path
=
sys
.
argv
[
2
]
device_type
=
DeviceType
.
DEVICE_TYPE_CPU
device_type
=
DeviceType
.
DEVICE_TYPE_CPU
verbose
=
False
# Check for verbose flag
for
arg
in
sys
.
argv
:
if
arg
==
"--verbose"
:
verbose
=
True
break
if
sys
.
argv
[
1
]
==
"--cpu"
:
if
sys
.
argv
[
1
]
==
"--cpu"
:
device_type
=
DeviceType
.
DEVICE_TYPE_CPU
device_type
=
DeviceType
.
DEVICE_TYPE_CPU
elif
sys
.
argv
[
1
]
==
"--nvidia"
:
elif
sys
.
argv
[
1
]
==
"--nvidia"
:
...
@@ -731,13 +860,16 @@ def test():
...
@@ -731,13 +860,16 @@ def test():
device_type
=
DeviceType
.
DEVICE_TYPE_HYGON
device_type
=
DeviceType
.
DEVICE_TYPE_HYGON
else
:
else
:
print
(
print
(
"Usage: python jiuge.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] <path/to/model_dir> [n_device]"
"Usage: python jiuge.py [--cpu | --nvidia| --cambricon | --ascend | --metax | --moore | --iluvatar | --kunlun | --hygon] <path/to/model_dir> [n_device]
[--verbose]
"
)
)
sys
.
exit
(
1
)
sys
.
exit
(
1
)
ndev
=
int
(
sys
.
argv
[
3
])
if
len
(
sys
.
argv
)
>
3
else
1
# Find n_device argument (skip --verbose)
ndev_args
=
[
arg
for
arg
in
sys
.
argv
[
3
:]
if
arg
!=
"--verbose"
]
ndev
=
int
(
ndev_args
[
0
])
if
ndev_args
else
1
model
=
JiugeForCauslLM
(
model_path
,
device_type
,
ndev
)
model
=
JiugeForCauslLM
(
model_path
,
device_type
,
ndev
)
model
.
generate
(
"山东最高的山是?"
,
500
)
model
.
generate
(
"山东最高的山是?"
,
500
,
verbose
=
verbose
)
model
.
destroy_model_instance
()
model
.
destroy_model_instance
()
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment