Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinilm
Commits
9e434492
Commit
9e434492
authored
Dec 18, 2025
by
PanZezhong
Browse files
issue/140 准确率脚本支持torch,添加总吞吐
parent
7862a723
Changes
4
Expand all
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
370 additions
and
126 deletions
+370
-126
csrc/engine/rank_worker.cpp
csrc/engine/rank_worker.cpp
+1
-0
csrc/models/llama/llama_for_causal_lm.cpp
csrc/models/llama/llama_for_causal_lm.cpp
+0
-7
python/infinilm/generation/utils.py
python/infinilm/generation/utils.py
+13
-4
test/bench/test_benchmark.py
test/bench/test_benchmark.py
+356
-115
No files found.
csrc/engine/rank_worker.cpp
View file @
9e434492
...
@@ -251,6 +251,7 @@ void RankWorker::thread_loop() {
...
@@ -251,6 +251,7 @@ void RankWorker::thread_loop() {
}
else
if
(
local_cmd
==
Command
::
RUN
)
{
}
else
if
(
local_cmd
==
Command
::
RUN
)
{
try
{
try
{
auto
out
=
model_
->
forward
(
local_args
);
auto
out
=
model_
->
forward
(
local_args
);
infinicore
::
context
::
syncStream
();
{
{
std
::
lock_guard
<
std
::
mutex
>
lk
(
mutex_
);
std
::
lock_guard
<
std
::
mutex
>
lk
(
mutex_
);
...
...
csrc/models/llama/llama_for_causal_lm.cpp
View file @
9e434492
...
@@ -34,13 +34,6 @@ infinicore::Tensor LlamaForCausalLM::forward(const infinicore::Tensor &input_ids
...
@@ -34,13 +34,6 @@ infinicore::Tensor LlamaForCausalLM::forward(const infinicore::Tensor &input_ids
// 2. Apply language modeling head to get logits
// 2. Apply language modeling head to get logits
auto
logits
=
lm_head_
->
forward
(
hidden_states
);
auto
logits
=
lm_head_
->
forward
(
hidden_states
);
// 3. CRITICAL: Synchronize the C++ backend's context after forward pass
// This ensures all C++ backend operations complete before returning to Python
if
(
device_
.
getType
()
!=
infinicore
::
Device
::
Type
::
CPU
)
{
infinicore
::
context
::
setDevice
(
device_
,
false
);
infinicore
::
context
::
syncStream
();
}
return
logits
;
return
logits
;
}
}
...
...
python/infinilm/generation/utils.py
View file @
9e434492
...
@@ -251,22 +251,31 @@ class GenerationMixin:
...
@@ -251,22 +251,31 @@ class GenerationMixin:
output_content
+=
output_str
output_content
+=
output_str
end_time
=
time
.
time
()
end_time
=
time
.
time
()
time_list
.
append
((
end_time
-
start_time
)
*
1000
)
time_list
.
append
((
end_time
-
start_time
))
print
(
output_str
,
end
=
""
,
flush
=
True
)
print
(
output_str
,
end
=
""
,
flush
=
True
)
if
stop_on_eos
and
token_id
in
eos_token_id_list
:
if
stop_on_eos
and
token_id
in
eos_token_id_list
:
break
break
print
(
"
\n
</s>"
)
print
(
"
\n
</s>"
)
print
(
f
"
\n\n\n
Generation completed in
{
round
(
sum
(
time_list
),
2
)
}
ms"
)
print
(
f
"
\n\n\n
Generation completed in
{
round
(
sum
(
time_list
)
*
1000
,
2
)
}
ms"
)
print
(
print
(
f
" Batchsize=
{
batch_size
}
Per_Batch_Input_Len=
{
seq_len
}
Per_Batch_New_Tokens=
{
len
(
time_list
)
}
\n
"
f
" Batchsize=
{
batch_size
}
Per_Batch_Input_Len=
{
seq_len
}
Per_Batch_New_Tokens=
{
len
(
time_list
)
}
\n
"
)
)
print
(
print
(
f
" Prefill TTFT:
{
round
(
time_list
[
0
],
2
)
}
ms Throughput:
{
round
((
1000
*
batch_size
*
seq_len
)
/
time_list
[
0
],
2
)
}
tok/s
\n
"
,
f
" Prefill TTFT:
{
round
(
time_list
[
0
],
2
)
}
ms Throughput:
{
round
((
batch_size
*
seq_len
)
/
time_list
[
0
],
2
)
}
tok/s
\n
"
,
)
)
if
len
(
time_list
)
>
1
:
if
len
(
time_list
)
>
1
:
print
(
print
(
f
" Decode Avg ITL:
{
round
(
sum
(
time_list
[
1
:])
/
(
len
(
time_list
)
-
1
),
2
)
}
ms Throughput:
{
round
((
1000
*
batch_size
*
(
len
(
time_list
)
-
1
))
/
sum
(
time_list
[
1
:]),
2
)
}
tok/s
\n
"
,
f
" Decode Avg ITL:
{
round
(
sum
(
time_list
[
1
:])
*
1000
/
(
len
(
time_list
)
-
1
),
2
)
}
ms Throughput:
{
round
((
batch_size
*
(
len
(
time_list
)
-
1
))
/
sum
(
time_list
[
1
:]),
2
)
}
tok/s
\n
"
,
)
)
return
{
"output_token_ids"
:
output_tokens_list
,
"output_content"
:
output_content
,
"total_latency"
:
sum
(
time_list
),
"prefill_latency"
:
time_list
[
0
],
"decode_latency"
:
sum
(
time_list
[
1
:]),
"total_input_tokens"
:
batch_size
*
seq_len
,
"total_output_tokens"
:
len
(
time_list
),
}
return
output_tokens_list
,
output_content
return
output_tokens_list
,
output_content
test/bench/test_benchmark.py
View file @
9e434492
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment