OpenDAS / AutoAWQ

Commit 86ea8df1, authored Sep 13, 2023 by Casper Hansen
Parent: 98f6d7b9

More consistent benchmark
Showing 1 changed file with 9 additions and 7 deletions: examples/benchmark.py (+9, -7)

--- a/examples/benchmark.py
+++ b/examples/benchmark.py
@@ -13,7 +13,7 @@ def warmup(model):
 def generate(model, input_ids, n_generate):
     context_time = 0
-    generate_time = 0
+    generate_time = []
 
     with torch.inference_mode():
         for i in range(n_generate):
@@ -35,7 +35,7 @@ def generate(model, input_ids, n_generate):
             if i == 0:
                 context_time += time.time() - start
             else:
-                generate_time += time.time() - start
+                generate_time.append(time.time() - start)
 
     return context_time, generate_time
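This timing change is the heart of the commit: instead of summing all decode time into one float, each token's latency is appended to a list so the reporting code can take a median. A median is robust to one-off stalls (first-step allocator growth, a background process) that would drag down a mean. A minimal self-contained sketch, with hypothetical latency numbers, shows the difference:

import numpy as np

# Hypothetical per-token decode latencies (seconds): 31 steady steps
# plus one 200 ms stall, e.g. allocator growth on the first token.
per_token_times = [0.200] + [0.020] * 31

mean_tps = 1 / np.mean(per_token_times)      # skewed by the single stall
median_tps = 1 / np.median(per_token_times)  # reflects steady-state speed

print(f"mean:   {mean_tps:.1f} tokens/s")    # ~39.0 tokens/s
print(f"median: {median_tps:.1f} tokens/s")  # 50.0 tokens/s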
@@ -65,8 +65,10 @@ def run_round(model_path, quant_file, n_generate, input_ids, batch_size):
     memory_pct = memory_used / (torch.cuda.get_device_properties(device).total_memory / (1024 ** 3)) * 100
 
     if successful_generate:
-        prefill_tokens_per_second = n_generate / context_time * batch_size
-        decode_tokens_per_second = n_generate / generate_time * batch_size
+        # number of tokens in context / time for processing context * batch size
+        prefill_tokens_per_second = input_ids.shape[1] / context_time * batch_size
+        # 1 second / median time per token in seconds * batch size
+        decode_tokens_per_second = 1 / np.median(generate_time) * batch_size
 
         print(f" ** Speed (Prefill): {prefill_tokens_per_second:.2f} tokens/second")
         print(f" ** Speed (Decode): {decode_tokens_per_second:.2f} tokens/second")
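The new formulas also fix what is being measured: prefill throughput now uses the actual context length (input_ids.shape[1]) rather than n_generate, and decode throughput is derived from the median per-token latency rather than from a single accumulated total. A worked example with made-up measurements (all values below are placeholders, not output from this commit):

import numpy as np

batch_size = 1
context_len = 64            # stands in for input_ids.shape[1]
context_time = 0.032        # seconds to process the context (prefill)
generate_time = [0.021, 0.020, 0.020, 0.022, 0.020]  # per-token decode times

# number of tokens in context / time for processing context * batch size
prefill_tokens_per_second = context_len / context_time * batch_size   # 2000.00

# 1 second / median time per token in seconds * batch size
decode_tokens_per_second = 1 / np.median(generate_time) * batch_size  # 50.00

print(f" ** Speed (Prefill): {prefill_tokens_per_second:.2f} tokens/second")
print(f" ** Speed (Decode): {decode_tokens_per_second:.2f} tokens/second")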
@@ -82,11 +84,10 @@ def run_round(model_path, quant_file, n_generate, input_ids, batch_size):
         "Prefill tokens/s": prefill_tokens_per_second,
         "Decode tokens/s": decode_tokens_per_second,
         "Memory (VRAM)": f"{memory_used:.2f} GB ({memory_pct:.2f}%)"
-    }
+    }, model.quant_config["version"]
 
 def main(args):
     rounds = [
-        {"context": 4, "n_generate": 200},
         {"context": 32, "n_generate": 32},
         {"context": 64, "n_generate": 64},
         {"context": 128, "n_generate": 128},
@@ -102,7 +103,7 @@ def main(args):
     for settings in rounds:
         input_ids = torch.randint(0, tokenizer.vocab_size, (args.batch_size, settings["context"])).cuda()
 
-        stats = run_round(
+        stats, model_version = run_round(
             args.model_path,
             args.quant_file,
             settings["n_generate"],
@@ -118,6 +119,7 @@ def main(args):
     df = pd.DataFrame(all_stats)
     print('GPU:', torch.cuda.get_device_name())
     print('Model:', args.model_path)
+    print('Version:', model_version)
     print(df.to_markdown(index=False))
 
 if __name__ == "__main__":
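run_round now also returns model.quant_config["version"], which main unpacks and prints next to the results table. A sketch of how the final report comes together; the row values and the "GEMM" version string are hypothetical placeholders, not output from the source:

import pandas as pd

# Hypothetical rows shaped like run_round's stats dict
all_stats = [
    {"Prefill tokens/s": 2000.00, "Decode tokens/s": 50.00,
     "Memory (VRAM)": "4.61 GB (19.23%)"},
]
model_version = "GEMM"  # placeholder for model.quant_config["version"]

df = pd.DataFrame(all_stats)
print('Version:', model_version)
print(df.to_markdown(index=False))  # to_markdown requires the tabulate package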