OpenDAS / text-generation-inference · Commit e6888d0e (unverified)
Authored Jul 04, 2023 by Nicolas Patry; committed by GitHub, Jul 04, 2023
docs(benchmarker): Adding some help for the options in `text-generation-benchmark`. (#462)
parent 8405581f
Showing 1 changed file with 51 additions and 2 deletions

benchmark/src/main.rs (+51, -2) @ e6888d0e — hunk shown below as it reads after the commit
@@ -14,36 +14,85 @@ use tracing_subscriber::EnvFilter;

#[derive(Parser, Debug)]
#[clap(author, version, about, long_about = None)]
struct Args {
    /// The name of the tokenizer (as in model_id on the huggingface hub, or local path).
    #[clap(short, long, env)]
    tokenizer_name: String,

    /// The revision to use for the tokenizer if on the hub.
    #[clap(default_value = "main", long, env)]
    revision: String,

    /// The various batch sizes to benchmark for. The idea is to use enough
    /// batching to start seeing increased latency; this usually means you are
    /// moving from memory bound (usual at BS=1) to compute bound, which is
    /// a sweet spot for the maximum batch size of the model under test.
    #[clap(short, long)]
    batch_size: Option<Vec<u32>>,

    /// The length, in tokens, of the initial prompt sent to the
    /// text-generation-server. Longer prompts slow down the benchmark; latency
    /// usually grows roughly linearly with this for the prefill step.
    ///
    /// Most importantly, the prefill step is usually not the one dominating
    /// your runtime, so it's ok to keep it short.
    #[clap(default_value = "10", short, long, env)]
    sequence_length: u32,

    /// How many tokens the server will generate, averaged out to give the
    /// `decode` latency. This is the *critical* number to optimize for, since
    /// LLMs spend most of their time decoding.
    ///
    /// Decode latency is usually quite stable.
    #[clap(default_value = "8", short, long, env)]
    decode_length: u32,

    /// How many runs to average over.
    #[clap(default_value = "10", short, long, env)]
    runs: usize,

    /// Number of warmup cycles.
    #[clap(default_value = "1", short, long, env)]
    warmups: usize,

    /// The location of the gRPC socket. This benchmark tool bypasses the router
    /// completely and talks directly to the gRPC processes.
    #[clap(default_value = "/tmp/text-generation-server-0", short, long, env)]
    master_shard_uds_path: String,

    /// Generation parameter, in case you want to specifically test/debug a
    /// particular decoding strategy; for full docs refer to `text-generation-server`.
    #[clap(long, env)]
    temperature: Option<f32>,

    /// Generation parameter, in case you want to specifically test/debug a
    /// particular decoding strategy; for full docs refer to `text-generation-server`.
    #[clap(long, env)]
    top_k: Option<u32>,

    /// Generation parameter, in case you want to specifically test/debug a
    /// particular decoding strategy; for full docs refer to `text-generation-server`.
    #[clap(long, env)]
    top_p: Option<f32>,

    /// Generation parameter, in case you want to specifically test/debug a
    /// particular decoding strategy; for full docs refer to `text-generation-server`.
    #[clap(long, env)]
    typical_p: Option<f32>,

    /// Generation parameter, in case you want to specifically test/debug a
    /// particular decoding strategy; for full docs refer to `text-generation-server`.
    #[clap(long, env)]
    repetition_penalty: Option<f32>,

    /// Generation parameter, in case you want to specifically test/debug a
    /// particular decoding strategy; for full docs refer to `text-generation-server`.
    #[clap(long, env)]
    watermark: bool,

    /// Generation parameter, in case you want to specifically test/debug a
    /// particular decoding strategy; for full docs refer to `text-generation-server`.
    #[clap(long, env)]
    do_sample: bool,
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
...
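As a quick illustration of how clap's derive macro handles the option shapes used above, here is a minimal, self-contained sketch. It is not part of this commit: `DemoArgs` and the simulated argument list are made up for illustration, and it assumes a clap 3.x/4.x derive setup. It shows a repeated --batch-size flag being collected into Option<Vec<u32>> and a default_value filling sequence_length when the flag is omitted.

use clap::Parser;

/// Hypothetical stand-in for the `Args` struct above, trimmed to two fields.
#[derive(Parser, Debug)]
struct DemoArgs {
    /// Each `--batch-size N` occurrence appends one value to the Vec.
    #[clap(short, long)]
    batch_size: Option<Vec<u32>>,

    /// Falls back to "10" when the flag is not given.
    #[clap(default_value = "10", short, long)]
    sequence_length: u32,
}

fn main() {
    // Simulate a command line instead of reading std::env::args().
    let args = DemoArgs::parse_from([
        "text-generation-benchmark",
        "--batch-size", "1",
        "--batch-size", "2",
        "--batch-size", "4",
    ]);
    assert_eq!(args.batch_size, Some(vec![1, 2, 4]));
    assert_eq!(args.sequence_length, 10);
    println!("{args:?}");
}

On a real invocation the fields map to kebab-case long flags in the usual clap fashion, e.g. something like `text-generation-benchmark --tokenizer-name <model-or-path> --batch-size 1 --batch-size 2 --sequence-length 10 --decode-length 8`, and the `env` attribute additionally lets each option be supplied through an environment variable.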