Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
text-generation-inference
Commits
951930fb
Unverified
Commit
951930fb
authored
May 25, 2023
by
OlivierDehaene
Committed by
GitHub
May 25, 2023
Browse files
feat(benchmarker): add summary tables (#368)
parent
218c9ada
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
325 additions
and
42 deletions
+325
-42
Cargo.lock
Cargo.lock
+42
-0
benchmark/Cargo.toml
benchmark/Cargo.toml
+1
-0
benchmark/src/app.rs
benchmark/src/app.rs
+31
-27
benchmark/src/generation.rs
benchmark/src/generation.rs
+7
-11
benchmark/src/lib.rs
benchmark/src/lib.rs
+44
-2
benchmark/src/main.rs
benchmark/src/main.rs
+30
-2
benchmark/src/table.rs
benchmark/src/table.rs
+170
-0
No files found.
Cargo.lock
View file @
951930fb
...
@@ -249,6 +249,12 @@ version = "3.12.0"
...
@@ -249,6 +249,12 @@ version = "3.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0d261e256854913907f67ed06efbc3338dfe6179796deefc1ff763fc1aee5535"
checksum = "0d261e256854913907f67ed06efbc3338dfe6179796deefc1ff763fc1aee5535"
[[package]]
name = "bytecount"
version = "0.6.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2c676a478f63e9fa2dd5368a42f28bba0d6c560b775f38583c8bbaa7fcd67c9c"
[[package]]
[[package]]
name = "byteorder"
name = "byteorder"
version = "1.4.3"
version = "1.4.3"
...
@@ -1706,6 +1712,17 @@ version = "0.1.1"
...
@@ -1706,6 +1712,17 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
[[package]]
name = "papergrid"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fdfe703c51ddc52887ad78fc69cd2ea78d895ffcd6e955c9d03566db8ab5bb1"
dependencies = [
"bytecount",
"fnv",
"unicode-width",
]
[[package]]
[[package]]
name = "parking_lot"
name = "parking_lot"
version = "0.12.1"
version = "0.12.1"
...
@@ -2490,6 +2507,30 @@ dependencies = [
...
@@ -2490,6 +2507,30 @@ dependencies = [
"winapi",
"winapi",
]
]
[[package]]
name = "tabled"
version = "0.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "da1a2e56bbf7bfdd08aaa7592157a742205459eff774b73bc01809ae2d99dc2a"
dependencies = [
"papergrid",
"tabled_derive",
"unicode-width",
]
[[package]]
name = "tabled_derive"
version = "0.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "99f688a08b54f4f02f0a3c382aefdb7884d3d69609f785bd253dc033243e3fe4"
dependencies = [
"heck",
"proc-macro-error",
"proc-macro2",
"quote",
"syn 1.0.109",
]
[[package]]
[[package]]
name = "tar"
name = "tar"
version = "0.4.38"
version = "0.4.38"
...
@@ -2525,6 +2566,7 @@ dependencies = [
...
@@ -2525,6 +2566,7 @@ dependencies = [
"ratatui",
"ratatui",
"serde",
"serde",
"serde_json",
"serde_json",
"tabled",
"text-generation-client",
"text-generation-client",
"thiserror",
"thiserror",
"tokenizers",
"tokenizers",
...
...
benchmark/Cargo.toml
View file @
951930fb
...
@@ -20,6 +20,7 @@ crossterm = "0.26"
...
@@ -20,6 +20,7 @@ crossterm = "0.26"
float-ord
=
"0.3.2"
float-ord
=
"0.3.2"
serde
=
{
version
=
"1.0.142"
,
features
=
["derive"]
}
serde
=
{
version
=
"1.0.142"
,
features
=
["derive"]
}
serde_json
=
"1.0"
serde_json
=
"1.0"
tabled
=
"0.12.0"
text-generation-client
=
{
path
=
"../router/client"
}
text-generation-client
=
{
path
=
"../router/client"
}
thiserror
=
"1.0.38"
thiserror
=
"1.0.38"
tokenizers
=
"0.13.3"
tokenizers
=
"0.13.3"
...
...
benchmark/src/app.rs
View file @
951930fb
...
@@ -15,6 +15,7 @@ use tui::{symbols, Frame};
...
@@ -15,6 +15,7 @@ use tui::{symbols, Frame};
/// TUI powered App
/// TUI powered App
pub
(
crate
)
struct
App
{
pub
(
crate
)
struct
App
{
pub
(
crate
)
running
:
bool
,
pub
(
crate
)
running
:
bool
,
pub
(
crate
)
data
:
Data
,
completed_runs
:
Vec
<
usize
>
,
completed_runs
:
Vec
<
usize
>
,
completed_batch
:
usize
,
completed_batch
:
usize
,
current_batch
:
usize
,
current_batch
:
usize
,
...
@@ -22,12 +23,10 @@ pub(crate) struct App {
...
@@ -22,12 +23,10 @@ pub(crate) struct App {
touched_tab
:
bool
,
touched_tab
:
bool
,
zoom
:
bool
,
zoom
:
bool
,
is_error
:
bool
,
is_error
:
bool
,
data
:
Data
,
tokenizer_name
:
String
,
tokenizer_name
:
String
,
sequence_length
:
u32
,
sequence_length
:
u32
,
decode_length
:
u32
,
decode_length
:
u32
,
n_run
:
usize
,
n_run
:
usize
,
batch_size
:
Vec
<
u32
>
,
receiver
:
mpsc
::
Receiver
<
Result
<
Message
,
ClientError
>>
,
receiver
:
mpsc
::
Receiver
<
Result
<
Message
,
ClientError
>>
,
}
}
...
@@ -40,7 +39,6 @@ impl App {
...
@@ -40,7 +39,6 @@ impl App {
n_run
:
usize
,
n_run
:
usize
,
batch_size
:
Vec
<
u32
>
,
batch_size
:
Vec
<
u32
>
,
)
->
Self
{
)
->
Self
{
let
data
=
Data
::
new
(
n_run
,
batch_size
.len
());
let
current_tab
=
0
;
let
current_tab
=
0
;
let
completed_runs
:
Vec
<
usize
>
=
(
0
..
batch_size
.len
())
.map
(|
_
|
0
)
.collect
();
let
completed_runs
:
Vec
<
usize
>
=
(
0
..
batch_size
.len
())
.map
(|
_
|
0
)
.collect
();
...
@@ -48,8 +46,11 @@ impl App {
...
@@ -48,8 +46,11 @@ impl App {
let
current_batch
=
0
;
let
current_batch
=
0
;
let
is_error
=
false
;
let
is_error
=
false
;
let
data
=
Data
::
new
(
n_run
,
batch_size
);
Self
{
Self
{
running
:
true
,
running
:
true
,
data
,
completed_runs
,
completed_runs
,
completed_batch
,
completed_batch
,
current_batch
,
current_batch
,
...
@@ -57,12 +58,10 @@ impl App {
...
@@ -57,12 +58,10 @@ impl App {
touched_tab
:
false
,
touched_tab
:
false
,
zoom
:
false
,
zoom
:
false
,
is_error
,
is_error
,
data
,
tokenizer_name
,
tokenizer_name
,
sequence_length
,
sequence_length
,
decode_length
,
decode_length
,
n_run
,
n_run
,
batch_size
,
receiver
,
receiver
,
}
}
}
}
...
@@ -79,7 +78,7 @@ impl App {
...
@@ -79,7 +78,7 @@ impl App {
code
:
KeyCode
::
Tab
,
..
code
:
KeyCode
::
Tab
,
..
}
=>
{
}
=>
{
self
.touched_tab
=
true
;
self
.touched_tab
=
true
;
self
.current_tab
=
(
self
.current_tab
+
1
)
%
self
.batch_size
.len
();
self
.current_tab
=
(
self
.current_tab
+
1
)
%
self
.
data.
batch_size
.len
();
}
}
// Decrease and wrap tab
// Decrease and wrap tab
KeyEvent
{
KeyEvent
{
...
@@ -90,7 +89,7 @@ impl App {
...
@@ -90,7 +89,7 @@ impl App {
if
self
.current_tab
>
0
{
if
self
.current_tab
>
0
{
self
.current_tab
-=
1
;
self
.current_tab
-=
1
;
}
else
{
}
else
{
self
.current_tab
=
self
.batch_size
.len
()
-
1
;
self
.current_tab
=
self
.
data.
batch_size
.len
()
-
1
;
}
}
}
}
// Zoom on throughput/latency fig
// Zoom on throughput/latency fig
...
@@ -137,7 +136,7 @@ impl App {
...
@@ -137,7 +136,7 @@ impl App {
self
.data
.end_batch
(
self
.current_batch
);
self
.data
.end_batch
(
self
.current_batch
);
self
.completed_batch
+=
1
;
self
.completed_batch
+=
1
;
if
self
.current_batch
<
self
.batch_size
.len
()
-
1
{
if
self
.current_batch
<
self
.
data.
batch_size
.len
()
-
1
{
// Only go to next tab if the user never touched the tab keys
// Only go to next tab if the user never touched the tab keys
if
!
self
.touched_tab
{
if
!
self
.touched_tab
{
self
.current_tab
+=
1
;
self
.current_tab
+=
1
;
...
@@ -156,7 +155,7 @@ impl App {
...
@@ -156,7 +155,7 @@ impl App {
/// Render frame
/// Render frame
pub
fn
render
<
B
:
Backend
>
(
&
mut
self
,
f
:
&
mut
Frame
<
'_
,
B
>
)
{
pub
fn
render
<
B
:
Backend
>
(
&
mut
self
,
f
:
&
mut
Frame
<
'_
,
B
>
)
{
let
batch_progress
=
let
batch_progress
=
(
self
.completed_batch
as
f64
/
self
.batch_size
.len
()
as
f64
)
.clamp
(
0.0
,
1.0
);
(
self
.completed_batch
as
f64
/
self
.
data.
batch_size
.len
()
as
f64
)
.clamp
(
0.0
,
1.0
);
let
run_progress
=
let
run_progress
=
(
self
.completed_runs
[
self
.current_batch
]
as
f64
/
self
.n_run
as
f64
)
.clamp
(
0.0
,
1.0
);
(
self
.completed_runs
[
self
.current_batch
]
as
f64
/
self
.n_run
as
f64
)
.clamp
(
0.0
,
1.0
);
...
@@ -241,6 +240,7 @@ impl App {
...
@@ -241,6 +240,7 @@ impl App {
// Batch tabs
// Batch tabs
let
titles
=
self
let
titles
=
self
.data
.batch_size
.batch_size
.iter
()
.iter
()
.map
(|
b
|
{
.map
(|
b
|
{
...
@@ -269,7 +269,7 @@ impl App {
...
@@ -269,7 +269,7 @@ impl App {
};
};
let
batch_gauge
=
progress_gauge
(
let
batch_gauge
=
progress_gauge
(
"Total Progress"
,
"Total Progress"
,
format!
(
"{} / {}"
,
self
.completed_batch
,
self
.batch_size
.len
()),
format!
(
"{} / {}"
,
self
.completed_batch
,
self
.
data.
batch_size
.len
()),
batch_progress
,
batch_progress
,
color
,
color
,
);
);
...
@@ -347,7 +347,7 @@ impl App {
...
@@ -347,7 +347,7 @@ impl App {
// Prefill latency/throughput chart
// Prefill latency/throughput chart
let
prefill_latency_throughput_chart
=
latency_throughput_chart
(
let
prefill_latency_throughput_chart
=
latency_throughput_chart
(
&
self
.data.prefill_batch_latency_throughput
,
&
self
.data.prefill_batch_latency_throughput
,
&
self
.batch_size
,
&
self
.
data.
batch_size
,
self
.zoom
,
self
.zoom
,
"Prefill"
,
"Prefill"
,
);
);
...
@@ -356,7 +356,7 @@ impl App {
...
@@ -356,7 +356,7 @@ impl App {
// Decode latency/throughput chart
// Decode latency/throughput chart
let
decode_latency_throughput_chart
=
latency_throughput_chart
(
let
decode_latency_throughput_chart
=
latency_throughput_chart
(
&
self
.data.decode_batch_latency_throughput
,
&
self
.data.decode_batch_latency_throughput
,
&
self
.batch_size
,
&
self
.
data.
batch_size
,
self
.zoom
,
self
.zoom
,
"Decode"
,
"Decode"
,
);
);
...
@@ -365,31 +365,35 @@ impl App {
...
@@ -365,31 +365,35 @@ impl App {
}
}
/// App internal data struct
/// App internal data struct
struct
Data
{
pub
(
crate
)
struct
Data
{
prefill_latencies
:
Vec
<
Vec
<
f64
>>
,
pub
(
crate
)
batch_size
:
Vec
<
u32
>
,
prefill_throughputs
:
Vec
<
Vec
<
f64
>>
,
pub
(
crate
)
prefill_latencies
:
Vec
<
Vec
<
f64
>>
,
decode_latencies
:
Vec
<
Vec
<
f64
>>
,
pub
(
crate
)
prefill_throughputs
:
Vec
<
Vec
<
f64
>>
,
decode_token_latencies
:
Vec
<
Vec
<
f64
>>
,
pub
(
crate
)
decode_latencies
:
Vec
<
Vec
<
f64
>>
,
decode_throughputs
:
Vec
<
Vec
<
f64
>>
,
pub
(
crate
)
decode_token_latencies
:
Vec
<
Vec
<
f64
>>
,
prefill_batch_latency_throughput
:
Vec
<
(
f64
,
f64
)
>
,
pub
(
crate
)
decode_throughputs
:
Vec
<
Vec
<
f64
>>
,
decode_batch_latency_throughput
:
Vec
<
(
f64
,
f64
)
>
,
pub
(
crate
)
prefill_batch_latency_throughput
:
Vec
<
(
f64
,
f64
)
>
,
pub
(
crate
)
decode_batch_latency_throughput
:
Vec
<
(
f64
,
f64
)
>
,
}
}
impl
Data
{
impl
Data
{
fn
new
(
n_run
:
usize
,
n_batch
:
usize
)
->
Self
{
fn
new
(
n_run
:
usize
,
batch_size
:
Vec
<
u32
>
)
->
Self
{
let
prefill_latencies
:
Vec
<
Vec
<
f64
>>
=
let
prefill_latencies
:
Vec
<
Vec
<
f64
>>
=
(
0
..
batch_size
.len
())
(
0
..
n_batch
)
.map
(|
_
|
Vec
::
with_capacity
(
n_run
))
.collect
();
.map
(|
_
|
Vec
::
with_capacity
(
n_run
))
.collect
();
let
prefill_throughputs
:
Vec
<
Vec
<
f64
>>
=
prefill_latencies
.clone
();
let
prefill_throughputs
:
Vec
<
Vec
<
f64
>>
=
prefill_latencies
.clone
();
let
decode_latencies
:
Vec
<
Vec
<
f64
>>
=
prefill_latencies
.clone
();
let
decode_latencies
:
Vec
<
Vec
<
f64
>>
=
prefill_latencies
.clone
();
let
decode_token_latencies
:
Vec
<
Vec
<
f64
>>
=
decode_latencies
.clone
();
let
decode_token_latencies
:
Vec
<
Vec
<
f64
>>
=
decode_latencies
.clone
();
let
decode_throughputs
:
Vec
<
Vec
<
f64
>>
=
prefill_throughputs
.clone
();
let
decode_throughputs
:
Vec
<
Vec
<
f64
>>
=
prefill_throughputs
.clone
();
let
prefill_batch_latency_throughput
:
Vec
<
(
f64
,
f64
)
>
=
Vec
::
with_capacity
(
n_batch
);
let
prefill_batch_latency_throughput
:
Vec
<
(
f64
,
f64
)
>
=
Vec
::
with_capacity
(
batch_size
.len
());
let
decode_batch_latency_throughput
:
Vec
<
(
f64
,
f64
)
>
=
let
decode_batch_latency_throughput
:
Vec
<
(
f64
,
f64
)
>
=
prefill_batch_latency_throughput
.clone
();
prefill_batch_latency_throughput
.clone
();
Self
{
Self
{
batch_size
,
prefill_latencies
,
prefill_latencies
,
prefill_throughputs
,
prefill_throughputs
,
decode_latencies
,
decode_latencies
,
...
@@ -401,14 +405,14 @@ impl Data {
...
@@ -401,14 +405,14 @@ impl Data {
}
}
fn
push_prefill
(
&
mut
self
,
prefill
:
Prefill
,
batch_idx
:
usize
)
{
fn
push_prefill
(
&
mut
self
,
prefill
:
Prefill
,
batch_idx
:
usize
)
{
let
latency
=
prefill
.latency
.as_mi
lli
s
()
as
f64
;
let
latency
=
prefill
.latency
.as_mi
cro
s
()
as
f64
/
1000.0
;
self
.prefill_latencies
[
batch_idx
]
.push
(
latency
);
self
.prefill_latencies
[
batch_idx
]
.push
(
latency
);
self
.prefill_throughputs
[
batch_idx
]
.push
(
prefill
.throughput
);
self
.prefill_throughputs
[
batch_idx
]
.push
(
prefill
.throughput
);
}
}
fn
push_decode
(
&
mut
self
,
decode
:
Decode
,
batch_idx
:
usize
)
{
fn
push_decode
(
&
mut
self
,
decode
:
Decode
,
batch_idx
:
usize
)
{
let
latency
=
decode
.latency
.as_mi
lli
s
()
as
f64
;
let
latency
=
decode
.latency
.as_mi
cro
s
()
as
f64
/
1000.0
;
let
token_latency
=
decode
.token_latency
.as_mi
lli
s
()
as
f64
;
let
token_latency
=
decode
.token_latency
.as_mi
cro
s
()
as
f64
/
1000.0
;
self
.decode_latencies
[
batch_idx
]
.push
(
latency
);
self
.decode_latencies
[
batch_idx
]
.push
(
latency
);
self
.decode_token_latencies
[
batch_idx
]
.push
(
token_latency
);
self
.decode_token_latencies
[
batch_idx
]
.push
(
token_latency
);
self
.decode_throughputs
[
batch_idx
]
.push
(
decode
.throughput
);
self
.decode_throughputs
[
batch_idx
]
.push
(
decode
.throughput
);
...
...
benchmark/src/generation.rs
View file @
951930fb
...
@@ -39,6 +39,7 @@ pub(crate) async fn generation_task(
...
@@ -39,6 +39,7 @@ pub(crate) async fn generation_task(
decode_length
:
u32
,
decode_length
:
u32
,
n_runs
:
usize
,
n_runs
:
usize
,
warmups
:
usize
,
warmups
:
usize
,
parameters
:
NextTokenChooserParameters
,
client
:
ShardedClient
,
client
:
ShardedClient
,
run_sender
:
mpsc
::
Sender
<
Result
<
Message
,
ClientError
>>
,
run_sender
:
mpsc
::
Sender
<
Result
<
Message
,
ClientError
>>
,
mut
shutdown_receiver
:
broadcast
::
Receiver
<
()
>
,
mut
shutdown_receiver
:
broadcast
::
Receiver
<
()
>
,
...
@@ -47,7 +48,7 @@ pub(crate) async fn generation_task(
...
@@ -47,7 +48,7 @@ pub(crate) async fn generation_task(
// End task if a message is received on shutdown_receiver
// End task if a message is received on shutdown_receiver
// _shutdown_guard_sender will be dropped once the task is finished
// _shutdown_guard_sender will be dropped once the task is finished
tokio
::
select!
{
tokio
::
select!
{
res
=
generate_runs
(
tokenizer
,
batch_size
,
sequence_length
,
decode_length
,
n_runs
,
warmups
,
client
,
run_sender
.clone
())
=>
{
res
=
generate_runs
(
tokenizer
,
batch_size
,
sequence_length
,
decode_length
,
n_runs
,
warmups
,
parameters
,
client
,
run_sender
.clone
())
=>
{
if
let
Err
(
err
)
=
res
{
if
let
Err
(
err
)
=
res
{
run_sender
.send
(
Err
(
err
))
.await
.unwrap_or
(());
run_sender
.send
(
Err
(
err
))
.await
.unwrap_or
(());
}
}
...
@@ -65,6 +66,7 @@ async fn generate_runs(
...
@@ -65,6 +66,7 @@ async fn generate_runs(
decode_length
:
u32
,
decode_length
:
u32
,
n_runs
:
usize
,
n_runs
:
usize
,
warmups
:
usize
,
warmups
:
usize
,
parameters
:
NextTokenChooserParameters
,
mut
client
:
ShardedClient
,
mut
client
:
ShardedClient
,
run_sender
:
mpsc
::
Sender
<
Result
<
Message
,
ClientError
>>
,
run_sender
:
mpsc
::
Sender
<
Result
<
Message
,
ClientError
>>
,
)
->
Result
<
(),
ClientError
>
{
)
->
Result
<
(),
ClientError
>
{
...
@@ -79,6 +81,7 @@ async fn generate_runs(
...
@@ -79,6 +81,7 @@ async fn generate_runs(
sequence_length
,
sequence_length
,
b
,
b
,
decode_length
,
decode_length
,
parameters
.clone
(),
&
mut
client
,
&
mut
client
,
)
)
.await
?
;
.await
?
;
...
@@ -93,6 +96,7 @@ async fn generate_runs(
...
@@ -93,6 +96,7 @@ async fn generate_runs(
sequence_length
,
sequence_length
,
b
,
b
,
decode_length
,
decode_length
,
parameters
.clone
(),
&
mut
client
,
&
mut
client
,
)
)
.await
?
;
.await
?
;
...
@@ -125,6 +129,7 @@ async fn prefill(
...
@@ -125,6 +129,7 @@ async fn prefill(
sequence_length
:
u32
,
sequence_length
:
u32
,
batch_size
:
u32
,
batch_size
:
u32
,
decode_length
:
u32
,
decode_length
:
u32
,
parameters
:
NextTokenChooserParameters
,
client
:
&
mut
ShardedClient
,
client
:
&
mut
ShardedClient
,
)
->
Result
<
(
Prefill
,
CachedBatch
),
ClientError
>
{
)
->
Result
<
(
Prefill
,
CachedBatch
),
ClientError
>
{
// Create requests
// Create requests
...
@@ -133,16 +138,7 @@ async fn prefill(
...
@@ -133,16 +138,7 @@ async fn prefill(
id
:
id
.into
(),
id
:
id
.into
(),
inputs
:
sequence
.clone
(),
inputs
:
sequence
.clone
(),
truncate
:
sequence_length
,
truncate
:
sequence_length
,
parameters
:
Some
(
NextTokenChooserParameters
{
parameters
:
Some
(
parameters
.clone
()),
temperature
:
1.0
,
top_k
:
0
,
top_p
:
1.0
,
typical_p
:
1.0
,
do_sample
:
false
,
seed
:
0
,
repetition_penalty
:
1.0
,
watermark
:
false
,
}),
stopping_parameters
:
Some
(
StoppingCriteriaParameters
{
stopping_parameters
:
Some
(
StoppingCriteriaParameters
{
max_new_tokens
:
decode_length
,
max_new_tokens
:
decode_length
,
stop_sequences
:
vec!
[],
stop_sequences
:
vec!
[],
...
...
benchmark/src/lib.rs
View file @
951930fb
mod
app
;
mod
app
;
mod
event
;
mod
event
;
mod
generation
;
mod
generation
;
mod
table
;
mod
utils
;
mod
utils
;
use
crate
::
app
::
App
;
use
crate
::
app
::
App
;
use
crate
::
event
::
Event
;
use
crate
::
event
::
Event
;
use
crossterm
::
ExecutableCommand
;
use
crossterm
::
ExecutableCommand
;
use
std
::
io
;
use
std
::
io
;
use
text_generation_client
::
ShardedClient
;
use
text_generation_client
::
{
NextTokenChooserParameters
,
ShardedClient
}
;
use
tokenizers
::
Tokenizer
;
use
tokenizers
::
Tokenizer
;
use
tokio
::
sync
::{
broadcast
,
mpsc
};
use
tokio
::
sync
::{
broadcast
,
mpsc
};
use
tui
::
backend
::
CrosstermBackend
;
use
tui
::
backend
::
CrosstermBackend
;
...
@@ -23,8 +24,26 @@ pub async fn run(
...
@@ -23,8 +24,26 @@ pub async fn run(
decode_length
:
u32
,
decode_length
:
u32
,
n_runs
:
usize
,
n_runs
:
usize
,
warmups
:
usize
,
warmups
:
usize
,
temperature
:
Option
<
f32
>
,
top_k
:
Option
<
u32
>
,
top_p
:
Option
<
f32
>
,
typical_p
:
Option
<
f32
>
,
repetition_penalty
:
Option
<
f32
>
,
watermark
:
bool
,
do_sample
:
bool
,
client
:
ShardedClient
,
client
:
ShardedClient
,
)
->
Result
<
(),
crossterm
::
ErrorKind
>
{
)
->
Result
<
(),
crossterm
::
ErrorKind
>
{
let
parameters
=
NextTokenChooserParameters
{
temperature
:
temperature
.unwrap_or
(
1.0
),
top_k
:
top_k
.unwrap_or
(
0
),
top_p
:
top_p
.unwrap_or
(
1.0
),
typical_p
:
typical_p
.unwrap_or
(
1.0
),
do_sample
,
seed
:
0
,
repetition_penalty
:
repetition_penalty
.unwrap_or
(
1.0
),
watermark
,
};
// Initialize terminal properties
// Initialize terminal properties
crossterm
::
terminal
::
enable_raw_mode
()
?
;
crossterm
::
terminal
::
enable_raw_mode
()
?
;
io
::
stdout
()
.execute
(
crossterm
::
terminal
::
EnterAlternateScreen
)
?
;
io
::
stdout
()
.execute
(
crossterm
::
terminal
::
EnterAlternateScreen
)
?
;
...
@@ -53,6 +72,7 @@ pub async fn run(
...
@@ -53,6 +72,7 @@ pub async fn run(
decode_length
,
decode_length
,
n_runs
,
n_runs
,
warmups
,
warmups
,
parameters
,
client
,
client
,
run_sender
,
run_sender
,
shutdown_sender
.subscribe
(),
shutdown_sender
.subscribe
(),
...
@@ -73,7 +93,7 @@ pub async fn run(
...
@@ -73,7 +93,7 @@ pub async fn run(
// Create App
// Create App
let
mut
app
=
App
::
new
(
let
mut
app
=
App
::
new
(
run_receiver
,
run_receiver
,
tokenizer_name
,
tokenizer_name
.clone
()
,
sequence_length
,
sequence_length
,
decode_length
,
decode_length
,
n_runs
,
n_runs
,
...
@@ -106,5 +126,27 @@ pub async fn run(
...
@@ -106,5 +126,27 @@ pub async fn run(
crossterm
::
terminal
::
disable_raw_mode
()
?
;
crossterm
::
terminal
::
disable_raw_mode
()
?
;
io
::
stdout
()
.execute
(
crossterm
::
cursor
::
Show
)
?
;
io
::
stdout
()
.execute
(
crossterm
::
cursor
::
Show
)
?
;
let
parameters_table
=
table
::
parameters_table
(
tokenizer_name
,
sequence_length
,
decode_length
,
n_runs
,
warmups
,
temperature
,
top_k
,
top_p
,
typical_p
,
repetition_penalty
,
watermark
,
do_sample
,
);
println!
(
"
\n
{parameters_table}
\n
"
);
let
latency_table
=
table
::
latency_table
(
&
app
.data
);
println!
(
"
\n
{latency_table}
\n
"
);
let
throughput_table
=
table
::
throughput_table
(
&
app
.data
);
println!
(
"
\n
{throughput_table}
\n
"
);
Ok
(())
Ok
(())
}
}
benchmark/src/main.rs
View file @
951930fb
...
@@ -28,11 +28,27 @@ struct Args {
...
@@ -28,11 +28,27 @@ struct Args {
runs
:
usize
,
runs
:
usize
,
#[clap(default_value
=
"1"
,
short,
long,
env)]
#[clap(default_value
=
"1"
,
short,
long,
env)]
warmups
:
usize
,
warmups
:
usize
,
#[clap(long,
env)]
temperature
:
Option
<
f32
>
,
#[clap(long,
env)]
top_k
:
Option
<
u32
>
,
#[clap(long,
env)]
top_p
:
Option
<
f32
>
,
#[clap(long,
env)]
typical_p
:
Option
<
f32
>
,
#[clap(long,
env)]
repetition_penalty
:
Option
<
f32
>
,
#[clap(long,
env)]
watermark
:
bool
,
#[clap(long,
env)]
do_sample
:
bool
,
#[clap(default_value
=
"/tmp/text-generation-server-0"
,
short,
long,
env)]
#[clap(default_value
=
"/tmp/text-generation-server-0"
,
short,
long,
env)]
master_shard_uds_path
:
String
,
master_shard_uds_path
:
String
,
}
}
fn
main
()
->
Result
<
(),
Box
<
dyn
std
::
error
::
Error
>>
{
fn
main
()
->
Result
<
(),
Box
<
dyn
std
::
error
::
Error
>>
{
init_logging
();
// Get args
// Get args
let
args
=
Args
::
parse
();
let
args
=
Args
::
parse
();
// Pattern match configuration
// Pattern match configuration
...
@@ -44,13 +60,18 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
...
@@ -44,13 +60,18 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
decode_length
,
decode_length
,
runs
,
runs
,
warmups
,
warmups
,
temperature
,
top_k
,
top_p
,
typical_p
,
repetition_penalty
,
watermark
,
do_sample
,
master_shard_uds_path
,
master_shard_uds_path
,
}
=
args
;
}
=
args
;
let
batch_size
=
batch_size
.unwrap_or
(
vec!
[
1
,
2
,
4
,
8
,
16
,
32
]);
let
batch_size
=
batch_size
.unwrap_or
(
vec!
[
1
,
2
,
4
,
8
,
16
,
32
]);
init_logging
();
// Tokenizer instance
// Tokenizer instance
// This will only be used to validate payloads
// This will only be used to validate payloads
tracing
::
info!
(
"Loading tokenizer"
);
tracing
::
info!
(
"Loading tokenizer"
);
...
@@ -105,6 +126,13 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
...
@@ -105,6 +126,13 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
decode_length
,
decode_length
,
runs
,
runs
,
warmups
,
warmups
,
temperature
,
top_k
,
top_p
,
typical_p
,
repetition_penalty
,
watermark
,
do_sample
,
sharded_client
,
sharded_client
,
)
)
.await
.await
...
...
benchmark/src/table.rs
0 → 100644
View file @
951930fb
use
crate
::
app
::
Data
;
use
tabled
::
settings
::
Merge
;
use
tabled
::{
builder
::
Builder
,
settings
::
Style
,
Table
};
#[allow(clippy::too_many_arguments)]
pub
(
crate
)
fn
parameters_table
(
tokenizer_name
:
String
,
sequence_length
:
u32
,
decode_length
:
u32
,
n_runs
:
usize
,
warmups
:
usize
,
temperature
:
Option
<
f32
>
,
top_k
:
Option
<
u32
>
,
top_p
:
Option
<
f32
>
,
typical_p
:
Option
<
f32
>
,
repetition_penalty
:
Option
<
f32
>
,
watermark
:
bool
,
do_sample
:
bool
,
)
->
Table
{
let
mut
builder
=
Builder
::
default
();
builder
.set_header
([
"Parameter"
,
"Value"
]);
builder
.push_record
([
"Model"
,
&
tokenizer_name
]);
builder
.push_record
([
"Sequence Length"
,
&
sequence_length
.to_string
()]);
builder
.push_record
([
"Decode Length"
,
&
decode_length
.to_string
()]);
builder
.push_record
([
"N Runs"
,
&
n_runs
.to_string
()]);
builder
.push_record
([
"Warmups"
,
&
warmups
.to_string
()]);
builder
.push_record
([
"Temperature"
,
&
format!
(
"{temperature:?}"
)]);
builder
.push_record
([
"Top K"
,
&
format!
(
"{top_k:?}"
)]);
builder
.push_record
([
"Top P"
,
&
format!
(
"{top_p:?}"
)]);
builder
.push_record
([
"Typical P"
,
&
format!
(
"{typical_p:?}"
)]);
builder
.push_record
([
"Repetition Penalty"
,
&
format!
(
"{repetition_penalty:?}"
)]);
builder
.push_record
([
"Watermark"
,
&
watermark
.to_string
()]);
builder
.push_record
([
"Do Sample"
,
&
do_sample
.to_string
()]);
let
mut
table
=
builder
.build
();
table
.with
(
Style
::
markdown
());
table
}
/// Builds the latency summary table (avg / min / max / p50 / p90 / p99 in ms)
/// with one section per step — prefill, per-token decode, total decode —
/// and one row per benchmarked batch size. Identical step labels in adjacent
/// rows are merged vertically for readability.
pub(crate) fn latency_table(data: &Data) -> Table {
    let mut builder = Builder::default();
    builder.set_header([
        "Step",
        "Batch Size",
        "Average",
        "Lowest",
        "Highest",
        "p50",
        "p90",
        "p99",
    ]);

    // Same row-emitting helper for each latency series; only the label differs.
    let sections: [(&'static str, &Vec<Vec<f64>>); 3] = [
        ("Prefill", &data.prefill_latencies),
        ("Decode (token)", &data.decode_token_latencies),
        ("Decode (total)", &data.decode_latencies),
    ];
    for (step, latencies) in sections {
        add_latencies(&mut builder, step, &data.batch_size, latencies);
    }

    let mut table = builder.build();
    table.with(Style::markdown()).with(Merge::vertical());
    table
}
/// Builds the throughput summary table (avg / min / max in tokens/secs)
/// with one section for prefill and one for decode, one row per batch size.
/// Repeated step labels are merged vertically, mirroring `latency_table`.
pub(crate) fn throughput_table(data: &Data) -> Table {
    let mut builder = Builder::default();
    builder.set_header(["Step", "Batch Size", "Average", "Lowest", "Highest"]);

    // Emit both throughput series through the shared helper.
    let sections: [(&'static str, &Vec<Vec<f64>>); 2] = [
        ("Prefill", &data.prefill_throughputs),
        ("Decode", &data.decode_throughputs),
    ];
    for (step, throughputs) in sections {
        add_throuhgputs(&mut builder, step, &data.batch_size, throughputs);
    }

    let mut table = builder.build();
    table.with(Style::markdown()).with(Merge::vertical());
    table
}
/// Appends one latency row per batch size to `builder`.
///
/// `batch_latencies[i]` holds the raw latency samples (ms) collected for
/// `batch_size[i]`; each row renders the step label, the batch size, and the
/// average / lowest / highest / p50 / p90 / p99 values formatted in ms.
fn add_latencies(
    builder: &mut Builder,
    step: &'static str,
    batch_size: &[u32],
    batch_latencies: &[Vec<f64>],
) {
    // Walk both slices in lock-step; they are built with matching lengths.
    for (bs, latencies) in batch_size.iter().zip(batch_latencies.iter()) {
        let (avg, min, max) = avg_min_max(latencies);
        builder.push_record([
            step,
            &bs.to_string(),
            &format_value(avg, "ms"),
            &format_value(min, "ms"),
            &format_value(max, "ms"),
            &format_value(px(latencies, 50), "ms"),
            &format_value(px(latencies, 90), "ms"),
            &format_value(px(latencies, 99), "ms"),
        ]);
    }
}
/// Appends one throughput row per batch size to `builder`.
///
/// `batch_throughputs[i]` holds the throughput samples (tokens/secs) for
/// `batch_size[i]`; each row renders step label, batch size, and the
/// average / lowest / highest values. (The function name carries the
/// original "throuhgputs" spelling so existing call sites stay valid.)
fn add_throuhgputs(
    builder: &mut Builder,
    step: &'static str,
    batch_size: &[u32],
    batch_throughputs: &[Vec<f64>],
) {
    // Walk both slices in lock-step; they are built with matching lengths.
    for (bs, throughputs) in batch_size.iter().zip(batch_throughputs.iter()) {
        let (avg, min, max) = avg_min_max(throughputs);
        builder.push_record([
            step,
            &bs.to_string(),
            &format_value(avg, "tokens/secs"),
            &format_value(min, "tokens/secs"),
            &format_value(max, "tokens/secs"),
        ]);
    }
}
/// Returns `(average, minimum, maximum)` of `data`.
///
/// Comparison uses `f64::total_cmp`, so NaN samples cannot panic the sort
/// order. An empty slice yields `(NaN, NaN, NaN)`: the average becomes
/// `0.0 / 0.0` and min/max fall back to `f64::NAN`.
//
// Takes `&[f64]` rather than `&Vec<f64>` (clippy::ptr_arg); existing callers
// passing `&Vec<f64>` still work via deref coercion.
fn avg_min_max(data: &[f64]) -> (f64, f64, f64) {
    let average = data.iter().sum::<f64>() / data.len() as f64;
    let min = data
        .iter()
        .min_by(|a, b| a.total_cmp(b))
        .copied()
        .unwrap_or(f64::NAN);
    let max = data
        .iter()
        .max_by(|a, b| a.total_cmp(b))
        .copied()
        .unwrap_or(f64::NAN);
    (average, min, max)
}
/// Returns the `p`-th percentile of `data` using the nearest-rank method.
///
/// The input does not need to be pre-sorted: a local copy is sorted first.
/// The original implementation indexed directly into the unsorted sample
/// vector, so the reported p50/p90/p99 were arbitrary arrival-order values
/// rather than percentiles. Out-of-range ranks (including an empty slice,
/// or `p >= 100`) yield `f64::NAN`.
fn px(data: &[f64], p: u32) -> f64 {
    // Percentiles are only meaningful on ordered values; the callers collect
    // samples in arrival order, so sort a copy here.
    let mut sorted = data.to_vec();
    sorted.sort_by(|a, b| a.total_cmp(b));
    let i = (f64::from(p) / 100.0 * sorted.len() as f64) as usize;
    sorted.get(i).copied().unwrap_or(f64::NAN)
}
/// Renders a metric value with two decimal places followed by its unit,
/// e.g. `format_value(2.5, "ms")` -> `"2.50 ms"`.
fn format_value(value: f64, unit: &'static str) -> String {
    format!("{value:.2} {unit}")
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment