Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
ca63c49d
Unverified
Commit
ca63c49d
authored
Jan 26, 2026
by
Janelle Cai
Committed by
GitHub
Jan 26, 2026
Browse files
test: radix tree benchmarks (#5617)
parent
8fc70f20
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
1006 additions
and
0 deletions
+1006
-0
lib/llm/Cargo.toml
lib/llm/Cargo.toml
+7
-0
lib/llm/benches/radix_tree_microbench.rs
lib/llm/benches/radix_tree_microbench.rs
+999
-0
No files found.
lib/llm/Cargo.toml
View file @
ca63c49d
...
...
@@ -27,6 +27,7 @@ cuda = ["dep:cudarc"]
integration
=
["dynamo-runtime/integration"]
media-nixl
=
[
"dep:nixl-sys"
,
"dep:dynamo-memory"
]
media-ffmpeg
=
[
"dep:video-rs"
,
"dep:ffmpeg-next"
,
"dep:memfile"
,
"media-nixl"
]
kv-router-stress
=
[
"dep:clap"
,
"dep:indicatif"
]
[[bench]]
name
=
"tokenizer"
...
...
@@ -206,3 +207,9 @@ tonic-build = { version = "0.13.1" }
name
=
"bench_local_transfer_v2"
path
=
"bin/bench_local_transfer_v2.rs"
required-features
=
["block-manager-bench"]
[[bench]]
name
=
"radix_tree_microbench"
path
=
"benches/radix_tree_microbench.rs"
harness
=
false
required-features
=
["kv-router-stress"]
lib/llm/benches/radix_tree_microbench.rs
0 → 100644
View file @
ca63c49d
// SPDX-FileCopyrightText: Copyright (c) 2024-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
//! Microbenchmark for radix tree operations with configurable size and depth.
//!
//! Measures latency and throughput of:
//! - store_block: Adding blocks to the tree
//! - remove_block: Removing blocks from the tree
//! - find_matches: Finding prefix matches in the tree
//!
//! Size is defined as total (worker, block) pairs in the tree.
//! Depth is the number of blocks per sequence (depth = (isl + osl) / block_size).
//!
//! Run with: cargo run --package dynamo-llm --bin radix_tree_microbench --features kv-router-stress -- --help
use
clap
::{
Parser
,
ValueEnum
};
use
dynamo_llm
::
kv_router
::{
indexer
::{
RadixTree
,
RouterEvent
},
protocols
::{
ExternalSequenceBlockHash
,
KvCacheEvent
,
KvCacheEventData
,
KvCacheRemoveData
,
KvCacheStoreData
,
KvCacheStoredBlockData
,
LocalBlockHash
,
WorkerId
,
compute_block_hash_for_seq
,
},
};
use
rand
::
rngs
::
StdRng
;
use
rand
::{
Rng
,
SeedableRng
};
use
std
::
time
::{
Duration
,
Instant
};
/// Sweep benchmark mode
#[derive(Debug,
Clone,
Copy,
PartialEq,
Eq,
ValueEnum)]
enum
SweepMode
{
/// Vary sequence/query length (query has exactly `depth` blocks, all matching)
Depth
,
/// Vary match length (query has `max_depth` blocks, first `depth` match, rest garbage)
MatchLength
,
/// Vary number of prefix prompt groups (width of shared prefixes)
Width
,
}
#[derive(Parser,
Debug)]
#[command(name
=
"radix_tree_microbench"
)]
#[command(about
=
"Microbenchmark for radix tree operations"
)]
struct
Args
{
/// Ignored: passed by cargo bench harness
#[arg(long,
hide
=
true
)]
bench
:
bool
,
/// Target tree size in total (worker, block) pairs
#[arg(long,
default_value
=
"10000"
)]
size
:
usize
,
/// Sequence depth in blocks (depth = (isl + osl) / block_size, where block_size = 16)
#[arg(long,
default_value
=
"32"
)]
depth
:
usize
,
/// Number of workers to distribute blocks across
#[arg(long,
default_value
=
"4"
)]
num_workers
:
usize
,
/// Number of iterations per operation for timing
#[arg(long,
default_value
=
"1000"
)]
iterations
:
usize
,
/// Prefix prompt ratio (0.0 to 1.0) - portion of sequence from the beginning that is a shared prefix
#[arg(long,
default_value
=
"0.25"
)]
prefix_prompt_ratio
:
f64
,
/// Number of unique prefix prompt groups to randomly sample from
#[arg(long,
default_value
=
"4"
)]
num_prefix_prompts
:
usize
,
/// Run only specific benchmark (hash, store, remove, find_matches, dump, sweep, or all)
#[arg(long,
default_value
=
"all"
)]
benchmark_type
:
String
,
/// KV block size in tokens (for hash computation)
#[arg(long,
default_value
=
"16"
)]
block_size
:
u32
,
/// Verbose output with per-iteration timings
#[arg(short,
long)]
verbose
:
bool
,
/// Minimum depth for sweep mode
#[arg(long,
default_value
=
"1"
)]
min_depth
:
usize
,
/// Maximum depth for sweep mode
#[arg(long,
default_value
=
"8000"
)]
max_depth
:
usize
,
/// Number of depth points to sample in sweep mode (logarithmically spaced)
#[arg(long,
default_value
=
"20"
)]
sweep_points
:
usize
,
/// Iterations per depth point in sweep mode
#[arg(long,
default_value
=
"100"
)]
sweep_iterations
:
usize
,
/// Output format for sweep mode: "table" or "csv"
#[arg(long,
default_value
=
"table"
)]
sweep_format
:
String
,
/// Sweep mode: what to vary during the sweep
#[arg(long,
value_enum,
default_value
=
"depth"
)]
sweep_mode
:
SweepMode
,
/// Minimum width (num_prefix_prompts) for width sweep mode
#[arg(long,
default_value
=
"1"
)]
min_width
:
usize
,
/// Maximum width (num_prefix_prompts) for width sweep mode
#[arg(long,
default_value
=
"64"
)]
max_width
:
usize
,
/// Random seed for reproducibility
#[arg(long,
default_value
=
"42"
)]
seed
:
u64
,
}
/// Pre-generated sequence data for benchmarking
#[derive(Clone)]
struct
SequenceData
{
worker_id
:
WorkerId
,
local_hashes
:
Vec
<
LocalBlockHash
>
,
external_hashes
:
Vec
<
ExternalSequenceBlockHash
>
,
}
impl
SequenceData
{
fn
new
(
seq_id
:
u64
,
worker_id
:
WorkerId
,
depth
:
usize
)
->
Self
{
let
local_hashes
:
Vec
<
LocalBlockHash
>
=
(
0
..
depth
)
.map
(|
block_idx
|
LocalBlockHash
((
seq_id
<<
32
)
|
(
block_idx
as
u64
)))
.collect
();
let
external_hashes
:
Vec
<
ExternalSequenceBlockHash
>
=
(
0
..
depth
)
.map
(|
block_idx
|
ExternalSequenceBlockHash
((
seq_id
<<
32
)
|
(
block_idx
as
u64
)))
.collect
();
Self
{
worker_id
,
local_hashes
,
external_hashes
,
}
}
fn
to_store_event
(
&
self
,
event_id
:
u64
)
->
RouterEvent
{
RouterEvent
{
worker_id
:
self
.worker_id
,
event
:
KvCacheEvent
{
event_id
,
data
:
KvCacheEventData
::
Stored
(
KvCacheStoreData
{
parent_hash
:
None
,
blocks
:
self
.local_hashes
.iter
()
.zip
(
self
.external_hashes
.iter
())
.map
(|(
local
,
ext
)|
KvCacheStoredBlockData
{
tokens_hash
:
*
local
,
block_hash
:
*
ext
,
mm_extra_info
:
None
,
})
.collect
(),
}),
dp_rank
:
0
,
},
}
}
fn
to_remove_event
(
&
self
,
event_id
:
u64
)
->
RouterEvent
{
RouterEvent
{
worker_id
:
self
.worker_id
,
event
:
KvCacheEvent
{
event_id
,
data
:
KvCacheEventData
::
Removed
(
KvCacheRemoveData
{
block_hashes
:
self
.external_hashes
.clone
(),
}),
dp_rank
:
0
,
},
}
}
}
/// Generate sequences with shared prefix prompts
fn
generate_sequences
(
num_sequences
:
usize
,
depth
:
usize
,
num_workers
:
usize
,
prefix_prompt_ratio
:
f64
,
num_prefix_prompts
:
usize
,
seed
:
u64
,
)
->
Vec
<
SequenceData
>
{
let
mut
sequences
=
Vec
::
with_capacity
(
num_sequences
);
let
prefix_length
:
usize
=
(
depth
as
f64
*
prefix_prompt_ratio
)
.round
()
as
usize
;
let
mut
rng
:
StdRng
=
StdRng
::
seed_from_u64
(
seed
);
for
seq_id
in
0
..
num_sequences
{
let
worker_id
=
(
seq_id
%
num_workers
)
as
WorkerId
;
let
mut
seq
=
SequenceData
::
new
(
seq_id
as
u64
,
worker_id
,
depth
);
if
num_prefix_prompts
>
0
&&
prefix_length
>
0
{
let
group_id
=
rng
.random_range
(
0
..
num_prefix_prompts
);
for
i
in
0
..
prefix_length
{
seq
.local_hashes
[
i
]
=
LocalBlockHash
(
0xDEAD_BEEF_0000_0000
|
((
group_id
as
u64
)
<<
32
)
|
(
i
as
u64
));
}
}
sequences
.push
(
seq
);
}
sequences
}
/// Build a pre-populated tree (prints timing info)
fn
build_tree
(
sequences
:
&
[
SequenceData
])
->
RadixTree
{
let
num_blocks
:
usize
=
sequences
.iter
()
.map
(|
s
|
s
.local_hashes
.len
())
.sum
();
print!
(
" Building tree with {} sequences ({} blocks)... "
,
sequences
.len
(),
num_blocks
);
std
::
io
::
Write
::
flush
(
&
mut
std
::
io
::
stdout
())
.unwrap
();
let
start
=
Instant
::
now
();
let
mut
tree
=
RadixTree
::
new
();
for
(
event_id
,
seq
)
in
sequences
.iter
()
.enumerate
()
{
let
event
=
seq
.to_store_event
(
event_id
as
u64
);
let
_
=
tree
.apply_event
(
event
);
}
let
elapsed
=
start
.elapsed
();
println!
(
"done in {:.2?} ({:.2} sequences/sec, {:.2} blocks/sec)"
,
elapsed
,
sequences
.len
()
as
f64
/
elapsed
.as_secs_f64
(),
num_blocks
as
f64
/
elapsed
.as_secs_f64
()
);
tree
}
/// Statistics for a set of timing measurements
#[derive(Debug)]
struct
LatencyStats
{
min
:
Duration
,
max
:
Duration
,
avg
:
Duration
,
p50
:
Duration
,
p95
:
Duration
,
p99
:
Duration
,
throughput_ops_sec
:
f64
,
}
impl
LatencyStats
{
fn
from_durations
(
mut
durations
:
Vec
<
Duration
>
)
->
Self
{
durations
.sort
();
let
n
=
durations
.len
();
let
total
:
Duration
=
durations
.iter
()
.sum
();
let
avg
=
total
/
n
as
u32
;
Self
{
min
:
durations
[
0
],
max
:
durations
[
n
-
1
],
avg
,
p50
:
durations
[
n
/
2
],
p95
:
durations
[
n
*
95
/
100
],
p99
:
durations
[
n
*
99
/
100
],
throughput_ops_sec
:
n
as
f64
/
total
.as_secs_f64
(),
}
}
fn
print
(
&
self
,
operation
:
&
str
,
blocks_per_op
:
usize
)
{
println!
(
"
\n
{} Latency Statistics:"
,
operation
);
println!
(
" min: {:>12?}"
,
self
.min
);
println!
(
" avg: {:>12?}"
,
self
.avg
);
println!
(
" p50: {:>12?}"
,
self
.p50
);
println!
(
" p95: {:>12?}"
,
self
.p95
);
println!
(
" p99: {:>12?}"
,
self
.p99
);
println!
(
" max: {:>12?}"
,
self
.max
);
println!
(
" throughput: {:.2} ops/sec"
,
self
.throughput_ops_sec
);
println!
(
" throughput: {:.2} blocks/sec"
,
self
.throughput_ops_sec
*
blocks_per_op
as
f64
);
}
}
/// Benchmark compute_block_hash_for_seq operation
fn
bench_hash
(
args
:
&
Args
)
{
println!
(
"
\n
=== Benchmarking COMPUTE_BLOCK_HASH (per-request hot path) ==="
);
let
num_tokens
=
args
.depth
*
args
.block_size
as
usize
;
println!
(
" Token sequence length: {} tokens ({} blocks)"
,
num_tokens
,
args
.depth
);
// Generate token sequences to hash
let
token_sequences
:
Vec
<
Vec
<
u32
>>
=
(
0
..
args
.iterations
)
.map
(|
i
|
{
(
0
..
num_tokens
)
.map
(|
j
|
((
i
*
num_tokens
+
j
)
%
50000
)
as
u32
)
.collect
()
})
.collect
();
let
mut
durations
=
Vec
::
with_capacity
(
args
.iterations
);
for
(
i
,
tokens
)
in
token_sequences
.iter
()
.enumerate
()
{
let
start
=
Instant
::
now
();
let
_
=
compute_block_hash_for_seq
(
tokens
,
args
.block_size
,
None
);
let
elapsed
=
start
.elapsed
();
durations
.push
(
elapsed
);
if
args
.verbose
&&
(
i
+
1
)
%
100
==
0
{
println!
(
" Completed {}/{} iterations"
,
i
+
1
,
args
.iterations
);
}
}
let
stats
=
LatencyStats
::
from_durations
(
durations
);
stats
.print
(
"COMPUTE_BLOCK_HASH"
,
args
.depth
);
}
/// Benchmark store_block operation
fn
bench_store
(
args
:
&
Args
)
{
println!
(
"
\n
=== Benchmarking STORE_BLOCK ==="
);
let
num_sequences
=
args
.size
/
args
.depth
;
let
bench_iters
=
args
.iterations
.min
(
num_sequences
);
let
all_sequences
=
generate_sequences
(
num_sequences
,
args
.depth
,
args
.num_workers
,
args
.prefix_prompt_ratio
,
args
.num_prefix_prompts
,
args
.seed
,
);
let
split_point
=
num_sequences
.saturating_sub
(
bench_iters
);
let
pre_sequences
=
&
all_sequences
[
..
split_point
];
let
bench_sequences
=
&
all_sequences
[
split_point
..
];
// Build tree once, then store sequences sequentially
// Tree grows from (size - iterations) to size over the benchmark
let
mut
tree
=
build_tree
(
&
pre_sequences
);
println!
(
" Initial tree size: {} blocks, will grow to ~{} blocks"
,
tree
.current_size
(),
tree
.current_size
()
+
bench_iters
*
args
.depth
);
let
mut
durations
=
Vec
::
with_capacity
(
bench_iters
);
for
(
i
,
seq
)
in
bench_sequences
.iter
()
.enumerate
()
{
let
event
=
seq
.to_store_event
(
i
as
u64
);
let
start
=
Instant
::
now
();
let
_
=
tree
.apply_event
(
event
);
let
elapsed
=
start
.elapsed
();
durations
.push
(
elapsed
);
if
args
.verbose
&&
(
i
+
1
)
%
100
==
0
{
println!
(
" Completed {}/{} iterations"
,
i
+
1
,
bench_iters
);
}
}
let
stats
=
LatencyStats
::
from_durations
(
durations
);
stats
.print
(
"STORE_BLOCK"
,
args
.depth
);
}
/// Benchmark remove_block operation
fn
bench_remove
(
args
:
&
Args
)
{
println!
(
"
\n
=== Benchmarking REMOVE_BLOCK ==="
);
let
num_sequences
=
args
.size
/
args
.depth
;
let
sequences
=
generate_sequences
(
num_sequences
,
args
.depth
,
args
.num_workers
,
args
.prefix_prompt_ratio
,
args
.num_prefix_prompts
,
args
.seed
,
);
// Build tree once, then remove/re-add to restore state after each timed removal
let
mut
tree
=
build_tree
(
&
sequences
);
println!
(
" Tree size: {} blocks"
,
tree
.current_size
());
let
mut
durations
=
Vec
::
with_capacity
(
args
.iterations
);
for
i
in
0
..
args
.iterations
{
// Remove a sequence (timed)
let
seq_to_remove
=
&
sequences
[
i
%
sequences
.len
()];
let
remove_event
=
seq_to_remove
.to_remove_event
(
i
as
u64
);
let
start
=
Instant
::
now
();
let
_
=
tree
.apply_event
(
remove_event
);
let
elapsed
=
start
.elapsed
();
durations
.push
(
elapsed
);
// Re-add the sequence to restore tree state (untimed)
let
store_event
=
seq_to_remove
.to_store_event
(
i
as
u64
+
args
.iterations
as
u64
);
let
_
=
tree
.apply_event
(
store_event
);
if
args
.verbose
&&
(
i
+
1
)
%
100
==
0
{
println!
(
" Completed {}/{} iterations"
,
i
+
1
,
args
.iterations
);
}
}
let
stats
=
LatencyStats
::
from_durations
(
durations
);
stats
.print
(
"REMOVE_BLOCK"
,
args
.depth
);
}
/// Benchmark find_matches operation
fn
bench_find_matches
(
args
:
&
Args
)
{
println!
(
"
\n
=== Benchmarking FIND_MATCHES ==="
);
let
num_sequences
=
args
.size
/
args
.depth
;
let
sequences
=
generate_sequences
(
num_sequences
,
args
.depth
,
args
.num_workers
,
args
.prefix_prompt_ratio
,
args
.num_prefix_prompts
,
args
.seed
,
);
// Build tree once for all find_matches calls
let
tree
=
build_tree
(
&
sequences
);
println!
(
" Tree built with {} sequences, {} total blocks"
,
sequences
.len
(),
tree
.current_size
()
);
// Benchmark hit case (lookup existing sequences)
println!
(
"
\n
--- HIT case (existing sequences) ---"
);
let
mut
hit_durations
=
Vec
::
with_capacity
(
args
.iterations
);
for
i
in
0
..
args
.iterations
{
let
seq
=
&
sequences
[
i
%
sequences
.len
()];
let
hashes_copy
=
seq
.local_hashes
.clone
();
let
start
=
Instant
::
now
();
let
_
=
tree
.find_matches
(
hashes_copy
,
false
);
let
elapsed
=
start
.elapsed
();
hit_durations
.push
(
elapsed
);
if
args
.verbose
&&
(
i
+
1
)
%
100
==
0
{
println!
(
" Completed {}/{} iterations"
,
i
+
1
,
args
.iterations
);
}
}
let
hit_stats
=
LatencyStats
::
from_durations
(
hit_durations
);
hit_stats
.print
(
"FIND_MATCHES (HIT)"
,
args
.depth
);
// Benchmark miss case (find_matches on non-existing sequences)
println!
(
"
\n
--- MISS case (non-existing sequences) ---"
);
let
mut
miss_durations
=
Vec
::
with_capacity
(
args
.iterations
);
for
i
in
0
..
args
.iterations
{
// Generate a sequence that won't match
let
miss_hashes
:
Vec
<
LocalBlockHash
>
=
(
0
..
args
.depth
)
.map
(|
j
|
LocalBlockHash
(
0xBAD_C0DE_0000_0000
|
((
i
as
u64
)
<<
16
)
|
(
j
as
u64
)))
.collect
();
let
start
=
Instant
::
now
();
let
_
=
tree
.find_matches
(
miss_hashes
,
false
);
let
elapsed
=
start
.elapsed
();
miss_durations
.push
(
elapsed
);
if
args
.verbose
&&
(
i
+
1
)
%
100
==
0
{
println!
(
" Completed {}/{} iterations"
,
i
+
1
,
args
.iterations
);
}
}
let
miss_stats
=
LatencyStats
::
from_durations
(
miss_durations
);
miss_stats
.print
(
"FIND_MATCHES (MISS)"
,
args
.depth
);
// Benchmark partial match case
println!
(
"
\n
--- PARTIAL case (prefix match only) ---"
);
let
mut
partial_durations
=
Vec
::
with_capacity
(
args
.iterations
);
for
i
in
0
..
args
.iterations
{
let
seq
=
&
sequences
[
i
%
sequences
.len
()];
// Use first half of real sequence, second half is garbage
let
half
=
args
.depth
/
2
;
let
mut
partial_hashes
=
seq
.local_hashes
[
..
half
]
.to_vec
();
partial_hashes
.extend
(
(
0
..
half
)
.map
(|
j
|
LocalBlockHash
(
0xDEAD_0000
|
((
i
as
u64
)
<<
16
)
|
(
j
as
u64
))),
);
let
start
=
Instant
::
now
();
let
_
=
tree
.find_matches
(
partial_hashes
,
false
);
let
elapsed
=
start
.elapsed
();
partial_durations
.push
(
elapsed
);
if
args
.verbose
&&
(
i
+
1
)
%
100
==
0
{
println!
(
" Completed {}/{} iterations"
,
i
+
1
,
args
.iterations
);
}
}
let
partial_stats
=
LatencyStats
::
from_durations
(
partial_durations
);
partial_stats
.print
(
"FIND_MATCHES (PARTIAL)"
,
args
.depth
);
// Benchmark with early_exit=true
println!
(
"
\n
--- EARLY_EXIT case ---"
);
let
mut
early_exit_durations
=
Vec
::
with_capacity
(
args
.iterations
);
for
i
in
0
..
args
.iterations
{
let
seq
=
&
sequences
[
i
%
sequences
.len
()];
let
start
=
Instant
::
now
();
let
_
=
tree
.find_matches
(
seq
.local_hashes
.clone
(),
true
);
let
elapsed
=
start
.elapsed
();
early_exit_durations
.push
(
elapsed
);
}
let
early_exit_stats
=
LatencyStats
::
from_durations
(
early_exit_durations
);
early_exit_stats
.print
(
"FIND_MATCHES (EARLY_EXIT)"
,
args
.depth
);
}
/// Generate logarithmically spaced values between min and max
fn
generate_log_spaced_points
(
min_val
:
usize
,
max_val
:
usize
,
num_points
:
usize
)
->
Vec
<
usize
>
{
if
num_points
<=
1
{
return
vec!
[
max_val
];
}
let
log_min
=
(
min_val
as
f64
)
.ln
();
let
log_max
=
(
max_val
as
f64
)
.ln
();
let
step
=
(
log_max
-
log_min
)
/
(
num_points
-
1
)
as
f64
;
let
mut
points
:
Vec
<
usize
>
=
(
0
..
num_points
)
.map
(|
i
|
(
log_min
+
step
*
i
as
f64
)
.exp
()
.round
()
as
usize
)
.map
(|
v
|
v
.max
(
1
))
// Ensure minimum value of 1
.collect
();
// Deduplicate (logarithmic spacing can produce duplicates at low values)
points
.dedup
();
points
}
/// Latency statistics (avg, p50, p99) in nanoseconds
#[derive(Debug)]
struct
DurationStats
{
avg_ns
:
u64
,
p50_ns
:
u64
,
p99_ns
:
u64
,
}
impl
DurationStats
{
/// Compute stats from durations. Sorts the input vector in place.
fn
from_durations
(
durations
:
&
mut
[
Duration
])
->
Self
{
durations
.sort
();
let
n
=
durations
.len
();
let
avg
=
durations
.iter
()
.sum
::
<
Duration
>
()
/
n
as
u32
;
Self
{
avg_ns
:
avg
.as_nanos
()
as
u64
,
p50_ns
:
durations
[
n
/
2
]
.as_nanos
()
as
u64
,
p99_ns
:
durations
[
n
*
99
/
100
]
.as_nanos
()
as
u64
,
}
}
}
/// Results for a single sweep point (depth or width)
#[derive(Debug)]
struct
SweepResult
{
point
:
usize
,
point_label
:
&
'static
str
,
store
:
DurationStats
,
remove
:
DurationStats
,
find_matches
:
DurationStats
,
}
impl
SweepResult
{
fn
csv_header
(
&
self
)
->
String
{
format!
(
"{},store_avg_ns,store_p50_ns,store_p99_ns,remove_avg_ns,remove_p50_ns,remove_p99_ns,find_matches_avg_ns,find_matches_p50_ns,find_matches_p99_ns"
,
self
.point_label
)
}
fn
csv_row
(
&
self
)
->
String
{
format!
(
"{},{},{},{},{},{},{},{},{},{}"
,
self
.point
,
self
.store.avg_ns
,
self
.store.p50_ns
,
self
.store.p99_ns
,
self
.remove.avg_ns
,
self
.remove.p50_ns
,
self
.remove.p99_ns
,
self
.find_matches.avg_ns
,
self
.find_matches.p50_ns
,
self
.find_matches.p99_ns
)
}
fn
table_header
(
&
self
)
->
String
{
format!
(
"{:>8} | store_avg store_p50 store_p99 | remove_avg remove_p50 remove_p99 | fm_avg fm_p50 fm_p99"
,
self
.point_label
)
}
fn
table_row
(
&
self
)
->
String
{
format!
(
"{:>8} | {:>12} {:>12} {:>12} | {:>12} {:>12} {:>12} | {:>12} {:>12} {:>12}"
,
self
.point
,
format_duration_ns
(
self
.store.avg_ns
),
format_duration_ns
(
self
.store.p50_ns
),
format_duration_ns
(
self
.store.p99_ns
),
format_duration_ns
(
self
.remove.avg_ns
),
format_duration_ns
(
self
.remove.p50_ns
),
format_duration_ns
(
self
.remove.p99_ns
),
format_duration_ns
(
self
.find_matches.avg_ns
),
format_duration_ns
(
self
.find_matches.p50_ns
),
format_duration_ns
(
self
.find_matches.p99_ns
)
)
}
}
fn
print_sweep_results_dynamic
(
results
:
&
[
SweepResult
],
format
:
&
str
)
{
if
results
.is_empty
()
{
return
;
}
println!
();
if
format
==
"csv"
{
println!
(
"{}"
,
results
[
0
]
.csv_header
());
for
r
in
results
{
println!
(
"{}"
,
r
.csv_row
());
}
}
else
{
println!
(
"{}"
,
results
[
0
]
.table_header
());
println!
(
"{}"
,
"-"
.repeat
(
130
));
for
r
in
results
{
println!
(
"{}"
,
r
.table_row
());
}
}
}
/// Benchmark store/remove/find_matches across a range of depths or widths.
///
/// For each sweep point, the tree is rebuilt.
///
/// With `--sweep_mode match_length`, find_matches queries have `max_depth` blocks
/// where only the first `depth` blocks match (rest are garbage). With `--sweep_mode depth`,
/// queries have exactly `depth` blocks (all matching). With `--sweep_mode width`,
/// the number of prefix prompt groups is varied.
fn
bench_sweep
(
args
:
&
Args
)
{
let
seq_length
=
args
.max_depth
;
let
num_sequences
=
args
.size
/
seq_length
;
if
num_sequences
<
2
{
eprintln!
(
"Error: size {} / max_depth {} = {} sequences (need at least 2).
\
Increase --size or decrease --max-depth."
,
args
.size
,
seq_length
,
num_sequences
);
std
::
process
::
exit
(
1
);
}
let
(
mode_name
,
point_label
,
sweep_points
)
=
match
args
.sweep_mode
{
SweepMode
::
Depth
=>
(
"Depth"
,
"depth"
,
generate_log_spaced_points
(
args
.min_depth
,
args
.max_depth
,
args
.sweep_points
),
),
SweepMode
::
MatchLength
=>
(
"Match Length"
,
"depth"
,
generate_log_spaced_points
(
args
.min_depth
,
args
.max_depth
,
args
.sweep_points
),
),
SweepMode
::
Width
=>
(
"Width"
,
"width"
,
generate_log_spaced_points
(
args
.min_width
,
args
.max_width
,
args
.sweep_points
),
),
};
println!
(
"
\n
=== {} Sweep Benchmark ==="
,
mode_name
);
println!
(
" Sequence length: {} blocks (fixed)"
,
seq_length
);
match
args
.sweep_mode
{
SweepMode
::
Depth
|
SweepMode
::
MatchLength
=>
{
println!
(
" Sweep range: {} to {} ({} points, log-spaced)"
,
args
.min_depth
,
args
.max_depth
,
args
.sweep_points
);
}
SweepMode
::
Width
=>
{
println!
(
" Width range: {} to {} ({} points, log-spaced)"
,
args
.min_width
,
args
.max_width
,
args
.sweep_points
);
println!
(
" Prefix prompt ratio: {:.1}%"
,
args
.prefix_prompt_ratio
*
100.0
);
}
}
println!
(
" Iterations per point: {}"
,
args
.sweep_iterations
);
println!
(
" Tree: {} sequences, {} total blocks"
,
num_sequences
,
num_sequences
*
seq_length
);
println!
(
" Workers: {}"
,
args
.num_workers
);
match
args
.sweep_mode
{
SweepMode
::
MatchLength
=>
{
println!
(
" Mode: find_matches queries padded with garbage to max_depth"
);
}
SweepMode
::
Depth
=>
{
println!
(
" Mode: find_matches queries truncated to depth"
);
}
SweepMode
::
Width
=>
{
println!
(
" Mode: varying num_prefix_prompts, full-depth operations"
);
}
}
println!
();
let
mut
results
:
Vec
<
SweepResult
>
=
Vec
::
with_capacity
(
sweep_points
.len
());
for
(
idx
,
&
point
)
in
sweep_points
.iter
()
.enumerate
()
{
print!
(
"[{}/{}] {}={}... "
,
idx
+
1
,
sweep_points
.len
(),
point_label
,
point
);
std
::
io
::
Write
::
flush
(
&
mut
std
::
io
::
stdout
())
.unwrap
();
// Determine depth and num_prefix_prompts for this sweep point
let
(
depth
,
num_prefix_prompts
)
=
match
args
.sweep_mode
{
SweepMode
::
Depth
|
SweepMode
::
MatchLength
=>
(
point
,
args
.num_prefix_prompts
),
SweepMode
::
Width
=>
(
seq_length
,
point
),
};
// Generate sequences and rebuild tree for this point
let
extra_count
=
args
.sweep_iterations
;
let
all_sequences
=
generate_sequences
(
num_sequences
+
extra_count
,
seq_length
,
args
.num_workers
,
args
.prefix_prompt_ratio
,
num_prefix_prompts
,
args
.seed
,
);
let
tree_sequences
=
&
all_sequences
[
..
num_sequences
];
let
extra_sequences
=
&
all_sequences
[
num_sequences
..
];
let
mut
tree
=
build_tree
(
tree_sequences
);
// --- STORE benchmark ---
let
mut
store_durations
=
Vec
::
with_capacity
(
args
.sweep_iterations
);
for
(
i
,
seq
)
in
extra_sequences
.iter
()
.enumerate
()
.take
(
args
.sweep_iterations
)
{
let
truncated
=
SequenceData
{
worker_id
:
seq
.worker_id
,
local_hashes
:
seq
.local_hashes
[
..
depth
]
.to_vec
(),
external_hashes
:
seq
.external_hashes
[
..
depth
]
.to_vec
(),
};
let
store_event
=
truncated
.to_store_event
(
i
as
u64
);
let
start
=
Instant
::
now
();
let
_
=
tree
.apply_event
(
store_event
);
store_durations
.push
(
start
.elapsed
());
// Remove to restore tree state (untimed)
let
remove_event
=
truncated
.to_remove_event
(
i
as
u64
);
let
_
=
tree
.apply_event
(
remove_event
);
}
// --- REMOVE benchmark ---
let
mut
remove_durations
=
Vec
::
with_capacity
(
args
.sweep_iterations
);
for
i
in
0
..
args
.sweep_iterations
.min
(
num_sequences
)
{
let
seq
=
&
tree_sequences
[
i
%
tree_sequences
.len
()];
let
truncated
=
SequenceData
{
worker_id
:
seq
.worker_id
,
local_hashes
:
seq
.local_hashes
[
..
depth
]
.to_vec
(),
external_hashes
:
seq
.external_hashes
[
..
depth
]
.to_vec
(),
};
let
remove_event
=
truncated
.to_remove_event
(
i
as
u64
);
let
start
=
Instant
::
now
();
let
_
=
tree
.apply_event
(
remove_event
);
remove_durations
.push
(
start
.elapsed
());
// Re-add to restore state (untimed)
let
store_event
=
truncated
.to_store_event
(
i
as
u64
+
1000000
);
let
_
=
tree
.apply_event
(
store_event
);
}
// --- FIND_MATCHES benchmark ---
let
mut
find_matches_durations
=
Vec
::
with_capacity
(
args
.sweep_iterations
);
for
i
in
0
..
args
.sweep_iterations
{
let
seq
=
&
tree_sequences
[
i
%
tree_sequences
.len
()];
let
query
=
match
args
.sweep_mode
{
SweepMode
::
MatchLength
=>
{
// Match length mode: first `depth` blocks match, rest are garbage
let
mut
q
=
seq
.local_hashes
[
..
depth
]
.to_vec
();
let
garbage_len
=
seq_length
-
depth
;
q
.extend
((
0
..
garbage_len
)
.map
(|
j
|
{
LocalBlockHash
(
0xBAD_C0DE_0000_0000
|
((
i
as
u64
)
<<
16
)
|
(
j
as
u64
))
}));
q
}
SweepMode
::
Depth
|
SweepMode
::
Width
=>
{
// Depth/width mode: query has exactly `depth` blocks
seq
.local_hashes
[
..
depth
]
.to_vec
()
}
};
let
start
=
Instant
::
now
();
let
_
=
tree
.find_matches
(
query
,
false
);
find_matches_durations
.push
(
start
.elapsed
());
}
// Compute stats
let
store
=
DurationStats
::
from_durations
(
&
mut
store_durations
);
let
remove
=
DurationStats
::
from_durations
(
&
mut
remove_durations
);
let
find_matches
=
DurationStats
::
from_durations
(
&
mut
find_matches_durations
);
println!
(
"store={:.2}us, remove={:.2}us, find_matches={:.2}us"
,
store
.avg_ns
as
f64
/
1000.0
,
remove
.avg_ns
as
f64
/
1000.0
,
find_matches
.avg_ns
as
f64
/
1000.0
);
results
.push
(
SweepResult
{
point
,
point_label
,
store
,
remove
,
find_matches
,
});
}
print_sweep_results_dynamic
(
&
results
,
&
args
.sweep_format
);
}
/// Benchmark dump_tree_as_events (BFS dump)
fn
bench_dump
(
args
:
&
Args
)
{
println!
(
"
\n
=== Benchmarking DUMP_TREE_AS_EVENTS (BFS dump) ==="
);
let
num_sequences
=
args
.size
/
args
.depth
;
let
sequences
=
generate_sequences
(
num_sequences
,
args
.depth
,
args
.num_workers
,
args
.prefix_prompt_ratio
,
args
.num_prefix_prompts
,
args
.seed
,
);
let
tree
=
build_tree
(
&
sequences
);
println!
(
" Tree built with {} sequences, {} total blocks"
,
sequences
.len
(),
tree
.current_size
()
);
// Single iteration timing
let
start
=
Instant
::
now
();
let
events
=
tree
.dump_tree_as_events
();
let
elapsed
=
start
.elapsed
();
println!
(
"
\n
DUMP_TREE_AS_EVENTS Results:"
);
println!
(
" Time: {:?}"
,
elapsed
);
println!
(
" Events: {}"
,
events
.len
());
println!
(
" Throughput: {:.2} events/sec"
,
events
.len
()
as
f64
/
elapsed
.as_secs_f64
()
);
}
/// Format nanoseconds as human-readable string
fn
format_duration_ns
(
ns
:
u64
)
->
String
{
if
ns
>=
1_000_000_000
{
format!
(
"{:.2}s"
,
ns
as
f64
/
1_000_000_000.0
)
}
else
if
ns
>=
1_000_000
{
format!
(
"{:.2}ms"
,
ns
as
f64
/
1_000_000.0
)
}
else
if
ns
>=
1_000
{
format!
(
"{:.2}us"
,
ns
as
f64
/
1_000.0
)
}
else
{
format!
(
"{}ns"
,
ns
)
}
}
fn
main
()
{
let
args
=
Args
::
parse
();
// Validate arguments to prevent panics
if
args
.size
==
0
||
args
.depth
==
0
||
args
.num_workers
==
0
||
args
.iterations
==
0
||
args
.block_size
==
0
||
args
.min_depth
==
0
||
args
.max_depth
==
0
||
args
.min_width
==
0
||
args
.max_width
==
0
||
args
.sweep_iterations
==
0
{
eprintln!
(
"size, depth, num_workers, iterations, block_size, min_depth, max_depth, min_width, max_width, and sweep_iterations must be > 0"
);
std
::
process
::
exit
(
1
);
}
if
args
.min_depth
>
args
.max_depth
{
eprintln!
(
"min_depth must be <= max_depth"
);
std
::
process
::
exit
(
1
);
}
if
args
.min_width
>
args
.max_width
{
eprintln!
(
"min_width must be <= max_width"
);
std
::
process
::
exit
(
1
);
}
if
!
(
0.0
..=
1.0
)
.contains
(
&
args
.prefix_prompt_ratio
)
{
eprintln!
(
"prefix_prompt_ratio must be between 0.0 and 1.0"
);
std
::
process
::
exit
(
1
);
}
let
num_sequences
=
args
.size
/
args
.depth
;
if
matches!
(
args
.benchmark_type
.as_str
(),
"store"
|
"remove"
|
"lookup"
|
"sweep"
|
"all"
)
&&
num_sequences
==
0
{
eprintln!
(
"size must be >= depth to produce at least one sequence for {}"
,
args
.benchmark_type
);
std
::
process
::
exit
(
1
);
}
println!
(
"Radix Tree Microbenchmark"
);
println!
(
"=========================
\n
"
);
println!
(
"Configuration:"
);
println!
(
" Target size: {} (worker, block) pairs"
,
args
.size
);
println!
(
" Depth: {} blocks/sequence (= {} tokens with block_size={})"
,
args
.depth
,
args
.depth
*
args
.block_size
as
usize
,
args
.block_size
);
println!
(
" Block size: {} tokens"
,
args
.block_size
);
println!
(
" Workers: {}"
,
args
.num_workers
);
println!
(
" Iterations: {}"
,
args
.iterations
);
println!
(
" Prefix prompt ratio: {:.1}% ({} blocks at depth {})"
,
args
.prefix_prompt_ratio
*
100.0
,
(
args
.depth
as
f64
*
args
.prefix_prompt_ratio
)
.round
()
as
usize
,
args
.depth
);
println!
(
" Prefix prompt groups: {}"
,
args
.num_prefix_prompts
);
println!
(
"
\n
Derived: {} sequences to reach target size"
,
num_sequences
);
match
args
.benchmark_type
.as_str
()
{
"hash"
=>
bench_hash
(
&
args
),
"store"
=>
bench_store
(
&
args
),
"remove"
=>
bench_remove
(
&
args
),
"find_matches"
=>
bench_find_matches
(
&
args
),
"dump"
=>
bench_dump
(
&
args
),
"sweep"
=>
bench_sweep
(
&
args
),
"all"
=>
{
bench_hash
(
&
args
);
bench_store
(
&
args
);
bench_remove
(
&
args
);
bench_find_matches
(
&
args
);
bench_dump
(
&
args
);
}
_
=>
{
eprintln!
(
"Unknown benchmark type: {}. Use 'hash', 'store', 'remove', 'find_matches', 'dump', 'sweep', or 'all'"
,
args
.benchmark_type
);
std
::
process
::
exit
(
1
);
}
}
println!
(
"
\n
Benchmark complete."
);
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment