Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
cf97c0dc
Unverified
Commit
cf97c0dc
authored
Nov 17, 2025
by
Hongkuan Zhou
Committed by
GitHub
Nov 17, 2025
Browse files
fix: DSR1 DEP Prefill Profiling Benchmark (#4367)
Signed-off-by:
hongkuanz
<
hongkuanz@nvidia.com
>
parent
24af5a33
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
35 additions
and
7 deletions
+35
-7
benchmarks/profiler/profile_sla.py
benchmarks/profiler/profile_sla.py
+1
-0
benchmarks/profiler/utils/aiperf.py
benchmarks/profiler/utils/aiperf.py
+24
-4
benchmarks/profiler/utils/config_modifiers/parallelization_mapping.py
...rofiler/utils/config_modifiers/parallelization_mapping.py
+2
-1
benchmarks/profiler/utils/config_modifiers/sglang.py
benchmarks/profiler/utils/config_modifiers/sglang.py
+2
-0
benchmarks/profiler/utils/defaults.py
benchmarks/profiler/utils/defaults.py
+5
-0
benchmarks/profiler/utils/profile_decode.py
benchmarks/profiler/utils/profile_decode.py
+1
-0
benchmarks/profiler/utils/profile_prefill.py
benchmarks/profiler/utils/profile_prefill.py
+0
-2
No files found.
benchmarks/profiler/profile_sla.py
View file @
cf97c0dc
...
@@ -457,6 +457,7 @@ async def run_profile(args):
...
@@ -457,6 +457,7 @@ async def run_profile(args):
model_name
,
model_name
,
base_url
=
base_url
,
base_url
=
base_url
,
num_gpus
=
num_gpus
,
num_gpus
=
num_gpus
,
attention_dp_size
=
mapping
.
get_attn_dp_size
(),
)
)
if
itl
is
not
None
and
thpt_per_gpu
is
not
None
:
if
itl
is
not
None
and
thpt_per_gpu
is
not
None
:
...
...
benchmarks/profiler/utils/aiperf.py
View file @
cf97c0dc
...
@@ -20,6 +20,12 @@ import random
...
@@ -20,6 +20,12 @@ import random
import
subprocess
import
subprocess
from
typing
import
Optional
,
Tuple
from
typing
import
Optional
,
Tuple
from
benchmarks.profiler.utils.defaults
import
(
AIPERF_PREFILL_ATTN_DP_NUM_REQ_RATIO
,
AIPERF_PREFILL_BENCHMARK_OSL
,
AIPERF_WARMUP_REQUEST_PER_DP_RANK
,
)
logger
=
logging
.
getLogger
(
__name__
)
logger
=
logging
.
getLogger
(
__name__
)
logger
.
setLevel
(
logging
.
INFO
)
logger
.
setLevel
(
logging
.
INFO
)
console_handler
=
logging
.
StreamHandler
()
console_handler
=
logging
.
StreamHandler
()
...
@@ -37,7 +43,7 @@ def _get_common_aiperf_cmd(
...
@@ -37,7 +43,7 @@ def _get_common_aiperf_cmd(
model
=
"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
,
model
=
"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
,
tokenizer
=
"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
,
tokenizer
=
"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
,
base_url
=
"http://localhost:8000"
,
base_url
=
"http://localhost:8000"
,
warmup_request_count
:
int
=
3
,
warmup_request_count
:
int
=
AIPERF_WARMUP_REQUEST_PER_DP_RANK
,
):
):
return
[
return
[
"aiperf"
,
"aiperf"
,
...
@@ -74,11 +80,11 @@ def get_prefill_aiperf_cmd(
...
@@ -74,11 +80,11 @@ def get_prefill_aiperf_cmd(
seed
=
100
,
seed
=
100
,
model
=
"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
,
model
=
"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
,
tokenizer
=
"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
,
tokenizer
=
"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
,
osl
=
5
,
osl
=
AIPERF_PREFILL_BENCHMARK_OSL
,
base_url
=
"http://localhost:8000"
,
base_url
=
"http://localhost:8000"
,
concurrency
:
int
=
1
,
concurrency
:
int
=
1
,
request_count
:
int
=
1
,
request_count
:
int
=
1
,
warmup_request_count
:
int
=
3
,
warmup_request_count
:
int
=
AIPERF_WARMUP_REQUEST_PER_DP_RANK
,
):
):
return
_get_common_aiperf_cmd
(
return
_get_common_aiperf_cmd
(
artifact_dir
,
artifact_dir
,
...
@@ -116,6 +122,7 @@ def get_decode_aiperf_cmd(
...
@@ -116,6 +122,7 @@ def get_decode_aiperf_cmd(
model
=
"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
,
model
=
"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
,
tokenizer
=
"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
,
tokenizer
=
"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
,
base_url
=
"http://localhost:8000"
,
base_url
=
"http://localhost:8000"
,
warmup_request_count
:
int
=
AIPERF_WARMUP_REQUEST_PER_DP_RANK
,
):
):
return
_get_common_aiperf_cmd
(
return
_get_common_aiperf_cmd
(
artifact_dir
,
artifact_dir
,
...
@@ -123,6 +130,7 @@ def get_decode_aiperf_cmd(
...
@@ -123,6 +130,7 @@ def get_decode_aiperf_cmd(
model
,
model
,
tokenizer
,
tokenizer
,
base_url
,
base_url
,
warmup_request_count
=
warmup_request_count
,
)
+
[
)
+
[
"--synthetic-input-tokens-mean"
,
"--synthetic-input-tokens-mean"
,
str
(
isl
),
str
(
isl
),
...
@@ -207,7 +215,7 @@ def get_prefill_ttft(
...
@@ -207,7 +215,7 @@ def get_prefill_ttft(
tokenizer
:
str
,
tokenizer
:
str
,
base_url
:
str
=
"http://localhost:8000"
,
base_url
:
str
=
"http://localhost:8000"
,
attention_dp_size
:
int
=
1
,
attention_dp_size
:
int
=
1
,
attn_dp_num_req_ratio
:
int
=
4
,
attn_dp_num_req_ratio
:
int
=
AIPERF_PREFILL_ATTN_DP_NUM_REQ_RATIO
,
)
->
Optional
[
float
]:
)
->
Optional
[
float
]:
"""
"""
Run prefill benchmark and extract TTFT (ms). Returns None on failure.
Run prefill benchmark and extract TTFT (ms). Returns None on failure.
...
@@ -218,6 +226,7 @@ def get_prefill_ttft(
...
@@ -218,6 +226,7 @@ def get_prefill_ttft(
"""
"""
# DEP-aware measurement (waves of size attention_dp_size)
# DEP-aware measurement (waves of size attention_dp_size)
if
attention_dp_size
>
1
:
if
attention_dp_size
>
1
:
assert
attn_dp_num_req_ratio
>
0
,
"attn_dp_num_req_ratio must be greater than 0"
total_concurrency
=
attention_dp_size
*
attn_dp_num_req_ratio
total_concurrency
=
attention_dp_size
*
attn_dp_num_req_ratio
logger
.
info
(
logger
.
info
(
f
"DEP prefill measurement: isl=
{
isl
}
, attn_dp=
{
attention_dp_size
}
, attn_dp_num_req_ratio=
{
attn_dp_num_req_ratio
}
, "
f
"DEP prefill measurement: isl=
{
isl
}
, attn_dp=
{
attention_dp_size
}
, attn_dp_num_req_ratio=
{
attn_dp_num_req_ratio
}
, "
...
@@ -232,9 +241,16 @@ def get_prefill_ttft(
...
@@ -232,9 +241,16 @@ def get_prefill_ttft(
base_url
=
base_url
,
base_url
=
base_url
,
concurrency
=
total_concurrency
,
concurrency
=
total_concurrency
,
request_count
=
total_concurrency
,
request_count
=
total_concurrency
,
warmup_request_count
=
AIPERF_WARMUP_REQUEST_PER_DP_RANK
*
attention_dp_size
,
)
)
try
:
try
:
max_ttft
=
float
(
aiperf_result
[
"time_to_first_token"
][
"max"
])
max_ttft
=
float
(
aiperf_result
[
"time_to_first_token"
][
"max"
])
# subtract the decoding time in-between prefill runs
max_ttft
-=
(
float
(
aiperf_result
[
"inter_token_latency"
][
"avg"
])
*
(
AIPERF_PREFILL_BENCHMARK_OSL
-
1
)
*
(
attn_dp_num_req_ratio
-
1
)
)
return
max_ttft
/
float
(
attn_dp_num_req_ratio
)
return
max_ttft
/
float
(
attn_dp_num_req_ratio
)
except
(
KeyError
,
TypeError
,
ValueError
):
except
(
KeyError
,
TypeError
,
ValueError
):
logger
.
warning
(
logger
.
warning
(
...
@@ -266,6 +282,7 @@ def get_decode_itl_and_thpt_per_gpu(
...
@@ -266,6 +282,7 @@ def get_decode_itl_and_thpt_per_gpu(
tokenizer
:
str
,
tokenizer
:
str
,
base_url
:
str
=
"http://localhost:8000"
,
base_url
:
str
=
"http://localhost:8000"
,
num_gpus
:
int
=
1
,
num_gpus
:
int
=
1
,
attention_dp_size
:
int
=
1
,
)
->
Tuple
[
Optional
[
float
],
Optional
[
float
]]:
)
->
Tuple
[
Optional
[
float
],
Optional
[
float
]]:
"""
"""
Run decode benchmark and extract (ITL ms, throughput per GPU).
Run decode benchmark and extract (ITL ms, throughput per GPU).
...
@@ -279,6 +296,7 @@ def get_decode_itl_and_thpt_per_gpu(
...
@@ -279,6 +296,7 @@ def get_decode_itl_and_thpt_per_gpu(
model_name
,
model_name
,
tokenizer
,
tokenizer
,
base_url
=
base_url
,
base_url
=
base_url
,
warmup_request_count
=
AIPERF_WARMUP_REQUEST_PER_DP_RANK
*
attention_dp_size
,
)
)
if
aiperf_result
is
None
:
if
aiperf_result
is
None
:
return
None
,
None
return
None
,
None
...
@@ -300,6 +318,7 @@ def benchmark_decode(
...
@@ -300,6 +318,7 @@ def benchmark_decode(
model_name
,
model_name
,
tokenizer
,
tokenizer
,
base_url
=
"http://localhost:8000"
,
base_url
=
"http://localhost:8000"
,
warmup_request_count
:
int
=
AIPERF_WARMUP_REQUEST_PER_DP_RANK
,
):
):
logger
.
info
(
f
"Profiling decode with num_request
{
num_request
}
..."
)
logger
.
info
(
f
"Profiling decode with num_request
{
num_request
}
..."
)
...
@@ -316,6 +335,7 @@ def benchmark_decode(
...
@@ -316,6 +335,7 @@ def benchmark_decode(
model
=
model_name
,
model
=
model_name
,
tokenizer
=
tokenizer
,
tokenizer
=
tokenizer
,
base_url
=
base_url
,
base_url
=
base_url
,
warmup_request_count
=
warmup_request_count
,
)
)
aiperf_process
=
subprocess
.
Popen
(
aiperf_process
=
subprocess
.
Popen
(
aiperf_cmd
,
aiperf_cmd
,
...
...
benchmarks/profiler/utils/config_modifiers/parallelization_mapping.py
View file @
cf97c0dc
...
@@ -226,6 +226,7 @@ def apply_parallel_mapping_to_config(
...
@@ -226,6 +226,7 @@ def apply_parallel_mapping_to_config(
cfg
=
config_modifier
.
set_prefill_config
(
cfg
=
config_modifier
.
set_prefill_config
(
cfg
,
cfg
,
max_batch_size
=
mapping
.
get_attn_dp_size
(),
max_batch_size
=
mapping
.
get_attn_dp_size
(),
max_num_tokens
=
PREFILL_MAX_NUM_TOKENS
,
# max num tokens is shared by all attention dp ranks
max_num_tokens
=
PREFILL_MAX_NUM_TOKENS
*
mapping
.
get_attn_dp_size
(),
)
)
return
cfg
return
cfg
benchmarks/profiler/utils/config_modifiers/sglang.py
View file @
cf97c0dc
...
@@ -376,5 +376,7 @@ class SGLangConfigModifier:
...
@@ -376,5 +376,7 @@ class SGLangConfigModifier:
# Cap total tokens processed in a batch to avoid chunked prefill
# Cap total tokens processed in a batch to avoid chunked prefill
args
=
set_argument_value
(
args
,
"--chunked-prefill-size"
,
str
(
max_num_tokens
))
args
=
set_argument_value
(
args
,
"--chunked-prefill-size"
,
str
(
max_num_tokens
))
args
=
append_argument
(
args
,
"--enable-dp-lm-head"
)
worker_service
.
extraPodSpec
.
mainContainer
.
args
=
args
worker_service
.
extraPodSpec
.
mainContainer
.
args
=
args
return
cfg
.
model_dump
()
return
cfg
.
model_dump
()
benchmarks/profiler/utils/defaults.py
View file @
cf97c0dc
...
@@ -25,6 +25,11 @@ DECODE_MAX_CONCURRENCY = 2000
...
@@ -25,6 +25,11 @@ DECODE_MAX_CONCURRENCY = 2000
# cap the prefill maximum number of tokens at 32768: large enough to avoid chunked prefill, but not so large that the activation tensors become too large
# cap the prefill maximum number of tokens at 32768: large enough to avoid chunked prefill, but not so large that the activation tensors become too large
PREFILL_MAX_NUM_TOKENS
=
32768
PREFILL_MAX_NUM_TOKENS
=
32768
# AIPerf benchmarking related defaults
AIPERF_WARMUP_REQUEST_PER_DP_RANK
=
3
AIPERF_PREFILL_BENCHMARK_OSL
=
5
AIPERF_PREFILL_ATTN_DP_NUM_REQ_RATIO
=
4
class
EngineType
(
str
,
Enum
):
class
EngineType
(
str
,
Enum
):
PREFILL
=
"prefill"
PREFILL
=
"prefill"
...
...
benchmarks/profiler/utils/profile_decode.py
View file @
cf97c0dc
...
@@ -123,6 +123,7 @@ def profile_decode(
...
@@ -123,6 +123,7 @@ def profile_decode(
tokenizer
,
tokenizer
,
base_url
=
url
,
base_url
=
url
,
num_gpus
=
num_gpus
,
num_gpus
=
num_gpus
,
attention_dp_size
=
attention_dp_size
,
)
)
return
_profile_decode_helper
(
return
_profile_decode_helper
(
...
...
benchmarks/profiler/utils/profile_prefill.py
View file @
cf97c0dc
...
@@ -90,7 +90,6 @@ def profile_prefill(
...
@@ -90,7 +90,6 @@ def profile_prefill(
max_context_length
,
max_context_length
,
interpolation_granularity
,
interpolation_granularity
,
attention_dp_size
:
int
=
1
,
attention_dp_size
:
int
=
1
,
attn_dp_num_req_ratio
:
int
=
4
,
):
):
def
get_ttft
(
isl
):
def
get_ttft
(
isl
):
ai_perf_artifact_dir
=
f
"
{
work_dir
}
/aiperf_isl
{
isl
}
"
ai_perf_artifact_dir
=
f
"
{
work_dir
}
/aiperf_isl
{
isl
}
"
...
@@ -101,7 +100,6 @@ def profile_prefill(
...
@@ -101,7 +100,6 @@ def profile_prefill(
tokenizer
,
tokenizer
,
base_url
=
url
,
base_url
=
url
,
attention_dp_size
=
attention_dp_size
,
attention_dp_size
=
attention_dp_size
,
attn_dp_num_req_ratio
=
attn_dp_num_req_ratio
,
)
)
return
_profile_prefill_helper
(
return
_profile_prefill_helper
(
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment