Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
59053354
Unverified
Commit
59053354
authored
Aug 20, 2025
by
Jason Zhou
Committed by
GitHub
Aug 20, 2025
Browse files
feat: add --tokenizer_path to profile_endpoint.py (#2550)
parent
344c21dc
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
53 additions
and
5 deletions
+53
-5
benchmarks/profiler/profile_endpoint.py
benchmarks/profiler/profile_endpoint.py
+11
-0
benchmarks/profiler/profile_sla.py
benchmarks/profiler/profile_sla.py
+2
-0
benchmarks/profiler/utils/genai_perf.py
benchmarks/profiler/utils/genai_perf.py
+19
-3
benchmarks/profiler/utils/profile_decode.py
benchmarks/profiler/utils/profile_decode.py
+9
-0
benchmarks/profiler/utils/profile_prefill.py
benchmarks/profiler/utils/profile_prefill.py
+12
-2
No files found.
benchmarks/profiler/profile_endpoint.py
View file @
59053354
...
@@ -35,6 +35,13 @@ if __name__ == "__main__":
...
@@ -35,6 +35,13 @@ if __name__ == "__main__":
required
=
True
,
required
=
True
,
help
=
"model name"
,
help
=
"model name"
,
)
)
parser
.
add_argument
(
"--tokenizer_path"
,
type
=
str
,
required
=
False
,
default
=
""
,
help
=
"tokenizer path"
,
)
parser
.
add_argument
(
parser
.
add_argument
(
"--url"
,
"--url"
,
type
=
str
,
type
=
str
,
...
@@ -75,10 +82,13 @@ if __name__ == "__main__":
...
@@ -75,10 +82,13 @@ if __name__ == "__main__":
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
os
.
makedirs
(
args
.
work_dir
,
exist_ok
=
True
)
os
.
makedirs
(
args
.
work_dir
,
exist_ok
=
True
)
if
args
.
tokenizer_path
==
""
:
args
.
tokenizer_path
=
args
.
model_name
if
args
.
mode
==
"prefill"
:
if
args
.
mode
==
"prefill"
:
profile_prefill
(
profile_prefill
(
args
.
work_dir
,
args
.
work_dir
,
args
.
model_name
,
args
.
model_name
,
args
.
tokenizer_path
,
args
.
url
,
args
.
url
,
args
.
num_gpus
,
args
.
num_gpus
,
args
.
max_context_length
,
args
.
max_context_length
,
...
@@ -89,6 +99,7 @@ if __name__ == "__main__":
...
@@ -89,6 +99,7 @@ if __name__ == "__main__":
profile_decode
(
profile_decode
(
args
.
work_dir
,
args
.
work_dir
,
args
.
model_name
,
args
.
model_name
,
args
.
tokenizer_path
,
args
.
url
,
args
.
url
,
args
.
num_gpus
,
args
.
num_gpus
,
args
.
max_kv_tokens
,
args
.
max_kv_tokens
,
...
...
benchmarks/profiler/profile_sla.py
View file @
59053354
...
@@ -421,6 +421,7 @@ async def run_profile(args):
...
@@ -421,6 +421,7 @@ async def run_profile(args):
profile_prefill
(
profile_prefill
(
work_dir
,
work_dir
,
model_name
,
model_name
,
model_name
,
base_url
,
base_url
,
best_prefill_tp
,
best_prefill_tp
,
args
.
max_context_length
,
args
.
max_context_length
,
...
@@ -476,6 +477,7 @@ async def run_profile(args):
...
@@ -476,6 +477,7 @@ async def run_profile(args):
profile_decode
(
profile_decode
(
work_dir
,
work_dir
,
model_name
,
model_name
,
model_name
,
base_url
,
base_url
,
best_decode_tp
,
best_decode_tp
,
max_kv_tokens
,
max_kv_tokens
,
...
...
benchmarks/profiler/utils/genai_perf.py
View file @
59053354
...
@@ -34,6 +34,7 @@ def _get_common_genai_perf_cmd(
...
@@ -34,6 +34,7 @@ def _get_common_genai_perf_cmd(
artifact_dir
,
artifact_dir
,
seed
=
100
,
seed
=
100
,
model
=
"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
,
model
=
"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
,
tokenizer
=
"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
,
base_url
=
"http://localhost:8000"
,
base_url
=
"http://localhost:8000"
,
):
):
return
[
return
[
...
@@ -42,7 +43,7 @@ def _get_common_genai_perf_cmd(
...
@@ -42,7 +43,7 @@ def _get_common_genai_perf_cmd(
"--model"
,
"--model"
,
model
,
model
,
"--tokenizer"
,
"--tokenizer"
,
model
,
tokenizer
,
"--endpoint-type"
,
"--endpoint-type"
,
"chat"
,
"chat"
,
"--endpoint"
,
"--endpoint"
,
...
@@ -68,6 +69,7 @@ def get_prefill_genai_perf_cmd(
...
@@ -68,6 +69,7 @@ def get_prefill_genai_perf_cmd(
artifact_dir
,
artifact_dir
,
seed
=
100
,
seed
=
100
,
model
=
"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
,
model
=
"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
,
tokenizer
=
"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
,
osl
=
5
,
osl
=
5
,
base_url
=
"http://localhost:8000"
,
base_url
=
"http://localhost:8000"
,
):
):
...
@@ -75,6 +77,7 @@ def get_prefill_genai_perf_cmd(
...
@@ -75,6 +77,7 @@ def get_prefill_genai_perf_cmd(
artifact_dir
,
artifact_dir
,
seed
,
seed
,
model
,
model
,
tokenizer
,
base_url
,
base_url
,
)
+
[
)
+
[
"--synthetic-input-tokens-mean"
,
"--synthetic-input-tokens-mean"
,
...
@@ -103,12 +106,14 @@ def get_decode_genai_perf_cmd(
...
@@ -103,12 +106,14 @@ def get_decode_genai_perf_cmd(
num_request
,
num_request
,
seed
=
100
,
seed
=
100
,
model
=
"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
,
model
=
"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
,
tokenizer
=
"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
,
base_url
=
"http://localhost:8000"
,
base_url
=
"http://localhost:8000"
,
):
):
return
_get_common_genai_perf_cmd
(
return
_get_common_genai_perf_cmd
(
artifact_dir
,
artifact_dir
,
seed
,
seed
,
model
,
model
,
tokenizer
,
base_url
,
base_url
,
)
+
[
)
+
[
"--synthetic-input-tokens-mean"
,
"--synthetic-input-tokens-mean"
,
...
@@ -147,11 +152,19 @@ def get_gap_result(artifact_dir: str) -> dict:
...
@@ -147,11 +152,19 @@ def get_gap_result(artifact_dir: str) -> dict:
def
benchmark_prefill
(
def
benchmark_prefill
(
isl
,
genai_perf_artifact_dir
,
model_name
,
base_url
=
"http://localhost:8000"
isl
,
genai_perf_artifact_dir
,
model_name
,
tokenizer
,
base_url
=
"http://localhost:8000"
,
):
):
logger
.
info
(
f
"Running genai-perf with isl
{
isl
}
"
)
logger
.
info
(
f
"Running genai-perf with isl
{
isl
}
"
)
genai_perf_cmd
=
get_prefill_genai_perf_cmd
(
genai_perf_cmd
=
get_prefill_genai_perf_cmd
(
isl
,
genai_perf_artifact_dir
,
model
=
model_name
,
base_url
=
base_url
isl
,
genai_perf_artifact_dir
,
model
=
model_name
,
tokenizer
=
tokenizer
,
base_url
=
base_url
,
)
)
print
(
f
"genai-perf cmd:
{
genai_perf_cmd
}
"
)
print
(
f
"genai-perf cmd:
{
genai_perf_cmd
}
"
)
# import pdb; pdb.set_trace()
# import pdb; pdb.set_trace()
...
@@ -179,6 +192,7 @@ def benchmark_decode(
...
@@ -179,6 +192,7 @@ def benchmark_decode(
num_request
,
num_request
,
genai_perf_artifact_dir
,
genai_perf_artifact_dir
,
model_name
,
model_name
,
tokenizer
,
base_url
=
"http://localhost:8000"
,
base_url
=
"http://localhost:8000"
,
):
):
logger
.
info
(
f
"Profiling decode with num_request
{
num_request
}
..."
)
logger
.
info
(
f
"Profiling decode with num_request
{
num_request
}
..."
)
...
@@ -194,6 +208,7 @@ def benchmark_decode(
...
@@ -194,6 +208,7 @@ def benchmark_decode(
num_request
,
num_request
,
seed
=
seed
,
seed
=
seed
,
model
=
model_name
,
model
=
model_name
,
tokenizer
=
tokenizer
,
base_url
=
base_url
,
base_url
=
base_url
,
)
)
gap_process
=
subprocess
.
Popen
(
gap_process
=
subprocess
.
Popen
(
...
@@ -211,6 +226,7 @@ def benchmark_decode(
...
@@ -211,6 +226,7 @@ def benchmark_decode(
num_request
,
num_request
,
seed
=
seed
,
seed
=
seed
,
model
=
model_name
,
model
=
model_name
,
tokenizer
=
tokenizer
,
base_url
=
base_url
,
base_url
=
base_url
,
)
)
gap_process
=
subprocess
.
Popen
(
gap_process
=
subprocess
.
Popen
(
...
...
benchmarks/profiler/utils/profile_decode.py
View file @
59053354
...
@@ -21,6 +21,7 @@ logger.addHandler(console_handler)
...
@@ -21,6 +21,7 @@ logger.addHandler(console_handler)
def
profile_decode
(
def
profile_decode
(
work_dir
,
work_dir
,
model_name
,
model_name
,
tokenizer
,
url
,
url
,
num_gpus
,
num_gpus
,
max_kv_tokens
,
max_kv_tokens
,
...
@@ -41,6 +42,13 @@ def profile_decode(
...
@@ -41,6 +42,13 @@ def profile_decode(
(
max_context_length
-
osl
)
//
interpolation_granularity
,
(
max_context_length
-
osl
)
//
interpolation_granularity
,
):
):
max_concurrency
=
max_kv_tokens
//
(
isl
+
osl
)
max_concurrency
=
max_kv_tokens
//
(
isl
+
osl
)
if
max_concurrency
//
interpolation_granularity
==
0
:
logger
.
warning
(
f
"max_concurrency
{
max_concurrency
}
is too small for"
f
" interpolation granularity
{
interpolation_granularity
}
."
f
" max_kv_tokens
{
max_kv_tokens
}
, isl
{
isl
}
, osl
{
osl
}
"
)
break
sweep_num_request
=
range
(
sweep_num_request
=
range
(
1
,
1
,
max_concurrency
,
max_concurrency
,
...
@@ -54,6 +62,7 @@ def profile_decode(
...
@@ -54,6 +62,7 @@ def profile_decode(
num_request
,
num_request
,
genai_perf_artifact_dir
,
genai_perf_artifact_dir
,
model_name
,
model_name
,
tokenizer
,
base_url
=
url
,
base_url
=
url
,
)
)
if
gap_result
is
not
None
:
if
gap_result
is
not
None
:
...
...
benchmarks/profiler/utils/profile_prefill.py
View file @
59053354
...
@@ -19,7 +19,13 @@ logger.addHandler(console_handler)
...
@@ -19,7 +19,13 @@ logger.addHandler(console_handler)
def
profile_prefill
(
def
profile_prefill
(
work_dir
,
model_name
,
url
,
num_gpus
,
max_context_length
,
interpolation_granularity
work_dir
,
model_name
,
tokenizer
,
url
,
num_gpus
,
max_context_length
,
interpolation_granularity
,
):
):
prefill_isl
=
[]
prefill_isl
=
[]
prefill_ttft
=
[]
prefill_ttft
=
[]
...
@@ -32,7 +38,11 @@ def profile_prefill(
...
@@ -32,7 +38,11 @@ def profile_prefill(
# run genai-perf
# run genai-perf
genai_perf_artifact_dir
=
f
"
{
work_dir
}
/gap_isl
{
isl
}
"
genai_perf_artifact_dir
=
f
"
{
work_dir
}
/gap_isl
{
isl
}
"
gap_result
=
benchmark_prefill
(
gap_result
=
benchmark_prefill
(
isl
,
genai_perf_artifact_dir
,
model_name
,
base_url
=
url
isl
,
genai_perf_artifact_dir
,
model_name
,
tokenizer
,
base_url
=
url
,
)
)
if
gap_result
is
not
None
:
if
gap_result
is
not
None
:
ttft
=
gap_result
[
"time_to_first_token"
][
"avg"
]
ttft
=
gap_result
[
"time_to_first_token"
][
"avg"
]
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment