change / sglang · Commits · 6c1a3f0c

Unverified commit 6c1a3f0c, authored Oct 30, 2025 by lpc0220, committed by GitHub on Oct 30, 2025.

enable cudaProfilerApi for one batch benchmarking (#11116)

Parent: 62377548

Showing 1 changed file with 149 additions and 34 deletions:

python/sglang/bench_one_batch.py (+149, -34)
@@ -11,6 +11,11 @@ python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruc
 python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --output-len 32 256 --run-name test_run
 ## run with profiling:
 python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --profile
+## run with profiling to custom directory:
+export SGLANG_TORCH_PROFILER_DIR=/root/sglang/profile_log
+python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 --input-len 256 --profile
+## run with CUDA profiler (nsys):
+nsys profile --force-overwrite=true -o bench_one_batch python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 --input-len 256 --profile --profiler_activities CUDA_PROFILER

 # Usage (correctness test):
 python -m sglang.bench_one_batch --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct
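[Note, not part of this diff] As written, the nsys example records the whole process; the cudaProfilerStart()/cudaProfilerStop() calls this patch adds then just mark the benchmarked range in the timeline. nsys can also restrict capture to that window via its capture-range mode, e.g.:

nsys profile --capture-range=cudaProfilerApi --capture-range-end=stop -o bench_one_batch python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 --input-len 256 --profile --profiler_activities CUDA_PROFILER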
@@ -93,6 +98,68 @@ profile_activities = [torch.profiler.ProfilerActivity.CPU] + [
 ]


+def start_profile(profiler_activities, profile_record_shapes=False, rank_print=print):
+    """
+    Abstracted function to start profiling based on profiler_activities.
+    Returns profiler object (or None).
+    """
+    if "CUDA_PROFILER" in profiler_activities:
+        try:
+            torch.cuda.cudart().cudaProfilerStart()
+            rank_print("CUDA Profiler started (nsys will begin capturing)")
+        except Exception as e:
+            rank_print(f"Failed to start CUDA profiler: {e}")
+        return None
+    else:
+        activities = []
+        if "CPU" in profiler_activities:
+            activities.append(torch.profiler.ProfilerActivity.CPU)
+        if "GPU" in profiler_activities:
+            activities.append(torch.profiler.ProfilerActivity.CUDA)
+        if activities:
+            profiler = torch.profiler.profile(
+                activities=activities,
+                with_stack=True,
+                record_shapes=profile_record_shapes,
+            )
+            profiler.start()
+            return profiler
+        return None
+
+
+def stop_profile(
+    profiler,
+    profiler_activities,
+    rank_print=print,
+    save_trace=False,
+    trace_filename=None,
+    stage=None,
+):
+    """
+    Abstracted function to stop profiling based on profiler_activities.
+    Optionally saves trace results and prints completion messages.
+    """
+    if "CUDA_PROFILER" in profiler_activities:
+        try:
+            torch.cuda.cudart().cudaProfilerStop()
+            rank_print("CUDA Profiler stopped (nsys should dump traces)")
+        except Exception as e:
+            rank_print(f"Failed to stop CUDA profiler: {e}")
+    elif profiler is not None:
+        profiler.stop()
+
+    if save_trace:
+        if profiler is not None:
+            if trace_filename:
+                _save_profile_trace_results(profiler, trace_filename)
+                stage_desc = f"for {stage}" if stage else ""
+                rank_print(
+                    f"torch profiler chrome trace {stage_desc} saved to {trace_filename}"
+                )
+        if "CUDA_PROFILER" in profiler_activities:
+            rank_print(f"CUDA profiler trace for {stage} completed")
+
+
 @dataclasses.dataclass
 class BenchArgs:
     run_name: str = "default"
@@ -107,6 +174,8 @@ class BenchArgs:
     log_decode_step: int = 0
     profile: bool = False
     profile_record_shapes: bool = False
+    profiler_activities: Tuple[str] = ("CPU", "GPU")
+    profile_stage: str = "all"
     profile_filename_prefix: str = "profile"

     @staticmethod
@@ -135,14 +204,27 @@ class BenchArgs:
             default=BenchArgs.log_decode_step,
             help="Log decode latency by step, default is set to zero to disable.",
         )
-        parser.add_argument(
-            "--profile", action="store_true", help="Enable profiling."
-        )
+        parser.add_argument("--profile", action="store_true", help="Use Torch Profiler.")
         parser.add_argument(
             "--profile-record-shapes",
             action="store_true",
             help="Record tensor shapes in profiling results.",
         )
+        parser.add_argument(
+            "--profiler_activities",
+            type=str,
+            nargs="+",
+            default=["CPU", "GPU"],
+            choices=["CPU", "GPU", "CUDA_PROFILER"],
+            help="Profiler activities: CPU, GPU, CUDA_PROFILER. If CPU/GPU, use torch profiler. If CUDA_PROFILER, use CUDA profiler.",
+        )
+        parser.add_argument(
+            "--profile-stage",
+            type=str,
+            default=BenchArgs.profile_stage,
+            choices=["all", "prefill", "decode"],
+            help="Which stage to profile: all, prefill, or decode only.",
+        )
         parser.add_argument(
             "--profile-filename-prefix",
             type=str,
@@ -337,6 +419,18 @@ def _read_prompts_from_file(prompt_file, rank_print):
         return pf.readlines()


+def _get_torch_profiler_output_dir():
+    return os.environ.get("SGLANG_TORCH_PROFILER_DIR", "/tmp")
+
+
+def _create_torch_profiler_filename(
+    profile_filename_prefix, batch_size, input_len, output_len, stage
+):
+    output_dir = _get_torch_profiler_output_dir()
+    filename = f"{profile_filename_prefix}_batch{batch_size}_input{input_len}_output{output_len}_{stage}.trace.json.gz"
+    return os.path.join(output_dir, filename)
+
+
 def _save_profile_trace_results(profiler, filename):
     parent_dir = os.path.dirname(os.path.abspath(filename))
     os.makedirs(parent_dir, exist_ok=True)
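[Note] For reference, a sketch of what the helpers above produce with the default prefix and no SGLANG_TORCH_PROFILER_DIR set; the batch/input/output values are illustrative:

import os

output_dir = os.environ.get("SGLANG_TORCH_PROFILER_DIR", "/tmp")
filename = f"profile_batch{1}_input{256}_output{16}_prefill.trace.json.gz"
print(os.path.join(output_dir, filename))
# /tmp/profile_batch1_input256_output16_prefill.trace.json.gz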
@@ -413,7 +507,10 @@ def latency_test_run_once(
     log_decode_step,
     profile,
     profile_record_shapes,
+    profiler_activities,
     profile_filename_prefix,
+    profile_stage,
+    tp_rank,
 ):
     max_batch_size = model_runner.max_total_num_tokens // (input_len + output_len)
     if batch_size > max_batch_size:
@@ -422,7 +519,6 @@ def latency_test_run_once(
...
@@ -422,7 +519,6 @@ def latency_test_run_once(
)
)
return
return
# Clear the pools.
model_runner
.
req_to_token_pool
.
clear
()
model_runner
.
req_to_token_pool
.
clear
()
model_runner
.
token_to_kv_pool_allocator
.
clear
()
model_runner
.
token_to_kv_pool_allocator
.
clear
()
@@ -436,20 +532,33 @@ def latency_test_run_once(
     tot_latency = 0

     profiler = None
-    if profile:
-        profiler = torch.profiler.profile(
-            activities=profile_activities,
-            with_stack=True,
-            record_shapes=profile_record_shapes,
-        )
-        profiler.start()
+    enable_profile_prefill = profile and profile_stage in ["all", "prefill"]
+    if enable_profile_prefill:
+        profiler = start_profile(
+            profiler_activities,
+            profile_record_shapes=profile_record_shapes,
+            rank_print=rank_print,
+        )

     # Prefill
     synchronize(device)
     tic = time.perf_counter()
     next_token_ids, _, batch = extend(reqs, model_runner)
     synchronize(device)
     prefill_latency = time.perf_counter() - tic
+
+    if enable_profile_prefill:
+        trace_filename = _create_torch_profiler_filename(
+            profile_filename_prefix, batch_size, input_len, output_len, "prefill"
+        )
+        stop_profile(
+            profiler,
+            profiler_activities,
+            rank_print=rank_print,
+            save_trace=True,
+            trace_filename=trace_filename,
+            stage="prefill",
+        )
+
     tot_latency += prefill_latency
     throughput = input_len * batch_size / prefill_latency
     rank_print(
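[Note] The throughput reported here is prefill tokens processed per second across the whole batch. A worked example of the formula with illustrative numbers:

input_len, batch_size, prefill_latency = 256, 8, 0.050  # 50 ms prefill; numbers illustrative
throughput = input_len * batch_size / prefill_latency
print(f"{throughput:9.2f} token/s")  # " 40960.00 token/s"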
@@ -458,29 +567,37 @@ def latency_test_run_once(
     measurement_results["prefill_latency"] = prefill_latency
     measurement_results["prefill_throughput"] = throughput

-    if profile:
-        profiler.stop()
-        trace_filename = f"{profile_filename_prefix}_batch{batch_size}_input{input_len}_output{output_len}_prefill.trace.json.gz"
-        _save_profile_trace_results(profiler, trace_filename)
-        rank_print(f"torch profiler chrome trace for prefill saved to {trace_filename}")
-
     # Decode
     decode_latencies = []
+    profile_step_of_interest = output_len // 2
+    enable_profile_decode = profile and profile_stage in ["all", "decode"]
+
     for i in range(output_len - 1):
         synchronize(device)
-        if profile and i == output_len / 2:
-            profiler = None
-            profiler = torch.profiler.profile(
-                activities=profile_activities,
-                with_stack=True,
-                record_shapes=profile_record_shapes,
-            )
-            profiler.start()
+        profiler = None
+        if enable_profile_decode and i == profile_step_of_interest:
+            profiler = start_profile(
+                profiler_activities,
+                profile_record_shapes=profile_record_shapes,
+                rank_print=rank_print,
+            )

         tic = time.perf_counter()
         next_token_ids, _ = decode(next_token_ids, batch, model_runner)
         synchronize(device)
         latency = time.perf_counter() - tic
+
+        if enable_profile_decode and i == profile_step_of_interest:
+            trace_filename = _create_torch_profiler_filename(
+                profile_filename_prefix, batch_size, input_len, output_len, "decode"
+            )
+            stop_profile(
+                profiler,
+                profiler_activities,
+                rank_print=rank_print,
+                save_trace=True,
+                trace_filename=trace_filename,
+                stage="decode",
+            )
+
         tot_latency += latency
         throughput = batch_size / latency
         decode_latencies.append(latency)
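[Note] The switch from `i == output_len / 2` to a precomputed `profile_step_of_interest = output_len // 2` is more than a style fix: with an odd output_len, the old float comparison never matched any integer step, so no decode step was ever profiled. A quick demonstration:

output_len = 33
print(any(i == output_len / 2 for i in range(output_len - 1)))   # False: no i equals 16.5
print(any(i == output_len // 2 for i in range(output_len - 1)))  # True: fires at i == 16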
@@ -489,14 +606,6 @@ def latency_test_run_once(
                 f"Decode {i}. Batch size: {batch_size}, latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
             )
-        if profile and i == output_len / 2:
-            profiler.stop()
-            trace_filename = f"{profile_filename_prefix}_batch{batch_size}_input{input_len}_output{output_len}_decode.trace.json.gz"
-            _save_profile_trace_results(profiler, trace_filename)
-            rank_print(
-                f"torch profiler chrome trace for decoding 1 token saved to {trace_filename}"
-            )

     # Record decode timing from 2nd output
     if output_len > 1:
         med_decode_latency = np.median(decode_latencies)
@@ -557,7 +666,10 @@ def latency_test(
         log_decode_step=0,
         profile=False,
         profile_record_shapes=False,
-        profile_filename_prefix="",  # not used
+        profiler_activities=("CPU", "GPU"),
+        profile_filename_prefix="",
+        profile_stage="all",
+        tp_rank=tp_rank,
     )
     rank_print("Benchmark ...")
@@ -604,7 +716,10 @@ def latency_test(
             bench_args.log_decode_step,
             bench_args.profile if tp_rank == 0 else None,
             bench_args.profile_record_shapes if tp_rank == 0 else None,
+            bench_args.profiler_activities,
             bench_args.profile_filename_prefix,
+            bench_args.profile_stage,
+            tp_rank,
         )
         if ret is not None:
             result_list.append(ret)
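[Note] Observe the rank gating above: `profile` and `profile_record_shapes` are passed as None on every rank except tp_rank == 0, so under tensor parallelism only rank 0 drives profiling, while `profiler_activities`, `profile_stage`, and the newly forwarded `tp_rank` are passed unconditionally, presumably so latency_test_run_once can make rank-aware decisions.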