Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tsoc
superbenchmark
Commits
04564997
Commit
04564997
authored
Apr 02, 2026
by
one
Browse files
Re-implement kernel launch
parent
05cdf5d6
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
120 additions
and
55 deletions
+120
-55
superbench/benchmarks/micro_benchmarks/kernel_launch_overhead.py
...nch/benchmarks/micro_benchmarks/kernel_launch_overhead.py
+13
-16
superbench/benchmarks/micro_benchmarks/kernel_launch_overhead/kernel_launch.cu
.../micro_benchmarks/kernel_launch_overhead/kernel_launch.cu
+106
-38
tests/benchmarks/micro_benchmarks/test_kernel_launch_overhead.py
...enchmarks/micro_benchmarks/test_kernel_launch_overhead.py
+1
-1
No files found.
superbench/benchmarks/micro_benchmarks/kernel_launch_overhead.py
View file @
04564997
...
...
@@ -13,6 +13,8 @@
class
KernelLaunch
(
MicroBenchmarkWithInvoke
):
"""The KernelLaunch overhead benchmark class."""
_metric_names
=
[
'e2e_latency_us'
,
'host_dispatch_us'
,
'launch_throughput_mkps'
,
'device_launch_us'
]
def
__init__
(
self
,
name
,
parameters
=
''
):
"""Constructor.
...
...
@@ -81,27 +83,22 @@ def _process_raw_result(self, cmd_idx, raw_output):
"""
self
.
_result
.
add_raw_data
(
'raw_output_'
+
str
(
cmd_idx
),
raw_output
,
self
.
_args
.
log_raw_data
)
pattern
=
r
'\d+\.\d+'
result
=
re
.
findall
(
pattern
,
raw_output
)
if
len
(
result
)
!=
2
:
logger
.
error
(
'Cannot extract kernel launch overhead in event and wall mode - round: {}, benchmark: {}, raw data: {}.'
.
format
(
self
.
_curr_run_index
,
self
.
_name
,
raw_output
)
)
return
False
result
=
{}
pattern
=
re
.
compile
(
r
'^(e2e_latency_us|host_dispatch_us|launch_throughput_mkps|device_launch_us):\s*(-?\d+(?:\.\d+)?)$'
)
for
line
in
raw_output
.
splitlines
():
match
=
pattern
.
match
(
line
.
strip
())
if
match
:
result
[
match
.
group
(
1
)]
=
float
(
match
.
group
(
2
))
try
:
result
=
[
float
(
item
)
for
item
in
result
]
except
BaseException
as
e
:
if
set
(
result
.
keys
())
!=
set
(
self
.
_metric_names
):
logger
.
error
(
'The result format is invalid - round: {}, benchmark: {}, result: {}, message: {}.'
.
format
(
self
.
_curr_run_index
,
self
.
_name
,
result
,
str
(
e
)
)
'Cannot extract kernel launch benchmark metrics - round: {}, benchmark: {}, raw data: {}.'
.
format
(
self
.
_curr_run_index
,
self
.
_name
,
raw_output
)
)
return
False
self
.
_result
.
add_result
(
'event_time'
,
result
[
0
])
self
.
_result
.
add_result
(
'wall_time'
,
result
[
1
])
for
metric
in
self
.
_metric_names
:
self
.
_result
.
add_result
(
metric
,
result
[
metric
])
return
True
...
...
superbench/benchmarks/micro_benchmarks/kernel_launch_overhead/kernel_launch.cu
View file @
04564997
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
// Kernel launch benchmark with four metrics:
//   e2e_latency_us: single-shot end-to-end latency per kernel.
//   host_dispatch_us: host-side dispatch cost per kernel.
//   launch_throughput_mkps: steady-state launch throughput in million kernels/s.
//   device_launch_us: device-side average time per kernel measured by events.
#include <algorithm>
#include <chrono>
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <sys/time.h>
#include <thread>
#include "cuda_runtime.h"
// Empty kernel body: launched repeatedly so only launch overhead is measured.
__global__ void EmptyKernel() {}
namespace {

// Device ordinal used by all benchmark routines below.
constexpr int kDeviceId = 0;
// Number of kernels issued per timed batch in the host-dispatch test.
constexpr int kDispatchBatchSize = 32;

// Abort the process with a diagnostic message if a CUDA runtime call failed.
void CheckCuda(cudaError_t result) {
    if (result != cudaSuccess) {
        fprintf(stderr, "CUDA Error: %s\n", cudaGetErrorString(result));
        exit(1);
    }
}
// Measure single-shot end-to-end latency per kernel in microseconds:
// each timed iteration launches one empty kernel and synchronizes, so the
// figure covers host dispatch plus device execution plus synchronization.
double test_e2e_latency_us(int num_warmups, int num_steps) {
    CheckCuda(cudaSetDevice(kDeviceId));
    // Warm up so driver/module initialization is not included in the timing.
    for (int i = 0; i < num_warmups; i++) {
        EmptyKernel<<<1, 1>>>();
        CheckCuda(cudaDeviceSynchronize());
    }
    auto begin = std::chrono::steady_clock::now();
    for (int i = 0; i < num_steps; i++) {
        EmptyKernel<<<1, 1>>>();
        CheckCuda(cudaDeviceSynchronize());
    }
    auto end = std::chrono::steady_clock::now();
    std::chrono::duration<double, std::micro> elapsed = end - begin;
    return elapsed.count() / num_steps;
}
// Measure host-side dispatch cost per kernel in microseconds: launches are
// issued in small batches while the device drains them asynchronously, and
// only the host time spent issuing the launches is accumulated.
double test_host_dispatch_us(int num_warmups, int num_steps) {
    CheckCuda(cudaSetDevice(kDeviceId));
    for (int i = 0; i < num_warmups; i++) {
        EmptyKernel<<<1, 1>>>();
    }
    CheckCuda(cudaDeviceSynchronize());
    int remaining = num_steps;
    std::chrono::duration<double, std::micro> total_elapsed(0);
    while (remaining > 0) {
        const int current_batch = std::min(kDispatchBatchSize, remaining);
        // Drain the device before each batch so queue backpressure from the
        // previous batch does not inflate the measured dispatch time.
        CheckCuda(cudaDeviceSynchronize());
        auto begin = std::chrono::steady_clock::now();
        for (int i = 0; i < current_batch; i++) {
            EmptyKernel<<<1, 1>>>();
        }
        auto end = std::chrono::steady_clock::now();
        total_elapsed += end - begin;
        remaining -= current_batch;
    }
    CheckCuda(cudaDeviceSynchronize());
    return total_elapsed.count() / num_steps;
}
// Measure steady-state launch throughput in million kernels per second:
// issue num_steps launches back to back, synchronize once, and divide the
// launch count by the total wall-clock time.
double test_launch_throughput_mkps(int num_warmups, int num_steps) {
    CheckCuda(cudaSetDevice(kDeviceId));
    for (int i = 0; i < num_warmups; i++) {
        EmptyKernel<<<1, 1>>>();
    }
    CheckCuda(cudaDeviceSynchronize());
    auto begin = std::chrono::steady_clock::now();
    for (int i = 0; i < num_steps; i++) {
        EmptyKernel<<<1, 1>>>();
    }
    // Include the final drain so every launched kernel has completed before
    // the end timestamp is taken.
    CheckCuda(cudaDeviceSynchronize());
    auto end = std::chrono::steady_clock::now();
    std::chrono::duration<double> elapsed = end - begin;
    return static_cast<double>(num_steps) / elapsed.count() / 1e6;
}
// Measure the device-side average time per kernel in microseconds using CUDA
// events: a single (start, stop) event pair brackets a burst of num_steps
// launches, so the per-kernel figure reflects back-to-back launch cost on the
// device rather than per-launch synchronization overhead.
double test_device_launch_us(int num_warmups, int num_steps) {
    CheckCuda(cudaSetDevice(kDeviceId));
    cudaEvent_t start, stop;
    CheckCuda(cudaEventCreate(&start));
    CheckCuda(cudaEventCreate(&stop));
    // Warm up so driver/module initialization is not included in the timing.
    for (int i = 0; i < num_warmups; i++) {
        EmptyKernel<<<1, 1>>>();
    }
    CheckCuda(cudaDeviceSynchronize());
    CheckCuda(cudaEventRecord(start, 0));
    for (int i = 0; i < num_steps; i++) {
        EmptyKernel<<<1, 1>>>();
    }
    CheckCuda(cudaEventRecord(stop, 0));
    CheckCuda(cudaEventSynchronize(stop));
    float total_time_ms = 0.0f;
    CheckCuda(cudaEventElapsedTime(&total_time_ms, start, stop));
    CheckCuda(cudaEventDestroy(start));
    CheckCuda(cudaEventDestroy(stop));
    // cudaEventElapsedTime reports milliseconds; convert to microseconds.
    return total_time_ms * 1000.0 / num_steps;
}
char
*
getCmdOption
(
char
**
begin
,
char
**
end
,
const
std
::
string
&
option
)
{
...
...
@@ -75,6 +134,8 @@ char *getCmdOption(char **begin, char **end, const std::string &option) {
return
0
;
}
}
// namespace
int
main
(
int
argc
,
char
*
argv
[])
{
int
num_warmups
=
100
;
int
num_steps
=
2000000
;
...
...
@@ -92,16 +153,23 @@ int main(int argc, char *argv[]) {
interval
=
std
::
stoi
(
value
);
}
// Test the kernel launch event time.
double
event_total_time
=
test_cuda_kernel_launch_event_time
(
num_warmups
,
num_steps
);
printf
(
"Kernel launch overhead - event time: %3.5f ms
\n
"
,
event_total_time
/
num_steps
);
const
double
e2e_latency_us
=
test_e2e_latency_us
(
num_warmups
,
num_steps
);
printf
(
"e2e_latency_us: %.6f
\n
"
,
e2e_latency_us
);
std
::
this_thread
::
sleep_for
(
std
::
chrono
::
milliseconds
(
interval
));
const
double
host_dispatch_us
=
test_host_dispatch_us
(
num_warmups
,
num_steps
);
printf
(
"host_dispatch_us: %.6f
\n
"
,
host_dispatch_us
);
std
::
this_thread
::
sleep_for
(
std
::
chrono
::
milliseconds
(
interval
));
const
double
launch_throughput_mkps
=
test_launch_throughput_mkps
(
num_warmups
,
num_steps
);
printf
(
"launch_throughput_mkps: %.6f
\n
"
,
launch_throughput_mkps
);
// Sleep for interval milliseconds and run the next test.
std
::
this_thread
::
sleep_for
(
std
::
chrono
::
milliseconds
(
interval
));
// Test the kernel launch wall time.
double
wall_total_time
=
test_cuda_kernel_launch_wall_time
(
num_warmups
,
num_steps
);
printf
(
"Kernel launch overhead - wall time: %3.5f ms
\n
"
,
wall_total_time
/
num_steps
);
const
double
device_launch_us
=
test_device_launch_us
(
num_warmups
,
num_steps
);
printf
(
"device_launch_us: %.6f
\n
"
,
device_launch_us
);
return
0
;
}
tests/benchmarks/micro_benchmarks/test_kernel_launch_overhead.py
View file @
04564997
...
...
@@ -36,7 +36,7 @@ def test_kernel_launch_overhead():
assert
(
'raw_output_0'
in
benchmark
.
raw_data
)
assert
(
len
(
benchmark
.
raw_data
[
'raw_output_0'
])
==
1
)
assert
(
isinstance
(
benchmark
.
raw_data
[
'raw_output_0'
][
0
],
str
))
for
metric
in
[
'e
vent_time'
,
'wall_time
'
]:
for
metric
in
[
'e
2e_latency_us'
,
'host_dispatch_us'
,
'launch_throughput_mkps'
,
'device_launch_us
'
]:
assert
(
metric
in
benchmark
.
result
)
assert
(
len
(
benchmark
.
result
[
metric
])
==
1
)
assert
(
isinstance
(
benchmark
.
result
[
metric
][
0
],
numbers
.
Number
))
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment