Commit 04564997 authored by one's avatar one
Browse files

Re-implement kernel launch

parent 05cdf5d6
......@@ -13,6 +13,8 @@
class KernelLaunch(MicroBenchmarkWithInvoke):
"""The KernelLaunch overhead benchmark class."""
_metric_names = ['e2e_latency_us', 'host_dispatch_us', 'launch_throughput_mkps', 'device_launch_us']
def __init__(self, name, parameters=''):
"""Constructor.
......@@ -81,27 +83,22 @@ def _process_raw_result(self, cmd_idx, raw_output):
"""
self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output, self._args.log_raw_data)
pattern = r'\d+\.\d+'
result = re.findall(pattern, raw_output)
if len(result) != 2:
logger.error(
'Cannot extract kernel launch overhead in event and wall mode - round: {}, benchmark: {}, raw data: {}.'
.format(self._curr_run_index, self._name, raw_output)
)
return False
result = {}
pattern = re.compile(r'^(e2e_latency_us|host_dispatch_us|launch_throughput_mkps|device_launch_us):\s*(-?\d+(?:\.\d+)?)$')
for line in raw_output.splitlines():
match = pattern.match(line.strip())
if match:
result[match.group(1)] = float(match.group(2))
try:
result = [float(item) for item in result]
except BaseException as e:
if set(result.keys()) != set(self._metric_names):
logger.error(
'The result format is invalid - round: {}, benchmark: {}, result: {}, message: {}.'.format(
self._curr_run_index, self._name, result, str(e)
)
'Cannot extract kernel launch benchmark metrics - round: {}, benchmark: {}, raw data: {}.'
.format(self._curr_run_index, self._name, raw_output)
)
return False
self._result.add_result('event_time', result[0])
self._result.add_result('wall_time', result[1])
for metric in self._metric_names:
self._result.add_result(metric, result[metric])
return True
......
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
// Kernel launch benchmark which will launch one empty kernel and record the cost in event mode and wall mode.
// event mode: using cuda/hip event to record the elapsed time of kernel launch on device.
// wall mode: using host timer to record the elapsed time kernel launch on both host and device.
// Kernel launch benchmark with four metrics:
// e2e_latency_us: single-shot end-to-end latency per kernel.
// host_dispatch_us: host-side dispatch cost per kernel.
// launch_throughput_mkps: steady-state launch throughput in million kernels/s.
// device_launch_us: device-side average time per kernel measured by events.
#include <algorithm>
#include <chrono>
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <sys/time.h>
#include <thread>
#include "cuda_runtime.h"
// No-op kernel with an empty body: launched repeatedly by the benchmark
// functions so that only the launch/dispatch overhead itself is measured.
__global__ void EmptyKernel() {}
double test_cuda_kernel_launch_event_time(int num_warmups, int num_steps) {
float time = 0.f;
double total_time = 0.0;
namespace {
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
constexpr int kDeviceId = 0;
constexpr int kDispatchBatchSize = 32;
// Terminate the process with a diagnostic on stderr when a CUDA runtime
// call did not return cudaSuccess; no-op otherwise.
void CheckCuda(cudaError_t result) {
  if (result == cudaSuccess) {
    return;
  }
  fprintf(stderr, "CUDA Error: %s\n", cudaGetErrorString(result));
  exit(1);
}
double test_e2e_latency_us(int num_warmups, int num_steps) {
CheckCuda(cudaSetDevice(kDeviceId));
for (int i = 0; i < num_warmups; i++) {
cudaEventRecord(start, 0);
EmptyKernel<<<1, 1>>>();
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
CheckCuda(cudaDeviceSynchronize());
}
auto begin = std::chrono::steady_clock::now();
for (int i = 0; i < num_steps; i++) {
cudaEventRecord(start, 0);
EmptyKernel<<<1, 1>>>();
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&time, start, stop);
total_time += time;
CheckCuda(cudaDeviceSynchronize());
}
auto end = std::chrono::steady_clock::now();
std::chrono::duration<double, std::micro> elapsed = end - begin;
return elapsed.count() / num_steps;
}
double test_host_dispatch_us(int num_warmups, int num_steps) {
CheckCuda(cudaSetDevice(kDeviceId));
for (int i = 0; i < num_warmups; i++) {
EmptyKernel<<<1, 1>>>();
}
CheckCuda(cudaDeviceSynchronize());
int remaining = num_steps;
std::chrono::duration<double, std::micro> total_elapsed(0);
while (remaining > 0) {
const int current_batch = std::min(kDispatchBatchSize, remaining);
CheckCuda(cudaDeviceSynchronize());
auto begin = std::chrono::steady_clock::now();
for (int i = 0; i < current_batch; i++) {
EmptyKernel<<<1, 1>>>();
}
auto end = std::chrono::steady_clock::now();
total_elapsed += end - begin;
remaining -= current_batch;
}
cudaEventDestroy(start);
cudaEventDestroy(stop);
CheckCuda(cudaDeviceSynchronize());
return total_time;
return total_elapsed.count() / num_steps;
}
double test_cuda_kernel_launch_wall_time(int num_warmups, int num_steps) {
double total_time = 0.0;
double test_launch_throughput_mkps(int num_warmups, int num_steps) {
CheckCuda(cudaSetDevice(kDeviceId));
for (int i = 0; i < num_warmups; i++) {
EmptyKernel<<<1, 1>>>();
cudaDeviceSynchronize();
}
CheckCuda(cudaDeviceSynchronize());
struct timeval begin_tv, end_tv;
auto begin = std::chrono::steady_clock::now();
for (int i = 0; i < num_steps; i++) {
gettimeofday(&begin_tv, NULL);
EmptyKernel<<<1, 1>>>();
cudaDeviceSynchronize();
gettimeofday(&end_tv, NULL);
total_time += (((end_tv.tv_sec) * 1000 + (end_tv.tv_usec) / 1000) -
((begin_tv.tv_sec) * 1000 + (begin_tv.tv_usec) / 1000));
}
CheckCuda(cudaDeviceSynchronize());
auto end = std::chrono::steady_clock::now();
return total_time;
std::chrono::duration<double> elapsed = end - begin;
return static_cast<double>(num_steps) / elapsed.count() / 1e6;
}
// Measure the device-side average cost of one kernel launch, in microseconds.
//
// Records a CUDA event before and after a back-to-back burst of num_steps
// empty-kernel launches (after num_warmups untimed launches and a full device
// sync), then divides the event-measured elapsed time by num_steps.
//
// num_warmups: untimed launches issued first to prime the launch path.
// num_steps:   timed launches; must be > 0 (used as the divisor).
// Returns the per-kernel device time in microseconds.
double test_device_launch_us(int num_warmups, int num_steps) {
  CheckCuda(cudaSetDevice(kDeviceId));
  cudaEvent_t begin_event, end_event;
  CheckCuda(cudaEventCreate(&begin_event));
  CheckCuda(cudaEventCreate(&end_event));
  // Untimed warmup launches.
  for (int step = 0; step < num_warmups; step++) {
    EmptyKernel<<<1, 1>>>();
  }
  // Drain all warmup work before the timed region begins.
  CheckCuda(cudaDeviceSynchronize());
  CheckCuda(cudaEventRecord(begin_event, 0));
  for (int step = 0; step < num_steps; step++) {
    EmptyKernel<<<1, 1>>>();
  }
  CheckCuda(cudaEventRecord(end_event, 0));
  CheckCuda(cudaEventSynchronize(end_event));
  float elapsed_ms = 0.0f;
  CheckCuda(cudaEventElapsedTime(&elapsed_ms, begin_event, end_event));
  CheckCuda(cudaEventDestroy(begin_event));
  CheckCuda(cudaEventDestroy(end_event));
  // Milliseconds -> microseconds, averaged over the timed launches.
  return elapsed_ms * 1000.0 / num_steps;
}
char *getCmdOption(char **begin, char **end, const std::string &option) {
......@@ -75,6 +134,8 @@ char *getCmdOption(char **begin, char **end, const std::string &option) {
return 0;
}
} // namespace
int main(int argc, char *argv[]) {
int num_warmups = 100;
int num_steps = 2000000;
......@@ -92,16 +153,23 @@ int main(int argc, char *argv[]) {
interval = std::stoi(value);
}
// Test the kernel launch event time.
double event_total_time = test_cuda_kernel_launch_event_time(num_warmups, num_steps);
printf("Kernel launch overhead - event time: %3.5f ms \n", event_total_time / num_steps);
const double e2e_latency_us = test_e2e_latency_us(num_warmups, num_steps);
printf("e2e_latency_us: %.6f\n", e2e_latency_us);
std::this_thread::sleep_for(std::chrono::milliseconds(interval));
const double host_dispatch_us = test_host_dispatch_us(num_warmups, num_steps);
printf("host_dispatch_us: %.6f\n", host_dispatch_us);
std::this_thread::sleep_for(std::chrono::milliseconds(interval));
const double launch_throughput_mkps = test_launch_throughput_mkps(num_warmups, num_steps);
printf("launch_throughput_mkps: %.6f\n", launch_throughput_mkps);
// Sleep for interval milliseconds and run the next test.
std::this_thread::sleep_for(std::chrono::milliseconds(interval));
// Test the kernel launch wall time.
double wall_total_time = test_cuda_kernel_launch_wall_time(num_warmups, num_steps);
printf("Kernel launch overhead - wall time: %3.5f ms \n", wall_total_time / num_steps);
const double device_launch_us = test_device_launch_us(num_warmups, num_steps);
printf("device_launch_us: %.6f\n", device_launch_us);
return 0;
}
......@@ -36,7 +36,7 @@ def test_kernel_launch_overhead():
assert ('raw_output_0' in benchmark.raw_data)
assert (len(benchmark.raw_data['raw_output_0']) == 1)
assert (isinstance(benchmark.raw_data['raw_output_0'][0], str))
for metric in ['event_time', 'wall_time']:
for metric in ['e2e_latency_us', 'host_dispatch_us', 'launch_throughput_mkps', 'device_launch_us']:
assert (metric in benchmark.result)
assert (len(benchmark.result[metric]) == 1)
assert (isinstance(benchmark.result[metric][0], numbers.Number))
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment