Commit 0fe0b01f authored by one

Add kernel launch overhead benchmark and associated build scripts

- Introduce kernel_launch_overhead.cu to measure kernel launch latency, system throughput, CPU dispatch overhead, and GPU dispatch time.
- Create Makefile for building the benchmark with support for nvcc and hipcc.
- Add run-all.sh script to execute the benchmark with specified device settings.
parent 65bf476e
# Compiler: prefer nvcc if none is specified
CXX_COMPILER ?= $(if $(shell which nvcc 2>/dev/null),nvcc,hipcc)
CXX_FLAGS ?= -std=c++17 -O3
HIPIFY ?= hipify-perl
HEADERS := $(wildcard *.h)
SRCS := $(wildcard *.cu)
TARGETS := $(SRCS:.cu=)
.PHONY: all clean
all: $(TARGETS)
ifeq ($(CXX_COMPILER),nvcc)
# Compile directly with nvcc
%: %.cu $(HEADERS)
	nvcc $(CXX_FLAGS) -o $@ $<
else
# Hipify the CUDA source first, then compile with hipcc
%: %.cu $(HEADERS)
	$(HIPIFY) $< > $@.hip
	$(CXX_COMPILER) $(CXX_FLAGS) -o $@ $@.hip
	rm $@.hip
endif
clean:
	rm -f $(TARGETS)
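# Example usage (a sketch based only on the variables defined above):
#   make                      # auto-selects nvcc if found, otherwise hipcc
#   make CXX_COMPILER=hipcc   # force the hipify-perl + hipcc path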
#include <algorithm>
#include <cctype>
#include <chrono>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <iostream>
#include <sstream>
#include <string>
#include <thread>
#include <vector>
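// Empty kernel: its launch and scheduling cost is what the benchmark measures.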
__global__ void emptyKernel() { return; }
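// Abort with a diagnostic if a CUDA runtime call fails.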
void checkCudaErrors(cudaError_t result) {
if (result != cudaSuccess) {
std::cerr << "CUDA Error: " << cudaGetErrorString(result) << std::endl;
exit(1);
}
}
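// Return the argument following `option` in argv, or nullptr if it is absent.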
char *getCmdOption(char **begin, char **end, const std::string &option) {
char **itr = std::find(begin, end, option);
if (itr != end && ++itr != end) {
return *itr;
}
return nullptr;
}
/// End-to-end kernel launch latency: the interaction cost of a single task.
/// Covers the full round trip of CPU dispatch + GPU execution + CPU waiting for completion.
double testSingleLaunchLatency(int device_id, int n_warmups, int n_steps) {
checkCudaErrors(cudaSetDevice(device_id));
// Warmup
for (int i = 0; i < n_warmups; ++i) {
emptyKernel<<<1, 1>>>();
checkCudaErrors(cudaDeviceSynchronize());
}
auto start_cpu = std::chrono::high_resolution_clock::now();
for (int i = 0; i < n_steps; ++i) {
emptyKernel<<<1, 1>>>();
checkCudaErrors(cudaDeviceSynchronize());
}
auto end_cpu = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::micro> elapsed = end_cpu - start_cpu;
return elapsed.count() / n_steps;
}
/// Peak system throughput, including the final synchronization overhead.
/// The reciprocal of this time is the throughput.
double testSystemThroughputTime(int device_id, int n_warmups, int n_steps) {
checkCudaErrors(cudaSetDevice(device_id));
// Warmup
for (int i = 0; i < n_warmups; ++i) {
emptyKernel<<<1, 1>>>();
}
checkCudaErrors(cudaDeviceSynchronize());
auto start_cpu = std::chrono::high_resolution_clock::now();
for (int i = 0; i < n_steps; ++i) {
emptyKernel<<<1, 1>>>();
}
// Synchronize before stopping the timer
checkCudaErrors(cudaDeviceSynchronize());
auto end_cpu = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::micro> elapsed = end_cpu - start_cpu;
return elapsed.count() / n_steps;
}
/// CPU dispatch overhead, excluding synchronization cost.
/// Measured in batches to avoid blocking when the launch queue fills up.
double testCpuDispatchOverhead(int device_id, int n_warmups, int n_steps,
int batch_size) {
checkCudaErrors(cudaSetDevice(device_id));
// Warmup
for (int i = 0; i < n_warmups; ++i) {
emptyKernel<<<1, 1>>>();
}
checkCudaErrors(cudaDeviceSynchronize());
int remaining = n_steps;
std::chrono::duration<double, std::micro> total_elapsed(0);
while (remaining > 0) {
int current_batch = std::min(batch_size, remaining);
// Ensure the previous batch has finished, freeing queue space so the measurement does not block
checkCudaErrors(cudaDeviceSynchronize());
auto start_cpu = std::chrono::high_resolution_clock::now();
for (int i = 0; i < current_batch; ++i) {
emptyKernel<<<1, 1>>>();
}
auto end_cpu = std::chrono::high_resolution_clock::now();
total_elapsed += (end_cpu - start_cpu);
remaining -= current_batch;
}
// Final synchronization
checkCudaErrors(cudaDeviceSynchronize());
return total_elapsed.count() / n_steps;
}
/// Average time for the GPU to process a single empty kernel.
/// Measured with CUDA events; the reciprocal is the throughput.
double testGpuThroughputTime(int device_id, int n_warmups, int n_steps) {
checkCudaErrors(cudaSetDevice(device_id));
cudaEvent_t start, stop;
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
// Warmup
for (int i = 0; i < n_warmups; ++i) {
emptyKernel<<<1, 1>>>();
}
checkCudaErrors(cudaDeviceSynchronize());
checkCudaErrors(cudaEventRecord(start, 0));
for (int i = 0; i < n_steps; ++i) {
emptyKernel<<<1, 1>>>();
}
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaEventSynchronize(stop));
float total_time_ms = 0.f;
checkCudaErrors(cudaEventElapsedTime(&total_time_ms, start, stop));
checkCudaErrors(cudaEventDestroy(start));
checkCudaErrors(cudaEventDestroy(stop));
// Convert to microseconds
return (total_time_ms * 1000.0) / n_steps;
}
int main(int argc, char *argv[]) {
int n_warmups = 100;
int n_steps = 1000000;
int batch_size = 10; // batch size used by the CPU dispatch test
int interval = 1000; // interval between tests, in ms
std::vector<bool> cases{true, true, true, true};
if (char *value = getCmdOption(argv, argv + argc, "-w")) {
n_warmups = std::stoi(value);
}
if (char *value = getCmdOption(argv, argv + argc, "-n")) {
n_steps = std::stoi(value);
}
if (char *value = getCmdOption(argv, argv + argc, "-b")) {
batch_size = std::stoi(value);
}
if (char *value = getCmdOption(argv, argv + argc, "-i")) {
interval = std::stoi(value);
}
// Select cases with a comma-separated list, e.g. "1,2,4"
// Cases:
// 1. E2E
// 2. System Peak
// 3. CPU Dispatch
// 4. GPU Dispatch
if (char *value = getCmdOption(argv, argv + argc, "-c")) {
cases.assign(4, false);
std::stringstream ss(value);
std::string token;
while (std::getline(ss, token, ',')) {
token.erase(
std::remove_if(token.begin(), token.end(),
[](unsigned char c) { return std::isspace(c); }),
token.end());
if (token.empty()) {
continue;
}
int idx = std::stoi(token);
if (idx >= 1 && idx <= 4) {
cases[idx - 1] = true;
}
}
}
std::cout << "Benchmarking kernel launch overhead..." << std::endl;
std::cout << "---------------------------------------------------"
<< std::endl;
std::cout << "Warmups: " << n_warmups << std::endl;
std::cout << "Steps Per Test: " << n_steps << std::endl;
std::cout << "Interval: " << interval << " ms" << std::endl;
std::cout << "---------------------------------------------------"
<< std::endl;
// 1. End-to-end latency (this test is slow)
if (cases[0]) {
double e2e_latency = testSingleLaunchLatency(0, n_warmups, n_steps);
printf("1. End-to-End Latency: %.3f us \n", e2e_latency);
std::this_thread::sleep_for(std::chrono::milliseconds(interval));
}
// 2. System peak throughput, i.e. the ability to process empty kernels under heavy load
if (cases[1]) {
double sys_throughput_time =
testSystemThroughputTime(0, n_warmups, n_steps);
printf("2. System Peak Time: %.3f us (Rate: %.3f MKrnls/s)\n",
sys_throughput_time, 1.0 / sys_throughput_time);
std::this_thread::sleep_for(std::chrono::milliseconds(interval));
}
// 3. CPU dispatch overhead, which should mostly reflect software-stack cost
if (cases[2]) {
double cpu_dispatch =
testCpuDispatchOverhead(0, n_warmups, n_steps, batch_size);
printf("3. CPU Dispatch Time: %.3f us (Batch Size: %d)\n", cpu_dispatch,
batch_size);
std::this_thread::sleep_for(std::chrono::milliseconds(interval));
}
// 4. GPU throughput for empty kernels, which should reflect GPU scheduling performance
if (cases[3]) {
double gpu_hw_time = testGpuThroughputTime(0, n_warmups, n_steps);
printf("4. GPU Dispatch Time: %.3f us (Rate: %.3f MKrnls/s)\n", gpu_hw_time,
1.0 / gpu_hw_time);
std::this_thread::sleep_for(std::chrono::milliseconds(interval));
}
std::cout << "---------------------------------------------------"
<< std::endl;
return 0;
}
#!/bin/bash
set -e
export HIP_VISIBLE_DEVICES=1
RUN_CMD="numactl -m 1 -N 1 ./kernel_launch_overhead"
echo "==== Baseline ===="
${RUN_CMD}
# echo "==== hipprof --hip-trace ===="
# hipprof --hip-trace ${RUN_CMD} -c 1
# echo "==== hipprof --hip-trace --hsa-trace ===="
# hipprof --hip-trace --hsa-trace ${RUN_CMD} -c 1
# echo "==== hipprof --hip-trace --pmc ===="
# hipprof --hip-trace --pmc ${RUN_CMD} -c 1 -n 10000
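# Example flag usage (a sketch; these are the flags parsed in kernel_launch_overhead.cu,
# and the values shown are its defaults):
#   ${RUN_CMD} -w 100 -n 1000000 -b 10 -i 1000   # run all four cases with explicit defaults
#   ${RUN_CMD} -c 1,3                            # only end-to-end latency and CPU dispatch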