Commit 0fe0b01f authored by one

Add kernel launch overhead benchmark and associated build scripts

- Introduce kernel_launch_overhead.cu to measure kernel launch latency, system throughput, CPU dispatch overhead, and GPU dispatch time.
- Create Makefile for building the benchmark with support for nvcc and hipcc.
- Add run-all.sh script to execute the benchmark with specified device settings.
parent 65bf476e
# Compiler: prefer nvcc if none is specified
CXX_COMPILER ?= $(if $(shell which nvcc 2>/dev/null),nvcc,hipcc)
CXX_FLAGS ?= -std=c++17 -O3
HIPIFY ?= hipify-perl
HEADERS := $(wildcard *.h)
SRCS := $(wildcard *.cu)
TARGETS := $(SRCS:.cu=)
.PHONY: all clean
all: $(TARGETS)
ifeq ($(CXX_COMPILER),nvcc)
# Compile directly with nvcc
%: %.cu $(HEADERS)
	nvcc $(CXX_FLAGS) -o $@ $<
else
# Hipify the CUDA source first, then compile with hipcc
%: %.cu $(HEADERS)
	$(HIPIFY) $< > $@.hip
	$(CXX_COMPILER) $(CXX_FLAGS) -o $@ $@.hip
	rm $@.hip
endif
clean:
	rm -f $(TARGETS)
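# Example usage (a sketch based only on the variables defined above):
#   make                      # auto-selects nvcc if found, otherwise hipcc
#   make CXX_COMPILER=hipcc   # force the hipify-perl + hipcc path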
#include <algorithm>
#include <cctype>
#include <chrono>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>
#include <iostream>
#include <sstream>
#include <string>
#include <thread>
#include <vector>
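// Empty kernel: its launch and scheduling cost is what the benchmark measures.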
__global__ void emptyKernel() { return; }
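// Abort with a diagnostic if a CUDA runtime call fails.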
void checkCudaErrors(cudaError_t result) {
if (result != cudaSuccess) {
std::cerr << "CUDA Error: " << cudaGetErrorString(result) << std::endl;
exit(1);
}
}
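// Return the argument following `option` in argv, or nullptr if it is absent.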
char *getCmdOption(char **begin, char **end, const std::string &option) {
char **itr = std::find(begin, end, option);
if (itr != end && ++itr != end) {
return *itr;
}
return nullptr;
}
/// End-to-end kernel launch latency: the interaction cost of a single task.
/// Covers the full round trip of CPU dispatch + GPU execution + CPU waiting for completion.
double testSingleLaunchLatency(int device_id, int n_warmups, int n_steps) {
checkCudaErrors(cudaSetDevice(device_id));
// Warmup
for (int i = 0; i < n_warmups; ++i) {
emptyKernel<<<1, 1>>>();
checkCudaErrors(cudaDeviceSynchronize());
}
auto start_cpu = std::chrono::high_resolution_clock::now();
for (int i = 0; i < n_steps; ++i) {
emptyKernel<<<1, 1>>>();
checkCudaErrors(cudaDeviceSynchronize());
}
auto end_cpu = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::micro> elapsed = end_cpu - start_cpu;
return elapsed.count() / n_steps;
}
/// Peak system throughput, including the final synchronization overhead.
/// The reciprocal of this time is the throughput.
double testSystemThroughputTime(int device_id, int n_warmups, int n_steps) {
checkCudaErrors(cudaSetDevice(device_id));
// Warmup
for (int i = 0; i < n_warmups; ++i) {
emptyKernel<<<1, 1>>>();
}
checkCudaErrors(cudaDeviceSynchronize());
auto start_cpu = std::chrono::high_resolution_clock::now();
for (int i = 0; i < n_steps; ++i) {
emptyKernel<<<1, 1>>>();
}
// Synchronize before stopping the timer
checkCudaErrors(cudaDeviceSynchronize());
auto end_cpu = std::chrono::high_resolution_clock::now();
std::chrono::duration<double, std::micro> elapsed = end_cpu - start_cpu;
return elapsed.count() / n_steps;
}
/// CPU dispatch overhead, excluding synchronization cost.
/// Measured in batches to avoid blocking when the launch queue fills up.
double testCpuDispatchOverhead(int device_id, int n_warmups, int n_steps,
int batch_size) {
checkCudaErrors(cudaSetDevice(device_id));
// Warmup
for (int i = 0; i < n_warmups; ++i) {
emptyKernel<<<1, 1>>>();
}
checkCudaErrors(cudaDeviceSynchronize());
int remaining = n_steps;
std::chrono::duration<double, std::micro> total_elapsed(0);
while (remaining > 0) {
int current_batch = std::min(batch_size, remaining);
// Ensure the previous batch has finished, freeing queue space so the measurement does not block
checkCudaErrors(cudaDeviceSynchronize());
auto start_cpu = std::chrono::high_resolution_clock::now();
for (int i = 0; i < current_batch; ++i) {
emptyKernel<<<1, 1>>>();
}
auto end_cpu = std::chrono::high_resolution_clock::now();
total_elapsed += (end_cpu - start_cpu);
remaining -= current_batch;
}
// Final synchronization
checkCudaErrors(cudaDeviceSynchronize());
return total_elapsed.count() / n_steps;
}
/// Average time for the GPU to process a single empty kernel.
/// Measured with CUDA events; the reciprocal is the throughput.
double testGpuThroughputTime(int device_id, int n_warmups, int n_steps) {
checkCudaErrors(cudaSetDevice(device_id));
cudaEvent_t start, stop;
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
// Warmup
for (int i = 0; i < n_warmups; ++i) {
emptyKernel<<<1, 1>>>();
}
checkCudaErrors(cudaDeviceSynchronize());
checkCudaErrors(cudaEventRecord(start, 0));
for (int i = 0; i < n_steps; ++i) {
emptyKernel<<<1, 1>>>();
}
checkCudaErrors(cudaEventRecord(stop, 0));
checkCudaErrors(cudaEventSynchronize(stop));
float total_time_ms = 0.f;
checkCudaErrors(cudaEventElapsedTime(&total_time_ms, start, stop));
checkCudaErrors(cudaEventDestroy(start));
checkCudaErrors(cudaEventDestroy(stop));
// Convert to microseconds
return (total_time_ms * 1000.0) / n_steps;
}
int main(int argc, char *argv[]) {
int n_warmups = 100;
int n_steps = 1000000;
int batch_size = 10; // batch size used by the CPU dispatch test
int interval = 1000; // interval between tests, in ms
std::vector<bool> cases{true, true, true, true};
if (char *value = getCmdOption(argv, argv + argc, "-w")) {
n_warmups = std::stoi(value);
}
if (char *value = getCmdOption(argv, argv + argc, "-n")) {
n_steps = std::stoi(value);
}
if (char *value = getCmdOption(argv, argv + argc, "-b")) {
batch_size = std::stoi(value);
}
if (char *value = getCmdOption(argv, argv + argc, "-i")) {
interval = std::stoi(value);
}
// Select cases with a comma-separated list, e.g. "1,2,4"
// Cases:
// 1. E2E
// 2. System Peak
// 3. CPU Dispatch
// 4. GPU Dispatch
if (char *value = getCmdOption(argv, argv + argc, "-c")) {
cases.assign(4, false);
std::stringstream ss(value);
std::string token;
while (std::getline(ss, token, ',')) {
token.erase(
std::remove_if(token.begin(), token.end(),
[](unsigned char c) { return std::isspace(c); }),
token.end());
if (token.empty()) {
continue;
}
int idx = std::stoi(token);
if (idx >= 1 && idx <= 4) {
cases[idx - 1] = true;
}
}
}
std::cout << "Benchmarking kernel launch overhead..." << std::endl;
std::cout << "---------------------------------------------------"
<< std::endl;
std::cout << "Warmups: " << n_warmups << std::endl;
std::cout << "Steps Per Test: " << n_steps << std::endl;
std::cout << "Interval: " << interval << " ms" << std::endl;
std::cout << "---------------------------------------------------"
<< std::endl;
// 1. End-to-end latency (this test is slow)
if (cases[0]) {
double e2e_latency = testSingleLaunchLatency(0, n_warmups, n_steps);
printf("1. End-to-End Latency: %.3f us \n", e2e_latency);
std::this_thread::sleep_for(std::chrono::milliseconds(interval));
}
// 2. System peak throughput, i.e. the ability to process empty kernels under heavy load
if (cases[1]) {
double sys_throughput_time =
testSystemThroughputTime(0, n_warmups, n_steps);
printf("2. System Peak Time: %.3f us (Rate: %.3f MKrnls/s)\n",
sys_throughput_time, 1.0 / sys_throughput_time);
std::this_thread::sleep_for(std::chrono::milliseconds(interval));
}
// 3. CPU dispatch overhead, which should mostly reflect software-stack cost
if (cases[2]) {
double cpu_dispatch =
testCpuDispatchOverhead(0, n_warmups, n_steps, batch_size);
printf("3. CPU Dispatch Time: %.3f us (Batch Size: %d)\n", cpu_dispatch,
batch_size);
std::this_thread::sleep_for(std::chrono::milliseconds(interval));
}
// 4. GPU throughput for empty kernels, which should reflect GPU scheduling performance
if (cases[3]) {
double gpu_hw_time = testGpuThroughputTime(0, n_warmups, n_steps);
printf("4. GPU Dispatch Time: %.3f us (Rate: %.3f MKrnls/s)\n", gpu_hw_time,
1.0 / gpu_hw_time);
std::this_thread::sleep_for(std::chrono::milliseconds(interval));
}
std::cout << "---------------------------------------------------"
<< std::endl;
return 0;
}
#!/bin/bash
set -e
export HIP_VISIBLE_DEVICES=1
RUN_CMD="numactl -m 1 -N 1 ./kernel_launch_overhead"
echo "==== Baseline ===="
${RUN_CMD}
# echo "==== hipprof --hip-trace ===="
# hipprof --hip-trace ${RUN_CMD} -c 1
# echo "==== hipprof --hip-trace --hsa-trace ===="
# hipprof --hip-trace --hsa-trace ${RUN_CMD} -c 1
# echo "==== hipprof --hip-trace --pmc ===="
# hipprof --hip-trace --pmc ${RUN_CMD} -c 1 -n 10000
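# Example flag usage (a sketch; these are the flags parsed in kernel_launch_overhead.cu,
# and the values shown are its defaults):
#   ${RUN_CMD} -w 100 -n 1000000 -b 10 -i 1000   # run all four cases with explicit defaults
#   ${RUN_CMD} -c 1,3                            # only end-to-end latency and CPU dispatch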