#include <cuda_runtime.h>

#include <algorithm>
#include <cctype>
#include <chrono>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <string>
#include <thread>
#include <vector>

__global__ void emptyKernel() { return; }

void checkCudaErrors(cudaError_t result) {
  if (result != cudaSuccess) {
    std::cerr << "CUDA Error: " << cudaGetErrorString(result) << std::endl;
    exit(1);
  }
}

char *getCmdOption(char **begin, char **end, const std::string &option) {
  char **itr = std::find(begin, end, option);
  if (itr != end && ++itr != end) {
    return *itr;
  }
  return nullptr;
}

/// End-to-end kernel launch latency: the interaction cost of a single task.
/// Covers the full round trip of CPU dispatch + GPU execution + CPU waiting
/// for completion.
double testSingleLaunchLatency(int device_id, int n_warmups, int n_steps) {
  checkCudaErrors(cudaSetDevice(device_id));

  // Warmup
  for (int i = 0; i < n_warmups; ++i) {
    emptyKernel<<<1, 1>>>();
    checkCudaErrors(cudaDeviceSynchronize());
  }

  auto start_cpu = std::chrono::high_resolution_clock::now();
  for (int i = 0; i < n_steps; ++i) {
    emptyKernel<<<1, 1>>>();
    checkCudaErrors(cudaDeviceSynchronize());
  }
  auto end_cpu = std::chrono::high_resolution_clock::now();

  std::chrono::duration<double, std::micro> elapsed = end_cpu - start_cpu;
  return elapsed.count() / n_steps;
}

/// Peak system throughput, including the final synchronization overhead.
/// The reciprocal of this time is the throughput.
double testSystemThroughputTime(int device_id, int n_warmups, int n_steps) {
  checkCudaErrors(cudaSetDevice(device_id));

  // Warmup
  for (int i = 0; i < n_warmups; ++i) {
    emptyKernel<<<1, 1>>>();
  }
  checkCudaErrors(cudaDeviceSynchronize());

  auto start_cpu = std::chrono::high_resolution_clock::now();
  for (int i = 0; i < n_steps; ++i) {
    emptyKernel<<<1, 1>>>();
  }
  // Synchronize before stopping the timer
  checkCudaErrors(cudaDeviceSynchronize());
  auto end_cpu = std::chrono::high_resolution_clock::now();

  std::chrono::duration<double, std::micro> elapsed = end_cpu - start_cpu;
  return elapsed.count() / n_steps;
}

/// CPU-side dispatch overhead, excluding synchronization.
/// Measured in batches so a full launch queue never blocks the CPU inside a
/// timed region.
double testCpuDispatchOverhead(int device_id, int n_warmups, int n_steps,
                               int batch_size) {
  checkCudaErrors(cudaSetDevice(device_id));

  // Warmup
  for (int i = 0; i < n_warmups; ++i) {
    emptyKernel<<<1, 1>>>();
  }
  checkCudaErrors(cudaDeviceSynchronize());

  int remaining = n_steps;
  std::chrono::duration<double, std::micro> total_elapsed(0);
  while (remaining > 0) {
    int current_batch = std::min(batch_size, remaining);
    // Make sure the previous batch has drained, freeing queue space so the
    // timed launches below never block.
    checkCudaErrors(cudaDeviceSynchronize());

    auto start_cpu = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < current_batch; ++i) {
      emptyKernel<<<1, 1>>>();
    }
    auto end_cpu = std::chrono::high_resolution_clock::now();
    total_elapsed += (end_cpu - start_cpu);
    remaining -= current_batch;
  }
  // Final synchronization
  checkCudaErrors(cudaDeviceSynchronize());

  return total_elapsed.count() / n_steps;
}

/// Average GPU time to process a single empty kernel.
/// Measured with CUDA events; the reciprocal is the throughput.
double testGpuThroughputTime(int device_id, int n_warmups, int n_steps) {
  checkCudaErrors(cudaSetDevice(device_id));

  cudaEvent_t start, stop;
  checkCudaErrors(cudaEventCreate(&start));
  checkCudaErrors(cudaEventCreate(&stop));

  // Warmup
  for (int i = 0; i < n_warmups; ++i) {
    emptyKernel<<<1, 1>>>();
  }
  checkCudaErrors(cudaDeviceSynchronize());

  checkCudaErrors(cudaEventRecord(start, 0));
  for (int i = 0; i < n_steps; ++i) {
    emptyKernel<<<1, 1>>>();
  }
  checkCudaErrors(cudaEventRecord(stop, 0));
  checkCudaErrors(cudaEventSynchronize(stop));

  float total_time_ms = 0.f;
  checkCudaErrors(cudaEventElapsedTime(&total_time_ms, start, stop));
  checkCudaErrors(cudaEventDestroy(start));
  checkCudaErrors(cudaEventDestroy(stop));

  // Convert to microseconds
  return (total_time_ms * 1000.0) / n_steps;
}
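/// A minimal sketch, not part of the original four measurements and not
/// invoked from main() below: CUDA Graphs are the standard way to amortize
/// per-launch CPU overhead, by capturing a batch of launches once and then
/// replaying the whole batch with a single cudaGraphLaunch call. The function
/// name and the kernels_per_graph parameter are illustrative additions;
/// cudaGraphInstantiate is shown with its three-argument CUDA 12 signature
/// (older toolkits use a five-argument overload).
double testGraphLaunchTime(int device_id, int n_warmups, int n_steps,
                           int kernels_per_graph) {
  checkCudaErrors(cudaSetDevice(device_id));

  // Stream capture requires a non-default stream.
  cudaStream_t stream;
  checkCudaErrors(cudaStreamCreate(&stream));

  // Record kernels_per_graph empty-kernel launches into one graph.
  cudaGraph_t graph;
  checkCudaErrors(cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal));
  for (int i = 0; i < kernels_per_graph; ++i) {
    emptyKernel<<<1, 1, 0, stream>>>();
  }
  checkCudaErrors(cudaStreamEndCapture(stream, &graph));

  cudaGraphExec_t graph_exec;
  checkCudaErrors(cudaGraphInstantiate(&graph_exec, graph, 0));

  // Warmup replays
  for (int i = 0; i < n_warmups; ++i) {
    checkCudaErrors(cudaGraphLaunch(graph_exec, stream));
  }
  checkCudaErrors(cudaStreamSynchronize(stream));

  int n_replays = std::max(n_steps / kernels_per_graph, 1);
  auto start_cpu = std::chrono::high_resolution_clock::now();
  for (int i = 0; i < n_replays; ++i) {
    checkCudaErrors(cudaGraphLaunch(graph_exec, stream));
  }
  checkCudaErrors(cudaStreamSynchronize(stream));
  auto end_cpu = std::chrono::high_resolution_clock::now();

  checkCudaErrors(cudaGraphExecDestroy(graph_exec));
  checkCudaErrors(cudaGraphDestroy(graph));
  checkCudaErrors(cudaStreamDestroy(stream));

  // Average time per kernel, in microseconds; comparable against the
  // per-launch numbers from the measurements above.
  std::chrono::duration<double, std::micro> elapsed = end_cpu - start_cpu;
  return elapsed.count() / (static_cast<double>(n_replays) * kernels_per_graph);
}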
int main(int argc, char *argv[]) {
  int n_warmups = 100;
  int n_steps = std::pow(10, 6);
  int batch_size = 10;  // Batch size used by the CPU dispatch test
  int interval = 1000;  // Interval between tests, in milliseconds
  std::vector<bool> cases{true, true, true, true};

  if (char *value = getCmdOption(argv, argv + argc, "-w")) {
    n_warmups = std::stoi(value);
  }
  if (char *value = getCmdOption(argv, argv + argc, "-n")) {
    n_steps = std::stoi(value);
  }
  if (char *value = getCmdOption(argv, argv + argc, "-b")) {
    batch_size = std::stoi(value);
  }
  if (char *value = getCmdOption(argv, argv + argc, "-i")) {
    interval = std::stoi(value);
  }
  // Cases to run, comma separated, e.g. "1,2,4"
  // Cases:
  //   1. E2E
  //   2. System Peak
  //   3. CPU Dispatch
  //   4. GPU Dispatch
  if (char *value = getCmdOption(argv, argv + argc, "-c")) {
    cases.assign(4, false);
    std::stringstream ss(value);
    std::string token;
    while (std::getline(ss, token, ',')) {
      token.erase(
          std::remove_if(token.begin(), token.end(),
                         [](unsigned char c) { return std::isspace(c); }),
          token.end());
      if (token.empty()) {
        continue;
      }
      int idx = std::stoi(token);
      if (idx >= 1 && idx <= 4) {
        cases[idx - 1] = true;
      }
    }
  }

  std::cout << "Benchmarking kernel launch overhead..." << std::endl;
  std::cout << "---------------------------------------------------" << std::endl;
  std::cout << "Warmups: " << n_warmups << std::endl;
  std::cout << "Steps Per Test: " << n_steps << std::endl;
  std::cout << "Interval: " << interval << " ms" << std::endl;
  std::cout << "---------------------------------------------------" << std::endl;

  // 1. End-to-end latency (this test is slow)
  if (cases[0]) {
    double e2e_latency = testSingleLaunchLatency(0, n_warmups, n_steps);
    printf("1. End-to-End Latency: %.3f us \n", e2e_latency);
    std::this_thread::sleep_for(std::chrono::milliseconds(interval));
  }

  // 2. Peak system throughput, i.e. how fast empty kernels are processed
  //    under full load
  if (cases[1]) {
    double sys_throughput_time = testSystemThroughputTime(0, n_warmups, n_steps);
    printf("2. System Peak Time: %.3f us (Rate: %.3f MKrnls/s)\n",
           sys_throughput_time, 1.0 / sys_throughput_time);
    std::this_thread::sleep_for(std::chrono::milliseconds(interval));
  }

  // 3. CPU dispatch overhead, which should be mostly software-stack cost
  if (cases[2]) {
    double cpu_dispatch = testCpuDispatchOverhead(0, n_warmups, n_steps, batch_size);
    printf("3. CPU Dispatch Time: %.3f us (Batch Size: %d)\n", cpu_dispatch,
           batch_size);
    std::this_thread::sleep_for(std::chrono::milliseconds(interval));
  }

  // 4. GPU throughput on empty kernels, which should reflect GPU scheduling
  //    performance
  if (cases[3]) {
    double gpu_hw_time = testGpuThroughputTime(0, n_warmups, n_steps);
    printf("4. GPU Dispatch Time: %.3f us (Rate: %.3f MKrnls/s)\n", gpu_hw_time,
           1.0 / gpu_hw_time);
    std::this_thread::sleep_for(std::chrono::milliseconds(interval));
  }

  std::cout << "---------------------------------------------------" << std::endl;
  return 0;
}
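// Build and run, e.g. (the file name is an assumption; any name works):
//   nvcc -O2 -o kernel_launch_benchmark kernel_launch_benchmark.cu
//   ./kernel_launch_benchmark -w 100 -n 1000000 -b 10 -i 1000 -c 1,2,3,4
// Flags omitted on the command line keep the defaults set at the top of main().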