Improve launch bounds for gpu-copy

eea26d0d · one · 2ea51c1d · eea26d0d
Commit eea26d0d authored Apr 17, 2026 by one
Show whitespace changes
Inline Side-by-side

Showing with 37 additions and 1 deletion

superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu ...chmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu +37 -1

No files found.
--- a/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu
+++ b/superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu
@@ -875,11 +875,42 @@ __global__ void SMOneToAllCopyKernel(ulong2 **dst_buffers, ulong2 *src_buffer, u
    }
 }

+// Resolves the thread block size to use when launching SMOneToAllCopyKernel.
+//
+// Queries the kernel's attributes with cudaFuncGetAttributes and caps the requested
+// opts.all_to_all_thread_block_size at the reported maxThreadsPerBlock.
+//
+// opts:              benchmark options; only all_to_all_thread_block_size is read here.
+// thread_block_size: output pointer; written only when the function returns 0.
+//
+// Returns 0 on success. Returns -1 when thread_block_size is null, when the attribute
+// query fails, or when the reported maxThreadsPerBlock is not positive.
+//
+// NOTE(review): the attributes are queried once, for whichever device is current at
+// call time -- confirm the limit is the same on every GPU this kernel is launched on.
+int GetSafeAllToAllThreadBlockSize(const Opts &opts, int *thread_block_size) {
+    if (thread_block_size == nullptr) {
+        return -1;
+    }
+
+    cudaFuncAttributes func_attr;
+    // The HIP build passes the kernel as an untyped function pointer.
+#if defined(__HIP_PLATFORM_AMD__)
+    cudaError_t cuda_err = cudaFuncGetAttributes(&func_attr, reinterpret_cast<const void *>(SMOneToAllCopyKernel));
+#else
+    cudaError_t cuda_err = cudaFuncGetAttributes(&func_attr, SMOneToAllCopyKernel);
+#endif
+    if (cuda_err != cudaSuccess) {
+        fprintf(stderr, "GetSafeAllToAllThreadBlockSize::cudaFuncGetAttributes error: %d\n", cuda_err);
+        return -1;
+    }
+
+    if (func_attr.maxThreadsPerBlock <= 0) {
+        fprintf(stderr, "GetSafeAllToAllThreadBlockSize::invalid maxThreadsPerBlock: %d\n",
+                func_attr.maxThreadsPerBlock);
+        return -1;
+    }
+
+    // Clamp in the option's own integer type and narrow to int only afterwards.
+    // Narrowing first would let a request larger than INT_MAX wrap around and
+    // slip past the upper-bound check.
+    const auto requested = opts.all_to_all_thread_block_size;
+    const auto limit = static_cast<decltype(requested)>(func_attr.maxThreadsPerBlock);
+    *thread_block_size = static_cast<int>(requested > limit ? limit : requested);
+
+    return 0;
+}
+
 // src_rank/dst_rank: < 0 for all ranks, else for specified rank
 int RunAllToAllBench(const Opts &opts, int gpu_count, int src_rank, int dst_rank) {
    int ret = 0;
    cudaError_t cuda_err = cudaSuccess;
    int can_access = 0;
+    int thread_block_size = 0;

    std::vector<uint8_t *> src_buffers_gpu(gpu_count, nullptr);
    std::vector<uint8_t *> dst_buffers_gpu(gpu_count, nullptr);
@@ -890,6 +921,11 @@ int RunAllToAllBench(const Opts &opts, int gpu_count, int src_rank, int dst_rank

    uint64_t *data_buffer_cpu = nullptr;

+    ret = GetSafeAllToAllThreadBlockSize(opts, &thread_block_size);
+    if (ret != 0) {
+        return -1;
+    }
+
    // Scan all GPUs
    for (int i = 0; i < gpu_count; i++) {
        for (int j = 0; j < gpu_count; j++) {
@@ -1011,7 +1047,7 @@ int RunAllToAllBench(const Opts &opts, int gpu_count, int src_rank, int dst_rank
                }
            }
            SMOneToAllCopyKernel<<<gpu_count * opts.all_to_all_num_thread_blocks_per_rank,
-                                   opts.all_to_all_thread_block_size, 0, streams[rank]>>>(
+                                   thread_block_size, 0, streams[rank]>>>(
                (ulong2 **)dst_buffer_gpu_args[rank], (ulong2 *)src_buffers_gpu[rank], opts.size, rank, dst_rank,
                gpu_count);
            if (i == opts.num_warm_up + opts.num_loops - 1) {