#include #include #include #include #include #include // 定义向量类型 typedef float v4f __attribute__((vector_size(16))); typedef __fp16 __fp16x4_t __attribute__((vector_size(8))); // 简单的 MMAC 指令测试 __global__ void test_mmac_kernel(float* result) { // 使用 __builtin_amdgcn_sched_barrier 进行调度屏障 __builtin_amdgcn_sched_barrier(0); // 准备向量参数 __fp16x4_t A = {0.1f, 0.2f, 0.3f, 0.4f}; __fp16x4_t B = {1.0f, 2.0f, 3.0f, 4.0f}; // 初始累加值 v4f c = {0.0f, 0.0f, 0.0f, 0.0f}; v4f d; // 计算参考结果（常规乘法） float ref_result = 0.0f; float a_values[] = {0.1f, 0.2f, 0.3f, 0.4f}; float b_values[] = {1.0f, 2.0f, 3.0f, 4.0f}; for (int i = 0; i < 4; i++) { ref_result += a_values[i] * b_values[i]; } // 尝试使用 MMAC 指令 #ifdef __HIP_DEVICE_COMPILE__ #ifdef __gfx928__ // 在gfx928架构上使用正确的MMAC指令 d = __builtin_amdgcn_mmac_f32_16x16x16f16(A, B, c); *result = d[0]; #else // 在不支持的硬件上使用参考结果 *result = ref_result; #endif #else // 在主机上编译时使用参考结果 *result = ref_result; #endif // 再次使用调度屏障 __builtin_amdgcn_sched_barrier(0); } // 矩阵分块大小 #define BLOCK_SIZE 16 // 使用MMAC指令的高效矩阵乘法内核 __global__ void mmac_matrix_kernel(__fp16* A, __fp16* B, float* C, int M, int N, int K) { // 使用 __builtin_amdgcn_sched_barrier 进行调度屏障 __builtin_amdgcn_sched_barrier(0); // 获取线程块和线程在块中的位置 int blockRow = blockIdx.y; int blockCol = blockIdx.x; int row = threadIdx.y; int col = threadIdx.x; // 每个线程负责计算的结果元素 float result = 0.0f; // 计算全局行列索引 int globalRow = blockRow * BLOCK_SIZE + row; int globalCol = blockCol * BLOCK_SIZE + col; // 确保线程在有效范围内 if (globalRow < M && globalCol < N) { // 使用MMAC指令进行矩阵乘法 for (int k = 0; k < K; k += 16) { // 加载A矩阵的16个元素 (1x16) __fp16x4_t a0 = reinterpret_cast<__fp16x4_t*>(&A[globalRow * K + k])[0]; __fp16x4_t a1 = reinterpret_cast<__fp16x4_t*>(&A[globalRow * K + k + 4])[0]; __fp16x4_t a2 = reinterpret_cast<__fp16x4_t*>(&A[globalRow * K + k + 8])[0]; __fp16x4_t a3 = reinterpret_cast<__fp16x4_t*>(&A[globalRow * K + k + 12])[0]; // 加载B矩阵的16个元素 (16x1) __fp16x4_t b0 = reinterpret_cast<__fp16x4_t*>(&B[k * N + globalCol])[0]; __fp16x4_t b1 = reinterpret_cast<__fp16x4_t*>(&B[(k + 4) * N + globalCol])[0]; __fp16x4_t b2 = reinterpret_cast<__fp16x4_t*>(&B[(k + 8) * N + globalCol])[0]; __fp16x4_t b3 = reinterpret_cast<__fp16x4_t*>(&B[(k + 12) * N + globalCol])[0]; // 初始累加值 v4f c = {0.0f, 0.0f, 0.0f, 0.0f}; v4f d; // 使用MMAC指令进行计算 d = __builtin_amdgcn_mmac_f32_16x16x16f16(a0, b0, c); result += d[0]; // d = __builtin_amdgcn_mmac_f32_16x16x16f16(a1, b1, c); // result += d[0]; // d = __builtin_amdgcn_mmac_f32_16x16x16f16(a2, b2, c); // result += d[0]; // d = __builtin_amdgcn_mmac_f32_16x16x16f16(a3, b3, c); // result += d[0]; } // 存储结果 C[globalRow * N + globalCol] = result; } // 再次使用调度屏障 __builtin_amdgcn_sched_barrier(0); } // CPU 侧矩阵乘法参考实现 template void cpu_matrix_multiply(__fp16* A, __fp16* B, float* C) { for (int i = 0; i < M; ++i) { for (int j = 0; j < N; ++j) { float sum = 0.0f; for (int k = 0; k < K; ++k) { sum += (__half2float(A[i * K + k])) * (__half2float(B[k * N + j])); } C[i * N + j] = sum; } } } // 验证GPU和CPU结果是否一致 template bool verify_results(float* cpu_result, float* gpu_result, float epsilon = 1e-3f) { for (int i = 0; i < M * N; ++i) { if (fabs(cpu_result[i] - gpu_result[i]) > epsilon) { printf("Result mismatch at index %d: CPU=%f, GPU=%f\n", i, cpu_result[i], gpu_result[i]); return false; } } return true; } // 性能测试函数 template void run_performance_test() { printf("\n=== Testing matrix size %dx%dx%d ===\n", M, K, N); // 分配内存 __fp16* h_A = (__fp16*)malloc(M * K * sizeof(__fp16)); __fp16* h_B = (__fp16*)malloc(K * N * sizeof(__fp16)); float* h_cpu_result = (float*)malloc(M * N * sizeof(float)); float* h_gpu_result = (float*)malloc(M * N * sizeof(float)); __fp16* d_A; __fp16* d_B; float* d_C; hipMalloc((void**)&d_A, M * K * sizeof(__fp16)); hipMalloc((void**)&d_B, K * N * sizeof(__fp16)); hipMalloc((void**)&d_C, M * N * sizeof(float)); // 初始化数据 for (int i = 0; i < M * K; ++i) { h_A[i] = (__fp16)(0.1f * (i % 100)); } for (int i = 0; i < K * N; ++i) { h_B[i] = (__fp16)(0.1f * (i % 100)); } // 复制数据到GPU hipMemcpy(d_A, h_A, M * K * sizeof(__fp16), hipMemcpyHostToDevice); hipMemcpy(d_B, h_B, K * N * sizeof(__fp16), hipMemcpyHostToDevice); // 设置线程块和网格大小，使用BLOCK_SIZE常量 dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE); dim3 gridDim((N + BLOCK_SIZE - 1) / BLOCK_SIZE, (M + BLOCK_SIZE - 1) / BLOCK_SIZE); // 预热运行 mmac_matrix_kernel<<>>(d_A, d_B, d_C, M, N, K); hipDeviceSynchronize(); // 性能测试 - 增加迭代次数以获得更准确的计时 int iterations = 100; hipEvent_t start, stop; hipEventCreate(&start); hipEventCreate(&stop); // 确保GPU准备就绪 hipDeviceSynchronize(); hipEventRecord(start); for (int i = 0; i < iterations; ++i) { mmac_matrix_kernel<<>>(d_A, d_B, d_C, M, N, K); } hipEventRecord(stop); hipEventSynchronize(stop); float elapsed_ms; hipEventElapsedTime(&elapsed_ms, start, stop); float avg_time_ms = elapsed_ms / iterations; // 确保时间有效 if (avg_time_ms < 0.001f) { avg_time_ms = 0.001f; // 避免除以零 } // 计算TFLOPS double flops = 2.0 * M * N * K; double tflops = (flops / avg_time_ms) / 1e9; // 计算带宽（GB/s） double bytes = (M * K * sizeof(__fp16) + K * N * sizeof(__fp16) + M * N * sizeof(float)); double bandwidth = (bytes / avg_time_ms) / 1e6; // 复制结果回主机 hipMemcpy(h_gpu_result, d_C, M * N * sizeof(float), hipMemcpyDeviceToHost); // CPU 计算 clock_t cpu_start = clock(); cpu_matrix_multiply(h_A, h_B, h_cpu_result); clock_t cpu_end = clock(); double cpu_time_ms = (double)(cpu_end - cpu_start) * 1000.0 / CLOCKS_PER_SEC; // 验证结果 bool success = verify_results(h_cpu_result, h_gpu_result); if (success) { printf("✓ Results match between CPU and GPU\n"); } else { printf("✗ Results mismatch between CPU and GPU\n"); // 打印前几个结果进行调试 printf("First 5 results - CPU: %f, %f, %f, %f, %f\n", h_cpu_result[0], h_cpu_result[1], h_cpu_result[2], h_cpu_result[3], h_cpu_result[4]); printf("First 5 results - GPU: %f, %f, %f, %f, %f\n", h_gpu_result[0], h_gpu_result[1], h_gpu_result[2], h_gpu_result[3], h_gpu_result[4]); } // 输出性能数据 printf("GPU Time: %.3f ms\n", avg_time_ms); printf("CPU Time: %.3f ms\n", cpu_time_ms); printf("TFLOPS: %.3f\n", tflops); printf("Bandwidth: %.3f GB/s\n", bandwidth); if (avg_time_ms > 0) { printf("Speedup: %.2fx\n", cpu_time_ms / avg_time_ms); } else { printf("Speedup: N/A (GPU time too small)\n"); } // 清理资源 free(h_A); free(h_B); free(h_cpu_result); free(h_gpu_result); hipFree(d_A); hipFree(d_B); hipFree(d_C); hipEventDestroy(start); hipEventDestroy(stop); } int main() { // 原始的简单测试 printf("=== Original Simple MMAC Test ===\n"); float* d_result; float h_result; hipMalloc((void**)&d_result, sizeof(float)); // 启动内核 test_mmac_kernel<<<1, 1>>>(d_result); hipDeviceSynchronize(); // 复制结果回主机 hipMemcpy(&h_result, d_result, sizeof(float), hipMemcpyDeviceToHost); // CPU 参考计算 float a_values[] = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f, 0.9f, 1.0f, 1.1f, 1.2f, 1.3f, 1.4f, 1.5f, 1.6f}; float b_values[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}; float ref_result = 0.0f; for (int i = 0; i < 16; i++) { ref_result += a_values[i] * b_values[i]; } printf("MMAC result: %f\n", h_result); printf("Reference result: %f\n", ref_result); printf("Difference: %e\n", fabs(h_result - ref_result)); hipFree(d_result); // 运行不同大小的矩阵性能测试 run_performance_test<128, 128, 128>(); run_performance_test<256, 256, 256>(); run_performance_test<512, 512, 512>(); run_performance_test<1024, 1024, 1024>(); printf("\nAll tests completed!\n"); return 0; }