// simple_mmac_test.cu

#include <stdio.h>
#include <stdlib.h>
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <time.h>
#include <cmath>

// Vector types passed to and returned from the MMAC builtin used below:
//   v4f        - four 32-bit floats (16 bytes); accumulator input and result type.
//   __fp16x4_t - four 16-bit half-precision values (8 bytes); A/B operand type.
typedef float v4f __attribute__((vector_size(16)));
typedef __fp16 __fp16x4_t __attribute__((vector_size(8)));

// Minimal smoke test for the MMAC builtin.
//
// Writes a single float to *result:
//   - gfx928 device code: element 0 of the vector returned by
//     __builtin_amdgcn_mmac_f32_16x16x16f16 for fixed 4-element operands and a
//     zero accumulator;
//   - any other compilation (other GPU targets, host pass): the scalar dot
//     product of the same four operand pairs.
__global__ void test_mmac_kernel(float* result)
{
    // Scheduling barrier ahead of the test body.
    __builtin_amdgcn_sched_barrier(0);

    // Half-precision operands handed to the instruction.
    __fp16x4_t lhs = {0.1f, 0.2f, 0.3f, 0.4f};
    __fp16x4_t rhs = {1.0f, 2.0f, 3.0f, 4.0f};

    // Zero accumulator going in, result vector coming out.
    v4f acc = {0.0f, 0.0f, 0.0f, 0.0f};
    v4f out;

#if defined(__HIP_DEVICE_COMPILE__) && defined(__gfx928__)
    // gfx928 device pass: issue the MMAC instruction and report element 0.
    out = __builtin_amdgcn_mmac_f32_16x16x16f16(lhs, rhs, acc);
    *result = out[0];
#else
    // Unsupported target or host pass: fall back to a plain float dot product
    // over the same four value pairs.
    const float lhs_f32[] = {0.1f, 0.2f, 0.3f, 0.4f};
    const float rhs_f32[] = {1.0f, 2.0f, 3.0f, 4.0f};
    float dot = 0.0f;
    for (int idx = 0; idx < 4; idx++) {
        dot += lhs_f32[idx] * rhs_f32[idx];
    }
    *result = dot;
#endif

    // Closing scheduling barrier.
    __builtin_amdgcn_sched_barrier(0);
}

// Tile edge length. Thread blocks are launched as BLOCK_SIZE x BLOCK_SIZE and
// each block covers a BLOCK_SIZE x BLOCK_SIZE tile of the output matrix.
#define BLOCK_SIZE 16

// Matrix-multiply kernel exercising the MMAC builtin.
//
// Parameters:
//   A - M x K half-precision matrix, row-major (indexed as A[row * K + k]).
//   B - K x N half-precision matrix, row-major (indexed as B[k * N + col]).
//   C - M x N float output, row-major; one element is written per thread.
//   M, N, K - matrix dimensions.
//
// Launch layout: 2D grid of BLOCK_SIZE x BLOCK_SIZE thread blocks; thread
// (threadIdx.y, threadIdx.x) of block (blockIdx.y, blockIdx.x) owns
// C[blockIdx.y * BLOCK_SIZE + threadIdx.y][blockIdx.x * BLOCK_SIZE + threadIdx.x].
// Threads that fall outside the M x N output do nothing.
//
// On gfx928 device code the MMAC builtin is used; on every other target a
// plain scalar dot product is computed instead, mirroring the guard used by
// test_mmac_kernel so the file still builds and runs elsewhere.
__global__ void mmac_matrix_kernel(__fp16* A, __fp16* B, float* C, int M, int N, int K)
{
    // Scheduling barrier ahead of the kernel body.
    __builtin_amdgcn_sched_barrier(0);

    // Global coordinates of the output element owned by this thread.
    const int globalRow = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    const int globalCol = blockIdx.x * BLOCK_SIZE + threadIdx.x;

    if (globalRow < M && globalCol < N) {
        float result = 0.0f;

#if defined(__HIP_DEVICE_COMPILE__) && defined(__gfx928__)
        // Walk K in steps of 16, issuing one MMAC per step.
        //
        // NOTE(review): only A[globalRow][k..k+3] and four consecutive halves
        // starting at B[k][globalCol] (i.e. four adjacent columns of row k, not
        // a column slice) are fed to the instruction, and only element 0 of the
        // returned vector is accumulated. Whether that produces the full dot
        // product depends on how the instruction distributes operands across
        // lanes - confirm against the MMAC ISA documentation.
        // NOTE(review): &B[k * N + globalCol] is only guaranteed to be 2-byte
        // aligned, yet it is read as an 8-byte vector - confirm this is
        // acceptable on the target.
        // NOTE(review): the builtin is issued inside the bounds check, so lanes
        // outside the matrix do not participate - confirm the instruction
        // tolerates partially active wavefronts.
        // Precondition: K must be large enough that k + 3 < K for every step.
        for (int k = 0; k < K; k += 16) {
            __fp16x4_t a0 = reinterpret_cast<__fp16x4_t*>(&A[globalRow * K + k])[0];
            __fp16x4_t b0 = reinterpret_cast<__fp16x4_t*>(&B[k * N + globalCol])[0];

            // Fresh zero accumulator for every step; partial sums are combined
            // in the scalar `result` instead.
            v4f c = {0.0f, 0.0f, 0.0f, 0.0f};
            v4f d = __builtin_amdgcn_mmac_f32_16x16x16f16(a0, b0, c);
            result += d[0];
        }
#else
        // No MMAC available (other GPU target or host pass): accumulate the
        // dot product of row globalRow of A and column globalCol of B in float.
        for (int k = 0; k < K; ++k) {
            result += static_cast<float>(A[globalRow * K + k]) *
                      static_cast<float>(B[k * N + globalCol]);
        }
#endif

        // Store the accumulated value for this output element.
        C[globalRow * N + globalCol] = result;
    }

    // Closing scheduling barrier.
    __builtin_amdgcn_sched_barrier(0);
}

// Host-side reference matrix multiply: C = A * B.
//
// A is M x K, B is K x N and C is M x N, all row-major. Inputs are converted
// from half precision with __half2float and each output element is
// accumulated in float, in increasing k order.
template <int M, int N, int K>
void cpu_matrix_multiply(__fp16* A, __fp16* B, float* C)
{
    for (int row = 0; row < M; ++row) {
        // Start of the current row in A and in C.
        const __fp16* a_row = A + row * K;
        float* c_row = C + row * N;

        for (int col = 0; col < N; ++col) {
            float acc = 0.0f;
            for (int k = 0; k < K; ++k) {
                acc += (__half2float(a_row[k])) * (__half2float(B[k * N + col]));
            }
            c_row[col] = acc;
        }
    }
}

// Checks that the GPU output matches the CPU reference.
//
// Parameters:
//   cpu_result - M * N reference values.
//   gpu_result - M * N values to validate.
//   epsilon    - tolerance; applied as an absolute bound for |reference| <= 1
//                and as a relative bound (epsilon * |reference|) above that,
//                so large accumulated sums are not rejected for differing by
//                a single float ulp.
//
// Returns true when every element is within tolerance. On the first element
// that is not, prints its index and both values and returns false. NaN on
// either side is always reported as a mismatch.
template <int M, int N>
bool verify_results(float* cpu_result, float* gpu_result, float epsilon = 1e-3f)
{
    for (int i = 0; i < M * N; ++i) {
        const float expected = cpu_result[i];
        const float actual = gpu_result[i];

        // Bit-for-bit equal values (including matching infinities) always pass.
        if (expected == actual) {
            continue;
        }

        const float tolerance = epsilon * std::fmax(1.0f, std::fabs(expected));

        // Written as a negated "<=" so that a NaN difference fails the check;
        // "diff > tolerance" would be false for NaN and let it through.
        if (!(std::fabs(expected - actual) <= tolerance)) {
            printf("Result mismatch at index %d: CPU=%f, GPU=%f\n", i, cpu_result[i], gpu_result[i]);
            return false;
        }
    }
    return true;
}

// 性能测试函数
template <int M, int N, int K>
void run_performance_test()
{
    printf("\n=== Testing matrix size %dx%dx%d ===\n", M, K, N);
    
    // 分配内存
    __fp16* h_A = (__fp16*)malloc(M * K * sizeof(__fp16));
    __fp16* h_B = (__fp16*)malloc(K * N * sizeof(__fp16));
    float* h_cpu_result = (float*)malloc(M * N * sizeof(float));
    float* h_gpu_result = (float*)malloc(M * N * sizeof(float));
    
    __fp16* d_A;
    __fp16* d_B;
    float* d_C;
    
    hipMalloc((void**)&d_A, M * K * sizeof(__fp16));
    hipMalloc((void**)&d_B, K * N * sizeof(__fp16));
    hipMalloc((void**)&d_C, M * N * sizeof(float));
    
    // 初始化数据
    for (int i = 0; i < M * K; ++i) {
        h_A[i] = (__fp16)(0.1f * (i % 100));
    }
    for (int i = 0; i < K * N; ++i) {
        h_B[i] = (__fp16)(0.1f * (i % 100));
    }
    
    // 复制数据到GPU
    hipMemcpy(d_A, h_A, M * K * sizeof(__fp16), hipMemcpyHostToDevice);
    hipMemcpy(d_B, h_B, K * N * sizeof(__fp16), hipMemcpyHostToDevice);
    
    // 设置线程块和网格大小，使用BLOCK_SIZE常量
    dim3 blockDim(BLOCK_SIZE, BLOCK_SIZE);
    dim3 gridDim((N + BLOCK_SIZE - 1) / BLOCK_SIZE, (M + BLOCK_SIZE - 1) / BLOCK_SIZE);
    
    // 预热运行
    mmac_matrix_kernel<<<gridDim, blockDim>>>(d_A, d_B, d_C, M, N, K);
    hipDeviceSynchronize();
    
    // 性能测试 - 增加迭代次数以获得更准确的计时
    int iterations = 100;
    hipEvent_t start, stop;
    hipEventCreate(&start);
    hipEventCreate(&stop);
    
    // 确保GPU准备就绪
    hipDeviceSynchronize();
    
    hipEventRecord(start);
    for (int i = 0; i < iterations; ++i) {
        mmac_matrix_kernel<<<gridDim, blockDim>>>(d_A, d_B, d_C, M, N, K);
    }
    hipEventRecord(stop);
    hipEventSynchronize(stop);
    
    float elapsed_ms;
    hipEventElapsedTime(&elapsed_ms, start, stop);
    float avg_time_ms = elapsed_ms / iterations;
    
    // 确保时间有效
    if (avg_time_ms < 0.001f) {
        avg_time_ms = 0.001f; // 避免除以零
    }
    
    // 计算TFLOPS
    double flops = 2.0 * M * N * K;
    double tflops = (flops / avg_time_ms) / 1e9;
    
    // 计算带宽（GB/s）
    double bytes = (M * K * sizeof(__fp16) + K * N * sizeof(__fp16) + M * N * sizeof(float));
    double bandwidth = (bytes / avg_time_ms) / 1e6;
    
    // 复制结果回主机
    hipMemcpy(h_gpu_result, d_C, M * N * sizeof(float), hipMemcpyDeviceToHost);
    
    // CPU 计算
    clock_t cpu_start = clock();
    cpu_matrix_multiply<M, N, K>(h_A, h_B, h_cpu_result);
    clock_t cpu_end = clock();
    double cpu_time_ms = (double)(cpu_end - cpu_start) * 1000.0 / CLOCKS_PER_SEC;
    
    // 验证结果
    bool success = verify_results<M, N>(h_cpu_result, h_gpu_result);
    if (success) {
        printf("✓ Results match between CPU and GPU\n");
    } else {
        printf("✗ Results mismatch between CPU and GPU\n");
        // 打印前几个结果进行调试
        printf("First 5 results - CPU: %f, %f, %f, %f, %f\n", 
               h_cpu_result[0], h_cpu_result[1], h_cpu_result[2], h_cpu_result[3], h_cpu_result[4]);
        printf("First 5 results - GPU: %f, %f, %f, %f, %f\n", 
               h_gpu_result[0], h_gpu_result[1], h_gpu_result[2], h_gpu_result[3], h_gpu_result[4]);
    }
    
    // 输出性能数据
    printf("GPU Time: %.3f ms\n", avg_time_ms);
    printf("CPU Time: %.3f ms\n", cpu_time_ms);
    printf("TFLOPS: %.3f\n", tflops);
    printf("Bandwidth: %.3f GB/s\n", bandwidth);
    if (avg_time_ms > 0) {
        printf("Speedup: %.2fx\n", cpu_time_ms / avg_time_ms);
    } else {
        printf("Speedup: N/A (GPU time too small)\n");
    }
    
    // 清理资源
    free(h_A);
    free(h_B);
    free(h_cpu_result);
    free(h_gpu_result);
    
    hipFree(d_A);
    hipFree(d_B);
    hipFree(d_C);
    
    hipEventDestroy(start);
    hipEventDestroy(stop);
}

int main()
{
    // 原始的简单测试
    printf("=== Original Simple MMAC Test ===\n");
    float* d_result;
    float h_result;
    
    hipMalloc((void**)&d_result, sizeof(float));
    
    // 启动内核
    test_mmac_kernel<<<1, 1>>>(d_result);
    hipDeviceSynchronize();
    
    // 复制结果回主机
    hipMemcpy(&h_result, d_result, sizeof(float), hipMemcpyDeviceToHost);
    
    // CPU 参考计算
    float a_values[] = {0.1f, 0.2f, 0.3f, 0.4f, 0.5f, 0.6f, 0.7f, 0.8f, 0.9f, 1.0f, 1.1f, 1.2f, 1.3f, 1.4f, 1.5f, 1.6f};
    float b_values[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f};
    float ref_result = 0.0f;
    for (int i = 0; i < 16; i++) {
        ref_result += a_values[i] * b_values[i];
    }
    
    printf("MMAC result: %f\n", h_result);
    printf("Reference result: %f\n", ref_result);
    printf("Difference: %e\n", fabs(h_result - ref_result));
    
    hipFree(d_result);
    
    // 运行不同大小的矩阵性能测试
    run_performance_test<128, 128, 128>();
    run_performance_test<256, 256, 256>();
    run_performance_test<512, 512, 512>();
    run_performance_test<1024, 1024, 1024>();
    
    printf("\nAll tests completed!\n");
    
    return 0;
}