#include <iostream>
#include <vector>
#include "hip/hip_runtime.h"
#include "roctracer/roctx.h"
#include <chrono>
#include <thread>


// Matrix side length (the matrix is WIDTH x WIDTH, NUM elements in total) and thread-block dimensions
#define WIDTH 1024
#define NUM (WIDTH * WIDTH)
#define THREADS_PER_BLOCK_X 32
#define THREADS_PER_BLOCK_Y 32

// Tiled matrix transpose kernel.
//
// Computes transposedMatrix[c * width + r] = matrix[r * width + c] for a square,
// row-major width x width matrix, staging one tile per block in shared memory
// so that both the global load and the global store are contiguous along
// threadIdx.x.
//
// Parameters:
//   transposedMatrix  device pointer to the output (width * width floats)
//   matrix            device pointer to the input  (width * width floats)
//   width             side length of the matrix; need not be a multiple of
//                     the tile size
//
// Launch requirements:
//   - blockDim must be (THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y, 1); the tile
//     is sized from these macros and must be square because it is indexed
//     both as [y][x] (load) and [x][y] (store).
//   - the grid must cover the matrix in both dimensions
//     (gridDim.x * blockDim.x >= width, likewise for y).
//   - uses THREADS_PER_BLOCK_Y * (THREADS_PER_BLOCK_X + 1) floats of static
//     shared memory per block.
__global__ __launch_bounds__(1024) void matrixTranspose(float* transposedMatrix, const float* matrix, int width) {
    static_assert(THREADS_PER_BLOCK_X == THREADS_PER_BLOCK_Y,
                  "matrixTranspose requires a square thread block");

    // The extra column of padding keeps the column-wise reads below from
    // hitting the same shared-memory bank.
    __shared__ float tile[THREADS_PER_BLOCK_Y][THREADS_PER_BLOCK_X + 1];

    // Coordinates of the input element this thread loads.
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < width && y < width) {
        tile[threadIdx.y][threadIdx.x] = matrix[static_cast<size_t>(y) * width + x];
    }

    // The barrier must be reached by every thread of the block, so it lives
    // outside the bounds checks.
    __syncthreads();

    // Coordinates of the output element this thread stores. The block indices
    // are swapped (the tile lands at the mirrored position) while threadIdx.x
    // still walks along a row of the output, keeping the store contiguous.
    const int outCol = blockIdx.y * blockDim.y + threadIdx.x;
    const int outRow = blockIdx.x * blockDim.x + threadIdx.y;

    // tile[threadIdx.x][threadIdx.y] was written by the thread whose input
    // coordinates were (x = outRow, y = outCol); this guard is therefore true
    // exactly when that element was actually loaded.
    if (outCol < width && outRow < width) {
        transposedMatrix[static_cast<size_t>(outRow) * width + outCol] = tile[threadIdx.x][threadIdx.y];
    }
}

int main() {


    // 主机端矩阵
    std::vector<float> Matrix(NUM);
    std::vector<float> TransposeMatrix(NUM);

    // 初始化矩阵
    for (int i = 0; i < WIDTH; ++i) {
        for (int j = 0; j < WIDTH; ++j) {
            Matrix[i * WIDTH + j] = i * WIDTH + j;
        }
    }

    // 设备端矩阵
    float *gpuMatrix = nullptr;
    float *gpuTransposeMatrix = nullptr;

    
    hipMalloc(&gpuMatrix, NUM * sizeof(float));

    // 分配设备内存
    hipMalloc(&gpuTransposeMatrix, NUM * sizeof(float));


    // 初始化ROCTx
    roctxMark("Program start");
    int rangeId = roctxRangeStart("hipLaunchKernel range");
    // 内存从主机传输到设备
    hipMemcpy(gpuMatrix, Matrix.data(), NUM * sizeof(float), hipMemcpyHostToDevice);
    roctxRangePush("hipMemcpy HostToDevice");

    // 设置内核启动配置
    dim3 gridSize(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y);
    dim3 blockSize(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y);

    for (int i = 0; i < 100; ++i){
        std::this_thread::sleep_for(std::chrono::milliseconds(5));
        // 启动转置内核
        matrixTranspose<<<gridSize, blockSize>>>(gpuTransposeMatrix, gpuMatrix, WIDTH);
        hipDeviceSynchronize();
    }

    // 检查内核启动错误
    hipError_t kernelLaunchError = hipGetLastError();
    if (kernelLaunchError != hipSuccess) {
        std::cerr << "Kernel launch failed: " << hipGetErrorString(kernelLaunchError) << std::endl;
        return -1;
    }

    roctxMark("after hipLaunchKernel");

    // 等待内核执行完成
    hipDeviceSynchronize();



    // 检查内核执行错误
    hipError_t kernelExecutionError = hipGetLastError();
    if (kernelExecutionError != hipSuccess) {
        std::cerr << "Kernel execution failed: " << hipGetErrorString(kernelExecutionError) << std::endl;
        return -1;
    }


    // 内存从设备传输到主机
    roctxRangePush("hipMemcpy DeviceToHost");
    hipMemcpy(TransposeMatrix.data(), gpuTransposeMatrix, NUM * sizeof(float), hipMemcpyDeviceToHost);


    roctxRangePop(); // for "hipMemcpy"
    roctxRangePop(); // for "hipLaunchKernel"
    roctxRangeStop(rangeId);

    bool correct = true;
    for (int i = 0; i < WIDTH && correct; ++i) {
        for (int j = 0; j < WIDTH; ++j) {
            if (TransposeMatrix[i * WIDTH + j] != Matrix[j * WIDTH + i]) {
                std::cerr << "Mismatch at (" << i << ", " << j << "): "
                          << TransposeMatrix[i * WIDTH + j] << " != " << Matrix[j * WIDTH + i] << std::endl;
                correct = false;
                break;
            }
        }
    }

    if (correct) {
        std::cout << "Matrix transpose completed successfully!" << std::endl;
    }

    // 释放设备内存
    hipFree(gpuMatrix);
    hipFree(gpuTransposeMatrix);

    return 0;
}

