#include #include #include "hip/hip_runtime.h" #include "roctracer/roctx.h" #include #include // 定义矩阵大小和线程块配置 #define WIDTH 1024 #define NUM (WIDTH * WIDTH) #define THREADS_PER_BLOCK_X 32 #define THREADS_PER_BLOCK_Y 32 // 矩阵转置内核函数 __global__ __launch_bounds__(1024) void matrixTranspose(float* transposedMatrix, const float* matrix, int width) { int x = blockIdx.x * blockDim.x + threadIdx.x; int y = blockIdx.y * blockDim.y + threadIdx.y; if (x < width && y < width) { // 使用共享内存进行优化转置 __shared__ float tile[THREADS_PER_BLOCK_X][THREADS_PER_BLOCK_Y + 1]; // +1 避免bank conflict // 从全局内存加载到共享内存（原始顺序） tile[threadIdx.y][threadIdx.x] = matrix[y * width + x]; __syncthreads(); // 从共享内存写入到全局内存（转置顺序） transposedMatrix[x * width + y] = tile[threadIdx.x][threadIdx.y]; } } int main() { // 主机端矩阵 std::vector Matrix(NUM); std::vector TransposeMatrix(NUM); // 初始化矩阵 for (int i = 0; i < WIDTH; ++i) { for (int j = 0; j < WIDTH; ++j) { Matrix[i * WIDTH + j] = i * WIDTH + j; } } // 设备端矩阵 float *gpuMatrix = nullptr; float *gpuTransposeMatrix = nullptr; hipMalloc(&gpuMatrix, NUM * sizeof(float)); // 分配设备内存 hipMalloc(&gpuTransposeMatrix, NUM * sizeof(float)); // 初始化ROCTx roctxMark("Program start"); int rangeId = roctxRangeStart("hipLaunchKernel range"); // 内存从主机传输到设备 hipMemcpy(gpuMatrix, Matrix.data(), NUM * sizeof(float), hipMemcpyHostToDevice); roctxRangePush("hipMemcpy HostToDevice"); // 设置内核启动配置 dim3 gridSize(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y); dim3 blockSize(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y); for (int i = 0; i < 100; ++i){ std::this_thread::sleep_for(std::chrono::milliseconds(5)); // 启动转置内核 matrixTranspose<<>>(gpuTransposeMatrix, gpuMatrix, WIDTH); hipDeviceSynchronize(); } // 检查内核启动错误 hipError_t kernelLaunchError = hipGetLastError(); if (kernelLaunchError != hipSuccess) { std::cerr << "Kernel launch failed: " << hipGetErrorString(kernelLaunchError) << std::endl; return -1; } roctxMark("after hipLaunchKernel"); // 等待内核执行完成 hipDeviceSynchronize(); // 检查内核执行错误 hipError_t kernelExecutionError = hipGetLastError(); if (kernelExecutionError != hipSuccess) { std::cerr << "Kernel execution failed: " << hipGetErrorString(kernelExecutionError) << std::endl; return -1; } // 内存从设备传输到主机 roctxRangePush("hipMemcpy DeviceToHost"); hipMemcpy(TransposeMatrix.data(), gpuTransposeMatrix, NUM * sizeof(float), hipMemcpyDeviceToHost); roctxRangePop(); // for "hipMemcpy" roctxRangePop(); // for "hipLaunchKernel" roctxRangeStop(rangeId); bool correct = true; for (int i = 0; i < WIDTH && correct; ++i) { for (int j = 0; j < WIDTH; ++j) { if (TransposeMatrix[i * WIDTH + j] != Matrix[j * WIDTH + i]) { std::cerr << "Mismatch at (" << i << ", " << j << "): " << TransposeMatrix[i * WIDTH + j] << " != " << Matrix[j * WIDTH + i] << std::endl; correct = false; break; } } } if (correct) { std::cout << "Matrix transpose completed successfully!" << std::endl; } // 释放设备内存 hipFree(gpuMatrix); hipFree(gpuTransposeMatrix); return 0; }