hiptx-demo.cpp 3.77 KB
Newer Older
wangkaixiong's avatar
init  
wangkaixiong committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#include <iostream>
#include <vector>
#include "hip/hip_runtime.h"
#include "roctracer/roctx.h"
#include <chrono>
#include <thread>


// Matrix dimensions and thread-block configuration
#define WIDTH 1024
#define NUM (WIDTH * WIDTH)
#define THREADS_PER_BLOCK_X 32
#define THREADS_PER_BLOCK_Y 32

// Tiled matrix transpose kernel.
//
// Writes the transpose of the square, row-major `width` x `width` matrix
// `matrix` into `transposedMatrix`, i.e. out[r][c] = in[c][r].
//
// Parameters:
//   transposedMatrix  output buffer, width * width floats
//   matrix            input buffer, width * width floats (read-only)
//   width             number of rows and columns
//
// Launch requirements:
//   - 2D square thread block (blockDim.x == blockDim.y) no larger than
//     THREADS_PER_BLOCK_X x THREADS_PER_BLOCK_Y, the static tile size.
//   - 2D grid with gridDim.x * blockDim.x >= width and
//     gridDim.y * blockDim.y >= width.
// Shared memory: one static tile of
//   THREADS_PER_BLOCK_X * (THREADS_PER_BLOCK_Y + 1) floats per block.
__global__ __launch_bounds__(1024) void matrixTranspose(float* transposedMatrix, const float* matrix, int width) {
    // The extra padding column avoids shared-memory bank conflicts on the
    // column-wise read performed after the barrier.
    __shared__ float tile[THREADS_PER_BLOCK_X][THREADS_PER_BLOCK_Y + 1];

    // Input element loaded by this thread: row y, column x.
    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < width && y < width) {
        tile[threadIdx.y][threadIdx.x] = matrix[static_cast<size_t>(y) * width + x];
    }

    // The barrier is kept outside the bounds check so that every thread of the
    // block reaches it, including threads that fall outside the matrix.
    __syncthreads();

    // Output element stored by this thread. The block indices are swapped so
    // the tile lands at its transposed position, while threadIdx.x still walks
    // along an output row. tile[threadIdx.x][threadIdx.y] was loaded by the
    // thread with swapped thread indices, whose load guard is exactly the
    // store guard below, so no unwritten tile slot is ever read.
    const int outX = blockIdx.y * blockDim.y + threadIdx.x;
    const int outY = blockIdx.x * blockDim.x + threadIdx.y;
    if (outX < width && outY < width) {
        transposedMatrix[static_cast<size_t>(outY) * width + outX] = tile[threadIdx.x][threadIdx.y];
    }
}

int main() {


    // 主机端矩阵
    std::vector<float> Matrix(NUM);
    std::vector<float> TransposeMatrix(NUM);

    // 初始化矩阵
    for (int i = 0; i < WIDTH; ++i) {
        for (int j = 0; j < WIDTH; ++j) {
            Matrix[i * WIDTH + j] = i * WIDTH + j;
        }
    }

    // 设备端矩阵
    float *gpuMatrix = nullptr;
    float *gpuTransposeMatrix = nullptr;

    
    hipMalloc(&gpuMatrix, NUM * sizeof(float));

    // 分配设备内存
    hipMalloc(&gpuTransposeMatrix, NUM * sizeof(float));


    // 初始化ROCTx
    roctxMark("Program start");
    int rangeId = roctxRangeStart("hipLaunchKernel range");
    // 内存从主机传输到设备
    hipMemcpy(gpuMatrix, Matrix.data(), NUM * sizeof(float), hipMemcpyHostToDevice);
    roctxRangePush("hipMemcpy HostToDevice");

    // 设置内核启动配置
    dim3 gridSize(WIDTH / THREADS_PER_BLOCK_X, WIDTH / THREADS_PER_BLOCK_Y);
    dim3 blockSize(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y);

    for (int i = 0; i < 100; ++i){
        std::this_thread::sleep_for(std::chrono::milliseconds(5));
        // 启动转置内核
        matrixTranspose<<<gridSize, blockSize>>>(gpuTransposeMatrix, gpuMatrix, WIDTH);
        hipDeviceSynchronize();
    }

    // 检查内核启动错误
    hipError_t kernelLaunchError = hipGetLastError();
    if (kernelLaunchError != hipSuccess) {
        std::cerr << "Kernel launch failed: " << hipGetErrorString(kernelLaunchError) << std::endl;
        return -1;
    }

    roctxMark("after hipLaunchKernel");

    // 等待内核执行完成
    hipDeviceSynchronize();



    // 检查内核执行错误
    hipError_t kernelExecutionError = hipGetLastError();
    if (kernelExecutionError != hipSuccess) {
        std::cerr << "Kernel execution failed: " << hipGetErrorString(kernelExecutionError) << std::endl;
        return -1;
    }


    // 内存从设备传输到主机
    roctxRangePush("hipMemcpy DeviceToHost");
    hipMemcpy(TransposeMatrix.data(), gpuTransposeMatrix, NUM * sizeof(float), hipMemcpyDeviceToHost);


    roctxRangePop(); // for "hipMemcpy"
    roctxRangePop(); // for "hipLaunchKernel"
    roctxRangeStop(rangeId);

    bool correct = true;
    for (int i = 0; i < WIDTH && correct; ++i) {
        for (int j = 0; j < WIDTH; ++j) {
            if (TransposeMatrix[i * WIDTH + j] != Matrix[j * WIDTH + i]) {
                std::cerr << "Mismatch at (" << i << ", " << j << "): "
                          << TransposeMatrix[i * WIDTH + j] << " != " << Matrix[j * WIDTH + i] << std::endl;
                correct = false;
                break;
            }
        }
    }

    if (correct) {
        std::cout << "Matrix transpose completed successfully!" << std::endl;
    }

    // 释放设备内存
    hipFree(gpuMatrix);
    hipFree(gpuTransposeMatrix);

    return 0;
}