gemm_xdl.cpp 8.24 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include <half.hpp>
#include "config.hpp"
#include "print.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "host_gemm.hpp"
#include "device_tensor.hpp"
rocking5566's avatar
rocking5566 committed
14
#include "device_gemm_xdl_c_shuffle.hpp"
Chao Liu's avatar
Chao Liu committed
15
#include "element_wise_operation.hpp"
Chao Liu's avatar
Chao Liu committed
16
#include "reference_gemm.hpp"
Chao Liu's avatar
Chao Liu committed
17

Chao Liu's avatar
Chao Liu committed
18
19
20
21
22
23
24
25
26
27
28
29
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;

using ADataType   = ck::half_t;
using BDataType   = ck::half_t;
using CDataType   = ck::half_t;
using AccDataType = float;

using ALayout = ck::tensor_layout::gemm::RowMajor;
using BLayout = ck::tensor_layout::gemm::ColumnMajor;
using CLayout = ck::tensor_layout::gemm::RowMajor;

Chao Liu's avatar
Chao Liu committed
30
31
32
using AElementOp = ck::tensor_operation::element_wise::PassThrough;
using BElementOp = ck::tensor_operation::element_wise::PassThrough;
using CElementOp = ck::tensor_operation::element_wise::PassThrough;
Chao Liu's avatar
Chao Liu committed
33
34

// clang-format off
rocking5566's avatar
rocking5566 committed
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl_C_Shuffle<
    ADataType,              // ADataType
    BDataType,              // BDataType
    CDataType,              // CDataType
    AccDataType,            // AccDataType
    ALayout,                // ALayout
    BLayout,                // BLayout
    CLayout,                // CLayout
    AElementOp,             // AElementwiseOperation
    BElementOp,             // BElementwiseOperation
    CElementOp,             // CElementwiseOperation
    256,                    // BlockSize
    256,                    // MPerBlock
    128,                    // NPerBlock
    4,                      // K0PerBlock
    8,                      // K1
    32,                     // MPerXDL
    32,                     // NPerXDL
    4,                      // MXdlPerWave
    2,                      // NXdlPerWave
    S<4, 64, 1>,            // ABlockTransferThreadClusterLengths_K0_M_K1
    S<1, 0, 2>,             // ABlockTransferThreadClusterArrangeOrder
    S<1, 0, 2>,             // ABlockTransferSrcAccessOrder
    2,                      // ABlockTransferSrcVectorDim
    8,                      // ABlockTransferSrcScalarPerVector
    8,                      // ABlockTransferDstScalarPerVector_K1
    true,                   // ABlockLdsAddExtraM
    S<4, 64, 1>,            // BBlockTransferThreadClusterLengths_K0_N_K1
    S<1, 0, 2>,             // BBlockTransferThreadClusterArrangeOrder
    S<1, 0, 2>,             // BBlockTransferSrcAccessOrder
    2,                      // BBlockTransferSrcVectorDim
    8,                      // BBlockTransferSrcScalarPerVector
    8,                      // BBlockTransferDstScalarPerVector_K1
    true,                   // BBlockLdsAddExtraN
    1,                      // CShuffleMXdlPerWavePerShuffle
    1,                      // CShuffleNXdlPerWavePerShuffle
    S<1, 1, 32, 1, 1, 8>,   // CBlockTransferClusterLengths_MBlock_MXdlPerWave_MWaveMPerXdl_NBlock_NXdlPerWave_NWaveNPerXdl
    8>;                     // CBlockTransferScalarPerVector_NWaveNPerXdl
Chao Liu's avatar
Chao Liu committed
73
74
// clang-format on

Chao Liu's avatar
Chao Liu committed
75
76
using ReferenceGemmInstance = ck::tensor_operation::host::
    ReferenceGemm<ADataType, BDataType, CDataType, AElementOp, BElementOp, CElementOp>;
77
78
79

int main(int argc, char* argv[])
{
Chao Liu's avatar
Chao Liu committed
80
81
82
    bool do_verification = 0;
    int init_method      = 0;
    int nrepeat          = 5;
83
84
85
86
87
88
89
90
91
92

    // GEMM shape
    ck::index_t M = 3840;
    ck::index_t N = 4096;
    ck::index_t K = 4096;

    ck::index_t StrideA = 4096;
    ck::index_t StrideB = 4096;
    ck::index_t StrideC = 4096;

Chao Liu's avatar
Chao Liu committed
93
94
    if(argc == 4)
    {
95
96
97
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        nrepeat         = std::stoi(argv[3]);
Chao Liu's avatar
Chao Liu committed
98
99
100
101
102
103
104
105
106
107
    }
    else if(argc == 10)
    {
        do_verification = std::stoi(argv[1]);
        init_method     = std::stoi(argv[2]);
        nrepeat         = std::stoi(argv[3]);

        M = std::stoi(argv[4]);
        N = std::stoi(argv[5]);
        K = std::stoi(argv[6]);
108

Chao Liu's avatar
Chao Liu committed
109
110
111
112
113
114
115
116
117
118
119
120
        StrideA = std::stoi(argv[7]);
        StrideB = std::stoi(argv[8]);
        StrideC = std::stoi(argv[9]);
    }
    else
    {
        printf("arg1: verification (0=no, 1=yes)\n");
        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
        printf("arg3: run kernel # of times (>1)\n");
        printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
        exit(0);
    }
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148

    auto f_host_tensor_descriptor =
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
            {
                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
                                            std::vector<std::size_t>({stride, 1}));
            }
            else
            {
                return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
                                            std::vector<std::size_t>({1, stride}));
            }
        };

    Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
    Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
    Tensor<BDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
    Tensor<BDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));

    std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
    std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;

    switch(init_method)
    {
    case 0: break;
    case 1:
149
150
        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
151
152
        break;
    default:
153
154
        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
155
156
157
158
159
160
161
162
163
    }

    DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
    DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
    DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());

    a_m_k_device_buf.ToDevice(a_m_k.mData.data());
    b_k_n_device_buf.ToDevice(b_k_n.mData.data());

Chao Liu's avatar
Chao Liu committed
164
165
166
167
    auto a_element_op = AElementOp{};
    auto b_element_op = BElementOp{};
    auto c_element_op = CElementOp{};

168
    // do GEMM
Chao Liu's avatar
Chao Liu committed
169
    auto gemm     = DeviceGemmInstance{};
170
171
172
173
174
175
176
177
178
    auto invoker  = gemm.MakeInvoker();
    auto argument = gemm.MakeArgument(static_cast<ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
                                      static_cast<BDataType*>(b_k_n_device_buf.GetDeviceBuffer()),
                                      static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()),
                                      M,
                                      N,
                                      K,
                                      StrideA,
                                      StrideB,
Chao Liu's avatar
Chao Liu committed
179
                                      StrideC,
Chao Liu's avatar
Chao Liu committed
180
181
182
                                      a_element_op,
                                      b_element_op,
                                      c_element_op);
183
184
185
186
187
188
189
190
191
192
193
194

    if(!gemm.IsSupportedArgument(argument))
    {
        throw std::runtime_error(
            "wrong! device_gemm with the specified compilation parameters does "
            "not support this GEMM problem");
    }

    float ave_time = invoker.Run(argument, nrepeat);

    std::size_t flop = std::size_t(2) * M * N * K;
    std::size_t num_btype =
Chao Liu's avatar
Chao Liu committed
195
        sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
196
197
198
199
200
201
202
203
204
205
206
207

    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;

    float gb_per_sec = num_btype / 1.E6 / ave_time;

    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
              << std::endl;

    c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data());

    if(do_verification)
    {
Chao Liu's avatar
Chao Liu committed
208
209
210
211
212
213
214
        auto ref_gemm    = ReferenceGemmInstance{};
        auto ref_invoker = ref_gemm.MakeInvoker();

        auto ref_argument = ref_gemm.MakeArgument(
            a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);

        ref_invoker.Run(ref_argument);
215
216
217

        check_error(c_m_n_host_result, c_m_n_device_result);
    }
Jianfeng Yan's avatar
Jianfeng Yan committed
218
219

    return 0;
220
}