// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.

#include "common.hpp"

#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp"

// Aliases are grouped per tensor (A, B, C) rather than per category.
// NOTE(review): Row, Col and PassThrough are not declared in this file;
// presumably they are provided by common.hpp -- confirm.

// A-side aliases: element type, layout, element-wise operation.
using ADataType  = ck::half_t;
using ALayout    = Row;
using AElementOp = PassThrough;

// B-side aliases: same element type and element-wise operation as A, but Col layout.
using BDataType  = ck::half_t;
using BLayout    = Col;
using BElementOp = PassThrough;

// C-side aliases.
using CDataType  = ck::half_t;
using CLayout    = Row;
using CElementOp = PassThrough;

// Accumulator and C-shuffle element types; both are float while A, B and C use ck::half_t.
using AccDataType      = float;
using CShuffleDataType = float;

zjing14's avatar
zjing14 committed
22
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
23
24
25

// Concrete DeviceGemmWmma_CShuffle instantiation for this file, built from the layout,
// data-type and element-op aliases and the GemmDefault specialization declared above,
// followed by integer / sequence tuning arguments. Trailing comments name the argument
// on the same line; arguments without a trailing comment are covered by the NOTE(review)
// blocks below.
// clang-format off
using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle
         < ALayout,
           BLayout,
           CLayout,
           ADataType,
           BDataType,
           CDataType,
           AccDataType,
           CShuffleDataType,
           AElementOp,
           BElementOp,
           CElementOp,
           GemmDefault,
           1,           // Prefetch stage
           128,         // BlockSize
           64,          // MPerBlock
           128,         // NPerBlock
           64,          // KPerBlock
           2,           // K1
           16,          // MPerWmma
           16,          // NPerWmma
           // NOTE(review): the formulas in the two comments below look imprecise. With the
           // values in this list, 64 / (16 * 2) = 2 and 128 / (16 * 4) = 2, i.e. presumably
           // MPerBlock / (MPerWmma * MRepeat) and NPerBlock / (NPerWmma * NRepeat) -- confirm.
           2,           // M-Repeat // M-PerWmma / M-Repeat = M-Wave
           4,           // N-Repeat // N-PerWmma / N-Repeat = N-Wave
           // NOTE(review): the next seven arguments appear to be the A-side block-transfer
           // settings (thread cluster lengths, cluster arrange order, source access order,
           // source vector dim, source scalars per vector, destination scalars per vector,
           // extra-LDS-padding flag) -- confirm against the DeviceGemmWmma_CShuffle
           // parameter list in device_gemm_wmma.hpp.
           S<4, 32, 1>,
           S<1, 0, 2>,
           S<1, 0, 2>,
           2,
           2,
           2,
           true,
           // NOTE(review): same seven settings, presumably for the B side -- confirm.
           S<4, 32, 1>,
           S<1, 0, 2>,
           S<1, 0, 2>,
           2,
           2,
           2,
           true,
           1,           // C shuffle (M Repeat) Per store
           1,           // C shuffle (N Repeat) Per store
           // NOTE(review): the last two arguments are presumably the C-shuffle block-transfer
           // cluster lengths and its scalars-per-vector count -- confirm.
           S<1, 32, 1,  4>,
           8>;
// clang-format on

// Reference GEMM type from the ck::tensor_operation::host namespace, instantiated with the
// same data-type and element-op aliases as DeviceGemmInstance (it takes no layout arguments).
// NOTE(review): presumably consumed by run_gemm_example.inc to check the device result -- confirm.
using ReferenceGemmInstance = ck::tensor_operation::host::
    ReferenceGemm<ADataType, BDataType, CDataType, AccDataType, AElementOp, BElementOp, CElementOp>;

71
72
73
74
75
76
77
78
79
80
81
using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm<ALayout,
                                                                             BLayout,
                                                                             CLayout,
                                                                             ADataType,
                                                                             BDataType,
                                                                             CDataType,
                                                                             AccDataType,
                                                                             AElementOp,
                                                                             BElementOp,
                                                                             CElementOp>;

// Textual include placed after every alias in this file has been declared.
// NOTE(review): presumably this .inc defines run_gemm_example in terms of those aliases,
// which is why it cannot move to the top of the file -- confirm.
#include "run_gemm_example.inc"

// Entry point: forwards the command line to run_gemm_example and returns the logical
// negation of its result, so a truthy result yields exit status 0 and a falsy result 1.
int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); }