gemm_wmma_fp16_rrr.cpp 5.69 KB
Newer Older
aska-0096's avatar
aska-0096 committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.

#include "common.hpp"

#include "ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp"

// Operand A: element type, memory layout, and element-wise operator.
using ADataType  = ck::half_t;
using ALayout    = Row;
using AElementOp = PassThrough;

// Operand B: element type, memory layout, and element-wise operator.
using BDataType  = ck::half_t;
using BLayout    = Row;
using BElementOp = PassThrough;

// Output C: element type, memory layout, and element-wise operator.
using CDataType  = ck::half_t;
using CLayout    = Row;
using CElementOp = PassThrough;

// Wider (float) types handed to the device template for the accumulator and
// for the C-shuffle data type.
using AccDataType      = float;
using CShuffleDataType = float;

// GEMM specialization passed to every device instance in this file.
// NOTE(review): Row and PassThrough are not declared in this file; presumably
// they are provided by common.hpp -- confirm.
static constexpr auto GemmDefault =
    ck::tensor_operation::device::GemmSpecialization::Default;

// Candidate device kernel configurations for this example, gathered in one std::tuple.
//
// Every entry is an instantiation of ck::tensor_operation::device::DeviceGemmWmma_CShuffle
// that shares the layouts, data types, element-wise operators and the GemmDefault
// specialization declared earlier in this file. The entries differ only in the numeric
// tuning arguments that follow GemmDefault.
//
// There are eleven entries: the first eight pass 256 as the first integer argument and use
// S<4, 64, 1> as the leading sequence of both transfer rows; the last three pass 128 and use
// S<4, 32, 1>.
//
// NOTE(review): the meaning of each positional argument (presumably block size, per-block
// M/N/K tile sizes, WMMA tile sizes, repeat counts, then A/B block-transfer and C-shuffle
// descriptors) is fixed by the template declared in device_gemm_wmma.hpp and is not visible
// here -- confirm against that header before editing any value.
// NOTE(review): how the tuple is consumed is decided by run_gemm_example.inc, which is
// included further down; the order of entries may therefore matter -- confirm before
// reordering, adding, or removing entries.
// clang-format off
using DeviceGemmInstances = std::tuple<
// RRR Gemm AIT
// (RRR: ALayout, BLayout and CLayout are all Row in this file.)
    ck::tensor_operation::device::DeviceGemmWmma_CShuffle
         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  
          AElementOp,  BElementOp,  CElementOp,    GemmDefault,   256,   
          128, 256, 8, 8, 16, 16, 4, 4,
          S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
          S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true,
          1, 1, S<1, 32, 1, 8>, 8, 1>,
    ck::tensor_operation::device::DeviceGemmWmma_CShuffle
         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  
          AElementOp,  BElementOp,  CElementOp,    GemmDefault,   256,   
          256, 128, 8, 8, 16, 16, 8, 2,
          S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
          S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true,
          1, 1, S<1, 32, 1, 8>, 8, 1>,
    ck::tensor_operation::device::DeviceGemmWmma_CShuffle
         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  
          AElementOp,  BElementOp,  CElementOp,    GemmDefault,   256,   
          128, 256, 4, 8, 16, 16, 4, 4,
          S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
          S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true,
          1, 1, S<1, 32, 1, 8>, 8, 1>,
    ck::tensor_operation::device::DeviceGemmWmma_CShuffle
         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  
          AElementOp,  BElementOp,  CElementOp,    GemmDefault,   256,   
          256, 128, 4, 8, 16, 16, 8, 2,
          S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
          S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true,
          1, 1, S<1, 32, 1, 8>, 8, 1>,
    ck::tensor_operation::device::DeviceGemmWmma_CShuffle
         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  
          AElementOp,  BElementOp,  CElementOp,    GemmDefault,   256,   
          128, 128, 8, 8, 16, 16, 4, 2,
          S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
          S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true,
          1, 1, S<1, 32, 1, 8>, 8, 1>,
    ck::tensor_operation::device::DeviceGemmWmma_CShuffle
         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  
          AElementOp,  BElementOp,  CElementOp,    GemmDefault,   256,   
          128, 128, 4, 8, 16, 16, 4, 2,
          S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
          S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true,
          1, 1, S<1, 32, 1, 8>, 8, 1>,
    ck::tensor_operation::device::DeviceGemmWmma_CShuffle
         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  
          AElementOp,  BElementOp,  CElementOp,    GemmDefault,   256,   
          256, 64, 8, 8, 16, 16, 8, 1,
          S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
          S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true,
          1, 1, S<1, 32, 1, 8>, 8, 1>,
    ck::tensor_operation::device::DeviceGemmWmma_CShuffle
         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  
          AElementOp,  BElementOp,  CElementOp,    GemmDefault,   256,   
          64, 256, 8, 8, 16, 16, 2, 4,
          S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
          S<4, 64, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true,
          1, 1, S<1, 32, 1, 8>, 8, 1>,
    ck::tensor_operation::device::DeviceGemmWmma_CShuffle
         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  
          AElementOp,  BElementOp,  CElementOp,    GemmDefault,   128,   
          128, 128, 8, 8, 16, 16, 8, 2,
          S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
          S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true,
          1, 1, S<1, 16, 1, 8>, 8, 1>,
    ck::tensor_operation::device::DeviceGemmWmma_CShuffle
         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  
          AElementOp,  BElementOp,  CElementOp,    GemmDefault,   128,   
          128, 64, 8, 8, 16, 16, 4, 2,
          S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
          S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true,
          1, 1, S<1, 32, 1, 4>, 8, 1>,
    ck::tensor_operation::device::DeviceGemmWmma_CShuffle
         < ALayout, BLayout, CLayout, ADataType, BDataType, CDataType, AccDataType, CShuffleDataType,  
          AElementOp,  BElementOp,  CElementOp,    GemmDefault,   128,   
          64, 128, 8, 8, 16, 16, 4, 2,
          S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true,
          S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>, 1, 2, 8, true,
          1, 1, S<1, 16, 1, 8>, 8, 1>
>;
// clang-format on

// Host-namespace reference GEMM type, instantiated with the same element types and
// element-wise operators that the device instances in this file use.
// NOTE(review): presumably consumed by run_gemm_example.inc to check device results --
// confirm.
using ReferenceGemmInstance =
    ck::tensor_operation::host::ReferenceGemm<ADataType,
                                              BDataType,
                                              CDataType,
                                              AccDataType,
                                              AElementOp,
                                              BElementOp,
                                              CElementOp>;

#include "run_gemm_example.inc"

// Program entry point: forwards the command-line arguments to run_gemm_example and maps
// its result onto the process exit status (0 when the result is truthy, 1 otherwise).
int main(int argc, char* argv[])
{
    if(run_gemm_example(argc, argv))
    {
        return 0;
    }
    return 1;
}