profile_gemm_add_add_fastgelu.cpp 4.36 KB
Newer Older
Chao Liu's avatar
Chao Liu committed
1
2
3
4
5
6
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>

Chao Liu's avatar
rename  
Chao Liu committed
7
#include "profile_gemm_add_add_fastgelu_impl.hpp"
Chao Liu's avatar
Chao Liu committed
8

Chao Liu's avatar
rename  
Chao Liu committed
9
int profile_gemm_add_add_fastgelu(int argc, char* argv[])
Chao Liu's avatar
Chao Liu committed
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
{
    enum struct GemmMatrixLayout
    {
        MK_KN_MN, // 0
        MK_NK_MN, // 1
        KM_KN_MN, // 2
        KM_NK_MN, // 3
        MK_KN_NM, // 4
        MK_NK_NM, // 5
        KM_KN_NM, // 6
        KM_NK_NM, // 7
    };

    enum struct GemmDataType
    {
        F32_F32_F32,    // 0
        F16_F16_F16,    // 1
        BF16_BF16_BF16, // 2
        INT8_INT8_INT8, // 3
    };

Chao Liu's avatar
rename  
Chao Liu committed
31
    if(argc != 16)
Chao Liu's avatar
Chao Liu committed
32
    {
Chao Liu's avatar
rename  
Chao Liu committed
33
34
        // clang-format off
        printf("arg1: tensor operation (gemm_gelu: GEMM+Add+Add+GeLU)\n");
Chao Liu's avatar
Chao Liu committed
35
        printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n");
Chao Liu's avatar
rename  
Chao Liu committed
36
37
38
39
        printf("arg3: matrix layout (0: E[m, n] = FastGeLU(A[m, k] * B[k, n] + D0[m, n] + D1[m, n]);\n");
        printf("                     1: E[m, n] = FastGeLU(A[m, k] * B[n, k] + D0[m, n] + D1[m, n]);\n");
        printf("                     2: E[m, n] = FastGeLU(A[k, m] * B[k, n] + D0[m, n] + D1[m, n]);\n");
        printf("                     3: E[m, n] = FastGeLU(A[k, m] * B[n, k] + D0[m, n] + D1[m, n]))\n");
Chao Liu's avatar
Chao Liu committed
40
41
42
43
        printf("arg4: verification (0: no; 1: yes)\n");
        printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
        printf("arg6: print tensor value (0: no; 1: yes)\n");
        printf("arg7: time kernel (0=n0, 1=yes)\n");
Chao Liu's avatar
rename  
Chao Liu committed
44
45
        printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC, StrideD0, StrideD1\n");
        // clang-format on
Chao Liu's avatar
Chao Liu committed
46
47
48
49
50
51
52
53
54
55
56
57
58
59
        exit(1);
    }

    const auto data_type       = static_cast<GemmDataType>(std::stoi(argv[2]));
    const auto layout          = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
    const bool do_verification = std::stoi(argv[4]);
    const int init_method      = std::stoi(argv[5]);
    const bool do_log          = std::stoi(argv[6]);
    const bool time_kernel     = std::stoi(argv[7]);

    const int M = std::stoi(argv[8]);
    const int N = std::stoi(argv[9]);
    const int K = std::stoi(argv[10]);

Chao Liu's avatar
rename  
Chao Liu committed
60
61
62
63
64
    const int StrideA  = std::stoi(argv[11]);
    const int StrideB  = std::stoi(argv[12]);
    const int StrideC  = std::stoi(argv[13]);
    const int StrideD0 = std::stoi(argv[14]);
    const int StrideD1 = std::stoi(argv[15]);
Chao Liu's avatar
Chao Liu committed
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120

    using F16 = ck::half_t;

    using Row = ck::tensor_layout::gemm::RowMajor;
    using Col = ck::tensor_layout::gemm::ColumnMajor;

    auto profile =
        [&](auto a_type, auto b_type, auto c_type, auto a_layout, auto b_layout, auto c_layout) {
            using ADataType = decltype(a_type);
            using BDataType = decltype(b_type);
            using CDataType = decltype(c_type);
            using ALayout   = decltype(a_layout);
            using BLayout   = decltype(b_layout);
            using CLayout   = decltype(c_layout);

            const int DefaultStrideA = ck::is_same_v<ALayout, Row> ? K : M;
            const int DefaultStrideB = ck::is_same_v<BLayout, Row> ? N : K;
            const int DefaultStrideC = ck::is_same_v<CLayout, Row> ? N : M;

            return ck::profiler::
                profile_gemm_gelu_impl<ADataType, BDataType, CDataType, ALayout, BLayout, CLayout>(
                    do_verification,
                    init_method,
                    do_log,
                    time_kernel,
                    M,
                    N,
                    K,
                    (StrideA < 0) ? DefaultStrideA : StrideA,
                    (StrideB < 0) ? DefaultStrideB : StrideB,
                    (StrideC < 0) ? DefaultStrideC : StrideC);
        };

    if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
    {
        return profile(F16{}, F16{}, F16{}, Row{}, Row{}, Row{});
    }
    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
    {
        return profile(F16{}, F16{}, F16{}, Row{}, Col{}, Row{});
    }
    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_KN_MN)
    {
        return profile(F16{}, F16{}, F16{}, Col{}, Row{}, Row{});
    }
    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::KM_NK_MN)
    {
        return profile(F16{}, F16{}, F16{}, Col{}, Col{}, Row{});
    }
    else
    {
        std::cout << "this data_type & layout is not implemented" << std::endl;

        return 0;
    }
}