# gemm_flops_performance.py
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""Module of the FLOPs performance benchmarks."""

import os

from superbench.common.utils import logger
from superbench.common.utils import nv_helper
from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode
from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke


class GemmFlopsCuda(MicroBenchmarkWithInvoke):
    """The GEMM FLOPs performance benchmark class.

    Builds one cutlass_profiler command per precision to run and, for each command,
    records the maximum value found in the last comma-separated field of the output
    lines that belong to a known kernel.
    """
    def __init__(self, name, parameters=''):
        """Constructor.

        Args:
            name (str): benchmark name.
            parameters (str): benchmark parameters.
        """
        super().__init__(name, parameters)

        self._bin_name = 'cutlass_profiler'

        # Maps compute capability -> {precision name -> kernel name pattern passed via --kernels}.
        # TODO - To support more architectures, currently only support compute capability = 7.0 and 8.0
        self.__kernel_map = {
            7.0: {
                'FP64': 'cutlass_simt_dgemm_128x128_8x2_*',
                'FP32': 'cutlass_simt_sgemm_128x128_8x2_*',
                'FP16': 'cutlass_simt_hgemm_256x128_8x2_*',
                'FP16_TC': 'cutlass_tensorop_h884gemm_256x128_32x2_*',
            },
            8.0: {
                'FP64': 'cutlass_simt_dgemm_128x128_8x2_*',
                'FP32': 'cutlass_simt_sgemm_128x128_8x2_*',
                'FP16': 'cutlass_simt_hgemm_256x128_8x2_*',
                'FP64_TC': 'cutlass_tensorop_d884gemm_128x128_16x3_*',
                'TF32_TC': 'cutlass_tensorop_tf32_s1688gemm_tf32_256x128_16x3_*',
                'BF16_TC': 'cutlass_tensorop_bf16_s16816gemm_bf16_256x128_32x3_*',
                'FP16_TC': 'cutlass_tensorop_h16816gemm_256x128_32x3_*',
                'INT8_TC': 'cutlass_tensorop_s8_i16832gemm_s8_256x128_64x3_*',
                'INT4_TC': 'cutlass_tensorop_s4_i16864gemm_s4_256x128_128x3_*',
            }
        }

    def add_parser_arguments(self):
        """Add the specified arguments."""
        super().add_parser_arguments()

        self._parser.add_argument(
            '--num_warmup',
            type=int,
            default=5,
            required=False,
            help='The number of warmup step.',
        )
        self._parser.add_argument(
            '--n',
            type=int,
            default=16384,
            required=False,
            help='The N dim of matmul (N, K) * (K, M).',
        )
        self._parser.add_argument(
            '--k',
            type=int,
            default=16384,
            required=False,
            help='The K dim of matmul (N, K) * (K, M).',
        )
        self._parser.add_argument(
            '--m',
            type=int,
            default=16384,
            required=False,
            help='The M dim of matmul (N, K) * (K, M).',
        )
        # An empty precision list means "run every precision of the detected capability".
        self._parser.add_argument(
            '--precision',
            type=str,
            nargs='+',
            default=list(),
            help='Precision for benchmarking. E.g. {}.'.format(' '.join(self.__kernel_map[8.0])),
        )

    def _preprocess(self):
        """Preprocess/preparation operations before the benchmarking.

        Checks that the device compute capability is supported, resolves the list of
        precisions to run and appends one command per precision to self._commands.

        Return:
            True if _preprocess() succeed.
        """
        if not super()._preprocess():
            return False

        # Reset kernels according to compute capability.
        capability = nv_helper.get_device_compute_capability()
        if capability not in self.__kernel_map:
            self._result.set_return_code(ReturnCode.MICROBENCHMARK_UNSUPPORTED_ARCHITECTURE)
            logger.error(
                'Unsupported architecture - benchmark: {}, compute capability: {}, expected: 7.0 or 8.0'.format(
                    self._name, capability
                )
            )
            return False

        supported_kernels = self.__kernel_map[capability]

        self.__precision_need_to_run = list()
        if len(self._args.precision) == 0:
            self.__precision_need_to_run = list(supported_kernels.keys())
        else:
            self._args.precision = [p.upper() for p in self._args.precision]
            for p in self._args.precision:
                # Validate against the precisions of the detected capability; the top-level
                # keys of the kernel map are compute capabilities, not precision names.
                if p not in supported_kernels:
                    self._result.set_return_code(ReturnCode.INVALID_ARGUMENT)
                    logger.warning(
                        'Unsupported precision - benchmark: {}, precision: {}, expected: {}.'.format(
                            self._name, p, list(supported_kernels.keys())
                        )
                    )
                else:
                    self.__precision_need_to_run.append(p)

        if len(self.__precision_need_to_run) == 0:
            self._result.set_return_code(ReturnCode.NO_SUPPORTED_PRECISION)
            return False

        # Commands are appended in the same order as __precision_need_to_run so that
        # _process_raw_result() can map a command index back to its precision.
        binary = os.path.join(self._args.bin_dir, self._bin_name)
        for p in self.__precision_need_to_run:
            command = ' '.join(
                [
                    binary,
                    '--warmup-iterations=' + str(self._args.num_warmup),
                    '--operation=gemm',
                    '--n=' + str(self._args.n),
                    '--k=' + str(self._args.k),
                    '--m=' + str(self._args.m),
                    '--kernels=' + supported_kernels[p],
                ]
            )
            self._commands.append(command)

        return True

    def _process_raw_result(self, cmd_idx, raw_output):
        """Function to parse raw results and save the summarized results.

          self._result.add_raw_data() and self._result.add_result() need to be called to save the results.

        Args:
            cmd_idx (int): the index of command corresponding with the raw_output.
            raw_output (str): raw output string of the micro-benchmark.

        Return:
            True if the raw output string is valid and result can be extracted.
        """
        precision = self.__precision_need_to_run[cmd_idx]
        self._result.add_raw_data('raw_output_' + precision, raw_output)

        # Result lines are recognised by 'gemm,<kernel name>', where the kernel name is the
        # kernel pattern without its trailing '_*' wildcard. Deriving the markers from the
        # kernel map keeps the parser in sync with the kernels that can be launched.
        markers = tuple(
            'gemm,' + (kernel[:-2] if kernel.endswith('_*') else kernel)
            for kernels in self.__kernel_map.values()
            for kernel in kernels.values()
        )

        valid = True
        flops = list()
        try:
            for line in raw_output.splitlines():
                if any(marker in line for marker in markers):
                    # The value of interest is the last comma-separated field of the line.
                    flops.append(float(line.split(',')[-1]))
        except ValueError:
            # The last field of a matching line is not a number.
            valid = False

        if not valid or len(flops) == 0:
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, raw output: {}.'.format(
                    self._curr_run_index, self._name, raw_output
                )
            )
            return False

        self._result.add_result(precision, max(flops))

        return True


# Register GemmFlopsCuda under the benchmark name 'gemm-flops' for the CUDA platform.
BenchmarkRegistry.register_benchmark('gemm-flops', GemmFlopsCuda, platform=Platform.CUDA)