# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""Module of the GPU-Burn Test."""

import os

from superbench.common.utils import logger
from superbench.benchmarks import BenchmarkRegistry, Platform
from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke


class GpuBurnBenchmark(MicroBenchmarkWithInvoke):
    """The GPU Burn Test benchmark class."""
    def __init__(self, name, parameters=''):
        """Constructor.

        Args:
            name (str): benchmark name.
            parameters (str): benchmark parameters.
        """
        super().__init__(name, parameters)

        self._bin_name = 'gpu_burn'

    def add_parser_arguments(self):
        """Add the specified arguments."""
        super().add_parser_arguments()

        self._parser.add_argument(
            '--doubles',
            action='store_true',
            default=False,
            help='Use double-precision floats as the data type in GPU-Burn',
        )
        self._parser.add_argument(
            '--tensor_core',
            action='store_true',
            default=False,
            help='Use tensor cores in GPU-Burn',
        )
        self._parser.add_argument(
            '--time',
            type=int,
            default=10,
            help='Length of time to run GPU-Burn for (in seconds)',
        )
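
        # Example: parameters='--doubles --tensor_core --time 60' is composed into
        # 'gpu_burn -d -tc 60' by _preprocess() below.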

    def _preprocess(self):
        """Preprocess/preparation operations before the benchmarking.

        Return:
            True if _preprocess() succeed.
        """
        if not super()._preprocess():
            return False

        if not self._set_binary_path():
            return False

        command = os.path.join(self._args.bin_dir, self._bin_name)

        if self._args.doubles:
            command += ' -d'

        if self._args.tensor_core:
            command += ' -tc'
        command += ' {} '.format(self._args.time)
        # gpu_burn loads compare.ptx from the working directory, so copy it there first
        compare_copy = 'cp ' + os.path.join(self._args.bin_dir, 'compare.ptx') + ' ./'
        # remove compare.ptx from the working directory after the run
        compare_rm = 'rm compare.ptx'

        self._commands.append(compare_copy + ' && ' + command + ' && ' + compare_rm)
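
        # Illustrative composed command for '--doubles --tensor_core --time 10'
        # (paths depend on self._args.bin_dir):
        #   cp <bin_dir>/compare.ptx ./ && <bin_dir>/gpu_burn -d -tc 10 && rm compare.ptx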

        return True

    def _process_raw_result(self, cmd_idx, raw_output):    # noqa: C901
        """Function to parse raw results and save the summarized results.

           self._result.add_raw_data() and self._result.add_result() need to be called to save the results.

        Args:
            cmd_idx (int): the index of command corresponding with the raw_output.
            raw_output (str): raw output string of the micro-benchmark.

        Return:
            True if the raw output string is valid and result can be extracted.
        """
        content = raw_output.splitlines()
        gpu_res = []
        abort = False
        failure_msg = 'unknown failure'
        index = -1
        try:
            failure_patterns = (
                'No clients are alive!', "Couldn't init a GPU", 'Failure during compute', 'Low mem for result'
            )
            for idx, line in enumerate(content):
                if any(pattern in line for pattern in failure_patterns):
                    abort = True
                    failure_msg = line
                    break
                if 'done' in line:
                    index = idx
                    break

            if not abort:
                if index == -1:
                    # 'done' never appeared, so the run did not finish cleanly.
                    raise ValueError('The result format is invalid')

                # Skip the 'done' line and the blank line after it; the per-GPU
                # summary ('Tested N GPUs:' plus one line per GPU) starts there.
                content = content[index + 2:]

                for line in content:
                    if 'Tested' in line:
                        continue
                    if 'GPU' in line:
                        gpu_res.append(line.strip('\n').strip('\t'))

                self._result.add_result('time', self._args.time)
                for res in gpu_res:
                    gpu_name = res.split(':')[0].replace(' ', '_').lower()
                    self._result.add_result(gpu_name + '_pass', 1 if 'OK' in res else 0)
                    self._result.add_raw_data('GPU-Burn_result', res)
            else:
                self._result.add_raw_data('GPU-Burn_failure', failure_msg)
                self._result.add_result('abort', 1)
                return False
        except BaseException as e:
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, raw output: {}, message: {}.'.format(
                    self._curr_run_index, self._name, raw_output, str(e)
                )
            )
            self._result.add_result('abort', 1)
            return False
        self._result.add_result('abort', 0)
        return True


BenchmarkRegistry.register_benchmark('gpu-burn', GpuBurnBenchmark, platform=Platform.CUDA)
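
# A minimal usage sketch (assumes SuperBench's standard context/launch flow;
# create_benchmark_context() and launch_benchmark() are the registry helpers
# used by SuperBench examples, and the parameters are this benchmark's own):
#
#   context = BenchmarkRegistry.create_benchmark_context(
#       'gpu-burn', platform=Platform.CUDA, parameters='--doubles --tensor_core --time 10'
#   )
#   benchmark = BenchmarkRegistry.launch_benchmark(context)
#   if benchmark:
#       logger.info(
#           'benchmark: {}, return_code: {}, result: {}'.format(
#               benchmark.name, benchmark.return_code, benchmark.result
#           )
#       )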