# gpu_burn_test.py
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""Module of the GPU-Burn Test."""

import os

from superbench.common.utils import logger
from superbench.benchmarks import BenchmarkRegistry, Platform
from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke


class GpuBurnBenchmark(MicroBenchmarkWithInvoke):
    """The GPU Burn Test benchmark class.

    Builds the shell command that runs the ``gpu_burn`` binary and turns the
    text it prints into one ``<gpu>_pass`` metric per reported GPU plus an
    overall ``abort`` flag.
    """

    # Substrings of an output line that mark the whole run as failed.
    _FAILURE_MARKERS = (
        'No clients are alive!',
        "Couldn't init a GPU",
        'Failure during compute',
        'Low mem for result',
    )

    def __init__(self, name, parameters=''):
        """Constructor.

        Args:
            name (str): benchmark name.
            parameters (str): benchmark parameters.
        """
        super().__init__(name, parameters)

        self._bin_name = 'gpu_burn'

    def add_parser_arguments(self):
        """Add the specified arguments."""
        super().add_parser_arguments()

        self._parser.add_argument(
            '--doubles',
            action='store_true',
            default=False,
            help='Use doubles for the data type used in GPU-Burn',
        )
        self._parser.add_argument(
            '--tensor_core',
            action='store_true',
            default=False,
            help='Use tensor cores in GPU-Burn',
        )
        self._parser.add_argument(
            '--time',
            type=int,
            default=10,
            help='Length of time to run GPU-Burn for(in seconds)',
        )

    def _preprocess(self):
        """Preprocess/preparation operations before the benchmarking.

        Appends a single shell command to self._commands that copies
        compare.ptx into the working directory, runs gpu_burn with the
        selected options, and removes the copied file again.

        Return:
            True if _preprocess() succeed.
        """
        if not super()._preprocess():
            return False

        if not self._set_binary_path():
            return False

        command = os.path.join(self._args.bin_dir, self._bin_name)

        if self._args.doubles:
            command += ' -d'

        if self._args.tensor_core:
            command += ' -tc'

        # The run time in seconds is passed as the trailing positional argument.
        command += ' {} '.format(self._args.time)

        # copy compare.ptx which needs to be in the working directory
        compare_copy = 'cp ' + self._args.bin_dir + '/compare.ptx ./'
        # remove compare.ptx from working directory
        compare_rm = 'rm ' + 'compare.ptx'

        # NOTE: the steps are chained with '&&', so the cleanup step only runs
        # when gpu_burn itself exits successfully.
        self._commands.append(compare_copy + ' && ' + command + ' && ' + compare_rm)

        return True

    def _process_raw_result(self, cmd_idx, raw_output):    # noqa: C901
        """Function to parse raw results and save the summarized results.

           self._result.add_raw_data() and self._result.add_result() need to be called to save the results.

        The output is scanned line by line. The first line containing one of
        the known failure markers aborts the run; otherwise the first line
        containing 'done' marks the start of the per-GPU summary. Every summary
        line that mentions 'GPU' (except the 'Tested ...' header) becomes a
        '<gpu>_pass' metric that is 1 when the line contains 'OK' and 0 otherwise.

        Args:
            cmd_idx (int): the index of command corresponding with the raw_output (not used here).
            raw_output (str): raw output string of the micro-benchmark.

        Return:
            True if the raw output string is valid and result can be extracted.
        """
        content = raw_output.splitlines()
        failure_msg = None
        done_idx = None
        try:
            for idx, line in enumerate(content):
                if any(marker in line for marker in self._FAILURE_MARKERS):
                    failure_msg = line
                    break
                if 'done' in line:
                    done_idx = idx
                    break

            if failure_msg is not None:
                # gpu_burn reported a failure itself: keep the offending line as raw data.
                self._result.add_raw_data('GPU Burn Failure: ', failure_msg, self._args.log_raw_data)
                self._result.add_result('abort', 1)
                return False

            if done_idx is None:
                # Neither a failure marker nor the 'done' line was found (this
                # includes empty output). Raise a real exception so that the
                # intended message reaches the error log below.
                raise ValueError('The result format invalid')

            self._result.add_result('time', self._args.time)
            # The summary starts two lines after the 'done' line; the line in
            # between is skipped (presumably a separator - confirm against
            # actual gpu_burn output).
            for line in content[done_idx + 2:]:
                if 'Tested' in line or 'GPU' not in line:
                    continue
                res = line.strip('\n').strip('\t')
                # e.g. 'GPU 0: OK' -> metric name 'gpu_0_pass'
                metric = res.split(':')[0].replace(' ', '_').lower() + '_pass'
                self._result.add_result(metric, 1 if 'OK' in res else 0)
                self._result.add_raw_data('GPU-Burn_result', res, self._args.log_raw_data)
        except Exception as e:
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, raw output: {}, message: {}.'.format(
                    self._curr_run_index, self._name, raw_output, str(e)
                )
            )
            self._result.add_result('abort', 1)
            return False
        self._result.add_result('abort', 0)
        return True


# Register GpuBurnBenchmark under the name 'gpu-burn' with platform set to CUDA.
BenchmarkRegistry.register_benchmark(
    'gpu-burn',
    GpuBurnBenchmark,
    platform=Platform.CUDA,
)