# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""Module for running the University of Virginia STREAM tool. It measures sustainable main memory \
    bandwidth in MB/s and the corresponding computation rate for simple vector kernels."""

import os

from superbench.common.utils import logger
from superbench.benchmarks import BenchmarkRegistry
from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke


class CpuStreamBenchmark(MicroBenchmarkWithInvoke):
    """Benchmark class wrapping the University of Virginia STREAM memory-bandwidth tool."""
    def __init__(self, name, parameters=''):
        """Constructor.

        Args:
            name (str): benchmark name.
            parameters (str): benchmark parameters.
        """
        super().__init__(name, parameters)

        # Generic binary by default; _preprocess swaps in an arch-specific build.
        self._bin_name = 'stream'
        # Accepted values for the --cpu_arch argument.
        self.__cpu_arch = ['other', 'zen3', 'zen4', 'neo2']

    def add_parser_arguments(self):
        """Add the benchmark-specific command line arguments.

        Registers --cpu_arch (binary selection), --cores (thread pinning list)
        and --numa_mem_nodes (numactl memory binding) on the inherited parser.
        """
        super().add_parser_arguments()

        self._parser.add_argument(
            '--cpu_arch',
            type=str,
            default='other',
            required=False,
            # Help text must agree with the actual default ('other', the generic build).
            help='The targeted cpu architecture to run \
                STREAM. Default is other. Possible values are {}.'.format(' '.join(self.__cpu_arch))
        )
        core_link = 'https://techcommunity.microsoft.com/t5/azure-compute-blog/performance-\
        amp-scalability-of-hbv3-vms-with-milan-x-cpus/ba-p/2939814'

        self._parser.add_argument(
            '--cores',
            nargs='+',
            type=int,
            default=[
                0, 8, 16, 24, 32, 38, 44, 52, 60, 68, 76, 82, 88, 96, 104, 112, 120, 126, 132, 140, 148, 156, 164, 170
            ],
            required=False,
            help='List of cores to perform test. Default core configuration is for HBv4/Zen4 SKU offering. \
            For HBv3/Zen3 please see: ' + core_link
        )

        self._parser.add_argument(
            '--numa_mem_nodes',
            nargs='+',
            type=int,
            default=None,    # None means system default
            required=False,
            help='List of NUMA memory nodes to bind to. If not set, system default will be used.'
        )

    def _preprocess(self):
        """Preprocess/preparation operations before the benchmarking.

        Builds the invocation command: OpenMP settings are prepended to the
        binary path as command-prefix environment assignments, optionally
        wrapped in numactl for NUMA memory binding, and the binary name is
        chosen from --cpu_arch.

        Return:
            True if _preprocess() succeed.
        """
        if not super()._preprocess():
            return False

        # Reference core pinnings for other SKUs:
        # zen3
        # cores=[0, 4, 8, 12, 16, 20, 24, 28, 30, 34, 38, 42, 46, 50,
        # 54, 58, 60, 64, 68, 72, 76, 80, 84, 88, 90, 94, 98, 102, 106, 110, 114, 118]
        # zen4
        # cores=[0, 8, 16, 24, 32, 38, 44, 52, 60, 68, 76, 82, 88, 96, 104, 112, 120,
        # 126, 132, 140, 148, 156, 164, 170]
        # neo2: grace dual socket has 2 sockets, each socket has 72 cores
        #   numa node0: cores=[0, 1, 2,... 70, 71]
        #   numa node1: cores=[72, 73,... 142, 143]

        # parse cores into a comma-separated list of places for libgomp
        omp_places = ','.join(f'{{{core}}}' for core in self._args.cores)

        # Join the OMP settings with spaces, NOT '&&': in POSIX shells
        # 'A=1 && cmd' only sets an unexported shell variable, so the benchmark
        # process would never see it; 'A=1 B=2 cmd' exports every assignment to
        # cmd's environment for that single invocation.
        envar = (
            'OMP_SCHEDULE=static OMP_DYNAMIC=false '
            'OMP_MAX_ACTIVE_LEVELS=1 OMP_STACKSIZE=256M '
            'OMP_PROC_BIND=true OMP_NUM_THREADS={} '
            'OMP_PLACES={}'
        ).format(len(self._args.cores), omp_places)

        # if binding to NUMA memory nodes, prefix with numactl
        numa_cmd = ''
        if self._args.numa_mem_nodes is not None:
            mem_node_str = ','.join(map(str, self._args.numa_mem_nodes))
            numa_cmd = f'numactl -m{mem_node_str}'

        # set the binary name based on cpu architecture ('other' keeps the
        # generic 'stream' binary chosen in __init__)
        if self._args.cpu_arch == 'zen3':
            self._bin_name = 'streamZen3'
        elif self._args.cpu_arch == 'zen4':
            self._bin_name = 'streamZen4'
        elif self._args.cpu_arch == 'neo2':
            self._bin_name = 'streamNeo2'

        binary_path = os.path.join(self._args.bin_dir, self._bin_name)
        command = f'{envar} {numa_cmd} {binary_path}'

        if not self._set_binary_path():
            logger.error(
                'Executable {} not found in {} or it is not executable'.format(self._bin_name, self._args.bin_dir)
            )
            return False

        self._commands.append(command)
        return True

    def _process_raw_result(self, cmd_idx, raw_output):
        """Function to parse raw results and save the summarized results.

          self._result.add_raw_data() and self._result.add_result() need to be called to save the results.

        Args:
            cmd_idx (int): the index of command corresponding with the raw_output.
            raw_output (str): raw output string of the micro-benchmark.

        Return:
            True if the raw output string is valid and result can be extracted.
        """
        functions = ('Copy', 'Scale', 'Add', 'Triad')
        records = []
        for line in raw_output.splitlines():
            if 'Number of Threads counted' in line:
                # e.g. 'Number of Threads counted = 24' -> 24
                self._result.add_result('threads', int(line.split('= ')[1]))
            # any() appends a matching line at most once, even if it happened
            # to contain more than one kernel name.
            if any(function in line for function in functions):
                records.append(line)

        # individual results; columns appear to follow the STREAM summary
        # table: '<Kernel>:  <rate>  <avg time>  <min time>  <max time>'
        for record in records:
            entries = record.split()
            metric = entries[0].strip().replace(':', '')
            prefix = metric.lower()
            self._result.add_result(prefix + '_throughput', float(entries[1].strip()))
            self._result.add_result(prefix + '_time_avg', float(entries[2].strip()))
            self._result.add_result(prefix + '_time_min', float(entries[3].strip()))
            self._result.add_result(prefix + '_time_max', float(entries[4].strip()))

        # raw output
        self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output, self._args.log_raw_data)

        return True


# Register this benchmark under the name 'cpu-stream' so the framework can launch it.
BenchmarkRegistry.register_benchmark('cpu-stream', CpuStreamBenchmark)