# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

"""Module for running the University of Virginia STREAM tool.

It measures sustainable main memory bandwidth in MB/s and the corresponding
computation rate for simple vector kernels.
"""

import os

from superbench.common.utils import logger
from superbench.benchmarks import BenchmarkRegistry
from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke


class CpuStreamBenchmark(MicroBenchmarkWithInvoke):
    """The STREAM benchmark class."""
    def __init__(self, name, parameters=''):
        """Constructor.

        Args:
            name (str): benchmark name.
            parameters (str): benchmark parameters.
        """
        super().__init__(name, parameters)

        self._bin_name = 'stream'
        self.__cpu_arch = ['other', 'zen3', 'zen4', 'neo2']

    def add_parser_arguments(self):
        """Add the specified arguments."""
        super().add_parser_arguments()

        self._parser.add_argument(
            '--cpu_arch',
            type=str,
            default='other',
            required=False,
            help='The targeted CPU architecture to run STREAM. Default is other. '
            'Possible values are {}.'.format(' '.join(self.__cpu_arch))
        )
        core_link = (
            'https://techcommunity.microsoft.com/t5/azure-compute-blog/'
            'performance-amp-scalability-of-hbv3-vms-with-milan-x-cpus/ba-p/2939814'
        )
        self._parser.add_argument(
            '--cores',
            nargs='+',
            type=int,
            default=[
                0, 8, 16, 24, 32, 38, 44, 52, 60, 68, 76, 82, 88, 96, 104, 112, 120, 126, 132, 140, 148, 156, 164, 170
            ],
            required=False,
            help='List of cores on which to run the test. The default core configuration is for the HBv4/Zen4 SKU '
            'offering. For HBv3/Zen3 please see: ' + core_link
        )
        self._parser.add_argument(
            '--numa_mem_nodes',
            nargs='+',
            type=int,
            default=None,    # None means the system default memory policy is used
            required=False,
            help='List of NUMA memory nodes to bind to. If not set, the system default will be used.'
        )

    def _preprocess(self):
        """Preprocess/preparation operations before the benchmarking.

        Return:
            True if _preprocess() succeeds.
        """
        if not super()._preprocess():
            return False

        # Reference core lists per architecture:
        # zen3
        # cores=[0, 4, 8, 12, 16, 20, 24, 28, 30, 34, 38, 42, 46, 50,
        #        54, 58, 60, 64, 68, 72, 76, 80, 84, 88, 90, 94, 98, 102, 106, 110, 114, 118]
        # zen4
        # cores=[0, 8, 16, 24, 32, 38, 44, 52, 60, 68, 76, 82, 88, 96, 104, 112, 120,
        #        126, 132, 140, 148, 156, 164, 170]
        # neo2: Grace dual-socket system has 2 sockets, each socket has 72 cores
        # numa node0: cores=[0, 1, 2, ... 70, 71]
        # numa node1: cores=[72, 73, ... 142, 143]

        # Parse cores into a comma-separated list of places for libgomp, e.g. '{0},{8},{16}'.
        omp_places = ','.join(f'{{{core}}}' for core in self._args.cores)
        envar = (
            'OMP_SCHEDULE=static && OMP_DYNAMIC=false && '
            'OMP_MAX_ACTIVE_LEVELS=1 && OMP_STACKSIZE=256M && '
            'OMP_PROC_BIND=true && OMP_NUM_THREADS={} && '
            'OMP_PLACES={}'
        ).format(len(self._args.cores), omp_places)

        # If binding to NUMA memory nodes, prefix the command with numactl.
        numa_cmd = ''
        if self._args.numa_mem_nodes is not None:
            mem_node_str = ','.join(map(str, self._args.numa_mem_nodes))
            numa_cmd = f'numactl -m{mem_node_str}'

        # Set the binary name based on the CPU architecture.
        if self._args.cpu_arch == 'zen3':
            self._bin_name = 'streamZen3'
        elif self._args.cpu_arch == 'zen4':
            self._bin_name = 'streamZen4'
        elif self._args.cpu_arch == 'neo2':
            self._bin_name = 'streamNeo2'

        binary_path = os.path.join(self._args.bin_dir, self._bin_name)
        command = f'{envar} {numa_cmd} {binary_path}'

        if not self._set_binary_path():
            logger.error(
                'Executable {} not found in {} or it is not executable'.format(self._bin_name, self._args.bin_dir)
            )
            return False

        self._commands.append(command)

        return True

    def _process_raw_result(self, cmd_idx, raw_output):
        """Function to parse raw results and save the summarized results.

        self._result.add_raw_data() and self._result.add_result() need to be called to save the results.

        Args:
            cmd_idx (int): the index of command corresponding with the raw_output.
            raw_output (str): raw output string of the micro-benchmark.

        Return:
            True if the raw output string is valid and result can be extracted.
        """
        functions = ['Copy', 'Scale', 'Add', 'Triad']
        records = []
        content = raw_output.splitlines()
        for line in content:
            if 'Number of Threads counted' in line:
                self._result.add_result('threads', int(line.split('= ')[1]))
            for function in functions:
                if function in line:
                    records.append(line)

        # Individual results: each record is '<function>: <best rate MB/s> <avg time> <min time> <max time>'.
        for record in records:
            entries = record.split()
            metric = entries[0].strip().replace(':', '')
            self._result.add_result(metric.lower() + '_throughput', float(entries[1].strip()))
            self._result.add_result(metric.lower() + '_time_avg', float(entries[2].strip()))
            self._result.add_result(metric.lower() + '_time_min', float(entries[3].strip()))
            self._result.add_result(metric.lower() + '_time_max', float(entries[4].strip()))

        # Raw output
        self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output, self._args.log_raw_data)

        return True


BenchmarkRegistry.register_benchmark('cpu-stream', CpuStreamBenchmark)
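
# Minimal usage sketch (illustrative only, not part of the benchmark implementation):
# assuming the standard SuperBench context/launch API, the registered 'cpu-stream'
# benchmark could be invoked as below. The '--cpu_arch zen3' parameter string and the
# expectation that a streamZen3 binary exists in the configured bin directory are
# assumptions for this example.
if __name__ == '__main__':
    from superbench.benchmarks import Platform

    # Create a benchmark context for the registered 'cpu-stream' benchmark on CPU.
    context = BenchmarkRegistry.create_benchmark_context(
        'cpu-stream', platform=Platform.CPU, parameters='--cpu_arch zen3'
    )
    # Launch the benchmark; None is returned if the context is invalid.
    benchmark = BenchmarkRegistry.launch_benchmark(context)
    if benchmark:
        logger.info(
            'benchmark: {}, return code: {}, result: {}'.format(
                benchmark.name, benchmark.return_code, benchmark.result
            )
        )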