Unverified commit 0b8d1fd4, authored by WenqingLan1, committed by GitHub
Browse files

Benchmark - Add Grace CPU support for CPU Stream (#719)



**Description**
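Added support for the Grace CPU (neo2) architecture to the CPU Stream
benchmark. A new `--numa_mem_nodes` option binds memory to the given NUMA
nodes via `numactl`, so CPU Stream can now benchmark dual-socket systems.

Example configuration for this architecture: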
```yaml
    cpu-stream:numa0:
      timeout: *default_timeout
      modes:
      - name: local
        parallel: no
      parameters:
        cpu_arch: neo2
        numa_mem_nodes: 0
        cores: 0 1 2 3 4 5 6 7 8
    cpu-stream:numa1:
      timeout: *default_timeout
      modes:
      - name: local
        parallel: no
      parameters:
        cpu_arch: neo2
        numa_mem_nodes: 1
        cores: 72 73 74 75 76 77 78 79 80
    cpu-stream:numa-spread:
      timeout: *default_timeout
      modes:
      - name: local
        parallel: no
      parameters:
        cpu_arch: neo2
        numa_mem_nodes: 0 1
        cores: 0 1 2 3 4 5 6 7 8 72 73 74 75 76 77 78 79 80
```

---------
Co-authored-by: dpower4 <dilipreddi@gmail.com>
parent 4eddd50a
......@@ -12,9 +12,7 @@
if __name__ == '__main__':
context = BenchmarkRegistry.create_benchmark_context(
'cpu-stream',
parameters='--cpu_arch zen3 \
--cores 0 4 8 12 16 20 24 28 30 34 38 42 46 50 54 58 60 64 68 72 76 80 84 88 90 94 98 102 106 110 114 118'
'cpu-stream', parameters='--cpu_arch neo2 --numa_mem_nodes 0 --cores 0 1 2 3'
)
benchmark = BenchmarkRegistry.launch_benchmark(context)
......
......@@ -52,6 +52,15 @@ def add_parser_arguments(self):
For HBv3/Zen3 please see: ' + core_link
)
self._parser.add_argument(
'--numa_mem_nodes',
nargs='+',
type=int,
default=None, # None means system default
required=False,
help='List of NUMA memory nodes to bind to. If not set, system default will be used.'
)
def _preprocess(self):
"""Preprocess/preparation operations before the benchmarking.
......@@ -67,14 +76,25 @@ def _preprocess(self):
# zen4
# cores=[0, 8, 16, 24, 32, 38, 44, 52, 60, 68, 76, 82, 88, 96, 104, 112, 120,
# 126, 132, 140, 148, 156, 164, 170]
# parse cores argument
omp_places = ''
for core in self._args.cores:
omp_places += '{' + '{}:1'.format(core) + '}'
envar = 'OMP_SCHEDULE=static && OMP_DYNAMIC=false && OMP_MAX_ACTIVE_LEVELS=1 && OMP_STACKSIZE=256M && \
OMP_PROC_BIND=true && OMP_NUM_THREADS={} && OMP_PLACES={}'.format(len(self._args.cores), omp_places)
# neo2: grace dual socket has 2 sockets, each socket has 72 cores
# numa node0: cores=[0, 1, 2,... 70, 71]
# numa node1: cores=[72, 73,... 142, 143]
# parse cores into a comma-separated list of places for libgomp
omp_places = ','.join(f'{{{core}}}' for core in self._args.cores)
envar = (
'OMP_SCHEDULE=static && OMP_DYNAMIC=false && '
'OMP_MAX_ACTIVE_LEVELS=1 && OMP_STACKSIZE=256M && '
'OMP_PROC_BIND=true && OMP_NUM_THREADS={} && '
'OMP_PLACES={}'
).format(len(self._args.cores), omp_places)
# if binding to NUMA memory nodes, prefix with numactl
numa_cmd = ''
if self._args.numa_mem_nodes is not None:
mem_node_str = ','.join(map(str, self._args.numa_mem_nodes))
numa_cmd = f'numactl -m{mem_node_str}'
# set the binary name based on cpu architecture
if self._args.cpu_arch == 'zen3':
......@@ -84,7 +104,8 @@ def _preprocess(self):
elif self._args.cpu_arch == 'neo2':
self._bin_name = 'streamNeo2'
command = envar + ' ' + os.path.join(self._args.bin_dir, self._bin_name)
binary_path = os.path.join(self._args.bin_dir, self._bin_name)
command = f'{envar} {numa_cmd} {binary_path}'
if not self._set_binary_path():
logger.error(
......
......@@ -19,10 +19,11 @@ def setUpClass(cls):
cls.createMockEnvs(cls)
cls.createMockFiles(cls, ['bin/stream'])
cls.createMockFiles(cls, ['bin/streamZen3'])
cls.createMockFiles(cls, ['bin/streamNeo2'])
return True
@decorator.load_data('tests/data/streamResult.log')
def test_stream(self, results):
@decorator.load_data('tests/data/streamResultZen.log')
def test_stream_zen(self, results):
"""Test STREAM benchmark command generation."""
benchmark_name = 'cpu-stream'
(benchmark_class,
......@@ -65,6 +66,46 @@ def test_stream(self, results):
assert (result == values[index])
assert (int(benchmark.result['threads'][0]) == 32)
@decorator.load_data('tests/data/streamResultNeo.log')
def test_stream_neo(self, results):
    """Check argument parsing, command generation and result parsing for neo2.

    Args:
        results: raw STREAM output supplied by the ``load_data`` decorator
            (presumably the contents of tests/data/streamResultNeo.log).
    """
    benchmark_name = 'cpu-stream'
    selected = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CPU)
    benchmark_class, _predefined_params = selected
    assert benchmark_class

    # Build the command-line parameters from the values asserted on below.
    arch = 'neo2'
    expected_cores = [0, 1, 2, 3]
    core_args = ' '.join(str(core) for core in expected_cores)
    parameters = f'--cpu_arch {arch} --cores {core_args} --numa_mem_nodes 0'
    benchmark = benchmark_class(benchmark_name, parameters=parameters)

    # Basic information: preprocessing succeeds and identity fields are set.
    assert benchmark
    assert benchmark._preprocess() is True
    assert benchmark.return_code == ReturnCode.SUCCESS
    assert benchmark.name == benchmark_name
    assert benchmark.type == BenchmarkType.MICRO

    # Parameters as parsed from the command line.
    assert benchmark._args.cores == expected_cores
    assert benchmark._args.cpu_arch == arch
    assert benchmark._args.numa_mem_nodes == [0]

    # Exactly one command, carrying both the OpenMP placement and the
    # numactl memory binding.
    assert len(benchmark._commands) == 1
    command = benchmark._commands[0]
    assert 'OMP_PLACES' in command
    assert 'numactl -m0' in command

    # Parsed throughput of each STREAM kernel must match the sample log.
    assert benchmark._process_raw_result(0, results)
    expected_throughput = (
        ('copy', 108180.5),
        ('scale', 108572.8),
        ('add', 104571.7),
        ('triad', 105256.0),
    )
    for function, expected in expected_throughput:
        assert float(benchmark.result[f'{function}_throughput'][0]) == expected
# Run the unittest test runner when this file is executed directly.
if __name__ == '__main__':
unittest.main()
-------------------------------------------------------------
STREAM version $Revision: 5.10 $
-------------------------------------------------------------
This system uses 8 bytes per array element.
-------------------------------------------------------------
Array size = 120000000 (elements), Offset = 0 (elements)
Memory per array = 915.5 MiB (= 0.9 GiB).
Total memory required = 2746.6 MiB (= 2.7 GiB).
Each kernel will be executed 200 times.
The *best* time for each kernel (excluding the first iteration)
will be used to compute the reported bandwidth.
-------------------------------------------------------------
Number of Threads requested = 4
Number of Threads counted = 4
-------------------------------------------------------------
Your clock granularity/precision appears to be 1 microseconds.
Each test below will take on the order of 13558 microseconds.
(= 13558 clock ticks)
Increase the size of the arrays if this shows that
you are not getting at least 20 clock ticks per test.
-------------------------------------------------------------
WARNING -- The above is only a rough guideline.
For best results, please be sure you know the
precision of your system timer.
-------------------------------------------------------------
Function Best Rate MB/s Avg time Min time Max time
Copy: 108180.5 0.017811 0.017748 0.017950
Scale: 108572.8 0.017753 0.017684 0.017948
Add: 104571.7 0.027626 0.027541 0.028479
Triad: 105256.0 0.027448 0.027362 0.027785
-------------------------------------------------------------
Solution Validates: avg error less than 1.000000e-13 on all three arrays
-------------------------------------------------------------
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment