Benchmark - Add Grace CPU support for CPU Stream (#719)

**Description**

Added support for the Grace CPU neo2 architecture in CPU Stream. CPU Stream now supports dual-socket benchmarking.

Example config for this architecture:

```yaml
cpu-stream:numa0:
  timeout: *default_timeout
  modes:
    - name: local
      parallel: no
  parameters:
    cpu_arch: neo2
    numa_mem_nodes: 0
    cores: 0 1 2 3 4 5 6 7 8
cpu-stream:numa1:
  timeout: *default_timeout
  modes:
    - name: local
      parallel: no
  parameters:
    cpu_arch: neo2
    numa_mem_nodes: 1
    cores: 64 65 66 67 68 69 70 71 72
cpu-stream:numa-spread:
  timeout: *default_timeout
  modes:
    - name: local
      parallel: no
  parameters:
    cpu_arch: neo2
    numa_mem_nodes: 0 1
    cores: 0 1 2 3 4 5 6 7 8 64 65 66 67 68 69 70 71 72
```

---------

Co-authored-by: dpower4 <dilipreddi@gmail.com>

Benchmark - Add Grace CPU support for CPU Stream (#719)
**Description**

Added support for the Grace CPU neo2 architecture in CPU Stream. CPU Stream now supports dual-socket benchmarking.

Example config for this architecture:

```yaml
cpu-stream:numa0:
  timeout: *default_timeout
  modes:
    - name: local
      parallel: no
  parameters:
    cpu_arch: neo2
    numa_mem_nodes: 0
    cores: 0 1 2 3 4 5 6 7 8
cpu-stream:numa1:
  timeout: *default_timeout
  modes:
    - name: local
      parallel: no
  parameters:
    cpu_arch: neo2
    numa_mem_nodes: 1
    cores: 64 65 66 67 68 69 70 71 72
cpu-stream:numa-spread:
  timeout: *default_timeout
  modes:
    - name: local
      parallel: no
  parameters:
    cpu_arch: neo2
    numa_mem_nodes: 0 1
    cores: 0 1 2 3 4 5 6 7 8 64 65 66 67 68 69 70 71 72
```

---------

Co-authored-by: dpower4 <dilipreddi@gmail.com>
0b8d1fd4 · WenqingLan1 · GitHub · 4eddd50a · 0b8d1fd4 · 0b8d1fd4
Unverified Commit 0b8d1fd4 authored Jun 19, 2025 by WenqingLan1 Committed by GitHub Jun 19, 2025
5 changed files
--- a/examples/benchmarks/cpu_stream_performance.py
+++ b/examples/benchmarks/cpu_stream_performance.py
@@ -12,9 +12,7 @@ from superbench.common.utils import logger

 if __name__ == '__main__':
    context = BenchmarkRegistry.create_benchmark_context(
-        'cpu-stream',
-        parameters='--cpu_arch zen3 \
-        --cores 0 4 8 12 16 20 24 28 30 34 38 42 46 50 54 58 60 64 68 72 76 80 84 88 90 94 98 102 106 110 114 118'
+        'cpu-stream', parameters='--cpu_arch neo2 --numa_mem_nodes 0 --cores 0 1 2 3'
    )

    benchmark = BenchmarkRegistry.launch_benchmark(context)

--- a/superbench/benchmarks/micro_benchmarks/cpu_stream_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/cpu_stream_performance.py
@@ -52,6 +52,15 @@ class CpuStreamBenchmark(MicroBenchmarkWithInvoke):
            For HBv3/Zen3 please see: ' + core_link
        )

+        self._parser.add_argument(
+            '--numa_mem_nodes',
+            nargs='+',
+            type=int,
+            default=None,    # None means system default
+            required=False,
+            help='List of NUMA memory nodes to bind to. If not set, system default will be used.'
+        )
+
    def _preprocess(self):
        """Preprocess/preparation operations before the benchmarking.

@@ -67,14 +76,25 @@ class CpuStreamBenchmark(MicroBenchmarkWithInvoke):
        # zen4
        # cores=[0, 8, 16, 24, 32, 38, 44, 52, 60, 68, 76, 82, 88, 96, 104, 112, 120,
        # 126, 132, 140, 148, 156, 164, 170]
-
-        # parse cores argument
-        omp_places = ''
-        for core in self._args.cores:
-            omp_places += '{' + '{}:1'.format(core) + '}'
-
-        envar = 'OMP_SCHEDULE=static && OMP_DYNAMIC=false && OMP_MAX_ACTIVE_LEVELS=1 && OMP_STACKSIZE=256M && \
-            OMP_PROC_BIND=true && OMP_NUM_THREADS={} && OMP_PLACES={}'.format(len(self._args.cores), omp_places)
+        # neo2: grace dual socket has 2 sockets, each socket has 72 cores
+        #   numa node0: cores=[0, 1, 2,... 70, 71]
+        #   numa node1: cores=[72, 73,... 142, 143]
+
+        # parse cores into a comma-separated list of places for libgomp
+        omp_places = ','.join(f'{{{core}}}' for core in self._args.cores)
+
+        envar = (
+            'OMP_SCHEDULE=static && OMP_DYNAMIC=false && '
+            'OMP_MAX_ACTIVE_LEVELS=1 && OMP_STACKSIZE=256M && '
+            'OMP_PROC_BIND=true && OMP_NUM_THREADS={} && '
+            'OMP_PLACES={}'
+        ).format(len(self._args.cores), omp_places)
+
+        # if binding to NUMA memory nodes, prefix with numactl
+        numa_cmd = ''
+        if self._args.numa_mem_nodes is not None:
+            mem_node_str = ','.join(map(str, self._args.numa_mem_nodes))
+            numa_cmd = f'numactl -m{mem_node_str}'

        # set the binary name based on cpu architecture
        if self._args.cpu_arch == 'zen3':
@@ -84,7 +104,8 @@ class CpuStreamBenchmark(MicroBenchmarkWithInvoke):
        elif self._args.cpu_arch == 'neo2':
            self._bin_name = 'streamNeo2'

-        command = envar + ' ' + os.path.join(self._args.bin_dir, self._bin_name)
+        binary_path = os.path.join(self._args.bin_dir, self._bin_name)
+        command = f'{envar} {numa_cmd} {binary_path}'

        if not self._set_binary_path():
            logger.error(

--- a/tests/benchmarks/micro_benchmarks/test_cpu_stream_performance.py
+++ b/tests/benchmarks/micro_benchmarks/test_cpu_stream_performance.py
@@ -19,10 +19,11 @@ class CpuStreamBenchmarkTest(BenchmarkTestCase, unittest.TestCase):
        cls.createMockEnvs(cls)
        cls.createMockFiles(cls, ['bin/stream'])
        cls.createMockFiles(cls, ['bin/streamZen3'])
+        cls.createMockFiles(cls, ['bin/streamNeo2'])
        return True

-    @decorator.load_data('tests/data/streamResult.log')
-    def test_stream(self, results):
+    @decorator.load_data('tests/data/streamResultZen.log')
+    def test_stream_zen(self, results):
        """Test STREAM benchmark command generation."""
        benchmark_name = 'cpu-stream'
        (benchmark_class,
@@ -65,6 +66,46 @@ class CpuStreamBenchmarkTest(BenchmarkTestCase, unittest.TestCase):
            assert (result == values[index])
        assert (int(benchmark.result['threads'][0]) == 32)

+    @decorator.load_data('tests/data/streamResultNeo.log')
+    def test_stream_neo(self, results):
+        """Test STREAM benchmark command generation for neo2."""
+        benchmark_name = 'cpu-stream'
+        (benchmark_class,
+         predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CPU)
+        assert (benchmark_class)
+
+        cores = '0 1 2 3'
+        coresList = [0, 1, 2, 3]
+        arch = 'neo2'
+        parameters = '--cpu_arch ' + arch + ' --cores ' + cores + ' --numa_mem_nodes 0'
+        benchmark = benchmark_class(benchmark_name, parameters=parameters)
+
+        # Check basic information
+        assert (benchmark)
+        ret = benchmark._preprocess()
+        assert (ret is True)
+        assert (benchmark.return_code == ReturnCode.SUCCESS)
+        assert (benchmark.name == benchmark_name)
+        assert (benchmark.type == BenchmarkType.MICRO)
+
+        # Check parameters specified in BenchmarkContext.
+        assert (benchmark._args.cores == coresList)
+        assert (benchmark._args.cpu_arch == arch)
+        assert (benchmark._args.numa_mem_nodes == [0])
+
+        # Check command
+        assert (1 == len(benchmark._commands))
+        assert ('OMP_PLACES' in benchmark._commands[0])
+        assert ('numactl -m0' in benchmark._commands[0])
+
+        # Check results
+        assert (benchmark._process_raw_result(0, results))
+        functions = ['copy', 'scale', 'add', 'triad']
+        values = [108180.5, 108572.8, 104571.7, 105256.0]
+        for index in range(0, 4):
+            result = float(benchmark.result[functions[index] + '_throughput'][0])
+            assert (result == values[index])
+

 if __name__ == '__main__':
    unittest.main()
--- a/tests/data/streamResultNeo.log
+++ b/tests/data/streamResultNeo.log
+-------------------------------------------------------------
+STREAM version $Revision: 5.10 $
+-------------------------------------------------------------
+This system uses 8 bytes per array element.
+-------------------------------------------------------------
+Array size = 120000000 (elements), Offset = 0 (elements)
+Memory per array = 915.5 MiB (= 0.9 GiB).
+Total memory required = 2746.6 MiB (= 2.7 GiB).
+Each kernel will be executed 200 times.
+ The *best* time for each kernel (excluding the first iteration)
+ will be used to compute the reported bandwidth.
+-------------------------------------------------------------
+Number of Threads requested = 4
+Number of Threads counted = 4
+-------------------------------------------------------------
+Your clock granularity/precision appears to be 1 microseconds.
+Each test below will take on the order of 13558 microseconds.
+   (= 13558 clock ticks)
+Increase the size of the arrays if this shows that
+you are not getting at least 20 clock ticks per test.
+-------------------------------------------------------------
+WARNING -- The above is only a rough guideline.
+For best results, please be sure you know the
+precision of your system timer.
+-------------------------------------------------------------
+Function    Best Rate MB/s  Avg time     Min time     Max time
+Copy:          108180.5     0.017811     0.017748     0.017950
+Scale:         108572.8     0.017753     0.017684     0.017948
+Add:           104571.7     0.027626     0.027541     0.028479
+Triad:         105256.0     0.027448     0.027362     0.027785
+-------------------------------------------------------------
+Solution Validates: avg error less than 1.000000e-13 on all three arrays
+-------------------------------------------------------------
\ No newline at end of file
--- a/tests/data/streamResult.log
+++ b/tests/data/streamResult.log