Adding Stream Benchmark (#473)

**Description** - Added stream benchmark - Added stream unit test - Added stream example - Modified docker files to build stream --------- Co-authored-by: Ubuntu <azureuser@sbtestvm.jzlku1oskncengjiado35wf1hd.ax.internal.cloudapp.net> Co-authored-by: Peng Cheng <chengpeng5555@outlook.com> Co-authored-by: Yifan Xiong <xiongyf@yandex.com>

Adding Stream Benchmark (#473)
**Description** - Added stream benchmark - Added stream unit test - Added stream example - Modified docker files to build stream --------- Co-authored-by: Ubuntu <azureuser@sbtestvm.jzlku1oskncengjiado35wf1hd.ax.internal.cloudapp.net> Co-authored-by: Peng Cheng <chengpeng5555@outlook.com> Co-authored-by: Yifan Xiong <xiongyf@yandex.com>
32896ca4 · rafsalas19 · GitHub · 62a29134 · 32896ca4 · 32896ca4
Unverified Commit 32896ca4 authored Feb 13, 2023 by rafsalas19 Committed by GitHub Feb 13, 2023
12 changed files
--- a/dockerfile/cuda11.1.1.dockerfile
+++ b/dockerfile/cuda11.1.1.dockerfile
@@ -122,6 +122,12 @@ RUN echo PATH="$PATH" > /etc/environment && \
    echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \
    echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment
+# Install AOCC compiler (its clang is the compiler third_party/stream-tests/Makefile uses to build STREAM).
+# apt-get is used instead of apt because apt's command line interface is not meant for scripted use.
+RUN cd /tmp && \
+    wget https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-4.0.0_1_amd64.deb && \
+    apt-get install -y ./aocc-compiler-4.0.0_1_amd64.deb && \
+    rm -f aocc-compiler-4.0.0_1_amd64.deb
 # Add config files
 ADD dockerfile/etc /opt/microsoft/

--- a/dockerfile/cuda11.8.dockerfile
+++ b/dockerfile/cuda11.8.dockerfile
@@ -102,6 +102,12 @@ RUN echo PATH="$PATH" > /etc/environment && \
    echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \
    echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment
+# Install AOCC compiler (its clang is the compiler third_party/stream-tests/Makefile uses to build STREAM).
+# apt-get is used instead of apt because apt's command line interface is not meant for scripted use.
+RUN cd /tmp && \
+    wget https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-4.0.0_1_amd64.deb && \
+    apt-get install -y ./aocc-compiler-4.0.0_1_amd64.deb && \
+    rm -f aocc-compiler-4.0.0_1_amd64.deb
 # Add config files
 ADD dockerfile/etc /opt/microsoft/

--- a/dockerfile/rocm5.0.x.dockerfile
+++ b/dockerfile/rocm5.0.x.dockerfile
@@ -102,6 +102,12 @@ RUN cd /tmp && \
    cp ./Linux/mlc /usr/local/bin/ && \
    rm -rf ./Linux mlc.tgz
+# Install AOCC compiler (its clang is the compiler third_party/stream-tests/Makefile uses to build STREAM).
+# apt-get is used instead of apt because apt's command line interface is not meant for scripted use.
+RUN cd /tmp && \
+    wget https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-4.0.0_1_amd64.deb && \
+    apt-get install -y ./aocc-compiler-4.0.0_1_amd64.deb && \
+    rm -f aocc-compiler-4.0.0_1_amd64.deb
 # Install rccl-rdma-sharp-plugins
 ENV SHARP_VERSION=5.0
 RUN cd /opt/rocm && \

--- a/dockerfile/rocm5.1.x.dockerfile
+++ b/dockerfile/rocm5.1.x.dockerfile
@@ -114,6 +114,12 @@ RUN cd /tmp && \
    cp ./Linux/mlc /usr/local/bin/ && \
    rm -rf ./Linux mlc.tgz
+# Install AOCC compiler (its clang is the compiler third_party/stream-tests/Makefile uses to build STREAM).
+# apt-get is used instead of apt because apt's command line interface is not meant for scripted use.
+RUN cd /tmp && \
+    wget https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-4.0.0_1_amd64.deb && \
+    apt-get install -y ./aocc-compiler-4.0.0_1_amd64.deb && \
+    rm -f aocc-compiler-4.0.0_1_amd64.deb
 ENV PATH="${PATH}:/opt/rocm/hip/bin/" \
    LD_LIBRARY_PATH="/usr/local/lib/:${LD_LIBRARY_PATH}" \
    SB_HOME=/opt/superbench \

--- a/docs/user-tutorial/benchmarks/micro-benchmarks.md
+++ b/docs/user-tutorial/benchmarks/micro-benchmarks.md
@@ -171,6 +171,23 @@ Supports the use of double unit types and the use of tensor cores.
 | gpu-burn/gpu_[0-9]_pass | yes/no   | The result of the gpu-burn test for each GPU (1: yes, 0: no).                      |
 | gpu-burn/abort          | yes/no   | Whether or not GPU-burn test aborted before returning GPU results (1: yes, 0: no). |
+### `cpu-stream`
+#### Introduction
+Measure memory bandwidth and computation rate for simple vector kernels,
+performed by [University of Virginia STREAM benchmark](https://www.cs.virginia.edu/stream/ref.html).
+#### Metrics
+| Name                                                     | Unit             | Description                                                         |
+|----------------------------------------------------------|------------------|---------------------------------------------------------------------|
+| cpu-stream/threads                                       |                  | Number of threads used for the test. Determined by core count.      |
+| cpu-stream/['copy', 'scale', 'add', 'triad']\_throughput | bandwidth (MB/s) | Memory throughput of designated kernel operation.                   |
+| cpu-stream/['copy', 'scale', 'add', 'triad']\_time_avg   | time (s)         | Average elapsed times over all iterations.                          |
+| cpu-stream/['copy', 'scale', 'add', 'triad']\_time_min   | time (s)         | Minimum elapsed times over all iterations.                          |
+| cpu-stream/['copy', 'scale', 'add', 'triad']\_time_max   | time (s)         | Maximum elapsed times over all iterations.                          |
 ## Communication Benchmarks
 ### `cpu-memory-bw-latency`

--- a/examples/benchmarks/cpu_stream_performance.py
+++ b/examples/benchmarks/cpu_stream_performance.py
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+"""Micro benchmark example for CPU Stream performance.
+Commands to run:
+  python3 examples/benchmarks/cpu_stream_performance.py
+"""
+from superbench.benchmarks import BenchmarkRegistry
+from superbench.common.utils import logger
+if __name__ == '__main__':
+    context = BenchmarkRegistry.create_benchmark_context(
+        'cpu-stream',
+        parameters='--cpu_arch zen3 \
+        --cores 0 4 8 12 16 20 24 28 30 34 38 42 46 50 54 58 60 64 68 72 76 80 84 88 90 94 98 102 106 110 114 118'
+    )
+    benchmark = BenchmarkRegistry.launch_benchmark(context)
+    if benchmark:
+        logger.info(
+            'benchmark: {}, return code: {}, result: {}'.format(
+                benchmark.name, benchmark.return_code, benchmark.result
+            )
+        )
--- a/superbench/benchmarks/micro_benchmarks/__init__.py
+++ b/superbench/benchmarks/micro_benchmarks/__init__.py
@@ -16,6 +16,7 @@ from superbench.benchmarks.micro_benchmarks.cuda_nccl_bw_performance import Cuda
 from superbench.benchmarks.micro_benchmarks.cudnn_function import CudnnBenchmark
 from superbench.benchmarks.micro_benchmarks.disk_performance import DiskBenchmark
 from superbench.benchmarks.micro_benchmarks.cpu_memory_bw_latency_performance import CpuMemBwLatencyBenchmark
+from superbench.benchmarks.micro_benchmarks.cpu_stream_performance import CpuStreamBenchmark
 from superbench.benchmarks.micro_benchmarks.gpcnet_performance import GPCNetBenchmark
 from superbench.benchmarks.micro_benchmarks.gpu_copy_bw_performance import GpuCopyBwBenchmark
 from superbench.benchmarks.micro_benchmarks.gpu_burn_test import GpuBurnBenchmark
@@ -32,6 +33,7 @@ from superbench.benchmarks.micro_benchmarks.tensorrt_inference_performance impor
 __all__ = [
    'ComputationCommunicationOverlap',
    'CpuMemBwLatencyBenchmark',
+    'CpuStreamBenchmark',
    'CublasBenchmark',
    'CublasLtBenchmark',
    'CudaGemmFlopsBenchmark',

--- a/superbench/benchmarks/micro_benchmarks/cpu_stream_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/cpu_stream_performance.py
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+"""Module for running the University of Virginia STREAM tool. It measures sustainable main memory \
+    bandwidth in MB/s and the corresponding computation rate for simple vector kernels."""
+import os
+from superbench.common.utils import logger
+from superbench.benchmarks import BenchmarkRegistry
+from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
+class CpuStreamBenchmark(MicroBenchmarkWithInvoke):
+    """The Stream benchmark class."""
+    def __init__(self, name, parameters=''):
+        """Constructor.
+        Args:
+            name (str): benchmark name.
+            parameters (str): benchmark parameters.
+        """
+        super().__init__(name, parameters)
+        self._bin_name = 'streamZen3.exe'
+        self.__cpu_arch = ['other', 'zen3', 'zen4']
+    def add_parser_arguments(self):
+        """Add the specified arguments."""
+        super().add_parser_arguments()
+        self._parser.add_argument(
+            '--cpu_arch',
+            type=str,
+            default='zen4',
+            required=False,
+            help='The targeted cpu architectures to run \
+                STREAM. Default is zen4. Possible values are {}.'.format(' '.join(self.__cpu_arch))
+        )
+        core_link = 'https://techcommunity.microsoft.com/t5/azure-compute-blog/performance-\
+        amp-scalability-of-hbv3-vms-with-milan-x-cpus/ba-p/2939814'
+        self._parser.add_argument(
+            '--cores',
+            nargs='+',
+            type=int,
+            default=[
+                0, 8, 16, 24, 32, 38, 44, 52, 60, 68, 76, 82, 88, 96, 104, 112, 120, 126, 132, 140, 148, 156, 164, 170
+            ],
+            required=False,
+            help='List of cores to perform test. Default core configuration is for HBv4/Zen4 SKU offering. \
+            For HBv3/Zen3 please see: ' + core_link
+        )
+    def _preprocess(self):
+        """Preprocess/preparation operations before the benchmarking.
+        Return:
+            True if _preprocess() succeed.
+        """
+        if not super()._preprocess():
+            return False
+        # zen3
+        # cores=[0, 4, 8, 12, 16, 20, 24, 28, 30, 34, 38, 42, 46, 50,
+        # 54, 58, 60, 64, 68, 72, 76, 80, 84, 88, 90, 94, 98, 102, 106, 110, 114, 118]
+        # zen4
+        # cores=[0, 8, 16, 24, 32, 38, 44, 52, 60, 68, 76, 82, 88, 96, 104, 112, 120,
+        # 126, 132, 140, 148, 156, 164, 170]
+        # parse cores argument
+        omp_places = ''
+        for core in self._args.cores:
+            omp_places += '{' + '{}:1'.format(core) + '}'
+        envar = 'OMP_SCHEDULE=static && OMP_DYNAMIC=false && OMP_MAX_ACTIVE_LEVELS=1 && OMP_STACKSIZE=256M && \
+            OMP_PROC_BIND=true && OMP_NUM_THREADS={} && OMP_PLACES={}'.format(len(self._args.cores), omp_places)
+        if self._args.cpu_arch == 'zen3':
+            exe = 'streamZen3.exe'
+        elif self._args.cpu_arch == 'zen4':
+            exe = 'streamZen4.exe'
+        else:
+            exe = 'streamx86.exe'
+        command = envar + ' ' + os.path.join(self._args.bin_dir, exe)
+        self._bin_name = exe
+        if not self._set_binary_path():
+            logger.error(
+                'Executable {} not found in {} or it is not executable'.format(self._bin_name, self._args.bin_dir)
+            )
+            return False
+        self._commands.append(command)
+        return True
+    def _process_raw_result(self, cmd_idx, raw_output):
+        """Function to parse raw results and save the summarized results.
+          self._result.add_raw_data() and self._result.add_result() need to be called to save the results.
+        Args:
+            cmd_idx (int): the index of command corresponding with the raw_output.
+            raw_output (str): raw output string of the micro-benchmark.
+        Return:
+            True if the raw output string is valid and result can be extracted.
+        """
+        functions = ['Copy', 'Scale', 'Add', 'Triad']
+        records = []
+        content = raw_output.splitlines()
+        for line in content:
+            if 'Number of Threads counted' in line:
+                line.split('= ')[1]
+                self._result.add_result('threads', int(line.split('= ')[1]))
+            for function in functions:
+                if function in line:
+                    records.append(line)
+        # individual results
+        for record in records:
+            entries = record.split()
+            metric = entries[0].strip().replace(':', '')
+            self._result.add_result(metric.lower() + '_throughput', float(entries[1].strip()))
+            self._result.add_result(metric.lower() + '_time_avg', float(entries[2].strip()))
+            self._result.add_result(metric.lower() + '_time_min', float(entries[3].strip()))
+            self._result.add_result(metric.lower() + '_time_max', float(entries[4].strip()))
+        # raw output
+        self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output, self._args.log_raw_data)
+        return True
+BenchmarkRegistry.register_benchmark('cpu-stream', CpuStreamBenchmark)
--- a/tests/benchmarks/micro_benchmarks/test_cpu_stream_performance.py
+++ b/tests/benchmarks/micro_benchmarks/test_cpu_stream_performance.py
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+"""Tests for STREAM benchmark."""
+import unittest
+from tests.helper import decorator
+from tests.helper.testcase import BenchmarkTestCase
+from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform
+class CpuStreamBenchmarkTest(BenchmarkTestCase, unittest.TestCase):
+    """Test class for STREAM benchmark."""
+    @classmethod
+    def setUpClass(cls):
+        """Hook method for setting up class fixture before running tests in the class."""
+        super().setUpClass()
+        cls.createMockEnvs(cls)
+        cls.createMockFiles(cls, ['bin/streamZen3.exe'])
+        return True
+    @decorator.load_data('tests/data/streamResult.log')
+    def test_stream(self, results):
+        """Test STREAM benchmark command generation."""
+        benchmark_name = 'cpu-stream'
+        (benchmark_class,
+         predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CPU)
+        assert (benchmark_class)
+        cores = '0 4 8 12 16 20 24 28 30 34 38 42 46 50 54 58 60 64 68 72 76 80 84 88 90 94 98 102 106 110 114 118'
+        coresList = [
+            0, 4, 8, 12, 16, 20, 24, 28, 30, 34, 38, 42, 46, 50, 54, 58, 60, 64, 68, 72, 76, 80, 84, 88, 90, 94, 98,
+            102, 106, 110, 114, 118
+        ]
+        arch = 'zen3'
+        parameters = '--cpu_arch ' + arch + ' --cores ' + cores
+        benchmark = benchmark_class(benchmark_name, parameters=parameters)
+        # Check basic information
+        assert (benchmark)
+        ret = benchmark._preprocess()
+        assert (ret is True)
+        assert (benchmark.return_code == ReturnCode.SUCCESS)
+        assert (benchmark.name == benchmark_name)
+        assert (benchmark.type == BenchmarkType.MICRO)
+        # Check parameters specified in BenchmarkContext.
+        assert (benchmark._args.cores == coresList)
+        assert (benchmark._args.cpu_arch == arch)
+        # Check command
+        assert (1 == len(benchmark._commands))
+        assert ('OMP_PLACES' in benchmark._commands[0])
+        # Check results
+        assert (benchmark._process_raw_result(0, results))
+        assert (benchmark.result['return_code'][0] == 0)
+        functions = ['copy', 'scale', 'add', 'triad']
+        values = [342008.3, 342409.6, 343827.7, 363208.7]
+        for index in range(0, 4):
+            result = float(benchmark.result[functions[index] + '_throughput'][0])
+            print(result, values[index])
+            assert (result == values[index])
+        assert (int(benchmark.result['threads'][0]) == 32)
+if __name__ == '__main__':
+    unittest.main()
--- a/tests/data/streamResult.log
+++ b/tests/data/streamResult.log
+-------------------------------------------------------------
+STREAM version $Revision: 5.10 $
+-------------------------------------------------------------
+This system uses 8 bytes per array element.
+-------------------------------------------------------------
+Array size = 400000000 (elements), Offset = 0 (elements)
+Memory per array = 3051.8 MiB (= 3.0 GiB).
+Total memory required = 9155.3 MiB (= 8.9 GiB).
+Each kernel will be executed 10 times.
+ The *best* time for each kernel (excluding the first iteration)
+ will be used to compute the reported bandwidth.
+-------------------------------------------------------------
+Number of Threads requested = 32
+Number of Threads counted = 32
+-------------------------------------------------------------
+Your clock granularity/precision appears to be 1 microseconds.
+Each test below will take on the order of 19105 microseconds.
+   (= 19105 clock ticks)
+Increase the size of the arrays if this shows that
+you are not getting at least 20 clock ticks per test.
+-------------------------------------------------------------
+WARNING -- The above is only a rough guideline.
+For best results, please be sure you know the
+precision of your system timer.
+-------------------------------------------------------------
+Function    Best Rate MB/s  Avg time     Min time     Max time
+Copy:          342008.3     0.018755     0.018713     0.018895
+Scale:         342409.6     0.018737     0.018691     0.018802
+Add:           343827.7     0.028050     0.027921     0.028269
+Triad:         363208.7     0.026599     0.026431     0.026855
+-------------------------------------------------------------
+Solution Validates: avg error less than 1.000000e-13 on all three arrays
+-------------------------------------------------------------
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -11,13 +11,13 @@ HPCX_HOME ?= /opt/hpcx
 CUDA_VER ?= $(shell nvcc --version | grep 'release' | awk '{print $$6}' | cut -c2- | cut -d '.' -f1-2)
 ROCBLAS_BRANCH ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' -f1-3)
-.PHONY: all cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn
+.PHONY: all cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream
 # Build all targets.
 all: cuda rocm
 cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn
 rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest
-common: fio
+common: fio cpu_stream
 # Create $(SB_MICRO_PATH)/bin and $(SB_MICRO_PATH)/lib, no error if existing, make parent directories as needed.
 sb_micro_path:
@@ -118,3 +118,13 @@ ifneq (,$(wildcard gpu-burn/Makefile))
 	cp -v ./gpu-burn/gpu_burn $(SB_MICRO_PATH)/bin/
 	cp -v ./gpu-burn/compare.ptx $(SB_MICRO_PATH)/bin/
 endif
+# Build STREAM: download stream.c from the University of Virginia site and compile it with
+# stream-tests/Makefile, then copy the resulting stream*.exe binaries to $(SB_MICRO_PATH)/bin.
+# "wget -O" overwrites an earlier download; plain wget would save the new copy as stream.c.1
+# and the stale stream.c would be compiled. $(MAKE) passes the parent make's flags to the sub-make.
+cpu_stream: sb_micro_path
+ifneq (,$(wildcard stream-tests/Makefile))
+	cd ./stream-tests && \
+	wget -O stream.c https://www.cs.virginia.edu/stream/FTP/Code/stream.c && \
+	$(MAKE) all
+	cp -v ./stream-tests/stream*.exe $(SB_MICRO_PATH)/bin/
+endif
--- a/third_party/stream-tests/Makefile
+++ b/third_party/stream-tests/Makefile
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+# Compiler and the flags shared by every STREAM build.
+CC= /opt/AMD/aocc-compiler-4.0.0/bin/clang
+CFLAGS= -Ofast -mcmodel=large -mavx2 -ffp-contract=fast -lomp -fopenmp -fnt-store=aggressive -DNTIMES=10
+# Per-variant flags: the STREAM array size and, for the Zen builds, the target -march.
+GENFLAGS= -DSTREAM_ARRAY_SIZE=400000000
+ZEN3FLAGS= -DSTREAM_ARRAY_SIZE=400000000 -march=znver3
+ZEN4FLAGS= -DSTREAM_ARRAY_SIZE=800000000 -march=znver4
+# Names of the executables produced by the X86, ZEN3 and ZEN4 targets.
+GEN_OUTPUT= streamx86.exe
+ZEN3_OUTPUT= streamZen3.exe
+ZEN4_OUTPUT= streamZen4.exe
+# None of the targets below is named after the file it creates, so declare them phony.
+.PHONY: all ZEN3 ZEN4 X86 clean
+all: ZEN3 ZEN4 X86
+# Every variant compiles the same stream.c; listing it as a prerequisite makes a missing
+# source fail in make itself rather than in the compiler.
+ZEN3: stream.c
+	$(CC) $(CFLAGS) $(ZEN3FLAGS) $< -o $(ZEN3_OUTPUT)
+ZEN4: stream.c
+	$(CC) $(CFLAGS) $(ZEN4FLAGS) $< -o $(ZEN4_OUTPUT)
+X86: stream.c
+	$(CC) $(CFLAGS) $(GENFLAGS) $< -o $(GEN_OUTPUT)
+# $(RM) is "rm -f", so clean also succeeds when some outputs were never built.
+clean:
+	$(RM) $(GEN_OUTPUT) $(ZEN3_OUTPUT) $(ZEN4_OUTPUT)