Unverified Commit 32896ca4 authored by rafsalas19's avatar rafsalas19 Committed by GitHub
Browse files

Adding Stream Benchmark (#473)



**Description**

- Added stream benchmark
- Added stream unit test
- Added stream example
- Modified docker files to build stream

---------
Co-authored-by: default avatarUbuntu <azureuser@sbtestvm.jzlku1oskncengjiado35wf1hd.ax.internal.cloudapp.net>
Co-authored-by: default avatarPeng Cheng <chengpeng5555@outlook.com>
Co-authored-by: default avatarYifan Xiong <xiongyf@yandex.com>
parent 62a29134
......@@ -122,6 +122,12 @@ RUN echo PATH="$PATH" > /etc/environment && \
echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \
echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment
# Install AOCC compiler (provides /opt/AMD/aocc-compiler-4.0.0, used to build STREAM).
# apt-get is used instead of apt: apt's command-line interface is not stable for scripted use.
RUN cd /tmp && \
    wget https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-4.0.0_1_amd64.deb && \
    apt-get install -y ./aocc-compiler-4.0.0_1_amd64.deb && \
    rm -f aocc-compiler-4.0.0_1_amd64.deb
# Add config files
ADD dockerfile/etc /opt/microsoft/
......
......@@ -102,6 +102,12 @@ RUN echo PATH="$PATH" > /etc/environment && \
echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \
echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment
# Install AOCC compiler (provides /opt/AMD/aocc-compiler-4.0.0, used to build STREAM).
# apt-get is used instead of apt: apt's command-line interface is not stable for scripted use.
RUN cd /tmp && \
    wget https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-4.0.0_1_amd64.deb && \
    apt-get install -y ./aocc-compiler-4.0.0_1_amd64.deb && \
    rm -f aocc-compiler-4.0.0_1_amd64.deb
# Add config files
ADD dockerfile/etc /opt/microsoft/
......
......@@ -102,6 +102,12 @@ RUN cd /tmp && \
cp ./Linux/mlc /usr/local/bin/ && \
rm -rf ./Linux mlc.tgz
# Install AOCC compiler (provides /opt/AMD/aocc-compiler-4.0.0, used to build STREAM).
# apt-get is used instead of apt: apt's command-line interface is not stable for scripted use.
RUN cd /tmp && \
    wget https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-4.0.0_1_amd64.deb && \
    apt-get install -y ./aocc-compiler-4.0.0_1_amd64.deb && \
    rm -f aocc-compiler-4.0.0_1_amd64.deb
# Install rccl-rdma-sharp-plugins
ENV SHARP_VERSION=5.0
RUN cd /opt/rocm && \
......
......@@ -114,6 +114,12 @@ RUN cd /tmp && \
cp ./Linux/mlc /usr/local/bin/ && \
rm -rf ./Linux mlc.tgz
# Install AOCC compiler (provides /opt/AMD/aocc-compiler-4.0.0, used to build STREAM).
# apt-get is used instead of apt: apt's command-line interface is not stable for scripted use.
RUN cd /tmp && \
    wget https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-4.0.0_1_amd64.deb && \
    apt-get install -y ./aocc-compiler-4.0.0_1_amd64.deb && \
    rm -f aocc-compiler-4.0.0_1_amd64.deb
ENV PATH="${PATH}:/opt/rocm/hip/bin/" \
LD_LIBRARY_PATH="/usr/local/lib/:${LD_LIBRARY_PATH}" \
SB_HOME=/opt/superbench \
......
......@@ -171,6 +171,23 @@ Supports the use of double unit types and the use of tensor cores.
| gpu-burn/gpu_[0-9]_pass | yes/no | The result of the gpu-burn test for each GPU (1: yes, 0: no). |
| gpu-burn/abort | yes/no | Whether or not GPU-burn test aborted before returning GPU results (1: yes, 0: no). |
### `cpu-stream`
#### Introduction
Measure memory bandwidth and computation rate for simple vector kernels,
performed by the [University of Virginia STREAM benchmark](https://www.cs.virginia.edu/stream/ref.html).
#### Metrics
| Name | Unit | Description |
|----------------------------------------------------------|------------------|---------------------------------------------------------------------|
| cpu-stream/threads | | Number of threads used for the test. Determined by core count. |
| cpu-stream/['copy', 'scale', 'add', 'triad']\_throughput | bandwidth (MB/s) | Memory throughput of designated kernel operation. |
| cpu-stream/['copy', 'scale', 'add', 'triad']\_time_avg | time (s) | Average elapsed times over all iterations. |
| cpu-stream/['copy', 'scale', 'add', 'triad']\_time_min | time (s) | Minimum elapsed times over all iterations. |
| cpu-stream/['copy', 'scale', 'add', 'triad']\_time_max | time (s) | Maximum elapsed times over all iterations. |
## Communication Benchmarks
### `cpu-memory-bw-latency`
......
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""Micro benchmark example for CPU Stream performance.
Commands to run:
python3 examples/benchmarks/cpu_stream_performance.py
"""
from superbench.benchmarks import BenchmarkRegistry
from superbench.common.utils import logger
if __name__ == '__main__':
    # Run the cpu-stream micro benchmark with the zen3 binary, one thread per listed core.
    stream_parameters = (
        '--cpu_arch zen3 '
        '--cores 0 4 8 12 16 20 24 28 30 34 38 42 46 50 54 58 60 64 68 72 76 80 84 88 90 94 98 102 106 110 114 118'
    )
    context = BenchmarkRegistry.create_benchmark_context('cpu-stream', parameters=stream_parameters)

    benchmark = BenchmarkRegistry.launch_benchmark(context)
    if benchmark:
        # Report the outcome only when the launch returned a benchmark object.
        summary = 'benchmark: {}, return code: {}, result: {}'.format(
            benchmark.name, benchmark.return_code, benchmark.result
        )
        logger.info(summary)
......@@ -16,6 +16,7 @@
from superbench.benchmarks.micro_benchmarks.cudnn_function import CudnnBenchmark
from superbench.benchmarks.micro_benchmarks.disk_performance import DiskBenchmark
from superbench.benchmarks.micro_benchmarks.cpu_memory_bw_latency_performance import CpuMemBwLatencyBenchmark
from superbench.benchmarks.micro_benchmarks.cpu_stream_performance import CpuStreamBenchmark
from superbench.benchmarks.micro_benchmarks.gpcnet_performance import GPCNetBenchmark
from superbench.benchmarks.micro_benchmarks.gpu_copy_bw_performance import GpuCopyBwBenchmark
from superbench.benchmarks.micro_benchmarks.gpu_burn_test import GpuBurnBenchmark
......@@ -32,6 +33,7 @@
__all__ = [
'ComputationCommunicationOverlap',
'CpuMemBwLatencyBenchmark',
'CpuStreamBenchmark',
'CublasBenchmark',
'CublasLtBenchmark',
'CudaGemmFlopsBenchmark',
......
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""Module for running the University of Virginia STREAM tool. It measures sustainable main memory \
bandwidth in MB/s and the corresponding computation rate for simple vector kernels."""
import os
from superbench.common.utils import logger
from superbench.benchmarks import BenchmarkRegistry
from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
class CpuStreamBenchmark(MicroBenchmarkWithInvoke):
    """The Stream benchmark class.

    Builds the command line for one of the STREAM binaries (streamZen3.exe, streamZen4.exe or
    streamx86.exe), requests one OpenMP thread per listed core, and parses the per-kernel
    figures from the STREAM report.
    """
    def __init__(self, name, parameters=''):
        """Constructor.

        Args:
            name (str): benchmark name.
            parameters (str): benchmark parameters.
        """
        super().__init__(name, parameters)

        # Default binary name; _preprocess() replaces it according to --cpu_arch.
        self._bin_name = 'streamZen3.exe'
        self.__cpu_arch = ['other', 'zen3', 'zen4']

    def add_parser_arguments(self):
        """Add the specified arguments."""
        super().add_parser_arguments()

        # Adjacent string literals are used instead of backslash continuations inside a literal,
        # so source indentation can never end up inside the help text or the URL.
        self._parser.add_argument(
            '--cpu_arch',
            type=str,
            default='zen4',
            required=False,
            help=(
                'The targeted cpu architectures to run '
                'STREAM. Default is zen4. Possible values are {}.'
            ).format(' '.join(self.__cpu_arch))
        )
        core_link = (
            'https://techcommunity.microsoft.com/t5/azure-compute-blog/performance-'
            'amp-scalability-of-hbv3-vms-with-milan-x-cpus/ba-p/2939814'
        )
        self._parser.add_argument(
            '--cores',
            nargs='+',
            type=int,
            default=[
                0, 8, 16, 24, 32, 38, 44, 52, 60, 68, 76, 82, 88, 96, 104, 112, 120, 126, 132, 140, 148, 156, 164, 170
            ],
            required=False,
            help=(
                'List of cores to perform test. Default core configuration is for HBv4/Zen4 SKU offering. '
                'For HBv3/Zen3 please see: '
            ) + core_link
        )

    def _preprocess(self):
        """Preprocess/preparation operations before the benchmarking.

        Return:
            True if _preprocess() succeed.
        """
        if not super()._preprocess():
            return False

        # Reference core lists:
        # zen3
        # cores=[0, 4, 8, 12, 16, 20, 24, 28, 30, 34, 38, 42, 46, 50,
        #   54, 58, 60, 64, 68, 72, 76, 80, 84, 88, 90, 94, 98, 102, 106, 110, 114, 118]
        # zen4
        # cores=[0, 8, 16, 24, 32, 38, 44, 52, 60, 68, 76, 82, 88, 96, 104, 112, 120,
        #   126, 132, 140, 148, 156, 164, 170]

        # One single-core place per requested core. The OpenMP place list is comma-separated.
        omp_places = ','.join('{{{}:1}}'.format(core) for core in self._args.cores)

        # The assignments are written as a space-separated prefix of the command. Joining them
        # with '&&' would only set un-exported shell variables that the binary never sees.
        # OMP_PLACES is single-quoted so the shell passes the braces and commas through verbatim.
        envar = ' '.join(
            [
                'OMP_SCHEDULE=static',
                'OMP_DYNAMIC=false',
                'OMP_MAX_ACTIVE_LEVELS=1',
                'OMP_STACKSIZE=256M',
                'OMP_PROC_BIND=true',
                'OMP_NUM_THREADS={}'.format(len(self._args.cores)),
                "OMP_PLACES='{}'".format(omp_places),
            ]
        )

        # Any architecture other than zen3/zen4 falls back to the generic x86 build.
        binaries = {'zen3': 'streamZen3.exe', 'zen4': 'streamZen4.exe'}
        exe = binaries.get(self._args.cpu_arch, 'streamx86.exe')

        command = envar + ' ' + os.path.join(self._args.bin_dir, exe)
        self._bin_name = exe
        if not self._set_binary_path():
            logger.error(
                'Executable {} not found in {} or it is not executable'.format(self._bin_name, self._args.bin_dir)
            )
            return False

        self._commands.append(command)
        return True

    def _process_raw_result(self, cmd_idx, raw_output):
        """Function to parse raw results and save the summarized results.

        self._result.add_raw_data() and self._result.add_result() need to be called to save the results.

        Args:
            cmd_idx (int): the index of command corresponding with the raw_output.
            raw_output (str): raw output string of the micro-benchmark.

        Return:
            True if the raw output string is valid and result can be extracted.
        """
        # Keep the raw output even when parsing fails, so the failure can be inspected.
        self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output, self._args.log_raw_data)

        kernels = ('Copy', 'Scale', 'Add', 'Triad')
        # Column order of a STREAM result line: best rate, average, minimum and maximum time.
        suffixes = ('_throughput', '_time_avg', '_time_min', '_time_max')

        threads = None
        parsed = {}
        try:
            for line in raw_output.splitlines():
                if 'Number of Threads counted' in line:
                    threads = int(line.split('=')[1])
                    continue
                fields = line.split()
                if not fields or not fields[0].endswith(':'):
                    continue
                # Match the row label exactly ('Copy:'), not any line that merely contains the word.
                kernel = fields[0][:-1]
                if kernel not in kernels:
                    continue
                values = [float(field) for field in fields[1:5]]
                if len(values) != len(suffixes):
                    raise ValueError('incomplete result line: {}'.format(line))
                parsed[kernel.lower()] = values
        except (IndexError, ValueError) as e:
            logger.error('Failed to parse STREAM output - message: {}, raw output: {}'.format(str(e), raw_output))
            return False

        missing = [kernel for kernel in kernels if kernel.lower() not in parsed]
        if missing:
            logger.error('STREAM output has no result for: {}, raw output: {}'.format(', '.join(missing), raw_output))
            return False

        # Results are recorded only after the whole report has been validated.
        if threads is not None:
            self._result.add_result('threads', threads)
        for kernel in kernels:
            metric = kernel.lower()
            for suffix, value in zip(suffixes, parsed[metric]):
                self._result.add_result(metric + suffix, value)

        return True


BenchmarkRegistry.register_benchmark('cpu-stream', CpuStreamBenchmark)
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""Tests for STREAM benchmark."""
import unittest
from tests.helper import decorator
from tests.helper.testcase import BenchmarkTestCase
from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform
class CpuStreamBenchmarkTest(BenchmarkTestCase, unittest.TestCase):
    """Test class for STREAM benchmark."""
    @classmethod
    def setUpClass(cls):
        """Hook method for setting up class fixture before running tests in the class."""
        super().setUpClass()
        cls.createMockEnvs(cls)
        # Mock binary for the zen3 architecture selected in test_stream.
        cls.createMockFiles(cls, ['bin/streamZen3.exe'])

    @decorator.load_data('tests/data/streamResult.log')
    def test_stream(self, results):
        """Test STREAM benchmark command generation and result parsing."""
        benchmark_name = 'cpu-stream'
        (benchmark_class,
         predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CPU)
        assert (benchmark_class)

        cores_list = [
            0, 4, 8, 12, 16, 20, 24, 28, 30, 34, 38, 42, 46, 50, 54, 58, 60, 64, 68, 72, 76, 80, 84, 88, 90, 94, 98,
            102, 106, 110, 114, 118
        ]
        # Derive the command-line form from the list so the two cannot drift apart.
        cores = ' '.join(str(core) for core in cores_list)
        arch = 'zen3'
        parameters = '--cpu_arch ' + arch + ' --cores ' + cores
        benchmark = benchmark_class(benchmark_name, parameters=parameters)

        # Check basic information
        assert (benchmark)
        ret = benchmark._preprocess()
        assert (ret is True)
        assert (benchmark.return_code == ReturnCode.SUCCESS)
        assert (benchmark.name == benchmark_name)
        assert (benchmark.type == BenchmarkType.MICRO)

        # Check parameters specified in BenchmarkContext.
        assert (benchmark._args.cores == cores_list)
        assert (benchmark._args.cpu_arch == arch)

        # Check command
        assert (1 == len(benchmark._commands))
        assert ('OMP_PLACES' in benchmark._commands[0])
        assert ('streamZen3.exe' in benchmark._commands[0])

        # Check results
        assert (benchmark._process_raw_result(0, results))
        assert (benchmark.result['return_code'][0] == 0)

        # Expected figures, taken from the rows of tests/data/streamResult.log:
        # (throughput, time_avg, time_min, time_max) per kernel.
        expected = {
            'copy': (342008.3, 0.018755, 0.018713, 0.018895),
            'scale': (342409.6, 0.018737, 0.018691, 0.018802),
            'add': (343827.7, 0.028050, 0.027921, 0.028269),
            'triad': (363208.7, 0.026599, 0.026431, 0.026855),
        }
        suffixes = ['_throughput', '_time_avg', '_time_min', '_time_max']
        for function, values in expected.items():
            for suffix, value in zip(suffixes, values):
                assert (float(benchmark.result[function + suffix][0]) == value)
        assert (int(benchmark.result['threads'][0]) == 32)


if __name__ == '__main__':
    unittest.main()
-------------------------------------------------------------
STREAM version $Revision: 5.10 $
-------------------------------------------------------------
This system uses 8 bytes per array element.
-------------------------------------------------------------
Array size = 400000000 (elements), Offset = 0 (elements)
Memory per array = 3051.8 MiB (= 3.0 GiB).
Total memory required = 9155.3 MiB (= 8.9 GiB).
Each kernel will be executed 10 times.
The *best* time for each kernel (excluding the first iteration)
will be used to compute the reported bandwidth.
-------------------------------------------------------------
Number of Threads requested = 32
Number of Threads counted = 32
-------------------------------------------------------------
Your clock granularity/precision appears to be 1 microseconds.
Each test below will take on the order of 19105 microseconds.
(= 19105 clock ticks)
Increase the size of the arrays if this shows that
you are not getting at least 20 clock ticks per test.
-------------------------------------------------------------
WARNING -- The above is only a rough guideline.
For best results, please be sure you know the
precision of your system timer.
-------------------------------------------------------------
Function Best Rate MB/s Avg time Min time Max time
Copy: 342008.3 0.018755 0.018713 0.018895
Scale: 342409.6 0.018737 0.018691 0.018802
Add: 343827.7 0.028050 0.027921 0.028269
Triad: 363208.7 0.026599 0.026431 0.026855
-------------------------------------------------------------
Solution Validates: avg error less than 1.000000e-13 on all three arrays
-------------------------------------------------------------
......@@ -11,13 +11,13 @@ HPCX_HOME ?= /opt/hpcx
CUDA_VER ?= $(shell nvcc --version | grep 'release' | awk '{print $$6}' | cut -c2- | cut -d '.' -f1-2)
ROCBLAS_BRANCH ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' -f1-3)
# Every target below is a command name, not a file, so all of them are declared phony.
.PHONY: all cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream

# Build all targets.
all: cuda rocm
cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn
rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest
# Platform-independent benchmarks, pulled in by both cuda and rocm.
common: fio cpu_stream
# Create $(SB_MICRO_PATH)/bin and $(SB_MICRO_PATH)/lib, no error if existing, make parent directories as needed.
sb_micro_path:
......@@ -118,3 +118,13 @@ ifneq (,$(wildcard gpu-burn/Makefile))
cp -v ./gpu-burn/gpu_burn $(SB_MICRO_PATH)/bin/
cp -v ./gpu-burn/compare.ptx $(SB_MICRO_PATH)/bin/
endif
# Build STREAM. stream.c is downloaded from the upstream site at build time (only one version exists).
# -O pins the output name so a re-run overwrites stream.c instead of saving stream.c.1 next to a stale copy.
# $(MAKE) is used for the sub-make so that -j/-n and the jobserver are propagated.
cpu_stream: sb_micro_path
ifneq (,$(wildcard stream-tests/Makefile))
	cd ./stream-tests && \
		wget -O stream.c https://www.cs.virginia.edu/stream/FTP/Code/stream.c && \
		$(MAKE) all
	cp -v ./stream-tests/stream*.exe $(SB_MICRO_PATH)/bin/
endif
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
# Compiler and flags shared by every STREAM build; override on the command line if needed.
CC := /opt/AMD/aocc-compiler-4.0.0/bin/clang
CFLAGS := -Ofast -mcmodel=large -mavx2 -ffp-contract=fast -lomp -fopenmp -fnt-store=aggressive -DNTIMES=10

# Per-variant flags: array size and, for the Zen builds, the target micro-architecture.
GENFLAGS := -DSTREAM_ARRAY_SIZE=400000000
ZEN3FLAGS := -DSTREAM_ARRAY_SIZE=400000000 -march=znver3
ZEN4FLAGS := -DSTREAM_ARRAY_SIZE=800000000 -march=znver4

GEN_OUTPUT := streamx86.exe
ZEN3_OUTPUT := streamZen3.exe
ZEN4_OUTPUT := streamZen4.exe

# ZEN3/ZEN4/X86 are kept as convenience aliases for the real binaries.
.PHONY: all ZEN3 ZEN4 X86 clean

all: ZEN3 ZEN4 X86

ZEN3: $(ZEN3_OUTPUT)
ZEN4: $(ZEN4_OUTPUT)
X86: $(GEN_OUTPUT)

# Each binary is a real file target that depends on stream.c, so a missing source fails
# with a clear make error and an up-to-date binary is not rebuilt.
$(ZEN3_OUTPUT): stream.c
	$(CC) $(CFLAGS) $(ZEN3FLAGS) $< -o $@

$(ZEN4_OUTPUT): stream.c
	$(CC) $(CFLAGS) $(ZEN4FLAGS) $< -o $@

$(GEN_OUTPUT): stream.c
	$(CC) $(CFLAGS) $(GENFLAGS) $< -o $@

# $(RM) is `rm -f`, so clean succeeds even when some outputs were never built.
clean:
	$(RM) $(GEN_OUTPUT) $(ZEN3_OUTPUT) $(ZEN4_OUTPUT)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment