Unverified commit 991c0051, authored by Hongtao Zhang, committed by GitHub
Browse files

microbenchmark - CPU Stream Benchmark Revise (#712)



In the current implementation, the CPU‑stream benchmark code renames the
binary before the microbench base class can verify its existence,
causing the default‐binary check to fail.

This PR adds a “default” binary—built with the standard compile
parameters—so that the base class can always find and validate it. Once
the default binary is in place, the CPU‑stream code will rename it as
needed and re‑check its presence before running the benchmark.

This PR also enables the CPU stream benchmark in the default settings.

---------
Co-authored-by: Hongtao Zhang <hongtaozhang@microsoft.com>
parent 431bf19c
...@@ -22,7 +22,7 @@ def __init__(self, name, parameters=''): ...@@ -22,7 +22,7 @@ def __init__(self, name, parameters=''):
""" """
super().__init__(name, parameters) super().__init__(name, parameters)
self._bin_name = 'streamZen3.exe' self._bin_name = 'stream'
self.__cpu_arch = ['other', 'zen3', 'zen4', 'neo2'] self.__cpu_arch = ['other', 'zen3', 'zen4', 'neo2']
def add_parser_arguments(self): def add_parser_arguments(self):
...@@ -32,7 +32,7 @@ def add_parser_arguments(self): ...@@ -32,7 +32,7 @@ def add_parser_arguments(self):
self._parser.add_argument( self._parser.add_argument(
'--cpu_arch', '--cpu_arch',
type=str, type=str,
default='zen4', default='other',
required=False, required=False,
help='The targeted cpu architectures to run \ help='The targeted cpu architectures to run \
STREAM. Default is zen4. Possible values are {}.'.format(' '.join(self.__cpu_arch)) STREAM. Default is zen4. Possible values are {}.'.format(' '.join(self.__cpu_arch))
...@@ -76,17 +76,15 @@ def _preprocess(self): ...@@ -76,17 +76,15 @@ def _preprocess(self):
envar = 'OMP_SCHEDULE=static && OMP_DYNAMIC=false && OMP_MAX_ACTIVE_LEVELS=1 && OMP_STACKSIZE=256M && \ envar = 'OMP_SCHEDULE=static && OMP_DYNAMIC=false && OMP_MAX_ACTIVE_LEVELS=1 && OMP_STACKSIZE=256M && \
OMP_PROC_BIND=true && OMP_NUM_THREADS={} && OMP_PLACES={}'.format(len(self._args.cores), omp_places) OMP_PROC_BIND=true && OMP_NUM_THREADS={} && OMP_PLACES={}'.format(len(self._args.cores), omp_places)
# set the binary name based on cpu architecture
if self._args.cpu_arch == 'zen3': if self._args.cpu_arch == 'zen3':
exe = 'streamZen3.exe' self._bin_name = 'streamZen3'
elif self._args.cpu_arch == 'zen4': elif self._args.cpu_arch == 'zen4':
exe = 'streamZen4.exe' self._bin_name = 'streamZen4'
elif self._args.cpu_arch == 'neo2': elif self._args.cpu_arch == 'neo2':
exe = 'streamNeo2.exe' self._bin_name = 'streamNeo2'
else:
exe = 'streamx86.exe'
command = envar + ' ' + os.path.join(self._args.bin_dir, exe) command = envar + ' ' + os.path.join(self._args.bin_dir, self._bin_name)
self._bin_name = exe
if not self._set_binary_path(): if not self._set_binary_path():
logger.error( logger.error(
......
...@@ -17,7 +17,8 @@ def setUpClass(cls): ...@@ -17,7 +17,8 @@ def setUpClass(cls):
"""Hook method for setting up class fixture before running tests in the class.""" """Hook method for setting up class fixture before running tests in the class."""
super().setUpClass() super().setUpClass()
cls.createMockEnvs(cls) cls.createMockEnvs(cls)
cls.createMockFiles(cls, ['bin/streamZen3.exe']) cls.createMockFiles(cls, ['bin/stream'])
cls.createMockFiles(cls, ['bin/streamZen3'])
return True return True
@decorator.load_data('tests/data/streamResult.log') @decorator.load_data('tests/data/streamResult.log')
......
...@@ -23,12 +23,12 @@ all: cuda rocm ...@@ -23,12 +23,12 @@ all: cuda rocm
cuda_with_msccl: cuda cuda_msccl cuda_with_msccl: cuda cuda_msccl
cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed nvbandwidth cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed nvbandwidth
rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed apex_rocm rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed apex_rocm
cpu: common cpu_perftest cpu_stream cpu: common cpu_perftest
common: fio common: fio cpu_stream
# non aarch64 specific targets # non aarch64 specific targets
ifneq ($(shell uname -m), aarch64) ifneq ($(shell uname -m), aarch64)
common: fio cpu_hpl common: cpu_hpl
directx_amd: directx_amf_encoding_latency directx_amd: directx_amf_encoding_latency
endif endif
...@@ -184,7 +184,7 @@ ifneq (,$(wildcard stream-tests/Makefile)) ...@@ -184,7 +184,7 @@ ifneq (,$(wildcard stream-tests/Makefile))
cd ./stream-tests && \ cd ./stream-tests && \
wget https://www.cs.virginia.edu/stream/FTP/Code/stream.c && \ wget https://www.cs.virginia.edu/stream/FTP/Code/stream.c && \
make all make all
cp -v ./stream-tests/stream*.exe $(SB_MICRO_PATH)/bin/ cp -v ./stream-tests/stream* $(SB_MICRO_PATH)/bin/
endif endif
# Build AMD Encoder Latency Test # Build AMD Encoder Latency Test
......
# Copyright (c) Microsoft Corporation. # Copyright (c) Microsoft Corporation.
# Licensed under the MIT license. # Licensed under the MIT license.
GENFLAGS := -DSTREAM_ARRAY_SIZE=400000000 GENFLAGS := -DSTREAM_ARRAY_SIZE=120000000
ZEN3FLAGS := -DSTREAM_ARRAY_SIZE=400000000 -march=znver3 ZEN3FLAGS := -DSTREAM_ARRAY_SIZE=400000000 -march=znver3
ZEN4FLAGS := -DSTREAM_ARRAY_SIZE=800000000 -march=znver4 ZEN4FLAGS := -DSTREAM_ARRAY_SIZE=800000000 -march=znver4
NEO2FLAGS := -DSTREAM_ARRAY_SIZE=120000000 -mcpu=neoverse-v2 NEO2FLAGS := -DSTREAM_ARRAY_SIZE=120000000 -mcpu=neoverse-v2
GEN_OUTPUT := streamx86.exe GEN_OUTPUT := stream
ZEN3_OUTPUT := streamZen3.exe ZEN3_OUTPUT := streamZen3
ZEN4_OUTPUT := streamZen4.exe ZEN4_OUTPUT := streamZen4
NEO2_OUTPUT := streamNeo2.exe NEO2_OUTPUT := streamNeo2
ALL_TARGETS := GEN
ARCH := $(shell uname -m) ARCH := $(shell uname -m)
# ARM64 build gets NEO2 added
ifeq ($(ARCH), aarch64) ifeq ($(ARCH), aarch64)
CFLAGS := -Ofast -fopenmp -DNTIMES=200 CC := gcc
CC := gcc CFLAGS := -Ofast -fopenmp -DNTIMES=200
all: NEO2 ALL_TARGETS += NEO2
else endif
CC := /opt/AMD/aocc-compiler-4.0.0/bin/clang
CFLAGS := -Ofast -mcmodel=large -mavx2 -ffp-contract=fast -lomp -fopenmp -fnt-store=aggressive -DNTIMES=10 # AMD AOCC clang present? add ZEN3 and ZEN4
all: ZEN3 ZEN4 X86 ifneq ("$(wildcard /opt/AMD/aocc-compiler-4.0.0/bin/clang)","")
CC := /opt/AMD/aocc-compiler-4.0.0/bin/clang
CFLAGS := -Ofast -mcmodel=large -mavx2 -ffp-contract=fast -lomp -fopenmp \
-fnt-store=aggressive -DNTIMES=10
ALL_TARGETS += ZEN3 ZEN4
endif endif
ZEN3: stream.c # the one all: definition
all: $(ALL_TARGETS)
GEN:
$(CC) $(CFLAGS) $(GENFLAGS) stream.c -o $(GEN_OUTPUT)
ZEN3:
$(CC) $(CFLAGS) $(ZEN3FLAGS) stream.c -o $(ZEN3_OUTPUT) $(CC) $(CFLAGS) $(ZEN3FLAGS) stream.c -o $(ZEN3_OUTPUT)
ZEN4: ZEN4:
$(CC) $(CFLAGS) $(ZEN4FLAGS) stream.c -o $(ZEN4_OUTPUT) $(CC) $(CFLAGS) $(ZEN4FLAGS) stream.c -o $(ZEN4_OUTPUT)
X86:
$(CC) $(CFLAGS) $(GENFLAGS) stream.c -o $(GEN_OUTPUT)
NEO2: NEO2:
$(CC) $(CFLAGS) $(NEO2FLAGS) stream.c -o $(NEO2_OUTPUT) $(CC) $(CFLAGS) $(NEO2FLAGS) stream.c -o $(NEO2_OUTPUT)
ifeq ($(ARCH), aarch64) # clean up the generated files
clean: clean:
rm $(NEO2_OUTPUT) rm -f $(GEN_OUTPUT) $(ZEN3_OUTPUT) $(ZEN4_OUTPUT) $(NEO2_OUTPUT)
else
clean:
rm $(GEN_OUTPUT) $(ZEN3_OUTPUT) $(ZEN4_OUTPUT)
endif
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment