Unverified Commit 47949127 authored by pdr, committed by GitHub
Browse files

Dockerfile - Add support for arm64 build (#660)

Add support for arm64 build:

- Update Dockerfile for arm64 build
- Extend CPU stream compilation for Neoverse
- Handle onnxruntime-gpu installation
- Filter third-party builds based on architecture
- Disable CUDA decode perf build for non-x86
parent 59d36f7f
...@@ -28,21 +28,25 @@ jobs: ...@@ -28,21 +28,25 @@ jobs:
- name: cuda12.4 - name: cuda12.4
dockerfile: cuda12.4 dockerfile: cuda12.4
tags: superbench/main:cuda12.4 tags: superbench/main:cuda12.4
platforms: linux/amd64 # TODO: linux/arm64
runner: [self-hosted] runner: [self-hosted]
build_args: "NUM_MAKE_JOBS=16" build_args: "NUM_MAKE_JOBS=16"
- name: cuda12.2 - name: cuda12.2
dockerfile: cuda12.2 dockerfile: cuda12.2
tags: superbench/main:cuda12.2 tags: superbench/main:cuda12.2
platforms: linux/amd64
runner: [self-hosted] runner: [self-hosted]
build_args: "NUM_MAKE_JOBS=16" build_args: "NUM_MAKE_JOBS=16"
- name: cuda11.1.1 - name: cuda11.1.1
dockerfile: cuda11.1.1 dockerfile: cuda11.1.1
tags: superbench/main:cuda11.1.1,superbench/superbench:latest tags: superbench/main:cuda11.1.1,superbench/superbench:latest
platforms: linux/amd64
runner: ubuntu-latest runner: ubuntu-latest
build_args: "NUM_MAKE_JOBS=8" build_args: "NUM_MAKE_JOBS=8"
- name: rocm6.2 - name: rocm6.2
dockerfile: rocm6.2.x dockerfile: rocm6.2.x
tags: superbench/main:rocm6.2 tags: superbench/main:rocm6.2
platforms: linux/amd64
runner: [self-hosted] runner: [self-hosted]
build_args: "NUM_MAKE_JOBS=16" build_args: "NUM_MAKE_JOBS=16"
steps: steps:
...@@ -125,7 +129,7 @@ jobs: ...@@ -125,7 +129,7 @@ jobs:
id: docker_build id: docker_build
uses: docker/build-push-action@v2 uses: docker/build-push-action@v2
with: with:
platforms: linux/amd64 platforms: ${{ matrix.platforms }}
context: . context: .
file: ${{ steps.metadata.outputs.dockerfile }} file: ${{ steps.metadata.outputs.dockerfile }}
push: ${{ github.event_name != 'pull_request' }} push: ${{ github.event_name != 'pull_request' }}
......
...@@ -19,6 +19,7 @@ FROM nvcr.io/nvidia/pytorch:24.03-py3 ...@@ -19,6 +19,7 @@ FROM nvcr.io/nvidia/pytorch:24.03-py3
LABEL maintainer="SuperBench" LABEL maintainer="SuperBench"
ENV DEBIAN_FRONTEND=noninteractive ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y --no-install-recommends \ apt-get install -y --no-install-recommends \
autoconf \ autoconf \
...@@ -60,11 +61,13 @@ RUN apt-get update && \ ...@@ -60,11 +61,13 @@ RUN apt-get update && \
rm -rf /var/lib/apt/lists/* /tmp/* rm -rf /var/lib/apt/lists/* /tmp/*
ARG NUM_MAKE_JOBS= ARG NUM_MAKE_JOBS=
ARG TARGETPLATFORM
ARG TARGETARCH
# Install Docker # Install Docker
ENV DOCKER_VERSION=20.10.8 ENV DOCKER_VERSION=20.10.8
RUN cd /tmp && \ RUN TARGETARCH_HW=$(uname -m) && \
wget -q https://download.docker.com/linux/static/stable/x86_64/docker-${DOCKER_VERSION}.tgz -O docker.tgz && \ wget -q https://download.docker.com/linux/static/stable/${TARGETARCH_HW}/docker-${DOCKER_VERSION}.tgz -O docker.tgz && \
tar --extract --file docker.tgz --strip-components 1 --directory /usr/local/bin/ && \ tar --extract --file docker.tgz --strip-components 1 --directory /usr/local/bin/ && \
rm docker.tgz rm docker.tgz
...@@ -80,40 +83,43 @@ RUN mkdir -p /root/.ssh && \ ...@@ -80,40 +83,43 @@ RUN mkdir -p /root/.ssh && \
# Install OFED # Install OFED
ENV OFED_VERSION=23.07-0.5.1.2 ENV OFED_VERSION=23.07-0.5.1.2
RUN cd /tmp && \ RUN TARGETARCH_HW=$(uname -m) && \
wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-x86_64.tgz && \ cd /tmp && \
tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-x86_64.tgz && \ wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-${TARGETARCH_HW}.tgz && \
MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --without-ucx-cuda --force --all && \ tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-${TARGETARCH_HW}.tgz && \
MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu22.04-${TARGETARCH_HW}/mlnxofedinstall --user-space-only --without-fw-update --without-ucx-cuda --force --all && \
rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}* rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}*
# Install HPC-X # Install HPC-X
ENV HPCX_VERSION=v2.18 ENV HPCX_VERSION=v2.18
RUN cd /opt && \ RUN TARGETARCH_HW=$(uname -m) && \
cd /opt && \
rm -rf hpcx && \ rm -rf hpcx && \
wget https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}/hpcx-${HPCX_VERSION}-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64.tbz -O hpcx.tbz && \ wget https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}/hpcx-${HPCX_VERSION}-gcc-mlnx_ofed-ubuntu22.04-cuda12-${TARGETARCH_HW}.tbz -O hpcx.tbz && \
tar xf hpcx.tbz && \ tar xf hpcx.tbz && \
mv hpcx-${HPCX_VERSION}-gcc-mlnx_ofed-ubuntu22.04-cuda12-x86_64 hpcx && \ mv hpcx-${HPCX_VERSION}-gcc-mlnx_ofed-ubuntu22.04-cuda12-${TARGETARCH_HW} hpcx && \
rm hpcx.tbz rm hpcx.tbz
# Install Intel MLC # Installs specific to amd64 platform
RUN cd /tmp && \ RUN if [ "$TARGETARCH" = "amd64" ]; then \
# Install Intel MLC
cd /tmp && \
wget -q https://downloadmirror.intel.com/793041/mlc_v3.11.tgz -O mlc.tgz && \ wget -q https://downloadmirror.intel.com/793041/mlc_v3.11.tgz -O mlc.tgz && \
tar xzf mlc.tgz Linux/mlc && \ tar xzf mlc.tgz Linux/mlc && \
cp ./Linux/mlc /usr/local/bin/ && \ cp ./Linux/mlc /usr/local/bin/ && \
rm -rf ./Linux mlc.tgz rm -rf ./Linux mlc.tgz && \
# Install AOCC compiler
# Install AOCC compiler
RUN cd /tmp && \
wget https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-4.0.0_1_amd64.deb && \ wget https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-4.0.0_1_amd64.deb && \
apt install -y ./aocc-compiler-4.0.0_1_amd64.deb && \ apt install -y ./aocc-compiler-4.0.0_1_amd64.deb && \
rm -rf aocc-compiler-4.0.0_1_amd64.deb rm -rf aocc-compiler-4.0.0_1_amd64.deb && \
# Install AMD BLIS
# Install AMD BLIS
RUN cd /tmp && \
wget https://download.amd.com/developer/eula/blis/blis-4-0/aocl-blis-linux-aocc-4.0.tar.gz && \ wget https://download.amd.com/developer/eula/blis/blis-4-0/aocl-blis-linux-aocc-4.0.tar.gz && \
tar xzf aocl-blis-linux-aocc-4.0.tar.gz && \ tar xzf aocl-blis-linux-aocc-4.0.tar.gz && \
mv amd-blis /opt/AMD && \ mv amd-blis /opt/AMD && \
rm -rf aocl-blis-linux-aocc-4.0.tar.gz rm -rf aocl-blis-linux-aocc-4.0.tar.gz; \
else \
echo "Skipping Intel MLC, AOCC and AMD Bliss installations for non-amd64 architecture: $TARGETARCH"; \
fi
# Install NCCL 2.23.4 # Install NCCL 2.23.4
RUN cd /tmp && \ RUN cd /tmp && \
......
...@@ -215,8 +215,8 @@ def run(self): ...@@ -215,8 +215,8 @@ def run(self):
], ],
'ort': [ 'ort': [
'onnx>=1.10.2', 'onnx>=1.10.2',
'onnxruntime-gpu==1.10.0; python_version<"3.10"', 'onnxruntime-gpu==1.10.0; python_version<"3.10" and platform_machine == "x86_64"',
'onnxruntime-gpu; python_version>="3.10"', 'onnxruntime-gpu; python_version>="3.10" and platform_machine == "x86_64"',
], ],
'nvidia': ['py3nvml>=0.2.6'], 'nvidia': ['py3nvml>=0.2.6'],
'amd': ['amdsmi'], 'amd': ['amdsmi'],
......
...@@ -23,7 +23,7 @@ def __init__(self, name, parameters=''): ...@@ -23,7 +23,7 @@ def __init__(self, name, parameters=''):
super().__init__(name, parameters) super().__init__(name, parameters)
self._bin_name = 'streamZen3.exe' self._bin_name = 'streamZen3.exe'
self.__cpu_arch = ['other', 'zen3', 'zen4'] self.__cpu_arch = ['other', 'zen3', 'zen4', 'neo2']
def add_parser_arguments(self): def add_parser_arguments(self):
"""Add the specified arguments.""" """Add the specified arguments."""
...@@ -80,6 +80,8 @@ def _preprocess(self): ...@@ -80,6 +80,8 @@ def _preprocess(self):
exe = 'streamZen3.exe' exe = 'streamZen3.exe'
elif self._args.cpu_arch == 'zen4': elif self._args.cpu_arch == 'zen4':
exe = 'streamZen4.exe' exe = 'streamZen4.exe'
elif self._args.cpu_arch == 'neo2':
exe = 'streamNeo2.exe'
else: else:
exe = 'streamx86.exe' exe = 'streamx86.exe'
......
...@@ -4,114 +4,120 @@ ...@@ -4,114 +4,120 @@
cmake_minimum_required(VERSION 3.18) cmake_minimum_required(VERSION 3.18)
project(cuda_decode_performance) project(cuda_decode_performance)
find_package(CUDA QUIET) # Check architecture
if(CUDA_FOUND) if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64")
set(CMAKE_CXX_STANDARD 17) message(WARNING "Skipping Cuda decode Performance build. This build only supports x86_64 arch.")
set(CMAKE_CXX_STANDARD_REQUIRED ON) else()
find_package(CUDA QUIET)
set(THIRD_PARTY_SAMPLE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../../third_party/Video_Codec_SDK/Samples) if(CUDA_FOUND)
set(NVCODEC_PUBLIC_INTERFACE_DIR ${THIRD_PARTY_SAMPLE_DIR}/../Interface) set(CMAKE_CXX_STANDARD 17)
set(NVCODEC_UTILS_DIR ${THIRD_PARTY_SAMPLE_DIR}/Utils) set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(NV_CODEC_DIR ${THIRD_PARTY_SAMPLE_DIR}/NvCodec)
set(NV_DEC_DIR ${THIRD_PARTY_SAMPLE_DIR}/NvCodec/NvDecoder) set(THIRD_PARTY_SAMPLE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../../../../third_party/Video_Codec_SDK/Samples)
set(NVCODEC_PUBLIC_INTERFACE_DIR ${THIRD_PARTY_SAMPLE_DIR}/../Interface)
if(CMAKE_SYSTEM_NAME STREQUAL "Linux") set(NVCODEC_UTILS_DIR ${THIRD_PARTY_SAMPLE_DIR}/Utils)
find_package(PkgConfig REQUIRED) set(NV_CODEC_DIR ${THIRD_PARTY_SAMPLE_DIR}/NvCodec)
pkg_check_modules(PC_AVCODEC REQUIRED IMPORTED_TARGET libavcodec) set(NV_DEC_DIR ${THIRD_PARTY_SAMPLE_DIR}/NvCodec/NvDecoder)
pkg_check_modules(PC_AVFORMAT REQUIRED IMPORTED_TARGET libavformat)
pkg_check_modules(PC_AVUTIL REQUIRED IMPORTED_TARGET libavutil) if(CMAKE_SYSTEM_NAME STREQUAL "Linux")
pkg_check_modules(PC_SWRESAMPLE REQUIRED IMPORTED_TARGET libswresample) find_package(PkgConfig REQUIRED)
pkg_check_modules(PC_AVCODEC REQUIRED IMPORTED_TARGET libavcodec)
set(NV_FFMPEG_HDRS ${PC_AVCODEC_INCLUDE_DIRS}) pkg_check_modules(PC_AVFORMAT REQUIRED IMPORTED_TARGET libavformat)
find_library(AVCODEC_LIBRARY NAMES avcodec pkg_check_modules(PC_AVUTIL REQUIRED IMPORTED_TARGET libavutil)
HINTS pkg_check_modules(PC_SWRESAMPLE REQUIRED IMPORTED_TARGET libswresample)
${PC_AVCODEC_LIBDIR}
${PC_AVCODEC_LIBRARY_DIRS} set(NV_FFMPEG_HDRS ${PC_AVCODEC_INCLUDE_DIRS})
) find_library(AVCODEC_LIBRARY NAMES avcodec
find_library(AVFORMAT_LIBRARY NAMES avformat HINTS
HINTS ${PC_AVCODEC_LIBDIR}
${PC_AVFORMAT_LIBDIR} ${PC_AVCODEC_LIBRARY_DIRS}
${PC_AVFORMAT_LIBRARY_DIRS} )
) find_library(AVFORMAT_LIBRARY NAMES avformat
find_library(AVUTIL_LIBRARY NAMES avutil HINTS
HINTS ${PC_AVFORMAT_LIBDIR}
${PC_AVUTIL_LIBDIR} ${PC_AVFORMAT_LIBRARY_DIRS}
${PC_AVUTIL_LIBRARY_DIRS} )
) find_library(AVUTIL_LIBRARY NAMES avutil
find_library(SWRESAMPLE_LIBRARY NAMES swresample HINTS
HINTS ${PC_AVUTIL_LIBDIR}
${PC_SWRESAMPLE_LIBDIR} ${PC_AVUTIL_LIBRARY_DIRS}
${PC_SWRESAMPLE_LIBRARY_DIRS} )
) find_library(SWRESAMPLE_LIBRARY NAMES swresample
set(AVCODEC_LIB ${AVCODEC_LIBRARY}) HINTS
set(AVFORMAT_LIB ${AVFORMAT_LIBRARY}) ${PC_SWRESAMPLE_LIBDIR}
set(AVUTIL_LIB ${AVUTIL_LIBRARY}) ${PC_SWRESAMPLE_LIBRARY_DIRS}
set(SWRESAMPLE_LIB ${SWRESAMPLE_LIBRARY}) )
endif() set(AVCODEC_LIB ${AVCODEC_LIBRARY})
set(AVFORMAT_LIB ${AVFORMAT_LIBRARY})
set(APP_SOURCES set(AVUTIL_LIB ${AVUTIL_LIBRARY})
${CMAKE_CURRENT_SOURCE_DIR}/AppDecPerf.cpp set(SWRESAMPLE_LIB ${SWRESAMPLE_LIBRARY})
) endif()
set(NV_DEC_SOURCES set(APP_SOURCES
${NV_DEC_DIR}/NvDecoder.cpp ${CMAKE_CURRENT_SOURCE_DIR}/AppDecPerf.cpp
${CMAKE_CURRENT_SOURCE_DIR}/OptimizedNvDecoder.cpp )
)
set(NV_DEC_SOURCES
set(NV_DEC_HDRS ${NV_DEC_DIR}/NvDecoder.cpp
${NV_DEC_DIR}/NvDecoder.h ${CMAKE_CURRENT_SOURCE_DIR}/OptimizedNvDecoder.cpp
${NVCODEC_PUBLIC_INTERFACE_DIR}/cuviddec.h )
${NVCODEC_PUBLIC_INTERFACE_DIR}/nvcuvid.h
${NVCODEC_UTILS_DIR}/NvCodecUtils.h set(NV_DEC_HDRS
${NVCODEC_UTILS_DIR}/FFmpegDemuxer.h ${NV_DEC_DIR}/NvDecoder.h
${CMAKE_CURRENT_SOURCE_DIR}/ThreadPoolUtils.h ${NVCODEC_PUBLIC_INTERFACE_DIR}/cuviddec.h
${CMAKE_CURRENT_SOURCE_DIR}/OptimizedNvDecoder.h ${NVCODEC_PUBLIC_INTERFACE_DIR}/nvcuvid.h
) ${NVCODEC_UTILS_DIR}/NvCodecUtils.h
${NVCODEC_UTILS_DIR}/FFmpegDemuxer.h
source_group( "headers" FILES ${NV_DEC_HDRS} ) ${CMAKE_CURRENT_SOURCE_DIR}/ThreadPoolUtils.h
source_group( "sources" FILES ${APP_SOURCES} ${NV_DEC_SOURCES}) ${CMAKE_CURRENT_SOURCE_DIR}/OptimizedNvDecoder.h
set(CMAKE_LIBRARY_PATH "${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs;${CUDA_TOOLKIT_ROOT_DIR}/lib/stubs;${CUDA_TOOLKIT_ROOT_DIR}/lib64;${CUDA_TOOLKIT_ROOT_DIR}/lib;${CMAKE_LIBRARY_PATH}") )
find_package(CUDA)
set(CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER}) source_group( "headers" FILES ${NV_DEC_HDRS} )
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_50,code=\"sm_50,compute_50\") source_group( "sources" FILES ${APP_SOURCES} ${NV_DEC_SOURCES})
if ( CMAKE_COMPILER_IS_GNUCC ) set(CMAKE_LIBRARY_PATH "${CUDA_TOOLKIT_ROOT_DIR}/lib64/stubs;${CUDA_TOOLKIT_ROOT_DIR}/lib/stubs;${CUDA_TOOLKIT_ROOT_DIR}/lib64;${CUDA_TOOLKIT_ROOT_DIR}/lib;${CMAKE_LIBRARY_PATH}")
if(NOT "${CUDA_NVCC_FLAGS}" MATCHES "-std=c\\+\\+11" ) find_package(CUDA)
list(APPEND CUDA_NVCC_FLAGS -std=c++11) set(CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER})
endif() set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS};-gencode arch=compute_50,code=\"sm_50,compute_50\")
endif() if ( CMAKE_COMPILER_IS_GNUCC )
if(NOT "${CUDA_NVCC_FLAGS}" MATCHES "-std=c\\+\\+11" )
# Check if the file exists list(APPEND CUDA_NVCC_FLAGS -std=c++11)
if (NOT EXISTS "/usr/local/lib/libnvcuvid.so" ) endif()
execute_process( endif()
COMMAND sudo ln -s /usr/lib/x86_64-linux-gnu/libnvcuvid.so.1 /usr/local/lib/libnvcuvid.so
RESULT_VARIABLE result # Check if the file exists
) if (NOT EXISTS "/usr/local/lib/libnvcuvid.so" )
if(result) execute_process(
message(FATAL_ERROR "Failed to create symbolic link for nvcuvid lib: ${result}") COMMAND sudo ln -s /usr/lib/x86_64-linux-gnu/libnvcuvid.so.1 /usr/local/lib/libnvcuvid.so
endif() RESULT_VARIABLE result
endif () )
if(result)
find_library(CUVID_LIB nvcuvid message(FATAL_ERROR "Failed to create symbolic link for nvcuvid lib: ${result}")
HINTS endif()
"/usr/local/lib/" endif ()
"${CMAKE_CURRENT_SOURCE_DIR}/../../../../third_party/Video_Codec_SDK/Lib/linux/stubs/x86_64/"
) find_library(CUVID_LIB nvcuvid
HINTS
cuda_add_executable(${PROJECT_NAME} ${APP_SOURCES} ${NV_DEC_SOURCES} ${NV_DEC_HDRS}) "/usr/local/lib/"
"${CMAKE_CURRENT_SOURCE_DIR}/../../../../third_party/Video_Codec_SDK/Lib/linux/stubs/x86_64/"
set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION ON) )
target_include_directories(${PROJECT_NAME} PUBLIC ${CUDA_INCLUDE_DIRS} cuda_add_executable(${PROJECT_NAME} ${APP_SOURCES} ${NV_DEC_SOURCES} ${NV_DEC_HDRS})
${NVCODEC_PUBLIC_INTERFACE_DIR}
${NVCODEC_UTILS_DIR} set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
${NV_CODEC_DIR}
${NV_APPDEC_COMMON_DIR} target_include_directories(${PROJECT_NAME} PUBLIC ${CUDA_INCLUDE_DIRS}
${NV_FFMPEG_HDRS} ${NVCODEC_PUBLIC_INTERFACE_DIR}
${THIRD_PARTY_SAMPLE_DIR} ${NVCODEC_UTILS_DIR}
) ${NV_CODEC_DIR}
${NV_APPDEC_COMMON_DIR}
target_link_libraries(${PROJECT_NAME} ${CUDA_CUDA_LIBRARY} ${CMAKE_DL_LIBS} ${CUVID_LIB} ${AVCODEC_LIB} ${NV_FFMPEG_HDRS}
${AVFORMAT_LIB} ${AVUTIL_LIB} ${SWRESAMPLE_LIB}) ${THIRD_PARTY_SAMPLE_DIR}
)
install(TARGETS ${PROJECT_NAME} RUNTIME DESTINATION bin LIBRARY DESTINATION lib)
target_link_libraries(${PROJECT_NAME} ${CUDA_CUDA_LIBRARY} ${CMAKE_DL_LIBS} ${CUVID_LIB} ${AVCODEC_LIB}
${AVFORMAT_LIB} ${AVUTIL_LIB} ${SWRESAMPLE_LIB})
install(TARGETS ${PROJECT_NAME} RUNTIME DESTINATION bin LIBRARY DESTINATION lib)
endif()
endif() endif()
...@@ -18,14 +18,19 @@ NUM_MAKE_JOBS ?= $(shell nproc --ignore=2) ...@@ -18,14 +18,19 @@ NUM_MAKE_JOBS ?= $(shell nproc --ignore=2)
.PHONY: all cuda_with_msccl cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed apex_rocm .PHONY: all cuda_with_msccl cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed apex_rocm
# Build all targets. # Build targets.
all: cuda rocm all: cuda rocm
cuda_with_msccl: cuda cuda_msccl cuda_with_msccl: cuda cuda_msccl
cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed
rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed apex_rocm rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed apex_rocm
cpu: common cpu_perftest cpu: common cpu_perftest cpu_stream
common: cpu_hpl cpu_stream fio common: fio
# non aarch64 specific targets
ifneq ($(shell uname -m), aarch64)
common: fio cpu_hpl
directx_amd: directx_amf_encoding_latency directx_amd: directx_amf_encoding_latency
endif
# Create $(SB_MICRO_PATH)/bin and $(SB_MICRO_PATH)/lib, no error if existing, make parent directories as needed. # Create $(SB_MICRO_PATH)/bin and $(SB_MICRO_PATH)/lib, no error if existing, make parent directories as needed.
sb_micro_path: sb_micro_path:
...@@ -59,7 +64,7 @@ else ...@@ -59,7 +64,7 @@ else
endif endif
if [ -d cuda-samples ]; then rm -rf cuda-samples; fi if [ -d cuda-samples ]; then rm -rf cuda-samples; fi
git clone -b v$(CUDA_VER) https://github.com/NVIDIA/cuda-samples.git git clone -b v$(CUDA_VER) https://github.com/NVIDIA/cuda-samples.git
cd ./$(TEST_PATH) && make clean && make TARGET_ARCH=x86_64 SMS=$(ARCHS) cd ./$(TEST_PATH) && make clean && make SMS=$(ARCHS)
cp -v ./$(TEST_PATH)/bandwidthTest $(SB_MICRO_PATH)/bin/ cp -v ./$(TEST_PATH)/bandwidthTest $(SB_MICRO_PATH)/bin/
# Build nccl-tests from commit 8274cb4 of default branch. # Build nccl-tests from commit 8274cb4 of default branch.
......
# Copyright (c) Microsoft Corporation. # Copyright (c) Microsoft Corporation.
# Licensed under the MIT license. # Licensed under the MIT license.
CC= /opt/AMD/aocc-compiler-4.0.0/bin/clang GENFLAGS := -DSTREAM_ARRAY_SIZE=400000000
CFLAGS= -Ofast -mcmodel=large -mavx2 -ffp-contract=fast -lomp -fopenmp -fnt-store=aggressive -DNTIMES=10 ZEN3FLAGS := -DSTREAM_ARRAY_SIZE=400000000 -march=znver3
GENFLAGS= -DSTREAM_ARRAY_SIZE=400000000 ZEN4FLAGS := -DSTREAM_ARRAY_SIZE=800000000 -march=znver4
ZEN3FLAGS= -DSTREAM_ARRAY_SIZE=400000000 -march=znver3 NEO2FLAGS := -DSTREAM_ARRAY_SIZE=120000000 -mcpu=neoverse-v2
ZEN4FLAGS= -DSTREAM_ARRAY_SIZE=800000000 -march=znver4
GEN_OUTPUT= streamx86.exe
ZEN3_OUTPUT= streamZen3.exe
ZEN4_OUTPUT= streamZen4.exe
GEN_OUTPUT := streamx86.exe
ZEN3_OUTPUT := streamZen3.exe
ZEN4_OUTPUT := streamZen4.exe
NEO2_OUTPUT := streamNeo2.exe
ARCH := $(shell uname -m)
ifeq ($(ARCH), aarch64)
CFLAGS := -Ofast -fopenmp -DNTIMES=200
CC := gcc
all: NEO2
else
CC := /opt/AMD/aocc-compiler-4.0.0/bin/clang
CFLAGS := -Ofast -mcmodel=large -mavx2 -ffp-contract=fast -lomp -fopenmp -fnt-store=aggressive -DNTIMES=10
all: ZEN3 ZEN4 X86 all: ZEN3 ZEN4 X86
endif
ZEN3: stream.c ZEN3: stream.c
$(CC) $(CFLAGS) $(ZEN3FLAGS) stream.c -o $(ZEN3_OUTPUT) $(CC) $(CFLAGS) $(ZEN3FLAGS) stream.c -o $(ZEN3_OUTPUT)
...@@ -18,6 +29,13 @@ ZEN4: ...@@ -18,6 +29,13 @@ ZEN4:
$(CC) $(CFLAGS) $(ZEN4FLAGS) stream.c -o $(ZEN4_OUTPUT) $(CC) $(CFLAGS) $(ZEN4FLAGS) stream.c -o $(ZEN4_OUTPUT)
X86: X86:
$(CC) $(CFLAGS) $(GENFLAGS) stream.c -o $(GEN_OUTPUT) $(CC) $(CFLAGS) $(GENFLAGS) stream.c -o $(GEN_OUTPUT)
NEO2:
$(CC) $(CFLAGS) $(NEO2FLAGS) stream.c -o $(NEO2_OUTPUT)
ifeq ($(ARCH), aarch64)
clean:
rm $(NEO2_OUTPUT)
else
clean: clean:
rm $(GEN_OUTPUT) $(ZEN3_OUTPUT) $(ZEN4_OUTPUT) rm $(GEN_OUTPUT) $(ZEN3_OUTPUT) $(ZEN4_OUTPUT)
endif
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment