Commit c4f39919 authored by one

Update DTK dockerfile and microbenchmarks

- Update rocm_common.cmake for CMake>=3.24
- Disable pip build isolation
- Add BabelStream as a submodule
- Update dockerignore
parent 0fdfe4c3
@@ -15,3 +15,8 @@ outputs/
**/.dockerignore
.github/
.azure-pipelines/
# Build directories
**/build/
**/Build/
**/build-*/
@@ -33,3 +33,6 @@
[submodule "third_party/nvbandwidth"]
path = third_party/nvbandwidth
url = https://github.com/NVIDIA/nvbandwidth.git
[submodule "third_party/BabelStream"]
path = third_party/BabelStream
url = https://github.com/UoB-HPC/BabelStream.git
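After pulling this change, the new submodule has to be initialized once before building; the usual git workflow, run from the repository root (a sketch, not part of this diff):

# Fetch the newly added BabelStream submodule
git submodule update --init third_party/BabelStream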
@@ -84,6 +84,7 @@ RUN cd /tmp && \
--with-rocm=${ROCM_PATH} \
--without-knem --without-cuda --without-java && \
make -j $(nproc) && \
rm -rf ${UCX_HOME} && \
make install && \
rm -rf /tmp/ucx-${UCX_VERSION}*
@@ -102,20 +103,33 @@ RUN cd /tmp && \
--enable-mca-no-build=btl-uct \
--enable-prte-prefix-by-default && \
make -j $(nproc) && \
rm -rf ${MPI_HOME} && \
make install && \
ldconfig && \
cd / && \
rm -rf /tmp/openmpi-${OMPI_VERSION}*
# Install Intel MLC
# RUN cd /tmp && \
# wget -q https://downloadmirror.intel.com/866182/mlc_v3.12.tgz -O mlc.tgz && \
# tar xzf mlc.tgz Linux/mlc && \
# cp ./Linux/mlc /usr/local/bin/ && \
# rm -rf ./Linux mlc.tgz
RUN cd /tmp && \
wget -q https://downloadmirror.intel.com/866182/mlc_v3.12.tgz -O mlc.tgz && \
tar xzf mlc.tgz Linux/mlc && \
cp ./Linux/mlc /usr/local/bin/ && \
rm -rf ./Linux mlc.tgz
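As a quick sanity check that the installed mlc binary works inside the container, Intel MLC's standard matrix modes can be run (illustrative, not part of this diff; MLC generally wants root privileges for full results):

# Measure idle latency and peak bandwidth between NUMA nodes
mlc --latency_matrix
mlc --bandwidth_matrix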
# Install AMD SMI Python Library
RUN python3 -m pip install amdsmi==5.7.0
RUN cd /tmp && \
wget -q https://github.com/ROCm/amdsmi/archive/refs/tags/rocm-5.7.0.tar.gz -O amdsmi.tar.gz && \
tar xzf amdsmi.tar.gz --transform 's/amdsmi-rocm-5.7.0/amdsmi/' && \
cd amdsmi && \
cmake -S . -B build && \
cmake --build build -j $(nproc) && \
cmake --install build --prefix ${ROCM_PATH}/ && \
rm -rf amdsmi.tar.gz amdsmi && \
python3 -m pip install amdsmi==5.7.0
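To confirm the freshly built amdsmi Python package is importable, a minimal smoke test might look like this (a hedged sketch; the amdsmi API surface varies across ROCm releases):

# List GPU handles via the amdsmi Python bindings (API names may differ per release)
python3 -c "import amdsmi; amdsmi.amdsmi_init(); print(len(amdsmi.amdsmi_get_processor_handles()), 'GPUs'); amdsmi.amdsmi_shut_down()"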
# Add rocblas-bench to path
RUN ln -s ${ROCM_PATH}/lib/rocblas/benchmark_tool/rocblas-bench ${ROCM_PATH}/bin/ && \
chmod +x ${ROCM_PATH}/bin/rocblas-bench
ENV PATH="${MPI_HOME}/bin:${UCX_HOME}/bin:/opt/superbench/bin:/usr/local/bin${PATH:+:${PATH}}" \
LD_LIBRARY_PATH="${MPI_HOME}/lib:${UCX_HOME}/lib:/usr/lib/x86_64-linux-gnu:/usr/local/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" \
@@ -128,17 +142,30 @@ RUN echo PATH="$PATH" > /etc/environment && \
echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \
echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment
RUN python3 -m pip install --upgrade pip wheel setuptools==65.7 mpi4py
WORKDIR ${SB_HOME}
ADD third_party third_party
RUN make RCCL_HOME=${ROCM_PATH}/rccl ROCM_PATH=${ROCM_PATH} HIP_HOME=${ROCM_PATH}/hip MPI_HOME=${MPI_HOME} -C third_party dtk -o cpu_hpl -o cpu_stream -o megatron_lm -o apex_rocm -o megatron_deepspeed -o rocm_megatron_lm
ADD . .
# ENV USE_HIP_DATATYPE=1
# ENV USE_HIPBLAS_COMPUTETYPE=1
RUN python3 -m pip install .[hgworker] && \
CXX=${ROCM_PATH}/bin/hipcc make cppbuild && \
COPY third_party third_party
RUN --mount=type=bind,from=hyhal,source=/,target=/opt/hyhal \
make \
RCCL_HOME=${ROCM_PATH}/rccl \
ROCM_PATH=${ROCM_PATH} \
HIP_HOME=${ROCM_PATH}/hip \
MPI_HOME=${MPI_HOME} \
-C third_party \
dtk \
-o cpu_hpl \
-o cpu_stream \
-o megatron_lm \
-o apex_rocm \
-o megatron_deepspeed \
-o rocm_megatron_lm
COPY . .
ENV USE_HIP_DATATYPE=1
ENV USE_HIPBLAS_COMPUTETYPE=1
RUN --mount=type=bind,from=hyhal,source=/,target=/opt/hyhal \
python3 -m pip install --upgrade pip wheel setuptools==65.7 mpi4py && \
python3 -m pip install --no-build-isolation .[hgworker] && \
make cppbuild && \
make postinstall
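The RUN --mount=type=bind,from=hyhal steps above require BuildKit and a build context (or earlier build stage) named hyhal; a hypothetical invocation, where the image tag, dockerfile path, and local hyhal path are all assumptions:

# BuildKit only: supply the "hyhal" named context that the bind mounts reference
docker buildx build \
--build-context hyhal=/opt/hyhal \
-f dockerfile/rocm-dtk.dockerfile \
-t superbench:rocm-dtk .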
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
cmake_minimum_required(VERSION 3.18)
cmake_minimum_required(VERSION 3.24)
project(cpu_copy LANGUAGES CXX)
find_package(CUDAToolkit QUIET)
# Cuda environment
if(CUDAToolkit_FOUND)
message(STATUS "Found CUDA: " ${CUDAToolkit_VERSION})
include(../cuda_common.cmake)
add_executable(cpu_copy cpu_copy.cpp)
set_property(TARGET cpu_copy PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED})
target_link_libraries(cpu_copy numa)
else()
# ROCm environment
include(../rocm_common.cmake)
find_package(hip QUIET)
if(hip_FOUND)
message(STATUS "Found ROCm: " ${HIP_VERSION})
# Convert cuda code to hip code in cpp
execute_process(COMMAND hipify-perl -print-stats -o cpu_copy.cpp cpu_copy.cu WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/)
# link hip device lib
add_executable(cpu_copy cpu_copy.cpp)
include(CheckSymbolExists)
check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY)
if(HIP_UNCACHED_MEMORY)
target_compile_definitions(cpu_copy PRIVATE HIP_UNCACHED_MEMORY)
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
target_link_libraries(cpu_copy numa hip::device)
else()
message(FATAL_ERROR "No CUDA or ROCm environment found.")
endif()
endif()
add_executable(cpu_copy cpu_copy.cpp)
target_compile_options(cpu_copy PRIVATE -O2)
target_link_libraries(cpu_copy PRIVATE numa)
install(TARGETS cpu_copy RUNTIME DESTINATION bin)
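The HIP_UNCACHED_MEMORY define produced by the check_symbol_exists probe above is presumably consumed in the source along these lines (a hypothetical sketch, assuming the hipExtMallocWithFlags allocation path available on newer ROCm):

// Sketch: prefer uncached (fine-grained) device memory when the runtime supports it.
hipError_t AllocDeviceBuf(void **ptr, size_t size) {
#ifdef HIP_UNCACHED_MEMORY
    // hipDeviceMallocUncached only exists on newer ROCm, hence the CMake symbol check.
    return hipExtMallocWithFlags(ptr, size, hipDeviceMallocUncached);
#else
    return hipMalloc(ptr, size);
#endif
}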
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
cmake_minimum_required(VERSION 3.18)
cmake_minimum_required(VERSION 3.24)
project(dist_inference LANGUAGES CXX)
@@ -10,39 +10,31 @@ include_directories(SYSTEM ${MPI_INCLUDE_PATH})
find_package(CUDAToolkit QUIET)
# Cuda environment
if(CUDAToolkit_FOUND)
if(CUDAToolkit_FOUND) # CUDA environment
message(STATUS "Found CUDA: " ${CUDAToolkit_VERSION})
include(../cuda_common.cmake)
add_executable(dist_inference dist_inference.cu)
set_property(TARGET dist_inference PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED})
target_link_libraries(dist_inference MPI::MPI_CXX nccl cublasLt)
else()
# ROCm environment
target_link_libraries(dist_inference PRIVATE MPI::MPI_CXX nccl cublasLt)
else() # ROCm environment
include(../rocm_common.cmake)
find_package(hip QUIET)
if(hip_FOUND)
message(STATUS "Found ROCm: " ${HIP_VERSION})
# Convert cuda code to hip code in cpp
execute_process(COMMAND hipify-perl -print-stats -o dist_inference.cpp dist_inference.cu WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/)
hipify_sources(HIP_FILES dist_inference.cu)
add_executable(dist_inference ${HIP_FILES})
target_compile_options(dist_inference PRIVATE -O2)
target_compile_definitions(dist_inference PRIVATE ROCM_USE_FLOAT16=1)
# link hip device lib
add_executable(dist_inference dist_inference.cpp)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -DROCM_USE_FLOAT16=1")
if(DEFINED ENV{USE_HIPBLASLT_DATATYPE})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_HIPBLASLT_DATATYPE=1")
target_compile_definitions(dist_inference PRIVATE USE_HIPBLASLT_DATATYPE=1)
elseif(DEFINED ENV{USE_HIP_DATATYPE})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_HIP_DATATYPE=1")
target_compile_definitions(dist_inference PRIVATE USE_HIP_DATATYPE=1)
endif()
if(DEFINED ENV{USE_HIPBLAS_COMPUTETYPE})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_HIPBLAS_COMPUTETYPE=1")
endif()
target_link_libraries(dist_inference MPI::MPI_CXX rccl hipblaslt hip::device)
else()
message(FATAL_ERROR "No CUDA or ROCm environment found.")
target_compile_definitions(dist_inference PRIVATE USE_HIPBLAS_COMPUTETYPE=1)
endif()
target_link_libraries(dist_inference PRIVATE MPI::MPI_CXX rccl hipblaslt)
endif()
install(TARGETS dist_inference RUNTIME DESTINATION bin)
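Since these datatype switches are read from the environment at configure time rather than passed as cache variables, they must be exported when CMake runs; for example:

# Pick the hipBLASLt datatype path at configure time (read via $ENV{...} above)
USE_HIPBLASLT_DATATYPE=1 cmake -S . -B build
cmake --build build -j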
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
cmake_minimum_required(VERSION 3.18)
cmake_minimum_required(VERSION 3.24)
project(gpu_copy LANGUAGES CXX)
find_package(CUDAToolkit QUIET)
# Cuda environment
if(CUDAToolkit_FOUND)
if(CUDAToolkit_FOUND) # CUDA environment
message(STATUS "Found CUDA: " ${CUDAToolkit_VERSION})
include(../cuda_common.cmake)
add_executable(gpu_copy gpu_copy.cu)
set_property(TARGET gpu_copy PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED})
target_link_libraries(gpu_copy numa)
else()
# ROCm environment
else() # ROCm environment
include(../rocm_common.cmake)
find_package(hip QUIET)
if(hip_FOUND)
message(STATUS "Found ROCm: " ${HIP_VERSION})
# Convert cuda code to hip code in cpp
execute_process(COMMAND hipify-perl -print-stats -o gpu_copy.cpp gpu_copy.cu WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/)
# link hip device lib
add_executable(gpu_copy gpu_copy.cpp)
hipify_sources(HIP_FILES gpu_copy.cu)
add_executable(gpu_copy ${HIP_FILES})
target_compile_options(gpu_copy PRIVATE -O2)
target_link_libraries(gpu_copy PRIVATE numa)
include(CheckSymbolExists)
check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY)
if(HIP_UNCACHED_MEMORY)
target_compile_definitions(gpu_copy PRIVATE HIP_UNCACHED_MEMORY)
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
target_link_libraries(gpu_copy numa hip::device)
else()
message(FATAL_ERROR "No CUDA or ROCm environment found.")
endif()
endif()
install(TARGETS gpu_copy RUNTIME DESTINATION bin)
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
cmake_minimum_required(VERSION 3.18)
project(gpu_stream LANGUAGES CXX)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)
find_package(CUDAToolkit QUIET)
# Source files
set(SOURCES
gpu_stream_test.cpp
gpu_stream_utils.cpp
gpu_stream.cu
gpu_stream_kernels.cu
)
# Cuda environment
if(CUDAToolkit_FOUND)
message(STATUS "Found CUDA: " ${CUDAToolkit_VERSION})
include(../cuda_common.cmake)
add_executable(gpu_stream ${SOURCES})
set_property(TARGET gpu_stream PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED})
target_include_directories(gpu_stream PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
target_link_libraries(gpu_stream numa nvidia-ml)
else()
# TODO: test for ROCm
# ROCm environment
include(../rocm_common.cmake)
find_package(hip QUIET)
if(hip_FOUND)
message(STATUS "Found ROCm: " ${HIP_VERSION})
# Convert cuda code to hip code in cpp
execute_process(COMMAND hipify-perl -print-stats -o gpu_stream.cpp ${SOURCES} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/)
# link hip device lib
add_executable(gpu_stream gpu_stream.cpp)
include(CheckSymbolExists)
check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY)
if(HIP_UNCACHED_MEMORY)
target_compile_definitions(gpu_stream PRIVATE HIP_UNCACHED_MEMORY)
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
target_link_libraries(gpu_stream numa hip::device)
else()
message(FATAL_ERROR "No CUDA or ROCm environment found.")
endif()
endif()
install(TARGETS gpu_stream RUNTIME DESTINATION bin)
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
#pragma once
#include <getopt.h>
#include <iostream>
#include <memory>
#include <variant>
#include <vector>
#include <cuda.h>
#include <cuda_runtime.h>
#include <numa.h>
#include "gpu_stream_kernels.hpp"
#include "gpu_stream_utils.hpp"
// Using defined() inside a macro body is undefined behavior when the macro is
// later expanded in an #if, so evaluate the platform check once here instead.
#if !defined(__HIP_PLATFORM_HCC__) && !defined(__HCC__) && !defined(__HIPCC__)
#define NON_HIP 1
#else
#define NON_HIP 0
#endif
using namespace stream_config;
class GpuStream {
public:
GpuStream() = delete; // Delete default constructor
GpuStream(Opts &) noexcept; // Constructor
~GpuStream() noexcept = default; // Destructor
GpuStream(const GpuStream &) = delete;
GpuStream &operator=(const GpuStream &) = delete;
GpuStream(GpuStream &&) noexcept = default;
GpuStream &operator=(GpuStream &&) noexcept = default;
int Run();
private:
using BenchArgsVariant = std::variant<std::unique_ptr<BenchArgs<double>>>;
std::vector<BenchArgsVariant> bench_args_;
Opts opts_;
// Memory management functions
template <typename T> cudaError_t GpuMallocDataBuf(T **, uint64_t);
template <typename T> int PrepareValidationBuf(std::unique_ptr<BenchArgs<T>> &);
template <typename T> int CheckBuf(std::unique_ptr<BenchArgs<T>> &, int);
template <typename T> int PrepareEvent(std::unique_ptr<BenchArgs<T>> &);
template <typename T> int PrepareBufAndStream(std::unique_ptr<BenchArgs<T>> &);
template <typename T> int DestroyEvent(std::unique_ptr<BenchArgs<T>> &);
template <typename T> int DestroyBufAndStream(std::unique_ptr<BenchArgs<T>> &);
template <typename T> int Destroy(std::unique_ptr<BenchArgs<T>> &);
// Benchmark functions
template <typename T> int RunStreamKernel(std::unique_ptr<BenchArgs<T>> &, Kernel, int);
float GetActualMemoryClockRate(int gpu_id);
template <typename T> int RunStream(std::unique_ptr<BenchArgs<T>> &, const std::string &data_type, float peak_bw);
// Helper functions
int GetGpuCount(int *);
int SetGpu(int gpu_id);
float GetMemoryClockRate(int device_id, const cudaDeviceProp &prop);
void PrintCudaDeviceInfo(int device_id, const cudaDeviceProp &prop, float memory_clock_mhz, float peak_bw);
};
\ No newline at end of file
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
#include "gpu_stream_kernels.hpp"
/**
* @brief Fetches a value from source memory and writes it to a register.
*
* @details This inline device function fetches a value from the specified source memory
* location and writes it to the provided register. The implementation references the following:
* 1) NCCL:
* https://github.com/NVIDIA/nccl/blob/7e515921295adaab72adf56ea71a0fafb0ecb5f3/src/collectives/device/common_kernel.h#L483
* 2) RCCL:
* https://github.com/ROCmSoftwarePlatform/rccl/blob/5c8380ff5b5925cae4bce00b1879a5f930226e8d/src/collectives/device/common_kernel.h#L268
*
* @tparam T The type of the value to fetch.
* @param[out] v The register to write the fetched value to.
* @param[in] p The source memory location to fetch the value from.
*/
template <typename T> inline __device__ void Fetch(T &v, const T *p) {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
v = *p;
#else
if constexpr (std::is_same<T, float>::value) {
asm volatile("ld.volatile.global.f32 %0, [%1];" : "=f"(v) : "l"(p) : "memory");
} else if constexpr (std::is_same<T, double>::value) {
asm volatile("ld.volatile.global.f64 %0, [%1];" : "=d"(v) : "l"(p) : "memory");
}
#endif
}
/**
* @brief Stores a value from register and writes it to target memory.
*
* @details This inline device function stores a value from the provided register
* and writes it to the specified target memory location. The implementation references the following:
* 1) NCCL:
* https://github.com/NVIDIA/nccl/blob/7e515921295adaab72adf56ea71a0fafb0ecb5f3/src/collectives/device/common_kernel.h#L486
* 2) RCCL:
* https://github.com/ROCmSoftwarePlatform/rccl/blob/5c8380ff5b5925cae4bce00b1879a5f930226e8d/src/collectives/device/common_kernel.h#L276
*
* @tparam T The type of the value to store.
* @param[out] p The target memory location to write the value to.
* @param[in] v The register containing the value to be stored.
*/
template <typename T> inline __device__ void Store(T *p, const T &v) {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
*p = v;
#else
if constexpr (std::is_same<T, float>::value) {
asm volatile("st.volatile.global.f32 [%0], %1;" ::"l"(p), "f"(v) : "memory");
} else if constexpr (std::is_same<T, double>::value) {
asm volatile("st.volatile.global.f64 [%0], %1;" ::"l"(p), "d"(v) : "memory");
}
#endif
}
/**
* @brief Performs COPY, a simple copy operation from source to target. b = a
*
* @details This CUDA kernel performs a simple copy operation, copying data from the source array
* to the target array. This is used to measure transfer rates without any arithmetic operations.
*
* @param[out] tgt The target array where data will be copied to.
* @param[in] src The source array from which data will be copied.
*/
__global__ void CopyKernel(double *tgt, const double *src) {
uint64_t index = blockIdx.x * blockDim.x * kNumLoopUnrollAlias + threadIdx.x;
double val[kNumLoopUnrollAlias];
#pragma unroll
for (uint64_t i = 0; i < kNumLoopUnrollAlias; i++)
Fetch(val[i], src + index + i * blockDim.x);
#pragma unroll
for (uint64_t i = 0; i < kNumLoopUnrollAlias; i++)
Store(tgt + index + i * blockDim.x, val[i]);
}
/**
* @brief Performs SCALE, a scaling operation on the source data. b = x * a
*
* @details This CUDA kernel performs a simple arithmetic operation by scaling the source data
* with a given scalar value and storing the result in the target array.
*
* @param[out] tgt The target array where the scaled data will be stored.
* @param[in] src The source array containing the data to be scaled.
* @param[in] scalar The scalar value used to scale the source data.
*/
__global__ void ScaleKernel(double *tgt, const double *src, const double scalar) {
uint64_t index = blockIdx.x * blockDim.x * kNumLoopUnrollAlias + threadIdx.x;
double val[kNumLoopUnrollAlias];
#pragma unroll
for (uint64_t i = 0; i < kNumLoopUnrollAlias; i++)
Fetch(val[i], src + index + i * blockDim.x);
#pragma unroll
for (uint64_t i = 0; i < kNumLoopUnrollAlias; i++) {
val[i] *= scalar;
Store(tgt + index + i * blockDim.x, val[i]);
}
}
/**
* @brief Performs ADD, an addition operation on two source arrays. c = a + b
*
* @details This CUDA kernel adds corresponding elements from two source arrays and stores the result
* in the target array. This operation is used to measure transfer rates with a simple arithmetic addition.
*
* @param[out] tgt The target array where the result of the addition will be stored.
* @param[in] src_a The first source array containing the first set of operands.
* @param[in] src_b The second source array containing the second set of operands.
*/
__global__ void AddKernel(double *tgt, const double *src_a, const double *src_b) {
uint64_t index = blockIdx.x * blockDim.x * kNumLoopUnrollAlias + threadIdx.x;
double val_a[kNumLoopUnrollAlias];
double val_b[kNumLoopUnrollAlias];
#pragma unroll
for (uint64_t i = 0; i < kNumLoopUnrollAlias; i++) {
Fetch(val_a[i], src_a + index + i * blockDim.x);
Fetch(val_b[i], src_b + index + i * blockDim.x);
}
#pragma unroll
for (uint64_t i = 0; i < kNumLoopUnrollAlias; i++) {
val_a[i] += val_b[i];
Store(tgt + index + i * blockDim.x, val_a[i]);
}
}
/**
* @brief Performs TRIAD, fused multiply/add operations on source arrays. a = b + x * c
*
* @details This CUDA kernel performs a fused multiply/add operation by multiplying elements from
* the second source array with a scalar value, adding the result to corresponding elements from
* the first source array, and storing the result in the target array.
*
* @param[out] tgt The target array where the result of the fused multiply/add operation will be stored.
* @param[in] src_a The first source array containing the first set of operands.
* @param[in] src_b The second source array containing the second set of operands to be multiplied by the scalar.
* @param[in] scalar The scalar value used in the multiply/add operation.
*/
__global__ void TriadKernel(double *tgt, const double *src_a, const double *src_b, const double scalar) {
uint64_t index = blockIdx.x * blockDim.x * kNumLoopUnrollAlias + threadIdx.x;
double val_a[kNumLoopUnrollAlias];
double val_b[kNumLoopUnrollAlias];
#pragma unroll
for (uint64_t i = 0; i < kNumLoopUnrollAlias; i++) {
Fetch(val_a[i], src_a + index + i * blockDim.x);
Fetch(val_b[i], src_b + index + i * blockDim.x);
}
#pragma unroll
for (uint64_t i = 0; i < kNumLoopUnrollAlias; i++) {
val_b[i] += (val_a[i] * scalar);
Store(tgt + index + i * blockDim.x, val_b[i]);
}
}
\ No newline at end of file
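For context, the indexing in these kernels assumes each thread moves kNumLoopUnroll elements, so a matching launch sizes the grid as num_elems / (threads * unroll); a hypothetical launch sketch (buffer names, size, and stream are assumptions):

// Each block covers blockDim.x * kNumLoopUnrollAlias elements of the buffer.
uint64_t num_elems = size_in_bytes / sizeof(double);
int threads_per_block = 1024;
uint64_t num_blocks = num_elems / (threads_per_block * kNumLoopUnrollAlias);
CopyKernel<<<num_blocks, threads_per_block, 0, stream>>>(tgt_buf, src_buf);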
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
#pragma once
#include <cuda.h>
#include <cuda_runtime.h>
#include "gpu_stream_utils.hpp"
constexpr auto kNumLoopUnrollAlias = stream_config::kNumLoopUnroll;
// Function declarations
template <typename T> inline __device__ void Fetch(T &v, const T *p);
template <typename T> inline __device__ void Store(T *p, const T &v);
__global__ void CopyKernel(double *, const double *);
__global__ void ScaleKernel(double *, const double *, const double);
__global__ void AddKernel(double *, const double *, const double *);
__global__ void TriadKernel(double *, const double *, const double *, const double);
\ No newline at end of file
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
#include "gpu_stream.hpp"
/**
* @brief Main function and entry of gpu stream benchmark
* @details
* params list:
* num_warm_up: number of warm-up runs
* num_loops: number of timed runs
* size: buffer size in bytes used for the test
* @param argc argument count
* @param argv argument vector
* @return int
*/
int main(int argc, char **argv) {
int ret = 0;
stream_config::Opts opts;
// parse arguments from cmd
ret = stream_config::ParseOpts(argc, argv, &opts);
if (ret != 0) {
return ret;
}
// run the stream benchmark
GpuStream gpu_stream(opts);
ret = gpu_stream.Run();
if (ret != 0) {
return ret;
}
return 0;
}
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
#include "gpu_stream_utils.hpp"
namespace stream_config {
/**
* @brief Converts a kernel index to its corresponding string representation.
*
* @details This function takes an integer representing a kernel index and returns the corresponding
* string representation of the kernel. The mapping between kernel indices and their string representations
* should be defined within the function.
*
* @param[in] kernel_idx The index of the kernel to be converted to a string.
*
* @return std::string The string representation of the kernel.
*/
std::string KernelToString(int kernel_idx) {
switch (kernel_idx) {
case static_cast<int>(Kernel::kCopy):
return "COPY";
case static_cast<int>(Kernel::kScale):
return "SCALE";
case static_cast<int>(Kernel::kAdd):
return "ADD";
case static_cast<int>(Kernel::kTriad):
return "TRIAD";
default:
return "UNKNOWN";
}
}
/**
* @brief Print the usage of this program.
*
* @details This function prints the usage of this program.
*
* @return void.
* */
void PrintUsage() {
std::cout << "Usage: gpu_stream "
<< "--size <size in bytes> "
<< "--num_warm_up <num_warm_up> "
<< "--num_loops <num_loops> "
<< "[--check_data]" << std::endl;
}
/**
* @brief Print the user provided inputs info.
*
* @details This function prints the parsed user-provided inputs of this program.
*
* @param[in] opts The Opts struct that stores the parsed values.
*
* @return void
* */
void PrintInputInfo(Opts &opts) {
std::cout << "STREAM Benchmark" << std::endl;
std::cout << "Buffer size(bytes): " << opts.size << std::endl;
std::cout << "Number of warm up runs: " << opts.num_warm_up << std::endl;
std::cout << "Number of loops: " << opts.num_loops << std::endl;
std::cout << "Check data: " << (opts.check_data ? "Yes" : "No") << std::endl;
}
/**
* @brief Parse the command line options.
*
* @details This function parses the command line options and stores the values in the Opts struct.
*
* @param[in] argc The number of command line options.
* @param[in] argv The command line options.
* @param[out] opts The Opts struct to store the parsed values.
*
* @return int The status code.
* */
int ParseOpts(int argc, char **argv, Opts *opts) {
enum class OptIdx { kSize, kNumWarmUp, kNumLoops, kEnableCheckData };
const struct option options[] = {{"size", required_argument, nullptr, static_cast<int>(OptIdx::kSize)},
{"num_warm_up", required_argument, nullptr, static_cast<int>(OptIdx::kNumWarmUp)},
{"num_loops", required_argument, nullptr, static_cast<int>(OptIdx::kNumLoops)},
{"check_data", no_argument, nullptr, static_cast<int>(OptIdx::kEnableCheckData)}};
int getopt_ret = 0;
int opt_idx = 0;
bool size_specified = true; // size is optional; the default buffer size is used when omitted
bool num_warm_up_specified = false;
bool num_loops_specified = false;
bool parse_err = false;
while (true) {
getopt_ret = getopt_long(argc, argv, "", options, &opt_idx);
if (getopt_ret == -1) {
if (!size_specified || !num_warm_up_specified || !num_loops_specified) {
parse_err = true;
}
break;
} else if (getopt_ret == '?') {
parse_err = true;
break;
}
switch (opt_idx) {
case static_cast<int>(OptIdx::kSize):
if (1 != sscanf(optarg, "%lu", &(opts->size))) {
std::cerr << "Invalid size: " << optarg << std::endl;
parse_err = true;
} else {
size_specified = true;
}
break;
case static_cast<int>(OptIdx::kNumWarmUp):
if (1 != sscanf(optarg, "%lu", &(opts->num_warm_up))) {
std::cerr << "Invalid num_warm_up: " << optarg << std::endl;
parse_err = true;
} else {
num_warm_up_specified = true;
}
break;
case static_cast<int>(OptIdx::kNumLoops):
if (1 != sscanf(optarg, "%lu", &(opts->num_loops))) {
std::cerr << "Invalid num_loops: " << optarg << std::endl;
parse_err = true;
} else {
num_loops_specified = true;
}
break;
case static_cast<int>(OptIdx::kEnableCheckData):
opts->check_data = true;
break;
default:
parse_err = true;
}
if (parse_err) {
break;
}
}
if (parse_err) {
PrintUsage();
return -1;
}
return 0;
}
} // namespace stream_config
unsigned long long getCurrentTimestampInMicroseconds() {
// Get the current time point
auto now = std::chrono::system_clock::now();
// Convert to time since epoch
auto duration = now.time_since_epoch();
// Convert to microseconds
auto microseconds = std::chrono::duration_cast<std::chrono::microseconds>(duration).count();
return static_cast<unsigned long long>(microseconds);
}
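Putting ParseOpts and PrintUsage together, a typical invocation looks like this (values illustrative; --size falls back to the 4 GiB default when omitted):

gpu_stream --size 2147483648 --num_warm_up 20 --num_loops 100 --check_data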
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
#pragma once
#include <array>
#include <chrono>
#include <getopt.h>
#include <iomanip>
#include <iostream>
#include <memory>
#include <string>
#include <vector>
#include <cuda.h>
#include <cuda_runtime.h>
#include <numa.h>
#include <nvml.h>
// Custom deleter for GPU buffers
struct GpuBufferDeleter {
template <typename T> void operator()(T *ptr) const {
if (ptr) {
cudaFree(ptr);
}
}
};
unsigned long long getCurrentTimestampInMicroseconds();
namespace stream_config {
constexpr std::array<int, 4> kThreadsPerBlock = {128, 256, 512, 1024}; // Threads per block
constexpr uint64_t kDefaultBufferSizeInBytes = 4294967296; // Default buffer size (4 GiB)
constexpr int kNumLoopUnroll = 2; // Loop unroll depth in the stream kernels
constexpr int kNumBuffers = 3; // Number of data buffers (three needed by the add and triad kernels)
constexpr int kNumValidationBuffers = 4; // Number of validation buffers, one for each kernel
constexpr int kUInt8Mod = 256; // Modulo for the uint8 data type
constexpr std::array<int, 4> kBufferBwMultipliers = {2, 2, 3, 3}; // Buffers touched per kernel: copy, scale, add, triad
constexpr double scalar = 11.0; // Scalar used by the scale and triad kernels
// Enum for different kernels
enum class Kernel {
kCopy,
kScale,
kAdd,
kTriad,
kCount // Add a count to keep track of the number of enums. Helpful for iterating over enums.
};
// Arguments for each sub benchmark run.
template <typename T> struct SubBenchArgs {
// Unique pointer for GPU buffers
using GpuBufferUniquePtr = std::unique_ptr<T, GpuBufferDeleter>;
// Original data buffer.
T *data_buf = nullptr;
// Buffer to validate the correctness of data transfer.
T *check_buf = nullptr;
// GPU data buffer pointers on the device.
std::vector<GpuBufferUniquePtr> gpu_buf_ptrs;
// Validation buffers for each kernel; order matches the Kernel enum.
std::vector<std::vector<T>> validation_buf_ptrs;
// CUDA stream to be used.
cudaStream_t stream;
// CUDA event to record start time.
cudaEvent_t start_event;
// CUDA event to record end time.
cudaEvent_t end_event;
// Measured kernel times in milliseconds, one vector per kernel.
std::vector<std::vector<float>> times_in_ms;
// Stream Kernel name.
std::string kernel_name;
};
// Arguments for each benchmark run.
template <typename T> struct BenchArgs {
// NUMA node under which the benchmark is done.
uint64_t numa_id = 0;
// GPU ID for device.
int gpu_id = 0;
// GPU device info
cudaDeviceProp gpu_device_prop;
// Data buffer size used.
uint64_t size = kDefaultBufferSizeInBytes;
// Number of warm up rounds to run.
uint64_t num_warm_up = 0;
// Number of loops to run.
uint64_t num_loops = 1;
// Whether to check data after each kernel run.
bool check_data = false;
// Sub-benchmarks in parallel.
SubBenchArgs<T> sub;
};
// Options accepted by this program.
struct Opts {
// Data buffer size for copy benchmark.
uint64_t size = kDefaultBufferSizeInBytes;
// Number of warm up rounds to run.
uint64_t num_warm_up = 0;
// Number of loops to run.
uint64_t num_loops = 0;
// Whether to check data after each kernel run.
bool check_data = false;
};
std::string KernelToString(int); // Function to convert enum to string
int ParseOpts(int, char **, Opts *);
void PrintInputInfo(Opts &);
void PrintUsage();
} // namespace stream_config
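The kBufferBwMultipliers table encodes how many buffer-sized transfers each kernel performs (copy and scale touch two buffers, add and triad three), which is exactly what a bandwidth calculation needs; a sketch of the arithmetic, with the function name assumed:

// Effective bandwidth in GB/s: bytes moved per kernel run divided by measured time.
inline double BandwidthGBps(int kernel_idx, uint64_t size_in_bytes, float time_in_ms) {
    double bytes_moved = static_cast<double>(stream_config::kBufferBwMultipliers[kernel_idx]) * size_in_bytes;
    return bytes_moved / (time_in_ms * 1e6); // bytes / (ms * 1e6) == GB/s
}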
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
cmake_minimum_required(VERSION 3.18)
cmake_minimum_required(VERSION 3.24)
project(kernel_launch_overhead LANGUAGES CXX)
find_package(CUDAToolkit QUIET)
# Cuda environment
if(CUDAToolkit_FOUND)
if(CUDAToolkit_FOUND) # CUDA environment
message(STATUS "Found CUDA: " ${CUDAToolkit_VERSION})
include(../cuda_common.cmake)
add_executable(kernel_launch_overhead kernel_launch.cu)
set_property(TARGET kernel_launch_overhead PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED})
install(TARGETS kernel_launch_overhead RUNTIME DESTINATION bin)
else()
# ROCm environment
else() # ROCm environment
include(../rocm_common.cmake)
find_package(hip QUIET)
if(hip_FOUND)
message(STATUS "Found HIP: " ${HIP_VERSION})
# Convert cuda code to hip code in cpp
execute_process(COMMAND hipify-perl -print-stats -o kernel_launch.cpp kernel_launch.cu WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/)
# link hip device lib
add_executable(kernel_launch_overhead kernel_launch.cpp)
target_link_libraries(kernel_launch_overhead hip::device)
# Install targets
install(TARGETS kernel_launch_overhead RUNTIME DESTINATION bin)
else()
message(FATAL_ERROR "No CUDA or ROCm environment found.")
endif()
hipify_sources(HIP_FILES kernel_launch.cu)
add_executable(kernel_launch_overhead ${HIP_FILES})
endif()
install(TARGETS kernel_launch_overhead RUNTIME DESTINATION bin)
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
enable_language(HIP)
# Set ROCM_PATH
if(NOT DEFINED ENV{ROCM_PATH})
# Run hipconfig -p to get ROCm path
@@ -37,16 +39,53 @@ else()
set(HIP_PATH $ENV{HIP_PATH})
endif()
# Turn off CMAKE_HIP_ARCHITECTURES Feature if cmake version is 3.21+
if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.21.0)
set(CMAKE_HIP_ARCHITECTURES OFF)
endif()
message(STATUS "CMAKE HIP ARCHITECTURES: ${CMAKE_HIP_ARCHITECTURES}")
if(EXISTS ${HIP_PATH})
# Search for hip in common locations
list(APPEND CMAKE_PREFIX_PATH ${HIP_PATH} ${ROCM_PATH} ${ROCM_PATH}/hsa ${ROCM_PATH}/hip ${ROCM_PATH}/share/rocm/cmake/)
set(CMAKE_CXX_COMPILER "${HIP_PATH}/bin/hipcc")
set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH})
set(CMAKE_MODULE_PATH "${HIP_PATH}/lib/cmake/hip" ${CMAKE_MODULE_PATH})
list(APPEND CMAKE_MODULE_PATH
"${HIP_PATH}/cmake"
"${HIP_PATH}/lib/cmake/hip"
)
endif()
function(hipify_sources OUTPUT_VAR_NAME)
if(NOT HIPIFY_TOOL)
find_program(HIPIFY_TOOL hipify-perl PATHS $ENV{ROCM_PATH}/bin)
if(NOT HIPIFY_TOOL)
message(FATAL_ERROR "hipify-perl not found! Cannot translate CUDA to HIP.")
endif()
endif()
set(HIP_SOURCE_EXTS ".hip" ".cpp" ".cc" ".cxx")
set(GENERATED_HIP_FILES "")
foreach(SRC_FILE ${ARGN})
get_filename_component(FILE_ABS ${SRC_FILE} ABSOLUTE)
get_filename_component(FILE_NAME_WE ${SRC_FILE} NAME_WE)
get_filename_component(FILE_EXT ${SRC_FILE} EXT)
if(FILE_EXT STREQUAL ".cu")
set(OUT_EXT ".hip")
else()
set(OUT_EXT ${FILE_EXT})
endif()
set(OUT_FILE "${CMAKE_CURRENT_BINARY_DIR}/${FILE_NAME_WE}${OUT_EXT}")
add_custom_command(
OUTPUT ${OUT_FILE}
COMMAND ${HIPIFY_TOOL} -print-stats -o ${OUT_FILE} ${FILE_ABS}
DEPENDS ${FILE_ABS}
COMMENT "Auto-hipifying ${SRC_FILE}..."
)
if(OUT_EXT IN_LIST HIP_SOURCE_EXTS)
set_source_files_properties(${OUT_FILE} PROPERTIES
COMPILE_OPTIONS "-Wno-unused-result;-Wno-return-type"
LANGUAGE HIP
)
endif()
list(APPEND GENERATED_HIP_FILES ${OUT_FILE})
endforeach()
set(${OUTPUT_VAR_NAME} ${GENERATED_HIP_FILES} PARENT_SCOPE)
endfunction()
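Call sites mirror the benchmark CMakeLists above; a minimal usage sketch (target name hypothetical):

# Translate the CUDA source at build time, then compile the output as HIP
hipify_sources(HIP_FILES my_benchmark.cu)
add_executable(my_benchmark ${HIP_FILES})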
Subproject commit f6ae48de899408cf50c24079417dc71a03dbb5a8
@@ -16,7 +16,7 @@ ROCM_VER ?= $(shell hipconfig -R | grep -oP '\d+\.\d+\.\d+' || echo "0.0.0")
NUM_MAKE_JOBS ?= $(shell nproc --ignore=2)
.PHONY: all cuda_with_msccl cuda rocm dtk common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt megatron_lm megatron_deepspeed apex_rocm nvbandwidth rocm_megatron_lm
.PHONY: all cuda_with_msccl cuda rocm dtk common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest cuda_msccl rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn cpu_stream cpu_hpl directx_amf_encoding_latency directx_amd rocm_hipblaslt rocm_babelstream_hip megatron_lm megatron_deepspeed apex_rocm nvbandwidth rocm_megatron_lm
# Build targets.
all: cuda rocm
@@ -24,7 +24,7 @@ all: cuda rocm
cuda_with_msccl: cuda cuda_msccl
cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed nvbandwidth
rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed apex_rocm rocm_megatron_lm
dtk: common rocm_perftest rocm_rccl_tests megatron_deepspeed apex_rocm rocm_megatron_lm
dtk: common rocm_perftest rocm_rccl_tests rocm_babelstream_hip megatron_deepspeed apex_rocm rocm_megatron_lm
cpu: common cpu_perftest
common: fio cpu_stream
@@ -180,6 +180,18 @@ rocm_bandwidthTest: sb_micro_path
cd ./HIP/samples/1_Utils/hipBusBandwidth/ && mkdir -p build && cd build && cmake .. && make
cp -v ./HIP/samples/1_Utils/hipBusBandwidth/build/hipBusBandwidth $(SB_MICRO_PATH)/bin/
# Build BabelStream hip-stream from submodule tag v5.0.
rocm_babelstream_hip: sb_micro_path
ifneq (,$(wildcard BabelStream/CMakeLists.txt))
cd ./BabelStream && \
cmake -S . -B build \
-DMODEL=hip \
-DCMAKE_CXX_COMPILER=hipcc \
-DCXX_EXTRA_FLAGS="--gpu-max-threads-per-block=1024" && \
cmake --build build -j $(NUM_MAKE_JOBS)
cp -v ./BabelStream/build/hip-stream $(SB_MICRO_PATH)/bin/
endif
# Build GPCNET from commit c56fd9.
gpcnet: sb_micro_path
bash -c "source ${HPCX_HOME}/hpcx-init.sh && hpcx_load && make CC=mpicc -C GPCNET all && hpcx_unload"