Unverified Commit 60189dd6 authored by WenqingLan1's avatar WenqingLan1 Committed by GitHub
Browse files

Dockerfile - add cuda13.0.dockerfile (#739)



Add support for cuda13.0.
Add cuda13.0.dockerfile.
Add cuda13.0 image building task to github pipeline.
Update GPU STREAM to work with cuda13.0.
Fix data type conversion perf bug in GPU stream.
Update nvbandwidth submodule to be v0.8.
Update perftest submodule to be 4bee61f80d9e268fc97eaf40be00409e91d3a19e
(recent master).

---------
Co-authored-by: Ubuntu <dilipreddi@gmail.com>
Co-authored-by: guoshzhao <guzhao@microsoft.com>
parent 93e9d262
......@@ -26,6 +26,18 @@ jobs:
fail-fast: true
matrix:
include:
- name: cuda13.0-arm64
dockerfile: cuda13.0
tags: superbench/main:cuda13.0-arm64
platforms: linux/arm64
runner: [self-hosted, linux/arm64]
build_args: "NUM_MAKE_JOBS=16"
- name: cuda13.0-amd64
dockerfile: cuda13.0
tags: superbench/main:cuda13.0-amd64
platforms: linux/amd64
runner: [self-hosted, linux/amd64]
build_args: "NUM_MAKE_JOBS=16"
- name: cuda12.8-arm64
dockerfile: cuda12.8
tags: superbench/main:cuda12.8-arm64
......
......@@ -20,7 +20,7 @@
url = https://github.com/netbench/GPCNET.git
[submodule "third_party/gpu-burn"]
path = third_party/gpu-burn
url = https://github.com/wilicc/gpu-burn.git
url = https://github.com/WenqingLan1/gpu-burn.git
[submodule "third_party/msccl"]
path = third_party/msccl
url = https://github.com/Azure/msccl
......
FROM nvcr.io/nvidia/pytorch:25.08-py3
# Versions bundled in (or added by) this image:
# OS:
# - Ubuntu: 24.04
# - OpenMPI: 4.1.9a1
# - Docker Client: 20.10.8 (installed in this dockerfile)
# NVIDIA:
# - CUDA: 13.0.0.044
# - cuDNN: 9.12.0.46
# - cuBLAS: 13.0.0.19
# - NCCL: 2.27.7
# - TransformerEngine: v2.5
# - torch: 2.8.0a0+34c6371d24
# Mellanox:
# - MOFED_VERSION: (installed in this dockerfile)
# - HPC-X: 2.24
# Intel:
# - mlc: ??? (amd64 only)
LABEL maintainer="SuperBench"
# NOTE(review): noninteractive persists into the runtime environment via ENV;
# keep if downstream tooling relies on it, otherwise a build-time ARG would be
# the cleaner choice -- confirm before changing.
ENV DEBIAN_FRONTEND=noninteractive
# Install build tools and runtime utilities in a single layer.
# The package list is kept sorted alphabetically for easy diffing; apt lists
# are removed in the same layer so they never persist in the image.
RUN apt-get update && \
apt-get install -y --no-install-recommends \
autoconf \
automake \
bc \
build-essential \
curl \
dmidecode \
ffmpeg \
git \
iproute2 \
jq \
libaio-dev \
libavcodec-dev \
libavformat-dev \
libavutil-dev \
libboost-program-options-dev \
libcap2 \
libcurl4-openssl-dev \
libncurses-dev \
libnuma-dev \
libpci-dev \
libswresample-dev \
libtool \
lshw \
net-tools \
nlohmann-json3-dev \
openssh-client \
openssh-server \
pciutils \
python3-mpi4py \
rsync \
sudo \
util-linux \
vim \
wget \
&& \
apt-get autoremove -y && \
apt-get clean && \
rm -rf /var/lib/apt/lists/* /tmp/*
# NUM_MAKE_JOBS caps parallel make jobs in later build steps (empty = make default).
ARG NUM_MAKE_JOBS=
ARG TARGETPLATFORM
ARG TARGETARCH
# Install the static Docker client binaries.
# uname -m yields x86_64/aarch64, which matches the download URL layout
# (Docker's TARGETARCH would yield amd64/arm64 and would NOT match).
ENV DOCKER_VERSION=20.10.8
RUN TARGETARCH_HW=$(uname -m) && \
wget -q https://download.docker.com/linux/static/stable/${TARGETARCH_HW}/docker-${DOCKER_VERSION}.tgz -O docker.tgz && \
tar --extract --file docker.tgz --strip-components 1 --directory /usr/local/bin/ && \
rm docker.tgz
# Update system config: enable root SSH login with per-user environments and
# raise open-file limits for benchmark workloads.
# printf is used instead of `echo "a\nb"` because escape expansion in echo is
# shell-dependent (works in dash, not in bash without -e); printf is portable.
RUN mkdir -p /root/.ssh && \
touch /root/.ssh/authorized_keys && \
mkdir -p /var/run/sshd && \
sed -i "s/[# ]*PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config && \
sed -i "s/[# ]*PermitUserEnvironment no/PermitUserEnvironment yes/" /etc/ssh/sshd_config && \
sed -i "s/[# ]*Port.*/Port 22/" /etc/ssh/sshd_config && \
printf '%s\n' "* soft nofile 1048576" "* hard nofile 1048576" >> /etc/security/limits.conf && \
printf '%s\n' "root soft nofile 1048576" "root hard nofile 1048576" >> /etc/security/limits.conf
# Install Mellanox OFED (user-space libraries only; the container relies on
# the host kernel's drivers, so no firmware update is attempted).
ENV OFED_VERSION=24.10-1.1.4.0
RUN TARGETARCH_HW=$(uname -m) && \
cd /tmp && \
wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu24.04-${TARGETARCH_HW}.tgz && \
tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu24.04-${TARGETARCH_HW}.tgz && \
MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu24.04-${TARGETARCH_HW}/mlnxofedinstall --user-space-only --without-fw-update --without-ucx-cuda --force --all && \
rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}*
# Install HPC-X (CUDA 13 build) into /opt/hpcx; any previous install is
# removed first so the move below cannot collide with an existing directory.
ENV HPCX_VERSION=v2.24.1
RUN TARGETARCH_HW=$(uname -m) && \
cd /opt && \
rm -rf hpcx && \
wget https://content.mellanox.com/hpc/hpc-x/${HPCX_VERSION}_cuda13/hpcx-${HPCX_VERSION}-gcc-doca_ofed-ubuntu24.04-cuda13-${TARGETARCH_HW}.tbz -O hpcx.tbz && \
tar xf hpcx.tbz && \
mv hpcx-${HPCX_VERSION}-gcc-doca_ofed-ubuntu24.04-cuda13-${TARGETARCH_HW} hpcx && \
rm hpcx.tbz
# Installs specific to the amd64 platform (Intel MLC, AOCC, AMD BLIS are
# published for x86_64 only). Uses apt-get rather than apt: the apt CLI is
# not script-stable (hadolint DL3027).
RUN if [ "$TARGETARCH" = "amd64" ]; then \
# Install Intel MLC
cd /tmp && \
wget -q https://downloadmirror.intel.com/793041/mlc_v3.11.tgz -O mlc.tgz && \
tar xzf mlc.tgz Linux/mlc && \
cp ./Linux/mlc /usr/local/bin/ && \
rm -rf ./Linux mlc.tgz && \
# Install AOCC compiler
wget https://download.amd.com/developer/eula/aocc-compiler/aocc-compiler-4.0.0_1_amd64.deb && \
apt-get install -y ./aocc-compiler-4.0.0_1_amd64.deb && \
rm -rf aocc-compiler-4.0.0_1_amd64.deb && \
# Install AMD BLIS
wget https://download.amd.com/developer/eula/blis/blis-4-0/aocl-blis-linux-aocc-4.0.tar.gz && \
tar xzf aocl-blis-linux-aocc-4.0.tar.gz && \
mv amd-blis /opt/AMD && \
rm -rf aocl-blis-linux-aocc-4.0.tar.gz; \
else \
echo "Skipping Intel MLC, AOCC and AMD Bliss installations for non-amd64 architecture: $TARGETARCH"; \
fi
# Install UCX with multi-threading support.
# NOTE(review): the tarball is fetched from the v1.18.0-rc1 release tag but is
# named ucx-1.18.0.tar.gz -- confirm this pairing is intentional.
# The source tree and tarball are removed in the same layer so they do not
# bloat the image.
ENV UCX_VERSION=1.18.0
RUN cd /tmp && \
wget -q https://github.com/openucx/ucx/releases/download/v${UCX_VERSION}-rc1/ucx-${UCX_VERSION}.tar.gz && \
tar xzf ucx-${UCX_VERSION}.tar.gz && \
cd ucx-${UCX_VERSION} && \
./contrib/configure-release-mt --prefix=/usr/local && \
make -j ${NUM_MAKE_JOBS} && \
make install && \
rm -rf /tmp/ucx-${UCX_VERSION} /tmp/ucx-${UCX_VERSION}.tar.gz
# Runtime environment: library search paths, SuperBench locations, and
# Ansible settings. PATH is inherited unchanged from the base image, so the
# former no-op `ENV PATH="${PATH}"` declaration is dropped.
ENV LD_LIBRARY_PATH="/usr/local/lib:/usr/local/mpi/lib:${LD_LIBRARY_PATH}" \
SB_HOME=/opt/superbench \
SB_MICRO_PATH=/opt/superbench \
ANSIBLE_DEPRECATION_WARNINGS=FALSE \
ANSIBLE_COLLECTIONS_PATH=/usr/share/ansible/collections
# Persist the environment for SSH sessions and load HPC-X in both interactive
# (bash.bashrc) and login (profile.d) shells.
RUN echo PATH="$PATH" > /etc/environment && \
echo LD_LIBRARY_PATH="$LD_LIBRARY_PATH" >> /etc/environment && \
echo SB_MICRO_PATH="$SB_MICRO_PATH" >> /etc/environment && \
echo "source /opt/hpcx/hpcx-init.sh && hpcx_load" | tee -a /etc/bash.bashrc >> /etc/profile.d/10-hpcx.sh
# Add config files
ADD dockerfile/etc /opt/microsoft/
WORKDIR ${SB_HOME}
# Build third-party CUDA benchmarks before copying the full source tree so
# the (slow) third_party layer stays cached across source-only changes.
ADD third_party third_party
RUN make -C third_party cuda
ADD . .
# Install SuperBench with the NVIDIA worker extras, build the C++ helpers,
# and drop the git metadata to keep the image smaller.
RUN python3 -m pip install --upgrade setuptools==78.1.0 && \
python3 -m pip install --no-cache-dir .[nvworker] && \
make cppbuild && \
make postinstall && \
rm -rf .git
......@@ -38,4 +38,11 @@ if(NOT DEFINED NVCC_ARCHS_SUPPORTED)
if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12.8)
list(APPEND NVCC_ARCHS_SUPPORTED 100)
endif()
if (NOT CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12.9)
list(APPEND NVCC_ARCHS_SUPPORTED 103)
endif()
# CUDA 13.0+ drops support for archs before sm_75, remove them
if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
list(REMOVE_ITEM NVCC_ARCHS_SUPPORTED 53 60 61 70 72)
endif()
endif()
......@@ -171,7 +171,12 @@ void ParseCommandLine(int argc, char *argv[], char *szInputFileName, int &iGpu,
OptimizedNvDecoder *InitOptimizedNvDecoder(int i, const CUdevice &cuDevice, CUcontext &cuContext, bool bSingle,
bool bHost, cudaVideoCodec codec, CUVIDDECODECAPS decodecaps) {
if (!bSingle) {
#if CUDA_VERSION >= 13000
CUctxCreateParams ctxCreateParams = {};
ck(cuCtxCreate(&cuContext, &ctxCreateParams, 0, cuDevice));
#else
ck(cuCtxCreate(&cuContext, 0, cuDevice));
#endif
}
OptimizedNvDecoder *sessionObject = new OptimizedNvDecoder(cuContext, !bHost, codec, decodecaps);
sessionObject->setDecoderSessionID(i);
......@@ -247,7 +252,12 @@ void InitializeContext(std::vector<OptimizedNvDecoder *> &vDec, int iGpu, int nT
std::cout << "GPU in use: " << szDeviceName << std::endl;
CUcontext cuContext = NULL;
#if CUDA_VERSION >= 13000
CUctxCreateParams ctxCreateParams = {};
ck(cuCtxCreate(&cuContext, &ctxCreateParams, 0, cuDevice));
#else
ck(cuCtxCreate(&cuContext, 0, cuDevice));
#endif
CUVIDDECODECAPS decodecaps;
GetDefaultDecoderCaps(decodecaps, codec);
......
......@@ -53,6 +53,7 @@ def __init__(self, name, parameters=''):
# Skip INT4 for Hopper due to no native CUDA/Tensor Cores
self.__kernel_map[9.0] = {k: self.__kernel_map[8.0][k] for k in self.__kernel_map[8.0] if 'int4_tc' not in k}
self.__kernel_map[10.0] = {k: self.__kernel_map[8.0][k] for k in self.__kernel_map[8.0] if 'int4_tc' not in k}
self.__kernel_map[10.3] = {k: self.__kernel_map[8.0][k] for k in self.__kernel_map[8.0] if 'int4_tc' not in k}
self.__parse_logline = [
'gemm,cutlass_simt_dgemm_128x128_8x2', 'gemm,cutlass_simt_sgemm_128x128_8x2',
'gemm,cutlass_simt_hgemm_256x128_8x2', 'gemm,cutlass_tensorop_d884gemm_128x128_16x3',
......
......@@ -9,6 +9,7 @@
#include "gpu_stream.hpp"
#include <cassert>
#include <iostream>
#include <nvml.h>
/**
* @brief Destroys the CUDA events used for benchmarking.
......@@ -103,23 +104,73 @@ template <typename T> int GpuStream::Destroy(std::unique_ptr<BenchArgs<T>> &args
return ret;
}
/**
* @brief Gets the memory clock rate for a CUDA device.
*
* @details This function gets the memory clock rate using the appropriate method
* based on CUDA version: CUDA 12.0+ uses NVML and cudaDeviceGetAttribute as a fallback;
* older CUDA versions use cudaDeviceProp.
*
* @param[in] device_id The ID of the CUDA device.
* @param[in] prop The properties of the CUDA device.
* @return float The memory clock rate in MHz, or -1.0f if retrieval fails.
*/
/**
 * @brief Queries the memory clock rate (in MHz) of the given CUDA device.
 *
 * @details On CUDA 12.0+ the rate is taken from NVML first (actual clock),
 * falling back to cudaDeviceGetAttribute if NVML fails; on older toolkits it
 * comes from the cudaDeviceProp supplied by the caller.
 *
 * @param[in] device_id The ID of the CUDA device.
 * @param[in] prop The properties of the CUDA device (only read for CUDA < 12.0).
 * @return float The memory clock rate in MHz, or -1.0f if retrieval fails.
 */
float GpuStream::GetMemoryClockRate(int device_id, const cudaDeviceProp &prop) {
    // Make the target device current before querying any attributes.
    if (SetGpu(device_id) != 0) {
        return -1.0f;
    }
#if CUDA_VERSION >= 12000
    // Prefer the actual clock reported by NVML.
    float clock_mhz = GetActualMemoryClockRate(device_id);
    if (clock_mhz >= 0.0f) {
        return clock_mhz;
    }
    // NVML failed; fall back to the rated clock from the CUDA runtime.
    int rate_khz = 0;
    cudaError_t err = cudaDeviceGetAttribute(&rate_khz, cudaDevAttrMemoryClockRate, device_id);
    if (err != cudaSuccess) {
        std::cerr << "GetMemoryClockRate::cudaDeviceGetAttribute error: " << err << std::endl;
        return -1.0f;
    }
    return rate_khz / 1000.0f; // kHz -> MHz
#else
    // Older toolkits still expose memoryClockRate (in kHz) via cudaDeviceProp.
    return prop.memoryClockRate / 1000.0f; // kHz -> MHz
#endif
}
/**
* @brief Prints CUDA device information.
*
* @details This function prints the properties of a CUDA device specified by the device ID.
*
* @param[in] device_id The ID of the CUDA device.
* @param[out] prop The properties of the CUDA device.
* @return void
* */
void GpuStream::PrintCudaDeviceInfo(int device_id, const cudaDeviceProp &prop) {
* @param[in] prop The properties of the CUDA device.
* @param[in] memory_clock_mhz The memory clock rate in MHz (or -1.0f if unavailable).
* @param[in] peak_bw The theoretical peak bandwidth in GB/s (or -1.0f if unavailable).
*/
void GpuStream::PrintCudaDeviceInfo(int device_id, const cudaDeviceProp &prop, float memory_clock_mhz, float peak_bw) {
std::cout << "\nDevice " << device_id << ": \"" << prop.name << "\"";
std::cout << " " << prop.multiProcessorCount << " SMs(" << prop.major << "." << prop.minor << ")";
// Compute theoretical bw:
// https://developer.nvidia.com/blog/how-implement-performance-metrics-cuda-cc/#theoretical_bandwidth
std::cout << " Memory: " << prop.memoryClockRate / 1000.0 << "MHz x " << prop.memoryBusWidth
<< "-bit = " << (prop.memoryClockRate / 1000.0) * (prop.memoryBusWidth / 8) * 2 / 1000.0 << " GB/s PEAK ";
if (peak_bw < 0.0f) {
// Bandwidth computation failed (memory_clock_mhz will also be < 0)
std::cout << " Memory: " << prop.memoryBusWidth << "-bit bus, " << prop.totalGlobalMem / (1024 * 1024 * 1024)
<< " GB total (peak BW unavailable)";
} else {
// Both memory_clock_mhz and peak_bw are valid
std::cout << " Memory: " << memory_clock_mhz << "MHz x " << prop.memoryBusWidth << "-bit = " << peak_bw
<< " GB/s PEAK ";
}
std::cout << " ECC is " << (prop.ECCEnabled ? "ON" : "OFF") << std::endl;
}
......@@ -158,11 +209,12 @@ template <typename T> int GpuStream::PrepareValidationBuf(std::unique_ptr<BenchA
}
// Initialize validation buffer
T s = static_cast<T>(scalar);
for (size_t j = 0; j < size; j++) {
args->sub.validation_buf_ptrs[0][j] = static_cast<T>(j % kUInt8Mod);
args->sub.validation_buf_ptrs[1][j] = static_cast<T>(j % kUInt8Mod) * scalar;
args->sub.validation_buf_ptrs[1][j] = static_cast<T>(j % kUInt8Mod) * s;
args->sub.validation_buf_ptrs[2][j] = static_cast<T>(j % kUInt8Mod) + static_cast<T>(j % kUInt8Mod);
args->sub.validation_buf_ptrs[3][j] = static_cast<T>(j % kUInt8Mod) + static_cast<T>(j % kUInt8Mod) * scalar;
args->sub.validation_buf_ptrs[3][j] = static_cast<T>(j % kUInt8Mod) + static_cast<T>(j % kUInt8Mod) * s;
}
return 0;
}
......@@ -404,7 +456,7 @@ int GpuStream::RunStreamKernel(std::unique_ptr<BenchArgs<T>> &args, Kernel kerne
case Kernel::kScale:
ScaleKernel<<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>(
reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[2].get()),
reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[0].get()), scalar);
reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[0].get()), static_cast<T>(scalar));
args->sub.kernel_name = "SCALE";
break;
case Kernel::kAdd:
......@@ -419,7 +471,7 @@ int GpuStream::RunStreamKernel(std::unique_ptr<BenchArgs<T>> &args, Kernel kerne
TriadKernel<<<num_thread_blocks, num_threads_per_block, 0, args->sub.stream>>>(
reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[2].get()),
reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[0].get()),
reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[1].get()), scalar);
reinterpret_cast<T *>(args->sub.gpu_buf_ptrs[1].get()), static_cast<T>(scalar));
size_factor = 3;
args->sub.kernel_name = "TRIAD";
break;
......@@ -458,6 +510,38 @@ int GpuStream::RunStreamKernel(std::unique_ptr<BenchArgs<T>> &args, Kernel kerne
return 0;
}
float GpuStream::GetActualMemoryClockRate(int gpu_id) {
nvmlReturn_t result;
nvmlDevice_t device;
unsigned int clock_mhz;
// Initialize NVML
result = nvmlInit();
if (result != NVML_SUCCESS) {
std::cerr << "Failed to initialize NVML: " << nvmlErrorString(result) << std::endl;
return -1.0f;
}
// Get device handle
result = nvmlDeviceGetHandleByIndex(gpu_id, &device);
if (result != NVML_SUCCESS) {
std::cerr << "Failed to get device handle: " << nvmlErrorString(result) << std::endl;
nvmlShutdown();
return -1.0f;
}
// Get memory clock rate
result = nvmlDeviceGetClockInfo(device, NVML_CLOCK_MEM, &clock_mhz);
if (result != NVML_SUCCESS) {
std::cerr << "Failed to get memory clock: " << nvmlErrorString(result) << std::endl;
nvmlShutdown();
return -1.0f;
}
nvmlShutdown();
return static_cast<float>(clock_mhz);
}
/**
* @brief Runs the benchmark for various kernels and processes the results for a BenchArgs config.
*
......@@ -470,13 +554,11 @@ int GpuStream::RunStreamKernel(std::unique_ptr<BenchArgs<T>> &args, Kernel kerne
*
* @return int The status code indicating success or failure of the benchmark execution.
* */
template <typename T> int GpuStream::RunStream(std::unique_ptr<BenchArgs<T>> &args, const std::string &data_type) {
template <typename T>
int GpuStream::RunStream(std::unique_ptr<BenchArgs<T>> &args, const std::string &data_type, float peak_bw) {
int ret = 0;
ret = PrepareBufAndStream<T>(args);
float peak_bw =
args->gpu_device_prop.memoryClockRate / 1000.0 * args->gpu_device_prop.memoryBusWidth / 8 * 2 / 1000.0;
if (ret != 0) {
return DestroyBufAndStream(args);
}
......@@ -508,8 +590,13 @@ template <typename T> int GpuStream::RunStream(std::unique_ptr<BenchArgs<T>> &ar
for (int j = 0; j < args->sub.times_in_ms[i].size(); j++) {
// Calculate and display bandwidth
double bw = args->size * args->num_loops / args->sub.times_in_ms[i][j] / 1e6;
std::cout << tag << "_block_" << kThreadsPerBlock[j] << "\t" << bw << "\t" << std::fixed
<< std::setprecision(2) << bw / peak_bw * 100 << std::endl;
std::cout << tag << "_block_" << kThreadsPerBlock[j] << "\t" << bw << "\t";
if (peak_bw < 0) { // cannot get peak_bw -> prints -1 for efficiency
std::cout << "-1" << std::endl;
} else {
std::cout << std::fixed << std::setprecision(2) << bw / peak_bw * 100 << std::endl;
}
}
}
// cleanup buffer and streams for the curr arg
......@@ -567,7 +654,20 @@ int GpuStream::Run() {
for (auto &variant_args : bench_args_) {
std::visit(
[&](auto &curr_args) {
PrintCudaDeviceInfo(curr_args->gpu_id, curr_args->gpu_device_prop);
// Get memory clock rate once for both bandwidth computation and display
float memory_clock_mhz = GetMemoryClockRate(curr_args->gpu_id, curr_args->gpu_device_prop);
// Compute theoretical bandwidth using the memory clock rate
float peak_bw = -1.0f;
if (memory_clock_mhz > 0.0f) {
// Calculate theoretical bandwidth: memory_clock_mhz * bus_width_bytes * 2 (DDR) / 1000 (convert to
// GB/s)
peak_bw = memory_clock_mhz * (curr_args->gpu_device_prop.memoryBusWidth / 8) * 2 / 1000.0;
}
// Print device info with both the memory clock and peak bandwidth
PrintCudaDeviceInfo(curr_args->gpu_id, curr_args->gpu_device_prop, memory_clock_mhz, peak_bw);
// Set the NUMA node
ret = numa_run_on_node(curr_args->numa_id);
if (ret != 0) {
......@@ -576,11 +676,11 @@ int GpuStream::Run() {
return;
}
// Run the stream benchmark for the configued data
// Run the stream benchmark for the configured data, passing the peak bandwidth
if constexpr (std::is_same_v<std::decay_t<decltype(*curr_args)>, BenchArgs<float>>) {
ret = RunStream<float>(curr_args, "float");
ret = RunStream<float>(curr_args, "float", peak_bw);
} else if constexpr (std::is_same_v<std::decay_t<decltype(*curr_args)>, BenchArgs<double>>) {
ret = RunStream<double>(curr_args, "double");
ret = RunStream<double>(curr_args, "double", peak_bw);
} else {
std::cerr << "Run::Unknown type error" << std::endl;
has_error = true;
......
......@@ -52,10 +52,12 @@ class GpuStream {
// Benchmark functions
template <typename T> int RunStreamKernel(std::unique_ptr<BenchArgs<T>> &, Kernel, int);
template <typename T> int RunStream(std::unique_ptr<BenchArgs<T>> &, const std::string &data_type);
float GetActualMemoryClockRate(int gpu_id);
template <typename T> int RunStream(std::unique_ptr<BenchArgs<T>> &, const std::string &data_type, float peak_bw);
// Helper functions
int GetGpuCount(int *);
int SetGpu(int gpu_id);
void PrintCudaDeviceInfo(int, const cudaDeviceProp &);
float GetMemoryClockRate(int device_id, const cudaDeviceProp &prop);
void PrintCudaDeviceInfo(int device_id, const cudaDeviceProp &prop, float memory_clock_mhz, float peak_bw);
};
\ No newline at end of file
......@@ -85,7 +85,7 @@ __global__ void CopyKernel(double *tgt, const double *src) {
* @param[in] src The source array containing the data to be scaled.
* @param[in] scalar The scalar value used to scale the source data.
*/
__global__ void ScaleKernel(double *tgt, const double *src, const long scalar) {
__global__ void ScaleKernel(double *tgt, const double *src, const double scalar) {
uint64_t index = blockIdx.x * blockDim.x * kNumLoopUnrollAlias + threadIdx.x;
double val[kNumLoopUnrollAlias];
#pragma unroll
......@@ -137,7 +137,7 @@ __global__ void AddKernel(double *tgt, const double *src_a, const double *src_b)
* @param[in] src_b The second source array containing the second set of operands to be multiplied by the scalar.
* @param[in] scalar The scalar value used in the multiply/add operation.
*/
__global__ void TriadKernel(double *tgt, const double *src_a, const double *src_b, const long scalar) {
__global__ void TriadKernel(double *tgt, const double *src_a, const double *src_b, const double scalar) {
uint64_t index = blockIdx.x * blockDim.x * kNumLoopUnrollAlias + threadIdx.x;
double val_a[kNumLoopUnrollAlias];
double val_b[kNumLoopUnrollAlias];
......
......@@ -14,6 +14,6 @@ template <typename T> inline __device__ void Fetch(T &v, const T *p);
template <typename T> inline __device__ void Store(T *p, const T &v);
__global__ void CopyKernel(double *, const double *);
__global__ void ScaleKernel(double *, const double *, const long);
__global__ void ScaleKernel(double *, const double *, const double);
__global__ void AddKernel(double *, const double *, const double *);
__global__ void TriadKernel(double *, const double *, const double *, const long);
\ No newline at end of file
__global__ void TriadKernel(double *, const double *, const double *, const double);
\ No newline at end of file
......@@ -36,7 +36,7 @@ constexpr int kNumBuffers = 3; // Number
constexpr int kNumValidationBuffers = 4; // Number of validation buffers, one for each kernel
constexpr int kUInt8Mod = 256; // Modulo for unsigned long data type
constexpr std::array<int, 4> kBufferBwMultipliers = {2, 2, 3, 3}; // Buffer multiplier for triad, add kernel
constexpr long scalar = 11; // Scalar for scale, triad kernel
constexpr double scalar = 11.0; // Scalar for scale, triad kernel
// Enum for different kernels
enum class Kernel {
......
......@@ -20,6 +20,7 @@ NUM_MAKE_JOBS ?= $(shell nproc --ignore=2)
# Build targets.
all: cuda rocm
# msccl: api change in cudaStreamUpdateCaptureDependencies
cuda_with_msccl: cuda cuda_msccl
cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn megatron_lm megatron_deepspeed nvbandwidth
rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm_hipblaslt megatron_deepspeed apex_rocm rocm_megatron_lm
......@@ -40,7 +41,11 @@ sb_micro_path:
# Build cutlass.
# for cuda 12.9 and later Build from commit v3.9 (3.9 release commit) for blackwell support
cuda_cutlass:
ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)
ifeq ($(shell echo $(CUDA_VER)">=12.9" | bc -l), 1)
$(eval ARCHS := "100;103")
if [ -d cutlass ]; then rm -rf cutlass; fi
git clone --branch v4.1.0 --depth 1 https://github.com/NVIDIA/cutlass.git && cd cutlass
else ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)
$(eval ARCHS := "90;100")
if [ -d cutlass ]; then rm -rf cutlass; fi
git clone --branch v3.9.2 --depth 1 https://github.com/NVIDIA/cutlass.git && cd cutlass
......@@ -108,7 +113,7 @@ ifneq (,$(wildcard nccl-tests/Makefile))
endif
# Build perftest.
# The version we use is the tag v4.5-0.2.
# The commit we use is 4bee61f80d9e268fc97eaf40be00409e91d3a19e.
cuda_perftest:
ifneq (,$(wildcard perftest/autogen.sh))
cd perftest && ./autogen.sh && ./configure CUDA_H_PATH=/usr/local/cuda/include/cuda.h --prefix=$(SB_MICRO_PATH) && make -j && make install
......@@ -264,7 +269,12 @@ apex_rocm:
# Build MSCCL for CUDA
cuda_msccl: sb_micro_path
ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)
ifeq ($(shell echo $(CUDA_VER)">=12.9" | bc -l), 1)
$(eval ARCHS := 100 103)
if [ -d msccl ]; then rm -rf msccl; fi; \
git clone --single-branch --branch main https://github.com/Azure/msccl.git \
&& git -C msccl checkout 87048bd && git -C msccl submodule update --recursive --init
else ifeq ($(shell echo $(CUDA_VER)">=12.8" | bc -l), 1)
# Get commit 87048bd from msscl to support updated nccl and sm_100
$(eval ARCHS := 75 80 86 89 90 100)
if [ -d msccl ]; then rm -rf msccl; fi; \
......
Subproject commit cab8221b1147e83dd1fea3e42c3fe255254236ff
Subproject commit 565e55b46f9885688ba9737f1600b1f62d47a95e
Subproject commit fb851de841a0b1fb261cbc3a6fe626f17a19ba0f
Subproject commit 66746a3bef61c8c2e12ab34955310da70b9e38cb
Subproject commit dffd1dd8b8a26dad2634a546e7e4d082dc882fbc
Subproject commit 4bee61f80d9e268fc97eaf40be00409e91d3a19e
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment