Unverified commit c72a105f authored by gilbertlee-amd, committed by GitHub

v1.12 Adding NVIDIA platform support (#10)

parent cc0e9cb4
# Changelog for TransferBench
## v1.12
### Added
- Added support for TransferBench on NVIDIA platforms (via HIP_PLATFORM=nvidia)
- CPU executors on the NVIDIA platform cannot access GPU memory (no large-BAR access)
## v1.11
### Added
- New multi-input / multi-output (MIMO) support. Transfers can now reduce (element-wise summation) multiple input memory arrays
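As a minimal sketch of the MIMO reduce semantics (illustrative only; the function name and pointer layout are assumptions, not the TransferBench implementation): each output receives the element-wise sum of all inputs.
```
// Hypothetical sketch of the MIMO reduce: sum numSrcs inputs element-wise,
// then broadcast the reduced value to every one of numDsts outputs.
#include <cstddef>

void ReduceMimo(float const* const* src, int numSrcs,
                float* const* dst, int numDsts, size_t N)
{
  for (size_t idx = 0; idx < N; ++idx)
  {
    float val = 0.0f;
    for (int i = 0; i < numSrcs; ++i)   // element-wise summation of all inputs
      val += src[i][idx];
    for (int i = 0; i < numDsts; ++i)   // write the reduced value to every output
      dst[i][idx] = val;
  }
}
```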
......
@@ -28,7 +28,7 @@ THE SOFTWARE.
#include <time.h>
#include "Kernels.hpp"
#define TB_VERSION "1.11"
#define TB_VERSION "1.12"
extern char const MemTypeStr[];
extern char const ExeTypeStr[];
......
@@ -29,6 +29,19 @@ THE SOFTWARE.
#define MEMSET_CHAR 75
#define MEMSET_VAL 13323083.0f
#if defined(__NVCC__)
// Define float4 addition operator for NVIDIA platform
__device__ inline float4& operator +=(float4& a, const float4& b)
{
a.x += b.x;
a.y += b.y;
a.z += b.z;
a.w += b.w;
return a;
}
#endif
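For context: HIP's AMD headers provide arithmetic operators for the vector types, but CUDA's `float4` does not, so the overload above is what lets pack-based accumulation compile under nvcc. A small illustrative use (hypothetical names, not from the source):
```
// Illustrative use of the operator defined above: accumulating float4 packs.
__device__ void AccumulatePacks(float4* acc, float4 const* src, int numPacks)
{
  for (int i = 0; i < numPacks; ++i)
    acc[i] += src[i];   // resolves to the operator+= above when built with nvcc
}
```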
// Each subExecutor is provided with subarrays to work on
#define MAX_SRCS 16
#define MAX_DSTS 16
@@ -51,14 +64,14 @@ void CpuReduceKernel(SubExecParam const& p)
if (numSrcs == 0)
{
for (int i = 0; i < numDsts; ++i)
memset((float* __restrict__)p.dst[i], MEMSET_CHAR, p.N * sizeof(float));
memset(p.dst[i], MEMSET_CHAR, p.N * sizeof(float));
}
else if (numSrcs == 1)
{
float const* __restrict__ src = p.src[0];
for (int i = 0; i < numDsts; ++i)
{
memcpy((float* __restrict__)p.dst[i], src, p.N * sizeof(float));
memcpy(p.dst[i], src, p.N * sizeof(float));
}
}
else
@@ -88,7 +101,6 @@ GpuReduceKernel(SubExecParam* params)
SubExecParam& p = params[blockIdx.x];
int const numSrcs = p.numSrcs;
int const numDsts = p.numDsts;
int const numWaves = BLOCKSIZE / WARP_SIZE; // Number of wavefronts per threadblock
int const waveId = threadIdx.x / WARP_SIZE; // Wavefront number
int const threadId = threadIdx.x % WARP_SIZE; // Thread index within wavefront
@@ -177,11 +189,11 @@ GpuReduceKernel(SubExecParam* params)
else
{
for (int i = 0; i < numSrcs; ++i)
val += ((float const* __restrict__)p.src[i])[offset];
val += p.src[i][offset];
}
for (int i = 0; i < numDsts; ++i)
((float* __restrict__)p.dst[i])[offset] = val;
p.dst[i][offset] = val;
}
}
@@ -197,7 +209,6 @@ template <typename FLOAT_TYPE, int UNROLL_FACTOR>
__device__ size_t GpuReduceFuncImpl2(SubExecParam const &p, size_t const offset, size_t const N)
{
int constexpr numFloatsPerPack = sizeof(FLOAT_TYPE) / sizeof(float); // Number of floats handled at a time per thread
int constexpr numWaves = BLOCKSIZE / WARP_SIZE; // Number of wavefronts per threadblock
size_t constexpr loopPackInc = BLOCKSIZE * UNROLL_FACTOR;
size_t constexpr numPacksPerWave = WARP_SIZE * UNROLL_FACTOR;
int const waveId = threadIdx.x / WARP_SIZE; // Wavefront number
......
@@ -3,7 +3,7 @@ ROCM_PATH ?= /opt/rocm
HIPCC=$(ROCM_PATH)/bin/hipcc
EXE=TransferBench
CXXFLAGS = -O3 -I. -lnuma -L$(ROCM_PATH)/hsa/lib -lhsa-runtime64 -ferror-limit=5
CXXFLAGS = -O3 -I. -I$(ROCM_PATH)/hsa/include -lnuma -L$(ROCM_PATH)/hsa/lib -lhsa-runtime64
all: $(EXE)
......
@@ -13,6 +13,14 @@ TransferBench is a simple utility capable of benchmarking simultaneous copies be
If ROCm is installed in a folder other than `/opt/rocm/`, set ROCM_PATH appropriately
## NVIDIA platform support
TransferBench may also be built to run on NVIDIA platforms via HIP, but this requires a HIP-compatible CUDA version to be installed (e.g. CUDA 11.5)
To build:
```
CUDA_PATH=<path_to_CUDA> HIP_PLATFORM=nvidia make
```
## Hints and suggestions
- Running TransferBench with no arguments will display usage instructions and detected topology information
......
@@ -291,7 +291,11 @@ void ExecuteTransfers(EnvVars const& ev,
printf(" DST %0d: %p\n", iDst, transfer.dstMem[iDst]);
}
printf("Hit <Enter> to continue: ");
scanf("%*c");
if (scanf("%*c") != 0)
{
printf("[ERROR] Unexpected input\n");
exit(1);
}
printf("\n");
}
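A brief note on the check above: because `%*c` suppresses assignment, `scanf("%*c")` returns 0 on a successful read and `EOF` only when input ends, so the guard mainly satisfies compilers that warn on an ignored `scanf` result. A minimal alternative that avoids `scanf` entirely (illustrative only, not TransferBench code):
```
// Hypothetical helper: block until <Enter> (or EOF) without using scanf
#include <cstdio>

static void WaitForEnter()
{
  int c;
  do {
    c = getchar();
  } while (c != '\n' && c != EOF);
}
```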
@@ -332,7 +336,11 @@ void ExecuteTransfers(EnvVars const& ev,
if (verbose && ev.useInteractive)
{
printf("Transfers complete. Hit <Enter> to continue: ");
scanf("%*c");
if (scanf("%*c") != 0)
{
printf("[ERROR] Unexpected input\n");
exit(1);
}
printf("\n");
}
@@ -590,6 +598,7 @@ int RemappedIndex(int const origIdx, bool const isCpuType)
void DisplayTopology(bool const outputToCsv)
{
int numCpuDevices = numa_num_configured_nodes();
int numGpuDevices;
HIP_CALL(hipGetDeviceCount(&numGpuDevices));
@@ -648,6 +657,7 @@ void DisplayTopology(bool const outputToCsv)
else
printf(" %5d | ", numCpus);
#if !defined(__NVCC__)
bool isFirst = true;
for (int j = 0; j < numGpuDevices; j++)
{
@@ -658,10 +668,16 @@ void DisplayTopology(bool const outputToCsv)
printf("%d", j);
}
}
#endif
printf("\n");
}
printf("\n");
#if defined(__NVCC__)
// No further topology detection done for NVIDIA platforms
return;
#endif
// Print out detected GPU topology
if (outputToCsv)
{
@@ -691,8 +707,8 @@ void DisplayTopology(bool const outputToCsv)
printf("--------------+------+-------------\n");
}
#if !defined(__NVCC__)
char pciBusId[20];
for (int i = 0; i < numGpuDevices; i++)
{
int const deviceIdx = RemappedIndex(i, false);
@@ -732,6 +748,7 @@ void DisplayTopology(bool const outputToCsv)
else
printf(" %11s | %4d | %d\n", pciBusId, numDeviceCUs, GetClosestNumaNode(deviceIdx));
}
#endif
}
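Since the AMD-specific topology calls are compiled out under `__NVCC__`, the detailed GPU topology table is skipped on NVIDIA. A minimal sketch of a portable per-GPU query that works on either platform, using only public HIP runtime calls and reusing the project's HIP_CALL macro (the helper name is hypothetical, not part of this patch):
```
// Hypothetical portable device summary: PCIe bus ID and CU/SM count per GPU
#include <cstdio>
#include <hip/hip_runtime.h>

void PrintBasicGpuInfo()
{
  int numGpuDevices = 0;
  HIP_CALL(hipGetDeviceCount(&numGpuDevices));
  for (int i = 0; i < numGpuDevices; ++i)
  {
    char pciBusId[20];
    int numDeviceCUs = 0;
    HIP_CALL(hipDeviceGetPCIBusId(pciBusId, sizeof(pciBusId), i));
    HIP_CALL(hipDeviceGetAttribute(&numDeviceCUs, hipDeviceAttributeMultiprocessorCount, i));
    printf(" GPU %02d | %11s | %4d CUs\n", i, pciBusId, numDeviceCUs);
  }
}
```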
void ParseMemType(std::string const& token, int const numCpus, int const numGpus,
@@ -930,11 +947,20 @@ void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPt
// Allocate host-pinned memory (should respect NUMA mem policy)
if (memType == MEM_CPU_FINE)
{
#if defined (__NVCC__)
printf("[ERROR] Fine-grained CPU memory not supported on NVIDIA platform\n");
exit(1);
#else
HIP_CALL(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser));
#endif
}
else if (memType == MEM_CPU)
{
#if defined (__NVCC__)
if (hipHostMalloc((void **)memPtr, numBytes, 0) != hipSuccess)
#else
if (hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser | hipHostMallocNonCoherent) != hipSuccess)
#endif
{
printf("[ERROR] Unable to allocate non-coherent host memory on NUMA node %d\n", devIndex);
exit(1);
@@ -960,8 +986,13 @@ void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPt
}
else if (memType == MEM_GPU_FINE)
{
#if defined (__NVCC__)
printf("[ERROR] Fine-grained GPU memory not supported on NVIDIA platform\n");
exit(1);
#else
HIP_CALL(hipSetDevice(devIndex));
HIP_CALL(hipExtMallocWithFlags((void**)memPtr, numBytes, hipDeviceMallocFinegrained));
#endif
}
else
{
@@ -1044,13 +1075,18 @@ void RunTransfer(EnvVars const& ev, int const iteration,
// In single stream mode, all the threadblocks for this GPU are launched
// Otherwise, just launch the threadblocks associated with this single Transfer
int const numBlocksToRun = ev.useSingleStream ? exeInfo.totalSubExecs : transfer->numSubExecs;
#if defined(__NVCC__)
HIP_CALL(hipEventRecord(startEvent, stream));
GpuKernelTable[ev.gpuKernel]<<<numBlocksToRun, BLOCKSIZE, ev.sharedMemBytes, stream>>>(transfer->subExecParamGpuPtr);
HIP_CALL(hipEventRecord(stopEvent, stream));
#else
hipExtLaunchKernelGGL(GpuKernelTable[ev.gpuKernel],
dim3(numBlocksToRun, 1, 1),
dim3(BLOCKSIZE, 1, 1),
ev.sharedMemBytes, stream,
startEvent, stopEvent,
0, transfer->subExecParamGpuPtr);
#endif
// Synchronize per iteration, unless in single sync mode, in which case
// synchronize during last warmup / last actual iteration
HIP_CALL(hipStreamSynchronize(stream));
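On the NVIDIA path the launch above is bracketed by explicit `hipEventRecord` calls, so the GPU time for the Transfer can then be obtained from the event pair once the stream has synchronized; a minimal sketch of that read-back (the surrounding TransferBench timing code is elided in this hunk, so the variable name is illustrative):
```
// Hypothetical read-back of the elapsed GPU time after synchronization
float gpuDeltaMsec = 0.0f;
HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
```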
@@ -1228,8 +1264,6 @@ double GetPeakBandwidth(EnvVars const& ev, size_t const N,
// Skip bidirectional on same device
if (isBidirectional && srcType == dstType && srcIndex == dstIndex) return 0.0f;
int const initOffset = ev.byteOffset / sizeof(float);
// Prepare Transfers
std::vector<Transfer> transfers(2);
transfers[0].numBytes = transfers[1].numBytes = N * sizeof(float);
@@ -1265,6 +1299,12 @@ double GetPeakBandwidth(EnvVars const& ev, size_t const N,
{
if (transfers[i].exeType == EXE_CPU && ev.numCpusPerNuma[transfers[i].exeIndex] == 0)
return 0;
#if defined(__NVCC__)
// CPU executors cannot access GPU memory directly on the NVIDIA platform
if (transfers[i].exeType == EXE_CPU && (IsGpuType(srcType) || IsGpuType(dstType)))
return 0;
#endif
}
ExecuteTransfers(ev, 0, N, transfers, false);
@@ -1549,6 +1589,9 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int con
{
if (exeList[i].second != srcList[j].second)
{
#if defined(__NVCC__)
isXgmiSrc = false;
#else
uint32_t exeToSrcLinkType, exeToSrcHopCount;
HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(exeList[i].second, false),
RemappedIndex(srcList[j].second, false),
@@ -1556,6 +1599,7 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int con
&exeToSrcHopCount));
isXgmiSrc = (exeToSrcLinkType == HSA_AMD_LINK_INFO_TYPE_XGMI);
if (isXgmiSrc) numHopsSrc = exeToSrcHopCount;
#endif
}
else
{
@@ -1582,6 +1626,9 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int con
{
if (exeList[i].second != dstList[k].second)
{
#if defined(__NVCC__)
isXgmiDst = false;
#else
uint32_t exeToDstLinkType, exeToDstHopCount;
HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(exeList[i].second, false),
RemappedIndex(dstList[k].second, false),
@@ -1589,6 +1636,7 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int con
&exeToDstHopCount));
isXgmiDst = (exeToDstLinkType == HSA_AMD_LINK_INFO_TYPE_XGMI);
if (isXgmiDst) numHopsDst = exeToDstHopCount;
#endif
}
else
{
@@ -1606,6 +1654,12 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int con
// Skip this DST if total XGMI distance (SRC + DST) is greater than max limit
if (ev.sweepXgmiMax >= 0 && (numHopsSrc + numHopsDst) > ev.sweepXgmiMax) continue;
#if defined(__NVCC__)
// On the NVIDIA platform, skip combinations where a CPU executor would access GPU memory
if (IsCpuType(exeList[i].first) && (IsGpuType(srcList[j].first) || IsGpuType(dstList[k].first)))
continue;
#endif
tinfo.dstType = dstList[k].first;
tinfo.dstIndex = dstList[k].second;
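The XGMI hop-count query used above relies on the AMD-only `hipExtGetLinkTypeAndHopCount`, so under `__NVCC__` the sweep simply treats all links as non-XGMI. Purely as an illustration (not used by TransferBench), an NVIDIA-side analogue for ranking GPU pairs could use the portable peer-access queries:
```
// Hypothetical peer-link query that also builds under nvcc; reuses HIP_CALL
#include <cstdio>
#include <hip/hip_runtime.h>

void QueryPeerLink(int srcGpu, int dstGpu)
{
  int canAccessPeer = 0;
  int perfRank      = 0;
  HIP_CALL(hipDeviceCanAccessPeer(&canAccessPeer, srcGpu, dstGpu));
  HIP_CALL(hipDeviceGetP2PAttribute(&perfRank, hipDevP2PAttrPerformanceRank, srcGpu, dstGpu));
  printf("GPU %d -> GPU %d : peer access %s, performance rank %d\n",
         srcGpu, dstGpu, canAccessPeer ? "yes" : "no", perfRank);
}
```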
......
@@ -31,8 +31,15 @@ THE SOFTWARE.
#include <map>
#include <iostream>
#include <sstream>
#include <hip/hip_runtime.h>
#if defined(__NVCC__)
#include <cuda_runtime.h>
#define __builtin_amdgcn_s_memrealtime clock64
#else
#include <hip/hip_ext.h>
#endif
#include <hip/hip_runtime.h>
#include <hsa/hsa_ext_amd.h>
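The `__builtin_amdgcn_s_memrealtime` → `clock64` mapping above lets device-side timestamp code compile unchanged under nvcc; for example (an illustrative helper, not from the source):
```
// Illustrative device-side timestamp helper; with the mapping above it
// compiles to clock64() under __NVCC__ and to the AMD intrinsic otherwise.
__device__ inline unsigned long long WallClock64()
{
  return __builtin_amdgcn_s_memrealtime();
}
```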
// Helper macro for catching HIP errors
......