Unverified commit c72a105f authored by gilbertlee-amd, committed by GitHub

v1.12 Adding NVIDIA platform support (#10)

parent cc0e9cb4
# Changelog for TransferBench
## v1.12
### Added
- Added support for TransferBench on NVIDIA platforms (via HIP_PLATFORM=nvidia)
- CPU executors cannot access GPU memory on the NVIDIA platform (no large-BAR access)
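
For context, the limitation in the last bullet is enforced in this commit by compile-time guards; the sketch below mirrors the check added to GetPeakBandwidth further down in this diff (variable names are taken from that function, not new API):

```
#if defined(__NVCC__)
// On the NVIDIA build, a CPU executor paired with GPU memory is simply skipped,
// since the CPU cannot map device memory without large-BAR access.
if (transfers[i].exeType == EXE_CPU && (IsGpuType(srcType) || IsGpuType(dstType)))
  return 0;
#endif
```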
## v1.11
### Added
- New multi-input / multi-output support (MIMO). Transfers now can reduce (element-wise summation) multiple input memory arrays
...
...@@ -28,7 +28,7 @@ THE SOFTWARE.
#include <time.h>
#include "Kernels.hpp"
#define TB_VERSION "1.12"
extern char const MemTypeStr[];
extern char const ExeTypeStr[];
...
...@@ -29,6 +29,19 @@ THE SOFTWARE.
#define MEMSET_CHAR 75
#define MEMSET_VAL 13323083.0f
#if defined(__NVCC__)
// Define float4 addition operator for NVIDIA platform
__device__ inline float4& operator +=(float4& a, const float4& b)
{
a.x += b.x;
a.y += b.y;
a.z += b.z;
a.w += b.w;
return a;
}
#endif
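
This shim is needed because CUDA's built-in float4 defines no arithmetic operators (the HIP headers supply them for AMD builds). A brief, hypothetical device-side illustration, not part of the commit:

```
// Hypothetical illustration: with the operator above, packed accumulation
// compiles identically under hipcc (AMD) and nvcc (NVIDIA).
__device__ void AccumulateFloat4(float4& sum, float4 const* __restrict__ src, int idx)
{
  sum += src[idx];
}
```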
// Each subExecutor is provided with subarrays to work on
#define MAX_SRCS 16
#define MAX_DSTS 16
...@@ -51,14 +64,14 @@ void CpuReduceKernel(SubExecParam const& p)
  if (numSrcs == 0)
  {
    for (int i = 0; i < numDsts; ++i)
      memset(p.dst[i], MEMSET_CHAR, p.N * sizeof(float));
  }
  else if (numSrcs == 1)
  {
    float const* __restrict__ src = p.src[0];
    for (int i = 0; i < numDsts; ++i)
    {
      memcpy(p.dst[i], src, p.N * sizeof(float));
    }
  }
  else
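
The remaining multi-source branch is collapsed in this view. Per the MIMO description in the v1.11 changelog entry (element-wise summation of multiple inputs), a hedged sketch of what that branch does, assuming the same SubExecParam layout used above:

```
// Hypothetical sketch of the elided multi-source path: reduce all sources
// element-wise, then broadcast the result to every destination.
for (size_t j = 0; j < p.N; ++j)
{
  float sum = 0.0f;
  for (int i = 0; i < numSrcs; ++i)
    sum += p.src[i][j];
  for (int i = 0; i < numDsts; ++i)
    p.dst[i][j] = sum;
}
```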
...@@ -88,7 +101,6 @@ GpuReduceKernel(SubExecParam* params)
  SubExecParam& p = params[blockIdx.x];
  int const numSrcs = p.numSrcs;
  int const numDsts = p.numDsts;
  int const numWaves = BLOCKSIZE / WARP_SIZE; // Number of wavefronts per threadblock
  int const waveId   = threadIdx.x / WARP_SIZE; // Wavefront number
  int const threadId = threadIdx.x % WARP_SIZE; // Thread index within wavefront
...@@ -177,11 +189,11 @@ GpuReduceKernel(SubExecParam* params)
    else
    {
      for (int i = 0; i < numSrcs; ++i)
        val += p.src[i][offset];
    }
    for (int i = 0; i < numDsts; ++i)
      p.dst[i][offset] = val;
  }
}
...@@ -197,7 +209,6 @@ template <typename FLOAT_TYPE, int UNROLL_FACTOR>
__device__ size_t GpuReduceFuncImpl2(SubExecParam const &p, size_t const offset, size_t const N)
{
  int constexpr numFloatsPerPack = sizeof(FLOAT_TYPE) / sizeof(float); // Number of floats handled at a time per thread
  int constexpr numWaves = BLOCKSIZE / WARP_SIZE; // Number of wavefronts per threadblock
  size_t constexpr loopPackInc = BLOCKSIZE * UNROLL_FACTOR;
  size_t constexpr numPacksPerWave = WARP_SIZE * UNROLL_FACTOR;
  int const waveId = threadIdx.x / WARP_SIZE; // Wavefront number
...
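
For concreteness, a worked example of the pack arithmetic in GpuReduceFuncImpl2 above; the concrete values are assumptions for illustration (BLOCKSIZE and WARP_SIZE are defined elsewhere in the source and may differ):

```
// Assumed: FLOAT_TYPE = float4, UNROLL_FACTOR = 4, WARP_SIZE = 64, BLOCKSIZE = 256
//   numFloatsPerPack = sizeof(float4) / sizeof(float) = 4 floats per pack
//   numPacksPerWave  = WARP_SIZE * UNROLL_FACTOR      = 256 packs per wavefront pass
//   loopPackInc      = BLOCKSIZE * UNROLL_FACTOR      = 1024 packs advanced per block iteration
```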
...@@ -3,7 +3,7 @@ ROCM_PATH ?= /opt/rocm
HIPCC=$(ROCM_PATH)/bin/hipcc
EXE=TransferBench
CXXFLAGS = -O3 -I. -I$(ROCM_PATH)/hsa/include -lnuma -L$(ROCM_PATH)/hsa/lib -lhsa-runtime64
all: $(EXE)
...
...@@ -13,6 +13,14 @@ TransferBench is a simple utility capable of benchmarking simultaneous copies be
If ROCm is installed in a folder other than `/opt/rocm/`, set ROCM_PATH appropriately
## NVIDIA platform support
TransferBench can also be built to run on NVIDIA platforms via HIP, but this requires a HIP-compatible CUDA installation (e.g. CUDA 11.5)
To build:
```
CUDA_PATH=<path_to_CUDA> HIP_PLATFORM=nvidia make
```
## Hints and suggestions
- Running TransferBench with no arguments will display usage instructions and detected topology information
...
...@@ -291,7 +291,11 @@ void ExecuteTransfers(EnvVars const& ev,
        printf(" DST %0d: %p\n", iDst, transfer.dstMem[iDst]);
    }
    printf("Hit <Enter> to continue: ");
    if (scanf("%*c") != 0)
    {
      printf("[ERROR] Unexpected input\n");
      exit(1);
    }
    printf("\n");
  }
...@@ -332,7 +336,11 @@ void ExecuteTransfers(EnvVars const& ev,
  if (verbose && ev.useInteractive)
  {
    printf("Transfers complete. Hit <Enter> to continue: ");
    if (scanf("%*c") != 0)
    {
      printf("[ERROR] Unexpected input\n");
      exit(1);
    }
    printf("\n");
  }
...@@ -590,6 +598,7 @@ int RemappedIndex(int const origIdx, bool const isCpuType)
void DisplayTopology(bool const outputToCsv)
{
  int numCpuDevices = numa_num_configured_nodes();
  int numGpuDevices;
  HIP_CALL(hipGetDeviceCount(&numGpuDevices));
...@@ -648,6 +657,7 @@ void DisplayTopology(bool const outputToCsv)
    else
      printf(" %5d | ", numCpus);
#if !defined(__NVCC__)
    bool isFirst = true;
    for (int j = 0; j < numGpuDevices; j++)
    {
...@@ -658,10 +668,16 @@ void DisplayTopology(bool const outputToCsv)
        printf("%d", j);
      }
    }
#endif
    printf("\n");
  }
printf("\n"); printf("\n");
#if defined(__NVCC__)
// No further topology detection done for NVIDIA platforms
return;
#endif
// Print out detected GPU topology // Print out detected GPU topology
if (outputToCsv) if (outputToCsv)
{ {
...@@ -691,8 +707,8 @@ void DisplayTopology(bool const outputToCsv)
    printf("--------------+------+-------------\n");
  }
#if !defined(__NVCC__)
  char pciBusId[20];
  for (int i = 0; i < numGpuDevices; i++)
  {
    int const deviceIdx = RemappedIndex(i, false);
...@@ -732,6 +748,7 @@ void DisplayTopology(bool const outputToCsv)
    else
      printf(" %11s | %4d | %d\n", pciBusId, numDeviceCUs, GetClosestNumaNode(deviceIdx));
  }
#endif
}
void ParseMemType(std::string const& token, int const numCpus, int const numGpus,
...@@ -930,11 +947,20 @@ void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPt
  // Allocate host-pinned memory (should respect NUMA mem policy)
  if (memType == MEM_CPU_FINE)
  {
#if defined (__NVCC__)
    printf("[ERROR] Fine-grained CPU memory not supported on NVIDIA platform\n");
    exit(1);
#else
    HIP_CALL(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser));
#endif
  }
  else if (memType == MEM_CPU)
  {
#if defined (__NVCC__)
    if (hipHostMalloc((void **)memPtr, numBytes, 0) != hipSuccess)
#else
    if (hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser | hipHostMallocNonCoherent) != hipSuccess)
#endif
    {
      printf("[ERROR] Unable to allocate non-coherent host memory on NUMA node %d\n", devIndex);
      exit(1);
...@@ -960,8 +986,13 @@ void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPt
  }
  else if (memType == MEM_GPU_FINE)
  {
#if defined (__NVCC__)
    printf("[ERROR] Fine-grained GPU memory not supported on NVIDIA platform\n");
    exit(1);
#else
    HIP_CALL(hipSetDevice(devIndex));
    HIP_CALL(hipExtMallocWithFlags((void**)memPtr, numBytes, hipDeviceMallocFinegrained));
#endif
  }
  else
  {
...@@ -1044,13 +1075,18 @@ void RunTransfer(EnvVars const& ev, int const iteration,
    // In single stream mode, all the threadblocks for this GPU are launched
    // Otherwise, just launch the threadblocks associated with this single Transfer
    int const numBlocksToRun = ev.useSingleStream ? exeInfo.totalSubExecs : transfer->numSubExecs;
#if defined(__NVCC__)
    HIP_CALL(hipEventRecord(startEvent, stream));
    GpuKernelTable[ev.gpuKernel]<<<numBlocksToRun, BLOCKSIZE, ev.sharedMemBytes, stream>>>(transfer->subExecParamGpuPtr);
    HIP_CALL(hipEventRecord(stopEvent, stream));
#else
    hipExtLaunchKernelGGL(GpuKernelTable[ev.gpuKernel],
                          dim3(numBlocksToRun, 1, 1),
                          dim3(BLOCKSIZE, 1, 1),
                          ev.sharedMemBytes, stream,
                          startEvent, stopEvent,
                          0, transfer->subExecParamGpuPtr);
#endif
    // Synchronize per iteration, unless in single sync mode, in which case
    // synchronize during last warmup / last actual iteration
    HIP_CALL(hipStreamSynchronize(stream));
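
On the NVIDIA path the launch is bracketed by explicit hipEventRecord calls because hipExtLaunchKernelGGL (which accepts the start/stop events directly) is an AMD-only extension. A minimal sketch of how such events are typically read back after synchronization; the exact read-back site is not shown in this hunk:

```
// Hypothetical read-back of the events recorded around the kernel launch.
float gpuDeltaMsec = 0.0f;
HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
// gpuDeltaMsec now holds the kernel execution time in milliseconds.
```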
...@@ -1228,8 +1264,6 @@ double GetPeakBandwidth(EnvVars const& ev, size_t const N,
  // Skip bidirectional on same device
  if (isBidirectional && srcType == dstType && srcIndex == dstIndex) return 0.0f;
int const initOffset = ev.byteOffset / sizeof(float);
  // Prepare Transfers
  std::vector<Transfer> transfers(2);
  transfers[0].numBytes = transfers[1].numBytes = N * sizeof(float);
...@@ -1265,6 +1299,12 @@ double GetPeakBandwidth(EnvVars const& ev, size_t const N,
  {
    if (transfers[i].exeType == EXE_CPU && ev.numCpusPerNuma[transfers[i].exeIndex] == 0)
      return 0;
#if defined(__NVCC__)
// NVIDIA platform cannot access GPU memory directly from CPU executors
if (transfers[i].exeType == EXE_CPU && (IsGpuType(srcType) || IsGpuType(dstType)))
return 0;
#endif
  }
  ExecuteTransfers(ev, 0, N, transfers, false);
...@@ -1549,6 +1589,9 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int con
      {
        if (exeList[i].second != srcList[j].second)
        {
#if defined(__NVCC__)
isXgmiSrc = false;
#else
          uint32_t exeToSrcLinkType, exeToSrcHopCount;
          HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(exeList[i].second, false),
                                                RemappedIndex(srcList[j].second, false),
...@@ -1556,6 +1599,7 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int con
                                                &exeToSrcHopCount));
          isXgmiSrc = (exeToSrcLinkType == HSA_AMD_LINK_INFO_TYPE_XGMI);
          if (isXgmiSrc) numHopsSrc = exeToSrcHopCount;
#endif
        }
        else
        {
...@@ -1582,6 +1626,9 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int con
        {
          if (exeList[i].second != dstList[k].second)
          {
#if defined(__NVCC__)
isXgmiDst = false;
#else
            uint32_t exeToDstLinkType, exeToDstHopCount;
            HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(exeList[i].second, false),
                                                  RemappedIndex(dstList[k].second, false),
...@@ -1589,6 +1636,7 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int con
                                                  &exeToDstHopCount));
            isXgmiDst = (exeToDstLinkType == HSA_AMD_LINK_INFO_TYPE_XGMI);
            if (isXgmiDst) numHopsDst = exeToDstHopCount;
#endif
          }
          else
          {
...@@ -1606,6 +1654,12 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int con
          // Skip this DST if total XGMI distance (SRC + DST) is greater than max limit
          if (ev.sweepXgmiMax >= 0 && (numHopsSrc + numHopsDst) > ev.sweepXgmiMax) continue;
#if defined(__NVCC__)
// Skip CPU executors on GPU memory on NVIDIA platform
if (IsCpuType(exeList[i].first) && (IsGpuType(srcList[j].first) || IsGpuType(dstList[k].first)))
continue;
#endif
          tinfo.dstType = dstList[k].first;
          tinfo.dstIndex = dstList[k].second;
...
...@@ -31,8 +31,15 @@ THE SOFTWARE.
#include <map>
#include <iostream>
#include <sstream>
#include <hip/hip_runtime.h>
#if defined(__NVCC__)
#include <cuda_runtime.h>
#define __builtin_amdgcn_s_memrealtime clock64
#else
#include <hip/hip_ext.h>
#endif
#include <hip/hip_runtime.h>
#include <hsa/hsa_ext_amd.h>
// Helper macro for catching HIP errors
...
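
The `#define __builtin_amdgcn_s_memrealtime clock64` above lets kernel-side timestamp code compile unchanged on NVIDIA, where the AMD intrinsic does not exist. A hedged illustration of such a call site (the actual call sites are not shown in this diff):

```
// Hypothetical kernel-side timestamp helper: on AMD this reads the wall-clock
// counter via the intrinsic; on NVIDIA the define above turns it into clock64().
__device__ unsigned long long ReadGpuTimestamp()
{
  return __builtin_amdgcn_s_memrealtime();
}
```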