Unverified commit c72a105f authored by gilbertlee-amd, committed by GitHub

v1.12 Adding NVIDIA platform support (#10)

parent cc0e9cb4
# Changelog for TransferBench
## v1.12
### Added
- Added support for TransferBench on NVIDIA platforms (via HIP_PLATFORM=nvidia)
- CPU executors cannot access GPU memory on the NVIDIA platform (no large-BAR access)
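
For context, the limitation in the last bullet is enforced in this commit by compile-time guards; the sketch below mirrors the check added to GetPeakBandwidth further down in this diff (variable names are taken from that function, not new API):

```
#if defined(__NVCC__)
// On the NVIDIA build, a CPU executor paired with GPU memory is simply skipped,
// since the CPU cannot map device memory without large-BAR access.
if (transfers[i].exeType == EXE_CPU && (IsGpuType(srcType) || IsGpuType(dstType)))
  return 0;
#endif
```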
## v1.11
### Added
- New multi-input / multi-output support (MIMO). Transfers now can reduce (element-wise summation) multiple input memory arrays
...
...@@ -28,7 +28,7 @@ THE SOFTWARE.
#include <time.h>
#include "Kernels.hpp"
#define TB_VERSION "1.12"
extern char const MemTypeStr[];
extern char const ExeTypeStr[];
...
...@@ -29,6 +29,19 @@ THE SOFTWARE.
#define MEMSET_CHAR 75
#define MEMSET_VAL 13323083.0f
#if defined(__NVCC__)
// Define float4 addition operator for NVIDIA platform
__device__ inline float4& operator +=(float4& a, const float4& b)
{
a.x += b.x;
a.y += b.y;
a.z += b.z;
a.w += b.w;
return a;
}
#endif
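
This shim is needed because CUDA's built-in float4 defines no arithmetic operators (the HIP headers supply them for AMD builds). A brief, hypothetical device-side illustration, not part of the commit:

```
// Hypothetical illustration: with the operator above, packed accumulation
// compiles identically under hipcc (AMD) and nvcc (NVIDIA).
__device__ void AccumulateFloat4(float4& sum, float4 const* __restrict__ src, int idx)
{
  sum += src[idx];
}
```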
// Each subExecutor is provided with subarrays to work on
#define MAX_SRCS 16
#define MAX_DSTS 16
...@@ -51,14 +64,14 @@ void CpuReduceKernel(SubExecParam const& p)
  if (numSrcs == 0)
  {
    for (int i = 0; i < numDsts; ++i)
      memset(p.dst[i], MEMSET_CHAR, p.N * sizeof(float));
  }
  else if (numSrcs == 1)
  {
    float const* __restrict__ src = p.src[0];
    for (int i = 0; i < numDsts; ++i)
    {
      memcpy(p.dst[i], src, p.N * sizeof(float));
    }
  }
  else
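
The remaining multi-source branch is collapsed in this view. Per the MIMO description in the v1.11 changelog entry (element-wise summation of multiple inputs), a hedged sketch of what that branch does, assuming the same SubExecParam layout used above:

```
// Hypothetical sketch of the elided multi-source path: reduce all sources
// element-wise, then broadcast the result to every destination.
for (size_t j = 0; j < p.N; ++j)
{
  float sum = 0.0f;
  for (int i = 0; i < numSrcs; ++i)
    sum += p.src[i][j];
  for (int i = 0; i < numDsts; ++i)
    p.dst[i][j] = sum;
}
```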
...@@ -88,7 +101,6 @@ GpuReduceKernel(SubExecParam* params)
  SubExecParam& p = params[blockIdx.x];
  int const numSrcs = p.numSrcs;
  int const numDsts = p.numDsts;
  int const numWaves = BLOCKSIZE / WARP_SIZE; // Number of wavefronts per threadblock
  int const waveId   = threadIdx.x / WARP_SIZE; // Wavefront number
  int const threadId = threadIdx.x % WARP_SIZE; // Thread index within wavefront
...@@ -177,11 +189,11 @@ GpuReduceKernel(SubExecParam* params)
    else
    {
      for (int i = 0; i < numSrcs; ++i)
        val += p.src[i][offset];
    }
    for (int i = 0; i < numDsts; ++i)
      p.dst[i][offset] = val;
  }
}
...@@ -197,7 +209,6 @@ template <typename FLOAT_TYPE, int UNROLL_FACTOR>
__device__ size_t GpuReduceFuncImpl2(SubExecParam const &p, size_t const offset, size_t const N)
{
  int constexpr numFloatsPerPack = sizeof(FLOAT_TYPE) / sizeof(float); // Number of floats handled at a time per thread
  int constexpr numWaves = BLOCKSIZE / WARP_SIZE; // Number of wavefronts per threadblock
  size_t constexpr loopPackInc = BLOCKSIZE * UNROLL_FACTOR;
  size_t constexpr numPacksPerWave = WARP_SIZE * UNROLL_FACTOR;
  int const waveId = threadIdx.x / WARP_SIZE; // Wavefront number
...
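
For concreteness, a worked example of the pack arithmetic in GpuReduceFuncImpl2 above; the concrete values are assumptions for illustration (BLOCKSIZE and WARP_SIZE are defined elsewhere in the source and may differ):

```
// Assumed: FLOAT_TYPE = float4, UNROLL_FACTOR = 4, WARP_SIZE = 64, BLOCKSIZE = 256
//   numFloatsPerPack = sizeof(float4) / sizeof(float) = 4 floats per pack
//   numPacksPerWave  = WARP_SIZE * UNROLL_FACTOR      = 256 packs per wavefront pass
//   loopPackInc      = BLOCKSIZE * UNROLL_FACTOR      = 1024 packs advanced per block iteration
```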
...@@ -3,7 +3,7 @@ ROCM_PATH ?= /opt/rocm
HIPCC=$(ROCM_PATH)/bin/hipcc
EXE=TransferBench
CXXFLAGS = -O3 -I. -I$(ROCM_PATH)/hsa/include -lnuma -L$(ROCM_PATH)/hsa/lib -lhsa-runtime64
all: $(EXE)
...
...@@ -13,6 +13,14 @@ TransferBench is a simple utility capable of benchmarking simultaneous copies be
If ROCm is installed in a folder other than `/opt/rocm/`, set ROCM_PATH appropriately
## NVIDIA platform support
TransferBench can also be built to run on NVIDIA platforms via HIP, but this requires a HIP-compatible CUDA installation (e.g. CUDA 11.5)
To build:
```
CUDA_PATH=<path_to_CUDA> HIP_PLATFORM=nvidia make
```
## Hints and suggestions
- Running TransferBench with no arguments will display usage instructions and detected topology information
...
...@@ -291,7 +291,11 @@ void ExecuteTransfers(EnvVars const& ev,
        printf(" DST %0d: %p\n", iDst, transfer.dstMem[iDst]);
    }
    printf("Hit <Enter> to continue: ");
    if (scanf("%*c") != 0)
    {
      printf("[ERROR] Unexpected input\n");
      exit(1);
    }
    printf("\n");
  }
...@@ -332,7 +336,11 @@ void ExecuteTransfers(EnvVars const& ev,
  if (verbose && ev.useInteractive)
  {
    printf("Transfers complete. Hit <Enter> to continue: ");
    if (scanf("%*c") != 0)
    {
      printf("[ERROR] Unexpected input\n");
      exit(1);
    }
    printf("\n");
  }
...@@ -590,6 +598,7 @@ int RemappedIndex(int const origIdx, bool const isCpuType)
void DisplayTopology(bool const outputToCsv)
{
  int numCpuDevices = numa_num_configured_nodes();
  int numGpuDevices;
  HIP_CALL(hipGetDeviceCount(&numGpuDevices));
...@@ -648,6 +657,7 @@ void DisplayTopology(bool const outputToCsv)
    else
      printf(" %5d | ", numCpus);
#if !defined(__NVCC__)
    bool isFirst = true;
    for (int j = 0; j < numGpuDevices; j++)
    {
...@@ -658,10 +668,16 @@ void DisplayTopology(bool const outputToCsv)
        printf("%d", j);
      }
    }
#endif
    printf("\n");
  }
printf("\n"); printf("\n");
#if defined(__NVCC__)
// No further topology detection done for NVIDIA platforms
return;
#endif
// Print out detected GPU topology // Print out detected GPU topology
if (outputToCsv) if (outputToCsv)
{ {
...@@ -691,8 +707,8 @@ void DisplayTopology(bool const outputToCsv)
    printf("--------------+------+-------------\n");
  }
#if !defined(__NVCC__)
  char pciBusId[20];
  for (int i = 0; i < numGpuDevices; i++)
  {
    int const deviceIdx = RemappedIndex(i, false);
...@@ -732,6 +748,7 @@ void DisplayTopology(bool const outputToCsv)
    else
      printf(" %11s | %4d | %d\n", pciBusId, numDeviceCUs, GetClosestNumaNode(deviceIdx));
  }
#endif
}
void ParseMemType(std::string const& token, int const numCpus, int const numGpus,
...@@ -930,11 +947,20 @@ void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPt
  // Allocate host-pinned memory (should respect NUMA mem policy)
  if (memType == MEM_CPU_FINE)
  {
#if defined (__NVCC__)
    printf("[ERROR] Fine-grained CPU memory not supported on NVIDIA platform\n");
    exit(1);
#else
    HIP_CALL(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser));
#endif
  }
  else if (memType == MEM_CPU)
  {
#if defined (__NVCC__)
    if (hipHostMalloc((void **)memPtr, numBytes, 0) != hipSuccess)
#else
    if (hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser | hipHostMallocNonCoherent) != hipSuccess)
#endif
    {
      printf("[ERROR] Unable to allocate non-coherent host memory on NUMA node %d\n", devIndex);
      exit(1);
...@@ -960,8 +986,13 @@ void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPt
  }
  else if (memType == MEM_GPU_FINE)
  {
#if defined (__NVCC__)
    printf("[ERROR] Fine-grained GPU memory not supported on NVIDIA platform\n");
    exit(1);
#else
    HIP_CALL(hipSetDevice(devIndex));
    HIP_CALL(hipExtMallocWithFlags((void**)memPtr, numBytes, hipDeviceMallocFinegrained));
#endif
  }
  else
  {
...@@ -1044,13 +1075,18 @@ void RunTransfer(EnvVars const& ev, int const iteration,
    // In single stream mode, all the threadblocks for this GPU are launched
    // Otherwise, just launch the threadblocks associated with this single Transfer
    int const numBlocksToRun = ev.useSingleStream ? exeInfo.totalSubExecs : transfer->numSubExecs;
#if defined(__NVCC__)
    HIP_CALL(hipEventRecord(startEvent, stream));
    GpuKernelTable[ev.gpuKernel]<<<numBlocksToRun, BLOCKSIZE, ev.sharedMemBytes, stream>>>(transfer->subExecParamGpuPtr);
    HIP_CALL(hipEventRecord(stopEvent, stream));
#else
    hipExtLaunchKernelGGL(GpuKernelTable[ev.gpuKernel],
                          dim3(numBlocksToRun, 1, 1),
                          dim3(BLOCKSIZE, 1, 1),
                          ev.sharedMemBytes, stream,
                          startEvent, stopEvent,
                          0, transfer->subExecParamGpuPtr);
#endif
    // Synchronize per iteration, unless in single sync mode, in which case
    // synchronize during last warmup / last actual iteration
    HIP_CALL(hipStreamSynchronize(stream));
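
On the NVIDIA path the launch is bracketed by explicit hipEventRecord calls because hipExtLaunchKernelGGL (which accepts the start/stop events directly) is an AMD-only extension. A minimal sketch of how such events are typically read back after synchronization; the exact read-back site is not shown in this hunk:

```
// Hypothetical read-back of the events recorded around the kernel launch.
float gpuDeltaMsec = 0.0f;
HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
// gpuDeltaMsec now holds the kernel execution time in milliseconds.
```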
...@@ -1228,8 +1264,6 @@ double GetPeakBandwidth(EnvVars const& ev, size_t const N,
  // Skip bidirectional on same device
  if (isBidirectional && srcType == dstType && srcIndex == dstIndex) return 0.0f;
int const initOffset = ev.byteOffset / sizeof(float);
  // Prepare Transfers
  std::vector<Transfer> transfers(2);
  transfers[0].numBytes = transfers[1].numBytes = N * sizeof(float);
...@@ -1265,6 +1299,12 @@ double GetPeakBandwidth(EnvVars const& ev, size_t const N,
  {
    if (transfers[i].exeType == EXE_CPU && ev.numCpusPerNuma[transfers[i].exeIndex] == 0)
      return 0;
#if defined(__NVCC__)
// NVIDIA platform cannot access GPU memory directly from CPU executors
if (transfers[i].exeType == EXE_CPU && (IsGpuType(srcType) || IsGpuType(dstType)))
return 0;
#endif
  }
  ExecuteTransfers(ev, 0, N, transfers, false);
...@@ -1549,6 +1589,9 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int con
      {
        if (exeList[i].second != srcList[j].second)
        {
#if defined(__NVCC__)
isXgmiSrc = false;
#else
          uint32_t exeToSrcLinkType, exeToSrcHopCount;
          HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(exeList[i].second, false),
                                                RemappedIndex(srcList[j].second, false),
...@@ -1556,6 +1599,7 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int con
                                                &exeToSrcHopCount));
          isXgmiSrc = (exeToSrcLinkType == HSA_AMD_LINK_INFO_TYPE_XGMI);
          if (isXgmiSrc) numHopsSrc = exeToSrcHopCount;
#endif
        }
        else
        {
...@@ -1582,6 +1626,9 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int con
        {
          if (exeList[i].second != dstList[k].second)
          {
#if defined(__NVCC__)
isXgmiDst = false;
#else
            uint32_t exeToDstLinkType, exeToDstHopCount;
            HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(exeList[i].second, false),
                                                  RemappedIndex(dstList[k].second, false),
...@@ -1589,6 +1636,7 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int con
                                                  &exeToDstHopCount));
            isXgmiDst = (exeToDstLinkType == HSA_AMD_LINK_INFO_TYPE_XGMI);
            if (isXgmiDst) numHopsDst = exeToDstHopCount;
#endif
          }
          else
          {
...@@ -1606,6 +1654,12 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int con
          // Skip this DST if total XGMI distance (SRC + DST) is greater than max limit
          if (ev.sweepXgmiMax >= 0 && (numHopsSrc + numHopsDst) > ev.sweepXgmiMax) continue;
#if defined(__NVCC__)
// Skip CPU executors on GPU memory on NVIDIA platform
if (IsCpuType(exeList[i].first) && (IsGpuType(srcList[j].first) || IsGpuType(dstList[k].first)))
continue;
#endif
          tinfo.dstType = dstList[k].first;
          tinfo.dstIndex = dstList[k].second;
...
...@@ -31,8 +31,15 @@ THE SOFTWARE.
#include <map>
#include <iostream>
#include <sstream>
#include <hip/hip_runtime.h>
#if defined(__NVCC__)
#include <cuda_runtime.h>
#define __builtin_amdgcn_s_memrealtime clock64
#else
#include <hip/hip_ext.h>
#endif
#include <hip/hip_runtime.h>
#include <hsa/hsa_ext_amd.h>
// Helper macro for catching HIP errors
...
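
The `#define __builtin_amdgcn_s_memrealtime clock64` above lets kernel-side timestamp code compile unchanged on NVIDIA, where the AMD intrinsic does not exist. A hedged illustration of such a call site (the actual call sites are not shown in this diff):

```
// Hypothetical kernel-side timestamp helper: on AMD this reads the wall-clock
// counter via the intrinsic; on NVIDIA the define above turns it into clock64().
__device__ unsigned long long ReadGpuTimestamp()
{
  return __builtin_amdgcn_s_memrealtime();
}
```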