Unverified commit 47480a59, authored by gilbertlee-amd and committed by GitHub

v1.47 Fixing CUDA compilation (#83)

parent e9f51f2b
@@ -3,6 +3,11 @@
 Documentation for TransferBench is available at
 [https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
 
+## v1.47
+### Fixes
+* Fixing CUDA support
+
 ## v1.46
 ### Fixes
...
@@ -13,7 +13,7 @@ else
 endif
 
 CXXFLAGS = -O3 -Iinclude -I$(ROCM_PATH)/include -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64
-NVFLAGS = -O3 -g -Iinclude -x cu -lnuma -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_75,code=sm_75
+NVFLAGS = -O3 -Iinclude -x cu -lnuma -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_75,code=sm_75
 LDFLAGS += -lpthread
 
 all: $(EXE)
...
@@ -1459,16 +1459,17 @@ void RunTransfer(EnvVars const& ev, int const iteration,
   // Otherwise, just launch the threadblocks associated with this single Transfer
   int const numBlocksToRun = ev.useSingleStream ? exeInfo.totalSubExecs : transfer->numSubExecs;
   int const numXCCs = (ev.useXccFilter ? ev.xccIdsPerDevice[exeIndex].size() : 1);
+  dim3 const gridSize(numXCCs, numBlocksToRun, 1);
+  dim3 const blockSize(ev.gfxBlockSize, 1, 1);
 
 #if defined(__NVCC__)
   HIP_CALL(hipEventRecord(startEvent, stream));
-  GpuKernelTable[ev.gfxBlockSize/warpSize - 1][ev.gfxUnroll - 1]
-    <<<numBlocksToRun, ev.gfxBlockSize, ev.sharedMemBytes, stream>>>(transfer->subExecParamGpuPtr, ev.waveOrder);
+  GpuKernelTable[ev.gfxBlockSize/64 - 1][ev.gfxUnroll - 1]
+    <<<gridSize, blockSize, ev.sharedMemBytes, stream>>>(transfer->subExecParamGpuPtr, ev.gfxWaveOrder);
   HIP_CALL(hipEventRecord(stopEvent, stream));
 #else
-  hipExtLaunchKernelGGL(GpuKernelTable[ev.gfxBlockSize/warpSize - 1][ev.gfxUnroll - 1],
-                        dim3(numXCCs, numBlocksToRun, 1),
-                        dim3(ev.gfxBlockSize, 1, 1),
+  hipExtLaunchKernelGGL(GpuKernelTable[ev.gfxBlockSize/64 - 1][ev.gfxUnroll - 1],
+                        gridSize, blockSize,
                         ev.sharedMemBytes, stream,
                         startEvent, stopEvent,
                         0, transfer->subExecParamGpuPtr, ev.gfxWaveOrder);
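
The CUDA path above now launches with the same 2D grid as the HIP path: blockIdx.y selects the sub-executor and blockIdx.x the XCC slot (a single value on NVIDIA), with hipEventRecord calls bracketing the triple-chevron launch, whereas hipExtLaunchKernelGGL accepts the start/stop events as arguments. Below is a minimal, self-contained sketch of that CUDA-side pattern; the kernel and variable names are illustrative, not TransferBench's.

```cuda
// Sketch only: a 2D grid timed with events on a stream, mirroring the
// NVCC branch above. DummyKernel and its arguments are made up.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void DummyKernel(int* out)
{
  // blockIdx.y plays the role of the sub-executor index; blockIdx.x would be
  // the XCC slot on AMD hardware and is fixed to a single value here.
  if (threadIdx.x == 0) out[blockIdx.y] = (int)blockIdx.y;
}

int main()
{
  int const numBlocksToRun = 4;
  dim3 const gridSize(1, numBlocksToRun, 1);  // (numXCCs, numBlocksToRun, 1)
  dim3 const blockSize(256, 1, 1);            // (gfxBlockSize, 1, 1)

  int* out;
  cudaMalloc(&out, numBlocksToRun * sizeof(int));

  cudaStream_t stream;
  cudaEvent_t startEvent, stopEvent;
  cudaStreamCreate(&stream);
  cudaEventCreate(&startEvent);
  cudaEventCreate(&stopEvent);

  // Same shape as the NVCC branch: record, launch with <<<grid, block>>>, record.
  cudaEventRecord(startEvent, stream);
  DummyKernel<<<gridSize, blockSize, 0, stream>>>(out);
  cudaEventRecord(stopEvent, stream);
  cudaEventSynchronize(stopEvent);

  float ms = 0.0f;
  cudaEventElapsedTime(&ms, startEvent, stopEvent);
  printf("Kernel time: %.3f ms\n", ms);

  cudaEventDestroy(startEvent);
  cudaEventDestroy(stopEvent);
  cudaStreamDestroy(stream);
  cudaFree(out);
  return 0;
}
```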
@@ -1994,9 +1995,9 @@ void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, i
       if (ev.a2aDirect)
       {
-#if !defined(__NVCC__)
         if (i == j) continue;
+#if !defined(__NVCC__)
         uint32_t linkType, hopCount;
         HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(i, false),
                                               RemappedIndex(j, false),
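
hipExtGetLinkTypeAndHopCount is a ROCm-specific extension with no CUDA counterpart, which is why the topology query stays behind #if !defined(__NVCC__) while the i == j skip now runs in both builds. Purely as an illustration (not what TransferBench does under CUDA), a rough "direct link" signal is available from the peer-access API:

```cuda
// Illustration only -- not TransferBench code. Under CUDA there is no
// hipExtGetLinkTypeAndHopCount, but cudaDeviceCanAccessPeer gives a coarse
// "direct peer link" signal between two devices.
#include <cstdio>
#include <cuda_runtime.h>

int main()
{
  int numGpus = 0;
  cudaGetDeviceCount(&numGpus);

  for (int i = 0; i < numGpus; i++)
    for (int j = 0; j < numGpus; j++)
    {
      if (i == j) continue;
      int canAccess = 0;
      cudaDeviceCanAccessPeer(&canAccess, i, j);
      printf("GPU %d -> GPU %d: peer access %s\n",
             i, j, canAccess ? "available" : "unavailable");
    }
  return 0;
}
```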
@@ -2460,7 +2461,6 @@ void RunSchmooBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int
 void RunRemoteWriteBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus)
 {
-  char memType = ev.useFineGrain ? 'F' : 'G';
   printf("Bytes to write: %lu from GPU %d using %d CUs [Sweeping %d to %d parallel writes]\n", numBytesPerTransfer, srcIdx, numSubExecs, minGpus, maxGpus);
   char sep = (ev.outputToCsv ? ',' : ' ');
...
@@ -29,7 +29,7 @@ THE SOFTWARE.
 #include "Compatibility.hpp"
 #include "Kernels.hpp"
 
-#define TB_VERSION "1.46"
+#define TB_VERSION "1.47"
 
 extern char const MemTypeStr[];
 extern char const ExeTypeStr[];
@@ -530,8 +530,7 @@ public:
     for (int i = 0; i < numDetectedGpus; i++)
     {
 #if defined(__NVCC__)
-      // NOTE: wallClock doesn't exist in CUDA. This may need to be adjusted / run with fixed clocks
-      wallClockPerDeviceMhz[i] = 1410000;
+      wallClockPerDeviceMhz[i] = 1000000;
 #else
       hipDeviceProp_t prop;
       HIP_CALL(hipGetDeviceProperties(&prop, i));
...
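
Under CUDA the new GetTimestamp() helper (added in Kernels.hpp below) reads %globaltimer, a nanosecond-resolution counter, so the hard-coded 1000000 reads naturally as a clock rate in kHz (1 GHz); the previous 1410000 corresponds to a fixed 1.41 GHz clock in the same units. Assuming kHz, converting timestamp deltas to milliseconds is a single division, sketched here with made-up names:

```cuda
// Sketch with assumed units: treat the stored rate as kHz, so the number of
// ticks per millisecond equals the rate itself. TicksToMs is a made-up helper.
#include <cstdint>
#include <cstdio>

static double TicksToMs(int64_t startTick, int64_t stopTick, int clockRateKhz)
{
  // kHz == ticks per millisecond, so the conversion is a single division.
  return (stopTick - startTick) / static_cast<double>(clockRateKhz);
}

int main()
{
  int const cudaGlobalTimerKhz = 1000000;   // 1 GHz -> 1 tick per nanosecond
  int64_t const start = 0;
  int64_t const stop  = 2500000;            // 2.5e6 ticks later
  printf("Elapsed: %.3f ms\n", TicksToMs(start, stop, cudaGlobalTimerKhz)); // 2.500 ms
  return 0;
}
```

On the HIP side the rate presumably comes from the device properties queried in the #else branch rather than from a constant.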
@@ -29,6 +29,10 @@ THE SOFTWARE.
 #define MEMSET_VAL 13323083.0f
 
+#if defined(__NVCC__)
+#define warpSize 32
+#endif
+
 #define MAX_WAVEGROUPS MAX_BLOCKSIZE / warpSize
 #define MAX_UNROLL 8
 #define NUM_WAVEORDERS 6
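
The NVCC-only #define above works around the fact that CUDA's warpSize is a built-in device-side variable, not an integral constant, and is not defined in host code at all; pinning it to 32 (the warp width on current NVIDIA GPUs) lets expressions such as MAX_WAVEGROUPS remain compile-time constants. A small sketch of why that matters, using made-up names:

```cuda
// Why a pinned warp size helps under nvcc: compile-time expressions (array
// bounds, #define'd limits) cannot use the built-in warpSize variable, but
// they can use a fixed constant. All names here are illustrative.
#include <cuda_runtime.h>

#define SKETCH_BLOCKSIZE 512
#define SKETCH_WARPSIZE  32   // stand-in for the pinned warpSize

__global__ void PerWaveKernel(int* out)
{
  // SKETCH_BLOCKSIZE / SKETCH_WARPSIZE must be an integral constant to be a
  // valid shared-memory array bound; the built-in warpSize variable is not.
  __shared__ int perWave[SKETCH_BLOCKSIZE / SKETCH_WARPSIZE];

  int const wave = threadIdx.x / SKETCH_WARPSIZE;
  if (threadIdx.x % SKETCH_WARPSIZE == 0) perWave[wave] = wave;
  __syncthreads();
  if (threadIdx.x == 0) out[0] = perWave[SKETCH_BLOCKSIZE / SKETCH_WARPSIZE - 1];
}

int main()
{
  int* out;
  cudaMalloc(&out, sizeof(int));
  PerWaveKernel<<<1, SKETCH_BLOCKSIZE>>>(out);
  cudaDeviceSynchronize();
  cudaFree(out);
  return 0;
}
```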
@@ -44,7 +48,7 @@ struct SubExecParam
   int numDsts;              // Number of destination arrays
   float* src[MAX_SRCS];     // Source array pointers
   float* dst[MAX_DSTS];     // Destination array pointers
-  uint32_t preferredXccId;  // XCC ID to execute on
+  int32_t preferredXccId;   // XCC ID to execute on
 
   // Prepared
   int teamSize;             // Index of this sub executor amongst team
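
preferredXccId uses -1 as a "no preference" sentinel (see the filter in GpuReduceKernel below), so a signed field expresses that sentinel directly instead of relying on unsigned wrap-around. A tiny illustration with an assumed, simplified struct:

```cuda
#include <cstdint>
#include <cstdio>

// Assumed, simplified stand-in for the real SubExecParam.
struct SketchParam
{
  int32_t preferredXccId;   // -1 == run on any XCC
};

int main()
{
  SketchParam p{-1};
  // With int32_t the sentinel test reads exactly as intended; with uint32_t
  // the same test only holds because -1 wraps to 0xFFFFFFFF.
  printf("XCC filter %s\n", (p.preferredXccId != -1) ? "enabled" : "disabled");
  return 0;
}
```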
@@ -133,6 +137,17 @@ PrepSrcDataKernel(float* ptr, size_t N, int srcBufferIdx)
   }
 }
 
+__device__ int64_t GetTimestamp()
+{
+#if defined(__NVCC__)
+  int64_t result;
+  asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(result));
+  return result;
+#else
+  return wall_clock64();
+#endif
+}
+
 // Helper function for memset
 template <typename T> __device__ __forceinline__ T MemsetVal();
 template <> __device__ __forceinline__ float MemsetVal(){ return MEMSET_VAL; };
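
A self-contained sketch of the CUDA branch of GetTimestamp() in use: a block stamps its own start and stop times from %globaltimer, which PTX defines as a 64-bit nanosecond timer, and the host converts the delta to milliseconds. Kernel and variable names below are illustrative:

```cuda
// Sketch only: device-side timestamps read via the same inline PTX as the
// GetTimestamp() helper above, reported by the host after synchronization.
#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

__device__ int64_t ReadGlobalTimer()
{
  int64_t t;
  asm volatile("mov.u64 %0, %%globaltimer;" : "=l"(t));
  return t;
}

__global__ void TimedCopyKernel(float const* src, float* dst, size_t n,
                                int64_t* startStop)
{
  if (threadIdx.x == 0) startStop[0] = ReadGlobalTimer();
  __syncthreads();

  // Simple strided copy performed by a single block.
  for (size_t i = threadIdx.x; i < n; i += blockDim.x)
    dst[i] = src[i];

  __syncthreads();
  if (threadIdx.x == 0) startStop[1] = ReadGlobalTimer();
}

int main()
{
  size_t const n = 1 << 20;
  float *src, *dst;
  int64_t* startStop;
  cudaMalloc(&src, n * sizeof(float));
  cudaMalloc(&dst, n * sizeof(float));
  cudaMallocManaged(&startStop, 2 * sizeof(int64_t));

  TimedCopyKernel<<<1, 256>>>(src, dst, n, startStop);
  cudaDeviceSynchronize();

  // %globaltimer ticks are nanoseconds, i.e. a 1000000 kHz clock.
  printf("Block 0 copy time: %.3f ms\n", (startStop[1] - startStop[0]) / 1.0e6);

  cudaFree(src);
  cudaFree(dst);
  cudaFree(startStop);
  return 0;
}
```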
@@ -143,14 +158,16 @@ __global__ void __launch_bounds__(BLOCKSIZE)
 GpuReduceKernel(SubExecParam* params, int waveOrder)
 {
   int64_t startCycle;
-  if (threadIdx.x == 0) startCycle = wall_clock64();
+  if (threadIdx.x == 0) startCycle = GetTimestamp();
 
   SubExecParam& p = params[blockIdx.y];
 
   // (Experimental) Filter by XCC if desired
+#if !defined(__NVCC__)
   int32_t xccId;
   GetXccId(xccId);
   if (p.preferredXccId != -1 && xccId != p.preferredXccId) return;
+#endif
 
   // Collect data information
   int32_t const numSrcs = p.numSrcs;
@@ -168,7 +185,6 @@ __global__ void __launch_bounds__(BLOCKSIZE)
   int32_t const tIdx = threadIdx.x % warpSize; // Thread index within wavefront
 
   size_t const numFloat4 = p.N / 4;
-  int32_t const nFlt4PerWave = warpSize * 4;
 
   int32_t teamStride, waveStride, unrlStride, teamStride2, waveStride2;
   switch (waveOrder)
@@ -266,7 +282,7 @@ __global__ void __launch_bounds__(BLOCKSIZE)
   if (threadIdx.x == 0)
   {
     __threadfence_system();
-    p.stopCycle = wall_clock64();
+    p.stopCycle = GetTimestamp();
     p.startCycle = startCycle;
     p.xccId = xccId;
     __trace_hwreg();
...