Unverified commit e7cfab75 authored by gilbertlee-amd, committed by GitHub

Adding BLOCK_SIZE and BLOCK_ORDER (#57)

parent 0b29707e
# Changelog for TransferBench
## v1.30
### Added
- BLOCK_SIZE added to control threadblock size (must be a multiple of 64, up to 512)
- BLOCK_ORDER added to control how work is ordered for GFX executors running with USE_SINGLE_STREAM=1
  - 0 - Threadblocks for Transfers are ordered sequentially (default)
  - 1 - Threadblocks for Transfers are interleaved
  - 2 - Threadblocks for Transfers are ordered randomly
## v1.29
### Added
- a2a preset config now responds to USE_REMOTE_READ
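To make the new BLOCK_ORDER values concrete, here is a minimal standalone C++ sketch (an illustration only, not part of the patch) that lays out the threadblocks of two hypothetical Transfers, A with 3 threadblocks and B with 2, in the three orders this commit introduces:

```cpp
// Toy illustration of the three BLOCK_ORDER layouts; Transfers A/B are hypothetical.
#include <algorithm>
#include <cstdio>
#include <random>
#include <utility>
#include <vector>

int main()
{
  std::vector<int> numSubExecs = {3, 2};          // A has 3 threadblocks, B has 2
  int const total = 3 + 2;
  std::vector<std::pair<int,int>> order;          // (transfer, threadblock) pairs

  // BLOCK_ORDER=0 (sequential): all of A's blocks, then all of B's -> A0 A1 A2 B0 B1
  for (int t = 0; t < (int)numSubExecs.size(); ++t)
    for (int b = 0; b < numSubExecs[t]; ++b)
      order.push_back({t, b});
  for (auto [t, b] : order) printf("%c%d ", 'A' + t, b);
  printf("\n");

  // BLOCK_ORDER=1 (interleaved): round-robin across Transfers -> A0 B0 A1 B1 A2
  order.clear();
  for (int b = 0; (int)order.size() < total; ++b)
    for (int t = 0; t < (int)numSubExecs.size(); ++t)
      if (b < numSubExecs[t]) order.push_back({t, b});
  for (auto [t, b] : order) printf("%c%d ", 'A' + t, b);
  printf("\n");

  // BLOCK_ORDER=2 (random): shuffle the sequential layout
  std::shuffle(order.begin(), order.end(), std::mt19937(42));
  for (auto [t, b] : order) printf("%c%d ", 'A' + t, b);
  printf("\n");
}
```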
...
@@ -336,8 +336,6 @@ void ExecuteTransfers(EnvVars const& ev,
   int const exeIndex = RemappedIndex(executor.second, IsCpuType(exeType));
   exeInfo.totalBytes = 0;
-  int transferOffset = 0;
   for (int i = 0; i < exeInfo.transfers.size(); ++i)
   {
     // Prepare subarrays each threadblock works on and fill src memory with patterned data
@@ -345,20 +343,75 @@ void ExecuteTransfers(EnvVars const& ev,
     transfer->PrepareSubExecParams(ev);
     isSrcCorrect &= transfer->PrepareSrc(ev);
     exeInfo.totalBytes += transfer->numBytesActual;
+  }
 
-    // Copy block parameters to GPU for GPU executors
-    if (transfer->exeType == EXE_GPU_GFX)
-    {
-      exeInfo.transfers[i]->subExecParamGpuPtr = exeInfo.subExecParamGpu + transferOffset;
-      HIP_CALL(hipSetDevice(exeIndex));
-      HIP_CALL(hipMemcpy(&exeInfo.subExecParamGpu[transferOffset],
-                         transfer->subExecParam.data(),
-                         transfer->subExecParam.size() * sizeof(SubExecParam),
-                         hipMemcpyHostToDevice));
-      HIP_CALL(hipDeviceSynchronize());
-      transferOffset += transfer->subExecParam.size();
+  // Copy block parameters to GPU for GPU executors
+  if (exeType == EXE_GPU_GFX)
+  {
+    std::vector<SubExecParam> tempSubExecParam;
+
+    if (!ev.useSingleStream || (ev.blockOrder == ORDER_SEQUENTIAL))
+    {
+      // Assign Transfers to sequential threadblocks
+      int transferOffset = 0;
+      for (Transfer* transfer : exeInfo.transfers)
+      {
+        transfer->subExecParamGpuPtr = exeInfo.subExecParamGpu + transferOffset;
+        transfer->subExecIdx.clear();
+        for (int subExecIdx = 0; subExecIdx < transfer->subExecParam.size(); subExecIdx++)
+        {
+          transfer->subExecIdx.push_back(transferOffset + subExecIdx);
+          tempSubExecParam.push_back(transfer->subExecParam[subExecIdx]);
+        }
+        transferOffset += transfer->numSubExecs;
+      }
+    }
+    else if (ev.blockOrder == ORDER_INTERLEAVED)
+    {
+      // Interleave threadblocks of different Transfers
+      exeInfo.transfers[0]->subExecParamGpuPtr = exeInfo.subExecParamGpu;
+      for (int subExecIdx = 0; tempSubExecParam.size() < exeInfo.totalSubExecs; ++subExecIdx)
+      {
+        for (Transfer* transfer : exeInfo.transfers)
+        {
+          if (subExecIdx < transfer->numSubExecs)
+          {
+            transfer->subExecIdx.push_back(tempSubExecParam.size());
+            tempSubExecParam.push_back(transfer->subExecParam[subExecIdx]);
+          }
+        }
+      }
+    }
+    else if (ev.blockOrder == ORDER_RANDOM)
+    {
+      std::vector<std::pair<int,int>> indices;
+      exeInfo.transfers[0]->subExecParamGpuPtr = exeInfo.subExecParamGpu;
+
+      // Build up a list of (transfer, subExecParam) indices, then randomly shuffle them
+      for (int i = 0; i < exeInfo.transfers.size(); i++)
+      {
+        Transfer* transfer = exeInfo.transfers[i];
+        for (int subExecIdx = 0; subExecIdx < transfer->numSubExecs; subExecIdx++)
+          indices.push_back(std::make_pair(i, subExecIdx));
+      }
+      std::shuffle(indices.begin(), indices.end(), *ev.generator);
+
+      // Build randomized threadblock list
+      for (auto p : indices)
+      {
+        Transfer* transfer = exeInfo.transfers[p.first];
+        transfer->subExecIdx.push_back(tempSubExecParam.size());
+        tempSubExecParam.push_back(transfer->subExecParam[p.second]);
+      }
+    }
+
+    HIP_CALL(hipSetDevice(exeIndex));
+    HIP_CALL(hipMemcpy(exeInfo.subExecParamGpu,
+                       tempSubExecParam.data(),
+                       tempSubExecParam.size() * sizeof(SubExecParam),
+                       hipMemcpyDefault));
+    HIP_CALL(hipDeviceSynchronize());
   }
 }
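Compared with the old per-Transfer hipMemcpy this hunk removes, the rewritten path first stages every SubExecParam in a host-side vector in the chosen order and then uploads the whole array with a single copy. A minimal sketch of that stage-then-upload pattern, assuming a HIP toolchain (the Param struct and counts here are hypothetical stand-ins):

```cpp
// Stage-then-upload: gather all parameter blocks on the host, copy to the GPU once.
#include <hip/hip_runtime.h>
#include <vector>

struct Param { int data[16]; }; // hypothetical stand-in for SubExecParam

int main()
{
  std::vector<Param> staged;
  for (int i = 0; i < 8; ++i)   // collect params from all "transfers" in launch order
    staged.push_back(Param{});

  Param* gpuParams = nullptr;
  hipMalloc((void**)&gpuParams, staged.size() * sizeof(Param));
  hipMemcpy(gpuParams, staged.data(), staged.size() * sizeof(Param),
            hipMemcpyHostToDevice); // one transfer instead of one per entry
  hipDeviceSynchronize();
  hipFree(gpuParams);
}
```

Staging also keeps each Transfer's subExecIdx list consistent with wherever its threadblocks landed in the shared array, which the timing code below relies on.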
@@ -602,7 +655,7 @@ void ExecuteTransfers(EnvVars const& ev,
       for (auto x : transfer->perIterationCUs[t.second - 1])
         printf(" %2d", x);
     }
     printf("\n");
   }
   printf(" StandardDev | %7.3f GB/s | %8.3f ms |\n", stdDevBw, stdDevTime);
 }
@@ -1281,12 +1334,12 @@ void RunTransfer(EnvVars const& ev, int const iteration,
   int const numBlocksToRun = ev.useSingleStream ? exeInfo.totalSubExecs : transfer->numSubExecs;
 #if defined(__NVCC__)
   HIP_CALL(hipEventRecord(startEvent, stream));
-  GpuKernelTable[ev.gpuKernel]<<<numBlocksToRun, BLOCKSIZE, ev.sharedMemBytes, stream>>>(transfer->subExecParamGpuPtr);
+  GpuKernelTable[ev.gpuKernel]<<<numBlocksToRun, ev.blockSize, ev.sharedMemBytes, stream>>>(transfer->subExecParamGpuPtr);
   HIP_CALL(hipEventRecord(stopEvent, stream));
 #else
   hipExtLaunchKernelGGL(GpuKernelTable[ev.gpuKernel],
                         dim3(numBlocksToRun, 1, 1),
-                        dim3(BLOCKSIZE, 1, 1),
+                        dim3(ev.blockSize, 1, 1),
                         ev.sharedMemBytes, stream,
                         startEvent, stopEvent,
                         0, transfer->subExecParamGpuPtr);
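Because BLOCK_SIZE is now chosen at runtime, the launches above pass ev.blockSize instead of the old compile-time BLOCKSIZE constant, while the kernels (shown later in this diff) are compiled with __launch_bounds__(MAX_BLOCKSIZE) as an upper bound and index with blockDim.x. A minimal HIP sketch of that pattern (the kernel and names here are hypothetical, not TransferBench's):

```cpp
#include <hip/hip_runtime.h>

#define MAX_BLOCKSIZE 512 // compile-time upper bound used only as an occupancy hint

// Grid-stride fill kernel: valid for any launch-time block size <= MAX_BLOCKSIZE
__global__ void __launch_bounds__(MAX_BLOCKSIZE)
FillKernel(float* dst, size_t N)
{
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
       i += (size_t)gridDim.x * blockDim.x) // blockDim.x replaces the old BLOCKSIZE macro
    dst[i] = 1.0f;
}

int main()
{
  int const blockSize = 256; // runtime-chosen, e.g. read from the BLOCK_SIZE env var
  size_t const N = 1 << 20;
  float* dst = nullptr;
  hipMalloc((void**)&dst, N * sizeof(float));
  hipLaunchKernelGGL(FillKernel, dim3(64), dim3(blockSize), 0, 0, dst, N);
  hipDeviceSynchronize();
  hipFree(dst);
}
```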
@@ -1306,12 +1359,16 @@ void RunTransfer(EnvVars const& ev, int const iteration,
   // Figure out individual timings for Transfers that were all launched together
   for (Transfer* currTransfer : exeInfo.transfers)
   {
-    long long minStartCycle = currTransfer->subExecParamGpuPtr[0].startCycle;
-    long long maxStopCycle  = currTransfer->subExecParamGpuPtr[0].stopCycle;
-    for (int i = 1; i < currTransfer->numSubExecs; i++)
+    long long minStartCycle = std::numeric_limits<long long>::max();
+    long long maxStopCycle  = std::numeric_limits<long long>::min();
+
+    std::set<int> CUs;
+    for (auto subExecIdx : currTransfer->subExecIdx)
     {
-      minStartCycle = std::min(minStartCycle, currTransfer->subExecParamGpuPtr[i].startCycle);
-      maxStopCycle  = std::max(maxStopCycle,  currTransfer->subExecParamGpuPtr[i].stopCycle);
+      minStartCycle = std::min(minStartCycle, exeInfo.subExecParamGpu[subExecIdx].startCycle);
+      maxStopCycle  = std::max(maxStopCycle,  exeInfo.subExecParamGpu[subExecIdx].stopCycle);
+      if (ev.showIterations)
+        CUs.insert(GetId(exeInfo.subExecParamGpu[subExecIdx].hwId));
     }
     int const wallClockRate = ev.wallClockPerDeviceMhz[exeIndex];
     double iterationTimeMs = (maxStopCycle - minStartCycle) / (double)(wallClockRate);
@@ -1319,9 +1376,6 @@ void RunTransfer(EnvVars const& ev, int const iteration,
     if (ev.showIterations)
     {
       currTransfer->perIterationTime.push_back(iterationTimeMs);
-      std::set<int> CUs;
-      for (int i = 0; i < currTransfer->numSubExecs; i++)
-        CUs.insert(GetId(currTransfer->subExecParamGpuPtr[i].hwId));
       currTransfer->perIterationCUs.push_back(CUs);
     }
   }
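Since a Transfer's threadblocks may now land anywhere in the shared parameter array, the timing pass above walks the Transfer's own subExecIdx list and takes the earliest start and latest stop cycle across those slots. A small host-side sketch of that reduction, using toy cycle values and a hypothetical wall-clock rate:

```cpp
// Per-transfer elapsed time = latest stop - earliest start across its threadblocks.
#include <algorithm>
#include <cstdio>
#include <limits>
#include <vector>

struct Stamp { long long startCycle, stopCycle; };

int main()
{
  // Toy per-threadblock timestamps for one Transfer
  std::vector<Stamp> stamps = {{100, 900}, {120, 950}, {90, 880}};

  long long minStart = std::numeric_limits<long long>::max();
  long long maxStop  = std::numeric_limits<long long>::min();
  for (auto const& s : stamps)
  {
    minStart = std::min(minStart, s.startCycle);
    maxStop  = std::max(maxStop,  s.stopCycle);
  }

  // Elapsed span covers the slowest threadblock (cycles 90 .. 950)
  double const cyclesPerMs = 100e3; // hypothetical 100 MHz wall clock
  printf("elapsed: %lld cycles = %.5f ms\n",
         maxStop - minStart, (maxStop - minStart) / cyclesPerMs);
}
```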
@@ -1990,7 +2044,7 @@ bool Transfer::PrepareSrc(EnvVars const& ev)
   int const deviceIdx = RemappedIndex(this->srcIndex[srcIdx], false);
   HIP_CALL(hipSetDevice(deviceIdx));
   if (ev.usePrepSrcKernel)
-    PrepSrcDataKernel<<<32, BLOCKSIZE>>>(srcPtr, N, srcIdx);
+    PrepSrcDataKernel<<<32, ev.blockSize>>>(srcPtr, N, srcIdx);
   else
     HIP_CALL(hipMemcpy(srcPtr, reference.data(), this->numBytesActual, hipMemcpyDefault));
   HIP_CALL(hipDeviceSynchronize());
...
@@ -29,7 +29,7 @@ THE SOFTWARE.
 #include "Compatibility.hpp"
 #include "Kernels.hpp"
 
-#define TB_VERSION "1.29"
+#define TB_VERSION "1.30"
 
 extern char const MemTypeStr[];
 extern char const ExeTypeStr[];
@@ -43,6 +43,13 @@ enum ConfigModeEnum
   CFG_A2A = 4
 };
 
+enum BlockOrderEnum
+{
+  ORDER_SEQUENTIAL  = 0,
+  ORDER_INTERLEAVED = 1,
+  ORDER_RANDOM      = 2
+};
 // This class manages environment variables that affect TransferBench
 class EnvVars
 {
@@ -65,7 +72,9 @@ public:
   int const DEFAULT_SWEEP_TIME_LIMIT = 0;
 
   // Environment variables
+  int blockSize;        // Size of each threadblock (must be a multiple of 64)
   int blockBytes;       // Each CU, except the last, gets a multiple of this many bytes to copy
+  int blockOrder;       // How blocks are ordered in single-stream mode (0=Sequential, 1=Interleaved, 2=Random)
   int byteOffset;       // Byte-offset for memory allocations
   int continueOnError;  // Continue tests even after mismatch detected
   int hideEnv;          // Skip printing environment variables
@@ -157,7 +166,9 @@ public:
     else if (archName == "gfx940") defaultGpuKernel = 6;
     else if (archName == "gfx941") defaultGpuKernel = 6;
 
+    blockSize       = GetEnvVar("BLOCK_SIZE"       , 256);
     blockBytes      = GetEnvVar("BLOCK_BYTES"      , 256);
+    blockOrder      = GetEnvVar("BLOCK_ORDER"      , 0);
     byteOffset      = GetEnvVar("BYTE_OFFSET"      , 0);
     continueOnError = GetEnvVar("CONTINUE_ON_ERROR", 0);
     hideEnv         = GetEnvVar("HIDE_ENV"         , 0);
@@ -324,11 +335,26 @@ public:
       printf("[ERROR] Number of GPUs to use (%d) cannot exceed number of detected GPUs (%d)\n", numGpuDevices, numDetectedGpus);
       exit(1);
     }
+    if (blockSize % 64)
+    {
+      printf("[ERROR] BLOCK_SIZE (%d) must be a multiple of 64\n", blockSize);
+      exit(1);
+    }
+    if (blockSize > MAX_BLOCKSIZE)
+    {
+      printf("[ERROR] BLOCK_SIZE (%d) must not exceed %d\n", blockSize, MAX_BLOCKSIZE);
+      exit(1);
+    }
     if (byteOffset % sizeof(float))
     {
       printf("[ERROR] BYTE_OFFSET must be set to multiple of %lu\n", sizeof(float));
       exit(1);
     }
+    if (blockOrder < 0 || blockOrder > 2)
+    {
+      printf("[ERROR] BLOCK_ORDER must be 0 (Sequential), 1 (Interleaved), or 2 (Random)\n");
+      exit(1);
+    }
     if (numWarmups < 0)
     {
       printf("[ERROR] NUM_WARMUPS must be set to a non-negative number\n");
@@ -349,7 +375,6 @@ public:
       printf("[ERROR] BLOCK_BYTES must be a positive multiple of 4\n");
       exit(1);
     }
-
     if (numGpuSubExecs <= 0)
     {
       printf("[ERROR] NUM_GPU_SE must be greater than 0\n");
@@ -454,7 +479,9 @@ public:
   {
     printf("Environment variables:\n");
     printf("======================\n");
-    printf(" BLOCK_BYTES=B         - Each CU (except the last) receives a multiple of BLOCK_BYTES to copy\n");
+    printf(" BLOCK_SIZE            - # of threads per threadblock (must be a multiple of 64). Defaults to 256\n");
+    printf(" BLOCK_BYTES           - Each CU (except the last) receives a multiple of BLOCK_BYTES to copy\n");
+    printf(" BLOCK_ORDER           - Threadblock ordering in single-stream mode (0=Sequential, 1=Interleaved, 2=Random)\n");
     printf(" BYTE_OFFSET           - Initial byte-offset for memory allocations. Must be multiple of 4. Defaults to 0\n");
     printf(" CONTINUE_ON_ERROR     - Continue tests even after mismatch detected\n");
     printf(" CU_MASK               - CU mask for streams specified in hex digits (0-9,a-f,A-F)\n");
@@ -495,8 +522,14 @@ public:
     printf("EnvVar,Value,Description,(TransferBench v%s)\n", TB_VERSION);
     if (hideEnv) return;
 
+    PRINT_EV("BLOCK_SIZE", blockSize,
+             std::string("Threadblock size of " + std::to_string(blockSize)));
     PRINT_EV("BLOCK_BYTES", blockBytes,
              std::string("Each CU gets a multiple of " + std::to_string(blockBytes) + " bytes to copy"));
+    PRINT_EV("BLOCK_ORDER", blockOrder,
+             std::string("Transfer block order: " + std::string(blockOrder == 0 ? "Sequential" :
+                                                                blockOrder == 1 ? "Interleaved" :
+                                                                                  "Random")));
     PRINT_EV("BYTE_OFFSET", byteOffset,
              std::string("Using byte offset of " + std::to_string(byteOffset)));
     PRINT_EV("CONTINUE_ON_ERROR", continueOnError,
@@ -531,12 +564,16 @@ public:
     PRINT_EV("VALIDATE_DIRECT", validateDirect,
              std::string("Validate GPU destination memory ") + (validateDirect ? "directly" : "via CPU staging buffer"));
     printf("\n");
+
+    if (blockOrder != ORDER_SEQUENTIAL && !useSingleStream)
+      printf("[WARN] BLOCK_ORDER is ignored if USE_SINGLE_STREAM is not enabled\n");
   };
 // Display env var for P2P Benchmark preset
 void DisplayP2PBenchmarkEnvVars() const
 {
   DisplayEnvVars();
   if (hideEnv) return;
   if (!outputToCsv)
...
@@ -24,7 +24,7 @@ THE SOFTWARE.
 #define PackedFloat_t float4
 #define WARP_SIZE 64
-#define BLOCKSIZE 256
+#define MAX_BLOCKSIZE 512
 #define FLOATS_PER_PACK (sizeof(PackedFloat_t) / sizeof(float))
 #define MEMSET_CHAR 75
 #define MEMSET_VAL 13323083.0f
@@ -34,11 +34,14 @@ THE SOFTWARE.
 #define MAX_DSTS 16
 
 struct SubExecParam
 {
+  // Inputs
   size_t    N;              // Number of floats this subExecutor works on
   int       numSrcs;        // Number of source arrays
   int       numDsts;        // Number of destination arrays
   float*    src[MAX_SRCS];  // Source array pointers
   float*    dst[MAX_DSTS];  // Destination array pointers
+
+  // Outputs
   long long startCycle;     // Start timestamp for in-kernel timing (GPU-GFX executor)
   long long stopCycle;      // Stop timestamp for in-kernel timing (GPU-GFX executor)
   uint32_t  hwId;           // Hardware ID
@@ -111,7 +114,7 @@ template <> __device__ __forceinline__ float4 MemsetVal(){ return make
 // GPU copy kernel 0: 3 loops: unrolled float4s, float4s, floats
 template <int LOOP1_UNROLL>
-__global__ void __launch_bounds__(BLOCKSIZE)
+__global__ void __launch_bounds__(MAX_BLOCKSIZE)
 GpuReduceKernel(SubExecParam* params)
 {
   int64_t startCycle = wall_clock64();
@@ -128,7 +131,7 @@ GpuReduceKernel(SubExecParam* params)
   size_t Nrem = p.N;
   size_t const loop1Npack = (Nrem / (FLOATS_PER_PACK * LOOP1_UNROLL * WARP_SIZE)) * (LOOP1_UNROLL * WARP_SIZE);
   size_t const loop1Nelem = loop1Npack * FLOATS_PER_PACK;
-  size_t const loop1Inc   = BLOCKSIZE * LOOP1_UNROLL;
+  size_t const loop1Inc   = blockDim.x * LOOP1_UNROLL;
   size_t loop1Offset = waveId * LOOP1_UNROLL * WARP_SIZE + threadId;
 
   while (loop1Offset < loop1Npack)
@@ -167,7 +170,7 @@ GpuReduceKernel(SubExecParam* params)
   // NOTE: Using int32_t due to smaller size requirements
   int32_t const loop2Npack = Nrem / FLOATS_PER_PACK;
   int32_t const loop2Nelem = loop2Npack * FLOATS_PER_PACK;
-  int32_t const loop2Inc   = BLOCKSIZE;
+  int32_t const loop2Inc   = blockDim.x;
   int32_t loop2Offset = threadIdx.x;
 
   while (loop2Offset < loop2Npack)
@@ -229,7 +232,7 @@ template <typename FLOAT_TYPE, int UNROLL_FACTOR>
 __device__ size_t GpuReduceFuncImpl2(SubExecParam const &p, size_t const offset, size_t const N)
 {
   int constexpr numFloatsPerPack = sizeof(FLOAT_TYPE) / sizeof(float); // Number of floats handled at a time per thread
-  size_t constexpr loopPackInc     = BLOCKSIZE * UNROLL_FACTOR;
+  size_t const     loopPackInc     = blockDim.x * UNROLL_FACTOR;
   size_t constexpr numPacksPerWave = WARP_SIZE * UNROLL_FACTOR;
   int const waveId   = threadIdx.x / WARP_SIZE; // Wavefront number
   int const threadId = threadIdx.x % WARP_SIZE; // Thread index within wavefront
@@ -283,8 +286,8 @@ __device__ size_t GpuReduceFuncImpl(SubExecParam const &p, size_t const offset,
 {
   // Each thread in the block works on UNROLL_FACTOR FLOAT_TYPEs during each iteration of the loop
   int constexpr numFloatsPerRead = sizeof(FLOAT_TYPE) / sizeof(float);
-  size_t constexpr numFloatsPerInnerLoop = BLOCKSIZE * numFloatsPerRead;
-  size_t constexpr numFloatsPerOuterLoop = numFloatsPerInnerLoop * UNROLL_FACTOR;
+  size_t const numFloatsPerInnerLoop = blockDim.x * numFloatsPerRead;
+  size_t const numFloatsPerOuterLoop = numFloatsPerInnerLoop * UNROLL_FACTOR;
   size_t const numFloatsLeft = (numFloatsPerRead == 1 && UNROLL_FACTOR == 1) ? 0 : N % numFloatsPerOuterLoop;
   size_t const numFloatsDone = N - numFloatsLeft;
   int const numSrcs = p.numSrcs;
@@ -351,7 +354,7 @@ __device__ size_t GpuReduceFunc(SubExecParam const &p, size_t const offset, size
 }
 
 // GPU copy kernel
-__global__ void __launch_bounds__(BLOCKSIZE)
+__global__ void __launch_bounds__(MAX_BLOCKSIZE)
 GpuReduceKernel2(SubExecParam* params)
 {
   int64_t startCycle = wall_clock64();
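A side effect of making the block size a runtime value is visible throughout these kernels: quantities derived from BLOCKSIZE used to be constexpr, but blockDim.x is only known at launch time, so those locals become plain const. A compact sketch of the distinction, using a hypothetical kernel rather than TransferBench's:

```cpp
#include <hip/hip_runtime.h>

#define WARP_SIZE 64

template <int UNROLL>
__global__ void StrideDemo(float* dst, size_t N)
{
  // Compile-time: depends only on a macro and a template parameter
  size_t constexpr packsPerWave = WARP_SIZE * UNROLL;
  // Runtime-only: blockDim.x is set by the launch, so constexpr is illegal here
  size_t const packInc = blockDim.x * UNROLL;

  for (size_t i = blockIdx.x * packInc + threadIdx.x; i < N; i += gridDim.x * packInc)
    dst[i] = (float)packsPerWave;
}

int main()
{
  size_t const N = 1 << 16;
  float* dst = nullptr;
  hipMalloc((void**)&dst, N * sizeof(float));
  hipLaunchKernelGGL(StrideDemo<4>, dim3(32), dim3(256), 0, 0, dst, N);
  hipDeviceSynchronize();
  hipFree(dst);
}
```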
...
@@ -118,6 +118,7 @@ struct Transfer
   std::vector<SubExecParam>  subExecParam;       // Defines subarrays assigned to each threadblock
   SubExecParam*              subExecParamGpuPtr; // Pointer to GPU copy of subExecParam
+  std::vector<int>           subExecIdx;         // Indices into subExecParamGpu
   std::vector<double>        perIterationTime;   // Per-iteration timing
   std::vector<std::set<int>> perIterationCUs;    // Per-iteration CU usage
...