Unverified commit 08997e88 authored by gilbertlee-amd, committed by GitHub

v1.36 - Adding experimental USE_XCC_FILTER (#67)

parent e047656f
@@ -3,6 +3,13 @@
Documentation for TransferBench is available at
[https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
+## v1.36
+### Additions
+* (Experimental) Adding XCC filtering - combined with XCC_PREF_TABLE, this tries to select
+  specific XCCs to use for specific (SRC->DST) Transfers
## v1.35
### Additions
......
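A hedged usage sketch for the feature added above (assumptions: a hypothetical 2-GPU system; TransferBench reads these knobs from the environment, per the GetEnvVar/getenv calls later in this commit):

    // Illustration only, not from this commit. XCC_PREF_TABLE is parsed as a
    // comma-separated, src-major numGpuDevices x numGpuDevices matrix of XCC IDs.
    #include <cstdlib>
    int main()
    {
      setenv("USE_XCC_FILTER", "1", 1);        // enable the experimental filter
      setenv("XCC_PREF_TABLE", "0,1,2,3", 1);  // GPU0->GPU0: XCC 0, GPU0->GPU1: XCC 1,
                                               // GPU1->GPU0: XCC 2, GPU1->GPU1: XCC 3
      return 0;
    }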
@@ -259,7 +259,7 @@ void ExecuteTransfers(EnvVars const& ev,
int const srcIndex = RemappedIndex(transfer->srcIndex[iSrc], IsCpuType(srcType));
// Ensure executing GPU can access source memory
-if (IsGpuType(exeType) == MEM_GPU && IsGpuType(srcType) && srcIndex != exeIndex)
+if (IsGpuType(exeType) && IsGpuType(srcType) && srcIndex != exeIndex)
EnablePeerAccess(exeIndex, srcIndex);
AllocateMemory(srcType, srcIndex, transfer->numBytesActual + ev.byteOffset, (void**)&transfer->srcMem[iSrc]);
@@ -273,7 +273,7 @@ void ExecuteTransfers(EnvVars const& ev,
int const dstIndex = RemappedIndex(transfer->dstIndex[iDst], IsCpuType(dstType));
// Ensure executing GPU can access destination memory
-if (IsGpuType(exeType) == MEM_GPU && IsGpuType(dstType) && dstIndex != exeIndex)
+if (IsGpuType(exeType) && IsGpuType(dstType) && dstIndex != exeIndex)
EnablePeerAccess(exeIndex, dstIndex);
AllocateMemory(dstType, dstIndex, transfer->numBytesActual + ev.byteOffset, (void**)&transfer->dstMem[iDst]);
@@ -1362,13 +1362,15 @@ void RunTransfer(EnvVars const& ev, int const iteration,
// In single stream mode, all the threadblocks for this GPU are launched
// Otherwise, just launch the threadblocks associated with this single Transfer
int const numBlocksToRun = ev.useSingleStream ? exeInfo.totalSubExecs : transfer->numSubExecs;
+int const numXCCs = (ev.useXccFilter ? ev.xccIdsPerDevice[exeIndex].size() : 1);
#if defined(__NVCC__)
HIP_CALL(hipEventRecord(startEvent, stream));
GpuKernelTable[ev.gpuKernel]<<<numBlocksToRun, ev.blockSize, ev.sharedMemBytes, stream>>>(transfer->subExecParamGpuPtr);
HIP_CALL(hipEventRecord(stopEvent, stream));
#else
hipExtLaunchKernelGGL(GpuKernelTable[ev.gpuKernel],
-dim3(numBlocksToRun, 1, 1),
+dim3(numXCCs, numBlocksToRun, 1),
dim3(ev.blockSize, 1, 1),
ev.sharedMemBytes, stream,
startEvent, stopEvent,
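A hedged reading of the launch change above: the grid gains an x dimension of numXCCs, so each sub-exec (now selected by blockIdx.y) is replicated once per detected XCC, and the kernel-side filter added further down lets only the replica that lands on the preferred XCC do the work. In sketch form:

    // Sketch of the new launch geometry (names as in the hunk above):
    //   x: per-XCC replica of a sub-exec (may early-exit in the kernel)
    //   y: sub-exec index (this was the x dimension before this commit)
    dim3 grid(numXCCs, numBlocksToRun, 1);
    dim3 block(ev.blockSize, 1, 1);

One apparent caveat (unverified): block-to-XCC placement is decided by the hardware scheduler, so a replica is expected, not guaranteed, to land on each XCC.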
@@ -1999,6 +2001,15 @@ void Transfer::PrepareSubExecParams(EnvVars const& ev)
for (int iDst = 0; iDst < this->numDsts; ++iDst)
p.dst[iDst] = this->dstMem[iDst] + assigned + initOffset;
+p.preferredXccId = -1;
+if (ev.useXccFilter)
+{
+  if (this->exeType == EXE_GPU_GFX && this->numDsts == 1 && IsGpuType(this->dstType[0]))
+  {
+    p.preferredXccId = ev.prefXccTable[this->exeIndex][this->dstIndex[0]];
+  }
+}
if (ev.enableDebug)
{
printf("Transfer %02d SE:%02d: %10lu floats: %10lu to %10lu\n",
......
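A hedged reading of the hunk above: the preferred XCC is only wired up for GFX-executor Transfers with exactly one GPU destination; every other Transfer keeps the sentinel and may run on any XCC.

    // Restating the condition above (assumption: -1 is the "no preference" sentinel):
    //   EXE_GPU_GFX && numDsts == 1 && GPU dst -> preferredXccId = prefXccTable[exeIndex][dstIndex[0]]
    //   anything else                          -> preferredXccId = -1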
@@ -29,7 +29,7 @@ THE SOFTWARE.
#include "Compatibility.hpp"
#include "Kernels.hpp"
-#define TB_VERSION "1.35"
+#define TB_VERSION "1.36"
extern char const MemTypeStr[];
extern char const ExeTypeStr[];
@@ -91,10 +91,12 @@ public:
int usePcieIndexing; // Base GPU indexing on PCIe address instead of HIP device
int usePrepSrcKernel; // Use GPU kernel to prepare source data instead of copy (can't be used with fillPattern)
int useSingleStream; // Use a single stream per GPU GFX executor instead of stream per Transfer
+int useXccFilter; // Use XCC filtering (experimental)
int validateDirect; // Validate GPU destination memory directly instead of staging GPU memory on host
std::vector<float> fillPattern; // Pattern of floats used to fill source data
std::vector<uint32_t> cuMask; // Bit-vector representing the CU mask
+std::vector<std::vector<int>> prefXccTable; // Preferred XCC ID for each (src GPU, dst GPU) pair
// Environment variables only for P2P preset
int numCpuSubExecs; // Number of CPU subexecutors to use
@@ -135,6 +137,8 @@ public:
std::vector<int> wallClockPerDeviceMhz;
+std::vector<std::set<int>> xccIdsPerDevice; // Set of distinct XCC IDs detected per GPU
// Constructor that collects values
EnvVars()
{
@@ -187,6 +191,7 @@ public:
usePcieIndexing = GetEnvVar("USE_PCIE_INDEX" , 0);
usePrepSrcKernel = GetEnvVar("USE_PREP_KERNEL" , 0);
useSingleStream = GetEnvVar("USE_SINGLE_STREAM" , 0);
+useXccFilter = GetEnvVar("USE_XCC_FILTER" , 0);
validateDirect = GetEnvVar("VALIDATE_DIRECT" , 0);
enableDebug = GetEnvVar("DEBUG" , 0);
gpuKernel = GetEnvVar("GPU_KERNEL" , defaultGpuKernel);
@@ -327,6 +332,60 @@ public:
#endif
}
+// Figure out the set of XCC IDs available on each device
+int maxNumXccs = 64;  // Upper bound on XCCs probed (one sampling block each)
+xccIdsPerDevice.resize(numGpuDevices);
+for (int i = 0; i < numGpuDevices; i++)
+{
+  int* data;
+  HIP_CALL(hipSetDevice(i));
+  HIP_CALL(hipHostMalloc((void**)&data, maxNumXccs * sizeof(int)));
+  CollectXccIdsKernel<<<maxNumXccs, 1>>>(data);
+  HIP_CALL(hipDeviceSynchronize());
+  xccIdsPerDevice[i].clear();
+  for (int j = 0; j < maxNumXccs; j++)
+    xccIdsPerDevice[i].insert(data[j]);
+  HIP_CALL(hipHostFree(data));
+}
+// Parse preferred XCC table (if provided)
+{
+  prefXccTable.resize(numGpuDevices);
+  for (int i = 0; i < numGpuDevices; i++)
+  {
+    prefXccTable[i].resize(numGpuDevices, 0);
+  }
+  char* prefXccStr = getenv("XCC_PREF_TABLE");
+  // Guard against an unset variable: calling strtok(NULL, ",") here would be invalid
+  char* token = prefXccStr ? strtok(prefXccStr, ",") : NULL;
+  int tokenCount = 0;
+  while (token)
+  {
+    int xccId;
+    if (sscanf(token, "%d", &xccId) == 1)
+    {
+      int src = tokenCount / numGpuDevices;
+      int dst = tokenCount % numGpuDevices;
+      if (xccIdsPerDevice[src].count(xccId) == 0)
+      {
+        printf("[ERROR] GPU %d does not contain XCC %d\n", src, xccId);
+        exit(1);
+      }
+      prefXccTable[src][dst] = xccId;
+      tokenCount++;
+      if (tokenCount == (numGpuDevices * numGpuDevices)) break;
+    }
+    else
+    {
+      printf("[ERROR] Unrecognized token [%s]\n", token);
+      exit(1);
+    }
+    token = strtok(NULL, ",");
+  }
+}
// Perform some basic validation
if (numCpuDevices > numDetectedCpus)
{
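As a worked example of the token-to-cell mapping above (assuming a hypothetical numGpuDevices == 4):

    // src = tokenCount / numGpuDevices, dst = tokenCount % numGpuDevices:
    //   token 0 -> (src 0, dst 0)   token 5 -> (src 1, dst 1)   token 6 -> (src 1, dst 2)
    // At most numGpuDevices * numGpuDevices = 16 values are consumed; each must
    // name an XCC that actually exists on the source GPU, else the run aborts.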
@@ -503,6 +562,7 @@ public:
printf(" USE_PCIE_INDEX - Index GPUs by PCIe address-ordering instead of HIP-provided indexing\n");
printf(" USE_PREP_KERNEL - Use GPU kernel to initialize source data array pattern\n");
printf(" USE_SINGLE_STREAM - Use a single stream per GPU GFX executor instead of stream per Transfer\n");
+printf(" USE_XCC_FILTER - Use XCC filtering (experimental)\n");
printf(" VALIDATE_DIRECT - Validate GPU destination memory directly instead of staging GPU memory on host\n");
}
@@ -519,8 +579,8 @@ public:
if (!outputToCsv)
{
printf("TransferBench v%s\n", TB_VERSION);
-printf("=====================================================\n");
-if (!hideEnv) printf("[Common]\n");
+printf("===============================================================\n");
+if (!hideEnv) printf("[Common] (Suppress by setting HIDE_ENV=1)\n");
}
else if (!hideEnv)
printf("EnvVar,Value,Description,(TransferBench v%s)\n", TB_VERSION);
@@ -566,6 +626,21 @@ public:
std::string("Using ") + (usePrepSrcKernel ? "GPU kernels" : "hipMemcpy") + " to initialize source data");
PRINT_EV("USE_SINGLE_STREAM", useSingleStream,
std::string("Using single stream per ") + (useSingleStream ? "device" : "Transfer"));
+PRINT_EV("USE_XCC_FILTER", useXccFilter,
+         std::string("XCC filtering ") + (useXccFilter ? "enabled" : "disabled"));
+if (useXccFilter)
+{
+  printf("%36s: Preferred XCC Table (XCC_PREF_TABLE)\n", "");
+  printf("%36s: ", "");
+  for (int i = 0; i < numGpuDevices; i++) printf(" %3d", i); printf(" (#XCCs)\n");
+  for (int i = 0; i < numGpuDevices; i++)
+  {
+    printf("%36s: GPU %3d ", "", i);
+    for (int j = 0; j < numGpuDevices; j++)
+      printf(" %3d", prefXccTable[i][j]);
+    printf(" %3lu\n", xccIdsPerDevice[i].size());
+  }
+}
PRINT_EV("VALIDATE_DIRECT", validateDirect,
std::string("Validate GPU destination memory ") + (validateDirect ? "directly" : "via CPU staging buffer"));
printf("\n");
......
@@ -40,6 +40,7 @@ struct SubExecParam
int numDsts; // Number of destination arrays
float* src[MAX_SRCS]; // Source array pointers
float* dst[MAX_DSTS]; // Destination array pointers
+int32_t preferredXccId; // XCC ID to execute on (-1 = no preference; signed, as it is set to and compared against -1)
// Outputs
long long startCycle; // Start timestamp for in-kernel timing (GPU-GFX executor)
@@ -59,11 +60,11 @@ struct SubExecParam
// Macro for collecting HW_REG_XCC_ID
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
-#define __trace_xccreg() \
-  asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (p.xccId));
+#define GetXccId(val) \
+  asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (val));
#else
-#define __trace_xccreg() \
-  p.xccId = 0
+#define GetXccId(val) \
+  val = 0
#endif
void CpuReduceKernel(SubExecParam const& p)
@@ -105,6 +106,13 @@ __host__ __device__ float PrepSrcValue(int srcBufferIdx, size_t idx)
return (((idx % 383) * 517) % 383 + 31) * (srcBufferIdx + 1);
}
+__global__ void CollectXccIdsKernel(int* xccIds)
+{
+  int xccId;
+  GetXccId(xccId);
+  xccIds[blockIdx.x] = xccId;
+}
// GPU kernel to prepare src buffer data
__global__ void
PrepSrcDataKernel(float* ptr, size_t N, int srcBufferIdx)
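A hedged note on the discovery kernel above: each of the maxNumXccs single-thread blocks records whichever hardware XCC it happened to be scheduled onto, and the host-side std::set collapses the duplicate samples.

    // Hypothetical outcome (assumption): on a gfx94x device with 8 XCCs the 64
    // samples collapse to xccIdsPerDevice[i] == {0..7}; on other targets GetXccId
    // expands to val = 0, so the set is {0} and numXCCs degenerates to 1.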
@@ -127,10 +135,17 @@ template <int LOOP1_UNROLL>
__global__ void __launch_bounds__(MAX_BLOCKSIZE)
GpuReduceKernel(SubExecParam* params)
{
-int64_t startCycle = wall_clock64();
+int64_t startCycle;
+if (threadIdx.x == 0) startCycle = wall_clock64();
+SubExecParam& p = params[blockIdx.y];
+
+// Filter by XCC if desired
+int xccId;
+GetXccId(xccId);
+if (p.preferredXccId != -1 && xccId != p.preferredXccId) return;
// Operate on wavefront granularity
-SubExecParam& p = params[blockIdx.x];
int const numSrcs = p.numSrcs;
int const numDsts = p.numDsts;
int const waveId = threadIdx.x / WARP_SIZE; // Wavefront number
@@ -234,8 +249,8 @@ GpuReduceKernel(SubExecParam* params)
{
p.stopCycle = wall_clock64();
p.startCycle = startCycle;
+p.xccId = xccId;
__trace_hwreg();
-__trace_xccreg();
}
}
@@ -369,7 +384,7 @@ __global__ void __launch_bounds__(MAX_BLOCKSIZE)
GpuReduceKernel2(SubExecParam* params)
{
int64_t startCycle = wall_clock64();
-SubExecParam& p = params[blockIdx.x];
+SubExecParam& p = params[blockIdx.y];
size_t numFloatsLeft = GpuReduceFunc<float4>(p, 0, p.N, 8);
if (numFloatsLeft)
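Finally, a hedged observation: GpuReduceKernel2 switches to blockIdx.y for sub-exec selection so every kernel launched through GpuKernelTable agrees with the new two-dimensional grid.

    // Invariant after this commit (as far as the shown hunks indicate):
    //   blockIdx.y -> which SubExecParam a block services
    //   blockIdx.x -> which per-XCC replica is running
    SubExecParam& p = params[blockIdx.y];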
......