Fix inf, CU labelling. Update default kernels for gfx94x (#56)

0b29707e · gilbertlee-amd · GitHub · 0b7b979e · 0b29707e · 0b29707e
Unverified Commit 0b29707e authored Oct 11, 2023 by gilbertlee-amd Committed by GitHub Oct 11, 2023
Showing with 45 additions and 43 deletions

CHANGELOG.md CHANGELOG.md +10 -0

src/TransferBench.cpp src/TransferBench.cpp +6 -40

src/include/EnvVars.hpp src/include/EnvVars.hpp +29 -2

src/include/TransferBench.hpp src/include/TransferBench.hpp +0 -1

No files found.
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
 # Changelog for TransferBench

+## v1.29
+### Added
+- a2a preset config now responds to USE_REMOTE_READ
+### Fixed
+- Race condition during wall-clock initialization that caused "inf" results during single-stream runs
+- CU numbering output after CU masking
+### Modified
+- Default number of warmups reverted to 3
+- Default unroll factor for gfx940/941 set to 6
+
 ## v1.28
 ### Added
 - Added A2A_DIRECT, which executes all-to-all only between directly connected GPUs (on by default now)

--- a/src/TransferBench.cpp
+++ b/src/TransferBench.cpp
@@ -1254,9 +1254,9 @@ void CheckPages(char* array, size_t numBytes, int targetId)
 uint32_t GetId(uint32_t hwId)
 {
  // Based on instinct-mi200-cdna2-instruction-set-architecture.pdf
-  int const shId = (hwId >> 12) & 1;
-  int const cuId = (hwId >>  8) & 7;
-  int const seId = (hwId >> 13) & 3;
+  int const shId = (hwId >> 12) &  1;
+  int const cuId = (hwId >>  8) & 15;
+  int const seId = (hwId >> 13) &  3;
  return (shId << 5) + (cuId << 2) + seId;
 }

@@ -1313,7 +1313,7 @@ void RunTransfer(EnvVars const& ev, int const iteration,
            minStartCycle = std::min(minStartCycle, currTransfer->subExecParamGpuPtr[i].startCycle);
            maxStopCycle  = std::max(maxStopCycle,  currTransfer->subExecParamGpuPtr[i].stopCycle);
          }
-          int const wallClockRate = GetWallClockRate(exeIndex);
+          int const wallClockRate = ev.wallClockPerDeviceMhz[exeIndex];
          double iterationTimeMs = (maxStopCycle - minStartCycle) / (double)(wallClockRate);
          currTransfer->transferTime += iterationTimeMs;
          if (ev.showIterations)
@@ -1799,10 +1799,11 @@ void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, i
  for (int i = 0; i < numGpus; i++)
  {
    transfer.srcIndex[0] = i;
-    transfer.exeIndex    = i;
    for (int j = 0; j < numGpus; j++)
    {
      transfer.dstIndex[0] = j;
+      transfer.exeIndex    = (ev.useRemoteRead ? j : i);
+
      if (ev.a2aDirect)
      {
 #if !defined(__NVCC__)
@@ -2124,41 +2125,6 @@ std::string Transfer::DstToStr() const
  return ss.str();
 }

-// NOTE: This is a stop-gap solution until HIP provides wallclock values
-int GetWallClockRate(int deviceId)
-{
-  static std::vector<int> wallClockPerDeviceMhz;
-
-  if (wallClockPerDeviceMhz.size() == 0)
-  {
-    int numGpuDevices;
-    HIP_CALL(hipGetDeviceCount(&numGpuDevices));
-    wallClockPerDeviceMhz.resize(numGpuDevices);
-
-    for (int i = 0; i < numGpuDevices; i++)
-    {
-#if defined(__NVCC__)
-      int value = 1410000;
-      //HIP_CALL(hipDeviceGetAttribute(&value, hipDeviceAttributeClockRate, i));
-      //value *= 1000;
-#else
-      hipDeviceProp_t prop;
-      HIP_CALL(hipGetDeviceProperties(&prop, i));
-      int value = 25000;
-      switch (prop.gcnArch)
-      {
-      case 906: case 910: value = 25000; break;
-      case 940: case 941: case 942: value = 100000; break;
-      default:
-        printf("Unrecognized GCN arch %d\n", prop.gcnArch);
-      }
-#endif
-      wallClockPerDeviceMhz[i] = value;
-    }
-  }
-  return wallClockPerDeviceMhz[deviceId];
-}
-
 void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int const numGpuSubExecs, int const numCpuSubExecs, bool const isRandom)
 {
  ev.DisplaySweepEnvVars();

--- a/src/include/EnvVars.hpp
+++ b/src/include/EnvVars.hpp
@@ -29,7 +29,7 @@ THE SOFTWARE.
 #include "Compatibility.hpp"
 #include "Kernels.hpp"

-#define TB_VERSION "1.28"
+#define TB_VERSION "1.29"

 extern char const MemTypeStr[];
 extern char const ExeTypeStr[];
@@ -48,7 +48,7 @@ class EnvVars
 {
 public:
  // Default configuration values
-  int const DEFAULT_NUM_WARMUPS          =  1;
+  int const DEFAULT_NUM_WARMUPS          =  3;
  int const DEFAULT_NUM_ITERATIONS       = 10;
  int const DEFAULT_SAMPLING_FACTOR      =  1;

@@ -123,6 +123,8 @@ public:
  // Track how many CPUs are available per NUMA node
  std::vector<int> numCpusPerNuma;

+  std::vector<int> wallClockPerDeviceMhz;
+
  // Constructor that collects values
  EnvVars()
  {
@@ -152,6 +154,8 @@ public:
    int defaultGpuKernel = 0;
    if      (archName == "gfx906") defaultGpuKernel = 13;
    else if (archName == "gfx90a") defaultGpuKernel = 9;
+    else if (archName == "gfx940") defaultGpuKernel = 6;
+    else if (archName == "gfx941") defaultGpuKernel = 6;

    blockBytes        = GetEnvVar("BLOCK_BYTES"         , 256);
    byteOffset        = GetEnvVar("BYTE_OFFSET"         , 0);
@@ -411,6 +415,26 @@ public:
    for (int i = 0; i < totalCpus; i++)
      numCpusPerNuma[numa_node_of_cpu(i)]++;

+    // Build array of wall clock rates per GPU device
+    wallClockPerDeviceMhz.resize(numDetectedGpus);
+    for (int i = 0; i < numDetectedGpus; i++)
+    {
+#if defined(__NVCC__)
+      // NOTE: wallClock doesn't exist in CUDA.  This may need to be adjusted / run with fixed clocks
+      wallClockPerDeviceMhz[i] = 1410000;
+#else
+      hipDeviceProp_t prop;
+      HIP_CALL(hipGetDeviceProperties(&prop, i));
+      int value = 25000;
+      std::string fullName = prop.gcnArchName;
+      std::string archName = fullName.substr(0, fullName.find(':'));
+      if (archName == "gfx940" || archName == "gfx941" || archName == "gfx942")
+        wallClockPerDeviceMhz[i] = 100000;
+      else
+        wallClockPerDeviceMhz[i] = 25000;
+#endif
+    }
+
    // Check for deprecated env vars
    if (getenv("USE_HIP_CALL"))
    {
@@ -577,6 +601,9 @@ public:
      printf("[AllToAll Related]\n");
    PRINT_EV("A2A_DIRECT", a2aDirect,
             std::string(a2aDirect ? "Only using direct links" : "Full all-to-all"));
+    PRINT_EV("USE_REMOTE_READ", useRemoteRead,
+             std::string("Using ") + (useRemoteRead ? "DST" : "SRC") + " as executor");
+
    printf("\n");
  }


--- a/src/include/TransferBench.hpp
+++ b/src/include/TransferBench.hpp
@@ -193,6 +193,5 @@ void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, i
 std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount);

 int RemappedIndex(int const origIdx, bool const isCpuType);
-int GetWallClockRate(int deviceId);
 void LogTransfers(FILE *fp, int const testNum, std::vector<Transfer> const& transfers);
 std::string PtrVectorToStr(std::vector<float*> const& strVector, int const initOffset);