Sync release/rocm-rel-7.1 into docs/7.1.0

3b2a07d6 · alexxu-amd · GitHub · 6bcbcf4d · a824bc1b · 3b2a07d6
Unverified Commit 3b2a07d6 authored Oct 30, 2025 by alexxu-amd Committed by GitHub Oct 30, 2025
8 changed files
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,27 @@
 Documentation for TransferBench is available at
 [https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).

+## v1.64.00
+### Added
+- Added BLOCKSIZES to a2asweep preset to allow also sweeping over threadblock sizes
+- Added FILL_COMPRESS to allow more control over input data pattern
+  - FILL_COMPRESS takes in a comma-separated list of integer percentages (that must add up to 100)
+    that sets the percentages of 64B lines to be filled by random/1B0/2B0/4B0/32B0 data patterns
+    - Bins:
+      - 0 - random
+      - 1 - 1B0    upper 1 byte of each aligned 2 bytes is 0
+      - 2 - 2B0    upper 2 bytes of each aligned 4 bytes is 0
+      - 3 - 4B0    upper 4 bytes of each aligned 8 bytes is 0
+      - 4 - 32B0   upper 32 bytes of each aligned 64-byte line are 0
+  - FILL_PATTERN will be ignored if FILL_COMPRESS is specified
+- Additional details about data patterns generated will be printed if the debug env var DUMP_LINES is
+  set to a non-zero value, which also corresponds to how many 64 byte lines will be printed
+### Modified
+- Increased GFX_BLOCKSIZE limit from 512 to 1024 (still requires multiple of 64)
+
+### Fixed
+- Fixed bug when using BYTE_OFFSET
+
 ## v1.63.00
 ### Added
 - Added `gfx950`, `gfx1150`, and `gfx1151` to default GPU targets list in CMake builds

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -9,7 +9,7 @@ if (NOT CMAKE_TOOLCHAIN_FILE)
  message(STATUS "CMAKE_TOOLCHAIN_FILE: ${CMAKE_TOOLCHAIN_FILE}")
 endif()

-set(VERSION_STRING "1.63.00")
+set(VERSION_STRING "1.64.00")
 project(TransferBench VERSION ${VERSION_STRING} LANGUAGES CXX)

 ## Load CMake modules

--- a/Makefile
+++ b/Makefile
@@ -83,4 +83,3 @@ TransferBenchCuda: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp")

 clean:
 	rm -f ./TransferBench ./TransferBenchCuda
-
--- a/docs/sphinx/requirements.in
+++ b/docs/sphinx/requirements.in
-rocm-docs-core==1.22.0
+rocm-docs-core==1.23.0
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -187,7 +187,7 @@ requests==2.32.2
    # via
    #   pygithub
    #   sphinx
-rocm-docs-core==1.22.0
+rocm-docs-core==1.23.0
    # via -r requirements.in
 rpds-py==0.22.3
    # via

--- a/src/client/EnvVars.hpp
+++ b/src/client/EnvVars.hpp
@@ -77,6 +77,7 @@ public:
  int blockBytes;                    // Each subexecutor, except the last, gets a multiple of this many bytes to copy
  int byteOffset;                    // Byte-offset for memory allocations
  vector<float> fillPattern;         // Pattern of floats used to fill source data
+  vector<int> fillCompress;          // Percentages of 64B lines to be filled by random/1B0/2B0/4B0/32B0
  int validateDirect;                // Validate GPU destination memory directly instead of staging GPU memory on host
  int validateSource;                // Validate source GPU memory immediately after preparation

@@ -137,6 +138,7 @@ public:
    alwaysValidate    = GetEnvVar("ALWAYS_VALIDATE"     , 0);
    blockBytes        = GetEnvVar("BLOCK_BYTES"         , 256);
    byteOffset        = GetEnvVar("BYTE_OFFSET"         , 0);
+    fillCompress      = GetEnvVarArray("FILL_COMPRESS"  , {});
    gfxBlockOrder     = GetEnvVar("GFX_BLOCK_ORDER"     , 0);
    gfxBlockSize      = GetEnvVar("GFX_BLOCK_SIZE"      , 256);
    gfxSingleTeam     = GetEnvVar("GFX_SINGLE_TEAM"     , 1);
@@ -314,6 +316,7 @@ public:
    printf(" CLOSEST_NIC       - Comma-separated list of per-GPU closest NIC (default=auto)\n");
 #endif
    printf(" CU_MASK           - CU mask for streams. Can specify ranges e.g '5,10-12,14'\n");
+    printf(" FILL_COMPRESS     - Percentages of 64B lines to be filled by random/1B0/2B0/4B0/32B0\n");
    printf(" FILL_PATTERN      - Big-endian pattern for source data, specified in hex digits. Must be even # of digits\n");
    printf(" GFX_BLOCK_ORDER   - How blocks for transfers are ordered. 0=sequential, 1=interleaved\n");
    printf(" GFX_BLOCK_SIZE    - # of threads per threadblock (Must be multiple of 64)\n");
@@ -400,6 +403,8 @@ public:
 #endif
    Print("CU_MASK", getenv("CU_MASK") ? 1 : 0,
          "%s", (cuMask.size() ? GetCuMaskDesc().c_str() : "All"));
+    Print("FILL_COMPRESS", getenv("FILL_COMPRESS") ? 1 : 0,
+          "%s", (fillCompress.size() ? GetStr(fillCompress).c_str() : "Not specified"));
    Print("FILL_PATTERN", getenv("FILL_PATTERN") ? 1 : 0,
          "%s", (fillPattern.size() ? getenv("FILL_PATTERN") : TransferBench::GetStrAttribute(ATR_SRC_PREP_DESCRIPTION).c_str()));
    Print("GFX_BLOCK_ORDER", gfxBlockOrder,
@@ -493,6 +498,27 @@ public:
  }

  static std::vector<int> GetEnvVarArray(std::string const& varname, std::vector<int> const& defaultValue)
+  {
+    if (getenv(varname.c_str())) {
+      std::vector<int> values;
+      char* arrayStr = getenv(varname.c_str());
+      char* token = strtok(arrayStr, ",");
+      while (token) {
+        int val;
+        if (sscanf(token, "%d", &val) == 1) {
+          values.push_back(val);
+        } else {
+          printf("[ERROR] Unrecognized token [%s]\n", token);
+          exit(1);
+        }
+        token = strtok(NULL, ",");
+      }
+      return values;
+    }
+    return defaultValue;
+  }
+
+  static std::vector<int> GetEnvVarRangeArray(std::string const& varname, std::vector<int> const& defaultValue)
  {
    if (getenv(varname.c_str())) {
      char* rangeStr = getenv(varname.c_str());
@@ -524,6 +550,15 @@ public:
    return defaultValue;
  }

+  std::string GetStr(std::vector<int> const& varnameList) const {
+    std::string result = "";
+    for (int i = 0; i < varnameList.size(); i++) {
+      if (i) result += ",";
+      result += std::to_string(varnameList[i]);
+    }
+    return result;
+  }
+
  std::string GetCuMaskDesc() const
  {
    std::vector<std::pair<int, int>> runs;
@@ -572,9 +607,10 @@ public:
    cfg.data.alwaysValidate        = alwaysValidate;
    cfg.data.blockBytes            = blockBytes;
    cfg.data.byteOffset            = byteOffset;
+    cfg.data.fillCompress          = fillCompress;
+    cfg.data.fillPattern           = fillPattern;
    cfg.data.validateDirect        = validateDirect;
    cfg.data.validateSource        = validateSource;
-    cfg.data.fillPattern           = fillPattern;

    cfg.dma.useHipEvents           = useHipEvents;
    cfg.dma.useHsaCopy             = useHsaDma;

--- a/src/client/Presets/AllToAllSweep.hpp
+++ b/src/client/Presets/AllToAllSweep.hpp
@@ -50,6 +50,7 @@ void AllToAllSweepPreset(EnvVars&           ev,
  int useSpray      = EnvVars::GetEnvVar("USE_SPRAY",       0);
  int verbose       = EnvVars::GetEnvVar("VERBOSE",         0);

+  std::vector<int> blockList  = EnvVars::GetEnvVarArray("BLOCKSIZES", {256});
  std::vector<int> unrollList = EnvVars::GetEnvVarArray("UNROLLS", {1,2,3,4,6,8});
  std::vector<int> numCusList = EnvVars::GetEnvVarArray("NUM_CUS", {4,8,12,16,24,32});

@@ -77,6 +78,7 @@ void AllToAllSweepPreset(EnvVars&           ev,
    ev.Print("A2A_MODE"       , (a2aMode == A2A_CUSTOM) ?  std::to_string(numSrcs) + ":" + std::to_string(numDsts) : std::to_string(a2aMode),
                                (a2aMode == A2A_CUSTOM) ? (std::to_string(numSrcs) + " read(s) " +
                                                           std::to_string(numDsts) + " write(s)").c_str(): a2aModeStr[a2aMode]);
+    ev.Print("BLOCKSIZES"     , blockList.size() , EnvVars::ToStr(blockList).c_str());
    ev.Print("SHOW_MIN_ONLY"  , showMinOnly      , showMinOnly ? "Showing only slowest GPU results" : "Showing slowest and fastest GPU results");
    ev.Print("NUM_CUS"        , numCusList.size(), EnvVars::ToStr(numCusList).c_str());
    ev.Print("NUM_GPU_DEVICES", numGpus          , "Using %d GPUs", numGpus);
@@ -180,6 +182,10 @@ void AllToAllSweepPreset(EnvVars&           ev,
  std::map<std::pair<int, int>, TransferBench::TestResults> results;

  // Display summary
+  for (int blockSize : blockList) {
+    printf("Blocksize: %d\n", blockSize);
+    ev.gfxBlockSize = cfg.gfx.blockSize = blockSize;
+
    printf("#CUs\\Unroll");
    for (int u : unrollList) {
      printf("  %d(Min) ", u);
@@ -225,4 +231,5 @@ void AllToAllSweepPreset(EnvVars&           ev,
        }
      }
    }
+  }
 }
--- a/src/header/TransferBench.hpp
+++ b/src/header/TransferBench.hpp
 /*
-Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) Advanced Micro Devices, Inc. All rights reserved.

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -66,7 +66,7 @@ namespace TransferBench
  using std::set;
  using std::vector;

-  constexpr char VERSION[] = "1.63";
+  constexpr char VERSION[] = "1.64";

  /**
   * Enumeration of supported Executor types
@@ -166,6 +166,7 @@ namespace TransferBench
    int           blockBytes       = 256;       ///< Each subexecutor works on a multiple of this many bytes
    int           byteOffset       = 0;         ///< Byte-offset for memory allocations
    vector<float> fillPattern      = {};        ///< Pattern of floats used to fill source data
+    vector<int>   fillCompress     = {};        ///< Customized data patterns (overrides fillPattern if non-empty)
    int           validateDirect   = 0;         ///< Validate GPU results directly instead of copying to host
    int           validateSource   = 0;         ///< Validate src GPU memory immediately after preparation
  };
@@ -599,7 +600,7 @@ namespace {
 // Constants
 //========================================================================================

-  int   constexpr MAX_BLOCKSIZE  = 512;                // Max threadblock size
+  int   constexpr MAX_BLOCKSIZE  = 1024;               // Max threadblock size
  int   constexpr MAX_WAVEGROUPS = MAX_BLOCKSIZE / 64; // Max wavegroups/warps
  int   constexpr MAX_UNROLL     = 8;                  // Max unroll factor
  int   constexpr MAX_SRCS       = 8;                  // Max srcs per Transfer
@@ -681,8 +682,8 @@ namespace {
    int canAccess;
    ERR_CHECK(hipDeviceCanAccessPeer(&canAccess, deviceId, peerDeviceId));
    if (!canAccess)
-      return {ERR_FATAL,
-              "Unable to enable peer access from GPU devices %d to %d", peerDeviceId, deviceId};
+      return {ERR_FATAL, "Peer access is unavailable between GPU devices %d to %d."
+                         "For AMD hardware, check IOMMU configuration", peerDeviceId, deviceId};

    ERR_CHECK(hipSetDevice(deviceId));
    hipError_t error = hipDeviceEnablePeerAccess(peerDeviceId, 0);
@@ -973,6 +974,19 @@ namespace {
      errors.push_back({ERR_FATAL, "[data.blockBytes] must be positive multiple of %lu", sizeof(float)});
    if (cfg.data.byteOffset < 0 || cfg.data.byteOffset % sizeof(float))
      errors.push_back({ERR_FATAL, "[data.byteOffset] must be positive multiple of %lu", sizeof(float)});
+    if (cfg.data.fillCompress.size() > 0 && cfg.data.fillPattern.size() > 0)
+      errors.push_back({ERR_WARN, "[data.fillCompress] will override [data.fillPattern] when both are specified"});
+    if (cfg.data.fillCompress.size() > 0) {
+      int sum = 0;
+      for (int bin : cfg.data.fillCompress)
+        sum += bin;
+      if (sum != 100) {
+        errors.push_back({ERR_FATAL, "[data.fillCompress] values must add up to 100"});
+      }
+    }
+    if (cfg.data.fillCompress.size() > 5) {
+      errors.push_back({ERR_FATAL, "[data.fillCompress] may only have up to 5 values"});
+    }

    // Check GFX options
    if (cfg.gfx.blockOrder < 0 || cfg.gfx.blockOrder > 2)
@@ -2162,8 +2176,104 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
  {
    size_t N = cpuBuffer.size();

-    // Source buffer
-    if (bufferIdx >= 0) {
+    if (!cfg.data.fillCompress.empty()) {
+      // 0 -> Random
+      // 1 ->  1B0 - The upper  1 byte  of each aligned 2 bytes is 0
+      // 2 ->  2B0 - The upper  2 bytes of each aligned 4 bytes are 0
+      // 3 ->  4B0 - The upper  4 bytes of each aligned 8 bytes are 0
+      // 4 -> 32B0 - The upper 32 bytes of each 64-byte line are 0
+
+      // Fill buffer with random floats
+      std::mt19937 gen;
+      gen.seed(bufferIdx * 425);
+      std::uniform_real_distribution<float> dist(-100000.0f, +100000.0f);
+      for (size_t i = 0; i < N; i++) {
+        cpuBuffer[i] = dist(gen);
+      }
+
+      // Figure out distribution for lines based on the percentages given
+      size_t numLines = N / 16;
+      size_t leftover = numLines;
+      std::vector<size_t> lineCounts(5, 0);
+      std::set<std::pair<double, int>> remainder;
+
+      // Assign rounded down values first
+      std::vector<int> percentages = cfg.data.fillCompress;
+      while (percentages.size() < 5) percentages.push_back(0);
+      for (int i = 0; i < percentages.size(); i++){
+        lineCounts[i] = (size_t)(numLines * (percentages[i] / 100.0));
+        leftover -= lineCounts[i];
+        remainder.insert(std::make_pair(numLines * (percentages[i] / 100.0) - lineCounts[i], i));
+      }
+
+      // Assign leftovers based on largest remainder
+      while (leftover != 0) {
+        auto last = *remainder.rbegin();
+        lineCounts[last.second]++;
+        remainder.erase(last);
+        leftover--;
+      }
+
+      // Randomly decide which lines get assigned to which types
+      std::vector<int> lineTypes(numLines, 0);
+      int offset = lineCounts[0];
+      for (int i = 1; i < 5; i++) {
+        for (int j = 0; j < lineCounts[i]; j++)
+          lineTypes[offset++] = i;
+      }
+      std::shuffle(lineTypes.begin(), lineTypes.end(), gen);
+
+      // Apply zero-ing
+      int dumpLines = getenv("DUMP_LINES") ? atoi(getenv("DUMP_LINES")) : 0;
+
+      if (dumpLines) {
+        printf("Input pattern 64B line statistics for bufferIdx %d:\n", bufferIdx);
+        printf("Total lines: %lu\n", numLines);
+        printf("- 0: Random : %8lu (%8.3f%%)\n", lineCounts[0], 100.0 * lineCounts[0] / (1.0 * numLines));
+        printf("- 1: 1B0    : %8lu (%8.3f%%)\n", lineCounts[1], 100.0 * lineCounts[1] / (1.0 * numLines));
+        printf("- 2: 2B0    : %8lu (%8.3f%%)\n", lineCounts[2], 100.0 * lineCounts[2] / (1.0 * numLines));
+        printf("- 3: 4B0    : %8lu (%8.3f%%)\n", lineCounts[3], 100.0 * lineCounts[3] / (1.0 * numLines));
+        printf("- 4: 32B0   : %8lu (%8.3f%%)\n", lineCounts[4], 100.0 * lineCounts[4] / (1.0 * numLines));
+      }
+
+      for (int line = 0; line < numLines; line++) {
+        unsigned char* linePtr = (unsigned char*)&cpuBuffer[line * 16];
+
+        switch (lineTypes[line]) {
+        case 1: // 1B0
+          for (int i = 0; i < 32; i++)
+            linePtr[2*i+1] = 0;
+          break;
+        case 2: // 2B0
+          for (int i = 0; i < 16; i++) {
+            linePtr[4*i+2] = 0;
+            linePtr[4*i+3] = 0;
+          }
+          break;
+        case 3: // 4B0
+          for (int i = 0; i < 8; i++) {
+            linePtr[8*i+4] = 0;
+            linePtr[8*i+5] = 0;
+            linePtr[8*i+6] = 0;
+            linePtr[8*i+7] = 0;
+          }
+          break;
+        case 4: // 32B0
+          for (int i = 32; i < 64; i++)
+            linePtr[i] = 0;
+          break;
+        }
+
+        if (line < dumpLines) {
+          printf("Line %02d [%d]: ", line, lineTypes[line]);
+          for (int j = 63; j >= 0; j--){
+            printf("%02x ", linePtr[j]);
+            if (j % 16 == 0) printf(" ");
+          }
+          printf("\n");
+        }
+      }
+    } else {
      // Use fill pattern if specified
      size_t patternLen = cfg.data.fillPattern.size();
      if (patternLen > 0) {
@@ -2177,27 +2287,10 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
        if (leftOver)
          memcpy(cpuBufferPtr, cfg.data.fillPattern.data(), leftOver * sizeof(float));
      } else {
+        // Fall back to pseudo-random
        for (size_t i = 0; i < N; ++i)
          cpuBuffer[i] = PrepSrcValue(bufferIdx, i);
      }
-    } else { // Destination buffer
-      int numSrcs = -bufferIdx - 1;
-
-      if (numSrcs == 0) {
-        // Note: 0x75757575 = 13323083.0
-        memset(cpuBuffer.data(), MEMSET_CHAR, N * sizeof(float));
-      } else {
-        PrepareReference(cfg, cpuBuffer, 0);
-        if (numSrcs > 1) {
-          std::vector<float> temp(N);
-          for (int i = 1; i < numSrcs; i++) {
-            PrepareReference(cfg, temp, i);
-            for (int j = 0; j < N; j++) {
-              cpuBuffer[i] += temp[i];
-            }
-          }
-        }
-      }
    }
  }

@@ -2472,7 +2565,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
        }

        std::random_device rd;
-        std::default_random_engine gen(rd());
+        std::mt19937 gen(rd());
        std::shuffle(indices.begin(), indices.end(), gen);

        // Build randomized threadblock list
@@ -3055,7 +3148,15 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
    GPU_KERNEL_UNROLL_DECL(320),
    GPU_KERNEL_UNROLL_DECL(384),
    GPU_KERNEL_UNROLL_DECL(448),
-    GPU_KERNEL_UNROLL_DECL(512)
+    GPU_KERNEL_UNROLL_DECL(512),
+    GPU_KERNEL_UNROLL_DECL(576),
+    GPU_KERNEL_UNROLL_DECL(640),
+    GPU_KERNEL_UNROLL_DECL(704),
+    GPU_KERNEL_UNROLL_DECL(768),
+    GPU_KERNEL_UNROLL_DECL(832),
+    GPU_KERNEL_UNROLL_DECL(896),
+    GPU_KERNEL_UNROLL_DECL(960),
+    GPU_KERNEL_UNROLL_DECL(1024),
  };
  #undef GPU_KERNEL_UNROLL_DECL
  #undef GPU_KERNEL_DWORD_DECL
@@ -3442,6 +3543,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
    vector<float> outputBuffer(maxN);
    vector<vector<float>> dstReference(maxNumSrcs + 1, vector<float>(maxN));
    {
+      size_t initOffset = cfg.data.byteOffset / sizeof(float);
      vector<vector<float>> srcReference(maxNumSrcs, vector<float>(maxN));
      memset(dstReference[0].data(), MEMSET_CHAR, maxNumBytes);

@@ -3458,7 +3560,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
      // Initialize all src memory buffers
      for (auto resource : transferResources) {
        for (int srcIdx = 0; srcIdx < resource->srcMem.size(); srcIdx++) {
-          ERR_APPEND(hipMemcpy(resource->srcMem[srcIdx], srcReference[srcIdx].data(), resource->numBytes,
+          ERR_APPEND(hipMemcpy(resource->srcMem[srcIdx] + initOffset, srcReference[srcIdx].data(), resource->numBytes,
                               hipMemcpyDefault), errResults);
        }
      }