Unverified Commit 3b2a07d6 authored by alexxu-amd's avatar alexxu-amd Committed by GitHub
Browse files

Sync release/rocm-rel-7.1 into docs/7.1.0

parents 6bcbcf4d a824bc1b
...@@ -3,6 +3,27 @@ ...@@ -3,6 +3,27 @@
Documentation for TransferBench is available at Documentation for TransferBench is available at
[https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench). [https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
## v1.64.00
### Added
- Added BLOCKSIZES to a2asweep preset to allow also sweeping over threadblock sizes
- Added FILL_COMPRESS to allow more control over input data pattern
- FILL_COMPRESS takes in a comma-separated list of integer percentages (that must add up to 100)
that sets the percentages of 64B lines to be filled by random/1B0/2B0/4B0/32B0 data patterns
- Bins:
- 0 - random
- 1 - 1B0 upper 1 byte of each aligned 2 bytes is 0
- 2 - 2B0 upper 2 bytes of each aligned 4 bytes is 0
- 3 - 4B0 upper 4 bytes of each aligned 8 bytes is 0
- 4 - 32B0 upper 32 bytes of each aligned 64-byte line are 0
- FILL_PATTERN will be ignored if FILL_COMPRESS is specified
- Additional details about data patterns generated will be printed if the debug env var DUMP_LINES is
set to a non-zero value, which also corresponds to how many 64 byte lines will be printed
### Modified
- Increased GFX_BLOCKSIZE limit from 512 to 1024 (still requires multiple of 64)
### Fixed
- Fixed bug when using BYTE_OFFSET
## v1.63.00 ## v1.63.00
### Added ### Added
- Added `gfx950`, `gfx1150`, and `gfx1151` to default GPU targets list in CMake builds - Added `gfx950`, `gfx1150`, and `gfx1151` to default GPU targets list in CMake builds
......
...@@ -9,7 +9,7 @@ if (NOT CMAKE_TOOLCHAIN_FILE) ...@@ -9,7 +9,7 @@ if (NOT CMAKE_TOOLCHAIN_FILE)
message(STATUS "CMAKE_TOOLCHAIN_FILE: ${CMAKE_TOOLCHAIN_FILE}") message(STATUS "CMAKE_TOOLCHAIN_FILE: ${CMAKE_TOOLCHAIN_FILE}")
endif() endif()
set(VERSION_STRING "1.63.00") set(VERSION_STRING "1.64.00")
project(TransferBench VERSION ${VERSION_STRING} LANGUAGES CXX) project(TransferBench VERSION ${VERSION_STRING} LANGUAGES CXX)
## Load CMake modules ## Load CMake modules
......
...@@ -83,4 +83,3 @@ TransferBenchCuda: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp") ...@@ -83,4 +83,3 @@ TransferBenchCuda: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp")
clean: clean:
rm -f ./TransferBench ./TransferBenchCuda rm -f ./TransferBench ./TransferBenchCuda
rocm-docs-core==1.22.0 rocm-docs-core==1.23.0
...@@ -187,7 +187,7 @@ requests==2.32.2 ...@@ -187,7 +187,7 @@ requests==2.32.2
# via # via
# pygithub # pygithub
# sphinx # sphinx
rocm-docs-core==1.22.0 rocm-docs-core==1.23.0
# via -r requirements.in # via -r requirements.in
rpds-py==0.22.3 rpds-py==0.22.3
# via # via
......
...@@ -77,6 +77,7 @@ public: ...@@ -77,6 +77,7 @@ public:
int blockBytes; // Each subexecutor, except the last, gets a multiple of this many bytes to copy int blockBytes; // Each subexecutor, except the last, gets a multiple of this many bytes to copy
int byteOffset; // Byte-offset for memory allocations int byteOffset; // Byte-offset for memory allocations
vector<float> fillPattern; // Pattern of floats used to fill source data vector<float> fillPattern; // Pattern of floats used to fill source data
vector<int> fillCompress; // Percentages of 64B lines to be filled by random/1B0/2B0/4B0/32B0
int validateDirect; // Validate GPU destination memory directly instead of staging GPU memory on host int validateDirect; // Validate GPU destination memory directly instead of staging GPU memory on host
int validateSource; // Validate source GPU memory immediately after preparation int validateSource; // Validate source GPU memory immediately after preparation
...@@ -137,6 +138,7 @@ public: ...@@ -137,6 +138,7 @@ public:
alwaysValidate = GetEnvVar("ALWAYS_VALIDATE" , 0); alwaysValidate = GetEnvVar("ALWAYS_VALIDATE" , 0);
blockBytes = GetEnvVar("BLOCK_BYTES" , 256); blockBytes = GetEnvVar("BLOCK_BYTES" , 256);
byteOffset = GetEnvVar("BYTE_OFFSET" , 0); byteOffset = GetEnvVar("BYTE_OFFSET" , 0);
fillCompress = GetEnvVarArray("FILL_COMPRESS" , {});
gfxBlockOrder = GetEnvVar("GFX_BLOCK_ORDER" , 0); gfxBlockOrder = GetEnvVar("GFX_BLOCK_ORDER" , 0);
gfxBlockSize = GetEnvVar("GFX_BLOCK_SIZE" , 256); gfxBlockSize = GetEnvVar("GFX_BLOCK_SIZE" , 256);
gfxSingleTeam = GetEnvVar("GFX_SINGLE_TEAM" , 1); gfxSingleTeam = GetEnvVar("GFX_SINGLE_TEAM" , 1);
...@@ -314,6 +316,7 @@ public: ...@@ -314,6 +316,7 @@ public:
printf(" CLOSEST_NIC - Comma-separated list of per-GPU closest NIC (default=auto)\n"); printf(" CLOSEST_NIC - Comma-separated list of per-GPU closest NIC (default=auto)\n");
#endif #endif
printf(" CU_MASK - CU mask for streams. Can specify ranges e.g '5,10-12,14'\n"); printf(" CU_MASK - CU mask for streams. Can specify ranges e.g '5,10-12,14'\n");
printf(" FILL_COMPRESS - Percentages of 64B lines to be filled by random/1B0/2B0/4B0/32B0\n");
printf(" FILL_PATTERN - Big-endian pattern for source data, specified in hex digits. Must be even # of digits\n"); printf(" FILL_PATTERN - Big-endian pattern for source data, specified in hex digits. Must be even # of digits\n");
printf(" GFX_BLOCK_ORDER - How blocks for transfers are ordered. 0=sequential, 1=interleaved\n"); printf(" GFX_BLOCK_ORDER - How blocks for transfers are ordered. 0=sequential, 1=interleaved\n");
printf(" GFX_BLOCK_SIZE - # of threads per threadblock (Must be multiple of 64)\n"); printf(" GFX_BLOCK_SIZE - # of threads per threadblock (Must be multiple of 64)\n");
...@@ -400,6 +403,8 @@ public: ...@@ -400,6 +403,8 @@ public:
#endif #endif
Print("CU_MASK", getenv("CU_MASK") ? 1 : 0, Print("CU_MASK", getenv("CU_MASK") ? 1 : 0,
"%s", (cuMask.size() ? GetCuMaskDesc().c_str() : "All")); "%s", (cuMask.size() ? GetCuMaskDesc().c_str() : "All"));
Print("FILL_COMPRESS", getenv("FILL_COMPRESS") ? 1 : 0,
"%s", (fillCompress.size() ? GetStr(fillCompress).c_str() : "Not specified"));
Print("FILL_PATTERN", getenv("FILL_PATTERN") ? 1 : 0, Print("FILL_PATTERN", getenv("FILL_PATTERN") ? 1 : 0,
"%s", (fillPattern.size() ? getenv("FILL_PATTERN") : TransferBench::GetStrAttribute(ATR_SRC_PREP_DESCRIPTION).c_str())); "%s", (fillPattern.size() ? getenv("FILL_PATTERN") : TransferBench::GetStrAttribute(ATR_SRC_PREP_DESCRIPTION).c_str()));
Print("GFX_BLOCK_ORDER", gfxBlockOrder, Print("GFX_BLOCK_ORDER", gfxBlockOrder,
...@@ -493,6 +498,27 @@ public: ...@@ -493,6 +498,27 @@ public:
} }
static std::vector<int> GetEnvVarArray(std::string const& varname, std::vector<int> const& defaultValue) static std::vector<int> GetEnvVarArray(std::string const& varname, std::vector<int> const& defaultValue)
{
if (getenv(varname.c_str())) {
std::vector<int> values;
char* arrayStr = getenv(varname.c_str());
char* token = strtok(arrayStr, ",");
while (token) {
int val;
if (sscanf(token, "%d", &val) == 1) {
values.push_back(val);
} else {
printf("[ERROR] Unrecognized token [%s]\n", token);
exit(1);
}
token = strtok(NULL, ",");
}
return values;
}
return defaultValue;
}
static std::vector<int> GetEnvVarRangeArray(std::string const& varname, std::vector<int> const& defaultValue)
{ {
if (getenv(varname.c_str())) { if (getenv(varname.c_str())) {
char* rangeStr = getenv(varname.c_str()); char* rangeStr = getenv(varname.c_str());
...@@ -524,6 +550,15 @@ public: ...@@ -524,6 +550,15 @@ public:
return defaultValue; return defaultValue;
} }
std::string GetStr(std::vector<int> const& varnameList) const {
std::string result = "";
for (int i = 0; i < varnameList.size(); i++) {
if (i) result += ",";
result += std::to_string(varnameList[i]);
}
return result;
}
std::string GetCuMaskDesc() const std::string GetCuMaskDesc() const
{ {
std::vector<std::pair<int, int>> runs; std::vector<std::pair<int, int>> runs;
...@@ -572,9 +607,10 @@ public: ...@@ -572,9 +607,10 @@ public:
cfg.data.alwaysValidate = alwaysValidate; cfg.data.alwaysValidate = alwaysValidate;
cfg.data.blockBytes = blockBytes; cfg.data.blockBytes = blockBytes;
cfg.data.byteOffset = byteOffset; cfg.data.byteOffset = byteOffset;
cfg.data.fillCompress = fillCompress;
cfg.data.fillPattern = fillPattern;
cfg.data.validateDirect = validateDirect; cfg.data.validateDirect = validateDirect;
cfg.data.validateSource = validateSource; cfg.data.validateSource = validateSource;
cfg.data.fillPattern = fillPattern;
cfg.dma.useHipEvents = useHipEvents; cfg.dma.useHipEvents = useHipEvents;
cfg.dma.useHsaCopy = useHsaDma; cfg.dma.useHsaCopy = useHsaDma;
......
...@@ -50,6 +50,7 @@ void AllToAllSweepPreset(EnvVars& ev, ...@@ -50,6 +50,7 @@ void AllToAllSweepPreset(EnvVars& ev,
int useSpray = EnvVars::GetEnvVar("USE_SPRAY", 0); int useSpray = EnvVars::GetEnvVar("USE_SPRAY", 0);
int verbose = EnvVars::GetEnvVar("VERBOSE", 0); int verbose = EnvVars::GetEnvVar("VERBOSE", 0);
std::vector<int> blockList = EnvVars::GetEnvVarArray("BLOCKSIZES", {256});
std::vector<int> unrollList = EnvVars::GetEnvVarArray("UNROLLS", {1,2,3,4,6,8}); std::vector<int> unrollList = EnvVars::GetEnvVarArray("UNROLLS", {1,2,3,4,6,8});
std::vector<int> numCusList = EnvVars::GetEnvVarArray("NUM_CUS", {4,8,12,16,24,32}); std::vector<int> numCusList = EnvVars::GetEnvVarArray("NUM_CUS", {4,8,12,16,24,32});
...@@ -77,6 +78,7 @@ void AllToAllSweepPreset(EnvVars& ev, ...@@ -77,6 +78,7 @@ void AllToAllSweepPreset(EnvVars& ev,
ev.Print("A2A_MODE" , (a2aMode == A2A_CUSTOM) ? std::to_string(numSrcs) + ":" + std::to_string(numDsts) : std::to_string(a2aMode), ev.Print("A2A_MODE" , (a2aMode == A2A_CUSTOM) ? std::to_string(numSrcs) + ":" + std::to_string(numDsts) : std::to_string(a2aMode),
(a2aMode == A2A_CUSTOM) ? (std::to_string(numSrcs) + " read(s) " + (a2aMode == A2A_CUSTOM) ? (std::to_string(numSrcs) + " read(s) " +
std::to_string(numDsts) + " write(s)").c_str(): a2aModeStr[a2aMode]); std::to_string(numDsts) + " write(s)").c_str(): a2aModeStr[a2aMode]);
ev.Print("BLOCKSIZES" , blockList.size() , EnvVars::ToStr(blockList).c_str());
ev.Print("SHOW_MIN_ONLY" , showMinOnly , showMinOnly ? "Showing only slowest GPU results" : "Showing slowest and fastest GPU results"); ev.Print("SHOW_MIN_ONLY" , showMinOnly , showMinOnly ? "Showing only slowest GPU results" : "Showing slowest and fastest GPU results");
ev.Print("NUM_CUS" , numCusList.size(), EnvVars::ToStr(numCusList).c_str()); ev.Print("NUM_CUS" , numCusList.size(), EnvVars::ToStr(numCusList).c_str());
ev.Print("NUM_GPU_DEVICES", numGpus , "Using %d GPUs", numGpus); ev.Print("NUM_GPU_DEVICES", numGpus , "Using %d GPUs", numGpus);
...@@ -180,48 +182,53 @@ void AllToAllSweepPreset(EnvVars& ev, ...@@ -180,48 +182,53 @@ void AllToAllSweepPreset(EnvVars& ev,
std::map<std::pair<int, int>, TransferBench::TestResults> results; std::map<std::pair<int, int>, TransferBench::TestResults> results;
// Display summary // Display summary
printf("#CUs\\Unroll"); for (int blockSize : blockList) {
for (int u : unrollList) { printf("Blocksize: %d\n", blockSize);
printf(" %d(Min) ", u); ev.gfxBlockSize = cfg.gfx.blockSize = blockSize;
if (!showMinOnly) printf(" %d(Max) ", u);
} printf("#CUs\\Unroll");
printf("\n");
for (int c : numCusList) {
printf(" %5d ", c); fflush(stdout);
for (int u : unrollList) { for (int u : unrollList) {
ev.gfxUnroll = cfg.gfx.unrollFactor = u; printf(" %d(Min) ", u);
for (auto& transfer : transfers) if (!showMinOnly) printf(" %d(Max) ", u);
transfer.numSubExecs = useSpray ? (c * targetCount) : c;
double minBandwidth = std::numeric_limits<double>::max();
double maxBandwidth = std::numeric_limits<double>::min();
TransferBench::TestResults result;
if (TransferBench::RunTransfers(cfg, transfers, result)) {
for (auto const& exeResult : result.exeResults) {
minBandwidth = std::min(minBandwidth, exeResult.second.avgBandwidthGbPerSec);
maxBandwidth = std::max(maxBandwidth, exeResult.second.avgBandwidthGbPerSec);
}
if (useSpray) {
minBandwidth *= targetCount;
maxBandwidth *= targetCount;
}
results[std::make_pair(c,u)] = result;
} else {
minBandwidth = 0.0;
}
printf(" %7.2f ", minBandwidth);
if (!showMinOnly) printf(" %7.2f ", maxBandwidth);
fflush(stdout);
} }
printf("\n"); fflush(stdout); printf("\n");
}
if (verbose) {
int testNum = 0;
for (int c : numCusList) { for (int c : numCusList) {
printf(" %5d ", c); fflush(stdout);
for (int u : unrollList) { for (int u : unrollList) {
printf("CUs: %d Unroll %d\n", c, u); ev.gfxUnroll = cfg.gfx.unrollFactor = u;
PrintResults(ev, ++testNum, transfers, results[std::make_pair(c,u)]); for (auto& transfer : transfers)
transfer.numSubExecs = useSpray ? (c * targetCount) : c;
double minBandwidth = std::numeric_limits<double>::max();
double maxBandwidth = std::numeric_limits<double>::min();
TransferBench::TestResults result;
if (TransferBench::RunTransfers(cfg, transfers, result)) {
for (auto const& exeResult : result.exeResults) {
minBandwidth = std::min(minBandwidth, exeResult.second.avgBandwidthGbPerSec);
maxBandwidth = std::max(maxBandwidth, exeResult.second.avgBandwidthGbPerSec);
}
if (useSpray) {
minBandwidth *= targetCount;
maxBandwidth *= targetCount;
}
results[std::make_pair(c,u)] = result;
} else {
minBandwidth = 0.0;
}
printf(" %7.2f ", minBandwidth);
if (!showMinOnly) printf(" %7.2f ", maxBandwidth);
fflush(stdout);
}
printf("\n"); fflush(stdout);
}
if (verbose) {
int testNum = 0;
for (int c : numCusList) {
for (int u : unrollList) {
printf("CUs: %d Unroll %d\n", c, u);
PrintResults(ev, ++testNum, transfers, results[std::make_pair(c,u)]);
}
} }
} }
} }
......
/* /*
Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved. Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal of this software and associated documentation files (the "Software"), to deal
...@@ -66,7 +66,7 @@ namespace TransferBench ...@@ -66,7 +66,7 @@ namespace TransferBench
using std::set; using std::set;
using std::vector; using std::vector;
constexpr char VERSION[] = "1.63"; constexpr char VERSION[] = "1.64";
/** /**
* Enumeration of supported Executor types * Enumeration of supported Executor types
...@@ -166,6 +166,7 @@ namespace TransferBench ...@@ -166,6 +166,7 @@ namespace TransferBench
int blockBytes = 256; ///< Each subexecutor works on a multiple of this many bytes int blockBytes = 256; ///< Each subexecutor works on a multiple of this many bytes
int byteOffset = 0; ///< Byte-offset for memory allocations int byteOffset = 0; ///< Byte-offset for memory allocations
vector<float> fillPattern = {}; ///< Pattern of floats used to fill source data vector<float> fillPattern = {}; ///< Pattern of floats used to fill source data
vector<int> fillCompress = {}; ///< Customized data patterns (overrides fillPattern if non-empty)
int validateDirect = 0; ///< Validate GPU results directly instead of copying to host int validateDirect = 0; ///< Validate GPU results directly instead of copying to host
int validateSource = 0; ///< Validate src GPU memory immediately after preparation int validateSource = 0; ///< Validate src GPU memory immediately after preparation
}; };
...@@ -599,7 +600,7 @@ namespace { ...@@ -599,7 +600,7 @@ namespace {
// Constants // Constants
//======================================================================================== //========================================================================================
int constexpr MAX_BLOCKSIZE = 512; // Max threadblock size int constexpr MAX_BLOCKSIZE = 1024; // Max threadblock size
int constexpr MAX_WAVEGROUPS = MAX_BLOCKSIZE / 64; // Max wavegroups/warps int constexpr MAX_WAVEGROUPS = MAX_BLOCKSIZE / 64; // Max wavegroups/warps
int constexpr MAX_UNROLL = 8; // Max unroll factor int constexpr MAX_UNROLL = 8; // Max unroll factor
int constexpr MAX_SRCS = 8; // Max srcs per Transfer int constexpr MAX_SRCS = 8; // Max srcs per Transfer
...@@ -681,8 +682,8 @@ namespace { ...@@ -681,8 +682,8 @@ namespace {
int canAccess; int canAccess;
ERR_CHECK(hipDeviceCanAccessPeer(&canAccess, deviceId, peerDeviceId)); ERR_CHECK(hipDeviceCanAccessPeer(&canAccess, deviceId, peerDeviceId));
if (!canAccess) if (!canAccess)
return {ERR_FATAL, return {ERR_FATAL, "Peer access is unavailable between GPU devices %d to %d."
"Unable to enable peer access from GPU devices %d to %d", peerDeviceId, deviceId}; "For AMD hardware, check IOMMU configuration", peerDeviceId, deviceId};
ERR_CHECK(hipSetDevice(deviceId)); ERR_CHECK(hipSetDevice(deviceId));
hipError_t error = hipDeviceEnablePeerAccess(peerDeviceId, 0); hipError_t error = hipDeviceEnablePeerAccess(peerDeviceId, 0);
...@@ -973,6 +974,19 @@ namespace { ...@@ -973,6 +974,19 @@ namespace {
errors.push_back({ERR_FATAL, "[data.blockBytes] must be positive multiple of %lu", sizeof(float)}); errors.push_back({ERR_FATAL, "[data.blockBytes] must be positive multiple of %lu", sizeof(float)});
if (cfg.data.byteOffset < 0 || cfg.data.byteOffset % sizeof(float)) if (cfg.data.byteOffset < 0 || cfg.data.byteOffset % sizeof(float))
errors.push_back({ERR_FATAL, "[data.byteOffset] must be positive multiple of %lu", sizeof(float)}); errors.push_back({ERR_FATAL, "[data.byteOffset] must be positive multiple of %lu", sizeof(float)});
if (cfg.data.fillCompress.size() > 0 && cfg.data.fillPattern.size() > 0)
errors.push_back({ERR_WARN, "[data.fillCompress] will override [data.fillPattern] when both are specified"});
if (cfg.data.fillCompress.size() > 0) {
int sum = 0;
for (int bin : cfg.data.fillCompress)
sum += bin;
if (sum != 100) {
errors.push_back({ERR_FATAL, "[data.fillCompress] values must add up to 100"});
}
}
if (cfg.data.fillCompress.size() > 5) {
errors.push_back({ERR_FATAL, "[data.fillCompress] may only have up to 5 values"});
}
// Check GFX options // Check GFX options
if (cfg.gfx.blockOrder < 0 || cfg.gfx.blockOrder > 2) if (cfg.gfx.blockOrder < 0 || cfg.gfx.blockOrder > 2)
...@@ -2162,8 +2176,104 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -2162,8 +2176,104 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
{ {
size_t N = cpuBuffer.size(); size_t N = cpuBuffer.size();
// Source buffer if (!cfg.data.fillCompress.empty()) {
if (bufferIdx >= 0) { // 0 -> Random
// 1 -> 1B0 - The upper 1 byte of each aligned 2 bytes is 0
// 2 -> 2B0 - The upper 2 bytes of each aligned 4 bytes are 0
// 3 -> 4B0 - The upper 4 bytes of each aligned 8 bytes are 0
// 4 -> 32B0 - The upper 32 bytes of each 64-byte line are 0
// Fill buffer with random floats
std::mt19937 gen;
gen.seed(bufferIdx * 425);
std::uniform_real_distribution<float> dist(-100000.0f, +100000.0f);
for (size_t i = 0; i < N; i++) {
cpuBuffer[i] = dist(gen);
}
// Figure out distribution for lines based on the percentages given
size_t numLines = N / 16;
size_t leftover = numLines;
std::vector<size_t> lineCounts(5, 0);
std::set<std::pair<double, int>> remainder;
// Assign rounded down values first
std::vector<int> percentages = cfg.data.fillCompress;
while (percentages.size() < 5) percentages.push_back(0);
for (int i = 0; i < percentages.size(); i++){
lineCounts[i] = (size_t)(numLines * (percentages[i] / 100.0));
leftover -= lineCounts[i];
remainder.insert(std::make_pair(numLines * (percentages[i] / 100.0) - lineCounts[i], i));
}
// Assign leftovers based on largest remainder
while (leftover != 0) {
auto last = *remainder.rbegin();
lineCounts[last.second]++;
remainder.erase(last);
leftover--;
}
// Randomly decide which lines get assigned to which types
std::vector<int> lineTypes(numLines, 0);
int offset = lineCounts[0];
for (int i = 1; i < 5; i++) {
for (int j = 0; j < lineCounts[i]; j++)
lineTypes[offset++] = i;
}
std::shuffle(lineTypes.begin(), lineTypes.end(), gen);
// Apply zero-ing
int dumpLines = getenv("DUMP_LINES") ? atoi(getenv("DUMP_LINES")) : 0;
if (dumpLines) {
printf("Input pattern 64B line statistics for bufferIdx %d:\n", bufferIdx);
printf("Total lines: %lu\n", numLines);
printf("- 0: Random : %8lu (%8.3f%%)\n", lineCounts[0], 100.0 * lineCounts[0] / (1.0 * numLines));
printf("- 1: 1B0 : %8lu (%8.3f%%)\n", lineCounts[1], 100.0 * lineCounts[1] / (1.0 * numLines));
printf("- 2: 2B0 : %8lu (%8.3f%%)\n", lineCounts[2], 100.0 * lineCounts[2] / (1.0 * numLines));
printf("- 3: 4B0 : %8lu (%8.3f%%)\n", lineCounts[3], 100.0 * lineCounts[3] / (1.0 * numLines));
printf("- 4: 32B0 : %8lu (%8.3f%%)\n", lineCounts[4], 100.0 * lineCounts[4] / (1.0 * numLines));
}
for (int line = 0; line < numLines; line++) {
unsigned char* linePtr = (unsigned char*)&cpuBuffer[line * 16];
switch (lineTypes[line]) {
case 1: // 1B0
for (int i = 0; i < 32; i++)
linePtr[2*i+1] = 0;
break;
case 2: // 2B0
for (int i = 0; i < 16; i++) {
linePtr[4*i+2] = 0;
linePtr[4*i+3] = 0;
}
break;
case 3: // 4B0
for (int i = 0; i < 8; i++) {
linePtr[8*i+4] = 0;
linePtr[8*i+5] = 0;
linePtr[8*i+6] = 0;
linePtr[8*i+7] = 0;
}
break;
case 4: // 32B0
for (int i = 32; i < 64; i++)
linePtr[i] = 0;
break;
}
if (line < dumpLines) {
printf("Line %02d [%d]: ", line, lineTypes[line]);
for (int j = 63; j >= 0; j--){
printf("%02x ", linePtr[j]);
if (j % 16 == 0) printf(" ");
}
printf("\n");
}
}
} else {
// Use fill pattern if specified // Use fill pattern if specified
size_t patternLen = cfg.data.fillPattern.size(); size_t patternLen = cfg.data.fillPattern.size();
if (patternLen > 0) { if (patternLen > 0) {
...@@ -2177,27 +2287,10 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -2177,27 +2287,10 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
if (leftOver) if (leftOver)
memcpy(cpuBufferPtr, cfg.data.fillPattern.data(), leftOver * sizeof(float)); memcpy(cpuBufferPtr, cfg.data.fillPattern.data(), leftOver * sizeof(float));
} else { } else {
// Fall back to pseudo-random
for (size_t i = 0; i < N; ++i) for (size_t i = 0; i < N; ++i)
cpuBuffer[i] = PrepSrcValue(bufferIdx, i); cpuBuffer[i] = PrepSrcValue(bufferIdx, i);
} }
} else { // Destination buffer
int numSrcs = -bufferIdx - 1;
if (numSrcs == 0) {
// Note: 0x75757575 = 13323083.0
memset(cpuBuffer.data(), MEMSET_CHAR, N * sizeof(float));
} else {
PrepareReference(cfg, cpuBuffer, 0);
if (numSrcs > 1) {
std::vector<float> temp(N);
for (int i = 1; i < numSrcs; i++) {
PrepareReference(cfg, temp, i);
for (int j = 0; j < N; j++) {
cpuBuffer[i] += temp[i];
}
}
}
}
} }
} }
...@@ -2472,7 +2565,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -2472,7 +2565,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
} }
std::random_device rd; std::random_device rd;
std::default_random_engine gen(rd()); std::mt19937 gen(rd());
std::shuffle(indices.begin(), indices.end(), gen); std::shuffle(indices.begin(), indices.end(), gen);
// Build randomized threadblock list // Build randomized threadblock list
...@@ -3055,7 +3148,15 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -3055,7 +3148,15 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
GPU_KERNEL_UNROLL_DECL(320), GPU_KERNEL_UNROLL_DECL(320),
GPU_KERNEL_UNROLL_DECL(384), GPU_KERNEL_UNROLL_DECL(384),
GPU_KERNEL_UNROLL_DECL(448), GPU_KERNEL_UNROLL_DECL(448),
GPU_KERNEL_UNROLL_DECL(512) GPU_KERNEL_UNROLL_DECL(512),
GPU_KERNEL_UNROLL_DECL(576),
GPU_KERNEL_UNROLL_DECL(640),
GPU_KERNEL_UNROLL_DECL(704),
GPU_KERNEL_UNROLL_DECL(768),
GPU_KERNEL_UNROLL_DECL(832),
GPU_KERNEL_UNROLL_DECL(896),
GPU_KERNEL_UNROLL_DECL(960),
GPU_KERNEL_UNROLL_DECL(1024),
}; };
#undef GPU_KERNEL_UNROLL_DECL #undef GPU_KERNEL_UNROLL_DECL
#undef GPU_KERNEL_DWORD_DECL #undef GPU_KERNEL_DWORD_DECL
...@@ -3442,6 +3543,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -3442,6 +3543,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
vector<float> outputBuffer(maxN); vector<float> outputBuffer(maxN);
vector<vector<float>> dstReference(maxNumSrcs + 1, vector<float>(maxN)); vector<vector<float>> dstReference(maxNumSrcs + 1, vector<float>(maxN));
{ {
size_t initOffset = cfg.data.byteOffset / sizeof(float);
vector<vector<float>> srcReference(maxNumSrcs, vector<float>(maxN)); vector<vector<float>> srcReference(maxNumSrcs, vector<float>(maxN));
memset(dstReference[0].data(), MEMSET_CHAR, maxNumBytes); memset(dstReference[0].data(), MEMSET_CHAR, maxNumBytes);
...@@ -3458,7 +3560,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid) ...@@ -3458,7 +3560,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
// Initialize all src memory buffers // Initialize all src memory buffers
for (auto resource : transferResources) { for (auto resource : transferResources) {
for (int srcIdx = 0; srcIdx < resource->srcMem.size(); srcIdx++) { for (int srcIdx = 0; srcIdx < resource->srcMem.size(); srcIdx++) {
ERR_APPEND(hipMemcpy(resource->srcMem[srcIdx], srcReference[srcIdx].data(), resource->numBytes, ERR_APPEND(hipMemcpy(resource->srcMem[srcIdx] + initOffset, srcReference[srcIdx].data(), resource->numBytes,
hipMemcpyDefault), errResults); hipMemcpyDefault), errResults);
} }
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment