Unverified Commit 3b2a07d6 authored by alexxu-amd's avatar alexxu-amd Committed by GitHub
Browse files

Sync release/rocm-rel-7.1 into docs/7.1.0

parents 6bcbcf4d a824bc1b
......@@ -3,6 +3,27 @@
Documentation for TransferBench is available at
[https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
## v1.64.00
### Added
- Added BLOCKSIZES to a2asweep preset to allow also sweeping over threadblock sizes
- Added FILL_COMPRESS to allow more control over input data pattern
- FILL_COMPRESS takes in a comma-separated list of integer percentages (that must add up to 100)
that sets the percentages of 64B lines to be filled by random/1B0/2B0/4B0/32B0 data patterns
- Bins:
- 0 - random
- 1 - 1B0 upper 1 byte of each aligned 2 bytes is 0
- 2 - 2B0 upper 2 bytes of each aligned 4 bytes is 0
- 3 - 4B0 upper 4 bytes of each aligned 8 bytes is 0
- 4 - 32B0 upper 32 bytes of each aligned 64-byte line are 0
- FILL_PATTERN will be ignored if FILL_COMPRESS is specified
- Additional details about data patterns generated will be printed if the debug env var DUMP_LINES is
set to a non-zero value, which also corresponds to how many 64 byte lines will be printed
### Modified
- Increased GFX_BLOCKSIZE limit from 512 to 1024 (still requires multiple of 64)
### Fixed
- Fixed bug when using BYTE_OFFSET
## v1.63.00
### Added
- Added `gfx950`, `gfx1150`, and `gfx1151` to default GPU targets list in CMake builds
......
......@@ -9,7 +9,7 @@ if (NOT CMAKE_TOOLCHAIN_FILE)
message(STATUS "CMAKE_TOOLCHAIN_FILE: ${CMAKE_TOOLCHAIN_FILE}")
endif()
set(VERSION_STRING "1.63.00")
set(VERSION_STRING "1.64.00")
project(TransferBench VERSION ${VERSION_STRING} LANGUAGES CXX)
## Load CMake modules
......
......@@ -83,4 +83,3 @@ TransferBenchCuda: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp")
clean:
rm -f ./TransferBench ./TransferBenchCuda
rocm-docs-core==1.22.0
rocm-docs-core==1.23.0
......@@ -187,7 +187,7 @@ requests==2.32.2
# via
# pygithub
# sphinx
rocm-docs-core==1.22.0
rocm-docs-core==1.23.0
# via -r requirements.in
rpds-py==0.22.3
# via
......
......@@ -77,6 +77,7 @@ public:
int blockBytes; // Each subexecutor, except the last, gets a multiple of this many bytes to copy
int byteOffset; // Byte-offset for memory allocations
vector<float> fillPattern; // Pattern of floats used to fill source data
vector<int> fillCompress; // Percentages of 64B lines to be filled by random/1B0/2B0/4B0/32B0
int validateDirect; // Validate GPU destination memory directly instead of staging GPU memory on host
int validateSource; // Validate source GPU memory immediately after preparation
......@@ -137,6 +138,7 @@ public:
alwaysValidate = GetEnvVar("ALWAYS_VALIDATE" , 0);
blockBytes = GetEnvVar("BLOCK_BYTES" , 256);
byteOffset = GetEnvVar("BYTE_OFFSET" , 0);
fillCompress = GetEnvVarArray("FILL_COMPRESS" , {});
gfxBlockOrder = GetEnvVar("GFX_BLOCK_ORDER" , 0);
gfxBlockSize = GetEnvVar("GFX_BLOCK_SIZE" , 256);
gfxSingleTeam = GetEnvVar("GFX_SINGLE_TEAM" , 1);
......@@ -314,6 +316,7 @@ public:
printf(" CLOSEST_NIC - Comma-separated list of per-GPU closest NIC (default=auto)\n");
#endif
printf(" CU_MASK - CU mask for streams. Can specify ranges e.g '5,10-12,14'\n");
printf(" FILL_COMPRESS - Percentages of 64B lines to be filled by random/1B0/2B0/4B0/32B0\n");
printf(" FILL_PATTERN - Big-endian pattern for source data, specified in hex digits. Must be even # of digits\n");
printf(" GFX_BLOCK_ORDER - How blocks for transfers are ordered. 0=sequential, 1=interleaved\n");
printf(" GFX_BLOCK_SIZE - # of threads per threadblock (Must be multiple of 64)\n");
......@@ -400,6 +403,8 @@ public:
#endif
Print("CU_MASK", getenv("CU_MASK") ? 1 : 0,
"%s", (cuMask.size() ? GetCuMaskDesc().c_str() : "All"));
Print("FILL_COMPRESS", getenv("FILL_COMPRESS") ? 1 : 0,
"%s", (fillCompress.size() ? GetStr(fillCompress).c_str() : "Not specified"));
Print("FILL_PATTERN", getenv("FILL_PATTERN") ? 1 : 0,
"%s", (fillPattern.size() ? getenv("FILL_PATTERN") : TransferBench::GetStrAttribute(ATR_SRC_PREP_DESCRIPTION).c_str()));
Print("GFX_BLOCK_ORDER", gfxBlockOrder,
......@@ -493,6 +498,27 @@ public:
}
static std::vector<int> GetEnvVarArray(std::string const& varname, std::vector<int> const& defaultValue)
{
if (getenv(varname.c_str())) {
std::vector<int> values;
char* arrayStr = getenv(varname.c_str());
char* token = strtok(arrayStr, ",");
while (token) {
int val;
if (sscanf(token, "%d", &val) == 1) {
values.push_back(val);
} else {
printf("[ERROR] Unrecognized token [%s]\n", token);
exit(1);
}
token = strtok(NULL, ",");
}
return values;
}
return defaultValue;
}
static std::vector<int> GetEnvVarRangeArray(std::string const& varname, std::vector<int> const& defaultValue)
{
if (getenv(varname.c_str())) {
char* rangeStr = getenv(varname.c_str());
......@@ -524,6 +550,15 @@ public:
return defaultValue;
}
std::string GetStr(std::vector<int> const& varnameList) const {
std::string result = "";
for (int i = 0; i < varnameList.size(); i++) {
if (i) result += ",";
result += std::to_string(varnameList[i]);
}
return result;
}
std::string GetCuMaskDesc() const
{
std::vector<std::pair<int, int>> runs;
......@@ -572,9 +607,10 @@ public:
cfg.data.alwaysValidate = alwaysValidate;
cfg.data.blockBytes = blockBytes;
cfg.data.byteOffset = byteOffset;
cfg.data.fillCompress = fillCompress;
cfg.data.fillPattern = fillPattern;
cfg.data.validateDirect = validateDirect;
cfg.data.validateSource = validateSource;
cfg.data.fillPattern = fillPattern;
cfg.dma.useHipEvents = useHipEvents;
cfg.dma.useHsaCopy = useHsaDma;
......
......@@ -50,6 +50,7 @@ void AllToAllSweepPreset(EnvVars& ev,
int useSpray = EnvVars::GetEnvVar("USE_SPRAY", 0);
int verbose = EnvVars::GetEnvVar("VERBOSE", 0);
std::vector<int> blockList = EnvVars::GetEnvVarArray("BLOCKSIZES", {256});
std::vector<int> unrollList = EnvVars::GetEnvVarArray("UNROLLS", {1,2,3,4,6,8});
std::vector<int> numCusList = EnvVars::GetEnvVarArray("NUM_CUS", {4,8,12,16,24,32});
......@@ -77,6 +78,7 @@ void AllToAllSweepPreset(EnvVars& ev,
ev.Print("A2A_MODE" , (a2aMode == A2A_CUSTOM) ? std::to_string(numSrcs) + ":" + std::to_string(numDsts) : std::to_string(a2aMode),
(a2aMode == A2A_CUSTOM) ? (std::to_string(numSrcs) + " read(s) " +
std::to_string(numDsts) + " write(s)").c_str(): a2aModeStr[a2aMode]);
ev.Print("BLOCKSIZES" , blockList.size() , EnvVars::ToStr(blockList).c_str());
ev.Print("SHOW_MIN_ONLY" , showMinOnly , showMinOnly ? "Showing only slowest GPU results" : "Showing slowest and fastest GPU results");
ev.Print("NUM_CUS" , numCusList.size(), EnvVars::ToStr(numCusList).c_str());
ev.Print("NUM_GPU_DEVICES", numGpus , "Using %d GPUs", numGpus);
......@@ -180,6 +182,10 @@ void AllToAllSweepPreset(EnvVars& ev,
std::map<std::pair<int, int>, TransferBench::TestResults> results;
// Display summary
for (int blockSize : blockList) {
printf("Blocksize: %d\n", blockSize);
ev.gfxBlockSize = cfg.gfx.blockSize = blockSize;
printf("#CUs\\Unroll");
for (int u : unrollList) {
printf(" %d(Min) ", u);
......@@ -225,4 +231,5 @@ void AllToAllSweepPreset(EnvVars& ev,
}
}
}
}
}
/*
Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
......@@ -66,7 +66,7 @@ namespace TransferBench
using std::set;
using std::vector;
constexpr char VERSION[] = "1.63";
constexpr char VERSION[] = "1.64";
/**
* Enumeration of supported Executor types
......@@ -166,6 +166,7 @@ namespace TransferBench
int blockBytes = 256; ///< Each subexecutor works on a multiple of this many bytes
int byteOffset = 0; ///< Byte-offset for memory allocations
vector<float> fillPattern = {}; ///< Pattern of floats used to fill source data
vector<int> fillCompress = {}; ///< Customized data patterns (overrides fillPattern if non-empty)
int validateDirect = 0; ///< Validate GPU results directly instead of copying to host
int validateSource = 0; ///< Validate src GPU memory immediately after preparation
};
......@@ -599,7 +600,7 @@ namespace {
// Constants
//========================================================================================
int constexpr MAX_BLOCKSIZE = 512; // Max threadblock size
int constexpr MAX_BLOCKSIZE = 1024; // Max threadblock size
int constexpr MAX_WAVEGROUPS = MAX_BLOCKSIZE / 64; // Max wavegroups/warps
int constexpr MAX_UNROLL = 8; // Max unroll factor
int constexpr MAX_SRCS = 8; // Max srcs per Transfer
......@@ -681,8 +682,8 @@ namespace {
int canAccess;
ERR_CHECK(hipDeviceCanAccessPeer(&canAccess, deviceId, peerDeviceId));
if (!canAccess)
return {ERR_FATAL,
"Unable to enable peer access from GPU devices %d to %d", peerDeviceId, deviceId};
return {ERR_FATAL, "Peer access is unavailable between GPU devices %d to %d."
"For AMD hardware, check IOMMU configuration", peerDeviceId, deviceId};
ERR_CHECK(hipSetDevice(deviceId));
hipError_t error = hipDeviceEnablePeerAccess(peerDeviceId, 0);
......@@ -973,6 +974,19 @@ namespace {
errors.push_back({ERR_FATAL, "[data.blockBytes] must be positive multiple of %lu", sizeof(float)});
if (cfg.data.byteOffset < 0 || cfg.data.byteOffset % sizeof(float))
errors.push_back({ERR_FATAL, "[data.byteOffset] must be positive multiple of %lu", sizeof(float)});
if (cfg.data.fillCompress.size() > 0 && cfg.data.fillPattern.size() > 0)
errors.push_back({ERR_WARN, "[data.fillCompress] will override [data.fillPattern] when both are specified"});
if (cfg.data.fillCompress.size() > 0) {
int sum = 0;
for (int bin : cfg.data.fillCompress)
sum += bin;
if (sum != 100) {
errors.push_back({ERR_FATAL, "[data.fillCompress] values must add up to 100"});
}
}
if (cfg.data.fillCompress.size() > 5) {
errors.push_back({ERR_FATAL, "[data.fillCompress] may only have up to 5 values"});
}
// Check GFX options
if (cfg.gfx.blockOrder < 0 || cfg.gfx.blockOrder > 2)
......@@ -2162,8 +2176,104 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
{
size_t N = cpuBuffer.size();
// Source buffer
if (bufferIdx >= 0) {
if (!cfg.data.fillCompress.empty()) {
// 0 -> Random
// 1 -> 1B0 - The upper 1 byte of each aligned 2 bytes is 0
// 2 -> 2B0 - The upper 2 bytes of each aligned 4 bytes are 0
// 3 -> 4B0 - The upper 4 bytes of each aligned 8 bytes are 0
// 4 -> 32B0 - The upper 32 bytes of each 64-byte line are 0
// Fill buffer with random floats
std::mt19937 gen;
gen.seed(bufferIdx * 425);
std::uniform_real_distribution<float> dist(-100000.0f, +100000.0f);
for (size_t i = 0; i < N; i++) {
cpuBuffer[i] = dist(gen);
}
// Figure out distribution for lines based on the percentages given
size_t numLines = N / 16;
size_t leftover = numLines;
std::vector<size_t> lineCounts(5, 0);
std::set<std::pair<double, int>> remainder;
// Assign rounded down values first
std::vector<int> percentages = cfg.data.fillCompress;
while (percentages.size() < 5) percentages.push_back(0);
for (int i = 0; i < percentages.size(); i++){
lineCounts[i] = (size_t)(numLines * (percentages[i] / 100.0));
leftover -= lineCounts[i];
remainder.insert(std::make_pair(numLines * (percentages[i] / 100.0) - lineCounts[i], i));
}
// Assign leftovers based on largest remainder
while (leftover != 0) {
auto last = *remainder.rbegin();
lineCounts[last.second]++;
remainder.erase(last);
leftover--;
}
// Randomly decide which lines get assigned to which types
std::vector<int> lineTypes(numLines, 0);
int offset = lineCounts[0];
for (int i = 1; i < 5; i++) {
for (int j = 0; j < lineCounts[i]; j++)
lineTypes[offset++] = i;
}
std::shuffle(lineTypes.begin(), lineTypes.end(), gen);
// Apply zero-ing
int dumpLines = getenv("DUMP_LINES") ? atoi(getenv("DUMP_LINES")) : 0;
if (dumpLines) {
printf("Input pattern 64B line statistics for bufferIdx %d:\n", bufferIdx);
printf("Total lines: %lu\n", numLines);
printf("- 0: Random : %8lu (%8.3f%%)\n", lineCounts[0], 100.0 * lineCounts[0] / (1.0 * numLines));
printf("- 1: 1B0 : %8lu (%8.3f%%)\n", lineCounts[1], 100.0 * lineCounts[1] / (1.0 * numLines));
printf("- 2: 2B0 : %8lu (%8.3f%%)\n", lineCounts[2], 100.0 * lineCounts[2] / (1.0 * numLines));
printf("- 3: 4B0 : %8lu (%8.3f%%)\n", lineCounts[3], 100.0 * lineCounts[3] / (1.0 * numLines));
printf("- 4: 32B0 : %8lu (%8.3f%%)\n", lineCounts[4], 100.0 * lineCounts[4] / (1.0 * numLines));
}
for (int line = 0; line < numLines; line++) {
unsigned char* linePtr = (unsigned char*)&cpuBuffer[line * 16];
switch (lineTypes[line]) {
case 1: // 1B0
for (int i = 0; i < 32; i++)
linePtr[2*i+1] = 0;
break;
case 2: // 2B0
for (int i = 0; i < 16; i++) {
linePtr[4*i+2] = 0;
linePtr[4*i+3] = 0;
}
break;
case 3: // 4B0
for (int i = 0; i < 8; i++) {
linePtr[8*i+4] = 0;
linePtr[8*i+5] = 0;
linePtr[8*i+6] = 0;
linePtr[8*i+7] = 0;
}
break;
case 4: // 32B0
for (int i = 32; i < 64; i++)
linePtr[i] = 0;
break;
}
if (line < dumpLines) {
printf("Line %02d [%d]: ", line, lineTypes[line]);
for (int j = 63; j >= 0; j--){
printf("%02x ", linePtr[j]);
if (j % 16 == 0) printf(" ");
}
printf("\n");
}
}
} else {
// Use fill pattern if specified
size_t patternLen = cfg.data.fillPattern.size();
if (patternLen > 0) {
......@@ -2177,27 +2287,10 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
if (leftOver)
memcpy(cpuBufferPtr, cfg.data.fillPattern.data(), leftOver * sizeof(float));
} else {
// Fall back to pseudo-random
for (size_t i = 0; i < N; ++i)
cpuBuffer[i] = PrepSrcValue(bufferIdx, i);
}
} else { // Destination buffer
int numSrcs = -bufferIdx - 1;
if (numSrcs == 0) {
// Note: 0x75757575 = 13323083.0
memset(cpuBuffer.data(), MEMSET_CHAR, N * sizeof(float));
} else {
PrepareReference(cfg, cpuBuffer, 0);
if (numSrcs > 1) {
std::vector<float> temp(N);
for (int i = 1; i < numSrcs; i++) {
PrepareReference(cfg, temp, i);
for (int j = 0; j < N; j++) {
cpuBuffer[i] += temp[i];
}
}
}
}
}
}
......@@ -2472,7 +2565,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
}
std::random_device rd;
std::default_random_engine gen(rd());
std::mt19937 gen(rd());
std::shuffle(indices.begin(), indices.end(), gen);
// Build randomized threadblock list
......@@ -3055,7 +3148,15 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
GPU_KERNEL_UNROLL_DECL(320),
GPU_KERNEL_UNROLL_DECL(384),
GPU_KERNEL_UNROLL_DECL(448),
GPU_KERNEL_UNROLL_DECL(512)
GPU_KERNEL_UNROLL_DECL(512),
GPU_KERNEL_UNROLL_DECL(576),
GPU_KERNEL_UNROLL_DECL(640),
GPU_KERNEL_UNROLL_DECL(704),
GPU_KERNEL_UNROLL_DECL(768),
GPU_KERNEL_UNROLL_DECL(832),
GPU_KERNEL_UNROLL_DECL(896),
GPU_KERNEL_UNROLL_DECL(960),
GPU_KERNEL_UNROLL_DECL(1024),
};
#undef GPU_KERNEL_UNROLL_DECL
#undef GPU_KERNEL_DWORD_DECL
......@@ -3442,6 +3543,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
vector<float> outputBuffer(maxN);
vector<vector<float>> dstReference(maxNumSrcs + 1, vector<float>(maxN));
{
size_t initOffset = cfg.data.byteOffset / sizeof(float);
vector<vector<float>> srcReference(maxNumSrcs, vector<float>(maxN));
memset(dstReference[0].data(), MEMSET_CHAR, maxNumBytes);
......@@ -3458,7 +3560,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
// Initialize all src memory buffers
for (auto resource : transferResources) {
for (int srcIdx = 0; srcIdx < resource->srcMem.size(); srcIdx++) {
ERR_APPEND(hipMemcpy(resource->srcMem[srcIdx], srcReference[srcIdx].data(), resource->numBytes,
ERR_APPEND(hipMemcpy(resource->srcMem[srcIdx] + initOffset, srcReference[srcIdx].data(), resource->numBytes,
hipMemcpyDefault), errResults);
}
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment