Unverified Commit b30aefb6 authored by gilbertlee-amd, committed by GitHub

Fixing topology detection memory access and CU masking for multi XCD GPUs (#116)

* Fixing potential out-of-bounds write during topology detection
* Fixing CU_MASK for multi-XCD GPUs
* Adding sub-iterations via NUM_SUBITERATIONS
* Adding support for variable subexecutor Transfers
* Adding healthcheck preset
parent ae843a6f
@@ -3,6 +3,29 @@
Documentation for TransferBench is available at
[https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
## v1.51
### Modified
- CSV output has been modified slightly to match normal terminal output
- Output for non-single-stream mode has been changed to match single-stream mode (results are reported per Executor)
### Added
- Support for sub-iterations via NUM_SUBITERATIONS, which allows additional looping within each timed iteration.
If set to 0, the loop runs indefinitely (which may be useful for some debugging purposes)
- Support for a variable number of subexecutors (currently for the GPU-GFX executor only). Setting subExecutors to
0 runs over a range of CU counts and reports only the best result found. The search space can be narrowed
by setting the MIN_VAR_SUBEXEC and MAX_VAR_SUBEXEC environment variables. The number of CUs used is
identical for all variable-subExecutor Transfers (see the sketch following this changelog excerpt)
- Experimental new "healthcheck" preset config which currently only supports MI300 series. This preset runs
through CPU to GPU bandwidth tests and all-to-all XGMI bandwidth tests and compares against expected values
Pass criteria limits can be modified (due to platform differences) via the environment variables
LIMIT_UDIR (undirectional), LIMIT_BDIR (bidirectional), and LIMIT_A2A (Per GPU-GPU link bandwidth)
### Fixed
- Fixed out-of-bounds memory access during topology detection that can happen if the number of
CPUs is less than the number of NUMA domains
- Fixed CU masking functionality on multi-XCD architectures (e.g. MI300)
## v1.50
### Added
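The variable-subexecutor search itself is not part of this excerpt. As a rough illustration of the behavior described above, with `MeasureBandwidthGbs` as a hypothetical stand-in for one timed TransferBench run, the selection could look like:

```cpp
#include <utility>

// Hypothetical: performs one timed run with the given subexecutor (CU)
// count and returns the measured bandwidth in GB/s.
double MeasureBandwidthGbs(int numSubExec);

// Scan candidate CU counts and keep the best performer; this is the range
// that MIN_VAR_SUBEXEC / MAX_VAR_SUBEXEC narrow.
std::pair<int, double> FindBestSubExecCount(int minVarSubExec, int maxVarSubExec)
{
  int    bestNum = minVarSubExec;
  double bestBw  = 0.0;
  for (int n = minVarSubExec; n <= maxVarSubExec; ++n)
  {
    double bw = MeasureBandwidthGbs(n);
    if (bw > bestBw) { bestBw = bw; bestNum = n; }
  }
  return {bestNum, bestBw};
}
```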
@@ -67,8 +67,9 @@ make
* Running TransferBench with no arguments displays usage instructions and detected topology
information
* You can use several preset configurations instead of a configuration file:
* `a2a` : All-to-all benchmark test
* `cmdline`: Take in Transfers to run from command-line instead of via file
* `a2a` : All-to-all benchmark test
* `cmdline` : Take in Transfers to run from command-line instead of via file
* `healthcheck` : Simple health check (supported on MI300 series only)
* `p2p` : Peer-to-peer benchmark test
* `pcopy` : Benchmark parallel copies from a single GPU to other GPUs
* `rsweep` : Random sweep across possible sets of transfers
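As the changelog notes, the healthcheck preset's pass criteria come from the LIMIT_UDIR, LIMIT_BDIR, and LIMIT_A2A environment variables. A minimal sketch of how such a threshold check could be expressed; the helper names, defaults, and output format are illustrative, not TransferBench's actual code:

```cpp
#include <cstdio>
#include <cstdlib>

// Read a bandwidth limit in GB/s from the environment (e.g. LIMIT_UDIR,
// LIMIT_BDIR, LIMIT_A2A), falling back to a caller-supplied default.
static double GetLimitGbs(char const* name, double defaultGbs)
{
  char const* s = getenv(name);
  return s ? atof(s) : defaultGbs;
}

// Compare a measured bandwidth against its limit and report pass/fail.
static bool CheckBandwidth(char const* label, double measuredGbs, double limitGbs)
{
  bool const pass = (measuredGbs >= limitGbs);
  printf("%-10s %8.1f GB/s (limit %8.1f GB/s) : %s\n",
         label, measuredGbs, limitGbs, pass ? "PASS" : "FAIL");
  return pass;
}
```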
@@ -29,7 +29,7 @@ THE SOFTWARE.
#include "Compatibility.hpp"
#include "Kernels.hpp"
#define TB_VERSION "1.50"
#define TB_VERSION "1.51"
extern char const MemTypeStr[];
extern char const ExeTypeStr[];
@@ -84,9 +84,12 @@ public:
int gfxUnroll; // GFX-kernel unroll factor
int gfxWaveOrder; // GFX-kernel wavefront ordering
int hideEnv; // Skip printing environment variable
int minNumVarSubExec; // Minimum # of subexecutors to use for variable subExec Transfers
int maxNumVarSubExec; // Maximum # of subexecutors to use for variable subExec Transfers (0 to use device limit)
int numCpuDevices; // Number of CPU devices to use (defaults to # NUMA nodes detected)
int numGpuDevices; // Number of GPU devices to use (defaults to # HIP devices detected)
int numIterations; // Number of timed iterations to perform. If negative, run for -numIterations seconds instead
int numSubIterations; // Number of subiterations to perform
int numWarmups; // Number of un-timed warmup iterations to perform
int outputToCsv; // Output in CSV format
int samplingFactor; // Affects how many different values of N are generated (when N set to 0)
@@ -188,9 +191,12 @@ public:
gfxUnroll = GetEnvVar("GFX_UNROLL" , defaultGfxUnroll);
gfxWaveOrder = GetEnvVar("GFX_WAVE_ORDER" , 0);
hideEnv = GetEnvVar("HIDE_ENV" , 0);
minNumVarSubExec = GetEnvVar("MIN_VAR_SUBEXEC" , 1);
maxNumVarSubExec = GetEnvVar("MAX_VAR_SUBEXEC" , 0);
numCpuDevices = GetEnvVar("NUM_CPU_DEVICES" , numDetectedCpus);
numGpuDevices = GetEnvVar("NUM_GPU_DEVICES" , numDetectedGpus);
numIterations = GetEnvVar("NUM_ITERATIONS" , DEFAULT_NUM_ITERATIONS);
numSubIterations = GetEnvVar("NUM_SUBITERATIONS" , 1);
numWarmups = GetEnvVar("NUM_WARMUPS" , DEFAULT_NUM_WARMUPS);
outputToCsv = GetEnvVar("OUTPUT_TO_CSV" , 0);
samplingFactor = GetEnvVar("SAMPLING_FACTOR" , DEFAULT_SAMPLING_FACTOR);
@@ -299,6 +305,24 @@ public:
}
else fillPattern.clear();
// Figure out number of xccs per device
int maxNumXccs = 64;
xccIdsPerDevice.resize(numGpuDevices);
for (int i = 0; i < numGpuDevices; i++)
{
int* data;
HIP_CALL(hipSetDevice(i));
HIP_CALL(hipHostMalloc((void**)&data, maxNumXccs * sizeof(int)));
CollectXccIdsKernel<<<maxNumXccs, 1>>>(data);
HIP_CALL(hipDeviceSynchronize());
xccIdsPerDevice[i].clear();
for (int j = 0; j < maxNumXccs; j++)
xccIdsPerDevice[i].insert(data[j]);
HIP_CALL(hipHostFree(data));
}
// Check for CU mask
cuMask.clear();
char* cuMaskStr = getenv("CU_MASK");
@@ -308,6 +332,7 @@ public:
printf("[WARN] CU_MASK is not supported in CUDA\n");
#else
std::vector<std::pair<int, int>> ranges;
int numXccs = (xccIdsPerDevice.size() > 0 ? xccIdsPerDevice[0].size() : 1);
int maxCU = 0;
char* token = strtok(cuMaskStr, ",");
while (token)
@@ -330,36 +355,22 @@ public:
}
token = strtok(NULL, ",");
}
cuMask.resize(maxCU / 32 + 1, 0);
cuMask.resize(2 * numXccs, 0);
for (auto range : ranges)
{
for (int i = range.first; i <= range.second; i++)
{
cuMask[i / 32] |= (1 << (i % 32));
for (int x = 0; x < numXccs; x++)
{
int targetBit = i * numXccs + x;
cuMask[targetBit/32] |= (1<<(targetBit%32));
}
}
}
#endif
}
// Figure out number of xccs per device
int maxNumXccs = 64;
xccIdsPerDevice.resize(numGpuDevices);
for (int i = 0; i < numGpuDevices; i++)
{
int* data;
HIP_CALL(hipSetDevice(i));
HIP_CALL(hipHostMalloc((void**)&data, maxNumXccs * sizeof(int)));
CollectXccIdsKernel<<<maxNumXccs, 1>>>(data);
HIP_CALL(hipDeviceSynchronize());
xccIdsPerDevice[i].clear();
for (int j = 0; j < maxNumXccs; j++)
xccIdsPerDevice[i].insert(data[j]);
HIP_CALL(hipHostFree(data));
}
// Parse preferred XCC table (if provided)
prefXccTable.resize(numGpuDevices);
for (int i = 0; i < numGpuDevices; i++)
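The CU_MASK change above is the core of the multi-XCD fix: logical CU `i` owns mask bits `i * numXccs + x` for each XCD `x`, so every requested CU must be replicated across all XCDs. A self-contained illustration of the expansion (the XCD count here is made up; the real code derives it from the collected XCC ids):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
  int const numXccs = 4;  // made-up multi-XCD count; detected at runtime in TransferBench
  std::vector<uint32_t> cuMask(2 * numXccs, 0);

  // Request logical CUs 0-3: each expands to numXccs consecutive bits.
  for (int i = 0; i <= 3; i++)
    for (int x = 0; x < numXccs; x++)
    {
      int const targetBit = i * numXccs + x;
      cuMask[targetBit / 32] |= (1u << (targetBit % 32));
    }

  printf("cuMask[0] = 0x%08X\n", cuMask[0]);  // bits 0-15 set: 0x0000FFFF
  return 0;
}
```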
@@ -429,6 +440,11 @@ public:
printf("[ERROR] BLOCK_ORDER must be 0 (Sequential), 1 (Interleaved), or 2 (Random)\n");
exit(1);
}
if (minNumVarSubExec < 1)
{
printf("[ERROR] Minimum number of subexecutors for variable subexector transfers must be at least 1\n");
exit(1);
}
if (numWarmups < 0)
{
printf("[ERROR] NUM_WARMUPS must be set to a non-negative number\n");
@@ -524,8 +540,10 @@ public:
// Determine how many CPUs exist per NUMA node (to avoid executing on NUMA nodes without CPUs)
numCpusPerNuma.resize(numDetectedCpus);
int const totalCpus = numa_num_configured_cpus();
for (int i = 0; i < totalCpus; i++)
numCpusPerNuma[numa_node_of_cpu(i)]++;
for (int i = 0; i < totalCpus; i++) {
int node = numa_node_of_cpu(i);
if (node >= 0) numCpusPerNuma[node]++;
}
// Build array of wall clock rates per GPU device
wallClockPerDeviceMhz.resize(numDetectedGpus);
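The `node >= 0` guard added above matters because `numa_node_of_cpu()` returns -1 for a CPU without a configured NUMA node; without the check, `numCpusPerNuma[-1]++` writes outside the vector's storage, which is the out-of-bounds access named in the changelog. The fixed pattern in isolation (assuming libnuma, linked with -lnuma):

```cpp
#include <numa.h>
#include <vector>

std::vector<int> CountCpusPerNode(int numNodes)
{
  std::vector<int> numCpusPerNuma(numNodes, 0);
  int const totalCpus = numa_num_configured_cpus();
  for (int i = 0; i < totalCpus; i++)
  {
    int const node = numa_node_of_cpu(i);  // -1 on error or unmapped CPU
    if (node >= 0 && node < numNodes)      // guard both ends of the range
      numCpusPerNuma[node]++;
  }
  return numCpusPerNuma;
}
```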
@@ -583,9 +601,12 @@ public:
printf(" GFX_SINGLE_TEAM - Have subexecutors work together on full array instead of working on individual disjoint subarrays\n");
printf(" GFX_WAVE_ORDER - Stride pattern for GFX kernel (0=UWC,1=UCW,2=WUC,3=WCU,4=CUW,5=CWU)\n");
printf(" HIDE_ENV - Hide environment variable value listing\n");
printf(" MIN_VAR_SUBEXEC - Minumum # of subexecutors to use for variable subExec Transfers\n");
printf(" MAX_VAR_SUBEXEC - Maximum # of subexecutors to use for variable subExec Transfers (0 for device limits)\n");
printf(" NUM_CPU_DEVICES=X - Restrict number of CPUs to X. May not be greater than # detected NUMA nodes\n");
printf(" NUM_GPU_DEVICES=X - Restrict number of GPUs to X. May not be greater than # detected HIP devices\n");
printf(" NUM_ITERATIONS=I - Perform I timed iteration(s) per test\n");
printf(" NUM_SUBITERATIONS=S - Perform S sub-iteration(s) per iteration. Must be non-negative\n");
printf(" NUM_WARMUPS=W - Perform W untimed warmup iteration(s) per test\n");
printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n");
printf(" SAMPLING_FACTOR=F - Add F samples (when possible) between powers of 2 when auto-generating data sizes\n");
@@ -649,6 +670,12 @@ public:
gfxWaveOrder == 3 ? "Wavefront,CU,Unroll" :
gfxWaveOrder == 4 ? "CU,Unroll,Wavefront" :
"CU,Wavefront,Unroll")));
PRINT_EV("MIN_VAR_SUBEXEC", minNumVarSubExec,
std::string("Using at least ") + std::to_string(minNumVarSubExec) + " subexecutor(s) for variable subExec tranfers");
PRINT_EV("MAX_VAR_SUBEXEC", maxNumVarSubExec,
maxNumVarSubExec ?
std::string("Using at most ") + std::to_string(maxNumVarSubExec) + " subexecutor(s) for variable subExec tranfers" :
"Using up to maximum device subexecutors for variable subExec tranfers");
PRINT_EV("NUM_CPU_DEVICES", numCpuDevices,
std::string("Using ") + std::to_string(numCpuDevices) + " CPU devices");
PRINT_EV("NUM_GPU_DEVICES", numGpuDevices,
@@ -656,6 +683,8 @@ public:
PRINT_EV("NUM_ITERATIONS", numIterations,
std::string("Running ") + std::to_string(numIterations > 0 ? numIterations : -numIterations) + " "
+ (numIterations > 0 ? "timed iteration(s)" : "second(s) per Test"));
PRINT_EV("NUM_SUBITERATIONS", numSubIterations,
std::string("Running ") + (numSubIterations == 0 ? "infinite" : std::to_string(numSubIterations)) + " subiterations");
PRINT_EV("NUM_WARMUPS", numWarmups,
std::string("Running " + std::to_string(numWarmups) + " warmup iteration(s) per Test"));
PRINT_EV("SHARED_MEM_BYTES", sharedMemBytes,
@@ -828,36 +857,27 @@ public:
std::string GetCuMaskDesc() const
{
std::vector<std::pair<int, int>> runs;
int numXccs = (xccIdsPerDevice.size() > 0 ? xccIdsPerDevice[0].size() : 1);
bool inRun = false;
std::pair<int, int> curr;
int used = 0;
for (int i = 0; i < cuMask.size(); i++)
{
for (int j = 0; j < 32; j++)
{
if (cuMask[i] & (1 << j))
{
used++;
if (!inRun)
{
inRun = true;
curr.first = i * 32 + j;
}
for (int targetBit = 0; targetBit < cuMask.size() * 32; targetBit += numXccs) {
if (cuMask[targetBit/32] & (1 << (targetBit%32))) {
used++;
if (!inRun) {
inRun = true;
curr.first = targetBit / numXccs;
}
else
{
if (inRun)
{
inRun = false;
curr.second = i * 32 + j - 1;
runs.push_back(curr);
}
} else {
if (inRun) {
inRun = false;
curr.second = targetBit / numXccs - 1;
runs.push_back(curr);
}
}
}
if (inRun)
curr.second = cuMask.size() * 32 - 1;
curr.second = (cuMask.size() * 32) / numXccs - 1;
std::string result = "CUs used: (" + std::to_string(used) + ") ";
for (int i = 0; i < runs.size(); i++)
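`GetCuMaskDesc` above inverts the expansion: it samples every `numXccs`-th bit and maps bit positions back to logical CU indices via `targetBit / numXccs`. The same mapping in isolation (using the made-up XCD count from the earlier sketch):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
  int const numXccs = 4;
  std::vector<uint32_t> cuMask = {0x0000FFFF, 0, 0, 0, 0, 0, 0, 0};  // logical CUs 0-3

  // One bit per logical CU (each CU owns numXccs consecutive bits).
  for (size_t targetBit = 0; targetBit < cuMask.size() * 32; targetBit += numXccs)
    if (cuMask[targetBit / 32] & (1u << (targetBit % 32)))
      printf("logical CU %zu in use\n", targetBit / numXccs);
  return 0;
}
```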
@@ -174,7 +174,7 @@ template <> __device__ __forceinline__ float4 MemsetVal(){ return make
template <int BLOCKSIZE, int UNROLL>
__global__ void __launch_bounds__(BLOCKSIZE)
GpuReduceKernel(SubExecParam* params, int waveOrder)
GpuReduceKernel(SubExecParam* params, int waveOrder, int numSubIterations)
{
int64_t startCycle;
if (threadIdx.x == 0) startCycle = GetTimestamp();
@@ -216,84 +216,88 @@ __global__ void __launch_bounds__(BLOCKSIZE)
case 5: /* C,W,U */ teamStride = 1; waveStride = nTeams; unrlStride = nTeams * nWaves; teamStride2 = 1; waveStride2 = nTeams; break;
}
// First loop: Each wavefront in the team works on UNROLL float4s per thread
size_t const loop1Stride = nTeams * nWaves * UNROLL * warpSize;
size_t const loop1Limit = numFloat4 / loop1Stride * loop1Stride;
{
float4 val[UNROLL];
if (numSrcs == 0)
int subIterations = 0;
while (1) {
// First loop: Each wavefront in the team works on UNROLL float4s per thread
size_t const loop1Stride = nTeams * nWaves * UNROLL * warpSize;
size_t const loop1Limit = numFloat4 / loop1Stride * loop1Stride;
{
#pragma unroll
for (int u = 0; u < UNROLL; u++)
val[u] = MemsetVal<float4>();
}
for (size_t idx = (teamIdx * teamStride + waveIdx * waveStride) * warpSize + tIdx; idx < loop1Limit; idx += loop1Stride)
{
// Read sources into memory and accumulate in registers
if (numSrcs)
{
for (int u = 0; u < UNROLL; u++)
val[u] = srcFloat4[0][idx + u * unrlStride * warpSize];
for (int s = 1; s < numSrcs; s++)
for (int u = 0; u < UNROLL; u++)
val[u] += srcFloat4[s][idx + u * unrlStride * warpSize];
}
// Write accumulation to all outputs
for (int d = 0; d < numDsts; d++)
{
float4 val[UNROLL];
if (numSrcs == 0) {
#pragma unroll
for (int u = 0; u < UNROLL; u++)
dstFloat4[d][idx + u * unrlStride * warpSize] = val[u];
val[u] = MemsetVal<float4>();
}
}
}
// Second loop: Deal with remaining float4s
{
if (loop1Limit < numFloat4)
{
float4 val;
if (numSrcs == 0) val = MemsetVal<float4>();
size_t const loop2Stride = nTeams * nWaves * warpSize;
for (size_t idx = loop1Limit + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx; idx < numFloat4; idx += loop2Stride)
for (size_t idx = (teamIdx * teamStride + waveIdx * waveStride) * warpSize + tIdx; idx < loop1Limit; idx += loop1Stride)
{
// Read sources into memory and accumulate in registers
if (numSrcs)
{
val = srcFloat4[0][idx];
for (int u = 0; u < UNROLL; u++)
val[u] = srcFloat4[0][idx + u * unrlStride * warpSize];
for (int s = 1; s < numSrcs; s++)
val += srcFloat4[s][idx];
for (int u = 0; u < UNROLL; u++)
val[u] += srcFloat4[s][idx + u * unrlStride * warpSize];
}
// Write accumulation to all outputs
for (int d = 0; d < numDsts; d++)
dstFloat4[d][idx] = val;
{
#pragma unroll
for (int u = 0; u < UNROLL; u++)
dstFloat4[d][idx + u * unrlStride * warpSize] = val[u];
}
}
}
}
// Third loop: Deal with remaining floats
{
if (numFloat4 * 4 < p.N)
// Second loop: Deal with remaining float4s
{
float val;
if (numSrcs == 0) val = MemsetVal<float>();
size_t const loop3Stride = nTeams * nWaves * warpSize;
for( size_t idx = numFloat4 * 4 + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx; idx < p.N; idx += loop3Stride)
if (loop1Limit < numFloat4)
{
if (numSrcs)
float4 val;
if (numSrcs == 0) val = MemsetVal<float4>();
size_t const loop2Stride = nTeams * nWaves * warpSize;
for (size_t idx = loop1Limit + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx; idx < numFloat4; idx += loop2Stride)
{
val = p.src[0][idx];
for (int s = 1; s < numSrcs; s++)
val += p.src[s][idx];
if (numSrcs)
{
val = srcFloat4[0][idx];
for (int s = 1; s < numSrcs; s++)
val += srcFloat4[s][idx];
}
for (int d = 0; d < numDsts; d++)
dstFloat4[d][idx] = val;
}
}
}
for (int d = 0; d < numDsts; d++)
p.dst[d][idx] = val;
// Third loop: Deal with remaining floats
{
if (numFloat4 * 4 < p.N)
{
float val;
if (numSrcs == 0) val = MemsetVal<float>();
size_t const loop3Stride = nTeams * nWaves * warpSize;
for( size_t idx = numFloat4 * 4 + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx; idx < p.N; idx += loop3Stride)
{
if (numSrcs)
{
val = p.src[0][idx];
for (int s = 1; s < numSrcs; s++)
val += p.src[s][idx];
}
for (int d = 0; d < numDsts; d++)
p.dst[d][idx] = val;
}
}
}
if (++subIterations == numSubIterations) break;
}
// Wait for all threads to finish
@@ -308,7 +312,7 @@ __global__ void __launch_bounds__(BLOCKSIZE)
}
}
typedef void (*GpuKernelFuncPtr)(SubExecParam*, int);
typedef void (*GpuKernelFuncPtr)(SubExecParam*, int, int);
#define GPU_KERNEL_UNROLL_DECL(BLOCKSIZE) \
{GpuReduceKernel<BLOCKSIZE, 1>, \
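Condensed, the kernel restructuring above wraps all three copy loops in a `while (1)` whose post-increment exit test gives NUM_SUBITERATIONS=0 its documented infinite-loop behavior: a counter incremented from 0 never equals 0. A host-side model of just the control flow:

```cpp
#include <cstdio>

// Models the kernel's sub-iteration loop: with numSubIterations == 0 the
// exit test never matches, so the loop runs until externally stopped.
void RunSubIterations(int numSubIterations)
{
  int subIterations = 0;
  while (1)
  {
    printf("sub-iteration %d\n", subIterations + 1);  // stand-in for the copy loops
    if (++subIterations == numSubIterations) break;
  }
}

int main() { RunSubIterations(3); return 0; }  // performs exactly three passes
```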
@@ -158,6 +158,25 @@ struct ExecutorInfo
double totalTime;
};
struct ExeResult
{
double bandwidthGbs;
double durationMsec;
double sumBandwidthGbs;
size_t totalBytes;
std::vector<int> transferIdx;
};
struct TestResults
{
size_t numTimedIterations;
size_t totalBytesTransferred;
double totalBandwidthCpu;
double totalDurationMsec;
double overheadMsec;
std::map<std::pair<ExeType, int>, ExeResult> exeResults;
};
typedef std::pair<ExeType, int> Executor;
typedef std::map<Executor, ExecutorInfo> TransferMap;
@@ -179,7 +198,8 @@ void ParseTransfers(EnvVars const& ev, char* line, std::vector<Transfer>& transf
void ExecuteTransfers(EnvVars const& ev, int const testNum, size_t const N,
std::vector<Transfer>& transfers, bool verbose = true,
double* totalBandwidthCpu = nullptr);
TestResults ExecuteTransfersImpl(EnvVars const& ev, std::vector<Transfer>& transfers);
void ReportResults(EnvVars const& ev, std::vector<Transfer> const& transfers, TestResults const results);
void EnablePeerAccess(int const deviceId, int const peerDeviceId);
void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr);
void DeallocateMemory(MemType memType, void* memPtr, size_t const size = 0);
@@ -192,6 +212,7 @@ void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, i
void RunSchmooBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int const localIdx, int const remoteIdx, int const maxSubExecs);
void RunRemoteWriteBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus);
void RunParallelCopyBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus);
void RunHealthCheck(EnvVars ev);
std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount);
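Given the new `TestResults` and `ExeResult` structures above, per-Executor reporting presumably walks the `exeResults` map; a sketch using only the declared fields (the formatting and the `ExeTypeStr` lookup are illustrative):

```cpp
#include <cstdio>
// Assumes ExeType, ExeResult, TestResults, and ExeTypeStr as declared above.

void PrintPerExecutorResults(TestResults const& results)
{
  for (auto const& kv : results.exeResults)
  {
    ExeResult const& r = kv.second;
    printf("Executor %c%d: %.3f GB/s, %zu bytes in %.3f ms\n",
           ExeTypeStr[(int)kv.first.first], kv.first.second,
           r.bandwidthGbs, r.totalBytes, r.durationMsec);
  }
}
```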