Unverified Commit b30aefb6 authored by gilbertlee-amd's avatar gilbertlee-amd Committed by GitHub
Browse files

Fixing topology detection memory access and CU masking for multi XCD GPUs (#116)

* Fixing potential out-of-bounds write during topology detection
* Fixing CU_MASK for multi-XCD GPUs
* Adding sub-iterations via NUM_SUBITERATIONS
* Adding support for variable subexecutor Transfers
* Adding healthcheck preset
parent ae843a6f
......@@ -3,6 +3,29 @@
Documentation for TransferBench is available at
[https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
## v1.51
### Modified
- CSV output has been modified slightly to match normal terminal output
- Output for non single stream mode has been changed to match single stream mode (results per Executor)
### Added
- Support for sub-iterations via NUM_SUBITERATIONS. This allows for additional looping during an iteration.
If set to 0, this should infinitely loop (which may be useful for some debug purposes)
- Support for variable number of subexecutors (currently for GPU-GFX executor only). Setting subExecutors to
0 will run over a range of CUs to use, and report only the results of the best one found. This can be tuned
for performance by setting the MIN_VAR_SUBEXEC and MAX_VAR_SUBEXEC environment variables to narrow the
search space. The number of CUs used will be identical for all variable subExecutor transfers
- Experimental new "healthcheck" preset config which currently only supports MI300 series. This preset runs
through CPU to GPU bandwidth tests and all-to-all XGMI bandwidth tests and compares against expected values.
Pass criteria limits can be modified (due to platform differences) via the environment variables
LIMIT_UDIR (unidirectional), LIMIT_BDIR (bidirectional), and LIMIT_A2A (per GPU-GPU link bandwidth)
### Fixed
- Fixed out-of-bounds memory access during topology detection that can happen if the number of
CPUs is less than the number of NUMA domains
- Fixed CU masking functionality on multi-XCD architectures (e.g. MI300)
## v1.50
### Added
......
......@@ -67,8 +67,9 @@ make
* Running TransferBench with no arguments displays usage instructions and detected topology
information
* You can use several preset configurations instead of a configuration file:
* `a2a` : All-to-all benchmark test
* `cmdline`: Take in Transfers to run from command-line instead of via file
* `a2a` : All-to-all benchmark test
* `cmdline` : Take in Transfers to run from command-line instead of via file
* `healthcheck` : Simple health check (supported on MI300 series only)
* `p2p` : Peer-to-peer benchmark test
* `pcopy` : Benchmark parallel copies from a single GPU to other GPUs
* `rsweep` : Random sweep across possible sets of transfers
......
......@@ -116,6 +116,11 @@ int main(int argc, char **argv)
RunAllToAllBenchmark(ev, numBytesPerTransfer, numSubExecs);
exit(0);
}
// Health check
else if (!strcmp(argv[1], "healthcheck")) {
RunHealthCheck(ev);
exit(0);
}
// - Test schmoo benchmark
else if (!strcmp(argv[1], "schmoo"))
{
......@@ -211,12 +216,8 @@ int main(int argc, char **argv)
}
else if (!strcmp(argv[1], "cmdline"))
{
// Print environment variables and CSV header
// Print environment variables
ev.DisplayEnvVars();
if (ev.outputToCsv)
{
printf("Test#,Transfer#,NumBytes,Src,Exe,Dst,CUs,BW(GB/s),Time(ms),SrcAddr,DstAddr\n");
}
// Read Transfer from command line
std::string cmdlineTransfer;
......@@ -263,10 +264,6 @@ int main(int argc, char **argv)
// Print environment variables and CSV header
ev.DisplayEnvVars();
if (ev.outputToCsv)
{
printf("Test#,Transfer#,NumBytes,Src,Exe,Dst,CUs,BW(GB/s),Time(ms),SrcAddr,DstAddr\n");
}
int testNum = 0;
char line[MAX_LINE_LEN];
......@@ -313,8 +310,78 @@ void ExecuteTransfers(EnvVars const& ev,
bool verbose,
double* totalBandwidthCpu)
{
int const initOffset = ev.byteOffset / sizeof(float);
// Check for any Transfers using variable number of sub-executors
std::vector<int> varTransfers;
std::vector<int> numUsedSubExec(ev.numGpuDevices, 0);
std::vector<int> numVarSubExec(ev.numGpuDevices, 0);
for (int i = 0; i < transfers.size(); i++) {
Transfer& t = transfers[i];
t.transferIndex = i;
t.numBytesActual = (t.numBytes ? t.numBytes : N * sizeof(float));
if (t.exeType == EXE_GPU_GFX) {
if (t.numSubExecs == 0) {
varTransfers.push_back(i);
numVarSubExec[t.exeIndex]++;
} else {
numUsedSubExec[t.exeIndex] += t.numSubExecs;
}
} else if (t.numSubExecs == 0) {
printf("[ERROR] Variable subexecutor count is only supported for GFX executor\n");
exit(1);
}
}
if (verbose && !ev.outputToCsv) printf("Test %d:\n", testNum);
TestResults testResults;
if (varTransfers.size() == 0) {
testResults = ExecuteTransfersImpl(ev, transfers);
} else {
// Determine maximum number of subexecutors
int maxNumSubExec = 0;
if (ev.maxNumVarSubExec) {
maxNumSubExec = ev.maxNumVarSubExec;
} else {
HIP_CALL(hipDeviceGetAttribute(&maxNumSubExec, hipDeviceAttributeMultiprocessorCount, 0));
for (int device = 0; device < ev.numGpuDevices; device++) {
int numSubExec = 0;
HIP_CALL(hipDeviceGetAttribute(&numSubExec, hipDeviceAttributeMultiprocessorCount, device));
int leftOverSubExec = numSubExec - numUsedSubExec[device];
if (leftOverSubExec < numVarSubExec[device])
maxNumSubExec = 1;
else if (numVarSubExec[device] != 0) {
maxNumSubExec = std::min(maxNumSubExec, leftOverSubExec / numVarSubExec[device]);
}
}
}
// Loop over subexecs
std::vector<Transfer> bestTransfers;
for (int numSubExec = ev.minNumVarSubExec; numSubExec <= maxNumSubExec; numSubExec++) {
std::vector<Transfer> currTransfers = transfers;
for (auto idx : varTransfers) {
currTransfers[idx].numSubExecs = numSubExec;
}
TestResults tempResults = ExecuteTransfersImpl(ev, currTransfers);
if (tempResults.totalBandwidthCpu > testResults.totalBandwidthCpu) {
bestTransfers = currTransfers;
testResults = tempResults;
}
}
transfers = bestTransfers;
}
if (totalBandwidthCpu) *totalBandwidthCpu = testResults.totalBandwidthCpu;
if (verbose) {
ReportResults(ev, transfers, testResults);
}
}
TestResults ExecuteTransfersImpl(EnvVars const& ev,
std::vector<Transfer>& transfers)
{
// Map transfers by executor
TransferMap transferMap;
for (int i = 0; i < transfers.size(); i++)
......@@ -341,15 +408,12 @@ void ExecuteTransfers(EnvVars const& ev,
// Loop over each transfer this executor is involved in
for (Transfer* transfer : exeInfo.transfers)
{
// Determine how many bytes to copy for this Transfer (use custom if pre-specified)
transfer->numBytesActual = (transfer->numBytes ? transfer->numBytes : N * sizeof(float));
// Allocate source memory
transfer->srcMem.resize(transfer->numSrcs);
for (int iSrc = 0; iSrc < transfer->numSrcs; ++iSrc)
{
MemType const& srcType = transfer->srcType[iSrc];
int const srcIndex = RemappedIndex(transfer->srcIndex[iSrc], IsCpuType(srcType));
int const srcIndex = RemappedIndex(transfer->srcIndex[iSrc], IsCpuType(srcType));
// Ensure executing GPU can access source memory
if (IsGpuType(exeType) && IsGpuType(srcType) && srcIndex != exeIndex)
......@@ -363,7 +427,7 @@ void ExecuteTransfers(EnvVars const& ev,
for (int iDst = 0; iDst < transfer->numDsts; ++iDst)
{
MemType const& dstType = transfer->dstType[iDst];
int const dstIndex = RemappedIndex(transfer->dstIndex[iDst], IsCpuType(dstType));
int const dstIndex = RemappedIndex(transfer->dstIndex[iDst], IsCpuType(dstType));
// Ensure executing GPU can access destination memory
if (IsGpuType(exeType) && IsGpuType(dstType) && dstIndex != exeIndex)
......@@ -509,7 +573,6 @@ void ExecuteTransfers(EnvVars const& ev,
}
}
if (verbose && !ev.outputToCsv) printf("Test %d:\n", testNum);
// Prepare input memory and block parameters for current N
bool isSrcCorrect = true;
......@@ -610,7 +673,7 @@ void ExecuteTransfers(EnvVars const& ev,
if (ev.numIterations < 0 && totalCpuTime > -ev.numIterations) break;
// Pause before starting first timed iteration in interactive mode
if (verbose && ev.useInteractive && iteration == 0)
if (ev.useInteractive && iteration == 0)
{
printf("Memory prepared:\n");
......@@ -674,7 +737,7 @@ void ExecuteTransfers(EnvVars const& ev,
}
// Pause for interactive mode
if (verbose && isSrcCorrect && ev.useInteractive)
if (isSrcCorrect && ev.useInteractive)
{
printf("Transfers complete. Hit <Enter> to continue: ");
if (scanf("%*c") != 0)
......@@ -695,243 +758,47 @@ void ExecuteTransfers(EnvVars const& ev,
totalBytesTransferred += transfer->numBytesActual;
}
// Report timings
totalCpuTime = totalCpuTime / (1.0 * numTimedIterations) * 1000;
double totalBandwidthGbs = (totalBytesTransferred / 1.0E6) / totalCpuTime;
if (totalBandwidthCpu) *totalBandwidthCpu = totalBandwidthGbs;
double maxGpuTime = 0;
// Record results
TestResults testResults;
testResults.numTimedIterations = numTimedIterations;
testResults.totalBytesTransferred = totalBytesTransferred;
testResults.totalDurationMsec = totalCpuTime / (1.0 * numTimedIterations * ev.numSubIterations) * 1000;
testResults.totalBandwidthCpu = (totalBytesTransferred / 1.0E6) / testResults.totalDurationMsec;
double maxExeDurationMsec = 0.0;
if (!isSrcCorrect) goto cleanup;
if (ev.useSingleStream)
{
for (auto& exeInfoPair : transferMap)
{
ExecutorInfo exeInfo = exeInfoPair.second;
ExeType const exeType = exeInfoPair.first.first;
int const exeIndex = exeInfoPair.first.second;
// Compute total time for non GPU executors
if (exeType != EXE_GPU_GFX)
{
exeInfo.totalTime = 0;
for (auto const& transfer : exeInfo.transfers)
exeInfo.totalTime = std::max(exeInfo.totalTime, transfer->transferTime);
}
double exeDurationMsec = exeInfo.totalTime / (1.0 * numTimedIterations);
double exeBandwidthGbs = (exeInfo.totalBytes / 1.0E9) / exeDurationMsec * 1000.0f;
maxGpuTime = std::max(maxGpuTime, exeDurationMsec);
double sumBandwidthGbs = 0.0;
for (auto& transfer: exeInfo.transfers)
{
transfer->transferTime /= (1.0 * numTimedIterations);
transfer->transferBandwidth = (transfer->numBytesActual / 1.0E9) / transfer->transferTime * 1000.0f;
transfer->executorBandwidth = exeBandwidthGbs;
sumBandwidthGbs += transfer->transferBandwidth;
}
if (verbose && !ev.outputToCsv)
{
printf(" Executor: %3s %02d | %7.3f GB/s | %8.3f ms | %12lu bytes | %-7.3f GB/s (sum)\n",
ExeTypeName[exeType], exeIndex, exeBandwidthGbs, exeDurationMsec, exeInfo.totalBytes, sumBandwidthGbs);
}
for (auto& exeInfoPair : transferMap)
{
ExecutorInfo exeInfo = exeInfoPair.second;
ExeType const exeType = exeInfoPair.first.first;
int const exeIndex = exeInfoPair.first.second;
ExeResult& exeResult = testResults.exeResults[std::make_pair(exeType, exeIndex)];
int totalCUs = 0;
// Compute total time for non GPU executors
if (exeType != EXE_GPU_GFX || ev.useSingleStream == 0)
{
exeInfo.totalTime = 0;
for (auto const& transfer : exeInfo.transfers)
{
totalCUs += transfer->numSubExecs;
char exeSubIndexStr[32] = "";
if (ev.useXccFilter || transfer->exeType == EXE_GPU_DMA)
{
if (transfer->exeSubIndex == -1)
sprintf(exeSubIndexStr, ".*");
else
sprintf(exeSubIndexStr, ".%d", transfer->exeSubIndex);
}
if (!verbose) continue;
if (!ev.outputToCsv)
{
printf(" Transfer %02d | %7.3f GB/s | %8.3f ms | %12lu bytes | %s -> %s%02d%s:%03d -> %s\n",
transfer->transferIndex,
transfer->transferBandwidth,
transfer->transferTime,
transfer->numBytesActual,
transfer->SrcToStr().c_str(),
ExeTypeName[transfer->exeType], transfer->exeIndex,
exeSubIndexStr,
transfer->numSubExecs,
transfer->DstToStr().c_str());
if (ev.showIterations)
{
std::set<std::pair<double, int>> times;
double stdDevTime = 0;
double stdDevBw = 0;
for (int i = 0; i < numTimedIterations; i++)
{
times.insert(std::make_pair(transfer->perIterationTime[i], i+1));
double const varTime = fabs(transfer->transferTime - transfer->perIterationTime[i]);
stdDevTime += varTime * varTime;
double iterBandwidthGbs = (transfer->numBytesActual / 1.0E9) / transfer->perIterationTime[i] * 1000.0f;
double const varBw = fabs(iterBandwidthGbs - transfer->transferBandwidth);
stdDevBw += varBw * varBw;
}
stdDevTime = sqrt(stdDevTime / numTimedIterations);
stdDevBw = sqrt(stdDevBw / numTimedIterations);
for (auto t : times)
{
double iterDurationMsec = t.first;
double iterBandwidthGbs = (transfer->numBytesActual / 1.0E9) / iterDurationMsec * 1000.0f;
printf(" Iter %03d | %7.3f GB/s | %8.3f ms |", t.second, iterBandwidthGbs, iterDurationMsec);
std::set<int> usedXccs;
if (t.second - 1 < transfer->perIterationCUs.size())
{
printf(" CUs:");
for (auto x : transfer->perIterationCUs[t.second - 1])
{
printf(" %02d:%02d", x.first, x.second);
usedXccs.insert(x.first);
}
}
printf(" XCCs:");
for (auto x : usedXccs)
printf(" %02d", x);
printf("\n");
}
printf(" StandardDev | %7.3f GB/s | %8.3f ms |\n", stdDevBw, stdDevTime);
}
}
else
{
printf("%d,%d,%lu,%s,%c%02d%s,%s,%d,%.3f,%.3f,%s,%s\n",
testNum, transfer->transferIndex, transfer->numBytesActual,
transfer->SrcToStr().c_str(),
MemTypeStr[transfer->exeType], transfer->exeIndex, exeSubIndexStr,
transfer->DstToStr().c_str(),
transfer->numSubExecs,
transfer->transferBandwidth, transfer->transferTime,
PtrVectorToStr(transfer->srcMem, initOffset).c_str(),
PtrVectorToStr(transfer->dstMem, initOffset).c_str());
}
}
if (verbose && ev.outputToCsv)
{
printf("%d,ALL,%lu,ALL,%c%02d,ALL,%d,%.3f,%.3f,ALL,ALL\n",
testNum, totalBytesTransferred,
MemTypeStr[exeType], exeIndex, totalCUs,
exeBandwidthGbs, exeDurationMsec);
}
exeInfo.totalTime = std::max(exeInfo.totalTime, transfer->transferTime);
}
}
else
{
for (auto const& transferPair : transferList)
{
Transfer* transfer = transferPair.second;
transfer->transferTime /= (1.0 * numTimedIterations);
transfer->transferBandwidth = (transfer->numBytesActual / 1.0E9) / transfer->transferTime * 1000.0f;
transfer->executorBandwidth = transfer->transferBandwidth;
maxGpuTime = std::max(maxGpuTime, transfer->transferTime);
if (!verbose) continue;
char exeSubIndexStr[32] = "";
if (ev.useXccFilter)
{
if (transfer->exeSubIndex == -1 || transfer->exeType == EXE_GPU_DMA)
sprintf(exeSubIndexStr, ".*");
else
sprintf(exeSubIndexStr, ".%d", transfer->exeSubIndex);
}
if (!ev.outputToCsv)
{
printf(" Transfer %02d | %7.3f GB/s | %8.3f ms | %12lu bytes | %s -> %s%02d%s:%03d -> %s\n",
transfer->transferIndex,
transfer->transferBandwidth, transfer->transferTime,
transfer->numBytesActual,
transfer->SrcToStr().c_str(),
ExeTypeName[transfer->exeType], transfer->exeIndex, exeSubIndexStr,
transfer->numSubExecs,
transfer->DstToStr().c_str());
if (ev.showIterations)
{
std::set<std::pair<double, int>> times;
double stdDevTime = 0;
double stdDevBw = 0;
for (int i = 0; i < numTimedIterations; i++)
{
times.insert(std::make_pair(transfer->perIterationTime[i], i+1));
double const varTime = fabs(transfer->transferTime - transfer->perIterationTime[i]);
stdDevTime += varTime * varTime;
double iterBandwidthGbs = (transfer->numBytesActual / 1.0E9) / transfer->perIterationTime[i] * 1000.0f;
double const varBw = fabs(iterBandwidthGbs - transfer->transferBandwidth);
stdDevBw += varBw * varBw;
}
stdDevTime = sqrt(stdDevTime / numTimedIterations);
stdDevBw = sqrt(stdDevBw / numTimedIterations);
for (auto t : times)
{
double iterDurationMsec = t.first;
double iterBandwidthGbs = (transfer->numBytesActual / 1.0E9) / iterDurationMsec * 1000.0f;
printf(" Iter %03d | %7.3f GB/s | %8.3f ms |", t.second, iterBandwidthGbs, iterDurationMsec);
std::set<int> usedXccs;
if (t.second - 1 < transfer->perIterationCUs.size())
{
printf(" CUs:");
for (auto x : transfer->perIterationCUs[t.second - 1])
{
printf(" %02d:%02d", x.first, x.second);
usedXccs.insert(x.first);
}
}
printf(" XCCs:");
for (auto x : usedXccs)
printf(" %d", x);
printf("\n");
}
printf(" StandardDev | %7.3f GB/s | %8.3f ms |\n", stdDevBw, stdDevTime);
}
}
else
{
printf("%d,%d,%lu,%s,%s%02d%s,%s,%d,%.3f,%.3f,%s,%s\n",
testNum, transfer->transferIndex, transfer->numBytesActual,
transfer->SrcToStr().c_str(),
ExeTypeName[transfer->exeType], transfer->exeIndex, exeSubIndexStr,
transfer->DstToStr().c_str(),
transfer->numSubExecs,
transfer->transferBandwidth, transfer->transferTime,
PtrVectorToStr(transfer->srcMem, initOffset).c_str(),
PtrVectorToStr(transfer->dstMem, initOffset).c_str());
}
}
}
exeResult.totalBytes = exeInfo.totalBytes;
exeResult.durationMsec = exeInfo.totalTime / (1.0 * numTimedIterations * ev.numSubIterations);
exeResult.bandwidthGbs = (exeInfo.totalBytes / 1.0E9) / exeResult.durationMsec * 1000.0f;
exeResult.sumBandwidthGbs = 0;
maxExeDurationMsec = std::max(maxExeDurationMsec, exeResult.durationMsec);
// Display aggregate statistics
if (verbose)
{
if (!ev.outputToCsv)
{
printf(" Aggregate (CPU) | %7.3f GB/s | %8.3f ms | %12lu bytes | Overhead: %.3f ms\n",
totalBandwidthGbs, totalCpuTime, totalBytesTransferred, totalCpuTime - maxGpuTime);
}
else
for (auto& transfer: exeInfo.transfers)
{
printf("%d,ALL,%lu,ALL,ALL,ALL,ALL,%.3f,%.3f,ALL,ALL\n",
testNum, totalBytesTransferred, totalBandwidthGbs, totalCpuTime);
exeResult.transferIdx.push_back(transfer->transferIndex);
transfer->transferTime /= (1.0 * numTimedIterations * ev.numSubIterations);
transfer->transferBandwidth = (transfer->numBytesActual / 1.0E9) / transfer->transferTime * 1000.0f;
transfer->executorBandwidth = exeResult.bandwidthGbs;
exeResult.sumBandwidthGbs += transfer->transferBandwidth;
}
}
testResults.overheadMsec = testResults.totalDurationMsec - maxExeDurationMsec;
// Release GPU memory
cleanup:
......@@ -983,6 +850,8 @@ cleanup:
}
}
}
return testResults;
}
void DisplayUsage(char const* cmdName)
......@@ -1006,6 +875,7 @@ void DisplayUsage(char const* cmdName)
printf(" a2a - GPU All-To-All benchmark\n");
printf(" - 3rd optional arg: # of SubExecs to use\n");
printf(" cmdline - Read Transfers from command line arguments (after N)\n");
printf(" healthcheck - Simple bandwidth health check (MI300 series only)\n");
printf(" p2p - Peer-to-peer benchmark tests\n");
printf(" rwrite/pcopy - Parallel writes/copies from single GPU to other GPUs\n");
printf(" - 3rd optional arg: # GPU SubExecs per Transfer\n");
......@@ -1396,9 +1266,9 @@ void ParseTransfers(EnvVars const& ev, char* line, std::vector<Transfer>& transf
if (!advancedMode)
{
iss >> numSubExecs;
if (numSubExecs <= 0 || iss.fail())
if (numSubExecs < 0 || iss.fail())
{
printf("Parsing error: Number of blocks to use (%d) must be greater than 0\n", numSubExecs);
printf("Parsing error: Number of blocks to use (%d) must be non-negative\n", numSubExecs);
exit(1);
}
}
......@@ -1683,14 +1553,14 @@ void RunTransfer(EnvVars const& ev, int const iteration,
#if defined(__NVCC__)
HIP_CALL(hipEventRecord(startEvent, stream));
GpuKernelTable[ev.gfxBlockSize/64 - 1][ev.gfxUnroll - 1]
<<<gridSize, blockSize, ev.sharedMemBytes, stream>>>(transfer->subExecParamGpuPtr, ev.gfxWaveOrder);
<<<gridSize, blockSize, ev.sharedMemBytes, stream>>>(transfer->subExecParamGpuPtr, ev.gfxWaveOrder, ev.numSubIterations);
HIP_CALL(hipEventRecord(stopEvent, stream));
#else
hipExtLaunchKernelGGL(GpuKernelTable[ev.gfxBlockSize/64 - 1][ev.gfxUnroll - 1],
gridSize, blockSize,
ev.sharedMemBytes, stream,
startEvent, stopEvent,
0, transfer->subExecParamGpuPtr, ev.gfxWaveOrder);
0, transfer->subExecParamGpuPtr, ev.gfxWaveOrder, ev.numSubIterations);
#endif
// Synchronize per iteration, unless in single sync mode, in which case
// synchronize during last warmup / last actual iteration
......@@ -1757,13 +1627,13 @@ void RunTransfer(EnvVars const& ev, int const iteration,
hipEvent_t& startEvent = exeInfo.startEvents[transferIdx];
hipEvent_t& stopEvent = exeInfo.stopEvents[transferIdx];
int subIteration = 0;
HIP_CALL(hipEventRecord(startEvent, stream));
if (transfer->numSrcs == 1 && transfer->numDsts == 1)
{
do {
HIP_CALL(hipMemcpyAsync(transfer->dstMem[0], transfer->srcMem[0],
transfer->numBytesActual, hipMemcpyDefault,
stream));
}
} while (++subIteration != ev.numSubIterations);
HIP_CALL(hipEventRecord(stopEvent, stream));
HIP_CALL(hipStreamSynchronize(stream));
......@@ -1772,6 +1642,7 @@ void RunTransfer(EnvVars const& ev, int const iteration,
// Record GPU timing
float gpuDeltaMsec;
HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
//gpuDeltaMsec /= (1.0 * ev.numSubIterations);
transfer->transferTime += gpuDeltaMsec;
if (ev.showIterations)
transfer->perIterationTime.push_back(gpuDeltaMsec);
......@@ -1784,23 +1655,27 @@ void RunTransfer(EnvVars const& ev, int const iteration,
exit(1);
#else
// Target specific DMA engine
auto cpuStart = std::chrono::high_resolution_clock::now();
// Atomically set signal to 1
HSA_CALL(hsa_signal_store_screlease(transfer->signal, 1));
int subIterations = 0;
do {
// Atomically set signal to 1
HSA_CALL(hsa_signal_store_screlease(transfer->signal, 1));
HSA_CALL(hsa_amd_memory_async_copy_on_engine(transfer->dstMem[0], transfer->dstAgent,
transfer->srcMem[0], transfer->srcAgent,
transfer->numBytesActual, 0, NULL,
transfer->signal,
transfer->sdmaEngineId, true));
// Wait for SDMA transfer to complete
// NOTE: "A wait operation can spuriously resume at any time sooner than the timeout
// (for example, due to system or other external factors) even when the
// condition has not been met.)
while(hsa_signal_wait_scacquire(transfer->signal,
HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
HSA_WAIT_STATE_ACTIVE) >= 1);
} while (++subIterations < ev.numSubIterations);
auto cpuStart = std::chrono::high_resolution_clock::now();
HSA_CALL(hsa_amd_memory_async_copy_on_engine(transfer->dstMem[0], transfer->dstAgent,
transfer->srcMem[0], transfer->srcAgent,
transfer->numBytesActual, 0, NULL,
transfer->signal,
transfer->sdmaEngineId, true));
// Wait for SDMA transfer to complete
// NOTE: "A wait operation can spuriously resume at any time sooner than the timeout
// (for example, due to system or other external factors) even when the
// condition has not been met.)
while(hsa_signal_wait_scacquire(transfer->signal,
HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
HSA_WAIT_STATE_ACTIVE) >= 1);
if (iteration >= 0)
{
// Record GPU timing
......@@ -1825,15 +1700,18 @@ void RunTransfer(EnvVars const& ev, int const iteration,
std::vector<std::thread> childThreads;
int subIteration = 0;
auto cpuStart = std::chrono::high_resolution_clock::now();
do {
// Launch each subExecutor in child-threads to perform memcopies
for (int i = 0; i < transfer->numSubExecs; ++i)
childThreads.push_back(std::thread(CpuReduceKernel, std::ref(transfer->subExecParam[i])));
// Launch each subExecutor in child-threads to perform memcopies
for (int i = 0; i < transfer->numSubExecs; ++i)
childThreads.push_back(std::thread(CpuReduceKernel, std::ref(transfer->subExecParam[i])));
// Wait for child-threads to finish
for (int i = 0; i < transfer->numSubExecs; ++i)
childThreads[i].join();
// Wait for child-threads to finish
for (int i = 0; i < transfer->numSubExecs; ++i)
childThreads[i].join();
childThreads.clear();
} while (++subIteration != ev.numSubIterations);
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
......@@ -3171,3 +3049,316 @@ std::string PtrVectorToStr(std::vector<float*> const& strVector, int const initO
}
return ss.str();
}
// Print bandwidth/timing results for a completed test.
// Displays one line per Executor, one line per Transfer it executed, optional
// per-iteration statistics (when SHOW_ITERATIONS is enabled), and a final
// CPU-side aggregate line.
// - ev:        environment settings (controls CSV separator, XCC filtering,
//              and per-iteration output)
// - transfers: the executed Transfers; indexed via ExeResult::transferIdx
// - results:   aggregated results collected by ExecuteTransfersImpl
void ReportResults(EnvVars const& ev, std::vector<Transfer> const& transfers, TestResults const results)
{
  // Column separator: comma for CSV output, pipe for terminal output
  char sep = ev.outputToCsv ? ',' : '|';
  size_t const numTimedIterations = results.numTimedIterations;

  // Loop over each executor (by const reference to avoid copying map entries)
  for (auto const& exeInfoPair : results.exeResults) {
    ExeResult const& exeResult = exeInfoPair.second;
    ExeType exeType = exeInfoPair.first.first;
    int exeIndex = exeInfoPair.first.second;

    printf(" Executor: %3s %02d %c %7.3f GB/s %c %8.3f ms %c %12lu bytes %c %-7.3f GB/s (sum)\n",
           ExeTypeName[exeType], exeIndex, sep, exeResult.bandwidthGbs, sep,
           exeResult.durationMsec, sep, exeResult.totalBytes, sep, exeResult.sumBandwidthGbs);

    // Loop over each Transfer this executor ran
    for (int idx : exeResult.transferIdx) {
      Transfer const& t = transfers[idx];

      // Executor sub-index suffix (XCC for GFX, engine for DMA); ".*" when unspecified
      char exeSubIndexStr[32] = "";
      if (ev.useXccFilter || t.exeType == EXE_GPU_DMA) {
        if (t.exeSubIndex == -1)
          snprintf(exeSubIndexStr, sizeof(exeSubIndexStr), ".*");
        else
          snprintf(exeSubIndexStr, sizeof(exeSubIndexStr), ".%d", t.exeSubIndex);
      }

      printf(" Transfer %02d %c %7.3f GB/s %c %8.3f ms %c %12lu bytes %c %s -> %s%02d%s:%03d -> %s\n",
             t.transferIndex, sep,
             t.transferBandwidth, sep,
             t.transferTime, sep,
             t.numBytesActual, sep,
             t.SrcToStr().c_str(),
             ExeTypeName[t.exeType], t.exeIndex,
             exeSubIndexStr,
             t.numSubExecs,
             t.DstToStr().c_str());

      // Show per-iteration timing information
      if (ev.showIterations) {
        // Collect (duration, 1-based iteration#) pairs sorted by duration, and
        // accumulate variance for standard-deviation reporting
        std::set<std::pair<double, int>> times;
        double stdDevTime = 0;
        double stdDevBw = 0;
        for (size_t i = 0; i < numTimedIterations; i++) {
          times.insert(std::make_pair(t.perIterationTime[i], static_cast<int>(i) + 1));
          double const varTime = fabs(t.transferTime - t.perIterationTime[i]);
          stdDevTime += varTime * varTime;
          double iterBandwidthGbs = (t.numBytesActual / 1.0E9) / t.perIterationTime[i] * 1000.0f;
          double const varBw = fabs(iterBandwidthGbs - t.transferBandwidth);
          stdDevBw += varBw * varBw;
        }
        stdDevTime = sqrt(stdDevTime / numTimedIterations);
        stdDevBw = sqrt(stdDevBw / numTimedIterations);

        for (auto const& time : times) {
          double iterDurationMsec = time.first;
          double iterBandwidthGbs = (t.numBytesActual / 1.0E9) / iterDurationMsec * 1000.0f;
          printf(" Iter %03d %c %7.3f GB/s %c %8.3f ms %c", time.second, sep, iterBandwidthGbs, sep, iterDurationMsec, sep);

          // CU/XCC occupancy is only available for iterations that recorded it
          std::set<int> usedXccs;
          if (static_cast<size_t>(time.second - 1) < t.perIterationCUs.size()) {
            printf(" CUs:");
            for (auto const& x : t.perIterationCUs[time.second - 1]) {
              printf(" %02d:%02d", x.first, x.second);
              usedXccs.insert(x.first);
            }
          }
          printf(" XCCs:");
          for (auto x : usedXccs)
            printf(" %02d", x);
          printf("\n");
        }
        printf(" StandardDev %c %7.3f GB/s %c %8.3f ms %c\n", sep, stdDevBw, sep, stdDevTime, sep);
      }
    }
  }

  // Aggregate statistics measured from the CPU side (includes launch overhead)
  printf(" Aggregate (CPU) %c %7.3f GB/s %c %8.3f ms %c %12lu bytes %c Overhead: %.3f ms\n", sep,
         results.totalBandwidthCpu, sep, results.totalDurationMsec, sep, results.totalBytesTransferred, sep, results.overheadMsec);
}
// Run the "healthcheck" preset: quick bandwidth sanity checks for MI300-series
// GPUs. Executes CPU->GPU read, GPU->CPU write, bidirectional CPU<->GPU, and
// all-to-all XGMI tests, comparing measured bandwidth against pass limits that
// may be overridden via LIMIT_UDIR / LIMIT_BDIR / LIMIT_A2A.
// Exits the process with code 1 if any test fails, 0 otherwise.
// - ev: environment settings (taken by value intentionally: single-stream mode
//       and GFX unroll are overridden locally without affecting the caller)
void RunHealthCheck(EnvVars ev)
{
  // Check for supported platforms
#if defined(__NVCC__)
  printf("[WARN] healthcheck preset not supported on NVIDIA hardware\n");
  return;
#else
  bool hasFail = false;

  // Force use of single stream
  ev.useSingleStream = 1;

  // Require every device to be MI300-series (gfx940/941/942)
  for (int gpuId = 0; gpuId < ev.numGpuDevices; gpuId++) {
    hipDeviceProp_t prop;
    HIP_CALL(hipGetDeviceProperties(&prop, gpuId));
    std::string fullName = prop.gcnArchName;
    // Strip feature suffix (e.g. "gfx942:sramecc+:xnack-" -> "gfx942")
    std::string archName = fullName.substr(0, fullName.find(':'));
    if (!(archName == "gfx940" || archName == "gfx941" || archName == "gfx942"))
    {
      printf("[WARN] healthcheck preset is currently only supported on MI300 series hardware\n");
      exit(1);
    }
  }

  // Pass limits in GB/s (95% of nominal link bandwidth)
  // NOTE(review): the (int) cast truncates the defaults to whole numbers
  // (e.g. 48 * 0.95 -> 45, not 45.6) — confirm this is intentional
  double udirLimit = getenv("LIMIT_UDIR") ? atof(getenv("LIMIT_UDIR")) : (int)(48 * 0.95);
  double bdirLimit = getenv("LIMIT_BDIR") ? atof(getenv("LIMIT_BDIR")) : (int)(96 * 0.95);
  double a2aLimit  = getenv("LIMIT_A2A")  ? atof(getenv("LIMIT_A2A"))  : (int)(45 * 0.95);

  // Run unidirectional read from CPU to GPU
  printf("Testing unidirectional reads from CPU ");
  {
    std::vector<std::pair<int, double>> fails;
    for (int gpuId = 0; gpuId < ev.numGpuDevices; gpuId++) {
      printf("."); fflush(stdout);

      // Single GFX-executed Transfer reading from the closest NUMA node (no dst)
      std::vector<Transfer> transfers(1);
      Transfer& t = transfers[0];
      t.exeType = EXE_GPU_GFX;
      t.exeIndex = gpuId;
      t.numBytes = 64*1024*1024;
      t.numBytesActual = 64*1024*1024;
      t.numSrcs = 1;
      t.srcType.push_back(MEM_CPU);
      t.srcIndex.push_back(GetClosestNumaNode(gpuId));
      t.numDsts = 0;
      t.dstType.clear();
      t.dstIndex.clear();

      // Sweep a small range of CU counts; pass as soon as any count hits the limit
      bool passed = false;
      double bestResult = 0;
      for (int cu = 7; cu <= 10; cu++) {
        t.numSubExecs = cu;
        ExecuteTransfersImpl(ev, transfers);  // result recorded in t.transferBandwidth
        bestResult = std::max(bestResult, t.transferBandwidth);
        if (t.transferBandwidth >= udirLimit) {
          passed = true;
          break;
        }
      }
      if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
    }
    if (fails.size() == 0) {
      printf("PASS\n");
    } else {
      hasFail = true;
      printf("FAIL (%zu test(s))\n", fails.size());
      for (auto const& p : fails) {
        printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, udirLimit);
      }
    }
  }

  // Run unidirectional write from GPU to CPU
  printf("Testing unidirectional writes to CPU ");
  {
    std::vector<std::pair<int, double>> fails;
    for (int gpuId = 0; gpuId < ev.numGpuDevices; gpuId++) {
      printf("."); fflush(stdout);

      // Single GFX-executed Transfer writing to the closest NUMA node (no src)
      std::vector<Transfer> transfers(1);
      Transfer& t = transfers[0];
      t.exeType = EXE_GPU_GFX;
      t.exeIndex = gpuId;
      t.numBytes = 64*1024*1024;
      t.numBytesActual = 64*1024*1024;
      t.numDsts = 1;
      t.dstType.push_back(MEM_CPU);
      t.dstIndex.push_back(GetClosestNumaNode(gpuId));
      t.numSrcs = 0;
      t.srcType.clear();
      t.srcIndex.clear();

      // Sweep a small range of CU counts; pass as soon as any count hits the limit
      bool passed = false;
      double bestResult = 0;
      for (int cu = 7; cu <= 10; cu++) {
        t.numSubExecs = cu;
        ExecuteTransfersImpl(ev, transfers);  // result recorded in t.transferBandwidth
        bestResult = std::max(bestResult, t.transferBandwidth);
        if (t.transferBandwidth >= udirLimit) {
          passed = true;
          break;
        }
      }
      if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
    }
    if (fails.size() == 0) {
      printf("PASS\n");
    } else {
      hasFail = true;
      printf("FAIL (%zu test(s))\n", fails.size());
      for (auto const& p : fails) {
        printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, udirLimit);
      }
    }
  }

  // Run bidirectional tests (simultaneous read + write against closest NUMA node)
  printf("Testing bidirectional reads + writes ");
  {
    std::vector<std::pair<int, double>> fails;
    for (int gpuId = 0; gpuId < ev.numGpuDevices; gpuId++) {
      printf("."); fflush(stdout);

      // Transfer 0 reads from CPU, Transfer 1 writes to CPU; both on same GPU
      std::vector<Transfer> transfers(2);
      Transfer& t0 = transfers[0];
      Transfer& t1 = transfers[1];

      t0.exeType = EXE_GPU_GFX;
      t0.exeIndex = gpuId;
      t0.numBytes = 64*1024*1024;
      t0.numBytesActual = 64*1024*1024;
      t0.numSrcs = 1;
      t0.srcType.push_back(MEM_CPU);
      t0.srcIndex.push_back(GetClosestNumaNode(gpuId));
      t0.numDsts = 0;
      t0.dstType.clear();
      t0.dstIndex.clear();

      t1.exeType = EXE_GPU_GFX;
      t1.exeIndex = gpuId;
      t1.numBytes = 64*1024*1024;
      t1.numBytesActual = 64*1024*1024;
      t1.numDsts = 1;
      t1.dstType.push_back(MEM_CPU);
      t1.dstIndex.push_back(GetClosestNumaNode(gpuId));
      t1.numSrcs = 0;
      t1.srcType.clear();
      t1.srcIndex.clear();

      // Sweep CU counts; pass criteria is the combined (read + write) bandwidth
      bool passed = false;
      double bestResult = 0;
      for (int cu = 7; cu <= 10; cu++) {
        t0.numSubExecs = cu;
        t1.numSubExecs = cu;
        ExecuteTransfersImpl(ev, transfers);
        double sum = t0.transferBandwidth + t1.transferBandwidth;
        bestResult = std::max(bestResult, sum);
        if (sum >= bdirLimit) {
          passed = true;
          break;
        }
      }
      if (!passed) fails.push_back(std::make_pair(gpuId, bestResult));
    }
    if (fails.size() == 0) {
      printf("PASS\n");
    } else {
      hasFail = true;
      printf("FAIL (%zu test(s))\n", fails.size());
      for (auto const& p : fails) {
        printf(" GPU %02d: Measured: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first, p.second, bdirLimit);
      }
    }
  }

  // Run XGMI tests: every GPU copies to every other GPU concurrently
  printf("Testing all-to-all XGMI copies "); fflush(stdout);
  {
    ev.gfxUnroll = 2;
    std::vector<Transfer> transfers;
    for (int i = 0; i < ev.numGpuDevices; i++) {
      for (int j = 0; j < ev.numGpuDevices; j++) {
        if (i == j) continue;
        // Fine-grained GPU-to-GPU copy executed by the source GPU with 8 CUs
        Transfer t;
        t.exeType = EXE_GPU_GFX;
        t.exeIndex = i;
        t.numBytes = t.numBytesActual = 64*1024*1024;
        t.numSrcs = 1;
        t.numDsts = 1;
        t.numSubExecs = 8;
        t.srcType.push_back(MEM_GPU_FINE);
        t.dstType.push_back(MEM_GPU_FINE);
        t.srcIndex.push_back(i);
        t.dstIndex.push_back(j);
        transfers.push_back(t);
      }
    }
    ExecuteTransfersImpl(ev, transfers);  // per-link results recorded in each Transfer

    // Check each GPU-GPU link against the per-link limit.
    // transferIdx tracks the (i,j) ordering used when building 'transfers' above
    std::vector<std::pair<std::pair<int,int>, double>> fails;
    int transferIdx = 0;
    for (int i = 0; i < ev.numGpuDevices; i++) {
      printf("."); fflush(stdout);
      for (int j = 0; j < ev.numGpuDevices; j++) {
        if (i == j) continue;
        Transfer const& t = transfers[transferIdx];
        if (t.transferBandwidth < a2aLimit) {
          fails.push_back(std::make_pair(std::make_pair(i,j), t.transferBandwidth));
        }
        transferIdx++;
      }
    }
    if (fails.size() == 0) {
      printf("PASS\n");
    } else {
      hasFail = true;
      printf("FAIL (%zu test(s))\n", fails.size());
      for (auto const& p : fails) {
        printf(" GPU %02d to GPU %02d: %6.2f GB/s Criteria: %6.2f GB/s\n", p.first.first, p.first.second, p.second, a2aLimit);
      }
    }
  }

  exit(hasFail ? 1 : 0);
#endif
}
......@@ -29,7 +29,7 @@ THE SOFTWARE.
#include "Compatibility.hpp"
#include "Kernels.hpp"
#define TB_VERSION "1.50"
#define TB_VERSION "1.51"
extern char const MemTypeStr[];
extern char const ExeTypeStr[];
......@@ -84,9 +84,12 @@ public:
int gfxUnroll; // GFX-kernel unroll factor
int gfxWaveOrder; // GFX-kernel wavefront ordering
int hideEnv; // Skip printing environment variable
int minNumVarSubExec; // Minimum # of subexecutors to use for variable subExec Transfers
int maxNumVarSubExec; // Maximum # of subexecutors to use for variable subExec Transfers (0 to use device limit)
int numCpuDevices; // Number of CPU devices to use (defaults to # NUMA nodes detected)
int numGpuDevices; // Number of GPU devices to use (defaults to # HIP devices detected)
int numIterations; // Number of timed iterations to perform. If negative, run for -numIterations seconds instead
int numSubIterations; // Number of subiterations to perform
int numWarmups; // Number of un-timed warmup iterations to perform
int outputToCsv; // Output in CSV format
int samplingFactor; // Affects how many different values of N are generated (when N set to 0)
......@@ -188,9 +191,12 @@ public:
gfxUnroll = GetEnvVar("GFX_UNROLL" , defaultGfxUnroll);
gfxWaveOrder = GetEnvVar("GFX_WAVE_ORDER" , 0);
hideEnv = GetEnvVar("HIDE_ENV" , 0);
minNumVarSubExec = GetEnvVar("MIN_VAR_SUBEXEC" , 1);
maxNumVarSubExec = GetEnvVar("MAX_VAR_SUBEXEC" , 0);
numCpuDevices = GetEnvVar("NUM_CPU_DEVICES" , numDetectedCpus);
numGpuDevices = GetEnvVar("NUM_GPU_DEVICES" , numDetectedGpus);
numIterations = GetEnvVar("NUM_ITERATIONS" , DEFAULT_NUM_ITERATIONS);
numSubIterations = GetEnvVar("NUM_SUBITERATIONS" , 1);
numWarmups = GetEnvVar("NUM_WARMUPS" , DEFAULT_NUM_WARMUPS);
outputToCsv = GetEnvVar("OUTPUT_TO_CSV" , 0);
samplingFactor = GetEnvVar("SAMPLING_FACTOR" , DEFAULT_SAMPLING_FACTOR);
......@@ -299,6 +305,24 @@ public:
}
else fillPattern.clear();
// Figure out number of xccs per device
int maxNumXccs = 64;
xccIdsPerDevice.resize(numGpuDevices);
for (int i = 0; i < numGpuDevices; i++)
{
int* data;
HIP_CALL(hipSetDevice(i));
HIP_CALL(hipHostMalloc((void**)&data, maxNumXccs * sizeof(int)));
CollectXccIdsKernel<<<maxNumXccs, 1>>>(data);
HIP_CALL(hipDeviceSynchronize());
xccIdsPerDevice[i].clear();
for (int j = 0; j < maxNumXccs; j++)
xccIdsPerDevice[i].insert(data[j]);
HIP_CALL(hipHostFree(data));
}
// Check for CU mask
cuMask.clear();
char* cuMaskStr = getenv("CU_MASK");
......@@ -308,6 +332,7 @@ public:
printf("[WARN] CU_MASK is not supported in CUDA\n");
#else
std::vector<std::pair<int, int>> ranges;
int numXccs = (xccIdsPerDevice.size() > 0 ? xccIdsPerDevice[0].size() : 1);
int maxCU = 0;
char* token = strtok(cuMaskStr, ",");
while (token)
......@@ -330,36 +355,22 @@ public:
}
token = strtok(NULL, ",");
}
cuMask.resize(maxCU / 32 + 1, 0);
cuMask.resize(2 * numXccs, 0);
for (auto range : ranges)
{
for (int i = range.first; i <= range.second; i++)
{
cuMask[i / 32] |= (1 << (i % 32));
for (int x = 0; x < numXccs; x++)
{
int targetBit = i * numXccs + x;
cuMask[targetBit/32] |= (1<<(targetBit%32));
}
}
}
#endif
}
// Figure out number of xccs per device
int maxNumXccs = 64;
xccIdsPerDevice.resize(numGpuDevices);
for (int i = 0; i < numGpuDevices; i++)
{
int* data;
HIP_CALL(hipSetDevice(i));
HIP_CALL(hipHostMalloc((void**)&data, maxNumXccs * sizeof(int)));
CollectXccIdsKernel<<<maxNumXccs, 1>>>(data);
HIP_CALL(hipDeviceSynchronize());
xccIdsPerDevice[i].clear();
for (int j = 0; j < maxNumXccs; j++)
xccIdsPerDevice[i].insert(data[j]);
HIP_CALL(hipHostFree(data));
}
// Parse preferred XCC table (if provided
prefXccTable.resize(numGpuDevices);
for (int i = 0; i < numGpuDevices; i++)
......@@ -429,6 +440,11 @@ public:
printf("[ERROR] BLOCK_ORDER must be 0 (Sequential), 1 (Interleaved), or 2 (Random)\n");
exit(1);
}
if (minNumVarSubExec < 1)
{
printf("[ERROR] Minimum number of subexecutors for variable subexector transfers must be at least 1\n");
exit(1);
}
if (numWarmups < 0)
{
printf("[ERROR] NUM_WARMUPS must be set to a non-negative number\n");
......@@ -524,8 +540,10 @@ public:
// Determine how many CPUs exit per NUMA node (to avoid executing on NUMA without CPUs)
numCpusPerNuma.resize(numDetectedCpus);
int const totalCpus = numa_num_configured_cpus();
for (int i = 0; i < totalCpus; i++)
numCpusPerNuma[numa_node_of_cpu(i)]++;
for (int i = 0; i < totalCpus; i++) {
int node = numa_node_of_cpu(i);
if (node >= 0) numCpusPerNuma[node]++;
}
// Build array of wall clock rates per GPU device
wallClockPerDeviceMhz.resize(numDetectedGpus);
......@@ -583,9 +601,12 @@ public:
printf(" GFX_SINGLE_TEAM - Have subexecutors work together on full array instead of working on individual disjoint subarrays\n");
printf(" GFX_WAVE_ORDER - Stride pattern for GFX kernel (0=UWC,1=UCW,2=WUC,3=WCU,4=CUW,5=CWU)\n");
printf(" HIDE_ENV - Hide environment variable value listing\n");
printf(" MIN_VAR_SUBEXEC - Minumum # of subexecutors to use for variable subExec Transfers\n");
printf(" MAX_VAR_SUBEXEC - Maximum # of subexecutors to use for variable subExec Transfers (0 for device limits)\n");
printf(" NUM_CPU_DEVICES=X - Restrict number of CPUs to X. May not be greater than # detected NUMA nodes\n");
printf(" NUM_GPU_DEVICES=X - Restrict number of GPUs to X. May not be greater than # detected HIP devices\n");
printf(" NUM_ITERATIONS=I - Perform I timed iteration(s) per test\n");
printf(" NUM_SUBITERATIONS=S - Perform S sub-iteration(s) per iteration. Must be non-negative\n");
printf(" NUM_WARMUPS=W - Perform W untimed warmup iteration(s) per test\n");
printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n");
printf(" SAMPLING_FACTOR=F - Add F samples (when possible) between powers of 2 when auto-generating data sizes\n");
......@@ -649,6 +670,12 @@ public:
gfxWaveOrder == 3 ? "Wavefront,CU,Unroll" :
gfxWaveOrder == 4 ? "CU,Unroll,Wavefront" :
"CU,Wavefront,Unroll")));
PRINT_EV("MIN_VAR_SUBEXEC", minNumVarSubExec,
std::string("Using at least ") + std::to_string(minNumVarSubExec) + " subexecutor(s) for variable subExec tranfers");
PRINT_EV("MAX_VAR_SUBEXEC", maxNumVarSubExec,
maxNumVarSubExec ?
std::string("Using at most ") + std::to_string(maxNumVarSubExec) + " subexecutor(s) for variable subExec tranfers" :
"Using up to maximum device subexecutors for variable subExec tranfers");
PRINT_EV("NUM_CPU_DEVICES", numCpuDevices,
std::string("Using ") + std::to_string(numCpuDevices) + " CPU devices");
PRINT_EV("NUM_GPU_DEVICES", numGpuDevices,
......@@ -656,6 +683,8 @@ public:
PRINT_EV("NUM_ITERATIONS", numIterations,
std::string("Running ") + std::to_string(numIterations > 0 ? numIterations : -numIterations) + " "
+ (numIterations > 0 ? " timed iteration(s)" : "seconds(s) per Test"));
PRINT_EV("NUM_SUBITERATIONS", numSubIterations,
std::string("Running ") + (numSubIterations == 0 ? "infinite" : std::to_string(numSubIterations)) + " subiterations");
PRINT_EV("NUM_WARMUPS", numWarmups,
std::string("Running " + std::to_string(numWarmups) + " warmup iteration(s) per Test"));
PRINT_EV("SHARED_MEM_BYTES", sharedMemBytes,
......@@ -828,36 +857,27 @@ public:
std::string GetCuMaskDesc() const
{
std::vector<std::pair<int, int>> runs;
int numXccs = (xccIdsPerDevice.size() > 0 ? xccIdsPerDevice[0].size() : 1);
bool inRun = false;
std::pair<int, int> curr;
int used = 0;
for (int i = 0; i < cuMask.size(); i++)
{
for (int j = 0; j < 32; j++)
{
if (cuMask[i] & (1 << j))
{
used++;
if (!inRun)
{
inRun = true;
curr.first = i * 32 + j;
}
for (int targetBit = 0; targetBit < cuMask.size() * 32; targetBit += numXccs) {
if (cuMask[targetBit/32] & (1 << (targetBit%32))) {
used++;
if (!inRun) {
inRun = true;
curr.first = targetBit / numXccs;
}
else
{
if (inRun)
{
inRun = false;
curr.second = i * 32 + j - 1;
runs.push_back(curr);
}
} else {
if (inRun) {
inRun = false;
curr.second = targetBit / numXccs - 1;
runs.push_back(curr);
}
}
}
if (inRun)
curr.second = cuMask.size() * 32 - 1;
curr.second = (cuMask.size() * 32) / numXccs - 1;
std::string result = "CUs used: (" + std::to_string(used) + ") ";
for (int i = 0; i < runs.size(); i++)
......
......@@ -174,7 +174,7 @@ template <> __device__ __forceinline__ float4 MemsetVal(){ return make
template <int BLOCKSIZE, int UNROLL>
__global__ void __launch_bounds__(BLOCKSIZE)
GpuReduceKernel(SubExecParam* params, int waveOrder)
GpuReduceKernel(SubExecParam* params, int waveOrder, int numSubIterations)
{
int64_t startCycle;
if (threadIdx.x == 0) startCycle = GetTimestamp();
......@@ -216,84 +216,88 @@ __global__ void __launch_bounds__(BLOCKSIZE)
case 5: /* C,W,U */ teamStride = 1; waveStride = nTeams; unrlStride = nTeams * nWaves; teamStride2 = 1; waveStride2 = nTeams; break;
}
// First loop: Each wavefront in the team works on UNROLL float4s per thread
size_t const loop1Stride = nTeams * nWaves * UNROLL * warpSize;
size_t const loop1Limit = numFloat4 / loop1Stride * loop1Stride;
{
float4 val[UNROLL];
if (numSrcs == 0)
int subIterations = 0;
while (1) {
// First loop: Each wavefront in the team works on UNROLL float4s per thread
size_t const loop1Stride = nTeams * nWaves * UNROLL * warpSize;
size_t const loop1Limit = numFloat4 / loop1Stride * loop1Stride;
{
#pragma unroll
for (int u = 0; u < UNROLL; u++)
val[u] = MemsetVal<float4>();
}
for (size_t idx = (teamIdx * teamStride + waveIdx * waveStride) * warpSize + tIdx; idx < loop1Limit; idx += loop1Stride)
{
// Read sources into memory and accumulate in registers
if (numSrcs)
{
for (int u = 0; u < UNROLL; u++)
val[u] = srcFloat4[0][idx + u * unrlStride * warpSize];
for (int s = 1; s < numSrcs; s++)
for (int u = 0; u < UNROLL; u++)
val[u] += srcFloat4[s][idx + u * unrlStride * warpSize];
}
// Write accumulation to all outputs
for (int d = 0; d < numDsts; d++)
{
float4 val[UNROLL];
if (numSrcs == 0) {
#pragma unroll
for (int u = 0; u < UNROLL; u++)
dstFloat4[d][idx + u * unrlStride * warpSize] = val[u];
val[u] = MemsetVal<float4>();
}
}
}
// Second loop: Deal with remaining float4s
{
if (loop1Limit < numFloat4)
{
float4 val;
if (numSrcs == 0) val = MemsetVal<float4>();
size_t const loop2Stride = nTeams * nWaves * warpSize;
for (size_t idx = loop1Limit + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx; idx < numFloat4; idx += loop2Stride)
for (size_t idx = (teamIdx * teamStride + waveIdx * waveStride) * warpSize + tIdx; idx < loop1Limit; idx += loop1Stride)
{
// Read sources into memory and accumulate in registers
if (numSrcs)
{
val = srcFloat4[0][idx];
for (int u = 0; u < UNROLL; u++)
val[u] = srcFloat4[0][idx + u * unrlStride * warpSize];
for (int s = 1; s < numSrcs; s++)
val += srcFloat4[s][idx];
for (int u = 0; u < UNROLL; u++)
val[u] += srcFloat4[s][idx + u * unrlStride * warpSize];
}
// Write accumulation to all outputs
for (int d = 0; d < numDsts; d++)
dstFloat4[d][idx] = val;
{
#pragma unroll
for (int u = 0; u < UNROLL; u++)
dstFloat4[d][idx + u * unrlStride * warpSize] = val[u];
}
}
}
}
// Third loop; Deal with remaining floats
{
if (numFloat4 * 4 < p.N)
// Second loop: Deal with remaining float4s
{
float val;
if (numSrcs == 0) val = MemsetVal<float>();
size_t const loop3Stride = nTeams * nWaves * warpSize;
for( size_t idx = numFloat4 * 4 + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx; idx < p.N; idx += loop3Stride)
if (loop1Limit < numFloat4)
{
if (numSrcs)
float4 val;
if (numSrcs == 0) val = MemsetVal<float4>();
size_t const loop2Stride = nTeams * nWaves * warpSize;
for (size_t idx = loop1Limit + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx; idx < numFloat4; idx += loop2Stride)
{
val = p.src[0][idx];
for (int s = 1; s < numSrcs; s++)
val += p.src[s][idx];
if (numSrcs)
{
val = srcFloat4[0][idx];
for (int s = 1; s < numSrcs; s++)
val += srcFloat4[s][idx];
}
for (int d = 0; d < numDsts; d++)
dstFloat4[d][idx] = val;
}
}
}
for (int d = 0; d < numDsts; d++)
p.dst[d][idx] = val;
// Third loop; Deal with remaining floats
{
if (numFloat4 * 4 < p.N)
{
float val;
if (numSrcs == 0) val = MemsetVal<float>();
size_t const loop3Stride = nTeams * nWaves * warpSize;
for( size_t idx = numFloat4 * 4 + (teamIdx * teamStride2 + waveIdx * waveStride2) * warpSize + tIdx; idx < p.N; idx += loop3Stride)
{
if (numSrcs)
{
val = p.src[0][idx];
for (int s = 1; s < numSrcs; s++)
val += p.src[s][idx];
}
for (int d = 0; d < numDsts; d++)
p.dst[d][idx] = val;
}
}
}
if (++subIterations == numSubIterations) break;
}
// Wait for all threads to finish
......@@ -308,7 +312,7 @@ __global__ void __launch_bounds__(BLOCKSIZE)
}
}
typedef void (*GpuKernelFuncPtr)(SubExecParam*, int);
typedef void (*GpuKernelFuncPtr)(SubExecParam*, int, int);
#define GPU_KERNEL_UNROLL_DECL(BLOCKSIZE) \
{GpuReduceKernel<BLOCKSIZE, 1>, \
......
......@@ -158,6 +158,25 @@ struct ExecutorInfo
double totalTime;
};
// Aggregated per-Executor results for a single test run.
// One ExeResult is produced for each (ExeType, index) Executor that
// participated (see TestResults::exeResults below in this header).
struct ExeResult
{
  double bandwidthGbs;           // Effective bandwidth achieved by this Executor, in GB/s
  double durationMsec;           // Wall-clock duration of this Executor's work, in milliseconds
  double sumBandwidthGbs;        // Sum of the individual Transfer bandwidths for this Executor, in GB/s
  size_t totalBytes;             // Total number of bytes moved by this Executor
  std::vector<int> transferIdx;  // Indices into the Transfer list identifying which Transfers
                                 // this Executor ran — presumably indices into the vector passed
                                 // to ExecuteTransfersImpl; verify against caller
};
// Overall results for one invocation of ExecuteTransfersImpl: whole-run
// totals plus a per-Executor breakdown keyed by (ExeType, device index).
struct TestResults
{
  size_t numTimedIterations;     // Number of timed iterations actually performed
  size_t totalBytesTransferred;  // Total bytes moved across all Transfers/Executors
  double totalBandwidthCpu;      // Aggregate bandwidth as measured by CPU timing, in GB/s
  double totalDurationMsec;      // Total wall-clock duration of the timed run, in milliseconds
  double overheadMsec;           // Non-transfer overhead time (e.g. launch/sync), in milliseconds
                                 // — NOTE(review): exact accounting not visible here; confirm
  // Per-Executor results; the key pair is (executor type, executor index)
  // — equivalent to the Executor typedef declared below in this header
  std::map<std::pair<ExeType, int>, ExeResult> exeResults;
};
typedef std::pair<ExeType, int> Executor;
typedef std::map<Executor, ExecutorInfo> TransferMap;
......@@ -179,7 +198,8 @@ void ParseTransfers(EnvVars const& ev, char* line, std::vector<Transfer>& transf
void ExecuteTransfers(EnvVars const& ev, int const testNum, size_t const N,
std::vector<Transfer>& transfers, bool verbose = true,
double* totalBandwidthCpu = nullptr);
TestResults ExecuteTransfersImpl(EnvVars const& ev, std::vector<Transfer>& transfers);
void ReportResults(EnvVars const& ev, std::vector<Transfer> const& transfers, TestResults const results);
void EnablePeerAccess(int const deviceId, int const peerDeviceId);
void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr);
void DeallocateMemory(MemType memType, void* memPtr, size_t const size = 0);
......@@ -192,6 +212,7 @@ void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, i
void RunSchmooBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int const localIdx, int const remoteIdx, int const maxSubExecs);
void RunRemoteWriteBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus);
void RunParallelCopyBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus);
void RunHealthCheck(EnvVars ev);
std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment