Unverified commit bbd72a6c, authored by gilbertlee-amd and committed by GitHub
Browse files

TransferBench v1.66 - Multi-Rank support (#224)

* Adding System singleton to support multi-node (communication and topology)
* Adding multi-node parsing, rank and device wildcard expansion
* Adding multi-node topology, and various support functions
* Adding multi-node consistency validation of Config and Transfers
* Introducing SINGLE_KERNEL=1 to Makefile to speed up compilation during development
* Updating CHANGELOG.  Overhauling wildcard parsing.  Adding dryrun
* Client refactoring.  Introduction of tabular formatted results and a2a multi-rank preset
* Adding MPI support into CMakeFiles
* Cleaning up multi-node topology using TableHelper
* Reducing compile time by removing some kernel variants
* Updating documentation.  Adding nicrings preset
* Adding NIC_FILTER to allow NIC device filtering via regex
* Updating supported memory types
* Fixing P2P preset, and adding some extra memIndex utility functions
parent 26717d50
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
......@@ -19,15 +19,20 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
void SchmooPreset(EnvVars& ev,
size_t const numBytesPerTransfer,
std::string const presetName)
int SchmooPreset(EnvVars& ev,
size_t const numBytesPerTransfer,
std::string const presetName)
{
if (TransferBench::GetNumRanks() > 1) {
Utils::Print("[ERROR] Schmoo preset currently not supported for multi-node\n");
return 1;
}
int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
if (numDetectedGpus < 2) {
printf("[ERROR] Schmoo benchmark requires at least 2 GPUs\n");
exit(1);
return 1;
}
// Collect env vars for this preset
......@@ -53,7 +58,7 @@ void SchmooPreset(EnvVars& ev,
// Validate env vars
if (localIdx >= numDetectedGpus || remoteIdx >= numDetectedGpus) {
printf("[ERROR] Cannot execute schmoo test with local GPU device %d, remote GPU device %d\n", localIdx, remoteIdx);
exit(1);
return 1;
}
TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
......@@ -85,18 +90,18 @@ void SchmooPreset(EnvVars& ev,
// Local Read
t.srcs = {{memType, localIdx}};
t.dsts = {};
if (!RunTransfers(cfg, transfers, results)) {
PrintErrors(results.errResults);
exit(1);
if (!TransferBench::RunTransfers(cfg, transfers, results)) {
Utils::PrintErrors(results.errResults);
return 1;
}
double const localRead = results.tfrResults[0].avgBandwidthGbPerSec;
// Local Write
t.srcs = {};
t.dsts = {{memType, localIdx}};
if (!RunTransfers(cfg, transfers, results)) {
PrintErrors(results.errResults);
exit(1);
if (!TransferBench::RunTransfers(cfg, transfers, results)) {
Utils::PrintErrors(results.errResults);
return 1;
}
double const localWrite = results.tfrResults[0].avgBandwidthGbPerSec;
......@@ -105,40 +110,41 @@ void SchmooPreset(EnvVars& ev,
t.dsts = {{memType, localIdx}};
t.srcs = {};
t.dsts = {{memType, localIdx}};
if (!RunTransfers(cfg, transfers, results)) {
PrintErrors(results.errResults);
exit(1);
if (!TransferBench::RunTransfers(cfg, transfers, results)) {
Utils::PrintErrors(results.errResults);
return 1;
}
double const localCopy = results.tfrResults[0].avgBandwidthGbPerSec;
// Remote Read
t.srcs = {{memType, remoteIdx}};
t.dsts = {};
if (!RunTransfers(cfg, transfers, results)) {
PrintErrors(results.errResults);
exit(1);
if (!TransferBench::RunTransfers(cfg, transfers, results)) {
Utils::PrintErrors(results.errResults);
return 1;
}
double const remoteRead = results.tfrResults[0].avgBandwidthGbPerSec;
// Remote Write
t.srcs = {};
t.dsts = {{memType, remoteIdx}};
if (!RunTransfers(cfg, transfers, results)) {
PrintErrors(results.errResults);
exit(1);
if (!TransferBench::RunTransfers(cfg, transfers, results)) {
Utils::PrintErrors(results.errResults);
return 1;
}
double const remoteWrite = results.tfrResults[0].avgBandwidthGbPerSec;
// Remote Copy
t.srcs = {{memType, localIdx}};
t.dsts = {{memType, remoteIdx}};
if (!RunTransfers(cfg, transfers, results)) {
PrintErrors(results.errResults);
exit(1);
if (!TransferBench::RunTransfers(cfg, transfers, results)) {
Utils::PrintErrors(results.errResults);
return 1;
}
double const remoteCopy = results.tfrResults[0].avgBandwidthGbPerSec;
printf(" %3d %11.3f %11.3f %11.3f %11.3f %11.3f %11.3f \n",
numCUs, localRead, localWrite, localCopy, remoteRead, remoteWrite, remoteCopy);
}
return 0;
}
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
......@@ -28,9 +28,9 @@ void LogTransfers(FILE *fp, int const testNum, std::vector<Transfer> const& tran
for (auto const& transfer : transfers)
{
fprintf(fp, " (%s->%c%d->%s %d %lu)",
MemDevicesToStr(transfer.srcs).c_str(),
Utils::MemDevicesToStr(transfer.srcs).c_str(),
ExeTypeStr[transfer.exeDevice.exeType], transfer.exeDevice.exeIndex,
MemDevicesToStr(transfer.dsts).c_str(),
Utils::MemDevicesToStr(transfer.dsts).c_str(),
transfer.numSubExecs,
transfer.numBytes);
}
......@@ -39,10 +39,15 @@ void LogTransfers(FILE *fp, int const testNum, std::vector<Transfer> const& tran
}
}
void SweepPreset(EnvVars& ev,
size_t const numBytesPerTransfer,
std::string const presetName)
int SweepPreset(EnvVars& ev,
size_t const numBytesPerTransfer,
std::string const presetName)
{
if (TransferBench::GetNumRanks() > 1) {
Utils::Print("[ERROR] Sweep preset currently not supported for multi-node\n");
return 1;
}
bool const isRandom = (presetName == "rsweep");
int numDetectedCpus = TransferBench::GetNumExecutors(EXE_CPU);
......@@ -98,33 +103,33 @@ void SweepPreset(EnvVars& ev,
for (auto ch : sweepSrc) {
if (!strchr(MemTypeStr, ch)) {
printf("[ERROR] Unrecognized memory type '%c' specified for sweep source\n", ch);
exit(1);
return 1;
}
if (strchr(sweepSrc.c_str(), ch) != strrchr(sweepSrc.c_str(), ch)) {
printf("[ERROR] Duplicate memory type '%c' specified for sweep source\n", ch);
exit(1);
return 1;
}
}
for (auto ch : sweepDst) {
if (!strchr(MemTypeStr, ch)) {
printf("[ERROR] Unrecognized memory type '%c' specified for sweep destination\n", ch);
exit(1);
return 1;
}
if (strchr(sweepDst.c_str(), ch) != strrchr(sweepDst.c_str(), ch)) {
printf("[ERROR] Duplicate memory type '%c' specified for sweep destination\n", ch);
exit(1);
return 1;
}
}
for (auto ch : sweepExe) {
if (!strchr(ExeTypeStr, ch)) {
printf("[ERROR] Unrecognized executor type '%c' specified for sweep executor\n", ch);
exit(1);
return 1;
}
if (strchr(sweepExe.c_str(), ch) != strrchr(sweepExe.c_str(), ch)) {
printf("[ERROR] Duplicate executor type '%c' specified for sweep executor\n", ch);
exit(1);
return 1;
}
}
......@@ -273,7 +278,7 @@ void SweepPreset(EnvVars& ev,
if (sweepMin > numPossible) {
printf("No valid test configurations exist\n");
return;
return 0;
}
if (ev.outputToCsv) {
......@@ -333,10 +338,10 @@ void SweepPreset(EnvVars& ev,
LogTransfers(fp, ++numTestsRun, transfers);
if (!TransferBench::RunTransfers(cfg, transfers, results)) {
PrintErrors(results.errResults);
if (!continueOnErr) exit(1);
Utils::PrintErrors(results.errResults);
if (!continueOnErr) return 1;
} else {
PrintResults(ev, numTestsRun, transfers, results);
Utils::PrintResults(ev, numTestsRun, transfers, results);
}
// Check for test limit
......@@ -366,4 +371,5 @@ void SweepPreset(EnvVars& ev,
}
}
if (fp) fclose(fp);
return 0;
}
/*
Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
......@@ -23,6 +23,7 @@ THE SOFTWARE.
#pragma once
#include "TransferBench.hpp"
#include "Utilities.hpp"
static int RemappedCpuIndex(int origIdx)
{
......@@ -63,15 +64,15 @@ static void PrintNicToGPUTopo(bool outputToCsv)
ibvDeviceList[i].busId.c_str(),
ibvDeviceList[i].numaNode,
closestGpusStr.c_str(),
ibvDeviceList[i].isRoce && ibvDeviceList[i].hasActivePort? std::to_string(ibvDeviceList[i].gidIndex).c_str() : "N/A",
ibvDeviceList[i].isRoce && ibvDeviceList[i].hasActivePort? ibvDeviceList[i].gidDescriptor.c_str() : "N/A"
ibvDeviceList[i].isRoce && ibvDeviceList[i].hasActivePort ? std::to_string(ibvDeviceList[i].gidIndex).c_str() : "N/A",
ibvDeviceList[i].isRoce && ibvDeviceList[i].hasActivePort ? ibvDeviceList[i].gidDescriptor.c_str() : "N/A"
);
}
printf("\n");
#endif
}
void DisplayTopology(bool outputToCsv)
void DisplaySingleRankTopology(bool outputToCsv)
{
int numCpus = TransferBench::GetNumExecutors(EXE_CPU);
int numGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
......@@ -140,59 +141,185 @@ void DisplayTopology(bool outputToCsv)
return;
#else
// Print headers
if (!outputToCsv) {
printf(" |");
for (int j = 0; j < numGpus; j++) {
hipDeviceProp_t prop;
HIP_CALL(hipGetDeviceProperties(&prop, j));
std::string fullName = prop.gcnArchName;
std::string archName = fullName.substr(0, fullName.find(':'));
printf(" %6s |", archName.c_str());
if (numGpus > 0) {
if (!outputToCsv) {
printf(" |");
for (int j = 0; j < numGpus; j++) {
hipDeviceProp_t prop;
HIP_CALL(hipGetDeviceProperties(&prop, j));
std::string fullName = prop.gcnArchName;
std::string archName = fullName.substr(0, fullName.find(':'));
printf(" %6s |", archName.c_str());
}
printf("\n");
}
printf("\n");
}
printf(" %c", sep);
for (int j = 0; j < numGpus; j++)
printf(" GPU %02d %c", j, sep);
printf(" PCIe Bus ID %c #CUs %c NUMA %c #DMA %c #XCC %c NIC\n", sep, sep, sep, sep, sep);
printf(" %c", sep);
for (int j = 0; j < numGpus; j++)
printf(" GPU %02d %c", j, sep);
printf(" PCIe Bus ID %c #CUs %c NUMA %c #DMA %c #XCC %c NIC\n", sep, sep, sep, sep, sep);
if (!outputToCsv) {
for (int j = 0; j <= numGpus; j++)
printf("--------+");
printf("--------------+------+------+------+------+------\n");
if (!outputToCsv) {
for (int j = 0; j <= numGpus; j++)
printf("--------+");
printf("--------------+------+------+------+------+------\n");
}
// Loop over each GPU device
for (int i = 0; i < numGpus; i++) {
printf(" GPU %02d %c", i, sep);
// Print off link information
for (int j = 0; j < numGpus; j++) {
if (i == j) {
printf(" N/A %c", sep);
} else {
uint32_t linkType, hopCount;
HIP_CALL(hipExtGetLinkTypeAndHopCount(i, j, &linkType, &hopCount));
printf(" %s-%d %c",
linkType == HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT ? " HT" :
linkType == HSA_AMD_LINK_INFO_TYPE_QPI ? " QPI" :
linkType == HSA_AMD_LINK_INFO_TYPE_PCIE ? "PCIE" :
linkType == HSA_AMD_LINK_INFO_TYPE_INFINBAND ? "INFB" :
linkType == HSA_AMD_LINK_INFO_TYPE_XGMI ? "XGMI" : "????",
hopCount, sep);
}
}
char pciBusId[20];
HIP_CALL(hipDeviceGetPCIBusId(pciBusId, 20, i));
printf(" %-11s %c %-4d %c %-4d %c %-4d %c %-4d %c %-4d\n",
pciBusId, sep,
TransferBench::GetNumSubExecutors({EXE_GPU_GFX, i}), sep,
TransferBench::GetClosestCpuNumaToGpu(i), sep,
TransferBench::GetNumExecutorSubIndices({EXE_GPU_DMA, i}), sep,
TransferBench::GetNumExecutorSubIndices({EXE_GPU_GFX, i}), sep,
TransferBench::GetClosestNicToGpu(i));
}
}
#endif
}
// Loop over each GPU device
for (int i = 0; i < numGpus; i++) {
printf(" GPU %02d %c", i, sep);
// Print off link information
for (int j = 0; j < numGpus; j++) {
if (i == j) {
printf(" N/A %c", sep);
} else {
uint32_t linkType, hopCount;
HIP_CALL(hipExtGetLinkTypeAndHopCount(i, j, &linkType, &hopCount));
printf(" %s-%d %c",
linkType == HSA_AMD_LINK_INFO_TYPE_HYPERTRANSPORT ? " HT" :
linkType == HSA_AMD_LINK_INFO_TYPE_QPI ? " QPI" :
linkType == HSA_AMD_LINK_INFO_TYPE_PCIE ? "PCIE" :
linkType == HSA_AMD_LINK_INFO_TYPE_INFINBAND ? "INFB" :
linkType == HSA_AMD_LINK_INFO_TYPE_XGMI ? "XGMI" : "????",
hopCount, sep);
// Prints the multi-rank topology, one table per entry of Utils::GetRankGroupMap().
//
// For every group this prints a "Group NNN" summary line followed by a 7-column
// table (Rank, Hostname, POD, VID, Executor, Executor Name, #SE). Rank/hostname
// rows and executor rows are written into the same table rows, both starting at
// row 1, so the table height is driven by whichever list is longer.
//
// outputToCsv - forwarded unchanged to TableHelper::PrintTable
// showBorders - forwarded unchanged to TableHelper::PrintTable
void DisplayMultiRankTopology(bool outputToCsv, bool showBorders)
{
Utils::RankGroupMap& groups = Utils::GetRankGroupMap();
// NOTE(review): groups.size() is printed with %lu; if RankGroupMap::size() returns
// size_t, %zu would be the matching specifier - confirm.
printf("%d rank(s) in %lu homogeneous group(s)\n", TransferBench::GetNumRanks(), groups.size());
printf("\n");
// Print off each group
int groupNum = 1;
for (auto const& group : groups) {
// Each map entry pairs a group key (tuple of topology attributes) with the
// list of ranks that share that key
Utils::GroupKey const& key = group.first;
std::vector<int> const& hosts = group.second;
// Unpack the 11 fields of the group key by position (these are copies of the key's members)
std::string ppodId = std::get<0>(key);
int vpodId = std::get<1>(key);
std::vector<std::string> cpuNames = std::get<2>(key);
std::vector<int> cpuSubExecs = std::get<3>(key);
std::vector<std::string> gpuNames = std::get<4>(key);
std::vector<int> gpuSubExecs = std::get<5>(key);
std::vector<int> gpuClosestCpu = std::get<6>(key);
std::vector<std::string> nicNames = std::get<7>(key);
std::vector<int> nicClosestCpu = std::get<8>(key);
std::vector<int> nicClosestGpu = std::get<9>(key);
std::vector<int> nicIsActive = std::get<10>(key);
int numRanks = hosts.size();
int numCpus = cpuNames.size();
int numGpus = gpuNames.size();
int numNics = nicNames.size();
int numExecutors = numCpus + numGpus + numNics;
// Sum of the nicIsActive entries (presumably 0/1 flags - TODO confirm)
int numActiveNics = 0;
for (auto x : nicIsActive) numActiveNics += x;
// Blank line between consecutive groups
if (groupNum > 1) printf("\n");
printf("Group %03d: %d rank(s) %d CPU(s) %d GPU(s) %d NIC(s) (%d active NICs)\n",
groupNum++, numRanks, numCpus, numGpus, numNics, numActiveNics);
// Determine size of table
// One header row plus enough rows for the longer of the rank list and the executor list
int numCols = 7;
int numRows = 1 + std::max(numRanks, numExecutors);
TransferBench::Utils::TableHelper table(numRows, numCols);
// Table borders / alignment
// NOTE(review): the loop bound is col <= numCols, so SetColAlignment is also called
// with index numCols (one past the last column) - confirm TableHelper accepts that.
for (int col = 0; col <= numCols; col++) {
table.DrawColBorder(col);
table.SetColAlignment(col, TransferBench::Utils::TableHelper::ALIGN_LEFT);
}
// Row borders at index 0, 1 and numRows
table.DrawRowBorder(0);
table.DrawRowBorder(1);
table.DrawRowBorder(numRows);
// Table header
table.Set(0, 0, " Rank ");
table.Set(0, 1, " Hostname ");
table.Set(0, 2, " POD ");
table.Set(0, 3, " VID ");
table.Set(0, 4, " Executor ");
table.Set(0, 5, " Executor Name ");
table.Set(0, 6, " #SE ");
// Fill in ranks / hosts
for (int i = 0; i < numRanks; i++) {
int rank = hosts[i];
table.Set(1 + i, 0, " %04d ", rank);
table.Set(1 + i, 1, " %s ", TransferBench::GetHostname(rank).c_str());
}
// Fill in PPOD and VPOD
// Written once, on the first data row only
table.Set(1, 2, " %s ", ppodId.c_str());
table.Set(1, 3, " %d ", vpodId);
// Fill in Executor information
// Executors are listed hierarchically: each CPU, then the GPUs whose closest CPU
// is that CPU, then the NICs whose closest GPU is that GPU
int rowIdx = 1;
for (int cpuIndex = 0; cpuIndex < numCpus; cpuIndex++) {
table.Set(rowIdx, 4, " CPU %02d ", cpuIndex);
table.Set(rowIdx, 5, " %s ", cpuNames[cpuIndex].c_str());
table.Set(rowIdx, 6, " %d ", cpuSubExecs[cpuIndex]);
rowIdx++;
// Loop over each GPU closest to this CPU executor
for (int gpuIndex = 0; gpuIndex < numGpus; gpuIndex++) {
if (gpuClosestCpu[gpuIndex] != cpuIndex) continue;
table.Set(rowIdx, 4, " - GPU %02d ", gpuIndex);
table.Set(rowIdx, 5, " - %s ", gpuNames[gpuIndex].c_str());
table.Set(rowIdx, 6, " %d ", gpuSubExecs[gpuIndex]);
rowIdx++;
// Loop over each NIC closest to this GPU
for (int nicIndex = 0; nicIndex < numNics; nicIndex++) {
if (nicClosestGpu[nicIndex] != gpuIndex) continue;
table.Set(rowIdx, 4, " - NIC %02d ", nicIndex);
// NOTE(review): this format has no trailing space, unlike the CPU-attached NIC row below - confirm intended
table.Set(rowIdx, 5, " - %s", nicNames[nicIndex].c_str());
// For NIC rows the #SE column shows ON/OFF from nicIsActive instead of a count
table.Set(rowIdx, 6, " %s ", nicIsActive[nicIndex] ? "ON" : "OFF");
rowIdx++;
}
}
// Loop over remaining NICs not associated with GPU but associated with this CPU
// (closest-GPU value of -1 and closest-CPU equal to this CPU)
for (int nicIndex = 0; nicIndex < numNics; nicIndex++) {
if (nicClosestGpu[nicIndex] != -1 || nicClosestCpu[nicIndex] != cpuIndex) continue;
table.Set(rowIdx, 4, " - NIC %02d ", nicIndex);
table.Set(rowIdx, 5, " - %s ", nicNames[nicIndex].c_str());
table.Set(rowIdx, 6, " %s ", nicIsActive[nicIndex] ? "ON" : "OFF");
rowIdx++;
}
}
table.PrintTable(outputToCsv, showBorders);
}
// NOTE(review): the next statement (pciBusId / HIP_CALL / printf) uses `i` and `sep`,
// which are not declared in this function; it looks like leftover text from the
// single-rank per-GPU printing loop - confirm and remove if so.
char pciBusId[20];
HIP_CALL(hipDeviceGetPCIBusId(pciBusId, 20, i));
printf(" %-11s %c %-4d %c %-4d %c %-4d %c %-4d %c %-4d\n",
pciBusId, sep,
TransferBench::GetNumSubExecutors({EXE_GPU_GFX, i}), sep,
TransferBench::GetClosestCpuNumaToGpu(i), sep,
TransferBench::GetNumExecutorSubIndices({EXE_GPU_DMA, i}), sep,
TransferBench::GetNumExecutorSubIndices({EXE_GPU_GFX, i}), sep,
TransferBench::GetClosestNicToGpu(i));
// Warn when Utils::HasDuplicateHostname() reports true
if (Utils::HasDuplicateHostname()) {
printf("[WARN] It is recommended to run TransferBench with one rank per host to avoid potential aliasing of executors\n");
}
// NOTE(review): no matching #if is visible inside this function - confirm this #endif belongs here
#endif
}
// Prints the system topology, picking the printer from the number of ranks.
// With more than one rank the multi-rank printer is used and receives both
// flags; otherwise the single-rank printer is used and receives only
// outputToCsv (showBorders is not forwarded on that path).
void DisplayTopology(bool outputToCsv, bool showBorders)
{
  bool const isMultiRank = (GetNumRanks() > 1);
  if (!isMultiRank) {
    DisplaySingleRankTopology(outputToCsv);
    return;
  }
  DisplayMultiRankTopology(outputToCsv, showBorders);
}
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment