Unverified commit bbd72a6c, authored by gilbertlee-amd, committed by GitHub

TransferBench v1.66 - Multi-Rank support (#224)

* Adding System singleton to support multi-node (communication and topology)
* Adding multi-node parsing, rank and device wildcard expansion
* Adding multi-node topology, and various support functions
* Adding multi-node consistency validation of Config and Transfers
* Introducing SINGLE_KERNEL=1 to Makefile to speed up compilation during development
* Updating CHANGELOG.  Overhauling wildcard parsing.  Adding dryrun
* Client refactoring.  Introduction of tabular formatted results and a2a multi-rank preset
* Adding MPI support into CMakeFiles
* Cleaning up multi-node topology using TableHelper
* Reducing compile time by removing some kernel variants
* Updating documentation.  Adding nicrings preset
* Adding NIC_FILTER to allow NIC device filtering via regex (see the sketch just below)
* Updating supported memory types
* Fixing P2P preset, and adding some extra memIndex utility functions
parent 26717d50
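For illustration only (FilterNics is a hypothetical helper, not code from this commit): the NIC_FILTER bullet above describes regex-based NIC device filtering, which could be applied to detected device names roughly as follows:

#include <regex>
#include <string>
#include <vector>

// Hypothetical sketch: keep only NIC names matching the user-supplied pattern,
// e.g. NIC_FILTER="mlx5_[0-3]"
std::vector<std::string> FilterNics(std::vector<std::string> const& nicNames,
                                    std::string const& pattern)
{
  std::regex re(pattern);
  std::vector<std::string> kept;
  for (auto const& name : nicNames)
    if (std::regex_search(name, re)) kept.push_back(name);
  return kept;
}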
/*
Copyright (c) Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -19,15 +19,20 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

int SchmooPreset(EnvVars& ev,
                 size_t const numBytesPerTransfer,
                 std::string const presetName)
{
if (TransferBench::GetNumRanks() > 1) {
Utils::Print("[ERROR] Schmoo preset currently not supported for multi-node\n");
return 1;
}
  int numDetectedGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
  if (numDetectedGpus < 2) {
    printf("[ERROR] Schmoo benchmark requires at least 2 GPUs\n");
    return 1;
  }

  // Collect env vars for this preset
@@ -53,7 +58,7 @@ void SchmooPreset(EnvVars& ev,
  // Validate env vars
  if (localIdx >= numDetectedGpus || remoteIdx >= numDetectedGpus) {
    printf("[ERROR] Cannot execute schmoo test with local GPU device %d, remote GPU device %d\n", localIdx, remoteIdx);
    return 1;
  }

  TransferBench::ConfigOptions cfg = ev.ToConfigOptions();
@@ -85,18 +90,18 @@ void SchmooPreset(EnvVars& ev,
    // Local Read
    t.srcs = {{memType, localIdx}};
    t.dsts = {};
    if (!TransferBench::RunTransfers(cfg, transfers, results)) {
      Utils::PrintErrors(results.errResults);
      return 1;
    }
    double const localRead = results.tfrResults[0].avgBandwidthGbPerSec;

    // Local Write
    t.srcs = {};
    t.dsts = {{memType, localIdx}};
    if (!TransferBench::RunTransfers(cfg, transfers, results)) {
      Utils::PrintErrors(results.errResults);
      return 1;
    }
    double const localWrite = results.tfrResults[0].avgBandwidthGbPerSec;
@@ -105,40 +110,41 @@ void SchmooPreset(EnvVars& ev,
    t.dsts = {{memType, localIdx}};
    t.srcs = {};
    t.dsts = {{memType, localIdx}};
    if (!TransferBench::RunTransfers(cfg, transfers, results)) {
      Utils::PrintErrors(results.errResults);
      return 1;
    }
    double const localCopy = results.tfrResults[0].avgBandwidthGbPerSec;

    // Remote Read
    t.srcs = {{memType, remoteIdx}};
    t.dsts = {};
    if (!TransferBench::RunTransfers(cfg, transfers, results)) {
      Utils::PrintErrors(results.errResults);
      return 1;
    }
    double const remoteRead = results.tfrResults[0].avgBandwidthGbPerSec;

    // Remote Write
    t.srcs = {};
    t.dsts = {{memType, remoteIdx}};
    if (!TransferBench::RunTransfers(cfg, transfers, results)) {
      Utils::PrintErrors(results.errResults);
      return 1;
    }
    double const remoteWrite = results.tfrResults[0].avgBandwidthGbPerSec;

    // Remote Copy
    t.srcs = {{memType, localIdx}};
    t.dsts = {{memType, remoteIdx}};
    if (!TransferBench::RunTransfers(cfg, transfers, results)) {
      Utils::PrintErrors(results.errResults);
      return 1;
    }
    double const remoteCopy = results.tfrResults[0].avgBandwidthGbPerSec;

    printf(" %3d %11.3f %11.3f %11.3f %11.3f %11.3f %11.3f \n",
           numCUs, localRead, localWrite, localCopy, remoteRead, remoteWrite, remoteCopy);
  }
  return 0;
}
/*
Copyright (c) Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -28,9 +28,9 @@ void LogTransfers(FILE *fp, int const testNum, std::vector<Transfer> const& tran
  for (auto const& transfer : transfers)
  {
    fprintf(fp, " (%s->%c%d->%s %d %lu)",
            Utils::MemDevicesToStr(transfer.srcs).c_str(),
            ExeTypeStr[transfer.exeDevice.exeType], transfer.exeDevice.exeIndex,
            Utils::MemDevicesToStr(transfer.dsts).c_str(),
            transfer.numSubExecs,
            transfer.numBytes);
  }
@@ -39,10 +39,15 @@ void LogTransfers(FILE *fp, int const testNum, std::vector<Transfer> const& tran
  }
}
int SweepPreset(EnvVars& ev,
                size_t const numBytesPerTransfer,
                std::string const presetName)
{
if (TransferBench::GetNumRanks() > 1) {
Utils::Print("[ERROR] Sweep preset currently not supported for multi-node\n");
return 1;
}
  bool const isRandom = (presetName == "rsweep");

  int numDetectedCpus = TransferBench::GetNumExecutors(EXE_CPU);
@@ -98,33 +103,33 @@ void SweepPreset(EnvVars& ev,
  for (auto ch : sweepSrc) {
    if (!strchr(MemTypeStr, ch)) {
      printf("[ERROR] Unrecognized memory type '%c' specified for sweep source\n", ch);
      return 1;
    }
    if (strchr(sweepSrc.c_str(), ch) != strrchr(sweepSrc.c_str(), ch)) {
      printf("[ERROR] Duplicate memory type '%c' specified for sweep source\n", ch);
      return 1;
    }
  }

  for (auto ch : sweepDst) {
    if (!strchr(MemTypeStr, ch)) {
      printf("[ERROR] Unrecognized memory type '%c' specified for sweep destination\n", ch);
      return 1;
    }
    if (strchr(sweepDst.c_str(), ch) != strrchr(sweepDst.c_str(), ch)) {
      printf("[ERROR] Duplicate memory type '%c' specified for sweep destination\n", ch);
      return 1;
    }
  }

  for (auto ch : sweepExe) {
    if (!strchr(ExeTypeStr, ch)) {
      printf("[ERROR] Unrecognized executor type '%c' specified for sweep executor\n", ch);
      return 1;
    }
    if (strchr(sweepExe.c_str(), ch) != strrchr(sweepExe.c_str(), ch)) {
      printf("[ERROR] Duplicate executor type '%c' specified for sweep executor\n", ch);
      return 1;
    }
  }
@@ -273,7 +278,7 @@ void SweepPreset(EnvVars& ev,
  if (sweepMin > numPossible) {
    printf("No valid test configurations exist\n");
    return 0;
  }

  if (ev.outputToCsv) {
@@ -333,10 +338,10 @@ void SweepPreset(EnvVars& ev,
    LogTransfers(fp, ++numTestsRun, transfers);
    if (!TransferBench::RunTransfers(cfg, transfers, results)) {
      Utils::PrintErrors(results.errResults);
      if (!continueOnErr) return 1;
    } else {
      Utils::PrintResults(ev, numTestsRun, transfers, results);
    }

    // Check for test limit
@@ -366,4 +371,5 @@ void SweepPreset(EnvVars& ev,
    }
  }
  if (fp) fclose(fp);
  return 0;
}
/*
Copyright (c) Advanced Micro Devices, Inc. All rights reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -23,6 +23,7 @@ THE SOFTWARE.
#pragma once

#include "TransferBench.hpp"
#include "Utilities.hpp"
static int RemappedCpuIndex(int origIdx)
{
@@ -63,15 +64,15 @@ static void PrintNicToGPUTopo(bool outputToCsv)
           ibvDeviceList[i].busId.c_str(),
           ibvDeviceList[i].numaNode,
           closestGpusStr.c_str(),
           ibvDeviceList[i].isRoce && ibvDeviceList[i].hasActivePort ? std::to_string(ibvDeviceList[i].gidIndex).c_str() : "N/A",
           ibvDeviceList[i].isRoce && ibvDeviceList[i].hasActivePort ? ibvDeviceList[i].gidDescriptor.c_str() : "N/A"
           );
  }
  printf("\n");
#endif
}
void DisplaySingleRankTopology(bool outputToCsv)
{
  int numCpus = TransferBench::GetNumExecutors(EXE_CPU);
  int numGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX);
@@ -140,6 +141,7 @@ void DisplayTopology(bool outputToCsv)
  return;
#else
  // Print headers
if (numGpus > 0) {
    if (!outputToCsv) {
      printf(" |");
      for (int j = 0; j < numGpus; j++) {
@@ -194,5 +196,130 @@ void DisplayTopology(bool outputToCsv)
             TransferBench::GetNumExecutorSubIndices({EXE_GPU_GFX, i}), sep,
             TransferBench::GetClosestNicToGpu(i));
    }
}
#endif
}
void DisplayMultiRankTopology(bool outputToCsv, bool showBorders)
{
Utils::RankGroupMap& groups = Utils::GetRankGroupMap();
printf("%d rank(s) in %lu homogeneous group(s)\n", TransferBench::GetNumRanks(), groups.size());
printf("\n");
// Print off each group
int groupNum = 1;
for (auto const& group : groups) {
Utils::GroupKey const& key = group.first;
std::vector<int> const& hosts = group.second;
std::string ppodId = std::get<0>(key);
int vpodId = std::get<1>(key);
std::vector<std::string> cpuNames = std::get<2>(key);
std::vector<int> cpuSubExecs = std::get<3>(key);
std::vector<std::string> gpuNames = std::get<4>(key);
std::vector<int> gpuSubExecs = std::get<5>(key);
std::vector<int> gpuClosestCpu = std::get<6>(key);
std::vector<std::string> nicNames = std::get<7>(key);
std::vector<int> nicClosestCpu = std::get<8>(key);
std::vector<int> nicClosestGpu = std::get<9>(key);
std::vector<int> nicIsActive = std::get<10>(key);
int numRanks = hosts.size();
int numCpus = cpuNames.size();
int numGpus = gpuNames.size();
int numNics = nicNames.size();
int numExecutors = numCpus + numGpus + numNics;
int numActiveNics = 0;
for (auto x : nicIsActive) numActiveNics += x;
if (groupNum > 1) printf("\n");
printf("Group %03d: %d rank(s) %d CPU(s) %d GPU(s) %d NIC(s) (%d active NICs)\n",
groupNum++, numRanks, numCpus, numGpus, numNics, numActiveNics);
// Determine size of table
int numCols = 7;
int numRows = 1 + std::max(numRanks, numExecutors);
TransferBench::Utils::TableHelper table(numRows, numCols);
// Table borders / alignment
for (int col = 0; col <= numCols; col++) {
table.DrawColBorder(col);
table.SetColAlignment(col, TransferBench::Utils::TableHelper::ALIGN_LEFT);
}
table.DrawRowBorder(0);
table.DrawRowBorder(1);
table.DrawRowBorder(numRows);
// Table header
table.Set(0, 0, " Rank ");
table.Set(0, 1, " Hostname ");
table.Set(0, 2, " POD ");
table.Set(0, 3, " VID ");
table.Set(0, 4, " Executor ");
table.Set(0, 5, " Executor Name ");
table.Set(0, 6, " #SE ");
// Fill in ranks / hosts
for (int i = 0; i < numRanks; i++) {
int rank = hosts[i];
table.Set(1 + i, 0, " %04d ", rank);
table.Set(1 + i, 1, " %s ", TransferBench::GetHostname(rank).c_str());
}
// Fill in PPOD and VPOD
table.Set(1, 2, " %s ", ppodId.c_str());
table.Set(1, 3, " %d ", vpodId);
// Fill in Executor information
int rowIdx = 1;
for (int cpuIndex = 0; cpuIndex < numCpus; cpuIndex++) {
table.Set(rowIdx, 4, " CPU %02d ", cpuIndex);
table.Set(rowIdx, 5, " %s ", cpuNames[cpuIndex].c_str());
table.Set(rowIdx, 6, " %d ", cpuSubExecs[cpuIndex]);
rowIdx++;
// Loop over each GPU closest to this CPU executor
for (int gpuIndex = 0; gpuIndex < numGpus; gpuIndex++) {
if (gpuClosestCpu[gpuIndex] != cpuIndex) continue;
table.Set(rowIdx, 4, " - GPU %02d ", gpuIndex);
table.Set(rowIdx, 5, " - %s ", gpuNames[gpuIndex].c_str());
table.Set(rowIdx, 6, " %d ", gpuSubExecs[gpuIndex]);
rowIdx++;
// Loop over each NIC closest to this GPU
for (int nicIndex = 0; nicIndex < numNics; nicIndex++) {
if (nicClosestGpu[nicIndex] != gpuIndex) continue;
table.Set(rowIdx, 4, " - NIC %02d ", nicIndex);
table.Set(rowIdx, 5, " - %s", nicNames[nicIndex].c_str());
table.Set(rowIdx, 6, " %s ", nicIsActive[nicIndex] ? "ON" : "OFF");
rowIdx++;
}
}
// Loop over remaining NICs not associated with GPU but associated with this CPU
for (int nicIndex = 0; nicIndex < numNics; nicIndex++) {
if (nicClosestGpu[nicIndex] != -1 || nicClosestCpu[nicIndex] != cpuIndex) continue;
table.Set(rowIdx, 4, " - NIC %02d ", nicIndex);
table.Set(rowIdx, 5, " - %s ", nicNames[nicIndex].c_str());
table.Set(rowIdx, 6, " %s ", nicIsActive[nicIndex] ? "ON" : "OFF");
rowIdx++;
}
}
table.PrintTable(outputToCsv, showBorders);
}
if (Utils::HasDuplicateHostname()) {
printf("[WARN] It is recommended to run TransferBench with one rank per host to avoid potential aliasing of executors\n");
}
}
void DisplayTopology(bool outputToCsv, bool showBorders)
{
if (TransferBench::GetNumRanks() > 1)
DisplayMultiRankTopology(outputToCsv, showBorders);
else
DisplaySingleRankTopology(outputToCsv);
}
/*
Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#include <unordered_map>
#include <unordered_set>
#include "TransferBench.hpp"
namespace TransferBench::Utils
{
// Helper class to help format tabular data / output to CSV
class TableHelper
{
public:
// Column alignment options
enum AlignType {
ALIGN_LEFT   = -1,
ALIGN_CENTER = 0,
ALIGN_RIGHT  = 1
};
// Border position bitmask options
enum BorderType {
BORDER_TOP   = 1,
BORDER_BOT   = 2,
BORDER_LEFT  = 4,
BORDER_RIGHT = 8,
BORDER_ALL   = 15,
};
// Construct a table with the given dimensions and floating-point precision
TableHelper(int numRows, int numCols, int precision = 2);
// Set the value for a particular cell
template <typename T>
void Set(int rowIdx, int colIdx, T const& value);
void Set(int rowIdx, int colIdx, const char* format, ...);
// Set the alignment for a given cell
void SetCellAlignment(int rowIdx, int colIdx, int8_t alignMode);
// Set the alignment for all cells in a given column
void SetColAlignment(int colIdx, int8_t alignMode);
// Set the alignment for all cells in a given row
void SetRowAlignment(int rowIdx, int8_t alignMode);
// Set border around a cell
void SetCellBorder(int rowIdx, int colIdx, int borderMask);
// Draws a horizontal border on top of given row
void DrawRowBorder(int rowIdx);
// Draws a vertical border prior to given column
void DrawColBorder(int colIdx);
// Print the table
void PrintTable(bool outputToCsv, bool drawBorders = true);
private:
int numRows;
int numCols;
int precision;
std::vector<std::vector<std::string>> table;
std::vector<std::vector<int8_t>> alignment;
std::vector<int> colWidth;
std::unordered_map<int, std::unordered_set<int>> rowBorders;
std::unordered_map<int, std::unordered_set<int>> colBorders;
};
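As a usage sketch (not part of this commit), the TableHelper API declared above composes as follows; the cell values, alignments, and borders here are arbitrary:

// Hypothetical usage: build and print a small 3x2 table.
TransferBench::Utils::TableHelper table(3, 2);
table.DrawRowBorder(0);   // border above header row
table.DrawRowBorder(1);   // border below header row
table.DrawRowBorder(3);   // border below last row
table.DrawColBorder(0);
table.DrawColBorder(2);
table.Set(0, 0, " Executor ");
table.Set(0, 1, " GB/s ");
table.Set(1, 0, " GPU %02d ", 0);  // printf-style overload
table.Set(1, 1, 123.456);          // templated overload; honors the precision arg
table.Set(2, 0, " GPU %02d ", 1);
table.Set(2, 1, 98.7);
table.SetColAlignment(0, TransferBench::Utils::TableHelper::ALIGN_LEFT);
table.PrintTable(false /*outputToCsv*/, true /*drawBorders*/);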
// Group information
typedef std::tuple<
std::string, // PPOD (rack) Id
int, // VPod
std::vector<std::string>, // CPU Names
std::vector<int>, // CPU #Subexecutors
std::vector<std::string>, // GPU Names
std::vector<int>, // GPU #Subexecutors
std::vector<int>, // GPU Closest NUMA
std::vector<std::string>, // NIC Names
std::vector<int>, // NIC Closest NUMA
std::vector<int>, // NIC Closest GPU
std::vector<int> // NIC is active
> GroupKey;
typedef std::map<GroupKey, std::vector<int>> RankGroupMap;
// Get information about how ranks can be organized into homogeneous groups
RankGroupMap& GetRankGroupMap();
// Return the number of homogeneous groups of ranks
int GetNumRankGroups();
// Helper function to convert an ExeType to a string
std::string ExeTypeToStr(ExeType exeType);
// Helper function that converts MemDevices to a string
std::string MemDevicesToStr(std::vector<MemDevice> const& memDevices);
// Helper function to determine if current rank does output
bool RankDoesOutput();
// Helper function that only prints if current rank does output
void Print(const char* format, ...);
// Helper function to deal with ErrResults (exits on fatal error)
void CheckForError(ErrResult const& error);
// Helper function to deal with a vector of ErrResults (exits on fatal error)
void PrintErrors(std::vector<ErrResult> const& errors);
// Helper function to print TransferBench test results
void PrintResults(EnvVars const& ev, int const testNum,
std::vector<Transfer> const& transfers,
TestResults const& results);
// Returns true if more than one rank share the same hostname
bool HasDuplicateHostname();
// Helper function to map between integer index and memory types
MemType GetCpuMemType(int memTypeIdx);
MemType GetGpuMemType(int memTypeIdx);
MemType GetMemType(int memTypeIdx, bool isCpu);
// Helper function to map between integer index and memory type name
std::string GetCpuMemTypeStr(int memTypeIdx);
std::string GetGpuMemTypeStr(int memTypeIdx);
std::string GetMemTypeStr(int memTypeIdx, bool isCpu);
// Helper function to list all available options
std::string GetAllCpuMemTypeStr();
std::string GetAllGpuMemTypeStr();
std::string GetAllMemTypeStr(bool isCpu);
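A hypothetical usage sketch of the index-to-memory-type helpers declared above (their implementations appear below); the index and isCpu values are arbitrary:

// Hypothetical usage: translate a user-supplied index into a memory type.
int  memTypeIdx = 1;
bool isCpu      = true;
MemType     m    = TransferBench::Utils::GetMemType(memTypeIdx, isCpu);     // MEM_CPU_COHERENT
std::string desc = TransferBench::Utils::GetMemTypeStr(memTypeIdx, isCpu);  // "coherent CPU"
printf("Selected %s memory; options are: %s\n",
       desc.c_str(), TransferBench::Utils::GetAllMemTypeStr(isCpu).c_str());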
// Implementation details below
//================================================================
TableHelper::TableHelper(int numRows, int numCols, int precision) :
numRows(numRows), numCols(numCols), precision(precision)
{
if (numRows < 0 || numCols < 0) {
Print("[ERROR] Cannot create TableHelper of negative size\n");
exit(1);
}
// Initialize internal data structures
table.resize(numRows, std::vector<std::string>(numCols, ""));
alignment.resize(numRows, std::vector<int8_t>(numCols, ALIGN_RIGHT));
colWidth.resize(numCols, 0);
}
template <typename T>
void TableHelper::Set(int rowIdx, int colIdx, T const& value)
{
if (0 <= rowIdx && rowIdx < numRows && 0 <= colIdx && colIdx < numCols) {
std::stringstream ss;
if constexpr (std::is_floating_point_v<T>) {
ss << std::fixed << std::setprecision(precision) << value;
} else {
ss << value;
}
table[rowIdx][colIdx] = ss.str();
colWidth[colIdx] = std::max(colWidth[colIdx], static_cast<int>(table[rowIdx][colIdx].size()));
}
}
void TableHelper::Set(int rowIdx, int colIdx, const char* format, ...)
{
if (0 <= rowIdx && rowIdx < numRows && 0 <= colIdx && colIdx < numCols) {
va_list args, args_copy;
va_start(args, format);
// Figure out size of the string
va_copy(args_copy, args);
int size = std::vsnprintf(nullptr, 0, format, args_copy);
va_end(args_copy);
table[rowIdx][colIdx].resize(size, '\0');
std::vsnprintf(table[rowIdx][colIdx].data(), size + 1, format, args);
va_end(args);
colWidth[colIdx] = std::max(colWidth[colIdx], static_cast<int>(table[rowIdx][colIdx].size()));
}
}
void TableHelper::SetCellAlignment(int rowIdx, int colIdx, int8_t alignMode)
{
if (0 <= rowIdx && rowIdx < numRows && 0 <= colIdx && colIdx < numCols && -1 <= alignMode && alignMode <= 1)
alignment[rowIdx][colIdx] = alignMode;
}
void TableHelper::SetColAlignment(int colIdx, int8_t alignMode)
{
if (0 <= colIdx && colIdx < numCols && -1 <= alignMode && alignMode <= 1)
for (int rowIdx = 0; rowIdx < numRows; rowIdx++)
alignment[rowIdx][colIdx] = alignMode;
}
void TableHelper::SetRowAlignment(int rowIdx, int8_t alignMode)
{
if (0 <= rowIdx && rowIdx < numRows && -1 <= alignMode && alignMode <= 1)
for (int colIdx = 0; colIdx < numCols; colIdx++)
alignment[rowIdx][colIdx] = alignMode;
}
void TableHelper::SetCellBorder(int rowIdx, int colIdx, int borderMask)
{
if (0 <= rowIdx && rowIdx < numRows && 0 <= colIdx && colIdx < numCols) {
if (borderMask & BORDER_TOP) rowBorders[rowIdx ].insert(colIdx); else rowBorders[rowIdx ].erase(colIdx);
if (borderMask & BORDER_BOT) rowBorders[rowIdx+1].insert(colIdx); else rowBorders[rowIdx+1].erase(colIdx);
if (borderMask & BORDER_LEFT) colBorders[colIdx ].insert(rowIdx); else colBorders[colIdx ].erase(rowIdx);
if (borderMask & BORDER_RIGHT) colBorders[colIdx+1].insert(rowIdx); else colBorders[colIdx+1].erase(rowIdx);
}
}
void TableHelper::DrawRowBorder(int rowIdx)
{
if (0 <= rowIdx && rowIdx <= numRows)
for (int colIdx = 0; colIdx < numCols; colIdx++)
rowBorders[rowIdx].insert(colIdx);
}
void TableHelper::DrawColBorder(int colIdx)
{
if (0 <= colIdx && colIdx <= numCols)
for (int rowIdx = 0; rowIdx < numRows; rowIdx++)
colBorders[colIdx].insert(rowIdx);
}
void TableHelper::PrintTable(bool outputToCsv, bool drawBorders)
{
if (!RankDoesOutput()) return;
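// The 16 junction glyphs below are indexed by a 4-bit mask: bit 1 (BORDER_TOP) means a
// border segment above this junction, bit 2 (BORDER_BOT) a segment below, bit 4
// (BORDER_LEFT) a segment to the left, bit 8 (BORDER_RIGHT) to the right (e.g. mask 15 = "┼")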
std::string borders[16] =
{" ", "│", "│", "│",
"─", "┘", "┐", "┤",
"─", "└", "┌", "├",
"─", "┴", "┬", "┼"};
int mask;
for (int rowIdx = 0; rowIdx <= numRows; rowIdx++) {
// Draw "top" border
if (!outputToCsv && drawBorders && rowBorders[rowIdx].size() > 0) {
for (int colIdx = 0; colIdx <= numCols; colIdx++) {
mask = 0;
if (colBorders[colIdx].count(rowIdx-1)) mask |= BORDER_TOP;
if (colBorders[colIdx].count(rowIdx )) mask |= BORDER_BOT;
if (rowBorders[rowIdx].count(colIdx-1)) mask |= BORDER_LEFT;
if (rowBorders[rowIdx].count(colIdx )) mask |= BORDER_RIGHT;
Print("%s", borders[mask].c_str());
if (colIdx < numCols) {
std::string ch = rowBorders[rowIdx].count(colIdx) ? "─" : " ";
for (int i = 0; i < colWidth[colIdx]; i++) Print("%s", ch.c_str());
}
}
Print("\n");
}
if (rowIdx == numRows) break;
// Print off table data
for (int colIdx = 0; colIdx <= numCols; colIdx++) {
if (!outputToCsv)
Print("%s", drawBorders && colBorders[colIdx].count(rowIdx) ? "│" : " ");
if (colIdx == numCols) break;
int gap = colWidth[colIdx] - table[rowIdx][colIdx].size();
int lgap, rgap;
switch (alignment[rowIdx][colIdx]) {
case ALIGN_LEFT: lgap = 0; rgap = gap; break;
case ALIGN_CENTER: lgap = gap/2; rgap = gap - lgap; break;
case ALIGN_RIGHT: lgap = gap; rgap = 0; break;
}
for (int i = 0; i < lgap; i++) printf(" ");
Print("%s", table[rowIdx][colIdx].c_str());
for (int i = 0; i < rgap; i++) printf(" ");
if (outputToCsv) Print(",");
}
Print("\n");
}
}
RankGroupMap& GetRankGroupMap()
{
static RankGroupMap groups;
static bool initialized = false;
if (!initialized) {
// Build GroupKey for each rank
for (int rank = 0; rank < TransferBench::GetNumRanks(); rank++) {
std::string ppodId = TransferBench::GetPpodId(rank);
int vpodId = TransferBench::GetVpodId(rank);
// CPU information
int numCpus = TransferBench::GetNumExecutors(EXE_CPU, rank);
std::vector<std::string> cpuNames;
std::vector<int> cpuNumSubExecs;
for (int exeIndex = 0; exeIndex < numCpus; exeIndex++) {
ExeDevice exeDevice = {EXE_CPU, exeIndex, rank};
cpuNames.push_back(TransferBench::GetExecutorName(exeDevice));
cpuNumSubExecs.push_back(TransferBench::GetNumSubExecutors(exeDevice));
}
// GPU information
int numGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX, rank);
std::vector<std::string> gpuNames;
std::vector<int> gpuNumSubExecs;
std::vector<int> gpuClosestCpu;
for (int exeIndex = 0; exeIndex < numGpus; exeIndex++) {
ExeDevice exeDevice = {EXE_GPU_GFX, exeIndex, rank};
gpuNames.push_back(TransferBench::GetExecutorName(exeDevice));
gpuNumSubExecs.push_back(TransferBench::GetNumSubExecutors(exeDevice));
gpuClosestCpu.push_back(TransferBench::GetClosestCpuNumaToGpu(exeIndex, rank));
}
// NIC information
int numNics = TransferBench::GetNumExecutors(EXE_NIC, rank);
std::vector<int> nicClosestGpu(numNics, -1);
for (int gpuIndex = 0; gpuIndex < numGpus; gpuIndex++) {
std::vector<int> nicIndices;
TransferBench::GetClosestNicsToGpu(nicIndices, gpuIndex, rank);
for (auto nicIndex : nicIndices) {
nicClosestGpu[nicIndex] = gpuIndex;
}
}
std::vector<std::string> nicNames;
std::vector<int> nicClosestCpu;
std::vector<int> nicIsActive;
for (int exeIndex = 0; exeIndex < numNics; exeIndex++) {
ExeDevice exeDevice = {EXE_NIC, exeIndex, rank};
nicNames.push_back(TransferBench::GetExecutorName(exeDevice));
nicClosestCpu.push_back(TransferBench::GetClosestCpuNumaToNic(exeIndex, rank));
nicIsActive.push_back(TransferBench::NicIsActive(exeIndex, rank));
}
GroupKey key(ppodId, vpodId,
cpuNames, cpuNumSubExecs,
gpuNames, gpuNumSubExecs, gpuClosestCpu,
nicNames, nicClosestCpu, nicClosestGpu, nicIsActive);
groups[key].push_back(rank);
}
initialized = true;
}
return groups;
}
int GetNumRankGroups()
{
return GetRankGroupMap().size();
}
// Helper function to convert an ExeType to a string
std::string ExeTypeToStr(ExeType exeType)
{
switch (exeType) {
case EXE_CPU: return "CPU";
case EXE_GPU_GFX: return "GPU";
case EXE_GPU_DMA: return "DMA";
case EXE_NIC: return "NIC";
case EXE_NIC_NEAREST: return "NIC";
default: return "N/A";
}
}
// Helper function that converts MemDevices to a string
std::string MemDevicesToStr(std::vector<MemDevice> const& memDevices)
{
if (memDevices.empty()) return "N";
bool isMultiNode = TransferBench::GetNumRanks() > 1;
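// e.g. {MEM_GPU, 3} renders as "G3" on a single node, and as "R<rank>G3" in multi-node mode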
std::stringstream ss;
for (auto const& m : memDevices) {
if (isMultiNode)
ss << "R" << m.memRank;
ss << TransferBench::MemTypeStr[m.memType] << m.memIndex;
}
return ss.str();
}
// Helper function to determine if current rank does output
bool RankDoesOutput()
{
return (TransferBench::GetCommMode() != TransferBench::COMM_MPI ||
TransferBench::GetRank() == 0);
}
// Helper function that only prints if current rank does output
void Print(const char* format, ...)
{
if (RankDoesOutput()) {
va_list args;
va_start(args, format);
vprintf(format, args);
va_end(args);
}
}
// Helper function to deal with ErrResults (exits on fatal error)
void CheckForError(ErrResult const& error)
{
switch (error.errType) {
case ERR_NONE: return;
case ERR_WARN:
Print("[WARN] %s\n", error.errMsg.c_str());
return;
case ERR_FATAL:
Print("[ERROR] %s\n", error.errMsg.c_str());
exit(1);
default:
break;
}
}
// Helper function to deal with a vector of ErrResults (exits on fatal error)
void PrintErrors(std::vector<ErrResult> const& errors)
{
// When running in MPI mode, only the first rank produces output
bool isFatal = false;
for (auto const& err : errors) {
Print("[%s] %s\n", err.errType == ERR_FATAL ? "ERROR" : "WARN", err.errMsg.c_str());
isFatal |= (err.errType == ERR_FATAL);
}
if (isFatal) exit(1);
}
// Print TransferBench test results
void PrintResults(EnvVars const& ev, int const testNum,
std::vector<Transfer> const& transfers,
TestResults const& results)
{
if (!RankDoesOutput()) return;
if (!ev.outputToCsv) printf("Test %d:\n", testNum);
bool isMultiRank = TransferBench::GetNumRanks() > 1;
// Figure out table dimensions
int numCols = 5, numRows = 1;
size_t numTimedIterations = results.numTimedIterations;
for (auto const& exeInfoPair : results.exeResults) {
ExeResult const& exeResult = exeInfoPair.second;
numRows += 1 + exeResult.transferIdx.size();
if (ev.showIterations) {
numRows += (numTimedIterations + 1);
// Check that per-iteration information exists
for (int idx : exeResult.transferIdx) {
TransferResult const& r = results.tfrResults[idx];
if (r.perIterMsec.size() != numTimedIterations) {
Print("[ERROR] Per iteration timing data unavailable: Expected %lu data points, but have %lu\n",
numTimedIterations, r.perIterMsec.size());
exit(1);
}
}
}
}
TableHelper table(numRows, numCols);
for (int col = 1; col < numCols; col++)
table.DrawColBorder(col);
// Loop over each executor
int rowIdx = 0;
for (auto const& exeInfoPair : results.exeResults) {
ExeDevice const& exeDevice = exeInfoPair.first;
ExeResult const& exeResult = exeInfoPair.second;
ExeType const exeType = exeDevice.exeType;
int32_t const exeIndex = exeDevice.exeIndex;
// Display Executor results
table.DrawRowBorder(rowIdx);
if (isMultiRank) {
table.Set(rowIdx, 0, " Executor: Rank %d %3s %02d ", exeDevice.exeRank, ExeTypeToStr(exeType).c_str(), exeIndex);
table.Set(rowIdx, 4, " %7.3f GB/s (sum) [%s]", exeResult.sumBandwidthGbPerSec, GetHostname(exeDevice.exeRank).c_str());
} else {
table.Set(rowIdx, 0, " Executor: %3s %02d ", ExeTypeToStr(exeType).c_str(), exeIndex);
table.Set(rowIdx, 4, " %7.3f GB/s (sum)", exeResult.sumBandwidthGbPerSec);
}
table.Set(rowIdx, 1, "%8.3f GB/s " , exeResult.avgBandwidthGbPerSec);
table.Set(rowIdx, 2, "%8.3f ms " , exeResult.avgDurationMsec);
table.Set(rowIdx, 3, "%12lu bytes ", exeResult.numBytes);
table.SetCellAlignment(rowIdx, 4, TableHelper::ALIGN_LEFT);
rowIdx++;
table.DrawRowBorder(rowIdx);
// Loop over the Transfers for this executor
for (int idx : exeResult.transferIdx) {
Transfer const& t = transfers[idx];
TransferResult const& r = results.tfrResults[idx];
table.Set(rowIdx, 0, "Transfer %-4d ", idx);
table.Set(rowIdx, 1, "%8.3f GB/s " , r.avgBandwidthGbPerSec);
table.Set(rowIdx, 2, "%8.3f ms " , r.avgDurationMsec);
table.Set(rowIdx, 3, "%12lu bytes " , r.numBytes);
char exeSubIndexStr[32] = "";
if (t.exeSubIndex != -1)
sprintf(exeSubIndexStr, ".%d", t.exeSubIndex);
if (isMultiRank) {
table.Set(rowIdx, 4, " %s -> R%d%c%d%s:%d -> %s",
MemDevicesToStr(t.srcs).c_str(),
exeDevice.exeRank, ExeTypeStr[t.exeDevice.exeType], t.exeDevice.exeIndex,
exeSubIndexStr, t.numSubExecs,
MemDevicesToStr(t.dsts).c_str());
} else {
table.Set(rowIdx, 4, " %s -> %c%d%s:%d -> %s",
MemDevicesToStr(t.srcs).c_str(),
ExeTypeStr[t.exeDevice.exeType], t.exeDevice.exeIndex,
exeSubIndexStr, t.numSubExecs,
MemDevicesToStr(t.dsts).c_str());
}
table.SetCellAlignment(rowIdx, 4, TableHelper::ALIGN_LEFT);
rowIdx++;
// Show per-iteration timing information
if (ev.showIterations) {
// Compute standard deviation and track iterations by speed
std::set<std::pair<double, int>> times;
double stdDevTime = 0;
double stdDevBw = 0;
for (int i = 0; i < numTimedIterations; i++) {
times.insert(std::make_pair(r.perIterMsec[i], i+1));
double const varTime = fabs(r.avgDurationMsec - r.perIterMsec[i]);
stdDevTime += varTime * varTime;
double iterBandwidthGbs = (t.numBytes / 1.0E9) / r.perIterMsec[i] * 1000.0f;
double const varBw = fabs(iterBandwidthGbs - r.avgBandwidthGbPerSec);
stdDevBw += varBw * varBw;
}
stdDevTime = sqrt(stdDevTime / numTimedIterations);
stdDevBw = sqrt(stdDevBw / numTimedIterations);
// Loop over iterations (fastest to slowest)
for (auto& time : times) {
double iterDurationMsec = time.first;
double iterBandwidthGbs = (t.numBytes / 1.0E9) / iterDurationMsec * 1000.0f;
std::set<int> usedXccs;
std::stringstream ss1;
if (exeDevice.exeType == EXE_GPU_GFX) {
if (time.second - 1 < r.perIterCUs.size()) {
ss1 << " CUs: ";
for (auto x : r.perIterCUs[time.second - 1]) {
ss1 << x.first << ":" << std::setfill('0') << std::setw(2) << x.second << " ";
usedXccs.insert(x.first);
}
}
}
std::stringstream ss2;
if (!usedXccs.empty()) {
ss2 << " XCCs:";
for (auto x : usedXccs)
ss2 << " " << x;
}
table.Set(rowIdx, 0, "Iter %03d ", time.second);
table.Set(rowIdx, 1, "%8.3f GB/s ", iterBandwidthGbs);
table.Set(rowIdx, 2, "%8.3f ms ", iterDurationMsec);
table.Set(rowIdx, 3, ss1.str());
table.Set(rowIdx, 4, ss2.str());
rowIdx++;
}
table.Set(rowIdx, 0, "StandardDev ");
table.Set(rowIdx, 1, "%8.3f GB/s ", stdDevBw);
table.Set(rowIdx, 2, "%8.3f ms ", stdDevTime);
rowIdx++;
table.DrawRowBorder(rowIdx);
}
}
}
table.DrawRowBorder(rowIdx);
table.Set(rowIdx, 0, "Aggregate (CPU) ");
table.Set(rowIdx, 1, "%8.3f GB/s " , results.avgTotalBandwidthGbPerSec);
table.Set(rowIdx, 2, "%8.3f ms " , results.avgTotalDurationMsec);
table.Set(rowIdx, 3, "%12lu bytes " , results.totalBytesTransferred);
table.Set(rowIdx, 4, " Overhead %.3f ms", results.overheadMsec);
table.SetCellAlignment(rowIdx, 4, TableHelper::ALIGN_LEFT);
table.DrawRowBorder(rowIdx);
table.PrintTable(ev.outputToCsv, ev.showBorders);
}
bool HasDuplicateHostname()
{
std::set<std::string> seenHosts;
for (int rank = 0; rank < TransferBench::GetNumRanks(); rank++) {
std::string hostname = TransferBench::GetHostname(rank);
if (seenHosts.count(hostname)) return true;
seenHosts.insert(hostname);
}
return false;
}
// Helper function to map between integer index and memory types
MemType GetCpuMemType(int memTypeIdx)
{
switch (memTypeIdx) {
case 0: return MEM_CPU;
case 1: return MEM_CPU_COHERENT;
case 2: return MEM_CPU_NONCOHERENT;
case 3: return MEM_CPU_UNCACHED;
case 4: return MEM_CPU_UNPINNED;
default: return MEM_CPU;
}
}
MemType GetGpuMemType(int memTypeIdx)
{
switch (memTypeIdx) {
case 0: return MEM_GPU;
case 1: return MEM_GPU_FINE;
case 2: return MEM_GPU_UNCACHED;
case 3: return MEM_MANAGED;
default: return MEM_GPU;
}
}
MemType GetMemType(int memTypeIdx, bool isCpu)
{
return isCpu ? GetCpuMemType(memTypeIdx) : GetGpuMemType(memTypeIdx);
}
// Helper function to map between integer index and memory type name
std::string GetCpuMemTypeStr(int memTypeIdx)
{
switch (memTypeIdx) {
case 0: return "default CPU";
case 1: return "coherent CPU";
case 2: return "non-coherent CPU";
case 3: return "uncached CPU";
case 4: return "unpinned CPU";
default: return "default CPU";
}
}
std::string GetGpuMemTypeStr(int memTypeIdx)
{
switch (memTypeIdx) {
case 0: return "default GPU";
case 1: return "fine-grained GPU";
case 2: return "uncached GPU";
case 3: return "managed";
default: return "default GPU";
}
}
std::string GetMemTypeStr(int memTypeIdx, bool isCpu)
{
return isCpu ? GetCpuMemTypeStr(memTypeIdx) : GetGpuMemTypeStr(memTypeIdx);
}
std::string GetAllCpuMemTypeStr()
{
return "0=default, 1=coherent, 2=non-coherent, 3=uncached, 4=unpinned";
}
std::string GetAllGpuMemTypeStr()
{
return "0=default, 1=fine-grained, 2=uncached, 3=managed";
}
std::string GetAllMemTypeStr(bool isCpu)
{
return isCpu ? GetAllCpuMemTypeStr() : GetAllGpuMemTypeStr();
}
};
@@ -23,34 +23,46 @@ THE SOFTWARE.
/// @cond
#pragma once

#include <algorithm>
#include <arpa/inet.h>
#include <atomic>
#include <barrier>
#include <cstring>
#include <fcntl.h>
#include <filesystem>
#include <fstream>
#include <functional>
#include <future>
#include <map>
#include <mutex>
#include <netinet/in.h>
#include <numa.h> // If not found, try installing libnuma-dev (e.g apt-get install libnuma-dev)
#include <numaif.h>
#include <random>
#include <regex>
#include <set>
#include <sstream>
#include <stdarg.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <thread>
#include <unistd.h>
#include <vector>
#ifdef NIC_EXEC_ENABLED
#include <infiniband/verbs.h>
#endif

#ifdef MPI_COMM_ENABLED
#include <mpi.h>
#endif
#if defined(__NVCC__)
#include <cuda_runtime.h>
#include <nvml.h>
#else
#include <hip/hip_ext.h>
#include <hip/hip_runtime.h>
@@ -66,7 +78,7 @@ namespace TransferBench
  using std::set;
  using std::vector;

  constexpr char VERSION[] = "1.66";
  /**
   * Enumeration of supported Executor types
@@ -93,9 +105,14 @@ namespace TransferBench
  {
    ExeType exeType;   ///< Executor type
    int32_t exeIndex;  ///< Executor index
int32_t exeRank = 0; ///< Executor rank
int32_t exeSlot = 0; ///< Executor slot
    bool operator<(ExeDevice const& other) const {
      return ((exeRank  != other.exeRank)  ? (exeRank  < other.exeRank)  :
              (exeType  != other.exeType)  ? (exeType  < other.exeType)  :
              (exeIndex != other.exeIndex) ? (exeIndex < other.exeIndex) :
              (exeSlot  < other.exeSlot));
    }
  };
@@ -106,18 +123,21 @@ namespace TransferBench
   */
  enum MemType
  {
    MEM_CPU             = 0,  ///< Default pinned CPU memory (via hipHostMalloc)
    MEM_CPU_CLOSEST     = 1,  ///< Default pinned CPU memory (indexed by closest GPU)
    MEM_CPU_COHERENT    = 2,
    MEM_CPU_FINE        = 2,  ///< Coherent pinned CPU memory (via hipHostMallocCoherent flag)
    MEM_CPU_NONCOHERENT = 3,  ///< Noncoherent pinned CPU memory (via hipHostMallocNonCoherent flag)
    MEM_CPU_UNCACHED    = 4,  ///< Uncached pinned CPU memory (via hipHostMallocUncached flag)
    MEM_CPU_UNPINNED    = 5,  ///< Unpinned CPU memory
    MEM_GPU             = 6,  ///< Default GPU memory (via hipMalloc)
    MEM_GPU_FINE        = 7,  ///< Fine-grained GPU memory (via hipDeviceMallocFinegrained flag)
    MEM_GPU_UNCACHED    = 8,  ///< Uncached GPU memory (via hipDeviceMallocUncached flag)
    MEM_MANAGED         = 9,  ///< Managed memory
    MEM_NULL            = 10, ///< NULL memory - used for empty
  };

  char const MemTypeStr[12] = "CPBDKHGFUMN";
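  // MemTypeStr is indexed by MemType: C=MEM_CPU, P=MEM_CPU_CLOSEST, B=MEM_CPU_COHERENT/FINE,
  // D=MEM_CPU_NONCOHERENT, K=MEM_CPU_UNCACHED, H=MEM_CPU_UNPINNED, G=MEM_GPU, F=MEM_GPU_FINE,
  // U=MEM_GPU_UNCACHED, M=MEM_MANAGED, N=MEM_NULL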
  inline bool IsCpuMemType(MemType m) { return (MEM_CPU <= m && m <= MEM_CPU_UNPINNED); }
  inline bool IsGpuMemType(MemType m) { return (MEM_GPU <= m && m <= MEM_MANAGED); }
  /**
   * A MemDevice indicates a memory type on a specific device
@@ -126,9 +146,17 @@ namespace TransferBench
  {
    MemType memType;   ///< Memory type
    int32_t memIndex;  ///< Device index
int32_t memRank = 0; ///< Rank index
    bool operator<(MemDevice const& other) const {
      return ((memType  != other.memType)  ? (memType  < other.memType)  :
              (memIndex != other.memIndex) ? (memIndex < other.memIndex) :
              (memRank  < other.memRank));
    }

    bool operator==(MemDevice const& other) const {
      return (memType  == other.memType &&
              memIndex == other.memIndex &&
              memRank  == other.memRank);
    }
  };
@@ -142,6 +170,7 @@ namespace TransferBench
    vector<MemDevice> dsts = {};   ///< List of destination memory devices
    ExeDevice exeDevice = {};      ///< Executor to use
    int32_t exeSubIndex = -1;      ///< Executor subindex
int32_t exeSubSlot = 0; ///< Executor subslot
    int numSubExecs = 0;           ///< Number of subExecutors to use for this Transfer
  };
@@ -204,12 +233,12 @@ namespace TransferBench
   */
  struct NicOptions
  {
    size_t  chunkBytes      = 1<<30; ///< How many bytes to transfer at a time
    int     ibGidIndex      = -1;    ///< GID Index for RoCE NICs (-1 is auto)
    uint8_t ibPort          = 1;     ///< NIC port number to be used
    int     ipAddressFamily = 4;     ///< 4=IPv4, 6=IPv6 (used for auto GID detection)
    int     maxRecvWorkReq  = 16;    ///< Maximum number of recv work requests per queue pair
    int     maxSendWorkReq  = 1024;  ///< Maximum number of send work requests per queue pair
    int     queueSize       = 100;   ///< Completion queue size
    int     roceVersion     = 2;     ///< RoCE version (used for auto GID detection)
    int     useRelaxedOrder = 1;     ///< Use relaxed ordering
@@ -265,6 +294,16 @@ namespace TransferBench
    "RoCEv2 IPv4-mapped IPv6"
  };
/**
* Enumeration of possible communication mode types
*/
enum CommType
{
COMM_NONE = 0, ///< Single node only
COMM_MPI = 1, ///< MPI-based communication
COMM_SOCKET = 2 ///< Socket-based communication
};
  /**
   * ErrResult consists of error type and error message
   */
@@ -377,12 +416,22 @@ namespace TransferBench
  std::string GetStrAttribute(StrAttribute attribute);
  /**
   * Returns information about number of available Executors given an executor type
   *
   * @param[in] exeType    Executor type to query
   * @param[in] targetRank Rank to query (-1 for local rank)
   * @returns Number of detected Executors of exeType
   */
  int GetNumExecutors(ExeType exeType, int targetRank = -1);
/**
* Returns information about number of available Executors given a memory type
*
* @param[in] memType Memory type to query
* @param[in] targetRank Rank to query (-1 for local rank)
* @returns Number of detected Executors for memType
*/
int GetNumExecutors(MemType memType, int targetRank = -1);
  /**
   * Returns the number of possible Executor subindices
@@ -408,26 +457,98 @@
   * Returns the index of the NUMA node closest to the given GPU
   *
   * @param[in] gpuIndex   Index of the GPU to query
   * @param[in] targetRank Rank to query (-1 for local rank)
   * @returns NUMA node index closest to GPU gpuIndex, or -1 if unable to detect
   */
  int GetClosestCpuNumaToGpu(int gpuIndex, int targetRank = -1);
  /**
   * Returns the index of the NUMA node closest to the given NIC
   *
   * @param[in] nicIndex   Index of the NIC to query
   * @param[in] targetRank Rank to query (-1 for local rank)
   * @returns NUMA node index closest to the NIC nicIndex, or -1 if unable to detect
   */
  int GetClosestCpuNumaToNic(int nicIndex, int targetRank = -1);
  /**
   * Returns the index of a NIC closest to the given GPU
   *
   * @param[in] gpuIndex   Index of the GPU to query
   * @param[in] targetRank Rank to query (-1 for local rank)
   * @note This function is applicable when the IBV/RDMA executor is available
   * @returns IB Verbs capable NIC index closest to GPU gpuIndex, or -1 if unable to detect
   */
  int GetClosestNicToGpu(int gpuIndex, int targetRank = -1);
/**
* Returns the indices of the NICs closest to the given CPU
*
* @param[out] nicIndices Vector that will contain NIC indices closest to given CPU
* @param[in] cpuIndex Index of the CPU to query
* @param[in] targetRank Rank to query (-1 for local rank)
* @note This function is applicable when the IBV/RDMA executor is available
* @returns IB Verbs capable NIC indices closest to CPU cpuIndex, or empty if unable to detect
*/
void GetClosestNicsToCpu(std::vector<int>& nicIndices, int cpuIndex, int targetRank = -1);
/**
* Returns the indices of the NICs closest to the given GPU
*
* @param[out] nicIndices Vector that will contain NIC indices closest to given GPU
* @param[in] gpuIndex Index of the GPU to query
* @param[in] targetRank Rank to query (-1 for local rank)
* @note This function is applicable when the IBV/RDMA executor is available
* @returns IB Verbs capable NIC indices closest to GPU gpuIndex, or empty if unable to detect
*/
void GetClosestNicsToGpu(std::vector<int>& nicIndices, int gpuIndex, int targetRank = -1);
/**
* @returns 0-indexed rank for this process
*/
int GetRank();
/**
* @returns The total numbers of ranks participating
*/
int GetNumRanks();
/**
* @returns Gets the current communication mode
*/
int GetCommMode();
/**
* @param[in] targetRank Rank to query (-1 for local rank)
* @returns Gets the hostname for the target rank
**/
std::string GetHostname(int targetRank = -1);
/**
* @param[in] targetRank Rank to query (-1 for local rank)
* @returns Gets the physical pod identifier for the target rank
**/
std::string GetPpodId(int targetRank = -1);
/**
* @param[in] targetRank Rank to query (-1 for local rank)
* @returns Gets the virtual pod identifier for the target rank
**/
int GetVpodId(int targetRank = -1);
/**
* @param[in] exeDevice The specific Executor to query
* @returns Name of the executor
*/
std::string GetExecutorName(ExeDevice exeDevice);
/**
*
* @param[in] nicIndex The NIC index to query
* @param[in] targetRank Rank to query (-1 for local rank)
* @returns Returns 1 if and only if NIC exists and has an active port
*/
int NicIsActive(int nicIndex, int targetRank = -1);
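A usage sketch (not part of the header itself) of the multi-rank topology queries declared above, enumerating GPU executors on every participating rank:

// Hypothetical sketch: walk all ranks and report each GPU's closest NUMA node and NIC.
for (int rank = 0; rank < TransferBench::GetNumRanks(); rank++) {
  int numGpus = TransferBench::GetNumExecutors(EXE_GPU_GFX, rank);
  printf("Rank %d (%s): %d GPU(s)\n", rank, TransferBench::GetHostname(rank).c_str(), numGpus);
  for (int gpuIndex = 0; gpuIndex < numGpus; gpuIndex++) {
    printf("  GPU %02d: closest NUMA %d, closest NIC %d\n", gpuIndex,
           TransferBench::GetClosestCpuNumaToGpu(gpuIndex, rank),
           TransferBench::GetClosestNicToGpu(gpuIndex, rank));
  }
}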
  /**
   * Helper function to parse a line containing Transfers into a vector of Transfers
@@ -603,7 +724,6 @@ namespace {
  //========================================================================================
  int constexpr MAX_BLOCKSIZE = 1024; // Max threadblock size
  int constexpr MAX_UNROLL    = 8;    // Max unroll factor
  int constexpr MAX_SRCS      = 8;    // Max srcs per Transfer
  int constexpr MAX_DSTS      = 8;    // Max dsts per Transfer
@@ -642,9 +762,252 @@ namespace {
    return numSubExecs;
  }

  // System singleton
  //========================================================================================
/**
   * System singleton class used for multi-node capability / topology detection
   *
   * This supports three possible communication modes: socket-based, MPI-based, or disabled
   *
   * - Will first attempt to use sockets if the TB_RANK env var is detected
   * - Will then try MPI-based communication if compiled with MPI support
   * - Otherwise falls back to single-node functionality
* - Configuration for socket-based communicator is read via environment variables
* - TB_RANK: Rank of this process (0-based)
* - TB_NUM_RANKS: Total number of processes
* - TB_MASTER_ADDR: IP address of rank 0
* - TB_MASTER_PORT: Port for communication (default: 29500)
*/
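A minimal sketch, assuming only the environment variables documented above (the actual SetupSocketCommunicator body is defined elsewhere in this commit and may differ):

#include <cstdlib>
#include <string>

// Illustrative only: how the documented variables might be consumed.
void ReadSocketEnv(int& rank, int& numRanks, std::string& masterAddr, int& masterPort)
{
  char const* rankStr = getenv("TB_RANK");
  rank       = rankStr ? atoi(rankStr) : 0;
  numRanks   = getenv("TB_NUM_RANKS")   ? atoi(getenv("TB_NUM_RANKS")) : 1;
  masterAddr = getenv("TB_MASTER_ADDR") ? getenv("TB_MASTER_ADDR") : "127.0.0.1";
  masterPort = getenv("TB_MASTER_PORT") ? atoi(getenv("TB_MASTER_PORT")) : 29500; // default per doc above
}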
class System
{
public:
static System& Get() {
static System instance;
return instance;
}
/**
* @returns 0-indexed rank for this process
*/
int GetRank() const { return rank; }
/**
* @returns The total numbers of ranks participating
*/
int GetNumRanks() const { return numRanks; }
/**
* @returns The communication mode
*/
int GetCommMode() const { return commMode; }
bool& IsVerbose() { return verbose; }
// Communication functions
/**
* Barrier that all ranks must arrive at before proceeding
*/
void Barrier();
/**
* Send data to a single destination rank
* Requires a matching call to RecvData on destination rank
* NOTE: For socket-based communicator, this must involve rank 0
*
* @param[in] dstRank Rank to send to
* @param[in] numBytes Number of bytes to send
* @param[in] sendData Data to send
*/
void SendData(int dstRank, size_t const numBytes, const void* sendData) const;
/**
     * Receive data from a single source rank
* Requires a matching call to SendData on source rank
* NOTE: For socket-based communicator, this must involve rank 0
*
* @param[in] srcRank Rank to receive from
* @param[in] numBytes Number of bytes to receive
     * @param[out] recvData Buffer to receive data into
*/
void RecvData(int srcRank, size_t const numBytes, void* recvData) const;
/**
     * Returns true if any rank provides a true input
*
* @param[in] flag Flag to compare across ranks
* @returns True if and only if any rank provided a flag with value of true
*/
bool Any(bool const flag) const;
/**
* Broadcast data from root to all ranks
* All ranks must participate in this call
*
* @param[in] root Rank that sends data
* @param[in] numBytes Number of bytes to transfer
     * @param[in,out] data Buffer to send from root / to receive into on other ranks
*/
void Broadcast(int root, size_t const numBytes, void* data) const;
/**
* Collect errors across ranks
* @param[in,out] errResults List of errors per rank
*/
void AllGatherErrors(vector<ErrResult>& errResults) const;
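For example (a sketch, not code from this commit), broadcasting a value from rank 0 with the primitives above might look like:

// Rank 0 decides a value; all ranks receive it, then synchronize.
int numIterations = (System::Get().GetRank() == 0) ? 10 : 0;
System::Get().Broadcast(0, sizeof(numIterations), &numIterations);
System::Get().Barrier();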
// Topology functions
/**
* Returns information about number of available Executors
*
* @param[in] exeType Executor type to query
* @param[in] targetRank Rank to query. (-1 for local rank)
* @returns Number of detected Executors of exeType
*/
int GetNumExecutors(ExeType exeType, int targetRank = -1) const;
/**
* Returns the number of possible Executor subindices
*
* @note For CPU, this is 0
* @note For GFX, this refers to the number of XCDs
* @note For DMA, this refers to the number of DMA engines
*
* @param[in] exeDevice The specific Executor to query
* @returns Number of detected executor subindices
*/
int GetNumExecutorSubIndices(ExeDevice exeDevice) const;
/**
 * Returns number of subExecutors for a given ExeDevice
 *
 * @param[in] exeDevice The specific Executor to query
 * @returns Number of detected subExecutors for the given ExeDevice
 */
int GetNumSubExecutors(ExeDevice exeDevice) const;
/**
* Returns the index of the NUMA node closest to the given GPU
*
* @param[in] gpuIndex Index of the GPU to query
* @param[in] targetRank Rank to query (-1 for local rank)
* @returns NUMA node index closest to GPU gpuIndex, or -1 if unable to detect
*/
int GetClosestCpuNumaToGpu(int gpuIndex, int targetRank = -1) const;
/**
* Returns the index of the NUMA node closest to the given NIC
*
* @param[in] nicIndex Index of the NIC to query
* @param[in] targetRank Rank to query (-1 for local rank)
* @returns NUMA node index closest to the NIC nicIndex, or -1 if unable to detect
*/
int GetClosestCpuNumaToNic(int nicIndex, int targetRank = -1) const;
/**
* Returns the indices of the NICs closest to the given GPU
*
* @param[out] nicIndices Vector that will contain NIC indices closest to given GPU
* @param[in] gpuIndex Index of the GPU to query
* @param[in] targetRank Rank to query (-1 for local rank)
* @note This function is applicable when the IBV/RDMA executor is available
* @returns IB Verbs capable NIC indices closest to GPU gpuIndex, or empty if unable to detect
*/
void GetClosestNicsToGpu(std::vector<int>& nicIndices, int gpuIndex, int targetRank = -1) const;
std::string GetHostname(int targetRank) const;
std::string GetPpodId(int targetRank) const;
int GetVpodId(int targetRank) const;
std::string GetExecutorName(ExeDevice exeDevice) const;
int NicIsActive(int nicIndex, int targetRank) const;
#if !defined(__NVCC__)
ErrResult GetHsaAgent(ExeDevice const& exeDevice, hsa_agent_t& agent) const;
ErrResult GetHsaAgent(MemDevice const& memDevice, hsa_agent_t& agent) const;
#endif
template <typename T>
void BroadcastVector(int root, vector<T>& data) const;
void BroadcastString(int root, std::string& string) const;
void BroadcastExeResult(int root, ExeResult& exeResult) const;
void BroadcastTfrResult(int root, TransferResult& tfrResult) const;
private:
System();
~System();
System(System const&) = delete;
System(System&&) = delete;
System& operator=(System const&) = delete;
System& operator=(System&&) = delete;
int rank;
int numRanks;
bool verbose = false;
#if !defined(__NVCC__)
std::vector<hsa_agent_t> cpuAgents;
std::vector<hsa_agent_t> gpuAgents;
#endif
int commMode; ///< Communication mode
#ifdef MPI_COMM_ENABLED
bool mpiInit = false; ///< Whether or not MPI_Init was called
MPI_Comm comm; ///< MPI communicator
#endif
// Socket related
std::string masterAddr; ///< Rank 0 master address
int masterPort; ///< Rank 0 master port
std::vector<int> sockets; ///< Master list of sockets
int listenSocket; ///< Master listener socket
// Topology related
struct RankTopology
{
char hostname[33];
char ppodId[256];
int vpodId;
std::map<ExeType, int> numExecutors;
std::map<pair<ExeType, int>, int> numExecutorSubIndices;
std::map<pair<ExeType, int>, int> numSubExecutors;
std::map<int, int> closestCpuNumaToGpu;
std::map<int, int> closestCpuNumaToNic;
std::map<int, int> nicIsActive;
std::map<int, vector<int>> closestNicsToGpu;
std::map<pair<ExeType, int>, std::string> executorName;
};
std::vector<RankTopology> rankInfo; ///< Topology of each rank
void SetupSocketCommunicator();
void SetupMpiCommunicator();
void GetRankTopology(RankTopology& topo);
void CollectTopology();
std::string GetCpuName() const;
template <typename KeyType, typename ValType>
void SendMap(int peerRank, std::map<KeyType, std::vector<ValType>> const& mapToSend) const;
template <typename KeyType, typename ValType>
void SendMap(int peerRank, std::map<KeyType, ValType> const& mapToSend) const;
template <typename KeyType>
void SendMap(int peerRank, std::map<KeyType, std::string> const& mapToSend) const;
template <typename KeyType, typename ValType>
void RecvMap(int peerRank, std::map<KeyType, std::vector<ValType>>& mapToRecv) const;
template <typename KeyType, typename ValType>
void RecvMap(int peerRank, std::map<KeyType, ValType>& mapToRecv) const;
template <typename KeyType>
void RecvMap(int peerRank, std::map<KeyType, std::string>& mapToRecv) const;
void SendRankTopo(int peerRank, RankTopology const& topo) const;
void RecvRankTopo(int peerRank, RankTopology& topo) const;
};
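// Illustrative usage (a sketch, not part of the library): the singleton is accessed
// via System::Get() rather than constructed directly. For a hypothetical two-rank
// socket-based launch, each process is started with the environment variables above:
//
//   TB_RANK=0 TB_NUM_RANKS=2 TB_MASTER_ADDR=10.0.0.1 ./TransferBench a2a   (node 0)
//   TB_RANK=1 TB_NUM_RANKS=2 TB_MASTER_ADDR=10.0.0.1 ./TransferBench a2a   (node 1)
//
// Rank-aware code can then be written as:
//
//   System& sys = System::Get();
//   if (sys.GetRank() == 0)
//     printf("[INFO] Running with %d ranks\n", sys.GetNumRanks());
//   sys.Barrier();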
// Parsing-related functions
//========================================================================================
static ErrResult CharToMemType(char const c, MemType& memType)
{
char const* val = strchr(MemTypeStr, toupper(c));
...@@ -665,47 +1028,223 @@ namespace {
return {ERR_FATAL, "Unexpected executor type (%c)", c};
}
static ErrResult ParseMemType(std::string const& token, struct WildcardMemDevice
std::vector<MemDevice>& memDevices)
{ {
char memTypeChar;
int offset = 0, memIndex, inc;
MemType memType;
bool found = false; vector<int> memRanks;
vector<int> memIndices;
};
struct WildcardExeDevice
{
ExeType exeType;
std::vector<int> exeRanks;
std::vector<int> exeIndices;
std::vector<int> exeSlots;
std::vector<int> exeSubIndices;
std::vector<int> exeSubSlots;
};
struct WildcardTransfer
{
std::vector<WildcardMemDevice> mem[2]; // 0 = SRCs, 1 = DSTs
WildcardExeDevice exe;
};
static char const* ParseRange(char const* start, int fullCount, std::vector<int>& range)
{
range.clear();
char const* ptr = start;
if (!ptr) return 0;
// Full wildcard
if (*ptr == '*') {
if (fullCount >= 0) {
for (int i = 0; i < fullCount; i++)
range.push_back(i);
} else {
range.push_back(fullCount);
}
return ++ptr;
}
// Ranged wildcard
if (*ptr == '[') {
std::string rangeStr(++ptr);
size_t endPos = rangeStr.find(']');
if (endPos == std::string::npos) return 0;
rangeStr.erase(endPos);
ptr += endPos+1;
std::set<int> values;
char* token = strtok(rangeStr.data(), ",");
while (token) {
int start, end;
if (sscanf(token, "%d..%d", &start, &end) == 2) {
if (start < 0 || end < 0 || end <= start) return 0;
for (int i = start; i <= end; i++)
values.insert(i);
} else if (sscanf(token, "%d", &start) == 1) {
values.insert(start);
} else {
return 0;
}
token = strtok(NULL, ",");
}
if (values.empty()) return 0;
for (auto v : values) range.push_back(v);
return ptr;
}
// Single number
char* endPtr;
int val = strtol(ptr, &endPtr, 10);
if (endPtr == ptr) return 0;
else range.push_back(val);
return endPtr;
}
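// Example (sketch): given fullCount = 8, ParseRange expands
//   "*"        -> {0,1,2,3,4,5,6,7}  (full wildcard)
//   "[0..2,5]" -> {0,1,2,5}          (ranged wildcard; the std::set collapses duplicates)
//   "3"        -> {3}                (single number)
// and returns a pointer just past the consumed characters, or 0 on a parse failure.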
static char const* ParseAlphaRange(char const* start, std::vector<int>& range)
{
range.clear();
char const* ptr = start;
if (!ptr) return 0;
// Full wildcard
if (*ptr == '*') {
range.push_back(-1);
return ++ptr;
}
// Ranged wildcard
if (*ptr == '[') {
std::string rangeStr(++ptr);
size_t endPos = rangeStr.find(']');
if (endPos == std::string::npos) return 0;
rangeStr.erase(endPos);
ptr += endPos+1;
std::set<int> values;
char* token = strtok(rangeStr.data(), ",");
while (token) {
char start, end;
if (sscanf(token, "%c..%c", &start, &end) == 2 && isalpha(toupper(start)) && isalpha(toupper(end))) {
int realStart = toupper(start) - 'A';
int realEnd = toupper(end) - 'A';
if (realStart < 0 || realEnd < 0) return 0;
for (int i = realStart; i <= realEnd; i++)
values.insert(i);
} else if (sscanf(token, "%c", &start) == 1 && isalpha(toupper(start))) {
int realStart = toupper(start) - 'A';
values.insert(realStart);
} else {
return 0;
}
token = strtok(NULL, ",");
}
for (auto v : values) range.push_back(v);
return ptr;
}
// Single character
if (isalpha(toupper(*ptr))) {
range.push_back(toupper(*ptr)-'A');
++ptr;
}
return ptr;
}
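// Example (sketch): ParseAlphaRange is case-insensitive and maps letters to 0-based
// indices, so "[a..c]" and "[A..C]" both expand to {0,1,2}, while "*" yields the
// sentinel {-1} meaning all slots. Unlike ParseRange, a non-alphabetic first
// character simply ends parsing with an empty range rather than failing.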
static ErrResult ParseMemType(std::string const& token,
std::vector<WildcardMemDevice>& memDevices)
{
memDevices.clear();
while (sscanf(token.c_str() + offset, " %c %d%n", &memTypeChar, &memIndex, &inc) == 2) {
offset += inc;
ErrResult err = CharToMemType(memTypeChar, memType); char const* ptr = token.c_str();
if (err.errType != ERR_NONE) return err; while (*ptr) {
WildcardMemDevice w;
// Parse memory rank if it exists
if (*ptr == 'R' || *ptr == 'r') {
ptr++; // Skip 'R'
ptr = ParseRange(ptr, GetNumRanks(), w.memRanks);
if (!ptr) return {ERR_FATAL, "Unable to parse rank index in memory token %s", token.c_str()};
} else {
// Otherwise will be replaced by "local" wildcard
w.memRanks.clear();
}
if (memType != MEM_NULL) // Parse memory type
memDevices.push_back({memType, memIndex}); ERR_CHECK(CharToMemType(*ptr, w.memType));
found = true; ptr++; // Skip memory type
// Parse memory index
if (w.memType != MEM_NULL) {
ptr = ParseRange(ptr, -1, w.memIndices);
if (!ptr) return {ERR_FATAL, "Unable to parse device index in memory token %s", token.c_str()};
memDevices.push_back(w);
} }
if (found) return ERR_NONE; }
return {ERR_FATAL, return ERR_NONE;
"Unable to parse memory type token %s. Expected one of %s followed by an index",
token.c_str(), MemTypeStr};
}
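// Example of the token grammar (a sketch, assuming 'G' denotes GPU memory in
// MemTypeStr): "R[0,1]G*" describes GPU memory on ranks 0 and 1 across all device
// indices, while a token without a leading 'R', such as "G2", leaves memRanks empty
// so that it is later expanded as a "local rank" wildcard.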
static ErrResult ParseExeType(std::string const& token,
ExeDevice& exeDevice, WildcardExeDevice& exeDevice)
int& exeSubIndex)
{ {
char exeTypeChar; char const* ptr = token.c_str();
exeSubIndex = -1;
int numTokensParsed = sscanf(token.c_str(), // Check for rank prefix
" %c%d.%d", &exeTypeChar, &exeDevice.exeIndex, &exeSubIndex); if (*ptr == 'R' || *ptr == 'r') {
if (numTokensParsed < 2) { ptr++; // Skip 'R'
return {ERR_FATAL, ptr = ParseRange(ptr, GetNumRanks(), exeDevice.exeRanks);
"Unable to parse valid executor token (%s)." if (!ptr) return {ERR_FATAL, "Unable to parse rank index in executor token %s", token.c_str()};
"Expected one of %s followed by an index", } else {
token.c_str(), ExeTypeStr}; exeDevice.exeRanks.clear();
} }
return CharToExeType(exeTypeChar, exeDevice.exeType);
// Parse executor type
ERR_CHECK(CharToExeType(*ptr, exeDevice.exeType));
ptr++; // Skip executor type char
// Parse executor index
// This is optional for EXE_NIC_NEAREST as long as nothing further is specified
char const* endPtr = ParseRange(ptr, -1, exeDevice.exeIndices);
if (!endPtr) {
if (exeDevice.exeType == EXE_NIC_NEAREST && *ptr == 0) { // bare nearest-NIC token; endPtr is null here, so test the unconsumed input
if (exeDevice.exeRanks.size() != 0) {
return {ERR_FATAL, "Wildcard NIC executor may not be specified with rank in executor token %s", token.c_str()};
}
exeDevice.exeIndices.clear();
return ERR_NONE;
} else {
return {ERR_FATAL, "Unable to parse device index in executor token %s", token.c_str()};
}
} else {
ptr = endPtr;
}
// Parse (optional) executor slot
ptr = ParseAlphaRange(ptr, exeDevice.exeSlots);
if (!ptr) return {ERR_FATAL, "Unable to parse executor slot in executor token %s", token.c_str()};
// Check for subindex after device
if (*ptr == '.') {
ptr++; // Skip '.'
ptr = ParseRange(ptr, -2, exeDevice.exeSubIndices);
if (!ptr) return {ERR_FATAL, "Unable to parse subindex in executor token %s", token.c_str()};
}
// Ensure that EXE_NIC has non-empty subindex
if (exeDevice.exeType == EXE_NIC && exeDevice.exeSubIndices.size() == 0) {
return {ERR_FATAL, "NIC executor requires specification of a subindex in executor token %s", token.c_str()};
}
// Parse (optional) executor subslot
ptr = ParseAlphaRange(ptr, exeDevice.exeSubSlots);
if (!ptr) return {ERR_FATAL, "Unable to parse subslot in executor token %s", token.c_str()};
return ERR_NONE;
}
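// Example (sketch, assuming 'G' denotes the GFX executor in ExeTypeStr): "R1G2.0"
// selects GPU executor 2 on rank 1 with subindex (XCC) 0, and "G*" expands to all
// local GPU executors. A nearest-NIC executor token with no device index is only
// legal without a rank prefix, as enforced above.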
// Memory-related functions
...@@ -766,7 +1305,7 @@ namespace {
}
// Allocate memory
static ErrResult AllocateMemory(MemDevice memDevice, size_t numBytes, void** memPtr) static ErrResult AllocateMemory(MemDevice memDevice, size_t numBytes, void** memPtr, bool isShareable = false)
{
if (numBytes == 0) {
return {ERR_FATAL, "Unable to allocate 0 bytes"};
...@@ -786,17 +1325,33 @@ namespace {
numa_set_preferred(numaIdx);
// Allocate host-pinned memory (should respect NUMA mem policy)
if (memType == MEM_CPU_FINE) { int flags = 0;
#if !defined(__NVCC__)
flags |= hipHostMallocNumaUser;
#endif
if (memType == MEM_CPU || memType == MEM_CPU_CLOSEST) {
ERR_CHECK(hipHostMalloc((void **)memPtr, numBytes, flags));
} else if (memType == MEM_CPU_COHERENT) {
#if defined (__NVCC__)
return {ERR_FATAL, "Coherent pinned-CPU memory not supported on NVIDIA platform"};
#else
ERR_CHECK(hipHostMalloc((void **)memPtr, numBytes, flags | hipHostMallocCoherent));
#endif
} else if (memType == MEM_CPU_NONCOHERENT) {
#if defined (__NVCC__)
return {ERR_FATAL, "Fine-grained CPU memory not supported on NVIDIA platform"}; return {ERR_FATAL, "Non-coherent pinned-CPU memory not supported on NVIDIA platform"};
#else
ERR_CHECK(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser | hipHostMallocCoherent)); ERR_CHECK(hipHostMalloc((void **)memPtr, numBytes, flags | hipHostMallocNonCoherent));
#endif
} else if (memType == MEM_CPU || memType == MEM_CPU_CLOSEST) { } else if (memType == MEM_CPU_UNCACHED) {
#if defined (__NVCC__)
ERR_CHECK(hipHostMalloc((void **)memPtr, numBytes, 0)); return {ERR_FATAL, "Coherent CPU memory not supported on NVIDIA platform"};
#else
#if HIP_VERSION_MAJOR >= 7
ERR_CHECK(hipHostMalloc((void **)memPtr, numBytes, flags | hipHostMallocUncached));
#else
ERR_CHECK(hipHostMalloc((void **)memPtr, numBytes, hipHostMallocNumaUser | hipHostMallocNonCoherent)); return {ERR_FATAL, "Uncached pinned-CPU memory requires ROCm 7.0"};
#endif
#endif
} else if (memType == MEM_CPU_UNPINNED) {
*memPtr = numa_alloc_onnode(numBytes, numaIdx);
...@@ -818,6 +1373,13 @@ namespace {
} else if (memType == MEM_GPU_FINE) {
#if defined (__NVCC__)
return {ERR_FATAL, "Fine-grained GPU memory not supported on NVIDIA platform"};
#else
int flag = hipDeviceMallocFinegrained;
ERR_CHECK(hipExtMallocWithFlags((void**)memPtr, numBytes, flag));
#endif
} else if (memType == MEM_GPU_UNCACHED) {
#if defined (__NVCC__)
return {ERR_FATAL, "Uncached GPU memory not supported on NVIDIA platform"};
#else
int flag = hipDeviceMallocUncached;
ERR_CHECK(hipExtMallocWithFlags((void**)memPtr, numBytes, flag));
...@@ -843,7 +1405,7 @@ namespace {
return {ERR_FATAL, "Attempted to free null pointer for %lu bytes", bytes};
switch (memType) {
case MEM_CPU: case MEM_CPU_FINE: case MEM_CPU_CLOSEST: case MEM_CPU: case MEM_CPU_CLOSEST: case MEM_CPU_COHERENT: case MEM_CPU_NONCOHERENT: case MEM_CPU_UNCACHED:
{
ERR_CHECK(hipHostFree(memPtr));
break;
...@@ -853,7 +1415,7 @@ namespace {
numa_free(memPtr, bytes);
break;
}
case MEM_GPU : case MEM_GPU_FINE: case MEM_MANAGED: case MEM_GPU : case MEM_GPU_FINE: case MEM_GPU_UNCACHED: case MEM_MANAGED:
{
ERR_CHECK(hipFree(memPtr));
break;
...@@ -864,104 +1426,33 @@ namespace {
return ERR_NONE;
}
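// Minimal allocate/free sketch (assumes GPU device 0 exists; MemDevice is shown with
// aggregate initialization as used elsewhere in this file):
//
//   float* buf = nullptr;
//   ERR_CHECK(AllocateMemory({MEM_GPU, 0}, 1 << 20, (void**)&buf));
//   ... use buf ...
//   ERR_CHECK(DeallocateMemory(MEM_GPU, buf, 1 << 20));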
// HSA-related functions // Setup validation-related functions
//========================================================================================
// This function resolves executors that may be indexed by "nearest"
#if !defined(__NVCC__) static ErrResult GetActualExecutor(ExeDevice const& origExeDevice,
// Get the hsa_agent_t associated with a ExeDevice ExeDevice& actualExeDevice,
static ErrResult GetHsaAgent(ExeDevice const& exeDevice, hsa_agent_t& agent) int rankOverride = -1)
{ {
static bool isInitialized = false; // By default, nothing needs to change
static std::vector<hsa_agent_t> cpuAgents; actualExeDevice = origExeDevice;
static std::vector<hsa_agent_t> gpuAgents;
int const& exeIndex = exeDevice.exeIndex;
int const numCpus = GetNumExecutors(EXE_CPU);
int const numGpus = GetNumExecutors(EXE_GPU_GFX);
// Initialize results on first use
if (!isInitialized) {
hsa_amd_pointer_info_t info;
info.size = sizeof(info);
ErrResult err; // Check that executor rank is valid
int32_t* tempBuffer; int exeRank = (rankOverride == -1 ? origExeDevice.exeRank : rankOverride);
if (exeRank < 0 || exeRank >= GetNumRanks())
return {ERR_FATAL, "Rank index must be between 0 and %d (instead of %d)", GetNumRanks() - 1, exeRank};
// Index CPU agents // When using NIC_NEAREST, remap to the closest NIC to the GPU
cpuAgents.clear(); if (origExeDevice.exeType == EXE_NIC_NEAREST) {
for (int i = 0; i < numCpus; i++) { actualExeDevice.exeType = EXE_NIC;
ERR_CHECK(AllocateMemory({MEM_CPU, i}, 1024, (void**)&tempBuffer)); actualExeDevice.exeRank = exeRank;
ERR_CHECK(hsa_amd_pointer_info(tempBuffer, &info, NULL, NULL, NULL)); std::vector<int> nicIndices;
cpuAgents.push_back(info.agentOwner); GetClosestNicsToGpu(nicIndices, origExeDevice.exeIndex, exeRank);
ERR_CHECK(DeallocateMemory(MEM_CPU, tempBuffer, 1024)); if (origExeDevice.exeSlot < 0 || origExeDevice.exeSlot >= nicIndices.size()) {
} return {ERR_FATAL, "Rank %d GPU %d closest NIC slot %d is invalid (%lu slots detected)",
exeRank, origExeDevice.exeIndex, origExeDevice.exeSlot, nicIndices.size()};
// Index GPU agents
gpuAgents.clear();
for (int i = 0; i < numGpus; i++) {
ERR_CHECK(AllocateMemory({MEM_GPU, i}, 1024, (void**)&tempBuffer));
ERR_CHECK(hsa_amd_pointer_info(tempBuffer, &info, NULL, NULL, NULL));
gpuAgents.push_back(info.agentOwner);
ERR_CHECK(DeallocateMemory(MEM_GPU, tempBuffer, 1024));
}
isInitialized = true;
}
switch (exeDevice.exeType) {
case EXE_CPU:
if (exeIndex < 0 || exeIndex >= numCpus)
return {ERR_FATAL, "CPU index must be between 0 and %d inclusively", numCpus - 1};
agent = cpuAgents[exeDevice.exeIndex];
break;
case EXE_GPU_GFX: case EXE_GPU_DMA:
if (exeIndex < 0 || exeIndex >= numGpus)
return {ERR_FATAL, "GPU index must be between 0 and %d inclusively", numGpus - 1};
agent = gpuAgents[exeIndex];
break;
default:
return {ERR_FATAL,
"Attempting to get HSA agent of unknown or unsupported executor type (%d)",
exeDevice.exeType};
}
return ERR_NONE;
}
// Get the hsa_agent_t associated with a MemDevice
static ErrResult GetHsaAgent(MemDevice const& memDevice, hsa_agent_t& agent)
{
if (memDevice.memType == MEM_CPU_CLOSEST)
return GetHsaAgent({EXE_CPU, GetClosestCpuNumaToGpu(memDevice.memIndex)}, agent);
if (IsCpuMemType(memDevice.memType)) return GetHsaAgent({EXE_CPU, memDevice.memIndex}, agent);
if (IsGpuMemType(memDevice.memType)) return GetHsaAgent({EXE_GPU_GFX, memDevice.memIndex}, agent);
return {ERR_FATAL,
"Unable to get HSA agent for memDevice (%d,%d)",
memDevice.memType, memDevice.memIndex};
}
#endif
// Setup validation-related functions
//========================================================================================
static ErrResult GetActualExecutor(ConfigOptions const& cfg,
ExeDevice const& origExeDevice,
ExeDevice& actualExeDevice)
{
// By default, nothing needs to change
actualExeDevice = origExeDevice;
// When using NIC_NEAREST, remap to the closest NIC to the GPU
if (origExeDevice.exeType == EXE_NIC_NEAREST) {
actualExeDevice.exeType = EXE_NIC;
if (cfg.nic.closestNics.size() > 0) {
if (origExeDevice.exeIndex < 0 || origExeDevice.exeIndex >= cfg.nic.closestNics.size())
return {ERR_FATAL, "NIC index is out of range (%d)", origExeDevice.exeIndex};
actualExeDevice.exeIndex = cfg.nic.closestNics[origExeDevice.exeIndex];
} else {
actualExeDevice.exeIndex = GetClosestNicToGpu(origExeDevice.exeIndex);
} }
actualExeDevice.exeIndex = nicIndices[actualExeDevice.exeSlot];
actualExeDevice.exeSlot = 0;
} }
return ERR_NONE;
}
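// Example (sketch): an EXE_NIC_NEAREST executor referencing GPU 3, slot 1 on rank 0
// is remapped to a concrete EXE_NIC device, namely the second-closest IB-Verbs-capable
// NIC to GPU 3 as reported by GetClosestNicsToGpu(); all other executor types pass
// through unchanged.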
...@@ -972,22 +1463,27 @@ namespace {
if (memDevice.memType == MEM_NULL)
return ERR_NONE;
if (memDevice.memRank < 0 || memDevice.memRank >= GetNumRanks()) {
return {ERR_FATAL,
"Rank index must be between 0 and %d (instead of %d)", GetNumRanks() - 1, memDevice.memRank};
}
if (IsCpuMemType(memDevice.memType) && memDevice.memType != MEM_CPU_CLOSEST) {
int numCpus = GetNumExecutors(EXE_CPU); int numCpus = GetNumExecutors(EXE_CPU, memDevice.memRank);
if (memDevice.memIndex < 0 || memDevice.memIndex >= numCpus)
return {ERR_FATAL,
"CPU index must be between 0 and %d (instead of %d)", numCpus - 1, memDevice.memIndex}; "CPU index must be between 0 and %d (instead of %d) on rank %d", numCpus - 1, memDevice.memIndex, memDevice.memRank};
return ERR_NONE;
}
if (IsGpuMemType(memDevice.memType) || memDevice.memType == MEM_CPU_CLOSEST) {
int numGpus = GetNumExecutors(EXE_GPU_GFX); int numGpus = GetNumExecutors(EXE_GPU_GFX, memDevice.memRank);
if (memDevice.memIndex < 0 || memDevice.memIndex >= numGpus)
return {ERR_FATAL,
"GPU index must be between 0 and %d (instead of %d)", numGpus - 1, memDevice.memIndex}; "GPU index must be between 0 and %d (instead of %d) on rank %d", numGpus - 1, memDevice.memIndex, memDevice.memRank};
if (memDevice.memType == MEM_CPU_CLOSEST) {
if (GetClosestCpuNumaToGpu(memDevice.memIndex) == -1) { if (GetClosestCpuNumaToGpu(memDevice.memIndex, memDevice.memRank) == -1) {
return {ERR_FATAL, "Unable to determine closest NUMA node for GPU %d", memDevice.memIndex}; return {ERR_FATAL, "Unable to determine closest NUMA node for GPU %d on rank %d", memDevice.memIndex, memDevice.memRank};
}
}
return ERR_NONE;
...@@ -995,6 +1491,119 @@ namespace {
return {ERR_FATAL, "Unsupported memory type (%d)", memDevice.memType};
}
static void CheckMultiNodeConfigConsistency(ConfigOptions const& cfg,
std::vector<ErrResult>& errors)
{
if (GetCommMode() == COMM_NONE) return;
if (System::Get().IsVerbose()) {
printf("[INFO] Rank %d checking config consistency\n", GetRank());
}
// To check consistency, compare against rank 0
int root = 0;
#define ADD_ERROR(STR) errors.push_back({ERR_FATAL, STR " must be consistent across all ranks"})
// Compare general options
{
GeneralOptions general = cfg.general;
System::Get().Broadcast(root, sizeof(general), &general);
if (general.numIterations != cfg.general.numIterations) ADD_ERROR("cfg.general.numIterations");
if (general.numSubIterations != cfg.general.numSubIterations) ADD_ERROR("cfg.general.numSubIterations");
if (general.numWarmups != cfg.general.numWarmups) ADD_ERROR("cfg.general.numWarmups");
if (general.recordPerIteration != cfg.general.recordPerIteration) ADD_ERROR("cfg.general.recordPerIteration");
if (general.useInteractive != cfg.general.useInteractive) ADD_ERROR("cfg.general.useInteractive");
}
// Compare data options
{
DataOptions data = cfg.data;
System::Get().Broadcast(root, sizeof(data), &data);
// data.alwaysValidate is permitted to be different across ranks
if (data.blockBytes != cfg.data.blockBytes) ADD_ERROR("cfg.data.blockBytes");
if (data.byteOffset != cfg.data.byteOffset) ADD_ERROR("cfg.data.byteOffset");
size_t fillPatternSize = cfg.data.fillPattern.size();
System::Get().Broadcast(root, sizeof(fillPatternSize), &fillPatternSize);
if (fillPatternSize != cfg.data.fillPattern.size()) {
ADD_ERROR("cfg.data.fillPattern");
} else if (fillPatternSize > 0) {
auto fillPatternTemp = cfg.data.fillPattern;
System::Get().BroadcastVector(0, fillPatternTemp);
for (size_t i = 0; i < fillPatternSize; i++) {
if (fillPatternTemp[i] != cfg.data.fillPattern[i]) {
ADD_ERROR("cfg.data.fillPattern");
break;
}
}
}
size_t fillCompressSize = cfg.data.fillCompress.size();
System::Get().Broadcast(root, sizeof(fillCompressSize), &fillCompressSize);
if (fillCompressSize != cfg.data.fillCompress.size()) {
ADD_ERROR("cfg.data.fillCompress");
} else if (fillCompressSize > 0) {
auto fillCompressTemp = cfg.data.fillCompress;
System::Get().BroadcastVector(0, fillCompressTemp);
for (size_t i = 0; i < fillCompressSize; i++) {
if (fillCompressTemp[i] != cfg.data.fillCompress[i]) {
ADD_ERROR("cfg.data.fillCompress");
break;
}
}
}
// data.validateDirect is permitted to be different across ranks
// data.validateSource is permitted to be different across ranks
}
// Compare GFX Executor options
{
GfxOptions gfx = cfg.gfx;
System::Get().Broadcast(root, sizeof(gfx), &gfx);
if (gfx.blockOrder != cfg.gfx.blockOrder) ADD_ERROR("cfg.gfx.blockOrder");
if (gfx.blockSize != cfg.gfx.blockSize) ADD_ERROR("cfg.gfx.blockSize");
// gfx.cuMask is permitted to be different across ranks
// gfx.perfXccTable is permitted to be different across ranks
if (gfx.seType != cfg.gfx.seType) ADD_ERROR("cfg.gfx.seType");
if (gfx.temporalMode != cfg.gfx.temporalMode) ADD_ERROR("cfg.gfx.temporalMode");
if (gfx.unrollFactor != cfg.gfx.unrollFactor) ADD_ERROR("cfg.gfx.unrollFactor");
if (gfx.useHipEvents != cfg.gfx.useHipEvents) ADD_ERROR("cfg.gfx.useHipEvents");
if (gfx.useMultiStream != cfg.gfx.useMultiStream) ADD_ERROR("cfg.gfx.useMultiStream");
if (gfx.useSingleTeam != cfg.gfx.useSingleTeam) ADD_ERROR("cfg.gfx.useSingleTeam");
if (gfx.waveOrder != cfg.gfx.waveOrder) ADD_ERROR("cfg.gfx.waveOrder");
if (gfx.wordSize != cfg.gfx.wordSize) ADD_ERROR("cfg.gfx.wordSize");
}
// Compare DMA Executor options
{
DmaOptions dma = cfg.dma;
System::Get().Broadcast(root, sizeof(dma), &dma);
if (dma.useHipEvents != cfg.dma.useHipEvents) ADD_ERROR("cfg.dma.useHipEvents");
if (dma.useHsaCopy != cfg.dma.useHsaCopy) ADD_ERROR("cfg.dma.useHsaCopy");
}
// Compare NIC options
{
NicOptions nic = cfg.nic;
System::Get().Broadcast(root, sizeof(nic), &nic);
if (nic.chunkBytes != cfg.nic.chunkBytes) ADD_ERROR("cfg.nic.chunkBytes");
// nic.ibGidIndex is permitted to be different across ranks
// nic.ibPort is permitted to be different across ranks
if (nic.ipAddressFamily != cfg.nic.ipAddressFamily) ADD_ERROR("cfg.nic.ipAddressFamily");
if (nic.maxRecvWorkReq != cfg.nic.maxRecvWorkReq) ADD_ERROR("cfg.nic.maxRecvWorkReq");
if (nic.maxSendWorkReq != cfg.nic.maxSendWorkReq) ADD_ERROR("cfg.nic.maxSendWorkReq");
// nic.queueSize is permitted to be different across ranks
if (nic.roceVersion != cfg.nic.roceVersion) ADD_ERROR("cfg.nic.roceVersion");
if (nic.useRelaxedOrder != cfg.nic.useRelaxedOrder) ADD_ERROR("cfg.nic.useRelaxedOrder");
if (nic.useNuma != cfg.nic.useNuma) ADD_ERROR("cfg.nic.useNuma");
}
#undef ADD_ERROR
}
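// The broadcast-and-compare idiom above generalizes to any trivially-copyable option.
// A minimal sketch for one scalar field (one of the fields checked above; the int type
// is an assumption for illustration):
//
//   int value = cfg.general.numIterations;              // local copy
//   System::Get().Broadcast(0, sizeof(value), &value);  // overwritten with rank 0's value
//   if (value != cfg.general.numIterations)
//     errors.push_back({ERR_FATAL, "cfg.general.numIterations must be consistent across all ranks"});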
// Validate configuration options - returns true if and only if a fatal error is detected
static bool ConfigOptionsHaveErrors(ConfigOptions const& cfg,
std::vector<ErrResult>& errors)
...@@ -1003,6 +1612,9 @@ namespace {
if (cfg.general.numWarmups < 0)
errors.push_back({ERR_FATAL, "[general.numWarmups] must be a non-negative number"});
// Check that config options are consistent (where necessary) across all ranks
CheckMultiNodeConfigConsistency(cfg, errors);
// Check data options
if (cfg.data.blockBytes == 0 || cfg.data.blockBytes % 4)
errors.push_back({ERR_FATAL, "[data.blockBytes] must be positive multiple of %lu", sizeof(float)});
...@@ -1085,16 +1697,9 @@ namespace {
// Check NIC options
#ifdef NIC_EXEC_ENABLED
int numNics = GetNumExecutors(EXE_NIC); if (cfg.nic.chunkBytes == 0 || (cfg.nic.chunkBytes % 4 != 0)) {
for (auto const& nic : cfg.nic.closestNics) errors.push_back({ERR_FATAL, "[nic.chunkBytes] must be a positive multiple of 4"});
if (nic < 0 || nic >= numNics) }
errors.push_back({ERR_FATAL, "NIC index (%d) in user-specified closest NIC list must be between 0 and %d",
nic, numNics - 1});
size_t closetNicsSize = cfg.nic.closestNics.size();
if (closetNicsSize > 0 && closetNicsSize < numGpus)
errors.push_back({ERR_FATAL, "User-specified closest NIC list must match GPU count of %d",
numGpus});
#endif
// NVIDIA specific
...@@ -1123,20 +1728,73 @@ namespace {
return false;
}
static void CheckMultiNodeTransferConsistency(std::vector<Transfer> const& transfers,
std::vector<ErrResult>& errors)
{
if (GetCommMode() == COMM_NONE) return;
if (System::Get().IsVerbose()) {
printf("[INFO] Rank %d checking transfers consistency\n", GetRank());
}
// To check consistency, compare against rank 0
int root = 0;
#define ADD_ERROR(STR) \
do { \
isInconsistent = true; \
if (System::Get().IsVerbose()) \
errors.push_back({ERR_FATAL, STR " must be the same for Transfer %d on all ranks", i}); \
} while(0)
size_t numTransfers = transfers.size();
System::Get().Broadcast(root, sizeof(numTransfers), &numTransfers);
if (numTransfers != transfers.size()) {
errors.push_back({ERR_FATAL, "The number of Transfers to run must be consistent across ranks"});
}
bool isInconsistent = false;
for (size_t i = 0; i < numTransfers; i++) {
Transfer t = transfers[i];
System::Get().Broadcast(root, sizeof(t.numBytes), &t.numBytes);
System::Get().BroadcastVector(root, t.srcs);
System::Get().BroadcastVector(root, t.dsts);
System::Get().Broadcast(root, sizeof(t.exeDevice), &t.exeDevice);
System::Get().Broadcast(root, sizeof(t.exeSubIndex), &t.exeSubIndex);
System::Get().Broadcast(root, sizeof(t.exeSubSlot), &t.exeSubSlot);
System::Get().Broadcast(root, sizeof(t.numSubExecs), &t.numSubExecs);
if (t.numBytes != transfers[i].numBytes) ADD_ERROR("numBytes");
if (t.srcs != transfers[i].srcs) ADD_ERROR("Source memory locations");
if (t.dsts != transfers[i].dsts) ADD_ERROR("Destination memory locations");
if (t.exeDevice < transfers[i].exeDevice ||
transfers[i].exeDevice < t.exeDevice) ADD_ERROR("Executor device");
if (t.exeSubIndex != transfers[i].exeSubIndex) ADD_ERROR("Executor subindex");
if (t.exeSubSlot != transfers[i].exeSubSlot) ADD_ERROR("Executor dst slot");
if (t.numSubExecs != transfers[i].numSubExecs) ADD_ERROR("Num SubExecutors");
}
if (isInconsistent && !System::Get().IsVerbose()) {
errors.push_back({ERR_FATAL, "Transfers to execute must be identical across all ranks"});
}
#undef ADD_ERROR
}
// Validate Transfers to execute - returns true if and only if fatal error detected
static bool TransfersHaveErrors(ConfigOptions const& cfg,
std::vector<Transfer> const& transfers,
std::vector<ErrResult>& errors)
{
int numCpus = GetNumExecutors(EXE_CPU);
int numGpus = GetNumExecutors(EXE_GPU_GFX);
int numNics = GetNumExecutors(EXE_NIC);
std::set<ExeDevice> executors;
std::map<ExeDevice, int> transferCount;
std::map<ExeDevice, int> useSubIndexCount;
std::map<ExeDevice, int> totalSubExecs;
// Check that the set of requested transfers is consistent across all ranks
CheckMultiNodeTransferConsistency(transfers, errors);
// Per-Transfer checks
for (size_t i = 0; i < transfers.size(); i++) {
Transfer const& t = transfers[i];
...@@ -1144,6 +1802,9 @@ namespace {
if (t.numBytes == 0)
errors.push_back({ERR_FATAL, "Transfer %d: Cannot perform 0-byte transfers", i});
// Each subexecutor is assigned a multiple of cfg.data.blockBytes, however this may
// mean that some subexecutors might not have any work assigned to them if the amount to
// transfer is small
if (t.exeDevice.exeType == EXE_GPU_GFX || t.exeDevice.exeType == EXE_CPU) {
size_t const N = t.numBytes / sizeof(float);
int const targetMultiple = cfg.data.blockBytes / sizeof(float);
...@@ -1152,8 +1813,8 @@ namespace {
if (maxSubExecToUse < t.numSubExecs)
errors.push_back({ERR_WARN,
"Transfer %d data size is too small - will only use %d of %d subexecutors", "Transfer %d data size is too small - will only use %d of %d subexecutors due to blockBytes of %d",
i, maxSubExecToUse, t.numSubExecs}); i, maxSubExecToUse, t.numSubExecs, cfg.data.blockBytes});
}
// Check sources and destinations
...@@ -1171,21 +1832,29 @@ namespace {
errors.push_back({ERR_FATAL, "Transfer %d: DST %d: %s", i, j, err.errMsg.c_str()});
}
// Check executor // Check executor rank
if (t.exeDevice.exeRank < 0 || t.exeDevice.exeRank >= GetNumRanks()) {
errors.push_back({ERR_FATAL,
"Rank index for executor must be between 0 and %d (instead of %d)", GetNumRanks() - 1, t.exeDevice.exeRank});
continue;
}
executors.insert(t.exeDevice);
transferCount[t.exeDevice]++;
int numExecutors = GetNumExecutors(t.exeDevice.exeType, t.exeDevice.exeRank);
switch (t.exeDevice.exeType) {
case EXE_CPU:
if (t.exeDevice.exeIndex < 0 || t.exeDevice.exeIndex >= numCpus) if (t.exeDevice.exeIndex < 0 || t.exeDevice.exeIndex >= numExecutors)
errors.push_back({ERR_FATAL,
"Transfer %d: CPU index must be between 0 and %d (instead of %d)", "Transfer %d: CPU index must be between 0 and %d (instead of %d) for rank %d",
i, numCpus - 1, t.exeDevice.exeIndex}); i, numExecutors - 1, t.exeDevice.exeIndex, t.exeDevice.exeRank});
break;
case EXE_GPU_GFX:
if (t.exeDevice.exeIndex < 0 || t.exeDevice.exeIndex >= numGpus) { if (t.exeDevice.exeIndex < 0 || t.exeDevice.exeIndex >= numExecutors) {
errors.push_back({ERR_FATAL,
"Transfer %d: GFX index must be between 0 and %d (instead of %d)", "Transfer %d: GFX index must be between 0 and %d (instead of %d) for rank %d",
i, numGpus - 1, t.exeDevice.exeIndex}); i, numExecutors - 1, t.exeDevice.exeIndex, t.exeDevice.exeRank});
} else {
if (t.exeSubIndex != -1) {
#if defined(__NVCC__)
...@@ -1196,7 +1865,7 @@ namespace {
int numSubIndices = GetNumExecutorSubIndices(t.exeDevice);
if (t.exeSubIndex >= numSubIndices)
errors.push_back({ERR_FATAL,
"Transfer %d: GFX subIndex (XCC) must be between 0 and %d", i, numSubIndices - 1}); "Transfer %d: GFX subIndex (XCC) must be between 0 and %d for rank %d", i, numSubIndices - 1, t.exeDevice.exeRank});
#endif
}
}
...@@ -1207,10 +1876,10 @@ namespace {
"Transfer %d: DMA executor must have exactly 1 source and 1 destination", i});
}
if (t.exeDevice.exeIndex < 0 || t.exeDevice.exeIndex >= numGpus) { if (t.exeDevice.exeIndex < 0 || t.exeDevice.exeIndex >= numExecutors) {
errors.push_back({ERR_FATAL,
"Transfer %d: DMA index must be between 0 and %d (instead of %d)", "Transfer %d: DMA index must be between 0 and %d (instead of %d) for rank %d",
i, numGpus - 1, t.exeDevice.exeIndex}); i, numExecutors - 1, t.exeDevice.exeIndex, t.exeDevice.exeRank});
// Cannot proceed with any further checks
continue;
}
...@@ -1230,12 +1899,12 @@ namespace {
// Check that engine Id exists between agents
hsa_agent_t srcAgent, dstAgent;
ErrResult err;
err = GetHsaAgent(t.srcs[0], srcAgent); err = System::Get().GetHsaAgent(t.srcs[0], srcAgent);
if (err.errType != ERR_NONE) {
errors.push_back(err);
if (err.errType == ERR_FATAL) break;
}
err = GetHsaAgent(t.dsts[0], dstAgent); err = System::Get().GetHsaAgent(t.dsts[0], dstAgent);
if (err.errType != ERR_NONE) {
errors.push_back(err);
if (err.errType == ERR_FATAL) break;
...@@ -1268,46 +1937,84 @@ namespace {
if (IsGpuMemType(t.srcs[0].memType)) {
if (t.srcs[0].memIndex != t.exeDevice.exeIndex) {
errors.push_back({ERR_WARN,
"Transfer %d: DMA executor will automatically switch to using the source memory device (%d) not (%d)", "Transfer %d: DMA executor may automatically switch to using the source memory device (%d) not (%d)",
i, t.srcs[0].memIndex, t.exeDevice.exeIndex}); i, t.srcs[0].memIndex, t.exeDevice.exeIndex});
}
} else if (t.dsts[0].memIndex != t.exeDevice.exeIndex) {
errors.push_back({ERR_WARN,
"Transfer %d: DMA executor will automatically switch to using the destination memory device (%d) not (%d)", "Transfer %d: DMA executor may automatically switch to using the destination memory device (%d) not (%d)",
i, t.dsts[0].memIndex, t.exeDevice.exeIndex}); i, t.dsts[0].memIndex, t.exeDevice.exeIndex});
}
}
break;
case EXE_NIC: case EXE_NIC: case EXE_NIC_NEAREST:
#ifdef NIC_EXEC_ENABLED
{
int srcIndex = t.exeDevice.exeIndex; // NIC Executors can only execute a copy operation
int dstIndex = t.exeSubIndex; if (t.srcs.size() != 1 || t.dsts.size() != 1) {
if (srcIndex < 0 || srcIndex >= numNics) errors.push_back({ERR_FATAL, "Transfer %d: NIC executor requires single SRC and single DST", i});
errors.push_back({ERR_FATAL, "Transfer %d: src NIC executor indexes an out-of-range NIC (%d)", i, srcIndex}); break;
if (dstIndex < 0 || dstIndex >= numNics)
errors.push_back({ERR_FATAL, "Transfer %d: dst NIC executor indexes an out-of-range NIC (%d)", i, dstIndex});
} }
#else
errors.push_back({ERR_FATAL, "Transfer %d: NIC executor is requested but is not available", i}); // NIC executor cannot do remote read + remote write - either src or dst must be local
#endif int srcExeRank = t.exeDevice.exeRank;
int srcMemRank = t.srcs[0].memRank;
int dstMemRank = t.dsts[0].memRank;
int dstExeRank = (srcExeRank == srcMemRank ? dstMemRank : srcMemRank);
if (srcMemRank != srcExeRank && dstMemRank != srcExeRank) {
errors.push_back({ERR_FATAL,
"Transfer %d: NIC executor rank (%d) must be same as SRC memory rank (%d) or DST memory rank (%d)", i, srcExeRank, srcMemRank, dstMemRank});
break; break;
case EXE_NIC_NEAREST: }
#ifdef NIC_EXEC_ENABLED
{ // The SRC NIC executor is the one that initiates either a (remote read/local write) or (local read/remote write) copy operation
ExeDevice srcExeDevice;
ErrResult errSrc = GetActualExecutor(cfg, t.exeDevice, srcExeDevice); ErrResult errSrc = GetActualExecutor(t.exeDevice, srcExeDevice);
if (errSrc.errType != ERR_NONE) errors.push_back(errSrc);
// Check that the SRC NIC exists and is active
if (srcExeDevice.exeIndex < 0 || srcExeDevice.exeIndex >= GetNumExecutors(EXE_NIC, srcExeRank)) {
errors.push_back({ERR_FATAL, "Transfer %d: Rank %d SRC NIC executor indexes an out-of-range NIC (%d). Detected %d NICs",
i, srcExeRank, srcExeDevice.exeIndex, GetNumExecutors(EXE_NIC, srcExeRank)});
} else if (!NicIsActive(srcExeDevice.exeIndex, srcExeDevice.exeRank)) {
errors.push_back({ERR_FATAL, "Transfer %d: Rank %d SRC NIC executor %d is not active", i, srcExeDevice.exeRank, srcExeDevice.exeIndex});
}
// The DST NIC executor facilitates the copy but issues no commands
ExeDevice dstOrgDevice = {t.exeDevice.exeType, t.exeSubIndex, dstExeRank, t.exeSubSlot};
ExeDevice dstExeDevice;
ErrResult errDst = GetActualExecutor(cfg, {t.exeDevice.exeType, t.exeSubIndex}, dstExeDevice); ErrResult errDst = GetActualExecutor(dstOrgDevice, dstExeDevice);
if (errDst.errType != ERR_NONE) errors.push_back(errDst);
// Check that the DST NIC exists and is active
if (dstExeDevice.exeIndex < 0 || dstExeDevice.exeIndex >= GetNumExecutors(EXE_NIC, dstExeRank)) {
errors.push_back({ERR_FATAL, "Transfer %d: Rank %d DST NIC executor indexes an out-of-range NIC (%d). Detected %d NICs",
i, dstExeRank, dstExeDevice.exeIndex, GetNumExecutors(EXE_NIC, dstExeRank)});
} else if (!NicIsActive(dstExeDevice.exeIndex, dstExeDevice.exeRank)) {
errors.push_back({ERR_FATAL, "Transfer %d: Rank %d DST NIC executor %d is not active", i, dstExeDevice.exeRank, dstExeDevice.exeIndex});
}
}
#else
errors.push_back({ERR_FATAL, "Transfer %d: NIC executor is requested but is not available", i}); errors.push_back({ERR_FATAL, "Transfer %d: NIC executor is requested but is not available.", i});
#endif
break;
}
// Check for multi-node support
// Currently this is not supported for CPU/GPU executors
if (IsCpuExeType(t.exeDevice.exeType) || IsGpuExeType(t.exeDevice.exeType)) {
bool crossRank = false;
for (auto const& src : t.srcs) {
crossRank |= (src.memRank != t.exeDevice.exeRank);
}
for (auto const& dst : t.dsts) {
crossRank |= (dst.memRank != t.exeDevice.exeRank);
}
if (crossRank) {
errors.push_back({ERR_FATAL, "Transfer %d: Executor on rank %d can not access memory across ranks\n",
i, t.exeDevice.exeRank});
}
}
// Check subexecutors
if (t.numSubExecs <= 0)
errors.push_back({ERR_FATAL, "Transfer %d: # of subexecutors must be positive", i});
...@@ -1462,8 +2169,9 @@ namespace {
ibv_mr* srcMemRegion; ///< Memory region for SRC
ibv_mr* dstMemRegion; ///< Memory region for DST
uint8_t qpCount; ///< Number of QPs to be used for transferring data
vector<ibv_sge> sgePerQueuePair; ///< Scatter-gather elements per queue pair bool srcIsExeNic; ///< Whether SRC or DST NIC initiates traffic
vector<ibv_send_wr> sendWorkRequests; ///< Send work requests per queue pair vector<vector<ibv_sge>> sgePerQueuePair; ///< Scatter-gather elements per queue pair
vector<vector<ibv_send_wr>>sendWorkRequests; ///< Send work requests per queue pair
#endif
// Counters
...@@ -1651,9 +2359,17 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
int numIbvDevices = 0;
ibv_device** deviceList = ibv_get_device_list(&numIbvDevices);
// Check for NIC_FILTER
// By default, accept all NIC names
std::string nicFilterPattern = getenv("NIC_FILTER") ? getenv("NIC_FILTER") : ".*";
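// Illustrative usage: NIC_FILTER='mlx5_[0-3]' would restrict enumeration to devices
// named mlx5_0 through mlx5_3 (the device names are assumptions for illustration).
// Note that std::regex_match below requires the pattern to match the entire device
// name, not just a substring.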
if (deviceList && numIbvDevices > 0) {
// Loop over each device to collect information
for (int i = 0; i < numIbvDevices; i++) {
// Filter by name
if (!std::regex_match(deviceList[i]->name, std::regex(nicFilterPattern))) continue;
IbvDevice ibvDevice;
ibvDevice.devicePtr = deviceList[i];
ibvDevice.name = deviceList[i]->name;
...@@ -1730,7 +2446,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
//========================================================================================
// Prints off PCIe tree
static void PrintPCIeTree(PCIeNode const& node, static inline void PrintPCIeTree(PCIeNode const& node,
std::string const& prefix = "",
bool isLast = true)
{
...@@ -1781,7 +2497,6 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
// Build PCIe tree on first use
if (!isInitialized) {
// Add NICs to the tree
int numNics = GetNumExecutors(EXE_NIC);
auto const& ibvDeviceList = GetIbvDeviceList();
for (IbvDevice const& ibvDevice : ibvDeviceList) {
if (!ibvDevice.hasActivePort || ibvDevice.busId == "") continue;
...@@ -1789,7 +2504,8 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
}
// Add GPUs to the tree
int numGpus = GetNumExecutors(EXE_GPU_GFX); int numGpus = 0;
if (hipGetDeviceCount(&numGpus) != hipSuccess) numGpus = 0;
for (int i = 0; i < numGpus; ++i) {
char hipPciBusId[64];
if (hipDeviceGetPCIBusId(hipPciBusId, sizeof(hipPciBusId), i) == hipSuccess) {
...@@ -1960,12 +2676,20 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
return ERR_NONE;
}
// Structure used to exchange connection information
struct __attribute__((packed)) ConnInfo
{
uint16_t lid; // Local routing id
ibv_gid gid; // Global routing id (RoCE)
int gidIdx; // Global routing id index (RoCE)
uint32_t qpn; // Queue pair number
uint32_t rkey; // Remote memory access key
uint64_t vaddr; // Remote virtual address of the memory region
};
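// Sketch of how a ConnInfo is filled and exchanged (field names come from the struct
// above; peerRank and the qp/mr/portAttr handles are assumptions for illustration):
//
//   ConnInfo local = {};
//   local.lid    = portAttr.lid;
//   local.gid    = gid;
//   local.gidIdx = gidIdx;
//   local.qpn    = qp->qp_num;
//   local.rkey   = mr->rkey;
//   local.vaddr  = (uint64_t)mr->addr;
//   ConnInfo remote = {};
//   System::Get().SendData(peerRank, sizeof(local), &local);
//   System::Get().RecvData(peerRank, sizeof(remote), &remote);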
// Transition QueuePair to Ready to Receive State
static ErrResult TransitionQpToRtr(ibv_qp* qp, static ErrResult TransitionQpToRtr(ibv_qp* qp,
uint16_t const& dlid, ConnInfo const& connInfo,
uint32_t const& dqpn,
ibv_gid const& gid,
uint8_t const& gidIndex,
uint8_t const& port,
bool const& isRoCE,
ibv_mtu const& mtu)
...@@ -1979,19 +2703,19 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
attr.min_rnr_timer = 12;
if (isRoCE) {
attr.ah_attr.is_global = 1;
attr.ah_attr.grh.dgid.global.subnet_prefix = gid.global.subnet_prefix; attr.ah_attr.grh.dgid.global.subnet_prefix = connInfo.gid.global.subnet_prefix;
attr.ah_attr.grh.dgid.global.interface_id = gid.global.interface_id; attr.ah_attr.grh.dgid.global.interface_id = connInfo.gid.global.interface_id;
attr.ah_attr.grh.flow_label = 0;
attr.ah_attr.grh.sgid_index = gidIndex; attr.ah_attr.grh.sgid_index = connInfo.gidIdx;
attr.ah_attr.grh.hop_limit = 255;
} else {
attr.ah_attr.is_global = 0;
attr.ah_attr.dlid = dlid; attr.ah_attr.dlid = connInfo.lid;
}
attr.ah_attr.sl = 0;
attr.ah_attr.src_path_bits = 0;
attr.ah_attr.port_num = port;
attr.dest_qp_num = dqpn; attr.dest_qp_num = connInfo.qpn;
// Modify the QP
int ret = ibv_modify_qp(qp, &attr,
...@@ -2033,38 +2757,32 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
}
static ErrResult PrepareNicTransferResources(ConfigOptions const& cfg,
ExeDevice const& srcExeDevice, ExeDevice const& nicExeDevice,
Transfer const& t,
TransferResources& rss)
{
// Switch to the closest NUMA node to this NIC // The NIC executor is the one that initiates either a (remote read/local write) or (local read/remote write) copy operation
int numaNode = GetIbvDeviceList()[srcExeDevice.exeIndex].numaNode; // The NON executor is the NIC executor that facilitates the copy but issues no commands
if (numaNode != -1) // TransferResources will be mostly prepared only on the ranks that are involved in this transfer, although all ranks pass
numa_run_on_node(numaNode); // through this code
int const srcMemRank = t.srcs[0].memRank;
int const port = cfg.nic.ibPort; int const dstMemRank = t.dsts[0].memRank;
int const nicExeRank = nicExeDevice.exeRank;
// Figure out destination NIC (Accounts for possible remap due to use of EXE_NIC_NEAREST) int const nonExeRank = (nicExeRank == srcMemRank ? dstMemRank : srcMemRank);
ExeDevice dstExeDevice; rss.srcIsExeNic = (srcMemRank == nicExeRank);
ERR_CHECK(GetActualExecutor(cfg, {t.exeDevice.exeType, t.exeSubIndex}, dstExeDevice));
// Figure out the NON executor (accounts for possible remap due to use of EXE_NIC_NEAREST)
rss.srcNicIndex = srcExeDevice.exeIndex; ExeDevice nonOrgDevice = {t.exeDevice.exeType, t.exeSubIndex, nonExeRank, t.exeSubSlot};
rss.dstNicIndex = dstExeDevice.exeIndex; ExeDevice nonExeDevice;
ERR_CHECK(GetActualExecutor(nonOrgDevice, nonExeDevice));
// All ranks track which NIC was used and number of queue pairs used
rss.srcNicIndex = (nicExeRank == srcMemRank ? nicExeDevice.exeIndex : nonExeDevice.exeIndex);
rss.dstNicIndex = (nicExeRank == srcMemRank ? nonExeDevice.exeIndex : nicExeDevice.exeIndex);
rss.qpCount = t.numSubExecs;
// Check for valid NICs and active ports // Establish memory access flags
int numNics = GetNumExecutors(EXE_NIC);
if (rss.srcNicIndex < 0 || rss.srcNicIndex >= numNics)
return {ERR_FATAL, "SRC NIC index is out of range (%d)", rss.srcNicIndex};
if (rss.dstNicIndex < 0 || rss.dstNicIndex >= numNics)
return {ERR_FATAL, "DST NIC index is out of range (%d)", rss.dstNicIndex};
if (!GetIbvDeviceList()[rss.srcNicIndex].hasActivePort)
return {ERR_FATAL, "SRC NIC %d is not active\n", rss.srcNicIndex};
if (!GetIbvDeviceList()[rss.dstNicIndex].hasActivePort)
return {ERR_FATAL, "DST NIC %d is not active\n", rss.dstNicIndex};
// Queue pair flags
unsigned int rdmaAccessFlags = (IBV_ACCESS_LOCAL_WRITE |
IBV_ACCESS_REMOTE_READ |
IBV_ACCESS_REMOTE_WRITE |
...@@ -2073,129 +2791,221 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
unsigned int rdmaMemRegFlags = rdmaAccessFlags;
if (cfg.nic.useRelaxedOrder) rdmaMemRegFlags |= IBV_ACCESS_RELAXED_ORDERING;
// Open NIC contexts int const port = cfg.nic.ibPort;
IBV_PTR_CALL(rss.srcContext, ibv_open_device, GetIbvDeviceList()[rss.srcNicIndex].devicePtr);
IBV_PTR_CALL(rss.dstContext, ibv_open_device, GetIbvDeviceList()[rss.dstNicIndex].devicePtr);
// Open protection domains // Prepare NIC on SRC mem rank
int srcGidIndex = cfg.nic.ibGidIndex;
bool srcIsRoCE = false;
if (GetRank() == srcMemRank) {
// Switch to closest CPU NUMA domain
int numaNode = GetIbvDeviceList()[rss.srcNicIndex].numaNode;
if (numaNode != -1)
numa_run_on_node(numaNode);
// Open SRC NIC context
IBV_PTR_CALL(rss.srcContext, ibv_open_device, GetIbvDeviceList()[rss.srcNicIndex].devicePtr);
// Open SRC protection domain
IBV_PTR_CALL(rss.srcProtect, ibv_alloc_pd, rss.srcContext);
IBV_PTR_CALL(rss.dstProtect, ibv_alloc_pd, rss.dstContext); // Register SRC memory region
// Register memory region
IBV_PTR_CALL(rss.srcMemRegion, ibv_reg_mr, rss.srcProtect, rss.srcMem[0], rss.numBytes, rdmaMemRegFlags);
IBV_PTR_CALL(rss.dstMemRegion, ibv_reg_mr, rss.dstProtect, rss.dstMem[0], rss.numBytes, rdmaMemRegFlags); // Create SRC completion queues
// Create completion queues
IBV_PTR_CALL(rss.srcCompQueue, ibv_create_cq, rss.srcContext, cfg.nic.queueSize, NULL, NULL, 0);
IBV_PTR_CALL(rss.dstCompQueue, ibv_create_cq, rss.dstContext, cfg.nic.queueSize, NULL, NULL, 0); // Get SRC port attributes
// Get port attributes
IBV_CALL(ibv_query_port, rss.srcContext, port, &rss.srcPortAttr);
IBV_CALL(ibv_query_port, rss.dstContext, port, &rss.dstPortAttr);
if (rss.srcPortAttr.link_layer != rss.dstPortAttr.link_layer)
return {ERR_FATAL, "SRC NIC (%d) and DST NIC (%d) do not have the same link layer", rss.srcNicIndex, rss.dstNicIndex};
// Prepare GID index
int srcGidIndex = cfg.nic.ibGidIndex;
int dstGidIndex = cfg.nic.ibGidIndex;
// Check for RDMA over Converged Ethernet (RoCE) and update GID index appropriately
bool isRoCE = (rss.srcPortAttr.link_layer == IBV_LINK_LAYER_ETHERNET); srcIsRoCE = (rss.srcPortAttr.link_layer == IBV_LINK_LAYER_ETHERNET);
if (isRoCE) { if (srcIsRoCE) {
// Try to auto-detect the GID index
std::pair<int, std::string> srcGidInfo (srcGidIndex, "");
std::pair<int, std::string> dstGidInfo (dstGidIndex, ""); ERR_CHECK(GetGidIndex(rss.srcContext, rss.srcPortAttr.gid_tbl_len, port, srcGidInfo));
ERR_CHECK(GetGidIndex(rss.srcContext, rss.srcPortAttr.gid_tbl_len, cfg.nic.ibPort, srcGidInfo));
ERR_CHECK(GetGidIndex(rss.dstContext, rss.dstPortAttr.gid_tbl_len, cfg.nic.ibPort, dstGidInfo));
srcGidIndex = srcGidInfo.first;
dstGidIndex = dstGidInfo.first;
IBV_CALL(ibv_query_gid, rss.srcContext, port, srcGidIndex, &rss.srcGid);
IBV_CALL(ibv_query_gid, rss.dstContext, port, dstGidIndex, &rss.dstGid);
}
// Prepare queue pairs and send elements
rss.srcQueuePairs.resize(rss.qpCount);
rss.dstQueuePairs.resize(rss.qpCount); for (int i = 0; i < rss.qpCount; i++) {
rss.sgePerQueuePair.resize(rss.qpCount); // Create SRC queue pair
rss.sendWorkRequests.resize(rss.qpCount);
for (int i = 0; i < rss.qpCount; ++i) {
// Create scatter-gather element for the portion of memory assigned to this queue pair
ibv_sge sg = {};
sg.addr = (uint64_t)rss.subExecParamCpu[i].src[0];
sg.length = rss.subExecParamCpu[i].N * sizeof(float);
sg.lkey = rss.srcMemRegion->lkey;
rss.sgePerQueuePair[i] = sg;
// Create send work request
ibv_send_wr wr = {};
wr.wr_id = i;
wr.sg_list = &rss.sgePerQueuePair[i];
wr.num_sge = 1;
wr.opcode = IBV_WR_RDMA_WRITE;
wr.send_flags = IBV_SEND_SIGNALED;
wr.wr.rdma.remote_addr = (uint64_t)rss.subExecParamCpu[i].dst[0];
wr.wr.rdma.rkey = rss.dstMemRegion->rkey;
rss.sendWorkRequests[i] = wr;
// Create SRC/DST queue pairs
ERR_CHECK(CreateQueuePair(cfg, rss.srcProtect, rss.srcCompQueue, rss.srcQueuePairs[i])); ERR_CHECK(CreateQueuePair(cfg, rss.srcProtect, rss.srcCompQueue, rss.srcQueuePairs[i]));
ERR_CHECK(CreateQueuePair(cfg, rss.dstProtect, rss.dstCompQueue, rss.dstQueuePairs[i])); // Initialize SRC queue pairs
ERR_CHECK(InitQueuePair(rss.srcQueuePairs[i], port, rdmaAccessFlags));
}
}
// Prepare NIC on DST mem rank
int dstGidIndex = cfg.nic.ibGidIndex;
bool dstIsRoCE = false;
if (GetRank() == dstMemRank) {
// Switch to closest CPU NUMA domain
int numaNode = GetIbvDeviceList()[rss.dstNicIndex].numaNode;
if (numaNode != -1)
numa_run_on_node(numaNode);
// Open DST NIC contexts
IBV_PTR_CALL(rss.dstContext, ibv_open_device, GetIbvDeviceList()[rss.dstNicIndex].devicePtr);
// Open DST protection domain
IBV_PTR_CALL(rss.dstProtect, ibv_alloc_pd, rss.dstContext);
// Register DST memory region
IBV_PTR_CALL(rss.dstMemRegion, ibv_reg_mr, rss.dstProtect, rss.dstMem[0], rss.numBytes, rdmaMemRegFlags);
// Create DST completion queues
IBV_PTR_CALL(rss.dstCompQueue, ibv_create_cq, rss.dstContext, cfg.nic.queueSize, NULL, NULL, 0);
// Get DST port attributes
IBV_CALL(ibv_query_port, rss.dstContext, port, &rss.dstPortAttr);
// Check for RDMA over Converged Ethernet (RoCE) and update GID index appropriately
dstIsRoCE = (rss.dstPortAttr.link_layer == IBV_LINK_LAYER_ETHERNET);
if (dstIsRoCE) {
// Try to auto-detect the GID index
std::pair<int, std::string> dstGidInfo (dstGidIndex, "");
ERR_CHECK(GetGidIndex(rss.dstContext, rss.dstPortAttr.gid_tbl_len, port, dstGidInfo));
dstGidIndex = dstGidInfo.first;
IBV_CALL(ibv_query_gid, rss.dstContext, port, dstGidIndex, &rss.dstGid);
}
// Prepare queue pairs
rss.dstQueuePairs.resize(rss.qpCount);
for (int i = 0; i < rss.qpCount; i++) {
// Create DST queue pair
ERR_CHECK(CreateQueuePair(cfg, rss.dstProtect, rss.dstCompQueue, rss.dstQueuePairs[i]));
      // Initialize DST queue pairs
      ERR_CHECK(InitQueuePair(rss.dstQueuePairs[i], port, rdmaAccessFlags));
}
}
  // Executor rank prepares send elements and work requests
  if (GetRank() == nicExeRank) {
    rss.sgePerQueuePair.resize(rss.qpCount);
    rss.sendWorkRequests.resize(rss.qpCount);
  }

  // Broadcast SRC/DST port link_layer so that all ranks can compare them
  System::Get().Broadcast(srcMemRank, sizeof(rss.srcPortAttr.link_layer), &rss.srcPortAttr.link_layer);
  System::Get().Broadcast(dstMemRank, sizeof(rss.dstPortAttr.link_layer), &rss.dstPortAttr.link_layer);
  if (rss.srcPortAttr.link_layer != rss.dstPortAttr.link_layer) {
    printf("[ERROR] Link layers do not match (%d vs %d)\n", rss.srcPortAttr.link_layer, rss.dstPortAttr.link_layer);
    return {ERR_FATAL, "SRC NIC (%d) [Rank %d] and DST NIC (%d) [Rank %d] do not have the same link layer [%d vs %d]",
            rss.srcNicIndex, srcMemRank, rss.dstNicIndex, dstMemRank, rss.srcPortAttr.link_layer, rss.dstPortAttr.link_layer};
  }
ConnInfo dstConnInfo = {};
ConnInfo srcConnInfo = {};
for (int i = 0; i < rss.qpCount; i++) {
// Prepare and exchange SRC connection information
if (GetRank() == srcMemRank) {
srcConnInfo.lid = rss.srcPortAttr.lid;
srcConnInfo.gid = rss.srcGid;
srcConnInfo.gidIdx = srcGidIndex;
srcConnInfo.qpn = rss.srcQueuePairs[i]->qp_num;
srcConnInfo.rkey = rss.srcMemRegion->rkey;
srcConnInfo.vaddr = (uint64_t)rss.subExecParamCpu[i].src[0];
}
System::Get().Broadcast(srcMemRank, sizeof(srcConnInfo), &srcConnInfo);
// Prepare and exchange DST connection information
if (GetRank() == dstMemRank) {
dstConnInfo.lid = rss.dstPortAttr.lid;
dstConnInfo.gid = rss.dstGid;
dstConnInfo.gidIdx = dstGidIndex;
dstConnInfo.qpn = rss.dstQueuePairs[i]->qp_num;
dstConnInfo.rkey = rss.dstMemRegion->rkey;
dstConnInfo.vaddr = (uint64_t)rss.subExecParamCpu[i].dst[0];
}
System::Get().Broadcast(dstMemRank, sizeof(dstConnInfo), &dstConnInfo);
// Move queue pairs to ready-to-receive (RTR), using exchanged connection info
    // Then move them to ready-to-send (RTS)
if (GetRank() == srcMemRank) {
ERR_CHECK(TransitionQpToRtr(rss.srcQueuePairs[i], dstConnInfo, port, srcIsRoCE, rss.srcPortAttr.active_mtu));
ERR_CHECK(TransitionQpToRts(rss.srcQueuePairs[i])); ERR_CHECK(TransitionQpToRts(rss.srcQueuePairs[i]));
}
if (GetRank() == dstMemRank) {
ERR_CHECK(TransitionQpToRtr(rss.dstQueuePairs[i], srcConnInfo, port, dstIsRoCE, rss.dstPortAttr.active_mtu));
ERR_CHECK(TransitionQpToRts(rss.dstQueuePairs[i]));
}
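    // NOTE (standard ibverbs queue pair state machine): each reliable-connected QP
    // must walk RESET -> INIT -> RTR -> RTS. The RTR transition consumes the peer's
    // queue pair number, LID/GID and MTU, which is why the ConnInfo broadcasts above
    // must complete before these transitions are attempted.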
    // Prepare scatter-gather element / work request for this queue pair in advance
    if (GetRank() == nicExeRank) {
      // Process the data to transfer in chunks (of cfg.nic.chunkBytes)
      size_t remaining = rss.subExecParamCpu[i].N * sizeof(float);
      size_t const numChunks = (remaining + cfg.nic.chunkBytes - 1) / cfg.nic.chunkBytes;
uint8_t* local = (nicExeRank == srcMemRank ? (uint8_t*)rss.subExecParamCpu[i].src[0]
: (uint8_t*)rss.subExecParamCpu[i].dst[0]);
auto const opcode = (nicExeRank == srcMemRank ? IBV_WR_RDMA_WRITE : IBV_WR_RDMA_READ);
uint64_t remote = (nicExeRank == srcMemRank ? dstConnInfo.vaddr : srcConnInfo.vaddr);
auto const lkey = (nicExeRank == srcMemRank ? rss.srcMemRegion->lkey : rss.dstMemRegion->lkey);
auto const rkey = (nicExeRank == srcMemRank ? dstConnInfo.rkey : srcConnInfo.rkey);
if (System::Get().IsVerbose()) {
printf("[INFO] Transfer %d SubExec %d executed by rank %d NIC %d is %s with %lu chunks\n",
rss.transferIdx, i, nicExeRank, nicExeDevice.exeIndex,
(opcode == IBV_WR_RDMA_WRITE ? "remote write" : "remote read"),
numChunks);
}
rss.sgePerQueuePair[i].resize(numChunks, {});
rss.sendWorkRequests[i].resize(numChunks, {});
for (size_t chunkIdx = 0; chunkIdx < numChunks; chunkIdx++) {
bool isLastChunk = (chunkIdx == numChunks - 1);
size_t currChunkBytes = isLastChunk ? remaining : cfg.nic.chunkBytes;
// Prepare scatter gather element
ibv_sge& sg = rss.sgePerQueuePair[i][chunkIdx];
sg.length = currChunkBytes;
sg.addr = (uintptr_t)local;
sg.lkey = lkey;
// Prepare work request
ibv_send_wr& wr = rss.sendWorkRequests[i][chunkIdx];
wr.wr_id = i;
wr.sg_list = &rss.sgePerQueuePair[i][chunkIdx];
wr.num_sge = 1;
wr.send_flags = isLastChunk ? IBV_SEND_SIGNALED : 0; // Only last chunk is signalled
wr.opcode = opcode;
wr.wr.rdma.remote_addr = remote;
wr.wr.rdma.rkey = rkey;
        if (System::Get().IsVerbose()) {
          printf("[INFO] Transfer %d SubExec %d chunk %lu local %p remote %p of size %lu\n",
                 rss.transferIdx, i, chunkIdx, (void*)local, (void*)remote, currChunkBytes);
        }
// Increment locations
remaining -= currChunkBytes;
local += currChunkBytes;
remote += currChunkBytes;
}
}
}
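  // Worked example (illustrative values only): with cfg.nic.chunkBytes = 1 MiB and a
  // sub-executor assigned N = 1M floats (4 MiB), numChunks = (4 MiB + 1 MiB - 1) / 1 MiB = 4.
  // The first three work requests carry 1 MiB each with send_flags = 0; only the final
  // chunk is posted with IBV_SEND_SIGNALED, so a single completion covers the whole QP.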
  return ERR_NONE;
}

static ErrResult TeardownNicTransferResources(TransferResources& rss, Transfer const& t)
{
  bool isSrcRank = (GetRank() == t.srcs[0].memRank);
  bool isDstRank = (GetRank() == t.dsts[0].memRank);

  // Deregister memory regions
  if (isSrcRank) IBV_CALL(ibv_dereg_mr, rss.srcMemRegion);
  if (isDstRank) IBV_CALL(ibv_dereg_mr, rss.dstMemRegion);

  // Destroy queue pairs
  if (isSrcRank) {
    for (auto srcQueuePair : rss.srcQueuePairs)
      IBV_CALL(ibv_destroy_qp, srcQueuePair);
    rss.srcQueuePairs.clear();
  }
  if (isDstRank) {
    for (auto dstQueuePair : rss.dstQueuePairs)
      IBV_CALL(ibv_destroy_qp, dstQueuePair);
    rss.dstQueuePairs.clear();
  }

  // Destroy completion queues
  if (isSrcRank) IBV_CALL(ibv_destroy_cq, rss.srcCompQueue);
  if (isDstRank) IBV_CALL(ibv_destroy_cq, rss.dstCompQueue);

  // Deallocate protection domains
  if (isSrcRank) IBV_CALL(ibv_dealloc_pd, rss.srcProtect);
  if (isDstRank) IBV_CALL(ibv_dealloc_pd, rss.dstProtect);

  // Destroy context
  if (isSrcRank) IBV_CALL(ibv_close_device, rss.srcContext);
  if (isDstRank) IBV_CALL(ibv_close_device, rss.dstContext);

  return ERR_NONE;
}
@@ -2351,6 +3161,8 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
  float const* expected = dstReference[t.srcs.size()].data();
  for (int dstIdx = 0; dstIdx < rss->dstMem.size(); dstIdx++) {
    // Validation is only done on the rank the destination memory is on
    if (t.dsts[dstIdx].memRank != GetRank()) continue;

    if (IsCpuMemType(t.dsts[dstIdx].memType) || cfg.data.validateDirect) {
      output = (rss->dstMem[dstIdx]) + initOffset;
    } else {
@@ -2363,8 +3175,8 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
      // Difference found - find first error
      for (size_t i = 0; i < N; i++) {
        if (output[i] != expected[i]) {
          return {ERR_FATAL, "Transfer %d: Unexpected mismatch at index %lu of destination %d on rank %d: Expected %10.5f Actual: %10.5f",
                  transferIdx, i, dstIdx, t.dsts[dstIdx].memRank, expected[i], output[i]};
        }
      }
      return {ERR_FATAL, "Transfer %d: Unexpected output mismatch for destination %d", transferIdx, dstIdx};
@@ -2392,7 +3204,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
  int const initOffset = cfg.data.byteOffset / sizeof(float);
  int const targetMultiple = cfg.data.blockBytes / sizeof(float);

  // In some cases, there may not be enough data for all subExecutors
  int const maxSubExecToUse = std::min((size_t)(N + targetMultiple - 1) / targetMultiple,
                                       (size_t)transfer.numSubExecs);
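  // Example (illustrative values): with blockBytes = 256 (targetMultiple = 64 floats)
  // and N = 100 floats, ceil(100 / 64) = 2, so at most 2 subexecutors receive work
  // even if the Transfer requested more.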
@@ -2461,39 +3273,70 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
                               ExeInfo& exeInfo)
{
  exeInfo.totalDurationMsec = 0.0;

  int const localRank = GetRank();
  if (System::Get().IsVerbose()) {
    printf("[INFO] Rank %d preparing executor (%c%d on Rank %d)\n",
           localRank, ExeTypeStr[exeDevice.exeType], exeDevice.exeIndex, exeDevice.exeRank);
  }
  // Loop over each transfer this executor is involved in
  for (auto& rss : exeInfo.resources) {
    Transfer const& t = transfers[rss.transferIdx];
    rss.numBytes = t.numBytes;
    if (System::Get().IsVerbose()) {
      printf("[INFO] Rank %d preparing transfer %d (%lu SRC %lu DST)\n",
             localRank, rss.transferIdx, t.srcs.size(), t.dsts.size());
    }
    // Allocate source memory
    rss.srcMem.resize(t.srcs.size());
    for (int iSrc = 0; iSrc < t.srcs.size(); ++iSrc) {
      MemDevice const& srcMemDevice = t.srcs[iSrc];
      // Ensure executing GPU can access source memory
      // This only applies to memory being accessed by a local GPU executor
      if (IsGpuExeType(exeDevice.exeType) &&
          IsGpuMemType(srcMemDevice.memType) &&
          srcMemDevice.memRank == localRank &&
          exeDevice.exeRank == localRank &&
          srcMemDevice.memIndex != exeDevice.exeIndex) {
        ERR_CHECK(EnablePeerAccess(exeDevice.exeIndex, srcMemDevice.memIndex));
      }

      // Allocate source memory (on the correct rank)
      if (srcMemDevice.memRank == localRank) {
        ERR_CHECK(AllocateMemory(srcMemDevice, t.numBytes + cfg.data.byteOffset, (void**)&rss.srcMem[iSrc]));
      }
      // Pass this pointer to all ranks (used for pointer arithmetic; not dereferenced on non-local ranks)
      System::Get().Broadcast(srcMemDevice.memRank, sizeof(rss.srcMem[iSrc]), &rss.srcMem[iSrc]);
    }
    // Allocate destination memory
    rss.dstMem.resize(t.dsts.size());
    for (int iDst = 0; iDst < t.dsts.size(); ++iDst) {
      MemDevice const& dstMemDevice = t.dsts[iDst];
      // Ensure executing GPU can access destination memory
      if (IsGpuExeType(exeDevice.exeType) &&
          IsGpuMemType(dstMemDevice.memType) &&
          dstMemDevice.memRank == localRank &&
          exeDevice.exeRank == localRank &&
          dstMemDevice.memIndex != exeDevice.exeIndex) {
        ERR_CHECK(EnablePeerAccess(exeDevice.exeIndex, dstMemDevice.memIndex));
      }

      // Allocate destination memory (on the correct rank)
      if (dstMemDevice.memRank == localRank) {
        ERR_CHECK(AllocateMemory(dstMemDevice, t.numBytes + cfg.data.byteOffset, (void**)&rss.dstMem[iDst]));
      }
      // Pass this pointer to all ranks (used for pointer arithmetic; not dereferenced on non-local ranks)
      System::Get().Broadcast(dstMemDevice.memRank, sizeof(rss.dstMem[iDst]), &rss.dstMem[iDst]);
    }
    // Prepare HSA DMA copy specific resources
    if (exeDevice.exeType == EXE_GPU_DMA && (t.exeSubIndex != -1 || cfg.dma.useHsaCopy) && exeDevice.exeRank == localRank) {
#if !defined(__NVCC__)
      // Collect HSA agent information
      hsa_amd_pointer_info_t info;
@@ -2512,12 +3355,12 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
#endif
    }

    // Prepare subexecutor parameters (on all ranks)
    ERR_CHECK(PrepareSubExecParams(cfg, t, rss));
  }

  // Prepare additional requirements for GPU-based executors
  if ((exeDevice.exeType == EXE_GPU_GFX || exeDevice.exeType == EXE_GPU_DMA) && exeDevice.exeRank == localRank) {
    ERR_CHECK(hipSetDevice(exeDevice.exeIndex));

    // Determine how many streams to use
@@ -2551,7 +3394,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
  }

  // Prepare for GPU GFX executor
  if (exeDevice.exeType == EXE_GPU_GFX && exeDevice.exeRank == localRank) {
    // Allocate one contiguous chunk of GPU memory for threadblock parameters
    // This allows support for executing one transfer per stream, or all transfers in a single stream
#if !defined(__NVCC__)
@@ -2575,7 +3418,6 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
    if (cfg.gfx.useMultiStream || cfg.gfx.blockOrder == 0) {
      // Threadblocks are ordered sequentially one transfer at a time
      for (auto& rss : exeInfo.resources) {
        rss.subExecParamGpuPtr = exeInfo.subExecParamGpu + transferOffset;
        for (auto p : rss.subExecParamCpu) {
          rss.subExecIdx.push_back(exeInfo.subExecParamCpu.size());
@@ -2648,23 +3490,29 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
                             vector<Transfer> const& transfers,
                             ExeInfo& exeInfo)
{
  int const localRank = GetRank();
  // Loop over each transfer this executor is involved in
  for (auto& rss : exeInfo.resources) {
    Transfer const& t = transfers[rss.transferIdx];
    // Deallocate source memory
    for (int iSrc = 0; iSrc < t.srcs.size(); ++iSrc) {
      if (t.srcs[iSrc].memRank == localRank) {
        ERR_CHECK(DeallocateMemory(t.srcs[iSrc].memType, rss.srcMem[iSrc], t.numBytes + cfg.data.byteOffset));
      }
    }
    // Deallocate destination memory
    for (int iDst = 0; iDst < t.dsts.size(); ++iDst) {
      if (t.dsts[iDst].memRank == localRank) {
        ERR_CHECK(DeallocateMemory(t.dsts[iDst].memType, rss.dstMem[iDst], t.numBytes + cfg.data.byteOffset));
      }
    }
    // Destroy HSA signal for DMA executor
#if !defined(__NVCC__)
    if (exeDevice.exeType == EXE_GPU_DMA && (t.exeSubIndex != -1 || cfg.dma.useHsaCopy) && exeDevice.exeRank == localRank) {
      ERR_CHECK(hsa_signal_destroy(rss.signal));
    }
#endif
@@ -2672,13 +3520,13 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
    // Destroy NIC related resources
#ifdef NIC_EXEC_ENABLED
    if (IsNicExeType(exeDevice.exeType)) {
      ERR_CHECK(TeardownNicTransferResources(rss, t));
    }
#endif
  }

  // Teardown additional requirements for GPU-based executors
  if ((exeDevice.exeType == EXE_GPU_GFX || exeDevice.exeType == EXE_GPU_DMA) && exeDevice.exeRank == localRank) {
    for (auto stream : exeInfo.streams)
      ERR_CHECK(hipStreamDestroy(stream));
    if (cfg.gfx.useHipEvents || cfg.dma.useHipEvents) {
@@ -2689,7 +3537,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
    }
  }

  if (exeDevice.exeType == EXE_GPU_GFX && exeDevice.exeRank == localRank) {
#if !defined(__NVCC__)
    MemType memType = MEM_GPU;
#else
@@ -2806,15 +3654,17 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
                             int const exeIndex,
                             TransferResources& rss)
{
  // Loop over each of the queue pairs and post the send work requests
  ibv_send_wr* badWorkReq;
  for (int qpIndex = 0; qpIndex < rss.qpCount; qpIndex++) {
    size_t numChunks = rss.sendWorkRequests[qpIndex].size();
    for (size_t chunkIdx = 0; chunkIdx < numChunks; chunkIdx++) {
      int error = ibv_post_send(rss.srcIsExeNic ? rss.srcQueuePairs[qpIndex] : rss.dstQueuePairs[qpIndex],
                                &rss.sendWorkRequests[qpIndex][chunkIdx], &badWorkReq);
      if (error)
        return {ERR_FATAL, "Transfer %d: Error when calling ibv_post_send for QP %d chunk %lu of %lu (Error code %d = %s)\n",
                rss.transferIdx, qpIndex, chunkIdx, numChunks, error, strerror(error)};
    }
  }
  return ERR_NONE;
}
@@ -2855,11 +3705,11 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
    // Poll the completion queue until all queue pairs are complete
    // The order of completion doesn't matter because this completion queue is dedicated to this Transfer
    ibv_wc wc;
    int nc = ibv_poll_cq(rss.srcIsExeNic ? rss.srcCompQueue : rss.dstCompQueue, 1, &wc);
    if (nc > 0) {
      receivedQPs[i]++;
      if (wc.status != IBV_WC_SUCCESS) {
        return {ERR_FATAL, "Transfer %d: Received unsuccessful work completion [status code %d]", rss.transferIdx, wc.status};
      }
    } else if (nc < 0) {
      return {ERR_FATAL, "Transfer %d: Received negative work completion", rss.transferIdx};
@@ -3015,8 +3865,8 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
}

// Kernel for GFX execution
template <typename PACKED_FLOAT, int LAUNCH_BOUND, int UNROLL, int TEMPORAL_MODE>
__global__ void __launch_bounds__(LAUNCH_BOUND)
GpuReduceKernel(SubExecParam* params, int seType, int waveOrder, int numSubIterations)
{
  int64_t startCycle;
@@ -3032,7 +3882,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
  } else {
    // Warp-level: each warp is a subexecutor
    int warpIdx = threadIdx.x / warpSize;
    int warpsPerBlock = blockDim.x / warpSize;
    subExecIdx = blockIdx.y * warpsPerBlock + warpIdx;
  }
@@ -3062,7 +3912,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
  int32_t nWaves, waveIdx;
  if (seType == 0) {
    // Threadblock-level: all wavefronts in block work together
    nWaves  = blockDim.x / warpSize;  // Number of wavefronts within this threadblock
    waveIdx = threadIdx.x / warpSize; // Index of this wavefront within the threadblock
  } else {
    // Warp-level: each warp works independently
@@ -3193,48 +4043,39 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
  }
}

#define GPU_KERNEL_TEMPORAL_DECL(LAUNCH_BOUND, UNROLL, DWORD)    \
  {GpuReduceKernel<DWORD, LAUNCH_BOUND, UNROLL, TEMPORAL_NONE>,  \
   GpuReduceKernel<DWORD, LAUNCH_BOUND, UNROLL, TEMPORAL_LOAD>,  \
   GpuReduceKernel<DWORD, LAUNCH_BOUND, UNROLL, TEMPORAL_STORE>, \
   GpuReduceKernel<DWORD, LAUNCH_BOUND, UNROLL, TEMPORAL_BOTH>}

#define GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, UNROLL)        \
  {GPU_KERNEL_TEMPORAL_DECL(LAUNCH_BOUND, UNROLL, float),  \
   GPU_KERNEL_TEMPORAL_DECL(LAUNCH_BOUND, UNROLL, float2), \
   GPU_KERNEL_TEMPORAL_DECL(LAUNCH_BOUND, UNROLL, float4)}

#define GPU_KERNEL_UNROLL_DECL(LAUNCH_BOUND) \
  {GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 1),   \
   GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 2),   \
   GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 3),   \
   GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 4),   \
   GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 5),   \
   GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 6),   \
   GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 7),   \
   GPU_KERNEL_DWORD_DECL(LAUNCH_BOUND, 8)}

// Table of all GPU Reduction kernel functions (templated blocksize / unroll / dword size / temporal)
typedef void (*GpuKernelFuncPtr)(SubExecParam*, int, int, int);
#ifndef SINGLE_KERNEL
GpuKernelFuncPtr GpuKernelTable[4][MAX_UNROLL][3][4] =
{
  GPU_KERNEL_UNROLL_DECL(256),
  GPU_KERNEL_UNROLL_DECL(512),
  GPU_KERNEL_UNROLL_DECL(768),
  GPU_KERNEL_UNROLL_DECL(1024),
};
#endif

#undef GPU_KERNEL_UNROLL_DECL
#undef GPU_KERNEL_DWORD_DECL
#undef GPU_KERNEL_TEMPORAL_DECL
@@ -3259,7 +4100,11 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
  int wordSizeIdx = cfg.gfx.wordSize == 1 ? 0 :
                    cfg.gfx.wordSize == 2 ? 1 :
                                            2;
#ifdef SINGLE_KERNEL
auto gpuKernel = GpuReduceKernel<float4, 256, 1, 0>;
#else
auto gpuKernel = GpuKernelTable[(cfg.gfx.blockSize+255)/256 - 1][cfg.gfx.unrollFactor - 1][wordSizeIdx][cfg.gfx.temporalMode];
#endif
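  // Example (illustrative): with SINGLE_KERNEL undefined, blockSize = 512, unrollFactor = 4,
  // wordSize = 4 and temporalMode = TEMPORAL_NONE, this selects
  // GpuKernelTable[(512+255)/256 - 1][3][2][0], i.e. the 512 launch-bound, unroll-4,
  // float4, non-temporal kernel variant.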
#if defined(__NVCC__)
  if (startEvent != NULL)
@@ -3336,7 +4181,11 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
  int wordSizeIdx = cfg.gfx.wordSize == 1 ? 0 :
                    cfg.gfx.wordSize == 2 ? 1 :
                                            2;
#ifdef SINGLE_KERNEL
auto gpuKernel = GpuReduceKernel<float4, 256, 1, 0>;
#else
auto gpuKernel = GpuKernelTable[(cfg.gfx.blockSize+255)/256 - 1][cfg.gfx.unrollFactor - 1][wordSizeIdx][cfg.gfx.temporalMode];
#endif
#if defined(__NVCC__)
  if (cfg.gfx.useHipEvents)
@@ -3575,11 +4424,17 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
  auto& errResults = results.errResults;
  errResults.clear();

  // Check for valid configuration and quit if any rank has fatal error
  if (System::Get().Any(ConfigOptionsHaveErrors(cfg, errResults))) {
    System::Get().AllGatherErrors(errResults);
    return false;
  }

  // Check for valid transfers and quit if any rank has fatal error
  if (System::Get().Any(TransfersHaveErrors(cfg, transfers, errResults))) {
    System::Get().AllGatherErrors(errResults);
    return false;
  }

  // Collect up transfers by executor
  int minNumSrcs = MAX_SRCS + 1;
@@ -3589,7 +4444,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
  for (int i = 0; i < transfers.size(); i++) {
    Transfer const& t = transfers[i];
    ExeDevice exeDevice;
    ERR_APPEND(GetActualExecutor(t.exeDevice, exeDevice), errResults);
    TransferResources resource = {};
    resource.transferIdx = i;
@@ -3607,6 +4462,8 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
  // Loop over each executor and prepare
  // - Allocates memory for each Transfer
  // - Set up work for subexecutors
  int const localRank = GetRank();
  vector<ExeDevice> localExecutors;
  vector<TransferResources*> transferResources;
  for (auto& exeInfoPair : executorMap) {
    ExeDevice const& exeDevice = exeInfoPair.first;
@@ -3616,6 +4473,10 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
    for (auto& resource : exeInfo.resources) {
      transferResources.push_back(&resource);
    }
    // Track executors that are on this rank
    if (exeDevice.exeRank == localRank) {
      localExecutors.push_back(exeDevice);
    }
  }

  // Prepare reference src/dst arrays - only once for largest size
@@ -3637,21 +4498,24 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
  for (int numSrcs = 0; numSrcs < minNumSrcs; numSrcs++)
    dstReference[numSrcs].clear();

  // Initialize all src memory buffers (if on local rank)
  for (auto resource : transferResources) {
    Transfer const& t = transfers[resource->transferIdx];
    for (int srcIdx = 0; srcIdx < resource->srcMem.size(); srcIdx++) {
      if (t.srcs[srcIdx].memRank == localRank) {
        ERR_APPEND(hipMemcpy(resource->srcMem[srcIdx] + initOffset, srcReference[srcIdx].data(), resource->numBytes,
                             hipMemcpyDefault), errResults);
      }
    }
  }
}
  // Pause before starting when running in interactive mode
  if (cfg.general.useInteractive) {
    if (localRank == 0) {
      printf("Memory prepared:\n");
      for (int i = 0; i < transfers.size(); i++) {
        printf("Transfer %03d:\n", i);
        for (int iSrc = 0; iSrc < transfers[i].srcs.size(); ++iSrc)
          printf(" SRC %0d: %p\n", iSrc, transferResources[i]->srcMem[iSrc]);
@@ -3659,12 +4523,15 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
          printf(" DST %0d: %p\n", iDst, transferResources[i]->dstMem[iDst]);
      }
      printf("Hit <Enter> to continue: ");
      fflush(stdout);
      if (scanf("%*c") != 0) {
        printf("[ERROR] Unexpected input\n");
        exit(1);
      }
      printf("\n");
    }
    System::Get().Barrier();
  }
  // Perform iterations
  size_t numTimedIterations = 0;
@@ -3672,20 +4539,26 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
  for (int iteration = -cfg.general.numWarmups; ; iteration++) {
    // Stop if number of iterations/seconds has reached limit
    if (cfg.general.numIterations > 0 && iteration >= cfg.general.numIterations) break;
// NOTE: Time-based limit is based on first rank to avoid any skew issues
bool shouldStop = (cfg.general.numIterations < 0 && totalCpuTimeSec > -cfg.general.numIterations);
System::Get().Broadcast(0, sizeof(shouldStop), &shouldStop);
if (shouldStop) break;
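    // Example (illustrative): numIterations = -30 requests a time-based run; every rank
    // breaks together once rank 0 reports more than 30 seconds of accumulated CPU time.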
// Wait for all ranks before starting any timing
System::Get().Barrier();
    // Start CPU timing for this iteration
    auto cpuStart = std::chrono::high_resolution_clock::now();

    // Execute all Transfers in parallel
    std::vector<std::future<ErrResult>> asyncExecutors;
    for (auto const& exeDevice : localExecutors) {
      asyncExecutors.emplace_back(std::async(std::launch::async, RunExecutor,
                                             iteration,
                                             std::cref(cfg),
                                             std::cref(exeDevice),
                                             std::ref(executorMap[exeDevice])));
    }

    // Wait for all threads to finish
@@ -3693,6 +4566,9 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
      ERR_APPEND(asyncExecutor.get(), errResults);
    }
// Wait for all ranks to finish
System::Get().Barrier();
    // Stop CPU timing for this iteration
    auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
    double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() / cfg.general.numSubIterations;
@@ -3710,12 +4586,16 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
    // Pause for interactive mode
    if (cfg.general.useInteractive) {
      if (localRank == 0) {
        printf("Transfers complete. Hit <Enter> to continue: ");
        if (scanf("%*c") != 0) {
          printf("[ERROR] Unexpected input\n");
          exit(1);
        }
        printf("\n");
        fflush(stdout);
      }
      System::Get().Barrier();
    }
    // Validate results
@@ -3736,16 +4616,17 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
    ExeDevice const& exeDevice = exeInfoPair.first;
    ExeInfo& exeInfo = exeInfoPair.second;
    results.totalBytesTransferred += exeInfo.totalBytes;

    // Copy over executor results
    ExeResult exeResult;
    if (exeDevice.exeRank == localRank) {
      // Local executor collects results
      exeResult.numBytes = exeInfo.totalBytes;
      exeResult.avgDurationMsec = exeInfo.totalDurationMsec / numTimedIterations;
      exeResult.avgBandwidthGbPerSec = (exeResult.numBytes / 1.0e6) / exeResult.avgDurationMsec;
      exeResult.sumBandwidthGbPerSec = 0.0;
      exeResult.transferIdx.clear();

      // Copy over transfer results
      for (auto const& rss : exeInfo.resources) {
@@ -3769,6 +4650,17 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
        exeResult.sumBandwidthGbPerSec += tfrResult.avgBandwidthGbPerSec;
      }
    }
// Send executor and transfer result to all ranks
System::Get().BroadcastExeResult(exeDevice.exeRank, exeResult);
for (int const transferIdx : exeResult.transferIdx) {
System::Get().BroadcastTfrResult(exeDevice.exeRank, results.tfrResults[transferIdx]);
}
results.exeResults[exeDevice] = exeResult;
results.overheadMsec = std::min(results.overheadMsec, (results.avgTotalDurationMsec -
exeResult.avgDurationMsec));
}
  results.avgTotalBandwidthGbPerSec = (results.totalBytesTransferred / 1.0e6) / results.avgTotalDurationMsec;

  // Teardown executors
@@ -3778,6 +4670,11 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
    ERR_APPEND(TeardownExecutor(cfg, exeDevice, transfers, exeInfo), errResults);
  }
System::Get().AllGatherErrors(errResults);
for (auto const& err : errResults) {
if (err.errType == ERR_FATAL) return false;
}
  return true;
}
@@ -3800,6 +4697,330 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
  }
}
bool RecursiveWildcardTransferExpansion(WildcardTransfer& wc,
int const& baseRankIndex,
size_t const& numBytes,
int const& numSubExecs,
std::vector<Transfer>& transfers)
{
// Basic implementation idea:
  // - This recursive function proceeds through each Transfer characteristic that has multiple possible values,
// selects one, then proceeds.
// - At the "end", each characteristic will only have one option, which will then be used to specify the
// Transfer to be added to transfers
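  // Illustrative example (hypothetical wildcard): a Transfer description whose source
  // GPU index is the wildcard -1 on a 2-GPU rank is expanded by this routine into two
  // concrete Transfers, one reading from GPU 0 and one from GPU 1, with all other
  // characteristics held fixed.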
bool result = false;
// Resolve memory wildcards first
for (int isDst = 0; isDst <= 1; isDst++) {
for (int iMem = 0; iMem < wc.mem[isDst].size(); iMem++) {
// Resolve mem rank wildcards first
if (wc.mem[isDst][iMem].memRanks.size() == 0) {
// Replace empty rank with baseRankIndex
wc.mem[isDst][iMem].memRanks = {baseRankIndex};
RecursiveWildcardTransferExpansion(wc, baseRankIndex, numBytes, numSubExecs, transfers);
wc.mem[isDst][iMem].memRanks.clear();
return true;
} else if (wc.mem[isDst][iMem].memRanks.size() > 1) {
// Loop over each possible rank and recurse
std::vector<int> memRanks;
memRanks.swap(wc.mem[isDst][iMem].memRanks);
for (auto x : memRanks) {
wc.mem[isDst][iMem].memRanks = {x};
result |= RecursiveWildcardTransferExpansion(wc, baseRankIndex, numBytes, numSubExecs, transfers);
}
wc.mem[isDst][iMem].memRanks.swap(memRanks);
return result;
}
      // At this point, there should be only 1 (valid) rank assigned to this SRC/DST
if (wc.mem[isDst][iMem].memRanks.size() != 1 || wc.mem[isDst][iMem].memRanks[0] < 0) {
printf("[ERROR] Unexpected number of ranks / invalid number of ranks for %s %d\n", isDst ? "DST" : "SRC", iMem);
exit(1);
}
// Resolve mem index wildcards
// Mem devices should have at least one index
if (wc.mem[isDst][iMem].memIndices.size() == 0) {
printf("[ERROR] MemIndex for %s %d cannot be empty\n", isDst ? "DST" : "SRC", iMem);
exit(1);
}
// Loop over user provided list of device indices
if (wc.mem[isDst][iMem].memIndices.size() > 1) {
std::vector<int> memIndices;
memIndices.swap(wc.mem[isDst][iMem].memIndices);
for (auto x : memIndices) {
wc.mem[isDst][iMem].memIndices = {x};
result |= RecursiveWildcardTransferExpansion(wc, baseRankIndex, numBytes, numSubExecs, transfers);
}
wc.mem[isDst][iMem].memIndices.swap(memIndices);
return result;
} else if (wc.mem[isDst][iMem].memIndices.size() == 1 && wc.mem[isDst][iMem].memIndices[0] == -1) {
// Wildcard - loop over all possible device indices for this memory type
int numExecutors = GetNumExecutors(wc.mem[isDst][iMem].memType, wc.mem[isDst][iMem].memRanks[0]);
for (int x = 0; x < numExecutors; x++) {
wc.mem[isDst][iMem].memIndices[0] = x;
result |= RecursiveWildcardTransferExpansion(wc, baseRankIndex, numBytes, numSubExecs, transfers);
}
wc.mem[isDst][iMem].memIndices[0] = -1;
return result;
}
}
}
// Check for NIC wildcard (device index) first
if (wc.exe.exeType == EXE_NIC_NEAREST &&
wc.exe.exeRanks.size() == 0 &&
wc.exe.exeIndices.size() == 0 &&
wc.exe.exeSlots.size() == 0 &&
wc.exe.exeSubIndices.size() == 0 &&
wc.exe.exeSubSlots.size() == 0) {
// Find (first) closest NIC to the SRC memory location
std::vector<int> srcNicIndices;
if (IsCpuMemType(wc.mem[0][0].memType)) {
GetClosestNicsToCpu(srcNicIndices, wc.mem[0][0].memIndices[0], wc.mem[0][0].memRanks[0]);
} else {
GetClosestNicsToGpu(srcNicIndices, wc.mem[0][0].memIndices[0], wc.mem[0][0].memRanks[0]);
}
// Find (first) closest NIC to the DST memory location
std::vector<int> dstNicIndices;
if (IsCpuMemType(wc.mem[1][0].memType)) {
GetClosestNicsToCpu(dstNicIndices, wc.mem[1][0].memIndices[0], wc.mem[1][0].memRanks[0]);
} else {
GetClosestNicsToGpu(dstNicIndices, wc.mem[1][0].memIndices[0], wc.mem[1][0].memRanks[0]);
}
// If valid, fill in all wildcards
if (srcNicIndices.size() > 0 && dstNicIndices.size() > 0) {
wc.exe.exeRanks = {wc.mem[0][0].memRanks[0]};
wc.exe.exeIndices = {srcNicIndices[0]};
wc.exe.exeSlots = {0};
wc.exe.exeSubIndices = {dstNicIndices[0]};
wc.exe.exeSubSlots = {0};
result |= RecursiveWildcardTransferExpansion(wc, baseRankIndex, numBytes, numSubExecs, transfers);
wc.exe.exeRanks.clear();
wc.exe.exeIndices.clear();
wc.exe.exeSlots.clear();
wc.exe.exeSubIndices.clear();
wc.exe.exeSubSlots.clear();
return result;
} else {
return false;
}
}
// Resolve EXE rank
if (wc.exe.exeRanks.size() == 0) {
// No rank provided - Assign the current base rank index
wc.exe.exeRanks = {baseRankIndex};
RecursiveWildcardTransferExpansion(wc, baseRankIndex, numBytes, numSubExecs, transfers);
wc.exe.exeRanks.clear();
return true;
} else if (wc.exe.exeRanks.size() > 1) {
// Loop over user provided ranks
std::vector<int> exeRanks;
exeRanks.swap(wc.exe.exeRanks);
for (auto x : exeRanks) {
wc.exe.exeRanks = {x};
result |= RecursiveWildcardTransferExpansion(wc, baseRankIndex, numBytes, numSubExecs, transfers);
}
wc.exe.exeRanks.swap(exeRanks);
return result;
} else if (wc.exe.exeRanks[0] == -1) {
printf("[ERROR] Exe rank should not be -1\n");
exit(1);
}
// Resolve EXE indices
if (wc.exe.exeIndices.size() == 0) {
printf("[ERROR] Exe index should never be empty\n");
exit(1);
} else if (wc.exe.exeIndices.size() > 1) {
// Loop over user provided indices
std::vector<int> exeIndices;
exeIndices.swap(wc.exe.exeIndices);
for (auto x : exeIndices) {
wc.exe.exeIndices = {x};
result |= RecursiveWildcardTransferExpansion(wc, baseRankIndex, numBytes, numSubExecs, transfers);
}
wc.exe.exeIndices.swap(exeIndices);
return result;
} else if (wc.exe.exeIndices[0] == -1) {
// Wildcard - loop over all possible executor indices
int numExecutors = GetNumExecutors(wc.exe.exeType, wc.exe.exeRanks[0]);
for (int x = 0; x < numExecutors; x++) {
wc.exe.exeIndices[0] = x;
result |= RecursiveWildcardTransferExpansion(wc, baseRankIndex, numBytes, numSubExecs, transfers);
}
wc.exe.exeIndices[0] = -1;
return result;
}
  // Resolve EXE slots (only applies to EXE_NIC_NEAREST)
if (wc.exe.exeSlots.size() == 0) {
// Slot won't be used, so just assign 0
wc.exe.exeSlots = {0};
result |= RecursiveWildcardTransferExpansion(wc, baseRankIndex, numBytes, numSubExecs, transfers);
wc.exe.exeSlots.clear();
return result;
} else if (wc.exe.exeSlots.size() > 1) {
// Loop over user provided slots
std::vector<int> exeSlots;
exeSlots.swap(wc.exe.exeSlots);
for (auto x : exeSlots) {
wc.exe.exeSlots = {x};
result |= RecursiveWildcardTransferExpansion(wc, baseRankIndex, numBytes, numSubExecs, transfers);
}
wc.exe.exeSlots.swap(exeSlots);
return result;
} else if (wc.exe.exeSlots[0] == -1) {
// Wildcard - Loop over all possible slots, based on SRC memory type
std::vector<int> srcNicIndices;
if (IsCpuMemType(wc.mem[0][0].memType)) {
GetClosestNicsToCpu(srcNicIndices, wc.mem[0][0].memIndices[0], wc.mem[0][0].memRanks[0]);
} else {
GetClosestNicsToGpu(srcNicIndices, wc.mem[0][0].memIndices[0], wc.mem[0][0].memRanks[0]);
}
for (auto x : srcNicIndices) {
wc.exe.exeSlots = {x};
result |= RecursiveWildcardTransferExpansion(wc, baseRankIndex, numBytes, numSubExecs, transfers);
}
wc.exe.exeSlots = {-1};
return result;
}
// Resolve EXE subindex
if (wc.exe.exeSubIndices.size() == 0) {
if (IsCpuExeType(wc.exe.exeType) || IsGpuExeType(wc.exe.exeType)) {
wc.exe.exeSubIndices = {-1};
result |= RecursiveWildcardTransferExpansion(wc, baseRankIndex, numBytes, numSubExecs, transfers);
wc.exe.exeSubIndices.clear();
return result;
} else if (wc.exe.exeType == EXE_NIC) {
printf("[ERROR] NIC executor requires a subindex be specified\n");
exit(1);
} else if (wc.exe.exeType == EXE_NIC_NEAREST) {
// Assign NIC closest to DST mem
std::vector<int> dstNicIndices;
if (IsCpuMemType(wc.mem[1][0].memType)) {
GetClosestNicsToCpu(dstNicIndices, wc.mem[1][0].memIndices[0], wc.mem[1][0].memRanks[0]);
} else {
GetClosestNicsToGpu(dstNicIndices, wc.mem[1][0].memIndices[0], wc.mem[1][0].memRanks[0]);
}
if (dstNicIndices.size() > 0) {
wc.exe.exeSubIndices = {dstNicIndices[0]};
result |= RecursiveWildcardTransferExpansion(wc, baseRankIndex, numBytes, numSubExecs, transfers);
wc.exe.exeSubIndices.clear();
}
return result;
}
} else if (wc.exe.exeSubIndices.size() > 1) {
// Loop over all user provided subindices
std::vector<int> exeSubIndices;
exeSubIndices.swap(wc.exe.exeSubIndices);
for (auto x : exeSubIndices) {
wc.exe.exeSubIndices = {x};
result |= RecursiveWildcardTransferExpansion(wc, baseRankIndex, numBytes, numSubExecs, transfers);
}
wc.exe.exeSubIndices.swap(exeSubIndices);
return result;
} else if (wc.exe.exeSubIndices[0] == -2) {
switch (wc.exe.exeType) {
case EXE_CPU:
wc.exe.exeSubIndices[0] = -1;
result |= RecursiveWildcardTransferExpansion(wc, baseRankIndex, numBytes, numSubExecs, transfers);
wc.exe.exeSubIndices[0] = -2;
return result;
case EXE_GPU_GFX: case EXE_GPU_DMA:
{
// Iterate over all available subindices
ExeDevice exeDevice = {wc.exe.exeType, wc.exe.exeIndices[0], wc.exe.exeRanks[0], 0};
int numSubIndices = GetNumExecutorSubIndices(exeDevice);
for (int x = 0; x < numSubIndices; x++) {
wc.exe.exeSubIndices = {x};
result |= RecursiveWildcardTransferExpansion(wc, baseRankIndex, numBytes, numSubExecs, transfers);
}
wc.exe.exeSubIndices = {-1};
return result;
}
case EXE_NIC: case EXE_NIC_NEAREST:
{
        // Iterate over the total number of DST NICs
int numIndices = 0;
if (wc.exe.exeType == EXE_NIC) {
numIndices = GetNumExecutors(EXE_NIC, wc.mem[1][0].memRanks[0]);
} else {
numIndices = GetNumExecutors(EXE_GPU_GFX, wc.mem[1][0].memRanks[0]);
}
for (int x = 0; x < numIndices; x++) {
wc.exe.exeSubIndices = {x};
result |= RecursiveWildcardTransferExpansion(wc, baseRankIndex, numBytes, numSubExecs, transfers);
}
wc.exe.exeSubIndices = {-1};
return result;
}
}
return result;
}
  // Resolve EXE subslots (only applies to EXE_NIC_NEAREST)
if (wc.exe.exeSubSlots.size() == 0) {
// Subslot won't be used, so just assign 0
wc.exe.exeSubSlots = {0};
result |= RecursiveWildcardTransferExpansion(wc, baseRankIndex, numBytes, numSubExecs, transfers);
wc.exe.exeSubSlots.clear();
return result;
} else if (wc.exe.exeSubSlots.size() > 1) {
// Loop over user provided slots
std::vector<int> exeSubSlots;
exeSubSlots.swap(wc.exe.exeSubSlots);
for (auto x : exeSubSlots) {
wc.exe.exeSubSlots = {x};
result |= RecursiveWildcardTransferExpansion(wc, baseRankIndex, numBytes, numSubExecs, transfers);
}
wc.exe.exeSubSlots.swap(exeSubSlots);
return result;
} else if (wc.exe.exeSubSlots[0] == -1) {
// Wildcard - Loop over all possible slots, based on DST memory type
std::vector<int> dstNicIndices;
if (IsCpuMemType(wc.mem[1][0].memType)) {
GetClosestNicsToCpu(dstNicIndices, wc.mem[1][0].memIndices[0], wc.mem[1][0].memRanks[0]);
} else {
GetClosestNicsToGpu(dstNicIndices, wc.mem[1][0].memIndices[0], wc.mem[1][0].memRanks[0]);
}
for (auto x : dstNicIndices) {
wc.exe.exeSubSlots = {x};
result |= RecursiveWildcardTransferExpansion(wc, baseRankIndex, numBytes, numSubExecs, transfers);
}
wc.exe.exeSubSlots = {-1};
return result;
}
// Only reach here when each candidate has been narrowed down to 1 option
// Create Transfer and add to list
Transfer t;
t.numBytes = numBytes;
t.numSubExecs = numSubExecs;
for (int iSrc = 0; iSrc < wc.mem[0].size(); iSrc++)
t.srcs.push_back({wc.mem[0][iSrc].memType, wc.mem[0][iSrc].memIndices[0], wc.mem[0][iSrc].memRanks[0]});
for (int iDst = 0; iDst < wc.mem[1].size(); iDst++)
t.dsts.push_back({wc.mem[1][iDst].memType, wc.mem[1][iDst].memIndices[0], wc.mem[1][iDst].memRanks[0]});
t.exeDevice.exeType = wc.exe.exeType;
t.exeDevice.exeIndex = wc.exe.exeIndices[0];
t.exeDevice.exeRank = wc.exe.exeRanks[0];
t.exeDevice.exeSlot = wc.exe.exeSlots[0];
t.exeSubIndex = wc.exe.exeSubIndices[0];
t.exeSubSlot = wc.exe.exeSubSlots[0];
transfers.push_back(t);
return false;
}
ErrResult ParseTransfers(std::string line,
                         std::vector<Transfer>& transfers)
{
@@ -3809,7 +5030,8 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
  transfers.clear();

  // Read in the number of Transfer descriptions
  // NOTE: Transfer descriptions with wildcards get expanded to multiple Transfers
  int numTransfers = 0;
  std::istringstream iss(line);
  iss >> numTransfers;
@@ -3832,261 +5054,1130 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
  }

  for (int i = 0; i < numTransfers; i++) {
    size_t numBytes;
    if (!advancedMode) {
      iss >> srcStr >> exeStr >> dstStr;
      if (iss.fail()) {
        return {ERR_FATAL,
                "Parsing error: Unable to read valid Transfer %d (SRC EXE DST) triplet", i+1};
      }
      numBytes = 0;
    } else {
      iss >> srcStr >> exeStr >> dstStr >> numSubExecs >> numBytesToken;
      if (iss.fail()) {
        return {ERR_FATAL,
                "Parsing error: Unable to read valid Transfer %d (SRC EXE DST #CU #Bytes) tuple", i+1};
      }
      if (sscanf(numBytesToken.c_str(), "%lu", &numBytes) != 1) {
        return {ERR_FATAL,
                "Parsing error: Unable to read valid Transfer %d (SRC EXE DST #CU #Bytes) tuple", i+1};
      }
      // Apply optional K/M/G suffix (intentional fallthrough scales by 1024 per level)
      char units = numBytesToken.back();
      switch (toupper(units)) {
      case 'G': numBytes *= 1024; // fallthrough
      case 'M': numBytes *= 1024; // fallthrough
      case 'K': numBytes *= 1024;
      }
    }

    WildcardTransfer wct;
    ERR_CHECK(ParseMemType(srcStr, wct.mem[0]));
    ERR_CHECK(ParseMemType(dstStr, wct.mem[1]));
    ERR_CHECK(ParseExeType(exeStr, wct.exe));
// Perform wildcard expansion
int numRanks = GetNumRanks();
for (int localRankIndex = 0; localRankIndex < numRanks; localRankIndex++) {
bool localRankModified = RecursiveWildcardTransferExpansion(wct, localRankIndex, numBytes, numSubExecs, transfers);
if (!localRankModified) break;
} }
}
return ERR_NONE; return ERR_NONE;
} }
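
For reference, the unit-suffix handling above depends on deliberate switch fall-through: a trailing 'G' multiplies by 1024 three times, 'M' twice, 'K' once. A minimal standalone sketch of the same idea (the ParseBytes helper is hypothetical, not part of TransferBench):

#include <cctype>
#include <cstdio>
#include <string>

size_t ParseBytes(std::string const& token)
{
  size_t numBytes = 0;
  if (sscanf(token.c_str(), "%zu", &numBytes) != 1) return 0;
  switch (toupper(token.back())) {
  case 'G': numBytes *= 1024; // fall through
  case 'M': numBytes *= 1024; // fall through
  case 'K': numBytes *= 1024;
  }
  return numBytes;
}

int main()
{
  printf("%zu %zu\n", ParseBytes("256K"), ParseBytes("64M")); // 262144 67108864
  return 0;
}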
// System related
//========================================================================================
System::System() :
  rank(0), numRanks(1), commMode(COMM_NONE)
{
  verbose = getenv("TB_VERBOSE") ? atoi(getenv("TB_VERBOSE")) : 0;

  if (getenv("TB_PAUSE")) {
    printf("Pausing for debug attachment\n");
    volatile bool pause = true;
    while (pause);
  }

  // Priority 1: Socket communicator
  SetupSocketCommunicator();

  // Priority 2: MPI communicator
  if (commMode == COMM_NONE) {
    SetupMpiCommunicator();
  }

  if (verbose && commMode == COMM_NONE) {
    printf("[INFO] Running in single node mode\n");
  }

  // Collect topology and distribute across all ranks
  CollectTopology();
}
System::~System()
{
#ifdef MPI_COMM_ENABLED
  if (commMode == COMM_MPI) {
    if (mpiInit == true) {
      MPI_Finalize();
    }
  }
#endif
  if (commMode == COMM_SOCKET) {
    // Close all sockets
    for (auto& sock : sockets) {
      if (sock != -1) {
        close(sock);
        sock = -1;
      }
    }
    if (listenSocket != -1) {
      close(listenSocket);
      listenSocket = -1;
    }
  }
}
void System::SetupSocketCommunicator()
{
  char* rankStr       = getenv("TB_RANK");
  char* numRanksStr   = getenv("TB_NUM_RANKS");
  char* masterAddrStr = getenv("TB_MASTER_ADDR");
  char* masterPortStr = getenv("TB_MASTER_PORT");

  // Socket communicator requires rank / numRanks / masterAddr
  if (!rankStr || !numRanksStr || !masterAddrStr) {
    if (verbose) {
      printf("[INFO] SocketCommunicator skipped due to missing TB_RANK | TB_NUM_RANKS | TB_MASTER_ADDR\n");
    }
    return;
  }

  rank       = atoi(rankStr);
  numRanks   = atoi(numRanksStr);
  masterAddr = masterAddrStr;
  masterPort = masterPortStr ? atoi(masterPortStr) : 29500;

  if (rank < 0 || rank >= numRanks) {
    printf("[ERROR] Invalid rank index. Must be between 0 and %d (not %d)\n", numRanks - 1, rank);
    exit(1);
  }
  sockets.resize(numRanks, -1);

  // Rank 0 acts as server for others to connect to
  int opt = 1;
  if (rank == 0) {
    // Create listening socket
    listenSocket = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
    if (listenSocket == -1) {
      printf("[ERROR] Unable to create listener socket\n");
      exit(1);
    }

    // Allow address reuse
    setsockopt(listenSocket, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt));

    // Bind to port
    sockaddr_in serverAddr;
    memset(&serverAddr, 0, sizeof(serverAddr));
    serverAddr.sin_family      = AF_INET;
    serverAddr.sin_addr.s_addr = INADDR_ANY;
    serverAddr.sin_port        = htons(masterPort);

    if (bind(listenSocket, (sockaddr*)&serverAddr, sizeof(serverAddr)) == -1) {
      printf("[ERROR] Failed to bind listen socket\n");
      exit(1);
    }

    if (listen(listenSocket, numRanks) == -1) {
      printf("[ERROR] Failed to listen on socket\n");
      exit(1);
    }

    // Accept connections from other ranks
    printf("Waiting for connections from %d other ranks [listening on TB_MASTER_ADDR=%s TB_MASTER_PORT=%d]\n",
           numRanks-1, masterAddr.c_str(), masterPort);
    for (int i = 1; i < numRanks; i++) {
      sockaddr_in clientAddr;
      socklen_t clientAddrLen = sizeof(clientAddr);
      auto clientSocket = accept(listenSocket, (sockaddr*)&clientAddr, &clientAddrLen);
      if (clientSocket == -1) {
        printf("[ERROR] Failed to accept connection from rank %d\n", i);
        exit(1);
      }

      // Receive rank ID from client
      int clientRank;
      recv(clientSocket, (char*)&clientRank, sizeof(clientRank), 0);

      if (clientRank < 0 || clientRank >= numRanks) {
        close(clientSocket);
        printf("[ERROR] Invalid rank received: %d\n", clientRank);
        exit(1);
      }
      if (verbose) {
        printf("[INFO] Rank 0 accepted connection from rank %d\n", clientRank);
      }
      sockets[clientRank] = clientSocket;
    }
  } else {
    // All other ranks connect to rank 0
    int sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
    if (sock == -1) {
      printf("[ERROR] Failed to create socket\n");
      exit(1);
    }

    sockaddr_in serverAddr;
    memset(&serverAddr, 0, sizeof(serverAddr));
    serverAddr.sin_family = AF_INET;
    serverAddr.sin_port   = htons(masterPort);
    if (inet_pton(AF_INET, masterAddr.c_str(), &serverAddr.sin_addr) <= 0) {
      printf("[ERROR] Invalid master address: %s\n", masterAddr.c_str());
      exit(1);
    }

    // Retry connection with a fixed one-second delay between attempts
    if (verbose)
      printf("[INFO] Rank %d attempting to connect to %s:%d\n", rank, masterAddrStr, masterPort);
    int maxRetries = 50;
    for (int retry = 0; retry < maxRetries; retry++) {
      if (connect(sock, (sockaddr*)&serverAddr, sizeof(serverAddr)) == 0) {
        break;
      }
      if (retry == maxRetries - 1) {
        printf("[ERROR] Failed to connect to master after %d retries\n", maxRetries);
        exit(1);
      }
      sleep(1);
    }

    // Send local rank to the server
    send(sock, (char*)&rank, sizeof(rank), 0);
    sockets[0] = sock;
  }
  commMode = COMM_SOCKET;
}
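
The connect loop above retries once per second rather than using exponential backoff. A self-contained sketch of the same retry pattern, with placeholder address/port arguments standing in for TB_MASTER_ADDR / TB_MASTER_PORT:

#include <arpa/inet.h>
#include <cstdio>
#include <cstring>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

// Returns a connected socket, or -1 after maxRetries failed attempts
int ConnectWithRetry(char const* addr, int port, int maxRetries)
{
  sockaddr_in server;
  memset(&server, 0, sizeof(server));
  server.sin_family = AF_INET;
  server.sin_port   = htons(port);
  if (inet_pton(AF_INET, addr, &server.sin_addr) <= 0) return -1;

  for (int retry = 0; retry < maxRetries; retry++) {
    int sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
    if (sock == -1) return -1;
    if (connect(sock, (sockaddr*)&server, sizeof(server)) == 0) return sock;
    close(sock);  // recreate the socket for the next attempt
    sleep(1);     // fixed one-second delay between attempts
  }
  return -1;
}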
void System::SetupMpiCommunicator()
{
#ifdef MPI_COMM_ENABLED
  int flag;
  MPI_Initialized(&flag);
  if (!flag) {
    MPI_Init(NULL, NULL);
    mpiInit = true;
  }
  comm = MPI_COMM_WORLD;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &numRanks);
  if (numRanks > 1) {
    if (verbose) {
      printf("[INFO] Enabling MPI communicator (%d ranks found)\n", numRanks);
    }
    commMode = COMM_MPI;
  } else if (mpiInit) {
    // Drop out of MPI use for single node
    MPI_Finalize();
  }
#endif
}
void System::Barrier()
{
#ifdef MPI_COMM_ENABLED
  if (commMode == COMM_MPI) {
    MPI_Barrier(comm);
    return;
  }
#endif
  if (commMode == COMM_SOCKET) {
    char dummy = 0;
    // Simple barrier using rank 0 to coordinate
    if (rank == 0) {
      // Wait for notification from all ranks
      for (int peerRank = 1; peerRank < numRanks; peerRank++)
        RecvData(peerRank, 1, &dummy);
      // Release all ranks
      for (int peerRank = 1; peerRank < numRanks; peerRank++)
        SendData(peerRank, 1, &dummy);
    } else {
      // Send notification to root
      SendData(0, 1, &dummy);
      // Wait for release from root
      RecvData(0, 1, &dummy);
    }
  }
}
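
The socket barrier is a two-phase gather/release through rank 0: every peer sends one notify byte to root, then blocks until root's release byte arrives. A runnable sketch of the same protocol, with threads standing in for ranks and one socketpair per peer (all names here are illustrative):

#include <cstdio>
#include <sys/socket.h>
#include <thread>
#include <unistd.h>
#include <vector>

int main()
{
  int const numRanks = 4;
  std::vector<int> rootEnd(numRanks, -1), peerEnd(numRanks, -1);
  for (int r = 1; r < numRanks; r++) {
    int sv[2];
    socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
    rootEnd[r] = sv[0];
    peerEnd[r] = sv[1];
  }

  std::vector<std::thread> peers;
  for (int r = 1; r < numRanks; r++) {
    peers.emplace_back([&, r]() {
      char dummy = 0;
      send(peerEnd[r], &dummy, 1, 0);  // Phase 1: notify root
      recv(peerEnd[r], &dummy, 1, 0);  // Phase 2: wait for release
      printf("rank %d past barrier\n", r);
    });
  }

  char dummy = 0;
  for (int r = 1; r < numRanks; r++) recv(rootEnd[r], &dummy, 1, 0);  // gather
  for (int r = 1; r < numRanks; r++) send(rootEnd[r], &dummy, 1, 0);  // release

  for (auto& t : peers) t.join();
  for (int r = 1; r < numRanks; r++) { close(rootEnd[r]); close(peerEnd[r]); }
  return 0;
}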
void System::SendData(int dstRank, size_t const numBytes, const void* sendData) const
{
#ifdef MPI_COMM_ENABLED
  if (commMode == COMM_MPI) {
    MPI_Send(sendData, numBytes, MPI_BYTE, dstRank, 1234, comm);
    return;
  }
#endif
  if (commMode == COMM_SOCKET) {
    if (rank != 0 && dstRank != 0) {
      printf("[ERROR] Socket communicator is limited to sending from/to rank 0\n");
      exit(1);
    }
    auto sock = sockets[dstRank];

    // Send data
    size_t totalSent = 0;
    while (totalSent < numBytes) {
      auto sent = send(sock, (char*)sendData + totalSent, numBytes - totalSent, 0);
      if (sent == -1) {
        printf("[ERROR] Send failed (rank %d to rank %d)\n", rank, dstRank);
        exit(1);
      }
      totalSent += sent;
    }
  }
}
void System::RecvData(int srcRank, size_t const numBytes, void* recvData) const
{
#ifdef MPI_COMM_ENABLED
  if (commMode == COMM_MPI) {
    MPI_Status status;
    MPI_Recv(recvData, numBytes, MPI_BYTE, srcRank, 1234, comm, &status);
    return;
  }
#endif
  if (commMode == COMM_SOCKET) {
    if (rank != 0 && srcRank != 0) {
      printf("[ERROR] Socket communicator is limited to receiving from/at rank 0\n");
      exit(1);
    }
    auto sock = sockets[srcRank];
    size_t totalRecv = 0;
    while (totalRecv < numBytes) {
      auto recvd = recv(sock, (char*)recvData + totalRecv, numBytes - totalRecv, 0);
      if (recvd == -1 || recvd == 0) {
        printf("[ERROR] Recv failed (rank %d from rank %d)\n", rank, srcRank);
        perror("recv");
        exit(1);
      }
      totalRecv += recvd;
    }
  }
}
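
Both loops above handle partial transfers: send()/recv() may move fewer bytes than requested, so the offset advances until the full message is through, and recv() returning 0 is treated as a closed connection. A self-contained version of that pattern, exercised over a local socketpair:

#include <cstdio>
#include <sys/socket.h>
#include <sys/types.h>
#include <unistd.h>

static bool SendAll(int sock, void const* buf, size_t len)
{
  size_t total = 0;
  while (total < len) {
    ssize_t sent = send(sock, (char const*)buf + total, len - total, 0);
    if (sent <= 0) return false;
    total += sent;
  }
  return true;
}

static bool RecvAll(int sock, void* buf, size_t len)
{
  size_t total = 0;
  while (total < len) {
    ssize_t got = recv(sock, (char*)buf + total, len - total, 0);
    if (got <= 0) return false;  // 0 means the peer closed the connection
    total += got;
  }
  return true;
}

int main()
{
  int sv[2];
  if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) != 0) return 1;
  int value = 42, echoed = 0;
  SendAll(sv[0], &value, sizeof(value));
  RecvAll(sv[1], &echoed, sizeof(echoed));
  printf("received %d\n", echoed);
  close(sv[0]); close(sv[1]);
  return 0;
}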
void System::Broadcast(int root, size_t const numBytes, void* data) const
{
  if (numBytes == 0) return;
#ifdef MPI_COMM_ENABLED
  if (commMode == COMM_MPI) {
    int err = MPI_Bcast(data, numBytes, MPI_CHAR, root, comm);
    if (err != MPI_SUCCESS) {
      printf("[ERROR] MPI_Bcast failed with error code %d\n", err);
    }
    return;
  }
#endif
if (commMode == COMM_SOCKET) {
// Relay through rank 0 first
if (root != 0) {
if (rank == root) {
SendData(0, numBytes, data);
} else if (rank == 0) {
RecvData(root, numBytes, data);
}
}
if (rank == 0) {
for (int peer = 1; peer < numRanks; peer++) {
SendData(peer, numBytes, data);
}
} else {
RecvData(0, numBytes, data);
}
}
}
bool System::Any(bool const flag) const
{
bool result = false;
for (int i = 0; i < numRanks; i++) {
bool flagToSend = flag;
Broadcast(i, sizeof(flagToSend), &flagToSend);
result |= flagToSend;
if (result) break;
}
return result;
}
std::string System::GetCpuName() const
{
std::ifstream cpuInfo("/proc/cpuinfo");
std::string line;
if (cpuInfo.is_open()) {
while (std::getline(cpuInfo, line)) {
if (line.find("model name") != std::string::npos) {
size_t colonIdx = line.find(":");
if (colonIdx != std::string::npos) {
return line.substr(colonIdx + 2);
}
}
}
}
return "Unknown CPU";
}
void System::GetRankTopology(RankTopology& topo)
{
// Clear topology structure first
topo.numExecutors.clear();
topo.numExecutorSubIndices.clear();
topo.numSubExecutors.clear();
topo.closestCpuNumaToGpu.clear();
topo.closestCpuNumaToNic.clear();
topo.closestNicsToGpu.clear();
memset(topo.hostname, 0, sizeof(topo.hostname));
gethostname(topo.hostname, sizeof(topo.hostname) - 1);
char* firstDotPtr = std::strchr(topo.hostname, '.');
if (firstDotPtr) *firstDotPtr = 0;
// NOTE: Placeholder values
strcpy(topo.ppodId, "N/A");
topo.vpodId = -1;
// CPU Executor
int numCpus = numa_num_configured_nodes();
topo.numExecutors[EXE_CPU] = numCpus;
std::string cpuName = GetCpuName();
for (int exeIndex = 0; exeIndex < numCpus; exeIndex++) {
topo.numExecutorSubIndices[{EXE_CPU, exeIndex}] = 0;
topo.executorName[{EXE_CPU, exeIndex}] = cpuName;
}
for (int cpuCore = 0; cpuCore < numa_num_configured_cpus(); cpuCore++) {
topo.numSubExecutors[{EXE_CPU, numa_node_of_cpu(cpuCore)}]++;
}
if (verbose) {
for (int exeIndex = 0; exeIndex < numCpus; exeIndex++) {
printf("[INFO] Rank %03d: CPU [%02d/%02d] %03d cores (%s)\n", rank, exeIndex, numCpus,
topo.numSubExecutors[{EXE_CPU, exeIndex}],
topo.executorName[{EXE_CPU, exeIndex}].c_str());
}
}
// GPU Executor
int numGpus = 0;
hipError_t status = hipGetDeviceCount(&numGpus);
if (status != hipSuccess) numGpus = 0;
topo.numExecutors[EXE_GPU_GFX] = numGpus;
topo.numExecutors[EXE_GPU_DMA] = numGpus;
for (int exeIndex = 0; exeIndex < numGpus; exeIndex++) {
int numDeviceCUs = 0;
int numXccs = 0;
int numDmaEngines = 0;
int closestNuma = -1;
if (hipDeviceGetAttribute(&numDeviceCUs, hipDeviceAttributeMultiprocessorCount, exeIndex) != hipSuccess) {
numDeviceCUs = 0;
}
std::string gpuName = "Unknown GPU";
hipDeviceProp_t props;
if (hipGetDeviceProperties(&props, exeIndex) == hipSuccess) {
gpuName = props.name;
}
topo.executorName[{EXE_GPU_GFX, exeIndex}] = gpuName;
topo.executorName[{EXE_GPU_DMA, exeIndex}] = gpuName;
#if !defined(__NVCC__)
hsa_agent_t gpuAgent = gpuAgents[exeIndex];
if (hsa_agent_get_info(gpuAgent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_XCC, &numXccs) != HSA_STATUS_SUCCESS)
numXccs = 1;
int numEnginesA, numEnginesB;
if (hsa_agent_get_info(gpuAgent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_SDMA_ENG, &numEnginesA)
== HSA_STATUS_SUCCESS)
numDmaEngines += numEnginesA;
if (hsa_agent_get_info(gpuAgent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NUM_SDMA_XGMI_ENG, &numEnginesB)
== HSA_STATUS_SUCCESS)
numDmaEngines += numEnginesB;
hsa_agent_t closestCpuAgent;
if (hsa_agent_get_info(gpuAgent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_NEAREST_CPU, &closestCpuAgent)
== HSA_STATUS_SUCCESS) {
for (int cpuIndex = 0; cpuIndex < numCpus; cpuIndex++) {
hsa_agent_t cpuAgent = cpuAgents[cpuIndex];
if (cpuAgent.handle == closestCpuAgent.handle) {
closestNuma = cpuIndex;
break;
}
}
}
#endif
topo.numExecutorSubIndices[{EXE_GPU_GFX, exeIndex}] = numXccs;
topo.numExecutorSubIndices[{EXE_GPU_DMA, exeIndex}] = numDmaEngines;
topo.numSubExecutors[{EXE_GPU_GFX, exeIndex}] = numDeviceCUs;
topo.numSubExecutors[{EXE_GPU_DMA, exeIndex}] = 1;
topo.closestCpuNumaToGpu[exeIndex] = closestNuma;
topo.closestNicsToGpu[exeIndex] = {};
}
// NIC Executor
int numNics = 0;
#ifdef NIC_EXEC_ENABLED
numNics = GetIbvDeviceList().size();
for (int exeIndex = 0; exeIndex < numNics; exeIndex++) {
topo.closestCpuNumaToNic[exeIndex] = GetIbvDeviceList()[exeIndex].numaNode;
topo.executorName[{EXE_NIC, exeIndex}] = GetIbvDeviceList()[exeIndex].name;
topo.nicIsActive[exeIndex] = GetIbvDeviceList()[exeIndex].hasActivePort;
if (verbose) {
printf("[INFO] Rank %03d: NIC [%02d/%02d] on CPU NUMA %d\n", rank, exeIndex, numNics, topo.closestCpuNumaToNic[exeIndex]);
}
}
#endif
topo.numExecutors[EXE_NIC] = topo.numExecutors[EXE_NIC_NEAREST] = numNics;
for (int nicIndex = 0; nicIndex < numNics; nicIndex++) {
topo.numSubExecutors[{EXE_NIC, nicIndex}] = 0;
topo.numExecutorSubIndices[{EXE_NIC, nicIndex}] = 0;
}
for (int gpuIndex = 0; gpuIndex < numGpus; gpuIndex++) {
topo.numSubExecutors[{EXE_NIC_NEAREST, gpuIndex}] = 0;
topo.numExecutorSubIndices[{EXE_NIC_NEAREST, gpuIndex}] = 0;
}
// Figure out closest NICs to GPUs
#ifdef NIC_EXEC_ENABLED
// Build up list of NIC bus addresses
std::vector<std::string> ibvAddressList;
auto const& ibvDeviceList = GetIbvDeviceList();
for (auto const& ibvDevice : ibvDeviceList)
ibvAddressList.push_back(ibvDevice.hasActivePort ? ibvDevice.busId : "");
// Track how many times a device has been assigned as "closest"
// This allows distributed work across devices using multiple ports (sharing the same busID)
// NOTE: This isn't necessarily optimal, but likely to work in most cases involving multi-port
// Counter example:
//
// G0 prefers (N0,N1), picks N0
// G1 prefers (N1,N2), picks N1
// G2 prefers N0, picks N0
//
// instead of G0->N1, G1->N2, G2->N0
std::vector<int> assignedCount(ibvDeviceList.size(), 0);
// Loop over each GPU to find the closest NIC(s) based on PCIe address
for (int gpuIndex = 0; gpuIndex < numGpus; gpuIndex++) {
// Collect PCIe address for the GPU
char hipPciBusId[64];
hipError_t err = hipDeviceGetPCIBusId(hipPciBusId, sizeof(hipPciBusId), gpuIndex);
if (err != hipSuccess) {
#ifdef VERBS_DEBUG
printf("Failed to get PCI Bus ID for HIP device %d: %s\n", gpuIndex, hipGetErrorString(err));
#endif
continue;
}
// Find closest NICs
std::set<int> closestNicIdxs = GetNearestDevicesInTree(hipPciBusId, ibvAddressList);
// Pick the least-used NIC to assign as closest
int closestIdx = -1;
for (auto idx : closestNicIdxs) {
if (closestIdx == -1 || assignedCount[idx] < assignedCount[closestIdx])
closestIdx = idx;
}
// The following will only use distance between bus IDs
// to determine the closest NIC to GPU if the PCIe tree approach fails
if (closestIdx < 0) {
#ifdef VERBS_DEBUG
printf("[WARN] Falling back to PCIe bus ID distance to determine proximity\n");
#endif
int minDistance = std::numeric_limits<int>::max();
for (int nicIndex = 0; nicIndex < numNics; nicIndex++) {
if (ibvDeviceList[nicIndex].busId != "") {
int distance = GetBusIdDistance(hipPciBusId, ibvDeviceList[nicIndex].busId);
if (distance < minDistance && distance >= 0) {
minDistance = distance;
closestIdx = nicIndex;
}
}
}
}
if (closestIdx != -1) {
topo.closestNicsToGpu[gpuIndex].push_back(closestIdx);
assignedCount[closestIdx]++;
}
}
#endif
  if (verbose) {
    for (int exeIndex = 0; exeIndex < numGpus; exeIndex++) {
      printf("[INFO] Rank %03d: GPU [%02d/%02d] %d XCCs %03d CUs on CPU NUMA %d Closest NICs:", rank, exeIndex, numGpus,
             topo.numExecutorSubIndices[{EXE_GPU_GFX, exeIndex}],
             topo.numSubExecutors[{EXE_GPU_GFX, exeIndex}],
             topo.closestCpuNumaToGpu[exeIndex]);
      if (topo.closestNicsToGpu[exeIndex].size() == 0) {
        printf(" none");
      } else {
        for (auto nicIndex : topo.closestNicsToGpu[exeIndex]) {
          printf(" %d", nicIndex);
        }
      }
      printf("\n");
    }
  }
}
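
The "least-used candidate" heuristic above (and the counter-example in its comment) boils down to a small greedy loop. A standalone sketch with hard-coded candidate sets mirroring the G0/G1/G2 example from the comment:

#include <cstdio>
#include <set>
#include <vector>

int main()
{
  // Per-GPU candidate NICs: G0 prefers {N0,N1}, G1 prefers {N1,N2}, G2 prefers {N0}
  std::vector<std::set<int>> candidates = {{0, 1}, {1, 2}, {0}};
  std::vector<int> assignedCount(3, 0);
  for (size_t gpu = 0; gpu < candidates.size(); gpu++) {
    int pick = -1;
    // Among candidates, take the NIC assigned fewest times so far
    for (int nic : candidates[gpu])
      if (pick == -1 || assignedCount[nic] < assignedCount[pick]) pick = nic;
    if (pick != -1) assignedCount[pick]++;
    printf("GPU %zu -> NIC %d\n", gpu, pick);  // yields G0->N0, G1->N1, G2->N0
  }
  return 0;
}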
template <typename KeyType, typename ValType>
void System::SendMap(int peerRank, std::map<KeyType, std::vector<ValType>> const& mapToSend) const
{
size_t mapSize = mapToSend.size();
SendData(peerRank, sizeof(mapSize), &mapSize);
for (auto const& p : mapToSend) {
SendData(peerRank, sizeof(p.first), &p.first);
size_t vectorSize = p.second.size();
SendData(peerRank, sizeof(vectorSize), &vectorSize);
for (auto const& v : p.second) {
SendData(peerRank, sizeof(v), &v);
}
}
fflush(stdout);
}
template <typename KeyType, typename ValType>
void System::SendMap(int peerRank, std::map<KeyType, ValType> const& mapToSend) const
{
size_t mapSize = mapToSend.size();
SendData(peerRank, sizeof(mapSize), &mapSize);
for (auto const& p : mapToSend) {
SendData(peerRank, sizeof(p), &p);
}
}
template <typename KeyType>
void System::SendMap(int peerRank, std::map<KeyType, std::string> const& mapToSend) const
{
size_t mapSize = mapToSend.size();
SendData(peerRank, sizeof(mapSize), &mapSize);
for (auto const& p : mapToSend) {
size_t strlen = p.second.size();
SendData(peerRank, sizeof(p.first), &p.first);
SendData(peerRank, sizeof(strlen), &strlen);
if (strlen) SendData(peerRank, strlen, p.second.data());
}
}
template <typename KeyType, typename ValType>
void System::RecvMap(int peerRank, std::map<KeyType, std::vector<ValType>>& mapToRecv) const
{
mapToRecv.clear();
size_t mapSize;
RecvData(peerRank, sizeof(mapSize), &mapSize);
for (size_t i = 0; i < mapSize; i++) {
KeyType key;
size_t vectorSize;
std::vector<ValType> values;
RecvData(peerRank, sizeof(key), &key);
RecvData(peerRank, sizeof(vectorSize), &vectorSize);
if (vectorSize) {
values.resize(vectorSize);
for (size_t j = 0; j < vectorSize; j++) {
RecvData(peerRank, sizeof(ValType), &values[j]);
}
}
mapToRecv[key] = values;
}
}
template <typename KeyType>
void System::RecvMap(int peerRank, std::map<KeyType, std::string>& mapToRecv) const
{
mapToRecv.clear();
size_t mapSize;
RecvData(peerRank, sizeof(mapSize), &mapSize);
for (size_t i = 0; i < mapSize; i++) {
KeyType key;
size_t strlen;
std::string value;
RecvData(peerRank, sizeof(key), &key);
RecvData(peerRank, sizeof(size_t), &strlen);
if (strlen) {
value.resize(strlen);
RecvData(peerRank, strlen, value.data());
}
mapToRecv[key] = value;
}
}
template <typename KeyType, typename ValType>
void System::RecvMap(int peerRank, std::map<KeyType, ValType>& mapToRecv) const
{
mapToRecv.clear();
size_t mapSize;
RecvData(peerRank, sizeof(mapSize), &mapSize);
for (size_t i = 0; i < mapSize; i++) {
std::pair<KeyType, ValType> p;
RecvData(peerRank, sizeof(p), &p);
mapToRecv[p.first] = p.second;
}
}
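
The SendMap/RecvMap helpers define a simple length-prefixed wire format: the map size first, then per entry the key, a length, and the payload bytes. A standalone round-trip of that format through a byte buffer (Pack/Unpack are illustrative helpers, not TransferBench functions):

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <map>
#include <string>
#include <vector>

static void Pack(std::vector<char>& buf, void const* src, size_t len)
{
  buf.insert(buf.end(), (char const*)src, (char const*)src + len);
}

static void Unpack(std::vector<char> const& buf, size_t& pos, void* dst, size_t len)
{
  memcpy(dst, buf.data() + pos, len);
  pos += len;
}

int main()
{
  std::map<int, std::string> src = {{0, "mlx5_0"}, {1, "mlx5_1"}}, dst;

  // Serialize: map size, then (key, string length, string bytes) per entry
  std::vector<char> buf;
  size_t mapSize = src.size();
  Pack(buf, &mapSize, sizeof(mapSize));
  for (auto const& p : src) {
    size_t strLen = p.second.size();
    Pack(buf, &p.first, sizeof(p.first));
    Pack(buf, &strLen, sizeof(strLen));
    Pack(buf, p.second.data(), strLen);
  }

  // Deserialize in the same order
  size_t pos = 0, count;
  Unpack(buf, pos, &count, sizeof(count));
  for (size_t i = 0; i < count; i++) {
    int key; size_t strLen;
    Unpack(buf, pos, &key, sizeof(key));
    Unpack(buf, pos, &strLen, sizeof(strLen));
    std::string val(strLen, '\0');
    Unpack(buf, pos, val.data(), strLen);
    dst[key] = val;
  }
  printf("dst[1] = %s\n", dst[1].c_str());
  return 0;
}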
void System::SendRankTopo(int peerRank, RankTopology const& topo) const
{
SendData(peerRank, sizeof(topo.hostname), topo.hostname);
SendData(peerRank, sizeof(topo.ppodId), &topo.ppodId);
SendData(peerRank, sizeof(topo.vpodId), &topo.vpodId);
SendMap(peerRank, topo.numExecutors);
SendMap(peerRank, topo.numExecutorSubIndices);
SendMap(peerRank, topo.numSubExecutors);
SendMap(peerRank, topo.closestCpuNumaToGpu);
SendMap(peerRank, topo.closestCpuNumaToNic);
SendMap(peerRank, topo.nicIsActive);
SendMap(peerRank, topo.closestNicsToGpu);
SendMap(peerRank, topo.executorName);
}
void System::RecvRankTopo(int peerRank, RankTopology& topo) const
{
RecvData(peerRank, sizeof(topo.hostname), topo.hostname);
RecvData(peerRank, sizeof(topo.ppodId), &topo.ppodId);
RecvData(peerRank, sizeof(topo.vpodId), &topo.vpodId);
RecvMap(peerRank, topo.numExecutors);
RecvMap(peerRank, topo.numExecutorSubIndices);
RecvMap(peerRank, topo.numSubExecutors);
RecvMap(peerRank, topo.closestCpuNumaToGpu);
RecvMap(peerRank, topo.closestCpuNumaToNic);
RecvMap(peerRank, topo.nicIsActive);
RecvMap(peerRank, topo.closestNicsToGpu);
RecvMap(peerRank, topo.executorName);
}
template <typename T>
void System::BroadcastVector(int root, vector<T>& data) const
{
// This assumes T is trivially copyable
static_assert(std::is_trivially_copyable<T>::value);
size_t len = data.size();
Broadcast(root, sizeof(len), &len);
data.resize(len);
if (len) {
Broadcast(root, sizeof(T) * len, data.data());
}
}
void System::BroadcastString(int root, std::string& string) const
{
size_t len = string.size();
Broadcast(root, sizeof(len), &len);
string.resize(len);
if (len) {
Broadcast(root, len, string.data());
}
}
void System::BroadcastExeResult(int root, ExeResult& exeResult) const
{
#define BROADCAST(X) Broadcast(root, sizeof(X), &X)
BROADCAST(exeResult.numBytes);
BROADCAST(exeResult.avgDurationMsec);
BROADCAST(exeResult.avgBandwidthGbPerSec);
BROADCAST(exeResult.sumBandwidthGbPerSec);
BroadcastVector(root, exeResult.transferIdx);
#undef BROADCAST
}
void System::BroadcastTfrResult(int root, TransferResult& tfrResult) const
{
#define BROADCAST(X) Broadcast(root, sizeof(X), &X)
BROADCAST(tfrResult.numBytes);
BROADCAST(tfrResult.avgDurationMsec);
BROADCAST(tfrResult.avgBandwidthGbPerSec);
BroadcastVector(root, tfrResult.perIterMsec);
BROADCAST(tfrResult.exeDevice);
BROADCAST(tfrResult.exeDstDevice);
// Per-Iteration CU results need to be handled in a custom manner
size_t perIterCuSize = tfrResult.perIterCUs.size();
BROADCAST(perIterCuSize);
if (perIterCuSize > 0) {
tfrResult.perIterCUs.resize(perIterCuSize);
for (size_t i = 0; i < perIterCuSize; i++) {
size_t setSize;
//vector<set<pair<int,int>>> perIterCUs; ///< GFX-Executor only. XCC:CU used per iteration
if (GetRank() == root) {
setSize = tfrResult.perIterCUs[i].size();
BROADCAST(setSize);
if (setSize > 0) {
for (pair<int,int> const& x : tfrResult.perIterCUs[i]) {
pair<int, int> p = x;
BROADCAST(p);
}
}
      } else {
        BROADCAST(setSize);
        tfrResult.perIterCUs[i].clear();
        // Receive one broadcast pair per element of the root's set
        for (size_t j = 0; j < setSize; j++) {
          pair<int, int> p;
          BROADCAST(p);
          tfrResult.perIterCUs[i].insert(p);
        }
      }
}
} else {
tfrResult.perIterCUs.clear();
}
#undef BROADCAST
}
void System::AllGatherErrors(vector<ErrResult>& errResults) const
{
if (commMode == COMM_NONE) return;
vector<ErrResult> tempResults = std::move(errResults);
for (int i = 0; i < numRanks; i++) {
size_t errListSize = tempResults.size();
Broadcast(i, sizeof(errListSize), &errListSize);
for (size_t j = 0; j < errListSize; j++) {
ErrResult errResult;
if (rank == i) errResult = tempResults[j];
Broadcast(i, sizeof(errResult.errType), &errResult.errType);
BroadcastString(i, errResult.errMsg);
errResult.errMsg += " (Rank " + std::to_string(i) + ")";
errResults.push_back(errResult);
}
}
}
#if !defined(__NVCC__)
// Get the hsa_agent_t associated with a ExeDevice
ErrResult System::GetHsaAgent(ExeDevice const& exeDevice, hsa_agent_t& agent) const
{
int numCpus = static_cast<int>(cpuAgents.size());
int numGpus = static_cast<int>(gpuAgents.size());
int exeIndex = exeDevice.exeIndex;
switch (exeDevice.exeType) {
case EXE_CPU:
if (exeIndex < 0 || exeIndex >= numCpus)
return {ERR_FATAL, "CPU index must be between 0 and %d inclusively", numCpus - 1};
agent = cpuAgents[exeDevice.exeIndex];
break;
case EXE_GPU_GFX: case EXE_GPU_DMA:
if (exeIndex < 0 || exeIndex >= numGpus)
return {ERR_FATAL, "GPU index must be between 0 and %d inclusively", numGpus - 1};
agent = gpuAgents[exeIndex];
break;
default:
return {ERR_FATAL,
"Attempting to get HSA agent of unknown or unsupported executor type (%d)",
exeDevice.exeType};
}
return ERR_NONE;
}
// Get the hsa_agent_t associated with a MemDevice
ErrResult System::GetHsaAgent(MemDevice const& memDevice, hsa_agent_t& agent) const
{
if (memDevice.memType == MEM_CPU_CLOSEST)
return GetHsaAgent({EXE_CPU, GetClosestCpuNumaToGpu(memDevice.memIndex)}, agent);
if (IsCpuMemType(memDevice.memType)) return GetHsaAgent({EXE_CPU, memDevice.memIndex}, agent);
if (IsGpuMemType(memDevice.memType)) return GetHsaAgent({EXE_GPU_GFX, memDevice.memIndex}, agent);
return {ERR_FATAL,
"Unable to get HSA agent for memDevice (%d,%d)",
memDevice.memType, memDevice.memIndex};
}
#endif
void System::CollectTopology()
{
// Cache the HSA agents for each device
#if !defined(__NVCC__)
{
hsa_amd_pointer_info_t info;
info.size = sizeof(info);
ErrResult err;
int32_t* tempBuffer;
// Index CPU agents
cpuAgents.clear();
int numCpus = numa_num_configured_nodes();
for (int i = 0; i < numCpus; i++) {
AllocateMemory({MEM_CPU, i}, 1024, (void**)&tempBuffer);
hsa_amd_pointer_info(tempBuffer, &info, NULL, NULL, NULL);
cpuAgents.push_back(info.agentOwner);
DeallocateMemory(MEM_CPU, tempBuffer, 1024);
}
// Index GPU agents
int numGpus = 0;
hipError_t status = hipGetDeviceCount(&numGpus);
if (status != hipSuccess) numGpus = 0;
gpuAgents.clear();
for (int i = 0; i < numGpus; i++) {
AllocateMemory({MEM_GPU, i}, 1024, (void**)&tempBuffer);
hsa_amd_pointer_info(tempBuffer, &info, NULL, NULL, NULL);
gpuAgents.push_back(info.agentOwner);
DeallocateMemory(MEM_GPU, tempBuffer, 1024);
}
}
#endif
// Collect the topology of the local node
RankTopology localTopo;
GetRankTopology(localTopo);
// Distribute amongst all ranks
rankInfo.resize(numRanks);
if (rank == 0) {
// Receive topology info from each rank
rankInfo[0] = localTopo;
for (int peerRank = 1; peerRank < numRanks; peerRank++) {
if (verbose) {
printf("[INFO] Rank 0 receives topology from Rank %d\n", peerRank);
}
RecvRankTopo(peerRank, rankInfo[peerRank]);
}
// Send out full set of info to each rank
for (int peerRank = 1; peerRank < numRanks; peerRank++) {
for (int i = 0; i < numRanks; i++) {
if (verbose) {
printf("[INFO] Rank 0 sends topology %d to Rank %d\n", i, peerRank);
}
SendRankTopo(peerRank, rankInfo[i]);
}
}
} else {
// Send local topology info back to root
if (verbose) {
printf("[INF0] Rank %d sends topology from Rank 0\n", rank);
}
SendRankTopo(0, localTopo);
for (int i = 0; i < numRanks; i++) {
RecvRankTopo(0, rankInfo[i]);
if (verbose) {
printf("[INF0] Rank %d receives topology %d from Rank 0\n", rank, i);
}
}
}
}
int System::GetNumExecutors(ExeType exeType, int targetRank) const
{
if (targetRank < 0 || targetRank >= numRanks) targetRank = rank;
if (rankInfo[targetRank].numExecutors.count(exeType) == 0) return 0;
return rankInfo[targetRank].numExecutors.at(exeType);
}
int System::GetNumExecutorSubIndices(ExeDevice exeDevice) const
{
int targetRank = exeDevice.exeRank;
if (targetRank < 0 || targetRank >= numRanks) targetRank = rank;
if (rankInfo[targetRank].numExecutorSubIndices.count({exeDevice.exeType, exeDevice.exeIndex}) == 0)
return 0;
return rankInfo[targetRank].numExecutorSubIndices.at({exeDevice.exeType, exeDevice.exeIndex});
}
int System::GetNumSubExecutors(ExeDevice exeDevice) const
{
int targetRank = exeDevice.exeRank;
if (targetRank < 0 || targetRank >= numRanks) targetRank = rank;
if (rankInfo[targetRank].numSubExecutors.count({exeDevice.exeType, exeDevice.exeIndex}) == 0)
return 0;
return rankInfo[targetRank].numSubExecutors.at({exeDevice.exeType, exeDevice.exeIndex});
}
int System::GetClosestCpuNumaToGpu(int gpuIndex, int targetRank) const
{
if (targetRank < 0 || targetRank >= numRanks) targetRank = rank;
if (gpuIndex < 0 || gpuIndex >= GetNumExecutors(EXE_GPU_GFX, targetRank)) return 0;
return rankInfo[targetRank].closestCpuNumaToGpu.at(gpuIndex);
}
int System::GetClosestCpuNumaToNic(int nicIndex, int targetRank) const
{
if (targetRank < 0 || targetRank >= numRanks) targetRank = rank;
if (nicIndex < 0 || nicIndex >= GetNumExecutors(EXE_NIC, targetRank)) return 0;
return rankInfo[targetRank].closestCpuNumaToNic.at(nicIndex);
}
void System::GetClosestNicsToGpu(std::vector<int>& nicIndices, int gpuIndex, int targetRank) const
{
nicIndices.clear();
if (targetRank < 0 || targetRank >= numRanks) targetRank = rank;
if (gpuIndex < 0 || gpuIndex >= GetNumExecutors(EXE_GPU_GFX, targetRank)) return;
nicIndices = rankInfo[targetRank].closestNicsToGpu.at(gpuIndex);
}
std::string System::GetHostname(int targetRank) const
{
if (targetRank < 0 || targetRank >= numRanks) targetRank = rank;
return rankInfo[targetRank].hostname;
}
std::string System::GetPpodId(int targetRank) const
{
if (targetRank < 0 || targetRank >= numRanks) targetRank = rank;
return rankInfo[targetRank].ppodId;
}
int System::GetVpodId(int targetRank) const
{
if (targetRank < 0 || targetRank >= numRanks) targetRank = rank;
return rankInfo[targetRank].vpodId;
}
std::string System::GetExecutorName(ExeDevice exeDevice) const
{
int targetRank = exeDevice.exeRank;
if (targetRank < 0 || targetRank >= numRanks) targetRank = rank;
if (rankInfo[targetRank].executorName.count({exeDevice.exeType, exeDevice.exeIndex}) == 0)
return "Unknown device";
return rankInfo[targetRank].executorName.at({exeDevice.exeType, exeDevice.exeIndex});
}
int System::NicIsActive(int nicIndex, int targetRank) const
{
if (targetRank < 0 || targetRank >= numRanks) targetRank = rank;
if (rankInfo[targetRank].nicIsActive.count(nicIndex) == 0) return 0;
return rankInfo[targetRank].nicIsActive.at(nicIndex);
}
int GetNumExecutors(ExeType exeType, int targetRank)
{
return System::Get().GetNumExecutors(exeType, targetRank);
}
int GetNumExecutors(MemType memType, int targetRank)
{
if (IsCpuMemType(memType)) return GetNumExecutors(EXE_CPU, targetRank);
if (IsGpuMemType(memType)) return GetNumExecutors(EXE_GPU_GFX, targetRank);
return 0;
}
int GetNumSubExecutors(ExeDevice exeDevice)
{
return System::Get().GetNumSubExecutors(exeDevice);
}
int GetNumExecutorSubIndices(ExeDevice exeDevice)
{
return System::Get().GetNumExecutorSubIndices(exeDevice);
}
int GetClosestCpuNumaToGpu(int gpuIndex, int targetRank)
{
return System::Get().GetClosestCpuNumaToGpu(gpuIndex, targetRank);
}
int GetClosestCpuNumaToNic(int nicIndex, int targetRank)
{
return System::Get().GetClosestCpuNumaToNic(nicIndex, targetRank);
}
int GetClosestNicToGpu(int gpuIndex, int targetRank)
{
std::vector<int> nicIndices;
System::Get().GetClosestNicsToGpu(nicIndices, gpuIndex, targetRank);
if (nicIndices.size() == 0) return -1;
return nicIndices[0];
}
void GetClosestNicsToGpu(std::vector<int>& nicIndices, int gpuIndex, int targetRank)
{
System::Get().GetClosestNicsToGpu(nicIndices, gpuIndex, targetRank);
}
void GetClosestNicsToCpu(std::vector<int>& nicIndices, int cpuIndex, int targetRank)
{
int numNics = GetNumExecutors(EXE_NIC, targetRank);
nicIndices.clear();
for (int nicIndex = 0; nicIndex < numNics; nicIndex++) {
if (GetClosestCpuNumaToNic(nicIndex, targetRank) == cpuIndex) {
nicIndices.push_back(nicIndex);
}
}
}
int GetRank()
{
return System::Get().GetRank();
}
int GetNumRanks()
{
return System::Get().GetNumRanks();
}
int GetCommMode()
{
return System::Get().GetCommMode();
}
std::string GetHostname(int targetRank)
{
return System::Get().GetHostname(targetRank);
}
std::string GetPpodId(int targetRank)
{
return System::Get().GetPpodId(targetRank);
}
int GetVpodId(int targetRank)
{
return System::Get().GetVpodId(targetRank);
}
std::string GetExecutorName(ExeDevice exeDevice)
{
return System::Get().GetExecutorName(exeDevice);
}
int NicIsActive(int nicIndex, int targetRank)
{
return System::Get().NicIsActive(nicIndex, targetRank);
}
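
Assuming the public header exposes these free-function wrappers in the TransferBench namespace (as the client code in this PR uses them), a per-rank topology query might look like the following sketch; the header name and the unqualified EXE_* enumerators are assumptions:

#include <cstdio>
#include "TransferBench.hpp"  // assumed header name

int main()
{
  using namespace TransferBench;
  for (int r = 0; r < GetNumRanks(); r++) {
    printf("Rank %d (%s): %d CPU NUMA nodes, %d GPUs, %d NICs\n", r,
           GetHostname(r).c_str(),
           GetNumExecutors(EXE_CPU, r),
           GetNumExecutors(EXE_GPU_GFX, r),
           GetNumExecutors(EXE_NIC, r));
  }
  return 0;
}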
// Undefine CUDA compatibility macros