Unverified Commit f5e9cf34 authored by gilbertlee-amd's avatar gilbertlee-amd Committed by GitHub
Browse files

v1.41 Adding schmoo benchmark, fixing timing reports for variable-iteration modes (#73)

* v1.41 Adding schmoo benchmark, fixing timing reports for variable-iteration modes
parent 437b6e70
...@@ -3,6 +3,15 @@ ...@@ -3,6 +3,15 @@
Documentation for TransferBench is available at Documentation for TransferBench is available at
[https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench). [https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
## v1.41
### Additions
* Adding a schmoo preset config that benchmarks local/remote reads, writes, and copies across a range of CU counts
* Usage: ./TransferBench schmoo <numBytes=64M> <localIdx=0> <remoteIdx=1> <maxNumCUs=32>
### Fixes
* Fixing some misreported timings when running with non-fixed number of iterations
## v1.40 ## v1.40
### Fixes ### Fixes
......
...@@ -116,6 +116,39 @@ int main(int argc, char **argv) ...@@ -116,6 +116,39 @@ int main(int argc, char **argv)
RunAllToAllBenchmark(ev, numBytesPerTransfer, numSubExecs); RunAllToAllBenchmark(ev, numBytesPerTransfer, numSubExecs);
exit(0); exit(0);
} }
// - Test schmoo benchmark
else if (!strcmp(argv[1], "schmoo"))
{
if (ev.numGpuDevices < 2)
{
printf("[ERROR] Schmoo benchmark requires at least 2 GPUs\n");
exit(1);
}
ev.configMode = CFG_SCHMOO;
int localIdx = (argc > 3 ? atoi(argv[3]) : 0);
int remoteIdx = (argc > 4 ? atoi(argv[4]) : 1);
int maxSubExecs = (argc > 5 ? atoi(argv[3]) : 32);
if (localIdx >= ev.numGpuDevices || remoteIdx >= ev.numGpuDevices)
{
printf("[ERROR] Cannot execute schmoo test with local GPU device %d, remote GPU device %d\n", localIdx, remoteIdx);
exit(1);
}
ev.DisplaySchmooEnvVars();
for (int N = 256; N <= (1<<27); N *= 2)
{
int delta = std::max(1, N / ev.samplingFactor);
int curr = (numBytesPerTransfer == 0) ? N : numBytesPerTransfer / sizeof(float);
do
{
RunSchmooBenchmark(ev, curr * sizeof(float), localIdx, remoteIdx, maxSubExecs);
if (numBytesPerTransfer != 0) exit(0);
curr += delta;
} while (curr < N * 2);
}
}
else if (!strcmp(argv[1], "cmdline")) else if (!strcmp(argv[1], "cmdline"))
{ {
// Print environment variables and CSV header // Print environment variables and CSV header
...@@ -547,8 +580,8 @@ void ExecuteTransfers(EnvVars const& ev, ...@@ -547,8 +580,8 @@ void ExecuteTransfers(EnvVars const& ev,
int totalCUs = 0; int totalCUs = 0;
for (auto const& transfer : exeInfo.transfers) for (auto const& transfer : exeInfo.transfers)
{ {
double transferDurationMsec = transfer->transferTime / (1.0 * numTimedIterations); transfer->transferTime /= (1.0 * numTimedIterations);
double transferBandwidthGbs = (transfer->numBytesActual / 1.0E9) / transferDurationMsec * 1000.0f; double transferBandwidthGbs = (transfer->numBytesActual / 1.0E9) / transfer->transferTime * 1000.0f;
totalCUs += transfer->numSubExecs; totalCUs += transfer->numSubExecs;
if (!verbose) continue; if (!verbose) continue;
...@@ -557,7 +590,7 @@ void ExecuteTransfers(EnvVars const& ev, ...@@ -557,7 +590,7 @@ void ExecuteTransfers(EnvVars const& ev,
printf(" Transfer %02d | %7.3f GB/s | %8.3f ms | %12lu bytes | %s -> %s%02d:%03d -> %s\n", printf(" Transfer %02d | %7.3f GB/s | %8.3f ms | %12lu bytes | %s -> %s%02d:%03d -> %s\n",
transfer->transferIndex, transfer->transferIndex,
transferBandwidthGbs, transferBandwidthGbs,
transferDurationMsec, transfer->transferTime,
transfer->numBytesActual, transfer->numBytesActual,
transfer->SrcToStr().c_str(), transfer->SrcToStr().c_str(),
ExeTypeName[transfer->exeType], transfer->exeIndex, ExeTypeName[transfer->exeType], transfer->exeIndex,
...@@ -572,7 +605,7 @@ void ExecuteTransfers(EnvVars const& ev, ...@@ -572,7 +605,7 @@ void ExecuteTransfers(EnvVars const& ev,
for (int i = 0; i < numTimedIterations; i++) for (int i = 0; i < numTimedIterations; i++)
{ {
times.insert(std::make_pair(transfer->perIterationTime[i], i+1)); times.insert(std::make_pair(transfer->perIterationTime[i], i+1));
double const varTime = fabs(transferDurationMsec - transfer->perIterationTime[i]); double const varTime = fabs(transfer->transferTime - transfer->perIterationTime[i]);
stdDevTime += varTime * varTime; stdDevTime += varTime * varTime;
double iterBandwidthGbs = (transfer->numBytesActual / 1.0E9) / transfer->perIterationTime[i] * 1000.0f; double iterBandwidthGbs = (transfer->numBytesActual / 1.0E9) / transfer->perIterationTime[i] * 1000.0f;
...@@ -614,7 +647,7 @@ void ExecuteTransfers(EnvVars const& ev, ...@@ -614,7 +647,7 @@ void ExecuteTransfers(EnvVars const& ev,
MemTypeStr[transfer->exeType], transfer->exeIndex, MemTypeStr[transfer->exeType], transfer->exeIndex,
transfer->DstToStr().c_str(), transfer->DstToStr().c_str(),
transfer->numSubExecs, transfer->numSubExecs,
transferBandwidthGbs, transferDurationMsec, transferBandwidthGbs, transfer->transferTime,
PtrVectorToStr(transfer->srcMem, initOffset).c_str(), PtrVectorToStr(transfer->srcMem, initOffset).c_str(),
PtrVectorToStr(transfer->dstMem, initOffset).c_str()); PtrVectorToStr(transfer->dstMem, initOffset).c_str());
} }
...@@ -634,15 +667,15 @@ void ExecuteTransfers(EnvVars const& ev, ...@@ -634,15 +667,15 @@ void ExecuteTransfers(EnvVars const& ev,
for (auto const& transferPair : transferList) for (auto const& transferPair : transferList)
{ {
Transfer* transfer = transferPair.second; Transfer* transfer = transferPair.second;
double transferDurationMsec = transfer->transferTime / (1.0 * numTimedIterations); transfer->transferTime /= (1.0 * numTimedIterations);
double transferBandwidthGbs = (transfer->numBytesActual / 1.0E9) / transferDurationMsec * 1000.0f; double transferBandwidthGbs = (transfer->numBytesActual / 1.0E9) / transfer->transferTime * 1000.0f;
maxGpuTime = std::max(maxGpuTime, transferDurationMsec); maxGpuTime = std::max(maxGpuTime, transfer->transferTime);
if (!verbose) continue; if (!verbose) continue;
if (!ev.outputToCsv) if (!ev.outputToCsv)
{ {
printf(" Transfer %02d | %7.3f GB/s | %8.3f ms | %12lu bytes | %s -> %s%02d:%03d -> %s\n", printf(" Transfer %02d | %7.3f GB/s | %8.3f ms | %12lu bytes | %s -> %s%02d:%03d -> %s\n",
transfer->transferIndex, transfer->transferIndex,
transferBandwidthGbs, transferDurationMsec, transferBandwidthGbs, transfer->transferTime,
transfer->numBytesActual, transfer->numBytesActual,
transfer->SrcToStr().c_str(), transfer->SrcToStr().c_str(),
ExeTypeName[transfer->exeType], transfer->exeIndex, ExeTypeName[transfer->exeType], transfer->exeIndex,
...@@ -657,7 +690,7 @@ void ExecuteTransfers(EnvVars const& ev, ...@@ -657,7 +690,7 @@ void ExecuteTransfers(EnvVars const& ev,
for (int i = 0; i < numTimedIterations; i++) for (int i = 0; i < numTimedIterations; i++)
{ {
times.insert(std::make_pair(transfer->perIterationTime[i], i+1)); times.insert(std::make_pair(transfer->perIterationTime[i], i+1));
double const varTime = fabs(transferDurationMsec - transfer->perIterationTime[i]); double const varTime = fabs(transfer->transferTime - transfer->perIterationTime[i]);
stdDevTime += varTime * varTime; stdDevTime += varTime * varTime;
double iterBandwidthGbs = (transfer->numBytesActual / 1.0E9) / transfer->perIterationTime[i] * 1000.0f; double iterBandwidthGbs = (transfer->numBytesActual / 1.0E9) / transfer->perIterationTime[i] * 1000.0f;
...@@ -698,7 +731,7 @@ void ExecuteTransfers(EnvVars const& ev, ...@@ -698,7 +731,7 @@ void ExecuteTransfers(EnvVars const& ev,
ExeTypeName[transfer->exeType], transfer->exeIndex, ExeTypeName[transfer->exeType], transfer->exeIndex,
transfer->DstToStr().c_str(), transfer->DstToStr().c_str(),
transfer->numSubExecs, transfer->numSubExecs,
transferBandwidthGbs, transferDurationMsec, transferBandwidthGbs, transfer->transferTime,
PtrVectorToStr(transfer->srcMem, initOffset).c_str(), PtrVectorToStr(transfer->srcMem, initOffset).c_str(),
PtrVectorToStr(transfer->dstMem, initOffset).c_str()); PtrVectorToStr(transfer->dstMem, initOffset).c_str());
} }
...@@ -1646,7 +1679,7 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N) ...@@ -1646,7 +1679,7 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
for (int dir = 0; dir <= isBidirectional; dir++) for (int dir = 0; dir <= isBidirectional; dir++)
{ {
double const avgTime = transfers[dir].transferTime / ev.numIterations; double const avgTime = transfers[dir].transferTime;
double const avgBw = (transfers[dir].numBytesActual / 1.0E9) / avgTime * 1000.0f; double const avgBw = (transfers[dir].numBytesActual / 1.0E9) / avgTime * 1000.0f;
avgBandwidth[dir].push_back(avgBw); avgBandwidth[dir].push_back(avgBw);
...@@ -1849,8 +1882,7 @@ void RunScalingBenchmark(EnvVars const& ev, size_t N, int const exeIndex, int co ...@@ -1849,8 +1882,7 @@ void RunScalingBenchmark(EnvVars const& ev, size_t N, int const exeIndex, int co
transfers[0].dstIndex[0] = i < numCpus ? i : i - numCpus; transfers[0].dstIndex[0] = i < numCpus ? i : i - numCpus;
ExecuteTransfers(ev, 0, N, transfers, false); ExecuteTransfers(ev, 0, N, transfers, false);
double transferDurationMsec = transfers[0].transferTime / (1.0 * ev.numIterations); double transferBandwidthGbs = (transfers[0].numBytesActual / 1.0E9) / transfers[0].transferTime * 1000.0f;
double transferBandwidthGbs = (transfers[0].numBytesActual / 1.0E9) / transferDurationMsec * 1000.0f;
printf("%c%7.2f ", separator, transferBandwidthGbs); printf("%c%7.2f ", separator, transferBandwidthGbs);
if (transferBandwidthGbs > bestResult[i].first) if (transferBandwidthGbs > bestResult[i].first)
...@@ -1955,8 +1987,7 @@ void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, i ...@@ -1955,8 +1987,7 @@ void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, i
if (reIndex.count(std::make_pair(src, dst))) if (reIndex.count(std::make_pair(src, dst)))
{ {
Transfer const& transfer = transfers[reIndex[std::make_pair(src,dst)]]; Transfer const& transfer = transfers[reIndex[std::make_pair(src,dst)]];
double transferDurationMsec = transfer.transferTime / (1.0 * ev.numIterations); double transferBandwidthGbs = (transfer.numBytesActual / 1.0E9) / transfer.transferTime * 1000.0f;
double transferBandwidthGbs = (transfer.numBytesActual / 1.0E9) / transferDurationMsec * 1000.0f;
colTotalBandwidth[dst] += transferBandwidthGbs; colTotalBandwidth[dst] += transferBandwidthGbs;
rowTotalBandwidth += transferBandwidthGbs; rowTotalBandwidth += transferBandwidthGbs;
totalBandwidthGpu += transferBandwidthGbs; totalBandwidthGpu += transferBandwidthGbs;
...@@ -2248,6 +2279,105 @@ std::string Transfer::DstToStr() const ...@@ -2248,6 +2279,105 @@ std::string Transfer::DstToStr() const
return ss.str(); return ss.str();
} }
void RunSchmooBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int const localIdx, int const remoteIdx, int const maxSubExecs)
{
printf("Bytes to transfer: %lu Local GPU: %d Remote GPU: %d\n", numBytesPerTransfer, localIdx, remoteIdx);
printf("| #CUs | Local Read | LocalWrite | Local Copy | RemoteRead |Remote Write| RemoteCopy |\n");
printf("|------|------------|------------|------------|------------|------------|------------|\n");
std::vector<Transfer> transfers(1);
Transfer& t = transfers[0];
t.exeType = EXE_GPU_GFX;
t.exeIndex = localIdx;
t.exeSubIndex = -1;
t.numBytes = numBytesPerTransfer;
for (int numCUs = 1; numCUs <= maxSubExecs; numCUs++)
{
t.numSubExecs = numCUs;
// Local Read
t.numSrcs = 1;
t.numDsts = 0;
t.srcType.resize(t.numSrcs);
t.dstType.resize(t.numDsts);
t.srcIndex.resize(t.numSrcs);
t.dstIndex.resize(t.numDsts);
t.srcType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.srcIndex[0] = localIdx;
ExecuteTransfers(ev, 0, 0, transfers, false);
double const localRead = (t.numBytesActual / 1.0E9) / t.transferTime * 1000.0f;
// Local Write
t.numSrcs = 0;
t.numDsts = 1;
t.srcType.resize(t.numSrcs);
t.dstType.resize(t.numDsts);
t.srcIndex.resize(t.numSrcs);
t.dstIndex.resize(t.numDsts);
t.dstType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.dstIndex[0] = localIdx;
ExecuteTransfers(ev, 0, 0, transfers, false);
double const localWrite = (t.numBytesActual / 1.0E9) / t.transferTime * 1000.0f;
// Local Copy
t.numSrcs = 1;
t.numDsts = 1;
t.srcType.resize(t.numSrcs);
t.dstType.resize(t.numDsts);
t.srcIndex.resize(t.numSrcs);
t.dstIndex.resize(t.numDsts);
t.srcType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.srcIndex[0] = localIdx;
t.dstType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.dstIndex[0] = localIdx;
ExecuteTransfers(ev, 0, 0, transfers, false);
double const localCopy = (t.numBytesActual / 1.0E9) / t.transferTime * 1000.0f;
// Remote Read
t.numSrcs = 1;
t.numDsts = 0;
t.srcType.resize(t.numSrcs);
t.dstType.resize(t.numDsts);
t.srcIndex.resize(t.numSrcs);
t.dstIndex.resize(t.numDsts);
t.srcType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.srcIndex[0] = remoteIdx;
ExecuteTransfers(ev, 0, 0, transfers, false);
double const remoteRead = (t.numBytesActual / 1.0E9) / t.transferTime * 1000.0f;
// Remote Write
t.numSrcs = 0;
t.numDsts = 1;
t.srcType.resize(t.numSrcs);
t.dstType.resize(t.numDsts);
t.srcIndex.resize(t.numSrcs);
t.dstIndex.resize(t.numDsts);
t.dstType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.dstIndex[0] = remoteIdx;
ExecuteTransfers(ev, 0, 0, transfers, false);
double const remoteWrite = (t.numBytesActual / 1.0E9) / t.transferTime * 1000.0f;
// Remote Copy
t.numSrcs = 1;
t.numDsts = 1;
t.srcType.resize(t.numSrcs);
t.dstType.resize(t.numDsts);
t.srcIndex.resize(t.numSrcs);
t.dstIndex.resize(t.numDsts);
t.srcType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.srcIndex[0] = localIdx;
t.dstType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.dstIndex[0] = remoteIdx;
ExecuteTransfers(ev, 0, 0, transfers, false);
double const remoteCopy = (t.numBytesActual / 1.0E9) / t.transferTime * 1000.0f;
printf("| %3d | %10.3f | %10.3f | %10.3f | %10.3f | %10.3f | %10.3f |\n",
numCUs, localRead, localWrite, localCopy, remoteRead, remoteWrite, remoteCopy);
}
}
void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int const numGpuSubExecs, int const numCpuSubExecs, bool const isRandom) void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int const numGpuSubExecs, int const numCpuSubExecs, bool const isRandom)
{ {
ev.DisplaySweepEnvVars(); ev.DisplaySweepEnvVars();
......
...@@ -29,7 +29,7 @@ THE SOFTWARE. ...@@ -29,7 +29,7 @@ THE SOFTWARE.
#include "Compatibility.hpp" #include "Compatibility.hpp"
#include "Kernels.hpp" #include "Kernels.hpp"
#define TB_VERSION "1.40" #define TB_VERSION "1.41"
extern char const MemTypeStr[]; extern char const MemTypeStr[];
extern char const ExeTypeStr[]; extern char const ExeTypeStr[];
...@@ -40,7 +40,8 @@ enum ConfigModeEnum ...@@ -40,7 +40,8 @@ enum ConfigModeEnum
CFG_P2P = 1, CFG_P2P = 1,
CFG_SWEEP = 2, CFG_SWEEP = 2,
CFG_SCALE = 3, CFG_SCALE = 3,
CFG_A2A = 4 CFG_A2A = 4,
CFG_SCHMOO = 5
}; };
enum BlockOrderEnum enum BlockOrderEnum
...@@ -728,6 +729,16 @@ public: ...@@ -728,6 +729,16 @@ public:
printf("\n"); printf("\n");
} }
// Displays the general environment variables via DisplayEnvVars(), then the
// settings specific to the schmoo preset unless hideEnv is set.
void DisplaySchmooEnvVars() const
{
  DisplayEnvVars();

  if (!hideEnv)
  {
    // The section heading is only printed for non-CSV output
    if (!outputToCsv)
    {
      printf("[Schmoo Related]\n");
    }

    char const* const granularity = (useFineGrain ? "fine" : "coarse");
    PRINT_EV("USE_FINE_GRAIN", useFineGrain,
             std::string("Using ") + granularity + "-grained memory");
  }
}
// Helper function that parses an environment variable or falls back to a default value // Helper function that parses an environment variable or falls back to a default value
static int GetEnvVar(std::string const& varname, int defaultValue) static int GetEnvVar(std::string const& varname, int defaultValue)
{ {
......
...@@ -100,32 +100,33 @@ ExeType inline CharToExeType(char const c) ...@@ -100,32 +100,33 @@ ExeType inline CharToExeType(char const c)
// then writes the summation to each of the specified destination memory location(s) // then writes the summation to each of the specified destination memory location(s)
struct Transfer struct Transfer
{ {
int transferIndex; // Transfer identifier (within a Test) // Inputs
ExeType exeType; // Transfer executor type ExeType exeType; // Transfer executor type
int exeIndex; // Executor index (NUMA node for CPU / device ID for GPU) int exeIndex; // Executor index (NUMA node for CPU / device ID for GPU)
int exeSubIndex; // Executor subindex int exeSubIndex; // Executor subindex
int numSubExecs; // Number of subExecutors to use for this Transfer int numSubExecs; // Number of subExecutors to use for this Transfer
size_t numBytes; // # of bytes requested to Transfer (may be 0 to fallback to default) size_t numBytes; // # of bytes requested to Transfer (may be 0 to fallback to default)
size_t numBytesActual; // Actual number of bytes to copy
double transferTime; // Time taken in milliseconds
int numSrcs; // Number of sources int numSrcs; // Number of sources
std::vector<MemType> srcType; // Source memory types std::vector<MemType> srcType; // Source memory types
std::vector<int> srcIndex; // Source device indice std::vector<int> srcIndex; // Source device indice
std::vector<float*> srcMem; // Source memory
int numDsts; // Number of destinations int numDsts; // Number of destinations
std::vector<MemType> dstType; // Destination memory type std::vector<MemType> dstType; // Destination memory type
std::vector<int> dstIndex; // Destination device index std::vector<int> dstIndex; // Destination device index
std::vector<float*> dstMem; // Destination memory
// Outputs
size_t numBytesActual; // Actual number of bytes to copy
double transferTime; // Time taken in milliseconds
std::vector<double> perIterationTime; // Per-iteration timing
std::vector<std::set<std::pair<int,int>>> perIterationCUs; // Per-iteration CU usage
// Internal
int transferIndex; // Transfer identifier (within a Test)
std::vector<float*> srcMem; // Source memory
std::vector<float*> dstMem; // Destination memory
std::vector<SubExecParam> subExecParam; // Defines subarrays assigned to each threadblock std::vector<SubExecParam> subExecParam; // Defines subarrays assigned to each threadblock
SubExecParam* subExecParamGpuPtr; // Pointer to GPU copy of subExecParam SubExecParam* subExecParamGpuPtr; // Pointer to GPU copy of subExecParam
std::vector<int> subExecIdx; // Indicies into subExecParamGpu std::vector<int> subExecIdx; // Indicies into subExecParamGpu
std::vector<double> perIterationTime; // Per-iteration timing
std::vector<std::set<std::pair<int,int>>> perIterationCUs; // Per-iteration CU usage
// Prepares src/dst subarray pointers for each SubExecutor // Prepares src/dst subarray pointers for each SubExecutor
void PrepareSubExecParams(EnvVars const& ev); void PrepareSubExecParams(EnvVars const& ev);
...@@ -190,6 +191,7 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N); ...@@ -190,6 +191,7 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N);
void RunScalingBenchmark(EnvVars const& ev, size_t N, int const exeIndex, int const maxSubExecs); void RunScalingBenchmark(EnvVars const& ev, size_t N, int const exeIndex, int const maxSubExecs);
void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int const numGpuSubExec, int const numCpuSubExec, bool const isRandom); void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int const numGpuSubExec, int const numCpuSubExec, bool const isRandom);
void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int const numSubExecs); void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int const numSubExecs);
void RunSchmooBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int const localIdx, int const remoteIdx, int const maxSubExecs);
std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount); std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment