Unverified Commit eaf32b47 authored by gilbertlee-amd's avatar gilbertlee-amd Committed by GitHub
Browse files

Adding pcopy benchmark, fixing CPU kernel on null destination (#101)

* Adding pcopy benchmark, fixing CPU kernel on null destination
parent 4c4fa4a3
......@@ -3,6 +3,15 @@
Documentation for TransferBench is available at
[https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
## v1.50
### Added
- Adding new parallel copy preset benchmark (pcopy)
- Usage: ./TransferBench pcopy <numBytes=64M> <#CUs=8> <srcGpu=0> <minGpus=1> <maxGpus=#GPU-1>
### Fixed
- Removed non-copies DMA Transfers (this had previously been using hipMemset)
- Fixed CPU executor when operating on null destination
## v1.49
### Fixes
......
......@@ -70,10 +70,11 @@ make
* `a2a` : All-to-all benchmark test
* `cmdline`: Take in Transfers to run from command-line instead of via file
* `p2p` : Peer-to-peer benchmark test
* `pcopy` : Benchmark parallel copies from a single GPU to other GPUs
* `rsweep` : Random sweep across possible sets of transfers
* `rwrite` : Benchmarks parallel remote writes from a single GPU
* `rwrite` : Benchmarks parallel remote writes from a single GPU to other GPUs
* `scaling`: GPU subexecutor scaling tests
* 'schmoo` : Local/Remote read/write/copy between two GPUs
* `schmoo` : Local/Remote read/write/copy between two GPUs
* `sweep` : Sweep across possible sets of transfers
* When using the same GPU executor in multiple simultaneous transfers on separate streams (USE_SINGLE_STREAM=0),
......
......@@ -175,6 +175,40 @@ int main(int argc, char **argv)
} while (curr < N * 2);
}
}
else if (!strcmp(argv[1], "pcopy"))
{
if (ev.numGpuDevices < 2)
{
printf("[ERROR] Parallel copy benchmark requires at least 2 GPUs\n");
exit(1);
}
ev.DisplayParallelCopyEnvVars();
int numSubExecs = (argc > 3 ? atoi(argv[3]) : 8);
int srcIdx = (argc > 4 ? atoi(argv[4]) : 0);
int minGpus = (argc > 5 ? atoi(argv[5]) : 1);
int maxGpus = (argc > 6 ? atoi(argv[6]) : ev.numGpuDevices - 1);
if (maxGpus > ev.gpuMaxHwQueues && ev.useDmaCopy)
{
printf("[ERROR] DMA executor %d attempting %d parallel transfers, however GPU_MAX_HW_QUEUES only set to %d\n",
srcIdx, maxGpus, ev.gpuMaxHwQueues);
printf("[ERROR] Aborting to avoid misleading results due to potential serialization of Transfers\n");
exit(1);
}
for (int N = 256; N <= (1<<27); N *= 2)
{
int delta = std::max(1, N / ev.samplingFactor);
int curr = (numBytesPerTransfer == 0) ? N : numBytesPerTransfer / sizeof(float);
do
{
RunParallelCopyBenchmark(ev, curr * sizeof(float), numSubExecs, srcIdx, minGpus, maxGpus);
if (numBytesPerTransfer != 0) exit(0);
curr += delta;
} while (curr < N * 2);
}
}
else if (!strcmp(argv[1], "cmdline"))
{
// Print environment variables and CSV header
......@@ -969,16 +1003,22 @@ void DisplayUsage(char const* cmdName)
printf(" config: Either:\n");
printf(" - Filename of configFile containing Transfers to execute (see example.cfg for format)\n");
printf(" - Name of preset config:\n");
printf(" a2a - GPU All-To-All benchmark\n");
printf(" - 3rd optional arg: # of SubExecs to use\n");
printf(" cmdline - Read Transfers from command line arguments (after N)\n");
printf(" p2p - Peer-to-peer benchmark tests\n");
printf(" rwrite/pcopy - Parallel writes/copies from single GPU to other GPUs\n");
printf(" - 3rd optional arg: # GPU SubExecs per Transfer\n");
printf(" - 4th optional arg: Root GPU index\n");
printf(" - 5th optional arg: Min number of other GPUs to transfer to\n");
printf(" - 6th optional arg: Max number of other GPUs to transfer to\n");
printf(" sweep/rsweep - Sweep/random sweep across possible sets of Transfers\n");
printf(" - 3rd optional arg: # GPU SubExecs per Transfer\n");
printf(" - 4th optional arg: # CPU SubExecs per Transfer\n");
printf(" scaling - GPU SubExec scaling copy test\n");
printf(" scaling - GPU GFX SubExec scaling copy test\n");
printf(" - 3th optional arg: Max # of SubExecs to use\n");
printf(" - 4rd optional arg: GPU index to use as executor\n");
printf(" a2a - GPU All-To-All benchmark\n");
printf(" - 3rd optional arg: # of SubExecs to use\n");
printf(" cmdline - Read Transfers from command line arguments (after N)\n");
printf(" schmoo - Local/RemoteRead/Write/Copy between two GPUs\n");
printf(" N : (Optional) Number of bytes to copy per Transfer.\n");
printf(" If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes\n",
DEFAULT_BYTES_PER_TRANSFER);
......@@ -1413,9 +1453,9 @@ void ParseTransfers(EnvVars const& ev, char* line, std::vector<Transfer>& transf
exit(1);
}
if (transfer.exeType == EXE_GPU_DMA && (transfer.numSrcs > 1 || transfer.numDsts > 1))
if (transfer.exeType == EXE_GPU_DMA && (transfer.numSrcs != 1 || transfer.numDsts != 1))
{
printf("[ERROR] GPU DMA executor can only be used for single source / single dst Transfers\n");
printf("[ERROR] GPU DMA executor can only be used for single source + single dst copies\n");
exit(1);
}
......@@ -1718,12 +1758,7 @@ void RunTransfer(EnvVars const& ev, int const iteration,
hipEvent_t& stopEvent = exeInfo.stopEvents[transferIdx];
HIP_CALL(hipEventRecord(startEvent, stream));
if (transfer->numSrcs == 0 && transfer->numDsts == 1)
{
HIP_CALL(hipMemsetAsync(transfer->dstMem[0],
MEMSET_CHAR, transfer->numBytesActual, stream));
}
else if (transfer->numSrcs == 1 && transfer->numDsts == 1)
if (transfer->numSrcs == 1 && transfer->numDsts == 1)
{
HIP_CALL(hipMemcpyAsync(transfer->dstMem[0], transfer->srcMem[0],
transfer->numBytesActual, hipMemcpyDefault,
......@@ -2756,7 +2791,91 @@ void RunRemoteWriteBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer
{
printf(" (%s %c%d %s)",
transfers[i].SrcToStr().c_str(),
MemTypeStr[transfers[i].exeType], transfers[i].exeIndex,
ExeTypeStr[transfers[i].exeType], transfers[i].exeIndex,
transfers[i].DstToStr().c_str());
}
printf("\n");
}
}
}
}
void RunParallelCopyBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus)
{
if (ev.useDmaCopy)
printf("Bytes to copy: %lu from GPU %d using DMA [Sweeping %d to %d parallel writes]\n",
numBytesPerTransfer, srcIdx, minGpus, maxGpus);
else
printf("Bytes to copy: %lu from GPU %d using GFX (%d CUs) [Sweeping %d to %d parallel writes]\n",
numBytesPerTransfer, srcIdx, numSubExecs, minGpus, maxGpus);
char sep = (ev.outputToCsv ? ',' : ' ');
for (int i = 0; i < ev.numGpuDevices; i++)
{
if (i == srcIdx) continue;
printf(" GPU %-3d %c", i, sep);
}
printf("\n");
if (!ev.outputToCsv)
{
for (int i = 0; i < ev.numGpuDevices-1; i++)
{
printf("-------------");
}
printf("\n");
}
for (int p = minGpus; p <= maxGpus; p++)
{
for (int bitmask = 0; bitmask < (1<<ev.numGpuDevices); bitmask++)
{
if (bitmask & (1<<srcIdx)) continue;
if (__builtin_popcount(bitmask) == p)
{
std::vector<Transfer> transfers;
for (int i = 0; i < ev.numGpuDevices; i++)
{
if (bitmask & (1<<i))
{
Transfer t;
t.exeType = ev.useDmaCopy ? EXE_GPU_DMA : EXE_GPU_GFX;
t.exeSubIndex = -1;
t.numSubExecs = ev.useDmaCopy ? 1 : numSubExecs;
t.numBytes = numBytesPerTransfer;
t.numSrcs = 1;
t.numDsts = 1;
t.exeIndex = srcIdx;
t.srcType.resize(1);
t.srcType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.srcIndex.resize(1);
t.srcIndex[0] = srcIdx;
t.dstType.resize(1);
t.dstType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.dstIndex.resize(1);
t.dstIndex[0] = i;
transfers.push_back(t);
}
}
ExecuteTransfers(ev, 0, 0, transfers, false);
int counter = 0;
for (int i = 0; i < ev.numGpuDevices; i++)
{
if (bitmask & (1<<i))
printf(" %8.3f %c", transfers[counter++].transferBandwidth, sep);
else if (i != srcIdx)
printf(" %c", sep);
}
printf(" %d %d", p, numSubExecs);
for (auto i = 0; i < transfers.size(); i++)
{
printf(" (%s %c%d %s)",
transfers[i].SrcToStr().c_str(),
ExeTypeStr[transfers[i].exeType], transfers[i].exeIndex,
transfers[i].DstToStr().c_str());
}
printf("\n");
......
......@@ -29,7 +29,7 @@ THE SOFTWARE.
#include "Compatibility.hpp"
#include "Kernels.hpp"
#define TB_VERSION "1.49"
#define TB_VERSION "1.50"
extern char const MemTypeStr[];
extern char const ExeTypeStr[];
......@@ -797,6 +797,18 @@ public:
printf("\n");
}
void DisplayParallelCopyEnvVars() const
{
DisplayEnvVars();
if (hideEnv) return;
if (!outputToCsv)
printf("[Parallel-copy Related]\n");
PRINT_EV("USE_FINE_GRAIN", useFineGrain,
std::string("Using ") + (useFineGrain ? "fine" : "coarse") + "-grained memory");
PRINT_EV("USE_GPU_DMA", useDmaCopy,
std::string("Using GPU-") + (useDmaCopy ? "DMA" : "GFX") + " as GPU executor");
printf("\n");
}
// Helper function that gets parses environment variable or sets to default value
static int GetEnvVar(std::string const& varname, int defaultValue)
......
......@@ -95,16 +95,32 @@ void CpuReduceKernel(SubExecParam const& p)
else if (numSrcs == 1)
{
float const* __restrict__ src = p.src[0];
if (numDsts == 0)
{
float sum = 0.0;
for (int j = 0; j < p.N; j++)
sum += p.src[0][j];
// Add a dummy check to ensure the read is not optimized out
if (sum != sum)
{
printf("[ERROR] Nan detected\n");
}
}
else
{
for (int i = 0; i < numDsts; ++i)
{
memcpy(p.dst[i], src, p.N * sizeof(float));
}
}
}
else
{
float sum = 0.0f;
for (int j = 0; j < p.N; j++)
{
float sum = p.src[0][j];
sum = p.src[0][j];
for (int i = 1; i < numSrcs; i++) sum += p.src[i][j];
for (int i = 0; i < numDsts; i++) p.dst[i][j] = sum;
}
......
......@@ -191,6 +191,7 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int con
void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int const numSubExecs);
void RunSchmooBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int const localIdx, int const remoteIdx, int const maxSubExecs);
void RunRemoteWriteBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus);
void RunParallelCopyBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus);
std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment