Commit 1c2484aa authored by Sam Wu

Update doc reqs

parents 6b1aa592 3ee75292
......@@ -3,6 +3,15 @@
Documentation for TransferBench is available at
[https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
## v1.50
### Added
- Added new parallel copy preset benchmark (pcopy)
- Usage: ./TransferBench pcopy <numBytes=64M> <#CUs=8> <srcGpu=0> <minGpus=1> <maxGpus=#GPU-1>
### Fixed
- Removed non-copy DMA Transfers (these had previously been implemented using hipMemset)
- Fixed CPU executor when operating on null destination
## v1.49
### Fixes
......
......@@ -70,10 +70,11 @@ make
* `a2a` : All-to-all benchmark test
* `cmdline`: Take in Transfers to run from command-line instead of via file
* `p2p` : Peer-to-peer benchmark test
* `pcopy` : Benchmark parallel copies from a single GPU to other GPUs
* `rsweep` : Random sweep across possible sets of transfers
* `rwrite` : Benchmarks parallel remote writes from a single GPU
* `rwrite` : Benchmarks parallel remote writes from a single GPU to other GPUs
* `scaling`: GPU subexecutor scaling tests
* 'schmoo` : Local/Remote read/write/copy between two GPUs
* `schmoo` : Local/Remote read/write/copy between two GPUs
* `sweep` : Sweep across possible sets of transfers
* When using the same GPU executor in multiple simultaneous transfers on separate streams (USE_SINGLE_STREAM=0),
......
rocm-docs-core==0.36.0
rocm-docs-core==0.38.1
......@@ -2,7 +2,7 @@
# This file is autogenerated by pip-compile with Python 3.8
# by the following command:
#
# pip-compile --resolver=backtracking requirements.in
# pip-compile --resolver=backtracking docs/sphinx/requirements.in
#
accessible-pygments==0.0.4
# via pydata-sphinx-theme
......@@ -25,54 +25,35 @@ cffi==1.16.0
charset-normalizer==3.3.2
# via requests
click==8.1.7
# via
# click-log
# doxysphinx
# sphinx-external-toc
click-log==0.4.0
# via doxysphinx
# via sphinx-external-toc
cryptography==42.0.5
# via pyjwt
deprecated==1.2.14
# via pygithub
docutils==0.17.1
docutils==0.19
# via
# breathe
# myst-parser
# pybtex-docutils
# pydata-sphinx-theme
# sphinx
# sphinxcontrib-bibtex
doxysphinx==3.3.7
# via rocm-docs-core
fastjsonschema==2.19.1
# via rocm-docs-core
gitdb==4.0.11
# via gitpython
gitpython==3.1.42
gitpython==3.1.43
# via rocm-docs-core
idna==3.6
idna==3.7
# via requests
imagesize==1.4.1
# via sphinx
importlib-metadata==7.0.2
# via
# sphinx
# sphinxcontrib-bibtex
importlib-resources==6.3.2
# via
# mpire
# rocm-docs-core
importlib-metadata==7.1.0
# via sphinx
importlib-resources==6.4.0
# via rocm-docs-core
jinja2==3.1.3
# via
# myst-parser
# sphinx
latexcodec==3.0.0
# via pybtex
libsass==0.22.0
# via doxysphinx
lxml==4.9.4
# via doxysphinx
markdown-it-py==2.2.0
# via
# mdit-py-plugins
......@@ -83,60 +64,44 @@ mdit-py-plugins==0.3.5
# via myst-parser
mdurl==0.1.2
# via markdown-it-py
mpire==2.10.1
# via doxysphinx
myst-parser==1.0.0
# via rocm-docs-core
numpy==1.24.4
# via -r requirements.in
packaging==24.0
# via
# pydata-sphinx-theme
# sphinx
pybtex==0.24.0
# via
# pybtex-docutils
# sphinxcontrib-bibtex
pybtex-docutils==1.0.3
# via sphinxcontrib-bibtex
pycparser==2.21
pycparser==2.22
# via cffi
pydata-sphinx-theme==0.14.4
# via
# rocm-docs-core
# sphinx-book-theme
pygithub==2.2.0
pygithub==2.3.0
# via rocm-docs-core
pygments==2.17.2
# via
# accessible-pygments
# mpire
# pydata-sphinx-theme
# sphinx
pyjson5==1.6.6
# via doxysphinx
pyjwt[crypto]==2.8.0
# via pygithub
# via
# pygithub
# pyjwt
pynacl==1.5.0
# via pygithub
pyparsing==3.1.2
# via doxysphinx
pytz==2024.1
# via babel
pyyaml==6.0.1
# via
# myst-parser
# pybtex
# rocm-docs-core
# sphinx-external-toc
requests==2.31.0
# via
# pygithub
# sphinx
rocm-docs-core[api_reference]==0.37.0
# via -r requirements.in
six==1.16.0
# via pybtex
rocm-docs-core==0.38.1
# via -r docs/sphinx/requirements.in
smmap==5.0.1
# via gitdb
snowballstemmer==2.2.0
......@@ -154,7 +119,6 @@ sphinx==5.3.0
# sphinx-design
# sphinx-external-toc
# sphinx-notfound-page
# sphinxcontrib-bibtex
sphinx-book-theme==1.0.1
# via rocm-docs-core
sphinx-copybutton==0.5.2
......@@ -167,8 +131,6 @@ sphinx-notfound-page==1.0.0
# via rocm-docs-core
sphinxcontrib-applehelp==1.0.4
# via sphinx
sphinxcontrib-bibtex==2.6.2
# via -r requirements.in
sphinxcontrib-devhelp==1.0.2
# via sphinx
sphinxcontrib-htmlhelp==2.0.1
......@@ -179,9 +141,7 @@ sphinxcontrib-qthelp==1.0.3
# via sphinx
sphinxcontrib-serializinghtml==1.1.5
# via sphinx
tqdm==4.66.2
# via mpire
typing-extensions==4.10.0
typing-extensions==4.11.0
# via
# pydata-sphinx-theme
# pygithub
......
......@@ -175,6 +175,40 @@ int main(int argc, char **argv)
} while (curr < N * 2);
}
}
else if (!strcmp(argv[1], "pcopy"))
{
if (ev.numGpuDevices < 2)
{
printf("[ERROR] Parallel copy benchmark requires at least 2 GPUs\n");
exit(1);
}
ev.DisplayParallelCopyEnvVars();
int numSubExecs = (argc > 3 ? atoi(argv[3]) : 8);
int srcIdx = (argc > 4 ? atoi(argv[4]) : 0);
int minGpus = (argc > 5 ? atoi(argv[5]) : 1);
int maxGpus = (argc > 6 ? atoi(argv[6]) : ev.numGpuDevices - 1);
if (maxGpus > ev.gpuMaxHwQueues && ev.useDmaCopy)
{
printf("[ERROR] DMA executor %d attempting %d parallel transfers, however GPU_MAX_HW_QUEUES only set to %d\n",
srcIdx, maxGpus, ev.gpuMaxHwQueues);
printf("[ERROR] Aborting to avoid misleading results due to potential serialization of Transfers\n");
exit(1);
}
for (int N = 256; N <= (1<<27); N *= 2)
{
int delta = std::max(1, N / ev.samplingFactor);
int curr = (numBytesPerTransfer == 0) ? N : numBytesPerTransfer / sizeof(float);
do
{
RunParallelCopyBenchmark(ev, curr * sizeof(float), numSubExecs, srcIdx, minGpus, maxGpus);
if (numBytesPerTransfer != 0) exit(0);
curr += delta;
} while (curr < N * 2);
}
}
else if (!strcmp(argv[1], "cmdline"))
{
// Print environment variables and CSV header
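As a note on the pcopy handler above: it sweeps transfer sizes from 256 floats up to 2^27 floats, doubling each time and sampling intermediate sizes every N / samplingFactor elements, unless a fixed byte count was passed on the command line. Below is a minimal standalone sketch of that size-sweep pattern; the samplingFactor value is assumed here purely for illustration (TransferBench takes it from ev.samplingFactor).

```cpp
// Sketch only: mirrors the pcopy size-sweep loop above, as a standalone program.
// samplingFactor is an assumed value here; TransferBench reads it from ev.samplingFactor.
#include <algorithm>
#include <cstdio>

int main()
{
  int const samplingFactor = 8;  // assumed for illustration
  for (int N = 256; N <= (1 << 27); N *= 2)
  {
    int const delta = std::max(1, N / samplingFactor);
    // With no fixed byte count, sizes from N up to (but excluding) 2*N floats are sampled
    for (int curr = N; curr < N * 2; curr += delta)
      printf("Would benchmark %zu bytes\n", curr * sizeof(float));
  }
  return 0;
}
```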
......@@ -969,16 +1003,22 @@ void DisplayUsage(char const* cmdName)
printf(" config: Either:\n");
printf(" - Filename of configFile containing Transfers to execute (see example.cfg for format)\n");
printf(" - Name of preset config:\n");
printf(" a2a - GPU All-To-All benchmark\n");
printf(" - 3rd optional arg: # of SubExecs to use\n");
printf(" cmdline - Read Transfers from command line arguments (after N)\n");
printf(" p2p - Peer-to-peer benchmark tests\n");
printf(" rwrite/pcopy - Parallel writes/copies from single GPU to other GPUs\n");
printf(" - 3rd optional arg: # GPU SubExecs per Transfer\n");
printf(" - 4th optional arg: Root GPU index\n");
printf(" - 5th optional arg: Min number of other GPUs to transfer to\n");
printf(" - 6th optional arg: Max number of other GPUs to transfer to\n");
printf(" sweep/rsweep - Sweep/random sweep across possible sets of Transfers\n");
printf(" - 3rd optional arg: # GPU SubExecs per Transfer\n");
printf(" - 4th optional arg: # CPU SubExecs per Transfer\n");
printf(" scaling - GPU SubExec scaling copy test\n");
printf(" scaling - GPU GFX SubExec scaling copy test\n");
printf(" - 3th optional arg: Max # of SubExecs to use\n");
printf(" - 4rd optional arg: GPU index to use as executor\n");
printf(" a2a - GPU All-To-All benchmark\n");
printf(" - 3rd optional arg: # of SubExecs to use\n");
printf(" cmdline - Read Transfers from command line arguments (after N)\n");
printf(" schmoo - Local/RemoteRead/Write/Copy between two GPUs\n");
printf(" N : (Optional) Number of bytes to copy per Transfer.\n");
printf(" If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes\n",
DEFAULT_BYTES_PER_TRANSFER);
......@@ -1413,9 +1453,9 @@ void ParseTransfers(EnvVars const& ev, char* line, std::vector<Transfer>& transf
exit(1);
}
if (transfer.exeType == EXE_GPU_DMA && (transfer.numSrcs > 1 || transfer.numDsts > 1))
if (transfer.exeType == EXE_GPU_DMA && (transfer.numSrcs != 1 || transfer.numDsts != 1))
{
printf("[ERROR] GPU DMA executor can only be used for single source / single dst Transfers\n");
printf("[ERROR] GPU DMA executor can only be used for single source + single dst copies\n");
exit(1);
}
......@@ -1718,12 +1758,7 @@ void RunTransfer(EnvVars const& ev, int const iteration,
hipEvent_t& stopEvent = exeInfo.stopEvents[transferIdx];
HIP_CALL(hipEventRecord(startEvent, stream));
if (transfer->numSrcs == 0 && transfer->numDsts == 1)
{
HIP_CALL(hipMemsetAsync(transfer->dstMem[0],
MEMSET_CHAR, transfer->numBytesActual, stream));
}
else if (transfer->numSrcs == 1 && transfer->numDsts == 1)
if (transfer->numSrcs == 1 && transfer->numDsts == 1)
{
HIP_CALL(hipMemcpyAsync(transfer->dstMem[0], transfer->srcMem[0],
transfer->numBytesActual, hipMemcpyDefault,
......@@ -2756,7 +2791,91 @@ void RunRemoteWriteBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer
{
printf(" (%s %c%d %s)",
transfers[i].SrcToStr().c_str(),
MemTypeStr[transfers[i].exeType], transfers[i].exeIndex,
ExeTypeStr[transfers[i].exeType], transfers[i].exeIndex,
transfers[i].DstToStr().c_str());
}
printf("\n");
}
}
}
}
void RunParallelCopyBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus)
{
if (ev.useDmaCopy)
printf("Bytes to copy: %lu from GPU %d using DMA [Sweeping %d to %d parallel writes]\n",
numBytesPerTransfer, srcIdx, minGpus, maxGpus);
else
printf("Bytes to copy: %lu from GPU %d using GFX (%d CUs) [Sweeping %d to %d parallel writes]\n",
numBytesPerTransfer, srcIdx, numSubExecs, minGpus, maxGpus);
char sep = (ev.outputToCsv ? ',' : ' ');
for (int i = 0; i < ev.numGpuDevices; i++)
{
if (i == srcIdx) continue;
printf(" GPU %-3d %c", i, sep);
}
printf("\n");
if (!ev.outputToCsv)
{
for (int i = 0; i < ev.numGpuDevices-1; i++)
{
printf("-------------");
}
printf("\n");
}
for (int p = minGpus; p <= maxGpus; p++)
{
for (int bitmask = 0; bitmask < (1<<ev.numGpuDevices); bitmask++)
{
if (bitmask & (1<<srcIdx)) continue;
if (__builtin_popcount(bitmask) == p)
{
std::vector<Transfer> transfers;
for (int i = 0; i < ev.numGpuDevices; i++)
{
if (bitmask & (1<<i))
{
Transfer t;
t.exeType = ev.useDmaCopy ? EXE_GPU_DMA : EXE_GPU_GFX;
t.exeSubIndex = -1;
t.numSubExecs = ev.useDmaCopy ? 1 : numSubExecs;
t.numBytes = numBytesPerTransfer;
t.numSrcs = 1;
t.numDsts = 1;
t.exeIndex = srcIdx;
t.srcType.resize(1);
t.srcType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.srcIndex.resize(1);
t.srcIndex[0] = srcIdx;
t.dstType.resize(1);
t.dstType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
t.dstIndex.resize(1);
t.dstIndex[0] = i;
transfers.push_back(t);
}
}
ExecuteTransfers(ev, 0, 0, transfers, false);
int counter = 0;
for (int i = 0; i < ev.numGpuDevices; i++)
{
if (bitmask & (1<<i))
printf(" %8.3f %c", transfers[counter++].transferBandwidth, sep);
else if (i != srcIdx)
printf(" %c", sep);
}
printf(" %d %d", p, numSubExecs);
for (auto i = 0; i < transfers.size(); i++)
{
printf(" (%s %c%d %s)",
transfers[i].SrcToStr().c_str(),
ExeTypeStr[transfers[i].exeType], transfers[i].exeIndex,
transfers[i].DstToStr().c_str());
}
printf("\n");
......
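The destination sweep in RunParallelCopyBenchmark above works by enumerating every bitmask over the GPU indices, skipping any mask that contains the source GPU and keeping only those whose population count matches the requested number of parallel copies. Below is a minimal standalone sketch of that subset-enumeration pattern; the GPU count and source index are assumed values chosen for illustration, not taken from the commit.

```cpp
// Sketch only: the destination-subset enumeration used by RunParallelCopyBenchmark,
// extracted into a standalone program. numGpuDevices and srcIdx are assumed values.
#include <cstdio>

int main()
{
  int const numGpuDevices = 4;  // assumed GPU count
  int const srcIdx        = 0;  // assumed source GPU index
  for (int p = 1; p <= numGpuDevices - 1; p++)          // number of parallel copies
  {
    for (int bitmask = 0; bitmask < (1 << numGpuDevices); bitmask++)
    {
      if (bitmask & (1 << srcIdx)) continue;            // source GPU cannot be a destination
      if (__builtin_popcount(bitmask) != p) continue;   // GCC/Clang builtin, as in the commit
      printf("p=%d destinations:", p);
      for (int i = 0; i < numGpuDevices; i++)
        if (bitmask & (1 << i)) printf(" GPU %d", i);
      printf("\n");
    }
  }
  return 0;
}
```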
......@@ -29,7 +29,7 @@ THE SOFTWARE.
#include "Compatibility.hpp"
#include "Kernels.hpp"
#define TB_VERSION "1.49"
#define TB_VERSION "1.50"
extern char const MemTypeStr[];
extern char const ExeTypeStr[];
......@@ -797,6 +797,18 @@ public:
printf("\n");
}
void DisplayParallelCopyEnvVars() const
{
DisplayEnvVars();
if (hideEnv) return;
if (!outputToCsv)
printf("[Parallel-copy Related]\n");
PRINT_EV("USE_FINE_GRAIN", useFineGrain,
std::string("Using ") + (useFineGrain ? "fine" : "coarse") + "-grained memory");
PRINT_EV("USE_GPU_DMA", useDmaCopy,
std::string("Using GPU-") + (useDmaCopy ? "DMA" : "GFX") + " as GPU executor");
printf("\n");
}
// Helper function that parses an environment variable or sets it to a default value
static int GetEnvVar(std::string const& varname, int defaultValue)
......
......@@ -95,16 +95,32 @@ void CpuReduceKernel(SubExecParam const& p)
else if (numSrcs == 1)
{
float const* __restrict__ src = p.src[0];
if (numDsts == 0)
{
float sum = 0.0;
for (int j = 0; j < p.N; j++)
sum += p.src[0][j];
// Add a dummy check to ensure the read is not optimized out
if (sum != sum)
{
printf("[ERROR] Nan detected\n");
}
}
else
{
for (int i = 0; i < numDsts; ++i)
{
memcpy(p.dst[i], src, p.N * sizeof(float));
}
}
}
else
{
float sum = 0.0f;
for (int j = 0; j < p.N; j++)
{
float sum = p.src[0][j];
sum = p.src[0][j];
for (int i = 1; i < numSrcs; i++) sum += p.src[i][j];
for (int i = 0; i < numDsts; i++) p.dst[i][j] = sum;
}
......
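The new read-only branch in CpuReduceKernel above accumulates the source values and then applies a self-comparison NaN check so the summation, and therefore the reads being timed, is not optimized away. Below is a minimal standalone sketch of the same idiom; the buffer size and fill value are assumed for illustration.

```cpp
// Sketch only: the read-only benchmark idiom from the CpuReduceKernel change above,
// as a standalone program. Buffer size and fill value are assumed for illustration.
#include <cstdio>
#include <vector>

int main()
{
  std::vector<float> src(1 << 20, 1.0f);  // assumed 1M-element source buffer

  float sum = 0.0f;
  for (size_t j = 0; j < src.size(); j++)
    sum += src[j];                        // the reads being benchmarked

  // Dummy check so the compiler does not discard the reads; (sum != sum) is true only for NaN
  if (sum != sum)
    printf("[ERROR] NaN detected\n");
  return 0;
}
```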
......@@ -191,6 +191,7 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int con
void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int const numSubExecs);
void RunSchmooBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int const localIdx, int const remoteIdx, int const maxSubExecs);
void RunRemoteWriteBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus);
void RunParallelCopyBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus);
std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount);
......