"references/video_classification/train.py" did not exist on "ad0daff1bbc8b148a6d96df6c9b1d9b9c1b6adad"
Unverified Commit 9598bdb6 authored by gilbertlee-amd's avatar gilbertlee-amd Committed by GitHub
Browse files

Merge pull request #1 from gilbertlee-amd/SweepPreset

Implementing sweep preset
parents ddb6508f 93430da1
# Changelog for TransferBench
## v1.03
### Added
- New stress-test preset modes "sweep" and "rsweep"
- sweep iterates over all possible sets of Transfers to test
- rsweep iterates over random sets of Transfers
- New sweep-only environment variables can modify sweep
- SWEEP_SRC - String containing only "B","C","F", or "G", defining possible source memory types
- SWEEP_EXE - String containing only "C" or "G", defining possible executors
- SWEEP_DST - String containing only "B","C","F", or "G", defining possible destination memory types
- SWEEP_SRC_IS_EXE - Restrict executor to be the same as the source if non-zero
- SWEEP_MIN - Minimum number of parallel transfers to test
- SWEEP_MAX - Maximum number of parallel transfers to test
- SWEEP_TEST_LIMIT - Maximum number of tests to run
- SWEEP_TIME_LIMIT - Maximum number of seconds to run tests for
- New environment variables to restrict the number of available CPUs/GPUs to test on (primarily for sweep runs)
- NUM_CPU_DEVICES - Number of CPU devices
- NUM_GPU_DEVICES - Number of GPU devices
### Changed
- Fixed timing display for CPU-executors when using single stream mode
## v1.02
### Added
- Setting NUM_ITERATIONS to negative number indicates to run for -NUM_ITERATIONS seconds per Test
......
......@@ -25,7 +25,9 @@ THE SOFTWARE.
#include <algorithm>
#define TB_VERSION "1.02"
#define TB_VERSION "1.03"
extern char const MemTypeStr[];
// This class manages environment variable that affect TransferBench
class EnvVars
......@@ -37,10 +39,21 @@ public:
int const DEFAULT_SAMPLING_FACTOR = 1;
int const DEFAULT_NUM_CPU_PER_TRANSFER = 4;
int const DEFAULT_SWEEP_SRC_IS_EXE = 0;
std::string const DEFAULT_SWEEP_SRC = "CG";
std::string const DEFAULT_SWEEP_EXE = "CG";
std::string const DEFAULT_SWEEP_DST = "CG";
int const DEFAULT_SWEEP_MIN = 1;
int const DEFAULT_SWEEP_MAX = 24;
int const DEFAULT_SWEEP_TEST_LIMIT = 0;
int const DEFAULT_SWEEP_TIME_LIMIT = 0;
// Environment variables
int blockBytes; // Each CU, except the last, gets a multiple of this many bytes to copy
int byteOffset; // Byte-offset for memory allocations
int numCpuDevices; // Number of CPU devices to use (defaults to # NUMA nodes detected)
int numCpuPerTransfer; // Number of CPU child threads to use per CPU Transfer
int numGpuDevices; // Number of GPU devices to use (defaults to # HIP devices detected)
int numIterations; // Number of timed iterations to perform. If negative, run for -numIterations seconds instead
int numWarmups; // Number of un-timed warmup iterations to perform
int outputToCsv; // Output in CSV format
......@@ -54,6 +67,16 @@ public:
std::vector<float> fillPattern; // Pattern of floats used to fill source data
// Environment variables only for Sweep-preset
int sweepSrcIsExe; // Non-zero if executor should always be the same as source
int sweepMin; // Min number of simultaneous Transfers to be executed per test
int sweepMax; // Max number of simulatneous Transfers to be executed per test
int sweepTestLimit; // Max number of tests to run during sweep (0 = no limit)
int sweepTimeLimit; // Max number of seconds to run sweep for (0 = no limit)
std::string sweepSrc; // Set of src memory types to be swept
std::string sweepExe; // Set of executors to be swept
std::string sweepDst; // Set of dst memory types to be swept
// Constructor that collects values
EnvVars()
{
......@@ -61,9 +84,15 @@ public:
hipDeviceGetAttribute(&maxSharedMemBytes,
hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, 0);
int numDetectedCpus = numa_num_configured_nodes();
int numDetectedGpus;
hipGetDeviceCount(&numGpuDevices);
blockBytes = GetEnvVar("BLOCK_BYTES" , 256);
byteOffset = GetEnvVar("BYTE_OFFSET" , 0);
numCpuDevices = GetEnvVar("NUM_CPU_DEVICES" , numDetectedCpus);
numCpuPerTransfer = GetEnvVar("NUM_CPU_PER_TRANSFER", DEFAULT_NUM_CPU_PER_TRANSFER);
numGpuDevices = GetEnvVar("NUM_GPU_DEVICES" , numDetectedGpus);
numIterations = GetEnvVar("NUM_ITERATIONS" , DEFAULT_NUM_ITERATIONS);
numWarmups = GetEnvVar("NUM_WARMUPS" , DEFAULT_NUM_WARMUPS);
outputToCsv = GetEnvVar("OUTPUT_TO_CSV" , 0);
......@@ -75,6 +104,15 @@ public:
usePcieIndexing = GetEnvVar("USE_PCIE_INDEX" , 0);
useSingleStream = GetEnvVar("USE_SINGLE_STREAM" , 0);
sweepSrcIsExe = GetEnvVar("SWEEP_SRC_IS_EXE", DEFAULT_SWEEP_SRC_IS_EXE);
sweepMin = GetEnvVar("SWEEP_MIN", DEFAULT_SWEEP_MIN);
sweepMax = GetEnvVar("SWEEP_MAX", DEFAULT_SWEEP_MAX);
sweepSrc = GetEnvVar("SWEEP_SRC", DEFAULT_SWEEP_SRC);
sweepExe = GetEnvVar("SWEEP_EXE", DEFAULT_SWEEP_EXE);
sweepDst = GetEnvVar("SWEEP_DST", DEFAULT_SWEEP_DST);
sweepTestLimit = GetEnvVar("SWEEP_TEST_LIMIT", DEFAULT_SWEEP_TEST_LIMIT);
sweepTimeLimit = GetEnvVar("SWEEP_TIME_LIMIT", DEFAULT_SWEEP_TIME_LIMIT);
// Check for fill pattern
char* pattern = getenv("FILL_PATTERN");
if (pattern != NULL)
......@@ -134,6 +172,16 @@ public:
else fillPattern.clear();
// Perform some basic validation
if (numCpuDevices > numDetectedCpus)
{
printf("[ERROR] Number of CPUs to use (%d) cannot exceed number of detected CPUs (%d)\n", numCpuDevices, numDetectedCpus);
exit(1);
}
if (numGpuDevices > numDetectedGpus)
{
printf("[ERROR] Number of GPUs to use (%d) cannot exceed number of detected GPUs (%d)\n", numGpuDevices, numDetectedGpus);
exit(1);
}
if (byteOffset % sizeof(float))
{
printf("[ERROR] BYTE_OFFSET must be set to multiple of %lu\n", sizeof(float));
......@@ -169,6 +217,49 @@ public:
printf("[ERROR] Single stream mode cannot be used with HIP calls\n");
exit(1);
}
for (auto ch : sweepSrc)
{
if (!strchr(MemTypeStr, ch))
{
printf("[ERROR] Unrecognized memory type '%c' specified for sweep source\n", ch);
exit(1);
}
if (strchr(sweepSrc.c_str(), ch) != strrchr(sweepSrc.c_str(), ch))
{
printf("[ERROR] Duplicate memory type '%c' specified for sweep source\n", ch);
exit(1);
}
}
for (auto ch : sweepDst)
{
if (!strchr(MemTypeStr, ch))
{
printf("[ERROR] Unrecognized memory type '%c' specified for sweep destination\n", ch);
exit(1);
}
if (strchr(sweepDst.c_str(), ch) != strrchr(sweepDst.c_str(), ch))
{
printf("[ERROR] Duplicate memory type '%c' specified for sweep destination\n", ch);
exit(1);
}
}
char const* permittedExecutors = "CG";
for (auto ch : sweepExe)
{
if (!strchr(permittedExecutors, ch))
{
printf("[ERROR] Unrecognized executor type '%c' specified for sweep executor\n", ch);
exit(1);
}
if (strchr(sweepExe.c_str(), ch) != strrchr(sweepExe.c_str(), ch))
{
printf("[ERROR] Duplicate executor type '%c' specified for sweep executor\n", ch);
exit(1);
}
}
}
// Display info on the env vars that can be used
......@@ -179,7 +270,9 @@ public:
printf(" BLOCK_BYTES=B - Each CU (except the last) receives a multiple of BLOCK_BYTES to copy\n");
printf(" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4. Defaults to 0\n");
printf(" FILL_PATTERN=STR - Fill input buffer with pattern specified in hex digits (0-9,a-f,A-F). Must be even number of digits, (byte-level big-endian)\n");
printf(" NUM_CPU_DEVICES=X - Restrict number of CPUs to X. May not be greater than # detected NUMA nodes\n");
printf(" NUM_CPU_PER_TRANSFER=C - Use C threads per Transfer for CPU-executed copies\n");
printf(" NUM_GPU_DEVICES=X - Restrict number of GCPUs to X. May not be greater than # detected HIP devices\n");
printf(" NUM_ITERATIONS=I - Perform I timed iteration(s) per test\n");
printf(" NUM_WARMUPS=W - Perform W untimed warmup iteration(s) per test\n");
printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n");
......@@ -207,8 +300,10 @@ public:
else
printf("Pseudo-random: (Element i = i modulo 383 + 31)");
printf("\n");
printf("%-20s = %12d : Using %d CPU thread(s) per CPU-based-copy Transfer\n", "NUM_CPU_PER_TRANSFER", numCpuPerTransfer, numCpuPerTransfer);
printf("%-20s = %12d : Running %d %s per topology\n", "NUM_ITERATIONS", numIterations,
printf("%-20s = %12d : Using %d CPU devices\n" , "NUM_CPU_DEVICES", numCpuDevices, numCpuDevices);
printf("%-20s = %12d : Using %d CPU thread(s) per CPU-executed Transfer\n", "NUM_CPU_PER_TRANSFER", numCpuPerTransfer, numCpuPerTransfer);
printf("%-20s = %12d : Using %d GPU devices\n", "NUM_GPU_DEVICES", numGpuDevices, numGpuDevices);
printf("%-20s = %12d : Running %d %s per test\n", "NUM_ITERATIONS", numIterations,
numIterations > 0 ? numIterations : -numIterations,
numIterations > 0 ? "timed iteration(s)" : "second(s)");
printf("%-20s = %12d : Running %d warmup iteration(s) per topology\n", "NUM_WARMUPS", numWarmups, numWarmups);
......@@ -236,13 +331,70 @@ public:
}
};
// Display the settings that apply to the sweep presets.
// Prints one row per environment variable (sweep-specific ones first, then the
// general ones) in the form "NAME = value : description".
// Nothing is printed when CSV output is enabled.
void DisplaySweepEnvVars() const
{
  if (outputToCsv) return;

  printf("Sweep configuration (TransferBench v%s)\n", TB_VERSION);
  printf("=====================================================\n");

  // Sweep-only settings
  printf("%-20s = %12s : Source Memory Types to sweep\n", "SWEEP_SRC", sweepSrc.c_str());
  printf("%-20s = %12s : Executor Types to sweep\n", "SWEEP_EXE", sweepExe.c_str());
  printf("%-20s = %12s : Destination Memory Types to sweep\n", "SWEEP_DST", sweepDst.c_str());
  printf("%-20s = %12d : Transfer executor %s Transfer source\n", "SWEEP_SRC_IS_EXE", sweepSrcIsExe, sweepSrcIsExe ? "must match" : "may have any");
  printf("%-20s = %12d : Min simultaneous Transfers\n", "SWEEP_MIN", sweepMin);
  printf("%-20s = %12d : Max simultaneous Transfers (0 = no limit)\n", "SWEEP_MAX", sweepMax);
  printf("%-20s = %12d : Max number of tests to run during sweep (0 = no limit)\n", "SWEEP_TEST_LIMIT", sweepTestLimit);
  printf("%-20s = %12d : Max number of seconds to run sweep for (0 = no limit)\n", "SWEEP_TIME_LIMIT", sweepTimeLimit);

  // General settings
  printf("%-20s = %12d : Using %d CPU devices\n" , "NUM_CPU_DEVICES", numCpuDevices, numCpuDevices);
  printf("%-20s = %12d : Using %d CPU thread(s) per CPU-executed Transfer\n", "NUM_CPU_PER_TRANSFER", numCpuPerTransfer, numCpuPerTransfer);
  printf("%-20s = %12d : Using %d GPU devices\n", "NUM_GPU_DEVICES", numGpuDevices, numGpuDevices);
  printf("%-20s = %12d : Each CU gets a multiple of %d bytes to copy\n", "BLOCK_BYTES", blockBytes, blockBytes);
  printf("%-20s = %12d : Using byte offset of %d\n", "BYTE_OFFSET", byteOffset, byteOffset);
  printf("%-20s = %12s : ", "FILL_PATTERN", getenv("FILL_PATTERN") ? "(specified)" : "(unset)");
  if (fillPattern.size())
    printf("Pattern: %s", getenv("FILL_PATTERN"));
  else
    printf("Pseudo-random: (Element i = i modulo 383 + 31)");
  printf("\n");
  // A negative NUM_ITERATIONS is reported as a duration in seconds
  printf("%-20s = %12d : Running %d %s per test\n", "NUM_ITERATIONS", numIterations,
         numIterations > 0 ? numIterations : -numIterations,
         numIterations > 0 ? "timed iteration(s)" : "second(s)");
  printf("%-20s = %12d : Running %d warmup iteration(s) per topology\n", "NUM_WARMUPS", numWarmups, numWarmups);
  printf("%-20s = %12d : Output to %s\n", "OUTPUT_TO_CSV", outputToCsv,
         outputToCsv ? "CSV" : "console");
  printf("%-20s = %12s : Using %d shared mem per threadblock\n", "SHARED_MEM_BYTES",
         getenv("SHARED_MEM_BYTES") ? "(specified)" : "(unset)", sharedMemBytes);
  printf("%-20s = %12d : Using %s for GPU-executed copies\n", "USE_HIP_CALL", useHipCall,
         useHipCall ? "HIP functions" : "custom kernels");
  if (useHipCall && !useMemset)
  {
    // HSA_ENABLE_SDMA is frequently unset; never hand a NULL pointer to "%s"
    char const* env = getenv("HSA_ENABLE_SDMA");
    printf("%-20s = %12s : %s\n", "HSA_ENABLE_SDMA", env ? env : "(unset)",
           (env && !strcmp(env, "0")) ? "Using blit kernels for hipMemcpy" : "Using DMA copy engines");
  }
  printf("%-20s = %12d : Using %s-based GPU indexing\n", "USE_PCIE_INDEX",
         usePcieIndexing, (usePcieIndexing ? "PCIe" : "HIP"));
  printf("%-20s = %12d : Using single stream per %s\n", "USE_SINGLE_STREAM",
         useSingleStream, (useSingleStream ? "device" : "Transfer"));
  printf("\n");
}
// Helper function that parses an environment variable or falls back to a default value
static int GetEnvVar(std::string const varname, int defaultValue)
// Integer overload: returns atoi() of the environment variable `varname`
// when it is set, otherwise returns `defaultValue`.
static int GetEnvVar(std::string const& varname, int defaultValue)
{
  char const* rawValue = getenv(varname.c_str());
  if (rawValue == NULL)
    return defaultValue;
  return atoi(rawValue);
}
// String overload: returns the text of the environment variable `varname`
// when it is set, otherwise returns a copy of `defaultValue`.
static std::string GetEnvVar(std::string const& varname, std::string const& defaultValue)
{
  char const* rawValue = getenv(varname.c_str());
  if (rawValue == NULL)
    return defaultValue;
  return std::string(rawValue);
}
};
#endif
......@@ -12,3 +12,17 @@ TransferBench is a simple utility capable of benchmarking simultaneous copies be
* `make`
If ROCm is installed in a folder other than `/opt/rocm/`, set ROCM_PATH appropriately
## Hints and suggestions
- Running TransferBench with no arguments will display usage instructions and detected topology information
- There are several preset configurations that can be used instead of a configuration file
including:
- p2p - Peer to peer benchmark test
- sweep - Sweep across possible sets of Transfers
- rsweep - Random sweep across possible sets of Transfers
- When using the same GPU executor in multiple simultaneous Transfers, performance may be
serialized due to the maximum number of hardware queues available.
- The number of maximum hardware queues can be adjusted via GPU_MAX_HW_QUEUES
- Alternatively, running in single stream mode (USE_SINGLE_STREAM=1) may avoid this issue
by launching all Transfers on a single stream instead of individual streams
......@@ -24,6 +24,7 @@ THE SOFTWARE.
// on the same node
#include <numa.h>
#include <numaif.h>
#include <random>
#include <stack>
#include <thread>
......@@ -33,6 +34,13 @@ THE SOFTWARE.
int main(int argc, char **argv)
{
// Check for NUMA library support
if (numa_available() == -1)
{
printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n");
exit(1);
}
// Display usage instructions and detected topology
if (argc <= 1)
{
......@@ -63,13 +71,15 @@ int main(int argc, char **argv)
}
PopulateTestSizes(numBytesPerTransfer, ev.samplingFactor, valuesOfN);
// Find the largest N to be used - memory will only be allocated once per set of simulatenous Transfers
size_t maxN = valuesOfN[0];
for (auto N : valuesOfN)
maxN = std::max(maxN, N);
// Execute only peer to peer benchmark mode, similar to rocm-bandwidth-test
if (!strcmp(argv[1], "p2p") || !strcmp(argv[1], "p2p_rr") ||
// Check for preset tests
// - Tests that sweep across possible sets of Transfers
if (!strcmp(argv[1], "sweep") || !strcmp(argv[1], "rsweep"))
{
RunSweepPreset(ev, numBytesPerTransfer, !strcmp(argv[1], "rsweep"));
exit(0);
}
// - Tests that benchmark peer-to-peer performance
else if (!strcmp(argv[1], "p2p") || !strcmp(argv[1], "p2p_rr") ||
!strcmp(argv[1], "g2g") || !strcmp(argv[1], "g2g_rr"))
{
int numBlocksToUse = 0;
......@@ -96,33 +106,14 @@ int main(int argc, char **argv)
exit(1);
}
// Check for NUMA library support
if (numa_available() == -1)
{
printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n");
exit(1);
}
// Print environment variables and CSV header
ev.DisplayEnvVars();
int const initOffset = ev.byteOffset / sizeof(float);
std::stack<std::thread> threads;
// Collect the number of available CPUs/GPUs on this machine
int numGpuDevices;
HIP_CALL(hipGetDeviceCount(&numGpuDevices));
int const numCpuDevices = numa_num_configured_nodes();
// Track unique pair of transfers that get used
std::set<std::pair<int, int>> peerAccessTracker;
// Print CSV header
if (ev.outputToCsv)
{
printf("Test,NumBytes,SrcMem,Executor,DstMem,CUs,BW(GB/s),Time(ms),"
"TransferDesc,SrcAddr,DstAddr,ByteOffset,numWarmups,numIters\n");
}
// Loop over each line in the Transfer configuration file
int testNum = 0;
char line[2048];
while(fgets(line, 2048, fp))
......@@ -130,21 +121,49 @@ int main(int argc, char **argv)
// Check if line is a comment to be echoed to output (starts with ##)
if (!ev.outputToCsv && line[0] == '#' && line[1] == '#') printf("%s", line);
// Parse transfers from configuration file
TransferMap transferMap;
ParseTransfers(line, numCpuDevices, numGpuDevices, transferMap);
if (transferMap.size() == 0) continue;
// Parse set of parallel Transfers to execute
std::vector<Transfer> transfers;
ParseTransfers(line, ev.numCpuDevices, ev.numGpuDevices, transfers);
if (transfers.empty()) continue;
ExecuteTransfers(ev, ++testNum, valuesOfN, transfers);
}
fclose(fp);
return 0;
}
testNum++;
void ExecuteTransfers(EnvVars const& ev,
int testNum,
std::vector<size_t> const& valuesOfN,
std::vector<Transfer>& transfers)
{
int const initOffset = ev.byteOffset / sizeof(float);
// Find the largest N to be used - memory will only be allocated once per set of Transfers
size_t maxN = valuesOfN[0];
for (auto N : valuesOfN)
maxN = std::max(maxN, N);
// Prepare (maximum) memory for each transfer
// Map transfers by executor
TransferMap transferMap;
for (Transfer const& transfer : transfers)
{
Executor executor(transfer.exeMemType, transfer.exeIndex);
ExecutorInfo& executorInfo = transferMap[executor];
executorInfo.transfers.push_back(transfer);
}
// Loop over each executor and prepare GPU resources
std::vector<Transfer*> transferList;
for (auto& exeInfoPair : transferMap)
{
Executor const& executor = exeInfoPair.first;
ExecutorInfo& exeInfo = exeInfoPair.second;
exeInfo.totalTime = 0.0;
exeInfo.totalBlocks = 0;
// Loop over each transfer this executor is involved in
for (Transfer& transfer : exeInfo.transfers)
{
// Get some aliases to transfer variables
......@@ -163,40 +182,29 @@ int main(int argc, char **argv)
{
// Ensure executing GPU can access source memory
if ((srcMemType == MEM_GPU || srcMemType == MEM_GPU_FINE) && srcIndex != exeIndex)
{
auto exeSrcPair = std::make_pair(exeIndex, srcIndex);
if (!peerAccessTracker.count(exeSrcPair))
{
EnablePeerAccess(exeIndex, srcIndex);
peerAccessTracker.insert(exeSrcPair);
}
}
// Ensure executing GPU can access destination memory
if ((dstMemType == MEM_GPU || dstMemType == MEM_GPU_FINE) && dstIndex != exeIndex)
{
auto exeDstPair = std::make_pair(exeIndex, dstIndex);
if (!peerAccessTracker.count(exeDstPair))
{
EnablePeerAccess(exeIndex, dstIndex);
peerAccessTracker.insert(exeDstPair);
}
}
}
// Allocate (maximum) source / destination memory based on type / device index
AllocateMemory(srcMemType, srcIndex, maxN * sizeof(float) + ev.byteOffset, (void**)&transfer.srcMem);
AllocateMemory(dstMemType, dstIndex, maxN * sizeof(float) + ev.byteOffset, (void**)&transfer.dstMem);
transfer.blockParam.resize(exeMemType == MEM_CPU ? ev.numCpuPerTransfer : blocksToUse);
exeInfo.totalBlocks += transfer.blockParam.size();
transferList.push_back(&transfer);
}
// Prepare GPU resources for GPU executors
MemType const exeMemType = exeInfoPair.first.first;
int const exeIndex = RemappedIndex(exeInfoPair.first.second, exeMemType);
// Prepare per-threadblock parameters for GPU executors
MemType const exeMemType = executor.first;
int const exeIndex = RemappedIndex(executor.second, exeMemType);
if (exeMemType == MEM_GPU)
{
// Allocate one contiguous chunk of GPU memory for threadblock parameters
// This allows support for executing one transfer per stream, or all transfers in a single stream
AllocateMemory(exeMemType, exeIndex, exeInfo.totalBlocks * sizeof(BlockParam),
(void**)&exeInfo.blockParamGpu);
......@@ -212,6 +220,7 @@ int main(int argc, char **argv)
HIP_CALL(hipEventCreate(&exeInfo.stopEvents[i]));
}
// Assign each transfer its portion of threadblock parameters
int transferOffset = 0;
for (int i = 0; i < exeInfo.transfers.size(); i++)
{
......@@ -232,9 +241,9 @@ int main(int argc, char **argv)
ExecutorInfo& exeInfo = exeInfoPair.second;
int transferOffset = 0;
for (int i = 0; i < exeInfo.transfers.size(); ++i)
{
// Prepare subarrays each threadblock works on and fill src memory with patterned data
Transfer& transfer = exeInfo.transfers[i];
transfer.PrepareBlockParams(ev, N);
......@@ -253,6 +262,7 @@ int main(int argc, char **argv)
// Launch kernels (warmup iterations are not counted)
double totalCpuTime = 0;
size_t numTimedIterations = 0;
std::stack<std::thread> threads;
for (int iteration = -ev.numWarmups; ; iteration++)
{
if (ev.numIterations > 0 && iteration >= ev.numIterations) break;
......@@ -273,7 +283,8 @@ int main(int argc, char **argv)
for (auto& exeInfoPair : transferMap)
{
ExecutorInfo& exeInfo = exeInfoPair.second;
int const numTransfersToRun = ev.useSingleStream ? 1 : exeInfo.transfers.size();
int const numTransfersToRun = (IsGpuType(exeInfoPair.first.first) && ev.useSingleStream) ?
1 : exeInfo.transfers.size();
for (int i = 0; i < numTransfersToRun; ++i)
threads.push(std::thread(RunTransfer, std::ref(ev), N, iteration, std::ref(exeInfo), i));
}
......@@ -319,23 +330,36 @@ int main(int argc, char **argv)
{
for (auto& exeInfoPair : transferMap)
{
ExecutorInfo const& exeInfo = exeInfoPair.second;
ExecutorInfo exeInfo = exeInfoPair.second;
MemType const exeMemType = exeInfoPair.first.first;
int const exeIndex = exeInfoPair.first.second;
// Compute total time for CPU executors
if (!IsGpuType(exeMemType))
{
exeInfo.totalTime = 0;
for (auto const& transfer : exeInfo.transfers)
exeInfo.totalTime = std::max(exeInfo.totalTime, transfer.transferTime);
}
double exeDurationMsec = exeInfo.totalTime / (1.0 * numTimedIterations);
double exeBandwidthGbs = (exeInfo.transfers.size() * N * sizeof(float) / 1.0E9) / exeDurationMsec * 1000.0f;
double exeBandwidthGbs = (exeInfo.transfers.size() * N * sizeof(float) / 1.0E9) /
exeDurationMsec * 1000.0f;
maxGpuTime = std::max(maxGpuTime, exeDurationMsec);
if (!ev.outputToCsv)
{
printf(" Executor: %cPU %02d (# Transfers %02lu)| %9.3f GB/s | %8.3f ms |\n",
MemTypeStr[exeMemType], exeIndex, exeInfo.transfers.size(), exeBandwidthGbs, exeDurationMsec);
for (auto transfer : exeInfo.transfers)
}
for (auto const& transfer : exeInfo.transfers)
{
double transferDurationMsec = transfer.transferTime / (1.0 * numTimedIterations);
double transferBandwidthGbs = (N * sizeof(float) / 1.0E9) / transferDurationMsec * 1000.0f;
if (!ev.outputToCsv)
{
printf(" Transfer %02d | %9.3f GB/s | %8.3f ms | %c%02d -> %c%02d:(%03d) -> %c%02d\n",
transfer.transferIndex,
transferBandwidthGbs,
......@@ -345,8 +369,23 @@ int main(int argc, char **argv)
transfer.exeMemType == MEM_CPU ? ev.numCpuPerTransfer : transfer.numBlocksToUse,
MemTypeStr[transfer.dstMemType], transfer.dstIndex);
}
}
else
{
printf("%d,%lu,%c%02d,%c%02d,%c%02d,%d,%.3f,%.3f,%s,%p,%p,%d,%d,%lu\n",
testNum, N * sizeof(float),
MemTypeStr[transfer.srcMemType], transfer.srcIndex,
MemTypeStr[transfer.exeMemType], transfer.exeIndex,
MemTypeStr[transfer.dstMemType], transfer.dstIndex,
transfer.exeMemType == MEM_CPU ? ev.numCpuPerTransfer : transfer.numBlocksToUse,
transferBandwidthGbs, transferDurationMsec,
GetTransferDesc(transfer).c_str(),
transfer.srcMem + initOffset, transfer.dstMem + initOffset,
ev.byteOffset,
ev.numWarmups, numTimedIterations);
}
}
if (ev.outputToCsv)
{
printf("%d,%lu,ALL,%c%02d,ALL,ALL,%.3f,%.3f,ALL,ALL,ALL,%d,%d,%lu\n",
testNum, N * sizeof(float),
......@@ -359,7 +398,7 @@ int main(int argc, char **argv)
}
else
{
for (auto transfer : transferList)
for (auto const& transfer : transferList)
{
double transferDurationMsec = transfer->transferTime / (1.0 * numTimedIterations);
double transferBandwidthGbs = (N * sizeof(float) / 1.0E9) / transferDurationMsec * 1000.0f;
......@@ -395,8 +434,8 @@ int main(int argc, char **argv)
// Display aggregate statistics
if (!ev.outputToCsv)
{
printf(" Aggregate Bandwidth (CPU timed) | %9.3f GB/s | %8.3f ms | Overhead: %.3f ms\n", totalBandwidthGbs, totalCpuTime,
totalCpuTime - maxGpuTime);
printf(" Aggregate Bandwidth (CPU timed) | %9.3f GB/s | %8.3f ms | Overhead: %.3f ms\n",
totalBandwidthGbs, totalCpuTime, totalCpuTime - maxGpuTime);
}
else
{
......@@ -437,10 +476,6 @@ int main(int argc, char **argv)
}
}
}
}
fclose(fp);
return 0;
}
void DisplayUsage(char const* cmdName)
......@@ -461,10 +496,10 @@ void DisplayUsage(char const* cmdName)
printf(" config: Either:\n");
printf(" - Filename of configFile containing Transfers to execute (see example.cfg for format)\n");
printf(" - Name of preset benchmark:\n");
printf(" p2p - All CPU/GPU pairs benchmark\n");
printf(" p2p_rr - All CPU/GPU pairs benchmark with remote reads\n");
printf(" g2g - All GPU/GPU pairs benchmark\n");
printf(" g2g_rr - All GPU/GPU pairs benchmark with remote reads\n");
printf(" p2p{_rr} - All CPU/GPU pairs benchmark {with remote reads}\n");
printf(" g2g{_rr} - All GPU/GPU pairs benchmark {with remote reads}\n");
printf(" sweep - Sweep across possible sets of Transfers\n");
printf(" rsweep - Randomly sweep across possible sets of Transfers\n");
printf(" - 3rd optional argument will be used as # of CUs to use (uses all by default)\n");
printf(" N : (Optional) Number of bytes to copy per Transfer.\n");
printf(" If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes\n",
......@@ -649,15 +684,15 @@ void ParseMemType(std::string const& token, int const numCpus, int const numGpus
}
// Helper function to parse a list of Transfer definitions
void ParseTransfers(char* line, int numCpus, int numGpus, TransferMap& transferMap)
void ParseTransfers(char* line, int numCpus, int numGpus, std::vector<Transfer>& transfers)
{
// Replace any round brackets or '->' with spaces,
for (int i = 1; line[i]; i++)
if (line[i] == '(' || line[i] == ')' || line[i] == '-' || line[i] == '>' ) line[i] = ' ';
transferMap.clear();
int numTransfers = 0;
transfers.clear();
int numTransfers = 0;
std::istringstream iss(line);
iss >> numTransfers;
if (iss.fail()) return;
......@@ -665,75 +700,43 @@ void ParseTransfers(char* line, int numCpus, int numGpus, TransferMap& transferM
std::string exeMem;
std::string srcMem;
std::string dstMem;
if (numTransfers > 0)
{
// Method 1: Take in triples (srcMem, exeMem, dstMem)
// If numTransfers < 0, read quads (srcMem, exeMem, dstMem, #CUs)
// otherwise read triples (srcMem, exeMem, dstMem)
bool const perTransferCUs = (numTransfers < 0);
numTransfers = abs(numTransfers);
int numBlocksToUse;
if (!perTransferCUs)
{
iss >> numBlocksToUse;
if (numBlocksToUse <= 0 || iss.fail())
{
printf("Parsing error: Number of blocks to use (%d) must be greater than 0\n", numBlocksToUse);
exit(1);
}
for (int i = 0; i < numTransfers; i++)
{
Transfer transfer;
transfer.transferIndex = i;
iss >> srcMem >> exeMem >> dstMem;
if (iss.fail())
{
printf("Parsing error: Unable to read valid Transfer triplet (possibly missing a SRC or EXE or DST)\n");
exit(1);
}
ParseMemType(srcMem, numCpus, numGpus, &transfer.srcMemType, &transfer.srcIndex);
ParseMemType(exeMem, numCpus, numGpus, &transfer.exeMemType, &transfer.exeIndex);
ParseMemType(dstMem, numCpus, numGpus, &transfer.dstMemType, &transfer.dstIndex);
transfer.numBlocksToUse = numBlocksToUse;
// Ensure executor is either CPU or GPU
if (transfer.exeMemType != MEM_CPU && transfer.exeMemType != MEM_GPU)
{
printf("[ERROR] Executor must either be CPU ('C') or GPU ('G'), (from (%s->%s->%s %d))\n",
srcMem.c_str(), exeMem.c_str(), dstMem.c_str(), transfer.numBlocksToUse);
exit(1);
}
Executor executor(transfer.exeMemType, transfer.exeIndex);
ExecutorInfo& executorInfo = transferMap[executor];
executorInfo.totalBlocks += transfer.numBlocksToUse;
executorInfo.transfers.push_back(transfer);
}
}
else
{
// Method 2: Read in quads (srcMem, exeMem, dstMem, Read common # blocks to use, then read (src, dst) doubles
numTransfers *= -1;
for (int i = 0; i < numTransfers; i++)
{
Transfer transfer;
transfer.transferIndex = i;
iss >> srcMem >> exeMem >> dstMem >> transfer.numBlocksToUse;
iss >> srcMem >> exeMem >> dstMem;
if (perTransferCUs) iss >> numBlocksToUse;
if (iss.fail())
{
if (perTransferCUs)
printf("Parsing error: Unable to read valid Transfer quadruple (possibly missing a SRC or EXE or DST or #CU)\n");
else
printf("Parsing error: Unable to read valid Transfer triplet (possibly missing a SRC or EXE or DST)\n");
exit(1);
}
ParseMemType(srcMem, numCpus, numGpus, &transfer.srcMemType, &transfer.srcIndex);
ParseMemType(exeMem, numCpus, numGpus, &transfer.exeMemType, &transfer.exeIndex);
ParseMemType(dstMem, numCpus, numGpus, &transfer.dstMemType, &transfer.dstIndex);
if (transfer.exeMemType != MEM_CPU && transfer.exeMemType != MEM_GPU)
{
printf("[ERROR] Executor must either be CPU ('C') or GPU ('G'), (from (%s->%s->%s %d))\n"
, srcMem.c_str(), exeMem.c_str(), dstMem.c_str(), transfer.numBlocksToUse);
exit(1);
}
Executor executor(transfer.exeMemType, transfer.exeIndex);
ExecutorInfo& executorInfo = transferMap[executor];
executorInfo.totalBlocks += transfer.numBlocksToUse;
executorInfo.transfers.push_back(transfer);
}
transfer.numBlocksToUse = numBlocksToUse;
transfers.push_back(transfer);
}
}
......@@ -747,7 +750,13 @@ void EnablePeerAccess(int const deviceId, int const peerDeviceId)
exit(1);
}
HIP_CALL(hipSetDevice(deviceId));
HIP_CALL(hipDeviceEnablePeerAccess(peerDeviceId, 0));
hipError_t error = hipDeviceEnablePeerAccess(peerDeviceId, 0);
if (error != hipSuccess && error != hipErrorPeerAccessAlreadyEnabled)
{
printf("[ERROR] Unable to enable peer to peer access from %d to %d (%s)\n",
deviceId, peerDeviceId, hipGetErrorString(error));
exit(1);
}
}
void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr)
......@@ -982,7 +991,8 @@ std::string GetTransferDesc(Transfer const& transfer)
+ GetDesc(transfer.exeMemType, transfer.exeIndex, transfer.dstMemType, transfer.dstIndex);
}
void RunTransfer(EnvVars const& ev, size_t const N, int const iteration, ExecutorInfo& exeInfo, int const transferIdx)
void RunTransfer(EnvVars const& ev, size_t const N, int const iteration,
ExecutorInfo& exeInfo, int const transferIdx)
{
Transfer& transfer = exeInfo.transfers[transferIdx];
......@@ -1348,3 +1358,151 @@ int GetWallClockRate(int deviceId)
}
return wallClockPerDeviceMhz[deviceId];
}
// Runs the "sweep" / "rsweep" presets.
//
// Builds the list of permitted (SRC, EXE, DST) triplets from the SWEEP_* settings in ev,
// then repeatedly selects M of them and runs them as one test via ExecuteTransfers.
// - Ordered sweep (isRandom == false): visits every M-sized selection for
//   M = SWEEP_MIN .. max, where max is SWEEP_MAX clamped to the number of triplets
// - Random sweep (isRandom == true): each test picks M uniformly in [SWEEP_MIN, max]
//   and then a random selection of M triplets
// The sweep stops when the test limit or time limit is hit, or (ordered sweep only)
// when all selections have been visited.
//
// ev                  - Environment variable settings
// numBytesPerTransfer - Number of bytes each Transfer copies
// isRandom            - Selects random sweep instead of ordered sweep
void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, bool const isRandom)
{
  ev.DisplaySweepEnvVars();

  // A sweep only ever tests a single Transfer size
  std::vector<size_t> valuesOfN(1, numBytesPerTransfer / sizeof(float));

  // Compute how many possible Transfers are permitted (unique SRC/EXE/DST triplets)
  bool hasCpuExecutor = false;
  bool hasGpuExecutor = false;
  std::vector<std::pair<MemType, int>> exeList;
  for (auto exe : ev.sweepExe)
  {
    MemType const exeMemType = CharToMemType(exe);
    int numDevices;
    if (IsGpuType(exeMemType))
    {
      numDevices = ev.numGpuDevices;
      hasGpuExecutor = true;
    }
    else
    {
      numDevices = ev.numCpuDevices;
      hasCpuExecutor = true;
    }
    for (int exeIndex = 0; exeIndex < numDevices; ++exeIndex)
      exeList.push_back(std::make_pair(exeMemType, exeIndex));
  }
  // When the executor must be the source, the executor adds no extra degree of freedom
  int numExes = ev.sweepSrcIsExe ? 1 : exeList.size();

  std::vector<std::pair<MemType, int>> srcList;
  for (auto src : ev.sweepSrc)
  {
    MemType const srcMemType = CharToMemType(src);
    int const numDevices = IsGpuType(srcMemType) ? ev.numGpuDevices : ev.numCpuDevices;

    // Skip source memory type if executor is supposed to be source but not specified
    // (this restriction only applies when SWEEP_SRC_IS_EXE is set)
    if (ev.sweepSrcIsExe &&
        ((IsGpuType(srcMemType)  && !hasGpuExecutor) ||
         (!IsGpuType(srcMemType) && !hasCpuExecutor))) continue;

    for (int srcIndex = 0; srcIndex < numDevices; ++srcIndex)
      srcList.push_back(std::make_pair(srcMemType, srcIndex));
  }
  int numSrcs = srcList.size();

  std::vector<std::pair<MemType, int>> dstList;
  for (auto dst : ev.sweepDst)
  {
    MemType const dstMemType = CharToMemType(dst);
    int const numDevices = IsGpuType(dstMemType) ? ev.numGpuDevices : ev.numCpuDevices;
    for (int dstIndex = 0; dstIndex < numDevices; ++dstIndex)
      dstList.push_back(std::make_pair(dstMemType, dstIndex));
  }
  int numDsts = dstList.size();

  int const numPossible = numSrcs * numExes * numDsts;

  // Never ask for more simultaneous Transfers than there are unique triplets, otherwise
  // the ordered sweep would repeat the "all triplets" test once per extra value of M
  int const maxParallelTransfers = (ev.sweepMax == 0 ? numPossible : std::min(ev.sweepMax, numPossible));

  if (ev.sweepSrcIsExe)
  {
    printf("Num possible (SRC/DST) triplets: (%d/%d) = %d\n", numSrcs, numDsts, numPossible);
  }
  else
  {
    printf("Num possible (SRC/EXE/DST) triplets: (%d/%d/%d) = %d\n", numSrcs, numExes, numDsts, numPossible);
  }

  // A test needs at least one Transfer (a negative count would also corrupt the bitmask size)
  if (ev.sweepMin < 1)
  {
    printf("[ERROR] SWEEP_MIN (%d) must be at least 1\n", ev.sweepMin);
    exit(1);
  }

  // Covers both SWEEP_MIN > number of triplets and SWEEP_MIN > SWEEP_MAX
  if (ev.sweepMin > maxParallelTransfers)
  {
    printf("No valid test configurations exist\n");
    return;
  }

  int numTestsRun = 0;
  int M = ev.sweepMin;

  // Create bitmask of numPossible triplets, of which M will be chosen
  std::string bitmask(M, 1); bitmask.resize(numPossible, 0);
  auto rng = std::default_random_engine {};
  // Inclusive range, so that SWEEP_MAX itself can be selected during random sweeps
  std::uniform_int_distribution<int> pickNumTransfers(ev.sweepMin, maxParallelTransfers);
  auto cpuStart = std::chrono::high_resolution_clock::now();

  while (1)
  {
    if (isRandom)
    {
      // Pick random number of simultaneous transfers to execute
      // NOTE: This currently skews distribution due to some #s having more possibilities than others
      M = pickNumTransfers(rng);

      // Generate a random bitmask
      for (int i = 0; i < numPossible; i++)
        bitmask[i] = (i < M) ? 1 : 0;
      std::shuffle(bitmask.begin(), bitmask.end(), rng);
    }

    // Convert bitmask to list of Transfers
    std::vector<Transfer> transfers;
    for (int value = 0; value < numPossible; ++value)
    {
      if (!bitmask[value]) continue;

      // Convert integer value to (SRC->EXE->DST) triplet
      Transfer transfer;
      int srcValue = value / numDsts / numExes;
      int exeValue = value / numDsts % numExes;
      int dstValue = value % numDsts;

      transfer.srcMemType = srcList[srcValue].first;
      transfer.srcIndex   = srcList[srcValue].second;
      if (ev.sweepSrcIsExe)
      {
        // Executors are only ever 'C' or 'G', so a fine-grained source ('B' / 'F')
        // is executed by the plain CPU / GPU device of the same index
        transfer.exeMemType = IsGpuType(transfer.srcMemType) ? CharToMemType('G') : CharToMemType('C');
        transfer.exeIndex   = transfer.srcIndex;
      }
      else
      {
        transfer.exeMemType = exeList[exeValue].first;
        transfer.exeIndex   = exeList[exeValue].second;
      }
      transfer.dstMemType     = dstList[dstValue].first;
      transfer.dstIndex       = dstList[dstValue].second;
      transfer.numBlocksToUse = IsGpuType(transfer.exeMemType) ? 4 : ev.numCpuPerTransfer;
      transfer.transferIndex  = transfers.size();
      transfers.push_back(transfer);
    }

    ExecuteTransfers(ev, ++numTestsRun, valuesOfN, transfers);

    // Check for test limit (a limit of 0 never matches, i.e. no limit)
    if (numTestsRun == ev.sweepTestLimit)
    {
      printf("Test limit reached\n");
      break;
    }

    // Check for time limit
    auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
    double totalCpuTime = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count();
    if (ev.sweepTimeLimit && totalCpuTime > ev.sweepTimeLimit)
    {
      printf("Time limit exceeded\n");
      break;
    }

    // Increment bitmask if not random sweep
    // (prev_permutation returns false once every M-sized selection has been visited)
    if (!isRandom && !std::prev_permutation(bitmask.begin(), bitmask.end()))
    {
      M++;
      // Check for completion
      if (M > maxParallelTransfers)
      {
        printf("Sweep complete\n");
        break;
      }
      for (int i = 0; i < numPossible; i++)
        bitmask[i] = (i < M) ? 1 : 0;
    }
  }
}
......@@ -61,8 +61,27 @@ typedef enum
MEM_GPU_FINE = 3 // Fine-grained global GPU memory
} MemType;
// Returns true if the memory type is a GPU memory type (MEM_GPU or MEM_GPU_FINE).
// Declared inline (like CharToMemType) so that defining it alongside the prototypes
// cannot produce multiple-definition link errors if this file is included more than once.
inline bool IsGpuType(MemType m)
{
  return (m == MEM_GPU || m == MEM_GPU_FINE);
}
// Single-character code for each memory type, indexed by MemType value
// ('C', 'G', 'B', 'F' are the same codes that CharToMemType accepts)
char const MemTypeStr[5] = "CGBF";
// Converts a single-character memory type code into its MemType value:
//   'C' -> MEM_CPU, 'G' -> MEM_GPU, 'B' -> MEM_CPU_FINE, 'F' -> MEM_GPU_FINE
// Any other character prints an error and terminates the program.
inline MemType CharToMemType(char const c)
{
  if (c == 'C') return MEM_CPU;
  if (c == 'G') return MEM_GPU;
  if (c == 'B') return MEM_CPU_FINE;
  if (c == 'F') return MEM_GPU_FINE;

  printf("[ERROR] Unexpected mem type (%c)\n", c);
  exit(1);
}
typedef enum
{
MODE_FILL = 0, // Fill data with pattern
......@@ -141,7 +160,10 @@ void ParseMemType(std::string const& token, int const numCpus, int const numGpus
MemType* memType, int* memIndex);
void ParseTransfers(char* line, int numCpus, int numGpus,
TransferMap& transferMap);
std::vector<Transfer>& transfers);
void ExecuteTransfers(EnvVars const& ev, int testNum, std::vector<size_t> const& valuesOfN,
std::vector<Transfer>& transfers);
void EnablePeerAccess(int const deviceId, int const peerDeviceId);
void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr);
......@@ -150,6 +172,7 @@ void CheckPages(char* byteArray, size_t numBytes, int targetId);
void CheckOrFill(ModeType mode, int N, bool isMemset, bool isHipCall, std::vector<float> const& fillPattern, float* ptr);
void RunTransfer(EnvVars const& ev, size_t const N, int const iteration, ExecutorInfo& exeInfo, int const transferIdx);
void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N, int numBlocksToUse, int readMode, int skipCpu);
void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, bool const isRandom);
// Return the maximum bandwidth measured for given (src/dst) pair
double GetPeakBandwidth(EnvVars const& ev,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment