Commit 93430da1 authored by Gilbert Lee's avatar Gilbert Lee
Browse files

Implementing sweep preset

parent ddb6508f
# Changelog for TransferBench # Changelog for TransferBench
## v1.03
### Added
- New preset modes stress-test benchmarks "sweep" and "randomsweep"
- sweep iterates over all possible sets of Transfers to test
- randomsweep iterates over random sets of Transfers
- New sweep-only environment variables can modify sweep
- SWEEP_SRC - String containing only "B","C","F", or "G", defining possible source memory types
- SWEEP_EXE - String containing only "C", or "G", defining possible executors
- SWEEP_DST - String containing only "B","C","F", or "G", defining possible destination memory types
- SWEEP_SRC_IS_EXE - Restrict executor to be the same as the source if non-zero
- SWEEP_MIN - Minimum number of parallel transfers to test
- SWEEP_MAX - Maximum number of parallel transfers to test
- SWEEP_TEST_LIMIT - Maximum number of tests to run
- SWEEP_TIME_LIMIT - Maximum number of seconds to run tests for
- New environment variable to restrict number of available GPUs to test on (primarily for sweep runs)
- NUM_CPU_DEVICES - Number of CPU devices
- NUM_GPU_DEVICES - Number of GPU devices
### Changed
- Fixed timing display for CPU-executors when using single stream mode
## v1.02 ## v1.02
### Added ### Added
- Setting NUM_ITERATIONS to negative number indicates to run for -NUM_ITERATIONS seconds per Test - Setting NUM_ITERATIONS to negative number indicates to run for -NUM_ITERATIONS seconds per Test
......
...@@ -25,7 +25,9 @@ THE SOFTWARE. ...@@ -25,7 +25,9 @@ THE SOFTWARE.
#include <algorithm> #include <algorithm>
#define TB_VERSION "1.02" #define TB_VERSION "1.03"
extern char const MemTypeStr[];
// This class manages environment variable that affect TransferBench // This class manages environment variable that affect TransferBench
class EnvVars class EnvVars
...@@ -37,10 +39,21 @@ public: ...@@ -37,10 +39,21 @@ public:
int const DEFAULT_SAMPLING_FACTOR = 1; int const DEFAULT_SAMPLING_FACTOR = 1;
int const DEFAULT_NUM_CPU_PER_TRANSFER = 4; int const DEFAULT_NUM_CPU_PER_TRANSFER = 4;
int const DEFAULT_SWEEP_SRC_IS_EXE = 0;
std::string const DEFAULT_SWEEP_SRC = "CG";
std::string const DEFAULT_SWEEP_EXE = "CG";
std::string const DEFAULT_SWEEP_DST = "CG";
int const DEFAULT_SWEEP_MIN = 1;
int const DEFAULT_SWEEP_MAX = 24;
int const DEFAULT_SWEEP_TEST_LIMIT = 0;
int const DEFAULT_SWEEP_TIME_LIMIT = 0;
// Environment variables // Environment variables
int blockBytes; // Each CU, except the last, gets a multiple of this many bytes to copy int blockBytes; // Each CU, except the last, gets a multiple of this many bytes to copy
int byteOffset; // Byte-offset for memory allocations int byteOffset; // Byte-offset for memory allocations
int numCpuDevices; // Number of CPU devices to use (defaults to # NUMA nodes detected)
int numCpuPerTransfer; // Number of CPU child threads to use per CPU Transfer int numCpuPerTransfer; // Number of CPU child threads to use per CPU Transfer
int numGpuDevices; // Number of GPU devices to use (defaults to # HIP devices detected)
int numIterations; // Number of timed iterations to perform. If negative, run for -numIterations seconds instead int numIterations; // Number of timed iterations to perform. If negative, run for -numIterations seconds instead
int numWarmups; // Number of un-timed warmup iterations to perform int numWarmups; // Number of un-timed warmup iterations to perform
int outputToCsv; // Output in CSV format int outputToCsv; // Output in CSV format
...@@ -54,6 +67,16 @@ public: ...@@ -54,6 +67,16 @@ public:
std::vector<float> fillPattern; // Pattern of floats used to fill source data std::vector<float> fillPattern; // Pattern of floats used to fill source data
// Environment variables only for Sweep-preset
int sweepSrcIsExe; // Non-zero if executor should always be the same as source
int sweepMin; // Min number of simultaneous Transfers to be executed per test
int sweepMax; // Max number of simultaneous Transfers to be executed per test
int sweepTestLimit; // Max number of tests to run during sweep (0 = no limit)
int sweepTimeLimit; // Max number of seconds to run sweep for (0 = no limit)
std::string sweepSrc; // Set of src memory types to be swept
std::string sweepExe; // Set of executors to be swept
std::string sweepDst; // Set of dst memory types to be swept
// Constructor that collects values // Constructor that collects values
EnvVars() EnvVars()
{ {
...@@ -61,9 +84,15 @@ public: ...@@ -61,9 +84,15 @@ public:
hipDeviceGetAttribute(&maxSharedMemBytes, hipDeviceGetAttribute(&maxSharedMemBytes,
hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, 0); hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, 0);
int numDetectedCpus = numa_num_configured_nodes();
int numDetectedGpus;
hipGetDeviceCount(&numGpuDevices);
blockBytes = GetEnvVar("BLOCK_BYTES" , 256); blockBytes = GetEnvVar("BLOCK_BYTES" , 256);
byteOffset = GetEnvVar("BYTE_OFFSET" , 0); byteOffset = GetEnvVar("BYTE_OFFSET" , 0);
numCpuDevices = GetEnvVar("NUM_CPU_DEVICES" , numDetectedCpus);
numCpuPerTransfer = GetEnvVar("NUM_CPU_PER_TRANSFER", DEFAULT_NUM_CPU_PER_TRANSFER); numCpuPerTransfer = GetEnvVar("NUM_CPU_PER_TRANSFER", DEFAULT_NUM_CPU_PER_TRANSFER);
numGpuDevices = GetEnvVar("NUM_GPU_DEVICES" , numDetectedGpus);
numIterations = GetEnvVar("NUM_ITERATIONS" , DEFAULT_NUM_ITERATIONS); numIterations = GetEnvVar("NUM_ITERATIONS" , DEFAULT_NUM_ITERATIONS);
numWarmups = GetEnvVar("NUM_WARMUPS" , DEFAULT_NUM_WARMUPS); numWarmups = GetEnvVar("NUM_WARMUPS" , DEFAULT_NUM_WARMUPS);
outputToCsv = GetEnvVar("OUTPUT_TO_CSV" , 0); outputToCsv = GetEnvVar("OUTPUT_TO_CSV" , 0);
...@@ -75,6 +104,15 @@ public: ...@@ -75,6 +104,15 @@ public:
usePcieIndexing = GetEnvVar("USE_PCIE_INDEX" , 0); usePcieIndexing = GetEnvVar("USE_PCIE_INDEX" , 0);
useSingleStream = GetEnvVar("USE_SINGLE_STREAM" , 0); useSingleStream = GetEnvVar("USE_SINGLE_STREAM" , 0);
sweepSrcIsExe = GetEnvVar("SWEEP_SRC_IS_EXE", DEFAULT_SWEEP_SRC_IS_EXE);
sweepMin = GetEnvVar("SWEEP_MIN", DEFAULT_SWEEP_MIN);
sweepMax = GetEnvVar("SWEEP_MAX", DEFAULT_SWEEP_MAX);
sweepSrc = GetEnvVar("SWEEP_SRC", DEFAULT_SWEEP_SRC);
sweepExe = GetEnvVar("SWEEP_EXE", DEFAULT_SWEEP_EXE);
sweepDst = GetEnvVar("SWEEP_DST", DEFAULT_SWEEP_DST);
sweepTestLimit = GetEnvVar("SWEEP_TEST_LIMIT", DEFAULT_SWEEP_TEST_LIMIT);
sweepTimeLimit = GetEnvVar("SWEEP_TIME_LIMIT", DEFAULT_SWEEP_TIME_LIMIT);
// Check for fill pattern // Check for fill pattern
char* pattern = getenv("FILL_PATTERN"); char* pattern = getenv("FILL_PATTERN");
if (pattern != NULL) if (pattern != NULL)
...@@ -134,6 +172,16 @@ public: ...@@ -134,6 +172,16 @@ public:
else fillPattern.clear(); else fillPattern.clear();
// Perform some basic validation // Perform some basic validation
if (numCpuDevices > numDetectedCpus)
{
printf("[ERROR] Number of CPUs to use (%d) cannot exceed number of detected CPUs (%d)\n", numCpuDevices, numDetectedCpus);
exit(1);
}
if (numGpuDevices > numDetectedGpus)
{
printf("[ERROR] Number of GPUs to use (%d) cannot exceed number of detected GPUs (%d)\n", numGpuDevices, numDetectedGpus);
exit(1);
}
if (byteOffset % sizeof(float)) if (byteOffset % sizeof(float))
{ {
printf("[ERROR] BYTE_OFFSET must be set to multiple of %lu\n", sizeof(float)); printf("[ERROR] BYTE_OFFSET must be set to multiple of %lu\n", sizeof(float));
...@@ -169,6 +217,49 @@ public: ...@@ -169,6 +217,49 @@ public:
printf("[ERROR] Single stream mode cannot be used with HIP calls\n"); printf("[ERROR] Single stream mode cannot be used with HIP calls\n");
exit(1); exit(1);
} }
for (auto ch : sweepSrc)
{
if (!strchr(MemTypeStr, ch))
{
printf("[ERROR] Unrecognized memory type '%c' specified for sweep source\n", ch);
exit(1);
}
if (strchr(sweepSrc.c_str(), ch) != strrchr(sweepSrc.c_str(), ch))
{
printf("[ERROR] Duplicate memory type '%c' specified for sweep source\n", ch);
exit(1);
}
}
for (auto ch : sweepDst)
{
if (!strchr(MemTypeStr, ch))
{
printf("[ERROR] Unrecognized memory type '%c' specified for sweep destination\n", ch);
exit(1);
}
if (strchr(sweepDst.c_str(), ch) != strrchr(sweepDst.c_str(), ch))
{
printf("[ERROR] Duplicate memory type '%c' specified for sweep destination\n", ch);
exit(1);
}
}
char const* permittedExecutors = "CG";
for (auto ch : sweepExe)
{
if (!strchr(permittedExecutors, ch))
{
printf("[ERROR] Unrecognized executor type '%c' specified for sweep executor\n", ch);
exit(1);
}
if (strchr(sweepExe.c_str(), ch) != strrchr(sweepExe.c_str(), ch))
{
printf("[ERROR] Duplicate executor type '%c' specified for sweep executor\n", ch);
exit(1);
}
}
} }
// Display info on the env vars that can be used // Display info on the env vars that can be used
...@@ -176,20 +267,22 @@ public: ...@@ -176,20 +267,22 @@ public:
{ {
printf("Environment variables:\n"); printf("Environment variables:\n");
printf("======================\n"); printf("======================\n");
printf(" BLOCK_BYTES=B - Each CU (except the last) receives a multiple of BLOCK_BYTES to copy\n"); printf(" BLOCK_BYTES=B - Each CU (except the last) receives a multiple of BLOCK_BYTES to copy\n");
printf(" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4. Defaults to 0\n"); printf(" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4. Defaults to 0\n");
printf(" FILL_PATTERN=STR - Fill input buffer with pattern specified in hex digits (0-9,a-f,A-F). Must be even number of digits, (byte-level big-endian)\n"); printf(" FILL_PATTERN=STR - Fill input buffer with pattern specified in hex digits (0-9,a-f,A-F). Must be even number of digits, (byte-level big-endian)\n");
printf(" NUM_CPU_DEVICES=X - Restrict number of CPUs to X. May not be greater than # detected NUMA nodes\n");
printf(" NUM_CPU_PER_TRANSFER=C - Use C threads per Transfer for CPU-executed copies\n"); printf(" NUM_CPU_PER_TRANSFER=C - Use C threads per Transfer for CPU-executed copies\n");
printf(" NUM_ITERATIONS=I - Perform I timed iteration(s) per test\n"); printf(" NUM_GPU_DEVICES=X - Restrict number of GPUs to X. May not be greater than # detected HIP devices\n");
printf(" NUM_WARMUPS=W - Perform W untimed warmup iteration(s) per test\n"); printf(" NUM_ITERATIONS=I - Perform I timed iteration(s) per test\n");
printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n"); printf(" NUM_WARMUPS=W - Perform W untimed warmup iteration(s) per test\n");
printf(" SAMPLING_FACTOR=F - Add F samples (when possible) between powers of 2 when auto-generating data sizes\n"); printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n");
printf(" SHARED_MEM_BYTES=X - Use X shared mem bytes per threadblock, potentially to avoid multiple threadblocks per CU\n"); printf(" SAMPLING_FACTOR=F - Add F samples (when possible) between powers of 2 when auto-generating data sizes\n");
printf(" USE_HIP_CALL - Use hipMemcpy/hipMemset instead of custom shader kernels for GPU-executed copies\n"); printf(" SHARED_MEM_BYTES=X - Use X shared mem bytes per threadblock, potentially to avoid multiple threadblocks per CU\n");
printf(" USE_INTERACTIVE - Pause for user-input before starting transfer loop\n"); printf(" USE_HIP_CALL - Use hipMemcpy/hipMemset instead of custom shader kernels for GPU-executed copies\n");
printf(" USE_MEMSET - Perform a memset instead of a copy (ignores source memory)\n"); printf(" USE_INTERACTIVE - Pause for user-input before starting transfer loop\n");
printf(" USE_PCIE_INDEX - Index GPUs by PCIe address-ordering instead of HIP-provided indexing\n"); printf(" USE_MEMSET - Perform a memset instead of a copy (ignores source memory)\n");
printf(" USE_SINGLE_STREAM - Use single stream per device instead of per Transfer. Cannot be used with USE_HIP_CALL\n"); printf(" USE_PCIE_INDEX - Index GPUs by PCIe address-ordering instead of HIP-provided indexing\n");
printf(" USE_SINGLE_STREAM - Use single stream per device instead of per Transfer. Cannot be used with USE_HIP_CALL\n");
} }
// Display env var settings // Display env var settings
...@@ -207,8 +300,10 @@ public: ...@@ -207,8 +300,10 @@ public:
else else
printf("Pseudo-random: (Element i = i modulo 383 + 31)"); printf("Pseudo-random: (Element i = i modulo 383 + 31)");
printf("\n"); printf("\n");
printf("%-20s = %12d : Using %d CPU thread(s) per CPU-based-copy Transfer\n", "NUM_CPU_PER_TRANSFER", numCpuPerTransfer, numCpuPerTransfer); printf("%-20s = %12d : Using %d CPU devices\n" , "NUM_CPU_DEVICES", numCpuDevices, numCpuDevices);
printf("%-20s = %12d : Running %d %s per topology\n", "NUM_ITERATIONS", numIterations, printf("%-20s = %12d : Using %d CPU thread(s) per CPU-executed Transfer\n", "NUM_CPU_PER_TRANSFER", numCpuPerTransfer, numCpuPerTransfer);
printf("%-20s = %12d : Using %d GPU devices\n", "NUM_GPU_DEVICES", numGpuDevices, numGpuDevices);
printf("%-20s = %12d : Running %d %s per test\n", "NUM_ITERATIONS", numIterations,
numIterations > 0 ? numIterations : -numIterations, numIterations > 0 ? numIterations : -numIterations,
numIterations > 0 ? "timed iteration(s)" : "second(s)"); numIterations > 0 ? "timed iteration(s)" : "second(s)");
printf("%-20s = %12d : Running %d warmup iteration(s) per topology\n", "NUM_WARMUPS", numWarmups, numWarmups); printf("%-20s = %12d : Running %d warmup iteration(s) per topology\n", "NUM_WARMUPS", numWarmups, numWarmups);
...@@ -236,13 +331,70 @@ public: ...@@ -236,13 +331,70 @@ public:
} }
}; };
// Display env var settings
void DisplaySweepEnvVars() const
{
if (!outputToCsv)
{
printf("Sweep configuration (TransferBench v%s)\n", TB_VERSION);
printf("=====================================================\n");
printf("%-20s = %12s : Source Memory Types to sweep\n", "SWEEP_SRC", sweepSrc.c_str());
printf("%-20s = %12s : Executor Types to sweep\n", "SWEEP_EXE", sweepExe.c_str());
printf("%-20s = %12s : Destination Memory Types to sweep\n", "SWEEP_DST", sweepDst.c_str());
printf("%-20s = %12d : Transfer executor %s Transfer source\n", "SWEEP_SRC_IS_EXE", sweepSrcIsExe, sweepSrcIsExe ? "must match" : "may have any");
printf("%-20s = %12d : Min simultaneous Transfers\n", "SWEEP_MIN", sweepMin);
printf("%-20s = %12d : Max simultaneous Transfers (0 = no limit)\n", "SWEEP_MAX", sweepMax);
printf("%-20s = %12d : Max number of tests to run during sweep (0 = no limit)\n", "SWEEP_TEST_LIMIT", sweepTestLimit);
printf("%-20s = %12d : Max number of seconds to run sweep for (0 = no limit)\n", "SWEEP_TIME_LIMIT", sweepTimeLimit);
printf("%-20s = %12d : Using %d CPU devices\n" , "NUM_CPU_DEVICES", numCpuDevices, numCpuDevices);
printf("%-20s = %12d : Using %d CPU thread(s) per CPU-executed Transfer\n", "NUM_CPU_PER_TRANSFER", numCpuPerTransfer, numCpuPerTransfer);
printf("%-20s = %12d : Using %d GPU devices\n", "NUM_GPU_DEVICES", numGpuDevices, numGpuDevices);
printf("%-20s = %12d : Each CU gets a multiple of %d bytes to copy\n", "BLOCK_BYTES", blockBytes, blockBytes);
printf("%-20s = %12d : Using byte offset of %d\n", "BYTE_OFFSET", byteOffset, byteOffset);
printf("%-20s = %12s : ", "FILL_PATTERN", getenv("FILL_PATTERN") ? "(specified)" : "(unset)");
if (fillPattern.size())
printf("Pattern: %s", getenv("FILL_PATTERN"));
else
printf("Pseudo-random: (Element i = i modulo 383 + 31)");
printf("\n");
printf("%-20s = %12d : Running %d %s per test\n", "NUM_ITERATIONS", numIterations,
numIterations > 0 ? numIterations : -numIterations,
numIterations > 0 ? "timed iteration(s)" : "second(s)");
printf("%-20s = %12d : Running %d warmup iteration(s) per topology\n", "NUM_WARMUPS", numWarmups, numWarmups);
printf("%-20s = %12d : Output to %s\n", "OUTPUT_TO_CSV", outputToCsv,
outputToCsv ? "CSV" : "console");
printf("%-20s = %12s : Using %d shared mem per threadblock\n", "SHARED_MEM_BYTES",
getenv("SHARED_MEM_BYTES") ? "(specified)" : "(unset)", sharedMemBytes);
printf("%-20s = %12d : Using %s for GPU-executed copies\n", "USE_HIP_CALL", useHipCall,
useHipCall ? "HIP functions" : "custom kernels");
if (useHipCall && !useMemset)
{
char* env = getenv("HSA_ENABLE_SDMA");
printf("%-20s = %12s : %s\n", "HSA_ENABLE_SDMA", env,
(env && !strcmp(env, "0")) ? "Using blit kernels for hipMemcpy" : "Using DMA copy engines");
}
printf("%-20s = %12d : Using %s-based GPU indexing\n", "USE_PCIE_INDEX",
usePcieIndexing, (usePcieIndexing ? "PCIe" : "HIP"));
printf("%-20s = %12d : Using single stream per %s\n", "USE_SINGLE_STREAM",
useSingleStream, (useSingleStream ? "device" : "Transfer"));
printf("\n");
}
};
// Helper function that parses environment variable or sets to default value static int GetEnvVar(std::string const varname, int defaultValue) static int GetEnvVar(std::string const& varname, int defaultValue)
static int GetEnvVar(std::string const varname, int defaultValue) static int GetEnvVar(std::string const& varname, int defaultValue)
{ {
if (getenv(varname.c_str())) if (getenv(varname.c_str()))
return atoi(getenv(varname.c_str())); return atoi(getenv(varname.c_str()));
return defaultValue; return defaultValue;
} }
static std::string GetEnvVar(std::string const& varname, std::string const& defaultValue)
{
if (getenv(varname.c_str()))
return getenv(varname.c_str());
return defaultValue;
}
}; };
#endif #endif
...@@ -12,3 +12,17 @@ TransferBench is a simple utility capable of benchmarking simultaneous copies be ...@@ -12,3 +12,17 @@ TransferBench is a simple utility capable of benchmarking simultaneous copies be
* `make` * `make`
If ROCm is installed in a folder other than `/opt/rocm/`, set ROCM_PATH appropriately If ROCm is installed in a folder other than `/opt/rocm/`, set ROCM_PATH appropriately
## Hints and suggestions
- Running TransferBench with no arguments will display usage instructions and detected topology information
- There are several preset configurations that can be used instead of a configuration file
including:
- p2p - Peer to peer benchmark test
- sweep - Sweep across possible sets of Transfers
- rsweep - Random sweep across possible sets of Transfers
- When using the same GPU executor in multiple simultaneous Transfers, performance may be
serialized due to the maximum number of hardware queues available.
- The number of maximum hardware queues can be adjusted via GPU_MAX_HW_QUEUES
- Alternatively, running in single stream mode (USE_SINGLE_STREAM=1) may avoid this issue
by launching all Transfers on a single stream instead of individual streams
...@@ -24,6 +24,7 @@ THE SOFTWARE. ...@@ -24,6 +24,7 @@ THE SOFTWARE.
// on the same node // on the same node
#include <numa.h> #include <numa.h>
#include <numaif.h> #include <numaif.h>
#include <random>
#include <stack> #include <stack>
#include <thread> #include <thread>
...@@ -33,6 +34,13 @@ THE SOFTWARE. ...@@ -33,6 +34,13 @@ THE SOFTWARE.
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
// Check for NUMA library support
if (numa_available() == -1)
{
printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n");
exit(1);
}
// Display usage instructions and detected topology // Display usage instructions and detected topology
if (argc <= 1) if (argc <= 1)
{ {
...@@ -63,14 +71,16 @@ int main(int argc, char **argv) ...@@ -63,14 +71,16 @@ int main(int argc, char **argv)
} }
PopulateTestSizes(numBytesPerTransfer, ev.samplingFactor, valuesOfN); PopulateTestSizes(numBytesPerTransfer, ev.samplingFactor, valuesOfN);
// Find the largest N to be used - memory will only be allocated once per set of simultaneous Transfers // Check for preset tests
size_t maxN = valuesOfN[0]; // - Tests that sweep across possible sets of Transfers
for (auto N : valuesOfN) if (!strcmp(argv[1], "sweep") || !strcmp(argv[1], "rsweep"))
maxN = std::max(maxN, N); {
RunSweepPreset(ev, numBytesPerTransfer, !strcmp(argv[1], "rsweep"));
// Execute only peer to peer benchmark mode, similar to rocm-bandwidth-test exit(0);
if (!strcmp(argv[1], "p2p") || !strcmp(argv[1], "p2p_rr") || }
!strcmp(argv[1], "g2g") || !strcmp(argv[1], "g2g_rr")) // - Tests that benchmark peer-to-peer performance
else if (!strcmp(argv[1], "p2p") || !strcmp(argv[1], "p2p_rr") ||
!strcmp(argv[1], "g2g") || !strcmp(argv[1], "g2g_rr"))
{ {
int numBlocksToUse = 0; int numBlocksToUse = 0;
if (argc > 3) if (argc > 3)
...@@ -96,33 +106,14 @@ int main(int argc, char **argv) ...@@ -96,33 +106,14 @@ int main(int argc, char **argv)
exit(1); exit(1);
} }
// Check for NUMA library support // Print environment variables and CSV header
if (numa_available() == -1)
{
printf("[ERROR] NUMA library not supported. Check to see if libnuma has been installed on this system\n");
exit(1);
}
ev.DisplayEnvVars(); ev.DisplayEnvVars();
int const initOffset = ev.byteOffset / sizeof(float);
std::stack<std::thread> threads;
// Collect the number of available CPUs/GPUs on this machine
int numGpuDevices;
HIP_CALL(hipGetDeviceCount(&numGpuDevices));
int const numCpuDevices = numa_num_configured_nodes();
// Track unique pair of transfers that get used
std::set<std::pair<int, int>> peerAccessTracker;
// Print CSV header
if (ev.outputToCsv) if (ev.outputToCsv)
{ {
printf("Test,NumBytes,SrcMem,Executor,DstMem,CUs,BW(GB/s),Time(ms)," printf("Test,NumBytes,SrcMem,Executor,DstMem,CUs,BW(GB/s),Time(ms),"
"TransferDesc,SrcAddr,DstAddr,ByteOffset,numWarmups,numIters\n"); "TransferDesc,SrcAddr,DstAddr,ByteOffset,numWarmups,numIters\n");
} }
// Loop over each line in the Transfer configuration file
int testNum = 0; int testNum = 0;
char line[2048]; char line[2048];
while(fgets(line, 2048, fp)) while(fgets(line, 2048, fp))
...@@ -130,317 +121,361 @@ int main(int argc, char **argv) ...@@ -130,317 +121,361 @@ int main(int argc, char **argv)
// Check if line is a comment to be echoed to output (starts with ##) // Check if line is a comment to be echoed to output (starts with ##)
if (!ev.outputToCsv && line[0] == '#' && line[1] == '#') printf("%s", line); if (!ev.outputToCsv && line[0] == '#' && line[1] == '#') printf("%s", line);
// Parse transfers from configuration file // Parse set of parallel Transfers to execute
TransferMap transferMap; std::vector<Transfer> transfers;
ParseTransfers(line, numCpuDevices, numGpuDevices, transferMap); ParseTransfers(line, ev.numCpuDevices, ev.numGpuDevices, transfers);
if (transferMap.size() == 0) continue; if (transfers.empty()) continue;
testNum++; ExecuteTransfers(ev, ++testNum, valuesOfN, transfers);
}
fclose(fp);
// Prepare (maximum) memory for each transfer return 0;
std::vector<Transfer*> transferList; }
for (auto& exeInfoPair : transferMap)
{
ExecutorInfo& exeInfo = exeInfoPair.second;
exeInfo.totalTime = 0.0;
exeInfo.totalBlocks = 0;
for (Transfer& transfer : exeInfo.transfers) void ExecuteTransfers(EnvVars const& ev,
{ int testNum,
// Get some aliases to transfer variables std::vector<size_t> const& valuesOfN,
MemType const& exeMemType = transfer.exeMemType; std::vector<Transfer>& transfers)
MemType const& srcMemType = transfer.srcMemType; {
MemType const& dstMemType = transfer.dstMemType; int const initOffset = ev.byteOffset / sizeof(float);
int const& blocksToUse = transfer.numBlocksToUse;
// Get potentially remapped device indices
int const srcIndex = RemappedIndex(transfer.srcIndex, srcMemType);
int const exeIndex = RemappedIndex(transfer.exeIndex, exeMemType);
int const dstIndex = RemappedIndex(transfer.dstIndex, dstMemType);
// Enable peer-to-peer access if necessary (can only be called once per unique pair)
if (exeMemType == MEM_GPU)
{
// Ensure executing GPU can access source memory
if ((srcMemType == MEM_GPU || srcMemType == MEM_GPU_FINE) && srcIndex != exeIndex)
{
auto exeSrcPair = std::make_pair(exeIndex, srcIndex);
if (!peerAccessTracker.count(exeSrcPair))
{
EnablePeerAccess(exeIndex, srcIndex);
peerAccessTracker.insert(exeSrcPair);
}
}
// Ensure executing GPU can access destination memory // Find the largest N to be used - memory will only be allocated once per set of Transfers
if ((dstMemType == MEM_GPU || dstMemType == MEM_GPU_FINE) && dstIndex != exeIndex) size_t maxN = valuesOfN[0];
{ for (auto N : valuesOfN)
auto exeDstPair = std::make_pair(exeIndex, dstIndex); maxN = std::max(maxN, N);
if (!peerAccessTracker.count(exeDstPair))
{
EnablePeerAccess(exeIndex, dstIndex);
peerAccessTracker.insert(exeDstPair);
}
}
}
// Allocate (maximum) source / destination memory based on type / device index // Map transfers by executor
AllocateMemory(srcMemType, srcIndex, maxN * sizeof(float) + ev.byteOffset, (void**)&transfer.srcMem); TransferMap transferMap;
AllocateMemory(dstMemType, dstIndex, maxN * sizeof(float) + ev.byteOffset, (void**)&transfer.dstMem); for (Transfer const& transfer : transfers)
transfer.blockParam.resize(exeMemType == MEM_CPU ? ev.numCpuPerTransfer : blocksToUse); {
exeInfo.totalBlocks += transfer.blockParam.size(); Executor executor(transfer.exeMemType, transfer.exeIndex);
transferList.push_back(&transfer); ExecutorInfo& executorInfo = transferMap[executor];
} executorInfo.transfers.push_back(transfer);
}
// Prepare GPU resources for GPU executors // Loop over each executor and prepare GPU resources
MemType const exeMemType = exeInfoPair.first.first; std::vector<Transfer*> transferList;
int const exeIndex = RemappedIndex(exeInfoPair.first.second, exeMemType); for (auto& exeInfoPair : transferMap)
{
Executor const& executor = exeInfoPair.first;
ExecutorInfo& exeInfo = exeInfoPair.second;
exeInfo.totalTime = 0.0;
exeInfo.totalBlocks = 0;
// Loop over each transfer this executor is involved in
for (Transfer& transfer : exeInfo.transfers)
{
// Get some aliases to transfer variables
MemType const& exeMemType = transfer.exeMemType;
MemType const& srcMemType = transfer.srcMemType;
MemType const& dstMemType = transfer.dstMemType;
int const& blocksToUse = transfer.numBlocksToUse;
// Get potentially remapped device indices
int const srcIndex = RemappedIndex(transfer.srcIndex, srcMemType);
int const exeIndex = RemappedIndex(transfer.exeIndex, exeMemType);
int const dstIndex = RemappedIndex(transfer.dstIndex, dstMemType);
// Enable peer-to-peer access if necessary (can only be called once per unique pair)
if (exeMemType == MEM_GPU) if (exeMemType == MEM_GPU)
{ {
AllocateMemory(exeMemType, exeIndex, exeInfo.totalBlocks * sizeof(BlockParam), // Ensure executing GPU can access source memory
(void**)&exeInfo.blockParamGpu); if ((srcMemType == MEM_GPU || srcMemType == MEM_GPU_FINE) && srcIndex != exeIndex)
EnablePeerAccess(exeIndex, srcIndex);
int const numTransfersToRun = ev.useSingleStream ? 1 : exeInfo.transfers.size(); // Ensure executing GPU can access destination memory
exeInfo.streams.resize(numTransfersToRun); if ((dstMemType == MEM_GPU || dstMemType == MEM_GPU_FINE) && dstIndex != exeIndex)
exeInfo.startEvents.resize(numTransfersToRun); EnablePeerAccess(exeIndex, dstIndex);
exeInfo.stopEvents.resize(numTransfersToRun);
for (int i = 0; i < numTransfersToRun; ++i)
{
HIP_CALL(hipSetDevice(exeIndex));
HIP_CALL(hipStreamCreate(&exeInfo.streams[i]));
HIP_CALL(hipEventCreate(&exeInfo.startEvents[i]));
HIP_CALL(hipEventCreate(&exeInfo.stopEvents[i]));
}
int transferOffset = 0;
for (int i = 0; i < exeInfo.transfers.size(); i++)
{
exeInfo.transfers[i].blockParamGpuPtr = exeInfo.blockParamGpu + transferOffset;
transferOffset += exeInfo.transfers[i].blockParam.size();
}
} }
// Allocate (maximum) source / destination memory based on type / device index
AllocateMemory(srcMemType, srcIndex, maxN * sizeof(float) + ev.byteOffset, (void**)&transfer.srcMem);
AllocateMemory(dstMemType, dstIndex, maxN * sizeof(float) + ev.byteOffset, (void**)&transfer.dstMem);
transfer.blockParam.resize(exeMemType == MEM_CPU ? ev.numCpuPerTransfer : blocksToUse);
exeInfo.totalBlocks += transfer.blockParam.size();
transferList.push_back(&transfer);
} }
// Loop over all the different number of bytes to use per Transfer // Prepare per-threadblock parameters for GPU executors
for (auto N : valuesOfN) MemType const exeMemType = executor.first;
int const exeIndex = RemappedIndex(executor.second, exeMemType);
if (exeMemType == MEM_GPU)
{ {
if (!ev.outputToCsv) printf("Test %d: [%lu bytes]\n", testNum, N * sizeof(float)); // Allocate one contiguous chunk of GPU memory for threadblock parameters
// This allows support for executing one transfer per stream, or all transfers in a single stream
// Prepare input memory and block parameters for current N AllocateMemory(exeMemType, exeIndex, exeInfo.totalBlocks * sizeof(BlockParam),
for (auto& exeInfoPair : transferMap) (void**)&exeInfo.blockParamGpu);
int const numTransfersToRun = ev.useSingleStream ? 1 : exeInfo.transfers.size();
exeInfo.streams.resize(numTransfersToRun);
exeInfo.startEvents.resize(numTransfersToRun);
exeInfo.stopEvents.resize(numTransfersToRun);
for (int i = 0; i < numTransfersToRun; ++i)
{ {
ExecutorInfo& exeInfo = exeInfoPair.second; HIP_CALL(hipSetDevice(exeIndex));
HIP_CALL(hipStreamCreate(&exeInfo.streams[i]));
HIP_CALL(hipEventCreate(&exeInfo.startEvents[i]));
HIP_CALL(hipEventCreate(&exeInfo.stopEvents[i]));
}
int transferOffset = 0; // Assign each transfer its portion of threadblock parameters
int transferOffset = 0;
for (int i = 0; i < exeInfo.transfers.size(); i++)
{
exeInfo.transfers[i].blockParamGpuPtr = exeInfo.blockParamGpu + transferOffset;
transferOffset += exeInfo.transfers[i].blockParam.size();
}
}
}
for (int i = 0; i < exeInfo.transfers.size(); ++i) // Loop over all the different number of bytes to use per Transfer
{ for (auto N : valuesOfN)
Transfer& transfer = exeInfo.transfers[i]; {
transfer.PrepareBlockParams(ev, N); if (!ev.outputToCsv) printf("Test %d: [%lu bytes]\n", testNum, N * sizeof(float));
// Copy block parameters to GPU for GPU executors // Prepare input memory and block parameters for current N
if (transfer.exeMemType == MEM_GPU) for (auto& exeInfoPair : transferMap)
{ {
HIP_CALL(hipMemcpy(&exeInfo.blockParamGpu[transferOffset], ExecutorInfo& exeInfo = exeInfoPair.second;
transfer.blockParam.data(),
transfer.blockParam.size() * sizeof(BlockParam),
hipMemcpyHostToDevice));
transferOffset += transfer.blockParam.size();
}
}
}
// Launch kernels (warmup iterations are not counted) int transferOffset = 0;
double totalCpuTime = 0; for (int i = 0; i < exeInfo.transfers.size(); ++i)
size_t numTimedIterations = 0;
for (int iteration = -ev.numWarmups; ; iteration++)
{ {
if (ev.numIterations > 0 && iteration >= ev.numIterations) break; // Prepare subarrays each threadblock works on and fill src memory with patterned data
if (ev.numIterations < 0 && totalCpuTime > -ev.numIterations) break; Transfer& transfer = exeInfo.transfers[i];
transfer.PrepareBlockParams(ev, N);
// Pause before starting first timed iteration in interactive mode // Copy block parameters to GPU for GPU executors
if (ev.useInteractive && iteration == 0) if (transfer.exeMemType == MEM_GPU)
{ {
printf("Hit <Enter> to continue: "); HIP_CALL(hipMemcpy(&exeInfo.blockParamGpu[transferOffset],
scanf("%*c"); transfer.blockParam.data(),
printf("\n"); transfer.blockParam.size() * sizeof(BlockParam),
hipMemcpyHostToDevice));
transferOffset += transfer.blockParam.size();
} }
}
}
// Start CPU timing for this iteration // Launch kernels (warmup iterations are not counted)
auto cpuStart = std::chrono::high_resolution_clock::now(); double totalCpuTime = 0;
size_t numTimedIterations = 0;
std::stack<std::thread> threads;
for (int iteration = -ev.numWarmups; ; iteration++)
{
if (ev.numIterations > 0 && iteration >= ev.numIterations) break;
if (ev.numIterations < 0 && totalCpuTime > -ev.numIterations) break;
// Execute all Transfers in parallel // Pause before starting first timed iteration in interactive mode
for (auto& exeInfoPair : transferMap) if (ev.useInteractive && iteration == 0)
{ {
ExecutorInfo& exeInfo = exeInfoPair.second; printf("Hit <Enter> to continue: ");
int const numTransfersToRun = ev.useSingleStream ? 1 : exeInfo.transfers.size(); scanf("%*c");
for (int i = 0; i < numTransfersToRun; ++i) printf("\n");
threads.push(std::thread(RunTransfer, std::ref(ev), N, iteration, std::ref(exeInfo), i)); }
}
// Wait for all threads to finish // Start CPU timing for this iteration
int const numTransfers = threads.size(); auto cpuStart = std::chrono::high_resolution_clock::now();
for (int i = 0; i < numTransfers; i++)
{
threads.top().join();
threads.pop();
}
// Stop CPU timing for this iteration // Execute all Transfers in parallel
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart; for (auto& exeInfoPair : transferMap)
double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count(); {
ExecutorInfo& exeInfo = exeInfoPair.second;
int const numTransfersToRun = (IsGpuType(exeInfoPair.first.first) && ev.useSingleStream) ?
1 : exeInfo.transfers.size();
for (int i = 0; i < numTransfersToRun; ++i)
threads.push(std::thread(RunTransfer, std::ref(ev), N, iteration, std::ref(exeInfo), i));
}
if (iteration >= 0) // Wait for all threads to finish
{ int const numTransfers = threads.size();
++numTimedIterations; for (int i = 0; i < numTransfers; i++)
totalCpuTime += deltaSec; {
} threads.top().join();
threads.pop();
} }
// Pause for interactive mode // Stop CPU timing for this iteration
if (ev.useInteractive) auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaSec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count();
if (iteration >= 0)
{ {
printf("Transfers complete. Hit <Enter> to continue: "); ++numTimedIterations;
scanf("%*c"); totalCpuTime += deltaSec;
printf("\n");
} }
}
// Pause for interactive mode
if (ev.useInteractive)
{
printf("Transfers complete. Hit <Enter> to continue: ");
scanf("%*c");
printf("\n");
}
// Validate that each transfer has transferred correctly // Validate that each transfer has transferred correctly
int const numTransfers = transferList.size(); int const numTransfers = transferList.size();
for (auto transfer : transferList) for (auto transfer : transferList)
CheckOrFill(MODE_CHECK, N, ev.useMemset, ev.useHipCall, ev.fillPattern, transfer->dstMem + initOffset); CheckOrFill(MODE_CHECK, N, ev.useMemset, ev.useHipCall, ev.fillPattern, transfer->dstMem + initOffset);
// Report timings // Report timings
totalCpuTime = totalCpuTime / (1.0 * numTimedIterations) * 1000; totalCpuTime = totalCpuTime / (1.0 * numTimedIterations) * 1000;
double totalBandwidthGbs = (numTransfers * N * sizeof(float) / 1.0E6) / totalCpuTime; double totalBandwidthGbs = (numTransfers * N * sizeof(float) / 1.0E6) / totalCpuTime;
double maxGpuTime = 0; double maxGpuTime = 0;
if (ev.useSingleStream) if (ev.useSingleStream)
{
for (auto& exeInfoPair : transferMap)
{ {
for (auto& exeInfoPair : transferMap) ExecutorInfo exeInfo = exeInfoPair.second;
MemType const exeMemType = exeInfoPair.first.first;
int const exeIndex = exeInfoPair.first.second;
// Compute total time for CPU executors
if (!IsGpuType(exeMemType))
{ {
ExecutorInfo const& exeInfo = exeInfoPair.second; exeInfo.totalTime = 0;
MemType const exeMemType = exeInfoPair.first.first; for (auto const& transfer : exeInfo.transfers)
int const exeIndex = exeInfoPair.first.second; exeInfo.totalTime = std::max(exeInfo.totalTime, transfer.transferTime);
}
double exeDurationMsec = exeInfo.totalTime / (1.0 * numTimedIterations); double exeDurationMsec = exeInfo.totalTime / (1.0 * numTimedIterations);
double exeBandwidthGbs = (exeInfo.transfers.size() * N * sizeof(float) / 1.0E9) / exeDurationMsec * 1000.0f; double exeBandwidthGbs = (exeInfo.transfers.size() * N * sizeof(float) / 1.0E9) /
maxGpuTime = std::max(maxGpuTime, exeDurationMsec); exeDurationMsec * 1000.0f;
maxGpuTime = std::max(maxGpuTime, exeDurationMsec);
if (!ev.outputToCsv) if (!ev.outputToCsv)
{ {
printf(" Executor: %cPU %02d (# Transfers %02lu)| %9.3f GB/s | %8.3f ms |\n", printf(" Executor: %cPU %02d (# Transfers %02lu)| %9.3f GB/s | %8.3f ms |\n",
MemTypeStr[exeMemType], exeIndex, exeInfo.transfers.size(), exeBandwidthGbs, exeDurationMsec); MemTypeStr[exeMemType], exeIndex, exeInfo.transfers.size(), exeBandwidthGbs, exeDurationMsec);
for (auto transfer : exeInfo.transfers)
{
double transferDurationMsec = transfer.transferTime / (1.0 * numTimedIterations);
double transferBandwidthGbs = (N * sizeof(float) / 1.0E9) / transferDurationMsec * 1000.0f;
printf(" Transfer %02d | %9.3f GB/s | %8.3f ms | %c%02d -> %c%02d:(%03d) -> %c%02d\n",
transfer.transferIndex,
transferBandwidthGbs,
transferDurationMsec,
MemTypeStr[transfer.srcMemType], transfer.srcIndex,
MemTypeStr[transfer.exeMemType], transfer.exeIndex,
transfer.exeMemType == MEM_CPU ? ev.numCpuPerTransfer : transfer.numBlocksToUse,
MemTypeStr[transfer.dstMemType], transfer.dstIndex);
}
}
else
{
printf("%d,%lu,ALL,%c%02d,ALL,ALL,%.3f,%.3f,ALL,ALL,ALL,%d,%d,%lu\n",
testNum, N * sizeof(float),
MemTypeStr[exeMemType], exeIndex,
exeBandwidthGbs, exeDurationMsec,
ev.byteOffset,
ev.numWarmups, numTimedIterations);
}
} }
}
else for (auto const& transfer : exeInfo.transfers)
{
for (auto transfer : transferList)
{ {
double transferDurationMsec = transfer->transferTime / (1.0 * numTimedIterations); double transferDurationMsec = transfer.transferTime / (1.0 * numTimedIterations);
double transferBandwidthGbs = (N * sizeof(float) / 1.0E9) / transferDurationMsec * 1000.0f; double transferBandwidthGbs = (N * sizeof(float) / 1.0E9) / transferDurationMsec * 1000.0f;
maxGpuTime = std::max(maxGpuTime, transferDurationMsec);
if (!ev.outputToCsv) if (!ev.outputToCsv)
{ {
printf(" Transfer %02d: %c%02d -> [%cPU %02d:%03d] -> %c%02d | %9.3f GB/s | %8.3f ms | %-16s\n", printf(" Transfer %02d | %9.3f GB/s | %8.3f ms | %c%02d -> %c%02d:(%03d) -> %c%02d\n",
transfer->transferIndex, transfer.transferIndex,
MemTypeStr[transfer->srcMemType], transfer->srcIndex, transferBandwidthGbs,
MemTypeStr[transfer->exeMemType], transfer->exeIndex, transferDurationMsec,
transfer->exeMemType == MEM_CPU ? ev.numCpuPerTransfer : transfer->numBlocksToUse, MemTypeStr[transfer.srcMemType], transfer.srcIndex,
MemTypeStr[transfer->dstMemType], transfer->dstIndex, MemTypeStr[transfer.exeMemType], transfer.exeIndex,
transferBandwidthGbs, transferDurationMsec, transfer.exeMemType == MEM_CPU ? ev.numCpuPerTransfer : transfer.numBlocksToUse,
GetTransferDesc(*transfer).c_str()); MemTypeStr[transfer.dstMemType], transfer.dstIndex);
} }
else else
{ {
printf("%d,%lu,%c%02d,%c%02d,%c%02d,%d,%.3f,%.3f,%s,%p,%p,%d,%d,%lu\n", printf("%d,%lu,%c%02d,%c%02d,%c%02d,%d,%.3f,%.3f,%s,%p,%p,%d,%d,%lu\n",
testNum, N * sizeof(float), testNum, N * sizeof(float),
MemTypeStr[transfer->srcMemType], transfer->srcIndex, MemTypeStr[transfer.srcMemType], transfer.srcIndex,
MemTypeStr[transfer->exeMemType], transfer->exeIndex, MemTypeStr[transfer.exeMemType], transfer.exeIndex,
MemTypeStr[transfer->dstMemType], transfer->dstIndex, MemTypeStr[transfer.dstMemType], transfer.dstIndex,
transfer->exeMemType == MEM_CPU ? ev.numCpuPerTransfer : transfer->numBlocksToUse, transfer.exeMemType == MEM_CPU ? ev.numCpuPerTransfer : transfer.numBlocksToUse,
transferBandwidthGbs, transferDurationMsec, transferBandwidthGbs, transferDurationMsec,
GetTransferDesc(*transfer).c_str(), GetTransferDesc(transfer).c_str(),
transfer->srcMem + initOffset, transfer->dstMem + initOffset, transfer.srcMem + initOffset, transfer.dstMem + initOffset,
ev.byteOffset, ev.byteOffset,
ev.numWarmups, numTimedIterations); ev.numWarmups, numTimedIterations);
} }
} }
}
// Display aggregate statistics if (ev.outputToCsv)
if (!ev.outputToCsv) {
{ printf("%d,%lu,ALL,%c%02d,ALL,ALL,%.3f,%.3f,ALL,ALL,ALL,%d,%d,%lu\n",
printf(" Aggregate Bandwidth (CPU timed) | %9.3f GB/s | %8.3f ms | Overhead: %.3f ms\n", totalBandwidthGbs, totalCpuTime, testNum, N * sizeof(float),
totalCpuTime - maxGpuTime); MemTypeStr[exeMemType], exeIndex,
exeBandwidthGbs, exeDurationMsec,
ev.byteOffset,
ev.numWarmups, numTimedIterations);
}
} }
else }
else
{
for (auto const& transfer : transferList)
{ {
printf("%d,%lu,ALL,ALL,ALL,ALL,%.3f,%.3f,ALL,ALL,ALL,%d,%d,%lu\n", double transferDurationMsec = transfer->transferTime / (1.0 * numTimedIterations);
testNum, N * sizeof(float), totalBandwidthGbs, totalCpuTime, ev.byteOffset, double transferBandwidthGbs = (N * sizeof(float) / 1.0E9) / transferDurationMsec * 1000.0f;
ev.numWarmups, numTimedIterations); maxGpuTime = std::max(maxGpuTime, transferDurationMsec);
if (!ev.outputToCsv)
{
printf(" Transfer %02d: %c%02d -> [%cPU %02d:%03d] -> %c%02d | %9.3f GB/s | %8.3f ms | %-16s\n",
transfer->transferIndex,
MemTypeStr[transfer->srcMemType], transfer->srcIndex,
MemTypeStr[transfer->exeMemType], transfer->exeIndex,
transfer->exeMemType == MEM_CPU ? ev.numCpuPerTransfer : transfer->numBlocksToUse,
MemTypeStr[transfer->dstMemType], transfer->dstIndex,
transferBandwidthGbs, transferDurationMsec,
GetTransferDesc(*transfer).c_str());
}
else
{
printf("%d,%lu,%c%02d,%c%02d,%c%02d,%d,%.3f,%.3f,%s,%p,%p,%d,%d,%lu\n",
testNum, N * sizeof(float),
MemTypeStr[transfer->srcMemType], transfer->srcIndex,
MemTypeStr[transfer->exeMemType], transfer->exeIndex,
MemTypeStr[transfer->dstMemType], transfer->dstIndex,
transfer->exeMemType == MEM_CPU ? ev.numCpuPerTransfer : transfer->numBlocksToUse,
transferBandwidthGbs, transferDurationMsec,
GetTransferDesc(*transfer).c_str(),
transfer->srcMem + initOffset, transfer->dstMem + initOffset,
ev.byteOffset,
ev.numWarmups, numTimedIterations);
}
} }
} }
// Release GPU memory // Display aggregate statistics
for (auto exeInfoPair : transferMap) if (!ev.outputToCsv)
{ {
ExecutorInfo& exeInfo = exeInfoPair.second; printf(" Aggregate Bandwidth (CPU timed) | %9.3f GB/s | %8.3f ms | Overhead: %.3f ms\n",
for (auto& transfer : exeInfo.transfers) totalBandwidthGbs, totalCpuTime, totalCpuTime - maxGpuTime);
{ }
// Get some aliases to Transfer variables else
MemType const& exeMemType = transfer.exeMemType; {
MemType const& srcMemType = transfer.srcMemType; printf("%d,%lu,ALL,ALL,ALL,ALL,%.3f,%.3f,ALL,ALL,ALL,%d,%d,%lu\n",
MemType const& dstMemType = transfer.dstMemType; testNum, N * sizeof(float), totalBandwidthGbs, totalCpuTime, ev.byteOffset,
ev.numWarmups, numTimedIterations);
// Allocate (maximum) source / destination memory based on type / device index }
DeallocateMemory(srcMemType, transfer.srcMem); }
DeallocateMemory(dstMemType, transfer.dstMem);
transfer.blockParam.clear();
}
MemType const exeMemType = exeInfoPair.first.first; // Release GPU memory
int const exeIndex = RemappedIndex(exeInfoPair.first.second, exeMemType); for (auto exeInfoPair : transferMap)
if (exeMemType == MEM_GPU) {
ExecutorInfo& exeInfo = exeInfoPair.second;
for (auto& transfer : exeInfo.transfers)
{
// Get some aliases to Transfer variables
MemType const& exeMemType = transfer.exeMemType;
MemType const& srcMemType = transfer.srcMemType;
MemType const& dstMemType = transfer.dstMemType;
// Allocate (maximum) source / destination memory based on type / device index
DeallocateMemory(srcMemType, transfer.srcMem);
DeallocateMemory(dstMemType, transfer.dstMem);
transfer.blockParam.clear();
}
MemType const exeMemType = exeInfoPair.first.first;
int const exeIndex = RemappedIndex(exeInfoPair.first.second, exeMemType);
if (exeMemType == MEM_GPU)
{
DeallocateMemory(exeMemType, exeInfo.blockParamGpu);
int const numTransfersToRun = ev.useSingleStream ? 1 : exeInfo.transfers.size();
for (int i = 0; i < numTransfersToRun; ++i)
{ {
DeallocateMemory(exeMemType, exeInfo.blockParamGpu); HIP_CALL(hipEventDestroy(exeInfo.startEvents[i]));
int const numTransfersToRun = ev.useSingleStream ? 1 : exeInfo.transfers.size(); HIP_CALL(hipEventDestroy(exeInfo.stopEvents[i]));
for (int i = 0; i < numTransfersToRun; ++i) HIP_CALL(hipStreamDestroy(exeInfo.streams[i]));
{
HIP_CALL(hipEventDestroy(exeInfo.startEvents[i]));
HIP_CALL(hipEventDestroy(exeInfo.stopEvents[i]));
HIP_CALL(hipStreamDestroy(exeInfo.streams[i]));
}
} }
} }
} }
fclose(fp);
return 0;
} }
void DisplayUsage(char const* cmdName) void DisplayUsage(char const* cmdName)
...@@ -461,10 +496,10 @@ void DisplayUsage(char const* cmdName) ...@@ -461,10 +496,10 @@ void DisplayUsage(char const* cmdName)
printf(" config: Either:\n"); printf(" config: Either:\n");
printf(" - Filename of configFile containing Transfers to execute (see example.cfg for format)\n"); printf(" - Filename of configFile containing Transfers to execute (see example.cfg for format)\n");
printf(" - Name of preset benchmark:\n"); printf(" - Name of preset benchmark:\n");
printf(" p2p - All CPU/GPU pairs benchmark\n"); printf(" p2p{_rr} - All CPU/GPU pairs benchmark {with remote reads}\n");
printf(" p2p_rr - All CPU/GPU pairs benchmark with remote reads\n"); printf(" g2g{_rr} - All GPU/GPU pairs benchmark {with remote reads}\n");
printf(" g2g - All GPU/GPU pairs benchmark\n"); printf(" sweep - Sweep across possible sets of Transfers\n");
printf(" g2g_rr - All GPU/GPU pairs benchmark with remote reads\n"); printf(" rsweep - Randomly sweep across possible sets of Transfers\n");
printf(" - 3rd optional argument will be used as # of CUs to use (uses all by default)\n"); printf(" - 3rd optional argument will be used as # of CUs to use (uses all by default)\n");
printf(" N : (Optional) Number of bytes to copy per Transfer.\n"); printf(" N : (Optional) Number of bytes to copy per Transfer.\n");
printf(" If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes\n", printf(" If not specified, defaults to %lu bytes. Must be a multiple of 4 bytes\n",
...@@ -649,15 +684,15 @@ void ParseMemType(std::string const& token, int const numCpus, int const numGpus ...@@ -649,15 +684,15 @@ void ParseMemType(std::string const& token, int const numCpus, int const numGpus
} }
// Helper function to parse a list of Transfer definitions // Helper function to parse a list of Transfer definitions
void ParseTransfers(char* line, int numCpus, int numGpus, TransferMap& transferMap) void ParseTransfers(char* line, int numCpus, int numGpus, std::vector<Transfer>& transfers)
{ {
// Replace any round brackets or '->' with spaces, // Replace any round brackets or '->' with spaces,
for (int i = 1; line[i]; i++) for (int i = 1; line[i]; i++)
if (line[i] == '(' || line[i] == ')' || line[i] == '-' || line[i] == '>' ) line[i] = ' '; if (line[i] == '(' || line[i] == ')' || line[i] == '-' || line[i] == '>' ) line[i] = ' ';
transferMap.clear(); transfers.clear();
int numTransfers = 0;
int numTransfers = 0;
std::istringstream iss(line); std::istringstream iss(line);
iss >> numTransfers; iss >> numTransfers;
if (iss.fail()) return; if (iss.fail()) return;
...@@ -665,75 +700,43 @@ void ParseTransfers(char* line, int numCpus, int numGpus, TransferMap& transferM ...@@ -665,75 +700,43 @@ void ParseTransfers(char* line, int numCpus, int numGpus, TransferMap& transferM
std::string exeMem; std::string exeMem;
std::string srcMem; std::string srcMem;
std::string dstMem; std::string dstMem;
if (numTransfers > 0)
// If numTransfers < 0, read quads (srcMem, exeMem, dstMem, #CUs)
// otherwise read triples (srcMem, exeMem, dstMem)
bool const perTransferCUs = (numTransfers < 0);
numTransfers = abs(numTransfers);
int numBlocksToUse;
if (!perTransferCUs)
{ {
// Method 1: Take in triples (srcMem, exeMem, dstMem)
int numBlocksToUse;
iss >> numBlocksToUse; iss >> numBlocksToUse;
if (numBlocksToUse <= 0 || iss.fail()) if (numBlocksToUse <= 0 || iss.fail())
{ {
printf("Parsing error: Number of blocks to use (%d) must be greater than 0\n", numBlocksToUse); printf("Parsing error: Number of blocks to use (%d) must be greater than 0\n", numBlocksToUse);
exit(1); exit(1);
} }
for (int i = 0; i < numTransfers; i++)
{
Transfer transfer;
transfer.transferIndex = i;
iss >> srcMem >> exeMem >> dstMem;
if (iss.fail())
{
printf("Parsing error: Unable to read valid Transfer triplet (possibly missing a SRC or EXE or DST)\n");
exit(1);
}
ParseMemType(srcMem, numCpus, numGpus, &transfer.srcMemType, &transfer.srcIndex);
ParseMemType(exeMem, numCpus, numGpus, &transfer.exeMemType, &transfer.exeIndex);
ParseMemType(dstMem, numCpus, numGpus, &transfer.dstMemType, &transfer.dstIndex);
transfer.numBlocksToUse = numBlocksToUse;
// Ensure executor is either CPU or GPU
if (transfer.exeMemType != MEM_CPU && transfer.exeMemType != MEM_GPU)
{
printf("[ERROR] Executor must either be CPU ('C') or GPU ('G'), (from (%s->%s->%s %d))\n",
srcMem.c_str(), exeMem.c_str(), dstMem.c_str(), transfer.numBlocksToUse);
exit(1);
}
Executor executor(transfer.exeMemType, transfer.exeIndex);
ExecutorInfo& executorInfo = transferMap[executor];
executorInfo.totalBlocks += transfer.numBlocksToUse;
executorInfo.transfers.push_back(transfer);
}
} }
else
{
// Method 2: Read in quads (srcMem, exeMem, dstMem, Read common # blocks to use, then read (src, dst) doubles
numTransfers *= -1;
for (int i = 0; i < numTransfers; i++) for (int i = 0; i < numTransfers; i++)
{
Transfer transfer;
transfer.transferIndex = i;
iss >> srcMem >> exeMem >> dstMem;
if (perTransferCUs) iss >> numBlocksToUse;
if (iss.fail())
{ {
Transfer transfer; if (perTransferCUs)
transfer.transferIndex = i;
iss >> srcMem >> exeMem >> dstMem >> transfer.numBlocksToUse;
if (iss.fail())
{
printf("Parsing error: Unable to read valid Transfer quadruple (possibly missing a SRC or EXE or DST or #CU)\n"); printf("Parsing error: Unable to read valid Transfer quadruple (possibly missing a SRC or EXE or DST or #CU)\n");
exit(1); else
} printf("Parsing error: Unable to read valid Transfer triplet (possibly missing a SRC or EXE or DST)\n");
ParseMemType(srcMem, numCpus, numGpus, &transfer.srcMemType, &transfer.srcIndex); exit(1);
ParseMemType(exeMem, numCpus, numGpus, &transfer.exeMemType, &transfer.exeIndex);
ParseMemType(dstMem, numCpus, numGpus, &transfer.dstMemType, &transfer.dstIndex);
if (transfer.exeMemType != MEM_CPU && transfer.exeMemType != MEM_GPU)
{
printf("[ERROR] Executor must either be CPU ('C') or GPU ('G'), (from (%s->%s->%s %d))\n"
, srcMem.c_str(), exeMem.c_str(), dstMem.c_str(), transfer.numBlocksToUse);
exit(1);
}
Executor executor(transfer.exeMemType, transfer.exeIndex);
ExecutorInfo& executorInfo = transferMap[executor];
executorInfo.totalBlocks += transfer.numBlocksToUse;
executorInfo.transfers.push_back(transfer);
} }
ParseMemType(srcMem, numCpus, numGpus, &transfer.srcMemType, &transfer.srcIndex);
ParseMemType(exeMem, numCpus, numGpus, &transfer.exeMemType, &transfer.exeIndex);
ParseMemType(dstMem, numCpus, numGpus, &transfer.dstMemType, &transfer.dstIndex);
transfer.numBlocksToUse = numBlocksToUse;
transfers.push_back(transfer);
} }
} }
...@@ -747,7 +750,13 @@ void EnablePeerAccess(int const deviceId, int const peerDeviceId) ...@@ -747,7 +750,13 @@ void EnablePeerAccess(int const deviceId, int const peerDeviceId)
exit(1); exit(1);
} }
HIP_CALL(hipSetDevice(deviceId)); HIP_CALL(hipSetDevice(deviceId));
HIP_CALL(hipDeviceEnablePeerAccess(peerDeviceId, 0)); hipError_t error = hipDeviceEnablePeerAccess(peerDeviceId, 0);
if (error != hipSuccess && error != hipErrorPeerAccessAlreadyEnabled)
{
printf("[ERROR] Unable to enable peer to peer access from %d to %d (%s)\n",
deviceId, peerDeviceId, hipGetErrorString(error));
exit(1);
}
} }
void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr) void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr)
...@@ -982,7 +991,8 @@ std::string GetTransferDesc(Transfer const& transfer) ...@@ -982,7 +991,8 @@ std::string GetTransferDesc(Transfer const& transfer)
+ GetDesc(transfer.exeMemType, transfer.exeIndex, transfer.dstMemType, transfer.dstIndex); + GetDesc(transfer.exeMemType, transfer.exeIndex, transfer.dstMemType, transfer.dstIndex);
} }
void RunTransfer(EnvVars const& ev, size_t const N, int const iteration, ExecutorInfo& exeInfo, int const transferIdx) void RunTransfer(EnvVars const& ev, size_t const N, int const iteration,
ExecutorInfo& exeInfo, int const transferIdx)
{ {
Transfer& transfer = exeInfo.transfers[transferIdx]; Transfer& transfer = exeInfo.transfers[transferIdx];
...@@ -1348,3 +1358,151 @@ int GetWallClockRate(int deviceId) ...@@ -1348,3 +1358,151 @@ int GetWallClockRate(int deviceId)
} }
return wallClockPerDeviceMhz[deviceId]; return wallClockPerDeviceMhz[deviceId];
} }
// Runs the "sweep" / "randomsweep" preset benchmarks: builds the set of all
// possible (SRC -> EXE -> DST) Transfer triplets permitted by the sweep
// environment variables (SWEEP_SRC / SWEEP_EXE / SWEEP_DST), then repeatedly
// executes subsets of them via ExecuteTransfers until a test-count, time, or
// enumeration limit is reached.
//   ev                  - environment-variable settings (sweep limits, device counts, ...)
//   numBytesPerTransfer - bytes copied per Transfer (converted below to a float count)
//   isRandom            - false: exhaustively enumerate subsets ("sweep")
//                         true : execute randomly chosen subsets ("randomsweep")
void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, bool const isRandom)
{
  ev.DisplaySweepEnvVars();

  // Every test in the sweep uses this single transfer size (in floats)
  std::vector<size_t> valuesOfN(1, numBytesPerTransfer / sizeof(float));

  // Compute how many possible Transfers are permitted (unique SRC/EXE/DST triplets)
  // Expand each executor memory-type character into one entry per device,
  // and remember whether any CPU / GPU executor is available at all
  bool hasCpuExecutor = false;
  bool hasGpuExecutor = false;
  std::vector<std::pair<MemType, int>> exeList;
  for (auto exe : ev.sweepExe)
  {
    MemType const exeMemType = CharToMemType(exe);
    int numDevices;
    if (IsGpuType(exeMemType))
    {
      numDevices = ev.numGpuDevices;
      hasGpuExecutor = true;
    }
    else
    {
      numDevices = ev.numCpuDevices;
      hasCpuExecutor = true;
    }
    for (int exeIndex = 0; exeIndex < numDevices; ++exeIndex)
      exeList.push_back(std::make_pair(exeMemType, exeIndex));
  }
  // When SWEEP_SRC_IS_EXE is set the executor is derived from the source,
  // so the executor dimension collapses to 1
  int numExes = ev.sweepSrcIsExe ? 1 : exeList.size();

  // Expand source memory-type characters into one entry per device
  std::vector<std::pair<MemType, int>> srcList;
  for (auto src : ev.sweepSrc)
  {
    MemType const srcMemType = CharToMemType(src);
    int const numDevices = IsGpuType(srcMemType) ? ev.numGpuDevices : ev.numCpuDevices;

    // Skip source memory type if executor is supposed to be source but not specified
    // NOTE(review): this filter is applied unconditionally, not only when
    // ev.sweepSrcIsExe is set — confirm that is intended
    if ((IsGpuType(srcMemType) && !hasGpuExecutor) ||
        (!IsGpuType(srcMemType) && !hasCpuExecutor)) continue;

    for (int srcIndex = 0; srcIndex < numDevices; ++srcIndex)
      srcList.push_back(std::make_pair(srcMemType, srcIndex));
  }
  int numSrcs = srcList.size();

  // Expand destination memory-type characters into one entry per device
  std::vector<std::pair<MemType, int>> dstList;
  for (auto dst : ev.sweepDst)
  {
    MemType const dstMemType = CharToMemType(dst);
    int const numDevices = IsGpuType(dstMemType) ? ev.numGpuDevices : ev.numCpuDevices;
    for (int dstIndex = 0; dstIndex < numDevices; ++dstIndex)
      dstList.push_back(std::make_pair(dstMemType, dstIndex));
  }
  int numDsts = dstList.size();

  // Total number of distinct triplets; SWEEP_MAX == 0 means "no cap"
  int const numPossible = numSrcs * numExes * numDsts;
  int maxParallelTransfers = (ev.sweepMax == 0 ? numPossible : ev.sweepMax);
  if (ev.sweepSrcIsExe)
  {
    printf("Num possible (SRC/DST) triplets: (%d/%d) = %d\n", numSrcs, numDsts, numPossible);
  }
  else
  {
    printf("Num possible (SRC/EXE/DST) triplets: (%d/%d/%d) = %d\n", numSrcs, numExes, numDsts, numPossible);
  }
  if (ev.sweepMin > numPossible)
  {
    printf("No valid test configurations exist\n");
    return;
  }

  int numTestsRun = 0;
  int M = ev.sweepMin;

  // Create bitmask of numPossible triplets, of which M will be chosen
  // (a '1' at position i selects triplet i for the current test)
  std::string bitmask(M, 1); bitmask.resize(numPossible, 0);

  // NOTE(review): rng is default-constructed and rand() is never seeded here,
  // so a "randomsweep" run is deterministic across invocations — confirm
  // whether seeding (e.g. from an env var) is desired
  auto rng = std::default_random_engine {};
  auto cpuStart = std::chrono::high_resolution_clock::now();

  while (1)
  {
    if (isRandom)
    {
      // Pick random number of simultaneous transfers to execute
      // NOTE: This currently skews distribution due to some #s having more possibilities than others
      // NOTE(review): rand() % (max - min) yields [min, max-1], so
      // maxParallelTransfers itself is never selected in random mode — confirm intended
      M = ((maxParallelTransfers > ev.sweepMin) ? (rand() % (maxParallelTransfers - ev.sweepMin)) : 0)
        + ev.sweepMin;

      // Generate a random bitmask
      for (int i = 0; i < numPossible; i++)
        bitmask[i] = (i < M) ? 1 : 0;
      std::shuffle(bitmask.begin(), bitmask.end(), rng);
    }

    // Convert bitmask to list of Transfers
    std::vector<Transfer> transfers;
    for (int value = 0; value < numPossible; ++value)
    {
      if (bitmask[value])
      {
        // Convert integer value to (SRC->EXE->DST) triplet
        // (value is a mixed-radix number: src-major, then exe, then dst)
        Transfer transfer;
        int srcValue = value / numDsts / numExes;
        int exeValue = value / numDsts % numExes;
        int dstValue = value % numDsts;
        transfer.srcMemType = srcList[srcValue].first;
        transfer.srcIndex   = srcList[srcValue].second;
        // With SWEEP_SRC_IS_EXE the executor mirrors the source device
        transfer.exeMemType = ev.sweepSrcIsExe ? transfer.srcMemType : exeList[exeValue].first;
        transfer.exeIndex   = ev.sweepSrcIsExe ? transfer.srcIndex   : exeList[exeValue].second;
        transfer.dstMemType = dstList[dstValue].first;
        transfer.dstIndex   = dstList[dstValue].second;
        // GPU executors use a fixed 4 blocks; CPU executors use NUM_CPU_PER_TRANSFER
        // NOTE(review): the GPU block count 4 is hard-coded — consider an env var
        transfer.numBlocksToUse = IsGpuType(transfer.exeMemType) ? 4 : ev.numCpuPerTransfer;
        transfer.transferIndex = transfers.size();
        transfers.push_back(transfer);
      }
    }
    ExecuteTransfers(ev, ++numTestsRun, valuesOfN, transfers);

    // Check for test limit
    if (numTestsRun == ev.sweepTestLimit)
    {
      printf("Test limit reached\n");
      break;
    }

    // Check for time limit (wall-clock seconds since the sweep started)
    auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
    double totalCpuTime = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count();
    if (ev.sweepTimeLimit && totalCpuTime > ev.sweepTimeLimit)
    {
      printf("Time limit exceeded\n");
      break;
    }

    // Increment bitmask if not random sweep
    // (prev_permutation walks all C(numPossible, M) subsets; when exhausted,
    //  move on to subsets of size M+1)
    if (!isRandom && !std::prev_permutation(bitmask.begin(), bitmask.end()))
    {
      M++;

      // Check for completion
      if (M > maxParallelTransfers)
      {
        printf("Sweep complete\n");
        break;
      }

      for (int i = 0; i < numPossible; i++)
        bitmask[i] = (i < M) ? 1 : 0;
    }
  }
}
...@@ -61,8 +61,27 @@ typedef enum ...@@ -61,8 +61,27 @@ typedef enum
MEM_GPU_FINE = 3 // Fine-grained global GPU memory MEM_GPU_FINE = 3 // Fine-grained global GPU memory
} MemType; } MemType;
// Returns true when the given memory type lives on a GPU
// (coarse- or fine-grained global GPU memory)
bool IsGpuType(MemType m)
{
  switch (m)
  {
  case MEM_GPU:
  case MEM_GPU_FINE:
    return true;
  default:
    return false;
  }
}
char const MemTypeStr[5] = "CGBF"; char const MemTypeStr[5] = "CGBF";
// Converts a single memory-type character (as used in config files and the
// SWEEP_* environment variables) to its MemType enum value.
// Terminates the program on any unrecognized character.
MemType inline CharToMemType(char const c)
{
  if (c == 'C') return MEM_CPU;       // coarse-grained CPU memory
  if (c == 'G') return MEM_GPU;       // coarse-grained GPU memory
  if (c == 'B') return MEM_CPU_FINE;  // fine-grained CPU memory
  if (c == 'F') return MEM_GPU_FINE;  // fine-grained GPU memory

  printf("[ERROR] Unexpected mem type (%c)\n", c);
  exit(1);
}
typedef enum typedef enum
{ {
MODE_FILL = 0, // Fill data with pattern MODE_FILL = 0, // Fill data with pattern
...@@ -141,7 +160,10 @@ void ParseMemType(std::string const& token, int const numCpus, int const numGpus ...@@ -141,7 +160,10 @@ void ParseMemType(std::string const& token, int const numCpus, int const numGpus
MemType* memType, int* memIndex); MemType* memType, int* memIndex);
void ParseTransfers(char* line, int numCpus, int numGpus, void ParseTransfers(char* line, int numCpus, int numGpus,
TransferMap& transferMap); std::vector<Transfer>& transfers);
void ExecuteTransfers(EnvVars const& ev, int testNum, std::vector<size_t> const& valuesOfN,
std::vector<Transfer>& transfers);
void EnablePeerAccess(int const deviceId, int const peerDeviceId); void EnablePeerAccess(int const deviceId, int const peerDeviceId);
void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr); void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr);
...@@ -150,6 +172,7 @@ void CheckPages(char* byteArray, size_t numBytes, int targetId); ...@@ -150,6 +172,7 @@ void CheckPages(char* byteArray, size_t numBytes, int targetId);
void CheckOrFill(ModeType mode, int N, bool isMemset, bool isHipCall, std::vector<float> const& fillPattern, float* ptr); void CheckOrFill(ModeType mode, int N, bool isMemset, bool isHipCall, std::vector<float> const& fillPattern, float* ptr);
void RunTransfer(EnvVars const& ev, size_t const N, int const iteration, ExecutorInfo& exeInfo, int const transferIdx); void RunTransfer(EnvVars const& ev, size_t const N, int const iteration, ExecutorInfo& exeInfo, int const transferIdx);
void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N, int numBlocksToUse, int readMode, int skipCpu); void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N, int numBlocksToUse, int readMode, int skipCpu);
void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, bool const isRandom);
// Return the maximum bandwidth measured for given (src/dst) pair // Return the maximum bandwidth measured for given (src/dst) pair
double GetPeakBandwidth(EnvVars const& ev, double GetPeakBandwidth(EnvVars const& ev,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment