Commit 811064b5 authored by Gilbert Lee's avatar Gilbert Lee
Browse files

Adding new sweep environment variables

parent 93430da1
# Changelog for TransferBench # Changelog for TransferBench
## v1.04
### Added
- New environment variables for sweep based presets
- SWEEP_XGMI_MIN - Min number of XGMI hops for Transfers
- SWEEP_XGMI_MAX - Max number of XGMI hops for Transfers
- SWEEP_SEED - Random seed being used
- SWEEP_RAND_BYTES - Use a random number of bytes (up to the pre-specified N) for each Transfer
### Changed
- CSV output for sweep includes env vars section followed by output
- CSV output no longer lists env var parameters in columns
- Default number of warmup iterations changed from 3 to 1
- Splitting CSV output of link type to ExeToSrcLinkType and ExeToDstLinkType
## v1.03 ## v1.03
### Added ### Added
- New stress-test benchmark preset modes "sweep" and "rsweep" (random sweep) - New stress-test benchmark preset modes "sweep" and "rsweep" (random sweep)
......
...@@ -24,17 +24,25 @@ THE SOFTWARE. ...@@ -24,17 +24,25 @@ THE SOFTWARE.
#define ENVVARS_HPP #define ENVVARS_HPP
#include <algorithm> #include <algorithm>
#include <random>
#define TB_VERSION "1.03" #include <time.h>
#define TB_VERSION "1.04"
extern char const MemTypeStr[]; extern char const MemTypeStr[];
// Records which top-level mode this run of TransferBench is operating in.
// Enumerators take the implicit values 0, 1, 2 in declaration order.
enum ConfigModeEnum
{
  CFG_FILE,   // Transfers come from a configuration file given on the command line
  CFG_P2P,    // Peer-to-peer benchmark preset
  CFG_SWEEP   // "sweep" / "rsweep" preset
};
// This class manages environment variables that affect TransferBench // This class manages environment variables that affect TransferBench
class EnvVars class EnvVars
{ {
public: public:
// Default configuration values // Default configuration values
int const DEFAULT_NUM_WARMUPS = 3; int const DEFAULT_NUM_WARMUPS = 1;
int const DEFAULT_NUM_ITERATIONS = 10; int const DEFAULT_NUM_ITERATIONS = 10;
int const DEFAULT_SAMPLING_FACTOR = 1; int const DEFAULT_SAMPLING_FACTOR = 1;
int const DEFAULT_NUM_CPU_PER_TRANSFER = 4; int const DEFAULT_NUM_CPU_PER_TRANSFER = 4;
...@@ -73,10 +81,20 @@ public: ...@@ -73,10 +81,20 @@ public:
int sweepMax; // Max number of simultaneous Transfers to be executed per test int sweepMax; // Max number of simultaneous Transfers to be executed per test
int sweepTestLimit; // Max number of tests to run during sweep (0 = no limit) int sweepTestLimit; // Max number of tests to run during sweep (0 = no limit)
int sweepTimeLimit; // Max number of seconds to run sweep for (0 = no limit) int sweepTimeLimit; // Max number of seconds to run sweep for (0 = no limit)
int sweepXgmiMin; // Min number of XGMI hops for Transfers
int sweepXgmiMax; // Max number of XGMI hops for Transfers (-1 = no limit)
int sweepSeed; // Random seed to use
int sweepRandBytes; // Whether or not to use random number of bytes per Transfer
std::string sweepSrc; // Set of src memory types to be swept std::string sweepSrc; // Set of src memory types to be swept
std::string sweepExe; // Set of executors to be swept std::string sweepExe; // Set of executors to be swept
std::string sweepDst; // Set of dst memory types to be swept std::string sweepDst; // Set of dst memory types to be swept
// Used to track current configuration mode
ConfigModeEnum configMode;
// Random generator
std::default_random_engine *generator;
// Constructor that collects values // Constructor that collects values
EnvVars() EnvVars()
{ {
...@@ -104,14 +122,23 @@ public: ...@@ -104,14 +122,23 @@ public:
usePcieIndexing = GetEnvVar("USE_PCIE_INDEX" , 0); usePcieIndexing = GetEnvVar("USE_PCIE_INDEX" , 0);
useSingleStream = GetEnvVar("USE_SINGLE_STREAM" , 0); useSingleStream = GetEnvVar("USE_SINGLE_STREAM" , 0);
sweepSrcIsExe = GetEnvVar("SWEEP_SRC_IS_EXE", DEFAULT_SWEEP_SRC_IS_EXE); sweepSrcIsExe = GetEnvVar("SWEEP_SRC_IS_EXE" , DEFAULT_SWEEP_SRC_IS_EXE);
sweepMin = GetEnvVar("SWEEP_MIN", DEFAULT_SWEEP_MIN); sweepMin = GetEnvVar("SWEEP_MIN" , DEFAULT_SWEEP_MIN);
sweepMax = GetEnvVar("SWEEP_MAX", DEFAULT_SWEEP_MAX); sweepMax = GetEnvVar("SWEEP_MAX" , DEFAULT_SWEEP_MAX);
sweepSrc = GetEnvVar("SWEEP_SRC", DEFAULT_SWEEP_SRC); sweepSrc = GetEnvVar("SWEEP_SRC" , DEFAULT_SWEEP_SRC);
sweepExe = GetEnvVar("SWEEP_EXE", DEFAULT_SWEEP_EXE); sweepExe = GetEnvVar("SWEEP_EXE" , DEFAULT_SWEEP_EXE);
sweepDst = GetEnvVar("SWEEP_DST", DEFAULT_SWEEP_DST); sweepDst = GetEnvVar("SWEEP_DST" , DEFAULT_SWEEP_DST);
sweepTestLimit = GetEnvVar("SWEEP_TEST_LIMIT", DEFAULT_SWEEP_TEST_LIMIT); sweepTestLimit = GetEnvVar("SWEEP_TEST_LIMIT" , DEFAULT_SWEEP_TEST_LIMIT);
sweepTimeLimit = GetEnvVar("SWEEP_TIME_LIMIT", DEFAULT_SWEEP_TIME_LIMIT); sweepTimeLimit = GetEnvVar("SWEEP_TIME_LIMIT" , DEFAULT_SWEEP_TIME_LIMIT);
sweepXgmiMin = GetEnvVar("SWEEP_XGMI_MIN" , 0);
sweepXgmiMax = GetEnvVar("SWEEP_XGMI_MAX" , -1);
sweepRandBytes = GetEnvVar("SWEEP_RAND_BYTES" , 0);
// Determine random seed
char *sweepSeedStr = getenv("SWEEP_SEED");
sweepSeed = (sweepSeedStr != NULL ? atoi(sweepSeedStr) : time(NULL));
generator = new std::default_random_engine(sweepSeed);
// Check for fill pattern // Check for fill pattern
char* pattern = getenv("FILL_PATTERN"); char* pattern = getenv("FILL_PATTERN");
...@@ -303,10 +330,10 @@ public: ...@@ -303,10 +330,10 @@ public:
printf("%-20s = %12d : Using %d CPU devices\n" , "NUM_CPU_DEVICES", numCpuDevices, numCpuDevices); printf("%-20s = %12d : Using %d CPU devices\n" , "NUM_CPU_DEVICES", numCpuDevices, numCpuDevices);
printf("%-20s = %12d : Using %d CPU thread(s) per CPU-executed Transfer\n", "NUM_CPU_PER_TRANSFER", numCpuPerTransfer, numCpuPerTransfer); printf("%-20s = %12d : Using %d CPU thread(s) per CPU-executed Transfer\n", "NUM_CPU_PER_TRANSFER", numCpuPerTransfer, numCpuPerTransfer);
printf("%-20s = %12d : Using %d GPU devices\n", "NUM_GPU_DEVICES", numGpuDevices, numGpuDevices); printf("%-20s = %12d : Using %d GPU devices\n", "NUM_GPU_DEVICES", numGpuDevices, numGpuDevices);
printf("%-20s = %12d : Running %d %s per test\n", "NUM_ITERATIONS", numIterations, printf("%-20s = %12d : Running %d %s per Test\n", "NUM_ITERATIONS", numIterations,
numIterations > 0 ? numIterations : -numIterations, numIterations > 0 ? numIterations : -numIterations,
numIterations > 0 ? "timed iteration(s)" : "second(s)"); numIterations > 0 ? "timed iteration(s)" : "second(s)");
printf("%-20s = %12d : Running %d warmup iteration(s) per topology\n", "NUM_WARMUPS", numWarmups, numWarmups); printf("%-20s = %12d : Running %d warmup iteration(s) per Test\n", "NUM_WARMUPS", numWarmups, numWarmups);
printf("%-20s = %12d : Output to %s\n", "OUTPUT_TO_CSV", outputToCsv, printf("%-20s = %12d : Output to %s\n", "OUTPUT_TO_CSV", outputToCsv,
outputToCsv ? "CSV" : "console"); outputToCsv ? "CSV" : "console");
printf("%-20s = %12s : Using %d shared mem per threadblock\n", "SHARED_MEM_BYTES", printf("%-20s = %12s : Using %d shared mem per threadblock\n", "SHARED_MEM_BYTES",
...@@ -329,6 +356,30 @@ public: ...@@ -329,6 +356,30 @@ public:
useSingleStream, (useSingleStream ? "device" : "Transfer")); useSingleStream, (useSingleStream ? "device" : "Transfer"));
printf("\n"); printf("\n");
} }
else
{
printf("EnvVar,Value,Description,(TransferBench v%s)\n", TB_VERSION);
printf("BLOCK_BYTES,%d,Each CU gets a multiple of %d bytes to copy\n", blockBytes, blockBytes);
printf("BYTE_OFFSET,%d,Using byte offset of %d\n", byteOffset, byteOffset);
printf("FILL_PATTERN,%s,", getenv("FILL_PATTERN") ? "(specified)" : "(unset)");
if (fillPattern.size())
printf("Pattern: %s", getenv("FILL_PATTERN"));
else
printf("Pseudo-random: (Element i = i modulo 383 + 31)");
printf("\n");
printf("NUM_CPU_DEVICES,%d,Using %d CPU devices\n" , numCpuDevices, numCpuDevices);
printf("NUM_CPU_PER_TRANSFER,%d,Using %d CPU thread(s) per CPU-executed Transfer\n", numCpuPerTransfer, numCpuPerTransfer);
printf("NUM_GPU_DEVICES,%d,Using %d GPU devices\n", numGpuDevices, numGpuDevices);
printf("NUM_ITERATIONS,%d,Running %d %s per Test\n", numIterations,
numIterations > 0 ? numIterations : -numIterations,
numIterations > 0 ? "timed iteration(s)" : "second(s)");
printf("NUM_WARMUPS,%d,Running %d warmup iteration(s) per Test\n", numWarmups, numWarmups);
printf("SHARED_MEM_BYTES,%d,Using %d shared mem per threadblock\n", sharedMemBytes, sharedMemBytes);
printf("USE_HIP_CALL,%d,Using %s for GPU-executed copies\n", useHipCall, useHipCall ? "HIP functions" : "custom kernels");
printf("USE_MEMSET,%d,Performing %s\n", useMemset, useMemset ? "memset" : "memcopy");
printf("USE_PCIE_INDEX,%d,Using %s-based GPU indexing\n", usePcieIndexing, (usePcieIndexing ? "PCIe" : "HIP"));
printf("USE_SINGLE_STREAM,%d,Using single stream per %s\n", useSingleStream, (useSingleStream ? "device" : "Transfer"));
}
}; };
// Display env var settings // Display env var settings
...@@ -338,6 +389,7 @@ public: ...@@ -338,6 +389,7 @@ public:
{ {
printf("Sweep configuration (TransferBench v%s)\n", TB_VERSION); printf("Sweep configuration (TransferBench v%s)\n", TB_VERSION);
printf("=====================================================\n"); printf("=====================================================\n");
printf("%-20s = %12d : Random seed\n", "SWEEP_SEED", sweepSeed);
printf("%-20s = %12s : Source Memory Types to sweep\n", "SWEEP_SRC", sweepSrc.c_str()); printf("%-20s = %12s : Source Memory Types to sweep\n", "SWEEP_SRC", sweepSrc.c_str());
printf("%-20s = %12s : Executor Types to sweep\n", "SWEEP_EXE", sweepExe.c_str()); printf("%-20s = %12s : Executor Types to sweep\n", "SWEEP_EXE", sweepExe.c_str());
printf("%-20s = %12s : Destination Memory Types to sweep\n", "SWEEP_DST", sweepDst.c_str()); printf("%-20s = %12s : Destination Memory Types to sweep\n", "SWEEP_DST", sweepDst.c_str());
...@@ -346,6 +398,9 @@ public: ...@@ -346,6 +398,9 @@ public:
printf("%-20s = %12d : Max simultaneous Transfers (0 = no limit)\n", "SWEEP_MAX", sweepMax); printf("%-20s = %12d : Max simultaneous Transfers (0 = no limit)\n", "SWEEP_MAX", sweepMax);
printf("%-20s = %12d : Max number of tests to run during sweep (0 = no limit)\n", "SWEEP_TEST_LIMIT", sweepTestLimit); printf("%-20s = %12d : Max number of tests to run during sweep (0 = no limit)\n", "SWEEP_TEST_LIMIT", sweepTestLimit);
printf("%-20s = %12d : Max number of seconds to run sweep for (0 = no limit)\n", "SWEEP_TIME_LIMIT", sweepTimeLimit); printf("%-20s = %12d : Max number of seconds to run sweep for (0 = no limit)\n", "SWEEP_TIME_LIMIT", sweepTimeLimit);
printf("%-20s = %12d : Min number of XGMI hops for Transfers\n", "SWEEP_XGMI_MIN", sweepXgmiMin);
printf("%-20s = %12d : Max number of XGMI hops for Transfers (-1 = no limit)\n", "SWEEP_XGMI_MAX", sweepXgmiMax);
printf("%-20s = %12d : Using %s number of bytes per Transfer\n", "SWEEP_RAND_BYTES", sweepRandBytes, sweepRandBytes ? "random" : "constant");
printf("%-20s = %12d : Using %d CPU devices\n" , "NUM_CPU_DEVICES", numCpuDevices, numCpuDevices); printf("%-20s = %12d : Using %d CPU devices\n" , "NUM_CPU_DEVICES", numCpuDevices, numCpuDevices);
printf("%-20s = %12d : Using %d CPU thread(s) per CPU-executed Transfer\n", "NUM_CPU_PER_TRANSFER", numCpuPerTransfer, numCpuPerTransfer); printf("%-20s = %12d : Using %d CPU thread(s) per CPU-executed Transfer\n", "NUM_CPU_PER_TRANSFER", numCpuPerTransfer, numCpuPerTransfer);
printf("%-20s = %12d : Using %d GPU devices\n", "NUM_GPU_DEVICES", numGpuDevices, numGpuDevices); printf("%-20s = %12d : Using %d GPU devices\n", "NUM_GPU_DEVICES", numGpuDevices, numGpuDevices);
...@@ -357,10 +412,10 @@ public: ...@@ -357,10 +412,10 @@ public:
else else
printf("Pseudo-random: (Element i = i modulo 383 + 31)"); printf("Pseudo-random: (Element i = i modulo 383 + 31)");
printf("\n"); printf("\n");
printf("%-20s = %12d : Running %d %s per test\n", "NUM_ITERATIONS", numIterations, printf("%-20s = %12d : Running %d %s per Test\n", "NUM_ITERATIONS", numIterations,
numIterations > 0 ? numIterations : -numIterations, numIterations > 0 ? numIterations : -numIterations,
numIterations > 0 ? "timed iteration(s)" : "second(s)"); numIterations > 0 ? "timed iteration(s)" : "second(s)");
printf("%-20s = %12d : Running %d warmup iteration(s) per topology\n", "NUM_WARMUPS", numWarmups, numWarmups); printf("%-20s = %12d : Running %d warmup iteration(s) per Test\n", "NUM_WARMUPS", numWarmups, numWarmups);
printf("%-20s = %12d : Output to %s\n", "OUTPUT_TO_CSV", outputToCsv, printf("%-20s = %12d : Output to %s\n", "OUTPUT_TO_CSV", outputToCsv,
outputToCsv ? "CSV" : "console"); outputToCsv ? "CSV" : "console");
printf("%-20s = %12s : Using %d shared mem per threadblock\n", "SHARED_MEM_BYTES", printf("%-20s = %12s : Using %d shared mem per threadblock\n", "SHARED_MEM_BYTES",
...@@ -379,6 +434,41 @@ public: ...@@ -379,6 +434,41 @@ public:
useSingleStream, (useSingleStream ? "device" : "Transfer")); useSingleStream, (useSingleStream ? "device" : "Transfer"));
printf("\n"); printf("\n");
} }
else
{
printf("EnvVar,Value,Description,(TransferBench v%s)\n", TB_VERSION);
printf("SWEEP_SRC,%s,Source Memory Types to sweep\n", sweepSrc.c_str());
printf("SWEEP_EXE,%s,Executor Types to sweep\n", sweepExe.c_str());
printf("SWEEP_DST,%s,Destination Memory Types to sweep\n", sweepDst.c_str());
printf("SWEEP_SRC_IS_EXE,%d, Transfer executor %s Transfer source\n", sweepSrcIsExe, sweepSrcIsExe ? "must match" : "may have any");
printf("SWEEP_SEED,%d,Random seed\n", sweepSeed);
printf("SWEEP_MIN,%d,Min simultaneous Transfers\n", sweepMin);
printf("SWEEP_MAX,%d,Max simultaneous Transfers (0 = no limit)\n", sweepMax);
printf("SWEEP_TEST_LIMIT,%d,Max number of tests to run during sweep (0 = no limit)\n", sweepTestLimit);
printf("SWEEP_TIME_LIMIT,%d,Max number of seconds to run sweep for (0 = no limit)\n", sweepTimeLimit);
printf("SWEEP_XGMI_MIN,%d,Min number of XGMI hops for Transfers\n", sweepXgmiMin);
printf("SWEEP_XGMI_MAX,%d,Max number of XGMI hops for Transfers (-1 = no limit)\n", sweepXgmiMax);
printf("SWEEP_RAND_BYTES,%d,Using %s number of bytes per Transfer\n", sweepRandBytes, sweepRandBytes ? "random" : "constant");
printf("NUM_CPU_DEVICES,%d,Using %d CPU devices\n" , numCpuDevices, numCpuDevices);
printf("NUM_CPU_PER_TRANSFER,%d,Using %d CPU thread(s) per CPU-executed Transfer\n", numCpuPerTransfer, numCpuPerTransfer);
printf("NUM_GPU_DEVICES,%d,Using %d GPU devices\n", numGpuDevices, numGpuDevices);
printf("BLOCK_BYTES,%d,Each CU gets a multiple of %d bytes to copy\n", blockBytes, blockBytes);
printf("BYTE_OFFSET,%d,Using byte offset of %d\n", byteOffset, byteOffset);
printf("FILL_PATTERN,%s,", getenv("FILL_PATTERN") ? "(specified)" : "(unset)");
if (fillPattern.size())
printf("Pattern: %s", getenv("FILL_PATTERN"));
else
printf("Pseudo-random: (Element i = i modulo 383 + 31)");
printf("\n");
printf("NUM_ITERATIONS,%d,Running %d %s per Test\n", numIterations,
numIterations > 0 ? numIterations : -numIterations,
numIterations > 0 ? "timed iteration(s)" : "second(s)");
printf("NUM_WARMUPS,%d,Running %d warmup iteration(s) per Test\n", numWarmups, numWarmups);
printf("SHARED_MEM_BYTES,%d,Using %d shared mem per threadblock\n", sharedMemBytes, sharedMemBytes);
printf("USE_HIP_CALL,%d,Using %s for GPU-executed copies\n", useHipCall, useHipCall ? "HIP functions" : "custom kernels");
printf("USE_PCIE_INDEX,%d,Using %s-based GPU indexing\n", usePcieIndexing, (usePcieIndexing ? "PCIe" : "HIP"));
printf("USE_SINGLE_STREAM,%d,Using single stream per %s\n", useSingleStream, (useSingleStream ? "device" : "Transfer"));
}
}; };
// Helper function that parses environment variable or sets to default value // Helper function that parses environment variable or sets to default value
......
...@@ -75,6 +75,7 @@ int main(int argc, char **argv) ...@@ -75,6 +75,7 @@ int main(int argc, char **argv)
// - Tests that sweep across possible sets of Transfers // - Tests that sweep across possible sets of Transfers
if (!strcmp(argv[1], "sweep") || !strcmp(argv[1], "rsweep")) if (!strcmp(argv[1], "sweep") || !strcmp(argv[1], "rsweep"))
{ {
ev.configMode = CFG_SWEEP;
RunSweepPreset(ev, numBytesPerTransfer, !strcmp(argv[1], "rsweep")); RunSweepPreset(ev, numBytesPerTransfer, !strcmp(argv[1], "rsweep"));
exit(0); exit(0);
} }
...@@ -94,11 +95,13 @@ int main(int argc, char **argv) ...@@ -94,11 +95,13 @@ int main(int argc, char **argv)
int skipCpu = (!strcmp(argv[1], "g2g" ) || !strcmp(argv[1], "g2g_rr") ? 1 : 0); int skipCpu = (!strcmp(argv[1], "g2g" ) || !strcmp(argv[1], "g2g_rr") ? 1 : 0);
// Execute peer to peer benchmark mode // Execute peer to peer benchmark mode
ev.configMode = CFG_P2P;
RunPeerToPeerBenchmarks(ev, numBytesPerTransfer / sizeof(float), numBlocksToUse, readMode, skipCpu); RunPeerToPeerBenchmarks(ev, numBytesPerTransfer / sizeof(float), numBlocksToUse, readMode, skipCpu);
exit(0); exit(0);
} }
// Check that Transfer configuration file can be opened // Check that Transfer configuration file can be opened
ev.configMode = CFG_FILE;
FILE* fp = fopen(argv[1], "r"); FILE* fp = fopen(argv[1], "r");
if (!fp) if (!fp)
{ {
...@@ -110,8 +113,8 @@ int main(int argc, char **argv) ...@@ -110,8 +113,8 @@ int main(int argc, char **argv)
ev.DisplayEnvVars(); ev.DisplayEnvVars();
if (ev.outputToCsv) if (ev.outputToCsv)
{ {
printf("Test,NumBytes,SrcMem,Executor,DstMem,CUs,BW(GB/s),Time(ms)," printf("Test#,Transfer#,NumBytes,Src,Exe,Dst,CUs,BW(GB/s),Time(ms),"
"TransferDesc,SrcAddr,DstAddr,ByteOffset,numWarmups,numIters\n"); "ExeToSrcLinkType,ExeToDstLinkType,SrcAddr,DstAddr\n");
} }
int testNum = 0; int testNum = 0;
...@@ -233,19 +236,23 @@ void ExecuteTransfers(EnvVars const& ev, ...@@ -233,19 +236,23 @@ void ExecuteTransfers(EnvVars const& ev,
// Loop over all the different number of bytes to use per Transfer // Loop over all the different number of bytes to use per Transfer
for (auto N : valuesOfN) for (auto N : valuesOfN)
{ {
if (!ev.outputToCsv) printf("Test %d: [%lu bytes]\n", testNum, N * sizeof(float)); std::uniform_int_distribution<int> distribution(1,N);
if (!ev.outputToCsv) printf("Test %d:\n", testNum);
// Prepare input memory and block parameters for current N // Prepare input memory and block parameters for current N
for (auto& exeInfoPair : transferMap) for (auto& exeInfoPair : transferMap)
{ {
ExecutorInfo& exeInfo = exeInfoPair.second; ExecutorInfo& exeInfo = exeInfoPair.second;
exeInfo.totalBytes = 0;
int transferOffset = 0; int transferOffset = 0;
for (int i = 0; i < exeInfo.transfers.size(); ++i) for (int i = 0; i < exeInfo.transfers.size(); ++i)
{ {
// Prepare subarrays each threadblock works on and fill src memory with patterned data // Prepare subarrays each threadblock works on and fill src memory with patterned data
Transfer& transfer = exeInfo.transfers[i]; Transfer& transfer = exeInfo.transfers[i];
transfer.PrepareBlockParams(ev, N); transfer.numBytes = ((ev.configMode == CFG_SWEEP && ev.sweepRandBytes) ? distribution(*ev.generator) : N) * sizeof(float);
transfer.PrepareBlockParams(ev, transfer.numBytes / sizeof(float));
exeInfo.totalBytes += transfer.numBytes;
// Copy block parameters to GPU for GPU executors // Copy block parameters to GPU for GPU executors
if (transfer.exeMemType == MEM_GPU) if (transfer.exeMemType == MEM_GPU)
...@@ -286,7 +293,7 @@ void ExecuteTransfers(EnvVars const& ev, ...@@ -286,7 +293,7 @@ void ExecuteTransfers(EnvVars const& ev,
int const numTransfersToRun = (IsGpuType(exeInfoPair.first.first) && ev.useSingleStream) ? int const numTransfersToRun = (IsGpuType(exeInfoPair.first.first) && ev.useSingleStream) ?
1 : exeInfo.transfers.size(); 1 : exeInfo.transfers.size();
for (int i = 0; i < numTransfersToRun; ++i) for (int i = 0; i < numTransfersToRun; ++i)
threads.push(std::thread(RunTransfer, std::ref(ev), N, iteration, std::ref(exeInfo), i)); threads.push(std::thread(RunTransfer, std::ref(ev), iteration, std::ref(exeInfo), i));
} }
// Wait for all threads to finish // Wait for all threads to finish
...@@ -317,13 +324,17 @@ void ExecuteTransfers(EnvVars const& ev, ...@@ -317,13 +324,17 @@ void ExecuteTransfers(EnvVars const& ev,
} }
// Validate that each transfer has transferred correctly // Validate that each transfer has transferred correctly
size_t totalBytesTransferred = 0;
int const numTransfers = transferList.size(); int const numTransfers = transferList.size();
for (auto transfer : transferList) for (auto transfer : transferList)
CheckOrFill(MODE_CHECK, N, ev.useMemset, ev.useHipCall, ev.fillPattern, transfer->dstMem + initOffset); {
CheckOrFill(MODE_CHECK, transfer->numBytes / sizeof(float), ev.useMemset, ev.useHipCall, ev.fillPattern, transfer->dstMem + initOffset);
totalBytesTransferred += transfer->numBytes;
}
// Report timings // Report timings
totalCpuTime = totalCpuTime / (1.0 * numTimedIterations) * 1000; totalCpuTime = totalCpuTime / (1.0 * numTimedIterations) * 1000;
double totalBandwidthGbs = (numTransfers * N * sizeof(float) / 1.0E6) / totalCpuTime; double totalBandwidthGbs = (totalBytesTransferred / 1.0E6) / totalCpuTime;
double maxGpuTime = 0; double maxGpuTime = 0;
if (ev.useSingleStream) if (ev.useSingleStream)
...@@ -343,56 +354,56 @@ void ExecuteTransfers(EnvVars const& ev, ...@@ -343,56 +354,56 @@ void ExecuteTransfers(EnvVars const& ev,
} }
double exeDurationMsec = exeInfo.totalTime / (1.0 * numTimedIterations); double exeDurationMsec = exeInfo.totalTime / (1.0 * numTimedIterations);
double exeBandwidthGbs = (exeInfo.transfers.size() * N * sizeof(float) / 1.0E9) / double exeBandwidthGbs = (exeInfo.totalBytes / 1.0E9) / exeDurationMsec * 1000.0f;
exeDurationMsec * 1000.0f;
maxGpuTime = std::max(maxGpuTime, exeDurationMsec); maxGpuTime = std::max(maxGpuTime, exeDurationMsec);
if (!ev.outputToCsv) if (!ev.outputToCsv)
{ {
printf(" Executor: %cPU %02d (# Transfers %02lu)| %9.3f GB/s | %8.3f ms |\n", printf(" Executor: %cPU %02d (# Transfers %02lu)| %9.3f GB/s | %8.3f ms | %12lu bytes\n",
MemTypeStr[exeMemType], exeIndex, exeInfo.transfers.size(), exeBandwidthGbs, exeDurationMsec); MemTypeStr[exeMemType], exeIndex, exeInfo.transfers.size(), exeBandwidthGbs, exeDurationMsec, exeInfo.totalBytes);
} }
int totalCUs = 0;
for (auto const& transfer : exeInfo.transfers) for (auto const& transfer : exeInfo.transfers)
{ {
double transferDurationMsec = transfer.transferTime / (1.0 * numTimedIterations); double transferDurationMsec = transfer.transferTime / (1.0 * numTimedIterations);
double transferBandwidthGbs = (N * sizeof(float) / 1.0E9) / transferDurationMsec * 1000.0f; double transferBandwidthGbs = (N * sizeof(float) / 1.0E9) / transferDurationMsec * 1000.0f;
totalCUs += transfer.exeMemType == MEM_CPU ? ev.numCpuPerTransfer : transfer.numBlocksToUse;
if (!ev.outputToCsv) if (!ev.outputToCsv)
{ {
printf(" Transfer %02d | %9.3f GB/s | %8.3f ms | %c%02d -> %c%02d:(%03d) -> %c%02d\n", printf(" Transfer %02d | %9.3f GB/s | %8.3f ms | %12lu bytes | %c%02d -> %c%02d:(%03d) -> %c%02d\n",
transfer.transferIndex, transfer.transferIndex,
transferBandwidthGbs, transferBandwidthGbs,
transferDurationMsec, transferDurationMsec,
transfer.numBytes,
MemTypeStr[transfer.srcMemType], transfer.srcIndex, MemTypeStr[transfer.srcMemType], transfer.srcIndex,
MemTypeStr[transfer.exeMemType], transfer.exeIndex, MemTypeStr[transfer.exeMemType], transfer.exeIndex,
transfer.exeMemType == MEM_CPU ? ev.numCpuPerTransfer : transfer.numBlocksToUse, transfer.exeMemType == MEM_CPU ? ev.numCpuPerTransfer : transfer.numBlocksToUse,
MemTypeStr[transfer.dstMemType], transfer.dstIndex); MemTypeStr[transfer.dstMemType], transfer.dstIndex);
} }
else else
{ {
printf("%d,%lu,%c%02d,%c%02d,%c%02d,%d,%.3f,%.3f,%s,%p,%p,%d,%d,%lu\n", printf("%d,%d,%lu,%c%02d,%c%02d,%c%02d,%d,%.3f,%.3f,%s,%s,%p,%p\n",
testNum, N * sizeof(float), testNum, transfer.transferIndex, transfer.numBytes,
MemTypeStr[transfer.srcMemType], transfer.srcIndex, MemTypeStr[transfer.srcMemType], transfer.srcIndex,
MemTypeStr[transfer.exeMemType], transfer.exeIndex, MemTypeStr[transfer.exeMemType], transfer.exeIndex,
MemTypeStr[transfer.dstMemType], transfer.dstIndex, MemTypeStr[transfer.dstMemType], transfer.dstIndex,
transfer.exeMemType == MEM_CPU ? ev.numCpuPerTransfer : transfer.numBlocksToUse, transfer.exeMemType == MEM_CPU ? ev.numCpuPerTransfer : transfer.numBlocksToUse,
transferBandwidthGbs, transferDurationMsec, transferBandwidthGbs, transferDurationMsec,
GetTransferDesc(transfer).c_str(), GetDesc(transfer.exeMemType, transfer.exeIndex, transfer.srcMemType, transfer.srcIndex).c_str(),
transfer.srcMem + initOffset, transfer.dstMem + initOffset, GetDesc(transfer.exeMemType, transfer.exeIndex, transfer.dstMemType, transfer.dstIndex).c_str(),
ev.byteOffset, transfer.srcMem + initOffset, transfer.dstMem + initOffset);
ev.numWarmups, numTimedIterations);
} }
} }
if (ev.outputToCsv) if (ev.outputToCsv)
{ {
printf("%d,%lu,ALL,%c%02d,ALL,ALL,%.3f,%.3f,ALL,ALL,ALL,%d,%d,%lu\n", printf("%d,ALL,%lu,ALL,%c%02d,ALL,%d,%.3f,%.3f,ALL,ALL,ALL,ALL\n",
testNum, N * sizeof(float), testNum, totalBytesTransferred,
MemTypeStr[exeMemType], exeIndex, MemTypeStr[exeMemType], exeIndex, totalCUs,
exeBandwidthGbs, exeDurationMsec, exeBandwidthGbs, exeDurationMsec);
ev.byteOffset,
ev.numWarmups, numTimedIterations);
} }
} }
} }
...@@ -401,32 +412,32 @@ void ExecuteTransfers(EnvVars const& ev, ...@@ -401,32 +412,32 @@ void ExecuteTransfers(EnvVars const& ev,
for (auto const& transfer : transferList) for (auto const& transfer : transferList)
{ {
double transferDurationMsec = transfer->transferTime / (1.0 * numTimedIterations); double transferDurationMsec = transfer->transferTime / (1.0 * numTimedIterations);
double transferBandwidthGbs = (N * sizeof(float) / 1.0E9) / transferDurationMsec * 1000.0f; double transferBandwidthGbs = (transfer->numBytes / 1.0E9) / transferDurationMsec * 1000.0f;
maxGpuTime = std::max(maxGpuTime, transferDurationMsec); maxGpuTime = std::max(maxGpuTime, transferDurationMsec);
if (!ev.outputToCsv) if (!ev.outputToCsv)
{ {
printf(" Transfer %02d: %c%02d -> [%cPU %02d:%03d] -> %c%02d | %9.3f GB/s | %8.3f ms | %-16s\n", printf(" Transfer %02d: %c%02d -> [%cPU %02d:%03d] -> %c%02d | %9.3f GB/s | %8.3f ms | %12lu bytes | %-16s\n",
transfer->transferIndex, transfer->transferIndex,
MemTypeStr[transfer->srcMemType], transfer->srcIndex, MemTypeStr[transfer->srcMemType], transfer->srcIndex,
MemTypeStr[transfer->exeMemType], transfer->exeIndex, MemTypeStr[transfer->exeMemType], transfer->exeIndex,
transfer->exeMemType == MEM_CPU ? ev.numCpuPerTransfer : transfer->numBlocksToUse, transfer->exeMemType == MEM_CPU ? ev.numCpuPerTransfer : transfer->numBlocksToUse,
MemTypeStr[transfer->dstMemType], transfer->dstIndex, MemTypeStr[transfer->dstMemType], transfer->dstIndex,
transferBandwidthGbs, transferDurationMsec, transferBandwidthGbs, transferDurationMsec,
transfer->numBytes,
GetTransferDesc(*transfer).c_str()); GetTransferDesc(*transfer).c_str());
} }
else else
{ {
printf("%d,%lu,%c%02d,%c%02d,%c%02d,%d,%.3f,%.3f,%s,%p,%p,%d,%d,%lu\n", printf("%d,%d,%lu,%c%02d,%c%02d,%c%02d,%d,%.3f,%.3f,%s,%s,%p,%p\n",
testNum, N * sizeof(float), testNum, transfer->transferIndex, transfer->numBytes,
MemTypeStr[transfer->srcMemType], transfer->srcIndex, MemTypeStr[transfer->srcMemType], transfer->srcIndex,
MemTypeStr[transfer->exeMemType], transfer->exeIndex, MemTypeStr[transfer->exeMemType], transfer->exeIndex,
MemTypeStr[transfer->dstMemType], transfer->dstIndex, MemTypeStr[transfer->dstMemType], transfer->dstIndex,
transfer->exeMemType == MEM_CPU ? ev.numCpuPerTransfer : transfer->numBlocksToUse, transfer->exeMemType == MEM_CPU ? ev.numCpuPerTransfer : transfer->numBlocksToUse,
transferBandwidthGbs, transferDurationMsec, transferBandwidthGbs, transferDurationMsec,
GetTransferDesc(*transfer).c_str(), GetDesc(transfer->exeMemType, transfer->exeIndex, transfer->srcMemType, transfer->srcIndex).c_str(),
transfer->srcMem + initOffset, transfer->dstMem + initOffset, GetDesc(transfer->exeMemType, transfer->exeIndex, transfer->dstMemType, transfer->dstIndex).c_str(),
ev.byteOffset, transfer->srcMem + initOffset, transfer->dstMem + initOffset);
ev.numWarmups, numTimedIterations);
} }
} }
} }
...@@ -434,14 +445,13 @@ void ExecuteTransfers(EnvVars const& ev, ...@@ -434,14 +445,13 @@ void ExecuteTransfers(EnvVars const& ev,
// Display aggregate statistics // Display aggregate statistics
if (!ev.outputToCsv) if (!ev.outputToCsv)
{ {
printf(" Aggregate Bandwidth (CPU timed) | %9.3f GB/s | %8.3f ms | Overhead: %.3f ms\n", printf(" Aggregate Bandwidth (CPU timed) | %9.3f GB/s | %8.3f ms | %12lu bytes | Overhead: %.3f ms\n",
totalBandwidthGbs, totalCpuTime, totalCpuTime - maxGpuTime); totalBandwidthGbs, totalCpuTime, totalBytesTransferred, totalCpuTime - maxGpuTime);
} }
else else
{ {
printf("%d,%lu,ALL,ALL,ALL,ALL,%.3f,%.3f,ALL,ALL,ALL,%d,%d,%lu\n", printf("%d,ALL,%lu,ALL,ALL,ALL,ALL,%.3f,%.3f,ALL,ALL,ALL,ALL\n",
testNum, N * sizeof(float), totalBandwidthGbs, totalCpuTime, ev.byteOffset, testNum, totalBytesTransferred, totalBandwidthGbs, totalCpuTime);
ev.numWarmups, numTimedIterations);
} }
} }
...@@ -991,7 +1001,7 @@ std::string GetTransferDesc(Transfer const& transfer) ...@@ -991,7 +1001,7 @@ std::string GetTransferDesc(Transfer const& transfer)
+ GetDesc(transfer.exeMemType, transfer.exeIndex, transfer.dstMemType, transfer.dstIndex); + GetDesc(transfer.exeMemType, transfer.exeIndex, transfer.dstMemType, transfer.dstIndex);
} }
void RunTransfer(EnvVars const& ev, size_t const N, int const iteration, void RunTransfer(EnvVars const& ev, int const iteration,
ExecutorInfo& exeInfo, int const transferIdx) ExecutorInfo& exeInfo, int const transferIdx)
{ {
Transfer& transfer = exeInfo.transfers[transferIdx]; Transfer& transfer = exeInfo.transfers[transferIdx];
...@@ -1016,11 +1026,11 @@ void RunTransfer(EnvVars const& ev, size_t const N, int const iteration, ...@@ -1016,11 +1026,11 @@ void RunTransfer(EnvVars const& ev, size_t const N, int const iteration,
// Execute hipMemset / hipMemcpy // Execute hipMemset / hipMemcpy
if (ev.useMemset) if (ev.useMemset)
HIP_CALL(hipMemsetAsync(transfer.dstMem + initOffset, 42, N * sizeof(float), stream)); HIP_CALL(hipMemsetAsync(transfer.dstMem + initOffset, 42, transfer.numBytes, stream));
else else
HIP_CALL(hipMemcpyAsync(transfer.dstMem + initOffset, HIP_CALL(hipMemcpyAsync(transfer.dstMem + initOffset,
transfer.srcMem + initOffset, transfer.srcMem + initOffset,
N * sizeof(float), hipMemcpyDefault, transfer.numBytes, hipMemcpyDefault,
stream)); stream));
// Record stop event // Record stop event
HIP_CALL(hipEventRecord(stopEvent, stream)); HIP_CALL(hipEventRecord(stopEvent, stream));
...@@ -1259,7 +1269,7 @@ double GetPeakBandwidth(EnvVars const& ev, ...@@ -1259,7 +1269,7 @@ double GetPeakBandwidth(EnvVars const& ev,
{ {
// Perform timed iterations // Perform timed iterations
for (int i = 0; i <= isBidirectional; i++) for (int i = 0; i <= isBidirectional; i++)
threads.push(std::thread(RunTransfer, std::ref(ev), N, iteration, std::ref(exeInfo[i]), 0)); threads.push(std::thread(RunTransfer, std::ref(ev), iteration, std::ref(exeInfo[i]), 0));
// Wait for all threads to finish // Wait for all threads to finish
for (int i = 0; i <= isBidirectional; i++) for (int i = 0; i <= isBidirectional; i++)
...@@ -1412,28 +1422,118 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, bool co ...@@ -1412,28 +1422,118 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, bool co
} }
int numDsts = dstList.size(); int numDsts = dstList.size();
int const numPossible = numSrcs * numExes * numDsts; // Build array of possibilities, respecting any additional restrictions (e.g. XGMI hop count)
int maxParallelTransfers = (ev.sweepMax == 0 ? numPossible : ev.sweepMax); struct TransferInfo
if (ev.sweepSrcIsExe)
{ {
printf("Num possible (SRC/DST) triplets: (%d/%d) = %d\n", numSrcs, numDsts, numPossible); MemType srcMemType; int srcIndex;
} MemType exeMemType; int exeIndex;
else MemType dstMemType; int dstIndex;
};
// If either XGMI minimum is non-zero, or XGMI maximum is specified and non-zero then both links must be XGMI
bool const useXgmiOnly = (ev.sweepXgmiMin > 0 || ev.sweepXgmiMax > 0);
std::vector<TransferInfo> possibleTransfers;
TransferInfo tinfo;
for (int i = 0; i < numExes; ++i)
{ {
printf("Num possible (SRC/EXE/DST) triplets: (%d/%d/%d) = %d\n", numSrcs, numExes, numDsts, numPossible); // Skip CPU executors if XGMI link must be used
if (useXgmiOnly && !IsGpuType(exeList[i].first)) continue;
tinfo.exeMemType = exeList[i].first;
tinfo.exeIndex = exeList[i].second;
bool isXgmiSrc = false;
int numHopsSrc = 0;
for (int j = 0; j < numSrcs; ++j)
{
if (IsGpuType(exeList[i].first) && IsGpuType(srcList[j].first))
{
if (exeList[i].second != srcList[j].second)
{
uint32_t exeToSrcLinkType, exeToSrcHopCount;
HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(exeList[i].second, MEM_GPU),
RemappedIndex(srcList[j].second, MEM_GPU),
&exeToSrcLinkType,
&exeToSrcHopCount));
isXgmiSrc = (exeToSrcLinkType == HSA_AMD_LINK_INFO_TYPE_XGMI);
if (isXgmiSrc) numHopsSrc = exeToSrcHopCount;
}
else
{
isXgmiSrc = true;
numHopsSrc = 0;
}
// Skip this SRC if it is not XGMI but only XGMI links may be used
if (useXgmiOnly && !isXgmiSrc) continue;
// Skip this SRC if XGMI distance is already past limit
if (ev.sweepXgmiMax >= 0 && isXgmiSrc && numHopsSrc > ev.sweepXgmiMax) continue;
}
else if (useXgmiOnly) continue;
tinfo.srcMemType = srcList[j].first;
tinfo.srcIndex = srcList[j].second;
bool isXgmiDst = false;
int numHopsDst = 0;
for (int k = 0; k < numDsts; ++k)
{
if (IsGpuType(exeList[i].first) && IsGpuType(dstList[k].first))
{
if (exeList[i].second != dstList[k].second)
{
uint32_t exeToDstLinkType, exeToDstHopCount;
HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(exeList[i].second, MEM_GPU),
RemappedIndex(dstList[k].second, MEM_GPU),
&exeToDstLinkType,
&exeToDstHopCount));
isXgmiDst = (exeToDstLinkType == HSA_AMD_LINK_INFO_TYPE_XGMI);
if (isXgmiDst) numHopsDst = exeToDstHopCount;
}
else
{
isXgmiDst = true;
numHopsDst = 0;
}
}
// Skip this DST if it is not XGMI but only XGMI links may be used
if (useXgmiOnly && !isXgmiDst) continue;
// Skip this DST if total XGMI distance (SRC + DST) is less than min limit
if (ev.sweepXgmiMin > 0 && (numHopsSrc + numHopsDst < ev.sweepXgmiMin)) continue;
// Skip this DST if total XGMI distance (SRC + DST) is greater than max limit
if (ev.sweepXgmiMax >= 0 && (numHopsSrc + numHopsDst) > ev.sweepXgmiMax) continue;
tinfo.dstMemType = dstList[k].first;
tinfo.dstIndex = dstList[k].second;
possibleTransfers.push_back(tinfo);
}
}
} }
int const numPossible = (int)possibleTransfers.size();
int maxParallelTransfers = (ev.sweepMax == 0 ? numPossible : ev.sweepMax);
if (ev.sweepMin > numPossible) if (ev.sweepMin > numPossible)
{ {
printf("No valid test configurations exist\n"); printf("No valid test configurations exist\n");
return; return;
} }
if (ev.outputToCsv)
{
printf("\nTest#,Transfer#,NumBytes,Src,Exe,Dst,CUs,BW(GB/s),Time(ms),"
"ExeToSrcLinkType,ExeToDstLinkType,SrcAddr,DstAddr\n");
}
int numTestsRun = 0; int numTestsRun = 0;
int M = ev.sweepMin; int M = ev.sweepMin;
// Create bitmask of numPossible triplets, of which M will be chosen // Create bitmask of numPossible triplets, of which M will be chosen
std::string bitmask(M, 1); bitmask.resize(numPossible, 0); std::string bitmask(M, 1); bitmask.resize(numPossible, 0);
auto rng = std::default_random_engine {};
auto cpuStart = std::chrono::high_resolution_clock::now(); auto cpuStart = std::chrono::high_resolution_clock::now();
while (1) while (1)
{ {
...@@ -1447,7 +1547,7 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, bool co ...@@ -1447,7 +1547,7 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, bool co
// Generate a random bitmask // Generate a random bitmask
for (int i = 0; i < numPossible; i++) for (int i = 0; i < numPossible; i++)
bitmask[i] = (i < M) ? 1 : 0; bitmask[i] = (i < M) ? 1 : 0;
std::shuffle(bitmask.begin(), bitmask.end(), rng); std::shuffle(bitmask.begin(), bitmask.end(), *ev.generator);
} }
// Convert bitmask to list of Transfers // Convert bitmask to list of Transfers
...@@ -1458,17 +1558,14 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, bool co ...@@ -1458,17 +1558,14 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, bool co
{ {
// Convert integer value to (SRC->EXE->DST) triplet // Convert integer value to (SRC->EXE->DST) triplet
Transfer transfer; Transfer transfer;
int srcValue = value / numDsts / numExes; transfer.srcMemType = possibleTransfers[value].srcMemType;
int exeValue = value / numDsts % numExes; transfer.srcIndex = possibleTransfers[value].srcIndex;
int dstValue = value % numDsts; transfer.exeMemType = possibleTransfers[value].exeMemType;
transfer.srcMemType = srcList[srcValue].first; transfer.exeIndex = possibleTransfers[value].exeIndex;
transfer.srcIndex = srcList[srcValue].second; transfer.dstMemType = possibleTransfers[value].dstMemType;
transfer.exeMemType = ev.sweepSrcIsExe ? transfer.srcMemType : exeList[exeValue].first; transfer.dstIndex = possibleTransfers[value].dstIndex;
transfer.exeIndex = ev.sweepSrcIsExe ? transfer.srcIndex : exeList[exeValue].second;
transfer.dstMemType = dstList[dstValue].first;
transfer.dstIndex = dstList[dstValue].second;
transfer.numBlocksToUse = IsGpuType(transfer.exeMemType) ? 4 : ev.numCpuPerTransfer; transfer.numBlocksToUse = IsGpuType(transfer.exeMemType) ? 4 : ev.numCpuPerTransfer;
transfer.transferIndex = transfers.size(); transfer.transferIndex = transfers.size();
transfers.push_back(transfer); transfers.push_back(transfer);
} }
} }
......
...@@ -111,6 +111,7 @@ struct Transfer ...@@ -111,6 +111,7 @@ struct Transfer
MemType dstMemType; // Destination memory type MemType dstMemType; // Destination memory type
int dstIndex; // Destination device index int dstIndex; // Destination device index
int numBlocksToUse; // Number of threadblocks to use for this Transfer int numBlocksToUse; // Number of threadblocks to use for this Transfer
size_t numBytes; // Number of bytes to Transfer
// Memory // Memory
float* srcMem; // Source memory float* srcMem; // Source memory
...@@ -132,6 +133,7 @@ typedef std::pair<MemType, int> Executor; ...@@ -132,6 +133,7 @@ typedef std::pair<MemType, int> Executor;
struct ExecutorInfo struct ExecutorInfo
{ {
std::vector<Transfer> transfers; // Transfers to execute std::vector<Transfer> transfers; // Transfers to execute
size_t totalBytes; // Total bytes this executor transfers
// For GPU-Executors // For GPU-Executors
int totalBlocks; // Total number of CUs/CPU threads to use int totalBlocks; // Total number of CUs/CPU threads to use
...@@ -170,7 +172,7 @@ void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPt ...@@ -170,7 +172,7 @@ void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPt
void DeallocateMemory(MemType memType, void* memPtr); void DeallocateMemory(MemType memType, void* memPtr);
void CheckPages(char* byteArray, size_t numBytes, int targetId); void CheckPages(char* byteArray, size_t numBytes, int targetId);
void CheckOrFill(ModeType mode, int N, bool isMemset, bool isHipCall, std::vector<float> const& fillPattern, float* ptr); void CheckOrFill(ModeType mode, int N, bool isMemset, bool isHipCall, std::vector<float> const& fillPattern, float* ptr);
void RunTransfer(EnvVars const& ev, size_t const N, int const iteration, ExecutorInfo& exeInfo, int const transferIdx); void RunTransfer(EnvVars const& ev, int const iteration, ExecutorInfo& exeInfo, int const transferIdx);
void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N, int numBlocksToUse, int readMode, int skipCpu); void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N, int numBlocksToUse, int readMode, int skipCpu);
void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, bool const isRandom); void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, bool const isRandom);
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment