Unverified Commit 5901ce0e authored by gilbertlee-amd's avatar gilbertlee-amd Committed by GitHub
Browse files

Adding direct destination mem validation, env var refactor (#19)

parent e6f64e97
# Changelog for TransferBench # Changelog for TransferBench
## v1.18
### Added
- Adding ability to validate GPU destination memory directly without going through CPU staging buffer (VALIDATE_DIRECT)
- NOTE: This will only work on AMD devices with large-bar access enabled and may slow things down considerably
### Changed
- Refactored how environment variables are displayed
- Mismatch stops after first detected error within an array instead of listing all mismatched elements
## v1.17 ## v1.17
### Added ### Added
- Allow switch to GFX kernel for source array initialization (USE_PREP_KERNEL) - Allow switch to GFX kernel for source array initialization (USE_PREP_KERNEL)
......
...@@ -1215,7 +1215,17 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N) ...@@ -1215,7 +1215,17 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
ev.useRemoteRead ? "Local" : "Remote", ev.useRemoteRead ? "Local" : "Remote",
ev.useDmaCopy ? "DMA" : "GFX"); ev.useDmaCopy ? "DMA" : "GFX");
printf("%10s", "SRC\\DST"); if (isBidirectional)
{
printf("%12s", "SRC\\DST");
}
else
{
if (ev.useRemoteRead)
printf("%12s", "SRC\\EXE+DST");
else
printf("%12s", "SRC+EXE\\DST");
}
for (int i = 0; i < numCpus; i++) printf("%7s %02d", "CPU", i); for (int i = 0; i < numCpus; i++) printf("%7s %02d", "CPU", i);
for (int i = 0; i < numGpus; i++) printf("%7s %02d", "GPU", i); for (int i = 0; i < numGpus; i++) printf("%7s %02d", "GPU", i);
printf("\n"); printf("\n");
...@@ -1228,7 +1238,7 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N) ...@@ -1228,7 +1238,7 @@ void RunPeerToPeerBenchmarks(EnvVars const& ev, size_t N)
int const srcIndex = (srcType == MEM_CPU ? src : src - numCpus); int const srcIndex = (srcType == MEM_CPU ? src : src - numCpus);
if (!ev.outputToCsv) if (!ev.outputToCsv)
printf("%7s %02d", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex); printf("%9s %02d", (srcType == MEM_CPU) ? "CPU" : "GPU", srcIndex);
for (int dst = 0; dst < numDevices; dst++) for (int dst = 0; dst < numDevices; dst++)
{ {
...@@ -1482,7 +1492,7 @@ void Transfer::ValidateDst(EnvVars const& ev) ...@@ -1482,7 +1492,7 @@ void Transfer::ValidateDst(EnvVars const& ev)
for (int dstIdx = 0; dstIdx < this->numDsts; ++dstIdx) for (int dstIdx = 0; dstIdx < this->numDsts; ++dstIdx)
{ {
float* output; float* output;
if (IsCpuType(this->dstType[dstIdx])) if (IsCpuType(this->dstType[dstIdx]) || ev.validateDirect)
{ {
output = this->dstMem[dstIdx] + initOffset; output = this->dstMem[dstIdx] + initOffset;
} }
...@@ -1525,6 +1535,8 @@ void Transfer::ValidateDst(EnvVars const& ev) ...@@ -1525,6 +1535,8 @@ void Transfer::ValidateDst(EnvVars const& ev)
this->DstToStr().c_str()); this->DstToStr().c_str());
if (!ev.continueOnError) if (!ev.continueOnError)
exit(1); exit(1);
else
break;
} }
} }
} }
......
...@@ -29,7 +29,7 @@ THE SOFTWARE. ...@@ -29,7 +29,7 @@ THE SOFTWARE.
#include "Compatibility.hpp" #include "Compatibility.hpp"
#include "Kernels.hpp" #include "Kernels.hpp"
#define TB_VERSION "1.17" #define TB_VERSION "1.18"
extern char const MemTypeStr[]; extern char const MemTypeStr[];
extern char const ExeTypeStr[]; extern char const ExeTypeStr[];
...@@ -77,6 +77,7 @@ public: ...@@ -77,6 +77,7 @@ public:
int usePcieIndexing; // Base GPU indexing on PCIe address instead of HIP device int usePcieIndexing; // Base GPU indexing on PCIe address instead of HIP device
int usePrepSrcKernel; // Use GPU kernel to prepare source data instead of copy (can't be used with fillPattern) int usePrepSrcKernel; // Use GPU kernel to prepare source data instead of copy (can't be used with fillPattern)
int useSingleStream; // Use a single stream per GPU GFX executor instead of stream per Transfer int useSingleStream; // Use a single stream per GPU GFX executor instead of stream per Transfer
int validateDirect; // Validate GPU destination memory directly instead of staging GPU memory on host
std::vector<float> fillPattern; // Pattern of floats used to fill source data std::vector<float> fillPattern; // Pattern of floats used to fill source data
...@@ -156,6 +157,7 @@ public: ...@@ -156,6 +157,7 @@ public:
usePcieIndexing = GetEnvVar("USE_PCIE_INDEX" , 0); usePcieIndexing = GetEnvVar("USE_PCIE_INDEX" , 0);
usePrepSrcKernel = GetEnvVar("USE_PREP_KERNEL" , 0); usePrepSrcKernel = GetEnvVar("USE_PREP_KERNEL" , 0);
useSingleStream = GetEnvVar("USE_SINGLE_STREAM" , 0); useSingleStream = GetEnvVar("USE_SINGLE_STREAM" , 0);
validateDirect = GetEnvVar("VALIDATE_DIRECT" , 0);
enableDebug = GetEnvVar("DEBUG" , 0); enableDebug = GetEnvVar("DEBUG" , 0);
gpuKernel = GetEnvVar("GPU_KERNEL" , defaultGpuKernel); gpuKernel = GetEnvVar("GPU_KERNEL" , defaultGpuKernel);
...@@ -382,218 +384,112 @@ public: ...@@ -382,218 +384,112 @@ public:
printf(" USE_PCIE_INDEX - Index GPUs by PCIe address-ordering instead of HIP-provided indexing\n"); printf(" USE_PCIE_INDEX - Index GPUs by PCIe address-ordering instead of HIP-provided indexing\n");
printf(" USE_PREP_KERNEL - Use GPU kernel to initialize source data array pattern\n"); printf(" USE_PREP_KERNEL - Use GPU kernel to initialize source data array pattern\n");
printf(" USE_SINGLE_STREAM - Use a single stream per GPU GFX executor instead of stream per Transfer\n"); printf(" USE_SINGLE_STREAM - Use a single stream per GPU GFX executor instead of stream per Transfer\n");
printf(" VALIDATE_DIRECT - Validate GPU destination memory directly instead of staging GPU memory on host\n");
} }
// Helper macros to switch between CSV and terminal output.
// Each prints one line of the form NAME<sep>VALUE<sep>DESCRIPTION followed by a newline.
// The separators are "," and "," when outputToCsv is non-zero, " = " and " : " otherwise.
// NAME is printed left-justified in a 20-character field and VALUE right-justified in a
// 12-character field in both modes. DESCRIPTION must be an expression that provides
// .c_str() (the call sites pass std::string); it is parenthesized so that concatenation
// expressions can be passed directly.
// NOTE: the macros refer to outputToCsv by name, so it must be in scope where they expand.
//
// PRINT_EV: VALUE is formatted with %d (integer-valued settings)
#define PRINT_EV(NAME, VALUE, DESCRIPTION) \
printf("%-20s%s%12d%s%s\n", NAME, outputToCsv ? "," : " = ", VALUE, outputToCsv ? "," : " : ", (DESCRIPTION).c_str())
// PRINT_ES: VALUE is formatted with %s (C-string-valued settings)
#define PRINT_ES(NAME, VALUE, DESCRIPTION) \
printf("%-20s%s%12s%s%s\n", NAME, outputToCsv ? "," : " = ", VALUE, outputToCsv ? "," : " : ", (DESCRIPTION).c_str())
// Display env var settings // Display env var settings
void DisplayEnvVars() const void DisplayEnvVars() const
{ {
if (!outputToCsv) if (!outputToCsv)
{ {
printf("Run configuration (TransferBench v%s)\n", TB_VERSION); printf("TransferBench v%s\n", TB_VERSION);
printf("=====================================================\n"); printf("=====================================================\n");
printf("%-20s = %12d : Each CU gets a multiple of %d bytes to copy\n", "BLOCK_BYTES", blockBytes, blockBytes); printf("[Common]\n");
printf("%-20s = %12d : Using byte offset of %d\n", "BYTE_OFFSET", byteOffset, byteOffset);
printf("%-20s = %12d : Continue on error\n", "CONTINUE_ON_ERROR", continueOnError);
printf("%-20s = %12s : ", "FILL_PATTERN", getenv("FILL_PATTERN") ? "(specified)" : "(unset)");
if (fillPattern.size())
printf("Pattern: %s", getenv("FILL_PATTERN"));
else
printf("Pseudo-random: %s", PrepSrcValueString().c_str());
printf("\n");
printf("%-20s = %12d : Using GPU kernel %d [%s]\n" , "GPU_KERNEL", gpuKernel, gpuKernel, GpuKernelNames[gpuKernel].c_str());
printf("%-20s = %12d : Using %d CPU devices\n" , "NUM_CPU_DEVICES", numCpuDevices, numCpuDevices);
printf("%-20s = %12d : Using %d GPU devices\n", "NUM_GPU_DEVICES", numGpuDevices, numGpuDevices);
printf("%-20s = %12d : Running %d %s per Test\n", "NUM_ITERATIONS", numIterations,
numIterations > 0 ? numIterations : -numIterations,
numIterations > 0 ? "timed iteration(s)" : "second(s)");
printf("%-20s = %12d : Running %d warmup iteration(s) per Test\n", "NUM_WARMUPS", numWarmups, numWarmups);
printf("%-20s = %12d : Output to %s\n", "OUTPUT_TO_CSV", outputToCsv,
outputToCsv ? "CSV" : "console");
printf("%-20s = %12s : Using %d shared mem per threadblock\n", "SHARED_MEM_BYTES",
getenv("SHARED_MEM_BYTES") ? "(specified)" : "(unset)", sharedMemBytes);
printf("%-20s = %12d : Running in %s mode\n", "USE_INTERACTIVE", useInteractive,
useInteractive ? "interactive" : "non-interactive");
printf("%-20s = %12d : Using %s-based GPU indexing\n", "USE_PCIE_INDEX",
usePcieIndexing, (usePcieIndexing ? "PCIe" : "HIP"));
printf("%-20s = %12d : Using %s to initialize source data\n", "USE_PREP_KERNEL",
usePrepSrcKernel, (usePrepSrcKernel ? "GPU kernels" : "hipMemcpy"));
printf("%-20s = %12d : Using single stream per %s\n", "USE_SINGLE_STREAM",
useSingleStream, (useSingleStream ? "device" : "Transfer"));
printf("\n");
} }
else else
{
printf("EnvVar,Value,Description,(TransferBench v%s)\n", TB_VERSION); printf("EnvVar,Value,Description,(TransferBench v%s)\n", TB_VERSION);
printf("BLOCK_BYTES,%d,Each CU gets a multiple of %d bytes to copy\n", blockBytes, blockBytes);
printf("BYTE_OFFSET,%d,Using byte offset of %d\n", byteOffset, byteOffset); PRINT_EV("BLOCK_BYTES", blockBytes,
printf("CONTINUE_ON_ERROR,%d,Continue test on mismatch error\n", continueOnError); std::string("Each CU gets a multiple of " + std::to_string(blockBytes) + " bytes to copy"));
printf("FILL_PATTERN,%s,", getenv("FILL_PATTERN") ? "(specified)" : "(unset)"); PRINT_EV("BYTE_OFFSET", byteOffset,
if (fillPattern.size()) std::string("Using byte offset of " + std::to_string(byteOffset)));
printf("Pattern: %s", getenv("FILL_PATTERN")); PRINT_EV("CONTINUE_ON_ERROR", continueOnError,
else std::string(continueOnError ? "Continue on mismatch error" : "Stop after first error"));
printf("Pseudo-random: %s", PrepSrcValueString().c_str()); PRINT_EV("FILL_PATTERN", getenv("FILL_PATTERN") ? 1 : 0,
printf("\n"); (fillPattern.size() ? std::string(getenv("FILL_PATTERN")) : PrepSrcValueString()));
printf("NUM_CPU_DEVICES,%d,Using %d CPU devices\n" , numCpuDevices, numCpuDevices); PRINT_EV("GPU_KERNEL", gpuKernel,
printf("NUM_GPU_DEVICES,%d,Using %d GPU devices\n", numGpuDevices, numGpuDevices); std::string("Using GPU kernel ") + std::to_string(gpuKernel) + " [" + std::string(GpuKernelNames[gpuKernel]) + "]");
printf("NUM_ITERATIONS,%d,Running %d %s per Test\n", numIterations, PRINT_EV("NUM_CPU_DEVICES", numCpuDevices,
numIterations > 0 ? numIterations : -numIterations, std::string("Using ") + std::to_string(numCpuDevices) + " CPU devices");
numIterations > 0 ? "timed iteration(s)" : "second(s)"); PRINT_EV("NUM_GPU_DEVICES", numGpuDevices,
printf("NUM_WARMUPS,%d,Running %d warmup iteration(s) per Test\n", numWarmups, numWarmups); std::string("Using ") + std::to_string(numGpuDevices) + " GPU devices");
printf("SHARED_MEM_BYTES,%d,Using %d shared mem per threadblock\n", sharedMemBytes, sharedMemBytes); PRINT_EV("NUM_ITERATIONS", numIterations,
printf("USE_PCIE_INDEX,%d,Using %s-based GPU indexing\n", usePcieIndexing, (usePcieIndexing ? "PCIe" : "HIP")); std::string("Running ") + std::to_string(numIterations > 0 ? numIterations : -numIterations) + " "
printf("USE_PREP_KERNEL,%d,Using %s to initialize source data\n", + (numIterations > 0 ? " timed iteration(s)" : "seconds(s) per Test"));
usePrepSrcKernel, (usePrepSrcKernel ? "GPU kernels" : "hipMemcpy")); PRINT_EV("NUM_WARMUPS", numWarmups,
printf("USE_SINGLE_STREAM,%d,Using single stream per %s\n", useSingleStream, (useSingleStream ? "device" : "Transfer")); std::string("Running " + std::to_string(numWarmups) + " warmup iteration(s) per Test"));
} PRINT_EV("SHARED_MEM_BYTES", sharedMemBytes,
std::string("Using " + std::to_string(sharedMemBytes) + " shared mem per threadblock"));
PRINT_EV("USE_INTERACTIVE", useInteractive,
std::string("Running in ") + (useInteractive ? "interactive" : "non-interactive") + " mode");
PRINT_EV("USE_PCIE_INDEX", usePcieIndexing,
std::string("Use ") + (usePcieIndexing ? "PCIe" : "HIP") + " GPU device indexing");
PRINT_EV("USE_PREP_KERNEL", usePrepSrcKernel,
std::string("Using ") + (usePrepSrcKernel ? "GPU kernels" : "hipMemcpy") + " to initialize source data");
PRINT_EV("USE_SINGLE_STREAM", useSingleStream,
std::string("Using single stream per ") + (useSingleStream ? "device" : "Transfer"));
PRINT_EV("VALIDATE_DIRECT", validateDirect,
std::string("Validate GPU destination memory ") + (validateDirect ? "directly" : "via CPU staging buffer"));
printf("\n");
}; };
// Display env var for P2P Benchmark preset // Display env var for P2P Benchmark preset
void DisplayP2PBenchmarkEnvVars() const void DisplayP2PBenchmarkEnvVars() const
{ {
DisplayEnvVars();
if (!outputToCsv) if (!outputToCsv)
{ printf("[P2P Related]\n");
printf("Peer-to-peer Benchmark configuration (TransferBench v%s)\n", TB_VERSION);
printf("=====================================================\n"); PRINT_EV("NUM_CPU_SE", numCpuSubExecs,
printf("%-20s = %12d : Using %s as executor\n", "USE_REMOTE_READ", useRemoteRead , useRemoteRead ? "DST" : "SRC"); std::string("Using ") + std::to_string(numCpuSubExecs) + " CPU subexecutors");
printf("%-20s = %12d : Using GPU-%s as GPU executor\n", "USE_GPU_DMA" , useDmaCopy , useDmaCopy ? "DMA" : "GFX"); PRINT_EV("NUM_GPU_SE", numGpuSubExecs,
printf("%-20s = %12d : Using %d CPU subexecutors\n", "NUM_CPU_SE" , numCpuSubExecs, numCpuSubExecs); std::string("Using ") + std::to_string(numGpuSubExecs) + " GPU subexecutors");
printf("%-20s = %12d : Using %d GPU subexecutors\n", "NUM_GPU_SE" , numGpuSubExecs, numGpuSubExecs); PRINT_EV("USE_GPU_DMA", useDmaCopy,
std::string("Using GPU-") + (useDmaCopy ? "DMA" : "GFX") + " as GPU executor");
printf("%-20s = %12d : Each CU gets a multiple of %d bytes to copy\n", "BLOCK_BYTES", blockBytes, blockBytes); PRINT_EV("USE_REMOTE_READ", useRemoteRead,
printf("%-20s = %12d : Using byte offset of %d\n", "BYTE_OFFSET", byteOffset, byteOffset); std::string("Using ") + (useRemoteRead ? "DST" : "SRC") + " as executor");
printf("%-20s = %12d : Continue on error\n", "CONTINUE_ON_ERROR", continueOnError); printf("\n");
printf("%-20s = %12s : ", "FILL_PATTERN", getenv("FILL_PATTERN") ? "(specified)" : "(unset)");
if (fillPattern.size())
printf("Pattern: %s", getenv("FILL_PATTERN"));
else
printf("Pseudo-random: %s", PrepSrcValueString().c_str());
printf("\n");
printf("%-20s = %12d : Using %d CPU devices\n" , "NUM_CPU_DEVICES", numCpuDevices, numCpuDevices);
printf("%-20s = %12d : Using %d GPU devices\n", "NUM_GPU_DEVICES", numGpuDevices, numGpuDevices);
printf("%-20s = %12d : Running %d %s per Test\n", "NUM_ITERATIONS", numIterations,
numIterations > 0 ? numIterations : -numIterations,
numIterations > 0 ? "timed iteration(s)" : "second(s)");
printf("%-20s = %12d : Running %d warmup iteration(s) per Test\n", "NUM_WARMUPS", numWarmups, numWarmups);
printf("%-20s = %12s : Using %d shared mem per threadblock\n", "SHARED_MEM_BYTES",
getenv("SHARED_MEM_BYTES") ? "(specified)" : "(unset)", sharedMemBytes);
printf("%-20s = %12d : Running in %s mode\n", "USE_INTERACTIVE", useInteractive,
useInteractive ? "interactive" : "non-interactive");
printf("%-20s = %12d : Using %s-based GPU indexing\n", "USE_PCIE_INDEX",
usePcieIndexing, (usePcieIndexing ? "PCIe" : "HIP"));
printf("%-20s = %12d : Using %s to initialize source data\n", "USE_PREP_KERNEL",
usePrepSrcKernel, (usePrepSrcKernel ? "GPU kernels" : "hipMemcpy"));
printf("\n");
}
else
{
printf("EnvVar,Value,Description,(TransferBench v%s)\n", TB_VERSION);
printf("USE_REMOTE_READ,%d,Using %s as executor\n", useRemoteRead, useRemoteRead ? "DST" : "SRC");
printf("USE_GPU_DMA,%d,Using GPU-%s as GPU executor\n", useDmaCopy , useDmaCopy ? "DMA" : "GFX");
printf("NUM_CPU_SE,%d,Using %d CPU subexecutors\n", numCpuSubExecs, numCpuSubExecs);
printf("NUM_GPU_SE,%d,Using %d GPU subexecutors\n", numGpuSubExecs, numGpuSubExecs);
printf("BLOCK_BYTES,%d,Each CU gets a multiple of %d bytes to copy\n", blockBytes, blockBytes);
printf("BYTE_OFFSET,%d,Using byte offset of %d\n", byteOffset, byteOffset);
printf("FILL_PATTERN,%s,", getenv("FILL_PATTERN") ? "(specified)" : "(unset)");
if (fillPattern.size())
printf("Pattern: %s", getenv("FILL_PATTERN"));
else
printf("Pseudo-random: %s", PrepSrcValueString().c_str());
printf("\n");
printf("NUM_CPU_DEVICES,%d,Using %d CPU devices\n" , numCpuDevices, numCpuDevices);
printf("NUM_GPU_DEVICES,%d,Using %d GPU devices\n", numGpuDevices, numGpuDevices);
printf("NUM_ITERATIONS,%d,Running %d %s per Test\n", numIterations,
numIterations > 0 ? numIterations : -numIterations,
numIterations > 0 ? "timed iteration(s)" : "second(s)");
printf("NUM_WARMUPS,%d,Running %d warmup iteration(s) per Test\n", numWarmups, numWarmups);
printf("SHARED_MEM_BYTES,%d,Using %d shared mem per threadblock\n", sharedMemBytes, sharedMemBytes);
printf("USE_PCIE_INDEX,%d,Using %s-based GPU indexing\n", usePcieIndexing, (usePcieIndexing ? "PCIe" : "HIP"));
printf("USE_SINGLE_STREAM,%d,Using single stream per %s\n", useSingleStream, (useSingleStream ? "device" : "Transfer"));
printf("USE_PREP_KERNEL,%d,Using %s to initialize source data\n",
usePrepSrcKernel, (usePrepSrcKernel ? "GPU kernels" : "hipMemcpy"));
printf("\n");
}
} }
// Display env var settings // Display env var settings
void DisplaySweepEnvVars() const void DisplaySweepEnvVars() const
{ {
DisplayEnvVars();
if (!outputToCsv) if (!outputToCsv)
{ printf("[Sweep Related]\n");
printf("Sweep configuration (TransferBench v%s)\n", TB_VERSION); PRINT_ES("SWEEP_DST", sweepDst.c_str(),
printf("=====================================================\n"); std::string("Destination Memory Types to sweep"));
printf("%-20s = %12d : Random seed\n", "SWEEP_SEED", sweepSeed); PRINT_ES("SWEEP_EXE", sweepExe.c_str(),
printf("%-20s = %12s : Source Memory Types to sweep\n", "SWEEP_SRC", sweepSrc.c_str()); std::string("Executor Types to sweep"));
printf("%-20s = %12s : Executor Types to sweep\n", "SWEEP_EXE", sweepExe.c_str()); PRINT_EV("SWEEP_MAX", sweepMax,
printf("%-20s = %12s : Destination Memory Types to sweep\n", "SWEEP_DST", sweepDst.c_str()); std::string("Max simultaneous transfers (0 = no limit)"));
printf("%-20s = %12d : Min simultaneous Transfers\n", "SWEEP_MIN", sweepMin); PRINT_EV("SWEEP_MIN", sweepMin,
printf("%-20s = %12d : Max simultaneous Transfers (0 = no limit)\n", "SWEEP_MAX", sweepMax); std::string("Min simultaenous transfers"));
printf("%-20s = %12d : Max number of tests to run during sweep (0 = no limit)\n", "SWEEP_TEST_LIMIT", sweepTestLimit); PRINT_EV("SWEEP_RAND_BYTES", sweepRandBytes,
printf("%-20s = %12d : Max number of seconds to run sweep for (0 = no limit)\n", "SWEEP_TIME_LIMIT", sweepTimeLimit); std::string("Using ") + (sweepRandBytes ? "random" : "constant") + " number of bytes per Transfer");
printf("%-20s = %12d : Min number of XGMI hops for Transfers\n", "SWEEP_XGMI_MIN", sweepXgmiMin); PRINT_EV("SWEEP_SEED", sweepSeed,
printf("%-20s = %12d : Max number of XGMI hops for Transfers (-1 = no limit)\n", "SWEEP_XGMI_MAX", sweepXgmiMax); std::string("Random seed set to ") + std::to_string(sweepSeed));
printf("%-20s = %12d : Using %s number of bytes per Transfer\n", "SWEEP_RAND_BYTES", sweepRandBytes, sweepRandBytes ? "random" : "constant"); PRINT_ES("SWEEP_SRC", sweepSrc.c_str(),
printf("%-20s = %12d : Using %d CPU devices\n" , "NUM_CPU_DEVICES", numCpuDevices, numCpuDevices); std::string("Source Memory Types to sweep"));
printf("%-20s = %12d : Using %d GPU devices\n", "NUM_GPU_DEVICES", numGpuDevices, numGpuDevices); PRINT_EV("SWEEP_TEST_LIMIT", sweepTestLimit,
printf("%-20s = %12d : Each CU gets a multiple of %d bytes to copy\n", "BLOCK_BYTES", blockBytes, blockBytes); std::string("Max number of tests to run during sweep (0 = no limit)"));
printf("%-20s = %12d : Using byte offset of %d\n", "BYTE_OFFSET", byteOffset, byteOffset); PRINT_EV("SWEEP_TIME_LIMIT", sweepTimeLimit,
printf("%-20s = %12s : ", "FILL_PATTERN", getenv("FILL_PATTERN") ? "(specified)" : "(unset)"); std::string("Max number of seconds to run sweep for (0 = no limit)"));
if (fillPattern.size()) PRINT_EV("SWEEP_XGMI_MAX", sweepXgmiMax,
printf("Pattern: %s", getenv("FILL_PATTERN")); std::string("Max number of XGMI hops for Transfers (-1 = no limit)"));
else PRINT_EV("SWEEP_XGMI_MIN", sweepXgmiMin,
printf("Pseudo-random: %s", PrepSrcValueString().c_str()); std::string("Min number of XGMI hops for Transfers"));
printf("\n"); printf("\n");
printf("%-20s = %12d : Running %d %s per Test\n", "NUM_ITERATIONS", numIterations, }
numIterations > 0 ? numIterations : -numIterations,
numIterations > 0 ? "timed iteration(s)" : "second(s)");
printf("%-20s = %12d : Running %d warmup iteration(s) per Test\n", "NUM_WARMUPS", numWarmups, numWarmups);
printf("%-20s = %12d : Output to %s\n", "OUTPUT_TO_CSV", outputToCsv,
outputToCsv ? "CSV" : "console");
printf("%-20s = %12s : Using %d shared mem per threadblock\n", "SHARED_MEM_BYTES",
getenv("SHARED_MEM_BYTES") ? "(specified)" : "(unset)", sharedMemBytes);
printf("%-20s = %12d : Using %s-based GPU indexing\n", "USE_PCIE_INDEX",
usePcieIndexing, (usePcieIndexing ? "PCIe" : "HIP"));
printf("USE_PREP_KERNEL,%d,Using %s to initialize source data\n",
usePrepSrcKernel, (usePrepSrcKernel ? "GPU kernels" : "hipMemcpy"));
printf("%-20s = %12d : Using single stream per %s\n", "USE_SINGLE_STREAM",
useSingleStream, (useSingleStream ? "device" : "Transfer"));
printf("%-20s = %12d : Continue on error\n", "CONTINUE_ON_ERROR", continueOnError);
printf("\n");
}
else
{
printf("EnvVar,Value,Description,(TransferBench v%s)\n", TB_VERSION);
printf("SWEEP_SRC,%s,Source Memory Types to sweep\n", sweepSrc.c_str());
printf("SWEEP_EXE,%s,Executor Types to sweep\n", sweepExe.c_str());
printf("SWEEP_DST,%s,Destination Memory Types to sweep\n", sweepDst.c_str());
printf("SWEEP_SEED,%d,Random seed\n", sweepSeed);
printf("SWEEP_MIN,%d,Min simultaneous Transfers\n", sweepMin);
printf("SWEEP_MAX,%d,Max simultaneous Transfers (0 = no limit)\n", sweepMax);
printf("SWEEP_TEST_LIMIT,%d,Max number of tests to run during sweep (0 = no limit)\n", sweepTestLimit);
printf("SWEEP_TIME_LIMIT,%d,Max number of seconds to run sweep for (0 = no limit)\n", sweepTimeLimit);
printf("SWEEP_XGMI_MIN,%d,Min number of XGMI hops for Transfers\n", sweepXgmiMin);
printf("SWEEP_XGMI_MAX,%d,Max number of XGMI hops for Transfers (-1 = no limit)\n", sweepXgmiMax);
printf("SWEEP_RAND_BYTES,%d,Using %s number of bytes per Transfer\n", sweepRandBytes, sweepRandBytes ? "random" : "constant");
printf("NUM_CPU_DEVICES,%d,Using %d CPU devices\n" , numCpuDevices, numCpuDevices);
printf("NUM_GPU_DEVICES,%d,Using %d GPU devices\n", numGpuDevices, numGpuDevices);
printf("BLOCK_BYTES,%d,Each CU gets a multiple of %d bytes to copy\n", blockBytes, blockBytes);
printf("BYTE_OFFSET,%d,Using byte offset of %d\n", byteOffset, byteOffset);
printf("FILL_PATTERN,%s,", getenv("FILL_PATTERN") ? "(specified)" : "(unset)");
if (fillPattern.size())
printf("Pattern: %s", getenv("FILL_PATTERN"));
else
printf("Pseudo-random: %s", PrepSrcValueString().c_str());
printf("\n");
printf("NUM_ITERATIONS,%d,Running %d %s per Test\n", numIterations,
numIterations > 0 ? numIterations : -numIterations,
numIterations > 0 ? "timed iteration(s)" : "second(s)");
printf("NUM_WARMUPS,%d,Running %d warmup iteration(s) per Test\n", numWarmups, numWarmups);
printf("SHARED_MEM_BYTES,%d,Using %d shared mem per threadblock\n", sharedMemBytes, sharedMemBytes);
printf("USE_PCIE_INDEX,%d,Using %s-based GPU indexing\n", usePcieIndexing, (usePcieIndexing ? "PCIe" : "HIP"));
printf("USE_PREP_KERNEL,%d,Using %s to initialize source data\n",
usePrepSrcKernel, (usePrepSrcKernel ? "GPU kernels" : "hipMemcpy"));
printf("USE_SINGLE_STREAM,%d,Using single stream per %s\n", useSingleStream, (useSingleStream ? "device" : "Transfer"));
}
};
// Helper function that parses an environment variable or sets it to the default value // Helper function that parses an environment variable or sets it to the default value
static int GetEnvVar(std::string const& varname, int defaultValue) static int GetEnvVar(std::string const& varname, int defaultValue)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment