printf("[ERROR] Duplicate executor type '%c' specified for sweep executor\n",ch);
exit(1);
}
}
}
}
// Display info on the env vars that can be used
// Display info on the env vars that can be used
...
@@ -176,20 +267,22 @@ public:
...
@@ -176,20 +267,22 @@ public:
{
{
printf("Environment variables:\n");
printf("Environment variables:\n");
printf("======================\n");
printf("======================\n");
printf(" BLOCK_BYTES=B - Each CU (except the last) receives a multiple of BLOCK_BYTES to copy\n");
printf(" BLOCK_BYTES=B - Each CU (except the last) receives a multiple of BLOCK_BYTES to copy\n");
printf(" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4. Defaults to 0\n");
printf(" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4. Defaults to 0\n");
printf(" FILL_PATTERN=STR - Fill input buffer with pattern specified in hex digits (0-9,a-f,A-F). Must be even number of digits, (byte-level big-endian)\n");
printf(" FILL_PATTERN=STR - Fill input buffer with pattern specified in hex digits (0-9,a-f,A-F). Must be even number of digits, (byte-level big-endian)\n");
printf(" NUM_CPU_DEVICES=X - Restrict number of CPUs to X. May not be greater than # detected NUMA nodes\n");
printf(" NUM_CPU_PER_TRANSFER=C - Use C threads per Transfer for CPU-executed copies\n");
printf(" NUM_CPU_PER_TRANSFER=C - Use C threads per Transfer for CPU-executed copies\n");
printf(" NUM_ITERATIONS=I - Perform I timed iteration(s) per test\n");
printf(" NUM_GPU_DEVICES=X - Restrict number of GPUs to X. May not be greater than # detected HIP devices\n");
printf(" NUM_WARMUPS=W - Perform W untimed warmup iteration(s) per test\n");
printf(" NUM_ITERATIONS=I - Perform I timed iteration(s) per test\n");
printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n");
printf(" NUM_WARMUPS=W - Perform W untimed warmup iteration(s) per test\n");
printf(" SAMPLING_FACTOR=F - Add F samples (when possible) between powers of 2 when auto-generating data sizes\n");
printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n");
printf(" SHARED_MEM_BYTES=X - Use X shared mem bytes per threadblock, potentially to avoid multiple threadblocks per CU\n");
printf(" SAMPLING_FACTOR=F - Add F samples (when possible) between powers of 2 when auto-generating data sizes\n");
printf(" USE_HIP_CALL - Use hipMemcpy/hipMemset instead of custom shader kernels for GPU-executed copies\n");
printf(" SHARED_MEM_BYTES=X - Use X shared mem bytes per threadblock, potentially to avoid multiple threadblocks per CU\n");
printf(" USE_INTERACTIVE - Pause for user-input before starting transfer loop\n");
printf(" USE_HIP_CALL - Use hipMemcpy/hipMemset instead of custom shader kernels for GPU-executed copies\n");
printf(" USE_MEMSET - Perform a memset instead of a copy (ignores source memory)\n");
printf(" USE_INTERACTIVE - Pause for user-input before starting transfer loop\n");
printf(" USE_PCIE_INDEX - Index GPUs by PCIe address-ordering instead of HIP-provided indexing\n");
printf(" USE_MEMSET - Perform a memset instead of a copy (ignores source memory)\n");
printf(" USE_SINGLE_STREAM - Use single stream per device instead of per Transfer. Cannot be used with USE_HIP_CALL\n");
printf(" USE_PCIE_INDEX - Index GPUs by PCIe address-ordering instead of HIP-provided indexing\n");
printf(" USE_SINGLE_STREAM - Use single stream per device instead of per Transfer. Cannot be used with USE_HIP_CALL\n");
}
}
// Display env var settings
// Display env var settings
...
@@ -207,8 +300,10 @@ public:
...
@@ -207,8 +300,10 @@ public:
else
else
printf("Pseudo-random: (Element i = i modulo 383 + 31)");
printf("Pseudo-random: (Element i = i modulo 383 + 31)");
printf("\n");
printf("\n");
printf("%-20s = %12d : Using %d CPU thread(s) per CPU-based-copy Transfer\n","NUM_CPU_PER_TRANSFER",numCpuPerTransfer,numCpuPerTransfer);
printf("%-20s = %12d : Using %d CPU devices\n","NUM_CPU_DEVICES",numCpuDevices,numCpuDevices);
printf("%-20s = %12d : Running %d %s per topology\n","NUM_ITERATIONS",numIterations,
printf("%-20s = %12d : Using %d CPU thread(s) per CPU-executed Transfer\n","NUM_CPU_PER_TRANSFER",numCpuPerTransfer,numCpuPerTransfer);
printf("%-20s = %12d : Using %d GPU devices\n","NUM_GPU_DEVICES",numGpuDevices,numGpuDevices);
printf("%-20s = %12d : Running %d %s per test\n","NUM_ITERATIONS",numIterations,