printf("[ERROR] SHARED_MEM_BYTES must be between 0 and %d\n",maxSharedMemBytes);
printf("[ERROR] SHARED_MEM_BYTES must be between 0 and %d\n",maxSharedMemBytes);
...
@@ -239,9 +265,16 @@ public:
...
@@ -239,9 +265,16 @@ public:
printf("[ERROR] BLOCK_BYTES must be a positive multiple of 4\n");
printf("[ERROR] BLOCK_BYTES must be a positive multiple of 4\n");
exit(1);
exit(1);
}
}
if(useSingleStream&&useHipCall)
if(numGpuSubExecs<=0)
{
printf("[ERROR] NUM_GPU_SE must be greater than 0\n");
exit(1);
}
if(numCpuSubExecs<=0)
{
{
printf("[ERROR] Single stream mode cannot be used with HIP calls\n");
printf("[ERROR] NUM_CPU_SE must be greater than 0\n");
exit(1);
exit(1);
}
}
...
@@ -273,10 +306,9 @@ public:
...
@@ -273,10 +306,9 @@ public:
}
}
}
}
charconst*permittedExecutors="CG";
for(autoch:sweepExe)
for(autoch:sweepExe)
{
{
if(!strchr(permittedExecutors,ch))
if(!strchr(ExeTypeStr,ch))
{
{
printf("[ERROR] Unrecognized executor type '%c' specified for sweep executor\n",ch);
printf("[ERROR] Unrecognized executor type '%c' specified for sweep executor\n",ch);
exit(1);
exit(1);
...
@@ -287,12 +319,30 @@ public:
...
@@ -287,12 +319,30 @@ public:
exit(1);
exit(1);
}
}
}
}
if(gpuKernel<0||gpuKernel>NUM_GPU_KERNELS)
{
printf("[ERROR] GPU kernel must be between 0 and %d\n",NUM_GPU_KERNELS);
exit(1);
}
// Determine how many CPUs exist per NUMA node (to avoid executing on NUMA nodes without CPUs)
// Determine how many CPUs exist per NUMA node (to avoid executing on NUMA nodes without CPUs)
numCpusPerNuma.resize(numDetectedCpus);
numCpusPerNuma.resize(numDetectedCpus);
intconsttotalCpus=numa_num_configured_cpus();
intconsttotalCpus=numa_num_configured_cpus();
for(inti=0;i<totalCpus;i++)
for(inti=0;i<totalCpus;i++)
numCpusPerNuma[numa_node_of_cpu(i)]++;
numCpusPerNuma[numa_node_of_cpu(i)]++;
// Check for deprecated env vars
if(getenv("USE_HIP_CALL"))
{
printf("[WARN] USE_HIP_CALL has been deprecated. Please use DMA executor 'D' or set USE_GPU_DMA for P2P-Benchmark preset\n");
exit(1);
}
char*enableSdma=getenv("HSA_ENABLE_SDMA");
if(enableSdma&&!strcmp(enableSdma,"0"))
{
printf("[WARN] DMA functionality disabled due to environment variable HSA_ENABLE_SDMA=0. Copies will fallback to blit kernels\n");
}
}
}
// Display info on the env vars that can be used
// Display info on the env vars that can be used
...
@@ -304,18 +354,15 @@ public:
...
@@ -304,18 +354,15 @@ public:
printf(" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4. Defaults to 0\n");
printf(" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4. Defaults to 0\n");
printf(" FILL_PATTERN=STR - Fill input buffer with pattern specified in hex digits (0-9,a-f,A-F). Must be even number of digits, (byte-level big-endian)\n");
printf(" FILL_PATTERN=STR - Fill input buffer with pattern specified in hex digits (0-9,a-f,A-F). Must be even number of digits, (byte-level big-endian)\n");
printf(" NUM_CPU_DEVICES=X - Restrict number of CPUs to X. May not be greater than # detected NUMA nodes\n");
printf(" NUM_CPU_DEVICES=X - Restrict number of CPUs to X. May not be greater than # detected NUMA nodes\n");
printf(" NUM_CPU_PER_TRANSFER=C - Use C threads per Transfer for CPU-executed copies\n");
printf(" NUM_GPU_DEVICES=X - Restrict number of GPUs to X. May not be greater than # detected HIP devices\n");
printf(" NUM_GPU_DEVICES=X - Restrict number of GCPUs to X. May not be greater than # detected HIP devices\n");
printf(" NUM_ITERATIONS=I - Perform I timed iteration(s) per test\n");
printf(" NUM_ITERATIONS=I - Perform I timed iteration(s) per test\n");
printf(" NUM_WARMUPS=W - Perform W untimed warmup iteration(s) per test\n");
printf(" NUM_WARMUPS=W - Perform W untimed warmup iteration(s) per test\n");
printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n");
printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n");
printf(" SAMPLING_FACTOR=F - Add F samples (when possible) between powers of 2 when auto-generating data sizes\n");
printf(" SAMPLING_FACTOR=F - Add F samples (when possible) between powers of 2 when auto-generating data sizes\n");
printf(" SHARED_MEM_BYTES=X - Use X shared mem bytes per threadblock, potentially to avoid multiple threadblocks per CU\n");
printf(" SHARED_MEM_BYTES=X - Use X shared mem bytes per threadblock, potentially to avoid multiple threadblocks per CU\n");
printf(" USE_HIP_CALL - Use hipMemcpy/hipMemset instead of custom shader kernels for GPU-executed copies\n");
printf(" USE_INTERACTIVE - Pause for user-input before starting transfer loop\n");
printf(" USE_INTERACTIVE - Pause for user-input before starting transfer loop\n");
printf(" USE_MEMSET - Perform a memset instead of a copy (ignores source memory)\n");
printf(" USE_PCIE_INDEX - Index GPUs by PCIe address-ordering instead of HIP-provided indexing\n");
printf(" USE_PCIE_INDEX - Index GPUs by PCIe address-ordering instead of HIP-provided indexing\n");
printf(" USE_SINGLE_STREAM - Use single stream per device instead of per Transfer. Cannot be used with USE_HIP_CALL\n");
printf(" USE_SINGLE_STREAM - Use a single stream per GPU GFX executor instead of stream per Transfer\n");
}
}
// Display env var settings
// Display env var settings
...
@@ -331,10 +378,10 @@ public:
...
@@ -331,10 +378,10 @@ public:
if(fillPattern.size())
if(fillPattern.size())
printf("Pattern: %s",getenv("FILL_PATTERN"));
printf("Pattern: %s",getenv("FILL_PATTERN"));
else
else
printf("Pseudo-random: (Element i = i modulo 383 + 31)");
printf("Pseudo-random: (Element i = i modulo 383 + 31) * (InputIdx + 1)");
printf("\n");
printf("\n");
printf("%-20s = %12d : Using GPU kernel %d [%s]\n","GPU_KERNEL",gpuKernel,gpuKernel,GpuKernelNames[gpuKernel].c_str());
printf("%-20s = %12d : Using %d CPU devices\n","NUM_CPU_DEVICES",numCpuDevices,numCpuDevices);
printf("%-20s = %12d : Using %d CPU devices\n","NUM_CPU_DEVICES",numCpuDevices,numCpuDevices);
printf("%-20s = %12d : Using %d CPU thread(s) per CPU-executed Transfer\n","NUM_CPU_PER_TRANSFER",numCpuPerTransfer,numCpuPerTransfer);
printf("%-20s = %12d : Using %d GPU devices\n","NUM_GPU_DEVICES",numGpuDevices,numGpuDevices);
printf("%-20s = %12d : Using %d GPU devices\n","NUM_GPU_DEVICES",numGpuDevices,numGpuDevices);
printf("%-20s = %12d : Running %d %s per Test\n","NUM_ITERATIONS",numIterations,
printf("%-20s = %12d : Running %d %s per Test\n","NUM_ITERATIONS",numIterations,
numIterations>0?numIterations:-numIterations,
numIterations>0?numIterations:-numIterations,
...
@@ -344,18 +391,8 @@ public:
...
@@ -344,18 +391,8 @@ public:
outputToCsv?"CSV":"console");
outputToCsv?"CSV":"console");
printf("%-20s = %12s : Using %d shared mem per threadblock\n","SHARED_MEM_BYTES",
printf("%-20s = %12s : Using %d shared mem per threadblock\n","SHARED_MEM_BYTES",