printf(" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4. Defaults to 0\n");
printf(" CONTINUE_ON_ERROR - Continue tests even after mismatch detected\n");
printf(" CU_MASK - CU mask for streams specified in hex digits (0-0,a-f,A-F)\n");
printf(" FILL_PATTERN=STR - Fill input buffer with pattern specified in hex digits (0-9,a-f,A-F). Must be even number of digits, (byte-level big-endian)\n");
printf(" GFX_UNROLL - Unroll factor for GFX kernel (0=auto), must be less than %d\n",MAX_UNROLL);
printf(" GFX_SINGLE_TEAM - Have subexecutors work together on full array instead of working on individual disjoint subarrays\n");
printf(" GFX_WAVE_ORDER - Stride pattern for GFX kernel (0=UWC,1=UCW,2=WUC,3=WCU,4=CUW,5=CWU)\n");
printf(" HIDE_ENV - Hide environment variable value listing\n");
printf(" MIN_VAR_SUBEXEC - Minumum # of subexecutors to use for variable subExec Transfers\n");
printf(" MAX_VAR_SUBEXEC - Maximum # of subexecutors to use for variable subExec Transfers (0 for device limits)\n");
printf(" NUM_CPU_DEVICES=X - Restrict number of CPUs to X. May not be greater than # detected NUMA nodes\n");
printf(" NUM_GPU_DEVICES=X - Restrict number of GPUs to X. May not be greater than # detected HIP devices\n");
printf(" NUM_ITERATIONS=I - Perform I timed iteration(s) per test\n");
printf(" NUM_SUBITERATIONS=S - Perform S sub-iteration(s) per iteration. Must be non-negative\n");
printf(" NUM_WARMUPS=W - Perform W untimed warmup iteration(s) per test\n");
printf(" OUTPUT_TO_CSV - Outputs to CSV format if set\n");
printf(" SAMPLING_FACTOR=F - Add F samples (when possible) between powers of 2 when auto-generating data sizes\n");
printf(" SHARED_MEM_BYTES=X - Use X shared mem bytes per threadblock, potentially to avoid multiple threadblocks per CU\n");
printf(" SHOW_ITERATIONS - Show per-iteration timing info\n");
printf(" USE_HSA_DMA - Use hsa_amd_async_copy instead of hipMemcpy for non-targeted DMA execution\n");
printf(" USE_INTERACTIVE - Pause for user-input before starting transfer loop\n");
printf(" USE_PCIE_INDEX - Index GPUs by PCIe address-ordering instead of HIP-provided indexing\n");
printf(" USE_PREP_KERNEL - Use GPU kernel to initialize source data array pattern\n");
printf(" USE_SINGLE_STREAM - Use a single stream per GPU GFX executor instead of stream per Transfer\n");
printf(" USE_XCC_FILTER - Use XCC filtering (experimental)\n");
printf(" VALIDATE_DIRECT - Validate GPU destination memory directly instead of staging GPU memory on host\n");
}
// Helper macro to switch between CSV and terminal output