Unverified Commit 600cf131 authored by gilbertlee-amd, committed by GitHub

V1.52 candidate (#122)

* Adding USE_HSA_DMA to switch to using hsa_amd_memory_async_copy in lieu of hipMemcpyAsync
* Adding USE_GPU_DMA for A2A benchmark
* Adding large BAR check and fix for 0-hop GPU-CPU links
parent 340244a6
@@ -3,6 +3,14 @@
 Documentation for TransferBench is available at
 [https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
 
+## v1.52
+### Added
+- Added USE_HSA_DMA env var to switch to using hsa_amd_memory_async_copy instead of hipMemcpyAsync for DMA execution
+- Added ability to set USE_GPU_DMA env var for a2a benchmark
+- Added check for large BAR enablement for GPU devices during topology check
+### Fixed
+- Potential memory leak if HSA reports 0 hops between GPUs and CPUs
+
 ## v1.51
 ## Modified
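For reference, the default (USE_HSA_DMA=0) DMA-executor path described in the changelog reduces to a HIP stream copy timed with events. Below is a minimal standalone sketch of that baseline, not TransferBench code; the HIP_OK macro and buffer size are illustrative only.

#include <cstdio>
#include <hip/hip_runtime.h>

// Illustrative error-check macro (assumption: not part of TransferBench)
#define HIP_OK(cmd) do { hipError_t e = (cmd); if (e != hipSuccess) { \
  printf("HIP error: %s\n", hipGetErrorString(e)); return 1; } } while (0)

int main()
{
  size_t const numBytes = 1 << 26; // 64 MiB, arbitrary example size
  void *src = nullptr, *dst = nullptr;
  HIP_OK(hipSetDevice(0));
  HIP_OK(hipMalloc(&src, numBytes));
  HIP_OK(hipMalloc(&dst, numBytes));

  hipStream_t stream;
  hipEvent_t startEvent, stopEvent;
  HIP_OK(hipStreamCreate(&stream));
  HIP_OK(hipEventCreate(&startEvent));
  HIP_OK(hipEventCreate(&stopEvent));

  // Time the async copy with HIP events, as the benchmark's default path does
  HIP_OK(hipEventRecord(startEvent, stream));
  HIP_OK(hipMemcpyAsync(dst, src, numBytes, hipMemcpyDefault, stream));
  HIP_OK(hipEventRecord(stopEvent, stream));
  HIP_OK(hipStreamSynchronize(stream));

  float msec = 0.0f;
  HIP_OK(hipEventElapsedTime(&msec, startEvent, stopEvent));
  printf("Copied %zu bytes in %.3f ms\n", numBytes, msec);
  return 0;
}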
@@ -508,9 +508,9 @@ TestResults ExecuteTransfersImpl(EnvVars const& ev,
 
   for (Transfer* transfer : exeInfo.transfers)
   {
-    if (transfer->exeSubIndex != -1)
+    if (transfer->exeSubIndex != -1 || ev.useHsaDma)
     {
-      useTargetDma = true;
+      useTargetDma = (transfer->exeSubIndex != -1);
 #if defined(__NVCC__)
       printf("[ERROR] DMA executor subindex not supported on NVIDIA hardware\n");
@@ -544,18 +544,20 @@ TestResults ExecuteTransfersImpl(EnvVars const& ev,
       }
 
       // Check that engine Id exists between agents
-      uint32_t engineIdMask = 0;
-      HSA_CHECK(hsa_amd_memory_copy_engine_status(transfer->dstAgent,
-                                                  transfer->srcAgent,
-                                                  &engineIdMask));
-      transfer->sdmaEngineId = (hsa_amd_sdma_engine_id_t)(1U << transfer->exeSubIndex);
-      if (!(transfer->sdmaEngineId & engineIdMask))
-      {
-        printf("[ERROR] DMA executor %d.%d does not exist or cannot copy from source %s to destination %s\n",
-               transfer->exeIndex, transfer->exeSubIndex,
-               transfer->SrcToStr().c_str(),
-               transfer->DstToStr().c_str());
-        exit(1);
-      }
+      if (useTargetDma) {
+        uint32_t engineIdMask = 0;
+        HSA_CHECK(hsa_amd_memory_copy_engine_status(transfer->dstAgent,
+                                                    transfer->srcAgent,
+                                                    &engineIdMask));
+        transfer->sdmaEngineId = (hsa_amd_sdma_engine_id_t)(1U << transfer->exeSubIndex);
+        if (!(transfer->sdmaEngineId & engineIdMask))
+        {
+          printf("[ERROR] DMA executor %d.%d does not exist or cannot copy from source %s to destination %s\n",
+                 transfer->exeIndex, transfer->exeSubIndex,
+                 transfer->SrcToStr().c_str(),
+                 transfer->DstToStr().c_str());
+          exit(1);
+        }
+      }
 #endif
     }
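The guard above treats the mask returned by hsa_amd_memory_copy_engine_status as one bit per SDMA engine, with the requested executor sub-index selecting the bit. A small self-contained sketch of just that bit test (the mask value here is a made-up example):

#include <cstdint>
#include <cstdio>

// Returns true if the SDMA engine selected by exeSubIndex is present in the
// availability mask reported for a given src/dst agent pair.
bool EngineAvailable(uint32_t engineIdMask, int exeSubIndex)
{
  uint32_t const engineBit = 1U << exeSubIndex; // one bit per SDMA engine
  return (engineBit & engineIdMask) != 0;
}

int main()
{
  uint32_t const mask = 0x3; // example: engines 0 and 1 usable
  for (int sub = 0; sub < 4; ++sub)
    printf("engine %d: %s\n", sub, EngineAvailable(mask, sub) ? "ok" : "unavailable");
  return 0;
}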
@@ -822,7 +824,7 @@ cleanup:
     }
     transfer->subExecParam.clear();
 
-    if (exeType == EXE_GPU_DMA && transfer->exeSubIndex != -1)
+    if (exeType == EXE_GPU_DMA && (transfer->exeSubIndex != -1 || ev.useHsaDma))
     {
 #if !defined(__NVCC__)
       HSA_CHECK(hsa_signal_destroy(transfer->signal));
@@ -1161,6 +1163,15 @@ void DisplayTopology(bool const outputToCsv)
       printf("\n");
     }
   }
+
+  // Check that large BAR is enabled on all GPUs
+  for (int i = 0; i < numGpuDevices; i++) {
+    int const deviceIdx = RemappedIndex(i, false);
+    int isLargeBar = 0;
+    HIP_CALL(hipDeviceGetAttribute(&isLargeBar, hipDeviceAttributeIsLargeBar, deviceIdx));
+    if (!isLargeBar)
+      printf("[WARN] Large BAR is not enabled for GPU %d in BIOS. This may result in segfaults\n", i);
+  }
 
 #endif
 }
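The same large-BAR probe works as a standalone check; a minimal sketch using the same hipDeviceAttributeIsLargeBar attribute (an AMD-specific HIP attribute), with abbreviated error handling:

#include <cstdio>
#include <hip/hip_runtime.h>

int main()
{
  int numGpus = 0;
  if (hipGetDeviceCount(&numGpus) != hipSuccess) return 1;
  for (int i = 0; i < numGpus; ++i) {
    // Non-zero when the full device memory aperture is CPU-visible (large BAR)
    int isLargeBar = 0;
    hipDeviceGetAttribute(&isLargeBar, hipDeviceAttributeIsLargeBar, i);
    printf("GPU %d: large BAR %s\n", i, isLargeBar ? "enabled" : "NOT enabled");
  }
  return 0;
}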
@@ -1619,65 +1630,61 @@ void RunTransfer(EnvVars const& ev, int const iteration,
 {
   int const exeIndex = RemappedIndex(transfer->exeIndex, false);
-  if (transfer->exeSubIndex == -1)
-  {
+  int subIterations = 0;
+  if (transfer->exeSubIndex == -1 && !ev.useHsaDma) {
     // Switch to executing GPU
     HIP_CALL(hipSetDevice(exeIndex));
 
     hipStream_t& stream = exeInfo.streams[transferIdx];
     hipEvent_t& startEvent = exeInfo.startEvents[transferIdx];
     hipEvent_t& stopEvent = exeInfo.stopEvents[transferIdx];
-    int subIteration = 0;
 
     HIP_CALL(hipEventRecord(startEvent, stream));
     do {
       HIP_CALL(hipMemcpyAsync(transfer->dstMem[0], transfer->srcMem[0],
                               transfer->numBytesActual, hipMemcpyDefault,
                               stream));
-    } while (++subIteration != ev.numSubIterations);
+    } while (++subIterations != ev.numSubIterations);
     HIP_CALL(hipEventRecord(stopEvent, stream));
     HIP_CALL(hipStreamSynchronize(stream));
 
-    if (iteration >= 0)
-    {
-      // Record GPU timing
+    // Record time based on HIP events
+    if (iteration >= 0) {
       float gpuDeltaMsec;
       HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
-      //gpuDeltaMsec /= (1.0 * ev.numSubIterations);
       transfer->transferTime += gpuDeltaMsec;
       if (ev.showIterations)
         transfer->perIterationTime.push_back(gpuDeltaMsec);
     }
-  }
-  else
-  {
+  } else {
 #if defined(__NVCC__)
     printf("[ERROR] CUDA does not support targeting specific DMA engines\n");
     exit(1);
 #else
-    // Target specific DMA engine
+    // Use HSA async memory copy (either targeted or untargeted)
    auto cpuStart = std::chrono::high_resolution_clock::now();
-    int subIterations = 0;
    do {
      // Atomically set signal to 1
      HSA_CALL(hsa_signal_store_screlease(transfer->signal, 1));
-      HSA_CALL(hsa_amd_memory_async_copy_on_engine(transfer->dstMem[0], transfer->dstAgent,
-                                                   transfer->srcMem[0], transfer->srcAgent,
-                                                   transfer->numBytesActual, 0, NULL,
-                                                   transfer->signal,
-                                                   transfer->sdmaEngineId, true));
+      if (ev.useHsaDma) {
+        HSA_CALL(hsa_amd_memory_async_copy(transfer->dstMem[0], transfer->dstAgent,
+                                           transfer->srcMem[0], transfer->srcAgent,
+                                           transfer->numBytesActual, 0, NULL,
+                                           transfer->signal));
+      } else {
+        HSA_CALL(hsa_amd_memory_async_copy_on_engine(transfer->dstMem[0], transfer->dstAgent,
+                                                     transfer->srcMem[0], transfer->srcAgent,
+                                                     transfer->numBytesActual, 0, NULL,
+                                                     transfer->signal,
+                                                     transfer->sdmaEngineId, true));
+      }
 
      // Wait for SDMA transfer to complete
+      // NOTE: "A wait operation can spuriously resume at any time sooner than the timeout
+      //        (for example, due to system or other external factors) even when the
+      //        condition has not been met."
      while(hsa_signal_wait_scacquire(transfer->signal,
                                      HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
                                      HSA_WAIT_STATE_ACTIVE) >= 1);
    } while (++subIterations < ev.numSubIterations);
 
-    if (iteration >= 0)
-    {
+    if (iteration >= 0) {
      // Record GPU timing
      auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
      double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
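Both HSA branches above follow the same completion-signal pattern: store 1 to the signal, submit the copy with the signal as its completion handle, then wait for the runtime to decrement it below 1, re-checking because HSA waits may resume spuriously. A condensed sketch of one such copy, assuming the agents and HSA-accessible buffers were obtained elsewhere (agent and pool discovery is elided):

#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>

// One signal-synchronized async copy (setup and error handling elided)
void AsyncCopyOnce(void* dst, hsa_agent_t dstAgent,
                   void const* src, hsa_agent_t srcAgent, size_t numBytes)
{
  hsa_signal_t signal;
  hsa_signal_create(1, 0, NULL, &signal);   // completion signal, starts at 1

  hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent,
                            numBytes, 0, NULL, signal);

  // Runtime decrements the signal to 0 on completion; loop because a wait
  // can return before the condition is actually met
  while (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, 1,
                                   UINT64_MAX, HSA_WAIT_STATE_ACTIVE) >= 1);

  hsa_signal_destroy(signal);
}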
@@ -2106,7 +2113,7 @@ void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, i
       transfer.numSubExecs = numSubExecs;
       transfer.numSrcs = ev.a2aMode == 2 ? 0 : 1;
       transfer.numDsts = ev.a2aMode == 1 ? 0 : 1;
-      transfer.exeType = EXE_GPU_GFX;
+      transfer.exeType = (ev.useDmaCopy && transfer.numSrcs == 1 && transfer.numDsts == 1) ? EXE_GPU_DMA : EXE_GPU_GFX;
       transfer.exeSubIndex = -1;
       transfer.srcType.resize(1, ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
       transfer.dstType.resize(1, ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
@@ -29,7 +29,7 @@ THE SOFTWARE.
 #include "Compatibility.hpp"
 #include "Kernels.hpp"
 
-#define TB_VERSION "1.51"
+#define TB_VERSION "1.52"
 
 extern char const MemTypeStr[];
 extern char const ExeTypeStr[];
@@ -95,6 +95,7 @@ public:
   int samplingFactor; // Affects how many different values of N are generated (when N set to 0)
   int sharedMemBytes; // Amount of shared memory to use per threadblock
   int showIterations; // Show per-iteration timing info
+  int useHsaDma; // Use hsa_amd_memory_async_copy instead of hipMemcpyAsync for non-targeted DMA executions
   int useInteractive; // Pause for user-input before starting transfer loop
   int usePcieIndexing; // Base GPU indexing on PCIe address instead of HIP device
   int usePrepSrcKernel; // Use GPU kernel to prepare source data instead of copy (can't be used with fillPattern)
@@ -202,6 +203,7 @@ public:
     samplingFactor = GetEnvVar("SAMPLING_FACTOR" , DEFAULT_SAMPLING_FACTOR);
     sharedMemBytes = GetEnvVar("SHARED_MEM_BYTES" , defaultSharedMemBytes);
     showIterations = GetEnvVar("SHOW_ITERATIONS" , 0);
+    useHsaDma = GetEnvVar("USE_HSA_DMA" , 0);
     useInteractive = GetEnvVar("USE_INTERACTIVE" , 0);
     usePcieIndexing = GetEnvVar("USE_PCIE_INDEX" , 0);
     usePrepSrcKernel = GetEnvVar("USE_PREP_KERNEL" , 0);
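GetEnvVar reads an environment variable, falling back to the supplied default when it is unset. A minimal sketch of an integer-valued reader in that spirit (the actual TransferBench helper is not shown in this diff and may differ):

#include <cstdlib>
#include <string>

// Hypothetical stand-in for EnvVars::GetEnvVar(name, defaultValue)
static int GetEnvVarSketch(std::string const& name, int defaultValue)
{
  char const* value = getenv(name.c_str());
  return value ? atoi(value) : defaultValue; // unset -> use default
}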
@@ -612,6 +614,7 @@ public:
     printf(" SAMPLING_FACTOR=F - Add F samples (when possible) between powers of 2 when auto-generating data sizes\n");
     printf(" SHARED_MEM_BYTES=X - Use X shared mem bytes per threadblock, potentially to avoid multiple threadblocks per CU\n");
     printf(" SHOW_ITERATIONS - Show per-iteration timing info\n");
+    printf(" USE_HSA_DMA - Use hsa_amd_memory_async_copy instead of hipMemcpyAsync for non-targeted DMA execution\n");
     printf(" USE_INTERACTIVE - Pause for user-input before starting transfer loop\n");
     printf(" USE_PCIE_INDEX - Index GPUs by PCIe address-ordering instead of HIP-provided indexing\n");
     printf(" USE_PREP_KERNEL - Use GPU kernel to initialize source data array pattern\n");
@@ -691,6 +694,8 @@ public:
              std::string("Using " + std::to_string(sharedMemBytes) + " shared mem per threadblock"));
     PRINT_EV("SHOW_ITERATIONS", showIterations,
              std::string(showIterations ? "Showing" : "Hiding") + " per-iteration timing");
+    PRINT_EV("USE_HSA_DMA", useHsaDma,
+             std::string("Using ") + (useHsaDma ? "hsa_amd_memory_async_copy" : "hipMemcpyAsync") + " for DMA execution");
     PRINT_EV("USE_INTERACTIVE", useInteractive,
              std::string("Running in ") + (useInteractive ? "interactive" : "non-interactive") + " mode");
     PRINT_EV("USE_PCIE_INDEX", usePcieIndexing,
@@ -797,6 +802,8 @@ public:
              "Perform write-only"));
     PRINT_EV("USE_FINE_GRAIN", useFineGrain,
              std::string("Using ") + (useFineGrain ? "fine" : "coarse") + "-grained memory");
+    PRINT_EV("USE_GPU_DMA", useDmaCopy,
+             std::string("Using GPU-") + (useDmaCopy ? "DMA" : "GFX") + " as GPU executor");
     PRINT_EV("USE_REMOTE_READ", useRemoteRead,
              std::string("Using ") + (useRemoteRead ? "DST" : "SRC") + " as executor");
...@@ -84,24 +84,22 @@ AgentData& GetAgentData() ...@@ -84,24 +84,22 @@ AgentData& GetAgentData()
{ {
static AgentData agentData = {}; static AgentData agentData = {};
if (!agentData.isInitialized) if (!agentData.isInitialized) {
{
agentData.isInitialized = true; agentData.isInitialized = true;
// Add all detected agents to the list // Add all detected agents to the list
HSA_CHECK(hsa_iterate_agents(AgentInfoCallback, &agentData)); HSA_CHECK(hsa_iterate_agents(AgentInfoCallback, &agentData));
// Loop over each GPU // Loop over each GPU
for (uint32_t i = 0; i < agentData.gpuAgents.size(); i++) for (uint32_t i = 0; i < agentData.gpuAgents.size(); i++) {
{
// Collect memory pool // Collect memory pool
hsa_amd_memory_pool_t pool; hsa_amd_memory_pool_t pool;
HSA_CHECK(hsa_amd_agent_iterate_memory_pools(agentData.gpuAgents[i], MemPoolInfoCallback, &pool)); HSA_CHECK(hsa_amd_agent_iterate_memory_pools(agentData.gpuAgents[i], MemPoolInfoCallback, &pool));
// Loop over each CPU agent and check distance // Loop over each CPU agent and check distance
agentData.closestNumaNode[i] = 0;
int bestDistance = -1; int bestDistance = -1;
for (uint32_t j = 0; j < agentData.cpuAgents.size(); j++) for (uint32_t j = 0; j < agentData.cpuAgents.size(); j++) {
{
// Determine number of hops from GPU memory pool to CPU agent // Determine number of hops from GPU memory pool to CPU agent
uint32_t hops = 0; uint32_t hops = 0;
HSA_CHECK(hsa_amd_agent_memory_pool_get_info(agentData.cpuAgents[j], HSA_CHECK(hsa_amd_agent_memory_pool_get_info(agentData.cpuAgents[j],
...@@ -109,23 +107,23 @@ AgentData& GetAgentData() ...@@ -109,23 +107,23 @@ AgentData& GetAgentData()
HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS, HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS,
&hops)); &hops));
// Gather link info // Gather link info
hsa_amd_memory_pool_link_info_t* link_info = if (hops) {
(hsa_amd_memory_pool_link_info_t *)malloc(hops * sizeof(hsa_amd_memory_pool_link_info_t)); hsa_amd_memory_pool_link_info_t* link_info =
HSA_CHECK(hsa_amd_agent_memory_pool_get_info(agentData.cpuAgents[j], (hsa_amd_memory_pool_link_info_t *)malloc(hops * sizeof(hsa_amd_memory_pool_link_info_t));
pool, HSA_CHECK(hsa_amd_agent_memory_pool_get_info(agentData.cpuAgents[j],
HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO, pool,
link_info)); HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO,
int numaDist = 0; link_info));
for (int k = 0; k < hops; k++) int numaDist = 0;
{ for (int k = 0; k < hops; k++)
numaDist += link_info[k].numa_distance; numaDist += link_info[k].numa_distance;
}
if (bestDistance == -1 || numaDist < bestDistance) if (bestDistance == -1 || numaDist < bestDistance) {
{ agentData.closestNumaNode[i] = j;
agentData.closestNumaNode[i] = j; bestDistance = numaDist;
bestDistance = numaDist; }
free(link_info);
} }
free(link_info);
} }
} }
} }
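In isolation, the closest-NUMA-node selection above amounts to summing the per-hop numa_distance values for each CPU agent and keeping the minimum, with node 0 as the fallback the new initialization provides when every link reports 0 hops. A sketch of that selection, with a hypothetical container standing in for the HSA link-info queries:

#include <cstdint>
#include <vector>

// hopDistances[j] holds the per-hop NUMA distances from the GPU pool to CPU
// agent j; an empty vector models the 0-hop case the fix guards against.
int ClosestNumaNode(std::vector<std::vector<uint32_t>> const& hopDistances)
{
  int best = 0;           // fallback: node 0
  int bestDistance = -1;
  for (size_t j = 0; j < hopDistances.size(); ++j) {
    if (hopDistances[j].empty()) continue;  // 0 hops: nothing to inspect
    int numaDist = 0;
    for (uint32_t d : hopDistances[j]) numaDist += (int)d;
    if (bestDistance == -1 || numaDist < bestDistance) {
      best = (int)j;
      bestDistance = numaDist;
    }
  }
  return best;
}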