Unverified commit 600cf131 authored by gilbertlee-amd, committed by GitHub

V1.52 candidate (#122)

* Adding USE_HSA_DMA to switch to using hsa_amd_memory_async_copy in lieu of hipMemcpyAsync
* Adding USE_GPU_DMA for A2A benchmark
* Adding largeBAR check and fix for 0-hop GPU-CPU links
parent 340244a6
@@ -3,6 +3,14 @@
Documentation for TransferBench is available at
[https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
+## v1.52
+### Added
+- Added USE_HSA_DMA env var to use hsa_amd_memory_async_copy instead of hipMemcpyAsync for DMA execution (see the sketch below)
+- Added ability to set USE_GPU_DMA env var for the a2a benchmark
+- Added check for large BAR enablement on GPU devices during the topology check
+### Fixed
+- Potential memory leak if HSA reports 0 hops between GPUs and CPUs
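The new toggles follow TransferBench's existing integer environment-variable convention (e.g. `USE_HSA_DMA=1 ./TransferBench <config> <bytes>`). A minimal sketch of that parsing pattern, with `GetEnvToggle` as a hypothetical stand-in for the repo's `GetEnvVar` helper:

```cpp
// Minimal sketch of an integer env-var toggle, assuming the same semantics
// as TransferBench's GetEnvVar (unset variable -> default value).
#include <cstdio>
#include <cstdlib>

static int GetEnvToggle(char const* name, int defaultValue)
{
  char const* value = std::getenv(name);   // nullptr when the variable is unset
  return value ? std::atoi(value) : defaultValue;
}

int main()
{
  int useHsaDma = GetEnvToggle("USE_HSA_DMA", 0);  // new in v1.52
  int useGpuDma = GetEnvToggle("USE_GPU_DMA", 0);  // now honored by the a2a benchmark
  std::printf("USE_HSA_DMA=%d USE_GPU_DMA=%d\n", useHsaDma, useGpuDma);
  return 0;
}
```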
## v1.51
### Modified
......
@@ -508,9 +508,9 @@ TestResults ExecuteTransfersImpl(EnvVars const& ev,
for (Transfer* transfer : exeInfo.transfers)
{
- if (transfer->exeSubIndex != -1)
+ if (transfer->exeSubIndex != -1 || ev.useHsaDma)
{
- useTargetDma = true;
+ useTargetDma = (transfer->exeSubIndex != -1);
#if defined(__NVCC__)
printf("[ERROR] DMA executor subindex not supported on NVIDIA hardware\n");
@@ -544,6 +544,7 @@ TestResults ExecuteTransfersImpl(EnvVars const& ev,
}
// Check that engine Id exists between agents
+ if (useTargetDma) {
uint32_t engineIdMask = 0;
HSA_CHECK(hsa_amd_memory_copy_engine_status(transfer->dstAgent,
transfer->srcAgent,
@@ -557,6 +558,7 @@ TestResults ExecuteTransfersImpl(EnvVars const& ev,
transfer->DstToStr().c_str());
exit(1);
}
+ }
#endif
}
else
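For reference, `hsa_amd_memory_copy_engine_status` reports a bitmask of the SDMA engines usable between two agents, and the check above exits when the requested engine is absent. A hedged sketch of that query, assuming `engineId` is one of the `HSA_AMD_SDMA_ENGINE_*` bitmask values from `hsa_ext_amd.h`:

```cpp
#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>

// Returns true when the given SDMA engine can service copies from srcAgent
// to dstAgent. engineId is assumed to be an HSA_AMD_SDMA_ENGINE_* bit value.
static bool EngineAvailable(hsa_agent_t dstAgent, hsa_agent_t srcAgent,
                            uint32_t engineId)
{
  uint32_t engineIdMask = 0;
  if (hsa_amd_memory_copy_engine_status(dstAgent, srcAgent,
                                        &engineIdMask) != HSA_STATUS_SUCCESS)
    return false;
  return (engineIdMask & engineId) != 0;
}
```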
@@ -822,7 +824,7 @@ cleanup:
}
transfer->subExecParam.clear();
- if (exeType == EXE_GPU_DMA && transfer->exeSubIndex != -1)
+ if (exeType == EXE_GPU_DMA && (transfer->exeSubIndex != -1 || ev.useHsaDma))
{
#if !defined(__NVCC__)
HSA_CHECK(hsa_signal_destroy(transfer->signal));
@@ -1161,6 +1163,15 @@ void DisplayTopology(bool const outputToCsv)
printf("\n");
}
}
+ // Check that large BAR is enabled on all GPUs
+ for (int i = 0; i < numGpuDevices; i++) {
+   int const deviceIdx = RemappedIndex(i, false);
+   int isLargeBar = 0;
+   HIP_CALL(hipDeviceGetAttribute(&isLargeBar, hipDeviceAttributeIsLargeBar, deviceIdx));
+   if (!isLargeBar)
+     printf("[WARN] Large BAR is not enabled for GPU %d in BIOS. This may result in segfaults\n", i);
+ }
#endif
}
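As a standalone illustration, the same probe can be compiled on its own with hipcc; `hipDeviceAttributeIsLargeBar` is an AMD-specific HIP device attribute, and the loop below mirrors the warning added above (minus the PCIe reordering that `RemappedIndex` applies):

```cpp
#include <hip/hip_runtime.h>
#include <cstdio>

int main()
{
  int numGpuDevices = 0;
  if (hipGetDeviceCount(&numGpuDevices) != hipSuccess) return 1;
  for (int i = 0; i < numGpuDevices; i++)
  {
    int isLargeBar = 0;
    if (hipDeviceGetAttribute(&isLargeBar, hipDeviceAttributeIsLargeBar, i) != hipSuccess)
      continue;  // attribute query failed; skip this device
    if (!isLargeBar)
      printf("[WARN] Large BAR is not enabled for GPU %d in BIOS. This may result in segfaults\n", i);
  }
  return 0;
}
```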
@@ -1619,65 +1630,61 @@ void RunTransfer(EnvVars const& ev, int const iteration,
{
int const exeIndex = RemappedIndex(transfer->exeIndex, false);
- if (transfer->exeSubIndex == -1)
- {
+ int subIterations = 0;
+ if (transfer->exeSubIndex == -1 && !ev.useHsaDma) {
// Switch to executing GPU
HIP_CALL(hipSetDevice(exeIndex));
hipStream_t& stream = exeInfo.streams[transferIdx];
hipEvent_t& startEvent = exeInfo.startEvents[transferIdx];
hipEvent_t& stopEvent = exeInfo.stopEvents[transferIdx];
- int subIteration = 0;
HIP_CALL(hipEventRecord(startEvent, stream));
do {
HIP_CALL(hipMemcpyAsync(transfer->dstMem[0], transfer->srcMem[0],
transfer->numBytesActual, hipMemcpyDefault,
stream));
- } while (++subIteration != ev.numSubIterations);
+ } while (++subIterations != ev.numSubIterations);
HIP_CALL(hipEventRecord(stopEvent, stream));
HIP_CALL(hipStreamSynchronize(stream));
- if (iteration >= 0)
- {
-   // Record GPU timing
+ // Record time based on HIP events
+ if (iteration >= 0) {
float gpuDeltaMsec;
HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
//gpuDeltaMsec /= (1.0 * ev.numSubIterations);
transfer->transferTime += gpuDeltaMsec;
if (ev.showIterations)
transfer->perIterationTime.push_back(gpuDeltaMsec);
}
- }
- else
- {
+ } else {
#if defined(__NVCC__)
printf("[ERROR] CUDA does not support targeting specific DMA engines\n");
exit(1);
#else
- // Target specific DMA engine
+ // Use hsa_amd_memory_async_copy (either targeted or untargeted)
auto cpuStart = std::chrono::high_resolution_clock::now();
- int subIterations = 0;
do {
// Atomically set signal to 1
HSA_CALL(hsa_signal_store_screlease(transfer->signal, 1));
+ if (ev.useHsaDma) {
+   HSA_CALL(hsa_amd_memory_async_copy(transfer->dstMem[0], transfer->dstAgent,
+                                      transfer->srcMem[0], transfer->srcAgent,
+                                      transfer->numBytesActual, 0, NULL,
+                                      transfer->signal));
+ } else {
HSA_CALL(hsa_amd_memory_async_copy_on_engine(transfer->dstMem[0], transfer->dstAgent,
                                             transfer->srcMem[0], transfer->srcAgent,
                                             transfer->numBytesActual, 0, NULL,
                                             transfer->signal,
                                             transfer->sdmaEngineId, true));
+ }
// Wait for SDMA transfer to complete
// NOTE: "A wait operation can spuriously resume at any time sooner than the timeout
//        (for example, due to system or other external factors) even when the
//        condition has not been met."
while(hsa_signal_wait_scacquire(transfer->signal,
HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
HSA_WAIT_STATE_ACTIVE) >= 1);
} while (++subIterations < ev.numSubIterations);
- if (iteration >= 0)
- {
+ if (iteration >= 0) {
// Record GPU timing
auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
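Condensed, the HSA path above follows a reset-launch-wait pattern on each sub-iteration: store 1 into the completion signal, issue one asynchronous copy (engine-targeted or not), then spin until the DMA engine decrements the signal. A hedged, self-contained sketch, assuming the agents, buffers, and signal are set up as in TransferBench:

```cpp
#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>

// One blocking copy via the HSA async-copy APIs. useHsaDma selects the
// untargeted variant; otherwise the copy is pinned to engineId.
static void CopyOnce(void* dst, hsa_agent_t dstAgent,
                     void const* src, hsa_agent_t srcAgent,
                     size_t numBytes, hsa_signal_t signal,
                     bool useHsaDma, hsa_amd_sdma_engine_id_t engineId)
{
  // The copy engine decrements the signal to 0 on completion
  hsa_signal_store_screlease(signal, 1);
  if (useHsaDma) {
    hsa_amd_memory_async_copy(dst, dstAgent, src, srcAgent,
                              numBytes, 0, NULL, signal);
  } else {
    hsa_amd_memory_async_copy_on_engine(dst, dstAgent, src, srcAgent,
                                        numBytes, 0, NULL, signal,
                                        engineId, true);
  }
  // Waits can resume spuriously, so loop until the value really drops below 1
  while (hsa_signal_wait_scacquire(signal, HSA_SIGNAL_CONDITION_LT, 1,
                                   UINT64_MAX, HSA_WAIT_STATE_ACTIVE) >= 1);
}
```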
@@ -2106,7 +2113,7 @@ void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, i
transfer.numSubExecs = numSubExecs;
transfer.numSrcs = ev.a2aMode == 2 ? 0 : 1;
transfer.numDsts = ev.a2aMode == 1 ? 0 : 1;
- transfer.exeType = EXE_GPU_GFX;
+ transfer.exeType = (ev.useDmaCopy && transfer.numSrcs == 1 && transfer.numDsts == 1) ? EXE_GPU_DMA : EXE_GPU_GFX;
transfer.exeSubIndex = -1;
transfer.srcType.resize(1, ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
transfer.dstType.resize(1, ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
......
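One detail worth noting in the a2a change: an SDMA engine performs copies, so `USE_GPU_DMA` can only take effect for transfers that have both a source and a destination; sweeps that drop the destination (`a2aMode == 1`) or the source (`a2aMode == 2`) stay on the GFX executor. A hedged restatement of that decision:

```cpp
enum ExeType { EXE_GPU_GFX, EXE_GPU_DMA };

// USE_GPU_DMA only applies to true copies (one source, one destination);
// src-only or dst-only sweeps cannot run on a DMA engine.
static ExeType PickExecutor(int useDmaCopy, int numSrcs, int numDsts)
{
  return (useDmaCopy && numSrcs == 1 && numDsts == 1) ? EXE_GPU_DMA
                                                      : EXE_GPU_GFX;
}
```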
@@ -29,7 +29,7 @@ THE SOFTWARE.
#include "Compatibility.hpp"
#include "Kernels.hpp"
#define TB_VERSION "1.51"
#define TB_VERSION "1.52"
extern char const MemTypeStr[];
extern char const ExeTypeStr[];
@@ -95,6 +95,7 @@ public:
int samplingFactor; // Affects how many different values of N are generated (when N set to 0)
int sharedMemBytes; // Amount of shared memory to use per threadblock
int showIterations; // Show per-iteration timing info
+ int useHsaDma;        // Use hsa_amd_memory_async_copy instead of hipMemcpyAsync for non-targeted DMA executions
int useInteractive; // Pause for user-input before starting transfer loop
int usePcieIndexing; // Base GPU indexing on PCIe address instead of HIP device
int usePrepSrcKernel; // Use GPU kernel to prepare source data instead of copy (can't be used with fillPattern)
@@ -202,6 +203,7 @@ public:
samplingFactor = GetEnvVar("SAMPLING_FACTOR" , DEFAULT_SAMPLING_FACTOR);
sharedMemBytes = GetEnvVar("SHARED_MEM_BYTES" , defaultSharedMemBytes);
showIterations = GetEnvVar("SHOW_ITERATIONS" , 0);
+ useHsaDma = GetEnvVar("USE_HSA_DMA" , 0);
useInteractive = GetEnvVar("USE_INTERACTIVE" , 0);
usePcieIndexing = GetEnvVar("USE_PCIE_INDEX" , 0);
usePrepSrcKernel = GetEnvVar("USE_PREP_KERNEL" , 0);
@@ -612,6 +614,7 @@ public:
printf(" SAMPLING_FACTOR=F - Add F samples (when possible) between powers of 2 when auto-generating data sizes\n");
printf(" SHARED_MEM_BYTES=X - Use X shared mem bytes per threadblock, potentially to avoid multiple threadblocks per CU\n");
printf(" SHOW_ITERATIONS - Show per-iteration timing info\n");
printf(" USE_HSA_DMA - Use hsa_amd_async_copy instead of hipMemcpy for non-targeted DMA execution\n");
printf(" USE_INTERACTIVE - Pause for user-input before starting transfer loop\n");
printf(" USE_PCIE_INDEX - Index GPUs by PCIe address-ordering instead of HIP-provided indexing\n");
printf(" USE_PREP_KERNEL - Use GPU kernel to initialize source data array pattern\n");
@@ -691,6 +694,8 @@ public:
std::string("Using " + std::to_string(sharedMemBytes) + " shared mem per threadblock"));
PRINT_EV("SHOW_ITERATIONS", showIterations,
std::string(showIterations ? "Showing" : "Hiding") + " per-iteration timing");
PRINT_EV("USE_HSA_DMA", useHsaDma,
std::string("Using ") + (useHsaDma ? "hsa_amd_async_copy" : "hipMemcpyAsync") + " for DMA execution");
PRINT_EV("USE_INTERACTIVE", useInteractive,
std::string("Running in ") + (useInteractive ? "interactive" : "non-interactive") + " mode");
PRINT_EV("USE_PCIE_INDEX", usePcieIndexing,
@@ -797,6 +802,8 @@ public:
"Perform write-only"));
PRINT_EV("USE_FINE_GRAIN", useFineGrain,
std::string("Using ") + (useFineGrain ? "fine" : "coarse") + "-grained memory");
PRINT_EV("USE_GPU_DMA", useDmaCopy,
std::string("Using GPU-") + (useDmaCopy ? "DMA" : "GFX") + " as GPU executor");
PRINT_EV("USE_REMOTE_READ", useRemoteRead,
std::string("Using ") + (useRemoteRead ? "DST" : "SRC") + " as executor");
......
@@ -84,24 +84,22 @@ AgentData& GetAgentData()
{
static AgentData agentData = {};
- if (!agentData.isInitialized)
- {
+ if (!agentData.isInitialized) {
agentData.isInitialized = true;
// Add all detected agents to the list
HSA_CHECK(hsa_iterate_agents(AgentInfoCallback, &agentData));
// Loop over each GPU
- for (uint32_t i = 0; i < agentData.gpuAgents.size(); i++)
- {
+ for (uint32_t i = 0; i < agentData.gpuAgents.size(); i++) {
// Collect memory pool
hsa_amd_memory_pool_t pool;
HSA_CHECK(hsa_amd_agent_iterate_memory_pools(agentData.gpuAgents[i], MemPoolInfoCallback, &pool));
// Loop over each CPU agent and check distance
agentData.closestNumaNode[i] = 0;
int bestDistance = -1;
- for (uint32_t j = 0; j < agentData.cpuAgents.size(); j++)
- {
+ for (uint32_t j = 0; j < agentData.cpuAgents.size(); j++) {
// Determine number of hops from GPU memory pool to CPU agent
uint32_t hops = 0;
HSA_CHECK(hsa_amd_agent_memory_pool_get_info(agentData.cpuAgents[j],
@@ -109,6 +107,7 @@ AgentData& GetAgentData()
HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS,
&hops));
// Gather link info
+ if (hops) {
hsa_amd_memory_pool_link_info_t* link_info =
(hsa_amd_memory_pool_link_info_t *)malloc(hops * sizeof(hsa_amd_memory_pool_link_info_t));
HSA_CHECK(hsa_amd_agent_memory_pool_get_info(agentData.cpuAgents[j],
@@ -117,11 +116,9 @@ AgentData& GetAgentData()
link_info));
int numaDist = 0;
for (int k = 0; k < hops; k++)
- {
numaDist += link_info[k].numa_distance;
- }
- if (bestDistance == -1 || numaDist < bestDistance)
- {
+ if (bestDistance == -1 || numaDist < bestDistance) {
agentData.closestNumaNode[i] = j;
bestDistance = numaDist;
}
@@ -129,6 +126,7 @@ AgentData& GetAgentData()
+ }
}
}
}
return agentData;
}
#endif
......
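The leak the changelog mentions appears to stem from the `link_info` buffer being `malloc`'d even when HSA reports 0 hops; guarding on `hops` avoids the allocation entirely. A hedged sketch of the same query using `std::vector`, which cannot leak regardless of the early-out:

```cpp
#include <hsa/hsa.h>
#include <hsa/hsa_ext_amd.h>
#include <vector>

// Sum of NUMA distances over all link hops between a CPU agent and a GPU
// memory pool; returns 0 when HSA reports no hops (the case the fix guards).
static int NumaDistance(hsa_agent_t cpuAgent, hsa_amd_memory_pool_t pool)
{
  uint32_t hops = 0;
  if (hsa_amd_agent_memory_pool_get_info(cpuAgent, pool,
        HSA_AMD_AGENT_MEMORY_POOL_INFO_NUM_LINK_HOPS, &hops) != HSA_STATUS_SUCCESS
      || hops == 0)
    return 0;
  std::vector<hsa_amd_memory_pool_link_info_t> linkInfo(hops);
  if (hsa_amd_agent_memory_pool_get_info(cpuAgent, pool,
        HSA_AMD_AGENT_MEMORY_POOL_INFO_LINK_INFO, linkInfo.data()) != HSA_STATUS_SUCCESS)
    return 0;
  int numaDist = 0;
  for (uint32_t k = 0; k < hops; k++)
    numaDist += linkInfo[k].numa_distance;
  return numaDist;
}
```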