Unverified commit aa801b97, authored by gilbertlee-amd, committed by GitHub

V1.48 - Adding support for targeting specific DMA engines (#87)

* Adding targeted DMA engine support
* Fixing CUDA compilation for H100
parent ef4252bb
...@@ -3,6 +3,18 @@
Documentation for TransferBench is available at
[https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
## v1.48
### Fixes
* Various fixes for TransferBenchCuda
### Additions
* Support for targeting specific DMA engines via executor subindex (e.g. D0.1; see the example below)
* Printing warnings when executors are overcommitted
### Modifications
* `USE_REMOTE_READ` is now supported by the `rwrite` preset benchmark
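As an illustrative sketch of the new subindex syntax (assuming the usual config-line form `<#Transfers> <#SubExecsPerTransfer> (srcMem exeDevice dstMem)`; consult the built-in usage text for the authoritative grammar):

```
# Hypothetical example: one Transfer copying from GPU 0 memory to GPU 1 memory,
# forced onto DMA engine 1 of GPU 0 via the D0.1 subindex
1 1 (G0 D0.1 G1)
```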
## v1.47
### Fixes
...
# Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
if (DEFINED ENV{ROCM_PATH})
  set(ROCM_PATH "$ENV{ROCM_PATH}" CACHE STRING "ROCm install directory")
else()
...
Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
...
...@@ -10,6 +10,7 @@ Documentation for TransferBench is available at
* You must have a ROCm stack installed on your system (HIP runtime)
* You must have `libnuma` installed on your system
* AMD IOMMU must be enabled and set to passthrough for AMD Instinct cards

## Documentation
...@@ -66,11 +67,15 @@ make
* Running TransferBench with no arguments displays usage instructions and detected topology information
* You can use several preset configurations instead of a configuration file:
  * `a2a`    : All-to-all benchmark test
  * `cmdline`: Takes Transfers to run from the command line instead of from a file
  * `p2p`    : Peer-to-peer benchmark test
  * `rsweep` : Random sweep across possible sets of transfers
  * `rwrite` : Benchmarks parallel remote writes from a single GPU
  * `scaling`: GPU subexecutor scaling tests
  * `schmoo` : Local/remote read/write/copy between two GPUs
  * `sweep`  : Sweep across possible sets of transfers
* When using the same GPU executor in multiple simultaneous transfers on separate streams (`USE_SINGLE_STREAM=0`), performance may be serialized due to the maximum number of hardware queues available
  * The maximum number of hardware queues can be adjusted via `GPU_MAX_HW_QUEUES`
  * Alternatively, running in single-stream mode (`USE_SINGLE_STREAM=1`) may avoid this issue by launching all transfers on a single stream rather than on individual streams (see the example invocations below)
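For illustration, hedged example invocations (`example.cfg` and the byte count are placeholders; adjust to your own config file and transfer size):

```
# Raise the hardware-queue limit when running many parallel Transfers
GPU_MAX_HW_QUEUES=8 ./TransferBench example.cfg 67108864

# Or serialize launches onto a single stream to sidestep queue exhaustion
USE_SINGLE_STREAM=1 ./TransferBench example.cfg 67108864
```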
...@@ -13,7 +13,7 @@ else
endif
CXXFLAGS = -O3 -Iinclude -I$(ROCM_PATH)/include -lnuma -L$(ROCM_PATH)/lib -lhsa-runtime64
NVFLAGS  = -O3 -Iinclude -x cu -lnuma -arch=native
LDFLAGS += -lpthread
all: $(EXE)
...
/*
Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
...@@ -161,7 +161,7 @@ int main(int argc, char **argv)
int numSubExecs = (argc > 3 ? atoi(argv[3]) : 4);
int srcIdx      = (argc > 4 ? atoi(argv[4]) : 0);
int minGpus     = (argc > 5 ? atoi(argv[5]) : 1);
int maxGpus     = (argc > 6 ? atoi(argv[6]) : ev.numGpuDevices - 1);
for (int N = 256; N <= (1<<27); N *= 2)
{
...@@ -358,6 +358,9 @@ void ExecuteTransfers(EnvVars const& ev,
{
#if !defined(__NVCC__)
HIP_CALL(hipExtStreamCreateWithCUMask(&exeInfo.streams[i], ev.cuMask.size(), ev.cuMask.data()));
#else
printf("[ERROR] CU Masking in not supported on NVIDIA hardware\n");
exit(-1);
#endif
}
else
...@@ -376,9 +379,98 @@ void ExecuteTransfers(EnvVars const& ev,
AllocateMemory(MEM_GPU, exeIndex, exeInfo.totalSubExecs * sizeof(SubExecParam),
(void**)&exeInfo.subExecParamGpu);
#else
AllocateMemory(MEM_MANAGED, exeIndex, exeInfo.totalSubExecs * sizeof(SubExecParam),
(void**)&exeInfo.subExecParamGpu);
#endif
// Check for sufficient subExecutors
int numDeviceCUs = 0;
HIP_CALL(hipDeviceGetAttribute(&numDeviceCUs, hipDeviceAttributeMultiprocessorCount, exeIndex));
if (exeInfo.totalSubExecs > numDeviceCUs)
{
printf("[WARN] GFX executor %d requesting %d total subexecutors, however only has %d. Some Transfers may be serialized\n",
exeIndex, exeInfo.totalSubExecs, numDeviceCUs);
}
}
// Check for targeted DMA
if (exeType == EXE_GPU_DMA)
{
bool useRandomDma = false;
bool useTargetDma = false;
// Check for sufficient hardware queues
#if !defined(__NVCC__)
if (exeInfo.transfers.size() > ev.gpuMaxHwQueues)
{
printf("[WARN] DMA executor %d attempting %lu parallel transfers, however GPU_MAX_HW_QUEUES only set to %d\n",
exeIndex, exeInfo.transfers.size(), ev.gpuMaxHwQueues);
}
#endif
for (Transfer* transfer : exeInfo.transfers)
{
if (transfer->exeSubIndex != -1)
{
useTargetDma = true;
#if defined(__NVCC__)
printf("[ERROR] DMA executor subindex not supported on NVIDIA hardware\n");
exit(-1);
#else
if (transfer->numSrcs != 1 || transfer->numDsts != 1)
{
printf("[ERROR] DMA Transfer must have at exactly one source and one destination");
exit(1);
}
// Collect HSA agent information
hsa_amd_pointer_info_t info;
info.size = sizeof(info);
HSA_CHECK(hsa_amd_pointer_info(transfer->dstMem[0], &info, NULL, NULL, NULL));
transfer->dstAgent = info.agentOwner;
HSA_CHECK(hsa_amd_pointer_info(transfer->srcMem[0], &info, NULL, NULL, NULL));
transfer->srcAgent = info.agentOwner;
// Create HSA completion signal
HSA_CHECK(hsa_signal_create(1, 0, NULL, &transfer->signal));
// Check for valid engine Id
if (transfer->exeSubIndex < -1 || transfer->exeSubIndex >= 32)
{
printf("[ERROR] DMA executor subindex must be between 0 and 31\n");
exit(1);
}
// Check that engine Id exists between agents
uint32_t engineIdMask = 0;
HSA_CHECK(hsa_amd_memory_copy_engine_status(transfer->dstAgent,
transfer->srcAgent,
&engineIdMask));
transfer->sdmaEngineId = (hsa_amd_sdma_engine_id_t)(1U << transfer->exeSubIndex);
if (!(transfer->sdmaEngineId & engineIdMask))
{
printf("[ERROR] DMA executor %d.%d does not exist or cannot copy between source %s to destination %s\n",
transfer->exeIndex, transfer->exeSubIndex,
transfer->SrcToStr().c_str(),
transfer->DstToStr().c_str());
exit(1);
}
#endif
}
else
{
useRandomDma = true;
}
}
if (useRandomDma && useTargetDma)
{
printf("[WARN] Mixing targeted and untargetted DMA execution on GPU %d may result in resource conflicts\n",
exeIndex);
}
}
}
}
...@@ -618,7 +710,7 @@ void ExecuteTransfers(EnvVars const& ev,
totalCUs += transfer->numSubExecs;
char exeSubIndexStr[32] = "";
if (ev.useXccFilter || transfer->exeType == EXE_GPU_DMA)
{
if (transfer->exeSubIndex == -1)
sprintf(exeSubIndexStr, ".*");
...@@ -719,7 +811,7 @@ void ExecuteTransfers(EnvVars const& ev,
char exeSubIndexStr[32] = "";
if (ev.useXccFilter)
{
if (transfer->exeSubIndex == -1 || transfer->exeType == EXE_GPU_DMA)
sprintf(exeSubIndexStr, ".*");
else
sprintf(exeSubIndexStr, ".%d", transfer->exeSubIndex);
...@@ -828,6 +920,13 @@ cleanup:
DeallocateMemory(dstType, transfer->dstMem[iDst], transfer->numBytesActual + ev.byteOffset);
}
transfer->subExecParam.clear();
if (exeType == EXE_GPU_DMA && transfer->exeSubIndex != -1)
{
#if !defined(__NVCC__)
HSA_CHECK(hsa_signal_destroy(transfer->signal));
#endif
}
}
if (IsGpuType(exeType))
...@@ -845,7 +944,7 @@ cleanup:
#if !defined(__NVCC__)
DeallocateMemory(MEM_GPU, exeInfo.subExecParamGpu);
#else
DeallocateMemory(MEM_MANAGED, exeInfo.subExecParamGpu);
#endif
}
}
...@@ -1015,9 +1114,52 @@ void DisplayTopology(bool const outputToCsv)
printf("\n");
#if defined(__NVCC__)
for (int i = 0; i < numGpuDevices; i++)
{
hipDeviceProp_t prop;
HIP_CALL(hipGetDeviceProperties(&prop, i));
printf(" GPU %02d | %s\n", i, prop.name);
}
// No further topology detection done for NVIDIA platforms
return;
#else
// Figure out DMA engines per GPU
std::vector<std::set<int>> dmaEngineIdsPerDevice(numGpuDevices);
{
std::vector<hsa_agent_t> agentList;
hsa_amd_pointer_info_t info;
info.size = sizeof(info);
for (int deviceId = 0; deviceId < numGpuDevices; deviceId++)
{
HIP_CALL(hipSetDevice(deviceId));
int32_t* tempBuffer;
HIP_CALL(hipMalloc((void**)&tempBuffer, 1024));
HSA_CHECK(hsa_amd_pointer_info(tempBuffer, &info, NULL, NULL, NULL));
agentList.push_back(info.agentOwner);
HIP_CALL(hipFree(tempBuffer));
}
for (int srcDevice = 0; srcDevice < numGpuDevices; srcDevice++)
{
dmaEngineIdsPerDevice[srcDevice].clear();
for (int dstDevice = 0; dstDevice < numGpuDevices; dstDevice++)
{
if (srcDevice == dstDevice) continue;
uint32_t engineIdMask = 0;
if (hsa_amd_memory_copy_engine_status(agentList[dstDevice],
agentList[srcDevice],
&engineIdMask) != HSA_STATUS_SUCCESS)
continue;
for (int engineId = 0; engineId < 32; engineId++)
{
if (engineIdMask & (1U << engineId))
dmaEngineIdsPerDevice[srcDevice].insert(engineId);
}
}
}
}
// Print out detected GPU topology
if (outputToCsv)
...@@ -1025,7 +1167,7 @@ void DisplayTopology(bool const outputToCsv)
printf("GPU");
for (int j = 0; j < numGpuDevices; j++)
printf(",GPU %02d", j);
printf(",PCIe Bus ID,ClosestNUMA,DMA engines\n");
}
else
{
...@@ -1042,13 +1184,12 @@ void DisplayTopology(bool const outputToCsv)
printf(" |");
for (int j = 0; j < numGpuDevices; j++)
printf(" GPU %02d |", j);
printf(" PCIe Bus ID | #CUs | Closest NUMA | DMA engines\n");
for (int j = 0; j <= numGpuDevices; j++)
printf("--------+");
printf("--------------+------+-------------+------------\n");
}
char pciBusId[20];
for (int i = 0; i < numGpuDevices; i++)
{
...@@ -1085,9 +1226,19 @@ void DisplayTopology(bool const outputToCsv)
HIP_CALL(hipDeviceGetAttribute(&numDeviceCUs, hipDeviceAttributeMultiprocessorCount, deviceIdx));
if (outputToCsv)
printf("%s,%d,%d,", pciBusId, numDeviceCUs, GetClosestNumaNode(deviceIdx));
else
{
printf(" %11s | %4d | %-12d |", pciBusId, numDeviceCUs, GetClosestNumaNode(deviceIdx));
bool isFirst = true;
for (auto x : dmaEngineIdsPerDevice[deviceIdx])
{
if (isFirst) isFirst = false; else printf(",");
printf("%d", x);
}
printf("\n");
}
} }
#endif
}
...@@ -1324,6 +1475,7 @@ void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr)
// Check that the allocated pages are actually on the correct NUMA node
memset(*memPtr, 0, numBytes);
CheckPages((char*)*memPtr, numBytes, devIndex);
// Reset to default numa mem policy
...@@ -1344,13 +1496,15 @@ void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr)
exit(1);
#else
HIP_CALL(hipSetDevice(devIndex));
int flag = hipDeviceMallocUncached;
HIP_CALL(hipExtMallocWithFlags((void**)memPtr, numBytes, flag));
#endif
}
else if (memType == MEM_MANAGED)
{
HIP_CALL(hipSetDevice(devIndex));
HIP_CALL(hipMallocManaged((void**)memPtr, numBytes));
}
HIP_CALL(hipMemset(*memPtr, 0, numBytes));
HIP_CALL(hipDeviceSynchronize());
}
...@@ -1390,6 +1544,15 @@ void DeallocateMemory(MemType memType, void* memPtr, size_t const bytes)
}
HIP_CALL(hipFree(memPtr));
}
else if (memType == MEM_MANAGED)
{
if (memPtr == nullptr)
{
printf("[ERROR] Attempting to free null managed pointer for %lu bytes. Skipping hipMFree\n", bytes);
return;
}
HIP_CALL(hipFree(memPtr));
}
}
void CheckPages(char* array, size_t numBytes, int targetId)
...@@ -1432,11 +1595,15 @@ void CheckPages(char* array, size_t numBytes, int targetId)
uint32_t GetId(uint32_t hwId)
{
#if defined(__NVCC__)
return hwId;
#else
// Based on instinct-mi200-cdna2-instruction-set-architecture.pdf
int const shId = (hwId >> 12) & 1;
int const cuId = (hwId >> 8) & 15;
int const seId = (hwId >> 13) & 3;
return (shId << 5) + (cuId << 2) + seId;
#endif
}
void RunTransfer(EnvVars const& ev, int const iteration,
...@@ -1529,37 +1696,75 @@ void RunTransfer(EnvVars const& ev, int const iteration,
}
else if (transfer->exeType == EXE_GPU_DMA)
{
int const exeIndex = RemappedIndex(transfer->exeIndex, false);

if (transfer->exeSubIndex == -1)
{
  // Switch to executing GPU
  HIP_CALL(hipSetDevice(exeIndex));
  hipStream_t& stream = exeInfo.streams[transferIdx];
  hipEvent_t& startEvent = exeInfo.startEvents[transferIdx];
  hipEvent_t& stopEvent = exeInfo.stopEvents[transferIdx];

  HIP_CALL(hipEventRecord(startEvent, stream));
  if (transfer->numSrcs == 0 && transfer->numDsts == 1)
  {
    HIP_CALL(hipMemsetAsync(transfer->dstMem[0],
                            MEMSET_CHAR, transfer->numBytesActual, stream));
  }
  else if (transfer->numSrcs == 1 && transfer->numDsts == 1)
  {
    HIP_CALL(hipMemcpyAsync(transfer->dstMem[0], transfer->srcMem[0],
                            transfer->numBytesActual, hipMemcpyDefault,
                            stream));
  }
  HIP_CALL(hipEventRecord(stopEvent, stream));
  HIP_CALL(hipStreamSynchronize(stream));

  if (iteration >= 0)
  {
    // Record GPU timing
    float gpuDeltaMsec;
    HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
    transfer->transferTime += gpuDeltaMsec;
    if (ev.showIterations)
      transfer->perIterationTime.push_back(gpuDeltaMsec);
  }
}
else
{
#if defined(__NVCC__)
  printf("[ERROR] CUDA does not support targeting specific DMA engines\n");
  exit(1);
#else
  // Target specific DMA engine
  // Atomically set signal to 1
  HSA_CHECK(hsa_signal_store_screlease(transfer->signal, 1));

  auto cpuStart = std::chrono::high_resolution_clock::now();
  HSA_CHECK(hsa_amd_memory_async_copy_on_engine(transfer->dstMem[0], transfer->dstAgent,
                                                transfer->srcMem[0], transfer->srcAgent,
                                                transfer->numBytesActual, 0, NULL,
                                                transfer->signal,
                                                transfer->sdmaEngineId, true));

  // Wait for SDMA transfer to complete
  // NOTE: "A wait operation can spuriously resume at any time sooner than the timeout
  //        (for example, due to system or other external factors) even when the
  //        condition has not been met."
  while (hsa_signal_wait_scacquire(transfer->signal,
                                   HSA_SIGNAL_CONDITION_LT, 1, UINT64_MAX,
                                   HSA_WAIT_STATE_ACTIVE) >= 1);

  if (iteration >= 0)
  {
    // Record timing (CPU wall clock)
    auto cpuDelta = std::chrono::high_resolution_clock::now() - cpuStart;
    double deltaMsec = std::chrono::duration_cast<std::chrono::duration<double>>(cpuDelta).count() * 1000.0;
    transfer->transferTime += deltaMsec;
    if (ev.showIterations)
      transfer->perIterationTime.push_back(deltaMsec);
  }
#endif
}
}
else if (transfer->exeType == EXE_CPU) // CPU execution agent
...@@ -2260,6 +2465,8 @@ bool Transfer::PrepareSrc(EnvVars const& ev)
ExeTypeStr[this->exeType], this->exeIndex,
this->numSubExecs,
this->DstToStr().c_str());
printf("[ERROR] Possible cause is misconfigured IOMMU (AMD Instinct cards require amd_iommu=on and iommu=pt)\n");
printf("[ERROR] Please see https://community.amd.com/t5/knowledge-base/iommu-advisory-for-amd-instinct/ta-p/484601 for more details\n");
if (!ev.continueOnError)
exit(1);
return false;
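As a hedged pointer for applying the IOMMU settings mentioned in the error message above (the exact procedure varies by distribution and bootloader; treat this as a sketch, not official guidance):

```
# /etc/default/grub (hypothetical example): append the kernel parameters noted above
GRUB_CMDLINE_LINUX_DEFAULT="... amd_iommu=on iommu=pt"
# Then regenerate the bootloader config and reboot, e.g. on Ubuntu:
#   sudo update-grub && sudo reboot
```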
...@@ -2461,7 +2668,8 @@ void RunSchmooBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int
void RunRemoteWriteBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int numSubExecs, int const srcIdx, int minGpus, int maxGpus)
{
printf("Bytes to %s: %lu from GPU %d using %d CUs [Sweeping %d to %d parallel writes]\n",
       ev.useRemoteRead ? "read" : "write", numBytesPerTransfer, srcIdx, numSubExecs, minGpus, maxGpus);
char sep = (ev.outputToCsv ? ',' : ' ');
...@@ -2493,17 +2701,31 @@ void RunRemoteWriteBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer
if (bitmask & (1<<i))
{
Transfer t;
t.exeType     = EXE_GPU_GFX;
t.exeSubIndex = -1;
t.numSubExecs = numSubExecs;
t.numBytes    = numBytesPerTransfer;

if (ev.useRemoteRead)
{
  t.numSrcs  = 1;
  t.numDsts  = 0;
  t.exeIndex = i;
  t.srcType.resize(1);
  t.srcType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
  t.srcIndex.resize(1);
  t.srcIndex[0] = srcIdx;
}
else
{
  t.numSrcs  = 0;
  t.numDsts  = 1;
  t.exeIndex = srcIdx;
  t.dstType.resize(1);
  t.dstType[0] = (ev.useFineGrain ? MEM_GPU_FINE : MEM_GPU);
  t.dstIndex.resize(1);
  t.dstIndex[0] = i;
}
transfers.push_back(t);
}
}
...@@ -2521,7 +2743,10 @@ void RunRemoteWriteBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer
printf(" %d %d", p, numSubExecs);
for (auto i = 0; i < transfers.size(); i++)
{
printf(" (N0 G%d %c%d)", srcIdx, MemTypeStr[transfers[i].dstType[0]], transfers[i].dstIndex[0]); printf(" (%s %c%d %s)",
transfers[i].SrcToStr().c_str(),
MemTypeStr[transfers[i].exeType], transfers[i].exeIndex,
transfers[i].DstToStr().c_str());
}
printf("\n");
}
...
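A hedged example of exercising the updated preset (argument order follows the `main` parsing shown earlier: bytes, subexecutors, source GPU index, then the min/max parallel-GPU sweep bounds; the byte count is a placeholder):

```
# Sweep 1..7 GPUs reading 64 MiB from GPU 0's memory in parallel, 4 CUs per Transfer
USE_REMOTE_READ=1 ./TransferBench rwrite 67108864 4 0 1 7
```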
/*
Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
...@@ -22,6 +22,18 @@ THE SOFTWARE.
#pragma once
// Helper macro for catching HIP errors
#define HIP_CALL(cmd) \
do { \
hipError_t error = (cmd); \
if (error != hipSuccess) \
{ \
std::cerr << "Encountered HIP error (" << hipGetErrorString(error) \
<< ") at line " << __LINE__ << " in file " << __FILE__ << "\n"; \
exit(-1); \
} \
} while (0)
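Typical usage simply wraps each HIP runtime call, for example:

```
HIP_CALL(hipSetDevice(0));  // exits with a file/line diagnostic on failure
```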
#if defined(__NVCC__)
#include <cuda_runtime.h>
...@@ -65,6 +77,7 @@ THE SOFTWARE.
#define hipHostFree cudaFreeHost
#define hipHostMalloc cudaMallocHost
#define hipMalloc cudaMalloc
#define hipMallocManaged cudaMallocManaged
#define hipMemcpy cudaMemcpy
#define hipMemcpyAsync cudaMemcpyAsync
#define hipMemset cudaMemset
...
/*
Copyright (c) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
...@@ -29,7 +29,7 @@ THE SOFTWARE.
#include "Compatibility.hpp"
#include "Kernels.hpp"
#define TB_VERSION "1.48"
extern char const MemTypeStr[];
extern char const ExeTypeStr[];
...@@ -130,6 +130,7 @@ public:
// Developer features
int enableDebug; // Enable debug output
int gpuMaxHwQueues; // Tracks GPU_MAX_HW_QUEUES environment variable
// Used to track current configuration mode
ConfigModeEnum configMode;
...@@ -202,6 +203,7 @@ public:
useXccFilter   = GetEnvVar("USE_XCC_FILTER" , 0);
validateDirect = GetEnvVar("VALIDATE_DIRECT" , 0);
enableDebug    = GetEnvVar("DEBUG" , 0);
gpuMaxHwQueues = GetEnvVar("GPU_MAX_HW_QUEUES" , 4);
// P2P Benchmark related
useDmaCopy = GetEnvVar("USE_GPU_DMA" , 0); // Needed for numGpuSubExec
...@@ -790,6 +792,9 @@ public:
printf("[Remote-Write Related]\n");
PRINT_EV("USE_FINE_GRAIN", useFineGrain,
         std::string("Using ") + (useFineGrain ? "fine" : "coarse") + "-grained memory");
PRINT_EV("USE_REMOTE_READ", useRemoteRead,
std::string("Performing remote ") + (useRemoteRead ? "reads" : "writes"));
printf("\n");
}
...
/*
Copyright (c) 2021-2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
...@@ -20,6 +20,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
// Helper macro for checking HSA calls
#define HSA_CHECK(cmd) \
do { \
...
/*
Copyright (c) 2022-2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
...@@ -62,12 +62,15 @@ struct SubExecParam
};
// Macro for collecting HW_REG_HW_ID
#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__)
#define GetHwId(hwId) \
  hwId = 0
#elif defined(__NVCC__)
#define GetHwId(hwId) \
  asm("mov.u32 %0, %%smid;" : "=r"(hwId))
#else
#define GetHwId(hwId) \
  asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (hwId));
#endif
// Macro for collecting HW_REG_XCC_ID
...@@ -284,10 +287,8 @@ __global__ void __launch_bounds__(BLOCKSIZE)
__threadfence_system();
p.stopCycle  = GetTimestamp();
p.startCycle = startCycle;
GetHwId(p.hwId);
GetXccId(p.xccId);
}
}
...
/*
Copyright (c) 2019-2024 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
...@@ -19,6 +19,7 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
#pragma once
#include <vector>
#include <sstream>
...@@ -31,21 +32,7 @@ THE SOFTWARE.
#include <map>
#include <iostream>
#include <sstream>
#include "Compatibility.hpp"
#include "EnvVars.hpp" #include "EnvVars.hpp"
// Simple configuration parameters // Simple configuration parameters
...@@ -62,6 +49,7 @@ typedef enum ...@@ -62,6 +49,7 @@ typedef enum
MEM_GPU_FINE = 3, // Fine-grained global GPU memory MEM_GPU_FINE = 3, // Fine-grained global GPU memory
MEM_CPU_UNPINNED = 4, // Unpinned CPU memory MEM_CPU_UNPINNED = 4, // Unpinned CPU memory
MEM_NULL = 5, // NULL memory - used for empty MEM_NULL = 5, // NULL memory - used for empty
MEM_MANAGED      = 6  // Managed (unified) memory, allocated via hipMallocManaged
} MemType;

typedef enum
...@@ -71,12 +59,12 @@ typedef enum
EXE_GPU_DMA = 2, // GPU SDMA-based executor (subExecutor = streams)
} ExeType;
bool IsGpuType(MemType m) { return (m == MEM_GPU || m == MEM_GPU_FINE || m == MEM_MANAGED); }
bool IsCpuType(MemType m) { return (m == MEM_CPU || m == MEM_CPU_FINE || m == MEM_CPU_UNPINNED); };
bool IsGpuType(ExeType e) { return (e == EXE_GPU_GFX || e == EXE_GPU_DMA); };
bool IsCpuType(ExeType e) { return (e == EXE_CPU); };

char const MemTypeStr[8] = "CGBFUNM";
char const ExeTypeStr[4] = "CGD";
char const ExeTypeName[3][4] = {"CPU", "GPU", "DMA"};
...@@ -129,6 +117,14 @@ struct Transfer
SubExecParam* subExecParamGpuPtr; // Pointer to GPU copy of subExecParam
std::vector<int> subExecIdx;      // Indices into subExecParamGpu
#if !defined(__NVCC__)
// For targeted-SDMA
hsa_agent_t dstAgent; // DMA destination memory agent
hsa_agent_t srcAgent; // DMA source memory agent
hsa_signal_t signal; // HSA signal for completion
hsa_amd_sdma_engine_id_t sdmaEngineId; // DMA engine ID
#endif
// Prepares src/dst subarray pointers for each SubExecutor
void PrepareSubExecParams(EnvVars const& ev);
...