Unverified commit 08997e88 authored by gilbertlee-amd, committed by GitHub

v1.36 - Adding experimental USE_XCC_FILTER (#67)

parent e047656f
@@ -3,6 +3,13 @@
Documentation for TransferBench is available at
[https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
+## v1.36
+### Additions
+* (Experimental) Adding XCC filtering - combined with XCC_PREF_TABLE, this tries to select
+  specific XCCs to use for specific (SRC->DST) Transfers
## v1.35
### Additions
......
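A hedged usage sketch for the feature added above (assumptions: a hypothetical 2-GPU system; TransferBench reads these knobs from the environment, per the GetEnvVar/getenv calls later in this commit):

    // Illustration only, not from this commit. XCC_PREF_TABLE is parsed as a
    // comma-separated, src-major numGpuDevices x numGpuDevices matrix of XCC IDs.
    #include <cstdlib>
    int main()
    {
      setenv("USE_XCC_FILTER", "1", 1);        // enable the experimental filter
      setenv("XCC_PREF_TABLE", "0,1,2,3", 1);  // GPU0->GPU0: XCC 0, GPU0->GPU1: XCC 1,
                                               // GPU1->GPU0: XCC 2, GPU1->GPU1: XCC 3
      return 0;
    }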
@@ -259,7 +259,7 @@ void ExecuteTransfers(EnvVars const& ev,
int const srcIndex = RemappedIndex(transfer->srcIndex[iSrc], IsCpuType(srcType));
// Ensure executing GPU can access source memory
-if (IsGpuType(exeType) == MEM_GPU && IsGpuType(srcType) && srcIndex != exeIndex)
+if (IsGpuType(exeType) && IsGpuType(srcType) && srcIndex != exeIndex)
EnablePeerAccess(exeIndex, srcIndex);
AllocateMemory(srcType, srcIndex, transfer->numBytesActual + ev.byteOffset, (void**)&transfer->srcMem[iSrc]);
@@ -273,7 +273,7 @@ void ExecuteTransfers(EnvVars const& ev,
int const dstIndex = RemappedIndex(transfer->dstIndex[iDst], IsCpuType(dstType));
// Ensure executing GPU can access destination memory
-if (IsGpuType(exeType) == MEM_GPU && IsGpuType(dstType) && dstIndex != exeIndex)
+if (IsGpuType(exeType) && IsGpuType(dstType) && dstIndex != exeIndex)
EnablePeerAccess(exeIndex, dstIndex);
AllocateMemory(dstType, dstIndex, transfer->numBytesActual + ev.byteOffset, (void**)&transfer->dstMem[iDst]);
@@ -1362,13 +1362,15 @@ void RunTransfer(EnvVars const& ev, int const iteration,
// In single stream mode, all the threadblocks for this GPU are launched
// Otherwise, just launch the threadblocks associated with this single Transfer
int const numBlocksToRun = ev.useSingleStream ? exeInfo.totalSubExecs : transfer->numSubExecs;
+int const numXCCs = (ev.useXccFilter ? ev.xccIdsPerDevice[exeIndex].size() : 1);
#if defined(__NVCC__)
HIP_CALL(hipEventRecord(startEvent, stream));
GpuKernelTable[ev.gpuKernel]<<<numBlocksToRun, ev.blockSize, ev.sharedMemBytes, stream>>>(transfer->subExecParamGpuPtr);
HIP_CALL(hipEventRecord(stopEvent, stream));
#else
hipExtLaunchKernelGGL(GpuKernelTable[ev.gpuKernel],
-dim3(numBlocksToRun, 1, 1),
+dim3(numXCCs, numBlocksToRun, 1),
dim3(ev.blockSize, 1, 1),
ev.sharedMemBytes, stream,
startEvent, stopEvent,
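A hedged reading of the launch change above: the grid gains an x dimension of numXCCs, so each sub-exec (now selected by blockIdx.y) is replicated once per detected XCC, and the kernel-side filter added further down lets only the replica that lands on the preferred XCC do the work. In sketch form:

    // Sketch of the new launch geometry (names as in the hunk above):
    //   x: per-XCC replica of a sub-exec (may early-exit in the kernel)
    //   y: sub-exec index (this was the x dimension before this commit)
    dim3 grid(numXCCs, numBlocksToRun, 1);
    dim3 block(ev.blockSize, 1, 1);

One apparent caveat (unverified): block-to-XCC placement is decided by the hardware scheduler, so a replica is expected, not guaranteed, to land on each XCC.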
@@ -1999,6 +2001,15 @@ void Transfer::PrepareSubExecParams(EnvVars const& ev)
for (int iDst = 0; iDst < this->numDsts; ++iDst)
p.dst[iDst] = this->dstMem[iDst] + assigned + initOffset;
+p.preferredXccId = -1;
+if (ev.useXccFilter)
+{
+  if (this->exeType == EXE_GPU_GFX && this->numDsts == 1 && IsGpuType(this->dstType[0]))
+  {
+    p.preferredXccId = ev.prefXccTable[this->exeIndex][this->dstIndex[0]];
+  }
+}
if (ev.enableDebug)
{
printf("Transfer %02d SE:%02d: %10lu floats: %10lu to %10lu\n",
......
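A hedged reading of the hunk above: the preferred XCC is only wired up for GFX-executor Transfers with exactly one GPU destination; every other Transfer keeps the sentinel and may run on any XCC.

    // Restating the condition above (assumption: -1 is the "no preference" sentinel):
    //   EXE_GPU_GFX && numDsts == 1 && GPU dst -> preferredXccId = prefXccTable[exeIndex][dstIndex[0]]
    //   anything else                          -> preferredXccId = -1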
@@ -29,7 +29,7 @@ THE SOFTWARE.
#include "Compatibility.hpp"
#include "Kernels.hpp"
-#define TB_VERSION "1.35"
+#define TB_VERSION "1.36"
extern char const MemTypeStr[];
extern char const ExeTypeStr[];
@@ -91,10 +91,12 @@ public:
int usePcieIndexing; // Base GPU indexing on PCIe address instead of HIP device
int usePrepSrcKernel; // Use GPU kernel to prepare source data instead of copy (can't be used with fillPattern)
int useSingleStream; // Use a single stream per GPU GFX executor instead of stream per Transfer
+int useXccFilter; // Use XCC filtering (experimental)
int validateDirect; // Validate GPU destination memory directly instead of staging GPU memory on host
std::vector<float> fillPattern; // Pattern of floats used to fill source data
std::vector<uint32_t> cuMask; // Bit-vector representing the CU mask
+std::vector<std::vector<int>> prefXccTable; // Preferred XCC ID for each (src GPU, dst GPU) pair
// Environment variables only for P2P preset
int numCpuSubExecs; // Number of CPU subexecutors to use
@@ -135,6 +137,8 @@ public:
std::vector<int> wallClockPerDeviceMhz;
+std::vector<std::set<int>> xccIdsPerDevice; // Set of distinct XCC IDs detected per GPU
// Constructor that collects values
EnvVars()
{
@@ -187,6 +191,7 @@ public:
usePcieIndexing = GetEnvVar("USE_PCIE_INDEX" , 0);
usePrepSrcKernel = GetEnvVar("USE_PREP_KERNEL" , 0);
useSingleStream = GetEnvVar("USE_SINGLE_STREAM" , 0);
+useXccFilter = GetEnvVar("USE_XCC_FILTER" , 0);
validateDirect = GetEnvVar("VALIDATE_DIRECT" , 0);
enableDebug = GetEnvVar("DEBUG" , 0);
gpuKernel = GetEnvVar("GPU_KERNEL" , defaultGpuKernel);
@@ -327,6 +332,60 @@ public:
#endif
}
+// Figure out the set of XCC IDs available on each device
+int maxNumXccs = 64;  // Upper bound on XCCs probed (one sampling block each)
+xccIdsPerDevice.resize(numGpuDevices);
+for (int i = 0; i < numGpuDevices; i++)
+{
+  int* data;
+  HIP_CALL(hipSetDevice(i));
+  HIP_CALL(hipHostMalloc((void**)&data, maxNumXccs * sizeof(int)));
+  CollectXccIdsKernel<<<maxNumXccs, 1>>>(data);
+  HIP_CALL(hipDeviceSynchronize());
+  xccIdsPerDevice[i].clear();
+  for (int j = 0; j < maxNumXccs; j++)
+    xccIdsPerDevice[i].insert(data[j]);
+  HIP_CALL(hipHostFree(data));
+}
+// Parse preferred XCC table (if provided)
+{
+  prefXccTable.resize(numGpuDevices);
+  for (int i = 0; i < numGpuDevices; i++)
+  {
+    prefXccTable[i].resize(numGpuDevices, 0);
+  }
+  char* prefXccStr = getenv("XCC_PREF_TABLE");
+  // Guard against an unset variable: calling strtok(NULL, ",") here would be invalid
+  char* token = prefXccStr ? strtok(prefXccStr, ",") : NULL;
+  int tokenCount = 0;
+  while (token)
+  {
+    int xccId;
+    if (sscanf(token, "%d", &xccId) == 1)
+    {
+      int src = tokenCount / numGpuDevices;
+      int dst = tokenCount % numGpuDevices;
+      if (xccIdsPerDevice[src].count(xccId) == 0)
+      {
+        printf("[ERROR] GPU %d does not contain XCC %d\n", src, xccId);
+        exit(1);
+      }
+      prefXccTable[src][dst] = xccId;
+      tokenCount++;
+      if (tokenCount == (numGpuDevices * numGpuDevices)) break;
+    }
+    else
+    {
+      printf("[ERROR] Unrecognized token [%s]\n", token);
+      exit(1);
+    }
+    token = strtok(NULL, ",");
+  }
+}
// Perform some basic validation
if (numCpuDevices > numDetectedCpus)
{
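As a worked example of the token-to-cell mapping above (assuming a hypothetical numGpuDevices == 4):

    // src = tokenCount / numGpuDevices, dst = tokenCount % numGpuDevices:
    //   token 0 -> (src 0, dst 0)   token 5 -> (src 1, dst 1)   token 6 -> (src 1, dst 2)
    // At most numGpuDevices * numGpuDevices = 16 values are consumed; each must
    // name an XCC that actually exists on the source GPU, else the run aborts.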
@@ -503,6 +562,7 @@ public:
printf(" USE_PCIE_INDEX - Index GPUs by PCIe address-ordering instead of HIP-provided indexing\n");
printf(" USE_PREP_KERNEL - Use GPU kernel to initialize source data array pattern\n");
printf(" USE_SINGLE_STREAM - Use a single stream per GPU GFX executor instead of stream per Transfer\n");
+printf(" USE_XCC_FILTER - Use XCC filtering (experimental)\n");
printf(" VALIDATE_DIRECT - Validate GPU destination memory directly instead of staging GPU memory on host\n");
}
@@ -519,8 +579,8 @@ public:
if (!outputToCsv)
{
printf("TransferBench v%s\n", TB_VERSION);
-printf("=====================================================\n");
-if (!hideEnv) printf("[Common]\n");
+printf("===============================================================\n");
+if (!hideEnv) printf("[Common] (Suppress by setting HIDE_ENV=1)\n");
}
else if (!hideEnv)
printf("EnvVar,Value,Description,(TransferBench v%s)\n", TB_VERSION);
@@ -566,6 +626,21 @@ public:
std::string("Using ") + (usePrepSrcKernel ? "GPU kernels" : "hipMemcpy") + " to initialize source data");
PRINT_EV("USE_SINGLE_STREAM", useSingleStream,
std::string("Using single stream per ") + (useSingleStream ? "device" : "Transfer"));
+PRINT_EV("USE_XCC_FILTER", useXccFilter,
+         std::string("XCC filtering ") + (useXccFilter ? "enabled" : "disabled"));
+if (useXccFilter)
+{
+  printf("%36s: Preferred XCC Table (XCC_PREF_TABLE)\n", "");
+  printf("%36s: ", "");
+  for (int i = 0; i < numGpuDevices; i++) printf(" %3d", i); printf(" (#XCCs)\n");
+  for (int i = 0; i < numGpuDevices; i++)
+  {
+    printf("%36s: GPU %3d ", "", i);
+    for (int j = 0; j < numGpuDevices; j++)
+      printf(" %3d", prefXccTable[i][j]);
+    printf(" %3lu\n", xccIdsPerDevice[i].size());
+  }
+}
PRINT_EV("VALIDATE_DIRECT", validateDirect,
std::string("Validate GPU destination memory ") + (validateDirect ? "directly" : "via CPU staging buffer"));
printf("\n");
......
@@ -40,6 +40,7 @@ struct SubExecParam
int numDsts; // Number of destination arrays
float* src[MAX_SRCS]; // Source array pointers
float* dst[MAX_DSTS]; // Destination array pointers
+int32_t preferredXccId; // XCC ID to execute on (-1 = no preference; signed, as it is set to and compared against -1)
// Outputs
long long startCycle; // Start timestamp for in-kernel timing (GPU-GFX executor)
@@ -59,11 +60,11 @@ struct SubExecParam
// Macro for collecting HW_REG_XCC_ID
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
-#define __trace_xccreg() \
-  asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (p.xccId));
+#define GetXccId(val) \
+  asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (val));
#else
-#define __trace_xccreg() \
-  p.xccId = 0
+#define GetXccId(val) \
+  val = 0
#endif
void CpuReduceKernel(SubExecParam const& p)
@@ -105,6 +106,13 @@ __host__ __device__ float PrepSrcValue(int srcBufferIdx, size_t idx)
return (((idx % 383) * 517) % 383 + 31) * (srcBufferIdx + 1);
}
+__global__ void CollectXccIdsKernel(int* xccIds)
+{
+  int xccId;
+  GetXccId(xccId);
+  xccIds[blockIdx.x] = xccId;
+}
// GPU kernel to prepare src buffer data
__global__ void
PrepSrcDataKernel(float* ptr, size_t N, int srcBufferIdx)
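A hedged note on the discovery kernel above: each of the maxNumXccs single-thread blocks records whichever hardware XCC it happened to be scheduled onto, and the host-side std::set collapses the duplicate samples.

    // Hypothetical outcome (assumption): on a gfx94x device with 8 XCCs the 64
    // samples collapse to xccIdsPerDevice[i] == {0..7}; on other targets GetXccId
    // expands to val = 0, so the set is {0} and numXCCs degenerates to 1.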
@@ -127,10 +135,17 @@ template <int LOOP1_UNROLL>
__global__ void __launch_bounds__(MAX_BLOCKSIZE)
GpuReduceKernel(SubExecParam* params)
{
-int64_t startCycle = wall_clock64();
+int64_t startCycle;
+if (threadIdx.x == 0) startCycle = wall_clock64();
+SubExecParam& p = params[blockIdx.y];
+
+// Filter by XCC if desired
+int xccId;
+GetXccId(xccId);
+if (p.preferredXccId != -1 && xccId != p.preferredXccId) return;
// Operate on wavefront granularity
-SubExecParam& p = params[blockIdx.x];
int const numSrcs = p.numSrcs;
int const numDsts = p.numDsts;
int const waveId = threadIdx.x / WARP_SIZE; // Wavefront number
@@ -234,8 +249,8 @@ GpuReduceKernel(SubExecParam* params)
{
p.stopCycle = wall_clock64();
p.startCycle = startCycle;
+p.xccId = xccId;
__trace_hwreg();
-__trace_xccreg();
}
}
@@ -369,7 +384,7 @@ __global__ void __launch_bounds__(MAX_BLOCKSIZE)
GpuReduceKernel2(SubExecParam* params)
{
int64_t startCycle = wall_clock64();
-SubExecParam& p = params[blockIdx.x];
+SubExecParam& p = params[blockIdx.y];
size_t numFloatsLeft = GpuReduceFunc<float4>(p, 0, p.N, 8);
if (numFloatsLeft)
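Finally, a hedged observation: GpuReduceKernel2 switches to blockIdx.y for sub-exec selection so every kernel launched through GpuKernelTable agrees with the new two-dimensional grid.

    // Invariant after this commit (as far as the shown hunks indicate):
    //   blockIdx.y -> which SubExecParam a block services
    //   blockIdx.x -> which per-XCC replica is running
    SubExecParam& p = params[blockIdx.y];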
......