Unverified Commit 08997e88 authored by gilbertlee-amd, committed by GitHub

v1.36 - Adding experimental USE_XCC_FILTER (#67)

parent e047656f
@@ -3,6 +3,13 @@
 Documentation for TransferBench is available at
 [https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
+## v1.36
+### Additions
+* (Experimental) Adding XCC filtering - combined with XCC_PREF_TABLE, this tries to select
+  specific XCCs to use for specific (SRC->DST) Transfers
 ## v1.35
 ### Additions
......
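For orientation, XCC_PREF_TABLE is consumed as a flat, comma-separated, row-major numGpuDevices x numGpuDevices table (rows are the executor/source GPU, columns are the destination GPU, entries are preferred XCC IDs), as the parsing code later in this commit shows. Below is a minimal host-only sketch of that index mapping; the 2-GPU size, the table values, and names such as `tableStr` are purely illustrative stand-ins, not TransferBench code.

```cpp
#include <cstdio>
#include <cstring>
#include <cstdlib>
#include <vector>

int main()
{
  // Hypothetical values for illustration only: 2 GPUs, row-major (src, dst) table
  int const numGpuDevices = 2;
  char tableStr[] = "0,1,1,0";   // stands in for getenv("XCC_PREF_TABLE")

  std::vector<std::vector<int>> prefXccTable(numGpuDevices, std::vector<int>(numGpuDevices, 0));

  int tokenCount = 0;
  for (char* token = strtok(tableStr, ","); token != NULL; token = strtok(NULL, ","))
  {
    int const src = tokenCount / numGpuDevices;   // row    = executor (source) GPU
    int const dst = tokenCount % numGpuDevices;   // column = destination GPU
    prefXccTable[src][dst] = atoi(token);
    if (++tokenCount == numGpuDevices * numGpuDevices) break;
  }

  for (int src = 0; src < numGpuDevices; ++src)
    for (int dst = 0; dst < numGpuDevices; ++dst)
      printf("GPU %d -> GPU %d : prefer XCC %d\n", src, dst, prefXccTable[src][dst]);
  return 0;
}
```

With these illustrative values, transfers from GPU 0 to GPU 1 (and GPU 1 to GPU 0) would prefer XCC 1, while the same-GPU entries stay at XCC 0.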
@@ -259,7 +259,7 @@ void ExecuteTransfers(EnvVars const& ev,
       int const srcIndex = RemappedIndex(transfer->srcIndex[iSrc], IsCpuType(srcType));
       // Ensure executing GPU can access source memory
-      if (IsGpuType(exeType) == MEM_GPU && IsGpuType(srcType) && srcIndex != exeIndex)
+      if (IsGpuType(exeType) && IsGpuType(srcType) && srcIndex != exeIndex)
         EnablePeerAccess(exeIndex, srcIndex);
       AllocateMemory(srcType, srcIndex, transfer->numBytesActual + ev.byteOffset, (void**)&transfer->srcMem[iSrc]);
@@ -273,7 +273,7 @@ void ExecuteTransfers(EnvVars const& ev,
       int const dstIndex = RemappedIndex(transfer->dstIndex[iDst], IsCpuType(dstType));
       // Ensure executing GPU can access destination memory
-      if (IsGpuType(exeType) == MEM_GPU && IsGpuType(dstType) && dstIndex != exeIndex)
+      if (IsGpuType(exeType) && IsGpuType(dstType) && dstIndex != exeIndex)
         EnablePeerAccess(exeIndex, dstIndex);
       AllocateMemory(dstType, dstIndex, transfer->numBytesActual + ev.byteOffset, (void**)&transfer->dstMem[iDst]);
@@ -1362,13 +1362,15 @@ void RunTransfer(EnvVars const& ev, int const iteration,
     // In single stream mode, all the threadblocks for this GPU are launched
     // Otherwise, just launch the threadblocks associated with this single Transfer
     int const numBlocksToRun = ev.useSingleStream ? exeInfo.totalSubExecs : transfer->numSubExecs;
+    int const numXCCs = (ev.useXccFilter ? ev.xccIdsPerDevice[exeIndex].size() : 1);
 #if defined(__NVCC__)
     HIP_CALL(hipEventRecord(startEvent, stream));
     GpuKernelTable[ev.gpuKernel]<<<numBlocksToRun, ev.blockSize, ev.sharedMemBytes, stream>>>(transfer->subExecParamGpuPtr);
     HIP_CALL(hipEventRecord(stopEvent, stream));
 #else
     hipExtLaunchKernelGGL(GpuKernelTable[ev.gpuKernel],
-                          dim3(numBlocksToRun, 1, 1),
+                          dim3(numXCCs, numBlocksToRun, 1),
                           dim3(ev.blockSize, 1, 1),
                           ev.sharedMemBytes, stream,
                           startEvent, stopEvent,
@@ -1999,6 +2001,15 @@ void Transfer::PrepareSubExecParams(EnvVars const& ev)
     for (int iDst = 0; iDst < this->numDsts; ++iDst)
       p.dst[iDst] = this->dstMem[iDst] + assigned + initOffset;
+    p.preferredXccId = -1;
+    if (ev.useXccFilter)
+    {
+      if (this->exeType == EXE_GPU_GFX && this->numDsts == 1 && IsGpuType(this->dstType[0]))
+      {
+        p.preferredXccId = ev.prefXccTable[this->exeIndex][this->dstIndex[0]];
+      }
+    }
     if (ev.enableDebug)
     {
       printf("Transfer %02d SE:%02d: %10lu floats: %10lu to %10lu\n",
......
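To make the launch change above concrete: with USE_XCC_FILTER enabled the GFX grid becomes two-dimensional, with blockIdx.y selecting the SubExecParam (one per sub-executor) and blockIdx.x adding one replica of each threadblock per detected XCC, so that some replica can land on the preferred XCC. The sketch below only illustrates that grid shape with a toy kernel; `GeometryKernel` and the counts are illustrative stand-ins (not TransferBench's kernel), and error checking is omitted.

```cpp
#include <hip/hip_runtime.h>
#include <cstdio>

// Marks one flag per (sub-executor, XCC replica) pair so the host can see how
// often each sub-executor's threadblock was replicated. gridDim.x is the
// number of XCC replicas; blockIdx.y picks the sub-executor.
__global__ void GeometryKernel(int* launched)
{
  if (threadIdx.x == 0)
    launched[blockIdx.y * gridDim.x + blockIdx.x] = 1;
}

int main()
{
  int const numBlocksToRun = 4;   // illustrative: threadblocks for this Transfer
  int const numXCCs        = 8;   // illustrative: XCCs detected on the executor GPU

  int* launched;
  hipHostMalloc((void**)&launched, numBlocksToRun * numXCCs * sizeof(int));
  for (int i = 0; i < numBlocksToRun * numXCCs; i++) launched[i] = 0;

  // Same grid shape as the new hipExtLaunchKernelGGL call: dim3(numXCCs, numBlocksToRun, 1)
  GeometryKernel<<<dim3(numXCCs, numBlocksToRun, 1), dim3(64, 1, 1)>>>(launched);
  hipDeviceSynchronize();

  for (int y = 0; y < numBlocksToRun; y++)
  {
    int replicas = 0;
    for (int x = 0; x < numXCCs; x++) replicas += launched[y * numXCCs + x];
    printf("Sub-executor %d was launched as %d XCC replicas\n", y, replicas);
  }
  hipHostFree(launched);
  return 0;
}
```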
@@ -29,7 +29,7 @@ THE SOFTWARE.
 #include "Compatibility.hpp"
 #include "Kernels.hpp"
-#define TB_VERSION "1.35"
+#define TB_VERSION "1.36"
 extern char const MemTypeStr[];
 extern char const ExeTypeStr[];
@@ -91,10 +91,12 @@ public:
   int usePcieIndexing;    // Base GPU indexing on PCIe address instead of HIP device
   int usePrepSrcKernel;   // Use GPU kernel to prepare source data instead of copy (can't be used with fillPattern)
   int useSingleStream;    // Use a single stream per GPU GFX executor instead of stream per Transfer
+  int useXccFilter;       // Use XCC filtering (experimental)
   int validateDirect;     // Validate GPU destination memory directly instead of staging GPU memory on host
   std::vector<float> fillPattern;              // Pattern of floats used to fill source data
   std::vector<uint32_t> cuMask;                // Bit-vector representing the CU mask
+  std::vector<std::vector<int>> prefXccTable;  // Preferred XCC ID for each (executor GPU, destination GPU) pair
   // Environment variables only for P2P preset
   int numCpuSubExecs;     // Number of CPU subexecutors to use
@@ -135,6 +137,8 @@ public:
   std::vector<int> wallClockPerDeviceMhz;
+  std::vector<std::set<int>> xccIdsPerDevice;  // XCC IDs detected on each GPU device
   // Constructor that collects values
   EnvVars()
   {
@@ -187,6 +191,7 @@ public:
     usePcieIndexing   = GetEnvVar("USE_PCIE_INDEX"    , 0);
     usePrepSrcKernel  = GetEnvVar("USE_PREP_KERNEL"   , 0);
     useSingleStream   = GetEnvVar("USE_SINGLE_STREAM" , 0);
+    useXccFilter      = GetEnvVar("USE_XCC_FILTER"    , 0);
     validateDirect    = GetEnvVar("VALIDATE_DIRECT"   , 0);
     enableDebug       = GetEnvVar("DEBUG"             , 0);
     gpuKernel         = GetEnvVar("GPU_KERNEL"        , defaultGpuKernel);
@@ -327,6 +332,60 @@ public:
 #endif
     }
+    // Figure out number of XCCs per device
+    int maxNumXccs = 64;
+    xccIdsPerDevice.resize(numGpuDevices);
+    for (int i = 0; i < numGpuDevices; i++)
+    {
+      int* data;
+      HIP_CALL(hipSetDevice(i));
+      HIP_CALL(hipHostMalloc((void**)&data, maxNumXccs * sizeof(int)));
+      CollectXccIdsKernel<<<maxNumXccs, 1>>>(data);
+      HIP_CALL(hipDeviceSynchronize());
+      xccIdsPerDevice[i].clear();
+      for (int j = 0; j < maxNumXccs; j++)
+        xccIdsPerDevice[i].insert(data[j]);
+      HIP_CALL(hipHostFree(data));
+    }
+    // Parse preferred XCC table (if provided)
+    {
+      prefXccTable.resize(numGpuDevices);
+      for (int i = 0; i < numGpuDevices; i++)
+      {
+        prefXccTable[i].resize(numGpuDevices, 0);
+      }
+      char* prefXccStr = getenv("XCC_PREF_TABLE");
+      char* token = prefXccStr ? strtok(prefXccStr, ",") : NULL;
+      int tokenCount = 0;
+      while (token)
+      {
+        int xccId;
+        if (sscanf(token, "%d", &xccId) == 1)
+        {
+          int src = tokenCount / numGpuDevices;
+          int dst = tokenCount % numGpuDevices;
+          if (xccIdsPerDevice[src].count(xccId) == 0)
+          {
+            printf("[ERROR] GPU %d does not contain XCC %d\n", src, xccId);
+            exit(1);
+          }
+          prefXccTable[src][dst] = xccId;
+          tokenCount++;
+          if (tokenCount == (numGpuDevices * numGpuDevices)) break;
+        }
+        else
+        {
+          printf("[ERROR] Unrecognized token [%s]\n", token);
+          exit(1);
+        }
+        token = strtok(NULL, ",");
+      }
+    }
     // Perform some basic validation
     if (numCpuDevices > numDetectedCpus)
     {
@@ -503,6 +562,7 @@ public:
     printf(" USE_PCIE_INDEX    - Index GPUs by PCIe address-ordering instead of HIP-provided indexing\n");
     printf(" USE_PREP_KERNEL   - Use GPU kernel to initialize source data array pattern\n");
     printf(" USE_SINGLE_STREAM - Use a single stream per GPU GFX executor instead of stream per Transfer\n");
+    printf(" USE_XCC_FILTER    - Use XCC filtering (experimental)\n");
     printf(" VALIDATE_DIRECT   - Validate GPU destination memory directly instead of staging GPU memory on host\n");
   }
@@ -519,8 +579,8 @@ public:
     if (!outputToCsv)
     {
       printf("TransferBench v%s\n", TB_VERSION);
-      printf("=====================================================\n");
-      if (!hideEnv) printf("[Common]\n");
+      printf("===============================================================\n");
+      if (!hideEnv) printf("[Common] (Suppress by setting HIDE_ENV=1)\n");
     }
     else if (!hideEnv)
       printf("EnvVar,Value,Description,(TransferBench v%s)\n", TB_VERSION);
@@ -566,6 +626,21 @@ public:
              std::string("Using ") + (usePrepSrcKernel ? "GPU kernels" : "hipMemcpy") + " to initialize source data");
     PRINT_EV("USE_SINGLE_STREAM", useSingleStream,
              std::string("Using single stream per ") + (useSingleStream ? "device" : "Transfer"));
+    PRINT_EV("USE_XCC_FILTER", useXccFilter,
+             std::string("XCC filtering ") + (useXccFilter ? "enabled" : "disabled"));
+    if (useXccFilter)
+    {
+      printf("%36s: Preferred XCC Table (XCC_PREF_TABLE)\n", "");
+      printf("%36s: ", "");
+      for (int i = 0; i < numGpuDevices; i++) printf(" %3d", i); printf(" (#XCCs)\n");
+      for (int i = 0; i < numGpuDevices; i++)
+      {
+        printf("%36s: GPU %3d ", "", i);
+        for (int j = 0; j < numGpuDevices; j++)
+          printf(" %3d", prefXccTable[i][j]);
+        printf(" %3lu\n", xccIdsPerDevice[i].size());
+      }
+    }
     PRINT_EV("VALIDATE_DIRECT", validateDirect,
              std::string("Validate GPU destination memory ") + (validateDirect ? "directly" : "via CPU staging buffer"));
     printf("\n");
......
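For background on how xccIdsPerDevice gets filled: the commit discovers the available XCC IDs empirically by launching many single-thread blocks that each record HW_REG_XCC_ID, then collecting the distinct values on the host. Below is a self-contained sketch of that trick for a single device; `RecordXccIdKernel`, `maxNumXccs`, and the launch shape are illustrative stand-ins, the inline asm only reports real XCC IDs on gfx940/941/942 (elsewhere every block reports 0), and it assumes launching 64 blocks is enough to touch every XCC at least once.

```cpp
#include <hip/hip_runtime.h>
#include <cstdio>
#include <set>

// Each block records the XCC it was scheduled on (always 0 on non-gfx94x targets).
__global__ void RecordXccIdKernel(int* xccIds)
{
  int xccId = 0;
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
  asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s"(xccId));
#endif
  xccIds[blockIdx.x] = xccId;
}

int main()
{
  int const maxNumXccs = 64;   // assumed upper bound; enough blocks to hit every XCC
  int* data;
  hipHostMalloc((void**)&data, maxNumXccs * sizeof(int));   // error checks omitted
  RecordXccIdKernel<<<maxNumXccs, 1>>>(data);
  hipDeviceSynchronize();

  std::set<int> xccIds(data, data + maxNumXccs);            // keep only distinct IDs
  printf("Detected %zu XCC(s):", xccIds.size());
  for (int id : xccIds) printf(" %d", id);
  printf("\n");
  hipHostFree(data);
  return 0;
}
```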
@@ -40,6 +40,7 @@ struct SubExecParam
   int       numDsts;          // Number of destination arrays
   float*    src[MAX_SRCS];    // Source array pointers
   float*    dst[MAX_DSTS];    // Destination array pointers
+  uint32_t  preferredXccId;   // XCC ID to execute on
   // Outputs
   long long startCycle;       // Start timestamp for in-kernel timing (GPU-GFX executor)
@@ -59,11 +60,11 @@ struct SubExecParam
 // Macro for collecting HW_REG_XCC_ID
 #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
-#define __trace_xccreg() \
-  asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (p.xccId));
+#define GetXccId(val) \
+  asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (val));
 #else
-#define __trace_xccreg() \
-  p.xccId = 0
+#define GetXccId(val) \
+  val = 0
 #endif
 void CpuReduceKernel(SubExecParam const& p)
@@ -105,6 +106,13 @@ __host__ __device__ float PrepSrcValue(int srcBufferIdx, size_t idx)
   return (((idx % 383) * 517) % 383 + 31) * (srcBufferIdx + 1);
 }
+__global__ void CollectXccIdsKernel(int* xccIds)
+{
+  int xccId;
+  GetXccId(xccId);
+  xccIds[blockIdx.x] = xccId;
+}
 // GPU kernel to prepare src buffer data
 __global__ void
 PrepSrcDataKernel(float* ptr, size_t N, int srcBufferIdx)
@@ -127,10 +135,17 @@ template <int LOOP1_UNROLL>
 __global__ void __launch_bounds__(MAX_BLOCKSIZE)
 GpuReduceKernel(SubExecParam* params)
 {
-  int64_t startCycle = wall_clock64();
+  int64_t startCycle;
+  if (threadIdx.x == 0) startCycle = wall_clock64();
+  SubExecParam& p = params[blockIdx.y];
+  // Filter by XCC if desired
+  int xccId;
+  GetXccId(xccId);
+  if (p.preferredXccId != -1 && xccId != p.preferredXccId) return;
   // Operate on wavefront granularity
-  SubExecParam& p = params[blockIdx.x];
   int const numSrcs = p.numSrcs;
   int const numDsts = p.numDsts;
   int const waveId  = threadIdx.x / WARP_SIZE; // Wavefront number
@@ -234,8 +249,8 @@ GpuReduceKernel(SubExecParam* params)
   {
     p.stopCycle  = wall_clock64();
     p.startCycle = startCycle;
+    p.xccId      = xccId;
     __trace_hwreg();
-    __trace_xccreg();
   }
 }
@@ -369,7 +384,7 @@ __global__ void __launch_bounds__(MAX_BLOCKSIZE)
 GpuReduceKernel2(SubExecParam* params)
 {
   int64_t startCycle = wall_clock64();
-  SubExecParam& p = params[blockIdx.x];
+  SubExecParam& p = params[blockIdx.y];
   size_t numFloatsLeft = GpuReduceFunc<float4>(p, 0, p.N, 8);
   if (numFloatsLeft)
......
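Finally, the device-side half of the filter is a simple early exit: each launched replica reads its hardware XCC ID via the renamed GetXccId macro and returns immediately unless it matches the preferred ID, with -1 meaning "run anywhere". A minimal sketch of that pattern (not GpuReduceKernel itself); `FilteredWorkKernel` and the constants are illustrative, it only observes real XCC IDs on gfx940/941/942, and it relies on the hardware spreading the replicas across XCCs, which appears to be the intent of the replication in RunTransfer.

```cpp
#include <hip/hip_runtime.h>
#include <cstdio>

// Minimal version of the filter added to GpuReduceKernel: each block compares
// its hardware XCC ID against the preferred one and bails out on a mismatch.
__global__ void FilteredWorkKernel(int preferredXccId, int* ranOnXcc)
{
  int xccId = 0;
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
  asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s"(xccId));
#endif
  if (preferredXccId != -1 && xccId != preferredXccId) return;   // wrong XCC: skip work
  if (threadIdx.x == 0) *ranOnXcc = xccId;                       // the "work" happens here
}

int main()
{
  int const numXCCs        = 8;   // illustrative replica count (one block per XCC)
  int const preferredXccId = 3;   // illustrative preference

  int* ranOnXcc;
  hipHostMalloc((void**)&ranOnXcc, sizeof(int));   // error checks omitted
  *ranOnXcc = -1;                                  // stays -1 on non-gfx94x hardware

  FilteredWorkKernel<<<numXCCs, 64>>>(preferredXccId, ranOnXcc);
  hipDeviceSynchronize();
  printf("Work executed on XCC %d (preferred %d)\n", *ranOnXcc, preferredXccId);
  hipHostFree(ranOnXcc);
  return 0;
}
```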