Unverified Commit 79a3a003 authored by gilbertlee-amd's avatar gilbertlee-amd Committed by GitHub
Browse files

V1.31 candidate (#58)

* Adding xccID output to SHOW_ITERATIONS
parent e7cfab75
# Changelog for TransferBench
## v1.31
### Modified
- SHOW_ITERATIONS now shows XCC:CU instead of just CU ID
- SHOW_ITERATIONS output is now also printed when USE_SINGLE_STREAM=1
## v1.30
### Added
- BLOCK_SIZE added to control threadblock size (Must be multiple of 64, up to 512)
......
......@@ -578,7 +578,22 @@ void ExecuteTransfers(EnvVars const& ev,
{
double iterDurationMsec = t.first;
double iterBandwidthGbs = (transfer->numBytesActual / 1.0E9) / iterDurationMsec * 1000.0f;
printf(" Iter %03d | %7.3f GB/s | %8.3f ms |\n", t.second, iterBandwidthGbs, iterDurationMsec);
printf(" Iter %03d | %7.3f GB/s | %8.3f ms |", t.second, iterBandwidthGbs, iterDurationMsec);
std::set<int> usedXccs;
if (t.second - 1 < transfer->perIterationCUs.size())
{
printf(" CUs:");
for (auto x : transfer->perIterationCUs[t.second - 1])
{
printf(" %02d:%02d", x.first, x.second);
usedXccs.insert(x.first);
}
}
printf(" XCCs:");
for (auto x : usedXccs)
printf(" %02d", x);
printf("\n");
}
printf(" StandardDev | %7.3f GB/s | %8.3f ms |\n", stdDevBw, stdDevTime);
}
......@@ -649,12 +664,19 @@ void ExecuteTransfers(EnvVars const& ev,
double iterDurationMsec = t.first;
double iterBandwidthGbs = (transfer->numBytesActual / 1.0E9) / iterDurationMsec * 1000.0f;
printf(" Iter %03d | %7.3f GB/s | %8.3f ms |", t.second, iterBandwidthGbs, iterDurationMsec);
std::set<int> usedXccs;
if (t.second - 1 < transfer->perIterationCUs.size())
{
printf(" CUs:");
for (auto x : transfer->perIterationCUs[t.second - 1])
printf(" %2d", x);
{
printf(" %02d:%02d", x.first, x.second);
usedXccs.insert(x.first);
}
}
printf(" XCCs:");
for (auto x : usedXccs)
printf(" %d", x);
printf("\n");
}
printf(" StandardDev | %7.3f GB/s | %8.3f ms |\n", stdDevBw, stdDevTime);
......@@ -1362,13 +1384,14 @@ void RunTransfer(EnvVars const& ev, int const iteration,
long long minStartCycle = std::numeric_limits<long long>::max();
long long maxStopCycle = std::numeric_limits<long long>::min();
std::set<int> CUs;
std::set<std::pair<int,int>> CUs;
for (auto subExecIdx : currTransfer->subExecIdx)
{
minStartCycle = std::min(minStartCycle, exeInfo.subExecParamGpu[subExecIdx].startCycle);
maxStopCycle = std::max(maxStopCycle, exeInfo.subExecParamGpu[subExecIdx].stopCycle);
if (ev.showIterations)
CUs.insert(GetId(exeInfo.subExecParamGpu[subExecIdx].hwId));
CUs.insert(std::make_pair(exeInfo.subExecParamGpu[subExecIdx].xccId,
GetId(exeInfo.subExecParamGpu[subExecIdx].hwId)));
}
int const wallClockRate = ev.wallClockPerDeviceMhz[exeIndex];
double iterationTimeMs = (maxStopCycle - minStartCycle) / (double)(wallClockRate);
......@@ -1387,9 +1410,10 @@ void RunTransfer(EnvVars const& ev, int const iteration,
if (ev.showIterations)
{
transfer->perIterationTime.push_back(gpuDeltaMsec);
std::set<int> CUs;
std::set<std::pair<int,int>> CUs;
for (int i = 0; i < transfer->numSubExecs; i++)
CUs.insert(GetId(transfer->subExecParamGpuPtr[i].hwId));
CUs.insert(std::make_pair(transfer->subExecParamGpuPtr[i].xccId,
GetId(transfer->subExecParamGpuPtr[i].hwId)));
transfer->perIterationCUs.push_back(CUs);
}
}
......
......@@ -29,7 +29,7 @@ THE SOFTWARE.
#include "Compatibility.hpp"
#include "Kernels.hpp"
#define TB_VERSION "1.30"
#define TB_VERSION "1.31"
extern char const MemTypeStr[];
extern char const ExeTypeStr[];
......
......@@ -45,6 +45,7 @@ struct SubExecParam
long long startCycle; // Start timestamp for in-kernel timing (GPU-GFX executor)
long long stopCycle; // Stop timestamp for in-kernel timing (GPU-GFX executor)
uint32_t hwId; // Hardware ID
uint32_t xccId; // XCC ID
};
// Macro for collecting HW_REG_HW_ID
......@@ -56,6 +57,15 @@ struct SubExecParam
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (p.hwId));
#endif
// Macro for collecting HW_REG_XCC_ID into p.xccId.
// Expects a variable named `p` with an `xccId` member to be in scope at the call site.
// On gfx940 / gfx941 / gfx942 targets the value is read with s_getreg_b32; on every
// other target xccId is set to 0.
// Neither expansion ends with a semicolon: the call site supplies it (`__trace_xccreg();`),
// so the macro expands to exactly one statement on all targets and remains safe to use
// inside an unbraced if/else.
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
#define __trace_xccreg() \
  asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (p.xccId))
#else
#define __trace_xccreg() \
  p.xccId = 0
#endif
void CpuReduceKernel(SubExecParam const& p)
{
int const& numSrcs = p.numSrcs;
......@@ -225,6 +235,7 @@ GpuReduceKernel(SubExecParam* params)
p.stopCycle = wall_clock64();
p.startCycle = startCycle;
__trace_hwreg();
__trace_xccreg();
}
}
......
......@@ -121,7 +121,7 @@ struct Transfer
std::vector<int> subExecIdx; // Indices into subExecParamGpu
std::vector<double> perIterationTime; // Per-iteration timing
std::vector<std::set<int>> perIterationCUs; // Per-iteration CU usage
std::vector<std::set<std::pair<int,int>>> perIterationCUs; // Per-iteration CU usage
// Prepares src/dst subarray pointers for each SubExecutor
void PrepareSubExecParams(EnvVars const& ev);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment