Unverified Commit 79a3a003 authored by gilbertlee-amd's avatar gilbertlee-amd Committed by GitHub
Browse files

V1.31 candidate (#58)

* Adding xccID output to SHOW_ITERATIONS
parent e7cfab75
# Changelog for TransferBench # Changelog for TransferBench
## v1.31
### Modified
- SHOW_ITERATIONS now shows XCC:CU instead of just CU ID
- SHOW_ITERATIONS output is also printed when USE_SINGLE_STREAM=1
## v1.30 ## v1.30
### Added ### Added
- BLOCK_SIZE added to control threadblock size (Must be multiple of 64, up to 512) - BLOCK_SIZE added to control threadblock size (Must be multiple of 64, up to 512)
......
...@@ -578,7 +578,22 @@ void ExecuteTransfers(EnvVars const& ev, ...@@ -578,7 +578,22 @@ void ExecuteTransfers(EnvVars const& ev,
{ {
double iterDurationMsec = t.first; double iterDurationMsec = t.first;
double iterBandwidthGbs = (transfer->numBytesActual / 1.0E9) / iterDurationMsec * 1000.0f; double iterBandwidthGbs = (transfer->numBytesActual / 1.0E9) / iterDurationMsec * 1000.0f;
printf(" Iter %03d | %7.3f GB/s | %8.3f ms |\n", t.second, iterBandwidthGbs, iterDurationMsec); printf(" Iter %03d | %7.3f GB/s | %8.3f ms |", t.second, iterBandwidthGbs, iterDurationMsec);
std::set<int> usedXccs;
if (t.second - 1 < transfer->perIterationCUs.size())
{
printf(" CUs:");
for (auto x : transfer->perIterationCUs[t.second - 1])
{
printf(" %02d:%02d", x.first, x.second);
usedXccs.insert(x.first);
}
}
printf(" XCCs:");
for (auto x : usedXccs)
printf(" %02d", x);
printf("\n");
} }
printf(" StandardDev | %7.3f GB/s | %8.3f ms |\n", stdDevBw, stdDevTime); printf(" StandardDev | %7.3f GB/s | %8.3f ms |\n", stdDevBw, stdDevTime);
} }
...@@ -649,12 +664,19 @@ void ExecuteTransfers(EnvVars const& ev, ...@@ -649,12 +664,19 @@ void ExecuteTransfers(EnvVars const& ev,
double iterDurationMsec = t.first; double iterDurationMsec = t.first;
double iterBandwidthGbs = (transfer->numBytesActual / 1.0E9) / iterDurationMsec * 1000.0f; double iterBandwidthGbs = (transfer->numBytesActual / 1.0E9) / iterDurationMsec * 1000.0f;
printf(" Iter %03d | %7.3f GB/s | %8.3f ms |", t.second, iterBandwidthGbs, iterDurationMsec); printf(" Iter %03d | %7.3f GB/s | %8.3f ms |", t.second, iterBandwidthGbs, iterDurationMsec);
std::set<int> usedXccs;
if (t.second - 1 < transfer->perIterationCUs.size()) if (t.second - 1 < transfer->perIterationCUs.size())
{ {
printf(" CUs:"); printf(" CUs:");
for (auto x : transfer->perIterationCUs[t.second - 1]) for (auto x : transfer->perIterationCUs[t.second - 1])
printf(" %2d", x); {
printf(" %02d:%02d", x.first, x.second);
usedXccs.insert(x.first);
}
} }
printf(" XCCs:");
for (auto x : usedXccs)
printf(" %d", x);
printf("\n"); printf("\n");
} }
printf(" StandardDev | %7.3f GB/s | %8.3f ms |\n", stdDevBw, stdDevTime); printf(" StandardDev | %7.3f GB/s | %8.3f ms |\n", stdDevBw, stdDevTime);
...@@ -1362,13 +1384,14 @@ void RunTransfer(EnvVars const& ev, int const iteration, ...@@ -1362,13 +1384,14 @@ void RunTransfer(EnvVars const& ev, int const iteration,
long long minStartCycle = std::numeric_limits<long long>::max(); long long minStartCycle = std::numeric_limits<long long>::max();
long long maxStopCycle = std::numeric_limits<long long>::min(); long long maxStopCycle = std::numeric_limits<long long>::min();
std::set<int> CUs; std::set<std::pair<int,int>> CUs;
for (auto subExecIdx : currTransfer->subExecIdx) for (auto subExecIdx : currTransfer->subExecIdx)
{ {
minStartCycle = std::min(minStartCycle, exeInfo.subExecParamGpu[subExecIdx].startCycle); minStartCycle = std::min(minStartCycle, exeInfo.subExecParamGpu[subExecIdx].startCycle);
maxStopCycle = std::max(maxStopCycle, exeInfo.subExecParamGpu[subExecIdx].stopCycle); maxStopCycle = std::max(maxStopCycle, exeInfo.subExecParamGpu[subExecIdx].stopCycle);
if (ev.showIterations) if (ev.showIterations)
CUs.insert(GetId(exeInfo.subExecParamGpu[subExecIdx].hwId)); CUs.insert(std::make_pair(exeInfo.subExecParamGpu[subExecIdx].xccId,
GetId(exeInfo.subExecParamGpu[subExecIdx].hwId)));
} }
int const wallClockRate = ev.wallClockPerDeviceMhz[exeIndex]; int const wallClockRate = ev.wallClockPerDeviceMhz[exeIndex];
double iterationTimeMs = (maxStopCycle - minStartCycle) / (double)(wallClockRate); double iterationTimeMs = (maxStopCycle - minStartCycle) / (double)(wallClockRate);
...@@ -1387,9 +1410,10 @@ void RunTransfer(EnvVars const& ev, int const iteration, ...@@ -1387,9 +1410,10 @@ void RunTransfer(EnvVars const& ev, int const iteration,
if (ev.showIterations) if (ev.showIterations)
{ {
transfer->perIterationTime.push_back(gpuDeltaMsec); transfer->perIterationTime.push_back(gpuDeltaMsec);
std::set<int> CUs; std::set<std::pair<int,int>> CUs;
for (int i = 0; i < transfer->numSubExecs; i++) for (int i = 0; i < transfer->numSubExecs; i++)
CUs.insert(GetId(transfer->subExecParamGpuPtr[i].hwId)); CUs.insert(std::make_pair(transfer->subExecParamGpuPtr[i].xccId,
GetId(transfer->subExecParamGpuPtr[i].hwId)));
transfer->perIterationCUs.push_back(CUs); transfer->perIterationCUs.push_back(CUs);
} }
} }
......
...@@ -29,7 +29,7 @@ THE SOFTWARE. ...@@ -29,7 +29,7 @@ THE SOFTWARE.
#include "Compatibility.hpp" #include "Compatibility.hpp"
#include "Kernels.hpp" #include "Kernels.hpp"
#define TB_VERSION "1.30" #define TB_VERSION "1.31"
extern char const MemTypeStr[]; extern char const MemTypeStr[];
extern char const ExeTypeStr[]; extern char const ExeTypeStr[];
......
...@@ -45,6 +45,7 @@ struct SubExecParam ...@@ -45,6 +45,7 @@ struct SubExecParam
long long startCycle; // Start timestamp for in-kernel timing (GPU-GFX executor) long long startCycle; // Start timestamp for in-kernel timing (GPU-GFX executor)
long long stopCycle; // Stop timestamp for in-kernel timing (GPU-GFX executor) long long stopCycle; // Stop timestamp for in-kernel timing (GPU-GFX executor)
uint32_t hwId; // Hardware ID uint32_t hwId; // Hardware ID
uint32_t xccId; // XCC ID
}; };
// Macro for collecting HW_REG_HW_ID // Macro for collecting HW_REG_HW_ID
...@@ -56,6 +57,15 @@ struct SubExecParam ...@@ -56,6 +57,15 @@ struct SubExecParam
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (p.hwId)); asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_HW_ID)" : "=s" (p.hwId));
#endif #endif
// Macro for collecting HW_REG_XCC_ID
// Expects a variable named `p` with an `xccId` member in the enclosing scope
// and stores the result into p.xccId.
// Neither variant carries its own trailing semicolon, so `__trace_xccreg();`
// expands to exactly one statement on every target (safe in unbraced if/else).
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
#define __trace_xccreg() \
  asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (p.xccId))
#else
// The XCC ID register is only read on the gfx94x targets listed above;
// every other target records 0.
#define __trace_xccreg() \
  p.xccId = 0
#endif
void CpuReduceKernel(SubExecParam const& p) void CpuReduceKernel(SubExecParam const& p)
{ {
int const& numSrcs = p.numSrcs; int const& numSrcs = p.numSrcs;
...@@ -225,6 +235,7 @@ GpuReduceKernel(SubExecParam* params) ...@@ -225,6 +235,7 @@ GpuReduceKernel(SubExecParam* params)
p.stopCycle = wall_clock64(); p.stopCycle = wall_clock64();
p.startCycle = startCycle; p.startCycle = startCycle;
__trace_hwreg(); __trace_hwreg();
__trace_xccreg();
} }
} }
......
...@@ -121,7 +121,7 @@ struct Transfer ...@@ -121,7 +121,7 @@ struct Transfer
std::vector<int> subExecIdx; // Indicies into subExecParamGpu std::vector<int> subExecIdx; // Indicies into subExecParamGpu
std::vector<double> perIterationTime; // Per-iteration timing std::vector<double> perIterationTime; // Per-iteration timing
std::vector<std::set<int>> perIterationCUs; // Per-iteration CU usage std::vector<std::set<std::pair<int,int>>> perIterationCUs; // Per-iteration CU usage
// Prepares src/dst subarray pointers for each SubExecutor // Prepares src/dst subarray pointers for each SubExecutor
void PrepareSubExecParams(EnvVars const& ev); void PrepareSubExecParams(EnvVars const& ev);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment