Unverified Commit 0b29707e authored by gilbertlee-amd's avatar gilbertlee-amd Committed by GitHub
Browse files

Fix inf, CU labelling. Update default kernels for gfx94x (#56)

parent 0b7b979e
# Changelog for TransferBench
## v1.29
### Added
- a2a preset config now responds to USE_REMOTE_READ
### Fixed
- Race-condition during wall-clock initialization caused "inf" during single stream runs
- CU numbering output after CU masking
### Modified
- Default number of warmups reverted to 3
- Default unroll factor for gfx940/941 set to 6
## v1.28
### Added
- Added A2A_DIRECT which executes all-to-all only between directly connected GPUs (on by default now)
......
......@@ -1254,9 +1254,9 @@ void CheckPages(char* array, size_t numBytes, int targetId)
// Packs selected bit-fields of a raw hardware ID value into a compact index.
//
// hwId    : raw hardware ID register value
// returns : (shId << 5) + (cuId << 2) + seId, where the three fields are
//           extracted from hwId as shown below
uint32_t GetId(uint32_t hwId)
{
  // Based on instinct-mi200-cdna2-instruction-set-architecture.pdf
  // (field names follow that document)
  int const shId = (hwId >> 12) &  1;  // bit  12
  int const cuId = (hwId >>  8) & 15;  // bits 8-11 (full 4-bit field; a 3-bit mask drops the top bit)
  int const seId = (hwId >> 13) &  3;  // bits 13-14

  // NOTE(review): (cuId << 2) reaches bit 5 once cuId >= 8, which is the same
  // bit used by (shId << 5). The packing is therefore only unique if shId == 1
  // and cuId >= 8 never occur together on the target hardware - confirm.
  return (shId << 5) + (cuId << 2) + seId;
}
......@@ -1313,7 +1313,7 @@ void RunTransfer(EnvVars const& ev, int const iteration,
minStartCycle = std::min(minStartCycle, currTransfer->subExecParamGpuPtr[i].startCycle);
maxStopCycle = std::max(maxStopCycle, currTransfer->subExecParamGpuPtr[i].stopCycle);
}
int const wallClockRate = GetWallClockRate(exeIndex);
int const wallClockRate = ev.wallClockPerDeviceMhz[exeIndex];
double iterationTimeMs = (maxStopCycle - minStartCycle) / (double)(wallClockRate);
currTransfer->transferTime += iterationTimeMs;
if (ev.showIterations)
......@@ -1799,10 +1799,11 @@ void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, i
for (int i = 0; i < numGpus; i++)
{
transfer.srcIndex[0] = i;
transfer.exeIndex = i;
for (int j = 0; j < numGpus; j++)
{
transfer.dstIndex[0] = j;
transfer.exeIndex = (ev.useRemoteRead ? j : i);
if (ev.a2aDirect)
{
#if !defined(__NVCC__)
......@@ -2124,41 +2125,6 @@ std::string Transfer::DstToStr() const
return ss.str();
}
// NOTE: This is a stop-gap solution until HIP provides wallclock values
//
// Returns the wall-clock rate recorded for the GPU with index deviceId.
//
// The per-device table is built on the first call by querying every device
// reported by hipGetDeviceCount, and is reused unchanged by later calls.
//
// deviceId : index into the table; not range-checked, so it must be in
//            [0, number of devices reported by hipGetDeviceCount)
// returns  : the hard-coded rate chosen for that device
//            NOTE(review): despite the "Mhz" in the name, callers appear to
//            divide a cycle count by this value to obtain milliseconds -
//            confirm the intended unit.
int GetWallClockRate(int deviceId)
{
  // The table is built inside the initializer of a function-local static.
  // Such an initializer runs exactly once and concurrent callers wait for it
  // to finish, so no caller can observe a table that has been sized but not
  // yet filled (which would yield a rate of 0 and an "inf" timing result).
  static std::vector<int> const wallClockPerDeviceMhz = []() -> std::vector<int>
  {
    int numGpuDevices;
    HIP_CALL(hipGetDeviceCount(&numGpuDevices));

    std::vector<int> rates(numGpuDevices);
    for (int i = 0; i < numGpuDevices; i++)
    {
#if defined(__NVCC__)
      int value = 1410000;
      //HIP_CALL(hipDeviceGetAttribute(&value, hipDeviceAttributeClockRate, i));
      //value *= 1000;
#else
      hipDeviceProp_t prop;
      HIP_CALL(hipGetDeviceProperties(&prop, i));

      // Unrecognized architectures keep this default after the warning below
      int value = 25000;
      switch (prop.gcnArch)
      {
      case 906: case 910: value = 25000; break;
      case 940: case 941: case 942: value = 100000; break;
      default:
        printf("Unrecognized GCN arch %d\n", prop.gcnArch);
      }
#endif
      rates[i] = value;
    }
    return rates;
  }();

  return wallClockPerDeviceMhz[deviceId];
}
void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int const numGpuSubExecs, int const numCpuSubExecs, bool const isRandom)
{
ev.DisplaySweepEnvVars();
......
......@@ -29,7 +29,7 @@ THE SOFTWARE.
#include "Compatibility.hpp"
#include "Kernels.hpp"
#define TB_VERSION "1.28"
#define TB_VERSION "1.29"
extern char const MemTypeStr[];
extern char const ExeTypeStr[];
......@@ -48,7 +48,7 @@ class EnvVars
{
public:
// Default configuration values
int const DEFAULT_NUM_WARMUPS = 1;
int const DEFAULT_NUM_WARMUPS = 3;
int const DEFAULT_NUM_ITERATIONS = 10;
int const DEFAULT_SAMPLING_FACTOR = 1;
......@@ -123,6 +123,8 @@ public:
// Track how many CPUs are available per NUMA node
std::vector<int> numCpusPerNuma;
std::vector<int> wallClockPerDeviceMhz;
// Constructor that collects values
EnvVars()
{
......@@ -152,6 +154,8 @@ public:
int defaultGpuKernel = 0;
if (archName == "gfx906") defaultGpuKernel = 13;
else if (archName == "gfx90a") defaultGpuKernel = 9;
else if (archName == "gfx940") defaultGpuKernel = 6;
else if (archName == "gfx941") defaultGpuKernel = 6;
blockBytes = GetEnvVar("BLOCK_BYTES" , 256);
byteOffset = GetEnvVar("BYTE_OFFSET" , 0);
......@@ -411,6 +415,26 @@ public:
for (int i = 0; i < totalCpus; i++)
numCpusPerNuma[numa_node_of_cpu(i)]++;
// Build array of wall clock rates per GPU device
wallClockPerDeviceMhz.resize(numDetectedGpus);
for (int i = 0; i < numDetectedGpus; i++)
{
#if defined(__NVCC__)
// NOTE: wallClock doesn't exist in CUDA. This may need to be adjusted / run with fixed clocks
wallClockPerDeviceMhz[i] = 1410000;
#else
hipDeviceProp_t prop;
HIP_CALL(hipGetDeviceProperties(&prop, i));
int value = 25000;
std::string fullName = prop.gcnArchName;
std::string archName = fullName.substr(0, fullName.find(':'));
if (archName == "gfx940" || archName == "gfx941" || archName == "gfx942")
wallClockPerDeviceMhz[i] = 100000;
else
wallClockPerDeviceMhz[i] = 25000;
#endif
}
// Check for deprecated env vars
if (getenv("USE_HIP_CALL"))
{
......@@ -577,6 +601,9 @@ public:
printf("[AllToAll Related]\n");
PRINT_EV("A2A_DIRECT", a2aDirect,
std::string(a2aDirect ? "Only using direct links" : "Full all-to-all"));
PRINT_EV("USE_REMOTE_READ", useRemoteRead,
std::string("Using ") + (useRemoteRead ? "DST" : "SRC") + " as executor");
printf("\n");
}
......
......@@ -193,6 +193,5 @@ void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, i
std::string GetLinkTypeDesc(uint32_t linkType, uint32_t hopCount);
int RemappedIndex(int const origIdx, bool const isCpuType);
int GetWallClockRate(int deviceId);
void LogTransfers(FILE *fp, int const testNum, std::vector<Transfer> const& transfers);
std::string PtrVectorToStr(std::vector<float*> const& strVector, int const initOffset);
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment