Unverified Commit d5445b95 authored by gilbertlee-amd's avatar gilbertlee-amd Committed by GitHub
Browse files

v1.46 Fixing GFX_UNROLL for gfx906, enabling GFX_SINGLE_TEAM by default (#80)

parent c19ae1c1
......@@ -3,6 +3,15 @@
Documentation for TransferBench is available at
[https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
## v1.46
### Fixes
* Fixing default GFX_UNROLL being set to 13 (past the maximum of 8) on gfx906 cards
### Modifications
* GFX_SINGLE_TEAM=1 by default
* Adding field showing summation of individual Transfer bandwidths for Executors
## v1.45
### Additions
......
......@@ -597,18 +597,24 @@ void ExecuteTransfers(EnvVars const& ev,
double exeBandwidthGbs = (exeInfo.totalBytes / 1.0E9) / exeDurationMsec * 1000.0f;
maxGpuTime = std::max(maxGpuTime, exeDurationMsec);
double sumBandwidthGbs = 0.0;
for (auto& transfer: exeInfo.transfers)
{
transfer->transferTime /= (1.0 * numTimedIterations);
transfer->transferBandwidth = (transfer->numBytesActual / 1.0E9) / transfer->transferTime * 1000.0f;
transfer->executorBandwidth = exeBandwidthGbs;
sumBandwidthGbs += transfer->transferBandwidth;
}
if (verbose && !ev.outputToCsv)
{
printf(" Executor: %3s %02d | %7.3f GB/s | %8.3f ms | %12lu bytes\n",
ExeTypeName[exeType], exeIndex, exeBandwidthGbs, exeDurationMsec, exeInfo.totalBytes);
printf(" Executor: %3s %02d | %7.3f GB/s | %8.3f ms | %12lu bytes | %-7.3f GB/s (sum)\n",
ExeTypeName[exeType], exeIndex, exeBandwidthGbs, exeDurationMsec, exeInfo.totalBytes, sumBandwidthGbs);
}
int totalCUs = 0;
for (auto const& transfer : exeInfo.transfers)
{
transfer->transferTime /= (1.0 * numTimedIterations);
transfer->transferBandwidth = (transfer->numBytesActual / 1.0E9) / transfer->transferTime * 1000.0f;
transfer->executorBandwidth = exeBandwidthGbs;
totalCUs += transfer->numSubExecs;
char exeSubIndexStr[32] = "";
......
......@@ -29,7 +29,7 @@ THE SOFTWARE.
#include "Compatibility.hpp"
#include "Kernels.hpp"
#define TB_VERSION "1.45"
#define TB_VERSION "1.46"
extern char const MemTypeStr[];
extern char const ExeTypeStr[];
......@@ -171,8 +171,8 @@ public:
// Different hardware picks different GPU kernels
// This performance difference is generally only noticeable when executing fewer CUs
int defaultGfxUnroll = 4;
if (archName == "gfx906") defaultGfxUnroll = 13;
else if (archName == "gfx90a") defaultGfxUnroll = 9;
if (archName == "gfx906") defaultGfxUnroll = 8;
else if (archName == "gfx90a") defaultGfxUnroll = 8;
else if (archName == "gfx940") defaultGfxUnroll = 6;
else if (archName == "gfx941") defaultGfxUnroll = 6;
else if (archName == "gfx942") defaultGfxUnroll = 4;
......@@ -183,7 +183,7 @@ public:
byteOffset = GetEnvVar("BYTE_OFFSET" , 0);
continueOnError = GetEnvVar("CONTINUE_ON_ERROR" , 0);
gfxBlockSize = GetEnvVar("GFX_BLOCK_SIZE" , 256);
gfxSingleTeam = GetEnvVar("GFX_SINGLE_TEAM" , 0);
gfxSingleTeam = GetEnvVar("GFX_SINGLE_TEAM" , 1);
gfxUnroll = GetEnvVar("GFX_UNROLL" , defaultGfxUnroll);
gfxWaveOrder = GetEnvVar("GFX_WAVE_ORDER" , 0);
hideEnv = GetEnvVar("HIDE_ENV" , 0);
......@@ -509,7 +509,7 @@ public:
if (gfxUnroll < 1 || gfxUnroll > MAX_UNROLL)
{
printf("[ERROR] GFX kernel unroll factor must be between 1 and %d\n", MAX_UNROLL);
printf("[ERROR] GFX kernel unroll factor must be between 1 and %d (Not %d)\n", MAX_UNROLL, gfxUnroll);
exit(1);
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment