Unverified Commit d5445b95 authored by gilbertlee-amd's avatar gilbertlee-amd Committed by GitHub
Browse files

v1.46 Fixing GFX_UNROLL for gfx906, enabling GFX_SINGLE_TEAM by default (#80)

parent c19ae1c1
...@@ -3,6 +3,15 @@ ...@@ -3,6 +3,15 @@
Documentation for TransferBench is available at Documentation for TransferBench is available at
[https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench). [https://rocm.docs.amd.com/projects/TransferBench](https://rocm.docs.amd.com/projects/TransferBench).
## v1.46
### Fixes
* Fixed the default GFX_UNROLL on gfx906 cards, which was set to 13 (past the maximum unroll factor of 8)
### Modifications
* GFX_SINGLE_TEAM=1 by default
* Added a field showing the sum of the individual Transfer bandwidths for each Executor
## v1.45 ## v1.45
### Additions ### Additions
......
...@@ -597,18 +597,24 @@ void ExecuteTransfers(EnvVars const& ev, ...@@ -597,18 +597,24 @@ void ExecuteTransfers(EnvVars const& ev,
double exeBandwidthGbs = (exeInfo.totalBytes / 1.0E9) / exeDurationMsec * 1000.0f; double exeBandwidthGbs = (exeInfo.totalBytes / 1.0E9) / exeDurationMsec * 1000.0f;
maxGpuTime = std::max(maxGpuTime, exeDurationMsec); maxGpuTime = std::max(maxGpuTime, exeDurationMsec);
double sumBandwidthGbs = 0.0;
for (auto& transfer: exeInfo.transfers)
{
transfer->transferTime /= (1.0 * numTimedIterations);
transfer->transferBandwidth = (transfer->numBytesActual / 1.0E9) / transfer->transferTime * 1000.0f;
transfer->executorBandwidth = exeBandwidthGbs;
sumBandwidthGbs += transfer->transferBandwidth;
}
if (verbose && !ev.outputToCsv) if (verbose && !ev.outputToCsv)
{ {
printf(" Executor: %3s %02d | %7.3f GB/s | %8.3f ms | %12lu bytes\n", printf(" Executor: %3s %02d | %7.3f GB/s | %8.3f ms | %12lu bytes | %-7.3f GB/s (sum)\n",
ExeTypeName[exeType], exeIndex, exeBandwidthGbs, exeDurationMsec, exeInfo.totalBytes); ExeTypeName[exeType], exeIndex, exeBandwidthGbs, exeDurationMsec, exeInfo.totalBytes, sumBandwidthGbs);
} }
int totalCUs = 0; int totalCUs = 0;
for (auto const& transfer : exeInfo.transfers) for (auto const& transfer : exeInfo.transfers)
{ {
transfer->transferTime /= (1.0 * numTimedIterations);
transfer->transferBandwidth = (transfer->numBytesActual / 1.0E9) / transfer->transferTime * 1000.0f;
transfer->executorBandwidth = exeBandwidthGbs;
totalCUs += transfer->numSubExecs; totalCUs += transfer->numSubExecs;
char exeSubIndexStr[32] = ""; char exeSubIndexStr[32] = "";
......
...@@ -29,7 +29,7 @@ THE SOFTWARE. ...@@ -29,7 +29,7 @@ THE SOFTWARE.
#include "Compatibility.hpp" #include "Compatibility.hpp"
#include "Kernels.hpp" #include "Kernels.hpp"
#define TB_VERSION "1.45" #define TB_VERSION "1.46"
extern char const MemTypeStr[]; extern char const MemTypeStr[];
extern char const ExeTypeStr[]; extern char const ExeTypeStr[];
...@@ -171,8 +171,8 @@ public: ...@@ -171,8 +171,8 @@ public:
// Different hardware picks different GPU kernels // Different hardware picks different GPU kernels
// This performance difference is generally only noticeable when executing fewer CUs // This performance difference is generally only noticeable when executing fewer CUs
int defaultGfxUnroll = 4; int defaultGfxUnroll = 4;
if (archName == "gfx906") defaultGfxUnroll = 13; if (archName == "gfx906") defaultGfxUnroll = 8;
else if (archName == "gfx90a") defaultGfxUnroll = 9; else if (archName == "gfx90a") defaultGfxUnroll = 8;
else if (archName == "gfx940") defaultGfxUnroll = 6; else if (archName == "gfx940") defaultGfxUnroll = 6;
else if (archName == "gfx941") defaultGfxUnroll = 6; else if (archName == "gfx941") defaultGfxUnroll = 6;
else if (archName == "gfx942") defaultGfxUnroll = 4; else if (archName == "gfx942") defaultGfxUnroll = 4;
...@@ -183,7 +183,7 @@ public: ...@@ -183,7 +183,7 @@ public:
byteOffset = GetEnvVar("BYTE_OFFSET" , 0); byteOffset = GetEnvVar("BYTE_OFFSET" , 0);
continueOnError = GetEnvVar("CONTINUE_ON_ERROR" , 0); continueOnError = GetEnvVar("CONTINUE_ON_ERROR" , 0);
gfxBlockSize = GetEnvVar("GFX_BLOCK_SIZE" , 256); gfxBlockSize = GetEnvVar("GFX_BLOCK_SIZE" , 256);
gfxSingleTeam = GetEnvVar("GFX_SINGLE_TEAM" , 0); gfxSingleTeam = GetEnvVar("GFX_SINGLE_TEAM" , 1);
gfxUnroll = GetEnvVar("GFX_UNROLL" , defaultGfxUnroll); gfxUnroll = GetEnvVar("GFX_UNROLL" , defaultGfxUnroll);
gfxWaveOrder = GetEnvVar("GFX_WAVE_ORDER" , 0); gfxWaveOrder = GetEnvVar("GFX_WAVE_ORDER" , 0);
hideEnv = GetEnvVar("HIDE_ENV" , 0); hideEnv = GetEnvVar("HIDE_ENV" , 0);
...@@ -509,7 +509,7 @@ public: ...@@ -509,7 +509,7 @@ public:
if (gfxUnroll < 1 || gfxUnroll > MAX_UNROLL) if (gfxUnroll < 1 || gfxUnroll > MAX_UNROLL)
{ {
printf("[ERROR] GFX kernel unroll factor must be between 1 and %d\n", MAX_UNROLL); printf("[ERROR] GFX kernel unroll factor must be between 1 and %d (Not %d)\n", MAX_UNROLL, gfxUnroll);
exit(1); exit(1);
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment