Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
one
TransferBench
Commits
0b29707e
Unverified
Commit
0b29707e
authored
Oct 11, 2023
by
gilbertlee-amd
Committed by
GitHub
Oct 11, 2023
Browse files
Fix inf, CU labelling. Update default kernels for gfx94x (#56)
parent
0b7b979e
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
45 additions
and
43 deletions
+45
-43
CHANGELOG.md
CHANGELOG.md
+10
-0
src/TransferBench.cpp
src/TransferBench.cpp
+6
-40
src/include/EnvVars.hpp
src/include/EnvVars.hpp
+29
-2
src/include/TransferBench.hpp
src/include/TransferBench.hpp
+0
-1
No files found.
CHANGELOG.md
View file @
0b29707e
# Changelog for TransferBench
## v1.29
### Added
-
a2a preset config now responds to USE_REMOTE_READ
### Fixed
-
Race condition during wall-clock initialization that caused "inf" during single-stream runs
-
CU numbering output after CU masking
### Modified
-
Default number of warmups reverted to 3
-
Default unroll factor for gfx940/941 set to 6
## v1.28
### Added
-
Added A2A_DIRECT, which executes all-to-all only between directly connected GPUs (on by default now)
...
...
src/TransferBench.cpp
View file @
0b29707e
...
...
@@ -1255,7 +1255,7 @@ uint32_t GetId(uint32_t hwId)
{
// Based on instinct-mi200-cdna2-instruction-set-architecture.pdf
int
const
shId
=
(
hwId
>>
12
)
&
1
;
int
const
cuId
=
(
hwId
>>
8
)
&
7
;
int
const
cuId
=
(
hwId
>>
8
)
&
15
;
int
const
seId
=
(
hwId
>>
13
)
&
3
;
return
(
shId
<<
5
)
+
(
cuId
<<
2
)
+
seId
;
}
...
...
@@ -1313,7 +1313,7 @@ void RunTransfer(EnvVars const& ev, int const iteration,
minStartCycle
=
std
::
min
(
minStartCycle
,
currTransfer
->
subExecParamGpuPtr
[
i
].
startCycle
);
maxStopCycle
=
std
::
max
(
maxStopCycle
,
currTransfer
->
subExecParamGpuPtr
[
i
].
stopCycle
);
}
int
const
wallClockRate
=
GetW
allClock
Rate
(
exeIndex
)
;
int
const
wallClockRate
=
ev
.
w
allClock
PerDeviceMhz
[
exeIndex
]
;
double
iterationTimeMs
=
(
maxStopCycle
-
minStartCycle
)
/
(
double
)(
wallClockRate
);
currTransfer
->
transferTime
+=
iterationTimeMs
;
if
(
ev
.
showIterations
)
...
...
@@ -1799,10 +1799,11 @@ void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, i
for
(
int
i
=
0
;
i
<
numGpus
;
i
++
)
{
transfer
.
srcIndex
[
0
]
=
i
;
transfer
.
exeIndex
=
i
;
for
(
int
j
=
0
;
j
<
numGpus
;
j
++
)
{
transfer
.
dstIndex
[
0
]
=
j
;
transfer
.
exeIndex
=
(
ev
.
useRemoteRead
?
j
:
i
);
if
(
ev
.
a2aDirect
)
{
#if !defined(__NVCC__)
...
...
@@ -2124,41 +2125,6 @@ std::string Transfer::DstToStr() const
return
ss
.
str
();
}
// NOTE: This is a stop-gap solution until HIP provides wallclock values
int
GetWallClockRate
(
int
deviceId
)
{
static
std
::
vector
<
int
>
wallClockPerDeviceMhz
;
if
(
wallClockPerDeviceMhz
.
size
()
==
0
)
{
int
numGpuDevices
;
HIP_CALL
(
hipGetDeviceCount
(
&
numGpuDevices
));
wallClockPerDeviceMhz
.
resize
(
numGpuDevices
);
for
(
int
i
=
0
;
i
<
numGpuDevices
;
i
++
)
{
#if defined(__NVCC__)
int
value
=
1410000
;
//HIP_CALL(hipDeviceGetAttribute(&value, hipDeviceAttributeClockRate, i));
//value *= 1000;
#else
hipDeviceProp_t
prop
;
HIP_CALL
(
hipGetDeviceProperties
(
&
prop
,
i
));
int
value
=
25000
;
switch
(
prop
.
gcnArch
)
{
case
906
:
case
910
:
value
=
25000
;
break
;
case
940
:
case
941
:
case
942
:
value
=
100000
;
break
;
default:
printf
(
"Unrecognized GCN arch %d
\n
"
,
prop
.
gcnArch
);
}
#endif
wallClockPerDeviceMhz
[
i
]
=
value
;
}
}
return
wallClockPerDeviceMhz
[
deviceId
];
}
void
RunSweepPreset
(
EnvVars
const
&
ev
,
size_t
const
numBytesPerTransfer
,
int
const
numGpuSubExecs
,
int
const
numCpuSubExecs
,
bool
const
isRandom
)
{
ev
.
DisplaySweepEnvVars
();
...
...
src/include/EnvVars.hpp
View file @
0b29707e
...
...
@@ -29,7 +29,7 @@ THE SOFTWARE.
#include "Compatibility.hpp"
#include "Kernels.hpp"
#define TB_VERSION "1.2
8
"
#define TB_VERSION "1.2
9
"
extern
char
const
MemTypeStr
[];
extern
char
const
ExeTypeStr
[];
...
...
@@ -48,7 +48,7 @@ class EnvVars
{
public:
// Default configuration values
int
const
DEFAULT_NUM_WARMUPS
=
1
;
int
const
DEFAULT_NUM_WARMUPS
=
3
;
int
const
DEFAULT_NUM_ITERATIONS
=
10
;
int
const
DEFAULT_SAMPLING_FACTOR
=
1
;
...
...
@@ -123,6 +123,8 @@ public:
// Track how many CPUs are available per NUMA node
std
::
vector
<
int
>
numCpusPerNuma
;
std
::
vector
<
int
>
wallClockPerDeviceMhz
;
// Constructor that collects values
EnvVars
()
{
...
...
@@ -152,6 +154,8 @@ public:
int
defaultGpuKernel
=
0
;
if
(
archName
==
"gfx906"
)
defaultGpuKernel
=
13
;
else
if
(
archName
==
"gfx90a"
)
defaultGpuKernel
=
9
;
else
if
(
archName
==
"gfx940"
)
defaultGpuKernel
=
6
;
else
if
(
archName
==
"gfx941"
)
defaultGpuKernel
=
6
;
blockBytes
=
GetEnvVar
(
"BLOCK_BYTES"
,
256
);
byteOffset
=
GetEnvVar
(
"BYTE_OFFSET"
,
0
);
...
...
@@ -411,6 +415,26 @@ public:
for
(
int
i
=
0
;
i
<
totalCpus
;
i
++
)
numCpusPerNuma
[
numa_node_of_cpu
(
i
)]
++
;
// Build array of wall clock rates per GPU device
wallClockPerDeviceMhz
.
resize
(
numDetectedGpus
);
for
(
int
i
=
0
;
i
<
numDetectedGpus
;
i
++
)
{
#if defined(__NVCC__)
// NOTE: wallClock doesn't exist in CUDA. This may need to be adjusted / run with fixed clocks
wallClockPerDeviceMhz
[
i
]
=
1410000
;
#else
hipDeviceProp_t
prop
;
HIP_CALL
(
hipGetDeviceProperties
(
&
prop
,
i
));
int
value
=
25000
;
std
::
string
fullName
=
prop
.
gcnArchName
;
std
::
string
archName
=
fullName
.
substr
(
0
,
fullName
.
find
(
':'
));
if
(
archName
==
"gfx940"
||
archName
==
"gfx941"
||
archName
==
"gfx942"
)
wallClockPerDeviceMhz
[
i
]
=
100000
;
else
wallClockPerDeviceMhz
[
i
]
=
25000
;
#endif
}
// Check for deprecated env vars
if
(
getenv
(
"USE_HIP_CALL"
))
{
...
...
@@ -577,6 +601,9 @@ public:
printf
(
"[AllToAll Related]
\n
"
);
PRINT_EV
(
"A2A_DIRECT"
,
a2aDirect
,
std
::
string
(
a2aDirect
?
"Only using direct links"
:
"Full all-to-all"
));
PRINT_EV
(
"USE_REMOTE_READ"
,
useRemoteRead
,
std
::
string
(
"Using "
)
+
(
useRemoteRead
?
"DST"
:
"SRC"
)
+
" as executor"
);
printf
(
"
\n
"
);
}
...
...
src/include/TransferBench.hpp
View file @
0b29707e
...
...
@@ -193,6 +193,5 @@ void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, i
std
::
string
GetLinkTypeDesc
(
uint32_t
linkType
,
uint32_t
hopCount
);
int
RemappedIndex
(
int
const
origIdx
,
bool
const
isCpuType
);
int
GetWallClockRate
(
int
deviceId
);
void
LogTransfers
(
FILE
*
fp
,
int
const
testNum
,
std
::
vector
<
Transfer
>
const
&
transfers
);
std
::
string
PtrVectorToStr
(
std
::
vector
<
float
*>
const
&
strVector
,
int
const
initOffset
);
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment