Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
one
TransferBench
Commits
30f1c584
Unverified
Commit
30f1c584
authored
Nov 30, 2023
by
gilbertlee-amd
Committed by
GitHub
Nov 30, 2023
Browse files
v1.43 Minor changes to a2a output (#76)
parent
8b2cd85d
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
59 additions
and
44 deletions
+59
-44
CHANGELOG.md
CHANGELOG.md
+5
-0
src/TransferBench.cpp
src/TransferBench.cpp
+50
-42
src/include/EnvVars.hpp
src/include/EnvVars.hpp
+1
-1
src/include/TransferBench.hpp
src/include/TransferBench.hpp
+3
-1
No files found.
CHANGELOG.md
View file @
30f1c584
...
@@ -3,6 +3,11 @@
...
@@ -3,6 +3,11 @@
Documentation for TransferBench is available at
Documentation for TransferBench is available at
[
https://rocm.docs.amd.com/projects/TransferBench
](
https://rocm.docs.amd.com/projects/TransferBench
)
.
[
https://rocm.docs.amd.com/projects/TransferBench
](
https://rocm.docs.amd.com/projects/TransferBench
)
.
## v1.43
### Changes
*
Modifying a2a to show executor timing, as well as executor min/max bandwidth
## v1.42
## v1.42
### Fixes
### Fixes
...
...
src/TransferBench.cpp
View file @
30f1c584
...
@@ -581,7 +581,8 @@ void ExecuteTransfers(EnvVars const& ev,
...
@@ -581,7 +581,8 @@ void ExecuteTransfers(EnvVars const& ev,
for
(
auto
const
&
transfer
:
exeInfo
.
transfers
)
for
(
auto
const
&
transfer
:
exeInfo
.
transfers
)
{
{
transfer
->
transferTime
/=
(
1.0
*
numTimedIterations
);
transfer
->
transferTime
/=
(
1.0
*
numTimedIterations
);
double
transferBandwidthGbs
=
(
transfer
->
numBytesActual
/
1.0E9
)
/
transfer
->
transferTime
*
1000.0
f
;
transfer
->
transferBandwidth
=
(
transfer
->
numBytesActual
/
1.0E9
)
/
transfer
->
transferTime
*
1000.0
f
;
transfer
->
executorBandwidth
=
exeBandwidthGbs
;
totalCUs
+=
transfer
->
numSubExecs
;
totalCUs
+=
transfer
->
numSubExecs
;
if
(
!
verbose
)
continue
;
if
(
!
verbose
)
continue
;
...
@@ -589,7 +590,7 @@ void ExecuteTransfers(EnvVars const& ev,
...
@@ -589,7 +590,7 @@ void ExecuteTransfers(EnvVars const& ev,
{
{
printf
(
" Transfer %02d | %7.3f GB/s | %8.3f ms | %12lu bytes | %s -> %s%02d:%03d -> %s
\n
"
,
printf
(
" Transfer %02d | %7.3f GB/s | %8.3f ms | %12lu bytes | %s -> %s%02d:%03d -> %s
\n
"
,
transfer
->
transferIndex
,
transfer
->
transferIndex
,
transferBandwidth
Gbs
,
transfer
->
transferBandwidth
,
transfer
->
transferTime
,
transfer
->
transferTime
,
transfer
->
numBytesActual
,
transfer
->
numBytesActual
,
transfer
->
SrcToStr
().
c_str
(),
transfer
->
SrcToStr
().
c_str
(),
...
@@ -609,7 +610,7 @@ void ExecuteTransfers(EnvVars const& ev,
...
@@ -609,7 +610,7 @@ void ExecuteTransfers(EnvVars const& ev,
stdDevTime
+=
varTime
*
varTime
;
stdDevTime
+=
varTime
*
varTime
;
double
iterBandwidthGbs
=
(
transfer
->
numBytesActual
/
1.0E9
)
/
transfer
->
perIterationTime
[
i
]
*
1000.0
f
;
double
iterBandwidthGbs
=
(
transfer
->
numBytesActual
/
1.0E9
)
/
transfer
->
perIterationTime
[
i
]
*
1000.0
f
;
double
const
varBw
=
fabs
(
iterBandwidthGbs
-
transferBandwidth
Gbs
);
double
const
varBw
=
fabs
(
iterBandwidthGbs
-
transfer
->
transferBandwidth
);
stdDevBw
+=
varBw
*
varBw
;
stdDevBw
+=
varBw
*
varBw
;
}
}
stdDevTime
=
sqrt
(
stdDevTime
/
numTimedIterations
);
stdDevTime
=
sqrt
(
stdDevTime
/
numTimedIterations
);
...
@@ -647,7 +648,7 @@ void ExecuteTransfers(EnvVars const& ev,
...
@@ -647,7 +648,7 @@ void ExecuteTransfers(EnvVars const& ev,
MemTypeStr
[
transfer
->
exeType
],
transfer
->
exeIndex
,
MemTypeStr
[
transfer
->
exeType
],
transfer
->
exeIndex
,
transfer
->
DstToStr
().
c_str
(),
transfer
->
DstToStr
().
c_str
(),
transfer
->
numSubExecs
,
transfer
->
numSubExecs
,
transferBandwidth
Gbs
,
transfer
->
transferTime
,
transfer
->
transferBandwidth
,
transfer
->
transferTime
,
PtrVectorToStr
(
transfer
->
srcMem
,
initOffset
).
c_str
(),
PtrVectorToStr
(
transfer
->
srcMem
,
initOffset
).
c_str
(),
PtrVectorToStr
(
transfer
->
dstMem
,
initOffset
).
c_str
());
PtrVectorToStr
(
transfer
->
dstMem
,
initOffset
).
c_str
());
}
}
...
@@ -668,14 +669,15 @@ void ExecuteTransfers(EnvVars const& ev,
...
@@ -668,14 +669,15 @@ void ExecuteTransfers(EnvVars const& ev,
{
{
Transfer
*
transfer
=
transferPair
.
second
;
Transfer
*
transfer
=
transferPair
.
second
;
transfer
->
transferTime
/=
(
1.0
*
numTimedIterations
);
transfer
->
transferTime
/=
(
1.0
*
numTimedIterations
);
double
transferBandwidthGbs
=
(
transfer
->
numBytesActual
/
1.0E9
)
/
transfer
->
transferTime
*
1000.0
f
;
transfer
->
transferBandwidth
=
(
transfer
->
numBytesActual
/
1.0E9
)
/
transfer
->
transferTime
*
1000.0
f
;
transfer
->
executorBandwidth
=
transfer
->
transferBandwidth
;
maxGpuTime
=
std
::
max
(
maxGpuTime
,
transfer
->
transferTime
);
maxGpuTime
=
std
::
max
(
maxGpuTime
,
transfer
->
transferTime
);
if
(
!
verbose
)
continue
;
if
(
!
verbose
)
continue
;
if
(
!
ev
.
outputToCsv
)
if
(
!
ev
.
outputToCsv
)
{
{
printf
(
" Transfer %02d | %7.3f GB/s | %8.3f ms | %12lu bytes | %s -> %s%02d:%03d -> %s
\n
"
,
printf
(
" Transfer %02d | %7.3f GB/s | %8.3f ms | %12lu bytes | %s -> %s%02d:%03d -> %s
\n
"
,
transfer
->
transferIndex
,
transfer
->
transferIndex
,
transferBandwidth
Gbs
,
transfer
->
transferTime
,
transfer
->
transferBandwidth
,
transfer
->
transferTime
,
transfer
->
numBytesActual
,
transfer
->
numBytesActual
,
transfer
->
SrcToStr
().
c_str
(),
transfer
->
SrcToStr
().
c_str
(),
ExeTypeName
[
transfer
->
exeType
],
transfer
->
exeIndex
,
ExeTypeName
[
transfer
->
exeType
],
transfer
->
exeIndex
,
...
@@ -694,7 +696,7 @@ void ExecuteTransfers(EnvVars const& ev,
...
@@ -694,7 +696,7 @@ void ExecuteTransfers(EnvVars const& ev,
stdDevTime
+=
varTime
*
varTime
;
stdDevTime
+=
varTime
*
varTime
;
double
iterBandwidthGbs
=
(
transfer
->
numBytesActual
/
1.0E9
)
/
transfer
->
perIterationTime
[
i
]
*
1000.0
f
;
double
iterBandwidthGbs
=
(
transfer
->
numBytesActual
/
1.0E9
)
/
transfer
->
perIterationTime
[
i
]
*
1000.0
f
;
double
const
varBw
=
fabs
(
iterBandwidthGbs
-
transferBandwidth
Gbs
);
double
const
varBw
=
fabs
(
iterBandwidthGbs
-
transfer
->
transferBandwidth
);
stdDevBw
+=
varBw
*
varBw
;
stdDevBw
+=
varBw
*
varBw
;
}
}
stdDevTime
=
sqrt
(
stdDevTime
/
numTimedIterations
);
stdDevTime
=
sqrt
(
stdDevTime
/
numTimedIterations
);
...
@@ -731,7 +733,7 @@ void ExecuteTransfers(EnvVars const& ev,
...
@@ -731,7 +733,7 @@ void ExecuteTransfers(EnvVars const& ev,
ExeTypeName
[
transfer
->
exeType
],
transfer
->
exeIndex
,
ExeTypeName
[
transfer
->
exeType
],
transfer
->
exeIndex
,
transfer
->
DstToStr
().
c_str
(),
transfer
->
DstToStr
().
c_str
(),
transfer
->
numSubExecs
,
transfer
->
numSubExecs
,
transferBandwidth
Gbs
,
transfer
->
transferTime
,
transfer
->
transferBandwidth
,
transfer
->
transferTime
,
PtrVectorToStr
(
transfer
->
srcMem
,
initOffset
).
c_str
(),
PtrVectorToStr
(
transfer
->
srcMem
,
initOffset
).
c_str
(),
PtrVectorToStr
(
transfer
->
dstMem
,
initOffset
).
c_str
());
PtrVectorToStr
(
transfer
->
dstMem
,
initOffset
).
c_str
());
}
}
...
@@ -1849,20 +1851,21 @@ void RunScalingBenchmark(EnvVars const& ev, size_t N, int const exeIndex, int co
...
@@ -1849,20 +1851,21 @@ void RunScalingBenchmark(EnvVars const& ev, size_t N, int const exeIndex, int co
char
separator
=
(
ev
.
outputToCsv
?
','
:
' '
);
char
separator
=
(
ev
.
outputToCsv
?
','
:
' '
);
std
::
vector
<
Transfer
>
transfers
(
1
);
std
::
vector
<
Transfer
>
transfers
(
1
);
transfers
[
0
].
numBytes
=
N
*
sizeof
(
float
);
Transfer
&
t
=
transfers
[
0
];
transfers
[
0
].
numSrcs
=
1
;
t
.
numBytes
=
N
*
sizeof
(
float
);
transfers
[
0
].
numDsts
=
1
;
t
.
numSrcs
=
1
;
transfers
[
0
].
exeType
=
EXE_GPU_GFX
;
t
.
numDsts
=
1
;
transfers
[
0
].
exeIndex
=
exeIndex
;
t
.
exeType
=
EXE_GPU_GFX
;
transfers
[
0
].
exeSubIndex
=
-
1
;
t
.
exeIndex
=
exeIndex
;
transfers
[
0
].
srcType
.
resize
(
1
,
MEM_GPU
);
t
.
exeSubIndex
=
-
1
;
transfers
[
0
].
dstType
.
resize
(
1
,
MEM_GPU
);
t
.
srcType
.
resize
(
1
,
MEM_GPU
);
transfers
[
0
].
srcIndex
.
resize
(
1
);
t
.
dstType
.
resize
(
1
,
MEM_GPU
);
transfers
[
0
].
dstIndex
.
resize
(
1
);
t
.
srcIndex
.
resize
(
1
);
t
.
dstIndex
.
resize
(
1
);
printf
(
"GPU-GFX Scaling benchmark:
\n
"
);
printf
(
"GPU-GFX Scaling benchmark:
\n
"
);
printf
(
"==========================
\n
"
);
printf
(
"==========================
\n
"
);
printf
(
"- Copying %lu bytes from GPU %d to other devices
\n
"
,
t
ransfers
[
0
]
.
numBytes
,
exeIndex
);
printf
(
"- Copying %lu bytes from GPU %d to other devices
\n
"
,
t
.
numBytes
,
exeIndex
);
printf
(
"- All numbers reported as GB/sec
\n\n
"
);
printf
(
"- All numbers reported as GB/sec
\n\n
"
);
printf
(
"NumCUs"
);
printf
(
"NumCUs"
);
...
@@ -1873,21 +1876,20 @@ void RunScalingBenchmark(EnvVars const& ev, size_t N, int const exeIndex, int co
...
@@ -1873,21 +1876,20 @@ void RunScalingBenchmark(EnvVars const& ev, size_t N, int const exeIndex, int co
std
::
vector
<
std
::
pair
<
double
,
int
>>
bestResult
(
numDevices
);
std
::
vector
<
std
::
pair
<
double
,
int
>>
bestResult
(
numDevices
);
for
(
int
numSubExec
=
1
;
numSubExec
<=
maxSubExecs
;
numSubExec
++
)
for
(
int
numSubExec
=
1
;
numSubExec
<=
maxSubExecs
;
numSubExec
++
)
{
{
t
ransfers
[
0
]
.
numSubExecs
=
numSubExec
;
t
.
numSubExecs
=
numSubExec
;
printf
(
"%4d "
,
numSubExec
);
printf
(
"%4d "
,
numSubExec
);
for
(
int
i
=
0
;
i
<
numDevices
;
i
++
)
for
(
int
i
=
0
;
i
<
numDevices
;
i
++
)
{
{
t
ransfers
[
0
]
.
dstType
[
0
]
=
i
<
numCpus
?
MEM_CPU
:
MEM_GPU
;
t
.
dstType
[
0
]
=
i
<
numCpus
?
MEM_CPU
:
MEM_GPU
;
t
ransfers
[
0
]
.
dstIndex
[
0
]
=
i
<
numCpus
?
i
:
i
-
numCpus
;
t
.
dstIndex
[
0
]
=
i
<
numCpus
?
i
:
i
-
numCpus
;
ExecuteTransfers
(
ev
,
0
,
N
,
transfers
,
false
);
ExecuteTransfers
(
ev
,
0
,
N
,
transfers
,
false
);
double
transferBandwidthGbs
=
(
transfers
[
0
].
numBytesActual
/
1.0E9
)
/
transfers
[
0
].
transferTime
*
1000.0
f
;
printf
(
"%c%7.2f "
,
separator
,
t
.
transferBandwidth
);
printf
(
"%c%7.2f "
,
separator
,
transferBandwidthGbs
);
if
(
transferBandwidth
Gbs
>
bestResult
[
i
].
first
)
if
(
t
.
transferBandwidth
>
bestResult
[
i
].
first
)
{
{
bestResult
[
i
].
first
=
transferBandwidth
Gbs
;
bestResult
[
i
].
first
=
t
.
transferBandwidth
;
bestResult
[
i
].
second
=
numSubExec
;
bestResult
[
i
].
second
=
numSubExec
;
}
}
}
}
...
@@ -1960,14 +1962,14 @@ void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, i
...
@@ -1960,14 +1962,14 @@ void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, i
if
(
transfers
.
size
()
==
0
)
return
;
if
(
transfers
.
size
()
==
0
)
return
;
double
totalBandwidthCpu
=
0
;
double
totalBandwidthCpu
=
0
;
ExecuteTransfers
(
ev
,
0
,
numBytesPerTransfer
/
sizeof
(
float
),
transfers
,
true
,
&
totalBandwidthCpu
);
ExecuteTransfers
(
ev
,
0
,
numBytesPerTransfer
/
sizeof
(
float
),
transfers
,
!
ev
.
hideEnv
,
&
totalBandwidthCpu
);
printf
(
"
\n
Summary:
\n
"
);
printf
(
"
\n
Summary:
\n
"
);
printf
(
"==========================================================
\n
"
);
printf
(
"==========================================================
\n
"
);
printf
(
"SRC
\\
DST"
);
printf
(
"SRC
\\
DST
"
);
for
(
int
dst
=
0
;
dst
<
numGpus
;
dst
++
)
for
(
int
dst
=
0
;
dst
<
numGpus
;
dst
++
)
printf
(
"%cGPU %02d "
,
separator
,
dst
);
printf
(
"%cGPU %02d "
,
separator
,
dst
);
printf
(
" %cSTotal
\n
"
,
separator
);
printf
(
" %cSTotal
%cActual
\n
"
,
separator
,
separator
);
std
::
map
<
std
::
pair
<
int
,
int
>
,
int
>
reIndex
;
std
::
map
<
std
::
pair
<
int
,
int
>
,
int
>
reIndex
;
for
(
int
i
=
0
;
i
<
transfers
.
size
();
i
++
)
for
(
int
i
=
0
;
i
<
transfers
.
size
();
i
++
)
...
@@ -1977,41 +1979,47 @@ void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, i
...
@@ -1977,41 +1979,47 @@ void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, i
}
}
double
totalBandwidthGpu
=
0.0
;
double
totalBandwidthGpu
=
0.0
;
double
minExecutorBandwidth
=
std
::
numeric_limits
<
double
>::
max
();
double
maxExecutorBandwidth
=
0.0
;
std
::
vector
<
double
>
colTotalBandwidth
(
numGpus
+
1
,
0.0
);
std
::
vector
<
double
>
colTotalBandwidth
(
numGpus
+
1
,
0.0
);
for
(
int
src
=
0
;
src
<
numGpus
;
src
++
)
for
(
int
src
=
0
;
src
<
numGpus
;
src
++
)
{
{
double
rowTotalBandwidth
=
0
;
double
rowTotalBandwidth
=
0
;
double
executorBandwidth
=
0
;
printf
(
"GPU %02d"
,
src
);
printf
(
"GPU %02d"
,
src
);
for
(
int
dst
=
0
;
dst
<
numGpus
;
dst
++
)
for
(
int
dst
=
0
;
dst
<
numGpus
;
dst
++
)
{
{
if
(
reIndex
.
count
(
std
::
make_pair
(
src
,
dst
)))
if
(
reIndex
.
count
(
std
::
make_pair
(
src
,
dst
)))
{
{
Transfer
const
&
transfer
=
transfers
[
reIndex
[
std
::
make_pair
(
src
,
dst
)]];
Transfer
const
&
transfer
=
transfers
[
reIndex
[
std
::
make_pair
(
src
,
dst
)]];
double
transfer
Bandwidth
Gbs
=
(
transfer
.
numBytesActual
/
1.0E9
)
/
transfer
.
transferTime
*
1000.0
f
;
colTotal
Bandwidth
[
dst
]
+
=
transfer
.
transferBandwidth
;
col
TotalBandwidth
[
dst
]
+=
transferBandwidth
Gbs
;
row
TotalBandwidth
+=
transfer
.
transferBandwidth
;
rowT
otalBandwidth
+=
transferBandwidth
Gbs
;
t
otalBandwidth
Gpu
+=
transfer
.
transferBandwidth
;
total
Bandwidth
Gpu
+=
transferBandwidth
Gbs
;
executorBandwidth
=
std
::
max
(
executor
Bandwidth
,
transfer
.
executor
Bandwidth
)
;
printf
(
"%c%
7.2
f "
,
separator
,
transferBandwidth
Gbs
);
printf
(
"%c%
8.3
f "
,
separator
,
transfer
.
transferBandwidth
);
}
}
else
else
{
{
printf
(
"%c%
7
s "
,
separator
,
"N/A"
);
printf
(
"%c%
8
s "
,
separator
,
"N/A"
);
}
}
}
}
printf
(
" %c%7.2f
\n
"
,
separator
,
rowTotalBandwidth
);
printf
(
" %c%8.3f %c%8.3f
\n
"
,
separator
,
rowTotalBandwidth
,
separator
,
executorBandwidth
);
minExecutorBandwidth
=
std
::
min
(
minExecutorBandwidth
,
executorBandwidth
);
maxExecutorBandwidth
=
std
::
max
(
maxExecutorBandwidth
,
executorBandwidth
);
colTotalBandwidth
[
numGpus
]
+=
rowTotalBandwidth
;
colTotalBandwidth
[
numGpus
]
+=
rowTotalBandwidth
;
}
}
printf
(
"
\n
RTotal"
);
printf
(
"
\n
RTotal"
);
for
(
int
dst
=
0
;
dst
<
numGpus
;
dst
++
)
for
(
int
dst
=
0
;
dst
<
numGpus
;
dst
++
)
{
{
printf
(
"%c%
7.2
f "
,
separator
,
colTotalBandwidth
[
dst
]);
printf
(
"%c%
8.3
f "
,
separator
,
colTotalBandwidth
[
dst
]);
}
}
printf
(
" %c%7.2f
\n
"
,
separator
,
colTotalBandwidth
[
numGpus
]);
printf
(
" %c%8.3f %c%8.3f %c%8.3f
\n
"
,
separator
,
colTotalBandwidth
[
numGpus
],
separator
,
minExecutorBandwidth
,
separator
,
maxExecutorBandwidth
);
printf
(
"
\n
"
);
printf
(
"
\n
"
);
printf
(
"Average bandwidth (GPU Timed): %
7.2
f GB/s
\n
"
,
totalBandwidthGpu
/
transfers
.
size
());
printf
(
"Average bandwidth (GPU Timed): %
8.3
f GB/s
\n
"
,
totalBandwidthGpu
/
transfers
.
size
());
printf
(
"Aggregate bandwidth (GPU Timed): %
7.2
f GB/s
\n
"
,
totalBandwidthGpu
);
printf
(
"Aggregate bandwidth (GPU Timed): %
8.3
f GB/s
\n
"
,
totalBandwidthGpu
);
printf
(
"Aggregate bandwidth (CPU Timed): %
7.2
f GB/s
\n
"
,
totalBandwidthCpu
);
printf
(
"Aggregate bandwidth (CPU Timed): %
8.3
f GB/s
\n
"
,
totalBandwidthCpu
);
}
}
void
Transfer
::
PrepareSubExecParams
(
EnvVars
const
&
ev
)
void
Transfer
::
PrepareSubExecParams
(
EnvVars
const
&
ev
)
...
...
src/include/EnvVars.hpp
View file @
30f1c584
...
@@ -29,7 +29,7 @@ THE SOFTWARE.
...
@@ -29,7 +29,7 @@ THE SOFTWARE.
#include "Compatibility.hpp"
#include "Compatibility.hpp"
#include "Kernels.hpp"
#include "Kernels.hpp"
#define TB_VERSION "1.4
2
"
#define TB_VERSION "1.4
3
"
extern
char
const
MemTypeStr
[];
extern
char
const
MemTypeStr
[];
extern
char
const
ExeTypeStr
[];
extern
char
const
ExeTypeStr
[];
...
...
src/include/TransferBench.hpp
View file @
30f1c584
...
@@ -115,7 +115,9 @@ struct Transfer
...
@@ -115,7 +115,9 @@ struct Transfer
// Outputs
// Outputs
size_t
numBytesActual
;
// Actual number of bytes to copy
size_t
numBytesActual
;
// Actual number of bytes to copy
double
transferTime
;
// Time taken in milliseconds
double
transferTime
;
// Time taken in milliseconds for this transfer
double
transferBandwidth
;
// Transfer bandwidth (GB/s)
double
executorBandwidth
;
// Executor bandwidth (GB/s)
std
::
vector
<
double
>
perIterationTime
;
// Per-iteration timing
std
::
vector
<
double
>
perIterationTime
;
// Per-iteration timing
std
::
vector
<
std
::
set
<
std
::
pair
<
int
,
int
>>>
perIterationCUs
;
// Per-iteration CU usage
std
::
vector
<
std
::
set
<
std
::
pair
<
int
,
int
>>>
perIterationCUs
;
// Per-iteration CU usage
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment