one / TransferBench · Commits · Commit c72a105f (unverified)

v1.12 Adding NVIDIA platform support (#10)

Authored Jan 31, 2023 by gilbertlee-amd; committed by GitHub, Jan 31, 2023
Parent: cc0e9cb4

Showing 7 changed files with 100 additions and 15 deletions (+100, -15)
CHANGELOG.md        +5   -0
EnvVars.hpp         +1   -1
Kernels.hpp         +17  -6
Makefile            +1   -1
README.md           +8   -0
TransferBench.cpp   +60  -6
TransferBench.hpp   +8   -1
CHANGELOG.md
 # Changelog for TransferBench
+## v1.12
+### Added
+- Added support for TransferBench on NVIDIA platforms (via HIP_PLATFORM=nvidia)
+- CPU executors on NVIDIA platform cannot access GPU memory (no large-bar access)
 ## v1.11
 ### Added
 - New multi-input / multi-output support (MIMO). Transfers now can reduce (element-wise summation) multiple input memory arrays
...
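The second v1.12 note is about TransferBench's CPU executors, which read and write GPU buffers directly from host threads. That relies on large-BAR (host-visible) device memory on AMD, while NVIDIA device allocations are not host-dereferenceable, so those Transfer combinations are now skipped (see the TransferBench.cpp hunks below). A rough illustration of the distinction, assuming a large-BAR AMD system; this is not code from the commit:

```cpp
// Rough illustration (not from this commit): why a CPU executor can touch GPU
// memory directly on a large-BAR AMD system but not on an NVIDIA one.
void HostTouchGpuBuffer()
{
  float* devBuf = nullptr;
  HIP_CALL(hipMalloc((void**)&devBuf, 1024 * sizeof(float)));
#if !defined(__NVCC__)
  devBuf[0] = 1.0f;   // host-side store straight into GPU memory (large-BAR access)
#else
  // A host dereference of devBuf would fault here; a GPU executor has to do the access.
#endif
  HIP_CALL(hipFree(devBuf));
}
```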
EnvVars.hpp
...
@@ -28,7 +28,7 @@ THE SOFTWARE.
 #include <time.h>
 #include "Kernels.hpp"
-#define TB_VERSION "1.11"
+#define TB_VERSION "1.12"
 extern char const MemTypeStr[];
 extern char const ExeTypeStr[];
...
Kernels.hpp
...
@@ -29,6 +29,19 @@ THE SOFTWARE.
 #define MEMSET_CHAR 75
 #define MEMSET_VAL  13323083.0f
+#if defined(__NVCC__)
+// Define float4 addition operator for NVIDIA platform
+__device__ inline float4& operator+=(float4& a, const float4& b)
+{
+  a.x += b.x;
+  a.y += b.y;
+  a.z += b.z;
+  a.w += b.w;
+  return a;
+}
+#endif
 // Each subExecutor is provided with subarrays to work on
 #define MAX_SRCS 16
 #define MAX_DSTS 16
...
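HIP's vector types already provide compound assignment when compiling for AMD, but CUDA's float4 does not define operator+=, so the packed-float reduce paths (the templated kernels work on FLOAT_TYPE packs such as float4) need this operator under nvcc. A minimal, illustrative sketch of the kind of use it enables; the helper name is hypothetical, not taken from the kernels:

```cpp
// Illustrative only: packed accumulation like the FLOAT_TYPE = float4 reduce
// path performs; under __NVCC__ the += below resolves to the operator added
// in this hunk.
__device__ void AccumulatePacked(float4& sum, const float4* __restrict__ src, int idx)
{
  sum += src[idx];
}
```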
@@ -51,14 +64,14 @@ void CpuReduceKernel(SubExecParam const& p)
   if (numSrcs == 0)
   {
     for (int i = 0; i < numDsts; ++i)
-      memset((float* __restrict__)p.dst[i], MEMSET_CHAR, p.N * sizeof(float));
+      memset(p.dst[i], MEMSET_CHAR, p.N * sizeof(float));
   }
   else if (numSrcs == 1)
   {
     float const* __restrict__ src = p.src[0];
     for (int i = 0; i < numDsts; ++i)
     {
-      memcpy((float* __restrict__)p.dst[i], src, p.N * sizeof(float));
+      memcpy(p.dst[i], src, p.N * sizeof(float));
     }
   }
   else
...
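Both the CPU and GPU reduce paths in this commit drop the `(float* __restrict__)` casts, presumably because the cast-with-qualifier form is not accepted when the kernels are compiled with nvcc. If the no-alias hint were still wanted, a portable alternative (hypothetical, not what this commit does) is to qualify local pointers instead of casting inline:

```cpp
// Hypothetical alternative, not part of this commit: keep the no-alias hint in
// a form both hipcc and nvcc accept by qualifying local pointers rather than
// casting inline.
#include <cstring>

void CopyWithRestrictHint(float* dstRaw, const float* srcRaw, size_t n)
{
  float*       __restrict__ dst = dstRaw;
  const float* __restrict__ src = srcRaw;
  std::memcpy(dst, src, n * sizeof(float));
}
```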
@@ -88,7 +101,6 @@ GpuReduceKernel(SubExecParam* params)
   SubExecParam& p    = params[blockIdx.x];
   int const numSrcs  = p.numSrcs;
   int const numDsts  = p.numDsts;
-  int const numWaves = BLOCKSIZE / WARP_SIZE;   // Number of wavefronts per threadblock
   int const waveId   = threadIdx.x / WARP_SIZE; // Wavefront number
   int const threadId = threadIdx.x % WARP_SIZE; // Thread index within wavefront
...
@@ -177,11 +189,11 @@ GpuReduceKernel(SubExecParam* params)
     else
     {
       for (int i = 0; i < numSrcs; ++i)
-        val += ((float const* __restrict__)p.src[i])[offset];
+        val += p.src[i][offset];
     }
     for (int i = 0; i < numDsts; ++i)
-      ((float* __restrict__)p.dst[i])[offset] = val;
+      p.dst[i][offset] = val;
   }
 }
...
@@ -197,7 +209,6 @@ template <typename FLOAT_TYPE, int UNROLL_FACTOR>
 __device__ size_t GpuReduceFuncImpl2(SubExecParam const& p, size_t const offset, size_t const N)
 {
   int    constexpr numFloatsPerPack = sizeof(FLOAT_TYPE) / sizeof(float); // Number of floats handled at a time per thread
-  int    constexpr numWaves         = BLOCKSIZE / WARP_SIZE;              // Number of wavefronts per threadblock
   size_t constexpr loopPackInc      = BLOCKSIZE * UNROLL_FACTOR;
   size_t constexpr numPacksPerWave  = WARP_SIZE * UNROLL_FACTOR;
   int const waveId = threadIdx.x / WARP_SIZE; // Wavefront number
...
Makefile
...
@@ -3,7 +3,7 @@ ROCM_PATH ?= /opt/rocm
 HIPCC = $(ROCM_PATH)/bin/hipcc
 EXE   = TransferBench
-CXXFLAGS = -O3 -I. -lnuma -L$(ROCM_PATH)/hsa/lib -lhsa-runtime64 -ferror-limit=5
+CXXFLAGS = -O3 -I. -I$(ROCM_PATH)/hsa/include -lnuma -L$(ROCM_PATH)/hsa/lib -lhsa-runtime64
 all: $(EXE)
...
README.md
...
@@ -13,6 +13,14 @@ TransferBench is a simple utility capable of benchmarking simultaneous copies be
 If ROCm is installed in a folder other than `/opt/rocm/`, set ROCM_PATH appropriately
+## NVIDIA platform support
+TransferBench may also be built to run on NVIDIA platforms via HIP, but requires a HIP-compatible CUDA version installed (e.g. CUDA 11.5)
+To build:
+```
+CUDA_PATH=<path_to_CUDA> HIP_PLATFORM=nvidia make
+```
 ## Hints and suggestions
 - Running TransferBench with no arguments will display usage instructions and detected topology information
...
TransferBench.cpp
...
@@ -291,7 +291,11 @@ void ExecuteTransfers(EnvVars const& ev,
       printf(" DST %0d: %p\n", iDst, transfer.dstMem[iDst]);
   }
   printf("Hit <Enter> to continue: ");
-  scanf("%*c");
+  if (scanf("%*c") != 0)
+  {
+    printf("[ERROR] Unexpected input\n");
+    exit(1);
+  }
   printf("\n");
 }
...
@@ -332,7 +336,11 @@ void ExecuteTransfers(EnvVars const& ev,
 if (verbose && ev.useInteractive)
 {
   printf("Transfers complete. Hit <Enter> to continue: ");
-  scanf("%*c");
+  if (scanf("%*c") != 0)
+  {
+    printf("[ERROR] Unexpected input\n");
+    exit(1);
+  }
   printf("\n");
 }
...
@@ -590,6 +598,7 @@ int RemappedIndex(int const origIdx, bool const isCpuType)
 void DisplayTopology(bool const outputToCsv)
 {
   int numCpuDevices = numa_num_configured_nodes();
   int numGpuDevices;
   HIP_CALL(hipGetDeviceCount(&numGpuDevices));
...
@@ -648,6 +657,7 @@ void DisplayTopology(bool const outputToCsv)
     else
       printf(" %5d | ", numCpus);
+#if !defined(__NVCC__)
     bool isFirst = true;
     for (int j = 0; j < numGpuDevices; j++)
     {
...
@@ -658,10 +668,16 @@ void DisplayTopology(bool const outputToCsv)
         printf("%d", j);
       }
     }
+#endif
     printf("\n");
   }
   printf("\n");
+#if defined(__NVCC__)
+  // No further topology detection done for NVIDIA platforms
+  return;
+#endif
   // Print out detected GPU topology
   if (outputToCsv)
   {
...
@@ -691,8 +707,8 @@ void DisplayTopology(bool const outputToCsv)
     printf("--------------+------+-------------\n");
   }
+#if !defined(__NVCC__)
   char pciBusId[20];
   for (int i = 0; i < numGpuDevices; i++)
   {
     int const deviceIdx = RemappedIndex(i, false);
...
@@ -732,6 +748,7 @@ void DisplayTopology(bool const outputToCsv)
     else
       printf(" %11s | %4d | %d\n", pciBusId, numDeviceCUs, GetClosestNumaNode(deviceIdx));
   }
+#endif
 }
 void ParseMemType(std::string const& token, int const numCpus, int const numGpus,
...
@@ -930,11 +947,20 @@ void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr)
   // Allocate host-pinned memory (should respect NUMA mem policy)
   if (memType == MEM_CPU_FINE)
   {
+#if defined (__NVCC__)
+    printf("[ERROR] Fine-grained CPU memory not supported on NVIDIA platform\n");
+    exit(1);
+#else
     HIP_CALL(hipHostMalloc((void**)memPtr, numBytes, hipHostMallocNumaUser));
+#endif
   }
   else if (memType == MEM_CPU)
   {
+#if defined (__NVCC__)
+    if (hipHostMalloc((void**)memPtr, numBytes, 0) != hipSuccess)
+#else
     if (hipHostMalloc((void**)memPtr, numBytes, hipHostMallocNumaUser | hipHostMallocNonCoherent) != hipSuccess)
+#endif
     {
       printf("[ERROR] Unable to allocate non-coherent host memory on NUMA node %d\n", devIndex);
       exit(1);
...
@@ -960,8 +986,13 @@ void AllocateMemory(MemType memType, int devIndex, size_t numBytes, void** memPtr)
   }
   else if (memType == MEM_GPU_FINE)
   {
+#if defined (__NVCC__)
+    printf("[ERROR] Fine-grained GPU memory not supported on NVIDIA platform\n");
+    exit(1);
+#else
     HIP_CALL(hipSetDevice(devIndex));
     HIP_CALL(hipExtMallocWithFlags((void**)memPtr, numBytes, hipDeviceMallocFinegrained));
+#endif
   }
   else
   {
...
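A caller-side view of what these guards mean (the call below is illustrative; the actual call sites are not part of this diff): on an NVIDIA build only the coarse-grained memory types remain usable, since both fine-grained paths now exit with an error.

```cpp
// Hedged sketch of driving AllocateMemory (call site not shown in this diff).
// MEM_CPU_FINE and MEM_GPU_FINE abort on NVIDIA builds, so a portable setup
// sticks to MEM_CPU / MEM_GPU.
size_t const numBytes = 64 * 1024 * 1024;   // e.g. a 64 MiB test buffer
float* gpuSrc = nullptr;
AllocateMemory(MEM_GPU, 0 /* GPU index */, numBytes, (void**)&gpuSrc);
```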
@@ -1044,13 +1075,18 @@ void RunTransfer(EnvVars const& ev, int const iteration,
   // In single stream mode, all the threadblocks for this GPU are launched
   // Otherwise, just launch the threadblocks associated with this single Transfer
   int const numBlocksToRun = ev.useSingleStream ? exeInfo.totalSubExecs : transfer->numSubExecs;
+#if defined(__NVCC__)
+  HIP_CALL(hipEventRecord(startEvent, stream));
+  GpuKernelTable[ev.gpuKernel]<<<numBlocksToRun, BLOCKSIZE, ev.sharedMemBytes, stream>>>(transfer->subExecParamGpuPtr);
+  HIP_CALL(hipEventRecord(stopEvent, stream));
+#else
   hipExtLaunchKernelGGL(GpuKernelTable[ev.gpuKernel],
                         dim3(numBlocksToRun, 1, 1),
                         dim3(BLOCKSIZE, 1, 1),
                         ev.sharedMemBytes, stream,
                         startEvent, stopEvent,
                         0, transfer->subExecParamGpuPtr);
+#endif
   // Synchronize per iteration, unless in single sync mode, in which case
   // synchronize during last warmup / last actual iteration
   HIP_CALL(hipStreamSynchronize(stream));
...
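On AMD, hipExtLaunchKernelGGL records startEvent and stopEvent as part of the launch; the NVIDIA path has to record them explicitly around a plain triple-chevron launch, which is what the hunk above adds. Either way, the kernel time can be read back the same way once the stream has been synchronized; a hedged sketch of that step (the surrounding timing code is not part of this diff):

```cpp
// Hedged sketch (not from this diff): turning the recorded events into a
// kernel time in milliseconds after the hipStreamSynchronize() shown above.
float gpuDeltaMsec = 0.0f;
HIP_CALL(hipEventElapsedTime(&gpuDeltaMsec, startEvent, stopEvent));
```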
@@ -1228,8 +1264,6 @@ double GetPeakBandwidth(EnvVars const& ev, size_t const N,
   // Skip bidirectional on same device
   if (isBidirectional && srcType == dstType && srcIndex == dstIndex) return 0.0f;
-  int const initOffset = ev.byteOffset / sizeof(float);
   // Prepare Transfers
   std::vector<Transfer> transfers(2);
   transfers[0].numBytes = transfers[1].numBytes = N * sizeof(float);
...
@@ -1265,6 +1299,12 @@ double GetPeakBandwidth(EnvVars const& ev, size_t const N,
   {
     if (transfers[i].exeType == EXE_CPU && ev.numCpusPerNuma[transfers[i].exeIndex] == 0)
       return 0;
+#if defined(__NVCC__)
+    // NVIDIA platform cannot access GPU memory directly from CPU executors
+    if (transfers[i].exeType == EXE_CPU && (IsGpuType(srcType) || IsGpuType(dstType)))
+      return 0;
+#endif
   }
   ExecuteTransfers(ev, 0, N, transfers, false);
...
@@ -1549,6 +1589,9 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int con
   {
     if (exeList[i].second != srcList[j].second)
     {
+#if defined(__NVCC__)
+      isXgmiSrc = false;
+#else
       uint32_t exeToSrcLinkType, exeToSrcHopCount;
       HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(exeList[i].second, false),
                                             RemappedIndex(srcList[j].second, false),
...
@@ -1556,6 +1599,7 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int con
                                             &exeToSrcHopCount));
       isXgmiSrc = (exeToSrcLinkType == HSA_AMD_LINK_INFO_TYPE_XGMI);
       if (isXgmiSrc) numHopsSrc = exeToSrcHopCount;
+#endif
     }
     else
     {
...
@@ -1582,6 +1626,9 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int con
   {
     if (exeList[i].second != dstList[k].second)
     {
+#if defined(__NVCC__)
+      isXgmiSrc = false;
+#else
       uint32_t exeToDstLinkType, exeToDstHopCount;
       HIP_CALL(hipExtGetLinkTypeAndHopCount(RemappedIndex(exeList[i].second, false),
                                             RemappedIndex(dstList[k].second, false),
...
@@ -1589,6 +1636,7 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int con
                                             &exeToDstHopCount));
       isXgmiDst = (exeToDstLinkType == HSA_AMD_LINK_INFO_TYPE_XGMI);
       if (isXgmiDst) numHopsDst = exeToDstHopCount;
+#endif
     }
     else
     {
...
...
@@ -1606,6 +1654,12 @@ void RunSweepPreset(EnvVars const& ev, size_t const numBytesPerTransfer, int con
// Skip this DST if total XGMI distance (SRC + DST) is greater than max limit
// Skip this DST if total XGMI distance (SRC + DST) is greater than max limit
if
(
ev
.
sweepXgmiMax
>=
0
&&
(
numHopsSrc
+
numHopsDst
)
>
ev
.
sweepXgmiMax
)
continue
;
if
(
ev
.
sweepXgmiMax
>=
0
&&
(
numHopsSrc
+
numHopsDst
)
>
ev
.
sweepXgmiMax
)
continue
;
#if defined(__NVCC__)
// Skip CPU executors on GPU memory on NVIDIA platform
if
(
IsCpuType
(
exeList
[
i
].
first
)
&&
(
IsGpuType
(
dstList
[
j
].
first
)
||
IsGpuType
(
dstList
[
k
].
first
)))
continue
;
#endif
tinfo
.
dstType
=
dstList
[
k
].
first
;
tinfo
.
dstType
=
dstList
[
k
].
first
;
tinfo
.
dstIndex
=
dstList
[
k
].
second
;
tinfo
.
dstIndex
=
dstList
[
k
].
second
;
...
...
TransferBench.hpp
...
@@ -31,8 +31,15 @@ THE SOFTWARE.
 #include <map>
 #include <iostream>
 #include <sstream>
-#include <hip/hip_runtime.h>
+#if defined(__NVCC__)
+#include <cuda_runtime.h>
+#define __builtin_amdgcn_s_memrealtime clock64
+#else
 #include <hip/hip_ext.h>
+#endif
+#include <hip/hip_runtime.h>
 #include <hsa/hsa_ext_amd.h>
 // Helper macro for catching HIP errors
...
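The `#define __builtin_amdgcn_s_memrealtime clock64` line maps the AMD GCN wall-clock builtin onto CUDA's clock64(), so device timing code compiles unchanged under nvcc. A small, illustrative wrapper showing the effect (the helper name is hypothetical, not from the source):

```cpp
// Illustrative only: with the #define above this helper builds on both
// platforms; under __NVCC__ the builtin name expands to clock64(). The two
// counters tick at different rates, so only deltas taken on the same device
// are meaningful.
__device__ unsigned long long DeviceTimestamp()
{
  return __builtin_amdgcn_s_memrealtime();
}
```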