Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
one
TransferBench
Commits
08997e88
Unverified
Commit
08997e88
authored
Nov 23, 2023
by
gilbertlee-amd
Committed by
GitHub
Nov 23, 2023
Browse files
v1.36 - Adding experimental USE_XCC_FILTER (#67)
parent
e047656f
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
122 additions
and
14 deletions
+122
-14
CHANGELOG.md
CHANGELOG.md
+7
-0
src/TransferBench.cpp
src/TransferBench.cpp
+14
-3
src/include/EnvVars.hpp
src/include/EnvVars.hpp
+78
-3
src/include/Kernels.hpp
src/include/Kernels.hpp
+23
-8
No files found.
CHANGELOG.md
View file @
08997e88
...
...
@@ -3,6 +3,13 @@
Documentation for TransferBench is available at
[
https://rocm.docs.amd.com/projects/TransferBench
](
https://rocm.docs.amd.com/projects/TransferBench
)
.
## v1.36
### Additions
*
(Experimental) Adding XCC filtering - combined with XCC_PREF_TABLE, this tries to select
specific XCCs to use for specific (SRC->DST) Transfers
## v1.35
### Additions
...
...
src/TransferBench.cpp
View file @
08997e88
...
...
@@ -259,7 +259,7 @@ void ExecuteTransfers(EnvVars const& ev,
int
const
srcIndex
=
RemappedIndex
(
transfer
->
srcIndex
[
iSrc
],
IsCpuType
(
srcType
));
// Ensure executing GPU can access source memory
if
(
IsGpuType
(
exeType
)
==
MEM_GPU
&&
IsGpuType
(
srcType
)
&&
srcIndex
!=
exeIndex
)
if
(
IsGpuType
(
exeType
)
&&
IsGpuType
(
srcType
)
&&
srcIndex
!=
exeIndex
)
EnablePeerAccess
(
exeIndex
,
srcIndex
);
AllocateMemory
(
srcType
,
srcIndex
,
transfer
->
numBytesActual
+
ev
.
byteOffset
,
(
void
**
)
&
transfer
->
srcMem
[
iSrc
]);
...
...
@@ -273,7 +273,7 @@ void ExecuteTransfers(EnvVars const& ev,
int
const
dstIndex
=
RemappedIndex
(
transfer
->
dstIndex
[
iDst
],
IsCpuType
(
dstType
));
// Ensure executing GPU can access destination memory
if
(
IsGpuType
(
exeType
)
==
MEM_GPU
&&
IsGpuType
(
dstType
)
&&
dstIndex
!=
exeIndex
)
if
(
IsGpuType
(
exeType
)
&&
IsGpuType
(
dstType
)
&&
dstIndex
!=
exeIndex
)
EnablePeerAccess
(
exeIndex
,
dstIndex
);
AllocateMemory
(
dstType
,
dstIndex
,
transfer
->
numBytesActual
+
ev
.
byteOffset
,
(
void
**
)
&
transfer
->
dstMem
[
iDst
]);
...
...
@@ -1362,13 +1362,15 @@ void RunTransfer(EnvVars const& ev, int const iteration,
// In single stream mode, all the threadblocks for this GPU are launched
// Otherwise, just launch the threadblocks associated with this single Transfer
int
const
numBlocksToRun
=
ev
.
useSingleStream
?
exeInfo
.
totalSubExecs
:
transfer
->
numSubExecs
;
int
const
numXCCs
=
(
ev
.
useXccFilter
?
ev
.
xccIdsPerDevice
[
exeIndex
].
size
()
:
1
);
#if defined(__NVCC__)
HIP_CALL
(
hipEventRecord
(
startEvent
,
stream
));
GpuKernelTable
[
ev
.
gpuKernel
]
<<<
numBlocksToRun
,
ev
.
blockSize
,
ev
.
sharedMemBytes
,
stream
>>>
(
transfer
->
subExecParamGpuPtr
);
HIP_CALL
(
hipEventRecord
(
stopEvent
,
stream
));
#else
hipExtLaunchKernelGGL
(
GpuKernelTable
[
ev
.
gpuKernel
],
dim3
(
numBlocksToRun
,
1
,
1
),
dim3
(
numXCCs
,
numBlocksToRun
,
1
),
dim3
(
ev
.
blockSize
,
1
,
1
),
ev
.
sharedMemBytes
,
stream
,
startEvent
,
stopEvent
,
...
...
@@ -1999,6 +2001,15 @@ void Transfer::PrepareSubExecParams(EnvVars const& ev)
for
(
int
iDst
=
0
;
iDst
<
this
->
numDsts
;
++
iDst
)
p
.
dst
[
iDst
]
=
this
->
dstMem
[
iDst
]
+
assigned
+
initOffset
;
p
.
preferredXccId
=
-
1
;
if
(
ev
.
useXccFilter
)
{
if
(
this
->
exeType
==
EXE_GPU_GFX
&&
this
->
numDsts
==
1
&&
IsGpuType
(
this
->
dstType
[
0
]))
{
p
.
preferredXccId
=
ev
.
prefXccTable
[
this
->
exeIndex
][
this
->
dstIndex
[
0
]];
}
}
if
(
ev
.
enableDebug
)
{
printf
(
"Transfer %02d SE:%02d: %10lu floats: %10lu to %10lu
\n
"
,
...
...
src/include/EnvVars.hpp
View file @
08997e88
...
...
@@ -29,7 +29,7 @@ THE SOFTWARE.
#include "Compatibility.hpp"
#include "Kernels.hpp"
#define TB_VERSION "1.3
5
"
#define TB_VERSION "1.3
6
"
extern
char
const
MemTypeStr
[];
extern
char
const
ExeTypeStr
[];
...
...
@@ -91,10 +91,12 @@ public:
int
usePcieIndexing
;
// Base GPU indexing on PCIe address instead of HIP device
int
usePrepSrcKernel
;
// Use GPU kernel to prepare source data instead of copy (can't be used with fillPattern)
int
useSingleStream
;
// Use a single stream per GPU GFX executor instead of stream per Transfer
int
useXccFilter
;
// Use XCC filtering (experimental)
int
validateDirect
;
// Validate GPU destination memory directly instead of staging GPU memory on host
std
::
vector
<
float
>
fillPattern
;
// Pattern of floats used to fill source data
std
::
vector
<
uint32_t
>
cuMask
;
// Bit-vector representing the CU mask
std
::
vector
<
std
::
vector
<
int
>>
prefXccTable
;
// Environment variables only for P2P preset
int
numCpuSubExecs
;
// Number of CPU subexecutors to use
...
...
@@ -135,6 +137,8 @@ public:
std
::
vector
<
int
>
wallClockPerDeviceMhz
;
std
::
vector
<
std
::
set
<
int
>>
xccIdsPerDevice
;
// Constructor that collects values
EnvVars
()
{
...
...
@@ -187,6 +191,7 @@ public:
usePcieIndexing
=
GetEnvVar
(
"USE_PCIE_INDEX"
,
0
);
usePrepSrcKernel
=
GetEnvVar
(
"USE_PREP_KERNEL"
,
0
);
useSingleStream
=
GetEnvVar
(
"USE_SINGLE_STREAM"
,
0
);
useXccFilter
=
GetEnvVar
(
"USE_XCC_FILTER"
,
0
);
validateDirect
=
GetEnvVar
(
"VALIDATE_DIRECT"
,
0
);
enableDebug
=
GetEnvVar
(
"DEBUG"
,
0
);
gpuKernel
=
GetEnvVar
(
"GPU_KERNEL"
,
defaultGpuKernel
);
...
...
@@ -327,6 +332,60 @@ public:
#endif
}
// Figure out number of xccs per device
int
maxNumXccs
=
64
;
xccIdsPerDevice
.
resize
(
numGpuDevices
);
for
(
int
i
=
0
;
i
<
numGpuDevices
;
i
++
)
{
int
*
data
;
HIP_CALL
(
hipSetDevice
(
i
));
HIP_CALL
(
hipHostMalloc
((
void
**
)
&
data
,
maxNumXccs
*
sizeof
(
int
)));
CollectXccIdsKernel
<<<
maxNumXccs
,
1
>>>
(
data
);
HIP_CALL
(
hipDeviceSynchronize
());
xccIdsPerDevice
[
i
].
clear
();
for
(
int
j
=
0
;
j
<
maxNumXccs
;
j
++
)
xccIdsPerDevice
[
i
].
insert
(
data
[
j
]);
HIP_CALL
(
hipHostFree
(
data
));
}
// Parse preferred XCC table (if provided)
{
prefXccTable
.
resize
(
numGpuDevices
);
for
(
int
i
=
0
;
i
<
numGpuDevices
;
i
++
)
{
prefXccTable
[
i
].
resize
(
numGpuDevices
,
0
);
}
char
*
prefXccStr
=
getenv
(
"XCC_PREF_TABLE"
);
char
*
token
=
strtok
(
prefXccStr
,
","
);
int
tokenCount
=
0
;
while
(
token
)
{
int
xccId
;
if
(
sscanf
(
token
,
"%d"
,
&
xccId
)
==
1
)
{
int
src
=
tokenCount
/
numGpuDevices
;
int
dst
=
tokenCount
%
numGpuDevices
;
if
(
xccIdsPerDevice
[
src
].
count
(
xccId
)
==
0
)
{
printf
(
"[ERROR] GPU %d does not contain XCC %d
\n
"
,
src
,
xccId
);
exit
(
1
);
}
prefXccTable
[
src
][
dst
]
=
xccId
;
tokenCount
++
;
if
(
tokenCount
==
(
numGpuDevices
*
numGpuDevices
))
break
;
}
else
{
printf
(
"[ERROR] Unrecognized token [%s]
\n
"
,
token
);
exit
(
1
);
}
token
=
strtok
(
NULL
,
","
);
}
}
// Perform some basic validation
if
(
numCpuDevices
>
numDetectedCpus
)
{
...
...
@@ -503,6 +562,7 @@ public:
printf
(
" USE_PCIE_INDEX - Index GPUs by PCIe address-ordering instead of HIP-provided indexing
\n
"
);
printf
(
" USE_PREP_KERNEL - Use GPU kernel to initialize source data array pattern
\n
"
);
printf
(
" USE_SINGLE_STREAM - Use a single stream per GPU GFX executor instead of stream per Transfer
\n
"
);
printf
(
" USE_XCC_FILTER - Use XCC filtering (experimental)
\n
"
);
printf
(
" VALIDATE_DIRECT - Validate GPU destination memory directly instead of staging GPU memory on host
\n
"
);
}
...
...
@@ -519,8 +579,8 @@ public:
if
(
!
outputToCsv
)
{
printf
(
"TransferBench v%s
\n
"
,
TB_VERSION
);
printf
(
"=====================================================
\n
"
);
if
(
!
hideEnv
)
printf
(
"[Common]
\n
"
);
printf
(
"=====================================================
==========
\n
"
);
if
(
!
hideEnv
)
printf
(
"[Common]
(Suppress by setting HIDE_ENV=1)
\n
"
);
}
else
if
(
!
hideEnv
)
printf
(
"EnvVar,Value,Description,(TransferBench v%s)
\n
"
,
TB_VERSION
);
...
...
@@ -566,6 +626,21 @@ public:
std
::
string
(
"Using "
)
+
(
usePrepSrcKernel
?
"GPU kernels"
:
"hipMemcpy"
)
+
" to initialize source data"
);
PRINT_EV
(
"USE_SINGLE_STREAM"
,
useSingleStream
,
std
::
string
(
"Using single stream per "
)
+
(
useSingleStream
?
"device"
:
"Transfer"
));
PRINT_EV
(
"USE_XCC_FILTER"
,
useXccFilter
,
std
::
string
(
"XCC filtering "
)
+
(
useXccFilter
?
"enabled"
:
"disabled"
));
if
(
useXccFilter
)
{
printf
(
"%36s: Preferred XCC Table (XCC_PREF_TABLE)
\n
"
,
""
);
printf
(
"%36s: "
,
""
);
for
(
int
i
=
0
;
i
<
numGpuDevices
;
i
++
)
printf
(
" %3d"
,
i
);
printf
(
" (#XCCs)
\n
"
);
for
(
int
i
=
0
;
i
<
numGpuDevices
;
i
++
)
{
printf
(
"%36s: GPU %3d "
,
""
,
i
);
for
(
int
j
=
0
;
j
<
numGpuDevices
;
j
++
)
printf
(
" %3d"
,
prefXccTable
[
i
][
j
]);
printf
(
" %3lu
\n
"
,
xccIdsPerDevice
[
i
].
size
());
}
}
PRINT_EV
(
"VALIDATE_DIRECT"
,
validateDirect
,
std
::
string
(
"Validate GPU destination memory "
)
+
(
validateDirect
?
"directly"
:
"via CPU staging buffer"
));
printf
(
"
\n
"
);
...
...
src/include/Kernels.hpp
View file @
08997e88
...
...
@@ -40,6 +40,7 @@ struct SubExecParam
int
numDsts
;
// Number of destination arrays
float
*
src
[
MAX_SRCS
];
// Source array pointers
float
*
dst
[
MAX_DSTS
];
// Destination array pointers
uint32_t
preferredXccId
;
// XCC ID to execute on
// Outputs
long
long
startCycle
;
// Start timestamp for in-kernel timing (GPU-GFX executor)
...
...
@@ -59,11 +60,11 @@ struct SubExecParam
// Macro for collecting HW_REG_XCC_ID
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
#define
__trace_xccreg(
) \
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (
p.xccId
));
#define
GetXccId(val
) \
asm volatile ("s_getreg_b32 %0, hwreg(HW_REG_XCC_ID)" : "=s" (
val
));
#else
#define
__trace_xccreg(
) \
p.xccId
= 0
#define
GetXccId(val
) \
val
= 0
#endif
void
CpuReduceKernel
(
SubExecParam
const
&
p
)
...
...
@@ -105,6 +106,13 @@ __host__ __device__ float PrepSrcValue(int srcBufferIdx, size_t idx)
return
(((
idx
%
383
)
*
517
)
%
383
+
31
)
*
(
srcBufferIdx
+
1
);
}
__global__
void
CollectXccIdsKernel
(
int
*
xccIds
)
{
int
xccId
;
GetXccId
(
xccId
);
xccIds
[
blockIdx
.
x
]
=
xccId
;
}
// GPU kernel to prepare src buffer data
__global__
void
PrepSrcDataKernel
(
float
*
ptr
,
size_t
N
,
int
srcBufferIdx
)
...
...
@@ -127,10 +135,17 @@ template <int LOOP1_UNROLL>
__global__
void
__launch_bounds__
(
MAX_BLOCKSIZE
)
GpuReduceKernel
(
SubExecParam
*
params
)
{
int64_t
startCycle
=
wall_clock64
();
int64_t
startCycle
;
if
(
threadIdx
.
x
==
0
)
startCycle
=
wall_clock64
();
SubExecParam
&
p
=
params
[
blockIdx
.
y
];
// Filter by XCC if desired
int
xccId
;
GetXccId
(
xccId
);
if
(
p
.
preferredXccId
!=
-
1
&&
xccId
!=
p
.
preferredXccId
)
return
;
// Operate on wavefront granularity
SubExecParam
&
p
=
params
[
blockIdx
.
x
];
int
const
numSrcs
=
p
.
numSrcs
;
int
const
numDsts
=
p
.
numDsts
;
int
const
waveId
=
threadIdx
.
x
/
WARP_SIZE
;
// Wavefront number
...
...
@@ -234,8 +249,8 @@ GpuReduceKernel(SubExecParam* params)
{
p
.
stopCycle
=
wall_clock64
();
p
.
startCycle
=
startCycle
;
p
.
xccId
=
xccId
;
__trace_hwreg
();
__trace_xccreg
();
}
}
...
...
@@ -369,7 +384,7 @@ __global__ void __launch_bounds__(MAX_BLOCKSIZE)
GpuReduceKernel2
(
SubExecParam
*
params
)
{
int64_t
startCycle
=
wall_clock64
();
SubExecParam
&
p
=
params
[
blockIdx
.
x
];
SubExecParam
&
p
=
params
[
blockIdx
.
y
];
size_t
numFloatsLeft
=
GpuReduceFunc
<
float4
>
(
p
,
0
,
p
.
N
,
8
);
if
(
numFloatsLeft
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment