Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
one
TransferBench
Commits
e7cfab75
Unverified
Commit
e7cfab75
authored
Oct 13, 2023
by
gilbertlee-amd
Committed by
GitHub
Oct 13, 2023
Browse files
Adding BLOCK_SIZE and BLOCK_ORDER (#57)
parent
0b29707e
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
138 additions
and
35 deletions
+138
-35
CHANGELOG.md
CHANGELOG.md
+8
-0
src/TransferBench.cpp
src/TransferBench.cpp
+78
-24
src/include/EnvVars.hpp
src/include/EnvVars.hpp
+40
-3
src/include/Kernels.hpp
src/include/Kernels.hpp
+11
-8
src/include/TransferBench.hpp
src/include/TransferBench.hpp
+1
-0
No files found.
CHANGELOG.md
View file @
e7cfab75
# Changelog for TransferBench
# Changelog for TransferBench
## v1.30
### Added
-
BLOCK_SIZE added to control threadblock size (Must be multiple of 64, up to 512)
-
BLOCK_ORDER added to control how work is ordered for GFX-executors running USE_SINGLE_STREAM=1
-
0 - Threadblocks for Transfers are ordered sequentially (Default)
-
1 - Threadblocks for Transfers are interleaved
-
2 - Threadblocks for Transfers are ordered randomly
## v1.29
## v1.29
### Added
### Added
-
a2a preset config now responds to USE_REMOTE_READ
-
a2a preset config now responds to USE_REMOTE_READ
...
...
src/TransferBench.cpp
View file @
e7cfab75
...
@@ -336,8 +336,6 @@ void ExecuteTransfers(EnvVars const& ev,
...
@@ -336,8 +336,6 @@ void ExecuteTransfers(EnvVars const& ev,
int
const
exeIndex
=
RemappedIndex
(
executor
.
second
,
IsCpuType
(
exeType
));
int
const
exeIndex
=
RemappedIndex
(
executor
.
second
,
IsCpuType
(
exeType
));
exeInfo
.
totalBytes
=
0
;
exeInfo
.
totalBytes
=
0
;
int
transferOffset
=
0
;
for
(
int
i
=
0
;
i
<
exeInfo
.
transfers
.
size
();
++
i
)
for
(
int
i
=
0
;
i
<
exeInfo
.
transfers
.
size
();
++
i
)
{
{
// Prepare subarrays each threadblock works on and fill src memory with patterned data
// Prepare subarrays each threadblock works on and fill src memory with patterned data
...
@@ -345,20 +343,75 @@ void ExecuteTransfers(EnvVars const& ev,
...
@@ -345,20 +343,75 @@ void ExecuteTransfers(EnvVars const& ev,
transfer
->
PrepareSubExecParams
(
ev
);
transfer
->
PrepareSubExecParams
(
ev
);
isSrcCorrect
&=
transfer
->
PrepareSrc
(
ev
);
isSrcCorrect
&=
transfer
->
PrepareSrc
(
ev
);
exeInfo
.
totalBytes
+=
transfer
->
numBytesActual
;
exeInfo
.
totalBytes
+=
transfer
->
numBytesActual
;
}
// Copy block parameters to GPU for GPU executors
// Copy block parameters to GPU for GPU executors
if
(
transfer
->
exeType
==
EXE_GPU_GFX
)
if
(
exeType
==
EXE_GPU_GFX
)
{
{
exeInfo
.
transfers
[
i
]
->
subExecParamGpuPtr
=
exeInfo
.
subExecParamGpu
+
transferOffset
;
std
::
vector
<
SubExecParam
>
tempSubExecParam
;
HIP_CALL
(
hipSetDevice
(
exeIndex
));
HIP_CALL
(
hipMemcpy
(
&
exeInfo
.
subExecParamGpu
[
transferOffset
],
if
(
!
ev
.
useSingleStream
||
(
ev
.
blockOrder
==
ORDER_SEQUENTIAL
))
transfer
->
subExecParam
.
data
(),
{
transfer
->
subExecParam
.
size
()
*
sizeof
(
SubExecParam
),
// Assign Transfers to sequentual threadblocks
hipMemcpyHostToDevice
));
int
transferOffset
=
0
;
HIP_CALL
(
hipDeviceSynchronize
());
for
(
Transfer
*
transfer
:
exeInfo
.
transfers
)
{
transfer
->
subExecParamGpuPtr
=
exeInfo
.
subExecParamGpu
+
transferOffset
;
transfer
->
subExecIdx
.
clear
();
for
(
int
subExecIdx
=
0
;
subExecIdx
<
transfer
->
subExecParam
.
size
();
subExecIdx
++
)
{
transfer
->
subExecIdx
.
push_back
(
transferOffset
+
subExecIdx
);
tempSubExecParam
.
push_back
(
transfer
->
subExecParam
[
subExecIdx
]);
}
transferOffset
+=
transfer
->
numSubExecs
;
}
}
else
if
(
ev
.
blockOrder
==
ORDER_INTERLEAVED
)
{
// Interleave threadblocks of different Transfers
exeInfo
.
transfers
[
0
]
->
subExecParamGpuPtr
=
exeInfo
.
subExecParamGpu
;
for
(
int
subExecIdx
=
0
;
tempSubExecParam
.
size
()
<
exeInfo
.
totalSubExecs
;
++
subExecIdx
)
{
for
(
Transfer
*
transfer
:
exeInfo
.
transfers
)
{
if
(
subExecIdx
<
transfer
->
numSubExecs
)
{
transfer
->
subExecIdx
.
push_back
(
tempSubExecParam
.
size
());
tempSubExecParam
.
push_back
(
transfer
->
subExecParam
[
subExecIdx
]);
}
}
}
}
else
if
(
ev
.
blockOrder
==
ORDER_RANDOM
)
{
std
::
vector
<
std
::
pair
<
int
,
int
>>
indices
;
exeInfo
.
transfers
[
0
]
->
subExecParamGpuPtr
=
exeInfo
.
subExecParamGpu
;
// Build up a list of (transfer,subExecParam) indices, then randomly sort them
for
(
int
i
=
0
;
i
<
exeInfo
.
transfers
.
size
();
i
++
)
{
Transfer
*
transfer
=
exeInfo
.
transfers
[
i
];
for
(
int
subExecIdx
=
0
;
subExecIdx
<
transfer
->
numSubExecs
;
subExecIdx
++
)
indices
.
push_back
(
std
::
make_pair
(
i
,
subExecIdx
));
}
std
::
shuffle
(
indices
.
begin
(),
indices
.
end
(),
*
ev
.
generator
);
transferOffset
+=
transfer
->
subExecParam
.
size
();
// Build randomized threadblock list
for
(
auto
p
:
indices
)
{
Transfer
*
transfer
=
exeInfo
.
transfers
[
p
.
first
];
transfer
->
subExecIdx
.
push_back
(
tempSubExecParam
.
size
());
tempSubExecParam
.
push_back
(
transfer
->
subExecParam
[
p
.
second
]);
}
}
}
HIP_CALL
(
hipSetDevice
(
exeIndex
));
HIP_CALL
(
hipMemcpy
(
exeInfo
.
subExecParamGpu
,
tempSubExecParam
.
data
(),
tempSubExecParam
.
size
()
*
sizeof
(
SubExecParam
),
hipMemcpyDefault
));
HIP_CALL
(
hipDeviceSynchronize
());
}
}
}
}
...
@@ -1281,12 +1334,12 @@ void RunTransfer(EnvVars const& ev, int const iteration,
...
@@ -1281,12 +1334,12 @@ void RunTransfer(EnvVars const& ev, int const iteration,
int
const
numBlocksToRun
=
ev
.
useSingleStream
?
exeInfo
.
totalSubExecs
:
transfer
->
numSubExecs
;
int
const
numBlocksToRun
=
ev
.
useSingleStream
?
exeInfo
.
totalSubExecs
:
transfer
->
numSubExecs
;
#if defined(__NVCC__)
#if defined(__NVCC__)
HIP_CALL
(
hipEventRecord
(
startEvent
,
stream
));
HIP_CALL
(
hipEventRecord
(
startEvent
,
stream
));
GpuKernelTable
[
ev
.
gpuKernel
]
<<<
numBlocksToRun
,
BLOCKSIZE
,
ev
.
sharedMemBytes
,
stream
>>>
(
transfer
->
subExecParamGpuPtr
);
GpuKernelTable
[
ev
.
gpuKernel
]
<<<
numBlocksToRun
,
ev
.
blockSize
,
ev
.
sharedMemBytes
,
stream
>>>
(
transfer
->
subExecParamGpuPtr
);
HIP_CALL
(
hipEventRecord
(
stopEvent
,
stream
));
HIP_CALL
(
hipEventRecord
(
stopEvent
,
stream
));
#else
#else
hipExtLaunchKernelGGL
(
GpuKernelTable
[
ev
.
gpuKernel
],
hipExtLaunchKernelGGL
(
GpuKernelTable
[
ev
.
gpuKernel
],
dim3
(
numBlocksToRun
,
1
,
1
),
dim3
(
numBlocksToRun
,
1
,
1
),
dim3
(
BLOCKSIZE
,
1
,
1
),
dim3
(
ev
.
blockSize
,
1
,
1
),
ev
.
sharedMemBytes
,
stream
,
ev
.
sharedMemBytes
,
stream
,
startEvent
,
stopEvent
,
startEvent
,
stopEvent
,
0
,
transfer
->
subExecParamGpuPtr
);
0
,
transfer
->
subExecParamGpuPtr
);
...
@@ -1306,12 +1359,16 @@ void RunTransfer(EnvVars const& ev, int const iteration,
...
@@ -1306,12 +1359,16 @@ void RunTransfer(EnvVars const& ev, int const iteration,
// Figure out individual timings for Transfers that were all launched together
// Figure out individual timings for Transfers that were all launched together
for
(
Transfer
*
currTransfer
:
exeInfo
.
transfers
)
for
(
Transfer
*
currTransfer
:
exeInfo
.
transfers
)
{
{
long
long
minStartCycle
=
currTransfer
->
subExecParamGpuPtr
[
0
].
startCycle
;
long
long
minStartCycle
=
std
::
numeric_limits
<
long
long
>::
max
();
long
long
maxStopCycle
=
currTransfer
->
subExecParamGpuPtr
[
0
].
stopCycle
;
long
long
maxStopCycle
=
std
::
numeric_limits
<
long
long
>::
min
();
for
(
int
i
=
1
;
i
<
currTransfer
->
numSubExecs
;
i
++
)
std
::
set
<
int
>
CUs
;
for
(
auto
subExecIdx
:
currTransfer
->
subExecIdx
)
{
{
minStartCycle
=
std
::
min
(
minStartCycle
,
currTransfer
->
subExecParamGpuPtr
[
i
].
startCycle
);
minStartCycle
=
std
::
min
(
minStartCycle
,
exeInfo
.
subExecParamGpu
[
subExecIdx
].
startCycle
);
maxStopCycle
=
std
::
max
(
maxStopCycle
,
currTransfer
->
subExecParamGpuPtr
[
i
].
stopCycle
);
maxStopCycle
=
std
::
max
(
maxStopCycle
,
exeInfo
.
subExecParamGpu
[
subExecIdx
].
stopCycle
);
if
(
ev
.
showIterations
)
CUs
.
insert
(
GetId
(
exeInfo
.
subExecParamGpu
[
subExecIdx
].
hwId
));
}
}
int
const
wallClockRate
=
ev
.
wallClockPerDeviceMhz
[
exeIndex
];
int
const
wallClockRate
=
ev
.
wallClockPerDeviceMhz
[
exeIndex
];
double
iterationTimeMs
=
(
maxStopCycle
-
minStartCycle
)
/
(
double
)(
wallClockRate
);
double
iterationTimeMs
=
(
maxStopCycle
-
minStartCycle
)
/
(
double
)(
wallClockRate
);
...
@@ -1319,9 +1376,6 @@ void RunTransfer(EnvVars const& ev, int const iteration,
...
@@ -1319,9 +1376,6 @@ void RunTransfer(EnvVars const& ev, int const iteration,
if
(
ev
.
showIterations
)
if
(
ev
.
showIterations
)
{
{
currTransfer
->
perIterationTime
.
push_back
(
iterationTimeMs
);
currTransfer
->
perIterationTime
.
push_back
(
iterationTimeMs
);
std
::
set
<
int
>
CUs
;
for
(
int
i
=
0
;
i
<
currTransfer
->
numSubExecs
;
i
++
)
CUs
.
insert
(
GetId
(
currTransfer
->
subExecParamGpuPtr
[
i
].
hwId
));
currTransfer
->
perIterationCUs
.
push_back
(
CUs
);
currTransfer
->
perIterationCUs
.
push_back
(
CUs
);
}
}
}
}
...
@@ -1990,7 +2044,7 @@ bool Transfer::PrepareSrc(EnvVars const& ev)
...
@@ -1990,7 +2044,7 @@ bool Transfer::PrepareSrc(EnvVars const& ev)
int
const
deviceIdx
=
RemappedIndex
(
this
->
srcIndex
[
srcIdx
],
false
);
int
const
deviceIdx
=
RemappedIndex
(
this
->
srcIndex
[
srcIdx
],
false
);
HIP_CALL
(
hipSetDevice
(
deviceIdx
));
HIP_CALL
(
hipSetDevice
(
deviceIdx
));
if
(
ev
.
usePrepSrcKernel
)
if
(
ev
.
usePrepSrcKernel
)
PrepSrcDataKernel
<<<
32
,
BLOCKSIZE
>>>
(
srcPtr
,
N
,
srcIdx
);
PrepSrcDataKernel
<<<
32
,
ev
.
blockSize
>>>
(
srcPtr
,
N
,
srcIdx
);
else
else
HIP_CALL
(
hipMemcpy
(
srcPtr
,
reference
.
data
(),
this
->
numBytesActual
,
hipMemcpyDefault
));
HIP_CALL
(
hipMemcpy
(
srcPtr
,
reference
.
data
(),
this
->
numBytesActual
,
hipMemcpyDefault
));
HIP_CALL
(
hipDeviceSynchronize
());
HIP_CALL
(
hipDeviceSynchronize
());
...
...
src/include/EnvVars.hpp
View file @
e7cfab75
...
@@ -29,7 +29,7 @@ THE SOFTWARE.
...
@@ -29,7 +29,7 @@ THE SOFTWARE.
#include "Compatibility.hpp"
#include "Compatibility.hpp"
#include "Kernels.hpp"
#include "Kernels.hpp"
#define TB_VERSION "1.
29
"
#define TB_VERSION "1.
30
"
extern
char
const
MemTypeStr
[];
extern
char
const
MemTypeStr
[];
extern
char
const
ExeTypeStr
[];
extern
char
const
ExeTypeStr
[];
...
@@ -43,6 +43,13 @@ enum ConfigModeEnum
...
@@ -43,6 +43,13 @@ enum ConfigModeEnum
CFG_A2A
=
4
CFG_A2A
=
4
};
};
enum
BlockOrderEnum
{
ORDER_SEQUENTIAL
=
0
,
ORDER_INTERLEAVED
=
1
,
ORDER_RANDOM
=
2
};
// This class manages environment variable that affect TransferBench
// This class manages environment variable that affect TransferBench
class
EnvVars
class
EnvVars
{
{
...
@@ -65,7 +72,9 @@ public:
...
@@ -65,7 +72,9 @@ public:
int
const
DEFAULT_SWEEP_TIME_LIMIT
=
0
;
int
const
DEFAULT_SWEEP_TIME_LIMIT
=
0
;
// Environment variables
// Environment variables
int
blockSize
;
// Size of each threadblock (must be multiple of 64)
int
blockBytes
;
// Each CU, except the last, gets a multiple of this many bytes to copy
int
blockBytes
;
// Each CU, except the last, gets a multiple of this many bytes to copy
int
blockOrder
;
// How blocks are ordered in single-stream mode (0=Sequential, 1=Interleaved, 2=Random)
int
byteOffset
;
// Byte-offset for memory allocations
int
byteOffset
;
// Byte-offset for memory allocations
int
continueOnError
;
// Continue tests even after mismatch detected
int
continueOnError
;
// Continue tests even after mismatch detected
int
hideEnv
;
// Skip printing environment variable
int
hideEnv
;
// Skip printing environment variable
...
@@ -157,7 +166,9 @@ public:
...
@@ -157,7 +166,9 @@ public:
else
if
(
archName
==
"gfx940"
)
defaultGpuKernel
=
6
;
else
if
(
archName
==
"gfx940"
)
defaultGpuKernel
=
6
;
else
if
(
archName
==
"gfx941"
)
defaultGpuKernel
=
6
;
else
if
(
archName
==
"gfx941"
)
defaultGpuKernel
=
6
;
blockSize
=
GetEnvVar
(
"BLOCK_SIZE"
,
256
);
blockBytes
=
GetEnvVar
(
"BLOCK_BYTES"
,
256
);
blockBytes
=
GetEnvVar
(
"BLOCK_BYTES"
,
256
);
blockOrder
=
GetEnvVar
(
"BLOCK_ORDER"
,
0
);
byteOffset
=
GetEnvVar
(
"BYTE_OFFSET"
,
0
);
byteOffset
=
GetEnvVar
(
"BYTE_OFFSET"
,
0
);
continueOnError
=
GetEnvVar
(
"CONTINUE_ON_ERROR"
,
0
);
continueOnError
=
GetEnvVar
(
"CONTINUE_ON_ERROR"
,
0
);
hideEnv
=
GetEnvVar
(
"HIDE_ENV"
,
0
);
hideEnv
=
GetEnvVar
(
"HIDE_ENV"
,
0
);
...
@@ -324,11 +335,26 @@ public:
...
@@ -324,11 +335,26 @@ public:
printf
(
"[ERROR] Number of GPUs to use (%d) cannot exceed number of detected GPUs (%d)
\n
"
,
numGpuDevices
,
numDetectedGpus
);
printf
(
"[ERROR] Number of GPUs to use (%d) cannot exceed number of detected GPUs (%d)
\n
"
,
numGpuDevices
,
numDetectedGpus
);
exit
(
1
);
exit
(
1
);
}
}
if
(
blockSize
%
64
)
{
printf
(
"[ERROR] BLOCK_SIZE (%d) must be a multiple of 64
\n
"
,
blockSize
);
exit
(
1
);
}
if
(
blockSize
>
MAX_BLOCKSIZE
)
{
printf
(
"[ERROR] BLOCK_SIZE (%d) must be less than %d
\n
"
,
blockSize
,
MAX_BLOCKSIZE
);
exit
(
1
);
}
if
(
byteOffset
%
sizeof
(
float
))
if
(
byteOffset
%
sizeof
(
float
))
{
{
printf
(
"[ERROR] BYTE_OFFSET must be set to multiple of %lu
\n
"
,
sizeof
(
float
));
printf
(
"[ERROR] BYTE_OFFSET must be set to multiple of %lu
\n
"
,
sizeof
(
float
));
exit
(
1
);
exit
(
1
);
}
}
if
(
blockOrder
<
0
||
blockOrder
>
2
)
{
printf
(
"[ERROR] BLOCK_ORDER must be 0 (Sequential), 1 (Interleaved), or 2 (Random)
\n
"
);
exit
(
1
);
}
if
(
numWarmups
<
0
)
if
(
numWarmups
<
0
)
{
{
printf
(
"[ERROR] NUM_WARMUPS must be set to a non-negative number
\n
"
);
printf
(
"[ERROR] NUM_WARMUPS must be set to a non-negative number
\n
"
);
...
@@ -349,7 +375,6 @@ public:
...
@@ -349,7 +375,6 @@ public:
printf
(
"[ERROR] BLOCK_BYTES must be a positive multiple of 4
\n
"
);
printf
(
"[ERROR] BLOCK_BYTES must be a positive multiple of 4
\n
"
);
exit
(
1
);
exit
(
1
);
}
}
if
(
numGpuSubExecs
<=
0
)
if
(
numGpuSubExecs
<=
0
)
{
{
printf
(
"[ERROR] NUM_GPU_SE must be greater than 0
\n
"
);
printf
(
"[ERROR] NUM_GPU_SE must be greater than 0
\n
"
);
...
@@ -454,7 +479,9 @@ public:
...
@@ -454,7 +479,9 @@ public:
{
{
printf
(
"Environment variables:
\n
"
);
printf
(
"Environment variables:
\n
"
);
printf
(
"======================
\n
"
);
printf
(
"======================
\n
"
);
printf
(
" BLOCK_BYTES=B - Each CU (except the last) receives a multiple of BLOCK_BYTES to copy
\n
"
);
printf
(
" BLOCK_SIZE - # of threads per threadblock (Must be multiple of 64). Defaults to 256
\n
"
);
printf
(
" BLOCK_BYTES - Each CU (except the last) receives a multiple of BLOCK_BYTES to copy
\n
"
);
printf
(
" BLOCK_ORDER - Threadblock ordering in single-stream mode (0=Serial, 1=Interleaved, 2=Random)
\n
"
);
printf
(
" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4. Defaults to 0
\n
"
);
printf
(
" BYTE_OFFSET - Initial byte-offset for memory allocations. Must be multiple of 4. Defaults to 0
\n
"
);
printf
(
" CONTINUE_ON_ERROR - Continue tests even after mismatch detected
\n
"
);
printf
(
" CONTINUE_ON_ERROR - Continue tests even after mismatch detected
\n
"
);
printf
(
" CU_MASK - CU mask for streams specified in hex digits (0-0,a-f,A-F)
\n
"
);
printf
(
" CU_MASK - CU mask for streams specified in hex digits (0-0,a-f,A-F)
\n
"
);
...
@@ -495,8 +522,14 @@ public:
...
@@ -495,8 +522,14 @@ public:
printf
(
"EnvVar,Value,Description,(TransferBench v%s)
\n
"
,
TB_VERSION
);
printf
(
"EnvVar,Value,Description,(TransferBench v%s)
\n
"
,
TB_VERSION
);
if
(
hideEnv
)
return
;
if
(
hideEnv
)
return
;
PRINT_EV
(
"BLOCK_SIZE"
,
blockSize
,
std
::
string
(
"Threadblock size of "
+
std
::
to_string
(
blockSize
)));
PRINT_EV
(
"BLOCK_BYTES"
,
blockBytes
,
PRINT_EV
(
"BLOCK_BYTES"
,
blockBytes
,
std
::
string
(
"Each CU gets a multiple of "
+
std
::
to_string
(
blockBytes
)
+
" bytes to copy"
));
std
::
string
(
"Each CU gets a multiple of "
+
std
::
to_string
(
blockBytes
)
+
" bytes to copy"
));
PRINT_EV
(
"BLOCK_ORDER"
,
blockOrder
,
std
::
string
(
"Transfer blocks order: "
+
std
::
string
((
blockOrder
==
0
?
"Sequential"
:
blockOrder
==
1
?
"Interleaved"
:
"Random"
))));
PRINT_EV
(
"BYTE_OFFSET"
,
byteOffset
,
PRINT_EV
(
"BYTE_OFFSET"
,
byteOffset
,
std
::
string
(
"Using byte offset of "
+
std
::
to_string
(
byteOffset
)));
std
::
string
(
"Using byte offset of "
+
std
::
to_string
(
byteOffset
)));
PRINT_EV
(
"CONTINUE_ON_ERROR"
,
continueOnError
,
PRINT_EV
(
"CONTINUE_ON_ERROR"
,
continueOnError
,
...
@@ -531,12 +564,16 @@ public:
...
@@ -531,12 +564,16 @@ public:
PRINT_EV
(
"VALIDATE_DIRECT"
,
validateDirect
,
PRINT_EV
(
"VALIDATE_DIRECT"
,
validateDirect
,
std
::
string
(
"Validate GPU destination memory "
)
+
(
validateDirect
?
"directly"
:
"via CPU staging buffer"
));
std
::
string
(
"Validate GPU destination memory "
)
+
(
validateDirect
?
"directly"
:
"via CPU staging buffer"
));
printf
(
"
\n
"
);
printf
(
"
\n
"
);
if
(
blockOrder
!=
ORDER_SEQUENTIAL
&&
!
useSingleStream
)
printf
(
"[WARN] BLOCK_ORDER is ignored if USE_SINGLE_STREAM is not enabled
\n
"
);
};
};
// Display env var for P2P Benchmark preset
// Display env var for P2P Benchmark preset
void
DisplayP2PBenchmarkEnvVars
()
const
void
DisplayP2PBenchmarkEnvVars
()
const
{
{
DisplayEnvVars
();
DisplayEnvVars
();
if
(
hideEnv
)
return
;
if
(
hideEnv
)
return
;
if
(
!
outputToCsv
)
if
(
!
outputToCsv
)
...
...
src/include/Kernels.hpp
View file @
e7cfab75
...
@@ -24,7 +24,7 @@ THE SOFTWARE.
...
@@ -24,7 +24,7 @@ THE SOFTWARE.
#define PackedFloat_t float4
#define PackedFloat_t float4
#define WARP_SIZE 64
#define WARP_SIZE 64
#define BLOCKSIZE
256
#define
MAX_
BLOCKSIZE
512
#define FLOATS_PER_PACK (sizeof(PackedFloat_t) / sizeof(float))
#define FLOATS_PER_PACK (sizeof(PackedFloat_t) / sizeof(float))
#define MEMSET_CHAR 75
#define MEMSET_CHAR 75
#define MEMSET_VAL 13323083.0f
#define MEMSET_VAL 13323083.0f
...
@@ -34,11 +34,14 @@ THE SOFTWARE.
...
@@ -34,11 +34,14 @@ THE SOFTWARE.
#define MAX_DSTS 16
#define MAX_DSTS 16
struct
SubExecParam
struct
SubExecParam
{
{
// Inputs
size_t
N
;
// Number of floats this subExecutor works on
size_t
N
;
// Number of floats this subExecutor works on
int
numSrcs
;
// Number of source arrays
int
numSrcs
;
// Number of source arrays
int
numDsts
;
// Number of destination arrays
int
numDsts
;
// Number of destination arrays
float
*
src
[
MAX_SRCS
];
// Source array pointers
float
*
src
[
MAX_SRCS
];
// Source array pointers
float
*
dst
[
MAX_DSTS
];
// Destination array pointers
float
*
dst
[
MAX_DSTS
];
// Destination array pointers
// Outputs
long
long
startCycle
;
// Start timestamp for in-kernel timing (GPU-GFX executor)
long
long
startCycle
;
// Start timestamp for in-kernel timing (GPU-GFX executor)
long
long
stopCycle
;
// Stop timestamp for in-kernel timing (GPU-GFX executor)
long
long
stopCycle
;
// Stop timestamp for in-kernel timing (GPU-GFX executor)
uint32_t
hwId
;
// Hardware ID
uint32_t
hwId
;
// Hardware ID
...
@@ -111,7 +114,7 @@ template <> __device__ __forceinline__ float4 MemsetVal(){ return make
...
@@ -111,7 +114,7 @@ template <> __device__ __forceinline__ float4 MemsetVal(){ return make
// GPU copy kernel 0: 3 loops: unroll float 4, float4s, floats
// GPU copy kernel 0: 3 loops: unroll float 4, float4s, floats
template
<
int
LOOP1_UNROLL
>
template
<
int
LOOP1_UNROLL
>
__global__
void
__launch_bounds__
(
BLOCKSIZE
)
__global__
void
__launch_bounds__
(
MAX_
BLOCKSIZE
)
GpuReduceKernel
(
SubExecParam
*
params
)
GpuReduceKernel
(
SubExecParam
*
params
)
{
{
int64_t
startCycle
=
wall_clock64
();
int64_t
startCycle
=
wall_clock64
();
...
@@ -128,7 +131,7 @@ GpuReduceKernel(SubExecParam* params)
...
@@ -128,7 +131,7 @@ GpuReduceKernel(SubExecParam* params)
size_t
Nrem
=
p
.
N
;
size_t
Nrem
=
p
.
N
;
size_t
const
loop1Npack
=
(
Nrem
/
(
FLOATS_PER_PACK
*
LOOP1_UNROLL
*
WARP_SIZE
))
*
(
LOOP1_UNROLL
*
WARP_SIZE
);
size_t
const
loop1Npack
=
(
Nrem
/
(
FLOATS_PER_PACK
*
LOOP1_UNROLL
*
WARP_SIZE
))
*
(
LOOP1_UNROLL
*
WARP_SIZE
);
size_t
const
loop1Nelem
=
loop1Npack
*
FLOATS_PER_PACK
;
size_t
const
loop1Nelem
=
loop1Npack
*
FLOATS_PER_PACK
;
size_t
const
loop1Inc
=
BLOCKSIZE
*
LOOP1_UNROLL
;
size_t
const
loop1Inc
=
blockDim
.
x
*
LOOP1_UNROLL
;
size_t
loop1Offset
=
waveId
*
LOOP1_UNROLL
*
WARP_SIZE
+
threadId
;
size_t
loop1Offset
=
waveId
*
LOOP1_UNROLL
*
WARP_SIZE
+
threadId
;
while
(
loop1Offset
<
loop1Npack
)
while
(
loop1Offset
<
loop1Npack
)
...
@@ -167,7 +170,7 @@ GpuReduceKernel(SubExecParam* params)
...
@@ -167,7 +170,7 @@ GpuReduceKernel(SubExecParam* params)
// NOTE: Using int32_t due to smaller size requirements
// NOTE: Using int32_t due to smaller size requirements
int32_t
const
loop2Npack
=
Nrem
/
FLOATS_PER_PACK
;
int32_t
const
loop2Npack
=
Nrem
/
FLOATS_PER_PACK
;
int32_t
const
loop2Nelem
=
loop2Npack
*
FLOATS_PER_PACK
;
int32_t
const
loop2Nelem
=
loop2Npack
*
FLOATS_PER_PACK
;
int32_t
const
loop2Inc
=
BLOCKSIZE
;
int32_t
const
loop2Inc
=
blockDim
.
x
;
int32_t
loop2Offset
=
threadIdx
.
x
;
int32_t
loop2Offset
=
threadIdx
.
x
;
while
(
loop2Offset
<
loop2Npack
)
while
(
loop2Offset
<
loop2Npack
)
...
@@ -229,7 +232,7 @@ template <typename FLOAT_TYPE, int UNROLL_FACTOR>
...
@@ -229,7 +232,7 @@ template <typename FLOAT_TYPE, int UNROLL_FACTOR>
__device__
size_t
GpuReduceFuncImpl2
(
SubExecParam
const
&
p
,
size_t
const
offset
,
size_t
const
N
)
__device__
size_t
GpuReduceFuncImpl2
(
SubExecParam
const
&
p
,
size_t
const
offset
,
size_t
const
N
)
{
{
int
constexpr
numFloatsPerPack
=
sizeof
(
FLOAT_TYPE
)
/
sizeof
(
float
);
// Number of floats handled at a time per thread
int
constexpr
numFloatsPerPack
=
sizeof
(
FLOAT_TYPE
)
/
sizeof
(
float
);
// Number of floats handled at a time per thread
size_t
constexpr
loopPackInc
=
BLOCKSIZE
*
UNROLL_FACTOR
;
size_t
constexpr
loopPackInc
=
blockDim
.
x
*
UNROLL_FACTOR
;
size_t
constexpr
numPacksPerWave
=
WARP_SIZE
*
UNROLL_FACTOR
;
size_t
constexpr
numPacksPerWave
=
WARP_SIZE
*
UNROLL_FACTOR
;
int
const
waveId
=
threadIdx
.
x
/
WARP_SIZE
;
// Wavefront number
int
const
waveId
=
threadIdx
.
x
/
WARP_SIZE
;
// Wavefront number
int
const
threadId
=
threadIdx
.
x
%
WARP_SIZE
;
// Thread index within wavefront
int
const
threadId
=
threadIdx
.
x
%
WARP_SIZE
;
// Thread index within wavefront
...
@@ -283,8 +286,8 @@ __device__ size_t GpuReduceFuncImpl(SubExecParam const &p, size_t const offset,
...
@@ -283,8 +286,8 @@ __device__ size_t GpuReduceFuncImpl(SubExecParam const &p, size_t const offset,
{
{
// Each thread in the block works on UNROLL_FACTOR FLOAT_TYPEs during each iteration of the loop
// Each thread in the block works on UNROLL_FACTOR FLOAT_TYPEs during each iteration of the loop
int
constexpr
numFloatsPerRead
=
sizeof
(
FLOAT_TYPE
)
/
sizeof
(
float
);
int
constexpr
numFloatsPerRead
=
sizeof
(
FLOAT_TYPE
)
/
sizeof
(
float
);
size_t
const
expr
numFloatsPerInnerLoop
=
BLOCKSIZE
*
numFloatsPerRead
;
size_t
const
numFloatsPerInnerLoop
=
blockDim
.
x
*
numFloatsPerRead
;
size_t
const
expr
numFloatsPerOuterLoop
=
numFloatsPerInnerLoop
*
UNROLL_FACTOR
;
size_t
const
numFloatsPerOuterLoop
=
numFloatsPerInnerLoop
*
UNROLL_FACTOR
;
size_t
const
numFloatsLeft
=
(
numFloatsPerRead
==
1
&&
UNROLL_FACTOR
==
1
)
?
0
:
N
%
numFloatsPerOuterLoop
;
size_t
const
numFloatsLeft
=
(
numFloatsPerRead
==
1
&&
UNROLL_FACTOR
==
1
)
?
0
:
N
%
numFloatsPerOuterLoop
;
size_t
const
numFloatsDone
=
N
-
numFloatsLeft
;
size_t
const
numFloatsDone
=
N
-
numFloatsLeft
;
int
const
numSrcs
=
p
.
numSrcs
;
int
const
numSrcs
=
p
.
numSrcs
;
...
@@ -351,7 +354,7 @@ __device__ size_t GpuReduceFunc(SubExecParam const &p, size_t const offset, size
...
@@ -351,7 +354,7 @@ __device__ size_t GpuReduceFunc(SubExecParam const &p, size_t const offset, size
}
}
// GPU copy kernel
// GPU copy kernel
__global__
void
__launch_bounds__
(
BLOCKSIZE
)
__global__
void
__launch_bounds__
(
MAX_
BLOCKSIZE
)
GpuReduceKernel2
(
SubExecParam
*
params
)
GpuReduceKernel2
(
SubExecParam
*
params
)
{
{
int64_t
startCycle
=
wall_clock64
();
int64_t
startCycle
=
wall_clock64
();
...
...
src/include/TransferBench.hpp
View file @
e7cfab75
...
@@ -118,6 +118,7 @@ struct Transfer
...
@@ -118,6 +118,7 @@ struct Transfer
std
::
vector
<
SubExecParam
>
subExecParam
;
// Defines subarrays assigned to each threadblock
std
::
vector
<
SubExecParam
>
subExecParam
;
// Defines subarrays assigned to each threadblock
SubExecParam
*
subExecParamGpuPtr
;
// Pointer to GPU copy of subExecParam
SubExecParam
*
subExecParamGpuPtr
;
// Pointer to GPU copy of subExecParam
std
::
vector
<
int
>
subExecIdx
;
// Indicies into subExecParamGpu
std
::
vector
<
double
>
perIterationTime
;
// Per-iteration timing
std
::
vector
<
double
>
perIterationTime
;
// Per-iteration timing
std
::
vector
<
std
::
set
<
int
>>
perIterationCUs
;
// Per-iteration CU usage
std
::
vector
<
std
::
set
<
int
>>
perIterationCUs
;
// Per-iteration CU usage
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment