Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
one
TransferBench
Commits
47480a59
Unverified
Commit
47480a59
authored
Jan 09, 2024
by
gilbertlee-amd
Committed by
GitHub
Jan 09, 2024
Browse files
v1.47 Fixing CUDA compilation (#83)
parent
e9f51f2b
Changes
5
Show whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
35 additions
and
15 deletions
+35
-15
CHANGELOG.md
CHANGELOG.md
+5
-0
src/Makefile
src/Makefile
+1
-1
src/TransferBench.cpp
src/TransferBench.cpp
+7
-7
src/include/EnvVars.hpp
src/include/EnvVars.hpp
+2
-3
src/include/Kernels.hpp
src/include/Kernels.hpp
+20
-4
No files found.
CHANGELOG.md
View file @
47480a59
...
@@ -3,6 +3,11 @@
...
@@ -3,6 +3,11 @@
Documentation for TransferBench is available at
Documentation for TransferBench is available at
[
https://rocm.docs.amd.com/projects/TransferBench
](
https://rocm.docs.amd.com/projects/TransferBench
)
.
[
https://rocm.docs.amd.com/projects/TransferBench
](
https://rocm.docs.amd.com/projects/TransferBench
)
.
## v1.47
### Fixes
*
Fixing CUDA support
## v1.46
## v1.46
### Fixes
### Fixes
...
...
src/Makefile
View file @
47480a59
...
@@ -13,7 +13,7 @@ else
...
@@ -13,7 +13,7 @@ else
endif
endif
CXXFLAGS
=
-O3
-Iinclude
-I
$(ROCM_PATH)
/include
-lnuma
-L
$(ROCM_PATH)
/lib
-lhsa-runtime64
CXXFLAGS
=
-O3
-Iinclude
-I
$(ROCM_PATH)
/include
-lnuma
-L
$(ROCM_PATH)
/lib
-lhsa-runtime64
NVFLAGS
=
-O3
-g
-Iinclude
-x
cu
-lnuma
-gencode
=
arch
=
compute_80,code
=
sm_80
-gencode
=
arch
=
compute_75,code
=
sm_75
NVFLAGS
=
-O3
-Iinclude
-x
cu
-lnuma
-gencode
=
arch
=
compute_80,code
=
sm_80
-gencode
=
arch
=
compute_75,code
=
sm_75
LDFLAGS
+=
-lpthread
LDFLAGS
+=
-lpthread
all
:
$(EXE)
all
:
$(EXE)
...
...
src/TransferBench.cpp
View file @
47480a59
...
@@ -1459,16 +1459,17 @@ void RunTransfer(EnvVars const& ev, int const iteration,
...
@@ -1459,16 +1459,17 @@ void RunTransfer(EnvVars const& ev, int const iteration,
// Otherwise, just launch the threadblocks associated with this single Transfer
// Otherwise, just launch the threadblocks associated with this single Transfer
int
const
numBlocksToRun
=
ev
.
useSingleStream
?
exeInfo
.
totalSubExecs
:
transfer
->
numSubExecs
;
int
const
numBlocksToRun
=
ev
.
useSingleStream
?
exeInfo
.
totalSubExecs
:
transfer
->
numSubExecs
;
int
const
numXCCs
=
(
ev
.
useXccFilter
?
ev
.
xccIdsPerDevice
[
exeIndex
].
size
()
:
1
);
int
const
numXCCs
=
(
ev
.
useXccFilter
?
ev
.
xccIdsPerDevice
[
exeIndex
].
size
()
:
1
);
dim3
const
gridSize
(
numXCCs
,
numBlocksToRun
,
1
);
dim3
const
blockSize
(
ev
.
gfxBlockSize
,
1
,
1
);
#if defined(__NVCC__)
#if defined(__NVCC__)
HIP_CALL
(
hipEventRecord
(
startEvent
,
stream
));
HIP_CALL
(
hipEventRecord
(
startEvent
,
stream
));
GpuKernelTable
[
ev
.
gfxBlockSize
/
warpSize
-
1
][
ev
.
gfxUnroll
-
1
]
GpuKernelTable
[
ev
.
gfxBlockSize
/
64
-
1
][
ev
.
gfxUnroll
-
1
]
<<<
numBlocksToRun
,
ev
.
gfxB
lockSize
,
ev
.
sharedMemBytes
,
stream
>>>
(
transfer
->
subExecParamGpuPtr
,
ev
.
w
aveOrder
);
<<<
gridSize
,
b
lockSize
,
ev
.
sharedMemBytes
,
stream
>>>
(
transfer
->
subExecParamGpuPtr
,
ev
.
gfxW
aveOrder
);
HIP_CALL
(
hipEventRecord
(
stopEvent
,
stream
));
HIP_CALL
(
hipEventRecord
(
stopEvent
,
stream
));
#else
#else
hipExtLaunchKernelGGL
(
GpuKernelTable
[
ev
.
gfxBlockSize
/
warpSize
-
1
][
ev
.
gfxUnroll
-
1
],
hipExtLaunchKernelGGL
(
GpuKernelTable
[
ev
.
gfxBlockSize
/
64
-
1
][
ev
.
gfxUnroll
-
1
],
dim3
(
numXCCs
,
numBlocksToRun
,
1
),
gridSize
,
blockSize
,
dim3
(
ev
.
gfxBlockSize
,
1
,
1
),
ev
.
sharedMemBytes
,
stream
,
ev
.
sharedMemBytes
,
stream
,
startEvent
,
stopEvent
,
startEvent
,
stopEvent
,
0
,
transfer
->
subExecParamGpuPtr
,
ev
.
gfxWaveOrder
);
0
,
transfer
->
subExecParamGpuPtr
,
ev
.
gfxWaveOrder
);
...
@@ -1994,9 +1995,9 @@ void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, i
...
@@ -1994,9 +1995,9 @@ void RunAllToAllBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, i
if
(
ev
.
a2aDirect
)
if
(
ev
.
a2aDirect
)
{
{
#if !defined(__NVCC__)
if
(
i
==
j
)
continue
;
if
(
i
==
j
)
continue
;
#if !defined(__NVCC__)
uint32_t
linkType
,
hopCount
;
uint32_t
linkType
,
hopCount
;
HIP_CALL
(
hipExtGetLinkTypeAndHopCount
(
RemappedIndex
(
i
,
false
),
HIP_CALL
(
hipExtGetLinkTypeAndHopCount
(
RemappedIndex
(
i
,
false
),
RemappedIndex
(
j
,
false
),
RemappedIndex
(
j
,
false
),
...
@@ -2460,7 +2461,6 @@ void RunSchmooBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int
...
@@ -2460,7 +2461,6 @@ void RunSchmooBenchmark(EnvVars const& ev, size_t const numBytesPerTransfer, int
void
RunRemoteWriteBenchmark
(
EnvVars
const
&
ev
,
size_t
const
numBytesPerTransfer
,
int
numSubExecs
,
int
const
srcIdx
,
int
minGpus
,
int
maxGpus
)
void
RunRemoteWriteBenchmark
(
EnvVars
const
&
ev
,
size_t
const
numBytesPerTransfer
,
int
numSubExecs
,
int
const
srcIdx
,
int
minGpus
,
int
maxGpus
)
{
{
char
memType
=
ev
.
useFineGrain
?
'F'
:
'G'
;
printf
(
"Bytes to write: %lu from GPU %d using %d CUs [Sweeping %d to %d parallel writes]
\n
"
,
numBytesPerTransfer
,
srcIdx
,
numSubExecs
,
minGpus
,
maxGpus
);
printf
(
"Bytes to write: %lu from GPU %d using %d CUs [Sweeping %d to %d parallel writes]
\n
"
,
numBytesPerTransfer
,
srcIdx
,
numSubExecs
,
minGpus
,
maxGpus
);
char
sep
=
(
ev
.
outputToCsv
?
','
:
' '
);
char
sep
=
(
ev
.
outputToCsv
?
','
:
' '
);
...
...
src/include/EnvVars.hpp
View file @
47480a59
...
@@ -29,7 +29,7 @@ THE SOFTWARE.
...
@@ -29,7 +29,7 @@ THE SOFTWARE.
#include "Compatibility.hpp"
#include "Compatibility.hpp"
#include "Kernels.hpp"
#include "Kernels.hpp"
#define TB_VERSION "1.4
6
"
#define TB_VERSION "1.4
7
"
extern
char
const
MemTypeStr
[];
extern
char
const
MemTypeStr
[];
extern
char
const
ExeTypeStr
[];
extern
char
const
ExeTypeStr
[];
...
@@ -530,8 +530,7 @@ public:
...
@@ -530,8 +530,7 @@ public:
for
(
int
i
=
0
;
i
<
numDetectedGpus
;
i
++
)
for
(
int
i
=
0
;
i
<
numDetectedGpus
;
i
++
)
{
{
#if defined(__NVCC__)
#if defined(__NVCC__)
// NOTE: wallClock doesn't exist in CUDA. This may need to be adjusted / run with fixed clocks
wallClockPerDeviceMhz
[
i
]
=
1000000
;
wallClockPerDeviceMhz
[
i
]
=
1410000
;
#else
#else
hipDeviceProp_t
prop
;
hipDeviceProp_t
prop
;
HIP_CALL
(
hipGetDeviceProperties
(
&
prop
,
i
));
HIP_CALL
(
hipGetDeviceProperties
(
&
prop
,
i
));
...
...
src/include/Kernels.hpp
View file @
47480a59
...
@@ -29,6 +29,10 @@ THE SOFTWARE.
...
@@ -29,6 +29,10 @@ THE SOFTWARE.
#define MEMSET_VAL 13323083.0f
#define MEMSET_VAL 13323083.0f
#if defined(__NVCC__)
#define warpSize 32
#endif
#define MAX_WAVEGROUPS MAX_BLOCKSIZE / warpSize
#define MAX_WAVEGROUPS MAX_BLOCKSIZE / warpSize
#define MAX_UNROLL 8
#define MAX_UNROLL 8
#define NUM_WAVEORDERS 6
#define NUM_WAVEORDERS 6
...
@@ -44,7 +48,7 @@ struct SubExecParam
...
@@ -44,7 +48,7 @@ struct SubExecParam
int
numDsts
;
// Number of destination arrays
int
numDsts
;
// Number of destination arrays
float
*
src
[
MAX_SRCS
];
// Source array pointers
float
*
src
[
MAX_SRCS
];
// Source array pointers
float
*
dst
[
MAX_DSTS
];
// Destination array pointers
float
*
dst
[
MAX_DSTS
];
// Destination array pointers
u
int32_t
preferredXccId
;
// XCC ID to execute on
int32_t
preferredXccId
;
// XCC ID to execute on
// Prepared
// Prepared
int
teamSize
;
// Index of this sub executor amongst team
int
teamSize
;
// Index of this sub executor amongst team
...
@@ -133,6 +137,17 @@ PrepSrcDataKernel(float* ptr, size_t N, int srcBufferIdx)
...
@@ -133,6 +137,17 @@ PrepSrcDataKernel(float* ptr, size_t N, int srcBufferIdx)
}
}
}
}
__device__
int64_t
GetTimestamp
()
{
#if defined(__NVCC__)
int64_t
result
;
asm
volatile
(
"mov.u64 %0, %%globaltimer;"
:
"=l"
(
result
));
return
result
;
#else
return
wall_clock64
();
#endif
}
// Helper function for memset
// Helper function for memset
template
<
typename
T
>
__device__
__forceinline__
T
MemsetVal
();
template
<
typename
T
>
__device__
__forceinline__
T
MemsetVal
();
template
<
>
__device__
__forceinline__
float
MemsetVal
(){
return
MEMSET_VAL
;
};
template
<
>
__device__
__forceinline__
float
MemsetVal
(){
return
MEMSET_VAL
;
};
...
@@ -143,14 +158,16 @@ __global__ void __launch_bounds__(BLOCKSIZE)
...
@@ -143,14 +158,16 @@ __global__ void __launch_bounds__(BLOCKSIZE)
GpuReduceKernel
(
SubExecParam
*
params
,
int
waveOrder
)
GpuReduceKernel
(
SubExecParam
*
params
,
int
waveOrder
)
{
{
int64_t
startCycle
;
int64_t
startCycle
;
if
(
threadIdx
.
x
==
0
)
startCycle
=
wall_clock64
();
if
(
threadIdx
.
x
==
0
)
startCycle
=
GetTimestamp
();
SubExecParam
&
p
=
params
[
blockIdx
.
y
];
SubExecParam
&
p
=
params
[
blockIdx
.
y
];
// (Experimental) Filter by XCC if desired
// (Experimental) Filter by XCC if desired
#if !defined(__NVCC__)
int32_t
xccId
;
int32_t
xccId
;
GetXccId
(
xccId
);
GetXccId
(
xccId
);
if
(
p
.
preferredXccId
!=
-
1
&&
xccId
!=
p
.
preferredXccId
)
return
;
if
(
p
.
preferredXccId
!=
-
1
&&
xccId
!=
p
.
preferredXccId
)
return
;
#endif
// Collect data information
// Collect data information
int32_t
const
numSrcs
=
p
.
numSrcs
;
int32_t
const
numSrcs
=
p
.
numSrcs
;
...
@@ -168,7 +185,6 @@ __global__ void __launch_bounds__(BLOCKSIZE)
...
@@ -168,7 +185,6 @@ __global__ void __launch_bounds__(BLOCKSIZE)
int32_t
const
tIdx
=
threadIdx
.
x
%
warpSize
;
// Thread index within wavefront
int32_t
const
tIdx
=
threadIdx
.
x
%
warpSize
;
// Thread index within wavefront
size_t
const
numFloat4
=
p
.
N
/
4
;
size_t
const
numFloat4
=
p
.
N
/
4
;
int32_t
const
nFlt4PerWave
=
warpSize
*
4
;
int32_t
teamStride
,
waveStride
,
unrlStride
,
teamStride2
,
waveStride2
;
int32_t
teamStride
,
waveStride
,
unrlStride
,
teamStride2
,
waveStride2
;
switch
(
waveOrder
)
switch
(
waveOrder
)
...
@@ -266,7 +282,7 @@ __global__ void __launch_bounds__(BLOCKSIZE)
...
@@ -266,7 +282,7 @@ __global__ void __launch_bounds__(BLOCKSIZE)
if
(
threadIdx
.
x
==
0
)
if
(
threadIdx
.
x
==
0
)
{
{
__threadfence_system
();
__threadfence_system
();
p
.
stopCycle
=
wall_clock64
();
p
.
stopCycle
=
GetTimestamp
();
p
.
startCycle
=
startCycle
;
p
.
startCycle
=
startCycle
;
p
.
xccId
=
xccId
;
p
.
xccId
=
xccId
;
__trace_hwreg
();
__trace_hwreg
();
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment