Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
one
TransferBench
Commits
9f68d14d
Unverified
Commit
9f68d14d
authored
Nov 22, 2024
by
gilbertlee-amd
Committed by
GitHub
Nov 22, 2024
Browse files
Fixing compile error on CentOS and multi-stream GFX mode (#140)
parent
02ce785c
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
47 additions
and
33 deletions
+47
-33
CHANGELOG.md
CHANGELOG.md
+5
-0
CMakeLists.txt
CMakeLists.txt
+1
-1
Makefile
Makefile
+1
-1
src/client/Client.hpp
src/client/Client.hpp
+1
-1
src/header/TransferBench.hpp
src/header/TransferBench.hpp
+39
-30
No files found.
CHANGELOG.md
View file @
9f68d14d
...
@@ -3,6 +3,11 @@
...
@@ -3,6 +3,11 @@
Documentation for TransferBench is available at
Documentation for TransferBench is available at
[
https://rocm.docs.amd.com/projects/TransferBench
](
https://rocm.docs.amd.com/projects/TransferBench
)
.
[
https://rocm.docs.amd.com/projects/TransferBench
](
https://rocm.docs.amd.com/projects/TransferBench
)
.
## v1.55
### Fixed
-
Fixed missing header error when compiling on CentOS
-
Fixed issues when using multi-stream mode for GFX executor
## v1.54
## v1.54
### Modified
### Modified
-
Refactored TransferBench into a header-only library combined with a thin client to facilitate the
-
Refactored TransferBench into a header-only library combined with a thin client to facilitate the
...
...
CMakeLists.txt
View file @
9f68d14d
...
@@ -7,7 +7,7 @@ else()
...
@@ -7,7 +7,7 @@ else()
endif
()
endif
()
cmake_minimum_required
(
VERSION 3.5
)
cmake_minimum_required
(
VERSION 3.5
)
project
(
TransferBench VERSION 1.5
4
.0 LANGUAGES CXX
)
project
(
TransferBench VERSION 1.5
5
.0 LANGUAGES CXX
)
# Default GPU architectures to build
# Default GPU architectures to build
#==================================================================================================
#==================================================================================================
...
...
Makefile
View file @
9f68d14d
...
@@ -18,7 +18,7 @@ endif
...
@@ -18,7 +18,7 @@ endif
CXXFLAGS
=
-I
$(ROCM_PATH)
/include
-lnuma
-L
$(ROCM_PATH)
/lib
-lhsa-runtime64
CXXFLAGS
=
-I
$(ROCM_PATH)
/include
-lnuma
-L
$(ROCM_PATH)
/lib
-lhsa-runtime64
NVFLAGS
=
-x
cu
-lnuma
-arch
=
native
NVFLAGS
=
-x
cu
-lnuma
-arch
=
native
COMMON_FLAGS
=
-O3
--std
=
c++20
-I
./src/header
-I
./src/client
-I
./src/client/Presets
COMMON_FLAGS
=
-g
-O3
--std
=
c++20
-I
./src/header
-I
./src/client
-I
./src/client/Presets
LDFLAGS
+=
-lpthread
LDFLAGS
+=
-lpthread
all
:
$(EXE)
all
:
$(EXE)
...
...
src/client/Client.hpp
View file @
9f68d14d
...
@@ -23,7 +23,7 @@ THE SOFTWARE.
...
@@ -23,7 +23,7 @@ THE SOFTWARE.
#pragma once
#pragma once
// TransferBench client version
// TransferBench client version
#define CLIENT_VERSION "1.5
4
.00"
#define CLIENT_VERSION "1.5
5
.00"
#include "TransferBench.hpp"
#include "TransferBench.hpp"
#include "EnvVars.hpp"
#include "EnvVars.hpp"
...
...
src/header/TransferBench.hpp
View file @
9f68d14d
...
@@ -30,6 +30,7 @@ THE SOFTWARE.
...
@@ -30,6 +30,7 @@ THE SOFTWARE.
#include <sstream>
#include <sstream>
#include <stdarg.h>
#include <stdarg.h>
#include <thread>
#include <thread>
#include <unistd.h>
#include <vector>
#include <vector>
#if defined(__NVCC__)
#if defined(__NVCC__)
...
@@ -1416,7 +1417,6 @@ namespace {
...
@@ -1416,7 +1417,6 @@ namespace {
int
const
numStreamsToUse
=
(
exeDevice
.
exeType
==
EXE_GPU_DMA
||
int
const
numStreamsToUse
=
(
exeDevice
.
exeType
==
EXE_GPU_DMA
||
(
exeDevice
.
exeType
==
EXE_GPU_GFX
&&
cfg
.
gfx
.
useMultiStream
))
(
exeDevice
.
exeType
==
EXE_GPU_GFX
&&
cfg
.
gfx
.
useMultiStream
))
?
exeInfo
.
resources
.
size
()
:
1
;
?
exeInfo
.
resources
.
size
()
:
1
;
exeInfo
.
streams
.
resize
(
numStreamsToUse
);
exeInfo
.
streams
.
resize
(
numStreamsToUse
);
// Create streams
// Create streams
...
@@ -1833,6 +1833,8 @@ namespace {
...
@@ -1833,6 +1833,8 @@ namespace {
// Execute a single GPU Transfer (when using 1 stream per Transfer)
// Execute a single GPU Transfer (when using 1 stream per Transfer)
static
ErrResult
ExecuteGpuTransfer
(
int
const
iteration
,
static
ErrResult
ExecuteGpuTransfer
(
int
const
iteration
,
hipStream_t
const
stream
,
hipStream_t
const
stream
,
hipEvent_t
const
startEvent
,
hipEvent_t
const
stopEvent
,
int
const
xccDim
,
int
const
xccDim
,
ConfigOptions
const
&
cfg
,
ConfigOptions
const
&
cfg
,
TransferResources
&
resources
)
TransferResources
&
resources
)
...
@@ -1844,33 +1846,41 @@ namespace {
...
@@ -1844,33 +1846,41 @@ namespace {
dim3
const
blockSize
(
cfg
.
gfx
.
blockSize
,
1
);
dim3
const
blockSize
(
cfg
.
gfx
.
blockSize
,
1
);
#if defined(__NVCC__)
#if defined(__NVCC__)
if
(
startEvent
!=
NULL
)
ERR_CHECK
(
hipEventRecord
(
startEvent
,
stream
));
GpuKernelTable
[
cfg
.
gfx
.
blockSize
/
64
-
1
][
cfg
.
gfx
.
unrollFactor
-
1
]
GpuKernelTable
[
cfg
.
gfx
.
blockSize
/
64
-
1
][
cfg
.
gfx
.
unrollFactor
-
1
]
<<<
gridSize
,
blockSize
,
0
,
stream
>>>
<<<
gridSize
,
blockSize
,
0
,
stream
>>>
(
resources
.
subExecParamGpuPtr
,
cfg
.
gfx
.
waveOrder
,
cfg
.
general
.
numSubIterations
);
(
resources
.
subExecParamGpuPtr
,
cfg
.
gfx
.
waveOrder
,
cfg
.
general
.
numSubIterations
);
if
(
stopEvent
!=
NULL
)
ERR_CHECK
(
hipEventRecord
(
stopEvent
,
stream
));
#else
#else
hipExtLaunchKernelGGL
(
GpuKernelTable
[
cfg
.
gfx
.
blockSize
/
64
-
1
][
cfg
.
gfx
.
unrollFactor
-
1
],
hipExtLaunchKernelGGL
(
GpuKernelTable
[
cfg
.
gfx
.
blockSize
/
64
-
1
][
cfg
.
gfx
.
unrollFactor
-
1
],
gridSize
,
blockSize
,
0
,
stream
,
gridSize
,
blockSize
,
0
,
stream
,
startEvent
,
stopEvent
,
NULL
,
NULL
,
0
,
resources
.
subExecParamGpuPtr
,
cfg
.
gfx
.
waveOrder
,
cfg
.
general
.
numSubIterations
);
0
,
resources
.
subExecParamGpuPtr
,
cfg
.
gfx
.
waveOrder
,
cfg
.
general
.
numSubIterations
);
#endif
#endif
ERR_CHECK
(
hipStreamSynchronize
(
stream
));
ERR_CHECK
(
hipStreamSynchronize
(
stream
));
auto
cpuDelta
=
std
::
chrono
::
high_resolution_clock
::
now
()
-
cpuStart
;
auto
cpuDelta
=
std
::
chrono
::
high_resolution_clock
::
now
()
-
cpuStart
;
double
d
eltaMsec
=
std
::
chrono
::
duration_cast
<
std
::
chrono
::
duration
<
double
>>
(
cpuDelta
).
count
()
*
1000.0
;
double
cpuD
eltaMsec
=
std
::
chrono
::
duration_cast
<
std
::
chrono
::
duration
<
double
>>
(
cpuDelta
).
count
()
*
1000.0
;
if
(
iteration
>=
0
)
{
if
(
iteration
>=
0
)
{
double
deltaMsec
=
cpuDeltaMsec
;
if
(
startEvent
!=
NULL
)
{
float
gpuDeltaMsec
;
ERR_CHECK
(
hipEventElapsedTime
(
&
gpuDeltaMsec
,
startEvent
,
stopEvent
));
deltaMsec
=
gpuDeltaMsec
;
}
resources
.
totalDurationMsec
+=
deltaMsec
;
resources
.
totalDurationMsec
+=
deltaMsec
;
if
(
cfg
.
general
.
recordPerIteration
)
{
if
(
cfg
.
general
.
recordPerIteration
)
{
resources
.
perIterMsec
.
push_back
(
deltaMsec
);
resources
.
perIterMsec
.
push_back
(
deltaMsec
);
#if !defined(__NVCC__)
std
::
set
<
std
::
pair
<
int
,
int
>>
CUs
;
std
::
set
<
std
::
pair
<
int
,
int
>>
CUs
;
for
(
int
i
=
0
;
i
<
numSubExecs
;
i
++
)
{
for
(
int
i
=
0
;
i
<
numSubExecs
;
i
++
)
{
CUs
.
insert
(
std
::
make_pair
(
resources
.
subExecParamGpuPtr
[
i
].
xccId
,
CUs
.
insert
(
std
::
make_pair
(
resources
.
subExecParamGpuPtr
[
i
].
xccId
,
GetId
(
resources
.
subExecParamGpuPtr
[
i
].
hwId
)));
GetId
(
resources
.
subExecParamGpuPtr
[
i
].
hwId
)));
}
}
resources
.
perIterCUs
.
push_back
(
CUs
);
resources
.
perIterCUs
.
push_back
(
CUs
);
#endif
}
}
}
}
return
ERR_NONE
;
return
ERR_NONE
;
...
@@ -1895,6 +1905,8 @@ namespace {
...
@@ -1895,6 +1905,8 @@ namespace {
ExecuteGpuTransfer
,
ExecuteGpuTransfer
,
iteration
,
iteration
,
exeInfo
.
streams
[
i
],
exeInfo
.
streams
[
i
],
cfg
.
gfx
.
useHipEvents
?
exeInfo
.
startEvents
[
i
]
:
NULL
,
cfg
.
gfx
.
useHipEvents
?
exeInfo
.
stopEvents
[
i
]
:
NULL
,
xccDim
,
xccDim
,
std
::
cref
(
cfg
),
std
::
cref
(
cfg
),
std
::
ref
(
exeInfo
.
resources
[
i
])));
std
::
ref
(
exeInfo
.
resources
[
i
])));
...
@@ -1931,7 +1943,7 @@ namespace {
...
@@ -1931,7 +1943,7 @@ namespace {
double
cpuDeltaMsec
=
std
::
chrono
::
duration_cast
<
std
::
chrono
::
duration
<
double
>>
(
cpuDelta
).
count
()
*
1000.0
;
double
cpuDeltaMsec
=
std
::
chrono
::
duration_cast
<
std
::
chrono
::
duration
<
double
>>
(
cpuDelta
).
count
()
*
1000.0
;
if
(
iteration
>=
0
)
{
if
(
iteration
>=
0
)
{
if
(
cfg
.
gfx
.
useHipEvents
)
{
if
(
cfg
.
gfx
.
useHipEvents
&&
!
cfg
.
gfx
.
useMultiStream
)
{
float
gpuDeltaMsec
;
float
gpuDeltaMsec
;
ERR_CHECK
(
hipEventElapsedTime
(
&
gpuDeltaMsec
,
exeInfo
.
startEvents
[
0
],
exeInfo
.
stopEvents
[
0
]));
ERR_CHECK
(
hipEventElapsedTime
(
&
gpuDeltaMsec
,
exeInfo
.
startEvents
[
0
],
exeInfo
.
stopEvents
[
0
]));
exeInfo
.
totalDurationMsec
+=
gpuDeltaMsec
;
exeInfo
.
totalDurationMsec
+=
gpuDeltaMsec
;
...
@@ -1940,34 +1952,31 @@ namespace {
...
@@ -1940,34 +1952,31 @@ namespace {
}
}
// Determine timing for each of the individual transfers that were part of this launch
// Determine timing for each of the individual transfers that were part of this launch
for
(
int
i
=
0
;
i
<
exeInfo
.
resources
.
size
();
i
++
)
{
if
(
!
cfg
.
gfx
.
useMultiStream
)
{
TransferResources
&
resources
=
exeInfo
.
resources
[
i
];
for
(
int
i
=
0
;
i
<
exeInfo
.
resources
.
size
();
i
++
)
{
long
long
minStartCycle
=
std
::
numeric_limits
<
long
long
>::
max
()
;
TransferResources
&
resources
=
exeInfo
.
resources
[
i
]
;
long
long
m
axStop
Cycle
=
std
::
numeric_limits
<
long
long
>::
m
in
();
long
long
m
inStart
Cycle
=
std
::
numeric_limits
<
long
long
>::
m
ax
();
std
::
set
<
std
::
pair
<
int
,
int
>>
CUs
;
long
long
maxStopCycle
=
std
::
numeric_limits
<
long
long
>::
min
()
;
std
::
set
<
std
::
pair
<
int
,
int
>>
CUs
;
for
(
auto
subExecIdx
:
resources
.
subExecIdx
)
{
minStartCycle
=
std
::
min
(
minStartCycle
,
exeInfo
.
subExecParamGpu
[
subExecIdx
].
startCycle
);
for
(
auto
subExecIdx
:
resources
.
subExecIdx
)
{
maxStop
Cycle
=
std
::
m
ax
(
maxStop
Cycle
,
exeInfo
.
subExecParamGpu
[
subExecIdx
].
st
op
Cycle
);
minStart
Cycle
=
std
::
m
in
(
minStart
Cycle
,
exeInfo
.
subExecParamGpu
[
subExecIdx
].
st
art
Cycle
);
if
(
cfg
.
general
.
recordPerIteration
)
{
maxStopCycle
=
std
::
max
(
maxStopCycle
,
exeInfo
.
subExecParamGpu
[
subExecIdx
].
stopCycle
);
#if !defined(__NVCC__)
if
(
cfg
.
general
.
recordPerIteration
)
{
CUs
.
insert
(
std
::
make_pair
(
exeInfo
.
subExecParamGpu
[
subExecIdx
].
xccId
,
CUs
.
insert
(
std
::
make_pair
(
exeInfo
.
subExecParamGpu
[
subExecIdx
].
xccId
,
GetId
(
exeInfo
.
subExecParamGpu
[
subExecIdx
].
hwId
)));
GetId
(
exeInfo
.
subExecParamGpu
[
subExecIdx
].
hwId
)));
#endif
}
}
}
}
double
deltaMsec
=
(
maxStopCycle
-
minStartCycle
)
/
(
double
)(
exeInfo
.
wallClockRate
);
double
deltaMsec
=
(
maxStopCycle
-
minStartCycle
)
/
(
double
)(
exeInfo
.
wallClockRate
);
resources
.
totalDurationMsec
+=
deltaMsec
;
resources
.
totalDurationMsec
+=
deltaMsec
;
if
(
cfg
.
general
.
recordPerIteration
)
{
if
(
cfg
.
general
.
recordPerIteration
)
{
resources
.
perIterMsec
.
push_back
(
deltaMsec
);
resources
.
perIterMsec
.
push_back
(
deltaMsec
);
#if !defined(__NVCC__)
resources
.
perIterCUs
.
push_back
(
CUs
);
resources
.
perIterCUs
.
push_back
(
CUs
);
}
#endif
}
}
}
}
}
}
return
ERR_NONE
;
return
ERR_NONE
;
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment