Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
one
TransferBench
Commits
3b2a07d6
Unverified
Commit
3b2a07d6
authored
Oct 30, 2025
by
alexxu-amd
Committed by
GitHub
Oct 30, 2025
Browse files
Sync release/rocm-rel-7.1 into docs/7.1.0
parents
6bcbcf4d
a824bc1b
Changes
8
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
236 additions
and
71 deletions
+236
-71
CHANGELOG.md
CHANGELOG.md
+21
-0
CMakeLists.txt
CMakeLists.txt
+1
-1
Makefile
Makefile
+0
-1
docs/sphinx/requirements.in
docs/sphinx/requirements.in
+1
-1
docs/sphinx/requirements.txt
docs/sphinx/requirements.txt
+1
-1
src/client/EnvVars.hpp
src/client/EnvVars.hpp
+37
-1
src/client/Presets/AllToAllSweep.hpp
src/client/Presets/AllToAllSweep.hpp
+45
-38
src/header/TransferBench.hpp
src/header/TransferBench.hpp
+130
-28
No files found.
CHANGELOG.md
View file @
3b2a07d6
...
...
@@ -3,6 +3,27 @@
Documentation for TransferBench is available at
[
https://rocm.docs.amd.com/projects/TransferBench
](
https://rocm.docs.amd.com/projects/TransferBench
)
.
## v1.64.00
### Added
-
Added BLOCKSIZES to a2asweep preset to allow also sweeping over threadblock sizes
-
Added FILL_COMPRESS to allow more control over input data pattern
-
FILL_COMPRESS takes in a comma-separated list of integer percentages (that must add up to 100)
that sets the percentages of 64B lines to be filled by random/1B0/2B0/4B0/32B0 data patterns
-
Bins:
-
0 - random
-
1 - 1B0 upper 1 byte of each aligned 2 bytes is 0
-
2 - 2B0 upper 2 bytes of each aligned 4 bytes is 0
-
3 - 4B0 upper 4 bytes of each aligned 8 bytes is 0
-
4 - 32B0 upper 32 bytes of each aligned 64-byte line are 0
-
FILL_PATTERN will be ignored if FILL_COMPRESS is specified
-
Additional details about data patterns generated will be printed if the debug env var DUMP_LINES is
set to a non-zero value, which also corresponds to how many 64 byte lines will be printed
### Modified
-
Increased GFX_BLOCKSIZE limit from 512 to 1024 (still requires multiple of 64)
### Fixed
-
Fixed bug when using BYTE_OFFSET
## v1.63.00
### Added
-
Added
`gfx950`
,
`gfx1150`
, and
`gfx1151`
to default GPU targets list in CMake builds
...
...
CMakeLists.txt
View file @
3b2a07d6
...
...
@@ -9,7 +9,7 @@ if (NOT CMAKE_TOOLCHAIN_FILE)
message
(
STATUS
"CMAKE_TOOLCHAIN_FILE:
${
CMAKE_TOOLCHAIN_FILE
}
"
)
endif
()
set
(
VERSION_STRING
"1.6
3
.00"
)
set
(
VERSION_STRING
"1.6
4
.00"
)
project
(
TransferBench VERSION
${
VERSION_STRING
}
LANGUAGES CXX
)
## Load CMake modules
...
...
Makefile
View file @
3b2a07d6
...
...
@@ -83,4 +83,3 @@ TransferBenchCuda: ./src/client/Client.cpp $(shell find -regex ".*\.\hpp")
clean
:
rm
-f
./TransferBench ./TransferBenchCuda
docs/sphinx/requirements.in
View file @
3b2a07d6
rocm-docs-core==1.2
2
.0
rocm-docs-core==1.2
3
.0
docs/sphinx/requirements.txt
View file @
3b2a07d6
...
...
@@ -187,7 +187,7 @@ requests==2.32.2
# via
# pygithub
# sphinx
rocm-docs-core==1.2
2
.0
rocm-docs-core==1.2
3
.0
# via -r requirements.in
rpds-py==0.22.3
# via
...
...
src/client/EnvVars.hpp
View file @
3b2a07d6
...
...
@@ -77,6 +77,7 @@ public:
int
blockBytes
;
// Each subexecutor, except the last, gets a multiple of this many bytes to copy
int
byteOffset
;
// Byte-offset for memory allocations
vector
<
float
>
fillPattern
;
// Pattern of floats used to fill source data
vector
<
int
>
fillCompress
;
// Percentages of 64B lines to be filled by random/1B0/2B0/4B0/32B0
int
validateDirect
;
// Validate GPU destination memory directly instead of staging GPU memory on host
int
validateSource
;
// Validate source GPU memory immediately after preparation
...
...
@@ -137,6 +138,7 @@ public:
alwaysValidate
=
GetEnvVar
(
"ALWAYS_VALIDATE"
,
0
);
blockBytes
=
GetEnvVar
(
"BLOCK_BYTES"
,
256
);
byteOffset
=
GetEnvVar
(
"BYTE_OFFSET"
,
0
);
fillCompress
=
GetEnvVarArray
(
"FILL_COMPRESS"
,
{});
gfxBlockOrder
=
GetEnvVar
(
"GFX_BLOCK_ORDER"
,
0
);
gfxBlockSize
=
GetEnvVar
(
"GFX_BLOCK_SIZE"
,
256
);
gfxSingleTeam
=
GetEnvVar
(
"GFX_SINGLE_TEAM"
,
1
);
...
...
@@ -314,6 +316,7 @@ public:
printf
(
" CLOSEST_NIC - Comma-separated list of per-GPU closest NIC (default=auto)
\n
"
);
#endif
printf
(
" CU_MASK - CU mask for streams. Can specify ranges e.g '5,10-12,14'
\n
"
);
printf
(
" FILL_COMPRESS - Percentages of 64B lines to be filled by random/1B0/2B0/4B0/32B0
\n
"
);
printf
(
" FILL_PATTERN - Big-endian pattern for source data, specified in hex digits. Must be even # of digits
\n
"
);
printf
(
" GFX_BLOCK_ORDER - How blocks for transfers are ordered. 0=sequential, 1=interleaved
\n
"
);
printf
(
" GFX_BLOCK_SIZE - # of threads per threadblock (Must be multiple of 64)
\n
"
);
...
...
@@ -400,6 +403,8 @@ public:
#endif
Print
(
"CU_MASK"
,
getenv
(
"CU_MASK"
)
?
1
:
0
,
"%s"
,
(
cuMask
.
size
()
?
GetCuMaskDesc
().
c_str
()
:
"All"
));
Print
(
"FILL_COMPRESS"
,
getenv
(
"FILL_COMPRESS"
)
?
1
:
0
,
"%s"
,
(
fillCompress
.
size
()
?
GetStr
(
fillCompress
).
c_str
()
:
"Not specified"
));
Print
(
"FILL_PATTERN"
,
getenv
(
"FILL_PATTERN"
)
?
1
:
0
,
"%s"
,
(
fillPattern
.
size
()
?
getenv
(
"FILL_PATTERN"
)
:
TransferBench
::
GetStrAttribute
(
ATR_SRC_PREP_DESCRIPTION
).
c_str
()));
Print
(
"GFX_BLOCK_ORDER"
,
gfxBlockOrder
,
...
...
@@ -493,6 +498,27 @@ public:
}
static
std
::
vector
<
int
>
GetEnvVarArray
(
std
::
string
const
&
varname
,
std
::
vector
<
int
>
const
&
defaultValue
)
{
if
(
getenv
(
varname
.
c_str
()))
{
std
::
vector
<
int
>
values
;
char
*
arrayStr
=
getenv
(
varname
.
c_str
());
char
*
token
=
strtok
(
arrayStr
,
","
);
while
(
token
)
{
int
val
;
if
(
sscanf
(
token
,
"%d"
,
&
val
)
==
1
)
{
values
.
push_back
(
val
);
}
else
{
printf
(
"[ERROR] Unrecognized token [%s]
\n
"
,
token
);
exit
(
1
);
}
token
=
strtok
(
NULL
,
","
);
}
return
values
;
}
return
defaultValue
;
}
static
std
::
vector
<
int
>
GetEnvVarRangeArray
(
std
::
string
const
&
varname
,
std
::
vector
<
int
>
const
&
defaultValue
)
{
if
(
getenv
(
varname
.
c_str
()))
{
char
*
rangeStr
=
getenv
(
varname
.
c_str
());
...
...
@@ -524,6 +550,15 @@ public:
return
defaultValue
;
}
std
::
string
GetStr
(
std
::
vector
<
int
>
const
&
varnameList
)
const
{
std
::
string
result
=
""
;
for
(
int
i
=
0
;
i
<
varnameList
.
size
();
i
++
)
{
if
(
i
)
result
+=
","
;
result
+=
std
::
to_string
(
varnameList
[
i
]);
}
return
result
;
}
std
::
string
GetCuMaskDesc
()
const
{
std
::
vector
<
std
::
pair
<
int
,
int
>>
runs
;
...
...
@@ -572,9 +607,10 @@ public:
cfg
.
data
.
alwaysValidate
=
alwaysValidate
;
cfg
.
data
.
blockBytes
=
blockBytes
;
cfg
.
data
.
byteOffset
=
byteOffset
;
cfg
.
data
.
fillCompress
=
fillCompress
;
cfg
.
data
.
fillPattern
=
fillPattern
;
cfg
.
data
.
validateDirect
=
validateDirect
;
cfg
.
data
.
validateSource
=
validateSource
;
cfg
.
data
.
fillPattern
=
fillPattern
;
cfg
.
dma
.
useHipEvents
=
useHipEvents
;
cfg
.
dma
.
useHsaCopy
=
useHsaDma
;
...
...
src/client/Presets/AllToAllSweep.hpp
View file @
3b2a07d6
...
...
@@ -50,6 +50,7 @@ void AllToAllSweepPreset(EnvVars& ev,
int
useSpray
=
EnvVars
::
GetEnvVar
(
"USE_SPRAY"
,
0
);
int
verbose
=
EnvVars
::
GetEnvVar
(
"VERBOSE"
,
0
);
std
::
vector
<
int
>
blockList
=
EnvVars
::
GetEnvVarArray
(
"BLOCKSIZES"
,
{
256
});
std
::
vector
<
int
>
unrollList
=
EnvVars
::
GetEnvVarArray
(
"UNROLLS"
,
{
1
,
2
,
3
,
4
,
6
,
8
});
std
::
vector
<
int
>
numCusList
=
EnvVars
::
GetEnvVarArray
(
"NUM_CUS"
,
{
4
,
8
,
12
,
16
,
24
,
32
});
...
...
@@ -77,6 +78,7 @@ void AllToAllSweepPreset(EnvVars& ev,
ev
.
Print
(
"A2A_MODE"
,
(
a2aMode
==
A2A_CUSTOM
)
?
std
::
to_string
(
numSrcs
)
+
":"
+
std
::
to_string
(
numDsts
)
:
std
::
to_string
(
a2aMode
),
(
a2aMode
==
A2A_CUSTOM
)
?
(
std
::
to_string
(
numSrcs
)
+
" read(s) "
+
std
::
to_string
(
numDsts
)
+
" write(s)"
).
c_str
()
:
a2aModeStr
[
a2aMode
]);
ev
.
Print
(
"BLOCKSIZES"
,
blockList
.
size
()
,
EnvVars
::
ToStr
(
blockList
).
c_str
());
ev
.
Print
(
"SHOW_MIN_ONLY"
,
showMinOnly
,
showMinOnly
?
"Showing only slowest GPU results"
:
"Showing slowest and fastest GPU results"
);
ev
.
Print
(
"NUM_CUS"
,
numCusList
.
size
(),
EnvVars
::
ToStr
(
numCusList
).
c_str
());
ev
.
Print
(
"NUM_GPU_DEVICES"
,
numGpus
,
"Using %d GPUs"
,
numGpus
);
...
...
@@ -180,6 +182,10 @@ void AllToAllSweepPreset(EnvVars& ev,
std
::
map
<
std
::
pair
<
int
,
int
>
,
TransferBench
::
TestResults
>
results
;
// Display summary
for
(
int
blockSize
:
blockList
)
{
printf
(
"Blocksize: %d
\n
"
,
blockSize
);
ev
.
gfxBlockSize
=
cfg
.
gfx
.
blockSize
=
blockSize
;
printf
(
"#CUs
\\
Unroll"
);
for
(
int
u
:
unrollList
)
{
printf
(
" %d(Min) "
,
u
);
...
...
@@ -225,4 +231,5 @@ void AllToAllSweepPreset(EnvVars& ev,
}
}
}
}
}
src/header/TransferBench.hpp
View file @
3b2a07d6
/*
Copyright (c)
2019-2024
Advanced Micro Devices, Inc. All rights reserved.
Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
...
...
@@ -66,7 +66,7 @@ namespace TransferBench
using
std
::
set
;
using
std
::
vector
;
constexpr
char
VERSION
[]
=
"1.6
3
"
;
constexpr
char
VERSION
[]
=
"1.6
4
"
;
/**
* Enumeration of supported Executor types
...
...
@@ -166,6 +166,7 @@ namespace TransferBench
int
blockBytes
=
256
;
///< Each subexecutor works on a multiple of this many bytes
int
byteOffset
=
0
;
///< Byte-offset for memory allocations
vector
<
float
>
fillPattern
=
{};
///< Pattern of floats used to fill source data
vector
<
int
>
fillCompress
=
{};
///< Customized data patterns (overrides fillPattern if non-empty)
int
validateDirect
=
0
;
///< Validate GPU results directly instead of copying to host
int
validateSource
=
0
;
///< Validate src GPU memory immediately after preparation
};
...
...
@@ -599,7 +600,7 @@ namespace {
// Constants
//========================================================================================
int
constexpr
MAX_BLOCKSIZE
=
512
;
// Max threadblock size
int
constexpr
MAX_BLOCKSIZE
=
1024
;
// Max threadblock size
int
constexpr
MAX_WAVEGROUPS
=
MAX_BLOCKSIZE
/
64
;
// Max wavegroups/warps
int
constexpr
MAX_UNROLL
=
8
;
// Max unroll factor
int
constexpr
MAX_SRCS
=
8
;
// Max srcs per Transfer
...
...
@@ -681,8 +682,8 @@ namespace {
int
canAccess
;
ERR_CHECK
(
hipDeviceCanAccessPeer
(
&
canAccess
,
deviceId
,
peerDeviceId
));
if
(
!
canAccess
)
return
{
ERR_FATAL
,
"Unable to enable peer access from GPU devices %d to %d
"
,
peerDeviceId
,
deviceId
};
return
{
ERR_FATAL
,
"Peer access is unavailable between GPU devices %d to %d."
"For AMD hardware, check IOMMU configuration
"
,
peerDeviceId
,
deviceId
};
ERR_CHECK
(
hipSetDevice
(
deviceId
));
hipError_t
error
=
hipDeviceEnablePeerAccess
(
peerDeviceId
,
0
);
...
...
@@ -973,6 +974,19 @@ namespace {
errors
.
push_back
({
ERR_FATAL
,
"[data.blockBytes] must be positive multiple of %lu"
,
sizeof
(
float
)});
if
(
cfg
.
data
.
byteOffset
<
0
||
cfg
.
data
.
byteOffset
%
sizeof
(
float
))
errors
.
push_back
({
ERR_FATAL
,
"[data.byteOffset] must be positive multiple of %lu"
,
sizeof
(
float
)});
if
(
cfg
.
data
.
fillCompress
.
size
()
>
0
&&
cfg
.
data
.
fillPattern
.
size
()
>
0
)
errors
.
push_back
({
ERR_WARN
,
"[data.fillCompress] will override [data.fillPattern] when both are specified"
});
if
(
cfg
.
data
.
fillCompress
.
size
()
>
0
)
{
int
sum
=
0
;
for
(
int
bin
:
cfg
.
data
.
fillCompress
)
sum
+=
bin
;
if
(
sum
!=
100
)
{
errors
.
push_back
({
ERR_FATAL
,
"[data.fillCompress] values must add up to 100"
});
}
}
if
(
cfg
.
data
.
fillCompress
.
size
()
>
5
)
{
errors
.
push_back
({
ERR_FATAL
,
"[data.fillCompress] may only have up to 5 values"
});
}
// Check GFX options
if
(
cfg
.
gfx
.
blockOrder
<
0
||
cfg
.
gfx
.
blockOrder
>
2
)
...
...
@@ -2162,8 +2176,104 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
{
size_t
N
=
cpuBuffer
.
size
();
// Source buffer
if
(
bufferIdx
>=
0
)
{
if
(
!
cfg
.
data
.
fillCompress
.
empty
())
{
// 0 -> Random
// 1 -> 1B0 - The upper 1 byte of each aligned 2 bytes is 0
// 2 -> 2B0 - The upper 2 bytes of each aligned 4 bytes are 0
// 3 -> 4B0 - The upper 4 bytes of each aligned 8 bytes are 0
// 4 -> 32B0 - The upper 32 bytes of each 64-byte line are 0
// Fill buffer with random floats
std
::
mt19937
gen
;
gen
.
seed
(
bufferIdx
*
425
);
std
::
uniform_real_distribution
<
float
>
dist
(
-
100000.0
f
,
+
100000.0
f
);
for
(
size_t
i
=
0
;
i
<
N
;
i
++
)
{
cpuBuffer
[
i
]
=
dist
(
gen
);
}
// Figure out distribution for lines based on the percentages given
size_t
numLines
=
N
/
16
;
size_t
leftover
=
numLines
;
std
::
vector
<
size_t
>
lineCounts
(
5
,
0
);
std
::
set
<
std
::
pair
<
double
,
int
>>
remainder
;
// Assign rounded down values first
std
::
vector
<
int
>
percentages
=
cfg
.
data
.
fillCompress
;
while
(
percentages
.
size
()
<
5
)
percentages
.
push_back
(
0
);
for
(
int
i
=
0
;
i
<
percentages
.
size
();
i
++
){
lineCounts
[
i
]
=
(
size_t
)(
numLines
*
(
percentages
[
i
]
/
100.0
));
leftover
-=
lineCounts
[
i
];
remainder
.
insert
(
std
::
make_pair
(
numLines
*
(
percentages
[
i
]
/
100.0
)
-
lineCounts
[
i
],
i
));
}
// Assign leftovers based on largest remainder
while
(
leftover
!=
0
)
{
auto
last
=
*
remainder
.
rbegin
();
lineCounts
[
last
.
second
]
++
;
remainder
.
erase
(
last
);
leftover
--
;
}
// Randomly decide which lines get assigned to which types
std
::
vector
<
int
>
lineTypes
(
numLines
,
0
);
int
offset
=
lineCounts
[
0
];
for
(
int
i
=
1
;
i
<
5
;
i
++
)
{
for
(
int
j
=
0
;
j
<
lineCounts
[
i
];
j
++
)
lineTypes
[
offset
++
]
=
i
;
}
std
::
shuffle
(
lineTypes
.
begin
(),
lineTypes
.
end
(),
gen
);
// Apply zero-ing
int
dumpLines
=
getenv
(
"DUMP_LINES"
)
?
atoi
(
getenv
(
"DUMP_LINES"
))
:
0
;
if
(
dumpLines
)
{
printf
(
"Input pattern 64B line statistics for bufferIdx %d:
\n
"
,
bufferIdx
);
printf
(
"Total lines: %lu
\n
"
,
numLines
);
printf
(
"- 0: Random : %8lu (%8.3f%%)
\n
"
,
lineCounts
[
0
],
100.0
*
lineCounts
[
0
]
/
(
1.0
*
numLines
));
printf
(
"- 1: 1B0 : %8lu (%8.3f%%)
\n
"
,
lineCounts
[
1
],
100.0
*
lineCounts
[
1
]
/
(
1.0
*
numLines
));
printf
(
"- 2: 2B0 : %8lu (%8.3f%%)
\n
"
,
lineCounts
[
2
],
100.0
*
lineCounts
[
2
]
/
(
1.0
*
numLines
));
printf
(
"- 3: 4B0 : %8lu (%8.3f%%)
\n
"
,
lineCounts
[
3
],
100.0
*
lineCounts
[
3
]
/
(
1.0
*
numLines
));
printf
(
"- 4: 32B0 : %8lu (%8.3f%%)
\n
"
,
lineCounts
[
4
],
100.0
*
lineCounts
[
4
]
/
(
1.0
*
numLines
));
}
for
(
int
line
=
0
;
line
<
numLines
;
line
++
)
{
unsigned
char
*
linePtr
=
(
unsigned
char
*
)
&
cpuBuffer
[
line
*
16
];
switch
(
lineTypes
[
line
])
{
case
1
:
// 1B0
for
(
int
i
=
0
;
i
<
32
;
i
++
)
linePtr
[
2
*
i
+
1
]
=
0
;
break
;
case
2
:
// 2B0
for
(
int
i
=
0
;
i
<
16
;
i
++
)
{
linePtr
[
4
*
i
+
2
]
=
0
;
linePtr
[
4
*
i
+
3
]
=
0
;
}
break
;
case
3
:
// 4B0
for
(
int
i
=
0
;
i
<
8
;
i
++
)
{
linePtr
[
8
*
i
+
4
]
=
0
;
linePtr
[
8
*
i
+
5
]
=
0
;
linePtr
[
8
*
i
+
6
]
=
0
;
linePtr
[
8
*
i
+
7
]
=
0
;
}
break
;
case
4
:
// 32B0
for
(
int
i
=
32
;
i
<
64
;
i
++
)
linePtr
[
i
]
=
0
;
break
;
}
if
(
line
<
dumpLines
)
{
printf
(
"Line %02d [%d]: "
,
line
,
lineTypes
[
line
]);
for
(
int
j
=
63
;
j
>=
0
;
j
--
){
printf
(
"%02x "
,
linePtr
[
j
]);
if
(
j
%
16
==
0
)
printf
(
" "
);
}
printf
(
"
\n
"
);
}
}
}
else
{
// Use fill pattern if specified
size_t
patternLen
=
cfg
.
data
.
fillPattern
.
size
();
if
(
patternLen
>
0
)
{
...
...
@@ -2177,27 +2287,10 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
if
(
leftOver
)
memcpy
(
cpuBufferPtr
,
cfg
.
data
.
fillPattern
.
data
(),
leftOver
*
sizeof
(
float
));
}
else
{
// Fall back to pseudo-random
for
(
size_t
i
=
0
;
i
<
N
;
++
i
)
cpuBuffer
[
i
]
=
PrepSrcValue
(
bufferIdx
,
i
);
}
}
else
{
// Destination buffer
int
numSrcs
=
-
bufferIdx
-
1
;
if
(
numSrcs
==
0
)
{
// Note: 0x75757575 = 13323083.0
memset
(
cpuBuffer
.
data
(),
MEMSET_CHAR
,
N
*
sizeof
(
float
));
}
else
{
PrepareReference
(
cfg
,
cpuBuffer
,
0
);
if
(
numSrcs
>
1
)
{
std
::
vector
<
float
>
temp
(
N
);
for
(
int
i
=
1
;
i
<
numSrcs
;
i
++
)
{
PrepareReference
(
cfg
,
temp
,
i
);
for
(
int
j
=
0
;
j
<
N
;
j
++
)
{
cpuBuffer
[
i
]
+=
temp
[
i
];
}
}
}
}
}
}
...
...
@@ -2472,7 +2565,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
}
std
::
random_device
rd
;
std
::
default_random_engine
gen
(
rd
());
std
::
mt19937
gen
(
rd
());
std
::
shuffle
(
indices
.
begin
(),
indices
.
end
(),
gen
);
// Build randomized threadblock list
...
...
@@ -3055,7 +3148,15 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
GPU_KERNEL_UNROLL_DECL
(
320
),
GPU_KERNEL_UNROLL_DECL
(
384
),
GPU_KERNEL_UNROLL_DECL
(
448
),
GPU_KERNEL_UNROLL_DECL
(
512
)
GPU_KERNEL_UNROLL_DECL
(
512
),
GPU_KERNEL_UNROLL_DECL
(
576
),
GPU_KERNEL_UNROLL_DECL
(
640
),
GPU_KERNEL_UNROLL_DECL
(
704
),
GPU_KERNEL_UNROLL_DECL
(
768
),
GPU_KERNEL_UNROLL_DECL
(
832
),
GPU_KERNEL_UNROLL_DECL
(
896
),
GPU_KERNEL_UNROLL_DECL
(
960
),
GPU_KERNEL_UNROLL_DECL
(
1024
),
};
#undef GPU_KERNEL_UNROLL_DECL
#undef GPU_KERNEL_DWORD_DECL
...
...
@@ -3442,6 +3543,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
vector
<
float
>
outputBuffer
(
maxN
);
vector
<
vector
<
float
>>
dstReference
(
maxNumSrcs
+
1
,
vector
<
float
>
(
maxN
));
{
size_t
initOffset
=
cfg
.
data
.
byteOffset
/
sizeof
(
float
);
vector
<
vector
<
float
>>
srcReference
(
maxNumSrcs
,
vector
<
float
>
(
maxN
));
memset
(
dstReference
[
0
].
data
(),
MEMSET_CHAR
,
maxNumBytes
);
...
...
@@ -3458,7 +3560,7 @@ static bool IsConfiguredGid(union ibv_gid const& gid)
// Initialize all src memory buffers
for
(
auto
resource
:
transferResources
)
{
for
(
int
srcIdx
=
0
;
srcIdx
<
resource
->
srcMem
.
size
();
srcIdx
++
)
{
ERR_APPEND
(
hipMemcpy
(
resource
->
srcMem
[
srcIdx
],
srcReference
[
srcIdx
].
data
(),
resource
->
numBytes
,
ERR_APPEND
(
hipMemcpy
(
resource
->
srcMem
[
srcIdx
]
+
initOffset
,
srcReference
[
srcIdx
].
data
(),
resource
->
numBytes
,
hipMemcpyDefault
),
errResults
);
}
}
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment