Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
tsoc
superbenchmark
Commits
eea26d0d
Commit
eea26d0d
authored
Apr 17, 2026
by
one
Browse files
Improve launch bounds for gpu-copy
parent
2ea51c1d
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
37 additions
and
1 deletion
+37
-1
superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu
...chmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu
+37
-1
No files found.
superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu
View file @
eea26d0d
...
...
@@ -875,11 +875,42 @@ __global__ void SMOneToAllCopyKernel(ulong2 **dst_buffers, ulong2 *src_buffer, u
}
}
// Compute a thread block size for SMOneToAllCopyKernel that does not exceed the
// per-block thread limit reported by the runtime for that kernel.
//
// opts:              benchmark options; opts.all_to_all_thread_block_size is the requested size.
// thread_block_size: output; receives the requested size, capped at the kernel's
//                    maxThreadsPerBlock. Must not be nullptr.
//
// Returns 0 on success. Returns -1 if thread_block_size is nullptr, if
// cudaFuncGetAttributes fails, or if the reported maxThreadsPerBlock is not positive;
// the output is left unmodified in those cases.
int GetSafeAllToAllThreadBlockSize(const Opts &opts, int *thread_block_size) {
    if (thread_block_size == nullptr) {
        return -1;
    }

    cudaFuncAttributes func_attr;
#if defined(__HIP_PLATFORM_AMD__)
    cudaError_t cuda_err =
        cudaFuncGetAttributes(&func_attr, reinterpret_cast<const void *>(SMOneToAllCopyKernel));
#else
    cudaError_t cuda_err = cudaFuncGetAttributes(&func_attr, SMOneToAllCopyKernel);
#endif
    if (cuda_err != cudaSuccess) {
        fprintf(stderr, "GetSafeAllToAllThreadBlockSize::cudaFuncGetAttributes error: %d\n", cuda_err);
        return -1;
    }

    if (func_attr.maxThreadsPerBlock <= 0) {
        fprintf(stderr, "GetSafeAllToAllThreadBlockSize::invalid maxThreadsPerBlock: %d\n",
                func_attr.maxThreadsPerBlock);
        return -1;
    }

    // Compare in the option's own type BEFORE narrowing to int. Casting first could turn
    // an oversized request into a negative int that would slip past the upper-bound check.
    // maxThreadsPerBlock is known to be positive here, so widening it is lossless.
    const auto requested_size = opts.all_to_all_thread_block_size;
    const auto max_size = static_cast<decltype(requested_size)>(func_attr.maxThreadsPerBlock);
    if (requested_size > max_size) {
        *thread_block_size = func_attr.maxThreadsPerBlock;
    } else {
        *thread_block_size = static_cast<int>(requested_size);
    }

    return 0;
}
// src_rank/dst_rank: < 0 for all ranks, else for specified rank
int
RunAllToAllBench
(
const
Opts
&
opts
,
int
gpu_count
,
int
src_rank
,
int
dst_rank
)
{
int
ret
=
0
;
cudaError_t
cuda_err
=
cudaSuccess
;
int
can_access
=
0
;
int
thread_block_size
=
0
;
std
::
vector
<
uint8_t
*>
src_buffers_gpu
(
gpu_count
,
nullptr
);
std
::
vector
<
uint8_t
*>
dst_buffers_gpu
(
gpu_count
,
nullptr
);
...
...
@@ -890,6 +921,11 @@ int RunAllToAllBench(const Opts &opts, int gpu_count, int src_rank, int dst_rank
uint64_t
*
data_buffer_cpu
=
nullptr
;
ret
=
GetSafeAllToAllThreadBlockSize
(
opts
,
&
thread_block_size
);
if
(
ret
!=
0
)
{
return
-
1
;
}
// Scan all GPUs
for
(
int
i
=
0
;
i
<
gpu_count
;
i
++
)
{
for
(
int
j
=
0
;
j
<
gpu_count
;
j
++
)
{
...
...
@@ -1011,7 +1047,7 @@ int RunAllToAllBench(const Opts &opts, int gpu_count, int src_rank, int dst_rank
}
}
SMOneToAllCopyKernel
<<<
gpu_count
*
opts
.
all_to_all_num_thread_blocks_per_rank
,
opts
.
all_to_all_
thread_block_size
,
0
,
streams
[
rank
]
>>>
(
thread_block_size
,
0
,
streams
[
rank
]
>>>
(
(
ulong2
**
)
dst_buffer_gpu_args
[
rank
],
(
ulong2
*
)
src_buffers_gpu
[
rank
],
opts
.
size
,
rank
,
dst_rank
,
gpu_count
);
if
(
i
==
opts
.
num_warm_up
+
opts
.
num_loops
-
1
)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment