Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tsoc
superbenchmark
Commits
eea26d0d
Commit
eea26d0d
authored
Apr 17, 2026
by
one
Browse files
Improve launch bounds for gpu-copy
parent
2ea51c1d
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
37 additions
and
1 deletion
+37
-1
superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu
...chmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu
+37
-1
No files found.
superbench/benchmarks/micro_benchmarks/gpu_copy_performance/gpu_copy.cu
View file @
eea26d0d
...
@@ -875,11 +875,42 @@ __global__ void SMOneToAllCopyKernel(ulong2 **dst_buffers, ulong2 *src_buffer, u
...
@@ -875,11 +875,42 @@ __global__ void SMOneToAllCopyKernel(ulong2 **dst_buffers, ulong2 *src_buffer, u
}
}
}
}
// Computes a safe thread block size for the all-to-all copy benchmark.
//
// Clamps the user-requested block size (opts.all_to_all_thread_block_size) to
// the maximum threads-per-block that SMOneToAllCopyKernel can actually be
// launched with on this device, as reported by cudaFuncGetAttributes.
//
// @param opts              benchmark options; only all_to_all_thread_block_size is read.
// @param thread_block_size out-parameter receiving the clamped block size; must be non-null.
// @return 0 on success; -1 on error (null out-pointer, CUDA attribute query
//         failure, bogus reported maxThreadsPerBlock, or a non-positive
//         requested block size).
int GetSafeAllToAllThreadBlockSize(const Opts &opts, int *thread_block_size) {
    if (thread_block_size == nullptr) {
        return -1;
    }
    cudaFuncAttributes func_attr;
#if defined(__HIP_PLATFORM_AMD__)
    // HIP's cudaFuncGetAttributes takes the kernel symbol as a const void *.
    cudaError_t cuda_err = cudaFuncGetAttributes(&func_attr, reinterpret_cast<const void *>(SMOneToAllCopyKernel));
#else
    cudaError_t cuda_err = cudaFuncGetAttributes(&func_attr, SMOneToAllCopyKernel);
#endif
    if (cuda_err != cudaSuccess) {
        fprintf(stderr, "GetSafeAllToAllThreadBlockSize::cudaFuncGetAttributes error: %d\n", cuda_err);
        return -1;
    }
    if (func_attr.maxThreadsPerBlock <= 0) {
        fprintf(stderr, "GetSafeAllToAllThreadBlockSize::invalid maxThreadsPerBlock: %d\n",
                func_attr.maxThreadsPerBlock);
        return -1;
    }
    *thread_block_size = static_cast<int>(opts.all_to_all_thread_block_size);
    // Reject zero/negative requests, including values that wrapped negative in
    // the cast above — a kernel launch with a non-positive block size would
    // otherwise fail later, far from the misconfiguration that caused it.
    if (*thread_block_size <= 0) {
        fprintf(stderr, "GetSafeAllToAllThreadBlockSize::invalid requested thread_block_size: %d\n",
                *thread_block_size);
        return -1;
    }
    if (*thread_block_size > func_attr.maxThreadsPerBlock) {
        *thread_block_size = func_attr.maxThreadsPerBlock;
    }
    return 0;
}
// src_rank/dst_rank: < 0 for all ranks, else for specified rank
// src_rank/dst_rank: < 0 for all ranks, else for specified rank
int
RunAllToAllBench
(
const
Opts
&
opts
,
int
gpu_count
,
int
src_rank
,
int
dst_rank
)
{
int
RunAllToAllBench
(
const
Opts
&
opts
,
int
gpu_count
,
int
src_rank
,
int
dst_rank
)
{
int
ret
=
0
;
int
ret
=
0
;
cudaError_t
cuda_err
=
cudaSuccess
;
cudaError_t
cuda_err
=
cudaSuccess
;
int
can_access
=
0
;
int
can_access
=
0
;
int
thread_block_size
=
0
;
std
::
vector
<
uint8_t
*>
src_buffers_gpu
(
gpu_count
,
nullptr
);
std
::
vector
<
uint8_t
*>
src_buffers_gpu
(
gpu_count
,
nullptr
);
std
::
vector
<
uint8_t
*>
dst_buffers_gpu
(
gpu_count
,
nullptr
);
std
::
vector
<
uint8_t
*>
dst_buffers_gpu
(
gpu_count
,
nullptr
);
...
@@ -890,6 +921,11 @@ int RunAllToAllBench(const Opts &opts, int gpu_count, int src_rank, int dst_rank
...
@@ -890,6 +921,11 @@ int RunAllToAllBench(const Opts &opts, int gpu_count, int src_rank, int dst_rank
uint64_t
*
data_buffer_cpu
=
nullptr
;
uint64_t
*
data_buffer_cpu
=
nullptr
;
ret
=
GetSafeAllToAllThreadBlockSize
(
opts
,
&
thread_block_size
);
if
(
ret
!=
0
)
{
return
-
1
;
}
// Scan all GPUs
// Scan all GPUs
for
(
int
i
=
0
;
i
<
gpu_count
;
i
++
)
{
for
(
int
i
=
0
;
i
<
gpu_count
;
i
++
)
{
for
(
int
j
=
0
;
j
<
gpu_count
;
j
++
)
{
for
(
int
j
=
0
;
j
<
gpu_count
;
j
++
)
{
...
@@ -1011,7 +1047,7 @@ int RunAllToAllBench(const Opts &opts, int gpu_count, int src_rank, int dst_rank
...
@@ -1011,7 +1047,7 @@ int RunAllToAllBench(const Opts &opts, int gpu_count, int src_rank, int dst_rank
}
}
}
}
SMOneToAllCopyKernel
<<<
gpu_count
*
opts
.
all_to_all_num_thread_blocks_per_rank
,
SMOneToAllCopyKernel
<<<
gpu_count
*
opts
.
all_to_all_num_thread_blocks_per_rank
,
opts
.
all_to_all_
thread_block_size
,
0
,
streams
[
rank
]
>>>
(
thread_block_size
,
0
,
streams
[
rank
]
>>>
(
(
ulong2
**
)
dst_buffer_gpu_args
[
rank
],
(
ulong2
*
)
src_buffers_gpu
[
rank
],
opts
.
size
,
rank
,
dst_rank
,
(
ulong2
**
)
dst_buffer_gpu_args
[
rank
],
(
ulong2
*
)
src_buffers_gpu
[
rank
],
opts
.
size
,
rank
,
dst_rank
,
gpu_count
);
gpu_count
);
if
(
i
==
opts
.
num_warm_up
+
opts
.
num_loops
-
1
)
{
if
(
i
==
opts
.
num_warm_up
+
opts
.
num_loops
-
1
)
{
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment