Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
d103e0cf
Commit
d103e0cf
authored
Apr 07, 2025
by
zhuwenwen
Browse files
remove __MEMORY_SCOPE_SYSTEM
parent
675ba75f
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
22 additions
and
13 deletions
+22
-13
csrc/custom_all_reduce.cuh
csrc/custom_all_reduce.cuh
+22
-13
No files found.
csrc/custom_all_reduce.cuh
View file @
d103e0cf
...
@@ -247,12 +247,16 @@ DINLINE void barrier_at_start(const RankSignals& sg, Signal* self_sg,
...
@@ -247,12 +247,16 @@ DINLINE void barrier_at_start(const RankSignals& sg, Signal* self_sg,
if
(
threadIdx
.
x
<
ngpus
)
{
if
(
threadIdx
.
x
<
ngpus
)
{
// simultaneously write to the corresponding flag of all ranks.
// simultaneously write to the corresponding flag of all ranks.
// Latency = 1 p2p write
// Latency = 1 p2p write
__scoped_atomic_store_n
(
&
sg
.
signals
[
threadIdx
.
x
]
->
start
[
blockIdx
.
x
][
rank
],
// __scoped_atomic_store_n(&sg.signals[threadIdx.x]->start[blockIdx.x][rank],
flag
,
__ATOMIC_RELAXED
,
__MEMORY_SCOPE_SYSTEM
);
// flag, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM);
__atomic_store_n
(
&
sg
.
signals
[
threadIdx
.
x
]
->
start
[
blockIdx
.
x
][
rank
],
flag
,
__ATOMIC_RELAXED
);
// wait until we got true from all ranks
// wait until we got true from all ranks
while
(
__scoped_atomic_load_n
(
&
self_sg
->
start
[
blockIdx
.
x
][
threadIdx
.
x
],
// while (__scoped_atomic_load_n(&self_sg->start[blockIdx.x][threadIdx.x],
__ATOMIC_RELAXED
,
// __ATOMIC_RELAXED,
__MEMORY_SCOPE_DEVICE
)
<
flag
);
// __MEMORY_SCOPE_DEVICE) < flag);
while
(
__atomic_load_n
(
&
self_sg
->
start
[
blockIdx
.
x
][
threadIdx
.
x
],
__ATOMIC_RELAXED
)
<
flag
);
}
}
__syncthreads
();
__syncthreads
();
// use one thread to update flag
// use one thread to update flag
...
@@ -266,15 +270,20 @@ DINLINE void barrier_at_end(const RankSignals& sg, Signal* self_sg, int rank) {
...
@@ -266,15 +270,20 @@ DINLINE void barrier_at_end(const RankSignals& sg, Signal* self_sg, int rank) {
if
(
threadIdx
.
x
<
ngpus
)
{
if
(
threadIdx
.
x
<
ngpus
)
{
// simultaneously write to the corresponding flag of all ranks.
// simultaneously write to the corresponding flag of all ranks.
// Latency = 1 p2p write
// Latency = 1 p2p write
__scoped_atomic_store_n
(
&
sg
.
signals
[
threadIdx
.
x
]
->
end
[
blockIdx
.
x
][
rank
],
// __scoped_atomic_store_n(&sg.signals[threadIdx.x]->end[blockIdx.x][rank],
flag
,
// flag,
final_sync
?
__ATOMIC_RELAXED
:
__ATOMIC_RELEASE
,
// final_sync ? __ATOMIC_RELAXED : __ATOMIC_RELEASE,
__MEMORY_SCOPE_SYSTEM
);
// __MEMORY_SCOPE_SYSTEM);
__atomic_store_n
(
&
sg
.
signals
[
threadIdx
.
x
]
->
end
[
blockIdx
.
x
][
rank
],
flag
,
final_sync
?
__ATOMIC_RELAXED
:
__ATOMIC_RELEASE
);
// wait until we got true from all ranks
// wait until we got true from all ranks
while
(
// while (
__scoped_atomic_load_n
(
&
self_sg
->
end
[
blockIdx
.
x
][
threadIdx
.
x
],
// __scoped_atomic_load_n(&self_sg->end[blockIdx.x][threadIdx.x],
final_sync
?
__ATOMIC_RELAXED
:
__ATOMIC_ACQUIRE
,
// final_sync ? __ATOMIC_RELAXED : __ATOMIC_ACQUIRE,
__MEMORY_SCOPE_DEVICE
)
<
flag
);
// __MEMORY_SCOPE_DEVICE) < flag);
while
(
__atomic_load_n
(
&
self_sg
->
end
[
blockIdx
.
x
][
threadIdx
.
x
],
final_sync
?
__ATOMIC_RELAXED
:
__ATOMIC_ACQUIRE
)
<
flag
);
}
}
if
constexpr
(
!
final_sync
)
__syncthreads
();
if
constexpr
(
!
final_sync
)
__syncthreads
();
// use one thread to update flag
// use one thread to update flag
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment