Commit d103e0cf authored by zhuwenwen's avatar zhuwenwen
Browse files

remove __MEMORY_SCOPE_SYSTEM

parent 675ba75f
...@@ -247,12 +247,16 @@ DINLINE void barrier_at_start(const RankSignals& sg, Signal* self_sg, ...@@ -247,12 +247,16 @@ DINLINE void barrier_at_start(const RankSignals& sg, Signal* self_sg,
if (threadIdx.x < ngpus) { if (threadIdx.x < ngpus) {
// simultaneously write to the corresponding flag of all ranks. // simultaneously write to the corresponding flag of all ranks.
// Latency = 1 p2p write // Latency = 1 p2p write
__scoped_atomic_store_n(&sg.signals[threadIdx.x]->start[blockIdx.x][rank], // __scoped_atomic_store_n(&sg.signals[threadIdx.x]->start[blockIdx.x][rank],
flag, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM); // flag, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM);
__atomic_store_n(&sg.signals[threadIdx.x]->start[blockIdx.x][rank], flag,
__ATOMIC_RELAXED);
// wait until we got true from all ranks // wait until we got true from all ranks
while (__scoped_atomic_load_n(&self_sg->start[blockIdx.x][threadIdx.x], // while (__scoped_atomic_load_n(&self_sg->start[blockIdx.x][threadIdx.x],
__ATOMIC_RELAXED, // __ATOMIC_RELAXED,
__MEMORY_SCOPE_DEVICE) < flag); // __MEMORY_SCOPE_DEVICE) < flag);
while (__atomic_load_n(&self_sg->start[blockIdx.x][threadIdx.x],
__ATOMIC_RELAXED) < flag);
} }
__syncthreads(); __syncthreads();
// use one thread to update flag // use one thread to update flag
...@@ -266,15 +270,20 @@ DINLINE void barrier_at_end(const RankSignals& sg, Signal* self_sg, int rank) { ...@@ -266,15 +270,20 @@ DINLINE void barrier_at_end(const RankSignals& sg, Signal* self_sg, int rank) {
if (threadIdx.x < ngpus) { if (threadIdx.x < ngpus) {
// simultaneously write to the corresponding flag of all ranks. // simultaneously write to the corresponding flag of all ranks.
// Latency = 1 p2p write // Latency = 1 p2p write
__scoped_atomic_store_n(&sg.signals[threadIdx.x]->end[blockIdx.x][rank], // __scoped_atomic_store_n(&sg.signals[threadIdx.x]->end[blockIdx.x][rank],
flag, // flag,
final_sync ? __ATOMIC_RELAXED : __ATOMIC_RELEASE, // final_sync ? __ATOMIC_RELAXED : __ATOMIC_RELEASE,
__MEMORY_SCOPE_SYSTEM); // __MEMORY_SCOPE_SYSTEM);
__atomic_store_n(&sg.signals[threadIdx.x]->end[blockIdx.x][rank], flag,
final_sync ? __ATOMIC_RELAXED : __ATOMIC_RELEASE);
// wait until we got true from all ranks // wait until we got true from all ranks
while ( // while (
__scoped_atomic_load_n(&self_sg->end[blockIdx.x][threadIdx.x], // __scoped_atomic_load_n(&self_sg->end[blockIdx.x][threadIdx.x],
final_sync ? __ATOMIC_RELAXED : __ATOMIC_ACQUIRE, // final_sync ? __ATOMIC_RELAXED : __ATOMIC_ACQUIRE,
__MEMORY_SCOPE_DEVICE) < flag); // __MEMORY_SCOPE_DEVICE) < flag);
while (__atomic_load_n(&self_sg->end[blockIdx.x][threadIdx.x],
final_sync ? __ATOMIC_RELAXED : __ATOMIC_ACQUIRE) <
flag);
} }
if constexpr (!final_sync) __syncthreads(); if constexpr (!final_sync) __syncthreads();
// use one thread to update flag // use one thread to update flag
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment