Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
f587d8f7
Commit
f587d8f7
authored
Apr 18, 2025
by
zhuwenwen
Browse files
update barrier_at_start and barrier_at_end
parent
b0eacb5b
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
4 additions
and
4 deletions
+4
-4
csrc/custom_all_reduce.cuh
csrc/custom_all_reduce.cuh
+4
-4
No files found.
csrc/custom_all_reduce.cuh
View file @
f587d8f7
...
@@ -388,13 +388,13 @@ __global__ void __launch_bounds__(512, 1)
...
@@ -388,13 +388,13 @@ __global__ void __launch_bounds__(512, 1)
__atomic_store_n
(
curr_hdp_reg
[
i
],
0x1
,
__ATOMIC_RELAXED
);
__atomic_store_n
(
curr_hdp_reg
[
i
],
0x1
,
__ATOMIC_RELAXED
);
}
}
}
}
start
_sync
<
ngpus
>
(
sg
,
self_sg
,
rank
);
barrier_at_
start
<
ngpus
>
(
sg
,
self_sg
,
rank
);
// do the actual reduction
// do the actual reduction
for
(
int
idx
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
idx
<
size
;
for
(
int
idx
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
idx
<
size
;
idx
+=
gridDim
.
x
*
blockDim
.
x
)
{
idx
+=
gridDim
.
x
*
blockDim
.
x
)
{
((
P
*
)
result
)[
idx
]
=
packed_reduce
<
P
,
ngpus
,
A
>
((
const
P
**
)
&
dp
.
ptrs
[
0
],
idx
);
((
P
*
)
result
)[
idx
]
=
packed_reduce
<
P
,
ngpus
,
A
>
((
const
P
**
)
&
dp
.
ptrs
[
0
],
idx
);
}
}
end_sync
<
ngpus
,
true
>
(
sg
,
self_sg
,
rank
);
barrier_at_end
<
ngpus
,
true
>
(
sg
,
self_sg
,
rank
);
}
}
template
<
typename
T
,
int
ngpus
>
template
<
typename
T
,
int
ngpus
>
...
@@ -424,13 +424,13 @@ __global__ void __launch_bounds__(512, 1)
...
@@ -424,13 +424,13 @@ __global__ void __launch_bounds__(512, 1)
tmps
[
i
]
=
get_tmp_buf
<
P
>
(
sg
.
signals
[
target
]);
tmps
[
i
]
=
get_tmp_buf
<
P
>
(
sg
.
signals
[
target
]);
}
}
auto
tmp_out
=
tmps
[
0
];
auto
tmp_out
=
tmps
[
0
];
start
_sync
<
ngpus
>
(
sg
,
self_sg
,
rank
);
barrier_at_
start
<
ngpus
>
(
sg
,
self_sg
,
rank
);
// stage 1: reduce scatter
// stage 1: reduce scatter
for
(
int
idx
=
start
+
tid
;
idx
<
end
;
idx
+=
stride
)
{
for
(
int
idx
=
start
+
tid
;
idx
<
end
;
idx
+=
stride
)
{
tmp_out
[
idx
-
start
]
=
packed_reduce
<
P
,
ngpus
,
A
>
(
ptrs
,
idx
);
tmp_out
[
idx
-
start
]
=
packed_reduce
<
P
,
ngpus
,
A
>
(
ptrs
,
idx
);
}
}
end_sync
<
ngpus
>
(
sg
,
self_sg
,
rank
);
barrier_at_end
<
ngpus
>
(
sg
,
self_sg
,
rank
);
// stage 2: allgather. Note: it's important to match the tid between
// stage 2: allgather. Note: it's important to match the tid between
// the two stages, because visibility across devices is only guaranteed
// the two stages, because visibility across devices is only guaranteed
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment