Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
DeepEP
Commits
1fc40d50
"examples/cpp-examples/HelloArgonInFortran.f90" did not exist on "346d3ce3244e9af797a12002ced79cbcb5190a5b"
Commit
1fc40d50
authored
Mar 06, 2025
by
Chenggang Zhao
Browse files
Improve AR performance
parent
41385ba5
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
8 additions
and
4 deletions
+8
-4
csrc/config.hpp
csrc/config.hpp
+5
-1
csrc/kernels/internode.cu
csrc/kernels/internode.cu
+2
-2
csrc/kernels/internode_ll.cu
csrc/kernels/internode_ll.cu
+1
-1
No files found.
csrc/config.hpp
View file @
1fc40d50
...
...
@@ -34,8 +34,12 @@ struct Config {
EP_HOST_ASSERT
(
num_max_nvl_chunked_send_tokens
>
0
and
num_max_nvl_chunked_recv_tokens
>
0
);
EP_HOST_ASSERT
(
num_max_nvl_chunked_send_tokens
<
num_max_nvl_chunked_recv_tokens
);
EP_HOST_ASSERT
(
num_max_rdma_chunked_send_tokens
>
0
and
num_max_rdma_chunked_recv_tokens
>
0
);
EP_HOST_ASSERT
(
num_max_rdma_chunked_send_tokens
<
num_max_rdma_chunked_recv_tokens
);
// Ceil up RDMA buffer size
this
->
num_max_rdma_chunked_recv_tokens
=
align
<
int
>
(
num_max_rdma_chunked_recv_tokens
,
num_max_rdma_chunked_send_tokens
);
EP_HOST_ASSERT
(
num_max_rdma_chunked_send_tokens
<
num_max_rdma_chunked_recv_tokens
);
// NOTES: this assertion is related to RDMA lazy head update, we must ensure senders always have space to push
EP_HOST_ASSERT
(
num_max_rdma_chunked_send_tokens
<=
num_max_rdma_chunked_recv_tokens
/
2
);
}
size_t
get_nvl_buffer_size_hint
(
size_t
hidden_bytes
,
int
num_ranks
)
const
{
...
...
csrc/kernels/internode.cu
View file @
1fc40d50
...
...
@@ -925,7 +925,7 @@ dispatch(int4* recv_x, float* recv_x_scales, int64_t* recv_topk_idx, float* recv
break
;
// Update remote head
if
(
min_head
!=
std
::
numeric_limits
<
int
>::
max
()
and
min_head
>
last_head
and
lane_id
<
kNumRDMARanks
)
{
if
(
min_head
!=
std
::
numeric_limits
<
int
>::
max
()
and
min_head
>
=
last_head
+
num_max_rdma_chunked_send_tokens
and
lane_id
<
kNumRDMARanks
)
{
nvshmemx_signal_op
(
rdma_channel_head
.
buffer
(
rdma_rank
),
min_head
-
last_head
,
NVSHMEM_SIGNAL_ADD
,
translate_dst_rdma_rank
<
kLowLatencyMode
>
(
lane_id
,
nvl_rank
));
last_head
=
min_head
;
...
...
@@ -1655,7 +1655,7 @@ combine(int4* combined_x, float* combined_topk_weights,
#pragma unroll
for
(
int
i
=
0
;
i
<
kNumRDMAReceivers
;
++
i
)
if
(
not
rdma_receiver_retired
[
i
])
min_head
=
min
(
min_head
,
rdma_receiver_rdma_head
[
i
][
dst_rdma_rank
]);
if
(
min_head
!=
std
::
numeric_limits
<
int
>::
max
()
and
min_head
>
last_rdma_head
and
lane_id
<
kNumRDMARanks
)
{
if
(
min_head
!=
std
::
numeric_limits
<
int
>::
max
()
and
min_head
>
=
last_rdma_head
+
num_max_rdma_chunked_send_tokens
and
lane_id
<
kNumRDMARanks
)
{
nvshmemx_signal_op
(
rdma_channel_head
.
buffer
(
rdma_rank
),
min_head
-
last_rdma_head
,
NVSHMEM_SIGNAL_ADD
,
translate_dst_rdma_rank
<
kLowLatencyMode
>
(
dst_rdma_rank
,
nvl_rank
));
last_rdma_head
=
min_head
;
...
...
csrc/kernels/internode_ll.cu
View file @
1fc40d50
...
...
@@ -255,7 +255,7 @@ dispatch(void* packed_recv_x, float* packed_recv_x_scales,
if
(
sub_warp_id
==
1
and
lane_id
==
0
)
{
if
(
src_rank
!=
rank
)
{
nvshmemi_ibgda_poll_recv
(
src_rank
,
local_expert_idx
);
num_recv_tokens
=
ld_acquire_global
(
rdma_recv_count
+
local_expert_idx
*
num_ranks
+
src_rank
);
num_recv_tokens
=
ld_acquire_
sys_
global
(
rdma_recv_count
+
local_expert_idx
*
num_ranks
+
src_rank
);
EP_DEVICE_ASSERT
(
num_recv_tokens
!=
0
);
}
else
{
while
((
num_recv_tokens
=
ld_acquire_global
(
rdma_recv_count
+
local_expert_idx
*
num_ranks
+
src_rank
))
==
0
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment