Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
DeepEP
Commits
1fc40d50
Commit
1fc40d50
authored
Mar 06, 2025
by
Chenggang Zhao
Browse files
Improve AR performance
parent
41385ba5
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
8 additions
and
4 deletions
+8
-4
csrc/config.hpp
csrc/config.hpp
+5
-1
csrc/kernels/internode.cu
csrc/kernels/internode.cu
+2
-2
csrc/kernels/internode_ll.cu
csrc/kernels/internode_ll.cu
+1
-1
No files found.
csrc/config.hpp
View file @
1fc40d50
...
@@ -34,8 +34,12 @@ struct Config {
...
@@ -34,8 +34,12 @@ struct Config {
EP_HOST_ASSERT
(
num_max_nvl_chunked_send_tokens
>
0
and
num_max_nvl_chunked_recv_tokens
>
0
);
EP_HOST_ASSERT
(
num_max_nvl_chunked_send_tokens
>
0
and
num_max_nvl_chunked_recv_tokens
>
0
);
EP_HOST_ASSERT
(
num_max_nvl_chunked_send_tokens
<
num_max_nvl_chunked_recv_tokens
);
EP_HOST_ASSERT
(
num_max_nvl_chunked_send_tokens
<
num_max_nvl_chunked_recv_tokens
);
EP_HOST_ASSERT
(
num_max_rdma_chunked_send_tokens
>
0
and
num_max_rdma_chunked_recv_tokens
>
0
);
EP_HOST_ASSERT
(
num_max_rdma_chunked_send_tokens
>
0
and
num_max_rdma_chunked_recv_tokens
>
0
);
EP_HOST_ASSERT
(
num_max_rdma_chunked_send_tokens
<
num_max_rdma_chunked_recv_tokens
);
// Ceil up RDMA buffer size
this
->
num_max_rdma_chunked_recv_tokens
=
align
<
int
>
(
num_max_rdma_chunked_recv_tokens
,
num_max_rdma_chunked_send_tokens
);
this
->
num_max_rdma_chunked_recv_tokens
=
align
<
int
>
(
num_max_rdma_chunked_recv_tokens
,
num_max_rdma_chunked_send_tokens
);
EP_HOST_ASSERT
(
num_max_rdma_chunked_send_tokens
<
num_max_rdma_chunked_recv_tokens
);
// NOTES: this assertion is related to RDMA lazy head update, we must ensure senders always have space to push
EP_HOST_ASSERT
(
num_max_rdma_chunked_send_tokens
<=
num_max_rdma_chunked_recv_tokens
/
2
);
}
}
size_t
get_nvl_buffer_size_hint
(
size_t
hidden_bytes
,
int
num_ranks
)
const
{
size_t
get_nvl_buffer_size_hint
(
size_t
hidden_bytes
,
int
num_ranks
)
const
{
...
...
csrc/kernels/internode.cu
View file @
1fc40d50
...
@@ -925,7 +925,7 @@ dispatch(int4* recv_x, float* recv_x_scales, int64_t* recv_topk_idx, float* recv
...
@@ -925,7 +925,7 @@ dispatch(int4* recv_x, float* recv_x_scales, int64_t* recv_topk_idx, float* recv
break
;
break
;
// Update remote head
// Update remote head
if
(
min_head
!=
std
::
numeric_limits
<
int
>::
max
()
and
min_head
>
last_head
and
lane_id
<
kNumRDMARanks
)
{
if
(
min_head
!=
std
::
numeric_limits
<
int
>::
max
()
and
min_head
>=
last_head
+
num_max_rdma_chunked_send_tokens
and
lane_id
<
kNumRDMARanks
)
{
nvshmemx_signal_op
(
rdma_channel_head
.
buffer
(
rdma_rank
),
min_head
-
last_head
,
NVSHMEM_SIGNAL_ADD
,
nvshmemx_signal_op
(
rdma_channel_head
.
buffer
(
rdma_rank
),
min_head
-
last_head
,
NVSHMEM_SIGNAL_ADD
,
translate_dst_rdma_rank
<
kLowLatencyMode
>
(
lane_id
,
nvl_rank
));
translate_dst_rdma_rank
<
kLowLatencyMode
>
(
lane_id
,
nvl_rank
));
last_head
=
min_head
;
last_head
=
min_head
;
...
@@ -1655,7 +1655,7 @@ combine(int4* combined_x, float* combined_topk_weights,
...
@@ -1655,7 +1655,7 @@ combine(int4* combined_x, float* combined_topk_weights,
#pragma unroll
#pragma unroll
for
(
int
i
=
0
;
i
<
kNumRDMAReceivers
;
++
i
)
if
(
not
rdma_receiver_retired
[
i
])
for
(
int
i
=
0
;
i
<
kNumRDMAReceivers
;
++
i
)
if
(
not
rdma_receiver_retired
[
i
])
min_head
=
min
(
min_head
,
rdma_receiver_rdma_head
[
i
][
dst_rdma_rank
]);
min_head
=
min
(
min_head
,
rdma_receiver_rdma_head
[
i
][
dst_rdma_rank
]);
if
(
min_head
!=
std
::
numeric_limits
<
int
>::
max
()
and
min_head
>
last_rdma_head
and
lane_id
<
kNumRDMARanks
)
{
if
(
min_head
!=
std
::
numeric_limits
<
int
>::
max
()
and
min_head
>=
last_rdma_head
+
num_max_rdma_chunked_send_tokens
and
lane_id
<
kNumRDMARanks
)
{
nvshmemx_signal_op
(
rdma_channel_head
.
buffer
(
rdma_rank
),
min_head
-
last_rdma_head
,
NVSHMEM_SIGNAL_ADD
,
nvshmemx_signal_op
(
rdma_channel_head
.
buffer
(
rdma_rank
),
min_head
-
last_rdma_head
,
NVSHMEM_SIGNAL_ADD
,
translate_dst_rdma_rank
<
kLowLatencyMode
>
(
dst_rdma_rank
,
nvl_rank
));
translate_dst_rdma_rank
<
kLowLatencyMode
>
(
dst_rdma_rank
,
nvl_rank
));
last_rdma_head
=
min_head
;
last_rdma_head
=
min_head
;
...
...
csrc/kernels/internode_ll.cu
View file @
1fc40d50
...
@@ -255,7 +255,7 @@ dispatch(void* packed_recv_x, float* packed_recv_x_scales,
...
@@ -255,7 +255,7 @@ dispatch(void* packed_recv_x, float* packed_recv_x_scales,
if
(
sub_warp_id
==
1
and
lane_id
==
0
)
{
if
(
sub_warp_id
==
1
and
lane_id
==
0
)
{
if
(
src_rank
!=
rank
)
{
if
(
src_rank
!=
rank
)
{
nvshmemi_ibgda_poll_recv
(
src_rank
,
local_expert_idx
);
nvshmemi_ibgda_poll_recv
(
src_rank
,
local_expert_idx
);
num_recv_tokens
=
ld_acquire_global
(
rdma_recv_count
+
local_expert_idx
*
num_ranks
+
src_rank
);
num_recv_tokens
=
ld_acquire_sys_global
(
rdma_recv_count
+
local_expert_idx
*
num_ranks
+
src_rank
);
EP_DEVICE_ASSERT
(
num_recv_tokens
!=
0
);
EP_DEVICE_ASSERT
(
num_recv_tokens
!=
0
);
}
else
{
}
else
{
while
((
num_recv_tokens
=
ld_acquire_global
(
rdma_recv_count
+
local_expert_idx
*
num_ranks
+
src_rank
))
==
0
);
while
((
num_recv_tokens
=
ld_acquire_global
(
rdma_recv_count
+
local_expert_idx
*
num_ranks
+
src_rank
))
==
0
);
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment