Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
DeepEP
Commits
d6104f9c
Commit
d6104f9c
authored
May 09, 2026
by
lijian
Browse files
feat: Make DeepEP CPU timeout configurable and increase default timeout.
Signed-off-by:
lijian
<
lijina6@sugon.com
>
parent
0494e395
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
19 additions
and
4 deletions
+19
-4
csrc/deep_ep.cu
csrc/deep_ep.cu
+2
-2
csrc/kernels/configs.cuh
csrc/kernels/configs.cuh
+17
-2
No files found.
csrc/deep_ep.cu
View file @
d6104f9c
...
...
@@ -551,7 +551,7 @@ Buffer::intranode_dispatch(
// Timeout check
if
(
std
::
chrono
::
duration_cast
<
std
::
chrono
::
seconds
>
(
std
::
chrono
::
high_resolution_clock
::
now
()
-
start_time
)
.
count
()
>
NUM_CPU_TIMEOUT_SECS
)
.
count
()
>
get_num_cpu_timeout_secs
()
)
throw
std
::
runtime_error
(
"DeepEP error: CPU recv timeout"
);
}
num_recv_tokens_per_expert_list
=
std
::
vector
<
int
>
(
...
...
@@ -992,7 +992,7 @@ Buffer::internode_dispatch(const torch::Tensor &x, const std::optional<torch::Te
// Timeout check
if
(
std
::
chrono
::
duration_cast
<
std
::
chrono
::
seconds
>
(
std
::
chrono
::
high_resolution_clock
::
now
()
-
start_time
)
.
count
()
>
NUM_CPU_TIMEOUT_SECS
)
{
.
count
()
>
get_num_cpu_timeout_secs
()
)
{
printf
(
"Global rank: %d, num_recv_tokens: %d, num_rdma_recv_tokens: %d
\n
"
,
rank
,
num_recv_tokens
,
num_rdma_recv_tokens
);
for
(
int
i
=
0
;
i
<
num_local_experts
;
++
i
)
...
...
csrc/kernels/configs.cuh
View file @
d6104f9c
...
...
@@ -13,8 +13,8 @@
#define FINISHED_SUM_TAG 1024
#define NUM_CPU_TIMEOUT_SECS
2
000
#define NUM_TIMEOUT_CYCLES
3
000000000000ll // 200G cycles ~= 100s
#define NUM_CPU_TIMEOUT_SECS
1
000
#define NUM_TIMEOUT_CYCLES
6
000000000000ll // 200G cycles ~= 100s
#define NUM_WAIT_NANOSECONDS 500
...
...
@@ -68,3 +68,18 @@ template <typename T> inline __host__ __device__ T ALIGN(T a, T b) {
#ifdef __HIP_NO_HALF_OPERATORS__
#undef __HIP_NO_HALF_OPERATORS__
#endif
/// Returns the CPU-side receive timeout, in seconds, used by the dispatch
/// timeout checks.
///
/// The value is read from the `DEEPEP_CPU_TIMEOUT_SECS` environment variable
/// the first time this function is called and is cached for all later calls,
/// so changing the variable afterwards has no effect.
///
/// Falls back to the compile-time default `NUM_CPU_TIMEOUT_SECS` when the
/// variable is unset, empty, not a complete decimal integer (e.g. "30abc"),
/// out of `int` range, or not strictly positive.
///
/// @return the timeout in whole seconds; always the parsed positive value or
///         `NUM_CPU_TIMEOUT_SECS`.
static inline int get_num_cpu_timeout_secs() {
    // Function-local static: the lambda runs exactly once, on first use.
    static const int timeout = []() -> int {
        const char* env = std::getenv("DEEPEP_CPU_TIMEOUT_SECS");
        if (!env || env[0] == '\0') {
            return NUM_CPU_TIMEOUT_SECS;
        }
        try {
            const std::string text(env);
            std::size_t consumed = 0;
            const int value = std::stoi(text, &consumed);
            // Reject trailing garbage: std::stoi stops at the first
            // non-numeric character and would otherwise accept "30abc" as 30.
            if (consumed != text.size()) {
                return NUM_CPU_TIMEOUT_SECS;
            }
            // Reject zero/negative limits: callers compare elapsed seconds
            // with `>` against this value, so a non-positive limit would make
            // the timeout fire almost immediately.
            if (value <= 0) {
                return NUM_CPU_TIMEOUT_SECS;
            }
            return value;
        } catch (...) {
            // std::stoi throws on non-numeric or out-of-range input; a bad
            // environment value must not abort the process, so use the default.
            return NUM_CPU_TIMEOUT_SECS;
        }
    }();
    return timeout;
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment