Commit d6104f9c authored by lijian
Browse files

feat: Make DeepEP CPU timeout configurable and increase default timeout.


Signed-off-by: lijian <lijina6@sugon.com>
parent 0494e395
......@@ -551,7 +551,7 @@ Buffer::intranode_dispatch(
// Timeout check
if (std::chrono::duration_cast<std::chrono::seconds>(
std::chrono::high_resolution_clock::now() - start_time)
.count() > NUM_CPU_TIMEOUT_SECS)
.count() > get_num_cpu_timeout_secs())
throw std::runtime_error("DeepEP error: CPU recv timeout");
}
num_recv_tokens_per_expert_list = std::vector<int>(
......@@ -992,7 +992,7 @@ Buffer::internode_dispatch(const torch::Tensor &x, const std::optional<torch::Te
// Timeout check
if (std::chrono::duration_cast<std::chrono::seconds>(
std::chrono::high_resolution_clock::now() - start_time)
.count() > NUM_CPU_TIMEOUT_SECS) {
.count() > get_num_cpu_timeout_secs()) {
printf("Global rank: %d, num_recv_tokens: %d, num_rdma_recv_tokens: %d\n", rank,
num_recv_tokens, num_rdma_recv_tokens);
for (int i = 0; i < num_local_experts; ++i)
......
......@@ -13,8 +13,8 @@
#define FINISHED_SUM_TAG 1024
// Default CPU-side timeout in seconds; used as the fallback of get_num_cpu_timeout_secs().
#define NUM_CPU_TIMEOUT_SECS 2000
#define NUM_TIMEOUT_CYCLES 3000000000000ll // 3T cycles. NOTE(review): the previous "200G cycles ~= 100s" note does not match this value; confirm the intended duration
// Default CPU-side timeout in seconds; used as the fallback of get_num_cpu_timeout_secs().
#define NUM_CPU_TIMEOUT_SECS 1000
#define NUM_TIMEOUT_CYCLES 6000000000000ll // 6T cycles. NOTE(review): the previous "200G cycles ~= 100s" note does not match this value; confirm the intended duration
#define NUM_WAIT_NANOSECONDS 500
......@@ -68,3 +68,18 @@ template <typename T> inline __host__ __device__ T ALIGN(T a, T b) {
// If __HIP_NO_HALF_OPERATORS__ was defined earlier (e.g. by build flags or a prior header), remove it.
#ifdef __HIP_NO_HALF_OPERATORS__
#undef __HIP_NO_HALF_OPERATORS__
#endif
/**
 * Returns the CPU-side timeout, in seconds, used by the host polling loops.
 *
 * The value is taken from the DEEPEP_CPU_TIMEOUT_SECS environment variable. It is
 * read once, on the first call, and cached in a function-local static (one copy per
 * translation unit, because the function is `static`).
 *
 * NUM_CPU_TIMEOUT_SECS is returned instead when the variable is:
 *   - unset or empty,
 *   - not a complete base-10 integer (e.g. "abc", "10abc", "1e4"),
 *   - outside the range of int,
 *   - negative.
 *
 * @return the configured timeout in seconds, or NUM_CPU_TIMEOUT_SECS as fallback.
 */
static inline int get_num_cpu_timeout_secs() {
    static const int timeout = []() -> int {
        const char *env = std::getenv("DEEPEP_CPU_TIMEOUT_SECS");
        if (!env || env[0] == '\0') {
            return NUM_CPU_TIMEOUT_SECS;
        }
        try {
            std::size_t parsed_len = 0;
            const int value = std::stoi(env, &parsed_len);
            // std::stoi stops at the first character it cannot parse, so without this
            // check "10abc" would silently be accepted as 10.
            if (env[parsed_len] != '\0') {
                return NUM_CPU_TIMEOUT_SECS;
            }
            // Callers test `elapsed_seconds > timeout`; a negative value would make
            // that true on the very first check and abort every wait immediately.
            if (value < 0) {
                return NUM_CPU_TIMEOUT_SECS;
            }
            return value;
        } catch (...) {
            // std::stoi throws on non-numeric or out-of-range input; a bad setting
            // deliberately degrades to the built-in default instead of aborting.
            return NUM_CPU_TIMEOUT_SECS;
        }
    }();
    return timeout;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment