"...git@developer.sourcefind.cn:kecinstone/2024-pra-vllm.git" did not exist on "298695b76691867ecd320ea6a2c6d0c6a843d5ae"
Unverified Commit f0311a18 authored by Pavel Shamis (Pasha)'s avatar Pavel Shamis (Pasha) Committed by GitHub
Browse files

[UB] Fixing consistency of error messages. (#840)

parent 115a27ef
...@@ -153,7 +153,7 @@ __global__ void __launch_bounds__(MAX_THREADS) ...@@ -153,7 +153,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
clock_t s = clock64(); clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) { while (CHECK_IDS(*flag, reduce_id)) {
if (CHECK_TIMEOUT(s, ub_timeout)) { if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d] Allreduce Gather: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x, UB_PRINT("[%d] Allreduce Gather: SM %d [%d]: expecting %d got %d", myrank, blockIdx.x,
threadIdx.x, reduce_id, *flag); threadIdx.x, reduce_id, *flag);
break; break;
} }
...@@ -188,8 +188,8 @@ __global__ void __launch_bounds__(MAX_THREADS) ...@@ -188,8 +188,8 @@ __global__ void __launch_bounds__(MAX_THREADS)
clock_t s = clock64(); clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) { while (CHECK_IDS(*flag, reduce_id)) {
if (CHECK_TIMEOUT(s, ub_timeout)) { if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d ]Allreduce reduce-scatter:SM %d [%d]:expecting %d got %d", myrank, blockIdx.x, UB_PRINT("[%d ]Allreduce reduce-scatter:SM %d [%d]: expecting %d got %d",
threadIdx.x, reduce_id, *flag); myrank, blockIdx.x, threadIdx.x, reduce_id, *flag);
break; break;
} }
} }
...@@ -237,7 +237,7 @@ __global__ void __launch_bounds__(MAX_THREADS) ...@@ -237,7 +237,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
clock_t s = clock64(); clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) { while (CHECK_IDS(*flag, reduce_id)) {
if (CHECK_TIMEOUT(s, ub_timeout)) { if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d] Allreduce gather: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x, UB_PRINT("[%d] Allreduce gather: SM %d [%d]: expecting %d got %d", myrank, blockIdx.x,
threadIdx.x, reduce_id, *flag); threadIdx.x, reduce_id, *flag);
break; break;
} }
...@@ -300,7 +300,7 @@ __global__ void __launch_bounds__(MAX_THREADS) ...@@ -300,7 +300,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
clock_t s = clock64(); clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) { while (CHECK_IDS(*flag, reduce_id)) {
if (CHECK_TIMEOUT(s, ub_timeout)) { if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d] Reduce-scatter: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x, UB_PRINT("[%d] Reduce-scatter: SM %d [%d]: expecting %d got %d", myrank, blockIdx.x,
threadIdx.x, reduce_id, *flag); threadIdx.x, reduce_id, *flag);
break; break;
} }
...@@ -376,7 +376,7 @@ __global__ void __launch_bounds__(MAX_THREADS) ...@@ -376,7 +376,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
clock_t s = clock64(); clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) { while (CHECK_IDS(*flag, reduce_id)) {
if (CHECK_TIMEOUT(s, ub_timeout)) { if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d] Reduce-scatter: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x, UB_PRINT("[%d] Reduce-scatter: SM %d [%d]: expecting %d got %d", myrank, blockIdx.x,
threadIdx.x, reduce_id, *flag); threadIdx.x, reduce_id, *flag);
break; break;
} }
...@@ -450,7 +450,7 @@ __global__ void __launch_bounds__(MAX_THREADS) ...@@ -450,7 +450,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
clock_t s = clock64(); clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) { while (CHECK_IDS(*flag, reduce_id)) {
if (clock64() - s > TIMEOUT) { if (clock64() - s > TIMEOUT) {
UB_PRINT("Reduce-scatter: SM %d [%d]:expecting %d got %d", blockIdx.x, threadIdx.x, UB_PRINT("Reduce-scatter: SM %d [%d]: expecting %d got %d", blockIdx.x, threadIdx.x,
reduce_id, *flag); reduce_id, *flag);
break; break;
} }
...@@ -518,7 +518,7 @@ __global__ void __launch_bounds__(MAX_THREADS) ...@@ -518,7 +518,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
clock_t s = clock64(); clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) { while (CHECK_IDS(*flag, reduce_id)) {
if (clock64() - s > 2ull * TIMEOUT) { if (clock64() - s > 2ull * TIMEOUT) {
UB_PRINT("Allgather: SM %d [%d]:expecting %d got %d", blockIdx.x, threadIdx.x, reduce_id, UB_PRINT("Allgather: SM %d [%d]: expecting %d got %d", blockIdx.x, threadIdx.x, reduce_id,
*flag); *flag);
break; break;
} }
...@@ -554,7 +554,7 @@ __global__ void __launch_bounds__(MAX_THREADS) ...@@ -554,7 +554,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
clock_t s = clock64(); clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) { while (CHECK_IDS(*flag, reduce_id)) {
if (CHECK_TIMEOUT(s, ub_timeout)) { if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d] Reduce-scatter: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x, UB_PRINT("[%d] Reduce-scatter: SM %d [%d]: expecting %d got %d", myrank, blockIdx.x,
threadIdx.x, reduce_id, *flag); threadIdx.x, reduce_id, *flag);
break; break;
} }
...@@ -640,7 +640,7 @@ __global__ void __launch_bounds__(MAX_THREADS) ...@@ -640,7 +640,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
clock_t s = clock64(); clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) { while (CHECK_IDS(*flag, reduce_id)) {
if (CHECK_TIMEOUT(s, ub_timeout)) { if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d] Reduce-scatter: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x, UB_PRINT("[%d] Reduce-scatter: SM %d [%d]: expecting %d got %d", myrank, blockIdx.x,
threadIdx.x, reduce_id, *flag); threadIdx.x, reduce_id, *flag);
break; break;
} }
...@@ -771,8 +771,8 @@ __global__ void __launch_bounds__(MAX_THREADS) ...@@ -771,8 +771,8 @@ __global__ void __launch_bounds__(MAX_THREADS)
clock_t s = clock64(); clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) { while (CHECK_IDS(*flag, reduce_id)) {
if (CHECK_TIMEOUT(s, ub_timeout)) { if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d] Allgather: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x, threadIdx.x, UB_PRINT("[%d] Allgather: SM %d [%d]: expecting %d got %d",
reduce_id, *flag); myrank, blockIdx.x, threadIdx.x, reduce_id, *flag);
break; break;
} }
} }
...@@ -837,7 +837,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_ ...@@ -837,7 +837,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
clock_t s = clock64(); clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) { while (CHECK_IDS(*flag, reduce_id)) {
if (CHECK_TIMEOUT(s, ub_timeout)) { if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d] Reduce-scatter: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x, UB_PRINT("[%d] Reduce-scatter: SM %d [%d]: expecting %d got %d", myrank, blockIdx.x,
threadIdx.x, reduce_id, *flag); threadIdx.x, reduce_id, *flag);
break; break;
} }
...@@ -926,7 +926,7 @@ __global__ void __launch_bounds__(MAX_THREADS) ...@@ -926,7 +926,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
clock_t s = clock64(); clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) { while (CHECK_IDS(*flag, reduce_id)) {
if (CHECK_TIMEOUT(s, ub_timeout)) { if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d] Reduce-scatter: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x, UB_PRINT("[%d] Reduce-scatter: SM %d [%d]: expecting %d got %d", myrank, blockIdx.x,
threadIdx.x, reduce_id, *flag); threadIdx.x, reduce_id, *flag);
break; break;
} }
...@@ -1014,7 +1014,7 @@ __global__ void __launch_bounds__(MAX_THREADS) ...@@ -1014,7 +1014,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
clock_t s = clock64(); clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) { while (CHECK_IDS(*flag, reduce_id)) {
if (CHECK_TIMEOUT(s, ub_timeout)) { if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d] Reduce-scatter: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x, UB_PRINT("[%d] Reduce-scatter: SM %d [%d]: expecting %d got %d", myrank, blockIdx.x,
threadIdx.x, reduce_id, *flag); threadIdx.x, reduce_id, *flag);
break; break;
} }
...@@ -1109,7 +1109,7 @@ __global__ void __launch_bounds__(MAX_THREADS) ...@@ -1109,7 +1109,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
clock_t s = clock64(); clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) { while (CHECK_IDS(*flag, reduce_id)) {
if (CHECK_TIMEOUT(s, ub_timeout)) { if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d] Reduce-scatter: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x, UB_PRINT("[%d] Reduce-scatter: SM %d [%d]: expecting %d got %d", myrank, blockIdx.x,
threadIdx.x, reduce_id, *flag); threadIdx.x, reduce_id, *flag);
break; break;
} }
...@@ -1205,7 +1205,7 @@ __global__ void __launch_bounds__(MAX_THREADS) ...@@ -1205,7 +1205,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
clock_t s = clock64(); clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) { while (CHECK_IDS(*flag, reduce_id)) {
if (CHECK_TIMEOUT(s, ub_timeout)) { if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d] Reduce-scatter: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x, UB_PRINT("[%d] Reduce-scatter: SM %d [%d]: expecting %d got %d", myrank, blockIdx.x,
threadIdx.x, reduce_id, *flag); threadIdx.x, reduce_id, *flag);
break; break;
} }
...@@ -1324,7 +1324,7 @@ __global__ void __launch_bounds__(MAX_THREADS) ...@@ -1324,7 +1324,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
clock_t s = clock64(); clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) { while (CHECK_IDS(*flag, reduce_id)) {
if (CHECK_TIMEOUT(s, ub_timeout)) { if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d] Allgather: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x, threadIdx.x, UB_PRINT("[%d] Allgather: SM %d [%d]: expecting %d got %d", myrank, blockIdx.x, threadIdx.x,
reduce_id, *flag); reduce_id, *flag);
break; break;
} }
...@@ -1422,7 +1422,7 @@ __global__ void __launch_bounds__(MAX_THREADS) ...@@ -1422,7 +1422,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
clock_t s = clock64(); clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) { while (CHECK_IDS(*flag, reduce_id)) {
if (CHECK_TIMEOUT(s, ub_timeout)) { if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d] Allgather: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x, threadIdx.x, UB_PRINT("[%d] Allgather: SM %d [%d]: expecting %d got %d", myrank, blockIdx.x, threadIdx.x,
reduce_id, *flag); reduce_id, *flag);
break; break;
} }
...@@ -2004,7 +2004,7 @@ __global__ void __launch_bounds__(MAX_THREADS) ...@@ -2004,7 +2004,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
clock_t s = clock64(); clock_t s = clock64();
while (CHECK_IDS(*flag, signal_id)) { while (CHECK_IDS(*flag, signal_id)) {
if (CHECK_TIMEOUT(s, ub_timeout)) { if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("pullrecv [grank dst:%d global src:%d][nvrank(GPU) dst: %d src: %d]: expected %d," UB_PRINT("pullrecv [grank dst:%d global src:%d][nvrank(GPU) dst: %d src: %d]: expecting %d,"
" observed %d", myrank, peer, nvrank, nvpeer, signal_id, *flag); " observed %d", myrank, peer, nvrank, nvpeer, signal_id, *flag);
break; break;
} }
...@@ -2078,8 +2078,8 @@ __global__ void kuserbuffers_pushrecv(int myrank, int peer, int nvrank, int nvpe ...@@ -2078,8 +2078,8 @@ __global__ void kuserbuffers_pushrecv(int myrank, int peer, int nvrank, int nvpe
clock_t s = clock64(); clock_t s = clock64();
while (CHECK_IDS(*flag, signal_id)) { while (CHECK_IDS(*flag, signal_id)) {
if (CHECK_TIMEOUT(s, ub_timeout)) { if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("pushrecv [grank dst:%d global src:%d][nvrank(GPU) dst: %d src: %d] : " UB_PRINT("pushrecv [grank dst:%d global src:%d][nvrank(GPU) dst: %d src: %d]: "
"expected %d, observed %d", myrank, peer, nvrank, nvpeer, signal_id, *flag); "expecting %d, observed %d", myrank, peer, nvrank, nvpeer, signal_id, *flag);
if (CHECK_CE(ce_start_ptr, ce_end_ptr)) if (CHECK_CE(ce_start_ptr, ce_end_ptr))
UB_PRINT("pushrecv: CE deadlock DETECTED: %d (ce_start) != %d (ce_end)\n", UB_PRINT("pushrecv: CE deadlock DETECTED: %d (ce_start) != %d (ce_end)\n",
*ce_start_ptr, *ce_end_ptr); *ce_start_ptr, *ce_end_ptr);
...@@ -2135,7 +2135,7 @@ __global__ void __launch_bounds__(MAX_THREADS) ...@@ -2135,7 +2135,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
while (CHECK_IDS(*flag, signal_id)) { while (CHECK_IDS(*flag, signal_id)) {
if (CHECK_TIMEOUT(s, ub_timeout)) { if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("pushsendrecv [sending peer:%d receiving peer:%d][nvrank(GPU) sending peer: %d" UB_PRINT("pushsendrecv [sending peer:%d receiving peer:%d][nvrank(GPU) sending peer: %d"
" receiving peer: %d]: expected %d, observed %d", " receiving peer: %d]: expecting %d, observed %d",
send_peer, recv_peer, nv_send, nv_recv, signal_id, *flag); send_peer, recv_peer, nv_send, nv_recv, signal_id, *flag);
if (CHECK_CE(ce_start_ptr, ce_end_ptr)) if (CHECK_CE(ce_start_ptr, ce_end_ptr))
UB_PRINT("pushrecv: CE deadlock DETECTED: %d (ce_start) != %d (ce_end)\n", UB_PRINT("pushrecv: CE deadlock DETECTED: %d (ce_start) != %d (ce_end)\n",
...@@ -2192,7 +2192,7 @@ __global__ void __launch_bounds__(MAX_THREADS) ...@@ -2192,7 +2192,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
while (CHECK_IDS(*flag, signal_id)) { while (CHECK_IDS(*flag, signal_id)) {
if (CHECK_TIMEOUT(s, ub_timeout)) { if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("pushsendrecv atomic [sending peer:%d receiving peer:%d][nvrank(GPU) sending peer:" UB_PRINT("pushsendrecv atomic [sending peer:%d receiving peer:%d][nvrank(GPU) sending peer:"
" %d receiving peer: %d]: expected %d, observed %d", " %d receiving peer: %d]: expecting %d, observed %d",
send_peer, recv_peer, nv_send, nv_recv, signal_id, *flag); /*return;*/ send_peer, recv_peer, nv_send, nv_recv, signal_id, *flag); /*return;*/
if (CHECK_CE(ce_start_ptr, ce_end_ptr)) if (CHECK_CE(ce_start_ptr, ce_end_ptr))
UB_PRINT("pushsendrecv atomic: CE deadlock DETECTED: %d (ce_start) != %d (ce_end)\n", UB_PRINT("pushsendrecv atomic: CE deadlock DETECTED: %d (ce_start) != %d (ce_end)\n",
...@@ -2263,7 +2263,7 @@ __global__ void __launch_bounds__(MAX_THREADS) ...@@ -2263,7 +2263,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
while (CHECK_IDS(*flag, signal_id)) { while (CHECK_IDS(*flag, signal_id)) {
if (CHECK_TIMEOUT(s, ub_timeout)) { if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("pushsendrecv multiatomic [sending peer:%d receiving peer:%d][nvrank(GPU)" UB_PRINT("pushsendrecv multiatomic [sending peer:%d receiving peer:%d][nvrank(GPU)"
" sending peer: %d receiving peer: %d]: expected %d, observed %d", " sending peer: %d receiving peer: %d]: expecting %d, observed %d",
send_peer, recv_peer, nv_send, nv_recv, signal_id, *flag); /*return;*/ send_peer, recv_peer, nv_send, nv_recv, signal_id, *flag); /*return;*/
// CE mode is not supported for multi-atomic, so there is no need to check for a deadlock // CE mode is not supported for multi-atomic, so there is no need to check for a deadlock
return; return;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment