Unverified Commit f0311a18 authored by Pavel Shamis (Pasha)'s avatar Pavel Shamis (Pasha) Committed by GitHub
Browse files

[UB] Fixing consistency of error messages. (#840)

parent 115a27ef
......@@ -153,7 +153,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) {
if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d] Allreduce Gather: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x,
UB_PRINT("[%d] Allreduce Gather: SM %d [%d]: expecting %d got %d", myrank, blockIdx.x,
threadIdx.x, reduce_id, *flag);
break;
}
......@@ -188,8 +188,8 @@ __global__ void __launch_bounds__(MAX_THREADS)
clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) {
if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d ]Allreduce reduce-scatter:SM %d [%d]:expecting %d got %d", myrank, blockIdx.x,
threadIdx.x, reduce_id, *flag);
UB_PRINT("[%d ]Allreduce reduce-scatter:SM %d [%d]: expecting %d got %d",
myrank, blockIdx.x, threadIdx.x, reduce_id, *flag);
break;
}
}
......@@ -237,7 +237,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) {
if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d] Allreduce gather: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x,
UB_PRINT("[%d] Allreduce gather: SM %d [%d]: expecting %d got %d", myrank, blockIdx.x,
threadIdx.x, reduce_id, *flag);
break;
}
......@@ -300,7 +300,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) {
if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d] Reduce-scatter: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x,
UB_PRINT("[%d] Reduce-scatter: SM %d [%d]: expecting %d got %d", myrank, blockIdx.x,
threadIdx.x, reduce_id, *flag);
break;
}
......@@ -376,7 +376,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) {
if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d] Reduce-scatter: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x,
UB_PRINT("[%d] Reduce-scatter: SM %d [%d]: expecting %d got %d", myrank, blockIdx.x,
threadIdx.x, reduce_id, *flag);
break;
}
......@@ -450,7 +450,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) {
if (clock64() - s > TIMEOUT) {
UB_PRINT("Reduce-scatter: SM %d [%d]:expecting %d got %d", blockIdx.x, threadIdx.x,
UB_PRINT("Reduce-scatter: SM %d [%d]: expecting %d got %d", blockIdx.x, threadIdx.x,
reduce_id, *flag);
break;
}
......@@ -518,7 +518,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) {
if (clock64() - s > 2ull * TIMEOUT) {
UB_PRINT("Allgather: SM %d [%d]:expecting %d got %d", blockIdx.x, threadIdx.x, reduce_id,
UB_PRINT("Allgather: SM %d [%d]: expecting %d got %d", blockIdx.x, threadIdx.x, reduce_id,
*flag);
break;
}
......@@ -554,7 +554,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) {
if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d] Reduce-scatter: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x,
UB_PRINT("[%d] Reduce-scatter: SM %d [%d]: expecting %d got %d", myrank, blockIdx.x,
threadIdx.x, reduce_id, *flag);
break;
}
......@@ -640,7 +640,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) {
if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d] Reduce-scatter: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x,
UB_PRINT("[%d] Reduce-scatter: SM %d [%d]: expecting %d got %d", myrank, blockIdx.x,
threadIdx.x, reduce_id, *flag);
break;
}
......@@ -771,8 +771,8 @@ __global__ void __launch_bounds__(MAX_THREADS)
clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) {
if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d] Allgather: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x, threadIdx.x,
reduce_id, *flag);
UB_PRINT("[%d] Allgather: SM %d [%d]: expecting %d got %d",
myrank, blockIdx.x, threadIdx.x, reduce_id, *flag);
break;
}
}
......@@ -837,7 +837,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) {
if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d] Reduce-scatter: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x,
UB_PRINT("[%d] Reduce-scatter: SM %d [%d]: expecting %d got %d", myrank, blockIdx.x,
threadIdx.x, reduce_id, *flag);
break;
}
......@@ -926,7 +926,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) {
if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d] Reduce-scatter: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x,
UB_PRINT("[%d] Reduce-scatter: SM %d [%d]: expecting %d got %d", myrank, blockIdx.x,
threadIdx.x, reduce_id, *flag);
break;
}
......@@ -1014,7 +1014,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) {
if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d] Reduce-scatter: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x,
UB_PRINT("[%d] Reduce-scatter: SM %d [%d]: expecting %d got %d", myrank, blockIdx.x,
threadIdx.x, reduce_id, *flag);
break;
}
......@@ -1109,7 +1109,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) {
if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d] Reduce-scatter: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x,
UB_PRINT("[%d] Reduce-scatter: SM %d [%d]: expecting %d got %d", myrank, blockIdx.x,
threadIdx.x, reduce_id, *flag);
break;
}
......@@ -1205,7 +1205,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) {
if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d] Reduce-scatter: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x,
UB_PRINT("[%d] Reduce-scatter: SM %d [%d]: expecting %d got %d", myrank, blockIdx.x,
threadIdx.x, reduce_id, *flag);
break;
}
......@@ -1324,7 +1324,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) {
if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d] Allgather: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x, threadIdx.x,
UB_PRINT("[%d] Allgather: SM %d [%d]: expecting %d got %d", myrank, blockIdx.x, threadIdx.x,
reduce_id, *flag);
break;
}
......@@ -1422,7 +1422,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
clock_t s = clock64();
while (CHECK_IDS(*flag, reduce_id)) {
if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("[%d] Allgather: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x, threadIdx.x,
UB_PRINT("[%d] Allgather: SM %d [%d]: expecting %d got %d", myrank, blockIdx.x, threadIdx.x,
reduce_id, *flag);
break;
}
......@@ -2004,7 +2004,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
clock_t s = clock64();
while (CHECK_IDS(*flag, signal_id)) {
if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("pullrecv [grank dst:%d global src:%d][nvrank(GPU) dst: %d src: %d]: expected %d,"
UB_PRINT("pullrecv [grank dst:%d global src:%d][nvrank(GPU) dst: %d src: %d]: expecting %d,"
" observed %d", myrank, peer, nvrank, nvpeer, signal_id, *flag);
break;
}
......@@ -2078,8 +2078,8 @@ __global__ void kuserbuffers_pushrecv(int myrank, int peer, int nvrank, int nvpe
clock_t s = clock64();
while (CHECK_IDS(*flag, signal_id)) {
if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("pushrecv [grank dst:%d global src:%d][nvrank(GPU) dst: %d src: %d] : "
"expected %d, observed %d", myrank, peer, nvrank, nvpeer, signal_id, *flag);
UB_PRINT("pushrecv [grank dst:%d global src:%d][nvrank(GPU) dst: %d src: %d]: "
"expecting %d, observed %d", myrank, peer, nvrank, nvpeer, signal_id, *flag);
if (CHECK_CE(ce_start_ptr, ce_end_ptr))
UB_PRINT("pushrecv: CE deadlock DETECTED: %d (ce_start) != %d (ce_end)\n",
*ce_start_ptr, *ce_end_ptr);
......@@ -2135,7 +2135,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
while (CHECK_IDS(*flag, signal_id)) {
if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("pushsendrecv [sending peer:%d receiving peer:%d][nvrank(GPU) sending peer: %d"
" receiving peer: %d]: expected %d, observed %d",
" receiving peer: %d]: expecting %d, observed %d",
send_peer, recv_peer, nv_send, nv_recv, signal_id, *flag);
if (CHECK_CE(ce_start_ptr, ce_end_ptr))
UB_PRINT("pushrecv: CE deadlock DETECTED: %d (ce_start) != %d (ce_end)\n",
......@@ -2192,7 +2192,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
while (CHECK_IDS(*flag, signal_id)) {
if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("pushsendrecv atomic [sending peer:%d receiving peer:%d][nvrank(GPU) sending peer:"
" %d receiving peer: %d]: expected %d, observed %d",
" %d receiving peer: %d]: expecting %d, observed %d",
send_peer, recv_peer, nv_send, nv_recv, signal_id, *flag); /*return;*/
if (CHECK_CE(ce_start_ptr, ce_end_ptr))
UB_PRINT("pushsendrecv atomic: CE deadlock DETECTED: %d (ce_start) != %d (ce_end)\n",
......@@ -2263,7 +2263,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
while (CHECK_IDS(*flag, signal_id)) {
if (CHECK_TIMEOUT(s, ub_timeout)) {
UB_PRINT("pushsendrecv multiatomic [sending peer:%d receiving peer:%d][nvrank(GPU)"
" sending peer: %d receiving peer: %d]: expected %d, observed %d",
" sending peer: %d receiving peer: %d]: expecting %d, observed %d",
send_peer, recv_peer, nv_send, nv_recv, signal_id, *flag); /*return;*/
// CE mode is not supported for multi-atomic, so there is no need to check for a deadlock
return;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment