[UB] Fixing consistency of error messages. (#840)

f0311a18 · Pavel Shamis (Pasha) · GitHub · 115a27ef · f0311a18
Unverified commit f0311a18, authored May 20, 2024 by Pavel Shamis (Pasha); committed by GitHub on May 20, 2024.
Hide whitespace changes
Inline Side-by-side

Showing with 25 additions and 25 deletions

transformer_engine/pytorch/csrc/userbuffers/userbuffers.cu: +25 -25

No files found.
--- a/transformer_engine/pytorch/csrc/userbuffers/userbuffers.cu
+++ b/transformer_engine/pytorch/csrc/userbuffers/userbuffers.cu
@@ -153,7 +153,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
    clock_t s = clock64();
    while (CHECK_IDS(*flag, reduce_id)) {
      if (CHECK_TIMEOUT(s, ub_timeout)) {
-        UB_PRINT("[%d] Allreduce Gather: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x,
+        UB_PRINT("[%d] Allreduce Gather: SM %d [%d]: expecting %d got %d", myrank, blockIdx.x,
                  threadIdx.x, reduce_id, *flag);
        break;
      }
@@ -188,8 +188,8 @@ __global__ void __launch_bounds__(MAX_THREADS)
    clock_t s = clock64();
    while (CHECK_IDS(*flag, reduce_id)) {
      if (CHECK_TIMEOUT(s, ub_timeout)) {
-        UB_PRINT("[%d ]Allreduce reduce-scatter:SM %d [%d]:expecting %d got %d", myrank, blockIdx.x,
+        UB_PRINT("[%d ]Allreduce reduce-scatter:SM %d [%d]: expecting %d got %d",
-                  threadIdx.x, reduce_id, *flag);
+                 myrank, blockIdx.x, threadIdx.x, reduce_id, *flag);
        break;
      }
    }
@@ -237,7 +237,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
    clock_t s = clock64();
    while (CHECK_IDS(*flag, reduce_id)) {
      if (CHECK_TIMEOUT(s, ub_timeout)) {
-        UB_PRINT("[%d] Allreduce gather: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x,
+        UB_PRINT("[%d] Allreduce gather: SM %d [%d]: expecting %d got %d", myrank, blockIdx.x,
                  threadIdx.x, reduce_id, *flag);
        break;
      }
@@ -300,7 +300,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
    clock_t s = clock64();
    while (CHECK_IDS(*flag, reduce_id)) {
      if (CHECK_TIMEOUT(s, ub_timeout)) {
-        UB_PRINT("[%d] Reduce-scatter: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x,
+        UB_PRINT("[%d] Reduce-scatter: SM %d [%d]: expecting %d got %d", myrank, blockIdx.x,
                  threadIdx.x, reduce_id, *flag);
        break;
      }
@@ -376,7 +376,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
    clock_t s = clock64();
    while (CHECK_IDS(*flag, reduce_id)) {
      if (CHECK_TIMEOUT(s, ub_timeout)) {
-        UB_PRINT("[%d] Reduce-scatter: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x,
+        UB_PRINT("[%d] Reduce-scatter: SM %d [%d]: expecting %d got %d", myrank, blockIdx.x,
                  threadIdx.x, reduce_id, *flag);
        break;
      }
@@ -450,7 +450,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
    clock_t s = clock64();
    while (CHECK_IDS(*flag, reduce_id)) {
      if (clock64() - s > TIMEOUT) {
-        UB_PRINT("Reduce-scatter: SM %d [%d]:expecting %d got %d", blockIdx.x, threadIdx.x,
+        UB_PRINT("Reduce-scatter: SM %d [%d]: expecting %d got %d", blockIdx.x, threadIdx.x,
                  reduce_id, *flag);
        break;
      }
@@ -518,7 +518,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
    clock_t s = clock64();
    while (CHECK_IDS(*flag, reduce_id)) {
      if (clock64() - s > 2ull * TIMEOUT) {
-        UB_PRINT("Allgather: SM %d [%d]:expecting %d got %d", blockIdx.x, threadIdx.x, reduce_id,
+        UB_PRINT("Allgather: SM %d [%d]: expecting %d got %d", blockIdx.x, threadIdx.x, reduce_id,
               *flag);
        break;
      }
@@ -554,7 +554,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
    clock_t s = clock64();
    while (CHECK_IDS(*flag, reduce_id)) {
       if (CHECK_TIMEOUT(s, ub_timeout)) {
-         UB_PRINT("[%d] Reduce-scatter: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x,
+         UB_PRINT("[%d] Reduce-scatter: SM %d [%d]: expecting %d got %d", myrank, blockIdx.x,
                 threadIdx.x, reduce_id, *flag);
         break;
      }
@@ -640,7 +640,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
    clock_t s = clock64();
    while (CHECK_IDS(*flag, reduce_id)) {
      if (CHECK_TIMEOUT(s, ub_timeout)) {
-        UB_PRINT("[%d] Reduce-scatter: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x,
+        UB_PRINT("[%d] Reduce-scatter: SM %d [%d]: expecting %d got %d", myrank, blockIdx.x,
                  threadIdx.x, reduce_id, *flag);
        break;
      }
@@ -771,8 +771,8 @@ __global__ void __launch_bounds__(MAX_THREADS)
    clock_t s = clock64();
    while (CHECK_IDS(*flag, reduce_id)) {
       if (CHECK_TIMEOUT(s, ub_timeout)) {
-         UB_PRINT("[%d] Allgather: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x, threadIdx.x,
+         UB_PRINT("[%d] Allgather: SM %d [%d]: expecting %d got %d",
-                  reduce_id, *flag);
+                   myrank, blockIdx.x, threadIdx.x, reduce_id, *flag);
         break;
      }
    }
@@ -837,7 +837,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
    clock_t s = clock64();
    while (CHECK_IDS(*flag, reduce_id)) {
      if (CHECK_TIMEOUT(s, ub_timeout)) {
-        UB_PRINT("[%d] Reduce-scatter: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x,
+        UB_PRINT("[%d] Reduce-scatter: SM %d [%d]: expecting %d got %d", myrank, blockIdx.x,
               threadIdx.x, reduce_id, *flag);
        break;
      }
@@ -926,7 +926,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
      clock_t s = clock64();
      while (CHECK_IDS(*flag, reduce_id)) {
        if (CHECK_TIMEOUT(s, ub_timeout)) {
-          UB_PRINT("[%d] Reduce-scatter: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x,
+          UB_PRINT("[%d] Reduce-scatter: SM %d [%d]: expecting %d got %d", myrank, blockIdx.x,
                 threadIdx.x, reduce_id, *flag);
          break;
        }
@@ -1014,7 +1014,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
    clock_t s = clock64();
    while (CHECK_IDS(*flag, reduce_id)) {
      if (CHECK_TIMEOUT(s, ub_timeout)) {
-        UB_PRINT("[%d] Reduce-scatter: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x,
+        UB_PRINT("[%d] Reduce-scatter: SM %d [%d]: expecting %d got %d", myrank, blockIdx.x,
               threadIdx.x, reduce_id, *flag);
        break;
      }
@@ -1109,7 +1109,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
    clock_t s = clock64();
    while (CHECK_IDS(*flag, reduce_id)) {
      if (CHECK_TIMEOUT(s, ub_timeout)) {
-        UB_PRINT("[%d] Reduce-scatter: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x,
+        UB_PRINT("[%d] Reduce-scatter: SM %d [%d]: expecting %d got %d", myrank, blockIdx.x,
               threadIdx.x, reduce_id, *flag);
        break;
      }
@@ -1205,7 +1205,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
      clock_t s = clock64();
      while (CHECK_IDS(*flag, reduce_id)) {
        if (CHECK_TIMEOUT(s, ub_timeout)) {
-          UB_PRINT("[%d] Reduce-scatter: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x,
+          UB_PRINT("[%d] Reduce-scatter: SM %d [%d]: expecting %d got %d", myrank, blockIdx.x,
                 threadIdx.x, reduce_id, *flag);
          break;
        }
@@ -1324,7 +1324,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
    clock_t s = clock64();
    while (CHECK_IDS(*flag, reduce_id)) {
      if (CHECK_TIMEOUT(s, ub_timeout)) {
-        UB_PRINT("[%d] Allgather: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x, threadIdx.x,
+        UB_PRINT("[%d] Allgather: SM %d [%d]: expecting %d got %d", myrank, blockIdx.x, threadIdx.x,
                  reduce_id, *flag);
        break;
      }
@@ -1422,7 +1422,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
    clock_t s = clock64();
    while (CHECK_IDS(*flag, reduce_id)) {
      if (CHECK_TIMEOUT(s, ub_timeout)) {
-        UB_PRINT("[%d] Allgather: SM %d [%d]:expecting %d got %d", myrank, blockIdx.x, threadIdx.x,
+        UB_PRINT("[%d] Allgather: SM %d [%d]: expecting %d got %d", myrank, blockIdx.x, threadIdx.x,
                 reduce_id, *flag);
        break;
      }
@@ -2004,7 +2004,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
    clock_t s = clock64();
    while (CHECK_IDS(*flag, signal_id)) {
      if (CHECK_TIMEOUT(s, ub_timeout)) {
-        UB_PRINT("pullrecv [grank dst:%d global src:%d][nvrank(GPU) dst: %d src: %d]: expected %d,"
+        UB_PRINT("pullrecv [grank dst:%d global src:%d][nvrank(GPU) dst: %d src: %d]: expecting %d,"
                 " observed %d", myrank, peer, nvrank, nvpeer, signal_id, *flag);
        break;
      }
@@ -2078,8 +2078,8 @@ __global__ void kuserbuffers_pushrecv(int myrank, int peer, int nvrank, int nvpe
  clock_t s = clock64();
  while (CHECK_IDS(*flag, signal_id)) {
    if (CHECK_TIMEOUT(s, ub_timeout)) {
-      UB_PRINT("pushrecv [grank dst:%d global src:%d][nvrank(GPU) dst: %d src: %d] : "
+      UB_PRINT("pushrecv [grank dst:%d global src:%d][nvrank(GPU) dst: %d src: %d]: "
-               "expected %d, observed %d", myrank, peer, nvrank, nvpeer, signal_id, *flag);
+               "expecting %d, observed %d", myrank, peer, nvrank, nvpeer, signal_id, *flag);
      if (CHECK_CE(ce_start_ptr, ce_end_ptr))
        UB_PRINT("pushrecv: CE deadlock DETECTED: %d (ce_start) != %d (ce_end)\n",
                 *ce_start_ptr, *ce_end_ptr);
@@ -2135,7 +2135,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
    while (CHECK_IDS(*flag, signal_id)) {
      if (CHECK_TIMEOUT(s, ub_timeout)) {
        UB_PRINT("pushsendrecv [sending peer:%d receiving peer:%d][nvrank(GPU) sending peer: %d"
-                 " receiving peer: %d]: expected %d, observed %d",
+                 " receiving peer: %d]: expecting %d, observed %d",
                 send_peer, recv_peer, nv_send, nv_recv, signal_id, *flag);
        if (CHECK_CE(ce_start_ptr, ce_end_ptr))
          UB_PRINT("pushrecv: CE deadlock DETECTED: %d (ce_start) != %d (ce_end)\n",
@@ -2192,7 +2192,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
    while (CHECK_IDS(*flag, signal_id)) {
      if (CHECK_TIMEOUT(s, ub_timeout)) {
        UB_PRINT("pushsendrecv atomic [sending peer:%d receiving peer:%d][nvrank(GPU) sending peer:"
-                 " %d receiving peer: %d]: expected %d, observed %d",
+                 " %d receiving peer: %d]: expecting %d, observed %d",
                 send_peer, recv_peer, nv_send, nv_recv, signal_id, *flag); /*return;*/
        if (CHECK_CE(ce_start_ptr, ce_end_ptr))
          UB_PRINT("pushsendrecv atomic: CE deadlock DETECTED: %d (ce_start) != %d (ce_end)\n",
@@ -2263,7 +2263,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
      while (CHECK_IDS(*flag, signal_id)) {
        if (CHECK_TIMEOUT(s, ub_timeout)) {
          UB_PRINT("pushsendrecv multiatomic [sending peer:%d receiving peer:%d][nvrank(GPU)"
-                   " sending peer: %d receiving peer: %d]: expected %d, observed %d",
+                   " sending peer: %d receiving peer: %d]: expecting %d, observed %d",
                   send_peer, recv_peer, nv_send, nv_recv, signal_id, *flag); /*return;*/
          // CE mode is not supported for multi-atomic, so there is no need to check for a deadlock
          return;