Commit 9d731777 authored by Michael Carilli

Clean up race condition test; still need to figure out a clean way to create distributed unit tests

parent fa183ee8
@@ -34,24 +34,31 @@ class Model(Module):
         return (input*self.a)*self.b
 
 model = DDP(Model(), message_size=1)
 # model = DDP(Model(), delay_allreduce=True)
 
 x = torch.cuda.FloatTensor(4096*4096)
+
+passed = True
 for i in range(10):
     x.fill_(i + args.local_rank) # fill x with new values every iteration for sanity
     model.zero_grad()
     out = model(x)
     loss = out.sum()
-    torch.cuda.nvtx.range_push("backward")
+    # torch.cuda.nvtx.range_push("backward")
     loss.backward()
-    torch.cuda.nvtx.range_pop()
+    # torch.cuda.nvtx.range_pop()
 
-    torch.cuda.nvtx.range_push("synchronize() + info")
+    # torch.cuda.nvtx.range_push("synchronize() + info")
     # torch.cuda.synchronize()
     print("i = {}".format(i))
     def info(name, param, val):
+        expected = val*4096*4096*(2.*i+1)/2.
+        actual = param.grad.data.sum().item()
         print(name+": grad.data_ptr() = {}, expected sum {}, got {}".format(
-              param.grad.data_ptr(), val*4096*4096*(2.*i+1)/2., param.grad.data.sum().item()))
-    info("model.a", model.module.a, 2.)
-    info("model.b", model.module.b, 1.)
-    torch.cuda.nvtx.range_pop()
+              param.grad.data_ptr(), expected, actual))
+        return (expected == actual)
+    if not info("model.a", model.module.a, 2.): passed = False
+    if not info("model.b", model.module.b, 1.): passed = False
+    # torch.cuda.nvtx.range_pop()
+
+print("passed = ", passed)
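For reference, the expected-sum check in info() can be derived by hand. Below is a minimal sketch, not part of the commit, assuming a world size of 2, self.a filled with 1.0 and self.b filled with 2.0 (so that val is the other parameter's fill value), x filled with (i + rank) on each rank, and DDP averaging gradients across ranks:

# Hypothetical standalone check of the test's expected-sum formula (assumptions above).
N = 4096 * 4096  # number of elements in x, self.a, and self.b

def expected_grad_sum(val, i, world_size=2):
    # forward is (x*a)*b, so dloss/da = b*x and dloss/db = a*x; each gradient element
    # on rank r is therefore val*(i + r), where val is the other parameter's fill value.
    # Average over ranks 0..world_size-1, then sum over the N elements.
    per_element = val * sum(i + r for r in range(world_size)) / world_size
    return per_element * N  # equals val*4096*4096*(2.*i+1)/2. when world_size == 2

assert expected_grad_sum(2., i=3) == 2. * N * (2.*3 + 1) / 2.  # model.a check
assert expected_grad_sum(1., i=7) == 1. * N * (2.*7 + 1) / 2.  # model.b check

Since the expected values only work out when gradients are averaged over multiple ranks, the script is presumably launched with one process per GPU, for example via python -m torch.distributed.launch --nproc_per_node=2.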