Commit cc8f03c8 authored by Michael Carilli

Multi-op sequence for ddp_race_condition_test.py

parent 47ac5c2b
@@ -73,18 +73,10 @@ def prep_param_lists(model, flat_master=False):
             # flatten_dense_tensors returns a contiguous flat array.
             # http://pytorch.org/docs/master/_modules/torch/_utils.html
             master_params = _flatten_dense_tensors([param.data for param in model_params]).float()
-        except TypeError as instance:
-            # This is brittle, and depends on how cat chooses to word its error message.
-            if "cat received an invalid combination of arguments" not in instance.args[0]:
-                raise
-            else:
-                # If you append a message to the exception instance, via
-                # instance.args = instance.args + ("Error...",)
-                # this messes up the terminal-formatted printing of the instance's original message.
-                # Basic solution for now:
-                print("Error in prep_param_lists: model likely contains a mixture of parameters "
+        except:
+            print("Error in prep_param_lists: model may contain a mixture of parameters "
                   "of different types. Use flat_master=False, or use F16_Optimizer.")
-                raise
+            raise
         master_params = torch.nn.Parameter(master_params)
         master_params.requires_grad = True
         # master_params.register_hook(backwards_debug_hook)
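For context on the simplification above: _flatten_dense_tensors concatenates the parameters with torch.cat, which can fail when the model mixes parameter dtypes (for example fp16 and fp32). Below is a minimal sketch of that failure mode, assuming the older torch.cat behavior that rejected mixed dtypes (newer PyTorch releases may promote dtypes and succeed); the tensors and names are illustrative only, not part of the commit.

import torch
from torch._utils import _flatten_dense_tensors

# Hypothetical mixed-precision parameter list, as in a partially converted model.
params = [torch.zeros(4, dtype=torch.float16), torch.zeros(4, dtype=torch.float32)]

try:
    flat = _flatten_dense_tensors(params)
    print("flattened to", flat.dtype)   # newer PyTorch may promote and succeed
except Exception as exc:
    print("flattening failed:", exc)    # the case the bare except above reports before re-raising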
@@ -24,40 +24,40 @@ args.distributed = args.world_size > 1
 if args.distributed:
     torch.cuda.set_device(args.rank % torch.cuda.device_count())
-    dist.init_process_group(args.dist_backend, init_method=args.dist_url,
-                            world_size=args.world_size)
-    rank = torch.distributed.get_rank()
+    dist.init_process_group(args.dist_backend,
+                            init_method=args.dist_url,
+                            world_size=args.world_size,
+                            rank=args.rank)
 
 torch.set_printoptions(precision=10)
 
 class Model(Module):
     def __init__(self):
         super(Model, self).__init__()
-        self.x = Parameter(torch.cuda.FloatTensor(1,4096*4096).fill_(1.0))
+        self.a = Parameter(torch.cuda.FloatTensor(4096*4096).fill_(1.0))
+        self.b = Parameter(torch.cuda.FloatTensor(4096*4096).fill_(2.0))
     def forward(self, input):
-        return self.x*input
+        return (input*self.a)*self.b
 
 model = DDP(Model(), message_size=1)
 
-z = torch.cuda.FloatTensor(4096*4096)
+x = torch.cuda.FloatTensor(4096*4096)
 
 for i in range(10):
-    z.fill_(i + rank) # fill z with new values every iteration for sanity
+    x.fill_(i + args.rank) # fill x with new values every iteration for sanity
     model.zero_grad()
-    out = model(z)
+    out = model(x)
     loss = out.sum()
     torch.cuda.nvtx.range_push("backward")
     loss.backward()
     torch.cuda.nvtx.range_pop()
-    torch.cuda.nvtx.range_push("synchronize() + sum")
+    torch.cuda.nvtx.range_push("synchronize() + info")
     torch.cuda.synchronize()
-    for param in model.parameters():
-        print("i = {},\n"
-              "param.grad.data_ptr() = {}\n"
-              "expected {},\n"
-              " got {}\n"
-              .format(i,
-                      param.grad.data_ptr(),
-                      4096*4096*(2.*i+1)/2.,
-                      param.grad.data.sum().item()))
+    print("i = {}".format(i))
+    def info(name, param, val):
+        print(name+": grad.data_ptr() = {}, expected sum {}, got {}".format(
+            param.grad.data_ptr(), val*4096*4096*(2.*i+1)/2., param.grad.data.sum().item()))
+    info("model.a", model.module.a, 2.)
+    info("model.b", model.module.b, 1.)
     torch.cuda.nvtx.range_pop()
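As a sanity check on the values the new info helper prints (a sketch assuming world_size == 2, so DDP averages gradients over ranks 0 and 1; the commit itself does not fix the world size): for out = (input*self.a)*self.b and loss = out.sum(), the per-element gradient of a is b*input = 2*(i+rank) and of b is a*input = 1*(i+rank), so the rank-averaged gradient sums to val*4096*4096*(2*i+1)/2, which matches the "expected sum" formula in the test.

# Standalone check of the expected sums, assuming world_size == 2
# (ranks 0 and 1) and gradient averaging across ranks by DDP.
N = 4096 * 4096
for i in range(10):
    for name, val in (("model.a", 2.), ("model.b", 1.)):
        # Per-element gradient on rank r is val * (i + r); DDP averages over ranks.
        avg_grad = sum(val * (i + rank) for rank in (0, 1)) / 2.
        assert avg_grad * N == val * N * (2. * i + 1) / 2., name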