Commit 9d731777 authored by Michael Carilli
Browse files

Clean up the race condition test. Still need to figure out a clean way to create distributed unit tests.

parent fa183ee8
...@@ -34,24 +34,31 @@ class Model(Module): ...@@ -34,24 +34,31 @@ class Model(Module):
return (input*self.a)*self.b return (input*self.a)*self.b
model = DDP(Model(), message_size=1) model = DDP(Model(), message_size=1)
# model = DDP(Model(), delay_allreduce=True)
x = torch.cuda.FloatTensor(4096*4096) x = torch.cuda.FloatTensor(4096*4096)
passed = True
for i in range(10): for i in range(10):
x.fill_(i + args.local_rank) # fill x with new values every iteration for sanity x.fill_(i + args.local_rank) # fill x with new values every iteration for sanity
model.zero_grad() model.zero_grad()
out = model(x) out = model(x)
loss = out.sum() loss = out.sum()
torch.cuda.nvtx.range_push("backward") # torch.cuda.nvtx.range_push("backward")
loss.backward() loss.backward()
torch.cuda.nvtx.range_pop() # torch.cuda.nvtx.range_pop()
torch.cuda.nvtx.range_push("synchronize() + info") # torch.cuda.nvtx.range_push("synchronize() + info")
# torch.cuda.synchronize() # torch.cuda.synchronize()
print("i = {}".format(i)) print("i = {}".format(i))
def info(name, param, val): def info(name, param, val):
expected = val*4096*4096*(2.*i+1)/2.
actual = param.grad.data.sum().item()
print(name+": grad.data_ptr() = {}, expected sum {}, got {}".format( print(name+": grad.data_ptr() = {}, expected sum {}, got {}".format(
param.grad.data_ptr(), val*4096*4096*(2.*i+1)/2., param.grad.data.sum().item())) param.grad.data_ptr(), expected, actual))
info("model.a", model.module.a, 2.) return (expected == actual)
info("model.b", model.module.b, 1.) if not info("model.a", model.module.a, 2.): passed = False
torch.cuda.nvtx.range_pop() if not info("model.b", model.module.b, 1.): passed = False
# torch.cuda.nvtx.range_pop()
print("passed = ", passed)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment