Commit 9d731777 authored by Michael Carilli

Clean up race condition test; still need to figure out a clean way to create distributed unit tests

parent fa183ee8
@@ -34,24 +34,31 @@ class Model(Module):
         return (input*self.a)*self.b
 
 model = DDP(Model(), message_size=1)
 # model = DDP(Model(), delay_allreduce=True)
 
 x = torch.cuda.FloatTensor(4096*4096)
+
+passed = True
 for i in range(10):
     x.fill_(i + args.local_rank) # fill x with new values every iteration for sanity
     model.zero_grad()
     out = model(x)
     loss = out.sum()
-    torch.cuda.nvtx.range_push("backward")
+    # torch.cuda.nvtx.range_push("backward")
     loss.backward()
-    torch.cuda.nvtx.range_pop()
+    # torch.cuda.nvtx.range_pop()
 
-    torch.cuda.nvtx.range_push("synchronize() + info")
+    # torch.cuda.nvtx.range_push("synchronize() + info")
     # torch.cuda.synchronize()
     print("i = {}".format(i))
     def info(name, param, val):
+        expected = val*4096*4096*(2.*i+1)/2.
+        actual = param.grad.data.sum().item()
         print(name+": grad.data_ptr() = {}, expected sum {}, got {}".format(
-              param.grad.data_ptr(), val*4096*4096*(2.*i+1)/2., param.grad.data.sum().item()))
-    info("model.a", model.module.a, 2.)
-    info("model.b", model.module.b, 1.)
-    torch.cuda.nvtx.range_pop()
+              param.grad.data_ptr(), expected, actual))
+        return (expected == actual)
+    if not info("model.a", model.module.a, 2.): passed = False
+    if not info("model.b", model.module.b, 1.): passed = False
+    # torch.cuda.nvtx.range_pop()
+
+print("passed = ", passed)
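For reference, the expected-sum check in info() can be derived by hand. Below is a minimal sketch, not part of the commit, assuming a world size of 2, self.a filled with 1.0 and self.b filled with 2.0 (so that val is the other parameter's fill value), x filled with (i + rank) on each rank, and DDP averaging gradients across ranks:

# Hypothetical standalone check of the test's expected-sum formula (assumptions above).
N = 4096 * 4096  # number of elements in x, self.a, and self.b

def expected_grad_sum(val, i, world_size=2):
    # forward is (x*a)*b, so dloss/da = b*x and dloss/db = a*x; each gradient element
    # on rank r is therefore val*(i + r), where val is the other parameter's fill value.
    # Average over ranks 0..world_size-1, then sum over the N elements.
    per_element = val * sum(i + r for r in range(world_size)) / world_size
    return per_element * N  # equals val*4096*4096*(2.*i+1)/2. when world_size == 2

assert expected_grad_sum(2., i=3) == 2. * N * (2.*3 + 1) / 2.  # model.a check
assert expected_grad_sum(1., i=7) == 1. * N * (2.*7 + 1) / 2.  # model.b check

Since the expected values only work out when gradients are averaged over multiple ranks, the script is presumably launched with one process per GPU, for example via python -m torch.distributed.launch --nproc_per_node=2.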