Commit cc8f03c8 authored by Michael Carilli

Multi-op sequence for ddp_race_condition_test.py

parent 47ac5c2b
@@ -73,18 +73,10 @@ def prep_param_lists(model, flat_master=False):
             # flatten_dense_tensors returns a contiguous flat array.
             # http://pytorch.org/docs/master/_modules/torch/_utils.html
             master_params = _flatten_dense_tensors([param.data for param in model_params]).float()
-        except TypeError as instance:
-            # This is brittle, and depends on how cat chooses to word its error message.
-            if "cat received an invalid combination of arguments" not in instance.args[0]:
-                raise
-            else:
-                # If you append a message to the exception instance, via
-                # instance.args = instance.args + ("Error...",)
-                # this messes up the terminal-formatted printing of the instance's original message.
-                # Basic solution for now:
-                print("Error in prep_param_lists: model likely contains a mixture of parameters "
+        except:
+            print("Error in prep_param_lists: model may contain a mixture of parameters "
                   "of different types. Use flat_master=False, or use F16_Optimizer.")
-                raise
+            raise
         master_params = torch.nn.Parameter(master_params)
         master_params.requires_grad = True
         # master_params.register_hook(backwards_debug_hook)
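For context on the simplification above: _flatten_dense_tensors concatenates the parameters with torch.cat, which can fail when the model mixes parameter dtypes (for example fp16 and fp32). Below is a minimal sketch of that failure mode, assuming the older torch.cat behavior that rejected mixed dtypes (newer PyTorch releases may promote dtypes and succeed); the tensors and names are illustrative only, not part of the commit.

import torch
from torch._utils import _flatten_dense_tensors

# Hypothetical mixed-precision parameter list, as in a partially converted model.
params = [torch.zeros(4, dtype=torch.float16), torch.zeros(4, dtype=torch.float32)]

try:
    flat = _flatten_dense_tensors(params)
    print("flattened to", flat.dtype)   # newer PyTorch may promote and succeed
except Exception as exc:
    print("flattening failed:", exc)    # the case the bare except above reports before re-raising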
@@ -24,40 +24,40 @@ args.distributed = args.world_size > 1
 if args.distributed:
     torch.cuda.set_device(args.rank % torch.cuda.device_count())
-    dist.init_process_group(args.dist_backend, init_method=args.dist_url,
-                            world_size=args.world_size)
-    rank = torch.distributed.get_rank()
+    dist.init_process_group(args.dist_backend,
+                            init_method=args.dist_url,
+                            world_size=args.world_size,
+                            rank=args.rank)
 
 torch.set_printoptions(precision=10)
 
 class Model(Module):
     def __init__(self):
         super(Model, self).__init__()
-        self.x = Parameter(torch.cuda.FloatTensor(1,4096*4096).fill_(1.0))
+        self.a = Parameter(torch.cuda.FloatTensor(4096*4096).fill_(1.0))
+        self.b = Parameter(torch.cuda.FloatTensor(4096*4096).fill_(2.0))
     def forward(self, input):
-        return self.x*input
+        return (input*self.a)*self.b
 
 model = DDP(Model(), message_size=1)
 
-z = torch.cuda.FloatTensor(4096*4096)
+x = torch.cuda.FloatTensor(4096*4096)
 
 for i in range(10):
-    z.fill_(i + rank) # fill z with new values every iteration for sanity
+    x.fill_(i + args.rank) # fill x with new values every iteration for sanity
     model.zero_grad()
-    out = model(z)
+    out = model(x)
     loss = out.sum()
     torch.cuda.nvtx.range_push("backward")
     loss.backward()
     torch.cuda.nvtx.range_pop()
-    torch.cuda.nvtx.range_push("synchronize() + sum")
+    torch.cuda.nvtx.range_push("synchronize() + info")
     torch.cuda.synchronize()
-    for param in model.parameters():
-        print("i = {},\n"
-              "param.grad.data_ptr() = {}\n"
-              "expected {},\n"
-              " got {}\n"
-              .format(i,
-                      param.grad.data_ptr(),
-                      4096*4096*(2.*i+1)/2.,
-                      param.grad.data.sum().item()))
+    print("i = {}".format(i))
+    def info(name, param, val):
+        print(name+": grad.data_ptr() = {}, expected sum {}, got {}".format(
+            param.grad.data_ptr(), val*4096*4096*(2.*i+1)/2., param.grad.data.sum().item()))
+    info("model.a", model.module.a, 2.)
+    info("model.b", model.module.b, 1.)
     torch.cuda.nvtx.range_pop()
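As a sanity check on the values the new info helper prints (a sketch assuming world_size == 2, so DDP averages gradients over ranks 0 and 1; the commit itself does not fix the world size): for out = (input*self.a)*self.b and loss = out.sum(), the per-element gradient of a is b*input = 2*(i+rank) and of b is a*input = 1*(i+rank), so the rank-averaged gradient sums to val*4096*4096*(2*i+1)/2, which matches the "expected sum" formula in the test.

# Standalone check of the expected sums, assuming world_size == 2
# (ranks 0 and 1) and gradient averaging across ranks by DDP.
N = 4096 * 4096
for i in range(10):
    for name, val in (("model.a", 2.), ("model.b", 1.)):
        # Per-element gradient on rank r is val * (i + r); DDP averages over ranks.
        avg_grad = sum(val * (i + rank) for rank in (0, 1)) / 2.
        assert avg_grad * N == val * N * (2. * i + 1) / 2., name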