Commit cc8f03c8 authored by Michael Carilli

Multi-op sequence for ddp_race_condition_test.py

parent 47ac5c2b
@@ -73,16 +73,8 @@ def prep_param_lists(model, flat_master=False):
             # flatten_dense_tensors returns a contiguous flat array.
             # http://pytorch.org/docs/master/_modules/torch/_utils.html
             master_params = _flatten_dense_tensors([param.data for param in model_params]).float()
-        except TypeError as instance:
-            # This is brittle, and depends on how cat chooses to word its error message.
-            if "cat received an invalid combination of arguments" not in instance.args[0]:
-                raise
-            else:
-                # If you append a message to the exception instance, via
-                # instance.args = instance.args + ("Error...",)
-                # this messes up the terminal-formatted printing of the instance's original message.
-                # Basic solution for now:
-                print("Error in prep_param_lists: model likely contains a mixture of parameters "
-                      "of different types. Use flat_master=False, or use F16_Optimizer.")
-                raise
+        except:
+            print("Error in prep_param_lists: model may contain a mixture of parameters "
+                  "of different types. Use flat_master=False, or use F16_Optimizer.")
+            raise
         master_params = torch.nn.Parameter(master_params)
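For reference, the flat_master branch that this simplified except-clause now guards flattens every fp16 parameter into one contiguous fp32 "master" tensor. Below is a minimal standalone sketch of that round trip; it is not apex code, the toy torch.nn.Linear model and variable names are assumptions, and it uses only the torch._utils helpers already referenced in the diff.

    import torch
    from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

    # Toy all-fp16 model (hypothetical); a model mixing parameter dtypes is
    # exactly the case the except-branch above warns about.
    model = torch.nn.Linear(8, 4).half()
    model_params = [p for p in model.parameters() if p.requires_grad]

    # One contiguous fp32 master copy of all fp16 parameters.
    master_params = torch.nn.Parameter(
        _flatten_dense_tensors([p.data for p in model_params]).float())

    # Copying master weights back into the model reverses the flattening.
    for p, master_chunk in zip(
            model_params,
            _unflatten_dense_tensors(master_params.data,
                                     [p.data for p in model_params])):
        p.data.copy_(master_chunk)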
ddp_race_condition_test.py:
@@ -24,40 +24,40 @@ args.distributed = args.world_size > 1
 if args.distributed:
     torch.cuda.set_device(args.rank % torch.cuda.device_count())
-    dist.init_process_group(args.dist_backend, init_method=args.dist_url,
-                            world_size=args.world_size)
-    rank = torch.distributed.get_rank()
+    dist.init_process_group(args.dist_backend,
+                            init_method=args.dist_url,
+                            world_size=args.world_size,
+                            rank=args.rank)

 torch.set_printoptions(precision=10)

 class Model(Module):
     def __init__(self):
         super(Model, self).__init__()
-        self.x = Parameter(torch.cuda.FloatTensor(1,4096*4096).fill_(1.0))
+        self.a = Parameter(torch.cuda.FloatTensor(4096*4096).fill_(1.0))
+        self.b = Parameter(torch.cuda.FloatTensor(4096*4096).fill_(2.0))
     def forward(self, input):
-        return self.x*input
+        return (input*self.a)*self.b

 model = DDP(Model(), message_size=1)

-z = torch.cuda.FloatTensor(4096*4096)
+x = torch.cuda.FloatTensor(4096*4096)

 for i in range(10):
-    z.fill_(i + rank) # fill z with new values every iteration for sanity
+    x.fill_(i + args.rank) # fill x with new values every iteration for sanity
     model.zero_grad()
-    out = model(z)
+    out = model(x)
     loss = out.sum()
     torch.cuda.nvtx.range_push("backward")
     loss.backward()
     torch.cuda.nvtx.range_pop()

-    torch.cuda.nvtx.range_push("synchronize() + sum")
+    torch.cuda.nvtx.range_push("synchronize() + info")
     torch.cuda.synchronize()

-    for param in model.parameters():
-        print("i = {},\n"
-              "param.grad.data_ptr() = {}\n"
-              "expected {},\n"
-              " got {}\n"
-              .format(i,
-                      param.grad.data_ptr(),
-                      4096*4096*(2.*i+1)/2.,
-                      param.grad.data.sum().item()))
+    print("i = {}".format(i))
+    def info(name, param, val):
+        print(name+": grad.data_ptr() = {}, expected sum {}, got {}".format(
+              param.grad.data_ptr(), val*4096*4096*(2.*i+1)/2., param.grad.data.sum().item()))
+    info("model.a", model.module.a, 2.)
+    info("model.b", model.module.b, 1.)
     torch.cuda.nvtx.range_pop()
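As a sanity check on the printed expectations: with two ranks, x holds i on rank 0 and i+1 on rank 1, so the allreduce-averaged gradient of the loss with respect to a is b*(2i+1)/2 per element (and a*(2i+1)/2 for b), which is where the val*4096*4096*(2.*i+1)/2. formula comes from. The sketch below reproduces that arithmetic on the CPU; it is an illustration, not part of the test, and it assumes a world size of 2, gradient averaging across ranks, and a much smaller element count N to keep it cheap.

    import torch

    N = 4096            # the test uses 4096*4096 elements; shrunk here for speed
    a_val, b_val = 1.0, 2.0

    for i in range(10):
        grads_a, grads_b = [], []
        for rank in range(2):                       # simulate the two ranks
            x = torch.full((N,), float(i + rank), dtype=torch.double)
            a = torch.full((N,), a_val, dtype=torch.double, requires_grad=True)
            b = torch.full((N,), b_val, dtype=torch.double, requires_grad=True)
            ((x * a) * b).sum().backward()
            grads_a.append(a.grad)
            grads_b.append(b.grad)

        # What an allreduce-then-average over the two ranks would produce.
        avg_a = (grads_a[0] + grads_a[1]) / 2.
        avg_b = (grads_b[0] + grads_b[1]) / 2.

        assert avg_a.sum().item() == b_val * N * (2. * i + 1) / 2.
        assert avg_b.sum().item() == a_val * N * (2. * i + 1) / 2.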