Commit f1b2524b authored by Neel Kant

Add debug statements

parent 1dd51c0e
@@ -904,7 +904,7 @@ class InverseClozeDataset(data.Dataset):
def __getitem__(self, idx):
# get rng state corresponding to index (allows deterministic random pair)
rng = random.Random(idx)
rng = random.Random(idx + 1000)
np_rng = np.random.RandomState(seed=[rng.randint(0, 2**32-1) for _ in range(16)])
# get seq length. Save 2 tokens for beginning and end
@@ -924,6 +924,7 @@ class InverseClozeDataset(data.Dataset):
'context_types': np.array(context_token_types),
'context_pad_mask': np.array(context_pad_mask)
}
print("got item")
return sample
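The two rng lines in the hunk above make sampling deterministic per index, so the same idx always yields the same query/context pairing. A minimal standalone sketch of that pattern, assuming nothing beyond the lines shown (make_item_rngs and offset are illustrative names, not part of the file):

import random
import numpy as np

def make_item_rngs(idx, offset=1000):
    # A Python RNG seeded by the item index gives reproducible draws per sample.
    rng = random.Random(idx + offset)
    # A NumPy RNG derived from it; RandomState accepts a sequence of 32-bit ints as its seed.
    np_rng = np.random.RandomState(seed=[rng.randint(0, 2 ** 32 - 1) for _ in range(16)])
    return rng, np_rng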
@@ -957,7 +958,7 @@ class InverseClozeDataset(data.Dataset):
doc = self.get_sentence_split_doc(doc_idx)
if not doc:
doc = None
print("got doc sentences")
# set up and tokenize the entire selected document
num_sentences = len(doc)
all_token_lists = []
@@ -967,6 +968,7 @@ class InverseClozeDataset(data.Dataset):
all_token_lists.append(tokens)
all_token_type_lists.append(token_types)
print("got tokenized sentences")
sentence_token_lens = [len(l) for l in all_token_lists]
inclusion_mask = [True] * num_sentences
@@ -993,6 +995,7 @@ class InverseClozeDataset(data.Dataset):
inclusion_mask[num_sentences - view_radius] = False
remove_preceding = not remove_preceding
print("got inclusion mask")
# assemble the tokens and token types of the context
context_tokens = list(itertools.chain(
*[l for i, l in enumerate(all_token_lists) if inclusion_mask[i]]))
@@ -1005,6 +1008,8 @@ class InverseClozeDataset(data.Dataset):
context_tokens, context_token_types, context_pad_mask = self.concat_and_pad_tokens(
context_tokens, context_token_types)
print("got all tokens")
return (input_tokens, input_token_types, input_pad_mask), \
(context_tokens, context_token_types, context_pad_mask)
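A condensed sketch of the context-assembly step the hunks above walk through: sentence token lists are filtered by inclusion_mask and flattened into one sequence before concat_and_pad_tokens finalizes them (assemble_context_tokens is an illustrative name):

import itertools

def assemble_context_tokens(all_token_lists, inclusion_mask):
    # Keep only sentences whose mask entry is True, then flatten them into a single token list.
    return list(itertools.chain(
        *[tokens for i, tokens in enumerate(all_token_lists) if inclusion_mask[i]]))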
@@ -292,10 +292,13 @@ class ICTBertModel(MegatronModule):
context_tokens, context_attention_mask, context_types):
question_ict_logits, _ = self.question_model.forward(input_tokens, input_attention_mask, input_types)
print("(bert ict forward) got question logits")
context_ict_logits, _ = self.context_model.forward(context_tokens, context_attention_mask, context_types)
print("(bert ict forward) got context logits")
# [batch x h] * [h x batch]
retrieval_scores = question_ict_logits.matmul(torch.transpose(context_ict_logits, 0, 1))
print("(bert ict forward) got retrieval scores")
return retrieval_scores
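For reference, a condensed sketch of the in-batch scoring this forward pass performs, assuming both encoders emit [batch, hidden] embeddings (ict_retrieval_scores is an illustrative name): scores[i, j] is the similarity between question i and context j, so each row's matching context sits on the diagonal.

import torch

def ict_retrieval_scores(question_embeddings, context_embeddings):
    # [batch, h] @ [h, batch] -> [batch, batch] similarity matrix
    return question_embeddings.matmul(context_embeddings.transpose(0, 1))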
@@ -253,6 +253,7 @@ def setup_model_and_optimizer(model_provider_func, args):
def backward_step(optimizer, model, loss, args, timers):
"""Backward step."""
print("back1")
# Backward pass.
optimizer.zero_grad()
if args.fp16:
@@ -260,6 +261,7 @@ def backward_step(optimizer, model, loss, args, timers):
else:
loss.backward()
print("back2")
# All-reduce if needed.
if args.DDP_impl == 'local':
timers('allreduce').start()
@@ -267,10 +269,12 @@ def backward_step(optimizer, model, loss, args, timers):
fp32_allreduce=args.fp32_allreduce)
timers('allreduce').stop()
print("back3")
# Update master gradients.
if args.fp16:
optimizer.update_master_grads()
print("back4")
# Clipping gradients helps prevent the exploding gradient.
if args.clip_grad > 0:
if not args.fp16:
@@ -278,6 +282,7 @@ def backward_step(optimizer, model, loss, args, timers):
else:
optimizer.clip_master_grads(args.clip_grad)
print("back5")
def train_step(forward_step_func, data_iterator, model, optimizer, lr_scheduler,
args, timers):
@@ -287,16 +292,22 @@ def train_step(forward_step_func, data_iterator, model, optimizer, lr_scheduler,
timers('forward').start()
loss, loss_reduced = forward_step_func(data_iterator, model, args, timers)
timers('forward').stop()
torch.cuda.synchronize()
print("confirm forward")
# Calculate gradients, reduce across processes, and clip.
timers('backward').start()
backward_step(optimizer, model, loss, args, timers)
timers('backward').stop()
torch.cuda.synchronize()
print("did backward step")
# Update parameters.
timers('optimizer').start()
optimizer.step()
timers('optimizer').stop()
torch.cuda.synchronize()
print("did optim step")
# Update learning rate.
skipped_iter = 0
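The synchronize-then-print pattern above works because CUDA kernels launch asynchronously: without torch.cuda.synchronize(), a print only shows that an op was queued, not that it finished. A small helper capturing the same idea (debug_checkpoint is an illustrative name, not part of Megatron):

import torch

def debug_checkpoint(message, enabled=True):
    # Wait for all queued CUDA work to finish, then print, so the message
    # reliably marks how far execution actually got.
    if not enabled:
        return
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    print(message, flush=True)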
@@ -26,6 +26,7 @@ from megatron.utils import reduce_losses
from megatron.utils import vocab_size_with_padding
from megatron.training import run
num_batches = 0
def model_provider(args):
"""Build the model."""
@@ -78,6 +79,9 @@ def get_batch(data_iterator, timers):
context_types = data_b['context_types'].long()
context_pad_mask = data_b['context_pad_mask'].long()
global num_batches
print("got batch {}".format(num_batches))
return input_tokens, input_types, input_pad_mask,\
context_tokens, context_types, context_pad_mask
@@ -94,12 +98,19 @@ def forward_step(data_iterator, model, args, timers):
# Forward model.
retrieval_scores = model(input_tokens, 1 - input_pad_mask, input_types,
context_tokens, 1 - context_pad_mask, context_types)
print("ran model to get retrieval scores")
softmaxed = F.softmax(retrieval_scores, dim=0).float()
retrieval_loss = F.cross_entropy(softmaxed, torch.arange(softmaxed.size()[0]).cuda())
softmaxed = F.softmax(retrieval_scores, dim=0)
retrieval_loss = F.cross_entropy(softmaxed, torch.arange(softmaxed.shape[0]).cuda())
print(type(retrieval_loss))
reduced_losses = reduce_losses([retrieval_loss])
global num_batches
print("did forward step {}".format(num_batches))
num_batches += 1
print(retrieval_loss, {'retrieval loss': reduced_losses[0]})
return retrieval_loss, {'retrieval loss': reduced_losses[0]}
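One note on the loss above: torch.nn.functional.cross_entropy applies log_softmax to its input internally, so it is normally fed the raw retrieval scores; applying softmax first (and over dim=0) changes the quantity being optimized. A minimal sketch of the usual in-batch retrieval loss, with targets on the diagonal (ict_loss_from_logits is an illustrative name):

import torch
import torch.nn.functional as F

def ict_loss_from_logits(retrieval_scores):
    # retrieval_scores: [batch, batch]; row i should score highest at column i.
    targets = torch.arange(retrieval_scores.shape[0], device=retrieval_scores.device)
    return F.cross_entropy(retrieval_scores, targets)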