"examples/vscode:/vscode.git/clone" did not exist on "05b0f1ea2f9dc012dbc19deabca7fa653db9a1ac"
Commit f1b2524b authored by Neel Kant

Add debug statements

parent 1dd51c0e
@@ -904,7 +904,7 @@ class InverseClozeDataset(data.Dataset):
     def __getitem__(self, idx):
         # get rng state corresponding to index (allows deterministic random pair)
-        rng = random.Random(idx)
+        rng = random.Random(idx + 1000)
         np_rng = np.random.RandomState(seed=[rng.randint(0, 2**32-1) for _ in range(16)])
         # get seq length. Save 2 tokens for beginning and end
@@ -924,6 +924,7 @@ class InverseClozeDataset(data.Dataset):
             'context_types': np.array(context_token_types),
             'context_pad_mask': np.array(context_pad_mask)
         }
+        print("got item")
         return sample
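The seeding change in this hunk keeps sample generation deterministic: every dataset index derives its own RNG, so the same idx always yields the same "random" sentence/context pair across epochs and workers. A minimal standalone sketch of that per-index seeding pattern (the helper name is illustrative, not from the repo):

    # Illustrative sketch, not part of the commit: per-index deterministic RNG.
    import random
    import numpy as np

    def make_rngs(idx):
        # Seed a Python RNG from the sample index (plus a fixed offset, as in the
        # diff above) so the "random" pairing is a pure function of idx.
        rng = random.Random(idx + 1000)
        # Derive a NumPy RandomState from it; a list of 32-bit ints is a valid seed.
        np_rng = np.random.RandomState(
            seed=[rng.randint(0, 2**32 - 1) for _ in range(16)])
        return rng, np_rng

    # Same index -> identical random streams, run after run.
    rng_a, np_a = make_rngs(42)
    rng_b, np_b = make_rngs(42)
    assert rng_a.random() == rng_b.random()
    assert np_a.randint(0, 10) == np_b.randint(0, 10)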
@@ -957,7 +958,7 @@ class InverseClozeDataset(data.Dataset):
         doc = self.get_sentence_split_doc(doc_idx)
         if not doc:
             doc = None
+        print("got doc sentences")
         # set up and tokenize the entire selected document
         num_sentences = len(doc)
         all_token_lists = []
@@ -967,6 +968,7 @@ class InverseClozeDataset(data.Dataset):
             all_token_lists.append(tokens)
             all_token_type_lists.append(token_types)
+        print("got tokenized sentences")
         sentence_token_lens = [len(l) for l in all_token_lists]
         inclusion_mask = [True] * num_sentences
@@ -993,6 +995,7 @@ class InverseClozeDataset(data.Dataset):
                 inclusion_mask[num_sentences - view_radius] = False
             remove_preceding = not remove_preceding
+        print("got inclusion mask")
         # assemble the tokens and token types of the context
         context_tokens = list(itertools.chain(
             *[l for i, l in enumerate(all_token_lists) if inclusion_mask[i]]))
@@ -1005,6 +1008,8 @@ class InverseClozeDataset(data.Dataset):
         context_tokens, context_token_types, context_pad_mask = self.concat_and_pad_tokens(
             context_tokens, context_token_types)
+        print("got all tokens")
         return (input_tokens, input_token_types, input_pad_mask), \
             (context_tokens, context_token_types, context_pad_mask)
...
@@ -292,10 +292,13 @@ class ICTBertModel(MegatronModule):
                 context_tokens, context_attention_mask, context_types):
         question_ict_logits, _ = self.question_model.forward(input_tokens, input_attention_mask, input_types)
+        print("(bert ict forward) got question logits")
         context_ict_logits, _ = self.context_model.forward(context_tokens, context_attention_mask, context_types)
+        print("(bert ict forward) got context logits")
         # [batch x h] * [h x batch]
         retrieval_scores = question_ict_logits.matmul(torch.transpose(context_ict_logits, 0, 1))
+        print("(bert ict forward) got retrieval scores")
         return retrieval_scores
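For orientation, the forward pass being instrumented here scores every question in the batch against every context in the same batch: multiplying the [batch x h] question embeddings by the transposed [batch x h] context embeddings gives a [batch x batch] score matrix whose diagonal corresponds to the true pairs. A small self-contained sketch with random stand-in tensors:

    # Illustrative sketch, not part of the commit: in-batch retrieval scoring.
    import torch

    batch, hidden = 4, 8
    question_embed = torch.randn(batch, hidden)  # stand-in for question_ict_logits
    context_embed = torch.randn(batch, hidden)   # stand-in for context_ict_logits

    # [batch x h] * [h x batch] -> [batch x batch]; entry (i, j) scores question i
    # against context j, so the diagonal holds the matching pairs.
    retrieval_scores = question_embed.matmul(torch.transpose(context_embed, 0, 1))
    print(retrieval_scores.shape)  # torch.Size([4, 4])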
...
@@ -253,6 +253,7 @@ def setup_model_and_optimizer(model_provider_func, args):
 def backward_step(optimizer, model, loss, args, timers):
     """Backward step."""
+    print("back1")
     # Backward pass.
     optimizer.zero_grad()
     if args.fp16:
@@ -260,6 +261,7 @@ def backward_step(optimizer, model, loss, args, timers):
     else:
         loss.backward()
+    print("back2")
     # All-reduce if needed.
     if args.DDP_impl == 'local':
         timers('allreduce').start()
@@ -267,10 +269,12 @@ def backward_step(optimizer, model, loss, args, timers):
                                 fp32_allreduce=args.fp32_allreduce)
         timers('allreduce').stop()
+    print("back3")
     # Update master gradients.
     if args.fp16:
         optimizer.update_master_grads()
+    print("back4")
     # Clipping gradients helps prevent the exploding gradient.
     if args.clip_grad > 0:
         if not args.fp16:
@@ -278,6 +282,7 @@ def backward_step(optimizer, model, loss, args, timers):
         else:
             optimizer.clip_master_grads(args.clip_grad)
+    print("back5")
 def train_step(forward_step_func, data_iterator, model, optimizer, lr_scheduler,
                args, timers):
@@ -287,16 +292,22 @@ def train_step(forward_step_func, data_iterator, model, optimizer, lr_scheduler,
     timers('forward').start()
     loss, loss_reduced = forward_step_func(data_iterator, model, args, timers)
     timers('forward').stop()
+    torch.cuda.synchronize()
+    print("confirm forward")
     # Calculate gradients, reduce across processes, and clip.
     timers('backward').start()
     backward_step(optimizer, model, loss, args, timers)
     timers('backward').stop()
+    torch.cuda.synchronize()
+    print("did backward step")
     # Update parameters.
     timers('optimizer').start()
     optimizer.step()
     timers('optimizer').stop()
+    torch.cuda.synchronize()
+    print("did optim step")
     # Update learning rate.
     skipped_iter = 0
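The torch.cuda.synchronize() calls added here matter because CUDA kernels run asynchronously: a plain print can fire long before the GPU work it follows has actually finished, so a hang or error can surface under the wrong log line. A hedged sketch of the synchronize-then-print checkpoint pattern this hunk uses (the helper and the commented calls are hypothetical):

    # Illustrative sketch, not part of the commit: synchronize-then-print checkpoints.
    import torch

    def checkpoint(msg):
        # Block until all GPU work queued so far has completed, so the message
        # below is only printed once everything before it really ran.
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        print(msg, flush=True)

    # Hypothetical usage inside a training step:
    #   loss = forward_step(...)        # queue forward kernels
    #   checkpoint("confirm forward")   # matches the prints added above
    #   backward_step(...)              # queue backward kernels
    #   checkpoint("did backward step")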
...
@@ -26,6 +26,7 @@ from megatron.utils import reduce_losses
 from megatron.utils import vocab_size_with_padding
 from megatron.training import run
+num_batches = 0
 def model_provider(args):
     """Build the model."""
@@ -78,6 +79,9 @@ def get_batch(data_iterator, timers):
     context_types = data_b['context_types'].long()
     context_pad_mask = data_b['context_pad_mask'].long()
+    global num_batches
+    print("got batch {}".format(num_batches))
     return input_tokens, input_types, input_pad_mask,\
         context_tokens, context_types, context_pad_mask
@@ -94,12 +98,19 @@ def forward_step(data_iterator, model, args, timers):
     # Forward model.
     retrieval_scores = model(input_tokens, 1 - input_pad_mask, input_types,
                              context_tokens, 1 - context_pad_mask, context_types)
+    print("ran model to get retrieval scores")
-    softmaxed = F.softmax(retrieval_scores, dim=0).float()
-    retrieval_loss = F.cross_entropy(softmaxed, torch.arange(softmaxed.size()[0]).cuda())
+    softmaxed = F.softmax(retrieval_scores, dim=0)
+    retrieval_loss = F.cross_entropy(softmaxed, torch.arange(softmaxed.shape[0]).cuda())
+    print(type(retrieval_loss))
     reduced_losses = reduce_losses([retrieval_loss])
+    global num_batches
+    print("did forward step {}".format(num_batches))
+    num_batches += 1
+    print(retrieval_loss, {'retrieval loss': reduced_losses[0]})
     return retrieval_loss, {'retrieval loss': reduced_losses[0]}
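The loss in this hunk treats retrieval as a batch-sized classification problem: row i of the score matrix should select context i, so the targets are just torch.arange(batch). As a side note, F.cross_entropy applies log-softmax to its input internally, so it is usually given raw scores. A minimal sketch of the in-batch objective with made-up tensors:

    # Illustrative sketch, not part of the commit: in-batch retrieval loss.
    import torch
    import torch.nn.functional as F

    batch = 4
    retrieval_scores = torch.randn(batch, batch)  # [batch x batch] score matrix
    targets = torch.arange(batch)                 # row i should pick context i
    # cross_entropy log-softmaxes its input itself, so raw scores go in directly.
    retrieval_loss = F.cross_entropy(retrieval_scores, targets)
    print(retrieval_loss.item())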
...