OpenDAS / Megatron-LM
"examples/vscode:/vscode.git/clone" did not exist on "05b0f1ea2f9dc012dbc19deabca7fa653db9a1ac"
Commit f1b2524b
Authored Mar 30, 2020 by Neel Kant

Add debug statements

Parent: 1dd51c0e
Showing 4 changed files with 34 additions and 4 deletions (+34, -4):
megatron/data_utils/datasets.py   +7  -2
megatron/model/bert_model.py      +3  -0
megatron/training.py              +11 -0
pretrain_bert_ict.py              +13 -2
megatron/data_utils/datasets.py

@@ -904,7 +904,7 @@ class InverseClozeDataset(data.Dataset):
     def __getitem__(self, idx):
         # get rng state corresponding to index (allows deterministic random pair)
-        rng = random.Random(idx)
+        rng = random.Random(idx + 1000)
         np_rng = np.random.RandomState(seed=[rng.randint(0, 2**32 - 1) for _ in range(16)])

         # get seq length. Save 2 tokens for beginning and end
@@ -924,6 +924,7 @@ class InverseClozeDataset(data.Dataset):
             'context_types': np.array(context_token_types),
             'context_pad_mask': np.array(context_pad_mask)
         }
+        print("got item")
         return sample
@@ -957,7 +958,7 @@ class InverseClozeDataset(data.Dataset):
         doc = self.get_sentence_split_doc(doc_idx)
         if not doc:
             doc = None
-
+        print("got doc sentences")
         # set up and tokenize the entire selected document
         num_sentences = len(doc)
         all_token_lists = []
@@ -967,6 +968,7 @@ class InverseClozeDataset(data.Dataset):
             all_token_lists.append(tokens)
             all_token_type_lists.append(token_types)
+        print("got tokenized sentences")
         sentence_token_lens = [len(l) for l in all_token_lists]
         inclusion_mask = [True] * num_sentences
@@ -993,6 +995,7 @@ class InverseClozeDataset(data.Dataset):
                 inclusion_mask[num_sentences - view_radius] = False
             remove_preceding = not remove_preceding
+        print("got inclusion mask")
         # assemble the tokens and token types of the context
         context_tokens = list(itertools.chain(
             *[l for i, l in enumerate(all_token_lists) if inclusion_mask[i]]))
@@ -1005,6 +1008,8 @@ class InverseClozeDataset(data.Dataset):
         context_tokens, context_token_types, context_pad_mask = self.concat_and_pad_tokens(
             context_tokens, context_token_types)
+        print("got all tokens")
+
         return (input_tokens, input_token_types, input_pad_mask), \
                (context_tokens, context_token_types, context_pad_mask)
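A note on the seeding change in the first hunk: __getitem__ derives all of its randomness from the sample index, so shifting the seed (idx + 1000) changes which pairs are drawn while keeping them deterministic per index. A minimal sketch of that pattern follows; the make_rngs helper is hypothetical and only illustrates that the same index always reproduces the same random state.

import random

import numpy as np

def make_rngs(idx):
    # same construction as in __getitem__: a Python RNG seeded from the index
    # feeds 16 integers into a NumPy RandomState
    rng = random.Random(idx + 1000)
    np_rng = np.random.RandomState(
        seed=[rng.randint(0, 2**32 - 1) for _ in range(16)])
    return rng, np_rng

# the same index yields identical draws on every call
rng_a, np_rng_a = make_rngs(7)
rng_b, np_rng_b = make_rngs(7)
assert rng_a.random() == rng_b.random()
assert np_rng_a.randint(0, 100) == np_rng_b.randint(0, 100)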
megatron/model/bert_model.py

@@ -292,10 +292,13 @@ class ICTBertModel(MegatronModule):
                 context_tokens, context_attention_mask, context_types):
         question_ict_logits, _ = self.question_model.forward(input_tokens, input_attention_mask, input_types)
+        print("(bert ict forward) got question logits")
         context_ict_logits, _ = self.context_model.forward(context_tokens, context_attention_mask, context_types)
+        print("(bert ict forward) got context logits")

         # [batch x h] * [h x batch]
         retrieval_scores = question_ict_logits.matmul(torch.transpose(context_ict_logits, 0, 1))
+        print("(bert ict forward) got retrieval scores")
         return retrieval_scores
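For context on the scoring being instrumented here: the question and context encoders each produce a [batch x h] embedding, and the matmul with the transposed context embeddings yields a [batch x batch] score matrix whose entry [i, j] is question i scored against context j, so the diagonal holds the matching pairs and the rest of each row acts as in-batch negatives. A minimal sketch with random stand-in tensors (not the actual ICTBertModel outputs):

import torch

batch, hidden = 4, 8
question_ict_logits = torch.randn(batch, hidden)  # stand-in for question_model output
context_ict_logits = torch.randn(batch, hidden)   # stand-in for context_model output

# [batch x h] * [h x batch] -> [batch x batch] score matrix
retrieval_scores = question_ict_logits.matmul(
    torch.transpose(context_ict_logits, 0, 1))
assert retrieval_scores.shape == (batch, batch)
print(retrieval_scores.diagonal())  # each question scored against its own context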
megatron/training.py

@@ -253,6 +253,7 @@ def setup_model_and_optimizer(model_provider_func, args):
 def backward_step(optimizer, model, loss, args, timers):
     """Backward step."""
+    print("back1")
     # Backward pass.
     optimizer.zero_grad()
     if args.fp16:
@@ -260,6 +261,7 @@ def backward_step(optimizer, model, loss, args, timers):
     else:
         loss.backward()
+    print("back2")
     # All-reduce if needed.
     if args.DDP_impl == 'local':
         timers('allreduce').start()
@@ -267,10 +269,12 @@ def backward_step(optimizer, model, loss, args, timers):
                                    fp32_allreduce=args.fp32_allreduce)
         timers('allreduce').stop()
+    print("back3")
     # Update master gradients.
     if args.fp16:
         optimizer.update_master_grads()
+    print("back4")
     # Clipping gradients helps prevent the exploding gradient.
     if args.clip_grad > 0:
         if not args.fp16:
@@ -278,6 +282,7 @@ def backward_step(optimizer, model, loss, args, timers):
     else:
         optimizer.clip_master_grads(args.clip_grad)
+    print("back5")

 def train_step(forward_step_func, data_iterator, model, optimizer, lr_scheduler,
                args, timers):
@@ -287,16 +292,22 @@ def train_step(forward_step_func, data_iterator, model, optimizer, lr_scheduler,
     timers('forward').start()
     loss, loss_reduced = forward_step_func(data_iterator, model, args, timers)
     timers('forward').stop()
+    torch.cuda.synchronize()
+    print("confirm forward")

     # Calculate gradients, reduce across processes, and clip.
     timers('backward').start()
     backward_step(optimizer, model, loss, args, timers)
     timers('backward').stop()
+    torch.cuda.synchronize()
+    print("did backward step")

     # Update parameters.
     timers('optimizer').start()
     optimizer.step()
     timers('optimizer').stop()
+    torch.cuda.synchronize()
+    print("did optim step")

     # Update learning rate.
     skipped_iter = 0
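The pattern added in train_step, calling torch.cuda.synchronize() before each print, matters because CUDA kernels launch asynchronously: without a synchronize, the print only confirms that the forward/backward/optimizer work was queued, not that it finished on the GPU. A minimal sketch of the same idea with a hypothetical timing helper (not Megatron's Timers class):

import time

import torch

def timed(label, fn):
    if torch.cuda.is_available():
        torch.cuda.synchronize()  # drain previously queued work before starting the clock
    start = time.time()
    out = fn()
    if torch.cuda.is_available():
        torch.cuda.synchronize()  # wait for the kernels fn launched to actually finish
    print("{}: {:.4f}s".format(label, time.time() - start))
    return out

if torch.cuda.is_available():
    x = torch.randn(2048, 2048, device="cuda")
    timed("matmul", lambda: x @ x)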
pretrain_bert_ict.py

@@ -26,6 +26,7 @@ from megatron.utils import reduce_losses
 from megatron.utils import vocab_size_with_padding
 from megatron.training import run

+num_batches = 0

 def model_provider(args):
     """Build the model."""
@@ -78,6 +79,9 @@ def get_batch(data_iterator, timers):
     context_types = data_b['context_types'].long()
     context_pad_mask = data_b['context_pad_mask'].long()

+    global num_batches
+    print("got batch {}".format(num_batches))
+
     return input_tokens, input_types, input_pad_mask, \
            context_tokens, context_types, context_pad_mask
@@ -94,12 +98,19 @@ def forward_step(data_iterator, model, args, timers):
     # Forward model.
     retrieval_scores = model(input_tokens, 1 - input_pad_mask, input_types,
                              context_tokens, 1 - context_pad_mask, context_types)
+    print("ran model to get retrieval scores")

-    softmaxed = F.softmax(retrieval_scores, dim=0).float()
-    retrieval_loss = F.cross_entropy(softmaxed, torch.arange(softmaxed.size()[0]).cuda())
+    softmaxed = F.softmax(retrieval_scores, dim=0)
+    retrieval_loss = F.cross_entropy(softmaxed, torch.arange(softmaxed.shape[0]).cuda())
+    print(type(retrieval_loss))

     reduced_losses = reduce_losses([retrieval_loss])
+    global num_batches
+    print("did forward step {}".format(num_batches))
+    num_batches += 1
+    print(retrieval_loss, {'retrieval loss': reduced_losses[0]})
     return retrieval_loss, {'retrieval loss': reduced_losses[0]}
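On the loss in the last hunk: retrieval_scores is a [batch x batch] matrix, and the torch.arange labels mark context i as the positive for question i, so every other context in the batch serves as a negative. Below is a self-contained sketch with random scores standing in for the model output; as a general PyTorch fact, F.cross_entropy applies log_softmax to its input internally, so it is more commonly fed the raw scores, while the first lines mirror the hunk's explicit softmax.

import torch
import torch.nn.functional as F

batch = 4
retrieval_scores = torch.randn(batch, batch)  # stand-in for model(...)
labels = torch.arange(batch)                  # question i's positive context is context i

# as in the hunk above: explicit softmax over dim=0, then cross-entropy
softmaxed = F.softmax(retrieval_scores, dim=0)
retrieval_loss = F.cross_entropy(softmaxed, labels)

# cross_entropy already includes a log_softmax, so the raw scores also work directly
plain_loss = F.cross_entropy(retrieval_scores, labels)
print(retrieval_loss.item(), plain_loss.item())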