OpenDAS / Megatron-LM · Commits

Commit 29a69547
Authored Dec 11, 2020 by mshoeybi; committed by Deepak Narayanan on Dec 19, 2020

Some bugfixes

Parent: 39181113
Showing 2 changed files with 6 additions and 4 deletions (+6, -4):

  megatron/data/gpt2_dataset.py   +1 -1
  megatron/training.py            +5 -3
megatron/data/gpt2_dataset.py  (view file @ 29a69547)

@@ -408,7 +408,7 @@ def _build_sample_idx(sizes, doc_idx, seq_length,
...
     return sample_idx


 def _build_shuffle_idx(num_samples, total_size, np_rng):
     """Build the range [0, size) and shuffle."""
     print(' > building shuffle index with split [0, {}) and [{}, {}) '
           '...'.format(num_samples, num_samples, total_size), flush=True)
...
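The print statement above indicates that the shuffle index is built as two independently shuffled ranges, [0, num_samples) and [num_samples, total_size). A minimal sketch of that behavior, assuming the function simply shuffles each range with the supplied NumPy generator and concatenates the results (dtype handling and other details of the actual Megatron-LM implementation may differ):

    import numpy as np

    def build_shuffle_idx_sketch(num_samples, total_size, np_rng):
        """Shuffle [0, num_samples) and [num_samples, total_size) independently."""
        # Fall back to int64 if uint32 cannot hold the largest index.
        dtype_ = np.uint32
        if total_size >= (np.iinfo(np.uint32).max - 1):
            dtype_ = np.int64

        # First range: [0, num_samples).
        first = np.arange(start=0, stop=num_samples, step=1, dtype=dtype_)
        np_rng.shuffle(first)

        # Second range: [num_samples, total_size).
        last = np.arange(start=num_samples, stop=total_size, step=1, dtype=dtype_)
        np_rng.shuffle(last)

        return np.concatenate((first, last))

    # Usage: np_rng = np.random.RandomState(seed=1234)
    #        shuffle_idx = build_shuffle_idx_sketch(1000, 1200, np_rng)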
megatron/training.py  (view file @ 29a69547)

@@ -717,13 +717,14 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
...
     add_to_logging('optimizer')
     add_to_logging('batch generator')

+    batch_size = args.micro_batch_size * args.data_parallel_size * \
+        get_num_microbatches()
+
     # Tensorboard values.
     if writer and torch.distributed.get_rank() == 0:
         writer.add_scalar('learning_rate-iterations', learning_rate, iteration)
         writer.add_scalar('learning_rate-samples', learning_rate,
                           args.consumed_train_samples)
-        batch_size = args.micro_batch_size * args.data_parallel_size * \
-            get_num_microbatches()
         writer.add_scalar('batch_size-iterations', batch_size, iteration)
         writer.add_scalar('batch_size-samples', batch_size,
                           args.consumed_train_samples)
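Moving the batch_size assignment out of the writer guard matters because the value is also used outside it: the next hunk appends a ' global batch size' field to log_string, so with the old placement a run where writer is None (or the rank is nonzero) would hit a NameError. The arithmetic itself is the global batch size formula visible in the diff; a small illustration with hypothetical values (the variable names mirror the Megatron-LM arguments, the numbers are made up):

    # Global batch size = per-replica micro-batch size
    #                     * number of data-parallel replicas
    #                     * number of micro-batches accumulated per step.
    micro_batch_size = 4      # hypothetical args.micro_batch_size
    data_parallel_size = 8    # hypothetical args.data_parallel_size
    num_microbatches = 16     # hypothetical get_num_microbatches()

    batch_size = micro_batch_size * data_parallel_size * num_microbatches
    print(batch_size)  # 512 samples consumed per iteration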
...
@@ -748,11 +749,12 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
...
                           elapsed_time / args.log_interval, iteration)
         log_string = ' iteration {:8d}/{:8d} |'.format(
             iteration, args.train_iters)
-        log_string += ' consumed samples {:12d} |'.format(
+        log_string += ' consumed samples: {:12d} |'.format(
             args.consumed_train_samples)
         log_string += ' elapsed time per iteration (ms): {:.1f} |'.format(
             elapsed_time * 1000.0 / args.log_interval)
         log_string += ' learning rate: {:.3E} |'.format(learning_rate)
+        log_string += ' global batch size: {:6d} |'.format(batch_size)
         num_iterations = max(
             1, args.log_interval - total_loss_dict[skipped_iters_key])
         for key in total_loss_dict:
...
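For reference, the log line assembled in this hunk is a single '|'-separated status string; a short sketch with the same hypothetical values as above, showing how the new 'consumed samples:' and 'global batch size' fields render:

    iteration, train_iters = 100, 500000
    consumed_train_samples = 51200
    learning_rate = 1.5e-4
    batch_size = 512

    log_string = ' iteration {:8d}/{:8d} |'.format(iteration, train_iters)
    log_string += ' consumed samples: {:12d} |'.format(consumed_train_samples)
    log_string += ' learning rate: {:.3E} |'.format(learning_rate)
    log_string += ' global batch size: {:6d} |'.format(batch_size)
    # Prints one right-aligned status line combining all of the fields above.
    print(log_string)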