OpenDAS / Megatron-LM
Commit 41276b6c authored Oct 03, 2022 by Vijay Korthikanti
Merge branch 'main' into nmt-main
Parents: a44360ed, fc7f4f03
Changes: 135
Showing 20 changed files with 396 additions and 339 deletions
megatron/text_generation_server.py    +1    -14
megatron/timers.py                    +304  -0
megatron/tokenizer/__init__.py        +1    -14
megatron/tokenizer/tokenizer.py       +1    -14
megatron/training.py                  +60   -73
megatron/utils.py                     +1    -14
pretrain_bert.py                      +2    -15
pretrain_gpt.py                       +2    -15
pretrain_ict.py                       +2    -15
pretrain_t5.py                        +2    -15
pretrain_vision_classify.py           +2    -15
pretrain_vision_dino.py               +2    -15
pretrain_vision_inpaint.py            +2    -15
tasks/data_utils.py                   +1    -14
tasks/eval_utils.py                   +1    -14
tasks/finetune_utils.py               +8    -21
tasks/glue/data.py                    +1    -14
tasks/glue/finetune.py                +1    -14
tasks/glue/mnli.py                    +1    -14
tasks/glue/qqp.py                     +1    -14
megatron/text_generation_server.py  View file @ 41276b6c

-# coding=utf-8
-# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 import datetime
 import torch
 import json
 ...
megatron/timers.py  0 → 100644  View file @ 41276b6c

# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.

"""Megatron timers."""

from abc import ABC
from abc import abstractmethod
import time

import torch


class TimerBase(ABC):

    def __init__(self, name):
        self.name = name

    @abstractmethod
    def start(self, barrier=False):
        pass

    @abstractmethod
    def stop(self, barrier=False):
        pass

    @abstractmethod
    def reset(self):
        pass

    @abstractmethod
    def elapsed(self, reset=True, barrier=False):
        pass


class DummyTimer(TimerBase):

    def __init__(self):
        super().__init__('dummy timer')

    def start(self, barrier=False):
        return

    def stop(self, barrier=False):
        return

    def reset(self):
        return

    def elapsed(self, reset=True, barrier=False):
        raise Exception('dummy timer should not be used to '
                        'calculate elapsed time')


class Timer(TimerBase):
    """
    Comment on using `barrier`: If this flag is passed, then all
    the caller processes will wait till all reach the timing routine.
    It is up to the user to make sure all the ranks in `barrier_group`
    call it otherwise, it will result in a hang.
    Comment on `barrier_group`: By default it is set to None which
    in torch distributed land, it will result in the global communicator.
    """

    def __init__(self, name):
        super().__init__(name)
        self._elapsed = 0.0
        self._started = False
        # Note that None will default to the global process group
        self._barrier_group = None
        self._start_time = time.time()

    def set_barrier_group(self, barrier_group):
        self._barrier_group = barrier_group

    def start(self, barrier=False):
        """Start the timer."""
        assert not self._started, 'timer has already been started'
        if barrier:
            torch.distributed.barrier(group=self._barrier_group)
        torch.cuda.synchronize()
        self._start_time = time.time()
        self._started = True

    def stop(self, barrier=False):
        """Stop the timer."""
        assert self._started, 'timer is not started'
        if barrier:
            torch.distributed.barrier(group=self._barrier_group)
        torch.cuda.synchronize()
        self._elapsed += (time.time() - self._start_time)
        self._started = False

    def reset(self):
        """Reset timer."""
        self._elapsed = 0.0
        self._started = False

    def elapsed(self, reset=True, barrier=False):
        """Calculate the elapsed time."""
        _started = self._started
        # If the timing in progress, end it first.
        if self._started:
            self.stop(barrier=barrier)
        # Get the elapsed time.
        _elapsed = self._elapsed
        # Reset the elapsed time
        if reset:
            self.reset()
        # If timing was in progress, set it back.
        if _started:
            self.start(barrier=barrier)
        return _elapsed


class Timers:
    """Group of timers."""

    def __init__(self, log_level, log_option):
        self._log_level = log_level
        self._log_option = log_option
        self._timers = {}
        self._log_levels = {}
        self._dummy_timer = DummyTimer()
        self._max_log_level = 2

    def __call__(self, name, log_level=None):
        # If the timer has already been set, then check if the log-level
        # is provided, it matches the one that the timer was created with.
        if name in self._timers:
            if log_level is not None:
                assert log_level == self._log_levels[name], \
                    'input log level {} does not match already existing ' \
                    'log level {} for {} timer'.format(
                        log_level, self._log_levels[name], name)
            return self._timers[name]
        # If timer does not exist and no log level is provided,
        # set it to the max log level which is 2.
        if log_level is None:
            log_level = self._max_log_level
        assert log_level <= self._max_log_level, \
            'log level {} is larger than max supported log level {}'.format(
                log_level, self._max_log_level)
        # Now if the input log level is larger than the one set for
        # the timers class, just ignore it and return a dummy timer.
        if log_level > self._log_level:
            return self._dummy_timer
        # Otherwise, initalize the timer and set the level.
        self._timers[name] = Timer(name)
        self._log_levels[name] = log_level
        return self._timers[name]

    def _get_elapsed_time_all_ranks(self, names, reset, barrier):
        """
        Assumptions:
            - All the ranks call this function.
            - `names` are identical on all ranks.
        If the above assumptions are not met, calling this function will
        result in hang.
        Arguments:
            - names: list of timer names
            - reset: reset the timer after recording the elapsed time
            - barrier: if set, do a global barrier before time measurments
        """

        # First make sure all the callers are in sync.
        if barrier:
            torch.distributed.barrier()

        world_size = torch.distributed.get_world_size()
        rank = torch.distributed.get_rank()

        # Here we can use gather on the rank we want to print the
        # timing, however, there is no gather_base support in
        # pytorch yet. It is simpler to deal with a single tensor
        # and since we are only gathering a small amount of data,
        # it should be ok to use all-gather instead of gather.
        rank_name_to_time = torch.zeros((world_size, len(names)),
                                        dtype=torch.float,
                                        device=torch.cuda.current_device())
        for i, name in enumerate(names):
            if name in self._timers:
                # Here we don't need to pass the barrier flag as all
                # the processes are already in sync. This avoids the
                # issue of different timers having different barrier
                # groups inside their class.
                rank_name_to_time[rank, i] = self._timers[name].elapsed(
                    reset=reset)

        # See the note above for why we are not using gather.
        torch.distributed._all_gather_base(rank_name_to_time.view(-1),
                                           rank_name_to_time[rank, :].view(-1))

        return rank_name_to_time

    def _get_global_min_max_time(self, names, reset, barrier, normalizer):
        """Report only min and max times across all ranks."""

        rank_name_to_time = self._get_elapsed_time_all_ranks(names, reset,
                                                             barrier)
        name_to_min_max_time = {}
        for i, name in enumerate(names):
            rank_to_time = rank_name_to_time[:, i]
            # filter out the ones we did not have any timings for
            rank_to_time = rank_to_time[rank_to_time > 0.0]
            # If the timer exists:
            if rank_to_time.numel() > 0:
                name_to_min_max_time[name] = (
                    rank_to_time.min().item() / normalizer,
                    rank_to_time.max().item() / normalizer)
        return name_to_min_max_time

    def _get_global_min_max_time_string(self, names, reset, barrier,
                                        normalizer, max_only):
        name_to_min_max_time = self._get_global_min_max_time(
            names, reset, barrier, normalizer)
        if not name_to_min_max_time:
            return None
        output_string = '(min, max) time across ranks (ms):'
        for name in name_to_min_max_time:
            min_time, max_time = name_to_min_max_time[name]
            if max_only:
                output_string += '\n    {}: {:.2f}'.format(
                    (name + ' ').ljust(48, '.'), max_time)
            else:
                output_string += '\n    {}: ({:.2f}, {:.2f})'.format(
                    (name + ' ').ljust(48, '.'), min_time, max_time)
        return output_string

    def _get_all_ranks_time_string(self, names, reset, barrier, normalizer):
        """Report times across all ranks."""
        rank_name_to_time = self._get_elapsed_time_all_ranks(names, reset,
                                                             barrier)

        output_string = 'times across ranks (ms):'
        no_reported_timing = True
        for i, name in enumerate(names):
            not_yet_found = True
            for rank in range(torch.distributed.get_world_size()):
                if rank_name_to_time[rank, i] > 0:
                    no_reported_timing = False
                    if not_yet_found:
                        not_yet_found = False
                        output_string += '\n  {}:'.format(name)
                    output_string += '\n     rank {:2d}: {:.2f}'.format(
                        rank, rank_name_to_time[rank, i] / normalizer)
        if no_reported_timing:
            return None
        return output_string

    def log(self, names, rank=None, normalizer=1.0, reset=True, barrier=False):
        """Log a group of timers."""

        # Print.
        assert normalizer > 0.0
        if self._log_option in ['max', 'minmax']:
            max_only = False
            if self._log_option == 'max':
                max_only = True
            output_string = self._get_global_min_max_time_string(
                names, reset, barrier, normalizer / 1000.0, max_only)
        elif self._log_option == 'all':
            output_string = self._get_all_ranks_time_string(
                names, reset, barrier, normalizer / 1000.0)
        else:
            raise Exception('unknown timing log option {}'.format(
                self._log_option))

        # If no input rank is provided, log on last rank.
        if rank is None:
            rank = torch.distributed.get_world_size() - 1
        if rank == torch.distributed.get_rank() and output_string is not None:
            print(output_string, flush=True)

    def write(self, names, writer, iteration, normalizer=1.0,
              reset=False, barrier=False):
        """Write timers to a tensorboard writer
        Note that we only report maximum time across ranks to tensorboard.
        """
        # currently when using add_scalars,
        # torch.utils.add_scalars makes each timer its own run, which
        # polutes the runs list, so we just add each as a scalar
        assert normalizer > 0.0
        name_to_min_max_time = self._get_global_min_max_time(
            names, reset, barrier, normalizer)
        if writer is not None:
            for name in name_to_min_max_time:
                _, max_time = name_to_min_max_time[name]
                writer.add_scalar(name + '-time', max_time, iteration)
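
For orientation, the snippet below is a minimal usage sketch of the new timers API introduced by this file, mirroring the call sites updated elsewhere in this commit (timers obtained with a log_level argument, barriers on start/stop, a group log call). It is not part of the commit: the single CUDA GPU, the single-rank NCCL process group, and the direct Timers(...) construction are assumptions made only so the sketch is self-contained; in Megatron the Timers instance is normally obtained via get_timers() and configured from the timing arguments (e.g. args.timing_log_level).

# Minimal sketch (assumes one CUDA GPU and a single-rank NCCL process group).
import os
import torch
from megatron.timers import Timers

os.environ.setdefault('MASTER_ADDR', 'localhost')
os.environ.setdefault('MASTER_PORT', '29500')
torch.distributed.init_process_group('nccl', rank=0, world_size=1)

# Keep level-0 and level-1 timers; report (min, max) across ranks.
timers = Timers(log_level=1, log_option='minmax')

# Level-0 timer with a barrier, as in pretrain()'s setup timing above.
timers('model-and-optimizer-setup', log_level=0).start(barrier=True)
model = torch.nn.Linear(1024, 1024).cuda()
timers('model-and-optimizer-setup').stop()

# A level-2 timer exceeds log_level=1, so __call__ hands back the DummyTimer
# and this region is simply not timed (cf. the 'batch-generator' timers below).
timers('batch-generator', log_level=2).start()
batch = torch.randn(8, 1024, device='cuda')
out = model(batch)
timers('batch-generator').stop()

# Every rank must call log(); the last rank prints the (min, max) table.
timers.log(['model-and-optimizer-setup', 'batch-generator'], barrier=True)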
megatron/tokenizer/__init__.py  View file @ 41276b6c
(Same license-header replacement as in megatron/text_generation_server.py above.)

 from .tokenizer import build_tokenizer
megatron/tokenizer/tokenizer.py  View file @ 41276b6c
(Same license-header replacement as in megatron/text_generation_server.py above.)

 """Megatron tokenizers."""
 ...
megatron/training.py  View file @ 41276b6c
(Same license-header replacement as in megatron/text_generation_server.py above.)

 """Pretrain utilities."""
 ...
@@ -119,23 +106,28 @@ def pretrain(train_valid_test_dataset_provider,
     timers = get_timers()

     # Model, optimizer, and learning rate.
-    timers('model-and-optimizer-setup').start()
+    timers('model-and-optimizer-setup', log_level=0).start(barrier=True)
     model, optimizer, opt_param_scheduler = setup_model_and_optimizer(
         model_provider, model_type)
     timers('model-and-optimizer-setup').stop()
     print_datetime('after model, optimizer, and learning rate '
                    'scheduler are built')

     # Data stuff.
-    timers('train/valid/test-data-iterators-setup').start()
+    timers('train/valid/test-data-iterators-setup', log_level=0).start(barrier=True)
     if args.virtual_pipeline_model_parallel_size is not None:
         all_data_iterators = [
             build_train_valid_test_data_iterators(train_valid_test_dataset_provider)
             for _ in range(len(model))
         ]
         train_data_iterator = [data_iterators[0] for data_iterators in all_data_iterators]
         valid_data_iterator = [data_iterators[1] for data_iterators in all_data_iterators]
         test_data_iterator = [data_iterators[2] for data_iterators in all_data_iterators]
     else:
         train_data_iterator, valid_data_iterator, test_data_iterator \
             = build_train_valid_test_data_iterators(
 ...
@@ -145,7 +137,8 @@ def pretrain(train_valid_test_dataset_provider,
     # Print setup timing.
     print_rank_0('done with setup ...')
-    timers.log(['model-and-optimizer-setup', 'train/valid/test-data-iterators-setup'])
+    timers.log(['model-and-optimizer-setup', 'train/valid/test-data-iterators-setup'], barrier=True)
     print_rank_0('training ...')
     iteration = 0
 ...
@@ -373,13 +366,9 @@ def setup_model_and_optimizer(model_provider_func,
     if args.load is not None:
         timers = get_timers()
-        # Extra barrier is added to make sure all ranks report the
-        # max time.
-        torch.distributed.barrier()
-        timers('load-checkpoint').start()
+        timers('load-checkpoint', log_level=0).start(barrier=True)
         args.iteration = load_checkpoint(model, optimizer, opt_param_scheduler)
-        torch.distributed.barrier()
-        timers('load-checkpoint').stop()
+        timers('load-checkpoint').stop(barrier=True)
         timers.log(['load-checkpoint'])
     else:
         args.iteration = 0
 ...
@@ -412,19 +401,21 @@ def train_step(forward_step_func, data_iterator,
     optimizer.zero_grad()

     # Forward pass.
+    timers('forward-backward', log_level=1).start(barrier=args.barrier_with_L1_time)
     forward_backward_func = get_forward_backward_func()
+    fwd_bwd_timers = timers if args.timing_log_level > 1 else None
     losses_reduced = forward_backward_func(
         forward_step_func, data_iterator, model,
-        optimizer, timers, forward_only=False)
+        optimizer, fwd_bwd_timers, forward_only=False)
+    timers('forward-backward').stop()

     # Empty unused memory.
     if args.empty_unused_memory_level >= 1:
         torch.cuda.empty_cache()

     # Reduce gradients.
-    timers('backward-reduce-model-grads').start()
     optimizer.reduce_model_grads(args, timers)
-    timers('backward-reduce-model-grads').stop()

     # Vision gradients.
     if args.vision_pretraining and args.vision_pretraining_type == "dino":
 ...
@@ -433,15 +424,13 @@ def train_step(forward_step_func, data_iterator,
         unwrapped_model.cancel_gradients_last_layer(args.curr_iteration)

     # Update parameters.
-    timers('optimizer').start()
+    timers('optimizer', log_level=1).start(barrier=args.barrier_with_L1_time)
     update_successful, grad_norm, num_zeros_in_grad = optimizer.step(args, timers)
     timers('optimizer').stop()

     # Gather params.
     if update_successful:
-        timers('backward-gather-model-params').start()
         optimizer.gather_model_params(args, timers)
-        timers('backward-gather-model-params').stop()

     # Vision momentum.
     if args.vision_pretraining and args.vision_pretraining_type == "dino":
 ...
@@ -511,33 +500,32 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
                                                 nan_iters_key, 0) + int(got_nan)

     # Logging.
-    timers_to_log = []
-
-    def add_to_logging(name):
-        if name in timers.timers:
-            timers_to_log.append(name)
-    add_to_logging('forward-compute')
-    add_to_logging('forward-recv')
-    add_to_logging('forward-send')
-    add_to_logging('forward-backward-send-forward-backward-recv')
-    add_to_logging('backward-compute')
-    add_to_logging('backward-recv')
-    add_to_logging('backward-send')
-    add_to_logging('backward-send-forward-recv')
-    add_to_logging('backward-send-backward-recv')
-    add_to_logging('backward-params-all-reduce')
-    add_to_logging('backward-layernorm-all-reduce')
-    add_to_logging('backward-embedding-all-reduce')
-    add_to_logging('backward-reduce-model-grads')
-    add_to_logging('backward-gather-model-params')
-    add_to_logging('optimizer-copy-to-main-grad')
-    add_to_logging('optimizer-unscale-and-check-inf')
-    add_to_logging('optimizer-clip-main-grad')
-    add_to_logging('optimizer-count-zeros')
-    add_to_logging('optimizer-inner-step')
-    add_to_logging('optimizer-copy-main-to-model-params')
-    add_to_logging('optimizer')
-    add_to_logging('batch-generator')
+    timers_to_log = [
+        'forward-backward',
+        'forward-compute',
+        'backward-compute',
+        'batch-generator',
+        'forward-recv',
+        'forward-send',
+        'backward-recv',
+        'backward-send',
+        'forward-send-forward-recv',
+        'forward-send-backward-recv',
+        'backward-send-forward-recv',
+        'backward-send-backward-recv',
+        'forward-backward-send-forward-backward-recv',
+        'layernorm-grads-all-reduce',
+        'embedding-grads-all-reduce',
+        'grads-all-reduce',
+        'grads-reduce-scatter',
+        'params-all-gather',
+        'optimizer-copy-to-main-grad',
+        'optimizer-unscale-and-check-inf',
+        'optimizer-clip-main-grad',
+        'optimizer-count-zeros',
+        'optimizer-inner-step',
+        'optimizer-copy-main-to-model-params',
+        'optimizer']

     # Calculate batch size.
     batch_size = args.micro_batch_size * args.data_parallel_size * \
 ...
@@ -547,8 +535,12 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
                                          total_loss_dict[skipped_iters_key]

     # Tensorboard values.
-    if writer and (iteration % args.tensorboard_log_interval == 0) and \
-       is_last_rank():
+    # Timer requires all the ranks to call.
+    if args.log_timers_to_tensorboard and \
+       (iteration % args.tensorboard_log_interval == 0):
+        timers.write(timers_to_log, writer, iteration, normalizer=total_iterations)
+    if writer and (iteration % args.tensorboard_log_interval == 0):
         if args.log_learning_rate_to_tensorboard:
             writer.add_scalar('learning-rate', learning_rate, iteration)
             writer.add_scalar('learning-rate vs samples', learning_rate,
 ...
@@ -581,9 +573,6 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
             writer.add_scalar('params-norm', params_norm, iteration)
             writer.add_scalar('params-norm vs samples', params_norm,
                               args.consumed_train_samples)
-        if args.log_timers_to_tensorboard:
-            timers.write(timers_to_log, writer, iteration, normalizer=total_iterations)
         if args.log_memory_to_tensorboard:
             mem_stats = torch.cuda.memory_stats()
             writer.add_scalar(
 ...
@@ -603,7 +592,7 @@ def training_log(loss_dict, total_loss_dict, learning_rate, iteration,
         )

     if iteration % args.log_interval == 0:
-        elapsed_time = timers('interval-time').elapsed()
+        elapsed_time = timers('interval-time').elapsed(barrier=True)
         elapsed_time_per_iteration = elapsed_time / total_iterations
         if writer:
             if args.log_timers_to_tensorboard:
 ...
@@ -653,11 +642,9 @@ def save_checkpoint_and_time(iteration, model, optimizer, opt_param_scheduler):
     timers = get_timers()
     # Extra barrier is added to make sure
     # all ranks report the max time.
-    torch.distributed.barrier()
-    timers('save-checkpoint').start()
+    timers('save-checkpoint', log_level=0).start(barrier=True)
     save_checkpoint(iteration, model, optimizer, opt_param_scheduler)
-    torch.distributed.barrier()
-    timers('save-checkpoint').stop()
+    timers('save-checkpoint').stop(barrier=True)
     timers.log(['save-checkpoint'])
 ...
@@ -681,7 +668,7 @@ def train(forward_step_func, model, optimizer, opt_param_scheduler,
     # Iterations.
     iteration = args.iteration
-    timers('interval-time').start()
+    timers('interval-time', log_level=0).start(barrier=True)
     print_datetime('before the start of training step')
     report_memory_flag = True
     while iteration < args.train_iters:
 ...
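
One consequence of the training_log changes above: Timers.write now gathers times across ranks, so it has to be called by every rank (hence the new "# Timer requires all the ranks to call." branch) instead of only inside the last-rank writer block. The snippet below is a small illustrative sketch of what that call records, reusing the timers object and single-rank NCCL process group from the sketch after megatron/timers.py; the SummaryWriter and the 'runs/timing-demo' log directory are assumptions for the example, not part of the commit.

import torch
from torch.utils.tensorboard import SummaryWriter  # requires the tensorboard package

# Assumes `timers` and the single-rank process group from the earlier sketch.
writer = SummaryWriter(log_dir='runs/timing-demo')  # hypothetical log dir

timers('optimizer', log_level=1).start(barrier=True)
_ = torch.randn(1024, 1024, device='cuda') @ torch.randn(1024, 1024, device='cuda')
timers('optimizer').stop()

# Every rank must call write(); only the per-timer maximum across ranks is
# recorded, as a scalar named '<timer-name>-time' (see Timers.write above).
timers.write(['optimizer'], writer, iteration=1, normalizer=1.0, barrier=True)
writer.close()
torch.distributed.destroy_process_group()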
megatron/utils.py  View file @ 41276b6c
(Same license-header replacement as in megatron/text_generation_server.py above.)

 """General utilities."""
 ...
pretrain_bert.py  View file @ 41276b6c
(Same license-header replacement as in megatron/text_generation_server.py above.)

 """Pretrain BERT"""
 ...
@@ -104,7 +91,7 @@ def forward_step(data_iterator, model):
     timers = get_timers()

     # Get the batch.
-    timers('batch-generator').start()
+    timers('batch-generator', log_level=2).start()
     tokens, types, sentence_order, loss_mask, lm_labels, padding_mask = get_batch(
         data_iterator)
     timers('batch-generator').stop()
 ...
pretrain_gpt.py  View file @ 41276b6c
(Same license-header replacement as in megatron/text_generation_server.py above.)

 """Pretrain GPT"""
 ...
@@ -89,7 +76,7 @@ def forward_step(data_iterator, model):
     timers = get_timers()

     # Get the batch.
-    timers('batch-generator').start()
+    timers('batch-generator', log_level=2).start()
     tokens, labels, loss_mask, attention_mask, position_ids = get_batch(
         data_iterator)
     timers('batch-generator').stop()
 ...
pretrain_ict.py  View file @ 41276b6c
(Same license-header replacement as above; the old header here carried a 2019 copyright.)

 """Pretrain BERT for Inverse Cloze Task"""
 ...
@@ -134,7 +121,7 @@ def forward_step(data_iterator, model):
     timers = get_timers()

     # Get the batch.
-    timers('batch-generator').start()
+    timers('batch-generator', log_level=2).start()
     query_tokens, query_mask, \
     context_tokens, context_mask, context_indices = get_ict_batch(data_iterator)
     timers('batch-generator').stop()
 ...
pretrain_t5.py  View file @ 41276b6c
(Same license-header replacement as in megatron/text_generation_server.py above.)

 """Pretrain T5"""
 ...
@@ -126,7 +113,7 @@ def forward_step(data_iterator, model):
     timers = get_timers()

     # Get the batch.
-    timers('batch generator').start()
+    timers('batch generator', log_level=2).start()
     tokens_enc, tokens_dec, loss_mask, lm_labels, enc_mask, dec_mask, enc_dec_mask \
         = get_batch(data_iterator)
     timers('batch generator').stop()
 ...
pretrain_vision_classify.py  View file @ 41276b6c
(Same license-header replacement as in megatron/text_generation_server.py above.)

 """Pretrain VIT"""
 ...
@@ -77,7 +64,7 @@ def forward_step(data_iterator, model):
     timers = get_timers()

     # Get the batch.
-    timers("batch-generator").start()
+    timers("batch-generator", log_level=2).start()
     (
         images,
         labels,
 ...
pretrain_vision_dino.py  View file @ 41276b6c
(Same license-header replacement as in megatron/text_generation_server.py above.)

 import torch
 import torch.nn.functional as F
 ...
@@ -84,7 +71,7 @@ def forward_step(data_iterator, model):
     timers = get_timers()

     # Get the batch.
-    timers("batch-generator").start()
+    timers("batch-generator", log_level=2).start()
     (
         images,
         labels,
 ...
pretrain_vision_inpaint.py  View file @ 41276b6c
(Same license-header replacement as in megatron/text_generation_server.py above.)

 """Pretrain VIT"""
 ...
@@ -91,7 +78,7 @@ def forward_step(data_iterator, model):
     timers = get_timers()

     # Get the batch.
-    timers("batch-generator").start()
+    timers("batch-generator", log_level=2).start()
     (
         images,
         masks,
 ...
tasks/data_utils.py  View file @ 41276b6c
(Same license-header replacement as in megatron/text_generation_server.py above.)

 """ Tasks data utility."""
 ...
tasks/eval_utils.py  View file @ 41276b6c
(Same license-header replacement as in megatron/text_generation_server.py above.)

 """Evaluation utilities."""
 ...
tasks/finetune_utils.py  View file @ 41276b6c
(Same license-header replacement as in megatron/text_generation_server.py above.)

 """Finetune utilities."""
 ...
@@ -67,7 +54,7 @@ def _cross_entropy_forward_step(batch, model):
     timers = get_timers()

     # Get the batch.
-    timers('batch-generator').start()
+    timers('batch-generator', log_level=2).start()
     try:
         batch_ = next(batch)
     except BaseException:
 ...
@@ -178,7 +165,7 @@ def _train(model, optimizer, opt_param_scheduler, forward_step,
     report_memory_flag = True

     # For each remaining epoch
-    timers('interval-time').start()
+    timers('interval-time', log_level=0).start(barrier=True)
     for epoch in range(start_epoch, args.epochs):
         print_rank_0('working on epoch {} ...'.format(epoch + 1))
 ...
@@ -261,7 +248,7 @@ def finetune(train_valid_datasets_provider, model_provider,
         'batch size scaling is not supported for finetuning'

     # Train and validation data loaders.
-    timers('train/valid/test dataset/dataloder').start()
+    timers('train/valid/test dataset/dataloder', log_level=0).start()
     if args.epochs > 0:
         train_dataset, valid_dataset = train_valid_datasets_provider()
         train_dataloader, valid_dataloader = _build_train_valid_dataloaders(
 ...
@@ -271,21 +258,21 @@ def finetune(train_valid_datasets_provider, model_provider,
     timers('train/valid/test dataset/dataloder').stop()

     # Build calback function.
-    timers('callback function').start()
+    timers('callback function', log_level=0).start()
     end_of_epoch_callback = None
     if end_of_epoch_callback_provider is not None:
         end_of_epoch_callback = end_of_epoch_callback_provider()
     timers('callback function').stop()

     # Build model, optimizer and learning rate scheduler.
-    timers('model and optimizer').start()
+    timers('model and optimizer', log_level=0).start()
     model, optimizer, opt_param_scheduler = setup_model_and_optimizer(model_provider, model_type)
     timers('model and optimizer').stop()

     # If pretrained checkpoint is provided and we have not trained for
     # any iteration (i.e., iteration is zero), then load the pretrained
     # checkpoint.
-    timers('pretrained checkpoint').start()
+    timers('pretrained checkpoint', log_level=0).start(barrier=True)
     if args.iteration == 0 and args.pretrained_checkpoint is not None:
         original_load = args.load
         args.load = args.pretrained_checkpoint
 ...
@@ -302,7 +289,7 @@ def finetune(train_valid_datasets_provider, model_provider,
     # Print setup timing.
     print_rank_0('done with setups ...')
     timers.log(['train/valid/test dataset/dataloder', 'callback function',
-                'model and optimizer', 'pretrained checkpoint'])
+                'model and optimizer', 'pretrained checkpoint'], barrier=True)
     print_rank_0('training ...')

     # Finetune the model.
 ...
tasks/glue/data.py  View file @ 41276b6c
(Same license-header replacement as in megatron/text_generation_server.py above.)

 """GLUE dataset."""
 ...
tasks/glue/finetune.py  View file @ 41276b6c
(Same license-header replacement as in megatron/text_generation_server.py above.)

 """GLUE finetuning/evaluation."""
 ...
tasks/glue/mnli.py  View file @ 41276b6c
(Same license-header replacement as in megatron/text_generation_server.py above.)

 """MNLI dataset."""
 ...
tasks/glue/qqp.py  View file @ 41276b6c
(Same license-header replacement as in megatron/text_generation_server.py above.)

 """QQP dataset."""
 ...