"projects/NeRF/vscode:/vscode.git/clone" did not exist on "3b355d3f9d5f4f5501ff6e76ba4018d83b640087"
finetune_utils.py 9.59 KB
Newer Older
1
# coding=utf-8
# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Finetune utilities."""

import torch

from megatron import get_args, print_rank_0
from megatron import get_timers
from megatron import mpu
from megatron.checkpointing import load_checkpoint
from megatron.checkpointing import save_checkpoint
from megatron.training import evaluate_and_print_results
from megatron.training import setup_model_and_optimizer
from megatron.training import train_step
from megatron.training import training_log
from megatron.utils import check_adlr_autoresume_termination
from megatron.utils import reduce_losses


def process_batch(batch):
    """Process batch and produce inputs for the model."""
    args = get_args()

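    # Move the batch to GPU and cast to the dtypes the model expects.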
    tokens = batch['text'].long().cuda().contiguous()
    types = batch['types'].long().cuda().contiguous()
    labels = batch['label'].long().cuda().contiguous()
    attention_mask = batch['padding_mask'].float().cuda().contiguous()
    if args.fp16:
        attention_mask = attention_mask.half()

    return tokens, types, labels, attention_mask


def _cross_entropy_forward_step(batch, model):
    """Simple forward step with cross-entropy loss."""
    timers = get_timers()

    # Get the batch.
    timers('batch generator').start()
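    # `batch` may be an iterator over batches or an already-fetched batch;
    # if advancing it with next() fails, fall back to using it directly.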
    try:
        batch_ = next(batch)
    except BaseException:
        batch_ = batch
    tokens, types, labels, attention_mask = process_batch(batch_)
    timers('batch generator').stop()

    # Forward model.
    logits = model(tokens, attention_mask, types)

    # Cross-entropy loss.
    loss_func = torch.nn.CrossEntropyLoss()
    loss = loss_func(logits.contiguous().float(), labels)

    # Reduce loss for logging.
    reduced_loss = reduce_losses([loss])

    return loss, {'lm loss': reduced_loss[0]}


def build_data_loader(dataset, batch_size, num_workers, drop_last):
    """Data loader. Note that batch-size is the local (per GPU) batch-size."""

    # Sampler.
    world_size = mpu.get_data_parallel_world_size()
    rank = mpu.get_data_parallel_rank()
    sampler = torch.utils.data.distributed.DistributedSampler(
        dataset, num_replicas=world_size, rank=rank)

    # Data loader. Note that batch size is the per GPU batch size.
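    # Shuffling is handled by the DistributedSampler (via set_epoch), so the
    # data loader itself is created with shuffle=False.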
    data_loader = torch.utils.data.DataLoader(dataset,
                                              batch_size=batch_size,
                                              sampler=sampler,
                                              shuffle=False,
                                              num_workers=num_workers,
                                              drop_last=drop_last,
                                              pin_memory=True)

    return data_loader


def _build_infinite_size_dataloader(dataloader):
    """Build a looped dataloader with infinite size."""

    iterator = iter(dataloader)
    while True:
        try:
            yield next(iterator)
        except StopIteration:
            iterator = iter(dataloader)


def _build_train_valid_dataloaders(train_dataset, valid_dataset):
    """Traing and validation dataloaders."""
    args = get_args()

    print_rank_0('building train and validation dataloaders ...')
    # Training dataset.
    train_dataloader = build_data_loader(train_dataset, args.batch_size,
                                         args.num_workers, not args.keep_last)
    # Set the training iterations.
    args.train_iters_per_epoch = len(train_dataloader)
    args.train_iters = args.epochs * args.train_iters_per_epoch
    # Validation dataset. For this dataset, we do not need to set up
    # shuffling so we can just use a simple infinite loop.
    valid_dataloader_ = build_data_loader(valid_dataset, args.batch_size,
                                          args.num_workers, not args.keep_last)
    valid_dataloader = _build_infinite_size_dataloader(valid_dataloader_)

    return train_dataloader, valid_dataloader


def _train(model, optimizer, lr_scheduler, forward_step,
           train_dataloader, valid_dataloader, end_of_epoch_callback):
    """Train the model."""
    args = get_args()
    timers = get_timers()

    # Turn on training mode which enables dropout.
    model.train()

    # Tracking loss.
    losses_dict_sum = {}

    # Starting epoch and iteration
    start_epoch = args.iteration // args.train_iters_per_epoch
    start_iteration = args.iteration % args.train_iters_per_epoch
    iteration = args.iteration

    # Memory reporting flag.
    report_memory_flag = True

    # For each remaining epoch
    timers('interval time').start()
    for epoch in range(start_epoch, args.epochs):
        print_rank_0('working on epoch {} ...'.format(epoch + 1))

        # Set the data loader epoch to shuffle the index iterator.
        train_dataloader.sampler.set_epoch(args.seed + epoch)

        # For all the batches in the dataset.
        for iteration_, batch in enumerate(train_dataloader):

            # Ignore the iterations before the starting value.
            if iteration_ < start_iteration:
                continue
            # Set to zero so the next epoch does not skip any batches.
            start_iteration = 0

            # Train for one step.
            losses_dict, _ = train_step(forward_step, batch, model,
                                        optimizer, lr_scheduler)
            iteration += 1

            # Logging.
            report_memory_flag = training_log(losses_dict, losses_dict_sum,
                                              optimizer.param_groups[0]['lr'],
                                              iteration, optimizer.loss_scale,
                                              report_memory_flag)

            # Autoresume
            if args.adlr_autoresume and \
               (iteration % args.adlr_autoresume_interval == 0):
                check_adlr_autoresume_termination(iteration, model,
                                                  optimizer, lr_scheduler)

            # Checkpointing
            if args.save and args.save_interval and \
               iteration % args.save_interval == 0:
                save_checkpoint(iteration, model, optimizer, lr_scheduler)

            # Evaluation
            if args.eval_interval and iteration % args.eval_interval == 0:
                prefix = 'iteration {}'.format(iteration)
                evaluate_and_print_results(prefix, forward_step,
                                           valid_dataloader, model,
                                           iteration, False)

        # Checkpointing at the end of each epoch.
        if args.save:
            save_checkpoint(iteration, model, optimizer, lr_scheduler)

        # Callback at the end of each epoch.
        if end_of_epoch_callback is not None:
            end_of_epoch_callback(model, epoch)


def finetune(train_valid_datasets_provider, model_provider,
             forward_step=_cross_entropy_forward_step,
             end_of_epoch_callback_provider=None):
    """Main finetune function used across all tasks."""
    args = get_args()
    timers = get_timers()

    # Train and validation data loaders.
    timers('train/valid/test dataset/dataloader').start()
    if args.epochs > 0:
        train_dataset, valid_dataset = train_valid_datasets_provider()
        train_dataloader, valid_dataloader = _build_train_valid_dataloaders(
            train_dataset, valid_dataset)
    timers('train/valid/test dataset/dataloader').stop()

    # Build callback function.
    timers('callback function').start()
    end_of_epoch_callback = None
    if end_of_epoch_callback_provider is not None:
        end_of_epoch_callback = end_of_epoch_callback_provider()
    timers('callback function').stop()

    # Build model, optimizer and learning rate scheduler.
    timers('model and optimizer').start()
    model, optimizer, lr_scheduler = setup_model_and_optimizer(model_provider)
    timers('model and optimizer').stop()

    # If pretrained checkpoint is provided and we have not trained for
    # any iteration (i.e., iteration is zero), then load the pretrained
    # checkpoint.
    timers('pretrained checkpoint').start()
    if args.iteration == 0 and args.pretrained_checkpoint is not None:
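        # Temporarily point args.load at the pretrained checkpoint so that
        # load_checkpoint() reads from it, then restore the original value.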
        original_load = args.load
        args.load = args.pretrained_checkpoint
        _ = load_checkpoint(model, None, None)
        args.load = original_load
        # This is critical when only the model is loaded. We should make sure
        # the master parameters are also updated.
        if args.fp16:
            optimizer._model_params_to_master_params()
    timers('pretrained checkpoint').stop()

    # Print setup timing.
    print_rank_0('done with setups ...')
    timers.log(['train/valid/test dataset/dataloader', 'callback function',
                'model and optimizer', 'pretrained checkpoint'])
    print_rank_0('training ...')

    # Finetune the model.
    if args.epochs > 0:
        _train(model, optimizer, lr_scheduler, forward_step,
               train_dataloader, valid_dataloader, end_of_epoch_callback)
    # Or just evaluate.
    else:
        if end_of_epoch_callback is not None:
            print_rank_0('evaluation only mode, setting epoch to -1')
            end_of_epoch_callback(model, epoch=-1, output_predictions=True)

    print_rank_0('done :-)')