import datetime
import os
import time
import warnings

import presets
import torch
import torch.utils.data
import torchvision
import torchvision.transforms
import utils
from sampler import RASampler
from torch import nn
from torch.utils.data.dataloader import default_collate
from torchvision.transforms.functional import InterpolationMode
from transforms import get_mixup_cutmix


def train_one_epoch(model, criterion, optimizer, data_loader, device, epoch, args, model_ema=None, scaler=None):
    model.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter("lr", utils.SmoothedValue(window_size=1, fmt="{value}"))
    metric_logger.add_meter("img/s", utils.SmoothedValue(window_size=10, fmt="{value}"))

    header = f"Epoch: [{epoch}]"
    for i, (image, target) in enumerate(metric_logger.log_every(data_loader, args.print_freq, header)):
        start_time = time.time()
        image, target = image.to(device), target.to(device)
        with torch.cuda.amp.autocast(enabled=scaler is not None):
            output = model(image)
            loss = criterion(output, target)

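        # Backward pass: with AMP enabled, GradScaler multiplies the loss by a scale
        # factor before backward() so that small fp16 gradients don't flush to zero;
        # gradients are unscaled again before clipping and before the optimizer step.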
        optimizer.zero_grad()
        if scaler is not None:
            scaler.scale(loss).backward()
            if args.clip_grad_norm is not None:
                # we need to unscale the gradients of the optimizer's assigned params before clipping them
                scaler.unscale_(optimizer)
                nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad_norm)
            scaler.step(optimizer)
            scaler.update()
        else:
            loss.backward()
            if args.clip_grad_norm is not None:
                nn.utils.clip_grad_norm_(model.parameters(), args.clip_grad_norm)
            optimizer.step()

        if model_ema and i % args.model_ema_steps == 0:
            model_ema.update_parameters(model)
            if epoch < args.lr_warmup_epochs:
                # Reset ema buffer to keep copying weights during warmup period
                model_ema.n_averaged.fill_(0)

        acc1, acc5 = utils.accuracy(output, target, topk=(1, 5))
        batch_size = image.shape[0]
        metric_logger.update(loss=loss.item(), lr=optimizer.param_groups[0]["lr"])
        metric_logger.meters["acc1"].update(acc1.item(), n=batch_size)
        metric_logger.meters["acc5"].update(acc5.item(), n=batch_size)
        metric_logger.meters["img/s"].update(batch_size / (time.time() - start_time))


def evaluate(model, criterion, data_loader, device, print_freq=100, log_suffix=""):
    model.eval()
    metric_logger = utils.MetricLogger(delimiter="  ")
    header = f"Test: {log_suffix}"

    num_processed_samples = 0
    with torch.inference_mode():
        for image, target in metric_logger.log_every(data_loader, print_freq, header):
            image = image.to(device, non_blocking=True)
            target = target.to(device, non_blocking=True)
            output = model(image)
            loss = criterion(output, target)

            acc1, acc5 = utils.accuracy(output, target, topk=(1, 5))
            # FIXME need to take into account that the datasets
            # could have been padded in distributed setup
            batch_size = image.shape[0]
            metric_logger.update(loss=loss.item())
            metric_logger.meters["acc1"].update(acc1.item(), n=batch_size)
            metric_logger.meters["acc5"].update(acc5.item(), n=batch_size)
            num_processed_samples += batch_size
    # gather the stats from all processes

    num_processed_samples = utils.reduce_across_processes(num_processed_samples)
    if (
        hasattr(data_loader.dataset, "__len__")
        and len(data_loader.dataset) != num_processed_samples
        and utils.is_main_process()  # safe even when torch.distributed is not initialized, unlike get_rank()
    ):
        # See FIXME above
        warnings.warn(
            f"It looks like the dataset has {len(data_loader.dataset)} samples, but {num_processed_samples} "
            "samples were used for the validation, which might bias the results. "
            "Try adjusting the batch size and / or the world size. "
            "Setting the world size to 1 is always a safe bet."
        )

    metric_logger.synchronize_between_processes()

    print(f"{header} Acc@1 {metric_logger.acc1.global_avg:.3f} Acc@5 {metric_logger.acc5.global_avg:.3f}")
    return metric_logger.acc1.global_avg


def _get_cache_path(filepath):
    import hashlib

    h = hashlib.sha1(filepath.encode()).hexdigest()
    cache_path = os.path.join("~", ".torch", "vision", "datasets", "imagefolder", h[:10] + ".pt")
    cache_path = os.path.expanduser(cache_path)
    return cache_path
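

# For illustration (hypothetical input): _get_cache_path("/data/imagenet/train") hashes
# the directory path with SHA-1 and maps it to a per-path cache file such as
# "~/.torch/vision/datasets/imagefolder/<first-10-hex-chars>.pt" in the user's home.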


def load_data(traindir, valdir, args):
    # Data loading code
    print("Loading data")
    val_resize_size, val_crop_size, train_crop_size = (
        args.val_resize_size,
        args.val_crop_size,
        args.train_crop_size,
    )
    interpolation = InterpolationMode(args.interpolation)

    print("Loading training data")
    st = time.time()
    cache_path = _get_cache_path(traindir)
    if args.cache_dataset and os.path.exists(cache_path):
        # Careful: the transforms are cached as well, so the cached ones override any transform-related args!
        print(f"Loading dataset_train from {cache_path}")
        # TODO: this could probably be weights_only=True
        dataset, _ = torch.load(cache_path, weights_only=False)
    else:
        # We need a default value for the variables below because args may come
        # from train_quantization.py which doesn't define them.
        auto_augment_policy = getattr(args, "auto_augment", None)
        random_erase_prob = getattr(args, "random_erase", 0.0)
        ra_magnitude = getattr(args, "ra_magnitude", None)
        augmix_severity = getattr(args, "augmix_severity", None)
        dataset = torchvision.datasets.ImageFolder(
            traindir,
            presets.ClassificationPresetTrain(
                crop_size=train_crop_size,
                interpolation=interpolation,
                auto_augment_policy=auto_augment_policy,
                random_erase_prob=random_erase_prob,
                ra_magnitude=ra_magnitude,
                augmix_severity=augmix_severity,
                backend=args.backend,
                use_v2=args.use_v2,
            ),
        )
        if args.cache_dataset:
            print(f"Saving dataset_train to {cache_path}")
            utils.mkdir(os.path.dirname(cache_path))
            utils.save_on_master((dataset, traindir), cache_path)
    print("Took", time.time() - st)

    print("Loading validation data")
    cache_path = _get_cache_path(valdir)
    if args.cache_dataset and os.path.exists(cache_path):
        # Careful: the transforms are cached as well, so the cached ones override any transform-related args!
        print(f"Loading dataset_test from {cache_path}")
        # TODO: this could probably be weights_only=True
        dataset_test, _ = torch.load(cache_path, weights_only=False)
    else:
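        # When evaluating pretrained weights, reuse the transforms bundled with the
        # weights enum so that preprocessing matches what those weights expect.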
        if args.weights and args.test_only:
            weights = torchvision.models.get_weight(args.weights)
            preprocessing = weights.transforms(antialias=True)
            if args.backend == "tensor":
                preprocessing = torchvision.transforms.Compose([torchvision.transforms.PILToTensor(), preprocessing])

        else:
            preprocessing = presets.ClassificationPresetEval(
                crop_size=val_crop_size,
                resize_size=val_resize_size,
                interpolation=interpolation,
                backend=args.backend,
                use_v2=args.use_v2,
            )

        dataset_test = torchvision.datasets.ImageFolder(
            valdir,
            preprocessing,
        )
        if args.cache_dataset:
            print(f"Saving dataset_test to {cache_path}")
            utils.mkdir(os.path.dirname(cache_path))
            utils.save_on_master((dataset_test, valdir), cache_path)

    print("Creating data loaders")
    if args.distributed:
        if hasattr(args, "ra_sampler") and args.ra_sampler:
            train_sampler = RASampler(dataset, shuffle=True, repetitions=args.ra_reps)
        else:
            train_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
        test_sampler = torch.utils.data.distributed.DistributedSampler(dataset_test, shuffle=False)
    else:
        train_sampler = torch.utils.data.RandomSampler(dataset)
        test_sampler = torch.utils.data.SequentialSampler(dataset_test)

    return dataset, dataset_test, train_sampler, test_sampler


def main(args):
    if args.output_dir:
        utils.mkdir(args.output_dir)

    utils.init_distributed_mode(args)
    print(args)

    device = torch.device(args.device)

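    # cudnn.benchmark autotunes convolution kernels for fixed input shapes (faster,
    # but the kernel selection is nondeterministic), so it stays off when
    # deterministic algorithms are requested.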
    if args.use_deterministic_algorithms:
        torch.backends.cudnn.benchmark = False
        torch.use_deterministic_algorithms(True)
    else:
        torch.backends.cudnn.benchmark = True

    train_dir = os.path.join(args.data_path, "train")
    val_dir = os.path.join(args.data_path, "val")
    dataset, dataset_test, train_sampler, test_sampler = load_data(train_dir, val_dir, args)

    num_classes = len(dataset.classes)
    mixup_cutmix = get_mixup_cutmix(
        mixup_alpha=args.mixup_alpha, cutmix_alpha=args.cutmix_alpha, num_categories=num_classes, use_v2=args.use_v2
    )
    if mixup_cutmix is not None:

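        # MixUp/CutMix mix whole batches, so they are applied inside the collate
        # function, after default_collate has stacked the samples into tensors.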
        def collate_fn(batch):
            return mixup_cutmix(*default_collate(batch))

    else:
        collate_fn = default_collate

    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=args.batch_size,
        sampler=train_sampler,
        num_workers=args.workers,
        pin_memory=True,
        collate_fn=collate_fn,
    )
    data_loader_test = torch.utils.data.DataLoader(
        dataset_test, batch_size=args.batch_size, sampler=test_sampler, num_workers=args.workers, pin_memory=True
    )

    print("Creating model")
    model = torchvision.models.get_model(args.model, weights=args.weights, num_classes=num_classes)
    model.to(device)

    if args.distributed and args.sync_bn:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)

    criterion = nn.CrossEntropyLoss(label_smoothing=args.label_smoothing)

    custom_keys_weight_decay = []
    if args.bias_weight_decay is not None:
        custom_keys_weight_decay.append(("bias", args.bias_weight_decay))
    if args.transformer_embedding_decay is not None:
        for key in ["class_token", "position_embedding", "relative_position_bias_table"]:
            custom_keys_weight_decay.append((key, args.transformer_embedding_decay))
    parameters = utils.set_weight_decay(
        model,
        args.weight_decay,
        norm_weight_decay=args.norm_weight_decay,
        custom_keys_weight_decay=custom_keys_weight_decay if len(custom_keys_weight_decay) > 0 else None,
    )
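    # set_weight_decay builds optimizer parameter groups so that norm layers, biases,
    # and transformer embedding tables can each receive a weight decay different from
    # the rest of the model.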

    opt_name = args.opt.lower()
    if opt_name.startswith("sgd"):
        optimizer = torch.optim.SGD(
            parameters,
            lr=args.lr,
            momentum=args.momentum,
            weight_decay=args.weight_decay,
            nesterov="nesterov" in opt_name,
        )
    elif opt_name == "rmsprop":
        optimizer = torch.optim.RMSprop(
            parameters, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay, eps=0.0316, alpha=0.9
        )
    elif opt_name == "adamw":
        optimizer = torch.optim.AdamW(parameters, lr=args.lr, weight_decay=args.weight_decay)
    else:
        raise RuntimeError(f"Invalid optimizer {args.opt}. Only SGD, RMSprop and AdamW are supported.")

    scaler = torch.cuda.amp.GradScaler() if args.amp else None

    args.lr_scheduler = args.lr_scheduler.lower()
    if args.lr_scheduler == "steplr":
        main_lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma)
    elif args.lr_scheduler == "cosineannealinglr":
        main_lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, T_max=args.epochs - args.lr_warmup_epochs, eta_min=args.lr_min
        )
    elif args.lr_scheduler == "exponentiallr":
        main_lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=args.lr_gamma)
    else:
        raise RuntimeError(
            f"Invalid lr scheduler '{args.lr_scheduler}'. Only StepLR, CosineAnnealingLR and ExponentialLR "
            "are supported."
        )

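    # Warmup composes with the main scheduler via SequentialLR: e.g. with the linear
    # method and lr_warmup_epochs=5, LinearLR scales the lr from lr * lr_warmup_decay
    # up to lr over the first 5 epochs, after which the main scheduler takes over.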
    if args.lr_warmup_epochs > 0:
        if args.lr_warmup_method == "linear":
            warmup_lr_scheduler = torch.optim.lr_scheduler.LinearLR(
                optimizer, start_factor=args.lr_warmup_decay, total_iters=args.lr_warmup_epochs
            )
        elif args.lr_warmup_method == "constant":
            warmup_lr_scheduler = torch.optim.lr_scheduler.ConstantLR(
                optimizer, factor=args.lr_warmup_decay, total_iters=args.lr_warmup_epochs
            )
        else:
            raise RuntimeError(
                f"Invalid warmup lr method '{args.lr_warmup_method}'. Only linear and constant are supported."
            )
        lr_scheduler = torch.optim.lr_scheduler.SequentialLR(
            optimizer, schedulers=[warmup_lr_scheduler, main_lr_scheduler], milestones=[args.lr_warmup_epochs]
        )
    else:
        lr_scheduler = main_lr_scheduler

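    # Keep a handle on the underlying module: DDP wraps the model, while checkpointing
    # and EMA need the unwrapped state_dict.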
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module

    model_ema = None
    if args.model_ema:
        # Decay adjustment that aims to keep the decay independent of other hyper-parameters, as originally proposed at:
        # https://github.com/facebookresearch/pycls/blob/f8cd9627/pycls/core/net.py#L123
        #
        # total_ema_updates = (Dataset_size / n_GPUs) * epochs / (batch_size_per_gpu * EMA_steps)
        # We consider constant = Dataset_size for a given dataset/setup and omit it. Thus:
        # adjust = 1 / total_ema_updates ~= n_GPUs * batch_size_per_gpu * EMA_steps / epochs
        adjust = args.world_size * args.batch_size * args.model_ema_steps / args.epochs
        alpha = 1.0 - args.model_ema_decay
        alpha = min(1.0, alpha * adjust)
        model_ema = utils.ExponentialMovingAverage(model_without_ddp, device=device, decay=1.0 - alpha)
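        # Worked example with hypothetical values: world_size=8, batch_size=32,
        # model_ema_steps=32 and epochs=600 give adjust = 8 * 32 * 32 / 600 ~= 13.65,
        # so alpha = min(1.0, 2e-5 * 13.65) ~= 2.73e-4 and the effective decay becomes
        # ~0.99973 instead of the raw 0.99998.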

    if args.resume:
        checkpoint = torch.load(args.resume, map_location="cpu", weights_only=True)
        model_without_ddp.load_state_dict(checkpoint["model"])
        if not args.test_only:
            optimizer.load_state_dict(checkpoint["optimizer"])
            lr_scheduler.load_state_dict(checkpoint["lr_scheduler"])
        args.start_epoch = checkpoint["epoch"] + 1
        if model_ema:
            model_ema.load_state_dict(checkpoint["model_ema"])
        if scaler:
            scaler.load_state_dict(checkpoint["scaler"])

    if args.test_only:
        # We disable cudnn benchmarking because it can noticeably affect the accuracy
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True
        if model_ema:
            evaluate(model_ema, criterion, data_loader_test, device=device, log_suffix="EMA")
        else:
            evaluate(model, criterion, data_loader_test, device=device)
        return

    print("Start training")
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        train_one_epoch(model, criterion, optimizer, data_loader, device, epoch, args, model_ema, scaler)
        lr_scheduler.step()
        evaluate(model, criterion, data_loader_test, device=device)
        if model_ema:
            evaluate(model_ema, criterion, data_loader_test, device=device, log_suffix="EMA")
        if args.output_dir:
            checkpoint = {
                "model": model_without_ddp.state_dict(),
                "optimizer": optimizer.state_dict(),
                "lr_scheduler": lr_scheduler.state_dict(),
                "epoch": epoch,
                "args": args,
            }
            if model_ema:
                checkpoint["model_ema"] = model_ema.state_dict()
            if scaler:
                checkpoint["scaler"] = scaler.state_dict()
            utils.save_on_master(checkpoint, os.path.join(args.output_dir, f"model_{epoch}.pth"))
            utils.save_on_master(checkpoint, os.path.join(args.output_dir, "checkpoint.pth"))

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print(f"Training time {total_time_str}")


def get_args_parser(add_help=True):
    import argparse

    parser = argparse.ArgumentParser(description="PyTorch Classification Training", add_help=add_help)

    parser.add_argument("--data-path", default="/datasets01/imagenet_full_size/061417/", type=str, help="dataset path")
    parser.add_argument("--model", default="resnet18", type=str, help="model name")
    parser.add_argument("--device", default="cuda", type=str, help="device to use (cuda or cpu; default: cuda)")
    parser.add_argument(
        "-b", "--batch-size", default=32, type=int, help="images per GPU; the total batch size is $NGPU x batch_size"
    )
    parser.add_argument("--epochs", default=90, type=int, metavar="N", help="number of total epochs to run")
    parser.add_argument(
        "-j", "--workers", default=16, type=int, metavar="N", help="number of data loading workers (default: 16)"
    )
    parser.add_argument("--opt", default="sgd", type=str, help="optimizer")
    parser.add_argument("--lr", default=0.1, type=float, help="initial learning rate")
    parser.add_argument("--momentum", default=0.9, type=float, metavar="M", help="momentum")
    parser.add_argument(
        "--wd",
        "--weight-decay",
        default=1e-4,
        type=float,
        metavar="W",
        help="weight decay (default: 1e-4)",
        dest="weight_decay",
    )
    parser.add_argument(
        "--norm-weight-decay",
        default=None,
        type=float,
        help="weight decay for Normalization layers (default: None, same value as --wd)",
    )
    parser.add_argument(
        "--bias-weight-decay",
        default=None,
        type=float,
        help="weight decay for bias parameters of all layers (default: None, same value as --wd)",
    )
    parser.add_argument(
        "--transformer-embedding-decay",
        default=None,
        type=float,
        help="weight decay for embedding parameters for vision transformer models (default: None, same value as --wd)",
    )
    parser.add_argument(
        "--label-smoothing", default=0.0, type=float, help="label smoothing (default: 0.0)", dest="label_smoothing"
    )
    parser.add_argument("--mixup-alpha", default=0.0, type=float, help="mixup alpha (default: 0.0)")
    parser.add_argument("--cutmix-alpha", default=0.0, type=float, help="cutmix alpha (default: 0.0)")
    parser.add_argument("--lr-scheduler", default="steplr", type=str, help="the lr scheduler (default: steplr)")
    parser.add_argument("--lr-warmup-epochs", default=0, type=int, help="the number of epochs to warmup (default: 0)")
    parser.add_argument(
        "--lr-warmup-method", default="constant", type=str, help="the warmup method (default: constant)"
    )
    parser.add_argument("--lr-warmup-decay", default=0.01, type=float, help="the lr multiplier applied during warmup")
    parser.add_argument("--lr-step-size", default=30, type=int, help="decrease lr every step-size epochs")
    parser.add_argument("--lr-gamma", default=0.1, type=float, help="decrease lr by a factor of lr-gamma")
    parser.add_argument("--lr-min", default=0.0, type=float, help="minimum lr of lr schedule (default: 0.0)")
    parser.add_argument("--print-freq", default=10, type=int, help="print frequency")
    parser.add_argument("--output-dir", default=".", type=str, help="path to save outputs")
    parser.add_argument("--resume", default="", type=str, help="path of checkpoint")
    parser.add_argument("--start-epoch", default=0, type=int, metavar="N", help="start epoch")
    parser.add_argument(
        "--cache-dataset",
        dest="cache_dataset",
        help="Cache the datasets for quicker initialization. It also serializes the transforms",
        action="store_true",
    )
    parser.add_argument(
        "--sync-bn",
        dest="sync_bn",
        help="Use sync batch norm",
        action="store_true",
    )
    parser.add_argument(
        "--test-only",
        dest="test_only",
        help="Only test the model",
        action="store_true",
    )
    parser.add_argument("--auto-augment", default=None, type=str, help="auto augment policy (default: None)")
    parser.add_argument("--ra-magnitude", default=9, type=int, help="magnitude of auto augment policy")
    parser.add_argument("--augmix-severity", default=3, type=int, help="severity of augmix policy")
    parser.add_argument("--random-erase", default=0.0, type=float, help="random erasing probability (default: 0.0)")

    # Mixed precision training parameters
    parser.add_argument("--amp", action="store_true", help="Use torch.cuda.amp for mixed precision training")

    # distributed training parameters
    parser.add_argument("--world-size", default=1, type=int, help="number of distributed processes")
    parser.add_argument("--dist-url", default="env://", type=str, help="url used to set up distributed training")
    parser.add_argument(
        "--model-ema", action="store_true", help="enable tracking Exponential Moving Average of model parameters"
    )
    parser.add_argument(
        "--model-ema-steps",
        type=int,
        default=32,
        help="the number of iterations that controls how often to update the EMA model (default: 32)",
    )
    parser.add_argument(
        "--model-ema-decay",
        type=float,
        default=0.99998,
        help="decay factor for Exponential Moving Average of model parameters (default: 0.99998)",
    )
    parser.add_argument(
        "--use-deterministic-algorithms", action="store_true", help="Forces the use of deterministic algorithms only."
    )
    parser.add_argument(
        "--interpolation", default="bilinear", type=str, help="the interpolation method (default: bilinear)"
    )
    parser.add_argument(
        "--val-resize-size", default=256, type=int, help="the resize size used for validation (default: 256)"
    )
    parser.add_argument(
        "--val-crop-size", default=224, type=int, help="the central crop size used for validation (default: 224)"
    )
    parser.add_argument(
        "--train-crop-size", default=224, type=int, help="the random crop size used for training (default: 224)"
    )
    parser.add_argument("--clip-grad-norm", default=None, type=float, help="the maximum gradient norm (default None)")
    parser.add_argument("--ra-sampler", action="store_true", help="whether to use Repeated Augmentation in training")
    parser.add_argument(
        "--ra-reps", default=3, type=int, help="number of repetitions for Repeated Augmentation (default: 3)"
    )
    parser.add_argument("--weights", default=None, type=str, help="the weights enum name to load")
    parser.add_argument("--backend", default="PIL", type=str.lower, help="PIL or tensor - case insensitive")
    parser.add_argument("--use-v2", action="store_true", help="Use V2 transforms")
    return parser


if __name__ == "__main__":
    args = get_args_parser().parse_args()
    main(args)
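
# Example invocation (illustrative paths; assumes a single node with 8 GPUs):
#   torchrun --nproc_per_node=8 train.py --data-path /path/to/imagenet --model resnet50
# utils.init_distributed_mode reads the RANK/WORLD_SIZE/LOCAL_RANK environment
# variables that torchrun sets.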