# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from omegaconf import OmegaConf
from flowvision.data import Mixup

# from flowvision.loss.cross_entropy import SoftTargetCrossEntropy

from libai.config import LazyCall, get_config
from modeling.cross_entropy import SoftTargetCrossEntropy
from configs.models.vit_base_patch16 import model
from utils.scheduler import (
    warmup_layerscale_cosine_lr_scheduler,
    warmup_cosine_lr_scheduler,
)
from utils.lr_decay import param_groups_lrd

# Get train, optim and graph configs
train = get_config("common/train.py").train
optim = get_config("common/optim.py").optim
graph = get_config("common/models/graph.py").graph
dataloader = get_config("common/data/imagenet.py").dataloader

# Number of devices
n_gpus = 8

# Graph training
graph.enabled = True

# Refine model config for ViT training on ImageNet
model.num_classes = 1000
model.loss_func = LazyCall(SoftTargetCrossEntropy)()

# Path to the pretrained weight for fine-tuning
finetune = OmegaConf.create()
finetune.enable = True  # only load the weight if enable is True
finetune.weight_style = (
    "oneflow"  # Set "oneflow" for loading OneFlow weights, set "pytorch" for loading PyTorch weights
)
finetune.path = "/path/to/pretrained_mae_weight"

# Refine data path to ImageNet
dataloader.train.dataset[0].root = "/path/to/imagenet"
dataloader.test[0].dataset.root = "/path/to/imagenet"

# Add Mixup func
dataloader.train.mixup_func = LazyCall(Mixup)(
    mixup_alpha=0.8,
    cutmix_alpha=1.0,
    prob=1.0,
    switch_prob=0.5,
    mode="batch",
    label_smoothing=0.1,
    num_classes=model.num_classes,
)

# Refine training settings for MAE fine-tuning
train.train_micro_batch_size = 32
train.num_accumulation_steps = 4
train.test_micro_batch_size = 32
effective_batch_size = train.train_micro_batch_size * train.num_accumulation_steps * n_gpus
train.train_epoch = 100
train.warmup_ratio = 5 / 100
train.log_period = 20
train.evaluation.eval_after_n_epoch = 1
train.checkpointer.save_model_after_n_epoch = 1

# Set layer-wise learning rate decay for MAE fine-tuning
train.layer_decay = 0.65

# AMP
train.amp.enabled = True

# The base learning rate for MAE fine-tuning is set to 5e-4.
# The actual learning rate is computed by the linear scaling rule:
#     lr = base_lr * batch_size / 256
# In LiBai, you should adjust the actual learning rate according to your own settings.
# Here we use 8 GPUs with an effective batch size of 128 per GPU
# (micro-batch 32 x 4 accumulation steps), so the global batch size equals 1024.
# A worked sanity check follows the scheduler section below.
base_lr = 5e-4
actual_lr = base_lr * effective_batch_size / 256

# Refine optim settings
optim.params._target_ = param_groups_lrd
optim.params.weight_decay = 0.05
optim.params.layer_decay = 0.65
optim.lr = actual_lr
del optim.params.clip_grad_max_norm
del optim.params.clip_grad_norm_type
del optim.params.weight_decay_norm
del optim.params.weight_decay_bias
del optim.weight_decay

# Refine scheduler
if graph.enabled:
    train.scheduler = LazyCall(warmup_cosine_lr_scheduler)(
        warmup_factor=0.0,
        min_lr=1e-6,
    )
else:
    train.scheduler = LazyCall(warmup_layerscale_cosine_lr_scheduler)(
        warmup_factor=0.0,
        min_lr=1e-6,
    )
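
# Worked sanity check for the linear scaling rule above (a minimal sketch, not part
# of the original config; it only uses values already defined in this file):
#     effective_batch_size = 32 * 4 * 8 = 1024
#     actual_lr = 5e-4 * 1024 / 256 = 2e-3
# Uncomment to assert the expected values at config-load time:
# assert effective_batch_size == 1024
# assert abs(actual_lr - 2e-3) < 1e-9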

# Distributed Settings
train.dist.pipeline_num_layers = model.depth
train.dist.data_parallel_size = n_gpus
train.dist.tensor_parallel_size = 1
train.dist.pipeline_parallel_size = 1

eval_only = False
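
# A minimal usage sketch (not part of the original config), assuming LiBai exposes a
# detectron2-style LazyConfig loader and that the config is loaded from within the
# project directory so the relative imports above resolve; the config path below is
# hypothetical:
#
#     from libai.config import LazyConfig
#
#     cfg = LazyConfig.load("projects/MAE/configs/mae_finetune.py")
#     print(cfg.train.train_micro_batch_size)  # 32
#     print(cfg.optim.lr)                      # 2e-3 after linear scaling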