"vscode:/vscode.git/clone" did not exist on "4c605235aa832f259e148dfbdce08d9e471b5099"
Commit 5988d2cc authored by yuguo960516

bert-large

parent 478602ba
from libai.config import LazyCall
from libai.models import VisionTransformer
from .vit_tiny_patch16_224 import cfg
cfg.patch_size = 16
cfg.embed_dim = 384
cfg.num_heads = 6
model = LazyCall(VisionTransformer)(cfg=cfg)
from omegaconf import DictConfig
from libai.config import LazyCall
from libai.models import VisionTransformer
from .vit_tiny_patch16_224 import cfg
cfg.patch_size = 32
cfg.embed_dim = 384
cfg.num_heads = 6
model = LazyCall(VisionTransformer)(cfg=cfg)
from omegaconf import DictConfig
from libai.config import LazyCall
from libai.models import VisionTransformer
cfg = dict(
    img_size=224,
    patch_size=16,
    in_chans=3,
    embed_dim=192,
    depth=12,
    num_heads=3,
    mlp_ratio=4.0,
    drop_rate=0.0,
    attn_drop_rate=0.0,
    drop_path_rate=0.0,
    num_classes=1000,
    loss_func=None,
)
cfg = DictConfig(cfg)
model = LazyCall(VisionTransformer)(cfg=cfg)
import oneflow as flow
from libai.optim import get_default_optimizer_params
from libai.config import LazyCall
optim = LazyCall(flow.optim.AdamW)(
    params=LazyCall(get_default_optimizer_params)(
        # params.model is meant to be set to the model object
        # before instantiating the optimizer (see the sketch below).
        clip_grad_max_norm=1.0,
        clip_grad_norm_type=2.0,
        weight_decay_norm=0.0,
        weight_decay_bias=0.0,
    ),
    lr=1e-4,
    weight_decay=0.01,
    betas=(0.9, 0.999),
    eps=1e-8,
    do_bias_correction=True,
)
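
# ------------------------------------------------------------------------------
# Illustrative sketch (not part of the original config): how `params.model` is
# consumed. `_build_optimizer` is a hypothetical helper showing the eager
# equivalent of the lazy config above, assuming `get_default_optimizer_params`
# receives the instantiated module through its `model` argument as the comment
# states; in practice the training entry point is expected to assign
# `optim.params.model` before the optimizer is instantiated.
def _build_optimizer(module):
    # Group parameters exactly as the LazyCall above would, then pass the
    # groups to AdamW with the same hyperparameters.
    param_groups = get_default_optimizer_params(
        model=module,
        clip_grad_max_norm=1.0,
        clip_grad_norm_type=2.0,
        weight_decay_norm=0.0,
        weight_decay_bias=0.0,
    )
    return flow.optim.AdamW(
        param_groups,
        lr=1e-4,
        weight_decay=0.01,
        betas=(0.9, 0.999),
        eps=1e-8,
        do_bias_correction=True,
    )
# ------------------------------------------------------------------------------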
from omegaconf import DictConfig
from libai.config import LazyCall
from libai.scheduler import WarmupCosineLR
from libai.evaluation import ClsEvaluator
# fmt: off
train = dict(
    # Directory where output files are written
    output_dir="./output",
    # `train_micro_batch_size` is the number of samples per batch on each GPU.
    # train_mini_batch_size = train_micro_batch_size * num_accumulation_steps.
    # This is also the number of training samples per step (i.e. per iteration) on a single GPU.
    # If we use 8 GPUs for data-parallel groups, `train_micro_batch_size = 2` and
    # `num_accumulation_steps = 4`, then each GPU sees 2 samples per batch and
    # 8 samples per iteration, so 64 samples in total are consumed per iteration
    # across all GPUs (see the worked sketch after this config).
    # global_batch_size = micro_batch_size * num_grad_acc * data_parallel_groups
    train_micro_batch_size=32,
    global_batch_size=None,
    num_accumulation_steps=None,
    # The total number of training iterations
    train_iter=10000,
    # The total number of training epochs; it is scaled to training iterations automatically.
    # The actual total number of training iterations is calculated by the
    # formula `max(train_iter, train_epoch * iter_per_epoch)`.
    train_epoch=0,
    consumed_train_samples=0,
    consumed_valid_samples=0,
    train_samples=None,
    # Fraction of lr-warmup-iters to use for warmup (as a float)
    warmup_ratio=0,
    # The start iteration; usually there is no need to set it manually.
    # It is computed automatically when resuming training.
    start_iter=0,
    # Enable automatic mixed precision for training; this does not
    # change the model's inference behavior.
    amp=dict(enabled=False),
    # Enable activation checkpointing to allow training
    # with larger models, sequences, and batch sizes.
    # If enabled, the input activations of each transformer layer are checkpointed by default.
    activation_checkpoint=dict(enabled=False),
    # NCCL fusion threshold in megabytes; set to 0 to be
    # compatible with previous versions of OneFlow.
    nccl_fusion_threshold_mb=16,
    # Maximum number of ops per NCCL fusion; set to 0 to be
    # compatible with previous versions of OneFlow.
    nccl_fusion_max_ops=24,
    # Enable ZeRO optimization to allow training with larger models.
    # This optimization reduces the memory consumed by optimizer states,
    # as described in ZeRO: https://arxiv.org/abs/1910.02054.
    zero_optimization=dict(
        enabled=False,
        stage=1,
    ),
    # Save a model checkpoint every `period` iterations and keep
    # at most `max_to_keep` checkpoints.
    checkpointer=dict(period=5000, max_to_keep=100, save_model_after_n_epoch=None),
    # Options for evaluation
    # `test_micro_batch_size` is the number of samples per batch on each GPU for testing.
    # If we use 8 GPUs for data-parallel groups and `test_micro_batch_size = 2`, then
    # 16 samples in total are used per iteration across all GPUs.
    test_micro_batch_size=32,
    # Enable evaluation during training; the evaluation process runs
    # every `eval_period` iterations.
    # You can set the maximum number of evaluation iterations to run for validation/test.
    # You can also set a customized evaluator.
    evaluation=dict(
        enabled=False,
        # evaluator for calculating top-k accuracy
        evaluator=LazyCall(ClsEvaluator)(topk=(1, 5)),
        eval_period=50000,
        eval_after_n_epoch=None,
        eval_iter=1e5,  # running steps for validation/test
        # Metric used to select the best model checkpoint.
        eval_metric="Acc@1",
        eval_mode="max",
    ),
    # Path to a checkpoint file to be loaded into the model for training or evaluation.
    load_weight="",
    # Log to the console every `log_period` iterations.
    log_period=20,
    # lr_scheduler arguments
    # See libai/scheduler/lr_scheduler.py for the definition.
    scheduler=LazyCall(WarmupCosineLR)(
        # In DefaultTrainer, `max_iter` and `warmup_iter` are set
        # automatically from the given train cfg.
        warmup_factor=0.001,
        alpha=0.01,
        warmup_method="linear",
    ),
    # Distributed arguments
    # See https://libai.readthedocs.io/en/latest/tutorials/Getting%20Started.html for more detail.
    dist=dict(
        data_parallel_size=1,
        tensor_parallel_size=1,
        pipeline_parallel_size=1,
        # Users must set the `pipeline_num_layers` attribute when `pipeline_parallel_size > 1`.
        pipeline_num_layers=None,
        # Users can customize the number of layers in different stages
        # by setting the `custom_pipeline_stage_id` attribute, which is used to
        # manually balance computation across stages when running pipeline parallelism.
        # E.g. you can set `custom_pipeline_stage_id=[0, 0, 0, 1]`
        # for `pipeline_num_layers=4` and `pipeline_parallel_size=2`,
        # which means the first 3 layers are placed on stage 0 and
        # the last layer is placed on stage 1.
        # NOTE: if it is None, LiBai sets the pipeline stage ids automatically;
        # `auto_pipeline_stage_id` and `actual_pipeline_stage_id` are saved in `config.yaml`.
        custom_pipeline_stage_id=None,
    ),
    # The device type of the model's input tensors, defaults to "cuda".
    # If you want to accelerate model training when pipeline_parallel > 1,
    # you can set `input_placement_device="cpu"` and then call input_tensor.to_global()
    # inside your model.forward() method;
    # see `libai/models/bert_model.py` as a reference.
    input_placement_device="cuda",
    # Set to `True` to enable RDMA to improve the speed of pipeline parallelism.
    rdma_enabled=True,
    # Set seed to a positive value to use a fixed seed. Note that a fixed seed increases
    # reproducibility but does not guarantee fully deterministic behavior.
    # Disabling all parallelism further increases reproducibility.
    seed=1234,
)
# fmt: on
train = DictConfig(train)
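
# ------------------------------------------------------------------------------
# Worked sketch (not part of the original config): the batch-size arithmetic
# described in the comments above, using the documented example of 8 data-parallel
# GPUs with `train_micro_batch_size = 2` and `num_accumulation_steps = 4`.
_micro_batch_size = 2
_num_accumulation_steps = 4
_data_parallel_size = 8
# Samples seen by one GPU per iteration (micro batch x gradient accumulation).
_mini_batch_size = _micro_batch_size * _num_accumulation_steps  # 8
# Samples consumed per iteration across all data-parallel ranks.
_global_batch_size = _mini_batch_size * _data_parallel_size  # 64
assert _global_batch_size == 64
# ------------------------------------------------------------------------------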
from libai.config import LazyCall
from libai.evaluation import PPLEvaluator
from .common.models.gpt import pretrain_model as model
from .common.train import train
from .common.optim import optim
from .common.data.gpt_dataset import dataloader, tokenization
from .common.models.graph import graph
vocab_file = "./nlp_data/gpt2-vocab.json"
merge_files = "./nlp_data/gpt2-merges.txt"
data_prefix = "./nlp_data/data/loss_compara_content_sentence"
tokenization.tokenizer.vocab_file = vocab_file
tokenization.tokenizer.merges_file = merge_files
dataloader.train.dataset[0].data_prefix = data_prefix
dataloader.train.dataset[0].indexed_dataset.data_prefix = data_prefix
# GPT-2 model config
model.cfg.embedding_dropout_prob = 0.1
model.cfg.attention_dropout_prob = 0.1
model.cfg.num_attention_heads = 16
model.cfg.hidden_size = 384
model.cfg.ffn_hidden_size = 1536
model.cfg.hidden_layers = 6
model.cfg.max_seq_length = 1024
train.input_placement_device = "cpu"
train.dist.pipeline_num_layers = model.cfg.hidden_layers
for ds in dataloader.train.dataset:
    ds.max_seq_length = model.cfg.max_seq_length
optim.lr = 1.5e-4
train.train_micro_batch_size = 4
train.amp.enabled = True
train.evaluation.evaluator = LazyCall(PPLEvaluator)()
train.output_dir = "./output/gpt2_output"
from libai.config import LazyCall
from .common.models.resmlp.resmlp_12 import model
from .common.models.graph import graph
from .common.train import train
from .common.optim import optim
from .common.data.imagenet import dataloader
import oneflow as flow
import flowvision.transforms as transforms
from flowvision.transforms import InterpolationMode
from flowvision.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
from flowvision.data import Mixup
from flowvision.loss.cross_entropy import SoftTargetCrossEntropy
# Refine output dir
train.output_dir = "./output_resmlp"
# Refine data path to imagenet
dataloader.train.dataset[0].root = "/path/to/imagenet"
dataloader.test[0].dataset.root = "/path/to/imagenet"
# Refine test data augmentation for resmlp model
resmlp_test_aug = LazyCall(transforms.Compose)(
    transforms=[
        LazyCall(transforms.Resize)(
            size=int(224 / 0.9),
            interpolation=InterpolationMode.BICUBIC,
        ),
        LazyCall(transforms.CenterCrop)(
            size=224,
        ),
        LazyCall(transforms.ToTensor)(),
        LazyCall(transforms.Normalize)(
            mean=IMAGENET_DEFAULT_MEAN,
            std=IMAGENET_DEFAULT_STD,
        ),
    ]
)
dataloader.test[0].dataset.transform = resmlp_test_aug
# Refine model cfg for resmlp training on imagenet
model.cfg.num_classes = 1000
model.cfg.loss_func = SoftTargetCrossEntropy()
# Add Mixup Func
dataloader.train.mixup_func = LazyCall(Mixup)(
    mixup_alpha=0.8,
    cutmix_alpha=1.0,
    prob=1.0,
    switch_prob=0.5,
    mode="batch",
    num_classes=model.cfg.num_classes,
)
# Refine optimizer cfg for resmlp model
optim._target_ = flow.optim.LAMB # use lamb optimizer
optim.lr = 5e-3  # the default total batch size is 256 * 8 = 2048
optim.eps = 1e-8
optim.weight_decay = 0.2
optim.params.clip_grad_max_norm = None
optim.params.clip_grad_norm_type = None
optim.params.overrides = {
    "alpha": {"weight_decay": 0.0},
    "beta": {"weight_decay": 0.0},
    "gamma_1": {"weight_decay": 0.0},
    "gamma_2": {"weight_decay": 0.0},
}
# Refine train cfg for resmlp model
train.train_micro_batch_size = 256
train.test_micro_batch_size = 64
train.train_epoch = 400
train.warmup_ratio = 5 / 400
train.evaluation.eval_period = 1000
train.log_period = 1
# Scheduler
train.scheduler.warmup_factor = 0.001
train.scheduler.alpha = 0.01
train.scheduler.warmup_method = "linear"
# Set fp16 ON
train.amp.enabled = True
# Distributed Settings
train.dist.pipeline_num_layers = model.cfg.depth
train.dist.data_parallel_size = 1
train.dist.tensor_parallel_size = 1
train.dist.pipeline_parallel_size = 1
from libai.config import LazyCall
from libai.evaluation import PPLEvaluator
from .common.models.roberta import pretrain_model as model
from .common.models.graph import graph
from .common.train import train
from .common.optim import optim
from .common.data.roberta_dataset import dataloader, tokenization
vocab_file = "./data_test/roberta_data/roberta-vocab.json"
merge_files = "./data_test/roberta_data/roberta-merges.txt"
data_prefix = "./data_test/roberta_data/loss_compara_content_sentence"
tokenization.tokenizer.vocab_file = vocab_file
tokenization.tokenizer.merges_file = merge_files
dataloader.train.dataset[0].data_prefix = data_prefix
dataloader.train.dataset[0].indexed_dataset.data_prefix = data_prefix
# RoBERTa model config
model.cfg.num_attention_heads = 12
model.cfg.hidden_size = 768
model.cfg.hidden_layers = 8
train.input_placement_device = "cpu"
# parallel strategy settings
train.dist.data_parallel_size = 8
train.dist.tensor_parallel_size = 1
train.dist.pipeline_parallel_size = 1
train.dist.pipeline_num_layers = model.cfg.hidden_layers
train.train_micro_batch_size = 2
train.amp.enabled = True
for ds in dataloader.train.dataset:
    ds.max_seq_length = model.cfg.max_position_embeddings
train.evaluation.evaluator = LazyCall(PPLEvaluator)()
train.output_dir = "output/roberta_output"
from libai.config import LazyCall
from .common.models.swin.swin_tiny_patch4_window7_224 import model
from .common.models.graph import graph
from .common.train import train
from .common.optim import optim
from .common.data.cifar100 import dataloader
from flowvision.data import Mixup
from flowvision.loss.cross_entropy import SoftTargetCrossEntropy
# Add Mixup Func
dataloader.train.mixup_func = LazyCall(Mixup)(
    mixup_alpha=0.8,
    cutmix_alpha=1.0,
    prob=1.0,
    switch_prob=0.5,
    mode="batch",
    num_classes=100,
)
# Refine model cfg for swin training on cifar100
model.cfg.num_classes = 100
model.cfg.loss_func = SoftTargetCrossEntropy()
# Refine optimizer cfg for swin model
optim.lr = 5e-4
optim.eps = 1e-8
optim.weight_decay = 0.05
optim.params.clip_grad_max_norm = None
optim.params.clip_grad_norm_type = None
# Refine train cfg for swin model
train.train_micro_batch_size = 32
train.num_accumulation_steps = 1
train.test_micro_batch_size = 32
train.train_epoch = 300
train.warmup_ratio = 20 / 300
train.evaluation.eval_period = 200
train.log_period = 20
# Scheduler
train.scheduler.warmup_factor = 5e-7
train.scheduler.alpha = 0.0
train.scheduler.warmup_method = "linear"
# parallel strategy settings
train.dist.data_parallel_size = 8
train.dist.tensor_parallel_size = 1
train.dist.pipeline_parallel_size = 1
train.dist.pipeline_num_layers = sum(model.cfg.depths)
train.output_dir = "./output"
# Set fp16 OFF
train.amp.enabled = False
train.activation_checkpoint.enabled = False
# train.zero_optimization.enabled = True
# train.zero_optimization.stage = 1
graph.enabled = False
from libai.config import LazyCall
from .common.models.swin.swin_tiny_patch4_window7_224 import model
from .common.models.graph import graph
from .common.train import train
from .common.optim import optim
from .common.data.imagenet import dataloader
from flowvision.data import Mixup
from flowvision.loss.cross_entropy import SoftTargetCrossEntropy
# Refine data path to imagenet
dataloader.train.dataset[0].root = "/path/to/imagenet"
dataloader.test[0].dataset.root = "/path/to/imagenet"
# Add Mixup Func
dataloader.train.mixup_func = LazyCall(Mixup)(
    mixup_alpha=0.8,
    cutmix_alpha=1.0,
    prob=1.0,
    switch_prob=0.5,
    mode="batch",
    num_classes=1000,
)
# Refine model cfg for swin training on imagenet
model.cfg.num_classes = 1000
model.cfg.loss_func = SoftTargetCrossEntropy()
# Refine optimizer cfg for swin model
optim.lr = 1e-3
optim.eps = 1e-8
optim.weight_decay = 0.05
optim.params.clip_grad_max_norm = None
optim.params.clip_grad_norm_type = None
# Refine train cfg for swin model
train.train_micro_batch_size = 128
train.test_micro_batch_size = 128
train.train_epoch = 300
train.warmup_ratio = 20 / 300
train.evaluation.eval_period = 1562
train.log_period = 100
# Scheduler
train.scheduler.warmup_factor = 0.001
train.scheduler.alpha = 0.01
train.scheduler.warmup_method = "linear"
# Set fp16 ON
train.amp.enabled = True
from libai.config import LazyCall
from .common.models.swinv2.swinv2_tiny_patch4_window8_256 import model
from .common.models.graph import graph
from .common.train import train
from .common.optim import optim
from .common.data.cifar100 import dataloader
from flowvision import transforms
from flowvision.transforms import InterpolationMode
from flowvision.transforms.functional import str_to_interp_mode
from flowvision.data import Mixup
from flowvision.loss.cross_entropy import SoftTargetCrossEntropy
CIFAR100_TRAIN_MEAN = (0.5070751592371323, 0.48654887331495095, 0.4409178433670343)
CIFAR100_TRAIN_STD = (0.2673342858792401, 0.2564384629170883, 0.27615047132568404)
# Add Mixup Func
dataloader.train.mixup_func = LazyCall(Mixup)(
    mixup_alpha=0.8,
    cutmix_alpha=1.0,
    prob=1.0,
    switch_prob=0.5,
    mode="batch",
    num_classes=100,
)
dataloader.train.dataset[0].transform = LazyCall(transforms.Compose)(
    transforms=[
        LazyCall(transforms.RandomResizedCrop)(
            size=(256, 256),
            scale=(0.08, 1.0),
            ratio=(3.0 / 4.0, 4.0 / 3.0),
            interpolation=str_to_interp_mode("bicubic"),
        ),
        LazyCall(transforms.RandomHorizontalFlip)(),
        LazyCall(transforms.ToTensor)(),
        LazyCall(transforms.Normalize)(mean=CIFAR100_TRAIN_MEAN, std=CIFAR100_TRAIN_STD),
    ]
)
dataloader.test[0].dataset.transform = LazyCall(transforms.Compose)(
    transforms=[
        LazyCall(transforms.Resize)(
            size=256,
            interpolation=InterpolationMode.BICUBIC,
        ),
        LazyCall(transforms.CenterCrop)(
            size=256,
        ),
        LazyCall(transforms.ToTensor)(),
        LazyCall(transforms.Normalize)(
            mean=CIFAR100_TRAIN_MEAN,
            std=CIFAR100_TRAIN_STD,
        ),
    ]
)
# Refine model cfg for swinv2 training on cifar100
model.cfg.num_classes = 100
model.cfg.loss_func = SoftTargetCrossEntropy()
# Refine optimizer cfg for swinv2 model
optim.lr = 5e-4
optim.eps = 1e-8
optim.weight_decay = 0.05
def check_keywords_in_name(name, keywords=()):
    # Return True if any of the given keywords appears in the parameter name.
    isin = False
    for keyword in keywords:
        if keyword in name:
            isin = True
    return isin
def set_weight_decay(model, skip_list=(), skip_keywords=()):
    # Split parameters into two groups: weight decay is disabled for 1-D
    # parameters, biases, names in skip_list, and names matching skip_keywords.
    has_decay = []
    no_decay = []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue  # frozen weights
        if (
            len(param.shape) == 1
            or name.endswith(".bias")
            or (name in skip_list)
            or check_keywords_in_name(name, skip_keywords)
        ):
            no_decay.append(param)
        else:
            has_decay.append(param)
    return [{"params": has_decay}, {"params": no_decay, "weight_decay": 0.0}]
optim.params = LazyCall(set_weight_decay)(
    model=model,
    skip_list=("absolute_pos_embed"),
    skip_keywords=("cpb_mlp", "logit_scale", "relative_position_bias_table"),
)
# Refine train cfg for swinv2 model
train.train_micro_batch_size = 32
train.num_accumulation_steps = 8
train.test_micro_batch_size = 32
train.train_epoch = 300
train.warmup_ratio = 20 / 300
train.evaluation.eval_period = 1562
train.log_period = 10
# Scheduler
train.scheduler.warmup_factor = 5e-7
train.scheduler.alpha = 0.0
train.scheduler.warmup_method = "linear"
# parallel strategy settings
train.dist.data_parallel_size = 1
train.dist.tensor_parallel_size = 1
train.dist.pipeline_parallel_size = 1
train.dist.pipeline_num_layers = sum(model.cfg.depths)
train.output_dir = "./output"
train.rdma_enabled = False
# Set fp16 OFF
train.amp.enabled = False
train.activation_checkpoint.enabled = False
# train.zero_optimization.enabled = True
# train.zero_optimization.stage = 1
graph.enabled = False
from libai.config import LazyCall
from .common.models.swinv2.swinv2_tiny_patch4_window8_256 import model
from .common.models.graph import graph
from .common.train import train
from .common.optim import optim
from .common.data.imagenet import dataloader
from flowvision import transforms
from flowvision.data import Mixup
from flowvision.loss.cross_entropy import SoftTargetCrossEntropy
from flowvision.transforms import InterpolationMode
from flowvision.transforms.functional import str_to_interp_mode
from flowvision.data.constants import (
    IMAGENET_DEFAULT_MEAN,
    IMAGENET_DEFAULT_STD,
)
from flowvision.data.auto_augment import rand_augment_transform
from flowvision.data.random_erasing import RandomErasing
# Refine data path to imagenet
dataloader.train.dataset[0].root = "/path/to/imagenet"
dataloader.test[0].dataset.root = "/path/to/imagenet"
# Add Mixup Func
dataloader.train.mixup_func = LazyCall(Mixup)(
    mixup_alpha=0.8,
    cutmix_alpha=1.0,
    prob=1.0,
    switch_prob=0.5,
    mode="batch",
    num_classes=1000,
)
dataloader.train.dataset[0].transform = LazyCall(transforms.Compose)(
    transforms=[
        LazyCall(transforms.RandomResizedCrop)(
            size=256,
            scale=(0.08, 1.0),
            ratio=(3.0 / 4.0, 4.0 / 3.0),
            interpolation=InterpolationMode.BICUBIC,
        ),
        LazyCall(transforms.RandomHorizontalFlip)(p=0.5),
        LazyCall(rand_augment_transform)(
            config_str="rand-m9-mstd0.5-inc1",
            hparams=dict(
                translate_const=int(256 * 0.45),
                img_mean=tuple([min(255, round(255 * x)) for x in IMAGENET_DEFAULT_MEAN]),
                interpolation=str_to_interp_mode("bicubic"),
            ),
        ),
        LazyCall(transforms.ToTensor)(),
        LazyCall(transforms.Normalize)(
            mean=IMAGENET_DEFAULT_MEAN,
            std=IMAGENET_DEFAULT_STD,
        ),
        LazyCall(RandomErasing)(
            probability=0.25,
            mode="pixel",
            max_count=1,
            num_splits=0,
            device="cpu",
        ),
    ]
)
dataloader.test[0].dataset.transform = LazyCall(transforms.Compose)(
    transforms=[
        LazyCall(transforms.Resize)(
            size=256,
            interpolation=InterpolationMode.BICUBIC,
        ),
        LazyCall(transforms.CenterCrop)(
            size=256,
        ),
        LazyCall(transforms.ToTensor)(),
        LazyCall(transforms.Normalize)(
            mean=IMAGENET_DEFAULT_MEAN,
            std=IMAGENET_DEFAULT_STD,
        ),
    ]
)
# Refine model cfg for swinv2 training on imagenet
model.cfg.num_classes = 1000
model.cfg.loss_func = SoftTargetCrossEntropy()
# Refine optimizer cfg for swinv2 model
optim.lr = 1e-3  # the PyTorch version uses a total batch size of 1024 with a learning rate of 1e-3
optim.eps = 1e-8
optim.weight_decay = 0.05
def check_keywords_in_name(name, keywords=()):
    # Return True if any of the given keywords appears in the parameter name.
    isin = False
    for keyword in keywords:
        if keyword in name:
            isin = True
    return isin
def set_weight_decay(model, skip_list=(), skip_keywords=()):
    # Split parameters into two groups: weight decay is disabled for 1-D
    # parameters, biases, names in skip_list, and names matching skip_keywords.
    has_decay = []
    no_decay = []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue  # frozen weights
        if (
            len(param.shape) == 1
            or name.endswith(".bias")
            or (name in skip_list)
            or check_keywords_in_name(name, skip_keywords)
        ):
            no_decay.append(param)
        else:
            has_decay.append(param)
    return [{"params": has_decay}, {"params": no_decay, "weight_decay": 0.0}]
optim.params = LazyCall(set_weight_decay)(
    model=model,
    skip_list=("absolute_pos_embed"),
    skip_keywords=("cpb_mlp", "logit_scale", "relative_position_bias_table"),
)
# Refine train cfg for swinv2 model
train.train_micro_batch_size = 128
train.test_micro_batch_size = 128
train.train_epoch = 300
train.warmup_ratio = 20 / 300
train.evaluation.eval_period = 1562
train.log_period = 100
graph.enabled = False
train.rdma_enabled = True
# Scheduler
train.scheduler.warmup_factor = 0.001
train.scheduler.alpha = 0.01
train.scheduler.warmup_method = "linear"
# Set fp16 ON
train.amp.enabled = True
from libai.config import LazyCall
from libai.evaluation import PPLEvaluator
from .common.models.t5 import pretrain_model as model
from .common.train import train
from .common.optim import optim
from .common.data.t5_dataset import dataloader, tokenization
from .common.models.graph import graph
vocab_file = "./data_test/bert_data/bert-base-chinese-vocab.txt"
data_prefix = "./data_test/bert_data/loss_compara_content_sentence"
tokenization.tokenizer.vocab_file = vocab_file
dataloader.train.dataset[0].data_prefix = data_prefix
dataloader.train.dataset[0].indexed_dataset.data_prefix = data_prefix
# T5 model config
model.cfg.num_attention_heads = 12
model.cfg.hidden_size = 384
model.cfg.hidden_layers = 6
train.input_placement_device = "cpu"
train.dist.pipeline_num_layers = 2 * model.cfg.hidden_layers
train.train_micro_batch_size = 16
train.amp.enabled = True
train.evaluation.evaluator = LazyCall(PPLEvaluator)()
train.output_dir = "./output/t5_output"
from libai.config import LazyCall
from .common.models.vit.vit_base_patch16_224 import model
from .common.models.graph import graph
from .common.train import train
from .common.optim import optim
from .common.data.imagenet import dataloader
from flowvision.data import Mixup
from flowvision.loss.cross_entropy import SoftTargetCrossEntropy
# Refine data path to imagenet
dataloader.train.dataset[0].root = "/path/to/imagenet"
dataloader.test[0].dataset.root = "/path/to/imagenet"
# Refine model cfg for vit training on imagenet
model.cfg.num_classes = 1000
model.cfg.loss_func = SoftTargetCrossEntropy()
# Add Mixup Func
dataloader.train.mixup_func = LazyCall(Mixup)(
    mixup_alpha=0.8,
    cutmix_alpha=1.0,
    prob=1.0,
    switch_prob=0.5,
    mode="batch",
    num_classes=model.cfg.num_classes,
)
# Refine optimizer cfg for vit model
optim.lr = 1e-3  # 5e-4 * 1024 (batch size) / 512; see the check below
optim.eps = 1e-8
optim.weight_decay = 0.05
optim.params.clip_grad_max_norm = None
optim.params.clip_grad_norm_type = None
optim.params.overrides = {"pos_embed": {"weight_decay": 0.0}, "cls_token": {"weight_decay": 0.0}}
# Refine train cfg for vit model
train.train_micro_batch_size = 128
train.test_micro_batch_size = 128
train.train_epoch = 300
train.warmup_ratio = 5 / 300
train.evaluation.eval_period = 1000
train.log_period = 1
# Scheduler
train.scheduler.warmup_factor = 0.001
train.scheduler.alpha = 0.01
train.scheduler.warmup_method = "linear"
# Set fp16 ON
train.amp.enabled = True
# Distributed Settings
train.dist.pipeline_num_layers = model.cfg.depth
train.dist.data_parallel_size = 1
train.dist.tensor_parallel_size = 1
train.dist.pipeline_parallel_size = 1
# Some scripts for developers to use, including:
- `linter.sh`: lint the codebase before committing.
- `run_unittest.sh`: run the unit tests for the codebase.
#!/bin/bash -e
# cd to libai project root
cd "$(dirname "${BASH_SOURCE[0]}")/.."
export TEST_OUTPUT=output_unittest
export ONEFLOW_TEST_DEVICE_NUM=4
python3 -m oneflow.distributed.launch --nproc_per_node 4 -m pytest -s --disable-warnings tests/inference/test_text_generation.py
python3 -m oneflow.distributed.launch --nproc_per_node 4 -m pytest -s --disable-warnings tests/inference/test_text_classification.py
python3 -m oneflow.distributed.launch --nproc_per_node 4 -m pytest -s --disable-warnings tests/inference/test_image_classification.py
rm -rf $TEST_OUTPUT
#!/bin/bash -e
# cd to libai project root
cd "$(dirname "${BASH_SOURCE[0]}")/.."
{
  black --version | grep -E "21\." > /dev/null
} || {
  echo "Linter requires 'black==21.*' !"
  exit 1
}
ISORT_VERSION=$(isort --version-number)
if [[ "$ISORT_VERSION" != 5.10.1 ]]; then
echo "Linter requires isort==5.10.1 !"
exit 1
fi
set -v
echo "Running autoflake ..."
autoflake --remove-unused-variables --in-place --recursive .
echo "Running isort ..."
isort . --atomic
echo "Running black ..."
black -l 100 .
echo "Running flake8 ..."
if [ -x "$(command -v flake8-3)" ]; then
flake8-3 .
else
python3 -m flake8 .
fi
echo "Running clang-format ..."
find . -regex ".*\.\(cpp\|c\|cc\|cu\|cxx\|h\|hh\|hpp\|hxx\|tcc\|mm\|m\)" -print0 | xargs -0 clang-format -i
command -v arc > /dev/null && arc lint
#!/bin/bash -e
# cd to libai project root
cd "$(dirname "${BASH_SOURCE[0]}")/.."
export TEST_OUTPUT=output_unittest
export ONEFLOW_TEST_DEVICE_NUM=4
export ONEFLOW_EP_CUDA_ENABLE_TF32_EXECUTION=0
python3 -m oneflow.distributed.launch --nproc_per_node 4 -m pytest -s --disable-warnings tests/model_utils/test_bert_loader.py
python3 -m oneflow.distributed.launch --nproc_per_node 4 -m pytest -s --disable-warnings tests/model_utils/test_roberta_loader.py
python3 -m oneflow.distributed.launch --nproc_per_node 4 -m pytest -s --disable-warnings tests/model_utils/test_gpt_loader.py
python3 -m oneflow.distributed.launch --nproc_per_node 4 -m pytest -s --disable-warnings tests/model_utils/test_mt5_loader.py
python3 -m oneflow.distributed.launch --nproc_per_node 4 -m pytest -s --disable-warnings tests/model_utils/test_t5_loader.py
python3 -m oneflow.distributed.launch --nproc_per_node 4 -m pytest -s --disable-warnings tests/model_utils/test_swin_loader.py
python3 -m oneflow.distributed.launch --nproc_per_node 4 -m pytest -s --disable-warnings tests/model_utils/test_swinv2_loader.py
python3 -m oneflow.distributed.launch --nproc_per_node 4 -m pytest -s --disable-warnings tests/model_utils/test_vit_loader.py
rm -rf $TEST_OUTPUT
#!/bin/bash -e
# cd to libai project root
cd "$(dirname "${BASH_SOURCE[0]}")/.."
export TEST_OUTPUT=output_unittest
export ONEFLOW_TEST_DEVICE_NUM=4
python3 -m oneflow.distributed.launch --nproc_per_node 4 -m pytest -s --disable-warnings tests/models/test_bert.py
python3 -m oneflow.distributed.launch --nproc_per_node 4 -m pytest -s --disable-warnings tests/models/test_roberta.py
python3 -m oneflow.distributed.launch --nproc_per_node 4 -m pytest -s --disable-warnings tests/models/test_gpt.py
python3 -m oneflow.distributed.launch --nproc_per_node 4 -m pytest -s --disable-warnings tests/models/test_t5.py
python3 -m oneflow.distributed.launch --nproc_per_node 4 -m pytest -s --disable-warnings tests/models/test_mt5.py
python3 -m oneflow.distributed.launch --nproc_per_node 4 -m pytest -s --disable-warnings tests/models/test_vit.py
python3 -m oneflow.distributed.launch --nproc_per_node 4 -m pytest -s --disable-warnings tests/models/test_swin.py
python3 -m oneflow.distributed.launch --nproc_per_node 4 -m pytest -s --disable-warnings tests/models/test_swinv2.py
rm -rf $TEST_OUTPUT
#!/bin/bash -e
# cd to libai project root
cd "$(dirname "${BASH_SOURCE[0]}")/.."
pytest --disable-warnings ./tests