from omegaconf import DictConfig

from libai.config import LazyCall
from libai.scheduler import WarmupCosineLR
from libai.evaluation import ClsEvaluator

# fmt: off
train = dict(
    # Directory where output files are written
    output_dir="./output",

    # `train_micro_batch_size` is the number of samples per batch on each GPU.
    # train_mini_batch_size = train_micro_batch_size * num_accumulation_steps.
    # This is also the number of training samples per step (i.e. per iteration)
    # on each GPU. If we use 8 GPUs for the data parallel groups,
    # `train_micro_batch_size = 2` and `num_accumulation_steps = 4`, then each GPU
    # sees 2 samples per batch and 8 samples per iteration, so 64 samples in total
    # are trained per iteration across all GPUs:
    # global_batch_size = micro_batch_size * num_grad_acc * data_parallel_groups
    train_micro_batch_size=32,
    global_batch_size=None,
    num_accumulation_steps=None,

    # The total number of training iterations
    train_iter=10000,
    # The total number of training epochs, scaled to training iterations automatically.
    # The actual total number of training iterations is calculated by the
    # formula `max(train_iter, train_epoch * iter_per_epoch)`.
    train_epoch=0,
    consumed_train_samples=0,
    consumed_valid_samples=0,
    train_samples=None,

    # Fraction of the total training iterations to use for LR warmup (as a float)
    warmup_ratio=0,

    # The start iteration; usually there is no need to set it manually.
    # It is computed automatically when resuming training.
    start_iter=0,

    # Enable automatic mixed precision for training, which does not
    # change the model's inference behavior.
    amp=dict(enabled=False),

    # Enable activation checkpointing to allow training
    # with larger models, sequences, and batch sizes.
    # If enabled, the input activations of each transformer layer are
    # checkpointed by default.
    activation_checkpoint=dict(enabled=False),

    # NCCL fusion threshold in megabytes; set to 0 to stay
    # compatible with previous versions of OneFlow.
    nccl_fusion_threshold_mb=16,

    # Maximum number of ops in an NCCL fusion; set to 0 to stay
    # compatible with previous versions of OneFlow.
    nccl_fusion_max_ops=24,

    # Enable ZeRO optimization to allow training larger models.
    # This optimization reduces optimizer-state memory consumption
    # as described in the ZeRO paper: https://arxiv.org/abs/1910.02054.
    zero_optimization=dict(
        enabled=False,
        stage=1,
    ),

    # Save a model checkpoint every `period` iterations;
    # at most `max_to_keep` checkpoints are kept.
    checkpointer=dict(period=5000, max_to_keep=100, save_model_after_n_epoch=None),

    # Options for evaluation.
    # `test_micro_batch_size` is the number of samples per batch on each GPU for testing.
    # If we use 8 GPUs for the data parallel groups and `test_micro_batch_size = 2`,
    # then 16 samples in total are used per iteration across all GPUs.
    test_micro_batch_size=32,

    # Enable evaluation during training; the evaluation process runs
    # every `eval_period` iterations.
    # You can set the maximum number of evaluation iterations to run for
    # validation/test, and you can also set a customized evaluator.
    evaluation=dict(
        enabled=True,
        # Evaluator for calculating top-k accuracy
        evaluator=LazyCall(ClsEvaluator)(topk=(1, 5)),
        eval_period=5000,
        eval_after_n_epoch=None,
        eval_iter=1e5,  # running iterations for validation/test
        # Metric used to select the best model checkpoint.
        eval_metric="Acc@1",
        eval_mode="max",
    ),

    # Path to a checkpoint file to be loaded into the model for training or evaluation.
    load_weight="",

    # Write a log to the console every `log_period` iterations.
    log_period=20,

    # lr_scheduler arguments
    # See libai/scheduler/lr_scheduler.py for the definition.
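    # NOTE (an assumption about DefaultTrainer's behavior, for illustration):
    # the warmup length is typically derived from `warmup_ratio` above, roughly
    # `warmup_iter = warmup_ratio * train_iter`, so only the scheduler's shape
    # parameters need to be set here.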
    scheduler=LazyCall(WarmupCosineLR)(
        # In DefaultTrainer, `max_iter` and `warmup_iter` are set
        # automatically from the given train cfg.
        warmup_factor=0.001,
        alpha=0.01,
        warmup_method="linear",
    ),

    # Distributed arguments
    # See https://libai.readthedocs.io/en/latest/tutorials/Getting%20Started.html for more detail.
    dist=dict(
        data_parallel_size=1,
        tensor_parallel_size=1,
        pipeline_parallel_size=1,
        # Users must set the `pipeline_num_layers` attribute when `pipeline_parallel_size > 1`.
        pipeline_num_layers=None,
        # Users can customize the number of layers in different stages by setting
        # the `custom_pipeline_stage_id` attribute, which is used to manually
        # balance computation between stages when running pipeline parallelism.
        # E.g. you can set `custom_pipeline_stage_id=[0, 0, 0, 1]`
        # for `pipeline_num_layers=4` and `pipeline_parallel_size=2`,
        # which means the first 3 layers are placed on stage 0 and
        # the last layer is placed on stage 1.
        # NOTE: if it is None, LiBai sets the pipeline stage ids automatically;
        # `auto_pipeline_stage_id` and `actual_pipeline_stage_id` are saved in `config.yaml`.
        custom_pipeline_stage_id=None,
    ),

    # The device type of the model's input tensors; defaults to "cuda".
    # If you want to accelerate model training when pipeline_parallel_size > 1,
    # you can set `input_placement_device="cpu"` and then call input_tensor.to_global()
    # inside your model.forward() method.
    # See `libai/models/bert_model.py` as a reference.
    input_placement_device="cuda",

    # Set to `True` to enable RDMA, improving the speed of pipeline parallelism.
    rdma_enabled=True,

    # Set seed to a positive value to use a fixed seed. Note that a fixed seed
    # increases reproducibility but does not guarantee fully deterministic behavior.
    # Disabling all parallelism further increases reproducibility.
    seed=1234,
)
# fmt: on

train = DictConfig(train)
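
# --- Override sketch --------------------------------------------------------
# A downstream config typically imports this file and overrides fields; the
# relative import path below is an assumption about the repo's configs/ layout.
#
#   from .common.train import train
#
#   train.dist.pipeline_parallel_size = 2
#   train.dist.pipeline_num_layers = 12  # required when pipeline_parallel_size > 1
#   train.train_micro_batch_size = 16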
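
# --- Batch-size sanity check (a minimal sketch, not part of LiBai's API) ----
# Running this file directly prints the global batch size implied by the
# formula documented above; `num_accumulation_steps=None` is treated as 1.
if __name__ == "__main__":
    micro = train.train_micro_batch_size
    num_acc = train.num_accumulation_steps or 1
    dp = train.dist.data_parallel_size
    # global_batch_size = micro_batch_size * num_grad_acc * data_parallel_size
    print(f"global_batch_size = {micro} * {num_acc} * {dp} = {micro * num_acc * dp}")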