ds_config.py 1.83 KB
Newer Older
mashun1's avatar
hyi2v  
mashun1 committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import argparse
from pathlib import Path


def get_tensorboard_config(output_dir: str, job_name: str):
    """Build the ``tensorboard`` section of a DeepSpeed config.

    Args:
        output_dir: Value stored under ``"output_path"``.
        job_name: Value stored under ``"job_name"``.

    Returns:
        A new dict with ``"enabled"`` set to ``True`` and the two
        arguments passed through unchanged (no validation is performed).
    """
    return dict(
        enabled=True,
        output_path=output_dir,
        job_name=job_name,
    )


def get_deepspeed_config(args: argparse.Namespace,
                         micro_batch_size: int,
                         global_batch_size: int,
                         output_dir: str = None,
                         job_name: str = None,
                         ):
    """Assemble the DeepSpeed configuration dictionary for a training run.

    Args:
        args: Parsed command-line options. The attributes read here are
            ``gradient_accumulation_steps``, ``log_every``, ``lr``,
            ``adam_beta1``, ``adam_beta2``, ``adam_eps``, ``weight_decay``,
            ``precision``, ``zero_stage`` and ``tensorboard``.
        micro_batch_size: Stored as ``train_micro_batch_size_per_gpu``.
        global_batch_size: Stored as ``train_batch_size``.
        output_dir: Forwarded to ``get_tensorboard_config``; only used when
            ``args.tensorboard`` is truthy.
        job_name: Forwarded to ``get_tensorboard_config``; only used when
            ``args.tensorboard`` is truthy.

    Returns:
        A new dict laid out as a DeepSpeed config. A ``"tensorboard"``
        section is present only when ``args.tensorboard`` is truthy.
    """
    # Batch sizing and logging cadence come first so the key order of the
    # resulting dict is stable when it is serialized.
    ds_config = {
        "train_batch_size": global_batch_size,
        "train_micro_batch_size_per_gpu": micro_batch_size,
        "gradient_accumulation_steps": args.gradient_accumulation_steps,
        "steps_per_print": args.log_every,
    }

    # AdamW hyper-parameters are taken verbatim from the CLI options.
    adamw_params = {
        "lr": args.lr,
        "betas": [args.adam_beta1, args.adam_beta2],
        "eps": args.adam_eps,
        "weight_decay": args.weight_decay,
    }
    ds_config["optimizer"] = {"type": "AdamW", "params": adamw_params}

    # Fixed (non-configurable) gradient handling.
    ds_config["gradient_clipping"] = 1.0
    ds_config["prescale_gradients"] = True

    # Exactly one of fp16 / bf16 is enabled when precision names it;
    # any other precision string leaves both disabled.
    precision = args.precision
    ds_config["fp16"] = {
        "enabled": precision == 'fp16',
        "fp16_master_weights_and_grads": False,
        "loss_scale": 0,
        "loss_scale_window": 500,
        "hysteresis": 2,
        "min_loss_scale": 1,
        "initial_scale_power": 15,
    }
    ds_config["bf16"] = {"enabled": precision == 'bf16'}

    ds_config["wall_clock_breakdown"] = False
    ds_config["zero_optimization"] = {
        "stage": args.zero_stage,
        "reduce_scatter": False,
        "reduce_bucket_size": 1e9,
    }

    # TensorBoard logging is opt-in.
    if args.tensorboard:
        ds_config["tensorboard"] = get_tensorboard_config(output_dir, job_name)

    return ds_config