run_table_1.sh 2.65 KB
Newer Older
Mohammad Shoeybi's avatar
Mohammad Shoeybi committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
#!/bin/bash

# ================================
# Choose the case to run.
# ================================
# model size options = [1.7B, 3.6B, 7.5B, 18B, 39B, 76B, 145B, 310B, 530B, 1T]
MODEL_SIZE=1.7B


#######################################
# Set the run parameters that belong to one model size.
#
# Globals (written): TP, PP, MBS, GBS, NLS, HS, NAH, DDP, NNODES,
#                    MEGATRON_EXTRA_PARAMS
#   These are plain (unexported) shell variables, so they are visible to
#   anything sourced into this shell afterwards.
#   NOTE(review): the abbreviations presumably mean tensor-/pipeline-parallel
#   size, micro/global batch size, number of layers, hidden size, number of
#   attention heads, DDP implementation and node count -- confirm against the
#   scripts that consume them; nothing in this block reads them.
# Arguments: $1 - model size label, one of the options listed above
# Outputs:   an error message on stderr when the label is unknown
# Returns:   0 on success; 1 for an unknown label (globals left untouched)
#######################################
select_model_config() {
    # The selector and every pattern are quoted so that an empty value or one
    # containing spaces/glob characters is compared literally instead of
    # breaking the test.
    case "$1" in
        "1.7B")
            TP=1
            PP=1
            MBS=16
            GBS=512
            NLS=24
            HS=2304
            NAH=24
            DDP=torch
            NNODES=4
            MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
            ;;
        "3.6B")
            TP=2
            PP=1
            MBS=16
            GBS=512
            NLS=30
            HS=3072
            NAH=32
            DDP=torch
            NNODES=8
            MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
            ;;
        "7.5B")
            TP=4
            PP=1
            MBS=16
            GBS=512
            NLS=36
            HS=4096
            NAH=32
            DDP=torch
            NNODES=16
            MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
            ;;
        "18B")
            TP=8
            PP=1
            MBS=8
            GBS=1024
            NLS=40
            HS=6144
            NAH=48
            DDP=torch
            NNODES=32
            MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
            ;;
        "39B")
            TP=8
            PP=2
            MBS=4
            GBS=1536
            NLS=48
            HS=8192
            NAH=64
            DDP=local
            NNODES=64
            MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
            ;;
        "76B")
            TP=8
            PP=4
            MBS=2
            GBS=1792
            NLS=60
            HS=10240
            NAH=80
            DDP=local
            NNODES=128
            # Unlike the other rows this value has no trailing space; it is
            # kept exactly as it was.
            MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 5"
            ;;
        "145B")
            TP=8
            PP=8
            MBS=2
            GBS=2304
            NLS=80
            HS=12288
            NAH=96
            DDP=local
            NNODES=192
            MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 5 "
            ;;
        "310B")
            TP=8
            PP=16
            MBS=1
            GBS=2160
            NLS=96
            HS=16384
            NAH=128
            DDP=local
            NNODES=240
            MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 3 "
            ;;
        "530B")
            TP=8
            PP=35
            MBS=1
            GBS=2520
            NLS=105
            HS=20480
            NAH=128
            DDP=local
            NNODES=315
            MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 1 "
            ;;
        "1T")
            TP=8
            PP=64
            MBS=1
            GBS=3072
            NLS=128
            HS=25600
            NAH=160
            DDP=local
            NNODES=384
            MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
            ;;
        *)
            # Diagnostics go to stderr and name the rejected value.
            printf 'Invalid configuration: unknown MODEL_SIZE "%s"\n' "$1" >&2
            return 1
            ;;
    esac
}

# Abort before anything else runs if the chosen size is not in the table.
select_model_config "${MODEL_SIZE}" || exit 1


# Name of the job
# Exported so that processes started from this shell inherit it.
export JOB_NAME="results_table_1_model_size_${MODEL_SIZE}"


# CONFIG.sh and SBATCH.sh are looked up in the current working directory (not
# next to this script) and are sourced, so they run in this shell and see the
# variables set above. The paths are quoted so a working directory containing
# spaces or glob characters still resolves to a single file name.

# Import the configs.
if [[ ! -r "$(pwd)/CONFIG.sh" ]]; then
    # Stop here rather than going on to submit without the config.
    printf 'Cannot read %s\n' "$(pwd)/CONFIG.sh" >&2
    exit 1
fi
. "$(pwd)/CONFIG.sh"


# Submit the job.
if [[ ! -r "$(pwd)/SBATCH.sh" ]]; then
    printf 'Cannot read %s\n' "$(pwd)/SBATCH.sh" >&2
    exit 1
fi
. "$(pwd)/SBATCH.sh"


# The exit status of the sourced scripts is not propagated: once both have
# been sourced the script reports success.
exit 0