"""
Copyright (c) Microsoft Corporation
Licensed under the MIT license.
"""

#############################################
# Routes
#############################################
ROUTE_TRAIN = "train"
ROUTE_EVAL = "eval"
ROUTE_PREDICT = "predict"
ROUTE_ENCODE = "encode"

#############################################
# Batch size
#############################################
TRAIN_BATCH_SIZE = "train_batch_size"
TRAIN_BATCH_SIZE_DEFAULT = 1
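
# Illustrative ds_config.json fragment using the key above (example value only):
#   "train_batch_size": 8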

#############################################
# Optimizer and lr scheduler
#############################################
OPTIMIZER = "optimizer"
OPTIMIZER_TYPE_DEFAULT = None
OPTIMIZER_PARAMS = "params"
TYPE = "type"
LEGACY_FUSION = "legacy_fusion"
LEGACY_FUSION_DEFAULT = True
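
# Illustrative optimizer block in ds_config.json; the "Adam" type and its
# params here are assumptions for the example, not values mandated by this
# module:
#   "optimizer": {
#     "type": "Adam",
#     "params": {
#       "lr": 0.001
#     }
#   }
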
SCHEDULER = "scheduler"
SCHEDULER_TYPE_DEFAULT = None
SCHEDULER_PARAMS = "params"
MAX_GRAD_NORM = 'max_grad_norm'
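
# Illustrative scheduler block in ds_config.json; the "WarmupLR" type and its
# params here are assumptions for the example:
#   "scheduler": {
#     "type": "WarmupLR",
#     "params": {
#       "warmup_num_steps": 1000
#     }
#   }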

#############################################
# ZeRO untested optimizer
#############################################
ZERO_ALLOW_UNTESTED_OPTIMIZER = "zero_allow_untested_optimizer"
ZERO_ALLOW_UNTESTED_OPTIMIZER_DEFAULT = False
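
# Illustrative ds_config.json fragment (example value only):
#   "zero_allow_untested_optimizer": true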

#############################################
# Torch distributed constants
#############################################
TORCH_DISTRIBUTED_DEFAULT_PORT = "29500"
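
# Minimal sketch of how a launcher might consume this default; MASTER_PORT is
# the standard torch.distributed environment variable, but this exact lookup
# is an assumption, not this module's API:
#   import os
#   port = os.environ.get("MASTER_PORT", TORCH_DISTRIBUTED_DEFAULT_PORT)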

#########################################
# Steps
#########################################
STEPS_PER_PRINT = "steps_per_print"
STEPS_PER_PRINT_DEFAULT = 10
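
# Illustrative ds_config.json fragment (example value only):
#   "steps_per_print": 10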

#########################################
# Training micro batch size per GPU
#########################################
# Batch size for a single training step. When TRAIN_BATCH_SIZE cannot fit in
# GPU memory, this value is used to derive the number of gradient accumulation
# steps. By default, this is set to None. Users can configure it in
# ds_config.json as shown in the example below:
TRAIN_MICRO_BATCH_SIZE_PER_GPU_FORMAT = '''
TRAIN_MICRO_BATCH_SIZE_PER_GPU is defined in this format:
"train_micro_batch_size_per_gpu": 1
'''
TRAIN_MICRO_BATCH_SIZE_PER_GPU = "train_micro_batch_size_per_gpu"
TRAIN_MICRO_BATCH_SIZE_PER_GPU_DEFAULT = None
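
# Illustrative relationship between the batch-size knobs (assuming pure data
# parallelism; this arithmetic is a sketch, not a check performed here):
#   train_batch_size = train_micro_batch_size_per_gpu
#                      * gradient_accumulation_steps
#                      * number_of_gpus
# e.g. 32 = 4 * 2 * 4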

#########################################
# Gradient Accumulation
#########################################
# Gradient accumulation feature. By default, this feature is not enabled.
# Users can configure it in ds_config.json as shown in the example below:
GRADIENT_ACCUMULATION_FORMAT = '''
Gradient Accumulation should be of the format:
"gradient_accumulation_steps": 1
'''
GRADIENT_ACCUMULATION_STEPS = "gradient_accumulation_steps"
GRADIENT_ACCUMULATION_STEPS_DEFAULT = None

# DeepSpeed CSR gradient sparsity
SPARSE_GRADIENTS = "sparse_gradients"
SPARSE_GRADIENTS_DEFAULT = False
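
# Illustrative ds_config.json fragment (example value only):
#   "sparse_gradients": true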

#########################################
# FP16 support
#########################################
# FP16 feature. By default, this feature is not enabled.
# Users can configure it in ds_config.json as shown in the example below:
FP16_FORMAT = '''
FP16 parameters should be of the format:
"fp16": {
  "enabled": true,
  "loss_scale": 0,
  "initial_scale_power": 32,
  "loss_scale_window": 1000,
  "hysteresis": 2,
  "min_loss_scale": 1
}
'''
FP16 = "fp16"

FP16_ENABLED = "enabled"
FP16_ENABLED_DEFAULT = False

# FP16 loss scale, zero means using dynamic scaling
FP16_LOSS_SCALE = "loss_scale"
FP16_LOSS_SCALE_DEFAULT = 0

# FP16 initial dynamic loss scale power
FP16_INITIAL_SCALE_POWER = "initial_scale_power"
FP16_INITIAL_SCALE_POWER_DEFAULT = 32

# FP16 loss scale window
FP16_LOSS_SCALE_WINDOW = "loss_scale_window"
FP16_LOSS_SCALE_WINDOW_DEFAULT = 1000

# FP16 hysteresis
FP16_HYSTERESIS = "hysteresis"
FP16_HYSTERESIS_DEFAULT = 2

# FP16 min loss scale
FP16_MIN_LOSS_SCALE = "min_loss_scale"
FP16_MIN_LOSS_SCALE_DEFAULT = 1
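
# Illustrative arithmetic: with dynamic scaling selected ("loss_scale": 0), the
# initial loss scale is 2 ** initial_scale_power, i.e. 2 ** 32 under the
# defaults above.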

#########################################
# Gradient clipping
#########################################
# Gradient clipping. By default, this feature is not enabled.
# Users can configure it in ds_config.json as shown in the example below:
GRADIENT_CLIPPING_FORMAT = '''
Gradient clipping should be enabled as:
"gradient_clipping": 1.0
'''
GRADIENT_CLIPPING = 'gradient_clipping'
GRADIENT_CLIPPING_DEFAULT = 0.

#########################################
# ZeRO optimization
#########################################
# ZeRO optimization. By default, this optimization is not enabled.
# Users can configure it in ds_config.json as shown in the example below:
ZERO_FORMAT = '''
ZeRO optimization should be enabled as:
"zero_optimization": true,
"allgather_size": 200
'''
ZERO_OPTIMIZATION = 'zero_optimization'
ZERO_OPTIMIZATION_DEFAULT = False

ALLGATHER_SIZE = 'allgather_size'
ALLGATHER_SIZE_DEFAULT = 500000000
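
# Illustrative ds_config.json fragment (example value only):
#   "allgather_size": 200000000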

#########################################
# FP32 AllReduce
#########################################
# FP32 AllReduce. By default, this feature is not enabled.
# Users can configure it in ds_config.json as shown in the example below:
FP32_ALLREDUCE_FORMAT = '''
FP32 AllReduce should be enabled as:
"fp32_allreduce": true
'''
FP32_ALLREDUCE = "fp32_allreduce"
FP32_ALLREDUCE_DEFAULT = False

#########################################
# Scale gradients before allreduce
#########################################
# Prescale gradients. By default, this feature is not enabled.
# Users can configure it in ds_config.json as shown in the example below:
PRESCALE_GRADIENTS_FORMAT = '''
Gradient prescaling should be enabled as:
"prescale_gradients": true
'''
PRESCALE_GRADIENTS = "prescale_gradients"
PRESCALE_GRADIENTS_DEFAULT = False

#########################################
# Disable AllGather
#########################################
# Disable AllGather. By default, this feature is not enabled.
# Users can configure it in ds_config.json as shown in the example below:
DISABLE_ALLGATHER_FORMAT = '''
Disable AllGather should be enabled as:
"disable_allgather": true
'''
DISABLE_ALLGATHER = "disable_allgather"
DISABLE_ALLGATHER_DEFAULT = False

#########################################
# Dump DeepSpeed state
#########################################
# Dump State. By default, this feature is not enabled.
# Users can configure it in ds_config.json as shown in the example below:
DUMP_STATE_FORMAT = '''
Dump state should be enabled as:
"dump_state": true
'''
DUMP_STATE = 'dump_state'
DUMP_STATE_DEFAULT = False

#########################################
# Vocabulary size
#########################################
# Vocabulary size.
# Users can configure it in ds_config.json as shown in the example below:
VOCABULARY_SIZE_FORMAT = '''
Vocabulary size can be specified as:
"vocabulary_size": 1024
'''
VOCABULARY_SIZE = 'vocabulary_size'
VOCABULARY_SIZE_DEFAULT = None

#########################################
# Wall clock breakdown
#########################################
# Wall clock breakdown. By default, this feature is not enabled.
# Users can configure it in ds_config.json as shown in the example below:
WALL_CLOCK_BREAKDOWN_FORMAT = '''
Wall clock breakdown should be enabled as:
"wall_clock_breakdown": true
'''
WALL_CLOCK_BREAKDOWN = 'wall_clock_breakdown'
WALL_CLOCK_BREAKDOWN_DEFAULT = False

#########################################
# Tensorboard
#########################################
# Tensorboard. By default, this feature is not enabled.
# Users can configure it in ds_config.json as shown in the example below:
TENSORBOARD_FORMAT = '''
Tensorboard can be specified as:
"tensorboard": {
  "enabled": true,
  "output_path": "/home/myname/foo",
  "job_name": "model_lr2e-5_epoch3_seed2_seq64"
}
'''
TENSORBOARD = "tensorboard"

# Tensorboard enable signal
TENSORBOARD_ENABLED = "enabled"
TENSORBOARD_ENABLED_DEFAULT = False

# Tensorboard output path
TENSORBOARD_OUTPUT_PATH = "output_path"
TENSORBOARD_OUTPUT_PATH_DEFAULT = ""

# Tensorboard job name
TENSORBOARD_JOB_NAME = "job_name"
TENSORBOARD_JOB_NAME_DEFAULT = "DeepSpeedJobName"
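
# Minimal sketch of how these key/default pairs are meant to be read from a
# parsed ds_config.json dict; this helper is hypothetical, not part of this
# module's API:
#   def tensorboard_enabled(config):
#       section = config.get(TENSORBOARD, {})
#       return section.get(TENSORBOARD_ENABLED, TENSORBOARD_ENABLED_DEFAULT)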