"""
Copyright (c) Microsoft Corporation
Licensed under the MIT license.
"""

#############################################
# Routes
#############################################
ROUTE_TRAIN = "train"
ROUTE_EVAL = "eval"
ROUTE_PREDICT = "predict"
ROUTE_ENCODE = "encode"

#############################################
# Batch size
#############################################
TRAIN_BATCH_SIZE = "train_batch_size"
TRAIN_BATCH_SIZE_DEFAULT = None

#############################################
# Sparse attention
#############################################
SPARSE_ATTENTION = "sparse_attention"
SPARSE_DENSE_MODE = "dense"
SPARSE_FIXED_MODE = "fixed"
SPARSE_VARIABLE_MODE = "variable"
SPARSE_BIGBIRD_MODE = "bigbird"
SPARSE_BSLONGFORMER_MODE = "bslongformer"
SPARSE_MODE = "mode"
SPARSE_MODE_DEFAULT = SPARSE_FIXED_MODE
SPARSE_BLOCK = "block"
SPARSE_BLOCK_DEFAULT = 16
SPARSE_DIFFERENT_LAYOUT_PER_HEAD = "different_layout_per_head"
SPARSE_DIFFERENT_LAYOUT_PER_HEAD_DEFAULT = False
SPARSE_NUM_LOCAL_BLOCKS = "num_local_blocks"
SPARSE_NUM_LOCAL_BLOCKS_DEFAULT = 4
SPARSE_NUM_GLOBAL_BLOCKS = "num_global_blocks"
SPARSE_NUM_GLOBAL_BLOCKS_DEFAULT = 1
SPARSE_ATTENTION_TYPE = "attention"
SPARSE_ATTENTION_TYPE_DEFAULT = "bidirectional"
SPARSE_HORIZONTAL_GLOBAL_ATTENTION = "horizontal_global_attention"
SPARSE_HORIZONTAL_GLOBAL_ATTENTION_DEFAULT = False
SPARSE_NUM_DIFFERENT_GLOBAL_PATTERNS = "num_different_global_patterns"
SPARSE_NUM_DIFFERENT_GLOBAL_PATTERNS_DEFAULT = 1
SPARSE_NUM_RANDOM_BLOCKS = "num_random_blocks"
SPARSE_NUM_RANDOM_BLOCKS_DEFAULT = 0
SPARSE_LOCAL_WINDOW_BLOCKS = "local_window_blocks"
SPARSE_LOCAL_WINDOW_BLOCKS_DEFAULT = [4]
SPARSE_GLOBAL_BLOCK_INDICES = "global_block_indices"
SPARSE_GLOBAL_BLOCK_INDICES_DEFAULT = [0]
SPARSE_GLOBAL_BLOCK_END_INDICES = "global_block_end_indices"
SPARSE_GLOBAL_BLOCK_END_INDICES_DEFAULT = None
SPARSE_NUM_SLIDING_WINDOW_BLOCKS = "num_sliding_window_blocks"
SPARSE_NUM_SLIDING_WINDOW_BLOCKS_DEFAULT = 3
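
# Illustrative sketch (not part of the original module): a ds_config
# "sparse_attention" section for the default "fixed" mode, built from the keys
# above. Which keys are honored depends on the chosen mode; the values here
# are the documented defaults, and the dict name is hypothetical.
_SPARSE_ATTENTION_FIXED_EXAMPLE = {
    SPARSE_MODE: SPARSE_FIXED_MODE,
    SPARSE_BLOCK: SPARSE_BLOCK_DEFAULT,
    SPARSE_DIFFERENT_LAYOUT_PER_HEAD: SPARSE_DIFFERENT_LAYOUT_PER_HEAD_DEFAULT,
    SPARSE_NUM_LOCAL_BLOCKS: SPARSE_NUM_LOCAL_BLOCKS_DEFAULT,
    SPARSE_NUM_GLOBAL_BLOCKS: SPARSE_NUM_GLOBAL_BLOCKS_DEFAULT,
    SPARSE_ATTENTION_TYPE: SPARSE_ATTENTION_TYPE_DEFAULT,
    SPARSE_HORIZONTAL_GLOBAL_ATTENTION: SPARSE_HORIZONTAL_GLOBAL_ATTENTION_DEFAULT,
    SPARSE_NUM_DIFFERENT_GLOBAL_PATTERNS: SPARSE_NUM_DIFFERENT_GLOBAL_PATTERNS_DEFAULT,
}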

#############################################
# Optimizer and lr scheduler
#############################################
OPTIMIZER = "optimizer"
OPTIMIZER_TYPE_DEFAULT = None
OPTIMIZER_PARAMS = "params"
TYPE = "type"
LEGACY_FUSION = "legacy_fusion"
LEGACY_FUSION_DEFAULT = False
SCHEDULER = "scheduler"
SCHEDULER_TYPE_DEFAULT = None
SCHEDULER_PARAMS = "params"
MAX_GRAD_NORM = 'max_grad_norm'

#############################################
# ZeRO allow untested optimizer
#############################################
ZERO_ALLOW_UNTESTED_OPTIMIZER = "zero_allow_untested_optimizer"
ZERO_ALLOW_UNTESTED_OPTIMIZER_DEFAULT = False

# Steps
STEPS_PER_PRINT = "steps_per_print"
STEPS_PER_PRINT_DEFAULT = 10

#########################################
# Training micro batch size per GPU
#########################################
# Batch size for one training step. When TRAIN_BATCH_SIZE cannot fit in GPU
# memory, this value is used to determine the number of gradient accumulation
# steps. By default, this is set to None. Users can configure it in
# ds_config.json as in the example below:
TRAIN_MICRO_BATCH_SIZE_PER_GPU_FORMAT = '''
TRAIN_MICRO_BATCH_SIZE_PER_GPU is defined in this format:
"train_micro_batch_size_per_gpu": 1
'''
TRAIN_MICRO_BATCH_SIZE_PER_GPU = "train_micro_batch_size_per_gpu"
TRAIN_MICRO_BATCH_SIZE_PER_GPU_DEFAULT = None

#########################################
# Gradient Accumulation
#########################################
# Gradient accumulation feature. By default, this feature is not enabled.
# Users can configure it in ds_config.json as in the example below:
GRADIENT_ACCUMULATION_FORMAT = '''
Gradient Accumulation should be of the format:
"gradient_accumulation_steps": 1
'''
GRADIENT_ACCUMULATION_STEPS = "gradient_accumulation_steps"
GRADIENT_ACCUMULATION_STEPS_DEFAULT = None
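
# Illustrative sketch of the invariant that ties the three batch-size knobs
# together: train_batch_size == micro_batch_per_gpu *
# gradient_accumulation_steps * number of GPUs. The helper name is
# hypothetical; it only shows how the accumulation steps can be derived when
# the other two values are known.
def _infer_gradient_accumulation_steps(train_batch_size, micro_batch_per_gpu,
                                       num_gpus):
    per_step_batch = micro_batch_per_gpu * num_gpus
    assert train_batch_size % per_step_batch == 0, \
        "train_batch_size must be divisible by micro_batch_per_gpu * num_gpus"
    return train_batch_size // per_step_batch


# e.g. _infer_gradient_accumulation_steps(32, 1, 8) == 4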

# DeepSpeed CSR gradient sparsity
SPARSE_GRADIENTS = "sparse_gradients"
SPARSE_GRADIENTS_DEFAULT = False

#########################################
# BFLOAT16 support
#########################################
# BFLOAT16 feature. By default, this feature is not enabled.
# Users can configure it in ds_config.json as in the example below:
BFLOAT16_FORMAT = '''
BFLOAT16 parameters should be of the format:
"bf16": {
  "enabled": true
}
'''
BFLOAT16 = "bf16"
BFLOAT16_OLD = "bfloat16"  # keeping for backwards compatibility

BFLOAT16_ENABLED = "enabled"
BFLOAT16_ENABLED_DEFAULT = False

#########################################
# FP16 support
#########################################
# FP16 feature. By default, this feature is not enabled.
# Users can configure it in ds_config.json as in the example below:
FP16_FORMAT = '''
FP16 parameters should be of the format:
"fp16": {
  "enabled": true,
  "loss_scale": 0,
  "initial_scale_power": 32,
  "loss_scale_window": 1000,
  "hysteresis": 2,
  "min_loss_scale": 1
}
'''
FP16 = "fp16"

FP16_ENABLED = "enabled"
FP16_ENABLED_DEFAULT = False

# FP16 loss scale, zero means using dynamic scaling
FP16_LOSS_SCALE = "loss_scale"
FP16_LOSS_SCALE_DEFAULT = 0

# FP16 initial dynamic scale loss power
FP16_INITIAL_SCALE_POWER = "initial_scale_power"
FP16_INITIAL_SCALE_POWER_DEFAULT = 32

# FP16 loss scale window
FP16_LOSS_SCALE_WINDOW = "loss_scale_window"
FP16_LOSS_SCALE_WINDOW_DEFAULT = 1000

# FP16 hysteresis
FP16_HYSTERESIS = "hysteresis"
FP16_HYSTERESIS_DEFAULT = 2

# FP16 min loss scale
FP16_MIN_LOSS_SCALE = "min_loss_scale"
FP16_MIN_LOSS_SCALE_DEFAULT = 1
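
# Illustrative sketch of how the key/default pairs above are typically
# consumed: read the "fp16" sub-dict of a parsed ds_config.json, falling back
# to the defaults. Hypothetical helper, not the library's own accessor.
def _get_fp16_params(param_dict):
    fp16_dict = param_dict.get(FP16, {})
    return {
        key: fp16_dict.get(key, default)
        for key, default in [
            (FP16_ENABLED, FP16_ENABLED_DEFAULT),
            (FP16_LOSS_SCALE, FP16_LOSS_SCALE_DEFAULT),
            (FP16_INITIAL_SCALE_POWER, FP16_INITIAL_SCALE_POWER_DEFAULT),
            (FP16_LOSS_SCALE_WINDOW, FP16_LOSS_SCALE_WINDOW_DEFAULT),
            (FP16_HYSTERESIS, FP16_HYSTERESIS_DEFAULT),
            (FP16_MIN_LOSS_SCALE, FP16_MIN_LOSS_SCALE_DEFAULT),
        ]
    }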

# FP16 master and grads
FP16_MASTER_WEIGHTS_AND_GRADS = "fp16_master_weights_and_grads"
FP16_MASTER_WEIGHTS_AND_GRADS_DEFAULT = False

#########################################
# Apex AMP support
#########################################
# Use Apex AMP for mixed precision support; all parameters (other than 'enabled') will be passed to
# amp.initialize(model, optimizer, **amp_params)
# See apex documentation for supported parameters/features: https://nvidia.github.io/apex/amp.html#apex.amp.initialize
AMP_FORMAT = '''
"amp" {
  "enabled: true,
  "opt_level": "O1",
  ...
}
'''
AMP = "amp"

AMP_ENABLED = "enabled"
AMP_ENABLED_DEFAULT = False

#########################################
# Gradient clipping
#########################################
# Gradient clipping. By default, this feature is not enabled.
# Users can configure it in ds_config.json as in the example below:
GRADIENT_CLIPPING_FORMAT = '''
Gradient clipping should be enabled as:
"gradient_clipping": 1.0
'''
GRADIENT_CLIPPING = 'gradient_clipping'
GRADIENT_CLIPPING_DEFAULT = 0.

#########################################
# Communication data type
#########################################
# Supported types: ['none', 'fp16', 'fp32']
# By default, this feature is not enabled ('none' value).
# Users can configure it in ds_config.json as in the example below:
COMMUNICATION_DATA_TYPE_FORMAT = '''
Communication data type should be set as:
"communication_data_type": "fp32"
'''
COMMUNICATION_DATA_TYPE = "communication_data_type"
COMMUNICATION_DATA_TYPE_DEFAULT = None

#########################################
# Scale/predivide gradients before allreduce
#########################################
# Prescale gradients. By default, this feature is not enabled.
# Users can configure it in ds_config.json as in the example below:
PRESCALE_GRADIENTS_FORMAT = '''
Gradient prescaling should be enabled as:
"prescale_gradients": true
'''
PRESCALE_GRADIENTS = "prescale_gradients"
PRESCALE_GRADIENTS_DEFAULT = False

GRADIENT_PREDIVIDE_FACTOR_FORMAT = '''
Gradient predivide factor should be set as:
"gradient_predivide_factor": 1.0
'''
GRADIENT_PREDIVIDE_FACTOR = "gradient_predivide_factor"
GRADIENT_PREDIVIDE_FACTOR_DEFAULT = 1.0
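
# Illustrative sketch of the assumed predivide semantics: divide local
# gradients by the factor before the allreduce and by world_size / factor
# afterwards, so the result still averages over world_size while shifting
# where fp16 over/underflow risk occurs. `allreduce_sum` is a caller-supplied
# summing collective (hypothetical); this is not the library's implementation.
def _predivide_allreduce(grad, allreduce_sum, world_size,
                         factor=GRADIENT_PREDIVIDE_FACTOR_DEFAULT):
    grad = grad / factor  # predivide before communication
    grad = allreduce_sum(grad)  # sum across all ranks
    return grad / (world_size / factor)  # complete the average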

#########################################
# Disable AllGather
#########################################
# Disable AllGather. By default, this feature is not enabled.
# Users can configure it in ds_config.json as in the example below:
DISABLE_ALLGATHER_FORMAT = '''
AllGather can be disabled as:
"disable_allgather": true
'''
DISABLE_ALLGATHER = "disable_allgather"
DISABLE_ALLGATHER_DEFAULT = False

#########################################
# Dump DeepSpeed state
#########################################
# Dump state. By default, this feature is not enabled.
# Users can configure it in ds_config.json as in the example below:
DUMP_STATE_FORMAT = '''
Dump state should be enabled as:
"dump_state": true
'''
DUMP_STATE = 'dump_state'
DUMP_STATE_DEFAULT = False

#########################################
# Vocabulary size
#########################################
# Vocabulary size.
# Users can configure it in ds_config.json as in the example below:
VOCABULARY_SIZE_FORMAT = '''
Vocabulary size can be specified as:
"vocabulary_size": 1024
'''
VOCABULARY_SIZE = 'vocabulary_size'
VOCABULARY_SIZE_DEFAULT = None

#########################################
# Wall clock breakdown
#########################################
# Wall clock breakdown. By default, this feature is not enabled.
# Users can configure it in ds_config.json as in the example below:
WALL_CLOCK_BREAKDOWN_FORMAT = '''
Wall clock breakdown should be enabled as:
"wall_clock_breakdown": true
'''
WALL_CLOCK_BREAKDOWN = 'wall_clock_breakdown'
WALL_CLOCK_BREAKDOWN_DEFAULT = False

MEMORY_BREAKDOWN = 'memory_breakdown'
MEMORY_BREAKDOWN_DEFAULT = False

#########################################
# Tensorboard
#########################################
# Tensorboard. By default, this feature is not enabled.
# Users can configure it in ds_config.json as in the example below:
TENSORBOARD_FORMAT = '''
Tensorboard can be specified as:
"tensorboard": {
  "enabled": true,
  "output_path": "/home/myname/foo",
  "job_name": "model_lr2e-5_epoch3_seed2_seq64"
}
'''
TENSORBOARD = "tensorboard"

# Tensorboard enable signal
TENSORBOARD_ENABLED = "enabled"
TENSORBOARD_ENABLED_DEFAULT = False

# Tensorboard output path
TENSORBOARD_OUTPUT_PATH = "output_path"
TENSORBOARD_OUTPUT_PATH_DEFAULT = ""

# Tensorboard job name
TENSORBOARD_JOB_NAME = "job_name"
TENSORBOARD_JOB_NAME_DEFAULT = "DeepSpeedJobName"

#########################################
# Eigenvalue
#########################################
# Eigenvalue computation. By default, this feature is not enabled.
# Users can configure it in ds_config.json as in the example below:
EIGENVALUE_FORMAT = '''
Eigenvalue parameters can be specified as:
"eigenvalue": {
  "enabled": true,
  "verbose": true,
  "max_iter": 100,
  "tol": 1e-2,
  "stability": 1e-6
}
'''
EIGENVALUE = "eigenvalue"

# Eigenvalue enable signal
EIGENVALUE_ENABLED = "enabled"
EIGENVALUE_ENABLED_DEFAULT = False

EIGENVALUE_VERBOSE = "verbose"
EIGENVALUE_VERBOSE_DEFAULT = False

EIGENVALUE_MAX_ITER = "max_iter"
EIGENVALUE_MAX_ITER_DEFAULT = 100

EIGENVALUE_TOL = "tol"
EIGENVALUE_TOL_DEFAULT = 1e-2

EIGENVALUE_STABILITY = "stability"
EIGENVALUE_STABILITY_DEFAULT = 1e-6

EIGENVALUE_GAS_BOUNDARY_RESOLUTION = "gas_boundary_resolution"
EIGENVALUE_GAS_BOUNDARY_RESOLUTION_DEFAULT = 1

EIGENVALUE_LAYER_NAME = "layer_name"
EIGENVALUE_LAYER_NAME_DEFAULT = "bert.encoder.layer"

EIGENVALUE_LAYER_NUM = "layer_num"
EIGENVALUE_LAYER_NUM_DEFAULT = 0

#########################################
# Progressive Layer Drop (PLD)
#########################################
PROGRESSIVE_LAYER_DROP = "progressive_layer_drop"

# PLD enable signal
PLD_ENABLED = "enabled"
PLD_ENABLED_DEFAULT = False

PLD_THETA = "theta"
PLD_THETA_DEFAULT = 1.0

PLD_GAMMA = "gamma"
PLD_GAMMA_DEFAULT = 0.001
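
# Illustrative sketch of the keep-probability schedule from the Progressive
# Layer Dropping paper, which these two parameters are assumed to drive:
# theta(t) decays from 1.0 toward the floor `theta` at rate `gamma`, so the
# default theta of 1.0 never drops a layer. Hypothetical helper; consult the
# PLD implementation for the authoritative schedule.
def _pld_keep_probability(global_step,
                          theta=PLD_THETA_DEFAULT,
                          gamma=PLD_GAMMA_DEFAULT):
    import math  # local import keeps this sketch self-contained
    return (1.0 - theta) * math.exp(-gamma * global_step) + theta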

#########################################
# Curriculum Learning
#########################################
CURRICULUM_LEARNING = "curriculum_learning"

CURRICULUM_ENABLED = "enabled"
CURRICULUM_ENABLED_DEFAULT = False


#########################################
# Validation modes
#########################################
class ValidationMode:
    WARN = "WARN"
    IGNORE = "IGNORE"
    FAIL = "FAIL"


#########################################
# Checkpoint config params
#########################################
# "checkpoint": {tag_validation=["Ignore"|"Warn"|"Fail"]}
CHECKPOINT = "checkpoint"
CHECKPOINT_TAG_VALIDATION = "tag_validation"
CHECKPOINT_TAG_VALIDATION_DEFAULT = ValidationMode.WARN
CHECKPOINT_TAG_VALIDATION_MODES = [
    ValidationMode.WARN,
    ValidationMode.IGNORE,
    ValidationMode.FAIL
]
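
# Illustrative sketch (hypothetical helper): normalize a user-supplied
# "tag_validation" value, which as the comment above shows may arrive in
# mixed case, and check it against the supported modes.
def _validate_tag_validation_mode(value):
    mode = str(value).upper()
    if mode not in CHECKPOINT_TAG_VALIDATION_MODES:
        raise ValueError("{} must be one of {}, got {!r}".format(
            CHECKPOINT_TAG_VALIDATION, CHECKPOINT_TAG_VALIDATION_MODES,
            value))
    return mode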

#########################################
# Quantization
#########################################
QUANTIZE_TRAINING = "quantize_training"
QUANTIZE_BITS = "quantize_bits"
START_BITS = "start_bits"
TARGET_BITS = "target_bits"
QUANTIZER_KERNEL = "quantizer_kernel"
QUANTIZE_SCHEDULE = "quantize_schedule"
QUANTIZE_PERIOD = "quantize_period"
SCHEDULE_OFFSET = "schedule_offset"
QUANTIZE_GROUPS = "quantize_groups"
FP16_MIXED_QUANTIZE = "fp16_mixed_quantize"
QUANTIZE_CHANGE_RATIO = "quantize_change_ratio"
FP16_MIXED_QUANTIZE_ENABLED = "enabled"
QUANTIZE_VERBOSE = "quantize_verbose"
QUANTIZE_ALGO = "quantize_algo"
QUANTIZE_TYPE = "q_type"
QUANTIZE_SYMMETRIC = "symmetric"
QUANTIZE_ASYMMETRIC = "asymmetric"
STOCHASTIC_ROUNDING = "stochastic"
NEAREST_ROUNDING = "nearest"
QUANTIZE_ROUNDING = "rounding"
QUANTIZE_TRAINING_ENABLED = "enabled"
QUANTIZE_TRAINING_ENABLED_DEFAULT = False
QUANTIZE_TRAINING_DEFAULT = False
QUANTIZE_START_BITS_DEFAULT = 16
QUANTIZE_TARGET_BITS_DEFAULT = 8
QUANTIZER_KERNEL_DEFAULT = False
QUANTIZE_PERIOD_DEFAULT = 1000
QUANTIZE_OFFSET_DEFAULT = 1000
QUANTIZE_GROUPS_DEFAULT = 1
QUANTIZE_TYPE_DEFAULT = 0  # symmetric
QUANTIZE_ROUNDING_DEFAULT = 0  # nearest
FP16_MIXED_QUANTIZE_ENABLED_DEFAULT = False
QUANTIZE_CHANGE_RATIO_DEFAULT = 0.001
QUANTIZE_VERBOSE_DEFAULT = False
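
# Illustrative sketch of a "quantize_training" section assembled from the
# keys above. The nesting is inferred from the key names and defaults and may
# not match the authoritative schema; treat it as a reading aid, not
# documentation.
_QUANTIZE_TRAINING_EXAMPLE = {
    QUANTIZE_TRAINING_ENABLED: True,
    QUANTIZE_BITS: {
        START_BITS: QUANTIZE_START_BITS_DEFAULT,
        TARGET_BITS: QUANTIZE_TARGET_BITS_DEFAULT,
    },
    QUANTIZE_SCHEDULE: {
        QUANTIZE_PERIOD: QUANTIZE_PERIOD_DEFAULT,
        SCHEDULE_OFFSET: QUANTIZE_OFFSET_DEFAULT,
    },
    QUANTIZE_GROUPS: QUANTIZE_GROUPS_DEFAULT,
    FP16_MIXED_QUANTIZE: {
        FP16_MIXED_QUANTIZE_ENABLED: FP16_MIXED_QUANTIZE_ENABLED_DEFAULT,
        QUANTIZE_CHANGE_RATIO: QUANTIZE_CHANGE_RATIO_DEFAULT,
    },
    QUANTIZE_ALGO: {
        QUANTIZE_TYPE: QUANTIZE_SYMMETRIC,
        QUANTIZE_ROUNDING: NEAREST_ROUNDING,
    },
    QUANTIZER_KERNEL: QUANTIZER_KERNEL_DEFAULT,
    QUANTIZE_VERBOSE: QUANTIZE_VERBOSE_DEFAULT,
}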

#########################################
# Drop the last incomplete batch
#########################################
# dataloader_drop_last. By default, this feature is not enabled.
# Users can configure it in ds_config.json as in the example below:
DATALOADER_DROP_LAST_FORMAT = '''
The last incomplete batch can be dropped by setting:
"dataloader_drop_last": True
'''
DATALOADER_DROP_LAST = "dataloader_drop_last"
DATALOADER_DROP_LAST_DEFAULT = False

#########################################
# PIPELINE PARALLELISM
#########################################
PIPE_REPLICATED = 'ds_pipe_replicated'