{
  "train_batch_size": CONFIG_BATCH_SIZE,
  "train_micro_batch_size_per_gpu": CONFIG_MBSIZE,
  "steps_per_print": LOG_INTERVAL,
  "zero_optimization": {
    "stage": ZERO_STAGE
  },
  "gradient_clipping": 1.0,
  "prescale_gradients": PRESCALE_GRAD,
  "fp16": {
    "enabled": CONFIG_FP16_ENABLED,
    "loss_scale": 0,
    "loss_scale_window": 500,
    "hysteresis": 2,
    "min_loss_scale": 1,
    "initial_scale_power": 11
  },
  "bf16": {
    "enabled": CONFIG_BF16_ENABLED
  },
  "curriculum_learning": {
    "enabled": CONFIG_CL_ENABLED,
    "curriculum_type": "seqlen",
    "min_difficulty": CONFIG_CL_MIN,
    "max_difficulty": CONFIG_CL_MAX,
    "schedule_type": "fixed_linear",
    "schedule_config": {
      "total_curriculum_step": CONFIG_CL_DURATION,
      "difficulty_step": 8
    }
  },
  "wall_clock_breakdown": false,
  "compression_training": {
    "weight_quantization": {
      "shared_parameters": {
        "enabled": true,
        "quantizer_kernel": false,
        "schedule_offset": 50,
        "quantize_groups": 48,
        "quantize_verbose": false,
        "quantization_type": "symmetric",
        "rounding": "nearest",
        "fp16_mixed_quantize": {
          "enabled": false,
          "quantize_change_ratio": 0.001
        }
      },
      "different_groups": {
        "wq1": {
          "params": {
            "start_bits": 12,
            "target_bits": 4,
            "quantization_period": 50
          },
          "modules": [
            "encoder.layers"
          ]
        }
      }
    },
    "activation_quantization": {
      "shared_parameters": {
        "enabled": true,
        "quantization_type": "asymmetric",
        "range_calibration": "static",
        "schedule_offset": 50
      },
      "different_groups": {
        "aq1": {
          "params": {
            "bits": 8
          },
          "modules": [
            "encoder.layers"
          ]
        }
      }
    }
  }
}