Commit 94586767 authored by Zeqiang Lai's avatar Zeqiang Lai Committed by zhe chen
Browse files

[Classification] support deepspeed, fix optimizer bugs #(83)

parent 2d975df6
...@@ -196,6 +196,66 @@ python -m torch.distributed.launch --nproc_per_node 8 --master_port 12345 main. ...@@ -196,6 +196,66 @@ python -m torch.distributed.launch --nproc_per_node 8 --master_port 12345 main.
--data-path <imagenet-path> --batch-size 64 --accumulation-steps 2 [--use-checkpoint] --data-path <imagenet-path> --batch-size 64 --accumulation-steps 2 [--use-checkpoint]
``` --> ``` -->
### Training with Deepspeed
We support utilizing [Deepspeed](https://github.com/microsoft/DeepSpeed) to reduce memory costs for training large-scale models, e.g. InternImage-H with over 1 billion parameters.
To use it, first install the requirements as
```bash
pip install deepspeed
```
Then you can launch the training on a Slurm system with 8 GPUs as follows (tiny and huge models shown as examples):
```
GPUS=8 GPUS_PER_NODE=8 sh train_in1k_deepspeed.sh vc_research_4 train configs/internimage_t_1k_224.yaml --batch-size 128 --accumulation-steps 4
GPUS=8 GPUS_PER_NODE=8 sh train_in1k_deepspeed.sh vc_research_4 train configs/internimage_t_1k_224.yaml --batch-size 128 --accumulation-steps 4 --eval --resume ckpt.pth
GPUS=8 GPUS_PER_NODE=8 sh train_in1k_deepspeed.sh vc_research_4 train configs/internimage_t_1k_224.yaml --batch-size 128 --accumulation-steps 4 --eval --resume deepspeed_ckpt_dir
GPUS=8 GPUS_PER_NODE=8 sh train_in1k_deepspeed.sh vc_research_4 train configs/internimage_h_22kto1k_640.yaml --batch-size 16 --accumulation-steps 4 --pretrained ckpt/internimage_h_jointto22k_384.pth
```
🤗 **Huggingface Accelerate Integration of Deepspeed**
Optionally, you can use our [Huggingface accelerate](https://github.com/huggingface/accelerate) integration to launch DeepSpeed training.
```bash
pip install accelerate
```
```bash
accelerate launch --config_file configs/accelerate/dist_8gpus_zero3_wo_loss_scale.yaml main_accelerate.py --cfg configs/internimage_h_22kto1k_640.yaml --data-path /mnt/lustre/share/images --batch-size 16 --pretrained ckpt/internimage_h_jointto22k_384.pth --accumulation-steps 4
accelerate launch --config_file configs/accelerate/dist_8gpus_zero3_offload.yaml main_accelerate.py --cfg configs/internimage_t_1k_224.yaml --data-path /mnt/lustre/share/images --batch-size 128 --accumulation-steps 4 --output output_zero3_offload
accelerate launch --config_file configs/accelerate/dist_8gpus_zero1.yaml main_accelerate.py --cfg configs/internimage_t_1k_224.yaml --data-path /mnt/lustre/share/images --batch-size 128 --accumulation-steps 4
```
**Memory Costs**
Here is the reference GPU memory cost for InternImage-H with 8 GPUs.
- total batch size = 512, i.e., a batch size of 16 per GPU with gradient accumulation steps = 4.
| Resolution | DeepSpeed | CPU offloading | Memory |
| --- | --- | --- | --- |
| 640 | zero1 | False | 22572 |
| 640 | zero3 | False | 20000 |
| 640 | zero3 | True | 19144 |
| 384 | zero1 | False | 16000 |
| 384 | zero3 | True | 11928 |
**Convert Checkpoints**
To convert a DeepSpeed checkpoint to a PyTorch fp32 checkpoint, you can use the following snippet.
```python
from deepspeed.utils.zero_to_fp32 import convert_zero_checkpoint_to_fp32_state_dict
convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, 'best.pth', tag='best')
```
Then, you can use `best.pth` as usual, e.g., `model.load_state_dict(torch.load('best.pth'))`.
> Due to the lack of computational resources, the DeepSpeed training scripts are currently only verified for the first few epochs. Please file an issue if you have problems reproducing the whole training.
### Export ### Export
To export `InternImage-T` from PyTorch to ONNX, run: To export `InternImage-T` from PyTorch to ONNX, run:
......
{
"fp16": {
"enabled": true,
"auto_cast": true
},
"zero_optimization": {
"stage": 1,
"offload_optimizer": {
"device": "none"
},
"offload_param": {
"device": "none"
}
},
"gradient_accumulation_steps": 4,
"gradient_clipping": 5.0,
"steps_per_print": "inf",
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto"
}
\ No newline at end of file
{
"fp16": {
"enabled": true,
"auto_cast": true,
"loss_scale": 1
},
"zero_optimization": {
"stage": 1,
"offload_optimizer": {
"device": "none"
},
"offload_param": {
"device": "none"
}
},
"gradient_accumulation_steps": 4,
"gradient_clipping": 5.0,
"steps_per_print": "inf",
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto"
}
\ No newline at end of file
{
"fp16": {
"enabled": true,
"auto_cast": true
},
"zero_optimization": {
"stage": 3,
"offload_optimizer": {
"device": "cpu"
},
"offload_param": {
"device": "cpu"
}
},
"gradient_accumulation_steps": 4,
"gradient_clipping": 5.0,
"steps_per_print": "inf",
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto"
}
\ No newline at end of file
{
"fp16": {
"enabled": true,
"auto_cast": true,
"loss_scale": 1
},
"zero_optimization": {
"stage": 3,
"offload_optimizer": {
"device": "cpu"
},
"offload_param": {
"device": "cpu"
}
},
"gradient_accumulation_steps": 4,
"gradient_clipping": 5.0,
"steps_per_print": "inf",
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto"
}
\ No newline at end of file
compute_environment: LOCAL_MACHINE
deepspeed_config: {}
distributed_type: MULTI_GPU
downcast_bf16: 'no'
fsdp_config: {}
machine_rank: 0
main_process_ip: null
main_process_port: 11922
main_training_function: main
mixed_precision: 'fp16'
num_machines: 1
num_processes: 8
use_cpu: false
\ No newline at end of file
compute_environment: LOCAL_MACHINE
deepspeed_config:
deepspeed_config_file: configs/accelerate/deepspeed/ds_config_zero1.json
zero3_init_flag: false
distributed_type: DEEPSPEED
downcast_bf16: 'no'
machine_rank: 0
main_training_function: main
num_machines: 1
num_processes: 8
rdzv_backend: static
use_cpu: false
\ No newline at end of file
compute_environment: LOCAL_MACHINE
deepspeed_config:
deepspeed_config_file: configs/accelerate/deepspeed/ds_config_zero1_wo_loss_scale.json
zero3_init_flag: false
distributed_type: DEEPSPEED
downcast_bf16: 'no'
machine_rank: 0
main_training_function: main
num_machines: 1
num_processes: 8
rdzv_backend: static
use_cpu: false
\ No newline at end of file
compute_environment: LOCAL_MACHINE
deepspeed_config:
deepspeed_config_file: configs/accelerate/deepspeed/ds_config_zero3.json
zero3_init_flag: false
distributed_type: DEEPSPEED
downcast_bf16: 'no'
machine_rank: 0
main_training_function: main
num_machines: 1
num_processes: 8
rdzv_backend: static
use_cpu: false
\ No newline at end of file
compute_environment: LOCAL_MACHINE
deepspeed_config:
deepspeed_config_file: configs/accelerate/deepspeed/ds_config_zero3_wo_loss_scale.json
zero3_init_flag: false
distributed_type: DEEPSPEED
downcast_bf16: 'no'
machine_rank: 0
main_training_function: main
num_machines: 1
num_processes: 8
rdzv_backend: static
use_cpu: false
\ No newline at end of file
...@@ -33,8 +33,6 @@ TRAIN: ...@@ -33,8 +33,6 @@ TRAIN:
BASE_LR: 2e-05 # 512 BASE_LR: 2e-05 # 512
WARMUP_LR: .0 WARMUP_LR: .0
MIN_LR: .0 MIN_LR: .0
LR_LAYER_DECAY: true
LR_LAYER_DECAY_RATIO: 0.9
USE_CHECKPOINT: true USE_CHECKPOINT: true
OPTIMIZER: OPTIMIZER:
DCN_LR_MUL: 0.1 DCN_LR_MUL: 0.1
......
DATA:
IMG_SIZE: 384
IMG_ON_MEMORY: True
AUG:
MIXUP: 0.0
CUTMIX: 0.0
REPROB: 0.0
MODEL:
TYPE: intern_image
DROP_PATH_RATE: 0.2
LABEL_SMOOTHING: 0.3
INTERN_IMAGE:
CORE_OP: 'DCNv3'
DEPTHS: [6, 6, 32, 6]
GROUPS: [10, 20, 40, 80]
CHANNELS: 320
DW_KERNEL_SIZE: 5
LAYER_SCALE: None
OFFSET_SCALE: 1.0
MLP_RATIO: 4.0
POST_NORM: False
RES_POST_NORM: True
LEVEL2_POST_NORM: True
LEVEL2_POST_NORM_BLOCK_IDS: [5, 11, 17, 23, 29]
CENTER_FEATURE_SCALE: True
USE_CLIP_PROJECTOR: True
TRAIN:
EMA:
ENABLE: true
DECAY: 0.9999
EPOCHS: 20
WARMUP_EPOCHS: 2
WEIGHT_DECAY: 0.05
BASE_LR: 2e-05 # 512
WARMUP_LR: .0
MIN_LR: .0
USE_CHECKPOINT: true
OPTIMIZER:
DCN_LR_MUL: 0.1
AMP_OPT_LEVEL: O0
EVAL_FREQ: 1
\ No newline at end of file
...@@ -34,8 +34,6 @@ TRAIN: ...@@ -34,8 +34,6 @@ TRAIN:
BASE_LR: 2e-05 # 512 BASE_LR: 2e-05 # 512
WARMUP_LR: .0 WARMUP_LR: .0
MIN_LR: .0 MIN_LR: .0
LR_LAYER_DECAY: true
LR_LAYER_DECAY_RATIO: 0.9
USE_CHECKPOINT: true USE_CHECKPOINT: true
OPTIMIZER: OPTIMIZER:
USE_ZERO: True USE_ZERO: True
......
...@@ -28,8 +28,6 @@ TRAIN: ...@@ -28,8 +28,6 @@ TRAIN:
BASE_LR: 2e-05 # 512 BASE_LR: 2e-05 # 512
WARMUP_LR: .0 WARMUP_LR: .0
MIN_LR: .0 MIN_LR: .0
LR_LAYER_DECAY: true
LR_LAYER_DECAY_RATIO: 0.9
USE_CHECKPOINT: true USE_CHECKPOINT: true
OPTIMIZER: OPTIMIZER:
DCN_LR_MUL: 0.1 DCN_LR_MUL: 0.1
......
...@@ -28,8 +28,6 @@ TRAIN: ...@@ -28,8 +28,6 @@ TRAIN:
BASE_LR: 2e-05 # 512 BASE_LR: 2e-05 # 512
WARMUP_LR: .0 WARMUP_LR: .0
MIN_LR: .0 MIN_LR: .0
LR_LAYER_DECAY: true
LR_LAYER_DECAY_RATIO: 0.9
USE_CHECKPOINT: true USE_CHECKPOINT: true
OPTIMIZER: OPTIMIZER:
DCN_LR_MUL: 0.1 DCN_LR_MUL: 0.1
......
DATA:
IMG_ON_MEMORY: True
MODEL:
TYPE: intern_image
DROP_PATH_RATE: 0.5
INTERN_IMAGE:
CORE_OP: 'DCNv3'
DEPTHS: [4, 4, 21, 4]
GROUPS: [7, 14, 28, 56]
CHANNELS: 112
LAYER_SCALE: 1e-5
OFFSET_SCALE: 1.0
MLP_RATIO: 4.0
POST_NORM: True
TRAIN:
EMA:
ENABLE: True
DECAY: 0.9999
BASE_LR: 5e-4
\ No newline at end of file
DATA:
IMG_SIZE: 512
IMG_ON_MEMORY: True
AUG:
MIXUP: 0.0
CUTMIX: 0.0
REPROB: 0.0
MODEL:
TYPE: intern_image
DROP_PATH_RATE: 0.4
LABEL_SMOOTHING: 0.3
INTERN_IMAGE:
CORE_OP: 'DCNv3'
DEPTHS: [2, 2, 48, 4]
GROUPS: [16, 32, 64, 128]
CHANNELS: 512
DW_KERNEL_SIZE: 5
LAYER_SCALE: None
OFFSET_SCALE: 1.0
MLP_RATIO: 4.0
POST_NORM: True
LEVEL2_POST_NORM: True
LEVEL2_POST_NORM_BLOCK_IDS: [5, 11, 17, 23, 29, 35, 41, 47]
CENTER_FEATURE_SCALE: True
USE_CLIP_PROJECTOR: True
TRAIN:
EMA:
ENABLE: true
DECAY: 0.9999
EPOCHS: 20
WARMUP_EPOCHS: 2
WEIGHT_DECAY: 0.05
BASE_LR: 2e-05 # 512
WARMUP_LR: .0
MIN_LR: .0
LR_LAYER_DECAY: true
LR_LAYER_DECAY_RATIO: 0.9
USE_CHECKPOINT: true
OPTIMIZER:
DCN_LR_MUL: 0.1
AMP_OPT_LEVEL: O0
EVAL_FREQ: 1
\ No newline at end of file
DATA:
IMG_SIZE: 384
IMG_ON_MEMORY: True
AUG:
MIXUP: 0.0
CUTMIX: 0.0
REPROB: 0.0
MODEL:
TYPE: intern_image
DROP_PATH_RATE: 0.2
LABEL_SMOOTHING: 0.3
INTERN_IMAGE:
CORE_OP: 'DCNv3'
DEPTHS: [6, 6, 32, 6]
GROUPS: [10, 20, 40, 80]
CHANNELS: 320
DW_KERNEL_SIZE: 5
LAYER_SCALE: None
OFFSET_SCALE: 1.0
MLP_RATIO: 4.0
POST_NORM: False
RES_POST_NORM: True
LEVEL2_POST_NORM: True
LEVEL2_POST_NORM_BLOCK_IDS: [5, 11, 17, 23, 29]
CENTER_FEATURE_SCALE: True
USE_CLIP_PROJECTOR: True
TRAIN:
EMA:
ENABLE: true
DECAY: 0.9999
EPOCHS: 20
WARMUP_EPOCHS: 2
WEIGHT_DECAY: 0.05
BASE_LR: 2e-05 # 512
WARMUP_LR: .0
MIN_LR: .0
LR_LAYER_DECAY: true
LR_LAYER_DECAY_RATIO: 0.9
USE_CHECKPOINT: true
OPTIMIZER:
DCN_LR_MUL: 0.1
AMP_OPT_LEVEL: O0
EVAL_FREQ: 1
\ No newline at end of file
DATA:
IMG_SIZE: 640
IMG_ON_MEMORY: True
AUG:
MIXUP: 0.0
CUTMIX: 0.0
REPROB: 0.0
MODEL:
TYPE: intern_image
DROP_PATH_RATE: 0.2
LABEL_SMOOTHING: 0.3
INTERN_IMAGE:
CORE_OP: 'DCNv3'
DEPTHS: [6, 6, 32, 6]
GROUPS: [10, 20, 40, 80]
CHANNELS: 320
DW_KERNEL_SIZE: 5
LAYER_SCALE: None
OFFSET_SCALE: 1.0
MLP_RATIO: 4.0
POST_NORM: False
RES_POST_NORM: True
LEVEL2_POST_NORM: True
LEVEL2_POST_NORM_BLOCK_IDS: [5, 11, 17, 23, 29]
CENTER_FEATURE_SCALE: True
USE_CLIP_PROJECTOR: True
TRAIN:
EMA:
ENABLE: true
DECAY: 0.9999
EPOCHS: 20
WARMUP_EPOCHS: 2
WEIGHT_DECAY: 0.05
BASE_LR: 2e-05 # 512
WARMUP_LR: .0
MIN_LR: .0
LR_LAYER_DECAY: true
LR_LAYER_DECAY_RATIO: 0.9
USE_CHECKPOINT: true
OPTIMIZER:
USE_ZERO: True
DCN_LR_MUL: 0.1
AMP_OPT_LEVEL: O0
EVAL_FREQ: 1
\ No newline at end of file
DATA:
IMG_SIZE: 384
IMG_ON_MEMORY: True
AUG:
MIXUP: 0.0
CUTMIX: 0.0
REPROB: 0.0
MODEL:
TYPE: intern_image
DROP_PATH_RATE: 0.1
LABEL_SMOOTHING: 0.3
INTERN_IMAGE:
CORE_OP: 'DCNv3'
DEPTHS: [5, 5, 22, 5]
GROUPS: [10, 20, 40, 80]
CHANNELS: 160
LAYER_SCALE: 1e-5
OFFSET_SCALE: 2.0
MLP_RATIO: 4.0
POST_NORM: True
TRAIN:
EMA:
ENABLE: true
DECAY: 0.9999
EPOCHS: 20
WARMUP_EPOCHS: 2
WEIGHT_DECAY: 0.05
BASE_LR: 2e-05 # 512
WARMUP_LR: .0
MIN_LR: .0
LR_LAYER_DECAY: true
LR_LAYER_DECAY_RATIO: 0.9
USE_CHECKPOINT: true
OPTIMIZER:
DCN_LR_MUL: 0.1
AMP_OPT_LEVEL: O0
EVAL_FREQ: 1
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment