Minor update on Deepspeed (#112)

* minor update on deepspeed * support config offloading

Minor update on Deepspeed (#112)
* minor update on deepspeed * support config offloading
213b0b8d · Zeqiang Lai · GitHub · 861253ca · 213b0b8d · 213b0b8d
Unverified Commit 213b0b8d authored Apr 19, 2023 by Zeqiang Lai Committed by GitHub Apr 19, 2023
3 changed files
--- a/classification/README.md
+++ b/classification/README.md
@@ -204,16 +204,17 @@ We support utilizing [Deepspeed](https://github.com/microsoft/DeepSpeed) to redu
 To use it, first install the requirements as
 ```bash
-pip install deepspeed
+pip install deepspeed==0.8.3
 ```
-Then you could launch the training in a slurm system with 8 GPUs as follows (tiny and huge as examples)
+Then you can launch the training on a Slurm system with 8 GPUs as follows (tiny and huge as examples).
+The default ZeRO stage is 1; it can be configured via the command-line argument `--zero-stage`.
 ```
 GPUS=8 GPUS_PER_NODE=8 sh train_in1k_deepspeed.sh vc_research_4 train configs/internimage_t_1k_224.yaml --batch-size 128 --accumulation-steps 4 
 GPUS=8 GPUS_PER_NODE=8 sh train_in1k_deepspeed.sh vc_research_4 train configs/internimage_t_1k_224.yaml --batch-size 128 --accumulation-steps 4 --eval --resume ckpt.pth
 GPUS=8 GPUS_PER_NODE=8 sh train_in1k_deepspeed.sh vc_research_4 train configs/internimage_t_1k_224.yaml --batch-size 128 --accumulation-steps 4 --eval --resume deepspeed_ckpt_dir
 GPUS=8 GPUS_PER_NODE=8 sh train_in1k_deepspeed.sh vc_research_4 train configs/internimage_h_22kto1k_640.yaml --batch-size 16 --accumulation-steps 4 --pretrained ckpt/internimage_h_jointto22k_384.pth
+GPUS=8 GPUS_PER_NODE=8 sh train_in1k_deepspeed.sh vc_research_4 train configs/internimage_h_22kto1k_640.yaml --batch-size 16 --accumulation-steps 4 --pretrained ckpt/internimage_h_jointto22k_384.pth --zero-stage 2
 ```
@@ -222,7 +223,7 @@ GPUS=8 GPUS_PER_NODE=8 sh train_in1k_deepspeed.sh vc_research_4 train configs/in
 Optionally, you can use our [Hugging Face Accelerate](https://github.com/huggingface/accelerate) integration to run DeepSpeed.
 ```bash
-pip install accelerate
+pip install accelerate==0.18.0
 ```
 ```bash

--- a/classification/configs/accelerate/dist_8gpus_zero3_offload.yaml
+++ b/classification/configs/accelerate/dist_8gpus_zero3_offload.yaml
 compute_environment: LOCAL_MACHINE
 deepspeed_config:
-  deepspeed_config_file: configs/accelerate/deepspeed/ds_config_zero3.json
+  deepspeed_config_file: configs/accelerate/deepspeed/ds_config_zero3_offload.json
  zero3_init_flag: false
 distributed_type: DEEPSPEED
 downcast_bf16: 'no'

--- a/classification/main_deepspeed.py
+++ b/classification/main_deepspeed.py
@@ -29,6 +29,7 @@ from utils import load_pretrained, reduce_tensor, MyAverageMeter
 from ddp_hooks import fp16_compress_hook
 from ema_deepspeed import EMADeepspeed
 def parse_option():
    parser = argparse.ArgumentParser(
        'InternImage training and evaluation script', add_help=False)
@@ -45,7 +46,8 @@ def parse_option():
                             'full: cache all data, '
                             'part: sharding the dataset into nonoverlapping pieces and only cache one piece'
                        )
-    parser.add_argument('--pretrained', help='pretrained weight from checkpoint, could be imagenet22k pretrained weight')
+    parser.add_argument('--pretrained',
+                        help='pretrained weight from checkpoint, could be imagenet22k pretrained weight')
    parser.add_argument('--resume', help='resume from checkpoint')
    parser.add_argument('--output', default='output', type=str, metavar='PATH',
                        help='root of output folder, the full path is <output>/<model_name>/<tag> (default: output)'
@@ -58,7 +60,16 @@ def parse_option():
    # distributed training
    parser.add_argument("--local-rank", type=int, required=True, help='local rank for DistributedDataParallel')
+    # deepspeed config
    parser.add_argument('--disable-grad-scalar', action='store_true', help='disable Grad Scalar')
+    parser.add_argument('--offload-optimizer', type=str, default='none', choices=['cpu', 'none'],
+                        help='enable optimizer offloading')
+    parser.add_argument('--offload-param', type=str, default='none', choices=['cpu', 'none'],
+                        help='enable model offloading')
+    # To use ZeRO stage 3, please use main_accelerate.py instead.
+    # This script runs into an issue similar to https://github.com/microsoft/DeepSpeed/issues/3068
+    parser.add_argument("--zero-stage", type=int, default=1, choices=[1, 2], help='deep speed zero stage')
    args, unparsed = parser.parse_known_args()
    config = get_config(args)
@@ -121,7 +132,7 @@ def scale_learning_rate(config, num_processes):
 def log_model_statistic(model_wo_ddp):
    n_parameters = sum(p.numel() for p in model_wo_ddp.parameters()
                       if p.requires_grad)
-    logger.info(f"number of params: {n_parameters/1e6} M")
+    logger.info(f"number of params: {n_parameters / 1e6} M")
    if hasattr(model_wo_ddp, 'flops'):
        flops = model_wo_ddp.flops()
        logger.info(f"number of GFLOPs: {flops / 1e9}")
@@ -180,7 +191,13 @@ def build_ds_config(config, args):
            "loss_scale": 1 if args.disable_grad_scalar else 0
        },
        "zero_optimization": {
-            "stage": 1,
+            "stage": args.zero_stage,
+            "offload_optimizer": {
+                "device": args.offload_optimizer
+            },
+            "offload_param": {
+                "device": args.offload_param
+            }
        },
        "steps_per_print": 1e10,
        "gradient_accumulation_steps": config.TRAIN.ACCUMULATION_STEPS,
@@ -464,7 +481,8 @@ def eval(config):
                state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir=ckpt_dir, tag=tag)
                model_wo_ddp.load_state_dict(state_dict)
            except:
-                checkpoint = torch.load(os.path.join(config.MODEL.RESUME, 'mp_rank_00_model_states.pt'), map_location='cpu')
+                checkpoint = torch.load(os.path.join(config.MODEL.RESUME, 'mp_rank_00_model_states.pt'),
+                                        map_location='cpu')
                model_wo_ddp.load_state_dict(checkpoint['module'])
    elif config.MODEL.PRETRAINED:
        load_pretrained(config, model_wo_ddp, logger)