"...git@developer.sourcefind.cn:OpenDAS/openpcdet.git" did not exist on "ac5198a7ad7facc5d2febaa46c92116440541502"
Unverified Commit 213b0b8d authored by Zeqiang Lai's avatar Zeqiang Lai Committed by GitHub
Browse files

Minor update on Deepspeed (#112)

* minor update on deepspeed

* support config offloading
parent 861253ca
...@@ -204,16 +204,17 @@ We support utilizing [Deepspeed](https://github.com/microsoft/DeepSpeed) to redu ...@@ -204,16 +204,17 @@ We support utilizing [Deepspeed](https://github.com/microsoft/DeepSpeed) to redu
To use it, first install the requirements as To use it, first install the requirements as
```bash ```bash
pip install deepspeed pip install deepspeed==0.8.3
``` ```
Then you could launch the training in a slurm system with 8 GPUs as follows (tiny and huge as examples) Then you could launch the training in a slurm system with 8 GPUs as follows (tiny and huge as examples).
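The default ZeRO stage is 1; it can be configured via the command-line argument `--zero-stage`. This script accepts only stages 1 and 2; for ZeRO stage 3, use the accelerate integration (`main_accelerate.py`) instead.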
``` ```
GPUS=8 GPUS_PER_NODE=8 sh train_in1k_deepspeed.sh vc_research_4 train configs/internimage_t_1k_224.yaml --batch-size 128 --accumulation-steps 4 GPUS=8 GPUS_PER_NODE=8 sh train_in1k_deepspeed.sh vc_research_4 train configs/internimage_t_1k_224.yaml --batch-size 128 --accumulation-steps 4
GPUS=8 GPUS_PER_NODE=8 sh train_in1k_deepspeed.sh vc_research_4 train configs/internimage_t_1k_224.yaml --batch-size 128 --accumulation-steps 4 --eval --resume ckpt.pth GPUS=8 GPUS_PER_NODE=8 sh train_in1k_deepspeed.sh vc_research_4 train configs/internimage_t_1k_224.yaml --batch-size 128 --accumulation-steps 4 --eval --resume ckpt.pth
GPUS=8 GPUS_PER_NODE=8 sh train_in1k_deepspeed.sh vc_research_4 train configs/internimage_t_1k_224.yaml --batch-size 128 --accumulation-steps 4 --eval --resume deepspeed_ckpt_dir GPUS=8 GPUS_PER_NODE=8 sh train_in1k_deepspeed.sh vc_research_4 train configs/internimage_t_1k_224.yaml --batch-size 128 --accumulation-steps 4 --eval --resume deepspeed_ckpt_dir
GPUS=8 GPUS_PER_NODE=8 sh train_in1k_deepspeed.sh vc_research_4 train configs/internimage_h_22kto1k_640.yaml --batch-size 16 --accumulation-steps 4 --pretrained ckpt/internimage_h_jointto22k_384.pth GPUS=8 GPUS_PER_NODE=8 sh train_in1k_deepspeed.sh vc_research_4 train configs/internimage_h_22kto1k_640.yaml --batch-size 16 --accumulation-steps 4 --pretrained ckpt/internimage_h_jointto22k_384.pth
GPUS=8 GPUS_PER_NODE=8 sh train_in1k_deepspeed.sh vc_research_4 train configs/internimage_h_22kto1k_640.yaml --batch-size 16 --accumulation-steps 4 --pretrained ckpt/internimage_h_jointto22k_384.pth --zero-stage 3
``` ```
...@@ -222,7 +223,7 @@ GPUS=8 GPUS_PER_NODE=8 sh train_in1k_deepspeed.sh vc_research_4 train configs/in ...@@ -222,7 +223,7 @@ GPUS=8 GPUS_PER_NODE=8 sh train_in1k_deepspeed.sh vc_research_4 train configs/in
Optionally, you could use our [Huggingface accelerate](https://github.com/huggingface/accelerate) integration to use deepspeed. Optionally, you could use our [Huggingface accelerate](https://github.com/huggingface/accelerate) integration to use deepspeed.
```bash ```bash
pip install accelerate pip install accelerate==0.18.0
``` ```
```bash ```bash
......
compute_environment: LOCAL_MACHINE compute_environment: LOCAL_MACHINE
deepspeed_config: deepspeed_config:
deepspeed_config_file: configs/accelerate/deepspeed/ds_config_zero3.json deepspeed_config_file: configs/accelerate/deepspeed/ds_config_zero3_offload.json
zero3_init_flag: false zero3_init_flag: false
distributed_type: DEEPSPEED distributed_type: DEEPSPEED
downcast_bf16: 'no' downcast_bf16: 'no'
......
...@@ -29,6 +29,7 @@ from utils import load_pretrained, reduce_tensor, MyAverageMeter ...@@ -29,6 +29,7 @@ from utils import load_pretrained, reduce_tensor, MyAverageMeter
from ddp_hooks import fp16_compress_hook from ddp_hooks import fp16_compress_hook
from ema_deepspeed import EMADeepspeed from ema_deepspeed import EMADeepspeed
def parse_option(): def parse_option():
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
'InternImage training and evaluation script', add_help=False) 'InternImage training and evaluation script', add_help=False)
...@@ -42,10 +43,11 @@ def parse_option(): ...@@ -42,10 +43,11 @@ def parse_option():
parser.add_argument('--zip', action='store_true', help='use zipped dataset instead of folder dataset') parser.add_argument('--zip', action='store_true', help='use zipped dataset instead of folder dataset')
parser.add_argument('--cache-mode', type=str, default='part', choices=['no', 'full', 'part'], parser.add_argument('--cache-mode', type=str, default='part', choices=['no', 'full', 'part'],
help='no: no cache, ' help='no: no cache, '
'full: cache all data, ' 'full: cache all data, '
'part: sharding the dataset into nonoverlapping pieces and only cache one piece' 'part: sharding the dataset into nonoverlapping pieces and only cache one piece'
) )
parser.add_argument('--pretrained', help='pretrained weight from checkpoint, could be imagenet22k pretrained weight') parser.add_argument('--pretrained',
help='pretrained weight from checkpoint, could be imagenet22k pretrained weight')
parser.add_argument('--resume', help='resume from checkpoint') parser.add_argument('--resume', help='resume from checkpoint')
parser.add_argument('--output', default='output', type=str, metavar='PATH', parser.add_argument('--output', default='output', type=str, metavar='PATH',
help='root of output folder, the full path is <output>/<model_name>/<tag> (default: output)' help='root of output folder, the full path is <output>/<model_name>/<tag> (default: output)'
...@@ -58,7 +60,16 @@ def parse_option(): ...@@ -58,7 +60,16 @@ def parse_option():
# distributed training # distributed training
parser.add_argument("--local-rank", type=int, required=True, help='local rank for DistributedDataParallel') parser.add_argument("--local-rank", type=int, required=True, help='local rank for DistributedDataParallel')
# deepspeed config
parser.add_argument('--disable-grad-scalar', action='store_true', help='disable Grad Scalar') parser.add_argument('--disable-grad-scalar', action='store_true', help='disable Grad Scalar')
parser.add_argument('--offload-optimizer', type=str, default='none', choices=['cpu', 'none'],
help='enable optimizer offloading')
parser.add_argument('--offload-param', type=str, default='none', choices=['cpu', 'none'],
help='enable model offloading')
# To use ZeRO stage 3, please use main_accelerate.py instead.
# This script runs into an issue similar to https://github.com/microsoft/DeepSpeed/issues/3068, so only stages 1 and 2 are accepted here.
parser.add_argument("--zero-stage", type=int, default=1, choices=[1, 2], help='deep speed zero stage')
args, unparsed = parser.parse_known_args() args, unparsed = parser.parse_known_args()
config = get_config(args) config = get_config(args)
...@@ -97,11 +108,11 @@ def build_criterion(config): ...@@ -97,11 +108,11 @@ def build_criterion(config):
def scale_learning_rate(config, num_processes): def scale_learning_rate(config, num_processes):
# linear scale the learning rate according to total batch size, may not be optimal # linear scale the learning rate according to total batch size, may not be optimal
linear_scaled_lr = config.TRAIN.BASE_LR * \ linear_scaled_lr = config.TRAIN.BASE_LR * \
config.DATA.BATCH_SIZE * num_processes / 512.0 config.DATA.BATCH_SIZE * num_processes / 512.0
linear_scaled_warmup_lr = config.TRAIN.WARMUP_LR * \ linear_scaled_warmup_lr = config.TRAIN.WARMUP_LR * \
config.DATA.BATCH_SIZE * num_processes / 512.0 config.DATA.BATCH_SIZE * num_processes / 512.0
linear_scaled_min_lr = config.TRAIN.MIN_LR * \ linear_scaled_min_lr = config.TRAIN.MIN_LR * \
config.DATA.BATCH_SIZE * num_processes / 512.0 config.DATA.BATCH_SIZE * num_processes / 512.0
# gradient accumulation also need to scale the learning rate # gradient accumulation also need to scale the learning rate
if config.TRAIN.ACCUMULATION_STEPS > 1: if config.TRAIN.ACCUMULATION_STEPS > 1:
linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUMULATION_STEPS linear_scaled_lr = linear_scaled_lr * config.TRAIN.ACCUMULATION_STEPS
...@@ -121,7 +132,7 @@ def scale_learning_rate(config, num_processes): ...@@ -121,7 +132,7 @@ def scale_learning_rate(config, num_processes):
def log_model_statistic(model_wo_ddp): def log_model_statistic(model_wo_ddp):
n_parameters = sum(p.numel() for p in model_wo_ddp.parameters() n_parameters = sum(p.numel() for p in model_wo_ddp.parameters()
if p.requires_grad) if p.requires_grad)
logger.info(f"number of params: {n_parameters/1e6} M") logger.info(f"number of params: {n_parameters / 1e6} M")
if hasattr(model_wo_ddp, 'flops'): if hasattr(model_wo_ddp, 'flops'):
flops = model_wo_ddp.flops() flops = model_wo_ddp.flops()
logger.info(f"number of GFLOPs: {flops / 1e9}") logger.info(f"number of GFLOPs: {flops / 1e9}")
...@@ -180,7 +191,13 @@ def build_ds_config(config, args): ...@@ -180,7 +191,13 @@ def build_ds_config(config, args):
"loss_scale": 1 if args.disable_grad_scalar else 0 "loss_scale": 1 if args.disable_grad_scalar else 0
}, },
"zero_optimization": { "zero_optimization": {
"stage": 1, "stage": args.zero_stage,
"offload_optimizer": {
"device": args.offload_optimizer
},
"offload_param": {
"device": args.offload_param
}
}, },
"steps_per_print": 1e10, "steps_per_print": 1e10,
"gradient_accumulation_steps": config.TRAIN.ACCUMULATION_STEPS, "gradient_accumulation_steps": config.TRAIN.ACCUMULATION_STEPS,
...@@ -324,7 +341,7 @@ def eval_epoch(config, data_loader, model, epoch=None): ...@@ -324,7 +341,7 @@ def eval_epoch(config, data_loader, model, epoch=None):
def train(config, ds_config): def train(config, ds_config):
# -------------- build ---------------- # # -------------- build ---------------- #
_, dataset_val, _, data_loader_train, data_loader_val, _, mixup_fn = build_loader(config) _, dataset_val, _, data_loader_train, data_loader_val, _, mixup_fn = build_loader(config)
model = build_model(config) model = build_model(config)
model.cuda() model.cuda()
...@@ -356,7 +373,7 @@ def train(config, ds_config): ...@@ -356,7 +373,7 @@ def train(config, ds_config):
model_ema = EMADeepspeed(model, config.TRAIN.EMA.DECAY) model_ema = EMADeepspeed(model, config.TRAIN.EMA.DECAY)
# -------------- resume ---------------- # # -------------- resume ---------------- #
max_accuracy = 0.0 max_accuracy = 0.0
max_accuracy_ema = 0.0 max_accuracy_ema = 0.0
client_state = {} client_state = {}
...@@ -379,9 +396,9 @@ def train(config, ds_config): ...@@ -379,9 +396,9 @@ def train(config, ds_config):
if model_ema is not None: if model_ema is not None:
max_accuracy_ema = client_state.get('max_accuracy_ema', 0.0) max_accuracy_ema = client_state.get('max_accuracy_ema', 0.0)
model_ema.load_state_dict((client_state['model_ema'])) model_ema.load_state_dict((client_state['model_ema']))
# -------------- training ---------------- # # -------------- training ---------------- #
logger.info(f"Creating model:{config.MODEL.TYPE}/{config.MODEL.NAME}") logger.info(f"Creating model:{config.MODEL.TYPE}/{config.MODEL.NAME}")
logger.info(str(model)) logger.info(str(model))
logger.info(get_optimizer_state_str(optimizer)) logger.info(get_optimizer_state_str(optimizer))
...@@ -461,10 +478,11 @@ def eval(config): ...@@ -461,10 +478,11 @@ def eval(config):
from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
ckpt_dir = os.path.dirname(config.MODEL.RESUME) ckpt_dir = os.path.dirname(config.MODEL.RESUME)
tag = os.path.basename(config.MODEL.RESUME) tag = os.path.basename(config.MODEL.RESUME)
state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir=ckpt_dir, tag=tag) state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir=ckpt_dir, tag=tag)
model_wo_ddp.load_state_dict(state_dict) model_wo_ddp.load_state_dict(state_dict)
except: except:
checkpoint = torch.load(os.path.join(config.MODEL.RESUME, 'mp_rank_00_model_states.pt'), map_location='cpu') checkpoint = torch.load(os.path.join(config.MODEL.RESUME, 'mp_rank_00_model_states.pt'),
map_location='cpu')
model_wo_ddp.load_state_dict(checkpoint['module']) model_wo_ddp.load_state_dict(checkpoint['module'])
elif config.MODEL.PRETRAINED: elif config.MODEL.PRETRAINED:
load_pretrained(config, model_wo_ddp, logger) load_pretrained(config, model_wo_ddp, logger)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment