OpenDAS / dcnv3 / Commits / 213b0b8d

Unverified commit 213b0b8d, authored Apr 19, 2023 by Zeqiang Lai, committed by GitHub Apr 19, 2023

Minor update on Deepspeed (#112)

* minor update on deepspeed
* support config offloading

parent 861253ca

Showing 3 changed files with 38 additions and 19 deletions
classification/README.md (+5 -4)
classification/configs/accelerate/dist_8gpus_zero3_offload.yaml (+1 -1)
classification/main_deepspeed.py (+32 -14)
classification/README.md (view file @ 213b0b8d)

@@ -204,16 +204,17 @@ We support utilizing [Deepspeed](https://github.com/microsoft/DeepSpeed) to redu...
To use it, first install the requirements as

```bash
-pip install deepspeed
+pip install deepspeed==0.8.3
```

-Then you could launch the training in a slurm system with 8 GPUs as follows (tiny and huge as examples)
+Then you could launch the training in a slurm system with 8 GPUs as follows (tiny and huge as examples). The default zero stage is 1, and it can be configured via the command line arg `--zero-stage`.

```
GPUS=8 GPUS_PER_NODE=8 sh train_in1k_deepspeed.sh vc_research_4 train configs/internimage_t_1k_224.yaml --batch-size 128 --accumulation-steps 4
-GPUS=8 GPUS_PER_NODE=8 sh train_in1k_deepspeed.sh vc_research_4 train configs/internimage_t_1k_224.yaml --batch-size 128 --accumulation-steps 4 --eval --resume ckpt.pth
+GPUS=8 GPUS_PER_NODE=8 sh train_in1k_deepspeed.sh vc_research_4 train configs/internimage_t_1k_224.yaml --batch-size 128 --accumulation-steps 4 --eval --resume deepspeed_ckpt_dir
GPUS=8 GPUS_PER_NODE=8 sh train_in1k_deepspeed.sh vc_research_4 train configs/internimage_h_22kto1k_640.yaml --batch-size 16 --accumulation-steps 4 --pretrained ckpt/internimage_h_jointto22k_384.pth
+GPUS=8 GPUS_PER_NODE=8 sh train_in1k_deepspeed.sh vc_research_4 train configs/internimage_h_22kto1k_640.yaml --batch-size 16 --accumulation-steps 4 --pretrained ckpt/internimage_h_jointto22k_384.pth --zero-stage 3
```
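For orientation, `--batch-size` in these commands is the per-GPU batch size (as in the Swin-style training scripts this codebase follows), so the effective global batch per optimizer step is the product of GPU count, per-GPU batch, and accumulation steps; a quick sketch for the tiny example, under that assumption:

```python
# Effective global batch size for the tiny example above,
# assuming --batch-size is the per-GPU batch size (Swin-style scripts).
gpus = 8
per_gpu_batch_size = 128
accumulation_steps = 4

global_batch_size = gpus * per_gpu_batch_size * accumulation_steps
print(global_batch_size)  # 4096 samples per optimizer step
```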
@@ -222,7 +223,7 @@ GPUS=8 GPUS_PER_NODE=8 sh train_in1k_deepspeed.sh vc_research_4 train configs/in...
Optionally, you could use our [Huggingface accelerate](https://github.com/huggingface/accelerate) integration to use deepspeed.

```bash
-pip install accelerate
+pip install accelerate==0.18.0
```

```bash
...
```
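The accelerate launch commands themselves are collapsed in this diff view. As a hedged illustration of what the integration amounts to (the toy model, data, and script name are placeholders, not the repo's actual training loop), launched with something like `accelerate launch --config_file configs/accelerate/dist_8gpus_zero3_offload.yaml train_sketch.py`:

```python
# Minimal sketch of training through Hugging Face accelerate's DeepSpeed plugin.
# The DeepSpeed/ZeRO settings come from the accelerate config chosen at launch time;
# nothing DeepSpeed-specific needs to appear in the script itself.
import torch
from torch.utils.data import DataLoader, TensorDataset
from accelerate import Accelerator

accelerator = Accelerator()                      # reads the DeepSpeed plugin settings
model = torch.nn.Linear(16, 2)                   # placeholder for the real model
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
dataset = TensorDataset(torch.randn(64, 16), torch.randint(0, 2, (64,)))
loader = DataLoader(dataset, batch_size=4)
criterion = torch.nn.CrossEntropyLoss()

model, optimizer, loader = accelerator.prepare(model, optimizer, loader)

for images, targets in loader:
    loss = criterion(model(images), targets)
    accelerator.backward(loss)                   # use instead of loss.backward() so ZeRO hooks run
    optimizer.step()
    optimizer.zero_grad()
```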
classification/configs/accelerate/dist_8gpus_zero3_offload.yaml (view file @ 213b0b8d)

compute_environment: LOCAL_MACHINE
deepspeed_config:
-  deepspeed_config_file: configs/accelerate/deepspeed/ds_config_zero3.json
+  deepspeed_config_file: configs/accelerate/deepspeed/ds_config_zero3_offload.json
  zero3_init_flag: false
distributed_type: DEEPSPEED
downcast_bf16: 'no'
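The diff only repoints the accelerate config at a different DeepSpeed JSON; the contents of ds_config_zero3_offload.json are not shown here. For orientation, a ZeRO stage-3 config with CPU offloading commonly looks like the following (sketched as a Python dict mirroring typical DeepSpeed examples, not necessarily this repo's exact file):

```python
# Typical shape of a ZeRO stage-3 + CPU offload DeepSpeed config
# (illustrative only; the repo's ds_config_zero3_offload.json may differ).
import json

ds_config_zero3_offload = {
    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {"device": "cpu", "pin_memory": True},
        "offload_param": {"device": "cpu", "pin_memory": True},
    },
    "gradient_accumulation_steps": "auto",
    "train_micro_batch_size_per_gpu": "auto",
}

print(json.dumps(ds_config_zero3_offload, indent=2))
```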
classification/main_deepspeed.py (view file @ 213b0b8d)

@@ -29,6 +29,7 @@ from utils import load_pretrained, reduce_tensor, MyAverageMeter
from ddp_hooks import fp16_compress_hook
+from ema_deepspeed import EMADeepspeed


def parse_option():
    parser = argparse.ArgumentParser('InternImage training and evaluation script', add_help=False)
@@ -45,7 +46,8 @@ def parse_option():
                        'full: cache all data, '
                        'part: sharding the dataset into nonoverlapping pieces and only cache one piece')
    parser.add_argument('--pretrained', help='pretrained weight from checkpoint, could be imagenet22k pretrained weight')
    parser.add_argument('--resume', help='resume from checkpoint')
    parser.add_argument('--output', default='output', type=str, metavar='PATH',
                        help='root of output folder, the full path is <output>/<model_name>/<tag> (default: output)')
@@ -58,7 +60,16 @@ def parse_option():
    # distributed training
    parser.add_argument("--local-rank", type=int, required=True, help='local rank for DistributedDataParallel')

    # deepspeed config
    parser.add_argument('--disable-grad-scalar', action='store_true', help='disable Grad Scalar')
+    parser.add_argument('--offload-optimizer', type=str, default='none', choices=['cpu', 'none'], help='enable optimizer offloading')
+    parser.add_argument('--offload-param', type=str, default='none', choices=['cpu', 'none'], help='enable model offloading')
+    # To use Zero3, Please use main_accelerate.py instead.
+    # For this script, we are facing a similar issue as https://github.com/microsoft/DeepSpeed/issues/3068
+    parser.add_argument("--zero-stage", type=int, default=1, choices=[1, 2], help='deep speed zero stage')

    args, unparsed = parser.parse_known_args()
    config = get_config(args)
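To make the new flags concrete, here is a small hypothetical parse of them; the option definitions mirror the hunk above, the command line values are examples only, and the real script builds its parser inside parse_option():

```python
# Hypothetical illustration of parsing the new DeepSpeed-related flags.
import argparse

parser = argparse.ArgumentParser('InternImage training and evaluation script', add_help=False)
parser.add_argument('--disable-grad-scalar', action='store_true')
parser.add_argument('--offload-optimizer', type=str, default='none', choices=['cpu', 'none'])
parser.add_argument('--offload-param', type=str, default='none', choices=['cpu', 'none'])
parser.add_argument('--zero-stage', type=int, default=1, choices=[1, 2])

# e.g. keep ZeRO stage 2 but push optimizer state to CPU memory
args, _ = parser.parse_known_args(['--zero-stage', '2', '--offload-optimizer', 'cpu'])
print(args.zero_stage, args.offload_optimizer, args.offload_param)  # 2 cpu none
```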
@@ -121,7 +132,7 @@ def scale_learning_rate(config, num_processes):
def log_model_statistic(model_wo_ddp):
    n_parameters = sum(p.numel() for p in model_wo_ddp.parameters() if p.requires_grad)
    logger.info(f"number of params: {n_parameters / 1e6} M")
    if hasattr(model_wo_ddp, 'flops'):
        flops = model_wo_ddp.flops()
        logger.info(f"number of GFLOPs: {flops / 1e9}")
@@ -180,7 +191,13 @@ def build_ds_config(config, args):
"loss_scale"
:
1
if
args
.
disable_grad_scalar
else
0
},
"zero_optimization"
:
{
"stage"
:
1
,
"stage"
:
args
.
zero_stage
,
"offload_optimizer"
:
{
"device"
:
args
.
offload_optimizer
},
"offload_param"
:
{
"device"
:
args
.
offload_param
}
},
"steps_per_print"
:
1e10
,
"gradient_accumulation_steps"
:
config
.
TRAIN
.
ACCUMULATION_STEPS
,
...
...
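For context on how a dict like the one build_ds_config assembles is consumed (the script's actual call site is outside this hunk), here is a minimal sketch using DeepSpeed's standard entry point; the toy model and the concrete config values are illustrative only, and the snippet is meant to run under the deepspeed launcher:

```python
# Minimal sketch: handing a ZeRO config dict to deepspeed.initialize
# (run under the deepspeed launcher, e.g. `deepspeed sketch.py`).
import torch
import deepspeed

model = torch.nn.Linear(16, 2)                        # placeholder model
ds_config = {
    "train_micro_batch_size_per_gpu": 128,
    "gradient_accumulation_steps": 4,
    "zero_optimization": {
        "stage": 2,                                   # what --zero-stage 2 would request
        "offload_optimizer": {"device": "cpu"},       # 'none' disables offloading
        "offload_param": {"device": "none"},
    },
    "optimizer": {"type": "AdamW", "params": {"lr": 1e-3}},
}

model_engine, optimizer, _, _ = deepspeed.initialize(
    model=model, model_parameters=model.parameters(), config=ds_config)
```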
@@ -464,7 +481,8 @@ def eval(config):
            state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir=ckpt_dir, tag=tag)
            model_wo_ddp.load_state_dict(state_dict)
        except:
            checkpoint = torch.load(os.path.join(config.MODEL.RESUME, 'mp_rank_00_model_states.pt'), map_location='cpu')
            model_wo_ddp.load_state_dict(checkpoint['module'])
    elif config.MODEL.PRETRAINED:
        load_pretrained(config, model_wo_ddp, logger)
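Related to the README's switch from `--resume ckpt.pth` to `--resume deepspeed_ckpt_dir`: ZeRO checkpoints are sharded directories rather than single files, which is why eval() first tries DeepSpeed's fp32 consolidation helper. A short sketch of doing the same consolidation offline (the paths are placeholders):

```python
# Sketch: consolidate a sharded DeepSpeed ZeRO checkpoint directory into a plain
# fp32 state dict and save it as an ordinary PyTorch checkpoint. Paths are placeholders.
import torch
from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint

ckpt_dir = "output/internimage_t_1k_224/default"      # hypothetical deepspeed_ckpt_dir
state_dict = get_fp32_state_dict_from_zero_checkpoint(ckpt_dir)  # uses the 'latest' tag by default
torch.save(state_dict, "consolidated_fp32.pth")
```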