Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ColossalAI
Commits
2b2dc1c8
Unverified
Commit
2b2dc1c8
authored
Jun 10, 2022
by
Frank Lee
Committed by
GitHub
Jun 10, 2022
Browse files
[pipeline] refactor the pipeline module (#1087)
* [pipeline] refactor the pipeline module * polish code
parent
bad5d4c0
Changes
28
Show whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
30 additions
and
239 deletions
+30
-239
tests/test_trainer/test_pipeline/model/resnet.py
tests/test_trainer/test_pipeline/model/resnet.py
+0
-163
tests/test_trainer/test_pipeline/resnet_config.py
tests/test_trainer/test_pipeline/resnet_config.py
+0
-21
tests/test_trainer/test_pipeline/test_partition.py
tests/test_trainer/test_pipeline/test_partition.py
+0
-43
tests/test_trainer/test_pipeline/test_pipeline_schedule.py
tests/test_trainer/test_pipeline/test_pipeline_schedule.py
+26
-8
tests/test_utils/test_checkpoint/test_checkpoint_1d.py
tests/test_utils/test_checkpoint/test_checkpoint_1d.py
+1
-1
tests/test_utils/test_checkpoint/test_checkpoint_2d.py
tests/test_utils/test_checkpoint/test_checkpoint_2d.py
+1
-1
tests/test_utils/test_checkpoint/test_checkpoint_2p5d.py
tests/test_utils/test_checkpoint/test_checkpoint_2p5d.py
+1
-1
tests/test_utils/test_checkpoint/test_checkpoint_3d.py
tests/test_utils/test_checkpoint/test_checkpoint_3d.py
+1
-1
No files found.
tests/test_trainer/test_pipeline/model/resnet.py
deleted
100644 → 0
View file @
bad5d4c0
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from
typing
import
List
,
Optional
import
torch
import
torch.nn
as
nn
from
torch
import
Tensor
from
colossalai.registry
import
LAYERS
from
colossalai.registry
import
MODELS
from
colossalai.nn.model
import
ModelFromConfig
@MODELS.register_module
class VanillaResNet(ModelFromConfig):
    """ResNet from
    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_.

    This model is config-driven: ``__init__`` only assembles ``self.layers_cfg``,
    a list of per-layer config dicts (conv1 / bn1 / relu / maxpool, four residual
    stages, avgpool, flatten, linear). The actual ``self.layers`` used by
    ``forward`` is not assigned in this class — presumably it is built from
    ``layers_cfg`` by ``ModelFromConfig``; TODO confirm against the base class.

    :param num_cls: number of output classes of the final ``Linear`` layer
    :param block_type: registry name of the residual block type; its class
        attribute ``expansion`` is read via ``LAYERS.get_module(block_type)``
    :param layers: number of residual blocks in each of the four stages
    :param norm_layer_type: registry name of the normalization layer
    :param in_channels: number of input image channels
    :param groups: ``groups`` forwarded to every ResLayer config
    :param width_per_group: ``base_width`` forwarded to every ResLayer config
    :param zero_init_residual: if True, ``init_weights`` zero-initializes the
        last BN of each residual branch
    :param replace_stride_with_dilation: three flags, one per stage 2-4,
        selecting dilation instead of stride-2 downsampling
    :param dilations: dilation factor for each of the four stages
    :raises ValueError: if ``replace_stride_with_dilation`` is not None and
        does not have exactly 3 elements
    """

    def __init__(self,
                 num_cls: int,
                 block_type: str,
                 layers: List[int],
                 norm_layer_type: str = 'BatchNorm2d',
                 in_channels: int = 3,
                 groups: int = 1,
                 width_per_group: int = 64,
                 zero_init_residual: bool = False,
                 replace_stride_with_dilation: Optional[List[bool]] = None,
                 dilations=(1, 1, 1, 1)) -> None:
        super().__init__()
        self.inplanes = 64
        self.zero_init_residual = zero_init_residual
        self.blocks = layers
        # Channel expansion factor of the chosen block type, looked up from
        # the layer registry (e.g. basic vs. bottleneck blocks differ here).
        self.block_expansion = LAYERS.get_module(block_type).expansion
        self.dilations = dilations
        # Settings shared by all four residual-stage configs below.
        self.reslayer_common_cfg = dict(
            type='ResLayer',
            block_type=block_type,
            norm_layer_type=norm_layer_type,
            groups=groups,
            base_width=width_per_group)

        if replace_stride_with_dilation is None:
            # each element in the tuple indicates if we should replace
            # the 2x2 stride with a dilated convolution instead
            replace_stride_with_dilation = [False, False, False]

        if len(replace_stride_with_dilation) != 3:
            raise ValueError("replace_stride_with_dilation should be None "
                             "or a 3-element tuple, got {}".format(replace_stride_with_dilation))

        self.layers_cfg = [
            # conv1
            dict(type='Conv2d',
                 in_channels=in_channels,
                 out_channels=self.inplanes,
                 kernel_size=7,
                 stride=2,
                 padding=3,
                 bias=False),
            # bn1
            dict(type=norm_layer_type, num_features=self.inplanes),
            # relu
            dict(type='ReLU', inplace=True),
            # maxpool
            dict(type='MaxPool2d', kernel_size=3, stride=2, padding=1),
            # layer 1
            dict(inplanes=self.inplanes,
                 planes=64,
                 blocks=self.blocks[0],
                 dilation=self.dilations[0],
                 **self.reslayer_common_cfg),
            # layer 2
            dict(inplanes=64 * self.block_expansion,
                 planes=128,
                 blocks=self.blocks[1],
                 stride=2,
                 dilate=replace_stride_with_dilation[0],
                 dilation=self.dilations[1],
                 **self.reslayer_common_cfg),
            # layer 3
            # (was `layers[2]`; use self.blocks for consistency with layers
            # 1-2 — same value, since self.blocks is bound to `layers` above)
            dict(inplanes=128 * self.block_expansion,
                 planes=256,
                 blocks=self.blocks[2],
                 stride=2,
                 dilate=replace_stride_with_dilation[1],
                 dilation=self.dilations[2],
                 **self.reslayer_common_cfg),
            # layer 4
            # (was `layers[3]`; same consistency fix as layer 3)
            dict(inplanes=256 * self.block_expansion,
                 planes=512,
                 blocks=self.blocks[3],
                 stride=2,
                 dilate=replace_stride_with_dilation[2],
                 dilation=self.dilations[3],
                 **self.reslayer_common_cfg),
            # avg pool
            dict(type='AdaptiveAvgPool2d', output_size=(1, 1)),
            # flatten
            dict(type='LambdaWrapper', func=lambda mod, x: torch.flatten(x, 1)),
            # linear
            dict(type='Linear',
                 in_features=512 * self.block_expansion,
                 out_features=num_cls)
        ]

    def forward(self, x: Tensor):
        """Apply every built layer to ``x`` in sequence and return the result.

        NOTE(review): ``self.layers`` is never assigned in this class;
        presumably ``ModelFromConfig`` materializes it from ``self.layers_cfg``
        — confirm before relying on it.
        """
        for layer in self.layers:
            x = layer(x)
        return x

    def init_weights(self):
        """Initialize conv weights (Kaiming) and norm-layer affine params (1/0)."""
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # Zero-initialize the last BN in each residual branch,
        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        if self.zero_init_residual:
            for m in self.modules():
                if isinstance(m, LAYERS.get_module('ResNetBottleneck')):  # type: ignore[arg-type]
                    nn.init.constant_(m.bn3.weight, 0)
                elif isinstance(m, LAYERS.get_module('ResNetBasicBlock')):  # type: ignore[arg-type]
                    nn.init.constant_(m.bn2.weight, 0)
tests/test_trainer/test_pipeline/resnet_config.py
deleted
100644 → 0
View file @
bad5d4c0
import
os
import
model
from
pathlib
import
Path
# Hyper-parameters shared by the pipeline-parallel ResNet tests.
BATCH_SIZE = 128
IMG_SIZE = 224
DIM = 768
NUM_CLASSES = 10
NUM_ATTN_HEADS = 12
NUM_MICRO_BATCHES = 2

# resnet 18
model = {
    'type': 'VanillaResNet',
    'block_type': 'ResNetBasicBlock',
    'layers': [2, 2, 2, 2],
    'num_cls': 10,
}

# Four pipeline stages, no tensor parallelism.
parallel = {
    'pipeline': {'size': 4},
    'tensor': {'size': 1, 'mode': None},
}
tests/test_trainer/test_pipeline/test_partition.py
deleted
100644 → 0
View file @
bad5d4c0
import
os.path
as
osp
import
pytest
import
torch
import
torch.multiprocessing
as
mp
from
colossalai.builder.pipeline
import
build_pipeline_model_from_cfg
from
colossalai.core
import
global_context
from
colossalai.initialize
import
launch
from
colossalai.logging
import
get_dist_logger
from
functools
import
partial
from
colossalai.utils
import
free_port
from
colossalai.testing
import
rerun_on_exception
# The ResNet pipeline config lives next to this test module; resolve it
# relative to this file's real (symlink-free) location.
_this_file = osp.realpath(__file__)
DIR_PATH = osp.dirname(_this_file)
CONFIG_PATH = osp.join(DIR_PATH, 'resnet_config.py')
def run_partition(rank, world_size, port):
    """Worker entry point run in each spawned process.

    Initializes the distributed environment from the resnet pipeline config,
    builds this rank's portion of the pipeline model, then tears everything
    down so later tests can re-initialize cleanly.

    :param rank: rank of this process (supplied by ``mp.spawn``)
    :param world_size: total number of processes in the test
    :param port: free TCP port for the 'localhost' rendezvous
    """
    launch(config=CONFIG_PATH,
           rank=rank,
           world_size=world_size,
           host='localhost',
           port=port,
           backend='nccl')
    logger = get_dist_logger()
    logger.info('finished initialization')

    # build model
    # NOTE(review): the positional `1` is presumably the number of model
    # chunks per pipeline stage — confirm against build_pipeline_model_from_cfg.
    model = build_pipeline_model_from_cfg(global_context.config.model, 1, verbose=True)
    assert isinstance(model, torch.nn.Module)
    logger.info('model is created')

    # Destroy process groups / global state before the process exits.
    global_context.destroy()
    logger.info('training finished')
    torch.cuda.empty_cache()
@pytest.mark.dist
@rerun_on_exception(exception_type=mp.ProcessRaisedException, pattern=".*Address already in use.*")
def test_partition():
    """Spawn 4 NCCL worker processes that each build their pipeline stage of
    the configured ResNet; retried when the rendezvous port is already taken."""
    nprocs = 4
    worker = partial(run_partition, world_size=nprocs, port=free_port())
    mp.spawn(worker, nprocs=nprocs)
# Allow running this distributed test directly, outside of pytest.
if __name__ == '__main__':
    test_partition()
tests/test_trainer/test_pipeline/test_pipeline_schedule.py
View file @
2b2dc1c8
...
...
@@ -8,27 +8,45 @@ from pathlib import Path
import
colossalai
import
pytest
import
torch
import
torch.nn
as
nn
import
torch.multiprocessing
as
mp
from
colossalai.builder
import
build_pipeline_model_from_cfg
from
colossalai.core
import
global_context
as
gpc
from
colossalai.
engine.schedule
import
PipelineSchedul
e
from
colossalai.
context
import
ParallelMod
e
from
colossalai.initialize
import
launch
from
colossalai.utils
import
free_port
,
get_dataloader
,
print_rank_0
from
colossalai.testing
import
rerun_on_exception
from
torchvision
import
transforms
from
torchvision.datasets
import
CIFAR10
from
torchvision.models
import
resnet18
BATCH_SIZE
=
4
DIR_PATH
=
osp
.
dirname
(
osp
.
realpath
(
__file__
))
CONFIG_PATH
=
osp
.
join
(
DIR_PATH
,
'./resnet_config.py'
)
BATCH_SIZE
=
8
CONFIG
=
dict
(
NUM_MICRO_BATCHES
=
2
,
parallel
=
dict
(
pipeline
=
dict
(
size
=
2
),
tensor
=
dict
(
size
=
1
,
mode
=
None
)
)
)
def
run_schedule
(
rank
,
world_size
,
port
):
launch
(
config
=
CONFIG
_PATH
,
rank
=
rank
,
world_size
=
world_size
,
host
=
'localhost'
,
port
=
port
,
backend
=
'nccl'
)
launch
(
config
=
CONFIG
,
rank
=
rank
,
world_size
=
world_size
,
host
=
'localhost'
,
port
=
port
,
backend
=
'nccl'
)
# build model
model
=
build_pipeline_model_from_cfg
(
gpc
.
config
.
model
,
1
)
model
=
resnet18
(
num_classes
=
10
)
if
gpc
.
get_local_rank
(
ParallelMode
.
PIPELINE
)
==
0
:
model
=
nn
.
Sequential
(
model
.
conv1
,
model
.
bn1
,
model
.
relu
,
model
.
maxpool
,
model
.
layer1
,
model
.
layer2
)
elif
gpc
.
get_local_rank
(
ParallelMode
.
PIPELINE
)
==
1
:
class
Flatten
(
nn
.
Module
):
def
forward
(
self
,
x
):
return
torch
.
flatten
(
x
,
1
)
model
=
nn
.
Sequential
(
model
.
layer3
,
model
.
layer4
,
model
.
avgpool
,
Flatten
(),
model
.
fc
)
print_rank_0
(
'model is created'
)
train_dataset
=
CIFAR10
(
root
=
Path
(
os
.
environ
[
'DATA'
]),
...
...
@@ -69,7 +87,7 @@ def run_schedule(rank, world_size, port):
@
pytest
.
mark
.
dist
@
rerun_on_exception
(
exception_type
=
mp
.
ProcessRaisedException
,
pattern
=
".*Address already in use.*"
)
def
test_pipeline_schedule
():
world_size
=
4
world_size
=
2
run_func
=
partial
(
run_schedule
,
world_size
=
world_size
,
port
=
free_port
())
mp
.
spawn
(
run_func
,
nprocs
=
world_size
)
...
...
tests/test_utils/test_checkpoint/test_checkpoint_1d.py
View file @
2b2dc1c8
...
...
@@ -19,7 +19,7 @@ from colossalai.testing import rerun_on_exception, skip_if_not_enough_gpus
def
build_pipeline
(
model
):
from
colossalai.
builder.
pipeline
import
partition_uniform
from
colossalai.pipeline
.utils
import
partition_uniform
pipeline_size
=
gpc
.
get_world_size
(
ParallelMode
.
PIPELINE
)
pipeline_rank
=
gpc
.
get_local_rank
(
ParallelMode
.
PIPELINE
)
...
...
tests/test_utils/test_checkpoint/test_checkpoint_2d.py
View file @
2b2dc1c8
...
...
@@ -19,7 +19,7 @@ from colossalai.testing import rerun_on_exception, skip_if_not_enough_gpus
def
build_pipeline
(
model
):
from
colossalai.
builder.
pipeline
import
partition_uniform
from
colossalai.pipeline
.utils
import
partition_uniform
pipeline_size
=
gpc
.
get_world_size
(
ParallelMode
.
PIPELINE
)
pipeline_rank
=
gpc
.
get_local_rank
(
ParallelMode
.
PIPELINE
)
...
...
tests/test_utils/test_checkpoint/test_checkpoint_2p5d.py
View file @
2b2dc1c8
...
...
@@ -19,7 +19,7 @@ from colossalai.testing import rerun_on_exception, skip_if_not_enough_gpus
def
build_pipeline
(
model
):
from
colossalai.
builder.
pipeline
import
partition_uniform
from
colossalai.pipeline
.utils
import
partition_uniform
pipeline_size
=
gpc
.
get_world_size
(
ParallelMode
.
PIPELINE
)
pipeline_rank
=
gpc
.
get_local_rank
(
ParallelMode
.
PIPELINE
)
...
...
tests/test_utils/test_checkpoint/test_checkpoint_3d.py
View file @
2b2dc1c8
...
...
@@ -19,7 +19,7 @@ from colossalai.testing import rerun_on_exception, skip_if_not_enough_gpus
def
build_pipeline
(
model
):
from
colossalai.
builder.
pipeline
import
partition_uniform
from
colossalai.pipeline
.utils
import
partition_uniform
pipeline_size
=
gpc
.
get_world_size
(
ParallelMode
.
PIPELINE
)
pipeline_rank
=
gpc
.
get_local_rank
(
ParallelMode
.
PIPELINE
)
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment