Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ColossalAI
Commits
526a3180
Commit
526a3180
authored
Mar 11, 2022
by
Frank Lee
Browse files
[unit test] Refactored test cases with component func (#339)
* refactored test with component func * fixed bug
parent
de464504
Changes
11
Hide whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
163 additions
and
435 deletions
+163
-435
tests/components_to_test/nested_model.py
tests/components_to_test/nested_model.py
+1
-1
tests/components_to_test/utils/dummy_data_generator.py
tests/components_to_test/utils/dummy_data_generator.py
+12
-1
tests/test_trainer/test_trainer_with_non_pipe_schedule.py
tests/test_trainer/test_trainer_with_non_pipe_schedule.py
+28
-63
tests/test_utils/test_zero_gradient_clippling.py
tests/test_utils/test_zero_gradient_clippling.py
+6
-15
tests/test_zero_data_parallel/common.py
tests/test_zero_data_parallel/common.py
+0
-17
tests/test_zero_data_parallel/test_shard_param.py
tests/test_zero_data_parallel/test_shard_param.py
+18
-16
tests/test_zero_data_parallel/test_sharded_optim.py
tests/test_zero_data_parallel/test_sharded_optim.py
+94
-104
tests/test_zero_data_parallel/test_sharded_optim_with_sync_bn.py
...est_zero_data_parallel/test_sharded_optim_with_sync_bn.py
+2
-0
tests/test_zero_data_parallel/test_state_dict.py
tests/test_zero_data_parallel/test_state_dict.py
+2
-2
tests/test_zero_data_parallel/test_zero_dev_3.py
tests/test_zero_data_parallel/test_zero_dev_3.py
+0
-119
tests/test_zero_data_parallel/test_zero_dev_3_mp4.py
tests/test_zero_data_parallel/test_zero_dev_3_mp4.py
+0
-97
No files found.
tests/components_to_test/nested_model.py
View file @
526a3180
...
@@ -43,7 +43,7 @@ class DummyDataLoader(DummyDataGenerator):
...
@@ -43,7 +43,7 @@ class DummyDataLoader(DummyDataGenerator):
@
non_distributed_component_funcs
.
register
(
name
=
'nested_model'
)
@
non_distributed_component_funcs
.
register
(
name
=
'nested_model'
)
def
get_training_components
():
def
get_training_components
():
def
model_builder
(
checkpoint
):
def
model_builder
(
checkpoint
=
True
):
return
NestedNet
(
checkpoint
)
return
NestedNet
(
checkpoint
)
trainloader
=
DummyDataLoader
()
trainloader
=
DummyDataLoader
()
...
...
tests/components_to_test/utils/dummy_data_generator.py
View file @
526a3180
...
@@ -3,12 +3,23 @@ from abc import ABC, abstractmethod
...
@@ -3,12 +3,23 @@ from abc import ABC, abstractmethod
class
DummyDataGenerator
(
ABC
):
class
DummyDataGenerator
(
ABC
):
def
__init__
(
self
,
length
=
10
):
self
.
length
=
length
@
abstractmethod
@
abstractmethod
def
generate
(
self
):
def
generate
(
self
):
pass
pass
def
__iter__
(
self
):
def
__iter__
(
self
):
self
.
step
=
0
return
self
return
self
def
__next__
(
self
):
def
__next__
(
self
):
return
self
.
generate
()
if
self
.
step
<
self
.
length
:
self
.
step
+=
1
return
self
.
generate
()
else
:
raise
StopIteration
def
__len__
(
self
):
return
self
.
length
tests/test_trainer/test_trainer_with_non_pipe_schedule.py
View file @
526a3180
import
os
from
functools
import
partial
from
functools
import
partial
from
pathlib
import
Path
import
colossalai
import
colossalai
import
pytest
import
pytest
import
torch
import
torch
import
torch.multiprocessing
as
mp
import
torch.multiprocessing
as
mp
import
torch.nn
as
nn
from
colossalai.amp.amp_type
import
AMP_TYPE
from
colossalai.amp.amp_type
import
AMP_TYPE
from
colossalai.core
import
global_context
as
gpc
from
colossalai.logging
import
get_dist_logger
from
colossalai.logging
import
get_dist_logger
from
colossalai.trainer
import
Trainer
from
colossalai.trainer
import
Trainer
from
colossalai.utils
import
MultiTimer
,
free_port
,
get_dataloader
from
colossalai.utils
import
MultiTimer
,
free_port
from
torch.optim
import
Adam
from
tests.components_to_test.registry
import
non_distributed_component_funcs
from
torchvision
import
transforms
from
torchvision.datasets
import
CIFAR10
from
torchvision.models
import
resnet18
BATCH_SIZE
=
16
BATCH_SIZE
=
16
IMG_SIZE
=
32
IMG_SIZE
=
32
...
@@ -29,60 +22,32 @@ CONFIG = dict(
...
@@ -29,60 +22,32 @@ CONFIG = dict(
def
run_trainer_no_pipeline
(
rank
,
world_size
,
port
):
def
run_trainer_no_pipeline
(
rank
,
world_size
,
port
):
colossalai
.
launch
(
config
=
CONFIG
,
rank
=
rank
,
world_size
=
world_size
,
host
=
'localhost'
,
port
=
port
,
backend
=
'nccl'
)
colossalai
.
launch
(
config
=
CONFIG
,
rank
=
rank
,
world_size
=
world_size
,
host
=
'localhost'
,
port
=
port
,
backend
=
'nccl'
)
# build model
test_models
=
[
'repeated_computed_layers'
,
'resnet18'
,
'nested_model'
]
model
=
resnet18
(
num_classes
=
10
)
for
name
in
test_models
:
get_components_func
=
non_distributed_component_funcs
.
get_callable
(
name
)
# build dataloaders
model_builder
,
train_dataloader
,
test_dataloader
,
optimizer_builder
,
criterion
=
get_components_func
()
train_dataset
=
CIFAR10
(
root
=
Path
(
os
.
environ
[
'DATA'
]),
model
=
model_builder
()
download
=
True
,
optimizer
=
optimizer_builder
(
model
)
transform
=
transforms
.
Compose
([
engine
,
train_dataloader
,
*
_
=
colossalai
.
initialize
(
model
=
model
,
transforms
.
Resize
(
size
=
(
IMG_SIZE
,
IMG_SIZE
)),
optimizer
=
optimizer
,
transforms
.
ToTensor
(),
criterion
=
criterion
,
transforms
.
Normalize
(
mean
=
(
0.5
,
0.5
,
0.5
),
std
=
(
0.5
,
0.5
,
0.5
))
train_dataloader
=
train_dataloader
)
]))
logger
=
get_dist_logger
()
test_dataset
=
CIFAR10
(
root
=
Path
(
os
.
environ
[
'DATA'
]),
logger
.
info
(
"engine is built"
,
ranks
=
[
0
])
train
=
False
,
download
=
True
,
timer
=
MultiTimer
()
transform
=
transforms
.
Compose
([
trainer
=
Trainer
(
engine
=
engine
,
logger
=
logger
,
timer
=
timer
)
transforms
.
Resize
(
size
=
(
IMG_SIZE
,
IMG_SIZE
)),
logger
.
info
(
"trainer is built"
,
ranks
=
[
0
])
transforms
.
ToTensor
(),
transforms
.
Normalize
(
mean
=
(
0.5
,
0.5
,
0.5
),
std
=
(
0.5
,
0.5
,
0.5
))
logger
.
info
(
"start training"
,
ranks
=
[
0
])
]))
trainer
.
fit
(
train_dataloader
=
train_dataloader
,
test_dataloader
=
test_dataloader
,
train_dataloader
=
get_dataloader
(
dataset
=
train_dataset
,
epochs
=
NUM_EPOCHS
,
shuffle
=
True
,
max_steps
=
5
,
batch_size
=
BATCH_SIZE
,
display_progress
=
True
,
pin_memory
=
True
,
test_interval
=
5
)
drop_last
=
True
)
torch
.
cuda
.
empty_cache
()
test_dataloader
=
get_dataloader
(
dataset
=
test_dataset
,
batch_size
=
BATCH_SIZE
,
pin_memory
=
True
,
drop_last
=
True
)
# build optimizer
optimizer
=
Adam
(
model
.
parameters
(),
lr
=
0.001
)
criterion
=
nn
.
CrossEntropyLoss
()
engine
,
train_dataloader
,
*
args
=
colossalai
.
initialize
(
model
=
model
,
optimizer
=
optimizer
,
criterion
=
criterion
,
train_dataloader
=
train_dataloader
)
logger
=
get_dist_logger
()
logger
.
info
(
"engine is built"
,
ranks
=
[
0
])
timer
=
MultiTimer
()
trainer
=
Trainer
(
engine
=
engine
,
logger
=
logger
,
timer
=
timer
)
logger
.
info
(
"trainer is built"
,
ranks
=
[
0
])
logger
.
info
(
"start training"
,
ranks
=
[
0
])
trainer
.
fit
(
train_dataloader
=
train_dataloader
,
test_dataloader
=
test_dataloader
,
epochs
=
NUM_EPOCHS
,
max_steps
=
100
,
display_progress
=
True
,
test_interval
=
5
)
gpc
.
destroy
()
torch
.
cuda
.
empty_cache
()
@
pytest
.
mark
.
dist
@
pytest
.
mark
.
dist
...
...
tests/test_utils/test_zero_gradient_clippling.py
View file @
526a3180
...
@@ -20,6 +20,7 @@ from torch.nn.utils import clip_grad_norm_
...
@@ -20,6 +20,7 @@ from torch.nn.utils import clip_grad_norm_
class
Enumerator
:
class
Enumerator
:
def
__init__
(
self
,
arg_names
:
List
[
str
],
arg_values
:
List
[
tuple
])
->
None
:
def
__init__
(
self
,
arg_names
:
List
[
str
],
arg_values
:
List
[
tuple
])
->
None
:
self
.
arg_names
=
arg_names
self
.
arg_names
=
arg_names
self
.
enums
=
Enumerator
.
all_enumerate
(
arg_values
)
self
.
enums
=
Enumerator
.
all_enumerate
(
arg_values
)
...
@@ -49,11 +50,12 @@ class Enumerator:
...
@@ -49,11 +50,12 @@ class Enumerator:
def
checkpoint_wrapper
(
module
,
enable
=
True
):
def
checkpoint_wrapper
(
module
,
enable
=
True
):
if
enable
:
if
enable
:
module
.
forward
=
partial
(
checkpoint
,
module
.
forward
)
module
.
forward
=
partial
(
checkpoint
,
module
.
forward
,
False
)
return
module
return
module
class
Net
(
nn
.
Module
):
class
Net
(
nn
.
Module
):
def
__init__
(
self
,
checkpoint
=
False
)
->
None
:
def
__init__
(
self
,
checkpoint
=
False
)
->
None
:
super
().
__init__
()
super
().
__init__
()
self
.
fc1
=
nn
.
Linear
(
5
,
5
)
self
.
fc1
=
nn
.
Linear
(
5
,
5
)
...
@@ -61,13 +63,7 @@ class Net(nn.Module):
...
@@ -61,13 +63,7 @@ class Net(nn.Module):
self
.
fc3
=
nn
.
Linear
(
5
,
1
)
self
.
fc3
=
nn
.
Linear
(
5
,
1
)
if
checkpoint
:
if
checkpoint
:
self
.
fc1
=
checkpoint_wrapper
(
self
.
fc1
)
self
.
fc1
=
checkpoint_wrapper
(
self
.
fc1
)
self
.
layers
=
[
self
.
layers
=
[
self
.
fc1
,
self
.
fc2
,
self
.
fc1
,
self
.
fc2
,
self
.
fc3
]
self
.
fc1
,
self
.
fc2
,
self
.
fc1
,
self
.
fc2
,
self
.
fc3
]
def
forward
(
self
,
x
):
def
forward
(
self
,
x
):
for
layer
in
self
.
layers
:
for
layer
in
self
.
layers
:
...
@@ -158,12 +154,7 @@ def check_config(checkpoint=False, fp16=False, offload=False, norm_type=2.0):
...
@@ -158,12 +154,7 @@ def check_config(checkpoint=False, fp16=False, offload=False, norm_type=2.0):
def
run_dist
(
rank
,
world_size
,
port
):
def
run_dist
(
rank
,
world_size
,
port
):
disable_existing_loggers
()
disable_existing_loggers
()
colossalai
.
launch
(
config
=
{},
colossalai
.
launch
(
config
=
{},
rank
=
rank
,
world_size
=
world_size
,
host
=
'localhost'
,
port
=
port
,
backend
=
'nccl'
)
rank
=
rank
,
world_size
=
world_size
,
host
=
'localhost'
,
port
=
port
,
backend
=
'nccl'
)
args
=
[
'checkpoint'
,
'fp16'
,
'offload'
,
'norm_type'
]
args
=
[
'checkpoint'
,
'fp16'
,
'offload'
,
'norm_type'
]
arg_values
=
[(
False
,
True
),
(
False
,
True
),
(
False
,
True
),
(
1.0
,
2.0
,
float
(
'inf'
))]
arg_values
=
[(
False
,
True
),
(
False
,
True
),
(
False
,
True
),
(
1.0
,
2.0
,
float
(
'inf'
))]
...
@@ -176,7 +167,7 @@ def run_dist(rank, world_size, port):
...
@@ -176,7 +167,7 @@ def run_dist(rank, world_size, port):
check_config
()
check_config
()
@
pytest
.
mark
.
dist
@
pytest
.
mark
.
dist
def
test_zero_clip_grad
():
def
test_zero_clip_grad
():
world_size
=
4
world_size
=
4
run_func
=
partial
(
run_dist
,
world_size
=
world_size
,
port
=
free_port
())
run_func
=
partial
(
run_dist
,
world_size
=
world_size
,
port
=
free_port
())
...
...
tests/test_zero_data_parallel/common.py
View file @
526a3180
...
@@ -43,23 +43,6 @@ def checkpoint_wrapper(module, enable=True):
...
@@ -43,23 +43,6 @@ def checkpoint_wrapper(module, enable=True):
return
module
return
module
class
Net
(
nn
.
Module
):
def
__init__
(
self
,
checkpoint
=
False
)
->
None
:
super
().
__init__
()
self
.
fc1
=
nn
.
Linear
(
5
,
5
)
self
.
fc2
=
nn
.
Linear
(
5
,
5
)
self
.
fc3
=
nn
.
Linear
(
5
,
1
)
if
checkpoint
:
self
.
fc1
=
checkpoint_wrapper
(
self
.
fc1
)
self
.
layers
=
[
self
.
fc1
,
self
.
fc2
,
self
.
fc1
,
self
.
fc2
,
self
.
fc3
]
def
forward
(
self
,
x
):
for
layer
in
self
.
layers
:
x
=
layer
(
x
)
return
x
def
allclose
(
tensor_a
:
torch
.
Tensor
,
tensor_b
:
torch
.
Tensor
,
loose
=
False
)
->
bool
:
def
allclose
(
tensor_a
:
torch
.
Tensor
,
tensor_b
:
torch
.
Tensor
,
loose
=
False
)
->
bool
:
if
loose
:
if
loose
:
return
torch
.
allclose
(
tensor_a
,
tensor_b
,
atol
=
1e-2
,
rtol
=
1e-3
)
return
torch
.
allclose
(
tensor_a
,
tensor_b
,
atol
=
1e-2
,
rtol
=
1e-3
)
...
...
tests/test_zero_data_parallel/test_shard_param.py
View file @
526a3180
...
@@ -13,7 +13,8 @@ from colossalai.utils import free_port
...
@@ -13,7 +13,8 @@ from colossalai.utils import free_port
from
colossalai.zero.shard_utils
import
TensorShardStrategy
from
colossalai.zero.shard_utils
import
TensorShardStrategy
from
colossalai.zero.sharded_param
import
ShardedParam
,
ShardedTensor
from
colossalai.zero.sharded_param
import
ShardedParam
,
ShardedTensor
from
colossalai.zero.sharded_param.sharded_param
import
ShardedParamV2
from
colossalai.zero.sharded_param.sharded_param
import
ShardedParamV2
from
tests.test_zero_data_parallel.common
import
CONFIG
,
Net
,
allclose
from
tests.test_zero_data_parallel.common
import
CONFIG
,
allclose
from
tests.components_to_test.registry
import
non_distributed_component_funcs
def
_run_shard_tensor
(
rank
,
world_size
,
port
):
def
_run_shard_tensor
(
rank
,
world_size
,
port
):
...
@@ -68,21 +69,22 @@ def _run_test_shard_param(rank, world_size, port):
...
@@ -68,21 +69,22 @@ def _run_test_shard_param(rank, world_size, port):
print
(
param_ref
.
data
)
print
(
param_ref
.
data
)
logger
=
get_dist_logger
()
logger
=
get_dist_logger
()
model
=
Net
()
for
get_components_func
in
non_distributed_component_funcs
:
model_builder
,
*
_
=
get_components_func
()
# add an attribute as ca_attr to hijack the access to param.data
model
=
model_builder
(
checkpoint
=
True
)
for
_
,
param
in
model
.
named_parameters
():
# add an attribute as col_attr to hijack the access to param.data
numel_ref
=
(
param
.
numel
()
+
world_size
-
1
)
//
world_size
for
_
,
param
in
model
.
named_parameters
():
param
.
ca_attr
=
ShardedParam
(
param
)
numel_ref
=
(
param
.
numel
()
+
world_size
-
1
)
//
world_size
param
.
ca_attr
.
shard
()
param
.
col_attr
=
ShardedParam
(
param
)
param_data
=
param
.
ca_attr
.
payload
(
torch
.
device
(
'cpu'
))
param
.
col_attr
.
shard
()
assert
(
numel_ref
==
param_data
.
numel
())
param_data
=
param
.
col_attr
.
payload
(
torch
.
device
(
'cpu'
))
assert
(
numel_ref
==
param_data
.
numel
())
for
_
,
param
in
model
.
named_parameters
():
param
.
ca_attr
.
gather
()
for
_
,
param
in
model
.
named_parameters
():
param_data
=
param
.
ca_attr
.
payload
(
torch
.
device
(
'cpu'
))
param
.
col_attr
.
gather
()
param_data
=
param
.
col_attr
.
payload
(
torch
.
device
(
'cpu'
))
disable_existing_loggers
([
logger
])
disable_existing_loggers
([
logger
])
@
pytest
.
mark
.
dist
@
pytest
.
mark
.
dist
...
...
tests/test_zero_data_parallel/test_sharded_optim.py
View file @
526a3180
...
@@ -3,19 +3,13 @@ import colossalai
...
@@ -3,19 +3,13 @@ import colossalai
import
copy
import
copy
import
pytest
import
pytest
import
torch.multiprocessing
as
mp
import
torch.multiprocessing
as
mp
import
torch.nn
as
nn
from
colossalai.zero
import
ShardedOptimizer
from
colossalai.zero
import
ShardedOptimizer
from
torch.nn.parallel
import
DistributedDataParallel
as
DDP
from
torch.nn.parallel
import
DistributedDataParallel
as
DDP
from
colossalai.utils
import
free_port
from
colossalai.utils
import
free_port
from
functools
import
partial
from
functools
import
partial
from
common
import
allclose
from
tests.components_to_test.registry
import
non_distributed_component_funcs
def
check_equal
(
a
,
b
):
"""
This function checks if two tensors are equal within tolerance
"""
assert
torch
.
allclose
(
a
.
float
(),
b
.
float
(),
rtol
=
1e-4
,
atol
=
1e-3
),
f
'a =
{
a
}
, b =
{
b
}
'
def
check_completely_equal
(
a
,
b
):
def
check_completely_equal
(
a
,
b
):
...
@@ -36,61 +30,56 @@ def check_sharded_param_consistency():
...
@@ -36,61 +30,56 @@ def check_sharded_param_consistency():
pg: partition gradients and optimizer states
pg: partition gradients and optimizer states
"""
"""
test_models
=
[
'repeated_computed_layers'
,
'resnet18'
,
'nested_model'
]
# create layers
oss_linear1
=
nn
.
Linear
(
128
,
256
)
for
name
in
test_models
:
oss_linear2
=
nn
.
Linear
(
256
,
512
)
get_components_func
=
non_distributed_component_funcs
.
get_callable
(
name
)
model_builder
,
train_dataloader
,
*
_
=
get_components_func
()
# create model
oss_model
=
nn
.
Sequential
(
oss_linear1
,
oss_linear2
)
# create model
pg_model
=
copy
.
deepcopy
(
oss_model
)
oss_model
=
model_builder
(
checkpoint
=
True
).
cuda
().
half
()
pg_model
=
copy
.
deepcopy
(
oss_model
)
oss_model
=
oss_model
.
cuda
().
half
()
pg_model
=
pg_model
.
cuda
().
half
()
# create optimizer
oss_optimizer
=
torch
.
optim
.
Adam
(
oss_model
.
parameters
(),
lr
=
0.001
)
# create optimizer
pg_optimizer
=
torch
.
optim
.
Adam
(
pg_model
.
parameters
(),
lr
=
0.001
)
oss_optimizer
=
torch
.
optim
.
Adam
(
oss_model
.
parameters
(),
lr
=
0.001
)
oss_optimizer
=
ShardedOptimizer
(
oss_optimizer
,
overlap_communication
=
True
,
initial_scale
=
1
,
clip_grad_norm
=
0.0
)
pg_optimizer
=
torch
.
optim
.
Adam
(
pg_model
.
parameters
(),
lr
=
0.001
)
pg_optimizer
=
ShardedOptimizer
(
pg_optimizer
,
oss_optimizer
=
ShardedOptimizer
(
oss_optimizer
,
overlap_communication
=
True
,
initial_scale
=
1
,
clip_grad_norm
=
0.0
)
overlap_communication
=
True
,
pg_optimizer
=
ShardedOptimizer
(
pg_optimizer
,
partition_grad
=
True
,
overlap_communication
=
True
,
initial_scale
=
1
,
partition_grad
=
True
,
clip_grad_norm
=
0.0
)
initial_scale
=
1
,
clip_grad_norm
=
0.0
)
# create
data
,
label
=
next
(
iter
(
train_dataloader
))
# create
input_data
=
data
.
cuda
().
half
()
input_data
=
torch
.
rand
(
32
,
128
).
cuda
().
half
()
# forward
# forward
oss_output
=
oss_model
(
input_data
)
oss_output
=
oss_model
(
input_data
)
pg_output
=
pg_model
(
input_data
)
pg_output
=
pg_model
(
input_data
)
check_completely_equal
(
oss_output
,
pg_output
)
check_completely_equal
(
oss_output
,
pg_output
)
# backward
# backward
oss_optimizer
.
backward
(
oss_output
.
mean
().
float
())
oss_optimizer
.
backward
(
oss_output
.
mean
().
float
())
pg_optimizer
.
backward
(
pg_output
.
mean
().
float
())
pg_optimizer
.
backward
(
pg_output
.
mean
().
float
())
# check grad
# check grad
# as this param is small, the backward reduction
# as this param is small, the backward reduction
# will not be fired
# will not be fired
for
oss_param
,
pg_param
in
zip
(
oss_model
.
parameters
(),
pg_model
.
parameters
()):
oss_linear1_grad
=
oss_model
[
0
].
weight
.
grad
check_completely_equal
(
oss_param
.
grad
,
pg_param
.
grad
)
oss_linear2_grad
=
oss_model
[
1
].
weight
.
grad
pg_linear1_grad
=
pg_model
[
0
].
weight
.
grad
# step
pg_linear2_grad
=
pg_model
[
1
].
weight
.
grad
oss_optimizer
.
sync_grad
()
check_completely_equal
(
oss_linear1_grad
,
pg_linear1_grad
)
pg_optimizer
.
sync_grad
()
check_completely_equal
(
oss_linear2_grad
,
pg_linear2_grad
)
# step
# step
oss_optimizer
.
step
()
oss_optimizer
.
sync_grad
()
pg_optimizer
.
step
()
pg_optimizer
.
sync_grad
()
# check updated param
# step
for
oss_param
,
pg_param
in
zip
(
oss_model
.
parameters
(),
pg_model
.
parameters
()):
oss_optimizer
.
step
()
check_completely_equal
(
oss_param
,
pg_param
)
pg_optimizer
.
step
()
# check updated param
check_completely_equal
(
oss_model
[
0
].
weight
,
pg_model
[
0
].
weight
)
check_completely_equal
(
oss_model
[
1
].
weight
,
pg_model
[
1
].
weight
)
def
check_sharded_optim_against_torch_ddp
():
def
check_sharded_optim_against_torch_ddp
():
...
@@ -103,61 +92,62 @@ def check_sharded_optim_against_torch_ddp():
...
@@ -103,61 +92,62 @@ def check_sharded_optim_against_torch_ddp():
differences in model output and updated parameters are within tolerance.
differences in model output and updated parameters are within tolerance.
"""
"""
# create layer
test_models
=
[
'repeated_computed_layers'
,
'resnet18'
,
'nested_model'
]
zero_linear1
=
nn
.
Linear
(
128
,
256
)
zero_linear2
=
nn
.
Linear
(
256
,
512
)
# create
model
for
name
in
test_
model
s
:
zero_model
=
nn
.
Sequential
(
zero_linear1
,
zero_linear2
)
get_components_func
=
non_distributed_component_funcs
.
get_callable
(
name
)
torch_model
=
copy
.
deepcopy
(
zero_model
)
model_builder
,
train_dataloader
,
*
_
=
get_components_func
(
)
zero_model
=
zero_model
.
cuda
().
half
()
# create model
torch_model
=
DDP
(
torch_model
.
cuda
())
zero_model
=
model_builder
(
checkpoint
=
True
).
cuda
()
torch_model
=
copy
.
deepcopy
(
zero_model
)
# create optimizer
zero_model
=
zero_model
.
half
()
zero_optimizer
=
torch
.
optim
.
Adam
(
zero_model
.
parameters
(),
lr
=
0.001
)
torch_model
=
DDP
(
torch_model
.
cuda
()
)
# we only test stage 1 here
# create optimizer
# in `check_sharded_param_consistency.py`, we will test whether
zero_optimizer
=
torch
.
optim
.
Adam
(
zero_model
.
parameters
(),
lr
=
0.001
)
# level 1 and 2 will produce exactly the same results
zero_optimizer
=
ShardedOptimizer
(
zero_optimizer
,
overlap_communication
=
True
,
initial_scale
=
1
,
clip_grad_norm
=
0.0
)
torch_optimizer
=
torch
.
optim
.
Adam
(
torch_model
.
parameters
(),
lr
=
0.001
)
# we only test stage 1 here
# in `check_sharded_param_consistency.py`, we will test whether
# level 1 and 2 will produce exactly the same results
zero_optimizer
=
ShardedOptimizer
(
zero_optimizer
,
overlap_communication
=
True
,
initial_scale
=
1
,
clip_grad_norm
=
0.0
)
torch_optimizer
=
torch
.
optim
.
Adam
(
torch_model
.
parameters
(),
lr
=
0.001
)
# create
# create
input_data
=
torch
.
rand
(
32
,
128
).
cuda
()
input_data
,
_
=
next
(
iter
(
train_dataloader
))
input_data
=
input_data
.
cuda
()
# zero-dp forward
# zero-dp forward
zero_output
=
zero_model
(
input_data
.
half
())
zero_output
=
zero_model
(
input_data
.
half
())
# torch-ddp forward
# torch-ddp forward
torch_output
=
torch_model
(
input_data
)
torch_output
=
torch_model
(
input_data
)
check_equal
(
zero_output
,
torch_output
)
allclose
(
zero_output
,
torch_output
.
half
()
)
# zero-dp backward
# zero-dp backward
zero_optimizer
.
backward
(
zero_output
.
mean
().
float
())
zero_optimizer
.
backward
(
zero_output
.
mean
().
float
())
# torch-ddp backward
# torch-ddp backward
torch_output
.
mean
().
backward
()
torch_output
.
mean
().
backward
()
# check grad
# check grad
zero_linear1_grad
=
zero_model
[
0
].
weight
.
grad
for
oss_param
,
torch_param
in
zip
(
zero_model
.
parameters
(),
torch_model
.
parameters
()):
zero_linear2_grad
=
zero_model
[
1
].
weight
.
grad
allclose
(
oss_param
.
grad
,
torch_param
.
grad
.
half
())
torch_linear1_grad
=
torch_model
.
module
[
0
].
weight
.
grad
torch_linear2_grad
=
torch_model
.
module
[
1
].
weight
.
grad
check_equal
(
zero_linear1_grad
,
torch_linear1_grad
)
check_equal
(
zero_linear2_grad
,
torch_linear2_grad
)
# zero-dp step
# zero-dp step
zero_optimizer
.
sync_grad
()
zero_optimizer
.
sync_grad
()
zero_optimizer
.
step
()
zero_optimizer
.
step
()
# torch ddp step
# torch ddp step
torch_optimizer
.
step
()
torch_optimizer
.
step
()
# check updated param
# check updated param
check_equal
(
zero_model
[
0
].
weight
,
torch_model
.
module
[
0
].
weight
)
for
oss_param
,
torch_param
in
zip
(
zero_model
.
parameters
(),
torch_model
.
parameters
()):
check_equal
(
zero_model
[
1
].
weight
,
torch_model
.
module
[
1
].
weight
)
allclose
(
oss_param
,
torch_param
.
half
()
)
def
run_dist
(
rank
,
world_size
,
port
):
def
run_dist
(
rank
,
world_size
,
port
):
...
...
tests/test_zero_data_parallel/test_sharded_optim_with_sync_bn.py
View file @
526a3180
...
@@ -15,6 +15,8 @@ import torch.distributed as dist
...
@@ -15,6 +15,8 @@ import torch.distributed as dist
def
run_dist
(
rank
,
world_size
,
port
):
def
run_dist
(
rank
,
world_size
,
port
):
# this test only runs on resnet18
# as this model has sync batch normalization
# need to configure cudnn deterministic so that
# need to configure cudnn deterministic so that
# randomness of convolution layers will be disabled
# randomness of convolution layers will be disabled
colossalai
.
launch
(
config
=
dict
(
zero
=
dict
(
level
=
2
,
partition_grad
=
True
),
colossalai
.
launch
(
config
=
dict
(
zero
=
dict
(
level
=
2
,
partition_grad
=
True
),
...
...
tests/test_zero_data_parallel/test_state_dict.py
View file @
526a3180
...
@@ -22,8 +22,8 @@ def run_dist(rank, world_size, port):
...
@@ -22,8 +22,8 @@ def run_dist(rank, world_size, port):
test_models
=
[
'repeated_computed_layers'
,
'resnet18'
]
test_models
=
[
'repeated_computed_layers'
,
'resnet18'
]
for
model_name
in
test_models
:
for
model_name
in
test_models
:
get_components_func
=
non_distributed_component_funcs
.
get_callable
(
model_name
)
get_components_func
=
non_distributed_component_funcs
.
get_callable
(
model_name
)
model
,
train_dataloader
,
test_dataloader
,
optimizer
,
criterion
=
get_components_func
()
model
_builder
,
train_dataloader
,
test_dataloader
,
optimizer
,
criterion
=
get_components_func
()
model
=
model
()
model
=
model
_builder
()
shard_strategy
=
TensorShardStrategy
()
shard_strategy
=
TensorShardStrategy
()
model
=
model
.
half
().
cuda
()
model
=
model
.
half
().
cuda
()
zero_model
=
ShardedModelV2
(
deepcopy
(
model
),
shard_strategy
)
zero_model
=
ShardedModelV2
(
deepcopy
(
model
),
shard_strategy
)
...
...
tests/test_zero_data_parallel/test_zero_dev_3.py
deleted
100644 → 0
View file @
de464504
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import
copy
from
functools
import
partial
import
colossalai
import
pytest
import
torch
import
torch.multiprocessing
as
mp
import
torch.nn
as
nn
from
colossalai.logging
import
disable_existing_loggers
from
colossalai.utils
import
checkpoint
,
free_port
from
colossalai.zero.sharded_model
import
ShardedModel
from
common
import
Net
,
check_grads
,
check_params
,
check_params
def
checkpoint_wrapper
(
module
,
enable
=
True
):
if
enable
:
module
.
forward
=
partial
(
checkpoint
,
module
.
forward
)
return
module
class
Net
(
nn
.
Module
):
def
__init__
(
self
,
checkpoint
=
False
)
->
None
:
super
().
__init__
()
self
.
fc1
=
nn
.
Linear
(
5
,
5
)
self
.
fc2
=
nn
.
Linear
(
5
,
5
)
self
.
fc3
=
nn
.
Linear
(
5
,
1
)
if
checkpoint
:
self
.
fc1
=
checkpoint_wrapper
(
self
.
fc1
)
self
.
layers
=
[
self
.
fc1
,
self
.
fc2
,
self
.
fc1
,
self
.
fc2
,
self
.
fc3
]
def
forward
(
self
,
x
):
for
layer
in
self
.
layers
:
x
=
layer
(
x
)
return
x
def
run_step
(
model
,
optimizer
,
x
,
enable_autocast
=
False
):
model
.
train
()
optimizer
.
zero_grad
()
with
torch
.
cuda
.
amp
.
autocast
(
enabled
=
enable_autocast
):
y
=
model
(
x
)
loss
=
y
.
sum
()
loss
=
loss
.
float
()
loss
.
backward
()
optimizer
.
step
()
def
decode_booleans
(
intval
,
bits
):
res
=
[]
for
bit
in
range
(
bits
):
mask
=
1
<<
bit
res
.
append
((
intval
&
mask
)
==
mask
)
return
res
def
check_config
(
checkpoint
=
False
,
fp16
=
False
,
offload
=
False
):
model
=
Net
(
checkpoint
=
checkpoint
).
cuda
()
zero_model
=
copy
.
deepcopy
(
model
)
offload_config
=
{}
if
offload
:
offload_config
[
'device'
]
=
'cpu'
zero_model
=
zero_model
.
cpu
()
zero_model
=
ShardedModel
(
zero_model
,
mixed_precision
=
fp16
,
offload_config
=
offload_config
)
optimizer
=
torch
.
optim
.
Adam
(
model
.
parameters
(),
lr
=
1e-3
)
zero_optimizer
=
torch
.
optim
.
Adam
(
zero_model
.
parameters
(),
lr
=
1e-3
)
for
_
in
range
(
5
):
x
=
torch
.
rand
(
2
,
5
).
cuda
()
run_step
(
model
,
optimizer
,
x
,
enable_autocast
=
fp16
)
run_step
(
zero_model
,
zero_optimizer
,
x
,
enable_autocast
=
fp16
)
check_grads
(
model
,
zero_model
)
check_params
(
model
,
zero_model
)
for
_
in
range
(
5
):
x
=
torch
.
rand
(
2
,
5
).
cuda
()
run_step
(
model
,
optimizer
,
x
,
enable_autocast
=
False
)
run_step
(
zero_model
,
zero_optimizer
,
x
,
enable_autocast
=
False
)
check_grads
(
model
,
zero_model
,
loose
=
True
)
check_params
(
model
,
zero_model
,
loose
=
True
)
def
run_dist
(
rank
,
world_size
,
port
):
disable_existing_loggers
()
colossalai
.
launch
(
config
=
{},
rank
=
rank
,
world_size
=
world_size
,
host
=
'localhost'
,
port
=
port
,
backend
=
'nccl'
)
args
=
[
'checkpoint'
,
'fp16'
,
'offload'
]
def
pack_args
(
i
):
booleans
=
decode_booleans
(
i
,
len
(
args
))
return
{
arg
:
booleans
[
idx
]
for
idx
,
arg
in
enumerate
(
args
)}
for
j
in
range
(
2
**
len
(
args
)):
kwargs
=
pack_args
(
j
)
print
(
kwargs
)
check_config
(
**
kwargs
)
@
pytest
.
mark
.
dist
def
test_zero_level_3
():
world_size
=
1
run_func
=
partial
(
run_dist
,
world_size
=
world_size
,
port
=
free_port
())
mp
.
spawn
(
run_func
,
nprocs
=
world_size
)
if
__name__
==
'__main__'
:
test_zero_level_3
()
tests/test_zero_data_parallel/test_zero_dev_3_mp4.py
deleted
100644 → 0
View file @
de464504
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import
copy
from
functools
import
partial
import
colossalai
import
pytest
import
torch
import
torch.distributed
as
dist
import
torch.multiprocessing
as
mp
import
torch.nn
as
nn
from
colossalai.logging
import
disable_existing_loggers
from
colossalai.utils
import
checkpoint
,
free_port
from
colossalai.zero.sharded_model
import
ShardedModel
from
torch.nn.parallel
import
DistributedDataParallel
as
DDP
from
common
import
Net
,
check_grads_padding
,
check_params_padding
def
run_step
(
model
,
optimizer
,
x
,
enable_autocast
=
False
):
model
.
train
()
optimizer
.
zero_grad
()
with
torch
.
cuda
.
amp
.
autocast
(
enabled
=
enable_autocast
):
y
=
model
(
x
)
loss
=
y
.
sum
()
loss
=
loss
.
float
()
loss
.
backward
()
optimizer
.
step
()
def
decode_booleans
(
intval
,
bits
):
res
=
[]
for
bit
in
range
(
bits
):
mask
=
1
<<
bit
res
.
append
((
intval
&
mask
)
==
mask
)
return
res
def
check_config
(
checkpoint
=
False
,
fp16
=
False
,
offload
=
False
):
model
=
Net
(
checkpoint
=
checkpoint
).
cuda
()
zero_model
=
copy
.
deepcopy
(
model
)
ddp_model
=
DDP
(
model
)
offload_config
=
{}
if
offload
:
offload_config
[
'device'
]
=
'cpu'
zero_model
=
zero_model
.
cpu
()
zero_model
=
ShardedModel
(
zero_model
,
mixed_precision
=
fp16
,
offload_config
=
offload_config
)
optimizer
=
torch
.
optim
.
Adam
(
ddp_model
.
parameters
(),
lr
=
1e-3
)
zero_optimizer
=
torch
.
optim
.
Adam
(
zero_model
.
parameters
(),
lr
=
1e-3
)
for
_
in
range
(
5
):
x
=
torch
.
rand
(
2
,
5
).
cuda
()
run_step
(
ddp_model
,
optimizer
,
x
,
enable_autocast
=
fp16
)
run_step
(
zero_model
,
zero_optimizer
,
x
,
enable_autocast
=
fp16
)
check_grads_padding
(
ddp_model
,
zero_model
)
check_params_padding
(
ddp_model
,
zero_model
)
for
_
in
range
(
5
):
x
=
torch
.
rand
(
2
,
5
).
cuda
()
run_step
(
ddp_model
,
optimizer
,
x
,
enable_autocast
=
False
)
run_step
(
zero_model
,
zero_optimizer
,
x
,
enable_autocast
=
False
)
check_grads_padding
(
ddp_model
,
zero_model
,
loose
=
True
)
check_params_padding
(
ddp_model
,
zero_model
,
loose
=
True
)
def
run_dist
(
rank
,
world_size
,
port
):
disable_existing_loggers
()
colossalai
.
launch
(
config
=
{},
rank
=
rank
,
world_size
=
world_size
,
host
=
'localhost'
,
port
=
port
,
backend
=
'nccl'
)
args
=
[
'checkpoint'
,
'fp16'
,
'offload'
]
def
pack_args
(
i
):
booleans
=
decode_booleans
(
i
,
len
(
args
))
return
{
arg
:
booleans
[
idx
]
for
idx
,
arg
in
enumerate
(
args
)}
for
j
in
range
(
2
**
len
(
args
)):
kwargs
=
pack_args
(
j
)
if
dist
.
get_rank
()
==
0
:
print
(
kwargs
)
check_config
(
**
kwargs
)
@
pytest
.
mark
.
dist
def
test_zero_level_3
():
world_size
=
4
run_func
=
partial
(
run_dist
,
world_size
=
world_size
,
port
=
free_port
())
mp
.
spawn
(
run_func
,
nprocs
=
world_size
)
if
__name__
==
'__main__'
:
test_zero_level_3
()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment