Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ColossalAI
Commits
537e1817
Unverified
Commit
537e1817
authored
Nov 29, 2022
by
HELSON
Committed by
GitHub
Nov 29, 2022
Browse files
[testing] fix testing models (#2036)
* [testing] fix testing models * roll back
parent
a1ce02d7
Changes
7
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
74 additions
and
64 deletions
+74
-64
tests/components_to_test/bert.py
tests/components_to_test/bert.py
+7
-4
tests/components_to_test/inline_op_model.py
tests/components_to_test/inline_op_model.py
+1
-1
tests/components_to_test/nested_model.py
tests/components_to_test/nested_model.py
+4
-2
tests/components_to_test/no_leaf_module.py
tests/components_to_test/no_leaf_module.py
+48
-46
tests/components_to_test/repeated_computed_layer.py
tests/components_to_test/repeated_computed_layer.py
+4
-2
tests/components_to_test/simple_net.py
tests/components_to_test/simple_net.py
+6
-4
tests/test_gemini/update/test_fwd_bwd.py
tests/test_gemini/update/test_fwd_bwd.py
+4
-5
No files found.
tests/components_to_test/bert.py
View file @
537e1817
...
@@ -8,6 +8,7 @@ from .registry import non_distributed_component_funcs
...
@@ -8,6 +8,7 @@ from .registry import non_distributed_component_funcs
def
get_bert_data_loader
(
def
get_bert_data_loader
(
n_class
,
batch_size
,
batch_size
,
total_samples
,
total_samples
,
sequence_length
,
sequence_length
,
...
@@ -16,7 +17,7 @@ def get_bert_data_loader(
...
@@ -16,7 +17,7 @@ def get_bert_data_loader(
):
):
train_data
=
torch
.
randint
(
train_data
=
torch
.
randint
(
low
=
0
,
low
=
0
,
high
=
1000
,
high
=
n_class
,
size
=
(
total_samples
,
sequence_length
),
size
=
(
total_samples
,
sequence_length
),
device
=
device
,
device
=
device
,
dtype
=
torch
.
long
,
dtype
=
torch
.
long
,
...
@@ -37,7 +38,7 @@ def get_training_components():
...
@@ -37,7 +38,7 @@ def get_training_components():
num_head
=
4
num_head
=
4
sequence_length
=
12
sequence_length
=
12
num_layer
=
2
num_layer
=
2
vocab_size
=
3
0524
vocab_size
=
3
2
def
bert_model_builder
(
checkpoint
):
def
bert_model_builder
(
checkpoint
):
config
=
BertConfig
(
vocab_size
=
vocab_size
,
config
=
BertConfig
(
vocab_size
=
vocab_size
,
...
@@ -67,11 +68,13 @@ def get_training_components():
...
@@ -67,11 +68,13 @@ def get_training_components():
return
model
return
model
trainloader
=
get_bert_data_loader
(
batch_size
=
2
,
trainloader
=
get_bert_data_loader
(
n_class
=
vocab_size
,
batch_size
=
2
,
total_samples
=
10000
,
total_samples
=
10000
,
sequence_length
=
sequence_length
,
sequence_length
=
sequence_length
,
is_distrbuted
=
True
)
is_distrbuted
=
True
)
testloader
=
get_bert_data_loader
(
batch_size
=
2
,
testloader
=
get_bert_data_loader
(
n_class
=
vocab_size
,
batch_size
=
2
,
total_samples
=
10000
,
total_samples
=
10000
,
sequence_length
=
sequence_length
,
sequence_length
=
sequence_length
,
is_distrbuted
=
True
)
is_distrbuted
=
True
)
...
...
tests/components_to_test/inline_op_model.py
View file @
537e1817
...
@@ -41,7 +41,7 @@ class DummyDataLoader(DummyDataGenerator):
...
@@ -41,7 +41,7 @@ class DummyDataLoader(DummyDataGenerator):
@
non_distributed_component_funcs
.
register
(
name
=
'inline_op_model'
)
@
non_distributed_component_funcs
.
register
(
name
=
'inline_op_model'
)
def
get_training_components
():
def
get_training_components
():
def
model_builder
(
checkpoint
=
Tru
e
):
def
model_builder
(
checkpoint
=
Fals
e
):
return
InlineOpModule
(
checkpoint
)
return
InlineOpModule
(
checkpoint
)
trainloader
=
DummyDataLoader
()
trainloader
=
DummyDataLoader
()
...
...
tests/components_to_test/nested_model.py
View file @
537e1817
import
torch
import
torch
import
torch.nn
as
nn
import
torch.nn
as
nn
import
torch.nn.functional
as
F
import
torch.nn.functional
as
F
from
colossalai.nn
import
CheckpointModule
from
colossalai.nn
import
CheckpointModule
from
.utils
import
DummyDataGenerator
from
.registry
import
non_distributed_component_funcs
from
.registry
import
non_distributed_component_funcs
from
.utils
import
DummyDataGenerator
class
SubNet
(
nn
.
Module
):
class
SubNet
(
nn
.
Module
):
...
@@ -43,7 +45,7 @@ class DummyDataLoader(DummyDataGenerator):
...
@@ -43,7 +45,7 @@ class DummyDataLoader(DummyDataGenerator):
@
non_distributed_component_funcs
.
register
(
name
=
'nested_model'
)
@
non_distributed_component_funcs
.
register
(
name
=
'nested_model'
)
def
get_training_components
():
def
get_training_components
():
def
model_builder
(
checkpoint
=
Tru
e
):
def
model_builder
(
checkpoint
=
Fals
e
):
return
NestedNet
(
checkpoint
)
return
NestedNet
(
checkpoint
)
trainloader
=
DummyDataLoader
()
trainloader
=
DummyDataLoader
()
...
...
tests/components_to_test/no_leaf_module.py
View file @
537e1817
import
torch
import
torch
import
torch.nn
as
nn
import
torch.nn
as
nn
import
torch.nn.functional
as
F
import
torch.nn.functional
as
F
from
colossalai.nn
import
CheckpointModule
from
.utils.dummy_data_generator
import
DummyDataGenerator
from
colossalai.nn
import
CheckpointModule
from
.registry
import
non_distributed_component_funcs
from
.registry
import
non_distributed_component_funcs
from
.utils.dummy_data_generator
import
DummyDataGenerator
class
NoLeafModule
(
CheckpointModule
):
"""
In this no-leaf module, it has subordinate nn.modules and a nn.Parameter.
class
NoLeafModule
(
CheckpointModule
):
"""
"""
In this no-leaf module, it has subordinate nn.modules and a nn.Parameter.
def
__init__
(
self
,
checkpoint
=
False
)
->
None
:
"""
super
().
__init__
(
checkpoint
=
checkpoint
)
self
.
proj1
=
nn
.
Linear
(
4
,
8
)
def
__init__
(
self
,
checkpoint
=
False
)
->
None
:
self
.
weight
=
nn
.
Parameter
(
torch
.
randn
(
8
,
8
))
super
().
__init__
(
checkpoint
=
checkpoint
)
self
.
proj2
=
nn
.
Linear
(
8
,
4
)
self
.
proj1
=
nn
.
Linear
(
4
,
8
)
self
.
weight
=
nn
.
Parameter
(
torch
.
randn
(
8
,
8
))
def
forward
(
self
,
x
):
self
.
proj2
=
nn
.
Linear
(
8
,
4
)
x
=
self
.
proj1
(
x
)
x
=
F
.
linear
(
x
,
self
.
weight
)
def
forward
(
self
,
x
):
x
=
self
.
proj2
(
x
)
x
=
self
.
proj1
(
x
)
return
x
x
=
F
.
linear
(
x
,
self
.
weight
)
x
=
self
.
proj2
(
x
)
return
x
class
DummyDataLoader
(
DummyDataGenerator
):
def
generate
(
self
):
class
DummyDataLoader
(
DummyDataGenerator
):
data
=
torch
.
rand
(
16
,
4
)
label
=
torch
.
randint
(
low
=
0
,
high
=
2
,
size
=
(
16
,))
def
generate
(
self
):
return
data
,
label
data
=
torch
.
rand
(
16
,
4
)
label
=
torch
.
randint
(
low
=
0
,
high
=
2
,
size
=
(
16
,))
return
data
,
label
@
non_distributed_component_funcs
.
register
(
name
=
'no_leaf_module'
)
def
get_training_components
():
@
non_distributed_component_funcs
.
register
(
name
=
'no_leaf_module'
)
def
model_builder
(
checkpoint
=
True
):
def
get_training_components
():
return
NoLeafModule
(
checkpoint
)
def
model_builder
(
checkpoint
=
False
):
trainloader
=
DummyDataLoader
()
return
NoLeafModule
(
checkpoint
)
testloader
=
DummyDataLoader
()
trainloader
=
DummyDataLoader
()
criterion
=
torch
.
nn
.
CrossEntropyLoss
()
testloader
=
DummyDataLoader
()
from
colossalai.nn.optimizer
import
HybridAdam
return
model_builder
,
trainloader
,
testloader
,
HybridAdam
,
criterion
criterion
=
torch
.
nn
.
CrossEntropyLoss
()
from
colossalai.nn.optimizer
import
HybridAdam
return
model_builder
,
trainloader
,
testloader
,
HybridAdam
,
criterion
tests/components_to_test/repeated_computed_layer.py
View file @
537e1817
...
@@ -2,9 +2,11 @@
...
@@ -2,9 +2,11 @@
import
torch
import
torch
import
torch.nn
as
nn
import
torch.nn
as
nn
from
colossalai.nn
import
CheckpointModule
from
colossalai.nn
import
CheckpointModule
from
.utils.dummy_data_generator
import
DummyDataGenerator
from
.registry
import
non_distributed_component_funcs
from
.registry
import
non_distributed_component_funcs
from
.utils.dummy_data_generator
import
DummyDataGenerator
class
NetWithRepeatedlyComputedLayers
(
CheckpointModule
):
class
NetWithRepeatedlyComputedLayers
(
CheckpointModule
):
...
@@ -37,7 +39,7 @@ class DummyDataLoader(DummyDataGenerator):
...
@@ -37,7 +39,7 @@ class DummyDataLoader(DummyDataGenerator):
@
non_distributed_component_funcs
.
register
(
name
=
'repeated_computed_layers'
)
@
non_distributed_component_funcs
.
register
(
name
=
'repeated_computed_layers'
)
def
get_training_components
():
def
get_training_components
():
def
model_builder
(
checkpoint
=
Tru
e
):
def
model_builder
(
checkpoint
=
Fals
e
):
return
NetWithRepeatedlyComputedLayers
(
checkpoint
)
return
NetWithRepeatedlyComputedLayers
(
checkpoint
)
trainloader
=
DummyDataLoader
()
trainloader
=
DummyDataLoader
()
...
...
tests/components_to_test/simple_net.py
View file @
537e1817
import
torch
import
torch
import
torch.nn
as
nn
import
torch.nn
as
nn
from
colossalai.nn
import
CheckpointModule
from
colossalai.nn
import
CheckpointModule
from
.utils.dummy_data_generator
import
DummyDataGenerator
from
.registry
import
non_distributed_component_funcs
from
colossalai.utils.cuda
import
get_current_device
from
colossalai.utils.cuda
import
get_current_device
from
.registry
import
non_distributed_component_funcs
from
.utils.dummy_data_generator
import
DummyDataGenerator
class
SimpleNet
(
CheckpointModule
):
class
SimpleNet
(
CheckpointModule
):
"""
"""
In this no-leaf module, it has subordinate nn.modules and a nn.Parameter.
In this no-leaf module, it has subordinate nn.modules and a nn.Parameter.
...
@@ -29,7 +32,6 @@ class SimpleNet(CheckpointModule):
...
@@ -29,7 +32,6 @@ class SimpleNet(CheckpointModule):
return
x
return
x
class
DummyDataLoader
(
DummyDataGenerator
):
class
DummyDataLoader
(
DummyDataGenerator
):
def
generate
(
self
):
def
generate
(
self
):
...
@@ -41,7 +43,7 @@ class DummyDataLoader(DummyDataGenerator):
...
@@ -41,7 +43,7 @@ class DummyDataLoader(DummyDataGenerator):
@
non_distributed_component_funcs
.
register
(
name
=
'simple_net'
)
@
non_distributed_component_funcs
.
register
(
name
=
'simple_net'
)
def
get_training_components
():
def
get_training_components
():
def
model_builder
(
checkpoint
=
Tru
e
):
def
model_builder
(
checkpoint
=
Fals
e
):
return
SimpleNet
(
checkpoint
)
return
SimpleNet
(
checkpoint
)
trainloader
=
DummyDataLoader
()
trainloader
=
DummyDataLoader
()
...
...
tests/test_gemini/update/test_fwd_bwd.py
View file @
537e1817
...
@@ -4,6 +4,7 @@ import pytest
...
@@ -4,6 +4,7 @@ import pytest
import
torch
import
torch
import
torch.multiprocessing
as
mp
import
torch.multiprocessing
as
mp
from
torch.nn.parallel
import
DistributedDataParallel
as
DDP
from
torch.nn.parallel
import
DistributedDataParallel
as
DDP
from
torch.testing
import
assert_close
import
colossalai
import
colossalai
from
colossalai.amp
import
convert_to_apex_amp
from
colossalai.amp
import
convert_to_apex_amp
...
@@ -28,7 +29,7 @@ def check_grad(model: ZeroDDP, torch_model: torch.nn.Module):
...
@@ -28,7 +29,7 @@ def check_grad(model: ZeroDDP, torch_model: torch.nn.Module):
chunk_manager
.
access_chunk
(
chunk
)
chunk_manager
.
access_chunk
(
chunk
)
for
(
p0
,
p1
)
in
zip
(
model
.
parameters
(),
torch_model
.
parameters
()):
for
(
p0
,
p1
)
in
zip
(
model
.
parameters
(),
torch_model
.
parameters
()):
assert
torch
.
all
close
(
p0
,
p1
.
grad
,
a
tol
=
1e-3
,
r
tol
=
1
e-5
)
,
"{}"
.
format
(
torch
.
max
(
torch
.
abs
(
p0
-
p1
.
grad
)).
item
())
assert
_
close
(
p0
,
p1
.
grad
,
r
tol
=
1e-3
,
a
tol
=
5
e-5
)
@
parameterize
(
'placement_policy'
,
[
'cuda'
,
'cpu'
,
'auto'
,
'const'
])
@
parameterize
(
'placement_policy'
,
[
'cuda'
,
'cpu'
,
'auto'
,
'const'
])
...
@@ -74,10 +75,8 @@ def exam_gpt_fwd_bwd(placement_policy, keep_gather, model_name: str, use_grad_ch
...
@@ -74,10 +75,8 @@ def exam_gpt_fwd_bwd(placement_policy, keep_gather, model_name: str, use_grad_ch
torch_loss
=
run_fwd_bwd
(
torch_model
,
input_ids
.
cuda
(),
label
.
cuda
(),
criterion
,
use_init_ctx
=
False
)
torch_loss
=
run_fwd_bwd
(
torch_model
,
input_ids
.
cuda
(),
label
.
cuda
(),
criterion
,
use_init_ctx
=
False
)
loss
=
run_fwd_bwd
(
model
,
input_ids
.
cuda
(),
label
.
cuda
(),
criterion
,
use_init_ctx
=
True
)
loss
=
run_fwd_bwd
(
model
,
input_ids
.
cuda
(),
label
.
cuda
(),
criterion
,
use_init_ctx
=
True
)
assert
torch
.
allclose
(
loss
,
torch_loss
,
rtol
=
1e-2
),
"{} {} {}"
.
format
(
assert
torch
.
equal
(
torch_loss
,
loss
)
torch
.
max
(
torch
.
abs
(
loss
-
torch_loss
)).
item
(),
loss
,
torch_loss
)
# FIXME(1SAA) bert and resnet18 can not pass the check_grad
check_grad
(
model
,
torch_model
)
check_grad
(
model
,
torch_model
)
...
@@ -96,4 +95,4 @@ def test_gpt(world_size):
...
@@ -96,4 +95,4 @@ def test_gpt(world_size):
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
test_gpt
(
1
)
test_gpt
(
4
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment