Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ColossalAI
Commits
cd9c28e0
Unverified
Commit
cd9c28e0
authored
Dec 16, 2021
by
Frank Lee
Committed by
GitHub
Dec 16, 2021
Browse files
added CI for unit testing (#69)
parent
45355a62
Changes
68
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
274 additions
and
80 deletions
+274
-80
tests/test_zero_data_parallel/test_zero.sh
tests/test_zero_data_parallel/test_zero.sh
+0
-4
tests/test_zero_data_parallel/test_zero_level_2.py
tests/test_zero_data_parallel/test_zero_level_2.py
+102
-0
tests/test_zero_data_parallel/test_zero_level_3.py
tests/test_zero_data_parallel/test_zero_level_3.py
+20
-30
tests/test_zero_tensor_parallel/configs/vit_2d_zero2.py
tests/test_zero_tensor_parallel/configs/vit_2d_zero2.py
+0
-12
tests/test_zero_tensor_parallel/configs/vit_2d_zero3.py
tests/test_zero_tensor_parallel/configs/vit_2d_zero3.py
+0
-12
tests/test_zero_tensor_parallel/test.sh
tests/test_zero_tensor_parallel/test.sh
+0
-4
tests/test_zero_tensor_parallel/test_vit_2d_level_2.py
tests/test_zero_tensor_parallel/test_vit_2d_level_2.py
+33
-18
tests/test_zero_tensor_parallel/test_vit_2d_level_3.py
tests/test_zero_tensor_parallel/test_vit_2d_level_3.py
+119
-0
No files found.
tests/test_zero_data_parallel/test_zero.sh
deleted
100644 → 0
View file @
45355a62
#!/bin/bash
# Launch the ZeRO data-parallel test script for one SLURM task.
# The rank and world size are taken from the SLURM environment
# (SLURM_PROCID / SLURM_NPROCS); the master host comes from $HOST and the
# port is fixed at 29500.

test_file="test_zero.py"

# Quote every expansion so that values containing whitespace or glob
# characters are passed through as single arguments.
python "$test_file" --rank "$SLURM_PROCID" --world_size "$SLURM_NPROCS" --host "$HOST" --port 29500
\ No newline at end of file
tests/test_zero_data_parallel/test_zero_level_2.py
0 → 100644
View file @
cd9c28e0
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import
os
import
pytest
import
torch
import
torch.multiprocessing
as
mp
from
pathlib
import
Path
import
colossalai
from
colossalai.core
import
global_context
as
gpc
from
colossalai.utils
import
get_dataloader
from
torchvision
import
transforms
from
torchvision.models
import
resnet18
from
torchvision.datasets
import
CIFAR10
from
functools
import
partial
# Per-process batch size and the square edge length images are resized to.
BATCH_SIZE = 16
IMG_SIZE = 224

# Launch configuration handed to ``colossalai.launch``: fp16 mode unset,
# ZeRO level 2 with CPU offload, and pipeline / tensor parallel sizes of 1.
CONFIG = {
    'fp16': {
        'mode': None,
    },
    'zero': {
        'level': 2,
        'cpu_offload': True,
        'verbose': False,
    },
    'parallel': {
        'pipeline': {'size': 1},
        'tensor': {'size': 1, 'mode': None},
    },
}
def run_dist(rank, world_size):
    """Run a single ZeRO level-2 training step of ResNet-18 on CIFAR-10.

    This is the per-process worker: it launches the distributed context with
    the module-level ``CONFIG``, trains on exactly one batch and then tears
    the context down again.

    Args:
        rank: rank of this process, forwarded to ``colossalai.launch``.
        world_size: total number of processes, forwarded to
            ``colossalai.launch``.

    Raises:
        KeyError: if the ``DATA`` environment variable (used as the CIFAR-10
            root directory) is not set.
    """
    colossalai.launch(config=CONFIG,
                      rank=rank,
                      world_size=world_size,
                      host='localhost',
                      port=29940,
                      backend='nccl')

    try:
        # build model
        model = resnet18(num_classes=10)

        # build dataloader
        train_dataset = CIFAR10(
            root=Path(os.environ['DATA']),
            download=True,
            transform=transforms.Compose(
                [
                    transforms.Resize(size=(IMG_SIZE, IMG_SIZE)),
                    transforms.ToTensor(),
                    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
                ]
            )
        )
        train_dataloader = get_dataloader(dataset=train_dataset,
                                          shuffle=True,
                                          batch_size=BATCH_SIZE,
                                          pin_memory=True,
                                          drop_last=True)

        # build optimizer and loss
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
        criterion = torch.nn.CrossEntropyLoss()

        # Only the engine and the dataloader are needed; the remaining
        # return values of ``initialize`` are discarded.
        engine, train_dataloader, *_ = colossalai.initialize(model=model,
                                                             optimizer=optimizer,
                                                             criterion=criterion,
                                                             train_dataloader=train_dataloader)

        # train
        model.train()
        for data, label in train_dataloader:
            engine.zero_grad()
            data = data.cuda()
            label = label.cuda()

            output = engine(data)
            loss = engine.criterion(output, label)

            engine.backward(loss)
            engine.step()
            # One forward/backward/step cycle is enough for this test.
            break
    finally:
        # Always tear down the global context and release cached GPU memory,
        # even when the training step above raised.
        gpc.destroy()
        torch.cuda.empty_cache()
@pytest.mark.dist
def test_zero_level_2():
    """Spawn four worker processes that each execute ``run_dist``."""
    num_procs = 4
    worker = partial(run_dist, world_size=num_procs)
    mp.spawn(worker, nprocs=num_procs)


# Allow running the test directly as a script, without pytest.
if __name__ == '__main__':
    test_zero_level_2()
tests/test_zero_data_parallel/test_zero.py
→
tests/test_zero_data_parallel/test_zero
_level_3
.py
View file @
cd9c28e0
...
@@ -4,36 +4,25 @@
...
@@ -4,36 +4,25 @@
import
os
import
os
import
pytest
import
pytest
import
torch
import
torch
import
torch.multiprocessing
as
mp
from
pathlib
import
Path
from
pathlib
import
Path
import
colossalai
import
colossalai
from
colossalai.initialize
import
get_default_parser
from
colossalai.core
import
global_context
as
gpc
from
colossalai.core
import
global_context
as
gpc
from
colossalai.utils
import
get_dataloader
from
colossalai.utils
import
get_dataloader
from
torchvision
import
transforms
from
torchvision
import
transforms
from
torchvision.models
import
resnet18
from
torchvision.models
import
resnet18
from
torchvision.datasets
import
CIFAR10
from
torchvision.datasets
import
CIFAR10
from
functools
import
partial
BATCH_SIZE
=
1
28
BATCH_SIZE
=
1
6
IMG_SIZE
=
224
IMG_SIZE
=
224
NUM_CLS
=
1000
CONFIG
=
dict
(
CONFIG
=
dict
(
fp16
=
dict
(
fp16
=
dict
(
mode
=
None
,
mode
=
None
,
),
),
zero
=
dict
(
zero
=
dict
(
# ==============
# level 2 config
# ==============
# level=2,
# cpu_offload=True,
# verbose=False,
# ==============
# level 3 config
# ==============
level
=
3
,
level
=
3
,
verbose
=
False
,
verbose
=
False
,
offload_optimizer_config
=
dict
(
offload_optimizer_config
=
dict
(
...
@@ -57,16 +46,13 @@ CONFIG = dict(
...
@@ -57,16 +46,13 @@ CONFIG = dict(
)
)
def
run_dist
():
def
run_dist
(
rank
,
world_size
):
parser
=
get_default_parser
()
args
=
parser
.
parse_args
()
colossalai
.
launch
(
config
=
CONFIG
,
colossalai
.
launch
(
config
=
CONFIG
,
rank
=
args
.
rank
,
rank
=
rank
,
world_size
=
args
.
world_size
,
world_size
=
world_size
,
host
=
args
.
host
,
host
=
'local
host
'
,
port
=
args
.
port
,
port
=
29941
,
backend
=
args
.
backend
)
backend
=
'nccl'
)
# build model
# build model
model
=
resnet18
(
num_classes
=
10
)
model
=
resnet18
(
num_classes
=
10
)
...
@@ -86,7 +72,6 @@ def run_dist():
...
@@ -86,7 +72,6 @@ def run_dist():
train_dataloader
=
get_dataloader
(
dataset
=
train_dataset
,
train_dataloader
=
get_dataloader
(
dataset
=
train_dataset
,
shuffle
=
True
,
shuffle
=
True
,
batch_size
=
BATCH_SIZE
,
batch_size
=
BATCH_SIZE
,
num_workers
=
1
,
pin_memory
=
True
,
pin_memory
=
True
,
drop_last
=
True
)
drop_last
=
True
)
...
@@ -104,22 +89,27 @@ def run_dist():
...
@@ -104,22 +89,27 @@ def run_dist():
model
.
train
()
model
.
train
()
for
idx
,
(
data
,
label
)
in
enumerate
(
train_dataloader
):
for
idx
,
(
data
,
label
)
in
enumerate
(
train_dataloader
):
engine
.
zero_grad
()
engine
.
zero_grad
()
data
=
data
.
cuda
()
data
=
data
.
cuda
()
.
half
()
label
=
label
.
cuda
()
label
=
label
.
cuda
()
output
=
engine
(
data
)
output
=
engine
(
data
)
.
float
()
loss
=
engine
.
criterion
(
output
,
label
)
loss
=
engine
.
criterion
(
output
,
label
)
engine
.
backward
(
loss
)
engine
.
backward
(
loss
)
engine
.
step
()
engine
.
step
()
break
break
gpc
.
destroy
()
torch
.
cuda
.
empty_cache
()
@
pytest
.
mark
.
skip
(
"This test should be invoked manually using the script provided"
)
@
pytest
.
mark
.
dist
@
pytest
.
mark
.
dist
def
test_zero
():
@
pytest
.
mark
.
skip
(
"Level 3 has unknown bug so skip this test for now"
)
run_dist
()
def
test_zero_level_3
():
world_size
=
4
run_func
=
partial
(
run_dist
,
world_size
=
world_size
)
mp
.
spawn
(
run_func
,
nprocs
=
world_size
)
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
test_zero
()
test_zero
_level_3
()
tests/test_zero_tensor_parallel/configs/vit_2d_zero2.py
deleted
100644 → 0
View file @
45355a62
# Parallel layout: no pipeline parallelism, 2D tensor parallelism of size 4.
parallel = {
    'pipeline': {'size': 1},
    'tensor': {'size': 4, 'mode': '2d'},
}

# fp16 mode is left unset.
fp16 = {
    'mode': None,
}

# ZeRO level 2.
zero = {'level': 2}
tests/test_zero_tensor_parallel/configs/vit_2d_zero3.py
deleted
100644 → 0
View file @
45355a62
# Parallel layout: no pipeline parallelism, 2D tensor parallelism of size 4.
parallel = {
    'pipeline': {'size': 1},
    'tensor': {'size': 4, 'mode': '2d'},
}

# fp16 mode is left unset.
fp16 = {
    'mode': None,
}

# ZeRO level 3.
zero = {'level': 3}
tests/test_zero_tensor_parallel/test.sh
deleted
100644 → 0
View file @
45355a62
#!/usr/bin/env sh
# Launch the test script given as the first argument for one SLURM task.
# The rank and world size are taken from the SLURM environment
# (SLURM_PROCID / SLURM_NPROCS); the master host comes from $HOST and the
# port is fixed at 29500.

test_file="$1"

# Quote every expansion so that a path or value containing whitespace or
# glob characters is passed through as a single argument.
python "$test_file" --rank "$SLURM_PROCID" --world_size "$SLURM_NPROCS" --host "$HOST" --port 29500
\ No newline at end of file
tests/test_zero_tensor_parallel/test_vit_2d.py
→
tests/test_zero_tensor_parallel/test_vit_2d
_level_2
.py
View file @
cd9c28e0
...
@@ -6,12 +6,11 @@ from pathlib import Path
...
@@ -6,12 +6,11 @@ from pathlib import Path
import
pytest
import
pytest
import
torch.autograd
import
torch.autograd
import
torch.multiprocessing
as
mp
import
colossalai
import
colossalai
import
torch
import
torch
from
colossalai.initialize
import
get_default_parser
from
colossalai.builder
import
build_model
from
colossalai.builder
import
build_model
from
colossalai.context.parallel_mode
import
ParallelMode
from
colossalai.core
import
global_context
as
gpc
from
colossalai.core
import
global_context
as
gpc
from
colossalai.logging
import
get_dist_logger
from
colossalai.logging
import
get_dist_logger
from
colossalai.utils
import
get_dataloader
from
colossalai.utils
import
get_dataloader
...
@@ -20,9 +19,20 @@ from colossalai.nn import CrossEntropyLoss2D
...
@@ -20,9 +19,20 @@ from colossalai.nn import CrossEntropyLoss2D
from
torchvision
import
transforms
from
torchvision
import
transforms
from
torchvision.datasets
import
CIFAR10
from
torchvision.datasets
import
CIFAR10
from
components
import
*
from
components
import
*
from
functools
import
partial
level
=
os
.
environ
[
'LEVEL'
]
CONFIG_PATH
=
Path
(
__file__
).
parent
.
parent
.
joinpath
(
f
'configs/vit_2d_zero
{
level
}
.py'
)
CONFIG
=
dict
(
parallel
=
dict
(
pipeline
=
dict
(
size
=
1
),
tensor
=
dict
(
size
=
4
,
mode
=
'2d'
),
),
fp16
=
dict
(
mode
=
None
,
),
zero
=
dict
(
level
=
2
)
)
def
train_epoch
(
engine
,
train_dataloader
):
def
train_epoch
(
engine
,
train_dataloader
):
...
@@ -37,18 +47,14 @@ def train_epoch(engine, train_dataloader):
...
@@ -37,18 +47,14 @@ def train_epoch(engine, train_dataloader):
return
avg_loss
return
avg_loss
@
pytest
.
mark
.
dist
def
run_2d_parallel_vision_transformer_level_2
(
rank
,
world_size
):
@
pytest
.
mark
.
skip
(
"This test should be invoked by test.sh in the same folder as it runs on multiple gpus"
)
def
test_2d_parallel_vision_transformer
():
parser
=
get_default_parser
()
args
=
parser
.
parse_args
()
colossalai
.
launch
(
colossalai
.
launch
(
config
=
CONFIG
_PATH
,
config
=
CONFIG
,
rank
=
args
.
rank
,
rank
=
rank
,
world_size
=
args
.
world_size
,
world_size
=
world_size
,
host
=
args
.
host
,
host
=
'local
host
'
,
port
=
args
.
port
,
port
=
29950
,
backend
=
args
.
backend
backend
=
'nccl'
)
)
# build model
# build model
...
@@ -70,7 +76,6 @@ def test_2d_parallel_vision_transformer():
...
@@ -70,7 +76,6 @@ def test_2d_parallel_vision_transformer():
train_dataloader
=
get_dataloader
(
dataset
=
train_dataset
,
train_dataloader
=
get_dataloader
(
dataset
=
train_dataset
,
shuffle
=
True
,
shuffle
=
True
,
batch_size
=
BATCH_SIZE
,
batch_size
=
BATCH_SIZE
,
num_workers
=
1
,
pin_memory
=
True
,
pin_memory
=
True
,
drop_last
=
True
)
drop_last
=
True
)
...
@@ -97,6 +102,16 @@ def test_2d_parallel_vision_transformer():
...
@@ -97,6 +102,16 @@ def test_2d_parallel_vision_transformer():
engine
.
step
()
engine
.
step
()
break
break
gpc
.
destroy
()
torch
.
cuda
.
empty_cache
()
@
pytest
.
mark
.
dist
def
test_2d_vit_zero_level_2
():
world_size
=
8
run_func
=
partial
(
run_2d_parallel_vision_transformer_level_2
,
world_size
=
world_size
)
mp
.
spawn
(
run_func
,
nprocs
=
world_size
)
if
__name__
==
'__main__'
:
if
__name__
==
'__main__'
:
test_2d_
parallel_vision_transformer
()
test_2d_
vit_zero_level_2
()
tests/test_zero_tensor_parallel/test_vit_2d_level_3.py
0 → 100644
View file @
cd9c28e0
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import
os
from
pathlib
import
Path
import
pytest
import
torch.autograd
import
torch.multiprocessing
as
mp
import
colossalai
import
torch
from
colossalai.core
import
global_context
as
gpc
from
colossalai.builder
import
build_model
from
colossalai.logging
import
get_dist_logger
from
colossalai.utils
import
get_dataloader
from
colossalai.nn.layer._parallel_utilities
import
_gather
from
colossalai.nn
import
CrossEntropyLoss2D
from
torchvision
import
transforms
from
torchvision.datasets
import
CIFAR10
from
functools
import
partial
from
components
import
*
# Launch configuration handed to ``colossalai.launch``: 2D tensor parallelism
# of size 4 without pipeline parallelism, fp16 mode unset, ZeRO level 3.
CONFIG = {
    'parallel': {
        'pipeline': {'size': 1},
        'tensor': {'size': 4, 'mode': '2d'},
    },
    'fp16': {
        'mode': None,
    },
    'zero': {'level': 3},
}
def train_epoch(engine, train_dataloader):
    """Step the engine once per batch in the loader and return the mean loss.

    Args:
        engine: object exposing ``train()`` and ``step(data_iter)``; ``step``
            must return an ``(output, label, loss)`` triple.
        train_dataloader: sized iterable of batches; its length determines
            how many steps are executed.

    Returns:
        The sum of the per-step losses divided by the number of steps.

    Raises:
        ZeroDivisionError: if ``train_dataloader`` has length zero.
    """
    engine.train()
    step_count = len(train_dataloader)
    batches = iter(train_dataloader)

    def _next_loss():
        # Only the loss of each step is needed; output and label are dropped.
        _, _, step_loss = engine.step(batches)
        return step_loss.detach().cpu().numpy()

    loss_total = sum(_next_loss() for _ in range(step_count))
    return loss_total / step_count
def run_2d_parallel_vision_transformer_level_3(rank, world_size):
    """Run a single ZeRO level-3 training step of the 2D-parallel model on CIFAR-10.

    This is the per-process worker: it launches the distributed context with
    the module-level ``CONFIG``, builds the model from ``model_cfg``, trains
    on exactly one batch and then tears the context down again.

    Args:
        rank: rank of this process, forwarded to ``colossalai.launch``.
        world_size: total number of processes, forwarded to
            ``colossalai.launch``.

    Raises:
        KeyError: if the ``DATA`` environment variable (used as the CIFAR-10
            root directory) is not set.
    """
    colossalai.launch(config=CONFIG,
                      rank=rank,
                      world_size=world_size,
                      host='localhost',
                      port=29951,
                      backend='nccl')

    try:
        # build model
        model = build_model(model_cfg)
        model.build_from_cfg()

        # build dataloader
        train_dataset = CIFAR10(
            root=Path(os.environ['DATA']),
            download=True,
            transform=transforms.Compose(
                [
                    transforms.Resize(size=(IMG_SIZE, IMG_SIZE)),
                    transforms.ToTensor(),
                    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
                ]
            )
        )
        train_dataloader = get_dataloader(dataset=train_dataset,
                                          shuffle=True,
                                          batch_size=BATCH_SIZE,
                                          pin_memory=True,
                                          drop_last=True)

        # build optimizer and loss
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
        criterion = CrossEntropyLoss2D()

        # Only the engine and the dataloader are needed; the remaining
        # return values of ``initialize`` are discarded.
        engine, train_dataloader, *_ = colossalai.initialize(model=model,
                                                             optimizer=optimizer,
                                                             criterion=criterion,
                                                             train_dataloader=train_dataloader)
        logger = get_dist_logger()
        logger.info('start training')

        engine.train()
        for img, label in train_dataloader:
            engine.zero_grad()
            img = img.cuda()
            label = label.cuda()

            out = engine(img)
            loss = engine.criterion(out, label)

            engine.backward(loss)
            engine.step()
            # One forward/backward/step cycle is enough for this test.
            break
    finally:
        # Always tear down the global context and release cached GPU memory,
        # even when the training step above raised.
        gpc.destroy()
        torch.cuda.empty_cache()
@pytest.mark.dist
@pytest.mark.skip("Level 3 has unknown bug so skip this test for now")
def test_3d_vit_zero_level_3():
    """Spawn eight worker processes for the ZeRO level-3 vision transformer run.

    NOTE(review): the name says "3d", but the worker spawned here is
    ``run_2d_parallel_vision_transformer_level_3`` -- confirm the intended name.
    """
    num_procs = 8
    worker = partial(run_2d_parallel_vision_transformer_level_3, world_size=num_procs)
    mp.spawn(worker, nprocs=num_procs)


# Allow running the test directly as a script, without pytest.
if __name__ == '__main__':
    test_3d_vit_zero_level_3()
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment