Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ColossalAI
Commits
cd9c28e0
"...models/git@developer.sourcefind.cn:OpenDAS/colossalai.git" did not exist on "9d02590c9a64d12bc31866f35bf9b51a4084963f"
Unverified
Commit
cd9c28e0
authored
Dec 16, 2021
by
Frank Lee
Committed by
GitHub
Dec 16, 2021
Browse files
added CI for unit testing (#69)
parent
45355a62
Changes
68
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
274 additions
and
80 deletions
+274
-80
tests/test_zero_data_parallel/test_zero.sh
tests/test_zero_data_parallel/test_zero.sh
+0
-4
tests/test_zero_data_parallel/test_zero_level_2.py
tests/test_zero_data_parallel/test_zero_level_2.py
+102
-0
tests/test_zero_data_parallel/test_zero_level_3.py
tests/test_zero_data_parallel/test_zero_level_3.py
+20
-30
tests/test_zero_tensor_parallel/configs/vit_2d_zero2.py
tests/test_zero_tensor_parallel/configs/vit_2d_zero2.py
+0
-12
tests/test_zero_tensor_parallel/configs/vit_2d_zero3.py
tests/test_zero_tensor_parallel/configs/vit_2d_zero3.py
+0
-12
tests/test_zero_tensor_parallel/test.sh
tests/test_zero_tensor_parallel/test.sh
+0
-4
tests/test_zero_tensor_parallel/test_vit_2d_level_2.py
tests/test_zero_tensor_parallel/test_vit_2d_level_2.py
+33
-18
tests/test_zero_tensor_parallel/test_vit_2d_level_3.py
tests/test_zero_tensor_parallel/test_vit_2d_level_3.py
+119
-0
No files found.
tests/test_zero_data_parallel/test_zero.sh
deleted
100644 → 0
View file @
45355a62
#!/bin/bash
# Launch the ZeRO data-parallel test under a SLURM allocation.
# Rank/world-size come from SLURM-provided variables; HOST must point at the
# rendezvous host. NOTE(review): assumes SLURM starts one process per rank
# (e.g. via srun) — confirm against the CI job definition.
test_file="test_zero.py"
python $test_file --rank $SLURM_PROCID --world_size $SLURM_NPROCS --host $HOST --port 29500
tests/test_zero_data_parallel/test_zero_level_2.py
0 → 100644
View file @
cd9c28e0
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import
os
import
pytest
import
torch
import
torch.multiprocessing
as
mp
from
pathlib
import
Path
import
colossalai
from
colossalai.core
import
global_context
as
gpc
from
colossalai.utils
import
get_dataloader
from
torchvision
import
transforms
from
torchvision.models
import
resnet18
from
torchvision.datasets
import
CIFAR10
from
functools
import
partial
# Hyper-parameters for the ZeRO level-2 data-parallel smoke test.
BATCH_SIZE = 16
IMG_SIZE = 224

# Launch configuration: mixed precision disabled, ZeRO level 2 with CPU
# offload, and no pipeline or tensor parallelism (both sizes are 1).
CONFIG = {
    'fp16': {'mode': None},
    'zero': {
        'level': 2,
        'cpu_offload': True,
        'verbose': False,
    },
    'parallel': {
        'pipeline': {'size': 1},
        'tensor': {'size': 1, 'mode': None},
    },
}
def run_dist(rank, world_size):
    """Worker entry point: run one ZeRO level-2 training step on ResNet-18/CIFAR10.

    Args:
        rank: rank of this process within the distributed group.
        world_size: total number of processes spawned.
    """
    # bring up the distributed environment (single node, NCCL backend)
    colossalai.launch(config=CONFIG,
                      rank=rank,
                      world_size=world_size,
                      host='localhost',
                      port=29940,
                      backend='nccl')

    # build model
    model = resnet18(num_classes=10)

    # build dataloader
    preprocessing = transforms.Compose([
        transforms.Resize(size=(IMG_SIZE, IMG_SIZE)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
    ])
    # dataset root is taken from the DATA environment variable
    train_dataset = CIFAR10(root=Path(os.environ['DATA']),
                            download=True,
                            transform=preprocessing)
    train_dataloader = get_dataloader(dataset=train_dataset,
                                      shuffle=True,
                                      batch_size=BATCH_SIZE,
                                      pin_memory=True,
                                      drop_last=True)

    # build optimizer and loss
    # optimizer = build_optimizer(global_context.config.optimizer, model)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = torch.nn.CrossEntropyLoss()

    # wrap everything into a colossalai engine
    engine, train_dataloader, *args = colossalai.initialize(model=model,
                                                            optimizer=optimizer,
                                                            criterion=criterion,
                                                            train_dataloader=train_dataloader)

    # run exactly one training step as a smoke test
    model.train()
    for batch_idx, (data, label) in enumerate(train_dataloader):
        engine.zero_grad()
        data = data.cuda()
        label = label.cuda()
        output = engine(data)
        loss = engine.criterion(output, label)
        engine.backward(loss)
        engine.step()
        break

    # tear down the process group and release cached GPU memory
    gpc.destroy()
    torch.cuda.empty_cache()
@pytest.mark.dist
def test_zero_level_2():
    """Spawn 4 worker processes and run the ZeRO level-2 smoke test."""
    nprocs = 4
    worker = partial(run_dist, world_size=nprocs)
    mp.spawn(worker, nprocs=nprocs)
# Allow running this test file directly, without pytest.
if __name__ == '__main__':
    test_zero_level_2()
tests/test_zero_data_parallel/test_zero.py
→
tests/test_zero_data_parallel/test_zero
_level_3
.py
View file @
cd9c28e0
...
...
@@ -4,36 +4,25 @@
import
os
import
pytest
import
torch
import
torch.multiprocessing
as
mp
from
pathlib
import
Path
import
colossalai
from
colossalai.initialize
import
get_default_parser
from
colossalai.core
import
global_context
as
gpc
from
colossalai.utils
import
get_dataloader
from
torchvision
import
transforms
from
torchvision.models
import
resnet18
from
torchvision.datasets
import
CIFAR10
from
functools
import
partial
BATCH_SIZE
=
1
28
BATCH_SIZE
=
1
6
IMG_SIZE
=
224
NUM_CLS
=
1000
CONFIG
=
dict
(
fp16
=
dict
(
mode
=
None
,
),
zero
=
dict
(
# ==============
# level 2 config
# ==============
# level=2,
# cpu_offload=True,
# verbose=False,
# ==============
# level 3 config
# ==============
level
=
3
,
verbose
=
False
,
offload_optimizer_config
=
dict
(
...
...
@@ -57,16 +46,13 @@ CONFIG = dict(
)
def
run_dist
():
parser
=
get_default_parser
()
args
=
parser
.
parse_args
()
def
run_dist
(
rank
,
world_size
):
colossalai
.
launch
(
config
=
CONFIG
,
rank
=
args
.
rank
,
world_size
=
args
.
world_size
,
host
=
args
.
host
,
port
=
args
.
port
,
backend
=
args
.
backend
)
rank
=
rank
,
world_size
=
world_size
,
host
=
'local
host
'
,
port
=
29941
,
backend
=
'nccl'
)
# build model
model
=
resnet18
(
num_classes
=
10
)
...
...
@@ -86,7 +72,6 @@ def run_dist():
train_dataloader
=
get_dataloader
(
dataset
=
train_dataset
,
shuffle
=
True
,
batch_size
=
BATCH_SIZE
,
num_workers
=
1
,
pin_memory
=
True
,
drop_last
=
True
)
...
...
@@ -104,22 +89,27 @@ def run_dist():
model
.
train
()
for
idx
,
(
data
,
label
)
in
enumerate
(
train_dataloader
):
engine
.
zero_grad
()
data
=
data
.
cuda
()
data
=
data
.
cuda
()
.
half
()
label
=
label
.
cuda
()
output
=
engine
(
data
)
output
=
engine
(
data
)
.
float
()
loss
=
engine
.
criterion
(
output
,
label
)
engine
.
backward
(
loss
)
engine
.
step
()
break
gpc
.
destroy
()
torch
.
cuda
.
empty_cache
()
@
pytest
.
mark
.
skip
(
"This test should be invoked manually using the script provided"
)
@
pytest
.
mark
.
dist
def
test_zero
():
run_dist
()
@
pytest
.
mark
.
skip
(
"Level 3 has unknown bug so skip this test for now"
)
def
test_zero_level_3
():
world_size
=
4
run_func
=
partial
(
run_dist
,
world_size
=
world_size
)
mp
.
spawn
(
run_func
,
nprocs
=
world_size
)
if
__name__
==
'__main__'
:
test_zero
()
test_zero
_level_3
()
tests/test_zero_tensor_parallel/configs/vit_2d_zero2.py
deleted
100644 → 0
View file @
45355a62
# Configuration for the ViT test: 2D tensor parallelism over 4 devices,
# no pipeline parallelism.
parallel = {
    'pipeline': {'size': 1},
    'tensor': {'size': 4, 'mode': '2d'},
}

# Mixed precision disabled.
fp16 = {'mode': None}

# ZeRO optimizer level 2.
zero = {'level': 2}
tests/test_zero_tensor_parallel/configs/vit_2d_zero3.py
deleted
100644 → 0
View file @
45355a62
# Configuration for the ViT test: 2D tensor parallelism over 4 devices,
# no pipeline parallelism.
parallel = {
    'pipeline': {'size': 1},
    'tensor': {'size': 4, 'mode': '2d'},
}

# Mixed precision disabled.
fp16 = {'mode': None}

# ZeRO optimizer level 3.
zero = {'level': 3}
tests/test_zero_tensor_parallel/test.sh
deleted
100644 → 0
View file @
45355a62
#!/usr/bin/env sh
# Run an arbitrary test file (passed as $1) under a SLURM allocation.
# Rank/world-size come from SLURM-provided variables; HOST must point at the
# rendezvous host. NOTE(review): assumes SLURM starts one process per rank
# (e.g. via srun) — confirm against the CI job definition.
test_file=$1
python $test_file --rank $SLURM_PROCID --world_size $SLURM_NPROCS --host $HOST --port 29500
tests/test_zero_tensor_parallel/test_vit_2d.py
→
tests/test_zero_tensor_parallel/test_vit_2d
_level_2
.py
View file @
cd9c28e0
...
...
@@ -6,12 +6,11 @@ from pathlib import Path
import
pytest
import
torch.autograd
import
torch.multiprocessing
as
mp
import
colossalai
import
torch
from
colossalai.initialize
import
get_default_parser
from
colossalai.builder
import
build_model
from
colossalai.context.parallel_mode
import
ParallelMode
from
colossalai.core
import
global_context
as
gpc
from
colossalai.logging
import
get_dist_logger
from
colossalai.utils
import
get_dataloader
...
...
@@ -20,9 +19,20 @@ from colossalai.nn import CrossEntropyLoss2D
from
torchvision
import
transforms
from
torchvision.datasets
import
CIFAR10
from
components
import
*
level
=
os
.
environ
[
'LEVEL'
]
CONFIG_PATH
=
Path
(
__file__
).
parent
.
parent
.
joinpath
(
f
'configs/vit_2d_zero
{
level
}
.py'
)
from
functools
import
partial
CONFIG
=
dict
(
parallel
=
dict
(
pipeline
=
dict
(
size
=
1
),
tensor
=
dict
(
size
=
4
,
mode
=
'2d'
),
),
fp16
=
dict
(
mode
=
None
,
),
zero
=
dict
(
level
=
2
)
)
def
train_epoch
(
engine
,
train_dataloader
):
...
...
@@ -37,18 +47,14 @@ def train_epoch(engine, train_dataloader):
return
avg_loss
@
pytest
.
mark
.
dist
@
pytest
.
mark
.
skip
(
"This test should be invoked by test.sh in the same folder as it runs on multiple gpus"
)
def
test_2d_parallel_vision_transformer
():
parser
=
get_default_parser
()
args
=
parser
.
parse_args
()
def
run_2d_parallel_vision_transformer_level_2
(
rank
,
world_size
):
colossalai
.
launch
(
config
=
CONFIG
_PATH
,
rank
=
args
.
rank
,
world_size
=
args
.
world_size
,
host
=
args
.
host
,
port
=
args
.
port
,
backend
=
args
.
backend
config
=
CONFIG
,
rank
=
rank
,
world_size
=
world_size
,
host
=
'local
host
'
,
port
=
29950
,
backend
=
'nccl'
)
# build model
...
...
@@ -70,7 +76,6 @@ def test_2d_parallel_vision_transformer():
train_dataloader
=
get_dataloader
(
dataset
=
train_dataset
,
shuffle
=
True
,
batch_size
=
BATCH_SIZE
,
num_workers
=
1
,
pin_memory
=
True
,
drop_last
=
True
)
...
...
@@ -97,6 +102,16 @@ def test_2d_parallel_vision_transformer():
engine
.
step
()
break
gpc
.
destroy
()
torch
.
cuda
.
empty_cache
()
@
pytest
.
mark
.
dist
def
test_2d_vit_zero_level_2
():
world_size
=
8
run_func
=
partial
(
run_2d_parallel_vision_transformer_level_2
,
world_size
=
world_size
)
mp
.
spawn
(
run_func
,
nprocs
=
world_size
)
if
__name__
==
'__main__'
:
test_2d_
parallel_vision_transformer
()
test_2d_
vit_zero_level_2
()
tests/test_zero_tensor_parallel/test_vit_2d_level_3.py
0 → 100644
View file @
cd9c28e0
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import
os
from
pathlib
import
Path
import
pytest
import
torch.autograd
import
torch.multiprocessing
as
mp
import
colossalai
import
torch
from
colossalai.core
import
global_context
as
gpc
from
colossalai.builder
import
build_model
from
colossalai.logging
import
get_dist_logger
from
colossalai.utils
import
get_dataloader
from
colossalai.nn.layer._parallel_utilities
import
_gather
from
colossalai.nn
import
CrossEntropyLoss2D
from
torchvision
import
transforms
from
torchvision.datasets
import
CIFAR10
from
functools
import
partial
from
components
import
*
# Launch configuration for the ZeRO level-3 ViT test: 2D tensor parallelism
# over 4 devices, no pipeline parallelism, mixed precision disabled.
CONFIG = {
    'parallel': {
        'pipeline': {'size': 1},
        'tensor': {'size': 4, 'mode': '2d'},
    },
    'fp16': {'mode': None},
    'zero': {'level': 3},
}
def train_epoch(engine, train_dataloader):
    """Run one full epoch through *train_dataloader* and return the mean loss.

    Args:
        engine: training engine exposing ``train()`` and ``step(data_iter)``;
            ``step`` is expected to return ``(output, label, loss)``.
        train_dataloader: sized, iterable source of training batches.

    Returns:
        The average loss over the epoch as a plain (numpy) scalar.
    """
    engine.train()
    steps = len(train_dataloader)
    batch_iter = iter(train_dataloader)
    total_loss = 0
    for _ in range(steps):
        _, _, loss = engine.step(batch_iter)
        # detach + move to CPU so we accumulate a host-side scalar
        total_loss += loss.detach().cpu().numpy()
    return total_loss / steps
def run_2d_parallel_vision_transformer_level_3(rank, world_size):
    """Worker process: run one ZeRO level-3 training step on a 2D tensor-parallel ViT.

    Args:
        rank: rank of this process within the distributed group.
        world_size: total number of processes spawned.
    """
    # bring up the distributed environment (single node, NCCL backend)
    colossalai.launch(config=CONFIG,
                      rank=rank,
                      world_size=world_size,
                      host='localhost',
                      port=29951,
                      backend='nccl')

    # build model
    # NOTE(review): model_cfg is pulled in via `from components import *` —
    # confirm it defines the ViT model configuration.
    model = build_model(model_cfg)
    model.build_from_cfg()

    # build dataloader
    # dataset root is taken from the DATA environment variable
    train_dataset = CIFAR10(
        root=Path(os.environ['DATA']),
        download=True,
        transform=transforms.Compose(
            [
                transforms.Resize(size=(IMG_SIZE, IMG_SIZE)),
                transforms.ToTensor(),
                transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
            ]
        )
    )
    train_dataloader = get_dataloader(dataset=train_dataset,
                                      shuffle=True,
                                      batch_size=BATCH_SIZE,
                                      pin_memory=True,
                                      drop_last=True)

    # build optimizer and loss
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    # 2D-parallel-aware cross entropy, matching the '2d' tensor mode in CONFIG
    criterion = CrossEntropyLoss2D()

    # wrap everything into a colossalai engine
    engine, train_dataloader, *args = colossalai.initialize(model=model,
                                                            optimizer=optimizer,
                                                            criterion=criterion,
                                                            train_dataloader=train_dataloader)
    logger = get_dist_logger()
    logger.info('start training')

    # run exactly one training step as a smoke test
    engine.train()
    for img, label in train_dataloader:
        engine.zero_grad()
        img = img.cuda()
        label = label.cuda()
        out = engine(img)
        loss = engine.criterion(out, label)
        engine.backward(loss)
        engine.step()
        break

    # tear down the process group and release cached GPU memory
    gpc.destroy()
    torch.cuda.empty_cache()
@pytest.mark.dist
@pytest.mark.skip("Level 3 has unknown bug so skip this test for now")
def test_3d_vit_zero_level_3():
    """Spawn 8 worker processes for the ZeRO level-3 2D-parallel ViT test.

    NOTE(review): the name says "3d" but it spawns the 2D-parallel runner —
    likely a typo; kept as-is because the name is referenced by the main guard.
    """
    nprocs = 8
    worker = partial(run_2d_parallel_vision_transformer_level_3,
                     world_size=nprocs)
    mp.spawn(worker, nprocs=nprocs)
# Allow running this test file directly, without pytest.
if __name__ == '__main__':
    test_3d_vit_zero_level_3()
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment