Unverified Commit cd9c28e0 authored by Frank Lee, committed by GitHub

added CI for unit testing (#69)

parent 45355a62
#!/bin/bash
test_file="test_zero.py"
python $test_file --rank $SLURM_PROCID --world_size $SLURM_NPROCS --host $HOST --port 29500
\ No newline at end of file
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import os
import pytest
import torch
import torch.multiprocessing as mp
from pathlib import Path
import colossalai
from colossalai.core import global_context as gpc
from colossalai.utils import get_dataloader
from torchvision import transforms
from torchvision.models import resnet18
from torchvision.datasets import CIFAR10
from functools import partial
BATCH_SIZE = 16
IMG_SIZE = 224
CONFIG = dict(
    fp16=dict(
        mode=None,
    ),
    zero=dict(
        level=2,
        cpu_offload=True,
        verbose=False,
    ),
    parallel=dict(
        pipeline=dict(size=1),
        tensor=dict(size=1, mode=None)
    )
)
def run_dist(rank, world_size):
    colossalai.launch(config=CONFIG,
                      rank=rank,
                      world_size=world_size,
                      host='localhost',
                      port=29940,
                      backend='nccl')

    # build model
    model = resnet18(num_classes=10)

    # build dataloader
    train_dataset = CIFAR10(
        root=Path(os.environ['DATA']),
        download=True,
        transform=transforms.Compose(
            [
                transforms.Resize(size=(IMG_SIZE, IMG_SIZE)),
                transforms.ToTensor(),
                transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
            ]
        )
    )
    train_dataloader = get_dataloader(dataset=train_dataset,
                                      shuffle=True,
                                      batch_size=BATCH_SIZE,
                                      pin_memory=True,
                                      drop_last=True)

    # build optimizer and loss
    # optimizer = build_optimizer(global_context.config.optimizer, model)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = torch.nn.CrossEntropyLoss()

    engine, train_dataloader, *args = colossalai.initialize(model=model,
                                                            optimizer=optimizer,
                                                            criterion=criterion,
                                                            train_dataloader=train_dataloader)

    # train
    model.train()
    for idx, (data, label) in enumerate(train_dataloader):
        engine.zero_grad()
        data = data.cuda()
        label = label.cuda()
        output = engine(data)
        loss = engine.criterion(output, label)
        engine.backward(loss)
        engine.step()
        break

    gpc.destroy()
    torch.cuda.empty_cache()


@pytest.mark.dist
def test_zero_level_2():
    world_size = 4
    run_func = partial(run_dist, world_size=world_size)
    mp.spawn(run_func, nprocs=world_size)


if __name__ == '__main__':
    test_zero_level_2()
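For orientation, all of the updated tests share the same launch pattern: the pytest entry point binds world_size with functools.partial and hands the worker to torch.multiprocessing.spawn, which supplies the process index (used as the distributed rank) as the worker's first argument. Below is a minimal, self-contained sketch of that pattern; the worker body is an illustrative stand-in and not part of the commit (the real workers call colossalai.launch and train a model).

from functools import partial

import torch.multiprocessing as mp


def worker(rank, world_size):
    # Stand-in for run_dist: each spawned process would call
    # colossalai.launch(config=..., rank=rank, world_size=world_size, ...)
    print(f'worker {rank}/{world_size} started')


if __name__ == '__main__':
    world_size = 4
    # mp.spawn calls run_func(i) for i in range(nprocs);
    # partial fixes world_size so only the rank is left to fill in.
    run_func = partial(worker, world_size=world_size)
    mp.spawn(run_func, nprocs=world_size)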
@@ -4,36 +4,25 @@
 import os
 import pytest
 import torch
+import torch.multiprocessing as mp
 from pathlib import Path

 import colossalai
-from colossalai.initialize import get_default_parser
 from colossalai.core import global_context as gpc
 from colossalai.utils import get_dataloader
 from torchvision import transforms
 from torchvision.models import resnet18
 from torchvision.datasets import CIFAR10
+from functools import partial

-BATCH_SIZE = 128
+BATCH_SIZE = 16
 IMG_SIZE = 224
-NUM_CLS = 1000

 CONFIG = dict(
     fp16=dict(
         mode=None,
     ),
     zero=dict(
-        # ==============
-        # level 2 config
-        # ==============
-        # level=2,
-        # cpu_offload=True,
-        # verbose=False,
-        # ==============
-        # level 3 config
-        # ==============
         level=3,
         verbose=False,
         offload_optimizer_config=dict(
@@ -57,16 +46,13 @@ CONFIG = dict(
 )


-def run_dist():
-    parser = get_default_parser()
-    args = parser.parse_args()
+def run_dist(rank, world_size):
     colossalai.launch(config=CONFIG,
-                      rank=args.rank,
-                      world_size=args.world_size,
-                      host=args.host,
-                      port=args.port,
-                      backend=args.backend)
+                      rank=rank,
+                      world_size=world_size,
+                      host='localhost',
+                      port=29941,
+                      backend='nccl')

     # build model
     model = resnet18(num_classes=10)
@@ -86,7 +72,6 @@ def run_dist():
     train_dataloader = get_dataloader(dataset=train_dataset,
                                       shuffle=True,
                                       batch_size=BATCH_SIZE,
-                                      num_workers=1,
                                       pin_memory=True,
                                       drop_last=True)
@@ -104,22 +89,27 @@ def run_dist():
     model.train()
     for idx, (data, label) in enumerate(train_dataloader):
         engine.zero_grad()
-        data = data.cuda()
+        data = data.cuda().half()
         label = label.cuda()
-        output = engine(data)
+        output = engine(data).float()
         loss = engine.criterion(output, label)
         engine.backward(loss)
         engine.step()
         break

+    gpc.destroy()
+    torch.cuda.empty_cache()
+

-@pytest.mark.skip("This test should be invoked manually using the script provided")
 @pytest.mark.dist
-def test_zero():
-    run_dist()
+@pytest.mark.skip("Level 3 has unknown bug so skip this test for now")
+def test_zero_level_3():
+    world_size = 4
+    run_func = partial(run_dist, world_size=world_size)
+    mp.spawn(run_func, nprocs=world_size)


 if __name__ == '__main__':
-    test_zero()
+    test_zero_level_3()
parallel = dict(
    pipeline=dict(size=1),
    tensor=dict(size=4, mode='2d'),
)
fp16 = dict(
    mode=None,
)
zero = dict(
    level=2
)
parallel = dict(
    pipeline=dict(size=1),
    tensor=dict(size=4, mode='2d'),
)
fp16 = dict(
    mode=None,
)
zero = dict(
    level=3
)
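The two module-level snippets above appear to be the standalone config files that the previous tests resolved as configs/vit_2d_zero{level}.py and passed to colossalai.launch by path; the updated tests inline the same settings as a dict instead. A minimal sketch of the two equivalent call styles follows; the path below is hypothetical and shown only for illustration.

from pathlib import Path

import colossalai

# Old style: point colossalai.launch at a config module on disk.
# Hypothetical path; the removed tests built it from os.environ['LEVEL'].
CONFIG_PATH = Path(__file__).parent.joinpath('configs/vit_2d_zero2.py')

# New style: inline the same settings as a plain dict.
CONFIG = dict(
    parallel=dict(pipeline=dict(size=1), tensor=dict(size=4, mode='2d')),
    fp16=dict(mode=None),
    zero=dict(level=2),
)

# Either value is passed as `config=` when launching, e.g.
# colossalai.launch(config=CONFIG, rank=rank, world_size=world_size,
#                   host='localhost', port=29950, backend='nccl')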
#!/usr/bin/env sh
test_file=$1
python $test_file --rank $SLURM_PROCID --world_size $SLURM_NPROCS --host $HOST --port 29500
\ No newline at end of file
@@ -6,12 +6,11 @@ from pathlib import Path

 import pytest
 import torch.autograd
+import torch.multiprocessing as mp
 import colossalai
 import torch
-from colossalai.initialize import get_default_parser
 from colossalai.builder import build_model
-from colossalai.context.parallel_mode import ParallelMode
 from colossalai.core import global_context as gpc
 from colossalai.logging import get_dist_logger
 from colossalai.utils import get_dataloader
@@ -20,9 +19,20 @@ from colossalai.nn import CrossEntropyLoss2D
 from torchvision import transforms
 from torchvision.datasets import CIFAR10
 from components import *
+from functools import partial

-level = os.environ['LEVEL']
-CONFIG_PATH = Path(__file__).parent.parent.joinpath(f'configs/vit_2d_zero{level}.py')
+CONFIG = dict(
+    parallel=dict(
+        pipeline=dict(size=1),
+        tensor=dict(size=4, mode='2d'),
+    ),
+    fp16=dict(
+        mode=None,
+    ),
+    zero=dict(
+        level=2
+    )
+)


 def train_epoch(engine, train_dataloader):
@@ -37,18 +47,14 @@ def train_epoch(engine, train_dataloader):
     return avg_loss


-@pytest.mark.dist
-@pytest.mark.skip("This test should be invoked by test.sh in the same folder as it runs on multiple gpus")
-def test_2d_parallel_vision_transformer():
-    parser = get_default_parser()
-    args = parser.parse_args()
+def run_2d_parallel_vision_transformer_level_2(rank, world_size):
     colossalai.launch(
-        config=CONFIG_PATH,
-        rank=args.rank,
-        world_size=args.world_size,
-        host=args.host,
-        port=args.port,
-        backend=args.backend
+        config=CONFIG,
+        rank=rank,
+        world_size=world_size,
+        host='localhost',
+        port=29950,
+        backend='nccl'
     )

     # build model
@@ -70,7 +76,6 @@ def test_2d_parallel_vision_transformer():
     train_dataloader = get_dataloader(dataset=train_dataset,
                                       shuffle=True,
                                       batch_size=BATCH_SIZE,
-                                      num_workers=1,
                                       pin_memory=True,
                                       drop_last=True)
@@ -97,6 +102,16 @@ def test_2d_parallel_vision_transformer():
         engine.step()
         break

+    gpc.destroy()
+    torch.cuda.empty_cache()
+
+
+@pytest.mark.dist
+def test_2d_vit_zero_level_2():
+    world_size = 8
+    run_func = partial(run_2d_parallel_vision_transformer_level_2, world_size=world_size)
+    mp.spawn(run_func, nprocs=world_size)


 if __name__ == '__main__':
-    test_2d_parallel_vision_transformer()
+    test_2d_vit_zero_level_2()
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import os
from pathlib import Path
import pytest
import torch.autograd
import torch.multiprocessing as mp
import colossalai
import torch
from colossalai.core import global_context as gpc
from colossalai.builder import build_model
from colossalai.logging import get_dist_logger
from colossalai.utils import get_dataloader
from colossalai.nn.layer._parallel_utilities import _gather
from colossalai.nn import CrossEntropyLoss2D
from torchvision import transforms
from torchvision.datasets import CIFAR10
from functools import partial
from components import *
CONFIG = dict(
    parallel=dict(
        pipeline=dict(size=1),
        tensor=dict(size=4, mode='2d'),
    ),
    fp16=dict(
        mode=None,
    ),
    zero=dict(
        level=3
    )
)
def train_epoch(engine, train_dataloader):
    engine.train()
    accumulated_loss = 0
    num_steps = len(train_dataloader)
    data_iter = iter(train_dataloader)
    for i in range(num_steps):
        output, label, loss = engine.step(data_iter)
        accumulated_loss += loss.detach().cpu().numpy()
    avg_loss = accumulated_loss / num_steps
    return avg_loss


def run_2d_parallel_vision_transformer_level_3(rank, world_size):
    colossalai.launch(
        config=CONFIG,
        rank=rank,
        world_size=world_size,
        host='localhost',
        port=29951,
        backend='nccl'
    )

    # build model
    model = build_model(model_cfg)
    model.build_from_cfg()

    # build dataloader
    train_dataset = CIFAR10(
        root=Path(os.environ['DATA']),
        download=True,
        transform=transforms.Compose(
            [
                transforms.Resize(size=(IMG_SIZE, IMG_SIZE)),
                transforms.ToTensor(),
                transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
            ]
        )
    )
    train_dataloader = get_dataloader(dataset=train_dataset,
                                      shuffle=True,
                                      batch_size=BATCH_SIZE,
                                      pin_memory=True,
                                      drop_last=True)

    # build optimizer and loss
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = CrossEntropyLoss2D()

    engine, train_dataloader, *args = colossalai.initialize(model=model,
                                                            optimizer=optimizer,
                                                            criterion=criterion,
                                                            train_dataloader=train_dataloader)
    logger = get_dist_logger()
    logger.info('start training')
    engine.train()

    for img, label in train_dataloader:
        engine.zero_grad()
        img = img.cuda()
        label = label.cuda()
        out = engine(img)
        loss = engine.criterion(out, label)
        engine.backward(loss)
        engine.step()
        break

    gpc.destroy()
    torch.cuda.empty_cache()


@pytest.mark.dist
@pytest.mark.skip("Level 3 has unknown bug so skip this test for now")
def test_3d_vit_zero_level_3():
    world_size = 8
    run_func = partial(run_2d_parallel_vision_transformer_level_3, world_size=world_size)
    mp.spawn(run_func, nprocs=world_size)


if __name__ == '__main__':
    test_3d_vit_zero_level_3()