Unverified Commit cd9c28e0 authored by Frank Lee's avatar Frank Lee Committed by GitHub
Browse files

added CI for unit testing (#69)

parent 45355a62
#!/bin/bash
test_file="test_zero.py"
python $test_file --rank $SLURM_PROCID --world_size $SLURM_NPROCS --host $HOST --port 29500
\ No newline at end of file
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import os
import pytest
import torch
import torch.multiprocessing as mp
from pathlib import Path
import colossalai
from colossalai.core import global_context as gpc
from colossalai.utils import get_dataloader
from torchvision import transforms
from torchvision.models import resnet18
from torchvision.datasets import CIFAR10
from functools import partial
BATCH_SIZE = 16
IMG_SIZE = 224
CONFIG = dict(
fp16=dict(
mode=None,
),
zero=dict(
level=2,
cpu_offload=True,
verbose=False,
),
parallel=dict(
pipeline=dict(size=1),
tensor=dict(size=1, mode=None)
)
)
def run_dist(rank, world_size):
colossalai.launch(config=CONFIG,
rank=rank,
world_size=world_size,
host='localhost',
port=29940,
backend='nccl')
# build model
model = resnet18(num_classes=10)
# build dataloader# build dataloaders
train_dataset = CIFAR10(
root=Path(os.environ['DATA']),
download=True,
transform=transforms.Compose(
[
transforms.Resize(size=(IMG_SIZE, IMG_SIZE)),
transforms.ToTensor(),
transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
]
)
)
train_dataloader = get_dataloader(dataset=train_dataset,
shuffle=True,
batch_size=BATCH_SIZE,
pin_memory=True,
drop_last=True)
# build optimizer and loss
# optimizer = build_optimizer(global_context.config.optimizer, model)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = torch.nn.CrossEntropyLoss()
engine, train_dataloader, *args = colossalai.initialize(model=model,
optimizer=optimizer,
criterion=criterion,
train_dataloader=train_dataloader)
# train
model.train()
for idx, (data, label) in enumerate(train_dataloader):
engine.zero_grad()
data = data.cuda()
label = label.cuda()
output = engine(data)
loss = engine.criterion(output, label)
engine.backward(loss)
engine.step()
break
gpc.destroy()
torch.cuda.empty_cache()
@pytest.mark.dist
def test_zero_level_2():
world_size = 4
run_func = partial(run_dist, world_size=world_size)
mp.spawn(run_func, nprocs=world_size)
if __name__ == '__main__':
test_zero_level_2()
......@@ -4,36 +4,25 @@
import os
import pytest
import torch
import torch.multiprocessing as mp
from pathlib import Path
import colossalai
from colossalai.initialize import get_default_parser
from colossalai.core import global_context as gpc
from colossalai.utils import get_dataloader
from torchvision import transforms
from torchvision.models import resnet18
from torchvision.datasets import CIFAR10
from functools import partial
BATCH_SIZE = 128
BATCH_SIZE = 16
IMG_SIZE = 224
NUM_CLS = 1000
CONFIG = dict(
fp16=dict(
mode=None,
),
zero=dict(
# ==============
# level 2 config
# ==============
# level=2,
# cpu_offload=True,
# verbose=False,
# ==============
# level 3 config
# ==============
level=3,
verbose=False,
offload_optimizer_config=dict(
......@@ -57,16 +46,13 @@ CONFIG = dict(
)
def run_dist():
parser = get_default_parser()
args = parser.parse_args()
def run_dist(rank, world_size):
colossalai.launch(config=CONFIG,
rank=args.rank,
world_size=args.world_size,
host=args.host,
port=args.port,
backend=args.backend)
rank=rank,
world_size=world_size,
host='localhost',
port=29941,
backend='nccl')
# build model
model = resnet18(num_classes=10)
......@@ -86,7 +72,6 @@ def run_dist():
train_dataloader = get_dataloader(dataset=train_dataset,
shuffle=True,
batch_size=BATCH_SIZE,
num_workers=1,
pin_memory=True,
drop_last=True)
......@@ -104,22 +89,27 @@ def run_dist():
model.train()
for idx, (data, label) in enumerate(train_dataloader):
engine.zero_grad()
data = data.cuda()
data = data.cuda().half()
label = label.cuda()
output = engine(data)
output = engine(data).float()
loss = engine.criterion(output, label)
engine.backward(loss)
engine.step()
break
gpc.destroy()
torch.cuda.empty_cache()
@pytest.mark.skip("This test should be invoked manually using the script provided")
@pytest.mark.dist
def test_zero():
run_dist()
@pytest.mark.skip("Level 3 has unknown bug so skip this test for now")
def test_zero_level_3():
world_size = 4
run_func = partial(run_dist, world_size=world_size)
mp.spawn(run_func, nprocs=world_size)
if __name__ == '__main__':
test_zero()
test_zero_level_3()
parallel = dict(
pipeline=dict(size=1),
tensor=dict(size=4, mode='2d'),
)
fp16 = dict(
mode=None,
)
zero = dict(
level=2
)
parallel = dict(
pipeline=dict(size=1),
tensor=dict(size=4, mode='2d'),
)
fp16 = dict(
mode=None,
)
zero = dict(
level=3
)
#!/usr/bin/env sh
test_file=$1
python $test_file --rank $SLURM_PROCID --world_size $SLURM_NPROCS --host $HOST --port 29500
\ No newline at end of file
......@@ -6,12 +6,11 @@ from pathlib import Path
import pytest
import torch.autograd
import torch.multiprocessing as mp
import colossalai
import torch
from colossalai.initialize import get_default_parser
from colossalai.builder import build_model
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.logging import get_dist_logger
from colossalai.utils import get_dataloader
......@@ -20,9 +19,20 @@ from colossalai.nn import CrossEntropyLoss2D
from torchvision import transforms
from torchvision.datasets import CIFAR10
from components import *
level = os.environ['LEVEL']
CONFIG_PATH = Path(__file__).parent.parent.joinpath(f'configs/vit_2d_zero{level}.py')
from functools import partial
CONFIG = dict(
parallel=dict(
pipeline=dict(size=1),
tensor=dict(size=4, mode='2d'),
),
fp16=dict(
mode=None,
),
zero=dict(
level=2
)
)
def train_epoch(engine, train_dataloader):
......@@ -37,18 +47,14 @@ def train_epoch(engine, train_dataloader):
return avg_loss
@pytest.mark.dist
@pytest.mark.skip("This test should be invoked by test.sh in the same folder as it runs on multiple gpus")
def test_2d_parallel_vision_transformer():
parser = get_default_parser()
args = parser.parse_args()
def run_2d_parallel_vision_transformer_level_2(rank, world_size):
colossalai.launch(
config=CONFIG_PATH,
rank=args.rank,
world_size=args.world_size,
host=args.host,
port=args.port,
backend=args.backend
config=CONFIG,
rank=rank,
world_size=world_size,
host='localhost',
port=29950,
backend='nccl'
)
# build model
......@@ -70,7 +76,6 @@ def test_2d_parallel_vision_transformer():
train_dataloader = get_dataloader(dataset=train_dataset,
shuffle=True,
batch_size=BATCH_SIZE,
num_workers=1,
pin_memory=True,
drop_last=True)
......@@ -97,6 +102,16 @@ def test_2d_parallel_vision_transformer():
engine.step()
break
gpc.destroy()
torch.cuda.empty_cache()
@pytest.mark.dist
def test_2d_vit_zero_level_2():
world_size = 8
run_func = partial(run_2d_parallel_vision_transformer_level_2, world_size=world_size)
mp.spawn(run_func, nprocs=world_size)
if __name__ == '__main__':
test_2d_parallel_vision_transformer()
test_2d_vit_zero_level_2()
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import os
from pathlib import Path
import pytest
import torch.autograd
import torch.multiprocessing as mp
import colossalai
import torch
from colossalai.core import global_context as gpc
from colossalai.builder import build_model
from colossalai.logging import get_dist_logger
from colossalai.utils import get_dataloader
from colossalai.nn.layer._parallel_utilities import _gather
from colossalai.nn import CrossEntropyLoss2D
from torchvision import transforms
from torchvision.datasets import CIFAR10
from functools import partial
from components import *
CONFIG = dict(
parallel=dict(
pipeline=dict(size=1),
tensor=dict(size=4, mode='2d'),
),
fp16=dict(
mode=None,
),
zero=dict(
level=3
)
)
def train_epoch(engine, train_dataloader):
engine.train()
accumulated_loss = 0
num_steps = len(train_dataloader)
data_iter = iter(train_dataloader)
for i in range(num_steps):
output, label, loss = engine.step(data_iter)
accumulated_loss += loss.detach().cpu().numpy()
avg_loss = accumulated_loss / num_steps
return avg_loss
def run_2d_parallel_vision_transformer_level_3(rank, world_size):
colossalai.launch(
config=CONFIG,
rank=rank,
world_size=world_size,
host='localhost',
port=29951,
backend='nccl'
)
# build model
model = build_model(model_cfg)
model.build_from_cfg()
# build dataloader# build dataloaders
train_dataset = CIFAR10(
root=Path(os.environ['DATA']),
download=True,
transform=transforms.Compose(
[
transforms.Resize(size=(IMG_SIZE, IMG_SIZE)),
transforms.ToTensor(),
transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
]
)
)
train_dataloader = get_dataloader(dataset=train_dataset,
shuffle=True,
batch_size=BATCH_SIZE,
pin_memory=True,
drop_last=True)
# build optimizer and loss
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = CrossEntropyLoss2D()
engine, train_dataloader, *args = colossalai.initialize(model=model,
optimizer=optimizer,
criterion=criterion,
train_dataloader=train_dataloader)
logger = get_dist_logger()
logger.info('start training')
engine.train()
for img, label in train_dataloader:
engine.zero_grad()
img = img.cuda()
label = label.cuda()
out = engine(img)
loss = engine.criterion(out, label)
engine.backward(loss)
engine.step()
break
gpc.destroy()
torch.cuda.empty_cache()
@pytest.mark.dist
@pytest.mark.skip("Level 3 has unknown bug so skip this test for now")
def test_3d_vit_zero_level_3():
world_size = 8
run_func = partial(run_2d_parallel_vision_transformer_level_3, world_size=world_size)
mp.spawn(run_func, nprocs=world_size)
if __name__ == '__main__':
test_3d_vit_zero_level_3()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment