"applications/ColossalQA/colossalqa/local/__init__.py" did not exist on "b0ce5a10326912961f0bc07cbbd250bab7b9c399"
Commit 404ecbdc authored by zbian's avatar zbian
Browse files

Migrated project

parent 2ebaefc5
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import pytest
import torch
import torch.nn.functional as F
from torch.utils.checkpoint import checkpoint
from colossalai.context.parallel_mode import ParallelMode
from colossalai.context.random import add_seed, seed, set_mode
from colossalai.utils import checkpoint
def forward(x, weight):
    """Matrix-multiply ``x`` by ``weight``, then apply dropout under the
    data-parallel RNG seed so the mask is reproducible per parallel mode."""
    product = x @ weight
    with seed(ParallelMode.DATA):
        dropped = F.dropout(product, p=0.4, training=True)
    return dropped
@pytest.mark.gpu
def test_activation_checkpointing():
    """Checkpointed forward/backward must reproduce the gradients of a plain run.

    The test snapshots the CUDA RNG state of both the GLOBAL and DATA parallel
    modes, runs a normal forward/backward, then restores both RNG states and
    repeats the computation through activation checkpointing.  Because the
    dropout inside ``forward`` draws from the DATA-mode seed, identical RNG
    states are required for the two runs to produce the same mask.
    """
    # Register per-mode CUDA seeds and record each mode's RNG state so it can
    # be restored before the checkpointed run.
    add_seed(ParallelMode.GLOBAL, 1024)
    set_mode(ParallelMode.GLOBAL)
    global_cuda_rng_state = torch.cuda.get_rng_state()
    add_seed(ParallelMode.DATA, 1026)
    set_mode(ParallelMode.DATA)
    data_parallel_cuda_rng_state = torch.cuda.get_rng_state()
    set_mode(ParallelMode.GLOBAL)

    # normal (non-checkpointed) run
    # NOTE(review): `.cuda()` on a requires_grad leaf returns a non-leaf
    # tensor; retain_grad() keeps its .grad populated for the final check.
    data = torch.rand(2, 2, requires_grad=True).cuda()
    data.retain_grad()
    weight = torch.rand(2, 4, requires_grad=True).cuda()
    # Independent copies so the second run does not reuse accumulated grads.
    data_ = data.clone().detach()
    data_.requires_grad = True
    data_.retain_grad()
    weight_ = weight.clone().detach()
    weight_.requires_grad = True
    out = forward(data, weight)
    loss = out.sum()
    loss.backward()

    # checkpointed run: restore both RNG states first so dropout masks match
    set_mode(ParallelMode.GLOBAL)
    torch.cuda.set_rng_state(global_cuda_rng_state)
    set_mode(ParallelMode.DATA)
    torch.cuda.set_rng_state(data_parallel_cuda_rng_state)
    set_mode(ParallelMode.GLOBAL)
    out = checkpoint(forward, data_, weight_)
    loss = out.sum()
    loss.backward()

    # Only the input gradient is compared; the weight gradient is not checked.
    assert torch.all(data.grad == data_.grad), 'Gradient of the input does not match'


if __name__ == '__main__':
    test_activation_checkpointing()
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import os
from pathlib import Path

# Global hyper-parameters for the ZeRO/ResNet smoke-test config.
BATCH_SIZE = 128
IMG_SIZE = 224
NUM_CLS = 1000

# resnet 18
# NOTE(review): layers=[3, 4, 6, 3] with bottleneck blocks is the classic
# ResNet-50 layout, not ResNet-18 ([2, 2, 2, 2] basic blocks) — confirm intent.
model = dict(
    type='VanillaResNet',
    block_type='ResNetBottleneck',
    layers=[3, 4, 6, 3],
    num_cls=NUM_CLS
)

# Training data: CIFAR10 (despite the ImageNet-sized IMG_SIZE/NUM_CLS above);
# the dataset root comes from the $DATA environment variable.
train_data = dict(
    dataset=dict(
        type='CIFAR10Dataset',
        root=Path(os.environ['DATA']),
        transform_pipeline=[
            dict(type='RandomResizedCrop', size=IMG_SIZE),
            dict(type='RandomHorizontalFlip'),
            dict(type='ToTensor'),
            dict(type='Normalize', mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
        ]
    ),
    dataloader=dict(
        # NOTE(review): hard-coded 64 differs from BATCH_SIZE above — confirm.
        batch_size=64,
        pin_memory=True,
        num_workers=4,
        sampler=dict(
            type='DataParallelSampler',
            shuffle=True,
        )
    )
)

# Evaluation data: deterministic resize, no augmentation.
test_data = dict(
    dataset=dict(
        type='CIFAR10Dataset',
        root=Path(os.environ['DATA']),
        train=False,
        transform_pipeline=[
            dict(type='Resize', size=(IMG_SIZE, IMG_SIZE)),
            dict(type='ToTensor'),
            dict(type='Normalize', mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
        ]
    ),
    dataloader=dict(
        batch_size=BATCH_SIZE,
        pin_memory=True,
        num_workers=4,
    )
)

# Plain data parallelism; pipeline/tensor parallel degrees are 1 (disabled).
dist_initializer = [
    dict(type='DataParallelInitializer'),
]
parallelization = dict(
    pipeline=1,
    tensor=1,
    sequence=-1
)
optimizer = dict(
    type='Adam',
    lr=0.01
)
loss = dict(
    type='CrossEntropyLoss'
)
trainer = dict(
    max_epochs=5,
    max_iters=1000
)
amp = dict(
    fp16=None,
)
# ZeRO optimizer level consumed by run_dist() in the accompanying test (1-3).
level = 2
parallel = dict(
    pipeline=dict(size=1),
    tensor=dict(size=1, mode=None)
)
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import os.path as osp
import pytest
import torch
from torch.utils.data import DataLoader
import colossalai
from colossalai.builder import build_dataset, build_loss, build_data_sampler, build_model
from colossalai.core import global_context
from colossalai.engine.gradient_handler import DataParallelGradientHandler
from colossalai.nn.optimizer import ZeroRedundancyOptimizer_Level_1, ZeroRedundancyOptimizer_Level_3, \
ZeroRedundancyOptimizer_Level_2
from colossalai.utils import print_rank_0
# Resolve the config file that lives next to this test module.
DIR_PATH = osp.dirname(osp.abspath(__file__))
CONFIG_PATH = osp.join(DIR_PATH, 'config.py')
def run_dist():
    """End-to-end ZeRO training loop on ResNet/CIFAR10.

    Every setting (model, data pipeline, ZeRO ``level``) is read from the
    global config referenced by ``CONFIG_PATH``.  Intended to be launched
    manually on a multi-GPU setup, not under plain pytest.
    """
    colossalai.init_dist(CONFIG_PATH)

    # build resnet model
    model = build_model(global_context.config.model)
    model.build_from_cfg()
    model = model.cuda()

    # ZeRO levels above 1 train the model in fp16.
    level = global_context.config.level
    if level > 1:
        model = model.half()

    # test init cuda memory
    _ = torch.rand(1).cuda()
    torch.cuda.synchronize()
    max_alloc = torch.cuda.max_memory_allocated()
    max_reserved = torch.cuda.max_memory_reserved()
    print(f'before run: max_allocation = {max_alloc}, max_reserved = {max_reserved}')

    # build dataloader; the sampler config (if any) is popped so the remaining
    # kwargs can be forwarded straight to DataLoader.
    train_dataset = build_dataset(global_context.config.train_data.dataset)
    sampler_cfg = global_context.config.train_data.dataloader.pop('sampler', None)
    if sampler_cfg is None:
        train_dataloader = DataLoader(dataset=train_dataset, **global_context.config.train_data.dataloader)
    else:
        sampler = build_data_sampler(sampler_cfg, train_dataset)
        train_dataloader = DataLoader(dataset=train_dataset, sampler=sampler,
                                      **global_context.config.train_data.dataloader)
    test_dataset = build_dataset(global_context.config.test_data.dataset)
    test_dataloader = DataLoader(dataset=test_dataset, **global_context.config.test_data.dataloader)

    # build optimizer and loss; wrap the base Adam in the ZeRO level requested.
    # optimizer = build_optimizer(global_context.config.optimizer, model)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    if level == 1:
        zero_optim = ZeroRedundancyOptimizer_Level_1(init_optimizer=optimizer, verbose=False)
    elif level == 2:
        zero_optim = ZeroRedundancyOptimizer_Level_2(init_optimizer=optimizer, cpu_offload=True, verbose=False)
    elif level == 3:
        zero_optim = ZeroRedundancyOptimizer_Level_3(init_optimizer=optimizer,
                                                     module=model,
                                                     verbose=False,
                                                     offload_optimizer_config=dict(
                                                         device='cpu',
                                                         pin_memory=True,
                                                         buffer_count=5,
                                                         fast_init=False
                                                     ),
                                                     offload_param_config=dict(
                                                         device='cpu',
                                                         pin_memory=True,
                                                         buffer_count=5,
                                                         buffer_size=1e8,
                                                         max_in_cpu=1e9
                                                     ))
    else:
        # Previously an unrecognised level fell through and crashed later with
        # an UnboundLocalError on `zero_optim`; fail fast with a clear message.
        raise ValueError(f'unsupported ZeRO level: {level} (expected 1, 2 or 3)')
    loss_fn = build_loss(global_context.config.loss)
    gradient_handler = DataParallelGradientHandler(model, zero_optim)

    # train
    # NOTE(review): epoch count is hard-coded; `trainer.max_epochs` in the
    # config is ignored here — confirm which one is authoritative.
    for epoch in range(100):
        model.train()
        avg_train_loss = 0
        train_iter = 0
        for idx, (data, label) in enumerate(train_dataloader):
            # NOTE(review): batches are indexed with [0] — presumably the
            # dataset yields nested tuples; verify against CIFAR10Dataset.
            data = data[0].cuda()
            label = label[0].cuda()
            if level > 1:
                data = data.half()
            output = model(data)
            loss = loss_fn(output[0], label)
            if level > 1:
                # fp16 levels: ZeRO owns the backward pass and grad reduction.
                zero_optim.backward(loss)
                zero_optim.overlapping_partition_gradients_reduce_epilogue()
            else:
                loss.backward()
                gradient_handler.handle_gradient()
            zero_optim.step()
            zero_optim.zero_grad()
            avg_train_loss += loss.detach().cpu().numpy()
            train_iter += 1
        print_rank_0(f'epoch: {epoch}, train loss: {avg_train_loss / train_iter}')

        # evaluate every other epoch
        if epoch % 2 == 0:
            model.eval()
            avg_eval_loss = 0
            correct = 0
            total = 0
            eval_iters = 0
            for idx, (data, label) in enumerate(test_dataloader):
                with torch.no_grad():
                    data = data[0].cuda()
                    label = label[0].cuda()
                    if level > 1:
                        data = data.half()
                    output = model(data)
                    loss = loss_fn(output[0], label)
                    avg_eval_loss += loss.detach().cpu().numpy()
                    preds = torch.argmax(output[0], dim=1)
                    total += data.size(0)
                    # Torch-native reduction instead of the Python builtin
                    # sum() iterating over the bool tensor element by element.
                    correct += (preds == label).sum().item()
                    eval_iters += 1
            print_rank_0(f'epoch: {epoch}, eval loss: {avg_eval_loss / eval_iters}, acc: {correct / total}')
@pytest.mark.skip("This test should be invoked manually using the script provided")
@pytest.mark.dist
def test_zero():
    """Entry point for the ZeRO training smoke test (manual, multi-GPU)."""
    run_dist()


if __name__ == '__main__':
    test_zero()
#!/bin/bash
# Launch the ZeRO test under SLURM: one python process per SLURM rank.
# Quote every expansion so paths/hosts with spaces or empty values do not
# silently split or vanish from the argument list.
test_file="test_zero.py"
python "$test_file" --local_rank "$SLURM_PROCID" --world_size "$SLURM_NPROCS" --host "$HOST" --port 29500
import os
from pathlib import Path
import torch

# ViT-on-CIFAR10 hyper-parameters for a 2D (SUMMA) tensor-parallel run.
BATCH_SIZE = 512
IMG_SIZE = 32
PATCH_SIZE = 4
DIM = 512
NUM_ATTENTION_HEADS = 8
SUMMA_DIM = 2
NUM_CLASSES = 10
DEPTH = 6

# Training data: CIFAR10 with standard augmentation; root comes from $DATA.
train_data = dict(
    dataset=dict(
        type='CIFAR10Dataset',
        root=Path(os.environ['DATA']),
        transform_pipeline=[
            dict(type='RandomCrop', size=IMG_SIZE, padding=4),
            dict(type='RandomHorizontalFlip'),
            dict(type='ToTensor'),
            dict(type='Normalize',
                 mean=[0.4914, 0.4822, 0.4465],
                 std=[0.2023, 0.1994, 0.2010]),
        ]
    ),
    dataloader=dict(
        batch_size=BATCH_SIZE,
        pin_memory=True,
        num_workers=4,
        shuffle=True
    )
)

# Evaluation data: resize + normalize only, no augmentation.
test_data = dict(
    dataset=dict(
        type='CIFAR10Dataset',
        root=Path(os.environ['DATA']),
        train=False,
        transform_pipeline=[
            dict(type='Resize', size=IMG_SIZE),
            dict(type='ToTensor'),
            dict(type='Normalize',
                 mean=[0.4914, 0.4822, 0.4465],
                 std=[0.2023, 0.1994, 0.2010]
                 ),
        ]
    ),
    dataloader=dict(
        batch_size=BATCH_SIZE,
        pin_memory=True,
        num_workers=4,
        shuffle=True
    )
)
# NOTE(review): this file previously defined `optimizer` twice; the first
# definition (a ZeroRedundancyOptimizer wrapping torch.optim.Adam) was dead
# code because it was immediately overwritten.  Only the effective definition
# is kept.
optimizer = dict(
    type='Adam',
    lr=0.001,
    weight_decay=0
)
# 2D-parallel cross-entropy loss.
loss = dict(
    type='CrossEntropyLoss2D',
)

# ViT assembled from per-component configs; each layer uses its 2D-parallel
# variant so activations are partitioned over the SUMMA mesh.
model = dict(
    type='VisionTransformerFromConfig',
    tensor_splitting_cfg=dict(
        type='ViTInputSplitter2D',
    ),
    embedding_cfg=dict(
        type='ViTPatchEmbedding2D',
        img_size=IMG_SIZE,
        patch_size=PATCH_SIZE,
        embed_dim=DIM,
    ),
    token_fusion_cfg=dict(
        type='ViTTokenFuser2D',
        img_size=IMG_SIZE,
        patch_size=PATCH_SIZE,
        embed_dim=DIM,
        drop_rate=0.1
    ),
    norm_cfg=dict(
        type='LayerNorm2D',
        normalized_shape=DIM,
        eps=1e-6,
    ),
    # Per-transformer-block sub-configs (repeated DEPTH times).
    block_cfg=dict(
        type='ViTBlock',
        attention_cfg=dict(
            type='ViTSelfAttention2D',
            hidden_size=DIM,
            num_attention_heads=NUM_ATTENTION_HEADS,
            attention_dropout_prob=0.,
            hidden_dropout_prob=0.1,
        ),
        droppath_cfg=dict(
            type='VanillaViTDropPath',
        ),
        mlp_cfg=dict(
            type='ViTMLP2D',
            in_features=DIM,
            dropout_prob=0.1,
            mlp_ratio=1
        ),
        norm_cfg=dict(
            type='LayerNorm2D',
            normalized_shape=DIM,
            eps=1e-6,
        ),
    ),
    head_cfg=dict(
        type='ViTHead2D',
        hidden_size=DIM,
        num_classes=NUM_CLASSES,
    ),
    embed_dim=DIM,
    depth=DEPTH,
    drop_path_rate=0.,
)

# 2x2 tensor-parallel mesh (size=4, mode='2d'); no pipeline parallelism.
parallel = dict(
    pipeline=dict(size=1),
    tensor=dict(size=4, mode='2d'),
)

# NOTE(review): mid-file import — kept in place to preserve file behavior.
from colossalai.engine import AMP_TYPE

# Parallel-mode AMP with an initial loss scale of 2**4.
fp16 = dict(
    mode=AMP_TYPE.PARALLEL,
    initial_scale=2 ** 4
)
#
# fp16 = dict(
#     mode=None,
# )
# both level 2 and 3 work
# zero = dict(
#     type='ZeroRedundancyOptimizer_Level_1',
# )

lr_scheduler = dict(
    type='LinearWarmupLR',
    warmup_epochs=5
)
num_epochs = 60
import os
from pathlib import Path

# ViT-on-CIFAR10 hyper-parameters; same model as the base vit_2d config but
# trained with ZeRO level 2 instead of parallel AMP.
BATCH_SIZE = 512
IMG_SIZE = 32
PATCH_SIZE = 4
DIM = 512
NUM_ATTENTION_HEADS = 8
SUMMA_DIM = 2
NUM_CLASSES = 10
DEPTH = 6

# Training data: CIFAR10 with standard augmentation; root comes from $DATA.
train_data = dict(
    dataset=dict(
        type='CIFAR10Dataset',
        root=Path(os.environ['DATA']),
        transform_pipeline=[
            dict(type='RandomCrop', size=IMG_SIZE, padding=4),
            dict(type='RandomHorizontalFlip'),
            dict(type='ToTensor'),
            dict(type='Normalize',
                 mean=[0.4914, 0.4822, 0.4465],
                 std=[0.2023, 0.1994, 0.2010]),
        ]
    ),
    dataloader=dict(
        batch_size=BATCH_SIZE,
        pin_memory=True,
        num_workers=4,
        shuffle=True
    )
)

# Evaluation data: resize + normalize only.
test_data = dict(
    dataset=dict(
        type='CIFAR10Dataset',
        root=Path(os.environ['DATA']),
        train=False,
        transform_pipeline=[
            dict(type='Resize', size=IMG_SIZE),
            dict(type='ToTensor'),
            dict(type='Normalize',
                 mean=[0.4914, 0.4822, 0.4465],
                 std=[0.2023, 0.1994, 0.2010]
                 ),
        ]
    ),
    dataloader=dict(
        batch_size=BATCH_SIZE,
        pin_memory=True,
        num_workers=4,
        shuffle=True
    )
)

optimizer = dict(
    type='Adam',
    lr=0.001,
    weight_decay=0
)

# 2D-parallel cross-entropy loss.
loss = dict(
    type='CrossEntropyLoss2D',
)

# ViT assembled from per-component 2D-parallel layer configs.
model = dict(
    type='VisionTransformerFromConfig',
    tensor_splitting_cfg=dict(
        type='ViTInputSplitter2D',
    ),
    embedding_cfg=dict(
        type='ViTPatchEmbedding2D',
        img_size=IMG_SIZE,
        patch_size=PATCH_SIZE,
        embed_dim=DIM,
    ),
    token_fusion_cfg=dict(
        type='ViTTokenFuser2D',
        img_size=IMG_SIZE,
        patch_size=PATCH_SIZE,
        embed_dim=DIM,
        drop_rate=0.1
    ),
    norm_cfg=dict(
        type='LayerNorm2D',
        normalized_shape=DIM,
        eps=1e-6,
    ),
    block_cfg=dict(
        type='ViTBlock',
        attention_cfg=dict(
            type='ViTSelfAttention2D',
            hidden_size=DIM,
            num_attention_heads=NUM_ATTENTION_HEADS,
            attention_dropout_prob=0.,
            hidden_dropout_prob=0.1,
        ),
        droppath_cfg=dict(
            type='VanillaViTDropPath',
        ),
        mlp_cfg=dict(
            type='ViTMLP2D',
            in_features=DIM,
            dropout_prob=0.1,
            mlp_ratio=1
        ),
        norm_cfg=dict(
            type='LayerNorm2D',
            normalized_shape=DIM,
            eps=1e-6,
        ),
    ),
    head_cfg=dict(
        type='ViTHead2D',
        hidden_size=DIM,
        num_classes=NUM_CLASSES,
    ),
    embed_dim=DIM,
    depth=DEPTH,
    drop_path_rate=0.,
)

# 2x2 tensor-parallel mesh; no pipeline parallelism.
parallel = dict(
    pipeline=dict(size=1),
    tensor=dict(size=4, mode='2d'),
)

# AMP disabled here — ZeRO level 2 manages precision/optimizer state instead.
# from colossalai.engine import AMP_TYPE
#
# fp16 = dict(
#     mode=AMP_TYPE.PARALLEL,
#     initial_scale=2 ** 4
# )
fp16 = dict(
    mode=None,
)
# both level 2 and 3 work
zero = dict(
    type='ZeroRedundancyOptimizer_Level_2'
)

lr_scheduler = dict(
    type='LinearWarmupLR',
    warmup_epochs=5
)
num_epochs = 60
import os
from pathlib import Path

# ViT-on-CIFAR10 hyper-parameters; identical model to the zero2 config but
# trained with ZeRO level 3 (parameter + optimizer-state partitioning).
BATCH_SIZE = 512
IMG_SIZE = 32
PATCH_SIZE = 4
DIM = 512
NUM_ATTENTION_HEADS = 8
SUMMA_DIM = 2
NUM_CLASSES = 10
DEPTH = 6

# Training data: CIFAR10 with standard augmentation; root comes from $DATA.
train_data = dict(
    dataset=dict(
        type='CIFAR10Dataset',
        root=Path(os.environ['DATA']),
        transform_pipeline=[
            dict(type='RandomCrop', size=IMG_SIZE, padding=4),
            dict(type='RandomHorizontalFlip'),
            dict(type='ToTensor'),
            dict(type='Normalize',
                 mean=[0.4914, 0.4822, 0.4465],
                 std=[0.2023, 0.1994, 0.2010]),
        ]
    ),
    dataloader=dict(
        batch_size=BATCH_SIZE,
        pin_memory=True,
        num_workers=4,
        shuffle=True
    )
)

# Evaluation data: resize + normalize only.
test_data = dict(
    dataset=dict(
        type='CIFAR10Dataset',
        root=Path(os.environ['DATA']),
        train=False,
        transform_pipeline=[
            dict(type='Resize', size=IMG_SIZE),
            dict(type='ToTensor'),
            dict(type='Normalize',
                 mean=[0.4914, 0.4822, 0.4465],
                 std=[0.2023, 0.1994, 0.2010]
                 ),
        ]
    ),
    dataloader=dict(
        batch_size=BATCH_SIZE,
        pin_memory=True,
        num_workers=4,
        shuffle=True
    )
)

optimizer = dict(
    type='Adam',
    lr=0.001,
    weight_decay=0
)

# 2D-parallel cross-entropy loss.
loss = dict(
    type='CrossEntropyLoss2D',
)

# ViT assembled from per-component 2D-parallel layer configs.
model = dict(
    type='VisionTransformerFromConfig',
    tensor_splitting_cfg=dict(
        type='ViTInputSplitter2D',
    ),
    embedding_cfg=dict(
        type='ViTPatchEmbedding2D',
        img_size=IMG_SIZE,
        patch_size=PATCH_SIZE,
        embed_dim=DIM,
    ),
    token_fusion_cfg=dict(
        type='ViTTokenFuser2D',
        img_size=IMG_SIZE,
        patch_size=PATCH_SIZE,
        embed_dim=DIM,
        drop_rate=0.1
    ),
    norm_cfg=dict(
        type='LayerNorm2D',
        normalized_shape=DIM,
        eps=1e-6,
    ),
    block_cfg=dict(
        type='ViTBlock',
        attention_cfg=dict(
            type='ViTSelfAttention2D',
            hidden_size=DIM,
            num_attention_heads=NUM_ATTENTION_HEADS,
            attention_dropout_prob=0.,
            hidden_dropout_prob=0.1,
        ),
        droppath_cfg=dict(
            type='VanillaViTDropPath',
        ),
        mlp_cfg=dict(
            type='ViTMLP2D',
            in_features=DIM,
            dropout_prob=0.1,
            mlp_ratio=1
        ),
        norm_cfg=dict(
            type='LayerNorm2D',
            normalized_shape=DIM,
            eps=1e-6,
        ),
    ),
    head_cfg=dict(
        type='ViTHead2D',
        hidden_size=DIM,
        num_classes=NUM_CLASSES,
    ),
    embed_dim=DIM,
    depth=DEPTH,
    drop_path_rate=0.,
)

# 2x2 tensor-parallel mesh; no pipeline parallelism.
parallel = dict(
    pipeline=dict(size=1),
    tensor=dict(size=4, mode='2d'),
)

# AMP disabled here — ZeRO level 3 manages precision/partitioning instead.
# from colossalai.engine import AMP_TYPE
# fp16 = dict(
#     mode=AMP_TYPE.PARALLEL,
#     initial_scale=2 ** 4
# )
fp16 = dict(
    mode=None,
)
# both level 2 and 3 work
zero = dict(
    type='ZeroRedundancyOptimizer_Level_3'
)

lr_scheduler = dict(
    type='LinearWarmupLR',
    warmup_epochs=5
)
num_epochs = 60
#!/usr/bin/env sh
# Usage: test.sh <test_file>
# Launches the given test under SLURM, one python process per rank.
# Quote every expansion so an unset or space-containing value cannot split
# or drop arguments.
test_file=$1
python "$test_file" --local_rank "$SLURM_PROCID" --world_size "$SLURM_NPROCS" --host "$HOST" --port 29500
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import os
from pathlib import Path
import pytest
import torch.autograd
import colossalai
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.engine import Engine
from colossalai.logging import get_global_dist_logger
from colossalai.nn.layer._parallel_utilities import _gather
# The ZeRO level is selected via the LEVEL env var; it picks which
# configs/vit_2d_zero{level}.py file is loaded for this run.
level = os.environ['LEVEL']
CONFIG_PATH = Path(__file__).parent.parent.joinpath(f'configs/vit_2d_zero{level}.py')
def eval(engine):
    """Run one evaluation epoch; return ``(correct_sum, total_sum, avg_loss)``.

    Logits are produced on a 2D tensor-parallel mesh, so each step's output is
    gathered along the row axis and then the column axis before taking argmax.

    NOTE(review): the name shadows the ``eval`` builtin; it is called below in
    this file, so it is kept unchanged and documented instead of renamed.
    """
    engine.eval()
    accumulated_loss = 0
    correct_sum = 0
    total_sum = 0
    for i in range(engine.schedule.num_steps):
        output, label, loss = engine.step()
        accumulated_loss += loss.detach().cpu().numpy()
        # Reassemble the full logits: gather dim 1 across the row group, then
        # dim 0 across the column group of the 2D mesh.
        output = _gather(
            output[0],
            ParallelMode.PARALLEL_2D_ROW,
            1
        )
        output = _gather(
            output,
            ParallelMode.PARALLEL_2D_COL,
            0,
        )
        output = torch.argmax(output, dim=-1)
        correct = torch.sum(label[0] == output)
        correct_sum += correct
        total_sum += label[0].size(0)
    avg_loss = accumulated_loss / engine.schedule.num_steps
    return correct_sum, total_sum, avg_loss
def train(engine):
    """Run a single training epoch and return the mean per-step loss."""
    engine.train()
    running_loss = 0
    for _ in range(engine.schedule.num_steps):
        _, _, step_loss = engine.step()
        running_loss += step_loss.detach().cpu().numpy()
    return running_loss / engine.schedule.num_steps
@pytest.mark.dist
@pytest.mark.skip("This test should be invoked by test.sh in the same folder as it runs on multiple gpus")
def test_2d_parallel_vision_transformer():
    """Train ViT on CIFAR10 with 2D tensor parallelism; config-driven via CONFIG_PATH."""
    # init dist
    model, train_dataloader, test_dataloader, criterion, optimizer, schedule, lr_scheduler = colossalai.initialize(
        CONFIG_PATH)
    logger = get_global_dist_logger()
    engine = Engine(model=model,
                    train_dataloader=train_dataloader,
                    test_dataloader=test_dataloader,
                    criterion=criterion,
                    optimizer=optimizer,
                    lr_scheduler=lr_scheduler,
                    schedule=schedule)

    # for param in model.parameters():
    #     if isinstance(param, torch.HalfTensor):
    #         print(param.shape)

    logger.info('start training')
    for epoch in range(gpc.config.num_epochs):
        train_loss = train(engine)
        logger.info(f'epoch {epoch} - train loss: {train_loss}')
        # evaluate on even-numbered epochs only
        if epoch % 2 == 0:
            correct_sum, total_sum, eval_loss = eval(engine)
            logger.info(
                f'epoch {epoch} - eval loss: {eval_loss}, total: {total_sum}, '
                f'correct: {correct_sum}, acc: {correct_sum / total_sum}')


if __name__ == '__main__':
    test_2d_parallel_vision_transformer()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment