Unverified Commit b8e770c8 authored by Hongxin Liu's avatar Hongxin Liu Committed by GitHub

[test] merge old components to test to model zoo (#4945)

* [test] add custom models in model zoo

* [test] update legacy test

* [test] update model zoo

* [test] update gemini test

* [test] remove components to test
parent 3a41e830
@@ -9,6 +9,7 @@ from .comparison import (
)
from .pytest_wrapper import run_on_environment_flag
from .utils import (
    DummyDataloader,
    clear_cache_before_run,
    free_port,
    parameterize,
@@ -34,4 +35,5 @@ __all__ = [
    "run_on_environment_flag",
    "check_state_dict_equal",
    "assert_hf_output_close",
    "DummyDataloader",
]
@@ -273,3 +273,24 @@ def clear_cache_before_run():
        return _clear_cache

    return _wrap_func


class DummyDataloader:
    def __init__(self, data_gen_fn: Callable, length: int = 10):
        self.data_gen_fn = data_gen_fn
        self.length = length
        self.step = 0

    def __iter__(self):
        self.step = 0
        return self

    def __next__(self):
        if self.step < self.length:
            self.step += 1
            return self.data_gen_fn()
        else:
            raise StopIteration

    def __len__(self):
        return self.length
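
A minimal usage sketch of the new helper (the generator function and shapes below are illustrative, not part of the commit):

import torch

from colossalai.testing import DummyDataloader

def data_gen_fn():
    # hypothetical generator: every call produces one fresh random batch
    return dict(x=torch.rand(4, 8))

loader = DummyDataloader(data_gen_fn, length=3)
assert len(loader) == 3
for batch in loader:  # iterates exactly `length` times, then raises StopIteration
    assert batch["x"].shape == (4, 8)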
from . import (
    beit,
    bert,
    gpt2,
    hanging_param_model,
    inline_op_model,
    nested_model,
    repeated_computed_layers,
    resnet,
    simple_net,
)
from .utils import run_fwd, run_fwd_bwd

from . import albert  # isort:skip

__all__ = [
    "bert",
    "gpt2",
    "hanging_param_model",
    "inline_op_model",
    "nested_model",
    "repeated_computed_layers",
    "resnet",
    "simple_net",
    "run_fwd_bwd",
    "albert",
    "beit",
    "run_fwd",
]
import torch
from transformers import AlbertConfig, AlbertForSequenceClassification

from .bert import get_bert_data_loader
from .registry import non_distributed_component_funcs


@non_distributed_component_funcs.register(name="albert")
def get_training_components():
    hidden_dim = 8
    num_head = 4
    sequence_length = 12
    num_layer = 2
    vocab_size = 32

    def albert_model_builder(checkpoint: bool = False):
        config = AlbertConfig(
            vocab_size=vocab_size,
            gradient_checkpointing=checkpoint,
            hidden_size=hidden_dim,
            intermediate_size=hidden_dim * 4,
            num_attention_heads=num_head,
            max_position_embeddings=sequence_length,
            num_hidden_layers=num_layer,
            hidden_dropout_prob=0.0,
            attention_probs_dropout_prob=0.0,
        )
        print("building AlbertForSequenceClassification model")

        # adapt huggingface AlbertForSequenceClassification to the single-unittest calling interface
        class ModelAdaptor(AlbertForSequenceClassification):
            def forward(self, input_ids, labels):
                """
                inputs: data, label
                outputs: loss
                """
                return super().forward(input_ids=input_ids, labels=labels)[0]

        model = ModelAdaptor(config)
        # if checkpoint and version.parse(transformers.__version__) >= version.parse("4.11.0"):
        #     model.gradient_checkpointing_enable()

        return model

    is_distributed = torch.distributed.is_initialized()
    trainloader = get_bert_data_loader(
        n_class=vocab_size,
        batch_size=2,
        total_samples=10000,
        sequence_length=sequence_length,
        is_distributed=is_distributed,
    )
    testloader = get_bert_data_loader(
        n_class=vocab_size,
        batch_size=2,
        total_samples=10000,
        sequence_length=sequence_length,
        is_distributed=is_distributed,
    )

    criterion = None
    return albert_model_builder, trainloader, testloader, torch.optim.Adam, criterion
import torch
from timm.models.beit import Beit

from colossalai.utils.cuda import get_current_device

from .registry import non_distributed_component_funcs
from .utils.dummy_data_generator import DummyDataGenerator


class DummyDataLoader(DummyDataGenerator):
    img_size = 64
    num_channel = 3
    num_class = 10
    batch_size = 4

    def generate(self):
        data = torch.randn(
            (
                DummyDataLoader.batch_size,
                DummyDataLoader.num_channel,
                DummyDataLoader.img_size,
                DummyDataLoader.img_size,
            ),
            device=get_current_device(),
        )
        label = torch.randint(
            low=0, high=DummyDataLoader.num_class, size=(DummyDataLoader.batch_size,), device=get_current_device()
        )
        return data, label


@non_distributed_component_funcs.register(name="beit")
def get_training_components():
    def model_builder(checkpoint=False):
        model = Beit(
            img_size=DummyDataLoader.img_size, num_classes=DummyDataLoader.num_class, embed_dim=32, depth=2, num_heads=4
        )
        return model

    trainloader = DummyDataLoader()
    testloader = DummyDataLoader()
    criterion = torch.nn.CrossEntropyLoss()
    return model_builder, trainloader, testloader, torch.optim.Adam, criterion
import torch
import transformers
from packaging import version
from torch.utils.data import SequentialSampler
from transformers import BertConfig, BertForSequenceClassification

from .registry import non_distributed_component_funcs


def get_bert_data_loader(
    n_class,
    batch_size,
    total_samples,
    sequence_length,
    device=torch.device("cpu:0"),
    is_distributed=False,
):
    train_data = torch.randint(
        low=0,
        high=n_class,
        size=(total_samples, sequence_length),
        device=device,
        dtype=torch.long,
    )
    train_label = torch.randint(low=0, high=2, size=(total_samples,), device=device, dtype=torch.long)
    train_dataset = torch.utils.data.TensorDataset(train_data, train_label)
    if is_distributed:
        sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        sampler = SequentialSampler(train_dataset)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, sampler=sampler)
    return train_loader


@non_distributed_component_funcs.register(name="bert")
def get_training_components():
    hidden_dim = 8
    num_head = 4
    sequence_length = 12
    num_layer = 2
    vocab_size = 32

    def bert_model_builder(checkpoint: bool = False):
        config = BertConfig(
            vocab_size=vocab_size,
            gradient_checkpointing=checkpoint,
            hidden_size=hidden_dim,
            intermediate_size=hidden_dim * 4,
            num_attention_heads=num_head,
            max_position_embeddings=sequence_length,
            num_hidden_layers=num_layer,
            hidden_dropout_prob=0.0,
            attention_probs_dropout_prob=0.0,
        )

        # adapt huggingface BertForSequenceClassification to the single-unittest calling interface
        class ModelAdaptor(BertForSequenceClassification):
            def forward(self, input_ids, labels):
                """
                inputs: data, label
                outputs: loss
                """
                return super().forward(input_ids=input_ids, labels=labels)[0]

        model = ModelAdaptor(config)
        if checkpoint and version.parse(transformers.__version__) >= version.parse("4.11.0"):
            model.gradient_checkpointing_enable()

        return model

    is_distributed = torch.distributed.is_initialized()
    trainloader = get_bert_data_loader(
        n_class=vocab_size,
        batch_size=2,
        total_samples=10000,
        sequence_length=sequence_length,
        is_distributed=is_distributed,
    )
    testloader = get_bert_data_loader(
        n_class=vocab_size,
        batch_size=2,
        total_samples=10000,
        sequence_length=sequence_length,
        is_distributed=is_distributed,
    )

    criterion = None
    return bert_model_builder, trainloader, testloader, torch.optim.Adam, criterion
import torch
import torch.nn as nn
from transformers import GPT2Config, GPT2LMHeadModel

from colossalai.utils.cuda import get_current_device

from .registry import non_distributed_component_funcs
from .utils.dummy_data_generator import DummyDataGenerator


class DummyDataLoader(DummyDataGenerator):
    vocab_size = 128
    batch_size = 4
    seq_len = 64

    def generate(self):
        input_ids = torch.randint(
            0,
            DummyDataLoader.vocab_size,
            (DummyDataLoader.batch_size, DummyDataLoader.seq_len),
            device=get_current_device(),
        )
        return input_ids, input_ids


class GPTLMModel(nn.Module):
    def __init__(
        self,
        hidden_size=768,
        num_layers=12,
        num_attention_heads=12,
        max_seq_len=1024,
        vocab_size=50304,
        checkpoint=False,
    ):
        super().__init__()
        self.checkpoint = checkpoint
        self.model = GPT2LMHeadModel(
            GPT2Config(
                n_embd=hidden_size,
                n_layer=num_layers,
                n_head=num_attention_heads,
                n_positions=max_seq_len,
                n_ctx=max_seq_len,
                vocab_size=vocab_size,
                resid_pdrop=0.0,
                embd_pdrop=0.0,
                attn_pdrop=0.0,
            )
        )
        if checkpoint:
            self.model.gradient_checkpointing_enable()

    def forward(self, input_ids):
        # Only return lm_logits
        attention_mask = torch.ones_like(input_ids)
        return self.model(input_ids=input_ids, attention_mask=attention_mask, use_cache=not self.checkpoint)[0]


def gpt2_micro(checkpoint=True):
    return GPTLMModel(
        checkpoint=checkpoint, hidden_size=32, num_layers=2, num_attention_heads=4, max_seq_len=64, vocab_size=128
    )


def gpt2_s(checkpoint=True):
    return GPTLMModel(checkpoint=checkpoint)


def gpt2_m(checkpoint=True):
    return GPTLMModel(hidden_size=1024, num_layers=24, num_attention_heads=16, checkpoint=checkpoint)


class GPTLMLoss(nn.Module):
    def __init__(self):
        super().__init__()
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, logits, labels):
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()
        # Flatten the tokens
        return self.loss_fn(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))


@non_distributed_component_funcs.register(name="gpt2")
def get_training_components():
    trainloader = DummyDataLoader()
    testloader = DummyDataLoader()
    criterion = GPTLMLoss()
    return gpt2_micro, trainloader, testloader, torch.optim.Adam, criterion
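
The slicing in GPTLMLoss is the standard causal-LM shift: the logit at position i is scored against the token at position i + 1, which is why the inputs can double as labels (as DummyDataLoader does above). A shape-only illustration with made-up dimensions:

import torch

logits = torch.randn(4, 64, 128)          # (batch, seq_len, vocab_size)
labels = torch.randint(0, 128, (4, 64))   # here the labels are the input ids themselves

shift_logits = logits[..., :-1, :]        # drop the prediction at the last position
shift_labels = labels[..., 1:]            # drop the first token as a target
assert shift_logits.shape[:2] == shift_labels.shape  # both (4, 63)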
import torch
import torch.nn as nn

from colossalai.legacy.nn import CheckpointModule

from .registry import non_distributed_component_funcs
from .utils.dummy_data_generator import DummyDataGenerator


class InlineOpModule(CheckpointModule):
    """
    a module with inline Ops
    """

    def __init__(self, checkpoint=False) -> None:
        super().__init__(checkpoint=checkpoint)
        self.proj1 = nn.Linear(4, 8)
        self.proj2 = nn.Linear(8, 8)

    def forward(self, x):
        x = self.proj1(x)
        # inline add_
        x.add_(10)
        x = self.proj2(x)
        # inline relu_
        x = torch.relu_(x)
        x = self.proj2(x)
        return x


class DummyDataLoader(DummyDataGenerator):
    def generate(self):
        data = torch.rand(16, 4)
        label = torch.randint(low=0, high=2, size=(16,))
        return data, label


@non_distributed_component_funcs.register(name="inline_op_model")
def get_training_components():
    def model_builder(checkpoint=False):
        return InlineOpModule(checkpoint)

    trainloader = DummyDataLoader()
    testloader = DummyDataLoader()
    criterion = torch.nn.CrossEntropyLoss()
    from colossalai.nn.optimizer import HybridAdam

    return model_builder, trainloader, testloader, HybridAdam, criterion
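
The point of this component is that add_ and relu_ mutate their input tensor in place rather than allocating a new one, which stresses engines that track or shard activation storage. A quick standalone illustration:

import torch

x = torch.zeros(2)
y = x.add_(10)   # in-place: mutates x and returns the very same tensor
assert y is x and torch.equal(x, torch.tensor([10.0, 10.0]))
torch.relu_(x)   # in-place ReLU over the same storage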
#!/usr/bin/env python


class Registry:
    def __init__(self):
        self._registry = dict()

    def register(self, name):
        assert name not in self._registry

        def _register(callable_):
            self._registry[name] = callable_
            # return the callable so the decorated name stays bound instead of becoming None
            return callable_

        return _register

    def get_callable(self, name: str):
        return self._registry[name]

    def __iter__(self):
        self._idx = 0
        self._len = len(self._registry)
        self._names = list(self._registry.keys())
        return self

    def __next__(self):
        if self._idx < self._len:
            key = self._names[self._idx]
            callable_ = self._registry[key]
            self._idx += 1
            return callable_
        else:
            raise StopIteration


non_distributed_component_funcs = Registry()
model_parallel_component_funcs = Registry()

__all__ = ["non_distributed_component_funcs", "model_parallel_component_funcs"]
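
For reference, a sketch of how this registry is driven elsewhere in the legacy tests (the component name and returned values here are placeholders):

@non_distributed_component_funcs.register(name="dummy")
def get_dummy_components():
    return "model_builder", "trainloader", "testloader", "optim_cls", "criterion"

# look a component up by name...
components_fn = non_distributed_component_funcs.get_callable("dummy")
# ...or iterate over every registered callable
for fn in non_distributed_component_funcs:
    model_builder, trainloader, testloader, optim_cls, criterion = fn()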
import os
from pathlib import Path

import torch
from torchvision.datasets import CIFAR10
from torchvision.models import resnet18
from torchvision.transforms import transforms

from colossalai.legacy.utils import get_dataloader

from .registry import non_distributed_component_funcs


def get_cifar10_dataloader(train):
    # build dataloaders
    dataset = CIFAR10(
        root=Path(os.environ["DATA"]),
        download=True,
        train=train,
        transform=transforms.Compose(
            [transforms.ToTensor(), transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))]
        ),
    )
    dataloader = get_dataloader(dataset=dataset, shuffle=True, batch_size=16, drop_last=True)
    return dataloader


@non_distributed_component_funcs.register(name="resnet18")
def get_resnet_training_components():
    def model_builder(checkpoint=False):
        return resnet18(num_classes=10)

    trainloader = get_cifar10_dataloader(train=True)
    testloader = get_cifar10_dataloader(train=False)
    criterion = torch.nn.CrossEntropyLoss()
    return model_builder, trainloader, testloader, torch.optim.Adam, criterion
from .dummy_data_generator import DummyDataGenerator
from .executor import run_fwd, run_fwd_bwd
from abc import ABC, abstractmethod


class DummyDataGenerator(ABC):
    def __init__(self, length=10):
        self.length = length

    @abstractmethod
    def generate(self):
        pass

    def __iter__(self):
        self.step = 0
        return self

    def __next__(self):
        if self.step < self.length:
            self.step += 1
            return self.generate()
        else:
            raise StopIteration

    def __len__(self):
        return self.length
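
A minimal subclass sketch (batch shapes are illustrative): only generate needs implementing; iteration and length come from the base class.

import torch

class RandomPairs(DummyDataGenerator):
    def generate(self):
        data = torch.rand(8, 4)
        label = torch.randint(low=0, high=2, size=(8,))
        return data, label

for data, label in RandomPairs(length=5):  # yields five freshly sampled batches
    assert data.shape == (8, 4) and label.shape == (8,)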
from . import custom, diffusers, timm, torchaudio, torchrec, torchvision, transformers
from .executor import run_fwd, run_fwd_bwd
from .registry import model_zoo

__all__ = ["model_zoo", "run_fwd", "run_fwd_bwd"]
from .hanging_param_model import *
from .nested_model import *
from .repeated_computed_layers import *
from .simple_net import *
import torch.nn as nn
from torch.utils.checkpoint import checkpoint


class CheckpointModule(nn.Module):
    def __init__(self, checkpoint: bool = False):
        super().__init__()
        self.checkpoint = checkpoint
        self._use_checkpoint = checkpoint

    def _forward(self, *args, **kwargs):
        raise NotImplementedError("CheckpointModule subclasses should implement _forward instead of overriding forward")

    def forward(self, *args, **kwargs):
        if self._use_checkpoint:
            return checkpoint(self._forward, *args, **kwargs)
        else:
            return self._forward(*args, **kwargs)

    def train(self, mode: bool = True):
        self._use_checkpoint = self.checkpoint
        return super().train(mode=mode)

    def eval(self):
        self._use_checkpoint = False
        return super().eval()
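
A sketch of the intended subclassing pattern (layer sizes are made up): the computation goes in _forward, and the base class decides per mode whether to run it under torch.utils.checkpoint.

import torch
import torch.nn as nn

class TwoLayerNet(CheckpointModule):
    def __init__(self, checkpoint: bool = False):
        super().__init__(checkpoint=checkpoint)
        self.proj1 = nn.Linear(4, 8)
        self.proj2 = nn.Linear(8, 2)

    def _forward(self, x):
        return self.proj2(torch.relu(self.proj1(x)))

model = TwoLayerNet(checkpoint=True).train()
out = model(torch.rand(16, 4, requires_grad=True))  # activations recomputed during backward
model.eval()
out = model(torch.rand(16, 4))  # plain forward, checkpointing disabled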
@@ -2,10 +2,8 @@ import torch
import torch.nn as nn
import torch.nn.functional as F

from ..registry import model_zoo
from .base import CheckpointModule


class HangingParamModule(CheckpointModule):
@@ -27,22 +25,24 @@ class HangingParamModule(CheckpointModule):
        return x


def data_gen():
    return dict(x=torch.rand(16, 4))


def loss_fn(x):
    outputs = x["x"]
    label = torch.randint(low=0, high=2, size=(16,), device=outputs.device)
    return F.cross_entropy(outputs, label)


def output_transform(x: torch.Tensor):
    return dict(x=x)


model_zoo.register(
    name="custom_hanging_param_model",
    model_fn=HangingParamModule,
    data_gen_fn=data_gen,
    output_transform_fn=output_transform,
    loss_fn=loss_fn,
)
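
Entries registered this way can be exercised generically with the executor helpers shown at the end of this diff; a sketch using the functions defined in this file (the import path and the no-argument constructor are assumptions):

# re-exported by the model zoo __init__ shown earlier in this diff
from tests.kit.model_zoo import run_fwd_bwd

model = HangingParamModule()  # assumes the constructor's defaults, since model_fn is registered without arguments
batch = data_gen()
loss = run_fwd_bwd(model, batch, output_transform, loss_fn)  # no optimizer, so plain loss.backward() is used
# note: the deliberately unused ("hanging") parameter keeps a None gradient here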
@@ -2,10 +2,8 @@ import torch
import torch.nn as nn
import torch.nn.functional as F

from ..registry import model_zoo
from .base import CheckpointModule


class SubNet(nn.Module):
@@ -32,20 +30,24 @@ class NestedNet(CheckpointModule):
        return x


def data_gen():
    return dict(x=torch.rand(16, 5))


def loss_fn(x):
    outputs = x["x"]
    label = torch.randint(low=0, high=2, size=(16,), device=outputs.device)
    return F.cross_entropy(outputs, label)


def output_transform(x: torch.Tensor):
    return dict(x=x)


model_zoo.register(
    name="custom_nested_model",
    model_fn=NestedNet,
    data_gen_fn=data_gen,
    output_transform_fn=output_transform,
    loss_fn=loss_fn,
)
import torch
import torch.nn as nn
import torch.nn.functional as F

from ..registry import model_zoo
from .base import CheckpointModule


class NetWithRepeatedlyComputedLayers(CheckpointModule):
@@ -28,20 +25,24 @@ class NetWithRepeatedlyComputedLayers(CheckpointModule):
        return x


def data_gen():
    return dict(x=torch.rand(16, 5))


def loss_fn(x):
    outputs = x["x"]
    label = torch.randint(low=0, high=2, size=(16,), device=outputs.device)
    return F.cross_entropy(outputs, label)


def output_transform(x: torch.Tensor):
    return dict(x=x)


model_zoo.register(
    name="custom_repeated_computed_layers",
    model_fn=NetWithRepeatedlyComputedLayers,
    data_gen_fn=data_gen,
    output_transform_fn=output_transform,
    loss_fn=loss_fn,
)
import torch
import torch.nn as nn
import torch.nn.functional as F

from ..registry import model_zoo
from .base import CheckpointModule


class SimpleNet(CheckpointModule):
@@ -32,22 +30,24 @@ class SimpleNet(CheckpointModule):
        return x


def data_gen():
    return dict(x=torch.randint(low=0, high=20, size=(16,)))


def loss_fn(x):
    outputs = x["x"]
    label = torch.randint(low=0, high=2, size=(16,), device=outputs.device)
    return F.cross_entropy(outputs, label)


def output_transform(x: torch.Tensor):
    return dict(x=x)


model_zoo.register(
    name="custom_simple_net",
    model_fn=SimpleNet,
    data_gen_fn=data_gen,
    output_transform_fn=output_transform,
    loss_fn=loss_fn,
)
from typing import Callable, Dict, Optional, Union

import torch
from torch.nn import Module
from torch.optim import Optimizer

from colossalai.interface import OptimizerWrapper


def run_fwd(
    model: Module, data: Dict, output_transform_fn: Callable, criterion: Optional[Callable] = None
) -> torch.Tensor:
    """run_fwd
    run fwd for the model
@@ -14,18 +22,22 @@ def run_fwd(model, data, label, criterion) -> torch.Tensor:
    Returns:
        torch.Tensor: loss of fwd
    """
    outputs = model(**data)
    outputs = output_transform_fn(outputs)
    if criterion:
        loss = criterion(outputs)
    else:
        loss = next(iter(outputs.values())).sum()

    return loss


def run_fwd_bwd(
    model: Module,
    data: Dict,
    output_transform_fn: Callable,
    criterion: Optional[Callable] = None,
    optimizer: Optional[Union[Optimizer, OptimizerWrapper]] = None,
) -> torch.Tensor:
    """run_fwd_bwd
    run fwd and bwd for the model
@@ -38,7 +50,7 @@ def run_fwd_bwd(model, data, label, criterion, optimizer=None) -> torch.Tensor:
    Returns:
        torch.Tensor: loss of fwd
    """
    loss = run_fwd(model, data, output_transform_fn, criterion)
    if optimizer:
        optimizer.backward(loss)
    else:
...
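
When no criterion is supplied, run_fwd reduces the first transformed output with .sum() so that a scalar loss is always available for backward. A self-contained sketch with a toy module, using run_fwd as defined above (the module and names are illustrative):

import torch
import torch.nn as nn

def output_transform_fn(x):
    # wrap the raw tensor into the dict form the executor expects
    return dict(output=x)

model = nn.Linear(4, 2)
data = dict(input=torch.rand(8, 4))  # keys must match the model's forward signature

loss = run_fwd(model, data, output_transform_fn)  # no criterion: falls back to outputs.sum()
loss.backward()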