Commit 404ecbdc authored by zbian

Migrated project

parent 2ebaefc5
# modified from https://github.com/lucidrains/mlp-mixer-pytorch/blob/main/mlp_mixer_pytorch/mlp_mixer_pytorch.py
from functools import partial
from colossalai.context import ParallelMode
from colossalai.registry import MODELS
from torch import nn
from colossalai import nn as col_nn
from colossalai.nn.layer.parallel_3d._utils import get_depth_from_env
from einops.layers.torch import Rearrange, Reduce
__all__ = [
'MLPMixer',
]
class PreNormResidual(nn.Module):
def __init__(self, dim, fn, depth_3d):
super().__init__()
self.fn = fn
self.norm = col_nn.LayerNorm3D(
dim, depth_3d, ParallelMode.PARALLEL_3D_INPUT, ParallelMode.PARALLEL_3D_WEIGHT)
def forward(self, x):
return self.fn(self.norm(x)) + x
def FeedForward(dim, depth_3d, expansion_factor=4, dropout=0., dense=None):
if dense is None:
dense = partial(col_nn.Linear3D, depth=depth_3d, input_parallel_mode=ParallelMode.PARALLEL_3D_INPUT,
weight_parallel_mode=ParallelMode.PARALLEL_3D_WEIGHT)
return nn.Sequential(
dense(dim, dim * expansion_factor),
nn.GELU(),
nn.Dropout(dropout),
dense(dim * expansion_factor, dim),
nn.Dropout(dropout)
)
@MODELS.register_module
def MLPMixer(image_size, channels, patch_size, dim, depth, num_classes, expansion_factor=4, dropout=0.):
    assert (image_size % patch_size) == 0, 'image size must be divisible by patch size'
num_patches = (image_size // patch_size) ** 2
depth_3d = get_depth_from_env()
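    # depth of the 3-D tensor-parallel process mesh (read from an environment variable set up during initialization)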
linear = partial(col_nn.Linear3D, depth=depth_3d, input_parallel_mode=ParallelMode.PARALLEL_3D_INPUT,
weight_parallel_mode=ParallelMode.PARALLEL_3D_WEIGHT)
norm_layer = partial(col_nn.LayerNorm3D, depth=depth_3d, input_parallel_mode=ParallelMode.PARALLEL_3D_INPUT,
weight_parallel_mode=ParallelMode.PARALLEL_3D_WEIGHT)
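    # token-mixing MLPs use a 1x1 Conv1d applied across patches; channel-mixing MLPs use the 3-D parallel Linear3D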
chan_first, chan_last = partial(nn.Conv1d, kernel_size=1), linear
return nn.Sequential(
Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)',
p1=patch_size, p2=patch_size),
linear((patch_size ** 2) * channels, dim),
        *[nn.Sequential(
            PreNormResidual(dim, FeedForward(
                num_patches, depth_3d, expansion_factor, dropout, chan_first), depth_3d),
            PreNormResidual(dim, FeedForward(
                dim, depth_3d, expansion_factor, dropout, chan_last), depth_3d)
        ) for _ in range(depth)],
norm_layer(dim),
Reduce('b n c -> b c', 'mean'),
linear(dim, num_classes)
)
from .parallel_2d import *
from .parallel_3d import *
from .vit import *
from colossalai.context import ParallelMode, seed
from colossalai import nn as clsl_nn
from colossalai.registry import MODELS
from torch import nn
import torch
__all__ = [
'VisionTransformer2D',
'vit_tiny_2d_patch4_32',
'vit_tiny_2d_patch16_224',
'vit_tiny_2d_patch16_384',
'vit_small_2d_patch16_224',
'vit_small_2d_patch16_384',
'vit_small_2d_patch32_224',
'vit_small_2d_patch32_384',
'vit_base_2d_patch16_224',
'vit_base_2d_patch16_384',
'vit_base_2d_patch32_224',
'vit_base_2d_patch32_384',
'vit_large_2d_patch16_224',
'vit_large_2d_patch16_384',
'vit_large_2d_patch32_224',
'vit_large_2d_patch32_384',
]
class ViTBlock2D(nn.Module):
def __init__(self,
dim: int,
num_heads: int,
mlp_ratio: int = 4,
drop: float = 0.,
attn_drop: float = 0.,
drop_path: float = 0.,
act_layer: str = 'gelu'):
super().__init__()
self.norm1 = clsl_nn.LayerNorm2D(dim, eps=1e-6)
self.attn = clsl_nn.ViTSelfAttention2D(dim, num_heads, attn_drop, drop)
self.drop_path = clsl_nn.VanillaViTDropPath(drop_path) if drop_path > 0. \
else nn.Identity()
self.norm2 = clsl_nn.LayerNorm2D(dim, eps=1e-6)
self.mlp = clsl_nn.ViTMLP2D(dim, mlp_ratio, act_layer, drop)
def forward(self, x):
y = self.attn(self.norm1(x))
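        # run drop path under the TENSOR-mode RNG state; presumably this keeps the dropout masks consistent within the tensor-parallel group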
with seed(ParallelMode.TENSOR):
x = x + self.drop_path(y)
y = self.mlp(self.norm2(x))
with seed(ParallelMode.TENSOR):
x = x + self.drop_path(y)
return x
@MODELS.register_module
class VisionTransformer2D(nn.Module):
def __init__(self,
img_size: int = 224,
patch_size: int = 16,
in_chans: int = 3,
num_classes: int = 1000,
embed_dim: int = 768,
depth: int = 12,
num_heads: int = 12,
mlp_ratio: int = 4,
drop_rate: float = 0.,
attn_drop_rate: float = 0.,
drop_path_rate: float = 0.,
act_layer: str = 'gelu'):
super().__init__()
self.num_classes = num_classes
self.num_features = self.embed_dim = embed_dim
self.patch_embed = clsl_nn.ViTPatchEmbedding2D(
img_size, patch_size, embed_dim, in_chans
)
self.splitter = clsl_nn.ViTInputSplitter2D()
self.token_fuser = clsl_nn.ViTTokenFuser2D(
img_size, patch_size, embed_dim, drop_rate
)
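        # stochastic depth decay rule: drop path rate grows linearly from 0 to drop_path_rate across blocks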
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
self.blocks = nn.Sequential(*[
ViTBlock2D(embed_dim, num_heads, mlp_ratio, drop_rate,
attn_drop_rate, dpr[i], act_layer)
for i in range(depth)
])
self.norm = clsl_nn.LayerNorm2D(embed_dim, eps=1e-6)
self.head = clsl_nn.ViTHead2D(self.num_features, num_classes) if num_classes > 0 \
else nn.Identity()
self.init_weights()
def init_weights(self):
pass
def forward(self, x):
x = self.patch_embed(x)
x = self.splitter(x)
x = self.token_fuser(x)
x = self.blocks(x)
x = self.norm(x)
x = self.head(x)
return x
def _create_vit_model(**model_kwargs):
model = VisionTransformer2D(**model_kwargs)
return model
@MODELS.register_module
def vit_tiny_2d_patch4_32(**kwargs):
model_kwargs = dict(img_size=32, patch_size=4, embed_dim=512,
depth=6, num_heads=8, **kwargs)
return _create_vit_model(**model_kwargs)
@MODELS.register_module
def vit_tiny_2d_patch16_224(**kwargs):
model_kwargs = dict(patch_size=16, embed_dim=192,
depth=12, num_heads=3, **kwargs)
return _create_vit_model(**model_kwargs)
@MODELS.register_module
def vit_tiny_2d_patch16_384(**kwargs):
model_kwargs = dict(img_size=384, patch_size=16, embed_dim=192,
depth=12, num_heads=3, **kwargs)
return _create_vit_model(**model_kwargs)
@MODELS.register_module
def vit_small_2d_patch16_224(**kwargs):
model_kwargs = dict(patch_size=16, embed_dim=384,
depth=12, num_heads=6, **kwargs)
return _create_vit_model(**model_kwargs)
@MODELS.register_module
def vit_small_2d_patch16_384(**kwargs):
model_kwargs = dict(img_size=384, patch_size=16, embed_dim=384,
depth=12, num_heads=6, **kwargs)
return _create_vit_model(**model_kwargs)
@MODELS.register_module
def vit_small_2d_patch32_224(**kwargs):
model_kwargs = dict(patch_size=32, embed_dim=384,
depth=12, num_heads=6, **kwargs)
return _create_vit_model(**model_kwargs)
@MODELS.register_module
def vit_small_2d_patch32_384(**kwargs):
model_kwargs = dict(img_size=384, patch_size=32, embed_dim=384,
depth=12, num_heads=6, **kwargs)
return _create_vit_model(**model_kwargs)
@MODELS.register_module
def vit_base_2d_patch16_224(**kwargs):
model_kwargs = dict(patch_size=16, embed_dim=768,
depth=12, num_heads=12, **kwargs)
return _create_vit_model(**model_kwargs)
@MODELS.register_module
def vit_base_2d_patch16_384(**kwargs):
model_kwargs = dict(img_size=384, patch_size=16, embed_dim=768,
depth=12, num_heads=12, **kwargs)
return _create_vit_model(**model_kwargs)
@MODELS.register_module
def vit_base_2d_patch32_224(**kwargs):
model_kwargs = dict(patch_size=32, embed_dim=768,
depth=12, num_heads=12, **kwargs)
return _create_vit_model(**model_kwargs)
@MODELS.register_module
def vit_base_2d_patch32_384(**kwargs):
model_kwargs = dict(img_size=384, patch_size=32, embed_dim=768,
depth=12, num_heads=12, **kwargs)
return _create_vit_model(**model_kwargs)
@MODELS.register_module
def vit_large_2d_patch16_224(**kwargs):
model_kwargs = dict(patch_size=16, embed_dim=1024,
depth=24, num_heads=16, **kwargs)
return _create_vit_model(**model_kwargs)
@MODELS.register_module
def vit_large_2d_patch16_384(**kwargs):
model_kwargs = dict(img_size=384, patch_size=16, embed_dim=1024,
depth=24, num_heads=16, **kwargs)
return _create_vit_model(**model_kwargs)
@MODELS.register_module
def vit_large_2d_patch32_224(**kwargs):
model_kwargs = dict(patch_size=32, embed_dim=1024,
depth=24, num_heads=16, **kwargs)
return _create_vit_model(**model_kwargs)
@MODELS.register_module
def vit_large_2d_patch32_384(**kwargs):
model_kwargs = dict(img_size=384, patch_size=32, embed_dim=1024,
depth=24, num_heads=16, **kwargs)
return _create_vit_model(**model_kwargs)
import torch
from torch import nn
from colossalai import nn as col_nn
from colossalai.context import ParallelMode
from colossalai.registry import MODELS
__all__ = [
'VisionTransformer3D',
'vit_tiny_3d_patch4_32',
'vit_tiny_3d_patch16_224',
'vit_tiny_3d_patch16_384',
'vit_small_3d_patch16_224',
'vit_small_3d_patch16_384',
'vit_small_3d_patch32_224',
'vit_small_3d_patch32_384',
'vit_base_3d_patch16_224',
'vit_base_3d_patch16_384',
'vit_base_3d_patch32_224',
'vit_base_3d_patch32_384',
'vit_large_3d_patch16_224',
'vit_large_3d_patch16_384',
'vit_large_3d_patch32_224',
'vit_large_3d_patch32_384',
]
class ViTBlock3D(nn.Module):
def __init__(self,
dim: int,
num_heads: int,
hidden_dim: int,
drop: float = 0.,
attn_drop: float = 0.,
drop_path: float = 0.):
super().__init__()
self.norm1 = col_nn.LayerNorm3D(
dim, ParallelMode.PARALLEL_3D_INPUT, ParallelMode.PARALLEL_3D_WEIGHT, eps=1e-6)
self.attn = col_nn.ViTSelfAttention3D(dim, num_heads, attn_drop, drop)
self.drop_path = col_nn.VanillaViTDropPath(
drop_path) if drop_path > 0. else nn.Identity()
self.norm2 = col_nn.LayerNorm3D(dim, ParallelMode.PARALLEL_3D_INPUT, ParallelMode.PARALLEL_3D_WEIGHT, eps=1e-6)
self.mlp = col_nn.ViTMLP3D(hidden_dim, 1, drop, 'gelu')
def forward(self, x):
x = x + self.drop_path(self.attn(self.norm1(x)))
x = x + self.drop_path(self.mlp(self.norm2(x)))
return x
@MODELS.register_module
class VisionTransformer3D(nn.Module):
def __init__(self,
img_size: int = 224,
patch_size: int = 16,
in_chans: int = 3,
num_classes: int = 1000,
depth: int = 12,
num_heads: int = 12,
embed_dim: int = 768,
hidden_dim: int = 3072,
drop_rate: float = 0.,
attn_drop_rate: float = 0.,
drop_path_rate: float = 0.):
super().__init__()
self.num_classes = num_classes
self.num_features = self.embed_dim = embed_dim
self.patch_embed = col_nn.ViTPatchEmbedding3D(
img_size,
patch_size,
in_chans,
embed_dim,
drop_rate,
)
# stochastic depth decay rule
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]
self.blocks = nn.Sequential(*[
ViTBlock3D(embed_dim, num_heads, hidden_dim,
drop_rate, attn_drop_rate, dpr[i])
for i in range(depth)
])
self.norm = col_nn.LayerNorm3D(embed_dim, ParallelMode.PARALLEL_3D_INPUT,
ParallelMode.PARALLEL_3D_WEIGHT)
self.head = col_nn.ViTHead3D(hidden_dim, num_classes)
self.init_weights()
def init_weights(self):
pass
def forward(self, x):
x = self.patch_embed(x)
x = self.blocks(x)
x = self.norm(x)
x = self.head(x)
return x
def _create_vit_model(**model_kwargs):
model = VisionTransformer3D(**model_kwargs)
return model
@MODELS.register_module
def vit_tiny_3d_patch4_32(**kwargs):
model_kwargs = dict(img_size=32, patch_size=4, embed_dim=512,
depth=6, num_heads=8, hidden_dim=512, num_classes=10, **kwargs)
return _create_vit_model(**model_kwargs)
@MODELS.register_module
def vit_tiny_3d_patch16_224(**kwargs):
model_kwargs = dict(patch_size=16, embed_dim=192,
depth=12, num_heads=3, hidden_dim=768, **kwargs)
return _create_vit_model(**model_kwargs)
@MODELS.register_module
def vit_tiny_3d_patch16_384(**kwargs):
model_kwargs = dict(img_size=384, patch_size=16,
embed_dim=192, depth=12, num_heads=3, hidden_dim=768, **kwargs)
return _create_vit_model(**model_kwargs)
@MODELS.register_module
def vit_small_3d_patch16_224(**kwargs):
model_kwargs = dict(patch_size=16, embed_dim=384,
depth=12, num_heads=6, hidden_dim=1536, **kwargs)
return _create_vit_model(**model_kwargs)
@MODELS.register_module
def vit_small_3d_patch16_384(**kwargs):
model_kwargs = dict(img_size=384, patch_size=16,
embed_dim=384, depth=12, num_heads=6, hidden_dim=1536, **kwargs)
return _create_vit_model(**model_kwargs)
@MODELS.register_module
def vit_small_3d_patch32_224(**kwargs):
model_kwargs = dict(patch_size=32, embed_dim=384,
depth=12, num_heads=6, hidden_dim=1536, **kwargs)
return _create_vit_model(**model_kwargs)
@MODELS.register_module
def vit_small_3d_patch32_384(**kwargs):
model_kwargs = dict(img_size=384, patch_size=32,
embed_dim=384, depth=12, num_heads=6, hidden_dim=1536, **kwargs)
return _create_vit_model(**model_kwargs)
@MODELS.register_module
def vit_base_3d_patch16_224(**kwargs):
model_kwargs = dict(patch_size=16, embed_dim=768,
depth=12, num_heads=12, hidden_dim=3072, **kwargs)
return _create_vit_model(**model_kwargs)
@MODELS.register_module
def vit_base_3d_patch16_384(**kwargs):
model_kwargs = dict(img_size=384, patch_size=16,
embed_dim=768, depth=12, num_heads=12, hidden_dim=3072, **kwargs)
return _create_vit_model(**model_kwargs)
@MODELS.register_module
def vit_base_3d_patch32_224(**kwargs):
model_kwargs = dict(patch_size=32, embed_dim=768,
depth=12, num_heads=12, hidden_dim=3072, **kwargs)
return _create_vit_model(**model_kwargs)
@MODELS.register_module
def vit_base_3d_patch32_384(**kwargs):
model_kwargs = dict(img_size=384, patch_size=32,
embed_dim=768, depth=12, num_heads=12, hidden_dim=3072, **kwargs)
return _create_vit_model(**model_kwargs)
@MODELS.register_module
def vit_large_3d_patch16_224(**kwargs):
model_kwargs = dict(patch_size=16, embed_dim=1024,
depth=24, num_heads=16, hidden_dim=4096, **kwargs)
return _create_vit_model(**model_kwargs)
@MODELS.register_module
def vit_large_3d_patch16_384(**kwargs):
model_kwargs = dict(img_size=384, patch_size=16,
embed_dim=1024, depth=24, num_heads=16, hidden_dim=4096, **kwargs)
return _create_vit_model(**model_kwargs)
@MODELS.register_module
def vit_large_3d_patch32_224(**kwargs):
model_kwargs = dict(patch_size=32, embed_dim=1024,
depth=24, num_heads=16, hidden_dim=4096, **kwargs)
return _create_vit_model(**model_kwargs)
@MODELS.register_module
def vit_large_3d_patch32_384(**kwargs):
model_kwargs = dict(img_size=384, patch_size=32,
embed_dim=1024, depth=24, num_heads=16, hidden_dim=4096, **kwargs)
return _create_vit_model(**model_kwargs)
[pytest]
markers =
cpu: tests which can run on CPU
    gpu: tests which require a single GPU
dist: tests which are run in a multi-GPU or multi-machine environment
experiment: tests for experimental features
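# run a subset by marker, e.g.: pytest -m cpu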
pytest
rpyc
matplotlib
torch>=1.8
torchvision>=0.9
numpy
tqdm
psutil
tensorboardX
packaging
#!/usr/bin/env sh
main_file=$1
config_file=$2
python $main_file --local_rank $SLURM_PROCID --world_size $SLURM_NPROCS --host $HOST --port 29500 --config $config_file
# how to run this script
# example:
# HOST=IP_ADDR srun ./scripts/slurm_dist_train.sh ./examples/train_vit_2d.py ./configs/vit/vit_2d.py
import os
import subprocess
import sys
import warnings
import torch
from setuptools import setup, find_packages
from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME
# ninja build does not work unless include_dirs are abs path
this_dir = os.path.dirname(os.path.abspath(__file__))
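# parse the CUDA toolkit version from the output of `nvcc -V` (the "release X.Y" field)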
def get_cuda_bare_metal_version(cuda_dir):
raw_output = subprocess.check_output(
[cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True)
output = raw_output.split()
release_idx = output.index("release") + 1
release = output[release_idx].split(".")
bare_metal_major = release[0]
bare_metal_minor = release[1][0]
return raw_output, bare_metal_major, bare_metal_minor
if not torch.cuda.is_available():
# https://github.com/NVIDIA/apex/issues/486
# Extension builds after https://github.com/pytorch/pytorch/pull/23408 attempt to query torch.cuda.get_device_capability(),
# which will fail if you are compiling in an environment without visible GPUs (e.g. during an nvidia-docker build command).
print('\nWarning: Torch did not find available GPUs on this system.\n',
'If your intention is to cross-compile, this is not an error.\n'
'By default, Apex will cross-compile for Pascal (compute capabilities 6.0, 6.1, 6.2),\n'
'Volta (compute capability 7.0), Turing (compute capability 7.5),\n'
'and, if the CUDA version is >= 11.0, Ampere (compute capability 8.0).\n'
'If you wish to cross-compile for a single specific architecture,\n'
'export TORCH_CUDA_ARCH_LIST="compute capability" before running setup.py.\n')
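    # example: export TORCH_CUDA_ARCH_LIST="8.0" to build only for a single compute capability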
if os.environ.get("TORCH_CUDA_ARCH_LIST", None) is None:
_, bare_metal_major, _ = get_cuda_bare_metal_version(CUDA_HOME)
if int(bare_metal_major) == 11:
os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5;8.0"
else:
os.environ["TORCH_CUDA_ARCH_LIST"] = "6.0;6.1;6.2;7.0;7.5"
print("\n\ntorch.__version__ = {}\n\n".format(torch.__version__))
TORCH_MAJOR = int(torch.__version__.split('.')[0])
TORCH_MINOR = int(torch.__version__.split('.')[1])
if TORCH_MAJOR == 0 and TORCH_MINOR < 4:
raise RuntimeError("Apex requires Pytorch 0.4 or newer.\n" +
"The latest stable release can be obtained from https://pytorch.org/")
cmdclass = {}
ext_modules = []
extras = {}
if "--pyprof" in sys.argv:
string = "\n\nPyprof has been moved to its own dedicated repository and will " + \
"soon be removed from Apex. Please visit\n" + \
"https://github.com/NVIDIA/PyProf\n" + \
"for the latest version."
warnings.warn(string, DeprecationWarning)
with open('requirements.txt') as f:
required_packages = f.read().splitlines()
extras['pyprof'] = required_packages
try:
sys.argv.remove("--pyprof")
except:
pass
else:
warnings.warn(
"Option --pyprof not specified. Not installing PyProf dependencies!")
if "--cuda_ext" in sys.argv:
if TORCH_MAJOR == 0:
raise RuntimeError("--cuda_ext requires Pytorch 1.0 or later, "
"found torch.__version__ = {}".format(torch.__version__))
def check_cuda_torch_binary_vs_bare_metal(cuda_dir):
raw_output, bare_metal_major, bare_metal_minor = get_cuda_bare_metal_version(
cuda_dir)
torch_binary_major = torch.version.cuda.split(".")[0]
torch_binary_minor = torch.version.cuda.split(".")[1]
print("\nCompiling cuda extensions with")
print(raw_output + "from " + cuda_dir + "/bin\n")
if (bare_metal_major != torch_binary_major) or (bare_metal_minor != torch_binary_minor):
raise RuntimeError("Cuda extensions are being compiled with a version of Cuda that does " +
"not match the version used to compile Pytorch binaries. " +
"Pytorch binaries were compiled with Cuda {}.\n".format(torch.version.cuda) +
"In some cases, a minor-version mismatch will not cause later errors: " +
"https://github.com/NVIDIA/apex/pull/323#discussion_r287021798. "
"You can try commenting out this check (at your own risk).")
# Set up macros for forward/backward compatibility hack around
# https://github.com/pytorch/pytorch/commit/4404762d7dd955383acee92e6f06b48144a0742e
# and
# https://github.com/NVIDIA/apex/issues/456
# https://github.com/pytorch/pytorch/commit/eb7b39e02f7d75c26d8a795ea8c7fd911334da7e#diff-4632522f237f1e4e728cb824300403ac
version_ge_1_1 = []
if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0):
version_ge_1_1 = ['-DVERSION_GE_1_1']
version_ge_1_3 = []
if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 2):
version_ge_1_3 = ['-DVERSION_GE_1_3']
version_ge_1_5 = []
if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 4):
version_ge_1_5 = ['-DVERSION_GE_1_5']
version_dependent_macros = version_ge_1_1 + version_ge_1_3 + version_ge_1_5
if "--cuda_ext" in sys.argv:
sys.argv.remove("--cuda_ext")
if CUDA_HOME is None:
raise RuntimeError(
"--cuda_ext was requested, but nvcc was not found. Are you sure your environment has nvcc available? If you're installing within a container from https://hub.docker.com/r/pytorch/pytorch, only images whose names contain 'devel' will provide nvcc.")
else:
check_cuda_torch_binary_vs_bare_metal(CUDA_HOME)
ext_modules.append(
CUDAExtension(name='colossal_C',
sources=['csrc/colossal_C_frontend.cpp',
'csrc/multi_tensor_sgd_kernel.cu',
'csrc/multi_tensor_scale_kernel.cu',
'csrc/multi_tensor_adam.cu',
'csrc/multi_tensor_l2norm_kernel.cu',
'csrc/multi_tensor_lamb.cu'],
extra_compile_args={'cxx': ['-O3'] + version_dependent_macros,
'nvcc': ['-lineinfo',
'-O3',
# '--resource-usage',
'--use_fast_math'] + version_dependent_macros}))
# Check, if ATen/CUDAGenerator.h is found, otherwise use the new ATen/CUDAGeneratorImpl.h, due to breaking change in https://github.com/pytorch/pytorch/pull/36026
generator_flag = []
torch_dir = torch.__path__[0]
if os.path.exists(os.path.join(torch_dir, 'include', 'ATen', 'CUDAGenerator.h')):
generator_flag = ['-DOLD_GENERATOR']
def fetch_requirements(path):
with open(path, 'r') as fd:
return [r.strip() for r in fd.readlines()]
install_requires = fetch_requirements('requirements/requirements.txt')
setup(
name='colossal-ai',
version='0.0.1-beta',
    packages=find_packages(exclude=('csrc',
                                    'tests',
                                    'docs',
                                    '*.egg-info',)),
description='An integrated large-scale model training framework with efficient parallelization techniques',
ext_modules=ext_modules,
cmdclass={'build_ext': BuildExtension} if ext_modules else {},
extras_require=extras,
install_requires=install_requires,
)
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import os
from pathlib import Path
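# the dataset root is read from the DATA environment variable, e.g. export DATA=/path/to/datasets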
train_data = dict(
dataset=dict(
type='CIFAR10Dataset',
root=Path(os.environ['DATA']),
download=True,
transform_pipeline=[
dict(type='RandomResizedCrop', size=224),
dict(type='RandomHorizontalFlip'),
dict(type='ToTensor'),
dict(type='Normalize', mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
]
),
dataloader=dict(
batch_size=64,
pin_memory=True,
num_workers=4,
sampler=dict(
type='DataParallelSampler',
shuffle=True,
)
)
)
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from pathlib import Path
import pytest
from colossalai.context.config import Config
@pytest.mark.cpu
def test_load_config():
filename = Path(__file__).parent.joinpath('sample_config.py')
config = Config.from_file(filename)
assert config.train_data, 'cannot access train data as attribute'
assert config.train_data.dataset, 'cannot access grandchild attribute'
    assert isinstance(config.train_data.dataset.transform_pipeline[0], dict), \
        f'expected elements of transform_pipeline to be dicts, but found {type(config.train_data.dataset.transform_pipeline[0])}'
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
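# 2-D tensor parallelism (tensor size 4, i.e. a 2 x 2 grid) combined with a 2-stage pipeline;
# with 16 processes this presumably leaves a data-parallel size of 2, as exercised by test_2d_init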
parallel = dict(
pipeline=dict(size=2),
tensor=dict(
size=4,
mode='2d'
)
)
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
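# 2.5-D tensor parallelism: 8 GPUs split into depth=2 layers of 2 x 2 grids (assumed layout)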
parallel = dict(
pipeline=dict(size=2),
tensor=dict(
size=8,
depth=2,
mode='2.5d'
)
)
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
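# 3-D tensor parallelism: 8 GPUs arranged as a 2 x 2 x 2 cube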
parallel = dict(
pipeline=dict(size=2),
tensor=dict(
size=8,
mode='3d'
)
)
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from functools import partial
from pathlib import Path
import pytest
import torch.multiprocessing as mp
from colossalai import init_dist
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
CONFIG_PATH = Path(__file__).parent.joinpath('configs/parallel_2d_init.py').absolute()
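# expected layout for 16 processes with the 2-D parallel config (pipeline size 2, tensor size 4):
# data-parallel size 2 (outermost), 2 pipeline stages, and a 2 x 2 tensor grid of row/column groups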
def check_data_parallel_rank(rank):
if rank in [0, 1, 2, 3, 4, 5, 6, 7]:
assert gpc.get_local_rank(ParallelMode.DATA) == 0
elif rank in [8, 9, 10, 11, 12, 13, 14, 15]:
assert gpc.get_local_rank(ParallelMode.DATA) == 1
def check_pipeline_parallel_rank(rank):
if rank in [0, 1, 2, 3]:
assert gpc.get_local_rank(ParallelMode.PIPELINE) == 0
elif rank in [4, 5, 6, 7]:
assert gpc.get_local_rank(ParallelMode.PIPELINE) == 1
elif rank in [8, 9, 10, 11]:
assert gpc.get_local_rank(ParallelMode.PIPELINE) == 0
elif rank in [12, 13, 14, 15]:
assert gpc.get_local_rank(ParallelMode.PIPELINE) == 1
def check_tensor_parallel_rank(rank):
if rank in [0, 4, 8, 12]:
assert gpc.get_local_rank(ParallelMode.TENSOR) == 0
elif rank in [1, 5, 9, 13]:
assert gpc.get_local_rank(ParallelMode.TENSOR) == 1
elif rank in [2, 6, 10, 14]:
assert gpc.get_local_rank(ParallelMode.TENSOR) == 2
elif rank in [3, 7, 11, 15]:
assert gpc.get_local_rank(ParallelMode.TENSOR) == 3
def check_2d_parallel_rank(rank):
if rank in [0, 4, 8, 12]:
assert gpc.get_local_rank(ParallelMode.PARALLEL_2D_COL) == 0
assert gpc.get_local_rank(ParallelMode.PARALLEL_2D_ROW) == 0
elif rank in [1, 5, 9, 13]:
assert gpc.get_local_rank(ParallelMode.PARALLEL_2D_COL) == 0
assert gpc.get_local_rank(ParallelMode.PARALLEL_2D_ROW) == 1
elif rank in [2, 6, 10, 14]:
assert gpc.get_local_rank(ParallelMode.PARALLEL_2D_COL) == 1
assert gpc.get_local_rank(ParallelMode.PARALLEL_2D_ROW) == 0
elif rank in [3, 7, 11, 15]:
assert gpc.get_local_rank(ParallelMode.PARALLEL_2D_COL) == 1
assert gpc.get_local_rank(ParallelMode.PARALLEL_2D_ROW) == 1
def init_2d(local_rank, world_size, backend, port, host):
dist_args = dict(
config=CONFIG_PATH,
local_rank=local_rank,
world_size=world_size,
backend=backend,
port=port,
host=host
)
init_dist(**dist_args)
check_tensor_parallel_rank(local_rank)
check_data_parallel_rank(local_rank)
check_2d_parallel_rank(local_rank)
check_pipeline_parallel_rank(local_rank)
gpc.destroy()
@pytest.mark.cpu
def test_2d_init():
"""
As no computation or communication is done, we can run this test on CPU.
"""
world_size = 16
test_fn = partial(init_2d,
world_size=world_size,
backend='gloo',
port='29500',
host='localhost'
)
mp.spawn(test_fn, nprocs=world_size)
if __name__ == '__main__':
test_2d_init()