"deploy/dynemo/api-server/api/schemas/label.go" did not exist on "5ddc7f7df5ab77c4efae9fd6ca299c3040c91533"
Unverified Commit 27061426 authored by Hongxin Liu, committed by GitHub

[gemini] improve compatibility and add static placement policy (#4479)

* [gemini] remove distributed-related part from colotensor (#4379)

* [gemini] remove process group dependency

* [gemini] remove tp part from colo tensor

* [gemini] patch inplace op

* [gemini] fix param op hook and update tests

* [test] remove useless tests

* [test] remove useless tests

* [misc] fix requirements

* [test] fix model zoo

* [test] fix model zoo

* [test] fix model zoo

* [test] fix model zoo

* [test] fix model zoo

* [misc] update requirements

* [gemini] refactor gemini optimizer and gemini ddp (#4398)

* [gemini] update optimizer interface

* [gemini] renaming gemini optimizer

* [gemini] refactor gemini ddp class

* [example] update gemini related example

* [example] update gemini related example

* [plugin] fix gemini plugin args

* [test] update gemini ckpt tests

* [gemini] fix checkpoint io

* [example] fix opt example requirements

* [example] fix opt example

* [example] fix opt example

* [example] fix opt example

* [gemini] add static placement policy (#4443)

* [gemini] add static placement policy

* [gemini] fix param offload

* [test] update gemini tests

* [plugin] update gemini plugin

* [plugin] update gemini plugin docstr

* [misc] fix flash attn requirement

* [test] fix gemini checkpoint io test

* [example] update resnet example result (#4457)

* [example] update bert example result (#4458)

* [doc] update gemini doc (#4468)

* [example] update gemini related examples (#4473)

* [example] update gpt example

* [example] update dreambooth example

* [example] update vit

* [example] update opt

* [example] update palm

* [example] update vit and opt benchmark

* [hotfix] fix bert in model zoo (#4480)

* [hotfix] fix bert in model zoo

* [test] remove chatglm gemini test

* [test] remove sam gemini test

* [test] remove vit gemini test

* [hotfix] fix opt tutorial example (#4497)

* [hotfix] fix opt tutorial example

* [hotfix] fix opt tutorial example
parent 285fe7ba
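
For context, a minimal sketch of the static placement policy this PR introduces (illustrative values; the config keys mirror the MODEL_PLACEMENT_CONFIGS/OPTIM_PLACEMENT_CONFIGS test configs further down in this diff, and the snippet assumes an already-initialized distributed backend):

import torch
import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin
from colossalai.nn.optimizer import HybridAdam

colossalai.launch_from_torch(config={})    # assumes torchrun-style env vars are set

model = torch.nn.Linear(8, 8)
optimizer = HybridAdam(model.parameters(), lr=1e-3)

# 'static' fixes what fraction of parameters stays sharded and what fraction of
# optimizer states is offloaded, replacing the old 'cuda'/'cpu' heuristics.
# Per the test configs below: shard_param_frac=0.0 behaves like ZeRO-2, 1.0 like ZeRO-3.
plugin = GeminiPlugin(placement_policy='static',
                      shard_param_frac=0.0,
                      offload_optim_frac=0.5,
                      precision='fp16',
                      initial_scale=2**14)
booster = Booster(plugin=plugin)
model, optimizer, *_ = booster.boost(model, optimizer)
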
@@ -57,6 +57,12 @@ def data_gen_for_sequence_classification():
     return data


+def date_gen_for_double_heads():
+    data = data_gen_for_lm()
+    data['mc_labels'] = torch.zeros(data['input_ids'].shape[0], dtype=torch.int64)
+    return data
+
+
 # define output transform function
 output_transform_fn = lambda x: x
@@ -94,8 +100,8 @@ model_zoo.register(name='transformers_gpt_lm',
                    model_attribute=ModelAttribute(has_control_flow=True))
 model_zoo.register(name='transformers_gpt_double_heads',
                    model_fn=lambda: transformers.GPT2DoubleHeadsModel(config),
-                   data_gen_fn=data_gen_for_lm,
-                   output_transform_fn=output_transform_fn,
+                   data_gen_fn=date_gen_for_double_heads,
+                   output_transform_fn=lambda x: dict(loss=x.loss + x.mc_loss),
                    loss_fn=loss_fn,
                    model_attribute=ModelAttribute(has_control_flow=True))
 model_zoo.register(name='transformers_gpt_for_question_answering',
@@ -12,19 +12,16 @@ from colossalai.lazy.lazy_init import LazyInitContext
 from colossalai.nn.optimizer import HybridAdam
 from colossalai.tensor.colo_parameter import ColoParameter
 from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
-from colossalai.zero import ColoInitContext
 from tests.kit.model_zoo import model_zoo


 def run_fn(init_method, model_fn, data_gen_fn, output_transform_fn) -> Optional[str]:
     try:
-        if init_method == 'colo':
-            ctx = ColoInitContext()
-        elif init_method == 'lazy':
+        if init_method == 'lazy':
             ctx = LazyInitContext()
         else:
             ctx = nullcontext()
-        plugin = GeminiPlugin(placement_policy='cuda', strict_ddp_mode=True, max_norm=1.0, initial_scale=2**5)
+        plugin = GeminiPlugin(max_norm=1.0, initial_scale=2**5)
         booster = Booster(plugin=plugin)
         with ctx:
             model = model_fn()
@@ -50,6 +47,7 @@ def run_fn(init_method, model_fn, data_gen_fn, output_transform_fn) -> Optional[str]:
         optimizer.step()

     except Exception as e:
+        # raise e
         return repr(e)
@@ -57,8 +55,9 @@ def run_fn(init_method, model_fn, data_gen_fn, output_transform_fn) -> Optional[str]:

 # @parameterize('init_method', ['lazy', 'none', 'colo'])
+@parameterize('subset', ['torchvision', 'transformers', 'diffusers'])
 @parameterize('init_method', ['none'])
-def check_gemini_plugin(init_method: str = 'none', early_stop: bool = True):
+def check_gemini_plugin(subset: str, init_method: str = 'none', early_stop: bool = True):
     """check gemini plugin over model zoo

     Args:
@@ -71,29 +70,23 @@ def check_gemini_plugin(init_method: str = 'none', early_stop: bool = True):
     passed_models = []
     failed_info = {}    # (model_name, error) pair

-    for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in model_zoo.items():
+    for name, (model_fn, data_gen_fn, output_transform_fn, _, _) in model_zoo.get_sub_registry(subset).items():
         # These models lead to CUDA error
         if name in ('diffusers_auto_encoder_kl', 'diffusers_vq_model', 'diffusers_unet2d_model', 'timm_resmlp',
-                    'timm_gmixer_12_224', 'timm_gmlp_b16_224', 'timm_mixer_b16_224', 'timm_convnext'):
+                    'timm_gmixer_12_224', 'timm_gmlp_b16_224', 'timm_mixer_b16_224', 'timm_convnext',
+                    'torchvision_convnext_base'):
             continue
         # These models are not compatible with gemini
         if name in [
-                'diffusers_clip_vision_model', 'timm_resnet', 'timm_beit', 'timm_beitv2', 'timm_eca_nfnet',
-                'timm_efficientformer', 'timm_hrnet_w18_small', 'timm_nf_ecaresnet101', 'timm_nf_regnet_b0',
-                'timm_skresnet18', 'timm_wide_resnet50_2', 'timm_convit', 'timm_dm_nfnet', 'timm_swin_transformer',
-                'torchaudio_conformer', 'torchaudio_deepspeech', 'torchaudio_wavernn', 'torchaudio_tacotron',
-                'deepfm_interactionarch', 'deepfm_simpledeepfmnn', 'dlrm', 'dlrm_interactionarch',
-                'torchvision_googlenet', 'torchvision_inception_v3', 'torchvision_mobilenet_v3_small',
-                'torchvision_resnet18', 'torchvision_resnext50_32x4d', 'torchvision_wide_resnet50_2',
-                'torchvision_vit_b_16', 'torchvision_convnext_base', 'torchvision_swin_s', 'transformers_albert',
-                'transformers_albert_for_pretraining', 'transformers_bert', 'transformers_bert_for_pretraining',
-                'transformers_gpt_double_heads', 'torchaudio_hubert_base', 'torchaudio_wav2vec2_base',
-                'transformers_t5_for_conditional_generation', 'transformers_t5', 'transformers_t5_encoder_model',
-                'transformers_vit', 'transformers_vit_for_masked_image_modeling',
-                'transformers_vit_for_image_classification', 'transformers_chatglm',
-                'transformers_chatglm_for_conditional_generation', 'transformers_blip2',
-                'transformers_blip2_conditional_gerneration', 'transformers_sam', 'transformers_whisper',
-                'transformers_whisper_for_conditional_generation', 'transformers_whisper_for_audio_classification'
+                'timm_convit',
+                'timm_dm_nfnet',
+                'torchvision_vit_b_16',
+                'transformers_t5',
+                'transformers_t5_for_conditional_generation',
+                'transformers_t5_encoder_model',    # does not support apex rmsnorm
+                'transformers_chatglm',
+                'transformers_sam',
+                'transformers_vit'
         ]:
             continue
@@ -105,7 +98,6 @@ def check_gemini_plugin(init_method: str = 'none', early_stop: bool = True):
         ]:
             continue

         err = run_fn(init_method, model_fn, data_gen_fn, output_transform_fn)
-        torch.cuda.empty_cache()
         if err is None:
             passed_models.append(name)
@@ -18,12 +18,45 @@ from colossalai.testing import (
 )
 from tests.kit.model_zoo import model_zoo

+MODEL_PLACEMENT_CONFIGS = [
+    {
+        'placement_policy': 'static',
+        'shard_param_frac': 0.0
+    },    # zero2
+    {
+        'placement_policy': 'static',
+        'shard_param_frac': 1.0
+    },    # zero3
+    {
+        'placement_policy': 'static',
+        'shard_param_frac': 0.5
+    },    # zero3-half
+]
+
+OPTIM_PLACEMENT_CONFIGS = [
+    {
+        'placement_policy': 'static',
+        'shard_param_frac': 0.0,
+        'offload_optim_frac': 0.0
+    },    # zero2
+    {
+        'placement_policy': 'static',
+        'shard_param_frac': 0.0,
+        'offload_optim_frac': 1.0
+    },    # zero2-offload
+    {
+        'placement_policy': 'static',
+        'shard_param_frac': 0.0,
+        'offload_optim_frac': 0.5
+    },    # zero2-offload-half
+]
+

 @clear_cache_before_run()
-@parameterize('placement_policy', ['cuda', 'cpu'])
+@parameterize('placement_config', MODEL_PLACEMENT_CONFIGS)
 @parameterize('model_name', ['transformers_bert_for_sequence_classification'])
 @parameterize('use_safetensors', [False, True])
-def exam_state_dict_with_origin(placement_policy, model_name, use_safetensors: bool):
+def exam_state_dict_with_origin(placement_config, model_name, use_safetensors: bool):
     from transformers import BertForSequenceClassification
     (model_fn, data_gen_fn, output_transform_fn, _, _) = next(iter(model_zoo.get_sub_registry(model_name).values()))
     bert_model = model_fn()
@@ -32,7 +65,7 @@ def exam_state_dict_with_origin(placement_policy, model_name, use_safetensors: bool):
         pretrained_path = os.path.join(tempdir, 'pretrained')
         bert_model.config.save_pretrained(save_directory=pretrained_path)

-        plugin = GeminiPlugin(placement_policy=placement_policy)
+        plugin = GeminiPlugin(**placement_config)
         booster = Booster(plugin=plugin)
         bert_model, _, _, _, _ = booster.boost(bert_model)
         model_size = sum(p.numel() * p.element_size() for p in bert_model.parameters()) / 1024**2
@@ -46,19 +79,19 @@ def exam_state_dict_with_origin(placement_policy, model_name, use_safetensors: bool):
         dist.barrier()

         new_bert_model = BertForSequenceClassification.from_pretrained(pretrained_path)
-        check_state_dict_equal(bert_model.unwrap().state_dict(only_rank_0=False, dtype=torch.float32),
+        check_state_dict_equal(bert_model.state_dict(only_rank_0=False, dtype=torch.float32),
                                new_bert_model.state_dict(), False)


 @clear_cache_before_run()
-@parameterize('placement_policy', ['cuda', 'cpu'])
+@parameterize('placement_config', OPTIM_PLACEMENT_CONFIGS)
 @parameterize('shard', [False, True])
 @parameterize('model_name', ['transformers_gpt'])
 @parameterize('size_per_shard', [32])
-def exam_state_dict(placement_policy, shard: bool, model_name: str, size_per_shard: int):
+def exam_state_dict(placement_config, shard: bool, model_name: str, size_per_shard: int):
     (model_fn, data_gen_fn, output_transform_fn, _, _) = next(iter(model_zoo.get_sub_registry(model_name).values()))
     criterion = lambda x: x.mean()
-    plugin = GeminiPlugin(placement_policy=placement_policy, precision="fp16", initial_scale=(2**14))
+    plugin = GeminiPlugin(**placement_config, precision="fp16", initial_scale=(2**14))
     booster = Booster(plugin=plugin)

     model = model_fn()
@@ -87,12 +120,11 @@ def exam_state_dict(placement_policy, shard: bool, model_name: str, size_per_shard: int):
         dist.barrier()

         booster.load_model(new_model, model_ckpt_path)
-        check_state_dict_equal(model.unwrap().state_dict(only_rank_0=False),
-                               new_model.unwrap().state_dict(only_rank_0=False), False)
+        check_state_dict_equal(model.state_dict(only_rank_0=False), new_model.state_dict(only_rank_0=False), False)

         booster.load_optimizer(new_optimizer, optimizer_ckpt_path)
-        check_state_dict_equal(optimizer.unwrap().state_dict(only_rank_0=False),
-                               new_optimizer.unwrap().state_dict(only_rank_0=False), False)
+        check_state_dict_equal(optimizer.state_dict(only_rank_0=False), new_optimizer.state_dict(only_rank_0=False),
+                               False)

         # Check the new model/optimizer can successfully run.
         data = data_gen_fn()
@@ -60,12 +60,11 @@ def exam_torch_load_from_gemini(shard: bool, model_name: str):
         new_booster.load_model(new_model, model_ckpt_path, strict=True)

         # Add prefix to get aligned with pytorch parameter names.
-        check_state_dict_equal(
-            model.unwrap().state_dict(only_rank_0=False, prefix='module.module.', dtype=torch.float32),
-            new_model.state_dict(), False)
+        check_state_dict_equal(model.state_dict(only_rank_0=False, prefix='module.module.', dtype=torch.float32),
+                               new_model.state_dict(), False)

         new_booster.load_optimizer(new_optimizer, optimizer_ckpt_path)
-        check_state_dict_equal(optimizer.unwrap().state_dict(only_rank_0=False), new_optimizer.state_dict(), False)
+        check_state_dict_equal(optimizer.state_dict(only_rank_0=False), new_optimizer.state_dict(), False)

         # Check the new model/optimizer can successfully run.
         data = data_gen_fn()
@@ -124,13 +123,12 @@ def exam_gemini_load_from_torch(shard: bool, model_name: str):
         new_booster.load_model(new_model, model_ckpt_path, strict=True)

         # Add prefix to get aligned with pytorch parameter names.
-        check_state_dict_equal(
-            new_model.unwrap().state_dict(only_rank_0=False, prefix='module.module.', dtype=torch.float32),
-            model.state_dict(), False)
+        check_state_dict_equal(new_model.state_dict(only_rank_0=False, prefix='module.module.', dtype=torch.float32),
+                               model.state_dict(), False)

         new_booster.load_optimizer(new_optimizer, optimizer_ckpt_path)
         old_state_dict = optimizer.state_dict()
-        new_state_dict = new_optimizer.unwrap().state_dict(only_rank_0=False)
+        new_state_dict = new_optimizer.state_dict(only_rank_0=False)

         # Comparison of param_groups needs special care here,
         # since not all hyperparameters in Adam are used by HybridAdam
@@ -138,7 +136,7 @@ def exam_gemini_load_from_torch(shard: bool, model_name: str):
         for old_group, new_group in zip(old_state_dict['param_groups'], new_state_dict['param_groups']):
             for k in hyperparameters_to_examine:
                 assert k in old_group and k in new_group, \
-                    f"Old group's keys: {list(old_group.keys())}, New group's keys: {list(new_group.keys())}"
+                       f"Old group's keys: {list(old_group.keys())}, New group's keys: {list(new_group.keys())}"
                 assert old_group[k] == new_group[k]

         check_state_dict_equal(old_state_dict['state'], new_state_dict['state'], False)
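
The recurring change in these checkpoint-IO tests is dropping .unwrap(): after the GeminiDDP/optimizer refactor in this PR, the boosted model and optimizer expose the extended state_dict interface directly. A before/after sketch (illustrative):

# before: the boosted wrapper had to be unwrapped first
state = model.unwrap().state_dict(only_rank_0=False, dtype=torch.float32)
# after: call state_dict on the boosted model directly
state = model.state_dict(only_rank_0=False, dtype=torch.float32)
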
import os
from pathlib import Path
import pytest
import torch
from torchvision import transforms
from torchvision.datasets import CIFAR10
import colossalai
from colossalai.amp import AMP_TYPE
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.engine.schedule._pipeline_schedule_v2 import PipelineScheduleV2
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.nn import CrossEntropyLoss
from colossalai.nn.lr_scheduler import CosineAnnealingWarmupLR
from colossalai.pipeline.pipelinable import PipelinableContext
from colossalai.testing import rerun_if_address_is_in_use, spawn
from colossalai.trainer import Trainer, hooks
from colossalai.utils import get_dataloader
disable_existing_loggers()
BATCH_SIZE = 4
NUM_EPOCHS = 10
WARMUP_EPOCHS = 5
CONFIG = dict(NUM_MICRO_BATCHES=2,
parallel=dict(pipeline=2, tensor=dict(size=1, mode='1d')),
fp16=dict(mode=AMP_TYPE.NAIVE),
gradient_accumulation=2)
def run_trainer(rank, world_size, port):
disable_existing_loggers()
colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
disable_existing_loggers()
# get logger
logger = get_dist_logger()
pipelinable = PipelinableContext()
try:
from titans.model.vit import vit_tiny_patch4_32
except ImportError:
logger.warning('skip the test_cifar_with_data_pipeline_tensor test because titan is not installed')
logger.warning('please install titan from https://github.com/hpcaitech/Titans')
return
with pipelinable:
model = vit_tiny_patch4_32()
pipelinable.to_layer_list()
pipelinable.policy = "uniform"
model = pipelinable.partition(1, gpc.pipeline_parallel_size, gpc.get_local_rank(ParallelMode.PIPELINE))
# create dataloaders
root = Path(os.environ['DATA'])
transform_train = transforms.Compose([
transforms.RandomCrop(32, padding=4, pad_if_needed=True),
transforms.AutoAugment(policy=transforms.AutoAugmentPolicy.CIFAR10),
transforms.ToTensor(),
transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])
train_dataset = CIFAR10(root=root, train=True, download=True, transform=transform_train)
train_dataloader = get_dataloader(dataset=train_dataset, shuffle=True, batch_size=BATCH_SIZE, pin_memory=True)
# create loss function
criterion = CrossEntropyLoss(label_smoothing=0.1)
# create optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=0)
# create lr scheduler
lr_scheduler = CosineAnnealingWarmupLR(optimizer=optimizer, total_steps=NUM_EPOCHS, warmup_steps=WARMUP_EPOCHS)
# initialize
engine, train_dataloader, *_ = colossalai.initialize(model=model,
optimizer=optimizer,
criterion=criterion,
train_dataloader=train_dataloader)
engine._schedule = PipelineScheduleV2(num_microbatches=gpc.config.NUM_MICRO_BATCHES)
logger = get_dist_logger()
trainer = Trainer(engine=engine, logger=logger)
hook_list = [
hooks.LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=False),
]
trainer.fit(train_dataloader=train_dataloader,
max_steps=2,
epochs=NUM_EPOCHS,
hooks=hook_list,
display_progress=True)
@pytest.mark.dist
@rerun_if_address_is_in_use()
def test_hybrid_parallel():
spawn(run_trainer, 2)
disable_existing_loggers()
if __name__ == '__main__':
test_hybrid_parallel()
import os
import random
from typing import Callable, Type
import numpy as np
import pytest
import torch
import torch.distributed as dist
import colossalai
from colossalai.nn.parallel import ColoDDP
from colossalai.tensor import ProcessGroup
from colossalai.testing import rerun_if_address_is_in_use, spawn
from colossalai.utils.cuda import get_current_device
from colossalai.zero import ColoInitContext, ZeroDDP
from colossalai.zero.gemini.chunk import ChunkManager, search_chunk_configuration
from colossalai.zero.gemini.gemini_mgr import GeminiManager
def set_seed(seed):
random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
def init_ddp(module: torch.nn.Module) -> ColoDDP:
pg = ProcessGroup()
return ColoDDP(module, process_group=pg)
def init_ddpv2(module: torch.nn.Module) -> ZeroDDP:
chunk_config, *_ = search_chunk_configuration(module, 4, 1024)
chunk_manager = ChunkManager(chunk_config)
gemini_manager = GeminiManager('cuda', chunk_manager)
return ZeroDDP(module, gemini_manager)
class Net(torch.nn.Module):
def __init__(self) -> None:
super().__init__()
self.fc1 = torch.nn.Linear(3, 3, bias=False)
self.fc2 = torch.nn.Linear(3, 1, bias=False)
def forward(self, x):
return self.fc2(self.fc1(x))
def run_fwd_bwd(ddp_cls: Type[ColoDDP], init_ddp_func: Callable[[torch.nn.Module], ColoDDP]):
with ColoInitContext(device=get_current_device()):
model = Net().cuda()
w1 = model.fc1.weight
w2 = model.fc2.weight
ddp_cls.set_params_to_ignore([w2])
model = init_ddp_func(model)
x = torch.rand(2, 3, device=get_current_device())
logits = model(x)
loss = torch.sum(logits)
model.backward(loss)
if ddp_cls is ZeroDDP:
w1s_grad = w1
else:
w1s_grad = w1.grad
w1_grads = [torch.empty_like(w1) for _ in range(dist.get_world_size())]
dist.all_gather(w1_grads, w1s_grad)
assert torch.equal(w1_grads[0], w1_grads[1])
w2_grads = [torch.empty_like(w2) for _ in range(dist.get_world_size())]
dist.all_gather(w2_grads, w2.grad)
assert not torch.equal(w2_grads[0], w2_grads[1])
def run_dist(rank, world_size, port):
colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
set_seed(dist.get_rank())
run_fwd_bwd(ColoDDP, init_ddp)
run_fwd_bwd(ZeroDDP, init_ddpv2)
@pytest.mark.dist
@pytest.mark.parametrize('world_size', [2])
@rerun_if_address_is_in_use()
def test_ddp_ignore_params(world_size):
spawn(run_dist, world_size)
if __name__ == '__main__':
test_ddp_ignore_params(2)
from collections import OrderedDict
import pytest
import torch
import colossalai
from colossalai.nn.parallel import ColoDDP
from colossalai.tensor import ColoParameter, ProcessGroup
from colossalai.testing import rerun_if_address_is_in_use, spawn
from colossalai.utils.cuda import get_current_device
from colossalai.zero import ColoInitContext
from tests.components_to_test.registry import non_distributed_component_funcs
def check_state_dict_equal(state_dict: OrderedDict, other_state_dict: OrderedDict):
for (k1, t1), (k2, t2) in zip(state_dict.items(), other_state_dict.items()):
assert k1 == k2
if t1.device != t2.device:
temp_t2 = t2.to(t1.device)
else:
temp_t2 = t2
assert torch.equal(t1, temp_t2), "\t{}\n\t{}".format(t1, temp_t2)
def init_ddp(module: torch.nn.Module) -> ColoDDP:
pg = ProcessGroup()
return ColoDDP(module, process_group=pg)
def run_ddp_state_dict():
get_components_func = non_distributed_component_funcs.get_callable('gpt2')
model_builder, train_dataloader, test_dataloader, optimizer_class, criterion = get_components_func()
torch_model = model_builder().cuda()
with ColoInitContext(device=get_current_device()):
model = model_builder()
model = init_ddp(model)
torch_state_dict = torch_model.state_dict()
for param in model.parameters():
if isinstance(param, ColoParameter):
assert param.get_process_group() is not None
model.load_state_dict(torch_state_dict)
for param in model.parameters():
if isinstance(param, ColoParameter):
assert param.get_process_group() is not None
state_dict = model.state_dict()
check_state_dict_equal(torch_state_dict, state_dict)
def run_dist(rank, world_size, port):
colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
run_ddp_state_dict()
@pytest.mark.dist
@pytest.mark.parametrize('world_size', [1, 2])
@rerun_if_address_is_in_use()
def test_state_dict(world_size):
spawn(run_dist, world_size)
if __name__ == '__main__':
test_state_dict(2)
from functools import partial
import pytest
import torch
import torch.distributed as dist
from torch.distributed.distributed_c10d import _get_default_group
import colossalai
from colossalai.nn.parallel.reducer import Reducer
from colossalai.testing import rerun_if_address_is_in_use, spawn
from colossalai.utils.cuda import get_current_device
REDUCE_CNT = 0
def check_eq(grad, grad_clone):
global REDUCE_CNT
print(f'Rank{dist.get_rank()} check {REDUCE_CNT}')
REDUCE_CNT += 1
assert torch.allclose(grad, grad_clone)
def run_reducer():
grads = [torch.rand(64, i + 1, device=get_current_device()) for i in range(10)]
grads_clone = [g.clone().detach() for g in grads]
for g in grads:
dist.all_reduce(g)
reducer = Reducer(bucket_size_mb=1)
for g, g_clone in zip(grads, grads_clone):
reducer.all_reduce_async(g_clone, _get_default_group(), partial(check_eq, g))
reducer.flush()
def run_dist(rank, world_size, port):
colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
run_reducer()
@pytest.mark.dist
@pytest.mark.parametrize('world_size', [1, 2])
@rerun_if_address_is_in_use()
def test_reducer(world_size):
spawn(run_dist, world_size)
if __name__ == '__main__':
test_reducer(2)
import pytest
import torch
import torch.nn as nn
import colossalai
from colossalai.tensor import ColoTensor, ColoTensorSpec, ProcessGroup
from colossalai.testing import rerun_if_address_is_in_use, spawn
from tests.test_tensor.common_utils import split_param_col_tp1d, split_param_row_tp1d, tensor_equal, tensor_shard_equal
class Conv1D(nn.Module):
"""
1D-convolutional layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2).
Basically works like a linear layer but the weights are transposed.
Args:
nf (`int`): The number of output features.
nx (`int`): The number of input features.
"""
def __init__(self, nf, nx):
super().__init__()
self.nf = nf
w = torch.empty(nx, nf)
nn.init.normal_(w, std=0.02)
self.weight = nn.Parameter(w)
self.bias = nn.Parameter(torch.ones(nf))
def forward(self, x):
size_out = x.size()[:-1] + (self.nf,)
x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight)
x = x.view(size_out)
return x
def run_with_spec(spec_init_func, split_bias):
model = Conv1D(4, 16).cuda()
world_size = torch.distributed.get_world_size()
pg = ProcessGroup(tp_degree=world_size)
weight = ColoTensor(torch.nn.Parameter(model.weight.detach()), ColoTensorSpec(pg))
bias = ColoTensor(torch.nn.Parameter(model.bias.detach()), ColoTensorSpec(pg))
spec_init_func(weight, pg)
if split_bias:
spec_init_func(bias, pg)
x = torch.rand(2, 16).cuda()
out = model(x)
colo_out = torch.addmm(bias, x, weight)
colo_out = colo_out.to_replicate()
assert tensor_equal(out, colo_out)
grad = torch.rand_like(out)
out.backward(grad)
colo_out.backward(grad)
tensor_shard_equal(model.weight.grad, weight.grad, pg.tp_local_rank(), pg.tp_world_size())
tensor_shard_equal(model.bias.grad, bias.grad, pg.tp_local_rank(), pg.tp_world_size())
def run_dist(rank, world_size, port):
colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
run_with_spec(spec_init_func=split_param_row_tp1d, split_bias=False)
run_with_spec(spec_init_func=split_param_col_tp1d, split_bias=True)
@pytest.mark.dist
@pytest.mark.parametrize('world_size', [1, 4])
@rerun_if_address_is_in_use()
def test_addmm_1d(world_size):
spawn(run_dist, world_size)
if __name__ == '__main__':
test_addmm_1d(4)
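
As an aside, the Conv1D helper in the deleted test above mirrors Hugging Face's GPT-2 Conv1D: a linear layer whose weight is stored transposed. A minimal equivalence check (illustrative, not part of the original file):

import torch

conv = Conv1D(4, 16)                        # weight shape (nx=16, nf=4), as defined above
linear = torch.nn.Linear(16, 4)             # weight shape (out=4, in=16)
with torch.no_grad():
    linear.weight.copy_(conv.weight.t())    # transpose to match Linear's layout
    linear.bias.copy_(conv.bias)
x = torch.rand(2, 16)
assert torch.allclose(conv(x), linear(x), atol=1e-6)    # both compute x @ W + b
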
import pytest
import torch
from torch.nn import functional as F
import colossalai
from colossalai.tensor import ColoParameter, ColoTensorSpec, ProcessGroup
from colossalai.testing import rerun_if_address_is_in_use, spawn
from tests.test_tensor.common_utils import split_param_col_tp1d, tensor_equal, tensor_shard_equal
def run_with_spec(spec_init_func):
pg = ProcessGroup(tp_degree=torch.distributed.get_world_size())
model = torch.nn.EmbeddingBag(10, 4).cuda()
weight = ColoParameter(model.weight.clone(), True, ColoTensorSpec(pg))
spec_init_func(weight, pg)
inputs = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9]).cuda()
offsets = torch.tensor([0, 4]).cuda()
out = model(inputs, offsets=offsets)
colo_out = F.embedding_bag(inputs, weight, offsets=offsets)
assert tensor_equal(out, colo_out)
grad = torch.rand_like(out)
out.backward(grad)
colo_out.backward(grad)
assert tensor_shard_equal(model.weight.grad, weight.grad, pg.tp_local_rank(), pg.tp_world_size())
def run_dist(rank, world_size, port):
config = dict(parallel=dict(tensor=dict(mode="1d", size=world_size),))
colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
run_with_spec(split_param_col_tp1d)
@pytest.mark.dist
@pytest.mark.parametrize('world_size', [1, 4])
@rerun_if_address_is_in_use()
def test_embedding_bag_1d(world_size):
spawn(run_dist, world_size)
if __name__ == '__main__':
test_embedding_bag_1d(4)
import pytest
import torch
from torch.nn import functional as F
import colossalai
from colossalai.tensor import ColoTensor, ColoTensorSpec, ProcessGroup
from colossalai.testing import rerun_if_address_is_in_use, spawn
from tests.test_tensor.common_utils import split_param_col_tp1d, split_param_row_tp1d, tensor_equal, tensor_shard_equal
def run_with_spec(spec_init_func, pg: ProcessGroup):
model = torch.nn.Embedding(12, 32).cuda()
weight = ColoTensor(torch.nn.Parameter(model.weight.detach()), ColoTensorSpec(pg))
spec_init_func(weight, pg)
x = torch.tensor((0, 3, 6, 9)).cuda()
out = model(x)
colo_out = F.embedding(x, weight)
assert tensor_equal(out, colo_out)
grad = torch.rand_like(out)
out.backward(grad)
colo_out.backward(grad)
# compare grad inside a TP group
assert tensor_shard_equal(model.weight.grad, weight.grad, pg.tp_local_rank(), pg.tp_world_size())
def run_dist(rank, world_size, port):
# config = dict(parallel=dict(tensor=dict(mode="1d", size=world_size),))
colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
pg = ProcessGroup(tp_degree=world_size)
run_with_spec(split_param_row_tp1d, pg)
run_with_spec(split_param_col_tp1d, pg)
@pytest.mark.dist
@pytest.mark.parametrize('world_size', [1, 4])
@rerun_if_address_is_in_use()
def test_embedding_1d(world_size):
spawn(run_dist, world_size)
if __name__ == '__main__':
test_embedding_1d(4)
import pytest
import torch
import torch.nn.functional as F
import colossalai
from colossalai.tensor import ColoTensor, ColoTensorSpec, ProcessGroup
from colossalai.testing import rerun_if_address_is_in_use, spawn
from tests.test_tensor.common_utils import split_param_col_tp1d, split_param_row_tp1d, tensor_equal, tensor_shard_equal
def run_with_spec(spec_init_func, split_bias):
pg = ProcessGroup(tp_degree=torch.distributed.get_world_size())
model = torch.nn.Linear(4, 8).cuda()
weight = ColoTensor(torch.nn.Parameter(model.weight.detach()), ColoTensorSpec(pg))
bias = ColoTensor(torch.nn.Parameter(model.bias.detach()), ColoTensorSpec(pg))
spec_init_func(weight, pg)
if split_bias:
spec_init_func(bias, pg)
x = torch.rand(2, 4).cuda()
out = model(x)
colo_out = F.linear(x, weight, bias)
colo_out = colo_out.to_replicate()
assert tensor_equal(out, colo_out)
grad = torch.rand_like(out)
out.backward(grad)
colo_out.backward(grad)
assert tensor_shard_equal(model.weight.grad, weight.grad, pg.tp_local_rank(), pg.tp_world_size())
assert tensor_shard_equal(model.bias.grad, bias.grad, pg.tp_local_rank(), pg.tp_world_size())
def run_dist(rank, world_size, port):
config = dict(parallel=dict(tensor=dict(mode="1d", size=world_size),))
colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
run_with_spec(spec_init_func=split_param_col_tp1d, split_bias=False)
run_with_spec(spec_init_func=split_param_row_tp1d, split_bias=True)
@pytest.mark.dist
@pytest.mark.parametrize('world_size', [1, 4])
@rerun_if_address_is_in_use()
def test_linear_1d(world_size):
spawn(run_dist, world_size)
if __name__ == '__main__':
test_linear_1d(4)
import pytest
import torch
import torch.nn.functional as F
import colossalai
from colossalai.tensor import ColoTensor, ColoTensorSpec, ComputePattern, ComputeSpec, ProcessGroup, ShardSpec
from colossalai.testing import rerun_if_address_is_in_use, spawn
from colossalai.utils import get_current_device
def check_cross_entropy():
input_t = torch.randn(4, 4, device=get_current_device(), requires_grad=True)
input_ct = torch.randn(4, 4, device=get_current_device(), requires_grad=True)
with torch.no_grad():
input_ct.copy_(input_t)
target = torch.randint(4, (4,), dtype=torch.int64, device=get_current_device())
world_size = torch.distributed.get_world_size()
pg = ProcessGroup(tp_degree=world_size)
input_t_colo = ColoTensor.from_torch_tensor(tensor=input_ct, spec=ColoTensorSpec(pg))
input_shard = input_t_colo.redistribute(ShardSpec([-1], [pg.tp_world_size()]))
input_shard.set_tensor_spec(dist_spec=None, compute_spec=ComputeSpec(ComputePattern.TP1D))
output = F.cross_entropy(input_t, target)
output_colo = F.cross_entropy(input_shard, target)
assert torch.allclose(output_colo, output)
output.backward()
output_colo.backward()
assert torch.allclose(input_t.grad, input_ct.grad)
def run_dist(rank, world_size, port):
colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
check_cross_entropy()
@pytest.mark.dist
@pytest.mark.parametrize('world_size', [1, 2])
@rerun_if_address_is_in_use()
def test_loss_func(world_size):
spawn(run_dist, world_size)
if __name__ == '__main__':
test_loss_func(1)
import pytest
import torch
import torch.nn.functional as F
from torch.nn import Parameter
import colossalai
from colossalai.tensor import ColoTensor, ColoTensorSpec, ProcessGroup, ShardSpec
from colossalai.testing import rerun_if_address_is_in_use, spawn
from colossalai.utils import get_current_device
def _run_layer_norm():
ln_op = torch.nn.LayerNorm(2, 3, device=get_current_device())
input_t = torch.randn(3, 2, device=get_current_device())
pg = ProcessGroup(tp_degree=torch.distributed.get_world_size())
input_t_colo = ColoTensor.from_torch_tensor(input_t.clone().detach(), ColoTensorSpec(pg))
# prepare colossalai LN
weight = ColoTensor(Parameter(ln_op.weight.detach()), ColoTensorSpec(pg))
bias = ColoTensor(Parameter(ln_op.bias.detach()), ColoTensorSpec(pg))
output = ln_op(input_t)
output_colo = F.layer_norm(input_t_colo, ln_op.normalized_shape, weight, bias, ln_op.eps)
assert torch.allclose(output_colo, output)
torch.mean(output).backward()
torch.mean(output_colo).backward()
assert torch.allclose(ln_op.weight.grad, weight.grad)
def check_spec_eq(tensor, other):
assert isinstance(tensor, ColoTensor) and isinstance(other, ColoTensor)
for k in dir(tensor.dist_spec):
if not k.startswith('__'):
assert hasattr(other.dist_spec, k), f"{k}"
assert getattr(tensor.dist_spec, k) == getattr(other.dist_spec, k)
def check_element_wise_ops():
world_size = torch.distributed.get_world_size()
pg = ProcessGroup(tp_degree=world_size)
t = torch.rand(2, 2)
x = ColoTensor(t, spec=ColoTensorSpec(pg, ShardSpec([0], [pg.tp_world_size()])))
check_spec_eq(x, x.cuda())
assert torch.equal(x.cuda(), t.cuda())
check_spec_eq(x, torch.abs(x))
assert torch.equal(torch.abs(x), torch.abs(t))
check_spec_eq(x, F.sigmoid(x))
assert torch.equal(F.sigmoid(x), F.sigmoid(t))
def run_dist(rank, world_size, port):
colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
check_element_wise_ops()
_run_layer_norm()
@pytest.mark.dist
@pytest.mark.parametrize('world_size', [2])
@rerun_if_address_is_in_use()
def test_element_wise_ops(world_size):
spawn(run_dist, world_size)
def run_dist2(rank, world_size, port):
colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
_run_layer_norm()
@pytest.mark.dist
@pytest.mark.parametrize('world_size', [1])
@rerun_if_address_is_in_use()
def test_ln(world_size):
spawn(run_dist2, world_size)
def check_all():
test_element_wise_ops(2)
if __name__ == '__main__':
check_all()
import pytest
import torch
import torch.distributed as dist
import colossalai
from colossalai.tensor import ColoTensor, ColoTensorSpec, ProcessGroup, ShardSpec
from colossalai.tensor.distspec import DistPlacementPattern
from colossalai.testing import rerun_if_address_is_in_use, spawn
from colossalai.utils import get_current_device
from tests.test_tensor.common_utils import debug_print, split_param_col_tp1d, split_param_row_tp1d
def exam_view_core(pg):
# the case of replicated ColoTensors
x = torch.randn(4, 4).cuda()
x_colo = ColoTensor(x, ColoTensorSpec(pg))
y = x.view(2, -1, 2)
y_colo = x_colo.view(2, -1, 2)
assert torch.all(y == y_colo)
assert y_colo.dist_spec.placement == DistPlacementPattern.REPLICATE
# the perfect case of col-sliced ColoTensors
split_param_col_tp1d(x_colo, pg)
z = x.view(torch.Size((2, 1, 2, -1)))
z_colo = x_colo.view(torch.Size((2, 1, 2, -1)))
if dist.get_rank() == 0:
z = z[:, :, :, 0:2]
else:
z = z[:, :, :, 2:]
assert torch.all(z == z_colo)
assert z_colo.dist_spec == x_colo.dist_spec
# the perfect case of row-sliced ColoTensors
split_param_row_tp1d(x_colo, pg)
z = x.view(torch.Size((-1, 2, 2)))
z_colo = x_colo.view(torch.Size((-1, 2, 2)))
if dist.get_rank() == 0:
z = z[0:2, :, :]
else:
z = z[2:, :, :]
assert torch.all(z == z_colo)
assert z_colo.dist_spec == x_colo.dist_spec
# the normal case of row-sliced ColoTensors
z = x.view(-1, 2, 2, 2)
z_colo = x_colo.view(-1, 2, 2, 2)
assert torch.all(z == z_colo)
assert y_colo.dist_spec.placement == DistPlacementPattern.REPLICATE
def exam_view_autograd(pg):
x = torch.randn(8, 2, device=get_current_device(), requires_grad=True)
y = torch.randn(8, 2, device=get_current_device(), requires_grad=True)
with torch.no_grad():
y.copy_(x)
y = ColoTensor(y, ColoTensorSpec(pg))
y_slice = y.redistribute(ShardSpec([-1], [pg.tp_world_size()]))
xx = x.view(2, 2, -1)
yy_slice = y_slice.view(2, 2, -1)
yy = yy_slice.to_replicate()
grad = torch.randn(2, 2, 4, device=get_current_device())
xx.backward(grad)
yy.backward(grad)
assert torch.all(x.grad == y.grad)
def exam_view_errors(pg):
x = torch.randn(8, 2, device=get_current_device())
x = ColoTensor(x, ColoTensorSpec(pg))
split_param_row_tp1d(x, pg)
x.view('a', 'b', 'c')
x.view(8, -1)
x.view([-2, -2, -2])
x.view((-1, -1, -1))
def run_dist(rank, world_size, port):
colossalai.launch(config=dict(), rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
pg = ProcessGroup(tp_degree=torch.distributed.get_world_size())
exam_view_core(pg)
exam_view_autograd(pg)
# exam_view_errors(pg)
@pytest.mark.dist
@pytest.mark.parametrize('world_size', [2])
@rerun_if_address_is_in_use()
def test_view(world_size):
spawn(run_dist, world_size)
if __name__ == '__main__':
test_view(2)
import pytest
import torch
from colossalai.pipeline.pipelinable import PipelinableContext
@@ -48,6 +49,7 @@ def run_pipelinable(rank, world_size, port):
     assert layers_count_in_part_0 + layers_count_in_part_1 == pipelinable.layers_count


+@pytest.mark.skip(reason="this is useless")
 @rerun_if_address_is_in_use()
 def test_pipelinable():
     spawn(run_pipelinable, 1)
@@ -127,6 +127,10 @@ def check_gpt2(rank, world_size, port):
     run_gpt2_test()


+# TODO(ver217): fix this
+@pytest.mark.skip("this will stuck in CI")
 @pytest.mark.dist
 @rerun_if_address_is_in_use()
 @clear_cache_before_run()
import pytest
import torch
from numpy import allclose
import colossalai
from colossalai.core import global_context as gpc
from colossalai.tensor import ColoTensor, ColoTensorSpec, ProcessGroup, ReplicaSpec, ShardSpec, distspec
from colossalai.testing import rerun_if_address_is_in_use, spawn
def _run_tensor_indexing():
pg = ProcessGroup()
torch_t = torch.randn(2, 3)
colo_t = ColoTensor(torch_t, ColoTensorSpec(pg))
assert allclose(torch_t[:, 1], colo_t[:, 1])
def _run_wrapped_tensor_func():
pg = ProcessGroup()
t_ref = torch.randn(4, 5)
t = ColoTensor.from_torch_tensor(t_ref.clone(), ColoTensorSpec(pg))
# non-func attr
assert t.is_cuda == t_ref.is_cuda
# return 1 torch.Tensor
t_abs = t.abs()
assert isinstance(t_abs, ColoTensor) and torch.equal(t_abs, t_ref.abs())
# return 1 non-torch.Tensor
assert t.dim() == t_ref.dim()
# return >1 torch.Tensor
assert isinstance(t, ColoTensor)
t_split1, t_split2 = t.split(2)
assert isinstance(t_split1, ColoTensor) and isinstance(t_split2, ColoTensor), f"{type(t_split1)} {type(t_split2)}"
def _run_operand(world_size):
pg = ProcessGroup()
t_ref = torch.randn(4, 5)
t = ColoTensor.from_torch_tensor(t_ref.clone(), ColoTensorSpec(pg))
t_ref_res = t_ref + t_ref
t_res = t + t
assert isinstance(t_res, ColoTensor)
assert torch.allclose(t_ref_res, t_res)
pg = ProcessGroup(tp_degree=world_size)
t = ColoTensor.from_torch_tensor(t_ref.clone(), ColoTensorSpec(pg))
t.set_dist_spec(ShardSpec([0], [world_size]))
t_new = torch.zeros_like(t)
assert isinstance(t_new, ColoTensor)
assert t_new.is_sharded()
#### Test Distributed init a Colotensor
def _run_view(world_size):
t_ref = torch.randn(4, 5)
rank = gpc.get_global_rank()
pg = ProcessGroup(rank, list(range(world_size)), tp_degree=world_size)
t = ColoTensor.from_torch_tensor(
t_ref, ColoTensorSpec(pg, dist_attr=ShardSpec(dims=[0], num_partitions=[pg.tp_world_size()])))
assert t.size_global()[0] == 4 * world_size
assert t.size_global(1) == 5
assert t.size_global() == torch.Size([4 * world_size, 5])
t = t.view(4 * 5 * world_size)
assert t.shape == torch.Size([4 * 5 * world_size])
def _run_tensor_shard_init(world_size):
t_ref = torch.randn(4, 5)
pg = ProcessGroup(tp_degree=world_size)
shard_attr = ShardSpec(dims=[0], num_partitions=[pg.tp_world_size()])
tensor_spec = ColoTensorSpec(pg, dist_attr=shard_attr)
t = ColoTensor.from_torch_tensor(t_ref.clone(), tensor_spec)
t.set_dist_spec(ReplicaSpec())
assert t.shape == torch.Size((4 * world_size, 5)), f"{t.shape} vs ({4 * world_size, 5})"
def _run_tensor_replicated_init(world_size):
t_ref = torch.randn(4 * world_size, 5)
pg = ProcessGroup()
spec = ColoTensorSpec(pg)
t = ColoTensor.from_torch_tensor(t_ref.clone(), spec)
assert t.shape == torch.Size((4 * world_size, 5)), f"{t.shape}"
def _run_process_group(world_size):
pg1 = ProcessGroup()
pg2 = ProcessGroup()
assert pg1 == pg2
def _run_redistributed(world_size):
if world_size != 4:
return
pg1 = ProcessGroup(tp_degree=2, dp_degree=2)
pg2 = ProcessGroup(tp_degree=4, dp_degree=1)
spec1 = ColoTensorSpec(pg1)
t1 = ColoTensor.from_torch_tensor(torch.randn(2, 3, 4), spec1)
t1 = t1.redistribute(ShardSpec([0], [pg1.tp_world_size()]))
assert t1.is_sharded()
t1 = t1.redistribute(ShardSpec([-1], [pg2.tp_world_size()]), pg2)
assert t1.is_sharded()
pg3 = ProcessGroup(tp_degree=1, dp_degree=4)
t1 = t1.redistribute(ReplicaSpec(), pg3)
assert t1.is_replicate()
def _run_set_tensor_spec(world_size):
if world_size != 4:
return
pg = ProcessGroup(tp_degree=2, dp_degree=2)
spec1 = ColoTensorSpec(pg)
t1 = ColoTensor.from_torch_tensor(torch.randn(2, 3, 4), spec1)
dist_spec2 = ShardSpec([-1], [pg.tp_world_size()])
assert t1.is_replicate()
t1.set_dist_spec(dist_spec2)
assert t1.is_shard_1dcol()
def run_dist_tests(rank, world_size, port):
colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
_run_tensor_shard_init(world_size)
_run_tensor_replicated_init(world_size)
_run_view(world_size)
_run_process_group(world_size)
_run_tensor_indexing()
_run_operand(world_size)
_run_wrapped_tensor_func()
_run_redistributed(world_size)
_run_set_tensor_spec(world_size)
@pytest.mark.dist
@pytest.mark.parametrize('world_size', [1, 2])
@rerun_if_address_is_in_use()
def test_dist_cases(world_size):
spawn(run_dist_tests, world_size)
if __name__ == '__main__':
test_dist_cases(4)
import pytest
import torch
from torch.nn.parallel import DistributedDataParallel as DDP
import colossalai
from colossalai.nn.parallel.data_parallel import ColoDDP
from colossalai.tensor import ColoTensor, ColoTensorSpec, ComputePattern, ComputeSpec, ProcessGroup, ShardSpec
from colossalai.testing import rerun_if_address_is_in_use, spawn
from colossalai.utils.cuda import get_current_device
from colossalai.zero import ColoInitContext
from tests.components_to_test.registry import non_distributed_component_funcs
from tests.test_tensor.common_utils import (
debug_print,
set_seed,
split_param_col_tp1d,
split_param_row_tp1d,
tensor_equal,
tensor_shard_equal,
)
def init_1d_row_spec(model, pg: ProcessGroup):
tensor_spec = (ShardSpec([0], [pg.tp_world_size()]), ComputeSpec(ComputePattern.TP1D))
for n, p in model.named_parameters():
p.set_process_group(pg)
if 'weight' in n and 'ln' not in n:
p.set_tensor_spec(*tensor_spec)
def init_1d_col_spec(model, pg: ProcessGroup):
spec = (ShardSpec([-1], [pg.tp_world_size()]), ComputeSpec(ComputePattern.TP1D))
for n, p in model.named_parameters():
p.set_process_group(pg)
if 'ln' not in n and ('weight' in n or 'bias' in n):
p.set_tensor_spec(*spec)
def init_megatron_spec(model, pg: ProcessGroup):
for mn, module in model.named_modules():
# debug_print([0], mn)
for pn, param in module.named_parameters(recurse=False):
# debug_print([0], '\t', pn, param.compute_spec, param.shape)
param.set_process_group(pg)
if 'mlp.c_fc' in mn:
if 'weight' in pn or 'bias' in pn:
split_param_col_tp1d(param, pg)
param.compute_spec.set_output_replicate(False)
else:
raise RuntimeError
elif 'mlp.c_proj' in mn:
if 'weight' in pn:
split_param_row_tp1d(param, pg)
else:
assert 'bias' in pn
elif 'wte' in mn or 'wpe' in mn:
assert 'weight' in pn
split_param_col_tp1d(param, pg)
elif 'c_attn' in mn or 'c_proj' in mn:
split_param_col_tp1d(param, pg)
# debug_print([0], '\t', param.compute_spec, param.shape)
def check_param_equal(model, torch_model, pg: ProcessGroup):
for p, torch_p in zip(model.parameters(), torch_model.parameters()):
assert pg.tp_local_rank() is not None, f"{pg.rank()} {pg.tp_world_size()} {pg._tp_degree} {pg.tp_local_rank()}1"
assert pg.tp_world_size() is not None
assert tensor_shard_equal(torch_p, p, pg.tp_local_rank(), pg.tp_world_size())
def check_grad_equal(model, torch_model, pg: ProcessGroup):
for p, torch_p in zip(model.parameters(), torch_model.parameters()):
assert tensor_shard_equal(torch_p.grad, p.grad, pg.tp_local_rank(), pg.tp_world_size())
def run_gpt(init_spec_func, use_ddp):
world_size = torch.distributed.get_world_size()
# build a PG with TP and DP hybrid
pg = ProcessGroup(dp_degree=(2 if (use_ddp and world_size >= 2) else 1))
# set seed make processes of the same tp group use the same seed
# set_seed(pg.tp_local_rank())
get_components_func = non_distributed_component_funcs.get_callable('gpt2')
model_builder, train_dataloader, test_dataloader, optimizer_class, criterion = get_components_func()
# make sure torch_model and model has the same parameter values
with ColoInitContext(device=get_current_device()):
model = model_builder()
model = model.cuda()
torch_model = model_builder().cuda()
if use_ddp:
torch_model = DDP(torch_model, device_ids=[pg.rank()], process_group=pg.dp_process_group())
model = ColoDDP(model, process_group=pg)
for torch_p, p in zip(torch_model.parameters(), model.parameters()):
torch_p.data.copy_(p)
init_spec_func(model, pg)
check_param_equal(model, torch_model, pg)
# close the dropout in eval mode
model.eval()
torch_model.eval()
set_seed(pg.dp_local_rank())
torch.distributed.barrier()
for i, (input_ids, label) in enumerate(train_dataloader):
colo_input = ColoTensor.from_torch_tensor(input_ids, ColoTensorSpec(pg))
logits = model(colo_input)
torch_logits = torch_model(input_ids)
assert tensor_equal(torch_logits, logits), f"{torch_logits - logits}"
loss = criterion(logits, input_ids)
torch_loss = criterion(torch_logits, input_ids)
if use_ddp:
model.backward(loss)
else:
loss.backward()
torch_loss.backward()
check_grad_equal(model, torch_model, pg)
if i > 0:
break
set_seed(313)
def run_dist(rank, world_size, port, use_ddp):
if use_ddp and world_size == 1:
return
colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
# Comments below tests for speed concern
# run_gpt(init_1d_row_spec, use_ddp)
# run_gpt(init_1d_col_spec, use_ddp)
run_gpt(init_megatron_spec, use_ddp)
@pytest.mark.dist
@pytest.mark.parametrize('world_size', [1, 4])
@pytest.mark.parametrize('use_ddp', [False, True])
@rerun_if_address_is_in_use()
def test_gpt(world_size, use_ddp):
spawn(run_dist, world_size, use_ddp=use_ddp)
if __name__ == '__main__':
test_gpt(4, use_ddp=False)
import pytest
import torch
import colossalai
from colossalai.nn.optimizer import ColossalaiOptimizer
from colossalai.tensor import ColoTensor, ProcessGroup
from colossalai.tensor.colo_parameter import ColoParameter
from colossalai.testing import free_port, rerun_if_address_is_in_use, spawn
from colossalai.utils.cuda import get_current_device
from colossalai.zero import ColoInitContext
from tests.components_to_test.registry import non_distributed_component_funcs
from tests.test_tensor.common_utils import (
check_equal,
set_seed,
split_param_col_tp1d,
split_param_row_tp1d,
tensor_shard_equal,
)
def run_1d_hybrid_tp(model_name):
# A simple net with two stacked nn.Linear
get_components_func = non_distributed_component_funcs.get_callable(model_name)
model_builder, train_dataloader, test_dataloader, optimizer_class, criterion = get_components_func()
rank = torch.distributed.get_rank()
world_size = torch.distributed.get_world_size()
set_seed(1)
with ColoInitContext(device=get_current_device()):
model = model_builder(checkpoint=True)
if rank == 0:
model_torch = model_builder(checkpoint=True)
model_torch = model_torch.cuda()
optimizer_torch = ColossalaiOptimizer(torch.optim.SGD(model_torch.parameters(), lr=0.1))
# Make two models have the same init params
for p1, p2 in zip(model.parameters(), model_torch.parameters()):
p2.data.copy_(p1.data)
else:
model_torch = None
optimizer_torch = None
pg = ProcessGroup(tp_degree=world_size)
if 'bert' == model_name:
for name, p in model.named_parameters():
if not isinstance(p, ColoTensor):
continue
# num_class = type_vocab_size = 2 | (8, 2)
if 'classifier' in name and 'weight' in name:
split_param_col_tp1d(p, pg)
# num_class = vocab_size = 30524 | (30524, 8)
elif 'word_embeddings' in name and 'weight' in name:
split_param_row_tp1d(p, pg)
# num_class = seq_len = 512 | (512, 8)
elif 'position_embeddings' in name and 'weight' in name:
split_param_row_tp1d(p, pg)
# num_class = type_vocab_size = 2 | (2, 8)
elif 'token_type_embeddings' in name and 'weight' in name:
split_param_col_tp1d(p, pg)
elif "simple_net" == model_name:
# A naive way to set spec for all weights in Linear
for name, p in model.named_parameters():
if not isinstance(p, ColoTensor):
continue
if 'embed' in name and 'weight' in name:
split_param_col_tp1d(p, pg)
if 'proj1' in name and ('weight' in name or 'bias' in name):
split_param_row_tp1d(p, pg)
if 'proj2' in name and 'weight' in name:
split_param_col_tp1d(p, pg)
if 'classifier' in name and ('weight' in name or 'bias' in name):
split_param_row_tp1d(p, pg)
model = model.cuda()
model.eval()
if rank == 0:
model_torch.eval()
colo_optimizer = ColossalaiOptimizer(torch.optim.SGD(model.parameters(), lr=0.1))
for i, (data, label) in enumerate(train_dataloader):
# Zero grad
colo_optimizer.zero_grad()
if rank == 0:
optimizer_torch.zero_grad()
torch.distributed.barrier()
data = data.to(get_current_device())
label = label.to(get_current_device())
torch.distributed.broadcast(data, 0, group=pg.tp_process_group())
torch.distributed.broadcast(label, 0, group=pg.tp_process_group())
# Bcast rank0 data to all processes
if criterion:
output = model(data)
loss = criterion(output, label)
else:
output = model(data, label)
loss = output
# Test output
if rank == 0:
if criterion:
output_torch = model_torch(data)
loss_torch = criterion(output_torch, label)
else:
output_torch = model_torch(data, label)
loss_torch = output_torch
assert torch.allclose(loss, loss_torch, rtol=1e-2), f"model_name {model_name} failed"
torch.distributed.barrier()
loss.backward()
colo_optimizer.step()
if rank == 0:
loss_torch.backward()
optimizer_torch.step()
with torch.no_grad():
# check param
for p, torch_p in zip(model.parameters(), model_torch.parameters()):
assert tensor_shard_equal(torch_p, p, pg.tp_local_rank(), pg.tp_world_size())
torch.distributed.barrier()
if i > 5:
break
# Test the overridden parameters() and named_parameters() member functions
def test_model_parameters():
colossalai.launch(config={}, rank=0, world_size=1, host='localhost', port=free_port(), backend='nccl')
# build a module with 2 Linear, 4 parameters in total.
class Net(torch.nn.Module):
def __init__(self):
super().__init__()
self.fcs = torch.nn.Sequential(torch.nn.Linear(2, 3), torch.nn.Linear(3, 2))
self.extra_param = torch.nn.Parameter(torch.randn(2))
with ColoInitContext(device=get_current_device()):
model = Net()
param_cnt = 0
for name, p in model.named_parameters():
param_cnt += 1
assert param_cnt == 5
for name, colo_p in model.named_parameters():
assert colo_p.is_model_data()
param_cnt = 0
for name, p in model.named_parameters(recurse=False):
param_cnt += 1
assert param_cnt == 1
param_cnt = 0
for p in model.fcs[0].parameters(recurse=False):
param_cnt += 1
assert param_cnt == 2
def test_colo_optimizer():
get_components_func = non_distributed_component_funcs.get_callable('simple_net')
model_builder, train_dataloader, test_dataloader, optimizer_class, criterion = get_components_func()
set_seed(1)
with ColoInitContext(device=get_current_device()):
model = model_builder(checkpoint=True)
colo_optimizer = ColossalaiOptimizer(torch.optim.SGD(model.parameters(), lr=0.1))
for i, (data, label) in enumerate(train_dataloader):
colo_optimizer.zero_grad()
data = data.to(get_current_device())
label = label.to(get_current_device())
# Bcast rank0 data to all processes
if criterion:
output = model(data)
loss = criterion(output, label)
else:
output = model(data, label)
loss = output
loss.backward()
colo_optimizer.step()
if i > 5:
break
def run_1d_row_tp(model_name: str):
# A simple net with two stacked nn.Linear
get_components_func = non_distributed_component_funcs.get_callable(model_name)
model_builder, train_dataloader, test_dataloader, optimizer_class, criterion = get_components_func()
rank = torch.distributed.get_rank()
set_seed(1)
with ColoInitContext(device=get_current_device()):
model = model_builder(checkpoint=True)
world_size = torch.distributed.get_world_size()
pg = ProcessGroup(tp_degree=world_size)
set_seed(1)
if rank == 0:
model_torch = model_builder(checkpoint=True)
model_torch = model_torch.cuda()
# A naive way to set spec for all weights in Linear
for mo_name, module in model.named_modules():
# print(mo_name)
for pa_name, param in module.named_parameters(recurse=False):
# print('\t', pa_name, param.shape)
if not isinstance(param, ColoTensor):
continue
if 'weight' in pa_name:
if 'embed' in mo_name and 'token' not in mo_name and 'LayerNorm' not in mo_name:
split_param_row_tp1d(param, pg)
elif 'LayerNorm' not in mo_name and 'ln' not in mo_name:
split_param_col_tp1d(param, pg)
model = model.cuda()
for i, (data, label) in enumerate(train_dataloader):
data = data.to(get_current_device())
label = label.to(get_current_device())
torch.distributed.broadcast(data, 0, group=pg.tp_process_group())
torch.distributed.broadcast(label, 0, group=pg.tp_process_group())
# Bcast rank0 data to all processes
if criterion:
output = model(data)
loss = criterion(output, label)
else:
output = model(data, label)
loss = output
# For reference
if rank == 0:
if criterion:
output_torch = model_torch(data)
loss_torch = criterion(output_torch, label)
else:
output_torch = model_torch(data, label)
loss_torch = output_torch
assert torch.allclose(loss, loss_torch, rtol=1e-2)
torch.distributed.barrier()
loss.backward()
if rank == 0:
loss_torch.backward()
torch.distributed.barrier()
if i > 5:
break
def _run_pretrain_load():
from transformers import BertForMaskedLM
set_seed(1)
model_pretrained = BertForMaskedLM.from_pretrained('bert-base-uncased')
with ColoInitContext(device=get_current_device()):
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model_pretrained = model_pretrained.cuda()
model = model.cuda()
dict_pretrained = {}
dict_col = {}
c_ref = 0
for name, param in model_pretrained.named_parameters():
dict_pretrained[name] = param
c_ref += 1
c1 = 0
c2 = 0
for name, param in model.named_parameters():
if isinstance(param, ColoParameter):
c1 += 1
else:
c2 += 1
dict_col[name] = param
assert c_ref == c1
assert c2 == 0
if model_pretrained.cls.predictions.decoder.bias is model_pretrained.cls.predictions.bias:
assert model.cls.predictions.decoder.bias is model.cls.predictions.bias
for name, param in dict_pretrained.items():
check_equal(param, dict_col[name])
def run_model_dist(rank, world_size, port):
colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
# Comment below test for speed consideration
# for name in ['bert', 'simple_net']:
# run_1d_row_tp(name)
for name in ['bert', 'simple_net']:
run_1d_hybrid_tp(name)
@pytest.mark.dist
@pytest.mark.parametrize('world_size', [1, 4])
@rerun_if_address_is_in_use()
def test_model(world_size):
spawn(run_model_dist, world_size)
def run_pretrain_load_dist(rank, world_size, port):
colossalai.launch(config={}, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
_run_pretrain_load()
# The test case has to download huggingface pretrained models from the internet
# So we manually trigger the test.
@pytest.mark.skip
@pytest.mark.dist
@pytest.mark.parametrize('world_size', [1, 4])
@rerun_if_address_is_in_use()
def test_pretrain_load(world_size):
spawn(run_pretrain_load_dist, world_size)
if __name__ == '__main__':
# test_model_parameters()
# test_colo_optimizer()
test_model(4)
# test_pretrain_load(4)