Merge branch 'main' into sync/npu

14846934 · ver217 · 9102d655 · 5d9a0ae7 · 14846934 · 14846934
Commit 14846934 authored Jan 18, 2024 by ver217
20 changed files
--- a/docs/source/zh-Hans/features/shardformer.md
+++ b/docs/source/zh-Hans/features/shardformer.md
@@ -174,6 +174,18 @@ Author: [Baizhou Zhang](https://github.com/Fridge003), [Bin Jia](https://github.
    <td nowrap="nowrap" align="center">❌</td>
    <td nowrap="nowrap" align="center">❌</td>
  </tr>
+  <tr>
+    <td nowrap="nowrap">Falcon</td>
+    <td nowrap="nowrap" align="center">✔️</td>
+    <td nowrap="nowrap" align="center">✔️</td>
+    <td nowrap="nowrap" align="center">✔️</td>
+    <td nowrap="nowrap" align="center">✔️</td>
+    <td nowrap="nowrap" align="center">✔️</td>
+    <td nowrap="nowrap" align="center">❌</td>
+    <td nowrap="nowrap" align="center">✔️</td>
+    <td nowrap="nowrap" align="center">❌</td>
+    <td nowrap="nowrap" align="center">❌</td>
+  </tr>
  <tr>
    <td colspan="39"></td>
  </tr>

--- a/docs/source/zh-Hans/get_started/installation.md
+++ b/docs/source/zh-Hans/get_started/installation.md
@@ -2,7 +2,7 @@

 环境要求:

- PyTorch >= 1.11 (PyTorch 2.x 正在适配中)
+- PyTorch >= 1.11 并且 PyTorch <= 2.1
 - Python >= 3.7
 - CUDA >= 11.0
 - [NVIDIA GPU Compute Capability](https://developer.nvidia.com/cuda-gpus) >= 7.0 (V100/RTX20 and higher)

--- a/examples/language/bert/data.py
+++ b/examples/language/bert/data.py
@@ -88,6 +88,7 @@ class GLUEDataBuilder:
        )

    def val_dataloader(self):
+        # NOTE: the last batch may not be divisible by the number of microbatches
        if len(self.eval_splits) == 1:
            return self.plugin.prepare_dataloader(self.dataset["validation"], batch_size=self.eval_batch_size)
        elif len(self.eval_splits) > 1:

--- a/examples/language/bert/finetune.py
+++ b/examples/language/bert/finetune.py
@@ -57,7 +57,7 @@ def evaluate_model(

    def evaluate_subset(dataloader: DataLoader):
        use_pipeline = isinstance(booster.plugin, HybridParallelPlugin) and booster.plugin.pp_size > 1
-        is_pp_last_stage = use_pipeline and booster.plugin.stage_manager.is_last_stage()
+        is_pp_last_device = use_pipeline and booster.plugin.stage_manager.is_last_stage(ignore_chunk=True)

        accum_loss = torch.zeros(1, device=get_accelerator().get_current_device())
        for batch in dataloader:
@@ -69,9 +69,10 @@ def evaluate_model(
                current_pp_group_ranks = pg_mesh.get_ranks_in_group(pp_group)
                current_rank = dist.get_rank()
                batch = iter([batch])
+
                outputs = booster.execute_pipeline(batch, model, criterion, return_loss=True, return_outputs=True)

-                if is_pp_last_stage:
+                if is_pp_last_device:
                    logits = outputs["outputs"]["logits"]
                    val_loss = outputs["loss"]
                    accum_loss.add_(val_loss)
@@ -135,8 +136,8 @@ def train_epoch(
    coordinator: DistCoordinator,
 ):
    use_pipeline = isinstance(booster.plugin, HybridParallelPlugin) and booster.plugin.pp_size > 1
-    is_pp_last_stage = use_pipeline and booster.plugin.stage_manager.is_last_stage()
-    print_flag = (not use_pipeline and coordinator.is_master()) or (use_pipeline and is_pp_last_stage)
+    is_pp_last_device = use_pipeline and booster.plugin.stage_manager.is_last_stage(ignore_chunk=True)
+    print_flag = (not use_pipeline and coordinator.is_master()) or (use_pipeline and is_pp_last_device)
    total_step = len(train_dataloader)

    model.train()
@@ -150,7 +151,7 @@ def train_epoch(
                    train_dataloader_iter, model, _criterion, optimizer, return_loss=True, return_outputs=True
                )
                # Backward and optimize
-                if is_pp_last_stage:
+                if is_pp_last_device:
                    loss = outputs["loss"]
                    pbar.set_postfix({"loss": loss.item()})
            else:
@@ -224,7 +225,9 @@ def main():
            tp_size=1,
            pp_size=2,
            num_microbatches=None,
-            microbatch_size=1,
+            pp_style="interleaved",
+            num_model_chunks=2,
+            microbatch_size=16,
            enable_all_optimization=True,
            zero_stage=1,
            precision="fp16",

--- a/examples/language/bert/test_ci.sh
+++ b/examples/language/bert/test_ci.sh
 #!/bin/bash
-set -xe
+set -x

 pip install -r requirements.txt

+FAIL_LIMIT=3
+
 for plugin in "torch_ddp" "torch_ddp_fp16" "gemini" "low_level_zero" "hybrid_parallel"; do
-   torchrun --standalone --nproc_per_node 4  finetune.py --target_f1 0.86 --plugin $plugin --model_type "bert"
+    for i in $(seq 1 $FAIL_LIMIT); do
+        torchrun --standalone --nproc_per_node 4 finetune.py --target_f1 0.86 --plugin $plugin --model_type "bert" && break
+        echo "Failed $i times"
+        if [ $i -eq $FAIL_LIMIT ]; then
+            echo "Failed $FAIL_LIMIT times, exiting"
+            exit 1
+        fi
+    done
 done
--- a/examples/language/llama2/README.md
+++ b/examples/language/llama2/README.md
@@ -6,7 +6,6 @@
 </p>

 - 70 billion parameter LLaMA2 model training accelerated by 195%
-[[code]](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/llama2)
 [[blog]](https://www.hpc-ai.tech/blog/70b-llama2-training)

 ### LLaMA1
@@ -15,7 +14,6 @@
 </p>

 - 65-billion-parameter large model pretraining accelerated by 38%
-[[code]](https://github.com/hpcaitech/ColossalAI/tree/example/llama/examples/language/llama)
 [[blog]](https://www.hpc-ai.tech/blog/large-model-pretraining)

 ## Dataset
@@ -103,7 +101,7 @@ Here is details about CLI arguments:
 - Max length: `-l`, `--max_length`. The default value is 4096.
 - Mixed precision: `-x`, `--mixed_precision`. The default value is "fp16". "fp16" and "bf16" are supported.
 - Save interval: `-i`, `--save_interval`. The interval (steps) of saving checkpoints. The default value is 1000.
- Checkpoint directory: `-o`, `--save_dir`. The directoty path to save checkpoints. The default value is `checkpoint`.
+- Checkpoint directory: `-o`, `--save_dir`. The directory path to save checkpoints. The default value is `checkpoint`.
 - Checkpoint to load: `-f`, `--load`. The checkpoint path to load. The default value is `None`.
 - Gradient clipping: `--gradient_clipping`. The default value is 1.0.
 - Tensorboard log directory: `-t`, `--tensorboard_dir`. The directory path to save tensorboard logs. The default value is `tb_logs`.
@@ -123,7 +121,7 @@ Here we will show an example of how to run training
 llama pretraining with `gemini, batch_size=16, sequence_length=4096, gradient_checkpoint=True, flash_attn=True`.

 #### a. Running environment
-This experiment was performed on 4 computing nodes with 32 A800 GPUs in total for LLaMA-1 65B. The nodes are
+This experiment was performed on 4 computing nodes with 32 A800/H800 80GB GPUs in total for LLaMA-1 65B or LLaMA-2 70B. The nodes are
 connected with RDMA and GPUs within one node are fully connected with NVLink.

 #### b. Running command
@@ -217,7 +215,7 @@ Here is details about CLI arguments:
 - Max length: `-l`, `--max_length`. The default value is 4096.
 - Mixed precision: `-x`, `--mixed_precision`. The default value is "fp16". "fp16" and "bf16" are supported.
 - Save interval: `-i`, `--save_interval`. The interval (steps) of saving checkpoints. The default value is 1000.
- Checkpoint directory: `-o`, `--save_dir`. The directoty path to save checkpoints. The default value is `checkpoint`.
+- Checkpoint directory: `-o`, `--save_dir`. The directory path to save checkpoints. The default value is `checkpoint`.
 - Checkpoint to load: `-f`, `--load`. The checkpoint path to load. The default value is `None`.
 - Gradient clipping: `--gradient_clipping`. The default value is 1.0.
 - Tensorboard log directory: `-t`, `--tensorboard_dir`. The directory path to save tensorboard logs. The default value is `tb_logs`.

--- a/examples/language/llama2/benchmark.py
+++ b/examples/language/llama2/benchmark.py
@@ -71,9 +71,10 @@ def main():
    parser.add_argument("--offload_optim_frac", type=float, default=0.0, help="Offload optim fraction. Only for gemini")
    parser.add_argument("--offload_param_frac", type=float, default=0.0, help="Offload param fraction. Only for gemini")
    parser.add_argument("--tp", type=int, default=1, help="Tensor parallel size")
+    parser.add_argument("--extra_dp", type=int, default=1, help="Extra data parallel size, used for Gemini")
    parser.add_argument("--pp", type=int, default=1, help="Pipeline parallel size")
-    parser.add_argument("--mbs", type=int, default=1)
-    parser.add_argument("--zero", type=int, default=0)
+    parser.add_argument("--mbs", type=int, default=1, help="Micro batch size of pipeline parallel")
+    parser.add_argument("--zero", type=int, default=0, help="Zero Stage when hybrid plugin is enabled")
    args = parser.parse_args()

    colossalai.launch_from_torch({})
@@ -92,9 +93,17 @@ def main():
            shard_param_frac=args.shard_param_frac,
            offload_optim_frac=args.offload_optim_frac,
            offload_param_frac=args.offload_param_frac,
+            tp_size=args.tp,
+            extra_dp_size=args.extra_dp,
        )
    elif args.plugin == "gemini_auto":
-        plugin = GeminiPlugin(placement_policy="auto", precision="bf16", warmup_non_model_data_ratio=args.warmup_ratio)
+        plugin = GeminiPlugin(
+            placement_policy="auto",
+            precision="bf16",
+            warmup_non_model_data_ratio=args.warmup_ratio,
+            tp_size=args.tp,
+            extra_dp_size=args.extra_dp,
+        )
    elif args.plugin == "fsdp":
        if use_empty_init:
            plugin = TorchFSDPPlugin(
@@ -129,9 +138,11 @@ def main():
        plugin = HybridParallelPlugin(
            tp_size=args.tp,
            pp_size=args.pp,
+            pp_style="interleaved",
            zero_stage=args.zero,
+            num_model_chunks=2,
            enable_fused_normalization=torch.cuda.is_available(),
-            num_microbatches=args.mbs,
+            microbatch_size=args.mbs,
            precision="bf16",
        )
    elif args.plugin == "3d_cpu":
@@ -141,7 +152,7 @@ def main():
            zero_stage=args.zero,
            cpu_offload=True,
            enable_fused_normalization=torch.cuda.is_available(),
-            num_microbatches=args.mbs,
+            microbatch_size=args.mbs,
            initial_scale=2**8,
            precision="bf16",
        )

--- a/examples/language/llama2/scripts/benchmark_70B/3d.sh
+++ b/examples/language/llama2/scripts/benchmark_70B/3d.sh
@@ -14,4 +14,4 @@ cd ../..

 export OMP_NUM_THREADS=8

-colossalai run --nproc_per_node 8 --hostfile $HOSTFILE benchmark.py -c 70b -p 3d -g -x -b 8 --tp 4 --pp 2 --mbs 4
+colossalai run --nproc_per_node 8 --hostfile $HOSTFILE benchmark.py -c 70b -p 3d -g -x -b 8 --tp 4 --pp 2 --mbs 1
--- a/examples/language/openmoe/README.md
+++ b/examples/language/openmoe/README.md
 ## OpenMoE
 [OpenMoE](https://github.com/XueFuzhao/OpenMoE) is the open-source community's first decoder-only MoE transformer. OpenMoE is implemented in Jax, and [Colossal-AI](https://github.com/hpcaitech/ColossalAI) has pioneered an efficient open-source support for this model in PyTorch, enabling a broader range of users to participate in and use this model. The following example of [Colossal-AI](https://github.com/hpcaitech/ColossalAI) demonstrates finetune and inference methods.

+
+<p align="center">
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/examples/images/MOE_training.png" width=800/>
+</p>
+
+* [2023/11] [Enhanced MoE Parallelism, Open-source MoE Model Training Can Be 9 Times More Efficient](https://www.hpc-ai.tech/blog/enhanced-moe-parallelism-open-source-moe-model-training-can-be-9-times-more-efficient)
+[[code]](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/openmoe)
+[[blog]](https://www.hpc-ai.tech/blog/enhanced-moe-parallelism-open-source-moe-model-training-can-be-9-times-more-efficient)
+
 ## Usage

 ### 1. Installation

--- a/requirements/requirements-test.txt
+++ b/requirements/requirements-test.txt
 diffusers
-fbgemm-gpu==0.2.0
 pytest
 coverage==7.2.3
 git+https://github.com/hpcaitech/pytest-testmon
@@ -16,7 +15,7 @@ triton==2.1.0
 requests==2.27.1 # downgrade to avoid huggingface error https://github.com/huggingface/transformers/issues/17611
 SentencePiece
 ninja
-flash_attn==2.0.5
+flash_attn
 datasets
 pydantic
 ray

--- a/tests/kit/model_zoo/__init__.py
+++ b/tests/kit/model_zoo/__init__.py
-from . import custom, diffusers, timm, torchaudio, torchrec, torchvision, transformers
+import os
+from . import custom, diffusers, timm, torchaudio, torchvision, transformers
 from .executor import run_fwd, run_fwd_bwd
 from .registry import model_zoo

-__all__ = ["model_zoo", "run_fwd", "run_fwd_bwd"]
+# We pick a subset of models for fast testing in order to reduce the total testing time
+COMMON_MODELS = [
+    'custom_hanging_param_model',
+    'custom_nested_model',
+    'custom_repeated_computed_layers',
+    'custom_simple_net',
+    'diffusers_clip_text_model',
+    'diffusers_auto_encoder_kl',
+    'diffusers_unet2d_model',
+    'timm_densenet',
+    'timm_resnet',
+    'timm_swin_transformer',
+    'torchaudio_wav2vec2_base',
+    'torchaudio_conformer',
+    'transformers_bert_for_masked_lm',
+    'transformers_bloom_for_causal_lm',
+    'transformers_falcon_for_causal_lm',
+    'transformers_chatglm_for_conditional_generation',
+    'transformers_llama_for_casual_lm',
+    'transformers_vit_for_masked_image_modeling',
+    'transformers_mistral_for_casual_lm'
+]
+
+IS_FAST_TEST = os.environ.get('FAST_TEST', '0') == '1'
+
+
+__all__ = ["model_zoo", "run_fwd", "run_fwd_bwd", 'COMMON_MODELS', 'IS_FAST_TEST']
+
--- a/tests/kit/model_zoo/registry.py
+++ b/tests/kit/model_zoo/registry.py
 #!/usr/bin/env python
 from dataclasses import dataclass
-from typing import Callable
+from typing import Callable, List, Union

 __all__ = ["ModelZooRegistry", "ModelAttribute", "model_zoo"]

@@ -61,7 +61,9 @@ class ModelZooRegistry(dict):
        """
        self[name] = (model_fn, data_gen_fn, output_transform_fn, loss_fn, model_attribute)

-    def get_sub_registry(self, keyword: str):
+    def get_sub_registry(
+        self, keyword: Union[str, List[str]], exclude: Union[str, List[str]] = None, allow_empty: bool = False
+    ):
        """
        Get a sub registry with models that contain the keyword.

@@ -70,10 +72,32 @@ class ModelZooRegistry(dict):
        """
        new_dict = dict()

+        if isinstance(keyword, str):
+            keyword_list = [keyword]
+        else:
+            keyword_list = keyword
+        assert isinstance(keyword_list, (list, tuple))
+
+        if exclude is None:
+            exclude_keywords = []
+        elif isinstance(exclude, str):
+            exclude_keywords = [exclude]
+        else:
+            exclude_keywords = exclude
+        assert isinstance(exclude_keywords, (list, tuple))
+
        for k, v in self.items():
-            if keyword in k:
+            for kw in keyword_list:
+                if kw in k:
+                    should_exclude = False
+                    for ex_kw in exclude_keywords:
+                        if ex_kw in k:
+                            should_exclude = True
+
+                    if not should_exclude:
                        new_dict[k] = v

+        if not allow_empty:
            assert len(new_dict) > 0, f"No model found with keyword {keyword}"
        return new_dict


--- a/tests/kit/model_zoo/transformers/__init__.py
+++ b/tests/kit/model_zoo/transformers/__init__.py
@@ -3,10 +3,17 @@ from .bert import *
 from .blip2 import *
 from .bloom import *
 from .chatglm2 import *
+from .falcon import *
 from .gpt import *
+from .gptj import *
 from .llama import *
 from .opt import *
 from .sam import *
 from .t5 import *
 from .vit import *
 from .whisper import *
+
+try:
+    from .mistral import *
+except ImportError:
+    print("This version of transformers doesn't support mistral.")
--- a/tests/kit/model_zoo/transformers/chatglm2.py
+++ b/tests/kit/model_zoo/transformers/chatglm2.py
@@ -2,7 +2,6 @@ import torch

 from colossalai.shardformer.modeling.chatglm2_6b.configuration_chatglm import ChatGLMConfig
 from colossalai.shardformer.modeling.chatglm2_6b.modeling_chatglm import ChatGLMForConditionalGeneration, ChatGLMModel
-
 from ..registry import ModelAttribute, model_zoo

 # ================================

--- a/tests/kit/model_zoo/transformers/falcon.py
+++ b/tests/kit/model_zoo/transformers/falcon.py
+import torch
+import transformers
+
+from ..registry import ModelAttribute, model_zoo
+
+# ===============================
+# Register Falcon
+# ===============================
+
+
+def data_gen():
+    # Generated from following code snippet
+    #
+    # from transformers import AutoTokenizer
+    # input = 'Hello, my dog is cute is cute' (last two words repeated to satisfy length requirement)
+    # tokenized_input = tokenizer(input, return_tensors='pt')
+    # input_ids = tokenized_input['input_ids']
+    # attention_mask = tokenized_input['attention_mask']
+    input_ids = torch.tensor([[15496, 11, 616, 3290, 318, 13779, 318, 13779]], dtype=torch.int64)
+    attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1]], dtype=torch.int64)
+    return dict(input_ids=input_ids, attention_mask=attention_mask)
+
+
+def data_gen_for_lm():
+    # LM data gen
+    # for language modeling the labels are the target tokens; with no padding, `input_ids` can be reused as `labels`
+    data = data_gen()
+    data["labels"] = data["input_ids"].clone()
+    return data
+
+
+def data_gen_for_token_classification():
+    # token classification data gen
+    # for token classification, `labels` holds a class index (0 or 1) per token, not a token id
+    data = data_gen()
+    data["labels"] = torch.tensor([[0, 0, 0, 0, 0, 0, 0, 0]], dtype=torch.int64)
+    return data
+
+
+def data_gen_for_sequence_classification():
+    # sequence classification data gen
+    data = data_gen()
+    data["labels"] = torch.tensor([0], dtype=torch.int64)
+    return data
+
+
+def data_gen_for_question_answering():
+    input_ids = torch.tensor(
+        [[57647, 1620, 23967, 620, 107373, 34, 91514, 620, 107373, 1620, 267, 35378, 48946, 18161, 48946, 18161]],
+        dtype=torch.int64,
+    )
+    attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=torch.int64)
+    start_positions = torch.tensor([1], dtype=torch.int64)
+    end_positions = torch.tensor([10], dtype=torch.int64)
+    return dict(
+        input_ids=input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions
+    )
+
+
+# define output transform function
+output_transform_fn = lambda x: x
+
+# define loss function
+loss_fn_for_falcon_model = lambda x: torch.nn.functional.mse_loss(
+    x.last_hidden_state, torch.ones_like(x.last_hidden_state)
+)
+loss_fn_for_causal_lm = lambda x: x.loss
+loss_fn_for_classification = lambda x: x.loss
+loss_fn_for_question_answering = lambda x: x.loss
+
+config = transformers.FalconConfig(
+    num_hidden_layers=2,
+    num_attention_heads=4,
+    vocab_size=250880,
+    hidden_dropout=0,
+    attention_dropout=0,
+    hidden_size=64,
+    multi_query=False,
+    new_decoder_architecture=True,
+    pad_token_id=-1,
+)
+
+model_zoo.register(
+    name="transformers_falcon",
+    model_fn=lambda: transformers.FalconModel(config),
+    data_gen_fn=data_gen,
+    output_transform_fn=output_transform_fn,
+    loss_fn=loss_fn_for_falcon_model,
+    model_attribute=ModelAttribute(has_control_flow=True),
+)
+
+model_zoo.register(
+    name="transformers_falcon_for_causal_lm",
+    model_fn=lambda: transformers.FalconForCausalLM(config),
+    data_gen_fn=data_gen_for_lm,
+    output_transform_fn=output_transform_fn,
+    loss_fn=loss_fn_for_causal_lm,
+    model_attribute=ModelAttribute(has_control_flow=True),
+)
+
+model_zoo.register(
+    name="transformers_falcon_for_sequence_classification",
+    model_fn=lambda: transformers.FalconForSequenceClassification(config),
+    data_gen_fn=data_gen_for_sequence_classification,
+    output_transform_fn=output_transform_fn,
+    loss_fn=loss_fn_for_classification,
+    model_attribute=ModelAttribute(has_control_flow=True),
+)
+model_zoo.register(
+    name="transformers_falcon_for_token_classification",
+    model_fn=lambda: transformers.FalconForTokenClassification(config),
+    data_gen_fn=data_gen_for_token_classification,
+    output_transform_fn=output_transform_fn,
+    loss_fn=loss_fn_for_classification,
+    model_attribute=ModelAttribute(has_control_flow=True),
+)
+model_zoo.register(
+    name="transformers_falcon_for_question_answering",
+    model_fn=lambda: transformers.FalconForQuestionAnswering(config),
+    data_gen_fn=data_gen_for_question_answering,
+    output_transform_fn=output_transform_fn,
+    loss_fn=loss_fn_for_question_answering,
+    model_attribute=ModelAttribute(has_control_flow=True),
+)
--- a/tests/kit/model_zoo/transformers/gpt.py
+++ b/tests/kit/model_zoo/transformers/gpt.py
@@ -14,7 +14,7 @@ def data_gen():
    # Generated from following code snippet
    #
    # from transformers import GPT2Tokenizer
-    # input = 'Hello, my dog is cute'
+    # input = 'Hello, my dog is cute is cute' (last two words repeated to satisfy length requirement)
    # tokenized_input = tokenizer(input, return_tensors='pt')
    # input_ids = tokenized_input['input_ids']
    # attention_mask = tokenized_input['attention_mask']

--- a/tests/kit/model_zoo/transformers/gptj.py
+++ b/tests/kit/model_zoo/transformers/gptj.py
+import copy
+
+import torch
+import transformers
+
+from ..registry import ModelAttribute, model_zoo
+
+# ===============================
+# Register single-sentence GPT-J
+# ===============================
+
+
+def data_gen():
+    # Generated from following code snippet
+    #
+    # from transformers import AutoTokenizer
+    # input = 'Hello, my dog is cute is cute' (last two words repeated to satisfy length requirement)
+    # tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
+    # tokenized_input = tokenizer(input, return_tensors='pt')
+    # input_ids = tokenized_input['input_ids']
+    # attention_mask = tokenized_input['attention_mask']
+    input_ids = torch.tensor([[15496, 11, 616, 3290, 318, 13779, 318, 13779]], dtype=torch.int64)
+    attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1]], dtype=torch.int64)
+    return dict(input_ids=input_ids, attention_mask=attention_mask)
+
+
+def data_gen_for_lm():
+    # LM data gen
+    # for language modeling the labels are the target tokens; with no padding, `input_ids` can be reused as `labels`
+    data = data_gen()
+    data["labels"] = data["input_ids"].clone()
+    return data
+
+
+def data_gen_for_question_answering():
+    # question answering data gen
+    # the targets are `start_positions` and `end_positions` rather than `labels`
+    data = data_gen()
+    start_positions = torch.tensor([0], dtype=torch.int64)
+    data["start_positions"] = start_positions
+    end_positions = torch.tensor([1], dtype=torch.int64)
+    data["end_positions"] = end_positions
+    return data
+
+
+def data_gen_for_sequence_classification():
+    # sequence classification data gen
+    data = data_gen()
+    data["labels"] = torch.tensor([1], dtype=torch.int64)
+    return data
+
+
+# define output transform function
+output_transform_fn = lambda x: x
+
+# define loss function
+loss_fn_for_gptj_model = lambda x: torch.nn.functional.mse_loss(
+    x.last_hidden_state, torch.ones_like(x.last_hidden_state)
+)
+loss_fn = lambda x: x.loss
+
+config = transformers.GPTJConfig(
+    n_layer=2,
+    n_head=4,
+    vocab_size=50258,
+    n_embd=256,
+    hidden_size=256,
+    n_positions=512,
+    attn_pdrop=0,
+    embd_pdrop=0,
+    resid_pdrop=0,
+    hidden_dropout=0,
+    problem_type="single_label_classification",
+    pad_token_id=50256,
+)
+
+config_for_token_classification = copy.deepcopy(config)
+config_for_token_classification.num_labels = 2
+
+# register the following models
+model_zoo.register(
+    name="transformers_gptj",
+    model_fn=lambda: transformers.GPTJModel(config),
+    data_gen_fn=data_gen,
+    output_transform_fn=output_transform_fn,
+    loss_fn=loss_fn_for_gptj_model,
+    model_attribute=ModelAttribute(has_control_flow=True),
+)
+model_zoo.register(
+    name="transformers_gptj_lm",
+    model_fn=lambda: transformers.GPTJForCausalLM(config),
+    data_gen_fn=data_gen_for_lm,
+    output_transform_fn=output_transform_fn,
+    loss_fn=loss_fn,
+    model_attribute=ModelAttribute(has_control_flow=True),
+)
+model_zoo.register(
+    name="transformers_gptj_for_question_answering",
+    model_fn=lambda: transformers.GPTJForQuestionAnswering(config),
+    data_gen_fn=data_gen_for_question_answering,
+    output_transform_fn=output_transform_fn,
+    loss_fn=loss_fn,
+    model_attribute=ModelAttribute(has_control_flow=True),
+)
+model_zoo.register(
+    name="transformers_gptj_for_sequence_classification",
+    model_fn=lambda: transformers.GPTJForSequenceClassification(config_for_token_classification),
+    data_gen_fn=data_gen_for_sequence_classification,
+    output_transform_fn=output_transform_fn,
+    loss_fn=loss_fn,
+    model_attribute=ModelAttribute(has_control_flow=True),
+)
--- a/tests/kit/model_zoo/transformers/mistral.py
+++ b/tests/kit/model_zoo/transformers/mistral.py
+import torch
+import transformers
+from transformers import MistralConfig
+
+from ..registry import ModelAttribute, model_zoo
+
+# ===============================
+# Register single-sentence Mistral
+# ===============================
+
+
+def data_gen():
+    # Generated from following code snippet
+    #
+    # from transformers import AutoModelForCausalLM, AutoTokenizer
+    # tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
+    # input = 'My favourite condiment is vinegar'
+    # tokenized_input = tokenizer([input], return_tensors="pt")
+    # input_ids = tokenized_input['input_ids']
+    # attention_mask = tokenized_input['attention_mask']
+    input_ids = torch.tensor([[1, 1984, 16020, 2076, 2487, 349, 21375, 4749]], dtype=torch.int64)
+    attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1]], dtype=torch.int64)
+    return dict(input_ids=input_ids, attention_mask=attention_mask)
+
+
+def data_gen_for_lm():
+    # LM data gen
+    # for language modeling the labels are the target tokens; with no padding, `input_ids` can be reused as `labels`
+    data = data_gen()
+    data["labels"] = data["input_ids"].clone()
+    return data
+
+
+def data_gen_for_sequence_classification():
+    # sequence classification data gen
+    data = data_gen()
+    data["labels"] = torch.tensor([1], dtype=torch.int64)
+    return data
+
+
+# define output transform function
+output_transform_fn = lambda x: x
+
+# define loss function
+loss_fn_for_mistral_model = lambda x: torch.nn.functional.mse_loss(
+    x.last_hidden_state, torch.ones_like(x.last_hidden_state)
+)
+loss_fn = lambda x: x.loss
+loss_fn_for_seq_classification = lambda output: output.logits.mean()
+
+config = MistralConfig(
+    hidden_size=256, intermediate_size=256, num_attention_heads=64, num_hidden_layers=2, vocab_size=50258
+)
+
+model_zoo.register(
+    name="transformers_mistral",
+    model_fn=lambda: transformers.MistralModel(config),
+    data_gen_fn=data_gen,
+    output_transform_fn=output_transform_fn,
+    loss_fn=loss_fn_for_mistral_model,
+    model_attribute=ModelAttribute(has_control_flow=True),
+)
+model_zoo.register(
+    name="transformers_mistral_for_casual_lm",
+    model_fn=lambda: transformers.MistralForCausalLM(config),
+    data_gen_fn=data_gen_for_lm,
+    output_transform_fn=output_transform_fn,
+    loss_fn=loss_fn,
+    model_attribute=ModelAttribute(has_control_flow=True),
+)
+model_zoo.register(
+    name="transformers_mistral_for_sequence_classification",
+    model_fn=lambda: transformers.MistralForSequenceClassification(config),
+    data_gen_fn=data_gen_for_sequence_classification,
+    output_transform_fn=output_transform_fn,
+    loss_fn=loss_fn_for_seq_classification,
+    model_attribute=ModelAttribute(has_control_flow=True),
+)
--- a/tests/test_booster/test_plugin/test_3d_plugin.py
+++ b/tests/test_booster/test_plugin/test_3d_plugin.py
+import copy
 from contextlib import nullcontext
 from typing import Optional

 import torch
 import torch.distributed as dist
+from torch.testing import assert_close
+from torch.utils.data import Dataset

 import colossalai
 from colossalai.booster import Booster
@@ -10,10 +13,35 @@ from colossalai.booster.plugin import HybridParallelPlugin
 from colossalai.fx import is_compatible_with_meta
 from colossalai.lazy.lazy_init import LazyInitContext
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
+from colossalai.testing import clear_cache_before_run, parameterize, rerun_if_address_is_in_use, spawn
+from colossalai.utils import get_current_device, set_seed
 from tests.kit.model_zoo import model_zoo


+class RandomDataset(Dataset):
+    def __init__(self, num_samples: int = 100, max_length: int = 512, vocab_size: int = 32000):
+        self.num_samples = num_samples
+        self.max_length = max_length
+        set_seed(42)
+        self.input_ids = torch.randint(0, vocab_size, (num_samples, max_length), device=get_current_device())
+        self.attention_mask = torch.ones_like(self.input_ids)
+
+    def __len__(self):
+        return self.num_samples
+
+    def __getitem__(self, idx):
+        return {
+            "input_ids": self.input_ids[idx],
+            "attention_mask": self.attention_mask[idx],
+            "labels": self.input_ids[idx],
+        }
+
+
+def move_to_cuda(batch):
+    return {k: v.cuda() for k, v in batch.items()}
+
+
+@clear_cache_before_run()
 def run_fn(init_method, model_fn, data_gen_fn, output_transform_fn) -> Optional[str]:
    try:
        if init_method == "lazy":
@@ -69,7 +97,6 @@ def check_3d_plugin(init_method: str = "none", early_stop: bool = True):
        "transformers_llama_for_casual_lm"
    ).items():
        err = run_fn(init_method, model_fn, data_gen_fn, output_transform_fn)
-        torch.cuda.empty_cache()

        if err is None:
            passed_models.append(name)
@@ -85,10 +112,145 @@ def check_3d_plugin(init_method: str = "none", early_stop: bool = True):
    assert len(failed_info) == 0, "\n".join([f"{k}: {v}" for k, v in failed_info.items()])


+@parameterize(  # presumably runs the function once per dict below (run_dist calls it with no arguments) - confirm against colossalai.testing
+    "test_args",
+    [
+        {  # case 1: tp=2, pp=2, zero=0
+            "batch_size": 8,
+            "num_steps": 4,
+            "tp": 2,
+            "pp": 2,
+            "pp_style": "1f1b",
+            "num_model_chunks": 1,
+            "num_microbatches": 4,
+            "zero": 0,
+            "precision": "fp16",
+            "initial_scale": 1,  # NOTE(review): "initial_scale" is never read by run_grad_acc_test below
+            "max_length": 512,
+            "gradient_accumulation_step": 2,
+        },
+        {  # case 2: tp=1, pp=2, zero=1
+            "batch_size": 8,
+            "num_steps": 4,
+            "tp": 1,
+            "pp": 2,
+            "pp_style": "1f1b",
+            "num_model_chunks": 1,
+            "num_microbatches": 4,
+            "zero": 1,
+            "precision": "fp16",
+            "initial_scale": 1,
+            "max_length": 512,
+            "gradient_accumulation_step": 2,
+        },
+        {  # case 3: tp=2, pp=1, zero=2 (the only case that never enters no_sync, see the condition in the loop)
+            "batch_size": 1,
+            "num_steps": 4,
+            "tp": 2,
+            "pp": 1,
+            "pp_style": "1f1b",
+            "num_model_chunks": 1,
+            "num_microbatches": 1,
+            "zero": 2,
+            "precision": "fp16",
+            "initial_scale": 1,
+            "max_length": 512,
+            "gradient_accumulation_step": 2,
+        },
+        {  # case 4: tp=2, pp=1, zero=0
+            "batch_size": 1,
+            "num_steps": 4,
+            "tp": 2,
+            "pp": 1,
+            "pp_style": "1f1b",
+            "num_model_chunks": 1,
+            "num_microbatches": 1,
+            "zero": 0,
+            "precision": "fp16",
+            "initial_scale": 1,
+            "max_length": 512,
+            "gradient_accumulation_step": 2,
+        },
+    ],
+)
+def run_grad_acc_test(test_args):  # trains a boosted model and a deep-copied reference with gradient accumulation, then compares parameters
+    model_fn, *_ = next(iter(model_zoo.get_sub_registry("transformers_gpt_lm").values()))  # first registry entry only; just its model_fn is used
+    model = model_fn()
+    optimizer = HybridAdam(model.parameters())
+    origin_model = copy.deepcopy(model).cuda()  # reference copy taken before boosting, moved to CUDA
+    origin_optimizer = HybridAdam(origin_model.parameters())
+
+    plugin = HybridParallelPlugin(  # parallel layout and precision come straight from test_args
+        tp_size=test_args["tp"],
+        pp_size=test_args["pp"],
+        pp_style=test_args["pp_style"],
+        zero_stage=test_args["zero"],
+        num_model_chunks=test_args["num_model_chunks"],
+        enable_fused_normalization=True,
+        num_microbatches=test_args["num_microbatches"],
+        precision=test_args["precision"],
+    )
+    booster = Booster(plugin=plugin)
+
+    dataset = RandomDataset(
+        num_samples=test_args["batch_size"] * test_args["num_steps"] * plugin.dp_size,  # sample count scales with plugin.dp_size
+        max_length=test_args["max_length"],
+        vocab_size=model.config.vocab_size,
+    )
+    dataloader = plugin.prepare_dataloader(dataset, batch_size=test_args["batch_size"], shuffle=True, drop_last=True)
+
+    model, optimizer, _, dataloader, _ = booster.boost(model, optimizer, dataloader=dataloader)
+
+    grad_accu_step = test_args["gradient_accumulation_step"]
+    for step, batch in enumerate(dataloader):
+        batch = move_to_cuda(batch)
+        # train origin model
+        origin_output = origin_model(**batch)
+        origin_loss = origin_output[0] / grad_accu_step  # loss is scaled by 1/grad_accu_step before backward
+        origin_loss.backward()
+
+        if (step + 1) % grad_accu_step != 0 and test_args["zero"] != 2:  # non-final accumulation step and zero stage is not 2
+            ctx = booster.no_sync(model, optimizer)
+        else:
+            ctx = nullcontext()  # final accumulation step, or zero stage 2: plain context
+
+        with ctx:
+            if plugin.stage_manager is not None:  # pipeline path: forward/backward go through booster.execute_pipeline
+                batch = iter([batch])  # execute_pipeline is handed an iterator yielding this single batch
+                booster.execute_pipeline(
+                    batch,
+                    model,
+                    criterion=lambda outputs, inputs: outputs[0] / grad_accu_step,  # same loss scaling as the reference model
+                    optimizer=optimizer,
+                    return_loss=False,
+                )
+            else:  # non-pipeline path: ordinary forward, backward through the booster
+                outputs = model(**batch)
+                loss = outputs[0] / grad_accu_step
+                booster.backward(loss, optimizer)
+
+        if (step + 1) % grad_accu_step == 0:  # every grad_accu_step-th batch: step and zero both optimizers
+            # update origin model weight
+            origin_optimizer.step()
+            origin_optimizer.zero_grad()
+
+            # update sharded model
+            optimizer.step()
+            optimizer.zero_grad()
+
+    # tricky code here: shard the origin model in order to check the parameters in the same stage.
+    origin_model, origin_optimizer, _, dataloader, _ = booster.boost(
+        origin_model, origin_optimizer, dataloader=dataloader
+    )
+    for p1, p2 in zip(model.unwrap().parameters(), origin_model.unwrap().parameters()):  # pairs parameters positionally
+        assert_close(p1.to(p2.dtype), p2, atol=1e-2, rtol=1e-2)  # cast to the reference dtype, then compare with 1e-2 tolerances
+
+
 def run_dist(rank, world_size, port, early_stop: bool = True):  # launches colossalai on localhost for this rank, then runs both checks
     # init dist env
     colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host="localhost")
     check_3d_plugin(early_stop=early_stop)
+    run_grad_acc_test()  # no arguments passed; test_args presumably comes from its @parameterize decorator - confirm


 @rerun_if_address_is_in_use()

--- a/tests/test_booster/test_plugin/test_gemini_plugin.py
+++ b/tests/test_booster/test_plugin/test_gemini_plugin.py
 from contextlib import nullcontext
 from typing import Optional
-import pytest

+import pytest
 import torch
 import torch.distributed as dist

@@ -11,13 +11,18 @@ from colossalai.booster.plugin import GeminiPlugin
 from colossalai.fx import is_compatible_with_meta
 from colossalai.lazy.lazy_init import LazyInitContext
 from colossalai.nn.optimizer import HybridAdam
-from colossalai.tensor.d_tensor.api import clear_layout_converter
-from colossalai.shardformer.layer.utils import Randomizer
 from colossalai.tensor.colo_parameter import ColoParameter
-from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
-from tests.kit.model_zoo import model_zoo
+from colossalai.testing import (
+    clear_cache_before_run,
+    parameterize,
+    rerun_if_address_is_in_use,
+    skip_if_not_enough_gpus,
+    spawn,
+)
+from tests.kit.model_zoo import COMMON_MODELS, IS_FAST_TEST, model_zoo


+@clear_cache_before_run()
 def run_fn(init_method, model_fn, data_gen_fn, output_transform_fn, zero_size, tp_size) -> Optional[str]:
    try:
        if init_method == "lazy":
@@ -26,7 +31,13 @@ def run_fn(init_method, model_fn, data_gen_fn, output_transform_fn, zero_size, t
            ctx = nullcontext()
        extra_dp_size = dist.get_world_size() // (zero_size * tp_size)
        enable_all_optimization = True if tp_size > 1 else False
-        plugin = GeminiPlugin(max_norm=1.0, initial_scale=2**5, tp_size=tp_size, extra_dp_size=extra_dp_size, enable_all_optimization=enable_all_optimization)
+        plugin = GeminiPlugin(
+            max_norm=1.0,
+            initial_scale=2**5,
+            tp_size=tp_size,
+            extra_dp_size=extra_dp_size,
+            enable_all_optimization=enable_all_optimization,
+        )
        booster = Booster(plugin=plugin)
        with ctx:
            model = model_fn()
@@ -62,11 +73,13 @@ def run_fn(init_method, model_fn, data_gen_fn, output_transform_fn, zero_size, t
 # @parameterize('init_method', ['lazy', 'none', 'colo'])


-@parameterize("subset", ["torchvision", "transformers", "diffusers"])
+@parameterize("subset", [COMMON_MODELS] if IS_FAST_TEST else ["torchvision", "transformers", "diffusers"])
 @parameterize("init_method", ["none"])
 @parameterize("zero_size", [2])
 @parameterize("tp_size", [2])
-def check_gemini_plugin(subset: str, init_method: str = "none", early_stop: bool = True, zero_size: int = 1, tp_size: int = 1):
+def check_gemini_plugin(
+    subset: str, init_method: str = "none", early_stop: bool = True, zero_size: int = 1, tp_size: int = 1
+):
    """check gemini plugin over model zoo

    Args:
@@ -105,6 +118,14 @@ def check_gemini_plugin(subset: str, init_method: str = "none", early_stop: bool
            "transformers_sam",
            "transformers_vit",
            "transformers_gpt_double_heads",  # TODO check why does the model fail to run using Gemini
+            "transformers_falcon",  # TODO check why falcon fails to run Gemini
+            "transformers_falcon_for_causal_lm",
+            "transformers_falcon_for_sequence_classification",
+            "transformers_falcon_for_token_classification",
+            "transformers_falcon_for_question_answering",
+            "transformers_gptj_lm",  # leads to OOM when running in CI
+            "transformers_gptj_for_question_answering",
+            "transformers_gptj_for_sequence_classification",
        ]:
            continue

@@ -131,7 +152,6 @@ def check_gemini_plugin(subset: str, init_method: str = "none", early_stop: bool
            tp_size = 1

        err = run_fn(init_method, model_fn, data_gen_fn, output_transform_fn, zero_size, tp_size)
-        torch.cuda.empty_cache()
        if err is None:
            passed_models.append(name)
        else:
@@ -156,7 +176,9 @@ def run_dist(rank, world_size, port, early_stop: bool = True):
 def test_gemini_plugin(early_stop: bool = True):
    spawn(run_dist, 4, early_stop=early_stop)

+
 @pytest.mark.largedist
+@skip_if_not_enough_gpus(8)  # presumably skips when fewer than 8 GPUs are available (matches the 8 passed to spawn) - confirm
 @rerun_if_address_is_in_use()
 def test_gemini_plugin_3d(early_stop: bool = True):  # forwards early_stop to run_dist through spawn
     spawn(run_dist, 8, early_stop=early_stop)  # same run_dist entry point, spawned with 8 instead of 4