Commit f13954cd authored by Jianghai's avatar Jianghai Committed by Hongxin Liu

[pipeline] refactor test pipeline and remove useless utils in pipeline (#4324)

* refactor tests

* refactor bloom model

* finish policy tests

* refactor tests

* fix test pure pipeline

* remove test pipeline and cut down launch process

parent d3c6cd66
from typing import Dict, List, Tuple, Type

from torch import Tensor
from torch.nn import Module, Parameter

from colossalai.pipeline.stage_manager import PipelineStageManager

from .base import Policy
from .bert import BertModel, BertModelPolicy

POLICY_MAP: Dict[Type[Module], Type[Policy]] = {
    BertModel: BertModelPolicy,
}


def pipeline_parallelize(
        model: Module,
        stage_manager: PipelineStageManager
) -> Tuple[Dict[str, Parameter], Dict[str, Tensor], List[Dict[int, Tensor]]]:
    if type(model) not in POLICY_MAP:
        raise NotImplementedError(f"Policy for {type(model)} not implemented")
    policy = POLICY_MAP[type(model)](stage_manager)
    return policy.parallelize_model(model)
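
For orientation, a minimal sketch of how `pipeline_parallelize` might be invoked on a model registered in `POLICY_MAP`; the launch step and sizes here are illustrative assumptions, not part of this commit:

# Hypothetical usage sketch: assumes the distributed environment has already
# been launched (e.g. via colossalai.launch) and CUDA is available.
from transformers.models.bert import BertConfig
from transformers.models.bert.modeling_bert import BertModel

from colossalai.cluster import ProcessGroupMesh
from colossalai.pipeline.stage_manager import PipelineStageManager

PP_SIZE = 2    # illustrative number of pipeline stages
pg_mesh = ProcessGroupMesh(PP_SIZE)
stage_manager = PipelineStageManager(pg_mesh, 0)

model = BertModel(BertConfig())
# Each rank keeps only its own stage's parameters and buffers;
# shared_params maps stage index -> tensor for weights tied across stages.
hold_params, hold_buffers, shared_params = pipeline_parallelize(model, stage_manager)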
from typing import Dict, List, Tuple

from torch import Tensor
from torch.nn import Module, Parameter

from colossalai.lazy import LazyTensor
from colossalai.pipeline.stage_manager import PipelineStageManager


class Policy:

    def __init__(self, stage_manager: PipelineStageManager) -> None:
        self.stage_manager = stage_manager

    def setup_model(self, module: Module) -> Tuple[Dict[str, Parameter], Dict[str, Tensor]]:
        """Set up the model for pipeline parallelism.

        Args:
            module (Module): Module to be set up

        Returns:
            Tuple[Dict[str, Parameter], Dict[str, Tensor]]: Parameters and buffers held by the current stage
        """
        hold_params = set()
        hold_buffers = set()

        def init_layer(layer: Module):
            for p in layer.parameters():
                if isinstance(p, LazyTensor):
                    p.materialize()
                p.data = p.cuda()
                hold_params.add(p)
            for b in layer.buffers():
                if isinstance(b, LazyTensor):
                    b.materialize()
                b.data = b.cuda()
                hold_buffers.add(b)

        hold_layers = self.get_hold_layers(module)
        for layer in hold_layers:
            init_layer(layer)

        hold_params_dict = {}
        hold_buffers_dict = {}

        # release tensors that belong to other stages
        for n, p in module.named_parameters():
            if p in hold_params:
                hold_params_dict[n] = p
            else:
                if isinstance(p, LazyTensor):
                    p.materialize()
                p.data = p.cuda()
                p.storage().resize_(0)
        for n, b in module.named_buffers():
            if b in hold_buffers:
                hold_buffers_dict[n] = b
            else:
                if isinstance(b, LazyTensor):
                    b.materialize()
                b.data = b.cuda()
                # FIXME(ver217): using meta tensors may be better here
                b.storage().resize_(0)
        return hold_params_dict, hold_buffers_dict

    def replace_forward(self, module: Module) -> None:
        """Replace the module's forward method in place. This method should be implemented by subclasses.
        The output of internal layers must be a dict.

        Args:
            module (Module): Module whose forward method is replaced
        """
        raise NotImplementedError

    def get_hold_layers(self, module: Module) -> List[Module]:
        """Get the layers that should be held by the current stage. This method should be implemented by subclasses.

        Args:
            module (Module): Module to be set up

        Returns:
            List[Module]: Layers that should be held by the current stage
        """
        raise NotImplementedError

    def get_shared_params(self, module: Module) -> List[Dict[int, Tensor]]:
        """Get the parameters that should be shared across stages. This method should be implemented by subclasses.

        Args:
            module (Module): Module to be set up

        Returns:
            List[Dict[int, Tensor]]: Parameters shared across stages, keyed by stage index.
                E.g. [{0: module.model.embed_tokens.weight, 3: module.lm_head.weight}]
        """
        raise NotImplementedError

    def parallelize_model(self,
                          module: Module) -> Tuple[Dict[str, Parameter], Dict[str, Tensor], List[Dict[int, Tensor]]]:
        """Parallelize the model for pipeline parallelism.

        Args:
            module (Module): Module to be set up

        Returns:
            Tuple[Dict[str, Parameter], Dict[str, Tensor], List[Dict[int, Tensor]]]: Held parameters,
                held buffers, and shared parameters
        """
        hold_params, hold_buffers = self.setup_model(module)
        self.replace_forward(module)
        shared_params = self.get_shared_params(module)
        return hold_params, hold_buffers, shared_params
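
To make the contract concrete, here is a minimal, hypothetical `Policy` subclass for a toy two-block model; the model, its stage split, and the `stage_forward` helper named in the last comment are invented for illustration:

import torch.nn as nn

# Hypothetical example, not part of this commit: a toy model whose two
# blocks are split across two pipeline stages.
class ToyModel(nn.Module):

    def __init__(self):
        super().__init__()
        self.block0 = nn.Linear(16, 16)
        self.block1 = nn.Linear(16, 16)


class ToyModelPolicy(Policy):

    def get_hold_layers(self, module: ToyModel) -> list:
        # the first stage keeps block0; the last stage keeps block1
        if self.stage_manager.is_first_stage():
            return [module.block0]
        return [module.block1]

    def get_shared_params(self, module: ToyModel) -> list:
        # no weights are tied across stages in this toy model
        return []

    def replace_forward(self, module: ToyModel) -> None:
        # a real policy binds a stage-aware forward here, e.g.
        # module.forward = MethodType(partial(stage_forward, stage_manager=self.stage_manager), module)
        pass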
import warnings
from functools import partial
from types import MethodType
from typing import Dict, List, Optional, Tuple, Union

import torch
from torch import Tensor
from torch.nn import Module
from transformers.modeling_outputs import BaseModelOutputWithPastAndCrossAttentions
from transformers.models.bloom.modeling_bloom import BloomModel
from transformers.utils import logging

from colossalai.pipeline.stage_manager import PipelineStageManager

from .base import Policy

logger = logging.get_logger(__name__)
def bloom_model_forward(
    self: BloomModel,
    input_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
    attention_mask: Optional[torch.Tensor] = None,
    head_mask: Optional[torch.LongTensor] = None,
    inputs_embeds: Optional[torch.LongTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    stage_manager: Optional[PipelineStageManager] = None,
    hidden_states: Optional[torch.FloatTensor] = None,
    **deprecated_arguments,
) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]:
    if deprecated_arguments.pop("position_ids", False) is not False:
        # `position_ids` could have been `torch.Tensor` or `None`, so defaulting pop to `False`
        # allows detecting whether users were explicitly passing `None`
        warnings.warn(
            "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. You can safely ignore"
            " passing `position_ids`.",
            FutureWarning,
        )
    if len(deprecated_arguments) > 0:
        raise ValueError(f"Got unexpected arguments: {deprecated_arguments}")

    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    output_hidden_states = (output_hidden_states
                            if output_hidden_states is not None else self.config.output_hidden_states)
    use_cache = use_cache if use_cache is not None else self.config.use_cache
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    # these options are not supported in pipeline mode yet, so warn and disable them
    if output_attentions:
        logger.warning_once('output_attentions=True is not supported for pipeline models at the moment.')
        output_attentions = False
    if output_hidden_states:
        logger.warning_once('output_hidden_states=True is not supported for pipeline models at the moment.')
        output_hidden_states = False
    if use_cache:
        logger.warning_once('use_cache=True is not supported for pipeline models at the moment.')
        use_cache = False

    # Prepare head mask if needed
    # 1.0 in head_mask indicates we keep the head
    # attention_probs has shape batch_size x num_heads x N x N
    # head_mask has shape n_layer x batch x num_heads x N x N
    head_mask = self.get_head_mask(head_mask, self.config.n_layer)

    # case: first stage
    if stage_manager.is_first_stage():
        # check input_ids and inputs_embeds
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            batch_size, seq_length = input_ids.shape
        elif inputs_embeds is not None:
            batch_size, seq_length, _ = inputs_embeds.shape
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        # hidden_states is initialized in the first stage and then passed to the next stage
        hidden_states = self.word_embeddings_layernorm(inputs_embeds)
    else:
        input_shape = hidden_states.shape[:-1]
        batch_size, seq_length = input_shape

    # containers for the optional recorded outputs (all disabled above in pipeline mode)
    presents = () if use_cache else None
    all_self_attentions = () if output_attentions else None
    all_hidden_states = () if output_hidden_states else None

    if self.gradient_checkpointing and self.training:
        if use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...")
            use_cache = False

    if past_key_values is None:
        past_key_values = tuple([None] * len(self.h))

    # Compute the alibi tensor: see the build_alibi_tensor documentation; it is built on every stage
    seq_length_with_past = seq_length
    past_key_values_length = 0
    if past_key_values[0] is not None:
        past_key_values_length = past_key_values[0][0].shape[2]    # source_len
        seq_length_with_past = seq_length_with_past + past_key_values_length
    if attention_mask is None:
        attention_mask = torch.ones((batch_size, seq_length_with_past), device=hidden_states.device)
    else:
        attention_mask = attention_mask.to(hidden_states.device)

    alibi = self.build_alibi_tensor(attention_mask, self.num_heads, dtype=hidden_states.dtype)

    # the causal mask is constructed on every stage; its input is passed through the stages
    causal_mask = self._prepare_attn_mask(
        attention_mask,
        input_shape=(batch_size, seq_length),
        past_key_values_length=past_key_values_length,
    )

    # compute the range of layers owned by this stage
    num_layers_per_stage = len(self.h) // stage_manager.num_stages
    start_layer = stage_manager.stage * num_layers_per_stage
    end_layer = (stage_manager.stage + 1) * num_layers_per_stage

    for i, (block, layer_past) in enumerate(zip(self.h[start_layer:end_layer],
                                                past_key_values[start_layer:end_layer])):
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if self.gradient_checkpointing and self.training:

            def create_custom_forward(module):

                def custom_forward(*inputs):
                    # None for past_key_value
                    return module(*inputs, use_cache=use_cache, output_attentions=output_attentions)

                return custom_forward

            outputs = torch.utils.checkpoint.checkpoint(
                create_custom_forward(block),
                hidden_states,
                alibi,
                causal_mask,
                layer_past,
                head_mask[start_layer + i],    # head_mask is indexed by the global layer id
            )
        else:
            outputs = block(
                hidden_states,
                layer_past=layer_past,
                attention_mask=causal_mask,
                head_mask=head_mask[start_layer + i],
                use_cache=use_cache,
                output_attentions=output_attentions,
                alibi=alibi,
            )

        hidden_states = outputs[0]
        if use_cache is True:
            presents = presents + (outputs[1],)

        if output_attentions:
            all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)

    if stage_manager.is_last_stage():
        # apply the final layer norm on the last stage
        hidden_states = self.ln_f(hidden_states)

    # TODO: deal with all_hidden_states, all_self_attentions, presents
    if output_hidden_states:
        all_hidden_states = all_hidden_states + (hidden_states,)

    if not return_dict:
        return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)

    # attention_mask is not returned; presents == past_key_values
    return BaseModelOutputWithPastAndCrossAttentions(
        last_hidden_state=hidden_states,
        past_key_values=presents,
        hidden_states=all_hidden_states,
        attentions=all_self_attentions,
    )
class BloomModelPolicy(Policy):

    def __init__(self, stage_manager: PipelineStageManager, num_layers: int, num_stages: int):
        super().__init__(stage_manager=stage_manager)
        self.layers_per_stage = self.distribute_layers(num_layers, num_stages)

    def get_hold_layers(self, module: BloomModel) -> List[Module]:
        """Get the pipeline layers for the current stage."""
        hold_layers = []
        if self.stage_manager.is_first_stage():
            hold_layers.append(module.word_embeddings)
            hold_layers.append(module.word_embeddings_layernorm)
        start_idx, end_idx = self.get_stage_index(self.layers_per_stage, self.stage_manager.stage)
        hold_layers.extend(module.h[start_idx:end_idx])
        if self.stage_manager.is_last_stage():
            hold_layers.append(module.ln_f)
        return hold_layers

    def get_shared_params(self, module: BloomModel) -> List[Dict[int, Tensor]]:
        """No parameters are shared across stages in BloomModel."""
        return []

    def replace_forward(self, module: Module) -> None:
        # bind the stage-aware forward to the module itself (BloomModel has no `.model` attribute)
        module.forward = MethodType(partial(bloom_model_forward, stage_manager=self.stage_manager), module)
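
A rough usage sketch follows; it assumes the process group has already been launched (e.g. via colossalai.launch), CUDA is available, and that `distribute_layers`/`get_stage_index` used by the constructor are provided elsewhere. The config values are illustrative:

# Hypothetical usage sketch, not part of this commit.
from transformers.models.bloom import BloomConfig, BloomModel

from colossalai.cluster import ProcessGroupMesh
from colossalai.pipeline.stage_manager import PipelineStageManager

PP_SIZE = 2
pg_mesh = ProcessGroupMesh(PP_SIZE)
stage_manager = PipelineStageManager(pg_mesh, 0)

config = BloomConfig(n_layer=4)    # illustrative size
model = BloomModel(config)

policy = BloomModelPolicy(stage_manager=stage_manager, num_layers=config.n_layer, num_stages=PP_SIZE)
hold_params, hold_buffers, shared_params = policy.parallelize_model(model)
# After this call, model.forward is bound to the stage-aware
# bloom_model_forward, and tensors owned by other stages are released.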
@@ -76,7 +76,6 @@ class OneForwardOneBackwardSchedule(PipelineSchedule):
         # for the first stage, input_obj is None
         # for the non-first stage, input_obj is the output of the previous stage and it must be a dict
         output_obj = model_forward(model, micro_batch, input_obj)
         if self.stage_manager.is_last_stage():
             loss = criterion(output_obj, micro_batch) / self.num_microbatches
             if accum_loss is not None:
...
@@ -315,7 +315,7 @@ class BertForMaskedLMPolicy(BertPolicy):
     def module_policy(self):
         policy = super().module_policy()
         policy = self.add_lm_head_policy(policy)
-        mpolicy = self.add_lm_prediction_policy(policy)
+        policy = self.add_lm_prediction_policy(policy)
         from transformers.models.bert.modeling_bert import BertForMaskedLM
         if self.pipeline_stage_manager:
             self.set_pipeline_forward(model_cls=BertForMaskedLM,
...
import pytest
import torch.distributed as dist
from transformers.models.bert import BertConfig
from transformers.models.bert.modeling_bert import BertForPreTraining

import colossalai
from colossalai.cluster import ProcessGroupMesh
from colossalai.pipeline.stage_manager import PipelineStageManager
from colossalai.shardformer.policies.bert import BertForPreTrainingPolicy
from colossalai.shardformer.shard import ShardConfig
from colossalai.testing import rerun_if_address_is_in_use, spawn


def check_bert_for_pretraining_policy():
    configuration = BertConfig()
    model = BertForPreTraining(configuration)
    DP_DIM, PP_DIM = 0, 1
    DP_SIZE, PP_SIZE = 2, 2
    RANK_TO_COORDINATE = {
        0: (0, 0),
        1: (0, 1),
        2: (1, 0),
        3: (1, 1),
    }
    PP_RANKS_IN_GROUP = {
        0: [0, 1],
        1: [0, 1],
        2: [2, 3],
        3: [2, 3],
    }
    pg_mesh = ProcessGroupMesh(DP_SIZE, PP_SIZE)
    stage_manager = PipelineStageManager(pg_mesh, PP_DIM)
    rank = dist.get_rank()

    model_policy = BertForPreTrainingPolicy()
    model_policy.set_model(model)

    model_config = ShardConfig(pipeline_stage_manager=stage_manager, enable_tensor_parallelism=False)
    model_policy.set_shard_config(model_config)

    layers = model_policy.get_held_layers()
    if stage_manager.is_first_stage():
        assert len(layers) == 6 + 1    # 6 encoder layers plus the embedding layer
    else:
        assert len(layers) == 6 + 2    # 6 encoder layers plus the pooler and pre-training head


def run_dist_policy(rank, world_size, port):
    colossalai.launch(config={}, rank=rank, world_size=world_size, port=port, host='localhost')
    check_bert_for_pretraining_policy()


@pytest.mark.dist
@rerun_if_address_is_in_use()
def test_bert_for_pretraining_policy():
    spawn(run_dist_policy, 4)


if __name__ == "__main__":
    """test the BertForPreTraining policy"""
    test_bert_for_pretraining_policy()
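
The RANK_TO_COORDINATE and PP_RANKS_IN_GROUP tables hard-coded in these tests spell out a 2x2 mesh (data parallel on dim 0, pipeline parallel on dim 1). A small standalone sketch of that coordinate arithmetic, assuming row-major rank ordering as the tables imply:

# Illustrative sketch of the 2x2 (DP x PP) mesh layout used by these tests.
DP_SIZE, PP_SIZE = 2, 2

rank_to_coordinate = {}
pp_ranks_in_group = {}
for r in range(DP_SIZE * PP_SIZE):
    dp, pp = r // PP_SIZE, r % PP_SIZE
    rank_to_coordinate[r] = (dp, pp)
    # ranks that share a DP coordinate form one pipeline group
    pp_ranks_in_group[r] = [dp * PP_SIZE + k for k in range(PP_SIZE)]

assert rank_to_coordinate == {0: (0, 0), 1: (0, 1), 2: (1, 0), 3: (1, 1)}
assert pp_ranks_in_group == {0: [0, 1], 1: [0, 1], 2: [2, 3], 3: [2, 3]}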
import pytest
import torch.distributed as dist
from transformers.models.bert import BertConfig
from transformers.models.bert.modeling_bert import BertLMHeadModel

import colossalai
from colossalai.cluster import ProcessGroupMesh
from colossalai.pipeline.stage_manager import PipelineStageManager
from colossalai.shardformer.policies.bert import BertLMHeadModelPolicy
from colossalai.shardformer.shard import ShardConfig
from colossalai.testing import rerun_if_address_is_in_use, spawn


def check_bert_lmhead_policy():
    configuration = BertConfig()
    model = BertLMHeadModel(configuration)
    DP_DIM, PP_DIM = 0, 1
    DP_SIZE, PP_SIZE = 2, 2
    RANK_TO_COORDINATE = {
        0: (0, 0),
        1: (0, 1),
        2: (1, 0),
        3: (1, 1),
    }
    PP_RANKS_IN_GROUP = {
        0: [0, 1],
        1: [0, 1],
        2: [2, 3],
        3: [2, 3],
    }
    pg_mesh = ProcessGroupMesh(DP_SIZE, PP_SIZE)
    stage_manager = PipelineStageManager(pg_mesh, PP_DIM)
    rank = dist.get_rank()

    model_policy = BertLMHeadModelPolicy()
    model_policy.set_model(model)

    model_config = ShardConfig(pipeline_stage_manager=stage_manager, enable_tensor_parallelism=False)
    model_policy.set_shard_config(model_config)

    layers = model_policy.get_held_layers()
    if stage_manager.is_first_stage():
        assert len(layers) == 6 + 1    # 6 encoder layers plus the embedding layer
    else:
        assert len(layers) == 6 + 2    # 6 encoder layers plus the LM prediction head layers


def run_dist_policy(rank, world_size, port):
    colossalai.launch(config={}, rank=rank, world_size=world_size, port=port, host='localhost')
    check_bert_lmhead_policy()


@pytest.mark.dist
@rerun_if_address_is_in_use()
def test_bert_lmhead_policy():
    spawn(run_dist_policy, 4)


if __name__ == "__main__":
    """test the BertLMHeadModel policy"""
    test_bert_lmhead_policy()
'''
These policy tests only check the policy itself (e.g. which layers each stage holds);
the forward logic is covered in test_shardformer/test_model.
'''
import pytest
import torch.distributed as dist
from transformers.models.bert.modeling_bert import BertModel

import colossalai
from colossalai.cluster import ProcessGroupMesh
from colossalai.pipeline.stage_manager import PipelineStageManager
from colossalai.shardformer.policies.bert import BertModelPolicy
from colossalai.shardformer.shard import ShardConfig
from colossalai.testing import rerun_if_address_is_in_use, spawn


def check_bert_model_policy():
    model = BertModel.from_pretrained('bert-base-uncased')
    DP_DIM, PP_DIM = 0, 1
    DP_SIZE, PP_SIZE = 2, 2
    RANK_TO_COORDINATE = {
        0: (0, 0),
        1: (0, 1),
        2: (1, 0),
        3: (1, 1),
    }
    PP_RANKS_IN_GROUP = {
        0: [0, 1],
        1: [0, 1],
        2: [2, 3],
        3: [2, 3],
    }
    pg_mesh = ProcessGroupMesh(DP_SIZE, PP_SIZE)
    stage_manager = PipelineStageManager(pg_mesh, PP_DIM)
    rank = dist.get_rank()

    model_policy = BertModelPolicy()
    model_policy.set_model(model)

    model_config = ShardConfig(pipeline_stage_manager=stage_manager, enable_tensor_parallelism=False)
    model_policy.set_shard_config(model_config)

    layers = model_policy.get_held_layers()
    if stage_manager.is_first_stage():
        assert len(layers) == 6 + 1    # 6 encoder layers plus the embedding layer
    else:
        assert len(layers) == 6 + 1    # 6 encoder layers plus the pooler


def run_dist_policy(rank, world_size, port):
    colossalai.launch(config={}, rank=rank, world_size=world_size, port=port, host='localhost')
    check_bert_model_policy()


@pytest.mark.dist
@rerun_if_address_is_in_use()
def test_bert_model_policy():
    spawn(run_dist_policy, 4)


if __name__ == "__main__":
    """test the BertModel policy"""
    test_bert_model_policy()
import pytest
import torch.distributed as dist
from transformers.models.bloom import BloomConfig, BloomModel

import colossalai
from colossalai.cluster import ProcessGroupMesh
from colossalai.pipeline.stage_manager import PipelineStageManager
from colossalai.shardformer.policies.bloom import BloomModelPolicy
from colossalai.shardformer.shard import ShardConfig
from colossalai.testing import rerun_if_address_is_in_use, spawn


def check_bloom_model_policy():
    # create a BloomModel (the default config has 2 transformer blocks)
    configuration = BloomConfig()
    model = BloomModel(configuration)
    DP_DIM, PP_DIM = 0, 1
    DP_SIZE, PP_SIZE = 2, 2
    RANK_TO_COORDINATE = {
        0: (0, 0),
        1: (0, 1),
        2: (1, 0),
        3: (1, 1),
    }
    PP_RANKS_IN_GROUP = {
        0: [0, 1],
        1: [0, 1],
        2: [2, 3],
        3: [2, 3],
    }
    pg_mesh = ProcessGroupMesh(DP_SIZE, PP_SIZE)
    stage_manager = PipelineStageManager(pg_mesh, PP_DIM)
    rank = dist.get_rank()

    model_policy = BloomModelPolicy()
    model_policy.set_model(model)

    model_config = ShardConfig(pipeline_stage_manager=stage_manager, enable_tensor_parallelism=False)
    model_policy.set_shard_config(model_config)

    layers = model_policy.get_held_layers()
    if stage_manager.is_first_stage():
        assert len(layers) == 1 + 2    # 1 transformer block plus the word embeddings and their layernorm
    else:
        assert len(layers) == 1 + 1    # 1 transformer block plus the final layernorm


def run_dist_policy(rank, world_size, port):
    colossalai.launch(config={}, rank=rank, world_size=world_size, port=port, host='localhost')
    check_bloom_model_policy()


@pytest.mark.dist
@rerun_if_address_is_in_use()
def test_bloom_model_policy():
    spawn(run_dist_policy, 4)


if __name__ == "__main__":
    """test the BloomModel policy"""
    test_bloom_model_policy()
@@ -2,7 +2,10 @@ import pytest
 import torch

 import colossalai
+from colossalai.cluster import ProcessGroupMesh
 from colossalai.logging import disable_existing_loggers
+from colossalai.pipeline.stage_manager import PipelineStageManager
+from colossalai.shardformer.policies.auto_policy import get_autopolicy
 from colossalai.tensor.d_tensor.api import is_customized_distributed_tensor, is_distributed_tensor
 from colossalai.testing import (
     assert_hf_output_close,
...
@@ -5,6 +5,8 @@ import colossalai
 from colossalai.cluster import ProcessGroupMesh
 from colossalai.logging import disable_existing_loggers
 from colossalai.pipeline.stage_manager import PipelineStageManager
+from colossalai.shardformer.policies.auto_policy import get_autopolicy
+from colossalai.shardformer.shard import ShardConfig
 from colossalai.tensor.d_tensor.api import is_customized_distributed_tensor, is_distributed_tensor
 from colossalai.testing import (
     assert_hf_output_close,
...
@@ -17,9 +19,55 @@ from tests.kit.model_zoo import model_zoo
 from tests.test_shardformer.test_model._utils import build_model, build_pipeline_model, run_forward


-def check_forward_backward(org_model, sharded_model, data_gen_fn, output_transform_fn, loss_fn):
-    # check forward
-    pass
+def check_bert_model_policy(name, model: torch.nn.Module, stage_manager: PipelineStageManager):
+    policy = get_autopolicy(model)
+    policy.set_model(model)
+    model_config = ShardConfig(pipeline_stage_manager=stage_manager, enable_tensor_parallelism=False)
+    policy.set_shard_config(model_config)
+    layers = policy.get_held_layers()
+    if stage_manager.is_first_stage():
+        assert len(layers) == 1 + 1
+    else:
+        if name == "transformers_bert":
+            assert len(layers) == 1 + 1
+        elif name in [
+                "transformers_bert_for_sequence_classification", "transformers_bert_for_token_classification",
+                "transformers_bert_for_mcq"
+        ]:
+            assert len(layers) == 1 + 3
+        else:
+            assert len(layers) == 1 + 2
+
+
+def check_bert_model_pipeline_forward(name, sharded_model, stage_manager: PipelineStageManager):
+    if name == 'transformers_bert_for_mcq':
+        x = torch.randint(0, 1000, (2, 3, 3)).cuda()
+        attention_mask = torch.ones_like(x).cuda()
+        if stage_manager.stage == 0:
+            output = sharded_model(input_ids=x, attention_mask=attention_mask, stage_manager=stage_manager)
+            assert output['hidden_states'].shape == (6, 3, 128)
+        else:
+            hidden_states = torch.randint(0, 1000, (6, 3, 128)).to(torch.float32).cuda()
+            output = sharded_model(input_ids=x,
+                                   hidden_states=hidden_states,
+                                   attention_mask=attention_mask,
+                                   stage_manager=stage_manager)
+            assert output[0].shape == (2, 3)
+    else:
+        x = torch.randint(0, 1000, (2, 3)).cuda()
+        # one batch, 2 single sentences, each sentence has 3 tokens
+        hidden_states = torch.randint(0, 1000, (2, 3, 128)).to(torch.float32).cuda()
+        if stage_manager.stage == 0:
+            attention_mask = torch.ones_like(x).cuda()
+            output = sharded_model(input_ids=x, attention_mask=attention_mask, stage_manager=stage_manager)
+            assert output['hidden_states'].shape == (2, 3, 128)
+        else:
+            attention_mask = torch.ones((2, 3)).cuda()
+            output = sharded_model(hidden_states=hidden_states,
+                                   attention_mask=attention_mask,
+                                   stage_manager=stage_manager)
+            assert output[0].shape[0] == 2


 @parameterize('enable_fused_normalization', [False])
@@ -27,55 +75,17 @@ def check_forward_backward(org_model, sharded_model, data_gen_fn, output_transfo
 @parameterize('use_lazy_init', [False])
 #TODO: merge this into test_shard_bert
 def run_bert_test(enable_fused_normalization, enable_tensor_parallelism, use_lazy_init):
-    DP_DIM, PP_DIM = 0, 1
-    DP_SIZE, PP_SIZE = 2, 2
-    RANK_TO_COORDINATE = {
-        0: (0, 0),
-        1: (0, 1),
-        2: (1, 0),
-        3: (1, 1),
-    }
-    PP_RANKS_IN_GROUP = {
-        0: [0, 1],
-        1: [0, 1],
-        2: [2, 3],
-        3: [2, 3],
-    }
-    pg_mesh = ProcessGroupMesh(DP_SIZE, PP_SIZE)
+    PP_DIM = 0
+    PP_SIZE = 2
+    pg_mesh = ProcessGroupMesh(PP_SIZE)
     stage_manager = PipelineStageManager(pg_mesh, PP_DIM)
     sub_model_zoo = model_zoo.get_sub_registry('transformers_bert')

     for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
         org_model, sharded_model = build_pipeline_model(model_fn, stage_manager, enable_fused_normalization,
                                                         enable_tensor_parallelism, use_lazy_init)
-        if name == 'transformers_bert_for_mcq':
-            x = torch.randint(0, 1000, (2, 3, 3)).cuda()
-            attention_mask = torch.ones_like(x).cuda()
-            if stage_manager.stage == 0:
-                output = sharded_model(input_ids=x, attention_mask=attention_mask, stage_manager=stage_manager)
-                assert output['hidden_states'].shape == (6, 3, 128)
-            else:
-                hidden_states = torch.randint(0, 1000, (6, 3, 128)).to(torch.float32).cuda()
-                output = sharded_model(input_ids=x,
-                                       hidden_states=hidden_states,
-                                       attention_mask=attention_mask,
-                                       stage_manager=stage_manager)
-                assert output[0].shape == (2, 3)
-        else:
-            x = torch.randint(0, 1000, (2, 3)).cuda()
-            # one batch, 2 single sentences, each sentence has 3 tokens
-            hidden_states = torch.randint(0, 1000, (2, 3, 128)).to(torch.float32).cuda()
-            if stage_manager.stage == 0:
-                attention_mask = torch.ones_like(x).cuda()
-                output = sharded_model(input_ids=x, attention_mask=attention_mask, stage_manager=stage_manager)
-                assert output['hidden_states'].shape == (2, 3, 128)
-            else:
-                attention_mask = torch.ones((2, 3)).cuda()
-                output = sharded_model(hidden_states=hidden_states,
-                                       attention_mask=attention_mask,
-                                       stage_manager=stage_manager)
-                assert output[0].shape[0] == 2
+        check_bert_model_policy(name, org_model, stage_manager)
+        check_bert_model_pipeline_forward(name, sharded_model, stage_manager)

     torch.cuda.empty_cache()
@@ -90,7 +100,7 @@ def check_bert(rank, world_size, port):
 @rerun_if_address_is_in_use()
 @clear_cache_before_run()
 def test_bert():
-    spawn(check_bert, 4)
+    spawn(check_bert, 2)


 if __name__ == "__main__":
...
@@ -5,7 +5,9 @@ import colossalai
 from colossalai.cluster import ProcessGroupMesh
 from colossalai.logging import disable_existing_loggers
 from colossalai.pipeline.stage_manager import PipelineStageManager
+from colossalai.shardformer.policies.auto_policy import get_autopolicy
 from colossalai.shardformer.policies.base_policy import Policy
+from colossalai.shardformer.shard import ShardConfig
 from colossalai.tensor.d_tensor.api import is_customized_distributed_tensor, is_distributed_tensor
 from colossalai.testing import (
     assert_hf_output_close,
@@ -18,9 +20,37 @@ from tests.kit.model_zoo import model_zoo
 from tests.test_shardformer.test_model._utils import build_model, build_pipeline_model, run_forward


-def check_forward_backward(org_model, sharded_model, data_gen_fn, output_transform_fn, loss_fn):
-    # check forward
-    pass
+def check_bloom_model_policy(name, model: torch.nn.Module, stage_manager: PipelineStageManager):
+    policy = get_autopolicy(model)
+    policy.set_model(model)
+    model_config = ShardConfig(pipeline_stage_manager=stage_manager, enable_tensor_parallelism=False)
+    policy.set_shard_config(model_config)
+    layers = policy.get_held_layers()
+    if stage_manager.is_first_stage():
+        assert len(layers) == 0 + 2
+    else:
+        if name == 'transformers_bloom':
+            assert len(layers) == 1 + 1
+        elif name == 'transformers_bloom_for_token_classification':
+            assert len(layers) == 1 + 3
+        else:
+            assert len(layers) == 1 + 2
+
+
+def check_bloom_model_pipeline_forward(name, sharded_model, stage_manager: PipelineStageManager):
+    if stage_manager.stage == 0:
+        x = torch.randint(0, 1000, (1, 3)).cuda()
+        attention_mask = torch.ones_like(x).cuda()
+        output = sharded_model(input_ids=x, attention_mask=attention_mask)
+        assert output['hidden_states'].shape == (1, 3, 64)
+    else:
+        attention_mask = torch.ones((1, 3)).cuda()
+        hidden_states = torch.randint(0, 1000, (1, 3, 64)).to(torch.float32).cuda()
+        output = sharded_model(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+        )
+        assert output[0].shape[0] == 1


 @parameterize('enable_fused_normalization', [False])
@@ -28,40 +58,17 @@ def check_forward_backward(org_model, sharded_model, data_gen_fn, output_transfo
 @parameterize('use_lazy_init', [False])
 #TODO: merge this into test_shard_bloom
 def run_bloom_test(enable_fused_normalization, enable_tensor_parallelism, use_lazy_init):
-    DP_DIM, PP_DIM = 0, 1
-    DP_SIZE, PP_SIZE = 2, 2
-    RANK_TO_COORDINATE = {
-        0: (0, 0),
-        1: (0, 1),
-        2: (1, 0),
-        3: (1, 1),
-    }
-    PP_RANKS_IN_GROUP = {
-        0: [0, 1],
-        1: [0, 1],
-        2: [2, 3],
-        3: [2, 3],
-    }
-    pg_mesh = ProcessGroupMesh(DP_SIZE, PP_SIZE)
+    PP_DIM = 0
+    PP_SIZE = 2
+    pg_mesh = ProcessGroupMesh(PP_SIZE)
     stage_manager = PipelineStageManager(pg_mesh, PP_DIM)
     sub_model_zoo = model_zoo.get_sub_registry('transformers_bloom')
-    x = torch.randint(0, 1000, (1, 3)).cuda()
-    hidden_states = torch.randint(0, 1000, (1, 3, 64)).to(torch.float32).cuda()

     for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
         org_model, sharded_model = build_pipeline_model(model_fn, stage_manager, enable_fused_normalization,
                                                         enable_tensor_parallelism, use_lazy_init)
-        if stage_manager.stage == 0:
-            attention_mask = torch.ones_like(x).cuda()
-            output = sharded_model(input_ids=x, attention_mask=attention_mask)
-            assert output['hidden_states'].shape == (1, 3, 64)
-        else:
-            attention_mask = torch.ones((1, 3)).cuda()
-            output = sharded_model(
-                hidden_states=hidden_states,
-                attention_mask=attention_mask,
-            )
-            assert output[0].shape[0] == 1
+        check_bloom_model_policy(name, org_model, stage_manager)
+        check_bloom_model_pipeline_forward(name, sharded_model, stage_manager)

     torch.cuda.empty_cache()
@@ -76,7 +83,7 @@ def check_bloom(rank, world_size, port):
 @rerun_if_address_is_in_use()
 @clear_cache_before_run()
 def test_bloom():
-    spawn(check_bloom, 4)
+    spawn(check_bloom, 2)


 if __name__ == "__main__":
...
@@ -5,7 +5,9 @@ import colossalai
 from colossalai.cluster import ProcessGroupMesh
 from colossalai.logging import disable_existing_loggers
 from colossalai.pipeline.stage_manager import PipelineStageManager
+from colossalai.shardformer.policies.auto_policy import get_autopolicy
 from colossalai.shardformer.policies.base_policy import Policy
+from colossalai.shardformer.shard import ShardConfig
 from colossalai.tensor.d_tensor.api import is_customized_distributed_tensor, is_distributed_tensor
 from colossalai.testing import (
     assert_hf_output_close,
@@ -18,9 +20,35 @@ from tests.kit.model_zoo import model_zoo
 from tests.test_shardformer.test_model._utils import build_model, build_pipeline_model, run_forward


-def check_forward_backward(org_model, sharded_model, data_gen_fn, output_transform_fn, loss_fn):
-    # check forward
-    pass
+def check_llama_model_policy(name, model: torch.nn.Module, stage_manager: PipelineStageManager):
+    policy = get_autopolicy(model)
+    policy.set_model(model)
+    model_config = ShardConfig(pipeline_stage_manager=stage_manager, enable_tensor_parallelism=False)
+    policy.set_shard_config(model_config)
+    layers = policy.get_held_layers()
+    if stage_manager.is_first_stage():
+        assert len(layers) == 2 + 1
+    else:
+        if name == "transformers_llama":
+            assert len(layers) == 2 + 1
+        else:
+            assert len(layers) == 2 + 2
+
+
+def check_llama_model_pipeline_forward(name, sharded_model, stage_manager: PipelineStageManager):
+    x = torch.randint(0, 1000, (2, 3)).cuda()
+    if stage_manager.stage == 0:
+        attention_mask = torch.ones_like(x).cuda()
+        output = sharded_model(input_ids=x, attention_mask=attention_mask)
+        assert output['hidden_states'].shape == (2, 3, 128)
+    else:
+        hidden_states = torch.randint(0, 1000, (2, 3, 128)).to(torch.float32).cuda()
+        attention_mask = torch.ones((2, 3)).cuda()
+        output = sharded_model(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+        )
+        assert output[0] is not None


 @parameterize('enable_fused_normalization', [False])
@@ -28,40 +56,18 @@ def check_forward_backward(org_model, sharded_model, data_gen_fn, output_transfo
 @parameterize('use_lazy_init', [False])
 #TODO: merge this into test_shard_llama
 def run_llama_test(enable_fused_normalization, enable_tensor_parallelism, use_lazy_init):
-    DP_DIM, PP_DIM = 0, 1
-    DP_SIZE, PP_SIZE = 2, 2
-    RANK_TO_COORDINATE = {
-        0: (0, 0),
-        1: (0, 1),
-        2: (1, 0),
-        3: (1, 1),
-    }
-    PP_RANKS_IN_GROUP = {
-        0: [0, 1],
-        1: [0, 1],
-        2: [2, 3],
-        3: [2, 3],
-    }
-    pg_mesh = ProcessGroupMesh(DP_SIZE, PP_SIZE)
+    PP_DIM = 0
+    PP_SIZE = 2
+    pg_mesh = ProcessGroupMesh(PP_SIZE)
     stage_manager = PipelineStageManager(pg_mesh, PP_DIM)
     sub_model_zoo = model_zoo.get_sub_registry('transformers_llama')
-    x = torch.randint(0, 1000, (2, 3)).cuda()
-    hidden_states = torch.randint(0, 1000, (2, 3, 128)).to(torch.float32).cuda()

     for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
         org_model, sharded_model = build_pipeline_model(model_fn, stage_manager, enable_fused_normalization,
                                                         enable_tensor_parallelism, use_lazy_init)
-        if stage_manager.stage == 0:
-            attention_mask = torch.ones_like(x).cuda()
-            output = sharded_model(input_ids=x, attention_mask=attention_mask)
-            assert output['hidden_states'].shape == (2, 3, 128)
-        else:
-            attention_mask = torch.ones((2, 3)).cuda()
-            output = sharded_model(
-                hidden_states=hidden_states,
-                attention_mask=attention_mask,
-            )
-            assert output[0] is not None
+        check_llama_model_policy(name, org_model, stage_manager)
+        check_llama_model_pipeline_forward(name, sharded_model, stage_manager)

     torch.cuda.empty_cache()
@@ -76,7 +82,7 @@ def check_llama(rank, world_size, port):
 @rerun_if_address_is_in_use()
 @clear_cache_before_run()
 def test_llama():
-    spawn(check_llama, 4)
+    spawn(check_llama, 2)


 if __name__ == "__main__":
...