Merge pull request #5278 from ver217/sync/npu

[sync] sync npu branch with main

Merge pull request #5278 from ver217/sync/npu
[sync] sync npu branch with main
d66e6988 · Frank Lee · GitHub · 9102d655 · 14846934 · d66e6988
Unverified commit d66e6988, authored Jan 18, 2024 by Frank Lee; committed by GitHub on Jan 18, 2024.
20 changed files
--- a/colossalai/shardformer/policies/falcon.py
+++ b/colossalai/shardformer/policies/falcon.py
+import warnings
+from functools import partial
+from typing import Callable, Dict, List
+from torch import Tensor, nn
+from torch.nn import Module
+import colossalai.shardformer.layer as col_nn
+from ..modeling.falcon import (
+    FalconPipelineForwards,
+    build_falcon_alibi_tensor_fn,
+    get_falcon_flash_attention_forward,
+    get_tp_falcon_decoder_layer_forward,
+)
+from .base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription
+# Export the base policy together with every task-specific Falcon policy defined in this module,
+# so that a star-import exposes the same set of policies that the module actually provides.
+__all__ = [
+    "FalconPolicy",
+    "FalconModelPolicy",
+    "FalconForCausalLMPolicy",
+    "FalconForSequenceClassificationPolicy",
+    "FalconForTokenClassificationPolicy",
+    "FalconForQuestionAnsweringPolicy",
+]
+class FalconPolicy(Policy):
+    """Base shardformer policy for Falcon models.
+
+    Describes the attribute, method and sub-module replacements applied for tensor
+    parallelism, fused normalization and flash attention, and provides the helpers
+    that the task-specific policies use for pipeline parallelism.
+    """
+    def __init__(self) -> None:
+        """Initialize the policy and check the installed transformers version."""
+        super().__init__()
+        import transformers
+        from packaging.version import Version
+        # NOTE: the check is an assert, so it is skipped when Python runs with -O.
+        assert Version(transformers.__version__) <= Version(
+            "4.33.0"
+        ), "The Falcon model should run on a transformers version not greater than 4.33.0."
+    def config_sanity_check(self):
+        """No configuration checks are performed for Falcon."""
+        pass
+    def preprocess(self):
+        # reshape the embedding layer
+        r"""
+        Resize the token embeddings so that the vocabulary size is divisible by the
+        tensor parallel size. Nothing is changed when tensor parallelism is disabled.
+
+        Returns:
+            The (possibly resized) model.
+        """
+        if self.shard_config.enable_tensor_parallelism:
+            vocab_size = self.model.config.vocab_size
+            world_size = self.shard_config.tensor_parallel_size
+            if vocab_size % world_size != 0:
+                # round vocab_size up to the next multiple of world_size
+                new_vocab_size = vocab_size + world_size - vocab_size % world_size
+                self.model.resize_token_embeddings(new_vocab_size)
+        return self.model
+    def module_policy(self):
+        """Build the replacement policy shared by all Falcon model variants.
+
+        Returns:
+            A dict keyed by transformers Falcon classes. Tensor parallelism entries are
+            created as ``ModulePolicyDescription`` objects; fused normalization and
+            flash attention entries are added through the ``append_or_create_*`` helpers.
+        """
+        from transformers.models.falcon.modeling_falcon import FalconAttention, FalconDecoderLayer, FalconModel
+        # Tensor parallelism is switched off (with a warning) for the old decoder architecture
+        # combined with multi-query attention. This mutates the shared shard_config.
+        if not self.model.config.new_decoder_architecture and self.model.config.multi_query:
+            warnings.warn(
+                "Falcon dosen't support tensor parallelism when (not new_decoder_architecture and multi_query) is True, will ignore the tensor parallelism flag."
+            )
+            self.shard_config.enable_tensor_parallelism = False
+        # Sequence parallelism is always switched off (with a warning) for Falcon.
+        if self.shard_config.enable_sequence_parallelism:
+            self.shard_config.enable_sequence_parallelism = False
+            warnings.warn("Falcon doesn't support sequence parallelism now, will ignore the sequence parallelism flag.")
+        policy = {}
+        if self.shard_config.enable_tensor_parallelism:
+            # Per-rank attention sizes: each config value divided by the tensor parallel size.
+            attn_attribute_replacement = {
+                "self_attention.hidden_size": self.model.config.hidden_size // self.shard_config.tensor_parallel_size,
+                "self_attention.split_size": self.model.config.hidden_size // self.shard_config.tensor_parallel_size,
+                "self_attention.num_heads": self.model.config.num_attention_heads
+                // self.shard_config.tensor_parallel_size,
+                "self_attention.num_kv_heads": self.model.config.num_kv_heads // self.shard_config.tensor_parallel_size,
+            }
+            # Decoder layer: custom forward plus column/row parallel linear layers for
+            # the attention and MLP projections.
+            policy[FalconDecoderLayer] = ModulePolicyDescription(
+                attribute_replacement=attn_attribute_replacement,
+                method_replacement={"forward": get_tp_falcon_decoder_layer_forward()},
+                sub_module_replacement=[
+                    SubModuleReplacementDescription(
+                        suffix="self_attention.query_key_value",
+                        target_module=col_nn.Linear1D_Col,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="self_attention.dense",
+                        target_module=col_nn.Linear1D_Row,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="self_attention.attention_dropout",
+                        target_module=col_nn.DropoutForParallelInput,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="mlp.dense_h_to_4h",
+                        target_module=col_nn.Linear1D_Col,
+                    ),
+                    SubModuleReplacementDescription(suffix="mlp.dense_4h_to_h", target_module=col_nn.Linear1D_Row),
+                ],
+            )
+            # Model level: per-rank head count, an alibi builder bound to the tensor parallel
+            # process group, and a vocab-parallel word embedding.
+            policy[FalconModel] = ModulePolicyDescription(
+                attribute_replacement={
+                    "num_heads": self.model.config.num_attention_heads // self.shard_config.tensor_parallel_size,
+                },
+                method_replacement={
+                    "build_alibi_tensor": build_falcon_alibi_tensor_fn(self.shard_config.tensor_parallel_process_group)
+                },
+                sub_module_replacement=[
+                    SubModuleReplacementDescription(
+                        suffix="word_embeddings",
+                        target_module=col_nn.VocabParallelEmbedding1D,
+                    )
+                ],
+            )
+        # optimization configuration
+        if self.shard_config.enable_fused_normalization:
+            # handle falcon model
+            self.append_or_create_submodule_replacement(
+                description=[
+                    SubModuleReplacementDescription(
+                        suffix="ln_f",
+                        target_module=col_nn.FusedLayerNorm,
+                    ),
+                ],
+                policy=policy,
+                target_key=FalconModel,
+            )
+            # handle falcon decoder layer
+            # All four layer norm names are marked ignore_if_not_exist=True, so a decoder
+            # layer does not need to define every one of them.
+            self.append_or_create_submodule_replacement(
+                description=[
+                    SubModuleReplacementDescription(
+                        suffix="ln_attn", target_module=col_nn.FusedLayerNorm, ignore_if_not_exist=True
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="ln_mlp", target_module=col_nn.FusedLayerNorm, ignore_if_not_exist=True
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="input_layernorm", target_module=col_nn.FusedLayerNorm, ignore_if_not_exist=True
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="post_attention_layernorm", target_module=col_nn.FusedLayerNorm, ignore_if_not_exist=True
+                    ),
+                ],
+                policy=policy,
+                target_key=FalconDecoderLayer,
+            )
+        if self.shard_config.enable_flash_attention:
+            # replace the attention forward with the flash attention implementation
+            self.append_or_create_method_replacement(
+                description={"forward": get_falcon_flash_attention_forward()},
+                policy=policy,
+                target_key=FalconAttention,
+            )
+        return policy
+    def postprocess(self):
+        """Return the model unchanged; no post-processing is applied."""
+        return self.model
+    def set_pipeline_forward(self, model_cls: nn.Module, new_forward: Callable, policy: Dict) -> None:
+        """If under pipeline parallel setting, replacing the original forward method of huggingface
+        to customized forward method, and add this changing to policy."""
+        # Does nothing when no pipeline stage manager is set.
+        if self.pipeline_stage_manager:
+            stage_manager = self.pipeline_stage_manager
+            # The decoder stack is the model itself for FalconModel, otherwise its
+            # `transformer` attribute.
+            if self.model.__class__.__name__ == "FalconModel":
+                module = self.model
+            else:
+                module = self.model.transformer
+            layers_per_stage = Policy.distribute_layers(len(module.h), stage_manager.num_stages)
+            stage_index = Policy.get_stage_index(layers_per_stage, stage_manager.stage)
+            # Bind the stage information into the new forward so it can be installed as a method.
+            method_replacement = {
+                "forward": partial(
+                    new_forward, stage_manager=stage_manager, stage_index=stage_index, shard_config=self.shard_config
+                )
+            }
+            self.append_or_create_method_replacement(
+                description=method_replacement, policy=policy, target_key=model_cls
+            )
+    def get_held_layers(self) -> List[Module]:
+        """Get pipeline layers for current stage."""
+        assert self.pipeline_stage_manager is not None
+        if self.model.__class__.__name__ == "FalconModel":
+            module = self.model
+        else:
+            module = self.model.transformer
+        stage_manager = self.pipeline_stage_manager
+        held_layers = []
+        layers_per_stage = self.distribute_layers(len(module.h), stage_manager.num_stages)
+        # the first stage additionally holds the word embeddings
+        if stage_manager.is_first_stage():
+            held_layers.append(module.word_embeddings)
+        start_idx, end_idx = self.get_stage_index(layers_per_stage, stage_manager.stage)
+        held_layers.extend(module.h[start_idx:end_idx])
+        # the last stage additionally holds the final layer norm
+        if stage_manager.is_last_stage():
+            held_layers.append(module.ln_f)
+        return held_layers
+class FalconModelPolicy(FalconPolicy):
+    """Policy for the bare ``FalconModel``."""
+    def __init__(self) -> None:
+        super().__init__()
+    def module_policy(self):
+        """Return the base Falcon policy, adding the pipeline forward when a stage manager is set."""
+        policy_map = super().module_policy()
+        from transformers.models.falcon.modeling_falcon import FalconModel
+        # Nothing to add without pipeline parallelism.
+        if not self.pipeline_stage_manager:
+            return policy_map
+        self.set_pipeline_forward(
+            model_cls=FalconModel,
+            new_forward=FalconPipelineForwards.falcon_model_forward,
+            policy=policy_map,
+        )
+        return policy_map
+    def get_held_layers(self) -> List[Module]:
+        """Return the layers held by the current pipeline stage (same as the base policy)."""
+        return super().get_held_layers()
+    def get_shared_params(self) -> List[Dict[int, Tensor]]:
+        """Return an empty list: the bare Falcon model has no shared parameters."""
+        return []
+class FalconForCausalLMPolicy(FalconPolicy):
+    """Policy for ``FalconForCausalLM``: base Falcon policy plus the ``lm_head`` handling."""
+    def __init__(self) -> None:
+        super().__init__()
+    def module_policy(self):
+        """Extend the base policy with the ``lm_head`` replacement and the pipeline forward."""
+        from transformers.models.falcon.modeling_falcon import FalconForCausalLM
+        policy_map = super().module_policy()
+        # handle tensor parallelism
+        if self.shard_config.enable_tensor_parallelism:
+            lm_head_replacement = SubModuleReplacementDescription(
+                suffix="lm_head",
+                target_module=col_nn.Linear1D_Col,
+                kwargs={"gather_output": True},
+            )
+            self.append_or_create_submodule_replacement(
+                description=lm_head_replacement,
+                policy=policy_map,
+                target_key=FalconForCausalLM,
+            )
+        if self.pipeline_stage_manager:
+            self.set_pipeline_forward(
+                model_cls=FalconForCausalLM,
+                new_forward=FalconPipelineForwards.falcon_for_causal_lm_forward,
+                policy=policy_map,
+            )
+        return policy_map
+    def get_held_layers(self) -> List[Module]:
+        """Return the base held layers, plus ``lm_head`` on the last pipeline stage."""
+        layers = super().get_held_layers()
+        if self.pipeline_stage_manager.is_last_stage():
+            layers.append(self.model.lm_head)
+        return layers
+    def get_shared_params(self) -> List[Dict[int, Tensor]]:
+        """Return the tied embedding / ``lm_head`` weights when they span several stages.
+
+        The result maps stage 0 to the word embedding weight and the last stage to the
+        ``lm_head`` weight, but only when there is more than one pipeline stage and both
+        attributes refer to the same object. Otherwise an empty list is returned.
+        """
+        stage_manager = self.pipeline_stage_manager
+        if not stage_manager or stage_manager.num_stages <= 1:
+            return []
+        embedding_weight = self.model.transformer.word_embeddings.weight
+        head_weight = self.model.lm_head.weight
+        # tie weights: only report them when both names point at the very same object
+        if embedding_weight is not head_weight:
+            return []
+        return [
+            {
+                0: embedding_weight,
+                stage_manager.num_stages - 1: head_weight,
+            }
+        ]
+class FalconForSequenceClassificationPolicy(FalconPolicy):
+    """Policy for ``FalconForSequenceClassification``: base Falcon policy plus the ``score`` head."""
+    def __init__(self) -> None:
+        super().__init__()
+    def module_policy(self):
+        """Extend the base policy with the ``score`` replacement and the pipeline forward."""
+        from transformers.models.falcon.modeling_falcon import FalconForSequenceClassification
+        policy_map = super().module_policy()
+        # handle tensor parallelism
+        if self.shard_config.enable_tensor_parallelism:
+            score_replacement = SubModuleReplacementDescription(
+                suffix="score",
+                target_module=col_nn.Linear1D_Col,
+                kwargs={"gather_output": True},
+            )
+            self.append_or_create_submodule_replacement(
+                description=score_replacement,
+                policy=policy_map,
+                target_key=FalconForSequenceClassification,
+            )
+        if self.pipeline_stage_manager:
+            self.set_pipeline_forward(
+                model_cls=FalconForSequenceClassification,
+                new_forward=FalconPipelineForwards.falcon_for_sequence_classification_forward,
+                policy=policy_map,
+            )
+        return policy_map
+    def get_held_layers(self) -> List[Module]:
+        """Return the base held layers, plus ``score`` on the last pipeline stage."""
+        layers = super().get_held_layers()
+        if self.pipeline_stage_manager.is_last_stage():
+            layers.append(self.model.score)
+        return layers
+    def get_shared_params(self) -> List[Dict[int, Tensor]]:
+        """Return an empty list: no shared parameters for sequence classification."""
+        return []
+class FalconForTokenClassificationPolicy(FalconPolicy):
+    """Policy for ``FalconForTokenClassification``: base policy plus dropout and classifier."""
+    def __init__(self) -> None:
+        super().__init__()
+    def module_policy(self):
+        """Extend the base policy with the classifier/dropout replacements and the pipeline forward."""
+        from transformers.models.falcon.modeling_falcon import FalconForTokenClassification
+        policy_map = super().module_policy()
+        # handle tensor parallelism
+        if self.shard_config.enable_tensor_parallelism:
+            classifier_replacement = SubModuleReplacementDescription(
+                suffix="classifier",
+                target_module=col_nn.Linear1D_Col,
+                kwargs={"gather_output": True},
+            )
+            dropout_replacement = SubModuleReplacementDescription(
+                suffix="dropout",
+                target_module=col_nn.DropoutForReplicatedInput,
+            )
+            self.append_or_create_submodule_replacement(
+                description=[classifier_replacement, dropout_replacement],
+                policy=policy_map,
+                target_key=FalconForTokenClassification,
+            )
+        if self.pipeline_stage_manager:
+            self.set_pipeline_forward(
+                model_cls=FalconForTokenClassification,
+                new_forward=FalconPipelineForwards.falcon_for_token_classification_forward,
+                policy=policy_map,
+            )
+        return policy_map
+    def get_held_layers(self) -> List[Module]:
+        """Return the base held layers, plus dropout and classifier on the last pipeline stage."""
+        layers = super().get_held_layers()
+        if self.pipeline_stage_manager.is_last_stage():
+            # order matters to callers that rely on list position: dropout first, then classifier
+            layers.extend([self.model.dropout, self.model.classifier])
+        return layers
+    def get_shared_params(self) -> List[Dict[int, Tensor]]:
+        """Return an empty list: no shared parameters for token classification."""
+        return []
+class FalconForQuestionAnsweringPolicy(FalconPolicy):
+    """Policy for ``FalconForQuestionAnswering``: base Falcon policy plus the ``qa_outputs`` head."""
+    def __init__(self) -> None:
+        super().__init__()
+    def module_policy(self):
+        """Extend the base policy with the ``qa_outputs`` replacement and the pipeline forward."""
+        from transformers.models.falcon.modeling_falcon import FalconForQuestionAnswering
+        policy_map = super().module_policy()
+        # handle tensor parallelism
+        if self.shard_config.enable_tensor_parallelism:
+            qa_replacement = SubModuleReplacementDescription(
+                suffix="qa_outputs",
+                target_module=col_nn.Linear1D_Col,
+                kwargs={"gather_output": True},
+            )
+            self.append_or_create_submodule_replacement(
+                description=qa_replacement,
+                policy=policy_map,
+                target_key=FalconForQuestionAnswering,
+            )
+        if self.pipeline_stage_manager:
+            self.set_pipeline_forward(
+                model_cls=FalconForQuestionAnswering,
+                new_forward=FalconPipelineForwards.falcon_for_question_answering_forward,
+                policy=policy_map,
+            )
+        return policy_map
+    def get_held_layers(self) -> List[Module]:
+        """Return the base held layers, plus ``qa_outputs`` on the last pipeline stage."""
+        layers = super().get_held_layers()
+        if self.pipeline_stage_manager.is_last_stage():
+            layers.append(self.model.qa_outputs)
+        return layers
+    def get_shared_params(self) -> List[Dict[int, Tensor]]:
+        """Return an empty list: no shared parameters for question answering."""
+        return []
--- a/colossalai/shardformer/policies/gptj.py
+++ b/colossalai/shardformer/policies/gptj.py
+import warnings
+from functools import partial
+from typing import Callable, Dict, List
+from torch import Tensor, nn
+import colossalai.shardformer.layer as col_nn
+from ..modeling.gptj import GPTJPipelineForwards, get_gptj_flash_attention_forward
+from .base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription
+# Only names that are actually defined in this module are exported. The Flax policy names
+# previously listed here are not defined in this file, which made a star-import of the
+# module fail with AttributeError.
+__all__ = [
+    "GPTJPolicy",
+    "GPTJModelPolicy",
+    "GPTJForCausalLMPolicy",
+    "GPTJForSequenceClassificationPolicy",
+    "GPTJForQuestionAnsweringPolicy",
+]
+class GPTJPolicy(Policy):
+    """Base shardformer policy for GPT-J models.
+
+    Describes the attribute, method and sub-module replacements applied for tensor
+    parallelism, fused normalization and flash attention, and provides the helpers
+    that the task-specific policies use for pipeline parallelism.
+    """
+    def config_sanity_check(self):
+        """No configuration checks are performed for GPT-J."""
+        pass
+    def preprocess(self):
+        # reshape the embedding layer
+        r"""
+        Resize the token embeddings so that the vocabulary size is divisible by the
+        tensor parallel size. Nothing is changed when tensor parallelism is disabled.
+
+        Returns:
+            The (possibly resized) model.
+        """
+        if self.shard_config.enable_tensor_parallelism:
+            vocab_size = self.model.config.vocab_size
+            world_size = self.shard_config.tensor_parallel_size
+            if vocab_size % world_size != 0:
+                # round vocab_size up to the next multiple of world_size
+                new_vocab_size = vocab_size + world_size - vocab_size % world_size
+                self.model.resize_token_embeddings(new_vocab_size)
+        return self.model
+    def module_policy(self):
+        """Build the replacement policy shared by all GPT-J model variants.
+
+        Returns:
+            A dict keyed by transformers GPT-J classes. Tensor parallelism entries are
+            created as ``ModulePolicyDescription`` objects; fused normalization and
+            flash attention entries are added through the ``append_or_create_*`` helpers.
+        """
+        from transformers.models.gptj.modeling_gptj import GPTJAttention, GPTJBlock, GPTJModel
+        policy = {}
+        # Sequence parallelism is always switched off (with a warning) for GPT-J.
+        # This mutates the shared shard_config.
+        if self.shard_config.enable_sequence_parallelism:
+            self.shard_config.enable_sequence_parallelism = False
+            warnings.warn("GPTJ doesn't support sequence parallelism now, will ignore the sequence parallelism flag.")
+        # The flag was cleared just above, so this value is always falsy at this point.
+        use_sequence_parallel = self.shard_config.enable_sequence_parallelism
+        overlap = self.shard_config.enable_sequence_overlap
+        if self.shard_config.enable_tensor_parallelism:
+            # Model level: vocab-parallel token embedding and parallel-input dropout.
+            policy[GPTJModel] = ModulePolicyDescription(
+                sub_module_replacement=[
+                    SubModuleReplacementDescription(
+                        suffix="wte",
+                        target_module=col_nn.VocabParallelEmbedding1D,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="drop",
+                        target_module=col_nn.DropoutForParallelInput,
+                    ),
+                ]
+            )
+            # Block level: per-rank attention sizes, column-parallel input projections,
+            # row-parallel output projections, and parallel-input dropouts.
+            policy[GPTJBlock] = ModulePolicyDescription(
+                attribute_replacement={
+                    "attn.embed_dim": self.model.config.hidden_size // self.shard_config.tensor_parallel_size,
+                    "attn.num_attention_heads": self.model.config.num_attention_heads
+                    // self.shard_config.tensor_parallel_size,
+                },
+                sub_module_replacement=[
+                    SubModuleReplacementDescription(
+                        suffix="attn.k_proj",
+                        target_module=col_nn.Linear1D_Col,
+                        kwargs={"seq_parallel": use_sequence_parallel, "overlap": overlap},
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="attn.q_proj",
+                        target_module=col_nn.Linear1D_Col,
+                        kwargs={"seq_parallel": use_sequence_parallel, "overlap": overlap},
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="attn.v_proj",
+                        target_module=col_nn.Linear1D_Col,
+                        kwargs={"seq_parallel": use_sequence_parallel, "overlap": overlap},
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="attn.out_proj",
+                        target_module=col_nn.Linear1D_Row,
+                        kwargs={"seq_parallel": use_sequence_parallel},
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="mlp.fc_in",
+                        target_module=col_nn.Linear1D_Col,
+                        kwargs={"seq_parallel": use_sequence_parallel},
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="mlp.fc_out",
+                        target_module=col_nn.Linear1D_Row,
+                        kwargs={"seq_parallel": use_sequence_parallel},
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="attn.attn_dropout",
+                        target_module=col_nn.DropoutForParallelInput,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="attn.resid_dropout",
+                        target_module=col_nn.DropoutForParallelInput,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="mlp.dropout",
+                        target_module=col_nn.DropoutForParallelInput,
+                    ),
+                ],
+            )
+        # optimization configuration
+        if self.shard_config.enable_fused_normalization:
+            # final layer norm of the model
+            self.append_or_create_submodule_replacement(
+                description=SubModuleReplacementDescription(
+                    suffix="ln_f",
+                    target_module=col_nn.FusedLayerNorm,
+                ),
+                policy=policy,
+                target_key=GPTJModel,
+            )
+            # layer norm of each block
+            self.append_or_create_submodule_replacement(
+                description=[
+                    SubModuleReplacementDescription(
+                        suffix="ln_1",
+                        target_module=col_nn.FusedLayerNorm,
+                    )
+                ],
+                policy=policy,
+                target_key=GPTJBlock,
+            )
+        if self.shard_config.enable_flash_attention:
+            # replace the attention forward with the flash attention implementation
+            self.append_or_create_method_replacement(
+                description={
+                    "forward": get_gptj_flash_attention_forward(),
+                },
+                policy=policy,
+                target_key=GPTJAttention,
+            )
+        return policy
+    def postprocess(self):
+        """Return the model unchanged; no post-processing is applied."""
+        return self.model
+    def get_held_layers(self) -> List[nn.Module]:
+        """Get pipeline layers for current stage."""
+        assert self.pipeline_stage_manager is not None
+        # The block stack is the model itself for GPTJModel, otherwise its
+        # `transformer` attribute.
+        if self.model.__class__.__name__ == "GPTJModel":
+            module = self.model
+        else:
+            module = self.model.transformer
+        stage_manager = self.pipeline_stage_manager
+        held_layers = []
+        layers_per_stage = self.distribute_layers(len(module.h), stage_manager.num_stages)
+        # the first stage additionally holds the token embedding and its dropout
+        if stage_manager.is_first_stage():
+            held_layers.append(module.wte)
+            held_layers.append(module.drop)
+        start_idx, end_idx = self.get_stage_index(layers_per_stage, stage_manager.stage)
+        held_layers.extend(module.h[start_idx:end_idx])
+        # the last stage additionally holds the final layer norm
+        if stage_manager.is_last_stage():
+            held_layers.append(module.ln_f)
+        return held_layers
+    def set_pipeline_forward(self, model_cls: nn.Module, new_forward: Callable, policy: Dict) -> None:
+        """If under pipeline parallel setting, replacing the original forward method of huggingface
+        to customized forward method, and add this changing to policy.
+
+        Raises:
+            ValueError: If no pipeline stage manager is set.
+        """
+        if not self.pipeline_stage_manager:
+            raise ValueError("set_pipeline_forward method can only be called when pipeline parallel is enabled.")
+        stage_manager = self.pipeline_stage_manager
+        if self.model.__class__.__name__ == "GPTJModel":
+            module = self.model
+        else:
+            module = self.model.transformer
+        layers_per_stage = Policy.distribute_layers(len(module.h), stage_manager.num_stages)
+        stage_index = Policy.get_stage_index(layers_per_stage, stage_manager.stage)
+        # Bind the stage information into the new forward so it can be installed as a method.
+        method_replacement = {
+            "forward": partial(
+                new_forward, stage_manager=stage_manager, stage_index=stage_index, shard_config=self.shard_config
+            )
+        }
+        self.append_or_create_method_replacement(description=method_replacement, policy=policy, target_key=model_cls)
+# GPTJModel
+class GPTJModelPolicy(GPTJPolicy):
+    """Policy for the bare ``GPTJModel``."""
+    def __init__(self) -> None:
+        super().__init__()
+    def module_policy(self):
+        """Return the base GPT-J policy, adding the pipeline forward when a stage manager is set."""
+        from transformers.models.gptj.modeling_gptj import GPTJModel
+        policy_map = super().module_policy()
+        # Nothing to add without pipeline parallelism.
+        if self.pipeline_stage_manager is None:
+            return policy_map
+        self.set_pipeline_forward(
+            model_cls=GPTJModel,
+            new_forward=GPTJPipelineForwards.gptj_model_forward,
+            policy=policy_map,
+        )
+        return policy_map
+    def get_held_layers(self) -> List[nn.Module]:
+        """Return the layers held by the current pipeline stage (same as the base policy)."""
+        layers = super().get_held_layers()
+        return layers
+    def get_shared_params(self) -> List[Dict[int, Tensor]]:
+        """Return an empty list: no shared params in GPTJModel."""
+        return []
+# GPTJForCausalLM
+class GPTJForCausalLMPolicy(GPTJPolicy):
+    """Policy for ``GPTJForCausalLM``: base GPT-J policy plus the ``lm_head`` handling."""
+    def __init__(self) -> None:
+        super().__init__()
+    def module_policy(self):
+        """Extend the base policy with the ``lm_head`` replacement and the pipeline forward."""
+        from transformers.models.gptj.modeling_gptj import GPTJForCausalLM
+        policy_map = super().module_policy()
+        if self.shard_config.enable_tensor_parallelism:
+            lm_head_replacement = SubModuleReplacementDescription(
+                suffix="lm_head",
+                target_module=col_nn.Linear1D_Col,
+                kwargs={"gather_output": True},
+            )
+            # Sets (and would overwrite) the entry for GPTJForCausalLM in the policy dict.
+            policy_map[GPTJForCausalLM] = ModulePolicyDescription(sub_module_replacement=[lm_head_replacement])
+        if self.pipeline_stage_manager is not None:
+            self.set_pipeline_forward(
+                model_cls=GPTJForCausalLM,
+                new_forward=GPTJPipelineForwards.gptj_causallm_model_forward,
+                policy=policy_map,
+            )
+        return policy_map
+    def get_held_layers(self) -> List[nn.Module]:
+        """Return the base held layers, plus ``lm_head`` on the last pipeline stage."""
+        layers = super().get_held_layers()
+        on_last_stage = self.pipeline_stage_manager.is_last_stage()
+        if on_last_stage:
+            layers.append(self.model.lm_head)
+        return layers
+    def get_shared_params(self) -> List[Dict[int, Tensor]]:
+        """Return the tied ``wte`` / ``lm_head`` weights when they span several stages.
+
+        The result maps stage 0 to the ``wte`` weight and the last stage to the ``lm_head``
+        weight, but only when a stage manager with more than one stage is set and both
+        attributes refer to the same object. Otherwise an empty list is returned.
+        """
+        pipeline = self.pipeline_stage_manager
+        if pipeline is None or pipeline.num_stages <= 1:
+            return []
+        model = self.model
+        # The weights are shared only if both names point at the very same object.
+        if model.transformer.wte.weight is not model.lm_head.weight:
+            return []
+        last_stage = pipeline.num_stages - 1
+        return [{0: model.transformer.wte.weight, last_stage: model.lm_head.weight}]
+# GPTJForSequenceClassification
+class GPTJForSequenceClassificationPolicy(GPTJPolicy):
+    """Policy for ``GPTJForSequenceClassification``: base GPT-J policy plus the ``score`` head."""
+    def __init__(self) -> None:
+        super().__init__()
+    def module_policy(self):
+        """Return the base GPT-J policy, adding the pipeline forward when a stage manager is set."""
+        from transformers.models.gptj.modeling_gptj import GPTJForSequenceClassification
+        policy_map = super().module_policy()
+        # Nothing to add without pipeline parallelism.
+        if self.pipeline_stage_manager is None:
+            return policy_map
+        self.set_pipeline_forward(
+            model_cls=GPTJForSequenceClassification,
+            new_forward=GPTJPipelineForwards.gptj_for_sequence_classification_forward,
+            policy=policy_map,
+        )
+        return policy_map
+    def get_held_layers(self) -> List[nn.Module]:
+        """Return the base held layers, plus ``score`` on the last pipeline stage."""
+        layers = super().get_held_layers()
+        on_last_stage = self.pipeline_stage_manager.is_last_stage()
+        if on_last_stage:
+            layers.append(self.model.score)
+        return layers
+    def get_shared_params(self) -> List[Dict[int, Tensor]]:
+        """Return an empty list: no shared params in GPTJForSequenceClassification."""
+        return []
+# GPTJForQuestionAnswering
+class GPTJForQuestionAnsweringPolicy(GPTJPolicy):
+    """Policy for ``GPTJForQuestionAnswering``: base GPT-J policy plus the ``qa_outputs`` head."""
+    def __init__(self) -> None:
+        super().__init__()
+    def module_policy(self):
+        """Return the base GPT-J policy, adding the pipeline forward when a stage manager is set."""
+        from transformers.models.gptj.modeling_gptj import GPTJForQuestionAnswering
+        policy_map = super().module_policy()
+        # Nothing to add without pipeline parallelism.
+        if self.pipeline_stage_manager is None:
+            return policy_map
+        self.set_pipeline_forward(
+            model_cls=GPTJForQuestionAnswering,
+            new_forward=GPTJPipelineForwards.gptj_for_question_answering_forward,
+            policy=policy_map,
+        )
+        return policy_map
+    def get_held_layers(self) -> List[nn.Module]:
+        """Return the base held layers, plus ``qa_outputs`` on the last pipeline stage."""
+        layers = super().get_held_layers()
+        on_last_stage = self.pipeline_stage_manager.is_last_stage()
+        if on_last_stage:
+            layers.append(self.model.qa_outputs)
+        return layers
+    def get_shared_params(self) -> List[Dict[int, Tensor]]:
+        """Return an empty list: no shared params in GPTJForQuestionAnswering."""
+        return []
--- a/colossalai/shardformer/policies/llama.py
+++ b/colossalai/shardformer/policies/llama.py
@@ -8,7 +8,11 @@ from torch.nn import Module
 from colossalai.shardformer.layer import FusedRMSNorm, Linear1D_Col, Linear1D_Row, RMSNorm, VocabParallelEmbedding1D
-from ..modeling.llama import LlamaPipelineForwards, get_llama_flash_attention_forward
+from ..modeling.llama import (
+    LlamaPipelineForwards,
+    get_llama_flash_attention_forward,
+    get_lm_forward_with_dist_cross_entropy,
+)
 from .base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription
 __all__ = ["LlamaPolicy", "LlamaForCausalLMPolicy", "LlamaForSequenceClassificationPolicy"]
@@ -126,7 +130,7 @@ class LlamaPolicy(Policy):
        if self.shard_config.enable_flash_attention:
            self.append_or_create_method_replacement(
                description={
-                    "forward": get_llama_flash_attention_forward(),
+                    "forward": get_llama_flash_attention_forward(self.shard_config),
                },
                policy=policy,
                target_key=LlamaAttention,
@@ -140,21 +144,42 @@ class LlamaPolicy(Policy):
    def set_pipeline_forward(self, model_cls: nn.Module, new_forward: Callable, policy: Dict) -> None:
        """If under pipeline parallel setting, replacing the original forward method of huggingface
        to customized forward method, and add this changing to policy."""
-        if self.pipeline_stage_manager:
+        if self.pipeline_stage_manager is None:
-            stage_manager = self.pipeline_stage_manager
+            return
-            if self.model.__class__.__name__ == "LlamaModel":
-                module = self.model
+        stage_manager = self.pipeline_stage_manager
-            else:
+        if self.model.__class__.__name__ == "LlamaModel":
-                module = self.model.model
+            module = self.model
+        else:
+            module = self.model.model
+        if stage_manager.is_interleave:
+            layers_per_stage = self.distribute_layers(
+                len(module.layers), stage_manager.num_stages * stage_manager.num_model_chunks
+            )
+            stage_manager.stage_indices = Policy.get_stage_index(
+                layers_per_stage,
+                stage_manager.stage,
+                num_model_chunks=stage_manager.num_model_chunks,
+                num_stages=stage_manager.num_stages,
+            )
+            method_replacement = {
+                "forward": partial(new_forward, stage_manager=stage_manager, shard_config=self.shard_config)
+            }
+        else:
            layers_per_stage = Policy.distribute_layers(len(module.layers), stage_manager.num_stages)
            stage_index = Policy.get_stage_index(layers_per_stage, stage_manager.stage)
-            method_replacement = {"forward": partial(new_forward, stage_manager=stage_manager, stage_index=stage_index)}
+            method_replacement = {
+                "forward": partial(
+                    new_forward, stage_manager=stage_manager, stage_index=stage_index, shard_config=self.shard_config
+                )
+            }
            self.append_or_create_method_replacement(
                description=method_replacement, policy=policy, target_key=model_cls
            )
-        return
+        self.append_or_create_method_replacement(description=method_replacement, policy=policy, target_key=model_cls)
    def get_held_layers(self) -> List[Module]:
        """Get pipeline layers for current stage."""
@@ -167,13 +192,32 @@ class LlamaPolicy(Policy):
        stage_manager = self.pipeline_stage_manager
        held_layers = []
-        layers_per_stage = self.distribute_layers(len(module.layers), stage_manager.num_stages)
+        if stage_manager.is_interleave:
-        if stage_manager.is_first_stage():
+            assert stage_manager.num_model_chunks is not None
-            held_layers.append(module.embed_tokens)
+            layers_per_stage = self.distribute_layers(
-        start_idx, end_idx = self.get_stage_index(layers_per_stage, stage_manager.stage)
+                len(module.layers), stage_manager.num_stages * stage_manager.num_model_chunks
-        held_layers.extend(module.layers[start_idx:end_idx])
+            )
-        if stage_manager.is_last_stage():
+            stage_indices = Policy.get_stage_index(
-            held_layers.append(module.norm)
+                layers_per_stage,
+                stage_manager.stage,
+                num_model_chunks=stage_manager.num_model_chunks,
+                num_stages=stage_manager.num_stages,
+            )
+            if stage_manager.is_first_stage(ignore_chunk=True):
+                held_layers.append(module.embed_tokens)
+            for start_idx, end_idx in stage_indices:
+                held_layers.extend(module.layers[start_idx:end_idx])
+            if stage_manager.is_last_stage(ignore_chunk=True):
+                held_layers.append(module.norm)
+        else:
+            layers_per_stage = self.distribute_layers(len(module.layers), stage_manager.num_stages)
+            if stage_manager.is_first_stage():
+                held_layers.append(module.embed_tokens)
+            start_idx, end_idx = self.get_stage_index(layers_per_stage, stage_manager.stage)
+            held_layers.extend(module.layers[start_idx:end_idx])
+            if stage_manager.is_last_stage():
+                held_layers.append(module.norm)
        return held_layers
@@ -206,15 +250,16 @@ class LlamaForCausalLMPolicy(LlamaPolicy):
        policy = super().module_policy()
+        setattr(self.shard_config, "causal_lm", True)
        if self.shard_config.enable_tensor_parallelism:
            # add a new item for causal lm
            new_item = {
                LlamaForCausalLM: ModulePolicyDescription(
                    sub_module_replacement=[
-                        SubModuleReplacementDescription(
+                        SubModuleReplacementDescription(suffix="lm_head", target_module=Linear1D_Col)
-                            suffix="lm_head", target_module=Linear1D_Col, kwargs=dict(gather_output=True)
+                    ],
-                        )
+                    method_replacement={"forward": get_lm_forward_with_dist_cross_entropy(self.shard_config)},
-                    ]
                )
            }
            policy.update(new_item)
@@ -231,7 +276,7 @@ class LlamaForCausalLMPolicy(LlamaPolicy):
        """Get pipeline layers for current stage."""
        stage_manager = self.pipeline_stage_manager
        held_layers = super().get_held_layers()
-        if stage_manager.is_last_stage():
+        if stage_manager.is_last_stage(ignore_chunk=True):
            held_layers.append(self.model.lm_head)
        return held_layers
@@ -284,7 +329,7 @@ class LlamaForSequenceClassificationPolicy(LlamaPolicy):
        """Get pipeline layers for current stage."""
        stage_manager = self.pipeline_stage_manager
        held_layers = super().get_held_layers()
-        if stage_manager.is_last_stage():
+        if stage_manager.is_last_stage(ignore_chunk=True):
            held_layers.append(self.model.score)
        return held_layers

--- a/colossalai/shardformer/policies/mistral.py
+++ b/colossalai/shardformer/policies/mistral.py
+import warnings
+from typing import Dict, Union
+import torch.nn as nn
+from colossalai.shardformer.layer import FusedRMSNorm, Linear1D_Col, Linear1D_Row, VocabParallelEmbedding1D
+from ..modeling.mistral import get_mistral_flash_attention_forward
+from .base_policy import ModulePolicyDescription, Policy, SubModuleReplacementDescription
+__all__ = ["MistralPolicy", "MistralModelPolicy", "MistralForCausalLMPolicy", "MistralForSequenceClassificationPolicy"]
+class MistralPolicy(Policy):
+    def config_sanity_check(self):
+        pass
+    def preprocess(self):
+        if self.shard_config.enable_tensor_parallelism:
+            # Resize embedding
+            vocab_size = self.model.config.vocab_size
+            world_size = self.shard_config.tensor_parallel_size
+            if vocab_size % world_size != 0:
+                new_vocab_size = vocab_size + world_size - vocab_size % world_size
+                self.model.resize_token_embeddings(new_vocab_size)
+        return self.model
+    def module_policy(self) -> Dict[Union[str, nn.Module], ModulePolicyDescription]:
+        from transformers.models.mistral.modeling_mistral import MistralAttention, MistralDecoderLayer, MistralModel
+        policy = {}
+        if self.shard_config.enable_sequence_parallelism:
+            self.shard_config.enable_sequence_parallelism = False
+            warnings.warn(
+                "Mistral dosen't support sequence parallelism now, will ignore the sequence parallelism flag."
+            )
+        if self.shard_config.enable_tensor_parallelism:
+            decoder_attribute_replacement = {
+                "self_attn.hidden_size": self.model.config.hidden_size // self.shard_config.tensor_parallel_size,
+                "self_attn.num_heads": self.model.config.num_attention_heads // self.shard_config.tensor_parallel_size,
+                "self_attn.num_key_value_heads": self.model.config.num_key_value_heads
+                // self.shard_config.tensor_parallel_size,
+            }
+            policy[MistralDecoderLayer] = ModulePolicyDescription(
+                attribute_replacement=decoder_attribute_replacement,
+                sub_module_replacement=[
+                    SubModuleReplacementDescription(
+                        suffix="self_attn.q_proj",
+                        target_module=Linear1D_Col,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="self_attn.k_proj",
+                        target_module=Linear1D_Col,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="self_attn.v_proj",
+                        target_module=Linear1D_Col,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="self_attn.o_proj",
+                        target_module=Linear1D_Row,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="mlp.gate_proj",
+                        target_module=Linear1D_Col,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="mlp.up_proj",
+                        target_module=Linear1D_Col,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="mlp.down_proj",
+                        target_module=Linear1D_Row,
+                    ),
+                ],
+            )
+            self.append_or_create_submodule_replacement(
+                description=SubModuleReplacementDescription(
+                    suffix="embed_tokens",
+                    target_module=VocabParallelEmbedding1D,
+                ),
+                policy=policy,
+                target_key=MistralModel,
+            )
+        # optimization configuration
+        if self.shard_config.enable_fused_normalization:
+            self.append_or_create_submodule_replacement(
+                description=[
+                    SubModuleReplacementDescription(
+                        suffix="input_layernorm",
+                        target_module=FusedRMSNorm,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="post_attention_layernorm",
+                        target_module=FusedRMSNorm,
+                    ),
+                ],
+                policy=policy,
+                target_key=MistralDecoderLayer,
+            )
+            self.append_or_create_submodule_replacement(
+                description=SubModuleReplacementDescription(
+                    suffix="norm",
+                    target_module=FusedRMSNorm,
+                ),
+                policy=policy,
+                target_key=MistralModel,
+            )
+        if self.shard_config.enable_flash_attention:
+            self.append_or_create_method_replacement(
+                description={
+                    "forward": get_mistral_flash_attention_forward(),
+                },
+                policy=policy,
+                target_key=MistralAttention,
+            )
+        return policy
+    def postprocess(self):
+        return self.model
+class MistralModelPolicy(MistralPolicy):
+    def __init__(self) -> None:
+        super().__init__()
+    def module_policy(self):
+        if self.pipeline_stage_manager:
+            warnings.warn("Mistral dosen't support pipeline parallelism now.")
+        return super().module_policy()
+class MistralForCausalLMPolicy(MistralPolicy):
+    def module_policy(self):
+        from transformers import MistralForCausalLM
+        policy = super().module_policy()
+        if self.shard_config.enable_tensor_parallelism:
+            # add a new item for causal lm
+            new_item = {
+                MistralForCausalLM: ModulePolicyDescription(
+                    sub_module_replacement=[
+                        SubModuleReplacementDescription(
+                            suffix="lm_head", target_module=Linear1D_Col, kwargs=dict(gather_output=True)
+                        )
+                    ]
+                )
+            }
+            if self.pipeline_stage_manager:
+                warnings.warn("Mistral dosen't support pipeline parallelism now.")
+            policy.update(new_item)
+        return policy
+class MistralForSequenceClassificationPolicy(MistralPolicy):
+    def module_policy(self):
+        from transformers import MistralForSequenceClassification
+        policy = super().module_policy()
+        if self.shard_config.enable_tensor_parallelism:
+            # add a new item for sequence classification
+            new_item = {
+                MistralForSequenceClassification: ModulePolicyDescription(
+                    sub_module_replacement=[
+                        SubModuleReplacementDescription(
+                            suffix="score", target_module=Linear1D_Col, kwargs=dict(gather_output=True)
+                        )
+                    ]
+                )
+            }
+            if self.pipeline_stage_manager:
+                warnings.warn("Mistral dosen't support pipeline parallelism now.")
+            policy.update(new_item)
+        return policy
--- a/colossalai/shardformer/policies/opt.py
+++ b/colossalai/shardformer/policies/opt.py
@@ -22,6 +22,15 @@ __all__ = [
 class OPTPolicy(Policy):
+    def __init__(self) -> None:
+        super().__init__()
+        import transformers
+        from packaging.version import Version
+        assert Version(transformers.__version__) <= Version(
+            "4.33.0"
+        ), "The OPT model should run on a transformers version not greater than 4.33.0."
    def config_sanity_check(self):
        pass

--- a/colossalai/shardformer/policies/whisper.py
+++ b/colossalai/shardformer/policies/whisper.py
@@ -26,6 +26,15 @@ __all__ = [
 class WhisperPolicy(Policy):
+    def __init__(self) -> None:
+        super().__init__()
+        import transformers
+        from packaging.version import Version
+        assert Version(transformers.__version__) <= Version(
+            "4.33.0"
+        ), "The Whisper model should run on a transformers version not greater than 4.33.0."
    def config_sanity_check(self):
        pass

--- a/colossalai/shardformer/shard/shard_config.py
+++ b/colossalai/shardformer/shard/shard_config.py
@@ -22,8 +22,8 @@ class ShardConfig:
        enable_flash_attention (bool, optional): Whether to switch on flash attention. Defaults to False.
        enable_jit_fused (bool, optional): Whether to switch on JIT fused operators. Defaults to False.
        enable_sequence_parallelism (bool): Whether to turn on sequence parallelism, which partitions non-tensor-parallel regions along the sequence dimension. Defaults to False.
-        enable_sequence_overlap (bool): Whether to turn on sequence overlap, wheich overlap the computation and communication in sequence parallelism. It can only be used when enable_sequence_parallelism is True. Defaults to False.
+        enable_sequence_overlap (bool): Whether to turn on sequence overlap, which overlaps the computation and communication in sequence parallelism. It can only be used when enable_sequence_parallelism is True. Defaults to False.
-        enable_all_optimization (bool): Whether to turn on all optimization tools including 'fused normalizaion', 'flash attention', 'JIT fused operators', 'sequence parallelism' and 'sequence overlap'. Defaults to False.
+        enable_all_optimization (bool): Whether to turn on all optimization tools including 'fused normalization', 'flash attention', 'JIT fused operators', 'sequence parallelism' and 'sequence overlap'. Defaults to False.
    """
    tensor_parallel_process_group: Optional[ProcessGroup] = None
    pipeline_stage_manager: Optional[PipelineStageManager] = None

--- a/colossalai/shardformer/shard/sharder.py
+++ b/colossalai/shardformer/shard/sharder.py
@@ -37,7 +37,7 @@ class ModelSharder(object):
        self.policy.set_model(self.model)
        self.policy.set_shard_config(self.shard_config)
        self._preprocess()
-        # get shared params before release unheld layers, this avoid misjudgement of shared params (None is None)
+        # get shared params before releasing unheld layers; this avoids misjudgment of shared params (None is None)
        shared_params = self.policy.get_shared_params()
        held_layers = self._release_unheld_layers()
        self._replace_module(include=held_layers)

--- a/colossalai/tensor/colo_parameter.py
+++ b/colossalai/tensor/colo_parameter.py
@@ -7,7 +7,7 @@ from colossalai.tensor.param_op_hook import ColoParamOpHookManager
 from .colo_tensor import _convert_output
-WHITE_LIST_FUNCS = {torch.Tensor.__getitem__}
+WHITE_LIST_FUNCS = {torch.Tensor.__getitem__, torch.Tensor.is_floating_point}
 def is_no_hook_op(func) -> bool:

--- a/colossalai/tensor/d_tensor/comm_spec.py
+++ b/colossalai/tensor/d_tensor/comm_spec.py
@@ -112,7 +112,7 @@ def _split(tensor: torch.Tensor, comm_spec: CommSpec):
    dim = comm_spec.shard_dim
    length = tensor.shape[comm_spec.shard_dim] // dist.get_world_size(process_group)
    start = length * dist.get_rank(process_group)
-    output = torch.narrow(tensor, dim, start, length).contiguous()
+    output = torch.narrow(tensor, dim, start, length).clone().contiguous()
    return output

--- a/colossalai/utils/memory.py
+++ b/colossalai/utils/memory.py
+from collections import namedtuple
+import psutil
+import torch
+import torch.distributed as dist
+from colossalai.utils import get_current_device
+_GLOBAL_CUDA_MEM_FRACTION = 1.0
+_GLOBAL_CPU_MEM_CAPACITY = -1
+# copy from PatrickStar
+def _get_cpu_memory_info():
+    ps_mem_info = namedtuple("ps_mem_info", ["total", "free", "cached", "buffers", "used"])
+    try:
+        # psutil reads the memory info from /proc/meminfo,
+        # which results in returning the host memory instead of
+        # that of the container.
+        # Here we try to read the container memory with method in:
+        # https://stackoverflow.com/a/46213331/5163915
+        mems = {}
+        with open("/sys/fs/cgroup/memory/memory.meminfo", "rb") as f:
+            for line in f:
+                fields = line.split()
+                mems[fields[0]] = int(fields[1]) * 1024
+        total = mems[b"MemTotal:"]
+        free = mems[b"MemFree:"]
+        cached = mems[b"Cached:"]
+        buffers = mems[b"Buffers:"]
+        used = total - free - cached - buffers
+        if used < 0:
+            used = total - free
+        mem_info = ps_mem_info(total=total, free=free, cached=cached, buffers=buffers, used=used)
+    except FileNotFoundError:
+        mems = psutil.virtual_memory()
+        mem_info = ps_mem_info(
+            total=mems.total,
+            free=mems.free,
+            cached=mems.cached,
+            buffers=mems.buffers,
+            used=mems.used,
+        )
+    return mem_info
+def colo_device_memory_capacity(device: torch.device) -> int:
+    """
+    Get the capacity of the memory of the device
+    Args:
+        device (torch.device): a device
+    Returns:
+        int: size in bytes
+    """
+    # TODO: add NPU support
+    assert isinstance(device, torch.device)
+    if device.type == "cpu":
+        # In the context of 1-CPU-N-GPU, the memory capacity of the current process is 1/N of the overall CPU memory.
+        return colo_get_cpu_memory_capacity() // dist.get_world_size()
+    if device.type == "cuda":
+        return torch.cuda.get_device_properties(get_current_device()).total_memory * _GLOBAL_CUDA_MEM_FRACTION
+def colo_get_cpu_memory_capacity() -> int:
+    """
+    Get the cpu memory capacity. We may not use all of it.
+    Returns:
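+        int: the total CPU memory reported by _get_cpu_memory_info() while the global capacity is unset (-1), otherwise the global capacity value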
+    """
+    global _GLOBAL_CPU_MEM_CAPACITY
+    if _GLOBAL_CPU_MEM_CAPACITY == -1:
+        mem_info = _get_cpu_memory_info()
+        return mem_info.total
+    else:
+        return _GLOBAL_CPU_MEM_CAPACITY
--- a/colossalai/zero/__init__.py
+++ b/colossalai/zero/__init__.py
-from .gemini import (
+from .gemini import GeminiAdamOptimizer, GeminiDDP, GeminiOptimizer, get_static_torch_model
-    ColoInitContext,
-    GeminiAdamOptimizer,
-    GeminiDDP,
-    GeminiOptimizer,
-    get_static_torch_model,
-    post_process_colo_init_ctx,
-)
 from .low_level import LowLevelZeroOptimizer
 from .wrapper import zero_model_wrapper, zero_optim_wrapper
@@ -16,7 +9,5 @@ __all__ = [
    "zero_model_wrapper",
    "zero_optim_wrapper",
    "LowLevelZeroOptimizer",
-    "ColoInitContext",
-    "post_process_colo_init_ctx",
    "get_static_torch_model",
 ]
--- a/colossalai/zero/gemini/__init__.py
+++ b/colossalai/zero/gemini/__init__.py
 from .chunk import ChunkManager, TensorInfo, TensorState, search_chunk_configuration
-from .colo_init_context import ColoInitContext, post_process_colo_init_ctx
 from .gemini_ddp import GeminiDDP
 from .gemini_mgr import GeminiManager
 from .gemini_optimizer import GeminiAdamOptimizer, GeminiOptimizer
@@ -15,6 +14,4 @@ __all__ = [
    "get_static_torch_model",
    "GeminiAdamOptimizer",
    "GeminiOptimizer",
-    "ColoInitContext",
-    "post_process_colo_init_ctx",
 ]
--- a/docs/README-zh-Hans.md
+++ b/docs/README-zh-Hans.md
@@ -24,15 +24,16 @@
 </div>
 ## 新闻
-* [2023/09] [One Half-Day of Training Using a Few Hundred Dollars Yields Similar Results to Mainstream Large Models, Open-Source and Commercial-Free Domain-Specific Llm Solution](https://www.hpc-ai.tech/blog/one-half-day-of-training-using-a-few-hundred-dollars-yields-similar-results-to-mainstream-large-models-open-source-and-commercial-free-domain-specific-llm-solution)
+* [2024/01] [Inference Performance Improved by 46%, Open Source Solution Breaks the Length Limit of LLM for Multi-Round Conversations](https://hpc-ai.com/blog/Colossal-AI-SwiftInfer)
+* [2024/01] [Construct Refined 13B Private Model With Just $5000 USD, Upgraded Colossal-AI Llama-2 Open Source](https://hpc-ai.com/blog/colossal-llama-2-13b)
+* [2023/11] [Enhanced MoE Parallelism, Open-source MoE Model Training Can Be 9 Times More Efficient](https://www.hpc-ai.tech/blog/enhanced-moe-parallelism-open-source-moe-model-training-can-be-9-times-more-efficient)
+* [2023/09] [One Half-Day of Training Using a Few Hundred Dollars Yields Similar Results to Mainstream Large Models, Open-Source and Commercial-Free Domain-Specific LLM Solution](https://www.hpc-ai.tech/blog/one-half-day-of-training-using-a-few-hundred-dollars-yields-similar-results-to-mainstream-large-models-open-source-and-commercial-free-domain-specific-llm-solution)
 * [2023/09] [70 Billion Parameter LLaMA2 Model Training Accelerated by 195%](https://www.hpc-ai.tech/blog/70b-llama2-training)
 * [2023/07] [HPC-AI Tech Raises 22 Million USD in Series A Funding](https://www.hpc-ai.tech/blog/hpc-ai-tech-raises-22-million-usd-in-series-a-funding-to-fuel-team-expansion-and-business-growth)
 * [2023/07] [65B Model Pretraining Accelerated by 38%, Best Practices for Building LLaMA-Like Base Models Open-Source](https://www.hpc-ai.tech/blog/large-model-pretraining)
 * [2023/03] [ColossalChat: An Open-Source Solution for Cloning ChatGPT With a Complete RLHF Pipeline](https://medium.com/@yangyou_berkeley/colossalchat-an-open-source-solution-for-cloning-chatgpt-with-a-complete-rlhf-pipeline-5edf08fb538b)
 * [2023/03] [Intel and Colossal-AI Partner to Deliver Cost-Efficient Open-Source Solution for Protein Folding Structure Prediction](https://www.hpc-ai.tech/blog/intel-habana)
 * [2023/03] [AWS and Google Fund Colossal-AI with Startup Cloud Programs](https://www.hpc-ai.tech/blog/aws-and-google-fund-colossal-ai-with-startup-cloud-programs)
-* [2023/02] [Open Source Solution Replicates ChatGPT Training Process! Ready to go with only 1.6GB GPU Memory](https://www.hpc-ai.tech/blog/colossal-ai-chatgpt)
-* [2023/01] [Hardware Savings Up to 46 Times for AIGC and  Automatic Parallelism](https://medium.com/pytorch/latest-colossal-ai-boasts-novel-automatic-parallelism-and-offers-savings-up-to-46x-for-stable-1453b48f3f02)
 ## 目录
 <ul>
@@ -51,6 +52,7 @@
   <a href="#并行训练样例展示">并行训练样例展示</a>
   <ul>
     <li><a href="#LLaMA2">LLaMA 1/2</a></li>
+     <li><a href="#MoE">MoE</a></li>
     <li><a href="#GPT-3">GPT-3</a></li>
     <li><a href="#GPT-2">GPT-2</a></li>
     <li><a href="#BERT">BERT</a></li>
@@ -68,8 +70,9 @@
   </ul>
 </li>
 <li>
-   <a href="#推理-Energon-AI-样例展示">推理 (Energon-AI) 样例展示</a>
+   <a href="#推理">推理</a>
   <ul>
+     <li><a href="#SwiftInfer">SwiftInfer:打破LLM多轮对话的长度限制，推理加速46%</a></li>
     <li><a href="#GPT-3-Inference">GPT-3</a></li>
     <li><a href="#OPT-Serving">1750亿参数OPT在线推理服务</a></li>
     <li><a href="#BLOOM-Inference">1760亿参数 BLOOM</a></li>
@@ -114,41 +117,42 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的
  - [PatrickStar](https://arxiv.org/abs/2108.05818)
 - 使用友好
  - 基于参数文件的并行化
- 推理
-  - [Energon-AI](https://github.com/hpcaitech/EnergonAI)
 <p align="right">(<a href="#top">返回顶端</a>)</p>
 ## Colossal-AI 成功案例
 ### Colossal-LLaMA-2
- 千元预算半天训练，效果媲美主流大模型，开源可商用中文LLaMA-2
+- 7B：千元预算半天训练，效果媲美主流大模型，开源可商用中文LLaMA-2
 [[代码]](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Colossal-LLaMA-2)
 [[博客]](https://www.hpc-ai.tech/blog/one-half-day-of-training-using-a-few-hundred-dollars-yields-similar-results-to-mainstream-large-models-open-source-and-commercial-free-domain-specific-llm-solution)
 [[模型权重]](https://huggingface.co/hpcai-tech/Colossal-LLaMA-2-7b-base)
-|                                |  Backbone  | Tokens Consumed |  |         MMLU         |     CMMLU     | AGIEval | GAOKAO | CEval  |
+- 13B: 万元预算打造高质量13B私有模型
-| :----------------------------: | :--------: | :-------------: | :------------------: | :-----------: | :-----: | :----: | :----: | :------------------------------: |
+[[code]](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Colossal-LLaMA-2)
-|                                |           |        -        |                |        5-shot        |    5-shot     | 5-shot  | 0-shot | 5-shot |
+[[blog]](https://hpc-ai.com/blog/colossal-llama-2-13b)
-|          Baichuan-7B           |     -      |      1.2T       |             |    42.32 (42.30)     | 44.53 (44.02) |  38.72  | 36.74  | 42.80  |
+[[HuggingFace model weights]](https://huggingface.co/hpcai-tech/Colossal-LLaMA-2-13b-base)
-|       Baichuan-13B-Base        |     -      |      1.4T       |             |    50.51 (51.60)     | 55.73 (55.30) |  47.20  | 51.41  | 53.60  |
+[[Modelscope model weights]](https://www.modelscope.cn/models/colossalai/Colossal-LLaMA-2-13b-base/summary)
-|       Baichuan2-7B-Base        |     -      |      2.6T       |             |    46.97 (54.16)     | 57.67 (57.07) |  45.76  | 52.60  | 54.00  |
-|       Baichuan2-13B-Base       |     -      |      2.6T       |             |    54.84 (59.17)     | 62.62 (61.97) |  52.08  | 58.25  | 58.10  |
+|              Model             |  Backbone  | Tokens Consumed |     MMLU (5-shot)    | CMMLU (5-shot)| AGIEval (5-shot) | GAOKAO (0-shot) | CEval (5-shot)  |
-|           ChatGLM-6B           |     -      |      1.0T       |             |    39.67 (40.63)     |   41.17 (-)   |  40.10  | 36.53  | 38.90  |
+| :----------------------------: | :--------: | :-------------: | :------------------: | :-----------: | :--------------: | :-------------: | :-------------: |
-|          ChatGLM2-6B           |     -      |      1.4T       |             |    44.74 (45.46)     |   49.40 (-)   |  46.36  | 45.49  | 51.70  |
+|          Baichuan-7B           |     -      |      1.2T       |    42.32 (42.30)     | 44.53 (44.02) |        38.72     |       36.74     |       42.80     |
-|          InternLM-7B           |     -      |      1.6T       |                |    46.70 (51.00)     |   52.00 (-)   |  44.77  | 61.64  | 52.80  |
+|       Baichuan-13B-Base        |     -      |      1.4T       |    50.51 (51.60)     | 55.73 (55.30) |        47.20     |       51.41     |       53.60     |
-|            Qwen-7B             |     -      |      2.2T       |             | 54.29 (56.70) | 56.03 (58.80) |  52.47  | 56.42  | 59.60  |
+|       Baichuan2-7B-Base        |     -      |      2.6T       |    46.97 (54.16)     | 57.67 (57.07) |        45.76     |       52.60     |       54.00     |
-|                                |            |                 |                 |                      |               |         |        |        |
+|       Baichuan2-13B-Base       |     -      |      2.6T       |    54.84 (59.17)     | 62.62 (61.97) |        52.08     |       58.25     |       58.10     |
-|           Llama-2-7B           |     -      |      2.0T       |             |    44.47 (45.30)     |   32.97 (-)   |  32.60  | 25.46  |   -    |
+|           ChatGLM-6B           |     -      |      1.0T       |    39.67 (40.63)     |   41.17 (-)   |        40.10     |       36.53     |       38.90     |
-| Linly-AI/Chinese-LLaMA-2-7B-hf | Llama-2-7B |      1.0T       |             |        37.43         |     29.92     |  32.00  | 27.57  |   -    |
+|          ChatGLM2-6B           |     -      |      1.4T       |    44.74 (45.46)     |   49.40 (-)   |        46.36     |       45.49     |       51.70     |
-| wenge-research/yayi-7b-llama2  | Llama-2-7B |        -        |                |        38.56         |     31.52     |  30.99  | 25.95  |   -    |
+|          InternLM-7B           |     -      |      1.6T       |    46.70 (51.00)     |   52.00 (-)   |        44.77     |       61.64     |       52.80     |
-| ziqingyang/chinese-llama-2-7b  | Llama-2-7B |        -        |                |        33.86         |     34.69     |  34.52  | 25.18  |  34.2  |
+|            Qwen-7B             |     -      |      2.2T       |        54.29 (56.70) | 56.03 (58.80) |        52.47     |       56.42     |       59.60     |
-| TigerResearch/tigerbot-7b-base | Llama-2-7B |      0.3T       |             |        43.73         |     42.04     |  37.64  | 30.61  |   -    |
+|           Llama-2-7B           |     -      |      2.0T       |    44.47 (45.30)     |   32.97 (-)   |        32.60     |       25.46     |         -       |
-|  LinkSoul/Chinese-Llama-2-7b   | Llama-2-7B |        -        |                |        48.41         |     38.31     |  38.45  | 27.72  |   -    |
+| Linly-AI/Chinese-LLaMA-2-7B-hf | Llama-2-7B |      1.0T       |        37.43         |     29.92     |        32.00     |       27.57     |         -       |
-|       FlagAlpha/Atom-7B        | Llama-2-7B |      0.1T       |             |        49.96         |     41.10     |  39.83  | 33.00  |   -    |
+| wenge-research/yayi-7b-llama2  | Llama-2-7B |        -        |        38.56         |     31.52     |        30.99     |       25.95     |         -       |
-| IDEA-CCNL/Ziya-LLaMA-13B-v1.1  | Llama-13B  |      0.11T      |            |        50.25         |     40.99     |  40.04  | 30.54  |   -    |
+| ziqingyang/chinese-llama-2-7b  | Llama-2-7B |        -        |        33.86         |     34.69     |        34.52     |       25.18     |        34.2     |
-|  |  |  |  |  |  |  |  |  |
+| TigerResearch/tigerbot-7b-base | Llama-2-7B |      0.3T       |        43.73         |     42.04     |        37.64     |       30.61     |         -       |
-|    **Colossal-LLaMA-2-7b-base**    | Llama-2-7B |      **0.0085T**      |            |        53.06         |     49.89     |  51.48  | 58.82  |  50.2  |
+|  LinkSoul/Chinese-Llama-2-7b   | Llama-2-7B |        -        |        48.41         |     38.31     |        38.45     |       27.72     |         -       |
+|       FlagAlpha/Atom-7B        | Llama-2-7B |      0.1T       |        49.96         |     41.10     |        39.83     |       33.00     |         -       |
+| IDEA-CCNL/Ziya-LLaMA-13B-v1.1  | Llama-13B  |      0.11T      |        50.25         |     40.99     |        40.04     |       30.54     |         -       |
+|  **Colossal-LLaMA-2-7b-base**  | Llama-2-7B |   **0.0085T**   |        53.06         |     49.89     |        51.48     |       58.82     |        50.2     |
 ### ColossalChat
@@ -208,7 +212,7 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的
 - [DreamBooth微调](https://github.com/hpcaitech/ColossalAI/tree/main/examples/images/dreambooth): 仅需3-5张目标主题图像个性化微调
-<p id="inference" align="center">
+<p id="inference-sd" align="center">
 <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/Stable%20Diffusion%20Inference.jpg" width=800/>
 </p>
@@ -260,6 +264,15 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的
 [[代码]](https://github.com/hpcaitech/ColossalAI/tree/example/llama/examples/language/llama)
 [[博客]](https://www.hpc-ai.tech/blog/large-model-pretraining)
+### MoE
+<p align="center">
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/examples/images/MOE_training.png" width=800/>
+</p>
+- 专家并行再升级，开源MoE模型训练效率提升9倍
+[[代码]](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/openmoe)
+[[博客]](https://www.hpc-ai.tech/blog/enhanced-moe-parallelism-open-source-moe-model-training-can-be-9-times-more-efficient)
 ### GPT-3
 <p align="center">
 <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/GPT3-v5.png" width=700/>
@@ -331,7 +344,12 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的
 <p align="right">(<a href="#top">返回顶端</a>)</p>
-## 推理 (Energon-AI) 样例展示
+## 推理
+<p id="SwiftInfer" align="center">
+<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/SwiftInfer.jpg" width=800/>
+</p>
+- [SwiftInfer](https://github.com/hpcaitech/SwiftInfer): 打破LLM多轮对话的长度限制，推理加速46%
 <p id="GPT-3-Inference" align="center">
 <img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/inference_GPT-3.jpg" width=800/>
@@ -357,7 +375,7 @@ Colossal-AI 为您提供了一系列并行组件。我们的目标是让您的
 环境要求:
- PyTorch >= 1.11 (PyTorch 2.x 正在适配中)
+- PyTorch >= 1.11 并且 PyTorch <= 2.1
 - Python >= 3.7
 - CUDA >= 11.0
 - [NVIDIA GPU Compute Capability](https://developer.nvidia.com/cuda-gpus) >= 7.0 (V100/RTX20 and higher)

--- a/docs/source/en/features/lazy_init.md
+++ b/docs/source/en/features/lazy_init.md
@@ -73,4 +73,4 @@ And some models are not supported at all which will raise an error. We tested mo
 | Blip2Model                    | transformers |
 | Blip2ForConditionalGeneration | transformers |
-<!-- doc-test-command: torchrun --standalone --nproc_per_node=2 lazy_iniy.py  -->
+<!-- doc-test-command: torchrun --standalone --nproc_per_node=2 lazy_init.py  -->
--- a/docs/source/en/features/pipeline_parallel.md
+++ b/docs/source/en/features/pipeline_parallel.md
@@ -167,7 +167,7 @@ plugin = HybridParallelPlugin(tp_size=1,
 booster = Booster(plugin=plugin)
 ```
-Boost these train componts with the booster created.
+Boost these train components with the booster created.
 ```python
 model, optimizer, _criterion, _, lr_scheduler = booster.boost(model,
                                                                optimizer,

--- a/docs/source/en/features/shardformer.md
+++ b/docs/source/en/features/shardformer.md
@@ -178,6 +178,18 @@ Model/Feature Compatibility Matrix:
    <td nowrap="nowrap" align="center">❌</td>
    <td nowrap="nowrap" align="center">❌</td>
  </tr>
+  <tr>
+    <td nowrap="nowrap">Falcon</td>
+    <td nowrap="nowrap" align="center">✔️</td>
+    <td nowrap="nowrap" align="center">✔️</td>
+    <td nowrap="nowrap" align="center">✔️</td>
+    <td nowrap="nowrap" align="center">✔️</td>
+    <td nowrap="nowrap" align="center">✔️</td>
+    <td nowrap="nowrap" align="center">❌</td>
+    <td nowrap="nowrap" align="center">✔️</td>
+    <td nowrap="nowrap" align="center">❌</td>
+    <td nowrap="nowrap" align="center">❌</td>
+  </tr>
  <tr>
    <td colspan="39"></td>
  </tr>

--- a/docs/source/en/get_started/installation.md
+++ b/docs/source/en/get_started/installation.md
 # Setup
 Requirements:
- PyTorch >= 1.11 (PyTorch 2.x in progress)
+- PyTorch >= 1.11 and PyTorch <= 2.1
 - Python >= 3.7
 - CUDA >= 11.0
 - [NVIDIA GPU Compute Capability](https://developer.nvidia.com/cuda-gpus) >= 7.0 (V100/RTX20 and higher)

--- a/docs/source/zh-Hans/basics/booster_api.md
+++ b/docs/source/zh-Hans/basics/booster_api.md
@@ -23,7 +23,7 @@
 Booster 插件是管理并行配置的重要组件（eg：gemini 插件封装了 gemini 加速方案）。目前支持的插件如下：
-**_HybridParallelPlugin:_** HybirdParallelPlugin 插件封装了混合并行的加速解决方案。它提供的接口可以在张量并行，流水线并行以及两种数据并行方法（DDP, Zero）间进行任意的组合。
+**_HybridParallelPlugin:_** HybridParallelPlugin 插件封装了混合并行的加速解决方案。它提供的接口可以在张量并行，流水线并行以及两种数据并行方法（DDP, Zero）间进行任意的组合。
 **_GeminiPlugin:_** GeminiPlugin 插件封装了 gemini 加速解决方案，即基于块内存管理的 ZeRO 优化方案。

--- a/docs/source/zh-Hans/features/lazy_init.md
+++ b/docs/source/zh-Hans/features/lazy_init.md
@@ -73,4 +73,4 @@ model, *_ = booster.boost(model)
 | Blip2Model                    | transformers |
 | Blip2ForConditionalGeneration | transformers |
-<!-- doc-test-command: torchrun --standalone --nproc_per_node=2 lazy_iniy.py  -->
+<!-- doc-test-command: torchrun --standalone --nproc_per_node=2 lazy_init.py  -->