Merge branch 'main' of https://github.com/oahzxl/ColossalAI into chunk

e532679c · oahzxl · c1492e50 · 7d5640b9 · e532679c · e532679c
Commit e532679c authored Jan 10, 2023 by oahzxl
20 changed files
--- a/examples/language/gpt/experiments/auto_parallel/gpt_modules.py
+++ b/examples/language/gpt/experiments/auto_parallel/gpt_modules.py
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+from transformers.activations import ACT2FN
+from transformers.models.gpt2.modeling_gpt2 import BaseModelOutputWithPastAndCrossAttentions, GPT2PreTrainedModel
+from transformers.pytorch_utils import Conv1D
+
+
+class GPT2MLP(nn.Module):
+
+    def __init__(self, intermediate_size, config):
+        super().__init__()
+        embed_dim = config.hidden_size
+        self.c_fc = Conv1D(intermediate_size, embed_dim)
+        self.c_proj = Conv1D(embed_dim, intermediate_size)
+        self.act = ACT2FN[config.activation_function]
+        self.dropout = nn.Dropout(config.resid_pdrop)
+
+    def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.FloatTensor:
+        hidden_states = self.c_fc(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.c_proj(hidden_states)
+        return hidden_states
+
+
+# The reason Why we don't import GPT2Attention from transformers directly is that:
+# 1. The tracer will not work correctly when we feed meta_args and concrete_args at same time,
+# so we have to build the customized GPT2Attention class and remove the conditional branch manually.
+# 2. The order of split and view op has been changed in the customized GPT2Attention class, the new
+# order is same as megatron-lm gpt model.
+class GPT2Attention(nn.Module):
+
+    def __init__(self, config, layer_idx=None):
+        super().__init__()
+
+        max_positions = config.max_position_embeddings
+        self.register_buffer(
+            "bias",
+            torch.tril(torch.ones((max_positions, max_positions),
+                                  dtype=torch.uint8)).view(1, 1, max_positions, max_positions),
+        )
+        self.register_buffer("masked_bias", torch.tensor(-1e4))
+
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        self.split_size = self.embed_dim
+        self.scale_attn_weights = config.scale_attn_weights
+
+        # Layer-wise attention scaling, reordering, and upcasting
+        self.scale_attn_by_inverse_layer_idx = config.scale_attn_by_inverse_layer_idx
+        self.layer_idx = layer_idx
+
+        self.c_attn = Conv1D(3 * self.embed_dim, self.embed_dim)
+        self.c_proj = Conv1D(self.embed_dim, self.embed_dim)
+
+        self.attn_dropout = nn.Dropout(config.attn_pdrop)
+        self.resid_dropout = nn.Dropout(config.resid_pdrop)
+
+        self.pruned_heads = set()
+
+    def _attn(self, query, key, value, attention_mask=None, head_mask=None):
+        attn_weights = torch.matmul(query, key.transpose(-1, -2))
+
+        if self.scale_attn_weights:
+            attn_weights = attn_weights / (value.size(-1)**0.5)
+
+        # Layer-wise attention scaling
+        if self.scale_attn_by_inverse_layer_idx:
+            attn_weights = attn_weights / float(self.layer_idx + 1)
+
+        # if only "normal" attention layer implements causal mask
+        query_length, key_length = query.size(-2), key.size(-2)
+        causal_mask = self.bias[:, :, key_length - query_length:key_length, :key_length].to(torch.bool)
+        attn_weights = torch.where(causal_mask, attn_weights, self.masked_bias.to(attn_weights.dtype))
+
+        if attention_mask is not None:
+            # Apply the attention mask
+            attn_weights = attn_weights + attention_mask
+
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+        attn_weights = attn_weights.type(value.dtype)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attn_weights = attn_weights * head_mask
+
+        attn_output = torch.matmul(attn_weights, value)
+
+        return attn_output, attn_weights
+
+    def _split_heads(self, tensor, num_heads, attn_head_size):
+        new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
+        tensor = tensor.view(new_shape)
+        return tensor.permute(0, 2, 1, 3)    # (batch, head, seq_length, head_features)
+
+    def _merge_heads(self, tensor, num_heads, attn_head_size):
+        tensor = tensor.permute(0, 2, 1, 3).contiguous()
+        new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,)
+        return tensor.view(new_shape)
+
+    def forward(
+        self,
+        hidden_states: Optional[Tuple[torch.FloatTensor]],
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+    ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]:
+
+        qkv = self.c_attn(hidden_states)
+        query, key, value = self._split_heads(qkv, self.num_heads, 3 * self.head_dim).split(self.head_dim, dim=3)
+        present = (key, value)
+        attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
+        attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
+        attn_output = self.c_proj(attn_output)
+        return attn_output
+
+
+class GPT2Block(nn.Module):
+
+    def __init__(self, config, layer_idx=None):
+        super().__init__()
+        hidden_size = config.hidden_size
+        inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size
+        self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+        self.attn = GPT2Attention(config, layer_idx=layer_idx)
+        self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+        self.mlp = GPT2MLP(inner_dim, config)
+
+    def forward(
+        self,
+        hidden_states: Optional[Tuple[torch.FloatTensor]],
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+    ) -> Union[Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]]:
+        residual = hidden_states
+        hidden_states = self.ln_1(hidden_states)
+        attn_outputs = self.attn(
+            hidden_states,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+        )
+        # residual connection
+        hidden_states = attn_outputs + residual
+        residual = hidden_states
+        hidden_states = self.ln_2(hidden_states)
+        feed_forward_hidden_states = self.mlp(hidden_states)
+        # residual connection
+        hidden_states = residual + feed_forward_hidden_states
+
+        return hidden_states
+
+
+class GPT2Model(GPT2PreTrainedModel):
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.embed_dim = config.hidden_size
+
+        self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
+        self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
+
+        self.drop = nn.Dropout(config.embd_pdrop)
+        self.h = nn.ModuleList([GPT2Block(config, layer_idx=i) for i in range(config.num_hidden_layers)])
+        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
+        input_shape = input_ids.size()
+        input_ids = input_ids.view(-1, input_shape[-1])
+        batch_size = input_ids.shape[0]
+
+        device = input_ids.device
+
+        past_length = 0
+        past_key_values = tuple([None] * len(self.h))
+
+        position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
+        position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])
+
+        # GPT2Attention mask.
+        attention_mask = attention_mask.view(batch_size, -1)
+        attention_mask = attention_mask[:, None, None, :]
+        attention_mask = attention_mask.to(dtype=self.dtype)    # fp16 compatibility
+        attention_mask = (1.0 - attention_mask) * -10000.0
+
+        encoder_attention_mask = None
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # head_mask has shape n_layer x batch x n_heads x N x N
+        head_mask = self.get_head_mask(head_mask, self.config.n_layer)
+        inputs_embeds = self.wte(input_ids)
+        position_embeds = self.wpe(position_ids)
+
+        hidden_states = inputs_embeds + position_embeds
+
+        output_shape = input_shape + (hidden_states.size(-1),)
+
+        for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
+            outputs = block(hidden_states, attention_mask=attention_mask, head_mask=head_mask[i])
+            hidden_states = outputs
+
+        hidden_states = self.ln_f(hidden_states)
+        hidden_states = hidden_states.view(output_shape)
+
+        return hidden_states
+
+
+class GPT2LMHeadModel(GPT2PreTrainedModel):
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.transformer = GPT2Model(config)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+    ):
+        transformer_outputs = self.transformer(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+        )
+        lm_logits = self.lm_head(transformer_outputs)
+
+        return lm_logits
+
+
+class GPTLMLoss(nn.Module):
+
+    def __init__(self):
+        super().__init__()
+        self.loss_fn = nn.CrossEntropyLoss()
+
+    def forward(self, logits, labels):
+        shift_logits = logits[..., :-1, :].contiguous()
+        shift_labels = labels[..., 1:].contiguous()
+        # Flatten the tokens
+        return self.loss_fn(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
--- a/examples/language/gpt/experiments/auto_parallel/requirements.txt
+++ b/examples/language/gpt/experiments/auto_parallel/requirements.txt
+colossalai >= 0.1.12
+torch >= 1.8.1
+transformers >= 4.231
+PuLP >= 2.7.0
--- a/examples/language/gpt/experiments/pipeline_parallel/README.md
+++ b/examples/language/gpt/experiments/pipeline_parallel/README.md
+# Pipeline Parallelism Demo with GPT2
+
+## Requirements
+
+Before you can launch training, you need to install the following requirements.
+
+### Install PyTorch
+
+```bash
+#conda
+conda install pytorch==1.12.0 torchvision==0.13.0 torchaudio==0.12.0 cudatoolkit=11.3 -c pytorch
+#pip
+pip install torch==1.12.0+cu113 torchvision==0.13.0+cu113 torchaudio==0.12.0 --extra-index-url https://download.pytorch.org/whl/cu113
+```
+
+### Install [Colossal-AI v0.2.0](https://colossalai.org/download/) From Official Website
+
+```bash
+pip install colossalai==0.2.0+torch1.12cu11.3 -f https://release.colossalai.org
+```
+
+### Install transformers
+
+```bash
+pip install transformers
+```
+
+## Dataset
+
+For simplicity, the input data is randonly generated here.
+
+## Training
+
+```bash
+#Run the Pipeline Parallel on GPT with default setting and a dummy dataset.
+#You can change the GPU number or microbatch number in the run.sh .
+bash run.sh
+```
--- a/examples/language/gpt/experiments/pipeline_parallel/model_zoo.py
+++ b/examples/language/gpt/experiments/pipeline_parallel/model_zoo.py
+from torch import nn
+from transformers import GPT2Config, GPT2LMHeadModel
+
+
+## Define the Model and Loss Based on Huggingface transformers GPT2LMHeadModel
+class GPTLMModel(nn.Module):
+
+    def __init__(self,
+                 hidden_size=768,
+                 num_layers=12,
+                 num_attention_heads=12,
+                 max_seq_len=1024,
+                 vocab_size=50257,
+                 checkpoint=False):
+        super().__init__()
+        self.checkpoint = checkpoint
+        self.config = GPT2Config(n_embd=hidden_size,
+                                 n_layer=num_layers,
+                                 n_head=num_attention_heads,
+                                 n_positions=max_seq_len,
+                                 n_ctx=max_seq_len,
+                                 vocab_size=vocab_size)
+        self.model = GPT2LMHeadModel(self.config)
+        if checkpoint:
+            self.model.gradient_checkpointing_enable()
+
+    def forward(self, input_ids, attention_mask):
+        # Only return lm_logits
+        return self.model(input_ids=input_ids, attention_mask=attention_mask, use_cache=not self.checkpoint)[0]
+
+
+def gpt2_medium(checkpoint=False):
+    return GPTLMModel(hidden_size=1024, num_layers=24, num_attention_heads=16, checkpoint=checkpoint)
+
+
+def gpt2_xl(checkpoint=True):
+    return GPTLMModel(hidden_size=1600, num_layers=48, num_attention_heads=32, checkpoint=checkpoint)
+
+
+def gpt2_10b(checkpoint=True):
+    return GPTLMModel(hidden_size=4096, num_layers=50, num_attention_heads=16, checkpoint=checkpoint)
+
+
+def gpt2_14b(checkpoint=True):
+    return GPTLMModel(hidden_size=4096, num_layers=70, num_attention_heads=16, checkpoint=checkpoint)
+
+
+def gpt2_20b(checkpoint=True):
+    return GPTLMModel(hidden_size=8192, num_layers=25, num_attention_heads=16, checkpoint=checkpoint)
+
+
+def gpt2_24b(checkpoint=True):
+    return GPTLMModel(hidden_size=8192, num_layers=30, num_attention_heads=16, checkpoint=checkpoint)
+
+
+def model_builder(model_size: str) -> callable:
+    if model_size == "gpt2_medium":
+        return gpt2_medium
+    elif model_size == "gpt2_xl":
+        return gpt2_xl
+    elif model_size == "gpt2_10b":
+        return gpt2_10b
+    elif model_size == "gpt2_14b":
+        return gpt2_14b
+    elif model_size == "gpt2_20b":
+        return gpt2_20b
+    elif model_size == "gpt2_24b":
+        return gpt2_24b
+    else:
+        raise TypeError(f"model_builder {model_size}")
+
+
+__all__ = ['model_builder']
--- a/examples/language/gpt/experiments/pipeline_parallel/run.sh
+++ b/examples/language/gpt/experiments/pipeline_parallel/run.sh
+export GPUNUM=${GPUNUM:-4}
+export BATCH_SIZE=${BATCH_SIZE:-16}
+export MODEL_TYPE=${MODEL_TYPE:-"gpt2_medium"}
+export NUM_MICROBATCH=${NUM_MICROBATCH:-8}
+
+mkdir -p pp_logs
+python train_gpt_pp.py --device="cuda" --model_type=${MODEL_TYPE} --num_microbatches=${NUM_MICROBATCH} --world_size=${GPUNUM} --batch_size=${BATCH_SIZE} 2>&1 | tee ./pp_logs/${MODEL_TYPE}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_nm_${NUM_MICROBATCH}.log
--- a/examples/language/gpt/experiments/pipeline_parallel/train_gpt_pp.py
+++ b/examples/language/gpt/experiments/pipeline_parallel/train_gpt_pp.py
+import argparse
+import time
+from functools import partial
+
+import torch
+from model_zoo import model_builder
+from torch import nn
+from tqdm import tqdm
+
+from colossalai.fx import ColoTracer
+from colossalai.fx.passes.adding_split_node_pass import avgnode_split_pass, split_with_split_nodes_pass
+from colossalai.logging import disable_existing_loggers, get_dist_logger
+from colossalai.nn.optimizer import HybridAdam
+from colossalai.pipeline.middleware.adaptor import get_fx_topology
+from colossalai.pipeline.rpc._pipeline_schedule import OneFOneBPipelineEngine
+from colossalai.pipeline.rpc.utils import rpc_run
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--model_type', type=str, default="gpt2_medium")
+    parser.add_argument('--world_size', type=int, default=2)
+    parser.add_argument('--batch_size', type=int, default=16)
+    parser.add_argument('--dp_degree', type=int, default=1)
+    parser.add_argument('--tp_degree', type=int, default=1)
+    parser.add_argument('--num_microbatches', type=int, default=2)
+    parser.add_argument('--device', type=str, choices=['cpu', 'cuda'], default='cuda')
+    parser.add_argument('--master_addr', type=str, default='localhost')
+    parser.add_argument('--master_port', type=str, default='29011')
+    parser.add_argument('--num_worker_threads', type=int, default=128)
+    return parser.parse_args()
+
+
+class GPTLMLoss(nn.Module):
+
+    def __init__(self):
+        super().__init__()
+        self.loss_fn = nn.CrossEntropyLoss()
+
+    def forward(self, logits, labels):
+        shift_logits = logits[..., :-1, :].contiguous()
+        shift_labels = labels[..., 1:].contiguous()
+        # Flatten the tokens
+        return self.loss_fn(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+
+
+# Randomly Generated Data
+def get_data(batch_size, seq_len, vocab_size):
+    input_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device=torch.cuda.current_device())
+    attention_mask = torch.ones_like(input_ids)
+    return input_ids, attention_mask
+
+
+def get_tflops(model_numel, batch_size, seq_len, step_time):
+    return model_numel * batch_size * seq_len * 8 / 1e12 / (step_time + 1e-12)
+
+
+def create_partition_module(pp_rank: int, stage_num: int, model, data_kwargs):
+    tracer = ColoTracer()
+    meta_args = {k: v.to('meta') for k, v in data_kwargs.items()}
+    graph = tracer.trace(root=model, meta_args=meta_args)
+    gm = torch.fx.GraphModule(model, graph, model.__class__.__name__)
+    annotated_model = avgnode_split_pass(gm, stage_num)
+
+    top_module, split_submodules = split_with_split_nodes_pass(annotated_model, merge_output=True)
+    topo = get_fx_topology(top_module)
+    for submodule in split_submodules:
+        if isinstance(submodule, torch.fx.GraphModule):
+            setattr(submodule, '_topo', topo)
+    return split_submodules[pp_rank + 1]
+
+
+def partition(model, data_kwargs, pp_rank: int, chunk: int, stage_num: int):
+    module = create_partition_module(pp_rank, stage_num, model, data_kwargs)
+    return module
+
+
+def run_master(args):
+    batch_size = args.batch_size
+    device = args.device
+    world_size = args.world_size
+    stage_num = world_size
+    num_microbatches = args.num_microbatches
+    model_type = args.model_type
+    # batch size per DP degree
+    SEQ_LEN = 1024
+    VOCAB_SIZE = 50257
+    NUM_STEPS = 10
+    WARMUP_STEPS = 1
+
+    disable_existing_loggers()
+    logger = get_dist_logger()
+    logger.info(f"{args.model_type}, batch size {batch_size}, num stage {stage_num}, num microbatch {num_microbatches}",
+                ranks=[0])
+
+    torch.manual_seed(123)
+
+    # build criterion
+    criterion = GPTLMLoss()
+
+    # warm up pipeline fx partition
+    input_ids, attn_mask = get_data(batch_size, SEQ_LEN, VOCAB_SIZE)
+    warmup_data_kwargs = {'input_ids': input_ids, 'attention_mask': attn_mask}
+
+    # create model
+    model = model_builder(model_type)(checkpoint=False)
+
+    # set 1f1b pipeline engine
+    pp_engine = OneFOneBPipelineEngine(partition_fn=partial(partition, model, warmup_data_kwargs),
+                                       stage_num=stage_num,
+                                       num_microbatches=num_microbatches,
+                                       device=device,
+                                       chunk=1,
+                                       criterion=criterion,
+                                       metric=None,
+                                       checkpoint=False)
+
+    partition_numels = pp_engine.remote_numels()
+    for rank, numel in partition_numels.items():
+        logger.info(f'{rank=} numel in the partition:{numel}')
+
+    # build optim
+    pp_engine.initialize_optimizer(HybridAdam, lr=1e-3)
+
+    ranks_tflops = {}
+    for n in range(NUM_STEPS):
+        # we just use randomly generated data here
+        input_ids, attn_mask = get_data(batch_size, SEQ_LEN, VOCAB_SIZE)
+        batch = {'input_ids': input_ids, 'attention_mask': attn_mask}
+
+        start = time.time()
+        outputs = pp_engine.forward_backward(batch=batch, labels=input_ids, forward_only=False)
+        step_time = time.time() - start
+
+        for rank, numel in partition_numels.items():
+            if rank not in ranks_tflops:
+                ranks_tflops[rank] = []
+            step_tflops = get_tflops(numel, batch_size, SEQ_LEN, step_time)
+
+            logger.info(
+                f"Rank{rank} , [{n + 1}/{NUM_STEPS}] , Step time: {step_time:.3f}s, TFLOPS: {get_tflops(numel, batch_size, SEQ_LEN, step_time):.3f}",
+                ranks=[0],
+            )
+
+            if n >= WARMUP_STEPS:
+                ranks_tflops[rank].append(step_tflops)
+
+    median_index = ((NUM_STEPS - WARMUP_STEPS) >> 1) + WARMUP_STEPS
+    gpu_tflops = []
+    for rank, tflops_list in ranks_tflops.items():
+        tflops_list.sort()
+        gpu_tflops.append(tflops_list[median_index])
+        logger.info(f"GPU{rank} Median TFLOPS is {tflops_list[median_index]:.3f}")
+
+    logger.info(f"Total TFLOPS is {sum(gpu_tflops):.3f}")
+    logger.info(f"Avg TFLOPS per GPU is {sum(gpu_tflops) / world_size:.3f}")
+
+
+if __name__ == '__main__':
+    args = parse_args()
+    rpc_run(args, run_master)
--- a/examples/language/gpt/gemini/benchmark_gemini.sh
+++ b/examples/language/gpt/gemini/benchmark_gemini.sh
+for MODEL_TYPE in "gpt2_medium"; do
+  for DISPAN in "colossalai"; do
+    for BATCH_SIZE in 16; do
+      for GPUNUM in 1 2 4 8; do
+        for TPDEGREE in 1 2 4 8; do
+          if [ ${TPDEGREE} -gt ${GPUNUM} ]; then
+            continue
+          fi
+          for PLACEMENT in "cpu" "auto"; do
+            echo "****************** Begin ***************************"
+            echo "+ benchmrking MODEL ${MODEL_TYPE} DISPAN ${DISPAN} GPU ${GPUNUM} BS ${BATCH_SIZE} TP ${TPDEGREE} POLICY ${PLACEMENT}"
+            MODEL_TYPE=${MODEL_TYPE} DISPAN=${DISPAN} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE} PLACEMENT=${PLACEMENT} \
+            bash ./run_gemini.sh
+            echo "****************** Finished ***************************"
+            echo ""
+            echo ""
+          done
+        done
+      done
+    done
+  done
+done
--- a/examples/language/gpt/gemini/commons/model_zoo.py
+++ b/examples/language/gpt/gemini/commons/model_zoo.py
+from torch import nn
+from transformers import GPT2Config, GPT2LMHeadModel
+
+
+## Define the Model and Loss Based on Huggingface transformers GPT2LMHeadModel
+class GPTLMModel(nn.Module):
+
+    def __init__(self,
+                 hidden_size=768,
+                 num_layers=12,
+                 num_attention_heads=12,
+                 max_seq_len=1024,
+                 vocab_size=50257,
+                 checkpoint=False):
+        super().__init__()
+        self.checkpoint = checkpoint
+        self.config = GPT2Config(n_embd=hidden_size,
+                                 n_layer=num_layers,
+                                 n_head=num_attention_heads,
+                                 n_positions=max_seq_len,
+                                 n_ctx=max_seq_len,
+                                 vocab_size=vocab_size)
+        self.model = GPT2LMHeadModel(self.config)
+        if checkpoint:
+            self.model.gradient_checkpointing_enable()
+
+    def forward(self, input_ids, attention_mask):
+        # Only return lm_logits
+        return self.model(input_ids=input_ids, attention_mask=attention_mask, use_cache=not self.checkpoint)[0]
+
+
+def gpt2_medium(checkpoint=False):
+    return GPTLMModel(hidden_size=1024, num_layers=24, num_attention_heads=16, checkpoint=checkpoint)
+
+
+def gpt2_xl(checkpoint=True):
+    return GPTLMModel(hidden_size=1600, num_layers=48, num_attention_heads=32, checkpoint=checkpoint)
+
+
+def gpt2_10b(checkpoint=True):
+    return GPTLMModel(hidden_size=4096, num_layers=50, num_attention_heads=16, checkpoint=checkpoint)
+
+
+def gpt2_14b(checkpoint=True):
+    return GPTLMModel(hidden_size=4096, num_layers=70, num_attention_heads=16, checkpoint=checkpoint)
+
+
+def gpt2_20b(checkpoint=True):
+    return GPTLMModel(hidden_size=8192, num_layers=25, num_attention_heads=16, checkpoint=checkpoint)
+
+
+def gpt2_24b(checkpoint=True):
+    return GPTLMModel(hidden_size=8192, num_layers=30, num_attention_heads=16, checkpoint=checkpoint)
+
+
+def model_builder(model_size: str) -> callable:
+    if model_size == "gpt2_medium":
+        return gpt2_medium
+    elif model_size == "gpt2_xl":
+        return gpt2_xl
+    elif model_size == "gpt2_10b":
+        return gpt2_10b
+    elif model_size == "gpt2_14b":
+        return gpt2_14b
+    elif model_size == "gpt2_20b":
+        return gpt2_20b
+    elif model_size == "gpt2_24b":
+        return gpt2_24b
+    else:
+        raise TypeError(f"model_builder {model_size}")
+
+
+__all__ = ['model_builder']
--- a/examples/language/gpt/gemini/commons/utils.py
+++ b/examples/language/gpt/gemini/commons/utils.py
+import torch
+
+
+# Randomly Generated Data
+def get_data(batch_size, seq_len, vocab_size):
+    input_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device=torch.cuda.current_device())
+    attention_mask = torch.ones_like(input_ids)
+    return input_ids, attention_mask
+
+
+def get_tflops(model_numel, batch_size, seq_len, step_time):
+    return model_numel * batch_size * seq_len * 8 / 1e12 / (step_time + 1e-12)
--- a/examples/language/gpt/gemini/run_gemini.sh
+++ b/examples/language/gpt/gemini/run_gemini.sh
+set -x
+# distplan in ["colossalai", "zero1", "zero2", "torch_ddp", "torch_zero"]
+export DISTPAN=${DISTPAN:-"colossalai"}
+
+# The following options only valid when DISTPAN="colossalai"
+export GPUNUM=${GPUNUM:-1}
+export TPDEGREE=${TPDEGREE:-1}
+export PLACEMENT=${PLACEMENT:-"cpu"}
+export USE_SHARD_INIT=${USE_SHARD_INIT:-False}
+export BATCH_SIZE=${BATCH_SIZE:-16}
+export MODEL_TYPE=${MODEL_TYPE:-"gpt2_medium"}
+
+# export PYTHONPATH=$PWD:$PYTHONPATH
+
+mkdir -p gemini_logs
+
+torchrun --standalone --nproc_per_node=${GPUNUM} ./train_gpt_demo.py \
+--tp_degree=${TPDEGREE} \
+--model_type=${MODEL_TYPE} \
+--batch_size=${BATCH_SIZE} \
+--placement=${PLACEMENT} \
+--shardinit=${USE_SHARD_INIT} \
+--distplan=${DISTPAN} \
+2>&1 | tee ./gemini_logs/${MODEL_TYPE}_${DISTPAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}_${PLACEMENT}.log
--- a/examples/language/gpt/gemini/train_gpt_demo.py
+++ b/examples/language/gpt/gemini/train_gpt_demo.py
+import os
+from functools import partial
+from time import time
+
+import psutil
+import torch
+import torch.nn as nn
+from commons.model_zoo import model_builder
+from commons.utils import get_data, get_tflops
+from packaging import version
+from torch.nn.parallel import DistributedDataParallel as DDP
+
+import colossalai
+from colossalai.logging import disable_existing_loggers, get_dist_logger
+from colossalai.nn.parallel import ZeroDDP
+from colossalai.tensor import ColoParameter, ComputePattern, ComputeSpec, ProcessGroup, ReplicaSpec, ShardSpec
+from colossalai.utils import get_current_device
+from colossalai.utils.model.colo_init_context import ColoInitContext
+
+CAI_VERSION = colossalai.__version__
+
+if version.parse(CAI_VERSION) > version.parse("0.1.10"):
+    # These are added after 0.1.10
+    from colossalai.nn.optimizer.gemini_optimizer import GeminiAdamOptimizer
+    from colossalai.nn.parallel import GeminiDDP
+    from colossalai.zero.sharded_optim import LowLevelZeroOptimizer
+
+
+def parse_args():
+    parser = colossalai.get_default_parser()
+    parser.add_argument(
+        "--distplan",
+        type=str,
+        default='colossalai',
+        help="The distributed plan [colossalai, zero1, zero2, torch_ddp, torch_zero].",
+    )
+    parser.add_argument(
+        "--tp_degree",
+        type=int,
+        default=1,
+        help="Tensor Parallelism Degree. Valid when using colossalai as dist plan.",
+    )
+    parser.add_argument(
+        "--placement",
+        type=str,
+        default='cpu',
+        help="Placement Policy for Gemini. Valid when using colossalai as dist plan.",
+    )
+    parser.add_argument(
+        "--shardinit",
+        type=bool,
+        default=False,
+        help=
+        "Shard the tensors when init the model to shrink peak memory size on the assigned device. Valid when using colossalai as dist plan.",
+    )
+    parser.add_argument(
+        "--batch_size",
+        type=int,
+        default=8,
+        help="batch size per DP group of training.",
+    )
+    parser.add_argument(
+        "--model_type",
+        type=str,
+        default="gpt2_medium",
+        help="model model scale",
+    )
+    args = parser.parse_args()
+    return args
+
+
+# Parameter Sharding Strategies for Tensor Parallelism
+def split_param_single_dim_tp1d(dim: int, param: ColoParameter, pg: ProcessGroup):
+    spec = (ShardSpec([dim], [pg.tp_world_size()]), ComputeSpec(ComputePattern.TP1D))
+    param.set_tensor_spec(*spec)
+
+
+def split_param_row_tp1d(param: ColoParameter, pg: ProcessGroup):
+    split_param_single_dim_tp1d(0, param, pg)
+
+
+def split_param_col_tp1d(param: ColoParameter, pg: ProcessGroup):
+    split_param_single_dim_tp1d(-1, param, pg)
+
+
+class GPTLMLoss(nn.Module):
+
+    def __init__(self):
+        super().__init__()
+        self.loss_fn = nn.CrossEntropyLoss()
+
+    def forward(self, logits, labels):
+        shift_logits = logits[..., :-1, :].contiguous()
+        shift_labels = labels[..., 1:].contiguous()
+        # Flatten the tokens
+        return self.loss_fn(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+
+
+def get_cpu_mem():
+    return psutil.Process().memory_info().rss / 1024**2
+
+
+def get_gpu_mem():
+    return torch.cuda.memory_allocated() / 1024**2
+
+
+def get_mem_info(prefix=''):
+    return f'{prefix}GPU memory usage: {get_gpu_mem():.2f} MB, CPU memory usage: {get_cpu_mem():.2f} MB'
+
+
+def get_model_size(model: nn.Module):
+    total_numel = 0
+    for module in model.modules():
+        for p in module.parameters(recurse=False):
+            total_numel += p.numel()
+    return total_numel
+
+
+def model_size_formatter(numel: int) -> str:
+    GB_SIZE = 10**9
+    MB_SIZE = 10**6
+    KB_SIZE = 10**3
+    if numel >= GB_SIZE:
+        return f'{numel / GB_SIZE:.1f}B'
+    elif numel >= MB_SIZE:
+        return f'{numel / MB_SIZE:.1f}M'
+    elif numel >= KB_SIZE:
+        return f'{numel / KB_SIZE:.1f}K'
+    else:
+        return str(numel)
+
+
+def set_cpu_maximum_parallelism():
+    conf_str = torch.__config__.parallel_info()
+    inter_str = conf_str.split("hardware_concurrency() : ")[1]
+    max_concurrency = inter_str.split('\n')[0]
+    os.environ["OMP_NUM_THREADS"] = max_concurrency
+    print(f"environmental variable OMP_NUM_THREADS is set to {max_concurrency}.")
+
+
+# Tensor Parallel
+def tensor_parallelize(model: torch.nn.Module, pg: ProcessGroup):
+    """tensor_parallelize
+    Sharding the Model Parameters.
+
+    Args:
+        model (torch.nn.Module): a torch module to be sharded
+    """
+    for mn, module in model.named_modules():
+        for pn, param in module.named_parameters(recurse=False):
+            # NOTE() a param maybe shared by two modules
+            if hasattr(param, 'visited'):
+                continue
+
+            # if shard init, then convert param to replica and use the dp-only ProcessGroup
+            param: ColoParameter = param
+            param.set_dist_spec(ReplicaSpec())
+            param.set_process_group(pg)
+
+            # shard it w.r.t tp pattern
+            if 'mlp.c_fc' in mn:
+                if 'weight' in pn or 'bias' in pn:
+                    split_param_col_tp1d(param, pg)    # colmn slice
+                    # keep the shape of the output from c_fc
+                    param.compute_spec.set_output_replicate(False)
+                else:
+                    param.set_dist_spec(ReplicaSpec())
+            elif 'mlp.c_proj' in mn:
+                if 'weight' in pn:
+                    split_param_row_tp1d(param, pg)    # row slice
+                else:
+                    param.set_dist_spec(ReplicaSpec())
+            elif 'wte' in mn or 'wpe' in mn:
+                split_param_col_tp1d(param, pg)    # colmn slice
+            elif 'c_attn' in mn or 'c_proj' in mn:
+                split_param_col_tp1d(param, pg)    # colmn slice
+            else:
+                param.set_dist_spec(ReplicaSpec())
+            param.visited = True
+
+
+# Gemini + ZeRO DDP
+def build_gemini(model: torch.nn.Module, pg: ProcessGroup, placement_policy: str = "auto"):
+    fp16_init_scale = 2**5
+    gpu_margin_mem_ratio_for_auto = 0
+
+    if version.parse(CAI_VERSION) > version.parse("0.1.10"):
+        model = GeminiDDP(model,
+                          device=get_current_device(),
+                          placement_policy=placement_policy,
+                          pin_memory=True,
+                          hidden_dim=model.config.n_embd,
+                          search_range_mb=64)
+        # configure the const policy
+        if placement_policy == 'const':
+            model.gemini_manager._placement_policy.set_const_memory_boundary(2 * 1024)
+        # build a highly optimized cpu optimizer
+        optimizer = GeminiAdamOptimizer(model,
+                                        lr=1e-3,
+                                        initial_scale=fp16_init_scale,
+                                        gpu_margin_mem_ratio=gpu_margin_mem_ratio_for_auto)
+    elif version.parse("0.1.9") <= version.parse(CAI_VERSION) <= version.parse("0.1.10"):
+        from colossalai.gemini import ChunkManager, GeminiManager
+        from colossalai.nn.optimizer import HybridAdam
+        from colossalai.zero import ZeroOptimizer
+        chunk_size = ChunkManager.search_chunk_size(model, 64 * 1024**2, 1024, filter_exlarge_params=True)
+        chunk_manager = ChunkManager(chunk_size,
+                                     pg,
+                                     enable_distributed_storage=True,
+                                     init_device=GeminiManager.get_default_device(placement_policy))
+        gemini_manager = GeminiManager(placement_policy, chunk_manager)
+        model = ZeroDDP(model, gemini_manager)
+        optimizer = HybridAdam(model.parameters(), lr=1e-3)
+        optimizer = ZeroOptimizer(optimizer,
+                                  model,
+                                  initial_scale=fp16_init_scale,
+                                  gpu_margin_mem_ratio=gpu_margin_mem_ratio_for_auto)
+    else:
+        raise NotImplemented(f"CAI version {CAI_VERSION} is not supported")
+    return model, optimizer
+
+
+def main():
+    # version check
+    # this example is supposed to work for versions greater than 0.1.9
+    assert version.parse(CAI_VERSION) >= version.parse("0.1.9")
+
+    set_cpu_maximum_parallelism()
+    args = parse_args()
+
+    if args.distplan not in ["colossalai", "torch_ddp", "torch_zero", "zero1", "zero2"]:
+        raise TypeError(f"{args.distplan} is error")
+
+    # batch size per DP degree
+    BATCH_SIZE = args.batch_size
+    SEQ_LEN = 1024
+    VOCAB_SIZE = 50257
+
+    NUM_STEPS = 10
+    WARMUP_STEPS = 1
+    assert WARMUP_STEPS < NUM_STEPS, "warmup steps should smaller than the total steps"
+    assert (NUM_STEPS - WARMUP_STEPS) % 2 == 1, "the number of valid steps should be odd to take the median "
+
+    disable_existing_loggers()
+    colossalai.launch_from_torch(config={})
+
+    logger = get_dist_logger()
+    logger.info(f"{args.model_type}, {args.distplan}, batch size {BATCH_SIZE}", ranks=[0])
+
+    # build criterion
+    criterion = GPTLMLoss()
+
+    torch.manual_seed(123)
+    if args.distplan == "colossalai":
+        # all param must use the same process group.
+        world_size = torch.distributed.get_world_size()
+        shard_pg = ProcessGroup(tp_degree=world_size)
+        default_dist_spec = ShardSpec([-1], [world_size]) if args.shardinit else None
+
+        # build GPT model
+        if version.parse(CAI_VERSION) > version.parse("0.1.10"):
+            with ColoInitContext(device=get_current_device(),
+                                 dtype=torch.half,
+                                 default_dist_spec=default_dist_spec,
+                                 default_pg=shard_pg):
+                model = model_builder(args.model_type)(checkpoint=True)
+        else:
+            with ColoInitContext(device=get_current_device()):
+                model = model_builder(args.model_type)(checkpoint=True)
+
+        tp_pg = ProcessGroup(tp_degree=args.tp_degree)
+        # Tensor Parallelism (TP)
+        # You should notice that v0.1.10 is not compatible with TP degree > 1
+        tensor_parallelize(model, tp_pg)
+
+        # build a Gemini model and a highly optimized cpu optimizer
+        # Gemini + ZeRO DP, Note it must be used after TP
+        model, optimizer = build_gemini(model, tp_pg, args.placement)
+
+        logger.info(get_mem_info(prefix='After init optim, '), ranks=[0])
+    else:
+        assert args.tp_degree == 1, "The degree of TP should be 1 for DDP examples."
+        model = model_builder(args.model_type)(checkpoint=True).cuda()
+
+    if args.distplan.startswith("torch"):
+        model = DDP(model)
+        if args.distplan.endswith("ddp"):
+            optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
+        elif args.distplan.endswith("zero"):
+            from torch.distributed.optim import ZeroRedundancyOptimizer
+            optimizer = ZeroRedundancyOptimizer(model.parameters(), optimizer_class=torch.optim.Adam, lr=0.01)
+    elif args.distplan.startswith("zero"):
+        partition_flag = args.distplan == "zero2"
+        optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
+        optimizer = LowLevelZeroOptimizer(optimizer,
+                                          overlap_communication=True,
+                                          partition_grad=partition_flag,
+                                          verbose=True)
+
+    # model is shared after TP
+    numel = get_model_size(model)
+    logger.info(f"the size of testing model size is {model_size_formatter(numel)}.")
+    logger.info(get_mem_info(prefix='After init model, '), ranks=[0])
+
+    # Tflops_per_GPU = global_batch * global_numel * seq_len * 8 / #gpu
+    # = (batch_per_DP_group * dp_degree) * (numel * tp_degree) * seq_len * 8 / (tp_degree * dp_degree)
+    # = batch_per_DP_group * numel * seq_len * 8
+    get_tflops_func = partial(get_tflops, numel, BATCH_SIZE, SEQ_LEN)
+
+    torch.cuda.synchronize()
+    model.train()
+    tflops_list = []
+    for n in range(NUM_STEPS):
+        # we just use randomly generated data here
+        input_ids, attn_mask = get_data(BATCH_SIZE, SEQ_LEN, VOCAB_SIZE)
+        optimizer.zero_grad()
+
+        start = time()
+        outputs = model(input_ids, attn_mask)
+        loss = criterion(outputs, input_ids)
+        torch.cuda.synchronize()
+        fwd_end = time()
+        fwd_time = fwd_end - start
+        logger.info(get_mem_info(prefix=f'[{n + 1}/{NUM_STEPS}] Forward '), ranks=[0])
+
+        if args.distplan in ["colossalai", "zero1", "zero2"]:
+            optimizer.backward(loss)
+        elif args.distplan in ["torch_ddp", "torch_zero"]:
+            loss.backward()
+        torch.cuda.synchronize()
+        bwd_end = time()
+        bwd_time = bwd_end - fwd_end
+        logger.info(get_mem_info(prefix=f'[{n + 1}/{NUM_STEPS}] Backward '), ranks=[0])
+
+        if args.distplan in ["zero1", "zero2"]:
+            optimizer.sync_grad()
+        optimizer.step()
+        torch.cuda.synchronize()
+        optim_time = time() - bwd_end
+        step_time = time() - start
+        logger.info(get_mem_info(prefix=f'[{n + 1}/{NUM_STEPS}] Optimizer step '), ranks=[0])
+
+        step_tflops = get_tflops_func(step_time)
+        logger.info(
+            f"[{n + 1}/{NUM_STEPS}] Loss:{loss.item():.3f}, Step time: {step_time:.3f}s, TFLOPS: {get_tflops_func(step_time):.3f}, FWD time: {fwd_time:.3f}s, BWD time: {bwd_time:.3f}s, OPTIM time: {optim_time:.3f}s",
+            ranks=[0],
+        )
+        if n >= WARMUP_STEPS:
+            tflops_list.append(step_tflops)
+
+    tflops_list.sort()
+    median_index = ((NUM_STEPS - WARMUP_STEPS) >> 1) + WARMUP_STEPS
+    logger.info(f"Median TFLOPS is {tflops_list[median_index]:.3f}")
+    torch.cuda.synchronize()
+
+
+if __name__ == '__main__':
+    main()
--- a/examples/language/gpt/requirements.txt
+++ b/examples/language/gpt/requirements.txt
+transformers >= 4.23
--- a/examples/language/gpt/test_ci.sh
+++ b/examples/language/gpt/test_ci.sh
+pip install -r requirements.txt
+
+# distplan in ["colossalai", "zero1", "zero2", "torch_ddp", "torch_zero"]
+export DISTPAN="colossalai"
+
+# The following options only valid when DISTPAN="colossalai"
+export TPDEGREE=2
+export GPUNUM=4
+export PLACEMENT='cpu'
+export USE_SHARD_INIT=False
+export BATCH_SIZE=8
+export MODEL_TYPE="gpt2_medium"
+
+
+mkdir -p logs
+torchrun --standalone --nproc_per_node=${GPUNUM} train_gpt_demo.py --tp_degree=${TPDEGREE} --model_type=${MODEL_TYPE} --batch_size=${BATCH_SIZE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee ./logs/${MODEL_TYPE}_${DISTPAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}.log
--- a/examples/language/opt/README.md
+++ b/examples/language/opt/README.md
+<!---
+Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+## OPT
+Meta recently released [Open Pretrained Transformer (OPT)](https://github.com/facebookresearch/metaseq), a 175-Billion parameter AI language model, which stimulates AI programmers to perform various downstream tasks and application deployments.
+
+The following example of [Colossal-AI](https://github.com/hpcaitech/ColossalAI) demonstrates fine-tuning Casual Language Modelling at low cost.
+
+We are using the pre-training weights of the OPT model provided by Hugging Face Hub on the raw WikiText-2 (no tokens were replaced before
+the tokenization). This training script is adapted from the [HuggingFace Language Modelling examples](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling).
+
+## Our Modifications
+We adapt the OPT training code to ColossalAI by leveraging Gemini and ZeRO DDP.
+
+## Quick Start
+You can launch training by using the following bash script
+
+```bash
+bash ./run_gemini.sh
+```
--- a/examples/language/opt/benchmark.sh
+++ b/examples/language/opt/benchmark.sh
+export BS=16
+export MEMCAP=0
+export MODEL="6.7b"
+export GPUNUM=1
+
+for MODEL in "6.7b" "13b" "1.3b"
+do
+for GPUNUM in 8 1
+do
+for BS in 16 24 32 8
+do
+for MEMCAP in 0 40
+do
+pkill -9 torchrun
+pkill -9 python
+
+env BS=$BS MEM_CAP=$MEMCAP MODEL=$MODEL GPUNUM=$GPUNUM bash ./run_gemini.sh
+done
+done
+done
+done
--- a/examples/language/opt/run_gemini.sh
+++ b/examples/language/opt/run_gemini.sh
+set -x
+export BS=${BS:-16}
+export MEMCAP=${MEMCAP:-0}
+# Acceptable values include `125m`, `350m`, `1.3b`, `2.7b`, `6.7`, `13b`, `30b`, `66b`. For `175b`
+export MODEL=${MODEL:-"125m"}
+export GPUNUM=${GPUNUM:-1}
+
+# make directory for logs
+mkdir -p ./logs
+
+export MODLE_PATH="facebook/opt-${MODEL}"
+
+# HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1
+torchrun \
+  --nproc_per_node ${GPUNUM} \
+  --master_port 19198 \
+  train_gemini_opt.py \
+  --mem_cap ${MEMCAP} \
+  --model_name_or_path ${MODLE_PATH} \
+  --batch_size ${BS} 2>&1 | tee ./logs/colo_${MODEL}_bs_${BS}_cap_${MEMCAP}_gpu_${GPUNUM}.log
--- a/examples/language/opt/train_gemini_opt.py
+++ b/examples/language/opt/train_gemini_opt.py
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Fine-tuning the library models for causal language modeling (GPT, GPT-2, CTRL, ...)
+on a text file or a dataset without using HuggingFace Trainer.
+
+Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
+https://huggingface.co/models?filter=text-generation
+"""
+# You can also adapt this script on your own causal language modeling task. Pointers for this are left as comments.
+
+import time
+from functools import partial
+
+import datasets
+import torch
+import torch.distributed as dist
+import transformers
+from transformers import CONFIG_MAPPING, MODEL_MAPPING, AutoConfig, OPTForCausalLM
+from transformers.utils.versions import require_version
+
+import colossalai
+from colossalai.logging import disable_existing_loggers, get_dist_logger
+from colossalai.nn.optimizer.gemini_optimizer import GeminiAdamOptimizer
+from colossalai.nn.parallel import GeminiDDP
+from colossalai.utils import get_current_device
+from colossalai.utils.model.colo_init_context import ColoInitContext
+
+
+def get_data(batch_size, seq_len, vocab_size):
+    input_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device=torch.cuda.current_device())
+    attention_mask = torch.ones_like(input_ids)
+    return input_ids, attention_mask
+
+
+require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
+
+MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys())
+MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
+
+
+def get_time_stamp():
+    torch.cuda.synchronize()
+    return time.time()
+
+
+def get_tflops(model_numel, batch_size, seq_len, step_time):
+    return model_numel * batch_size * seq_len * 8 / 1e12 / (step_time + 1e-12)
+
+
+def parse_args():
+    parser = colossalai.get_default_parser()
+    parser.add_argument(
+        "--model_name_or_path",
+        type=str,
+        help="Path to pretrained model or model identifier from huggingface.co/models.",
+        required=True,
+    )
+    parser.add_argument(
+        "--config_name",
+        type=str,
+        default=None,
+        help="Pretrained config name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--batch_size",
+        type=int,
+        default=8,
+        help="Batch size (per dp group) for the training dataloader.",
+    )
+    parser.add_argument(
+        "--learning_rate",
+        type=float,
+        default=5e-5,
+        help="Initial learning rate (after the potential warmup period) to use.",
+    )
+    parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.")
+    parser.add_argument(
+        "--max_train_steps",
+        type=int,
+        default=20,
+        help="Total number of training steps to perform.",
+    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--model_type",
+        type=str,
+        default=None,
+        help="Model type to use if training from scratch.",
+        choices=MODEL_TYPES,
+    )
+    parser.add_argument("--mem_cap", type=int, default=0, help="use mem cap")
+    parser.add_argument("--init_in_cpu", action='store_true', default=False, help="init training model in cpu")
+    args = parser.parse_args()
+
+    return args
+
+
+def colo_memory_cap(size_in_GB):
+    from colossalai.utils import colo_device_memory_capacity, colo_set_process_memory_fraction, get_current_device
+    cuda_capacity = colo_device_memory_capacity(get_current_device())
+    if size_in_GB * (1024**3) < cuda_capacity:
+        colo_set_process_memory_fraction(size_in_GB * (1024**3) / cuda_capacity)
+        print("Using {} GB of GPU memory".format(size_in_GB))
+
+
+def main():
+    args = parse_args()
+    disable_existing_loggers()
+    colossalai.launch_from_torch({})
+    logger = get_dist_logger()
+    is_main_process = dist.get_rank() == 0
+
+    if is_main_process:
+        datasets.utils.logging.set_verbosity_warning()
+        transformers.utils.logging.set_verbosity_info()
+    else:
+        datasets.utils.logging.set_verbosity_error()
+        transformers.utils.logging.set_verbosity_error()
+
+    if args.mem_cap > 0:
+        colo_memory_cap(args.mem_cap)
+
+    # If passed along, set the training seed now.
+    if args.seed is not None:
+        torch.mannul_seed(args.seed)
+        logger.info(f"Rank {dist.get_rank()}: random seed is set to {args.seed}")
+
+    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
+    # https://huggingface.co/docs/datasets/loading_datasets.html.
+
+    # Load pretrained model
+    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
+    # download model & vocab.
+    if args.config_name:
+        config = AutoConfig.from_pretrained(args.config_name)
+    elif args.model_name_or_path:
+        config = AutoConfig.from_pretrained(args.model_name_or_path)
+    else:
+        config = CONFIG_MAPPING[args.model_type]()
+        logger.warning("You are instantiating a new config instance from scratch.")
+    logger.info("Model config has been created", ranks=[0])
+
+    if args.init_in_cpu:
+        init_dev = torch.device('cpu')
+    else:
+        init_dev = get_current_device()
+
+    # build model
+    if args.model_name_or_path is None or args.model_name_or_path == 'facebook/opt-13b':
+        # currently, there has a bug in pretrained opt-13b
+        # we can not import it until huggingface fix it
+        logger.info("Train a new model from scratch", ranks=[0])
+        with ColoInitContext(device=init_dev, dtype=torch.half):
+            model = OPTForCausalLM(config)
+    else:
+        logger.info("Finetune a pre-trained model", ranks=[0])
+        with ColoInitContext(device=init_dev, dtype=torch.half):
+            model = OPTForCausalLM.from_pretrained(args.model_name_or_path,
+                                                   from_tf=bool(".ckpt" in args.model_name_or_path),
+                                                   config=config,
+                                                   local_files_only=False)
+
+    # enable graident checkpointing
+    model.gradient_checkpointing_enable()
+
+    numel = sum([p.numel() for p in model.parameters()])
+    PLACEMENT_POLICY = 'cpu'
+    model = GeminiDDP(model, device=get_current_device(), placement_policy=PLACEMENT_POLICY, pin_memory=True)
+    optimizer = GeminiAdamOptimizer(model, lr=args.learning_rate, initial_scale=2**14, gpu_margin_mem_ratio=0.0)
+
+    SEQ_LEN = 1024
+    VOCAB_SIZE = 50257
+
+    get_tflops_func = partial(get_tflops, numel, args.batch_size, SEQ_LEN)
+
+    model.train()
+    for step in range(args.max_train_steps):
+        st_time = time.time()
+        input_ids, attn_mask = get_data(args.batch_size, SEQ_LEN, VOCAB_SIZE)
+
+        outputs = model(input_ids=input_ids, attention_mask=attn_mask, labels=input_ids, use_cache=False)
+        loss = outputs['loss']
+        optimizer.backward(loss)
+
+        optimizer.step()
+        optimizer.zero_grad()
+        torch.cuda.synchronize()
+        step_time = time.time() - st_time
+        step_tflops = get_tflops_func(step_time)
+
+        logger.info("step {} finished, Tflops {}".format(step, step_tflops), ranks=[0])
+
+    logger.info("Training finished", ranks=[0])
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/language/palm/README.md
+++ b/examples/language/palm/README.md
+<img src="./palm.gif" width="450px"></img>
+
+## PaLM - Pytorch
+
+Implementation of the specific Transformer architecture from <a href="https://ai.googleblog.com/2022/04/pathways-language-model-palm-scaling-to.html">PaLM - Scaling Language Modeling with Pathways</a>, in less than 200 lines of code.
+
+This model is pretty much SOTA on everything language.
+
+It obviously will not scale, but it is just for educational purposes. To elucidate the public how simple it all really is.
+
+## Install
+```bash
+$ pip install PaLM-pytorch
+```
+
+## Usage
+
+```python
+import torch
+from palm_pytorch import PaLM
+
+palm = PaLM(
+    num_tokens = 20000,
+    dim = 512,
+    depth = 12,
+    heads = 8,
+    dim_head = 64,
+)
+
+tokens = torch.randint(0, 20000, (1, 2048))
+logits = palm(tokens) # (1, 2048, 20000)
+```
+
+The PaLM 540B in the paper would be
+
+```python
+palm = PaLM(
+    num_tokens = 256000,
+    dim = 18432,
+    depth = 118,
+    heads = 48,
+    dim_head = 256
+)
+```
+
+## Test on Enwik8
+
+```bash
+$ python train.py
+```
+
+## Todo
+
+- [ ] offer a Triton optimized version of PaLM, bringing in https://github.com/lucidrains/triton-transformer
+
+## Citations
+
+```bibtex
+@article{chowdhery2022PaLM,
+  title   = {PaLM: Scaling Language Modeling with Pathways},
+  author  = {Chowdhery, Aakanksha et al},
+  year    = {2022}
+}
+```
--- a/examples/language/palm/data/README.md
+++ b/examples/language/palm/data/README.md
+# Data source
+
+The enwik8 data was downloaded from the Hutter prize page: http://prize.hutter1.net/
--- a/examples/language/palm/palm_pytorch/__init__.py
+++ b/examples/language/palm/palm_pytorch/__init__.py
+from palm_pytorch.palm_pytorch import PaLM