Merge branch 'main' of https://github.com/hpcaitech/ColossalAI

7bc5a8e3 · zhuwenwen · e6748d82 · 0f785cb1 · 7bc5a8e3 · 7bc5a8e3
Commit 7bc5a8e3 authored May 05, 2023 by zhuwenwen
20 changed files
--- a/applications/Chat/inference/tests/test_chat_prompt.py
+++ b/applications/Chat/inference/tests/test_chat_prompt.py
+import os
+
+from transformers import AutoTokenizer
+from utils import ChatPromptProcessor, Dialogue
+
+CONTEXT = 'Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not generate new instructions.'
+tokenizer = AutoTokenizer.from_pretrained(os.environ['PRETRAINED_PATH'])
+
+samples = [
+    ([
+        Dialogue(
+            instruction='Who is the best player in the history of NBA?',
+            response=
+            'The best player in the history of the NBA is widely considered to be Michael Jordan. He is one of the most successful players in the league, having won 6 NBA championships with the Chicago Bulls and 5 more with the Washington Wizards. He is a 5-time MVP, 1'
+        ),
+        Dialogue(instruction='continue this talk', response=''),
+    ], 128,
+     'Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not generate new instructions.\n\n### Instruction:\nWho is the best player in the history of NBA?\n\n### Response:\nThe best player in the history of the NBA is widely considered to be Michael Jordan. He is one of the most successful players in the league, having won 6 NBA championships with the Chicago Bulls and 5 more with the Washington Wizards. He is a 5-time MVP, 1\n\n### Instruction:\ncontinue this talk\n\n### Response:\n'
+    ),
+    ([
+        Dialogue(
+            instruction='Who is the best player in the history of NBA?',
+            response=
+            'The best player in the history of the NBA is widely considered to be Michael Jordan. He is one of the most successful players in the league, having won 6 NBA championships with the Chicago Bulls and 5 more with the Washington Wizards. He is a 5-time MVP, 1'
+        ),
+        Dialogue(instruction='continue this talk', response=''),
+    ], 200,
+     'Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not generate new instructions.\n\n### Instruction:\ncontinue this talk\n\n### Response:\n'
+    ),
+    ([
+        Dialogue(
+            instruction='Who is the best player in the history of NBA?',
+            response=
+            'The best player in the history of the NBA is widely considered to be Michael Jordan. He is one of the most successful players in the league, having won 6 NBA championships with the Chicago Bulls and 5 more with the Washington Wizards. He is a 5-time MVP, 1'
+        ),
+        Dialogue(instruction='continue this talk', response=''),
+    ], 211,
+     'Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not generate new instructions.\n\n### Instruction:\ncontinue this\n\n### Response:\n'
+    ),
+    ([
+        Dialogue(instruction='Who is the best player in the history of NBA?', response=''),
+    ], 128,
+     'Below is an instruction that describes a task. Write a response that appropriately completes the request. Do not generate new instructions.\n\n### Instruction:\nWho is the best player in the history of NBA?\n\n### Response:\n'
+    ),
+]
+
+
+def test_chat_prompt_processor():
+    processor = ChatPromptProcessor(tokenizer, CONTEXT, 256)
+    for history, max_new_tokens, result in samples:
+        prompt = processor.preprocess_prompt(history, max_new_tokens)
+        assert prompt == result
+
+
+if __name__ == '__main__':
+    test_chat_prompt_processor()
--- a/applications/Chat/inference/utils.py
+++ b/applications/Chat/inference/utils.py
+import re
+from threading import Lock
+from typing import Any, Callable, Generator, List, Optional
+import json
+import jieba
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+from pydantic import BaseModel, Field
+
+try:
+    from transformers.generation_logits_process import (
+        LogitsProcessorList,
+        TemperatureLogitsWarper,
+        TopKLogitsWarper,
+        TopPLogitsWarper,
+    )
+except ImportError:
+    from transformers.generation import LogitsProcessorList, TemperatureLogitsWarper, TopKLogitsWarper, TopPLogitsWarper
+
+
+def prepare_logits_processor(top_k: Optional[int] = None,
+                             top_p: Optional[float] = None,
+                             temperature: Optional[float] = None) -> LogitsProcessorList:
+    processor_list = LogitsProcessorList()
+    if temperature is not None and temperature != 1.0:
+        processor_list.append(TemperatureLogitsWarper(temperature))
+    if top_k is not None and top_k != 0:
+        processor_list.append(TopKLogitsWarper(top_k))
+    if top_p is not None and top_p < 1.0:
+        processor_list.append(TopPLogitsWarper(top_p))
+    return processor_list
+
+
+def _is_sequence_finished(unfinished_sequences: torch.Tensor) -> bool:
+    if dist.is_initialized() and dist.get_world_size() > 1:
+        # consider DP
+        unfinished_sequences = unfinished_sequences.clone()
+        dist.all_reduce(unfinished_sequences)
+    return unfinished_sequences.max() == 0
+
+
+def sample_streamingly(model: nn.Module,
+                       input_ids: torch.Tensor,
+                       max_generate_tokens: int,
+                       early_stopping: bool = False,
+                       eos_token_id: Optional[int] = None,
+                       pad_token_id: Optional[int] = None,
+                       top_k: Optional[int] = None,
+                       top_p: Optional[float] = None,
+                       temperature: Optional[float] = None,
+                       prepare_inputs_fn: Optional[Callable[[torch.Tensor, Any], dict]] = None,
+                       update_model_kwargs_fn: Optional[Callable[[dict, Any], dict]] = None,
+                       **model_kwargs) -> Generator:
+
+    logits_processor = prepare_logits_processor(top_k, top_p, temperature)
+    unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
+
+    for _ in range(max_generate_tokens):
+        model_inputs = prepare_inputs_fn(input_ids, **model_kwargs) if prepare_inputs_fn is not None else {
+            'input_ids': input_ids
+        }
+        outputs = model(**model_inputs)
+
+        next_token_logits = outputs['logits'][:, -1, :]
+        # pre-process distribution
+        next_token_logits = logits_processor(input_ids, next_token_logits)
+        # sample
+        probs = torch.softmax(next_token_logits, dim=-1, dtype=torch.float)
+        next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
+
+        # finished sentences should have their next token be a padding token
+        if eos_token_id is not None:
+            if pad_token_id is None:
+                raise ValueError("If `eos_token_id` is defined, make sure that `pad_token_id` is defined.")
+            next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
+
+        yield next_tokens
+
+        # update generated ids, model inputs for next step
+        input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
+        if update_model_kwargs_fn is not None:
+            model_kwargs = update_model_kwargs_fn(outputs, **model_kwargs)
+
+        # if eos_token was found in one sentence, set sentence to finished
+        if eos_token_id is not None:
+            unfinished_sequences = unfinished_sequences.mul((next_tokens != eos_token_id).long())
+
+        # stop when each sentence is finished if early_stopping=True
+        if early_stopping and _is_sequence_finished(unfinished_sequences):
+            break
+
+
+def update_model_kwargs_fn(outputs: dict, **model_kwargs) -> dict:
+    if "past_key_values" in outputs:
+        model_kwargs["past"] = outputs["past_key_values"]
+    else:
+        model_kwargs["past"] = None
+
+    # update token_type_ids with last value
+    if "token_type_ids" in model_kwargs:
+        token_type_ids = model_kwargs["token_type_ids"]
+        model_kwargs["token_type_ids"] = torch.cat([token_type_ids, token_type_ids[:, -1].unsqueeze(-1)], dim=-1)
+
+    # update attention mask
+    if "attention_mask" in model_kwargs:
+        attention_mask = model_kwargs["attention_mask"]
+        model_kwargs["attention_mask"] = torch.cat(
+            [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1)
+
+    return model_kwargs
+
+
+class Dialogue(BaseModel):
+    instruction: str = Field(min_length=1, example='Count up from 1 to 500.')
+    response: str = Field(example='')
+
+
+def _format_dialogue(instruction: str, response: str = ''):
+    return f'\n\n### Instruction:\n{instruction}\n\n### Response:\n{response}'
+
+
+STOP_PAT = re.compile(r'(###|instruction:).*', flags=(re.I | re.S))
+
+
+class ChatPromptProcessor:
+    SAFE_RESPONSE = 'The input/response contains inappropriate content, please rephrase your prompt.'
+
+    def __init__(self, tokenizer, context: str, max_len: int = 2048, censored_words: List[str]=[]):
+        self.tokenizer = tokenizer
+        self.context = context
+        self.max_len = max_len
+        self.censored_words = set([word.lower() for word in censored_words])
+        # These will be initialized after the first call of preprocess_prompt()
+        self.context_len: Optional[int] = None
+        self.dialogue_placeholder_len: Optional[int] = None
+
+    def preprocess_prompt(self, history: List[Dialogue], max_new_tokens: int) -> str:
+        if self.context_len is None:
+            self.context_len = len(self.tokenizer(self.context)['input_ids'])
+        if self.dialogue_placeholder_len is None:
+            self.dialogue_placeholder_len = len(
+                self.tokenizer(_format_dialogue(''), add_special_tokens=False)['input_ids'])
+        prompt = self.context
+        # the last dialogue must be in the prompt
+        last_dialogue = history.pop()
+        # the response of the last dialogue is empty
+        assert last_dialogue.response == ''
+        if len(self.tokenizer(_format_dialogue(last_dialogue.instruction), add_special_tokens=False)
+               ['input_ids']) + max_new_tokens + self.context_len >= self.max_len:
+            # to avoid truncate placeholder, apply truncate to the original instruction
+            instruction_truncated = self.tokenizer(last_dialogue.instruction,
+                                                   add_special_tokens=False,
+                                                   truncation=True,
+                                                   max_length=(self.max_len - max_new_tokens - self.context_len -
+                                                               self.dialogue_placeholder_len))['input_ids']
+            instruction_truncated = self.tokenizer.decode(instruction_truncated).lstrip()
+            prompt += _format_dialogue(instruction_truncated)
+            return prompt
+
+        res_len = self.max_len - max_new_tokens - len(self.tokenizer(prompt)['input_ids'])
+
+        rows = []
+        for dialogue in history[::-1]:
+            text = _format_dialogue(dialogue.instruction, dialogue.response)
+            cur_len = len(self.tokenizer(text, add_special_tokens=False)['input_ids'])
+            if res_len - cur_len < 0:
+                break
+            res_len -= cur_len
+            rows.insert(0, text)
+        prompt += ''.join(rows) + _format_dialogue(last_dialogue.instruction)
+        return prompt
+
+    def postprocess_output(self, output: str) -> str:
+        output = STOP_PAT.sub('', output)
+        return output.strip()
+
+    def has_censored_words(self, text: str) -> bool:
+        if len(self.censored_words) == 0:
+            return False
+        intersection = set(jieba.cut(text.lower())) & self.censored_words
+        return len(intersection) > 0
+
+class LockedIterator:
+
+    def __init__(self, it, lock: Lock) -> None:
+        self.lock = lock
+        self.it = iter(it)
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        with self.lock:
+            return next(self.it)
+
+def load_json(path: str):
+    with open(path) as f:
+        return json.load(f)
\ No newline at end of file
--- a/applications/Chat/pytest.ini
+++ b/applications/Chat/pytest.ini
+[pytest]
+markers =
+    cpu: tests which can run on CPU
+    gpu: tests which requires a single GPU
+    dist: tests which are run in a multi-GPU or multi-machine environment
+    experiment: tests for experimental features
--- a/applications/Chat/requirements-test.txt
+++ b/applications/Chat/requirements-test.txt
+pytest
--- a/applications/Chat/requirements.txt
+++ b/applications/Chat/requirements.txt
+transformers>=4.20.1
+tqdm
+datasets
+loralib
+colossalai>=0.2.4
+torch<2.0.0, >=1.12.1
+langchain
+tokenizers
+fastapi
+sse_starlette
+wandb
+sentencepiece
+gpustat
--- a/applications/Chat/setup.py
+++ b/applications/Chat/setup.py
+from setuptools import find_packages, setup
+
+
+def fetch_requirements(path):
+    with open(path, 'r') as fd:
+        return [r.strip() for r in fd.readlines()]
+
+
+def fetch_readme():
+    with open('README.md', encoding='utf-8') as f:
+        return f.read()
+
+
+def fetch_version():
+    with open('version.txt', 'r') as f:
+        return f.read().strip()
+
+
+setup(
+    name='coati',
+    version=fetch_version(),
+    packages=find_packages(exclude=(
+        'tests',
+        'benchmarks',
+        '*.egg-info',
+    )),
+    description='Colossal-AI Talking Intelligence',
+    long_description=fetch_readme(),
+    long_description_content_type='text/markdown',
+    license='Apache Software License 2.0',
+    url='https://github.com/hpcaitech/Coati',
+    install_requires=fetch_requirements('requirements.txt'),
+    python_requires='>=3.6',
+    classifiers=[
+        'Programming Language :: Python :: 3',
+        'License :: OSI Approved :: Apache Software License',
+        'Environment :: GPU :: NVIDIA CUDA',
+        'Topic :: Scientific/Engineering :: Artificial Intelligence',
+        'Topic :: System :: Distributed Computing',
+    ],
+)
--- a/applications/Chat/tests/__init__.py
+++ b/applications/Chat/tests/__init__.py
--- a/applications/Chat/tests/test_checkpoint.py
+++ b/applications/Chat/tests/test_checkpoint.py
+import os
+import tempfile
+from contextlib import nullcontext
+
+import pytest
+import torch
+import torch.distributed as dist
+from coati.models.gpt import GPTActor
+from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy
+from transformers.models.gpt2.configuration_gpt2 import GPT2Config
+
+from colossalai.nn.optimizer import HybridAdam
+from colossalai.testing import rerun_if_address_is_in_use, spawn
+
+GPT_CONFIG = GPT2Config(n_embd=128, n_layer=4, n_head=4)
+
+
+def get_data(batch_size: int, seq_len: int = 10) -> dict:
+    input_ids = torch.randint(0, 50257, (batch_size, seq_len), device='cuda')
+    attention_mask = torch.ones_like(input_ids)
+    return dict(input_ids=input_ids, attention_mask=attention_mask)
+
+
+def run_test_checkpoint(strategy):
+    BATCH_SIZE = 2
+
+    if strategy == 'ddp':
+        strategy = DDPStrategy()
+    elif strategy == 'colossalai_gemini':
+        strategy = ColossalAIStrategy(stage=3, placement_policy='cuda', initial_scale=2**5)
+    elif strategy == 'colossalai_zero2':
+        strategy = ColossalAIStrategy(stage=2, placement_policy='cuda')
+    else:
+        raise ValueError(f'Unsupported strategy "{strategy}"')
+
+    with strategy.model_init_context():
+        actor = GPTActor(config=GPT_CONFIG).cuda()
+
+    actor_optim = HybridAdam(actor.parameters())
+
+    actor, actor_optim = strategy.prepare((actor, actor_optim))
+
+    def run_step():
+        data = get_data(BATCH_SIZE)
+        action_mask = torch.ones_like(data['attention_mask'], dtype=torch.bool)
+        action_log_probs = actor(data['input_ids'], action_mask.size(1), data['attention_mask'])
+        loss = action_log_probs.sum()
+        strategy.backward(loss, actor, actor_optim)
+        strategy.optimizer_step(actor_optim)
+
+    run_step()
+
+    ctx = tempfile.TemporaryDirectory() if dist.get_rank() == 0 else nullcontext()
+
+    with ctx as dirname:
+        rank0_dirname = [dirname]
+        dist.broadcast_object_list(rank0_dirname)
+        rank0_dirname = rank0_dirname[0]
+
+        model_path = os.path.join(rank0_dirname, 'model.pt')
+        optim_path = os.path.join(rank0_dirname, f'optim-r{dist.get_rank()}.pt')
+
+        strategy.save_model(actor, model_path, only_rank0=True)
+        strategy.save_optimizer(actor_optim, optim_path, only_rank0=False)
+
+        dist.barrier()
+
+        strategy.load_model(actor, model_path, strict=False)
+        strategy.load_optimizer(actor_optim, optim_path)
+
+        dist.barrier()
+
+    run_step()
+
+
+def run_dist(rank, world_size, port, strategy):
+    os.environ['RANK'] = str(rank)
+    os.environ['LOCAL_RANK'] = str(rank)
+    os.environ['WORLD_SIZE'] = str(world_size)
+    os.environ['MASTER_ADDR'] = 'localhost'
+    os.environ['MASTER_PORT'] = str(port)
+    run_test_checkpoint(strategy)
+
+
+@pytest.mark.dist
+@pytest.mark.parametrize('world_size', [2])
+@pytest.mark.parametrize('strategy', ['ddp', 'colossalai_zero2', 'colossalai_gemini'])
+@rerun_if_address_is_in_use()
+def test_checkpoint(world_size, strategy):
+    spawn(run_dist, world_size, strategy=strategy)
+
+
+if __name__ == '__main__':
+    test_checkpoint(2, 'colossalai_zero2')
--- a/applications/Chat/tests/test_data.py
+++ b/applications/Chat/tests/test_data.py
+import os
+from copy import deepcopy
+
+import pytest
+import torch
+import torch.distributed as dist
+from coati.experience_maker import NaiveExperienceMaker
+from coati.models.base import RewardModel
+from coati.models.gpt import GPTActor, GPTCritic
+from coati.replay_buffer import NaiveReplayBuffer
+from coati.trainer.strategies import ColossalAIStrategy, DDPStrategy
+from transformers.models.gpt2.configuration_gpt2 import GPT2Config
+
+from colossalai.testing import rerun_if_address_is_in_use, spawn
+
+GPT_CONFIG = GPT2Config(n_embd=128, n_layer=4, n_head=4)
+
+
+def get_data(batch_size: int, seq_len: int = 10) -> dict:
+    input_ids = torch.randint(0, 50257, (batch_size, seq_len), device='cuda')
+    attention_mask = torch.ones_like(input_ids)
+    return dict(input_ids=input_ids, attention_mask=attention_mask)
+
+
+def gather_and_equal(tensor: torch.Tensor) -> bool:
+    world_size = dist.get_world_size()
+    outputs = [torch.empty_like(tensor) for _ in range(world_size)]
+    dist.all_gather(outputs, tensor.contiguous())
+    for t in outputs[1:]:
+        if not torch.equal(outputs[0], t):
+            return False
+    return True
+
+
+def run_test_data(strategy):
+    EXPERINCE_BATCH_SIZE = 4
+    SAMPLE_BATCH_SIZE = 2
+
+    if strategy == 'ddp':
+        strategy = DDPStrategy()
+    elif strategy == 'colossalai':
+        strategy = ColossalAIStrategy(placement_policy='cuda')
+    else:
+        raise ValueError(f'Unsupported strategy "{strategy}"')
+
+    actor = GPTActor(config=GPT_CONFIG).cuda()
+    critic = GPTCritic(config=GPT_CONFIG).cuda()
+
+    initial_model = deepcopy(actor)
+    reward_model = RewardModel(deepcopy(critic.model)).cuda()
+
+    experience_maker = NaiveExperienceMaker(actor, critic, reward_model, initial_model)
+    replay_buffer = NaiveReplayBuffer(SAMPLE_BATCH_SIZE, cpu_offload=False)
+
+    # experience of all ranks should be the same
+    for _ in range(2):
+        data = get_data(EXPERINCE_BATCH_SIZE)
+        assert gather_and_equal(data['input_ids'])
+        assert gather_and_equal(data['attention_mask'])
+        experience = experience_maker.make_experience(**data,
+                                                      do_sample=True,
+                                                      max_length=16,
+                                                      eos_token_id=50256,
+                                                      pad_token_id=50256)
+        assert gather_and_equal(experience.sequences)
+        assert gather_and_equal(experience.action_log_probs)
+        assert gather_and_equal(experience.values)
+        assert gather_and_equal(experience.reward)
+        assert gather_and_equal(experience.advantages)
+        assert gather_and_equal(experience.action_mask)
+        assert gather_and_equal(experience.attention_mask)
+        replay_buffer.append(experience)
+
+    # replay buffer's data should be the same
+    buffer_size = torch.tensor([len(replay_buffer)], device='cuda')
+    assert gather_and_equal(buffer_size)
+    for item in replay_buffer.items:
+        assert gather_and_equal(item.sequences)
+        assert gather_and_equal(item.action_log_probs)
+        assert gather_and_equal(item.values)
+        assert gather_and_equal(item.reward)
+        assert gather_and_equal(item.advantages)
+        assert gather_and_equal(item.action_mask)
+        assert gather_and_equal(item.attention_mask)
+
+    # dataloader of each rank should have the same size and different batch
+    dataloader = strategy.setup_dataloader(replay_buffer)
+    dataloader_size = torch.tensor([len(dataloader)], device='cuda')
+    assert gather_and_equal(dataloader_size)
+    for experience in dataloader:
+        assert not gather_and_equal(experience.sequences)
+        assert not gather_and_equal(experience.action_log_probs)
+        assert not gather_and_equal(experience.values)
+        assert not gather_and_equal(experience.reward)
+        assert not gather_and_equal(experience.advantages)
+        # action mask and attention mask may be same
+
+
+def run_dist(rank, world_size, port, strategy):
+    os.environ['RANK'] = str(rank)
+    os.environ['LOCAL_RANK'] = str(rank)
+    os.environ['WORLD_SIZE'] = str(world_size)
+    os.environ['MASTER_ADDR'] = 'localhost'
+    os.environ['MASTER_PORT'] = str(port)
+    run_test_data(strategy)
+
+
+@pytest.mark.skip
+@pytest.mark.dist
+@pytest.mark.parametrize('world_size', [2])
+@pytest.mark.parametrize('strategy', ['ddp', 'colossalai'])
+@rerun_if_address_is_in_use()
+def test_data(world_size, strategy):
+    spawn(run_dist, world_size, strategy=strategy)
+
+
+if __name__ == '__main__':
+    test_data(2, 'colossalai')
--- a/applications/Chat/version.txt
+++ b/applications/Chat/version.txt
+1.0.0
--- a/applications/README.md
+++ b/applications/README.md
+# Applications
+
+This directory contains the applications that are powered by Colossal-AI.
+
+The list of applications include:
+
+- [X] [Chatbot](./Chat/README.md)
+- [X] [FastFold](https://github.com/hpcaitech/FastFold): Optimizing AlphaFold (Biomedicine) Training and Inference on GPU Clusters
+
+> Please note that the `Chatbot` application is migrated from the original `ChatGPT` folder.
+
+You can find more example code for base models and functions in the [Examples](https://github.com/hpcaitech/ColossalAI/tree/main/examples) directory.
--- a/colossalai/_C/__init__.py
+++ b/colossalai/_C/__init__.py
--- a/colossalai/__init__.py
+++ b/colossalai/__init__.py
+from .initialize import (
+    get_default_parser,
+    initialize,
+    launch,
+    launch_from_openmpi,
+    launch_from_slurm,
+    launch_from_torch,
+)
+
+try:
+    # .version will be created by setup.py
+    from .version import __version__
+except ModuleNotFoundError:
+    # this will only happen if the user did not run `pip install`
+    # and directly set PYTHONPATH to use Colossal-AI which is a bad practice
+    __version__ = '0.0.0'
+    print('please install Colossal-AI from https://www.colossalai.org/download or from source')
--- a/colossalai/_analyzer/README.md
+++ b/colossalai/_analyzer/README.md
+# Analyzer
+
+# Overview
+The Analyzer is a collection of static graph utils including Colossal-AI FX. Features include:
+- MetaTensor -- enabling:
+  - Ahead-of-time Profiling
+  - Shape Propagation
+  - Ideal Flop Counter
+- symbolic_trace()
+  - Robust Control-flow Tracing / Recompile
+  - Robust Activation Checkpoint Tracing / CodeGen
+  - Easy-to-define Bias-Addition Split
+- symbolic_profile()
+  - Support ``MetaTensorMode``, where all Tensor operations are executed symbolically.
+  - Shape Inference Across Device and Unified ``MetaInfo``
+  - Ideal Flop Counter https://dev-discuss.pytorch.org/t/the-ideal-pytorch-flop-counter-with-torch-dispatch/505
+
+# Quickstart
+## Analyzer.FX
+**Reference:**
+
+  https://pytorch.org/docs/stable/fx.html [[paper](https://arxiv.org/pdf/2112.08429)]
+
+
+torch.FX is a toolkit for developers to use to transform nn.Module instances. FX consists of three main components: a symbolic tracer, an intermediate representation, and Python code generation. FX.Tracer hacks _\_\_torch_function\_\__ and use a Proxy object to propagate through any forward function of torch.nn.Module.
+![image](https://user-images.githubusercontent.com/78588128/212531495-bbb934dd-dbbb-4578-8869-6171973f7dd8.png)
+ColossalAI FX is modified from torch.FX, with the extra capability of ahead-of-time profiling enabled by the subclass of ``MetaTensor``.
+
+### Analyzer.FX.symbolic_trace()
+A drawback of the original torch.FX implementation is that it is poor at handling control flow. All control flow is not PyTorch native operands and requires actual instances that specify the branches to execute on. For example,
+
+```python
+class MyModule(nn.Module):
+    def forward(self, x):
+        if x.dim() == 3:
+            return x * 2 + 1
+        else:
+            return x - 5
+```
+
+The above function has the computation graph of
+
+![image](https://user-images.githubusercontent.com/78588128/212532631-dba30734-577b-4418-8dc9-004d7983abc5.png)
+
+However, since Proxy does not have concrete data, applying ``x.dim()`` will return nothing. In the context of the auto-parallel system, at least the control-flow dependencies for tensor shape should be removed, since any searched strategy could only auto-parallelize a specific computation graph with the same tensor shape. It is native to attach concrete data onto a Proxy, and propagate them through control flow.
+
+![image](https://user-images.githubusercontent.com/78588128/212533403-1b620986-1c3a-420a-87c6-d08c9702135d.png)
+
+
+With ``MetaTensor``, the computation during shape propagation can be virtualized. This speeds up tracing by avoiding allocating actual memory on devices.
+
+#### Remarks
+There is no free lunch for PyTorch to unify all operands in both its repo and other repos in its eco-system. For example, the einops library currently has no intention to support torch.FX (See https://github.com/arogozhnikov/einops/issues/188). To support different PyTorch-based libraries without modifying source code, good practices can be to allow users to register their implementation to substitute the functions not supported by torch.FX, or to avoid entering incompatible submodules.
+
+### Analyzer.FX.symbolic_profile()
+
+``symbolic_profile`` is another important feature of Colossal-AI's auto-parallel system. Profiling DNN can be costly, as you need to allocate memory and execute on real devices. However, since the profiling requirements for auto-parallel is enough if we can detect when and where the intermediate activations (i.e. Tensor) are generated, we can profile the whole procedure without actually executing it. ``symbolic_profile``, as its name infers, profiles the whole network with symbolic information only.
+
+```python
+with MetaTensorMode():
+    model = MyModule().cuda()
+    sample = torch.rand(100, 3, 224, 224).cuda()
+meta_args = dict(
+    x = sample,
+)
+gm = symbolic_trace(model, meta_args=meta_args)
+gm = symbolic_profile(gm, sample)
+```
+
+``symbolic_profile`` is enabled by ``ShapeProp`` and ``GraphProfile``.
+
+#### ShapeProp
+Both Tensor Parallel and Activation Checkpoint solvers need to know the shape information ahead of time. Unlike PyTorch's implementation, this ``ShapeProp`` can be executed under MetaTensorMode. With this, all the preparation for auto-parallel solvers can be done in milliseconds.
+
+Meanwhile, it is easy to keep track of the memory usage of each node when doing shape propagation. However, the drawbacks of FX is that not every ``call_function`` saves its input for backward, and different tensor that flows within one FX.Graph can actually have the same layout. This raises problems for fine-grained profiling.
+
+![image](https://user-images.githubusercontent.com/78588128/215312957-7eb6cbc3-61b2-49cf-95a4-6b859149eb8d.png)
+
+To address this problem, I came up with a simulated environment enabled by ``torch.autograd.graph.saved_tensor_hooks`` and fake ``data_ptr`` (check ``_subclasses/meta_tensor.py`` for more details of ``data_ptr`` updates).
+
+```python
+class sim_env(saved_tensors_hooks):
+    """
+    A simulation of memory allocation and deallocation in the forward pass
+    using ``saved_tensor_hooks``.
+
+    Attributes:
+        ctx (Dict[int, torch.Tensor]): A dictionary that maps the
+            data pointer of a tensor to the tensor itself. This is used
+            to track the memory allocation and deallocation.
+
+        param_ctx (Dict[int, torch.Tensor]): A dictionary that maps the
+            data pointer of all model parameters to the parameter itself.
+            This avoids overestimating the memory usage of the intermediate activations.
+    """
+
+    def __init__(self, module: Optional[torch.nn.Module] = None):
+        super().__init__(self.pack_hook, self.unpack_hook)
+        self.ctx = {}
+        self.param_ctx = {param.data_ptr(): param for param in module.parameters()}
+        self.buffer_ctx = {buffer.data_ptr(): buffer for buffer in module.buffers()} if module else {}
+
+    def pack_hook(self, tensor: torch.Tensor):
+        if tensor.data_ptr() not in self.param_ctx and tensor.data_ptr() not in self.buffer_ctx:
+            self.ctx[tensor.data_ptr()] = tensor
+        return tensor
+
+    def unpack_hook(self, tensor):
+        return tensor
+```
+The ``ctx`` variable will keep track of all saved tensors with a unique identifier. It is likely that ``nn.Parameter`` is also counted in the ``ctx``, which is not desired. To avoid this, we can use ``param_ctx`` to keep track of all parameters in the model. The ``buffer_ctx`` is used to keep track of all buffers in the model. The ``local_ctx`` that is attached to each ``Node`` marks the memory usage of the stage to which the node belongs. With simple ``intersect``, ``union`` and ``subtract`` operations, we can get any memory-related information. For non-profileable nodes, you might add your customized profile rules to simulate the memory allocation. If a ``Graph`` is modified with some non-PyTorch functions, such as fused operands, you can register the shape propagation rule with the decorator.
+
+```python
+@register_shape_impl(fuse_conv_bn)
+def fuse_conv_bn_shape_impl(*args, **kwargs):
+     # infer output shape here
+     return torch.empty(output_shape, device=output_device)
+```
+
+An important notice is that ``ShapeProp`` will attach additional information to the graph, which will be exactly the input of ``Profiler``.
+
+#### GraphProfiler
+``GraphProfiler`` executes at the node level, and profiles both forward and backward within one node. For example, ``FlopProfiler`` will profile the forward and backward FLOPs of a node, and ``CommunicationProfiler`` will profile the forward and backward communication cost of a node. The ``GraphProfiler`` will attach the profiling results to the ``Node``. These procedures are decoupled for better extensibility.
+
+To provide a general insight of the profiled results, you can set ``verbose=True`` to print the summary as well.
+```python
+model = tm.resnet18()
+sample = torch.rand(100, 3, 224, 224)
+meta_args = dict(x=sample)
+gm = symbolic_trace(model, meta_args=meta_args)
+gm = symbolic_profile(gm, sample, verbose=True)
+
+============================================================ Results =====================================================================
+       Op type                                              Op    Accumulate size    Incremental size    Output size    Temp size    Param size    Backward size      Fwd FLOPs      Bwd FLOPs
+-------------  ----------------------------------------------  -----------------  ------------------  -------------  -----------  ------------  ---------------  -------------  -------------
+  placeholder                                               x            4.59 Mb                 0 b        4.59 Mb          0 b           0 b              0 b        0 FLOPs        0 FLOPs
+  call_module                                       conv_proj            4.59 Mb                 0 b            0 b      4.59 Mb       2.25 Mb          4.59 Mb  924.84 MFLOPs  924.84 MFLOPs
+  call_method                                         reshape            4.59 Mb                 0 b            0 b      4.59 Mb           0 b          4.59 Mb        0 FLOPs        0 FLOPs
+  call_method                                         permute            4.59 Mb                 0 b            0 b      4.59 Mb           0 b          4.59 Mb        0 FLOPs        0 FLOPs
+     get_attr                                     class_token            4.59 Mb                 0 b            0 b          0 b           0 b              0 b        0 FLOPs        0 FLOPs
+  call_method                                          expand            4.59 Mb                 0 b            0 b     24.00 Kb       3.00 Kb              0 b        0 FLOPs    6.14 kFLOPs
+call_function                                             cat            4.59 Mb                 0 b            0 b      4.62 Mb           0 b              0 b        0 FLOPs        0 FLOPs
+     get_attr                           encoder_pos_embedding            4.59 Mb                 0 b            0 b          0 b           0 b              0 b        0 FLOPs        0 FLOPs
+call_function                                             add            9.21 Mb             4.62 Mb        4.62 Mb          0 b     591.00 Kb          4.62 Mb    1.21 MFLOPs    1.21 MFLOPs
+  call_module                                 encoder_dropout            9.21 Mb                 0 b        4.62 Mb          0 b           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+  call_module             encoder_layers_encoder_layer_0_ln_1            9.22 Mb            12.31 Kb            0 b      4.62 Mb       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module   encoder_layers_encoder_layer_0_self_attention           46.52 Mb            37.30 Mb            0 b      4.62 Mb       9.01 Mb         13.85 Mb    4.20 GFLOPs    8.40 GFLOPs
+call_function                                         getitem           46.52 Mb                 0 b            0 b      4.62 Mb           0 b              0 b        0 FLOPs        0 FLOPs
+call_function                                       getitem_1           46.52 Mb                 0 b            0 b          0 b           0 b              0 b        0 FLOPs        0 FLOPs
+  call_module          encoder_layers_encoder_layer_0_dropout           46.52 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                           add_1           51.14 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module             encoder_layers_encoder_layer_0_ln_2           51.15 Mb            12.31 Kb            0 b      4.62 Mb       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module            encoder_layers_encoder_layer_0_mlp_0           74.24 Mb            23.09 Mb       18.47 Mb          0 b       9.01 Mb          4.62 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module            encoder_layers_encoder_layer_0_mlp_1           92.71 Mb            18.47 Mb       18.47 Mb          0 b           0 b         18.47 Mb    4.84 MFLOPs    4.84 MFLOPs
+  call_module            encoder_layers_encoder_layer_0_mlp_2           92.71 Mb                 0 b       18.47 Mb          0 b           0 b         18.47 Mb        0 FLOPs        0 FLOPs
+  call_module            encoder_layers_encoder_layer_0_mlp_3           92.71 Mb                 0 b            0 b      4.62 Mb       9.00 Mb         18.47 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module            encoder_layers_encoder_layer_0_mlp_4           92.71 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                           add_2           97.32 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module             encoder_layers_encoder_layer_1_ln_1          101.95 Mb             4.63 Mb        4.62 Mb          0 b       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module   encoder_layers_encoder_layer_1_self_attention          134.63 Mb            32.68 Mb            0 b      4.62 Mb       9.01 Mb         13.85 Mb    4.20 GFLOPs    8.40 GFLOPs
+call_function                                       getitem_2          134.63 Mb                 0 b            0 b      4.62 Mb           0 b              0 b        0 FLOPs        0 FLOPs
+call_function                                       getitem_3          134.63 Mb                 0 b            0 b          0 b           0 b              0 b        0 FLOPs        0 FLOPs
+  call_module          encoder_layers_encoder_layer_1_dropout          134.63 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                           add_3          139.25 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module             encoder_layers_encoder_layer_1_ln_2          139.26 Mb            12.31 Kb            0 b      4.62 Mb       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module            encoder_layers_encoder_layer_1_mlp_0          162.35 Mb            23.09 Mb       18.47 Mb          0 b       9.01 Mb          4.62 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module            encoder_layers_encoder_layer_1_mlp_1          180.82 Mb            18.47 Mb       18.47 Mb          0 b           0 b         18.47 Mb    4.84 MFLOPs    4.84 MFLOPs
+  call_module            encoder_layers_encoder_layer_1_mlp_2          180.82 Mb                 0 b       18.47 Mb          0 b           0 b         18.47 Mb        0 FLOPs        0 FLOPs
+  call_module            encoder_layers_encoder_layer_1_mlp_3          180.82 Mb                 0 b            0 b      4.62 Mb       9.00 Mb         18.47 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module            encoder_layers_encoder_layer_1_mlp_4          180.82 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                           add_4          185.43 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module             encoder_layers_encoder_layer_2_ln_1          190.06 Mb             4.63 Mb        4.62 Mb          0 b       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module   encoder_layers_encoder_layer_2_self_attention          222.74 Mb            32.68 Mb            0 b      4.62 Mb       9.01 Mb         13.85 Mb    4.20 GFLOPs    8.40 GFLOPs
+call_function                                       getitem_4          222.74 Mb                 0 b            0 b      4.62 Mb           0 b              0 b        0 FLOPs        0 FLOPs
+call_function                                       getitem_5          222.74 Mb                 0 b            0 b          0 b           0 b              0 b        0 FLOPs        0 FLOPs
+  call_module          encoder_layers_encoder_layer_2_dropout          222.74 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                           add_5          227.36 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module             encoder_layers_encoder_layer_2_ln_2          227.37 Mb            12.31 Kb            0 b      4.62 Mb       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module            encoder_layers_encoder_layer_2_mlp_0          250.46 Mb            23.09 Mb       18.47 Mb          0 b       9.01 Mb          4.62 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module            encoder_layers_encoder_layer_2_mlp_1          268.93 Mb            18.47 Mb       18.47 Mb          0 b           0 b         18.47 Mb    4.84 MFLOPs    4.84 MFLOPs
+  call_module            encoder_layers_encoder_layer_2_mlp_2          268.93 Mb                 0 b       18.47 Mb          0 b           0 b         18.47 Mb        0 FLOPs        0 FLOPs
+  call_module            encoder_layers_encoder_layer_2_mlp_3          268.93 Mb                 0 b            0 b      4.62 Mb       9.00 Mb         18.47 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module            encoder_layers_encoder_layer_2_mlp_4          268.93 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                           add_6          273.54 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module             encoder_layers_encoder_layer_3_ln_1          278.17 Mb             4.63 Mb        4.62 Mb          0 b       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module   encoder_layers_encoder_layer_3_self_attention          310.86 Mb            32.68 Mb            0 b      4.62 Mb       9.01 Mb         13.85 Mb    4.20 GFLOPs    8.40 GFLOPs
+call_function                                       getitem_6          310.86 Mb                 0 b            0 b      4.62 Mb           0 b              0 b        0 FLOPs        0 FLOPs
+call_function                                       getitem_7          310.86 Mb                 0 b            0 b          0 b           0 b              0 b        0 FLOPs        0 FLOPs
+  call_module          encoder_layers_encoder_layer_3_dropout          310.86 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                           add_7          315.47 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module             encoder_layers_encoder_layer_3_ln_2          315.48 Mb            12.31 Kb            0 b      4.62 Mb       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module            encoder_layers_encoder_layer_3_mlp_0          338.57 Mb            23.09 Mb       18.47 Mb          0 b       9.01 Mb          4.62 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module            encoder_layers_encoder_layer_3_mlp_1          357.04 Mb            18.47 Mb       18.47 Mb          0 b           0 b         18.47 Mb    4.84 MFLOPs    4.84 MFLOPs
+  call_module            encoder_layers_encoder_layer_3_mlp_2          357.04 Mb                 0 b       18.47 Mb          0 b           0 b         18.47 Mb        0 FLOPs        0 FLOPs
+  call_module            encoder_layers_encoder_layer_3_mlp_3          357.04 Mb                 0 b            0 b      4.62 Mb       9.00 Mb         18.47 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module            encoder_layers_encoder_layer_3_mlp_4          357.04 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                           add_8          361.66 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module             encoder_layers_encoder_layer_4_ln_1          366.29 Mb             4.63 Mb        4.62 Mb          0 b       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module   encoder_layers_encoder_layer_4_self_attention          398.97 Mb            32.68 Mb            0 b      4.62 Mb       9.01 Mb         13.85 Mb    4.20 GFLOPs    8.40 GFLOPs
+call_function                                       getitem_8          398.97 Mb                 0 b            0 b      4.62 Mb           0 b              0 b        0 FLOPs        0 FLOPs
+call_function                                       getitem_9          398.97 Mb                 0 b            0 b          0 b           0 b              0 b        0 FLOPs        0 FLOPs
+  call_module          encoder_layers_encoder_layer_4_dropout          398.97 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                           add_9          403.58 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module             encoder_layers_encoder_layer_4_ln_2          403.60 Mb            12.31 Kb            0 b      4.62 Mb       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module            encoder_layers_encoder_layer_4_mlp_0          426.68 Mb            23.09 Mb       18.47 Mb          0 b       9.01 Mb          4.62 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module            encoder_layers_encoder_layer_4_mlp_1          445.15 Mb            18.47 Mb       18.47 Mb          0 b           0 b         18.47 Mb    4.84 MFLOPs    4.84 MFLOPs
+  call_module            encoder_layers_encoder_layer_4_mlp_2          445.15 Mb                 0 b       18.47 Mb          0 b           0 b         18.47 Mb        0 FLOPs        0 FLOPs
+  call_module            encoder_layers_encoder_layer_4_mlp_3          445.15 Mb                 0 b            0 b      4.62 Mb       9.00 Mb         18.47 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module            encoder_layers_encoder_layer_4_mlp_4          445.15 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                          add_10          449.77 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module             encoder_layers_encoder_layer_5_ln_1          454.40 Mb             4.63 Mb        4.62 Mb          0 b       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module   encoder_layers_encoder_layer_5_self_attention          487.08 Mb            32.68 Mb            0 b      4.62 Mb       9.01 Mb         13.85 Mb    4.20 GFLOPs    8.40 GFLOPs
+call_function                                      getitem_10          487.08 Mb                 0 b            0 b      4.62 Mb           0 b              0 b        0 FLOPs        0 FLOPs
+call_function                                      getitem_11          487.08 Mb                 0 b            0 b          0 b           0 b              0 b        0 FLOPs        0 FLOPs
+  call_module          encoder_layers_encoder_layer_5_dropout          487.08 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                          add_11          491.70 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module             encoder_layers_encoder_layer_5_ln_2          491.71 Mb            12.31 Kb            0 b      4.62 Mb       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module            encoder_layers_encoder_layer_5_mlp_0          514.79 Mb            23.09 Mb       18.47 Mb          0 b       9.01 Mb          4.62 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module            encoder_layers_encoder_layer_5_mlp_1          533.26 Mb            18.47 Mb       18.47 Mb          0 b           0 b         18.47 Mb    4.84 MFLOPs    4.84 MFLOPs
+  call_module            encoder_layers_encoder_layer_5_mlp_2          533.26 Mb                 0 b       18.47 Mb          0 b           0 b         18.47 Mb        0 FLOPs        0 FLOPs
+  call_module            encoder_layers_encoder_layer_5_mlp_3          533.26 Mb                 0 b            0 b      4.62 Mb       9.00 Mb         18.47 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module            encoder_layers_encoder_layer_5_mlp_4          533.26 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                          add_12          537.88 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module             encoder_layers_encoder_layer_6_ln_1          542.51 Mb             4.63 Mb        4.62 Mb          0 b       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module   encoder_layers_encoder_layer_6_self_attention          575.19 Mb            32.68 Mb            0 b      4.62 Mb       9.01 Mb         13.85 Mb    4.20 GFLOPs    8.40 GFLOPs
+call_function                                      getitem_12          575.19 Mb                 0 b            0 b      4.62 Mb           0 b              0 b        0 FLOPs        0 FLOPs
+call_function                                      getitem_13          575.19 Mb                 0 b            0 b          0 b           0 b              0 b        0 FLOPs        0 FLOPs
+  call_module          encoder_layers_encoder_layer_6_dropout          575.19 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                          add_13          579.81 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module             encoder_layers_encoder_layer_6_ln_2          579.82 Mb            12.31 Kb            0 b      4.62 Mb       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module            encoder_layers_encoder_layer_6_mlp_0          602.90 Mb            23.09 Mb       18.47 Mb          0 b       9.01 Mb          4.62 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module            encoder_layers_encoder_layer_6_mlp_1          621.37 Mb            18.47 Mb       18.47 Mb          0 b           0 b         18.47 Mb    4.84 MFLOPs    4.84 MFLOPs
+  call_module            encoder_layers_encoder_layer_6_mlp_2          621.37 Mb                 0 b       18.47 Mb          0 b           0 b         18.47 Mb        0 FLOPs        0 FLOPs
+  call_module            encoder_layers_encoder_layer_6_mlp_3          621.37 Mb                 0 b            0 b      4.62 Mb       9.00 Mb         18.47 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module            encoder_layers_encoder_layer_6_mlp_4          621.37 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                          add_14          625.99 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module             encoder_layers_encoder_layer_7_ln_1          630.62 Mb             4.63 Mb        4.62 Mb          0 b       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module   encoder_layers_encoder_layer_7_self_attention          663.30 Mb            32.68 Mb            0 b      4.62 Mb       9.01 Mb         13.85 Mb    4.20 GFLOPs    8.40 GFLOPs
+call_function                                      getitem_14          663.30 Mb                 0 b            0 b      4.62 Mb           0 b              0 b        0 FLOPs        0 FLOPs
+call_function                                      getitem_15          663.30 Mb                 0 b            0 b          0 b           0 b              0 b        0 FLOPs        0 FLOPs
+  call_module          encoder_layers_encoder_layer_7_dropout          663.30 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                          add_15          667.92 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module             encoder_layers_encoder_layer_7_ln_2          667.93 Mb            12.31 Kb            0 b      4.62 Mb       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module            encoder_layers_encoder_layer_7_mlp_0          691.02 Mb            23.09 Mb       18.47 Mb          0 b       9.01 Mb          4.62 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module            encoder_layers_encoder_layer_7_mlp_1          709.48 Mb            18.47 Mb       18.47 Mb          0 b           0 b         18.47 Mb    4.84 MFLOPs    4.84 MFLOPs
+  call_module            encoder_layers_encoder_layer_7_mlp_2          709.48 Mb                 0 b       18.47 Mb          0 b           0 b         18.47 Mb        0 FLOPs        0 FLOPs
+  call_module            encoder_layers_encoder_layer_7_mlp_3          709.48 Mb                 0 b            0 b      4.62 Mb       9.00 Mb         18.47 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module            encoder_layers_encoder_layer_7_mlp_4          709.48 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                          add_16          714.10 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module             encoder_layers_encoder_layer_8_ln_1          718.73 Mb             4.63 Mb        4.62 Mb          0 b       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module   encoder_layers_encoder_layer_8_self_attention          751.41 Mb            32.68 Mb            0 b      4.62 Mb       9.01 Mb         13.85 Mb    4.20 GFLOPs    8.40 GFLOPs
+call_function                                      getitem_16          751.41 Mb                 0 b            0 b      4.62 Mb           0 b              0 b        0 FLOPs        0 FLOPs
+call_function                                      getitem_17          751.41 Mb                 0 b            0 b          0 b           0 b              0 b        0 FLOPs        0 FLOPs
+  call_module          encoder_layers_encoder_layer_8_dropout          751.41 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                          add_17          756.03 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module             encoder_layers_encoder_layer_8_ln_2          756.04 Mb            12.31 Kb            0 b      4.62 Mb       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module            encoder_layers_encoder_layer_8_mlp_0          779.13 Mb            23.09 Mb       18.47 Mb          0 b       9.01 Mb          4.62 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module            encoder_layers_encoder_layer_8_mlp_1          797.60 Mb            18.47 Mb       18.47 Mb          0 b           0 b         18.47 Mb    4.84 MFLOPs    4.84 MFLOPs
+  call_module            encoder_layers_encoder_layer_8_mlp_2          797.60 Mb                 0 b       18.47 Mb          0 b           0 b         18.47 Mb        0 FLOPs        0 FLOPs
+  call_module            encoder_layers_encoder_layer_8_mlp_3          797.60 Mb                 0 b            0 b      4.62 Mb       9.00 Mb         18.47 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module            encoder_layers_encoder_layer_8_mlp_4          797.60 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                          add_18          802.21 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module             encoder_layers_encoder_layer_9_ln_1          806.84 Mb             4.63 Mb        4.62 Mb          0 b       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module   encoder_layers_encoder_layer_9_self_attention          839.52 Mb            32.68 Mb            0 b      4.62 Mb       9.01 Mb         13.85 Mb    4.20 GFLOPs    8.40 GFLOPs
+call_function                                      getitem_18          839.52 Mb                 0 b            0 b      4.62 Mb           0 b              0 b        0 FLOPs        0 FLOPs
+call_function                                      getitem_19          839.52 Mb                 0 b            0 b          0 b           0 b              0 b        0 FLOPs        0 FLOPs
+  call_module          encoder_layers_encoder_layer_9_dropout          839.52 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                          add_19          844.14 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module             encoder_layers_encoder_layer_9_ln_2          844.15 Mb            12.31 Kb            0 b      4.62 Mb       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module            encoder_layers_encoder_layer_9_mlp_0          867.24 Mb            23.09 Mb       18.47 Mb          0 b       9.01 Mb          4.62 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module            encoder_layers_encoder_layer_9_mlp_1          885.71 Mb            18.47 Mb       18.47 Mb          0 b           0 b         18.47 Mb    4.84 MFLOPs    4.84 MFLOPs
+  call_module            encoder_layers_encoder_layer_9_mlp_2          885.71 Mb                 0 b       18.47 Mb          0 b           0 b         18.47 Mb        0 FLOPs        0 FLOPs
+  call_module            encoder_layers_encoder_layer_9_mlp_3          885.71 Mb                 0 b            0 b      4.62 Mb       9.00 Mb         18.47 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module            encoder_layers_encoder_layer_9_mlp_4          885.71 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                          add_20          890.32 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module            encoder_layers_encoder_layer_10_ln_1          894.95 Mb             4.63 Mb        4.62 Mb          0 b       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module  encoder_layers_encoder_layer_10_self_attention          927.63 Mb            32.68 Mb            0 b      4.62 Mb       9.01 Mb         13.85 Mb    4.20 GFLOPs    8.40 GFLOPs
+call_function                                      getitem_20          927.63 Mb                 0 b            0 b      4.62 Mb           0 b              0 b        0 FLOPs        0 FLOPs
+call_function                                      getitem_21          927.63 Mb                 0 b            0 b          0 b           0 b              0 b        0 FLOPs        0 FLOPs
+  call_module         encoder_layers_encoder_layer_10_dropout          927.63 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                          add_21          932.25 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module            encoder_layers_encoder_layer_10_ln_2          932.26 Mb            12.31 Kb            0 b      4.62 Mb       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module           encoder_layers_encoder_layer_10_mlp_0          955.35 Mb            23.09 Mb       18.47 Mb          0 b       9.01 Mb          4.62 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module           encoder_layers_encoder_layer_10_mlp_1          973.82 Mb            18.47 Mb       18.47 Mb          0 b           0 b         18.47 Mb    4.84 MFLOPs    4.84 MFLOPs
+  call_module           encoder_layers_encoder_layer_10_mlp_2          973.82 Mb                 0 b       18.47 Mb          0 b           0 b         18.47 Mb        0 FLOPs        0 FLOPs
+  call_module           encoder_layers_encoder_layer_10_mlp_3          973.82 Mb                 0 b            0 b      4.62 Mb       9.00 Mb         18.47 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module           encoder_layers_encoder_layer_10_mlp_4          973.82 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                          add_22          978.44 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module            encoder_layers_encoder_layer_11_ln_1          983.06 Mb             4.63 Mb        4.62 Mb          0 b       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module  encoder_layers_encoder_layer_11_self_attention         1015.75 Mb            32.68 Mb            0 b      4.62 Mb       9.01 Mb         13.85 Mb    4.20 GFLOPs    8.40 GFLOPs
+call_function                                      getitem_22         1015.75 Mb                 0 b            0 b      4.62 Mb           0 b              0 b        0 FLOPs        0 FLOPs
+call_function                                      getitem_23         1015.75 Mb                 0 b            0 b          0 b           0 b              0 b        0 FLOPs        0 FLOPs
+  call_module         encoder_layers_encoder_layer_11_dropout         1015.75 Mb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                          add_23         1020.36 Mb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module            encoder_layers_encoder_layer_11_ln_2         1020.38 Mb            12.31 Kb            0 b      4.62 Mb       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+  call_module           encoder_layers_encoder_layer_11_mlp_0            1.02 Gb            23.09 Mb       18.47 Mb          0 b       9.01 Mb          4.62 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module           encoder_layers_encoder_layer_11_mlp_1            1.04 Gb            18.47 Mb       18.47 Mb          0 b           0 b         18.47 Mb    4.84 MFLOPs    4.84 MFLOPs
+  call_module           encoder_layers_encoder_layer_11_mlp_2            1.04 Gb                 0 b       18.47 Mb          0 b           0 b         18.47 Mb        0 FLOPs        0 FLOPs
+  call_module           encoder_layers_encoder_layer_11_mlp_3            1.04 Gb                 0 b            0 b      4.62 Mb       9.00 Mb         18.47 Mb    3.72 GFLOPs    7.44 GFLOPs
+  call_module           encoder_layers_encoder_layer_11_mlp_4            1.04 Gb                 0 b            0 b      4.62 Mb           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+call_function                                          add_24            1.04 Gb             4.62 Mb        4.62 Mb          0 b           0 b          9.23 Mb    1.21 MFLOPs        0 FLOPs
+  call_module                                      encoder_ln            1.04 Gb            36.31 Kb       24.00 Kb          0 b       6.00 Kb          4.62 Mb    6.05 MFLOPs    6.05 MFLOPs
+call_function                                      getitem_24            1.04 Gb                 0 b       24.00 Kb          0 b           0 b          4.62 Mb        0 FLOPs        0 FLOPs
+  call_module                                      heads_head            1.04 Gb                 0 b            0 b     31.25 Kb       2.93 Mb         24.00 Kb    6.14 MFLOPs   12.30 MFLOPs
+       output                                          output            1.04 Gb                 0 b            0 b     31.25 Kb           0 b         31.25 Kb        0 FLOPs        0 FLOPs
+```
--- a/colossalai/_analyzer/_subclasses/__init__.py
+++ b/colossalai/_analyzer/_subclasses/__init__.py
+from ._meta_registration import *
+from ._monkey_patch import *
+from .flop_tensor import flop_count, flop_mapping
+from .meta_tensor import MetaTensor, MetaTensorMode
--- a/colossalai/_analyzer/_subclasses/_meta_registration.py
+++ b/colossalai/_analyzer/_subclasses/_meta_registration.py
+# meta patch from https://github.com/pytorch/pytorch/blob/master/torch/_meta_registrations.py
+# should be activated for PyTorch version 1.12.0 and below
+# refer to https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/native_functions.yaml
+# for more meta_registrations
+
+from typing import Callable, List, Optional, Tuple, Union
+
+import torch
+from packaging import version
+from torch.utils._pytree import tree_map
+
+aten = torch.ops.aten
+
+try:
+    meta_lib = torch.library.Library("aten", "IMPL", "Meta")
+except AttributeError:
+    meta_lib = None
+
+meta_table = {}
+
+orig_empty = torch.empty
+orig_empty_strided = torch.empty_strided
+orig_empty_like = torch.empty_like
+
+
+def new(*args, **kwargs):
+    return orig_empty(*args, **kwargs, device=torch.device('meta'))
+
+
+def new_strided(*args, **kwargs):
+    return orig_empty_strided(*args, **kwargs, device=torch.device('meta'))
+
+
+def new_like(*args, **kwargs):
+    return orig_empty_like(*args, **kwargs, device=torch.device('meta'))
+
+
+def register_meta(op, register_dispatcher=True):
+
+    def wrapper(f):
+
+        def add_func(op):
+            meta_table[op] = f
+            if register_dispatcher:
+                name = (op.__name__ if op._overloadname != "default" else op.overloadpacket.__name__)
+                try:
+                    meta_lib.impl(name, f)
+                except:
+                    pass
+
+        tree_map(add_func, op)
+        return f
+
+    return wrapper
+
+
+if version.parse(torch.__version__) >= version.parse('1.12.0'):
+    # ============================== Convolutions ======================================
+    # https://github.com/pytorch/pytorch/pull/79834
+    @register_meta(aten.convolution.default)
+    def meta_conv(
+        input_tensor: torch.Tensor,
+        weight: torch.Tensor,
+        bias: torch.Tensor,
+        stride: List[int],
+        padding: List[int],
+        dilation: List[int],
+        is_transposed: bool,
+        output_padding: List[int],
+        groups: int,
+    ):
+
+        def _formula(ln: int, p: int, d: int, k: int, s: int) -> int:
+            """
+            Formula to apply to calculate the length of some dimension of the output
+            See: https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
+            Args:
+                ln: length of the dimension
+                p: padding in that dim
+                d: dilation in that dim
+                k: kernel size in that dim
+                s: stride in that dim
+            Returns:
+                The output length
+            """
+            return (ln + 2 * p - d * (k - 1) - 1) // s + 1
+
+        def _formula_transposed(ln: int, p: int, d: int, k: int, s: int, op: int) -> int:
+            """
+            Formula to apply to calculate the length of some dimension of the output
+            if transposed convolution is used.
+            See: https://pytorch.org/docs/stable/generated/torch.nn.ConvTranspose2d.html
+            Args:
+                ln: length of the dimension
+                p: padding in that dim
+                d: dilation in that dim
+                k: kernel size in that dim
+                s: stride in that dim
+                op: output padding in that dim
+            Returns:
+                The output length
+            """
+            return (ln - 1) * s - 2 * p + d * (k - 1) + op + 1
+
+        def calc_conv_nd_return_shape(
+            dims: torch.Size,
+            kernel_size: torch.Size,
+            stride: Union[List[int], int],
+            padding: Union[List[int], int],
+            dilation: Union[List[int], int],
+            output_padding: Optional[Union[List[int], int]] = None,
+        ):
+            ret_shape = []
+            if isinstance(stride, int):
+                stride = [stride] * len(dims)
+            elif len(stride) == 1:
+                stride = [stride[0]] * len(dims)
+
+            if isinstance(padding, int):
+                padding = [padding] * len(dims)
+            elif len(padding) == 1:
+                padding = [padding[0]] * len(dims)
+
+            if isinstance(dilation, int):
+                dilation = [dilation] * len(dims)
+            elif len(dilation) == 1:
+                dilation = [dilation[0]] * len(dims)
+
+            output_padding_list: Optional[List[int]] = None
+            if output_padding:
+                if isinstance(output_padding, int):
+                    output_padding_list = [output_padding] * len(dims)
+                elif len(output_padding) == 1:
+                    output_padding_list = [output_padding[0]] * len(dims)
+                else:
+                    output_padding_list = output_padding
+
+            for i in range(len(dims)):
+                # If output_padding is present, we are dealing with a transposed convolution
+                if output_padding_list:
+                    ret_shape.append(
+                        _formula_transposed(
+                            dims[i],
+                            padding[i],
+                            dilation[i],
+                            kernel_size[i],
+                            stride[i],
+                            output_padding_list[i],
+                        ))
+                else:
+                    ret_shape.append(_formula(dims[i], padding[i], dilation[i], kernel_size[i], stride[i]))
+            return ret_shape
+
+        def pick_memory_format():
+            if input_tensor.is_contiguous(memory_format=torch.channels_last):
+                return torch.channels_last
+            elif input_tensor.is_contiguous(memory_format=torch.contiguous_format):
+                return torch.contiguous_format
+            elif input_tensor.is_contiguous(memory_format=torch.preserve_format):
+                return torch.preserve_format
+
+        kernel_size = weight.shape[2:]
+        dims = input_tensor.shape[2:]
+        if is_transposed:
+            out_channels = groups * weight.shape[1]
+
+            shape_out = calc_conv_nd_return_shape(
+                dims,
+                kernel_size,
+                stride,
+                padding,
+                dilation,
+                output_padding,
+            )
+
+        else:
+            out_channels = weight.shape[0]
+            if weight.shape[1] != input_tensor.shape[1] / groups:
+                raise RuntimeError("Invalid channel dimensions")
+            shape_out = calc_conv_nd_return_shape(dims, kernel_size, stride, padding, dilation)
+        out = input_tensor.new_empty((input_tensor.shape[0], out_channels, *shape_out))
+        mem_fmt = pick_memory_format()
+        out = out.to(memory_format=mem_fmt)    # type: ignore[call-overload]
+        return out
+
+    @register_meta(aten._convolution.default)
+    def meta__conv(input_tensor: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, stride: List[int],
+                   padding: List[int], dilation: List[int], is_transposed: bool, output_padding: List[int], groups: int,
+                   *extra_args):
+        out = meta_conv(input_tensor, weight, bias, stride, padding, dilation, is_transposed, output_padding, groups)
+        return out
+
+    @register_meta(aten.convolution_backward.default)
+    def meta_conv_backward(grad_output: torch.Tensor, input: torch.Tensor, weight: torch.Tensor, bias_sizes, stride,
+                           padding, dilation, transposed, output_padding, groups, output_mask):
+        return new_like(input), new_like(weight), new((bias_sizes))
+
+    # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/AdaptiveAveragePooling.cpp
+    @register_meta(aten._adaptive_avg_pool2d_backward.default)
+    def meta_adaptive_avg_pool2d_backward(
+        grad_output: torch.Tensor,
+        input: torch.Tensor,
+    ):
+        return new_like(input)
+
+    # ================================ RNN =============================================
+    # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cudnn/RNN.cpp
+    @register_meta(aten._cudnn_rnn.default)
+    def meta_cuda_rnn(
+        input,
+        weight,
+        weight_stride0,
+        weight_buf,
+        hx,
+        cx,
+        mode,
+        hidden_size,
+        proj_size,
+        num_layers,
+        batch_first,
+        dropout,
+        train,
+        bidirectional,
+        batch_sizes,
+        dropout_state,
+    ):
+
+        is_input_packed = len(batch_sizes) != 0
+        if is_input_packed:
+            seq_length = len(batch_sizes)
+            mini_batch = batch_sizes[0]
+            batch_sizes_sum = input.shape[0]
+        else:
+            seq_length = input.shape[1] if batch_first else input.shape[0]
+            mini_batch = input.shape[0] if batch_first else input.shape[1]
+            batch_sizes_sum = -1
+
+        num_directions = 2 if bidirectional else 1
+        out_size = proj_size if proj_size != 0 else hidden_size
+        if is_input_packed:
+            out_shape = [batch_sizes_sum, out_size * num_directions]
+        else:
+            out_shape = ([mini_batch, seq_length, out_size *
+                          num_directions] if batch_first else [seq_length, mini_batch, out_size * num_directions])
+        output = input.new_empty(out_shape)
+
+        cell_shape = [num_layers * num_directions, mini_batch, hidden_size]
+        cy = new(0) if cx is None else cx.new_empty(cell_shape)
+
+        hy = hx.new_empty([num_layers * num_directions, mini_batch, out_size])
+
+        # TODO: Query cudnnGetRNNTrainingReserveSize (expose to python)
+        reserve_shape = 0 if train else 0
+        reserve = input.new_empty(reserve_shape, dtype=torch.uint8)
+
+        return output, hy, cy, reserve, weight_buf
+
+    # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cudnn/RNN.cpp
+    @register_meta(aten._cudnn_rnn_backward.default)
+    def meta_cudnn_rnn_backward(input: torch.Tensor,
+                                weight: torch.Tensor,
+                                weight_stride0: int,
+                                hx: torch.Tensor,
+                                cx: Optional[torch.Tensor] = None,
+                                *args,
+                                **kwargs):
+        return new_like(input), new_like(weight), new_like(hx), new_like(cx) if cx is not None else new(
+            ())    # (grad_input, grad_weight, grad_hx, grad_cx)
+
+    # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/Activation.cpp
+    # ============================== Activations =======================================
+    _unregistered_ewise = [
+        aten.relu.default,
+        aten.prelu.default,
+        aten.hardswish.default,
+        aten.hardtanh.default,
+        aten.hardswish_backward.default,
+        aten.hardtanh_backward.default,
+    ]
+
+    if version.parse(torch.__version__) < version.parse('2.0.0'):
+        _unregistered_ewise += [
+            aten.prelu_backward.default,
+        ]
+
+    @register_meta(_unregistered_ewise)
+    def meta_unregistered_ewise(input: torch.Tensor, *args):
+        return new_like(input)
+
+    # ============================== Normalization =====================================
+    # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cudnn/BatchNorm.cpp
+    @register_meta(aten.native_batch_norm.default)
+    def meta_bn(input: torch.Tensor, weight, bias, running_mean, running_var, training, momentum, eps):
+        n_input = input.size(1)
+        return new_like(input), new((n_input)), new((n_input))
+
+    # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cudnn/BatchNorm.cpp
+    @register_meta(aten.native_batch_norm_backward.default)
+    def meta_bn_backward(dY: torch.Tensor, input: torch.Tensor, weight: torch.Tensor, running_mean, running_var,
+                         save_mean, save_invstd, train, eps, output_mask):
+        return new_like(input), new_like(weight), new_like(weight)    # (dX, dgamma, dbeta)
+
+    # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cudnn/BatchNorm.cpp
+    @register_meta(aten.cudnn_batch_norm.default)
+    def meta_cudnn_bn(input: torch.Tensor, weight, bias, running_mean, running_var, training, momentum, eps):
+        n_input = input.size(1)
+        return new_like(input), new((n_input)), new((n_input)), new(
+            (0), dtype=torch.uint8)    # (output, running_mean, running_var, reserve)
+
+    # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/cudnn/BatchNorm.cpp
+    # NB: CuDNN only implements the backward algorithm for batchnorm
+    # in training mode (evaluation mode batchnorm has a different algorithm),
+    # which is why this doesn't accept a 'training' parameter.
+    @register_meta(aten.cudnn_batch_norm_backward.default)
+    def meta_cudnn_bn_backward(dY: torch.Tensor, input: torch.Tensor, weight: torch.Tensor, running_mean, running_var,
+                               save_mean, save_invstd, eps, reserve):
+        return new_like(input), new_like(weight), new_like(weight)    # (dX, dgamma, dbeta)
+
+    # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/layer_norm.cpp
+    @register_meta(aten.native_layer_norm.default)
+    def meta_ln(input: torch.Tensor, normalized_shape, weight, bias, eps):
+        bs, n_input = input.size(0), input.size(1)
+        return new_like(input), new((bs, n_input, 1)), new((bs, n_input, 1))    # (output, running_mean, running_var)
+
+    # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/layer_norm.cpp
+    @register_meta(aten.native_layer_norm_backward.default)
+    def meta_ln_backward(dY: torch.Tensor, input: torch.Tensor, normalized_shape, mean, rstd, weight, bias,
+                         grad_input_mask):
+        return new_like(input), new_like(weight), new_like(bias)    # (dX, dgamma, dbeta)
+
+    # ================================== Misc ==========================================
+    # Maybe incorrect
+    # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/Im2Col.cpp
+    @register_meta(aten.im2col.default)
+    def meta_im2col(input: torch.Tensor, kernel_size, dilation, padding, stride):
+        return new_like(input)
+
+    # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/native_functions.yaml
+    @register_meta(aten.roll.default)
+    def meta_roll(input: torch.Tensor, shifts, dims):
+        return input
+
+    # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/Scalar.cpp
+    @register_meta(aten._local_scalar_dense.default)
+    def meta_local_scalar_dense(self: torch.Tensor):
+        return 0
+
+    # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/TensorCompare.cpp
+    @register_meta(aten.where.self)
+    def meta_where_self(condition: torch.Tensor, self: torch.Tensor, other: torch.Tensor):
+        result_type = torch.result_type(self, other)
+        return new_like(condition + self + other, dtype=result_type)
+
+    # ============================== Embedding =========================================
+    # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/Embedding.cpp
+
+    @register_meta(aten.embedding_dense_backward.default)
+    def meta_embedding_dense_backward(grad_output: torch.Tensor, indices: torch.Tensor, num_weights, padding_idx,
+                                      scale_grad_by_freq):
+        return new((num_weights, grad_output.size(-1)), dtype=grad_output.dtype, layout=grad_output.layout)
+
+    # ============================== Dropout ===========================================
+    # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/Dropout.cpp
+    @register_meta(aten.native_dropout.default)
+    def meta_native_dropout_default(input: torch.Tensor, p: float, train: bool = False):
+        # notice that mask is bool
+        return new_like(input), new_like(input, dtype=torch.bool)    # (output, mask)
+
+    # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/Dropout.cpp
+    @register_meta(aten.native_dropout_backward.default)
+    def meta_native_dropout_backward_default(grad: torch.Tensor, mask: torch.Tensor, scale: float):
+        return new_like(grad)    # (grad_in)
+
+    if version.parse(torch.__version__) < version.parse('1.13.0'):
+        # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/native_functions.yaml
+        @register_meta(aten.eye.m_out)
+        def meta_eye(n: int, m: int, out: torch.Tensor):
+            return out
+
+        @register_meta(aten.index.Tensor)
+        def meta_index_Tensor(self, indices):
+            assert indices, "at least one index must be provided"
+            # aten::index is the internal advanced indexing implementation
+            # checkIndexTensorTypes and expandTensors
+            result: List[Optional[torch.Tensor]] = []
+            for i, index in enumerate(indices):
+                if index is not None:
+                    assert index.dtype in [torch.long, torch.int8, torch.bool],\
+                        "tensors used as indices must be long, byte or bool tensors"
+                    if index.dtype in [torch.int8, torch.bool]:
+                        nonzero = index.nonzero()
+                        k = len(result)
+                        assert k + index.ndim <= self.ndim, f"too many indices for tensor of dimension {self.ndim}"
+                        for j in range(index.ndim):
+                            assert index.shape[j] == self.shape[
+                                k +
+                                j], f"The shape of the mask {index.shape} at index {i} does not match the shape of the indexed tensor {self.shape} at index {k + j}"
+                            result.append(nonzero.select(1, j))
+                    else:
+                        result.append(index)
+                else:
+                    result.append(index)
+            indices = result
+            assert len(
+                indices) <= self.ndim, f"too many indices for tensor of dimension {self.ndim} (got {len(indices)})"
+            # expand_outplace
+            import torch._refs as refs
+
+            indices = list(refs._maybe_broadcast(*indices))
+            # add missing null tensors
+            while len(indices) < self.ndim:
+                indices.append(None)
+
+            # hasContiguousSubspace
+            #   true if all non-null tensors are adjacent
+            # See:
+            # https://numpy.org/doc/stable/user/basics.indexing.html#combining-advanced-and-basic-indexing
+            # https://stackoverflow.com/questions/53841497/why-does-numpy-mixed-basic-advanced-indexing-depend-on-slice-adjacency
+            state = 0
+            has_contiguous_subspace = False
+            for index in indices:
+                if state == 0:
+                    if index is not None:
+                        state = 1
+                elif state == 1:
+                    if index is None:
+                        state = 2
+                else:
+                    if index is not None:
+                        break
+            else:
+                has_contiguous_subspace = True
+
+            # transposeToFront
+            # This is the logic that causes the newly inserted dimensions to show up
+            # at the beginning of the tensor, if they're not contiguous
+            if not has_contiguous_subspace:
+                dims = []
+                transposed_indices = []
+                for i, index in enumerate(indices):
+                    if index is not None:
+                        dims.append(i)
+                        transposed_indices.append(index)
+                for i, index in enumerate(indices):
+                    if index is None:
+                        dims.append(i)
+                        transposed_indices.append(index)
+                self = self.permute(dims)
+                indices = transposed_indices
+
+            # AdvancedIndex::AdvancedIndex
+            # Now we can assume the indices have contiguous subspace
+            # This is simplified from AdvancedIndex which goes to more effort
+            # to put the input and indices in a form so that TensorIterator can
+            # take them.  If we write a ref for this, probably that logic should
+            # get implemented
+            before_shape: List[int] = []
+            after_shape: List[int] = []
+            replacement_shape: List[int] = []
+            for dim, index in enumerate(indices):
+                if index is None:
+                    if replacement_shape:
+                        after_shape.append(self.shape[dim])
+                    else:
+                        before_shape.append(self.shape[dim])
+                else:
+                    replacement_shape = list(index.shape)
+            return self.new_empty(before_shape + replacement_shape + after_shape)
--- a/colossalai/_analyzer/_subclasses/_monkey_patch.py
+++ b/colossalai/_analyzer/_subclasses/_monkey_patch.py
+import torch
+import torch.distributed as dist
+from packaging import version
+
+__all__ = [
+    "_TorchFactoryMethod",
+    "_TorchOverrideableFactoryMethod",
+    "_TorchNonOverrideableFactoryMethod",
+    "_TensorPropertyMethod",
+    "_DistCommMethod",
+    "_AliasATen",
+    "_InplaceATen",
+    "_MaybeInplaceATen",
+]
+
+_TorchOverrideableFactoryMethod = [
+    "empty",
+    "eye",
+    "full",
+    "ones",
+    "rand",
+    "randn",
+    "zeros",
+]
+
+_TorchNonOverrideableFactoryMethod = [
+    "arange",
+    "finfo",
+    "linspace",
+    "logspace",
+    "randint",
+    "randperm",
+    "tensor",
+]
+
+_TorchFactoryMethod = _TorchOverrideableFactoryMethod + _TorchNonOverrideableFactoryMethod
+
+_TensorPropertyMethod = ["dtype", "shape", "device", "requires_grad", "grad", "grad_fn", "data"]
+
+_DistCommMethod = [
+    "all_gather",
+    "all_reduce",
+    "all_to_all",
+    "broadcast",
+    "gather",
+    "reduce",
+    "reduce_scatter",
+    "scatter",
+]
+
+if version.parse(torch.__version__) >= version.parse('1.12.0'):
+    aten = torch.ops.aten
+    # TODO: dive deep here
+    # refer to https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/TensorShape.cpp
+    _AliasATen = [
+        aten.detach.default,
+        aten.detach_.default,
+        aten.t.default,
+        aten.transpose.int,
+        aten.view.default,
+        aten._unsafe_view.default,
+        aten._reshape_alias.default,
+    ]
+
+    _InplaceATen = [
+        aten.add_.Tensor,
+        aten.add_.Scalar,
+        aten.sub_.Tensor,
+        aten.sub_.Scalar,
+        aten.mul_.Tensor,
+        aten.mul_.Scalar,
+        aten.div_.Tensor,
+        aten.div_.Scalar,
+        aten.pow_.Tensor,
+        aten.pow_.Scalar,
+    ]
+
+    # use `MaybeInplace` because they call ``as_strided()`` or ``slice()``
+    _MaybeInplaceATen = [
+        aten.diagonal.default,
+        aten.expand.default,
+        aten.select.int,
+        aten.slice.Tensor,
+        aten.split.Tensor,
+        aten.squeeze.default,
+        aten.permute.default,
+        aten.unsqueeze.default,
+        aten.as_strided.default,
+    ]
+else:
+    _AliasATen = []
+    _InplaceATen = []
+    _MaybeInplaceATen = []
--- a/colossalai/_analyzer/_subclasses/flop_tensor.py
+++ b/colossalai/_analyzer/_subclasses/flop_tensor.py
+# adopted from https://github.com/facebookresearch/fvcore/blob/main/fvcore/nn/jit_handles.py
+# ideas from https://pastebin.com/AkvAyJBw
+# and https://dev-discuss.pytorch.org/t/the-ideal-pytorch-flop-counter-with-torch-dispatch/505
+
+import operator
+from collections import defaultdict
+from contextlib import contextmanager
+from enum import Enum, auto
+from functools import partial, reduce
+from numbers import Number
+from typing import Any, Callable, List, Optional, Union
+
+import torch
+from packaging import version
+from torch.utils._pytree import tree_map
+
+from .meta_tensor import MetaTensor
+
+aten = torch.ops.aten
+
+
+class Phase(Enum):
+    FWD = auto()
+    BWD = auto()
+
+
+def normalize_tuple(x):
+    if not isinstance(x, tuple):
+        return (x,)
+    return x
+
+
+def _format_flops(flop):
+    K = 1e3
+    M = 1e6
+    B = 1e9
+    T = 1e12
+    if flop < K:
+        return f'{flop:.2f}'
+    elif flop < M:
+        return f'{flop / K:.2f}K'
+    elif flop < B:
+        return f'{flop / M:.2f}M'
+    elif flop < T:
+        return f'{flop / B:.2f}B'
+    else:
+        return f'{flop / T:.2f}T'
+
+
+def flop_count(module: Union[torch.nn.Module, Callable] = None, *args, verbose: bool = False, **kwargs) -> Number:
+    """
+    Count the number of floating point operations in a model.
+    Ideas from https://pastebin.com/AkvAyJBw.
+    Args:
+        module (torch.nn.Module): A PyTorch model.
+        *args: Input arguments to the model.
+        verbose (bool): If True, print the number of flops for each module.
+        **kwargs: Input keyword arguments to the model.
+    Returns:
+        Number: The total number of floating point operations (FWD + BWD).
+    """
+    maybe_inplace = (getattr(module, 'inplace', False) or kwargs.get('inplace', False)
+                     or getattr(module, '__name__', None) in ('add_', 'mul_', 'div_', 'sub_'))
+
+    class DummyModule(torch.nn.Module):
+
+        def __init__(self, func):
+            super().__init__()
+            self.func = func
+            self.__name__ = func.__name__
+
+        def forward(self, *args, **kwargs):
+            return self.func(*args, **kwargs)
+
+    total_flop_count = {Phase.FWD: 0, Phase.BWD: 0}
+    flop_counts = defaultdict(lambda: defaultdict(int))
+    parents = ['Global']
+    module = module if isinstance(module, torch.nn.Module) else DummyModule(module)
+
+    class FlopTensor(MetaTensor):
+        _tensor: torch.Tensor
+
+        def __repr__(self):
+            name = 'FlopParameter' if getattr(self, '_is_param', False) else 'FlopTensor'
+            if self.grad_fn:
+                return f"{name}(..., size={tuple(self.shape)}, device='{self.device}', dtype={self.dtype}, grad_fn={self.grad_fn})"
+            return f"{name}(..., size={tuple(self.shape)}, device='{self.device}', dtype={self.dtype})"
+
+        @classmethod
+        def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
+
+            # no_dispatch is only needed if you use enable_python_mode.
+            # It prevents infinite recursion.
+            rs = super().__torch_dispatch__(func, types, args, kwargs)
+
+            outs = normalize_tuple(rs)
+
+            if func in flop_mapping:
+                nonlocal flop_counts, total_flop_count
+                flop_count = flop_mapping[func](args, outs)
+                for par in parents:
+                    flop_counts[par][func.__name__] += flop_count
+                total_flop_count[cur_phase] += flop_count
+
+            def wrap(x):
+                if isinstance(x, MetaTensor):
+                    x = FlopTensor(x)
+                return x
+
+            rs = tree_map(wrap, rs)
+
+            return rs
+
+    def is_autogradable(x):
+        return isinstance(x, torch.Tensor) and x.is_floating_point()
+
+    def create_backwards_push(name):
+
+        class PushState(torch.autograd.Function):
+
+            @staticmethod
+            def forward(ctx, *args):
+                args = tree_map(lambda x: x.clone() if isinstance(x, torch.Tensor) else x, args)
+                if len(args) == 1:
+                    return args[0]
+                return args
+
+            @staticmethod
+            def backward(ctx, *grad_outs):
+                nonlocal parents
+                parents.append(name)
+                return grad_outs
+
+        return PushState.apply
+
+    def create_backwards_pop(name):
+
+        class PopState(torch.autograd.Function):
+
+            @staticmethod
+            def forward(ctx, *args):
+                args = tree_map(lambda x: x.clone() if isinstance(x, torch.Tensor) else x, args)
+                if len(args) == 1:
+                    return args[0]
+                return args
+
+            @staticmethod
+            def backward(ctx, *grad_outs):
+                nonlocal parents
+                assert (parents[-1] == name)
+                parents.pop()
+                return grad_outs
+
+        return PopState.apply
+
+    def enter_module(name):
+
+        def f(module, inputs):
+            nonlocal parents
+            parents.append(name)
+            inputs = normalize_tuple(inputs)
+            out = create_backwards_pop(name)(*inputs)
+            return out
+
+        return f
+
+    def exit_module(name):
+
+        def f(module, inputs, outputs):
+            nonlocal parents
+            assert (parents[-1] == name)
+            parents.pop()
+            outputs = normalize_tuple(outputs)
+            return create_backwards_push(name)(*outputs)
+
+        return f
+
+    @contextmanager
+    def instrument_module(mod):
+        registered = []
+        for name, module in dict(mod.named_children()).items():
+            registered.append(module.register_forward_pre_hook(enter_module(name)))
+            registered.append(module.register_forward_hook(exit_module(name)))
+        yield
+        for handle in registered:
+            handle.remove()
+
+    def display_flops():
+        for mod in flop_counts.keys():
+            print(f"Module: ", mod)
+            for k, v in flop_counts[mod].items():
+                print('\t', k, _format_flops(v))
+            print()
+
+    def detach_variables(r):
+        if isinstance(r, torch.Tensor):
+            requires_grad = r.requires_grad
+            r = r.detach()
+            r.requires_grad = requires_grad
+        return r
+
+    def wrap(r):
+        if isinstance(r, torch.Tensor):
+            data_ptr_fn = getattr(r, '_tensor', r).data_ptr
+            r = FlopTensor(detach_variables(r))
+            if maybe_inplace:
+                r = r + 0
+            r._tensor.data_ptr = data_ptr_fn
+        return r
+
+    with instrument_module(module):
+        cur_phase = Phase.FWD
+        rst = module(*tree_map(wrap, args), **tree_map(wrap, kwargs))
+        rst = tuple(r for r in normalize_tuple(rst) if is_autogradable(r) and r.requires_grad)
+        cur_phase = Phase.BWD
+
+        if rst:
+            grad = [torch.zeros_like(t) for t in rst]
+            torch.autograd.backward(
+                rst,
+                grad,
+            )
+
+    if verbose:
+        display_flops()
+
+    return total_flop_count[Phase.FWD], total_flop_count[Phase.BWD]
+
+
+def matmul_flop_jit(inputs: List[Any], outputs: List[Any]) -> Number:
+    """
+    Count flops for matmul.
+    """
+    # Inputs should be a list of length 2.
+    # Inputs contains the shapes of two matrices.
+    input_shapes = [v.shape for v in inputs]
+    assert len(input_shapes) == 2, input_shapes
+
+    # There are three cases: 1) gemm, 2) gemv, 3) dot
+    if all(len(shape) == 2 for shape in input_shapes):
+        # gemm
+        assert input_shapes[0][-1] == input_shapes[1][-2], input_shapes
+    elif all(len(shape) == 1 for shape in input_shapes):
+        # dot
+        assert input_shapes[0][0] == input_shapes[1][0], input_shapes
+
+        # expand shape
+        input_shapes[0] = torch.Size([1, input_shapes[0][0]])
+        input_shapes[1] = torch.Size([input_shapes[1][0], 1])
+    else:
+        # gemv
+        if len(input_shapes[0]) == 1:
+            assert input_shapes[0][0] == input_shapes[1][-2], input_shapes
+            input_shapes.reverse()
+        else:
+            assert input_shapes[1][0] == input_shapes[0][-1], input_shapes
+
+        # expand the shape of the vector to [batch size, 1]
+        input_shapes[-1] = torch.Size([input_shapes[-1][-1], 1])
+    flops = reduce(operator.mul, input_shapes[0]) * input_shapes[-1][-1]
+    return flops
+
+
+def addmm_flop_jit(inputs: List[Any], outputs: List[Any]) -> Number:
+    """
+    Count flops for fully connected layers.
+    """
+    # Count flop for nn.Linear
+    # inputs is a list of length 3.
+    input_shapes = [v.shape for v in inputs[1:3]]
+    # input_shapes[0]: [batch size, input feature dimension]
+    # input_shapes[1]: [input feature dimension, output feature dimension]
+    assert len(input_shapes[0]) == 2, input_shapes[0]
+    assert len(input_shapes[1]) == 2, input_shapes[1]
+    batch_size, input_dim = input_shapes[0]
+    output_dim = input_shapes[1][1]
+    flops = batch_size * input_dim * output_dim
+    return flops
+
+
+def linear_flop_jit(inputs: List[Any], outputs: List[Any]) -> Number:
+    """
+    Count flops for the aten::linear operator.
+    """
+    # Inputs is a list of length 3; unlike aten::addmm, it is the first
+    # two elements that are relevant.
+    input_shapes = [v.shape for v in inputs[0:2]]
+    # input_shapes[0]: [dim0, dim1, ..., input_feature_dim]
+    # input_shapes[1]: [output_feature_dim, input_feature_dim]
+    assert input_shapes[0][-1] == input_shapes[1][-1]
+    flops = reduce(operator.mul, input_shapes[0]) * input_shapes[1][0]
+    return flops
+
+
+def bmm_flop_jit(inputs: List[Any], outputs: List[Any]) -> Number:
+    """
+    Count flops for the bmm operation.
+    """
+    # Inputs should be a list of length 2.
+    # Inputs contains the shapes of two tensor.
+    assert len(inputs) == 2, len(inputs)
+    input_shapes = [v.shape for v in inputs]
+    n, c, t = input_shapes[0]
+    d = input_shapes[-1][-1]
+    flops = n * c * t * d
+    return flops
+
+
+def conv_flop_count(
+    x_shape: List[int],
+    w_shape: List[int],
+    out_shape: List[int],
+    transposed: bool = False,
+) -> Number:
+    """
+    Count flops for convolution. Note only multiplication is
+    counted. Computation for addition and bias is ignored.
+    Flops for a transposed convolution are calculated as
+    flops = (x_shape[2:] * prod(w_shape) * batch_size).
+    Args:
+        x_shape (list(int)): The input shape before convolution.
+        w_shape (list(int)): The filter shape.
+        out_shape (list(int)): The output shape after convolution.
+        transposed (bool): is the convolution transposed
+    Returns:
+        int: the number of flops
+    """
+    batch_size = x_shape[0]
+    conv_shape = (x_shape if transposed else out_shape)[2:]
+    flops = batch_size * reduce(operator.mul, w_shape) * reduce(operator.mul, conv_shape)
+    return flops
+
+
+def conv_flop_jit(inputs: List[Any], outputs: List[Any]):
+    """
+    Count flops for convolution.
+    """
+    x, w = inputs[:2]
+    x_shape, w_shape, out_shape = (x.shape, w.shape, outputs[0].shape)
+    transposed = inputs[6]
+
+    return conv_flop_count(x_shape, w_shape, out_shape, transposed=transposed)
+
+
+def transpose_shape(shape):
+    return [shape[1], shape[0]] + list(shape[2:])
+
+
+def conv_backward_flop_jit(inputs: List[Any], outputs: List[Any]):
+    grad_out_shape, x_shape, w_shape = [i.shape for i in inputs[:3]]
+    output_mask = inputs[-1]
+    fwd_transposed = inputs[7]
+    flop_count = 0
+
+    if output_mask[0]:
+        grad_input_shape = outputs[0].shape
+        flop_count += conv_flop_count(grad_out_shape, w_shape, grad_input_shape, not fwd_transposed)
+    if output_mask[1]:
+        grad_weight_shape = outputs[1].shape
+        flop_count += conv_flop_count(transpose_shape(x_shape), grad_out_shape, grad_weight_shape, fwd_transposed)
+
+    return flop_count
+
+
+def norm_flop_counter(affine_arg_index: int, input_arg_index: int) -> Callable:
+    """
+    Args:
+        affine_arg_index: index of the affine argument in inputs
+    """
+
+    def norm_flop_jit(inputs: List[Any], outputs: List[Any]) -> Number:
+        """
+        Count flops for norm layers.
+        """
+        # Inputs[0] contains the shape of the input.
+        input_shape = inputs[input_arg_index].shape
+
+        has_affine = inputs[affine_arg_index].shape is not None if hasattr(inputs[affine_arg_index],
+                                                                           'shape') else inputs[affine_arg_index]
+        assert 2 <= len(input_shape) <= 5, input_shape
+        # 5 is just a rough estimate
+        flop = reduce(operator.mul, input_shape) * (5 if has_affine else 4)
+        return flop
+
+    return norm_flop_jit
+
+
+def batchnorm_flop_jit(inputs: List[Any], outputs: List[Any], training: bool = None) -> Number:
+    if training is None:
+        training = inputs[-3]
+    assert isinstance(training, bool), "Signature of aten::batch_norm has changed!"
+    if training:
+        return norm_flop_counter(1, 0)(inputs, outputs)    # pyre-ignore
+    has_affine = inputs[1].shape is not None
+    input_shape = reduce(operator.mul, inputs[0].shape)
+    return input_shape * (2 if has_affine else 1)
+
+
+def ewise_flop_counter(input_scale: float = 1, output_scale: float = 0) -> Callable:
+    """
+    Count flops by
+        input_tensor.numel() * input_scale + output_tensor.numel() * output_scale
+    Args:
+        input_scale: scale of the input tensor (first argument)
+        output_scale: scale of the output tensor (first element in outputs)
+    """
+
+    def ewise_flop(inputs: List[Any], outputs: List[Any]) -> Number:
+        ret = 0
+        if input_scale != 0:
+            shape = inputs[0].shape
+            ret += input_scale * reduce(operator.mul, shape) if shape else 0
+        if output_scale != 0:
+            shape = outputs[0].shape
+            ret += output_scale * reduce(operator.mul, shape) if shape else 0
+        return ret
+
+    return ewise_flop
+
+
+def zero_flop_jit(*args):
+    """
+        Count flops for zero flop layers.
+    """
+    return 0
+
+
+if version.parse(torch.__version__) >= version.parse('1.12.0'):
+    flop_mapping = {
+    # gemm
+        aten.mm.default: matmul_flop_jit,
+        aten.matmul.default: matmul_flop_jit,
+        aten.addmm.default: addmm_flop_jit,
+        aten.bmm.default: bmm_flop_jit,
+
+    # convolution
+        aten.convolution.default: conv_flop_jit,
+        aten._convolution.default: conv_flop_jit,
+        aten.convolution_backward.default: conv_backward_flop_jit,
+
+    # normalization
+        aten.native_batch_norm.default: batchnorm_flop_jit,
+        aten.native_batch_norm_backward.default: batchnorm_flop_jit,
+        aten.cudnn_batch_norm.default: batchnorm_flop_jit,
+        aten.cudnn_batch_norm_backward.default: partial(batchnorm_flop_jit, training=True),
+        aten.native_layer_norm.default: norm_flop_counter(2, 0),
+        aten.native_layer_norm_backward.default: norm_flop_counter(2, 0),
+
+    # pooling
+        aten.avg_pool1d.default: ewise_flop_counter(1, 0),
+        aten.avg_pool2d.default: ewise_flop_counter(1, 0),
+        aten.avg_pool2d_backward.default: ewise_flop_counter(0, 1),
+        aten.avg_pool3d.default: ewise_flop_counter(1, 0),
+        aten.avg_pool3d_backward.default: ewise_flop_counter(0, 1),
+        aten.max_pool1d.default: ewise_flop_counter(1, 0),
+        aten.max_pool2d.default: ewise_flop_counter(1, 0),
+        aten.max_pool3d.default: ewise_flop_counter(1, 0),
+        aten.max_pool1d_with_indices.default: ewise_flop_counter(1, 0),
+        aten.max_pool2d_with_indices.default: ewise_flop_counter(1, 0),
+        aten.max_pool2d_with_indices_backward.default: ewise_flop_counter(0, 1),
+        aten.max_pool3d_with_indices.default: ewise_flop_counter(1, 0),
+        aten.max_pool3d_with_indices_backward.default: ewise_flop_counter(0, 1),
+        aten._adaptive_avg_pool2d.default: ewise_flop_counter(1, 0),
+        aten._adaptive_avg_pool2d_backward.default: ewise_flop_counter(0, 1),
+        aten._adaptive_avg_pool3d.default: ewise_flop_counter(1, 0),
+        aten._adaptive_avg_pool3d_backward.default: ewise_flop_counter(0, 1),
+        aten.embedding_dense_backward.default: ewise_flop_counter(0, 1),
+        aten.embedding.default: ewise_flop_counter(1, 0),
+    }
+
+    ewise_flop_aten = [
+    # basic op
+        aten.add.Tensor,
+        aten.add_.Tensor,
+        aten.div.Tensor,
+        aten.div_.Tensor,
+        aten.div.Scalar,
+        aten.div_.Scalar,
+        aten.mul.Tensor,
+        aten.mul.Scalar,
+        aten.mul_.Tensor,
+        aten.neg.default,
+        aten.pow.Tensor_Scalar,
+        aten.rsub.Scalar,
+        aten.sum.default,
+        aten.sum.dim_IntList,
+        aten.mean.dim,
+
+    # activation op
+        aten.hardswish.default,
+        aten.hardswish_.default,
+        aten.hardswish_backward.default,
+        aten.hardtanh.default,
+        aten.hardtanh_.default,
+        aten.hardtanh_backward.default,
+        aten.hardsigmoid_backward.default,
+        aten.hardsigmoid.default,
+        aten.gelu.default,
+        aten.gelu_backward.default,
+        aten.silu.default,
+        aten.silu_.default,
+        aten.silu_backward.default,
+        aten.sigmoid.default,
+        aten.sigmoid_backward.default,
+        aten._softmax.default,
+        aten._softmax_backward_data.default,
+        aten.relu_.default,
+        aten.relu.default,
+        aten.tanh.default,
+        aten.tanh_backward.default,
+        aten.threshold_backward.default,
+
+    # dropout
+        aten.native_dropout.default,
+        aten.native_dropout_backward.default,
+
+    # distribution
+        aten.bernoulli_.float,
+
+    # where
+        aten.where.self,
+    ]
+    for op in ewise_flop_aten:
+        flop_mapping[op] = ewise_flop_counter(1, 0)
+
+    # fix-me: this will be removed in future
+    zero_flop_aten = [
+        aten.as_strided.default,
+        aten.as_strided_.default,
+        aten.cat.default,
+        aten.clone.default,
+        aten.copy_.default,
+        aten.detach.default,
+        aten.expand.default,
+        aten.empty_like.default,
+        aten.new_empty.default,
+        aten.new_empty_strided.default,
+        aten.ones_like.default,
+        aten._reshape_alias.default,
+        aten.select.int,
+        aten.select_backward.default,
+        aten.squeeze.dim,
+        aten.slice.Tensor,
+        aten.slice_backward.default,
+        aten.split.Tensor,
+        aten.permute.default,
+        aten.t.default,
+        aten.transpose.int,
+        aten._to_copy.default,
+        aten.unsqueeze.default,
+        aten.unbind.int,
+        aten._unsafe_view.default,
+        aten.view.default,
+        aten.zero_.default,
+        aten.zeros_like.default,
+    ]
+
+    for op in zero_flop_aten:
+        flop_mapping[op] = zero_flop_jit
+else:
+    flop_mapping = {}
+    elementwise_flop_aten = {}
+    zero_flop_aten = {}
--- a/colossalai/_analyzer/_subclasses/meta_tensor.py
+++ b/colossalai/_analyzer/_subclasses/meta_tensor.py
+import uuid
+from functools import partial
+
+import torch
+import torch.distributed as dist
+from torch.types import _bool, _device, _dtype
+from torch.utils._pytree import tree_flatten, tree_map
+
+from ._monkey_patch import _AliasATen, _DistCommMethod, _InplaceATen, _MaybeInplaceATen, _TorchOverrideableFactoryMethod
+
+__all__ = ['MetaTensor', 'MetaTensorMode']
+
+
+def register_storage(r, data_ptr_fn=None):
+    if isinstance(r, torch.Tensor):
+        if data_ptr_fn is not None:
+            r.data_ptr = data_ptr_fn
+        elif not r.data_ptr():
+            data_ptr = uuid.uuid1()
+            r.data_ptr = lambda: data_ptr
+
+
+def _normalize_tuple(x):
+    if not isinstance(x, tuple):
+        return (x,)
+    return x
+
+
+# a hack of inplace execution in PyTorch
+def _assert_alias(func):
+    return func in (_AliasATen + _InplaceATen + _MaybeInplaceATen    # TODO: check if should be this aggressive
+                   )
+
+
+class MetaTensor(torch.Tensor):
+    """
+    A wrapping tensor that hacks ``torch.autograd`` without patching more ``torch.ops.aten`` ops.
+    `device` is the device that ``MetaTensor`` is supposed to run on. Meta tensors give you the
+    ability to run PyTorch code without having to actually do computation through tensors
+    allocated on a `meta` device. Because the device is `meta`, meta tensors do not model
+    device propagation. ``MetaTensor`` extends its usage by carrying an additional `device`
+    which tracks devices that would have been used.
+
+    Reference:
+        https://github.com/pytorch/pytorch/blob/master/torch/_subclasses/fake_tensor.py
+    """
+
+    _tensor: torch.Tensor
+
+    @staticmethod
+    def __new__(cls, elem, device=None, data_ptr_fn=None):
+        requires_grad = elem.requires_grad
+        # Avoid multiple wrapping
+        while isinstance(elem, MetaTensor):
+            device = elem.device if device is None else device
+            elem = elem._tensor
+
+        # The wrapping tensor (MetaTensor) shouldn't hold any
+        # memory for the class in question, but it should still
+        # advertise the same device as before
+        r = torch.Tensor._make_wrapper_subclass(
+            cls,
+            elem.size(),
+            strides=elem.stride(),
+            storage_offset=elem.storage_offset(),
+            dtype=elem.dtype,
+            layout=elem.layout,
+            device=device or (elem.device if elem.device.type != 'meta' else torch.device('cpu')),
+            requires_grad=requires_grad)    # deceive the frontend for aten selections
+        r._tensor = elem
+        # ...the real tensor is held as an element on the tensor.
+        if not r._tensor.is_meta:
+            val = elem.data_ptr()
+            data_ptr_fn = lambda: val
+            r._tensor = r._tensor.to(torch.device('meta'))
+
+        # only tensor not on `meta` should be copied to `meta`
+        register_storage(r._tensor, data_ptr_fn)
+        if isinstance(elem, torch.nn.Parameter):
+            r = torch.nn.Parameter(r)
+        return r
+
+    def __repr__(self):
+        name = 'MetaParameter' if getattr(self, '_is_param', False) else 'MetaTensor'
+        if self.grad_fn:
+            return f"{name}(..., size={tuple(self.shape)}, device='{self.device}', dtype={self.dtype}, grad_fn={self.grad_fn})"
+        return f"{name}(..., size={tuple(self.shape)}, device='{self.device}', dtype={self.dtype})"
+
+    @classmethod
+    def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
+        device = None
+
+        def unwrap(x):
+            nonlocal device
+            if isinstance(x, MetaTensor):
+                device = x.device
+                x = x._tensor
+            elif isinstance(x, torch.Tensor):
+                device = x.device
+                x = x.to(torch.device('meta'))
+            return x
+
+        args = tree_map(unwrap, args)
+        kwargs = tree_map(unwrap, kwargs)
+
+        if 'device' in kwargs:
+            device = kwargs['device']
+            kwargs['device'] = torch.device('meta')
+
+        # run aten for backend=CPU but actually on backend=Meta
+        # here we detect whether or not the execution generates a physical copy
+        # of the input tensor
+        ret = func(*args, **kwargs)
+
+        if _assert_alias(func):
+            val = args[0].data_ptr()
+            tree_map(partial(register_storage, data_ptr_fn=lambda: val), _normalize_tuple(ret))
+
+        # Now, we want to continue propagating this tensor, so we rewrap Tensors in
+        # our custom tensor subclass
+        def wrap(x):
+            return MetaTensor(x, device=device) if isinstance(x, torch.Tensor) else x
+
+        return tree_map(wrap, ret)
+
+    def to(self, *args, **kwargs) -> torch.Tensor:
+        """An extension of `torch.Tensor.to()` to MetaTensor
+        Returns:
+            result (MetaTensor): MetaTensor
+        Usage:
+            >>> tensor = MetaTensor(torch.rand(10), device='cuda:100')
+            >>> tensor.to(torch.uint8)
+            MetaTensor(tensor(..., device='meta', size=(10,), dtype=torch.uint8), device='cuda:100')
+            >>> tensor.to(torch.device('cuda:42'))
+            MetaTensor(tensor(..., device='meta', size=(10,)), device='cuda:42')
+            >>> tensor.to('vulkan')
+            MetaTensor(tensor(..., device='meta', size=(10,)), device='vulkan')
+        """
+        # this imitates c++ function in the way of @overload
+        device = None
+
+        def replace(x):
+            nonlocal device
+            if isinstance(x, str) or isinstance(x, _device):
+                device = x
+                return torch.device('meta')
+            return x
+
+        elem = self._tensor.to(*tree_map(replace, args), **tree_map(replace, kwargs))
+        return MetaTensor(elem, device=device)
+
+    def cpu(self, *args, **kwargs):
+        if self.device.type == 'cpu':
+            return self.to(*args, **kwargs)
+        return self.to(*args, device='cpu', **kwargs)
+
+    def cuda(self, device=None, non_blocking=False):
+        if device is not None:
+            return self.to(device=device, non_blocking=non_blocking)
+        return self.to(device='cuda:0', non_blocking=non_blocking)
+
+    def data_ptr(self):
+        return self._tensor.data_ptr()
+
+
+class MetaTensorMode(object):
+    """
+    A context manager that enables MetaTensor mode.
+
+    Usage:
+        >>> with MetaTensorMode():
+        >>>     # all torch.xxx and torch.distributed.xxx will be replaced by patched functions
+        >>>     # and the actual execution will be on torch.device('meta')
+        >>>     a = torch.rand(100000, 100000)
+        >>>     b = torch.rand(100000, 100000)
+        >>>     c = torch.mm(a, b)
+    """
+
+    def __init__(self):
+        self.torch_overrides = {}    # override torch.xxx
+        self.dist_overrides = {}    # override torch.distributed.xxx
+
+    def __enter__(self):
+
+        def _dummy(*args, **kwargs):
+            pass
+
+        def _new(*args, orig_new=torch.empty, **kwargs):
+            return MetaTensor(orig_new(*args, **{
+                **kwargs, 'device': 'meta'
+            }),
+                              device=kwargs.get('device', torch.device('cpu')))
+
+        for func in _TorchOverrideableFactoryMethod:
+            self.torch_overrides[func] = getattr(torch, func)
+            setattr(torch, func, partial(_new, orig_new=getattr(torch, func)))
+
+        for func in _DistCommMethod:
+            self.dist_overrides[func] = getattr(dist, func)
+            setattr(dist, func, _dummy)
+
+    def __exit__(self, exc_type, exc_value, traceback):
+        for func, func_impl in self.torch_overrides.items():
+            setattr(torch, func, func_impl)
+
+        for func, func_impl in self.dist_overrides.items():
+            setattr(dist, func, func_impl)
--- a/colossalai/_analyzer/envs.py
+++ b/colossalai/_analyzer/envs.py
+from dataclasses import dataclass
+
+
+@dataclass
+class MeshConfig:
+    TFLOPS: float = 1.9e12
+    BANDWIDTH = 1.2e9