Commit 9fdb7dab authored by yuguo960516

bloom
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import oneflow as flow
from oneflow.utils.data import Sampler
class CyclicSampler(Sampler):
"""
This sampler supports cyclic sampling and works with both non-data-parallel
and data-parallel training.
Arguments:
dataset: dataset to be sampled.
micro_batch_size: batch size per model instance;
global_batch_size is micro_batch_size times data_parallel_size.
shuffle: whether to shuffle the dataset.
consumed_samples: the number of samples already consumed in previous training,
used for resuming training (default: ``0``).
data_parallel_rank: local rank for data parallelism.
data_parallel_size: the size of data parallelism.
seed: random seed, used for reproducing experiments (default: ``0``).
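Example:
A minimal usage sketch (``train_dataset`` is a placeholder for any map-style dataset):
.. code-block:: python
sampler = CyclicSampler(train_dataset, micro_batch_size=4, shuffle=True)
batch_indices = next(iter(sampler))  # a list of 4 dataset indices for this rank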
"""
def __init__(
self,
dataset,
micro_batch_size,
shuffle=False,
consumed_samples=0,
data_parallel_rank=0,
data_parallel_size=1,
seed=0,
):
self.dataset = dataset
self.data_size = len(self.dataset)
self.shuffle = shuffle
self.data_parallel_rank = data_parallel_rank
self.data_parallel_size = data_parallel_size
self.micro_batch_size = micro_batch_size
self.actual_batch_size = self.micro_batch_size * self.data_parallel_size
self.data_size_per_epoch = self.data_size // self.actual_batch_size * self.micro_batch_size
self.consumed_samples = consumed_samples
self.seed = seed
def __iter__(self):
"""divide the data into data_parallel_size buckets,
and shuffle it if `shuffle` is set to `True`.
Each processor samples from its own buckets and data_loader
will load the corresponding data.
"""
epoch = self.consumed_samples // self.data_size_per_epoch
current_epoch_samples = self.consumed_samples % self.data_size_per_epoch
batch = []
while True:
bucket_offset = current_epoch_samples // self.data_parallel_size
start_idx = self.data_parallel_rank * self.data_size_per_epoch
if self.shuffle:
generator = flow.Generator()
generator.manual_seed(self.seed + epoch)
random_idx = flow.randperm(self.data_size_per_epoch, generator=generator).tolist()
indices = [start_idx + x for x in random_idx[bucket_offset:]]
else:
seq_idx = flow.arange(self.data_size_per_epoch).tolist()
indices = [start_idx + x for x in seq_idx[bucket_offset:]]
epoch += 1
if hasattr(self.dataset, "supports_prefetch") and self.dataset.supports_prefetch:
self.dataset.prefetch(indices)
for idx in indices:
batch.append(idx)
if len(batch) == self.micro_batch_size:
self.consumed_samples += self.actual_batch_size
yield batch
batch = []
current_epoch_samples = 0
def __len__(self):
return self.data_size
def set_consumed_samples(self, consumed_samples):
"""You can recover the training iteration by setting `consumed_samples`."""
self.consumed_samples = consumed_samples
def set_epoch(self, epoch):
"""Used for restoring training status."""
self.epoch = epoch
class SingleRoundSampler(Sampler):
"""
This sampler supports single-round sampling and works with both non-data-parallel
and data-parallel training.
Arguments:
dataset: dataset to be sampled.
micro_batch_size: batch size per model instance; global_batch_size
is micro_batch_size times data_parallel_size.
shuffle: whether to shuffle the dataset.
data_parallel_rank: local rank for data parallelism.
data_parallel_size: the size of data parallelism.
seed: random seed, used for reproducing experiments (default: ``0``).
drop_last: whether to drop the remaining data (default: ``False``).
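Example:
A worked example of the resulting length: with ``data_size=10``, ``micro_batch_size=4``
and ``data_parallel_size=1``, ``len(sampler)`` is ``10 // 4 = 2`` when
``drop_last=True`` and ``(10 + 4 - 1) // 4 = 3`` otherwise.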
"""
def __init__(
self,
dataset,
micro_batch_size,
shuffle=False,
data_parallel_rank=0,
data_parallel_size=1,
seed=0,
drop_last=False,
):
self.dataset = dataset
self.data_size = len(self.dataset)
self.shuffle = shuffle
self.data_parallel_rank = data_parallel_rank
self.data_parallel_size = data_parallel_size
self.micro_batch_size = micro_batch_size
self.seed = seed
self.drop_last = drop_last
def __iter__(self):
bucket_size = self.data_size // self.data_parallel_size
remain = self.data_size % self.data_parallel_size
start_idx = self.data_parallel_rank * bucket_size
if self.data_parallel_rank < remain:
bucket_size += 1
start_idx += min(self.data_parallel_rank, remain)
if self.shuffle:
generator = flow.Generator()
generator.manual_seed(self.seed)
random_idx = flow.randperm(bucket_size, generator=generator).tolist()
indices = [start_idx + x for x in random_idx]
else:
seq_idx = flow.arange(bucket_size).tolist()
indices = [start_idx + x for x in seq_idx]
if hasattr(self.dataset, "supports_prefetch") and self.dataset.supports_prefetch:
self.dataset.prefetch(indices)
batch = []
for idx in indices:
batch.append(idx)
if len(batch) == self.micro_batch_size:
yield batch
batch = []
if not self.drop_last:
if self.data_parallel_rank >= remain and remain > 0:
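# Ranks with rank >= remain received one sample fewer; pad them with index 0
# so that every rank yields the same number of batches.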
batch.append(0)
if len(batch) > 0:
yield batch
def __len__(self):
global_batch_size = self.micro_batch_size * self.data_parallel_size
if self.drop_last:
return self.data_size // global_batch_size
else:
return (self.data_size + global_batch_size - 1) // global_batch_size
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import OrderedDict
from dataclasses import dataclass, field
from typing import Any, List
import oneflow as flow
from libai.utils import distributed as dist
@dataclass
class DistTensorData:
tensor: flow.Tensor
sbp_list: list = field(default_factory=lambda: ["split_0", "broadcast"])
placement_idx: int = 0
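# A minimal usage sketch (the tensor shape is illustrative):
#   data = DistTensorData(flow.zeros(2, 8))
#   data.to_global()  # the tensor becomes global with the default sbp_list
#                     # ["split_0", "broadcast"] on the stage-0 placement,
#                     # then moves to `placement_idx` if it is nonzero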
# Tensor-like methods
def to_global(self, sbp=None, placement=None, device_type="cuda"):
if sbp is not None:
self.sbp = sbp
else:
sbp_list = []
for sbp in self.sbp_list:
sbp = sbp.split("_")
if len(sbp) > 1:
# split dim
assert sbp[0] == "split"
split_dim = int(sbp[1])
sbp_list.append(flow.sbp.split(split_dim))
else:
sbp_sign = sbp[0]
sbp_list.append(getattr(flow.sbp, sbp_sign))
self.sbp = dist.get_nd_sbp(sbp_list)
if placement is not None:
self.tensor = self.tensor.to_global(sbp=self.sbp, placement=placement)
else:
# Convert the local tensor to a global tensor with the default setting
# if the placement parameter is not provided.
# When pipeline-parallel training is enabled, all devices are grouped into
# several device groups and the model is split into several stages;
# each stage is placed on the corresponding device group.
# For those tensors to be used in the last stage, we first convert them to
# global tensors that only retain the data on device group 0, then transfer
# the result to the last stage.
# We do this to make sure that all tensors used by the model are generated
# by the first device group, in case each device group applies random
# augmentations to the tensors without sharing the same global seed.
main_placement = dist.get_layer_placement(0, device_type)
self.tensor = self.tensor.to_global(sbp=self.sbp, placement=main_placement)
if self.placement_idx != 0:
self.tensor = self.tensor.to_global(
placement=dist.get_layer_placement(self.placement_idx, device_type)
)
@staticmethod
def stack(distTensor_lists: List["DistTensorData"]) -> "DistTensorData":
if not isinstance(distTensor_lists[0].tensor, flow.Tensor):
raise TypeError(
"DistTensorData.tensor must be a flow.Tensor, but got {}. "
"Please check the return values of `__getitem__` in dataset.".format(
type(distTensor_lists[0].tensor)
)
)
assert len(distTensor_lists) > 0
if len(distTensor_lists) == 1:
# TODO(l1aoxingyu): add inplace unsqueeze
# distTensor_lists[0].tensor.unsqueeze_(0) # add batch dim
distTensor_lists[0].tensor = distTensor_lists[0].tensor.unsqueeze(0) # add batch dim
return distTensor_lists[0]
tensor_size = distTensor_lists[0].tensor.size()
sbp_list = distTensor_lists[0].sbp_list
placement_idx = distTensor_lists[0].placement_idx
tensors = []
for data in distTensor_lists:
assert (
data.tensor.size() == tensor_size
), f"tensor shape is not equal, {data.tensor.size()} != {tensor_size}"
assert (
data.sbp_list == sbp_list
), f"sbp_list is not equal, {data.sbp_list} != {sbp_list}!"
assert (
data.placement_idx == placement_idx
), f"placement_idx is not equal, {data.placement_idx} != {placement_idx}"
tensors.append(data.tensor)
tensors = flow.stack(tensors, dim=0)
ret = DistTensorData(tensors, sbp_list=sbp_list, placement_idx=placement_idx)
return ret
class Instance:
"""
This class represents an instance with metadata as attributes.
It stores the attributes of an instance (e.g., image, tokens) as "fields".
All other (non-field) attributes of this class are considered private:
they must start with '_' and are not modifiable by a user.
Some basic usage:
1. Set/get/check a field:
.. code-block:: python
instance.tokens = Metadata(...)
instance.mask = Metadata(...)
print(instance.tokens)
print(instance.has("mask")) # True
2. ``len(instance)`` returns the number of fields
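3. Batch a list of instances (a sketch; every field must be stackable,
e.g. tensors of the same shape):
.. code-block:: python
batched = Instance.stack([instance_a, instance_b])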
"""
def __init__(self, **kwargs):
self._fields = OrderedDict()
for k, v in kwargs.items():
self.set(k, v)
def __setattr__(self, name: str, val: Any) -> None:
if name.startswith("_"):
super().__setattr__(name, val)
else:
self.set(name, val)
def __getattr__(self, name: str):
if name == "_fields" or name not in self._fields:
raise AttributeError(f"Cannot find field '{name}' in the given Instance!")
return self._fields[name]
def set(self, name: str, value: Any):
"""
Set the field named `name` to `value`.
"""
self._fields[name] = value
def has(self, name: str):
return name in self._fields
def remove(self, name: str):
del self._fields[name]
def get(self, name: str):
return self._fields[name]
def get_fields(self):
return self._fields
def __len__(self):
return len(self._fields.keys())
def __iter__(self):
raise NotImplementedError("`Instance` object is not iterable!")
@staticmethod
def stack(instance_lists: List["Instance"]) -> "Instance":
assert all(isinstance(i, Instance) for i in instance_lists)
assert len(instance_lists) > 0
ret = Instance()
for k in instance_lists[0]._fields.keys():
values = [i.get(k) for i in instance_lists]
v0 = values[0]
if isinstance(v0, flow.Tensor):
values = flow.stack(values, dim=0)
elif isinstance(v0, list):
pass
elif hasattr(type(v0), "stack"):
values = type(v0).stack(values)
else:
raise ValueError("Unsupported type {} for stack.".format(type(v0)))
ret.set(k, values)
return ret
def __str__(self):
s = self.__class__.__name__ + "("
s += "fields=[{}]".format(", ".join((f"{k}: {v}" for k, v in self._fields.items())))
return s
__repr__ = __str__
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .default import DefaultTrainer, default_setup
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
# Copyright (c) Facebook, Inc. and its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import math
import os
import time
from collections import OrderedDict
from typing import Callable, Optional
import oneflow as flow
from omegaconf import OmegaConf
from termcolor import colored
from libai.config import LazyConfig, instantiate, try_get_key
from libai.data import Instance
from libai.engine import hooks
from libai.engine.trainer import EagerTrainer, GraphTrainer, TrainerBase
from libai.evaluation import inference_on_dataset, print_csv_format
from libai.models import build_graph, build_model
from libai.optim import build_optimizer
from libai.scheduler import build_lr_scheduler
from libai.tokenizer import build_tokenizer
from libai.utils import distributed as dist
from libai.utils.checkpoint import Checkpointer
from libai.utils.events import CommonMetricPrinter, JSONWriter, TensorboardXWriter
from libai.utils.logger import setup_logger
# --------------------------------------------------------
# References:
# https://github.com/facebookresearch/detectron2/blob/main/detectron2/engine/defaults.py
# --------------------------------------------------------
def _highlight(code, filename):
try:
import pygments
except ImportError:
return code
from pygments.formatters import Terminal256Formatter
from pygments.lexers import Python3Lexer, YamlLexer
lexer = Python3Lexer() if filename.endswith(".py") else YamlLexer()
code = pygments.highlight(code, lexer, Terminal256Formatter(style="monokai"))
return code
def _check_batch_size(cfg):
train_micro_batch_size = try_get_key(cfg, "train.train_micro_batch_size", default=None)
global_batch_size = try_get_key(cfg, "train.global_batch_size", default=None)
num_accumulation_steps = try_get_key(cfg, "train.num_accumulation_steps", default=None)
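# The three values above are tied by the invariant
#   global_batch_size == train_micro_batch_size * data_parallel_size * num_accumulation_steps
# e.g. train_micro_batch_size=4, data_parallel_size=2, num_accumulation_steps=8
# gives global_batch_size=64. Whichever values are missing are derived below.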
if train_micro_batch_size is not None and global_batch_size is not None:
if num_accumulation_steps is None:
if global_batch_size % (train_micro_batch_size * dist.get_data_parallel_size()) != 0:
raise ValueError(
f"global_batch_size {global_batch_size} must be divisible by "
"train_micro_batch_size * data_parallel_size "
f"({train_micro_batch_size} * {dist.get_data_parallel_size()})"
)
cfg.train.num_accumulation_steps = global_batch_size // (
train_micro_batch_size * dist.get_data_parallel_size()
)
else:
if (
global_batch_size
!= train_micro_batch_size * dist.get_data_parallel_size() * num_accumulation_steps
):
raise ValueError(
f"global_batch_size {global_batch_size} must equal to "
"train_micro_batch_size * data_parallel_size * num_accumulation_steps "
f"({train_micro_batch_size} * {dist.get_data_parallel_size()} * {num_accumulation_steps})" # noqa
)
elif train_micro_batch_size is not None and global_batch_size is None:
if num_accumulation_steps is None:
cfg.train.num_accumulation_steps = 1
cfg.train.global_batch_size = (
train_micro_batch_size
* dist.get_data_parallel_size()
* cfg.train.num_accumulation_steps
)
elif train_micro_batch_size is None and global_batch_size is not None:
if num_accumulation_steps is None:
cfg.train.num_accumulation_steps = 1
if (
global_batch_size % (dist.get_data_parallel_size() * cfg.train.num_accumulation_steps)
!= 0
):
raise ValueError(
f"global_batch_size {global_batch_size} must be divisible by "
"data_parallel_size * num_accumulation_steps "
f"({dist.get_data_parallel_size()} * {cfg.train.num_accumulation_steps})"
)
cfg.train.train_micro_batch_size = global_batch_size // (
dist.get_data_parallel_size() * cfg.train.num_accumulation_steps
)
else:
raise ValueError("train_micro_batch_size and global_batch_size must be set either")
# Set total training samples.
cfg.train.samples = cfg.train.train_iter * cfg.train.global_batch_size
def _compile_dependencies():
logger = logging.getLogger(__name__)
# =========================
# Compile dataset C++ code.
# =========================
# TODO: move this to ninja
if dist.get_local_rank() == 0:
start_time = time.time()
logger.info("> compiling dataset index builder ...")
from libai.data.data_utils import compile_helper
compile_helper()
logger.info(
">>> done with dataset index builder. Compilation time: {:.3f} "
"seconds".format(time.time() - start_time)
)
dist.synchronize()
if dist.get_local_rank() == 0:
logger.info(
">>> done with compiling. "
"Compilation time: {:.3f} seconds".format(time.time() - start_time)
)
def default_setup(cfg, args):
"""
Perform some basic common setups at the beginning of a job, including:
1. Set up the libai logger
2. Log basic information about environment, cmdline arguments, and config
3. Setup the distributed environment
4. Setup tokenizer if it's an NLP-related task
5. Check batch_size
6. Backup the config to the output directory
7. Compile dependencies
Args:
args (argparse.Namespace): the command line arguments to be logged
"""
output_dir = try_get_key(cfg, "train.output_dir")
if dist.is_main_process() and output_dir:
os.makedirs(output_dir, exist_ok=True)
cfg.train.resume = args.resume
rank = dist.get_rank()
logger = setup_logger(output_dir, distributed_rank=rank)
logger.info("Rank of current process: {}. World size: {}".format(rank, dist.get_world_size()))
logger.info("Command line arguments: " + str(args))
if hasattr(args, "config_file") and args.config_file != "":
logger.info(
"Contents of args.config_file={}:\n{}".format(
args.config_file,
_highlight(open(args.config_file, "r").read(), args.config_file),
)
)
dist.setup_dist_util(cfg.train.dist)
_check_batch_size(cfg)
if dist.is_main_process() and output_dir:
# Note: some of our scripts may expect the existence of
# config.yaml in output directory
path = os.path.join(output_dir, "config.yaml")
LazyConfig.save(cfg, path)
logger.info("Full config saved to {}".format(path))
flow.boxing.nccl.set_fusion_threshold_mbytes(
try_get_key(cfg, "train.nccl_fusion_threshold_mb", default=16)
)
flow.boxing.nccl.set_fusion_max_ops_num(
try_get_key(cfg, "train.nccl_fusion_max_ops", default=24)
)
_compile_dependencies()
class DefaultTrainer(TrainerBase):
"""
A trainer with default training logic. Compared to `TrainerBase`, it
also contains the following logic:
1. Create model, optimizer, scheduler, dataloader from the given config.
2. Load a checkpoint or `cfg.train.load_weight`, if it exists.
3. Register a few common hooks defined by the config.
With standard features, it is created to simplify the **standard model training workflow** and
reduce code boilerplate for users who only need the standard training workflow.
It means this class makes **many assumptions** about your training logic that
may easily become invalid in new research. In fact, any assumptions beyond those made in the
:class:`TrainerBase` are too much for research.
The code of this class has been annotated about restrictive assumptions it made.
When they do not work for you, you're encouraged to:
1. Overwrite methods of this class, OR:
2. Use :class:`TrainerBase`, which only does minimal SGD training and
nothing else. You can then add your own hooks if needed. OR:
3. Write your own training loop similar to ``tools/train_net.py``.
Also note that the behavior of this class, like other functions/classes in
this file, is not stable, since it is meant to represent the "common default behavior".
It is only guaranteed to work well with the standard models and training workflow in libai.
To obtain more stable behavior, write your own training logic with other public APIs.
Examples:
.. code-block:: python
trainer = DefaultTrainer(cfg)
trainer.resume_or_load()  # load last checkpoint or cfg.train.load_weight
trainer.train()
Attributes:
scheduler:
checkpointer (Checkpointer):
cfg (omegaconf.dictconfig.DictConfig):
"""
def __init__(self, cfg):
"""
Args:
cfg (omegaconf.dictconfig.DictConfig):
"""
super().__init__()
self.cfg = cfg
logger = logging.getLogger("libai")
# setup_logger is not called for LiBai
if not logger.isEnabledFor(logging.INFO):
setup_logger()
# Initialize tokenizer
self.tokenizer = self.build_tokenizer(cfg)
self.start_iter = 0
if cfg.train.resume:
save_file = os.path.join(cfg.train.output_dir, "last_checkpoint")
try:
with open(save_file, "r") as f:
last_saved = f.read().strip()
assert (
last_saved != "model_final"
), "model training has finished, check your model in train.output_dir"
self.start_iter = int(last_saved.split("_")[-1]) + 1
except IOError:
# If file doesn't exist, maybe because it has just been deleted.
# We just set start_iter to 0.
self.start_iter = 0
if cfg.graph.enabled:
cfg.dataloader.consumed_samples = self.start_iter * cfg.train.global_batch_size
else:
cfg.dataloader.consumed_samples = (
self.start_iter * cfg.train.global_batch_size // cfg.train.num_accumulation_steps
)
self.train_loader = None
self.test_loader = []
train_loader, val_loader, test_loader = self.build_train_loader(cfg, self.tokenizer)
self.train_loader = train_loader
if val_loader is not None:
self.test_loader.append(val_loader)
if test_loader is not None:
self.test_loader.append(test_loader)
self.test_loader.extend(self.build_test_loader(cfg, self.tokenizer))
if cfg.train.rdma_enabled:
# set rdma
flow.env.init_rdma()
# Automatically scale the hyperparams
self.auto_scale_hyperparams(cfg, self.train_loader)
# Assume these objects must be constructed in this order.
dist.synchronize()
start_time = time.time()
logger.info("> Start building model...")
self.model = self.build_model(cfg)
dist.synchronize()
logger.info(
">>> done with building model. "
"Building time: {:.3f} seconds".format(time.time() - start_time)
)
self.optimizer = self.build_optimizer(cfg, self.model)
self.lr_scheduler = self.build_lr_scheduler(cfg, self.optimizer)
if cfg.graph.enabled:
self.graph_train = self.build_graph(
cfg, self.model, self.optimizer, self.lr_scheduler, is_train=True
)
self.graph_eval = self.build_graph(cfg, self.model, is_train=False)
self._trainer = GraphTrainer(
self.graph_train, self.train_loader, cfg.train.num_accumulation_steps
)
else:
self._trainer = EagerTrainer(
self.model, self.train_loader, self.optimizer, cfg.train.num_accumulation_steps
)
# Assume no other objects need to be checkpointed.
# We can later make it checkpoint the stateful hooks
if cfg.graph.enabled:
self.checkpointer = Checkpointer(
# Assume you want to save checkpoints together with logs/statistics
self.model,
cfg.train.output_dir,
# In static graph mode, optimizer and scheduler state_dict will
# be saved with graph.state_dict().
graph=self.graph_train,
# We print lr by `LRScheduler` hook, so we need to save/load eager lr_scheduler,
# otherwise, lr will be reset to initial state when resuming training.
lr_scheduler=self.lr_scheduler,
)
else:
self.checkpointer = Checkpointer(
# Assume you want to save checkpoints together with logs/statistics
self.model,
cfg.train.output_dir,
optimizer=self.optimizer,
lr_scheduler=self.lr_scheduler,
)
# Loading checkpoint before dataloader construction, because
# dataloader needs to know the consumed iterations from
# the last breakpoint.
self.resume_or_load(cfg.train.resume)
cfg.train.start_iter = self.start_iter
# global_batch_size = micro_batch_size * num_gpus * num_accumulation_steps
# When using gradient accumulation in graph mode, each run_step handles
# `global_batch_size` samples.
# When using gradient accumulation in eager mode, each run_step only handles
# `micro_batch_size * num_gpus` samples, so we need to divide by `num_accumulation_steps`
# to get the actual `batch_size` for computing `throughput` and `consumed_samples`.
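# e.g. train.global_batch_size=256 and num_accumulation_steps=4:
#   graph mode: each run_step consumes 256 samples -> self.global_batch_size = 256
#   eager mode: each run_step consumes 256 // 4 = 64 samples -> self.global_batch_size = 64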
self.global_batch_size = (
cfg.train.global_batch_size
if cfg.graph.enabled
else cfg.train.global_batch_size // cfg.train.num_accumulation_steps
)
self.max_iter = cfg.train.train_iter
self.register_hooks(self.build_hooks())
def resume_or_load(self, resume=True):
"""
If `resume==True` and `cfg.train.output_dir` contains the last checkpoint (defined by
a `last_checkpoint` file), resume from that file. Resuming means loading all
available states (e.g. optimizer and scheduler) and updating the iteration counter
from the checkpoint. ``cfg.train.load_weight`` will not be used.
Otherwise, this is considered an independent training run. The method will load model
weights from the file ``cfg.train.load_weight`` (but will not load other states) and start
from iteration 0.
Args:
resume (bool): whether to do resume or not
"""
weight_path = self.cfg.train.load_weight
assert isinstance(
weight_path, str
), f"cfg.train.load_weight:{self.cfg.train.load_weight} must be string"
if resume:
assert self.checkpointer.has_checkpoint()
# The checkpoint stores the training iteration that just finished, thus we start
# at the next iteration (or iter zero if there's no checkpoint).
assert self.start_iter == (
self.checkpointer.resume_or_load(None, resume=True).get("iter", -1) + 1
)
elif len(weight_path) != 0:
assert os.path.isdir(
weight_path
), f"cfg.train.load_weight:{self.cfg.train.load_weight} must be directory"
self.checkpointer.load(weight_path, checkpointables=[])
def build_hooks(self):
"""
Build a list of default hooks, including timing, evaluation,
checkpointing, lr scheduling, precise BN, writing events.
Returns:
list[HookBase]:
"""
ret = [
hooks.IterationTimer(),
hooks.LRScheduler(),  # for pretty lr scheduler printing in `nn.Graph` mode
hooks.PeriodicCheckpointer(
self.checkpointer,
self.cfg.train.checkpointer.period,
max_to_keep=self.cfg.train.checkpointer.max_to_keep,
),
]
if self.cfg.train.evaluation.enabled:
assert self.cfg.train.evaluation.eval_iter > 0, "eval_iter must be a positive number"
def test_and_save_results():
model = self.graph_eval if self.cfg.graph.enabled else self.model
self._last_eval_results = self.test(self.cfg, self.test_loader, model)
return self._last_eval_results
ret.append(hooks.EvalHook(self.cfg.train.evaluation.eval_period, test_and_save_results))
ret.append(
hooks.BestCheckpointer(
self.cfg.train.evaluation.eval_period,
self.checkpointer,
val_metric=try_get_key(
self.cfg, "train.evaluation.eval_metric", default="Acc@1"
),
mode=try_get_key(self.cfg, "train.evaluation.eval_mode", default="max"),
)
)
if dist.is_main_process():
# run writers in the end, so that evaluation metrics are written
ret.append(hooks.PeriodicWriter(self.build_writers(), self.cfg.train.log_period))
return ret
def build_writers(self):
"""
Build a list of writers to be used. By default it contains
writers that write metrics to the screen,
a json file, and a tensorboard event file respectively.
If you'd like a different list of writers, you can overwrite it in
your trainer.
Returns:
list[EventWriter]: a list of :class:`EventWriter` objects.
It is now implemented by:
.. code-block:: python
return [
CommonMetricPrinter(self.global_batch_size, self.max_iter),
JSONWriter(os.path.join(self.cfg.train.output_dir, "metrics.json")),
TensorboardXWriter(self.cfg.train.output_dir),
]
"""
# Assume the default print/log frequency.
return [
# It may not always print what you want to see, since it prints "common" metrics only.
CommonMetricPrinter(self.global_batch_size, self.max_iter),
JSONWriter(os.path.join(self.cfg.train.output_dir, "metrics.json")),
TensorboardXWriter(self.cfg.train.output_dir),
]
def train(self):
"""
Run training.
Returns:
OrderedDict of results, if evaluation is enabled. Otherwise None.
"""
super().train(self.start_iter, self.max_iter)
def run_step(self):
self._trainer.iter = self.iter
self._trainer.run_step(self.get_batch, self.cfg.train.input_placement_device)
@classmethod
def get_batch(
cls,
data: Instance,
input_placement_device: str = "cuda",
mixup_func: Optional[Callable] = None,
):
"""
Convert batched local tensor to distributed tensor for model step running.
If you want to do something with the batched data before it reaches the model
(e.g. mixup), you can override this function.
"""
if isinstance(data, flow.utils.data._utils.worker.ExceptionWrapper):
data.reraise()
if mixup_func is not None:
images, labels = mixup_func(
data.get("images").tensor.cuda(),
data.get("labels").tensor.cuda(),
)
data.get("images").tensor = images
data.get("labels").tensor = labels
ret_dict = {}
for key, value in data.get_fields().items():
value.to_global(device_type=input_placement_device)
ret_dict[key] = value.tensor
return ret_dict
@classmethod
def build_tokenizer(cls, cfg):
"""
Returns:
libai.tokenizer.PreTrainedTokenizer:
It now calls :func:`libai.tokenizer.build_tokenizer`.
"""
tokenizer = None
if try_get_key(cfg, "tokenization") is not None:
tokenizer = build_tokenizer(cfg.tokenization)
# FIXME(lxy): In case model is not defined with cfg, the `vocab_size` can be
# accessed by `model.vocab_size`.
if try_get_key(cfg, "model.cfg.vocab_size", default=None) is not None:
# In case the model does not need vocab_size as argument
multiple = (
cfg.tokenization.make_vocab_size_divisible_by
* cfg.train.dist.tensor_parallel_size
)
cfg.model.cfg.vocab_size = tokenizer.padded_vocab_size(multiple)
return tokenizer
@classmethod
def build_model(cls, cfg):
"""
Returns:
flow.nn.Module:
It now calls :func:`libai.models.build_model`.
Overwrite it if you'd like a different model.
"""
assert try_get_key(cfg, "model") is not None, "cfg must contain `model` namespace"
# Set the model fp16 option, because the embedding layer needs a manual
# `white_identity` insert for amp training, if provided.
if try_get_key(cfg.model, "cfg.amp_enabled") is not None:
cfg.model.cfg.amp_enabled = cfg.train.amp.enabled and cfg.graph.enabled
# In case some models are defined without a cfg keyword.
elif try_get_key(cfg.model, "amp_enabled") is not None:
cfg.model.amp_enabled = cfg.train.amp.enabled and cfg.graph.enabled
model = build_model(cfg.model)
logger = logging.getLogger(__name__)
logger.info("Model:\n{}".format(model))
model._apply(dist.convert_to_distributed_default_setting)
return model
@classmethod
def build_graph(cls, cfg, model, optimizer=None, lr_scheduler=None, is_train=True):
assert try_get_key(cfg, "graph") is not None, "cfg must contain `graph` namespace"
graph = build_graph(cfg, model, optimizer, lr_scheduler, is_train)
debug_graph = try_get_key(cfg, "graph.debug", default=-1)
if debug_graph >= 0:
logger = logging.getLogger(__name__)
logger.info("Graph debug mode on, automatically output debug info.")
graph.debug(cfg.graph.debug)
return graph
@classmethod
def build_optimizer(cls, cfg, model):
"""
Returns:
flow.optim.Optimizer:
It now calls :func:`libai.optim.build_optimizer`.
Overwrite it if you'd like a different optimizer.
"""
assert try_get_key(cfg, "optim") is not None, "cfg must contain `optim` namespace"
return build_optimizer(cfg.optim, model)
@classmethod
def build_lr_scheduler(cls, cfg, optimizer):
"""
It now calls :func:`libai.scheduler.build_lr_scheduler`.
Overwrite it if you'd like a different scheduler.
"""
assert (
try_get_key(cfg, "train.scheduler") is not None
), "cfg.train must contain `scheduler` namespace"
return build_lr_scheduler(cfg.train.scheduler, optimizer)
@classmethod
def build_train_loader(cls, cfg, tokenizer=None):
"""
Returns:
iterable
It now calls :func:`libai.data.build_train_valid_test_loader`.
Overwrite it if you'd like a different data loader.
"""
assert (
try_get_key(cfg, "dataloader.train") is not None
), "cfg must contain `dataloader.train` namespace"
logger = logging.getLogger(__name__)
logger.info("Prepare training, validating, testing set")
if cfg.graph.enabled:
# In static graph mode, data will be sliced in nn.Graph automatically;
# the dataloader gets micro-batch-size samples and the data is concatenated
# in graph_trainer.run_step to form the mini-batch-size.
cfg.dataloader.train.train_batch_size = cfg.train.train_micro_batch_size
else:
# In eager mode, gradient accumulation will act like PyTorch, so dataloader
# will get micro-batch-size
cfg.dataloader.train.train_batch_size = cfg.train.train_micro_batch_size
cfg.dataloader.train.test_batch_size = cfg.train.test_micro_batch_size
cfg.dataloader.train.seed = cfg.train.seed
# used by nlp dataloader
if hasattr(cfg.dataloader.train, "train_val_test_num_samples"):
eval_iter = (
(cfg.train.train_iter // cfg.train.evaluation.eval_period + 1)
* cfg.train.evaluation.eval_iter
if cfg.train.evaluation.enabled
# samples for test_dataset must be larger than 0 even if there is no evaluation
else 1
)
test_iter = cfg.train.evaluation.eval_iter if cfg.train.evaluation.enabled else 1
cfg.dataloader.train.train_val_test_num_samples = [
int(cfg.train.samples),
int(eval_iter * cfg.train.test_micro_batch_size * dist.get_data_parallel_size()),
int(test_iter * cfg.train.test_micro_batch_size * dist.get_data_parallel_size()),
]
if OmegaConf.is_list(cfg.dataloader.train.dataset):
for dataset in cfg.dataloader.train.dataset:
if hasattr(dataset, "seed"):
dataset.seed = cfg.train.seed
else:
dataset = cfg.dataloader.train.dataset
if hasattr(dataset, "seed"):
dataset.seed = cfg.train.seed
# Set tokenizer for each dataset
if tokenizer:
if OmegaConf.is_list(cfg.dataloader.train.dataset):
for dataset in cfg.dataloader.train.dataset:
dataset.tokenizer = tokenizer
else:
cfg.dataloader.train.dataset.tokenizer = tokenizer
train_loader, valid_loader, test_loader = instantiate(
cfg.dataloader.train, _recursive_=False
)
return train_loader, valid_loader, test_loader
@classmethod
def build_test_loader(cls, cfg, tokenizer=None):
"""
Returns:
iterable
It now calls :func:`libai.data.build_image_test_loader` for CV tasks
or :func:`libai.data.build_nlp_test_loader` for NLP tasks.
Overwrite it if you'd like a different data loader.
"""
# If there is no test_loader, just return []
if not try_get_key(cfg, "dataloader.test", default=False):
return []
logger = logging.getLogger(__name__)
logger.info("Prepare testing set")
assert OmegaConf.is_list(
cfg.dataloader.test
), f"dataloader.test must be list but got type of {type(cfg.dataloader.test)}"
for i in range(len(cfg.dataloader.test)):
cfg.dataloader.test[i].test_batch_size = cfg.train.test_micro_batch_size
cfg.dataloader.test[i].seed = cfg.train.seed # set seed
if tokenizer:
cfg.dataloader.test[i].dataset.tokenizer = tokenizer
# list[dataloader1, dataloader2, ...]
test_loader = instantiate(cfg.dataloader.test, _recursive_=False)
return test_loader
@classmethod
def auto_scale_hyperparams(cls, cfg, data_loader):
logger = logging.getLogger(__name__)
log_info = ""
# Get or set default iteration cfg
train_iter = try_get_key(cfg, "train.train_iter", default=0)
train_epoch = try_get_key(cfg, "train.train_epoch", default=0)
warmup_ratio = try_get_key(cfg, "train.warmup_ratio", default=0)
assert (
warmup_ratio < 1 and warmup_ratio >= 0
), "warmup_ratio must be in [0, 1) that presents the ratio of warmup iter to the train iter"
# Automatically scale the iteration number depending on the settings.
# The total number of iterations in one epoch is `len(dataset) / global_batch_size`.
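# e.g. len(dataset)=50000, global_batch_size=256, train_epoch=90
#   -> train_iter = ceil(50000 * 90 / 256) = 17579 (unless train.train_iter is larger)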
cfg.train.train_iter = max(
math.ceil(len(data_loader.dataset) * train_epoch / cfg.train.global_batch_size),
train_iter,
)
cfg.train.warmup_iter = math.ceil(cfg.train.train_iter * cfg.train.warmup_ratio)
if not cfg.graph.enabled:
# In eager mode, the dataloader only gets micro-batch-size samples each iteration,
# which is mini-batch-size // num_accumulation, so scale `train_iter`
# and `warmup_iter` to be consistent with static graph mode.
cfg.train.train_iter *= cfg.train.num_accumulation_steps
cfg.train.warmup_iter *= cfg.train.num_accumulation_steps
log_info += "Auto-scaling the config to train.train_iter={}, train.warmup_iter={}".format(
cfg.train.train_iter, cfg.train.warmup_iter
)
# Automatically scale the milestones
if try_get_key(cfg, "train.scheduler.milestones"):
if len(
[
milestone
for milestone in cfg.train.scheduler.milestones
if milestone < 0 or milestone >= 1
]
):
raise ValueError(
"milestones should be a list of increasing ratio in [0, 1), but got {}".format(
cfg.train.scheduler.milestones
)
)
cfg.train.scheduler.milestones = [
int(milestone * cfg.train.train_iter)
for milestone in cfg.train.scheduler.milestones
]
log_info += f", scheduler milestones={cfg.train.scheduler.milestones}"
logger.info(log_info)
# Global scheduler cfg
cfg.train.scheduler.warmup_iter = cfg.train.warmup_iter
cfg.train.scheduler.max_iter = cfg.train.train_iter
# train iter per epoch
iter_per_epoch = len(data_loader.dataset) // cfg.train.global_batch_size
# rescale eval period
if try_get_key(cfg, "train.evaluation.eval_after_n_epoch"):
cfg.train.evaluation.eval_period = (
iter_per_epoch * cfg.train.evaluation.eval_after_n_epoch
)
logger.info(
f"Auto-scaling the config "
f"train.evaluation.eval_after_n_epoch={cfg.train.evaluation.eval_after_n_epoch} "
f"to train.evaluation.eval_period={cfg.train.evaluation.eval_period}"
)
# rescale save model period
if try_get_key(cfg, "train.checkpointer.save_model_after_n_epoch"):
cfg.train.checkpointer.period = (
iter_per_epoch * cfg.train.checkpointer.save_model_after_n_epoch
)
logger.info(
f"Auto-scaling the config "
f"train.checkpointer.save_model_after_n_epoch="
f"{cfg.train.checkpointer.save_model_after_n_epoch} "
f"to train.checkpointer.period={cfg.train.checkpointer.period}"
)
@classmethod
def build_evaluator(cls, cfg):
evaluator = instantiate(cfg.train.evaluation.evaluator)
return evaluator
@classmethod
def test(cls, cfg, test_loaders, model, evaluator=None):
"""
Evaluate the given model. The given model is expected to already contain
weights to evaluate.
Args:
cfg (CfgNode):
test_loaders: list [dataloader1, dataloader2, ...]
model (nn.Graph):
evaluator (DatasetEvaluator or None): if None, :meth:`build_evaluator`
will be called to build one.
Returns:
dict: a dict of result metrics
"""
logger = logging.getLogger(__name__)
# TODO: support multi evaluator
# if isinstance(evaluators, DatasetEvaluator):
# evaluators = [evaluators]
test_batch_size = cfg.train.test_micro_batch_size * dist.get_data_parallel_size()
evaluator = cls.build_evaluator(cfg) if not evaluator else evaluator
results = OrderedDict()
for idx, data_loader in enumerate(test_loaders):
# When evaluators are passed in as arguments,
# implicitly assume that evaluators can be created before data_loader.
dataset_name = type(data_loader.dataset).__name__
# TODO: support multi evaluator
# if evaluators is not None:
# evaluator = evaluators[idx]
# else:
# try:
# evaluator = cls.build_evaluator(cfg)
# except NotImplementedError:
# logger.warn(
# "No evaluator found. Use `DefaultTrainer.test(evaluators=)`, "
# "or implement its `build_evaluator` method."
# )
# results[dataset_name] = {}
# continue
results_i = inference_on_dataset(
model,
data_loader,
test_batch_size,
cfg.train.evaluation.eval_iter,
cls.get_batch,
cfg.train.input_placement_device,
evaluator,
)
results[dataset_name] = results_i
if dist.is_main_process():
assert isinstance(
results_i, dict
), "Evaluator must return a dict on the main process. Got {} instead.".format(
results_i
)
logger.info(
"Evaluation results for {} in csv format:".format(
colored(dataset_name, "green")
)
)
print_csv_format(results_i)
if len(results) == 1:
results = list(results.values())[0]
return results
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
# Copyright (c) Facebook, Inc. and its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import datetime
import logging
import math
import operator
import time
from collections import Counter
import oneflow as flow
from libai.evaluation import flatten_results_dict
from libai.utils import distributed as dist
from libai.utils.checkpoint import Checkpointer
from libai.utils.checkpoint import PeriodicCheckpointer as _PeriodicCheckpointer
from libai.utils.events import EventWriter
from libai.utils.timer import Timer
from .trainer import HookBase
# --------------------------------------------------------
# References:
# https://github.com/facebookresearch/detectron2/blob/main/detectron2/engine/hooks.py
# --------------------------------------------------------
"""
Implement some common hooks.
"""
logger = logging.getLogger(__name__)
class CallbackHook(HookBase):
"""
Create a hook using callback functions provided by the user.
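Example (a minimal sketch; ``trainer`` is any :class:`TrainerBase` instance):
.. code-block:: python
trainer.register_hooks([CallbackHook(after_step=lambda t: print(t.iter))])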
"""
def __init__(self, *, before_train=None, after_train=None, before_step=None, after_step=None):
"""
Each argument is a function that takes one argument: the trainer.
"""
self._before_train = before_train
self._before_step = before_step
self._after_step = after_step
self._after_train = after_train
def before_train(self):
if self._before_train:
self._before_train(self.trainer)
def after_train(self):
if self._after_train:
self._after_train(self.trainer)
# The functions may be closures that hold reference to the trainer
# Therefore, delete them to avoid circular reference.
del self._before_train, self._after_train
del self._before_step, self._after_step
def before_step(self):
if self._before_step:
self._before_step(self.trainer)
def after_step(self):
if self._after_step:
self._after_step(self.trainer)
class IterationTimer(HookBase):
"""
Track the time spent for each iteration (each run_step call in the trainer).
Print a summary in the end of training.
This hook uses the time between the call to its :meth:`before_step`
and :meth:`after_step` methods.
Under the convention that :meth:`before_step` of all hooks should only
take negligible amount of time, the :class:`IterationTimer` hook should be
placed at the beginning of the list of hooks to obtain accurate timing.
"""
def __init__(self, warmup_iter=3):
"""
Args:
warmup_iter (int): the number of iterations at the beginning to exclude
from timing.
"""
self._warmup_iter = warmup_iter
self._step_timer = Timer()
def before_train(self):
self._start_time = time.perf_counter()
self._total_timer = Timer()
self._total_timer.pause()
def after_train(self):
total_time = time.perf_counter() - self._start_time
total_time_minus_hooks = self._total_timer.seconds()
hook_time = total_time - total_time_minus_hooks
num_iter = self.trainer.iter + 1 - self.trainer.start_iter - self._warmup_iter
if num_iter > 0 and total_time_minus_hooks > 0:
# Speed is meaningful only after warmup
# NOTE this format is parsed by grep in some scripts
logger.info(
"Overall training speed: {} iterations in {} ({:.4f} s / it)".format(
num_iter,
str(datetime.timedelta(seconds=int(total_time_minus_hooks))),
total_time_minus_hooks / num_iter,
)
)
logger.info(
"Total training time: {} ({} on hooks)".format(
str(datetime.timedelta(seconds=int(total_time))),
str(datetime.timedelta(seconds=int(hook_time))),
)
)
def before_step(self):
self._step_timer.reset()
self._total_timer.resume()
def after_step(self):
# +1 because we're in after_step
iter_done = self.trainer.iter - self.trainer.start_iter + 1
if iter_done >= self._warmup_iter:
sec = self._step_timer.seconds()
self.trainer.storage.put_scalars(time=sec)
else:
self._start_time = time.perf_counter()
self._total_timer.reset()
self._total_timer.pause()
class PeriodicWriter(HookBase):
"""
Write events to EventStorage periodically.
It is executed every ``period`` iterations and after the last iteration.
"""
def __init__(self, writers, period=20):
"""
Args:
writers (list[EventWriter]): a list of EventWriter objects
period (int):
"""
self._writers = writers
for w in writers:
assert isinstance(w, EventWriter), w
self._period = period
def after_step(self):
if (self.trainer.iter + 1) % self._period == 0 or (
self.trainer.iter == self.trainer.max_iter - 1
):
for writer in self._writers:
writer.write()
def after_train(self):
for writer in self._writers:
writer.close()
class PeriodicCheckpointer(_PeriodicCheckpointer, HookBase):
"""
Same as :class:`libai.utils.checkpoint.PeriodicCheckpointer`, but as a hook.
Note that when used as a hook,
it is unable to save additional data other than what's defined
by the given `checkpointer`.
It is executed every ``period`` iterations and after the last iteration.
"""
def before_train(self):
self.max_iter = self.trainer.max_iter
def after_step(self):
self.step(self.trainer.iter)
class BestCheckpointer(HookBase):
"""
Checkpoints best weights based off given metric.
This hook should be used in conjunction with, and executed after, the hook
that produces the metric, e.g. `EvalHook`.
"""
def __init__(
self,
eval_period: int,
checkpointer: Checkpointer,
val_metric: str,
mode: str = "max",
file_prefix: str = "model_best",
) -> None:
"""
Args:
eval_period (int): the period `EvalHook` is set to run.
checkpointer: the checkpointer object used to save checkpoints.
val_metric (str): validation metric to track for best checkpoint, e.g. "acc@1"
mode (str): one of {'max', 'min'}. controls whether the chosen val metric should be
maximized or minimized, e.g. for "acc@1" it should be "max"
file_prefix (str): the prefix of checkpoint's filename, defaults to "model_best"
"""
self._period = eval_period
self._val_metric = val_metric
assert mode in [
"max",
"min",
], f'Mode "{mode}" to `BestCheckpointer` is unknown. It should be one of {"max", "min"}.'
if mode == "max":
self._compare = operator.gt
else:
self._compare = operator.lt
self._checkpointer = checkpointer
self._file_prefix = file_prefix
self.best_metric = None
self.best_iter = None
def _update_best(self, val, iteration):
if math.isnan(val) or math.isinf(val):
return False
self.best_metric = val
self.best_iter = iteration
return True
def _best_checking(self):
metric_tuple = self.trainer.storage.latest().get(self._val_metric)
flag = flow.zeros(1)
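# `flag` is set to 1 on the main process when a better checkpoint should be saved;
# it is broadcast to every rank below so that all processes call `save()` together.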
if dist.is_main_process():
if metric_tuple is None:
logger.warning(
f"Given val metric {self._val_metric} does not seem to be computed/stored. "
"Will not be checkpointed based on that."
)
else:
latest_metric, metric_iter = metric_tuple
if self.best_metric is None:
if self._update_best(latest_metric, metric_iter):
flag = flag + 1
logger.info(
f"Saved first model at {self.best_metric:0.5f} @ {self.best_iter} steps"
)
elif self._compare(latest_metric, self.best_metric):
flag = flag + 1
logger.info(
f"Saved best model as latest eval score for {self._val_metric} is "
f"{latest_metric:0.5f}, better than last best score "
f"{self.best_metric:0.5f} @ iteration {self.best_iter}."
)
self._update_best(latest_metric, metric_iter)
else:
logger.info(
f"Not saving as latest eval score for "
f"{self._val_metric} is {latest_metric:0.5f}, "
f"not better than best score {self.best_metric:0.5f} "
f"@ iteration {self.best_iter}."
)
dist.synchronize()
flag = flag.to_global(
sbp=flow.sbp.broadcast, placement=flow.env.all_device_placement("cpu")
)
if flag.to_local().item() == 1:
self._checkpointer.save(f"{self._file_prefix}")
def after_step(self):
# same conditions as `EvalHook`
next_iter = self.trainer.iter + 1
if (
self._period > 0
and next_iter % self._period == 0
and next_iter != self.trainer.max_iter
):
self._best_checking()
def after_train(self):
# same conditions as `EvalHook`
if self.trainer.iter + 1 >= self.trainer.max_iter:
self._best_checking()
class EvalHook(HookBase):
"""
Run an evaluation function periodically, and at the end of training.
It is executed every ``eval_period`` iterations and after the last iteration.
"""
def __init__(self, eval_period, eval_function):
"""
Args:
eval_period (int): the period to run `eval_function`.
eval_function (callable): a function which takes no arguments, and
returns a nested dict of evaluation metrics.
Note:
This hook must be enabled in either all workers or none.
If you would like only certain workers to perform evaluation,
give other workers a no-op function (`eval_function=lambda: None`).
"""
self._period = eval_period
self._func = eval_function
def _do_eval(self):
results = self._func()
if results:
assert isinstance(
results, dict
), "Eval function must return a dict. Got {} instead.".format(results)
flattened_results = flatten_results_dict(results)
for k, v in flattened_results.items():
try:
v = float(v)
except Exception:
raise ValueError(
"[EvalHook] eval_function should return a nested dict of float. "
"Got '{}: {}' instead.".format(k, v)
)
self.trainer.storage.put_scalars(**flattened_results, smoothing_hint=False)
# Evaluation may take different amounts of time among workers.
# A barrier makes them start the next iteration together.
dist.synchronize()
def after_step(self):
next_iter = self.trainer.iter + 1
if self._period > 0 and next_iter % self._period == 0:
# do the last eval in after_train
if next_iter != self.trainer.max_iter:
self._do_eval()
def after_train(self):
# This condition is to prevent the eval from running after a failed training
if self.trainer.iter + 1 >= self.trainer.max_iter:
self._do_eval()
# func is likely a closure that holds reference to the trainer
# therefore we clean it to avoid circular reference in the end
del self._func
class LRScheduler(HookBase):
"""
A hook which executes a oneflow builtin LR scheduler and summarizes the LR.
It is executed after every iteration.
"""
def __init__(self, optimizer=None, scheduler=None):
"""
Args:
optimizer (flow.optim.Optimizer):
scheduler (flow.optim.LRScheduler):
if a :class:`ParamScheduler` object, it defines the multiplier over the base LR
in the optimizer.
If any argument is not given, will try to obtain it from the trainer.
"""
self._optimizer = optimizer
self._scheduler = scheduler
def before_train(self):
self._optimizer = self._optimizer or self.trainer.optimizer
self._best_param_group_id = LRScheduler.get_best_param_group_id(self._optimizer)
@staticmethod
def get_best_param_group_id(optimizer):
# NOTE: some heuristics on what LR to summarize
# summarize the param group with most parameters
largest_group = max(len(g["params"]) for g in optimizer.state_dict()["param_groups"])
if largest_group == 1:
# If all groups have one parameter,
# then find the most common initial LR, and use it for summary
lr_count = Counter(
[g["_options"]["lr"] for g in optimizer.state_dict()["param_groups"]]
)
lr = lr_count.most_common()[0][0]
for i, g in enumerate(optimizer.state_dict()["param_groups"]):
if g["_options"]["lr"] == lr:
return i
else:
for i, g in enumerate(optimizer.state_dict()["param_groups"]):
if len(g["params"]) == largest_group:
return i
def after_step(self):
lr = self.scheduler.get_last_lr()[self._best_param_group_id]
self.trainer.storage.put_scalar("lr", lr, smoothing_hint=False)
self.scheduler.step()
@property
def scheduler(self):
return self._scheduler or self.trainer.lr_scheduler
def state_dict(self):
if isinstance(self.scheduler, flow.optim.lr_scheduler._LRScheduler):
return self.scheduler.state_dict()
return {}
def load_state_dict(self, state_dict):
if isinstance(self.scheduler, flow.optim.lr_scheduler._LRScheduler):
logger.info("Loading scheduler from state_dict ...")
self.scheduler.load_state_dict(state_dict)
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import time
import weakref
from typing import Callable, List, Mapping
import oneflow as flow
from libai.utils import distributed as dist
from libai.utils.events import EventStorage, get_event_storage
# --------------------------------------------------------
# References:
# https://github.com/facebookresearch/detectron2/blob/main/detectron2/engine/train_loop.py
# --------------------------------------------------------
class HookBase:
"""
Base class for hooks that can be registered with :class:`TrainerBase`.
Each hook can implement 4 methods. The way they are called is demonstrated
in the following snippet:
::
hook.before_train()
for iter in range(start_iter, max_iter):
hook.before_step()
trainer.run_step()
hook.after_step()
iter += 1
hook.after_train()
Notes:
1. In the hook method, users can use ``self.trainer`` to access more
properties of the context (e.g., model, current iteration, or config
if using :class:`DefaultTrainer`).
2. A hook that does something in :meth:`before_step` can often be
implemented equivalently in :meth:`after_step`.
If the hook takes non-trivial time, it is strongly recommended to
implement the hook in :meth:`after_step` instead of :meth:`before_step`.
The convention is that :meth:`before_step` should only take negligible time.
Following this convention will allow hooks that do care about the difference
between :meth:`before_step` and :meth:`after_step` (e.g., timer) to
function properly.
"""
trainer: "TrainerBase" = None
"""
A weak reference to the trainer object. Set by the trainer when the hook is registered.
"""
def before_train(self):
"""
Called before the first iteration.
"""
def after_train(self):
"""
Called after the last iteration.
"""
def before_step(self):
"""
Called before each iteration.
"""
def after_step(self):
"""
Called after each iteration.
"""
class TrainerBase:
"""
Base class for iterative trainer with hooks.
The only assumption we made here is: the training runs in a loop.
A subclass can implement what the loop is.
We made no assumptions about the existence of dataloader, optimizer, model, etc.
Attributes:
iter(int): The current iteration.
start_iter(int): The iteration to start with.
By convention the minimum possible value is 0.
max_iter(int): The iteration to end training.
storage(EventStorage): An EventStorage that's opened during the course of training.
"""
def __init__(self):
self._hooks: List[HookBase] = []
self.iter: int = 0
self.start_iter: int = 0
self.max_iter: int
self.storage: EventStorage
def register_hooks(self, hooks):
"""
Register hooks to the trainer. The hooks are executed in the order
they are registered.
Args:
hooks (list[Optional[HookBase]]): list of hooks
"""
hooks = [h for h in hooks if h is not None]
for h in hooks:
assert isinstance(h, HookBase)
# To avoid circular reference, hooks and trainer cannot own each other.
# This normally does not matter, but will cause memory leak if the
# involved objects contain __del__:
# See http://engineering.hearsaysocial.com/2013/06/16/circular-references-in-python/
h.trainer = weakref.proxy(self)
self._hooks.extend(hooks)
def train(self, start_iter: int, max_iter: int):
"""
Args:
start_iter, max_iter (int): See docs above
"""
logger = logging.getLogger(__name__)
logger.info("Starting training from iteration {}".format(start_iter))
self.iter = self.start_iter = start_iter
self.max_iter = max_iter
with EventStorage(self.start_iter) as self.storage:
try:
self.before_train()
for self.iter in range(start_iter, max_iter):
self.before_step()
self.run_step()
self.after_step()
# self.iter == max_iter can be used by `after_train` to
# tell whether the training successfully finished or failed
# due to exceptions.
self.iter += 1
except Exception:
logger.exception("Exception during training:")
raise
finally:
self.after_train()
def before_train(self):
for h in self._hooks:
h.before_train()
def after_train(self):
for h in self._hooks:
h.after_train()
def before_step(self):
self.storage.iter = self.iter
for h in self._hooks:
h.before_step()
def after_step(self):
self.storage.samples = (self.iter + 1) * self.cfg.train.global_batch_size
for h in self._hooks:
h.after_step()
def run_step(self):
raise NotImplementedError
@staticmethod
def write_metrics(
loss_dict: Mapping[str, flow.Tensor],
data_time: float,
prefix: str = "",
) -> None:
"""
Args:
loss_dict (dict): dict of scalar losses
data_time (float): time taken by the dataloader iteration
prefix (str): prefix for logging keys
"""
# Move the metric values to rank 0, because logger.info only works on rank 0.
metrics_dict = {
k: dist.tensor_to_rank0(v, device="cpu", to_local=True) for k, v in loss_dict.items()
}
metrics_dict["data_time"] = data_time
# TODO: Gather metrics among all workers for logging
# all_metrics_dict = dist.gather(metrics_dict)
all_metrics_dict = metrics_dict
if dist.is_main_process():
storage = get_event_storage()
# data_time among workers can have high variance. The actual latency
# caused by data_time is the maximum among workers.
# data_time = np.max([x.pop("data_time") for x in all_metrics_dict])
data_time = all_metrics_dict.pop("data_time")
storage.put_scalar("data_time", data_time)
# average the rest metrics
# metrics_dict = {
# k: np.mean([x[k] for x in all_metrics_dict]) for k in all_metrics_dict[0].keys()
# }
metrics_dict = all_metrics_dict
total_losses_reduced = sum(v for k, v in metrics_dict.items() if "loss" in k)
storage.put_scalar("{}total_loss".format(prefix), total_losses_reduced)
if len(metrics_dict) > 1:
storage.put_scalars(**metrics_dict)
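# Illustrative only: a minimal, hedged sketch of the TrainerBase contract -- subclass it,
# implement `run_step`, then drive the loop with `train(start_iter, max_iter)`. The no-op
# step and the tiny stand-in `cfg` below (the base `after_step` reads
# `self.cfg.train.global_batch_size`) are assumptions for demonstration only.
class _ExampleNoOpTrainer(TrainerBase):
    def run_step(self):
        # A real trainer would fetch a batch, compute losses and update the model here.
        pass
def _example_trainer_usage():
    from types import SimpleNamespace
    trainer = _ExampleNoOpTrainer()
    trainer.cfg = SimpleNamespace(train=SimpleNamespace(global_batch_size=1))
    trainer.register_hooks([_ExampleIterTimerHook()])
    trainer.train(start_iter=0, max_iter=10)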
class EagerTrainer(TrainerBase):
"""
A simple eager trainer for the most common type of task:
single-cost single-optimizer single-data-source iterative optimization,
optionally using data-parallelism.
It assumes that in every step, you:
1. Compute the loss with a data from the data_loader.
2. Compute the gradients with the above loss.
3. Update the model with the optimizer.
All other tasks during training (checkpointing, logging, evaluation, LR schedule)
are maintained by hooks, which can be registered by :meth:`TrainerBase.register_hooks`.
If you want to do anything fancier than this,
either subclass TrainerBase and implement your own `run_step`,
or write your own training loop.
"""
def __init__(self, model, data_loader, optimizer, grad_acc_steps=1):
"""
Args:
model: a flow.nn.Module. Takes a data from data_loader and returns a
dict of losses.
data_loader: an iterable. Contains data to be used to call model.
optimizer: a flow optimizer.
"""
super().__init__()
# We set the model to training mode in the trainer.
# However it's valid to train a model that's in eval mode.
# If you want your model (or a submodule of it) to behave
# like evaluation during training, you can overwrite its train() method.
model.train()
self.model = model
self.data_loader = data_loader
self._data_loader_iter = iter(data_loader)
self.optimizer = optimizer
self.grad_acc_steps = grad_acc_steps
def run_step(self, get_batch: Callable, input_placement_device: str = "cuda"):
"""
Implement the standard training logic described above.
"""
        assert self.model.training, "[EagerTrainer] model was changed to eval mode!"
start = time.perf_counter()
# If you want to do something with the data, you can wrap the dataloader.
data = next(self._data_loader_iter)
data = get_batch(
data, input_placement_device, getattr(self.data_loader, "mixup_func", None)
)
data_time = time.perf_counter() - start
loss_dict = self.model(**data)
losses = sum(v for k, v in loss_dict.items() if "loss" in k) / self.grad_acc_steps
losses.backward()
self.write_metrics(loss_dict, data_time)
if (self.iter + 1) % self.grad_acc_steps == 0:
self.optimizer.clip_grad()
self.optimizer.step()
self.optimizer.zero_grad()
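# Illustrative only: the gradient-accumulation arithmetic used in `run_step` above, rewritten
# against plain oneflow objects (no LiBai trainer involved) so it can be run in isolation.
# The model, data and learning rate are arbitrary assumptions.
def _grad_accumulation_sketch(grad_acc_steps: int = 4, total_iters: int = 8):
    model = flow.nn.Linear(4, 1)
    optimizer = flow.optim.SGD(model.parameters(), lr=0.1)
    for it in range(total_iters):
        x, target = flow.randn(2, 4), flow.randn(2, 1)
        # Scale each micro-batch loss so the accumulated gradient matches a full batch.
        loss = flow.nn.functional.mse_loss(model(x), target) / grad_acc_steps
        loss.backward()
        if (it + 1) % grad_acc_steps == 0:
            optimizer.step()
            optimizer.zero_grad()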
class GraphTrainer(TrainerBase):
"""
A simple graph trainer for training and evaluating models in a static graph mode.
"""
def __init__(self, graph, data_loader, grad_acc_steps=1):
super().__init__()
graph.model.train()
self.data_loader = data_loader
self._data_loader_iter = iter(data_loader)
self.graph = graph
self.grad_acc_steps = grad_acc_steps
self._temp_data = None
self._temp_count = 0
def run_step(self, get_batch: Callable, input_placement_device: str = "cuda"):
"""
Implement the standard training logic described above.
"""
        assert self.graph.model.training, "[GraphTrainer] model was changed to eval mode!"
start = time.perf_counter()
while self._temp_count != self.grad_acc_steps:
# If you want to do something with the data, you can wrap the dataloader.
data = next(self._data_loader_iter)
self._temp_count += 1
if self._temp_data is None:
self._temp_data = data
else:
                # In static graph mode, nn.Graph slices the data automatically.
                # To recover the full mini-batch, concatenate the local tensors first.
for key, value in data.get_fields().items():
temp_value = self._temp_data.get(key)
self._temp_data.get(key).tensor = flow.cat(
(temp_value.tensor, value.tensor), dim=0
)
data = self._temp_data
self._temp_count = 0
self._temp_data = None
data = get_batch(
data, input_placement_device, getattr(self.data_loader, "mixup_func", None)
)
data_time = time.perf_counter() - start
# If you want to do something with the losses, you can wrap the model.
loss_dict = self.graph(**data)
        # When gradient accumulation is enabled, the graph returns an unpacked n-d tensor
        # whose leading dimension equals the accumulation steps, so reduce it back here.
for key, value in loss_dict.items():
if "loss" in key:
loss_dict[key] = value.mean()
else:
# NOTE: only support scalar tensor currently
loss_dict[key] = value.sum()
self.write_metrics(loss_dict, data_time)
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .evaluator import DatasetEvaluator, inference_on_dataset
from .utils import print_csv_format, flatten_results_dict
from .cls_evaluator import ClsEvaluator
from .ppl_evaluator import PPLEvaluator
from .reg_evaluator import RegEvaluator
from .bleu_evaluator import BLEUEvaluator
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
from collections import OrderedDict
from nltk.translate.bleu_score import corpus_bleu
from libai.utils import distributed as dist
from .evaluator import DatasetEvaluator
class BLEUEvaluator(DatasetEvaluator):
"""
    Evaluate BLEU (Bilingual Evaluation Understudy) score.
BLEU is a score for comparing a candidate translation
of text to one or more reference translations.
"""
def __init__(self):
super().__init__()
self._predictions = []
def reset(self):
self._predictions = []
def process(self, inputs, outputs):
candidate = outputs["candidate"]
reference = inputs["reference"]
self._predictions.append({"candidate": candidate, "reference": reference})
def evaluate(self):
if not dist.is_main_process():
return {}
else:
predictions = self._predictions
candidates = []
references = []
for pred in predictions:
candidates.append(pred["candidate"])
references.append(pred["reference"])
bleu_score = corpus_bleu(references, candidates)
self._results = OrderedDict()
self._results["bleu_score"] = bleu_score
return copy.deepcopy(self._results)
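# Illustrative only: a hedged sketch of the input format BLEUEvaluator expects. Each candidate
# is a tokenized hypothesis and each reference entry is a list of tokenized reference
# translations, following nltk's `corpus_bleu` convention.
def _bleu_evaluator_example():
    evaluator = BLEUEvaluator()
    evaluator.reset()
    inputs = {"reference": [["the", "cat", "sat", "on", "the", "mat"]]}
    outputs = {"candidate": ["the", "cat", "sat", "on", "the", "mat"]}
    evaluator.process(inputs, outputs)
    return evaluator.evaluate()  # e.g. OrderedDict([("bleu_score", 1.0)]) on the main process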
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
from collections import OrderedDict
from libai.utils import distributed as dist
from .evaluator import DatasetEvaluator
def accuracy(output, target, topk=(1,)):
maxk = min(max(topk), output.size()[1])
batch_size = target.size(0)
_, pred = output.topk(maxk, 1, True, True)
pred = pred.t()
correct = pred.eq(target.reshape(1, -1).expand_as(pred))
return [
(correct[: min(k, maxk)].reshape(-1).float().sum(0) * 100.0 / batch_size).item()
for k in topk
]
class ClsEvaluator(DatasetEvaluator):
"""
Evaluate accuracy for classification.
The metrics range from 0 to 100 (instead of 0 to 1).
    Different top-k accuracies are supported.
    You can set `cfg.train.topk=(1, 5, N)` according to your needs.
"""
def __init__(self, topk=(1, 5)):
self.topk = topk
self._predictions = []
def reset(self):
self._predictions = []
def process(self, inputs, outputs):
pred_logits = outputs["prediction_scores"]
labels = inputs["labels"]
# measure accuracy
topk_acc = accuracy(pred_logits, labels, topk=self.topk)
num_correct_acc_topk = [acc * labels.size(0) / 100 for acc in topk_acc]
self._predictions.append(
{"num_correct_topk": num_correct_acc_topk, "num_samples": labels.size(0)}
)
def evaluate(self):
if not dist.is_main_process():
return {}
else:
predictions = self._predictions
total_correct_num = OrderedDict()
for top_k in self.topk:
total_correct_num["Acc@" + str(top_k)] = 0
total_samples = 0
for prediction in predictions:
for top_k, num_correct_n in zip(self.topk, prediction["num_correct_topk"]):
total_correct_num["Acc@" + str(top_k)] += int(num_correct_n)
total_samples += int(prediction["num_samples"])
self._results = OrderedDict()
for top_k, topk_correct_num in total_correct_num.items():
self._results[top_k] = topk_correct_num / total_samples * 100
return copy.deepcopy(self._results)
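# Illustrative only: a tiny worked example of the top-k accuracy computed above. With two
# samples, labels (1, 0) and the logits below, only the first top-1 prediction is correct
# (Acc@1 = 50.0), while both labels appear in the top-2 predictions (Acc@2 = 100.0).
def _topk_accuracy_example():
    import oneflow as flow
    logits = flow.tensor([[0.1, 0.8, 0.1], [0.3, 0.5, 0.2]])
    labels = flow.tensor([1, 0])
    return accuracy(logits, labels, topk=(1, 2))  # -> [50.0, 100.0]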
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
# Copyright (c) Facebook, Inc. and its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import datetime
import logging
import time
from collections import OrderedDict, abc
from contextlib import ExitStack, contextmanager
from typing import Callable, List, Union
import oneflow as flow
from libai.utils import distributed as dist
from libai.utils.logger import log_every_n_seconds
from .utils import pad_batch
# --------------------------------------------------------
# References:
# https://github.com/facebookresearch/detectron2/blob/main/detectron2/evaluation/evaluator.py
# --------------------------------------------------------
class DatasetEvaluator:
"""
Base class for a dataset evaluator.
The function :func:`inference_on_dataset` runs the model over
    all samples in the dataset, and uses a DatasetEvaluator to process the inputs/outputs.
This class will accumulate information of the inputs/outputs (by :meth:`process`),
and produce evaluation results in the end (by :meth:`evaluate`).
"""
def reset(self):
"""
Preparation for a new round of evaluation.
Should be called before starting a round of evaluation.
"""
def process(self, inputs, outputs):
"""
Process the pair of inputs and outputs.
.. code-block:: python
pred_logits = outputs["prediction_scores"]
labels = inputs["labels"]
# do evaluation on pred_logits/labels pair
...
Args:
            inputs (dict): the inputs that are used to call the model.
outputs (dict): the return dict of `model(**inputs)`
"""
def evaluate(self):
"""
Evaluate/summarize the performance after processing all input/output pairs.
Returns:
dict:
A new evaluator class can return a dict of arbitrary format
as long as the user can process the results.
In our train_net.py, we expect the following format:
* key: the name of the task (e.g., Classification)
* value: a dict of {metric name: score}, e.g.: {"Acc@1": 75.0}
"""
class DatasetEvaluators(DatasetEvaluator):
"""
Wrapper class to combine multiple :class:`DatasetEvaluator` instances.
This class dispatches every evaluation call to
all of its :class:`DatasetEvaluator`.
"""
def __init__(self, evaluators):
"""
Args:
evaluators (list): the evaluators to combine.
"""
super().__init__()
self._evaluators = evaluators
def reset(self):
for evaluator in self._evaluators:
evaluator.reset()
def process(self, inputs, outputs):
for evaluator in self._evaluators:
evaluator.process(inputs, outputs)
def evaluate(self):
results = OrderedDict()
for evaluator in self._evaluators:
result = evaluator.evaluate()
if dist.is_main_process() and result is not None:
for k, v in result.items():
assert (
k not in results
), "Different evaluators produce results with the same key {}".format(k)
results[k] = v
return results
def inference_on_dataset(
model,
data_loader,
batch_size,
eval_iter,
get_batch: Callable,
input_placement_device: str,
evaluator: Union[DatasetEvaluator, List[DatasetEvaluator], None],
):
"""
Run model on the data_loader and evaluate the metrics with evaluator.
Also benchmark the inference speed of `model.__call__` accurately.
The model will be used in eval mode.
Args:
model (callable): a callable which takes an object from
`data_loader` and returns some outputs.
If it's an nn.Module, it will be temporarily set to `eval` mode.
If you wish to evaluate a model in `training` mode instead, you can
wrap the given model and override its behavior of `.eval()` and `.train()`.
        data_loader: an iterable object with a length.
            The elements it generates will be the inputs to the model.
        batch_size: the batch size used for inference.
        eval_iter: the number of iterations to run for evaluation.
        get_batch: a callable used to fetch a batch of data from the dataloader.
        input_placement_device: the device used in `get_batch`; set it to `cuda` or `cpu`.
            See input_placement_device in `libai.configs.common.train.py` for more details.
evaluator: the evaluator(s) to run. Use `None` if you only want to benchmark,
but don't want to do any evaluation.
Returns:
The return value of `evaluator.evaluate()`
"""
num_devices = dist.get_world_size()
logger = logging.getLogger(__name__)
total_samples = len(data_loader.dataset) # inference data loader must have a fixed length
if evaluator is None:
# create a no-op evaluator
evaluator = DatasetEvaluators([])
if isinstance(evaluator, abc.MutableSequence):
evaluator = DatasetEvaluators(evaluator)
evaluator.reset()
num_warmup = min(5, len(data_loader) - 1)
start_time = time.perf_counter()
total_data_time = 0
total_compute_time = 0
total_eval_time = 0
consumed_samples = 0
dps = dist.get_data_parallel_size()
last_batch_lack = (dps - (total_samples % dps)) % dps
# reset total samples
real_eval_iter = min(eval_iter, len(data_loader))
total_samples = min(real_eval_iter * batch_size, len(data_loader.dataset))
logger.info(
f"with eval_iter {eval_iter}, "
f"reset total samples {len(data_loader.dataset)} to {total_samples}"
)
logger.info(f"Start inference on {total_samples} samples")
with ExitStack() as stack:
if isinstance(model, (flow.nn.Module, flow.nn.Graph)):
stack.enter_context(inference_context(model))
stack.enter_context(flow.no_grad())
start_data_time = time.perf_counter()
for idx, inputs in enumerate(data_loader):
if idx >= real_eval_iter:
break
total_data_time += time.perf_counter() - start_data_time
if idx == num_warmup:
start_time = time.perf_counter()
total_data_time = 0
total_compute_time = 0
total_eval_time = 0
start_compute_time = time.perf_counter()
# model forward
data = get_batch(inputs, input_placement_device)
is_last_batch = idx == len(data_loader) - 1
            padded_data, valid_sample = pad_batch(data, batch_size, last_batch_lack, is_last_batch)
            outputs = model(**padded_data)
# get valid sample
valid_data = {
key: dist.tensor_to_rank0(value, to_local=True)[:valid_sample]
for key, value in data.items()
}
valid_outputs = {}
for key, value in outputs.items():
value = dist.tensor_to_rank0(value, to_local=True)
if value.ndim > 1:
valid_outputs[key] = value[:valid_sample] # Slice if it's batched output
else:
valid_outputs[key] = value
if flow.cuda.is_available():
dist.synchronize()
total_compute_time += time.perf_counter() - start_compute_time
start_eval_time = time.perf_counter()
if dist.is_main_process():
evaluator.process(valid_data, valid_outputs)
dist.synchronize()
total_eval_time += time.perf_counter() - start_eval_time
consumed_samples += valid_sample
iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
data_seconds_per_iter = total_data_time / iters_after_start
compute_seconds_per_iter = total_compute_time / iters_after_start
eval_seconds_per_iter = total_eval_time / iters_after_start
total_seconds_per_iter = (time.perf_counter() - start_time) / iters_after_start
if idx >= num_warmup * 2 or compute_seconds_per_iter > 5:
eta = datetime.timedelta(
seconds=int(total_seconds_per_iter * (total_samples // batch_size - idx - 1))
)
log_every_n_seconds(
logging.INFO,
(
f"Inference done {consumed_samples}/{total_samples}. "
f"Dataloading: {data_seconds_per_iter:.4f} s/iter. "
f"Inference: {compute_seconds_per_iter:.4f} s/iter. "
f"Eval: {eval_seconds_per_iter:.4f} s/iter. "
f"Total: {total_seconds_per_iter:.4f} s/iter. "
f"ETA={eta}"
),
n=5,
)
start_data_time = time.perf_counter()
# Measure the time only for this worker (before the synchronization barrier)
total_time = time.perf_counter() - start_time
total_time_str = str(datetime.timedelta(seconds=total_time))
# NOTE this format is parsed by grep
logger.info("Total valid samples: {}".format(consumed_samples))
logger.info(
"Total inference time: {} ({:.6f} s / iter per device, on {} devices)".format(
total_time_str, total_time / (total_samples - num_warmup), num_devices
)
)
total_compute_time_str = str(datetime.timedelta(seconds=int(total_compute_time)))
logger.info(
"Total inference pure compute time: {} ({:.6f} s / iter per device, on {} devices)".format(
total_compute_time_str,
total_compute_time / (total_samples - num_warmup),
num_devices,
)
)
results = evaluator.evaluate()
# An evaluator may return None when not in main process.
# Replace it by an empty dict instead to make it easier for downstream code to handle
if results is None:
results = {}
return results
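# Illustrative only: a hedged sketch of calling `inference_on_dataset`. The model, dataloader
# and `get_batch` arguments are placeholders supplied by the caller (in LiBai they usually come
# from the trainer and its config), not objects defined in this file.
def _inference_on_dataset_example(model, data_loader, get_batch):
    return inference_on_dataset(
        model,
        data_loader,
        batch_size=8,
        eval_iter=100,
        get_batch=get_batch,
        input_placement_device="cuda",
        evaluator=DatasetEvaluators([]),  # no-op evaluator: benchmark only
    )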
@contextmanager
def inference_context(model):
"""
A context where the model is temporarily changed to eval mode,
and restored to previous mode afterwards.
Args:
model: eager or graph mode in oneflow
"""
training_mode = model.model.training if isinstance(model, flow.nn.Graph) else model.training
if isinstance(model, flow.nn.Graph):
model.model.eval()
else:
model.eval()
yield
if isinstance(model, flow.nn.Graph):
model.model.train(training_mode)
else:
model.train(training_mode)
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import math
from collections import OrderedDict
from libai.utils import distributed as dist
from .evaluator import DatasetEvaluator
class PPLEvaluator(DatasetEvaluator):
"""
Evaluate perplexity for Language Model.
Perplexity is a measurement of how well a probability distribution or
probability model predicts a sample.
"""
def __init__(self):
self._predictions = []
def reset(self):
self._predictions = []
def process(self, inputs, outputs):
for k, v in outputs.items():
ppl = math.exp(min(20, v.item()))
self._predictions.append({f"{k}_PPL": ppl})
def evaluate(self):
if not dist.is_main_process():
return {}
else:
predictions = self._predictions
self._results = OrderedDict()
for prediction in predictions:
for k, v in prediction.items():
if k not in self._results:
self._results[k] = 0
self._results[k] += v
for k in self._results.keys():
self._results[k] /= len(predictions)
return copy.deepcopy(self._results)
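# Illustrative only: a tiny worked example of the perplexity computation above. A language-model
# cross-entropy loss of 2.0 nats corresponds to a perplexity of exp(2.0) ~= 7.389; the exponent
# is clipped at 20 to avoid overflow on degenerate losses.
def _ppl_example(loss_value: float = 2.0) -> float:
    return math.exp(min(20, loss_value))  # ~7.389 for loss 2.0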
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import logging
from collections import OrderedDict
import numpy as np
from scipy.stats import pearsonr, spearmanr
from libai.utils import distributed as dist
from .evaluator import DatasetEvaluator
logger = logging.getLogger(__name__)
class RegEvaluator(DatasetEvaluator):
def __init__(self):
self._predictions = []
def reset(self):
self._predictions = []
def process(self, inputs, outputs):
pred_logits = outputs["prediction_scores"]
labels = inputs["labels"]
        # collect predictions and labels for correlation metrics
preds = pred_logits.cpu().topk(1)[1].squeeze(1).numpy()
labels = labels.cpu().numpy()
self._predictions.append({"preds": preds, "labels": labels})
def evaluate(self):
if not dist.is_main_process():
return {}
else:
predictions = self._predictions
preds = np.array([])
labels = np.array([])
for prediction in predictions:
preds = np.concatenate((preds, prediction["preds"]))
labels = np.concatenate((labels, prediction["labels"]))
pearson_corr = pearsonr(preds, labels)[0]
spearman_corr = spearmanr(preds, labels)[0]
corr = (pearson_corr + spearman_corr) / 2
self._results = OrderedDict()
self._results["pearson"] = pearson_corr
self._results["spearman"] = spearman_corr
self._results["corr"] = corr
return copy.deepcopy(self._results)
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from collections.abc import Mapping
import oneflow as flow
from libai.utils import distributed as dist
def pad_batch(x_dict, batch_size, last_batch_lack, is_last_batch):
x = list(x_dict.values())[0]
tensor_batch = x.shape[0]
assert tensor_batch <= batch_size
if tensor_batch == batch_size and not is_last_batch:
return x_dict, batch_size
valid_sample = tensor_batch - last_batch_lack
data_parallel_size = dist.get_data_parallel_size()
assert tensor_batch % data_parallel_size == 0
tensor_micro_batch_size = tensor_batch // data_parallel_size
padded_dict = {}
for key, xi in x_dict.items():
pad_shape = (batch_size, *xi.shape[1:])
local_xi = xi.to_global(
sbp=flow.sbp.broadcast, placement=flow.env.all_device_placement("cuda")
).to_local()
padded_xi = flow.zeros(pad_shape, dtype=xi.dtype, device="cuda")
padded_xi[:tensor_batch, ...] = padded_xi[:tensor_batch, ...] + local_xi
for i in range(last_batch_lack - 1):
start_idx = tensor_micro_batch_size * (data_parallel_size - i - 1) - 1
padded_xi[start_idx:-1] = padded_xi[start_idx + 1 :]
padded_xi = padded_xi.to_global(
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]), placement=xi.placement
).to_global(sbp=xi.sbp)
padded_dict[key] = padded_xi
return padded_dict, valid_sample
def print_csv_format(results):
"""
Print main metrics in a particular format
so that they are easy to copypaste into a spreadsheet.
Args:
results (OrderedDict[dict]): task_name -> {metric -> score}
unordered dict can also be printed, but in arbitrary order
"""
assert isinstance(results, Mapping) or not len(results), results
logger = logging.getLogger(__name__)
for task, res in results.items():
if isinstance(res, Mapping):
# Don't print "AP-category" metrics since they are usually not tracked.
important_res = [(k, v) for k, v in res.items() if "-" not in k]
logger.info("copypaste: Task: {}".format(task))
logger.info("copypaste: " + ",".join([k[0] for k in important_res]))
logger.info("copypaste: " + ",".join(["{0:.4f}".format(k[1]) for k in important_res]))
else:
logger.info(f"copypaste: {task}={res}")
def flatten_results_dict(results):
"""
Expand a hierarchical dict of scalars into a flat dict of scalars.
If results[k1][k2][k3] = v, the returned dict will have the entry
{"k1/k2/k3": v}.
Args:
results (dict):
"""
r = {}
for k, v in results.items():
if isinstance(v, Mapping):
v = flatten_results_dict(v)
for kk, vv in v.items():
r[k + "/" + kk] = vv
else:
r[k] = v
return r
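# Illustrative only: a small example of the flattening behaviour documented above.
def _flatten_results_dict_example():
    nested = {"task": {"Acc@1": 75.0, "Acc@5": 92.0}, "loss": 0.3}
    return flatten_results_dict(nested)
    # -> {"task/Acc@1": 75.0, "task/Acc@5": 92.0, "loss": 0.3}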
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
from abc import ABCMeta, abstractmethod
from typing import Any, Dict
import oneflow as flow
from libai.config import LazyConfig, try_get_key
from libai.engine import DefaultTrainer
from libai.utils import distributed as dist
from libai.utils.logger import setup_logger
logger = setup_logger(distributed_rank=dist.get_rank())
logger = logging.getLogger("libai.inference")
class BasePipeline(metaclass=ABCMeta):
"""
    Base class for all task pipelines.
"""
def __init__(
self,
config_file,
data_parallel=None,
tensor_parallel=None,
pipeline_parallel=None,
pipeline_stage_id=None,
pipeline_num_layers=None,
model_path=None,
mode="libai",
**kwargs,
):
# init cfg
self.cfg = LazyConfig.load(config_file)
flow.boxing.nccl.set_fusion_threshold_mbytes(
try_get_key(self.cfg, "train.nccl_fusion_threshold_mb", default=16)
)
flow.boxing.nccl.set_fusion_max_ops_num(
try_get_key(self.cfg, "train.nccl_fusion_max_ops", default=24)
)
self.update_cfg(
data_parallel,
tensor_parallel,
pipeline_parallel,
pipeline_stage_id,
pipeline_num_layers,
)
dist.setup_dist_util(self.cfg.train.dist)
assert (
self.cfg.train.dist.data_parallel_size == 1
), "not support data parallel yet, only support tensor and pipeline parallel"
logger.info(self.cfg.train.dist)
# initial and load model
self.model = self.load_pretrain_weight(self.cfg.model, model_path, mode=mode)
self.model._apply(dist.convert_to_distributed_default_setting)
self.model = self.model.eval()
# initial tokenizer
if dist.is_main_process():
self.tokenizer = self.build_tokenizer(self.cfg)
else:
self.tokenizer = None
self.tokenizer = dist.broadcast_py_object(self.tokenizer, src=0)
# set parameters
(
self._preprocess_params,
self._forward_params,
self._postprocess_params,
) = self._parse_parameters(**kwargs)
def update_cfg(
self,
data_parallel=1,
tensor_parallel=1,
pipeline_parallel=1,
pipeline_stage_id=None,
pipeline_num_layers=None,
):
self.cfg.train.dist.data_parallel_size = data_parallel
self.cfg.train.dist.tensor_parallel_size = tensor_parallel
self.cfg.train.dist.pipeline_parallel_size = pipeline_parallel
self.cfg.train.dist.custom_pipeline_stage_id = pipeline_stage_id
if pipeline_num_layers is not None:
self.cfg.train.dist.pipeline_num_layers = pipeline_num_layers
if self.cfg.train.dist.pipeline_parallel_size > 1:
assert (
try_get_key(self.cfg.train.dist, "pipeline_num_layers") is not None
), "cfg.train.dist.pipeline_num_layers must be set when run pipeline parallel"
def load_pretrain_weight(
self,
libai_cfg_model,
model_path,
mode="libai",
):
"""load pretrained model.
Args:
libai_cfg_model (libai.models): Lazy config Model in Libai, you can import it
by `from libai.config.configs.common.models.bert
import pretrain_model as libai_cfg_model`
model_path (str): The directory path of pretrained model
mode (str): set it to `libai` for loading trained model from libai,
set it to `random` for quickly debugging by random initialized model
"""
if mode == "libai":
from libai.models.utils.model_loader.base_loader import ModelLoaderLiBai
model_loader = ModelLoaderLiBai(libai_cfg_model, libai_cfg_model.cfg, model_path)
model_loader.base_model_prefix_1 = None
model_loader.base_model_prefix_2 = ""
return model_loader.load()
elif mode == "random":
return DefaultTrainer.build_model(self.cfg)
else:
raise NotImplementedError
def build_tokenizer(self, cfg):
tokenizer = None
if try_get_key(cfg, "tokenization") is not None:
tokenizer = DefaultTrainer.build_tokenizer(cfg)
return tokenizer
@abstractmethod
def _parse_parameters(self, **pipeline_parameters):
raise NotImplementedError("_parse_parameters not implemented")
def __call__(self, inputs, *args, batch_size=None, **kwargs) -> dict:
preprocess_params, forward_params, postprocess_params = self._parse_parameters(
**kwargs
) # noqa
# Fuse __init__ params and __call__ params without modifying the __init__ ones.
preprocess_params = {**self._preprocess_params, **preprocess_params}
forward_params = {**self._forward_params, **forward_params}
postprocess_params = {**self._postprocess_params, **postprocess_params}
with flow.no_grad():
model_inputs_dict = self.preprocess(inputs, **preprocess_params)
model_outputs_dict = self.forward(model_inputs_dict, **forward_params)
model_outputs_dict = self.to_local(model_outputs_dict)
if dist.is_main_process():
outputs_dict = self.postprocess(model_outputs_dict, **postprocess_params)
else:
outputs_dict = {}
dist.synchronize()
return outputs_dict
def to_local(self, model_outputs_dict):
for key, value in model_outputs_dict.items():
if isinstance(value, flow.Tensor) and value.is_global:
model_outputs_dict[key] = dist.ttol(
value, ranks=[0] if value.placement.ranks.ndim == 1 else [[0]]
)
if flow.cuda.is_available():
dist.synchronize()
return model_outputs_dict
@abstractmethod
def preprocess(self, input_: Any, **preprocess_parameters: Dict) -> dict:
raise NotImplementedError("preprocess not implemented")
@abstractmethod
def forward(self, **kwargs: Dict) -> dict:
raise NotImplementedError("forward not implemented")
@abstractmethod
def postprocess(self, **kwargs: Dict) -> dict:
raise NotImplementedError("postprocess not implemented")
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
# Copyright 2020 The Google AI Language Team Authors, Facebook AI Research authors and
# The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings
from abc import ABC, abstractmethod
from collections import UserDict
from typing import Optional, Tuple
import oneflow as flow
from libai.utils import distributed as dist
class BeamScorer(ABC):
@abstractmethod
def process(
self,
input_ids: flow.Tensor,
next_scores: flow.Tensor,
next_tokens: flow.Tensor,
next_indices: flow.Tensor,
**kwargs,
):
raise NotImplementedError("This is an abstract method.")
class BeamHypotheses:
def __init__(self, num_beams: int, length_penalty: float, early_stopping: bool):
"""
Initialize n-best list of hypotheses.
"""
self.length_penalty = length_penalty
self.early_stopping = early_stopping
self.num_beams = num_beams
self.beams = []
self.worst_score = 1e9
def __len__(self) -> int:
"""
Number of hypotheses in the list.
"""
return len(self.beams)
def add(
self, hyp: flow.Tensor, sum_logprobs: float, beam_indices: Optional[flow.Tensor] = None
):
"""
Add a new hypothesis to the list.
"""
score = sum_logprobs / (hyp.shape[-1] ** self.length_penalty)
if len(self) < self.num_beams or score > self.worst_score:
self.beams.append((score, hyp, beam_indices))
if len(self) > self.num_beams:
sorted_next_scores = sorted([(s, idx) for idx, (s, _, _) in enumerate(self.beams)])
del self.beams[sorted_next_scores[0][1]]
self.worst_score = sorted_next_scores[1][0]
else:
self.worst_score = min(score, self.worst_score)
def is_done(self, best_sum_logprobs: float, cur_len: int) -> bool:
"""
        If there are enough hypotheses and none of the hypotheses being generated
        can become better than the worst one in the heap, then we are done with this sentence.
"""
if len(self) < self.num_beams:
return False
elif self.early_stopping:
return True
else:
cur_score = best_sum_logprobs / cur_len ** self.length_penalty
ret = self.worst_score >= cur_score
return ret
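# Illustrative only: a tiny worked example of the length-penalty score used by
# `BeamHypotheses.add`, i.e. score = sum_logprobs / len(hyp) ** length_penalty.
# With sum_logprobs = -6.0 and a 4-token hypothesis, length_penalty = 1.0 gives
# -6.0 / 4 = -1.5 while length_penalty = 2.0 gives -6.0 / 16 = -0.375, so larger
# penalties favour longer hypotheses.
def _length_penalty_score_example(sum_logprobs=-6.0, hyp_len=4, length_penalty=1.0):
    return sum_logprobs / (hyp_len ** length_penalty)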
class BeamSearchScorer(BeamScorer):
def __init__(
self,
batch_size: int,
num_beams: int,
length_penalty: Optional[float] = 1.0,
do_early_stopping: Optional[bool] = False,
num_beam_hyps_to_keep: Optional[int] = 1,
num_beam_groups: Optional[int] = 1,
**kwargs,
):
self.num_beams = num_beams
self.length_penalty = length_penalty
self.do_early_stopping = do_early_stopping
self.num_beam_hyps_to_keep = num_beam_hyps_to_keep
self.num_beam_groups = num_beam_groups
self.group_size = self.num_beams // self.num_beam_groups
self._is_init = False
self._beam_hyps = [
BeamHypotheses(
num_beams=self.num_beams,
length_penalty=self.length_penalty,
early_stopping=self.do_early_stopping,
)
for _ in range(batch_size)
]
self._done = flow.tensor(
[False for _ in range(batch_size)],
dtype=flow.bool,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=flow.placement("cuda", list(range(dist.get_world_size()))),
)
if not isinstance(num_beams, int) or num_beams <= 1:
raise ValueError(
f"`num_beams` has to be an integer strictly greater than 1, but is {num_beams}."
"For `num_beams` == 1, one should make use of `greedy_search` instead."
)
if (
not isinstance(num_beam_groups, int)
or (num_beam_groups > num_beams)
or (num_beams % num_beam_groups != 0)
):
raise ValueError(
"`num_beam_groups` has to be an integer smaller or equal than `num_beams` and "
f"`num_beams` has to be divisible by `num_beam_groups`, but is {num_beam_groups}"
f"with `num_beams` being {num_beams}."
)
if "max_length" in kwargs:
warnings.warn(
"Passing `max_length` to BeamSearchScorer is deprecated and has no effect. "
"`max_length` should be passed directly to `beam_search(...)`, `beam_sample(...)`"
", or `group_beam_search(...)`."
)
@property
def is_done(self) -> bool:
return self._done.all()
def process(
self,
input_ids: flow.Tensor,
next_scores: flow.Tensor,
next_tokens: flow.Tensor,
next_indices: flow.Tensor,
pad_token_id: Optional[int] = None,
eos_token_id: Optional[int] = None,
beam_indices: Optional[flow.Tensor] = None,
) -> Tuple[flow.Tensor]:
cur_len = input_ids.shape[-1]
batch_size = len(self._beam_hyps)
if not (batch_size == (input_ids.shape[0] // self.group_size)):
if self.num_beam_groups > 1:
raise ValueError(
f"A group beam size of {input_ids.shape[0]} is used as the input, but a group "
f"beam size of {self.group_size} is expected by the beam scorer."
)
else:
raise ValueError(
f"A beam size of {input_ids.shape[0]} is used as the input, but a beam size of "
f"{self.group_size} is expected by the beam scorer."
)
next_beam_scores = flow.zeros(
(batch_size, self.group_size),
dtype=next_scores.dtype,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=flow.placement("cuda", list(range(dist.get_world_size()))),
)
next_beam_tokens = flow.zeros(
(batch_size, self.group_size),
dtype=next_tokens.dtype,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=flow.placement("cuda", list(range(dist.get_world_size()))),
)
next_beam_indices = flow.zeros(
(batch_size, self.group_size),
dtype=next_indices.dtype,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=flow.placement("cuda", list(range(dist.get_world_size()))),
)
for batch_idx, beam_hyp in enumerate(self._beam_hyps):
if self._done[batch_idx]:
if self.num_beams < len(beam_hyp):
raise ValueError(
f"Batch can only be done if at least {self.num_beams} beams have "
"been generated"
)
if eos_token_id is None or pad_token_id is None:
raise ValueError(
"Generated beams >= num_beams -> eos_token_id and pad_token have "
"to be defined"
)
# pad the batch
next_beam_scores[batch_idx, :] = 0
next_beam_tokens[batch_idx, :] = pad_token_id
next_beam_indices[batch_idx, :] = 0
continue
# next tokens for this sentence
beam_idx = 0
for beam_token_rank, (next_token, next_score, next_index) in enumerate(
zip(next_tokens[batch_idx], next_scores[batch_idx], next_indices[batch_idx])
):
batch_beam_idx = batch_idx * self.group_size + next_index
# add to generated hypotheses if end of sentence
if (eos_token_id is not None) and (next_token.item() == eos_token_id):
# if beam_token does not belong to top num_beams tokens, it should not be added
is_beam_token_worse_than_top_num_beams = beam_token_rank >= self.group_size
if is_beam_token_worse_than_top_num_beams:
continue
if beam_indices is not None:
beam_index = beam_indices[batch_beam_idx]
beam_index = beam_index + (next_index,)
else:
beam_index = None
beam_hyp.add(
input_ids[batch_beam_idx].clone(),
next_score.item(),
beam_indices=beam_index,
)
else:
# add next predicted token since it is not eos_token
next_beam_scores[batch_idx, beam_idx] = next_score
next_beam_tokens[batch_idx, beam_idx] = next_token
next_beam_indices[batch_idx, beam_idx] = batch_beam_idx
beam_idx += 1
# once the beam for next step is full, don't add more tokens to it.
if beam_idx == self.group_size:
break
if beam_idx < self.group_size:
raise ValueError(
f"At most {self.group_size} tokens in {next_tokens[batch_idx]} can be equal "
f"to `eos_token_id: {eos_token_id}`. Make sure {next_tokens[batch_idx]} "
"are corrected."
)
# Check if we are done so that we can save a pad step if all(done)
self._done[batch_idx] = self._done[batch_idx] or beam_hyp.is_done(
next_scores[batch_idx].max().item(), cur_len
)
return UserDict(
{
"next_beam_scores": next_beam_scores.view(-1),
"next_beam_tokens": next_beam_tokens.view(-1),
"next_beam_indices": next_beam_indices.view(-1),
}
)
def finalize(
self,
input_ids: flow.Tensor,
final_beam_scores: flow.Tensor,
final_beam_tokens: flow.Tensor,
final_beam_indices: flow.Tensor,
max_length: int,
pad_token_id: Optional[int] = None,
eos_token_id: Optional[int] = None,
beam_indices: Optional[flow.Tensor] = None,
):
batch_size = len(self._beam_hyps)
# finalize all open beam hypotheses and add to generated hypotheses
for batch_idx, beam_hyp in enumerate(self._beam_hyps):
if self._done[batch_idx]:
continue
# all open beam hypotheses are added to the beam hypothesis
# beam hypothesis class automatically keeps the best beams
for beam_id in range(self.num_beams):
batch_beam_idx = batch_idx * self.num_beams + beam_id
final_score = final_beam_scores[batch_beam_idx].item()
final_tokens = input_ids[batch_beam_idx]
beam_index = beam_indices[batch_beam_idx] if beam_indices is not None else None
beam_hyp.add(final_tokens, final_score, beam_indices=beam_index)
# select the best hypotheses
sent_lengths = flow.zeros(
batch_size * self.num_beam_hyps_to_keep,
dtype=flow.long,
sbp=input_ids.sbp,
placement=input_ids.placement,
)
best = []
best_indices = []
best_scores = flow.zeros(
batch_size * self.num_beam_hyps_to_keep,
dtype=flow.float32,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=flow.placement("cuda", list(range(dist.get_world_size()))),
)
# retrieve best hypotheses
for i, beam_hyp in enumerate(self._beam_hyps):
sorted_hyps = sorted(beam_hyp.beams, key=lambda x: x[0])
for j in range(self.num_beam_hyps_to_keep):
best_hyp_tuple = sorted_hyps.pop()
best_score = best_hyp_tuple[0]
best_hyp = best_hyp_tuple[1]
best_index = best_hyp_tuple[2]
sent_lengths[self.num_beam_hyps_to_keep * i + j] = len(best_hyp)
# append hyp to lists
best.append(best_hyp)
# append indices to list
best_indices.append(best_index)
best_scores[i * self.num_beam_hyps_to_keep + j] = best_score
# prepare for adding eos
sent_lengths_max = sent_lengths.max().item() + 1
sent_max_len = (
min(sent_lengths_max, max_length) if max_length is not None else sent_lengths_max
)
decoded = flow.zeros(
(batch_size * self.num_beam_hyps_to_keep, sent_max_len),
dtype=flow.long,
sbp=input_ids.sbp,
placement=input_ids.placement,
)
if len(best_indices) > 0 and best_indices[0] is not None:
indices = flow.zeros(
(batch_size * self.num_beam_hyps_to_keep, sent_max_len),
dtype=flow.long,
sbp=input_ids.sbp,
placement=input_ids.placement,
)
else:
indices = None
# shorter batches are padded if needed
if sent_lengths.min().item() != sent_lengths.max().item():
assert pad_token_id is not None, "`pad_token_id` has to be defined"
decoded.fill_(pad_token_id)
if indices is not None:
indices.fill_(-1)
# fill with hypotheses and eos_token_id if the latter fits in
for i, (hypo, best_idx) in enumerate(zip(best, best_indices)):
decoded[i, : sent_lengths[i]] = hypo
if indices is not None:
indices[i, : len(best_idx)] = flow.tensor(best_idx)
if sent_lengths[i] < sent_max_len:
decoded[i, sent_lengths[i]] = eos_token_id
return UserDict(
{
"sequences": decoded,
"sequence_scores": best_scores,
"beam_indices": indices,
}
)
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
# Copyright 2020 The Google AI Language Team Authors, Facebook AI Research authors and
# The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
import math
from typing import Callable, List, Tuple
import oneflow as flow
class LogitsProcessorList(list):
def __call__(self, input_ids: flow.Tensor, scores: flow.Tensor, **kwargs) -> flow.Tensor:
for processor in self:
function_args = inspect.signature(processor.__call__).parameters
if len(function_args) > 2:
if not all(arg in kwargs for arg in list(function_args.keys())[2:]):
raise ValueError(
f"Make sure that all the required parameters: {list(function_args.keys())} "
"for {processor.__class__} are passed to the logits processor."
)
scores = processor(input_ids, scores, **kwargs)
else:
scores = processor(input_ids, scores)
return scores
class NormalizationLogitsProcessor(object):
def __call__(self, input_ids: flow.Tensor, scores: flow.Tensor) -> flow.Tensor:
scores = scores.log_softmax(dim=-1)
return scores
class InfNanRemoveLogitsProcessor(object):
def __call__(self, input_ids: flow.Tensor, scores: flow.Tensor) -> flow.Tensor:
scores[scores != scores] = 0.0
scores[scores == float("inf")] = flow.finfo(scores.dtype).max
return scores
class ForcedEOSTokenLogitsProcessor(object):
def __init__(self, max_length: int, eos_token_id: int):
self.max_length = max_length
self.eos_token_id = eos_token_id
def __call__(self, input_ids: flow.Tensor, scores: flow.Tensor) -> flow.Tensor:
cur_len = input_ids.shape[-1]
if cur_len == self.max_length - 1:
num_tokens = scores.shape[1]
scores[:, [i for i in range(num_tokens) if i != self.eos_token_id]] = -float("inf")
scores[:, self.eos_token_id] = 0
return scores
class ForcedBOSTokenLogitsProcessor(object):
def __init__(self, bos_token_id: int):
self.bos_token_id = bos_token_id
def __call__(self, input_ids: flow.Tensor, scores: flow.Tensor) -> flow.Tensor:
cur_len = input_ids.shape[-1]
if cur_len == 1:
num_tokens = scores.shape[1]
scores[:, [i for i in range(num_tokens) if i != self.bos_token_id]] = -float("inf")
scores[:, self.bos_token_id] = 0
return scores
class RepetitionPenaltyLogitsProcessor(object):
def __init__(self, penalty: float):
if not isinstance(penalty, float) or not (penalty > 0):
raise ValueError(f"`penalty` has to be a strictly positive float, but is {penalty}")
self.penalty = penalty
def __call__(self, input_ids: flow.Tensor, scores: flow.Tensor) -> flow.Tensor:
score = flow.gather(scores, 1, input_ids)
score = flow.where(score < 0, score * self.penalty, score / self.penalty)
scores = flow.scatter(scores, 1, input_ids, score)
return scores
class HammingDiversityLogitsProcessor(object):
def __init__(self, diversity_penalty: float, num_beams: int, num_beam_groups: int):
if not isinstance(diversity_penalty, float) or (not diversity_penalty > 0.0):
raise ValueError("`diversity_penalty` should be a float strictly larger than 0.")
self._diversity_penalty = diversity_penalty
if not isinstance(num_beams, int) or num_beams < 2:
raise ValueError("`num_beams` should be an integer strictly larger than 1.")
self._num_beams = num_beams
if not isinstance(num_beam_groups, int) or num_beam_groups < 2:
raise ValueError("`num_beam_groups` should be an integer strictly larger than 1.")
if num_beam_groups > num_beams:
raise ValueError("`beam_groups` has to be smaller or equal to `num_beams`.")
self._num_sub_beams = num_beams // num_beam_groups
def __call__(self, input_ids, scores, current_tokens, beam_group_idx) -> flow.Tensor:
scores = scores.numpy()
batch_size = current_tokens.shape[0] // self._num_beams
group_start_idx = beam_group_idx * self._num_sub_beams
group_end_idx = min(group_start_idx + self._num_sub_beams, self._num_beams)
group_size = group_end_idx - group_start_idx
vocab_size = scores.shape[-1]
if group_start_idx == 0:
return scores
for batch_idx in range(batch_size):
# predicted tokens of last time step of previous groups
previous_group_tokens = current_tokens[
batch_idx * self._num_beams : batch_idx * self._num_beams + group_start_idx
]
token_frequency = flow.bincount(previous_group_tokens, minlength=vocab_size)
scores[batch_idx * group_size : (batch_idx + 1) * group_size] = (
scores[batch_idx * group_size : (batch_idx + 1) * group_size]
- self._diversity_penalty * token_frequency
)
return scores
def _get_ngrams(ngram_size: int, prev_input_ids: flow.Tensor, num_hypos: int):
generated_ngrams = [{} for _ in range(num_hypos)]
for idx in range(num_hypos):
gen_tokens = prev_input_ids[idx].tolist()
generated_ngram = generated_ngrams[idx]
for ngram in zip(*[gen_tokens[i:] for i in range(ngram_size)]):
prev_ngram_tuple = tuple(ngram[:-1])
generated_ngram[prev_ngram_tuple] = generated_ngram.get(prev_ngram_tuple, []) + [
ngram[-1]
]
return generated_ngrams
def _get_generated_ngrams(banned_ngrams, prev_input_ids, ngram_size, cur_len):
start_idx = cur_len + 1 - ngram_size
ngram_idx = tuple(prev_input_ids[start_idx:cur_len].tolist())
return banned_ngrams.get(ngram_idx, [])
def _calc_banned_ngram_tokens(
ngram_size: int, prev_input_ids: flow.Tensor, num_hypos: int, cur_len: int
):
if cur_len + 1 < ngram_size:
return [[] for _ in range(num_hypos)]
generated_ngrams = _get_ngrams(ngram_size, prev_input_ids, num_hypos)
banned_tokens = [
_get_generated_ngrams(
generated_ngrams[hypo_idx], prev_input_ids[hypo_idx], ngram_size, cur_len
)
for hypo_idx in range(num_hypos)
]
return banned_tokens
class NoRepeatNGramLogitsProcessor(object):
def __init__(self, ngram_size: int):
if not isinstance(ngram_size, int) or ngram_size <= 0:
raise ValueError(
f"`ngram_size` has to be a strictly positive integer, but is {ngram_size}"
)
self.ngram_size = ngram_size
def __call__(self, input_ids, scores) -> flow.Tensor:
num_batch_hypotheses = scores.shape[0]
cur_len = input_ids.shape[-1]
banned_batch_tokens = _calc_banned_ngram_tokens(
self.ngram_size, input_ids, num_batch_hypotheses, cur_len
)
for i, banned_tokens in enumerate(banned_batch_tokens):
scores[i, banned_tokens] = -float("inf")
return scores
class EncoderNoRepeatNGramLogitsProcessor(object):
def __init__(self, encoder_ngram_size: int, encoder_input_ids: flow.Tensor):
if not isinstance(encoder_ngram_size, int) or encoder_ngram_size <= 0:
raise ValueError(
"`encoder_ngram_size` has to be a strictly positive integer, but is "
f"{encoder_ngram_size}"
)
self.ngram_size = encoder_ngram_size
if len(encoder_input_ids.shape) == 1:
encoder_input_ids = encoder_input_ids.unsqueeze(0)
self.batch_size = encoder_input_ids.shape[0]
self.generated_ngrams = _get_ngrams(encoder_ngram_size, encoder_input_ids, self.batch_size)
def __call__(self, input_ids: flow.Tensor, scores: flow.Tensor) -> flow.Tensor:
# B x num_beams
num_hypos = scores.shape[0]
num_beams = num_hypos // self.batch_size
cur_len = input_ids.shape[-1]
banned_batch_tokens = [
_get_generated_ngrams(
self.generated_ngrams[hypo_idx // num_beams],
input_ids[hypo_idx],
self.ngram_size,
cur_len,
)
for hypo_idx in range(num_hypos)
]
for i, banned_tokens in enumerate(banned_batch_tokens):
scores[i, banned_tokens] = -float("inf")
return scores
class MinLengthLogitsProcessor(object):
def __init__(self, min_length: int, eos_token_id: int):
if not isinstance(min_length, int) or min_length < 0:
raise ValueError(f"`min_length` has to be a positive integer, but is {min_length}")
if not isinstance(eos_token_id, int) or eos_token_id < 0:
raise ValueError(f"`eos_token_id` has to be a positive integer, but is {eos_token_id}")
self.min_length = min_length
self.eos_token_id = eos_token_id
def __call__(self, input_ids: flow.Tensor, scores: flow.Tensor) -> flow.Tensor:
cur_len = input_ids.shape[-1]
if cur_len < self.min_length:
scores[:, self.eos_token_id] = -float("inf")
return scores
class PrefixConstrainedLogitsProcessor(object):
def __init__(
self, prefix_allowed_tokens_fn: Callable[[int, flow.Tensor], List[int]], num_beams: int
):
self._prefix_allowed_tokens_fn = prefix_allowed_tokens_fn
self._num_beams = num_beams
def __call__(self, input_ids: flow.Tensor, scores: flow.Tensor) -> flow.Tensor:
mask = flow.full_like(scores, -math.inf)
for batch_id, beam_sent in enumerate(
input_ids.view(-1, self._num_beams, input_ids.shape[-1])
):
for beam_id, sent in enumerate(beam_sent):
mask[
batch_id * self._num_beams + beam_id,
self._prefix_allowed_tokens_fn(batch_id, sent),
] = 0
return scores + mask
class ExponentialDecayLengthPenalty(object):
def __init__(
self, exponential_decay_length_penalty: Tuple, eos_token_id: int, input_ids_seq_length: int
):
self.regulation_start = exponential_decay_length_penalty[0] + input_ids_seq_length
self.regulation_factor = exponential_decay_length_penalty[1]
self.eos_token_id = eos_token_id
def __call__(self, input_ids: flow.Tensor, scores: flow.Tensor) -> flow.Tensor:
cur_len = input_ids.shape[-1]
if cur_len > self.regulation_start:
scores[:, self.eos_token_id] = scores[:, self.eos_token_id] * pow(
self.regulation_factor, cur_len - self.regulation_start
)
return scores
class TemperatureLogitsWarper(object):
def __init__(self, temperature: float):
if not isinstance(temperature, float) or not (temperature > 0):
raise ValueError(
f"`temperature` has to be a strictly positive float, but is {temperature}"
)
self.temperature = temperature
def __call__(self, input_ids: flow.Tensor, scores: flow.Tensor) -> flow.Tensor:
scores = scores / self.temperature
return scores
class TopPLogitsWarper(object):
def __init__(
self, top_p: float, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1
):
top_p = float(top_p)
if top_p < 0 or top_p > 1.0:
raise ValueError(f"`top_p` has to be a float > 0 and < 1, but is {top_p}")
self.top_p = top_p
self.filter_value = filter_value
self.min_tokens_to_keep = min_tokens_to_keep
def __call__(self, input_ids: flow.Tensor, scores: flow.Tensor) -> flow.Tensor:
sorted_logits, sorted_indices = flow.sort(scores, descending=True)
cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
        # Remove tokens with cumulative probability above the top_p threshold (tokens with 0 are kept)
sorted_indices_to_remove = cumulative_probs > self.top_p
if self.min_tokens_to_keep > 1:
# Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1
# because we add the first one below)
sorted_indices_to_remove[..., : self.min_tokens_to_keep - 1] = 0
# Shift the indices to the right to keep also the first token above the threshold
sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
sorted_indices_to_remove[..., 0] = 0
# scatter sorted tensors to original indexing
indices_to_remove = flow.scatter(
sorted_indices_to_remove, 1, sorted_indices, sorted_indices_to_remove
)
scores = scores.masked_fill(indices_to_remove, self.filter_value)
return scores
class TopKLogitsWarper(object):
def __init__(
self, top_k: int, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1
):
if not isinstance(top_k, int) or top_k <= 0:
raise ValueError(f"`top_k` has to be a strictly positive integer, but is {top_k}")
self.top_k = top_k
self.filter_value = filter_value
self.min_tokens_to_keep = min_tokens_to_keep
def __call__(self, input_ids: flow.Tensor, scores: flow.Tensor) -> flow.Tensor:
top_k = min(max(self.top_k, self.min_tokens_to_keep), scores.size(-1)) # Safety check
# Remove all tokens with a probability less than the last token of the top-k
indices_to_remove = scores < flow.topk(scores, top_k)[0][..., -1, None]
scores = scores.masked_fill(indices_to_remove, self.filter_value)
return scores
class TypicalLogitsWarper(object):
def __init__(
self, mass: float = 0.9, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1
):
mass = float(mass)
if not (mass > 0 and mass < 1):
raise ValueError(f"`typical_p` has to be a float > 0 and < 1, but is {mass}")
self.filter_value = filter_value
self.mass = mass
self.min_tokens_to_keep = min_tokens_to_keep
def __call__(self, input_ids: flow.Tensor, scores: flow.Tensor) -> flow.Tensor:
# calculate entropy
normalized = flow.nn.functional.log_softmax(scores, dim=-1)
p = flow.exp(normalized)
ent = -flow.nansum(normalized * p, dim=-1, keepdim=True)
# shift and sort
shifted_scores = flow.abs((-normalized) - ent)
sorted_scores, sorted_indices = flow.sort(shifted_scores, descending=False)
sorted_logits = scores.gather(-1, sorted_indices)
cumulative_probs = sorted_logits.softmax(dim=-1).cumsum(dim=-1)
# Remove tokens with cumulative mass above the threshold
last_ind = (cumulative_probs < self.mass).sum(dim=1)
last_ind[last_ind < 0] = 0
sorted_indices_to_remove = sorted_scores > sorted_scores.gather(1, last_ind.view(-1, 1))
if self.min_tokens_to_keep > 1:
# Keep at least min_tokens_to_keep
# (set to min_tokens_to_keep-1 because we add the first one below)
sorted_indices_to_remove[..., : self.min_tokens_to_keep] = 0
indices_to_remove = flow.scatter(
sorted_indices_to_remove, 1, sorted_indices, sorted_indices_to_remove
)
scores = scores.masked_fill(indices_to_remove, self.filter_value)
return scores
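# Illustrative only: a hedged sketch of chaining several of the processors above with
# `LogitsProcessorList`. The vocabulary size, logits and previously generated token ids
# below are arbitrary assumptions.
def _logits_processor_chain_example():
    processors = LogitsProcessorList(
        [
            RepetitionPenaltyLogitsProcessor(penalty=1.2),
            TemperatureLogitsWarper(temperature=0.7),
            TopKLogitsWarper(top_k=2),
        ]
    )
    input_ids = flow.tensor([[3, 5, 5]])  # previously generated tokens
    scores = flow.randn(1, 8)  # next-token logits over an 8-word toy vocabulary
    return processors(input_ids, scores)  # penalized, temperature-scaled, top-k filtered logits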
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
# Copyright 2020 The Google AI Language Team Authors, Facebook AI Research authors and
# The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import warnings
from copy import deepcopy
import oneflow as flow
class StoppingCriteriaList(list):
def __call__(self, input_ids: flow.Tensor, scores: flow.Tensor, **kwargs) -> bool:
return any(criteria(input_ids, scores) for criteria in self)
@property
def max_length(self):
for stopping_criterium in self:
if isinstance(stopping_criterium, MaxLengthCriteria):
return stopping_criterium.max_length
return None
class MaxLengthCriteria(object):
def __init__(self, max_length: int):
self.max_length = max_length
def __call__(self, input_ids: flow.Tensor, scores: flow.Tensor) -> bool:
return input_ids.shape[-1] >= self.max_length
class MaxTimeCriteria(object):
def __init__(self, max_time: float, initial_timestamp: float = None):
self.max_time = max_time
self.initial_timestamp = time.time() if initial_timestamp is None else initial_timestamp
def __call__(self, input_ids: flow.Tensor, scores: flow.Tensor, **kwargs) -> bool:
return time.time() - self.initial_timestamp > self.max_time
def validate_stopping_criteria(stopping_criteria: StoppingCriteriaList, max_length: int):
stopping_max_length = stopping_criteria.max_length
new_stopping_criteria = deepcopy(stopping_criteria)
if stopping_max_length is not None and stopping_max_length != max_length:
warnings.warn(
"You set different `max_length` for stopping criteria and `max_length` parameter",
UserWarning,
)
elif stopping_max_length is None:
new_stopping_criteria.append(MaxLengthCriteria(max_length=max_length))
return new_stopping_criteria
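# Minimal usage sketch: criteria compose through `StoppingCriteriaList`, and
# `validate_stopping_criteria` appends a `MaxLengthCriteria` when none is
# present. The dummy tensors below are illustrative assumptions only.
if __name__ == "__main__":
    criteria = StoppingCriteriaList([MaxTimeCriteria(max_time=10.0)])
    criteria = validate_stopping_criteria(criteria, max_length=20)
    dummy_input_ids = flow.zeros((1, 25), dtype=flow.long)
    dummy_scores = flow.zeros((1, 100))
    # True: the sequence length (25) already exceeds the max_length criterion (20).
    print(criteria(dummy_input_ids, dummy_scores))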
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
# Copyright 2020 The Google AI Language Team Authors, Facebook AI Research authors and
# The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
import logging
import warnings
from typing import Callable, Dict, Iterable, List, Optional, Tuple, Union
import oneflow as flow
from oneflow import nn
from libai.utils import distributed as dist
from .generation_beam_search import BeamScorer, BeamSearchScorer
from .generation_logits_processor import (
EncoderNoRepeatNGramLogitsProcessor,
ExponentialDecayLengthPenalty,
ForcedBOSTokenLogitsProcessor,
ForcedEOSTokenLogitsProcessor,
HammingDiversityLogitsProcessor,
InfNanRemoveLogitsProcessor,
LogitsProcessorList,
MinLengthLogitsProcessor,
NoRepeatNGramLogitsProcessor,
NormalizationLogitsProcessor,
PrefixConstrainedLogitsProcessor,
RepetitionPenaltyLogitsProcessor,
TemperatureLogitsWarper,
TopKLogitsWarper,
TopPLogitsWarper,
TypicalLogitsWarper,
)
from .generation_stopping_criteria import (
MaxLengthCriteria,
MaxTimeCriteria,
StoppingCriteriaList,
validate_stopping_criteria,
)
logger = logging.getLogger(__name__)
class Generator:
def _prepare_model_inputs(
self,
inputs: Optional[flow.Tensor] = None,
bos_token_id: Optional[int] = None,
model_kwargs: Optional[Dict[str, flow.Tensor]] = None,
):
if self.cfg.is_encoder_decoder:
input_name = "encoder_input_ids"
else:
input_name = "input_ids"
model_kwargs = {k: v for k, v in model_kwargs.items() if v is not None or k != input_name}
inputs_kwarg = model_kwargs.pop(input_name, None)
if inputs_kwarg is not None and inputs is not None:
raise ValueError(
f"`inputs`: {inputs}` were passed alongside "
f"{input_name} which is not allowed."
f"Make sure to either pass {inputs} or {input_name}=..."
)
elif inputs_kwarg is not None:
inputs = inputs_kwarg
if inputs is None:
inputs = self._prepare_input_ids_for_generation(
bos_token_id, model_kwargs.get("encoder_outputs", None)
)
return inputs, input_name, model_kwargs
def prepare_inputs_for_generation(self, input_ids: flow.Tensor, **kwargs):
"""
Implement in subclasses of [`PreTrainedModel`] for custom behavior to prepare inputs in the
generate method.
"""
return {"input_ids": input_ids}
def _prepare_input_ids_for_generation(
self, bos_token_id: Optional[int], encoder_outputs: Optional[flow.Tensor]
):
if self.cfg.is_encoder_decoder and encoder_outputs is not None:
shape = encoder_outputs.size()[:-1]
return (
flow.ones(
shape,
dtype=flow.long,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=flow.placement("cuda", list(range(dist.get_world_size()))),
)
* -100
)
if bos_token_id is None:
raise ValueError("`bos_token_id` has to be defined when no `input_ids` are provided.")
return (
flow.ones(
(1, 1),
dtype=flow.long,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=flow.placement("cuda", list(range(dist.get_world_size()))),
)
* bos_token_id
)
def _prepare_attention_mask_for_generation(
self,
inputs: flow.Tensor,
pad_token_id: Optional[int],
eos_token_id: Optional[int],
):
is_input_ids = len(inputs.shape) == 2 and inputs.dtype in [flow.int64, flow.long]
is_pad_token_in_inputs = (pad_token_id is not None) and (pad_token_id in inputs)
is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or (
(eos_token_id is not None) and (pad_token_id != eos_token_id)
)
# Check if input is input_ids and padded -> only then is attention_mask defined
if is_input_ids and is_pad_token_in_inputs and is_pad_token_not_equal_to_eos_token_id:
return inputs.ne(pad_token_id).bool()
else:
return flow.ones(
inputs.shape[:2],
dtype=flow.bool,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=flow.placement("cuda", list(range(dist.get_world_size()))),
)
def _prepare_encoder_decoder_kwargs_for_generation(
self, inputs_tensor: flow.Tensor, model_kwargs, model_input_name: str
):
only_encoder = True
model_kwargs[model_input_name] = inputs_tensor
if "encoder_decoder_attn_mask" in set(inspect.signature(self.forward).parameters):
model_kwargs["encoder_decoder_attn_mask"] = model_kwargs["encoder_attn_mask"]
model_kwargs["encoder_outputs"] = self(**model_kwargs, only_encoder=only_encoder)
model_kwargs.pop(model_input_name)
return model_kwargs
def _prepare_decoder_input_ids_for_generation(
self,
batch_size: int,
decoder_start_token_id: int = None,
bos_token_id: int = None,
model_kwargs=None,
):
if model_kwargs is not None and "decoder_input_ids" in model_kwargs:
return model_kwargs.pop("decoder_input_ids")
else:
decoder_start_token_id = (
decoder_start_token_id
if decoder_start_token_id
else self.cfg.decoder_start_token_id
)
return (
flow.ones(
(batch_size, 1),
dtype=flow.long,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=flow.placement("cuda", list(range(dist.get_world_size()))),
)
* decoder_start_token_id
)
def _get_decoder_start_token_id(
self, decoder_start_token_id: int = None, bos_token_id: int = None
):
if decoder_start_token_id is not None:
return decoder_start_token_id
elif self.cfg.is_encoder_decoder:
return self.cfg.decoder_start_token_id
elif bos_token_id is not None:
return bos_token_id
else:
return self.cfg.bos_token_idx
@staticmethod
def _expand_inputs_for_generation(
input_ids: flow.Tensor,
expand_size: int = 1,
is_encoder_decoder: bool = False,
attention_mask: Optional[flow.Tensor] = None,
encoder_outputs: Optional[flow.Tensor] = None,
**model_kwargs,
):
expanded_return_idx = (
flow.arange(input_ids.shape[0]).view(-1, 1).repeat(1, expand_size).view(-1)
)
expanded_return_idx = expanded_return_idx.to_global(
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=flow.placement("cuda", list(range(dist.get_world_size()))),
)
input_ids = input_ids.index_select(0, expanded_return_idx)
# token_type ids not supported.
if attention_mask is not None:
model_kwargs["attention_mask"] = attention_mask.index_select(0, expanded_return_idx)
if is_encoder_decoder:
if encoder_outputs is None:
raise ValueError(
"If `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined."
)
encoder_outputs = encoder_outputs.to_global(placement=expanded_return_idx.placement)
encoder_outputs = encoder_outputs.index_select(0, expanded_return_idx)
model_kwargs["encoder_outputs"] = encoder_outputs
model_kwargs["encoder_attn_mask"] = model_kwargs["encoder_attn_mask"].index_select(
0, expanded_return_idx
)
model_kwargs["encoder_decoder_attn_mask"] = model_kwargs["encoder_attn_mask"]
return input_ids, model_kwargs
def _update_model_kwargs_for_generation(
self, outputs, model_kwargs, is_encoder_decoder: bool = False
):
if "past_key_values" in outputs:
model_kwargs["past"] = outputs["past_key_values"]
elif "mems" in outputs:
model_kwargs["past"] = outputs["mems"]
elif "past_buckets_states" in outputs:
model_kwargs["past"] = outputs["past_buckets_states"]
elif self.past_key_values[-1] is not None:
model_kwargs["past"] = self.past_key_values
else:
model_kwargs["past"] = None
# update attention mask
if "attention_mask" in model_kwargs and not is_encoder_decoder:
attention_mask = model_kwargs["attention_mask"]
pad = flow.ones(
(attention_mask.shape[0], 1),
sbp=attention_mask.sbp,
placement=attention_mask.placement,
)
model_kwargs["attention_mask"] = flow.cat([attention_mask, pad], dim=-1)
if "decoder_attn_mask" in model_kwargs and is_encoder_decoder:
attention_mask = model_kwargs["decoder_attn_mask"]
pad = flow.ones(
(attention_mask.shape[0], 1),
sbp=attention_mask.sbp,
placement=attention_mask.placement,
)
model_kwargs["decoder_attn_mask"] = flow.cat([attention_mask, pad], dim=-1)
return model_kwargs
def _reorder_cache(self, past, beam_idx):
raise NotImplementedError(
"Make sure that a `_reorder_cache` function is correctly implemented in "
f"{self.__class__.__module__} to enable beam search for {self.__class__}"
)
def _get_logits_warper(
self,
top_k: Optional[int] = None,
top_p: Optional[float] = None,
typical_p: Optional[float] = None,
temperature: Optional[float] = None,
num_beams: Optional[int] = None,
renormalize_logits: Optional[bool] = None,
):
# instantiate warpers list
warpers = LogitsProcessorList()
# all samplers can be found in `generation_utils_samplers.py`
if temperature is not None and temperature != 1.0:
warpers.append(TemperatureLogitsWarper(temperature))
if top_k is not None and top_k != 0:
warpers.append(
TopKLogitsWarper(top_k=top_k, min_tokens_to_keep=(2 if num_beams > 1 else 1))
)
if top_p is not None and top_p < 1.0:
warpers.append(
TopPLogitsWarper(top_p=top_p, min_tokens_to_keep=(2 if num_beams > 1 else 1))
)
if typical_p is not None and typical_p < 1.0:
warpers.append(
TypicalLogitsWarper(mass=typical_p, min_tokens_to_keep=(2 if num_beams > 1 else 1))
)
# `LogitNormalization` should always be the last logit processor, when present
if renormalize_logits is True:
warpers.append(NormalizationLogitsProcessor())
return warpers
def _get_logits_processor(
self,
repetition_penalty: float,
no_repeat_ngram_size: int,
encoder_no_repeat_ngram_size: int,
input_ids_seq_length: int,
encoder_input_ids: flow.Tensor,
min_length: int,
max_length: int,
eos_token_id: int,
forced_bos_token_id: int,
forced_eos_token_id: int,
prefix_allowed_tokens_fn: Callable[[int, flow.Tensor], List[int]],
num_beams: int,
num_beam_groups: int,
diversity_penalty: float,
remove_invalid_values: bool,
exponential_decay_length_penalty: Tuple,
logits_processor: Optional[LogitsProcessorList],
renormalize_logits: Optional[bool],
):
"""
This class returns a [`LogitsProcessorList`] list object that contains all relevant
[`LogitsProcessor`] instances used to modify the scores of the language model head.
"""
processors = LogitsProcessorList()
# instantiate processors list
if diversity_penalty is not None and diversity_penalty > 0.0:
processors.append(
HammingDiversityLogitsProcessor(
diversity_penalty=diversity_penalty,
num_beams=num_beams,
num_beam_groups=num_beam_groups,
)
)
if repetition_penalty is not None and repetition_penalty != 1.0:
processors.append(RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty))
if no_repeat_ngram_size is not None and no_repeat_ngram_size > 0:
processors.append(NoRepeatNGramLogitsProcessor(no_repeat_ngram_size))
if encoder_no_repeat_ngram_size is not None and encoder_no_repeat_ngram_size > 0:
if self.cfg.is_encoder_decoder:
processors.append(
EncoderNoRepeatNGramLogitsProcessor(
encoder_no_repeat_ngram_size, encoder_input_ids
)
)
else:
raise ValueError(
"It's impossible to use `encoder_no_repeat_ngram_size` with decoder-only "
"architecture"
)
if min_length is not None and eos_token_id is not None and min_length > 0:
processors.append(MinLengthLogitsProcessor(min_length, eos_token_id))
if prefix_allowed_tokens_fn is not None:
processors.append(
PrefixConstrainedLogitsProcessor(
prefix_allowed_tokens_fn, num_beams // num_beam_groups
)
)
if forced_bos_token_id is not None:
processors.append(ForcedBOSTokenLogitsProcessor(forced_bos_token_id))
if forced_eos_token_id is not None:
processors.append(ForcedEOSTokenLogitsProcessor(max_length, forced_eos_token_id))
if remove_invalid_values is True:
processors.append(InfNanRemoveLogitsProcessor())
if exponential_decay_length_penalty is not None:
processors.append(
ExponentialDecayLengthPenalty(
exponential_decay_length_penalty, eos_token_id, input_ids_seq_length
)
)
processors = self._merge_criteria_processor_list(processors, logits_processor)
# `LogitNormalization` should always be the last logit processor, when present
if renormalize_logits is True:
processors.append(NormalizationLogitsProcessor())
return processors
def _get_stopping_criteria(
self,
max_length: Optional[int],
max_time: Optional[float],
stopping_criteria: Optional[StoppingCriteriaList],
):
criteria = StoppingCriteriaList()
if max_length is not None:
criteria.append(MaxLengthCriteria(max_length=max_length))
if max_time is not None:
criteria.append(MaxTimeCriteria(max_time=max_time))
criteria = self._merge_criteria_processor_list(criteria, stopping_criteria)
return criteria
def _merge_criteria_processor_list(self, default_list, custom_list):
if len(custom_list) == 0:
return default_list
for default in default_list:
for custom in custom_list:
if type(custom) is type(default):
raise ValueError("Criteria repetition error.")
default_list.extend(custom_list)
return default_list
def compute_transition_beam_scores(
self,
sequences: flow.Tensor,
scores: Tuple[flow.Tensor],
beam_indices: flow.Tensor,
eos_token_id: int = None,
):
scores = flow.stack(scores).reshape(len(scores), -1).transpose(0, 1)
beam_indices_mask = beam_indices < 0
max_beam_length = (1 - beam_indices_mask.long()).sum(-1).max()
beam_indices = beam_indices[:, :max_beam_length]
beam_indices_mask = beam_indices_mask[:, :max_beam_length]
beam_indices[beam_indices_mask] = 0
beam_sequence_indices = beam_indices * self.cfg.vocab_size
cut_idx = sequences.shape[-1] - max_beam_length
indices = sequences[:, cut_idx:] + beam_sequence_indices
transition_scores = scores.gather(0, indices)
transition_scores[beam_indices_mask] = 0
return transition_scores
def _validate_model_kwargs(self, model_kwargs):
if self.cfg.is_encoder_decoder:
for key in ["decoder_input_ids"]:
model_kwargs.pop(key, None)
unused_model_args = []
model_args = set(inspect.signature(self.prepare_inputs_for_generation).parameters)
if "kwargs" in model_args:
model_args |= set(inspect.signature(self.forward).parameters)
for key, value in model_kwargs.items():
if value is not None and key not in model_args:
unused_model_args.append(key)
if unused_model_args:
raise ValueError(
f"The following `model_kwargs` are not used by the model: {unused_model_args} "
"(note: typos in the generate arguments will also show up in this list)"
)
def greedy_search(
self,
input_ids: flow.Tensor,
logits_processor: Optional[LogitsProcessorList] = None,
stopping_criteria: Optional[StoppingCriteriaList] = None,
max_length: Optional[int] = None,
pad_token_id: Optional[int] = None,
eos_token_id: Optional[int] = None,
is_encoder_decoder: bool = False,
output_scores: bool = False,
**model_kwargs,
):
pad_token_id = pad_token_id if pad_token_id is not None else self.cfg.pad_token_id
eos_token_id = eos_token_id if eos_token_id is not None else self.cfg.eos_token_id
output_scores = output_scores if output_scores is not None else self.cfg.output_scores
scores = () if output_scores else None
logits_processor = (
logits_processor if logits_processor is not None else LogitsProcessorList()
)
stopping_criteria = (
stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
)
if max_length is not None:
warnings.warn(
"`max_length` is deprecated in this function, use MaxLengthCriteria" " instead.",
UserWarning,
)
stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
# keep track of which sequences are already finished
unfinished_sequences = flow.ones(input_ids.shape[0])
cur_len = input_ids.shape[-1]
while True:
# prepare model inputs
model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
# generate
outputs = self(**model_inputs)
next_token_logits = outputs["logits"][:, -1, :]
# logits_processor
next_token_scores = logits_processor(input_ids, next_token_logits)
# Store scores
if output_scores:
scores += (next_token_scores,)
# argmax
next_tokens = flow.argmax(next_token_scores, dim=-1)
next_tokens = next_tokens.to_global(placement=input_ids.placement)
unfinished_sequences = unfinished_sequences.to_global(
sbp=next_tokens.sbp, placement=next_tokens.placement
)
if eos_token_id is not None:
if pad_token_id is None:
raise ValueError(
"If `eos_token_id` is defined, make sure that `pad_token_id` is defined."
)
next_tokens = next_tokens * unfinished_sequences + pad_token_id * (
1 - unfinished_sequences
)
next_tokens = next_tokens.to(flow.long)
input_ids = flow.cat([input_ids, next_tokens[:, None]], dim=-1)
model_kwargs = self._update_model_kwargs_for_generation(
outputs, model_kwargs, is_encoder_decoder=is_encoder_decoder
)
cur_len = cur_len + 1
# if eos_token was found in one sentence, set sentence to finished
if eos_token_id is not None:
unfinished_sequences = flow.mul(
unfinished_sequences, (next_tokens != eos_token_id).long()
)
if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
break
# Release records
if "past_key_values" in self.__dir__():
self.past_key_values = [None] * self.cfg.hidden_layers
if "encoder_states" in self.__dir__():
self.encoder_states = None
return input_ids
def multinomial_sample(
self,
input_ids: flow.Tensor,
logits_processor: Optional[LogitsProcessorList] = None,
stopping_criteria: Optional[StoppingCriteriaList] = None,
logits_warper: Optional[LogitsProcessorList] = None,
max_length: Optional[int] = None,
pad_token_id: Optional[int] = None,
eos_token_id: Optional[int] = None,
is_encoder_decoder: bool = False,
output_scores: bool = False,
**model_kwargs,
):
# init values
pad_token_id = pad_token_id if pad_token_id is not None else self.cfg.pad_token_id
eos_token_id = eos_token_id if eos_token_id is not None else self.cfg.eos_token_id
output_scores = output_scores if output_scores is not None else self.cfg.output_scores
scores = () if output_scores else None
logits_processor = (
logits_processor if logits_processor is not None else LogitsProcessorList()
)
stopping_criteria = (
stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
)
if max_length is not None:
warnings.warn(
"`max_length` is deprecated in this function, use "
"`stopping_criteria=StoppingCriteriaList(MaxLengthCriteria(max_length=max_length))`"
"instead.",
UserWarning,
)
stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
logits_warper = logits_warper if logits_warper is not None else LogitsProcessorList()
unfinished_sequences = flow.ones(input_ids.shape[0])
cur_len = input_ids.shape[-1]
while True:
# prepare model inputs
model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
# generate
outputs = self(**model_inputs)
next_token_logits = outputs["logits"][:, -1, :]
# pre-process distribution
next_token_scores = logits_processor(input_ids, next_token_logits)
next_token_scores = logits_warper(input_ids, next_token_scores)
# Store scores
if output_scores:
scores += (next_token_scores,)
# sample
probs = nn.functional.softmax(next_token_scores, dim=-1)
probs = probs.to_global(
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=flow.placement("cuda", list(range(dist.get_world_size()))),
).to_local()
next_tokens = flow.multinomial(probs, num_samples=1).squeeze(1)
next_tokens = next_tokens.to_global(
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=flow.placement("cuda", list(range(dist.get_world_size()))),
)
unfinished_sequences = unfinished_sequences.to_global(
sbp=next_tokens.sbp, placement=next_tokens.placement
)
if eos_token_id is not None:
if pad_token_id is None:
raise ValueError(
"If `eos_token_id` is defined, make sure that `pad_token_id` is defined."
)
next_tokens = next_tokens * unfinished_sequences + pad_token_id * (
1 - unfinished_sequences
)
next_tokens = next_tokens.to(flow.long)
input_ids = flow.cat([input_ids, next_tokens[:, None]], dim=-1)
model_kwargs = self._update_model_kwargs_for_generation(
outputs, model_kwargs, is_encoder_decoder=is_encoder_decoder
)
cur_len = cur_len + 1
if eos_token_id is not None:
unfinished_sequences = flow.mul(
unfinished_sequences, (next_tokens != eos_token_id).long()
)
if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores):
break
# Release records
if "past_key_values" in self.__dir__():
self.past_key_values = [None] * self.cfg.hidden_layers
if "encoder_states" in self.__dir__():
self.encoder_states = None
return input_ids
def beam_search(
self,
input_ids: flow.Tensor,
beam_scorer: BeamScorer,
logits_processor: Optional[LogitsProcessorList] = None,
stopping_criteria: Optional[StoppingCriteriaList] = None,
max_length: Optional[int] = None,
pad_token_id: Optional[int] = None,
eos_token_id: Optional[int] = None,
is_encoder_decoder: bool = False,
output_scores: bool = False,
**model_kwargs,
):
pad_token_id = pad_token_id if pad_token_id is not None else self.cfg.pad_token_id
eos_token_id = eos_token_id if eos_token_id is not None else self.cfg.eos_token_id
output_scores = output_scores if output_scores is not None else self.cfg.output_scores
scores = () if output_scores else None
logits_processor = (
logits_processor if logits_processor is not None else LogitsProcessorList()
)
stopping_criteria = (
stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
)
if max_length is not None:
warnings.warn(
"`max_length` is deprecated in this function, use "
"`stopping_criteria=StoppingCriteriaList(MaxLengthCriteria(max_length=max_length))`"
"instead.",
UserWarning,
)
stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
if len(stopping_criteria) == 0:
warnings.warn(
"You don't have defined any stopping_criteria, this will likely loop forever",
UserWarning,
)
batch_size = len(beam_scorer._beam_hyps)
num_beams = beam_scorer.num_beams
batch_beam_size, cur_len = input_ids.shape
if num_beams * batch_size != batch_beam_size:
raise ValueError(
f"Batch dimension of `input_ids` should be {num_beams * batch_size}, "
f"but is {batch_beam_size}."
)
beam_indices = None
beam_scores = flow.zeros(
(batch_size, num_beams),
dtype=flow.float,
sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
placement=flow.placement("cuda", list(range(dist.get_world_size()))),
)
beam_scores[:, 1:] = -1e9
beam_scores = beam_scores.view((batch_size * num_beams,))
while True:
# prepare model inputs
model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
outputs = self(**model_inputs)
next_token_logits = outputs["logits"][:, -1, :]
next_token_scores = nn.functional.log_softmax(
next_token_logits, dim=-1
) # (batch_size * num_beams, vocab_size)
next_token_scores = next_token_scores.to_global(
sbp=input_ids.sbp, placement=input_ids.placement
)
next_token_scores_processed = logits_processor(input_ids, next_token_scores)
next_token_scores = next_token_scores_processed + beam_scores[:, None].expand_as(
next_token_scores
)
# Store scores
if output_scores:
scores += (next_token_scores,)
# reshape for beam search
vocab_size = next_token_scores.shape[-1]
next_token_scores = next_token_scores.view(batch_size, num_beams * vocab_size)
next_token_scores, next_tokens = flow.topk(
next_token_scores, 2 * num_beams, dim=1, largest=True, sorted=True
)
next_indices = next_tokens // vocab_size
next_tokens = next_tokens % vocab_size
beam_outputs = beam_scorer.process(
input_ids,
next_token_scores,
next_tokens,
next_indices,
pad_token_id=pad_token_id,
eos_token_id=eos_token_id,
beam_indices=beam_indices,
)
beam_scores = beam_outputs["next_beam_scores"]
beam_next_tokens = beam_outputs["next_beam_tokens"]
beam_idx = beam_outputs["next_beam_indices"]
input_ids = flow.cat([input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1)
model_kwargs = self._update_model_kwargs_for_generation(
outputs, model_kwargs, is_encoder_decoder=is_encoder_decoder
)
# update past_key_value
if model_kwargs["past"] is not None:
model_kwargs["past"] = self._reorder_cache(beam_idx)
# increase cur_len
cur_len = cur_len + 1
if beam_scorer.is_done or stopping_criteria(input_ids, scores):
break
sequence_outputs = beam_scorer.finalize(
input_ids,
beam_scores,
next_tokens,
next_indices,
pad_token_id=pad_token_id,
eos_token_id=eos_token_id,
max_length=stopping_criteria.max_length,
beam_indices=beam_indices,
)
# Release records
if "past_key_values" in self.__dir__():
self.past_key_values = [None] * self.cfg.hidden_layers
if "encoder_states" in self.__dir__():
self.encoder_states = None
return sequence_outputs["sequences"]
@flow.no_grad()
def generate(
self,
inputs: Optional[flow.Tensor] = None,
max_length: Optional[int] = None,
min_length: Optional[int] = None,
do_sample: Optional[bool] = None,
early_stopping: Optional[bool] = None,
num_beams: Optional[int] = None,
temperature: Optional[float] = None,
top_k: Optional[int] = None,
top_p: Optional[float] = None,
typical_p: Optional[float] = None,
repetition_penalty: Optional[float] = None,
force_words_ids: Optional[Union[Iterable[int], Iterable[Iterable[int]]]] = None,
bos_token_id: Optional[int] = None,
pad_token_id: Optional[int] = None,
eos_token_id: Optional[int] = None,
length_penalty: Optional[float] = None,
no_repeat_ngram_size: Optional[int] = None,
encoder_no_repeat_ngram_size: Optional[int] = None,
num_return_sequences: Optional[int] = None,
max_time: Optional[float] = None,
max_new_tokens: Optional[int] = None,
decoder_start_token_id: Optional[int] = None,
use_cache: Optional[bool] = None,
num_beam_groups: Optional[int] = None,
diversity_penalty: Optional[float] = None,
prefix_allowed_tokens_fn: Optional[Callable[[int, flow.Tensor], List[int]]] = None,
logits_processor: Optional[LogitsProcessorList] = LogitsProcessorList(),
renormalize_logits: Optional[bool] = None,
stopping_criteria=StoppingCriteriaList(),
constraints=None,
output_scores: Optional[bool] = None,
forced_bos_token_id: Optional[int] = None,
forced_eos_token_id: Optional[int] = None,
remove_invalid_values: Optional[bool] = None,
exponential_decay_length_penalty: Optional[Tuple[Union[int, float]]] = None,
**model_kwargs,
):
# 0. Validate model kwargs
self._validate_model_kwargs(model_kwargs.copy())
# 1. Set generation parameters if not already defined
bos_token_id = bos_token_id if bos_token_id is not None else self.cfg.bos_token_id
num_beams = num_beams if num_beams is not None else self.cfg.num_beams
length_penalty = length_penalty if length_penalty is not None else self.cfg.length_penalty
early_stopping = early_stopping if early_stopping is not None else self.cfg.early_stopping
num_beam_groups = (
num_beam_groups if num_beam_groups is not None else self.cfg.num_beam_groups
)
do_sample = do_sample if do_sample is not None else self.cfg.do_sample
num_return_sequences = (
num_return_sequences
if num_return_sequences is not None
else self.cfg.num_return_sequences
)
pad_token_id = pad_token_id if pad_token_id is not None else self.cfg.pad_token_id
eos_token_id = eos_token_id if eos_token_id is not None else self.cfg.eos_token_id
output_scores = output_scores if output_scores is not None else self.cfg.output_scores
# 2. Prepare model inputs
inputs_tensor, model_input_name, model_kwargs = self._prepare_model_inputs(
inputs, bos_token_id, model_kwargs
)
batch_size = inputs_tensor.shape[0]
# 3. Prepare other model kwargs
model_kwargs["use_cache"] = use_cache if use_cache is not None else self.cfg.use_cache
if self.cfg.is_encoder_decoder:
att_mask_name = "encoder_attn_mask"
accepts_attention_mask = att_mask_name in set(
inspect.signature(self.forward).parameters.keys()
)
else:
att_mask_name = "attention_mask"
accepts_attention_mask = att_mask_name in set(
inspect.signature(self.forward).parameters.keys()
)
requires_attention_mask = "encoder_outputs" not in model_kwargs
if (
model_kwargs.get(att_mask_name, None) is None
and requires_attention_mask
and accepts_attention_mask
):
model_kwargs[att_mask_name] = self._prepare_attention_mask_for_generation(
inputs_tensor, pad_token_id, eos_token_id
)
if self.cfg.is_encoder_decoder and "encoder_outputs" not in model_kwargs:
# if model is encoder decoder encoder_outputs are created
# and added to `model_kwargs`
model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation(
inputs_tensor, model_kwargs, model_input_name
)
# 4. Prepare `input_ids` which will be used for auto-regressive generation
if self.cfg.is_encoder_decoder:
input_ids = self._prepare_decoder_input_ids_for_generation(
batch_size,
decoder_start_token_id=decoder_start_token_id,
bos_token_id=bos_token_id,
model_kwargs=model_kwargs,
)
else:
# if decoder-only then inputs_tensor has to be `input_ids`
input_ids = inputs_tensor
# 5. Prepare `max_length` depending on other stopping criteria.
input_ids_seq_length = input_ids.shape[-1]
if max_length is None and max_new_tokens is None:
if dist.is_main_process():
warnings.warn(
"Neither `max_length` nor `max_new_tokens` has been set, `max_length` will "
f"default to {self.cfg.max_length} (`self.cfg.max_length`). we recommend using"
" `max_new_tokens` to control the maximum length of the generation.",
UserWarning,
)
elif max_length is None and max_new_tokens is not None:
max_length = max_new_tokens + input_ids_seq_length
elif max_length is not None and max_new_tokens is not None:
raise ValueError(
"Both `max_new_tokens` and `max_length` have been set but they serve the same"
)
# default to cfg if still None
max_length = max_length if max_length is not None else self.cfg.max_length
min_length = min_length if min_length is not None else self.cfg.min_length
if min_length is not None and min_length > max_length:
raise ValueError(
f"Unfeasable length constraints: the minimum length ({min_length}) is larger than"
f"the maximum length ({max_length})"
)
if input_ids_seq_length >= max_length:
input_ids_string = "decoder_input_ids" if self.cfg.is_encoder_decoder else "input_ids"
logger.warning(
f"Input length of {input_ids_string} is {input_ids_seq_length}, but `max_length` is"
f" set to {max_length}. This can lead to unexpected behavior. You should consider "
"increasing `max_new_tokens`."
)
# 6. Determine generation mode
is_constraint_gen_mode = constraints is not None or force_words_ids is not None
is_greedy_gen_mode = (
(num_beams == 1)
and (num_beam_groups == 1)
and do_sample is False
and not is_constraint_gen_mode
)
is_sample_gen_mode = (
(num_beams == 1)
and (num_beam_groups == 1)
and do_sample is True
and not is_constraint_gen_mode
)
is_beam_gen_mode = (
(num_beams > 1)
and (num_beam_groups == 1)
and do_sample is False
and not is_constraint_gen_mode
)
# is_beam_sample_gen_mode = (
# (num_beams > 1)
# and (num_beam_groups == 1)
# and do_sample is True
# and not is_constraint_gen_mode
# )
is_group_beam_gen_mode = (
(num_beams > 1) and (num_beam_groups > 1) and not is_constraint_gen_mode
)
if num_beam_groups > num_beams:
raise ValueError("`num_beam_groups` has to be smaller or equal to `num_beams`")
if is_group_beam_gen_mode and do_sample is True:
raise ValueError(
"Diverse beam search cannot be used in sampling mode. Make sure that `do_sample` is"
" set to `False`."
)
# 7. Prepare distribution pre_processing samplers
logits_processor = self._get_logits_processor(
repetition_penalty=repetition_penalty,
no_repeat_ngram_size=no_repeat_ngram_size,
encoder_no_repeat_ngram_size=encoder_no_repeat_ngram_size,
input_ids_seq_length=input_ids_seq_length,
encoder_input_ids=inputs_tensor,
min_length=min_length,
max_length=max_length,
eos_token_id=eos_token_id,
forced_bos_token_id=forced_bos_token_id,
forced_eos_token_id=forced_eos_token_id,
prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
num_beams=num_beams,
num_beam_groups=num_beam_groups,
diversity_penalty=diversity_penalty,
remove_invalid_values=remove_invalid_values,
exponential_decay_length_penalty=exponential_decay_length_penalty,
logits_processor=logits_processor,
renormalize_logits=renormalize_logits,
)
# 8. Prepare stopping criteria
stopping_criteria = self._get_stopping_criteria(
max_length=max_length, max_time=max_time, stopping_criteria=stopping_criteria
)
# 9. Go into different generation modes
if is_greedy_gen_mode:
if num_return_sequences > 1:
raise ValueError(
f"num_return_sequences has to be 1, but is {num_return_sequences} when doing"
" greedy search."
)
# 10. Run greedy search
return self.greedy_search(
input_ids,
logits_processor=logits_processor,
stopping_criteria=stopping_criteria,
pad_token_id=pad_token_id,
eos_token_id=eos_token_id,
output_scores=output_scores,
**model_kwargs,
)
elif is_sample_gen_mode:
# 10. Prepare logits warper
logits_warper = self._get_logits_warper(
top_k=top_k,
top_p=top_p,
typical_p=typical_p,
temperature=temperature,
num_beams=num_beams,
renormalize_logits=renormalize_logits,
)
# 11. Expand input_ids with `num_return_sequences` additional sequences per batch
input_ids, model_kwargs = self._expand_inputs_for_generation(
input_ids,
expand_size=num_return_sequences,
is_encoder_decoder=self.cfg.is_encoder_decoder,
**model_kwargs,
)
# 12. Run multinomial sample
return self.multinomial_sample(
input_ids,
logits_processor=logits_processor,
logits_warper=logits_warper,
stopping_criteria=stopping_criteria,
pad_token_id=pad_token_id,
eos_token_id=eos_token_id,
output_scores=output_scores,
**model_kwargs,
)
elif is_beam_gen_mode:
if num_return_sequences > num_beams:
raise ValueError(
"`num_return_sequences` has to be smaller or equal to `num_beams`."
)
if stopping_criteria.max_length is None:
raise ValueError("`max_length` needs to be a stopping_criteria for now.")
# 10. Prepare beam search scorer
beam_scorer = BeamSearchScorer(
batch_size=batch_size,
num_beams=num_beams,
length_penalty=length_penalty,
do_early_stopping=early_stopping,
num_beam_hyps_to_keep=num_return_sequences,
)
# 11. Interleave input_ids with `num_beams` additional sequences per batch
input_ids, model_kwargs = self._expand_inputs_for_generation(
input_ids,
expand_size=num_beams,
is_encoder_decoder=self.cfg.is_encoder_decoder,
**model_kwargs,
)
# 12. Run beam search
return self.beam_search(
input_ids,
beam_scorer,
logits_processor=logits_processor,
stopping_criteria=stopping_criteria,
pad_token_id=pad_token_id,
eos_token_id=eos_token_id,
output_scores=output_scores,
**model_kwargs,
)
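# Minimal usage sketch: in LiBai, `Generator` is meant to be mixed into a model
# class that provides `forward`, `self.cfg`, and `prepare_inputs_for_generation`,
# after which callers invoke `model.generate(input_ids, max_new_tokens=..., do_sample=...)`.
# The standalone snippet below only exercises a helper that does not need a model.
if __name__ == "__main__":
    gen = Generator()
    criteria = gen._get_stopping_criteria(
        max_length=16, max_time=None, stopping_criteria=StoppingCriteriaList()
    )
    print(criteria.max_length)  # 16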
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import oneflow as flow
from PIL import Image
from libai.config import instantiate
from libai.data.structures import DistTensorData, Instance
from libai.inference.basic import BasePipeline
class ImageClassificationPipeline(BasePipeline):
def __init__(
self,
config_file,
data_parallel=None,
tensor_parallel=None,
pipeline_parallel=None,
pipeline_stage_id=None,
pipeline_num_layers=None,
model_path=None,
mode="libai",
**kwargs,
):
super().__init__(
config_file,
data_parallel,
tensor_parallel,
pipeline_parallel,
pipeline_stage_id,
pipeline_num_layers,
model_path,
mode,
**kwargs,
)
if "num_classes" in self.cfg.model:
self.num_classes = self.cfg.model.num_classes
elif "num_classes" in self.cfg.model.cfg:
self.num_classes = self.cfg.model.cfg.num_classes
else:
raise AttributeError("The model's config must contain num_classes")
label2id = self.label2id(self.num_classes)
self.id2label = {ind: label for label, ind in label2id.items()}
self.transform = instantiate(self.cfg.dataloader.test[0].dataset.transform)
def _parse_parameters(self, **pipeline_parameters):
preprocess_params = {}
forward_params = {}
postprocess_params = {**pipeline_parameters}
return preprocess_params, forward_params, postprocess_params
def preprocess(
self,
inputs,
**kwargs,
) -> dict:
assert os.path.exists(inputs), "inputs must be an existing image path!"
with open(inputs, "rb") as f:
img = Image.open(f).convert("RGB")
img = self.transform(img)
img = img.unsqueeze(0)
# to global tensor
model_input = Instance(
images=DistTensorData(img),
)
        model_input_dict = {}
        for key, value in model_input.get_fields().items():
            value.to_global()
            model_input_dict[key] = value.tensor
        return model_input_dict
    def forward(self, model_input_dict) -> dict:
        model_outputs_dict = self.model(**model_input_dict)
        return model_outputs_dict
def postprocess(
self, model_outputs_dict, function_to_apply=None, return_all_scores=False, **kwargs
) -> dict:
# prepare
num_labels = self.num_classes
if function_to_apply is not None:
function_to_apply = function_to_apply.lower()
assert function_to_apply in [
"sigmoid",
"softmax",
"none",
], f"Unrecognized `function_to_apply` argument: {function_to_apply}"
else:
if num_labels == 1:
function_to_apply = "sigmoid"
elif num_labels > 1:
function_to_apply = "softmax"
# process, logits: [num_labels]
logits = model_outputs_dict["prediction_scores"][0]
if function_to_apply == "sigmoid":
scores = flow.sigmoid(logits)
elif function_to_apply == "softmax":
scores = flow.softmax(logits)
else:
scores = logits
scores = scores.detach().numpy()
if return_all_scores:
return [
{"label": self.id2label[i], "score": score.item()} for i, score in enumerate(scores)
]
else:
return {
"label": self.id2label[scores.argmax().item()],
"score": scores.max().item(),
}
def label2id(self, num_classes):
"""
Args:
num_classes (int): the number of total classes
Returns:
labels (list): a dict contains all the labels for inference,
each item should be the form as follows:
{
"tench": 0,
"tiger": 1,
"xxx", n,
}
"""
from libai.inference.utils.imagenet_class import IMAGENET_LABELS as labels
assert num_classes == len(labels), "number of labels must be equal to num_classes"
return {label: i for (i, label) in enumerate(labels)}
if __name__ == "__main__":
pipeline = ImageClassificationPipeline("/home/chengpeng/config.yaml", 1, 1, 1)
print(pipeline("data_test/inference_test_data/ILSVRC2012_val_00000293.JPEG"))
# coding=utf-8
# Copyright 2021 The OneFlow Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import oneflow as flow
from libai.data.structures import DistTensorData, Instance
from libai.inference.basic import BasePipeline
class TextClassificationPipeline(BasePipeline):
def __init__(
self,
config_file,
data_parallel=None,
tensor_parallel=None,
pipeline_parallel=None,
pipeline_stage_id=None,
pipeline_num_layers=None,
model_path=None,
mode="libai",
**kwargs,
):
super().__init__(
config_file,
data_parallel,
tensor_parallel,
pipeline_parallel,
pipeline_stage_id,
            pipeline_num_layers,
            model_path,
mode,
**kwargs,
)
def update_cfg(
self,
data_parallel=1,
tensor_parallel=1,
pipeline_parallel=1,
pipeline_stage_id=None,
pipeline_num_layers=None,
):
super().update_cfg(
data_parallel,
tensor_parallel,
pipeline_parallel,
pipeline_stage_id,
pipeline_num_layers,
)
self.cfg.model.cfg.hidden_dropout_prob = 0.0
self.cfg.model.cfg.attention_probs_dropout_prob = 0.0
assert "num_labels" in self.cfg.model.cfg, "The model's config must contain num_labels"
if "label2id" not in self.cfg.model.cfg:
label2id = {"Label_" + str(i): i for i in range(self.cfg.model.cfg.num_labels)}
id2label = {ind: label for label, ind in label2id.items()}
self.cfg.model.cfg["label2id"] = label2id
self.cfg.model.cfg["id2label"] = id2label
def _parse_parameters(self, **pipeline_parameters):
preprocess_params = {}
forward_params = {}
postprocess_params = {**pipeline_parameters}
return preprocess_params, forward_params, postprocess_params
def preprocess(
self,
inputs,
pad: bool = False,
**kwargs,
) -> dict:
# tokenizer encoder
input_ids = flow.tensor(np.array(self.tokenizer.encode(inputs)))
padding_mask = flow.tensor(np.ones(input_ids.shape), dtype=flow.bool)
# set batch size = 1
input_ids = input_ids.unsqueeze(0)
padding_mask = padding_mask.unsqueeze(0)
# to global tensor
model_input = Instance(
input_ids=DistTensorData(input_ids),
attention_mask=DistTensorData(padding_mask),
)
        model_input_dict = {}
        for key, value in model_input.get_fields().items():
            value.to_global()
            model_input_dict[key] = value.tensor
        return model_input_dict
    def forward(self, model_input_dict) -> dict:
        model_outputs_dict = self.model(**model_input_dict)
        return model_outputs_dict
def postprocess(
self, model_outputs_dict, function_to_apply=None, return_all_scores=False, **kwargs
) -> dict:
# prepare
num_labels = self.cfg.model.cfg.num_labels
if function_to_apply is not None:
function_to_apply = function_to_apply.lower()
assert function_to_apply in [
"sigmoid",
"softmax",
"none",
], f"Unrecognized `function_to_apply` argument: {function_to_apply}"
else:
if num_labels == 1:
function_to_apply = "sigmoid"
elif num_labels > 1:
function_to_apply = "softmax"
# process, logits: [num_labels]
logits = model_outputs_dict["logits"][0]
if function_to_apply == "sigmoid":
scores = flow.sigmoid(logits)
elif function_to_apply == "softmax":
scores = flow.softmax(logits)
else:
scores = logits
scores = scores.detach().numpy()
if return_all_scores:
return [
{"label": self.cfg.model.cfg.id2label[i], "score": score.item()}
for i, score in enumerate(scores)
]
else:
return {
"label": self.cfg.model.cfg.id2label[scores.argmax().item()],
"score": scores.max().item(),
}
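# Minimal usage sketch, mirroring the `__main__` example in the image
# classification pipeline above; the config path and the input sentence are
# placeholders, not files or data shipped with this repository.
if __name__ == "__main__":
    pipeline = TextClassificationPipeline("path/to/config.yaml", 1, 1, 1)
    print(pipeline("this is a test sentence"))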