# Commit 581b8d15 authored by liangjing
# Browse files
#
# version 1
#
# parents
# Pipeline #169 failed with stages
# in 0 seconds
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import io
import json
import numpy as np
import os
import warnings
class Vocab(object):
    """
    The class used to convert between tokens and ids. It also includes some
    store/load functions.

    Args:
        counter (collections.Counter, optional): A Counter instance that
            describes the tokens and their frequencies. Its keys will be
            indexed according to the order of frequency sorting to construct
            the mapping relationship. If None, `token_to_idx` must be provided
            as the mapping relationship. Default: None.
        max_size (int, optional): Max size of vocab, not including special
            tokens. Default: None.
        min_freq (int, optional): Ignore tokens whose frequencies are less
            than `min_freq`. Default: 1.
        token_to_idx (dict, optional): A dict that specifies the mapping
            relationship between tokens and indices to be used. If provided,
            adjust the tokens and indices mapping according to it. If None,
            counter must be provided. Default: None.
        unk_token (str, optional): Special token for unknown tokens. If no
            need, it also could be None. Default: None.
        pad_token (str, optional): Special token for padding. If no need, it
            also could be None. Default: None.
        bos_token (str, optional): Special token for beginning of sequence.
            If no need, it also could be None. Default: None.
        eos_token (str, optional): Special token for end of sequence. If no
            need, it also could be None. Default: None.
        kwargs (dict): Keyword arguments ending with `_token`. It can be used
            to specify further special tokens that will be exposed as an
            attribute of the vocabulary and associated with an index.
    """

    def __init__(self,
                 counter=None,
                 max_size=None,
                 min_freq=1,
                 token_to_idx=None,
                 unk_token=None,
                 pad_token=None,
                 bos_token=None,
                 eos_token=None,
                 **kwargs):
        # Handle special tokens: fold the four named special tokens into
        # `kwargs` so they are processed uniformly together with any
        # user-supplied `*_token` keyword arguments.
        combs = (('unk_token', unk_token), ('pad_token', pad_token),
                 ('bos_token', bos_token), ('eos_token', eos_token))
        for name, value in combs:
            kwargs[name] = value
        special_tokens = []
        special_iter = kwargs.keys()
        # sort alphabetically so the indices assigned to special tokens are
        # deterministic regardless of keyword-argument order
        special_iter = sorted(special_iter)
        for special_token_name in special_iter:
            # Test if kwarg specifies a special token
            if not special_token_name.endswith('_token'):
                raise ValueError('{} is invalid. Only keyword arguments '
                                 'that end in \'_token\' are supported '
                                 'to declare special tokens.'.format(
                                     special_token_name))
            special_token = kwargs[special_token_name]
            if special_token is not None and special_token not in special_tokens:
                special_tokens.append(special_token)
        if counter is None:
            # use token_to_idx as dict to import pretrained vocabulary
            assert token_to_idx, (
                'token_to_idx should not be None when counter is None')
            for special_token in special_tokens:
                assert special_token in token_to_idx, '{} is not in token_to_idx'.format(
                    special_token)
            self._token_to_idx = token_to_idx
            self._idx_to_token = {
                idx: token
                for token, idx in token_to_idx.items()
            }
            if unk_token:
                # Lookups of out-of-vocabulary tokens silently fall back to
                # the index of `unk_token` via the defaultdict factory.
                unk_index = self._token_to_idx[unk_token]
                self._token_to_idx = collections.defaultdict(lambda: unk_index)
                self._token_to_idx.update(token_to_idx)
        else:
            # Special tokens occupy the lowest indices (0, 1, ...), in the
            # alphabetical order of their keyword names.
            self._idx_to_token = {
                idx: special_token
                for idx, special_token in enumerate(special_tokens)
            }
            # defaultdict with no factory behaves like a plain dict until a
            # default_factory is installed below (when unk_token is set).
            self._token_to_idx = collections.defaultdict()
            self._token_to_idx.update(
                (token, idx) for idx, token in self._idx_to_token.items())
            self._index_counter_keys(counter, special_tokens, max_size,
                                     min_freq)
            if token_to_idx:
                self._sort_index_according_to_user_specification(token_to_idx)
            if unk_token:
                self._token_to_idx.default_factory = lambda: self._token_to_idx[unk_token]
        # _expose_tokens_as_attributes: make e.g. `vocab.unk_token` readable.
        self._identifiers_to_tokens = kwargs
        for identifier, token in kwargs.items():
            if identifier.startswith('_'):
                raise ValueError(
                    'It is not allowed to use identifiers starting with '
                    'underscore. In Python identifier names beginning with '
                    'underscore are internal.')
            if hasattr(self, identifier):
                raise ValueError(
                    'vocab.{} already exists. '
                    'Please choose a different identifier for token {}'.format(
                        identifier, token))
            setattr(self, identifier, token)

    def _index_counter_keys(self, counter, special_tokens, max_size, min_freq):
        """Append `counter`'s tokens (special tokens excluded) to the index,
        most frequent first, respecting `max_size` and `min_freq`."""
        # sort by frequency, then alphabetically (the stable second sort
        # preserves the alphabetical order within equal frequencies)
        token_freqs = sorted(counter.items(), key=lambda x: x[0])
        token_freqs.sort(key=lambda x: x[1], reverse=True)
        # frequencies of special tokens are not counted when building vocabulary
        # in frequency order
        special_tokens = set(special_tokens)
        max_size = None if max_size is None else max_size + len(special_tokens)
        for token, freq in token_freqs:
            if freq < min_freq or len(self._idx_to_token) == max_size:
                break
            if token not in special_tokens:
                # Next free index is one past the current maximum (-1 keeps
                # this well-defined when the vocabulary is still empty).
                self._idx_to_token[max(list(self._idx_to_token.keys()) + [-1]) +
                                   1] = token
                self._token_to_idx[token] = max(self._idx_to_token.keys())

    def _sort_index_according_to_user_specification(self, token_to_idx):
        """Reassign indices so each token in `token_to_idx` gets exactly the
        user-specified index, swapping with the token that held that index."""
        # Sanity checks
        if not set(token_to_idx.keys()).issubset(self.token_to_idx.keys()):
            raise ValueError(
                'User-specified token_to_idx mapping can only contain '
                'tokens that will be part of the vocabulary.')
        if len(set(token_to_idx.values())) != len(token_to_idx):
            raise ValueError(
                'User-specified indices must not contain duplicates.')
        if min(token_to_idx.values()) < 0 or max(token_to_idx.values()) >= len(
                self.token_to_idx):
            raise ValueError(
                'User-specified indices must not be < 0 or >= the number of tokens '
                'that will be in the vocabulary. The current vocab contains {}'
                'tokens.'.format(len(self.token_to_idx)))
        # Update index ordering: swap each token with the one currently
        # occupying its requested index, keeping both mappings consistent.
        for token, new_idx in token_to_idx.items():
            old_idx = self.token_to_idx[token]
            ousted_token = self.idx_to_token[new_idx]
            self.token_to_idx[token] = new_idx
            self.token_to_idx[ousted_token] = old_idx
            self.idx_to_token[old_idx] = ousted_token
            self.idx_to_token[new_idx] = token

    def to_tokens(self, indices):
        """
        Maps the input indices to a token list.

        Args:
            indices (int|list[int]|tuple[int]|numpy.ndarray): The input
                index or indices for mapping. Must be an `int` or 1D
                `list[int]`|`tuple[int]`|`numpy.ndarray`.

        Returns:
            str|list[str]: Obtained token(s). If `indices` is an integer, it
            will return a str. If `indices` is a list/tuple of integers, it
            will return a list of str.

        Example:
            .. code-block:: python

                from paddlenlp.data import Vocab
                # The vocab file. The sample file can be downloaded firstly.
                # wget https://paddlenlp.bj.bcebos.com/data/senta_word_dict.txt
                vocab_file_path = './senta_word_dict.txt'
                # Initialize the Vocab
                vocab = Vocab.load_vocabulary(
                    vocab_file_path,
                    unk_token='[UNK]',
                    pad_token='[PAD]')
                tokens = vocab.to_tokens([0, 1, 2, 3])
                print(tokens)
                # ['[PAD]', '[UNK]', '一斤三', '意面屋']
        """
        to_reduce = False
        if not isinstance(indices, (list, tuple, np.ndarray)):
            indices = [indices]
            # Remember to unwrap the single result on return.
            to_reduce = True
        if isinstance(indices, (list, tuple)):
            indices = np.asarray(indices)
        if isinstance(indices, (np.ndarray)) and len(indices.shape) > 1:
            raise ValueError(
                'Token indices is invalid. Expected 1D array, but received {}D array. '.
                format(len(indices.shape)))
        tokens = []
        for idx in indices:
            if not isinstance(idx, (int, np.integer)):
                warnings.warn(
                    "The type of `to_tokens()`'s input `indices` is not `int` which will be forcibly transfered to `int`. "
                )
                idx = int(idx)
            try:
                tokens.append(self._idx_to_token[idx])
            except KeyError:
                raise ValueError(
                    'Token index {} in the provided `indices` is invalid.'.
                    format(idx))
        return tokens[0] if to_reduce else tokens

    def to_indices(self, tokens):
        """
        Maps the input tokens into indices.

        Args:
            tokens (str|list[str]|tuple[str], optional): The input token(s)
                for mapping.

        Returns:
            int|list[int]: Obtained index or indices. If `tokens` is a str,
            it will return an integer. If `tokens` is a list/tuple of str,
            it will return a list of integers.

        Example:
            .. code-block:: python

                from paddlenlp.data import Vocab
                # The vocab file. The sample file can be downloaded firstly.
                # wget https://paddlenlp.bj.bcebos.com/data/senta_word_dict.txt
                vocab_file_path = './senta_word_dict.txt'
                # Initialize the Vocab
                vocab = Vocab.load_vocabulary(
                    vocab_file_path,
                    unk_token='[UNK]',
                    pad_token='[PAD]')
                tokens = vocab.to_indices(['[PAD]', '[UNK]', '一斤三', '意面屋'])
                print(tokens)
                # [0, 1, 2, 3]
        """
        return self[tokens]

    def __getitem__(self, tokens):
        # A single token maps to one index; a list/tuple maps element-wise.
        # Missing tokens raise KeyError unless `unk_token` was configured,
        # in which case the defaultdict falls back to the unk index.
        if not isinstance(tokens, (list, tuple)):
            return self._token_to_idx[tokens]
        else:
            return [self._token_to_idx[token] for token in tokens]

    def __len__(self):
        # Vocabulary size, special tokens included.
        return len(self._idx_to_token)

    def __contains__(self, token):
        return token in self._token_to_idx

    def __call__(self, tokens):
        """
        Maps the input tokens into indices. Its function is the same as the
        :meth:`to_indices` method.

        See detail at `to_indices`.
        """
        return self[tokens]

    @property
    def idx_to_token(self):
        # Returns index-token dict
        return self._idx_to_token

    @property
    def token_to_idx(self):
        # Return token-index dict
        return self._token_to_idx

    def to_json(self, path=None):
        """
        Summarizes some information of vocab as a JSON string. If path is
        given, the JSON string will be saved into files. The JSON string and
        the saved file all can be used to reconstruct the :class:`Vocab` by
        calling :meth:`from_json` method.

        Args:
            path (str, optional): The path to save JSON string. If None, the
                JSON will not be saved. Default: None.

        Returns:
            str: The JSON string including information of vocab.

        Example:
            .. code-block:: python

                from paddlenlp.data import Vocab
                # The vocab file. The sample file can be downloaded firstly.
                # wget https://paddlenlp.bj.bcebos.com/data/senta_word_dict.txt
                vocab_file_path = './senta_word_dict.txt'
                # Initialize the Vocab
                vocab = Vocab.load_vocabulary(
                    vocab_file_path,
                    unk_token='[UNK]',
                    pad_token='[PAD]')
                json_str = vocab.to_json(path='./vocab.json')
        """
        vocab_dict = {}
        vocab_dict['idx_to_token'] = dict(self.idx_to_token)
        vocab_dict['token_to_idx'] = dict(self.token_to_idx)
        vocab_dict['unk_token'] = self.unk_token
        vocab_dict['identifiers_to_tokens'] = self._identifiers_to_tokens
        json_str = json.dumps(vocab_dict)
        if path:
            with io.open(path, 'w', encoding='utf-8') as f:
                f.write(json_str)
        return json_str

    @classmethod
    def from_json(cls, json_str):
        """
        Loads :class:`Vocab` from JSON string or JSON file, which is gotten by
        calling :meth:`to_json` method.

        Args:
            json_str (str): JSON string or file path of JSON string.

        Returns:
            Vocab: An instance of :class:`Vocab` generated from information
            contained in JSON string.

        Example:
            .. code-block:: python

                from paddlenlp.data import Vocab
                # The vocab file. The sample file can be downloaded firstly.
                # wget https://paddlenlp.bj.bcebos.com/data/senta_word_dict.txt
                vocab_file_path = './senta_word_dict.txt'
                # Initialize the Vocab
                vocab = Vocab.load_vocabulary(
                    vocab_file_path,
                    unk_token='[UNK]',
                    pad_token='[PAD]')
                json_str = vocab.to_json(path='./vocab.json')
                vocab1 = Vocab.from_json(json_str)
                vocab2 = Vocab.from_json('./vocab.json')
                print(len(vocab), len(vocab1), len(vocab2))
                # 1256608 1256608 1256608
        """
        # Accept either a path to a JSON file or a JSON string directly.
        if os.path.isfile(json_str):
            with io.open(json_str, 'r', encoding='utf-8') as f:
                vocab_dict = json.load(f)
        else:
            vocab_dict = json.loads(json_str)
        token_to_idx = vocab_dict.get('token_to_idx')
        unk_token = vocab_dict.get('unk_token')
        identifiers_to_tokens = vocab_dict.get('identifiers_to_tokens', dict())
        # `unk_token` is passed explicitly below; drop it from kwargs to
        # avoid a duplicate keyword argument.
        if 'unk_token' in identifiers_to_tokens:
            del identifiers_to_tokens['unk_token']
        vocab = cls(counter=None,
                    token_to_idx=token_to_idx,
                    unk_token=unk_token,
                    **identifiers_to_tokens)
        return vocab

    @classmethod
    def from_dict(cls,
                  token_to_idx,
                  unk_token=None,
                  pad_token=None,
                  bos_token=None,
                  eos_token=None,
                  **kwargs):
        """
        Builds the :class:`Vocab` from a dict.

        Args:
            token_to_idx (dict): A dict describing the mapping relationship
                between tokens and indices.
            unk_token (str, optional): The special token for unknown tokens.
                If no need, it also could be None. Default: None.
            pad_token (str, optional): The special token for padding. If
                no need, it also could be None. Default: None.
            bos_token (str, optional): The special token for bos. If no
                need, it also could be None. Default: None.
            eos_token (str, optional): The special token for eos. If no
                need, it also could be None. Default: None.
            kwargs (dict): Keyword arguments ending with `_token`. It can be
                used to specify further special tokens that will be exposed as
                attribute of the vocabulary and associated with an index.

        Returns:
            Vocab: An instance of :class:`Vocab` generated from the given dict
            and special tokens.

        Example:
            .. code-block:: python

                from paddlenlp.data import Vocab
                # The vocab file. The sample file can be downloaded firstly.
                # wget https://paddlenlp.bj.bcebos.com/data/senta_word_dict.txt
                vocab_file_path = './senta_word_dict.txt'
                # Initialize the Vocab
                vocab = Vocab.load_vocabulary(
                    vocab_file_path,
                    unk_token='[UNK]',
                    pad_token='[PAD]')
                vocab1 = Vocab.from_dict(vocab.token_to_idx)
                print(len(vocab), len(vocab.token_to_idx), len(vocab1))
                # 1256608 1256608 1256608
        """
        vocab = cls(counter=None,
                    token_to_idx=token_to_idx,
                    unk_token=unk_token,
                    pad_token=pad_token,
                    bos_token=bos_token,
                    eos_token=eos_token,
                    **kwargs)
        return vocab

    @staticmethod
    def build_vocab(iterator,
                    max_size=None,
                    min_freq=1,
                    token_to_idx=None,
                    unk_token=None,
                    pad_token=None,
                    bos_token=None,
                    eos_token=None,
                    **kwargs):
        """
        Builds the :class:`Vocab` according to the given iterator and other
        information. Firstly, iterate over the `iterator` to construct a
        :class:`collections.Counter` and use it to init the :class:`Vocab`.

        Args:
            iterator (collections.Iterable): Iterator of tokens. Each element
                should be a list of tokens if word-level vocab is needed.
            max_size (int, optional): The max size of vocab, not including
                special tokens. Default: None.
            min_freq (int, optional): Ignore tokens whose frequencies are less
                than `min_freq`. Default: 1.
            token_to_idx (dict, optional): A dict that specifies the mapping
                relationship between tokens and indices to be used. If provided,
                adjust the tokens and indices mapping according to it. If None,
                counter must be provided. Default: None.
            unk_token (str, optional): The special token for unknown tokens
                '<unk>'. If no need, it also could be None. Default: None.
            pad_token (str, optional): The special token for padding
                '<pad>'. If no need, it also could be None. Default: None.
            bos_token (str, optional): The special token for bos '<bos>'.
                If no need, it also could be None. Default: None.
            eos_token (str, optional): The special token for eos '<eos>'.
                If no need, it also could be None. Default: None.
            kwargs (dict): Keyword arguments ending with `_token`. It can be
                used to specify further special tokens that will be exposed as
                attribute of the vocabulary and associated with an index.

        Returns:
            Vocab: An instance of :class:`Vocab` generated from the given
            iterator and other information.

        Example:
            .. code-block:: python

                from paddlenlp.data import Vocab
                # The vocab file. The sample file can be downloaded firstly.
                # wget https://paddlenlp.bj.bcebos.com/data/senta_word_dict.txt
                vocab_file_path = './senta_word_dict.txt'
                # Initialize the Vocab
                vocab = Vocab.load_vocabulary(
                    vocab_file_path,
                    unk_token='[UNK]',
                    pad_token='[PAD]')
                vocab1 = Vocab.build_vocab([list(vocab.token_to_idx.keys())])
                print(len(vocab), len(vocab1))
                # 1256608 1256608
        """
        counter = collections.Counter()
        for tokens in iterator:
            counter.update(tokens)
        vocab = Vocab(
            counter,
            max_size=max_size,
            min_freq=min_freq,
            token_to_idx=token_to_idx,
            unk_token=unk_token,
            pad_token=pad_token,
            bos_token=bos_token,
            eos_token=eos_token,
            **kwargs)
        return vocab

    @staticmethod
    def load_vocabulary(filepath,
                        unk_token=None,
                        pad_token=None,
                        bos_token=None,
                        eos_token=None,
                        **kwargs):
        """
        Builds the :class:`Vocab` from a file reserving all tokens by calling
        :meth:`Vocab.from_dict` method. The file contains a token per line, and
        the line index would be the index of corresponding token.

        Args:
            filepath (str): the path of file to construct vocabulary.
            unk_token (str, optional): special token for unknown tokens. If no
                need, it also could be None. Default: None.
            pad_token (str, optional): special token for padding. If no
                need, it also could be None. Default: None.
            bos_token (str, optional): special token for bos. If no need,
                it also could be None. Default: None.
            eos_token (str, optional): special token for eos. If no need,
                it also could be None. Default: None.
            kwargs (dict): Keyword arguments ending with `_token`. It can be
                used to specify further special tokens that will be exposed as
                attribute of the vocabulary and associated with an index.

        Returns:
            Vocab: An instance of :class:`Vocab` generated from the given file.

        Example:
            .. code-block:: python

                from paddlenlp.data import Vocab
                # The vocab file. The sample file can be downloaded firstly.
                # wget https://paddlenlp.bj.bcebos.com/data/senta_word_dict.txt
                vocab_file_path = './senta_word_dict.txt'
                # Initialize the Vocab
                vocab = Vocab.load_vocabulary(
                    vocab_file_path,
                    unk_token='[UNK]',
                    pad_token='[PAD]')
                print(len(vocab))
                # 1256608
        """
        token_to_idx = {}
        with io.open(filepath, 'r', encoding='utf-8') as f:
            for index, line in enumerate(f):
                # Only the trailing newline is stripped: tokens may contain
                # other whitespace.
                token = line.rstrip('\n')
                token_to_idx[token] = int(index)
        vocab = Vocab.from_dict(
            token_to_idx,
            unk_token=unk_token,
            pad_token=pad_token,
            bos_token=bos_token,
            eos_token=eos_token,
            **kwargs)
        return vocab
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from abc import ABC
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle.fluid.data_feeder import convert_dtype
from paddle.fluid.layers.utils import map_structure
# Public API of this module: only the generation mixin is exported.
__all__ = ["GenerationMixin"]
class BeamHypotheses:
    """
    Container that keeps the n-best generated hypotheses for one sample.

    Scores are length-normalized: ``sum_logprobs / (((len + 5) / 6) ** lp)``,
    so longer sequences are not unfairly penalized for accumulating more
    log-probability mass.
    """

    def __init__(self, num_beams, length_penalty, early_stopping):
        """Initialize an empty n-best list of hypotheses."""
        self.length_penalty = length_penalty
        self.early_stopping = early_stopping
        self.num_beams = num_beams
        self.beams = []
        # Sentinel larger than any realistic normalized log-prob score.
        self.worst_score = 1e9

    def __len__(self):
        """Return how many hypotheses are currently stored."""
        return len(self.beams)

    def add(self, hyp, sum_logprobs, origin_len=0):
        """Insert `hyp` when the list is not full or it beats the worst entry."""
        generated_len = hyp.shape[-1] - origin_len
        score = sum_logprobs / (((generated_len + 5) / 6)**self.length_penalty)
        # Reject outright if the list is full and the candidate is no better
        # than the current worst hypothesis.
        if len(self) >= self.num_beams and score <= self.worst_score:
            return
        self.beams.append((score, hyp))
        if len(self) > self.num_beams:
            # Evict the worst hypothesis and remember the new minimum.
            ranked = sorted((s, pos) for pos, (s, _) in enumerate(self.beams))
            del self.beams[ranked[0][1]]
            self.worst_score = ranked[1][0]
        else:
            self.worst_score = min(score, self.worst_score)

    def is_done(self, best_sum_logprobs, cur_len, origin_len=0):
        """
        Return True when there are enough hypotheses and none of the ones
        still being generated can beat the worst hypothesis in the list.
        """
        if len(self) < self.num_beams:
            return False
        if self.early_stopping:
            return True
        # Best achievable normalized score at the current length.
        best_possible = best_sum_logprobs / (
            (cur_len - origin_len + 5) / 6)**self.length_penalty
        return self.worst_score >= best_possible
class BeamSearchScorer(object):
    """
    implementing standard beam search decoding.

    Keeps one :class:`BeamHypotheses` per batch sample, routes finished
    (eos-terminated) beams into it in :meth:`process`, and assembles the final
    padded output tensors in :meth:`finalize`.
    """

    def __init__(self,
                 batch_size,
                 max_length,
                 num_beams,
                 length_penalty=1.0,
                 do_early_stopping=False,
                 num_beam_hyps_to_keep=1,
                 num_beam_groups=1):
        self.max_length = max_length
        self.num_beams = num_beams
        self.length_penalty = length_penalty
        self.do_early_stopping = do_early_stopping
        self.num_beam_hyps_to_keep = num_beam_hyps_to_keep
        self.num_beam_groups = num_beam_groups
        # Beams per diversity group (== num_beams when num_beam_groups == 1).
        self.group_size = self.num_beams // self.num_beam_groups
        self._is_init = False
        # One n-best container per batch sample.
        self._beam_hyps = [
            BeamHypotheses(
                num_beams=self.num_beams,
                length_penalty=self.length_penalty,
                early_stopping=self.do_early_stopping)
            for _ in range(batch_size)
        ]
        # 1 marks a finished sample, 0 an unfinished one.
        self._done = paddle.to_tensor(
            [0 for _ in range(batch_size)], dtype='int64')
        # NOTE(review): argument validation runs after the state above has
        # been built; moving these checks first would avoid wasted work.
        if not isinstance(num_beams, int) or num_beams <= 1:
            raise ValueError(
                "`num_beams` has to be an integer strictly greater than 1, but "
                "received {}. For `num_beams` == 1, one should make use of "
                "`greedy_search` instead.".format(num_beams))
        if not isinstance(num_beam_groups, int) or (
                num_beam_groups > num_beams) or (
                    num_beams % num_beam_groups != 0):
            raise ValueError(
                "`num_beam_groups` has to be an integer smaller or equal than "
                "`num_beams` and `num_beams` has to be divisible by "
                "`num_beam_groups`, but received num_beam_groups={}, num_beams="
                "{}.".format(num_beam_groups, num_beams))

    @property
    def is_done(self):
        # True only when every sample in the batch is finished.
        return paddle.min(self._done) == 1

    def process(self,
                input_ids,
                next_scores,
                next_tokens,
                next_indices,
                origin_len=0,
                pad_token_id=None,
                eos_token_id=None):
        """Process one decoding step: route eos-terminated candidates into the
        per-sample hypothesis containers and select the `group_size` surviving
        beams (scores, tokens, source-beam indices) for the next step."""
        cur_len = input_ids.shape[-1]
        batch_size = len(self._beam_hyps)
        assert batch_size == (input_ids.shape[0] // self.group_size)
        next_beam_scores = paddle.zeros(
            [batch_size, self.group_size], dtype=next_scores.dtype)
        next_beam_tokens = paddle.zeros(
            [batch_size, self.group_size], dtype=next_tokens.dtype)
        next_beam_indices = paddle.zeros(
            [batch_size, self.group_size], dtype=next_indices.dtype)
        for batch_idx, beam_hyp in enumerate(self._beam_hyps):
            if self._done[batch_idx] == 1:
                assert (
                    len(beam_hyp) >= self.num_beams
                ), "Batch can only be done if at least {} beams have been generated".format(
                    self.num_beams)
                assert (
                    eos_token_id is not None and pad_token_id is not None
                ), "generated beams >= num_beams -> eos_token_id and pad_token have to be defined"
                # pad the batch
                next_beam_scores[batch_idx, :] = 0
                next_beam_tokens[batch_idx, :] = pad_token_id
                next_beam_indices[batch_idx, :] = 0
                continue
            # next tokens for this sentence
            beam_idx = 0
            for beam_token_rank, (next_token, next_score,
                                  next_index) in enumerate(
                                      zip(next_tokens[batch_idx], next_scores[
                                          batch_idx], next_indices[batch_idx])):
                batch_beam_idx = batch_idx * self.group_size + next_index
                # add to generated hypotheses if end of sentence
                if (eos_token_id is not None) and (
                        next_token.numpy().item() == eos_token_id):
                    # If beam_token does not belong to top num_beams tokens,
                    # it should not be added
                    is_beam_token_worse_than_top_num_beams = (
                        beam_token_rank >= self.group_size)
                    if is_beam_token_worse_than_top_num_beams:
                        continue
                    beam_hyp.add(
                        input_ids[batch_beam_idx.numpy().item()].clone(),
                        next_score.numpy().item(), origin_len)
                else:
                    # add next predicted token since it is not eos_token
                    next_beam_scores[batch_idx, beam_idx] = next_score
                    next_beam_tokens[batch_idx, beam_idx] = next_token.numpy(
                    ).item()
                    next_beam_indices[batch_idx,
                                      beam_idx] = batch_beam_idx.numpy().item()
                    beam_idx += 1
                # once the beam for next step is full, don't add more tokens to it.
                if beam_idx == self.group_size:
                    break
            if beam_idx < self.group_size:
                raise ValueError(
                    "At most {} tokens in `next_tokens[batch_idx]` can be equal "
                    "to `eos_token_id: {}`. Make sure `next_tokens[batch_idx]` "
                    "are corrected.".format(self.group_size, eos_token_id))
            # Check if we are done so that we can save a pad step if all(done)
            if beam_hyp.is_done(next_scores[batch_idx].max().numpy().item(),
                                cur_len, origin_len):
                self._done[batch_idx] = 1
        return {
            "next_beam_scores": next_beam_scores.reshape([-1]),
            "next_beam_tokens": next_beam_tokens.reshape([-1]),
            "next_beam_indices": next_beam_indices.reshape([-1])
        }

    def finalize(self,
                 input_ids,
                 final_beam_scores,
                 final_beam_tokens,
                 final_beam_indices,
                 pad_token_id=None,
                 eos_token_id=None):
        """Flush all still-open beams into the hypothesis containers and
        return `(decoded, decoded_score)`: the `num_beam_hyps_to_keep` best
        sequences per sample, padded, with eos appended where it fits."""
        batch_size = len(self._beam_hyps)
        # finalize all open beam hypotheses and add to generated hypotheses
        for batch_idx, beam_hyp in enumerate(self._beam_hyps):
            if self._done[batch_idx] == 1:
                continue
            # all open beam hypotheses are added to the beam hypothesis
            # beam hypothesis class automatically keeps the best beams
            for beam_id in range(self.num_beams):
                batch_beam_idx = batch_idx * self.num_beams + beam_id
                final_score = final_beam_scores[batch_beam_idx].numpy().item()
                final_tokens = input_ids[batch_beam_idx]
                beam_hyp.add(final_tokens, final_score)
        # select the best hypotheses
        sent_lengths = paddle.zeros(
            [batch_size * self.num_beam_hyps_to_keep], dtype=input_ids.dtype)
        best = []
        # retrieve best hypotheses (beams sorted ascending, so pop() takes
        # the highest-scoring one first)
        for i, beam_hyp in enumerate(self._beam_hyps):
            sorted_hyps = sorted(beam_hyp.beams, key=lambda x: x[0])
            for j in range(self.num_beam_hyps_to_keep):
                best_score, best_hyp = sorted_hyps.pop()
                sent_lengths[self.num_beam_hyps_to_keep * i + j] = len(best_hyp)
                best.append([best_hyp, best_score])
        # prepare for adding eos
        sent_max_len = min(sent_lengths.max().numpy().item() + 1,
                           self.max_length)
        decoded = paddle.zeros(
            [batch_size * self.num_beam_hyps_to_keep, sent_max_len],
            dtype=input_ids.dtype)
        # shorter batches are padded if needed
        if sent_lengths.min().numpy().item() != sent_lengths.max().numpy().item(
        ):
            assert pad_token_id is not None, "`pad_token_id` has to be defined"
            decoded[:, :] = pad_token_id
        decoded_score = paddle.zeros(
            [batch_size * self.num_beam_hyps_to_keep, 1])
        # fill with hypotheses and eos_token_id if the latter fits in
        for i, (hypo, score) in enumerate(best):
            decoded[i, :sent_lengths[i].numpy().item()] = hypo.numpy()
            decoded_score[i] = score
            if sent_lengths[i] < self.max_length:
                # NOTE(review): writes eos_token_id unconditionally here;
                # presumably callers always pass eos_token_id — verify.
                decoded[i, sent_lengths[i].numpy().item()] = eos_token_id
        return decoded, decoded_score
class GenerationMixin(object):
r"""
This class implements the interface for generation task.
It's used as the base class of `paddlenlp.transformers.PretrainedModel
<https://paddlenlp.readthedocs.io/zh/latest/source/paddlenlp.transformers.model_utils.html>`__.
"""
@staticmethod
def prepare_input_ids_for_generation(bos_token_id, encoder_output=None):
batch_size = 1
if bos_token_id is None:
raise ValueError("`bos_token_id` should be defined when no "
"`input_ids` are provided.")
if encoder_output is not None:
batch_size = encoder_output.shape[0]
return paddle.ones([batch_size, 1], dtype="int64") * bos_token_id
@staticmethod
def prepare_attention_mask_for_generation(input_ids, pad_token_id,
eos_token_id):
is_pad_token_in_inputs_ids = (pad_token_id is not None) and paddle.any(
input_ids == pad_token_id).numpy().item()
is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or (
(eos_token_id is not None) and (pad_token_id != eos_token_id))
if is_pad_token_in_inputs_ids and is_pad_token_not_equal_to_eos_token_id:
attention_mask = (input_ids == pad_token_id
).astype(paddle.get_default_dtype()) * -1e9
else:
attention_mask = paddle.zeros_like(
input_ids, dtype=paddle.get_default_dtype())
return paddle.unsqueeze(attention_mask, axis=[1, 2])
@staticmethod
def get_logits_processor(min_length=None,
eos_token_id=None,
repetition_penalty=None):
processors = LogitsProcessorList()
if min_length is not None and eos_token_id is not None and min_length > -1:
processors.append(
MinLengthLogitsProcessor(min_length, eos_token_id))
if repetition_penalty is not None and repetition_penalty != 1.0:
processors.append(
RepetitionPenaltyLogitsProcessor(penalty=repetition_penalty))
# TODO
# Add more pre_processing for distribution
return processors
@staticmethod
def expand_inputs_for_generation(input_ids,
expand_size,
attention_mask=None,
**model_kwargs):
index = paddle.tile(
paddle.arange(input_ids.shape[0]).unsqueeze(-1),
[1, expand_size]).reshape([-1])
input_ids = paddle.index_select(input_ids, index)
if attention_mask is not None:
model_kwargs["attention_mask"] = paddle.index_select(attention_mask,
index)
if "token_type_ids" in model_kwargs:
token_type_ids = model_kwargs["token_type_ids"]
model_kwargs["token_type_ids"] = paddle.index_select(token_type_ids,
index)
if "position_ids" in model_kwargs:
position_ids = model_kwargs["position_ids"]
model_kwargs["position_ids"] = paddle.index_select(position_ids,
index)
if "seq_len" in model_kwargs:
seq_len = model_kwargs["seq_len"]
model_kwargs["seq_len"] = paddle.index_select(seq_len, index)
if "encoder_output" in model_kwargs:
encoder_output = model_kwargs["encoder_output"]
model_kwargs["encoder_output"] = paddle.index_select(encoder_output,
index)
return input_ids, model_kwargs
    @staticmethod
    def update_model_kwargs_for_generation(outputs,
                                           model_kwargs,
                                           is_encoder_decoder=False):
        """Advance `model_kwargs` by one generated position: refresh the
        cache, extend `token_type_ids`/`position_ids` by one column and grow
        the attention mask so the new token is attendable."""
        # Update the model inputs during generation.
        # Note that If `token_type_ids` and `attention_mask` in `model_kwargs`
        # and they contain pad value, the result vectors updated by this method
        # may be different from expected. In this case, you need to rewrite the
        # method.

        # update cache (a tuple output is assumed to be (logits, cache))
        if isinstance(outputs, tuple):
            model_kwargs["cache"] = outputs[1]

        # update token_type_ids with last value
        if "token_type_ids" in model_kwargs:
            token_type_ids = model_kwargs["token_type_ids"]
            model_kwargs["token_type_ids"] = paddle.concat(
                [token_type_ids, token_type_ids[:, -1].unsqueeze(-1)], axis=-1)

        # update position_ids (new position = last position + 1)
        if "position_ids" in model_kwargs:
            position_ids = model_kwargs["position_ids"]
            model_kwargs["position_ids"] = paddle.concat(
                [position_ids, position_ids[:, -1].reshape((-1, 1)) + 1],
                axis=-1)

        # update attention_mask
        if not is_encoder_decoder and "attention_mask" in model_kwargs:
            attention_mask = model_kwargs["attention_mask"]
            # nn.Pad2D don't support the data type `bool`
            if convert_dtype(attention_mask.dtype) == 'bool':
                attention_mask = paddle.cast(attention_mask, 'int64')
            # Grow the mask by one row (replicating the last row) and one
            # column (filled with -1e9, i.e. masked) for the new position.
            attention_mask = nn.Pad2D(
                [0, 0, 0, 1], mode='replicate')(attention_mask)
            attention_mask = nn.Pad2D([0, 1, 0, 0], value=-1e9)(attention_mask)
            dtype = convert_dtype(attention_mask.dtype)
            # Let the new token attend to itself: 1 for int-style masks,
            # 0.0 for additive float masks.
            if 'int' in dtype:
                attention_mask[:, :, -1, -1] = 1
            elif 'float' in dtype:
                attention_mask[:, :, -1, -1] = 0.0
            else:
                raise ValueError('The data type of input `attention_mask` must '
                                 'be bool, int or float')
            model_kwargs["attention_mask"] = attention_mask
        return model_kwargs
@staticmethod
def update_scores_for_generation(scores, next_scores, length,
unfinished_flag):
# update scores
unfinished_scores = (scores * length + next_scores) / (length + 1)
scores = paddle.where(unfinished_flag, unfinished_scores, scores)
return scores
def prepare_encoder_decoder_kwargs_for_generation(self, input_ids,
model_kwargs):
if "encoder_output" not in model_kwargs:
# retrieve encoder hidden states
encoder = self.get_encoder()
encoder_kwargs = {
argument: value
for argument, value in model_kwargs.items()
if not (argument.startswith("decoder_") or argument.startswith(
"cross_attn"))
}
model_kwargs["encoder_output"] = encoder(input_ids,
**encoder_kwargs)
return model_kwargs
def prepare_inputs_for_generation(self, input_ids, **kwargs):
# Implement in subclasses for custom behavior to prepare inputs in the
# generate method.
return {"input_ids": input_ids}
    def adjust_logits_during_generation(self, logits):
        # Implement in subclasses for custom behavior to adjust the logits in
        # the generate method. The base implementation is the identity.
        return logits
    @paddle.no_grad()
    def generate(self,
                 input_ids=None,
                 max_length=20,
                 min_length=0,
                 decode_strategy='greedy_search',
                 temperature=1.0,
                 top_k=0,
                 top_p=1.0,
                 repetition_penalty=1.0,
                 num_beams=1,
                 length_penalty=0.0,
                 early_stopping=False,
                 bos_token_id=None,
                 eos_token_id=None,
                 pad_token_id=None,
                 num_return_sequences=1,
                 diversity_rate=0.0,
                 use_cache=True,
                 **model_kwargs):
        r"""
        The interface for generation task. This method can generate sequences
        by using decoding strategy. Currently, there are three decoding
        strategies supported: "greedy_search", "sampling" and "beam_search".
        Args:
            input_ids (Tensor, optional): The input sequence ids for the
                generation. It is a Tensor with shape [batch_size, sequence_length].
                The data type should be int32 or int64. Default to None, which
                we will initialize it as a Tensor with shape [1, 1], filled
                with the value `bos_token_id`.
            max_length (int, optional): The maximum length of the sequence to
                be generated. Default to 20.
            min_length (int, optional): The minimum length of the sequence to
                be generated. Default to 0.
            decode_strategy (str, optional): The decoding strategy in generation.
                Currently, there are three decoding strategies supported:
                "greedy_search", "sampling" and "beam_search". Default to
                "greedy_search".
            temperature (float, optional): The value used to module the next
                token probabilities in the "sampling" strategy. Default to 1.0,
                which means no effect.
            top_k (int, optional): The number of highest probability tokens to
                keep for top-k-filtering in the "sampling" strategy. Default to
                0, which means no effect.
            top_p (float, optional): The cumulative probability for
                top-p-filtering in the "sampling" strategy. The value should
                satisfy :math:`0 <= top\_p < 1`. Default to 1.0, which means no
                effect.
            repetition_penalty (float, optional):
                The parameter for repetition penalty. 1.0 means no penalty. See `this paper
                <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details. Defaults to 1.0.
            num_beams (int, optional): The number of beams in the "beam_search"
                strategy. Default to 1.
            length_penalty (float, optional): The exponential penalty to the
                sequence length in the "beam_search" strategy. The larger this
                param is, the more that the model would generate shorter
                sequences. Default to 0.0, which means no penalty.
            early_stopping (bool, optional): Whether to stop searching in the
                "beam_search" strategy when at least `num_beams` sentences are
                finished per batch or not. Default to False.
            bos_token_id (int, optional): The id of the `bos_token`. Default to
                None.
            eos_token_id (int, optional): The id of the `eos_token`. Default to
                None.
            pad_token_id (int, optional): The id of the `pad_token`. Default to
                None.
            num_return_sequences (int, optional): The number of returned
                sequences for each sequence in the batch. Default to 1.
            diversity_rate (float, optional): The diversity_rate for diverse
                siblings search. See this paper for more details.
                `https://arxiv.org/abs/1611.08562`.
            use_cache: (bool, optional): Whether or not use the model cache to
                speed up decoding. Default to True.
            model_kwargs (dict): It can be used to specify additional kwargs
                passed to the model.
        Returns:
            tuple[Tensor]: It is a tuple contains two elements: ids and scores.
            Each element is a Tensor.
            With the fields:
            - ids (Tensor):
                The ids of the generated sequences. It is a Tensor with shape
                [batch_size * num_return_sequences, sequence_length]. The data
                type is same as the input `input_ids`.
            - scores (Tensor):
                The scores of the generated sequences. It is a Tensor with shape
                [batch_size * num_return_sequences, 1]. The data type is float32
                or float64, which is the same as the parameters in the model.
        Example:
            .. code-block::
                import paddle
                from paddlenlp.transformers import (
                    UnifiedTransformerLMHeadModel,
                    UnifiedTransformerTokenizer
                )
                paddle.seed(2)
                # Initialize the model and tokenizer
                model_name_or_path = 'unified_transformer-12L-cn-luge'
                model = UnifiedTransformerLMHeadModel.from_pretrained(model_name_or_path)
                tokenizer = UnifiedTransformerTokenizer.from_pretrained(model_name_or_path)
                # Prepare the model inputs.
                history = "早上好,今天空气质量不错。"
                inputs = tokenizer.dialogue_encode(history, task_type='chitchat',
                    add_start_token_as_response=True, return_tensors=True)
            .. code-block::
                # Generate the sequence by using "greedy_search" strategy
                ids, scores = model.generate(
                    input_ids=inputs['input_ids'],
                    token_type_ids=inputs['token_type_ids'],
                    position_ids=inputs['position_ids'],
                    attention_mask=inputs['attention_mask'],
                    decode_strategy="greedy_search")
                print(ids.shape, scores.shape)
                # [1, 3] [1, 1]
                sequence_ids = ids.numpy().tolist()[0]
                sequence_ids = sequence_ids[:sequence_ids.index(tokenizer.sep_token_id)]
                response = tokenizer.convert_ids_to_string(sequence_ids, keep_space=False)
                print(response)
                # 是的
            .. code-block::
                # Generate 2 sequences by using "sampling" strategy (top_k=5)
                ids, scores = model.generate(
                    input_ids=inputs['input_ids'],
                    token_type_ids=inputs['token_type_ids'],
                    position_ids=inputs['position_ids'],
                    attention_mask=inputs['attention_mask'],
                    decode_strategy="sampling",
                    top_k=5,
                    num_return_sequences=2)
                print(ids.shape, scores.shape)
                # [2, 7] [2, 1]
                response = []
                for sequence_ids in ids.numpy().tolist():
                    sequence_ids = sequence_ids[:sequence_ids.index(tokenizer.sep_token_id)]
                    text = tokenizer.convert_ids_to_string(sequence_ids, keep_space=False)
                    response.append(text)
                print(response)
                # ['天气好,心情也好', '你也是']
            .. code-block::
                # Generate 2 sequences by using "beam_search" strategy (num_beams=5)
                ids, scores = model.generate(
                    input_ids=inputs['input_ids'],
                    token_type_ids=inputs['token_type_ids'],
                    position_ids=inputs['position_ids'],
                    attention_mask=inputs['attention_mask'],
                    decode_strategy="beam_search",
                    num_beams=5,
                    num_return_sequences=2)
                print(ids.shape, scores.shape)
                # [2, 3] [2, 1]
                response = []
                for sequence_ids in ids.numpy().tolist():
                    sequence_ids = sequence_ids[:sequence_ids.index(tokenizer.sep_token_id)]
                    text = tokenizer.convert_ids_to_string(sequence_ids, keep_space=False)
                    response.append(text)
                print(response)
                # ['是的', '嗯嗯']
        """
        # params check
        # Explicit arguments win; otherwise fall back to model-level token-id
        # attributes when they exist.
        bos_token_id = bos_token_id if bos_token_id is not None else getattr(
            self, 'bos_token_id', None)
        eos_token_id = eos_token_id if eos_token_id is not None else getattr(
            self, 'eos_token_id', None)
        pad_token_id = pad_token_id if pad_token_id is not None else getattr(
            self, 'pad_token_id', None)
        if input_ids is None:
            # Init `input_ids` with bos_token_id
            input_ids = self.prepare_input_ids_for_generation(bos_token_id)
        if model_kwargs.get("attention_mask", None) is None:
            # TODO
            # Init `attention_mask` depending on `pad_token_id`
            model_kwargs[
                "attention_mask"] = self.prepare_attention_mask_for_generation(
                    input_ids, pad_token_id, eos_token_id)
        # NOTE(review): encoder-decoder detection is purely structural
        # (presence of both attributes) — confirm this holds for all models
        # that call generate().
        self.is_encoder_decoder = hasattr(self, 'encoder') and hasattr(
            self, 'decoder')
        if self.is_encoder_decoder:
            model_kwargs = self.prepare_encoder_decoder_kwargs_for_generation(
                input_ids, model_kwargs)
            # set input_ids as decoder_input_ids
            if "decoder_input_ids" in model_kwargs:
                input_ids = model_kwargs.pop("decoder_input_ids")
            else:
                input_ids = self.prepare_input_ids_for_generation(
                    bos_token_id, model_kwargs["encoder_output"])
        if pad_token_id is None and eos_token_id is not None:
            # NOTE(review): plain print instead of warnings/logging — confirm
            # against the project's logging convention.
            print("Setting `pad_token_id` to `eos_token_id`:{} for "
                  "open-end generation.".format(eos_token_id))
            pad_token_id = eos_token_id
        model_kwargs["use_cache"] = use_cache
        # `max_length`/`min_length` are lengths of *new* tokens; convert them
        # to absolute lengths including the prompt.
        max_length += input_ids.shape[-1]
        min_length += input_ids.shape[-1]
        logits_processors = self.get_logits_processor(min_length, eos_token_id,
                                                      repetition_penalty)
        if decode_strategy == 'greedy_search':
            if num_return_sequences > 1:
                raise ValueError(
                    "`num_return_sequences` has to be 1, but is {} "
                    "when doing greedy search.".format(num_return_sequences))
            return self.greedy_search(input_ids, logits_processors, max_length,
                                      pad_token_id, eos_token_id,
                                      **model_kwargs)
        elif decode_strategy == 'sampling':
            if num_return_sequences > 1:
                # Duplicate each batch row so each gets its own sample.
                input_ids, model_kwargs = self.expand_inputs_for_generation(
                    input_ids, expand_size=num_return_sequences, **model_kwargs)
            return self.sample(input_ids, logits_processors, max_length,
                               pad_token_id, eos_token_id, top_k, top_p,
                               temperature, **model_kwargs)
        elif decode_strategy == 'beam_search':
            batch_size = input_ids.shape[0]
            if num_return_sequences > num_beams:
                raise ValueError(
                    "`num_return_sequences` has to be smaller or equal to "
                    "`num_beams`. But received `num_return_sequences` is {}, "
                    "`num_beams` is {}".format(num_return_sequences, num_beams))
            if num_beams <= 1:
                raise ValueError(
                    "`num_beams` has to be bigger than 1. But received "
                    "`num_beams` is {}. If `num_beams` is 1, `decode_strategy` "
                    "should be 'greedy_search'".format(num_beams))
            beam_scorer = BeamSearchScorer(
                batch_size=batch_size,
                max_length=max_length,
                num_beams=num_beams,
                length_penalty=length_penalty,
                do_early_stopping=early_stopping,
                num_beam_hyps_to_keep=num_return_sequences)
            # Tile inputs so each beam has its own row.
            input_ids, model_kwargs = self.expand_inputs_for_generation(
                input_ids, expand_size=num_beams, **model_kwargs)
            return self.beam_search(input_ids, beam_scorer, logits_processors,
                                    max_length, diversity_rate, pad_token_id,
                                    eos_token_id, **model_kwargs)
        else:
            raise ValueError(
                '`decode_strategy` must be one of "greedy_search", "sampling" '
                'and "beam_search".')
    def greedy_search(self, input_ids, logits_processors, max_length,
                      pad_token_id, eos_token_id, **model_kwargs):
        """Greedy decoding: pick the argmax token at every step.

        Returns (generated_ids_without_prompt, average_log_prob_scores).
        """
        batch_size, cur_len = input_ids.shape
        origin_len = cur_len
        # Per-sequence flag: True while the sequence has not emitted EOS yet.
        unfinished_flag = paddle.full([batch_size, 1], True, dtype='bool')
        # Running average log-probability of the chosen tokens, per sequence.
        scores = paddle.full(
            [batch_size, 1], 0.0, dtype=paddle.get_default_dtype())
        while cur_len < max_length:
            # prepare model inputs & get model output
            model_inputs = self.prepare_inputs_for_generation(input_ids,
                                                              **model_kwargs)
            outputs = self(**model_inputs)
            # Models that return (logits, cache, ...) tuples: logits first.
            logits = outputs[0] if isinstance(outputs, tuple) else outputs
            # [batch_size, vocab_size]
            logits = logits[:, -1, :]
            # pre-process distribution
            logits = self.adjust_logits_during_generation(logits)
            logits = logits_processors(input_ids, logits)
            # greedy: argmax over log-probs (same argmax as over logits; the
            # log-probs are also used for the score bookkeeping below)
            probs = F.softmax(logits)
            probs = paddle.log(probs)
            next_tokens = paddle.argmax(probs, axis=-1).unsqueeze(-1)
            next_scores = paddle.index_sample(probs, next_tokens)
            if eos_token_id is not None:
                # Finished sequences keep emitting pad tokens.
                next_tokens = paddle.where(unfinished_flag, next_tokens,
                                           paddle.full_like(next_tokens,
                                                            pad_token_id))
            scores = self.update_scores_for_generation(
                scores, next_scores, cur_len - origin_len, unfinished_flag)
            cur_len += 1
            input_ids = paddle.concat([input_ids, next_tokens], axis=1)
            if eos_token_id is not None:
                unfinished_flag = paddle.logical_and(
                    unfinished_flag, next_tokens != eos_token_id)
                # Stop when there is a </s> in all sentences
                if not paddle.any(unfinished_flag):
                    break
            model_kwargs = self.update_model_kwargs_for_generation(
                outputs,
                model_kwargs,
                is_encoder_decoder=self.is_encoder_decoder)
        # Strip the prompt; return only newly generated tokens.
        return input_ids[:, origin_len:], scores
    def sample(self,
               input_ids,
               logits_processors,
               max_length,
               pad_token_id,
               eos_token_id,
               top_k=None,
               top_p=None,
               temperature=None,
               min_tokens_to_keep=1,
               **model_kwargs):
        """Sampling decoding with optional temperature, top-k and top-p
        (nucleus) filtering.

        Returns (generated_ids_without_prompt, average_log_prob_scores).
        Scores are computed from the *unfiltered* distribution (before
        temperature/top-k/top-p) so they are comparable across settings.
        """

        def TopKProcess(probs, top_k, min_tokens_to_keep):
            # Clamp k into [min_tokens_to_keep, vocab_size].
            top_k = min(max(top_k, min_tokens_to_keep), probs.shape[-1])
            # Remove all tokens with a probability less than the last token of the top-k
            topk_probs, _ = paddle.topk(probs, k=top_k)
            probs = paddle.where(probs >= topk_probs[:, -1:], probs,
                                 paddle.full_like(probs, 0.0))
            return probs

        def TopPProcess(probs, top_p, min_tokens_to_keep):
            sorted_probs = paddle.sort(probs, descending=True)
            sorted_indices = paddle.argsort(probs, descending=True)
            cumulative_probs = paddle.cumsum(sorted_probs, axis=-1)
            # Remove tokens with cumulative probs above the top_p, But keep at
            # least min_tokens_to_keep tokens
            sorted_indices_to_remove = cumulative_probs > top_p
            if min_tokens_to_keep > 1:
                # Set 'min_tokens_to_keep - 1' because the first token is kept
                sorted_indices_to_remove[:, :min_tokens_to_keep - 1] = 0
            # Keep the first token
            sorted_indices_to_remove = paddle.cast(
                sorted_indices_to_remove, dtype='int64')
            # Shift the removal mask right by one so the token that crosses
            # the top_p boundary is itself kept.
            sorted_indices_to_remove[:, 1:] = (
                sorted_indices_to_remove[:, :-1].clone())
            sorted_indices_to_remove[:, 0] = 0
            # Scatter sorted tensors to original indexing
            # (flattened row offsets turn per-row indices into global ones)
            sorted_indices = sorted_indices + paddle.arange(probs.shape[
                0]).unsqueeze(-1) * probs.shape[-1]
            condition = paddle.scatter(sorted_indices_to_remove.flatten(),
                                       sorted_indices.flatten(),
                                       sorted_indices_to_remove.flatten())
            condition = paddle.cast(condition, 'bool').reshape(probs.shape)
            probs = paddle.where(condition, paddle.full_like(probs, 0.0), probs)
            return probs

        batch_size, cur_len = input_ids.shape
        origin_len = cur_len
        # True while a sequence has not emitted EOS.
        unfinished_flag = paddle.full([batch_size, 1], True, dtype='bool')
        scores = paddle.full(
            [batch_size, 1], 0.0, dtype=paddle.get_default_dtype())
        while cur_len < max_length:
            # prepare model inputs & get model output
            model_inputs = self.prepare_inputs_for_generation(input_ids,
                                                              **model_kwargs)
            outputs = self(**model_inputs)
            logits = outputs[0] if isinstance(outputs, tuple) else outputs
            # [batch_size, vocab_size]
            logits = logits[:, -1, :]
            # pre-process distribution
            logits = self.adjust_logits_during_generation(logits)
            logits = logits_processors(input_ids, logits)
            # sample
            # Keep the unfiltered log-probs for score bookkeeping.
            origin_probs = F.softmax(logits)
            origin_probs = paddle.log(origin_probs)
            if temperature is not None and temperature != 1.0:
                logits = logits / temperature
            probs = F.softmax(logits)
            if top_k is not None and top_k != 0:
                probs = TopKProcess(probs, top_k, min_tokens_to_keep)
            if top_p is not None and top_p < 1.0:
                probs = TopPProcess(probs, top_p, min_tokens_to_keep)
            # multinomial renormalizes, so zeroed-out entries are never drawn
            next_tokens = paddle.multinomial(probs)
            next_scores = paddle.index_sample(origin_probs, next_tokens)
            if eos_token_id is not None:
                # Finished sequences keep emitting pad tokens.
                next_tokens = paddle.where(unfinished_flag, next_tokens,
                                           paddle.full_like(next_tokens,
                                                            pad_token_id))
            scores = self.update_scores_for_generation(
                scores, next_scores, cur_len - origin_len, unfinished_flag)
            cur_len += 1
            input_ids = paddle.concat([input_ids, next_tokens], axis=1)
            if eos_token_id is not None:
                unfinished_flag = paddle.logical_and(
                    unfinished_flag, next_tokens != eos_token_id)
                # Stop when there is a </s> in all sentences
                if not paddle.any(unfinished_flag):
                    break
            model_kwargs = self.update_model_kwargs_for_generation(
                outputs,
                model_kwargs,
                is_encoder_decoder=self.is_encoder_decoder)
        return input_ids[:, origin_len:], scores
def beam_search(self, input_ids, beam_scorer, logits_processors, max_length,
diversity_rate, pad_token_id, eos_token_id, **model_kwargs):
batch_size = len(beam_scorer._beam_hyps)
num_beams = beam_scorer.num_beams
batch_beam_size, cur_len = input_ids.shape
origin_len = cur_len
assert (
num_beams * batch_size == batch_beam_size
), "Batch dimension of `input_ids` should be {}, but received {}.".format(
num_beams * batch_size, batch_beam_size)
beam_scores = paddle.zeros(
(batch_size, num_beams), dtype=paddle.get_default_dtype())
beam_scores[:, 1:] = -1e9
beam_scores = paddle.reshape(beam_scores, [-1])
while cur_len < max_length:
# prepare model inputs & get model output
model_inputs = self.prepare_inputs_for_generation(input_ids,
**model_kwargs)
outputs = self(**model_inputs)
logits = outputs[0] if isinstance(outputs, tuple) else outputs
# [batch_size, vocab_size]
logits = logits[:, -1, :]
# pre-process distribution
logits = self.adjust_logits_during_generation(logits)
logits = logits_processors(input_ids, logits)
# beam search
# [batch_size * num_beams, vocab_size]
next_scores = F.softmax(logits)
next_scores = paddle.log(next_scores)
next_scores = next_scores + beam_scores.unsqueeze(-1)
vocab_size = next_scores.shape[-1]
if diversity_rate == 0.0:
# reshape for beam search
next_scores = next_scores.reshape(
[batch_size, num_beams * vocab_size])
next_scores, next_tokens = paddle.topk(
next_scores, 2 * num_beams, axis=1)
next_indices = next_tokens // vocab_size
else:
next_scores, next_tokens = paddle.topk(
next_scores, 2 * num_beams, axis=1)
sibling_score = paddle.tile(
paddle.arange(1, 2 * num_beams + 1),
repeat_times=[batch_size * num_beams, 1]) * diversity_rate
diversed_score = next_scores - sibling_score
next_scores = next_scores.reshape(
[batch_size, 2 * num_beams * num_beams])
next_tokens = next_tokens.reshape(
[batch_size, 2 * num_beams * num_beams])
diversed_score = diversed_score.reshape(
[batch_size, 2 * num_beams * num_beams])
diversed_score, diversed_tokens = paddle.topk(
diversed_score, 2 * num_beams, axis=1)
# TODO
# Use gather_nd() to select origan token and score
next_scores = paddle.stack([
paddle.index_select(next_scores[i], diversed_tokens[i])
for i in range(next_scores.shape[0])
])
next_tokens = paddle.stack([
paddle.index_select(next_tokens[i], diversed_tokens[i])
for i in range(next_tokens.shape[0])
])
next_indices = next_tokens // (2 * num_beams)
next_tokens = next_tokens % vocab_size
# stateless
beam_outputs = beam_scorer.process(
input_ids,
next_scores,
next_tokens,
next_indices,
origin_len=origin_len,
pad_token_id=pad_token_id,
eos_token_id=eos_token_id, )
beam_scores = beam_outputs["next_beam_scores"]
beam_next_tokens = beam_outputs["next_beam_tokens"]
beam_idx = beam_outputs["next_beam_indices"]
cur_len += 1
input_ids = paddle.concat(
[
paddle.index_select(input_ids, beam_idx),
beam_next_tokens.unsqueeze(-1)
],
axis=-1)
if beam_scorer.is_done:
break
model_kwargs = self.update_model_kwargs_for_generation(
outputs,
model_kwargs,
is_encoder_decoder=self.is_encoder_decoder)
if model_kwargs["cache"] is not None:
# reorder the cache
model_kwargs["cache"] = map_structure(
lambda x: paddle.index_select(x, beam_idx),
model_kwargs["cache"])
pred_ids, scores = beam_scorer.finalize(
input_ids,
beam_scores,
next_tokens,
next_indices,
pad_token_id=pad_token_id,
eos_token_id=eos_token_id)
return pred_ids[:, origin_len:], scores
class LogitsProcessorList(List):
    """A list of logits processors that are applied sequentially."""

    def __call__(self, input_ids, logits):
        """Pipe `logits` through every contained processor, in order."""
        for transform in self:
            logits = transform(input_ids, logits)
        return logits
class LogitsProcessor(ABC):
    """
    Abstract base class for all logit processors that can be applied during
    generation.
    """

    def __call__(self, input_ids, logits):
        raise NotImplementedError(
            f"{self.__class__} is an abstract class. "
            "Only classes inheriting this class can be called.")


class MinLengthLogitsProcessor(LogitsProcessor):
    r"""
    Enforcing a min-length by setting EOS probability to 0.
    Args:
        min_length (int): The minimum length of generation sequence.
        eos_token_id (int): The id of the `end-of-sequence` token.
    """

    def __init__(self, min_length, eos_token_id):
        if not (isinstance(min_length, int) and min_length >= 0):
            raise ValueError(
                "`min_length` should be a positive integer, but get {}".format(
                    min_length))
        if not (isinstance(eos_token_id, int) and eos_token_id >= 0):
            raise ValueError(
                "`eos_token_id` should be a positive integer, but get {}".
                format(eos_token_id))
        self.min_length = min_length
        self.eos_token_id = eos_token_id

    def __call__(self, input_ids, logits):
        # Block EOS until the generated sequence reaches min_length.
        if input_ids.shape[-1] < self.min_length:
            logits[:, self.eos_token_id] = -1e9
        return logits
class RepetitionPenaltyLogitsProcessor(LogitsProcessor):
    r"""
    Enforcing an exponential penalty on repeated sequences.
    Args:
        repetition_penalty (float):
            The parameter for repetition penalty. 1.0 means no penalty. See `this paper
            <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details.
    """

    def __init__(self, penalty: float):
        if not isinstance(penalty, float) or not (penalty > 0):
            raise ValueError(
                f"`penalty` has to be a strictly positive float, but is {penalty}"
            )
        self.penalty = penalty

    def __call__(self, input_ids, logits):
        # Gather the current logit of every token already generated.
        score = paddle.index_sample(logits, input_ids)
        # CTRL-style penalty: divide positive scores, multiply negative ones,
        # so the penalty always lowers the probability of repeats.
        score = paddle.where(score < 0, score * self.penalty,
                             score / self.penalty)
        # Convert per-row token ids to flat indices so a single 1-D scatter
        # can write the penalized scores back into `logits`.
        input_ids = input_ids + paddle.arange(logits.shape[0]).unsqueeze(
            -1) * logits.shape[-1]
        outputs = paddle.scatter(logits.flatten(),
                                 input_ids.flatten(),
                                 score.flatten()).reshape(logits.shape)
        return outputs
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import pickle
def load_tf_checkpoint(input_path, output_path=None):
    """Read every variable from a TensorFlow checkpoint into a dict.

    Returns a mapping of variable name -> numpy array. When `output_path`
    is given, the mapping is also pickled to that path.
    """
    import tensorflow as tf
    tf_path = os.path.abspath(input_path)
    print("Converting TensorFlow checkpoint from {}".format(tf_path))
    # Load weights from TF model
    names = []
    arrays = []
    for name, shape in tf.train.list_variables(tf_path):
        print("Loading TF weight {} with shape {}".format(name, shape))
        names.append(name)
        arrays.append(tf.train.load_variable(tf_path, name))
    # Variable names must be unique for the dict below to be lossless.
    assert len(set(names)) == len(names)
    assert len(names) == len(arrays)
    name_to_array = dict(zip(names, arrays))
    if output_path:
        print("Save TF numpy weight to {}".format(output_path))
        save_pickled_tf_checkpoint(name_to_array, output_path)
    return name_to_array
def load_pickled_tf_checkpoint(input_path):
    """Load a pickled TF checkpoint (variable name -> ndarray dict)."""
    with open(input_path, "rb") as checkpoint_file:
        raw = checkpoint_file.read()
    return pickle.loads(raw)
def save_pickled_tf_checkpoint(name_to_array, output_path):
    """Pickle the variable name -> ndarray mapping to `output_path`.

    Protocol 2 is kept so the file stays readable by older tooling.
    """
    with open(output_path, "wb") as out_file:
        pickle.dump(name_to_array, out_file, protocol=2)
if __name__ == "__main__":
#input_path = "/data/zengjinle/dataset/bert_data/phase1/model.ckpt-28252"
#output_path = "tf_ckpt.pickle"
assert len(sys.argv) == 3
input_path = sys.argv[1]
output_path = sys.argv[2]
load_tf_checkpoint(input_path, output_path)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from mlperf_logging import mllog
mllogger = mllog.get_mllogger()
def _paddle_bert_print(logger,
key,
val=None,
metadata=None,
stack_offset=3,
namespace="paddle_mlperf"):
logger(
key=key,
value=val,
metadata=metadata,
stack_offset=stack_offset,
namespace=namespace)
def paddle_bert_print_start(key, val=None, metadata=None):
    # MLPerf "start" interval marker (wraps mllogger.start).
    _paddle_bert_print(mllogger.start, key, val, metadata)
def paddle_bert_print_end(key, val=None, metadata=None):
    # MLPerf "end" interval marker (wraps mllogger.end).
    _paddle_bert_print(mllogger.end, key, val, metadata)
def paddle_bert_print_event(key, val=None, metadata=None):
    # MLPerf point-in-time event (wraps mllogger.event).
    _paddle_bert_print(mllogger.event, key, val, metadata)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import copy
import math
import numpy as np
import numbers
import json
import sys
import os
import paddle
import paddle.nn as nn
import paddle.tensor as tensor
import paddle.nn.functional as F
from paddle.nn import TransformerEncoder, Linear, Layer, Embedding, LayerNorm, Tanh
from paddle.nn import Layer, LayerList
from paddle.fluid.initializer import Constant
import utility
from bert_padding import generate_mask
#try:
# from custom_setup_ops import custom_fmha, custom_fused_dropout_residual_ln, custom_fused_dense
#except ImportError as e:
# print('custom_setup_ops import error: {}'.format(e))
from .load_tf_checkpoint import load_pickled_tf_checkpoint, save_pickled_tf_checkpoint
from .mlperf_logging_helper import paddle_bert_print_event
__all__ = [
'BertConfig',
'BertModel',
'BertForPretraining',
'BertPretrainingCriterion',
'BertPretrainingHeads',
]
use_nv_input = utility.use_nv_input()
GELU_APPROXIMATE = True
def get_activation(approximate=GELU_APPROXIMATE):
    """Return the activation layer used throughout the model.

    Args:
        approximate (bool, optional): Whether to use the tanh GELU
            approximation. Defaults to the module-level GELU_APPROXIMATE
            flag, so existing no-argument callers behave exactly as before.

    Returns:
        A paddle ``nn.GELU`` layer.
    """
    return nn.GELU(approximate=approximate)
def mask_gather(var, mask):
    # Gather the rows of `var` selected by boolean `mask` (used to compact
    # padded tokens). NOTE(review): paddle.fluid.layers.where is the legacy
    # spelling of paddle.nonzero — confirm against the pinned Paddle version.
    return paddle.gather_nd(var, paddle.fluid.layers.where(mask))
def gen_pos_id(input_ids):
    """Build position ids [0, 1, ..., seq_len-1] with the same shape and
    dtype as `input_ids`; gradients are not propagated through them."""
    ones = paddle.ones_like(input_ids)
    position_ids = paddle.cumsum(ones, axis=-1) - ones
    position_ids.stop_gradient = True
    return position_ids
def fuse_dense(x,
               y,
               bias,
               transx=False,
               transy=False,
               with_gelu=False,
               use_addto=False):
    """Matmul + optional bias + optional GELU in one helper.

    This is the fallback composition of plain paddle ops; the fused
    `custom_fused_dense` kernel is disabled (see the commented-out
    custom_setup_ops import at the top of the file).

    Args:
        x, y: Input tensors for the matmul.
        bias: Bias tensor added to the matmul output, or None to skip.
        transx, transy (bool): Transpose `x` / `y` inside the matmul.
        with_gelu (bool): Apply the model's GELU activation to the result.
        use_addto (bool): Unused in this fallback path; kept so the
            signature stays compatible with the fused kernel.

    Returns:
        The resulting tensor.
    """
    out = paddle.matmul(x, y, transpose_x=transx, transpose_y=transy)
    if bias is not None:
        out = out + bias
    if with_gelu:
        out = get_activation()(out)
    return out
class BertConfig(object):
    """Configuration class to store the configuration of a `BertModel`."""

    def __init__(self, vocab_size_or_config_json_file, **kwargs):
        """Constructs BertConfig.
        Args:
            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in
                `BertModel` (int), or the path to a JSON config file (str).
                When a path is given no other kwargs may be passed.
            hidden_size: Size of the encoder layers and the pooler layer.
            num_hidden_layers: Number of hidden layers in the Transformer encoder.
            num_attention_heads: Number of attention heads for each attention layer in
                the Transformer encoder.
            intermediate_size: The size of the "intermediate" (i.e., feed-forward)
                layer in the Transformer encoder.
            hidden_act: The non-linear activation function (function or string) in the
                encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
            hidden_dropout_prob: The dropout probabilitiy for all fully connected
                layers in the embeddings, encoder, and pooler.
            attention_probs_dropout_prob: The dropout ratio for the attention
                probabilities.
            max_position_embeddings: The maximum sequence length that this model might
                ever be used with. Typically set this to something large just in case
                (e.g., 512 or 1024 or 2048).
            type_vocab_size: The vocabulary size of the `token_type_ids` passed into
                `BertModel`.
            initializer_range: The sttdev of the truncated_normal_initializer for
                initializing all weight matrices.
        """
        # NOTE: the original Python-2 `unicode` compatibility check was
        # removed; this file already uses Python-3-only syntax (f-strings),
        # so that branch was unreachable dead code.
        if isinstance(vocab_size_or_config_json_file, str):
            with open(
                    vocab_size_or_config_json_file, "r",
                    encoding='utf-8') as reader:
                json_config = json.loads(reader.read())
            assert not kwargs, "kwargs should be empty if config json file is provided"
            self._fill_dict(json_config)
        elif isinstance(vocab_size_or_config_json_file, int):
            self.vocab_size = vocab_size_or_config_json_file
            self._fill_dict(kwargs)
        else:
            raise ValueError(
                "First argument must be either a vocabulary size (int) "
                "or the path to a pretrained model config file (str)")

    @classmethod
    def from_dict(cls, json_object):
        """Constructs a `BertConfig` from a Python dictionary of parameters."""
        config = BertConfig(vocab_size_or_config_json_file=-1)
        config._fill_dict(json_object)
        return config

    @classmethod
    def from_json_file(cls, json_file):
        """Constructs a `BertConfig` from a json file of parameters."""
        with open(json_file, "r", encoding='utf-8') as reader:
            text = reader.read()
        return cls.from_dict(json.loads(text))

    def __repr__(self):
        return str(self.to_json_string())

    def to_dict(self):
        """Serializes this instance to a Python dictionary."""
        output = copy.deepcopy(self.__dict__)
        return output

    def to_json_string(self):
        """Serializes this instance to a JSON string."""
        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"

    def _fill_dict(self, kwargs=None):
        """Populate missing attributes with defaults, then apply `kwargs`."""
        defaults = {
            "vocab_size": 30522,
            "hidden_size": 768,
            "num_hidden_layers": 12,
            "num_attention_heads": 12,
            "intermediate_size": 3072,
            "hidden_act": "gelu",
            "hidden_dropout_prob": 0.1,
            "attention_probs_dropout_prob": 0.1,
            "max_position_embeddings": 512,
            "type_vocab_size": 2,
            "initializer_range": 0.02,
            "pool_act": "tanh",
            "pad_token_id": 0,
        }
        # fill defaults (never overwrite attributes that are already set)
        for key, value in defaults.items():
            if key not in self.__dict__:
                self.__dict__[key] = value
        # fill other values (explicit settings win over defaults)
        if kwargs:
            for key, value in kwargs.items():
                self.__dict__[key] = value
        # Embedding/padding logic elsewhere assumes the pad id is 0.
        assert self.pad_token_id == 0, "pad_token_id must be 0"
def transpose_2d(x):
    """Return a C-contiguous copy of the transpose of a 2-D array."""
    assert len(x.shape) == 2
    return np.ascontiguousarray(x.T)
class TFCkptHelper:
    def __init__(self, args, config, checkpoint_path, place):
        # Maps Paddle parameter names -> TF variable names, in insertion
        # order; populated by the model-building code.
        self.pd_vars_to_tf_vars = collections.OrderedDict()
        # Paddle parameter names whose 2-D weights are stored transposed
        # relative to the TF layout.
        self.transpose_vars = set()
        self.args = args
        self.config = config
        self.place = place
        self.checkpoint_path = checkpoint_path
        # Lazily-loaded pickled TF checkpoint; see the `tf_vars` property.
        self._tf_vars = None
        # When unpadded FMHA is enabled, Q/K/V projections are fused into a
        # single weight/bias parameter per encoder layer.
        self.fuse_attn_qkv = self.args.unpad_fmha
        # Per-layer fused QKV parameter names, filled in later by the model.
        self.attn_fused_qkv_weights = [
            None for _ in range(self.config.num_hidden_layers)
        ]
        self.attn_fused_qkv_biases = [
            None for _ in range(self.config.num_hidden_layers)
        ]
    @property
    def tf_vars(self):
        # Lazy-load and memoize the pickled TF checkpoint
        # (variable name -> numpy array dict).
        if self._tf_vars is None:
            self._tf_vars = load_pickled_tf_checkpoint(self.checkpoint_path)
        return self._tf_vars
    def save(self, output_path, get_parameter_func):
        """Export the Paddle parameters back into TF-checkpoint layout and
        pickle them to `output_path`.

        Fused QKV parameters are split back into separate query/key/value
        kernels and biases; transposed weights are transposed back; any TF
        variables not produced by the Paddle model are carried over from the
        original checkpoint unchanged.
        """
        tf_vars = {}
        if self.fuse_attn_qkv:
            # Un-fuse the per-layer QKV weight/bias into the three TF vars.
            for idx in range(self.config.num_hidden_layers):
                prefix = self._enc_prefix(idx) + "attention/self/"
                pd_weight_name = self.attn_fused_qkv_weights[idx]
                pd_weight_var = self._get_fp32_param(pd_weight_name,
                                                     get_parameter_func)
                assert len(pd_weight_var.shape) == 2
                pd_bias_name = self.attn_fused_qkv_biases[idx]
                pd_bias_var = self._get_fp32_param(pd_bias_name,
                                                   get_parameter_func)
                assert len(pd_bias_var.shape) == 1
                need_transpose = pd_weight_name in self.transpose_vars
                if need_transpose:  # split along dim 0 and transpose
                    assert pd_weight_var.shape[0] == 3 * pd_weight_var.shape[1]
                    assert pd_weight_var.shape[0] == pd_bias_var.shape[0]
                    tf_weights = np.split(pd_weight_var, 3, axis=0)
                    tf_weights = [transpose_2d(w) for w in tf_weights]
                else:  # split along dim 1
                    assert pd_weight_var.shape[1] == 3 * pd_weight_var.shape[0]
                    assert pd_weight_var.shape[1] == pd_bias_var.shape[0]
                    tf_weights = np.split(pd_weight_var, 3, axis=1)
                tf_biases = np.split(pd_bias_var, 3, axis=0)
                assert len(tf_weights) == 3
                assert len(tf_biases) == 3
                for i, name in enumerate(["query", "key", "value"]):
                    tf_var_name = prefix + name + "/"
                    weight_name = tf_var_name + "kernel"
                    bias_name = tf_var_name + "bias"
                    tf_vars[weight_name] = tf_weights[i]
                    tf_vars[bias_name] = tf_biases[i]
        # Export all directly-mapped parameters.
        for pd_var_name, tf_var_name in self.pd_vars_to_tf_vars.items():
            pd_var = self._get_fp32_param(pd_var_name, get_parameter_func)
            if "output_weights" in tf_var_name or pd_var_name in self.transpose_vars:
                pd_var = transpose_2d(pd_var)
            tf_vars[tf_var_name] = pd_var
        # Merge with the original checkpoint: keep untouched TF vars, and
        # trim padded embeddings/bias back to the original TF shapes.
        for key, tf_value in self.tf_vars.items():
            if key not in tf_vars:
                tf_vars[key] = tf_value
                continue
            pd_value = tf_vars[key]
            if tf_value.shape == pd_value.shape:
                continue
            if key == 'bert/embeddings/word_embeddings':
                assert len(tf_value.shape) == 2
                assert len(pd_value.shape) == 2
                assert tf_value.shape[1] == pd_value.shape[1]
                pd_value = pd_value[0:tf_value.shape[0]]
            elif key == 'cls/predictions/output_bias':
                assert len(tf_value.shape) == 1
                assert len(pd_value.shape) == 1
                pd_value = pd_value[0:tf_value.shape[0]]
            else:
                raise ValueError("unsupported key {}".format(key))
            tf_vars[key] = pd_value
        return save_pickled_tf_checkpoint(tf_vars, output_path)
    def load(self, get_parameter_func):
        """Load the TF checkpoint tensors held in ``self.tf_vars`` into the
        Paddle parameters resolved via ``get_parameter_func``.

        Args:
            get_parameter_func: callable mapping a Paddle parameter name to a
                tensor, or to a ``(param, fp32-master-param)`` pair.

        When ``self.fuse_attn_qkv`` is set, each layer's separate TF
        query/key/value kernel+bias tensors are concatenated into the single
        fused QKV weight/bias parameter first.
        """
        tf_vars = self.tf_vars
        loaded_var_names = set()
        if self.fuse_attn_qkv:
            for idx in range(self.config.num_hidden_layers):
                weights = []
                biases = []
                prefix = self._enc_prefix(idx) + "attention/self/"
                pd_weight_name = self.attn_fused_qkv_weights[idx]
                pd_bias_name = self.attn_fused_qkv_biases[idx]
                need_transpose = pd_weight_name in self.transpose_vars
                for name in ["query", "key", "value"]:
                    tf_var_name = prefix + name + "/"
                    weight_name = tf_var_name + "kernel"
                    bias_name = tf_var_name + "bias"
                    if utility.get_trainer_id() == 0:
                        paddle_bert_print_event(key='weights_initialization', metadata={'tensor':weight_name})
                        paddle_bert_print_event(key='weights_initialization', metadata={'tensor':bias_name})
                    # Transpose each Q/K/V piece when the fused Paddle weight
                    # is stored transposed relative to the TF kernel layout.
                    if need_transpose:
                        weights.append(transpose_2d(tf_vars[weight_name]))
                    else:
                        weights.append(tf_vars[weight_name])
                    biases.append(tf_vars[bias_name])
                    loaded_var_names.add(weight_name)
                    loaded_var_names.add(bias_name)
                # Concatenate Q/K/V: along dim 0 for the transposed layout,
                # along dim 1 otherwise (mirrors the split in the save path).
                weight = np.concatenate(
                    weights, axis=0 if need_transpose else 1)
                bias = np.concatenate(biases)
                weight_pd_vars = get_parameter_func(pd_weight_name)
                bias_pd_vars = get_parameter_func(pd_bias_name)
                self._set_var_value(
                    weight_pd_vars, weight,
                    self.attn_fused_qkv_weights[idx] + "/qkv/kernel",
                    prefix + "qkv/kernel")
                self._set_var_value(
                    bias_pd_vars, bias,
                    self.attn_fused_qkv_biases[idx] + "/qkv/bias",
                    prefix + "qkv/bias")
        # Load every explicitly recorded 1:1 mapping.
        for idx, (pd_var_name,
                  tf_var_name) in enumerate(self.pd_vars_to_tf_vars.items()):
            if utility.get_trainer_id() == 0:
                paddle_bert_print_event(key='weights_initialization', metadata={'tensor':tf_var_name})
            var_value = tf_vars[tf_var_name]
            # "output_weights" is stored transposed in the TF checkpoint;
            # transpose_vars marks Paddle-side transposed weights.
            if "output_weights" in tf_var_name or pd_var_name in self.transpose_vars:
                if utility.get_trainer_id() == 0:
                    print('{} needs to transpose'.format(tf_var_name))
                var_value = transpose_2d(var_value)
            pd_vars = get_parameter_func(pd_var_name)
            self._set_var_value(pd_vars, var_value, pd_var_name, tf_var_name)
            loaded_var_names.add(tf_var_name)
        # Sanity check: every recorded mapping must have been consumed; the
        # only leftover names allowed are the 6 per-layer Q/K/V kernel+bias
        # pieces that were folded into the fused QKV parameters.
        left_var_names = set()
        for var_name in self.pd_vars_to_tf_vars.values():
            if var_name not in loaded_var_names:
                left_var_names.add(var_name)
            else:
                loaded_var_names.remove(var_name)
        if self.fuse_attn_qkv:
            assert len(loaded_var_names
                       ) == 6 * self.config.num_hidden_layers, loaded_var_names
        else:
            assert len(loaded_var_names) == 0, loaded_var_names
        assert len(left_var_names) == 0, left_var_names
def _set_var_value(self, pd_vars, var_value, pd_var_name, tf_var_name):
if isinstance(pd_vars, (list, tuple)):
assert len(pd_vars) == 2
pd_var, master_pd_var = pd_vars
else:
pd_var = pd_vars
master_pd_var = None
pd_var_shape = tuple(pd_var.shape())
tf_var_shape = tuple(var_value.shape)
if pd_var_shape != tf_var_shape:
if utility.get_trainer_id() == 0:
print("{} vs {} shape differs: {} vs {}".format(
pd_var_name, tf_var_name, pd_var_shape, tf_var_shape))
assert len(pd_var_shape) == len(tf_var_shape)
slices = []
n = len(pd_var_shape)
for i in range(n):
assert pd_var_shape[i] >= tf_var_shape[i]
slices.append(slice(0, tf_var_shape[i], 1))
new_var_value = np.zeros(pd_var_shape, dtype=var_value.dtype)
new_var_value[slices] = var_value
var_value = new_var_value
if pd_var._dtype() == paddle.float16:
assert var_value.dtype == np.float32
if master_pd_var is not None:
assert master_pd_var._dtype() == paddle.float32
if utility.get_trainer_id() == 0:
print("Set master weight for {} {}".format(pd_var_name,
tf_var_name))
self._inplace_set_tensor(master_pd_var, var_value)
self._inplace_set_tensor(pd_var, var_value.astype(np.float16))
elif pd_var._dtype() == paddle.float32:
assert var_value.dtype == np.float32
assert master_pd_var is None
self._inplace_set_tensor(pd_var, var_value)
else:
raise TypeError("unsupported data type {}".format(pd_var._dtype()))
def _inplace_set_tensor(self, tensor, value):
old_ptr = tensor._ptr()
tensor.set(value, self.place)
new_ptr = tensor._ptr()
assert old_ptr == new_ptr
def _get_fp32_param(self, pd_var_name, get_parameter_func):
pd_var_name = self._to_pd_var_name(pd_var_name)
pd_vars = get_parameter_func(pd_var_name)
assert isinstance(pd_vars, (list, tuple))
assert len(pd_vars) == 2
pd_var, master_pd_var = pd_vars
if master_pd_var is not None:
assert pd_var._dtype() == paddle.float16
assert master_pd_var._dtype() == paddle.float32
assert pd_var.shape() == master_pd_var.shape()
return np.array(master_pd_var)
else:
assert pd_var._dtype() == paddle.float32
return np.array(pd_var)
def _enc_prefix(self, idx):
return "bert/encoder/layer_{}/".format(idx)
def _to_pd_var_name(self, var):
return var if isinstance(var, (str, bytes)) else var.name
def _record_pd_vars(self,
pd_vars,
tf_vars,
tf_var_prefix="",
weight_transpose=None):
if not isinstance(pd_vars, (list, tuple)):
pd_vars = [pd_vars]
pd_vars = [self._to_pd_var_name(v) for v in pd_vars]
if not isinstance(tf_vars, (list, tuple)):
tf_vars = [tf_vars]
tf_vars = [tf_var_prefix + v for v in tf_vars]
assert len(pd_vars) == len(tf_vars)
for pd_var, tf_var in zip(pd_vars, tf_vars):
assert pd_var not in self.pd_vars_to_tf_vars, pd_var
self.pd_vars_to_tf_vars[pd_var] = tf_var
if weight_transpose:
assert len(pd_vars) == 2
self.transpose_vars.add(pd_vars[0])
def embeddings(self, pd_vars):
return self._record_pd_vars(pd_vars, [
"word_embeddings", "position_embeddings", "token_type_embeddings"
], "bert/embeddings/")
def norm_after_embeddings(self, pd_vars):
return self._record_pd_vars(pd_vars, ["gamma", "beta"],
"bert/embeddings/LayerNorm/")
def enc_attn_query_fc(self, pd_vars, idx, weight_transpose=None):
assert not self.fuse_attn_qkv
prefix = self._enc_prefix(idx) + "attention/self/query/"
return self._record_pd_vars(pd_vars, ["kernel", "bias"], prefix,
weight_transpose)
def enc_attn_key_fc(self, pd_vars, idx, weight_transpose=False):
assert not self.fuse_attn_qkv
prefix = self._enc_prefix(idx) + "attention/self/key/"
return self._record_pd_vars(pd_vars, ["kernel", "bias"], prefix,
weight_transpose)
def enc_attn_value_fc(self, pd_vars, idx, weight_transpose=False):
assert not self.fuse_attn_qkv
prefix = self._enc_prefix(idx) + "attention/self/value/"
return self._record_pd_vars(pd_vars, ["kernel", "bias"], prefix,
weight_transpose)
def enc_fused_attn_qkv_fc(self, pd_vars, idx, weight_transpose=False):
assert self.fuse_attn_qkv
assert self.attn_fused_qkv_weights[idx] is None
assert self.attn_fused_qkv_biases[idx] is None
weight, bias = pd_vars
self.attn_fused_qkv_weights[idx] = self._to_pd_var_name(weight)
self.attn_fused_qkv_biases[idx] = self._to_pd_var_name(bias)
if weight_transpose:
self.transpose_vars.add(self.attn_fused_qkv_weights[idx])
def enc_attn_proj_fc(self, pd_vars, idx, weight_transpose=False):
prefix = self._enc_prefix(idx) + "attention/output/dense/"
return self._record_pd_vars(pd_vars, ["kernel", "bias"], prefix,
weight_transpose)
def enc_attn_norm(self, pd_vars, idx):
prefix = self._enc_prefix(idx) + "attention/output/LayerNorm/"
return self._record_pd_vars(pd_vars, ["gamma", "beta"], prefix)
def enc_intermediate_fc(self, pd_vars, idx, weight_transpose=False):
prefix = self._enc_prefix(idx) + "intermediate/dense/"
return self._record_pd_vars(pd_vars, ["kernel", "bias"], prefix,
weight_transpose)
def enc_output_fc(self, pd_vars, idx, weight_transpose=False):
prefix = self._enc_prefix(idx) + "output/dense/"
return self._record_pd_vars(pd_vars, ["kernel", "bias"], prefix,
weight_transpose)
def enc_output_norm(self, pd_vars, idx):
prefix = self._enc_prefix(idx) + "output/LayerNorm/"
return self._record_pd_vars(pd_vars, ["gamma", "beta"], prefix)
def pooler_fc(self, pd_vars, weight_transpose=False):
return self._record_pd_vars(pd_vars, ["kernel", "bias"],
"bert/pooler/dense/", weight_transpose)
def cls_pred_trans_fc(self, pd_vars, weight_transpose=False):
return self._record_pd_vars(pd_vars, ["kernel", "bias"],
"cls/predictions/transform/dense/",
weight_transpose)
def cls_pred_trans_norm(self, pd_vars):
return self._record_pd_vars(pd_vars, ["gamma", "beta"],
"cls/predictions/transform/LayerNorm/")
def cls_pred_fc_bias(self, pd_vars):
return self._record_pd_vars(pd_vars, "output_bias", "cls/predictions/")
def cls_seq_relation_fc(self, pd_vars, weight_transpose=False):
return self._record_pd_vars(pd_vars, ["output_weights", "output_bias"],
"cls/seq_relationship/", weight_transpose)
class FMHA(Layer):
    """Fused multi-head attention over unpadded (variable-length) input.

    Holds one fused QKV projection (``Wqkv``/``Bqkv``) and dispatches to the
    ``custom_fmha`` kernel; the input is a flat ``[ntokens, hidden]`` tensor
    with per-sequence boundaries given by ``cu_seqlens``.
    """

    def __init__(self, config):
        super(FMHA, self).__init__()
        self.p_dropout = config.attention_probs_dropout_prob
        self.h = config.num_attention_heads
        self.hidden_size = config.hidden_size
        self.d = self.hidden_size // self.h
        self.fused_qkv_bias = config.fused_bias_mha
        self.weight_transpose = True
        self.use_unpad_fmha_mke_opt = config.unpad_fmha_mke_opt
        assert self.d * self.h == self.hidden_size, "Invalid hidden size/num_heads"
        self._dtype = self._helper.get_default_dtype()
        # Weight layout depends on whether the matmul transposes it.
        if self.weight_transpose:
            qkv_shape = [3 * config.hidden_size, config.hidden_size]
        else:
            qkv_shape = [config.hidden_size, 3 * config.hidden_size]
        self.Wqkv = self.create_parameter(
            shape=qkv_shape, attr=None, dtype=self._dtype, is_bias=False)
        self.Bqkv = self.create_parameter(
            shape=[3 * config.hidden_size],
            attr=None,
            dtype=self._dtype,
            is_bias=True)

    def forward(self,
                hidden_states,
                cu_seqlens,
                host_cu_seqlens,
                max_s,
                is_training=True):
        """Apply the fused QKV projection, then the custom FMHA kernel."""
        if self.fused_qkv_bias:
            # Single fused matmul + bias-add kernel.
            qkv = fuse_dense(
                hidden_states,
                self.Wqkv,
                self.Bqkv,
                transx=False,
                transy=self.weight_transpose)
        else:
            qkv = paddle.matmul(
                hidden_states,
                self.Wqkv,
                transpose_x=False,
                transpose_y=self.weight_transpose)
            qkv = qkv + self.Bqkv
        # [ntokens, 3, num_heads, head_dim] as expected by custom_fmha.
        qkv = paddle.reshape(qkv, [-1, 3, self.h, self.d])
        out, _ = custom_fmha(
            qkv,
            cu_seqlens,
            host_cu_seqlens,
            not is_training,
            self.p_dropout,
            zero_tensors=False,
            use_fmha_mke_opt=self.use_unpad_fmha_mke_opt)
        return paddle.reshape(out, [-1, self.hidden_size])
class BertSelfAttention(Layer):
    """Reference (non-fused) BERT self-attention."""

    def __init__(self, config):
        super(BertSelfAttention, self).__init__()
        assert config.hidden_size % config.num_attention_heads == 0
        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size /
                                       config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)
        self.weight_transpose = False
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.softmax = nn.Softmax(axis=-1)

    def transpose_for_scores(self, x):
        # [..., hidden] -> [batch, heads, seq, head_dim]
        head_dims = [self.num_attention_heads, self.attention_head_size]
        x = paddle.reshape(x, [0] * (len(x.shape) - 1) + head_dims)
        return paddle.transpose(x, [0, 2, 1, 3])

    def transpose_key_for_scores(self, x):
        # [..., hidden] -> [batch, heads, head_dim, seq]
        # (keys come out pre-transposed for the score matmul)
        head_dims = [self.num_attention_heads, self.attention_head_size]
        x = paddle.reshape(x, [0] * (len(x.shape) - 1) + head_dims)
        return paddle.transpose(x, [0, 2, 3, 1])

    def forward(self, hidden_states, attention_mask):
        """Return the attention context, shaped back to [..., all_head_size].

        ``attention_mask`` is additive (large negative on padded positions).
        """
        q = self.transpose_for_scores(self.query(hidden_states))
        k = self.transpose_key_for_scores(self.key(hidden_states))
        v = self.transpose_for_scores(self.value(hidden_states))
        scores = paddle.matmul(q, k)
        scores = scores / math.sqrt(self.attention_head_size)
        scores = scores + attention_mask.unsqueeze(1).unsqueeze(2)
        probs = self.dropout(self.softmax(scores))
        context = paddle.matmul(probs, v)
        context = paddle.transpose(context, [0, 2, 1, 3])
        merged_shape = [0] * (len(context.shape) - 2) + [self.all_head_size]
        return paddle.reshape(context, merged_shape)
class BertAttention(Layer):
    """Self-attention followed by its output projection/residual block."""

    def __init__(self, config):
        super(BertAttention, self).__init__()
        self.self = BertSelfAttention(config)
        self.output = BertSelfOutput(config)

    def forward(self, input_tensor, attention_mask):
        context = self.self(input_tensor, attention_mask)
        return self.output(context, input_tensor)
class BertLayer(Layer):
    """One transformer encoder layer: attention block + feed-forward block."""

    def __init__(self, config):
        super(BertLayer, self).__init__()
        assert use_nv_input
        assert not config.pad_fmha
        self.unpad = config.unpad
        if config.unpad_fmha:
            # Unpadded fused MHA requires the unpad pipeline.
            assert self.unpad
            self.attention = UnpadFMHABertAttention(config)
        else:
            self.attention = BertAttention(config)
        self.intermediate = BertIntermediate(config)
        self.output = BertOutput(config)

    def forward(self,
                hidden_states,
                attention_mask,
                seqlen=None,
                host_seqlen=None,
                batch=None):
        if self.unpad:
            attn_out = self.attention(hidden_states, attention_mask, seqlen,
                                      host_seqlen, batch)
        else:
            attn_out = self.attention(hidden_states, attention_mask)
        ffn_out = self.intermediate(attn_out)
        return self.output(ffn_out, attn_out)
class BertEncoder(Layer):
    """Stack of BERT encoder layers.

    With ``use_nv_input`` an NV-style ``BertLayer`` stack is built (optionally
    using the unpadded fused-MHA path); otherwise a standard
    ``nn.TransformerEncoder`` is used.
    """

    def __init__(self, config):
        super(BertEncoder, self).__init__()
        if use_nv_input:
            self.layers = nn.LayerList(
                [BertLayer(config) for _ in range(config.num_hidden_layers)])
        else:
            encoder_layer = nn.TransformerEncoderLayer(
                config.hidden_size,
                config.num_attention_heads,
                config.intermediate_size,
                dropout=config.hidden_dropout_prob,
                activation=config.hidden_act,
                attn_dropout=config.attention_probs_dropout_prob,
                act_dropout=0)
            self.encoder = nn.TransformerEncoder(encoder_layer,
                                                 config.num_hidden_layers)
        self.num_attention_heads = config.num_attention_heads
        self.unpad = config.unpad
        self.unpad_embed = config.unpad_embed
        self.unpad_fmha = config.unpad_fmha
        self.pad_fmha = config.pad_fmha
        self.hidden_size = config.hidden_size
        self.maxseqlen = config.max_seq_length

    def record_ckpt_vars(self, ckpt, idx):
        """Register layer ``idx``'s parameters with the checkpoint mapper
        ``ckpt`` so they can be converted to/from TF-style variable names."""
        if use_nv_input:
            layer = self.layers[idx]
            attn = layer.attention
            if isinstance(attn, UnpadFMHABertAttention):
                # Fused QKV: single weight/bias pair covers Q, K and V.
                ckpt.enc_fused_attn_qkv_fc([attn.fmha.Wqkv, attn.fmha.Bqkv],
                                           idx, attn.fmha.weight_transpose)
            else:
                ckpt.enc_attn_query_fc(
                    [attn.self.query.weight, attn.self.query.bias], idx,
                    attn.self.weight_transpose)
                ckpt.enc_attn_key_fc(
                    [attn.self.key.weight, attn.self.key.bias], idx,
                    attn.self.weight_transpose)
                ckpt.enc_attn_value_fc(
                    [attn.self.value.weight, attn.self.value.bias], idx,
                    attn.self.weight_transpose)
            ckpt.enc_attn_proj_fc(
                [attn.output.dense.weight, attn.output.dense.bias], idx,
                attn.output.weight_transpose)
            # The LayerNorm parameters live on a different submodule depending
            # on whether the fused dropout+add+LN kernel is enabled.
            if attn.output.fused_dropout:
                ckpt.enc_attn_norm([
                    attn.output.fused_dropout_add_ln.weight,
                    attn.output.fused_dropout_add_ln.bias
                ], idx)
            else:
                ckpt.enc_attn_norm([
                    attn.output.layer_norm.weight, attn.output.layer_norm.bias
                ], idx)
            intermediate = layer.intermediate
            last_output = layer.output
            ckpt.enc_intermediate_fc(
                [intermediate.dense.weight, intermediate.dense.bias], idx,
                intermediate.weight_transpose)
            ckpt.enc_output_fc(
                [last_output.dense.weight, last_output.dense.bias], idx,
                last_output.weight_transpose)
            if last_output.fused_dropout:
                ckpt.enc_output_norm([
                    last_output.fused_dropout_add_ln.weight,
                    last_output.fused_dropout_add_ln.bias
                ], idx)
            else:
                ckpt.enc_output_norm([
                    last_output.layer_norm.weight, last_output.layer_norm.bias
                ], idx)
        else:
            # Standard paddle TransformerEncoder layer: weights are never
            # stored transposed.
            layer = self.encoder.layers[idx]
            attn = layer.self_attn
            ckpt.enc_attn_query_fc([attn.q_proj.weight, attn.q_proj.bias], idx,
                                   False)
            ckpt.enc_attn_key_fc([attn.k_proj.weight, attn.k_proj.bias], idx,
                                 False)
            ckpt.enc_attn_value_fc([attn.v_proj.weight, attn.v_proj.bias], idx,
                                   False)
            ckpt.enc_attn_proj_fc([attn.out_proj.weight, attn.out_proj.bias],
                                  idx, False)
            ckpt.enc_attn_norm([layer.norm1.weight, layer.norm1.bias], idx)
            ckpt.enc_intermediate_fc(
                [layer.linear1.weight, layer.linear1.bias], idx, False)
            ckpt.enc_output_fc([layer.linear2.weight, layer.linear2.bias], idx,
                               False)
            ckpt.enc_output_norm([layer.norm2.weight, layer.norm2.bias], idx)

    def forward(self,
                hidden_states,
                attention_mask,
                output_all_encoded_layers=False,
                batch=56,
                maxseqlen=512,
                hidden_size=1024,
                zero_tensor=None,
                attention_indices=None,
                new_attention_mask=None,
                seqlen=None,
                cu_seqlens=None,
                host_cu_seqlens=None,
                actual_seqlens=None,
                maxseqlen_in_batch=None):
        """Run the encoder stack; dispatches to the NV-input path when
        ``use_nv_input`` is set."""
        if use_nv_input:
            return self.forward_with_nv_input(
                hidden_states, attention_mask, output_all_encoded_layers, batch,
                maxseqlen, hidden_size, zero_tensor, attention_indices,
                new_attention_mask, seqlen, cu_seqlens, host_cu_seqlens,
                actual_seqlens, maxseqlen_in_batch)
        if output_all_encoded_layers:
            # Run layer-by-layer so every intermediate output can be kept.
            output = hidden_states
            encoder_outputs = []
            for mod in self.encoder.layers:
                output = mod(output, src_mask=attention_mask)
                encoder_outputs.append(output)
            if self.encoder.norm is not None:
                encoder_outputs[-1] = self.encoder.norm(encoder_outputs[-1])
            return encoder_outputs
        else:
            sequence_output = self.encoder(hidden_states, attention_mask)
            return [sequence_output]

    def forward_with_nv_input(self,
                              hidden_states,
                              attention_mask,
                              output_all_encoded_layers=False,
                              batch=56,
                              maxseqlen=512,
                              hidden_size=1024,
                              zero_tensor=None,
                              attention_indices=None,
                              new_attention_mask=None,
                              seqlen=None,
                              cu_seqlens=None,
                              host_cu_seqlens=None,
                              actual_seqlens=None,
                              maxseqlen_in_batch=None):
        # Unpad inputs and mask. It will remove tokens that are padded. Assume ntokens is total number of tokens (padded and non-padded)
        # and ntokens_unpad is total number of non-padded tokens. Then unpadding performs the following compression of the inputs:
        # hidden_states[ntokens,hidden] -> hidden_states[ntokens_unpad,hidden]
        if not self.unpad_embed and self.unpad_fmha:
            if self.unpad_fmha:
                batch = hidden_states.shape[0]
                maxseqlen = hidden_states.shape[1]
                assert maxseqlen == self.maxseqlen
                hidden_size = hidden_states.shape[2]
                assert hidden_size == self.hidden_size
                # zero_tensor is the padded-size buffer used to scatter the
                # unpadded outputs back at the end.
                zero_tensor = paddle.zeros_like(hidden_states)
                zero_tensor = paddle.reshape(zero_tensor,
                                             [-1, self.hidden_size])
                # attention_indices: indices of the non-zero entries of the
                #     flattened attention mask.
                # seqlen: per-row sum over [bs, max_seq_len], i.e. the actual
                #     sequence length of each row (1-D).
                # cu_seqlens: prefix sum of seqlen.
                # actual_seqlens: same values as seqlen.
                # maxseqlen_in_batch: the max seqlen in a batch.
                print("maxseqlen_in_batch = ", maxseqlen_in_batch)
                # Flatten and keep only the non-padded token rows.
                hidden_states = paddle.reshape(hidden_states,
                                               [-1, self.hidden_size])
                hidden_states = paddle.gather(hidden_states, attention_indices)
        elif self.unpad_fmha:
            attention_mask = new_attention_mask
        all_encoder_layers = []

        # NOTE(review): defined but not used in this path — possibly a
        # leftover hook for gradient checkpointing.
        def custom(start, end):
            def custom_forward(*inputs):
                layers = self.layers[start:end]
                x_ = inputs[0]
                for layer in layers:
                    x_ = layer(x_, inputs[1])
                return x_

            return custom_forward

        for i, layer_module in enumerate(self.layers):
            if seqlen is None and batch is None:
                hidden_states = layer_module(hidden_states, attention_mask)
            else:
                assert seqlen is not None
                assert batch is not None
                if self.unpad_fmha:
                    hidden_states = layer_module(hidden_states, cu_seqlens,
                                                 host_cu_seqlens,
                                                 maxseqlen_in_batch)
                    print("hidden_states:", hidden_states)
                else:
                    hidden_states = layer_module(hidden_states, attention_mask,
                                                 seqlen, batch)
            if output_all_encoded_layers:
                all_encoder_layers.append(hidden_states)
        # Pad inputs and mask. It will insert back zero-padded tokens. Assume ntokens is total number of tokens (padded and non-padded)
        # and ntokens_unpad is total number of non-padded tokens. Then padding performs the following de-compression:
        # hidden_states[ntokens_unpad,hidden] -> hidden_states[ntokens,hidden]
        if self.unpad_fmha:
            hidden_states = paddle.scatter(zero_tensor, attention_indices,
                                           hidden_states)
            # todo: is self.maxseqlen same as maxseqlen?
            hidden_states = paddle.reshape(
                hidden_states, [batch, self.maxseqlen, self.hidden_size])
        all_encoder_layers.append(hidden_states)
        return all_encoder_layers
class UnpadFMHABertAttention(Layer):
    """Attention block backed by the fused-MHA kernel on unpadded input."""

    def __init__(self, config):
        super(UnpadFMHABertAttention, self).__init__()
        self.fmha = FMHA(config)
        self.output = BertSelfOutput(config)

    def forward(self,
                input_tensor,
                cu_seqlens,
                host_cu_seqlens,
                max_s,
                batch_size=None):
        context = self.fmha(
            input_tensor,
            cu_seqlens,
            host_cu_seqlens,
            max_s,
            is_training=self.training)
        return self.output(context, input_tensor)
class FusedDropoutResidualLn(Layer):
    """Fused ``LayerNorm(dropout(hidden_states) + input_tensor)``.

    Wraps the ``custom_fused_dropout_residual_ln`` kernel with LayerNorm
    scale/shift parameters.
    """

    def __init__(self, config, normalized_shape, epsilon=1e-12):
        super(FusedDropoutResidualLn, self).__init__()
        if isinstance(normalized_shape, numbers.Integral):
            normalized_shape = [normalized_shape]
        self._normalized_shape = list(normalized_shape)
        param_shape = [np.prod(self._normalized_shape)]
        self._weight_attr = None
        self._bias_attr = None
        self.weight = self.create_parameter(
            attr=self._weight_attr,
            shape=param_shape,
            default_initializer=Constant(1.0))
        self.bias = self.create_parameter(
            attr=self._bias_attr, shape=param_shape, is_bias=True)
        self.p = config.hidden_dropout_prob
        self.epsilon = epsilon
        # Kept for backward compatibility with external readers; forward()
        # now re-derives the flag from the current training mode (the value
        # cached here went stale after .train()/.eval() switches).
        self.is_test = not self.training
        # todo: use default configs.
        self.fix_seed = False
        self.is_upscale_in_train = True
        self.seed_val = 0

    def forward(self, hidden_states, input_tensor):
        # BUGFIX: use the *current* training mode rather than the value cached
        # at construction time, so .eval()/.train() toggles disable/enable
        # dropout as expected.
        is_test = not self.training
        out, dropout_mask, ln_mean, ln_var, dropout_residual_out = custom_fused_dropout_residual_ln(
            hidden_states, input_tensor, self.weight, self.bias, self.epsilon,
            is_test, self.fix_seed, self.seed_val,
            self.is_upscale_in_train, self.p)
        return out

    def extra_repr(self):
        return 'normalized_shape={}, epsilon={}'.format(self._normalized_shape,
                                                        self.epsilon)
# Dense layer backed by the fused matmul+bias kernel. Supports both the NN
# layout (weight stored [in, out]) and the NT layout (weight stored
# transposed, [out, in]).
class FusedDense(Layer):
    def __init__(self,
                 in_features,
                 out_features,
                 weight_transpose=False,
                 weight_attr=None,
                 bias_attr=None,
                 with_gelu=False,
                 name=None):
        super(FusedDense, self).__init__()
        self._dtype = self._helper.get_default_dtype()
        self._weight_attr = weight_attr
        self._bias_attr = bias_attr
        self.weight_transpose = weight_transpose
        # Storage layout follows the transpose flag.
        weight_shape = ([out_features, in_features] if weight_transpose else
                        [in_features, out_features])
        self.weight = self.create_parameter(
            shape=weight_shape,
            attr=self._weight_attr,
            dtype=self._dtype,
            is_bias=False)
        self.bias = self.create_parameter(
            shape=[out_features],
            attr=self._bias_attr,
            dtype=self._dtype,
            is_bias=True)
        self.with_gelu = with_gelu
        self.name = name

    def forward(self, hidden_states):
        """Fused matmul + bias add (+ optional GELU)."""
        return fuse_dense(
            hidden_states,
            self.weight,
            self.bias,
            transx=False,
            transy=self.weight_transpose,
            with_gelu=self.with_gelu)
class BertSelfOutput(Layer):
    """Attention output projection + dropout + residual + LayerNorm."""

    def __init__(self, config):
        super(BertSelfOutput, self).__init__()
        self.fused_fc_bias = config.fused_bias_fc
        self.fused_dropout = config.fused_dropout_add_ln
        if self.fused_fc_bias:
            self.dense = FusedDense(
                config.hidden_size,
                config.hidden_size,
                weight_transpose=config.weight_transpose)
            self.weight_transpose = config.weight_transpose
        else:
            self.dense = nn.Linear(config.hidden_size, config.hidden_size)
            self.weight_transpose = False
        if self.fused_dropout:
            self.fused_dropout_add_ln = FusedDropoutResidualLn(
                config, config.hidden_size, epsilon=1e-12)
        else:
            self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=1e-12)
            self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        if self.fused_dropout:
            return self.fused_dropout_add_ln(hidden_states, input_tensor)
        hidden_states = self.dropout(hidden_states) + input_tensor
        return self.layer_norm(hidden_states)
class BertIntermediate(Layer):
    """Feed-forward expansion (hidden -> intermediate) with activation."""

    def __init__(self, config):
        super(BertIntermediate, self).__init__()
        self.fused_fc_bias = config.fused_bias_fc
        if self.fused_fc_bias:
            # The fused dense already applies GELU, so no separate activation.
            self.weight_transpose = config.weight_transpose
            self.dense = FusedDense(
                config.hidden_size,
                config.intermediate_size,
                weight_transpose=self.weight_transpose,
                with_gelu=True)
        else:
            self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
            self.weight_transpose = False
            self.intermediate_act_fn = get_activation()

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        if not self.fused_fc_bias:
            hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states
class BertOutput(Layer):
    """FFN projection back to hidden size + dropout + residual + LayerNorm."""

    def __init__(self, config):
        super(BertOutput, self).__init__()
        self.fused_fc_bias = config.fused_bias_fc
        self.fused_dropout = config.fused_dropout_add_ln
        if self.fused_fc_bias:
            self.dense = FusedDense(
                config.intermediate_size,
                config.hidden_size,
                weight_transpose=config.weight_transpose)
            self.weight_transpose = config.weight_transpose
        else:
            self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
            self.weight_transpose = False
        if self.fused_dropout:
            self.fused_dropout_add_ln = FusedDropoutResidualLn(
                config, config.hidden_size, epsilon=1e-12)
        else:
            self.dropout = nn.Dropout(config.hidden_dropout_prob)
            self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=1e-12)
        # todo: add fused_dropout opt.
        self.p = config.hidden_dropout_prob

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        if self.fused_dropout:
            return self.fused_dropout_add_ln(hidden_states, input_tensor)
        hidden_states = self.dropout(hidden_states) + input_tensor
        return self.layer_norm(hidden_states)
class BertEmbeddings(Layer):
    """
    Include embeddings from word, position and token_type embeddings.

    With ``config.unpad_embed`` enabled, padded token positions are dropped
    (gathered out) before the word/position lookups, and the padded-size
    zero buffer needed to scatter results back is returned alongside.
    """

    def __init__(self,
                 vocab_size,
                 hidden_size=768,
                 hidden_dropout_prob=0.1,
                 max_position_embeddings=512,
                 type_vocab_size=16,
                 config=None):
        super(BertEmbeddings, self).__init__()
        self.unpad_embed = False
        if config is not None:
            self.unpad_embed = config.unpad_embed
            self.unpad_fmha = config.unpad_fmha
        if self.unpad_embed:
            self.batch_size = config.batch_size
            self.max_seq_length = config.max_seq_length
        self.hidden_size = hidden_size
        self.word_embeddings = nn.Embedding(
            vocab_size, hidden_size, padding_idx=0)
        self.position_embeddings = nn.Embedding(max_position_embeddings,
                                                hidden_size)
        self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size)
        self.layer_norm = nn.LayerNorm(hidden_size, epsilon=1e-12)
        self.dropout = nn.Dropout(hidden_dropout_prob)

    def forward(self,
                input_ids,
                token_type_ids=None,
                position_ids=None,
                attention_mask=None,
                attention_indices=None,
                seqlen=None,
                cu_seqlens=None,
                actual_seqlens=None,
                maxseqlen_in_batch=None):
        """Return summed+normalized embeddings; in unpad mode additionally
        return the current batch size and the padded-size zero buffer."""
        if position_ids is None:
            position_ids = gen_pos_id(input_ids)
        if token_type_ids is None:
            token_type_ids = paddle.zeros_like(input_ids)
        # todo(@limin29): in order to construct the shape of zero_tensor, we use pad method to compute token_type_embeddings.
        token_type_embeddings = self.token_type_embeddings(token_type_ids)
        if self.unpad_embed:
            assert self.unpad_fmha
            assert attention_mask is not None
            assert attention_indices is not None
            assert seqlen is not None
            assert cu_seqlens is not None
            assert maxseqlen_in_batch is not None
            cur_batch_size = input_ids.shape[0]
            # Padded-size zero buffer, used later to scatter unpadded
            # encoder outputs back to the padded layout.
            zero_tensor = paddle.zeros_like(token_type_embeddings)
            zero_tensor = paddle.reshape(zero_tensor, [-1, self.hidden_size])
            # attention_indices: indices of the non-zero entries of the
            #     flattened attention mask.
            # seqlen: per-row sum over [bs, max_seq_len], i.e. the actual
            #     sequence length of each row (1-D).
            # cu_seqlens: prefix sum of seqlen.
            # actual_seqlens: same values as seqlen.
            # maxseqlen_in_batch: the max seqlen in a batch.
            print("maxseqlen_in_batch = ", maxseqlen_in_batch)
            # Drop the padded positions before the remaining lookups.
            input_ids = paddle.reshape(input_ids, [-1])
            input_ids = paddle.gather(input_ids, attention_indices)
            position_ids = paddle.reshape(position_ids, [-1])
            position_ids = paddle.gather(position_ids, attention_indices)
            # token_type_embeddings were computed padded (see todo above), so
            # gather them down to the unpadded token set too.
            token_type_embeddings = paddle.reshape(token_type_embeddings,
                                                   [-1, self.hidden_size])
            token_type_embeddings = paddle.gather(token_type_embeddings,
                                                  attention_indices)
        input_embedings = self.word_embeddings(input_ids)
        position_embeddings = self.position_embeddings(position_ids)
        embeddings = input_embedings + position_embeddings + token_type_embeddings
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings)
        if not self.unpad_embed:
            return embeddings
        else:
            return embeddings, cur_batch_size, zero_tensor
class BertPooler(Layer):
    """
    Pool the result of BertEncoder.
    """

    def __init__(self, hidden_size, pool_act="tanh"):
        super(BertPooler, self).__init__()
        self.dense = FusedDense(hidden_size, hidden_size)
        self.weight_transpose = self.dense.weight_transpose
        self.activation = nn.Tanh()
        self.pool_act = pool_act
        assert self.pool_act == "tanh"

    def forward(self, hidden_states):
        # "Pool" by transforming the hidden state of the first ([CLS]) token.
        cls_state = hidden_states[:, 0]
        pooled_output = self.dense(cls_state)
        if self.pool_act == "tanh":
            pooled_output = self.activation(pooled_output)
        return pooled_output
class BertModel(nn.Layer):
"""
The bare BERT Model transformer outputting raw hidden-states without any specific head on top.
This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`.
Refer to the superclass documentation for the generic methods.
This model is also a Paddle `paddle.nn.Layer <https://www.paddlepaddle.org.cn/documentation
/docs/en/api/paddle/fluid/dygraph/layers/Layer_en.html>`__ subclass. Use it as a regular Paddle Layer
and refer to the Paddle documentation for all matter related to general usage and behavior.
Args:
vocab_size (int):
Vocabulary size of `inputs_ids` in `BertModel`. Also is the vocab size of token embedding matrix.
Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling `BertModel`.
hidden_size (int, optional):
Dimensionality of the embedding layer, encoder layer and pooler layer. Defaults to `768`.
num_hidden_layers (int, optional):
Number of hidden layers in the Transformer encoder. Defaults to `12`.
num_attention_heads (int, optional):
Number of attention heads for each attention layer in the Transformer encoder.
Defaults to `12`.
intermediate_size (int, optional):
Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors
to ff layers are firstly projected from `hidden_size` to `intermediate_size`,
and then projected back to `hidden_size`. Typically `intermediate_size` is larger than `hidden_size`.
Defaults to `3072`.
hidden_act (str, optional):
The non-linear activation function in the feed-forward layer.
``"gelu"``, ``"relu"`` and any other paddle supported activation functions
are supported. Defaults to `"gelu"`.
hidden_dropout_prob (float, optional):
The dropout probability for all fully connected layers in the embeddings and encoder.
Defaults to `0.1`.
attention_probs_dropout_prob (float, optional):
The dropout probability used in MultiHeadAttention in all encoder layers to drop some attention target.
Defaults to `0.1`.
max_position_embeddings (int, optional):
The maximum value of the dimensionality of position encoding, which dictates the maximum supported length of an input
sequence. Defaults to `512`.
type_vocab_size (int, optional):
The vocabulary size of `token_type_ids`.
Defaults to `16`.
initializer_range (float, optional):
The standard deviation of the normal initializer.
Defaults to 0.02.
pad_token_id (int, optional):
The index of padding token in the token vocabulary.
Defaults to `0`.
pool_act (str, optional):
The non-linear activation function in the pooling layer.
Defaults to `"tanh"`.
"""
def __init__(self, config):
    """Build the BERT backbone from `config`.

    Args:
        config: model configuration carrying sizes, dropout rates and the
            padding/unpadding execution switches.
    """
    super(BertModel, self).__init__()
    # Mirror the padding/unpadding execution switches onto the instance
    # so forward() can branch on them cheaply.
    for flag in ('unpad', 'pad_fmha', 'unpad_embed', 'unpad_fmha'):
        setattr(self, flag, getattr(config, flag))
    self.pad_token_id = config.pad_token_id
    self.initializer_range = config.initializer_range
    self.num_hidden_layers = config.num_hidden_layers
    self.maxseqlen = config.max_seq_length
    self.hidden_size = config.hidden_size
    # Sub-modules: embedding lookup, transformer encoder stack, and the
    # [CLS] pooler.
    self.embeddings = BertEmbeddings(
        config.vocab_size, config.hidden_size, config.hidden_dropout_prob,
        config.max_position_embeddings, config.type_vocab_size, config)
    self.encoder = BertEncoder(config)
    self.pooler = BertPooler(config.hidden_size, config.pool_act)
def record_ckpt_vars(self, ckpt):
    """Register the backbone's parameters with a TF checkpoint helper.

    Each call below tells `ckpt` which paddle parameters correspond to
    which TensorFlow checkpoint tensors; the actual load happens later
    via `ckpt.load(...)`.

    Args:
        ckpt: a TFCkptHelper-like recorder object.
    """
    emb = self.embeddings
    ckpt.embeddings([
        emb.word_embeddings.weight,
        emb.position_embeddings.weight,
        emb.token_type_embeddings.weight,
    ])
    ckpt.norm_after_embeddings([
        emb.layer_norm.weight,
        emb.layer_norm.bias,
    ])
    # One registration per transformer layer.
    for idx in range(self.num_hidden_layers):
        self.encoder.record_ckpt_vars(ckpt, idx)
    pooler = self.pooler
    ckpt.pooler_fc([pooler.dense.weight, pooler.dense.bias],
                   pooler.weight_transpose)
def forward(self,
            input_ids,
            token_type_ids=None,
            position_ids=None,
            attention_mask=None,
            seq_len=None,
            prefix_sum_seq_len=None,
            host_prefix_sum_seq_len=None,
            max_seq_len=None,
            nonzeros_indices=None,
            output_hidden_states=False):
    """Run embeddings -> encoder -> pooler.

    Fix: removed the two debug `print` calls that dumped the full
    last-layer activation and pooled output tensors on every forward
    pass (noisy and expensive during training).

    Args:
        input_ids (Tensor): token ids.
        token_type_ids (Tensor, optional): segment ids; defaulted to
            zeros in the NV-input path when absent.
        position_ids (Tensor, optional): position ids.
        attention_mask (Tensor, optional): padding mask. In the NV-input
            path a {0,1} mask is converted to additive form unless the
            unpad/pad_fmha kernels consume it directly; otherwise it is
            derived from `input_ids == pad_token_id`.
        seq_len, prefix_sum_seq_len, host_prefix_sum_seq_len,
        max_seq_len, nonzeros_indices: unpadding metadata; all required
            when `unpad_embed` or `unpad_fmha` is enabled.
        output_hidden_states (bool, optional): when True, return every
            encoder layer's hidden states instead of only the last.

    Returns:
        tuple: (encoder output(s), pooled [CLS] output).
    """
    # NOTE(review): `use_nv_input` is a module-level flag defined
    # elsewhere in this file — confirm it is set before model build.
    if use_nv_input:
        if attention_mask is None:
            attention_mask = paddle.ones_like(input_ids)
        if token_type_ids is None:
            token_type_ids = paddle.zeros_like(input_ids)
        extended_attention_mask = attention_mask
        if not self.unpad and not self.pad_fmha:
            # Convert the {0,1} mask to additive form: 0 keeps a
            # position, -10000 effectively masks it.
            extended_attention_mask = extended_attention_mask.cast(
                dtype=paddle.float32)
            extended_attention_mask = (
                1.0 - extended_attention_mask) * -10000.0
        attention_mask = extended_attention_mask
    else:
        if attention_mask is None:
            attention_mask = paddle.unsqueeze(
                (input_ids == self.pad_token_id
                 ).astype(self.pooler.dense.weight.dtype) * -1e9,
                axis=[1, 2])
    new_attention_mask = attention_mask
    attention_indices = nonzeros_indices
    seqlen = seq_len
    cu_seqlens = prefix_sum_seq_len
    host_cu_seqlens = host_prefix_sum_seq_len
    maxseqlen_in_batch = max_seq_len
    actual_seqlens = seqlen
    if self.unpad_embed or self.unpad_fmha:
        # Unpadded execution requires the full set of sequence metadata.
        assert attention_indices is not None
        assert seqlen is not None
        assert cu_seqlens is not None
        assert host_cu_seqlens is not None
        assert maxseqlen_in_batch is not None
    if not self.unpad_embed:
        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask)
        # Padded path: the encoder receives None for all unpad metadata.
        cur_batch_size = None
        zero_tensor = None
        attention_indices = None
        new_attention_mask = None
        seqlen = None
        cu_seqlens = None
        host_cu_seqlens = None
        actual_seqlens = None
        maxseqlen_in_batch = None
    else:
        embedding_output, cur_batch_size, zero_tensor = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
            attention_indices=attention_indices,
            seqlen=seqlen,
            cu_seqlens=cu_seqlens,
            actual_seqlens=seqlen,
            maxseqlen_in_batch=maxseqlen_in_batch)
    encoder_outputs = self.encoder(
        embedding_output, attention_mask, output_hidden_states,
        cur_batch_size, self.maxseqlen, self.hidden_size, zero_tensor,
        attention_indices, new_attention_mask, seqlen, cu_seqlens,
        host_cu_seqlens, actual_seqlens, maxseqlen_in_batch)
    pooled_output = self.pooler(encoder_outputs[-1])
    if output_hidden_states:
        return encoder_outputs, pooled_output
    else:
        return encoder_outputs[-1], pooled_output
class BertLMPredictionHead(Layer):
    """
    Masked-LM prediction head: fused dense+GELU transform, layer norm,
    then a (possibly embedding-tied) projection to vocabulary logits.
    """

    def __init__(self, config, embedding_weights=None):
        super(BertLMPredictionHead, self).__init__()
        self.transform = FusedDense(
            config.hidden_size, config.hidden_size, with_gelu=True)
        self.weight_transpose = self.transform.weight_transpose
        # The fused transform hard-codes GELU, so reject other activations.
        assert config.hidden_act == "gelu"
        self.layer_norm = nn.LayerNorm(config.hidden_size, epsilon=1e-12)
        # Tie the decoder to the word embeddings when provided; otherwise
        # allocate an independent projection matrix.
        if embedding_weights is None:
            self.decoder_weight = self.create_parameter(
                shape=[config.vocab_size, config.hidden_size],
                dtype=self.transform.weight.dtype,
                is_bias=False)
        else:
            self.decoder_weight = embedding_weights
        self.decoder_bias = self.create_parameter(
            shape=[config.vocab_size],
            dtype=self.decoder_weight.dtype,
            is_bias=True)

    def forward(self, hidden_states, masked_positions=None):
        """Project hidden states to vocabulary logits.

        When `masked_positions` is given, only those rows are scored:
        gathering before the transform avoids projecting every token.
        """
        states = hidden_states
        if masked_positions is not None:
            states = paddle.reshape(states, [-1, states.shape[-1]])
            states = paddle.tensor.gather(states, masked_positions)
        states = self.transform(states)
        states = self.layer_norm(states)
        return fuse_dense(
            states,
            self.decoder_weight,
            self.decoder_bias,
            transx=False,
            transy=True)
class BertPretrainingHeads(Layer):
    """
    Perform language modeling task and next sentence classification task.

    Fix: removed the debug `print` of the full `sequence_output` tensor
    in `nv_forward` (ran on every step) and dropped dead commented-out
    code.

    Args:
        config: model config; `hidden_size` and `dense_seq_output` are read.
        embedding_weights (Tensor, optional):
            Decoding weights used to map hidden_states to logits of the masked token prediction.
            Its data type should be float32 and its shape is [vocab_size, hidden_size].
            Defaults to `None`, which means use the same weights of the embedding layer.
    """

    def __init__(self, config, embedding_weights=None):
        super(BertPretrainingHeads, self).__init__()
        self.predictions = BertLMPredictionHead(config, embedding_weights)
        # Next-sentence-prediction binary classifier (fused dense).
        self.seq_relationship = FusedDense(config.hidden_size, 2)
        self.seq_relationship_weight_transpose = self.seq_relationship.weight_transpose
        self.dense_seq_output = config.dense_seq_output
        self.share_weight = embedding_weights is not None

    def record_ckpt_vars(self, ckpt):
        """Register head parameters with the TF checkpoint helper."""
        pred_trans = self.predictions.transform
        ckpt.cls_pred_trans_fc([pred_trans.weight, pred_trans.bias],
                               self.predictions.weight_transpose)
        norm = self.predictions.layer_norm
        ckpt.cls_pred_trans_norm([norm.weight, norm.bias])
        # Decoder weight is tied to the embedding matrix, so only the
        # bias needs loading here.
        assert self.share_weight
        ckpt.cls_pred_fc_bias(self.predictions.decoder_bias)
        seq_relation_fc = self.seq_relationship
        ckpt.cls_seq_relation_fc(
            [seq_relation_fc.weight, seq_relation_fc.bias],
            self.seq_relationship_weight_transpose)

    def forward(self, sequence_output, pooled_output, masked_positions=None):
        """
        Args:
            sequence_output(Tensor):
                Sequence of hidden-states at the last layer of the model,
                shape [batch_size, sequence_length, hidden_size], float32.
            pooled_output(Tensor):
                Hidden state of the first token (`[CLS]`), shape
                [batch_size, hidden_size], float32.
            masked_positions(Tensor, optional):
                Positions to score in masked token prediction, int64,
                shape [batch_size, mask_token_num]. Defaults to `None`
                (score every token).

        Returns:
            tuple: (`prediction_scores`, `seq_relationship_score`) —
            masked-token logits and next-sentence logits
            ([batch_size, 2]).
        """
        prediction_scores = self.predictions(sequence_output, masked_positions)
        seq_relationship_score = self.seq_relationship(pooled_output)
        return prediction_scores, seq_relationship_score

    def nv_forward(self,
                   sequence_output,
                   pooled_output,
                   masked_lm_labels,
                   num_valid=None,
                   masked_lm_ids=None,
                   masked_lm_positions=None):
        """NV-data-layout variant of `forward`.

        When `dense_seq_output` is enabled, only the masked positions are
        kept before the (expensive) vocabulary projection.
        """
        if self.dense_seq_output:
            index = masked_lm_positions
            sequence_flattened = paddle.index_select(
                sequence_output.reshape((-1, sequence_output.shape[-1])),
                index=index,
                axis=0)
            sequence_output = sequence_flattened
        prediction_scores = self.predictions(sequence_output)
        seq_relationship_score = self.seq_relationship(pooled_output)
        return prediction_scores, seq_relationship_score
class BertForPretraining(nn.Layer):
    """
    Bert Model with pretraining tasks on top.

    Args:
        bert (:class:`BertModel`):
            An instance of :class:`BertModel`.
        config: model config object, kept for TF checkpoint loading.
    """

    def __init__(self, bert, config):
        super(BertForPretraining, self).__init__()
        self.config = config
        self.bert = bert
        # Tie the MLM decoder weight to the backbone's word embeddings.
        self.cls = BertPretrainingHeads(
            config,
            embedding_weights=self.bert.embeddings.word_embeddings.weight)

    def load_tf_ckpt(self, args, get_parameter_func):
        """Load weights from a TensorFlow checkpoint.

        Registers the backbone and head parameters with a TFCkptHelper,
        then performs the actual load.

        Args:
            args: runtime args; `args.tf_ckpt_path` is the checkpoint path.
            get_parameter_func: callback the helper uses to resolve
                paddle parameters.

        Returns:
            TFCkptHelper: the helper after loading.
        """
        place = utility.get_place()
        ckpt = TFCkptHelper(args, self.config, args.tf_ckpt_path, place)
        self.bert.record_ckpt_vars(ckpt)
        self.cls.record_ckpt_vars(ckpt)
        ckpt.load(get_parameter_func)
        return ckpt

    def forward(self,
                input_ids,
                token_type_ids=None,
                position_ids=None,
                attention_mask=None,
                masked_positions=None,
                seq_len=None,
                prefix_sum_seq_len=None,
                host_prefix_sum_seq_len=None,
                max_seq_len=None,
                nonzeros_indices=None,
                num_valid=None,
                masked_lm_ids=None,
                masked_lm_positions=None):
        r"""
        Args:
            input_ids (Tensor):
                See :class:`BertModel`.
            token_type_ids (Tensor, optional):
                See :class:`BertModel`.
            position_ids (Tensor, optional):
                See :class:`BertModel`.
            attention_mask (Tensor, optional):
                See :class:`BertModel`.
            masked_positions(Tensor, optional):
                See :class:`BertPretrainingHeads`.

        Returns:
            tuple: Returns tuple (``prediction_scores``, ``seq_relationship_score``).
            With the fields:
            - `prediction_scores` (Tensor):
                The scores of masked token prediction. Its data type should be float32.
                If `masked_positions` is None, its shape is [batch_size, sequence_length, vocab_size].
                Otherwise, its shape is [batch_size, mask_token_num, vocab_size].
            - `seq_relationship_score` (Tensor):
                The scores of next sentence prediction.
                Its data type should be float32 and its shape is [batch_size, 2].
        """
        outputs = self.bert(
            input_ids,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            attention_mask=attention_mask,
            seq_len=seq_len,
            prefix_sum_seq_len=prefix_sum_seq_len,
            host_prefix_sum_seq_len=host_prefix_sum_seq_len,
            max_seq_len=max_seq_len,
            nonzeros_indices=nonzeros_indices)
        sequence_output, pooled_output = outputs
        # NOTE(review): this passes 6 positional args, which matches
        # BertPretrainingHeads.nv_forward (installed as `forward` when
        # use_nv_input is set). The 3-argument BertPretrainingHeads.forward
        # would reject this call — confirm use_nv_input is always True on
        # this path.
        prediction_scores, seq_relationship_score = self.cls(
            sequence_output, pooled_output, masked_positions, num_valid,
            masked_lm_ids, masked_lm_positions)
        return prediction_scores, seq_relationship_score
class BertPretrainingCriterion(paddle.nn.Layer):
    """
    Pretraining loss: masked-LM cross entropy plus next-sentence cross
    entropy.

    Fix: in `nv_forward`, the non-dense closure referenced
    `masked_lm_labels_flat`, which was never defined anywhere — a
    guaranteed NameError the first time the closure ran. It is now
    defined as the flattened labels, matching the loss computation.

    Args:
        config: model config; `vocab_size` and `dense_seq_output` are read.
    """

    def __init__(self, config):
        super(BertPretrainingCriterion, self).__init__()
        # CrossEntropyLoss is expensive since the inner reshape (copy)
        self.loss_fn = paddle.nn.loss.CrossEntropyLoss(ignore_index=-1)
        self.vocab_size = config.vocab_size
        self.dense_seq_output = config.dense_seq_output

    def forward(self, prediction_scores, seq_relationship_score,
                masked_lm_labels, next_sentence_labels, masked_lm_scale):
        """
        Args:
            prediction_scores(Tensor): masked-token logits, float32.
            seq_relationship_score(Tensor): NSP logits, [batch_size, 2].
            masked_lm_labels(Tensor): int64 MLM labels; -1 marks ignored
                positions.
            next_sentence_labels(Tensor): int64 NSP labels, [batch_size, 1].
            masked_lm_scale(Tensor or int): normalizer for the MLM loss.

        Returns:
            callable: zero-argument closure producing
            (total_loss as float32, mlm_acc, masked_lm_scale); deferring
            the cast and accuracy lets the caller schedule them lazily.
        """
        masked_lm_loss = F.cross_entropy(
            prediction_scores,
            masked_lm_labels,
            reduction='none',
            ignore_index=-1)
        masked_lm_loss = masked_lm_loss / masked_lm_scale
        next_sentence_loss = F.cross_entropy(
            seq_relationship_score, next_sentence_labels, reduction='none')
        # -1 labels are padding and must not count toward accuracy.
        valid_mask = masked_lm_labels != -1
        total_loss_before_cast = paddle.sum(masked_lm_loss) + paddle.mean(
            next_sentence_loss)

        def func():
            total_loss = total_loss_before_cast.astype('float32')
            mlm_acc = paddle.cast(
                paddle.sum((paddle.argmax(
                    prediction_scores, axis=-1, keepdim=True) ==
                            masked_lm_labels) * valid_mask),
                dtype=masked_lm_scale.dtype) / masked_lm_scale
            return total_loss, mlm_acc, masked_lm_scale

        return func

    def nv_forward(self,
                   prediction_scores,
                   seq_relationship_score,
                   masked_lm_labels,
                   next_sentence_labels,
                   num_valid=None,
                   masked_lm_ids=None,
                   masked_lm_positions=None):
        """NV-input variant: returns a closure yielding
        (total_loss, mlm_acc, num_valid_cnt).

        In dense-seq-output mode the predictions only cover masked tokens
        (labels in `masked_lm_ids`, padded with -1); otherwise the labels
        cover the full sequence and 0 marks ignored positions.
        """
        if self.dense_seq_output:
            loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=-1)
        else:
            loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=0)
        if self.dense_seq_output:
            masked_lm_labels_dense = masked_lm_ids
            masked_lm_loss = loss_fct(
                prediction_scores.reshape((-1, self.vocab_size)),
                masked_lm_labels_dense)
        else:
            masked_lm_loss = loss_fct(
                prediction_scores.reshape((-1, self.vocab_size)),
                masked_lm_labels.reshape((-1, )))
        nsp_loss_fct = paddle.nn.loss.CrossEntropyLoss(ignore_index=-1)
        next_sentence_loss = nsp_loss_fct(
            seq_relationship_score.reshape([-1, 2]),
            next_sentence_labels.reshape([-1]))
        total_loss = masked_lm_loss + next_sentence_loss
        # Masked Language Model Accuracy
        # NOTE: total_loss and mlm_acc use float32 in NV
        if not self.dense_seq_output:
            # BUGFIX: was an undefined name (`masked_lm_labels_flat`) —
            # flatten the labels the same way the loss computation does.
            masked_lm_labels_flat = masked_lm_labels.reshape((-1, ))

            def func():
                valid_mask = masked_lm_labels_flat != 0
                num_valid_cnt = valid_mask.astype('int32').sum(dtype='float32')
                prediction_scores_flat = prediction_scores.reshape(
                    (-1, prediction_scores.shape[-1]))
                mlm_predictions = prediction_scores_flat.argmax(axis=-1)
                mlm_acc = ((mlm_predictions == masked_lm_labels_flat) *
                           valid_mask).sum(dtype='float32') / num_valid_cnt
                return total_loss, mlm_acc, num_valid_cnt
        else:
            # argmax must match the labels' integer dtype for ==.
            dtype = masked_lm_labels_dense.dtype
            if dtype == paddle.int32:
                dtype = 'int32'
            elif dtype == paddle.int64:
                dtype = 'int64'
            else:
                assert False
            mlm_predictions = prediction_scores.argmax(
                axis=-1, dtype=dtype, keepdim=False)
            assert len(mlm_predictions.shape) == 1
            num_valid_cnt = num_valid

            def func():
                mlm_acc = paddle.cast(
                    mlm_predictions == masked_lm_labels_dense,
                    'float32').mean()
                return total_loss, mlm_acc, num_valid_cnt
        return func
# When the NV-style input pipeline is enabled, install the NV variants as
# the active `forward` methods (different argument layout, plus
# dense-seq-output support). Done at import time so callers are unaffected.
if use_nv_input:
    BertPretrainingHeads.forward = BertPretrainingHeads.nv_forward
    BertPretrainingCriterion.forward = BertPretrainingCriterion.nv_forward
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
from paddle.optimizer.lr import LRScheduler
__all__ = ['LinearWarmupPolyDecayScheduler', ]
class LinearWarmupPolyDecayScheduler(LRScheduler):
    """Linear warmup followed by polynomial decay toward `end_lr`.

    An optional startup phase of `startup_warmup_steps` precedes the
    warmup; when there is no startup phase the schedule is shifted by
    one step via `offset_step`.
    """

    def __init__(self,
                 startup_warmup_steps,
                 warmup_steps,
                 total_steps,
                 base_lr,
                 end_lr=0.0,
                 degree=1.0,
                 last_epoch=-1):
        self.startup_warmup_steps = startup_warmup_steps
        # Shift the schedule by one step when no startup phase exists.
        self.offset_step = int(startup_warmup_steps == 0)
        self.warmup_steps = warmup_steps
        self.total_steps = total_steps
        self.base_lr = base_lr
        self.end_lr = end_lr
        self.degree = degree
        super(LinearWarmupPolyDecayScheduler, self).__init__(
            learning_rate=base_lr, last_epoch=last_epoch)

    def get_lr(self):
        """Return the learning rate for the upcoming step."""
        step = self.last_epoch + 1
        warm_step = step - self.offset_step - self.startup_warmup_steps
        if warm_step < self.warmup_steps:
            # Linear ramp; the epsilon guards against division by zero.
            return self.base_lr * (warm_step / (self.warmup_steps + 1e-6))
        progress = min(1, (step - self.offset_step) / self.total_steps)
        decay = (1 - progress)**self.degree
        return (self.base_lr - self.end_lr) * decay + self.end_lr
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
import os.path as osp
import shutil
import json
import requests
import hashlib
import tarfile
import zipfile
import time
import uuid
import threading
from collections import OrderedDict
from .env import DOWNLOAD_SERVER, SUCCESS_STATUS, FAILED_STATUS
try:
    from tqdm import tqdm
except ImportError:  # was a bare `except:`, which also hid unrelated errors
    # Minimal stand-in so progress reporting still works without tqdm:
    # counts units and prints a percentage (or raw count) to stderr.
    class tqdm(object):
        def __init__(self, total=None):
            # total: expected number of units, or None when unknown.
            self.total = total
            # n: units recorded so far.
            self.n = 0

        def update(self, n):
            """Advance the counter by `n` units and redraw the line."""
            self.n += n
            if self.total is None:
                sys.stderr.write("\r{0:.1f} bytes".format(self.n))
            else:
                sys.stderr.write("\r{0:.1f}%".format(100 * self.n / float(
                    self.total)))
            sys.stderr.flush()

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_val, exc_tb):
            # Finish the in-place progress line.
            sys.stderr.write('\n')
from .log import logger
__all__ = ['get_weights_path_from_url']

# Base URL for community-contributed model weights.
COMMUNITY_MODEL_PREFIX = "https://paddlenlp.bj.bcebos.com/models/transformers/community/"
# Local cache directory for downloaded weight files.
WEIGHTS_HOME = osp.expanduser("~/.cache/paddle/hapi/weights")
# Maximum number of attempts made by _download().
DOWNLOAD_RETRY_LIMIT = 3

# Mapping from pretrained-model shorthand names to their archive URLs.
nlp_models = OrderedDict((
    ('RoBERTa-zh-base',
     'https://bert-models.bj.bcebos.com/chinese_roberta_wwm_ext_L-12_H-768_A-12.tar.gz'
     ),
    ('RoBERTa-zh-large',
     'https://bert-models.bj.bcebos.com/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16.tar.gz'
     ),
    ('ERNIE-v2-en-base',
     'https://ernie.bj.bcebos.com/ERNIE_Base_en_stable-2.0.0.tar.gz'),
    ('ERNIE-v2-en-large',
     'https://ernie.bj.bcebos.com/ERNIE_Large_en_stable-2.0.0.tar.gz'),
    ('XLNet-cased-base',
     'https://xlnet.bj.bcebos.com/xlnet_cased_L-12_H-768_A-12.tgz'),
    ('XLNet-cased-large',
     'https://xlnet.bj.bcebos.com/xlnet_cased_L-24_H-1024_A-16.tgz'),
    ('ERNIE-v1-zh-base',
     'https://baidu-nlp.bj.bcebos.com/ERNIE_stable-1.0.1.tar.gz'),
    ('ERNIE-v1-zh-base-max-len-512',
     'https://ernie.bj.bcebos.com/ERNIE_1.0_max-len-512.tar.gz'),
    ('BERT-en-uncased-large-whole-word-masking',
     'https://bert-models.bj.bcebos.com/wwm_uncased_L-24_H-1024_A-16.tar.gz'),
    ('BERT-en-cased-large-whole-word-masking',
     'https://bert-models.bj.bcebos.com/wwm_cased_L-24_H-1024_A-16.tar.gz'),
    ('BERT-en-uncased-base',
     'https://bert-models.bj.bcebos.com/uncased_L-12_H-768_A-12.tar.gz'),
    ('BERT-en-uncased-large',
     'https://bert-models.bj.bcebos.com/uncased_L-24_H-1024_A-16.tar.gz'),
    ('BERT-en-cased-base',
     'https://bert-models.bj.bcebos.com/cased_L-12_H-768_A-12.tar.gz'),
    ('BERT-en-cased-large',
     'https://bert-models.bj.bcebos.com/cased_L-24_H-1024_A-16.tar.gz'),
    ('BERT-multilingual-uncased-base',
     'https://bert-models.bj.bcebos.com/multilingual_L-12_H-768_A-12.tar.gz'),
    ('BERT-multilingual-cased-base',
     'https://bert-models.bj.bcebos.com/multi_cased_L-12_H-768_A-12.tar.gz'),
    ('BERT-zh-base',
     'https://bert-models.bj.bcebos.com/chinese_L-12_H-768_A-12.tar.gz'), ))
def is_url(path):
    """Return True when *path* looks like an HTTP(S) URL.

    Args:
        path (string): candidate URL string.
    """
    return path.startswith(('http://', 'https://'))
def get_weights_path_from_url(url, md5sum=None):
    """Resolve *url* to a local weights file cached under WEIGHTS_HOME.

    Downloads (and verifies *md5sum*) on a cache miss.

    Args:
        url (str): download url
        md5sum (str): md5 sum of download package

    Returns:
        str: a local path to save downloaded weights.

    Examples:
        .. code-block:: python

            from paddle.utils.download import get_weights_path_from_url
            resnet18_pretrained_weight_url = 'https://paddle-hapi.bj.bcebos.com/models/resnet18.pdparams'
            local_weight_path = get_weights_path_from_url(resnet18_pretrained_weight_url)
    """
    return get_path_from_url(url, WEIGHTS_HOME, md5sum)
def _map_path(url, root_dir):
# parse path after download under root_dir
fname = osp.split(url)[-1]
fpath = fname
return osp.join(root_dir, fpath)
def get_path_from_url(url, root_dir, md5sum=None, check_exist=True):
    """ Download from given url to root_dir.
    if file or directory specified by url is exists under
    root_dir, return the path directly, otherwise download
    from url and decompress it, return the path.

    Args:
        url (str): download url
        root_dir (str): root dir for downloading, it should be
                        WEIGHTS_HOME or DATASET_HOME
        md5sum (str): md5 sum of download package
        check_exist (bool): when True, reuse an existing file that
                        passes the md5 check instead of re-downloading.

    Returns:
        str: a local path to save downloaded models & weights & datasets.
    """
    from paddle.fluid.dygraph.parallel import ParallelEnv
    assert is_url(url), "downloading from {} not a url".format(url)
    # parse path after download to decompress under root_dir
    fullpath = _map_path(url, root_dir)
    if osp.exists(fullpath) and check_exist and _md5check(fullpath, md5sum):
        logger.info("Found {}".format(fullpath))
    else:
        # Only one rank per node (local_rank % 8 == 0) downloads; the
        # others poll the filesystem until the file shows up.
        # NOTE(review): waiters may proceed as soon as the file exists,
        # possibly before the download/verification finishes, and the
        # "% 8" assumes 8 devices per node — confirm both with callers.
        if ParallelEnv().local_rank % 8 == 0:
            fullpath = _download(url, root_dir, md5sum)
        else:
            while not os.path.exists(fullpath):
                time.sleep(1)
    # Decompression is likewise performed only by the downloading rank.
    if ParallelEnv().local_rank % 8 == 0:
        if tarfile.is_tarfile(fullpath) or zipfile.is_zipfile(fullpath):
            fullpath = _decompress(fullpath)
    return fullpath
def _download(url, path, md5sum=None):
    """
    Download from url, save to path.

    Retries until the file exists and passes the md5 check, up to
    DOWNLOAD_RETRY_LIMIT attempts.

    url (str): download url
    path (str): download to given path
    md5sum (str, optional): expected md5 of the downloaded file.

    Returns:
        str: full local path of the downloaded file.

    Raises:
        RuntimeError: when retries are exhausted or the server responds
            with a non-200 status code.
    """
    if not osp.exists(path):
        os.makedirs(path)
    fname = osp.split(url)[-1]
    fullname = osp.join(path, fname)
    retry_cnt = 0
    while not (osp.exists(fullname) and _md5check(fullname, md5sum)):
        if retry_cnt < DOWNLOAD_RETRY_LIMIT:
            retry_cnt += 1
        else:
            raise RuntimeError("Download from {} failed. "
                               "Retry limit reached".format(url))
        logger.info("Downloading {} from {}".format(fname, url))
        req = requests.get(url, stream=True)
        if req.status_code != 200:
            raise RuntimeError("Downloading from {} failed with code "
                               "{}!".format(url, req.status_code))
        # For protecting download interupted, download to
        # tmp_fullname firstly, move tmp_fullname to fullname
        # after download finished
        tmp_fullname = fullname + "_tmp"
        total_size = req.headers.get('content-length')
        with open(tmp_fullname, 'wb') as f:
            if total_size:
                # Progress bar counts 1 KiB units (matches chunk_size).
                with tqdm(total=(int(total_size) + 1023) // 1024) as pbar:
                    for chunk in req.iter_content(chunk_size=1024):
                        f.write(chunk)
                        pbar.update(1)
            else:
                # No content-length header: stream without a bar.
                for chunk in req.iter_content(chunk_size=1024):
                    if chunk:
                        f.write(chunk)
        shutil.move(tmp_fullname, fullname)
    return fullname
def _md5check(fullname, md5sum=None):
if md5sum is None:
return True
logger.info("File {} md5 checking...".format(fullname))
md5 = hashlib.md5()
with open(fullname, 'rb') as f:
for chunk in iter(lambda: f.read(4096), b""):
md5.update(chunk)
calc_md5sum = md5.hexdigest()
if calc_md5sum != md5sum:
logger.info("File {} md5 check failed, {}(calc) != "
"{}(base)".format(fullname, calc_md5sum, md5sum))
return False
return True
def _md5(text):
"""
Calculate the md5 value of the input text.
"""
md5code = hashlib.md5(text.encode())
return md5code.hexdigest()
def _decompress(fname):
    """
    Decompress for zip and tar file

    Returns the directory/file path the archive expanded to.

    Raises:
        TypeError: if *fname* is neither a tarfile nor a zipfile.
    """
    logger.info("Decompressing {}...".format(fname))
    # Delegates to the format-specific helpers, which extract into a
    # sibling location of the archive.
    if tarfile.is_tarfile(fname):
        return _uncompress_file_tar(fname)
    if zipfile.is_zipfile(fname):
        return _uncompress_file_zip(fname)
    raise TypeError("Unsupport compress file type {}".format(fname))
def _uncompress_file_zip(filepath):
    """Extract a zip archive next to itself and return the result path.

    Three layouts are handled:
      * a single bare file          -> extracted alongside the archive;
      * a single top-level directory -> extracted alongside the archive;
      * multiple top-level entries   -> extracted into a directory named
                                        after the archive (extension
                                        stripped).
    """
    file_dir = os.path.dirname(filepath)
    # `with` guarantees the archive handle is closed even if extraction
    # raises (the original only closed on the success path).
    with zipfile.ZipFile(filepath, 'r') as files:
        file_list = files.namelist()
        if _is_a_single_file(file_list):
            rootpath = file_list[0]
            uncompressed_path = os.path.join(file_dir, rootpath)
            for item in file_list:
                files.extract(item, file_dir)
        elif _is_a_single_dir(file_list):
            rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[-1]
            uncompressed_path = os.path.join(file_dir, rootpath)
            for item in file_list:
                files.extract(item, file_dir)
        else:
            rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1]
            uncompressed_path = os.path.join(file_dir, rootpath)
            if not os.path.exists(uncompressed_path):
                os.makedirs(uncompressed_path)
            for item in file_list:
                files.extract(item, os.path.join(file_dir, rootpath))
    return uncompressed_path
def _uncompress_file_tar(filepath, mode="r:*"):
    """Extract a tar archive next to itself and return the result path.

    Layout handling mirrors _uncompress_file_zip: single file, single
    top-level directory, or a catch-all directory named after the
    archive.

    NOTE(review): members are extracted without path sanitization, so a
    crafted archive containing ``..`` entries could escape file_dir (tar
    path traversal). Current sources are trusted model mirrors — flag
    only, not silently changed.
    """
    file_dir = os.path.dirname(filepath)
    # Context manager closes the archive even when extraction raises
    # (the original leaked the handle on error).
    with tarfile.open(filepath, mode) as files:
        file_list = files.getnames()
        if _is_a_single_file(file_list):
            rootpath = file_list[0]
            uncompressed_path = os.path.join(file_dir, rootpath)
            files.extractall(file_dir, files.getmembers())
        elif _is_a_single_dir(file_list):
            rootpath = os.path.splitext(file_list[0])[0].split(os.sep)[-1]
            uncompressed_path = os.path.join(file_dir, rootpath)
            files.extractall(file_dir, files.getmembers())
        else:
            rootpath = os.path.splitext(filepath)[0].split(os.sep)[-1]
            uncompressed_path = os.path.join(file_dir, rootpath)
            if not os.path.exists(uncompressed_path):
                os.makedirs(uncompressed_path)
            files.extractall(
                os.path.join(file_dir, rootpath), files.getmembers())
    return uncompressed_path
def _is_a_single_file(file_list):
if len(file_list) == 1 and file_list[0].find(os.sep) < -1:
return True
return False
def _is_a_single_dir(file_list):
new_file_list = []
for file_path in file_list:
if '/' in file_path:
file_path = file_path.replace('/', os.sep)
elif '\\' in file_path:
file_path = file_path.replace('\\', os.sep)
new_file_list.append(file_path)
file_name = new_file_list[0].split(os.sep)[0]
for i in range(1, len(new_file_list)):
if file_name != new_file_list[i].split(os.sep)[0]:
return False
return True
class DownloaderCheck(threading.Thread):
    """
    Check the resource applicability when downloading the models.

    Runs as a daemon-style background thread that pings the download
    server with anonymized usage metadata.
    """

    def __init__(self, task, command="taskflow", addition=None):
        threading.Thread.__init__(self)
        self.command = command
        self.task = task
        self.addition = addition
        # Anonymous per-session id: hashed uuid fragment + unix timestamp.
        self.hash_flag = _md5(str(uuid.uuid1())[9:18]) + "-" + str(
            int(time.time()))

    def uri_path(self, server_url, api):
        """Join *server_url* and *api* with exactly one '/' between."""
        base = server_url[:-1] if server_url.endswith('/') else server_url
        if not api.startswith('/'):
            api = '/' + api
        return base + api

    def request_check(self, task, command, addition):
        """Send the usage ping; returns SUCCESS_STATUS or FAILED_STATUS.

        Any network/JSON/import failure is swallowed and reported as
        FAILED_STATUS — this check must never break a download.
        """
        if task is None:
            return SUCCESS_STATUS
        payload = {'word': self.task}
        api_url = self.uri_path(DOWNLOAD_SERVER, 'search')
        cache_path = os.path.join("~")
        if os.path.exists(cache_path):
            mtime = os.stat(cache_path).st_mtime
        else:
            mtime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        extra = {
            "command": self.command,
            "mtime": mtime,
            "hub_name": self.hash_flag
        }
        if addition is not None:
            extra.update({"addition": addition})
        try:
            import paddle
            payload['hub_version'] = " "
            payload['paddle_version'] = paddle.__version__.split('-')[0]
            payload['extra'] = json.dumps(extra)
            r = requests.get(api_url, payload, timeout=1).json()
            if r.get("update_cache", 0) == 1:
                return SUCCESS_STATUS
            return FAILED_STATUS
        except Exception:
            return FAILED_STATUS

    def run(self):
        self.request_check(self.task, self.command, self.addition)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''
This module stores environment-variable-driven paths used by PaddleNLP.
PPNLP_HOME --> the root directory for storing PaddleNLP related data.
               Defaults to ~/.paddlenlp; users can override it through the
               PPNLP_HOME environment variable.
├─ MODEL_HOME --> Store model files.
└─ DATA_HOME --> Store automatically downloaded datasets.
'''
import os
def _get_user_home():
return os.path.expanduser('~')
def _get_ppnlp_home():
if 'PPNLP_HOME' in os.environ:
home_path = os.environ['PPNLP_HOME']
if os.path.exists(home_path):
if os.path.isdir(home_path):
return home_path
else:
raise RuntimeError(
'The environment variable PPNLP_HOME {} is not a directory.'.
format(home_path))
else:
return home_path
return os.path.join(_get_user_home(), '.paddlenlp')
def _get_sub_home(directory, parent_home=_get_ppnlp_home()):
    """Return (creating it if needed) the sub-directory *directory* under
    *parent_home*.

    NOTE: the default *parent_home* is evaluated once at import time,
    which means _get_ppnlp_home() runs as a module-import side effect.
    """
    home = os.path.join(parent_home, directory)
    # exist_ok avoids the check-then-create race of the original
    # `if not exists: makedirs` pattern (safe across concurrent workers).
    os.makedirs(home, exist_ok=True)
    return home
# Resolved once at import time; _get_sub_home also creates the model and
# dataset cache directories as a side effect.
USER_HOME = _get_user_home()
PPNLP_HOME = _get_ppnlp_home()
MODEL_HOME = _get_sub_home('models')
DATA_HOME = _get_sub_home('datasets')
# Endpoint used by DownloaderCheck for its usage ping.
DOWNLOAD_SERVER = "http://paddlepaddle.org.cn/paddlehub"
# Status codes returned by DownloaderCheck.request_check.
FAILED_STATUS = -1
SUCCESS_STATUS = 0
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import numpy as np
def static_params_to_dygraph(model, static_tensor_dict):
    """Simple tool for converting static-graph parameters to a dygraph
    state dict.

    **NOTE** The model must both support static graph and dygraph mode.

    Args:
        model (nn.Layer): the model of a neural network.
        static_tensor_dict (dict): tensors saved in static mode, usually
            loaded by `paddle.static.load_program_state`, keyed by the
            static parameter names.

    Returns:
        dict: a state dict keyed the same as ``model.state_dict()``.
    """
    return {
        name: static_tensor_dict[param.name]
        for name, param in model.state_dict().items()
    }
def dygraph_params_to_static(model, dygraph_tensor_dict, topo=None):
    """Convert a dygraph parameter dict into a static-graph state dict.

    **NOTE** The model must support both static graph and dygraph mode.

    Args:
        model (nn.Layer): the model of a neural network.
        dygraph_tensor_dict (dict): tensors saved in dygraph mode.
        topo: model-parallel topology; required when any parameter is
            distributed, so that the tensor can be sliced for this rank.

    Returns:
        dict: a state dict keyed by the static-graph parameter names.
    """
    converted = {}
    for name, param in model.state_dict().items():
        if name not in dygraph_tensor_dict:
            print("Miss \t\t", name)
            continue
        tensor = dygraph_tensor_dict[name]
        if not param.is_distributed:
            converted[param.name] = tensor
            continue
        # Distributed parameter: slice the full tensor along the first axis
        # whose size differs from this rank's local parameter shape.
        assert topo is not None
        split_axis = 0
        for axis, size in enumerate(tensor.shape):
            split_axis = axis
            if param.shape[axis] != size:
                break
        converted[param.name] = np.split(
            tensor, topo.mp_info.size, axis=split_axis)[topo.mp_info.rank]
    return converted
class TimeCostAverage(object):
    """Tracks the running average of per-step time cost during training
    and inference."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all recorded samples and restart the average from zero."""
        self.cnt = 0
        self.total_time = 0

    def record(self, usetime):
        """Accumulate one sample: the time spent in the current step."""
        self.cnt += 1
        self.total_time += usetime

    def get_average(self):
        """Return the mean recorded time, or 0 when nothing was recorded."""
        return self.total_time / self.cnt if self.cnt else 0
def get_env_device():
    """Return the device name of the running environment.

    Probes the Paddle build, in priority order, for CUDA, NPU, ROCm and
    XPU support; falls back to ``'cpu'`` when none is compiled in.
    """
    probes = (
        ('gpu', paddle.is_compiled_with_cuda),
        ('npu', paddle.is_compiled_with_npu),
        ('rocm', paddle.is_compiled_with_rocm),
        ('xpu', paddle.is_compiled_with_xpu),
    )
    for device, is_compiled in probes:
        if is_compiled():
            return device
    return 'cpu'
# Copyright (c) 2019-2021 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from argparse import ArgumentParser, REMAINDER
def parse_args():
    """Parse the command line options of the CPU-binding helper.

    Returns:
        argparse.Namespace: the parsed options.
    """
    arg_parser = ArgumentParser(
        description="The script to print PaddlePaddle CPU binding cmd.")
    # (flags, keyword arguments) for every supported option, registered in
    # the original order: launch-layout options first, then the required
    # CPU-topology options.
    option_specs = [
        (["--nnodes"],
         dict(type=int, default=1,
              help="The number of nodes to use for distributed training")),
        (["--node_rank"],
         dict(type=int, default=0,
              help="The rank of the node for multi-node distributed "
                   "training")),
        (["--local_rank"],
         dict(type=int, default=0, help="The local rank.")),
        (["--nproc_per_node"],
         dict(type=int, default=1,
              help="The number of processes to launch on each node, "
                   "for GPU training, this is recommended to be set "
                   "to the number of GPUs in your system so that "
                   "each process can be bound to a single GPU.")),
        (["--no_hyperthreads"],
         dict(action='store_true',
              help='Flag to disable binding to hyperthreads')),
        (["--no_membind"],
         dict(action='store_true', help='Flag to disable memory binding')),
        (["--nsockets_per_node"],
         dict(type=int, required=True,
              help="Number of CPU sockets on a node")),
        (["--ncores_per_socket"],
         dict(type=int, required=True,
              help="Number of CPU cores per socket")),
    ]
    for flags, kwargs in option_specs:
        arg_parser.add_argument(*flags, **kwargs)
    return arg_parser.parse_args()
def main():
    """Print a ``numactl`` command binding this local rank's process to the
    CPU cores (and optionally the memory node) nearest to its GPU.

    The command is written to stdout so a launcher script can prepend it to
    the training command.
    """
    args = parse_args()

    # Topology derived from the command line.
    NSOCKETS = args.nsockets_per_node
    # GPUs per socket, rounded up when nproc_per_node does not divide evenly.
    NGPUS_PER_SOCKET = (args.nproc_per_node // args.nsockets_per_node) + (1 if (
        args.nproc_per_node % args.nsockets_per_node) else 0)
    NCORES_PER_GPU = args.ncores_per_socket // NGPUS_PER_SOCKET

    local_rank = args.local_rank
    # NOTE(review): the original also computed dist_world_size and dist_rank
    # here, but neither was used — removed.

    # [phys_start, phys_end, ht_start, ht_end]: the physical core range for
    # this rank, then the hyperthread-sibling range offset by the total
    # number of physical cores on the node.
    cpu_ranges = [
        local_rank * NCORES_PER_GPU, (local_rank + 1) * NCORES_PER_GPU - 1,
        local_rank * NCORES_PER_GPU +
        (NCORES_PER_GPU * NGPUS_PER_SOCKET * NSOCKETS),
        (local_rank + 1) * NCORES_PER_GPU +
        (NCORES_PER_GPU * NGPUS_PER_SOCKET * NSOCKETS) - 1
    ]
    numactlargs = []
    if args.no_hyperthreads:
        # Bind only to the physical cores.
        numactlargs += ["--physcpubind={}-{}".format(*cpu_ranges[0:2])]
    else:
        # Bind to the physical cores and their hyperthread siblings.
        numactlargs += ["--physcpubind={}-{},{}-{}".format(*cpu_ranges)]
    if not args.no_membind:
        # Bind memory allocations to the NUMA node hosting this rank's cores.
        memnode = local_rank // NGPUS_PER_SOCKET
        numactlargs += ["--membind={}".format(memnode)]
    cmd = ["/usr/bin/numactl"] + numactlargs
    print(" ".join(cmd))


if __name__ == "__main__":
    main()
# Build script for the "functions" pybind11 extension module.
# NOTE(review): the include paths below are hard-coded to one user's
# environment (/public/home/zhangqha/...) — they should eventually come from
# find_package() / cache variables so the project builds elsewhere.
cmake_minimum_required(VERSION 3.4...3.18)
project(functions LANGUAGES CXX)
add_subdirectory(pybind11)
# Python / NumPy headers.
include_directories(/public/home/zhangqha/.conda/envs/hhenv/include/python3.6m)
include_directories(/public/home/zhangqha/.conda/envs/hhenv/lib/python3.6/site-packages/numpy/core/include)
# ROCm / DTK toolkit headers.
include_directories(/public/home/zhangqha/dtk-21.04/hipcub/include)
include_directories(/public/home/zhangqha/dtk-21.04/hiprand/include)
include_directories(/public/home/zhangqha/dtk-21.04/hipsparse/include)
include_directories(/public/home/zhangqha/dtk-21.04/include)
include_directories(/public/home/zhangqha/dtk-21.04/miopen/include)
include_directories(/public/home/zhangqha/dtk-21.04/rccl/include)
include_directories(/public/home/zhangqha/dtk-21.04/rocblas/include)
include_directories(/public/home/zhangqha/dtk-21.04/rocfft/include)
include_directories(/public/home/zhangqha/dtk-21.04/rocprim/include)
include_directories(/public/home/zhangqha/dtk-21.04/rocrand/include)
include_directories(/public/home/zhangqha/dtk-21.04/rocsparse/include)
include_directories(/public/home/zhangqha/dtk-21.04/rocthrust/include)
# PaddlePaddle source tree and its third-party build products.
# (Duplicate include_directories entries from the original — the build dir,
# mklml and xxhash — have been removed; each path is now listed once.)
include_directories(/public/home/zhangqha/for_baidu/Paddle-develop)
include_directories(/public/home/zhangqha/for_baidu/Paddle-develop/build)
include_directories(/public/home/zhangqha/for_baidu/Paddle-develop/build/paddle/fluid/platform)
include_directories(/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/dlpack/src/extern_dlpack/include)
include_directories(/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/eigen3/src/extern_eigen3)
include_directories(/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install)
include_directories(/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/brpc/include)
include_directories(/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/gflags/include)
include_directories(/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/glog/include)
include_directories(/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/gloo/include)
include_directories(/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/gtest/include)
include_directories(/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/leveldb/include)
include_directories(/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/libmct/include)
include_directories(/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/mklml/include)
include_directories(/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/protobuf/include)
include_directories(/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/rocksdb/include)
include_directories(/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/snappy/include)
include_directories(/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/utf8proc/include)
include_directories(/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/warpctc/include)
include_directories(/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/xbyak/include)
include_directories(/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/xbyak/include/xbyak)
include_directories(/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/xxhash/include)
include_directories(/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/install/zlib/include)
include_directories(/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/pocketfft/src)
include_directories(/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/pybind/src/extern_pybind/include)
include_directories(/public/home/zhangqha/for_baidu/Paddle-develop/build/third_party/threadpool/src/extern_threadpool)
include_directories(/public/home/zhangqha/for_baidu/Paddle-develop/paddle/fluid/framework/io)
include_directories(/public/home/zhangqha/for_baidu/Paddle-develop/paddle/fluid/platform)
include_directories(/public/home/zhangqha/for_baidu/Paddle-develop/patches/thrust)
# Compile definitions matching the Paddle ROCm build configuration.
add_definitions(-DPADDLE_WITH_RCCL)
add_definitions(-DEIGEN_USE_HIP)
add_definitions(-DTHRUST_IGNORE_CUB_VERSION_CHECK)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 -D_GLIBCXX_USE_CXX11_ABI=1 -fPIC -D__HIP_PLATFORM_HCC__=1 ")
set(extension_name "functions")
add_definitions("-DMLPERF_EXTENSION_NAME=${extension_name}")
pybind11_add_module(${extension_name} functions.cc)
# Link directly against Paddle's core extension module.
target_link_libraries(${extension_name} PRIVATE /usr/local/lib/python3.6/site-packages/paddle/fluid/core_avx.so)
# How to compile
Set `COMPILE_DIR` to your PaddlePaddle build directory, then run the compile script:
```shell
export COMPILE_DIR=<PaddlePaddle build directory>
python compile.py
```
# This is the CMakeCache file.
# For build in directory: /public/home/zhangqha/bert/pybind/build
# It was generated by CMake: /opt/cmake/bin/cmake
# You can edit this file to change values found and used by cmake.
# If you do not want to change any of the values, simply exit the editor.
# If you do want to change a value, simply edit, save, and exit the editor.
# The syntax for the file is as follows:
# KEY:TYPE=VALUE
# KEY is the name of a variable in the cache.
# TYPE is a hint to GUIs for the type of VALUE, DO NOT EDIT TYPE!.
# VALUE is the current value for the KEY.
########################
# EXTERNAL cache entries
########################
//Path to a program.
CMAKE_ADDR2LINE:FILEPATH=/opt/rh/devtoolset-7/root/usr/bin/addr2line
//Path to a program.
CMAKE_AR:FILEPATH=/opt/rh/devtoolset-7/root/usr/bin/ar
//Choose the type of build, options are: None Debug Release RelWithDebInfo
// MinSizeRel ...
CMAKE_BUILD_TYPE:STRING=Release
//Enable/Disable color output during build.
CMAKE_COLOR_MAKEFILE:BOOL=ON
//CXX compiler
CMAKE_CXX_COMPILER:FILEPATH=/opt/rh/devtoolset-7/root/usr/bin/c++
//A wrapper around 'ar' adding the appropriate '--plugin' option
// for the GCC compiler
CMAKE_CXX_COMPILER_AR:FILEPATH=/opt/rh/devtoolset-7/root/usr/bin/gcc-ar
//A wrapper around 'ranlib' adding the appropriate '--plugin' option
// for the GCC compiler
CMAKE_CXX_COMPILER_RANLIB:FILEPATH=/opt/rh/devtoolset-7/root/usr/bin/gcc-ranlib
//Flags used by the CXX compiler during all build types.
CMAKE_CXX_FLAGS:STRING=
//Flags used by the CXX compiler during DEBUG builds.
CMAKE_CXX_FLAGS_DEBUG:STRING=-g
//Flags used by the CXX compiler during MINSIZEREL builds.
CMAKE_CXX_FLAGS_MINSIZEREL:STRING=-Os -DNDEBUG
//Flags used by the CXX compiler during RELEASE builds.
CMAKE_CXX_FLAGS_RELEASE:STRING=-O3 -DNDEBUG
//Flags used by the CXX compiler during RELWITHDEBINFO builds.
CMAKE_CXX_FLAGS_RELWITHDEBINFO:STRING=-O2 -g -DNDEBUG
//Path to a program.
CMAKE_DLLTOOL:FILEPATH=CMAKE_DLLTOOL-NOTFOUND
//Flags used by the linker during all build types.
CMAKE_EXE_LINKER_FLAGS:STRING=
//Flags used by the linker during DEBUG builds.
CMAKE_EXE_LINKER_FLAGS_DEBUG:STRING=
//Flags used by the linker during MINSIZEREL builds.
CMAKE_EXE_LINKER_FLAGS_MINSIZEREL:STRING=
//Flags used by the linker during RELEASE builds.
CMAKE_EXE_LINKER_FLAGS_RELEASE:STRING=
//Flags used by the linker during RELWITHDEBINFO builds.
CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO:STRING=
//Enable/Disable output of compile commands during generation.
CMAKE_EXPORT_COMPILE_COMMANDS:BOOL=OFF
//User executables (bin)
CMAKE_INSTALL_BINDIR:PATH=bin
//Read-only architecture-independent data (DATAROOTDIR)
CMAKE_INSTALL_DATADIR:PATH=
//Read-only architecture-independent data root (share)
CMAKE_INSTALL_DATAROOTDIR:PATH=share
//Documentation root (DATAROOTDIR/doc/PROJECT_NAME)
CMAKE_INSTALL_DOCDIR:PATH=
//C header files (include)
CMAKE_INSTALL_INCLUDEDIR:PATH=include
//Info documentation (DATAROOTDIR/info)
CMAKE_INSTALL_INFODIR:PATH=
//Object code libraries (lib64)
CMAKE_INSTALL_LIBDIR:PATH=lib64
//Program executables (libexec)
CMAKE_INSTALL_LIBEXECDIR:PATH=libexec
//Locale-dependent data (DATAROOTDIR/locale)
CMAKE_INSTALL_LOCALEDIR:PATH=
//Modifiable single-machine data (var)
CMAKE_INSTALL_LOCALSTATEDIR:PATH=var
//Man documentation (DATAROOTDIR/man)
CMAKE_INSTALL_MANDIR:PATH=
//C header files for non-gcc (/usr/include)
CMAKE_INSTALL_OLDINCLUDEDIR:PATH=/usr/include
//Install path prefix, prepended onto install directories.
CMAKE_INSTALL_PREFIX:PATH=/usr/local
//Run-time variable data (LOCALSTATEDIR/run)
CMAKE_INSTALL_RUNSTATEDIR:PATH=
//System admin executables (sbin)
CMAKE_INSTALL_SBINDIR:PATH=sbin
//Modifiable architecture-independent data (com)
CMAKE_INSTALL_SHAREDSTATEDIR:PATH=com
//Read-only single-machine data (etc)
CMAKE_INSTALL_SYSCONFDIR:PATH=etc
//Path to a program.
CMAKE_LINKER:FILEPATH=/opt/rh/devtoolset-7/root/usr/bin/ld
//Path to a program.
CMAKE_MAKE_PROGRAM:FILEPATH=/usr/bin/gmake
//Flags used by the linker during the creation of modules during
// all build types.
CMAKE_MODULE_LINKER_FLAGS:STRING=
//Flags used by the linker during the creation of modules during
// DEBUG builds.
CMAKE_MODULE_LINKER_FLAGS_DEBUG:STRING=
//Flags used by the linker during the creation of modules during
// MINSIZEREL builds.
CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL:STRING=
//Flags used by the linker during the creation of modules during
// RELEASE builds.
CMAKE_MODULE_LINKER_FLAGS_RELEASE:STRING=
//Flags used by the linker during the creation of modules during
// RELWITHDEBINFO builds.
CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO:STRING=
//Path to a program.
CMAKE_NM:FILEPATH=/opt/rh/devtoolset-7/root/usr/bin/nm
//Path to a program.
CMAKE_OBJCOPY:FILEPATH=/opt/rh/devtoolset-7/root/usr/bin/objcopy
//Path to a program.
CMAKE_OBJDUMP:FILEPATH=/opt/rh/devtoolset-7/root/usr/bin/objdump
//Value Computed by CMake
CMAKE_PROJECT_DESCRIPTION:STATIC=
//Value Computed by CMake
CMAKE_PROJECT_HOMEPAGE_URL:STATIC=
//Value Computed by CMake
CMAKE_PROJECT_NAME:STATIC=functions
//Path to a program.
CMAKE_RANLIB:FILEPATH=/opt/rh/devtoolset-7/root/usr/bin/ranlib
//Path to a program.
CMAKE_READELF:FILEPATH=/opt/rh/devtoolset-7/root/usr/bin/readelf
//Flags used by the linker during the creation of shared libraries
// during all build types.
CMAKE_SHARED_LINKER_FLAGS:STRING=
//Flags used by the linker during the creation of shared libraries
// during DEBUG builds.
CMAKE_SHARED_LINKER_FLAGS_DEBUG:STRING=
//Flags used by the linker during the creation of shared libraries
// during MINSIZEREL builds.
CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL:STRING=
//Flags used by the linker during the creation of shared libraries
// during RELEASE builds.
CMAKE_SHARED_LINKER_FLAGS_RELEASE:STRING=
//Flags used by the linker during the creation of shared libraries
// during RELWITHDEBINFO builds.
CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO:STRING=
//If set, runtime paths are not added when installing shared libraries,
// but are added when building.
CMAKE_SKIP_INSTALL_RPATH:BOOL=NO
//If set, runtime paths are not added when using shared libraries.
CMAKE_SKIP_RPATH:BOOL=NO
//Flags used by the linker during the creation of static libraries
// during all build types.
CMAKE_STATIC_LINKER_FLAGS:STRING=
//Flags used by the linker during the creation of static libraries
// during DEBUG builds.
CMAKE_STATIC_LINKER_FLAGS_DEBUG:STRING=
//Flags used by the linker during the creation of static libraries
// during MINSIZEREL builds.
CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL:STRING=
//Flags used by the linker during the creation of static libraries
// during RELEASE builds.
CMAKE_STATIC_LINKER_FLAGS_RELEASE:STRING=
//Flags used by the linker during the creation of static libraries
// during RELWITHDEBINFO builds.
CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO:STRING=
//Path to a program.
CMAKE_STRIP:FILEPATH=/opt/rh/devtoolset-7/root/usr/bin/strip
//If this value is on, makefiles will be generated without the
// .SILENT directive, and all commands will be echoed to the console
// during the make. This is useful for debugging only. With Visual
// Studio IDE projects all commands are done without /nologo.
CMAKE_VERBOSE_MAKEFILE:BOOL=FALSE
//C++ standard flag, e.g. -std=c++11, -std=c++14, /std:c++14.
// Defaults to C++14 mode.
PYBIND11_CPP_STANDARD:STRING=-std=c++14
//Install pybind11 header files?
PYBIND11_INSTALL:BOOL=OFF
//No help, variable specified on the command line.
PYBIND11_PYTHON_VERSION:UNINITIALIZED=3.6
//Build pybind11 test suite?
PYBIND11_TEST:BOOL=OFF
//Path to a program.
PYTHON_EXECUTABLE:FILEPATH=/usr/local/bin/python3.6
//Path to a library.
PYTHON_LIBRARY:FILEPATH=/usr/local/lib/libpython3.6m.so
//Install pybind11 headers in Python include directory instead
// of default installation prefix
USE_PYTHON_INCLUDE_DIR:BOOL=OFF
//Value Computed by CMake
functions_BINARY_DIR:STATIC=/public/home/zhangqha/bert/pybind/build
//Dependencies for the target
functions_LIB_DEPENDS:STATIC=general;/usr/local/lib/python3.6/site-packages/paddle/fluid/core_avx.so;
//Value Computed by CMake
functions_SOURCE_DIR:STATIC=/public/home/zhangqha/bert/pybind
//Value Computed by CMake
pybind11_BINARY_DIR:STATIC=/public/home/zhangqha/bert/pybind/build/pybind11
//Value Computed by CMake
pybind11_SOURCE_DIR:STATIC=/public/home/zhangqha/bert/pybind/pybind11
########################
# INTERNAL cache entries
########################
//ADVANCED property for variable: CMAKE_ADDR2LINE
CMAKE_ADDR2LINE-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_AR
CMAKE_AR-ADVANCED:INTERNAL=1
//This is the directory where this CMakeCache.txt was created
CMAKE_CACHEFILE_DIR:INTERNAL=/public/home/zhangqha/bert/pybind/build
//Major version of cmake used to create the current loaded cache
CMAKE_CACHE_MAJOR_VERSION:INTERNAL=3
//Minor version of cmake used to create the current loaded cache
CMAKE_CACHE_MINOR_VERSION:INTERNAL=16
//Patch version of cmake used to create the current loaded cache
CMAKE_CACHE_PATCH_VERSION:INTERNAL=3
//ADVANCED property for variable: CMAKE_COLOR_MAKEFILE
CMAKE_COLOR_MAKEFILE-ADVANCED:INTERNAL=1
//Path to CMake executable.
CMAKE_COMMAND:INTERNAL=/opt/cmake/bin/cmake
//Path to cpack program executable.
CMAKE_CPACK_COMMAND:INTERNAL=/opt/cmake/bin/cpack
//Path to ctest program executable.
CMAKE_CTEST_COMMAND:INTERNAL=/opt/cmake/bin/ctest
//ADVANCED property for variable: CMAKE_CXX_COMPILER
CMAKE_CXX_COMPILER-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_CXX_COMPILER_AR
CMAKE_CXX_COMPILER_AR-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_CXX_COMPILER_RANLIB
CMAKE_CXX_COMPILER_RANLIB-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_CXX_FLAGS
CMAKE_CXX_FLAGS-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_CXX_FLAGS_DEBUG
CMAKE_CXX_FLAGS_DEBUG-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_CXX_FLAGS_MINSIZEREL
CMAKE_CXX_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_CXX_FLAGS_RELEASE
CMAKE_CXX_FLAGS_RELEASE-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_CXX_FLAGS_RELWITHDEBINFO
CMAKE_CXX_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_DLLTOOL
CMAKE_DLLTOOL-ADVANCED:INTERNAL=1
//Path to cache edit program executable.
CMAKE_EDIT_COMMAND:INTERNAL=/opt/cmake/bin/ccmake
//Executable file format
CMAKE_EXECUTABLE_FORMAT:INTERNAL=ELF
//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS
CMAKE_EXE_LINKER_FLAGS-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_DEBUG
CMAKE_EXE_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_MINSIZEREL
CMAKE_EXE_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_RELEASE
CMAKE_EXE_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO
CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_EXPORT_COMPILE_COMMANDS
CMAKE_EXPORT_COMPILE_COMMANDS-ADVANCED:INTERNAL=1
//Name of external makefile project generator.
CMAKE_EXTRA_GENERATOR:INTERNAL=
//Name of generator.
CMAKE_GENERATOR:INTERNAL=Unix Makefiles
//Generator instance identifier.
CMAKE_GENERATOR_INSTANCE:INTERNAL=
//Name of generator platform.
CMAKE_GENERATOR_PLATFORM:INTERNAL=
//Name of generator toolset.
CMAKE_GENERATOR_TOOLSET:INTERNAL=
//Source directory with the top level CMakeLists.txt file for this
// project
CMAKE_HOME_DIRECTORY:INTERNAL=/public/home/zhangqha/bert/pybind
//ADVANCED property for variable: CMAKE_INSTALL_BINDIR
CMAKE_INSTALL_BINDIR-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_INSTALL_DATADIR
CMAKE_INSTALL_DATADIR-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_INSTALL_DATAROOTDIR
CMAKE_INSTALL_DATAROOTDIR-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_INSTALL_DOCDIR
CMAKE_INSTALL_DOCDIR-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_INSTALL_INCLUDEDIR
CMAKE_INSTALL_INCLUDEDIR-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_INSTALL_INFODIR
CMAKE_INSTALL_INFODIR-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_INSTALL_LIBDIR
CMAKE_INSTALL_LIBDIR-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_INSTALL_LIBEXECDIR
CMAKE_INSTALL_LIBEXECDIR-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_INSTALL_LOCALEDIR
CMAKE_INSTALL_LOCALEDIR-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_INSTALL_LOCALSTATEDIR
CMAKE_INSTALL_LOCALSTATEDIR-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_INSTALL_MANDIR
CMAKE_INSTALL_MANDIR-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_INSTALL_OLDINCLUDEDIR
CMAKE_INSTALL_OLDINCLUDEDIR-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_INSTALL_RUNSTATEDIR
CMAKE_INSTALL_RUNSTATEDIR-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_INSTALL_SBINDIR
CMAKE_INSTALL_SBINDIR-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_INSTALL_SHAREDSTATEDIR
CMAKE_INSTALL_SHAREDSTATEDIR-ADVANCED:INTERNAL=1
//Install .so files without execute permission.
CMAKE_INSTALL_SO_NO_EXE:INTERNAL=0
//ADVANCED property for variable: CMAKE_INSTALL_SYSCONFDIR
CMAKE_INSTALL_SYSCONFDIR-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_LINKER
CMAKE_LINKER-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_MAKE_PROGRAM
CMAKE_MAKE_PROGRAM-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS
CMAKE_MODULE_LINKER_FLAGS-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_DEBUG
CMAKE_MODULE_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL
CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_RELEASE
CMAKE_MODULE_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO
CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_NM
CMAKE_NM-ADVANCED:INTERNAL=1
//number of local generators
CMAKE_NUMBER_OF_MAKEFILES:INTERNAL=2
//ADVANCED property for variable: CMAKE_OBJCOPY
CMAKE_OBJCOPY-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_OBJDUMP
CMAKE_OBJDUMP-ADVANCED:INTERNAL=1
//Platform information initialized
CMAKE_PLATFORM_INFO_INITIALIZED:INTERNAL=1
//ADVANCED property for variable: CMAKE_RANLIB
CMAKE_RANLIB-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_READELF
CMAKE_READELF-ADVANCED:INTERNAL=1
//Path to CMake installation.
CMAKE_ROOT:INTERNAL=/opt/cmake/share/cmake-3.16
//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS
CMAKE_SHARED_LINKER_FLAGS-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_DEBUG
CMAKE_SHARED_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL
CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_RELEASE
CMAKE_SHARED_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO
CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_SKIP_INSTALL_RPATH
CMAKE_SKIP_INSTALL_RPATH-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_SKIP_RPATH
CMAKE_SKIP_RPATH-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS
CMAKE_STATIC_LINKER_FLAGS-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_DEBUG
CMAKE_STATIC_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL
CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_RELEASE
CMAKE_STATIC_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO
CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
//ADVANCED property for variable: CMAKE_STRIP
CMAKE_STRIP-ADVANCED:INTERNAL=1
//uname command
CMAKE_UNAME:INTERNAL=/usr/bin/uname
//ADVANCED property for variable: CMAKE_VERBOSE_MAKEFILE
CMAKE_VERBOSE_MAKEFILE-ADVANCED:INTERNAL=1
//Details about finding PYTHON
FIND_PACKAGE_MESSAGE_DETAILS_PYTHON:INTERNAL=/usr/local/bin/python3.6
//Details about finding PythonInterp
FIND_PACKAGE_MESSAGE_DETAILS_PythonInterp:INTERNAL=[/usr/local/bin/python3.6][v3.6.8(3.6)]
//Test HAS_CPP14_FLAG
HAS_CPP14_FLAG:INTERNAL=1
//Test HAS_FLTO
HAS_FLTO:INTERNAL=1
PYBIND11_INCLUDE_DIR:INTERNAL=/public/home/zhangqha/bert/pybind/pybind11/include
PYBIND11_LTO_CXX_FLAGS:INTERNAL=-flto;-fno-fat-lto-objects
PYBIND11_LTO_LINKER_FLAGS:INTERNAL=-flto
PYBIND11_VERSION_MAJOR:INTERNAL=2
PYBIND11_VERSION_MINOR:INTERNAL=4
PYBIND11_VERSION_PATCH:INTERNAL=3
//ADVANCED property for variable: PYTHON_EXECUTABLE
PYTHON_EXECUTABLE-ADVANCED:INTERNAL=1
PYTHON_INCLUDE_DIRS:INTERNAL=/usr/local/include/python3.6m
PYTHON_LIBRARIES:INTERNAL=/usr/local/lib/libpython3.6m.so
//ADVANCED property for variable: PYTHON_LIBRARY
PYTHON_LIBRARY-ADVANCED:INTERNAL=1
PYTHON_MODULE_EXTENSION:INTERNAL=.cpython-36m-x86_64-linux-gnu.so
PYTHON_MODULE_PREFIX:INTERNAL=
PYTHON_VERSION_MAJOR:INTERNAL=3
PYTHON_VERSION_MINOR:INTERNAL=6
//CMAKE_INSTALL_PREFIX during last run
_GNUInstallDirs_LAST_CMAKE_INSTALL_PREFIX:INTERNAL=/usr/local
set(CMAKE_CXX_COMPILER "/opt/rh/devtoolset-7/root/usr/bin/c++")
set(CMAKE_CXX_COMPILER_ARG1 "")
set(CMAKE_CXX_COMPILER_ID "GNU")
set(CMAKE_CXX_COMPILER_VERSION "7.3.1")
set(CMAKE_CXX_COMPILER_VERSION_INTERNAL "")
set(CMAKE_CXX_COMPILER_WRAPPER "")
set(CMAKE_CXX_STANDARD_COMPUTED_DEFAULT "14")
set(CMAKE_CXX_COMPILE_FEATURES "cxx_std_98;cxx_template_template_parameters;cxx_std_11;cxx_alias_templates;cxx_alignas;cxx_alignof;cxx_attributes;cxx_auto_type;cxx_constexpr;cxx_decltype;cxx_decltype_incomplete_return_types;cxx_default_function_template_args;cxx_defaulted_functions;cxx_defaulted_move_initializers;cxx_delegating_constructors;cxx_deleted_functions;cxx_enum_forward_declarations;cxx_explicit_conversions;cxx_extended_friend_declarations;cxx_extern_templates;cxx_final;cxx_func_identifier;cxx_generalized_initializers;cxx_inheriting_constructors;cxx_inline_namespaces;cxx_lambdas;cxx_local_type_template_args;cxx_long_long_type;cxx_noexcept;cxx_nonstatic_member_init;cxx_nullptr;cxx_override;cxx_range_for;cxx_raw_string_literals;cxx_reference_qualified_functions;cxx_right_angle_brackets;cxx_rvalue_references;cxx_sizeof_member;cxx_static_assert;cxx_strong_enums;cxx_thread_local;cxx_trailing_return_types;cxx_unicode_literals;cxx_uniform_initialization;cxx_unrestricted_unions;cxx_user_literals;cxx_variadic_macros;cxx_variadic_templates;cxx_std_14;cxx_aggregate_default_initializers;cxx_attribute_deprecated;cxx_binary_literals;cxx_contextual_conversions;cxx_decltype_auto;cxx_digit_separators;cxx_generic_lambdas;cxx_lambda_init_captures;cxx_relaxed_constexpr;cxx_return_type_deduction;cxx_variable_templates;cxx_std_17")
set(CMAKE_CXX98_COMPILE_FEATURES "cxx_std_98;cxx_template_template_parameters")
set(CMAKE_CXX11_COMPILE_FEATURES "cxx_std_11;cxx_alias_templates;cxx_alignas;cxx_alignof;cxx_attributes;cxx_auto_type;cxx_constexpr;cxx_decltype;cxx_decltype_incomplete_return_types;cxx_default_function_template_args;cxx_defaulted_functions;cxx_defaulted_move_initializers;cxx_delegating_constructors;cxx_deleted_functions;cxx_enum_forward_declarations;cxx_explicit_conversions;cxx_extended_friend_declarations;cxx_extern_templates;cxx_final;cxx_func_identifier;cxx_generalized_initializers;cxx_inheriting_constructors;cxx_inline_namespaces;cxx_lambdas;cxx_local_type_template_args;cxx_long_long_type;cxx_noexcept;cxx_nonstatic_member_init;cxx_nullptr;cxx_override;cxx_range_for;cxx_raw_string_literals;cxx_reference_qualified_functions;cxx_right_angle_brackets;cxx_rvalue_references;cxx_sizeof_member;cxx_static_assert;cxx_strong_enums;cxx_thread_local;cxx_trailing_return_types;cxx_unicode_literals;cxx_uniform_initialization;cxx_unrestricted_unions;cxx_user_literals;cxx_variadic_macros;cxx_variadic_templates")
set(CMAKE_CXX14_COMPILE_FEATURES "cxx_std_14;cxx_aggregate_default_initializers;cxx_attribute_deprecated;cxx_binary_literals;cxx_contextual_conversions;cxx_decltype_auto;cxx_digit_separators;cxx_generic_lambdas;cxx_lambda_init_captures;cxx_relaxed_constexpr;cxx_return_type_deduction;cxx_variable_templates")
set(CMAKE_CXX17_COMPILE_FEATURES "cxx_std_17")
set(CMAKE_CXX20_COMPILE_FEATURES "")
set(CMAKE_CXX_PLATFORM_ID "Linux")
set(CMAKE_CXX_SIMULATE_ID "")
set(CMAKE_CXX_COMPILER_FRONTEND_VARIANT "")
set(CMAKE_CXX_SIMULATE_VERSION "")
set(CMAKE_AR "/opt/rh/devtoolset-7/root/usr/bin/ar")
set(CMAKE_CXX_COMPILER_AR "/opt/rh/devtoolset-7/root/usr/bin/gcc-ar")
set(CMAKE_RANLIB "/opt/rh/devtoolset-7/root/usr/bin/ranlib")
set(CMAKE_CXX_COMPILER_RANLIB "/opt/rh/devtoolset-7/root/usr/bin/gcc-ranlib")
set(CMAKE_LINKER "/opt/rh/devtoolset-7/root/usr/bin/ld")
set(CMAKE_MT "")
set(CMAKE_COMPILER_IS_GNUCXX 1)
set(CMAKE_CXX_COMPILER_LOADED 1)
set(CMAKE_CXX_COMPILER_WORKS TRUE)
set(CMAKE_CXX_ABI_COMPILED TRUE)
set(CMAKE_COMPILER_IS_MINGW )
set(CMAKE_COMPILER_IS_CYGWIN )
if(CMAKE_COMPILER_IS_CYGWIN)
set(CYGWIN 1)
set(UNIX 1)
endif()
set(CMAKE_CXX_COMPILER_ENV_VAR "CXX")
if(CMAKE_COMPILER_IS_MINGW)
set(MINGW 1)
endif()
set(CMAKE_CXX_COMPILER_ID_RUN 1)
set(CMAKE_CXX_SOURCE_FILE_EXTENSIONS C;M;c++;cc;cpp;cxx;m;mm;CPP)
set(CMAKE_CXX_IGNORE_EXTENSIONS inl;h;hpp;HPP;H;o;O;obj;OBJ;def;DEF;rc;RC)
foreach (lang C OBJC OBJCXX)
if (CMAKE_${lang}_COMPILER_ID_RUN)
foreach(extension IN LISTS CMAKE_${lang}_SOURCE_FILE_EXTENSIONS)
list(REMOVE_ITEM CMAKE_CXX_SOURCE_FILE_EXTENSIONS ${extension})
endforeach()
endif()
endforeach()
set(CMAKE_CXX_LINKER_PREFERENCE 30)
set(CMAKE_CXX_LINKER_PREFERENCE_PROPAGATES 1)
# Save compiler ABI information.
set(CMAKE_CXX_SIZEOF_DATA_PTR "8")
set(CMAKE_CXX_COMPILER_ABI "ELF")
set(CMAKE_CXX_LIBRARY_ARCHITECTURE "")
if(CMAKE_CXX_SIZEOF_DATA_PTR)
set(CMAKE_SIZEOF_VOID_P "${CMAKE_CXX_SIZEOF_DATA_PTR}")
endif()
if(CMAKE_CXX_COMPILER_ABI)
set(CMAKE_INTERNAL_PLATFORM_ABI "${CMAKE_CXX_COMPILER_ABI}")
endif()
if(CMAKE_CXX_LIBRARY_ARCHITECTURE)
set(CMAKE_LIBRARY_ARCHITECTURE "")
endif()
set(CMAKE_CXX_CL_SHOWINCLUDES_PREFIX "")
if(CMAKE_CXX_CL_SHOWINCLUDES_PREFIX)
set(CMAKE_CL_SHOWINCLUDES_PREFIX "${CMAKE_CXX_CL_SHOWINCLUDES_PREFIX}")
endif()
set(CMAKE_CXX_IMPLICIT_INCLUDE_DIRECTORIES "/opt/dtk/include;/opt/dtk/llvm/include;/opt/rh/devtoolset-7/root/usr/include/c++/7;/opt/rh/devtoolset-7/root/usr/include/c++/7/x86_64-redhat-linux;/opt/rh/devtoolset-7/root/usr/include/c++/7/backward;/opt/rh/devtoolset-7/root/usr/lib/gcc/x86_64-redhat-linux/7/include;/usr/local/include;/opt/rh/devtoolset-7/root/usr/include;/usr/include")
set(CMAKE_CXX_IMPLICIT_LINK_LIBRARIES "stdc++;m;gcc_s;gcc;c;gcc_s;gcc")
set(CMAKE_CXX_IMPLICIT_LINK_DIRECTORIES "/opt/rh/devtoolset-7/root/usr/lib/gcc/x86_64-redhat-linux/7;/opt/rh/devtoolset-7/root/usr/lib64;/lib64;/usr/lib64;/opt/rh/devtoolset-7/root/usr/lib")
set(CMAKE_CXX_IMPLICIT_LINK_FRAMEWORK_DIRECTORIES "")
set(CMAKE_HOST_SYSTEM "Linux-3.10.0-957.el7.x86_64")
set(CMAKE_HOST_SYSTEM_NAME "Linux")
set(CMAKE_HOST_SYSTEM_VERSION "3.10.0-957.el7.x86_64")
set(CMAKE_HOST_SYSTEM_PROCESSOR "x86_64")
set(CMAKE_SYSTEM "Linux-3.10.0-957.el7.x86_64")
set(CMAKE_SYSTEM_NAME "Linux")
set(CMAKE_SYSTEM_VERSION "3.10.0-957.el7.x86_64")
set(CMAKE_SYSTEM_PROCESSOR "x86_64")
set(CMAKE_CROSSCOMPILING "FALSE")
set(CMAKE_SYSTEM_LOADED 1)
/* NOTE(review): this is CMake's auto-generated compiler-identification
   program (CMakeCXXCompilerId.cpp).  CMake compiles it once, then scans
   the resulting binary for the "INFO:..." string literals defined below
   to learn the compiler id, version, platform and architecture.  It is a
   build artifact shipped with / generated by CMake and should live in the
   build directory, not in version control; do not hand-edit. */

/* This source file must have a .cpp extension so that all C++ compilers
   recognize the extension without flags. Borland does not know .cxx for
   example. */
#ifndef __cplusplus
# error "A C compiler has been selected for C++."
#endif

/* Version number components: V=Version, R=Revision, P=Patch
   Version date components: YYYY=Year, MM=Month, DD=Day */
#if defined(__COMO__)
# define COMPILER_ID "Comeau"
  /* __COMO_VERSION__ = VRR */
# define COMPILER_VERSION_MAJOR DEC(__COMO_VERSION__ / 100)
# define COMPILER_VERSION_MINOR DEC(__COMO_VERSION__ % 100)

#elif defined(__INTEL_COMPILER) || defined(__ICC)
# define COMPILER_ID "Intel"
# if defined(_MSC_VER)
#  define SIMULATE_ID "MSVC"
# endif
# if defined(__GNUC__)
#  define SIMULATE_ID "GNU"
# endif
  /* __INTEL_COMPILER = VRP */
# define COMPILER_VERSION_MAJOR DEC(__INTEL_COMPILER/100)
# define COMPILER_VERSION_MINOR DEC(__INTEL_COMPILER/10 % 10)
# if defined(__INTEL_COMPILER_UPDATE)
#  define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER_UPDATE)
# else
#  define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER % 10)
# endif
# if defined(__INTEL_COMPILER_BUILD_DATE)
  /* __INTEL_COMPILER_BUILD_DATE = YYYYMMDD */
#  define COMPILER_VERSION_TWEAK DEC(__INTEL_COMPILER_BUILD_DATE)
# endif
# if defined(_MSC_VER)
   /* _MSC_VER = VVRR */
#  define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
#  define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
# endif
# if defined(__GNUC__)
#  define SIMULATE_VERSION_MAJOR DEC(__GNUC__)
# elif defined(__GNUG__)
#  define SIMULATE_VERSION_MAJOR DEC(__GNUG__)
# endif
# if defined(__GNUC_MINOR__)
#  define SIMULATE_VERSION_MINOR DEC(__GNUC_MINOR__)
# endif
# if defined(__GNUC_PATCHLEVEL__)
#  define SIMULATE_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
# endif

#elif defined(__PATHCC__)
# define COMPILER_ID "PathScale"
# define COMPILER_VERSION_MAJOR DEC(__PATHCC__)
# define COMPILER_VERSION_MINOR DEC(__PATHCC_MINOR__)
# if defined(__PATHCC_PATCHLEVEL__)
#  define COMPILER_VERSION_PATCH DEC(__PATHCC_PATCHLEVEL__)
# endif

#elif defined(__BORLANDC__) && defined(__CODEGEARC_VERSION__)
# define COMPILER_ID "Embarcadero"
# define COMPILER_VERSION_MAJOR HEX(__CODEGEARC_VERSION__>>24 & 0x00FF)
# define COMPILER_VERSION_MINOR HEX(__CODEGEARC_VERSION__>>16 & 0x00FF)
# define COMPILER_VERSION_PATCH DEC(__CODEGEARC_VERSION__ & 0xFFFF)

#elif defined(__BORLANDC__)
# define COMPILER_ID "Borland"
  /* __BORLANDC__ = 0xVRR */
# define COMPILER_VERSION_MAJOR HEX(__BORLANDC__>>8)
# define COMPILER_VERSION_MINOR HEX(__BORLANDC__ & 0xFF)

#elif defined(__WATCOMC__) && __WATCOMC__ < 1200
# define COMPILER_ID "Watcom"
  /* __WATCOMC__ = VVRR */
# define COMPILER_VERSION_MAJOR DEC(__WATCOMC__ / 100)
# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 10) % 10)
# if (__WATCOMC__ % 10) > 0
#  define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10)
# endif

#elif defined(__WATCOMC__)
# define COMPILER_ID "OpenWatcom"
  /* __WATCOMC__ = VVRP + 1100 */
# define COMPILER_VERSION_MAJOR DEC((__WATCOMC__ - 1100) / 100)
# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 10) % 10)
# if (__WATCOMC__ % 10) > 0
#  define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10)
# endif

#elif defined(__SUNPRO_CC)
# define COMPILER_ID "SunPro"
# if __SUNPRO_CC >= 0x5100
   /* __SUNPRO_CC = 0xVRRP */
#  define COMPILER_VERSION_MAJOR HEX(__SUNPRO_CC>>12)
#  define COMPILER_VERSION_MINOR HEX(__SUNPRO_CC>>4 & 0xFF)
#  define COMPILER_VERSION_PATCH HEX(__SUNPRO_CC & 0xF)
# else
   /* __SUNPRO_CC = 0xVRP */
#  define COMPILER_VERSION_MAJOR HEX(__SUNPRO_CC>>8)
#  define COMPILER_VERSION_MINOR HEX(__SUNPRO_CC>>4 & 0xF)
#  define COMPILER_VERSION_PATCH HEX(__SUNPRO_CC & 0xF)
# endif

#elif defined(__HP_aCC)
# define COMPILER_ID "HP"
  /* __HP_aCC = VVRRPP */
# define COMPILER_VERSION_MAJOR DEC(__HP_aCC/10000)
# define COMPILER_VERSION_MINOR DEC(__HP_aCC/100 % 100)
# define COMPILER_VERSION_PATCH DEC(__HP_aCC % 100)

#elif defined(__DECCXX)
# define COMPILER_ID "Compaq"
  /* __DECCXX_VER = VVRRTPPPP */
# define COMPILER_VERSION_MAJOR DEC(__DECCXX_VER/10000000)
# define COMPILER_VERSION_MINOR DEC(__DECCXX_VER/100000 % 100)
# define COMPILER_VERSION_PATCH DEC(__DECCXX_VER % 10000)

#elif defined(__IBMCPP__) && defined(__COMPILER_VER__)
# define COMPILER_ID "zOS"
  /* __IBMCPP__ = VRP */
# define COMPILER_VERSION_MAJOR DEC(__IBMCPP__/100)
# define COMPILER_VERSION_MINOR DEC(__IBMCPP__/10 % 10)
# define COMPILER_VERSION_PATCH DEC(__IBMCPP__ % 10)

#elif defined(__ibmxl__) && defined(__clang__)
# define COMPILER_ID "XLClang"
# define COMPILER_VERSION_MAJOR DEC(__ibmxl_version__)
# define COMPILER_VERSION_MINOR DEC(__ibmxl_release__)
# define COMPILER_VERSION_PATCH DEC(__ibmxl_modification__)
# define COMPILER_VERSION_TWEAK DEC(__ibmxl_ptf_fix_level__)

#elif defined(__IBMCPP__) && !defined(__COMPILER_VER__) && __IBMCPP__ >= 800
# define COMPILER_ID "XL"
  /* __IBMCPP__ = VRP */
# define COMPILER_VERSION_MAJOR DEC(__IBMCPP__/100)
# define COMPILER_VERSION_MINOR DEC(__IBMCPP__/10 % 10)
# define COMPILER_VERSION_PATCH DEC(__IBMCPP__ % 10)

#elif defined(__IBMCPP__) && !defined(__COMPILER_VER__) && __IBMCPP__ < 800
# define COMPILER_ID "VisualAge"
  /* __IBMCPP__ = VRP */
# define COMPILER_VERSION_MAJOR DEC(__IBMCPP__/100)
# define COMPILER_VERSION_MINOR DEC(__IBMCPP__/10 % 10)
# define COMPILER_VERSION_PATCH DEC(__IBMCPP__ % 10)

#elif defined(__PGI)
# define COMPILER_ID "PGI"
# define COMPILER_VERSION_MAJOR DEC(__PGIC__)
# define COMPILER_VERSION_MINOR DEC(__PGIC_MINOR__)
# if defined(__PGIC_PATCHLEVEL__)
#  define COMPILER_VERSION_PATCH DEC(__PGIC_PATCHLEVEL__)
# endif

#elif defined(_CRAYC)
# define COMPILER_ID "Cray"
# define COMPILER_VERSION_MAJOR DEC(_RELEASE_MAJOR)
# define COMPILER_VERSION_MINOR DEC(_RELEASE_MINOR)

#elif defined(__TI_COMPILER_VERSION__)
# define COMPILER_ID "TI"
  /* __TI_COMPILER_VERSION__ = VVVRRRPPP */
# define COMPILER_VERSION_MAJOR DEC(__TI_COMPILER_VERSION__/1000000)
# define COMPILER_VERSION_MINOR DEC(__TI_COMPILER_VERSION__/1000 % 1000)
# define COMPILER_VERSION_PATCH DEC(__TI_COMPILER_VERSION__ % 1000)

#elif defined(__FUJITSU) || defined(__FCC_VERSION) || defined(__fcc_version)
# define COMPILER_ID "Fujitsu"

#elif defined(__ghs__)
# define COMPILER_ID "GHS"
/* __GHS_VERSION_NUMBER = VVVVRP */
# ifdef __GHS_VERSION_NUMBER
# define COMPILER_VERSION_MAJOR DEC(__GHS_VERSION_NUMBER / 100)
# define COMPILER_VERSION_MINOR DEC(__GHS_VERSION_NUMBER / 10 % 10)
# define COMPILER_VERSION_PATCH DEC(__GHS_VERSION_NUMBER % 10)
# endif

#elif defined(__SCO_VERSION__)
# define COMPILER_ID "SCO"

#elif defined(__ARMCC_VERSION) && !defined(__clang__)
# define COMPILER_ID "ARMCC"
#if __ARMCC_VERSION >= 1000000
  /* __ARMCC_VERSION = VRRPPPP */
  # define COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/1000000)
  # define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 100)
  # define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION % 10000)
#else
  /* __ARMCC_VERSION = VRPPPP */
  # define COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/100000)
  # define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 10)
  # define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION % 10000)
#endif

#elif defined(__clang__) && defined(__apple_build_version__)
# define COMPILER_ID "AppleClang"
# if defined(_MSC_VER)
#  define SIMULATE_ID "MSVC"
# endif
# define COMPILER_VERSION_MAJOR DEC(__clang_major__)
# define COMPILER_VERSION_MINOR DEC(__clang_minor__)
# define COMPILER_VERSION_PATCH DEC(__clang_patchlevel__)
# if defined(_MSC_VER)
   /* _MSC_VER = VVRR */
#  define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
#  define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
# endif
# define COMPILER_VERSION_TWEAK DEC(__apple_build_version__)

#elif defined(__clang__) && defined(__ARMCOMPILER_VERSION)
# define COMPILER_ID "ARMClang"
  # define COMPILER_VERSION_MAJOR DEC(__ARMCOMPILER_VERSION/1000000)
  # define COMPILER_VERSION_MINOR DEC(__ARMCOMPILER_VERSION/10000 % 100)
  # define COMPILER_VERSION_PATCH DEC(__ARMCOMPILER_VERSION % 10000)
# define COMPILER_VERSION_INTERNAL DEC(__ARMCOMPILER_VERSION)

#elif defined(__clang__)
# define COMPILER_ID "Clang"
# if defined(_MSC_VER)
#  define SIMULATE_ID "MSVC"
# endif
# define COMPILER_VERSION_MAJOR DEC(__clang_major__)
# define COMPILER_VERSION_MINOR DEC(__clang_minor__)
# define COMPILER_VERSION_PATCH DEC(__clang_patchlevel__)
# if defined(_MSC_VER)
   /* _MSC_VER = VVRR */
#  define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
#  define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
# endif

#elif defined(__GNUC__) || defined(__GNUG__)
# define COMPILER_ID "GNU"
# if defined(__GNUC__)
#  define COMPILER_VERSION_MAJOR DEC(__GNUC__)
# else
#  define COMPILER_VERSION_MAJOR DEC(__GNUG__)
# endif
# if defined(__GNUC_MINOR__)
#  define COMPILER_VERSION_MINOR DEC(__GNUC_MINOR__)
# endif
# if defined(__GNUC_PATCHLEVEL__)
#  define COMPILER_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
# endif

#elif defined(_MSC_VER)
# define COMPILER_ID "MSVC"
  /* _MSC_VER = VVRR */
# define COMPILER_VERSION_MAJOR DEC(_MSC_VER / 100)
# define COMPILER_VERSION_MINOR DEC(_MSC_VER % 100)
# if defined(_MSC_FULL_VER)
#  if _MSC_VER >= 1400
    /* _MSC_FULL_VER = VVRRPPPPP */
#   define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 100000)
#  else
    /* _MSC_FULL_VER = VVRRPPPP */
#   define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 10000)
#  endif
# endif
# if defined(_MSC_BUILD)
#  define COMPILER_VERSION_TWEAK DEC(_MSC_BUILD)
# endif

#elif defined(__VISUALDSPVERSION__) || defined(__ADSPBLACKFIN__) || defined(__ADSPTS__) || defined(__ADSP21000__)
# define COMPILER_ID "ADSP"
#if defined(__VISUALDSPVERSION__)
  /* __VISUALDSPVERSION__ = 0xVVRRPP00 */
# define COMPILER_VERSION_MAJOR HEX(__VISUALDSPVERSION__>>24)
# define COMPILER_VERSION_MINOR HEX(__VISUALDSPVERSION__>>16 & 0xFF)
# define COMPILER_VERSION_PATCH HEX(__VISUALDSPVERSION__>>8 & 0xFF)
#endif

#elif defined(__IAR_SYSTEMS_ICC__) || defined(__IAR_SYSTEMS_ICC)
# define COMPILER_ID "IAR"
# if defined(__VER__) && defined(__ICCARM__)
#  define COMPILER_VERSION_MAJOR DEC((__VER__) / 1000000)
#  define COMPILER_VERSION_MINOR DEC(((__VER__) / 1000) % 1000)
#  define COMPILER_VERSION_PATCH DEC((__VER__) % 1000)
#  define COMPILER_VERSION_INTERNAL DEC(__IAR_SYSTEMS_ICC__)
# elif defined(__VER__) && (defined(__ICCAVR__) || defined(__ICCRX__) || defined(__ICCRH850__) || defined(__ICCRL78__) || defined(__ICC430__) || defined(__ICCRISCV__) || defined(__ICCV850__) || defined(__ICC8051__))
#  define COMPILER_VERSION_MAJOR DEC((__VER__) / 100)
#  define COMPILER_VERSION_MINOR DEC((__VER__) - (((__VER__) / 100)*100))
#  define COMPILER_VERSION_PATCH DEC(__SUBVERSION__)
#  define COMPILER_VERSION_INTERNAL DEC(__IAR_SYSTEMS_ICC__)
# endif

/* These compilers are either not known or too old to define an
  identification macro.  Try to identify the platform and guess that
  it is the native compiler.  */
#elif defined(__hpux) || defined(__hpua)
# define COMPILER_ID "HP"

#else /* unknown compiler */
# define COMPILER_ID ""
#endif

/* Construct the string literal in pieces to prevent the source from
   getting matched.  Store it in a pointer rather than an array
   because some compilers will just produce instructions to fill the
   array rather than assigning a pointer to a static array.  */
char const* info_compiler = "INFO" ":" "compiler[" COMPILER_ID "]";
#ifdef SIMULATE_ID
char const* info_simulate = "INFO" ":" "simulate[" SIMULATE_ID "]";
#endif

#ifdef __QNXNTO__
char const* qnxnto = "INFO" ":" "qnxnto[]";
#endif

#if defined(__CRAYXE) || defined(__CRAYXC)
char const *info_cray = "INFO" ":" "compiler_wrapper[CrayPrgEnv]";
#endif

#define STRINGIFY_HELPER(X) #X
#define STRINGIFY(X) STRINGIFY_HELPER(X)

/* Identify known platforms by name.  */
#if defined(__linux) || defined(__linux__) || defined(linux)
# define PLATFORM_ID "Linux"

#elif defined(__CYGWIN__)
# define PLATFORM_ID "Cygwin"

#elif defined(__MINGW32__)
# define PLATFORM_ID "MinGW"

#elif defined(__APPLE__)
# define PLATFORM_ID "Darwin"

#elif defined(_WIN32) || defined(__WIN32__) || defined(WIN32)
# define PLATFORM_ID "Windows"

#elif defined(__FreeBSD__) || defined(__FreeBSD)
# define PLATFORM_ID "FreeBSD"

#elif defined(__NetBSD__) || defined(__NetBSD)
# define PLATFORM_ID "NetBSD"

#elif defined(__OpenBSD__) || defined(__OPENBSD)
# define PLATFORM_ID "OpenBSD"

#elif defined(__sun) || defined(sun)
# define PLATFORM_ID "SunOS"

#elif defined(_AIX) || defined(__AIX) || defined(__AIX__) || defined(__aix) || defined(__aix__)
# define PLATFORM_ID "AIX"

#elif defined(__hpux) || defined(__hpux__)
# define PLATFORM_ID "HP-UX"

#elif defined(__HAIKU__)
# define PLATFORM_ID "Haiku"

#elif defined(__BeOS) || defined(__BEOS__) || defined(_BEOS)
# define PLATFORM_ID "BeOS"

#elif defined(__QNX__) || defined(__QNXNTO__)
# define PLATFORM_ID "QNX"

#elif defined(__tru64) || defined(_tru64) || defined(__TRU64__)
# define PLATFORM_ID "Tru64"

#elif defined(__riscos) || defined(__riscos__)
# define PLATFORM_ID "RISCos"

#elif defined(__sinix) || defined(__sinix__) || defined(__SINIX__)
# define PLATFORM_ID "SINIX"

#elif defined(__UNIX_SV__)
# define PLATFORM_ID "UNIX_SV"

#elif defined(__bsdos__)
# define PLATFORM_ID "BSDOS"

#elif defined(_MPRAS) || defined(MPRAS)
# define PLATFORM_ID "MP-RAS"

#elif defined(__osf) || defined(__osf__)
# define PLATFORM_ID "OSF1"

#elif defined(_SCO_SV) || defined(SCO_SV) || defined(sco_sv)
# define PLATFORM_ID "SCO_SV"

#elif defined(__ultrix) || defined(__ultrix__) || defined(_ULTRIX)
# define PLATFORM_ID "ULTRIX"

#elif defined(__XENIX__) || defined(_XENIX) || defined(XENIX)
# define PLATFORM_ID "Xenix"

#elif defined(__WATCOMC__)
# if defined(__LINUX__)
#  define PLATFORM_ID "Linux"

# elif defined(__DOS__)
#  define PLATFORM_ID "DOS"

# elif defined(__OS2__)
#  define PLATFORM_ID "OS2"

# elif defined(__WINDOWS__)
#  define PLATFORM_ID "Windows3x"

# else /* unknown platform */
#  define PLATFORM_ID
# endif

#elif defined(__INTEGRITY)
# if defined(INT_178B)
#  define PLATFORM_ID "Integrity178"

# else /* regular Integrity */
#  define PLATFORM_ID "Integrity"
# endif

#else /* unknown platform */
# define PLATFORM_ID

#endif

/* For windows compilers MSVC and Intel we can determine
   the architecture of the compiler being used.  This is because
   the compilers do not have flags that can change the architecture,
   but rather depend on which compiler is being used
*/
#if defined(_WIN32) && defined(_MSC_VER)
# if defined(_M_IA64)
#  define ARCHITECTURE_ID "IA64"

# elif defined(_M_X64) || defined(_M_AMD64)
#  define ARCHITECTURE_ID "x64"

# elif defined(_M_IX86)
#  define ARCHITECTURE_ID "X86"

# elif defined(_M_ARM64)
#  define ARCHITECTURE_ID "ARM64"

# elif defined(_M_ARM)
#  if _M_ARM == 4
#   define ARCHITECTURE_ID "ARMV4I"
#  elif _M_ARM == 5
#   define ARCHITECTURE_ID "ARMV5I"
#  else
#   define ARCHITECTURE_ID "ARMV" STRINGIFY(_M_ARM)
#  endif

# elif defined(_M_MIPS)
#  define ARCHITECTURE_ID "MIPS"

# elif defined(_M_SH)
#  define ARCHITECTURE_ID "SHx"

# else /* unknown architecture */
#  define ARCHITECTURE_ID ""
# endif

#elif defined(__WATCOMC__)
# if defined(_M_I86)
#  define ARCHITECTURE_ID "I86"

# elif defined(_M_IX86)
#  define ARCHITECTURE_ID "X86"

# else /* unknown architecture */
#  define ARCHITECTURE_ID ""
# endif

#elif defined(__IAR_SYSTEMS_ICC__) || defined(__IAR_SYSTEMS_ICC)
# if defined(__ICCARM__)
#  define ARCHITECTURE_ID "ARM"

# elif defined(__ICCRX__)
#  define ARCHITECTURE_ID "RX"

# elif defined(__ICCRH850__)
#  define ARCHITECTURE_ID "RH850"

# elif defined(__ICCRL78__)
#  define ARCHITECTURE_ID "RL78"

# elif defined(__ICCRISCV__)
#  define ARCHITECTURE_ID "RISCV"

# elif defined(__ICCAVR__)
#  define ARCHITECTURE_ID "AVR"

# elif defined(__ICC430__)
#  define ARCHITECTURE_ID "MSP430"

# elif defined(__ICCV850__)
#  define ARCHITECTURE_ID "V850"

# elif defined(__ICC8051__)
#  define ARCHITECTURE_ID "8051"

# else /* unknown architecture */
#  define ARCHITECTURE_ID ""
# endif

#elif defined(__ghs__)
# if defined(__PPC64__)
#  define ARCHITECTURE_ID "PPC64"

# elif defined(__ppc__)
#  define ARCHITECTURE_ID "PPC"

# elif defined(__ARM__)
#  define ARCHITECTURE_ID "ARM"

# elif defined(__x86_64__)
#  define ARCHITECTURE_ID "x64"

# elif defined(__i386__)
#  define ARCHITECTURE_ID "X86"

# else /* unknown architecture */
#  define ARCHITECTURE_ID ""
# endif
#else
#  define ARCHITECTURE_ID
#endif

/* Convert integer to decimal digit literals.  */
/* NOTE(review): each DEC/HEX expansion yields eight comma-separated char
   literals; they are spliced directly into the char arrays below, which is
   why the version strings are built as arrays rather than string literals. */
#define DEC(n)                   \
  ('0' + (((n) / 10000000)%10)), \
  ('0' + (((n) / 1000000)%10)),  \
  ('0' + (((n) / 100000)%10)),   \
  ('0' + (((n) / 10000)%10)),    \
  ('0' + (((n) / 1000)%10)),     \
  ('0' + (((n) / 100)%10)),      \
  ('0' + (((n) / 10)%10)),       \
  ('0' +  ((n) % 10))

/* Convert integer to hex digit literals.  */
#define HEX(n)             \
  ('0' + ((n)>>28 & 0xF)), \
  ('0' + ((n)>>24 & 0xF)), \
  ('0' + ((n)>>20 & 0xF)), \
  ('0' + ((n)>>16 & 0xF)), \
  ('0' + ((n)>>12 & 0xF)), \
  ('0' + ((n)>>8 & 0xF)),  \
  ('0' + ((n)>>4 & 0xF)),  \
  ('0' + ((n) & 0xF))

/* Construct a string literal encoding the version number components. */
#ifdef COMPILER_VERSION_MAJOR
char const info_version[] = {
  'I', 'N', 'F', 'O', ':',
  'c','o','m','p','i','l','e','r','_','v','e','r','s','i','o','n','[',
  COMPILER_VERSION_MAJOR,
# ifdef COMPILER_VERSION_MINOR
  '.', COMPILER_VERSION_MINOR,
#  ifdef COMPILER_VERSION_PATCH
   '.', COMPILER_VERSION_PATCH,
#   ifdef COMPILER_VERSION_TWEAK
    '.', COMPILER_VERSION_TWEAK,
#   endif
#  endif
# endif
  ']','\0'};
#endif

/* Construct a string literal encoding the internal version number. */
#ifdef COMPILER_VERSION_INTERNAL
char const info_version_internal[] = {
  'I', 'N', 'F', 'O', ':',
  'c','o','m','p','i','l','e','r','_','v','e','r','s','i','o','n','_',
  'i','n','t','e','r','n','a','l','[',
  COMPILER_VERSION_INTERNAL,']','\0'};
#endif

/* Construct a string literal encoding the version number components. */
#ifdef SIMULATE_VERSION_MAJOR
char const info_simulate_version[] = {
  'I', 'N', 'F', 'O', ':',
  's','i','m','u','l','a','t','e','_','v','e','r','s','i','o','n','[',
  SIMULATE_VERSION_MAJOR,
# ifdef SIMULATE_VERSION_MINOR
  '.', SIMULATE_VERSION_MINOR,
#  ifdef SIMULATE_VERSION_PATCH
   '.', SIMULATE_VERSION_PATCH,
#   ifdef SIMULATE_VERSION_TWEAK
    '.', SIMULATE_VERSION_TWEAK,
#   endif
#  endif
# endif
  ']','\0'};
#endif

/* Construct the string literal in pieces to prevent the source from
   getting matched.  Store it in a pointer rather than an array
   because some compilers will just produce instructions to fill the
   array rather than assigning a pointer to a static array.  */
char const* info_platform = "INFO" ":" "platform[" PLATFORM_ID "]";
char const* info_arch = "INFO" ":" "arch[" ARCHITECTURE_ID "]";

/* NOTE(review): old Intel compilers report a stale _MSVC_LANG; derive the
   effective C++ standard from Intel-specific macros in that case. */
#if defined(__INTEL_COMPILER) && defined(_MSVC_LANG) && _MSVC_LANG < 201403L
# if defined(__INTEL_CXX11_MODE__)
#  if defined(__cpp_aggregate_nsdmi)
#   define CXX_STD 201402L
#  else
#   define CXX_STD 201103L
#  endif
# else
#  define CXX_STD 199711L
# endif
#elif defined(_MSC_VER) && defined(_MSVC_LANG)
# define CXX_STD _MSVC_LANG
#else
# define CXX_STD __cplusplus
#endif

const char* info_language_dialect_default = "INFO" ":" "dialect_default["
#if CXX_STD > 201703L
  "20"
#elif CXX_STD >= 201703L
  "17"
#elif CXX_STD >= 201402L
  "14"
#elif CXX_STD >= 201103L
  "11"
#else
  "98"
#endif
"]";
/*--------------------------------------------------------------------------*/

/* NOTE(review): generated by CMake (tail of CMakeCXXCompilerId.cpp).
   main() does no real work: it reads one character out of each "INFO:..."
   string, indexed by the runtime value argc, so the compiler cannot prove
   the strings unused and strip them from the binary.  CMake never runs the
   executable — it greps the binary for the INFO literals — so the return
   value is irrelevant. */
int main(int argc, char* argv[])
{
  int require = 0;
  require += info_compiler[argc];
  require += info_platform[argc];
#ifdef COMPILER_VERSION_MAJOR
  require += info_version[argc];
#endif
#ifdef COMPILER_VERSION_INTERNAL
  require += info_version_internal[argc];
#endif
#ifdef SIMULATE_ID
  require += info_simulate[argc];
#endif
#ifdef SIMULATE_VERSION_MAJOR
  require += info_simulate_version[argc];
#endif
#if defined(__CRAYXE) || defined(__CRAYXC)
  require += info_cray[argc];
#endif
  require += info_language_dialect_default[argc];
  (void)argv;  /* unused, silenced deliberately */
  return require;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment