# bert_padding.py

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle


def generate_mask(attention_mask, unpad_fmha=False):
    """Derive un-padding metadata for a batch from its attention mask.

    Args:
        attention_mask: Padding mask tensor. It is summed along axis 1 to get
            one length per row, so it is presumably laid out as
            [batch_size, max_seq_len] with non-zero entries marking real
            tokens -- TODO confirm against the caller.
        unpad_fmha: Must be truthy; the padded path is not implemented.

    Returns:
        A 7-tuple ``(indices, attention_mask, seqlen, ntokens, cu_seqlens,
        seqlen, maxseqlen)`` where:

        * ``indices``: 1-D tensor with the positions of the non-zero entries
          of the flattened mask.
        * ``attention_mask``: the input, returned unchanged.
        * ``seqlen``: 1-D int32 tensor of per-row mask sums. It appears twice
          in the tuple; the duplicate is kept so existing callers that unpack
          seven values keep working.
        * ``ntokens``: last element of ``cu_seqlens`` (the sum of all row
          lengths), copied to ``paddle.CPUPlace()``.
        * ``cu_seqlens``: int32 prefix sums of ``seqlen`` with a leading zero,
          i.e. ``[0, a[0], a[0] + a[1], ...]``.
        * ``maxseqlen``: maximum of ``seqlen``, copied to
          ``paddle.CPUPlace()``.

    Raises:
        NotImplementedError: If ``unpad_fmha`` is falsy.
    """
    if not unpad_fmha:
        raise NotImplementedError(
            "generate_mask only supports unpad_fmha=True")

    # Summing each row of the mask yields that row's real sequence length;
    # the result is cast to int32 and flattened to 1-D.
    seqlen = paddle.reshape(
        paddle.cast(paddle.sum(attention_mask, axis=1), 'int32'), [-1])

    # Record the positions of the non-zero entries of the flattened mask.
    flat_mask = paddle.reshape(attention_mask, [-1])
    indices = paddle.reshape(
        paddle.nonzero(flat_mask, as_tuple=False), [-1])

    # Longest real sequence in the current batch, copied to CPUPlace.
    # NOTE: CPUPlace is used on purpose. Copying to paddle.CUDAPinnedPlace()
    # instead was observed to fail inside the custom_fmha operator with
    # "UnimplementedError: Unsupported place type `CUDAPinnedPlace` when
    # casting paddle place to enum place."
    maxseqlen = paddle.tensor.creation._memcpy(
        paddle.max(seqlen), paddle.CPUPlace())

    # Prefix sums with a leading zero: [0, a[0], a[0] + a[1], ...].
    cu_seqlens = paddle.concat(
        x=[paddle.zeros([1], dtype='int32'), paddle.cumsum(seqlen, axis=0)])

    # The last prefix sum is the total number of real tokens in the batch;
    # it is copied to CPUPlace for the same reason as maxseqlen.
    ntokens = paddle.tensor.creation._memcpy(
        cu_seqlens[-1], paddle.CPUPlace())

    return (indices, attention_mask, seqlen, ntokens, cu_seqlens, seqlen,
            maxseqlen)