# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import paddle
from custom_setup_ops import custom_fmha

batch_size = 56
num_heads = 16
head_size = 64
is_training = True
max_seq_len = 512
dropout_rate = 0.1

# Cumulative sequence lengths for the packed (varlen) layout: sequence i
# occupies rows cu_seqlen[i]:cu_seqlen[i + 1] of the packed qkv tensor, so
# the sequence lengths here are 1, 2, ..., batch_size and the total packed
# token count is cu_seqlen[-1].
cu_seqlen = np.cumsum(np.arange(batch_size + 1))
total = cu_seqlen[-1]
cu_seqlen = paddle.to_tensor(cu_seqlen)
cu_seqlen = paddle.cast(cu_seqlen, 'int32')

# Packed qkv input of shape (total_tokens, 3, num_heads, head_size), float16.
qkv = np.random.random((total, 3, num_heads, head_size)).astype(np.float16)
qkv = paddle.to_tensor(qkv, stop_gradient=False)

# The kernel reads the maximum sequence length from host (CPU) memory.
max_seq_len_host = paddle.to_tensor(
    [max_seq_len], dtype='int32', place=paddle.CPUPlace())

# Forward pass through the fused multi-head attention custom op.
ctx_out, s_out = custom_fmha(qkv, cu_seqlen, max_seq_len_host, is_training,
                             dropout_rate, False)
print("ctx_out and s_out: ")
print(ctx_out)
print(s_out)

# Backward pass: feed a random upstream gradient into ctx_out and
# inspect the gradient accumulated on the qkv input.
grad_ctx_dout = np.random.random(
    (total, num_heads, head_size)).astype(np.float16)
grad_ctx_dout = paddle.to_tensor(grad_ctx_dout)
paddle.autograd.backward([ctx_out], [grad_ctx_dout], retain_graph=True)
print("qkv.grad: ")
print(qkv.grad)
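
# Optional cross-check (a sketch, not part of the original test): assuming
# custom_fmha computes standard scaled-dot-product attention over the packed
# variable-length layout above, its forward output can be verified per
# sequence with an unfused NumPy implementation. The 1/sqrt(head_size) scale
# is an assumption about the kernel, and dropout makes the fused result
# stochastic, so a meaningful comparison needs dropout_rate = 0.0 and
# is_training = False. The helper name reference_attention is hypothetical.
def reference_attention(qkv_np, cu_seqlen_np, num_heads, head_size):
    """Unfused attention over the packed (total, 3, heads, size) layout."""
    out = np.zeros((qkv_np.shape[0], num_heads, head_size), dtype=np.float32)
    scale = 1.0 / np.sqrt(head_size)
    for i in range(len(cu_seqlen_np) - 1):
        lo, hi = cu_seqlen_np[i], cu_seqlen_np[i + 1]
        if lo == hi:
            continue
        # (seq_len, heads, size) q/k/v slices for this sequence.
        q = qkv_np[lo:hi, 0].astype(np.float32)
        k = qkv_np[lo:hi, 1].astype(np.float32)
        v = qkv_np[lo:hi, 2].astype(np.float32)
        # (heads, seq_len, seq_len) attention scores, then a numerically
        # stable softmax along the key axis.
        scores = np.einsum('qhd,khd->hqk', q, k) * scale
        scores -= scores.max(axis=-1, keepdims=True)
        probs = np.exp(scores)
        probs /= probs.sum(axis=-1, keepdims=True)
        out[lo:hi] = np.einsum('hqk,khd->qhd', probs, v)
    return out

# Example usage on the deterministic path (float16 tolerances are loose):
# ref = reference_attention(qkv.numpy(), cu_seqlen.numpy(),
#                           num_heads, head_size)
# np.testing.assert_allclose(ctx_out.numpy().astype(np.float32), ref,
#                            rtol=2e-2, atol=2e-2)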