Commit dbe08e9b authored by yuguo960516yuguo

2.4.2

parent b5499578
......@@ -14,6 +14,7 @@
from __future__ import print_function
import math
import numpy as np
import unittest
......@@ -45,34 +46,64 @@ class TestFoldOp(OpTest):
def calc_fold(self):
output_shape = [0] * 4
output_shape[0] = self.batch_size
output_shape[1] = int(
self.input_channels / (self.kernel_sizes[0] * self.kernel_sizes[1])
)
output_shape[2] = self.output_sizes[0]
output_shape[3] = self.output_sizes[1]
dkernel_h = self.dilations[0] * (self.kernel_sizes[0] - 1) + 1
dkernel_w = self.dilations[1] * (self.kernel_sizes[1] - 1) + 1
col_height = (
int(
(
self.output_sizes[0]
+ self.paddings[0]
+ self.paddings[2]
- dkernel_h
)
/ self.strides[0]
)
+ 1
)
col_width = (
int(
(
self.output_sizes[1]
+ self.paddings[1]
+ self.paddings[3]
- dkernel_w
)
/ self.strides[1]
)
+ 1
)
output = np.zeros(output_shape).astype(np.float64)
############ calculate output ##############
for b in range(output_shape[0]):
for c in range(self.input_channels):
w_offset = int(c % self.kernel_sizes[1])
h_offset = int(
(c / self.kernel_sizes[1]) % self.kernel_sizes[0]
)
c_out = int(c / self.kernel_sizes[0] / self.kernel_sizes[1])
for h in range(col_height):
h_out = int(
h * self.strides[0]
- self.paddings[0]
+ h_offset * self.dilations[0]
)
for w in range(col_width):
w_out = int(
w * self.strides[1]
- self.paddings[1]
+ w_offset * self.dilations[1]
)
if (h_out >= 0 and h_out < self.output_sizes[0]) and (
w_out >= 0 and w_out < self.output_sizes[1]
):
output[b, c_out, h_out, w_out] += self.x[
b, c, w + col_width * h
]
self.outputs = output
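A quick numeric illustration of the block-count formulas in calc_fold above (an editorial sketch, not part of the commit): with the TestFoldshape configuration shown further down, the flattened length 6 equals the number of sliding-block positions.

# Illustrative check only; mirrors the col_height / col_width arithmetic above.
def num_blocks(out_size, kernel, pad_before, pad_after, dilation, stride):
    dkernel = dilation * (kernel - 1) + 1
    return (out_size + pad_before + pad_after - dkernel) // stride + 1

col_height = num_blocks(4, 3, 0, 0, 1, 1)   # (4 - 3) // 1 + 1 = 2
col_width = num_blocks(5, 3, 0, 0, 1, 1)    # (5 - 3) // 1 + 1 = 3
assert col_height * col_width == 6          # matches TestFoldshape.length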
......@@ -85,7 +116,7 @@ class TestFoldOp(OpTest):
'paddings': self.paddings,
'dilations': self.dilations,
'strides': self.strides,
'output_sizes': self.output_sizes,
}
self.outputs = {'Y': self.outputs}
......@@ -101,9 +132,23 @@ class TestFoldOp(OpTest):
self.check_grad(['X'], 'Y', check_eager=True)
class TestFoldshape(TestFoldOp):
def init_data(self):
self.batch_size = 8
self.input_channels = 3 * 3 * 3
self.length = 6
self.kernel_sizes = [3, 3]
self.strides = [1, 1]
self.paddings = [0, 0, 0, 0]
self.dilations = [1, 1]
self.output_sizes = [4, 5]
input_shape = [self.batch_size, self.input_channels, self.length]
self.x = np.random.rand(*input_shape).astype(np.float64)
class TestFoldAPI(TestFoldOp):
# This is for test on paddle.nn.Fold
def setUp(self):
self.op_type = 'fold'
......@@ -120,19 +165,19 @@ class TestFoldAPI(TestFoldOp):
m = paddle.nn.Fold(**self.attrs)
m.eval()
result = m(input)
np.testing.assert_allclose(
result.numpy(), self.outputs['Y'], rtol=1e-05
)
def test_info(self):
str(paddle.nn.Fold(**self.attrs))
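A minimal dygraph sketch of the module under test (editorial and hedged: it reuses the keyword names the test passes via **self.attrs; the tensor sizes are chosen for illustration, not taken from the base-class config).

import paddle

x = paddle.randn([3, 3 * 2 * 2, 12])   # [N, C * kh * kw, L] for a 2x2 kernel
fold = paddle.nn.Fold(
    output_sizes=[4, 5], kernel_sizes=[2, 2],
    paddings=[0, 0, 0, 0], dilations=[1, 1], strides=[1, 1]
)
y = fold(x)                            # expected shape: [3, 3, 4, 5]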
class TestFoldOpError(unittest.TestCase):
def test_errors(self):
from paddle.nn.functional import fold
from paddle.fluid.framework import Program, program_guard
with program_guard(Program(), Program()):
def test_input_shape():
......@@ -148,59 +193,67 @@ class TestFoldOpError(unittest.TestCase):
def test_padding_shape():
# padding_size must be 2 or 4
x = paddle.randn(shape=[2, 6, 6], dtype="float32")
out = fold(
x,
output_sizes=[2, 3],
kernel_sizes=[2, 2],
paddings=[2, 2, 3],
)
def test_dilations_shape():
# dilations_size must be 2
x = paddle.randn(shape=[2, 6, 6], dtype="float32")
out = fold(
x,
output_sizes=[2, 3],
kernel_sizes=[2, 2],
dilations=[2, 2, 3],
)
def test_strides_shape():
# strides_size must be 2
x = paddle.randn(shape=[2, 6, 6], dtype="float32")
out = fold(
x,
output_sizes=[2, 3],
kernel_sizes=[2, 2],
strides=[2, 2, 3],
)
def test_output_size():
# im_h * im_w must be L
x = paddle.randn(shape=[2, 6, 6], dtype="float32")
out = fold(
x, output_sizes=[6, 6], kernel_sizes=[2, 2], strides=[1, 1]
)
def test_output_size_2():
# out_size must be greater than 1
x = paddle.randn(shape=[2, 6, 6], dtype="float32")
out = fold(
x,
output_sizes=[0.1, 0.2],
kernel_sizes=[2, 2],
strides=[1, 1],
)
def test_block_h_w():
# block_h and block_w must be greater than 0
x = paddle.randn(shape=[2, 1, 1], dtype="float32")
out = fold(
x, output_sizes=[1, 1], kernel_sizes=[2, 2], strides=1
)
def test_GT_0():
x = paddle.randn(shape=[2, 1, 1], dtype="float32")
out = fold(
x,
output_sizes=[0, 0],
kernel_sizes=[0, 0],
dilations=0,
paddings=[0, 0],
strides=0,
)
self.assertRaises(AssertionError, test_input_shape)
self.assertRaises(AssertionError, test_kernel_shape)
......
......@@ -30,10 +30,10 @@ from paddle.fluid.framework import default_main_program
from paddle.fluid import core
@unittest.skipIf(
not core.is_compiled_with_cuda(), "Paddle is not compiled with CUDA"
)
class TestFusedGateAttentionOp(OpTest):
def setUp(self):
self.__class__.op_type = "fused_gate_attention"
# use autograd to check grad in this unittest.
......@@ -57,7 +57,6 @@ class TestFusedGateAttentionOp(OpTest):
self.bias_attr = True
def generate_input_data(self):
def _random(shape):
if self.dtype == "bfloat16":
data = np.random.random(shape).astype("float32")
......@@ -67,7 +66,8 @@ class TestFusedGateAttentionOp(OpTest):
np.random.seed(123)
self.query = _random(
(self.batch_size, self.msa_len, self.res_len, self.q_dim)
)
self.q_weight = _random((self.q_dim, self.num_heads, self.head_dim))
self.k_weight = _random((self.kv_dim, self.num_heads, self.head_dim))
self.v_weight = _random((self.kv_dim, self.num_heads, self.head_dim))
......@@ -80,15 +80,18 @@ class TestFusedGateAttentionOp(OpTest):
self.qkv_weight = np.stack([q_weight_t, k_weight_t, v_weight_t])
else:
self.key = _random(
(self.batch_size, self.msa_len, self.m_size, self.kv_dim)
)
self.qkv_weight = None
self.attn_mask = _random(
(self.batch_size, self.msa_len, 1, 1, self.m_size)
)
if self.bias_attr:
self.nonbatched_bias = _random(
(self.batch_size, 1, self.num_heads, self.res_len, self.m_size)
)
if self.has_gating:
self.gating_w = _random((self.q_dim, self.num_heads, self.head_dim))
......@@ -98,12 +101,17 @@ class TestFusedGateAttentionOp(OpTest):
self.output_b = _random((self.out_dim))
self.dout = _random(
(self.batch_size, self.msa_len, self.res_len, self.q_dim)
)
def collect_outputs(self, query, key, softmax_out, fmha_out, gate_out, out):
outputs = [
softmax_out,
fmha_out,
gate_out if self.has_gating else None,
out,
query.grad,
None if self.merge_qkv else key.grad,
]
return outputs
......@@ -111,14 +119,17 @@ class TestFusedGateAttentionOp(OpTest):
paddle.disable_static(place=paddle.CUDAPlace(0))
query = paddle.to_tensor(self.query, stop_gradient=False)
key = (
query
if self.merge_qkv
else paddle.to_tensor(self.key, stop_gradient=False)
)
q_weight = paddle.to_tensor(self.q_weight, stop_gradient=False)
k_weight = paddle.to_tensor(self.k_weight, stop_gradient=False)
v_weight = paddle.to_tensor(self.v_weight, stop_gradient=False)
src_mask = paddle.to_tensor(self.attn_mask, stop_gradient=True)
c = self.head_dim ** (-0.5)
# [batch_size, msa_len, res_len, q_dim], [q_dim, num_heads, head_dim]
# -> [batch_size, msa_len, res_len, num_heads, head_dim]
q = paddle.einsum('nbqa,ahc->nbqhc', query, q_weight) * c
......@@ -136,8 +147,9 @@ class TestFusedGateAttentionOp(OpTest):
# -> [batch_size, msa_len, num_heads, res_len, m_size]
logits = logits + src_mask
if self.bias_attr:
nonbatched_bias = paddle.to_tensor(
self.nonbatched_bias, stop_gradient=False
)
# [batch_size, msa_len, num_heads, res_len, m_size], [batch_size, 1, num_heads, res_len, m_size]
# -> [batch_size, msa_len, num_heads, res_len, m_size]
logits = logits + nonbatched_bias
......@@ -159,14 +171,22 @@ class TestFusedGateAttentionOp(OpTest):
# gate_values = paddle.einsum('nbqc,chv->nbqhv', query,
# gating_w) + gating_b
gating_w_2d = paddle.reshape(
gating_w, shape=[self.q_dim, self.num_heads * self.head_dim]
)
gate_values_4d = paddle.matmul(query, gating_w_2d)
gate_values = (
paddle.reshape(
gate_values_4d,
shape=[
self.batch_size,
self.msa_len,
self.res_len,
self.num_heads,
self.head_dim,
],
)
+ gating_b
)
gate_values = nn.functional.sigmoid(gate_values)
gate_out = fmha_out * gate_values
else:
......@@ -183,20 +203,32 @@ class TestFusedGateAttentionOp(OpTest):
gate_out,
shape=[
self.batch_size * self.msa_len * self.res_len,
self.num_heads * self.head_dim,
],
)
output_w_2d = paddle.reshape(
output_w, shape=[self.num_heads * self.head_dim, self.out_dim]
)
out_2d = paddle.matmul(gate_out_2d, output_w_2d)
out = (
paddle.reshape(
out_2d,
shape=[
self.batch_size,
self.msa_len,
self.res_len,
self.out_dim,
],
)
+ output_b
)
paddle.autograd.backward(
[out], [paddle.to_tensor(self.dout)], retain_graph=True
)
return self.collect_outputs(
query, key, softmax_out, fmha_out, gate_out, out
)
def get_fused_gate_attention_out(self):
paddle.disable_static(place=paddle.CUDAPlace(0))
......@@ -218,8 +250,9 @@ class TestFusedGateAttentionOp(OpTest):
src_mask = paddle.to_tensor(self.attn_mask, stop_gradient=True)
if self.bias_attr:
nonbatched_bias = paddle.to_tensor(
self.nonbatched_bias, stop_gradient=False
)
else:
nonbatched_bias = None
if self.has_gating:
......@@ -232,18 +265,42 @@ class TestFusedGateAttentionOp(OpTest):
output_w = paddle.to_tensor(self.output_w, stop_gradient=False)
output_b = paddle.to_tensor(self.output_b, stop_gradient=False)
(
_,
_,
_,
_,
softmax_out,
fmha_out,
gate_out,
out,
) = _legacy_C_ops.fused_gate_attention(
query,
key,
q_weight,
k_weight,
v_weight,
qkv_weight,
nonbatched_bias,
src_mask,
gating_w,
gating_b,
output_w,
output_b,
'has_gating',
self.has_gating,
'merge_qkv',
self.merge_qkv,
)
paddle.autograd.backward(
[out], [paddle.to_tensor(self.dout)], retain_graph=True
)
return self.collect_outputs(
query, key, softmax_out, fmha_out, gate_out, out
)
def check(self, ref, out, atol, rtol, check_equal, name):
def _convert(value):
if self.dtype == "bfloat16":
return convert_uint16_to_float(value)
......@@ -252,19 +309,25 @@ class TestFusedGateAttentionOp(OpTest):
if check_equal:
self.assertTrue(
np.equal(_convert(ref), _convert(out)).all(),
"Checking < {} > failed!".format(name))
"Checking < {} > failed!".format(name),
)
else:
np.testing.assert_allclose(
_convert(ref),
_convert(out),
atol=atol,
rtol=rtol,
err_msg="Checking < {} > failed!".format(name),
)
def check_output_and_grad(self, atol, rtol):
output_names = [
"softmax_out", "fmha_out", "gate_out", "out", "query_grad",
"key_grad"
"softmax_out",
"fmha_out",
"gate_out",
"out",
"query_grad",
"key_grad",
]
outputs_ref = self.get_reference_out()
outputs_fused = self.get_fused_gate_attention_out()
......@@ -280,22 +343,26 @@ class TestFusedGateAttentionOp(OpTest):
# that in fused ops, check_equal is set to False and we use allclose
# to check the correctness.
check_equal = False
self.check(
ref_res.numpy(),
fused_res.numpy(),
atol,
rtol,
check_equal,
output_names[i],
)
def test_output_and_grad(self):
self.check_output_and_grad(atol=1e-5, rtol=1e-6)
class TestMergeQKVLargeBatchSizeCase(TestFusedGateAttentionOp):
def config(self):
super().config()
self.batch_size = 2
class TestSeparatedQKVCase(TestFusedGateAttentionOp):
def config(self):
self.dtype = "float32"
self.has_gating = False
......@@ -312,7 +379,6 @@ class TestSeparatedQKVCase(TestFusedGateAttentionOp):
class TestMergeQKVNoBiasGatingCase(TestFusedGateAttentionOp):
def config(self):
super().config()
self.has_gating = False
......@@ -320,7 +386,6 @@ class TestMergeQKVNoBiasGatingCase(TestFusedGateAttentionOp):
class TestMergeQKVFp16Case(TestFusedGateAttentionOp):
def config(self):
super().config()
self.dtype = "float16"
......@@ -332,18 +397,18 @@ class TestMergeQKVFp16Case(TestFusedGateAttentionOp):
class TestMergeQKVLargeBatchSizeFp16Case(TestMergeQKVFp16Case):
def config(self):
super().config()
self.batch_size = 2
@unittest.skipIf(
not core.is_compiled_with_cuda()
or get_cuda_version() < 11000
or paddle.device.cuda.get_device_capability()[0] < 8,
"core is not compiled with CUDA and cuda version need larger than or equal to 11.3",
)
class TestMergeQKVBF16Case(TestFusedGateAttentionOp):
def config(self):
super().config()
self.dtype = "bfloat16"
......@@ -353,7 +418,6 @@ class TestMergeQKVBF16Case(TestFusedGateAttentionOp):
class TestMergeQKVLargeBatchSizeBF16Case(TestMergeQKVBF16Case):
def config(self):
super().config()
self.batch_size = 2
......
......@@ -20,19 +20,22 @@ from functools import partial
class TestResnetGPU(TestResnetBase):
def test_seresnext_with_learning_rate_decay(self):
# NOTE(zcd): This test is compare the result of use parallel_executor
# and executor, and the result of drop_out op and batch_norm op in
# this two executor have diff, so the two ops should be removed
# from the model.
check_func = partial(
self.check_network_convergence,
optimizer=seresnext_net.optimizer,
use_parallel_executor=False,
)
self._compare_result_with_origin_model(
check_func,
use_device=DeviceType.CUDA,
delta2=1e-3,
compare_separately=False,
)
if __name__ == '__main__':
......
......@@ -93,14 +93,9 @@ def get_csr_value(mat, layout, nnz):
return value
def ref_sparse_attention(
q, k, v, offset, columns, kp_mask=None, attn_mask=None, bsz=None
):
row, col, nnz = q.shape[0], q.shape[1], columns.shape[0]
mat = np.zeros((row, row))
for cur_row in range(row):
......@@ -111,7 +106,7 @@ def ref_sparse_attention(q,
mat[cur_row][cur_col] = 1
a = np.dot(q, k.T) * mat
a_value = get_csr_value(a, mat, nnz)
scaling = float(col) ** -0.5
a = scaling * a
for i in range(row):
for j in range(row):
......@@ -127,13 +122,9 @@ def ref_sparse_attention(q,
return result, a_value, b_value
def ref_batch_sparse_attention(
q, k, v, offset, columns, kp_mask=None, attn_mask=None
):
batch_size, num_heads, row, col = q.shape
nnz = columns.shape[2]
result = np.zeros((batch_size, num_heads, row, col))
......@@ -141,11 +132,16 @@ def ref_batch_sparse_attention(q,
result_softmax = np.zeros((batch_size, num_heads, nnz))
for i in range(batch_size):
for j in range(num_heads):
cur_q, cur_k, cur_v, = (
q[i][j],
k[i][j],
v[i][j],
)
cur_offset, cur_columns = offset[i][j], columns[i][j]
if kp_mask is None and attn_mask is None:
cur_result, cur_sdd, cur_softmax = ref_sparse_attention(
cur_q, cur_k, cur_v, cur_offset, cur_columns
)
else:
cur_result, cur_sdd, cur_softmax = ref_sparse_attention(
cur_q,
......@@ -155,7 +151,8 @@ def ref_batch_sparse_attention(q,
cur_columns,
kp_mask=kp_mask,
attn_mask=attn_mask,
bsz=i,
)
result[i][j] = cur_result
result_sdd[i][j], result_softmax[i][j] = cur_sdd, cur_softmax
return result, result_sdd, result_softmax
......@@ -193,10 +190,9 @@ def init_csr_format(batch_size, num_heads, rows, blocksize):
@unittest.skipIf(
not core.is_compiled_with_cuda() or get_cuda_version() < 11030,
"core is not compiled with CUDA and cuda version need larger than or equal to 11.3"
"core is not compiled with CUDA and cuda version need larger than or equal to 11.3",
)
class TestSparseAttentionOp(OpTest):
def config(self):
self.shape = (1, 1, 16, 16)
self.blocksize = 4
......@@ -212,8 +208,9 @@ class TestSparseAttentionOp(OpTest):
self.k = np.random.random(self.shape).astype(self.dtype)
self.v = np.random.random(self.shape).astype(self.dtype)
# init CSR tensor
offset, columns = init_csr_format(
self.shape[0], self.shape[1], self.shape[2], self.blocksize
)
self.offset = offset.astype('int32')
self.columns = columns.astype('int32')
# init mask tensor
......@@ -234,10 +231,12 @@ class TestSparseAttentionOp(OpTest):
self.offset,
self.columns,
kp_mask=self.key_padding_mask,
attn_mask=self.attn_mask,
)
else:
result, result_sdd, result_softmax = ref_batch_sparse_attention(
self.q, self.k, self.v, self.offset, self.columns
)
if self.use_mask == True:
self.inputs = {
......@@ -260,7 +259,7 @@ class TestSparseAttentionOp(OpTest):
self.outputs = {
'Out': result.astype(self.dtype),
'SparseDotSdd': result_sdd.astype(self.dtype),
'Softmax': result_softmax.astype(self.dtype),
}
def test_check_output(self):
......@@ -273,7 +272,6 @@ class TestSparseAttentionOp(OpTest):
class TestSparseAttentionOpFp32Test(TestSparseAttentionOp):
def config(self):
self.shape = (1, 1, 8, 16)
self.blocksize = 2
......@@ -282,7 +280,6 @@ class TestSparseAttentionOpFp32Test(TestSparseAttentionOp):
class TestSparseAttentionOpShapeTest(TestSparseAttentionOp):
def config(self):
self.shape = (2, 2, 32, 8)
self.blocksize = 8
......@@ -292,10 +289,9 @@ class TestSparseAttentionOpShapeTest(TestSparseAttentionOp):
@unittest.skipIf(
not core.is_compiled_with_cuda() or get_cuda_version() < 11030,
"core is not compiled with CUDA and cuda version need larger than or equal to 11.3"
"core is not compiled with CUDA and cuda version need larger than or equal to 11.3",
)
class TestSparseAttentionAPI(unittest.TestCase):
def setUp(self):
self.place = paddle.CUDAPlace(0)
self.shape = (1, 1, 8, 4)
......@@ -310,54 +306,62 @@ class TestSparseAttentionAPI(unittest.TestCase):
K = paddle.static.data(name="K", shape=self.shape, dtype=self.dtype)
V = paddle.static.data(name="V", shape=self.shape, dtype=self.dtype)
batch_size, num_heads, rows = (
self.shape[0],
self.shape[1],
self.shape[2],
)
block_num = rows / self.blocksize
block_last = rows % self.blocksize
sparse_nnz_num = (
block_num * self.blocksize * self.blocksize
+ block_last * block_last
)
offset_shape = (batch_size, num_heads, rows + 1)
columns_shape = (batch_size, num_heads, int(sparse_nnz_num))
offset = paddle.static.data(
name="Offset", shape=offset_shape, dtype="int32"
)
columns = paddle.static.data(
name="Columns", shape=columns_shape, dtype="int32"
)
key_padding_mask_shape = (self.shape[0], self.shape[2])
attn_mask_shape = (self.shape[2], self.shape[2])
if self.use_mask == True:
key_padding_mask = paddle.static.data(
name="KeyPaddingMask",
shape=key_padding_mask_shape,
dtype=self.dtype,
)
attn_mask = paddle.static.data(
name="AttnMask", shape=attn_mask_shape, dtype=self.dtype
)
Out = F.sparse_attention(
Q,
K,
V,
offset,
columns,
key_padding_mask=key_padding_mask,
attn_mask=attn_mask,
)
else:
Out = F.sparse_attention(Q, K, V, offset, columns)
Q_np = np.random.random(self.shape).astype(self.dtype)
K_np = np.random.random(self.shape).astype(self.dtype)
V_np = np.random.random(self.shape).astype(self.dtype)
offset_np, columns_np = init_csr_format(
self.shape[0], self.shape[1], self.shape[2], self.blocksize
)
offset_np = offset_np.astype('int32')
columns_np = columns_np.astype('int32')
# init mask tensor
key_padding_mask_np = np.random.randint(
0, 2, size=key_padding_mask_shape
)
attn_mask_np = np.random.randint(0, 2, size=attn_mask_shape)
key_padding_mask_np = init_mask(key_padding_mask_np)
attn_mask_np = init_mask(attn_mask_np)
......@@ -366,16 +370,18 @@ class TestSparseAttentionAPI(unittest.TestCase):
exe = fluid.Executor(self.place)
if self.use_mask == True:
fetches_result = exe.run(
feed={
"Q": Q_np,
"K": K_np,
"V": V_np,
"Offset": offset_np,
"Columns": columns_np,
'KeyPaddingMask': key_padding_mask_np,
'AttnMask': attn_mask_np,
},
fetch_list=[Out],
)
expected_result, __, __ = ref_batch_sparse_attention(
Q_np,
K_np,
......@@ -383,28 +389,32 @@ class TestSparseAttentionAPI(unittest.TestCase):
offset_np,
columns_np,
kp_mask=key_padding_mask_np,
attn_mask=attn_mask_np,
)
else:
fetches_result = exe.run(
feed={
"Q": Q_np,
"K": K_np,
"V": V_np,
"Offset": offset_np,
"Columns": columns_np,
},
fetch_list=[Out],
)
expected_result, __, __ = ref_batch_sparse_attention(
Q_np, K_np, V_np, offset_np, columns_np
)
np.testing.assert_allclose(
fetches_result[0], expected_result, rtol=1e-05, atol=1e-05
)
def test_dygraph(self):
paddle.disable_static()
offset, columns = init_csr_format(
self.shape[0], self.shape[1], self.shape[2], self.blocksize
)
offset = offset.astype('int32')
columns = columns.astype('int32')
query = np.random.random(self.shape).astype(self.dtype)
......@@ -429,13 +439,15 @@ class TestSparseAttentionAPI(unittest.TestCase):
paddle_attn_mask = paddle.to_tensor(attn_mask, place=self.place)
if self.use_mask == True:
paddle_result = F.sparse_attention(
paddle_query,
paddle_key,
paddle_value,
paddle_offset,
paddle_colunmns,
key_padding_mask=paddle_kp_mask,
attn_mask=paddle_attn_mask,
)
numpy_result, __, __ = ref_batch_sparse_attention(
query,
......@@ -444,25 +456,29 @@ class TestSparseAttentionAPI(unittest.TestCase):
offset,
columns,
kp_mask=key_padding_mask,
attn_mask=attn_mask,
)
numpy_result = numpy_result.astype(self.dtype)
else:
paddle_result = F.sparse_attention(
paddle_query,
paddle_key,
paddle_value,
paddle_offset,
paddle_colunmns,
)
numpy_result, __, __ = ref_batch_sparse_attention(
query, key, value, offset, columns
)
numpy_result = numpy_result.astype(self.dtype)
np.testing.assert_allclose(
paddle_result.numpy(), numpy_result, rtol=1e-05, atol=1e-05
)
class TestSparseAttentionAPITestFloat(TestSparseAttentionAPI):
def setUp(self):
self.place = paddle.CUDAPlace(0)
self.shape = (2, 2, 8, 4)
......@@ -472,7 +488,6 @@ class TestSparseAttentionAPITestFloat(TestSparseAttentionAPI):
class TestSparseAttentionAPITestShape1(TestSparseAttentionAPI):
def setUp(self):
self.place = paddle.CUDAPlace(0)
self.shape = (2, 2, 64, 32)
......@@ -482,7 +497,6 @@ class TestSparseAttentionAPITestShape1(TestSparseAttentionAPI):
class TestSparseAttentionAPITestShape2(TestSparseAttentionAPI):
def setUp(self):
self.place = paddle.CUDAPlace(0)
self.shape = (2, 1, 64, 32)
......@@ -492,7 +506,6 @@ class TestSparseAttentionAPITestShape2(TestSparseAttentionAPI):
class TestSparseAttentionAPITestShape3(TestSparseAttentionAPI):
def setUp(self):
self.place = paddle.CUDAPlace(0)
self.shape = (4, 4, 128, 32)
......@@ -502,7 +515,6 @@ class TestSparseAttentionAPITestShape3(TestSparseAttentionAPI):
class TestSparseAttentionAPITestShape4(TestSparseAttentionAPI):
def setUp(self):
self.place = paddle.CUDAPlace(0)
self.shape = (3, 3, 35, 15)
......
......@@ -64,42 +64,50 @@ class TestSparseElementWiseAPI(unittest.TestCase):
csr_y = s_dense_y.to_sparse_csr()
actual_res = get_actual_res(csr_x, csr_y, op)
actual_res.backward(actual_res)
expect_res = op(dense_x, dense_y)
expect_res.backward(expect_res)
np.testing.assert_allclose(
expect_res.numpy(),
actual_res.to_dense().numpy(),
rtol=1e-05,
equal_nan=True,
)
if not (op == __truediv__ and dtype in ['int32', 'int64']):
actual_res.backward(actual_res)
np.testing.assert_allclose(
dense_x.grad.numpy(),
csr_x.grad.to_dense().numpy(),
rtol=1e-05,
equal_nan=True,
)
np.testing.assert_allclose(
dense_y.grad.numpy(),
csr_y.grad.to_dense().numpy(),
rtol=1e-05,
equal_nan=True,
)
def func_test_coo(self, op):
for sparse_dim in range(len(self.coo_shape) - 1, len(self.coo_shape)):
for dtype in self.support_dtypes:
x = np.random.randint(-255, 255, size=self.coo_shape).astype(
dtype
)
y = np.random.randint(-255, 255, size=self.coo_shape).astype(
dtype
)
dense_x = paddle.to_tensor(x, dtype=dtype, stop_gradient=False)
dense_y = paddle.to_tensor(y, dtype=dtype, stop_gradient=False)
s_dense_x = paddle.to_tensor(
x, dtype=dtype, stop_gradient=False
)
s_dense_y = paddle.to_tensor(
y, dtype=dtype, stop_gradient=False
)
coo_x = s_dense_x.to_sparse_coo(sparse_dim)
coo_y = s_dense_y.to_sparse_coo(sparse_dim)
......@@ -109,18 +117,24 @@ class TestSparseElementWiseAPI(unittest.TestCase):
expect_res = op(dense_x, dense_y)
expect_res.backward(expect_res)
np.testing.assert_allclose(
expect_res.numpy(),
actual_res.to_dense().numpy(),
rtol=1e-05,
equal_nan=True,
)
np.testing.assert_allclose(
dense_x.grad.numpy(),
coo_x.grad.to_dense().numpy(),
rtol=1e-05,
equal_nan=True,
)
np.testing.assert_allclose(
dense_y.grad.numpy(),
coo_y.grad.to_dense().numpy(),
rtol=1e-05,
equal_nan=True,
)
def test_support_dtypes_csr(self):
paddle.device.set_device('cpu')
......@@ -140,38 +154,37 @@ class TestSparseElementWiseAPI(unittest.TestCase):
values2_data = [[1.0], [2.0]]
shape = [2, 4, 2]
sp_a = sparse.sparse_coo_tensor(
indices_data, values1_data, shape, stop_gradient=False
)
sp_b = sparse.sparse_coo_tensor(
indices_data, values2_data, shape, stop_gradient=False
)
values1 = paddle.to_tensor(values1_data, stop_gradient=False)
values2 = paddle.to_tensor(values2_data, stop_gradient=False)
# c.values() = a.values() + b.values()
sp_c = sparse.add(sp_a, sp_b)
sp_c.backward()
ref_c = values1 + values2
ref_c.backward()
np.testing.assert_allclose(sp_c.values().numpy(), ref_c.numpy())
np.testing.assert_allclose(
sp_a.grad.values().numpy(), values1.grad.numpy()
)
np.testing.assert_allclose(
sp_b.grad.values().numpy(), values2.grad.numpy()
)
def test_add_bias(self):
indices_data = [[0, 1], [0, 3]]
values_data = [[1.0, 1.0], [2.0, 2.0]]
shape = [2, 4, 2]
sp_a = sparse.sparse_coo_tensor(
indices_data, values_data, shape, stop_gradient=False
)
bias_values = [1.0, 2.0]
......@@ -179,14 +192,15 @@ class TestSparseElementWiseAPI(unittest.TestCase):
values2 = paddle.to_tensor(bias_values, stop_gradient=False)
values3 = paddle.to_tensor(bias_values, stop_gradient=False)
# c.values() = a.values() + b
sp_c = sparse.add(sp_a, values2)
sp_c.backward()
ref_c = values1 + values3
ref_c.backward()
np.testing.assert_allclose(sp_c.values().numpy(), ref_c.numpy())
np.testing.assert_allclose(
sp_a.grad.values().numpy(), values1.grad.numpy()
)
np.testing.assert_allclose(values2.grad.numpy(), values3.grad.numpy())
......
......@@ -28,7 +28,6 @@ paddle.enable_static()
# Correct: General.
class TestSqueezeOp(OpTest):
def setUp(self):
self.op_type = "squeeze2"
self.python_api = paddle.squeeze
......@@ -40,7 +39,7 @@ class TestSqueezeOp(OpTest):
self.init_attrs()
self.outputs = {
"Out": self.inputs["X"].reshape(self.new_shape),
"XShape": np.random.random(self.ori_shape).astype("float64")
"XShape": np.random.random(self.ori_shape).astype("float64"),
}
def test_check_output(self):
......@@ -60,7 +59,6 @@ class TestSqueezeOp(OpTest):
# Correct: There is mins axis.
class TestSqueezeOp1(TestSqueezeOp):
def init_test_case(self):
self.ori_shape = (1, 20, 1, 5)
self.axes = (0, -2)
......@@ -69,7 +67,6 @@ class TestSqueezeOp1(TestSqueezeOp):
# Correct: No axes input.
class TestSqueezeOp2(TestSqueezeOp):
def init_test_case(self):
self.ori_shape = (1, 20, 1, 5)
self.axes = ()
......@@ -78,7 +75,6 @@ class TestSqueezeOp2(TestSqueezeOp):
# Correct: Just part of axes be squeezed.
class TestSqueezeOp3(TestSqueezeOp):
def init_test_case(self):
self.ori_shape = (6, 1, 5, 1, 4, 1)
self.axes = (1, -1)
......@@ -86,7 +82,6 @@ class TestSqueezeOp3(TestSqueezeOp):
class TestSqueeze2AxesTensor(UnittestBase):
def init_info(self):
self.shapes = [[2, 3, 4]]
self.save_path = os.path.join(self.temp_dir.name, 'squeeze_tensor')
......@@ -123,7 +118,6 @@ class TestSqueeze2AxesTensor(UnittestBase):
class TestSqueeze2AxesTensorList(UnittestBase):
def init_info(self):
self.shapes = [[2, 3, 4]]
self.save_path = os.path.join(self.temp_dir.name, 'squeeze_tensor')
......@@ -140,7 +134,7 @@ class TestSqueeze2AxesTensorList(UnittestBase):
# axes is a list[Variable]
axes = [
paddle.full([1], 0, dtype='int32'),
paddle.full([1], 2, dtype='int32'),
]
out = paddle.squeeze(feat, axes)
out2 = paddle.fluid.layers.squeeze(feat, axes)
......@@ -162,5 +156,37 @@ class TestSqueeze2AxesTensorList(UnittestBase):
self.assertEqual(infer_out.shape, (2, 3, 10))
# test api
class TestSqueezeAPI(unittest.TestCase):
def setUp(self):
self.executed_api()
def executed_api(self):
self.squeeze = paddle.squeeze
def test_api(self):
paddle.disable_static()
input_data = np.random.random([3, 2, 1]).astype("float32")
x = paddle.to_tensor(input_data)
out = self.squeeze(x, axis=2)
out.backward()
self.assertEqual(out.shape, [3, 2])
paddle.enable_static()
def test_error(self):
def test_axes_type():
x2 = paddle.static.data(name="x2", shape=[2, 1, 25], dtype="int32")
self.squeeze(x2, axis=2.1)
self.assertRaises(TypeError, test_axes_type)
class TestSqueezeInplaceAPI(TestSqueezeAPI):
def executed_api(self):
self.squeeze = paddle.squeeze_
if __name__ == "__main__":
unittest.main()
......@@ -12,16 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import re
import unittest
import paddle.version as fluid_version
class VersionTest(unittest.TestCase):
def setUp(self):
self._major_regex = "[0-9]+"
self._minor_regex = "[0-9]+"
......@@ -37,15 +34,20 @@ class VersionTest(unittest.TestCase):
# check version format
if fluid_version.istaged:
self.assertTrue(re.match(self._major_regex, fluid_version.major))
self.assertTrue(re.match(self._minor_regex, fluid_version.minor))
self.assertTrue(re.match(self._patch_regex, fluid_version.patch))
self.assertTrue(re.match(self._rc_regex, fluid_version.rc))
self.assertTrue(
re.match(self._version_regex, fluid_version.full_version)
)
else:
self.assertEqual(fluid_version.major, "0")
self.assertEqual(fluid_version.minor, "0")
self.assertEqual(fluid_version.patch, "0")
self.assertEqual(fluid_version.rc, "0")
self.assertEqual(fluid_version.full_version, "0.0.0")
if __name__ == '__main__':
unittest.main()
......@@ -241,13 +241,13 @@ def send_ue_recv(
src_index (Tensor): An 1-D tensor, and the available data type is int32, int64.
dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`.
The available data type is int32, int64.
message_op (str, optional): Different message ops for x and e, including `add`, `sub`, `mul`, `div`.
reduce_op (str, optional): Different reduce ops, including `sum`, `mean`, `max`, `min`.
Default value is `sum`.
out_size (int|Tensor, optional): We can set `out_size` to get necessary output shape. If not set or
out_size is smaller or equal to 0, then this input will not be used.
Otherwise, `out_size` should be equal with or larger than
max(dst_index) + 1. Default value is `None`.
name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`.
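A hedged usage sketch for this API (editorial, not part of the diff; it assumes the paddle.geometric.send_ue_recv calling convention implied by the parameters documented above, and the positional names x and y for node and edge features are assumptions).

import paddle

# Gather x[src_index], combine with the per-edge feature y via message_op,
# then scatter-reduce onto dst_index with reduce_op.
x = paddle.to_tensor([[0.0, 2.0, 3.0], [1.0, 4.0, 5.0], [2.0, 6.0, 7.0]])
y = paddle.ones([4, 3])                                    # one feature vector per edge
src_index = paddle.to_tensor([0, 1, 2, 0], dtype="int32")
dst_index = paddle.to_tensor([1, 2, 1, 0], dtype="int32")
out = paddle.geometric.send_ue_recv(
    x, y, src_index, dst_index, message_op="add", reduce_op="sum"
)
# out has max(dst_index) + 1 rows when out_size is not set.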
......
......@@ -26,6 +26,7 @@ def reindex_graph(
x, neighbors, count, value_buffer=None, index_buffer=None, name=None
):
"""
Reindex Graph API.
This API is mainly used in Graph Learning domain, which should be used
......@@ -49,12 +50,12 @@ def reindex_graph(
should be the same with `x`.
count (Tensor): The neighbor count of the input nodes `x`. And the
data type should be int32.
value_buffer (Tensor, optional): Value buffer for hashtable. The data type should be int32,
and should be filled with -1. Only useful for gpu version. Default is None.
index_buffer (Tensor, optional): Index buffer for hashtable. The data type should be int32,
and should be filled with -1. Only useful for gpu version.
`value_buffer` and `index_buffer` should be both not None
if you want to speed up by using hashtable buffer. Default is None.
name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`.
......@@ -69,6 +70,7 @@ def reindex_graph(
.. code-block:: python
import paddle
x = [0, 1, 2]
neighbors = [8, 9, 0, 4, 7, 6, 7]
count = [2, 3, 2]
......@@ -138,6 +140,7 @@ def reindex_heter_graph(
x, neighbors, count, value_buffer=None, index_buffer=None, name=None
):
"""
Reindex HeterGraph API.
This API is mainly used in Graph Learning domain, which should be used
......@@ -161,12 +164,12 @@ def reindex_heter_graph(
The data type should be the same with `x`.
count (list|tuple): The neighbor counts of the input nodes `x` from different graphs.
And the data type should be int32.
value_buffer (Tensor, optional): Value buffer for hashtable. The data type should be int32,
and should be filled with -1. Only useful for gpu version. Default is None.
index_buffer (Tensor, optional): Index buffer for hashtable. The data type should be int32,
and should be filled with -1. Only useful for gpu version.
`value_buffer` and `index_buffer` should be both not None
if you want to speed up by using hashtable buffer. Default is None.
name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`.
......@@ -183,6 +186,7 @@ def reindex_heter_graph(
.. code-block:: python
import paddle
x = [0, 1, 2]
neighbors_a = [8, 9, 0, 4, 7, 6, 7]
count_a = [2, 3, 2]
......
......@@ -32,6 +32,7 @@ def sample_neighbors(
name=None,
):
"""
Graph Sample Neighbors API.
This API is mainly used in Graph Learning domain, and the main purpose is to
......@@ -52,16 +53,16 @@ def sample_neighbors(
The data type should be the same with `row`.
input_nodes (Tensor): The input nodes we need to sample neighbors for, and the
data type should be the same with `row`.
sample_size (int, optional): The number of neighbors we need to sample. Default value is -1,
which means returning all the neighbors of the input nodes.
eids (Tensor, optional): The eid information of the input graph. If return_eids is True,
then `eids` should not be None. The data type should be the
same with `row`. Default is None.
return_eids (bool, optional): Whether to return eid information of sample edges. Default is False.
perm_buffer (Tensor, optional): Permutation buffer for fisher-yates sampling. If `use_perm_buffer`
is True, then `perm_buffer` should not be None. The data type should
be the same with `row`. If not None, we will use fisher-yates sampling
to speed up. Only useful for gpu version. Default is None.
name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`.
......@@ -69,15 +70,16 @@ def sample_neighbors(
- out_neighbors (Tensor), the sample neighbors of the input nodes.
- out_count (Tensor), the number of sampling neighbors of each input node, and the shape
should be the same with `input_nodes`.
- out_eids (Tensor), if `return_eids` is True, we will return the eid information of the
sample edges.
Examples:
.. code-block:: python
import paddle
# edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4),
# (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8)
row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7]
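A hedged sketch of calling this API with the graph listed in the comment above (editorial, not part of the diff; the colptr values are derived from those edges in CSC order, and the paddle.geometric module path plus the two-tensor return are assumptions based on the Returns section of this docstring).

import paddle

row = paddle.to_tensor([3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7], dtype="int64")
colptr = paddle.to_tensor([0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13], dtype="int64")
nodes = paddle.to_tensor([0, 8, 1, 2], dtype="int64")
out_neighbors, out_count = paddle.geometric.sample_neighbors(
    row, colptr, nodes, sample_size=2
)
# out_count[i] is the number of neighbors actually drawn for nodes[i].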
......
......@@ -69,8 +69,9 @@ def to_list(value):
def to_numpy(var):
assert isinstance(
var, (Variable, fluid.core.VarBase, fluid.core.eager.Tensor)
), "not a variable"
if isinstance(var, (fluid.core.VarBase, fluid.core.eager.Tensor)):
return var.numpy()
t = global_scope().find_var(var.name).get_tensor()
......@@ -105,10 +106,9 @@ def extract_args(func):
def _all_gather(x, nranks, ring_id=0, use_calc_stream=True):
return collective._c_allgather(
x, nranks, ring_id=ring_id, use_calc_stream=use_calc_stream
)
def wait_server_ready(endpoints):
......@@ -119,7 +119,8 @@ def wait_server_ready(endpoints):
for ep in endpoints:
ip_port = ep.split(":")
with contextlib.closing(
socket.socket(socket.AF_INET, socket.SOCK_STREAM)
) as sock:
sock.settimeout(2)
result = sock.connect_ex((ip_port[0], int(ip_port[1])))
if result != 0:
......@@ -131,8 +132,9 @@ def wait_server_ready(endpoints):
break
def init_communicator(
program, rank, nranks, wait_port, current_endpoint, endpoints
):
if nranks < 2:
return
other_endpoints = endpoints[:]
......@@ -144,53 +146,66 @@ def init_communicator(program, rank, nranks, wait_port, current_endpoint,
nccl_id_var = block.create_var(
name=fluid.unique_name.generate('nccl_id'),
persistable=True,
type=fluid.core.VarDesc.VarType.RAW,
)
block.append_op(
type='c_gen_nccl_id',
inputs={},
outputs={'Out': nccl_id_var},
attrs={
'rank': rank,
'endpoint': current_endpoint,
'other_endpoints': other_endpoints,
},
)
block.append_op(
type='c_comm_init',
inputs={'X': nccl_id_var},
outputs={},
attrs={
'nranks': nranks,
'rank': rank,
'ring_id': 0,
},
)
elif core.is_compiled_with_npu():
hccl_id_var = block.create_var(
name=fluid.unique_name.generate('hccl_id'),
persistable=True,
type=core.VarDesc.VarType.RAW,
)
block.append_op(
type='c_gen_hccl_id',
inputs={},
outputs={'Out': hccl_id_var},
attrs={
'rank': rank,
'endpoint': current_endpoint,
'other_endpoints': other_endpoints,
},
)
block.append_op(
type='c_comm_init_hccl',
inputs={'X': hccl_id_var},
outputs={},
attrs={
'rank': rank,
'ring_id': 0,
'device_id': int(os.getenv("FLAGS_selected_npus")),
'rank_ids': nranks,
},
)
def prepare_distributed_context(place=None):
if place is None:
place = (
fluid.CUDAPlace(ParallelEnv().dev_id)
if ParallelEnv().nranks > 1
else fluid.CUDAPlace(0)
)
place = _get_paddle_place(place)
strategy = fluid.dygraph.parallel.ParallelStrategy()
......@@ -208,9 +223,14 @@ def prepare_distributed_context(place=None):
def _init_context():
communicator_prog = fluid.Program()
init_communicator(
communicator_prog,
strategy.local_rank,
strategy.nranks,
True,
strategy.current_endpoint,
strategy.trainer_endpoints,
)
exe = fluid.Executor(place)
exe.run(communicator_prog)
......@@ -220,7 +240,7 @@ def prepare_distributed_context(place=None):
fluid.enable_dygraph(place)
else:
assert "Only support CUDAPlace for now."
_parallel_context_initialized = True
return strategy
......@@ -246,7 +266,9 @@ def _update_input_info(inputs):
class StaticGraphAdapter(object):
"""
Model training/inference with a static graph.
"""
def __init__(self, model):
......@@ -269,7 +291,7 @@ class StaticGraphAdapter(object):
'eval_total': 0,
'test_total': 0,
'eval_batch': 0,
'test_batch': 0,
}
self._nranks = ParallelEnv().nranks
......@@ -289,10 +311,13 @@ class StaticGraphAdapter(object):
self.model.mode = value
def train_batch(self, inputs, labels=None, update=True):
assert (
self.model._optimizer
), "model not ready, please call `model.prepare()` first"
self.mode = 'train'
assert (
update is True
), "Does not support `update == False` in static mode by now."
return self._run(inputs, labels)
def eval_batch(self, inputs, labels=None):
......@@ -307,7 +332,6 @@ class StaticGraphAdapter(object):
return self.model.network.parameters(*args, **kwargs)
def save(self, path):
def _save(state, path):
if not state:
return
......@@ -331,8 +355,7 @@ class StaticGraphAdapter(object):
# XXX `optimizer.state_dict()` only work in dygraph mode
optim_path = path + ".pdopt"
optim = {
p.name: p for p in filter(is_belong_to_optimizer, prog.list_vars())
}
if not optim:
return
......@@ -348,8 +371,10 @@ class StaticGraphAdapter(object):
# restore parameter states
fluid.core._create_loaded_parameter(
[param for param, state in param_state_pairs],
global_scope(),
executor,
)
for param, state in param_state_pairs:
self._set_var(param, state)
......@@ -377,9 +402,10 @@ class StaticGraphAdapter(object):
# static-graph, since the time of global_step to increase is
# different.
state_val = (
(np.array(converted_state.pop("global_step")) - 1)
if "global_step" in converted_state
else converted_state.pop("@LR_DECAY_COUNTER@", None)
)
if state_val is not None:
converted_state[var.name] = state_val
elif var.name.startswith("learning_rate_"):
......@@ -396,36 +422,61 @@ class StaticGraphAdapter(object):
opt_cls_name = self.model._optimizer.__class__.__name__
opt_unq_name = None
for name in self.model._optimizer._accumulators.keys():
accum_name = (
name
if opt_name is None
else name[len(opt_name) + 1 :]
)
for (
param_name,
state_var,
) in self.model._optimizer._accumulators[name].items():
if opt_unq_name is None:
# can not infer out the exact unique(opt_name),
# thus try to extract rather than generate
for state_key in sorted(
state.keys(),
key=lambda x: len(x),
reverse=True,
):
prefix = (
param_name
+ "_"
+ (
opt_cls_name
if opt_name is None
else opt_name
)
+ "_"
)
if state_key.startswith(prefix):
prefix_offset = state_key[
len(prefix) :
].find("_") + len(prefix)
opt_unq_name = state_key[
len(
param_name + "_"
) : prefix_offset
]
# TODO: assert
# assert opt_unq_name is None
# gen(param.name + "_" + gen(opt_name) + "_" + accum_name)
# always end with "_0" since the unique optimizer._name
dy_state_name = (
param_name
+ "_"
+ opt_unq_name
+ "_"
+ accum_name
+ "_0"
)
converted_state[
state_var.name
] = converted_state.pop(dy_state_name)
assert (
var.name in converted_state
), "variable [{}] is not in optimizer state file".format(var.name)
self._set_var(var, converted_state[var.name])
def _set_var(self, var, ndarray):
......@@ -444,15 +495,17 @@ class StaticGraphAdapter(object):
def _run(self, inputs, labels=None):
compiled_prog = self._compiled_progs.get(self.mode, None)
assert (
compiled_prog
), "Model is not ready, please call `model.prepare()` first"
inputs = to_list(inputs)
if labels is not None:
labels = to_list(labels)
assert len(inputs) == len(self._input_vars[self.mode]), (
"number of inputs"
+ " does not match number of arguments of `forward` method"
)
feed = {}
input_names = [v.name for v in self._input_vars[self.mode]]
......@@ -462,8 +515,10 @@ class StaticGraphAdapter(object):
# train and test may take different arguments
if inputs[idx] is not None:
feed[n] = inputs[idx]
if (
self._amp_level == 'O2'
and input_dtypes[idx] == core.VarDesc.VarType.FP16
):
if isinstance(feed[n], core.LoDTensor):
feed[n] = feed[n]._as_type(core.VarDesc.VarType.FP16)
elif isinstance(feed[n], np.array):
......@@ -491,10 +546,12 @@ class StaticGraphAdapter(object):
else:
pruned_fetch_list.append(fetch_var)
rets = self._executor.run(
compiled_prog,
feed=feed,
fetch_list=pruned_fetch_list,
return_numpy=False,
)
# restore pruned fetch_list Variable from feeds
for i, name in enumerate(pruned_fetch_idx_name_map):
......@@ -510,20 +567,24 @@ class StaticGraphAdapter(object):
metrics = []
for metric, state in zip(self.model._metrics, metric_states):
# cut off padding size
if (
self.mode != 'train'
and self.model._test_dataloader is not None
and isinstance(self.model._test_dataloader, DataLoader)
and self._nranks > 1
):
total_size = len(self.model._test_dataloader.dataset)
# TODO: fixme if have better way to get batch size
samples = state[0].shape[0]
current_count = self._merge_count.get(self.mode + '_total', 0)
if current_count + samples >= total_size:
state = [
s[: int(total_size - current_count), ...] for s in state
]
self._merge_count[self.mode + '_total'] = 0
self._merge_count[self.mode + '_batch'] = int(
total_size - current_count
)
else:
self._merge_count[self.mode + '_total'] += samples
self._merge_count[self.mode + '_batch'] = samples
......@@ -555,8 +616,11 @@ class StaticGraphAdapter(object):
if mode != 'train':
for op in list(prog.global_block().ops):
prog.global_block()._remove_op(0)
if (
mode == 'train'
and self.model._optimizer
and self.model._optimizer._learning_rate_map
):
# HACK workaround learning rate map issue
lr_var = self.model._optimizer._learning_rate_map[self._orig_prog]
new_lr_var = prog.global_block().vars[lr_var.name]
......@@ -594,20 +658,27 @@ class StaticGraphAdapter(object):
dist_strategy.amp = True
dist_strategy.amp_configs = self._amp_configs.copy()
dist_strategy.amp_configs.update(self._amp_custom_lists)
dist_strategy.amp_configs['use_pure_fp16'] = (
self._amp_level == 'O2'
)
self.model._optimizer = fleet.distributed_optimizer(
self.model._optimizer, strategy=dist_strategy
)
elif self._amp_level != "O0" and core.is_compiled_with_cuda:
amp_lists = (
paddle.static.amp.AutoMixedPrecisionLists(
**self._amp_custom_lists
)
if self._amp_custom_lists
else None
)
self.model._optimizer = paddle.static.amp.decorate(
self.model._optimizer,
amp_lists=amp_lists,
use_pure_fp16=self._amp_level == "O2",
use_fp16_guard=self._use_fp16_guard,
**self._amp_configs
)
self.model._optimizer.minimize(self._loss_endpoint)
......@@ -620,7 +691,7 @@ class StaticGraphAdapter(object):
self._endpoints[mode] = {
"output": outputs,
"loss": to_list(losses),
"metric": metrics
"metric": metrics,
}
def _compile_and_initialize(self, prog, mode):
......@@ -628,8 +699,9 @@ class StaticGraphAdapter(object):
if compiled_prog is not None:
return compiled_prog
assert (
self.model._place is not None
), "device is not set, please call `model.prepare()` first"
place = self.model._place
......@@ -642,8 +714,11 @@ class StaticGraphAdapter(object):
uninitialized = []
for var_py in self._startup_prog.list_vars():
var = fluid.global_scope().find_var(var_py.name)
if (
not var_py.name.startswith('nccl_id')
and var
and var.get_tensor()._is_initialized()
):
continue
uninitialized.append(var_py)
......@@ -651,7 +726,10 @@ class StaticGraphAdapter(object):
startup_prog = self._startup_prog._prune(uninitialized)
self._executor.run(startup_prog)
if (
self._amp_level == "O2"
and mode == 'train'
and core.is_compiled_with_cuda()
):
self.model._optimizer.amp_init(place)
......@@ -664,7 +742,6 @@ class StaticGraphAdapter(object):
class DynamicGraphAdapter(object):
def __init__(self, model):
super(DynamicGraphAdapter, self).__init__()
self.model = model
......@@ -674,7 +751,7 @@ class DynamicGraphAdapter(object):
'eval_total': 0,
'test_total': 0,
'eval_batch': 0,
'test_batch': 0,
}
self._input_info = None
......@@ -691,7 +768,8 @@ class DynamicGraphAdapter(object):
stradegy.trainer_endpoints = ParallelEnv().trainer_endpoints
stradegy.current_endpoint = ParallelEnv().current_endpoint
self.ddp_model = fluid.dygraph.parallel.DataParallel(
self.model.network, stradegy)
self.model.network, stradegy
)
@property
def mode(self):
......@@ -703,8 +781,9 @@ class DynamicGraphAdapter(object):
    # TODO: multi-device training in dygraph mode is not implemented at present
def train_batch(self, inputs, labels=None, update=True):
assert self.model._optimizer, \
"model not ready, please call `model.prepare()` first"
assert (
self.model._optimizer
), "model not ready, please call `model.prepare()` first"
self.model.network.train()
self.mode = 'train'
inputs = to_list(inputs)
......@@ -716,9 +795,11 @@ class DynamicGraphAdapter(object):
if self._amp_level != "O0" and self.model._scaler is None:
self.model._scaler = paddle.amp.GradScaler(**self._amp_configs)
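        # A minimal standalone sketch of the same AMP flow used below
        # (``model``, ``opt`` and ``loss_fn`` are illustrative names, not part
        # of this adapter):
        #
        #     scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
        #     with paddle.amp.auto_cast(enable=True, level='O1'):
        #         loss = loss_fn(model(x), y)
        #     scaled = scaler.scale(loss)
        #     scaled.backward()
        #     scaler.minimize(opt, scaled)
        #     opt.clear_grad()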
with paddle.amp.auto_cast(enable=self._amp_level != 'O0',
**self._amp_custom_lists,
level=self._amp_level):
with paddle.amp.auto_cast(
enable=self._amp_level != 'O0',
**self._amp_custom_lists,
level=self._amp_level
):
if self._nranks > 1:
outputs = self.ddp_model(*[to_variable(x) for x in inputs])
else:
......@@ -746,8 +827,11 @@ class DynamicGraphAdapter(object):
m = metric.update(*[to_numpy(m) for m in to_list(metric_outs)])
metrics.append(m)
return ([to_numpy(l) for l in losses], metrics) \
if len(metrics) > 0 else [to_numpy(l) for l in losses]
return (
([to_numpy(l) for l in losses], metrics)
if len(metrics) > 0
else [to_numpy(l) for l in losses]
)
def eval_batch(self, inputs, labels=None):
self.model.network.eval()
......@@ -777,21 +861,25 @@ class DynamicGraphAdapter(object):
metrics = []
for metric in self.model._metrics:
# cut off padding value.
if self.model._test_dataloader is not None and self._nranks > 1 \
and isinstance(self.model._test_dataloader, DataLoader):
if (
self.model._test_dataloader is not None
and self._nranks > 1
and isinstance(self.model._test_dataloader, DataLoader)
):
total_size = len(self.model._test_dataloader.dataset)
samples = outputs[0].shape[0]
current_count = self._merge_count.get(self.mode + '_total', 0)
if current_count + samples >= total_size:
outputs = [
o[:int(total_size - current_count)] for o in outputs
o[: int(total_size - current_count)] for o in outputs
]
labels = [
l[:int(total_size - current_count)] for l in labels
l[: int(total_size - current_count)] for l in labels
]
self._merge_count[self.mode + '_total'] = 0
self._merge_count[self.mode + '_batch'] = int(total_size -
current_count)
self._merge_count[self.mode + '_batch'] = int(
total_size - current_count
)
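                    # Worked example (numbers are hypothetical): with a
                    # 100-sample dataset, current_count == 96 and a padded
                    # batch of 8, only the first 100 - 96 = 4 outputs/labels
                    # are kept above and the running total is reset for the
                    # next evaluation pass.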
else:
self._merge_count[self.mode + '_total'] += samples
self._merge_count[self.mode + '_batch'] = samples
......@@ -858,38 +946,48 @@ class DynamicGraphAdapter(object):
opt_unq_name = ''
opt_cls_name = self.model._optimizer.__class__.__name__
opt_name = opt_unq_name[:opt_unq_name.rfind("_")] # remove suffix idx
opt_name = opt_unq_name[: opt_unq_name.rfind("_")] # remove suffix idx
param_names = [param.name for param in self.model.network.parameters()]
for var_name, state_var in sorted(optim_state.items(),
key=lambda x: len(x[0]),
reverse=True):
for var_name, state_var in sorted(
optim_state.items(), key=lambda x: len(x[0]), reverse=True
):
if var_name in ["@LR_DECAY_COUNTER@", "global_step"]:
                # NOTE: the global_step saved in dygraph is 1 larger than that
                # in static graph, since the point at which global_step is
                # incremented differs.
if var_name == "@LR_DECAY_COUNTER@":
converted_state["global_step"] = np.array(
converted_state.pop("@LR_DECAY_COUNTER@")) + 1
converted_state["global_step"] = (
np.array(converted_state.pop("@LR_DECAY_COUNTER@")) + 1
)
else:
# moment and other accumulators
                # extend the state dict to include the possible dygraph names
for param_name in param_names:
if var_name.startswith(param_name + "_" + opt_name):
# when init optimizer with name
accum_name = var_name[len(param_name + "_" + opt_name +
"_"):]
elif var_name.startswith(param_name +
"_") and opt_name == opt_cls_name:
accum_name = var_name[
len(param_name + "_" + opt_name + "_") :
]
elif (
var_name.startswith(param_name + "_")
and opt_name == opt_cls_name
):
# when init optimizer without name
accum_name = var_name[len(param_name + "_"):]
accum_name = var_name[len(param_name + "_") :]
else:
continue
# remove suffix idx
accum_name = accum_name[:accum_name.rfind("_")]
accum_name = accum_name[: accum_name.rfind("_")]
# state names always end with "_0" in dygraph because of the
# unique optimizer._name
dy_state_name = (param_name + "_" + opt_unq_name + "_" +
accum_name + "_0")
dy_state_name = (
param_name
+ "_"
+ opt_unq_name
+ "_"
+ accum_name
+ "_0"
)
converted_state[dy_state_name] = state_var
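                    # Rough sketch of this mapping (identifiers are
                    # hypothetical): a static-graph key such as
                    # "<param>_<opt_name>_moment1_0" loses its trailing index,
                    # and its accumulator part ("moment1") is re-keyed as
                    # "<param>_<opt_unq_name>_moment1_0", the form the dygraph
                    # optimizer's set_state_dict expects.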
if not hasattr(self.model._optimizer, 'set_state_dict'):
......@@ -901,18 +999,23 @@ class DynamicGraphAdapter(object):
self.model._optimizer.set_state_dict(converted_state)
def prepare(self):
if self._amp_level == "O2" and self.model.mode == 'train' and core.is_compiled_with_cuda(
if (
self._amp_level == "O2"
and self.model.mode == 'train'
and core.is_compiled_with_cuda()
):
self.model.network, self.model._optimizer = paddle.amp.decorate(
models=self.model.network,
optimizers=self.model._optimizer,
level='O2')
level='O2',
)
if self._amp_level != "O0":
self.model._scaler = None
class Model(object):
"""
    A Model object is a network with training and inference features.
Dynamic graph and static graph are supported at the same time,
switched by `paddle.enable_static()`. The usage is as follows.
......@@ -920,7 +1023,7 @@ class Model(object):
    instantiating a Model. The input description, i.e., paddle.static.InputSpec,
    is required for the static graph.
When training on GPU, auto mixed precision (AMP O1) and pure float16
When training on GPU, auto mixed precision (AMP O1) and pure float16
(AMP O2) training are both supported in static mode and dynamic mode.
In static graph mode, before training with pure float16 (AMP O2),
`multi_precision` could be set to True when creating optimizer, which can
......@@ -965,7 +1068,7 @@ class Model(object):
# inputs and labels are not required for dynamic graph.
input = InputSpec([None, 784], 'float32', 'x')
label = InputSpec([None, 1], 'int64', 'label')
model = paddle.Model(net, input, label)
optim = paddle.optimizer.SGD(learning_rate=1e-3,
parameters=model.parameters())
......@@ -1053,16 +1156,17 @@ class Model(object):
def train_batch(self, inputs, labels=None, update=True):
"""
        Run one training step on a batch of data. The `update` argument
        indicates whether the optimizer updates parameters with the gradients
        computed from this batch.
Args:
inputs (numpy.ndarray|Tensor|list): Batch of input data. It could
be a numpy array or paddle.Tensor, or a list of arrays or
inputs (numpy.ndarray|Tensor|list): Batch of input data. It could
be a numpy array or paddle.Tensor, or a list of arrays or
tensors (in case the model has multiple inputs).
labels (numpy.ndarray|Tensor|list, optional): Batch of labels. It could be
a numpy array or paddle.Tensor, or a list of arrays or tensors
(in case the model has multiple labels). If has no labels,
labels (numpy.ndarray|Tensor|list, optional): Batch of labels. It could be
a numpy array or paddle.Tensor, or a list of arrays or tensors
                (in case the model has multiple labels). If the model has no
                labels, set it to None. Default: None.
update (bool, optional): Whether update parameters after loss.backward() computing.
Set it to False to accumulate gradients. Default: True.
......@@ -1075,7 +1179,7 @@ class Model(object):
Examples:
.. code-block:: python
import paddle
import paddle.nn as nn
from paddle.static import InputSpec
......@@ -1098,6 +1202,7 @@ class Model(object):
loss = model.train_batch([data], [label])
print(loss)
# [array([2.192784], dtype=float32)]
"""
loss = self._adapter.train_batch(inputs, labels, update)
if fluid._non_static_mode() and self._input_info is None:
......@@ -1107,15 +1212,16 @@ class Model(object):
@no_grad()
def eval_batch(self, inputs, labels=None):
"""
        Run one evaluation step on a batch of data.
Args:
inputs (numpy.ndarray|Tensor|list): Batch of input data. It could
be a numpy array or paddle.Tensor, or a list of arrays or
inputs (numpy.ndarray|Tensor|list): Batch of input data. It could
be a numpy array or paddle.Tensor, or a list of arrays or
tensors (in case the model has multiple inputs).
labels (numpy.ndarray|Tensor|list, optional): Batch of labels. It could be
a numpy array or paddle.Tensor, or a list of arrays or tensors
(in case the model has multiple labels). If has no labels,
labels (numpy.ndarray|Tensor|list, optional): Batch of labels. It could be
a numpy array or paddle.Tensor, or a list of arrays or tensors
                (in case the model has multiple labels). If the model has no
                labels, set it to None. Default: None.
Returns:
......@@ -1150,6 +1256,7 @@ class Model(object):
loss, acc = model.eval_batch([data], [label])
print(loss, acc)
# [array([2.8825705], dtype=float32)] [0.0]
"""
loss = self._adapter.eval_batch(inputs, labels)
if fluid._non_static_mode() and self._input_info is None:
......@@ -1159,11 +1266,12 @@ class Model(object):
@no_grad()
def predict_batch(self, inputs):
"""
        Run one prediction step on a batch of data.
Args:
inputs (numpy.ndarray|Tensor|list): Batch of input data. It could
be a numpy array or paddle.Tensor, or a list of arrays or
inputs (numpy.ndarray|Tensor|list): Batch of input data. It could
be a numpy array or paddle.Tensor, or a list of arrays or
tensors (in case the model has multiple inputs).
Returns:
......@@ -1179,7 +1287,7 @@ class Model(object):
from paddle.static import InputSpec
device = paddle.set_device('cpu') # or 'gpu'
input = InputSpec([None, 784], 'float32', 'x')
label = InputSpec([None, 1], 'int64', 'label')
......@@ -1197,6 +1305,7 @@ class Model(object):
# [array([[0.08189095, 0.16740078, 0.06889386, 0.05085445, 0.10729759,
# 0.02217775, 0.14518553, 0.1591538 , 0.01808308, 0.17906217]],
# dtype=float32)]
"""
loss = self._adapter.predict_batch(inputs)
if fluid._non_static_mode() and self._input_info is None:
......@@ -1204,12 +1313,13 @@ class Model(object):
return loss
def save(self, path, training=True):
"""
This function saves parameters, optimizer information or model and
"""
This function saves parameters, optimizer information or model and
        parameters only for inference to `path`. It depends on the parameter
`training`.
If `training` is set to True, the parameters saved contain all
If `training` is set to True, the parameters saved contain all
        the trainable Variables, and will be saved to a file with the suffix ".pdparams".
The optimizer information contains all the variable used by optimizer.
For Adam optimizer, contains beta1, beta2, momentum etc. All the
......@@ -1268,10 +1378,11 @@ class Model(object):
T.Normalize([127.5], [127.5])
])
data = paddle.vision.datasets.MNIST(mode='train', transform=transform)
model.fit(data, epochs=1, batch_size=32, verbose=0)
model.save('checkpoint/test') # save for training
model.save('inference_model', False) # save for inference
"""
if ParallelEnv().local_rank == 0:
......@@ -1282,6 +1393,7 @@ class Model(object):
def load(self, path, skip_mismatch=False, reset_optimizer=False):
"""
Load from files storing the model states and optimizer states. The file
        for optimizer states is not necessary if there is no need to restore the optimizer.
......@@ -1329,6 +1441,7 @@ class Model(object):
model.save('checkpoint/test')
model.load('checkpoint/test')
"""
def _load_state_from_path(path):
......@@ -1341,17 +1454,24 @@ class Model(object):
state = param_state.get(key, None)
if state is None:
raise ValueError(
"{} is not found in the providing file.".format(key))
"{} is not found in the providing file.".format(key)
)
if list(state.shape) != list(param.shape):
raise ValueError(
"{} receives a shape {}, but the expected shape is {}.".
format(key, list(state.shape), list(param.shape)))
"{} receives a shape {}, but the expected shape is {}.".format(
key, list(state.shape), list(param.shape)
)
)
return param, state
def _strip_postfix(path):
path, ext = os.path.splitext(path)
assert ext in ['', '.pdparams', '.pdopt', '.pdmodel'], \
"Unknown postfix {} from weights".format(ext)
assert ext in [
'',
'.pdparams',
'.pdopt',
'.pdmodel',
], "Unknown postfix {} from weights".format(ext)
return path
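            # e.g. _strip_postfix('checkpoint/test.pdparams') returns
            # 'checkpoint/test', while an unknown extension such as '.bin'
            # would trip the assert above.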
path = _strip_postfix(path)
......@@ -1365,15 +1485,17 @@ class Model(object):
except ValueError as err:
if skip_mismatch:
warnings.warn(
("Skip loading for {}. ".format(key) + str(err)))
("Skip loading for {}. ".format(key) + str(err))
)
# reset optimizer when mismatch happens
reset_optimizer = True
else:
raise err
matched_param_state.append(match_res)
optim_state = None if reset_optimizer else _load_state_from_path(
path + ".pdopt")
optim_state = (
None if reset_optimizer else _load_state_from_path(path + ".pdopt")
)
# TODO: support save/load scaler state in static graph
if _non_static_mode():
......@@ -1382,13 +1504,15 @@ class Model(object):
if os.path.exists(path + '.pdscaler'):
scaler_state = paddle.load(path + '.pdscaler')
return self._adapter.load(matched_param_state, optim_state,
scaler_state)
return self._adapter.load(
matched_param_state, optim_state, scaler_state
)
else:
return self._adapter.load(matched_param_state, optim_state)
def parameters(self, *args, **kwargs):
"""
Returns a list of parameters of the model.
Returns:
......@@ -1398,30 +1522,32 @@ class Model(object):
Examples:
.. code-block:: python
import paddle
import paddle.nn as nn
from paddle.static import InputSpec
input = InputSpec([None, 784], 'float32', 'x')
model = paddle.Model(nn.Sequential(
nn.Linear(784, 200),
nn.Tanh(),
nn.Linear(200, 10)), input)
params = model.parameters()
"""
return self._adapter.parameters()
def _prepare_amp(self, amp_configs):
def _check_pure_fp16_configs():
# pure float16 training has some restricts now
if self._adapter._amp_level == "O2" and self._optimizer._grad_clip:
# clip by value is not supported
assert isinstance(self._optimizer._grad_clip, (paddle.nn.ClipGradByGlobalNorm, paddle.nn.ClipGradByNorm)), \
"Only GradientClipByNorm and GradientClipByGlobalNorm are supported in amp training with level=O2 currently."
assert isinstance(
self._optimizer._grad_clip,
(paddle.nn.ClipGradByGlobalNorm, paddle.nn.ClipGradByNorm),
), "Only GradientClipByNorm and GradientClipByGlobalNorm are supported in amp training with level=O2 currently."
self._adapter._amp_custom_lists = {}
self._adapter._amp_configs = {}
......@@ -1433,7 +1559,8 @@ class Model(object):
elif isinstance(amp_configs, str):
if amp_configs not in ('O0', 'O1', 'O2'):
raise ValueError(
"The level of amp_configs should be 'O0', 'O1' or 'O2'.")
"The level of amp_configs should be 'O0', 'O1' or 'O2'."
)
self._adapter._amp_level = amp_configs
_check_pure_fp16_configs()
return
......@@ -1442,7 +1569,8 @@ class Model(object):
self._adapter._amp_level = 'O1'
elif amp_configs['level'] not in ('O0', 'O1', 'O2'):
raise ValueError(
"amp_configs['level'] should be 'O0', 'O1' or 'O2'.")
"amp_configs['level'] should be 'O0', 'O1' or 'O2'."
)
else:
self._adapter._amp_level = amp_configs['level']
amp_config_key_set = set(amp_configs.keys()) - {'level'}
......@@ -1459,12 +1587,14 @@ class Model(object):
# construct amp_custom_lists
if self._adapter._amp_level != 'O0' and amp_config_key_set:
for param_name in [
'custom_white_list', 'custom_black_list',
'custom_black_varnames'
'custom_white_list',
'custom_black_list',
'custom_black_varnames',
]:
if param_name in amp_config_key_set:
self._adapter._amp_custom_lists[param_name] = amp_configs[
param_name]
param_name
]
amp_config_key_set -= {param_name}
def _check_amp_configs(amp_config_key_set):
......@@ -1479,13 +1609,16 @@ class Model(object):
}
if amp_config_key_set - accepted_param_set:
raise ValueError(
"Except for 'level', the keys of 'amp_configs' must be accepted by mixed precision APIs, but {} could not be recognized."
.format(tuple(amp_config_key_set - accepted_param_set)))
"Except for 'level', the keys of 'amp_configs' must be accepted by mixed precision APIs, but {} could not be recognized.".format(
tuple(amp_config_key_set - accepted_param_set)
)
)
if 'use_fp16_guard' in amp_config_key_set:
if _non_static_mode():
raise ValueError(
"'use_fp16_guard' is supported in static mode only.")
"'use_fp16_guard' is supported in static mode only."
)
self._adapter._use_fp16_guard = amp_configs['use_fp16_guard']
amp_config_key_set.remove('use_fp16_guard')
......@@ -1495,12 +1628,11 @@ class Model(object):
for key in amp_configs_set:
self._adapter._amp_configs[key] = amp_configs[key]
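        # A hedged usage sketch (the keys shown are typical, not exhaustive):
        #
        #     model.prepare(
        #         optimizer,
        #         loss,
        #         amp_configs={
        #             'level': 'O1',
        #             'custom_white_list': {'elementwise_add'},
        #             'init_loss_scaling': 1024,
        #         },
        #     )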
def prepare(self,
optimizer=None,
loss=None,
metrics=None,
amp_configs=None):
def prepare(
self, optimizer=None, loss=None, metrics=None, amp_configs=None
):
"""
        Configures the model before running.
Args:
......@@ -1532,6 +1664,7 @@ class Model(object):
Returns:
None
"""
self._place = _get_device()
if isinstance(self._place, fluid.CUDAPlace):
......@@ -1539,15 +1672,17 @@ class Model(object):
if ParallelEnv().nranks > 1 and not _parallel_context_initialized:
if fluid._non_static_mode():
main_prog_seed = fluid.default_main_program().random_seed
startup_prog_seed = fluid.default_startup_program(
).random_seed
startup_prog_seed = (
fluid.default_startup_program().random_seed
)
fluid.disable_dygraph()
paddle.disable_static(self._place)
# enable_dygraph would create and switch to a new program,
# thus also copy seed to the new program
fluid.default_main_program().random_seed = main_prog_seed
fluid.default_startup_program(
).random_seed = startup_prog_seed
fluid.default_startup_program().random_seed = (
startup_prog_seed
)
else:
prepare_distributed_context(self._place)
_parallel_context_initialized = True
......@@ -1562,43 +1697,46 @@ class Model(object):
metrics = metrics or []
for metric in to_list(metrics):
assert isinstance(metric, Metric), \
"{} is not sub class of Metric".format(
metric.__class__.__name__)
assert isinstance(
metric, Metric
), "{} is not sub class of Metric".format(metric.__class__.__name__)
self._metrics = to_list(metrics)
self._prepare_amp(amp_configs)
self._adapter.prepare()
def fit(self,
train_data=None,
eval_data=None,
batch_size=1,
epochs=1,
eval_freq=1,
log_freq=10,
save_dir=None,
save_freq=1,
verbose=2,
drop_last=False,
shuffle=True,
num_workers=0,
callbacks=None,
accumulate_grad_batches=1,
num_iters=None):
def fit(
self,
train_data=None,
eval_data=None,
batch_size=1,
epochs=1,
eval_freq=1,
log_freq=10,
save_dir=None,
save_freq=1,
verbose=2,
drop_last=False,
shuffle=True,
num_workers=0,
callbacks=None,
accumulate_grad_batches=1,
num_iters=None,
):
"""
Trains the model for a fixed number of epochs. If `eval_data` is set,
evaluation will be done at the end of each epoch.
Args:
train_data (Dataset|DataLoader, optional): An iterable data loader is used for
train. An instance of paddle paddle.io.Dataset or
train_data (Dataset|DataLoader, optional): An iterable data loader is used for
                train. An instance of paddle.io.Dataset or
                paddle.io.DataLoader is recommended. Default: None.
eval_data (Dataset|DataLoader, optional): An iterable data loader is used for
evaluation at the end of epoch. If None, will not do evaluation.
An instance of paddle.io.Dataset or paddle.io.Dataloader
                evaluation at the end of each epoch. If None, no evaluation
                will be done. An instance of paddle.io.Dataset or
                paddle.io.DataLoader is recommended. Default: None.
batch_size (int, optional): The batch size of train_data and eval_data. When
batch_size (int, optional): The batch size of train_data and eval_data. When
train_data and eval_data are both the instance of Dataloader, this
parameter will be ignored. Default: 1.
epochs (int, optional): The number of epochs to train the model. Default: 1.
......@@ -1626,7 +1764,7 @@ class Model(object):
callbacks (Callback|None, optional): A list of `Callback` instances to apply
during training. If None, :ref:`api_paddle_callbacks_ProgBarLogger` and
:ref:`api_paddle_callbacks_ModelCheckpoint` are automatically inserted. Default: None.
accumulate_grad_batches (int, optional): The number of batches to accumulate gradident
            accumulate_grad_batches (int, optional): The number of batches over which to accumulate gradients
                during training before the optimizer updates. It can mimic a large batch
size. Default: 1.
            num_iters (int|None, optional): The number of iterations to train the model.
......@@ -1641,7 +1779,7 @@ class Model(object):
        Batching is done internally.
.. code-block:: python
:name: code-example1
:name: code-example3
import paddle
import paddle.vision.transforms as T
......@@ -1681,7 +1819,7 @@ class Model(object):
DataLoader.
.. code-block:: python
:name: code-example2
:name: code-example4
import paddle
import paddle.vision.transforms as T
......@@ -1691,7 +1829,7 @@ class Model(object):
dynamic = True
if not dynamic:
paddle.enable_static()
transform = T.Compose([
T.Transpose(),
T.Normalize([127.5], [127.5])
......@@ -1718,31 +1856,38 @@ class Model(object):
val_loader,
epochs=2,
save_dir='mnist_checkpoint')
"""
assert train_data is not None, \
"train_data must be given!"
assert train_data is not None, "train_data must be given!"
if isinstance(train_data, Dataset):
train_sampler = DistributedBatchSampler(train_data,
batch_size=batch_size,
shuffle=shuffle,
drop_last=drop_last)
train_loader = DataLoader(train_data,
batch_sampler=train_sampler,
places=self._place,
num_workers=num_workers,
return_list=True)
train_sampler = DistributedBatchSampler(
train_data,
batch_size=batch_size,
shuffle=shuffle,
drop_last=drop_last,
)
train_loader = DataLoader(
train_data,
batch_sampler=train_sampler,
places=self._place,
num_workers=num_workers,
return_list=True,
)
else:
train_loader = train_data
if eval_data is not None and isinstance(eval_data, Dataset):
eval_sampler = DistributedBatchSampler(eval_data,
batch_size=batch_size)
eval_loader = DataLoader(eval_data,
batch_sampler=eval_sampler,
places=self._place,
num_workers=num_workers,
return_list=True)
eval_sampler = DistributedBatchSampler(
eval_data, batch_size=batch_size
)
eval_loader = DataLoader(
eval_data,
batch_sampler=eval_sampler,
places=self._place,
num_workers=num_workers,
return_list=True,
)
elif eval_data is not None:
eval_loader = eval_data
else:
......@@ -1755,8 +1900,11 @@ class Model(object):
steps = self._len_data_loader(train_loader)
self.num_iters = num_iters
if num_iters is not None and isinstance(num_iters, int) and isinstance(
steps, int):
if (
num_iters is not None
and isinstance(num_iters, int)
and isinstance(steps, int)
):
assert num_iters > 0, "num_iters must be greater than 0!"
epochs = (num_iters // steps) + 1
steps = min(num_iters, steps)
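            # e.g. num_iters=120 with 50 steps per epoch gives epochs=3 and
            # steps=50; self.num_iters is kept so the run loop can stop once
            # 120 iterations have been reached.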
......@@ -1784,10 +1932,10 @@ class Model(object):
if do_eval and epoch % eval_freq == 0:
eval_steps = self._len_data_loader(eval_loader)
cbks.on_begin('eval', {
'steps': eval_steps,
'metrics': self._metrics_name()
})
cbks.on_begin(
'eval',
{'steps': eval_steps, 'metrics': self._metrics_name()},
)
eval_logs = self._run_one_epoch(eval_loader, cbks, 'eval')
......@@ -1798,20 +1946,22 @@ class Model(object):
cbks.on_end('train', logs)
self._test_dataloader = None
def evaluate(self,
eval_data,
batch_size=1,
log_freq=10,
verbose=2,
num_workers=0,
callbacks=None,
num_iters=None):
def evaluate(
self,
eval_data,
batch_size=1,
log_freq=10,
verbose=2,
num_workers=0,
callbacks=None,
num_iters=None,
):
"""
        Evaluate the loss and metrics of the model on the input dataset.
Args:
eval_data (Dataset|DataLoader): An iterable data loader is used for
evaluation. An instance of paddle.io.Dataset or
evaluation. An instance of paddle.io.Dataset or
                paddle.io.DataLoader is recommended.
batch_size (int, optional): The batch size of train_data and eval_data.
When eval_data is the instance of Dataloader, this argument will be
......@@ -1859,13 +2009,16 @@ class Model(object):
"""
if eval_data is not None and isinstance(eval_data, Dataset):
eval_sampler = DistributedBatchSampler(eval_data,
batch_size=batch_size)
eval_loader = DataLoader(eval_data,
batch_sampler=eval_sampler,
places=self._place,
num_workers=num_workers,
return_list=True)
eval_sampler = DistributedBatchSampler(
eval_data, batch_size=batch_size
)
eval_loader = DataLoader(
eval_data,
batch_sampler=eval_sampler,
places=self._place,
num_workers=num_workers,
return_list=True,
)
else:
eval_loader = eval_data
......@@ -1881,15 +2034,17 @@ class Model(object):
eval_steps = self._len_data_loader(eval_loader)
self.num_iters = num_iters
if num_iters is not None and isinstance(num_iters, int) and isinstance(
eval_steps, int):
if (
num_iters is not None
and isinstance(num_iters, int)
and isinstance(eval_steps, int)
):
assert num_iters > 0, "num_iters must be greater than 0!"
eval_steps = min(num_iters, eval_steps)
self.num_iters = eval_steps
cbks.on_begin('eval', {
'steps': eval_steps,
'metrics': self._metrics_name()
})
cbks.on_begin(
'eval', {'steps': eval_steps, 'metrics': self._metrics_name()}
)
logs = self._run_one_epoch(eval_loader, cbks, 'eval')
......@@ -1903,13 +2058,15 @@ class Model(object):
return eval_result
def predict(self,
test_data,
batch_size=1,
num_workers=0,
stack_outputs=False,
verbose=1,
callbacks=None):
def predict(
self,
test_data,
batch_size=1,
num_workers=0,
stack_outputs=False,
verbose=1,
callbacks=None,
):
"""
Compute the output predictions on testing data.
......@@ -1919,7 +2076,7 @@ class Model(object):
                is recommended.
batch_size (int, optional): The batch size of test_data. When test_data is the
instance of Dataloader, this argument will be ignored. Default: 1.
num_workers (int, optional): The number of subprocess to load data, 0 for no subprocess
            num_workers (int, optional): The number of subprocesses used to load data; 0 means no subprocess
                is used and data is loaded in the main process. When test_data is an instance of DataLoader,
this argument will be ignored. Default: 0.
stack_outputs (bool, optional): Whether stack output field like a batch, as for an output
......@@ -1980,13 +2137,16 @@ class Model(object):
"""
if test_data is not None and isinstance(test_data, Dataset):
test_sampler = DistributedBatchSampler(test_data,
batch_size=batch_size)
test_loader = DataLoader(test_data,
batch_sampler=test_sampler,
places=self._place,
num_workers=num_workers,
return_list=True)
test_sampler = DistributedBatchSampler(
test_data, batch_size=batch_size
)
test_loader = DataLoader(
test_data,
batch_sampler=test_sampler,
places=self._place,
num_workers=num_workers,
return_list=True,
)
else:
test_loader = test_data
......@@ -2036,7 +2196,8 @@ class Model(object):
if self._is_shape_inferred:
warnings.warn(
"'inputs' was not specified when Model initialization, so the input shape to be saved will be the shape derived from the user's actual inputs. The input shape to be saved is %s. For saving correct input shapes, please provide 'inputs' for Model initialization."
% self._input_info[0])
% self._input_info[0]
)
paddle.jit.save(layer, path, input_spec=self._inputs)
......@@ -2047,7 +2208,8 @@ class Model(object):
raise ValueError(
"The input path MUST be format of dirname/file_prefix "
"[dirname\\file_prefix in Windows system], but received "
"file_prefix is empty string.")
"file_prefix is empty string."
)
dirname = os.path.dirname(path)
if dirname and not os.path.exists(dirname):
......@@ -2058,21 +2220,24 @@ class Model(object):
params_filename = file_prefix + INFER_PARAMS_SUFFIX
prog = self._adapter._progs.get('test', None)
assert prog, \
"Model is not ready, please call `model.prepare()` first"
assert (
prog
), "Model is not ready, please call `model.prepare()` first"
infer_prog = prog.clone(for_test=True)
input_names = [v.name for v in self._adapter._input_vars['test']]
endpoints = self._adapter._endpoints['test']['output']
fluid.io.save_inference_model(model_path,
input_names,
endpoints,
self._adapter._executor,
main_program=infer_prog,
model_filename=model_filename,
params_filename=params_filename)
fluid.io.save_inference_model(
model_path,
input_names,
endpoints,
self._adapter._executor,
main_program=infer_prog,
model_filename=model_filename,
params_filename=params_filename,
)
def _run_one_epoch(
self,
......@@ -2098,16 +2263,21 @@ class Model(object):
# LoDTensor.shape is callable, where LoDTensor comes from
# DataLoader in static graph
batch_size = data[0].shape()[0] if callable(
data[0].shape) else data[0].shape[0]
batch_size = (
data[0].shape()[0]
if callable(data[0].shape)
else data[0].shape[0]
)
callbacks.on_batch_begin(mode, step, logs)
if mode != 'predict':
_inputs = [data[:len(self._inputs)], data[len(self._inputs):]]
_inputs = [data[: len(self._inputs)], data[len(self._inputs) :]]
if mode == 'train':
_inputs.append((step + 1) % self._accumulate == 0
or step + 1 == len(data_loader))
_inputs.append(
(step + 1) % self._accumulate == 0
or step + 1 == len(data_loader)
)
outs = getattr(self, mode + '_batch')(*_inputs)
......@@ -2128,15 +2298,17 @@ class Model(object):
logs[k] = v
else:
if self._inputs is not None:
outs = self.predict_batch(data[:len(self._inputs)])
outs = self.predict_batch(data[: len(self._inputs)])
else:
outs = self.predict_batch(data)
outputs.append(outs)
logs['step'] = step
if mode == 'train' or self._adapter._merge_count.get(
mode + '_batch', 0) <= 0:
if (
mode == 'train'
or self._adapter._merge_count.get(mode + '_batch', 0) <= 0
):
logs['batch_size'] = batch_size * ParallelEnv().nranks
else:
logs['batch_size'] = self._adapter._merge_count[mode + '_batch']
......@@ -2158,10 +2330,10 @@ class Model(object):
"""Prints a string summary of the network.
Args:
input_size (tuple|InputSpec|list[tuple|InputSpec], optional): size of input tensor.
if not set, input_size will get from ``self._inputs`` if network only have
one input, input_size can be tuple or InputSpec. if model have multiple
input, input_size must be a list which contain every input's shape.
            input_size (tuple|InputSpec|list[tuple|InputSpec], optional): size of the input tensor.
                If not set, input_size will be taken from ``self._inputs``. If the network has only
                one input, input_size can be a tuple or an InputSpec. If the model has multiple
                inputs, input_size must be a list containing every input's shape.
Default: None.
            dtype (str, optional): If dtype is None, 'float32' will be used. Default: None.
......@@ -2190,8 +2362,9 @@ class Model(object):
# {'total_params': 61610, 'trainable_params': 61610}
"""
assert (input_size is not None or self._inputs
is not None), "'input_size' or 'self._input' must be set"
assert (
input_size is not None or self._inputs is not None
), "'input_size' or 'self._input' must be set"
if input_size is not None:
_input_size = input_size
else:
......@@ -2208,7 +2381,10 @@ class Model(object):
if is_input:
arg_names = extract_args(self.network.forward)[1:]
# While Saving inference model in dygraph, and providing inputs only in running.
if shapes is not None and dtypes is not None and fluid._non_static_mode(
if (
shapes is not None
and dtypes is not None
and fluid._non_static_mode()
):
out_specs = [
Input(name=n, dtype=dtypes[i], shape=shapes[i])
......@@ -2221,7 +2397,8 @@ class Model(object):
elif isinstance(specs, dict):
assert is_input is False
out_specs = [
specs[n] for n in extract_args(self.network.forward)
specs[n]
for n in extract_args(self.network.forward)
if n != 'self'
]
else:
......@@ -2232,8 +2409,10 @@ class Model(object):
assert isinstance(spec, Input)
if spec.name is None:
raise ValueError(
"Requires Input[{}].name != None, but receive `None` with {}."
.format(i, spec))
"Requires Input[{}].name != None, but receive `None` with {}.".format(
i, spec
)
)
return out_specs
......@@ -2258,6 +2437,7 @@ class Model(object):
"Update self._inputs according to given inputs."
self._input_info = self._adapter._input_info
if self._input_info is not None and len(self._input_info) == 2:
self._inputs = self._verify_spec(None, self._input_info[0],
self._input_info[1], True)
self._inputs = self._verify_spec(
None, self._input_info[0], self._input_info[1], True
)
self._is_shape_inferred = True
......@@ -284,9 +284,11 @@ def fused_bias_dropout_residual_layer_norm(
name=None,
):
r"""
The fused_bias_dropout_residual_layer_norm operator. The pseudo code is as follows:
.. code-block:: python
y = layer_norm(residual + dropout(bias + x))
Parameters:
......@@ -315,10 +317,9 @@ def fused_bias_dropout_residual_layer_norm(
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
Returns:
Tensor: The output Tensor, the data type and shape is same as `x`.
        Tensor, the output Tensor; its data type and shape are the same as `x`.
Examples:
.. code-block:: python
# required: gpu
......@@ -336,6 +337,7 @@ def fused_bias_dropout_residual_layer_norm(
x, residual, bias)
# [2, 4, 128]
print(output.shape)
"""
seed = None
if mode not in ('downscale_in_infer', 'upscale_in_train'):
......@@ -16,7 +16,10 @@ from paddle.incubate.nn import functional as incubate_f
from paddle.nn import Layer
from paddle.framework import ParamAttr
import paddle
from paddle.nn.layer.transformer import _convert_attention_mask, _convert_param_attr_to_list
from paddle.nn.layer.transformer import (
_convert_attention_mask,
_convert_param_attr_to_list,
)
from paddle.nn.initializer import Constant
from paddle.fluid.dygraph import no_grad
from paddle.fluid.framework import convert_np_dtype_to_dtype_, _non_static_mode
......@@ -51,7 +54,8 @@ def _to_dtype(t, dtype):
if t.place.is_gpu_place():
size_dtype = core.size_of_dtype(dtype)
waiting_alloc_memory = (
(np.prod(t.shape) * size_dtype) / 256 + 1) * 256 * 1.2
((np.prod(t.shape) * size_dtype) / 256 + 1) * 256 * 1.2
)
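        # e.g. moving a tensor of 1e6 float16 elements (~2 MB of data):
        # (2e6 / 256 + 1) * 256 * 1.2 ≈ 2.4 MB must be available on the GPU,
        # i.e. roughly the raw size padded to a 256-byte multiple plus 20%
        # headroom.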
gpu_memory_available = core.gpu_memory_available()
if gpu_memory_available < waiting_alloc_memory:
t_used = t._copy_to(paddle.CPUPlace(), False)
......@@ -106,31 +110,38 @@ class FusedBiasDropoutResidualLayerNorm(Layer):
output = fused_bias_dropout_residual_ln(x, residual) # [2, 4, 128]
"""
def __init__(self,
embed_dim,
dropout_rate=0.5,
weight_attr=None,
bias_attr=None,
epsilon=1e-5,
name=None):
def __init__(
self,
embed_dim,
dropout_rate=0.5,
weight_attr=None,
bias_attr=None,
epsilon=1e-5,
name=None,
):
super(FusedBiasDropoutResidualLayerNorm, self).__init__()
assert embed_dim > 0, ("Expected embed_dim to be greater than 0, "
"but recieved {}".format(embed_dim))
assert embed_dim > 0, (
"Expected embed_dim to be greater than 0, "
"but recieved {}".format(embed_dim)
)
self._dtype = self._helper.get_default_dtype()
self._bias_attr = bias_attr
self._weight_attr = weight_attr
self.embed_dim = embed_dim
self.linear_bias = self.create_parameter(shape=[embed_dim],
attr=self._bias_attr,
dtype=self._dtype,
is_bias=True)
self.linear_bias = self.create_parameter(
shape=[embed_dim],
attr=self._bias_attr,
dtype=self._dtype,
is_bias=True,
)
self.ln_scale = self.create_parameter(
attr=self._weight_attr,
shape=[embed_dim],
default_initializer=Constant(value=1.0))
self.ln_bias = self.create_parameter(attr=self._bias_attr,
shape=[embed_dim],
is_bias=True)
default_initializer=Constant(value=1.0),
)
self.ln_bias = self.create_parameter(
attr=self._bias_attr, shape=[embed_dim], is_bias=True
)
self.dropout_rate = dropout_rate
self._epsilon = epsilon
......@@ -163,14 +174,20 @@ class FusedBiasDropoutResidualLayerNorm(Layer):
ln_epsilon=self._epsilon,
training=self.training,
mode='upscale_in_train',
name=self.name)
name=self.name,
)
return out
def extra_repr(self):
name_str = ', name={}'.format(self.name) if self.name else ''
return 'embed_dim={}, seq_len={}, dropout_rate={}, epsilon={}, dtype={}{}'.format(
self.embed_dim, self.seq_len, self.dropout_rate, self._epsilon,
self._dtype, name_str)
self.embed_dim,
self.seq_len,
self.dropout_rate,
self._epsilon,
self._dtype,
name_str,
)
class FusedMultiHeadAttention(Layer):
......@@ -246,33 +263,40 @@ class FusedMultiHeadAttention(Layer):
output = multi_head_attn(query, None, None, attn_mask=attn_mask) # [2, 4, 128]
"""
def __init__(self,
embed_dim,
num_heads,
dropout_rate=0.5,
attn_dropout_rate=0.5,
kdim=None,
vdim=None,
normalize_before=False,
need_weights=False,
qkv_weight_attr=None,
qkv_bias_attr=None,
linear_weight_attr=None,
linear_bias_attr=None,
pre_ln_scale_attr=None,
pre_ln_bias_attr=None,
ln_scale_attr=None,
ln_bias_attr=None,
epsilon=1e-5,
nranks=1,
ring_id=-1,
name=None):
def __init__(
self,
embed_dim,
num_heads,
dropout_rate=0.5,
attn_dropout_rate=0.5,
kdim=None,
vdim=None,
normalize_before=False,
need_weights=False,
qkv_weight_attr=None,
qkv_bias_attr=None,
linear_weight_attr=None,
linear_bias_attr=None,
pre_ln_scale_attr=None,
pre_ln_bias_attr=None,
ln_scale_attr=None,
ln_bias_attr=None,
epsilon=1e-5,
nranks=1,
ring_id=-1,
name=None,
):
super(FusedMultiHeadAttention, self).__init__()
assert embed_dim > 0, ("Expected embed_dim to be greater than 0, "
"but received {}".format(embed_dim))
assert num_heads > 0, ("Expected nhead to be greater than 0, "
"but received {}".format(num_heads))
assert embed_dim > 0, (
"Expected embed_dim to be greater than 0, "
"but received {}".format(embed_dim)
)
assert (
num_heads > 0
), "Expected nhead to be greater than 0, " "but received {}".format(
num_heads
)
self.normalize_before = normalize_before
self._dtype = self._helper.get_default_dtype()
......@@ -285,7 +309,9 @@ class FusedMultiHeadAttention(Layer):
self.kdim = kdim
self.vdim = vdim
self.need_weights = need_weights
assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
assert (
self.head_dim * num_heads == embed_dim
), "embed_dim must be divisible by num_heads"
assert need_weights is False, "Only support need_weight is False now."
# tensor model parallel
......@@ -296,21 +322,26 @@ class FusedMultiHeadAttention(Layer):
shape=[3, num_heads, self.head_dim, embed_dim],
attr=qkv_weight_attr,
dtype=self._dtype,
is_bias=False)
is_bias=False,
)
self.qkv_bias = self.create_parameter(
shape=[3, num_heads, self.head_dim],
attr=qkv_bias_attr,
dtype=self._dtype,
is_bias=True)
is_bias=True,
)
self.linear_weight = self.create_parameter(
shape=[num_heads * self.head_dim, embed_dim],
attr=linear_weight_attr,
dtype=self._dtype,
is_bias=False)
self.linear_bias = self.create_parameter(shape=[embed_dim],
attr=linear_bias_attr,
dtype=self._dtype,
is_bias=True)
is_bias=False,
)
self.linear_bias = self.create_parameter(
shape=[embed_dim],
attr=linear_bias_attr,
dtype=self._dtype,
is_bias=True,
)
# tensor model parallel
if nranks > 1:
......@@ -325,10 +356,11 @@ class FusedMultiHeadAttention(Layer):
self.pre_ln_scale = self.create_parameter(
attr=pre_ln_scale_attr,
shape=[embed_dim],
default_initializer=Constant(value=1.0))
self.pre_ln_bias = self.create_parameter(attr=pre_ln_bias_attr,
shape=[embed_dim],
is_bias=True)
default_initializer=Constant(value=1.0),
)
self.pre_ln_bias = self.create_parameter(
attr=pre_ln_bias_attr, shape=[embed_dim], is_bias=True
)
self.ln_scale = None
self.ln_bias = None
else:
......@@ -337,10 +369,11 @@ class FusedMultiHeadAttention(Layer):
self.ln_scale = self.create_parameter(
attr=ln_scale_attr,
shape=[embed_dim],
default_initializer=Constant(value=1.0))
self.ln_bias = self.create_parameter(attr=ln_bias_attr,
shape=[embed_dim],
is_bias=True)
default_initializer=Constant(value=1.0),
)
self.ln_bias = self.create_parameter(
attr=ln_bias_attr, shape=[embed_dim], is_bias=True
)
self.dropout_rate = dropout_rate
self.attn_dropout_rate = attn_dropout_rate
......@@ -404,15 +437,25 @@ class FusedMultiHeadAttention(Layer):
ln_epsilon=self._epsilon,
training=self.training,
ring_id=self._ring_id,
name=self.name)
name=self.name,
)
return out
def extra_repr(self):
name_str = ', name={}'.format(self.name) if self.name else ''
return 'embed_dim={}, num_heads={}, dropout_rate={}, attn_dropout_rate={}, epsilon={}, kdim={}, vdim={}, normalize_before={}, need_weights={}, dtype={}{}'.format(
self.embed_dim, self.num_heads, self.dropout_rate,
self.attn_dropout_rate, self._epsilon, self.kdim, self.vdim,
self.normalize_before, self.need_weights, self._dtype, name_str)
self.embed_dim,
self.num_heads,
self.dropout_rate,
self.attn_dropout_rate,
self._epsilon,
self.kdim,
self.vdim,
self.normalize_before,
self.need_weights,
self._dtype,
name_str,
)
def _amp_decorate(self, dtype):
# tmp fix for amp.decorator(O2)
......@@ -495,33 +538,39 @@ class FusedFeedForward(Layer):
# (1, 8, 8)
"""
def __init__(self,
d_model,
dim_feedforward,
dropout_rate=0.1,
epsilon=1e-05,
activation="relu",
act_dropout_rate=None,
normalize_before=False,
linear1_weight_attr=None,
linear1_bias_attr=None,
linear2_weight_attr=None,
linear2_bias_attr=None,
ln1_scale_attr=None,
ln1_bias_attr=None,
ln2_scale_attr=None,
ln2_bias_attr=None,
nranks=1,
ring_id=-1,
name=None):
def __init__(
self,
d_model,
dim_feedforward,
dropout_rate=0.1,
epsilon=1e-05,
activation="relu",
act_dropout_rate=None,
normalize_before=False,
linear1_weight_attr=None,
linear1_bias_attr=None,
linear2_weight_attr=None,
linear2_bias_attr=None,
ln1_scale_attr=None,
ln1_bias_attr=None,
ln2_scale_attr=None,
ln2_bias_attr=None,
nranks=1,
ring_id=-1,
name=None,
):
super(FusedFeedForward, self).__init__()
assert d_model > 0, (
"Expected d_model to be greater than 0, but received {}".format(
d_model))
assert dim_feedforward > 0, (
"Expected dim_feedforward to be greater than 0, but received {}".
format(dim_feedforward))
assert (
d_model > 0
), "Expected d_model to be greater than 0, but received {}".format(
d_model
)
assert (
dim_feedforward > 0
), "Expected dim_feedforward to be greater than 0, but received {}".format(
dim_feedforward
)
self._dtype = self._helper.get_default_dtype()
self._d_model = d_model
......@@ -530,7 +579,9 @@ class FusedFeedForward(Layer):
dim_feedforward = dim_feedforward // nranks
self._dim_feedforward = dim_feedforward
self._dropout_rate = dropout_rate
self._act_dropout_rate = dropout_rate if act_dropout_rate is None else act_dropout_rate
self._act_dropout_rate = (
dropout_rate if act_dropout_rate is None else act_dropout_rate
)
self._act_method = activation
self._normalize_before = normalize_before
self._epsilon = epsilon
......@@ -540,22 +591,28 @@ class FusedFeedForward(Layer):
shape=[d_model, dim_feedforward],
attr=linear1_weight_attr,
dtype=self._dtype,
is_bias=False)
self._linear1_bias = self.create_parameter(shape=[dim_feedforward],
attr=linear1_bias_attr,
dtype=self._dtype,
is_bias=True)
is_bias=False,
)
self._linear1_bias = self.create_parameter(
shape=[dim_feedforward],
attr=linear1_bias_attr,
dtype=self._dtype,
is_bias=True,
)
self._linear2_weight = self.create_parameter(
shape=[dim_feedforward, d_model],
attr=linear2_weight_attr,
dtype=self._dtype,
is_bias=False)
is_bias=False,
)
self._linear2_bias = self.create_parameter(shape=[d_model],
attr=linear2_bias_attr,
dtype=self._dtype,
is_bias=True)
self._linear2_bias = self.create_parameter(
shape=[d_model],
attr=linear2_bias_attr,
dtype=self._dtype,
is_bias=True,
)
if nranks > 1:
assert ring_id != -1
......@@ -569,10 +626,11 @@ class FusedFeedForward(Layer):
shape=[d_model],
attr=ln1_scale_attr,
is_bias=False,
default_initializer=Constant(1.0))
self._ln1_bias = self.create_parameter(shape=[d_model],
attr=ln1_bias_attr,
is_bias=True)
default_initializer=Constant(1.0),
)
self._ln1_bias = self.create_parameter(
shape=[d_model], attr=ln1_bias_attr, is_bias=True
)
self._ln2_scale = None
self._ln2_bias = None
else:
......@@ -582,10 +640,11 @@ class FusedFeedForward(Layer):
shape=[d_model],
attr=ln2_scale_attr,
is_bias=False,
default_initializer=Constant(1.0))
self._ln2_bias = self.create_parameter(shape=[d_model],
attr=ln2_bias_attr,
is_bias=True)
default_initializer=Constant(1.0),
)
self._ln2_bias = self.create_parameter(
shape=[d_model], attr=ln2_bias_attr, is_bias=True
)
self.name = name
......@@ -608,15 +667,23 @@ class FusedFeedForward(Layer):
pre_layer_norm=self._normalize_before,
training=self.training,
ring_id=self._ring_id,
name=self.name)
name=self.name,
)
return out
def extra_repr(self):
name_str = ', name={}'.format(self.name) if self.name else ''
return 'd_model={}, dim_feedforward={}, dropout_rate={}, epsilon={}, activation={}, act_dropout_rate={}, normalize_before={}, dtype={}{}'.format(
self._d_model, self._dim_feedforward, self._dropout_rate,
self._epsilon, self._act_method, self._act_dropout_rate,
self._normalize_before, self._dtype, name_str)
self._d_model,
self._dim_feedforward,
self._dropout_rate,
self._epsilon,
self._act_method,
self._act_dropout_rate,
self._normalize_before,
self._dtype,
name_str,
)
def _amp_decorate(self, dtype):
# tmp fix for amp.decorator(O2)
......@@ -640,6 +707,7 @@ class FusedFeedForward(Layer):
class FusedTransformerEncoderLayer(Layer):
"""
FusedTransformerEncoderLayer is composed of two sub-layers which are self (multi-head)
attention and feedforward network. Before and after each sub-layer, pre-process
and post-precess would be applied on the input and output accordingly. If
......@@ -681,10 +749,9 @@ class FusedTransformerEncoderLayer(Layer):
Examples:
.. code-block:: python
# required: gpu
# required: gpu
import paddle
from paddle.incubate.nn import FusedTransformerEncoderLayer
......@@ -694,33 +761,47 @@ class FusedTransformerEncoderLayer(Layer):
attn_mask = paddle.rand((2, 2, 4, 4))
encoder_layer = FusedTransformerEncoderLayer(128, 2, 512)
enc_output = encoder_layer(enc_input, attn_mask) # [2, 4, 128]
"""
def __init__(self,
d_model,
nhead,
dim_feedforward,
dropout_rate=0.1,
activation="relu",
attn_dropout_rate=None,
act_dropout_rate=None,
normalize_before=False,
weight_attr=None,
bias_attr=None):
def __init__(
self,
d_model,
nhead,
dim_feedforward,
dropout_rate=0.1,
activation="relu",
attn_dropout_rate=None,
act_dropout_rate=None,
normalize_before=False,
weight_attr=None,
bias_attr=None,
):
self._config = locals()
self._config.pop("self")
self._config.pop("__class__", None) # py3
super(FusedTransformerEncoderLayer, self).__init__()
assert d_model > 0, ("Expected d_model to be greater than 0, "
"but received {}".format(d_model))
assert nhead > 0, ("Expected nhead to be greater than 0, "
"but received {}".format(nhead))
assert (
d_model > 0
), "Expected d_model to be greater than 0, " "but received {}".format(
d_model
)
assert (
nhead > 0
), "Expected nhead to be greater than 0, " "but received {}".format(
nhead
)
assert dim_feedforward > 0, (
"Expected dim_feedforward to be greater than 0, "
"but received {}".format(dim_feedforward))
attn_dropout_rate = dropout_rate if attn_dropout_rate is None else attn_dropout_rate
act_dropout_rate = dropout_rate if act_dropout_rate is None else act_dropout_rate
"but received {}".format(dim_feedforward)
)
attn_dropout_rate = (
dropout_rate if attn_dropout_rate is None else attn_dropout_rate
)
act_dropout_rate = (
dropout_rate if act_dropout_rate is None else act_dropout_rate
)
self.normalize_before = normalize_before
weight_attrs = _convert_param_attr_to_list(weight_attr, 2)
......@@ -739,22 +820,27 @@ class FusedTransformerEncoderLayer(Layer):
pre_ln_scale_attr=weight_attrs[0],
pre_ln_bias_attr=bias_attrs[0],
ln_scale_attr=weight_attrs[0],
ln_bias_attr=bias_attrs[0])
self.ffn = FusedFeedForward(d_model,
dim_feedforward,
dropout_rate=dropout_rate,
activation=activation,
act_dropout_rate=act_dropout_rate,
normalize_before=self.normalize_before,
linear1_weight_attr=weight_attrs[1],
linear1_bias_attr=bias_attrs[1],
linear2_weight_attr=weight_attrs[1],
linear2_bias_attr=bias_attrs[1])
ln_bias_attr=bias_attrs[0],
)
self.ffn = FusedFeedForward(
d_model,
dim_feedforward,
dropout_rate=dropout_rate,
activation=activation,
act_dropout_rate=act_dropout_rate,
normalize_before=self.normalize_before,
linear1_weight_attr=weight_attrs[1],
linear1_bias_attr=bias_attrs[1],
linear2_weight_attr=weight_attrs[1],
linear2_bias_attr=bias_attrs[1],
)
def forward(self, src, src_mask=None, cache=None):
"""
Applies a Transformer encoder layer on the input.
Parameters:
src (Tensor): The input of Transformer encoder layer. It is
a tensor with shape `[batch_size, sequence_length, d_model]`.
......@@ -770,25 +856,27 @@ class FusedTransformerEncoderLayer(Layer):
`-INF` values and the others have 0 values. It can be None when
nothing wanted or needed to be prevented attention to. Default None.
cache (Tensor, optional): It is an instance of `MultiHeadAttention.Cache`.
See `TransformerEncoderLayer.gen_cache` for more details. It is
See :ref:`api_paddle_nn_TransformerEncoderLayer`.gen_cache for more details. It is
only used for inference and should be None for training. Default
None.
Returns:
Tensor|tuple: It is a tensor that has the same shape and data type \
Tensor|tuple, It is a tensor that has the same shape and data type \
as `enc_input`, representing the output of Transformer encoder \
layer. Or a tuple if `cache` is not None, except for encoder \
layer output, the tuple includes the new cache which is same \
as input `cache` argument but `incremental_cache` has an \
incremental length. See `MultiHeadAttention.gen_cache` and \
`MultiHeadAttention.forward` for more details.
"""
src_mask = _convert_attention_mask(src_mask, src.dtype)
if cache is None:
attn_out = self.fused_attn(src, attn_mask=src_mask)
else:
attn_out, incremental_cache = self.fused_attn(src,
attn_mask=src_mask,
cache=cache)
attn_out, incremental_cache = self.fused_attn(
src, attn_mask=src_mask, cache=cache
)
ffn_out = self.ffn(attn_out)
......@@ -889,21 +977,23 @@ class FusedTransformer(Layer):
cross_attn_mask) # [2, 6, 128]
"""
def __init__(self,
d_model=512,
nhead=8,
num_encoder_layers=6,
num_decoder_layers=6,
dim_feedforward=2048,
dropout=0.1,
activation="relu",
attn_dropout=None,
act_dropout=None,
normalize_before=False,
weight_attr=None,
bias_attr=None,
custom_encoder=None,
custom_decoder=None):
def __init__(
self,
d_model=512,
nhead=8,
num_encoder_layers=6,
num_decoder_layers=6,
dim_feedforward=2048,
dropout=0.1,
activation="relu",
attn_dropout=None,
act_dropout=None,
normalize_before=False,
weight_attr=None,
bias_attr=None,
custom_encoder=None,
custom_decoder=None,
):
        super(FusedTransformer, self).__init__()
raise NotImplementedError()
......@@ -1071,40 +1161,49 @@ class FusedMultiTransformer(Layer):
enc_output = encoder_layers(enc_input, attn_mask) # [2, 4, 128]
"""
def __init__(self,
embed_dim,
num_heads,
dim_feedforward,
dropout_rate=0.0,
activation="gelu",
normalize_before=True,
ln_scale_attrs=None,
ln_bias_attrs=None,
qkv_weight_attrs=None,
qkv_bias_attrs=None,
linear_weight_attrs=None,
linear_bias_attrs=None,
ffn_ln_scale_attrs=None,
ffn_ln_bias_attrs=None,
ffn1_weight_attrs=None,
ffn1_bias_attrs=None,
ffn2_weight_attrs=None,
ffn2_bias_attrs=None,
epsilon=1e-5,
num_layers=-1,
nranks=1,
trans_qkvw=True,
ring_id=-1,
name=None):
def __init__(
self,
embed_dim,
num_heads,
dim_feedforward,
dropout_rate=0.0,
activation="gelu",
normalize_before=True,
ln_scale_attrs=None,
ln_bias_attrs=None,
qkv_weight_attrs=None,
qkv_bias_attrs=None,
linear_weight_attrs=None,
linear_bias_attrs=None,
ffn_ln_scale_attrs=None,
ffn_ln_bias_attrs=None,
ffn1_weight_attrs=None,
ffn1_bias_attrs=None,
ffn2_weight_attrs=None,
ffn2_bias_attrs=None,
epsilon=1e-5,
num_layers=-1,
nranks=1,
trans_qkvw=True,
ring_id=-1,
name=None,
):
super(FusedMultiTransformer, self).__init__()
assert embed_dim > 0, ("Expected embed_dim to be greater than 0, "
"but received {}".format(embed_dim))
assert num_heads > 0, ("Expected nhead to be greater than 0, "
"but received {}".format(num_heads))
assert dim_feedforward > 0, (
"Expected dim_feedforward to be greater than 0, but received {}".
format(dim_feedforward))
assert embed_dim > 0, (
"Expected embed_dim to be greater than 0, "
"but received {}".format(embed_dim)
)
assert (
num_heads > 0
), "Expected nhead to be greater than 0, " "but received {}".format(
num_heads
)
assert (
dim_feedforward > 0
), "Expected dim_feedforward to be greater than 0, but received {}".format(
dim_feedforward
)
self.normalize_before = normalize_before
self._dtype = self._helper.get_default_dtype()
......@@ -1115,7 +1214,9 @@ class FusedMultiTransformer(Layer):
self.embed_dim = embed_dim
self.num_heads = num_heads
self.head_dim = embed_dim // num_heads
assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads"
assert (
self.head_dim * num_heads == embed_dim
), "embed_dim must be divisible by num_heads"
# tensor model parallel
if nranks > 1:
......@@ -1161,57 +1262,71 @@ class FusedMultiTransformer(Layer):
ln_scale = self.create_parameter(
attr=ln_scale_attr,
shape=[embed_dim],
default_initializer=Constant(value=1.0))
ln_bias = self.create_parameter(attr=ln_bias_attr,
shape=[embed_dim],
is_bias=True)
default_initializer=Constant(value=1.0),
)
ln_bias = self.create_parameter(
attr=ln_bias_attr, shape=[embed_dim], is_bias=True
)
qkv_weight = self.create_parameter(
shape=[3, num_heads, self.head_dim, embed_dim]
if trans_qkvw else [embed_dim, 3, num_heads, self.head_dim],
if trans_qkvw
else [embed_dim, 3, num_heads, self.head_dim],
attr=qkv_weight_attr,
dtype=self._dtype,
is_bias=False)
is_bias=False,
)
qkv_bias = self.create_parameter(
shape=[3, num_heads, self.head_dim],
attr=qkv_bias_attr,
dtype=self._dtype,
is_bias=True)
is_bias=True,
)
linear_weight = self.create_parameter(
shape=[num_heads * self.head_dim, embed_dim],
attr=linear_weight_attr,
dtype=self._dtype,
is_bias=False)
linear_bias = self.create_parameter(shape=[embed_dim],
attr=linear_bias_attr,
dtype=self._dtype,
is_bias=True)
is_bias=False,
)
linear_bias = self.create_parameter(
shape=[embed_dim],
attr=linear_bias_attr,
dtype=self._dtype,
is_bias=True,
)
ffn_ln_scale = self.create_parameter(
shape=[embed_dim],
attr=ffn_ln_scale_attr,
is_bias=False,
default_initializer=Constant(1.0))
ffn_ln_bias = self.create_parameter(shape=[embed_dim],
attr=ffn_ln_bias_attr,
is_bias=True)
default_initializer=Constant(1.0),
)
ffn_ln_bias = self.create_parameter(
shape=[embed_dim], attr=ffn_ln_bias_attr, is_bias=True
)
ffn1_weight = self.create_parameter(
shape=[embed_dim, dim_feedforward],
attr=ffn1_weight_attr,
dtype=self._dtype,
is_bias=False)
ffn1_bias = self.create_parameter(shape=[dim_feedforward],
attr=ffn1_bias_attr,
dtype=self._dtype,
is_bias=True)
is_bias=False,
)
ffn1_bias = self.create_parameter(
shape=[dim_feedforward],
attr=ffn1_bias_attr,
dtype=self._dtype,
is_bias=True,
)
ffn2_weight = self.create_parameter(
shape=[dim_feedforward, embed_dim],
attr=ffn2_weight_attr,
dtype=self._dtype,
is_bias=False)
ffn2_bias = self.create_parameter(shape=[embed_dim],
attr=ffn2_bias_attr,
dtype=self._dtype,
is_bias=True)
is_bias=False,
)
ffn2_bias = self.create_parameter(
shape=[embed_dim],
attr=ffn2_bias_attr,
dtype=self._dtype,
is_bias=True,
)
# tensor model parallel
if nranks > 1:
......@@ -1300,5 +1415,6 @@ class FusedMultiTransformer(Layer):
mode='upscale_in_train',
trans_qkvw=self._trans_qkvw,
ring_id=self._ring_id,
name=self.name)
name=self.name,
)
return out
......@@ -20,104 +20,134 @@ from paddle.fluid import core
from paddle import _C_ops, _legacy_C_ops
def graph_khop_sampler(row,
colptr,
input_nodes,
sample_sizes,
sorted_eids=None,
return_eids=False,
name=None):
def graph_khop_sampler(
row,
colptr,
input_nodes,
sample_sizes,
sorted_eids=None,
return_eids=False,
name=None,
):
"""
Graph Khop Sampler API.
This API is mainly used in Graph Learning domain, and the main purpose is to
This API is mainly used in Graph Learning domain, and the main purpose is to
provide high performance graph khop sampling method with subgraph reindex step.
For example, we get the CSC(Compressed Sparse Column) format of the input graph
edges as `row` and `colptr`, so as to covert graph data into a suitable format
    edges as `row` and `colptr`, so as to convert graph data into a suitable format
    for sampling. `input_nodes` means the nodes we need to sample neighbors for,
    and `sample_sizes` means the number of neighbors and the number of layers we want
to sample.
to sample.
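    For instance, with the toy graph from the example below, where
    `colptr = [0, 2, 4, 5, ...]` and `row = [3, 7, 0, 9, ...]`, the neighbors
    recorded for node 0 are `row[colptr[0]:colptr[1]] = [3, 7]` and those for
    node 1 are `row[colptr[1]:colptr[2]] = [0, 9]`.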
Args:
row (Tensor): One of the components of the CSC format of the input graph, and
row (Tensor): One of the components of the CSC format of the input graph, and
the shape should be [num_edges, 1] or [num_edges]. The available
data type is int32, int64.
colptr (Tensor): One of the components of the CSC format of the input graph,
and the shape should be [num_nodes + 1, 1] or [num_nodes].
and the shape should be [num_nodes + 1, 1] or [num_nodes].
The data type should be the same with `row`.
input_nodes (Tensor): The input nodes we need to sample neighbors for, and the
input_nodes (Tensor): The input nodes we need to sample neighbors for, and the
data type should be the same with `row`.
sample_sizes (list|tuple): The number of neighbors and number of layers we want
to sample. The data type should be int, and the shape
should only have one dimension.
sorted_eids (Tensor): The sorted edge ids, should not be None when `return_eids`
sorted_eids (Tensor, optional): The sorted edge ids, should not be None when `return_eids`
is True. The shape should be [num_edges, 1], and the data
type should be the same with `row`.
return_eids (bool): Whether to return the id of the sample edges. Default is False.
type should be the same with `row`. Default is None.
return_eids (bool, optional): Whether to return the id of the sample edges. Default is False.
name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`.
Returns:
edge_src (Tensor): The src index of the output edges, also means the first column of
the edges. The shape is [num_sample_edges, 1] currently.
edge_dst (Tensor): The dst index of the output edges, also means the second column
of the edges. The shape is [num_sample_edges, 1] currently.
sample_index (Tensor): The original id of the input nodes and sampled neighbor nodes.
reindex_nodes (Tensor): The reindex id of the input nodes.
edge_eids (Tensor): Return the id of the sample edges if `return_eids` is True.
- edge_src (Tensor), The src index of the output edges, also means the first column of
the edges. The shape is [num_sample_edges, 1] currently.
- edge_dst (Tensor), The dst index of the output edges, also means the second column
of the edges. The shape is [num_sample_edges, 1] currently.
- sample_index (Tensor), The original id of the input nodes and sampled neighbor nodes.
- reindex_nodes (Tensor), The reindex id of the input nodes.
- edge_eids (Tensor), Return the id of the sample edges if `return_eids` is True.
Examples:
.. code-block:: python
import paddle
import paddle
row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7]
colptr = [0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13]
nodes = [0, 8, 1, 2]
sample_sizes = [2, 2]
row = paddle.to_tensor(row, dtype="int64")
colptr = paddle.to_tensor(colptr, dtype="int64")
nodes = paddle.to_tensor(nodes, dtype="int64")
row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7]
colptr = [0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13]
nodes = [0, 8, 1, 2]
sample_sizes = [2, 2]
row = paddle.to_tensor(row, dtype="int64")
colptr = paddle.to_tensor(colptr, dtype="int64")
nodes = paddle.to_tensor(nodes, dtype="int64")
edge_src, edge_dst, sample_index, reindex_nodes = \
paddle.incubate.graph_khop_sampler(row, colptr, nodes, sample_sizes, False)
edge_src, edge_dst, sample_index, reindex_nodes = paddle.incubate.graph_khop_sampler(row, colptr, nodes, sample_sizes, False)
"""
if _non_static_mode():
if return_eids:
if sorted_eids is None:
raise ValueError(
"`sorted_eids` should not be None "
"if return_eids is True."
)
(
edge_src,
edge_dst,
sample_index,
reindex_nodes,
edge_eids,
) = _legacy_C_ops.graph_khop_sampler(
row,
sorted_eids,
colptr,
input_nodes,
"sample_sizes",
sample_sizes,
"return_eids",
True,
)
return edge_src, edge_dst, sample_index, reindex_nodes, edge_eids
else:
(
edge_src,
edge_dst,
sample_index,
reindex_nodes,
_,
) = _legacy_C_ops.graph_khop_sampler(
row,
None,
colptr,
input_nodes,
"sample_sizes",
sample_sizes,
"return_eids",
False,
)
return edge_src, edge_dst, sample_index, reindex_nodes
check_variable_and_dtype(
row, "Row", ("int32", "int64"), "graph_khop_sampler"
)
if return_eids:
if sorted_eids is None:
raise ValueError(
"`sorted_eids` should not be None if return_eids is True."
)
check_variable_and_dtype(
sorted_eids, "Eids", ("int32", "int64"), "graph_khop_sampler"
)
check_variable_and_dtype(
colptr, "Col_Ptr", ("int32", "int64"), "graph_khop_sampler"
)
check_variable_and_dtype(
input_nodes, "X", ("int32", "int64"), "graph_khop_sampler"
)
helper = LayerHelper("graph_khop_sampler", **locals())
edge_src = helper.create_variable_for_type_inference(dtype=row.dtype)
......@@ -125,24 +155,23 @@ def graph_khop_sampler(row,
sample_index = helper.create_variable_for_type_inference(dtype=row.dtype)
reindex_nodes = helper.create_variable_for_type_inference(dtype=row.dtype)
edge_eids = helper.create_variable_for_type_inference(dtype=row.dtype)
helper.append_op(
type="graph_khop_sampler",
inputs={
"Row": row,
"Eids": sorted_eids,
"Col_Ptr": colptr,
"X": input_nodes,
},
outputs={
"Out_Src": edge_src,
"Out_Dst": edge_dst,
"Sample_Index": sample_index,
"Reindex_X": reindex_nodes,
"Out_Eids": edge_eids,
},
attrs={"sample_sizes": sample_sizes, "return_eids": return_eids},
)
if return_eids:
return edge_src, edge_dst, sample_index, reindex_nodes, edge_eids
else:
......
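# Illustrative usage sketch: calling graph_khop_sampler with return_eids=True,
# which requires `sorted_eids`. The edge ids below are a hypothetical labelling
# of the 13 edges from the docstring example.
import paddle

row = paddle.to_tensor([3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7], dtype="int64")
colptr = paddle.to_tensor([0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13], dtype="int64")
nodes = paddle.to_tensor([0, 8, 1, 2], dtype="int64")
sorted_eids = paddle.arange(13, dtype="int64")  # hypothetical, already sorted
edge_src, edge_dst, sample_index, reindex_nodes, edge_eids = (
    paddle.incubate.graph_khop_sampler(
        row, colptr, nodes, [2, 2], sorted_eids=sorted_eids, return_eids=True
    )
)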
......@@ -21,18 +21,23 @@ from paddle import _C_ops, _legacy_C_ops
import paddle.utils.deprecated as deprecated
@deprecated(
since="2.4.0",
update_to="paddle.geometric.reindex_graph",
level=1,
reason="paddle.incubate.graph_reindex will be removed in future",
)
def graph_reindex(
x,
neighbors,
count,
value_buffer=None,
index_buffer=None,
flag_buffer_hashtable=False,
name=None,
):
"""
Graph Reindex API.
This API is mainly used in Graph Learning domain, which should be used
......@@ -40,11 +45,11 @@ def graph_reindex(x,
is to reindex the ids information of the input nodes, and return the
corresponding graph edges after reindex.
Notes:
The number in x should be unique, otherwise it would cause potential errors.
Besides, we also support multi-edge-types neighbors reindexing. If we have different
edge_type neighbors for x, we should concatenate all the neighbors and count of x.
We will reindex all the nodes from 0.
Take input nodes x = [0, 1, 2] as an example.
If we have neighbors = [8, 9, 0, 4, 7, 6, 7], and count = [2, 3, 2],
......@@ -58,98 +63,105 @@ def graph_reindex(x,
should be the same with `x`.
count (Tensor): The neighbor count of the input nodes `x`. And the
data type should be int32.
value_buffer (Tensor, optional): Value buffer for hashtable. The data type should
be int32, and should be filled with -1. Default is None.
index_buffer (Tensor, optional): Index buffer for hashtable. The data type should
be int32, and should be filled with -1. Default is None.
flag_buffer_hashtable (bool, optional): Whether to use buffer for hashtable to speed up.
Default is False. Only useful for gpu version currently.
name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`.
Returns:
- reindex_src (Tensor), The source node index of graph edges after reindex.
- reindex_dst (Tensor), The destination node index of graph edges after reindex.
- out_nodes (Tensor), The index of unique input nodes and neighbors before reindex,
where we put the input nodes `x` in the front, and put neighbor
nodes in the back.
Examples:
.. code-block:: python
import paddle
x = [0, 1, 2]
neighbors_e1 = [8, 9, 0, 4, 7, 6, 7]
count_e1 = [2, 3, 2]
x = paddle.to_tensor(x, dtype="int64")
neighbors_e1 = paddle.to_tensor(neighbors_e1, dtype="int64")
count_e1 = paddle.to_tensor(count_e1, dtype="int32")
reindex_src, reindex_dst, out_nodes = \
paddle.incubate.graph_reindex(x, neighbors_e1, count_e1)
# reindex_src: [3, 4, 0, 5, 6, 7, 6]
# reindex_dst: [0, 0, 1, 1, 1, 2, 2]
# out_nodes: [0, 1, 2, 8, 9, 4, 7, 6]
neighbors_e2 = [0, 2, 3, 5, 1]
count_e2 = [1, 3, 1]
neighbors_e2 = paddle.to_tensor(neighbors_e2, dtype="int64")
count_e2 = paddle.to_tensor(count_e2, dtype="int32")
neighbors = paddle.concat([neighbors_e1, neighbors_e2])
count = paddle.concat([count_e1, count_e2])
reindex_src, reindex_dst, out_nodes = \
paddle.incubate.graph_reindex(x, neighbors, count)
# reindex_src: [3, 4, 0, 5, 6, 7, 6, 0, 2, 8, 9, 1]
# reindex_dst: [0, 0, 1, 1, 1, 2, 2, 0, 1, 1, 1, 2]
# out_nodes: [0, 1, 2, 8, 9, 4, 7, 6, 3, 5]
"""
if flag_buffer_hashtable:
if value_buffer is None or index_buffer is None:
raise ValueError(
"`value_buffer` and `index_buffer` should not "
"be None if `flag_buffer_hashtable` is True."
)
if _non_static_mode():
reindex_src, reindex_dst, out_nodes = _legacy_C_ops.graph_reindex(
x,
neighbors,
count,
value_buffer,
index_buffer,
"flag_buffer_hashtable",
flag_buffer_hashtable,
)
return reindex_src, reindex_dst, out_nodes
check_variable_and_dtype(x, "X", ("int32", "int64"), "graph_reindex")
check_variable_and_dtype(
neighbors, "Neighbors", ("int32", "int64"), "graph_reindex"
)
check_variable_and_dtype(count, "Count", ("int32"), "graph_reindex")
if flag_buffer_hashtable:
check_variable_and_dtype(
value_buffer, "HashTable_Value", ("int32"), "graph_reindex"
)
check_variable_and_dtype(
index_buffer, "HashTable_Index", ("int32"), "graph_reindex"
)
helper = LayerHelper("graph_reindex", **locals())
reindex_src = helper.create_variable_for_type_inference(dtype=x.dtype)
reindex_dst = helper.create_variable_for_type_inference(dtype=x.dtype)
out_nodes = helper.create_variable_for_type_inference(dtype=x.dtype)
helper.append_op(
type="graph_reindex",
inputs={
"X": x,
"Neighbors": neighbors,
"Count": count,
"HashTable_Value": value_buffer if flag_buffer_hashtable else None,
"HashTable_Index": index_buffer if flag_buffer_hashtable else None,
},
outputs={
"Reindex_Src": reindex_src,
"Reindex_Dst": reindex_dst,
"Out_Nodes": out_nodes,
},
attrs={"flag_buffer_hashtable": flag_buffer_hashtable},
)
return reindex_src, reindex_dst, out_nodes
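# Illustrative sketch of the buffer-backed hashtable path: both buffers must be
# int32 and filled with -1, as the docstring above requires; the buffer length
# is a hypothetical choice, and this path is only useful on GPU per the docstring.
import paddle

x = paddle.to_tensor([0, 1, 2], dtype="int64")
neighbors = paddle.to_tensor([8, 9, 0, 4, 7, 6, 7], dtype="int64")
count = paddle.to_tensor([2, 3, 2], dtype="int32")
value_buffer = paddle.full([32], -1, dtype="int32")  # hypothetical buffer size
index_buffer = paddle.full([32], -1, dtype="int32")
reindex_src, reindex_dst, out_nodes = paddle.incubate.graph_reindex(
    x, neighbors, count, value_buffer, index_buffer, flag_buffer_hashtable=True
)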
......@@ -25,17 +25,21 @@ import paddle.utils.deprecated as deprecated
since="2.4.0",
update_to="paddle.geometric.sample_neighbors",
level=1,
reason="paddle.incubate.graph_sample_neighbors will be removed in future",
)
def graph_sample_neighbors(
row,
colptr,
input_nodes,
eids=None,
perm_buffer=None,
sample_size=-1,
return_eids=False,
flag_perm_buffer=False,
name=None,
):
"""
Graph Sample Neighbors API.
This API is mainly used in Graph Learning domain, and the main purpose is to
......@@ -71,86 +75,109 @@ def graph_sample_neighbors(row,
For more information, please refer to :ref:`api_guide_Name`.
Returns:
- out_neighbors (Tensor), The sample neighbors of the input nodes.
- out_count (Tensor), The number of sampling neighbors of each input node, and the shape should be the same with `input_nodes`.
- out_eids (Tensor), If `return_eids` is True, we will return the eid information of the sample edges.
Examples:
.. code-block:: python
import paddle
# edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4),
# (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8)
row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7]
colptr = [0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13]
nodes = [0, 8, 1, 2]
sample_size = 2
row = paddle.to_tensor(row, dtype="int64")
colptr = paddle.to_tensor(colptr, dtype="int64")
nodes = paddle.to_tensor(nodes, dtype="int64")
out_neighbors, out_count = \
paddle.incubate.graph_sample_neighbors(row, colptr, nodes,
sample_size=sample_size)
"""
if return_eids:
if eids is None:
raise ValueError(
f"`eids` should not be None if `return_eids` is True."
)
if flag_perm_buffer:
if perm_buffer is None:
raise ValueError(
"`perm_buffer` should not be None if `flag_perm_buffer` "
"is True."
)
if _non_static_mode():
(
out_neighbors,
out_count,
out_eids,
) = _legacy_C_ops.graph_sample_neighbors(
row,
colptr,
input_nodes,
eids,
perm_buffer,
"sample_size",
sample_size,
"return_eids",
return_eids,
"flag_perm_buffer",
flag_perm_buffer,
)
if return_eids:
return out_neighbors, out_count, out_eids
return out_neighbors, out_count
check_variable_and_dtype(
row, "Row", ("int32", "int64"), "graph_sample_neighbors"
)
check_variable_and_dtype(
colptr, "Col_Ptr", ("int32", "int64"), "graph_sample_neighbors"
)
check_variable_and_dtype(
input_nodes, "X", ("int32", "int64"), "graph_sample_neighbors"
)
if return_eids:
check_variable_and_dtype(
eids, "Eids", ("int32", "int64"), "graph_sample_neighbors"
)
if flag_perm_buffer:
check_variable_and_dtype(
perm_buffer,
"Perm_Buffer",
("int32", "int64"),
"graph_sample_neighbors",
)
helper = LayerHelper("graph_sample_neighbors", **locals())
out_neighbors = helper.create_variable_for_type_inference(dtype=row.dtype)
out_count = helper.create_variable_for_type_inference(dtype=row.dtype)
out_eids = helper.create_variable_for_type_inference(dtype=row.dtype)
helper.append_op(
type="graph_sample_neighbors",
inputs={
"Row": row,
"Col_Ptr": colptr,
"X": input_nodes,
"Eids": eids if return_eids else None,
"Perm_Buffer": perm_buffer if flag_perm_buffer else None,
},
outputs={
"Out": out_neighbors,
"Out_Count": out_count,
"Out_Eids": out_eids,
},
attrs={
"sample_size": sample_size,
"return_eids": return_eids,
"flag_perm_buffer": flag_perm_buffer,
},
)
if return_eids:
return out_neighbors, out_count, out_eids
return out_neighbors, out_count
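# Illustrative usage sketch: sampling neighbors with edge ids returned; `eids`
# is a hypothetical id per edge of the CSC graph used in the docstring example.
import paddle

row = paddle.to_tensor([3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7], dtype="int64")
colptr = paddle.to_tensor([0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13], dtype="int64")
nodes = paddle.to_tensor([0, 8, 1, 2], dtype="int64")
eids = paddle.arange(13, dtype="int64")  # hypothetical edge ids
out_neighbors, out_count, out_eids = paddle.incubate.graph_sample_neighbors(
    row, colptr, nodes, eids=eids, sample_size=2, return_eids=True
)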
......@@ -36,106 +36,232 @@ from paddle import _C_ops, _legacy_C_ops
__all__ = ['resnet_basic_block', 'ResNetBasicBlock']
def resnet_basic_block(
x,
filter1,
scale1,
bias1,
mean1,
var1,
filter2,
scale2,
bias2,
mean2,
var2,
filter3,
scale3,
bias3,
mean3,
var3,
stride1,
stride2,
stride3,
padding1,
padding2,
padding3,
dilation1,
dilation2,
dilation3,
groups,
momentum,
eps,
data_format,
has_shortcut,
use_global_stats=None,
training=False,
trainable_statistics=False,
find_conv_max=True,
):
if fluid.framework.in_dygraph_mode():
attrs = (
'stride1',
stride1,
'stride2',
stride2,
'stride3',
stride3,
'padding1',
padding1,
'padding2',
padding2,
'padding3',
padding3,
'dilation1',
dilation1,
'dilation2',
dilation2,
'dilation3',
dilation3,
'group',
groups,
'momentum',
momentum,
'epsilon',
eps,
'data_format',
data_format,
'has_shortcut',
has_shortcut,
'use_global_stats',
use_global_stats,
"trainable_statistics",
trainable_statistics,
'is_test',
not training,
'act_type',
"relu",
'find_conv_input_max',
find_conv_max,
)
(
out,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
) = getattr(_C_ops, "resnet_basic_block")(
x,
filter1,
scale1,
bias1,
mean1,
var1,
filter2,
scale2,
bias2,
mean2,
var2,
filter3,
scale3,
bias3,
mean3,
var3,
mean1,
var1,
mean2,
var2,
mean3,
var3,
*attrs
)
return out
helper = LayerHelper('resnet_basic_block', **locals())
bn_param_dtype = fluid.core.VarDesc.VarType.FP32
max_dtype = fluid.core.VarDesc.VarType.FP32
out = helper.create_variable_for_type_inference(
dtype=x.dtype, stop_gradient=True
)
conv1 = helper.create_variable_for_type_inference(
dtype=x.dtype, stop_gradient=True
)
saved_mean1 = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True
)
saved_invstd1 = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True
)
running_mean1 = (
helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True
)
if mean1 is None
else mean1
)
running_var1 = (
helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True
)
if var1 is None
else var1
)
conv2 = helper.create_variable_for_type_inference(
dtype=x.dtype, stop_gradient=True
)
conv2_input = helper.create_variable_for_type_inference(
dtype=x.dtype, stop_gradient=True
)
saved_mean2 = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True
)
saved_invstd2 = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True
)
running_mean2 = (
helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True
)
if mean2 is None
else mean2
)
running_var2 = (
helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True
)
if var2 is None
else var2
)
conv3 = helper.create_variable_for_type_inference(
dtype=x.dtype, stop_gradient=True
)
saved_mean3 = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True
)
saved_invstd3 = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True
)
running_mean3 = (
helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True
)
if mean3 is None
else mean3
)
running_var3 = (
helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True
)
if var3 is None
else var3
)
conv1_input_max = helper.create_variable_for_type_inference(
dtype=max_dtype, stop_gradient=True
)
conv1_filter_max = helper.create_variable_for_type_inference(
dtype=max_dtype, stop_gradient=True
)
conv2_input_max = helper.create_variable_for_type_inference(
dtype=max_dtype, stop_gradient=True
)
conv2_filter_max = helper.create_variable_for_type_inference(
dtype=max_dtype, stop_gradient=True
)
conv3_input_max = helper.create_variable_for_type_inference(
dtype=max_dtype, stop_gradient=True
)
conv3_filter_max = helper.create_variable_for_type_inference(
dtype=max_dtype, stop_gradient=True
)
inputs = {
'X': x,
......@@ -175,7 +301,7 @@ def resnet_basic_block(x,
"trainable_statistics": trainable_statistics,
'is_test': not training,
'act_type': "relu",
'find_conv_input_max': find_conv_max,
}
outputs = {
......@@ -203,88 +329,172 @@ def resnet_basic_block(x,
'MaxInput3': conv3_input_max,
'MaxFilter3': conv3_filter_max,
}
helper.append_op(
type='resnet_basic_block', inputs=inputs, outputs=outputs, attrs=attrs
)
return out
class ResNetBasicBlock(Layer):
r"""
ResNetBasicBlock is designed to optimize the performance of the basic unit of the SSD ResNet block.
The fused op architecture looks like this:
has_shortcut = True:                  else:
          X                                 X
        /   \                             /   \
       |     |                           |     |
     CONV1   |                         CONV1   |
       |     |                           |     |
      BN1    |                          BN1    |
       |     |                           |     |
     RELU1   |                         RELU1   |
       |     |                           |     |
     CONV2  CONV3                      CONV2   |
       |     |                           |     |
      BN2   BN3                         BN2    |
        \   /                             \   /
         ADD                               ADD
          |                                 |
         RELU                              RELU
          |                                 |
          Y                                 Y
If has_shortcut = True, it can calculate 3 Conv2D, 3 BatchNorm and 2 ReLU in one pass.
If has_shortcut = False, it can calculate 2 Conv2D, 2 BatchNorm and 2 ReLU in one pass. In this
case the shape of the output is the same as that of the input.
Args:
num_channels (int): The number of input image channel.
num_filter (int): The number of filters. It is the same as the output image channel.
filter_size (int|list|tuple): The filter size. If filter_size
is a tuple, it must contain two integers, (filter_size_height,
filter_size_width). Otherwise, filter_size_height = filter_size_width = filter_size.
stride (int, optional): The stride size. It means the stride in convolution.
If stride is a tuple, it must contain two integers, (stride_height, stride_width).
Otherwise, stride_height = stride_width = stride. Default: stride = 1.
act (str, optional): Activation type, if it is set to None, activation is not appended.
Default: None
momentum (float, optional): The value used for the moving_mean and
moving_var computation. This should be a float number or a Tensor with
shape [1] and data type as float32. The updated formula is:
:math:`moving\_mean = moving\_mean * momentum + new\_mean * (1. - momentum)`
:math:`moving\_var = moving\_var * momentum + new\_var * (1. - momentum)`
Default is 0.9.
eps (float, optional): A value added to the denominator for
numerical stability. Default is 1e-5.
data_format (str, optional): Specify the data format of the input, and the data format of the output
will be consistent with that of the input. Currently only `"NCHW"` is supported, where the data is stored
in the order of: `[batch_size, input_channels, input_height, input_width]`.
has_shortcut (bool, optional): Whether to calculate CONV3 and BN3. Default: False.
use_global_stats (bool, optional): Whether to use global mean and
variance. In inference or test mode, set use_global_stats to true
or is_test to true, and the behavior is equivalent.
In train mode, when setting use_global_stats True, the global mean
and variance are also used during train period. Default: False.
is_test (bool, optional): A flag indicating whether it is in
test phase or not. Default: False.
filter_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights
of conv2d. If it is set to None or one attribute of ParamAttr, conv2d
will create ParamAttr as param_attr. Default: None.
scale_attr (ParamAttr, optional): The parameter attribute for Parameter `scale`
of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm will create ParamAttr
as param_attr, the name of scale can be set in ParamAttr. If the Initializer of the param_attr is not set,
the parameter is initialized with Xavier. Default: None.
bias_attr (ParamAttr, optional): The parameter attribute for the bias of batch_norm.
If it is set to None or one attribute of ParamAttr, batch_norm
will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr.
If the Initializer of the bias_attr is not set, the bias is initialized zero.
Default: None.
moving_mean_name (str, optional): The name of moving_mean which store the global Mean. If it
is set to None, batch_norm will save global mean with a random name, otherwise, batch_norm
will save global mean with the string. Default: None.
moving_var_name (str, optional): The name of the moving_variance which store the global Variance.
If it is set to None, batch_norm will save global variance with a random name, otherwise, batch_norm
will save global variance with the string. Default: None.
padding (int, optional): The padding size. Only padding_height = padding_width = padding is supported.
Default: padding = 0.
dilation (int, optional): The dilation size. It means the spacing between the kernel
points. Only dilation_height = dilation_width = dilation is supported.
Default: dilation = 1.
trainable_statistics (bool, optional): Whether to calculate mean and var in eval mode. In eval mode, when
setting trainable_statistics True, mean and variance will be calculated by current batch statistics.
Default: False.
find_conv_max (bool, optional): Whether to calculate max value of each conv2d. Default: True.
Returns:
A Tensor representing the ResNetBasicBlock, whose data type is the same with input.
Examples:
.. code-block:: python
# required: xpu
import paddle
from paddle.incubate.xpu.resnet_block import ResNetBasicBlock
ch_in = 4
ch_out = 8
x = paddle.uniform((2, ch_in, 16, 16), dtype='float32', min=-1., max=1.)
resnet_basic_block = ResNetBasicBlock(num_channels1=ch_in,
num_filter1=ch_out,
filter1_size=3,
num_channels2=ch_out,
num_filter2=ch_out,
filter2_size=3,
num_channels3=ch_in,
num_filter3=ch_out,
filter3_size=1,
stride1=1,
stride2=1,
stride3=1,
act='relu',
padding1=1,
padding2=1,
padding3=0,
has_shortcut=True)
out = resnet_basic_block.forward(x)
print(out.shape) # [2, 8, 16, 16]
"""
def __init__(
self,
num_channels1,
num_filter1,
filter1_size,
num_channels2,
num_filter2,
filter2_size,
num_channels3,
num_filter3,
filter3_size,
stride1=1,
stride2=1,
stride3=1,
act='relu',
momentum=0.9,
eps=1e-5,
data_format='NCHW',
has_shortcut=False,
use_global_stats=False,
is_test=False,
filter1_attr=None,
scale1_attr=None,
bias1_attr=None,
moving_mean1_name=None,
moving_var1_name=None,
filter2_attr=None,
scale2_attr=None,
bias2_attr=None,
moving_mean2_name=None,
moving_var2_name=None,
filter3_attr=None,
scale3_attr=None,
bias3_attr=None,
moving_mean3_name=None,
moving_var3_name=None,
padding1=0,
padding2=0,
padding3=0,
dilation1=1,
dilation2=1,
dilation3=1,
trainable_statistics=False,
find_conv_max=True,
):
super(ResNetBasicBlock, self).__init__()
self._stride1 = stride1
self._stride2 = stride2
self._kernel1_size = utils.convert_to_list(
filter1_size, 2, 'filter1_size'
)
self._kernel2_size = utils.convert_to_list(
filter2_size, 2, 'filter2_size'
)
self._dilation1 = dilation1
self._dilation2 = dilation2
self._padding1 = padding1
......@@ -301,8 +511,9 @@ class ResNetBasicBlock(Layer):
self._find_conv_max = find_conv_max
if has_shortcut:
self._kernel3_size = utils.convert_to_list(
filter3_size, 2, 'filter3_size'
)
self._padding3 = padding3
self._stride3 = stride3
self._dilation3 = dilation3
......@@ -317,11 +528,13 @@ class ResNetBasicBlock(Layer):
if data_format not in valid_format:
raise ValueError(
"conv_format must be one of {}, but got conv_format={}".format(
valid_format, data_format
)
)
def _get_default_param_initializer(channels, kernel_size):
filter_elem_num = np.prod(kernel_size) * channels
std = (2.0 / filter_elem_num) ** 0.5
return I.Normal(0.0, std)
# init filter
......@@ -335,92 +548,128 @@ class ResNetBasicBlock(Layer):
shape=filter1_shape,
attr=filter1_attr,
default_initializer=_get_default_param_initializer(
num_channels1, self._kernel1_size
),
)
self.scale_1 = self.create_parameter(
shape=bn1_param_shape,
attr=scale1_attr,
dtype=bn_param_dtype,
default_initializer=I.Constant(1.0),
)
self.bias_1 = self.create_parameter(
shape=bn1_param_shape,
attr=bias1_attr,
dtype=bn_param_dtype,
is_bias=True,
)
self.mean_1 = self.create_parameter(
attr=ParamAttr(
name=moving_mean1_name,
initializer=I.Constant(0.0),
trainable=False,
),
shape=bn1_param_shape,
dtype=bn_param_dtype,
)
self.mean_1.stop_gradient = True
self.var_1 = self.create_parameter(
attr=ParamAttr(
name=moving_var1_name,
initializer=I.Constant(1.0),
trainable=False,
),
shape=bn1_param_shape,
dtype=bn_param_dtype,
)
self.var_1.stop_gradient = True
self.filter_2 = self.create_parameter(
shape=filter2_shape,
attr=filter2_attr,
default_initializer=_get_default_param_initializer(
num_channels2, self._kernel2_size
),
)
self.scale_2 = self.create_parameter(
shape=bn2_param_shape,
attr=scale2_attr,
dtype=bn_param_dtype,
default_initializer=I.Constant(1.0),
)
self.bias_2 = self.create_parameter(
shape=bn2_param_shape,
attr=bias2_attr,
dtype=bn_param_dtype,
is_bias=True,
)
self.mean_2 = self.create_parameter(
attr=ParamAttr(
name=moving_mean2_name,
initializer=I.Constant(0.0),
trainable=False,
),
shape=bn2_param_shape,
dtype=bn_param_dtype,
)
self.mean_2.stop_gradient = True
self.var_2 = self.create_parameter(
attr=ParamAttr(
name=moving_var2_name,
initializer=I.Constant(1.0),
trainable=False,
),
shape=bn2_param_shape,
dtype=bn_param_dtype,
)
self.var_2.stop_gradient = True
if has_shortcut:
bn3_param_shape = [1, 1, num_filter3]
filter3_shape = [
num_filter3,
num_channels3,
filter3_size,
filter3_size,
]
self.filter_3 = self.create_parameter(
shape=filter3_shape,
attr=filter3_attr,
default_initializer=_get_default_param_initializer(
num_channels3, self._kernel3_size
),
)
self.scale_3 = self.create_parameter(
shape=bn3_param_shape,
attr=scale3_attr,
dtype=bn_param_dtype,
default_initializer=I.Constant(1.0),
)
self.bias_3 = self.create_parameter(
shape=bn3_param_shape,
attr=bias3_attr,
dtype=bn_param_dtype,
is_bias=True,
)
self.mean_3 = self.create_parameter(
attr=ParamAttr(
name=moving_mean3_name,
initializer=I.Constant(0.0),
trainable=False,
),
shape=bn3_param_shape,
dtype=bn_param_dtype,
)
self.mean_3.stop_gradient = True
self.var_3 = self.create_parameter(
attr=ParamAttr(
name=moving_var3_name,
initializer=I.Constant(1.0),
trainable=False,
),
shape=bn3_param_shape,
dtype=bn_param_dtype,
)
self.var_3.stop_gradient = True
else:
self.filter_3 = None
......@@ -464,5 +713,6 @@ class ResNetBasicBlock(Layer):
use_global_stats=self._use_global_stats,
training=self.training,
trainable_statistics=self._trainable_statistics,
find_conv_max=self._find_conv_max,
)
return out
......@@ -715,6 +715,7 @@ def upsample(
name=None,
):
"""
This API resizes a batch of images.
The input must be a 3-D Tensor of the shape (num_batches, channels, in_w)
......@@ -725,11 +726,12 @@ def upsample(
and the resizing only applies on the three dimensions (depth, height and width).
Supporting resample methods:
- 'linear' : Linear interpolation
- 'bilinear' : Bilinear interpolation
- 'trilinear' : Trilinear interpolation
- 'nearest' : Nearest neighbor interpolation
- 'bicubic' : Bicubic interpolation
Linear interpolation is the method of using a line connecting two known quantities
to determine the value of an unknown quantity between the two known quantities.
......@@ -762,77 +764,78 @@ def upsample(
`paddle.nn.functional.adaptive_avg_pool2d` or `paddle.nn.functional.adaptive_avg_pool3d`.
Example:
.. code-block:: text

    For scale_factor:
        if align_corners = True && out_size > 1 :
            scale_factor = (in_size-1.0)/(out_size-1.0)
        else:
            scale_factor = float(in_size/out_size)

    Linear interpolation:
        if:
            align_corners = False , align_mode = 0
            input : (N,C,W_in)
            output: (N,C,W_out) where:
                W_out = (W_{in}+0.5) * scale_{factor} - 0.5
        else:
            input : (N,C,W_in)
            output: (N,C,W_out) where:
                W_out = W_{in} * scale_{factor}

    Nearest neighbor interpolation:
        if:
            align_corners = False
            input : (N,C,H_in,W_in)
            output: (N,C,H_out,W_out) where:
                H_out = floor (H_{in} * scale_{factor})
                W_out = floor (W_{in} * scale_{factor})
        else:
            align_corners = True
            input : (N,C,H_in,W_in)
            output: (N,C,H_out,W_out) where:
                H_out = round(H_{in} * scale_{factor})
                W_out = round(W_{in} * scale_{factor})

    Bilinear interpolation:
        if:
            align_corners = False , align_mode = 0
            input : (N,C,H_in,W_in)
            output: (N,C,H_out,W_out) where:
                H_out = (H_{in}+0.5) * scale_{factor} - 0.5
                W_out = (W_{in}+0.5) * scale_{factor} - 0.5
        else:
            input : (N,C,H_in,W_in)
            output: (N,C,H_out,W_out) where:
                H_out = H_{in} * scale_{factor}
                W_out = W_{in} * scale_{factor}

    Bicubic interpolation:
        if:
            align_corners = False
            input : (N,C,H_in,W_in)
            output: (N,C,H_out,W_out) where:
                H_out = (H_{in}+0.5) * scale_{factor} - 0.5
                W_out = (W_{in}+0.5) * scale_{factor} - 0.5
        else:
            input : (N,C,H_in,W_in)
            output: (N,C,H_out,W_out) where:
                H_out = H_{in} * scale_{factor}
                W_out = W_{in} * scale_{factor}

    Trilinear interpolation:
        if:
            align_corners = False , align_mode = 0
            input : (N,C,D_in,H_in,W_in)
            output: (N,C,D_out,H_out,W_out) where:
                D_out = (D_{in}+0.5) * scale_{factor} - 0.5
                H_out = (H_{in}+0.5) * scale_{factor} - 0.5
                W_out = (W_{in}+0.5) * scale_{factor} - 0.5
        else:
            input : (N,C,D_in,H_in,W_in)
            output: (N,C,D_out,H_out,W_out) where:
                D_out = D_{in} * scale_{factor}
                H_out = H_{in} * scale_{factor}
                W_out = W_{in} * scale_{factor}

For details of linear interpolation, please refer to Wikipedia:
https://en.wikipedia.org/wiki/Linear_interpolation.
For details of nearest neighbor interpolation, please refer to Wikipedia:
https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation.
......@@ -876,23 +879,24 @@ def upsample(
name(str, optional): The default value is None.
Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name`
Returns:
A 3-D Tensor of the shape (num_batches, channels, out_w) or (num_batches, out_w, channels),
A 4-D Tensor of the shape (num_batches, channels, out_h, out_w) or (num_batches, out_h, out_w, channels),
or 5-D Tensor of the shape (num_batches, channels, out_d, out_h, out_w) or (num_batches, out_d, out_h, out_w, channels).
Examples:
.. code-block:: python
import paddle
import paddle.nn as nn

input_data = paddle.randn(shape=(2,3,6,10)).astype(paddle.float32)
upsample_out = paddle.nn.Upsample(size=[12,12])

output = upsample_out(x=input_data)
print(output.shape)
# [2L, 3L, 12L, 12L]
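# an illustrative variation: the same input upsampled by a scale_factor
# instead of an explicit size ('nearest' is the default mode)
upsample_scale = paddle.nn.Upsample(scale_factor=2)
output_scaled = upsample_scale(x=input_data)
print(output_scaled.shape)
# [2L, 3L, 12L, 20L]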
"""
return interpolate(
......
......@@ -23,6 +23,7 @@ __all__ = []
def pairwise_distance(x, y, p=2., epsilon=1e-6, keepdim=False, name=None):
r"""
It computes the pairwise distance between two vectors. The
distance is calculated by the p-order norm:
......@@ -48,10 +49,11 @@ def pairwise_distance(x, y, p=2., epsilon=1e-6, keepdim=False, name=None):
Returns:
Tensor, the dtype is same as input tensor.
- If :attr:`keepdim` is True, the output shape is :math:`[N, 1]` or :math:`[1]`,
depending on whether the input has data shaped as :math:`[N, D]`.
- If :attr:`keepdim` is False, the output shape is :math:`[N]` or :math:`[]`,
depending on whether the input has data shaped as :math:`[N, D]`.
Examples:
.. code-block:: python
......
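# Illustrative sketch of the keepdim behaviour described above, using two small
# [N, D] inputs (assumes paddle.nn.functional.pairwise_distance is importable).
import paddle

x = paddle.to_tensor([[1., 3.], [3., 5.]])
y = paddle.to_tensor([[5., 6.], [7., 8.]])
print(paddle.nn.functional.pairwise_distance(x, y).shape)                # [2]
print(paddle.nn.functional.pairwise_distance(x, y, keepdim=True).shape)  # [2, 1]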
......@@ -1310,6 +1310,7 @@ def margin_ranking_loss(
def l1_loss(input, label, reduction='mean', name=None):
r"""
Computes the L1 Loss of Tensor ``input`` and ``label`` as follows.
If `reduction` set to ``'none'``, the loss is:
......@@ -1341,7 +1342,7 @@ def l1_loss(input, label, reduction='mean', name=None):
Returns:
Tensor, the L1 Loss of Tensor ``input`` and ``label``.
If `reduction` is ``'none'``, the shape of output loss is :math:`[N, *]`, the same as ``input`` .
If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1].
Examples:
......@@ -1364,6 +1365,7 @@ def l1_loss(input, label, reduction='mean', name=None):
l1_loss = paddle.nn.functional.l1_loss(input, label, reduction='sum')
print(l1_loss.numpy())
# [1.4]
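# an illustrative addition: with reduction='none' the elementwise losses are
# kept, so the result has the same shape as `input`
l1_loss = paddle.nn.functional.l1_loss(input, label, reduction='none')
print(l1_loss.shape)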
"""
if reduction not in ['sum', 'mean', 'none']:
raise ValueError(
......@@ -2286,6 +2288,7 @@ def cross_entropy(
name=None,
):
r"""
By default, this operator implements the cross entropy loss function with softmax. This function
combines the calculation of the softmax operation and the cross entropy loss function
to provide a more numerically stable computing.
......@@ -2399,21 +2402,13 @@ def cross_entropy(
Parameters:
input (Tensor): the data type is float32, float64. Shape is :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes, ``k >= 1`` .
Note:
1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the output of softmax operator, which will produce incorrect results.
2. when use_softmax=False, it expects the output of softmax operator.
label (Tensor):
1. If soft_label=False, the shape is
:math:`[N_1, N_2, ..., N_k]` or :math:`[N_1, N_2, ..., N_k, 1]`, k >= 1.
the data type is int32, int64, float32, float64, where each value is [0, C-1].
......@@ -2421,48 +2416,27 @@ def cross_entropy(
2. If soft_label=True, the shape and data type should be same with ``input`` ,
and the sum of the labels for each sample should be 1.
weight (Tensor, optional): a manual rescaling weight given to each class.
If given, has to be a Tensor of size C and the data type is float32, float64.
Default is ``'None'`` .
ignore_index (int64, optional): Specifies a target value that is ignored
and does not contribute to the loss. A negative value means that no label
value needs to be ignored. Only valid when soft_label = False.
Default is ``-100`` .
reduction (str, optional): Indicate how to average the loss by batch_size,
the candidates are ``'none'`` | ``'mean'`` | ``'sum'``.
If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned.
If :attr:`reduction` is ``'none'``, the unreduced loss is returned.
Default is ``'mean'``.
soft_label (bool, optional): Indicate whether label is soft. Default is ``False``.
axis (int, optional): The index of dimension to perform softmax calculations.
It should be in range :math:`[-1, rank - 1]`, where :math:`rank` is the
number of dimensions of input :attr:`input`.
Default is ``-1`` .
use_softmax (bool, optional): Indicate whether compute softmax before cross_entropy.
Default is ``True``.
name (str, optional): The name of the operator. Default is ``None`` .
For more information, please refer to :ref:`api_guide_Name` .
Returns:
......@@ -2478,9 +2452,7 @@ def cross_entropy(
2. if soft_label = True, the dimension of return value is :math:`[N_1, N_2, ..., N_k, 1]` .
Examples:
.. code-block:: python
# hard labels
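# an illustrative sketch (shapes and values are made up for demonstration):
import paddle

input = paddle.rand([8, 10], dtype='float32')            # [N, C] unscaled logits
label = paddle.randint(0, 10, shape=[8], dtype='int64')  # [N] class indices in [0, C-1]
loss = paddle.nn.functional.cross_entropy(input, label)  # default reduction='mean'
print(loss)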
......@@ -3834,6 +3806,7 @@ def triplet_margin_loss(
def soft_margin_loss(input, label, reduction='mean', name=None):
"""
The API measures the soft margin loss between input predictions ``input``
and target labels ``label`` . It can be described as:
......@@ -3842,9 +3815,9 @@ def soft_margin_loss(input, label, reduction='mean', name=None):
Parameters:
input (Tensor): The input predictions tensor with shape: ``[N, *]``,
N is batch_size, `*` means any number of additional dimensions. The ``input`` ranges from -inf to inf.
Available dtype is float32, float64.
label (Tensor): The target labels tensor with the same shape as
``input``. The target labels whose values should be numbers -1 or 1.
......@@ -3862,8 +3835,7 @@ def soft_margin_loss(input, label, reduction='mean', name=None):
Returns:
Output (Tensor): If ``reduction`` is ``'none'``, the shape of output is same as ``input`` , else the shape of output is [1].
Examples:
.. code-block:: python
......@@ -3889,6 +3861,7 @@ def soft_margin_loss(input, label, reduction='mean', name=None):
# [0.84367639, 0.74795729, 0.44629076, 0.55123353, 0.77659678],
# [0.39465919, 0.76651484, 0.54485321, 0.76609844, 0.77166790],
# [0.51283568, 0.84757161, 0.78913331, 1.05268764, 0.45318675]])
"""
if reduction not in ['sum', 'mean', 'none']:
raise ValueError(
......