Commit dbe08e9b authored by yuguo960516yuguo's avatar yuguo960516yuguo
Browse files

2.4.2

parent b5499578
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
from __future__ import print_function from __future__ import print_function
import math import math
import numpy as np import numpy as np
import unittest import unittest
...@@ -45,34 +46,64 @@ class TestFoldOp(OpTest): ...@@ -45,34 +46,64 @@ class TestFoldOp(OpTest):
def calc_fold(self): def calc_fold(self):
output_shape = [0] * 4 output_shape = [0] * 4
output_shape[0] = self.batch_size output_shape[0] = self.batch_size
output_shape[1] = int(self.input_channels / output_shape[1] = int(
(self.kernel_sizes[0] * self.kernel_sizes[1])) self.input_channels / (self.kernel_sizes[0] * self.kernel_sizes[1])
)
output_shape[2] = self.output_sizes[0] output_shape[2] = self.output_sizes[0]
output_shape[3] = self.output_sizes[1] output_shape[3] = self.output_sizes[1]
dkernel_h = self.dilations[0] * (self.kernel_sizes[0] - 1) + 1 dkernel_h = self.dilations[0] * (self.kernel_sizes[0] - 1) + 1
dkernel_w = self.dilations[1] * (self.kernel_sizes[1] - 1) + 1 dkernel_w = self.dilations[1] * (self.kernel_sizes[1] - 1) + 1
col_height = int((self.output_sizes[0] + self.paddings[0] + col_height = (
self.paddings[2] - dkernel_h) / self.strides[0]) + 1 int(
col_width = int((self.output_sizes[1] + self.paddings[1] + (
self.paddings[3] - dkernel_w) / self.strides[1]) + 1 self.output_sizes[0]
+ self.paddings[0]
+ self.paddings[2]
- dkernel_h
)
/ self.strides[0]
)
+ 1
)
col_width = (
int(
(
self.output_sizes[1]
+ self.paddings[1]
+ self.paddings[3]
- dkernel_w
)
/ self.strides[1]
)
+ 1
)
output = np.zeros(output_shape).astype(np.float64) output = np.zeros(output_shape).astype(np.float64)
############ calculate output ############## ############ calculate output ##############
for b in range(output_shape[0]): for b in range(output_shape[0]):
for c in range(self.input_channels): for c in range(self.input_channels):
w_offset = int(c % self.kernel_sizes[1]) w_offset = int(c % self.kernel_sizes[1])
h_offset = int( h_offset = int(
(c / self.kernel_sizes[1]) % self.kernel_sizes[0]) (c / self.kernel_sizes[1]) % self.kernel_sizes[0]
)
c_out = int(c / self.kernel_sizes[0] / self.kernel_sizes[1]) c_out = int(c / self.kernel_sizes[0] / self.kernel_sizes[1])
for h in range(col_height): for h in range(col_height):
h_out = int(h * self.strides[0] - self.paddings[0] + h_out = int(
h_offset * self.dilations[0]) h * self.strides[0]
- self.paddings[0]
+ h_offset * self.dilations[0]
)
for w in range(col_width): for w in range(col_width):
w_out = int(w * self.strides[1] - self.paddings[1] + w_out = int(
w_offset * self.dilations[1]) w * self.strides[1]
- self.paddings[1]
+ w_offset * self.dilations[1]
)
if (h_out >= 0 and h_out < self.output_sizes[0]) and ( if (h_out >= 0 and h_out < self.output_sizes[0]) and (
w_out >= 0 and w_out < self.output_sizes[1]): w_out >= 0 and w_out < self.output_sizes[1]
output[b, c_out, h_out, ):
w_out] += self.x[b, c, w + col_width * h] output[b, c_out, h_out, w_out] += self.x[
b, c, w + col_width * h
]
self.outputs = output self.outputs = output
...@@ -85,7 +116,7 @@ class TestFoldOp(OpTest): ...@@ -85,7 +116,7 @@ class TestFoldOp(OpTest):
'paddings': self.paddings, 'paddings': self.paddings,
'dilations': self.dilations, 'dilations': self.dilations,
'strides': self.strides, 'strides': self.strides,
'output_sizes': self.output_sizes 'output_sizes': self.output_sizes,
} }
self.outputs = {'Y': self.outputs} self.outputs = {'Y': self.outputs}
...@@ -101,9 +132,23 @@ class TestFoldOp(OpTest): ...@@ -101,9 +132,23 @@ class TestFoldOp(OpTest):
self.check_grad(['X'], 'Y', check_eager=True) self.check_grad(['X'], 'Y', check_eager=True)
class TestFoldshape(TestFoldOp):
def init_data(self):
self.batch_size = 8
self.input_channels = 3 * 3 * 3
self.length = 6
self.kernel_sizes = [3, 3]
self.strides = [1, 1]
self.paddings = [0, 0, 0, 0]
self.dilations = [1, 1]
self.output_sizes = [4, 5]
input_shape = [self.batch_size, self.input_channels, self.length]
self.x = np.random.rand(*input_shape).astype(np.float64)
class TestFoldAPI(TestFoldOp): class TestFoldAPI(TestFoldOp):
#This is for test on paddle.nn.Fold # This is for test on paddle.nn.Fold
def setUp(self): def setUp(self):
self.op_type = 'fold' self.op_type = 'fold'
...@@ -120,19 +165,19 @@ class TestFoldAPI(TestFoldOp): ...@@ -120,19 +165,19 @@ class TestFoldAPI(TestFoldOp):
m = paddle.nn.Fold(**self.attrs) m = paddle.nn.Fold(**self.attrs)
m.eval() m.eval()
result = m(input) result = m(input)
np.testing.assert_allclose(result.numpy(), np.testing.assert_allclose(
self.outputs['Y'], result.numpy(), self.outputs['Y'], rtol=1e-05
rtol=1e-05) )
def test_info(self): def test_info(self):
str(paddle.nn.Fold(**self.attrs)) str(paddle.nn.Fold(**self.attrs))
class TestFoldOpError(unittest.TestCase): class TestFoldOpError(unittest.TestCase):
def test_errors(self): def test_errors(self):
from paddle.nn.functional import fold from paddle.nn.functional import fold
from paddle.fluid.framework import Program, program_guard from paddle.fluid.framework import Program, program_guard
with program_guard(Program(), Program()): with program_guard(Program(), Program()):
def test_input_shape(): def test_input_shape():
...@@ -148,59 +193,67 @@ class TestFoldOpError(unittest.TestCase): ...@@ -148,59 +193,67 @@ class TestFoldOpError(unittest.TestCase):
def test_padding_shape(): def test_padding_shape():
# padding_size must be 2 or 4 # padding_size must be 2 or 4
x = paddle.randn(shape=[2, 6, 6], dtype="float32") x = paddle.randn(shape=[2, 6, 6], dtype="float32")
out = fold(x, out = fold(
output_sizes=[2, 3], x,
kernel_sizes=[2, 2], output_sizes=[2, 3],
paddings=[2, 2, 3]) kernel_sizes=[2, 2],
paddings=[2, 2, 3],
)
def test_dilations_shape(): def test_dilations_shape():
# dialtions_size must be 2 # dialtions_size must be 2
x = paddle.randn(shape=[2, 6, 6], dtype="float32") x = paddle.randn(shape=[2, 6, 6], dtype="float32")
out = fold(x, out = fold(
output_sizes=[2, 3], x,
kernel_sizes=[2, 2], output_sizes=[2, 3],
dilations=[2, 2, 3]) kernel_sizes=[2, 2],
dilations=[2, 2, 3],
)
def test_strides_shape(): def test_strides_shape():
# strids_size must be 2 # strids_size must be 2
x = paddle.randn(shape=[2, 6, 6], dtype="float32") x = paddle.randn(shape=[2, 6, 6], dtype="float32")
out = fold(x, out = fold(
output_sizes=[2, 3], x,
kernel_sizes=[2, 2], output_sizes=[2, 3],
strides=[2, 2, 3]) kernel_sizes=[2, 2],
strides=[2, 2, 3],
)
def test_output_size(): def test_output_size():
# im_h * im_w must be L # im_h * im_w must be L
x = paddle.randn(shape=[2, 6, 6], dtype="float32") x = paddle.randn(shape=[2, 6, 6], dtype="float32")
out = fold(x, out = fold(
output_sizes=[6, 6], x, output_sizes=[6, 6], kernel_sizes=[2, 2], strides=[1, 1]
kernel_sizes=[2, 2], )
strides=[1, 1])
def test_output_size_2(): def test_output_size_2():
# out_size must GT 1 # out_size must GT 1
x = paddle.randn(shape=[2, 6, 6], dtype="float32") x = paddle.randn(shape=[2, 6, 6], dtype="float32")
out = fold(x, out = fold(
output_sizes=[0.1, 0.2], x,
kernel_sizes=[2, 2], output_sizes=[0.1, 0.2],
strides=[1, 1]) kernel_sizes=[2, 2],
strides=[1, 1],
)
def test_block_h_w(): def test_block_h_w():
# test_block_h_w GT 0 # test_block_h_w GT 0
x = paddle.randn(shape=[2, 1, 1], dtype="float32") x = paddle.randn(shape=[2, 1, 1], dtype="float32")
out = fold(x, out = fold(
output_sizes=[1, 1], x, output_sizes=[1, 1], kernel_sizes=[2, 2], strides=1
kernel_sizes=[2, 2], )
strides=1)
def test_GT_0(): def test_GT_0():
x = paddle.randn(shape=[2, 1, 1], dtype="float32") x = paddle.randn(shape=[2, 1, 1], dtype="float32")
out = fold(x, out = fold(
output_sizes=[0, 0], x,
kernel_sizes=[0, 0], output_sizes=[0, 0],
dilations=0, kernel_sizes=[0, 0],
paddings=[0, 0], dilations=0,
strides=0) paddings=[0, 0],
strides=0,
)
self.assertRaises(AssertionError, test_input_shape) self.assertRaises(AssertionError, test_input_shape)
self.assertRaises(AssertionError, test_kernel_shape) self.assertRaises(AssertionError, test_kernel_shape)
......
...@@ -30,10 +30,10 @@ from paddle.fluid.framework import default_main_program ...@@ -30,10 +30,10 @@ from paddle.fluid.framework import default_main_program
from paddle.fluid import core from paddle.fluid import core
@unittest.skipIf(not core.is_compiled_with_cuda(), @unittest.skipIf(
"Paddle is not compiled with CUDA") not core.is_compiled_with_cuda(), "Paddle is not compiled with CUDA"
)
class TestFusedGateAttentionOp(OpTest): class TestFusedGateAttentionOp(OpTest):
def setUp(self): def setUp(self):
self.__class__.op_type = "fused_gate_attention" self.__class__.op_type = "fused_gate_attention"
# use autograd to check grad in this unittest. # use autograd to check grad in this unittest.
...@@ -57,7 +57,6 @@ class TestFusedGateAttentionOp(OpTest): ...@@ -57,7 +57,6 @@ class TestFusedGateAttentionOp(OpTest):
self.bias_attr = True self.bias_attr = True
def generate_input_data(self): def generate_input_data(self):
def _random(shape): def _random(shape):
if self.dtype == "bfloat16": if self.dtype == "bfloat16":
data = np.random.random(shape).astype("float32") data = np.random.random(shape).astype("float32")
...@@ -67,7 +66,8 @@ class TestFusedGateAttentionOp(OpTest): ...@@ -67,7 +66,8 @@ class TestFusedGateAttentionOp(OpTest):
np.random.seed(123) np.random.seed(123)
self.query = _random( self.query = _random(
(self.batch_size, self.msa_len, self.res_len, self.q_dim)) (self.batch_size, self.msa_len, self.res_len, self.q_dim)
)
self.q_weight = _random((self.q_dim, self.num_heads, self.head_dim)) self.q_weight = _random((self.q_dim, self.num_heads, self.head_dim))
self.k_weight = _random((self.kv_dim, self.num_heads, self.head_dim)) self.k_weight = _random((self.kv_dim, self.num_heads, self.head_dim))
self.v_weight = _random((self.kv_dim, self.num_heads, self.head_dim)) self.v_weight = _random((self.kv_dim, self.num_heads, self.head_dim))
...@@ -80,15 +80,18 @@ class TestFusedGateAttentionOp(OpTest): ...@@ -80,15 +80,18 @@ class TestFusedGateAttentionOp(OpTest):
self.qkv_weight = np.stack([q_weight_t, k_weight_t, v_weight_t]) self.qkv_weight = np.stack([q_weight_t, k_weight_t, v_weight_t])
else: else:
self.key = _random( self.key = _random(
(self.batch_size, self.msa_len, self.m_size, self.kv_dim)) (self.batch_size, self.msa_len, self.m_size, self.kv_dim)
)
self.qkv_weight = None self.qkv_weight = None
self.attn_mask = _random( self.attn_mask = _random(
(self.batch_size, self.msa_len, 1, 1, self.m_size)) (self.batch_size, self.msa_len, 1, 1, self.m_size)
)
if self.bias_attr: if self.bias_attr:
self.nonbatched_bias = _random( self.nonbatched_bias = _random(
(self.batch_size, 1, self.num_heads, self.res_len, self.m_size)) (self.batch_size, 1, self.num_heads, self.res_len, self.m_size)
)
if self.has_gating: if self.has_gating:
self.gating_w = _random((self.q_dim, self.num_heads, self.head_dim)) self.gating_w = _random((self.q_dim, self.num_heads, self.head_dim))
...@@ -98,12 +101,17 @@ class TestFusedGateAttentionOp(OpTest): ...@@ -98,12 +101,17 @@ class TestFusedGateAttentionOp(OpTest):
self.output_b = _random((self.out_dim)) self.output_b = _random((self.out_dim))
self.dout = _random( self.dout = _random(
(self.batch_size, self.msa_len, self.res_len, self.q_dim)) (self.batch_size, self.msa_len, self.res_len, self.q_dim)
)
def collect_outputs(self, query, key, softmax_out, fmha_out, gate_out, out): def collect_outputs(self, query, key, softmax_out, fmha_out, gate_out, out):
outputs = [ outputs = [
softmax_out, fmha_out, gate_out if self.has_gating else None, out, softmax_out,
query.grad, None if self.merge_qkv else key.grad fmha_out,
gate_out if self.has_gating else None,
out,
query.grad,
None if self.merge_qkv else key.grad,
] ]
return outputs return outputs
...@@ -111,14 +119,17 @@ class TestFusedGateAttentionOp(OpTest): ...@@ -111,14 +119,17 @@ class TestFusedGateAttentionOp(OpTest):
paddle.disable_static(place=paddle.CUDAPlace(0)) paddle.disable_static(place=paddle.CUDAPlace(0))
query = paddle.to_tensor(self.query, stop_gradient=False) query = paddle.to_tensor(self.query, stop_gradient=False)
key = query if self.merge_qkv else paddle.to_tensor(self.key, key = (
stop_gradient=False) query
if self.merge_qkv
else paddle.to_tensor(self.key, stop_gradient=False)
)
q_weight = paddle.to_tensor(self.q_weight, stop_gradient=False) q_weight = paddle.to_tensor(self.q_weight, stop_gradient=False)
k_weight = paddle.to_tensor(self.k_weight, stop_gradient=False) k_weight = paddle.to_tensor(self.k_weight, stop_gradient=False)
v_weight = paddle.to_tensor(self.v_weight, stop_gradient=False) v_weight = paddle.to_tensor(self.v_weight, stop_gradient=False)
src_mask = paddle.to_tensor(self.attn_mask, stop_gradient=True) src_mask = paddle.to_tensor(self.attn_mask, stop_gradient=True)
c = self.head_dim**(-0.5) c = self.head_dim ** (-0.5)
# [batch_size, msa_len, res_len, q_dim], [q_dim, num_heads, head_dim] # [batch_size, msa_len, res_len, q_dim], [q_dim, num_heads, head_dim]
# -> [batch_size, msa_len, res_len, num_heads, head_dim] # -> [batch_size, msa_len, res_len, num_heads, head_dim]
q = paddle.einsum('nbqa,ahc->nbqhc', query, q_weight) * c q = paddle.einsum('nbqa,ahc->nbqhc', query, q_weight) * c
...@@ -136,8 +147,9 @@ class TestFusedGateAttentionOp(OpTest): ...@@ -136,8 +147,9 @@ class TestFusedGateAttentionOp(OpTest):
# -> [batch_size, msa_len, num_heads, res_len, m_size] # -> [batch_size, msa_len, num_heads, res_len, m_size]
logits = logits + src_mask logits = logits + src_mask
if self.bias_attr: if self.bias_attr:
nonbatched_bias = paddle.to_tensor(self.nonbatched_bias, nonbatched_bias = paddle.to_tensor(
stop_gradient=False) self.nonbatched_bias, stop_gradient=False
)
# [batch_size, msa_len, num_heads, res_len, m_size], [batch_size, 1, num_heads, res_len, m_size] # [batch_size, msa_len, num_heads, res_len, m_size], [batch_size, 1, num_heads, res_len, m_size]
# -> [batch_size, msa_len, num_heads, res_len, m_size] # -> [batch_size, msa_len, num_heads, res_len, m_size]
logits = logits + nonbatched_bias logits = logits + nonbatched_bias
...@@ -159,14 +171,22 @@ class TestFusedGateAttentionOp(OpTest): ...@@ -159,14 +171,22 @@ class TestFusedGateAttentionOp(OpTest):
# gate_values = paddle.einsum('nbqc,chv->nbqhv', query, # gate_values = paddle.einsum('nbqc,chv->nbqhv', query,
# gating_w) + gating_b # gating_w) + gating_b
gating_w_2d = paddle.reshape( gating_w_2d = paddle.reshape(
gating_w, shape=[self.q_dim, self.num_heads * self.head_dim]) gating_w, shape=[self.q_dim, self.num_heads * self.head_dim]
)
gate_values_4d = paddle.matmul(query, gating_w_2d) gate_values_4d = paddle.matmul(query, gating_w_2d)
gate_values = paddle.reshape( gate_values = (
gate_values_4d, paddle.reshape(
shape=[ gate_values_4d,
self.batch_size, self.msa_len, self.res_len, self.num_heads, shape=[
self.head_dim self.batch_size,
]) + gating_b self.msa_len,
self.res_len,
self.num_heads,
self.head_dim,
],
)
+ gating_b
)
gate_values = nn.functional.sigmoid(gate_values) gate_values = nn.functional.sigmoid(gate_values)
gate_out = fmha_out * gate_values gate_out = fmha_out * gate_values
else: else:
...@@ -183,20 +203,32 @@ class TestFusedGateAttentionOp(OpTest): ...@@ -183,20 +203,32 @@ class TestFusedGateAttentionOp(OpTest):
gate_out, gate_out,
shape=[ shape=[
self.batch_size * self.msa_len * self.res_len, self.batch_size * self.msa_len * self.res_len,
self.num_heads * self.head_dim self.num_heads * self.head_dim,
]) ],
)
output_w_2d = paddle.reshape( output_w_2d = paddle.reshape(
output_w, shape=[self.num_heads * self.head_dim, self.out_dim]) output_w, shape=[self.num_heads * self.head_dim, self.out_dim]
)
out_2d = paddle.matmul(gate_out_2d, output_w_2d) out_2d = paddle.matmul(gate_out_2d, output_w_2d)
out = paddle.reshape( out = (
out_2d, paddle.reshape(
shape=[self.batch_size, self.msa_len, self.res_len, self.out_dim out_2d,
]) + output_b shape=[
self.batch_size,
paddle.autograd.backward([out], [paddle.to_tensor(self.dout)], self.msa_len,
retain_graph=True) self.res_len,
return self.collect_outputs(query, key, softmax_out, fmha_out, gate_out, self.out_dim,
out) ],
)
+ output_b
)
paddle.autograd.backward(
[out], [paddle.to_tensor(self.dout)], retain_graph=True
)
return self.collect_outputs(
query, key, softmax_out, fmha_out, gate_out, out
)
def get_fused_gate_attention_out(self): def get_fused_gate_attention_out(self):
paddle.disable_static(place=paddle.CUDAPlace(0)) paddle.disable_static(place=paddle.CUDAPlace(0))
...@@ -218,8 +250,9 @@ class TestFusedGateAttentionOp(OpTest): ...@@ -218,8 +250,9 @@ class TestFusedGateAttentionOp(OpTest):
src_mask = paddle.to_tensor(self.attn_mask, stop_gradient=True) src_mask = paddle.to_tensor(self.attn_mask, stop_gradient=True)
if self.bias_attr: if self.bias_attr:
nonbatched_bias = paddle.to_tensor(self.nonbatched_bias, nonbatched_bias = paddle.to_tensor(
stop_gradient=False) self.nonbatched_bias, stop_gradient=False
)
else: else:
nonbatched_bias = None nonbatched_bias = None
if self.has_gating: if self.has_gating:
...@@ -232,18 +265,42 @@ class TestFusedGateAttentionOp(OpTest): ...@@ -232,18 +265,42 @@ class TestFusedGateAttentionOp(OpTest):
output_w = paddle.to_tensor(self.output_w, stop_gradient=False) output_w = paddle.to_tensor(self.output_w, stop_gradient=False)
output_b = paddle.to_tensor(self.output_b, stop_gradient=False) output_b = paddle.to_tensor(self.output_b, stop_gradient=False)
_, _, _, _, softmax_out, fmha_out, gate_out, out = _legacy_C_ops.fused_gate_attention( (
query, key, q_weight, k_weight, v_weight, qkv_weight, _,
nonbatched_bias, src_mask, gating_w, gating_b, output_w, output_b, _,
'has_gating', self.has_gating, 'merge_qkv', self.merge_qkv) _,
_,
paddle.autograd.backward([out], [paddle.to_tensor(self.dout)], softmax_out,
retain_graph=True) fmha_out,
return self.collect_outputs(query, key, softmax_out, fmha_out, gate_out, gate_out,
out) out,
) = _legacy_C_ops.fused_gate_attention(
query,
key,
q_weight,
k_weight,
v_weight,
qkv_weight,
nonbatched_bias,
src_mask,
gating_w,
gating_b,
output_w,
output_b,
'has_gating',
self.has_gating,
'merge_qkv',
self.merge_qkv,
)
paddle.autograd.backward(
[out], [paddle.to_tensor(self.dout)], retain_graph=True
)
return self.collect_outputs(
query, key, softmax_out, fmha_out, gate_out, out
)
def check(self, ref, out, atol, rtol, check_equal, name): def check(self, ref, out, atol, rtol, check_equal, name):
def _convert(value): def _convert(value):
if self.dtype == "bfloat16": if self.dtype == "bfloat16":
return convert_uint16_to_float(value) return convert_uint16_to_float(value)
...@@ -252,19 +309,25 @@ class TestFusedGateAttentionOp(OpTest): ...@@ -252,19 +309,25 @@ class TestFusedGateAttentionOp(OpTest):
if check_equal: if check_equal:
self.assertTrue( self.assertTrue(
np.equal(_convert(ref), _convert(out)).all(), np.equal(_convert(ref), _convert(out)).all(),
"Checking < {} > failed!".format(name)) "Checking < {} > failed!".format(name),
)
else: else:
np.testing.assert_allclose( np.testing.assert_allclose(
_convert(ref), _convert(ref),
_convert(out), _convert(out),
atol=atol, atol=atol,
rtol=rtol, rtol=rtol,
err_msg="Checking < {} > failed!".format(name)) err_msg="Checking < {} > failed!".format(name),
)
def check_output_and_grad(self, atol, rtol): def check_output_and_grad(self, atol, rtol):
output_names = [ output_names = [
"softmax_out", "fmha_out", "gate_out", "out", "query_grad", "softmax_out",
"key_grad" "fmha_out",
"gate_out",
"out",
"query_grad",
"key_grad",
] ]
outputs_ref = self.get_reference_out() outputs_ref = self.get_reference_out()
outputs_fused = self.get_fused_gate_attention_out() outputs_fused = self.get_fused_gate_attention_out()
...@@ -280,22 +343,26 @@ class TestFusedGateAttentionOp(OpTest): ...@@ -280,22 +343,26 @@ class TestFusedGateAttentionOp(OpTest):
# that in fused ops, check_equal is set to False and we use allclose # that in fused ops, check_equal is set to False and we use allclose
# to check the correctness. # to check the correctness.
check_equal = False check_equal = False
self.check(ref_res.numpy(), fused_res.numpy(), atol, rtol, self.check(
check_equal, output_names[i]) ref_res.numpy(),
fused_res.numpy(),
atol,
rtol,
check_equal,
output_names[i],
)
def test_output_and_grad(self): def test_output_and_grad(self):
self.check_output_and_grad(atol=1e-5, rtol=1e-6) self.check_output_and_grad(atol=1e-5, rtol=1e-6)
class TestMergeQKVLargeBatchSizeCase(TestFusedGateAttentionOp): class TestMergeQKVLargeBatchSizeCase(TestFusedGateAttentionOp):
def config(self): def config(self):
super().config() super().config()
self.batch_size = 2 self.batch_size = 2
class TestSeparatedQKVCase(TestFusedGateAttentionOp): class TestSeparatedQKVCase(TestFusedGateAttentionOp):
def config(self): def config(self):
self.dtype = "float32" self.dtype = "float32"
self.has_gating = False self.has_gating = False
...@@ -312,7 +379,6 @@ class TestSeparatedQKVCase(TestFusedGateAttentionOp): ...@@ -312,7 +379,6 @@ class TestSeparatedQKVCase(TestFusedGateAttentionOp):
class TestMergeQKVNoBiasGatingCase(TestFusedGateAttentionOp): class TestMergeQKVNoBiasGatingCase(TestFusedGateAttentionOp):
def config(self): def config(self):
super().config() super().config()
self.has_gating = False self.has_gating = False
...@@ -320,7 +386,6 @@ class TestMergeQKVNoBiasGatingCase(TestFusedGateAttentionOp): ...@@ -320,7 +386,6 @@ class TestMergeQKVNoBiasGatingCase(TestFusedGateAttentionOp):
class TestMergeQKVFp16Case(TestFusedGateAttentionOp): class TestMergeQKVFp16Case(TestFusedGateAttentionOp):
def config(self): def config(self):
super().config() super().config()
self.dtype = "float16" self.dtype = "float16"
...@@ -332,18 +397,18 @@ class TestMergeQKVFp16Case(TestFusedGateAttentionOp): ...@@ -332,18 +397,18 @@ class TestMergeQKVFp16Case(TestFusedGateAttentionOp):
class TestMergeQKVLargeBatchSizeFp16Case(TestMergeQKVFp16Case): class TestMergeQKVLargeBatchSizeFp16Case(TestMergeQKVFp16Case):
def config(self): def config(self):
super().config() super().config()
self.batch_size = 2 self.batch_size = 2
@unittest.skipIf( @unittest.skipIf(
not core.is_compiled_with_cuda() or get_cuda_version() < 11000, not core.is_compiled_with_cuda()
"core is not compiled with CUDA and cuda version need larger than or equal to 11.3" or get_cuda_version() < 11000
or paddle.device.cuda.get_device_capability()[0] < 8,
"core is not compiled with CUDA and cuda version need larger than or equal to 11.3",
) )
class TestMergeQKVBF16Case(TestFusedGateAttentionOp): class TestMergeQKVBF16Case(TestFusedGateAttentionOp):
def config(self): def config(self):
super().config() super().config()
self.dtype = "bfloat16" self.dtype = "bfloat16"
...@@ -353,7 +418,6 @@ class TestMergeQKVBF16Case(TestFusedGateAttentionOp): ...@@ -353,7 +418,6 @@ class TestMergeQKVBF16Case(TestFusedGateAttentionOp):
class TestMergeQKVLargeBatchSizeBF16Case(TestMergeQKVBF16Case): class TestMergeQKVLargeBatchSizeBF16Case(TestMergeQKVBF16Case):
def config(self): def config(self):
super().config() super().config()
self.batch_size = 2 self.batch_size = 2
......
...@@ -20,19 +20,22 @@ from functools import partial ...@@ -20,19 +20,22 @@ from functools import partial
class TestResnetGPU(TestResnetBase): class TestResnetGPU(TestResnetBase):
def test_seresnext_with_learning_rate_decay(self): def test_seresnext_with_learning_rate_decay(self):
# NOTE(zcd): This test is compare the result of use parallel_executor # NOTE(zcd): This test is compare the result of use parallel_executor
# and executor, and the result of drop_out op and batch_norm op in # and executor, and the result of drop_out op and batch_norm op in
# this two executor have diff, so the two ops should be removed # this two executor have diff, so the two ops should be removed
# from the model. # from the model.
check_func = partial(self.check_network_convergence, check_func = partial(
optimizer=seresnext_net.optimizer, self.check_network_convergence,
use_parallel_executor=False) optimizer=seresnext_net.optimizer,
self._compare_result_with_origin_model(check_func, use_parallel_executor=False,
use_device=DeviceType.CUDA, )
delta2=1e-5, self._compare_result_with_origin_model(
compare_separately=False) check_func,
use_device=DeviceType.CUDA,
delta2=1e-3,
compare_separately=False,
)
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -93,14 +93,9 @@ def get_csr_value(mat, layout, nnz): ...@@ -93,14 +93,9 @@ def get_csr_value(mat, layout, nnz):
return value return value
def ref_sparse_attention(q, def ref_sparse_attention(
k, q, k, v, offset, columns, kp_mask=None, attn_mask=None, bsz=None
v, ):
offset,
columns,
kp_mask=None,
attn_mask=None,
bsz=None):
row, col, nnz = q.shape[0], q.shape[1], columns.shape[0] row, col, nnz = q.shape[0], q.shape[1], columns.shape[0]
mat = np.zeros((row, row)) mat = np.zeros((row, row))
for cur_row in range(row): for cur_row in range(row):
...@@ -111,7 +106,7 @@ def ref_sparse_attention(q, ...@@ -111,7 +106,7 @@ def ref_sparse_attention(q,
mat[cur_row][cur_col] = 1 mat[cur_row][cur_col] = 1
a = np.dot(q, k.T) * mat a = np.dot(q, k.T) * mat
a_value = get_csr_value(a, mat, nnz) a_value = get_csr_value(a, mat, nnz)
scaling = float(col)**-0.5 scaling = float(col) ** -0.5
a = scaling * a a = scaling * a
for i in range(row): for i in range(row):
for j in range(row): for j in range(row):
...@@ -127,13 +122,9 @@ def ref_sparse_attention(q, ...@@ -127,13 +122,9 @@ def ref_sparse_attention(q,
return result, a_value, b_value return result, a_value, b_value
def ref_batch_sparse_attention(q, def ref_batch_sparse_attention(
k, q, k, v, offset, columns, kp_mask=None, attn_mask=None
v, ):
offset,
columns,
kp_mask=None,
attn_mask=None):
batch_size, num_heads, row, col = q.shape batch_size, num_heads, row, col = q.shape
nnz = columns.shape[2] nnz = columns.shape[2]
result = np.zeros((batch_size, num_heads, row, col)) result = np.zeros((batch_size, num_heads, row, col))
...@@ -141,11 +132,16 @@ def ref_batch_sparse_attention(q, ...@@ -141,11 +132,16 @@ def ref_batch_sparse_attention(q,
result_softmax = np.zeros((batch_size, num_heads, nnz)) result_softmax = np.zeros((batch_size, num_heads, nnz))
for i in range(batch_size): for i in range(batch_size):
for j in range(num_heads): for j in range(num_heads):
cur_q, cur_k, cur_v, = q[i][j], k[i][j], v[i][j] cur_q, cur_k, cur_v, = (
q[i][j],
k[i][j],
v[i][j],
)
cur_offset, cur_columns = offset[i][j], columns[i][j] cur_offset, cur_columns = offset[i][j], columns[i][j]
if kp_mask is None and attn_mask is None: if kp_mask is None and attn_mask is None:
cur_result, cur_sdd, cur_softmax = ref_sparse_attention( cur_result, cur_sdd, cur_softmax = ref_sparse_attention(
cur_q, cur_k, cur_v, cur_offset, cur_columns) cur_q, cur_k, cur_v, cur_offset, cur_columns
)
else: else:
cur_result, cur_sdd, cur_softmax = ref_sparse_attention( cur_result, cur_sdd, cur_softmax = ref_sparse_attention(
cur_q, cur_q,
...@@ -155,7 +151,8 @@ def ref_batch_sparse_attention(q, ...@@ -155,7 +151,8 @@ def ref_batch_sparse_attention(q,
cur_columns, cur_columns,
kp_mask=kp_mask, kp_mask=kp_mask,
attn_mask=attn_mask, attn_mask=attn_mask,
bsz=i) bsz=i,
)
result[i][j] = cur_result result[i][j] = cur_result
result_sdd[i][j], result_softmax[i][j] = cur_sdd, cur_softmax result_sdd[i][j], result_softmax[i][j] = cur_sdd, cur_softmax
return result, result_sdd, result_softmax return result, result_sdd, result_softmax
...@@ -193,10 +190,9 @@ def init_csr_format(batch_size, num_heads, rows, blocksize): ...@@ -193,10 +190,9 @@ def init_csr_format(batch_size, num_heads, rows, blocksize):
@unittest.skipIf( @unittest.skipIf(
not core.is_compiled_with_cuda() or get_cuda_version() < 11030, not core.is_compiled_with_cuda() or get_cuda_version() < 11030,
"core is not compiled with CUDA and cuda version need larger than or equal to 11.3" "core is not compiled with CUDA and cuda version need larger than or equal to 11.3",
) )
class TestSparseAttentionOp(OpTest): class TestSparseAttentionOp(OpTest):
def config(self): def config(self):
self.shape = (1, 1, 16, 16) self.shape = (1, 1, 16, 16)
self.blocksize = 4 self.blocksize = 4
...@@ -212,8 +208,9 @@ class TestSparseAttentionOp(OpTest): ...@@ -212,8 +208,9 @@ class TestSparseAttentionOp(OpTest):
self.k = np.random.random(self.shape).astype(self.dtype) self.k = np.random.random(self.shape).astype(self.dtype)
self.v = np.random.random(self.shape).astype(self.dtype) self.v = np.random.random(self.shape).astype(self.dtype)
# init CSR tensor # init CSR tensor
offset, columns = init_csr_format(self.shape[0], self.shape[1], offset, columns = init_csr_format(
self.shape[2], self.blocksize) self.shape[0], self.shape[1], self.shape[2], self.blocksize
)
self.offset = offset.astype('int32') self.offset = offset.astype('int32')
self.columns = columns.astype('int32') self.columns = columns.astype('int32')
# init mask tensor # init mask tensor
...@@ -234,10 +231,12 @@ class TestSparseAttentionOp(OpTest): ...@@ -234,10 +231,12 @@ class TestSparseAttentionOp(OpTest):
self.offset, self.offset,
self.columns, self.columns,
kp_mask=self.key_padding_mask, kp_mask=self.key_padding_mask,
attn_mask=self.attn_mask) attn_mask=self.attn_mask,
)
else: else:
result, result_sdd, result_softmax = ref_batch_sparse_attention( result, result_sdd, result_softmax = ref_batch_sparse_attention(
self.q, self.k, self.v, self.offset, self.columns) self.q, self.k, self.v, self.offset, self.columns
)
if self.use_mask == True: if self.use_mask == True:
self.inputs = { self.inputs = {
...@@ -260,7 +259,7 @@ class TestSparseAttentionOp(OpTest): ...@@ -260,7 +259,7 @@ class TestSparseAttentionOp(OpTest):
self.outputs = { self.outputs = {
'Out': result.astype(self.dtype), 'Out': result.astype(self.dtype),
'SparseDotSdd': result_sdd.astype(self.dtype), 'SparseDotSdd': result_sdd.astype(self.dtype),
'Softmax': result_softmax.astype(self.dtype) 'Softmax': result_softmax.astype(self.dtype),
} }
def test_check_output(self): def test_check_output(self):
...@@ -273,7 +272,6 @@ class TestSparseAttentionOp(OpTest): ...@@ -273,7 +272,6 @@ class TestSparseAttentionOp(OpTest):
class TestSparseAttentionOpFp32Test(TestSparseAttentionOp): class TestSparseAttentionOpFp32Test(TestSparseAttentionOp):
def config(self): def config(self):
self.shape = (1, 1, 8, 16) self.shape = (1, 1, 8, 16)
self.blocksize = 2 self.blocksize = 2
...@@ -282,7 +280,6 @@ class TestSparseAttentionOpFp32Test(TestSparseAttentionOp): ...@@ -282,7 +280,6 @@ class TestSparseAttentionOpFp32Test(TestSparseAttentionOp):
class TestSparseAttentionOpShapeTest(TestSparseAttentionOp): class TestSparseAttentionOpShapeTest(TestSparseAttentionOp):
def config(self): def config(self):
self.shape = (2, 2, 32, 8) self.shape = (2, 2, 32, 8)
self.blocksize = 8 self.blocksize = 8
...@@ -292,10 +289,9 @@ class TestSparseAttentionOpShapeTest(TestSparseAttentionOp): ...@@ -292,10 +289,9 @@ class TestSparseAttentionOpShapeTest(TestSparseAttentionOp):
@unittest.skipIf( @unittest.skipIf(
not core.is_compiled_with_cuda() or get_cuda_version() < 11030, not core.is_compiled_with_cuda() or get_cuda_version() < 11030,
"core is not compiled with CUDA and cuda version need larger than or equal to 11.3" "core is not compiled with CUDA and cuda version need larger than or equal to 11.3",
) )
class TestSparseAttentionAPI(unittest.TestCase): class TestSparseAttentionAPI(unittest.TestCase):
def setUp(self): def setUp(self):
self.place = paddle.CUDAPlace(0) self.place = paddle.CUDAPlace(0)
self.shape = (1, 1, 8, 4) self.shape = (1, 1, 8, 4)
...@@ -310,54 +306,62 @@ class TestSparseAttentionAPI(unittest.TestCase): ...@@ -310,54 +306,62 @@ class TestSparseAttentionAPI(unittest.TestCase):
K = paddle.static.data(name="K", shape=self.shape, dtype=self.dtype) K = paddle.static.data(name="K", shape=self.shape, dtype=self.dtype)
V = paddle.static.data(name="V", shape=self.shape, dtype=self.dtype) V = paddle.static.data(name="V", shape=self.shape, dtype=self.dtype)
batch_size, num_heads, rows = self.shape[0], self.shape[ batch_size, num_heads, rows = (
1], self.shape[2] self.shape[0],
self.shape[1],
self.shape[2],
)
block_num = rows / self.blocksize block_num = rows / self.blocksize
block_last = rows % self.blocksize block_last = rows % self.blocksize
sparse_nnz_num = block_num * self.blocksize * self.blocksize + block_last * block_last sparse_nnz_num = (
block_num * self.blocksize * self.blocksize
+ block_last * block_last
)
offset_shape = (batch_size, num_heads, rows + 1) offset_shape = (batch_size, num_heads, rows + 1)
columns_shape = (batch_size, num_heads, int(sparse_nnz_num)) columns_shape = (batch_size, num_heads, int(sparse_nnz_num))
offset = paddle.static.data(name="Offset", offset = paddle.static.data(
shape=offset_shape, name="Offset", shape=offset_shape, dtype="int32"
dtype="int32") )
columns = paddle.static.data(name="Columns", columns = paddle.static.data(
shape=columns_shape, name="Columns", shape=columns_shape, dtype="int32"
dtype="int32") )
key_padding_mask_shape = (self.shape[0], self.shape[2]) key_padding_mask_shape = (self.shape[0], self.shape[2])
attn_mask_shape = (self.shape[2], self.shape[2]) attn_mask_shape = (self.shape[2], self.shape[2])
if self.use_mask == True: if self.use_mask == True:
key_padding_mask = paddle.static.data( key_padding_mask = paddle.static.data(
name="KeyPaddingMask", name="KeyPaddingMask",
shape=key_padding_mask_shape, shape=key_padding_mask_shape,
dtype=self.dtype) dtype=self.dtype,
attn_mask = paddle.static.data(name="AttnMask", )
shape=attn_mask_shape, attn_mask = paddle.static.data(
dtype=self.dtype) name="AttnMask", shape=attn_mask_shape, dtype=self.dtype
Out = F.sparse_attention(Q, )
K, Out = F.sparse_attention(
V, Q,
offset, K,
columns, V,
key_padding_mask=key_padding_mask, offset,
attn_mask=attn_mask) columns,
key_padding_mask=key_padding_mask,
attn_mask=attn_mask,
)
else: else:
Out = F.sparse_attention(Q, K, V, offset, columns) Out = F.sparse_attention(Q, K, V, offset, columns)
Q_np = np.random.random(self.shape).astype(self.dtype) Q_np = np.random.random(self.shape).astype(self.dtype)
K_np = np.random.random(self.shape).astype(self.dtype) K_np = np.random.random(self.shape).astype(self.dtype)
V_np = np.random.random(self.shape).astype(self.dtype) V_np = np.random.random(self.shape).astype(self.dtype)
offset_np, columns_np = init_csr_format(self.shape[0], offset_np, columns_np = init_csr_format(
self.shape[1], self.shape[0], self.shape[1], self.shape[2], self.blocksize
self.shape[2], )
self.blocksize)
offset_np = offset_np.astype('int32') offset_np = offset_np.astype('int32')
columns_np = columns_np.astype('int32') columns_np = columns_np.astype('int32')
# init mask tensor # init mask tensor
key_padding_mask_np = np.random.randint(0, key_padding_mask_np = np.random.randint(
2, 0, 2, size=key_padding_mask_shape
size=key_padding_mask_shape) )
attn_mask_np = np.random.randint(0, 2, size=attn_mask_shape) attn_mask_np = np.random.randint(0, 2, size=attn_mask_shape)
key_padding_mask_np = init_mask(key_padding_mask_np) key_padding_mask_np = init_mask(key_padding_mask_np)
attn_mask_np = init_mask(attn_mask_np) attn_mask_np = init_mask(attn_mask_np)
...@@ -366,16 +370,18 @@ class TestSparseAttentionAPI(unittest.TestCase): ...@@ -366,16 +370,18 @@ class TestSparseAttentionAPI(unittest.TestCase):
exe = fluid.Executor(self.place) exe = fluid.Executor(self.place)
if self.use_mask == True: if self.use_mask == True:
fetches_result = exe.run(feed={ fetches_result = exe.run(
"Q": Q_np, feed={
"K": K_np, "Q": Q_np,
"V": V_np, "K": K_np,
"Offset": offset_np, "V": V_np,
"Columns": columns_np, "Offset": offset_np,
'KeyPaddingMask': key_padding_mask_np, "Columns": columns_np,
'AttnMask': attn_mask_np 'KeyPaddingMask': key_padding_mask_np,
}, 'AttnMask': attn_mask_np,
fetch_list=[Out]) },
fetch_list=[Out],
)
expected_result, __, __ = ref_batch_sparse_attention( expected_result, __, __ = ref_batch_sparse_attention(
Q_np, Q_np,
K_np, K_np,
...@@ -383,28 +389,32 @@ class TestSparseAttentionAPI(unittest.TestCase): ...@@ -383,28 +389,32 @@ class TestSparseAttentionAPI(unittest.TestCase):
offset_np, offset_np,
columns_np, columns_np,
kp_mask=key_padding_mask_np, kp_mask=key_padding_mask_np,
attn_mask=attn_mask_np) attn_mask=attn_mask_np,
)
else: else:
fetches_result = exe.run(feed={ fetches_result = exe.run(
"Q": Q_np, feed={
"K": K_np, "Q": Q_np,
"V": V_np, "K": K_np,
"Offset": offset_np, "V": V_np,
"Columns": columns_np "Offset": offset_np,
}, "Columns": columns_np,
fetch_list=[Out]) },
fetch_list=[Out],
)
expected_result, __, __ = ref_batch_sparse_attention( expected_result, __, __ = ref_batch_sparse_attention(
Q_np, K_np, V_np, offset_np, columns_np) Q_np, K_np, V_np, offset_np, columns_np
)
np.testing.assert_allclose(fetches_result, np.testing.assert_allclose(
expected_result, fetches_result[0], expected_result, rtol=1e-05, atol=1e-05
rtol=1e-05, )
atol=1e-05)
def test_dygraph(self): def test_dygraph(self):
paddle.disable_static() paddle.disable_static()
offset, columns = init_csr_format(self.shape[0], self.shape[1], offset, columns = init_csr_format(
self.shape[2], self.blocksize) self.shape[0], self.shape[1], self.shape[2], self.blocksize
)
offset = offset.astype('int32') offset = offset.astype('int32')
columns = columns.astype('int32') columns = columns.astype('int32')
query = np.random.random(self.shape).astype(self.dtype) query = np.random.random(self.shape).astype(self.dtype)
...@@ -429,13 +439,15 @@ class TestSparseAttentionAPI(unittest.TestCase): ...@@ -429,13 +439,15 @@ class TestSparseAttentionAPI(unittest.TestCase):
paddle_attn_mask = paddle.to_tensor(attn_mask, place=self.place) paddle_attn_mask = paddle.to_tensor(attn_mask, place=self.place)
if self.use_mask == True: if self.use_mask == True:
paddle_result = F.sparse_attention(paddle_query, paddle_result = F.sparse_attention(
paddle_key, paddle_query,
paddle_value, paddle_key,
paddle_offset, paddle_value,
paddle_colunmns, paddle_offset,
key_padding_mask=paddle_kp_mask, paddle_colunmns,
attn_mask=paddle_attn_mask) key_padding_mask=paddle_kp_mask,
attn_mask=paddle_attn_mask,
)
numpy_result, __, __ = ref_batch_sparse_attention( numpy_result, __, __ = ref_batch_sparse_attention(
query, query,
...@@ -444,25 +456,29 @@ class TestSparseAttentionAPI(unittest.TestCase): ...@@ -444,25 +456,29 @@ class TestSparseAttentionAPI(unittest.TestCase):
offset, offset,
columns, columns,
kp_mask=key_padding_mask, kp_mask=key_padding_mask,
attn_mask=attn_mask) attn_mask=attn_mask,
)
numpy_result = numpy_result.astype(self.dtype) numpy_result = numpy_result.astype(self.dtype)
else: else:
paddle_result = F.sparse_attention(paddle_query, paddle_key, paddle_result = F.sparse_attention(
paddle_value, paddle_offset, paddle_query,
paddle_colunmns) paddle_key,
paddle_value,
paddle_offset,
paddle_colunmns,
)
numpy_result, __, __ = ref_batch_sparse_attention( numpy_result, __, __ = ref_batch_sparse_attention(
query, key, value, offset, columns) query, key, value, offset, columns
)
numpy_result = numpy_result.astype(self.dtype) numpy_result = numpy_result.astype(self.dtype)
np.testing.assert_allclose(paddle_result.numpy(), np.testing.assert_allclose(
numpy_result, paddle_result.numpy(), numpy_result, rtol=1e-05, atol=1e-05
rtol=1e-05, )
atol=1e-05)
class TestSparseAttentionAPITestFloat(TestSparseAttentionAPI): class TestSparseAttentionAPITestFloat(TestSparseAttentionAPI):
def setUp(self): def setUp(self):
self.place = paddle.CUDAPlace(0) self.place = paddle.CUDAPlace(0)
self.shape = (2, 2, 8, 4) self.shape = (2, 2, 8, 4)
...@@ -472,7 +488,6 @@ class TestSparseAttentionAPITestFloat(TestSparseAttentionAPI): ...@@ -472,7 +488,6 @@ class TestSparseAttentionAPITestFloat(TestSparseAttentionAPI):
class TestSparseAttentionAPITestShape1(TestSparseAttentionAPI): class TestSparseAttentionAPITestShape1(TestSparseAttentionAPI):
def setUp(self): def setUp(self):
self.place = paddle.CUDAPlace(0) self.place = paddle.CUDAPlace(0)
self.shape = (2, 2, 64, 32) self.shape = (2, 2, 64, 32)
...@@ -482,7 +497,6 @@ class TestSparseAttentionAPITestShape1(TestSparseAttentionAPI): ...@@ -482,7 +497,6 @@ class TestSparseAttentionAPITestShape1(TestSparseAttentionAPI):
class TestSparseAttentionAPITestShape2(TestSparseAttentionAPI): class TestSparseAttentionAPITestShape2(TestSparseAttentionAPI):
def setUp(self): def setUp(self):
self.place = paddle.CUDAPlace(0) self.place = paddle.CUDAPlace(0)
self.shape = (2, 1, 64, 32) self.shape = (2, 1, 64, 32)
...@@ -492,7 +506,6 @@ class TestSparseAttentionAPITestShape2(TestSparseAttentionAPI): ...@@ -492,7 +506,6 @@ class TestSparseAttentionAPITestShape2(TestSparseAttentionAPI):
class TestSparseAttentionAPITestShape3(TestSparseAttentionAPI): class TestSparseAttentionAPITestShape3(TestSparseAttentionAPI):
def setUp(self): def setUp(self):
self.place = paddle.CUDAPlace(0) self.place = paddle.CUDAPlace(0)
self.shape = (4, 4, 128, 32) self.shape = (4, 4, 128, 32)
...@@ -502,7 +515,6 @@ class TestSparseAttentionAPITestShape3(TestSparseAttentionAPI): ...@@ -502,7 +515,6 @@ class TestSparseAttentionAPITestShape3(TestSparseAttentionAPI):
class TestSparseAttentionAPITestShape4(TestSparseAttentionAPI): class TestSparseAttentionAPITestShape4(TestSparseAttentionAPI):
def setUp(self): def setUp(self):
self.place = paddle.CUDAPlace(0) self.place = paddle.CUDAPlace(0)
self.shape = (3, 3, 35, 15) self.shape = (3, 3, 35, 15)
......
...@@ -64,42 +64,50 @@ class TestSparseElementWiseAPI(unittest.TestCase): ...@@ -64,42 +64,50 @@ class TestSparseElementWiseAPI(unittest.TestCase):
csr_y = s_dense_y.to_sparse_csr() csr_y = s_dense_y.to_sparse_csr()
actual_res = get_actual_res(csr_x, csr_y, op) actual_res = get_actual_res(csr_x, csr_y, op)
actual_res.backward(actual_res)
expect_res = op(dense_x, dense_y) expect_res = op(dense_x, dense_y)
expect_res.backward(expect_res) expect_res.backward(expect_res)
np.testing.assert_allclose(expect_res.numpy(), np.testing.assert_allclose(
actual_res.to_dense().numpy(), expect_res.numpy(),
rtol=1e-05, actual_res.to_dense().numpy(),
equal_nan=True) rtol=1e-05,
equal_nan=True,
)
if not (op == __truediv__ and dtype in ['int32', 'int64']): if not (op == __truediv__ and dtype in ['int32', 'int64']):
np.testing.assert_allclose(dense_x.grad.numpy(), actual_res.backward(actual_res)
csr_x.grad.to_dense().numpy(), np.testing.assert_allclose(
rtol=1e-05, dense_x.grad.numpy(),
equal_nan=True) csr_x.grad.to_dense().numpy(),
np.testing.assert_allclose(dense_y.grad.numpy(), rtol=1e-05,
csr_y.grad.to_dense().numpy(), equal_nan=True,
rtol=1e-05, )
equal_nan=True) np.testing.assert_allclose(
dense_y.grad.numpy(),
csr_y.grad.to_dense().numpy(),
rtol=1e-05,
equal_nan=True,
)
def func_test_coo(self, op): def func_test_coo(self, op):
for sparse_dim in range(len(self.coo_shape) - 1, len(self.coo_shape)): for sparse_dim in range(len(self.coo_shape) - 1, len(self.coo_shape)):
for dtype in self.support_dtypes: for dtype in self.support_dtypes:
x = np.random.randint(-255, 255, x = np.random.randint(-255, 255, size=self.coo_shape).astype(
size=self.coo_shape).astype(dtype) dtype
y = np.random.randint(-255, 255, )
size=self.coo_shape).astype(dtype) y = np.random.randint(-255, 255, size=self.coo_shape).astype(
dtype
)
dense_x = paddle.to_tensor(x, dtype=dtype, stop_gradient=False) dense_x = paddle.to_tensor(x, dtype=dtype, stop_gradient=False)
dense_y = paddle.to_tensor(y, dtype=dtype, stop_gradient=False) dense_y = paddle.to_tensor(y, dtype=dtype, stop_gradient=False)
s_dense_x = paddle.to_tensor(x, s_dense_x = paddle.to_tensor(
dtype=dtype, x, dtype=dtype, stop_gradient=False
stop_gradient=False) )
s_dense_y = paddle.to_tensor(y, s_dense_y = paddle.to_tensor(
dtype=dtype, y, dtype=dtype, stop_gradient=False
stop_gradient=False) )
coo_x = s_dense_x.to_sparse_coo(sparse_dim) coo_x = s_dense_x.to_sparse_coo(sparse_dim)
coo_y = s_dense_y.to_sparse_coo(sparse_dim) coo_y = s_dense_y.to_sparse_coo(sparse_dim)
...@@ -109,18 +117,24 @@ class TestSparseElementWiseAPI(unittest.TestCase): ...@@ -109,18 +117,24 @@ class TestSparseElementWiseAPI(unittest.TestCase):
expect_res = op(dense_x, dense_y) expect_res = op(dense_x, dense_y)
expect_res.backward(expect_res) expect_res.backward(expect_res)
np.testing.assert_allclose(expect_res.numpy(), np.testing.assert_allclose(
actual_res.to_dense().numpy(), expect_res.numpy(),
rtol=1e-05, actual_res.to_dense().numpy(),
equal_nan=True) rtol=1e-05,
np.testing.assert_allclose(dense_x.grad.numpy(), equal_nan=True,
coo_x.grad.to_dense().numpy(), )
rtol=1e-05, np.testing.assert_allclose(
equal_nan=True) dense_x.grad.numpy(),
np.testing.assert_allclose(dense_y.grad.numpy(), coo_x.grad.to_dense().numpy(),
coo_y.grad.to_dense().numpy(), rtol=1e-05,
rtol=1e-05, equal_nan=True,
equal_nan=True) )
np.testing.assert_allclose(
dense_y.grad.numpy(),
coo_y.grad.to_dense().numpy(),
rtol=1e-05,
equal_nan=True,
)
def test_support_dtypes_csr(self): def test_support_dtypes_csr(self):
paddle.device.set_device('cpu') paddle.device.set_device('cpu')
...@@ -140,38 +154,37 @@ class TestSparseElementWiseAPI(unittest.TestCase): ...@@ -140,38 +154,37 @@ class TestSparseElementWiseAPI(unittest.TestCase):
values2_data = [[1.0], [2.0]] values2_data = [[1.0], [2.0]]
shape = [2, 4, 2] shape = [2, 4, 2]
sp_a = sparse.sparse_coo_tensor(indices_data, sp_a = sparse.sparse_coo_tensor(
values1_data, indices_data, values1_data, shape, stop_gradient=False
shape, )
stop_gradient=False) sp_b = sparse.sparse_coo_tensor(
sp_b = sparse.sparse_coo_tensor(indices_data, indices_data, values2_data, shape, stop_gradient=False
values2_data, )
shape,
stop_gradient=False)
values1 = paddle.to_tensor(values1_data, stop_gradient=False) values1 = paddle.to_tensor(values1_data, stop_gradient=False)
values2 = paddle.to_tensor(values2_data, stop_gradient=False) values2 = paddle.to_tensor(values2_data, stop_gradient=False)
#c.values() = a.values() + b.values() # c.values() = a.values() + b.values()
sp_c = sparse.add(sp_a, sp_b) sp_c = sparse.add(sp_a, sp_b)
sp_c.backward() sp_c.backward()
ref_c = values1 + values2 ref_c = values1 + values2
ref_c.backward() ref_c.backward()
np.testing.assert_allclose(sp_c.values().numpy(), ref_c.numpy()) np.testing.assert_allclose(sp_c.values().numpy(), ref_c.numpy())
np.testing.assert_allclose(sp_a.grad.values().numpy(), np.testing.assert_allclose(
values1.grad.numpy()) sp_a.grad.values().numpy(), values1.grad.numpy()
np.testing.assert_allclose(sp_b.grad.values().numpy(), )
values2.grad.numpy()) np.testing.assert_allclose(
sp_b.grad.values().numpy(), values2.grad.numpy()
)
def test_add_bias(self): def test_add_bias(self):
indices_data = [[0, 1], [0, 3]] indices_data = [[0, 1], [0, 3]]
values_data = [[1.0, 1.0], [2.0, 2.0]] values_data = [[1.0, 1.0], [2.0, 2.0]]
shape = [2, 4, 2] shape = [2, 4, 2]
sp_a = sparse.sparse_coo_tensor(indices_data, sp_a = sparse.sparse_coo_tensor(
values_data, indices_data, values_data, shape, stop_gradient=False
shape, )
stop_gradient=False)
bias_values = [1.0, 2.0] bias_values = [1.0, 2.0]
...@@ -179,14 +192,15 @@ class TestSparseElementWiseAPI(unittest.TestCase): ...@@ -179,14 +192,15 @@ class TestSparseElementWiseAPI(unittest.TestCase):
values2 = paddle.to_tensor(bias_values, stop_gradient=False) values2 = paddle.to_tensor(bias_values, stop_gradient=False)
values3 = paddle.to_tensor(bias_values, stop_gradient=False) values3 = paddle.to_tensor(bias_values, stop_gradient=False)
#c.values() = a.values() + b # c.values() = a.values() + b
sp_c = sparse.add(sp_a, values2) sp_c = sparse.add(sp_a, values2)
sp_c.backward() sp_c.backward()
ref_c = values1 + values3 ref_c = values1 + values3
ref_c.backward() ref_c.backward()
np.testing.assert_allclose(sp_c.values().numpy(), ref_c.numpy()) np.testing.assert_allclose(sp_c.values().numpy(), ref_c.numpy())
np.testing.assert_allclose(sp_a.grad.values().numpy(), np.testing.assert_allclose(
values1.grad.numpy()) sp_a.grad.values().numpy(), values1.grad.numpy()
)
np.testing.assert_allclose(values2.grad.numpy(), values3.grad.numpy()) np.testing.assert_allclose(values2.grad.numpy(), values3.grad.numpy())
......
...@@ -28,7 +28,6 @@ paddle.enable_static() ...@@ -28,7 +28,6 @@ paddle.enable_static()
# Correct: General. # Correct: General.
class TestSqueezeOp(OpTest): class TestSqueezeOp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "squeeze2" self.op_type = "squeeze2"
self.python_api = paddle.squeeze self.python_api = paddle.squeeze
...@@ -40,7 +39,7 @@ class TestSqueezeOp(OpTest): ...@@ -40,7 +39,7 @@ class TestSqueezeOp(OpTest):
self.init_attrs() self.init_attrs()
self.outputs = { self.outputs = {
"Out": self.inputs["X"].reshape(self.new_shape), "Out": self.inputs["X"].reshape(self.new_shape),
"XShape": np.random.random(self.ori_shape).astype("float64") "XShape": np.random.random(self.ori_shape).astype("float64"),
} }
def test_check_output(self): def test_check_output(self):
...@@ -60,7 +59,6 @@ class TestSqueezeOp(OpTest): ...@@ -60,7 +59,6 @@ class TestSqueezeOp(OpTest):
# Correct: There is mins axis. # Correct: There is mins axis.
class TestSqueezeOp1(TestSqueezeOp): class TestSqueezeOp1(TestSqueezeOp):
def init_test_case(self): def init_test_case(self):
self.ori_shape = (1, 20, 1, 5) self.ori_shape = (1, 20, 1, 5)
self.axes = (0, -2) self.axes = (0, -2)
...@@ -69,7 +67,6 @@ class TestSqueezeOp1(TestSqueezeOp): ...@@ -69,7 +67,6 @@ class TestSqueezeOp1(TestSqueezeOp):
# Correct: No axes input. # Correct: No axes input.
class TestSqueezeOp2(TestSqueezeOp): class TestSqueezeOp2(TestSqueezeOp):
def init_test_case(self): def init_test_case(self):
self.ori_shape = (1, 20, 1, 5) self.ori_shape = (1, 20, 1, 5)
self.axes = () self.axes = ()
...@@ -78,7 +75,6 @@ class TestSqueezeOp2(TestSqueezeOp): ...@@ -78,7 +75,6 @@ class TestSqueezeOp2(TestSqueezeOp):
# Correct: Just part of axes be squeezed. # Correct: Just part of axes be squeezed.
class TestSqueezeOp3(TestSqueezeOp): class TestSqueezeOp3(TestSqueezeOp):
def init_test_case(self): def init_test_case(self):
self.ori_shape = (6, 1, 5, 1, 4, 1) self.ori_shape = (6, 1, 5, 1, 4, 1)
self.axes = (1, -1) self.axes = (1, -1)
...@@ -86,7 +82,6 @@ class TestSqueezeOp3(TestSqueezeOp): ...@@ -86,7 +82,6 @@ class TestSqueezeOp3(TestSqueezeOp):
class TestSqueeze2AxesTensor(UnittestBase): class TestSqueeze2AxesTensor(UnittestBase):
def init_info(self): def init_info(self):
self.shapes = [[2, 3, 4]] self.shapes = [[2, 3, 4]]
self.save_path = os.path.join(self.temp_dir.name, 'squeeze_tensor') self.save_path = os.path.join(self.temp_dir.name, 'squeeze_tensor')
...@@ -123,7 +118,6 @@ class TestSqueeze2AxesTensor(UnittestBase): ...@@ -123,7 +118,6 @@ class TestSqueeze2AxesTensor(UnittestBase):
class TestSqueeze2AxesTensorList(UnittestBase): class TestSqueeze2AxesTensorList(UnittestBase):
def init_info(self): def init_info(self):
self.shapes = [[2, 3, 4]] self.shapes = [[2, 3, 4]]
self.save_path = os.path.join(self.temp_dir.name, 'squeeze_tensor') self.save_path = os.path.join(self.temp_dir.name, 'squeeze_tensor')
...@@ -140,7 +134,7 @@ class TestSqueeze2AxesTensorList(UnittestBase): ...@@ -140,7 +134,7 @@ class TestSqueeze2AxesTensorList(UnittestBase):
# axes is a list[Variable] # axes is a list[Variable]
axes = [ axes = [
paddle.full([1], 0, dtype='int32'), paddle.full([1], 0, dtype='int32'),
paddle.full([1], 2, dtype='int32') paddle.full([1], 2, dtype='int32'),
] ]
out = paddle.squeeze(feat, axes) out = paddle.squeeze(feat, axes)
out2 = paddle.fluid.layers.squeeze(feat, axes) out2 = paddle.fluid.layers.squeeze(feat, axes)
...@@ -162,5 +156,37 @@ class TestSqueeze2AxesTensorList(UnittestBase): ...@@ -162,5 +156,37 @@ class TestSqueeze2AxesTensorList(UnittestBase):
self.assertEqual(infer_out.shape, (2, 3, 10)) self.assertEqual(infer_out.shape, (2, 3, 10))
# test api
class TestSqueezeAPI(unittest.TestCase):
def setUp(self):
self.executed_api()
def executed_api(self):
self.squeeze = paddle.squeeze
def test_api(self):
paddle.disable_static()
input_data = np.random.random([3, 2, 1]).astype("float32")
x = paddle.to_tensor(input_data)
out = self.squeeze(x, axis=2)
out.backward()
self.assertEqual(out.shape, [3, 2])
paddle.enable_static()
def test_error(self):
def test_axes_type():
x2 = paddle.static.data(name="x2", shape=[2, 1, 25], dtype="int32")
self.squeeze(x2, axis=2.1)
self.assertRaises(TypeError, test_axes_type)
class TestSqueezeInplaceAPI(TestSqueezeAPI):
def executed_api(self):
self.squeeze = paddle.squeeze_
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -12,16 +12,13 @@ ...@@ -12,16 +12,13 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from __future__ import print_function
import unittest
import re import re
import unittest
import paddle.version as fluid_version import paddle.version as fluid_version
class VersionTest(unittest.TestCase): class VersionTest(unittest.TestCase):
def setUp(self): def setUp(self):
self._major_regex = "[0-9]+" self._major_regex = "[0-9]+"
self._minor_regex = "[0-9]+" self._minor_regex = "[0-9]+"
...@@ -37,15 +34,20 @@ class VersionTest(unittest.TestCase): ...@@ -37,15 +34,20 @@ class VersionTest(unittest.TestCase):
# check version format # check version format
if fluid_version.istaged: if fluid_version.istaged:
self.assertEqual(fluid_version.major, 0)
self.assertEqual(fluid_version.minor, 0)
self.assertEqual(fluid_version.patch, "0")
self.assertEqual(fluid_version.rc, 0)
self.assertEqual(fluid_version.full_version, "0.0.0")
else:
self.assertTrue(re.match(self._major_regex, fluid_version.major)) self.assertTrue(re.match(self._major_regex, fluid_version.major))
self.assertTrue(re.match(self._minor_regex, fluid_version.minor)) self.assertTrue(re.match(self._minor_regex, fluid_version.minor))
self.assertTrue(re.match(self._patch_regex, fluid_version.patch)) self.assertTrue(re.match(self._patch_regex, fluid_version.patch))
self.assertTrue(re.match(self._rc_regex, fluid_version.rc)) self.assertTrue(re.match(self._rc_regex, fluid_version.rc))
self.assertTrue( self.assertTrue(
re.match(self._version_regex, fluid_version.full_version)) re.match(self._version_regex, fluid_version.full_version)
)
else:
self.assertEqual(fluid_version.major, "0")
self.assertEqual(fluid_version.minor, "0")
self.assertEqual(fluid_version.patch, "0")
self.assertEqual(fluid_version.rc, "0")
self.assertEqual(fluid_version.full_version, "0.0.0")
if __name__ == '__main__':
unittest.main()
...@@ -241,13 +241,13 @@ def send_ue_recv( ...@@ -241,13 +241,13 @@ def send_ue_recv(
src_index (Tensor): An 1-D tensor, and the available data type is int32, int64. src_index (Tensor): An 1-D tensor, and the available data type is int32, int64.
dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`. dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`.
The available data type is int32, int64. The available data type is int32, int64.
message_op (str): Different message ops for x and e, including `add`, `sub`, `mul`, `div`. message_op (str, optional): Different message ops for x and e, including `add`, `sub`, `mul`, `div`.
reduce_op (str): Different reduce ops, including `sum`, `mean`, `max`, `min`. reduce_op (str, optional): Different reduce ops, including `sum`, `mean`, `max`, `min`.
Default value is `sum`. Default value is `sum`.
out_size (int|Tensor|None): We can set `out_size` to get necessary output shape. If not set or out_size (int|Tensor, optional): We can set `out_size` to get necessary output shape. If not set or
out_size is smaller or equal to 0, then this input will not be used. out_size is smaller or equal to 0, then this input will not be used.
Otherwise, `out_size` should be equal with or larger than Otherwise, `out_size` should be equal with or larger than
max(dst_index) + 1. max(dst_index) + 1. Default value is `None`.
name (str, optional): Name for the operation (optional, default is None). name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
......
...@@ -26,6 +26,7 @@ def reindex_graph( ...@@ -26,6 +26,7 @@ def reindex_graph(
x, neighbors, count, value_buffer=None, index_buffer=None, name=None x, neighbors, count, value_buffer=None, index_buffer=None, name=None
): ):
""" """
Reindex Graph API. Reindex Graph API.
This API is mainly used in Graph Learning domain, which should be used This API is mainly used in Graph Learning domain, which should be used
...@@ -49,12 +50,12 @@ def reindex_graph( ...@@ -49,12 +50,12 @@ def reindex_graph(
should be the same with `x`. should be the same with `x`.
count (Tensor): The neighbor count of the input nodes `x`. And the count (Tensor): The neighbor count of the input nodes `x`. And the
data type should be int32. data type should be int32.
value_buffer (Tensor|None): Value buffer for hashtable. The data type should be int32, value_buffer (Tensor, optional): Value buffer for hashtable. The data type should be int32,
and should be filled with -1. Only useful for gpu version. and should be filled with -1. Only useful for gpu version. Default is None.
index_buffer (Tensor|None): Index buffer for hashtable. The data type should be int32, index_buffer (Tensor, optional): Index buffer for hashtable. The data type should be int32,
and should be filled with -1. Only useful for gpu version. and should be filled with -1. Only useful for gpu version.
`value_buffer` and `index_buffer` should be both not None `value_buffer` and `index_buffer` should be both not None
if you want to speed up by using hashtable buffer. if you want to speed up by using hashtable buffer. Default is None.
name (str, optional): Name for the operation (optional, default is None). name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
...@@ -69,6 +70,7 @@ def reindex_graph( ...@@ -69,6 +70,7 @@ def reindex_graph(
.. code-block:: python .. code-block:: python
import paddle import paddle
x = [0, 1, 2] x = [0, 1, 2]
neighbors = [8, 9, 0, 4, 7, 6, 7] neighbors = [8, 9, 0, 4, 7, 6, 7]
count = [2, 3, 2] count = [2, 3, 2]
...@@ -138,6 +140,7 @@ def reindex_heter_graph( ...@@ -138,6 +140,7 @@ def reindex_heter_graph(
x, neighbors, count, value_buffer=None, index_buffer=None, name=None x, neighbors, count, value_buffer=None, index_buffer=None, name=None
): ):
""" """
Reindex HeterGraph API. Reindex HeterGraph API.
This API is mainly used in Graph Learning domain, which should be used This API is mainly used in Graph Learning domain, which should be used
...@@ -161,12 +164,12 @@ def reindex_heter_graph( ...@@ -161,12 +164,12 @@ def reindex_heter_graph(
The data type should be the same with `x`. The data type should be the same with `x`.
count (list|tuple): The neighbor counts of the input nodes `x` from different graphs. count (list|tuple): The neighbor counts of the input nodes `x` from different graphs.
And the data type should be int32. And the data type should be int32.
value_buffer (Tensor|None): Value buffer for hashtable. The data type should be int32, value_buffer (Tensor, optional): Value buffer for hashtable. The data type should be int32,
and should be filled with -1. Only useful for gpu version. and should be filled with -1. Only useful for gpu version. Default is None.
index_buffer (Tensor|None): Index buffer for hashtable. The data type should be int32, index_buffer (Tensor, optional): Index buffer for hashtable. The data type should be int32,
and should be filled with -1. Only useful for gpu version. and should be filled with -1. Only useful for gpu version.
`value_buffer` and `index_buffer` should be both not None `value_buffer` and `index_buffer` should be both not None
if you want to speed up by using hashtable buffer. if you want to speed up by using hashtable buffer. Default is None.
name (str, optional): Name for the operation (optional, default is None). name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
...@@ -183,6 +186,7 @@ def reindex_heter_graph( ...@@ -183,6 +186,7 @@ def reindex_heter_graph(
.. code-block:: python .. code-block:: python
import paddle import paddle
x = [0, 1, 2] x = [0, 1, 2]
neighbors_a = [8, 9, 0, 4, 7, 6, 7] neighbors_a = [8, 9, 0, 4, 7, 6, 7]
count_a = [2, 3, 2] count_a = [2, 3, 2]
......
...@@ -32,6 +32,7 @@ def sample_neighbors( ...@@ -32,6 +32,7 @@ def sample_neighbors(
name=None, name=None,
): ):
""" """
Graph Sample Neighbors API. Graph Sample Neighbors API.
This API is mainly used in Graph Learning domain, and the main purpose is to This API is mainly used in Graph Learning domain, and the main purpose is to
...@@ -52,16 +53,16 @@ def sample_neighbors( ...@@ -52,16 +53,16 @@ def sample_neighbors(
The data type should be the same with `row`. The data type should be the same with `row`.
input_nodes (Tensor): The input nodes we need to sample neighbors for, and the input_nodes (Tensor): The input nodes we need to sample neighbors for, and the
data type should be the same with `row`. data type should be the same with `row`.
sample_size (int): The number of neighbors we need to sample. Default value is -1, sample_size (int, optional): The number of neighbors we need to sample. Default value is -1,
which means returning all the neighbors of the input nodes. which means returning all the neighbors of the input nodes.
eids (Tensor): The eid information of the input graph. If return_eids is True, eids (Tensor, optional): The eid information of the input graph. If return_eids is True,
then `eids` should not be None. The data type should be the then `eids` should not be None. The data type should be the
same with `row`. Default is None. same with `row`. Default is None.
return_eids (bool): Whether to return eid information of sample edges. Default is False. return_eids (bool, optional): Whether to return eid information of sample edges. Default is False.
perm_buffer (Tensor): Permutation buffer for fisher-yates sampling. If `use_perm_buffer` perm_buffer (Tensor, optional): Permutation buffer for fisher-yates sampling. If `use_perm_buffer`
is True, then `perm_buffer` should not be None. The data type should is True, then `perm_buffer` should not be None. The data type should
be the same with `row`. If not None, we will use fiser-yates sampling be the same with `row`. If not None, we will use fiser-yates sampling
to speed up. Only useful for gpu version. to speed up. Only useful for gpu version. Default is None.
name (str, optional): Name for the operation (optional, default is None). name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
...@@ -69,15 +70,16 @@ def sample_neighbors( ...@@ -69,15 +70,16 @@ def sample_neighbors(
- out_neighbors (Tensor), the sample neighbors of the input nodes. - out_neighbors (Tensor), the sample neighbors of the input nodes.
- out_count (Tensor), the number of sampling neighbors of each input node, and the shape - out_count (Tensor), the number of sampling neighbors of each input node, and the shape
should be the same with `input_nodes`. should be the same with `input_nodes`.
- out_eids (Tensor), if `return_eids` is True, we will return the eid information of the - out_eids (Tensor), if `return_eids` is True, we will return the eid information of the
sample edges. sample edges.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
# edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4), # edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4),
# (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8) # (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8)
row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7] row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7]
......
...@@ -69,8 +69,9 @@ def to_list(value): ...@@ -69,8 +69,9 @@ def to_list(value):
def to_numpy(var): def to_numpy(var):
assert isinstance(var, (Variable, fluid.core.VarBase, assert isinstance(
fluid.core.eager.Tensor)), "not a variable" var, (Variable, fluid.core.VarBase, fluid.core.eager.Tensor)
), "not a variable"
if isinstance(var, (fluid.core.VarBase, fluid.core.eager.Tensor)): if isinstance(var, (fluid.core.VarBase, fluid.core.eager.Tensor)):
return var.numpy() return var.numpy()
t = global_scope().find_var(var.name).get_tensor() t = global_scope().find_var(var.name).get_tensor()
...@@ -105,10 +106,9 @@ def extract_args(func): ...@@ -105,10 +106,9 @@ def extract_args(func):
def _all_gather(x, nranks, ring_id=0, use_calc_stream=True): def _all_gather(x, nranks, ring_id=0, use_calc_stream=True):
return collective._c_allgather(x, return collective._c_allgather(
nranks, x, nranks, ring_id=ring_id, use_calc_stream=use_calc_stream
ring_id=ring_id, )
use_calc_stream=use_calc_stream)
def wait_server_ready(endpoints): def wait_server_ready(endpoints):
...@@ -119,7 +119,8 @@ def wait_server_ready(endpoints): ...@@ -119,7 +119,8 @@ def wait_server_ready(endpoints):
for ep in endpoints: for ep in endpoints:
ip_port = ep.split(":") ip_port = ep.split(":")
with contextlib.closing( with contextlib.closing(
socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: socket.socket(socket.AF_INET, socket.SOCK_STREAM)
) as sock:
sock.settimeout(2) sock.settimeout(2)
result = sock.connect_ex((ip_port[0], int(ip_port[1]))) result = sock.connect_ex((ip_port[0], int(ip_port[1])))
if result != 0: if result != 0:
...@@ -131,8 +132,9 @@ def wait_server_ready(endpoints): ...@@ -131,8 +132,9 @@ def wait_server_ready(endpoints):
break break
def init_communicator(program, rank, nranks, wait_port, current_endpoint, def init_communicator(
endpoints): program, rank, nranks, wait_port, current_endpoint, endpoints
):
if nranks < 2: if nranks < 2:
return return
other_endpoints = endpoints[:] other_endpoints = endpoints[:]
...@@ -144,53 +146,66 @@ def init_communicator(program, rank, nranks, wait_port, current_endpoint, ...@@ -144,53 +146,66 @@ def init_communicator(program, rank, nranks, wait_port, current_endpoint,
nccl_id_var = block.create_var( nccl_id_var = block.create_var(
name=fluid.unique_name.generate('nccl_id'), name=fluid.unique_name.generate('nccl_id'),
persistable=True, persistable=True,
type=fluid.core.VarDesc.VarType.RAW) type=fluid.core.VarDesc.VarType.RAW,
)
block.append_op(type='c_gen_nccl_id',
inputs={}, block.append_op(
outputs={'Out': nccl_id_var}, type='c_gen_nccl_id',
attrs={ inputs={},
'rank': rank, outputs={'Out': nccl_id_var},
'endpoint': current_endpoint, attrs={
'other_endpoints': other_endpoints 'rank': rank,
}) 'endpoint': current_endpoint,
'other_endpoints': other_endpoints,
block.append_op(type='c_comm_init', },
inputs={'X': nccl_id_var}, )
outputs={},
attrs={ block.append_op(
'nranks': nranks, type='c_comm_init',
'rank': rank, inputs={'X': nccl_id_var},
'ring_id': 0, outputs={},
}) attrs={
'nranks': nranks,
'rank': rank,
'ring_id': 0,
},
)
elif core.is_compiled_with_npu(): elif core.is_compiled_with_npu():
hccl_id_var = block.create_var( hccl_id_var = block.create_var(
name=fluid.unique_name.generate('hccl_id'), name=fluid.unique_name.generate('hccl_id'),
persistable=True, persistable=True,
type=core.VarDesc.VarType.RAW) type=core.VarDesc.VarType.RAW,
block.append_op(type='c_gen_hccl_id', )
inputs={}, block.append_op(
outputs={'Out': hccl_id_var}, type='c_gen_hccl_id',
attrs={ inputs={},
'rank': rank, outputs={'Out': hccl_id_var},
'endpoint': current_endpoint, attrs={
'other_endpoints': other_endpoints 'rank': rank,
}) 'endpoint': current_endpoint,
block.append_op(type='c_comm_init_hccl', 'other_endpoints': other_endpoints,
inputs={'X': hccl_id_var}, },
outputs={}, )
attrs={ block.append_op(
'rank': rank, type='c_comm_init_hccl',
'ring_id': 0, inputs={'X': hccl_id_var},
'device_id': int(os.getenv("FLAGS_selected_npus")), outputs={},
'rank_ids': nranks attrs={
}) 'rank': rank,
'ring_id': 0,
'device_id': int(os.getenv("FLAGS_selected_npus")),
'rank_ids': nranks,
},
)
def prepare_distributed_context(place=None): def prepare_distributed_context(place=None):
if place is None: if place is None:
place = fluid.CUDAPlace(ParallelEnv().dev_id) if ParallelEnv().nranks > 1 \ place = (
fluid.CUDAPlace(ParallelEnv().dev_id)
if ParallelEnv().nranks > 1
else fluid.CUDAPlace(0) else fluid.CUDAPlace(0)
)
place = _get_paddle_place(place) place = _get_paddle_place(place)
strategy = fluid.dygraph.parallel.ParallelStrategy() strategy = fluid.dygraph.parallel.ParallelStrategy()
...@@ -208,9 +223,14 @@ def prepare_distributed_context(place=None): ...@@ -208,9 +223,14 @@ def prepare_distributed_context(place=None):
def _init_context(): def _init_context():
communicator_prog = fluid.Program() communicator_prog = fluid.Program()
init_communicator(communicator_prog, strategy.local_rank, init_communicator(
strategy.nranks, True, strategy.current_endpoint, communicator_prog,
strategy.trainer_endpoints) strategy.local_rank,
strategy.nranks,
True,
strategy.current_endpoint,
strategy.trainer_endpoints,
)
exe = fluid.Executor(place) exe = fluid.Executor(place)
exe.run(communicator_prog) exe.run(communicator_prog)
...@@ -220,7 +240,7 @@ def prepare_distributed_context(place=None): ...@@ -220,7 +240,7 @@ def prepare_distributed_context(place=None):
fluid.enable_dygraph(place) fluid.enable_dygraph(place)
else: else:
assert ("Only support CUDAPlace for now.") assert "Only support CUDAPlace for now."
_parallel_context_initialized = True _parallel_context_initialized = True
return strategy return strategy
...@@ -246,7 +266,9 @@ def _update_input_info(inputs): ...@@ -246,7 +266,9 @@ def _update_input_info(inputs):
class StaticGraphAdapter(object): class StaticGraphAdapter(object):
""" """
Model traning/inference with a static graph. Model traning/inference with a static graph.
""" """
def __init__(self, model): def __init__(self, model):
...@@ -269,7 +291,7 @@ class StaticGraphAdapter(object): ...@@ -269,7 +291,7 @@ class StaticGraphAdapter(object):
'eval_total': 0, 'eval_total': 0,
'test_total': 0, 'test_total': 0,
'eval_batch': 0, 'eval_batch': 0,
'test_batch': 0 'test_batch': 0,
} }
self._nranks = ParallelEnv().nranks self._nranks = ParallelEnv().nranks
...@@ -289,10 +311,13 @@ class StaticGraphAdapter(object): ...@@ -289,10 +311,13 @@ class StaticGraphAdapter(object):
self.model.mode = value self.model.mode = value
def train_batch(self, inputs, labels=None, update=True): def train_batch(self, inputs, labels=None, update=True):
assert self.model._optimizer, \ assert (
"model not ready, please call `model.prepare()` first" self.model._optimizer
), "model not ready, please call `model.prepare()` first"
self.mode = 'train' self.mode = 'train'
assert update is True, "Does not support `update == False` in static mode by now." assert (
update is True
), "Does not support `update == False` in static mode by now."
return self._run(inputs, labels) return self._run(inputs, labels)
def eval_batch(self, inputs, labels=None): def eval_batch(self, inputs, labels=None):
...@@ -307,7 +332,6 @@ class StaticGraphAdapter(object): ...@@ -307,7 +332,6 @@ class StaticGraphAdapter(object):
return self.model.network.parameters(*args, **kwargs) return self.model.network.parameters(*args, **kwargs)
def save(self, path): def save(self, path):
def _save(state, path): def _save(state, path):
if not state: if not state:
return return
...@@ -331,8 +355,7 @@ class StaticGraphAdapter(object): ...@@ -331,8 +355,7 @@ class StaticGraphAdapter(object):
# XXX `optimizer.state_dict()` only work in dygraph mode # XXX `optimizer.state_dict()` only work in dygraph mode
optim_path = path + ".pdopt" optim_path = path + ".pdopt"
optim = { optim = {
p.name: p p.name: p for p in filter(is_belong_to_optimizer, prog.list_vars())
for p in filter(is_belong_to_optimizer, prog.list_vars())
} }
if not optim: if not optim:
return return
...@@ -348,8 +371,10 @@ class StaticGraphAdapter(object): ...@@ -348,8 +371,10 @@ class StaticGraphAdapter(object):
# restore parameter states # restore parameter states
fluid.core._create_loaded_parameter( fluid.core._create_loaded_parameter(
[param for param, state in param_state_pairs], global_scope(), [param for param, state in param_state_pairs],
executor) global_scope(),
executor,
)
for param, state in param_state_pairs: for param, state in param_state_pairs:
self._set_var(param, state) self._set_var(param, state)
...@@ -377,9 +402,10 @@ class StaticGraphAdapter(object): ...@@ -377,9 +402,10 @@ class StaticGraphAdapter(object):
# static-graph, since the time of global_step to increase is # static-graph, since the time of global_step to increase is
# different. # different.
state_val = ( state_val = (
np.array(converted_state.pop("global_step")) - 1 (np.array(converted_state.pop("global_step")) - 1)
) if "global_step" in converted_state else converted_state.pop( if "global_step" in converted_state
"@LR_DECAY_COUNTER@", None) else converted_state.pop("@LR_DECAY_COUNTER@", None)
)
if state_val is not None: if state_val is not None:
converted_state[var.name] = state_val converted_state[var.name] = state_val
elif var.name.startswith("learning_rate_"): elif var.name.startswith("learning_rate_"):
...@@ -396,36 +422,61 @@ class StaticGraphAdapter(object): ...@@ -396,36 +422,61 @@ class StaticGraphAdapter(object):
opt_cls_name = self.model._optimizer.__class__.__name__ opt_cls_name = self.model._optimizer.__class__.__name__
opt_unq_name = None opt_unq_name = None
for name in self.model._optimizer._accumulators.keys(): for name in self.model._optimizer._accumulators.keys():
accum_name = name if opt_name is None else name[ accum_name = (
len(opt_name) + 1:] name
for param_name, state_var in self.model._optimizer._accumulators[ if opt_name is None
name].items(): else name[len(opt_name) + 1 :]
)
for (
param_name,
state_var,
) in self.model._optimizer._accumulators[name].items():
if opt_unq_name is None: if opt_unq_name is None:
# can not infer out the exact unique(opt_name), # can not infer out the exact unique(opt_name),
# thus try to extract rather than generate # thus try to extract rather than generate
for state_key in sorted(state.keys(), for state_key in sorted(
key=lambda x: len(x), state.keys(),
reverse=True): key=lambda x: len(x),
prefix = param_name + "_" + ( reverse=True,
opt_cls_name ):
if opt_name is None else opt_name) + "_" prefix = (
param_name
+ "_"
+ (
opt_cls_name
if opt_name is None
else opt_name
)
+ "_"
)
if state_key.startswith(prefix): if state_key.startswith(prefix):
prefix_offset = state_key[len( prefix_offset = state_key[
prefix):].find("_") + len(prefix) len(prefix) :
].find("_") + len(prefix)
opt_unq_name = state_key[ opt_unq_name = state_key[
len(param_name + "_"):prefix_offset] len(
param_name + "_"
) : prefix_offset
]
# TODO: assert # TODO: assert
# assert opt_unq_name is None # assert opt_unq_name is None
# gen(param.name + "_" + gen(opt_name) + "_" + accum_name) # gen(param.name + "_" + gen(opt_name) + "_" + accum_name)
# always end with "_0" since the unique optimizer._name # always end with "_0" since the unique optimizer._name
dy_state_name = (param_name + "_" + opt_unq_name + dy_state_name = (
"_" + accum_name + "_0") param_name
+ "_"
+ opt_unq_name
+ "_"
+ accum_name
+ "_0"
)
converted_state[ converted_state[
state_var.name] = converted_state.pop( state_var.name
dy_state_name) ] = converted_state.pop(dy_state_name)
assert var.name in converted_state, \ assert (
"variable [{}] is not in optimizer state file".format(var.name) var.name in converted_state
), "variable [{}] is not in optimizer state file".format(var.name)
self._set_var(var, converted_state[var.name]) self._set_var(var, converted_state[var.name])
def _set_var(self, var, ndarray): def _set_var(self, var, ndarray):
...@@ -444,15 +495,17 @@ class StaticGraphAdapter(object): ...@@ -444,15 +495,17 @@ class StaticGraphAdapter(object):
def _run(self, inputs, labels=None): def _run(self, inputs, labels=None):
compiled_prog = self._compiled_progs.get(self.mode, None) compiled_prog = self._compiled_progs.get(self.mode, None)
assert compiled_prog, \ assert (
"Model is not ready, please call `model.prepare()` first" compiled_prog
), "Model is not ready, please call `model.prepare()` first"
inputs = to_list(inputs) inputs = to_list(inputs)
if labels is not None: if labels is not None:
labels = to_list(labels) labels = to_list(labels)
assert len(inputs) == len(self._input_vars[self.mode]), \ assert len(inputs) == len(self._input_vars[self.mode]), (
"number of inputs" \ "number of inputs"
+ " does not match number of arguments of `forward` method" + " does not match number of arguments of `forward` method"
)
feed = {} feed = {}
input_names = [v.name for v in self._input_vars[self.mode]] input_names = [v.name for v in self._input_vars[self.mode]]
...@@ -462,8 +515,10 @@ class StaticGraphAdapter(object): ...@@ -462,8 +515,10 @@ class StaticGraphAdapter(object):
# train and test may take different arguments # train and test may take different arguments
if inputs[idx] is not None: if inputs[idx] is not None:
feed[n] = inputs[idx] feed[n] = inputs[idx]
if self._amp_level == 'O2' and input_dtypes[ if (
idx] == core.VarDesc.VarType.FP16: self._amp_level == 'O2'
and input_dtypes[idx] == core.VarDesc.VarType.FP16
):
if isinstance(feed[n], core.LoDTensor): if isinstance(feed[n], core.LoDTensor):
feed[n] = feed[n]._as_type(core.VarDesc.VarType.FP16) feed[n] = feed[n]._as_type(core.VarDesc.VarType.FP16)
elif isinstance(feed[n], np.array): elif isinstance(feed[n], np.array):
...@@ -491,10 +546,12 @@ class StaticGraphAdapter(object): ...@@ -491,10 +546,12 @@ class StaticGraphAdapter(object):
else: else:
pruned_fetch_list.append(fetch_var) pruned_fetch_list.append(fetch_var)
rets = self._executor.run(compiled_prog, rets = self._executor.run(
feed=feed, compiled_prog,
fetch_list=pruned_fetch_list, feed=feed,
return_numpy=False) fetch_list=pruned_fetch_list,
return_numpy=False,
)
# restore pruned fetch_list Variable from feeds # restore pruned fetch_list Variable from feeds
for i, name in enumerate(pruned_fetch_idx_name_map): for i, name in enumerate(pruned_fetch_idx_name_map):
...@@ -510,20 +567,24 @@ class StaticGraphAdapter(object): ...@@ -510,20 +567,24 @@ class StaticGraphAdapter(object):
metrics = [] metrics = []
for metric, state in zip(self.model._metrics, metric_states): for metric, state in zip(self.model._metrics, metric_states):
# cut off padding size # cut off padding size
if self.mode != 'train' and self.model._test_dataloader is not None \ if (
and isinstance(self.model._test_dataloader, DataLoader) \ self.mode != 'train'
and self._nranks > 1: and self.model._test_dataloader is not None
and isinstance(self.model._test_dataloader, DataLoader)
and self._nranks > 1
):
total_size = len(self.model._test_dataloader.dataset) total_size = len(self.model._test_dataloader.dataset)
# TODO: fixme if have better way to get batch size # TODO: fixme if have better way to get batch size
samples = state[0].shape[0] samples = state[0].shape[0]
current_count = self._merge_count.get(self.mode + '_total', 0) current_count = self._merge_count.get(self.mode + '_total', 0)
if current_count + samples >= total_size: if current_count + samples >= total_size:
state = [ state = [
s[:int(total_size - current_count), ...] for s in state s[: int(total_size - current_count), ...] for s in state
] ]
self._merge_count[self.mode + '_total'] = 0 self._merge_count[self.mode + '_total'] = 0
self._merge_count[self.mode + '_batch'] = int(total_size - self._merge_count[self.mode + '_batch'] = int(
current_count) total_size - current_count
)
else: else:
self._merge_count[self.mode + '_total'] += samples self._merge_count[self.mode + '_total'] += samples
self._merge_count[self.mode + '_batch'] = samples self._merge_count[self.mode + '_batch'] = samples
...@@ -555,8 +616,11 @@ class StaticGraphAdapter(object): ...@@ -555,8 +616,11 @@ class StaticGraphAdapter(object):
if mode != 'train': if mode != 'train':
for op in list(prog.global_block().ops): for op in list(prog.global_block().ops):
prog.global_block()._remove_op(0) prog.global_block()._remove_op(0)
if mode == 'train' and self.model._optimizer \ if (
and self.model._optimizer._learning_rate_map: mode == 'train'
and self.model._optimizer
and self.model._optimizer._learning_rate_map
):
# HACK workaround learning rate map issue # HACK workaround learning rate map issue
lr_var = self.model._optimizer._learning_rate_map[self._orig_prog] lr_var = self.model._optimizer._learning_rate_map[self._orig_prog]
new_lr_var = prog.global_block().vars[lr_var.name] new_lr_var = prog.global_block().vars[lr_var.name]
...@@ -594,20 +658,27 @@ class StaticGraphAdapter(object): ...@@ -594,20 +658,27 @@ class StaticGraphAdapter(object):
dist_strategy.amp = True dist_strategy.amp = True
dist_strategy.amp_configs = self._amp_configs.copy() dist_strategy.amp_configs = self._amp_configs.copy()
dist_strategy.amp_configs.update(self._amp_custom_lists) dist_strategy.amp_configs.update(self._amp_custom_lists)
dist_strategy.amp_configs[ dist_strategy.amp_configs['use_pure_fp16'] = (
'use_pure_fp16'] = self._amp_level == 'O2' self._amp_level == 'O2'
)
self.model._optimizer = fleet.distributed_optimizer( self.model._optimizer = fleet.distributed_optimizer(
self.model._optimizer, strategy=dist_strategy) self.model._optimizer, strategy=dist_strategy
)
elif self._amp_level != "O0" and core.is_compiled_with_cuda: elif self._amp_level != "O0" and core.is_compiled_with_cuda:
amp_lists = paddle.static.amp.AutoMixedPrecisionLists( amp_lists = (
**self._amp_custom_lists paddle.static.amp.AutoMixedPrecisionLists(
) if self._amp_custom_lists else None **self._amp_custom_lists
)
if self._amp_custom_lists
else None
)
self.model._optimizer = paddle.static.amp.decorate( self.model._optimizer = paddle.static.amp.decorate(
self.model._optimizer, self.model._optimizer,
amp_lists=amp_lists, amp_lists=amp_lists,
use_pure_fp16=self._amp_level == "O2", use_pure_fp16=self._amp_level == "O2",
use_fp16_guard=self._use_fp16_guard, use_fp16_guard=self._use_fp16_guard,
**self._amp_configs) **self._amp_configs
)
self.model._optimizer.minimize(self._loss_endpoint) self.model._optimizer.minimize(self._loss_endpoint)
...@@ -620,7 +691,7 @@ class StaticGraphAdapter(object): ...@@ -620,7 +691,7 @@ class StaticGraphAdapter(object):
self._endpoints[mode] = { self._endpoints[mode] = {
"output": outputs, "output": outputs,
"loss": to_list(losses), "loss": to_list(losses),
"metric": metrics "metric": metrics,
} }
def _compile_and_initialize(self, prog, mode): def _compile_and_initialize(self, prog, mode):
...@@ -628,8 +699,9 @@ class StaticGraphAdapter(object): ...@@ -628,8 +699,9 @@ class StaticGraphAdapter(object):
if compiled_prog is not None: if compiled_prog is not None:
return compiled_prog return compiled_prog
assert self.model._place is not None, \ assert (
"device is not set, please call `model.prepare()` first" self.model._place is not None
), "device is not set, please call `model.prepare()` first"
place = self.model._place place = self.model._place
...@@ -642,8 +714,11 @@ class StaticGraphAdapter(object): ...@@ -642,8 +714,11 @@ class StaticGraphAdapter(object):
uninitialized = [] uninitialized = []
for var_py in self._startup_prog.list_vars(): for var_py in self._startup_prog.list_vars():
var = fluid.global_scope().find_var(var_py.name) var = fluid.global_scope().find_var(var_py.name)
if not var_py.name.startswith('nccl_id') and var and \ if (
var.get_tensor()._is_initialized(): not var_py.name.startswith('nccl_id')
and var
and var.get_tensor()._is_initialized()
):
continue continue
uninitialized.append(var_py) uninitialized.append(var_py)
...@@ -651,7 +726,10 @@ class StaticGraphAdapter(object): ...@@ -651,7 +726,10 @@ class StaticGraphAdapter(object):
startup_prog = self._startup_prog._prune(uninitialized) startup_prog = self._startup_prog._prune(uninitialized)
self._executor.run(startup_prog) self._executor.run(startup_prog)
if self._amp_level == "O2" and mode == 'train' and core.is_compiled_with_cuda( if (
self._amp_level == "O2"
and mode == 'train'
and core.is_compiled_with_cuda()
): ):
self.model._optimizer.amp_init(place) self.model._optimizer.amp_init(place)
...@@ -664,7 +742,6 @@ class StaticGraphAdapter(object): ...@@ -664,7 +742,6 @@ class StaticGraphAdapter(object):
class DynamicGraphAdapter(object): class DynamicGraphAdapter(object):
def __init__(self, model): def __init__(self, model):
super(DynamicGraphAdapter, self).__init__() super(DynamicGraphAdapter, self).__init__()
self.model = model self.model = model
...@@ -674,7 +751,7 @@ class DynamicGraphAdapter(object): ...@@ -674,7 +751,7 @@ class DynamicGraphAdapter(object):
'eval_total': 0, 'eval_total': 0,
'test_total': 0, 'test_total': 0,
'eval_batch': 0, 'eval_batch': 0,
'test_batch': 0 'test_batch': 0,
} }
self._input_info = None self._input_info = None
...@@ -691,7 +768,8 @@ class DynamicGraphAdapter(object): ...@@ -691,7 +768,8 @@ class DynamicGraphAdapter(object):
stradegy.trainer_endpoints = ParallelEnv().trainer_endpoints stradegy.trainer_endpoints = ParallelEnv().trainer_endpoints
stradegy.current_endpoint = ParallelEnv().current_endpoint stradegy.current_endpoint = ParallelEnv().current_endpoint
self.ddp_model = fluid.dygraph.parallel.DataParallel( self.ddp_model = fluid.dygraph.parallel.DataParallel(
self.model.network, stradegy) self.model.network, stradegy
)
@property @property
def mode(self): def mode(self):
...@@ -703,8 +781,9 @@ class DynamicGraphAdapter(object): ...@@ -703,8 +781,9 @@ class DynamicGraphAdapter(object):
# TODO multi device in dygraph mode not implemented at present time # TODO multi device in dygraph mode not implemented at present time
def train_batch(self, inputs, labels=None, update=True): def train_batch(self, inputs, labels=None, update=True):
assert self.model._optimizer, \ assert (
"model not ready, please call `model.prepare()` first" self.model._optimizer
), "model not ready, please call `model.prepare()` first"
self.model.network.train() self.model.network.train()
self.mode = 'train' self.mode = 'train'
inputs = to_list(inputs) inputs = to_list(inputs)
...@@ -716,9 +795,11 @@ class DynamicGraphAdapter(object): ...@@ -716,9 +795,11 @@ class DynamicGraphAdapter(object):
if self._amp_level != "O0" and self.model._scaler is None: if self._amp_level != "O0" and self.model._scaler is None:
self.model._scaler = paddle.amp.GradScaler(**self._amp_configs) self.model._scaler = paddle.amp.GradScaler(**self._amp_configs)
with paddle.amp.auto_cast(enable=self._amp_level != 'O0', with paddle.amp.auto_cast(
**self._amp_custom_lists, enable=self._amp_level != 'O0',
level=self._amp_level): **self._amp_custom_lists,
level=self._amp_level
):
if self._nranks > 1: if self._nranks > 1:
outputs = self.ddp_model(*[to_variable(x) for x in inputs]) outputs = self.ddp_model(*[to_variable(x) for x in inputs])
else: else:
...@@ -746,8 +827,11 @@ class DynamicGraphAdapter(object): ...@@ -746,8 +827,11 @@ class DynamicGraphAdapter(object):
m = metric.update(*[to_numpy(m) for m in to_list(metric_outs)]) m = metric.update(*[to_numpy(m) for m in to_list(metric_outs)])
metrics.append(m) metrics.append(m)
return ([to_numpy(l) for l in losses], metrics) \ return (
if len(metrics) > 0 else [to_numpy(l) for l in losses] ([to_numpy(l) for l in losses], metrics)
if len(metrics) > 0
else [to_numpy(l) for l in losses]
)
def eval_batch(self, inputs, labels=None): def eval_batch(self, inputs, labels=None):
self.model.network.eval() self.model.network.eval()
...@@ -777,21 +861,25 @@ class DynamicGraphAdapter(object): ...@@ -777,21 +861,25 @@ class DynamicGraphAdapter(object):
metrics = [] metrics = []
for metric in self.model._metrics: for metric in self.model._metrics:
# cut off padding value. # cut off padding value.
if self.model._test_dataloader is not None and self._nranks > 1 \ if (
and isinstance(self.model._test_dataloader, DataLoader): self.model._test_dataloader is not None
and self._nranks > 1
and isinstance(self.model._test_dataloader, DataLoader)
):
total_size = len(self.model._test_dataloader.dataset) total_size = len(self.model._test_dataloader.dataset)
samples = outputs[0].shape[0] samples = outputs[0].shape[0]
current_count = self._merge_count.get(self.mode + '_total', 0) current_count = self._merge_count.get(self.mode + '_total', 0)
if current_count + samples >= total_size: if current_count + samples >= total_size:
outputs = [ outputs = [
o[:int(total_size - current_count)] for o in outputs o[: int(total_size - current_count)] for o in outputs
] ]
labels = [ labels = [
l[:int(total_size - current_count)] for l in labels l[: int(total_size - current_count)] for l in labels
] ]
self._merge_count[self.mode + '_total'] = 0 self._merge_count[self.mode + '_total'] = 0
self._merge_count[self.mode + '_batch'] = int(total_size - self._merge_count[self.mode + '_batch'] = int(
current_count) total_size - current_count
)
else: else:
self._merge_count[self.mode + '_total'] += samples self._merge_count[self.mode + '_total'] += samples
self._merge_count[self.mode + '_batch'] = samples self._merge_count[self.mode + '_batch'] = samples
...@@ -858,38 +946,48 @@ class DynamicGraphAdapter(object): ...@@ -858,38 +946,48 @@ class DynamicGraphAdapter(object):
opt_unq_name = '' opt_unq_name = ''
opt_cls_name = self.model._optimizer.__class__.__name__ opt_cls_name = self.model._optimizer.__class__.__name__
opt_name = opt_unq_name[:opt_unq_name.rfind("_")] # remove suffix idx opt_name = opt_unq_name[: opt_unq_name.rfind("_")] # remove suffix idx
param_names = [param.name for param in self.model.network.parameters()] param_names = [param.name for param in self.model.network.parameters()]
for var_name, state_var in sorted(optim_state.items(), for var_name, state_var in sorted(
key=lambda x: len(x[0]), optim_state.items(), key=lambda x: len(x[0]), reverse=True
reverse=True): ):
if var_name in ["@LR_DECAY_COUNTER@", "global_step"]: if var_name in ["@LR_DECAY_COUNTER@", "global_step"]:
# NOTE: dygraph saved global_step is 1 larger than that in # NOTE: dygraph saved global_step is 1 larger than that in
# static-graph, since the time of global_step to increase is # static-graph, since the time of global_step to increase is
# different. # different.
if var_name == "@LR_DECAY_COUNTER@": if var_name == "@LR_DECAY_COUNTER@":
converted_state["global_step"] = np.array( converted_state["global_step"] = (
converted_state.pop("@LR_DECAY_COUNTER@")) + 1 np.array(converted_state.pop("@LR_DECAY_COUNTER@")) + 1
)
else: else:
# moment and other accumulators # moment and other accumulators
# extend state dict to include promising dygraph names # extend state dict to include promising dygraph names
for param_name in param_names: for param_name in param_names:
if var_name.startswith(param_name + "_" + opt_name): if var_name.startswith(param_name + "_" + opt_name):
# when init optimizer with name # when init optimizer with name
accum_name = var_name[len(param_name + "_" + opt_name + accum_name = var_name[
"_"):] len(param_name + "_" + opt_name + "_") :
elif var_name.startswith(param_name + ]
"_") and opt_name == opt_cls_name: elif (
var_name.startswith(param_name + "_")
and opt_name == opt_cls_name
):
# when init optimizer without name # when init optimizer without name
accum_name = var_name[len(param_name + "_"):] accum_name = var_name[len(param_name + "_") :]
else: else:
continue continue
# remove suffix idx # remove suffix idx
accum_name = accum_name[:accum_name.rfind("_")] accum_name = accum_name[: accum_name.rfind("_")]
# state names always end with "_0" in dygraph because of the # state names always end with "_0" in dygraph because of the
# unique optimizer._name # unique optimizer._name
dy_state_name = (param_name + "_" + opt_unq_name + "_" + dy_state_name = (
accum_name + "_0") param_name
+ "_"
+ opt_unq_name
+ "_"
+ accum_name
+ "_0"
)
converted_state[dy_state_name] = state_var converted_state[dy_state_name] = state_var
if not hasattr(self.model._optimizer, 'set_state_dict'): if not hasattr(self.model._optimizer, 'set_state_dict'):
...@@ -901,18 +999,23 @@ class DynamicGraphAdapter(object): ...@@ -901,18 +999,23 @@ class DynamicGraphAdapter(object):
self.model._optimizer.set_state_dict(converted_state) self.model._optimizer.set_state_dict(converted_state)
def prepare(self): def prepare(self):
if self._amp_level == "O2" and self.model.mode == 'train' and core.is_compiled_with_cuda( if (
self._amp_level == "O2"
and self.model.mode == 'train'
and core.is_compiled_with_cuda()
): ):
self.model.network, self.model._optimizer = paddle.amp.decorate( self.model.network, self.model._optimizer = paddle.amp.decorate(
models=self.model.network, models=self.model.network,
optimizers=self.model._optimizer, optimizers=self.model._optimizer,
level='O2') level='O2',
)
if self._amp_level != "O0": if self._amp_level != "O0":
self.model._scaler = None self.model._scaler = None
class Model(object): class Model(object):
""" """
An Model object is network with training and inference features. An Model object is network with training and inference features.
Dynamic graph and static graph are supported at the same time, Dynamic graph and static graph are supported at the same time,
switched by `paddle.enable_static()`. The usage is as follows. switched by `paddle.enable_static()`. The usage is as follows.
...@@ -920,7 +1023,7 @@ class Model(object): ...@@ -920,7 +1023,7 @@ class Model(object):
instantiating a Model. The input description, i.e, paddle.static.InputSpec, instantiating a Model. The input description, i.e, paddle.static.InputSpec,
must be required for static graph. must be required for static graph.
When training on GPU, auto mixed precision (AMP O1) and pure float16 When training on GPU, auto mixed precision (AMP O1) and pure float16
(AMP O2) training are both supported in static mode and dynamic mode. (AMP O2) training are both supported in static mode and dynamic mode.
In static graph mode, before training with pure float16 (AMP O2), In static graph mode, before training with pure float16 (AMP O2),
`multi_precision` could be set to True when creating optimizer, which can `multi_precision` could be set to True when creating optimizer, which can
...@@ -965,7 +1068,7 @@ class Model(object): ...@@ -965,7 +1068,7 @@ class Model(object):
# inputs and labels are not required for dynamic graph. # inputs and labels are not required for dynamic graph.
input = InputSpec([None, 784], 'float32', 'x') input = InputSpec([None, 784], 'float32', 'x')
label = InputSpec([None, 1], 'int64', 'label') label = InputSpec([None, 1], 'int64', 'label')
model = paddle.Model(net, input, label) model = paddle.Model(net, input, label)
optim = paddle.optimizer.SGD(learning_rate=1e-3, optim = paddle.optimizer.SGD(learning_rate=1e-3,
parameters=model.parameters()) parameters=model.parameters())
...@@ -1053,16 +1156,17 @@ class Model(object): ...@@ -1053,16 +1156,17 @@ class Model(object):
def train_batch(self, inputs, labels=None, update=True): def train_batch(self, inputs, labels=None, update=True):
""" """
Run one training step on one batch of data. And using `update` indicates Run one training step on one batch of data. And using `update` indicates
whether optimizer update gradients computing by this batch. whether optimizer update gradients computing by this batch.
Args: Args:
inputs (numpy.ndarray|Tensor|list): Batch of input data. It could inputs (numpy.ndarray|Tensor|list): Batch of input data. It could
be a numpy array or paddle.Tensor, or a list of arrays or be a numpy array or paddle.Tensor, or a list of arrays or
tensors (in case the model has multiple inputs). tensors (in case the model has multiple inputs).
labels (numpy.ndarray|Tensor|list, optional): Batch of labels. It could be labels (numpy.ndarray|Tensor|list, optional): Batch of labels. It could be
a numpy array or paddle.Tensor, or a list of arrays or tensors a numpy array or paddle.Tensor, or a list of arrays or tensors
(in case the model has multiple labels). If has no labels, (in case the model has multiple labels). If has no labels,
set None. Default: None. set None. Default: None.
update (bool, optional): Whether update parameters after loss.backward() computing. update (bool, optional): Whether update parameters after loss.backward() computing.
Set it to False to accumulate gradients. Default: True. Set it to False to accumulate gradients. Default: True.
...@@ -1075,7 +1179,7 @@ class Model(object): ...@@ -1075,7 +1179,7 @@ class Model(object):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
import paddle.nn as nn import paddle.nn as nn
from paddle.static import InputSpec from paddle.static import InputSpec
...@@ -1098,6 +1202,7 @@ class Model(object): ...@@ -1098,6 +1202,7 @@ class Model(object):
loss = model.train_batch([data], [label]) loss = model.train_batch([data], [label])
print(loss) print(loss)
# [array([2.192784], dtype=float32)] # [array([2.192784], dtype=float32)]
""" """
loss = self._adapter.train_batch(inputs, labels, update) loss = self._adapter.train_batch(inputs, labels, update)
if fluid._non_static_mode() and self._input_info is None: if fluid._non_static_mode() and self._input_info is None:
...@@ -1107,15 +1212,16 @@ class Model(object): ...@@ -1107,15 +1212,16 @@ class Model(object):
@no_grad() @no_grad()
def eval_batch(self, inputs, labels=None): def eval_batch(self, inputs, labels=None):
""" """
Run one evaluating step on a batch of data. Run one evaluating step on a batch of data.
Args: Args:
inputs (numpy.ndarray|Tensor|list): Batch of input data. It could inputs (numpy.ndarray|Tensor|list): Batch of input data. It could
be a numpy array or paddle.Tensor, or a list of arrays or be a numpy array or paddle.Tensor, or a list of arrays or
tensors (in case the model has multiple inputs). tensors (in case the model has multiple inputs).
labels (numpy.ndarray|Tensor|list, optional): Batch of labels. It could be labels (numpy.ndarray|Tensor|list, optional): Batch of labels. It could be
a numpy array or paddle.Tensor, or a list of arrays or tensors a numpy array or paddle.Tensor, or a list of arrays or tensors
(in case the model has multiple labels). If has no labels, (in case the model has multiple labels). If has no labels,
set None. Default: None. set None. Default: None.
Returns: Returns:
...@@ -1150,6 +1256,7 @@ class Model(object): ...@@ -1150,6 +1256,7 @@ class Model(object):
loss, acc = model.eval_batch([data], [label]) loss, acc = model.eval_batch([data], [label])
print(loss, acc) print(loss, acc)
# [array([2.8825705], dtype=float32)] [0.0] # [array([2.8825705], dtype=float32)] [0.0]
""" """
loss = self._adapter.eval_batch(inputs, labels) loss = self._adapter.eval_batch(inputs, labels)
if fluid._non_static_mode() and self._input_info is None: if fluid._non_static_mode() and self._input_info is None:
...@@ -1159,11 +1266,12 @@ class Model(object): ...@@ -1159,11 +1266,12 @@ class Model(object):
@no_grad() @no_grad()
def predict_batch(self, inputs): def predict_batch(self, inputs):
""" """
Run one predicting step on a batch of data. Run one predicting step on a batch of data.
Args: Args:
inputs (numpy.ndarray|Tensor|list): Batch of input data. It could inputs (numpy.ndarray|Tensor|list): Batch of input data. It could
be a numpy array or paddle.Tensor, or a list of arrays or be a numpy array or paddle.Tensor, or a list of arrays or
tensors (in case the model has multiple inputs). tensors (in case the model has multiple inputs).
Returns: Returns:
...@@ -1179,7 +1287,7 @@ class Model(object): ...@@ -1179,7 +1287,7 @@ class Model(object):
from paddle.static import InputSpec from paddle.static import InputSpec
device = paddle.set_device('cpu') # or 'gpu' device = paddle.set_device('cpu') # or 'gpu'
input = InputSpec([None, 784], 'float32', 'x') input = InputSpec([None, 784], 'float32', 'x')
label = InputSpec([None, 1], 'int64', 'label') label = InputSpec([None, 1], 'int64', 'label')
...@@ -1197,6 +1305,7 @@ class Model(object): ...@@ -1197,6 +1305,7 @@ class Model(object):
# [array([[0.08189095, 0.16740078, 0.06889386, 0.05085445, 0.10729759, # [array([[0.08189095, 0.16740078, 0.06889386, 0.05085445, 0.10729759,
# 0.02217775, 0.14518553, 0.1591538 , 0.01808308, 0.17906217]], # 0.02217775, 0.14518553, 0.1591538 , 0.01808308, 0.17906217]],
# dtype=float32)] # dtype=float32)]
""" """
loss = self._adapter.predict_batch(inputs) loss = self._adapter.predict_batch(inputs)
if fluid._non_static_mode() and self._input_info is None: if fluid._non_static_mode() and self._input_info is None:
...@@ -1204,12 +1313,13 @@ class Model(object): ...@@ -1204,12 +1313,13 @@ class Model(object):
return loss return loss
def save(self, path, training=True): def save(self, path, training=True):
""" """
This function saves parameters, optimizer information or model and
This function saves parameters, optimizer information or model and
paramters only for inference to path. It depends on the parameter paramters only for inference to path. It depends on the parameter
`training`. `training`.
If `training` is set to True, the parameters saved contain all If `training` is set to True, the parameters saved contain all
the trainable Variable, will save to a file with suffix ".pdparams". the trainable Variable, will save to a file with suffix ".pdparams".
The optimizer information contains all the variable used by optimizer. The optimizer information contains all the variable used by optimizer.
For Adam optimizer, contains beta1, beta2, momentum etc. All the For Adam optimizer, contains beta1, beta2, momentum etc. All the
...@@ -1268,10 +1378,11 @@ class Model(object): ...@@ -1268,10 +1378,11 @@ class Model(object):
T.Normalize([127.5], [127.5]) T.Normalize([127.5], [127.5])
]) ])
data = paddle.vision.datasets.MNIST(mode='train', transform=transform) data = paddle.vision.datasets.MNIST(mode='train', transform=transform)
model.fit(data, epochs=1, batch_size=32, verbose=0) model.fit(data, epochs=1, batch_size=32, verbose=0)
model.save('checkpoint/test') # save for training model.save('checkpoint/test') # save for training
model.save('inference_model', False) # save for inference model.save('inference_model', False) # save for inference
""" """
if ParallelEnv().local_rank == 0: if ParallelEnv().local_rank == 0:
...@@ -1282,6 +1393,7 @@ class Model(object): ...@@ -1282,6 +1393,7 @@ class Model(object):
def load(self, path, skip_mismatch=False, reset_optimizer=False): def load(self, path, skip_mismatch=False, reset_optimizer=False):
""" """
Load from files storing the model states and optimizer states. The file Load from files storing the model states and optimizer states. The file
for optimizer states is not necessary if no need to restore the optimizer. for optimizer states is not necessary if no need to restore the optimizer.
...@@ -1329,6 +1441,7 @@ class Model(object): ...@@ -1329,6 +1441,7 @@ class Model(object):
model.save('checkpoint/test') model.save('checkpoint/test')
model.load('checkpoint/test') model.load('checkpoint/test')
""" """
def _load_state_from_path(path): def _load_state_from_path(path):
...@@ -1341,17 +1454,24 @@ class Model(object): ...@@ -1341,17 +1454,24 @@ class Model(object):
state = param_state.get(key, None) state = param_state.get(key, None)
if state is None: if state is None:
raise ValueError( raise ValueError(
"{} is not found in the providing file.".format(key)) "{} is not found in the providing file.".format(key)
)
if list(state.shape) != list(param.shape): if list(state.shape) != list(param.shape):
raise ValueError( raise ValueError(
"{} receives a shape {}, but the expected shape is {}.". "{} receives a shape {}, but the expected shape is {}.".format(
format(key, list(state.shape), list(param.shape))) key, list(state.shape), list(param.shape)
)
)
return param, state return param, state
def _strip_postfix(path): def _strip_postfix(path):
path, ext = os.path.splitext(path) path, ext = os.path.splitext(path)
assert ext in ['', '.pdparams', '.pdopt', '.pdmodel'], \ assert ext in [
"Unknown postfix {} from weights".format(ext) '',
'.pdparams',
'.pdopt',
'.pdmodel',
], "Unknown postfix {} from weights".format(ext)
return path return path
path = _strip_postfix(path) path = _strip_postfix(path)
...@@ -1365,15 +1485,17 @@ class Model(object): ...@@ -1365,15 +1485,17 @@ class Model(object):
except ValueError as err: except ValueError as err:
if skip_mismatch: if skip_mismatch:
warnings.warn( warnings.warn(
("Skip loading for {}. ".format(key) + str(err))) ("Skip loading for {}. ".format(key) + str(err))
)
# reset optimizer when mismatch happens # reset optimizer when mismatch happens
reset_optimizer = True reset_optimizer = True
else: else:
raise err raise err
matched_param_state.append(match_res) matched_param_state.append(match_res)
optim_state = None if reset_optimizer else _load_state_from_path( optim_state = (
path + ".pdopt") None if reset_optimizer else _load_state_from_path(path + ".pdopt")
)
# TODO: support save/load scaler state in static graph # TODO: support save/load scaler state in static graph
if _non_static_mode(): if _non_static_mode():
...@@ -1382,13 +1504,15 @@ class Model(object): ...@@ -1382,13 +1504,15 @@ class Model(object):
if os.path.exists(path + '.pdscaler'): if os.path.exists(path + '.pdscaler'):
scaler_state = paddle.load(path + '.pdscaler') scaler_state = paddle.load(path + '.pdscaler')
return self._adapter.load(matched_param_state, optim_state, return self._adapter.load(
scaler_state) matched_param_state, optim_state, scaler_state
)
else: else:
return self._adapter.load(matched_param_state, optim_state) return self._adapter.load(matched_param_state, optim_state)
def parameters(self, *args, **kwargs): def parameters(self, *args, **kwargs):
""" """
Returns a list of parameters of the model. Returns a list of parameters of the model.
Returns: Returns:
...@@ -1398,30 +1522,32 @@ class Model(object): ...@@ -1398,30 +1522,32 @@ class Model(object):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
import paddle.nn as nn import paddle.nn as nn
from paddle.static import InputSpec from paddle.static import InputSpec
input = InputSpec([None, 784], 'float32', 'x') input = InputSpec([None, 784], 'float32', 'x')
model = paddle.Model(nn.Sequential( model = paddle.Model(nn.Sequential(
nn.Linear(784, 200), nn.Linear(784, 200),
nn.Tanh(), nn.Tanh(),
nn.Linear(200, 10)), input) nn.Linear(200, 10)), input)
params = model.parameters() params = model.parameters()
""" """
return self._adapter.parameters() return self._adapter.parameters()
def _prepare_amp(self, amp_configs): def _prepare_amp(self, amp_configs):
def _check_pure_fp16_configs(): def _check_pure_fp16_configs():
# pure float16 training has some restricts now # pure float16 training has some restricts now
if self._adapter._amp_level == "O2" and self._optimizer._grad_clip: if self._adapter._amp_level == "O2" and self._optimizer._grad_clip:
# clip by value is not supported # clip by value is not supported
assert isinstance(self._optimizer._grad_clip, (paddle.nn.ClipGradByGlobalNorm, paddle.nn.ClipGradByNorm)), \ assert isinstance(
"Only GradientClipByNorm and GradientClipByGlobalNorm are supported in amp training with level=O2 currently." self._optimizer._grad_clip,
(paddle.nn.ClipGradByGlobalNorm, paddle.nn.ClipGradByNorm),
), "Only GradientClipByNorm and GradientClipByGlobalNorm are supported in amp training with level=O2 currently."
self._adapter._amp_custom_lists = {} self._adapter._amp_custom_lists = {}
self._adapter._amp_configs = {} self._adapter._amp_configs = {}
...@@ -1433,7 +1559,8 @@ class Model(object): ...@@ -1433,7 +1559,8 @@ class Model(object):
elif isinstance(amp_configs, str): elif isinstance(amp_configs, str):
if amp_configs not in ('O0', 'O1', 'O2'): if amp_configs not in ('O0', 'O1', 'O2'):
raise ValueError( raise ValueError(
"The level of amp_configs should be 'O0', 'O1' or 'O2'.") "The level of amp_configs should be 'O0', 'O1' or 'O2'."
)
self._adapter._amp_level = amp_configs self._adapter._amp_level = amp_configs
_check_pure_fp16_configs() _check_pure_fp16_configs()
return return
...@@ -1442,7 +1569,8 @@ class Model(object): ...@@ -1442,7 +1569,8 @@ class Model(object):
self._adapter._amp_level = 'O1' self._adapter._amp_level = 'O1'
elif amp_configs['level'] not in ('O0', 'O1', 'O2'): elif amp_configs['level'] not in ('O0', 'O1', 'O2'):
raise ValueError( raise ValueError(
"amp_configs['level'] should be 'O0', 'O1' or 'O2'.") "amp_configs['level'] should be 'O0', 'O1' or 'O2'."
)
else: else:
self._adapter._amp_level = amp_configs['level'] self._adapter._amp_level = amp_configs['level']
amp_config_key_set = set(amp_configs.keys()) - {'level'} amp_config_key_set = set(amp_configs.keys()) - {'level'}
...@@ -1459,12 +1587,14 @@ class Model(object): ...@@ -1459,12 +1587,14 @@ class Model(object):
# construct amp_custom_lists # construct amp_custom_lists
if self._adapter._amp_level != 'O0' and amp_config_key_set: if self._adapter._amp_level != 'O0' and amp_config_key_set:
for param_name in [ for param_name in [
'custom_white_list', 'custom_black_list', 'custom_white_list',
'custom_black_varnames' 'custom_black_list',
'custom_black_varnames',
]: ]:
if param_name in amp_config_key_set: if param_name in amp_config_key_set:
self._adapter._amp_custom_lists[param_name] = amp_configs[ self._adapter._amp_custom_lists[param_name] = amp_configs[
param_name] param_name
]
amp_config_key_set -= {param_name} amp_config_key_set -= {param_name}
def _check_amp_configs(amp_config_key_set): def _check_amp_configs(amp_config_key_set):
...@@ -1479,13 +1609,16 @@ class Model(object): ...@@ -1479,13 +1609,16 @@ class Model(object):
} }
if amp_config_key_set - accepted_param_set: if amp_config_key_set - accepted_param_set:
raise ValueError( raise ValueError(
"Except for 'level', the keys of 'amp_configs' must be accepted by mixed precision APIs, but {} could not be recognized." "Except for 'level', the keys of 'amp_configs' must be accepted by mixed precision APIs, but {} could not be recognized.".format(
.format(tuple(amp_config_key_set - accepted_param_set))) tuple(amp_config_key_set - accepted_param_set)
)
)
if 'use_fp16_guard' in amp_config_key_set: if 'use_fp16_guard' in amp_config_key_set:
if _non_static_mode(): if _non_static_mode():
raise ValueError( raise ValueError(
"'use_fp16_guard' is supported in static mode only.") "'use_fp16_guard' is supported in static mode only."
)
self._adapter._use_fp16_guard = amp_configs['use_fp16_guard'] self._adapter._use_fp16_guard = amp_configs['use_fp16_guard']
amp_config_key_set.remove('use_fp16_guard') amp_config_key_set.remove('use_fp16_guard')
...@@ -1495,12 +1628,11 @@ class Model(object): ...@@ -1495,12 +1628,11 @@ class Model(object):
for key in amp_configs_set: for key in amp_configs_set:
self._adapter._amp_configs[key] = amp_configs[key] self._adapter._amp_configs[key] = amp_configs[key]
def prepare(self, def prepare(
optimizer=None, self, optimizer=None, loss=None, metrics=None, amp_configs=None
loss=None, ):
metrics=None,
amp_configs=None):
""" """
Configures the model before runing. Configures the model before runing.
Args: Args:
...@@ -1532,6 +1664,7 @@ class Model(object): ...@@ -1532,6 +1664,7 @@ class Model(object):
Returns: Returns:
None None
""" """
self._place = _get_device() self._place = _get_device()
if isinstance(self._place, fluid.CUDAPlace): if isinstance(self._place, fluid.CUDAPlace):
...@@ -1539,15 +1672,17 @@ class Model(object): ...@@ -1539,15 +1672,17 @@ class Model(object):
if ParallelEnv().nranks > 1 and not _parallel_context_initialized: if ParallelEnv().nranks > 1 and not _parallel_context_initialized:
if fluid._non_static_mode(): if fluid._non_static_mode():
main_prog_seed = fluid.default_main_program().random_seed main_prog_seed = fluid.default_main_program().random_seed
startup_prog_seed = fluid.default_startup_program( startup_prog_seed = (
).random_seed fluid.default_startup_program().random_seed
)
fluid.disable_dygraph() fluid.disable_dygraph()
paddle.disable_static(self._place) paddle.disable_static(self._place)
# enable_dygraph would create and switch to a new program, # enable_dygraph would create and switch to a new program,
# thus also copy seed to the new program # thus also copy seed to the new program
fluid.default_main_program().random_seed = main_prog_seed fluid.default_main_program().random_seed = main_prog_seed
fluid.default_startup_program( fluid.default_startup_program().random_seed = (
).random_seed = startup_prog_seed startup_prog_seed
)
else: else:
prepare_distributed_context(self._place) prepare_distributed_context(self._place)
_parallel_context_initialized = True _parallel_context_initialized = True
...@@ -1562,43 +1697,46 @@ class Model(object): ...@@ -1562,43 +1697,46 @@ class Model(object):
metrics = metrics or [] metrics = metrics or []
for metric in to_list(metrics): for metric in to_list(metrics):
assert isinstance(metric, Metric), \ assert isinstance(
"{} is not sub class of Metric".format( metric, Metric
metric.__class__.__name__) ), "{} is not sub class of Metric".format(metric.__class__.__name__)
self._metrics = to_list(metrics) self._metrics = to_list(metrics)
self._prepare_amp(amp_configs) self._prepare_amp(amp_configs)
self._adapter.prepare() self._adapter.prepare()
def fit(self, def fit(
train_data=None, self,
eval_data=None, train_data=None,
batch_size=1, eval_data=None,
epochs=1, batch_size=1,
eval_freq=1, epochs=1,
log_freq=10, eval_freq=1,
save_dir=None, log_freq=10,
save_freq=1, save_dir=None,
verbose=2, save_freq=1,
drop_last=False, verbose=2,
shuffle=True, drop_last=False,
num_workers=0, shuffle=True,
callbacks=None, num_workers=0,
accumulate_grad_batches=1, callbacks=None,
num_iters=None): accumulate_grad_batches=1,
num_iters=None,
):
""" """
Trains the model for a fixed number of epochs. If `eval_data` is set, Trains the model for a fixed number of epochs. If `eval_data` is set,
evaluation will be done at the end of each epoch. evaluation will be done at the end of each epoch.
Args: Args:
train_data (Dataset|DataLoader, optional): An iterable data loader is used for train_data (Dataset|DataLoader, optional): An iterable data loader is used for
train. An instance of paddle paddle.io.Dataset or train. An instance of paddle paddle.io.Dataset or
paddle.io.Dataloader is recomended. Default: None. paddle.io.Dataloader is recomended. Default: None.
eval_data (Dataset|DataLoader, optional): An iterable data loader is used for eval_data (Dataset|DataLoader, optional): An iterable data loader is used for
evaluation at the end of epoch. If None, will not do evaluation. evaluation at the end of epoch. If None, will not do evaluation.
An instance of paddle.io.Dataset or paddle.io.Dataloader An instance of paddle.io.Dataset or paddle.io.Dataloader
is recomended. Default: None. is recomended. Default: None.
batch_size (int, optional): The batch size of train_data and eval_data. When batch_size (int, optional): The batch size of train_data and eval_data. When
train_data and eval_data are both the instance of Dataloader, this train_data and eval_data are both the instance of Dataloader, this
parameter will be ignored. Default: 1. parameter will be ignored. Default: 1.
epochs (int, optional): The number of epochs to train the model. Default: 1. epochs (int, optional): The number of epochs to train the model. Default: 1.
...@@ -1626,7 +1764,7 @@ class Model(object): ...@@ -1626,7 +1764,7 @@ class Model(object):
callbacks (Callback|None, optional): A list of `Callback` instances to apply callbacks (Callback|None, optional): A list of `Callback` instances to apply
during training. If None, :ref:`api_paddle_callbacks_ProgBarLogger` and during training. If None, :ref:`api_paddle_callbacks_ProgBarLogger` and
:ref:`api_paddle_callbacks_ModelCheckpoint` are automatically inserted. Default: None. :ref:`api_paddle_callbacks_ModelCheckpoint` are automatically inserted. Default: None.
accumulate_grad_batches (int, optional): The number of batches to accumulate gradident accumulate_grad_batches (int, optional): The number of batches to accumulate gradident
during training process before optimizer updates. It can mimic large batch during training process before optimizer updates. It can mimic large batch
size. Default: 1. size. Default: 1.
num_iters (int|None, optional): The number of iterations to evaluate the model. num_iters (int|None, optional): The number of iterations to evaluate the model.
...@@ -1641,7 +1779,7 @@ class Model(object): ...@@ -1641,7 +1779,7 @@ class Model(object):
How to make a batch is done internally. How to make a batch is done internally.
.. code-block:: python .. code-block:: python
:name: code-example1 :name: code-example3
import paddle import paddle
import paddle.vision.transforms as T import paddle.vision.transforms as T
...@@ -1681,7 +1819,7 @@ class Model(object): ...@@ -1681,7 +1819,7 @@ class Model(object):
DataLoader. DataLoader.
.. code-block:: python .. code-block:: python
:name: code-example2 :name: code-example4
import paddle import paddle
import paddle.vision.transforms as T import paddle.vision.transforms as T
...@@ -1691,7 +1829,7 @@ class Model(object): ...@@ -1691,7 +1829,7 @@ class Model(object):
dynamic = True dynamic = True
if not dynamic: if not dynamic:
paddle.enable_static() paddle.enable_static()
transform = T.Compose([ transform = T.Compose([
T.Transpose(), T.Transpose(),
T.Normalize([127.5], [127.5]) T.Normalize([127.5], [127.5])
...@@ -1718,31 +1856,38 @@ class Model(object): ...@@ -1718,31 +1856,38 @@ class Model(object):
val_loader, val_loader,
epochs=2, epochs=2,
save_dir='mnist_checkpoint') save_dir='mnist_checkpoint')
""" """
assert train_data is not None, \ assert train_data is not None, "train_data must be given!"
"train_data must be given!"
if isinstance(train_data, Dataset): if isinstance(train_data, Dataset):
train_sampler = DistributedBatchSampler(train_data, train_sampler = DistributedBatchSampler(
batch_size=batch_size, train_data,
shuffle=shuffle, batch_size=batch_size,
drop_last=drop_last) shuffle=shuffle,
train_loader = DataLoader(train_data, drop_last=drop_last,
batch_sampler=train_sampler, )
places=self._place, train_loader = DataLoader(
num_workers=num_workers, train_data,
return_list=True) batch_sampler=train_sampler,
places=self._place,
num_workers=num_workers,
return_list=True,
)
else: else:
train_loader = train_data train_loader = train_data
if eval_data is not None and isinstance(eval_data, Dataset): if eval_data is not None and isinstance(eval_data, Dataset):
eval_sampler = DistributedBatchSampler(eval_data, eval_sampler = DistributedBatchSampler(
batch_size=batch_size) eval_data, batch_size=batch_size
eval_loader = DataLoader(eval_data, )
batch_sampler=eval_sampler, eval_loader = DataLoader(
places=self._place, eval_data,
num_workers=num_workers, batch_sampler=eval_sampler,
return_list=True) places=self._place,
num_workers=num_workers,
return_list=True,
)
elif eval_data is not None: elif eval_data is not None:
eval_loader = eval_data eval_loader = eval_data
else: else:
...@@ -1755,8 +1900,11 @@ class Model(object): ...@@ -1755,8 +1900,11 @@ class Model(object):
steps = self._len_data_loader(train_loader) steps = self._len_data_loader(train_loader)
self.num_iters = num_iters self.num_iters = num_iters
if num_iters is not None and isinstance(num_iters, int) and isinstance( if (
steps, int): num_iters is not None
and isinstance(num_iters, int)
and isinstance(steps, int)
):
assert num_iters > 0, "num_iters must be greater than 0!" assert num_iters > 0, "num_iters must be greater than 0!"
epochs = (num_iters // steps) + 1 epochs = (num_iters // steps) + 1
steps = min(num_iters, steps) steps = min(num_iters, steps)
...@@ -1784,10 +1932,10 @@ class Model(object): ...@@ -1784,10 +1932,10 @@ class Model(object):
if do_eval and epoch % eval_freq == 0: if do_eval and epoch % eval_freq == 0:
eval_steps = self._len_data_loader(eval_loader) eval_steps = self._len_data_loader(eval_loader)
cbks.on_begin('eval', { cbks.on_begin(
'steps': eval_steps, 'eval',
'metrics': self._metrics_name() {'steps': eval_steps, 'metrics': self._metrics_name()},
}) )
eval_logs = self._run_one_epoch(eval_loader, cbks, 'eval') eval_logs = self._run_one_epoch(eval_loader, cbks, 'eval')
...@@ -1798,20 +1946,22 @@ class Model(object): ...@@ -1798,20 +1946,22 @@ class Model(object):
cbks.on_end('train', logs) cbks.on_end('train', logs)
self._test_dataloader = None self._test_dataloader = None
def evaluate(self, def evaluate(
eval_data, self,
batch_size=1, eval_data,
log_freq=10, batch_size=1,
verbose=2, log_freq=10,
num_workers=0, verbose=2,
callbacks=None, num_workers=0,
num_iters=None): callbacks=None,
num_iters=None,
):
""" """
Evaluate the loss and metrics of the model on input dataset. Evaluate the loss and metrics of the model on input dataset.
Args: Args:
eval_data (Dataset|DataLoader): An iterable data loader is used for eval_data (Dataset|DataLoader): An iterable data loader is used for
evaluation. An instance of paddle.io.Dataset or evaluation. An instance of paddle.io.Dataset or
paddle.io.Dataloader is recomended. paddle.io.Dataloader is recomended.
batch_size (int, optional): The batch size of train_data and eval_data. batch_size (int, optional): The batch size of train_data and eval_data.
When eval_data is the instance of Dataloader, this argument will be When eval_data is the instance of Dataloader, this argument will be
...@@ -1859,13 +2009,16 @@ class Model(object): ...@@ -1859,13 +2009,16 @@ class Model(object):
""" """
if eval_data is not None and isinstance(eval_data, Dataset): if eval_data is not None and isinstance(eval_data, Dataset):
eval_sampler = DistributedBatchSampler(eval_data, eval_sampler = DistributedBatchSampler(
batch_size=batch_size) eval_data, batch_size=batch_size
eval_loader = DataLoader(eval_data, )
batch_sampler=eval_sampler, eval_loader = DataLoader(
places=self._place, eval_data,
num_workers=num_workers, batch_sampler=eval_sampler,
return_list=True) places=self._place,
num_workers=num_workers,
return_list=True,
)
else: else:
eval_loader = eval_data eval_loader = eval_data
...@@ -1881,15 +2034,17 @@ class Model(object): ...@@ -1881,15 +2034,17 @@ class Model(object):
eval_steps = self._len_data_loader(eval_loader) eval_steps = self._len_data_loader(eval_loader)
self.num_iters = num_iters self.num_iters = num_iters
if num_iters is not None and isinstance(num_iters, int) and isinstance( if (
eval_steps, int): num_iters is not None
and isinstance(num_iters, int)
and isinstance(eval_steps, int)
):
assert num_iters > 0, "num_iters must be greater than 0!" assert num_iters > 0, "num_iters must be greater than 0!"
eval_steps = min(num_iters, eval_steps) eval_steps = min(num_iters, eval_steps)
self.num_iters = eval_steps self.num_iters = eval_steps
cbks.on_begin('eval', { cbks.on_begin(
'steps': eval_steps, 'eval', {'steps': eval_steps, 'metrics': self._metrics_name()}
'metrics': self._metrics_name() )
})
logs = self._run_one_epoch(eval_loader, cbks, 'eval') logs = self._run_one_epoch(eval_loader, cbks, 'eval')
...@@ -1903,13 +2058,15 @@ class Model(object): ...@@ -1903,13 +2058,15 @@ class Model(object):
return eval_result return eval_result
def predict(self, def predict(
test_data, self,
batch_size=1, test_data,
num_workers=0, batch_size=1,
stack_outputs=False, num_workers=0,
verbose=1, stack_outputs=False,
callbacks=None): verbose=1,
callbacks=None,
):
""" """
Compute the output predictions on testing data. Compute the output predictions on testing data.
...@@ -1919,7 +2076,7 @@ class Model(object): ...@@ -1919,7 +2076,7 @@ class Model(object):
is recomended. is recomended.
batch_size (int, optional): The batch size of test_data. When test_data is the batch_size (int, optional): The batch size of test_data. When test_data is the
instance of Dataloader, this argument will be ignored. Default: 1. instance of Dataloader, this argument will be ignored. Default: 1.
num_workers (int, optional): The number of subprocess to load data, 0 for no subprocess num_workers (int, optional): The number of subprocess to load data, 0 for no subprocess
used and loading data in main process. When test_data is the instance of Dataloader, used and loading data in main process. When test_data is the instance of Dataloader,
this argument will be ignored. Default: 0. this argument will be ignored. Default: 0.
stack_outputs (bool, optional): Whether stack output field like a batch, as for an output stack_outputs (bool, optional): Whether stack output field like a batch, as for an output
...@@ -1980,13 +2137,16 @@ class Model(object): ...@@ -1980,13 +2137,16 @@ class Model(object):
""" """
if test_data is not None and isinstance(test_data, Dataset): if test_data is not None and isinstance(test_data, Dataset):
test_sampler = DistributedBatchSampler(test_data, test_sampler = DistributedBatchSampler(
batch_size=batch_size) test_data, batch_size=batch_size
test_loader = DataLoader(test_data, )
batch_sampler=test_sampler, test_loader = DataLoader(
places=self._place, test_data,
num_workers=num_workers, batch_sampler=test_sampler,
return_list=True) places=self._place,
num_workers=num_workers,
return_list=True,
)
else: else:
test_loader = test_data test_loader = test_data
...@@ -2036,7 +2196,8 @@ class Model(object): ...@@ -2036,7 +2196,8 @@ class Model(object):
if self._is_shape_inferred: if self._is_shape_inferred:
warnings.warn( warnings.warn(
"'inputs' was not specified when Model initialization, so the input shape to be saved will be the shape derived from the user's actual inputs. The input shape to be saved is %s. For saving correct input shapes, please provide 'inputs' for Model initialization." "'inputs' was not specified when Model initialization, so the input shape to be saved will be the shape derived from the user's actual inputs. The input shape to be saved is %s. For saving correct input shapes, please provide 'inputs' for Model initialization."
% self._input_info[0]) % self._input_info[0]
)
paddle.jit.save(layer, path, input_spec=self._inputs) paddle.jit.save(layer, path, input_spec=self._inputs)
...@@ -2047,7 +2208,8 @@ class Model(object): ...@@ -2047,7 +2208,8 @@ class Model(object):
raise ValueError( raise ValueError(
"The input path MUST be format of dirname/file_prefix " "The input path MUST be format of dirname/file_prefix "
"[dirname\\file_prefix in Windows system], but received " "[dirname\\file_prefix in Windows system], but received "
"file_prefix is empty string.") "file_prefix is empty string."
)
dirname = os.path.dirname(path) dirname = os.path.dirname(path)
if dirname and not os.path.exists(dirname): if dirname and not os.path.exists(dirname):
...@@ -2058,21 +2220,24 @@ class Model(object): ...@@ -2058,21 +2220,24 @@ class Model(object):
params_filename = file_prefix + INFER_PARAMS_SUFFIX params_filename = file_prefix + INFER_PARAMS_SUFFIX
prog = self._adapter._progs.get('test', None) prog = self._adapter._progs.get('test', None)
assert prog, \ assert (
"Model is not ready, please call `model.prepare()` first" prog
), "Model is not ready, please call `model.prepare()` first"
infer_prog = prog.clone(for_test=True) infer_prog = prog.clone(for_test=True)
input_names = [v.name for v in self._adapter._input_vars['test']] input_names = [v.name for v in self._adapter._input_vars['test']]
endpoints = self._adapter._endpoints['test']['output'] endpoints = self._adapter._endpoints['test']['output']
fluid.io.save_inference_model(model_path, fluid.io.save_inference_model(
input_names, model_path,
endpoints, input_names,
self._adapter._executor, endpoints,
main_program=infer_prog, self._adapter._executor,
model_filename=model_filename, main_program=infer_prog,
params_filename=params_filename) model_filename=model_filename,
params_filename=params_filename,
)
def _run_one_epoch( def _run_one_epoch(
self, self,
...@@ -2098,16 +2263,21 @@ class Model(object): ...@@ -2098,16 +2263,21 @@ class Model(object):
# LoDTensor.shape is callable, where LoDTensor comes from # LoDTensor.shape is callable, where LoDTensor comes from
# DataLoader in static graph # DataLoader in static graph
batch_size = data[0].shape()[0] if callable( batch_size = (
data[0].shape) else data[0].shape[0] data[0].shape()[0]
if callable(data[0].shape)
else data[0].shape[0]
)
callbacks.on_batch_begin(mode, step, logs) callbacks.on_batch_begin(mode, step, logs)
if mode != 'predict': if mode != 'predict':
_inputs = [data[:len(self._inputs)], data[len(self._inputs):]] _inputs = [data[: len(self._inputs)], data[len(self._inputs) :]]
if mode == 'train': if mode == 'train':
_inputs.append((step + 1) % self._accumulate == 0 _inputs.append(
or step + 1 == len(data_loader)) (step + 1) % self._accumulate == 0
or step + 1 == len(data_loader)
)
outs = getattr(self, mode + '_batch')(*_inputs) outs = getattr(self, mode + '_batch')(*_inputs)
...@@ -2128,15 +2298,17 @@ class Model(object): ...@@ -2128,15 +2298,17 @@ class Model(object):
logs[k] = v logs[k] = v
else: else:
if self._inputs is not None: if self._inputs is not None:
outs = self.predict_batch(data[:len(self._inputs)]) outs = self.predict_batch(data[: len(self._inputs)])
else: else:
outs = self.predict_batch(data) outs = self.predict_batch(data)
outputs.append(outs) outputs.append(outs)
logs['step'] = step logs['step'] = step
if mode == 'train' or self._adapter._merge_count.get( if (
mode + '_batch', 0) <= 0: mode == 'train'
or self._adapter._merge_count.get(mode + '_batch', 0) <= 0
):
logs['batch_size'] = batch_size * ParallelEnv().nranks logs['batch_size'] = batch_size * ParallelEnv().nranks
else: else:
logs['batch_size'] = self._adapter._merge_count[mode + '_batch'] logs['batch_size'] = self._adapter._merge_count[mode + '_batch']
...@@ -2158,10 +2330,10 @@ class Model(object): ...@@ -2158,10 +2330,10 @@ class Model(object):
"""Prints a string summary of the network. """Prints a string summary of the network.
Args: Args:
input_size (tuple|InputSpec|list[tuple|InputSpec], optional): size of input tensor. input_size (tuple|InputSpec|list[tuple|InputSpec], optional): size of input tensor.
if not set, input_size will get from ``self._inputs`` if network only have if not set, input_size will get from ``self._inputs`` if network only have
one input, input_size can be tuple or InputSpec. if model have multiple one input, input_size can be tuple or InputSpec. if model have multiple
input, input_size must be a list which contain every input's shape. input, input_size must be a list which contain every input's shape.
Default: None. Default: None.
dtype (str, optional): if dtype is None, 'float32' will be used, Default: None. dtype (str, optional): if dtype is None, 'float32' will be used, Default: None.
...@@ -2190,8 +2362,9 @@ class Model(object): ...@@ -2190,8 +2362,9 @@ class Model(object):
# {'total_params': 61610, 'trainable_params': 61610} # {'total_params': 61610, 'trainable_params': 61610}
""" """
assert (input_size is not None or self._inputs assert (
is not None), "'input_size' or 'self._input' must be set" input_size is not None or self._inputs is not None
), "'input_size' or 'self._input' must be set"
if input_size is not None: if input_size is not None:
_input_size = input_size _input_size = input_size
else: else:
...@@ -2208,7 +2381,10 @@ class Model(object): ...@@ -2208,7 +2381,10 @@ class Model(object):
if is_input: if is_input:
arg_names = extract_args(self.network.forward)[1:] arg_names = extract_args(self.network.forward)[1:]
# While Saving inference model in dygraph, and providing inputs only in running. # While Saving inference model in dygraph, and providing inputs only in running.
if shapes is not None and dtypes is not None and fluid._non_static_mode( if (
shapes is not None
and dtypes is not None
and fluid._non_static_mode()
): ):
out_specs = [ out_specs = [
Input(name=n, dtype=dtypes[i], shape=shapes[i]) Input(name=n, dtype=dtypes[i], shape=shapes[i])
...@@ -2221,7 +2397,8 @@ class Model(object): ...@@ -2221,7 +2397,8 @@ class Model(object):
elif isinstance(specs, dict): elif isinstance(specs, dict):
assert is_input is False assert is_input is False
out_specs = [ out_specs = [
specs[n] for n in extract_args(self.network.forward) specs[n]
for n in extract_args(self.network.forward)
if n != 'self' if n != 'self'
] ]
else: else:
...@@ -2232,8 +2409,10 @@ class Model(object): ...@@ -2232,8 +2409,10 @@ class Model(object):
assert isinstance(spec, Input) assert isinstance(spec, Input)
if spec.name is None: if spec.name is None:
raise ValueError( raise ValueError(
"Requires Input[{}].name != None, but receive `None` with {}." "Requires Input[{}].name != None, but receive `None` with {}.".format(
.format(i, spec)) i, spec
)
)
return out_specs return out_specs
...@@ -2258,6 +2437,7 @@ class Model(object): ...@@ -2258,6 +2437,7 @@ class Model(object):
"Update self._inputs according to given inputs." "Update self._inputs according to given inputs."
self._input_info = self._adapter._input_info self._input_info = self._adapter._input_info
if self._input_info is not None and len(self._input_info) == 2: if self._input_info is not None and len(self._input_info) == 2:
self._inputs = self._verify_spec(None, self._input_info[0], self._inputs = self._verify_spec(
self._input_info[1], True) None, self._input_info[0], self._input_info[1], True
)
self._is_shape_inferred = True self._is_shape_inferred = True
...@@ -284,9 +284,11 @@ def fused_bias_dropout_residual_layer_norm( ...@@ -284,9 +284,11 @@ def fused_bias_dropout_residual_layer_norm(
name=None, name=None,
): ):
r""" r"""
The fused_bias_dropout_residual_layer_norm operator. The pseudo code is as follows: The fused_bias_dropout_residual_layer_norm operator. The pseudo code is as follows:
.. code-block:: python .. code-block:: python
y = layer_norm(residual + dropout(bias + x)) y = layer_norm(residual + dropout(bias + x))
Parameters: Parameters:
...@@ -315,10 +317,9 @@ def fused_bias_dropout_residual_layer_norm( ...@@ -315,10 +317,9 @@ def fused_bias_dropout_residual_layer_norm(
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
Returns: Returns:
Tensor: The output Tensor, the data type and shape is same as `x`. Tensor, The output Tensor, the data type and shape is same as `x`.
Examples: Examples:
.. code-block:: python .. code-block:: python
# required: gpu # required: gpu
...@@ -336,6 +337,7 @@ def fused_bias_dropout_residual_layer_norm( ...@@ -336,6 +337,7 @@ def fused_bias_dropout_residual_layer_norm(
x, residual, bias) x, residual, bias)
# [2, 4, 128] # [2, 4, 128]
print(output.shape) print(output.shape)
""" """
seed = None seed = None
if mode not in ('downscale_in_infer', 'upscale_in_train'): if mode not in ('downscale_in_infer', 'upscale_in_train'):
......
...@@ -16,7 +16,10 @@ from paddle.incubate.nn import functional as incubate_f ...@@ -16,7 +16,10 @@ from paddle.incubate.nn import functional as incubate_f
from paddle.nn import Layer from paddle.nn import Layer
from paddle.framework import ParamAttr from paddle.framework import ParamAttr
import paddle import paddle
from paddle.nn.layer.transformer import _convert_attention_mask, _convert_param_attr_to_list from paddle.nn.layer.transformer import (
_convert_attention_mask,
_convert_param_attr_to_list,
)
from paddle.nn.initializer import Constant from paddle.nn.initializer import Constant
from paddle.fluid.dygraph import no_grad from paddle.fluid.dygraph import no_grad
from paddle.fluid.framework import convert_np_dtype_to_dtype_, _non_static_mode from paddle.fluid.framework import convert_np_dtype_to_dtype_, _non_static_mode
...@@ -51,7 +54,8 @@ def _to_dtype(t, dtype): ...@@ -51,7 +54,8 @@ def _to_dtype(t, dtype):
if t.place.is_gpu_place(): if t.place.is_gpu_place():
size_dtype = core.size_of_dtype(dtype) size_dtype = core.size_of_dtype(dtype)
waiting_alloc_memory = ( waiting_alloc_memory = (
(np.prod(t.shape) * size_dtype) / 256 + 1) * 256 * 1.2 ((np.prod(t.shape) * size_dtype) / 256 + 1) * 256 * 1.2
)
gpu_memory_available = core.gpu_memory_available() gpu_memory_available = core.gpu_memory_available()
if gpu_memory_available < waiting_alloc_memory: if gpu_memory_available < waiting_alloc_memory:
t_used = t._copy_to(paddle.CPUPlace(), False) t_used = t._copy_to(paddle.CPUPlace(), False)
...@@ -106,31 +110,38 @@ class FusedBiasDropoutResidualLayerNorm(Layer): ...@@ -106,31 +110,38 @@ class FusedBiasDropoutResidualLayerNorm(Layer):
output = fused_bias_dropout_residual_ln(x, residual) # [2, 4, 128] output = fused_bias_dropout_residual_ln(x, residual) # [2, 4, 128]
""" """
def __init__(self, def __init__(
embed_dim, self,
dropout_rate=0.5, embed_dim,
weight_attr=None, dropout_rate=0.5,
bias_attr=None, weight_attr=None,
epsilon=1e-5, bias_attr=None,
name=None): epsilon=1e-5,
name=None,
):
super(FusedBiasDropoutResidualLayerNorm, self).__init__() super(FusedBiasDropoutResidualLayerNorm, self).__init__()
assert embed_dim > 0, ("Expected embed_dim to be greater than 0, " assert embed_dim > 0, (
"but recieved {}".format(embed_dim)) "Expected embed_dim to be greater than 0, "
"but recieved {}".format(embed_dim)
)
self._dtype = self._helper.get_default_dtype() self._dtype = self._helper.get_default_dtype()
self._bias_attr = bias_attr self._bias_attr = bias_attr
self._weight_attr = weight_attr self._weight_attr = weight_attr
self.embed_dim = embed_dim self.embed_dim = embed_dim
self.linear_bias = self.create_parameter(shape=[embed_dim], self.linear_bias = self.create_parameter(
attr=self._bias_attr, shape=[embed_dim],
dtype=self._dtype, attr=self._bias_attr,
is_bias=True) dtype=self._dtype,
is_bias=True,
)
self.ln_scale = self.create_parameter( self.ln_scale = self.create_parameter(
attr=self._weight_attr, attr=self._weight_attr,
shape=[embed_dim], shape=[embed_dim],
default_initializer=Constant(value=1.0)) default_initializer=Constant(value=1.0),
self.ln_bias = self.create_parameter(attr=self._bias_attr, )
shape=[embed_dim], self.ln_bias = self.create_parameter(
is_bias=True) attr=self._bias_attr, shape=[embed_dim], is_bias=True
)
self.dropout_rate = dropout_rate self.dropout_rate = dropout_rate
self._epsilon = epsilon self._epsilon = epsilon
...@@ -163,14 +174,20 @@ class FusedBiasDropoutResidualLayerNorm(Layer): ...@@ -163,14 +174,20 @@ class FusedBiasDropoutResidualLayerNorm(Layer):
ln_epsilon=self._epsilon, ln_epsilon=self._epsilon,
training=self.training, training=self.training,
mode='upscale_in_train', mode='upscale_in_train',
name=self.name) name=self.name,
)
return out return out
def extra_repr(self): def extra_repr(self):
name_str = ', name={}'.format(self.name) if self.name else '' name_str = ', name={}'.format(self.name) if self.name else ''
return 'embed_dim={}, seq_len={}, dropout_rate={}, epsilon={}, dtype={}{}'.format( return 'embed_dim={}, seq_len={}, dropout_rate={}, epsilon={}, dtype={}{}'.format(
self.embed_dim, self.seq_len, self.dropout_rate, self._epsilon, self.embed_dim,
self._dtype, name_str) self.seq_len,
self.dropout_rate,
self._epsilon,
self._dtype,
name_str,
)
class FusedMultiHeadAttention(Layer): class FusedMultiHeadAttention(Layer):
...@@ -246,33 +263,40 @@ class FusedMultiHeadAttention(Layer): ...@@ -246,33 +263,40 @@ class FusedMultiHeadAttention(Layer):
output = multi_head_attn(query, None, None, attn_mask=attn_mask) # [2, 4, 128] output = multi_head_attn(query, None, None, attn_mask=attn_mask) # [2, 4, 128]
""" """
def __init__(self, def __init__(
embed_dim, self,
num_heads, embed_dim,
dropout_rate=0.5, num_heads,
attn_dropout_rate=0.5, dropout_rate=0.5,
kdim=None, attn_dropout_rate=0.5,
vdim=None, kdim=None,
normalize_before=False, vdim=None,
need_weights=False, normalize_before=False,
qkv_weight_attr=None, need_weights=False,
qkv_bias_attr=None, qkv_weight_attr=None,
linear_weight_attr=None, qkv_bias_attr=None,
linear_bias_attr=None, linear_weight_attr=None,
pre_ln_scale_attr=None, linear_bias_attr=None,
pre_ln_bias_attr=None, pre_ln_scale_attr=None,
ln_scale_attr=None, pre_ln_bias_attr=None,
ln_bias_attr=None, ln_scale_attr=None,
epsilon=1e-5, ln_bias_attr=None,
nranks=1, epsilon=1e-5,
ring_id=-1, nranks=1,
name=None): ring_id=-1,
name=None,
):
super(FusedMultiHeadAttention, self).__init__() super(FusedMultiHeadAttention, self).__init__()
assert embed_dim > 0, ("Expected embed_dim to be greater than 0, " assert embed_dim > 0, (
"but received {}".format(embed_dim)) "Expected embed_dim to be greater than 0, "
assert num_heads > 0, ("Expected nhead to be greater than 0, " "but received {}".format(embed_dim)
"but received {}".format(num_heads)) )
assert (
num_heads > 0
), "Expected nhead to be greater than 0, " "but received {}".format(
num_heads
)
self.normalize_before = normalize_before self.normalize_before = normalize_before
self._dtype = self._helper.get_default_dtype() self._dtype = self._helper.get_default_dtype()
...@@ -285,7 +309,9 @@ class FusedMultiHeadAttention(Layer): ...@@ -285,7 +309,9 @@ class FusedMultiHeadAttention(Layer):
self.kdim = kdim self.kdim = kdim
self.vdim = vdim self.vdim = vdim
self.need_weights = need_weights self.need_weights = need_weights
assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads" assert (
self.head_dim * num_heads == embed_dim
), "embed_dim must be divisible by num_heads"
assert need_weights is False, "Only support need_weight is False now." assert need_weights is False, "Only support need_weight is False now."
# tensor model parallel # tensor model parallel
...@@ -296,21 +322,26 @@ class FusedMultiHeadAttention(Layer): ...@@ -296,21 +322,26 @@ class FusedMultiHeadAttention(Layer):
shape=[3, num_heads, self.head_dim, embed_dim], shape=[3, num_heads, self.head_dim, embed_dim],
attr=qkv_weight_attr, attr=qkv_weight_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=False) is_bias=False,
)
self.qkv_bias = self.create_parameter( self.qkv_bias = self.create_parameter(
shape=[3, num_heads, self.head_dim], shape=[3, num_heads, self.head_dim],
attr=qkv_bias_attr, attr=qkv_bias_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=True) is_bias=True,
)
self.linear_weight = self.create_parameter( self.linear_weight = self.create_parameter(
shape=[num_heads * self.head_dim, embed_dim], shape=[num_heads * self.head_dim, embed_dim],
attr=linear_weight_attr, attr=linear_weight_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=False) is_bias=False,
self.linear_bias = self.create_parameter(shape=[embed_dim], )
attr=linear_bias_attr, self.linear_bias = self.create_parameter(
dtype=self._dtype, shape=[embed_dim],
is_bias=True) attr=linear_bias_attr,
dtype=self._dtype,
is_bias=True,
)
# tensor model parallel # tensor model parallel
if nranks > 1: if nranks > 1:
...@@ -325,10 +356,11 @@ class FusedMultiHeadAttention(Layer): ...@@ -325,10 +356,11 @@ class FusedMultiHeadAttention(Layer):
self.pre_ln_scale = self.create_parameter( self.pre_ln_scale = self.create_parameter(
attr=pre_ln_scale_attr, attr=pre_ln_scale_attr,
shape=[embed_dim], shape=[embed_dim],
default_initializer=Constant(value=1.0)) default_initializer=Constant(value=1.0),
self.pre_ln_bias = self.create_parameter(attr=pre_ln_bias_attr, )
shape=[embed_dim], self.pre_ln_bias = self.create_parameter(
is_bias=True) attr=pre_ln_bias_attr, shape=[embed_dim], is_bias=True
)
self.ln_scale = None self.ln_scale = None
self.ln_bias = None self.ln_bias = None
else: else:
...@@ -337,10 +369,11 @@ class FusedMultiHeadAttention(Layer): ...@@ -337,10 +369,11 @@ class FusedMultiHeadAttention(Layer):
self.ln_scale = self.create_parameter( self.ln_scale = self.create_parameter(
attr=ln_scale_attr, attr=ln_scale_attr,
shape=[embed_dim], shape=[embed_dim],
default_initializer=Constant(value=1.0)) default_initializer=Constant(value=1.0),
self.ln_bias = self.create_parameter(attr=ln_bias_attr, )
shape=[embed_dim], self.ln_bias = self.create_parameter(
is_bias=True) attr=ln_bias_attr, shape=[embed_dim], is_bias=True
)
self.dropout_rate = dropout_rate self.dropout_rate = dropout_rate
self.attn_dropout_rate = attn_dropout_rate self.attn_dropout_rate = attn_dropout_rate
...@@ -404,15 +437,25 @@ class FusedMultiHeadAttention(Layer): ...@@ -404,15 +437,25 @@ class FusedMultiHeadAttention(Layer):
ln_epsilon=self._epsilon, ln_epsilon=self._epsilon,
training=self.training, training=self.training,
ring_id=self._ring_id, ring_id=self._ring_id,
name=self.name) name=self.name,
)
return out return out
def extra_repr(self): def extra_repr(self):
name_str = ', name={}'.format(self.name) if self.name else '' name_str = ', name={}'.format(self.name) if self.name else ''
return 'embed_dim={}, num_heads={}, dropout_rate={}, attn_dropout_rate={}, epsilon={}, kdim={}, vdim={}, normalize_before={}, need_weights={}, dtype={}{}'.format( return 'embed_dim={}, num_heads={}, dropout_rate={}, attn_dropout_rate={}, epsilon={}, kdim={}, vdim={}, normalize_before={}, need_weights={}, dtype={}{}'.format(
self.embed_dim, self.num_heads, self.dropout_rate, self.embed_dim,
self.attn_dropout_rate, self._epsilon, self.kdim, self.vdim, self.num_heads,
self.normalize_before, self.need_weights, self._dtype, name_str) self.dropout_rate,
self.attn_dropout_rate,
self._epsilon,
self.kdim,
self.vdim,
self.normalize_before,
self.need_weights,
self._dtype,
name_str,
)
def _amp_decorate(self, dtype): def _amp_decorate(self, dtype):
# tmp fix for amp.decorator(O2) # tmp fix for amp.decorator(O2)
...@@ -495,33 +538,39 @@ class FusedFeedForward(Layer): ...@@ -495,33 +538,39 @@ class FusedFeedForward(Layer):
# (1, 8, 8) # (1, 8, 8)
""" """
def __init__(self, def __init__(
d_model, self,
dim_feedforward, d_model,
dropout_rate=0.1, dim_feedforward,
epsilon=1e-05, dropout_rate=0.1,
activation="relu", epsilon=1e-05,
act_dropout_rate=None, activation="relu",
normalize_before=False, act_dropout_rate=None,
linear1_weight_attr=None, normalize_before=False,
linear1_bias_attr=None, linear1_weight_attr=None,
linear2_weight_attr=None, linear1_bias_attr=None,
linear2_bias_attr=None, linear2_weight_attr=None,
ln1_scale_attr=None, linear2_bias_attr=None,
ln1_bias_attr=None, ln1_scale_attr=None,
ln2_scale_attr=None, ln1_bias_attr=None,
ln2_bias_attr=None, ln2_scale_attr=None,
nranks=1, ln2_bias_attr=None,
ring_id=-1, nranks=1,
name=None): ring_id=-1,
name=None,
):
super(FusedFeedForward, self).__init__() super(FusedFeedForward, self).__init__()
assert d_model > 0, ( assert (
"Expected d_model to be greater than 0, but received {}".format( d_model > 0
d_model)) ), "Expected d_model to be greater than 0, but received {}".format(
assert dim_feedforward > 0, ( d_model
"Expected dim_feedforward to be greater than 0, but received {}". )
format(dim_feedforward)) assert (
dim_feedforward > 0
), "Expected dim_feedforward to be greater than 0, but received {}".format(
dim_feedforward
)
self._dtype = self._helper.get_default_dtype() self._dtype = self._helper.get_default_dtype()
self._d_model = d_model self._d_model = d_model
...@@ -530,7 +579,9 @@ class FusedFeedForward(Layer): ...@@ -530,7 +579,9 @@ class FusedFeedForward(Layer):
dim_feedforward = dim_feedforward // nranks dim_feedforward = dim_feedforward // nranks
self._dim_feedforward = dim_feedforward self._dim_feedforward = dim_feedforward
self._dropout_rate = dropout_rate self._dropout_rate = dropout_rate
self._act_dropout_rate = dropout_rate if act_dropout_rate is None else act_dropout_rate self._act_dropout_rate = (
dropout_rate if act_dropout_rate is None else act_dropout_rate
)
self._act_method = activation self._act_method = activation
self._normalize_before = normalize_before self._normalize_before = normalize_before
self._epsilon = epsilon self._epsilon = epsilon
...@@ -540,22 +591,28 @@ class FusedFeedForward(Layer): ...@@ -540,22 +591,28 @@ class FusedFeedForward(Layer):
shape=[d_model, dim_feedforward], shape=[d_model, dim_feedforward],
attr=linear1_weight_attr, attr=linear1_weight_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=False) is_bias=False,
self._linear1_bias = self.create_parameter(shape=[dim_feedforward], )
attr=linear1_bias_attr, self._linear1_bias = self.create_parameter(
dtype=self._dtype, shape=[dim_feedforward],
is_bias=True) attr=linear1_bias_attr,
dtype=self._dtype,
is_bias=True,
)
self._linear2_weight = self.create_parameter( self._linear2_weight = self.create_parameter(
shape=[dim_feedforward, d_model], shape=[dim_feedforward, d_model],
attr=linear2_weight_attr, attr=linear2_weight_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=False) is_bias=False,
)
self._linear2_bias = self.create_parameter(shape=[d_model], self._linear2_bias = self.create_parameter(
attr=linear2_bias_attr, shape=[d_model],
dtype=self._dtype, attr=linear2_bias_attr,
is_bias=True) dtype=self._dtype,
is_bias=True,
)
if nranks > 1: if nranks > 1:
assert ring_id != -1 assert ring_id != -1
...@@ -569,10 +626,11 @@ class FusedFeedForward(Layer): ...@@ -569,10 +626,11 @@ class FusedFeedForward(Layer):
shape=[d_model], shape=[d_model],
attr=ln1_scale_attr, attr=ln1_scale_attr,
is_bias=False, is_bias=False,
default_initializer=Constant(1.0)) default_initializer=Constant(1.0),
self._ln1_bias = self.create_parameter(shape=[d_model], )
attr=ln1_bias_attr, self._ln1_bias = self.create_parameter(
is_bias=True) shape=[d_model], attr=ln1_bias_attr, is_bias=True
)
self._ln2_scale = None self._ln2_scale = None
self._ln2_bias = None self._ln2_bias = None
else: else:
...@@ -582,10 +640,11 @@ class FusedFeedForward(Layer): ...@@ -582,10 +640,11 @@ class FusedFeedForward(Layer):
shape=[d_model], shape=[d_model],
attr=ln2_scale_attr, attr=ln2_scale_attr,
is_bias=False, is_bias=False,
default_initializer=Constant(1.0)) default_initializer=Constant(1.0),
self._ln2_bias = self.create_parameter(shape=[d_model], )
attr=ln2_bias_attr, self._ln2_bias = self.create_parameter(
is_bias=True) shape=[d_model], attr=ln2_bias_attr, is_bias=True
)
self.name = name self.name = name
...@@ -608,15 +667,23 @@ class FusedFeedForward(Layer): ...@@ -608,15 +667,23 @@ class FusedFeedForward(Layer):
pre_layer_norm=self._normalize_before, pre_layer_norm=self._normalize_before,
training=self.training, training=self.training,
ring_id=self._ring_id, ring_id=self._ring_id,
name=self.name) name=self.name,
)
return out return out
def extra_repr(self): def extra_repr(self):
name_str = ', name={}'.format(self.name) if self.name else '' name_str = ', name={}'.format(self.name) if self.name else ''
return 'd_model={}, dim_feedforward={}, dropout_rate={}, epsilon={}, activation={}, act_dropout_rate={}, normalize_before={}, dtype={}{}'.format( return 'd_model={}, dim_feedforward={}, dropout_rate={}, epsilon={}, activation={}, act_dropout_rate={}, normalize_before={}, dtype={}{}'.format(
self._d_model, self._dim_feedforward, self._dropout_rate, self._d_model,
self._epsilon, self._act_method, self._act_dropout_rate, self._dim_feedforward,
self._normalize_before, self._dtype, name_str) self._dropout_rate,
self._epsilon,
self._act_method,
self._act_dropout_rate,
self._normalize_before,
self._dtype,
name_str,
)
def _amp_decorate(self, dtype): def _amp_decorate(self, dtype):
# tmp fix for amp.decorator(O2) # tmp fix for amp.decorator(O2)
...@@ -640,6 +707,7 @@ class FusedFeedForward(Layer): ...@@ -640,6 +707,7 @@ class FusedFeedForward(Layer):
class FusedTransformerEncoderLayer(Layer): class FusedTransformerEncoderLayer(Layer):
""" """
FusedTransformerEncoderLayer is composed of two sub-layers which are self (multi-head) FusedTransformerEncoderLayer is composed of two sub-layers which are self (multi-head)
attention and feedforward network. Before and after each sub-layer, pre-process attention and feedforward network. Before and after each sub-layer, pre-process
and post-precess would be applied on the input and output accordingly. If and post-precess would be applied on the input and output accordingly. If
...@@ -681,10 +749,9 @@ class FusedTransformerEncoderLayer(Layer): ...@@ -681,10 +749,9 @@ class FusedTransformerEncoderLayer(Layer):
Examples: Examples:
.. code-block:: python .. code-block:: python
# required: gpu # required: gpu
import paddle import paddle
from paddle.incubate.nn import FusedTransformerEncoderLayer from paddle.incubate.nn import FusedTransformerEncoderLayer
...@@ -694,33 +761,47 @@ class FusedTransformerEncoderLayer(Layer): ...@@ -694,33 +761,47 @@ class FusedTransformerEncoderLayer(Layer):
attn_mask = paddle.rand((2, 2, 4, 4)) attn_mask = paddle.rand((2, 2, 4, 4))
encoder_layer = FusedTransformerEncoderLayer(128, 2, 512) encoder_layer = FusedTransformerEncoderLayer(128, 2, 512)
enc_output = encoder_layer(enc_input, attn_mask) # [2, 4, 128] enc_output = encoder_layer(enc_input, attn_mask) # [2, 4, 128]
""" """
def __init__(self, def __init__(
d_model, self,
nhead, d_model,
dim_feedforward, nhead,
dropout_rate=0.1, dim_feedforward,
activation="relu", dropout_rate=0.1,
attn_dropout_rate=None, activation="relu",
act_dropout_rate=None, attn_dropout_rate=None,
normalize_before=False, act_dropout_rate=None,
weight_attr=None, normalize_before=False,
bias_attr=None): weight_attr=None,
bias_attr=None,
):
self._config = locals() self._config = locals()
self._config.pop("self") self._config.pop("self")
self._config.pop("__class__", None) # py3 self._config.pop("__class__", None) # py3
super(FusedTransformerEncoderLayer, self).__init__() super(FusedTransformerEncoderLayer, self).__init__()
assert d_model > 0, ("Expected d_model to be greater than 0, " assert (
"but received {}".format(d_model)) d_model > 0
assert nhead > 0, ("Expected nhead to be greater than 0, " ), "Expected d_model to be greater than 0, " "but received {}".format(
"but received {}".format(nhead)) d_model
)
assert (
nhead > 0
), "Expected nhead to be greater than 0, " "but received {}".format(
nhead
)
assert dim_feedforward > 0, ( assert dim_feedforward > 0, (
"Expected dim_feedforward to be greater than 0, " "Expected dim_feedforward to be greater than 0, "
"but received {}".format(dim_feedforward)) "but received {}".format(dim_feedforward)
attn_dropout_rate = dropout_rate if attn_dropout_rate is None else attn_dropout_rate )
act_dropout_rate = dropout_rate if act_dropout_rate is None else act_dropout_rate attn_dropout_rate = (
dropout_rate if attn_dropout_rate is None else attn_dropout_rate
)
act_dropout_rate = (
dropout_rate if act_dropout_rate is None else act_dropout_rate
)
self.normalize_before = normalize_before self.normalize_before = normalize_before
weight_attrs = _convert_param_attr_to_list(weight_attr, 2) weight_attrs = _convert_param_attr_to_list(weight_attr, 2)
...@@ -739,22 +820,27 @@ class FusedTransformerEncoderLayer(Layer): ...@@ -739,22 +820,27 @@ class FusedTransformerEncoderLayer(Layer):
pre_ln_scale_attr=weight_attrs[0], pre_ln_scale_attr=weight_attrs[0],
pre_ln_bias_attr=bias_attrs[0], pre_ln_bias_attr=bias_attrs[0],
ln_scale_attr=weight_attrs[0], ln_scale_attr=weight_attrs[0],
ln_bias_attr=bias_attrs[0]) ln_bias_attr=bias_attrs[0],
)
self.ffn = FusedFeedForward(d_model,
dim_feedforward, self.ffn = FusedFeedForward(
dropout_rate=dropout_rate, d_model,
activation=activation, dim_feedforward,
act_dropout_rate=act_dropout_rate, dropout_rate=dropout_rate,
normalize_before=self.normalize_before, activation=activation,
linear1_weight_attr=weight_attrs[1], act_dropout_rate=act_dropout_rate,
linear1_bias_attr=bias_attrs[1], normalize_before=self.normalize_before,
linear2_weight_attr=weight_attrs[1], linear1_weight_attr=weight_attrs[1],
linear2_bias_attr=bias_attrs[1]) linear1_bias_attr=bias_attrs[1],
linear2_weight_attr=weight_attrs[1],
linear2_bias_attr=bias_attrs[1],
)
def forward(self, src, src_mask=None, cache=None): def forward(self, src, src_mask=None, cache=None):
""" """
Applies a Transformer encoder layer on the input. Applies a Transformer encoder layer on the input.
Parameters: Parameters:
src (Tensor): The input of Transformer encoder layer. It is src (Tensor): The input of Transformer encoder layer. It is
a tensor with shape `[batch_size, sequence_length, d_model]`. a tensor with shape `[batch_size, sequence_length, d_model]`.
...@@ -770,25 +856,27 @@ class FusedTransformerEncoderLayer(Layer): ...@@ -770,25 +856,27 @@ class FusedTransformerEncoderLayer(Layer):
`-INF` values and the others have 0 values. It can be None when `-INF` values and the others have 0 values. It can be None when
nothing wanted or needed to be prevented attention to. Default None. nothing wanted or needed to be prevented attention to. Default None.
cache (Tensor, optional): It is an instance of `MultiHeadAttention.Cache`. cache (Tensor, optional): It is an instance of `MultiHeadAttention.Cache`.
See `TransformerEncoderLayer.gen_cache` for more details. It is See :ref:`api_paddle_nn_TransformerEncoderLayer`.gen_cache for more details. It is
only used for inference and should be None for training. Default only used for inference and should be None for training. Default
None. None.
Returns: Returns:
Tensor|tuple: It is a tensor that has the same shape and data type \ Tensor|tuple, It is a tensor that has the same shape and data type \
as `enc_input`, representing the output of Transformer encoder \ as `enc_input`, representing the output of Transformer encoder \
layer. Or a tuple if `cache` is not None, except for encoder \ layer. Or a tuple if `cache` is not None, except for encoder \
layer output, the tuple includes the new cache which is same \ layer output, the tuple includes the new cache which is same \
as input `cache` argument but `incremental_cache` has an \ as input `cache` argument but `incremental_cache` has an \
incremental length. See `MultiHeadAttention.gen_cache` and \ incremental length. See `MultiHeadAttention.gen_cache` and \
`MultiHeadAttention.forward` for more details. `MultiHeadAttention.forward` for more details.
""" """
src_mask = _convert_attention_mask(src_mask, src.dtype) src_mask = _convert_attention_mask(src_mask, src.dtype)
if cache is None: if cache is None:
attn_out = self.fused_attn(src, attn_mask=src_mask) attn_out = self.fused_attn(src, attn_mask=src_mask)
else: else:
attn_out, incremental_cache = self.fused_attn(src, attn_out, incremental_cache = self.fused_attn(
attn_mask=src_mask, src, attn_mask=src_mask, cache=cache
cache=cache) )
ffn_out = self.ffn(attn_out) ffn_out = self.ffn(attn_out)
...@@ -889,21 +977,23 @@ class FusedTransformer(Layer): ...@@ -889,21 +977,23 @@ class FusedTransformer(Layer):
cross_attn_mask) # [2, 6, 128] cross_attn_mask) # [2, 6, 128]
""" """
def __init__(self, def __init__(
d_model=512, self,
nhead=8, d_model=512,
num_encoder_layers=6, nhead=8,
num_decoder_layers=6, num_encoder_layers=6,
dim_feedforward=2048, num_decoder_layers=6,
dropout=0.1, dim_feedforward=2048,
activation="relu", dropout=0.1,
attn_dropout=None, activation="relu",
act_dropout=None, attn_dropout=None,
normalize_before=False, act_dropout=None,
weight_attr=None, normalize_before=False,
bias_attr=None, weight_attr=None,
custom_encoder=None, bias_attr=None,
custom_decoder=None): custom_encoder=None,
custom_decoder=None,
):
super(fusedTransformer, self).__init__() super(fusedTransformer, self).__init__()
raise NotImplementedError() raise NotImplementedError()
...@@ -1071,40 +1161,49 @@ class FusedMultiTransformer(Layer): ...@@ -1071,40 +1161,49 @@ class FusedMultiTransformer(Layer):
enc_output = encoder_layers(enc_input, attn_mask) # [2, 4, 128] enc_output = encoder_layers(enc_input, attn_mask) # [2, 4, 128]
""" """
def __init__(self, def __init__(
embed_dim, self,
num_heads, embed_dim,
dim_feedforward, num_heads,
dropout_rate=0.0, dim_feedforward,
activation="gelu", dropout_rate=0.0,
normalize_before=True, activation="gelu",
ln_scale_attrs=None, normalize_before=True,
ln_bias_attrs=None, ln_scale_attrs=None,
qkv_weight_attrs=None, ln_bias_attrs=None,
qkv_bias_attrs=None, qkv_weight_attrs=None,
linear_weight_attrs=None, qkv_bias_attrs=None,
linear_bias_attrs=None, linear_weight_attrs=None,
ffn_ln_scale_attrs=None, linear_bias_attrs=None,
ffn_ln_bias_attrs=None, ffn_ln_scale_attrs=None,
ffn1_weight_attrs=None, ffn_ln_bias_attrs=None,
ffn1_bias_attrs=None, ffn1_weight_attrs=None,
ffn2_weight_attrs=None, ffn1_bias_attrs=None,
ffn2_bias_attrs=None, ffn2_weight_attrs=None,
epsilon=1e-5, ffn2_bias_attrs=None,
num_layers=-1, epsilon=1e-5,
nranks=1, num_layers=-1,
trans_qkvw=True, nranks=1,
ring_id=-1, trans_qkvw=True,
name=None): ring_id=-1,
name=None,
):
super(FusedMultiTransformer, self).__init__() super(FusedMultiTransformer, self).__init__()
assert embed_dim > 0, ("Expected embed_dim to be greater than 0, " assert embed_dim > 0, (
"but received {}".format(embed_dim)) "Expected embed_dim to be greater than 0, "
assert num_heads > 0, ("Expected nhead to be greater than 0, " "but received {}".format(embed_dim)
"but received {}".format(num_heads)) )
assert dim_feedforward > 0, ( assert (
"Expected dim_feedforward to be greater than 0, but received {}". num_heads > 0
format(dim_feedforward)) ), "Expected nhead to be greater than 0, " "but received {}".format(
num_heads
)
assert (
dim_feedforward > 0
), "Expected dim_feedforward to be greater than 0, but received {}".format(
dim_feedforward
)
self.normalize_before = normalize_before self.normalize_before = normalize_before
self._dtype = self._helper.get_default_dtype() self._dtype = self._helper.get_default_dtype()
...@@ -1115,7 +1214,9 @@ class FusedMultiTransformer(Layer): ...@@ -1115,7 +1214,9 @@ class FusedMultiTransformer(Layer):
self.embed_dim = embed_dim self.embed_dim = embed_dim
self.num_heads = num_heads self.num_heads = num_heads
self.head_dim = embed_dim // num_heads self.head_dim = embed_dim // num_heads
assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads" assert (
self.head_dim * num_heads == embed_dim
), "embed_dim must be divisible by num_heads"
# tensor model parallel # tensor model parallel
if nranks > 1: if nranks > 1:
...@@ -1161,57 +1262,71 @@ class FusedMultiTransformer(Layer): ...@@ -1161,57 +1262,71 @@ class FusedMultiTransformer(Layer):
ln_scale = self.create_parameter( ln_scale = self.create_parameter(
attr=ln_scale_attr, attr=ln_scale_attr,
shape=[embed_dim], shape=[embed_dim],
default_initializer=Constant(value=1.0)) default_initializer=Constant(value=1.0),
ln_bias = self.create_parameter(attr=ln_bias_attr, )
shape=[embed_dim], ln_bias = self.create_parameter(
is_bias=True) attr=ln_bias_attr, shape=[embed_dim], is_bias=True
)
qkv_weight = self.create_parameter( qkv_weight = self.create_parameter(
shape=[3, num_heads, self.head_dim, embed_dim] shape=[3, num_heads, self.head_dim, embed_dim]
if trans_qkvw else [embed_dim, 3, num_heads, self.head_dim], if trans_qkvw
else [embed_dim, 3, num_heads, self.head_dim],
attr=qkv_weight_attr, attr=qkv_weight_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=False) is_bias=False,
)
qkv_bias = self.create_parameter( qkv_bias = self.create_parameter(
shape=[3, num_heads, self.head_dim], shape=[3, num_heads, self.head_dim],
attr=qkv_bias_attr, attr=qkv_bias_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=True) is_bias=True,
)
linear_weight = self.create_parameter( linear_weight = self.create_parameter(
shape=[num_heads * self.head_dim, embed_dim], shape=[num_heads * self.head_dim, embed_dim],
attr=linear_weight_attr, attr=linear_weight_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=False) is_bias=False,
linear_bias = self.create_parameter(shape=[embed_dim], )
attr=linear_bias_attr, linear_bias = self.create_parameter(
dtype=self._dtype, shape=[embed_dim],
is_bias=True) attr=linear_bias_attr,
dtype=self._dtype,
is_bias=True,
)
ffn_ln_scale = self.create_parameter( ffn_ln_scale = self.create_parameter(
shape=[embed_dim], shape=[embed_dim],
attr=ffn_ln_scale_attr, attr=ffn_ln_scale_attr,
is_bias=False, is_bias=False,
default_initializer=Constant(1.0)) default_initializer=Constant(1.0),
ffn_ln_bias = self.create_parameter(shape=[embed_dim], )
attr=ffn_ln_bias_attr, ffn_ln_bias = self.create_parameter(
is_bias=True) shape=[embed_dim], attr=ffn_ln_bias_attr, is_bias=True
)
ffn1_weight = self.create_parameter( ffn1_weight = self.create_parameter(
shape=[embed_dim, dim_feedforward], shape=[embed_dim, dim_feedforward],
attr=ffn1_weight_attr, attr=ffn1_weight_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=False) is_bias=False,
ffn1_bias = self.create_parameter(shape=[dim_feedforward], )
attr=ffn1_bias_attr, ffn1_bias = self.create_parameter(
dtype=self._dtype, shape=[dim_feedforward],
is_bias=True) attr=ffn1_bias_attr,
dtype=self._dtype,
is_bias=True,
)
ffn2_weight = self.create_parameter( ffn2_weight = self.create_parameter(
shape=[dim_feedforward, embed_dim], shape=[dim_feedforward, embed_dim],
attr=ffn2_weight_attr, attr=ffn2_weight_attr,
dtype=self._dtype, dtype=self._dtype,
is_bias=False) is_bias=False,
ffn2_bias = self.create_parameter(shape=[embed_dim], )
attr=ffn2_bias_attr, ffn2_bias = self.create_parameter(
dtype=self._dtype, shape=[embed_dim],
is_bias=True) attr=ffn2_bias_attr,
dtype=self._dtype,
is_bias=True,
)
# tensor model parallel # tensor model parallel
if nranks > 1: if nranks > 1:
...@@ -1300,5 +1415,6 @@ class FusedMultiTransformer(Layer): ...@@ -1300,5 +1415,6 @@ class FusedMultiTransformer(Layer):
mode='upscale_in_train', mode='upscale_in_train',
trans_qkvw=self._trans_qkvw, trans_qkvw=self._trans_qkvw,
ring_id=self._ring_id, ring_id=self._ring_id,
name=self.name) name=self.name,
)
return out return out
...@@ -20,104 +20,134 @@ from paddle.fluid import core ...@@ -20,104 +20,134 @@ from paddle.fluid import core
from paddle import _C_ops, _legacy_C_ops from paddle import _C_ops, _legacy_C_ops
def graph_khop_sampler(row, def graph_khop_sampler(
colptr, row,
input_nodes, colptr,
sample_sizes, input_nodes,
sorted_eids=None, sample_sizes,
return_eids=False, sorted_eids=None,
name=None): return_eids=False,
name=None,
):
""" """
Graph Khop Sampler API. Graph Khop Sampler API.
This API is mainly used in Graph Learning domain, and the main purpose is to This API is mainly used in Graph Learning domain, and the main purpose is to
provide high performance graph khop sampling method with subgraph reindex step. provide high performance graph khop sampling method with subgraph reindex step.
For example, we get the CSC(Compressed Sparse Column) format of the input graph For example, we get the CSC(Compressed Sparse Column) format of the input graph
edges as `row` and `colptr`, so as to covert graph data into a suitable format edges as `row` and `colptr`, so as to covert graph data into a suitable format
for sampling. And the `input_nodes` means the nodes we need to sample neighbors, for sampling. And the `input_nodes` means the nodes we need to sample neighbors,
and `sample_sizes` means the number of neighbors and number of layers we want and `sample_sizes` means the number of neighbors and number of layers we want
to sample. to sample.
Args: Args:
row (Tensor): One of the components of the CSC format of the input graph, and row (Tensor): One of the components of the CSC format of the input graph, and
the shape should be [num_edges, 1] or [num_edges]. The available the shape should be [num_edges, 1] or [num_edges]. The available
data type is int32, int64. data type is int32, int64.
colptr (Tensor): One of the components of the CSC format of the input graph, colptr (Tensor): One of the components of the CSC format of the input graph,
and the shape should be [num_nodes + 1, 1] or [num_nodes]. and the shape should be [num_nodes + 1, 1] or [num_nodes].
The data type should be the same with `row`. The data type should be the same with `row`.
input_nodes (Tensor): The input nodes we need to sample neighbors for, and the input_nodes (Tensor): The input nodes we need to sample neighbors for, and the
data type should be the same with `row`. data type should be the same with `row`.
sample_sizes (list|tuple): The number of neighbors and number of layers we want sample_sizes (list|tuple): The number of neighbors and number of layers we want
to sample. The data type should be int, and the shape to sample. The data type should be int, and the shape
should only have one dimension. should only have one dimension.
sorted_eids (Tensor): The sorted edge ids, should not be None when `return_eids` sorted_eids (Tensor, optional): The sorted edge ids, should not be None when `return_eids`
is True. The shape should be [num_edges, 1], and the data is True. The shape should be [num_edges, 1], and the data
type should be the same with `row`. type should be the same with `row`. Default is None.
return_eids (bool): Whether to return the id of the sample edges. Default is False. return_eids (bool, optional): Whether to return the id of the sample edges. Default is False.
name (str, optional): Name for the operation (optional, default is None). name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
Returns: Returns:
edge_src (Tensor): The src index of the output edges, also means the first column of - edge_src (Tensor), The src index of the output edges, also means the first column of
the edges. The shape is [num_sample_edges, 1] currently. the edges. The shape is [num_sample_edges, 1] currently.
edge_dst (Tensor): The dst index of the output edges, also means the second column - edge_dst (Tensor), The dst index of the output edges, also means the second column
of the edges. The shape is [num_sample_edges, 1] currently. of the edges. The shape is [num_sample_edges, 1] currently.
sample_index (Tensor): The original id of the input nodes and sampled neighbor nodes. - sample_index (Tensor), The original id of the input nodes and sampled neighbor nodes.
reindex_nodes (Tensor): The reindex id of the input nodes. - reindex_nodes (Tensor), The reindex id of the input nodes.
edge_eids (Tensor): Return the id of the sample edges if `return_eids` is True. - edge_eids (Tensor), Return the id of the sample edges if `return_eids` is True.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7]
colptr = [0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13]
nodes = [0, 8, 1, 2]
sample_sizes = [2, 2]
row = paddle.to_tensor(row, dtype="int64")
colptr = paddle.to_tensor(colptr, dtype="int64")
nodes = paddle.to_tensor(nodes, dtype="int64")
row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7] edge_src, edge_dst, sample_index, reindex_nodes = paddle.incubate.graph_khop_sampler(row, colptr, nodes, sample_sizes, False)
colptr = [0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13]
nodes = [0, 8, 1, 2]
sample_sizes = [2, 2]
row = paddle.to_tensor(row, dtype="int64")
colptr = paddle.to_tensor(colptr, dtype="int64")
nodes = paddle.to_tensor(nodes, dtype="int64")
edge_src, edge_dst, sample_index, reindex_nodes = \
paddle.incubate.graph_khop_sampler(row, colptr, nodes, sample_sizes, False)
""" """
if _non_static_mode(): if _non_static_mode():
if return_eids: if return_eids:
if sorted_eids is None: if sorted_eids is None:
raise ValueError(f"`sorted_eid` should not be None " raise ValueError(
f"if return_eids is True.") f"`sorted_eid` should not be None "
edge_src, edge_dst, sample_index, reindex_nodes, edge_eids = \ f"if return_eids is True."
_legacy_C_ops.graph_khop_sampler(row, sorted_eids, )
colptr, input_nodes, (
"sample_sizes", sample_sizes, edge_src,
"return_eids", True) edge_dst,
sample_index,
reindex_nodes,
edge_eids,
) = _legacy_C_ops.graph_khop_sampler(
row,
sorted_eids,
colptr,
input_nodes,
"sample_sizes",
sample_sizes,
"return_eids",
True,
)
return edge_src, edge_dst, sample_index, reindex_nodes, edge_eids return edge_src, edge_dst, sample_index, reindex_nodes, edge_eids
else: else:
edge_src, edge_dst, sample_index, reindex_nodes, _ = \ (
_legacy_C_ops.graph_khop_sampler(row, None, edge_src,
colptr, input_nodes, edge_dst,
"sample_sizes", sample_sizes, sample_index,
"return_eids", False) reindex_nodes,
_,
) = _legacy_C_ops.graph_khop_sampler(
row,
None,
colptr,
input_nodes,
"sample_sizes",
sample_sizes,
"return_eids",
False,
)
return edge_src, edge_dst, sample_index, reindex_nodes return edge_src, edge_dst, sample_index, reindex_nodes
check_variable_and_dtype(row, "Row", ("int32", "int64"), check_variable_and_dtype(
"graph_khop_sampler") row, "Row", ("int32", "int64"), "graph_khop_sampler"
)
if return_eids: if return_eids:
if sorted_eids is None: if sorted_eids is None:
raise ValueError(f"`sorted_eid` should not be None " raise ValueError(
f"if return_eids is True.") f"`sorted_eid` should not be None " f"if return_eids is True."
check_variable_and_dtype(sorted_eids, "Eids", ("int32", "int64"), )
"graph_khop_sampler") check_variable_and_dtype(
sorted_eids, "Eids", ("int32", "int64"), "graph_khop_sampler"
)
check_variable_and_dtype(colptr, "Col_Ptr", ("int32", "int64"), check_variable_and_dtype(
"graph_khop_sampler") colptr, "Col_Ptr", ("int32", "int64"), "graph_khop_sampler"
check_variable_and_dtype(input_nodes, "X", ("int32", "int64"), )
"graph_khop_sampler") check_variable_and_dtype(
input_nodes, "X", ("int32", "int64"), "graph_khop_sampler"
)
helper = LayerHelper("graph_khop_sampler", **locals()) helper = LayerHelper("graph_khop_sampler", **locals())
edge_src = helper.create_variable_for_type_inference(dtype=row.dtype) edge_src = helper.create_variable_for_type_inference(dtype=row.dtype)
...@@ -125,24 +155,23 @@ def graph_khop_sampler(row, ...@@ -125,24 +155,23 @@ def graph_khop_sampler(row,
sample_index = helper.create_variable_for_type_inference(dtype=row.dtype) sample_index = helper.create_variable_for_type_inference(dtype=row.dtype)
reindex_nodes = helper.create_variable_for_type_inference(dtype=row.dtype) reindex_nodes = helper.create_variable_for_type_inference(dtype=row.dtype)
edge_eids = helper.create_variable_for_type_inference(dtype=row.dtype) edge_eids = helper.create_variable_for_type_inference(dtype=row.dtype)
helper.append_op(type="graph_khop_sampler", helper.append_op(
inputs={ type="graph_khop_sampler",
"Row": row, inputs={
"Eids": sorted_eids, "Row": row,
"Col_Ptr": colptr, "Eids": sorted_eids,
"X": input_nodes "Col_Ptr": colptr,
}, "X": input_nodes,
outputs={ },
"Out_Src": edge_src, outputs={
"Out_Dst": edge_dst, "Out_Src": edge_src,
"Sample_Index": sample_index, "Out_Dst": edge_dst,
"Reindex_X": reindex_nodes, "Sample_Index": sample_index,
"Out_Eids": edge_eids "Reindex_X": reindex_nodes,
}, "Out_Eids": edge_eids,
attrs={ },
"sample_sizes": sample_sizes, attrs={"sample_sizes": sample_sizes, "return_eids": return_eids},
"return_eids": return_eids )
})
if return_eids: if return_eids:
return edge_src, edge_dst, sample_index, reindex_nodes, edge_eids return edge_src, edge_dst, sample_index, reindex_nodes, edge_eids
else: else:
......
...@@ -21,18 +21,23 @@ from paddle import _C_ops, _legacy_C_ops ...@@ -21,18 +21,23 @@ from paddle import _C_ops, _legacy_C_ops
import paddle.utils.deprecated as deprecated import paddle.utils.deprecated as deprecated
@deprecated(since="2.4.0", @deprecated(
update_to="paddle.geometric.reindex_graph", since="2.4.0",
level=1, update_to="paddle.geometric.reindex_graph",
reason="paddle.incubate.graph_reindex will be removed in future") level=1,
def graph_reindex(x, reason="paddle.incubate.graph_reindex will be removed in future",
neighbors, )
count, def graph_reindex(
value_buffer=None, x,
index_buffer=None, neighbors,
flag_buffer_hashtable=False, count,
name=None): value_buffer=None,
index_buffer=None,
flag_buffer_hashtable=False,
name=None,
):
""" """
Graph Reindex API. Graph Reindex API.
This API is mainly used in Graph Learning domain, which should be used This API is mainly used in Graph Learning domain, which should be used
...@@ -40,11 +45,11 @@ def graph_reindex(x, ...@@ -40,11 +45,11 @@ def graph_reindex(x,
is to reindex the ids information of the input nodes, and return the is to reindex the ids information of the input nodes, and return the
corresponding graph edges after reindex. corresponding graph edges after reindex.
**Notes**: Notes:
The number in x should be unique, otherwise it would cause potential errors. The number in x should be unique, otherwise it would cause potential errors.
Besides, we also support multi-edge-types neighbors reindexing. If we have different Besides, we also support multi-edge-types neighbors reindexing. If we have different
edge_type neighbors for x, we should concatenate all the neighbors and count of x. edge_type neighbors for x, we should concatenate all the neighbors and count of x.
We will reindex all the nodes from 0. We will reindex all the nodes from 0.
Take input nodes x = [0, 1, 2] as an example. Take input nodes x = [0, 1, 2] as an example.
If we have neighbors = [8, 9, 0, 4, 7, 6, 7], and count = [2, 3, 2], If we have neighbors = [8, 9, 0, 4, 7, 6, 7], and count = [2, 3, 2],
...@@ -58,98 +63,105 @@ def graph_reindex(x, ...@@ -58,98 +63,105 @@ def graph_reindex(x,
should be the same with `x`. should be the same with `x`.
count (Tensor): The neighbor count of the input nodes `x`. And the count (Tensor): The neighbor count of the input nodes `x`. And the
data type should be int32. data type should be int32.
value_buffer (Tensor|None): Value buffer for hashtable. The data type should value_buffer (Tensor, optional): Value buffer for hashtable. The data type should
be int32, and should be filled with -1. be int32, and should be filled with -1. Default is None.
index_buffer (Tensor|None): Index buffer for hashtable. The data type should index_buffer (Tensor, optional): Index buffer for hashtable. The data type should
be int32, and should be filled with -1. be int32, and should be filled with -1. Default is None.
flag_buffer_hashtable (bool): Whether to use buffer for hashtable to speed up. flag_buffer_hashtable (bool, optional): Whether to use buffer for hashtable to speed up.
Default is False. Only useful for gpu version currently. Default is False. Only useful for gpu version currently.
name (str, optional): Name for the operation (optional, default is None). name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
Returns: Returns:
reindex_src (Tensor): The source node index of graph edges after reindex. - reindex_src (Tensor), The source node index of graph edges after reindex.
reindex_dst (Tensor): The destination node index of graph edges after reindex. - reindex_dst (Tensor), The destination node index of graph edges after reindex.
out_nodes (Tensor): The index of unique input nodes and neighbors before reindex, - out_nodes (Tensor), The index of unique input nodes and neighbors before reindex,
where we put the input nodes `x` in the front, and put neighbor where we put the input nodes `x` in the front, and put neighbor
nodes in the back. nodes in the back.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
x = [0, 1, 2] x = [0, 1, 2]
neighbors_e1 = [8, 9, 0, 4, 7, 6, 7] neighbors_e1 = [8, 9, 0, 4, 7, 6, 7]
count_e1 = [2, 3, 2] count_e1 = [2, 3, 2]
x = paddle.to_tensor(x, dtype="int64") x = paddle.to_tensor(x, dtype="int64")
neighbors_e1 = paddle.to_tensor(neighbors_e1, dtype="int64") neighbors_e1 = paddle.to_tensor(neighbors_e1, dtype="int64")
count_e1 = paddle.to_tensor(count_e1, dtype="int32") count_e1 = paddle.to_tensor(count_e1, dtype="int32")
reindex_src, reindex_dst, out_nodes = \ reindex_src, reindex_dst, out_nodes = \
paddle.incubate.graph_reindex(x, neighbors_e1, count_e1) paddle.incubate.graph_reindex(x, neighbors_e1, count_e1)
# reindex_src: [3, 4, 0, 5, 6, 7, 6] # reindex_src: [3, 4, 0, 5, 6, 7, 6]
# reindex_dst: [0, 0, 1, 1, 1, 2, 2] # reindex_dst: [0, 0, 1, 1, 1, 2, 2]
# out_nodes: [0, 1, 2, 8, 9, 4, 7, 6] # out_nodes: [0, 1, 2, 8, 9, 4, 7, 6]
neighbors_e2 = [0, 2, 3, 5, 1] neighbors_e2 = [0, 2, 3, 5, 1]
count_e2 = [1, 3, 1] count_e2 = [1, 3, 1]
neighbors_e2 = paddle.to_tensor(neighbors_e2, dtype="int64") neighbors_e2 = paddle.to_tensor(neighbors_e2, dtype="int64")
count_e2 = paddle.to_tensor(count_e2, dtype="int32") count_e2 = paddle.to_tensor(count_e2, dtype="int32")
neighbors = paddle.concat([neighbors_e1, neighbors_e2]) neighbors = paddle.concat([neighbors_e1, neighbors_e2])
count = paddle.concat([count_e1, count_e2]) count = paddle.concat([count_e1, count_e2])
reindex_src, reindex_dst, out_nodes = \ reindex_src, reindex_dst, out_nodes = \
paddle.incubate.graph_reindex(x, neighbors, count) paddle.incubate.graph_reindex(x, neighbors, count)
# reindex_src: [3, 4, 0, 5, 6, 7, 6, 0, 2, 8, 9, 1] # reindex_src: [3, 4, 0, 5, 6, 7, 6, 0, 2, 8, 9, 1]
# reindex_dst: [0, 0, 1, 1, 1, 2, 2, 0, 1, 1, 1, 2] # reindex_dst: [0, 0, 1, 1, 1, 2, 2, 0, 1, 1, 1, 2]
# out_nodes: [0, 1, 2, 8, 9, 4, 7, 6, 3, 5] # out_nodes: [0, 1, 2, 8, 9, 4, 7, 6, 3, 5]
""" """
if flag_buffer_hashtable: if flag_buffer_hashtable:
if value_buffer is None or index_buffer is None: if value_buffer is None or index_buffer is None:
raise ValueError(f"`value_buffer` and `index_buffer` should not" raise ValueError(
"be None if `flag_buffer_hashtable` is True.") f"`value_buffer` and `index_buffer` should not"
"be None if `flag_buffer_hashtable` is True."
)
if _non_static_mode(): if _non_static_mode():
reindex_src, reindex_dst, out_nodes = \ reindex_src, reindex_dst, out_nodes = _legacy_C_ops.graph_reindex(
_legacy_C_ops.graph_reindex(x, neighbors, count, value_buffer, index_buffer, x,
"flag_buffer_hashtable", flag_buffer_hashtable) neighbors,
count,
value_buffer,
index_buffer,
"flag_buffer_hashtable",
flag_buffer_hashtable,
)
return reindex_src, reindex_dst, out_nodes return reindex_src, reindex_dst, out_nodes
check_variable_and_dtype(x, "X", ("int32", "int64"), "graph_reindex") check_variable_and_dtype(x, "X", ("int32", "int64"), "graph_reindex")
check_variable_and_dtype(neighbors, "Neighbors", ("int32", "int64"), check_variable_and_dtype(
"graph_reindex") neighbors, "Neighbors", ("int32", "int64"), "graph_reindex"
)
check_variable_and_dtype(count, "Count", ("int32"), "graph_reindex") check_variable_and_dtype(count, "Count", ("int32"), "graph_reindex")
if flag_buffer_hashtable: if flag_buffer_hashtable:
check_variable_and_dtype(value_buffer, "HashTable_Value", ("int32"), check_variable_and_dtype(
"graph_reindex") value_buffer, "HashTable_Value", ("int32"), "graph_reindex"
check_variable_and_dtype(index_buffer, "HashTable_Index", ("int32"), )
"graph_reindex") check_variable_and_dtype(
index_buffer, "HashTable_Index", ("int32"), "graph_reindex"
)
helper = LayerHelper("graph_reindex", **locals()) helper = LayerHelper("graph_reindex", **locals())
reindex_src = helper.create_variable_for_type_inference(dtype=x.dtype) reindex_src = helper.create_variable_for_type_inference(dtype=x.dtype)
reindex_dst = helper.create_variable_for_type_inference(dtype=x.dtype) reindex_dst = helper.create_variable_for_type_inference(dtype=x.dtype)
out_nodes = helper.create_variable_for_type_inference(dtype=x.dtype) out_nodes = helper.create_variable_for_type_inference(dtype=x.dtype)
helper.append_op(type="graph_reindex", helper.append_op(
inputs={ type="graph_reindex",
"X": inputs={
x, "X": x,
"Neighbors": "Neighbors": neighbors,
neighbors, "Count": count,
"Count": "HashTable_Value": value_buffer if flag_buffer_hashtable else None,
count, "HashTable_Index": index_buffer if flag_buffer_hashtable else None,
"HashTable_Value": },
value_buffer if flag_buffer_hashtable else None, outputs={
"HashTable_Index": "Reindex_Src": reindex_src,
index_buffer if flag_buffer_hashtable else None, "Reindex_Dst": reindex_dst,
}, "Out_Nodes": out_nodes,
outputs={ },
"Reindex_Src": reindex_src, attrs={"flag_buffer_hashtable": flag_buffer_hashtable},
"Reindex_Dst": reindex_dst, )
"Out_Nodes": out_nodes
},
attrs={"flag_buffer_hashtable": flag_buffer_hashtable})
return reindex_src, reindex_dst, out_nodes return reindex_src, reindex_dst, out_nodes
...@@ -25,17 +25,21 @@ import paddle.utils.deprecated as deprecated ...@@ -25,17 +25,21 @@ import paddle.utils.deprecated as deprecated
since="2.4.0", since="2.4.0",
update_to="paddle.geometric.sample_neighbors", update_to="paddle.geometric.sample_neighbors",
level=1, level=1,
reason="paddle.incubate.graph_sample_neighbors will be removed in future") reason="paddle.incubate.graph_sample_neighbors will be removed in future",
def graph_sample_neighbors(row, )
colptr, def graph_sample_neighbors(
input_nodes, row,
eids=None, colptr,
perm_buffer=None, input_nodes,
sample_size=-1, eids=None,
return_eids=False, perm_buffer=None,
flag_perm_buffer=False, sample_size=-1,
name=None): return_eids=False,
flag_perm_buffer=False,
name=None,
):
""" """
Graph Sample Neighbors API. Graph Sample Neighbors API.
This API is mainly used in Graph Learning domain, and the main purpose is to This API is mainly used in Graph Learning domain, and the main purpose is to
...@@ -71,86 +75,109 @@ def graph_sample_neighbors(row, ...@@ -71,86 +75,109 @@ def graph_sample_neighbors(row,
For more information, please refer to :ref:`api_guide_Name`. For more information, please refer to :ref:`api_guide_Name`.
Returns: Returns:
out_neighbors (Tensor): The sample neighbors of the input nodes. - out_neighbors (Tensor), The sample neighbors of the input nodes.
out_count (Tensor): The number of sampling neighbors of each input node, and the shape - out_count (Tensor), The number of sampling neighbors of each input node, and the shape should be the same with `input_nodes`.
should be the same with `input_nodes`. - out_eids (Tensor), If `return_eids` is True, we will return the eid information of the sample edges.
out_eids (Tensor): If `return_eids` is True, we will return the eid information of the
sample edges.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle
# edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4), import paddle
# (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8) # edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4),
row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7] # (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8)
colptr = [0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13] row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7]
nodes = [0, 8, 1, 2] colptr = [0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13]
sample_size = 2 nodes = [0, 8, 1, 2]
row = paddle.to_tensor(row, dtype="int64") sample_size = 2
colptr = paddle.to_tensor(colptr, dtype="int64") row = paddle.to_tensor(row, dtype="int64")
nodes = paddle.to_tensor(nodes, dtype="int64") colptr = paddle.to_tensor(colptr, dtype="int64")
out_neighbors, out_count = \ nodes = paddle.to_tensor(nodes, dtype="int64")
paddle.incubate.graph_sample_neighbors(row, colptr, nodes, out_neighbors, out_count = \
sample_size=sample_size) paddle.incubate.graph_sample_neighbors(row, colptr, nodes,
sample_size=sample_size)
""" """
if return_eids: if return_eids:
if eids is None: if eids is None:
raise ValueError( raise ValueError(
f"`eids` should not be None if `return_eids` is True.") f"`eids` should not be None if `return_eids` is True."
)
if flag_perm_buffer: if flag_perm_buffer:
if perm_buffer is None: if perm_buffer is None:
raise ValueError( raise ValueError(
f"`perm_buffer` should not be None if `flag_perm_buffer`" f"`perm_buffer` should not be None if `flag_perm_buffer`"
"is True.") "is True."
)
if _non_static_mode(): if _non_static_mode():
out_neighbors, out_count, out_eids = _legacy_C_ops.graph_sample_neighbors( (
row, colptr, input_nodes, eids, perm_buffer, "sample_size", out_neighbors,
sample_size, "return_eids", return_eids, "flag_perm_buffer", out_count,
flag_perm_buffer) out_eids,
) = _legacy_C_ops.graph_sample_neighbors(
row,
colptr,
input_nodes,
eids,
perm_buffer,
"sample_size",
sample_size,
"return_eids",
return_eids,
"flag_perm_buffer",
flag_perm_buffer,
)
if return_eids: if return_eids:
return out_neighbors, out_count, out_eids return out_neighbors, out_count, out_eids
return out_neighbors, out_count return out_neighbors, out_count
check_variable_and_dtype(row, "Row", ("int32", "int64"), check_variable_and_dtype(
"graph_sample_neighbors") row, "Row", ("int32", "int64"), "graph_sample_neighbors"
check_variable_and_dtype(colptr, "Col_Ptr", ("int32", "int64"), )
"graph_sample_neighbors") check_variable_and_dtype(
check_variable_and_dtype(input_nodes, "X", ("int32", "int64"), colptr, "Col_Ptr", ("int32", "int64"), "graph_sample_neighbors"
"graph_sample_neighbors") )
check_variable_and_dtype(
input_nodes, "X", ("int32", "int64"), "graph_sample_neighbors"
)
if return_eids: if return_eids:
check_variable_and_dtype(eids, "Eids", ("int32", "int64"), check_variable_and_dtype(
"graph_sample_neighbors") eids, "Eids", ("int32", "int64"), "graph_sample_neighbors"
)
if flag_perm_buffer: if flag_perm_buffer:
check_variable_and_dtype(perm_buffer, "Perm_Buffer", ("int32", "int64"), check_variable_and_dtype(
"graph_sample_neighbors") perm_buffer,
"Perm_Buffer",
("int32", "int64"),
"graph_sample_neighbors",
)
helper = LayerHelper("graph_sample_neighbors", **locals()) helper = LayerHelper("graph_sample_neighbors", **locals())
out_neighbors = helper.create_variable_for_type_inference(dtype=row.dtype) out_neighbors = helper.create_variable_for_type_inference(dtype=row.dtype)
out_count = helper.create_variable_for_type_inference(dtype=row.dtype) out_count = helper.create_variable_for_type_inference(dtype=row.dtype)
out_eids = helper.create_variable_for_type_inference(dtype=row.dtype) out_eids = helper.create_variable_for_type_inference(dtype=row.dtype)
helper.append_op(type="graph_sample_neighbors", helper.append_op(
inputs={ type="graph_sample_neighbors",
"Row": row, inputs={
"Col_Ptr": colptr, "Row": row,
"X": input_nodes, "Col_Ptr": colptr,
"Eids": eids if return_eids else None, "X": input_nodes,
"Perm_Buffer": "Eids": eids if return_eids else None,
perm_buffer if flag_perm_buffer else None "Perm_Buffer": perm_buffer if flag_perm_buffer else None,
}, },
outputs={ outputs={
"Out": out_neighbors, "Out": out_neighbors,
"Out_Count": out_count, "Out_Count": out_count,
"Out_Eids": out_eids "Out_Eids": out_eids,
}, },
attrs={ attrs={
"sample_size": sample_size, "sample_size": sample_size,
"return_eids": return_eids, "return_eids": return_eids,
"flag_perm_buffer": flag_perm_buffer "flag_perm_buffer": flag_perm_buffer,
}) },
)
if return_eids: if return_eids:
return out_neighbors, out_count, out_eids return out_neighbors, out_count, out_eids
return out_neighbors, out_count return out_neighbors, out_count
...@@ -36,106 +36,232 @@ from paddle import _C_ops, _legacy_C_ops ...@@ -36,106 +36,232 @@ from paddle import _C_ops, _legacy_C_ops
__all__ = ['resnet_basic_block', 'ResNetBasicBlock'] __all__ = ['resnet_basic_block', 'ResNetBasicBlock']
def resnet_basic_block(x, def resnet_basic_block(
filter1, x,
scale1, filter1,
bias1, scale1,
mean1, bias1,
var1, mean1,
filter2, var1,
scale2, filter2,
bias2, scale2,
mean2, bias2,
var2, mean2,
filter3, var2,
scale3, filter3,
bias3, scale3,
mean3, bias3,
var3, mean3,
stride1, var3,
stride2, stride1,
stride3, stride2,
padding1, stride3,
padding2, padding1,
padding3, padding2,
dilation1, padding3,
dilation2, dilation1,
dilation3, dilation2,
groups, dilation3,
momentum, groups,
eps, momentum,
data_format, eps,
has_shortcut, data_format,
use_global_stats=None, has_shortcut,
training=False, use_global_stats=None,
trainable_statistics=False, training=False,
find_conv_max=True): trainable_statistics=False,
find_conv_max=True,
):
if fluid.framework.in_dygraph_mode(): if fluid.framework.in_dygraph_mode():
attrs = ('stride1', stride1, 'stride2', stride2, 'stride3', stride3, attrs = (
'padding1', padding1, 'padding2', padding2, 'padding3', 'stride1',
padding3, 'dilation1', dilation1, 'dilation2', dilation2, stride1,
'dilation3', dilation3, 'group', groups, 'momentum', momentum, 'stride2',
'epsilon', eps, 'data_format', data_format, 'has_shortcut', stride2,
has_shortcut, 'use_global_stats', use_global_stats, 'stride3',
"trainable_statistics", trainable_statistics, 'is_test', stride3,
not training, 'act_type', "relu", 'find_conv_input_max', 'padding1',
find_conv_max) padding1,
'padding2',
out, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _ = \ padding2,
getattr(_C_ops, "resnet_basic_block")(x, filter1, scale1, bias1, mean1, var1, filter2, scale2, bias2, mean2, var2, \ 'padding3',
filter3, scale3, bias3, mean3, var3, mean1, var1, mean2, var2, mean3, var3, *attrs) padding3,
'dilation1',
dilation1,
'dilation2',
dilation2,
'dilation3',
dilation3,
'group',
groups,
'momentum',
momentum,
'epsilon',
eps,
'data_format',
data_format,
'has_shortcut',
has_shortcut,
'use_global_stats',
use_global_stats,
"trainable_statistics",
trainable_statistics,
'is_test',
not training,
'act_type',
"relu",
'find_conv_input_max',
find_conv_max,
)
(
out,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
_,
) = getattr(_C_ops, "resnet_basic_block")(
x,
filter1,
scale1,
bias1,
mean1,
var1,
filter2,
scale2,
bias2,
mean2,
var2,
filter3,
scale3,
bias3,
mean3,
var3,
mean1,
var1,
mean2,
var2,
mean3,
var3,
*attrs
)
return out return out
helper = LayerHelper('resnet_basic_block', **locals()) helper = LayerHelper('resnet_basic_block', **locals())
bn_param_dtype = fluid.core.VarDesc.VarType.FP32 bn_param_dtype = fluid.core.VarDesc.VarType.FP32
max_dtype = fluid.core.VarDesc.VarType.FP32 max_dtype = fluid.core.VarDesc.VarType.FP32
out = helper.create_variable_for_type_inference(dtype=x.dtype, out = helper.create_variable_for_type_inference(
stop_gradient=True) dtype=x.dtype, stop_gradient=True
conv1 = helper.create_variable_for_type_inference(dtype=x.dtype, )
stop_gradient=True) conv1 = helper.create_variable_for_type_inference(
dtype=x.dtype, stop_gradient=True
)
saved_mean1 = helper.create_variable_for_type_inference( saved_mean1 = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True) dtype=bn_param_dtype, stop_gradient=True
)
saved_invstd1 = helper.create_variable_for_type_inference( saved_invstd1 = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True) dtype=bn_param_dtype, stop_gradient=True
running_mean1 = helper.create_variable_for_type_inference( )
dtype=bn_param_dtype, stop_gradient=True) if mean1 is None else mean1 running_mean1 = (
running_var1 = helper.create_variable_for_type_inference( helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True) if var1 is None else var1 dtype=bn_param_dtype, stop_gradient=True
conv2 = helper.create_variable_for_type_inference(dtype=x.dtype, )
stop_gradient=True) if mean1 is None
conv2_input = helper.create_variable_for_type_inference(dtype=x.dtype, else mean1
stop_gradient=True) )
running_var1 = (
helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True
)
if var1 is None
else var1
)
conv2 = helper.create_variable_for_type_inference(
dtype=x.dtype, stop_gradient=True
)
conv2_input = helper.create_variable_for_type_inference(
dtype=x.dtype, stop_gradient=True
)
saved_mean2 = helper.create_variable_for_type_inference( saved_mean2 = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True) dtype=bn_param_dtype, stop_gradient=True
)
saved_invstd2 = helper.create_variable_for_type_inference( saved_invstd2 = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True) dtype=bn_param_dtype, stop_gradient=True
running_mean2 = helper.create_variable_for_type_inference( )
dtype=bn_param_dtype, stop_gradient=True) if mean2 is None else mean2 running_mean2 = (
running_var2 = helper.create_variable_for_type_inference( helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True) if var2 is None else var2 dtype=bn_param_dtype, stop_gradient=True
conv3 = helper.create_variable_for_type_inference(dtype=x.dtype, )
stop_gradient=True) if mean2 is None
else mean2
)
running_var2 = (
helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True
)
if var2 is None
else var2
)
conv3 = helper.create_variable_for_type_inference(
dtype=x.dtype, stop_gradient=True
)
saved_mean3 = helper.create_variable_for_type_inference( saved_mean3 = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True) dtype=bn_param_dtype, stop_gradient=True
)
saved_invstd3 = helper.create_variable_for_type_inference( saved_invstd3 = helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True) dtype=bn_param_dtype, stop_gradient=True
running_mean3 = helper.create_variable_for_type_inference( )
dtype=bn_param_dtype, stop_gradient=True) if mean3 is None else mean3 running_mean3 = (
running_var3 = helper.create_variable_for_type_inference( helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True) if var3 is None else var3 dtype=bn_param_dtype, stop_gradient=True
)
if mean3 is None
else mean3
)
running_var3 = (
helper.create_variable_for_type_inference(
dtype=bn_param_dtype, stop_gradient=True
)
if var3 is None
else var3
)
conv1_input_max = helper.create_variable_for_type_inference( conv1_input_max = helper.create_variable_for_type_inference(
dtype=max_dtype, stop_gradient=True) dtype=max_dtype, stop_gradient=True
)
conv1_filter_max = helper.create_variable_for_type_inference( conv1_filter_max = helper.create_variable_for_type_inference(
dtype=max_dtype, stop_gradient=True) dtype=max_dtype, stop_gradient=True
)
conv2_input_max = helper.create_variable_for_type_inference( conv2_input_max = helper.create_variable_for_type_inference(
dtype=max_dtype, stop_gradient=True) dtype=max_dtype, stop_gradient=True
)
conv2_filter_max = helper.create_variable_for_type_inference( conv2_filter_max = helper.create_variable_for_type_inference(
dtype=max_dtype, stop_gradient=True) dtype=max_dtype, stop_gradient=True
)
conv3_input_max = helper.create_variable_for_type_inference( conv3_input_max = helper.create_variable_for_type_inference(
dtype=max_dtype, stop_gradient=True) dtype=max_dtype, stop_gradient=True
)
conv3_filter_max = helper.create_variable_for_type_inference( conv3_filter_max = helper.create_variable_for_type_inference(
dtype=max_dtype, stop_gradient=True) dtype=max_dtype, stop_gradient=True
)
inputs = { inputs = {
'X': x, 'X': x,
...@@ -175,7 +301,7 @@ def resnet_basic_block(x, ...@@ -175,7 +301,7 @@ def resnet_basic_block(x,
"trainable_statistics": trainable_statistics, "trainable_statistics": trainable_statistics,
'is_test': not training, 'is_test': not training,
'act_type': "relu", 'act_type': "relu",
'find_conv_input_max': find_conv_max 'find_conv_input_max': find_conv_max,
} }
outputs = { outputs = {
...@@ -203,88 +329,172 @@ def resnet_basic_block(x, ...@@ -203,88 +329,172 @@ def resnet_basic_block(x,
'MaxInput3': conv3_input_max, 'MaxInput3': conv3_input_max,
'MaxFilter3': conv3_filter_max, 'MaxFilter3': conv3_filter_max,
} }
helper.append_op(type='resnet_basic_block', helper.append_op(
inputs=inputs, type='resnet_basic_block', inputs=inputs, outputs=outputs, attrs=attrs
outputs=outputs, )
attrs=attrs)
return out return out
class ResNetBasicBlock(Layer): class ResNetBasicBlock(Layer):
""" r"""
ResNetBasicBlock is designed for optimize the performence of the basic unit of ssd resnet block. ResNetBasicBlock is designed for optimize the performence of the basic unit of ssd resnet block.
The fusion op architecture like this: If has_shortcut = True, it can calculate 3 Conv2D, 3 BatchNorm and 2 ReLU in one time.
has_shortcut = True: else: If has_shortcut = False, it can calculate 2 Conv2D, 2 BatchNorm and 2 ReLU in one time. In this
X X case the shape of output is same with input.
/ /
| | | |
CONV1 | CONV1 | Args:
| | | | num_channels (int): The number of input image channel.
BN1 | BN1 | num_filter (int): The number of filter. It is as same as the output image channel.
| | | | filter_size (int|list|tuple): The filter size. If filter_size
RELU1 | RELU1 | is a tuple, it must contain two integers, (filter_size_height,
| | | | filter_size_width). Otherwise, filter_size_height = filter_size_width =\
CONV2 CONV3 CONV2 | filter_size.
| | | | stride (int, optional): The stride size. It means the stride in convolution.
BN2 BN3 BN2 | If stride is a tuple, it must contain two integers, (stride_height, stride_width).
\ / \ / Otherwise, stride_height = stride_width = stride. Default: stride = 1.
ADD ADD act (str, optional): Activation type, if it is set to None, activation is not appended.
| | Default: None
RELU RELU momentum (float, optional): The value used for the moving_mean and
| | moving_var computation. This should be a float number or a Tensor with
Y Y shape [1] and data type as float32. The updated formula is:
:math:`moving\_mean = moving\_mean * momentum + new\_mean * (1. - momentum)`
:math:`moving\_var = moving\_var * momentum + new\_var * (1. - momentum)`
Default is 0.9.
eps (float, optional): A value added to the denominator for
numerical stability. Default is 1e-5.
data_format (str, optional): Specify the data format of the input, and the data format of the output
will be consistent with that of the input. Now is only support `"NCHW"`, the data is stored in
the order of: `[batch_size, input_channels, input_height, input_width]`.
has_shortcut (bool, optional): Whether to calculate CONV3 and BN3. Default: False.
use_global_stats (bool, optional): Whether to use global mean and
variance. In inference or test mode, set use_global_stats to true
or is_test to true, and the behavior is equivalent.
In train mode, when setting use_global_stats True, the global mean
and variance are also used during train period. Default: False.
is_test (bool, optional): A flag indicating whether it is in
test phrase or not. Default: False.
filter_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights
of conv2d. If it is set to None or one attribute of ParamAttr, conv2d
will create ParamAttr as param_attr. Default: None.
scale_attr (ParamAttr, optional): The parameter attribute for Parameter `scale`
of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm will create ParamAttr
as param_attr, the name of scale can be set in ParamAttr. If the Initializer of the param_attr is not set,
the parameter is initialized with Xavier. Default: None.
bias_attr (ParamAttr, optional): The parameter attribute for the bias of batch_norm.
If it is set to None or one attribute of ParamAttr, batch_norm
will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr.
If the Initializer of the bias_attr is not set, the bias is initialized zero.
Default: None.
moving_mean_name (str, optional): The name of moving_mean which store the global Mean. If it
is set to None, batch_norm will save global mean with a random name, otherwise, batch_norm
will save global mean with the string. Default: None.
moving_var_name (str, optional): The name of the moving_variance which store the global Variance.
If it is set to None, batch_norm will save global variance with a random name, otherwise, batch_norm
will save global variance with the string. Default: None.
padding (int, optional): The padding size. It is only spupport padding_height = padding_width = padding.
Default: padding = 0.
dilation (int, optional): The dilation size. It means the spacing between the kernel
points. It is only spupport dilation_height = dilation_width = dilation.
Default: dilation = 1.
trainable_statistics (bool, optional): Whether to calculate mean and var in eval mode. In eval mode, when
setting trainable_statistics True, mean and variance will be calculated by current batch statistics.
Default: False.
find_conv_max (bool, optional): Whether to calculate max value of each conv2d. Default: True.
Returns:
A Tensor representing the ResNetBasicBlock, whose data type is the same with input.
Examples:
.. code-block:: python
# required: xpu
import paddle
from paddle.incubate.xpu.resnet_block import ResNetBasicBlock
ch_in = 4
ch_out = 8
x = paddle.uniform((2, ch_in, 16, 16), dtype='float32', min=-1., max=1.)
resnet_basic_block = ResNetBasicBlock(num_channels1=ch_in,
num_filter1=ch_out,
filter1_size=3,
num_channels2=ch_out,
num_filter2=ch_out,
filter2_size=3,
num_channels3=ch_in,
num_filter3=ch_out,
filter3_size=1,
stride1=1,
stride2=1,
stride3=1,
act='relu',
padding1=1,
padding2=1,
padding3=0,
has_shortcut=True)
out = resnet_basic_block.forward(x)
print(out.shape) # [2, 8, 16, 16]
""" """
def __init__(self, def __init__(
num_channels1, self,
num_filter1, num_channels1,
filter1_size, num_filter1,
num_channels2, filter1_size,
num_filter2, num_channels2,
filter2_size, num_filter2,
num_channels3, filter2_size,
num_filter3, num_channels3,
filter3_size, num_filter3,
stride1=1, filter3_size,
stride2=1, stride1=1,
stride3=1, stride2=1,
act='relu', stride3=1,
momentum=0.9, act='relu',
eps=1e-5, momentum=0.9,
data_format='NCHW', eps=1e-5,
has_shortcut=False, data_format='NCHW',
use_global_stats=False, has_shortcut=False,
is_test=False, use_global_stats=False,
filter1_attr=None, is_test=False,
scale1_attr=None, filter1_attr=None,
bias1_attr=None, scale1_attr=None,
moving_mean1_name=None, bias1_attr=None,
moving_var1_name=None, moving_mean1_name=None,
filter2_attr=None, moving_var1_name=None,
scale2_attr=None, filter2_attr=None,
bias2_attr=None, scale2_attr=None,
moving_mean2_name=None, bias2_attr=None,
moving_var2_name=None, moving_mean2_name=None,
filter3_attr=None, moving_var2_name=None,
scale3_attr=None, filter3_attr=None,
bias3_attr=None, scale3_attr=None,
moving_mean3_name=None, bias3_attr=None,
moving_var3_name=None, moving_mean3_name=None,
padding1=0, moving_var3_name=None,
padding2=0, padding1=0,
padding3=0, padding2=0,
dilation1=1, padding3=0,
dilation2=1, dilation1=1,
dilation3=1, dilation2=1,
trainable_statistics=False, dilation3=1,
find_conv_max=True): trainable_statistics=False,
find_conv_max=True,
):
super(ResNetBasicBlock, self).__init__() super(ResNetBasicBlock, self).__init__()
self._stride1 = stride1 self._stride1 = stride1
self._stride2 = stride2 self._stride2 = stride2
self._kernel1_size = utils.convert_to_list(filter1_size, 2, self._kernel1_size = utils.convert_to_list(
'filter1_size') filter1_size, 2, 'filter1_size'
self._kernel2_size = utils.convert_to_list(filter2_size, 2, )
'filter2_size') self._kernel2_size = utils.convert_to_list(
filter2_size, 2, 'filter2_size'
)
self._dilation1 = dilation1 self._dilation1 = dilation1
self._dilation2 = dilation2 self._dilation2 = dilation2
self._padding1 = padding1 self._padding1 = padding1
...@@ -301,8 +511,9 @@ class ResNetBasicBlock(Layer): ...@@ -301,8 +511,9 @@ class ResNetBasicBlock(Layer):
self._find_conv_max = find_conv_max self._find_conv_max = find_conv_max
if has_shortcut: if has_shortcut:
self._kernel3_size = utils.convert_to_list(filter3_size, 2, self._kernel3_size = utils.convert_to_list(
'filter3_size') filter3_size, 2, 'filter3_size'
)
self._padding3 = padding3 self._padding3 = padding3
self._stride3 = stride3 self._stride3 = stride3
self._dilation3 = dilation3 self._dilation3 = dilation3
...@@ -317,11 +528,13 @@ class ResNetBasicBlock(Layer): ...@@ -317,11 +528,13 @@ class ResNetBasicBlock(Layer):
if data_format not in valid_format: if data_format not in valid_format:
raise ValueError( raise ValueError(
"conv_format must be one of {}, but got conv_format={}".format( "conv_format must be one of {}, but got conv_format={}".format(
valid_format, data_format)) valid_format, data_format
)
)
def _get_default_param_initializer(channels, kernel_size): def _get_default_param_initializer(channels, kernel_size):
filter_elem_num = np.prod(kernel_size) * channels filter_elem_num = np.prod(kernel_size) * channels
std = (2.0 / filter_elem_num)**0.5 std = (2.0 / filter_elem_num) ** 0.5
return I.Normal(0.0, std) return I.Normal(0.0, std)
# init filter # init filter
...@@ -335,92 +548,128 @@ class ResNetBasicBlock(Layer): ...@@ -335,92 +548,128 @@ class ResNetBasicBlock(Layer):
shape=filter1_shape, shape=filter1_shape,
attr=filter1_attr, attr=filter1_attr,
default_initializer=_get_default_param_initializer( default_initializer=_get_default_param_initializer(
num_channels1, self._kernel1_size)) num_channels1, self._kernel1_size
),
)
self.scale_1 = self.create_parameter( self.scale_1 = self.create_parameter(
shape=bn1_param_shape, shape=bn1_param_shape,
attr=scale1_attr, attr=scale1_attr,
dtype=bn_param_dtype, dtype=bn_param_dtype,
default_initializer=I.Constant(1.0)) default_initializer=I.Constant(1.0),
self.bias_1 = self.create_parameter(shape=bn1_param_shape, )
attr=bias1_attr, self.bias_1 = self.create_parameter(
dtype=bn_param_dtype, shape=bn1_param_shape,
is_bias=True) attr=bias1_attr,
self.mean_1 = self.create_parameter(attr=ParamAttr( dtype=bn_param_dtype,
name=moving_mean1_name, is_bias=True,
initializer=I.Constant(0.0), )
trainable=False), self.mean_1 = self.create_parameter(
shape=bn1_param_shape, attr=ParamAttr(
dtype=bn_param_dtype) name=moving_mean1_name,
initializer=I.Constant(0.0),
trainable=False,
),
shape=bn1_param_shape,
dtype=bn_param_dtype,
)
self.mean_1.stop_gradient = True self.mean_1.stop_gradient = True
self.var_1 = self.create_parameter( self.var_1 = self.create_parameter(
attr=ParamAttr(name=moving_var1_name, attr=ParamAttr(
initializer=I.Constant(1.0), name=moving_var1_name,
trainable=False), initializer=I.Constant(1.0),
trainable=False,
),
shape=bn1_param_shape, shape=bn1_param_shape,
dtype=bn_param_dtype) dtype=bn_param_dtype,
)
self.var_1.stop_gradient = True self.var_1.stop_gradient = True
self.filter_2 = self.create_parameter( self.filter_2 = self.create_parameter(
shape=filter2_shape, shape=filter2_shape,
attr=filter2_attr, attr=filter2_attr,
default_initializer=_get_default_param_initializer( default_initializer=_get_default_param_initializer(
num_channels2, self._kernel2_size)) num_channels2, self._kernel2_size
),
)
self.scale_2 = self.create_parameter( self.scale_2 = self.create_parameter(
shape=bn2_param_shape, shape=bn2_param_shape,
attr=scale2_attr, attr=scale2_attr,
dtype=bn_param_dtype, dtype=bn_param_dtype,
default_initializer=I.Constant(1.0)) default_initializer=I.Constant(1.0),
self.bias_2 = self.create_parameter(shape=bn2_param_shape, )
attr=bias2_attr, self.bias_2 = self.create_parameter(
dtype=bn_param_dtype, shape=bn2_param_shape,
is_bias=True) attr=bias2_attr,
self.mean_2 = self.create_parameter(attr=ParamAttr( dtype=bn_param_dtype,
name=moving_mean2_name, is_bias=True,
initializer=I.Constant(0.0), )
trainable=False), self.mean_2 = self.create_parameter(
shape=bn2_param_shape, attr=ParamAttr(
dtype=bn_param_dtype) name=moving_mean2_name,
initializer=I.Constant(0.0),
trainable=False,
),
shape=bn2_param_shape,
dtype=bn_param_dtype,
)
self.mean_2.stop_gradient = True self.mean_2.stop_gradient = True
self.var_2 = self.create_parameter( self.var_2 = self.create_parameter(
attr=ParamAttr(name=moving_var2_name, attr=ParamAttr(
initializer=I.Constant(1.0), name=moving_var2_name,
trainable=False), initializer=I.Constant(1.0),
trainable=False,
),
shape=bn2_param_shape, shape=bn2_param_shape,
dtype=bn_param_dtype) dtype=bn_param_dtype,
)
self.var_2.stop_gradient = True self.var_2.stop_gradient = True
if has_shortcut: if has_shortcut:
bn3_param_shape = [1, 1, num_filter3] bn3_param_shape = [1, 1, num_filter3]
filter3_shape = [ filter3_shape = [
num_filter3, num_channels3, filter3_size, filter3_size num_filter3,
num_channels3,
filter3_size,
filter3_size,
] ]
self.filter_3 = self.create_parameter( self.filter_3 = self.create_parameter(
shape=filter3_shape, shape=filter3_shape,
attr=filter3_attr, attr=filter3_attr,
default_initializer=_get_default_param_initializer( default_initializer=_get_default_param_initializer(
num_channels3, self._kernel3_size)) num_channels3, self._kernel3_size
),
)
self.scale_3 = self.create_parameter( self.scale_3 = self.create_parameter(
shape=bn3_param_shape, shape=bn3_param_shape,
attr=scale3_attr, attr=scale3_attr,
dtype=bn_param_dtype, dtype=bn_param_dtype,
default_initializer=I.Constant(1.0)) default_initializer=I.Constant(1.0),
self.bias_3 = self.create_parameter(shape=bn3_param_shape, )
attr=bias3_attr, self.bias_3 = self.create_parameter(
dtype=bn_param_dtype, shape=bn3_param_shape,
is_bias=True) attr=bias3_attr,
self.mean_3 = self.create_parameter(attr=ParamAttr( dtype=bn_param_dtype,
name=moving_mean3_name, is_bias=True,
initializer=I.Constant(0.0), )
trainable=False), self.mean_3 = self.create_parameter(
shape=bn3_param_shape, attr=ParamAttr(
dtype=bn_param_dtype) name=moving_mean3_name,
initializer=I.Constant(0.0),
trainable=False,
),
shape=bn3_param_shape,
dtype=bn_param_dtype,
)
self.mean_3.stop_gradient = True self.mean_3.stop_gradient = True
self.var_3 = self.create_parameter(attr=ParamAttr( self.var_3 = self.create_parameter(
name=moving_var3_name, attr=ParamAttr(
initializer=I.Constant(1.0), name=moving_var3_name,
trainable=False), initializer=I.Constant(1.0),
shape=bn3_param_shape, trainable=False,
dtype=bn_param_dtype) ),
shape=bn3_param_shape,
dtype=bn_param_dtype,
)
self.var_3.stop_gradient = True self.var_3.stop_gradient = True
else: else:
self.filter_3 = None self.filter_3 = None
...@@ -464,5 +713,6 @@ class ResNetBasicBlock(Layer): ...@@ -464,5 +713,6 @@ class ResNetBasicBlock(Layer):
use_global_stats=self._use_global_stats, use_global_stats=self._use_global_stats,
training=self.training, training=self.training,
trainable_statistics=self._trainable_statistics, trainable_statistics=self._trainable_statistics,
find_conv_max=self._find_conv_max) find_conv_max=self._find_conv_max,
)
return out return out
...@@ -715,6 +715,7 @@ def upsample( ...@@ -715,6 +715,7 @@ def upsample(
name=None, name=None,
): ):
""" """
This API resizes a batch of images. This API resizes a batch of images.
The input must be a 3-D Tensor of the shape (num_batches, channels, in_w) The input must be a 3-D Tensor of the shape (num_batches, channels, in_w)
...@@ -725,11 +726,12 @@ def upsample( ...@@ -725,11 +726,12 @@ def upsample(
and the resizing only applies on the three dimensions(depth, height and width). and the resizing only applies on the three dimensions(depth, height and width).
Supporting resample methods: Supporting resample methods:
'linear' : Linear interpolation - 'linear' : Linear interpolation
'bilinear' : Bilinear interpolation - 'bilinear' : Bilinear interpolation
'trilinear' : Trilinear interpolation - 'trilinear' : Trilinear interpolation
'nearest' : Nearest neighbor interpolation - 'nearest' : Nearest neighbor interpolation
'bicubic' : Bicubic interpolation - 'bicubic' : Bicubic interpolation
Linear interpolation is the method of using a line connecting two known quantities Linear interpolation is the method of using a line connecting two known quantities
to determine the value of an unknown quantity between the two known quantities. to determine the value of an unknown quantity between the two known quantities.
...@@ -762,77 +764,78 @@ def upsample( ...@@ -762,77 +764,78 @@ def upsample(
`paddle.nn.functional.adaptive_avg_pool2d` or `paddle.nn.functional.adaptive_avg_pool3d`. `paddle.nn.functional.adaptive_avg_pool2d` or `paddle.nn.functional.adaptive_avg_pool3d`.
Example: Example:
.. code-block:: text .. code-block:: text
For scale_factor: For scale_factor:
if align_corners = True && out_size > 1 : if align_corners = True && out_size > 1 :
scale_factor = (in_size-1.0)/(out_size-1.0) scale_factor = (in_size-1.0)/(out_size-1.0)
else:
scale_factor = float(in_size/out_size)
Linear interpolation:
if:
align_corners = False , align_mode = 0
input : (N,C,W_in)
output: (N,C,W_out) where:
W_out = (W_{in}+0.5) * scale_{factor} - 0.5
else:
input : (N,C,W_in)
output: (N,C,W_out) where:
W_out = W_{in} * scale_{factor}
Nearest neighbor interpolation:
if:
align_corners = False
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
H_out = floor (H_{in} * scale_{factor})
W_out = floor (W_{in} * scale_{factor})
else: else:
scale_factor = float(in_size/out_size) align_corners = True
Linear interpolation: input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
H_out = round(H_{in} * scale_{factor})
W_out = round(W_{in} * scale_{factor})
Bilinear interpolation:
if: if:
align_corners = False , align_mode = 0 align_corners = False , align_mode = 0
input : (N,C,W_in) input : (N,C,H_in,W_in)
output: (N,C,W_out) where: output: (N,C,H_out,W_out) where:
H_out = (H_{in}+0.5) * scale_{factor} - 0.5
W_out = (W_{in}+0.5) * scale_{factor} - 0.5 W_out = (W_{in}+0.5) * scale_{factor} - 0.5
else: else:
input : (N,C,W_in) input : (N,C,H_in,W_in)
output: (N,C,W_out) where: output: (N,C,H_out,W_out) where:
H_out = H_{in} * scale_{factor}
W_out = W_{in} * scale_{factor}
Bicubic interpolation:
if:
align_corners = False
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
H_out = (H_{in}+0.5) * scale_{factor} - 0.5
W_out = (W_{in}+0.5) * scale_{factor} - 0.5
else:
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
H_out = H_{in} * scale_{factor}
W_out = W_{in} * scale_{factor}
Trilinear interpolation:
if:
align_corners = False , align_mode = 0
input : (N,C,D_in,H_in,W_in)
output: (N,C,D_out,H_out,W_out) where:
D_out = (D_{in}+0.5) * scale_{factor} - 0.5
H_out = (H_{in}+0.5) * scale_{factor} - 0.5
W_out = (W_{in}+0.5) * scale_{factor} - 0.5
else:
input : (N,C,D_in,H_in,W_in)
output: (N,C,D_out,H_out,W_out) where:
D_out = D_{in} * scale_{factor}
H_out = H_{in} * scale_{factor}
W_out = W_{in} * scale_{factor} W_out = W_{in} * scale_{factor}
Nearest neighbor interpolation:
if:
align_corners = False
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
H_out = floor (H_{in} * scale_{factor})
W_out = floor (W_{in} * scale_{factor})
else:
align_corners = True
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
H_out = round(H_{in} * scale_{factor})
W_out = round(W_{in} * scale_{factor})
Bilinear interpolation:
if:
align_corners = False , align_mode = 0
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
H_out = (H_{in}+0.5) * scale_{factor} - 0.5
W_out = (W_{in}+0.5) * scale_{factor} - 0.5
else:
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
H_out = H_{in} * scale_{factor}
W_out = W_{in} * scale_{factor}
Bicubic interpolation:
if:
align_corners = False
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
H_out = (H_{in}+0.5) * scale_{factor} - 0.5
W_out = (W_{in}+0.5) * scale_{factor} - 0.5
else:
input : (N,C,H_in,W_in)
output: (N,C,H_out,W_out) where:
H_out = H_{in} * scale_{factor}
W_out = W_{in} * scale_{factor}
Trilinear interpolation:
if:
align_corners = False , align_mode = 0
input : (N,C,D_in,H_in,W_in)
output: (N,C,D_out,H_out,W_out) where:
D_out = (D_{in}+0.5) * scale_{factor} - 0.5
H_out = (H_{in}+0.5) * scale_{factor} - 0.5
W_out = (W_{in}+0.5) * scale_{factor} - 0.5
else:
input : (N,C,D_in,H_in,W_in)
output: (N,C,D_out,H_out,W_out) where:
D_out = D_{in} * scale_{factor}
H_out = H_{in} * scale_{factor}
W_out = W_{in} * scale_{factor}
https://en.wikipedia.org/wiki/Linear_interpolation.
For details of linear interpolation, please refer to Wikipedia: For details of linear interpolation, please refer to Wikipedia:
https://en.wikipedia.org/wiki/Linear_interpolation.
For details of nearest neighbor interpolation, please refer to Wikipedia: For details of nearest neighbor interpolation, please refer to Wikipedia:
https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation. https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation.
...@@ -876,23 +879,24 @@ def upsample( ...@@ -876,23 +879,24 @@ def upsample(
name(str, optional): The default value is None. name(str, optional): The default value is None.
Normally there is no need for user to set this property. Normally there is no need for user to set this property.
For more information, please refer to :ref:`api_guide_Name` For more information, please refer to :ref:`api_guide_Name`
Returns: Returns:
A 3-D Tensor of the shape (num_batches, channels, out_w) or (num_batches, out_w, channels), A 3-D Tensor of the shape (num_batches, channels, out_w) or (num_batches, out_w, channels),
A 4-D Tensor of the shape (num_batches, channels, out_h, out_w) or (num_batches, out_h, out_w, channels), A 4-D Tensor of the shape (num_batches, channels, out_h, out_w) or (num_batches, out_h, out_w, channels),
or 5-D Tensor of the shape (num_batches, channels, out_d, out_h, out_w) or (num_batches, out_d, out_h, out_w, channels). or 5-D Tensor of the shape (num_batches, channels, out_d, out_h, out_w) or (num_batches, out_d, out_h, out_w, channels).
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
import paddle.nn as nn import paddle.nn as nn
input_data = paddle.randn(shape=(2,3,6,10)).astype(paddle.float32) input_data = paddle.randn(shape=(2,3,6,10)).astype(paddle.float32)
upsample_out = paddle.nn.Upsample(size=[12,12]) upsample_out = paddle.nn.Upsample(size=[12,12])
output = upsample_out(x=input_data) output = upsample_out(x=input_data)
print(output.shape) print(output.shape)
# [2L, 3L, 12L, 12L] # [2L, 3L, 12L, 12L]
""" """
return interpolate( return interpolate(
......
...@@ -23,6 +23,7 @@ __all__ = [] ...@@ -23,6 +23,7 @@ __all__ = []
def pairwise_distance(x, y, p=2., epsilon=1e-6, keepdim=False, name=None): def pairwise_distance(x, y, p=2., epsilon=1e-6, keepdim=False, name=None):
r""" r"""
It computes the pairwise distance between two vectors. The It computes the pairwise distance between two vectors. The
distance is calculated by p-oreder norm: distance is calculated by p-oreder norm:
...@@ -48,10 +49,11 @@ def pairwise_distance(x, y, p=2., epsilon=1e-6, keepdim=False, name=None): ...@@ -48,10 +49,11 @@ def pairwise_distance(x, y, p=2., epsilon=1e-6, keepdim=False, name=None):
Returns: Returns:
Tensor, the dtype is same as input tensor. Tensor, the dtype is same as input tensor.
- If :attr:`keepdim` is True, the output shape is :math:`[N, 1]` or :math:`[1]`, - If :attr:`keepdim` is True, the output shape is :math:`[N, 1]` or :math:`[1]`,
depending on whether the input has data shaped as :math:`[N, D]`. depending on whether the input has data shaped as :math:`[N, D]`.
- If :attr:`keepdim` is False, the output shape is :math:`[N]` or :math:`[]`, - If :attr:`keepdim` is False, the output shape is :math:`[N]` or :math:`[]`,
depending on whether the input has data shaped as :math:`[N, D]`. depending on whether the input has data shaped as :math:`[N, D]`.
Examples: Examples:
.. code-block:: python .. code-block:: python
......
...@@ -1310,6 +1310,7 @@ def margin_ranking_loss( ...@@ -1310,6 +1310,7 @@ def margin_ranking_loss(
def l1_loss(input, label, reduction='mean', name=None): def l1_loss(input, label, reduction='mean', name=None):
r""" r"""
Computes the L1 Loss of Tensor ``input`` and ``label`` as follows. Computes the L1 Loss of Tensor ``input`` and ``label`` as follows.
If `reduction` set to ``'none'``, the loss is: If `reduction` set to ``'none'``, the loss is:
...@@ -1341,7 +1342,7 @@ def l1_loss(input, label, reduction='mean', name=None): ...@@ -1341,7 +1342,7 @@ def l1_loss(input, label, reduction='mean', name=None):
Returns: Returns:
Tensor, the L1 Loss of Tensor ``input`` and ``label``. Tensor, the L1 Loss of Tensor ``input`` and ``label``.
If `reduction` is ``'none'``, the shape of output loss is [N, *], the same as ``input`` . If `reduction` is ``'none'``, the shape of output loss is :math:`[N, *]`, the same as ``input`` .
If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1]. If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1].
Examples: Examples:
...@@ -1364,6 +1365,7 @@ def l1_loss(input, label, reduction='mean', name=None): ...@@ -1364,6 +1365,7 @@ def l1_loss(input, label, reduction='mean', name=None):
l1_loss = paddle.nn.functional.l1_loss(input, label, reduction='sum') l1_loss = paddle.nn.functional.l1_loss(input, label, reduction='sum')
print(l1_loss.numpy()) print(l1_loss.numpy())
# [1.4] # [1.4]
""" """
if reduction not in ['sum', 'mean', 'none']: if reduction not in ['sum', 'mean', 'none']:
raise ValueError( raise ValueError(
...@@ -2286,6 +2288,7 @@ def cross_entropy( ...@@ -2286,6 +2288,7 @@ def cross_entropy(
name=None, name=None,
): ):
r""" r"""
By default, this operator implements the cross entropy loss function with softmax. This function By default, this operator implements the cross entropy loss function with softmax. This function
combines the calculation of the softmax operation and the cross entropy loss function combines the calculation of the softmax operation and the cross entropy loss function
to provide a more numerically stable computing. to provide a more numerically stable computing.
...@@ -2399,21 +2402,13 @@ def cross_entropy( ...@@ -2399,21 +2402,13 @@ def cross_entropy(
Parameters: Parameters:
input (Tensor): the data type is float32, float64. Shape is :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes, ``k >= 1`` .
- **input** (Tensor)
Input tensor, the data type is float32, float64. Shape is
:math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes , ``k >= 1`` .
Note: Note:
1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the output of softmax operator, which will produce incorrect results.
1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the
output of softmax operator, which will produce incorrect results.
2. when use_softmax=False, it expects the output of softmax operator. 2. when use_softmax=False, it expects the output of softmax operator.
- **label** (Tensor) label (Tensor):
1. If soft_label=False, the shape is 1. If soft_label=False, the shape is
:math:`[N_1, N_2, ..., N_k]` or :math:`[N_1, N_2, ..., N_k, 1]`, k >= 1. :math:`[N_1, N_2, ..., N_k]` or :math:`[N_1, N_2, ..., N_k, 1]`, k >= 1.
the data type is int32, int64, float32, float64, where each value is [0, C-1]. the data type is int32, int64, float32, float64, where each value is [0, C-1].
...@@ -2421,48 +2416,27 @@ def cross_entropy( ...@@ -2421,48 +2416,27 @@ def cross_entropy(
2. If soft_label=True, the shape and data type should be same with ``input`` , 2. If soft_label=True, the shape and data type should be same with ``input`` ,
and the sum of the labels for each sample should be 1. and the sum of the labels for each sample should be 1.
- **weight** (Tensor, optional) weight (Tensor, optional): a manual rescaling weight given to each class.
a manual rescaling weight given to each class.
If given, has to be a Tensor of size C and the data type is float32, float64. If given, has to be a Tensor of size C and the data type is float32, float64.
Default is ``'None'`` . Default is ``'None'`` .
ignore_index (int64, optional): Specifies a target value that is ignored
- **ignore_index** (int64, optional)
Specifies a target value that is ignored
and does not contribute to the loss. A negative value means that no label and does not contribute to the loss. A negative value means that no label
value needs to be ignored. Only valid when soft_label = False. value needs to be ignored. Only valid when soft_label = False.
Default is ``-100`` . Default is ``-100`` .
reduction (str, optional): Indicate how to average the loss by batch_size,
- **reduction** (str, optional)
Indicate how to average the loss by batch_size,
the candidates are ``'none'`` | ``'mean'`` | ``'sum'``. the candidates are ``'none'`` | ``'mean'`` | ``'sum'``.
If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned;
If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned. If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned.
If :attr:`reduction` is ``'none'``, the unreduced loss is returned. If :attr:`reduction` is ``'none'``, the unreduced loss is returned.
Default is ``'mean'``. Default is ``'mean'``.
soft_label (bool, optional): Indicate whether label is soft. Default is ``False``.
- **soft_label** (bool, optional) axis (int, optional): The index of dimension to perform softmax calculations.
Indicate whether label is soft.
Default is ``False``.
- **axis** (int, optional)
The index of dimension to perform softmax calculations.
It should be in range :math:`[-1, rank - 1]`, where :math:`rank` is the It should be in range :math:`[-1, rank - 1]`, where :math:`rank` is the
number of dimensions of input :attr:`input`. number of dimensions of input :attr:`input`.
Default is ``-1`` . Default is ``-1`` .
use_softmax (bool, optional): Indicate whether compute softmax before cross_entropy.
- **use_softmax** (bool, optional)
Indicate whether compute softmax before cross_entropy.
Default is ``True``. Default is ``True``.
name (str, optional): The name of the operator. Default is ``None`` .
- **name** (str, optional)
The name of the operator. Default is ``None`` .
For more information, please refer to :ref:`api_guide_Name` . For more information, please refer to :ref:`api_guide_Name` .
Returns: Returns:
...@@ -2478,9 +2452,7 @@ def cross_entropy( ...@@ -2478,9 +2452,7 @@ def cross_entropy(
2. if soft_label = True, the dimension of return value is :math:`[N_1, N_2, ..., N_k, 1]` . 2. if soft_label = True, the dimension of return value is :math:`[N_1, N_2, ..., N_k, 1]` .
Examples: Examples:
.. code-block:: python .. code-block:: python
# hard labels # hard labels
...@@ -3834,6 +3806,7 @@ def triplet_margin_loss( ...@@ -3834,6 +3806,7 @@ def triplet_margin_loss(
def soft_margin_loss(input, label, reduction='mean', name=None): def soft_margin_loss(input, label, reduction='mean', name=None):
""" """
The API measures the soft margin loss between input predictions ``input`` The API measures the soft margin loss between input predictions ``input``
and target labels ``label`` . It can be described as: and target labels ``label`` . It can be described as:
...@@ -3842,9 +3815,9 @@ def soft_margin_loss(input, label, reduction='mean', name=None): ...@@ -3842,9 +3815,9 @@ def soft_margin_loss(input, label, reduction='mean', name=None):
Parameters: Parameters:
input (Tensor): The input predictions tensor with shape: [N, *], input (Tensor): The input predictions tensor with shape: ``[N, *]``,
N is batch_size, `*` means any number of additional dimensions. The ``input`` ranges from -inf to inf. N is batch_size, `*` means any number of additional dimensions. The ``input`` ranges from -inf to inf.
Available dtype is float32, float64. Available dtype is float32, float64.
label (Tensor): The target labels tensor with the same shape as label (Tensor): The target labels tensor with the same shape as
``input``. The target labels which values should be numbers -1 or 1. ``input``. The target labels which values should be numbers -1 or 1.
...@@ -3862,8 +3835,7 @@ def soft_margin_loss(input, label, reduction='mean', name=None): ...@@ -3862,8 +3835,7 @@ def soft_margin_loss(input, label, reduction='mean', name=None):
Returns: Returns:
Output (Tensor): If ``reduction`` is ``'none'``, the shape of output is Output (Tensor): If ``reduction`` is ``'none'``, the shape of output is same as ``input`` , else the shape of output is [1].
same as ``input`` , else the shape of output is [1].
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -3889,6 +3861,7 @@ def soft_margin_loss(input, label, reduction='mean', name=None): ...@@ -3889,6 +3861,7 @@ def soft_margin_loss(input, label, reduction='mean', name=None):
# [0.84367639, 0.74795729, 0.44629076, 0.55123353, 0.77659678], # [0.84367639, 0.74795729, 0.44629076, 0.55123353, 0.77659678],
# [0.39465919, 0.76651484, 0.54485321, 0.76609844, 0.77166790], # [0.39465919, 0.76651484, 0.54485321, 0.76609844, 0.77166790],
# [0.51283568, 0.84757161, 0.78913331, 1.05268764, 0.45318675]]) # [0.51283568, 0.84757161, 0.78913331, 1.05268764, 0.45318675]])
""" """
if reduction not in ['sum', 'mean', 'none']: if reduction not in ['sum', 'mean', 'none']:
raise ValueError( raise ValueError(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment