OpenDAS / Paddle · Commits

Commit dbe08e9b, authored Jun 12, 2023 by yuguo960516yuguo
Commit message: 2.4.2
Parent: b5499578

Changes: 302 changed files in the commit; this page shows 20 changed files with 779 additions and 232 deletions (+779 -232). The diffs below are reproduced from the inline view, so where a line was modified its previous version appears immediately above its replacement, and newly added code has no earlier counterpart.
paddle/phi/kernels/funcs/fc_functor.cu                      +4    -7
paddle/phi/kernels/funcs/norm_utils.h                       +4    -0
paddle/phi/kernels/funcs/sparse/utils.cu.h                  +13   -0
paddle/phi/kernels/gpu/add_n_kernel.cu                      +134  -30
paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu            +32   -13
paddle/phi/kernels/gpu/batch_norm_kernel.cu                 +74   -18
paddle/phi/kernels/gpu/multiplex_kernel.cu                  +1    -1
paddle/phi/kernels/impl/add_n_kernel_impl.h                 +80   -0
paddle/phi/kernels/impl/fold_grad_kernel_impl.h             +2    -5
paddle/phi/kernels/impl/fold_kernel_impl.h                  +3    -5
paddle/phi/kernels/impl/solve_kernel_impl.h                 +1    -1
paddle/phi/kernels/impl/squeeze_kernel_impl.h               +1    -5
paddle/phi/kernels/selected_rows/add_n_kernel.h             +10   -20
paddle/phi/kernels/selected_rows/cpu/add_n_kernel.cc        +25   -0
paddle/phi/kernels/selected_rows/gpu/add_n_kernel.cu        +26   -0
paddle/phi/kernels/selected_rows/impl/add_n_kernel_impl.h   +90   -0
paddle/phi/kernels/sparse/batch_norm_kernel.h               +19   -19
paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu            +2    -3
paddle/phi/kernels/sparse/gpu/conv.cu.h                     +137  -31
paddle/phi/kernels/sparse/gpu/conv_kernel.cu                +121  -74
paddle/phi/kernels/funcs/fc_functor.cu

@@ -149,7 +149,7 @@ __global__ void bias_relu_v2(const int num,
#if __CUDA_ARCH__ >= 800
      packed_val = __hmax2(__half2(0, 0), packed_val);
#elif __CUDA_ARCH__ >= 530
      packed_val = __hmul2(__hgt2(__half2(0, 0), packed_val), packed_val);
      packed_val = __hmul2(__hgt2(packed_val, __half2(0, 0)), packed_val);
#else
      packed_val.x = static_cast<int>(static_cast<float>(packed_val.x) > 0) *
                     static_cast<float>(packed_val.x);

@@ -292,19 +292,16 @@ void FCFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
          errors::PermissionDenied(
              "Weight padding in fc can not be used in GPU scope."));
  auto blas = phi::funcs::GetBlas<DeviceContext, T>(context);
  blas.GEMM(false,
            false,
  blas.GEMM(CblasNoTrans,
            CblasNoTrans,
            M,
            N,
            K,
            static_cast<T>(1.0),
            X,
            K,
            W,
            N,
            static_cast<T>(0.0),
            Y,
            N);
            Y);
  if (B == NULL) {
    return;
  }
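The GEMM call above computes the fully connected forward pass Y = X * W with row-major shapes X[M x K], W[K x N] and Y[M x N]; the arguments dropped in the new overload were the leading dimensions, which for contiguous row-major buffers are simply K, N and N. The following is an illustration only, using plain CBLAS rather than phi's Blas wrapper, to make those shapes explicit:

// Illustrative reference, not the Paddle wrapper: row-major Y = X * W.
#include <cblas.h>

void fc_forward_reference(const float* X, const float* W, float* Y,
                          int M, int N, int K) {
  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
              M, N, K,
              1.0f, X, /*lda=*/K,
              W, /*ldb=*/N,
              0.0f, Y, /*ldc=*/N);
}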
paddle/phi/kernels/funcs/norm_utils.h

@@ -18,6 +18,10 @@ limitations under the License. */
namespace phi {
namespace funcs {

#define CUDNN_PER_ACTIVATION_THRESHOLD 10240
#define CUDNN_SPATIAL_THRESHOLD_TRAIN 880801
#define CUDNN_SPATIAL_THRESHOLD_EVAL 65535

inline void ExtractNCWHD(const phi::DDim &dims,
                         const DataLayout &data_layout,
                         int *N,
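These macros centralize the batch-size cut-offs that the batch-norm kernels further down use to decide between the hand-written native kernels and the cuDNN path; previously the values 10240 and 880801 were local constants inside each kernel. A simplified sketch of that selection logic is shown below; the exact conditions in the training, inference and NHWC-backward paths of this commit differ slightly, so treat this as an assumption-laden illustration, not the shipped predicate:

// Simplified sketch; x_dims and N follow the kernels below, not a public API.
bool UseNativeBatchNormKernel(const std::vector<int64_t>& x_dims, int64_t N,
                              bool is_training) {
  const int64_t spatial_threshold =
      is_training ? CUDNN_SPATIAL_THRESHOLD_TRAIN : CUDNN_SPATIAL_THRESHOLD_EVAL;
  // 2-D input -> per-activation mode, 3-D input -> spatial mode.
  return (x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD) ||
         (x_dims.size() == 3 && N >= spatial_threshold);
}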
paddle/phi/kernels/funcs/sparse/utils.cu.h

@@ -26,6 +26,19 @@ __global__ void DistanceKernel(const T* start, const T* end, T* distance) {
  }
}

inline __device__ bool SetBits(const int value, int* ptr) {
  const int index = value >> 5;
  const int mask = 1 << (value & 31);
  const int old = atomicOr(ptr + index, mask);
  return (mask & old) != 0;
}

inline __device__ bool TestBits(const int value, const int* ptr) {
  const int index = value >> 5;
  const int mask = 1 << (value & 31);
  return (mask & ptr[index]) != 0;
}

}  // namespace sparse
}  // namespace funcs
}  // namespace phi
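SetBits and TestBits pack a boolean-per-index table into 32x less memory: value >> 5 picks the 32-bit word and value & 31 the bit inside it, and because atomicOr returns the previous word, SetBits doubles as an atomic "was this index already seen?" test. A minimal, self-contained usage sketch follows; the kernel and its parameter names are hypothetical and not part of the commit:

// Hypothetical usage sketch: mark each incoming index in a bitmask and count
// how many distinct indices were seen. flags must hold (max_index + 31) / 32
// ints and be zero-initialized before launch.
__global__ void MarkDistinct(const int* indices, int n,
                             int* flags, int* num_distinct) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    // SetBits returns true if the bit was already set by another thread.
    const bool already_seen = phi::funcs::sparse::SetBits(indices[i], flags);
    if (!already_seen) {
      atomicAdd(num_distinct, 1);
    }
  }
}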
paddle/phi/kernels/gpu/add_n_kernel.cu

@@ -14,16 +14,27 @@
#include "paddle/phi/kernels/add_n_kernel.h"
#include "paddle/phi/kernels/impl/add_n_kernel_impl.h"

#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/math_function.h"

namespace phi {

#define CEIL_DIV(x, y) (((x) + (y)-1) / (y))

template <class T>
__global__ void Sum2CUDAKernel(const T *in_0,
                               const T *in_1,
                               T *out,
                               int64_t N) {
  int id = blockIdx.x * blockDim.x + threadIdx.x;
  while (id < N) {
    out[id] = in_0[id] + in_1[id];
    id += blockDim.x * gridDim.x;
  }
}

template <class T>
__global__ void SumArrayCUDAKernel(
    T **in, T *out, int64_t N, size_t in_size, bool read_dst) {

@@ -41,9 +52,26 @@ __global__ void SumArrayCUDAKernel(
  }
}

template <class T>
__global__ void SumSelectedRowsCUDAKernel(T **sr_in_out,
                                          int64_t N,
                                          size_t rows) {
  int id = blockIdx.x * blockDim.x + threadIdx.x;
  while (id < N) {
    for (int i = 0; i < 2 * rows; i += 2) {
      const T *tmp = sr_in_out[i];
      T *tmp_out = sr_in_out[i + 1];
      if (tmp && tmp_out) {
        tmp_out[id] += tmp[id];
      }
    }
    id += blockDim.x * gridDim.x;
  }
}

template <typename T, typename Context>
void AddNKernel(const Context &dev_ctx,
                const std::vector<const DenseTensor *> &x,
                const std::vector<const TensorBase *> &x,
                DenseTensor *out) {
  const size_t in_num = x.size();

@@ -66,36 +94,38 @@ void AddNKernel(const Context &dev_ctx,
    grids = dim3(CEIL_DIV(length, tile_size), 1, 1);
    blocks = dim3(tile_size, 1, 1);
  };

  auto *out_ptr = dev_ctx.template Alloc<T>(out);
  bool in_place = false;
  if (x.size() > 0 && x[0]->initialized() && DenseTensor::classof(x[0])) {
    if ((static_cast<const DenseTensor *>(x[0]))->data() == out->data()) {
      in_place = true;
    }
  }
  bool in_place = x[0] == out;
  if (!in_place) {
    auto *out_ptr = dev_ctx.template Alloc<T>(out);
  if (in_num >= 1) {
    auto &in_0_tensor = *x[0];
    if (in_0_tensor.numel() > 0) {
      in_place = (in_0_tensor.data<T>() == out_ptr);
    }
  if (!in_place && in_num >= 1 && DenseTensor::classof(x[0])) {
    auto &in_0_tensor = *(static_cast<const DenseTensor *>(x[0]));
    if (in_0_tensor.numel() > 0) {
      in_place = (in_0_tensor.data<T>() == out_ptr);
    }
  }

  // Sum of two tensors
  if (in_num == 2) {
    auto &in_0 = *x[0];
    auto &in_1 = *x[1];
  if (in_num == 2 && DenseTensor::classof(x[0]) && DenseTensor::classof(x[1])) {
    auto &in_0 = *(static_cast<const DenseTensor *>(x[0]));
    auto &in_1 = *(static_cast<const DenseTensor *>(x[1]));
    int64_t length_0 = in_0.numel();
    int64_t length_1 = in_1.numel();
    if (length_0 && length_1 && in_0.initialized() && in_1.initialized()) {
    if (length_0 && length_1 && in_0.IsInitialized() && in_1.IsInitialized()) {
      auto result = EigenVector<T>::Flatten(*out);
      auto &place = *dev_ctx.eigen_device();
      auto in_0_e = EigenVector<T>::Flatten(in_0);
      auto in_1_e = EigenVector<T>::Flatten(in_1);
      result.device(place) = in_0_e + in_1_e;
    } else if (length_0 && in_0.initialized()) {
    } else if (length_0 && in_0.IsInitialized()) {
      auto result = EigenVector<T>::Flatten(*out);
      auto &place = *dev_ctx.eigen_device();
      result.device(place) = EigenVector<T>::Flatten(in_0);
    } else if (length_1 && in_1.initialized()) {
    } else if (length_1 && in_1.IsInitialized()) {
      auto result = EigenVector<T>::Flatten(*out);
      auto &place = *dev_ctx.eigen_device();
      result.device(place) = EigenVector<T>::Flatten(in_1);

@@ -105,27 +135,90 @@ void AddNKernel(const Context &dev_ctx,
  int start = in_place ? 1 : 0;
  if (!in_place) {
    funcs::SetConstant<Context, T> constant_functor;
    phi::funcs::SetConstant<phi::GPUContext, T> constant_functor;
    constant_functor(dev_ctx, out, static_cast<T>(0));
  }

  std::vector<const T *> in_data;
  std::vector<int> selectrow_index;
  int64_t lod_length = 0;
  bool dst_write = false;
  for (int i = start; i < in_num; ++i) {
    auto &in_i = *x[i];
    lod_length = in_i.numel();
    if (lod_length && in_i.initialized()) {
      in_data.emplace_back(in_i.data<T>());
    if (DenseTensor::classof(x[i])) {
      auto &in_i = *(static_cast<const DenseTensor *>(x[i]));
      lod_length = in_i.numel();
      if (lod_length && in_i.IsInitialized()) {
        in_data.emplace_back(in_i.data<T>());
      }
    } else if (SelectedRows::classof(x[i])) {
      selectrow_index.push_back(i);
    }
  }

  // compute select rows separately.
  if (!selectrow_index.empty()) {
    std::vector<const T *> sr_in_out_data;
    size_t rows = 0;
    int64_t length = 0;
    for (auto index : selectrow_index) {
      auto &sr = *(static_cast<const SelectedRows *>(x[index]));
      auto &sr_value = sr.value();
      auto &sr_rows = sr.rows();

      auto row_numel = sr_value.numel() / sr_rows.size();
      auto out_dims = out->dims();

      PADDLE_ENFORCE_EQ(sr.height(),
                        out_dims[0],
                        errors::InvalidArgument(
                            "The table height of input must be same as output, "
                            "but received input height is %d"
                            ", output height is %d",
                            sr.height(),
                            out_dims[0]));
      PADDLE_ENFORCE_EQ(row_numel,
                        out->numel() / sr.height(),
                        errors::InvalidArgument(
                            "The table width of input must be same as output, "
                            "but received input width is %d"
                            ", output width is %d",
                            row_numel,
                            out->numel() / sr.height()));

      auto *sr_data = sr_value.data<T>();
      auto *sr_out_data = out->data<T>();
      rows += sr_rows.size();
      length = row_numel;

      for (size_t i = 0; i < sr_rows.size(); ++i) {
        sr_in_out_data.emplace_back(&sr_data[i * row_numel]);
        sr_in_out_data.emplace_back(&sr_out_data[sr_rows[i] * row_numel]);
      }
    }
    if (!sr_in_out_data.empty()) {
      auto tmp_sr_in_out_array = paddle::memory::Alloc(
          dev_ctx.GetPlace(), sr_in_out_data.size() * sizeof(T *));

      paddle::memory::Copy(dev_ctx.GetPlace(),
                           tmp_sr_in_out_array->ptr(),
                           phi::CPUPlace(),
                           reinterpret_cast<void *>(sr_in_out_data.data()),
                           sr_in_out_data.size() * sizeof(T *),
                           dev_ctx.stream());

      T **sr_in_out_array_data =
          reinterpret_cast<T **>(tmp_sr_in_out_array->ptr());

      ComputeKernelParameter(length);
      SumSelectedRowsCUDAKernel<T>
          <<<grids, blocks, 0, stream>>>(sr_in_out_array_data, length, rows);
      dst_write = true;
    }
  }

  // if indata not null, merge into one kernel call.
  if (!in_data.empty()) {
    auto tmp_in_array = paddle::memory::Alloc(
        dev_ctx.GetPlace(),
        in_data.size() * sizeof(T *),
        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
    auto tmp_in_array = paddle::memory::Alloc(dev_ctx.GetPlace(),
                                              in_data.size() * sizeof(T *));

    paddle::memory::Copy(dev_ctx.GetPlace(),
                         tmp_in_array->ptr(),

@@ -153,6 +246,17 @@ PD_REGISTER_KERNEL(add_n,
                   float,
                   double,
                   int,
                   int64_t,
                   phi::dtype::bfloat16,
                   phi::dtype::float16) {}
                   phi::dtype::float16,
                   int64_t) {}

PD_REGISTER_KERNEL(add_n_array,
                   GPU,
                   ALL_LAYOUT,
                   phi::AddNArrayKernel,
                   float,
                   double,
                   int,
                   phi::dtype::bfloat16,
                   phi::dtype::float16,
                   int64_t) {}
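The summation kernels in this file all follow the same grid-stride pattern: each thread starts at its global index and advances by the total number of launched threads, so one launch covers tensors of any length. A minimal standalone sketch of that pattern is below; the kernel name and parameters are illustrative, not taken from the commit:

// Standalone grid-stride sum, illustrating the pattern used by
// Sum2CUDAKernel and SumArrayCUDAKernel above.
template <class T>
__global__ void SumPair(const T* a, const T* b, T* out, int64_t n) {
  // Start at the global thread index and stride by the whole grid.
  for (int64_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += static_cast<int64_t>(blockDim.x) * gridDim.x) {
    out[i] = a[i] + b[i];
  }
}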
paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu

@@ -852,15 +852,18 @@ void BatchNormGradRawKernel(const Context &ctx,
    //     ctx.GetPlace()),
    //     epsilon, saved_mean_data, saved_var_data));
#else
    // CUDNN only support small batch size
    // const size_t CUDNN_PER_ACTIVATION_THRESHOLD = 131070;
    const size_t CUDNN_PER_ACTIVATION_THRESHOLD = 10240;
    const size_t CUDNN_SPATIAL_THRESHOLD = 880801;
    const bool use_native_kernel =
        ((x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD) ||
         (x_dims.size() == 3 && N >= CUDNN_SPATIAL_THRESHOLD));
    if (use_native_kernel) {
      if (x_dims.size() == 2) {
    }
    // CUDNN only support small batch size
    bool use_native_nhwc =
        d_x ? (x_dims.size() == 4 && compute_format == DataLayout::kNHWC &&
               H * W >= CUDNN_SPATIAL_THRESHOLD_EVAL)
            : false;
    const bool use_native_kernel =
        ((x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD) ||
         (x_dims.size() == 3 && N >= CUDNN_SPATIAL_THRESHOLD_TRAIN));
    if (use_native_nhwc || (d_x && d_scale && d_bias)) {
    if (use_native_kernel || use_native_nhwc) {
      if (x_dims.size() == 2 || use_native_nhwc) {
        dim3 block;
        dim3 grid;
        const int block_size = 512;

@@ -931,6 +934,21 @@ void BatchNormGradRawKernel(const Context &ctx,
              flag_ptr);
        }

        // 2. reduce_sum(x, dy, mean) => dscale, dbias
        BatchNormParamType<T> *dscale = nullptr;
        BatchNormParamType<T> *dbias = nullptr;
        bool with_scale = false;
        if (d_scale && d_bias) {
          dscale = ctx.template Alloc<BatchNormParamType<T>>(d_scale);
          dbias = ctx.template Alloc<BatchNormParamType<T>>(d_bias);
        } else {
          DenseTensor dscale_mem =
              phi::Empty<BatchNormParamType<T>, Context>(ctx, {C});
          DenseTensor dbias_mem =
              phi::Empty<BatchNormParamType<T>, Context>(ctx, {C});
          dscale = dscale_mem.data<BatchNormParamType<T>>();
          dbias = dbias_mem.data<BatchNormParamType<T>>();
        }

        BNBackward2DChannelLastStage2<T, block_size>
            <<<grid, block, 0, ctx.stream()>>>(
                transformed_d_y.template data<T>(),

@@ -942,8 +960,8 @@ void BatchNormGradRawKernel(const Context &ctx,
                H * W * D,
                epsilon,
                block_data_ptr,
                ctx.template Alloc<BatchNormParamType<T>>(d_scale),
                ctx.template Alloc<BatchNormParamType<T>>(d_bias),
                dscale,
                dbias,
                flag_ptr);

        // 3. elementwise_mul(scale, mean, inv_var, dy, dscale, dbias) => dx

@@ -952,8 +970,8 @@ void BatchNormGradRawKernel(const Context &ctx,
                transformed_d_y.template data<T>(),
                transformed_x.template data<T>(),
                scale.template data<BatchNormParamType<T>>(),
                d_scale->data<BatchNormParamType<T>>(),
                d_bias->data<BatchNormParamType<T>>(),
                dscale,
                dbias,
                mean_ptr,
                variance_ptr,
                C,

@@ -1163,6 +1181,7 @@ void BatchNormGradRawKernel(const Context &ctx,
        paddle::platform::dynload::cudnnDestroyTensorDescriptor(
            bn_param_desc_));
#endif
  } else {
    const auto *running_mean = mean.get_ptr();
    const auto *running_var = variance.get_ptr();
paddle/phi/kernels/gpu/batch_norm_kernel.cu

@@ -72,6 +72,40 @@ static __global__ void BNForwardInference(const T *x,
  }
}

template <typename T>
static __global__ void InverseVariance(const BatchNormParamType<T> *variance,
                                       const double epsilon,
                                       const int C,
                                       BatchNormParamType<T> *inv_variance) {
  int tid = threadIdx.x + blockIdx.x * blockDim.x;
  if (tid < C) {
    inv_variance[tid] = 1 / sqrt(variance[tid] + epsilon);
  }
}

template <typename T, phi::DataLayout layout>
static __global__ void BN1DForwardInference(
    const T *x,
    const BatchNormParamType<T> *mean,
    const BatchNormParamType<T> *inv_variance,
    const BatchNormParamType<T> *scale,
    const BatchNormParamType<T> *bias,
    const int C,
    const int N,
    const int HxW,
    const double epsilon,
    T *y) {
  int gid = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;
  int num = N * C * HxW;
  for (int i = gid; i < num; i += stride) {
    const int c = layout == phi::DataLayout::kNCHW ? i / HxW % C : i % C;
    BatchNormParamType<T> x_sub_mean =
        static_cast<BatchNormParamType<T>>(x[i]) - mean[c];
    y[i] = static_cast<T>(scale[c] * x_sub_mean * inv_variance[c] + bias[c]);
  }
}

template <typename T, int BlockDim, phi::DataLayout layout>
static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining(
    const T *x,

@@ -691,9 +725,6 @@ void BatchNormKernel(const Context &ctx,
  auto handle = ctx.cudnn_handle();

  const size_t CUDNN_PER_ACTIVATION_THRESHOLD = 10240;
  const size_t CUDNN_SPATIAL_THRESHOLD = 880801;

  // Now, depending on whether we are running test or not, we have two paths.
  // It is training mode when it's not reference AND not using pre-trained
  // model.

@@ -797,8 +828,8 @@ void BatchNormKernel(const Context &ctx,
      //     epsilon));
#else
      const bool use_native_kernel =
          ((x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD) ||
           (x_dims.size() == 3 && N >= CUDNN_SPATIAL_THRESHOLD));
          (x_dims.size() == 2 ||
           (x_dims.size() == 3 && N >= CUDNN_SPATIAL_THRESHOLD_EVAL));
      if (use_native_kernel) {
        const int block_size = 256;
        const int grid_size = (N * C * H * W * D + block_size - 1) / block_size;

@@ -816,18 +847,43 @@ void BatchNormKernel(const Context &ctx,
              epsilon,
              transformed_y.template data<T>());
        } else {
          BNForwardInference<T, DataLayout::kNHWC>
              <<<grid_size, block_size, 0, ctx.stream()>>>(
                  transformed_x.template data<T>(),
                  est_mean->template data<BatchNormParamType<T>>(),
                  est_var->template data<BatchNormParamType<T>>(),
                  scale.template data<BatchNormParamType<T>>(),
                  bias.template data<BatchNormParamType<T>>(),
                  C,
                  N,
                  H * W * D,
                  epsilon,
                  transformed_y.template data<T>());
          if (x_dims.size() == 2) {
            DenseTensor inv_var = phi::Empty<BatchNormParamType<T>>(ctx, {C});
            auto *inv_var_ptr = inv_var.data<BatchNormParamType<T>>();
            const int threads = 512 > C ? C : 512;
            const int blocks = (C + 511) / 512;
            InverseVariance<T><<<blocks, threads>>>(
                est_var->template data<BatchNormParamType<T>>(),
                epsilon,
                C,
                inv_var_ptr);
            BN1DForwardInference<T, DataLayout::kNHWC>
                <<<grid_size, block_size, 0, ctx.stream()>>>(
                    transformed_x.template data<T>(),
                    est_mean->template data<BatchNormParamType<T>>(),
                    // est_var->template data<BatchNormParamType<T>>(),
                    inv_var_ptr,
                    scale.template data<BatchNormParamType<T>>(),
                    bias.template data<BatchNormParamType<T>>(),
                    C,
                    N,
                    H * W * D,
                    epsilon,
                    transformed_y.template data<T>());
          } else {
            BNForwardInference<T, DataLayout::kNHWC>
                <<<grid_size, block_size, 0, ctx.stream()>>>(
                    transformed_x.template data<T>(),
                    est_mean->template data<BatchNormParamType<T>>(),
                    est_var->template data<BatchNormParamType<T>>(),
                    scale.template data<BatchNormParamType<T>>(),
                    bias.template data<BatchNormParamType<T>>(),
                    C,
                    N,
                    H * W * D,
                    epsilon,
                    transformed_y.template data<T>());
          }
        }
      } else {
        PADDLE_ENFORCE_GPU_SUCCESS(

@@ -949,7 +1005,7 @@ void BatchNormKernel(const Context &ctx,
      // const size_t CUDNN_PER_ACTIVATION_THRESHOLD = 131070;
      const bool use_native_kernel =
          ((x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD) ||
           (x_dims.size() == 3 && N >= CUDNN_SPATIAL_THRESHOLD));
           (x_dims.size() == 3 && N >= CUDNN_SPATIAL_THRESHOLD_TRAIN));
      if (use_native_kernel) {
        dim3 block;
        dim3 grid;
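The new BN1DForwardInference path precomputes inv_variance[c] = 1 / sqrt(variance[c] + epsilon) once per channel, so the per-element loop reduces to a scale-and-shift. A CPU reference of that per-element math is sketched below; the function name and layout assumptions are illustrative only:

// CPU reference for inference-time batch norm in NHWC layout. inv_std[c] is
// assumed to already equal 1 / sqrt(variance[c] + epsilon), as computed by
// the InverseVariance kernel above.
void batch_norm_infer_nhwc(const float* x, const float* mean,
                           const float* inv_std, const float* scale,
                           const float* bias, int n, int hw, int c, float* y) {
  for (int i = 0; i < n * hw * c; ++i) {
    const int ch = i % c;  // channel index in NHWC layout
    y[i] = scale[ch] * (x[i] - mean[ch]) * inv_std[ch] + bias[ch];
  }
}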
paddle/phi/kernels/gpu/multiplex_kernel.cu

@@ -41,7 +41,7 @@ void MultiplexKernel(const Context& ctx,
  paddle::framework::TensorCopySync(ids, phi::CPUPlace(), &index_t_cpu);
  auto *index = index_t_cpu.data<int32_t>();
  auto stream = ctx.stream();
  for (auto i = 0; i < rows; i++) {
  for (auto i = 0; i < ids.dims()[0]; i++) {
    int32_t k = index[i];
    PADDLE_ENFORCE_GE(
        k, 0, errors::PreconditionNotMet("index must be nonnegative."));
paddle/phi/kernels/impl/add_n_kernel_impl.h (new file, 0 → 100644)

// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/phi/kernels/add_n_kernel.h"

#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/math_function.h"

#include "paddle/fluid/operators/math/selected_rows_functor.h"

namespace phi {

template <typename T, typename Context>
void AddNArrayKernel(const Context& dev_ctx,
                     const std::vector<const TensorArray*>& x,
                     TensorArray* out) {
  for (auto& ele : *out) {
    dev_ctx.template Alloc<T>(&ele);
  }
  bool in_place = true;
  if (x.size() > 0 && x[0]->size() == out->size()) {
    for (size_t i = 0; i < out->size(); i++) {
      if (x[0]->at(i).IsInitialized() &&
          out->at(i).data() != x[0]->at(i).data()) {
        in_place = false;
        break;
      }
    }
  } else {
    in_place = false;
  }

  for (size_t i = in_place ? 1 : 0; i < x.size(); ++i) {
    auto* in_array = x.at(i);

    for (size_t j = 0; j < in_array->size(); ++j) {
      if (in_array->at(j).IsInitialized() && (in_array->at(j).numel() != 0)) {
        if (j >= out->size()) {
          out->resize(j + 1);
        }
        if (!out->at(j).IsInitialized() || (out->at(j).numel() == 0)) {
          Copy<Context>(dev_ctx,
                        in_array->at(j),
                        in_array->at(j).place(),
                        false,
                        &out->at(j));
          out->at(j).set_lod(in_array->at(j).lod());
        } else {
          PADDLE_ENFORCE_EQ(
              out->at(j).lod(),
              in_array->at(j).lod(),
              phi::errors::InvalidArgument(
                  "The lod message between inputs[%d] and"
                  " outputs[%d] must be same, but now is not same.",
                  j,
                  j));
          auto in = EigenVector<T>::Flatten(in_array->at(j));
          auto result = EigenVector<T>::Flatten(out->at(j));
          result.device(*dev_ctx.eigen_device()) = result + in;
        }
      }
    }
  }
}
}  // namespace phi
paddle/phi/kernels/impl/fold_grad_kernel_impl.h

@@ -54,11 +54,8 @@ void FoldGradKernel(const Context& ctx,
  DDim out_shape =
      make_ddim({n_output_plane, output_sizes[0], output_sizes[1]});
  DDim input_matrix_shape = make_ddim({x_dims[0],
                                       kernel_sizes[0],
                                       kernel_sizes[1],
                                       output_height,
                                       output_width});
  DDim input_matrix_shape = make_ddim(
      {1, kernel_sizes[0], kernel_sizes[1], output_height, output_width});

  paddle::operators::math::
      Im2ColFunctor<paddle::operators::math::ColFormat::kCFO, Context, T>
paddle/phi/kernels/impl/fold_kernel_impl.h

@@ -56,11 +56,8 @@ void FoldKernel(const Context& ctx,
  DDim output_shape =
      make_ddim({n_output_plane, output_sizes[0], output_sizes[1]});
  DDim input_matrix_shape = make_ddim({x_dims[0],
                                       kernel_sizes[0],
                                       kernel_sizes[1],
                                       output_height,
                                       output_width});
  DDim input_matrix_shape = make_ddim(
      {1, kernel_sizes[0], kernel_sizes[1], output_height, output_width});

  phi::funcs::SetConstant<Context, T> set_zero;
  set_zero(ctx, out, static_cast<T>(0));

@@ -68,6 +65,7 @@ void FoldKernel(const Context& ctx,
  for (int i = 0; i < batch_size; i++) {
    DenseTensor out_batch =
        out->Slice(i, i + 1).Resize(output_shape);  // im size=3

    DenseTensor in_batch =
        x.Slice(i, i + 1).Resize(input_matrix_shape);  // col size=5
    col2im(ctx, in_batch, dilations, strides, paddings, &out_batch);
paddle/phi/kernels/impl/solve_kernel_impl.h

@@ -169,7 +169,7 @@ static void linalg_solve(const Context& dev_ctx,
    out_tmp.Resize(out->dims());
    out_tmp = *out;
    phi::SqueezeKernel<T, Context>(dev_ctx, out_tmp, {-1}, out);
    phi::Squeeze<T, Context>(dev_ctx, out_tmp, {-1}, out);
  } else {
    PADDLE_ENFORCE_EQ(
        x_dim[x_dim_size - 1],
paddle/phi/kernels/impl/squeeze_kernel_impl.h

@@ -23,11 +23,7 @@ void SqueezeKernel(const Context& dev_ctx,
                   const DenseTensor& x,
                   const IntArray& axes,
                   DenseTensor* out) {
  auto x_dims = x.dims();
  std::vector<int32_t> tmp(axes.GetData().begin(), axes.GetData().end());
  auto out_dims = funcs::GetOutputSqueezeShape(tmp, x_dims, true);
  out->Resize(out_dims);
  auto out_dims = out->dims();
  dev_ctx.template Alloc<T>(out);
  phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out);
  out->Resize(out_dims);  // copy will reset the dims.
paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.h → paddle/phi/kernels/selected_rows/add_n_kernel.h

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.

@@ -14,24 +14,14 @@
#pragma once

#include <string>
#include <unordered_set>
#include "paddle/phi/core/selected_rows.h"

#include "paddle/fluid/inference/analysis/analysis_pass.h"

namespace phi {
namespace sr {
namespace paddle {
namespace inference {
namespace analysis {

struct Argument;

class IrInferCleanGraphPass : public AnalysisPass {
 public:
  void RunImpl(Argument *argument) override;
  std::string repr() const override { return "ir_graph_clean_pass"; }
};

}  // namespace analysis
}  // namespace inference
}  // namespace paddle
template <typename T, typename Context>
void AddNKernel(const Context& dev_ctx,
                const std::vector<const SelectedRows*>& x,
                SelectedRows* out);

}  // namespace sr
}  // namespace phi
paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.cc → paddle/phi/kernels/selected_rows/cpu/add_n_kernel.cc

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.

@@ -12,38 +12,14 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.h"
#include "paddle/phi/kernels/selected_rows/impl/add_n_kernel_impl.h"

#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/node.h"

namespace paddle {
namespace inference {
namespace analysis {

void IrInferCleanGraphPass::RunImpl(Argument *argument) {
  auto &graph = argument->main_graph();
  auto is_valid_node = [](framework::ir::Node *x) {
    return x && IsControlDepVar(*x) && x->IsVar() && !x->Var();
  };

  std::unordered_set<const framework::ir::Node *> invalid_nodes;
  int valid_op = 0;
  for (auto *node : graph.Nodes()) {
    PADDLE_ENFORCE_NOT_NULL(node,
                            platform::errors::PreconditionNotMet(
                                "The node should not be nullptr."));
    if (is_valid_node(node)) {
      invalid_nodes.insert(node);
    } else if (node->IsOp()) {
      ++valid_op;
    }
  }

  GraphSafeRemoveNodes(&graph, invalid_nodes);
}

}  // namespace analysis
}  // namespace inference
}  // namespace paddle
PD_REGISTER_KERNEL(add_n_sr,
                   CPU,
                   ALL_LAYOUT,
                   phi::sr::AddNKernel,
                   float,
                   double,
                   int,
                   phi::dtype::bfloat16,
                   int64_t) {}
paddle/phi/kernels/selected_rows/gpu/add_n_kernel.cu (new file, 0 → 100644)

// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/kernels/selected_rows/impl/add_n_kernel_impl.h"

PD_REGISTER_KERNEL(add_n_sr,
                   GPU,
                   ALL_LAYOUT,
                   phi::sr::AddNKernel,
                   float,
                   double,
                   int,
                   phi::dtype::bfloat16,
                   phi::dtype::float16,
                   int64_t) {}
paddle/phi/kernels/selected_rows/impl/add_n_kernel_impl.h (new file, 0 → 100644)

// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/phi/kernels/selected_rows/add_n_kernel.h"

#include "paddle/fluid/operators/math/selected_rows_functor.h"

#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/math_function.h"

namespace phi {
namespace sr {

template <typename T, typename Context>
void AddNKernel(const Context &dev_ctx,
                const std::vector<const SelectedRows *> &x,
                SelectedRows *out) {
  dev_ctx.template Alloc<T>(out->mutable_value());

  bool in_place = false;
  if (x.size() > 0 && x[0]->value().Holder() == out->value().Holder()) {
    in_place = true;
  }

  if (in_place && x.size() < 2) {
    return;
  }

  std::vector<const phi::SelectedRows *> inputs;
  SelectedRows temp_in0;

  if (in_place) {
    auto &in0 = *x[0];
    temp_in0.set_height(in0.height());
    temp_in0.set_rows(in0.rows());
    Copy<Context>(
        dev_ctx, in0.value(), in0.place(), false, temp_in0.mutable_value());
    inputs.push_back(&temp_in0);
    for (size_t i = 1; i < x.size(); ++i) {
      auto &in = *x[i];
      if (in.rows().size() > 0) {
        inputs.push_back(&in);
      }
    }
  } else {
    for (auto in_var : x) {
      auto &in = *in_var;
      if (in.rows().size() > 0) {
        inputs.push_back(in_var);
      }
    }
  }

  out->mutable_rows()->clear();

  bool has_data = false;
  for (auto &in : inputs) {
    if (in->rows().size() > 0) {
      has_data = true;
      break;
    }
  }
  if (has_data) {
    paddle::operators::math::scatter::MergeAdd<Context, T> merge_add;
    merge_add(dev_ctx, inputs, out);

    out->SyncIndex();
  } else {
    // no data, just set a empty out tensor.
    auto *out_dense = out->mutable_value();
    out_dense->clear();
    out_dense->Resize(phi::make_ddim({0}));
    dev_ctx.template Alloc<T>(out_dense);
  }
}
}  // namespace sr
}  // namespace phi
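The SelectedRows AddNKernel delegates the actual summation to MergeAdd, which accumulates slices that carry the same row index into a single output row. The conceptual CPU sketch below illustrates that merge semantics only; it is not Paddle's MergeAdd, and the types and names are invented for illustration:

// Conceptual illustration of "merge add" for row-sparse tensors: slices with
// the same row index are summed into one output row. Not the Paddle API.
#include <map>
#include <vector>

struct SimpleSelectedRows {
  std::vector<int64_t> rows;               // row index of each slice
  std::vector<std::vector<float>> values;  // one dense slice per entry
};

SimpleSelectedRows MergeAddReference(const std::vector<SimpleSelectedRows>& ins,
                                     size_t row_width) {
  std::map<int64_t, std::vector<float>> acc;
  for (const auto& in : ins) {
    for (size_t i = 0; i < in.rows.size(); ++i) {
      auto& dst = acc.try_emplace(in.rows[i], row_width, 0.0f).first->second;
      for (size_t j = 0; j < row_width; ++j) dst[j] += in.values[i][j];
    }
  }
  SimpleSelectedRows out;
  for (auto& [row, value] : acc) {
    out.rows.push_back(row);
    out.values.push_back(std::move(value));
  }
  return out;
}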
paddle/phi/kernels/sparse/batch_norm_kernel.h

@@ -23,25 +23,25 @@ namespace phi {
namespace sparse {

template <typename T, typename Context>
void BatchNormKernel(const Context& dev_ctx,
                     const SparseCooTensor& x,
                     const DenseTensor& scale,
                     const DenseTensor& bias,
                     const DenseTensor& mean,
                     const DenseTensor& variance,
                     float momentum,
                     float epsilon,
                     const std::string& data_layout,
                     bool is_test,
                     bool use_global_stats,
                     bool trainable_statistics,
                     bool fuse_with_relu,
                     SparseCooTensor* y,
                     DenseTensor* mean_out,
                     DenseTensor* variance_out,
                     DenseTensor* saved_mean,
                     DenseTensor* saved_variance,
                     DenseTensor* reserve_space);
void BatchNormCooKernel(const Context& dev_ctx,
                        const SparseCooTensor& x,
                        const DenseTensor& scale,
                        const DenseTensor& bias,
                        const DenseTensor& mean,
                        const DenseTensor& variance,
                        float momentum,
                        float epsilon,
                        const std::string& data_layout,
                        bool is_test,
                        bool use_global_stats,
                        bool trainable_statistics,
                        bool fuse_with_relu,
                        SparseCooTensor* y,
                        DenseTensor* mean_out,
                        DenseTensor* variance_out,
                        DenseTensor* saved_mean,
                        DenseTensor* saved_variance,
                        DenseTensor* reserve_space);

}  // namespace sparse
}  // namespace phi
paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu

@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/phi/kernels/sparse/coalesce_kernel.h"

#include <thrust/sort.h>
#include <thrust/unique.h>

#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/kernel_registry.h"

@@ -22,8 +23,6 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/sparse/flatten_indices.cu.h"
#include "paddle/phi/kernels/funcs/sparse/scatter.cu.h"
#include "paddle/phi/kernels/funcs/sparse/utils.cu.h"
#include <thrust/sort.h>
#include <thrust/unique.h>

namespace phi {
namespace sparse {
paddle/phi/kernels/sparse/gpu/conv.cu.h

@@ -15,8 +15,14 @@ limitations under the License. */
#pragma once

#include <thrust/remove.h>
#include <thrust/sort.h>
#include <thrust/unique.h>

#ifdef __NVCC__
#include <cub/block/block_scan.cuh>
#endif

#ifdef __HIPCC__
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;
#endif

#include "paddle/phi/kernels/sparse/conv_kernel.h"

#include "paddle/phi/backends/gpu/gpu_context.h"

@@ -167,7 +173,7 @@ inline void GatherV2(const GPUContext& dev_ctx,
template <typename IntT>
__global__ void UniqueKernel(const IntT* in_indexs,
                             const int rulebook_len,
                             int* out_index_table,
                             int* index_flags,
                             int* out_indexs,
                             int* nnz) {
  extern __shared__ int cache[];

@@ -182,8 +188,8 @@ __global__ void UniqueKernel(const IntT* in_indexs,
  if (i < rulebook_len) {
    // atomicOr only support int
    int index = static_cast<int>(in_indexs[i]);
    int flag = atomicOr(out_index_table + index, 1);
    if (flag == 0) {
    const bool flag = phi::funcs::sparse::SetBits(index, index_flags);
    if (!flag) {
      int j = atomicAdd(&count, 1);
      cache[j] = index;
    }

@@ -199,6 +205,88 @@ __global__ void UniqueKernel(const IntT* in_indexs,
  }
}

inline __device__ uint32_t BitCount(const uint32_t data) {
  uint32_t count = data;
  count = (count & 0x55555555) + ((count >> 1) & 0x55555555);
  count = (count & 0x33333333) + ((count >> 2) & 0x33333333);
  count = (count & 0x0f0f0f0f) + ((count >> 4) & 0x0f0f0f0f);
  count = (count & 0x00ff00ff) + ((count >> 8) & 0x00ff00ff);
  count = (count & 0x0000ffff) + ((count >> 16) & 0x0000ffff);
  return count;
}

static __global__ void GetOutIndexsCounter(const int* flags,
                                           const int n,
                                           int* out) {
  int tid = threadIdx.x + blockDim.x * blockIdx.x;
  __shared__ int block_count;
  if (threadIdx.x == 0) {
    block_count = 0;
  }
  __syncthreads();

  if (tid < n) {
    // get the count of 1 in flags[tid]
    uint32_t count = BitCount(static_cast<uint32_t>(flags[tid]));
    // add to block_count
    // TODO(zhangkaihuo): replace with block reduce_sum
    atomicAdd(&block_count, static_cast<int>(count));
  }
  __syncthreads();
  // write to out
  if (threadIdx.x == 0) {
    out[blockIdx.x] = block_count;
  }
}

template <int BS>
__global__ void GetOutIndexs(const int* flags,
                             const int n,
                             const int* offsets,
                             const int out_nnz,
                             int* out) {
  int tid = threadIdx.x + blockDim.x * blockIdx.x;
  __shared__ int block_counts[BS];
  __shared__ int block_outs[BS * 32];

  int count = 0;

  if (tid < n) {
    // get the count of 1 in flags[tid]
    int flag = flags[tid];
    count = BitCount(static_cast<uint32_t>(flag));
  }

  // call block prefix_sum
  // using namespace cub;
  typedef cub::BlockScan<int, BS> BlockScan;
  __shared__ typename BlockScan::TempStorage temp_storage;
  BlockScan(temp_storage).ExclusiveSum(count, count);
  __syncthreads();

  // write index to out
  if (tid < n) {
    // get the count of 1 in flags[tid]
    int flag = flags[tid];
    // int j = block_counts[threadIdx.x];
    int j = count;
    // TODO(zhangkaihuo): opt the loop
    for (int i = 0; i < 32; ++i) {
      if ((1 & (flag >> i)) == 1) {
        block_outs[j++] = (tid << 5) + i;
      }
    }
  }
  __syncthreads();
  // write to block_outs
  int start = offsets[blockIdx.x];
  int end = blockIdx.x == gridDim.x - 1 ? out_nnz : offsets[blockIdx.x + 1];
  for (int i = threadIdx.x; i < end - start; i += blockDim.x) {
    out[start + i] = block_outs[i];
  }
}

template <typename IntT>
__global__ void GroupIndexs(const int* out_index_table,
                            const int n,

@@ -284,7 +372,6 @@ __global__ void ProductRuleBookKernel(const T* x_indices,
        atomicAdd(&counter_buf[kernel_index], 1);
        kernel_i = kernel_index;
      }
      // rulebook[kernel_index * non_zero_num + i] = kernel_i;
      rulebook[kernel_index * non_zero_num + i] = in_i;
      rulebook[kernel_index * non_zero_num + offset + i] = out_index;
      ++kernel_index;

@@ -299,17 +386,19 @@ __global__ void ProductRuleBookKernel(const T* x_indices,
}

template <typename IntT>
__global__ void GetOutIndexTable(const IntT* indices,
                                 const IntT non_zero_num,
                                 const Dims4D dims,
                                 int* out_index_table) {
__global__ void GetOutIndexTable1(const IntT* indices,
                                  const IntT non_zero_num,
                                  const Dims4D dims,
                                  int* index_flags,
                                  int* out_index_table) {
  CUDA_KERNEL_LOOP_TYPE(i, non_zero_num, int64_t) {
    IntT batch = indices[i];
    IntT in_z = indices[i + non_zero_num];
    IntT in_y = indices[i + 2 * non_zero_num];
    IntT in_x = indices[i + 3 * non_zero_num];
    IntT index = PointToIndex(batch, in_x, in_y, in_z, dims);
    out_index_table[index] = i == 0 ? -1 : i;
    phi::funcs::sparse::SetBits(index, index_flags);
    out_index_table[index] = i;
  }
}

@@ -375,6 +464,7 @@ __global__ void ProductSubmRuleBookKernel(const T* x_indices,
                                          const Dims4D paddings,
                                          const Dims4D dilations,
                                          const Dims4D strides,
                                          const int* index_flags,
                                          const int* out_index_table,
                                          T* rulebook,
                                          int* counter) {

@@ -417,9 +507,10 @@ __global__ void ProductSubmRuleBookKernel(const T* x_indices,
          T out_x = (in_x + paddings[3] - kx * dilations[3]) / strides[3];
          out_index = phi::funcs::sparse::PointToIndex<Dims4D>(
              batch, out_x, out_y, out_z, out_dims);
          int real_out_index = out_index_table[out_index];
          if (real_out_index != 0) {
            real_out_index = real_out_index == -1 ? 0 : real_out_index;
          const bool flag =
              phi::funcs::sparse::TestBits(out_index, index_flags);
          if (flag) {
            int real_out_index = out_index_table[out_index];
            in_i = i;
            int buf_i = atomicAdd(&counter_buf[kernel_index], 1);
            kernel_i = kernel_index;

@@ -440,7 +531,6 @@ __global__ void ProductSubmRuleBookKernel(const T* x_indices,
  __syncthreads();
  for (int i = 0; i < kernel_size; i++) {
    if (threadIdx.x < counter_buf[i]) {
      // rulebook[i * non_zero_num + counter_buf2[i] + threadIdx.x] = i;
      rulebook[i * non_zero_num + counter_buf2[i] + threadIdx.x] =
          rulebook_buf[i * blockDim.x + threadIdx.x];
      rulebook[i * non_zero_num + offset + counter_buf2[i] + threadIdx.x] =

@@ -575,12 +665,18 @@ int ProductRuleBook(const Context& dev_ctx,
  DenseTensorMeta rulebook_meta(
      indices_dtype, {rulebook_rows, rulebook_cols}, DataLayout::NCHW);

  int64_t table_size = 1;
  int table_size = 1;
  for (int i = 0; i < out_dims.size() - 1; i++) {
    table_size *= out_dims[i];
  }
  DenseTensor out_index_table = phi::Empty<int>(dev_ctx, {table_size});
  int* out_index_table_ptr = out_index_table.data<int>();
  // index_flags: flag the indices exist or not
  int index_flags_size = (table_size + 31) / 32;
  DenseTensor index_flags = phi::Empty<int>(dev_ctx, {index_flags_size});
  int* index_flags_ptr = index_flags.data<int>();
  phi::backends::gpu::GpuMemsetAsync(
      index_flags_ptr, 0, sizeof(int) * index_flags.numel(), dev_ctx.stream());

  if (subm) {
    DenseTensor tmp_rulebook = phi::Empty(dev_ctx, std::move(rulebook_meta));

@@ -590,16 +686,16 @@ int ProductRuleBook(const Context& dev_ctx,
    phi::Copy(dev_ctx, x.indices(), dev_ctx.GetPlace(), false, &out_indices);

    phi::backends::gpu::GpuMemsetAsync(
        out_index_table_ptr, 0, sizeof(int) * table_size, dev_ctx.stream());

    auto config =
        phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1);
    GetOutIndexTable<IntT><<<config.block_per_grid,
                             config.thread_per_block,
                             0,
                             dev_ctx.stream()>>>(
        out_indices.data<IntT>(), non_zero_num, d_x_dims, out_index_table_ptr);
    GetOutIndexTable1<IntT><<<config.block_per_grid,
                              config.thread_per_block,
                              0,
                              dev_ctx.stream()>>>(out_indices.data<IntT>(),
                                                  non_zero_num,
                                                  d_x_dims,
                                                  index_flags_ptr,
                                                  out_index_table_ptr);

    size_t cache_size = kernel_size * 2 * sizeof(int) +

@@ -625,6 +721,7 @@ int ProductRuleBook(const Context& dev_ctx,
        d_paddings,
        d_dilations,
        d_strides,
        index_flags_ptr,
        out_index_table_ptr,
        rulebook_ptr,
        counter_ptr);

@@ -695,9 +792,6 @@ int ProductRuleBook(const Context& dev_ctx,
    int* out_index_ptr = out_index->data<int>();
    int* unique_key_ptr = unique_key.data<int>();

    phi::backends::gpu::GpuMemsetAsync(
        out_index_table_ptr, 0, sizeof(int) * table_size, dev_ctx.stream());
    phi::backends::gpu::GpuMemsetAsync(
        unique_key_ptr, 0, sizeof(int), dev_ctx.stream());

@@ -708,7 +802,7 @@ int ProductRuleBook(const Context& dev_ctx,
                     cache_size,
                     dev_ctx.stream()>>>(rulebook_ptr + rulebook_len,
                                         rulebook_len,
                                         out_index_table_ptr,
                                         index_flags_ptr,
                                         out_index_ptr,
                                         unique_key_ptr);

@@ -719,13 +813,25 @@ int ProductRuleBook(const Context& dev_ctx,
                                   gpuMemcpyDeviceToHost,
                                   dev_ctx.stream());
    dev_ctx.Wait();

    const int threads = 256;
    const int blocks = (index_flags.numel() + threads - 1) / threads;
    GetOutIndexsCounter<<<blocks, threads, 0, dev_ctx.stream()>>>(
        index_flags_ptr, index_flags.numel(), out_index_table_ptr);
#ifdef PADDLE_WITH_HIP
    thrust::sort(thrust::hip::par.on(dev_ctx.stream()),
    thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()),
#else
    thrust::sort(thrust::cuda::par.on(dev_ctx.stream()),
    thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()),
#endif
                 out_index_ptr,
                 out_index_ptr + out_nnz);
                           out_index_table_ptr,
                           out_index_table_ptr + blocks,
                           out_index_table_ptr);
    GetOutIndexs<threads>
        <<<blocks, threads, 0, dev_ctx.stream()>>>(index_flags_ptr,
                                                   index_flags.numel(),
                                                   out_index_table_ptr,
                                                   out_nnz,
                                                   out_index_ptr);

    const int64_t sparse_dim = 4;
    phi::DenseTensor out_indices =
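GetOutIndexsCounter and GetOutIndexs recover the sorted list of set-bit positions from the packed 32-bit flag words: BitCount is a branch-free population count (on device the __popc intrinsic computes the same value), an exclusive prefix sum over the per-block counts assigns write offsets, and each thread then expands its word into the indices of its set bits. A serial host-side sketch of the same extraction follows, for illustration only and not part of the commit:

// Serial reference for extracting set-bit positions from packed 32-bit flags,
// mirroring GetOutIndexsCounter + exclusive_scan + GetOutIndexs on the GPU.
#include <cstdint>
#include <vector>

std::vector<int> ExtractSetBits(const std::vector<uint32_t>& flags) {
  std::vector<int> out;
  for (size_t word = 0; word < flags.size(); ++word) {
    for (int bit = 0; bit < 32; ++bit) {
      if ((flags[word] >> bit) & 1u) {
        // (word << 5) + bit reverses SetBits' mapping (value >> 5, value & 31).
        out.push_back(static_cast<int>((word << 5) + bit));
      }
    }
  }
  return out;  // already sorted: words and bits are visited in order
}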
paddle/phi/kernels/sparse/gpu/conv_kernel.cu

@@ -22,6 +22,9 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/scatter.cu.h"
#include "paddle/phi/kernels/funcs/sparse/scatter.cu.h"
#include "paddle/phi/kernels/sparse/gpu/conv.cu.h"
#ifdef PADDLE_WITH_CUTLASS
#include "paddle/phi/kernels/sparse/gpu/gather_gemm_scatter.h"
#endif

#include "glog/logging.h"

@@ -120,85 +123,129 @@ void Conv3dCooGPUKernel(const GPUContext& dev_ctx,
        dev_ctx, x, key, tmp_rulebook, h_counter, out, rulebook, counter);
  }

  // 2. gather
  phi::DenseTensor in_features =
      phi::Empty<T>(dev_ctx, {rulebook_len, in_channels});
  phi::DenseTensor out_features =
      phi::Empty<T>(dev_ctx, {rulebook_len, out_channels});
  T* in_features_ptr = in_features.data<T>();
  T* out_features_ptr = out_features.data<T>();
  phi::funcs::SetConstant<GPUContext, T> set_zero;
  set_zero(dev_ctx, &out_features, static_cast<T>(0.0f));

  Gather<T, IntT>(dev_ctx,
                  x.values().data<T>(),
                  rulebook_ptr,
                  rulebook_len,
                  in_channels,
                  in_features_ptr);

  // 3. call gemm for every werght
  auto blas = phi::funcs::GetBlas<GPUContext, T>(dev_ctx);
  auto* out_values = out->mutable_values();
  T* out_values_ptr = out_values->data<T>();
  set_zero(dev_ctx, out_values, static_cast<T>(0.0f));

  if (subm) {
    auto config =
        phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1);
    unique_value.ResizeAndAllocate(
        {static_cast<int>(out->nnz() * kernel_size)});
    out_index.ResizeAndAllocate({static_cast<int>(rulebook_len)});
    int* out_index_ptr = out_index.data<int>();
    int* unique_value_ptr = unique_value.data<int>();
    phi::backends::gpu::GpuMemsetAsync(
        out_index_ptr, 0, sizeof(int) * rulebook_len, dev_ctx.stream());
    GroupIndexs<<<config.block_per_grid,
                  config.thread_per_block,
                  0,
                  dev_ctx.stream()>>>(rulebook_len,
                                      kernel_size,
                                      rulebook_ptr + rulebook_len,
                                      out_index_ptr,
                                      unique_value_ptr);
#ifdef PADDLE_WITH_CUTLASS
  bool cutlass = true;
  if (dev_ctx.GetComputeCapability() < 75) cutlass = false;
  if (in_channels % 4 != 0 || out_channels % 4 != 0) {
    if (std::is_same<T, phi::dtype::float16>::value) cutlass = false;
    if (std::is_same<T, float>::value) cutlass = false;
  }
  if (!std::is_same<IntT, int32_t>::value) cutlass = false;
  if (cutlass) {
    auto* out_values = out->mutable_non_zero_elements();
    T* out_values_ptr = out_values->data<T>();
    phi::funcs::SetConstant<GPUContext, T> set_zero;
    set_zero(dev_ctx, out_values, static_cast<T>(0.0f));

    const T* kernel_ptr = kernel.data<T>();
    for (int i = 0; i < kernel_size; i++) {
      if (h_counter_ptr[i] <= 0) {
        continue;
      }
  const T* kernel_ptr = kernel.data<T>();
  for (int i = 0; i < kernel_size; i++) {
    if (h_counter_ptr[i] <= 0) {
      continue;
      const int M = h_counter_ptr[i];
      const int K = in_channels;
      const int N = out_channels;
      const T* tmp_kernel_ptr = kernel_ptr + i * K * N;
      const IntT* gather_indices = rulebook_ptr + h_offsets_ptr[i];
      const IntT* scatter_indices =
          rulebook_ptr + rulebook_len + h_offsets_ptr[i];
      dispatchKernel(dev_ctx,
                     x.non_zero_elements().data<T>(),
                     tmp_kernel_ptr,
                     out_values_ptr,
                     out_values_ptr,
                     M,
                     N,
                     K,
                     gather_indices,
                     scatter_indices,
                     cutlass,
                     x.dtype());
    }
  } else {
#endif
    if (subm) {
      auto config =
          phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1);
      unique_value.ResizeAndAllocate(
          {static_cast<int>(out->nnz() * kernel_size)});
      out_index.ResizeAndAllocate({static_cast<int>(rulebook_len)});
      int* out_index_ptr = out_index.data<int>();
      int* unique_value_ptr = unique_value.data<int>();
      phi::backends::gpu::GpuMemsetAsync(
          out_index_ptr, 0, sizeof(int) * rulebook_len, dev_ctx.stream());
      GroupIndexs<<<config.block_per_grid,
                    config.thread_per_block,
                    0,
                    dev_ctx.stream()>>>(rulebook_len,
                                        kernel_size,
                                        rulebook_ptr + rulebook_len,
                                        out_index_ptr,
                                        unique_value_ptr);
    }
    // 2. gather
    phi::DenseTensor in_features =
        phi::Empty<T>(dev_ctx, {rulebook_len, in_channels});
    phi::DenseTensor out_features =
        phi::Empty<T>(dev_ctx, {rulebook_len, out_channels});
    T* in_features_ptr = in_features.data<T>();
    T* out_features_ptr = out_features.data<T>();
    phi::funcs::SetConstant<GPUContext, T> set_zero;
    set_zero(dev_ctx, &out_features, static_cast<T>(0.0f));

    // call gemm: (n, in_channels) * (in_channels, out_channels)
    const int M = h_counter_ptr[i];
    const int K = in_channels;
    const int N = out_channels;
    T* tmp_in_ptr = in_features_ptr + h_offsets_ptr[i] * in_channels;
    const T* tmp_kernel_ptr = kernel_ptr + i * K * N;
    T* tmp_out_ptr = out_features_ptr + h_offsets_ptr[i] * out_channels;

    blas.GEMM(CblasNoTrans,
              CblasNoTrans,
              M,
              N,
              K,
              static_cast<T>(1),
              tmp_in_ptr,
              tmp_kernel_ptr,
              static_cast<T>(0),
              tmp_out_ptr);
  }
    Gather<T, IntT>(dev_ctx,
                    x.values().data<T>(),
                    rulebook_ptr,
                    rulebook_len,
                    in_channels,
                    in_features_ptr);

    // 3. call gemm for every werght
    auto blas = phi::funcs::GetBlas<GPUContext, T>(dev_ctx);
    auto* out_values = out->mutable_values();
    T* out_values_ptr = out_values->data<T>();
    set_zero(dev_ctx, out_values, static_cast<T>(0.0f));

  // 4. scatter
  phi::funcs::sparse::ScatterV2<T>(dev_ctx,
                                   out_features_ptr,
                                   out_index.data<int>(),
                                   unique_value.data<int>(),
                                   out->nnz(),
                                   kernel_size,
                                   out_channels,
                                   1,
                                   out_values_ptr);
    const T* kernel_ptr = kernel.data<T>();
    for (int i = 0; i < kernel_size; i++) {
      if (h_counter_ptr[i] <= 0) {
        continue;
      }
      // call gemm: (n, in_channels) * (in_channels, out_channels)
      const int M = h_counter_ptr[i];
      const int K = in_channels;
      const int N = out_channels;
      T* tmp_in_ptr = in_features_ptr + h_offsets_ptr[i] * in_channels;
      const T* tmp_kernel_ptr = kernel_ptr + i * K * N;
      T* tmp_out_ptr = out_features_ptr + h_offsets_ptr[i] * out_channels;

      blas.GEMM(CblasNoTrans,
                CblasNoTrans,
                M,
                N,
                K,
                static_cast<T>(1),
                tmp_in_ptr,
                tmp_kernel_ptr,
                static_cast<T>(0),
                tmp_out_ptr);
    }
    // 4. scatter
    phi::funcs::sparse::ScatterV2<T>(dev_ctx,
                                     out_features_ptr,
                                     out_index.data<int>(),
                                     unique_value.data<int>(),
                                     out->nnz(),
                                     kernel_size,
                                     out_channels,
                                     1,
                                     out_values_ptr);
#ifdef PADDLE_WITH_CUTLASS
  }
#endif
}

/**
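Both the CUTLASS branch and the plain BLAS branch above implement the same gather, per-offset GEMM, scatter pattern of submanifold sparse convolution: input-feature rows selected by the rulebook are multiplied by the kernel slice for one spatial offset and accumulated into the corresponding output rows. The serial sketch below illustrates one such offset; it is not the Paddle implementation and all names are illustrative:

// Serial gather-GEMM-scatter reference for one kernel offset.
void ConvOffsetReference(const float* x,         // [nnz_in, in_channels]
                         const float* w,         // [in_channels, out_channels]
                         const int* gather_idx,  // rows of x to read, length m
                         const int* scatter_idx, // rows of y to accumulate into
                         int m, int in_channels, int out_channels,
                         float* y) {             // [nnz_out, out_channels]
  for (int r = 0; r < m; ++r) {
    const float* x_row = x + gather_idx[r] * in_channels;
    float* y_row = y + scatter_idx[r] * out_channels;
    for (int oc = 0; oc < out_channels; ++oc) {
      float acc = 0.0f;
      for (int ic = 0; ic < in_channels; ++ic) {
        acc += x_row[ic] * w[ic * out_channels + oc];
      }
      y_row[oc] += acc;  // several offsets may hit the same output row
    }
  }
}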