Commit dbe08e9b authored by yuguo960516yuguo

2.4.2

parent b5499578
......@@ -149,7 +149,7 @@ __global__ void bias_relu_v2(const int num,
#if __CUDA_ARCH__ >= 800
packed_val = __hmax2(__half2(0, 0), packed_val);
#elif __CUDA_ARCH__ >= 530
packed_val = __hmul2(__hgt2(__half2(0, 0), packed_val), packed_val);
packed_val = __hmul2(__hgt2(packed_val, __half2(0, 0)), packed_val);
#else
packed_val.x = static_cast<int>(static_cast<float>(packed_val.x) > 0) *
static_cast<float>(packed_val.x);
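// A hedged aside on the hunk above (not part of the commit): __hgt2(v, zero)
// returns 1.0 in each 16-bit lane where v > 0 and 0.0 otherwise, so multiplying
// the mask back into v is a branch-free packed ReLU. The replaced operand order
// compared zero > v, which zeroed the positive lanes instead. A minimal
// standalone sketch, assuming cuda_fp16.h and sm_53 or newer:
//   __device__ __half2 relu_half2_sketch(__half2 v) {
//     const __half2 zero = __float2half2_rn(0.f);
//     return __hmul2(__hgt2(v, zero), v);  // per-lane max(v, 0)
//   }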
......@@ -292,19 +292,16 @@ void FCFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
errors::PermissionDenied(
"Weight padding in fc can not be used in GPU scope."));
auto blas = phi::funcs::GetBlas<DeviceContext, T>(context);
blas.GEMM(false,
false,
blas.GEMM(CblasNoTrans,
CblasNoTrans,
M,
N,
K,
static_cast<T>(1.0),
X,
K,
W,
N,
static_cast<T>(0.0),
Y,
N);
Y);
if (B == NULL) {
return;
}
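// A hedged note on the call above (not part of the commit): with CblasNoTrans
// twice, the row-major GEMM computes Y[M x N] = 1.0 * X[M x K] * W[K x N]
// + 0.0 * Y, i.e. the plain FC matmul. A naive reference loop, assuming
// contiguous row-major buffers:
//   for (int m = 0; m < M; ++m)
//     for (int n = 0; n < N; ++n) {
//       T acc = static_cast<T>(0);
//       for (int k = 0; k < K; ++k) acc += X[m * K + k] * W[k * N + n];
//       Y[m * N + n] = acc;
//     }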
......
......@@ -18,6 +18,10 @@ limitations under the License. */
namespace phi {
namespace funcs {
#define CUDNN_PER_ACTIVATION_THRESHOLD 10240
#define CUDNN_SPATIAL_THRESHOLD_TRAIN 880801
#define CUDNN_SPATIAL_THRESHOLD_EVAL 65535
inline void ExtractNCWHD(const phi::DDim &dims,
const DataLayout &data_layout,
int *N,
......
......@@ -26,6 +26,19 @@ __global__ void DistanceKernel(const T* start, const T* end, T* distance) {
}
}
inline __device__ bool SetBits(const int value, int* ptr) {
const int index = value >> 5;
const int mask = 1 << (value & 31);
const int old = atomicOr(ptr + index, mask);
return (mask & old) != 0;
}
inline __device__ bool TestBits(const int value, const int* ptr) {
const int index = value >> 5;
const int mask = 1 << (value & 31);
return (mask & ptr[index]) != 0;
}
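// A hedged usage sketch (not in the original): the helpers above implement a
// word-packed bitset over int words, 32 flags per word, so a domain of
// max_value indices needs (max_value + 31) / 32 zero-initialized words.
// A hypothetical kernel that marks which values occur and counts repeats:
__global__ void MarkSeenValuesSketch(const int* in,
                                     const int n,
                                     int* flags,
                                     int* duplicate_count) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < n) {
    // SetBits returns true only if the bit for in[tid] was already set.
    if (SetBits(in[tid], flags)) {
      atomicAdd(duplicate_count, 1);
    }
  }
}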
} // namespace sparse
} // namespace funcs
} // namespace phi
......@@ -14,16 +14,27 @@
#include "paddle/phi/kernels/add_n_kernel.h"
#include "paddle/phi/kernels/impl/add_n_kernel_impl.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
#define CEIL_DIV(x, y) (((x) + (y)-1) / (y))
template <class T>
__global__ void Sum2CUDAKernel(const T *in_0,
const T *in_1,
T *out,
int64_t N) {
int id = blockIdx.x * blockDim.x + threadIdx.x;
while (id < N) {
out[id] = in_0[id] + in_1[id];
id += blockDim.x * gridDim.x;
}
}
template <class T>
__global__ void SumArrayCUDAKernel(
T **in, T *out, int64_t N, size_t in_size, bool read_dst) {
......@@ -41,9 +52,26 @@ __global__ void SumArrayCUDAKernel(
}
}
template <class T>
__global__ void SumSelectedRowsCUDAKernel(T **sr_in_out,
int64_t N,
size_t rows) {
int id = blockIdx.x * blockDim.x + threadIdx.x;
while (id < N) {
for (int i = 0; i < 2 * rows; i += 2) {
const T *tmp = sr_in_out[i];
T *tmp_out = sr_in_out[i + 1];
if (tmp && tmp_out) {
tmp_out[id] += tmp[id];
}
}
id += blockDim.x * gridDim.x;
}
}
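// A hedged layout note on the kernel above: sr_in_out packs 2 * rows device
// pointers as [in_row_0, out_row_0, in_row_1, out_row_1, ...] and N is the
// row width, so each thread walks one column id across every
// (input row, output row) pair and accumulates out_row[id] += in_row[id].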
template <typename T, typename Context>
void AddNKernel(const Context &dev_ctx,
const std::vector<const DenseTensor *> &x,
const std::vector<const TensorBase *> &x,
DenseTensor *out) {
const size_t in_num = x.size();
......@@ -66,36 +94,38 @@ void AddNKernel(const Context &dev_ctx,
grids = dim3(CEIL_DIV(length, tile_size), 1, 1);
blocks = dim3(tile_size, 1, 1);
};
auto *out_ptr = dev_ctx.template Alloc<T>(out);
bool in_place = false;
if (x.size() > 0 && x[0]->initialized() && DenseTensor::classof(x[0])) {
if ((static_cast<const DenseTensor *>(x[0]))->data() == out->data()) {
in_place = true;
}
}
bool in_place = x[0] == out;
if (!in_place) {
auto *out_ptr = dev_ctx.template Alloc<T>(out);
if (in_num >= 1) {
auto &in_0_tensor = *x[0];
if (in_0_tensor.numel() > 0) {
in_place = (in_0_tensor.data<T>() == out_ptr);
}
if (!in_place && in_num >= 1 && DenseTensor::classof(x[0])) {
auto &in_0_tensor = *(static_cast<const DenseTensor *>(x[0]));
if (in_0_tensor.numel() > 0) {
in_place = (in_0_tensor.data<T>() == out_ptr);
}
}
// Sum of two tensors
if (in_num == 2) {
auto &in_0 = *x[0];
auto &in_1 = *x[1];
if (in_num == 2 && DenseTensor::classof(x[0]) && DenseTensor::classof(x[1])) {
auto &in_0 = *(static_cast<const DenseTensor *>(x[0]));
auto &in_1 = *(static_cast<const DenseTensor *>(x[1]));
int64_t length_0 = in_0.numel();
int64_t length_1 = in_1.numel();
if (length_0 && length_1 && in_0.initialized() && in_1.initialized()) {
if (length_0 && length_1 && in_0.IsInitialized() && in_1.IsInitialized()) {
auto result = EigenVector<T>::Flatten(*out);
auto &place = *dev_ctx.eigen_device();
auto in_0_e = EigenVector<T>::Flatten(in_0);
auto in_1_e = EigenVector<T>::Flatten(in_1);
result.device(place) = in_0_e + in_1_e;
} else if (length_0 && in_0.initialized()) {
} else if (length_0 && in_0.IsInitialized()) {
auto result = EigenVector<T>::Flatten(*out);
auto &place = *dev_ctx.eigen_device();
result.device(place) = EigenVector<T>::Flatten(in_0);
} else if (length_1 && in_1.initialized()) {
} else if (length_1 && in_1.IsInitialized()) {
auto result = EigenVector<T>::Flatten(*out);
auto &place = *dev_ctx.eigen_device();
result.device(place) = EigenVector<T>::Flatten(in_1);
......@@ -105,27 +135,90 @@ void AddNKernel(const Context &dev_ctx,
int start = in_place ? 1 : 0;
if (!in_place) {
funcs::SetConstant<Context, T> constant_functor;
phi::funcs::SetConstant<phi::GPUContext, T> constant_functor;
constant_functor(dev_ctx, out, static_cast<T>(0));
}
std::vector<const T *> in_data;
std::vector<int> selectrow_index;
int64_t lod_length = 0;
bool dst_write = false;
for (int i = start; i < in_num; ++i) {
auto &in_i = *x[i];
lod_length = in_i.numel();
if (lod_length && in_i.initialized()) {
in_data.emplace_back(in_i.data<T>());
if (DenseTensor::classof(x[i])) {
auto &in_i = *(static_cast<const DenseTensor *>(x[i]));
lod_length = in_i.numel();
if (lod_length && in_i.IsInitialized()) {
in_data.emplace_back(in_i.data<T>());
}
} else if (SelectedRows::classof(x[i])) {
selectrow_index.push_back(i);
}
}
// compute selected rows separately.
if (!selectrow_index.empty()) {
std::vector<const T *> sr_in_out_data;
size_t rows = 0;
int64_t length = 0;
for (auto index : selectrow_index) {
auto &sr = *(static_cast<const SelectedRows *>(x[index]));
auto &sr_value = sr.value();
auto &sr_rows = sr.rows();
auto row_numel = sr_value.numel() / sr_rows.size();
auto out_dims = out->dims();
PADDLE_ENFORCE_EQ(sr.height(),
out_dims[0],
errors::InvalidArgument(
"The table height of input must be same as output, "
"but received input height is %d"
", output height is %d",
sr.height(),
out_dims[0]));
PADDLE_ENFORCE_EQ(row_numel,
out->numel() / sr.height(),
errors::InvalidArgument(
"The table width of input must be same as output, "
"but received input width is %d"
", output width is %d",
row_numel,
out->numel() / sr.height()));
auto *sr_data = sr_value.data<T>();
auto *sr_out_data = out->data<T>();
rows += sr_rows.size();
length = row_numel;
for (size_t i = 0; i < sr_rows.size(); ++i) {
sr_in_out_data.emplace_back(&sr_data[i * row_numel]);
sr_in_out_data.emplace_back(&sr_out_data[sr_rows[i] * row_numel]);
}
}
if (!sr_in_out_data.empty()) {
auto tmp_sr_in_out_array = paddle::memory::Alloc(
dev_ctx.GetPlace(), sr_in_out_data.size() * sizeof(T *));
paddle::memory::Copy(dev_ctx.GetPlace(),
tmp_sr_in_out_array->ptr(),
phi::CPUPlace(),
reinterpret_cast<void *>(sr_in_out_data.data()),
sr_in_out_data.size() * sizeof(T *),
dev_ctx.stream());
T **sr_in_out_array_data =
reinterpret_cast<T **>(tmp_sr_in_out_array->ptr());
ComputeKernelParameter(length);
SumSelectedRowsCUDAKernel<T>
<<<grids, blocks, 0, stream>>>(sr_in_out_array_data, length, rows);
dst_write = true;
}
}
// if in_data is not empty, merge into one kernel call.
if (!in_data.empty()) {
auto tmp_in_array = paddle::memory::Alloc(
dev_ctx.GetPlace(),
in_data.size() * sizeof(T *),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
auto tmp_in_array =
paddle::memory::Alloc(dev_ctx.GetPlace(), in_data.size() * sizeof(T *));
paddle::memory::Copy(dev_ctx.GetPlace(),
tmp_in_array->ptr(),
......@@ -153,6 +246,17 @@ PD_REGISTER_KERNEL(add_n,
float,
double,
int,
int64_t,
phi::dtype::bfloat16,
phi::dtype::float16) {}
phi::dtype::float16,
int64_t) {}
PD_REGISTER_KERNEL(add_n_array,
GPU,
ALL_LAYOUT,
phi::AddNArrayKernel,
float,
double,
int,
phi::dtype::bfloat16,
phi::dtype::float16,
int64_t) {}
......@@ -852,15 +852,18 @@ void BatchNormGradRawKernel(const Context &ctx,
// ctx.GetPlace()),
// epsilon, saved_mean_data, saved_var_data));
#else
// CUDNN only supports small batch sizes
// const size_t CUDNN_PER_ACTIVATION_THRESHOLD = 131070;
const size_t CUDNN_PER_ACTIVATION_THRESHOLD = 10240;
const size_t CUDNN_SPATIAL_THRESHOLD = 880801;
const bool use_native_kernel =
((x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD) ||
(x_dims.size() == 3 && N >= CUDNN_SPATIAL_THRESHOLD));
if (use_native_kernel) {
if (x_dims.size() == 2) {
}
// CUDNN only supports small batch sizes
bool use_native_nhwc =
d_x ? (x_dims.size() == 4 && compute_format == DataLayout::kNHWC &&
H * W >= CUDNN_SPATIAL_THRESHOLD_EVAL)
: false;
const bool use_native_kernel =
((x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD) ||
(x_dims.size() == 3 && N >= CUDNN_SPATIAL_THRESHOLD_TRAIN));
if (use_native_nhwc || (d_x && d_scale && d_bias)) {
if (use_native_kernel || use_native_nhwc) {
if (x_dims.size() == 2 || use_native_nhwc) {
dim3 block;
dim3 grid;
const int block_size = 512;
......@@ -931,6 +934,21 @@ void BatchNormGradRawKernel(const Context &ctx,
flag_ptr);
}
// 2. reduce_sum(x, dy, mean) => dscale, dbias
BatchNormParamType<T> *dscale = nullptr;
BatchNormParamType<T> *dbias = nullptr;
bool with_scale = false;
if (d_scale && d_bias) {
dscale = ctx.template Alloc<BatchNormParamType<T>>(d_scale);
dbias = ctx.template Alloc<BatchNormParamType<T>>(d_bias);
} else {
DenseTensor dscale_mem =
phi::Empty<BatchNormParamType<T>, Context>(ctx, {C});
DenseTensor dbias_mem =
phi::Empty<BatchNormParamType<T>, Context>(ctx, {C});
dscale = dscale_mem.data<BatchNormParamType<T>>();
dbias = dbias_mem.data<BatchNormParamType<T>>();
}
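// A hedged note on the allocation above: stage 2 of the channel-last backward
// always produces per-channel dscale/dbias sums, and stage 3 reads them again
// when computing dx, so {C}-sized scratch buffers are still needed even when
// the caller did not ask for d_scale/d_bias.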
BNBackward2DChannelLastStage2<T, block_size>
<<<grid, block, 0, ctx.stream()>>>(
transformed_d_y.template data<T>(),
......@@ -942,8 +960,8 @@ void BatchNormGradRawKernel(const Context &ctx,
H * W * D,
epsilon,
block_data_ptr,
ctx.template Alloc<BatchNormParamType<T>>(d_scale),
ctx.template Alloc<BatchNormParamType<T>>(d_bias),
dscale,
dbias,
flag_ptr);
// 3. elementwise_mul(scale, mean, inv_var, dy, dscale, dbias) => dx
......@@ -952,8 +970,8 @@ void BatchNormGradRawKernel(const Context &ctx,
transformed_d_y.template data<T>(),
transformed_x.template data<T>(),
scale.template data<BatchNormParamType<T>>(),
d_scale->data<BatchNormParamType<T>>(),
d_bias->data<BatchNormParamType<T>>(),
dscale,
dbias,
mean_ptr,
variance_ptr,
C,
......@@ -1163,6 +1181,7 @@ void BatchNormGradRawKernel(const Context &ctx,
paddle::platform::dynload::cudnnDestroyTensorDescriptor(
bn_param_desc_));
#endif
} else {
const auto *running_mean = mean.get_ptr();
const auto *running_var = variance.get_ptr();
......
......@@ -72,6 +72,40 @@ static __global__ void BNForwardInference(const T *x,
}
}
template <typename T>
static __global__ void InverseVariance(const BatchNormParamType<T> *variance,
const double epsilon,
const int C,
BatchNormParamType<T> *inv_variance) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < C) {
inv_variance[tid] = 1 / sqrt(variance[tid] + epsilon);
}
}
template <typename T, phi::DataLayout layout>
static __global__ void BN1DForwardInference(
const T *x,
const BatchNormParamType<T> *mean,
const BatchNormParamType<T> *inv_variance,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *bias,
const int C,
const int N,
const int HxW,
const double epsilon,
T *y) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
int num = N * C * HxW;
for (int i = gid; i < num; i += stride) {
const int c = layout == phi::DataLayout::kNCHW ? i / HxW % C : i % C;
BatchNormParamType<T> x_sub_mean =
static_cast<BatchNormParamType<T>>(x[i]) - mean[c];
y[i] = static_cast<T>(scale[c] * x_sub_mean * inv_variance[c] + bias[c]);
}
}
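// A hedged summary of the two kernels above: InverseVariance precomputes
//   inv_variance[c] = 1 / sqrt(variance[c] + epsilon)
// once per channel, and BN1DForwardInference then applies the inference-time
// transform y = scale[c] * (x - mean[c]) * inv_variance[c] + bias[c], so the
// per-element loop avoids a sqrt and a division per value.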
template <typename T, int BlockDim, phi::DataLayout layout>
static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining(
const T *x,
......@@ -691,9 +725,6 @@ void BatchNormKernel(const Context &ctx,
auto handle = ctx.cudnn_handle();
const size_t CUDNN_PER_ACTIVATION_THRESHOLD = 10240;
const size_t CUDNN_SPATIAL_THRESHOLD = 880801;
// Now, depending on whether we are running test or not, we have two paths.
// It is training mode when it's not reference AND not using pre-trained
// model.
......@@ -797,8 +828,8 @@ void BatchNormKernel(const Context &ctx,
// epsilon));
#else
const bool use_native_kernel =
((x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD) ||
(x_dims.size() == 3 && N >= CUDNN_SPATIAL_THRESHOLD));
(x_dims.size() == 2 ||
(x_dims.size() == 3 && N >= CUDNN_SPATIAL_THRESHOLD_EVAL));
if (use_native_kernel) {
const int block_size = 256;
const int grid_size = (N * C * H * W * D + block_size - 1) / block_size;
......@@ -816,18 +847,43 @@ void BatchNormKernel(const Context &ctx,
epsilon,
transformed_y.template data<T>());
} else {
BNForwardInference<T, DataLayout::kNHWC>
<<<grid_size, block_size, 0, ctx.stream()>>>(
transformed_x.template data<T>(),
est_mean->template data<BatchNormParamType<T>>(),
est_var->template data<BatchNormParamType<T>>(),
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
C,
N,
H * W * D,
epsilon,
transformed_y.template data<T>());
if (x_dims.size() == 2) {
DenseTensor inv_var = phi::Empty<BatchNormParamType<T>>(ctx, {C});
auto *inv_var_ptr = inv_var.data<BatchNormParamType<T>>();
const int threads = 512 > C ? C : 512;
const int blocks = (C + 511) / 512;
InverseVariance<T><<<blocks, threads>>>(
est_var->template data<BatchNormParamType<T>>(),
epsilon,
C,
inv_var_ptr);
BN1DForwardInference<T, DataLayout::kNHWC>
<<<grid_size, block_size, 0, ctx.stream()>>>(
transformed_x.template data<T>(),
est_mean->template data<BatchNormParamType<T>>(),
// est_var->template data<BatchNormParamType<T>>(),
inv_var_ptr,
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
C,
N,
H * W * D,
epsilon,
transformed_y.template data<T>());
} else {
BNForwardInference<T, DataLayout::kNHWC>
<<<grid_size, block_size, 0, ctx.stream()>>>(
transformed_x.template data<T>(),
est_mean->template data<BatchNormParamType<T>>(),
est_var->template data<BatchNormParamType<T>>(),
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
C,
N,
H * W * D,
epsilon,
transformed_y.template data<T>());
}
}
} else {
PADDLE_ENFORCE_GPU_SUCCESS(
......@@ -949,7 +1005,7 @@ void BatchNormKernel(const Context &ctx,
// const size_t CUDNN_PER_ACTIVATION_THRESHOLD = 131070;
const bool use_native_kernel =
((x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD) ||
(x_dims.size() == 3 && N >= CUDNN_SPATIAL_THRESHOLD));
(x_dims.size() == 3 && N >= CUDNN_SPATIAL_THRESHOLD_TRAIN));
if (use_native_kernel) {
dim3 block;
dim3 grid;
......
......@@ -41,7 +41,7 @@ void MultiplexKernel(const Context& ctx,
paddle::framework::TensorCopySync(ids, phi::CPUPlace(), &index_t_cpu);
auto* index = index_t_cpu.data<int32_t>();
auto stream = ctx.stream();
for (auto i = 0; i < rows; i++) {
for (auto i = 0; i < ids.dims()[0]; i++) {
int32_t k = index[i];
PADDLE_ENFORCE_GE(
k, 0, errors::PreconditionNotMet("index must be nonnegative."));
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/kernels/add_n_kernel.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
namespace phi {
template <typename T, typename Context>
void AddNArrayKernel(const Context& dev_ctx,
const std::vector<const TensorArray*>& x,
TensorArray* out) {
for (auto& ele : *out) {
dev_ctx.template Alloc<T>(&ele);
}
bool in_place = true;
if (x.size() > 0 && x[0]->size() == out->size()) {
for (size_t i = 0; i < out->size(); i++) {
if (x[0]->at(i).IsInitialized() &&
out->at(i).data() != x[0]->at(i).data()) {
in_place = false;
break;
}
}
} else {
in_place = false;
}
for (size_t i = in_place ? 1 : 0; i < x.size(); ++i) {
auto* in_array = x.at(i);
for (size_t j = 0; j < in_array->size(); ++j) {
if (in_array->at(j).IsInitialized() && (in_array->at(j).numel() != 0)) {
if (j >= out->size()) {
out->resize(j + 1);
}
if (!out->at(j).IsInitialized() || (out->at(j).numel() == 0)) {
Copy<Context>(dev_ctx,
in_array->at(j),
in_array->at(j).place(),
false,
&out->at(j));
out->at(j).set_lod(in_array->at(j).lod());
} else {
PADDLE_ENFORCE_EQ(
out->at(j).lod(),
in_array->at(j).lod(),
phi::errors::InvalidArgument(
"The lod message between inputs[%d] and"
" outputs[%d] must be same, but now is not same.",
j,
j));
auto in = EigenVector<T>::Flatten(in_array->at(j));
auto result = EigenVector<T>::Flatten(out->at(j));
result.device(*dev_ctx.eigen_device()) = result + in;
}
}
}
}
}
} // namespace phi
......@@ -54,11 +54,8 @@ void FoldGradKernel(const Context& ctx,
DDim out_shape =
make_ddim({n_output_plane, output_sizes[0], output_sizes[1]});
DDim input_matrix_shape = make_ddim({x_dims[0],
kernel_sizes[0],
kernel_sizes[1],
output_height,
output_width});
DDim input_matrix_shape = make_ddim(
{1, kernel_sizes[0], kernel_sizes[1], output_height, output_width});
paddle::operators::math::
Im2ColFunctor<paddle::operators::math::ColFormat::kCFO, Context, T>
......
......@@ -56,11 +56,8 @@ void FoldKernel(const Context& ctx,
DDim output_shape =
make_ddim({n_output_plane, output_sizes[0], output_sizes[1]});
DDim input_matrix_shape = make_ddim({x_dims[0],
kernel_sizes[0],
kernel_sizes[1],
output_height,
output_width});
DDim input_matrix_shape = make_ddim(
{1, kernel_sizes[0], kernel_sizes[1], output_height, output_width});
phi::funcs::SetConstant<Context, T> set_zero;
set_zero(ctx, out, static_cast<T>(0));
......@@ -68,6 +65,7 @@ void FoldKernel(const Context& ctx,
for (int i = 0; i < batch_size; i++) {
DenseTensor out_batch =
out->Slice(i, i + 1).Resize(output_shape); // im size=3
DenseTensor in_batch =
x.Slice(i, i + 1).Resize(input_matrix_shape); // col size=5
col2im(ctx, in_batch, dilations, strides, paddings, &out_batch);
......
......@@ -169,7 +169,7 @@ static void linalg_solve(const Context& dev_ctx,
out_tmp.Resize(out->dims());
out_tmp = *out;
phi::SqueezeKernel<T, Context>(dev_ctx, out_tmp, {-1}, out);
phi::Squeeze<T, Context>(dev_ctx, out_tmp, {-1}, out);
} else {
PADDLE_ENFORCE_EQ(
x_dim[x_dim_size - 1],
......
......@@ -23,11 +23,7 @@ void SqueezeKernel(const Context& dev_ctx,
const DenseTensor& x,
const IntArray& axes,
DenseTensor* out) {
auto x_dims = x.dims();
std::vector<int32_t> tmp(axes.GetData().begin(), axes.GetData().end());
auto out_dims = funcs::GetOutputSqueezeShape(tmp, x_dims, true);
out->Resize(out_dims);
auto out_dims = out->dims();
dev_ctx.template Alloc<T>(out);
phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out);
out->Resize(out_dims); // copy will reset the dims.
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -14,24 +14,14 @@
#pragma once
#include <string>
#include <unordered_set>
#include "paddle/phi/core/selected_rows.h"
#include "paddle/fluid/inference/analysis/analysis_pass.h"
namespace phi {
namespace sr {
namespace paddle {
namespace inference {
namespace analysis {
struct Argument;
class IrInferCleanGraphPass : public AnalysisPass {
public:
void RunImpl(Argument *argument) override;
std::string repr() const override { return "ir_graph_clean_pass"; }
};
} // namespace analysis
} // namespace inference
} // namespace paddle
template <typename T, typename Context>
void AddNKernel(const Context& dev_ctx,
const std::vector<const SelectedRows*>& x,
SelectedRows* out);
} // namespace sr
} // namespace phi
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -12,38 +12,14 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.h"
#include "paddle/phi/kernels/selected_rows/impl/add_n_kernel_impl.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/node.h"
namespace paddle {
namespace inference {
namespace analysis {
void IrInferCleanGraphPass::RunImpl(Argument* argument) {
auto& graph = argument->main_graph();
auto is_valid_node = [](framework::ir::Node* x) {
return x && IsControlDepVar(*x) && x->IsVar() && !x->Var();
};
std::unordered_set<const framework::ir::Node*> invalid_nodes;
int valid_op = 0;
for (auto* node : graph.Nodes()) {
PADDLE_ENFORCE_NOT_NULL(node,
platform::errors::PreconditionNotMet(
"The node should not be nullptr."));
if (is_valid_node(node)) {
invalid_nodes.insert(node);
} else if (node->IsOp()) {
++valid_op;
}
}
GraphSafeRemoveNodes(&graph, invalid_nodes);
}
} // namespace analysis
} // namespace inference
} // namespace paddle
PD_REGISTER_KERNEL(add_n_sr,
CPU,
ALL_LAYOUT,
phi::sr::AddNKernel,
float,
double,
int,
phi::dtype::bfloat16,
int64_t) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/selected_rows/impl/add_n_kernel_impl.h"
PD_REGISTER_KERNEL(add_n_sr,
GPU,
ALL_LAYOUT,
phi::sr::AddNKernel,
float,
double,
int,
phi::dtype::bfloat16,
phi::dtype::float16,
int64_t) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/kernels/selected_rows/add_n_kernel.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
namespace sr {
template <typename T, typename Context>
void AddNKernel(const Context &dev_ctx,
const std::vector<const SelectedRows *> &x,
SelectedRows *out) {
dev_ctx.template Alloc<T>(out->mutable_value());
bool in_place = false;
if (x.size() > 0 && x[0]->value().Holder() == out->value().Holder()) {
in_place = true;
}
if (in_place && x.size() < 2) {
return;
}
std::vector<const phi::SelectedRows *> inputs;
SelectedRows temp_in0;
if (in_place) {
auto &in0 = *x[0];
temp_in0.set_height(in0.height());
temp_in0.set_rows(in0.rows());
Copy<Context>(
dev_ctx, in0.value(), in0.place(), false, temp_in0.mutable_value());
inputs.push_back(&temp_in0);
for (size_t i = 1; i < x.size(); ++i) {
auto &in = *x[i];
if (in.rows().size() > 0) {
inputs.push_back(&in);
}
}
} else {
for (auto in_var : x) {
auto &in = *in_var;
if (in.rows().size() > 0) {
inputs.push_back(in_var);
}
}
}
out->mutable_rows()->clear();
bool has_data = false;
for (auto &in : inputs) {
if (in->rows().size() > 0) {
has_data = true;
break;
}
}
if (has_data) {
paddle::operators::math::scatter::MergeAdd<Context, T> merge_add;
merge_add(dev_ctx, inputs, out);
out->SyncIndex();
} else {
// no data, just set an empty out tensor.
auto *out_dense = out->mutable_value();
out_dense->clear();
out_dense->Resize(phi::make_ddim({0}));
dev_ctx.template Alloc<T>(out_dense);
}
}
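// A hedged note on the merge path above: scatter::MergeAdd is expected to sum
// value rows that share the same row index across all inputs, so each row id
// appears at most once in the output SelectedRows, and SyncIndex then rebuilds
// the id-to-index lookup for the merged rows.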
} // namespace sr
} // namespace phi
......@@ -23,25 +23,25 @@ namespace phi {
namespace sparse {
template <typename T, typename Context>
void BatchNormKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& scale,
const DenseTensor& bias,
const DenseTensor& mean,
const DenseTensor& variance,
float momentum,
float epsilon,
const std::string& data_layout,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
SparseCooTensor* y,
DenseTensor* mean_out,
DenseTensor* variance_out,
DenseTensor* saved_mean,
DenseTensor* saved_variance,
DenseTensor* reserve_space);
void BatchNormCooKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& scale,
const DenseTensor& bias,
const DenseTensor& mean,
const DenseTensor& variance,
float momentum,
float epsilon,
const std::string& data_layout,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
SparseCooTensor* y,
DenseTensor* mean_out,
DenseTensor* variance_out,
DenseTensor* saved_mean,
DenseTensor* saved_variance,
DenseTensor* reserve_space);
} // namespace sparse
} // namespace phi
......@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/sparse/coalesce_kernel.h"
#include <thrust/sort.h>
#include <thrust/unique.h>
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/kernel_registry.h"
......@@ -22,8 +23,6 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/sparse/flatten_indices.cu.h"
#include "paddle/phi/kernels/funcs/sparse/scatter.cu.h"
#include "paddle/phi/kernels/funcs/sparse/utils.cu.h"
#include <thrust/sort.h>
#include <thrust/unique.h>
namespace phi {
namespace sparse {
......
......@@ -15,8 +15,14 @@ limitations under the License. */
#pragma once
#include <thrust/remove.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#ifdef __NVCC__
#include <cub/block/block_scan.cuh>
#endif
#ifdef __HIPCC__
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;
#endif
#include "paddle/phi/kernels/sparse/conv_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
......@@ -167,7 +173,7 @@ inline void GatherV2(const GPUContext& dev_ctx,
template <typename IntT>
__global__ void UniqueKernel(const IntT* in_indexs,
const int rulebook_len,
int* out_index_table,
int* index_flags,
int* out_indexs,
int* nnz) {
extern __shared__ int cache[];
......@@ -182,8 +188,8 @@ __global__ void UniqueKernel(const IntT* in_indexs,
if (i < rulebook_len) {
// atomicOr only supports int
int index = static_cast<int>(in_indexs[i]);
int flag = atomicOr(out_index_table + index, 1);
if (flag == 0) {
const bool flag = phi::funcs::sparse::SetBits(index, index_flags);
if (!flag) {
int j = atomicAdd(&count, 1);
cache[j] = index;
}
......@@ -199,6 +205,88 @@ __global__ void UniqueKernel(const IntT* in_indexs,
}
}
inline __device__ uint32_t BitCount(const uint32_t data) {
uint32_t count = data;
count = (count & 0x55555555) + ((count >> 1) & 0x55555555);
count = (count & 0x33333333) + ((count >> 2) & 0x33333333);
count = (count & 0x0f0f0f0f) + ((count >> 4) & 0x0f0f0f0f);
count = (count & 0x00ff00ff) + ((count >> 8) & 0x00ff00ff);
count = (count & 0x0000ffff) + ((count >> 16) & 0x0000ffff);
return count;
}
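// A hedged note on the helper above: this is the classic SWAR popcount
// (summing 1-, 2-, 4-, 8- and 16-bit partial counts in parallel); in device
// code the CUDA intrinsic __popc(data) yields the same 32-bit population
// count.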
static __global__ void GetOutIndexsCounter(const int* flags,
const int n,
int* out) {
int tid = threadIdx.x + blockDim.x * blockIdx.x;
__shared__ int block_count;
if (threadIdx.x == 0) {
block_count = 0;
}
__syncthreads();
if (tid < n) {
// count the set bits in flags[tid]
uint32_t count = BitCount(static_cast<uint32_t>(flags[tid]));
// add to block_count
// TODO(zhangkaihuo): replace with block reduce_sum
atomicAdd(&block_count, static_cast<int>(count));
}
__syncthreads();
// write to out
if (threadIdx.x == 0) {
out[blockIdx.x] = block_count;
}
}
template <int BS>
__global__ void GetOutIndexs(const int* flags,
const int n,
const int* offsets,
const int out_nnz,
int* out) {
int tid = threadIdx.x + blockDim.x * blockIdx.x;
__shared__ int block_counts[BS];
__shared__ int block_outs[BS * 32];
int count = 0;
if (tid < n) {
// count the set bits in flags[tid]
int flag = flags[tid];
count = BitCount(static_cast<uint32_t>(flag));
}
// call block prefix_sum
// using namespace cub;
typedef cub::BlockScan<int, BS> BlockScan;
__shared__ typename BlockScan::TempStorage temp_storage;
BlockScan(temp_storage).ExclusiveSum(count, count);
__syncthreads();
// expand the set bits of flags[tid] into indices in shared block_outs
if (tid < n) {
// each set bit at position i maps back to original index (tid << 5) + i
int flag = flags[tid];
// int j = block_counts[threadIdx.x];
int j = count;
// TODO(zhangkaihuo): opt the loop
for (int i = 0; i < 32; ++i) {
if ((1 & (flag >> i)) == 1) {
block_outs[j++] = (tid << 5) + i;
}
}
}
__syncthreads();
// copy this block's compacted indices from block_outs to the global output
int start = offsets[blockIdx.x];
int end = blockIdx.x == gridDim.x - 1 ? out_nnz : offsets[blockIdx.x + 1];
for (int i = threadIdx.x; i < end - start; i += blockDim.x) {
out[start + i] = block_outs[i];
}
}
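// A hedged reading of the two kernels above: GetOutIndexsCounter reduces the
// popcount of each block's slice of the flag words into out[blockIdx.x]; an
// exclusive scan over those per-block counts (issued later on the stream with
// thrust::exclusive_scan) turns them into per-block write offsets; and
// GetOutIndexs re-counts each word, uses cub::BlockScan to get every thread's
// offset within the block, expands each set bit back to its original index
// ((word << 5) + bit) in shared memory, and finally copies the block's
// compacted slice to the global output at its offset.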
template <typename IntT>
__global__ void GroupIndexs(const int* out_index_table,
const int n,
......@@ -284,7 +372,6 @@ __global__ void ProductRuleBookKernel(const T* x_indices,
atomicAdd(&counter_buf[kernel_index], 1);
kernel_i = kernel_index;
}
// rulebook[kernel_index * non_zero_num + i] = kernel_i;
rulebook[kernel_index * non_zero_num + i] = in_i;
rulebook[kernel_index * non_zero_num + offset + i] = out_index;
++kernel_index;
......@@ -299,17 +386,19 @@ __global__ void ProductRuleBookKernel(const T* x_indices,
}
template <typename IntT>
__global__ void GetOutIndexTable(const IntT* indices,
const IntT non_zero_num,
const Dims4D dims,
int* out_index_table) {
__global__ void GetOutIndexTable1(const IntT* indices,
const IntT non_zero_num,
const Dims4D dims,
int* index_flags,
int* out_index_table) {
CUDA_KERNEL_LOOP_TYPE(i, non_zero_num, int64_t) {
IntT batch = indices[i];
IntT in_z = indices[i + non_zero_num];
IntT in_y = indices[i + 2 * non_zero_num];
IntT in_x = indices[i + 3 * non_zero_num];
IntT index = PointToIndex(batch, in_x, in_y, in_z, dims);
out_index_table[index] = i == 0 ? -1 : i;
phi::funcs::sparse::SetBits(index, index_flags);
out_index_table[index] = i;
}
}
......@@ -375,6 +464,7 @@ __global__ void ProductSubmRuleBookKernel(const T* x_indices,
const Dims4D paddings,
const Dims4D dilations,
const Dims4D strides,
const int* index_flags,
const int* out_index_table,
T* rulebook,
int* counter) {
......@@ -417,9 +507,10 @@ __global__ void ProductSubmRuleBookKernel(const T* x_indices,
T out_x = (in_x + paddings[3] - kx * dilations[3]) / strides[3];
out_index = phi::funcs::sparse::PointToIndex<Dims4D>(
batch, out_x, out_y, out_z, out_dims);
int real_out_index = out_index_table[out_index];
if (real_out_index != 0) {
real_out_index = real_out_index == -1 ? 0 : real_out_index;
const bool flag =
phi::funcs::sparse::TestBits(out_index, index_flags);
if (flag) {
int real_out_index = out_index_table[out_index];
in_i = i;
int buf_i = atomicAdd(&counter_buf[kernel_index], 1);
kernel_i = kernel_index;
......@@ -440,7 +531,6 @@ __global__ void ProductSubmRuleBookKernel(const T* x_indices,
__syncthreads();
for (int i = 0; i < kernel_size; i++) {
if (threadIdx.x < counter_buf[i]) {
// rulebook[i * non_zero_num + counter_buf2[i] + threadIdx.x] = i;
rulebook[i * non_zero_num + counter_buf2[i] + threadIdx.x] =
rulebook_buf[i * blockDim.x + threadIdx.x];
rulebook[i * non_zero_num + offset + counter_buf2[i] + threadIdx.x] =
......@@ -575,12 +665,18 @@ int ProductRuleBook(const Context& dev_ctx,
DenseTensorMeta rulebook_meta(
indices_dtype, {rulebook_rows, rulebook_cols}, DataLayout::NCHW);
int64_t table_size = 1;
int table_size = 1;
for (int i = 0; i < out_dims.size() - 1; i++) {
table_size *= out_dims[i];
}
DenseTensor out_index_table = phi::Empty<int>(dev_ctx, {table_size});
int* out_index_table_ptr = out_index_table.data<int>();
// index_flags: a bitset marking which output indices exist
int index_flags_size = (table_size + 31) / 32;
DenseTensor index_flags = phi::Empty<int>(dev_ctx, {index_flags_size});
int* index_flags_ptr = index_flags.data<int>();
phi::backends::gpu::GpuMemsetAsync(
index_flags_ptr, 0, sizeof(int) * index_flags.numel(), dev_ctx.stream());
if (subm) {
DenseTensor tmp_rulebook = phi::Empty(dev_ctx, std::move(rulebook_meta));
......@@ -590,16 +686,16 @@ int ProductRuleBook(const Context& dev_ctx,
phi::Copy(dev_ctx, x.indices(), dev_ctx.GetPlace(), false, &out_indices);
phi::backends::gpu::GpuMemsetAsync(
out_index_table_ptr, 0, sizeof(int) * table_size, dev_ctx.stream());
auto config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1);
GetOutIndexTable<IntT><<<config.block_per_grid,
config.thread_per_block,
0,
dev_ctx.stream()>>>(
out_indices.data<IntT>(), non_zero_num, d_x_dims, out_index_table_ptr);
GetOutIndexTable1<IntT><<<config.block_per_grid,
config.thread_per_block,
0,
dev_ctx.stream()>>>(out_indices.data<IntT>(),
non_zero_num,
d_x_dims,
index_flags_ptr,
out_index_table_ptr);
size_t cache_size =
kernel_size * 2 * sizeof(int) +
......@@ -625,6 +721,7 @@ int ProductRuleBook(const Context& dev_ctx,
d_paddings,
d_dilations,
d_strides,
index_flags_ptr,
out_index_table_ptr,
rulebook_ptr,
counter_ptr);
......@@ -695,9 +792,6 @@ int ProductRuleBook(const Context& dev_ctx,
int* out_index_ptr = out_index->data<int>();
int* unique_key_ptr = unique_key.data<int>();
phi::backends::gpu::GpuMemsetAsync(
out_index_table_ptr, 0, sizeof(int) * table_size, dev_ctx.stream());
phi::backends::gpu::GpuMemsetAsync(
unique_key_ptr, 0, sizeof(int), dev_ctx.stream());
......@@ -708,7 +802,7 @@ int ProductRuleBook(const Context& dev_ctx,
cache_size,
dev_ctx.stream()>>>(rulebook_ptr + rulebook_len,
rulebook_len,
out_index_table_ptr,
index_flags_ptr,
out_index_ptr,
unique_key_ptr);
......@@ -719,13 +813,25 @@ int ProductRuleBook(const Context& dev_ctx,
gpuMemcpyDeviceToHost,
dev_ctx.stream());
dev_ctx.Wait();
const int threads = 256;
const int blocks = (index_flags.numel() + threads - 1) / threads;
GetOutIndexsCounter<<<blocks, threads, 0, dev_ctx.stream()>>>(
index_flags_ptr, index_flags.numel(), out_index_table_ptr);
#ifdef PADDLE_WITH_HIP
thrust::sort(thrust::hip::par.on(dev_ctx.stream()),
thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()),
#else
thrust::sort(thrust::cuda::par.on(dev_ctx.stream()),
thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()),
#endif
out_index_ptr,
out_index_ptr + out_nnz);
out_index_table_ptr,
out_index_table_ptr + blocks,
out_index_table_ptr);
GetOutIndexs<threads>
<<<blocks, threads, 0, dev_ctx.stream()>>>(index_flags_ptr,
index_flags.numel(),
out_index_table_ptr,
out_nnz,
out_index_ptr);
const int64_t sparse_dim = 4;
phi::DenseTensor out_indices =
......
......@@ -22,6 +22,9 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/scatter.cu.h"
#include "paddle/phi/kernels/funcs/sparse/scatter.cu.h"
#include "paddle/phi/kernels/sparse/gpu/conv.cu.h"
#ifdef PADDLE_WITH_CUTLASS
#include "paddle/phi/kernels/sparse/gpu/gather_gemm_scatter.h"
#endif
#include "glog/logging.h"
......@@ -120,85 +123,129 @@ void Conv3dCooGPUKernel(const GPUContext& dev_ctx,
dev_ctx, x, key, tmp_rulebook, h_counter, out, rulebook, counter);
}
// 2. gather
phi::DenseTensor in_features =
phi::Empty<T>(dev_ctx, {rulebook_len, in_channels});
phi::DenseTensor out_features =
phi::Empty<T>(dev_ctx, {rulebook_len, out_channels});
T* in_features_ptr = in_features.data<T>();
T* out_features_ptr = out_features.data<T>();
phi::funcs::SetConstant<GPUContext, T> set_zero;
set_zero(dev_ctx, &out_features, static_cast<T>(0.0f));
Gather<T, IntT>(dev_ctx,
x.values().data<T>(),
rulebook_ptr,
rulebook_len,
in_channels,
in_features_ptr);
// 3. call gemm for every weight
auto blas = phi::funcs::GetBlas<GPUContext, T>(dev_ctx);
auto* out_values = out->mutable_values();
T* out_values_ptr = out_values->data<T>();
set_zero(dev_ctx, out_values, static_cast<T>(0.0f));
if (subm) {
auto config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1);
unique_value.ResizeAndAllocate(
{static_cast<int>(out->nnz() * kernel_size)});
out_index.ResizeAndAllocate({static_cast<int>(rulebook_len)});
int* out_index_ptr = out_index.data<int>();
int* unique_value_ptr = unique_value.data<int>();
phi::backends::gpu::GpuMemsetAsync(
out_index_ptr, 0, sizeof(int) * rulebook_len, dev_ctx.stream());
GroupIndexs<<<config.block_per_grid,
config.thread_per_block,
0,
dev_ctx.stream()>>>(rulebook_len,
kernel_size,
rulebook_ptr + rulebook_len,
out_index_ptr,
unique_value_ptr);
#ifdef PADDLE_WITH_CUTLASS
bool cutlass = true;
if (dev_ctx.GetComputeCapability() < 75) cutlass = false;
if (in_channels % 4 != 0 || out_channels % 4 != 0) {
if (std::is_same<T, phi::dtype::float16>::value) cutlass = false;
if (std::is_same<T, float>::value) cutlass = false;
}
if (!std::is_same<IntT, int32_t>::value) cutlass = false;
if (cutlass) {
auto* out_values = out->mutable_non_zero_elements();
T* out_values_ptr = out_values->data<T>();
phi::funcs::SetConstant<GPUContext, T> set_zero;
set_zero(dev_ctx, out_values, static_cast<T>(0.0f));
const T* kernel_ptr = kernel.data<T>();
for (int i = 0; i < kernel_size; i++) {
if (h_counter_ptr[i] <= 0) {
continue;
}
const T* kernel_ptr = kernel.data<T>();
for (int i = 0; i < kernel_size; i++) {
if (h_counter_ptr[i] <= 0) {
continue;
const int M = h_counter_ptr[i];
const int K = in_channels;
const int N = out_channels;
const T* tmp_kernel_ptr = kernel_ptr + i * K * N;
const IntT* gather_indices = rulebook_ptr + h_offsets_ptr[i];
const IntT* scatter_indices =
rulebook_ptr + rulebook_len + h_offsets_ptr[i];
dispatchKernel(dev_ctx,
x.non_zero_elements().data<T>(),
tmp_kernel_ptr,
out_values_ptr,
out_values_ptr,
M,
N,
K,
gather_indices,
scatter_indices,
cutlass,
x.dtype());
}
} else {
#endif
if (subm) {
auto config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1);
unique_value.ResizeAndAllocate(
{static_cast<int>(out->nnz() * kernel_size)});
out_index.ResizeAndAllocate({static_cast<int>(rulebook_len)});
int* out_index_ptr = out_index.data<int>();
int* unique_value_ptr = unique_value.data<int>();
phi::backends::gpu::GpuMemsetAsync(
out_index_ptr, 0, sizeof(int) * rulebook_len, dev_ctx.stream());
GroupIndexs<<<config.block_per_grid,
config.thread_per_block,
0,
dev_ctx.stream()>>>(rulebook_len,
kernel_size,
rulebook_ptr + rulebook_len,
out_index_ptr,
unique_value_ptr);
}
// 2. gather
phi::DenseTensor in_features =
phi::Empty<T>(dev_ctx, {rulebook_len, in_channels});
phi::DenseTensor out_features =
phi::Empty<T>(dev_ctx, {rulebook_len, out_channels});
T* in_features_ptr = in_features.data<T>();
T* out_features_ptr = out_features.data<T>();
phi::funcs::SetConstant<GPUContext, T> set_zero;
set_zero(dev_ctx, &out_features, static_cast<T>(0.0f));
// call gemm: (n, in_channels) * (in_channels, out_channels)
const int M = h_counter_ptr[i];
const int K = in_channels;
const int N = out_channels;
T* tmp_in_ptr = in_features_ptr + h_offsets_ptr[i] * in_channels;
const T* tmp_kernel_ptr = kernel_ptr + i * K * N;
T* tmp_out_ptr = out_features_ptr + h_offsets_ptr[i] * out_channels;
blas.GEMM(CblasNoTrans,
CblasNoTrans,
M,
N,
K,
static_cast<T>(1),
tmp_in_ptr,
tmp_kernel_ptr,
static_cast<T>(0),
tmp_out_ptr);
}
Gather<T, IntT>(dev_ctx,
x.values().data<T>(),
rulebook_ptr,
rulebook_len,
in_channels,
in_features_ptr);
// 3. call gemm for every weight
auto blas = phi::funcs::GetBlas<GPUContext, T>(dev_ctx);
auto* out_values = out->mutable_values();
T* out_values_ptr = out_values->data<T>();
set_zero(dev_ctx, out_values, static_cast<T>(0.0f));
// 4. scatter
phi::funcs::sparse::ScatterV2<T>(dev_ctx,
out_features_ptr,
out_index.data<int>(),
unique_value.data<int>(),
out->nnz(),
kernel_size,
out_channels,
1,
out_values_ptr);
const T* kernel_ptr = kernel.data<T>();
for (int i = 0; i < kernel_size; i++) {
if (h_counter_ptr[i] <= 0) {
continue;
}
// call gemm: (n, in_channels) * (in_channels, out_channels)
const int M = h_counter_ptr[i];
const int K = in_channels;
const int N = out_channels;
T* tmp_in_ptr = in_features_ptr + h_offsets_ptr[i] * in_channels;
const T* tmp_kernel_ptr = kernel_ptr + i * K * N;
T* tmp_out_ptr = out_features_ptr + h_offsets_ptr[i] * out_channels;
blas.GEMM(CblasNoTrans,
CblasNoTrans,
M,
N,
K,
static_cast<T>(1),
tmp_in_ptr,
tmp_kernel_ptr,
static_cast<T>(0),
tmp_out_ptr);
}
// 4. scatter
phi::funcs::sparse::ScatterV2<T>(dev_ctx,
out_features_ptr,
out_index.data<int>(),
unique_value.data<int>(),
out->nnz(),
kernel_size,
out_channels,
1,
out_values_ptr);
#ifdef PADDLE_WITH_CUTLASS
}
#endif
}
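// A hedged summary of the branch above: on the non-CUTLASS path, features
// selected by the rulebook are gathered into a dense (rulebook_len x
// in_channels) buffer, one dense GEMM per kernel offset multiplies them by the
// matching (in_channels x out_channels) slice of the weights, and ScatterV2
// accumulates the partial results into out_values by output index. The
// CUTLASS path instead hands the gather and scatter index lists to
// dispatchKernel so the fused kernel reads and accumulates rows directly.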
/**
......