Commit dbe08e9b authored by yuguo960516yuguo

2.4.2

parent b5499578
......@@ -149,7 +149,7 @@ __global__ void bias_relu_v2(const int num,
#if __CUDA_ARCH__ >= 800
packed_val = __hmax2(__half2(0, 0), packed_val);
#elif __CUDA_ARCH__ >= 530
packed_val = __hmul2(__hgt2(__half2(0, 0), packed_val), packed_val);
packed_val = __hmul2(__hgt2(packed_val, __half2(0, 0)), packed_val);
#else
packed_val.x = static_cast<int>(static_cast<float>(packed_val.x) > 0) *
static_cast<float>(packed_val.x);
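// A hedged aside on the hunk above (not part of the commit): __hgt2(v, zero)
// returns 1.0 in each 16-bit lane where v > 0 and 0.0 otherwise, so multiplying
// the mask back into v is a branch-free packed ReLU. The replaced operand order
// compared zero > v, which zeroed the positive lanes instead. A minimal
// standalone sketch, assuming cuda_fp16.h and sm_53 or newer:
//   __device__ __half2 relu_half2_sketch(__half2 v) {
//     const __half2 zero = __float2half2_rn(0.f);
//     return __hmul2(__hgt2(v, zero), v);  // per-lane max(v, 0)
//   }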
......@@ -292,19 +292,16 @@ void FCFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
errors::PermissionDenied(
"Weight padding in fc can not be used in GPU scope."));
auto blas = phi::funcs::GetBlas<DeviceContext, T>(context);
blas.GEMM(false,
false,
blas.GEMM(CblasNoTrans,
CblasNoTrans,
M,
N,
K,
static_cast<T>(1.0),
X,
K,
W,
N,
static_cast<T>(0.0),
Y,
N);
Y);
if (B == NULL) {
return;
}
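// A hedged note on the call above (not part of the commit): with CblasNoTrans
// twice, the row-major GEMM computes Y[M x N] = 1.0 * X[M x K] * W[K x N]
// + 0.0 * Y, i.e. the plain FC matmul. A naive reference loop, assuming
// contiguous row-major buffers:
//   for (int m = 0; m < M; ++m)
//     for (int n = 0; n < N; ++n) {
//       T acc = static_cast<T>(0);
//       for (int k = 0; k < K; ++k) acc += X[m * K + k] * W[k * N + n];
//       Y[m * N + n] = acc;
//     }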
......
......@@ -18,6 +18,10 @@ limitations under the License. */
namespace phi {
namespace funcs {
#define CUDNN_PER_ACTIVATION_THRESHOLD 10240
#define CUDNN_SPATIAL_THRESHOLD_TRAIN 880801
#define CUDNN_SPATIAL_THRESHOLD_EVAL 65535
inline void ExtractNCWHD(const phi::DDim &dims,
const DataLayout &data_layout,
int *N,
......
......@@ -26,6 +26,19 @@ __global__ void DistanceKernel(const T* start, const T* end, T* distance) {
}
}
inline __device__ bool SetBits(const int value, int* ptr) {
const int index = value >> 5;
const int mask = 1 << (value & 31);
const int old = atomicOr(ptr + index, mask);
return (mask & old) != 0;
}
inline __device__ bool TestBits(const int value, const int* ptr) {
const int index = value >> 5;
const int mask = 1 << (value & 31);
return (mask & ptr[index]) != 0;
}
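// A hedged usage sketch (not in the original): the helpers above implement a
// word-packed bitset over int words, 32 flags per word, so a domain of
// max_value indices needs (max_value + 31) / 32 zero-initialized words.
// A hypothetical kernel that marks which values occur and counts repeats:
__global__ void MarkSeenValuesSketch(const int* in,
                                     const int n,
                                     int* flags,
                                     int* duplicate_count) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < n) {
    // SetBits returns true only if the bit for in[tid] was already set.
    if (SetBits(in[tid], flags)) {
      atomicAdd(duplicate_count, 1);
    }
  }
}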
} // namespace sparse
} // namespace funcs
} // namespace phi
......@@ -14,16 +14,27 @@
#include "paddle/phi/kernels/add_n_kernel.h"
#include "paddle/phi/kernels/impl/add_n_kernel_impl.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
#define CEIL_DIV(x, y) (((x) + (y)-1) / (y))
template <class T>
__global__ void Sum2CUDAKernel(const T *in_0,
const T *in_1,
T *out,
int64_t N) {
int id = blockIdx.x * blockDim.x + threadIdx.x;
while (id < N) {
out[id] = in_0[id] + in_1[id];
id += blockDim.x * gridDim.x;
}
}
template <class T>
__global__ void SumArrayCUDAKernel(
T **in, T *out, int64_t N, size_t in_size, bool read_dst) {
......@@ -41,9 +52,26 @@ __global__ void SumArrayCUDAKernel(
}
}
template <class T>
__global__ void SumSelectedRowsCUDAKernel(T **sr_in_out,
int64_t N,
size_t rows) {
int id = blockIdx.x * blockDim.x + threadIdx.x;
while (id < N) {
for (int i = 0; i < 2 * rows; i += 2) {
const T *tmp = sr_in_out[i];
T *tmp_out = sr_in_out[i + 1];
if (tmp && tmp_out) {
tmp_out[id] += tmp[id];
}
}
id += blockDim.x * gridDim.x;
}
}
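// A hedged layout note on the kernel above: sr_in_out packs 2 * rows device
// pointers as [in_row_0, out_row_0, in_row_1, out_row_1, ...] and N is the
// row width, so each thread walks one column id across every
// (input row, output row) pair and accumulates out_row[id] += in_row[id].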
template <typename T, typename Context>
void AddNKernel(const Context &dev_ctx,
const std::vector<const DenseTensor *> &x,
const std::vector<const TensorBase *> &x,
DenseTensor *out) {
const size_t in_num = x.size();
......@@ -66,36 +94,38 @@ void AddNKernel(const Context &dev_ctx,
grids = dim3(CEIL_DIV(length, tile_size), 1, 1);
blocks = dim3(tile_size, 1, 1);
};
auto *out_ptr = dev_ctx.template Alloc<T>(out);
bool in_place = false;
if (x.size() > 0 && x[0]->initialized() && DenseTensor::classof(x[0])) {
if ((static_cast<const DenseTensor *>(x[0]))->data() == out->data()) {
in_place = true;
}
}
bool in_place = x[0] == out;
if (!in_place) {
auto *out_ptr = dev_ctx.template Alloc<T>(out);
if (in_num >= 1) {
auto &in_0_tensor = *x[0];
if (in_0_tensor.numel() > 0) {
in_place = (in_0_tensor.data<T>() == out_ptr);
}
if (!in_place && in_num >= 1 && DenseTensor::classof(x[0])) {
auto &in_0_tensor = *(static_cast<const DenseTensor *>(x[0]));
if (in_0_tensor.numel() > 0) {
in_place = (in_0_tensor.data<T>() == out_ptr);
}
}
// Sum of two tensors
if (in_num == 2) {
auto &in_0 = *x[0];
auto &in_1 = *x[1];
if (in_num == 2 && DenseTensor::classof(x[0]) && DenseTensor::classof(x[1])) {
auto &in_0 = *(static_cast<const DenseTensor *>(x[0]));
auto &in_1 = *(static_cast<const DenseTensor *>(x[1]));
int64_t length_0 = in_0.numel();
int64_t length_1 = in_1.numel();
if (length_0 && length_1 && in_0.initialized() && in_1.initialized()) {
if (length_0 && length_1 && in_0.IsInitialized() && in_1.IsInitialized()) {
auto result = EigenVector<T>::Flatten(*out);
auto &place = *dev_ctx.eigen_device();
auto in_0_e = EigenVector<T>::Flatten(in_0);
auto in_1_e = EigenVector<T>::Flatten(in_1);
result.device(place) = in_0_e + in_1_e;
} else if (length_0 && in_0.initialized()) {
} else if (length_0 && in_0.IsInitialized()) {
auto result = EigenVector<T>::Flatten(*out);
auto &place = *dev_ctx.eigen_device();
result.device(place) = EigenVector<T>::Flatten(in_0);
} else if (length_1 && in_1.initialized()) {
} else if (length_1 && in_1.IsInitialized()) {
auto result = EigenVector<T>::Flatten(*out);
auto &place = *dev_ctx.eigen_device();
result.device(place) = EigenVector<T>::Flatten(in_1);
......@@ -105,27 +135,90 @@ void AddNKernel(const Context &dev_ctx,
int start = in_place ? 1 : 0;
if (!in_place) {
funcs::SetConstant<Context, T> constant_functor;
phi::funcs::SetConstant<phi::GPUContext, T> constant_functor;
constant_functor(dev_ctx, out, static_cast<T>(0));
}
std::vector<const T *> in_data;
std::vector<int> selectrow_index;
int64_t lod_length = 0;
bool dst_write = false;
for (int i = start; i < in_num; ++i) {
auto &in_i = *x[i];
lod_length = in_i.numel();
if (lod_length && in_i.initialized()) {
in_data.emplace_back(in_i.data<T>());
if (DenseTensor::classof(x[i])) {
auto &in_i = *(static_cast<const DenseTensor *>(x[i]));
lod_length = in_i.numel();
if (lod_length && in_i.IsInitialized()) {
in_data.emplace_back(in_i.data<T>());
}
} else if (SelectedRows::classof(x[i])) {
selectrow_index.push_back(i);
}
}
// compute selected rows separately.
if (!selectrow_index.empty()) {
std::vector<const T *> sr_in_out_data;
size_t rows = 0;
int64_t length = 0;
for (auto index : selectrow_index) {
auto &sr = *(static_cast<const SelectedRows *>(x[index]));
auto &sr_value = sr.value();
auto &sr_rows = sr.rows();
auto row_numel = sr_value.numel() / sr_rows.size();
auto out_dims = out->dims();
PADDLE_ENFORCE_EQ(sr.height(),
out_dims[0],
errors::InvalidArgument(
"The table height of input must be same as output, "
"but received input height is %d"
", output height is %d",
sr.height(),
out_dims[0]));
PADDLE_ENFORCE_EQ(row_numel,
out->numel() / sr.height(),
errors::InvalidArgument(
"The table width of input must be same as output, "
"but received input width is %d"
", output width is %d",
row_numel,
out->numel() / sr.height()));
auto *sr_data = sr_value.data<T>();
auto *sr_out_data = out->data<T>();
rows += sr_rows.size();
length = row_numel;
for (size_t i = 0; i < sr_rows.size(); ++i) {
sr_in_out_data.emplace_back(&sr_data[i * row_numel]);
sr_in_out_data.emplace_back(&sr_out_data[sr_rows[i] * row_numel]);
}
}
if (!sr_in_out_data.empty()) {
auto tmp_sr_in_out_array = paddle::memory::Alloc(
dev_ctx.GetPlace(), sr_in_out_data.size() * sizeof(T *));
paddle::memory::Copy(dev_ctx.GetPlace(),
tmp_sr_in_out_array->ptr(),
phi::CPUPlace(),
reinterpret_cast<void *>(sr_in_out_data.data()),
sr_in_out_data.size() * sizeof(T *),
dev_ctx.stream());
T **sr_in_out_array_data =
reinterpret_cast<T **>(tmp_sr_in_out_array->ptr());
ComputeKernelParameter(length);
SumSelectedRowsCUDAKernel<T>
<<<grids, blocks, 0, stream>>>(sr_in_out_array_data, length, rows);
dst_write = true;
}
}
// if in_data is not empty, merge into one kernel call.
if (!in_data.empty()) {
auto tmp_in_array = paddle::memory::Alloc(
dev_ctx.GetPlace(),
in_data.size() * sizeof(T *),
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
auto tmp_in_array =
paddle::memory::Alloc(dev_ctx.GetPlace(), in_data.size() * sizeof(T *));
paddle::memory::Copy(dev_ctx.GetPlace(),
tmp_in_array->ptr(),
......@@ -153,6 +246,17 @@ PD_REGISTER_KERNEL(add_n,
float,
double,
int,
int64_t,
phi::dtype::bfloat16,
phi::dtype::float16) {}
phi::dtype::float16,
int64_t) {}
PD_REGISTER_KERNEL(add_n_array,
GPU,
ALL_LAYOUT,
phi::AddNArrayKernel,
float,
double,
int,
phi::dtype::bfloat16,
phi::dtype::float16,
int64_t) {}
......@@ -852,15 +852,18 @@ void BatchNormGradRawKernel(const Context &ctx,
// ctx.GetPlace()),
// epsilon, saved_mean_data, saved_var_data));
#else
// CUDNN only supports small batch sizes
// const size_t CUDNN_PER_ACTIVATION_THRESHOLD = 131070;
const size_t CUDNN_PER_ACTIVATION_THRESHOLD = 10240;
const size_t CUDNN_SPATIAL_THRESHOLD = 880801;
const bool use_native_kernel =
((x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD) ||
(x_dims.size() == 3 && N >= CUDNN_SPATIAL_THRESHOLD));
if (use_native_kernel) {
if (x_dims.size() == 2) {
}
// CUDNN only supports small batch sizes
bool use_native_nhwc =
d_x ? (x_dims.size() == 4 && compute_format == DataLayout::kNHWC &&
H * W >= CUDNN_SPATIAL_THRESHOLD_EVAL)
: false;
const bool use_native_kernel =
((x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD) ||
(x_dims.size() == 3 && N >= CUDNN_SPATIAL_THRESHOLD_TRAIN));
if (use_native_nhwc || (d_x && d_scale && d_bias)) {
if (use_native_kernel || use_native_nhwc) {
if (x_dims.size() == 2 || use_native_nhwc) {
dim3 block;
dim3 grid;
const int block_size = 512;
......@@ -931,6 +934,21 @@ void BatchNormGradRawKernel(const Context &ctx,
flag_ptr);
}
// 2. reduce_sum(x, dy, mean) => dscale, dbias
BatchNormParamType<T> *dscale = nullptr;
BatchNormParamType<T> *dbias = nullptr;
bool with_scale = false;
if (d_scale && d_bias) {
dscale = ctx.template Alloc<BatchNormParamType<T>>(d_scale);
dbias = ctx.template Alloc<BatchNormParamType<T>>(d_bias);
} else {
DenseTensor dscale_mem =
phi::Empty<BatchNormParamType<T>, Context>(ctx, {C});
DenseTensor dbias_mem =
phi::Empty<BatchNormParamType<T>, Context>(ctx, {C});
dscale = dscale_mem.data<BatchNormParamType<T>>();
dbias = dbias_mem.data<BatchNormParamType<T>>();
}
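// A hedged note on the allocation above: stage 2 of the channel-last backward
// always produces per-channel dscale/dbias sums, and stage 3 reads them again
// when computing dx, so {C}-sized scratch buffers are still needed even when
// the caller did not ask for d_scale/d_bias.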
BNBackward2DChannelLastStage2<T, block_size>
<<<grid, block, 0, ctx.stream()>>>(
transformed_d_y.template data<T>(),
......@@ -942,8 +960,8 @@ void BatchNormGradRawKernel(const Context &ctx,
H * W * D,
epsilon,
block_data_ptr,
ctx.template Alloc<BatchNormParamType<T>>(d_scale),
ctx.template Alloc<BatchNormParamType<T>>(d_bias),
dscale,
dbias,
flag_ptr);
// 3. elementwise_mul(scale, mean, inv_var, dy, dscale, dbias) => dx
......@@ -952,8 +970,8 @@ void BatchNormGradRawKernel(const Context &ctx,
transformed_d_y.template data<T>(),
transformed_x.template data<T>(),
scale.template data<BatchNormParamType<T>>(),
d_scale->data<BatchNormParamType<T>>(),
d_bias->data<BatchNormParamType<T>>(),
dscale,
dbias,
mean_ptr,
variance_ptr,
C,
......@@ -1163,6 +1181,7 @@ void BatchNormGradRawKernel(const Context &ctx,
paddle::platform::dynload::cudnnDestroyTensorDescriptor(
bn_param_desc_));
#endif
} else {
const auto *running_mean = mean.get_ptr();
const auto *running_var = variance.get_ptr();
......
......@@ -72,6 +72,40 @@ static __global__ void BNForwardInference(const T *x,
}
}
template <typename T>
static __global__ void InverseVariance(const BatchNormParamType<T> *variance,
const double epsilon,
const int C,
BatchNormParamType<T> *inv_variance) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < C) {
inv_variance[tid] = 1 / sqrt(variance[tid] + epsilon);
}
}
template <typename T, phi::DataLayout layout>
static __global__ void BN1DForwardInference(
const T *x,
const BatchNormParamType<T> *mean,
const BatchNormParamType<T> *inv_variance,
const BatchNormParamType<T> *scale,
const BatchNormParamType<T> *bias,
const int C,
const int N,
const int HxW,
const double epsilon,
T *y) {
int gid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
int num = N * C * HxW;
for (int i = gid; i < num; i += stride) {
const int c = layout == phi::DataLayout::kNCHW ? i / HxW % C : i % C;
BatchNormParamType<T> x_sub_mean =
static_cast<BatchNormParamType<T>>(x[i]) - mean[c];
y[i] = static_cast<T>(scale[c] * x_sub_mean * inv_variance[c] + bias[c]);
}
}
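// A hedged summary of the two kernels above: InverseVariance precomputes
//   inv_variance[c] = 1 / sqrt(variance[c] + epsilon)
// once per channel, and BN1DForwardInference then applies the inference-time
// transform y = scale[c] * (x - mean[c]) * inv_variance[c] + bias[c], so the
// per-element loop avoids a sqrt and a division per value.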
template <typename T, int BlockDim, phi::DataLayout layout>
static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining(
const T *x,
......@@ -691,9 +725,6 @@ void BatchNormKernel(const Context &ctx,
auto handle = ctx.cudnn_handle();
const size_t CUDNN_PER_ACTIVATION_THRESHOLD = 10240;
const size_t CUDNN_SPATIAL_THRESHOLD = 880801;
// Now, depending on whether we are running test or not, we have two paths.
// It is training mode when it's not reference AND not using pre-trained
// model.
......@@ -797,8 +828,8 @@ void BatchNormKernel(const Context &ctx,
// epsilon));
#else
const bool use_native_kernel =
((x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD) ||
(x_dims.size() == 3 && N >= CUDNN_SPATIAL_THRESHOLD));
(x_dims.size() == 2 ||
(x_dims.size() == 3 && N >= CUDNN_SPATIAL_THRESHOLD_EVAL));
if (use_native_kernel) {
const int block_size = 256;
const int grid_size = (N * C * H * W * D + block_size - 1) / block_size;
......@@ -816,18 +847,43 @@ void BatchNormKernel(const Context &ctx,
epsilon,
transformed_y.template data<T>());
} else {
BNForwardInference<T, DataLayout::kNHWC>
<<<grid_size, block_size, 0, ctx.stream()>>>(
transformed_x.template data<T>(),
est_mean->template data<BatchNormParamType<T>>(),
est_var->template data<BatchNormParamType<T>>(),
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
C,
N,
H * W * D,
epsilon,
transformed_y.template data<T>());
if (x_dims.size() == 2) {
DenseTensor inv_var = phi::Empty<BatchNormParamType<T>>(ctx, {C});
auto *inv_var_ptr = inv_var.data<BatchNormParamType<T>>();
const int threads = 512 > C ? C : 512;
const int blocks = (C + 511) / 512;
InverseVariance<T><<<blocks, threads>>>(
est_var->template data<BatchNormParamType<T>>(),
epsilon,
C,
inv_var_ptr);
BN1DForwardInference<T, DataLayout::kNHWC>
<<<grid_size, block_size, 0, ctx.stream()>>>(
transformed_x.template data<T>(),
est_mean->template data<BatchNormParamType<T>>(),
// est_var->template data<BatchNormParamType<T>>(),
inv_var_ptr,
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
C,
N,
H * W * D,
epsilon,
transformed_y.template data<T>());
} else {
BNForwardInference<T, DataLayout::kNHWC>
<<<grid_size, block_size, 0, ctx.stream()>>>(
transformed_x.template data<T>(),
est_mean->template data<BatchNormParamType<T>>(),
est_var->template data<BatchNormParamType<T>>(),
scale.template data<BatchNormParamType<T>>(),
bias.template data<BatchNormParamType<T>>(),
C,
N,
H * W * D,
epsilon,
transformed_y.template data<T>());
}
}
} else {
PADDLE_ENFORCE_GPU_SUCCESS(
......@@ -949,7 +1005,7 @@ void BatchNormKernel(const Context &ctx,
// const size_t CUDNN_PER_ACTIVATION_THRESHOLD = 131070;
const bool use_native_kernel =
((x_dims.size() == 2 && N >= CUDNN_PER_ACTIVATION_THRESHOLD) ||
(x_dims.size() == 3 && N >= CUDNN_SPATIAL_THRESHOLD));
(x_dims.size() == 3 && N >= CUDNN_SPATIAL_THRESHOLD_TRAIN));
if (use_native_kernel) {
dim3 block;
dim3 grid;
......
......@@ -41,7 +41,7 @@ void MultiplexKernel(const Context& ctx,
paddle::framework::TensorCopySync(ids, phi::CPUPlace(), &index_t_cpu);
auto* index = index_t_cpu.data<int32_t>();
auto stream = ctx.stream();
for (auto i = 0; i < rows; i++) {
for (auto i = 0; i < ids.dims()[0]; i++) {
int32_t k = index[i];
PADDLE_ENFORCE_GE(
k, 0, errors::PreconditionNotMet("index must be nonnegative."));
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/kernels/add_n_kernel.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
namespace phi {
template <typename T, typename Context>
void AddNArrayKernel(const Context& dev_ctx,
const std::vector<const TensorArray*>& x,
TensorArray* out) {
for (auto& ele : *out) {
dev_ctx.template Alloc<T>(&ele);
}
bool in_place = true;
if (x.size() > 0 && x[0]->size() == out->size()) {
for (size_t i = 0; i < out->size(); i++) {
if (x[0]->at(i).IsInitialized() &&
out->at(i).data() != x[0]->at(i).data()) {
in_place = false;
break;
}
}
} else {
in_place = false;
}
for (size_t i = in_place ? 1 : 0; i < x.size(); ++i) {
auto* in_array = x.at(i);
for (size_t j = 0; j < in_array->size(); ++j) {
if (in_array->at(j).IsInitialized() && (in_array->at(j).numel() != 0)) {
if (j >= out->size()) {
out->resize(j + 1);
}
if (!out->at(j).IsInitialized() || (out->at(j).numel() == 0)) {
Copy<Context>(dev_ctx,
in_array->at(j),
in_array->at(j).place(),
false,
&out->at(j));
out->at(j).set_lod(in_array->at(j).lod());
} else {
PADDLE_ENFORCE_EQ(
out->at(j).lod(),
in_array->at(j).lod(),
phi::errors::InvalidArgument(
"The lod message between inputs[%d] and"
" outputs[%d] must be same, but now is not same.",
j,
j));
auto in = EigenVector<T>::Flatten(in_array->at(j));
auto result = EigenVector<T>::Flatten(out->at(j));
result.device(*dev_ctx.eigen_device()) = result + in;
}
}
}
}
}
} // namespace phi
......@@ -54,11 +54,8 @@ void FoldGradKernel(const Context& ctx,
DDim out_shape =
make_ddim({n_output_plane, output_sizes[0], output_sizes[1]});
DDim input_matrix_shape = make_ddim({x_dims[0],
kernel_sizes[0],
kernel_sizes[1],
output_height,
output_width});
DDim input_matrix_shape = make_ddim(
{1, kernel_sizes[0], kernel_sizes[1], output_height, output_width});
paddle::operators::math::
Im2ColFunctor<paddle::operators::math::ColFormat::kCFO, Context, T>
......
......@@ -56,11 +56,8 @@ void FoldKernel(const Context& ctx,
DDim output_shape =
make_ddim({n_output_plane, output_sizes[0], output_sizes[1]});
DDim input_matrix_shape = make_ddim({x_dims[0],
kernel_sizes[0],
kernel_sizes[1],
output_height,
output_width});
DDim input_matrix_shape = make_ddim(
{1, kernel_sizes[0], kernel_sizes[1], output_height, output_width});
phi::funcs::SetConstant<Context, T> set_zero;
set_zero(ctx, out, static_cast<T>(0));
......@@ -68,6 +65,7 @@ void FoldKernel(const Context& ctx,
for (int i = 0; i < batch_size; i++) {
DenseTensor out_batch =
out->Slice(i, i + 1).Resize(output_shape); // im size=3
DenseTensor in_batch =
x.Slice(i, i + 1).Resize(input_matrix_shape); // col size=5
col2im(ctx, in_batch, dilations, strides, paddings, &out_batch);
......
......@@ -169,7 +169,7 @@ static void linalg_solve(const Context& dev_ctx,
out_tmp.Resize(out->dims());
out_tmp = *out;
phi::SqueezeKernel<T, Context>(dev_ctx, out_tmp, {-1}, out);
phi::Squeeze<T, Context>(dev_ctx, out_tmp, {-1}, out);
} else {
PADDLE_ENFORCE_EQ(
x_dim[x_dim_size - 1],
......
......@@ -23,11 +23,7 @@ void SqueezeKernel(const Context& dev_ctx,
const DenseTensor& x,
const IntArray& axes,
DenseTensor* out) {
auto x_dims = x.dims();
std::vector<int32_t> tmp(axes.GetData().begin(), axes.GetData().end());
auto out_dims = funcs::GetOutputSqueezeShape(tmp, x_dims, true);
out->Resize(out_dims);
auto out_dims = out->dims();
dev_ctx.template Alloc<T>(out);
phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out);
out->Resize(out_dims); // copy will reset the dims.
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -14,24 +14,14 @@
#pragma once
#include <string>
#include <unordered_set>
#include "paddle/phi/core/selected_rows.h"
#include "paddle/fluid/inference/analysis/analysis_pass.h"
namespace phi {
namespace sr {
namespace paddle {
namespace inference {
namespace analysis {
struct Argument;
class IrInferCleanGraphPass : public AnalysisPass {
public:
void RunImpl(Argument *argument) override;
std::string repr() const override { return "ir_graph_clean_pass"; }
};
} // namespace analysis
} // namespace inference
} // namespace paddle
template <typename T, typename Context>
void AddNKernel(const Context& dev_ctx,
const std::vector<const SelectedRows*>& x,
SelectedRows* out);
} // namespace sr
} // namespace phi
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -12,38 +12,14 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.h"
#include "paddle/phi/kernels/selected_rows/impl/add_n_kernel_impl.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/node.h"
namespace paddle {
namespace inference {
namespace analysis {
void IrInferCleanGraphPass::RunImpl(Argument* argument) {
auto& graph = argument->main_graph();
auto is_valid_node = [](framework::ir::Node* x) {
return x && IsControlDepVar(*x) && x->IsVar() && !x->Var();
};
std::unordered_set<const framework::ir::Node*> invalid_nodes;
int valid_op = 0;
for (auto* node : graph.Nodes()) {
PADDLE_ENFORCE_NOT_NULL(node,
platform::errors::PreconditionNotMet(
"The node should not be nullptr."));
if (is_valid_node(node)) {
invalid_nodes.insert(node);
} else if (node->IsOp()) {
++valid_op;
}
}
GraphSafeRemoveNodes(&graph, invalid_nodes);
}
} // namespace analysis
} // namespace inference
} // namespace paddle
PD_REGISTER_KERNEL(add_n_sr,
CPU,
ALL_LAYOUT,
phi::sr::AddNKernel,
float,
double,
int,
phi::dtype::bfloat16,
int64_t) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/selected_rows/impl/add_n_kernel_impl.h"
PD_REGISTER_KERNEL(add_n_sr,
GPU,
ALL_LAYOUT,
phi::sr::AddNKernel,
float,
double,
int,
phi::dtype::bfloat16,
phi::dtype::float16,
int64_t) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/kernels/selected_rows/add_n_kernel.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
namespace sr {
template <typename T, typename Context>
void AddNKernel(const Context &dev_ctx,
const std::vector<const SelectedRows *> &x,
SelectedRows *out) {
dev_ctx.template Alloc<T>(out->mutable_value());
bool in_place = false;
if (x.size() > 0 && x[0]->value().Holder() == out->value().Holder()) {
in_place = true;
}
if (in_place && x.size() < 2) {
return;
}
std::vector<const phi::SelectedRows *> inputs;
SelectedRows temp_in0;
if (in_place) {
auto &in0 = *x[0];
temp_in0.set_height(in0.height());
temp_in0.set_rows(in0.rows());
Copy<Context>(
dev_ctx, in0.value(), in0.place(), false, temp_in0.mutable_value());
inputs.push_back(&temp_in0);
for (size_t i = 1; i < x.size(); ++i) {
auto &in = *x[i];
if (in.rows().size() > 0) {
inputs.push_back(&in);
}
}
} else {
for (auto in_var : x) {
auto &in = *in_var;
if (in.rows().size() > 0) {
inputs.push_back(in_var);
}
}
}
out->mutable_rows()->clear();
bool has_data = false;
for (auto &in : inputs) {
if (in->rows().size() > 0) {
has_data = true;
break;
}
}
if (has_data) {
paddle::operators::math::scatter::MergeAdd<Context, T> merge_add;
merge_add(dev_ctx, inputs, out);
out->SyncIndex();
} else {
// no data, just set an empty out tensor.
auto *out_dense = out->mutable_value();
out_dense->clear();
out_dense->Resize(phi::make_ddim({0}));
dev_ctx.template Alloc<T>(out_dense);
}
}
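// A hedged note on the merge path above: scatter::MergeAdd is expected to sum
// value rows that share the same row index across all inputs, so each row id
// appears at most once in the output SelectedRows, and SyncIndex then rebuilds
// the id-to-index lookup for the merged rows.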
} // namespace sr
} // namespace phi
......@@ -23,25 +23,25 @@ namespace phi {
namespace sparse {
template <typename T, typename Context>
void BatchNormKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& scale,
const DenseTensor& bias,
const DenseTensor& mean,
const DenseTensor& variance,
float momentum,
float epsilon,
const std::string& data_layout,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
SparseCooTensor* y,
DenseTensor* mean_out,
DenseTensor* variance_out,
DenseTensor* saved_mean,
DenseTensor* saved_variance,
DenseTensor* reserve_space);
void BatchNormCooKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& scale,
const DenseTensor& bias,
const DenseTensor& mean,
const DenseTensor& variance,
float momentum,
float epsilon,
const std::string& data_layout,
bool is_test,
bool use_global_stats,
bool trainable_statistics,
bool fuse_with_relu,
SparseCooTensor* y,
DenseTensor* mean_out,
DenseTensor* variance_out,
DenseTensor* saved_mean,
DenseTensor* saved_variance,
DenseTensor* reserve_space);
} // namespace sparse
} // namespace phi
......@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/sparse/coalesce_kernel.h"
#include <thrust/sort.h>
#include <thrust/unique.h>
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/kernel_registry.h"
......@@ -22,8 +23,6 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/sparse/flatten_indices.cu.h"
#include "paddle/phi/kernels/funcs/sparse/scatter.cu.h"
#include "paddle/phi/kernels/funcs/sparse/utils.cu.h"
#include <thrust/sort.h>
#include <thrust/unique.h>
namespace phi {
namespace sparse {
......
......@@ -15,8 +15,14 @@ limitations under the License. */
#pragma once
#include <thrust/remove.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
#ifdef __NVCC__
#include <cub/block/block_scan.cuh>
#endif
#ifdef __HIPCC__
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;
#endif
#include "paddle/phi/kernels/sparse/conv_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
......@@ -167,7 +173,7 @@ inline void GatherV2(const GPUContext& dev_ctx,
template <typename IntT>
__global__ void UniqueKernel(const IntT* in_indexs,
const int rulebook_len,
int* out_index_table,
int* index_flags,
int* out_indexs,
int* nnz) {
extern __shared__ int cache[];
......@@ -182,8 +188,8 @@ __global__ void UniqueKernel(const IntT* in_indexs,
if (i < rulebook_len) {
// atomicOr only supports int
int index = static_cast<int>(in_indexs[i]);
int flag = atomicOr(out_index_table + index, 1);
if (flag == 0) {
const bool flag = phi::funcs::sparse::SetBits(index, index_flags);
if (!flag) {
int j = atomicAdd(&count, 1);
cache[j] = index;
}
......@@ -199,6 +205,88 @@ __global__ void UniqueKernel(const IntT* in_indexs,
}
}
inline __device__ uint32_t BitCount(const uint32_t data) {
uint32_t count = data;
count = (count & 0x55555555) + ((count >> 1) & 0x55555555);
count = (count & 0x33333333) + ((count >> 2) & 0x33333333);
count = (count & 0x0f0f0f0f) + ((count >> 4) & 0x0f0f0f0f);
count = (count & 0x00ff00ff) + ((count >> 8) & 0x00ff00ff);
count = (count & 0x0000ffff) + ((count >> 16) & 0x0000ffff);
return count;
}
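// A hedged note on the helper above: this is the classic SWAR popcount
// (summing 1-, 2-, 4-, 8- and 16-bit partial counts in parallel); in device
// code the CUDA intrinsic __popc(data) yields the same 32-bit population
// count.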
static __global__ void GetOutIndexsCounter(const int* flags,
const int n,
int* out) {
int tid = threadIdx.x + blockDim.x * blockIdx.x;
__shared__ int block_count;
if (threadIdx.x == 0) {
block_count = 0;
}
__syncthreads();
if (tid < n) {
// count the set bits in flags[tid]
uint32_t count = BitCount(static_cast<uint32_t>(flags[tid]));
// add to block_count
// TODO(zhangkaihuo): replace with block reduce_sum
atomicAdd(&block_count, static_cast<int>(count));
}
__syncthreads();
// write to out
if (threadIdx.x == 0) {
out[blockIdx.x] = block_count;
}
}
template <int BS>
__global__ void GetOutIndexs(const int* flags,
const int n,
const int* offsets,
const int out_nnz,
int* out) {
int tid = threadIdx.x + blockDim.x * blockIdx.x;
__shared__ int block_counts[BS];
__shared__ int block_outs[BS * 32];
int count = 0;
if (tid < n) {
// count the set bits in flags[tid]
int flag = flags[tid];
count = BitCount(static_cast<uint32_t>(flag));
}
// call block prefix_sum
// using namespace cub;
typedef cub::BlockScan<int, BS> BlockScan;
__shared__ typename BlockScan::TempStorage temp_storage;
BlockScan(temp_storage).ExclusiveSum(count, count);
__syncthreads();
// expand the set bits of flags[tid] into indices in shared block_outs
if (tid < n) {
// each set bit at position i maps back to original index (tid << 5) + i
int flag = flags[tid];
// int j = block_counts[threadIdx.x];
int j = count;
// TODO(zhangkaihuo): opt the loop
for (int i = 0; i < 32; ++i) {
if ((1 & (flag >> i)) == 1) {
block_outs[j++] = (tid << 5) + i;
}
}
}
__syncthreads();
// copy this block's compacted indices from block_outs to the global output
int start = offsets[blockIdx.x];
int end = blockIdx.x == gridDim.x - 1 ? out_nnz : offsets[blockIdx.x + 1];
for (int i = threadIdx.x; i < end - start; i += blockDim.x) {
out[start + i] = block_outs[i];
}
}
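// A hedged reading of the two kernels above: GetOutIndexsCounter reduces the
// popcount of each block's slice of the flag words into out[blockIdx.x]; an
// exclusive scan over those per-block counts (issued later on the stream with
// thrust::exclusive_scan) turns them into per-block write offsets; and
// GetOutIndexs re-counts each word, uses cub::BlockScan to get every thread's
// offset within the block, expands each set bit back to its original index
// ((word << 5) + bit) in shared memory, and finally copies the block's
// compacted slice to the global output at its offset.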
template <typename IntT>
__global__ void GroupIndexs(const int* out_index_table,
const int n,
......@@ -284,7 +372,6 @@ __global__ void ProductRuleBookKernel(const T* x_indices,
atomicAdd(&counter_buf[kernel_index], 1);
kernel_i = kernel_index;
}
// rulebook[kernel_index * non_zero_num + i] = kernel_i;
rulebook[kernel_index * non_zero_num + i] = in_i;
rulebook[kernel_index * non_zero_num + offset + i] = out_index;
++kernel_index;
......@@ -299,17 +386,19 @@ __global__ void ProductRuleBookKernel(const T* x_indices,
}
template <typename IntT>
__global__ void GetOutIndexTable(const IntT* indices,
const IntT non_zero_num,
const Dims4D dims,
int* out_index_table) {
__global__ void GetOutIndexTable1(const IntT* indices,
const IntT non_zero_num,
const Dims4D dims,
int* index_flags,
int* out_index_table) {
CUDA_KERNEL_LOOP_TYPE(i, non_zero_num, int64_t) {
IntT batch = indices[i];
IntT in_z = indices[i + non_zero_num];
IntT in_y = indices[i + 2 * non_zero_num];
IntT in_x = indices[i + 3 * non_zero_num];
IntT index = PointToIndex(batch, in_x, in_y, in_z, dims);
out_index_table[index] = i == 0 ? -1 : i;
phi::funcs::sparse::SetBits(index, index_flags);
out_index_table[index] = i;
}
}
......@@ -375,6 +464,7 @@ __global__ void ProductSubmRuleBookKernel(const T* x_indices,
const Dims4D paddings,
const Dims4D dilations,
const Dims4D strides,
const int* index_flags,
const int* out_index_table,
T* rulebook,
int* counter) {
......@@ -417,9 +507,10 @@ __global__ void ProductSubmRuleBookKernel(const T* x_indices,
T out_x = (in_x + paddings[3] - kx * dilations[3]) / strides[3];
out_index = phi::funcs::sparse::PointToIndex<Dims4D>(
batch, out_x, out_y, out_z, out_dims);
int real_out_index = out_index_table[out_index];
if (real_out_index != 0) {
real_out_index = real_out_index == -1 ? 0 : real_out_index;
const bool flag =
phi::funcs::sparse::TestBits(out_index, index_flags);
if (flag) {
int real_out_index = out_index_table[out_index];
in_i = i;
int buf_i = atomicAdd(&counter_buf[kernel_index], 1);
kernel_i = kernel_index;
......@@ -440,7 +531,6 @@ __global__ void ProductSubmRuleBookKernel(const T* x_indices,
__syncthreads();
for (int i = 0; i < kernel_size; i++) {
if (threadIdx.x < counter_buf[i]) {
// rulebook[i * non_zero_num + counter_buf2[i] + threadIdx.x] = i;
rulebook[i * non_zero_num + counter_buf2[i] + threadIdx.x] =
rulebook_buf[i * blockDim.x + threadIdx.x];
rulebook[i * non_zero_num + offset + counter_buf2[i] + threadIdx.x] =
......@@ -575,12 +665,18 @@ int ProductRuleBook(const Context& dev_ctx,
DenseTensorMeta rulebook_meta(
indices_dtype, {rulebook_rows, rulebook_cols}, DataLayout::NCHW);
int64_t table_size = 1;
int table_size = 1;
for (int i = 0; i < out_dims.size() - 1; i++) {
table_size *= out_dims[i];
}
DenseTensor out_index_table = phi::Empty<int>(dev_ctx, {table_size});
int* out_index_table_ptr = out_index_table.data<int>();
// index_flags: a bitset marking which output indices exist
int index_flags_size = (table_size + 31) / 32;
DenseTensor index_flags = phi::Empty<int>(dev_ctx, {index_flags_size});
int* index_flags_ptr = index_flags.data<int>();
phi::backends::gpu::GpuMemsetAsync(
index_flags_ptr, 0, sizeof(int) * index_flags.numel(), dev_ctx.stream());
if (subm) {
DenseTensor tmp_rulebook = phi::Empty(dev_ctx, std::move(rulebook_meta));
......@@ -590,16 +686,16 @@ int ProductRuleBook(const Context& dev_ctx,
phi::Copy(dev_ctx, x.indices(), dev_ctx.GetPlace(), false, &out_indices);
phi::backends::gpu::GpuMemsetAsync(
out_index_table_ptr, 0, sizeof(int) * table_size, dev_ctx.stream());
auto config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1);
GetOutIndexTable<IntT><<<config.block_per_grid,
config.thread_per_block,
0,
dev_ctx.stream()>>>(
out_indices.data<IntT>(), non_zero_num, d_x_dims, out_index_table_ptr);
GetOutIndexTable1<IntT><<<config.block_per_grid,
config.thread_per_block,
0,
dev_ctx.stream()>>>(out_indices.data<IntT>(),
non_zero_num,
d_x_dims,
index_flags_ptr,
out_index_table_ptr);
size_t cache_size =
kernel_size * 2 * sizeof(int) +
......@@ -625,6 +721,7 @@ int ProductRuleBook(const Context& dev_ctx,
d_paddings,
d_dilations,
d_strides,
index_flags_ptr,
out_index_table_ptr,
rulebook_ptr,
counter_ptr);
......@@ -695,9 +792,6 @@ int ProductRuleBook(const Context& dev_ctx,
int* out_index_ptr = out_index->data<int>();
int* unique_key_ptr = unique_key.data<int>();
phi::backends::gpu::GpuMemsetAsync(
out_index_table_ptr, 0, sizeof(int) * table_size, dev_ctx.stream());
phi::backends::gpu::GpuMemsetAsync(
unique_key_ptr, 0, sizeof(int), dev_ctx.stream());
......@@ -708,7 +802,7 @@ int ProductRuleBook(const Context& dev_ctx,
cache_size,
dev_ctx.stream()>>>(rulebook_ptr + rulebook_len,
rulebook_len,
out_index_table_ptr,
index_flags_ptr,
out_index_ptr,
unique_key_ptr);
......@@ -719,13 +813,25 @@ int ProductRuleBook(const Context& dev_ctx,
gpuMemcpyDeviceToHost,
dev_ctx.stream());
dev_ctx.Wait();
const int threads = 256;
const int blocks = (index_flags.numel() + threads - 1) / threads;
GetOutIndexsCounter<<<blocks, threads, 0, dev_ctx.stream()>>>(
index_flags_ptr, index_flags.numel(), out_index_table_ptr);
#ifdef PADDLE_WITH_HIP
thrust::sort(thrust::hip::par.on(dev_ctx.stream()),
thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()),
#else
thrust::sort(thrust::cuda::par.on(dev_ctx.stream()),
thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()),
#endif
out_index_ptr,
out_index_ptr + out_nnz);
out_index_table_ptr,
out_index_table_ptr + blocks,
out_index_table_ptr);
GetOutIndexs<threads>
<<<blocks, threads, 0, dev_ctx.stream()>>>(index_flags_ptr,
index_flags.numel(),
out_index_table_ptr,
out_nnz,
out_index_ptr);
const int64_t sparse_dim = 4;
phi::DenseTensor out_indices =
......
......@@ -22,6 +22,9 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/scatter.cu.h"
#include "paddle/phi/kernels/funcs/sparse/scatter.cu.h"
#include "paddle/phi/kernels/sparse/gpu/conv.cu.h"
#ifdef PADDLE_WITH_CUTLASS
#include "paddle/phi/kernels/sparse/gpu/gather_gemm_scatter.h"
#endif
#include "glog/logging.h"
......@@ -120,85 +123,129 @@ void Conv3dCooGPUKernel(const GPUContext& dev_ctx,
dev_ctx, x, key, tmp_rulebook, h_counter, out, rulebook, counter);
}
// 2. gather
phi::DenseTensor in_features =
phi::Empty<T>(dev_ctx, {rulebook_len, in_channels});
phi::DenseTensor out_features =
phi::Empty<T>(dev_ctx, {rulebook_len, out_channels});
T* in_features_ptr = in_features.data<T>();
T* out_features_ptr = out_features.data<T>();
phi::funcs::SetConstant<GPUContext, T> set_zero;
set_zero(dev_ctx, &out_features, static_cast<T>(0.0f));
Gather<T, IntT>(dev_ctx,
x.values().data<T>(),
rulebook_ptr,
rulebook_len,
in_channels,
in_features_ptr);
// 3. call gemm for every weight
auto blas = phi::funcs::GetBlas<GPUContext, T>(dev_ctx);
auto* out_values = out->mutable_values();
T* out_values_ptr = out_values->data<T>();
set_zero(dev_ctx, out_values, static_cast<T>(0.0f));
if (subm) {
auto config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1);
unique_value.ResizeAndAllocate(
{static_cast<int>(out->nnz() * kernel_size)});
out_index.ResizeAndAllocate({static_cast<int>(rulebook_len)});
int* out_index_ptr = out_index.data<int>();
int* unique_value_ptr = unique_value.data<int>();
phi::backends::gpu::GpuMemsetAsync(
out_index_ptr, 0, sizeof(int) * rulebook_len, dev_ctx.stream());
GroupIndexs<<<config.block_per_grid,
config.thread_per_block,
0,
dev_ctx.stream()>>>(rulebook_len,
kernel_size,
rulebook_ptr + rulebook_len,
out_index_ptr,
unique_value_ptr);
#ifdef PADDLE_WITH_CUTLASS
bool cutlass = true;
if (dev_ctx.GetComputeCapability() < 75) cutlass = false;
if (in_channels % 4 != 0 || out_channels % 4 != 0) {
if (std::is_same<T, phi::dtype::float16>::value) cutlass = false;
if (std::is_same<T, float>::value) cutlass = false;
}
if (!std::is_same<IntT, int32_t>::value) cutlass = false;
if (cutlass) {
auto* out_values = out->mutable_non_zero_elements();
T* out_values_ptr = out_values->data<T>();
phi::funcs::SetConstant<GPUContext, T> set_zero;
set_zero(dev_ctx, out_values, static_cast<T>(0.0f));
const T* kernel_ptr = kernel.data<T>();
for (int i = 0; i < kernel_size; i++) {
if (h_counter_ptr[i] <= 0) {
continue;
}
const T* kernel_ptr = kernel.data<T>();
for (int i = 0; i < kernel_size; i++) {
if (h_counter_ptr[i] <= 0) {
continue;
const int M = h_counter_ptr[i];
const int K = in_channels;
const int N = out_channels;
const T* tmp_kernel_ptr = kernel_ptr + i * K * N;
const IntT* gather_indices = rulebook_ptr + h_offsets_ptr[i];
const IntT* scatter_indices =
rulebook_ptr + rulebook_len + h_offsets_ptr[i];
dispatchKernel(dev_ctx,
x.non_zero_elements().data<T>(),
tmp_kernel_ptr,
out_values_ptr,
out_values_ptr,
M,
N,
K,
gather_indices,
scatter_indices,
cutlass,
x.dtype());
}
} else {
#endif
if (subm) {
auto config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1);
unique_value.ResizeAndAllocate(
{static_cast<int>(out->nnz() * kernel_size)});
out_index.ResizeAndAllocate({static_cast<int>(rulebook_len)});
int* out_index_ptr = out_index.data<int>();
int* unique_value_ptr = unique_value.data<int>();
phi::backends::gpu::GpuMemsetAsync(
out_index_ptr, 0, sizeof(int) * rulebook_len, dev_ctx.stream());
GroupIndexs<<<config.block_per_grid,
config.thread_per_block,
0,
dev_ctx.stream()>>>(rulebook_len,
kernel_size,
rulebook_ptr + rulebook_len,
out_index_ptr,
unique_value_ptr);
}
// 2. gather
phi::DenseTensor in_features =
phi::Empty<T>(dev_ctx, {rulebook_len, in_channels});
phi::DenseTensor out_features =
phi::Empty<T>(dev_ctx, {rulebook_len, out_channels});
T* in_features_ptr = in_features.data<T>();
T* out_features_ptr = out_features.data<T>();
phi::funcs::SetConstant<GPUContext, T> set_zero;
set_zero(dev_ctx, &out_features, static_cast<T>(0.0f));
// call gemm: (n, in_channels) * (in_channels, out_channels)
const int M = h_counter_ptr[i];
const int K = in_channels;
const int N = out_channels;
T* tmp_in_ptr = in_features_ptr + h_offsets_ptr[i] * in_channels;
const T* tmp_kernel_ptr = kernel_ptr + i * K * N;
T* tmp_out_ptr = out_features_ptr + h_offsets_ptr[i] * out_channels;
blas.GEMM(CblasNoTrans,
CblasNoTrans,
M,
N,
K,
static_cast<T>(1),
tmp_in_ptr,
tmp_kernel_ptr,
static_cast<T>(0),
tmp_out_ptr);
}
Gather<T, IntT>(dev_ctx,
x.values().data<T>(),
rulebook_ptr,
rulebook_len,
in_channels,
in_features_ptr);
// 3. call gemm for every weight
auto blas = phi::funcs::GetBlas<GPUContext, T>(dev_ctx);
auto* out_values = out->mutable_values();
T* out_values_ptr = out_values->data<T>();
set_zero(dev_ctx, out_values, static_cast<T>(0.0f));
// 4. scatter
phi::funcs::sparse::ScatterV2<T>(dev_ctx,
out_features_ptr,
out_index.data<int>(),
unique_value.data<int>(),
out->nnz(),
kernel_size,
out_channels,
1,
out_values_ptr);
const T* kernel_ptr = kernel.data<T>();
for (int i = 0; i < kernel_size; i++) {
if (h_counter_ptr[i] <= 0) {
continue;
}
// call gemm: (n, in_channels) * (in_channels, out_channels)
const int M = h_counter_ptr[i];
const int K = in_channels;
const int N = out_channels;
T* tmp_in_ptr = in_features_ptr + h_offsets_ptr[i] * in_channels;
const T* tmp_kernel_ptr = kernel_ptr + i * K * N;
T* tmp_out_ptr = out_features_ptr + h_offsets_ptr[i] * out_channels;
blas.GEMM(CblasNoTrans,
CblasNoTrans,
M,
N,
K,
static_cast<T>(1),
tmp_in_ptr,
tmp_kernel_ptr,
static_cast<T>(0),
tmp_out_ptr);
}
// 4. scatter
phi::funcs::sparse::ScatterV2<T>(dev_ctx,
out_features_ptr,
out_index.data<int>(),
unique_value.data<int>(),
out->nnz(),
kernel_size,
out_channels,
1,
out_values_ptr);
#ifdef PADDLE_WITH_CUTLASS
}
#endif
}
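// A hedged summary of the branch above: on the non-CUTLASS path, features
// selected by the rulebook are gathered into a dense (rulebook_len x
// in_channels) buffer, one dense GEMM per kernel offset multiplies them by the
// matching (in_channels x out_channels) slice of the weights, and ScatterV2
// accumulates the partial results into out_values by output index. The
// CUTLASS path instead hands the gather and scatter index lists to
// dispatchKernel so the fused kernel reads and accumulates rows directly.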
/**
......