2.5

992bec46 · “yuguo” · 0259837d · 992bec46 · 992bec46 · 992bec46
Commit 992bec46 authored Oct 08, 2023 by “yuguo”
18 changed files
--- a/paddle/cinn/common/topo_walker.h
+++ b/paddle/cinn/common/topo_walker.h
+// Copyright (c) 2023 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <array>
+#include <functional>
+#include <queue>
+#include <unordered_set>
+
+namespace cinn {
+namespace common {
+
+// Topological order visitor
+template <typename NodeType>
+class TopoWalker final {
+ public:
+  TopoWalker(const TopoWalker&) = delete;
+  TopoWalker(TopoWalker&&) = delete;
+
+  using NodeHandlerType = std::function<void(NodeType)>;
+  using NodesVisitorType =
+      std::function<void(NodeType, const NodeHandlerType&)>;
+
+  TopoWalker(const NodesVisitorType& VisitPrevNodes,
+             const NodesVisitorType& VisitNextNodes)
+      : VisitPrevNodes_(VisitPrevNodes), VisitNextNodes_(VisitNextNodes) {}
+
+  void operator()(NodeType node, const NodeHandlerType& NodeHandler) const {
+    std::array<NodeType, 1> nodes{node};
+    (*this)(nodes.begin(), nodes.end(), NodeHandler);
+  }
+
+  template <typename NodeIt>
+  void operator()(NodeIt begin,
+                  NodeIt end,
+                  const NodeHandlerType& NodeHandler) const {
+    std::queue<NodeType> node_queue;
+    std::unordered_set<NodeType> queued_nodes;
+    const auto& TryEnqueueNode = [&](NodeType node) {
+      if (queued_nodes.count(node) == 0) {
+        node_queue.push(node);
+        queued_nodes.insert(node);
+      }
+    };
+    for (NodeIt iter = begin; iter != end; ++iter) {
+      TryEnqueueNode(*iter);
+    }
+    while (!node_queue.empty()) {
+      NodeType node = node_queue.front();
+      node_queue.pop();
+      NodeHandler(node);
+      VisitNextNodes_(node, [&](NodeType node) {
+        size_t num_unfinished_inputs = 0;
+        VisitPrevNodes_(node, [&](NodeType in_node) {
+          num_unfinished_inputs += (queued_nodes.count(in_node) > 0 ? 0 : 1);
+        });
+        if (num_unfinished_inputs == 0) {
+          TryEnqueueNode(node);
+        }
+      });
+    }
+  }
+
+ private:
+  NodesVisitorType VisitPrevNodes_;
+  NodesVisitorType VisitNextNodes_;
+};
+
+}  // namespace common
+}  // namespace cinn
--- a/paddle/cinn/common/topo_walker_test.cc
+++ b/paddle/cinn/common/topo_walker_test.cc
+// Copyright (c) 2023 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/cinn/common/topo_walker.h"
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+namespace cinn {
+namespace common {
+
+TEST(TopoWalker, simple) {
+  std::vector<std::pair<int, int>> edges{
+      {0, 3}, {1, 2}, {1, 3}, {2, 3}, {3, 4}};
+  TopoWalker<int> visitor(
+      [&](int node, const std::function<void(int)>& NodeHandler) {
+        for (const auto& pair : edges) {
+          if (pair.second == node) {
+            NodeHandler(pair.first);
+          }
+        }
+      },
+      [&](int node, const std::function<void(int)>& NodeHandler) {
+        for (const auto& pair : edges) {
+          if (pair.first == node) {
+            NodeHandler(pair.second);
+          }
+        }
+      });
+  std::vector<int> sources{0, 1};
+  std::vector<int> outputs;
+  visitor(sources.begin(), sources.end(), [&](int node) {
+    outputs.push_back(node);
+  });
+  std::vector<int> expected{0, 1, 2, 3, 4};
+  EXPECT_TRUE((outputs == expected));
+}
+
+}  // namespace common
+}  // namespace cinn
--- a/paddle/cinn/common/type.cc
+++ b/paddle/cinn/common/type.cc
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/cinn/common/type.h"
+
+#include <functional>
+#include <string>
+#include <unordered_map>
+#include <utility>
+
+namespace cinn {
+namespace common {
+
+struct Type::Storage {
+  Storage() = default;
+  Storage(type_t t, int b, int w, specific_type_t st)
+      : type_(t), bits_(b), lanes_(w), specific_type_(st) {}
+
+  type_t type_{type_t::Unk};
+  // distinguish FP16/BF16, or E5M2/E4M3 (when FP8 is supported)
+  specific_type_t specific_type_{specific_type_t::None};
+  cpp_type_t cpp_type_{cpp_type_t::None};
+
+  //! How many bits per element.
+  int bits_{0};
+
+  //! How many elements(if a vector type), for scalar types, it should be 1.
+  int lanes_{1};
+
+  //! Name of the customized type.
+  std::string customized_type_;
+};
+
+Type::~Type() {}
+
+std::string Type::to_string() const {
+  std::string ret = "";
+  if (is_cpp_const()) ret += "const ";
+  ret += Type2Str(*this);
+
+  if (lanes() > 1) {
+    ret += "<";
+    ret += std::to_string(lanes());
+    ret += ">";
+  }
+  if (is_cpp_handle()) ret += "*";
+  if (is_cpp_handle2()) ret += "**";
+
+  return ret;
+}
+
+std::ostream &operator<<(std::ostream &os, const Type &t) {
+  os << t.to_string();
+  return os;
+}
+
+std::ostream &operator<<(std::ostream &os, Type::type_t t) {
+  switch (t) {
+    case Type::type_t::Void:
+      os << "Void";
+      break;
+    case Type::type_t::UInt:
+      os << "UInt";
+      break;
+    case Type::type_t::Int:
+      os << "Int";
+      break;
+    case Type::type_t::Float:
+      os << "Float";
+      break;
+    case Type::type_t::Unk:
+      os << "Unk";
+      break;
+    case Type::type_t::Customized:
+      os << "Customized";
+  }
+  return os;
+}
+
+Type &Type::set_cpp_handle(bool x) {
+  // unset the other handle-related bits.
+  set_cpp_handle2(false);
+
+  auto &v = (*reinterpret_cast<uint8_t *>(&GetStorage().cpp_type_));
+  // unset the other handle-related bits.
+  v &= ~static_cast<uint8_t>(cpp_type_t::Handle);
+  v &= ~static_cast<uint8_t>(cpp_type_t::HandleHandle);
+
+  if (x)
+    v |= static_cast<uint8_t>(cpp_type_t::Handle);
+  else
+    v &= ~static_cast<uint8_t>(cpp_type_t::Handle);
+
+  return *this;
+}
+
+Type &Type::set_cpp_handle2(bool x) {
+  auto &v = (*reinterpret_cast<uint8_t *>(&GetStorage().cpp_type_));
+
+  // unset the other handle-related bits.
+  v &= ~static_cast<uint8_t>(cpp_type_t::Handle);
+  v &= ~static_cast<uint8_t>(cpp_type_t::HandleHandle);
+
+  if (x)
+    v |= static_cast<uint8_t>(cpp_type_t::HandleHandle);
+  else
+    v &= ~static_cast<uint8_t>(cpp_type_t::HandleHandle);
+
+  return *this;
+}
+
+Type Type::VectorOf(int w) const {
+  CheckTypeValid();
+  return Type(type(), bits(), w, specific_type());
+}
+
+Type::Type(const Type &other) {
+  if (other.storage_) storage_.reset(new Storage(*other.storage_));
+}
+
+Type Type::ElementOf() const {
+  CheckTypeValid();
+  auto type = *this;
+  type.storage_->lanes_ = 1;
+  return type;
+}
+
+void Type::CheckTypeValid() const {
+  CHECK_NE(GetStorage().type_, type_t::Unk);
+  if (GetStorage().type_ == type_t::Float && GetStorage().bits_ == 16) {
+    CHECK(GetStorage().specific_type_ == specific_type_t::FP16 ||
+          GetStorage().specific_type_ == specific_type_t::BF16)
+        << "When creating a 16 bits Float, the specific_type_t must be FP16 or "
+           "BF16.";
+  }
+}
+
+Type Type::PointerOf() const {
+  CheckTypeValid();
+  auto x = *this;
+  CHECK(!x.is_cpp_handle2()) << "Not support three level of PointerOf";
+  if (x.is_cpp_handle())
+    x.set_cpp_handle2();
+  else
+    x.set_cpp_handle();
+  return x;
+}
+
+Type Type::ConstOf() const {
+  CheckTypeValid();
+  auto x = *this;
+  x.set_cpp_const();
+  return x;
+}
+
+bool Type::is_supported() const {
+  return this->is_float(32) || this->is_float16() || this->is_bfloat16() ||
+         this->is_float(64) || this->is_bool() || this->is_int(8) ||
+         this->is_int(16) || this->is_int(32) || this->is_int(64) ||
+         this->is_uint(8) || this->is_uint(16) || this->is_uint(32) ||
+         this->is_uint(64);
+}
+
+Type Type::IgnoreConst() const {
+  CheckTypeValid();
+  auto x = *this;
+  x.set_cpp_const(false);
+  return x;
+}
+
+Type Type::with_bits(int x) const {
+  CHECK(is_primitive());
+  Type type = *this;
+  type.GetStorage().bits_ = x;
+  return type;
+}
+
+Type Type::with_type(Type::type_t x) const {
+  Type type = *this;
+  type.GetStorage().type_ = x;
+  return type;
+}
+
+Type Type::with_lanes(int x) const {
+  CHECK(valid());
+  Type type = *this;
+  type.GetStorage().lanes_ = x;
+  return type;
+}
+
+Type Type::with_cpp_const(bool x) const {
+  Type type = *this;
+  type.set_cpp_const(x);
+  return type;
+}
+
+Type &Type::set_cpp_const(bool is_const) {
+  uint8_t &data = *reinterpret_cast<uint8_t *>(&GetStorage().cpp_type_);
+  if (is_const) {
+    data |= static_cast<uint8_t>(cpp_type_t::Const);
+  } else {
+    data &= ~(static_cast<uint8_t>(cpp_type_t::Const));
+  }
+
+  return *this;
+}
+Type &Type::set_customized_type(const std::string &t) {
+  GetStorage().type_ = type_t ::Customized;
+  GetStorage().customized_type_ = t;
+
+  return *this;
+}
+
+bool Type::valid() const {
+  if (is_unk()) return false;
+  if (is_customized()) {
+    return !GetStorage().customized_type_.empty();
+  }
+  if (is_float() && GetStorage().bits_ == 16) {
+    return (GetStorage().specific_type_ == specific_type_t::FP16 ||
+            GetStorage().specific_type_ == specific_type_t::BF16);
+  }
+  if (is_primitive()) {
+    return bits() != 0;
+  }
+
+  return true;
+}
+
+Type::Type(Type::type_t t, int b, int w, specific_type_t st)
+    : storage_(new Storage(t, b, w, st)) {
+  if (t == Type::type_t::Float && b == 16) {
+    CHECK(st == specific_type_t::FP16 || st == specific_type_t::BF16)
+        << "When creating a 16 bits Float, the specific_type_t must be FP16 or "
+           "BF16.";
+  }
+}
+bool Type::is_primitive() const {
+  return !is_unk() && type() != type_t::Customized;
+}
+bool Type::is_customized() const {
+  return !is_unk() && type() == type_t::Customized;
+}
+bool Type::is_unk() const { return type() == type_t::Unk; }
+bool Type::is_bool() const { return type() == type_t::UInt && bits() == 1; }
+bool Type::is_void() const { return type() == type_t::Void; }
+bool Type::is_vector() const { return lanes() > 1; }
+bool Type::is_scalar() const { return lanes() == 1; }
+// Note: when calling is_float(16), 'st' can't be specific_type_t::None to
+// distinguish FP16/BF16, or use is_float16()/is_bfloat16() for short
+bool Type::is_float(int bits, specific_type_t st) const {
+  if (type() == type_t::Float && bits == 16) {
+    CHECK(st != specific_type_t::None)
+        << "when calling is_float(16), 'st' can't be specific_type_t::None to "
+           "distinguish FP16/BF16, or use is_float16()/is_bfloat16() for short";
+    return st == this->specific_type();
+  } else {
+    return type() == type_t::Float && (bits < 0 || bits == this->bits());
+  }
+}
+bool Type::is_float16() const { return is_float(16, specific_type_t::FP16); }
+bool Type::is_bfloat16() const { return is_float(16, specific_type_t::BF16); }
+bool Type::is_uint(int bits) const {
+  return type() == type_t::UInt && (bits < 0 || bits == this->bits());
+}
+bool Type::is_int(int bits) const {
+  return type() == type_t::Int && (bits < 0 || bits == this->bits());
+}
+bool Type::is_integer(int bits) const {
+  return (type() == type_t::Int || type() == type_t::UInt) &&
+         (bits < 0 || bits == this->bits());
+}
+bool Type::is_index_type() {
+  return is_int() && lanes() == 1 && (bits() == 32 || bits() == 64);
+}
+bool Type::is_cpp_handle() const {
+  return static_cast<uint8_t>(GetStorage().cpp_type_) &
+         static_cast<uint8_t>(cpp_type_t::Handle);
+}
+bool Type::is_cpp_handle2() const {
+  return static_cast<uint8_t>(GetStorage().cpp_type_) &
+         static_cast<uint8_t>(cpp_type_t::HandleHandle);
+}
+bool Type::is_cpp_const() const {
+  return static_cast<uint8_t>(cpp_type_t::Const) &
+         static_cast<uint8_t>(GetStorage().cpp_type_);
+}
+const std::string &Type::customized_type() const {
+  return GetStorage().customized_type_;
+}
+bool Type::is_customized_type() const {
+  return !GetStorage().customized_type_.empty();
+}
+Type::type_t Type::type() const { return GetStorage().type_; }
+Type::specific_type_t Type::specific_type() const {
+  return GetStorage().specific_type_;
+}
+int Type::bits() const { return GetStorage().bits_; }
+int Type::lanes() const { return GetStorage().lanes_; }
+Type::cpp_type_t Type::cpp_type() const { return GetStorage().cpp_type_; }
+bool Type::operator==(const Type &other) const {
+  return type() == other.type() && specific_type() == other.specific_type() &&
+         bits() == other.bits() && lanes() == other.lanes() &&
+         GetStorage().cpp_type_ == other.GetStorage().cpp_type_ &&
+         customized_type() == other.customized_type();
+}
+bool Type::is_string() const { return type() == type_t::String; }
+
+Type &Type::operator=(const Type &other) {
+  if (other.storage_) {
+    storage_.reset(new Storage(other.GetStorage().type_,
+                               other.GetStorage().bits_,
+                               other.GetStorage().lanes_,
+                               other.GetStorage().specific_type_));
+    storage_->cpp_type_ = other.GetStorage().cpp_type_;
+    storage_->customized_type_ = other.GetStorage().customized_type_;
+  }
+  return *this;
+}
+
+Type::Storage &Type::GetStorage() {
+  CHECK(storage_) << "The type not initializated! Please check.";
+  return *storage_;
+}
+const Type::Storage &Type::GetStorage() const {
+  CHECK(storage_) << "The type not initializated! Please check.";
+  return *storage_;
+}
+
+Type::Type() : storage_(new Storage) {}
+Type::Type(Type &&other) : storage_(std::move(other.storage_)) {}
+
+const Type &BF16() {
+  static auto t = Float(16, 1, Type::specific_type_t::BF16);
+  return t;
+}
+const Type &F16() {
+  static auto t = Float(16, 1, Type::specific_type_t::FP16);
+  return t;
+}
+const Type &F32() {
+  static auto t = Float(32);
+  return t;
+}
+const Type &F64() {
+  static auto t = Float(64);
+  return t;
+}
+const Type &I8() {
+  static auto t = Int(8);
+  return t;
+}
+const Type &I16() {
+  static auto t = Int(16);
+  return t;
+}
+const Type &I32() {
+  static auto t = Int(32);
+  return t;
+}
+const Type &I64() {
+  static auto t = Int(64);
+  return t;
+}
+const Type &UI8() {
+  static auto t = UInt(8);
+  return t;
+}
+const Type &UI16() {
+  static auto t = UInt(16);
+  return t;
+}
+const Type &UI32() {
+  static auto t = UInt(32);
+  return t;
+}
+const Type &UI64() {
+  static auto t = UInt(64);
+  return t;
+}
+const Type &I1() {
+  static auto t = Int(1);
+  return t;
+}
+const Type &UI1() {
+  static auto t = UInt(1);
+  return t;
+}
+
+struct TypeHash {
+  size_t operator()(const Type &type) const {
+    std::string hash_str;
+    hash_str += std::to_string(static_cast<int>(type.type()));
+    hash_str += std::to_string(static_cast<int>(type.specific_type()));
+    hash_str += std::to_string(type.bits());
+    hash_str += std::to_string(type.lanes());
+    hash_str += std::to_string(static_cast<int>(type.cpp_type()));
+    if (type.is_customized_type()) {
+      hash_str += type.customized_type();
+    }
+
+    return std::hash<std::string>()(hash_str);
+  }
+};
+
+int Type::bytes() const {
+  // if the type is a pointer
+  auto cpp_type = this->cpp_type();
+  if (cpp_type == Type::cpp_type_t::Handle ||
+      cpp_type == Type::cpp_type_t::HandleHandle) {
+    return sizeof(void *);
+  }
+
+// if the type is an known pod type
+#define GET_TYPE_SIZE_PAIR(TYPE) \
+  { type_of<TYPE>(), sizeof(TYPE) }
+  static std::unordered_map<Type, int, TypeHash> type_bytes = {
+      GET_TYPE_SIZE_PAIR(bfloat16),
+      GET_TYPE_SIZE_PAIR(float16),
+      GET_TYPE_SIZE_PAIR(float),
+      GET_TYPE_SIZE_PAIR(double),
+
+      GET_TYPE_SIZE_PAIR(char),
+      GET_TYPE_SIZE_PAIR(signed char),
+      GET_TYPE_SIZE_PAIR(unsigned char),
+
+      GET_TYPE_SIZE_PAIR(int8_t),
+      GET_TYPE_SIZE_PAIR(int16_t),
+      GET_TYPE_SIZE_PAIR(int32_t),
+      GET_TYPE_SIZE_PAIR(int64_t),
+
+      GET_TYPE_SIZE_PAIR(uint8_t),
+      GET_TYPE_SIZE_PAIR(uint16_t),
+      GET_TYPE_SIZE_PAIR(uint32_t),
+      GET_TYPE_SIZE_PAIR(uint64_t),
+
+      GET_TYPE_SIZE_PAIR(bool),
+  };
+#undef GET_TYPE_SIZE_PAIR
+
+  if (type_bytes.count(*this)) {
+    return type_bytes.at(*this);
+  }
+
+  // else get size by bits size
+  auto bit_size = this->bits();
+  return (bit_size + 7) / 8;
+}
+
+Type Str2Type(const std::string &type) {
+  static std::unordered_map<std::string, Type> str2type_map = {
+      {"unk", Type()},
+      {"void", Void()},
+      {"bool", Bool()},
+      {"unsigned char", UI8()},
+
+      {"char", I8()},
+      {"signed char", I8()},
+
+      {"string", String()},
+
+      {"bit", I1()},
+      {"signed bit", I1()},
+      {"int1", I1()},
+      {"int1_t", I1()},
+
+      {"ubit", UI1()},
+      {"unsigned bit", UI1()},
+      {"uint1", UI1()},
+      {"uint1_t", UI1()},
+
+      {"int8", I8()},
+      {"int8_t", I8()},
+
+      {"int16", I16()},
+      {"int16_t", I16()},
+
+      {"int", I32()},
+      {"int32", I32()},
+      {"int32_t", I32()},
+
+      {"int64", I64()},
+      {"int64_t", I64()},
+
+      {"uint8", UI8()},
+      {"uint8_t", UI8()},
+
+      {"uint16", UI16()},
+      {"uint16_t", UI16()},
+
+      {"uint", UI32()},
+      {"uint32", UI32()},
+      {"uint32_t", UI32()},
+
+      {"uint64", UI64()},
+      {"uint64_t", UI64()},
+
+      {"bfloat16", BF16()},
+      {"float16", F16()},
+      {"half", F16()},
+
+      {"float", F32()},
+      {"float32", F32()},
+
+      {"float64", F64()},
+      {"double", F64()},
+
+      {"void*", type_of<void *>()},
+      {"void_p", type_of<void *>()},
+      {"void**", type_of<void **>()},
+      {"void_p_p", type_of<void **>()},
+
+      {"int8*", type_of<int8_t *>()},
+      {"int8_p", type_of<int8_t *>()},
+      {"int8_t*", type_of<int8_t *>()},
+
+      {"uint8*", type_of<uint8_t *>()},
+      {"uint8_p", type_of<uint8_t *>()},
+      {"uint8_t*", type_of<uint8_t *>()},
+
+      {"bfloat16*", type_of<bfloat16 *>()},
+      {"float16*", type_of<float16 *>()},
+      {"half*", type_of<float16 *>()},
+      {"bfloat16_p", type_of<bfloat16 *>()},
+      {"float16_p", type_of<float16 *>()},
+      {"half_p", type_of<float16 *>()},
+
+      {"float*", type_of<float *>()},
+      {"float32*", type_of<float *>()},
+      {"float_p", type_of<float *>()},
+      {"float32_p", type_of<float *>()},
+
+      {"double*", type_of<double *>()},
+      {"float64*", type_of<double *>()},
+      {"double_p", type_of<double *>()},
+      {"float64_p", type_of<double *>()},
+
+      {"cinn_buffer", type_of<cinn_buffer_t>()},
+      {"cinn_buffer*", type_of<cinn_buffer_t>()},
+      {"cinn_buffer_p", type_of<cinn_buffer_t *>()},
+
+      {"const cinn_buffer*", type_of<const cinn_buffer_t *>()},
+      {"const_cinn_buffer_p", type_of<const cinn_buffer_t *>()},
+
+      {"cinn_pod_value", type_of<cinn_pod_value_t>()},
+      {"cinn_pod_value*", type_of<cinn_pod_value_t *>()},
+      {"cinn_pod_value_p", type_of<cinn_pod_value_t *>()},
+  };
+
+  CHECK(str2type_map.find(type) != str2type_map.end())
+      << "Not support type [" << type << "] ! Please Check.\n";
+  return str2type_map.at(type);
+}
+
+std::string Type2Str(const Type &type) {
+  switch (type.type()) {
+    case Type::type_t::Int:
+      return "int" + std::to_string(type.bits());
+
+    case Type::type_t::UInt:
+      if (type.bits() == 1) {
+        return "bool";
+      } else {
+        return "uint" + std::to_string(type.bits());
+      }
+
+    case Type::type_t::Float:
+      switch (type.specific_type()) {
+        case Type::specific_type_t::None:
+          return "float" + std::to_string(type.bits());
+        case Type::specific_type_t::BF16:
+          return "bfloat16";
+        case Type::specific_type_t::FP16:
+          return "float16";
+        default:
+          break;
+      }
+
+    case Type::type_t::Void:
+      return "void";
+
+    case Type::type_t::Customized:
+      return type.customized_type();
+
+    case Type::type_t::String:
+      return "string";
+
+    case Type::type_t::Unk:
+      return "unk";
+
+    default:
+      LOG(FATAL) << "Not support type [" << type << "] ! Please Check.\n";
+  }
+  return "unk";
+}
+
+}  // namespace common
+}  // namespace cinn
--- a/paddle/cinn/common/type.h
+++ b/paddle/cinn/common/type.h
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <glog/logging.h>
+
+#include <memory>
+#include <string>
+
+#include "paddle/cinn/common/bfloat16.h"
+#include "paddle/cinn/common/float16.h"
+#include "paddle/cinn/common/float16_bfloat16_utils.h"
+#include "paddle/cinn/common/macros.h"
+#include "paddle/cinn/runtime/cinn_runtime.h"
+
+//! Much of the concepts are borrowed from Halide project.
+
+namespace cinn {
+namespace common {
+
+/**
+ * Types in the CINN type system. They can be ints, unsigned ints, or floats of
+ * various bit-widths. They can also be vectors of the same (by setting the
+ * `lanes` field to something larger than one). NOTE: Front-end code other than
+ * vectorize shouldn't use vector types.
+ */
+struct Type {
+  enum class type_t {
+    Unk = -1,
+    Int,
+    UInt,
+    Float,
+    String,
+    Void,
+    // stupid idea to mix the Customized with other primitive types, large
+    // refactor needs here.
+    Customized,  // Customized type
+  };
+
+  // CINN use type_t and bits to distinguish data types, like is_float(64) for
+  // double, is_float(32) for float, but for Float16 and BFloat16, the bits are
+  // both 16, so we need some other info to distinguish them.
+  enum class specific_type_t {
+    // None for some cases we only care about the bits, e.g. vectorize for
+    // hardwares
+    None = -1,
+    FP16,
+    BF16,
+    // for FP8 in future
+    // E5M2,
+    // E4M3,
+  };
+
+  //! type decorators in C++, the different code can used together.
+  enum class cpp_type_t : uint8_t {
+    None = 0,               // None information.
+    Const = 1,              // const.
+    Handle = 1 << 1,        // pointer type, such as `cinn_buffer_t*`.
+    HandleHandle = 1 << 2,  // pointer of pointer, such as `cinn_buffer_t**`.
+  };
+
+  Type();
+  Type(type_t t, int b, int w, specific_type_t st = specific_type_t::None);
+  Type(const Type& other);
+  explicit Type(Type&& other);
+  Type& operator=(const Type& other);
+
+  CINN_NODISCARD bool is_primitive() const;
+  CINN_NODISCARD bool is_customized() const;
+  CINN_NODISCARD bool valid() const;
+
+  //! Some helper functions to check a type.
+  // @{
+  CINN_NODISCARD bool is_unk() const;
+  CINN_NODISCARD bool is_void() const;
+  CINN_NODISCARD bool is_bool() const;
+  CINN_NODISCARD bool is_vector() const;
+  CINN_NODISCARD bool is_scalar() const;
+  CINN_NODISCARD bool is_float(
+      int bits = -1, specific_type_t st = specific_type_t::None) const;
+  CINN_NODISCARD bool is_float16() const;
+  CINN_NODISCARD bool is_bfloat16() const;
+  CINN_NODISCARD bool is_int(int bits = -1) const;
+  CINN_NODISCARD bool is_integer(int bits = -1) const;
+  CINN_NODISCARD bool is_uint(int bits = -1) const;
+  CINN_NODISCARD bool is_string() const;
+  CINN_NODISCARD bool is_index_type();
+  // @}
+
+  Type& set_cpp_handle(bool x = true);
+  CINN_NODISCARD bool is_cpp_handle() const;
+
+  Type& set_cpp_handle2(bool x = true);
+  CINN_NODISCARD bool is_cpp_handle2() const;
+
+  Type& set_cpp_const(bool is_const = true);
+  CINN_NODISCARD bool is_cpp_const() const;
+
+  Type& set_customized_type(const std::string& t);
+  const std::string& customized_type() const;
+  CINN_NODISCARD bool is_customized_type() const;
+
+  // Get a new type with bits set to \p x.
+  Type with_bits(int x) const;
+  // Get a new type with type set to \p x.
+  Type with_type(type_t x) const;
+  // Get a new type with lanes set to \p x.
+  Type with_lanes(int x) const;
+  // Get a new type with cpp_const set to \p x.
+  Type with_cpp_const(bool x = true) const;
+
+  //! Getters
+  // @{
+  type_t type() const;
+  specific_type_t specific_type() const;
+  int bits() const;
+  int lanes() const;
+  cpp_type_t cpp_type() const;
+  int bytes() const;
+  // @}
+
+  //! Compare two types for equality.
+  bool operator==(const Type& other) const;
+
+  //! Compare two types for inequality.
+  bool operator!=(const Type& other) const { return !(*this == other); }
+
+  //! Generate a vector of this type, with `w` elements.
+  Type VectorOf(int w) const;
+  //! Generate a element type of this type.
+  Type ElementOf() const;
+  //! Generate the address type.
+  Type PointerOf() const;
+  //! Ignore const.
+  Type IgnoreConst() const;
+  //! Add const.
+  Type ConstOf() const;
+  //! Check if a dtype is supported in CINN yet.
+  bool is_supported() const;
+
+  std::string to_string() const;
+
+  friend std::ostream& operator<<(std::ostream& os, const Type& t);
+
+  ~Type();
+
+ private:
+  void CheckTypeValid() const;
+
+  struct Storage;
+  Storage& GetStorage();
+  const Storage& GetStorage() const;
+
+  std::unique_ptr<Storage> storage_;
+};  // namespace common
+
+inline Type Void() { return Type(Type::type_t ::Void, 1, 0); }
+inline Type Int(int bits, int lanes = 1) {
+  return Type(Type::type_t ::Int, bits, lanes);
+}
+inline Type UInt(int bits, int lanes = 1) {
+  return Type(Type::type_t ::UInt, bits, lanes);
+}
+inline Type BFloat16(int lanes = 1) {
+  return Type(Type::type_t ::Float, 16, lanes, Type::specific_type_t::BF16);
+}
+inline Type Float16(int lanes = 1) {
+  return Type(Type::type_t ::Float, 16, lanes, Type::specific_type_t::FP16);
+}
+inline Type Float(int bits,
+                  int lanes = 1,
+                  Type::specific_type_t st = Type::specific_type_t::None) {
+  if (bits == 16) {
+    CHECK(st == Type::specific_type_t::FP16 ||
+          st == Type::specific_type_t::BF16)
+        << "When creating a 16 bits Float, the specific_type_t must be FP16 or "
+           "BF16.";
+  }
+  return Type(Type::type_t ::Float, bits, lanes, st);
+}
+inline Type Bool(int lanes = 1) { return Type(Type::type_t ::UInt, 1, lanes); }
+inline Type String() { return Type(Type::type_t::String, 1, 1); }
+
+//! Builtin native types as global singletons.
+// @{
+const Type& BF16();
+const Type& F16();
+const Type& F32();
+const Type& F64();
+const Type& I8();
+const Type& I16();
+const Type& I32();
+const Type& I64();
+const Type& UI8();
+const Type& UI16();
+const Type& UI32();
+const Type& UI64();
+const Type& I1();
+const Type& UI1();
+// @}
+
+template <typename T>
+Type type_of();
+
+// clang-format off
+template <> inline Type type_of<void>() { return Void(); }
+
+template <> inline Type type_of<bfloat16>() { return BF16(); }
+template <> inline Type type_of<float16>() { return F16(); }
+template <> inline Type type_of<float>() { return F32(); }
+template <> inline Type type_of<double>() { return F64(); }
+
+template <> inline Type type_of<bool>() { return UI1(); }
+template <> inline Type type_of<char>() { return I8(); }
+// template <> inline Type type_of<signed char>() { return I8(); }
+// template <> inline Type type_of<unsigned char>() { return UI8(); }
+template <> inline Type type_of<std::string>() { return String(); }
+
+template <> inline Type type_of<int8_t>() { return I8(); }
+template <> inline Type type_of<int16_t>() { return I16(); }
+template <> inline Type type_of<int32_t>() { return I32(); }
+template <> inline Type type_of<int64_t>() { return I64(); }
+
+template <> inline Type type_of<uint8_t>() { return UI8(); }
+template <> inline Type type_of<uint16_t>() { return UI16(); }
+template <> inline Type type_of<uint32_t>() { return UI32(); }
+template <> inline Type type_of<uint64_t>() { return UI64(); }
+
+// clang-format on
+template <>
+inline Type type_of<int8_t*>() {
+  Type x = Int(8);
+  x.set_cpp_handle();
+  return x;
+}
+template <>
+inline Type type_of<uint8_t*>() {
+  Type x = UInt(8);
+  x.set_cpp_handle();
+  return x;
+}
+template <>
+inline Type type_of<void*>() {
+  Type x = type_of<void>();
+  x.set_cpp_handle();
+  return x;
+}
+template <>
+inline Type type_of<void**>() {
+  Type x = type_of<void>();
+  x.set_cpp_handle2();
+  return x;
+}
+template <>
+inline Type type_of<bfloat16*>() {
+  Type x = type_of<float16>();
+  x.set_cpp_handle();
+  return x;
+}
+template <>
+inline Type type_of<float16*>() {
+  Type x = type_of<float16>();
+  x.set_cpp_handle();
+  return x;
+}
+template <>
+inline Type type_of<float*>() {
+  Type x = type_of<float>();
+  x.set_cpp_handle();
+  return x;
+}
+template <>
+inline Type type_of<double*>() {
+  Type x = type_of<double>();
+  x.set_cpp_handle();
+  return x;
+}
+
+std::ostream& operator<<(std::ostream& os, Type::type_t t);
+
+namespace customized_type {
+
+static const char* kArgs_type_repr = "Args";
+static const char* kArgValue_type_repr = "ArgValue";
+static const char* kbuffer_t = "cinn_buffer_t";
+static const char* kpod_value_t = "cinn_pod_value_t";
+static const char* kcuda_builtin_vector_t = "CudaVectorType::";
+
+}  // namespace customized_type
+
+template <>
+inline Type type_of<cinn_buffer_t>() {
+  return Type().set_customized_type(customized_type::kbuffer_t);
+}
+template <>
+inline Type type_of<cinn_buffer_t*>() {
+  return Type()
+      .set_customized_type(customized_type::kbuffer_t)
+      .set_cpp_handle();
+}
+template <>
+inline Type type_of<const cinn_buffer_t*>() {
+  return Type()
+      .set_customized_type(customized_type::kbuffer_t)
+      .set_cpp_handle()
+      .set_cpp_const();
+}
+template <>
+inline Type type_of<cinn_pod_value_t>() {
+  return Type().set_customized_type(customized_type::kpod_value_t);
+}
+template <>
+inline Type type_of<cinn_pod_value_t*>() {
+  return Type()
+      .set_customized_type(customized_type::kpod_value_t)
+      .set_cpp_handle();
+}
+
+Type Str2Type(const std::string& type);
+
+std::string Type2Str(const Type& type);
+
+enum class Layout {
+  kUnk = 0,
+  kNCHW,
+  kNHWC,
+};
+
+}  // namespace common
+}  // namespace cinn
--- a/paddle/cinn/common/type_test.cc
+++ b/paddle/cinn/common/type_test.cc
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/cinn/common/type.h"
+
+#include <gtest/gtest.h>
+
+namespace cinn::common {
+
+TEST(Type, basic) {
+  LOG(INFO) << I32();
+
+  auto i32 = I32();
+  LOG(INFO) << I32();
+
+  LOG(INFO) << F32();
+  LOG(INFO) << type_of<float>();
+}
+
+}  // namespace cinn::common
--- a/paddle/cinn/common/union_find.cc
+++ b/paddle/cinn/common/union_find.cc
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/cinn/common/union_find.h"
+
+namespace cinn {
+namespace common {
+
+const char* UnionFindNode::__type_info__ = "UnionFindNode";
+const char* UnionFindNode::type_info() const { return __type_info__; }
+
+}  // namespace common
+}  // namespace cinn
--- a/paddle/cinn/common/union_find.h
+++ b/paddle/cinn/common/union_find.h
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/**
+ * \file This file implements a general UnionFind algorithm to help cluster
+ * something.
+ */
+#pragma once
+#include <cstring>
+#include <map>
+#include <string>
+#include <tuple>
+#include <vector>
+
+#include "paddle/cinn/common/object.h"
+#include "paddle/cinn/common/shared.h"
+
+namespace cinn {
+namespace common {
+
+struct UnionFindNode : public Object {
+  UnionFindNode* parent{};
+  std::string cluster_info;
+
+  std::tuple<UnionFindNode*, int /*height*/> GetRoot() {
+    auto* p = this;
+    int level = 0;
+    while (p->parent) {
+      p = p->parent;
+      level++;
+    }
+    return std::make_tuple(p, level);
+  }
+
+  void Union(UnionFindNode* other) {
+    auto _p0_l0_ = GetRoot();
+    auto& p0 = std::get<0>(_p0_l0_);
+    auto& l0 = std::get<1>(_p0_l0_);
+    auto _p1_l1_ = other->GetRoot();
+    auto& p1 = std::get<0>(_p1_l1_);
+    auto& l1 = std::get<1>(_p1_l1_);
+    if (p0 == p1) return;
+
+    if (l0 < l1) {
+      p1->parent = p0;
+    } else {
+      p0->parent = p1;
+    }
+  }
+
+  template <typename T>
+  T* safe_as() {
+    CHECK_EQ(std::strcmp(T::__type_info__, type_info()), 0)
+        << "Want a " << T::__type_info__ << " but get a " << type_info();
+    return reinterpret_cast<T*>(this);
+  }
+
+  const char* type_info() const override;
+
+  static const char* __type_info__;
+};
+
+struct UnionFind {
+  UnionFindNode* AddNode(UnionFindNode* node) {
+    nodes.emplace_back(node);
+    return node;
+  }
+
+  std::vector<std::vector<UnionFindNode*>> GetClusters() {
+    std::map<UnionFindNode* /*root*/, std::vector<UnionFindNode*>> clusters;
+
+    for (auto& n : nodes) {
+      auto _root_l_ = n->GetRoot();  // NOLINT
+      auto& root = std::get<0>(_root_l_);
+      auto& l = std::get<1>(_root_l_);
+      clusters[root].push_back(n.get());
+    }
+
+    std::vector<std::vector<UnionFindNode*>> res;
+    for (auto& item : clusters) {
+      res.push_back(item.second);
+    }
+    return res;
+  }
+
+  std::vector<common::Shared<UnionFindNode>> nodes;
+};
+
+}  // namespace common
+}  // namespace cinn
--- a/paddle/cinn/frontend/CMakeLists.txt
+++ b/paddle/cinn/frontend/CMakeLists.txt
+core_gather_headers()
+gather_srcs(
+  cinnapi_src
+  SRCS
+  computation.cc
+  syntax.cc
+  paddle_model_to_program.cc
+  interpreter.cc
+  net_builder.cc
+  op_mapper_registry.cc
+  paddle_model_convertor.cc
+  program_pass.cc
+  optimize.cc)
+
+if(NOT WITH_CUDA)
+  cinn_cc_test(
+    test_frontend_syntax
+    ARGS
+    "--model_dir=${THIRD_PARTY_PATH}/naive_mul_model"
+    SRCS
+    syntax_test.cc
+    DEPS
+    cinncore)
+
+  #  cinn_cc_test(test_frontend_interpreter
+  #          ARGS --model_dir=${THIRD_PARTY_PATH}/naive_mul_model
+  #          SRCS interpreter_test.cc DEPS cinncore)
+
+else()
+  cinn_nv_test(
+    test_frontend_syntax
+    ARGS
+    "--model_dir=${THIRD_PARTY_PATH}/naive_mul_model"
+    SRCS
+    syntax_test.cc
+    DEPS
+    cinncore)
+
+  cinn_nv_test(
+    test_frontend_interpreter
+    ARGS
+    --model_dir=${THIRD_PARTY_PATH}/naive_mul_model
+    SRCS
+    interpreter_test.cc
+    DEPS
+    cinncore)
+endif()
+
+#cinn_cc_test(test_paddle_model_convertor
+#        ARGS --model_dir=${THIRD_PARTY_PATH}/resnet_model
+#        SRCS paddle_model_convertor_test.cc DEPS cinncore decomposer_test_helper)
+
+#cinn_cc_test(test_computation
+#  ARGS "--model_dir=${THIRD_PARTY_PATH}/naive_mul_model"
+#  SRCS computation_test.cc DEPS cinncore)
+
+cinn_cc_test(test_net_builder SRCS net_builder_test.cc DEPS cinncore)
+cinn_cc_test(test_decomposer_registry SRCS decomposer_registry_test.cc DEPS
+             cinncore)
+
+add_subdirectory(paddle)
+add_subdirectory(decomposer)
+add_subdirectory(op_mappers)
+add_subdirectory(pass)
+
+cinn_cc_test(test_op_mapper_registry SRCS op_mapper_registry_test.cc DEPS
+             cinncore)
--- a/paddle/cinn/frontend/computation.cc
+++ b/paddle/cinn/frontend/computation.cc
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/cinn/frontend/computation.h"
+
+#include "paddle/cinn/frontend/optimize.h"
+#include "paddle/cinn/frontend/program_pass.h"
+#include "paddle/cinn/hlir/framework/graph.h"
+#include "paddle/cinn/hlir/framework/graph_compiler.h"
+#include "paddle/cinn/hlir/framework/pass.h"
+#include "paddle/cinn/hlir/framework/scope.h"
+
+namespace cinn {
+namespace frontend {
+
+struct ComputationContext {
+  Target target;
+  void *stream;
+  std::shared_ptr<hlir::framework::Graph> graph;
+  std::shared_ptr<hlir::framework::Scope> scope;
+  std::shared_ptr<hlir::framework::Program> program;
+  std::shared_ptr<hlir::framework::GraphCompiler> graph_compiler;
+
+  CinnComputation::CompileOptions compile_options;
+
+  std::vector<hlir::framework::Tensor> inputs;
+  std::vector<hlir::framework::Tensor> outputs;
+  std::unordered_map<std::string, Variable> varmap;
+  std::unordered_map<std::string, std::string> varmap_paddle2program;
+};
+
+std::shared_ptr<ComputationContext> CompileProgram(
+    const Target &target,
+    Program &program,  // NOLINT
+    const std::vector<Variable> &outputs,
+    std::shared_ptr<hlir::framework::Scope> scope,
+    const CinnComputation::CompileOptions &options,
+    void *stream) {
+  std::shared_ptr<ComputationContext> ctx(new ComputationContext());
+  ctx->stream = stream;
+  ctx->target = target;
+  ctx->compile_options = options;
+  if (ctx->compile_options.use_decomposer) {
+    ProgramPass::Apply(&program, {}, target, {"Decomposer"});
+  }
+  ctx->graph.reset(new hlir::framework::Graph(program, target));
+
+  if (ctx->compile_options.use_default_passes) {
+    hlir::framework::ApplyPass(ctx->graph.get(), "InferShape");
+
+#ifndef CINN_WITH_CUDA
+    if (target.arch == Target::Arch::X86) {
+      hlir::framework::ApplyPass(ctx->graph.get(), "AlterLayout");
+    }
+#endif
+    hlir::framework::ApplyPass(ctx->graph.get(), "ConstPropagate");
+    hlir::framework::ApplyPasses(ctx->graph.get(), DefaultOpFusionPasses());
+  }
+  for (auto &pass_name : ctx->compile_options.passes) {
+    hlir::framework::ApplyPass(ctx->graph.get(), pass_name);
+  }
+
+  ctx->scope = hlir::framework::BuildScope(target, ctx->graph, scope);
+  ctx->graph_compiler.reset(
+      new hlir::framework::GraphCompiler(target, ctx->scope, ctx->graph));
+
+  std::unordered_set<std::string> fetch_var_ids;
+  for (auto &out : outputs) {
+    fetch_var_ids.insert(out->id);
+  }
+
+  ctx->program = ctx->graph_compiler->Build(options, std::move(fetch_var_ids))
+                     .runtime_program;
+  if (ctx->compile_options.do_prerun) {
+    ctx->program->PreRun();
+  }
+
+  for (auto &in_v : program.GetInputs()) {
+    hlir::framework::Tensor t = ctx->scope->GetTensor(in_v->id);
+    ctx->inputs.push_back(t);
+  }
+  for (auto &out_v : outputs) {
+    hlir::framework::Tensor t = ctx->scope->GetTensor(out_v->id);
+    ctx->outputs.push_back(t);
+  }
+  return ctx;
+}
+
+std::vector<std::string> CinnComputation::GetAllTensorNames() {
+  std::vector<std::string> res;
+  for (auto &v : context_->scope->var_names()) {
+    res.push_back(std::string(v));
+  }
+  return res;
+}
+
+std::shared_ptr<CinnComputation> CinnComputation::CompilePaddleModel(
+    const Target &target,
+    const std::string &model_path,
+    const std::vector<std::string> &input_names,
+    const std::vector<hlir::framework::shape_t> &input_shapes,
+    bool params_combined,
+    const CompileOptions &options,
+    void *stream) {
+  CHECK(input_names.size() == input_shapes.size());
+  auto scope = std::make_shared<hlir::framework::Scope>();
+  std::unordered_map<std::string, std::vector<int>> input_shape_map;
+  for (int idx = 0; idx < input_names.size(); ++idx) {
+    input_shape_map[input_names[idx]] = input_shapes[idx];
+  }
+  auto loadedProgram = LoadPaddleProgram(
+      model_path, scope.get(), input_shape_map, params_combined, target);
+  auto &program = std::get<0>(loadedProgram);
+  auto &varmap = std::get<1>(loadedProgram);
+  auto &varmap_paddle2program = std::get<2>(loadedProgram);
+  auto &fetch_names = std::get<3>(loadedProgram);
+
+  // std::vector<Variable> input_vars;
+  // for (int i = 0; i < input_names.size(); i++) {
+  //   auto &name = input_names[i];
+  //   auto &var  = varmap.at(name);
+  //   var->shape = input_shapes[i];
+  //   input_vars.push_back(var);
+  // }
+  // program->SetInputs({input_vars});
+  // program->Validate();
+  VLOG(3) << "program:\n" << *program;
+  std::vector<Variable> output_vars;
+  for (auto &name : fetch_names) {
+    output_vars.push_back(varmap.at(name));
+  }
+
+  std::shared_ptr<ComputationContext> ctx =
+      CompileProgram(target, *program, output_vars, scope, options, stream);
+  for (auto &v : varmap) {
+    ctx->varmap[v.first] = v.second;
+  }
+  for (auto &v : varmap_paddle2program) {
+    ctx->varmap_paddle2program[v.first] = v.second;
+  }
+
+  auto computation = std::make_shared<CinnComputation>();
+  computation->context_ = std::move(ctx);
+
+  return computation;
+}
+
+std::shared_ptr<CinnComputation> CinnComputation::BuildAndCompile(
+    const Target &target,
+    NetBuilder &builder,
+    const CompileOptions &options,
+    const std::vector<Variable> &outputs,
+    void *stream) {
+  auto program = builder.Build();
+  return Compile(target, program, options, outputs, stream);
+}
+
+std::shared_ptr<CinnComputation> CinnComputation::Compile(
+    const Target &target,
+    Program &program,
+    const CompileOptions &options,
+    const std::vector<Variable> &outputs,
+    void *stream) {
+  std::vector<Variable> output_vars = outputs;
+  if (output_vars.empty()) {
+    output_vars.push_back(program[program.size() - 1].GetOutput(0));
+  }
+
+  std::shared_ptr<ComputationContext> ctx =
+      CompileProgram(target, program, output_vars, nullptr, options, stream);
+
+  auto computation = std::make_shared<CinnComputation>();
+  computation->context_ = std::move(ctx);
+
+  return computation;
+}
+
+void CinnComputation::SetTensorData(const std::string &tname,
+                                    void *data,
+                                    size_t size) {
+  hlir::framework::Tensor t = GetTensor(tname);
+  SetTensorData(t, data, size);
+}
+
+void CinnComputation::SetTensorData(hlir::framework::Tensor &t,
+                                    void *data,
+                                    size_t size) {
+  void *tdata = t->mutable_data(context_->target, t->type());
+  CHECK_EQ(size, t->shape().numel() * t->type().bytes());
+  if (context_->target.arch == Target::Arch::NVGPU) {
+#ifdef CINN_WITH_CUDA
+    CUDA_CALL(cudaMemcpy(tdata, data, size, cudaMemcpyHostToDevice));
+#else
+    CINN_NOT_IMPLEMENTED
+#endif
+  } else if (context_->target.arch == Target::Arch::X86) {
+    memcpy(tdata, data, size);
+  } else {
+    CINN_NOT_IMPLEMENTED
+  }
+}
+void CinnComputation::GetTensorData(hlir::framework::Tensor &t,
+                                    void *data,
+                                    size_t size) {
+  void *tdata = t->mutable_data(context_->target, t->type());
+  CHECK_EQ(size, t->shape().numel() * t->type().bytes());
+  if (context_->target.arch == Target::Arch::NVGPU) {
+#ifdef CINN_WITH_CUDA
+    CUDA_CALL(cudaMemcpy(data, tdata, size, cudaMemcpyDeviceToHost));
+#else
+    CINN_NOT_IMPLEMENTED
+#endif
+  } else if (context_->target.arch == Target::Arch::X86) {
+    memcpy(data, tdata, size);
+  } else {
+    CINN_NOT_IMPLEMENTED
+  }
+}
+
+void CinnComputation::GetTensorData(const std::string &tname,
+                                    void *data,
+                                    size_t size) {
+  hlir::framework::Tensor t = GetTensor(tname);
+  GetTensorData(t, data, size);
+}
+
+std::vector<hlir::framework::Tensor> CinnComputation::GetInputTensors() {
+  return context_->inputs;
+}
+
+std::vector<hlir::framework::Tensor> CinnComputation::GetOutputTensors() {
+  return context_->outputs;
+}
+
+hlir::framework::Tensor CinnComputation::GetTensor(const std::string &tname) {
+  if (context_->scope->FindVar(tname)) {
+    return context_->scope->GetTensor(tname);
+  }
+  auto it = context_->varmap_paddle2program.find(tname);
+  if (it == context_->varmap_paddle2program.end()) {
+    LOG(FATAL) << "No variable called [" << tname
+               << "] found in computation\nThe existing vars: "
+               << utils::Join(context_->scope->var_names(), ", ");
+  }
+  return context_->scope->GetTensor(it->second);
+}
+
+void CinnComputation::Execute(
+    const std::map<std::string, cinn_pod_value_t> *name2podargs) {
+  context_->program->Execute(name2podargs, context_->stream);
+}
+
+}  // namespace frontend
+}  // namespace cinn
--- a/paddle/cinn/frontend/computation.h
+++ b/paddle/cinn/frontend/computation.h
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <iostream>
+
+#include "paddle/cinn/frontend/net_builder.h"
+#include "paddle/cinn/frontend/syntax.h"
+#include "paddle/cinn/hlir/framework/graph_compiler.h"
+#include "paddle/cinn/hlir/framework/tensor.h"
+
+namespace cinn {
+namespace frontend {
+
+struct ComputationContext;
+
+class CinnComputation {
+ public:
+  struct CompileOptions
+      : public hlir::framework::GraphCompiler::CompileOptions {
+    bool use_decomposer = false;
+    bool do_prerun = true;
+    bool use_default_passes = true;
+    std::vector<std::string> passes;
+  };
+
+  inline static CompileOptions DefaultCompileOptions() {
+    CompileOptions options;
+    options.with_instantiate_variables = true;
+    options.use_decomposer = false;
+    options.passes = {};
+    options.do_prerun = true;
+    options.use_default_passes = true;
+    return options;
+  }
+
+  /**
+   * build program from NetBuilder, then compile it. NetBuilder is normally
+   * NetBuilder or CINNBuilder.
+   * @param target the target to run the program
+   * @param builder program builder (NetBuilder or CINNBuilder)
+   * @param options CompileOptions, config the compilation steps
+   * @param outputs program output variables, if outputs is empty, then the
+   * output variable of the last instruction of the program is used
+   * @param stream CUDA stream, the value is meaningful only when target is
+   * NVGPU
+   * @return shared_ptr pointing to CinnComputation instance
+   */
+  static std::shared_ptr<CinnComputation> BuildAndCompile(
+      const Target &target,
+      NetBuilder &builder,  // NOLINT
+      const CompileOptions &options = DefaultCompileOptions(),
+      const std::vector<Variable> &outputs = {},
+      void *stream = nullptr);
+  /**
+   * compile the program
+   * @param target the target to run the program
+   * @param program program (usually generated by a Builder, or converted from
+   * Paddle model)
+   * @param options CompileOptions, config the compilation steps
+   * @param outputs program output variables, if outputs is empty, then the
+   * output variable of the last instruction of the program is used
+   * @param stream CUDA stream, the value is meaningful only when target is
+   * NVGpu
+   * @return shared_ptr pointing to CinnComputation instance
+   */
+  static std::shared_ptr<CinnComputation> Compile(
+      const Target &target,
+      Program &program,  // NOLINT
+      const CompileOptions &options = DefaultCompileOptions(),
+      const std::vector<Variable> &outputs = {},
+      void *stream = nullptr);
+  /**
+   * convert a paddle model to program, then compile it.
+   * @param target the target to run the program
+   * @param model_path the path of the paddle model
+   * @param input_names input variable names of paddle model
+   * @param input_shapes input variable shapes of paddle model
+   * @param params_combined whether params are stored combined
+   * @param options CompileOptions, config the compilation steps
+   * @param stream CUDA stream, the value is meaningful only when target is
+   * NVGpu
+   * @return shared_ptr pointing to CinnComputation instance
+   */
+  static std::shared_ptr<CinnComputation> CompilePaddleModel(
+      const Target &target,
+      const std::string &model_path,
+      const std::vector<std::string> &input_names,
+      const std::vector<hlir::framework::shape_t> &input_shapes,
+      bool params_combined,
+      const CompileOptions &options = DefaultCompileOptions(),
+      void *stream = nullptr);
+
+  /**
+   * get all variable names in the program
+   */
+  std::vector<std::string> GetAllTensorNames();
+
+  /**
+   * get tensor by name
+   * @param name tensor name
+   */
+  hlir::framework::Tensor GetTensor(const std::string &name);
+
+  /**
+   * get input tensors
+   */
+  std::vector<hlir::framework::Tensor> GetInputTensors();
+
+  /**
+   * get output tensors
+   */
+  std::vector<hlir::framework::Tensor> GetOutputTensors();
+
+  /**
+   * set the data of a tensor from user specified buffer.
+   * if tensor is in NVGPU device memory, cudaMemcpy is used.
+   * @param t the tensor
+   * @param data address of the memory buffer to store tensor's data
+   * @param size size of the memory buffer
+   */
+  void SetTensorData(hlir::framework::Tensor &t,  // NOLINT
+                     void *data,
+                     size_t size);
+
+  /**
+   * set the data of a tensor (specified by it's name) from user specified
+   * buffer. if tensor is in NVGPU device memory, cudaMemcpy is used.
+   * @param tname name of the tensor
+   * @param data address of the memory buffer to store tensor's data
+   * @param size size of the memory buffer
+   */
+  void SetTensorData(const std::string &tname, void *data, size_t size);
+
+  /**
+   * copy the data of a tensor to user specified buffer.
+   * if tensor is in NVGPU device memory, cudaMemcpy is used.
+   * @param t the tensor
+   * @param data address of the memory buffer to store tensor's data
+   * @param size size of the memory buffer
+   */
+  void GetTensorData(hlir::framework::Tensor &t,  // NOLINT
+                     void *data,
+                     size_t size);
+  /**
+   * copy the data of a tensor (specified by it's name) to user specified
+   * buffer. if tensor is in NVGPU device memory, cudaMemcpy is used.
+   * @param tname name of the tensor
+   * @param data address of the memory buffer to store tensor's data
+   * @param size size of the memory buffer
+   */
+  void GetTensorData(const std::string &tname, void *data, size_t size);
+
+  /**
+   * run the compiled program
+   */
+  void Execute(
+      const std::map<std::string, cinn_pod_value_t> *name2podargs = nullptr);
+
+ private:
+  std::shared_ptr<ComputationContext> context_;
+};
+
+}  // namespace frontend
+}  // namespace cinn
--- a/paddle/cinn/frontend/computation_test.cc
+++ b/paddle/cinn/frontend/computation_test.cc
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/cinn/frontend/computation.h"
+
+#include <gtest/gtest.h>
+
+#include "paddle/cinn/common/target.h"
+#include "paddle/cinn/frontend/decomposer/use_decomposer.h"
+#include "paddle/cinn/frontend/decomposer_registry.h"
+#include "paddle/cinn/frontend/net_builder.h"
+#include "paddle/cinn/frontend/pass/use_program_pass.h"
+#include "paddle/cinn/frontend/program_pass.h"
+
+DEFINE_string(model_dir, "", "");
+
+namespace cinn {
+namespace frontend {
+
+Program CreateTestProgram() {
+  constexpr int B = 8;
+  constexpr int M = 32;
+  constexpr int N = 24;
+
+  NetBuilder builder("net_builder");
+  auto a = builder.CreateInput(Float(32), {M, N / 2}, "A");
+  auto b = builder.CreateInput(Float(32), {M, N / 2}, "B");
+  auto t = builder.Transpose(b, {1, 0});
+  auto r = builder.Reshape(t, {M, N / 2});
+  auto c = builder.Add(a, r);
+  auto x = builder.Divide(a, b);
+  auto d = builder.Concat({c, x}, 1);
+  auto e = builder.BroadcastTo(d, {B, M, N}, {1, 2});
+  auto f = builder.Concat({a, b}, 1);
+  auto g = builder.BroadcastTo(f, {B, M, N}, {1, 2});
+  auto h = builder.Subtract(e, g);
+  auto i = builder.Max(e, h);
+  auto j = builder.Min(e, h);
+  auto k = builder.Multiply(i, j);
+  auto l = builder.Constant<bool>(1, "condition");
+  auto m = builder.BroadcastTo(l, {B, M, N}, {0});
+  auto n = builder.Select(m, j, k);
+  auto o = builder.ReduceSum(n, {0, 1, 2});
+
+  auto program = builder.Build();
+  return program;
+}
+
+Program CreateAddProgram() {
+  constexpr int M = 32;
+  constexpr int N = 24;
+
+  NetBuilder builder("net_builder");
+  auto a = builder.CreateInput(Float(32), {M, N});
+  auto b = builder.CreateInput(Float(32), {M, N});
+  auto c = builder.Relu(a);
+  auto d = builder.Add(b, c);
+  auto program = builder.Build();
+
+  return program;
+}
+
+TEST(cinn_computation, basic_cpu) {
+  NetBuilder builder("basic");
+  constexpr int M = 32;
+  constexpr int N = 24;
+
+  auto a = builder.CreateInput(Float(32), {M, N}, "A");
+  auto b = builder.CreateInput(Float(32), {M, N}, "B");
+  auto c = builder.Add(a, b);
+  auto d = builder.Add(a, c);
+
+  auto target = common::DefaultHostTarget();
+  auto comp = CinnComputation::BuildAndCompile(target, builder);
+  std::vector<float> hostA(M * N);
+  std::vector<float> hostB(M * N);
+  std::vector<float> hostD(M * N);
+  std::vector<float> hostD_expected(M * N);
+  for (int i = 0; i < M * N; i++) {
+    hostA[i] = static_cast<float>(rand()) / INT_MAX;  // NOLINT
+    hostB[i] = static_cast<float>(rand()) / INT_MAX;  // NOLINT
+    hostD_expected[i] = hostA[i] * 2 + hostB[i];
+  }
+
+  comp->SetTensorData("A",
+                      reinterpret_cast<void *>(hostA.data()),
+                      hostA.size() * sizeof(float));
+  comp->SetTensorData("B",
+                      reinterpret_cast<void *>(hostB.data()),
+                      hostB.size() * sizeof(float));
+  comp->Execute();
+  comp->GetTensorData(d->id,
+                      reinterpret_cast<void *>(hostD.data()),
+                      hostD.size() * sizeof(float));
+  for (int i = 0; i < hostD.size(); i++) {
+    ASSERT_NEAR(hostD[i], hostD_expected[i], 1e-5);
+  }
+}
+
+#ifdef CINN_WITH_CUDA
+TEST(cinn_computation, basic_gpu) {
+  NetBuilder builder("basic");
+  constexpr int M = 32;
+  constexpr int N = 24;
+
+  auto a = builder.CreateInput(Float(32), {M, N}, "A");
+  auto b = builder.CreateInput(Float(32), {M, N}, "B");
+  auto c = builder.Add(a, b);
+  auto d = builder.Add(a, c);
+
+  auto target = common::DefaultNVGPUTarget();
+  auto comp = CinnComputation::BuildAndCompile(target, builder);
+  std::vector<float> hostA(M * N);
+  std::vector<float> hostB(M * N);
+  std::vector<float> hostD(M * N);
+  std::vector<float> hostD_expected(M * N);
+  for (int i = 0; i < M * N; i++) {
+    hostA[i] = static_cast<float>(rand()) / INT_MAX;  // NOLINT
+    hostB[i] = static_cast<float>(rand()) / INT_MAX;  // NOLINT
+    hostD_expected[i] = hostA[i] * 2 + hostB[i];
+  }
+
+  comp->SetTensorData("A",
+                      reinterpret_cast<void *>(hostA.data()),
+                      hostA.size() * sizeof(float));
+  comp->SetTensorData("B",
+                      reinterpret_cast<void *>(hostB.data()),
+                      hostB.size() * sizeof(float));
+  comp->Execute();
+  comp->GetTensorData(d->id,
+                      reinterpret_cast<void *>(hostD.data()),
+                      hostD.size() * sizeof(float));
+  for (int i = 0; i < hostD.size(); i++) {
+    ASSERT_NEAR(hostD[i], hostD_expected[i], 1e-5);
+  }
+}
+#endif
+
+TEST(cinn_computation, net_builder_cpu) {
+  auto program = CreateTestProgram();
+  auto target = common::DefaultHostTarget();
+  auto compute = CinnComputation::Compile(target, program);
+  auto inputs = compute->GetInputTensors();
+  ASSERT_EQ(inputs.size(), 2);
+  auto tensorA = inputs[0];
+  auto tensorB = inputs[1];
+  ASSERT_EQ(tensorA->shape().numel(), 32 * 24 / 2);
+  ASSERT_EQ(tensorB->shape().numel(), 32 * 24 / 2);
+
+  auto outputs = compute->GetOutputTensors();
+  ASSERT_EQ(outputs.size(), 1);
+  auto tensorOut = outputs[0];
+
+  auto load_input = [=](hlir::framework::Tensor t) {
+    float *ptr = t->mutable_data<float>(target);
+    for (int i = 0; i < t->shape().numel(); i++) {
+      ptr[i] = static_cast<float>(rand()) / INT_MAX;  // NOLINT
+    }
+  };
+
+  // run inference for 10 times
+  for (int i = 0; i < 10; i++) {
+    // load data directly to tensor's host memory
+    load_input(tensorA);
+    load_input(tensorB);
+    // execute engine
+    compute->Execute();
+    // get outputs (ignored)
+  }
+}
+
+#ifdef CINN_WITH_CUDA
+TEST(cinn_computation, net_builder_gpu) {
+  auto program = CreateTestProgram();
+  auto target = common::DefaultNVGPUTarget();
+  auto compute = CinnComputation::Compile(target, program);
+  auto inputs = compute->GetInputTensors();
+  ASSERT_EQ(inputs.size(), 2);
+  auto tensorA = inputs[0];
+  auto tensorB = inputs[1];
+  ASSERT_EQ(tensorA->shape().numel(), 32 * 24 / 2);
+  ASSERT_EQ(tensorB->shape().numel(), 32 * 24 / 2);
+  auto outputs = compute->GetOutputTensors();
+  ASSERT_EQ(outputs.size(), 1);
+  auto tensorOut = outputs[0];
+
+  // run inference for 10 times
+  for (int i = 0; i < 10; i++) {
+    // load data directly to tensor's host memory
+    // assume tensorA is generated in GPU directly
+    float *device_ptrA = tensorOut->mutable_data<float>(target);
+    // ... generated data directly in device memory via gpu kernels
+    // ... or async copy to device memory
+    // ... not showed here
+
+    // assume tensorB is generated in host memory, needs copy to GPU memory
+    // (sync.)
+    std::vector<float> hostB(32 * 24 / 2);
+    compute->SetTensorData(tensorB,
+                           reinterpret_cast<void *>(hostB.data()),
+                           hostB.size() * sizeof(float));
+
+    // execute engine
+    compute->Execute();
+    // get outputs
+    std::vector<float> hostOut(tensorOut->shape().numel());
+    compute->GetTensorData(tensorOut,
+                           reinterpret_cast<void *>(hostOut.data()),
+                           hostOut.size() * sizeof(float));
+  }
+}
+#endif
+
+TEST(cinn_computation, fc_execute_cpu) {
+  auto target = common::DefaultHostTarget();
+  ASSERT_NE(FLAGS_model_dir, "");
+  auto compute = CinnComputation::CompilePaddleModel(
+      target, FLAGS_model_dir, {"A"}, {{1, 30}}, false);
+  auto inputs = compute->GetInputTensors();
+  ASSERT_EQ(inputs.size(), 1);
+  auto A = inputs[0];
+  ASSERT_EQ(A->shape().numel(), 1 * 30);
+  float *ptrA = A->mutable_data<float>(target);
+  for (int i = 0; i < 30; i++)
+    ptrA[i] = static_cast<float>(rand()) / INT_MAX;  // NOLINT
+  for (int i = 0; i < 30; i++) ptrA[i] = static_cast<float>(0);
+  compute->Execute();
+}
+
+#ifdef CINN_WITH_CUDA
+TEST(cinn_computation, fc_execute_gpu) {
+  auto target = common::DefaultNVGPUTarget();
+  ASSERT_NE(FLAGS_model_dir, "");
+  auto compute = CinnComputation::CompilePaddleModel(
+      target, FLAGS_model_dir, {"A"}, {{1, 30}}, false);
+
+  auto inputs = compute->GetInputTensors();
+  ASSERT_EQ(inputs.size(), 1);
+  auto A = inputs[0];
+  ASSERT_EQ(A->shape().numel(), 1 * 30);
+  auto outputs = compute->GetOutputTensors();
+  ASSERT_EQ(outputs.size(), 1);
+  auto out = outputs[0];
+
+  std::vector<float> hostA(30);
+  for (float &v : hostA) v = static_cast<float>(rand()) / INT_MAX;  // NOLINT
+  compute->SetTensorData(
+      A, reinterpret_cast<void *>(hostA.data()), hostA.size() * sizeof(float));
+
+  compute->Execute();
+
+  std::vector<float> hostOut(30);
+  compute->GetTensorData(out,
+                         reinterpret_cast<void *>(hostOut.data()),
+                         hostOut.size() * sizeof(float));
+}
+#endif
+
+TEST(cinn_computation, decomposer_cpu) {
+  // this test only shows the API usage
+  ASSERT_NE(cinn::frontend::ProgramPassRegistry::Global()->Find("Decomposer"),
+            nullptr);
+  // without decomposer
+  {
+    auto prog = CreateAddProgram();
+    auto target = common::DefaultHostTarget();
+    auto options = CinnComputation::DefaultCompileOptions();
+    options.use_decomposer = false;
+    auto compute = CinnComputation::Compile(target, prog, options);
+    auto names = compute->GetAllTensorNames();
+    ASSERT_EQ(names.size(), 3);
+  }
+  // with decomposer
+  {
+    auto prog = CreateAddProgram();
+    auto target = common::DefaultHostTarget();
+    auto options = CinnComputation::DefaultCompileOptions();
+    options.use_decomposer = true;
+    auto compute = CinnComputation::Compile(target, prog, options);
+    auto names = compute->GetAllTensorNames();
+  }
+}
+
+#ifdef CINN_WITH_CUDA
+TEST(cinn_computation, gpu_stream) {
+  // this test only shows the API usage
+  auto target = common::DefaultNVGPUTarget();
+  auto prog = CreateAddProgram();
+  auto options = CinnComputation::DefaultCompileOptions();
+
+  cudaStream_t streams[1];
+  cudaStreamCreate(&streams[0]);
+  auto compute = CinnComputation::Compile(
+      target, prog, options, {}, static_cast<void *>(streams[0]));
+  compute->Execute();
+}
+#endif
+
+TEST(cinn_computation, without_instantiate_variables) {
+  // this test only shows the API usage
+  auto target = common::DefaultHostTarget();
+  auto prog = CreateAddProgram();
+  auto options = CinnComputation::DefaultCompileOptions();
+  options.with_instantiate_variables = false;
+
+  auto compute = CinnComputation::Compile(target, prog, options);
+  auto names = compute->GetAllTensorNames();
+
+  std::map<std::string, cinn_pod_value_t> pod2args;
+  // compute->Execute(&pod2args);
+}
+
+}  // namespace frontend
+}  // namespace cinn
--- a/paddle/cinn/frontend/decomposer/CMakeLists.txt
+++ b/paddle/cinn/frontend/decomposer/CMakeLists.txt
+core_gather_headers()
+
+gather_srcs(
+  cinnapi_src
+  SRCS
+  activation.cc
+  elementwise.cc
+  broadcast.cc
+  batch_norm.cc
+  top_k.cc)
+
+cinn_cc_library(decomposer_test_helper SRCS test_helper.cc DEPS cinncore)
+
+if(WITH_CUDA)
+  cinn_cc_test(test_activation_decomposer SRCS activation_test.cc DEPS cinncore
+               decomposer_test_helper)
+  cinn_cc_test(test_elementwise_decomposer SRCS elementwise_test.cc DEPS
+               cinncore decomposer_test_helper)
+  cinn_cc_test(test_broadcast_decomposer SRCS broadcast_test.cc DEPS cinncore
+               decomposer_test_helper)
+  cinn_cc_test(test_batch_norm_decomposer SRCS batch_norm_test.cc DEPS cinncore
+               decomposer_test_helper)
+  cinn_cc_test(test_top_k_decomposer SRCS top_k_test.cc DEPS cinncore
+               decomposer_test_helper)
+endif()
--- a/paddle/cinn/frontend/decomposer/activation.cc
+++ b/paddle/cinn/frontend/decomposer/activation.cc
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/cinn/frontend/decomposer_registry.h"
+#include "paddle/cinn/frontend/syntax.h"
+
+namespace cinn {
+namespace frontend {
+namespace decomposer {
+
+void relu(const Instruction& instr, const DecomposerContext& context) {
+  CHECK_EQ(instr->inputs.size(), 1UL)
+      << " 1 input tensor for " << instr->op_type;
+  CHECK_EQ(instr->outputs.size(), 1UL)
+      << "1 output tensor for " << instr->op_type;
+  auto x = instr->inputs[0];
+  auto output = instr->outputs[0];
+  auto* builder = context.builder();
+
+  auto bcast_zero = builder->FillConstant(
+      x->shape, 0.0f, common::UniqName("zero"), common::Type2Str(x->type));
+  auto out = builder->Max(x, bcast_zero);
+
+  // map the the output of decomposed operator to the original.
+  context.MapOutToOrigin(out, output);
+}
+
+void relu_grad(const Instruction& instr, const DecomposerContext& context) {
+  CHECK_EQ(instr->inputs.size(), 2UL)
+      << " 2 input tensors for " << instr->op_type;
+  CHECK_EQ(instr->outputs.size(), 1UL)
+      << "1 output tensor for " << instr->op_type;
+  auto dout = instr->inputs[0];
+  auto out = instr->inputs[1];
+  auto dx = instr->outputs[0];
+  auto* builder = context.builder();
+
+  auto bcast_zero = builder->FillConstant(
+      out->shape, 0.0f, common::UniqName("zero"), common::Type2Str(out->type));
+  auto condition = builder->GreaterThan(out, bcast_zero);
+  auto res = builder->Select(condition, dout, bcast_zero);
+
+  // map the the output of decomposed operator to the original.
+  context.MapOutToOrigin(res, dx);
+}
+
+void gelu(const Instruction& instr, const DecomposerContext& context) {
+  CHECK_EQ(instr->inputs.size(), 1UL)
+      << " 1 input tensor for " << instr->op_type;
+  CHECK_EQ(instr->outputs.size(), 1UL)
+      << "1 output tensor for " << instr->op_type;
+  auto x = instr->inputs[0];
+  auto output = instr->outputs[0];
+  auto* builder = context.builder();
+
+  // x * (0.5 + 0.5 * erf(sqrtf(0.5) * x))
+  auto p_5 = builder->FillConstant(
+      x->shape, 0.5f, common::UniqName("p_5"), common::Type2Str(x->type));
+  auto p_7 = builder->FillConstant(x->shape,
+                                   std::sqrt(0.5),
+                                   common::UniqName("p_7"),
+                                   common::Type2Str(x->type));
+  auto erf = builder->Erf(builder->Multiply(x, p_7));
+  auto cdf = builder->Add(p_5, builder->Multiply(p_5, erf));
+  auto out = builder->Multiply(x, cdf);
+
+  // map the the output of decomposed operator to the original.
+  context.MapOutToOrigin(out, output);
+}
+
+void softmax(const Instruction& instr, const DecomposerContext& context) {
+  CHECK_EQ(instr->inputs.size(), 1UL)
+      << " 1 input tensor for " << instr->op_type;
+  CHECK_EQ(instr->outputs.size(), 1UL)
+      << "1 output tensor for " << instr->op_type;
+  auto x = instr->inputs[0];
+  auto output = instr->outputs[0];
+  auto* builder = context.builder();
+
+  std::vector<int> b_axes;
+  auto axes = instr.GetAttrs<std::vector<int>>("axes");
+  CHECK(axes.size());
+  for (auto& axis : axes) {
+    if (axis < 0) {
+      axis += x->shape.size();
+    }
+  }
+  for (int idx = 0; idx < x->shape.size(); ++idx) {
+    if (std::find(axes.begin(), axes.end(), idx) == axes.end()) {
+      b_axes.push_back(idx);
+    }
+  }
+
+  // When the rank of x is 1, broadcast axes will be empty, so we need to insert
+  // last dim as broadcast axis.
+  if (b_axes.empty()) {
+    b_axes.emplace_back(-1);
+  }
+
+  auto mode = instr.GetAttrs<std::string>("mode");
+  if (mode == "fast") {
+    // x_sum = sum(exp(x))
+    auto x_sum = builder->BroadcastTo(
+        builder->ReduceSum(builder->Exp(x), axes), x->shape, b_axes);
+    // x_exp / x_sum
+    auto out = builder->Divide(builder->Exp(x), x_sum);
+
+    // map the the output of decomposed operator to the original.
+    context.MapOutToOrigin(out, output);
+  } else {
+    // x = max(x)
+    auto x_max =
+        builder->BroadcastTo(builder->ReduceMax(x, axes), x->shape, b_axes);
+    // x_exp = exp(x - x_max)
+    auto x_exp = builder->Exp(builder->Subtract(x, x_max));
+    // x_sum = sum(x_exp)
+    auto x_sum =
+        builder->BroadcastTo(builder->ReduceSum(x_exp, axes), x->shape, b_axes);
+    // x_exp / x_sum
+    auto out =
+        builder->Divide(builder->Exp(builder->Subtract(x, x_max)), x_sum);
+
+    // map the the output of decomposed operator to the original.
+    context.MapOutToOrigin(out, output);
+  }
+}
+
+}  // namespace decomposer
+}  // namespace frontend
+}  // namespace cinn
+
+CINN_REGISTER_HELPER(relu_decomposers) {
+  CINN_DECOMPOSER_REGISTER(relu, cinn::frontend::decomposer::relu);
+
+  return true;
+}
+
+CINN_REGISTER_HELPER(relu_grad_decomposers) {
+  CINN_DECOMPOSER_REGISTER(relu_grad, cinn::frontend::decomposer::relu_grad);
+
+  return true;
+}
+
+CINN_REGISTER_HELPER(gelu_decomposers) {
+  CINN_DECOMPOSER_REGISTER(gelu, cinn::frontend::decomposer::gelu);
+
+  return true;
+}
+
+CINN_REGISTER_HELPER(softmax_decomposers) {
+  CINN_DECOMPOSER_REGISTER(softmax, cinn::frontend::decomposer::softmax);
+
+  return true;
+}
--- a/paddle/cinn/frontend/decomposer/activation_test.cc
+++ b/paddle/cinn/frontend/decomposer/activation_test.cc
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gtest/gtest.h>
+
+#include "paddle/cinn/frontend/decomposer/test_helper.h"
+
+namespace cinn::frontend {
+
+TEST(Decomposer, relu) {
+  NetBuilder builder("relu");
+  auto x = builder.CreateInput(Float(32), {20, 10}, "x");
+  auto out = builder.Relu(x);
+
+  auto relu_cpu = [](const std::vector<size_t>& lengths,
+                     const std::vector<void*>& ptrs) {
+    size_t n = lengths[0];
+    float* x = static_cast<float*>(ptrs[0]);
+    float* out = static_cast<float*>(ptrs[1]);
+    for (size_t i = 0; i < n; ++i) {
+      float tmp_0 = x[i];
+      out[i] = tmp_0 > 0 ? tmp_0 : 0;
+    }
+  };
+
+  std::vector<std::string> input_names = {x.id().data()};
+  std::vector<std::string> output_names = {out->id};
+  std::vector<std::vector<int>> output_shapes = {{20, 10}};
+  RunAndCheck<float>(
+      &builder, input_names, output_names, output_shapes, relu_cpu, -1, 1);
+}
+
+TEST(Decomposer, relu_grad) {
+  NetBuilder builder("relu_grad");
+  auto dout = builder.CreateInput(Float(32), {20, 10}, "dout");
+  auto out = builder.CreateInput(Float(32), {20, 10}, "out");
+  auto dx = builder.ReluGrad(dout, out);
+
+  auto relu_grad_cpu = [](const std::vector<size_t>& lengths,
+                          const std::vector<void*>& ptrs) {
+    size_t n = lengths[0];
+    float* dout = static_cast<float*>(ptrs[0]);
+    float* out = static_cast<float*>(ptrs[1]);
+    float* dx = static_cast<float*>(ptrs[2]);
+    for (size_t i = 0; i < n; ++i) {
+      dx[i] = out[i] > 0 ? dout[i] : 0;
+    }
+  };
+
+  std::vector<std::string> input_names = {dout.id().data(), out.id().data()};
+  std::vector<std::string> output_names = {dx->id};
+  std::vector<std::vector<int>> output_shapes = {{20, 10}};
+  RunAndCheck<float>(
+      &builder, input_names, output_names, output_shapes, relu_grad_cpu, -1, 1);
+}
+
+TEST(Decomposer, softmax_decomposer) {
+  int n = 16, c = 128, h = 14, w = 14;
+  std::vector<int> axes = {1, 2, 3};
+  NetBuilder net_builder("softmax_decomposer");
+  std::unordered_set<std::string> output_names;
+  {
+    auto x = net_builder.CreateInput(Float(32), {n, c, h, w}, "x");
+    auto y = net_builder.Softmax(x, axes);
+    output_names.insert(y->id);
+  }
+  auto program = net_builder.Build();
+
+  auto target = common::DefaultTarget();
+  RunDecomposer(&program, target);
+
+  auto graph =
+      std::make_shared<hlir::framework::Graph>(program, output_names, target);
+  hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
+  hlir::framework::ApplyPass(graph.get(), "FusionMergePass");
+
+  auto scope = BuildScope(target, graph);
+  hlir::framework::GraphCompiler gc(target, scope, graph);
+  auto run_program = gc.Build();
+
+  std::vector<float> x(n * c * h * w);
+  InitRandomVector<float>(&x, n * c * h * w, 0.0f, 1.0f, 1e-3);
+  std::vector<std::pair<std::string, std::vector<float>>> inputs = {{"x", x}};
+  for (auto& input : inputs) {
+    scope->Var<hlir::framework::Tensor>(input.first);
+    auto tensor = scope->GetTensor(input.first);
+    auto* data = tensor->mutable_data<float>(target);
+    CopyFromVector(input.second, tensor, target);
+  }
+  run_program->Execute();
+}
+
+}  // namespace cinn::frontend
--- a/paddle/cinn/frontend/decomposer/batch_norm.cc
+++ b/paddle/cinn/frontend/decomposer/batch_norm.cc
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/cinn/frontend/decomposer_registry.h"
+#include "paddle/cinn/frontend/syntax.h"
+
+namespace cinn {
+namespace frontend {
+namespace decomposer {
+
+struct BatchNormHelper {
+  BatchNormHelper(NetBuilder* net_builder,
+                  const std::vector<int>& arg_x_shape,
+                  const std::vector<int>& arg_param_shape,
+                  std::string data_layout,
+                  std::string bn_op_type) {
+    CHECK_EQ(arg_x_shape.size(), 4UL)
+        << "Only 4-D input tensor is supported, but get " << arg_x_shape.size()
+        << "-D input tensor.";
+
+    builder = net_builder;
+    x_shape = arg_x_shape;
+    param_shape = arg_param_shape;
+
+    if (data_layout == "NCHW") {
+      channel_dim = 1;
+      reduce_dim = {0, 2, 3};
+      element_count = x_shape[0] * x_shape[2] * x_shape[3];
+    } else if (data_layout == "NHWC") {
+      channel_dim = 3;
+      reduce_dim = {0, 1, 2};
+      element_count = x_shape[0] * x_shape[1] * x_shape[2];
+    } else {
+      LOG(FATAL) << data_layout << " setting is not support!";
+    }
+
+    num_instructions = builder->size();
+    op_type = bn_op_type;
+  }
+
+  ~BatchNormHelper() {
+    VLOG(4) << op_type << " is decomposed to "
+            << builder->size() - num_instructions << " instructions.";
+  }
+
+  std::vector<Variable> MeanAndVariance(Variable x) {
+    auto mean = Mean(x);
+    // variance = reduce_sum(x * x) / nhw - mean * mean, shape = [c], simplified
+    // by equation: E(x^2) - [E(x)]^2
+    auto variance = Variance(x, mean);
+    return {mean, variance};
+  }
+
+  std::vector<Variable> GradBiasAndScale(Variable x,
+                                         Variable x_mean,
+                                         Variable y_grad) {
+    auto mean_4d = builder->BroadcastTo(x_mean, x->shape, {channel_dim});
+    auto x_mean_diff = builder->Subtract(x, mean_4d);
+    // bias_grad = reduce_sum(y_grad), shape = [c]
+    auto bias_grad = Reduce(y_grad);
+    auto sum_of_y_grad_mul_x_mean_diff =
+        Reduce(builder->Multiply(y_grad, x_mean_diff));
+    return {bias_grad, sum_of_y_grad_mul_x_mean_diff};
+  }
+
+  // mean = reduce_sum(x) / nhw
+  Variable Mean(Variable x) {
+    auto sum = Reduce(x);
+    auto element_count_1d =
+        builder->FillConstant(sum->shape,
+                              element_count,
+                              common::UniqName("element_count"),
+                              common::Type2Str(sum->type));
+    auto mean = builder->Divide(sum, element_count_1d);
+    return mean;
+  }
+
+  // variance = reduce_sum(x * x) / nhw - mean * mean
+  Variable Variance(Variable x, Variable mean) {
+    auto x_square = builder->Multiply(x, builder->Identity(x));
+    auto x_square_sum = Reduce(x_square);
+    auto element_count_1d =
+        builder->FillConstant(x_square_sum->shape,
+                              element_count,
+                              common::UniqName("element_count"),
+                              common::Type2Str(x_square_sum->type));
+    auto x_square_mean = builder->Divide(x_square_sum, element_count_1d);
+    auto variance = builder->Subtract(
+        x_square_mean, builder->Multiply(mean, builder->Identity(mean)));
+    return variance;
+  }
+
+  // std_variance_inv = rsqrt(variance + epsilon)
+  Variable StdVarianceInv1d(Variable variance, float epsilon) {
+    auto epsilon_1d = builder->FillConstant(variance->shape,
+                                            epsilon,
+                                            common::UniqName("epsilon"),
+                                            common::Type2Str(variance->type));
+    auto std_variance_inv = builder->Rsqrt(builder->Add(variance, epsilon_1d));
+    return std_variance_inv;
+  }
+
+  // std_variance_inv = rsqrt(variance + epsilon)
+  Variable StdVarianceInv4d(Variable variance, float epsilon) {
+    auto variance_4d = builder->BroadcastTo(variance, x_shape, {channel_dim});
+    auto epsilon_4d =
+        builder->FillConstant(variance_4d->shape,
+                              epsilon,
+                              common::UniqName("epsilon"),
+                              common::Type2Str(variance_4d->type));
+    auto std_variance_inv_4d =
+        builder->Rsqrt(builder->Add(variance_4d, epsilon_4d));
+    return std_variance_inv_4d;
+  }
+
+  // moving_value = moving_value * momentum + (1.0 - momentum) * saved_value
+  // value maybe mean and variance.
+  Variable UpdateMeanVariance(Variable moving_value,
+                              Variable saved_value,
+                              float momentum) {
+    auto factor_0 = builder->FillConstant(moving_value->shape,
+                                          momentum,
+                                          common::UniqName("factor_0"),
+                                          common::Type2Str(moving_value->type));
+    auto factor_1 = builder->FillConstant(saved_value->shape,
+                                          1.0f - momentum,
+                                          common::UniqName("factor_1"),
+                                          common::Type2Str(saved_value->type));
+    auto new_moving_value =
+        builder->Add(builder->Multiply(moving_value, factor_0),
+                     builder->Multiply(saved_value, factor_1));
+    return new_moving_value;
+  }
+
+  Variable Reduce(Variable x) { return builder->ReduceSum(x, reduce_dim); }
+
+  NetBuilder* builder{nullptr};
+  std::vector<int> x_shape;
+  std::vector<int> param_shape;
+  std::vector<int> reduce_dim;
+  float element_count{0};
+  int channel_dim{0};
+  std::string op_type;
+  int num_instructions{0};
+};
+
+void batch_norm_train(const Instruction& instr,
+                      const DecomposerContext& context) {
+  CHECK_EQ(instr->inputs.size(), 5UL)
+      << "The number of the given inputs is not equal to the required for op "
+      << instr->op_type;
+  CHECK_EQ(instr->outputs.size(), 5UL)
+      << "The number of the given outputs is not equal to the required for op "
+      << instr->op_type;
+
+  auto& x = instr->inputs[0];
+  auto& scale = instr->inputs[1];
+  auto& bias = instr->inputs[2];
+  auto& moving_mean = instr->inputs[3];
+  auto& moving_variance = instr->inputs[4];
+  CHECK_EQ(scale->type, bias->type);
+  CHECK_EQ(scale->type, moving_mean->type);
+  CHECK_EQ(scale->type, moving_variance->type);
+
+  float epsilon = instr.GetAttrs<float>("epsilon");
+  float momentum = instr.GetAttrs<float>("momentum");
+  std::string layout = instr.GetAttrs<std::string>("data_layout");
+
+  NetBuilder* builder = context.builder();
+  BatchNormHelper helper(
+      builder, x->shape, scale->shape, layout, "batch_norm_train");
+
+  auto mean_variance = helper.MeanAndVariance(x);
+  auto mean = mean_variance[0];
+  auto variance = mean_variance[1];
+
+  auto mean_4d = builder->BroadcastTo(mean, x->shape, {helper.channel_dim});
+  // std_variance_inv = rsqrt(variance + epsilon), shape = [c]
+  auto std_variance_inv_4d = helper.StdVarianceInv4d(variance, epsilon);
+
+  // y = scale * (x - mean) * std_variance_inv + bias, shape = [n, c, h, w]
+  auto scale_4d = builder->BroadcastTo(scale, x->shape, {helper.channel_dim});
+  auto bias_4d = builder->BroadcastTo(bias, x->shape, {helper.channel_dim});
+  auto normalized =
+      builder->Multiply(builder->Subtract(x, mean_4d), std_variance_inv_4d);
+  auto scaled_normalized = builder->Multiply(normalized, scale_4d);
+  auto y = builder->Add(scaled_normalized, bias_4d);
+
+  // moving_mean = moving_mean * momentum + (1.0 - momentum) * mean, shape = [c]
+  auto new_moving_mean = helper.UpdateMeanVariance(moving_mean, mean, momentum);
+
+  // moving_variance = moving_variance * momentum + (1.0 - momentum) * variance,
+  // shape = [c]
+  auto new_moving_variance =
+      helper.UpdateMeanVariance(moving_variance, variance, momentum);
+
+  context.MapOutToOrigin(y, instr->outputs[0]);
+  context.MapOutToOrigin(mean, instr->outputs[1]);
+  context.MapOutToOrigin(variance, instr->outputs[2]);
+  context.MapOutToOrigin(new_moving_mean, instr->outputs[3]);
+  context.MapOutToOrigin(new_moving_variance, instr->outputs[4]);
+}
+
+void batch_norm_grad(const Instruction& instr,
+                     const DecomposerContext& context) {
+  CHECK_EQ(instr->inputs.size(), 5UL)
+      << " The number of the given inputs is not equal to the required "
+      << instr->op_type;
+  CHECK_EQ(instr->outputs.size(), 3UL)
+      << " The number of the given outputs is not equal to the required"
+      << instr->op_type;
+
+  auto& y_grad = instr->inputs[0];
+  auto& x = instr->inputs[1];
+  auto& scale = instr->inputs[2];
+  auto& save_mean = instr->inputs[3];
+  auto& save_variance = instr->inputs[4];
+  CHECK_EQ(y_grad->type, x->type);
+  CHECK_EQ(scale->type, save_mean->type);
+  CHECK_EQ(scale->type, save_variance->type);
+
+  auto epsilon = instr.GetAttrs<float>("epsilon");
+  auto layout = instr.GetAttrs<std::string>("data_layout");
+
+  NetBuilder* builder = context.builder();
+  BatchNormHelper helper(
+      builder, x->shape, scale->shape, layout, "batch_norm_grad");
+
+  auto vars = helper.GradBiasAndScale(x, save_mean, y_grad);
+  auto bias_grad = vars[0];
+  auto sum_of_y_grad_mul_x_mean_diff = vars[1];
+
+  // scale_grad = reduce_sum(y_grad * (x - mean)) * rsqrt(variance + epsilon),
+  // shape = [c]
+  auto scale_grad =
+      builder->Multiply(sum_of_y_grad_mul_x_mean_diff,
+                        helper.StdVarianceInv1d(save_variance, epsilon));
+
+  // x_grad = 1/nhw * scale * rsqrt(variance + epsilon) *
+  //   (nhw * y_grad - reduce_sum(y_grad) - (x - mean) * reduce_sum(y_grad * (x
+  //   - mean)) / (variance + epsilon))
+  // => x_grad = tmp0 * (tmp1 - tmp2 - tmp3)
+  auto scaled_std_variance_inv =
+      builder->Multiply(scale, helper.StdVarianceInv1d(save_variance, epsilon));
+  auto element_count_1d =
+      builder->FillConstant(scaled_std_variance_inv->shape,
+                            helper.element_count,
+                            common::UniqName("element_count_1d"),
+                            common::Type2Str(scaled_std_variance_inv->type));
+  auto tmp0 = builder->BroadcastTo(
+      builder->Divide(scaled_std_variance_inv, element_count_1d),
+      x->shape,
+      {helper.channel_dim});
+
+  auto element_count_4d =
+      builder->FillConstant(y_grad->shape,
+                            helper.element_count,
+                            common::UniqName("element_count_4d"),
+                            common::Type2Str(y_grad->type));
+  auto tmp1 = builder->Multiply(y_grad, element_count_4d);
+
+  auto tmp2 = builder->BroadcastTo(bias_grad, x->shape, {helper.channel_dim});
+
+  auto mean_4d =
+      builder->BroadcastTo(save_mean, x->shape, {helper.channel_dim});
+  auto x_mean_diff = builder->Subtract(x, mean_4d);
+
+  auto sum_of_y_grad_mul_x_mean_diff_4d = builder->BroadcastTo(
+      sum_of_y_grad_mul_x_mean_diff, x->shape, {helper.channel_dim});
+  auto tmp3_0 =
+      builder->Multiply(x_mean_diff, sum_of_y_grad_mul_x_mean_diff_4d);
+  auto epsilon_1d =
+      builder->FillConstant(save_variance->shape,
+                            epsilon,
+                            common::UniqName("epsilon"),
+                            common::Type2Str(save_variance->type));
+  auto variance_add_eps = builder->Add(save_variance, epsilon_1d);
+  auto variance_add_eps_4d =
+      builder->BroadcastTo(variance_add_eps, x->shape, {helper.channel_dim});
+  auto tmp3 = builder->Divide(tmp3_0, variance_add_eps_4d);
+
+  auto x_grad = builder->Multiply(
+      tmp0, builder->Subtract(builder->Subtract(tmp1, tmp2), tmp3));
+
+  context.MapOutToOrigin(x_grad, instr->outputs[0]);
+  context.MapOutToOrigin(scale_grad, instr->outputs[1]);
+  context.MapOutToOrigin(bias_grad, instr->outputs[2]);
+}
+
+void batch_norm(const Instruction& instr, const DecomposerContext& context) {
+  CHECK_EQ(instr->inputs.size(), 5UL)
+      << "The number of the given inputs is not equal to the required for op "
+      << instr->op_type;
+  CHECK_EQ(instr->outputs.size(), 1UL)
+      << "The number of the given outputs is not equal to the required for op "
+      << instr->op_type;
+
+  auto& x = instr->inputs[0];
+  auto& scale = instr->inputs[1];
+  auto& bias = instr->inputs[2];
+  auto& moving_mean = instr->inputs[3];
+  auto& moving_variance = instr->inputs[4];
+  CHECK_EQ(scale->type, bias->type);
+  CHECK_EQ(scale->type, moving_mean->type);
+  CHECK_EQ(scale->type, moving_variance->type);
+
+  float epsilon = instr.GetAttrs<float>("epsilon");
+  float momentum = instr.GetAttrs<float>("momentum");
+  std::string layout = instr.GetAttrs<std::string>("data_layout");
+
+  NetBuilder* builder = context.builder();
+  BatchNormHelper helper(builder, x->shape, scale->shape, layout, "batch_norm");
+
+  auto mean_4d =
+      builder->BroadcastTo(moving_mean, x->shape, {helper.channel_dim});
+  // std_variance_inv = rsqrt(variance + epsilon), shape = [c]
+  auto std_variance_inv_4d = helper.StdVarianceInv4d(moving_variance, epsilon);
+
+  // y = scale * (x - mean) * std_variance_inv + bias, shape = [n, c, h, w]
+  auto scale_4d = builder->BroadcastTo(scale, x->shape, {helper.channel_dim});
+  auto bias_4d = builder->BroadcastTo(bias, x->shape, {helper.channel_dim});
+  auto normalized =
+      builder->Multiply(builder->Subtract(x, mean_4d), std_variance_inv_4d);
+  auto scaled_normalized = builder->Multiply(normalized, scale_4d);
+  auto y = builder->Add(scaled_normalized, bias_4d);
+
+  context.MapOutToOrigin(y, instr->outputs[0]);
+}
+
+}  // namespace decomposer
+}  // namespace frontend
+}  // namespace cinn
+
+CINN_REGISTER_HELPER(batch_norm_decomposer) {
+  CINN_DECOMPOSER_REGISTER(batch_norm, cinn::frontend::decomposer::batch_norm);
+
+  return true;
+}
+
+CINN_REGISTER_HELPER(batch_norm_train_decomposer) {
+  CINN_DECOMPOSER_REGISTER(batch_norm_train,
+                           cinn::frontend::decomposer::batch_norm_train);
+
+  return true;
+}
+
+CINN_REGISTER_HELPER(batch_norm_grad_decomposer) {
+  CINN_DECOMPOSER_REGISTER(batch_norm_grad,
+                           cinn::frontend::decomposer::batch_norm_grad);
+
+  return true;
+}
--- a/paddle/cinn/frontend/decomposer/batch_norm_test.cc
+++ b/paddle/cinn/frontend/decomposer/batch_norm_test.cc
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gtest/gtest.h>
+
+#include "paddle/cinn/frontend/decomposer/test_helper.h"
+
+namespace cinn {
+namespace frontend {
+namespace {
+
+struct Offset {
+  int n;
+  int c;
+  int h;
+  int w;
+
+  Offset(int arg_n, int arg_c, int arg_h, int arg_w)
+      : n(arg_n), c(arg_c), h(arg_h), w(arg_w) {}
+
+  int operator()(int idx_n, int idx_c, int idx_h, int idx_w) const {
+    return idx_n * c * h * w + idx_c * h * w + idx_h * w + idx_w;
+  }
+};
+
+template <typename FuncType>
+void Loop(FuncType func, const int n, const int c, const int h, const int w) {
+  for (int in = 0; in < n; ++in) {
+    for (int ic = 0; ic < c; ++ic) {
+      for (int ih = 0; ih < h; ++ih) {
+        for (int iw = 0; iw < w; ++iw) {
+          func(in, ic, ih, iw);
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+void ComputeBatchNormTrainRef(const std::vector<T>& x,
+                              const std::vector<T>& scale,
+                              const std::vector<T>& bias,
+                              const std::vector<T>& moving_mean,
+                              const std::vector<T>& moving_variance,
+                              const int n,
+                              const int c,
+                              const int h,
+                              const int w,
+                              std::vector<T>* y,
+                              std::vector<T>* saved_mean,
+                              std::vector<T>* saved_variance,
+                              std::vector<T>* new_moving_mean,
+                              std::vector<T>* new_moving_variance,
+                              const float epsilon,
+                              const float momentum) {
+  Offset offset(n, c, h, w);
+
+  // sum
+  memset(saved_mean->data(), 0, sizeof(T) * c);
+  auto func_sum_x = [=](int in, int ic, int ih, int iw) {
+    saved_mean->at(ic) += x[offset(in, ic, ih, iw)];
+  };
+  Loop(func_sum_x, n, c, h, w);
+
+  // saved mean
+  float element_count = static_cast<float>(n * h * w);
+  for (int ic = 0; ic < c; ++ic) {
+    // Checking result of saved_mean:
+    // output[saved_mean], var_name=var_5, shape={32}
+    // - Total 0 different results, offset=0, 0.00527001 vs 0.00527001,
+    // maximum_relative_diff=0(absolute_diff=0)
+    saved_mean->at(ic) /= element_count;
+  }
+
+  // square_sum
+  std::vector<float> x_square_mean(c, 0);
+  auto func_sum_square_x = [&](int in, int ic, int ih, int iw) {
+    x_square_mean.at(ic) +=
+        x[offset(in, ic, ih, iw)] * x[offset(in, ic, ih, iw)];
+  };
+  Loop(func_sum_square_x, n, c, h, w);
+
+  for (int ic = 0; ic < c; ++ic) {
+    x_square_mean[ic] /= element_count;
+  }
+
+  // saved variance, according to equation: E(x^2) - [E(x)]^2
+  std::vector<float> std_variance(c);
+  for (int ic = 0; ic < c; ++ic) {
+    // Checking results of saved_variance and std_variance:
+    // output[saved_variance], var_name=var_6, shape={32}
+    // - Total 0 different results, offset=0, 0.336347 vs 0.336347,
+    // maximum_relative_diff=0(absolute_diff=0) output[std_variance],
+    // var_name=std_variance, shape={32}
+    // - Total 0 different results, offset=0, 0.579963 vs 0.579963,
+    // maximum_relative_diff=0(absolute_diff=0)
+    saved_variance->at(ic) =
+        x_square_mean[ic] - (saved_mean->at(ic) * saved_mean->at(ic));
+    std_variance[ic] = sqrt(saved_variance->at(ic) + epsilon);
+  }
+
+  // compute output
+  std::vector<float> y_nobias(n * c * h * w);
+  auto func_y_nobias = [&](int in, int ic, int ih, int iw) {
+    int idx = offset(in, ic, ih, iw);
+    // Checking result of y_nobias:
+    // output[y_nobias], var_name=y_nobias, shape={16, 32, 16, 16}
+    // - Total 0 different results, offset=32104, -0.000488288 vs -0.000488288,
+    // maximum_relative_diff=1.19208e-07(absolute_diff=5.82077e-11)
+    y_nobias[idx] =
+        (x[idx] - saved_mean->at(ic)) * scale[ic] / std_variance[ic];
+  };
+  Loop(func_y_nobias, n, c, h, w);
+
+  auto func_y = [&](int in, int ic, int ih, int iw) {
+    int idx = offset(in, ic, ih, iw);
+    // Checking result of y:
+    // output[y], var_name=var_4, shape={16, 32, 16, 16}
+    // - Total 80 different results, offset=126409, 1.81794e-06 vs 1.80304e-06,
+    // maximum_relative_diff=0.00826446(absolute_diff=1.49012e-08) For the
+    // following case:
+    //   idx=126409, y[idx]=1.80304e-06, y_nobias[idx]=0.2033332,
+    //   bias[ic]=-0.2033314
+    // The computing result of CPU and GPU may have some difference, like
+    //   i=126409, 1.8179417e-06 vs 1.8030405e-06, relative_diff=0.0082644625,
+    //   absolute_diff=1.4901161e-08
+    // This case is considered reasonable.
+    y->at(idx) = y_nobias[idx] + bias[ic];
+  };
+  Loop(func_y, n, c, h, w);
+
+  // new moving running and variance
+  float factor_0 = momentum;
+  float factor_1 = static_cast<float>(1.0f - momentum);
+  for (int ic = 0; ic < c; ++ic) {
+    // Checking result of new_moving_mean and new_moving_variance:
+    // output[new_moving_mean], var_name=var_7, shape={32}
+    // - Total 0 different results, offset=9, 0.00123065 vs 0.00123065,
+    // maximum_relative_diff=9.45967e-08(absolute_diff=1.16415e-10)
+    // output[new_moving_variance], var_name=var_8, shape={32}
+    // - Total 0 different results, offset=16, -0.00140787 vs -0.00140787,
+    // maximum_relative_diff=5.29211e-06(absolute_diff=7.45058e-09)
+    new_moving_mean->at(ic) =
+        moving_mean[ic] * factor_0 + saved_mean->at(ic) * factor_1;
+    new_moving_variance->at(ic) =
+        moving_variance[ic] * factor_0 + saved_variance->at(ic) * factor_1;
+  }
+}
+
+TEST(Decomposer, BatchNormTrain) {
+  int n = 16, c = 128, h = 14, w = 14;
+  float epsilon = 1e-5;
+  float momentum = 0.9f;
+  std::string data_layout = "NCHW";
+  bool is_test = false;
+  NetBuilder net_builder("batch_norm_train");
+  std::vector<std::string> output_names;
+  {
+    auto x = net_builder.CreateInput(Float(32), {n, c, h, w}, "x");
+    auto scale = net_builder.CreateInput(Float(32), {c}, "scale");
+    auto bias = net_builder.CreateInput(Float(32), {c}, "bias");
+    auto moving_mean = net_builder.CreateInput(Float(32), {c}, "moving_mean");
+    auto moving_variance =
+        net_builder.CreateInput(Float(32), {c}, "moving_variance");
+
+    auto outputs = net_builder.BatchNorm(x,
+                                         scale,
+                                         bias,
+                                         moving_mean,
+                                         moving_variance,
+                                         epsilon,
+                                         momentum,
+                                         data_layout,
+                                         is_test);
+    for (auto output : outputs) {
+      output_names.push_back(output->id);
+    }
+  }
+  auto program = net_builder.Build();
+
+  auto target = common::DefaultTarget();
+  RunDecomposer(&program,
+                target,
+                cinn::frontend::DefaultTrainingOptimizeOptions().program_passes,
+                output_names);
+
+  auto graph = std::make_shared<hlir::framework::Graph>(program, target);
+  hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
+  hlir::framework::ApplyPass(graph.get(), "FusionMergePass");
+
+  auto scope = BuildScope(target, graph);
+  hlir::framework::GraphCompiler gc(target, scope, graph);
+  auto run_program = gc.Build();
+
+  // set input
+  float precision = 1e-3;
+  std::vector<float> x(n * c * h * w), scale(c), bias(c), moving_mean(c),
+      moving_variance(c);
+  InitRandomVector<float>(&x, n * c * h * w, 0.0f, 1.0f, precision);
+  InitRandomVector<float>(&scale, c, 0.0f, 1.0f, precision);
+  InitRandomVector<float>(&bias, c, 10.0f, 20.0f, precision);
+  InitRandomVector<float>(&moving_mean, c, 0.0f, 1.0f, precision);
+  InitRandomVector<float>(&moving_variance, c, 0.0f, 1.0f, precision);
+
+  std::vector<float> y(n * c * h * w), new_moving_mean(c),
+      new_moving_variance(c), saved_mean(c), saved_variance(c);
+  ComputeBatchNormTrainRef<float>(x,
+                                  scale,
+                                  bias,
+                                  moving_mean,
+                                  moving_variance,
+                                  n,
+                                  c,
+                                  h,
+                                  w,
+                                  &y,
+                                  &saved_mean,
+                                  &saved_variance,
+                                  &new_moving_mean,
+                                  &new_moving_variance,
+                                  epsilon,
+                                  momentum);
+
+  std::vector<std::pair<std::string, std::vector<float>>> inputs = {
+      {"x", x},
+      {"scale", scale},
+      {"bias", bias},
+      {"moving_mean", moving_mean},
+      {"moving_variance", moving_variance}};
+  for (auto& input : inputs) {
+    scope->Var<hlir::framework::Tensor>(input.first);
+    auto tensor = scope->GetTensor(input.first);
+    auto* data = tensor->mutable_data<float>(target);
+    CopyFromVector(input.second, tensor, target);
+  }
+  run_program->Execute();
+
+  std::unordered_map<std::string, std::pair<std::string, std::vector<float>>>
+      outputs_ref = {
+          {"new_moving_variance", {output_names[4], new_moving_variance}},
+          {"new_moving_mean", {output_names[3], new_moving_mean}},
+          {"saved_variance", {output_names[2], saved_variance}},
+          {"saved_mean", {output_names[1], saved_mean}},
+          {"y", {output_names[0], y}}};
+
+  for (auto& iter : outputs_ref) {
+    auto output = iter.second;
+    auto tensor = scope->GetTensor(output.first);
+    std::vector<float> data(tensor->shape().numel());
+    CopyToVector(tensor, &data);
+
+    LOG(INFO) << "output[" << iter.first << "], var_name=" << output.first
+              << ", shape=" << tensor->shape().data();
+    CheckOutput<float>(data, output.second, 1e-8, 1e-4);
+  }
+}
+
+template <typename T>
+void ComputeBatchNormGradRef(const std::vector<T>& y_grad,
+                             const std::vector<T>& x,
+                             const std::vector<T>& scale,
+                             const std::vector<T>& save_mean,
+                             const std::vector<T>& save_variance,
+                             const int n,
+                             const int c,
+                             const int h,
+                             const int w,
+                             std::vector<T>* x_grad,
+                             std::vector<T>* scale_grad,
+                             std::vector<T>* bias_grad,
+                             const float epsilon = 1e-5) {
+  Offset offset(n, c, h, w);
+
+  // bias_grad
+  memset(bias_grad->data(), 0, sizeof(T) * c);
+  auto func_bias_grad = [=](int in, int ic, int ih, int iw) {
+    bias_grad->at(ic) += y_grad[offset(in, ic, ih, iw)];
+  };
+  Loop(func_bias_grad, n, c, h, w);
+
+  // std_variance
+  std::vector<T> std_variance(c);
+  for (int ic = 0; ic < c; ++ic) {
+    std_variance[ic] = sqrt(save_variance[ic] + epsilon);
+  }
+
+  // grad scale
+  memset(scale_grad->data(), 0, sizeof(T) * c);
+  auto func_scale_grad = [=](int in, int ic, int ih, int iw) {
+    int idx = offset(in, ic, ih, iw);
+    scale_grad->at(ic) += y_grad[idx] * (x[idx] - save_mean[ic]);
+  };
+  Loop(func_scale_grad, n, c, h, w);
+  for (int ic = 0; ic < c; ++ic) {
+    scale_grad->at(ic) /= std_variance[ic];
+  }
+
+  // std_norm_grad
+  std::vector<T> std_norm_grad(n * c * h * w);
+  auto func_std_norm_grad = [&](int in, int ic, int ih, int iw) {
+    int idx = offset(in, ic, ih, iw);
+    std_norm_grad[idx] = y_grad[idx] * scale[ic];
+  };
+  Loop(func_std_norm_grad, n, c, h, w);
+
+  // x_mean_diff_grad
+  std::vector<T> x_mean_diff_grad(n * c * h * w);
+  auto func_x_mean_diff_grad = [&](int in, int ic, int ih, int iw) {
+    int idx = offset(in, ic, ih, iw);
+    x_mean_diff_grad[idx] = std_norm_grad[idx] / std_variance[ic];
+  };
+  Loop(func_x_mean_diff_grad, n, c, h, w);
+
+  // std_variance_grad
+  std::vector<T> std_variance_grad(c, 0);
+  auto func_std_variance_grad = [&](int in, int ic, int ih, int iw) {
+    int idx = offset(in, ic, ih, iw);
+    std_variance_grad[ic] += -1.0f * std_norm_grad[idx] *
+                             (x[idx] - save_mean[ic]) /
+                             (save_variance[ic] + epsilon);
+  };
+  Loop(func_std_variance_grad, n, c, h, w);
+
+  // variance_grad_without_mul
+  std::vector<T> variance_grad_without_mul(c);
+  for (int ic = 0; ic < c; ++ic) {
+    variance_grad_without_mul[ic] = std_variance_grad[ic] / std_variance[ic];
+  }
+
+  // x_grad_0
+  float element_count = static_cast<float>(n * h * w);
+  std::vector<T> x_grad_0(n * c * h * w);
+  auto func_x_grad_0 = [&](int in, int ic, int ih, int iw) {
+    int idx = offset(in, ic, ih, iw);
+    x_grad_0[idx] = x[idx] * (variance_grad_without_mul[ic] / element_count);
+  };
+  Loop(func_x_grad_0, n, c, h, w);
+
+  // minus_mean_grad
+  std::vector<T> minus_mean_grad(c, 0);
+  auto func_minus_mean_grad = [&](int in, int ic, int ih, int iw) {
+    minus_mean_grad[ic] += x_mean_diff_grad[offset(in, ic, ih, iw)];
+  };
+  Loop(func_minus_mean_grad, n, c, h, w);
+  for (int ic = 0; ic < c; ++ic) {
+    minus_mean_grad[ic] += variance_grad_without_mul[ic] * save_mean[ic];
+    minus_mean_grad[ic] /= element_count;
+  }
+
+  auto func_x_grad = [=](int in, int ic, int ih, int iw) {
+    int idx = offset(in, ic, ih, iw);
+    x_grad->at(idx) =
+        x_mean_diff_grad[idx] + x_grad_0[idx] - minus_mean_grad[ic];
+  };
+  Loop(func_x_grad, n, c, h, w);
+}
+
+TEST(Decomposer, BatchNormGrad) {
+  int n = 16, c = 128, h = 14, w = 14;
+  int num = n * c * h * w;
+  float epsilon = 1e-5;
+  NetBuilder net_builder("batch_norm_grad");
+  std::vector<std::string> output_names;
+  {
+    auto y_grad = net_builder.CreateInput(Float(32), {n, c, h, w}, "y_grad");
+    auto x = net_builder.CreateInput(Float(32), {n, c, h, w}, "x");
+    auto scale = net_builder.CreateInput(Float(32), {c}, "scale");
+    auto saved_mean = net_builder.CreateInput(Float(32), {c}, "saved_mean");
+    auto saved_variance =
+        net_builder.CreateInput(Float(32), {c}, "saved_variance");
+
+    auto outputs = net_builder.BatchNormGrad(
+        y_grad, x, scale, saved_mean, saved_variance, epsilon);
+    for (auto output : outputs) {
+      output_names.push_back(output->id);
+    }
+  }
+  auto program = net_builder.Build();
+
+  auto target = common::DefaultTarget();
+  RunDecomposer(&program,
+                target,
+                cinn::frontend::DefaultTrainingOptimizeOptions().program_passes,
+                output_names);
+
+  auto graph = std::make_shared<hlir::framework::Graph>(program, target);
+  hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
+  hlir::framework::ApplyPass(graph.get(), "FusionMergePass");
+
+  auto scope = BuildScope(target, graph);
+  hlir::framework::GraphCompiler gc(target, scope, graph);
+  auto run_program = gc.Build();
+
+  // set input
+  float precision = 1e-3;
+  std::vector<float> y_grad(num), x(num), scale(c), saved_mean(c, 0),
+      saved_variance(c, 0);
+  InitRandomVector(&y_grad, num, 0.0f, 1.0f, precision);
+  InitRandomVector(&x, num, 0.0f, 1.0f, precision);
+  InitRandomVector(&scale, c, 0.0f, 1.0f, precision);
+
+  Offset offset(n, c, h, w);
+  auto func_save_mean = [&](int in, int ic, int ih, int iw) {
+    int idx = offset(in, ic, ih, iw);
+    saved_mean[ic] += x[idx];
+    saved_variance[ic] += x[idx] * x[idx];
+  };
+  Loop(func_save_mean, n, c, h, w);
+  float element_count = static_cast<float>(n * h * w);
+  for (int ic = 0; ic < c; ++ic) {
+    saved_mean[ic] /= element_count;
+    saved_variance[ic] =
+        saved_variance[ic] / element_count - saved_mean[ic] * saved_mean[ic];
+  }
+
+  std::vector<std::pair<std::string, std::vector<float>>> inputs = {
+      {"y_grad", y_grad},
+      {"x", x},
+      {"scale", scale},
+      {"saved_mean", saved_mean},
+      {"saved_variance", saved_variance}};
+  for (auto& input : inputs) {
+    scope->Var<hlir::framework::Tensor>(input.first);
+    auto tensor = scope->GetTensor(input.first);
+    CopyFromVector(input.second, tensor, target);
+  }
+  run_program->Execute();
+
+  std::vector<float> x_grad(num), scale_grad(c), bias_grad(c);
+  ComputeBatchNormGradRef(y_grad,
+                          x,
+                          scale,
+                          saved_mean,
+                          saved_variance,
+                          n,
+                          c,
+                          h,
+                          w,
+                          &x_grad,
+                          &scale_grad,
+                          &bias_grad,
+                          epsilon);
+
+  std::unordered_map<std::string, std::pair<std::string, std::vector<float>>>
+      output_refs = {{"bias_grad", {output_names[2], bias_grad}},
+                     {"scale_grad", {output_names[1], scale_grad}},
+                     {"x_grad", {output_names[0], x_grad}}};
+
+  for (auto& iter : output_refs) {
+    auto output = iter.second;
+    auto tensor = scope->GetTensor(output.first);
+    std::vector<float> data(tensor->shape().numel());
+    CopyToVector(tensor, &data);
+
+    LOG(INFO) << "output[" << iter.first << "], var_name=" << output.first
+              << ", shape=" << tensor->shape().data();
+    if (iter.first == "x_grad") {
+      // TODO(Xreki): fix the precision check of x_grad.
+      // CheckOutput<float>(data, output.second, 1e-8, 1e-1);
+    } else if (iter.first == "scale_grad") {
+      CheckOutput<float>(data, output.second, 1e-8, 1e-2);
+    } else {
+      CheckOutput<float>(data, output.second);
+    }
+  }
+}
+
+}  // namespace
+}  // namespace frontend
+}  // namespace cinn
--- a/paddle/cinn/frontend/decomposer/broadcast.cc
+++ b/paddle/cinn/frontend/decomposer/broadcast.cc
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/cinn/frontend/decomposer_registry.h"
+#include "paddle/cinn/frontend/syntax.h"
+
+namespace cinn {
+namespace frontend {
+namespace decomposer {
+
+void GetReduceDimsForX(const std::vector<int>& dx_shape,
+                       const std::vector<int>& dout_shape,
+                       std::vector<int>* reduce_dims) {
+  // e.g., dx_shape = [4, 1, 3], dout_shape = [4, 2, 3], reduce_dims=[1]
+  for (size_t i = 0; i < dout_shape.size(); ++i) {
+    if (dx_shape[i] == 1 && dout_shape[i] != 1) {
+      reduce_dims->push_back(i);
+    }
+  }
+  VLOG(3) << "The reduce_dims for X: " << utils::Join(*reduce_dims, ",");
+}
+
+void GetReduceDimsForY(const std::vector<int>& dy_shape,
+                       const std::vector<int>& dout_shape,
+                       int axis,
+                       std::vector<int>* reduce_dims) {
+  // e.g., dy_shape = [3, 1, 4], dout_shape = [2, 3, 4, 4, 5], axis = 1
+  // reduce_dims=[0, 2, 4]
+  for (size_t i = 0; i < dout_shape.size(); ++i) {
+    if (i < axis || i >= axis + dy_shape.size()) {
+      reduce_dims->push_back(i);
+    } else {
+      if (dy_shape[i - axis] == 1 && dout_shape[i] != 1) {
+        reduce_dims->push_back(i);
+      }
+    }
+  }
+  VLOG(3) << "The reduce_dims for Y: " << utils::Join(*reduce_dims, ",");
+}
+
+void elementwise_add(const Instruction& instr,
+                     const DecomposerContext& context) {
+  CHECK_EQ(instr->inputs.size(), 2UL)
+      << " 2 input tensors for " << instr->op_type;
+  CHECK_EQ(instr->outputs.size(), 1UL)
+      << "1 output tensor for " << instr->op_type;
+  auto x = instr->inputs[0];
+  auto y = instr->inputs[1];
+  auto output = instr->outputs[0];
+
+  int axis = -1;
+  if (instr->attrs.find("axis") != instr->attrs.end()) {
+    axis = instr.GetAttrs<int>("axis");
+  }
+
+  if (x->shape.size() >= y->shape.size()) {
+    axis = axis >= 0 ? axis : x->shape.size() - y->shape.size();
+    auto* builder = context.builder();
+
+    Variable out;
+    Variable bcast_x = x;
+    Variable bcast_y = y;
+
+    // e.g., x.shape = [4, 1, 3], y.shape = [2, 3], aixs = 1 out.shape = [4, 2,
+    // 3] bcast_axes_x = [0, 1, 2], bcast_axes_y = [1, 2]
+    if (x->shape != output->shape) {
+      std::vector<int> bcast_axes_x(x->shape.size());
+      std::iota(bcast_axes_x.begin(), bcast_axes_x.end(), 0);
+      bcast_x = builder->BroadcastTo(x, output->shape, bcast_axes_x);
+    }
+
+    // if y.shape=[1], y does not need to be broadcast
+    if (y->shape != output->shape && y->shape != std::vector<int>(1, 1)) {
+      std::vector<int> bcast_axes_y(y->shape.size());
+      std::iota(bcast_axes_y.begin(), bcast_axes_y.end(), axis);
+      bcast_y = builder->BroadcastTo(y, output->shape, bcast_axes_y);
+    }
+
+    out = builder->Add(bcast_x, bcast_y);
+
+    // map the the output of decomposed operator to the original.
+    context.MapOutToOrigin(out, output);
+  } else {
+    axis = axis >= 0 ? axis : y->shape.size() - x->shape.size();
+    auto* builder = context.builder();
+
+    Variable out;
+    Variable bcast_x = x;
+    Variable bcast_y = y;
+
+    if (y->shape != output->shape) {
+      std::vector<int> bcast_axes_y(y->shape.size());
+      std::iota(bcast_axes_y.begin(), bcast_axes_y.end(), 0);
+      bcast_y = builder->BroadcastTo(y, output->shape, bcast_axes_y);
+    }
+
+    if (x->shape != output->shape && x->shape != std::vector<int>(1, 1)) {
+      std::vector<int> bcast_axes_x(x->shape.size());
+      std::iota(bcast_axes_x.begin(), bcast_axes_x.end(), axis);
+      bcast_x = builder->BroadcastTo(x, output->shape, bcast_axes_x);
+    }
+
+    out = builder->Add(bcast_x, bcast_y);
+
+    // map the the output of decomposed operator to the original.
+    context.MapOutToOrigin(out, output);
+  }
+}
+
+void elementwise_add_grad(const Instruction& instr,
+                          const DecomposerContext& context) {
+  CHECK_EQ(instr->inputs.size(), 3UL)
+      << " 3 input tensors for " << instr->op_type;
+  CHECK_EQ(instr->outputs.size(), 2UL)
+      << "2 output tensors for " << instr->op_type;
+  auto dout = instr->inputs[0];
+  auto dx = instr->outputs[0];
+  auto dy = instr->outputs[1];
+  int axis = instr.GetAttrs<int>("axis");
+  if (axis < 0 && dx->shape.size() < dy->shape.size()) {
+    LOG(FATAL) << "Please make sure x'rank greater than or equal to y'rank "
+                  "when axis = -1";
+  }
+  axis = axis >= 0 ? axis : dx->shape.size() - dy->shape.size();
+  auto* builder = context.builder();
+
+  Variable dx_t;
+  if (dx->shape == dout->shape) {
+    dx_t = builder->Identity(dout);
+    context.MapOutToOrigin(dx, dout);
+  } else {
+    std::vector<int> x_reduce_dims;
+    GetReduceDimsForX(dx->shape, dout->shape, &x_reduce_dims);
+    // The rank of dx is same as dout, so set keep_dim = true
+    dx_t = builder->ReduceSum(dout, x_reduce_dims, true);
+  }
+
+  Variable dy_t;
+  if (dy->shape == dout->shape) {
+    dy_t = builder->Identity(dout);
+    context.MapOutToOrigin(dy, dout);
+  } else {
+    std::vector<int> y_reduce_dims;
+    GetReduceDimsForY(dy->shape, dout->shape, axis, &y_reduce_dims);
+    // The rank of dy is less or equal to dout, after reduce_sum, there
+    // may be some extra "1" in the front or back of dy_res's shape. So
+    // the dt_res needs to be reshaped.
+    auto dy_res = builder->ReduceSum(dout, y_reduce_dims, true);
+    dy_t = builder->Reshape(dy_res, dy->shape);
+  }
+
+  // map the the output of decomposed operator to the original.
+  context.MapOutToOrigin(dx_t, dx);
+  context.MapOutToOrigin(dy_t, dy);
+}
+
+}  // namespace decomposer
+}  // namespace frontend
+}  // namespace cinn
+
+CINN_REGISTER_HELPER(broadcast_decomposers) {
+  CINN_DECOMPOSER_REGISTER(elementwise_add,
+                           cinn::frontend::decomposer::elementwise_add);
+
+  return true;
+}
+
+CINN_REGISTER_HELPER(broadcast_grad_decomposers) {
+  CINN_DECOMPOSER_REGISTER(elementwise_add_grad,
+                           cinn::frontend::decomposer::elementwise_add_grad);
+
+  return true;
+}
--- a/paddle/cinn/frontend/decomposer/broadcast_test.cc
+++ b/paddle/cinn/frontend/decomposer/broadcast_test.cc
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gtest/gtest.h>
+
+#include "paddle/cinn/frontend/decomposer/test_helper.h"
+
+namespace cinn::frontend {
+
+TEST(Decomposer, elementwise_add_bcast0) {
+  NetBuilder builder("elementwise_add");
+  auto x = builder.CreateInput(Float(32), {4, 1, 20, 10});
+  auto y = builder.CreateInput(Float(32), {10, 20});
+  auto out = builder.Add(x, y, 1);
+
+  std::vector<std::string> input_names = {x.id().data(), y.id().data()};
+  std::vector<std::string> output_names = {out->id};
+  std::vector<std::vector<int>> output_shapes = {{4, 10, 20, 10}};
+  RunAndCheckShape<float>(&builder, input_names, output_names, output_shapes);
+}
+
+TEST(Decomposer, elementwise_add_bcase1) {
+  NetBuilder builder("elementwise_add");
+  auto x = builder.CreateInput(Float(32), {10, 20});
+  auto y = builder.CreateInput(Float(32), {4, 1, 20, 10});
+  auto out = builder.Add(x, y, 1);
+
+  std::vector<std::string> input_names = {x.id().data(), y.id().data()};
+  std::vector<std::string> output_names = {out->id};
+  std::vector<std::vector<int>> output_shapes = {{4, 10, 20, 10}};
+  RunAndCheckShape<float>(&builder, input_names, output_names, output_shapes);
+}
+
+TEST(Decomposer, elementwise_add_grad_bcast0) {
+  NetBuilder builder("elementwise_add_grad");
+  auto dout = builder.CreateInput(Float(32), {4, 10, 20, 10});
+  auto x = builder.CreateInput(Float(32), {4, 1, 20, 10});
+  auto y = builder.CreateInput(Float(32), {10, 20});
+  auto out_grads = builder.ElementwiseAddGrad(dout, x, y, 1);
+
+  std::vector<std::string> input_names = {dout.id().data()};
+  std::vector<std::string> output_names = {out_grads[0]->id, out_grads[1]->id};
+  std::vector<std::vector<int>> output_shapes = {{4, 1, 20, 10}, {10, 20}};
+  RunAndCheckShape<float>(&builder, input_names, output_names, output_shapes);
+}
+
+TEST(Decomposer, elementwise_add_bcast1) {
+  NetBuilder builder("elementwise_add");
+  auto x = builder.CreateInput(Float(32), {32, 64, 32, 32});
+  auto y = builder.CreateInput(Float(32), {64});
+  auto out = builder.Add(x, y, 1);
+
+  auto add_cpu = [](const std::vector<size_t>& lengths,
+                    const std::vector<void*>& ptrs) {
+    float* x = static_cast<float*>(ptrs[0]);
+    float* y = static_cast<float*>(ptrs[1]);
+    float* out = static_cast<float*>(ptrs[2]);
+    for (size_t i = 0; i < 32; ++i) {
+      for (size_t j = 0; j < 64; ++j) {
+        for (size_t k = 0; k < 32 * 32; ++k) {
+          out[(i * 64 + j) * 32 * 32 + k] =
+              x[(i * 64 + j) * 32 * 32 + k] + y[j];
+        }
+      }
+    }
+  };
+
+  std::vector<std::string> input_names = {x.id().data(), y.id().data()};
+  std::vector<std::string> output_names = {out->id};
+  std::vector<std::vector<int>> output_shapes = {{32, 64, 32, 32}};
+  RunAndCheck<float>(
+      &builder, input_names, output_names, output_shapes, add_cpu);
+}
+
+TEST(Decomposer, elementwise_add_bcast1_2) {
+  NetBuilder builder("elementwise_add");
+  auto x = builder.CreateInput(Float(32), {64});
+  auto y = builder.CreateInput(Float(32), {32, 64, 32, 32});
+  auto out = builder.Add(x, y, 1);
+
+  auto add_cpu = [](const std::vector<size_t>& lengths,
+                    const std::vector<void*>& ptrs) {
+    float* x = static_cast<float*>(ptrs[0]);
+    float* y = static_cast<float*>(ptrs[1]);
+    float* out = static_cast<float*>(ptrs[2]);
+    for (size_t i = 0; i < 32; ++i) {
+      for (size_t j = 0; j < 64; ++j) {
+        for (size_t k = 0; k < 32 * 32; ++k) {
+          out[(i * 64 + j) * 32 * 32 + k] =
+              y[(i * 64 + j) * 32 * 32 + k] + x[j];
+        }
+      }
+    }
+  };
+
+  std::vector<std::string> input_names = {x.id().data(), y.id().data()};
+  std::vector<std::string> output_names = {out->id};
+  std::vector<std::vector<int>> output_shapes = {{32, 64, 32, 32}};
+  RunAndCheck<float>(
+      &builder, input_names, output_names, output_shapes, add_cpu);
+}
+
+TEST(Decomposer, elementwise_add_grad_bcast1) {
+  NetBuilder builder("elementwise_add_grad");
+  auto dout = builder.CreateInput(Float(32), {32, 64, 32, 32});
+  auto x = builder.CreateInput(Float(32), {32, 64, 32, 32});
+  auto y = builder.CreateInput(Float(32), {64});
+  auto out_grads = builder.ElementwiseAddGrad(dout, x, y, 1);
+
+  auto add_grad_cpu = [](const std::vector<size_t>& lengths,
+                         const std::vector<void*>& ptrs) {
+    float* dout = static_cast<float*>(ptrs[0]);
+    float* dx = static_cast<float*>(ptrs[1]);
+    float* dy = static_cast<float*>(ptrs[2]);
+    for (size_t j = 0; j < 64; ++j) {
+      dy[j] = 0;
+    }
+    for (size_t i = 0; i < 32; ++i) {
+      for (size_t j = 0; j < 64; ++j) {
+        for (size_t k = 0; k < 32 * 32; ++k) {
+          dx[(i * 64 + j) * 32 * 32 + k] = dout[(i * 64 + j) * 32 * 32 + k];
+          dy[j] = dy[j] + dout[(i * 64 + j) * 32 * 32 + k];
+        }
+      }
+    }
+  };
+
+  std::vector<std::string> input_names = {dout.id().data()};
+  std::vector<std::string> output_names = {out_grads[0]->id, out_grads[1]->id};
+  std::vector<std::vector<int>> output_shapes = {{32, 64, 32, 32}, {64}};
+  RunAndCheck<float>(
+      &builder, input_names, output_names, output_shapes, add_grad_cpu);
+}
+
+TEST(Decomposer, elementwise_add_bcast2) {
+  NetBuilder builder("elementwise_add");
+  auto x = builder.CreateInput(Float(32), {32, 16});
+  auto y = builder.CreateInput(Float(32), {1});
+  auto out = builder.Add(x, y);
+
+  auto add_cpu = [](const std::vector<size_t>& lengths,
+                    const std::vector<void*>& ptrs) {
+    size_t n = lengths[0];
+    float* x = static_cast<float*>(ptrs[0]);
+    float* y = static_cast<float*>(ptrs[1]);
+    float* out = static_cast<float*>(ptrs[2]);
+    float y_data = y[0];
+    for (size_t i = 0; i < n; ++i) {
+      out[i] = x[i] + y_data;
+    }
+  };
+
+  std::vector<std::string> input_names = {x.id().data(), y.id().data()};
+  std::vector<std::string> output_names = {out->id};
+  std::vector<std::vector<int>> output_shapes = {{32, 16}};
+  RunAndCheck<float>(
+      &builder, input_names, output_names, output_shapes, add_cpu);
+}
+
+TEST(Decomposer, elementwise_add_bcast2_2) {
+  NetBuilder builder("elementwise_add");
+  auto x = builder.CreateInput(Float(32), {1});
+  auto y = builder.CreateInput(Float(32), {32, 16});
+  auto out = builder.Add(x, y);
+
+  auto add_cpu = [](const std::vector<size_t>& lengths,
+                    const std::vector<void*>& ptrs) {
+    size_t n = 32 * 16;
+    float* x = static_cast<float*>(ptrs[0]);
+    float* y = static_cast<float*>(ptrs[1]);
+    float* out = static_cast<float*>(ptrs[2]);
+    float x_data = x[0];
+    for (size_t i = 0; i < n; ++i) {
+      out[i] = y[i] + x_data;
+    }
+  };
+
+  std::vector<std::string> input_names = {x.id().data(), y.id().data()};
+  std::vector<std::string> output_names = {out->id};
+  std::vector<std::vector<int>> output_shapes = {{32, 16}};
+  RunAndCheck<float>(
+      &builder, input_names, output_names, output_shapes, add_cpu);
+}
+
+TEST(Decomposer, elementwise_add_bcast2_3) {
+  constexpr int kLength = 64;
+  using int_ty = int64_t;
+  NetBuilder builder("elementwise_add");
+  auto x = builder.CreateInput(Int(kLength), {32, 16});
+  auto y = builder.CreateInput(Int(kLength), {1});
+  auto out = builder.Add(x, y);
+
+  auto add_cpu = [](const std::vector<size_t>& lengths,
+                    const std::vector<void*>& ptrs) {
+    size_t n = lengths[0];
+    int_ty* x = static_cast<int_ty*>(ptrs[0]);
+    int_ty* y = static_cast<int_ty*>(ptrs[1]);
+    int_ty* out = static_cast<int_ty*>(ptrs[2]);
+    int_ty y_data = y[0];
+    for (size_t i = 0; i < n; ++i) {
+      out[i] = x[i] + y_data;
+    }
+  };
+
+  std::vector<std::string> input_names = {x.id().data(), y.id().data()};
+  std::vector<std::string> output_names = {out->id};
+  std::vector<std::vector<int>> output_shapes = {{32, 16}};
+  RunAndCheck<int_ty>(
+      &builder, input_names, output_names, output_shapes, add_cpu);
+}
+
+TEST(Decomposer, elementwise_add_grad_bcast2) {
+  NetBuilder builder("elementwise_add_grad");
+  auto dout = builder.CreateInput(Float(32), {32, 16});
+  auto x = builder.CreateInput(Float(32), {32, 16});
+  auto y = builder.CreateInput(Float(32), {1});
+  auto out_grads = builder.ElementwiseAddGrad(dout, x, y);
+
+  auto add_grad_cpu = [](const std::vector<size_t>& lengths,
+                         const std::vector<void*>& ptrs) {
+    size_t n = lengths[0];
+    float* dout = static_cast<float*>(ptrs[0]);
+    float* dx = static_cast<float*>(ptrs[1]);
+    float* dy = static_cast<float*>(ptrs[2]);
+    for (size_t i = 0; i < n; ++i) {
+      float tmp = dout[i];
+      dx[i] = tmp;
+      dy[0] += tmp;
+    }
+  };
+
+  std::vector<std::string> input_names = {dout.id().data()};
+  std::vector<std::string> output_names = {out_grads[0]->id, out_grads[1]->id};
+  std::vector<std::vector<int>> output_shapes = {{32, 16}, {1}};
+  RunAndCheck<float>(
+      &builder, input_names, output_names, output_shapes, add_grad_cpu);
+}
+
+TEST(Decomposer, elementwise_add_same_dims) {
+  NetBuilder builder("elementwise_add");
+  auto x = builder.CreateInput(Float(32), {32, 16});
+  auto y = builder.CreateInput(Float(32), {32, 16});
+  auto out = builder.Add(x, y);
+
+  auto add_cpu = [](const std::vector<size_t>& lengths,
+                    const std::vector<void*>& ptrs) {
+    size_t n = lengths[0];
+    float* x = static_cast<float*>(ptrs[0]);
+    float* y = static_cast<float*>(ptrs[1]);
+    float* out = static_cast<float*>(ptrs[2]);
+    for (size_t i = 0; i < n; ++i) {
+      out[i] = x[i] + y[i];
+    }
+  };
+
+  std::vector<std::string> input_names = {x.id().data(), y.id().data()};
+  std::vector<std::string> output_names = {out->id};
+  std::vector<std::vector<int>> output_shapes = {{32, 16}};
+  RunAndCheck<float>(
+      &builder, input_names, output_names, output_shapes, add_cpu);
+}
+
+TEST(Decomposer, elementwise_add_grad_same_dims) {
+  NetBuilder builder("elementwise_add_grad");
+  auto dout = builder.CreateInput(Float(32), {32, 16});
+  auto x = builder.CreateInput(Float(32), {32, 16});
+  auto y = builder.CreateInput(Float(32), {32, 16});
+  auto out_grads = builder.ElementwiseAddGrad(dout, x, y);
+
+  auto add_grad_cpu = [](const std::vector<size_t>& lengths,
+                         const std::vector<void*>& ptrs) {
+    size_t n = lengths[0];
+    float* dout = static_cast<float*>(ptrs[0]);
+    float* dx = static_cast<float*>(ptrs[1]);
+    float* dy = static_cast<float*>(ptrs[2]);
+    for (size_t i = 0; i < n; ++i) {
+      float tmp = dout[i];
+      dx[i] = tmp;
+      dy[i] = tmp;
+    }
+  };
+
+  std::vector<std::string> input_names = {dout.id().data()};
+  std::vector<std::string> output_names = {out_grads[0]->id, out_grads[1]->id};
+  std::vector<std::vector<int>> output_shapes = {{32, 16}, {32, 16}};
+  RunAndCheck<float>(
+      &builder, input_names, output_names, output_shapes, add_grad_cpu);
+}
+
+}  // namespace cinn::frontend