2.5

992bec46 · “yuguo” · 0259837d · 992bec46 · 992bec46 · 992bec46
Commit 992bec46 authored Oct 08, 2023 by “yuguo”
20 changed files
--- a/paddle/cinn/backends/_x86_builtin_source.cc
+++ b/paddle/cinn/backends/_x86_builtin_source.cc
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+///                     Predefined utilities in CINN BEGIN(
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#include <immintrin.h>
+#include <math.h>
+#include <omp.h>
+#include <stdint.h>
+
+#include <vector>
+
+#include "paddle/cinn/runtime/cpu/thread_backend.h"
+
+#ifndef _CINN_X86_BUILTIN_SOURCE_
+#define _CINN_X86_BUILTIN_SOURCE_
+//! Vector in stack, this can only used in generated .cc file.
+template <typename T, size_t Num>
+struct StackVec {
+  typedef T value_type;
+  typedef StackVec<T, Num> self_type;
+
+  self_type& operator=(const StackVec& src) {
+    if (this != &src) {
+      memcpy(data_, src.data_, num_bytes());
+    }
+    return *this;
+  }
+
+  StackVec() { memset(data_, 0, num_bytes()); }
+
+  explicit StackVec(const T* externl) : external_data_(externl) {}
+
+  static self_type Broadcast(const value_type& v) {
+    self_type res;
+    for (size_t i = 0; i < Num; i++) res.data_[i] = v;
+    return res;
+  }
+
+  static self_type Ramp(const value_type& base, const value_type& stride) {
+    self_type res;
+    for (size_t i = 0; i < Num; i++) {
+      res.data_[i] = base + stride * i;
+    }
+  }
+
+  static self_type Load(const void* base, int32_t offset) {
+    self_type res;
+    memcpy(&res.data_[0], (const value_type*)base + offset, num_bytes());
+  }
+
+  static self_type Load(const void* base,
+                        const StackVec<int32_t, Num>& offset) {
+    self_type res;
+    for (size_t i = 0; i < Num; i++) {
+      res.data_[i] = ((const value_type*)base)[offset[i]];
+    }
+  }
+
+  void Store(void* base, int32_t offset) const {
+    mempcpy((value_type*)base + offset, &data_[0], num_bytes());  // NOLINT
+  }
+
+  inline value_type& operator[](size_t i) { return data_[i]; }
+  inline value_type operator[](size_t i) const { return data_[i]; }
+
+  // binary operator between two vectors
+  // @{
+#define __(op__)                                                           \
+  friend self_type operator op__(const self_type& a, const self_type& b) { \
+    self_type res;                                                         \
+    for (size_t i = 0; i < Num; i++) {                                     \
+      res.data_[i] = a[i] op__ b[i];                                       \
+    }                                                                      \
+    return res;                                                            \
+  }
+  __(+)
+  __(-)
+  __(*)
+  __(/)
+  __(%)
+  // @}
+#undef __
+
+  // binary operator between a vector and a scalar
+  // @{
+#define __(op__)                                                            \
+  friend self_type operator op__(const self_type& a, const value_type& b) { \
+    self_type res;                                                          \
+    for (size_t i = 0; i < Num; i++) {                                      \
+      res.data_[i] = a[i] op__ b;                                           \
+    }                                                                       \
+    return res;                                                             \
+  }
+  __(+)
+  __(-)
+  __(*)
+  __(/)
+  __(%)
+#undef __
+  // @}
+
+  static constexpr size_t num_bytes() { return sizeof(data_); }
+
+ private:
+  T data_[Num];
+  T* external_data_{nullptr};
+};
+
+/**
+ * The vector with external data.
+ */
+template <typename T, size_t Num>
+struct ExternalVec {
+  typedef T value_type;
+  typedef ExternalVec<T, Num> self_type;
+
+  explicit ExternalVec(T* data) : data_(data) {}
+
+  self_type& operator=(const self_type& src) {
+    if (data_ != src.data_) {
+      memcpy(data_, src.data_, num_bytes());
+    }
+    return *this;
+  }
+
+  static self_type Load(const void* base, int32_t offset) {
+    self_type res((T*)base + offset);  // NOLINT
+    return res;
+  }
+
+  static constexpr size_t num_bytes() { return sizeof(value_type) * Num; }
+
+ private:
+  T* data_{nullptr};
+};
+
+// AVX256 load
+//@{
+inline __m256 cinn_avx256_load(const float* dst) { return _mm256_load_ps(dst); }
+inline __m256d cinn_avx256_load(const double* dst) {
+  return _mm256_load_pd(dst);
+}
+//@}
+// AVX512 load
+//@{
+inline __m512 cinn_avx512_load(const float* dst) { return _mm512_load_ps(dst); }
+inline __m512d cinn_avx512_load(const double* dst) {
+  return _mm512_load_pd(dst);
+}
+//@}
+
+// FP32x8 * FP32x8
+// @{
+inline void cinn_avx256_add(float* dst, float* a, float* b) {
+  _mm256_store_ps(dst, _mm256_add_ps(_mm256_load_ps(a), _mm256_load_ps(b)));
+}
+inline void cinn_avx256_sub(float* dst, float* a, float* b) {
+  _mm256_store_ps(dst, _mm256_sub_ps(_mm256_load_ps(a), _mm256_load_ps(b)));
+}
+inline void cinn_avx256_mul(float* dst, float* a, float* b) {
+  _mm256_store_ps(dst, _mm256_mul_ps(_mm256_load_ps(a), _mm256_load_ps(b)));
+}
+inline void cinn_avx256_div(float* dst, float* a, float* b) {
+  _mm256_store_ps(dst, _mm256_div_ps(_mm256_load_ps(a), _mm256_load_ps(b)));
+}
+// @}
+
+// FP32x4 * float
+// @{
+inline void cinn_avx256_add(float* dst, float* a, float b) {
+  _mm256_store_ps(dst, _mm256_add_ps(_mm256_load_ps(a), _mm256_set1_ps(b)));
+}
+inline void cinn_avx256_sub(float* dst, float* a, float b) {
+  _mm256_store_ps(dst, _mm256_sub_ps(_mm256_load_ps(a), _mm256_set1_ps(b)));
+}
+inline void cinn_avx256_mul(float* dst, float* a, float b) {
+  _mm256_store_ps(dst, _mm256_mul_ps(_mm256_load_ps(a), _mm256_set1_ps(b)));
+}
+inline void cinn_avx256_div(float* dst, float* a, float b) {
+  _mm256_store_ps(dst, _mm256_div_ps(_mm256_load_ps(a), _mm256_set1_ps(b)));
+}
+// @}
+
+// float * FP32x4
+// @{
+inline void cinn_avx256_add(float* dst, float a, float* b) {
+  _mm256_store_ps(dst, _mm256_add_ps(_mm256_set1_ps(a), _mm256_load_ps(b)));
+}
+inline void cinn_avx256_sub(float* dst, float a, float* b) {
+  _mm256_store_ps(dst, _mm256_sub_ps(_mm256_set1_ps(a), _mm256_load_ps(b)));
+}
+inline void cinn_avx256_mul(float* dst, float a, float* b) {
+  _mm256_store_ps(dst, _mm256_mul_ps(_mm256_set1_ps(a), _mm256_load_ps(b)));
+}
+inline void cinn_avx256_div(float* dst, float a, float* b) {
+  _mm256_store_ps(dst, _mm256_div_ps(_mm256_set1_ps(a), _mm256_load_ps(b)));
+}
+// @}
+
+// 4 x float64
+// @{
+inline void cinn_avx256_add(double* dst, double* a, double* b) {
+  _mm256_store_pd(dst, _mm256_add_pd(_mm256_load_pd(a), _mm256_load_pd(b)));
+}
+inline void cinn_avx256_sub(double* dst, double* a, double* b) {
+  _mm256_store_pd(dst, _mm256_sub_pd(_mm256_load_pd(a), _mm256_load_pd(b)));
+}
+inline void cinn_avx256_mul(double* dst, double* a, double* b) {
+  _mm256_store_pd(dst, _mm256_mul_pd(_mm256_load_pd(a), _mm256_load_pd(b)));
+}
+inline void cinn_avx256_div(double* dst, double* a, double* b) {
+  _mm256_store_pd(dst, _mm256_div_pd(_mm256_load_pd(a), _mm256_load_pd(b)));
+}
+// @}
+
+// FP32x4 * FP64
+// @{
+inline void cinn_avx256_add(double* dst, double* a, double b) {
+  _mm256_store_pd(dst, _mm256_add_pd(_mm256_load_pd(a), _mm256_set1_pd(b)));
+}
+inline void cinn_avx256_sub(double* dst, double* a, double b) {
+  _mm256_store_pd(dst, _mm256_sub_pd(_mm256_load_pd(a), _mm256_set1_pd(b)));
+}
+inline void cinn_avx256_mul(double* dst, double* a, double b) {
+  _mm256_store_pd(dst, _mm256_mul_pd(_mm256_load_pd(a), _mm256_set1_pd(b)));
+}
+inline void cinn_avx256_div(double* dst, double* a, double b) {
+  _mm256_store_pd(dst, _mm256_div_pd(_mm256_load_pd(a), _mm256_set1_pd(b)));
+}
+// @}
+
+// float * FP32x4
+// @{
+inline void cinn_avx256_add(double* dst, double a, double* b) {
+  _mm256_store_pd(dst, _mm256_add_pd(_mm256_set1_pd(a), _mm256_load_pd(b)));
+}
+inline void cinn_avx256_sub(double* dst, double a, double* b) {
+  _mm256_store_pd(dst, _mm256_sub_pd(_mm256_set1_pd(a), _mm256_load_pd(b)));
+}
+inline void cinn_avx256_mul(double* dst, double a, double* b) {
+  _mm256_store_pd(dst, _mm256_mul_pd(_mm256_set1_pd(a), _mm256_load_pd(b)));
+}
+inline void cinn_avx256_div(double* dst, double a, double* b) {
+  _mm256_store_pd(dst, _mm256_div_pd(_mm256_set1_pd(a), _mm256_load_pd(b)));
+}
+// @}
+
+//! 32 x float32 operations.
+// @{
+inline void cinn_avx512_add(float* dst, float* a, float* b) {
+  _mm512_store_ps(dst, _mm512_add_ps(_mm512_load_ps(a), _mm512_load_ps(b)));
+}
+inline void cinn_avx512_sub(float* dst, float* a, float* b) {
+  _mm512_store_ps(dst, _mm512_sub_ps(_mm512_load_ps(a), _mm512_load_ps(b)));
+}
+inline void cinn_avx512_mul(float* dst, float* a, float* b) {
+  _mm512_store_ps(dst, _mm512_mul_ps(_mm512_load_ps(a), _mm512_load_ps(b)));
+}
+inline void cinn_avx512_div(float* dst, float* a, float* b) {
+  _mm512_store_ps(dst, _mm512_div_ps(_mm512_load_ps(a), _mm512_load_ps(b)));
+}
+// @}
+
+// FP32x4 * FP64
+// @{
+inline void cinn_avx512_add(float* dst, float* a, float b) {
+  _mm512_store_pd(dst, _mm512_add_pd(_mm512_load_pd(a), _mm512_set1_pd(b)));
+}
+inline void cinn_avx512_sub(float* dst, float* a, float b) {
+  _mm512_store_pd(dst, _mm512_sub_pd(_mm512_load_pd(a), _mm512_set1_pd(b)));
+}
+inline void cinn_avx512_mul(float* dst, float* a, float b) {
+  _mm512_store_pd(dst, _mm512_mul_pd(_mm512_load_pd(a), _mm512_set1_pd(b)));
+}
+inline void cinn_avx512_div(float* dst, float* a, float b) {
+  _mm512_store_pd(dst, _mm512_div_pd(_mm512_load_pd(a), _mm512_set1_pd(b)));
+}
+// @}
+
+// float * FP32x4
+// @{
+inline void cinn_avx512_add(float* dst, float a, float* b) {
+  _mm512_store_pd(dst, _mm512_add_pd(_mm512_set1_pd(a), _mm512_load_pd(b)));
+}
+inline void cinn_avx512_sub(float* dst, float a, float* b) {
+  _mm512_store_pd(dst, _mm512_sub_pd(_mm512_set1_pd(a), _mm512_load_pd(b)));
+}
+inline void cinn_avx512_mul(float* dst, float a, float* b) {
+  _mm512_store_pd(dst, _mm512_mul_pd(_mm512_set1_pd(a), _mm512_load_pd(b)));
+}
+inline void cinn_avx512_div(float* dst, float a, float* b) {
+  _mm512_store_pd(dst, _mm512_div_pd(_mm512_set1_pd(a), _mm512_load_pd(b)));
+}
+// @}
+
+//! 16 x float32 operations.
+// @{
+inline void cinn_avx512_add(double* dst, double* a, double* b) {
+  _mm512_store_pd(dst, _mm512_add_pd(_mm512_load_pd(a), _mm512_load_pd(b)));
+}
+inline void cinn_avx512_sub(double* dst, double* a, double* b) {
+  _mm512_store_pd(dst, _mm512_sub_pd(_mm512_load_pd(a), _mm512_load_pd(b)));
+}
+inline void cinn_avx512_mul(double* dst, double* a, double* b) {
+  _mm512_store_pd(dst, _mm512_mul_pd(_mm512_load_pd(a), _mm512_load_pd(b)));
+}
+inline void cinn_avx512_div(double* dst, double* a, double* b) {
+  _mm512_store_pd(dst, _mm512_div_pd(_mm512_load_pd(a), _mm512_load_pd(b)));
+}
+// @}
+
+inline __m512 cinn_avx512_add(const __m512& a, const __m512& b);
+
+inline __m256 cinn_avx256_add_float(const __m256& a, const __m256& b) {
+  return _mm256_add_ps(a, b);
+}
+inline __m256d cinn_avx256_add_double(const __m256d& a, const __m256d& b) {
+  return _mm256_add_pd(a, b);
+}
+inline __m512 cinn_avx512_add_float(const __m512& a, const __m512& b) {
+  return _mm512_add_ps(a, b);
+}
+inline __m512d cinn_avx512_add_double(const __m512d& a, const __m512d& b) {
+  return _mm512_add_pd(a, b);
+}
+
+//! set1
+// @{
+inline __m256 cinn_avx256_set1(float value) { return _mm256_set1_ps(value); }
+inline __m256d cinn_avx256_set1(double value) { return _mm256_set1_pd(value); }
+inline __m512 cinn_avx512_set1(float value) { return _mm512_set1_ps(value); }
+inline __m512d cinn_avx512_set1(double value) { return _mm512_set1_pd(value); }
+// @}
+
+//! store
+// @{
+inline void cinn_avx512_store(float* dst, const __m512& x) {
+  _mm512_store_ps(dst, x);
+}
+inline void cinn_avx512_store(double* dst, const __m512d& x) {
+  _mm512_store_pd(dst, x);
+}
+inline void cinn_avx256_store(float* dst, const __m256& x) {
+  _mm256_store_ps(dst, x);
+}
+inline void cinn_avx256_store(double* dst, const __m256d& x) {
+  _mm256_store_pd(dst, x);
+}
+// @}
+
+//! add
+// @{
+inline __m256 cinn_avx256_add(const __m256& a, const __m256& b) {
+  return _mm256_add_ps(a, b);
+}
+inline __m256d cinn_avx256_add(const __m256d& a, const __m256d& b) {
+  return _mm256_add_pd(a, b);
+}
+inline __m512 cinn_avx512_add(const __m512& a, const __m512& b) {
+  return _mm512_add_ps(a, b);
+}
+inline __m512d cinn_avx512_add(const __m512d& a, const __m512d& b) {
+  return _mm512_add_pd(a, b);
+}
+// @}
+
+//! mul
+// @{
+inline __m256 cinn_avx256_mul(const __m256& a, const __m256& b) {
+  return _mm256_mul_ps(a, b);
+}
+inline __m256d cinn_avx256_mul(const __m256d& a, const __m256d& b) {
+  return _mm256_mul_pd(a, b);
+}
+inline __m512 cinn_avx512_mul(const __m512& a, const __m512& b) {
+  return _mm512_mul_ps(a, b);
+}
+inline __m512d cinn_avx512_mul(const __m512d& a, const __m512d& b) {
+  return _mm512_mul_pd(a, b);
+}
+// @}
+
+//! fma
+// @{
+inline __m128 cinn_avx128_fma(const __m128& a,
+                              const __m128& b,
+                              const __m128& c) {
+  return _mm_fmadd_ps(a, b, c);
+}
+inline __m128d cinn_avx128_fma(const __m128d& a,
+                               const __m128d& b,
+                               const __m128d& c) {
+  return _mm_fmadd_pd(a, b, c);
+}
+inline __m256 cinn_avx256_fma(const __m256& a,
+                              const __m256& b,
+                              const __m256& c) {
+  return _mm256_fmadd_ps(a, b, c);
+}
+inline __m256d cinn_avx256_fma(const __m256d& a,
+                               const __m256d& b,
+                               const __m256d& c) {
+  return _mm256_fmadd_pd(a, b, c);
+}
+inline __m512 cinn_avx512_fma(const __m512& a,
+                              const __m512& b,
+                              const __m512& c) {
+  return _mm512_fmadd_ps(a, b, c);
+}
+inline __m512d cinn_avx512_fma(const __m512d& a,
+                               const __m512d& b,
+                               const __m512d& c) {
+  return _mm512_fmadd_pd(a, b, c);
+}
+// @}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+///                     )END Predefined utilities in CINN
+////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+#endif  // _CINN_X86_BUILTIN_SOURCE_
--- a/paddle/cinn/backends/codegen_c.cc
+++ b/paddle/cinn/backends/codegen_c.cc
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/cinn/backends/codegen_c.h"
+
+#include <fstream>
+#include <string>
+
+#include "paddle/cinn/backends/extern_func_emitter.h"
+#include "paddle/cinn/backends/extern_func_emitter_builtin.h"
+#include "paddle/cinn/ir/lowered_func.h"
+#include "paddle/cinn/ir/op/ir_operators.h"
+#include "paddle/cinn/ir/utils/ir_verify.h"
+#include "paddle/cinn/optim/ir_simplify.h"
+#include "paddle/cinn/optim/remove_nested_block.h"
+#include "paddle/cinn/runtime/cpu/thread_backend.h"
+#include "paddle/cinn/runtime/intrinsic.h"
+#include "paddle/cinn/utils/string.h"
+
+//! Root of the builtin code.
+DECLARE_string(cinn_x86_builtin_code_root);
+
+namespace cinn {
+namespace backends {
+using namespace utils;  // NOLINT
+using cinn::common::float16;
+
+const char *kCKeywordRestrict = "__restrict__";
+
+void CodeGenC::Compile(const ir::Module &module, const Outputs &outputs) {
+  ir::IrVerify(Expr(module));
+
+  if (!outputs.c_header_name.empty()) {
+    auto source = Compile(module, OutputKind::CHeader);
+    str_ = "";
+    std::ofstream file(outputs.c_header_name);
+    CHECK(file.is_open()) << "failed to open file " << outputs.c_header_name;
+    file << source;
+    file.close();
+    LOG(WARNING) << "Output C header to file " << outputs.c_header_name;
+  }
+
+  if (!outputs.c_source_name.empty()) {
+    auto source = Compile(module, OutputKind::CImpl);
+    str_ = "";
+    std::ofstream file(outputs.c_source_name);
+    CHECK(file.is_open()) << "failed to open file " << outputs.c_source_name;
+    file << source;
+    file.close();
+    LOG(WARNING) << "Output C source to file " << outputs.c_source_name;
+  }
+}
+
+CodeGenC::CodeGenC(Target target) : ir::IrPrinter(ss_) {}
+
+std::string CodeGenC::Compile(const ir::Module &module,
+                              OutputKind output_kind) {
+  if (output_kind == OutputKind::CHeader) {
+    GenerateHeaderFile(module);
+  } else if (output_kind == OutputKind::CImpl) {
+    PrintIncludes();
+
+    if (inline_builtin_codes_) PrintBuiltinCodes();
+
+    for (auto &func : module.functions()) {
+      Compile(func);
+    }
+  } else {
+    LOG(FATAL) << "Not supported OutputKind";
+  }
+  return str_;
+}
+
+// TODO(LiuYang): Here the Ret type seems unuseful
+void CodeGenC::Compile(const ir::LoweredFunc &function) {
+  CHECK(function.defined());
+  IrPrinter::Visit(function);
+  str_ += "\n\n";
+}
+
+std::string CodeGenC::GetTypeName(Type type) {
+  // common scalar type
+#define GET_SCALAR_TYPE(pred_expr, scalar_name) \
+  if (pred_expr) {                              \
+    return scalar_name;                         \
+  }
+
+  GET_SCALAR_TYPE(type.is_void(), "void");
+  GET_SCALAR_TYPE(type.is_bool(), "bool");
+
+  GET_SCALAR_TYPE(type.is_int(8), "int8_t");
+  GET_SCALAR_TYPE(type.is_int(16), "int16_t");
+  GET_SCALAR_TYPE(type.is_int(32), "int32_t");
+  GET_SCALAR_TYPE(type.is_int(64), "int64_t");
+
+  GET_SCALAR_TYPE(type.is_uint(8), "uint8_t");
+  GET_SCALAR_TYPE(type.is_uint(16), "uint16_t");
+  GET_SCALAR_TYPE(type.is_uint(32), "uint32_t");
+  GET_SCALAR_TYPE(type.is_uint(64), "uint64_t");
+
+  GET_SCALAR_TYPE(type.is_bfloat16(), "bfloat16");
+  GET_SCALAR_TYPE(type.is_float16(), "float16");
+  GET_SCALAR_TYPE(type.is_float(32), "float")
+  GET_SCALAR_TYPE(type.is_float(64), "double")
+#undef GET_SCALAR_TYPE
+
+  // customized_type
+  if (type.is_customized_type()) {
+    CHECK(!type.customized_type().empty()) << "customized_type can't be empty.";
+    auto customized_name = type.customized_type();
+    // get name of a cuda built-in vector type, it is started with a
+    // 'CudaVectorType::' prefix
+    if (utils::Startswith(customized_name,
+                          common::customized_type::kcuda_builtin_vector_t)) {
+      customized_name.erase(
+          0, strlen(common::customized_type::kcuda_builtin_vector_t));
+    }
+    return customized_name;
+  }
+
+  // other types are not implemented yet
+  CINN_NOT_IMPLEMENTED
+  return "";
+}
+
+std::string CodeGenC::GetTypeRepr(Type type) {
+  std::string str;
+  if (type.is_cpp_const()) {
+    str = "const ";
+  }
+
+  str += GetTypeName(type);
+  if (type.is_cpp_handle()) {
+    str += "*";
+  } else if (type.is_cpp_handle2()) {
+    str += "**";
+  }
+  return str;
+}
+void CodeGenC::Visit(const ir::IntImm *op) { IrPrinter::Visit(op); }
+void CodeGenC::Visit(const ir::UIntImm *op) { IrPrinter::Visit(op); }
+void CodeGenC::Visit(const ir::FloatImm *op) { IrPrinter::Visit(op); }
+void CodeGenC::Visit(const ir::StringImm *op) { IrPrinter::Visit(op); }
+void CodeGenC::Visit(const ir::Add *op) { IrPrinter::Visit(op); }
+void CodeGenC::Visit(const ir::Sub *op) { IrPrinter::Visit(op); }
+void CodeGenC::Visit(const ir::Mul *op) { IrPrinter::Visit(op); }
+void CodeGenC::Visit(const ir::Div *op) { IrPrinter::Visit(op); }
+void CodeGenC::Visit(const ir::Mod *op) {
+  auto copied = op->b();
+  optim::Simplify(&copied);
+  if (copied.is_constant()) {
+    int temp = static_cast<int>(copied.get_constant());
+    if ((temp & (temp - 1)) == 0) {
+      str_ += "(";
+      IrPrinter::Visit(op->a());
+      str_ += " & ";
+      str_ += std::to_string(temp - 1);
+      str_ += ")";
+      return;
+    }
+  }
+  PrintBinaryOp("%", op);
+}
+void CodeGenC::Visit(const ir::EQ *op) { IrPrinter::Visit(op); }
+void CodeGenC::Visit(const ir::NE *op) { IrPrinter::Visit(op); }
+void CodeGenC::Visit(const ir::LT *op) { IrPrinter::Visit(op); }
+void CodeGenC::Visit(const ir::LE *op) { IrPrinter::Visit(op); }
+void CodeGenC::Visit(const ir::GT *op) { IrPrinter::Visit(op); }
+void CodeGenC::Visit(const ir::GE *op) { IrPrinter::Visit(op); }
+void CodeGenC::Visit(const ir::And *op) { PrintBinaryOp("&&", op); }
+void CodeGenC::Visit(const ir::Or *op) { PrintBinaryOp("||", op); }
+void CodeGenC::Visit(const ir::Min *op) { IrPrinter::Visit(op); }
+void CodeGenC::Visit(const ir::Max *op) { IrPrinter::Visit(op); }
+void CodeGenC::Visit(const ir::Minus *op) { IrPrinter::Visit(op); }
+void CodeGenC::Visit(const ir::Not *op) {
+  str_ += "(!";
+  IrPrinter::Visit(op->v());
+  str_ += ")";
+}
+void CodeGenC::Visit(const ir::Cast *op) { PrintCastExpr(op->type(), op->v()); }
+void CodeGenC::Visit(const ir::For *op) {
+  Expr extent = op->extent;
+  Expr min = op->min;
+  int num_task = 1;
+  if (op->is_parallel()) {
+    str_ += "int num_task = max_concurrency();\n";
+    DoIndent();
+    str_ += "omp_set_num_threads(num_task);\n";
+    DoIndent();
+    str_ += "auto flambda = [=](int task_id, int num_task) -> int {\n";
+    IncIndent();
+    DoIndent();
+    str_ += "int n_per_task = ";
+    Expr num_task_var = Var("num_task");
+    IrPrinter::Visit((op->extent + num_task_var - 1) / num_task_var);
+    str_ += ";\n";
+    CHECK_EQ(min.as_int32(), 0);
+    auto task_id = Var("task_id");
+    auto n_per_task = Var("n_per_task");
+    min = task_id * n_per_task;
+    extent = (task_id + 1) * n_per_task;
+    DoIndent();
+  }
+  str_ += "for (";
+  str_ += GetTypeRepr(Int(32));
+  str_ += " ";
+  str_ += op->loop_var->name;
+  str_ += " = ";
+  IrPrinter::Visit(min);
+  str_ += "; ";
+  str_ += op->loop_var->name;
+  str_ += " < ";
+  IrPrinter::Visit(op->extent);
+  if (op->is_parallel()) {
+    str_ += " && ";
+    str_ += op->loop_var->name;
+    str_ += " < ";
+    IrPrinter::Visit(extent);
+  }
+  str_ += "; ";
+
+  str_ += op->loop_var->name;
+  str_ += " += 1";
+  str_ += ") ";
+
+  IrPrinter::Visit(op->body);
+  if (op->is_parallel()) {
+    str_ += "\n";
+    DoIndent();
+    str_ += "return 0;\n";
+    DecIndent();
+    DoIndent();
+    str_ += "};\n";
+    str_ += "#pragma omp parallel num_threads(num_task)\n";
+    DoIndent();
+    str_ += "{\n";
+    IncIndent();
+    DoIndent();
+    str_ += "int task_id = omp_get_thread_num();\n";
+    DoIndent();
+    str_ += "flambda(task_id, num_task);\n";
+    DecIndent();
+    DoIndent();
+    str_ += "}";
+  }
+}
+void CodeGenC::Visit(const ir::PolyFor *op) {
+  str_ += "for (";
+  str_ += GetTypeRepr(Int(32));
+  str_ += " ";
+  str_ += op->iterator->name;
+  str_ += " = ";
+  IrPrinter::Visit(op->init);
+  str_ += "; ";
+  IrPrinter::Visit(op->condition);
+  str_ += "; ";
+
+  str_ += op->iterator->name;
+  str_ += " += ";
+  IrPrinter::Visit(op->inc);
+  str_ += ") ";
+
+  IrPrinter::Visit(op->body);
+}
+void CodeGenC::Visit(const ir::Select *op) {
+  str_ += "(";
+  str_ += "(";
+  IrPrinter::Visit(op->condition);
+  str_ += ") ? ";
+  IrPrinter::Visit(op->true_value);
+  str_ += " : ";
+  IrPrinter::Visit(op->false_value);
+  str_ += ")";
+}
+void CodeGenC::Visit(const ir::IfThenElse *op) {
+  str_ += "if (";
+  IrPrinter::Visit(op->condition);
+  str_ += ") {\n";
+
+  if (!op->true_case.As<ir::Block>()) IncIndent();
+  DoIndent();
+  IrPrinter::Visit(op->true_case);
+  if (!op->true_case.As<ir::Block>()) str_ += ";";
+  str_ += "\n";
+
+  if (!op->true_case.As<ir::Block>()) DecIndent();
+
+  DoIndent();
+  str_ += "}";
+
+  if (op->false_case.defined()) {
+    str_ += " else {\n";
+
+    if (!op->true_case.As<ir::Block>()) IncIndent();
+    DoIndent();
+    IrPrinter::Visit(op->false_case);
+    if (!op->false_case.As<ir::Block>()) str_ += ";";
+    str_ += "\n";
+    if (!op->true_case.As<ir::Block>()) DecIndent();
+
+    DoIndent();
+    str_ += "}";
+  }
+}
+void CodeGenC::Visit(const ir::Block *op) {
+  str_ += "{\n";
+
+  IncIndent();
+
+  for (int i = 0; i < op->stmts.size() - 1; i++) {
+    DoIndent();
+    IrPrinter::Visit(op->stmts[i]);
+    str_ += ";\n";
+  }
+  if (op->stmts.size() >= 1) {
+    DoIndent();
+    IrPrinter::Visit(op->stmts.back());
+    str_ += ";";
+  }
+
+  DecIndent();
+  str_ += "\n";
+  DoIndent();
+  str_ += "}";
+}
+void CodeGenC::Visit(const ir::Call *op) {
+  if (op->name == runtime::intrinsic::buffer_malloc) {
+    PrintCall_buffer_malloc(op);
+  } else if (op->name == runtime::intrinsic::pod_values_to_array_repr) {
+    PrintCall_pod_values_to_array(op);
+  } else if (op->is_intrinsic_call()) {
+    str_ += op->name;
+    str_ += "(";
+    PrintCallArgs(op);
+    str_ += ")";
+  } else if (op->is_cinn_call()) {  // call CINN LoweredFunc
+    str_ += op->name;
+    str_ += "(";
+    PrintCallArgs(op);
+    str_ += ")";
+  } else if (op->is_extern_call()) {
+    const auto &fn_name = ExternFunctionEmitterRegistry::Global().Lookup(
+        ExternFuncID{backend_C, op->name.c_str()});
+    if (!fn_name.empty()) {
+      ExternFunctionLLVMEmitter emitter(fn_name);
+      emitter.BindCodeGen(this);
+      emitter.Emit(op);
+    } else {
+      CHECK(!op->read_args.empty() || !op->write_args.empty());
+      str_ += op->name;
+      str_ += "(";
+      PrintCallArgs(op);
+      str_ += ")";
+    }
+  } else {
+    CINN_NOT_IMPLEMENTED
+  }
+}
+void CodeGenC::PrintCallArgs(const ir::Call *op) {
+  if (!op->read_args.empty()) {
+    for (int i = 0; i < op->read_args.size() - 1; i++) {
+      IrPrinter::Visit(op->read_args[i]);
+      str_ += ", ";
+    }
+    IrPrinter::Visit(op->read_args.back());
+  }
+  if (!op->write_args.empty()) {
+    if (!op->read_args.empty()) str_ += ", ";
+
+    for (int i = 0; i < op->write_args.size() - 1; i++) {
+      IrPrinter::Visit(op->write_args[i]);
+      str_ += ", ";
+    }
+    IrPrinter::Visit(op->write_args.back());
+  }
+}
+
+void CodeGenC::PrintCall_buffer_malloc(const ir::Call *op) {
+  CHECK_EQ(op->read_args.size(), 2UL);
+  str_ += op->name;
+  str_ += "(";
+  PrintCastExpr("void*", op->read_args[0]);
+  str_ += ", ";
+  IrPrinter::Visit(op->read_args[1]);
+  str_ += ")";
+}
+
+void CodeGenC::PrintCall_cinn_pod_value_to_(const ir::Call *op) {
+  CHECK_EQ(op->read_args.size(), 1UL);
+  str_ += op->name;
+  str_ += "(";
+  str_ += "&(";
+  IrPrinter::Visit(op->read_args[0]);
+  str_ += ")";
+  str_ += ")";
+}
+
+void CodeGenC::PrintCall_get_address(const ir::Call *op) {
+  CHECK_EQ(op->read_args.size(), 1UL);
+  CHECK(op->write_args.empty());
+  auto *read_var = op->read_args.front().as_var();
+  auto *read_buf = op->read_args.front().as_buffer();
+  CHECK(read_var || read_buf) << "Only Var or Buffer can get address";
+
+  if (read_var) {
+    if (read_var->type().lanes() <= 1) str_ += "&";
+    str_ += read_var->name;
+  } else if (read_buf) {
+    if (read_buf->type().lanes() <= 1) str_ += "&";
+    str_ += read_buf->name;
+  } else {
+    CINN_NOT_IMPLEMENTED
+  }
+}
+
+void CodeGenC::PrintCall_pod_values_to_array(const ir::Call *op) {
+  CHECK(!op->read_args.empty());
+  CHECK_EQ(op->write_args.size(), 1UL);
+  auto output_var = op->write_args.front().as_var_ref();
+  CHECK(output_var.defined());
+
+  std::vector<std::string> arg_names;
+  for (auto &arg : op->read_args) {
+    auto arg_var = arg.as_var();
+    CHECK(arg_var);
+    arg_names.push_back(arg_var->name);
+  }
+
+  str_ += "cinn_pod_value_t ";
+  str_ += output_var->name;
+  str_ += "[] = ";
+  str_ += "{ ";
+
+  str_ += utils::Join(arg_names, ", ");
+
+  str_ += " }";
+}
+
+void CodeGenC::Visit(const ir::_Module_ *op) { CINN_NOT_IMPLEMENTED }
+void CodeGenC::Visit(const ir::_Var_ *op) { str_ += op->name; }
+
+void CodeGenC::Visit(const ir::Load *op) {
+  Expr dense_strided_ramp = detail::StridedRampBase(op->index(), 1);
+  if (dense_strided_ramp.defined()) {  // Loading a continuous Ramp address.
+    CHECK(op->type().is_vector());
+    PrintStackVecType(op->type().ElementOf(), op->index().type().lanes());
+    str_ += "::";
+    str_ += "Load(";
+    str_ += op->tensor.As<ir::_Tensor_>()->name;
+    str_ += ",";
+    IrPrinter::Visit(dense_strided_ramp);
+    str_ += ")";
+  } else if (op->index().type().is_vector()) {
+    // gather
+    CHECK(op->type().is_vector());
+    PrintStackVecType(op->type().ElementOf(), op->index().type().lanes());
+    str_ += "::Load(";
+    str_ += op->tensor.As<ir::_Tensor_>()->name;
+    str_ += ",";
+    IrPrinter::Visit(op->index());
+    str_ += ")";
+  } else if (op->is_addr_tensor()) {
+    auto *tensor = op->tensor.As<ir::_Tensor_>();
+    str_ += tensor->name;
+    str_ += "[";
+    IrPrinter::Visit(op->index());
+    str_ += "]";
+  } else {
+    IrPrinter::Visit(op);
+  }
+}
+
+void CodeGenC::Visit(const ir::Store *op) {
+  CHECK(op->is_addr_tensor());
+
+  auto *tensor = op->tensor.As<ir::_Tensor_>();
+  CHECK(tensor);
+  str_ += tensor->name;
+  str_ += "[";
+  IrPrinter::Visit(op->index());
+  str_ += "]";
+  str_ += " = ";
+  IrPrinter::Visit(op->value);
+}
+void CodeGenC::Visit(const ir::Alloc *op) {
+  str_ += runtime::intrinsic::buffer_malloc;
+  str_ += "(";
+  str_ += "(void*)(0), ";
+
+  auto *buffer = op->destination.As<ir::_Buffer_>();
+  str_ += buffer->name;
+  str_ += ")";
+}
+
+void CodeGenC::Visit(const ir::Free *op) {
+  str_ += runtime::intrinsic::buffer_free;
+  str_ += "(";
+  str_ += "(void*)(0), ";
+
+  auto *buffer = op->destination.As<ir::_Buffer_>();
+  str_ += buffer->name;
+  str_ += ")";
+}
+
+void CodeGenC::Visit(const ir::_Buffer_ *op) { str_ += op->name; }
+void CodeGenC::Visit(const ir::_Tensor_ *op) { str_ += op->buffer->name; }
+void CodeGenC::Visit(const ir::Let *op) {
+  bool is_vec = false;
+  CHECK(op->type().valid());
+  if (op->body.defined() && op->body.As<ir::Broadcast>()) {
+    // broadcast's type is hard to print, so use c++11 auto instead.
+    str_ += "auto";
+    is_vec = true;
+  } else {
+    str_ += GetTypeRepr(op->type());
+  }
+
+  str_ += " ";
+  IrPrinter::Visit(op->symbol);
+
+  // native C array.
+  if (op->type().lanes() > 1 && !is_vec) {
+    str_ += "[";
+    str_ += std::to_string(op->type().lanes());
+    str_ += "]";
+  }
+
+  if (op->body.defined()) {
+    str_ += " = ";
+    IrPrinter::Visit(op->body);
+  }
+}
+
+void CodeGenC::Visit(const ir::Reduce *op) {
+  LOG(FATAL) << "Reduce IR is just for internal representation, should not be "
+                "used for CodeGen.";
+}
+
+void CodeGenC::Visit(const ir::Ramp *op) {
+  str_ += "StackVec<";
+  str_ += std::to_string(op->lanes);
+  str_ += ",";
+  str_ += GetTypeRepr(op->type().ElementOf());
+  str_ += ">::Ramp(";
+  IrPrinter::Visit(op->base);
+  str_ += ", ";
+  IrPrinter::Visit(op->stride);
+  str_ += ", ";
+  str_ += std::to_string(op->lanes);
+  str_ += ")";
+}
+
+void CodeGenC::Visit(const ir::Broadcast *op) {
+  str_ += "StackVec<";
+  str_ += std::to_string(op->lanes);
+  str_ += ",";
+  str_ += GetTypeRepr(op->type().ElementOf());
+  str_ += ">::Broadcast(";
+  IrPrinter::Visit(op->value);
+  str_ += ", ";
+  str_ += std::to_string(op->lanes);
+  str_ += ")";
+}
+
+void CodeGenC::Visit(const ir::FracOp *op) { ir::IrPrinter::Visit(op); }
+void CodeGenC::Visit(const ir::Sum *op) { ir::IrPrinter::Visit(op); }
+void CodeGenC::Visit(const ir::Product *op) { ir::IrPrinter::Visit(op); }
+
+void CodeGenC::PrintCastExpr(const Type &type, Expr e) {
+  str_ += "((";
+  str_ += GetTypeRepr(type);
+  str_ += ")";
+  str_ += "(";
+  IrPrinter::Visit(e);
+  str_ += "))";
+}
+void CodeGenC::PrintCastExpr(const std::string &type, Expr e) {
+  str_ += "(";
+  str_ += type;
+  str_ += ")";
+  str_ += "(";
+  IrPrinter::Visit(e);
+  str_ += ")";
+}
+
+void CodeGenC::PrintShape(const std::vector<Expr> &shape,
+                          char leftb,
+                          char rightb) {
+  str_ += leftb;
+  str_ += " ";
+
+  for (int i = 0; i < shape.size() - 1; i++) {
+    IrPrinter::Visit(shape[i]);
+    str_ += ", ";
+  }
+  if (shape.size() > 1) IrPrinter::Visit(shape.back());
+
+  str_ += " ";
+  str_ += rightb;
+}
+
+void CodeGenC::Visit(const ir::_LoweredFunc_ *op) {
+  PrintFunctionDeclaration(op);
+  str_ += "\n";
+
+  DoIndent();
+
+  CHECK_EQ(op->alloc_output_buffer_exprs.size(),
+           op->dealloc_output_buffer_exprs.size())
+      << "the count of allocation and deallocaton expressions is not match";
+
+  std::vector<Expr> new_body;
+
+  std::vector<Expr> create_temp_buffers = op->PrepareCreateTempBufferExprs();
+  std::vector<Expr> alloca_temp_buffers = op->PrepareAllocTempBufferExprs();
+  std::vector<Expr> dealloca_temp_buffers = op->PrepareDeallocTempBufferExprs();
+#define APPEND_TO_NEW_BODY(field__) \
+  new_body.insert(                  \
+      std::end(new_body), std::begin(op->field__), std::end(op->field__));
+  APPEND_TO_NEW_BODY(argument_prepare_exprs)
+  new_body.insert(std::end(new_body),
+                  std::begin(create_temp_buffers),
+                  std::end(create_temp_buffers));
+  APPEND_TO_NEW_BODY(alloc_output_buffer_exprs)
+  new_body.insert(std::end(new_body),
+                  std::begin(alloca_temp_buffers),
+                  std::end(alloca_temp_buffers));
+  APPEND_TO_NEW_BODY(buffer_data_cast_exprs)
+  new_body.push_back(op->body);
+  new_body.insert(std::end(new_body),
+                  std::begin(dealloca_temp_buffers),
+                  std::end(dealloca_temp_buffers));
+  APPEND_TO_NEW_BODY(dealloc_output_buffer_exprs)
+
+  Expr func_body = ir::Block::Make(new_body);
+
+  optim::RemoveNestedBlock(&func_body);
+
+  IrPrinter::Visit(func_body);
+}
+void CodeGenC::PrintIncludes() {
+  str_ += "#include <cinn_runtime.h>\n";
+  str_ += "#include <stdio.h>\n";
+  str_ += "\n";
+}
+
+void CodeGenC::PrintFileGuardOpen(const std::string &name) {
+  str_ += utils::StringFormat("#ifndef _%s_CINN_H_\n", Uppercase(name).c_str());
+  str_ += utils::StringFormat("#define _%s_CINN_H_\n", Uppercase(name).c_str());
+  str_ += "\n";
+}
+void CodeGenC::PrintFileGuardClose(const std::string &module_name) {
+  str_ += utils::StringFormat("#endif  // _%s_CINN_H_\n",
+                              Uppercase(module_name).c_str());
+}
+
+void CodeGenC::PrintBufferCreation(const std::vector<ir::Buffer> &buffers) {
+  for (auto &buffer : buffers) {
+    // Ignore the buffer in other devices.
+    if (!buffer->is_on_host()) continue;
+    DoIndent();
+    auto buffer_ptr_type =
+        Type()
+            .set_customized_type(common::customized_type::kbuffer_t)
+            .set_cpp_handle();
+    Var variable = ir::_Var_::Make(buffer->name, buffer_ptr_type);
+    auto expr = ir::intrinsics::BufferCreate::Make(buffer);
+    expr = ir::Let::Make(variable, expr);
+    IrPrinter::Visit(expr);
+    str_ += ";\n";
+  }
+}
+
+void CodeGenC::PrintBufferDestroy(const std::vector<ir::Buffer> &buffers) {
+  for (auto &buffer : buffers) {
+    DoIndent();
+    IrPrinter::Visit(buffer.DestroyExpr());
+    str_ += ";\n";
+  }
+}
+
+void CodeGenC::GenerateHeaderFile(const ir::Module &module) {
+  PrintFileGuardOpen(module.name());
+  PrintIncludes();
+
+  for (auto &func : module.functions()) {
+    PrintFunctionDeclaration(func.As<ir::_LoweredFunc_>());
+    str_ += ";\n";
+    str_ += "\n\n";
+  }
+
+  PrintFileGuardClose(module.name());
+}
+
+void CodeGenC::PrintFuncArg(const ir::Argument &arg) {
+  if (arg.is_buffer()) {
+    if (arg.is_input()) {
+      str_ += "const struct cinn_buffer_t *";
+    } else {
+      str_ += "struct cinn_buffer_t *";
+    }
+  } else if (arg.is_var()) {
+    str_ += GetTypeRepr(arg.type());
+    str_ += " ";
+    str_ += arg.name();
+  } else {
+    CINN_NOT_IMPLEMENTED
+  }
+  str_ += arg.name();
+}
+
+void CodeGenC::PrintRuntimeType(const cinn_type_t &type) {
+  if (type == cinn_bool_t()) {
+    str_ += "cinn_bool_t()";
+  } else if (type == cinn_int8_t()) {
+    str_ += "cinn_int8_t()";
+  } else if (type == cinn_int16_t()) {
+    str_ += "cinn_int16_t()";
+  } else if (type == cinn_int32_t()) {
+    str_ += "cinn_int32_t()";
+  } else if (type == cinn_int64_t()) {
+    str_ += "cinn_int64_t()";
+  } else if (type == cinn_uint8_t()) {
+    str_ += "cinn_uint8_t()";
+  } else if (type == cinn_uint16_t()) {
+    str_ += "cinn_uint16_t()";
+  } else if (type == cinn_uint32_t()) {
+    str_ += "cinn_uint32_t()";
+  } else if (type == cinn_uint64_t()) {
+    str_ += "cinn_uint64_t()";
+  } else if (type == cinn_bfloat16_t()) {
+    str_ += "cinn_bfloat16_t()";
+  } else if (type == cinn_float16_t()) {
+    str_ += "cinn_float16_t()";
+  } else if (type == cinn_float32_t()) {
+    str_ += "cinn_float32_t()";
+  } else if (type == cinn_float64_t()) {
+    str_ += "cinn_float64_t()";
+  } else {
+    LOG(FATAL) << "Unknown type is not supported to print";
+  }
+}
+
+void CodeGenC::PrintStackVecType(Type type, int lanes) {
+  str_ += "StackedVec<";
+  str_ += GetTypeRepr(type);
+  str_ += ",";
+  str_ += std::to_string(lanes);
+  str_ += ">";
+}
+
+void CodeGenC::Visit(const ir::PrimitiveNode *op) { CINN_NOT_IMPLEMENTED }
+void CodeGenC::Visit(const ir::_BufferRange_ *op) { CINN_NOT_IMPLEMENTED }
+void CodeGenC::Visit(const ir::ScheduleBlock *op) { CINN_NOT_IMPLEMENTED }
+void CodeGenC::Visit(const ir::ScheduleBlockRealize *op) {
+  CINN_NOT_IMPLEMENTED
+}
+
+void CodeGenC::Visit(const ir::IntrinsicOp *op) {
+  switch (op->getKind()) {
+#define __(x)                                     \
+  case ir::IntrinsicKind::k##x:                   \
+    Visit(llvm::dyn_cast<ir::intrinsics::x>(op)); \
+    break;
+
+    INTRINSIC_KIND_FOR_EACH(__)
+#undef __
+  }
+}
+
+void CodeGenC::Visit(const ir::intrinsics::BufferGetDataHandle *op) {
+  str_ += op->buffer.as_buffer()->name;
+  str_ += "->";
+  str_ += "memory";
+}
+
+void CodeGenC::Visit(const ir::intrinsics::BufferGetDataConstHandle *op) {
+  str_ += op->buffer.as_buffer()->name;
+  str_ += "->";
+  str_ += "memory";
+}
+
+void CodeGenC::Visit(const ir::intrinsics::PodValueToX *op) {
+  auto to_type = op->GetOutputType(0);
+  if (to_type == type_of<float>()) {
+    str_ += runtime::intrinsic::pod_value_to_float;
+  } else if (to_type == type_of<double>()) {
+    str_ += runtime::intrinsic::pod_value_to_double;
+  } else if (to_type == type_of<float16>()) {
+    str_ += runtime::intrinsic::pod_value_to_float16;
+  } else if (to_type == type_of<bool>()) {
+    str_ += runtime::intrinsic::pod_value_to_bool;
+  } else if (to_type == type_of<int8_t>()) {
+    str_ += runtime::intrinsic::pod_value_to_int8;
+  } else if (to_type == type_of<int16_t>()) {
+    str_ += runtime::intrinsic::pod_value_to_int16;
+  } else if (to_type == type_of<int32_t>()) {
+    str_ += runtime::intrinsic::pod_value_to_int32;
+  } else if (to_type == type_of<int64_t>()) {
+    str_ += runtime::intrinsic::pod_value_to_int64;
+  } else if (to_type == type_of<uint8_t>()) {
+    str_ += runtime::intrinsic::pod_value_to_uint8;
+  } else if (to_type == type_of<uint16_t>()) {
+    str_ += runtime::intrinsic::pod_value_to_uint16;
+  } else if (to_type == type_of<uint32_t>()) {
+    str_ += runtime::intrinsic::pod_value_to_uint32;
+  } else if (to_type == type_of<uint64_t>()) {
+    str_ += runtime::intrinsic::pod_value_to_uint64;
+  } else if (to_type == type_of<void *>()) {
+    str_ += runtime::intrinsic::pod_value_to_void_p;
+  } else if (to_type == type_of<cinn_buffer_t *>()) {
+    str_ += runtime::intrinsic::pod_value_to_buffer_p;
+  } else {
+    LOG(FATAL) << "Not supported type: " << to_type;
+  }
+
+  str_ += "(";
+  IrPrinter::Visit(op->pod_value_ptr);
+  str_ += ")";
+}
+
+void CodeGenC::Visit(const ir::intrinsics::BufferCreate *op) {
+  const ir::_Buffer_ *buffer_arg = op->buffer.as_buffer();
+  CHECK(buffer_arg);
+
+  str_ += runtime::intrinsic::buffer_create;
+  str_ += "(";
+  PrintCastExpr("cinn_device_kind_t", Expr(buffer_arg->target.runtime_arch()));
+  str_ += "/*target*/, ";
+  PrintRuntimeType(runtime::ToRuntimeType(buffer_arg->dtype.ElementOf()));
+  str_ += ", ";
+  PrintShape(buffer_arg->shape);
+  if (buffer_arg->data_alignment > 0) {
+    str_ += ", ";
+    str_ += std::to_string(buffer_arg->data_alignment);
+    str_ += "/*align*/";
+  }
+  str_ += ")";
+}
+
+void CodeGenC::Visit(const ir::intrinsics::GetAddr *op) {
+  if (op->data.as_buffer()) {
+    str_ += "&";
+    str_ += op->data.as_buffer()->name;
+  } else if (op->data.as_var()) {
+    str_ += "&";
+    str_ += op->data.as_var()->name;
+  } else {
+    str_ += "&(";
+    IrPrinter::Visit(op->data);
+    str_ += ")";
+  }
+}
+
+void CodeGenC::Visit(const ir::intrinsics::ArgsConstruct *op) {
+  str_ += runtime::intrinsic::args_construct_repr;
+  str_ += "(";
+  str_ += op->var->name;
+  str_ += ", ";
+  str_ += std::to_string(op->args.size());
+  str_ += ", ";
+  for (int i = 0; i < op->args.size() - 1; i++) {
+    IrPrinter::Visit(op->args[i]);
+    str_ += ", ";
+  }
+  if (!op->args.empty()) {
+    IrPrinter::Visit(op->args.back());
+  }
+  str_ += ")";
+}
+
+void CodeGenC::Visit(const ir::intrinsics::BuiltinIntrin *op) {
+  str_ += op->name;
+  str_ += "(";
+  if (!op->args.empty()) {
+    for (int i = 0; i < op->args.size() - 1; i++) {
+      IrPrinter::Visit(op->args[i]);
+      str_ += ", ";
+    }
+    IrPrinter::Visit(op->args.back());
+  }
+  str_ += ")";
+}
+
+std::string ReadWholeFile(const std::string &path) {
+  CHECK(!path.empty());
+  std::ifstream file(path);
+  CHECK(file.is_open()) << "Failed to open file: " << path;
+  std::stringstream ss;
+  ss << file.rdbuf();
+  return ss.str();
+}
+
+void CodeGenC::PrintBuiltinCodes() {
+  CHECK(!FLAGS_cinn_x86_builtin_code_root.empty())
+      << "The flag cinn_x86_builtin_code_root should be set first";
+
+  const std::string x86_code_file = "_x86_builtin_source.cc";
+
+  auto source =
+      ReadWholeFile(FLAGS_cinn_x86_builtin_code_root + "/" + x86_code_file);
+
+  str_ += source;
+  str_ += "\n";
+}
+
+namespace detail {
+
+Expr StridedRampBase(Expr e, int stride) {
+  auto *ramp_n = e.As<ir::Ramp>();
+  if (ramp_n) {
+    auto *iv = ramp_n->stride.As<ir::IntImm>();
+    if (iv && iv->value == stride) return ramp_n->base;
+  }
+  return Expr();
+}
+
+}  // namespace detail
+
+}  // namespace backends
+
+}  // namespace cinn
--- a/paddle/cinn/backends/codegen_c.h
+++ b/paddle/cinn/backends/codegen_c.h
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <gflags/gflags.h>
+
+#include <string>
+#include <vector>
+
+#include "paddle/cinn/common/common.h"
+#include "paddle/cinn/ir/intrinsic_ops.h"
+#include "paddle/cinn/ir/ir.h"
+#include "paddle/cinn/ir/lowered_func.h"
+#include "paddle/cinn/ir/module.h"
+#include "paddle/cinn/ir/utils/ir_printer.h"
+#include "paddle/cinn/lang/packed_func.h"
+#include "paddle/cinn/runtime/cinn_runtime.h"
+
+namespace cinn {
+
+namespace ir {
+class Module;
+}  // namespace ir
+
+namespace backends {
+
+//! keyword of __restrict__.
+extern const char* kCKeywordRestrict;
+
+class CodeGenC : public ir::IrPrinter {
+ public:
+  enum class OutputKind {
+    CHeader,  //! output the C header file.
+    CImpl,    //! output the C implementation file.
+  };
+
+  explicit CodeGenC(Target target);
+
+  void Compile(const ir::Module& module, const Outputs& outputs);
+
+  virtual std::string Compile(const ir::Module& module, OutputKind output_kind);
+
+  //! Disable inline the builtin codes(too large) for simpler string comparison.
+  void SetInlineBuiltinCodes(bool x = true) { inline_builtin_codes_ = x; }
+
+ protected:
+  void Compile(const ir::LoweredFunc& function);
+
+  void GenerateHeaderFile(const ir::Module& module);
+
+  std::string GetTypeName(Type type);
+
+  std::string GetTypeRepr(Type type);
+  //! type cast, print like "int(x)"
+  // @{
+  void PrintCastExpr(const Type& type, Expr e);
+  void PrintCastExpr(const std::string& type, Expr e);
+  // @}
+
+  void PrintFunctionDeclaration(const ir::_LoweredFunc_* op) {
+    str_ += "void ";
+    str_ += op->name;
+    str_ += "(";
+    str_ += "void* _args, int32_t num_args";
+    str_ += ")";
+  }
+
+  void PrintShape(const std::vector<Expr>& shape,
+                  char leftb = '{',
+                  char rightb = '}');
+
+  virtual void PrintIncludes();
+  void PrintBuiltinCodes();
+  void PrintFileGuardOpen(const std::string& module_name);
+  void PrintFileGuardClose(const std::string& module_name);
+
+  //! Create the buffers in global scope(just creation without allocating them).
+  void PrintBufferCreation(const std::vector<ir::Buffer>& buffers);
+  void PrintBufferDestroy(const std::vector<ir::Buffer>& buffers);
+  void PrintRuntimeType(const cinn_type_t& type);
+
+  //! Print different kinds of Calls.
+  // @{
+  void PrintCallArgs(const ir::Call* call);
+  void PrintCall_buffer_malloc(const ir::Call* op);
+  void PrintCall_cinn_pod_value_to_(const ir::Call* op);
+  void PrintCall_get_address(const ir::Call* op);
+  void PrintCall_pod_values_to_array(const ir::Call* op);
+  // @}
+
+#define __DEFINE_VISIT(op__) void Visit(const ir::op__* op) override;
+  NODETY_FORALL(__DEFINE_VISIT)
+#undef __DEFINE_VISIT
+
+#define __DEFINE_VISIT(op__) \
+  void Visit(const ir::intrinsics::op__* op) override;
+  INTRINSIC_KIND_FOR_EACH(__DEFINE_VISIT)
+#undef __DEFINE_VISIT
+
+  void PrintFuncArg(const ir::Argument& arg);
+
+  void PrintStackVecType(Type type, int lanes);
+
+  friend class ExternFunctionEmitter;
+
+ protected:
+  Target target_;
+  std::stringstream ss_;
+  bool inline_builtin_codes_{true};
+};
+
+namespace detail {
+
+Expr StridedRampBase(Expr e, int stride);
+
+}  // namespace detail
+
+}  // namespace backends
+}  // namespace cinn
--- a/paddle/cinn/backends/codegen_c_test.cc
+++ b/paddle/cinn/backends/codegen_c_test.cc
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/cinn/backends/codegen_c.h"
+
+#include <gtest/gtest.h>
+
+#include <sstream>
+#include <tuple>
+
+#include "paddle/cinn/cinn.h"
+#include "paddle/cinn/ir/ir.h"
+#include "paddle/cinn/ir/module.h"
+#include "paddle/cinn/lang/builtin.h"
+#include "paddle/cinn/lang/compute.h"
+#include "paddle/cinn/lang/lower.h"
+#include "paddle/cinn/lang/placeholder.h"
+#include "paddle/cinn/optim/ir_simplify.h"
+#include "paddle/cinn/runtime/cpu/use_extern_funcs.h"
+
+namespace cinn {
+namespace backends {
+
+using ir::Module;
+using lang::Compute;
+using lang::Lower;
+using lang::Placeholder;
+using utils::StringFormat;
+using utils::Trim;
+
+std::tuple<ir::Tensor, ir::Tensor, ir::Tensor, lang::Buffer> CreateTensor1() {
+  Expr M(100);
+  Expr N(20);
+  Placeholder<float> A("A", {M, N});
+  Placeholder<float> B("B", {M, N});
+
+  lang::Buffer C_buf(Float(32));
+  auto C = Compute(
+      {M, N}, [&](Var i, Var j) { return A(i, j) + B(i, j); }, "C");
+  C->Bind(C_buf);
+  return std::make_tuple(A, B, C, C_buf);
+}
+
+TEST(CodeGenC, module) {
+  ir::Tensor A, B, C;
+  lang::Buffer C_buf(Float(32));
+  std::tie(A, B, C, C_buf) = CreateTensor1();
+
+  LOG(INFO) << "C.body: " << C->get_compute_op()->body.front();
+
+  Target target;
+  target.arch = Target::Arch ::X86;
+  target.bits = Target::Bit ::k32;
+  target.os = Target::OS ::Linux;
+  Module::Builder builder("module1", target);
+
+  auto stages = CreateStages({A, B, C});
+  auto func = Lower("add1", stages, {A, B, C});
+
+  builder.AddFunction(func);
+
+  {
+    CodeGenC codegen(target);
+    codegen.SetInlineBuiltinCodes(false);
+    auto out = codegen.Compile(builder.Build(), CodeGenC::OutputKind::CImpl);
+    std::cout << "codegen C:" << std::endl << out << std::endl;
+
+    std::string target_str = R"ROC(
+#include <cinn_runtime.h>
+#include <stdio.h>
+
+void add1(void* _args, int32_t num_args)
+{
+  const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0]));
+  const cinn_buffer_t* _B = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1]));
+  cinn_buffer_t* _C = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[2]));
+  cinn_buffer_malloc((void*)(0), _C);
+  const float* A = ((const float*)(_A->memory));
+  const float* B = ((const float*)(_B->memory));
+  float* C = ((float*)(_C->memory));
+  for (int32_t i = 0; i < 100; i += 1) {
+    for (int32_t j = 0; j < 20; j += 1) {
+      C[((20 * i) + j)] = (A[((20 * i) + j)] + B[((20 * i) + j)]);
+    };
+  };
+  cinn_buffer_free((void*)(0), _C);
+}
+)ROC";
+    EXPECT_EQ(utils::Trim(target_str), utils::Trim(out));
+  }
+
+  {
+    CodeGenC compiler(target);
+    auto out = compiler.Compile(builder.Build(), CodeGenC::OutputKind::CHeader);
+    std::cout << "header:\n" << out << std::endl;
+    auto target_str = R"ROC(
+#ifndef _MODULE1_CINN_H_
+#define _MODULE1_CINN_H_
+
+#include <cinn_runtime.h>
+#include <stdio.h>
+
+void add1(void* _args, int32_t num_args);
+
+
+#endif  // _MODULE1_CINN_H_
+)ROC";
+
+    EXPECT_EQ(utils::Trim(out), utils::Trim(target_str));
+  }
+
+  {
+    CodeGenC compiler(target);
+    compiler.SetInlineBuiltinCodes(false);
+    Outputs outputs;
+    outputs = outputs.c_header("./generated_module1.h")
+                  .c_source("./_generated_module1.cc");
+    compiler.Compile(builder.Build(), outputs);
+  }
+}
+
+TEST(CodeGenC, matmul) {
+  using namespace ir;  // NOLINT
+  Context::Global().ResetNameId();
+
+  Placeholder<float> A("A", {Expr(100), Expr(20)});
+  Placeholder<float> B("B", {Expr(20), Expr(50)});
+
+  Target target{};
+
+  Module::Builder builder("module1", target);
+
+  // C = A * B
+  Var k(20, "k0");
+
+  Tensor C = Compute(
+      {Expr(100), Expr(50)},
+      [&](Var i, Var j) { return lang::ReduceSum(A(i, k) * B(k, j), {k}); },
+      "C");
+
+  auto stages = CreateStages({A, B, C});
+
+  // Code gen
+  auto func = Lower("matmul", stages, {A, B, C});
+  builder.AddFunction(func);
+  builder.AddBuffer(C->buffer);
+
+  {  // main
+    std::vector<lang::ReturnType> returns(
+        {lang::ReturnType{Float(32), C->shape, C->name}});
+
+    auto tensors = lang::CallLowered("matmul", {A, B}, returns);
+
+    auto C = tensors[0];
+    C->WithBuffer();
+
+    LOG(INFO) << "C.body: " << C->body();
+
+    auto stages = CreateStages({C});
+
+    auto f = Lower("main", stages, {A, B, C}, {});
+    std::cout << "f\n" << Expr(f) << std::endl;
+    builder.AddFunction(f);
+  }
+
+  CodeGenC codegen(target);
+  codegen.SetInlineBuiltinCodes(false);
+  auto out = codegen.Compile(builder.Build(), CodeGenC::OutputKind::CImpl);
+  std::cout << "codegen C:" << std::endl << out << std::endl;
+
+  auto tgt = R"ROC(
+#include <cinn_runtime.h>
+#include <stdio.h>
+
+void matmul(void* _args, int32_t num_args)
+{
+  const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0]));
+  const cinn_buffer_t* _B = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1]));
+  cinn_buffer_t* _C = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[2]));
+  cinn_buffer_malloc((void*)(0), _C);
+  const float* A = ((const float*)(_A->memory));
+  const float* B = ((const float*)(_B->memory));
+  float* C = ((float*)(_C->memory));
+  float* C__reduce_init = ((float*)(_C->memory));
+  for (int32_t i = 0; i < 100; i += 1) {
+    for (int32_t j = 0; j < 50; j += 1) {
+      C__reduce_init[((50 * i) + j)] = 0.00000000f;
+      for (int32_t k0 = 0; k0 < 20; k0 += 1) {
+        C[((50 * i) + j)] = (C[((50 * i) + j)] + (A[((20 * i) + k0)] * B[((50 * k0) + j)]));
+      };
+    };
+  };
+  cinn_buffer_free((void*)(0), _C);
+}
+
+void main(void* _args, int32_t num_args)
+{
+  const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0]));
+  const cinn_buffer_t* _B = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1]));
+  cinn_buffer_t* _C = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[2]));
+  cinn_buffer_malloc((void*)(0), _C);
+  const float* A = ((const float*)(_A->memory));
+  const float* B = ((const float*)(_B->memory));
+  float* C = ((float*)(_C->memory));
+  cinn_pod_value_t _pod_val_;
+  buffer_p_to_cinn_pod_value(_A, &_pod_val_);
+  cinn_pod_value_t _pod_val__0;
+  buffer_p_to_cinn_pod_value(_B, &_pod_val__0);
+  cinn_pod_value_t _pod_val__1;
+  buffer_p_to_cinn_pod_value(_C, &_pod_val__1);
+  cinn_pod_value_t _pod_arr[3];
+  cinn_args_construct(_pod_arr, 3, &_pod_val_, &_pod_val__0, &_pod_val__1);
+  matmul(_pod_arr, 3);
+  cinn_buffer_free((void*)(0), _C);
+}
+)ROC";
+
+  ASSERT_EQ(Trim(tgt), Trim(out));
+}
+
+// This matches output of competitor.
+TEST(CodeGenC, matmul_tile) {
+  using namespace ir;  // NOLINT
+  Expr M(100);
+  Expr K(200);
+  Expr N(500);
+  Expr bn(32);
+  Placeholder<float> A("A", {M, K});
+  Placeholder<float> B("B", {K, N});
+
+  // C = A * B
+  Var k(K.as_int32(), "k0");
+
+  Tensor C_init = Compute(
+      {M, N}, [&](Var i, Var j) { return Expr(0.f); }, "C_init");
+
+  Tensor C = Compute(
+      {M, N},
+      [&](Var i, Var j) { return lang::ReduceSum(A(i, k) * B(k, j), {k}); },
+      "C");
+
+  auto stages = CreateStages({C, C_init});
+  stages[C]->ShareBufferWith(stages[C_init]);
+
+  {
+    auto _i_outer_i_inner_j_outer_j_inner_ =
+        stages[C_init]->Tile(0, 1, bn.as_int32(), bn.as_int32());  // NOLINT
+    auto &i_outer = std::get<0>(_i_outer_i_inner_j_outer_j_inner_);
+    auto &i_inner = std::get<1>(_i_outer_i_inner_j_outer_j_inner_);
+    auto &j_outer = std::get<2>(_i_outer_i_inner_j_outer_j_inner_);
+    auto &j_inner = std::get<3>(_i_outer_i_inner_j_outer_j_inner_);
+    stages[C_init]->Reorder({i_outer, j_outer, i_inner, j_inner});
+  }
+
+  {
+    auto _i_outer_i_inner_j_outer_j_inner_ =
+        stages[C]->Tile(0, 1, bn.as_int32(), bn.as_int32());  // NOLINT
+    auto &i_outer = std::get<0>(_i_outer_i_inner_j_outer_j_inner_);
+    auto &i_inner = std::get<1>(_i_outer_i_inner_j_outer_j_inner_);
+    auto &j_outer = std::get<2>(_i_outer_i_inner_j_outer_j_inner_);
+    auto &j_inner = std::get<3>(_i_outer_i_inner_j_outer_j_inner_);
+    auto _k_outer_k_inner_ =
+        stages[C]->Split(poly::Iterator("k0"), 4);  // NOLINT
+    auto &k_outer = std::get<0>(_k_outer_k_inner_);
+    auto &k_inner = std::get<1>(_k_outer_k_inner_);
+    stages[C]->Reorder({i_outer, j_outer, i_inner, j_inner, k_outer, k_inner});
+  }
+
+  stages[C_init]->ComputeAtSchedule(
+      stages[C], 3, poly::Stage::kComputeAtBefore);
+
+  // Code gen
+  auto func = Lower("matmul", stages, {A, B, C});
+
+  Target target = common::DefaultHostTarget();
+
+  Module::Builder builder("module1", target);
+  builder.AddFunction(func);
+  builder.AddBuffer(C_init->buffer);
+
+  CodeGenC codegen(target);
+  codegen.SetInlineBuiltinCodes(false);
+  auto out = codegen.Compile(builder.Build(), CodeGenC::OutputKind::CImpl);
+  std::cout << "codegen C:" << std::endl << out << std::endl;
+
+  auto target_out = R"ROC(
+#include <cinn_runtime.h>
+#include <stdio.h>
+
+void matmul(void* _args, int32_t num_args)
+{
+  const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0]));
+  const cinn_buffer_t* _B = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1]));
+  cinn_buffer_t* _C = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[2]));
+  cinn_buffer_malloc((void*)(0), _C);
+  const float* A = ((const float*)(_A->memory));
+  const float* B = ((const float*)(_B->memory));
+  float* C = ((float*)(_C->memory));
+  float* C__reduce_init = ((float*)(_C->memory));
+  float* C_init = ((float*)(_C->memory));
+  for (int32_t i_outer = 0; i_outer < 4; i_outer += 1) {
+    for (int32_t j_outer = 0; j_outer < 16; j_outer += 1) {
+      for (int32_t i_inner = 0; i_inner < cinn_min(32, (100 + (-32 * i_outer))); i_inner += 1) {
+        for (int32_t j_inner = 0; j_inner < cinn_min(32, (500 + (-32 * j_outer))); j_inner += 1) {
+          C__reduce_init[((500 * i_inner) + ((16000 * i_outer) + ((32 * j_outer) + j_inner)))] = 0.00000000f;
+          C_init[((500 * i_inner) + ((16000 * i_outer) + ((32 * j_outer) + j_inner)))] = 0.00000000f;
+          for (int32_t k0_outer = 0; k0_outer < 50; k0_outer += 1) {
+            for (int32_t k0_inner = 0; k0_inner < 4; k0_inner += 1) {
+              C[((500 * i_inner) + ((16000 * i_outer) + ((32 * j_outer) + j_inner)))] = fma(A[((200 * i_inner) + ((6400 * i_outer) + ((4 * k0_outer) + k0_inner)))], B[((32 * j_outer) + ((500 * k0_inner) + ((2000 * k0_outer) + j_inner)))], C[((500 * i_inner) + ((16000 * i_outer) + ((32 * j_outer) + j_inner)))]);
+            };
+          };
+        };
+      };
+    };
+  };
+  cinn_buffer_free((void*)(0), _C);
+}
+)ROC";
+
+  ASSERT_EQ(Trim(target_out), Trim(out));
+}
+
+TEST(CodeGenC, matmul_packed) {
+  Expr M(100);
+  Expr K(200);
+  Expr N(500);
+  Expr bn(32);
+  Placeholder<float> A("A", {M, K});
+  Placeholder<float> B("B", {K, N});
+
+  // TODO(Superjomn) Make sure the domain works.
+  Var k(K.as_int32(), "k0");
+  auto packedB = Compute(
+      {N / bn, K, bn},
+      [&](Expr x, Expr y, Expr z) { return B(y, x * bn + z); },
+      "PackedB");
+  auto C = Compute(
+      {M, N},
+      [&](Expr i, Expr j) {
+        return ReduceSum(A(i, k) * packedB(j / bn, k, j % bn), {k});
+      },
+      "C");
+
+  auto stages = CreateStages({packedB, C});
+
+  {
+    auto _i_outer_i_inner_j_outer_j_inner_ =
+        stages[C]->Tile(0, 1, bn.as_int32(), bn.as_int32());
+    auto &i_outer = std::get<0>(_i_outer_i_inner_j_outer_j_inner_);
+    auto &i_inner = std::get<1>(_i_outer_i_inner_j_outer_j_inner_);
+    auto &j_outer = std::get<2>(_i_outer_i_inner_j_outer_j_inner_);
+    auto &j_inner = std::get<3>(_i_outer_i_inner_j_outer_j_inner_);
+    auto _k_outer_k_inner_ = stages[C]->Split(poly::Iterator("k0"), 4);
+    auto &k_outer = std::get<0>(_k_outer_k_inner_);
+    auto &k_inner = std::get<1>(_k_outer_k_inner_);
+    stages[C]->Reorder({i_outer, j_outer, i_inner, j_inner, k_outer, k_inner});
+  }
+
+  // Code gen
+  auto func = Lower("matmul_with_packing", stages, {A, B, packedB, C});
+
+  Target target = common::DefaultHostTarget();
+
+  Module::Builder builder("module1", target);
+  builder.AddFunction(func);
+  builder.AddBuffer(C->buffer);
+  builder.AddBuffer(packedB->buffer);
+
+  CodeGenC codegen(target);
+  codegen.SetInlineBuiltinCodes(false);
+  auto out = codegen.Compile(builder.Build(), CodeGenC::OutputKind::CImpl);
+  std::cout << "codegen C:" << std::endl << out << std::endl;
+
+  auto target_out = R"ROC(
+#include <cinn_runtime.h>
+#include <stdio.h>
+
+void matmul_with_packing(void* _args, int32_t num_args)
+{
+  const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0]));
+  const cinn_buffer_t* _B = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1]));
+  cinn_buffer_t* _PackedB = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[2]));
+  cinn_buffer_t* _C = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[3]));
+  cinn_buffer_malloc((void*)(0), _PackedB);
+  cinn_buffer_malloc((void*)(0), _C);
+  const float* A = ((const float*)(_A->memory));
+  const float* B = ((const float*)(_B->memory));
+  float* C = ((float*)(_C->memory));
+  float* C__reduce_init = ((float*)(_C->memory));
+  float* PackedB = ((float*)(_PackedB->memory));
+  for (int32_t i = 0; i < 15; i += 1) {
+    for (int32_t j = 0; j < 200; j += 1) {
+      for (int32_t k = 0; k < 32; k += 1) {
+        PackedB[((6400 * i) + ((32 * j) + k))] = B[((32 * i) + ((500 * j) + k))];
+      };
+    };
+  };
+  for (int32_t i_outer = 0; i_outer < 4; i_outer += 1) {
+    for (int32_t j_outer = 0; j_outer < 16; j_outer += 1) {
+      for (int32_t i_inner = 0; i_inner < cinn_min(32, (100 + (-32 * i_outer))); i_inner += 1) {
+        for (int32_t j_inner = 0; j_inner < cinn_min(32, (500 + (-32 * j_outer))); j_inner += 1) {
+          C__reduce_init[((500 * i_inner) + ((16000 * i_outer) + ((32 * j_outer) + j_inner)))] = 0;
+          for (int32_t k0_outer = 0; k0_outer < 50; k0_outer += 1) {
+            for (int32_t k0_inner = 0; k0_inner < 4; k0_inner += 1) {
+              C[((500 * i_inner) + ((16000 * i_outer) + ((32 * j_outer) + j_inner)))] = fma(A[((200 * i_inner) + ((6400 * i_outer) + ((4 * k0_outer) + k0_inner)))], PackedB[((6400 * (j_inner / 32)) + ((j_inner & 31) + ((6400 * j_outer) + ((32 * k0_inner) + (128 * k0_outer)))))], C[((500 * i_inner) + ((16000 * i_outer) + ((32 * j_outer) + j_inner)))]);
+            };
+          };
+        };
+      };
+    };
+  };
+  cinn_buffer_free((void*)(0), _PackedB);
+  cinn_buffer_free((void*)(0), _C);
+}
+)ROC";
+  // ToDo @haoze @wangyue Check Codegen
+  // ASSERT_EQ(utils::Trim(target_out), utils::Trim(out));
+}
+
+TEST(CodeGenC, call_extern) {
+  Expr M(100);
+
+  Placeholder<float> x("x", {M});
+
+  ir::Tensor y = Compute(
+      {M},
+      [=](Var i) -> Expr { return lang::CallExtern("tanh", {x(i)}); },
+      "y");
+
+  auto stages = CreateStages({y});
+
+  auto yexpr = Lower("yy", stages, {y});
+
+  Module::Builder builder("module0", common::DefaultHostTarget());
+  builder.AddFunction(yexpr);
+
+  CodeGenC codegen(common::DefaultHostTarget());
+  codegen.SetInlineBuiltinCodes(false);
+  auto out = codegen.Compile(builder.Build(), CodeGenC::OutputKind::CImpl);
+  std::cout << "codegen C:" << std::endl << out << std::endl;
+}
+
+}  // namespace backends
+}  // namespace cinn
--- a/paddle/cinn/backends/codegen_c_x86.cc
+++ b/paddle/cinn/backends/codegen_c_x86.cc
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/cinn/backends/codegen_c_x86.h"
+
+namespace cinn {
+namespace backends {
+
+void CodeGenCX86::Visit(const ir::Add *op) {
+  VisitBinaryOp(op, op->a(), op->b(), "add");
+}
+void CodeGenCX86::Visit(const ir::Sub *op) {
+  VisitBinaryOp(op, op->a(), op->b(), "sub");
+}
+void CodeGenCX86::Visit(const ir::Mul *op) {
+  VisitBinaryOp(op, op->a(), op->b(), "mul");
+}
+void CodeGenCX86::Visit(const ir::Div *op) {
+  VisitBinaryOp(op, op->a(), op->b(), "div");
+}
+
+void CodeGenCX86::Visit(const ir::Load *op) {
+  Expr dense_strided_ramp = detail::StridedRampBase(op->index(), 1);
+  if (dense_strided_ramp.defined()) {  // Loading a continuous Ramp address.
+    CHECK(op->type().is_vector());
+
+    int bits = op->type().bits() * op->type().lanes();
+    if (SupportsAVX512() && bits == 512) {
+      str_ += "cinn_avx512_load(";
+      PrintAbsAddr(op);
+      str_ += ")";
+    } else if (SupportsAVX256() && bits == 256) {
+      str_ += "cinn_avx256_load(";
+      PrintAbsAddr(op);
+      str_ += ")";
+    } else {
+      CodeGenC::Visit(op);
+    }
+  } else {
+    CodeGenC::Visit(op);
+  }
+}
+
+void CodeGenCX86::Visit(const ir::Broadcast *op) {
+  CHECK_GT(op->type().lanes(), 1);
+  int bits = op->type().bits() * op->type().lanes();
+
+  if (SupportsAVX512() && bits == 512) {
+    str_ += "cinn_avx512_set1(";
+    PrintCastExpr(op->value.type().ElementOf(), op->value);
+    str_ += ")";
+  } else if (SupportsAVX256() && bits == 256) {
+    str_ += "cinn_avx256_set1(";
+    PrintCastExpr(op->value.type().ElementOf(), op->value);
+    str_ += ")";
+  } else {
+    CodeGenC::Visit(op);
+  }
+}
+
+void CodeGenCX86::Visit(const ir::Store *op) {
+  if (op->type().lanes() == 1) {
+    CodeGenC::Visit(op);
+    return;
+  }
+
+  int bits = op->type().bits() * op->type().lanes();
+  if (SupportsAVX512() && bits == 512) {
+    str_ += "cinn_avx512_store(";
+    PrintAbsAddr(op);
+    str_ += ", ";
+    IrPrinter::Visit(op->value);
+    str_ += ")";
+  } else if (SupportsAVX256() && bits == 256) {
+    str_ += "cinn_avx256_store(";
+    PrintAbsAddr(op);
+    str_ += ", ";
+    IrPrinter::Visit(op->value);
+    str_ += ")";
+  } else {
+    CodeGenC::Visit(op);
+  }
+}
+
+void CodeGenCX86::PrintVecInputArgument(const Expr *op) {
+  int bits = op->type().bits() * op->type().lanes();
+  auto *broadcast_n = op->As<ir::Broadcast>();
+
+  if (op->type().lanes() == 1 || broadcast_n) {
+    Expr value = op->type().lanes() == 1 ? *op : broadcast_n->value;
+
+    if (SupportsAVX512()) {
+      str_ += "cinn_avx512_set1(";
+      IrPrinter::Visit(value);
+      str_ += ")";
+    } else if (SupportsAVX256()) {
+      str_ += "cinn_avx256_set1(";
+      IrPrinter::Visit(value);
+      str_ += ")";
+    } else {
+      CINN_NOT_IMPLEMENTED
+    }
+  } else {
+    IrPrinter::Visit(*op);
+  }
+}
+
+void CodeGenCX86::Visit(const ir::intrinsics::BuiltinIntrin *op) {
+  if (op->type().lanes() == 1) {
+    CodeGenC::Visit(op);
+    return;
+  }
+  int bits = op->type().bits() * op->type().lanes();
+  if (SupportsAVX512() && bits == 512) {
+    str_ += "cinn_avx512_";
+    str_ += op->name;
+    str_ += "(";
+    if (!op->args.empty()) {
+      for (int i = 0; i < op->args.size() - 1; i++) {
+        PrintVecInputArgument(&op->args[i]);
+        str_ += ", ";
+      }
+      IrPrinter::Visit(op->args.back());
+    }
+    str_ += ")";
+  } else if (SupportsAVX256() && bits == 256) {
+    str_ += "cinn_avx256_";
+    str_ += op->name;
+    str_ += "(";
+    if (!op->args.empty()) {
+      for (int i = 0; i < op->args.size() - 1; i++) {
+        PrintVecInputArgument(&op->args[i]);
+        str_ += ", ";
+      }
+      PrintVecInputArgument(&op->args.back());
+    }
+    str_ += ")";
+  } else if (bits == 128) {
+    str_ += "cinn_avx128_";
+    str_ += op->name;
+    str_ += "(";
+    if (!op->args.empty()) {
+      for (int i = 0; i < op->args.size() - 1; i++) {
+        PrintVecInputArgument(&op->args[i]);
+        str_ += ", ";
+      }
+      PrintVecInputArgument(&op->args.back());
+    }
+    str_ += ")";
+  } else {
+    CodeGenC::Visit(op);
+  }
+}
+
+}  // namespace backends
+}  // namespace cinn
--- a/paddle/cinn/backends/codegen_c_x86.h
+++ b/paddle/cinn/backends/codegen_c_x86.h
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+
+#include "paddle/cinn/backends/codegen_c.h"
+#include "paddle/cinn/ir/intrinsic_ops.h"
+
+namespace cinn {
+namespace backends {
+
+/**
+ * C code generation with X86 instruction or math library support.
+ */
+class CodeGenCX86 : public CodeGenC {
+ public:
+  //! The X86 CPU supports some following features. We use SSE or AVX to
+  //! accelerate the basic operations if forloop is vectorized.
+  enum class Feature : int {
+    None = 0,
+    SSE = 1,          //! support SSE instruction set.
+    AVX256 = 1 << 1,  // ! support AVX256 instruction set.
+    AVX512 = 1 << 2,  // ! support AVX512 instruction set.
+    BLAS = 1 << 3,    // ! support BLAS library.
+  };
+
+  Feature feature{Feature::None};
+
+  /**
+   * constructor.
+   * @param target The device.
+   * @param features Features it supported.
+   */
+  CodeGenCX86(Target target, Feature feature)
+      : CodeGenC(target), feature(feature) {}
+
+ protected:
+  void Visit(const ir::Add *op) override;
+  void Visit(const ir::Sub *op) override;
+  void Visit(const ir::Mul *op) override;
+  void Visit(const ir::Div *op) override;
+  void Visit(const ir::Mod *op) override { CodeGenC::Visit(op); }
+  void Visit(const ir::EQ *op) override { CodeGenC::Visit(op); }
+  void Visit(const ir::NE *op) override { CodeGenC::Visit(op); }
+  void Visit(const ir::LT *op) override { CodeGenC::Visit(op); }
+  void Visit(const ir::LE *op) override { CodeGenC::Visit(op); }
+  void Visit(const ir::GT *op) override { CodeGenC::Visit(op); }
+  void Visit(const ir::GE *op) override { CodeGenC::Visit(op); }
+  void Visit(const ir::And *op) override { CodeGenC::Visit(op); }
+  void Visit(const ir::Or *op) override { CodeGenC::Visit(op); }
+  void Visit(const ir::Load *op) override;
+  void Visit(const ir::Store *op) override;
+  void Visit(const ir::Broadcast *op) override;
+  void Visit(const ir::intrinsics::BuiltinIntrin *op);
+
+  //! Check the features.
+  // @{
+  bool SupportsSSE() {
+    return static_cast<int>(feature) & static_cast<int>(Feature::SSE);
+  }
+  bool SupportsAVX256() {
+    return static_cast<int>(feature) & static_cast<int>(Feature::AVX256);
+  }
+  bool SupportsAVX512() {
+    return static_cast<int>(feature) & static_cast<int>(Feature::AVX512);
+  }
+  bool SupportsBLAS() {
+    return static_cast<int>(feature) & static_cast<int>(Feature::BLAS);
+  }
+  // @}
+
+  //! Print (and prepare) a argument in vectorize type, for example:
+  // 3. -> set1(3.)
+  // a[i:j] -> load_ps(a+i)
+  void PrintVecInputArgument(const Expr *op);
+  //! The output argument, such as the destination for Load.
+  void PrintVecOutputArgument(const Expr *op);
+
+  template <typename Op>
+  void PrintAbsAddr(const Op *op) {
+    str_ += op->tensor.template As<ir::_Tensor_>()->name;
+    str_ += " + ";
+
+    auto index = op->index();
+    auto *ramp_n = index.template As<ir::Ramp>();
+    if (ramp_n) {
+      CHECK(!ramp_n->base.template As<ir::Ramp>())
+          << "base of a Ramp node should not be Ramp type";
+      IrPrinter::Visit(ramp_n->base);
+    } else {
+      IrPrinter::Visit(op->index());
+    }
+  }
+
+  template <typename Op>
+  void VisitBinaryOp(const Op *op, Expr a, Expr b, const std::string &op_repr);
+};
+
+template <typename Op>
+void CodeGenCX86::VisitBinaryOp(const Op *op,
+                                Expr a,
+                                Expr b,
+                                const std::string &op_repr) {
+  CHECK_EQ(a.type(), b.type()) << " a is : " << a << ", and b is : " << b
+                               << ". op_repr is : " << op_repr;
+
+  // scalar.
+  if (a.type().lanes() == 1) {
+    CodeGenC::Visit(op);
+    return;
+  }
+
+  // TODO(Superjomn) Consider support BLAS.
+  int bits = a.type().bits() * a.type().lanes();
+  if (SupportsAVX512() && bits == 512) {
+    str_ += "cinn_avx512_";
+    str_ += op_repr;
+    str_ += "(";
+    PrintVecInputArgument(&a);
+    str_ += ", ";
+    PrintVecInputArgument(&b);
+    str_ += ")";
+  } else if (SupportsAVX256() && bits == 256) {
+    str_ += "cinn_avx256_";
+    str_ += op_repr;
+    str_ += "(";
+    PrintVecInputArgument(&a);
+    str_ += ", ";
+    PrintVecInputArgument(&b);
+    str_ += ")";
+  } else {
+    CodeGenC::Visit(op);
+  }
+}
+
+}  // namespace backends
+}  // namespace cinn
--- a/paddle/cinn/backends/codegen_c_x86_test.cc
+++ b/paddle/cinn/backends/codegen_c_x86_test.cc
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/cinn/backends/codegen_c_x86.h"
+
+#include <gtest/gtest.h>
+
+#include "paddle/cinn/cinn.h"
+#include "paddle/cinn/ir/module.h"
+#include "paddle/cinn/lang/builtin.h"
+#include "paddle/cinn/lang/compute.h"
+#include "paddle/cinn/lang/lower.h"
+#include "paddle/cinn/lang/placeholder.h"
+#include "paddle/cinn/optim/ir_simplify.h"
+#include "paddle/cinn/optim/transform_polyfor_to_for.h"
+#include "paddle/cinn/optim/vectorize_loops.h"
+
+namespace cinn {
+namespace backends {
+
+TEST(CodeGenCX86, basic) {
+  // create two forloops, check only one forloop is marked Vectorize.
+  Context::info_rgt().Clear();
+
+  using namespace ir;  // NOLINT
+
+  const int M = 100;
+  const int K = 200;
+  const int N = 500;
+  const int bn = 32;
+
+  Target target;
+  target.arch = Target::Arch ::X86;
+  target.bits = Target::Bit ::k32;
+  target.os = Target::OS ::Linux;
+
+  Placeholder<float> A("A", {M, N});
+  Placeholder<float> B("B", {M, N});
+
+  // C = A * B
+  Tensor C = Compute(
+      {Expr(M), Expr(N)}, [&](Var i, Var j) { return A(i, j) * B(i, j); }, "C");
+
+  Tensor D = Compute(
+      {Expr(M), Expr(N)}, [&](Var i, Var j) { return A(i, j) * B(i, j); }, "D");
+
+  auto stages = CreateStages({C, D});
+  // vectorize C, not D
+  stages[C]->Vectorize(1, 16);
+  stages[C]->Unroll(1);
+
+  auto func = Lower("matmul", stages, {A, B, C, D});
+
+  std::cout << "before optim\n" << func->body << std::endl;
+
+  ir::Module::Builder builder("module1", target);
+  builder.AddFunction(func);
+
+  CodeGenCX86 codegen(target, CodeGenCX86::Feature::AVX512);
+  codegen.SetInlineBuiltinCodes(false);
+  auto out = codegen.Compile(builder.Build(), CodeGenC::OutputKind::CImpl);
+  std::cout << "out:\n" << out;
+}
+
+}  // namespace backends
+}  // namespace cinn
--- a/paddle/cinn/backends/codegen_cuda_dev.cc
+++ b/paddle/cinn/backends/codegen_cuda_dev.cc
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/cinn/backends/codegen_cuda_dev.h"
+
+#include <glog/logging.h>
+#include <paddle/cinn/utils/string.h>
+
+#include <fstream>
+#include <set>
+#include <unordered_set>
+
+#include "paddle/cinn/ir/op/ir_operators.h"
+#include "paddle/cinn/ir/utils/ir_verify.h"
+#include "paddle/cinn/optim/ir_simplify.h"
+#include "paddle/cinn/optim/remove_nested_block.h"
+
+namespace cinn {
+namespace backends {
+
+const std::string CodeGenCUDA_Dev::source_header_ =  // NOLINT
+    R"(#include <cstdint>
+
+#define CINN_WITH_CUDA
+#include "bfloat16.h"
+#include "float16.h"
+using cinn::common::bfloat16;
+using cinn::common::float16;
+using cinn::common::half4;
+using cinn::common::half8;
+using cinn::common::float8;
+
+#include "cinn_cuda_runtime_source.cuh"
+)";
+
+const std::string &CodeGenCUDA_Dev::GetSourceHeader() { return source_header_; }
+
+CodeGenCUDA_Dev::CodeGenCUDA_Dev(Target target) : CodeGenC(target) {}
+
+std::string CodeGenCUDA_Dev::Compile(const ir::Module &module, bool for_nvrtc) {
+  for_nvrtc_ = for_nvrtc;
+  auto source = Compile(module, OutputKind::CImpl);
+
+  return source;
+}
+
+void CodeGenCUDA_Dev::Compile(const ir::Module &module,
+                              const Outputs &outputs) {
+  ir::IrVerify(Expr(module));
+
+  CodeGenC::inline_builtin_codes_ = false;
+  if (!outputs.c_header_name.empty()) {
+    auto source = Compile(module, OutputKind::CHeader);
+    str_ = "";
+    std::ofstream file(outputs.c_header_name);
+    CHECK(file.is_open()) << "failed to open file " << outputs.c_header_name;
+    file << source;
+    file.close();
+    LOG(WARNING) << "Output C header to file " << outputs.c_header_name;
+  }
+
+  if (!outputs.cuda_source_name.empty()) {
+    auto source = Compile(module, OutputKind::CImpl);
+    str_ = "";
+    std::ofstream file(outputs.cuda_source_name);
+    CHECK(file.is_open()) << "failed to open file " << outputs.cuda_source_name;
+    file << source;
+    file.close();
+    LOG(WARNING) << "Output C source to file " << outputs.cuda_source_name;
+  }
+}
+
+void CodeGenCUDA_Dev::Compile(const ir::LoweredFunc &func) {
+  IrPrinter::Visit(Expr(func));
+}
+
+std::vector<Expr> CodeGenCUDA_Dev::GenerateBufferAliasExprs(
+    const ir::_LoweredFunc_ *op, const std::vector<ir::Buffer> &temp_buffers) {
+  std::set<ir::Buffer> temp_buffer_set(temp_buffers.begin(),
+                                       temp_buffers.end());
+  // prepare temp buffer alias
+  std::vector<Expr> buffer_alias;
+  auto tensors = ir::CollectIRNodes(op->body, [&](const Expr *x) {
+    return x->as_tensor() && x->as_tensor()->buffer.defined() &&
+           temp_buffer_set.count(x->as_tensor()->buffer);
+  });
+
+  // unique tensors
+  std::set<ir::Tensor> unique_tensors;
+  for (auto &e : tensors) {
+    unique_tensors.insert(e.as_tensor_ref());
+  }
+
+  for (auto &t : unique_tensors) {
+    auto data_type = t->type();
+    auto data_ptr_type = data_type;
+    data_ptr_type.set_cpp_handle();
+
+    Var t_var(t->name, data_ptr_type);
+    Var buf_var(t->buffer->name, data_ptr_type);
+    buffer_alias.push_back(ir::Let::Make(t_var, buf_var));
+  }
+
+  return buffer_alias;
+}
+
+void CodeGenCUDA_Dev::Visit(const ir::_LoweredFunc_ *op) {
+  // clear names valid within scope when enter a new function
+  vectorized_tensor_names_.clear();
+  str_ += "__global__\n";
+
+  PrintFunctionDeclaration(op);
+  str_ += "\n";
+
+  DoIndent();
+
+  std::vector<Expr> new_body;
+
+  auto alloca_temp_buffers = op->PrepareAllocTempBufferExprs();
+  auto temp_buffer_alias = GenerateBufferAliasExprs(op, op->temp_bufs);
+  auto alis_var_exprs = op->CudaAliasVarExprs();
+
+#define APPEND_TO_NEW_BODY(field__) \
+  new_body.insert(std::end(new_body), std::begin(field__), std::end(field__));
+  APPEND_TO_NEW_BODY(alloca_temp_buffers)
+  APPEND_TO_NEW_BODY(temp_buffer_alias)
+  APPEND_TO_NEW_BODY(alis_var_exprs)
+
+  new_body.push_back(op->body);
+
+  Expr func_body = ir::Block::Make(new_body);
+
+  optim::RemoveNestedBlock(&func_body);
+  // Make sure that the function's body is wrapped by a block
+  if (!func_body.As<ir::Block>()) {
+    func_body = ir::Block::Make({func_body});
+  }
+  IrPrinter::Visit(func_body);
+}
+
+void CodeGenCUDA_Dev::Visit(const ir::_Var_ *op) {
+  if (utils::Startswith(op->name, "threadIdx") ||
+      utils::Startswith(op->name, "blockIdx")) {
+    str_ += "(int)";
+    str_ += op->name;
+  } else {
+    str_ += op->name;
+  }
+}
+
+void CodeGenCUDA_Dev::Visit(const ir::Alloc *op) {
+  CHECK(op->destination.as_buffer());
+  PrintTempBufferCreation(op->destination.as_buffer_ref());
+}
+
+void CodeGenCUDA_Dev::Visit(const ir::Min *op) {
+  str_ += "min(";
+  IrPrinter::Visit(op->a());
+  str_ += ", ";
+  IrPrinter::Visit(op->b());
+  str_ += ")";
+}
+
+void CodeGenCUDA_Dev::Visit(const ir::Max *op) {
+  str_ += "max(";
+  IrPrinter::Visit(op->a());
+  str_ += ", ";
+  IrPrinter::Visit(op->b());
+  str_ += ")";
+}
+
+void CodeGenCUDA_Dev::PrintFunctionDeclaration(const ir::_LoweredFunc_ *op) {
+  str_ += "void ";
+  if (op->cuda_axis_info.valid()) {
+    int thread_num = 1;
+    for (int i = 0; i < 3; i++) {
+      thread_num *= op->cuda_axis_info.block_dim(i);
+    }
+    str_ += "__launch_bounds__(";
+    str_ += std::to_string(thread_num);
+    str_ += ") ";
+  }
+
+  str_ += op->name;
+  str_ += "(";
+  for (int i = 0; i < op->args.size() - 1; i++) {
+    auto &arg = op->args[i];
+    PrintFuncArg(arg);
+    str_ += ", ";
+  }
+  if (!op->args.empty()) {
+    PrintFuncArg(op->args.back());
+  }
+  str_ += ")";
+}
+
+void CodeGenCUDA_Dev::PrintFuncArg(const ir::Argument &arg) {
+  if (arg.is_buffer()) {
+    // In CUDA kernel, only primitive type is supported, so we replace the
+    // buffer with T*j
+    if (arg.is_input()) str_ += "const ";
+    str_ += GetTypeRepr(arg.buffer_arg()->dtype);
+    str_ += "* ";
+    str_ += kCKeywordRestrict;
+    str_ += " ";
+    str_ += ir::BufferGetTensorName(arg.buffer_arg().As<ir::_Buffer_>());
+  } else if (arg.is_var()) {
+    if (arg.var_arg()->type().is_cpp_handle()) {
+      str_ += kCKeywordRestrict;
+    }
+    str_ += GetTypeRepr(arg.type());
+    str_ += " ";
+    str_ += arg.name();
+  } else {
+    CINN_NOT_IMPLEMENTED
+  }
+}
+
+void CodeGenCUDA_Dev::PrintBuiltinCodes() {}
+
+std::string CodeGenCUDA_Dev::Compile(const ir::Module &module,
+                                     CodeGenC::OutputKind output_kind) {
+  if (output_kind == OutputKind::CHeader) {
+    GenerateHeaderFile(module);
+  } else if (output_kind == OutputKind::CImpl) {
+    PrintIncludes();
+
+    if (for_nvrtc_) {
+      str_ += "\nextern \"C\" {\n\n";
+    }
+
+    PrintBuiltinCodes();
+
+    for (auto &func : module.functions()) {
+      Compile(func);
+    }
+  } else {
+    LOG(FATAL) << "Not supported OutputKind";
+  }
+
+  if (for_nvrtc_) {
+    str_ += "\n\n}";
+  }
+  return str_;
+}
+
+void CodeGenCUDA_Dev::PrintIncludes() { str_ += GetSourceHeader(); }
+
+void CodeGenCUDA_Dev::PrintTempBufferCreation(const ir::Buffer &buffer) {
+  CHECK_NE(buffer->type(), Void());
+  auto print_gpu_memory = [&](const std::string &mark) {
+    str_ += mark;
+    str_ += GetTypeRepr(buffer->dtype);
+    str_ += " ";
+    str_ += buffer->name;
+    str_ += " ";
+
+    str_ += "[ ";
+    Expr buffer_size(1);
+    for (int i = 0; i < buffer->shape.size(); i++) {
+      buffer_size = buffer_size * buffer->shape[i];
+    }
+    optim::Simplify(&buffer_size);
+    IrPrinter::Visit(buffer_size);
+    str_ += " ]";
+  };
+  switch (buffer->memory_type) {
+    case ir::MemoryType::GPUShared:
+      print_gpu_memory("__shared__ ");
+      break;
+
+    case ir::MemoryType::GPULocal:
+      print_gpu_memory("");
+      break;
+
+    default:
+      LOG(FATAL) << "CUDA device codegen not support memory " << buffer->name
+                 << ", type " << buffer->memory_type;
+  }
+}
+
+void CodeGenCUDA_Dev::Visit(const ir::Call *op) {
+  str_ += op->name;
+  str_ += "(";
+
+  if (!op->read_args.empty()) {
+    for (int i = 0; i < op->read_args.size() - 1; i++) {
+      auto &arg = op->read_args[i];
+      if (arg.as_tensor()) {
+        str_ += arg.as_tensor()->name;
+        str_ += ", ";
+      } else {
+        IrPrinter::Visit(arg);
+        str_ += ", ";
+      }
+    }
+    if (op->read_args.back().as_tensor()) {
+      str_ += op->read_args.back().as_tensor()->name;
+    } else {
+      IrPrinter::Visit(op->read_args.back());
+    }
+  }
+
+  if (!op->write_args.empty()) {
+    str_ += ", ";
+    for (int i = 0; i < op->write_args.size() - 1; i++) {
+      auto &arg = op->write_args[i];
+      if (arg.as_tensor()) {
+        str_ += arg.as_tensor()->name;
+        str_ += ", ";
+      } else {
+        IrPrinter::Visit(arg);
+        str_ += ", ";
+      }
+    }
+    if (op->write_args.back().as_tensor()) {
+      str_ += op->write_args.back().as_tensor()->name;
+    } else {
+      IrPrinter::Visit(op->write_args.back());
+    }
+  }
+
+  str_ += ")";
+}
+
+void CodeGenCUDA_Dev::Visit(const ir::Let *op) {
+  CHECK(op->type().valid());
+
+  // identify vectorized tensors by checking their dtypes are customized_type
+  // with customized_type::kcuda_builtin_vector_t prefix, and save their names
+  if (op->type().is_customized() &&
+      utils::Startswith(op->type().customized_type(),
+                        common::customized_type::kcuda_builtin_vector_t)) {
+    str_ += GetTypeRepr(op->type());
+    if (op->type().is_cpp_handle()) {
+      str_ += " ";
+      str_ += kCKeywordRestrict;
+    }
+    str_ += " ";
+    IrPrinter::Visit(op->symbol);
+    vectorized_tensor_names_.insert(utils::GetStreamCnt(op->symbol));
+    // skip "=0" in "half8 temp = 0;" sincethe operator= of half8 may not
+    // overloaded.
+    if (op->body.As<ir::IntImm>() && op->body.As<ir::IntImm>()->value == 0) {
+      return;
+    }
+    str_ += " = ";
+    IrPrinter::Visit(op->body);
+  } else {
+    CodeGenC::Visit(op);
+  }
+}
+
+bool CodeGenCUDA_Dev::PrintBuiltinVectorAccess(const ir::LoadStoreAddrMnger *op,
+                                               ir::Expr index_expr,
+                                               bool is_store) {
+  static constexpr char index2suffix[8] = {
+      'x', 'y', 'z', 'w', 'v', 'u', 't', 's'};
+
+  // addr of op should be a place of tensor and the index is simple int number
+  if (!op->is_addr_tensor() || !index_expr.As<ir::IntImm>()) {
+    return false;
+  }
+  auto *tensor = op->tensor.As<ir::_Tensor_>();
+  CHECK(tensor);
+
+  // identify vectorized tensors by their names
+  if (!vectorized_tensor_names_.count(tensor->name)) {
+    return false;
+  }
+
+  // the index can't exceed the range of cuda built-in vector type
+  int index = index_expr.As<ir::IntImm>()->value;
+  if (index < 0 || index >= 8) {
+    return false;
+  }
+  if (is_store && tensor->type().is_cpp_handle()) {
+    str_ += tensor->name;
+    str_ += "[";
+    str_ += std::to_string(index);
+    str_ += "]";
+  } else {
+    str_ += tensor->name;
+    str_ += (tensor->type().is_cpp_handle() ? "->" : ".");
+    str_ += index2suffix[index];
+  }
+  return true;
+}
+
+void CodeGenCUDA_Dev::Visit(const ir::Load *op) {
+  // overload this visit function to especially deal with the case when it
+  // accesses element at a cuda built-in vector, others still resolve to
+  // CodeGenC
+  if (!PrintBuiltinVectorAccess(op, op->index(), false)) {
+    CodeGenC::Visit(op);
+  }
+}
+
+void CodeGenCUDA_Dev::Visit(const ir::Store *op) {
+  // overload this visit function to especially deal with the case when it
+  // accesses element at a cuda built-in vector, others still resolve to
+  // CodeGenC
+  if (PrintBuiltinVectorAccess(op, op->index(), true)) {
+    str_ += " = ";
+    IrPrinter::Visit(op->value);
+  } else {
+    CodeGenC::Visit(op);
+  }
+}
+
+}  // namespace backends
+}  // namespace cinn
--- a/paddle/cinn/backends/codegen_cuda_dev.h
+++ b/paddle/cinn/backends/codegen_cuda_dev.h
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "paddle/cinn/backends/codegen_c.h"
+#include "paddle/cinn/common/common.h"
+#include "paddle/cinn/ir/ir.h"
+#include "paddle/cinn/ir/lowered_func.h"
+#include "paddle/cinn/ir/module.h"
+#include "paddle/cinn/ir/utils/ir_printer.h"
+#include "paddle/cinn/lang/packed_func.h"
+#include "paddle/cinn/runtime/cinn_runtime.h"
+
+namespace cinn::ir {
+class Module;
+}  // namespace cinn::ir
+
+namespace cinn {
+namespace backends {
+
+/**
+ * CUDA device code generator.
+ *
+ * It generates the device function, e.g, the function called "myadd" will have
+ * a __global__ functon called "myadd_kernel", different from codegen_c, the
+ * declaration of the "myadd_kernel" function has an expanded argument list,
+ * which finally similar to `__global__ void myadd(float* __restrict__ A, float*
+ * __restrict__ B, int n);`
+ */
+class CodeGenCUDA_Dev : public CodeGenC {
+ public:
+  explicit CodeGenCUDA_Dev(Target target);
+
+  /**
+   * Compile the \p module to \p outputs.
+   */
+  void Compile(const ir::Module& module, const Outputs& outputs);
+
+  //! Compile on NVRTC.
+  std::string Compile(const ir::Module& module, bool for_nvrtc = true);
+
+  void Compile(const ir::LoweredFunc& func);
+
+  /**
+   * \brief Print a function argument in CUDA syntax. Currently, just some
+   * decoration of __restrict__.
+   * @param arg the argument.
+   * @return the representation in CUDA syntax.
+   *
+   * We make it a static to make the test easier.
+   */
+  void PrintFuncArg(const ir::Argument& arg);
+
+  std::string Compile(const ir::Module& module, OutputKind output_kind);
+
+  static const std::string& GetSourceHeader();
+
+ protected:
+  void Visit(const ir::_Var_* op) override;
+  void Visit(const ir::_LoweredFunc_* op) override;
+  void Visit(const ir::Min* op) override;
+  void Visit(const ir::Max* op) override;
+  void Visit(const ir::Alloc* op) override;
+  void Visit(const ir::Call* op) override;
+  void Visit(const ir::Load* op) override;
+  void Visit(const ir::Store* op) override;
+  void Visit(const ir::Let* op) override;
+
+  // Print element access at a cuda built-in vector on a load/store node
+  bool PrintBuiltinVectorAccess(const ir::LoadStoreAddrMnger* op,
+                                ir::Expr index,
+                                bool is_store);
+
+  void PrintBuiltinCodes();
+
+  void PrintIncludes() override;
+
+  void PrintTempBufferCreation(const ir::Buffer& buffer);
+
+  void PrintTempBufferAliasDefinition(const ir::Buffer& buffer);
+
+  std::vector<Expr> GenerateBufferAliasExprs(
+      const ir::_LoweredFunc_* op, const std::vector<ir::Buffer>& temp_buffers);
+
+  /**
+   * Print the function declaration, this is different from C, we expand the
+   * arguments and get something like
+   * `__global__ void myadd(float* __restrict__ A, float* __restrict__ B, int
+   * n);`
+   */
+  void PrintFunctionDeclaration(const ir::_LoweredFunc_* op);
+
+ private:
+  Target target_;
+  bool for_nvrtc_{false};
+  // names of vectorized tensors from `Let` statments where dtypes of the
+  // tensors are customized_type with customized_type::kcuda_builtin_vector_t
+  // prefix
+  std::unordered_set<std::string> vectorized_tensor_names_;
+  static const std::string source_header_;
+};
+
+}  // namespace backends
+}  // namespace cinn
--- a/paddle/cinn/backends/codegen_cuda_generate_test.cc
+++ b/paddle/cinn/backends/codegen_cuda_generate_test.cc
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gtest/gtest.h>
+#include <stdlib.h>
+
+#include <fstream>
+#include <tuple>
+#include <vector>
+
+#include "paddle/cinn/backends/codegen_cuda_dev.h"
+#include "paddle/cinn/backends/codegen_cuda_host.h"
+#include "paddle/cinn/backends/codegen_cuda_util.h"
+#include "paddle/cinn/backends/extern_func_jit_register.h"
+#include "paddle/cinn/backends/llvm/execution_engine.h"
+#include "paddle/cinn/backends/llvm/simple_jit.h"
+#include "paddle/cinn/cinn.h"
+#include "paddle/cinn/common/ir_util.h"
+#include "paddle/cinn/common/test_helper.h"
+#include "paddle/cinn/hlir/pe/nn.h"
+#include "paddle/cinn/hlir/pe/schedule.h"
+#include "paddle/cinn/ir/schedule/ir_schedule.h"
+#include "paddle/cinn/ir/utils/ir_printer.h"
+#include "paddle/cinn/lang/lower.h"
+#include "paddle/cinn/optim/ir_simplify.h"
+#include "paddle/cinn/utils/timer.h"
+
+namespace cinn {
+namespace backends {
+
+TEST(CUDAFile, Module_output) {
+  std::string cuda_source_name = "_generated1.cu";
+  std::string cuda_source_code = R"ROC(
+extern "C" {
+
+__global__
+void __launch_bounds__(200) elementwise_mul(const float* __restrict__ A, const float* __restrict__ B, float* __restrict__ C)
+{
+  if (((int)blockIdx.x < 100)) {
+    if (((int)threadIdx.x < 200)) {
+      C[((200 * (int)blockIdx.x) + (int)threadIdx.x)] = (A[((200 * (int)blockIdx.x) + (int)threadIdx.x)] * B[((200 * (int)blockIdx.x) + (int)threadIdx.x)]);
+    };
+  };
+}
+
+}
+  )ROC";
+  std::ofstream file(cuda_source_name);
+  CHECK(file.is_open()) << "failed to open file " << cuda_source_name;
+  file << CodeGenCUDA_Dev::GetSourceHeader();
+  file << cuda_source_code;
+  file.close();
+  LOG(WARNING) << "Output C source to file " << cuda_source_name;
+}
+
+}  // namespace backends
+}  // namespace cinn
--- a/paddle/cinn/backends/codegen_cuda_host.cc
+++ b/paddle/cinn/backends/codegen_cuda_host.cc
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/cinn/backends/codegen_cuda_host.h"
+
+#include <algorithm>
+#include <string>
+#include <unordered_map>
+
+#include "paddle/cinn/backends/codegen_cuda_util.h"
+#include "paddle/cinn/backends/extern_func_emitter_builtin.h"
+#include "paddle/cinn/backends/extern_func_jit_register.h"
+#include "paddle/cinn/backends/llvm/llvm_util.h"
+#include "paddle/cinn/runtime/intrinsic.h"
+
+namespace cinn {
+namespace backends {
+
+using cinn::common::bfloat16;
+using cinn::common::float16;
+
+const int kArgsArrayMaxLen = 20;
+
+llvm::Value* CodeGenCUDA_Host::LowerGPUKernelLauncher(
+    const ir::_LoweredFunc_* func) {
+  auto body = func->body;
+  auto* call_ir = body.As<ir::Call>();
+  CHECK(call_ir);
+
+  // Create the function
+  // @{
+  auto* function_type = GenFunctionTypeFromCinnFunction(func, true);
+  llvm::Function* function = llvm::Function::Create(
+      function_type, llvm::Function::ExternalLinkage, func->name, m_);
+  function->setCallingConv(llvm::CallingConv::C);
+  function->setHasUWTable();
+
+  std::vector<llvm::Value*> ll_function_args;
+  std::transform(function->arg_begin(),
+                 function->arg_end(),
+                 std::back_inserter(ll_function_args),
+                 [](auto& arg) { return std::addressof(arg); });
+  // @}
+
+  llvm::BasicBlock* entry = llvm::BasicBlock::Create(
+      /*Context=*/b_->getContext(),
+      /*Name=*/"entry",
+      /*Parent=*/function,
+      /*InsertBefore=*/nullptr);
+  b_->SetInsertPoint(entry);
+
+  auto* kernel_args = ll_function_args[0];
+  auto* kernel_args_count = ll_function_args[1];
+  llvm::Value* kernel_stream = nullptr;
+  if (ll_function_args.size() == 3) {
+    kernel_stream = ll_function_args[2];
+    CHECK_EQ(kernel_stream->getType(), ll_void_p_ty());  // void* stream
+  }
+  CHECK_EQ(kernel_args->getType(), ll_void_p_ty());       // void* args
+  CHECK_EQ(kernel_args_count->getType(), ll_int32_ty());  // int32
+
+  std::unordered_map<std::string, llvm::Value*> global_args = {
+      {KERNEL_ARGS, kernel_args},
+      {KERNEL_ARGS_NUM, kernel_args_count},
+      {KERNEL_STREAM, kernel_stream}};
+
+  auto ret_type = CinnTypeToLLVMType(Void(), m_);
+  std::vector<llvm::Type*> args_type;
+  for (auto r_arg : call_ir->read_args) {
+    if (r_arg.is_var()) {
+      if (r_arg.as_var()->type().is_cpp_handle() ||
+          r_arg.as_var()->type().is_string()) {
+        args_type.push_back(CinnTypeToLLVMType(type_of<void*>(), m_));
+      } else if (r_arg.as_var()->type().is_int(32)) {
+        args_type.push_back(CinnTypeToLLVMType(type_of<int32_t>(), m_));
+      } else {
+        CINN_NOT_IMPLEMENTED;
+      }
+    } else {
+      if (r_arg.type().is_bool()) {
+        args_type.push_back(CinnTypeToLLVMType(type_of<bool>(), m_));
+      } else if (r_arg.type().is_uint(8)) {
+        args_type.push_back(CinnTypeToLLVMType(type_of<uint8_t>(), m_));
+      } else if (r_arg.type().is_uint(16)) {
+        args_type.push_back(CinnTypeToLLVMType(type_of<uint16_t>(), m_));
+      } else if (r_arg.type().is_uint(32)) {
+        args_type.push_back(CinnTypeToLLVMType(type_of<uint32_t>(), m_));
+      } else if (r_arg.type().is_uint(64)) {
+        args_type.push_back(CinnTypeToLLVMType(type_of<uint64_t>(), m_));
+      } else if (r_arg.type().is_int(8)) {
+        args_type.push_back(CinnTypeToLLVMType(type_of<int8_t>(), m_));
+      } else if (r_arg.type().is_int(16)) {
+        args_type.push_back(CinnTypeToLLVMType(type_of<int16_t>(), m_));
+      } else if (r_arg.type().is_int(32)) {
+        args_type.push_back(CinnTypeToLLVMType(type_of<int32_t>(), m_));
+      } else if (r_arg.type().is_int(64)) {
+        args_type.push_back(CinnTypeToLLVMType(type_of<int64_t>(), m_));
+      } else if (r_arg.type().is_float(32)) {
+        args_type.push_back(CinnTypeToLLVMType(type_of<float>(), m_));
+      } else if (r_arg.type().is_float(64)) {
+        args_type.push_back(CinnTypeToLLVMType(type_of<double>(), m_));
+      } else if (r_arg.type().is_bfloat16()) {
+        args_type.push_back(CinnTypeToLLVMType(type_of<bfloat16>(), m_));
+      } else if (r_arg.type().is_float16()) {
+        args_type.push_back(CinnTypeToLLVMType(type_of<float16>(), m_));
+      } else {
+        CINN_NOT_IMPLEMENTED;
+      }
+    }
+  }
+  auto func_type = llvm::FunctionType::get(ret_type, args_type, false);
+  auto call_func = m_->getOrInsertFunction(call_ir->name, func_type);
+
+  std::vector<llvm::Value*> call_args;
+  for (auto& r_arg : call_ir->read_args) {
+    if (r_arg.is_var()) {
+      if (r_arg.as_var()->type().is_string()) {
+        auto kvalue = m_->getOrInsertGlobal(r_arg.as_var()->name + "_ptr_",
+                                            b_->getInt8PtrTy());
+        call_args.push_back(b_->CreateLoad(
+            b_->getInt8PtrTy(), kvalue, r_arg.as_var()->name + "_ptr_load"));
+      } else if (r_arg.as_var()->type().is_cpp_handle() ||
+                 r_arg.as_var()->type().is_int(32)) {
+        CHECK(global_args.count(r_arg.as_var()->name));
+        call_args.push_back(global_args[r_arg.as_var()->name]);
+      } else {
+        CINN_NOT_IMPLEMENTED;
+      }
+    } else {
+      if (r_arg.type().is_bool()) {
+        call_args.push_back(b_->getInt1(r_arg.as_bool()));
+      } else if (r_arg.type().is_int(8)) {
+        call_args.push_back(b_->getInt8(r_arg.as_int8()));
+      } else if (r_arg.type().is_int(16)) {
+        call_args.push_back(b_->getInt16(r_arg.as_int16()));
+      } else if (r_arg.type().is_int(32)) {
+        call_args.push_back(b_->getInt32(r_arg.as_int32()));
+      } else if (r_arg.type().is_int(64)) {
+        call_args.push_back(b_->getInt64(r_arg.as_int64()));
+      } else if (r_arg.type().is_uint(8)) {
+        call_args.push_back(b_->getInt8(r_arg.as_uint8()));
+      } else if (r_arg.type().is_uint(16)) {
+        call_args.push_back(b_->getInt16(r_arg.as_uint16()));
+      } else if (r_arg.type().is_uint(32)) {
+        call_args.push_back(b_->getInt32(r_arg.as_uint32()));
+      } else if (r_arg.type().is_uint(64)) {
+        call_args.push_back(b_->getInt64(r_arg.as_uint64()));
+      } else if (r_arg.type().is_float(32)) {
+        call_args.push_back(llvm::ConstantFP::get(
+            b_->getFloatTy(), llvm::APFloat(r_arg.as_float())));
+      } else if (r_arg.type().is_float(64)) {
+        call_args.push_back(llvm::ConstantFP::get(
+            b_->getDoubleTy(), llvm::APFloat(r_arg.as_double())));
+      } else if (r_arg.type().is_bfloat16()) {
+        call_args.push_back(llvm::ConstantFP::get(
+            b_->getBFloatTy(),
+            llvm::APFloat(static_cast<float>(r_arg.as_bfloat16()))));
+      } else if (r_arg.type().is_float16()) {
+        call_args.push_back(llvm::ConstantFP::get(
+            b_->getHalfTy(),
+            llvm::APFloat(static_cast<float>(r_arg.as_float16()))));
+      } else {
+        CINN_NOT_IMPLEMENTED;
+      }
+    }
+  }
+
+  b_->CreateCall(call_func, call_args);
+  RetVoid();
+
+  return function;
+}
+
+}  // namespace backends
+}  // namespace cinn
--- a/paddle/cinn/backends/codegen_cuda_host.h
+++ b/paddle/cinn/backends/codegen_cuda_host.h
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <absl/container/flat_hash_map.h>
+
+#include <memory>
+#include <string>
+#include <tuple>
+#include <vector>
+
+#include "paddle/cinn/backends/llvm/codegen_llvm.h"
+
+namespace cinn {
+namespace backends {
+
+/**
+ * CodeGenCUDA takes a CINN Module with host functions and output a LLVM module.
+ */
+class CodeGenCUDA_Host : public CodeGenLLVM {
+ public:
+  explicit CodeGenCUDA_Host(llvm::Module *m,
+                            llvm::IRBuilder<> *b,
+                            const std::shared_ptr<SymbolTable> &vars = nullptr)
+      : CodeGenLLVM(m, b, vars) {}
+
+  using CodeGenLLVM::Visit;
+  llvm::Value *Visit(const ir::_LoweredFunc_ *func) override {
+    return LowerGPUKernelLauncher(func);
+  }
+
+ private:
+  /**
+   * Lower a CUDA kernel launcher.
+   *
+   * We launch a CUDA kernel in the following way:
+   *
+   * 1. a GPU function (called fn) will compiled to PTX and lower by CUDA driver
+   * to a function pointer, which we store as a `void*` type global variable
+   * [fn_kernel_ptr] in LLVM module.
+   * 2. when lower the host launcher, we replace the Call of the original kernel
+   * [fn] to a Call of `cinn_call_cuda_kernel` method which is registered as an
+   * external function.
+   *
+   */
+  llvm::Value *LowerGPUKernelLauncher(const ir::_LoweredFunc_ *func);
+};
+
+}  // namespace backends
+}  // namespace cinn
--- a/paddle/cinn/backends/codegen_cuda_util.cc
+++ b/paddle/cinn/backends/codegen_cuda_util.cc
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/cinn/backends/codegen_cuda_util.h"
+
+#include "paddle/cinn/backends/cuda_util.h"
+#include "paddle/cinn/ir/utils/ir_mutator.h"
+
+namespace cinn {
+namespace backends {
+
+std::tuple<ir::Module, ir::Module> SplitCudaAndHostModule(ir::Module module) {
+  detail::CollectHostFunctionVisitor visitor(module->name);
+  Expr expr(module);
+  return visitor(&expr);
+}
+
+}  // namespace backends
+}  // namespace cinn
--- a/paddle/cinn/backends/codegen_cuda_util.h
+++ b/paddle/cinn/backends/codegen_cuda_util.h
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <absl/container/flat_hash_map.h>
+
+#include <string>
+#include <tuple>
+#include <vector>
+
+#include "paddle/cinn/cinn.h"
+#include "paddle/cinn/ir/ir.h"
+#include "paddle/cinn/ir/utils/ir_copy.h"
+#include "paddle/cinn/ir/utils/ir_mutator.h"
+
+namespace cinn {
+namespace backends {
+
+#define KERNEL_ARGS "kernel_args"
+#define KERNEL_ARGS_NUM "kernel_args_num"
+#define KERNEL_STREAM "kernel_stream"
+
+/**
+ * Split a CINN Module into two separate modules, one cantains the host
+ * functions, the other contains the device kernels.
+ *
+ * This contains some process:
+ *
+ * - replace the original kernel function with a Call node and add it to the
+ * first module, add a device kernel function to the second module.
+ */
+std::tuple<ir::Module, ir::Module> SplitCudaAndHostModule(ir::Module module);
+
+namespace detail {
+
+struct CollectHostFunctionVisitor : public ir::IRMutator<> {
+  explicit CollectHostFunctionVisitor(const std::string& module_name)
+      : host_module_builder(module_name + "_host", common::DefaultHostTarget()),
+        device_module_builder(module_name + "_gpu_device",
+                              common::DefaultNVGPUTarget()) {}
+
+  std::tuple<ir::Module, ir::Module> operator()(Expr* expr) {
+    ir::IRMutator<>::Visit(expr, expr);
+    return std::make_tuple(host_module_builder.Build(),
+                           device_module_builder.Build());
+  }
+
+ private:
+  void Visit(const ir::_LoweredFunc_* op, Expr* expr) override {
+    if (op->body.As<ir::Call>()) {
+      host_module_builder.AddFunctionWithoutOptim(expr->as_lowered_func_ref());
+    } else {
+      if (!op->cuda_axis_info.valid()) {
+        expr->as_lowered_func_ref()->cuda_axis_info.set_valid(true);
+      }
+      auto host_func = CreateHostFunctionGivenDeviceKernel(op);
+      host_module_builder.AddFunctionWithoutOptim(
+          host_func.as_lowered_func_ref());
+      device_module_builder.AddFunctionWithoutOptim(
+          CreateDeviceFunctionGivenDeviceKernel(*expr).as_lowered_func_ref());
+    }
+  }
+
+  /**
+   * Create a wrapper function for a kernel.
+   *
+   * For example, we get a kernel function:
+   *
+   * \code
+   * __global__
+   * void fn (float* a, float* out) { ... }
+   * \endcode
+   *
+   * A host wrapper function will generate for it
+   *
+   * \code
+   * void fn (cinn_buffer_t* a, cinn_buffer_t* out) {
+   *   Call(fn_kernel);
+   * }
+   * \endcode
+   */
+  Expr CreateHostFunctionGivenDeviceKernel(const ir::_LoweredFunc_* func) {
+    // std::vector<Expr> args;
+    // NOTE the suffix `__ptr` makes this argument lower to a pointer in LLVM
+    // backend. args.push_back(Var("args__ptr", type_of<cinn_pod_value_t*>()));
+    // args.push_back(Var("num_args", type_of<int32_t>()));
+    ir::Var kernel_ptr(GenDeviceKernelName(func->name), type_of<std::string>());
+    ir::Var kernel_args(KERNEL_ARGS, type_of<void*>());
+    ir::Var kernel_args_num(KERNEL_ARGS_NUM, type_of<int>());
+    ir::Var kernel_stream(KERNEL_STREAM, type_of<void*>());
+
+    auto call_extern_api =
+        ir::Call::Make(Void(),
+                       runtime::intrinsic::call_cuda_kernel,
+                       {kernel_ptr,
+                        kernel_args,
+                        kernel_args_num,
+                        Expr(func->cuda_axis_info.grid_dim(0)),   // grid_x
+                        Expr(func->cuda_axis_info.grid_dim(1)),   // grid_y
+                        Expr(func->cuda_axis_info.grid_dim(2)),   // grid_z
+                        Expr(func->cuda_axis_info.block_dim(0)),  // block_x
+                        Expr(func->cuda_axis_info.block_dim(1)),  // block_y
+                        Expr(func->cuda_axis_info.block_dim(2)),  // block_z
+                        kernel_stream},
+                       {},
+                       ir::CallType::Extern,
+                       ir::FunctionRef(),
+                       0);
+    std::vector<ir::Argument> arguments = {
+        ir::Argument(kernel_args, ir::Argument::IO::kOutput),
+        ir::Argument(kernel_args_num, ir::Argument::IO::kInput),
+        ir::Argument(kernel_stream, ir::Argument::IO::kOutput)};
+
+    return ir::_LoweredFunc_::Make(func->name, arguments, call_extern_api, {});
+  }
+
+  Expr CreateDeviceFunctionGivenDeviceKernel(Expr expr) {
+    auto copied = optim::IRCopy(expr);
+    auto* lowered_func = copied.as_lowered_func();
+    lowered_func->name = GenDeviceKernelName(lowered_func->name);
+    return copied;
+  }
+
+  inline std::string GenDeviceKernelName(const std::string& fn) {
+    return fn + "_kernel";
+  }
+
+ private:
+  ir::Module::Builder host_module_builder;
+  ir::Module::Builder device_module_builder;
+};
+
+}  // namespace detail
+
+}  // namespace backends
+}  // namespace cinn
--- a/paddle/cinn/backends/codegen_debug_test.cc
+++ b/paddle/cinn/backends/codegen_debug_test.cc
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+#include <iostream>
+#include <vector>
+
+#include "paddle/cinn/backends/codegen_cuda_dev.h"
+#include "paddle/cinn/backends/nvrtc/nvrtc_util.h"
+#include "paddle/cinn/common/context.h"
+#include "paddle/cinn/runtime/cuda/cuda_module.h"
+
+namespace cinn {
+namespace backends {
+
+/**
+ * This file is not a common test, it is used as a util for developers to
+ * write source CUDA code to debug whether it runs correctly during runtime
+ */
+using runtime::cuda::CUDAModule;
+
+/**
+ * Utility function to create cuda memory of non-empty shape.
+ *
+ * @param shape: a non-empty shape for the created cuda memory
+ * @param data: the data to initialize the cuda memory. Function doesn't
+ *     initailize if it is nullptr
+ * @return the CUdeviceptr pointing to the created memory
+ */
+template <typename T>
+CUdeviceptr CreateCudaMemory(const std::vector<int>& shape, const T* data) {
+  CHECK(!shape.empty()) << "Couldn't create CUDA memory for empty shape";
+  CUDA_CALL(cudaDeviceSynchronize());
+
+  int numel = 1;
+  for (int s : shape) {
+    numel = numel * s;
+  }
+
+  CUdeviceptr cuda_ptr = cuMemAlloc(&cuda_ptr, numel * sizeof(T));
+  if (data != nullptr) {
+    CUDA_CALL(cudaMemcpy(reinterpret_cast<void*>(cuda_ptr),
+                         data,
+                         numel * sizeof(T),
+                         cudaMemcpyHostToDevice));
+  }
+  return cuda_ptr;
+}
+
+TEST(CodeGenDebug, RunCudaSourceCode) {
+  common::Context::Global().ResetNameId();
+
+  std::string source_code = R"ROC(
+extern "C" {
+
+__global__
+void __launch_bounds__(512) fn_relu_1_kernel(const float* __restrict__ var_1, float* __restrict__ Relu_output)
+{
+  for (int32_t j_0 = 0; j_0 < 8; j_0 += 1) {
+    for (int32_t j_1 = 0; j_1 < 1; j_1 += 1) {
+      for (int32_t j_2 = 0; j_2 < 1; j_2 += 1) {
+        for (int32_t j_3 = 0; j_3 < 8; j_3 += 1) {
+          for (int32_t j_4 = 0; j_4 < 1; j_4 += 1) {
+            for (int32_t k_0 = 0; k_0 < 1; k_0 += 1) {
+              for (int32_t k_1 = 0; k_1 < 7; k_1 += 1) {
+                for (int32_t k_2 = 0; k_2 < 4; k_2 += 1) {
+                  for (int32_t k_3 = 0; k_3 < 4; k_3 += 1) {
+                    for (int32_t k_4 = 0; k_4 < 1; k_4 += 1) {
+                      for (int32_t a_0 = 0; a_0 < 16; a_0 += 1) {
+                        for (int32_t a_1 = 0; a_1 < 1; a_1 += 1) {
+                          for (int32_t a_2 = 0; a_2 < 1; a_2 += 1) {
+                            for (int32_t a_3 = 0; a_3 < 1; a_3 += 1) {
+                              for (int32_t a_4 = 0; a_4 < 7; a_4 += 1) {
+                                Relu_output[((7 * a_0) + ((7 * a_1) + ((7 * a_2) + ((7 * a_3) + ((100352 * j_0) + ((100352 * j_1) + ((100352 * j_2) + ((12544 * j_3) + ((12544 * j_4) + ((12544 * k_0) + ((1792 * k_1) + ((448 * k_2) + ((112 * k_3) + ((112 * k_4) + a_4))))))))))))))] = max(var_1[((7 * a_0) + ((7 * a_1) + ((7 * a_2) + ((7 * a_3) + ((100352 * j_0) + ((100352 * j_1) + ((100352 * j_2) + ((12544 * j_3) + ((12544 * j_4) + ((12544 * k_0) + ((1792 * k_1) + ((448 * k_2) + ((112 * k_3) + ((112 * k_4) + a_4))))))))))))))], 0.00000000f);
+                              };
+                            };
+                          };
+                        };
+                      };
+                    };
+                  };
+                };
+              };
+            };
+          };
+        };
+      };
+    };
+  };
+}
+
+}
+)ROC";
+
+  backends::nvrtc::Compiler compiler;
+
+  std::string ptx = compiler(CodeGenCUDA_Dev::GetSourceHeader() + source_code);
+  ASSERT_FALSE(ptx.empty());
+
+  CUDAModule cuda_module(ptx, CUDAModule::Kind::PTX);
+  CUdeviceptr var =
+      CreateCudaMemory<float>(/* shape */ {64 * 112 * 112}, /* data */ nullptr);
+  CUdeviceptr out =
+      CreateCudaMemory<float>(/* shape */ {64 * 112 * 112}, /* data */ nullptr);
+
+  void* args[] = {&var, &out};
+  dim3 grid(512, 1, 1);
+  dim3 block(512, 1, 1);
+  cuda_module.LaunchKernel(
+      /*device_id*/ 0, "fn_relu_1_kernel", grid, block, args);
+}
+
+}  // namespace backends
+}  // namespace cinn
--- a/paddle/cinn/backends/compiler.cc
+++ b/paddle/cinn/backends/compiler.cc
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/cinn/backends/compiler.h"
+
+#include <fstream>
+
+#include "paddle/cinn/backends/llvm/runtime_symbol_registry.h"
+#include "paddle/cinn/common/context.h"
+#include "paddle/cinn/hlir/framework/visualize_helper.h"
+#ifdef CINN_WITH_CUDA
+#include "paddle/cinn/backends/codegen_cuda_dev.h"
+#include "paddle/cinn/backends/codegen_cuda_host.h"
+#include "paddle/cinn/backends/codegen_cuda_util.h"
+#include "paddle/cinn/backends/nvrtc/nvrtc_util.h"
+#include "paddle/cinn/runtime/cuda/cuda_module.h"
+#include "paddle/cinn/runtime/cuda/cuda_util.h"
+#include "paddle/cinn/runtime/flags.h"
+#endif
+
+DECLARE_string(cinn_source_code_save_path);
+DECLARE_string(cinn_dump_group_lowered_func);
+DECLARE_string(cinn_dump_group_source_code);
+DECLARE_string(cinn_dump_group_ptx);
+DECLARE_string(cinn_dump_group_instruction);
+
+namespace cinn {
+namespace backends {
+using ir::Module;
+
+static constexpr int DebugLogMaxLen = 30000;
+
+void CompilationInfoDumper::DumpLoweredFunc() {
+  if (FLAGS_cinn_dump_group_lowered_func.empty()) {
+    return;
+  }
+  for (int idx = 0; idx < info_.lowered_funcs.size(); ++idx) {
+    std::stringstream content;
+    content << info_.lowered_funcs[idx].front();
+    Dump(FLAGS_cinn_dump_group_lowered_func,
+         idx,
+         "lowered_function.txt",
+         content.str());
+  }
+}
+
+void CompilationInfoDumper::DumpSourceCode() {
+  if (FLAGS_cinn_dump_group_source_code.empty()) {
+    return;
+  }
+  for (int idx = 0; idx < info_.source_codes.size(); ++idx) {
+    Dump(FLAGS_cinn_dump_group_source_code,
+         idx,
+         "source_code.cu",
+         info_.source_codes[idx]);
+  }
+}
+
+void CompilationInfoDumper::DumpPtxCode() {
+  if (FLAGS_cinn_dump_group_ptx.empty()) {
+    return;
+  }
+  for (int idx = 0; idx < info_.source_ptxs.size(); ++idx) {
+    Dump(FLAGS_cinn_dump_group_ptx,
+         idx,
+         "source_ptx.ptx",
+         info_.source_ptxs[idx]);
+  }
+}
+
+void CompilationInfoDumper::DumpInstruction() {
+  if (FLAGS_cinn_dump_group_instruction.empty()) {
+    return;
+  }
+  for (int idx = 0; idx < info_.instructions.size(); ++idx) {
+    Dump(FLAGS_cinn_dump_group_instruction,
+         idx,
+         "instruction.txt",
+         info_.instructions[idx]->DumpInstruction());
+  }
+}
+
+void CompilationInfoDumper::Dump(const std::string& base_path,
+                                 const int idx,
+                                 const std::string& file_name,
+                                 const std::string& content) {
+  auto dump_path =
+      utils::StringFormat("%s/fusion_group_%d", base_path.c_str(), idx);
+  if (!hlir::framework::MakeDirectory(
+          dump_path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH)) {
+    LOG(WARNING) << "Failed to make directory: \"" << dump_path
+                 << "\", the instruction for this group will not dump.";
+  } else {
+    auto dump_file =
+        utils::StringFormat("%s/%s", dump_path.c_str(), file_name.c_str());
+    VLOG(7) << "Dump instruction to: " << dump_file;
+    std::ofstream of(dump_file, std::ios_base::out);
+    if (of.is_open()) {
+      of << content;
+      of.close();
+    } else {
+      LOG(WARNING) << "Failed to open file: " << dump_file
+                   << ", please check your path.";
+    }
+  }
+}
+
+SourceCodePrint::SourceCodePrint() {
+  if (!FLAGS_cinn_source_code_save_path.empty()) {
+    LOG(INFO)
+        << "The CINN auto generated source code will writing into file: \""
+        << FLAGS_cinn_source_code_save_path << "\"";
+    of.open(FLAGS_cinn_source_code_save_path, std::ios_base::out);
+  }
+}
+
+SourceCodePrint::~SourceCodePrint() {
+  if (of.is_open()) {
+    of.close();
+  }
+}
+
+void SourceCodePrint::write(const std::string& source_code) {
+  std::lock_guard<std::mutex> guard(mtx_);
+  if (of.is_open()) {
+    of << source_code << std::endl;
+  } else if (!FLAGS_cinn_source_code_save_path.empty()) {
+    LOG(WARNING) << "Failed to open \"" << FLAGS_cinn_source_code_save_path
+                 << "\", source code will print.";
+    if (source_code.size() > DebugLogMaxLen) {
+      LOG(INFO) << "[CUDA] source code-0:\n"
+                << source_code.substr(0, DebugLogMaxLen);
+      for (int i = 1; i * DebugLogMaxLen < source_code.size(); ++i) {
+        LOG(INFO) << "[CUDA] source code-" << i << ":\n"
+                  << source_code.substr(DebugLogMaxLen * i, DebugLogMaxLen);
+      }
+    } else {
+      LOG(INFO) << "[CUDA] source code:\n" << source_code;
+    }
+  }
+}
+
+void Compiler::Build(const Module& module, const std::string& code) {
+  if (target_.arch == Target::Arch::NVGPU) {
+    CompileCudaModule(module, code);
+  } else if (target_.arch == Target::Arch::X86) {
+    CompileX86Module(module);
+  } else {
+    CINN_NOT_IMPLEMENTED
+  }
+}
+
+std::string Compiler::GetSourceCode(const ir::Module& module) {
+  if (target_.arch == Target::Arch::NVGPU) {
+#ifdef CINN_WITH_CUDA
+    auto _host_module_device_module_ =
+        SplitCudaAndHostModule(module);  // NOLINT
+    auto& host_module = std::get<0>(_host_module_device_module_);
+    auto& device_module = std::get<1>(_host_module_device_module_);
+    CodeGenCUDA_Dev codegen(target_);
+    auto source_code = codegen.Compile(device_module);
+    return source_code;
+#else
+    CINN_NOT_IMPLEMENTED
+#endif
+  } else {
+    CINN_NOT_IMPLEMENTED
+  }
+}
+
+void Compiler::BuildDefault(const Module& module) {
+  if (target_.arch == Target::Arch::NVGPU) {
+    CompileCudaModule(module);
+  } else if (target_.arch == Target::Arch::X86) {
+    CompileX86Module(module);
+  } else {
+    CINN_NOT_IMPLEMENTED
+  }
+}
+
+void Compiler::CompileCudaModule(const Module& module,
+                                 const std::string& code) {
+#ifdef CINN_WITH_CUDA
+  auto _host_module_device_module_ = SplitCudaAndHostModule(module);  // NOLINT
+  auto& host_module = std::get<0>(_host_module_device_module_);
+  auto& device_module = std::get<1>(_host_module_device_module_);
+  VLOG(3) << "[CUDA] host module:\n" << host_module;
+
+  VLOG(3) << "[CUDA] device module:\n" << device_module;
+  std::string source_code;
+  if (code.empty()) {
+    CodeGenCUDA_Dev codegen(target_);
+    source_code = codegen.Compile(device_module);
+  } else {
+    source_code = code;
+  }
+  CHECK(!source_code.empty())
+      << "Compile CUDA C code failed from device module:\n"
+      << device_module;
+  VLOG(3) << "[CUDA] C:\n" << source_code;
+  SourceCodePrint::GetInstance()->write(source_code);
+  using runtime::cuda::CUDAModule;
+
+  nvrtc::Compiler compiler;
+  auto ptx = compiler(source_code);
+  CHECK(!ptx.empty()) << "Compile PTX failed from source code:\n"
+                      << source_code;
+  cuda_module_.reset(new CUDAModule(ptx,
+                                    compiler.compile_to_cubin()
+                                        ? CUDAModule::Kind::CUBIN
+                                        : CUDAModule::Kind::PTX));
+
+  RuntimeSymbols symbols;
+  for (auto& fn : device_module.functions()) {
+    std::string kernel_fn_name = fn->name;
+    auto fn_kernel = cuda_module_->GetFunction(0, kernel_fn_name);
+    CHECK(fn_kernel);
+
+    symbols.RegisterVar(kernel_fn_name + "_ptr_",
+                        reinterpret_cast<void*>(fn_kernel));
+  }
+
+  engine_ = ExecutionEngine::Create(ExecutionOptions(), std::move(symbols));
+  engine_->Link<CodeGenCUDA_Host>(host_module);
+
+#else
+  CINN_NOT_IMPLEMENTED
+#endif
+}
+
+void Compiler::CompileX86Module(const Module& module) {
+  engine_->Link<CodeGenX86>(module);
+}
+
+void Compiler::ExportObject(const std::string& path) {
+  engine_->ExportObject(path);
+}
+
+void* Compiler::Lookup(absl::string_view fn_name) {
+  CHECK(engine_);
+  if (engine_->Lookup(fn_name) != nullptr) {
+    return engine_->Lookup(fn_name);
+  }
+  return nullptr;
+}
+
+}  // namespace backends
+}  // namespace cinn
--- a/paddle/cinn/backends/compiler.h
+++ b/paddle/cinn/backends/compiler.h
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <absl/strings/string_view.h>
+
+#include <fstream>
+#include <memory>
+#include <mutex>
+#include <string>
+
+#include "paddle/cinn/backends/llvm/codegen_llvm.h"
+#include "paddle/cinn/backends/llvm/execution_engine.h"
+#include "paddle/cinn/backends/llvm/simple_jit.h"
+#include "paddle/cinn/hlir/framework/parallel_compiler.h"
+#include "paddle/cinn/lang/packed_func.h"
+#ifdef CINN_WITH_CUDA
+#include "paddle/cinn/runtime/cuda/cuda_module.h"
+#endif
+
+namespace cinn {
+namespace backends {
+
+/**
+ * A class for dumping the code after compilation.
+ * Use FLAGS_cinn_dump_group_lowered_func to specify the directory to dump
+ * lowered function. Use FLAGS_cinn_dump_group_source_code to specify the
+ * directory to dump the source code. Use FLAGS_cinn_dump_group_ptx to specify
+ * the directory to dump ptx. Use FLAGS_cinn_dump_group_instruction to specify
+ * the directory to dump instruction.
+ */
+class CompilationInfoDumper {
+ public:
+  explicit CompilationInfoDumper(
+      const hlir::framework::ParallelCompiler::CompilationResult& info)
+      : info_(info) {
+    DumpLoweredFunc();
+    DumpSourceCode();
+    DumpPtxCode();
+    DumpInstruction();
+  }
+
+ private:
+  void DumpLoweredFunc();
+  void DumpSourceCode();
+  void DumpPtxCode();
+  void DumpInstruction();
+  void Dump(const std::string& base_path,
+            const int idx,
+            const std::string& file_name,
+            const std::string& content);
+
+  const hlir::framework::ParallelCompiler::CompilationResult& info_;
+};
+
+class SourceCodePrint {
+ public:
+  static SourceCodePrint* GetInstance() {
+    static SourceCodePrint print;
+    return &print;
+  }
+
+  void write(const std::string& source_code);
+
+ private:
+  SourceCodePrint();
+  ~SourceCodePrint();
+
+  std::ofstream of;
+  std::mutex mtx_;
+};
+
+class Compiler final {
+ public:
+  static std::unique_ptr<Compiler> Create(const Target& target) {
+    return std::unique_ptr<Compiler>(new Compiler(target));
+  }
+
+  /**
+   * Compile and link to a CINN module.
+   */
+  void Build(const ir::Module& module, const std::string& code = "");
+
+  void ExportObject(const std::string& path);
+
+  std::string GetSourceCode(const ir::Module& module);
+
+  void BuildDefault(const ir::Module& module);
+
+  /**
+   * Retrieve a function by \p fn_name.
+   * @return function address or null if not exists.
+   */
+  void* Lookup(absl::string_view fn_name);
+
+ private:
+  void CompileCudaModule(const ir::Module& module,
+                         const std::string& code = "");
+
+  void CompileX86Module(const ir::Module& module);
+
+  explicit Compiler(const Target& target)
+      : target_(target), engine_(ExecutionEngine::Create(ExecutionOptions())) {}
+
+  CINN_DISALLOW_COPY_AND_ASSIGN(Compiler);
+
+ private:
+  Target target_;
+  std::unique_ptr<ExecutionEngine> engine_;
+
+#ifdef CINN_WITH_CUDA
+  std::unique_ptr<runtime::cuda::CUDAModule> cuda_module_;
+#endif
+};
+
+}  // namespace backends
+}  // namespace cinn
--- a/paddle/cinn/backends/compiler_test.cc
+++ b/paddle/cinn/backends/compiler_test.cc
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/cinn/backends/compiler.h"
+
+#include <gtest/gtest.h>
+
+#include <vector>
+
+#include "paddle/cinn/cinn.h"
+#include "paddle/cinn/common/test_helper.h"
+#include "paddle/cinn/hlir/pe/elementwise.h"
+#include "paddle/cinn/hlir/pe/nn.h"
+#include "paddle/cinn/runtime/use_extern_funcs.h"
+#include "paddle/cinn/utils/timer.h"
+
+namespace cinn {
+namespace backends {
+
+TEST(Compiler, x86) {
+  Expr M(1024), N(1024);
+
+  auto create_module = [&]() {
+    Placeholder<float> A("A", {M, N});
+    Placeholder<float> B("B", {M, N});
+
+    auto C = Compute(
+        {M, N}, [=](Expr i, Expr j) { return A(i, j) + B(i, j); }, "C");
+    return std::make_tuple(A, B, C);
+  };
+
+  {                                  // test x86
+    auto _A_B_C_ = create_module();  // NOLINT
+    auto& A = std::get<0>(_A_B_C_);
+    auto& B = std::get<1>(_A_B_C_);
+    auto& C = std::get<2>(_A_B_C_);
+
+    auto stages = CreateStages({C});
+
+    auto fn = Lower("fn", stages, {A, B, C});
+
+    ir::Module::Builder builder("some_module", common::DefaultHostTarget());
+    builder.AddFunction(fn);
+
+    auto compiler = Compiler::Create(common::DefaultHostTarget());
+    compiler->Build(builder.Build());
+
+    auto* fnp = compiler->Lookup("fn");
+    ASSERT_TRUE(fnp);
+
+    auto* Ab = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()})
+                   .set_random()
+                   .Build();
+    auto* Bb = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()})
+                   .set_random()
+                   .Build();
+    auto* Cb = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()})
+                   .set_zero()
+                   .Build();
+
+    auto args = common::ArgsBuilder().Add(Ab).Add(Bb).Add(Cb).Build();
+    reinterpret_cast<void (*)(void*, int)>(fnp)(args.data(), args.size());
+
+    // test result
+    auto* Ad = reinterpret_cast<float*>(Ab->memory);
+    auto* Bd = reinterpret_cast<float*>(Bb->memory);
+    auto* Cd = reinterpret_cast<float*>(Cb->memory);
+    for (int i = 0; i < Ab->num_elements(); i++) {
+      ASSERT_NEAR(Ad[i] + Bd[i], Cd[i], 1e-5);
+    }
+  }
+}
+
+#ifdef CINN_WITH_CUDA
+TEST(Compiler, cuda) {
+  Expr M(1024), N(1024);
+
+  auto create_module = [&]() {
+    Placeholder<float> A("A", {M, N});
+    Placeholder<float> B("B", {M, N});
+
+    auto C = Compute(
+        {M, N}, [=](Expr i, Expr j) { return A(i, j) + B(i, j); }, "C");
+    return std::make_tuple(A, B, C);
+  };
+
+  {                                  // cuda
+    auto _A_B_C_ = create_module();  // NOLINT
+    auto& A = std::get<0>(_A_B_C_);
+    auto& B = std::get<1>(_A_B_C_);
+    auto& C = std::get<2>(_A_B_C_);
+    auto stages = CreateStages({C});
+
+    stages[C]->Bind(0, "blockIdx.x");
+    stages[C]->Bind(1, "threadIdx.x");
+
+    auto fn = Lower("fn", stages, {A, B, C});
+
+    ir::Module::Builder builder("some_module", common::DefaultHostTarget());
+    builder.AddFunction(fn);
+
+    auto compiler = Compiler::Create(common::DefaultNVGPUTarget());
+    compiler->Build(builder.Build());
+
+    auto* fnp = compiler->Lookup("fn");
+    ASSERT_TRUE(fnp);
+
+    auto* Ab = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()})
+                   .set_random()
+                   .Build();
+    auto* Bb = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()})
+                   .set_random()
+                   .Build();
+    auto* Cb = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()})
+                   .set_zero()
+                   .Build();
+
+    // allocate CUDA buffer
+    void *Ag, *Bg, *Cg;
+    const int num_bytes = Ab->num_elements() * sizeof(float);
+    cudaMalloc(&Ag, num_bytes);
+    cudaMalloc(&Bg, num_bytes);
+    cudaMalloc(&Cg, num_bytes);
+
+    CUDA_CALL(cudaMemcpy(Ag, Ab->memory, num_bytes, cudaMemcpyHostToDevice));
+    CUDA_CALL(cudaMemcpy(Bg, Bb->memory, num_bytes, cudaMemcpyHostToDevice));
+    CUDA_CALL(cudaMemcpy(Cg, Cb->memory, num_bytes, cudaMemcpyHostToDevice));
+
+    cinn_buffer_t Abb;
+    Abb.memory = reinterpret_cast<uint8_t*>(Ag);
+    cinn_buffer_t Bbb;
+    Bbb.memory = reinterpret_cast<uint8_t*>(Bg);
+    cinn_buffer_t Cbb;
+    Cbb.memory = reinterpret_cast<uint8_t*>(Cg);
+
+    auto args = common::ArgsBuilder().Add(&Abb).Add(&Bbb).Add(&Cbb).Build();
+
+    utils::Timer timer;
+    timer.Start();
+    void* stream = nullptr;
+    for (int i = 0; i < 1000; i++) {
+      reinterpret_cast<void (*)(void*, int, void*)>(fnp)(
+          args.data(), args.size(), stream);
+    }
+
+    CUDA_CALL(cudaDeviceSynchronize());
+    float latency = timer.Stop();
+    LOG(INFO) << "latency: " << latency / 1000;
+
+    std::vector<float> ch(M.as_int32() * N.as_int32(), 0.f);
+    CUDA_CALL(cudaMemcpy(
+        ch.data(), Cg, ch.size() * sizeof(float), cudaMemcpyDeviceToHost));
+
+    auto* Ad = reinterpret_cast<float*>(Ab->memory);
+    auto* Bd = reinterpret_cast<float*>(Bb->memory);
+    for (int i = 0; i < Ab->num_elements(); i++) {
+      ASSERT_NEAR(Ad[i] + Bd[i], ch[i], 1e-5);
+    }
+  }
+}
+#endif
+
+TEST(Compiler, sqrt) {
+  Expr N(100);
+  Expr C(10);
+  Expr H(10);
+  Expr W(10);
+
+  Placeholder<float> input("input", {N, C, H, W});
+  Placeholder<float> mean("mean", {C});
+  Placeholder<float> scale("scale", {C});
+  Placeholder<float> variance("variance", {C});
+  Placeholder<float> bias("bias", {C});
+  float epsilon = 0.1f;
+
+  auto A = Compute(
+      {N, C, H, W},
+      [=](Expr n, Expr c, Expr h, Expr w) {
+        return (input(n, c, h, w) - mean(c)) * scale(c) /
+                   lang::Sqrt(variance(c) + Expr(epsilon)) +
+               bias(c);
+      },
+      "A");
+
+  auto B = hlir::pe::Pool2d(
+      input, {3, 3}, {1, 1}, {1, 1, 1, 1}, "max", false, false);
+
+  auto BB = hlir::pe::BatchNorm_NCHW(
+      input, scale, bias, mean, variance, epsilon, "batchnorm");
+
+  auto stages = CreateStages({input, mean, scale, variance, A, bias, B[0], BB});
+
+  auto fn =
+      Lower("fn", stages, {input, mean, scale, bias, variance, A, B[0], BB});
+
+  Module::Builder builder("some", common::DefaultHostTarget());
+  builder.AddFunction(fn);
+
+  auto compiler = Compiler::Create(common::DefaultHostTarget());
+  compiler->Build(builder.Build());
+}
+
+}  // namespace backends
+}  // namespace cinn
--- a/paddle/cinn/backends/cuda_util.cc
+++ b/paddle/cinn/backends/cuda_util.cc
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/cinn/backends/cuda_util.h"
+
+#include <glog/logging.h>
+
+#include "paddle/cinn/backends/extern_func_jit_register.h"
+#include "paddle/cinn/common/target.h"
+
+namespace cinn {
+namespace backends {
+
+std::string cuda_thread_axis_name(int level) {
+  switch (level) {
+    case 0:
+      return "threadIdx.x";
+      break;
+    case 1:
+      return "threadIdx.y";
+      break;
+    case 2:
+      return "threadIdx.z";
+      break;
+  }
+  return "";
+}
+
+std::string cuda_block_axis_name(int level) {
+  switch (level) {
+    case 0:
+      return "blockIdx.x";
+      break;
+    case 1:
+      return "blockIdx.y";
+      break;
+    case 2:
+      return "blockIdx.z";
+      break;
+  }
+  return "";
+}
+
+}  // namespace backends
+}  // namespace cinn
--- a/paddle/cinn/backends/cuda_util.h
+++ b/paddle/cinn/backends/cuda_util.h
+// Copyright (c) 2021 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#ifdef CINN_WITH_CUDA
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cudnn.h>
+#include <curand.h>
+#include <glog/logging.h>
+
+#include <string>
+#include <tuple>
+#include <vector>
+
+#include "paddle/cinn/runtime/cinn_runtime.h"
+
+#define CUDA_DRIVER_CALL(func)                                                 \
+  {                                                                            \
+    auto status = func;                                                        \
+    if (status != CUDA_SUCCESS) {                                              \
+      const char* msg;                                                         \
+      cuGetErrorString(status, &msg);                                          \
+      LOG(FATAL) << "CUDA Driver Error: " #func " failed with error: " << msg; \
+    }                                                                          \
+  }
+
+#define CUDA_CALL(func)                                            \
+  {                                                                \
+    auto status = func;                                            \
+    if (status != cudaSuccess) {                                   \
+      LOG(FATAL) << "CUDA Error : " << cudaGetErrorString(status); \
+    }                                                              \
+  }
+
+#define CURAND_CALL(func)                        \
+  {                                              \
+    auto status = func;                          \
+    if (status != CURAND_STATUS_SUCCESS) {       \
+      LOG(FATAL) << "CURAND Error : " << status; \
+    }                                            \
+  }
+
+#define CUSOLVER_CALL(func)                       \
+  {                                               \
+    auto status = func;                           \
+    if (status != CUSOLVER_STATUS_SUCCESS) {      \
+      LOG(FATAL) << "CUSOLVER Error: " << status; \
+    }                                             \
+  }
+
+#define CUBLAS_CALL(func)                  \
+  {                                        \
+    auto status = func;                    \
+    if (status != CUBLAS_STATUS_SUCCESS) { \
+      LOG(FATAL) << "CUBLAS Error!";       \
+    }                                      \
+  }
+
+#define CUDNN_CALL(func)                                             \
+  {                                                                  \
+    auto status = func;                                              \
+    if (status != CUDNN_STATUS_SUCCESS) {                            \
+      LOG(FATAL) << "CUDNN Error : " << cudnnGetErrorString(status); \
+    }                                                                \
+  }
+
+#define NVRTC_CALL(func)                                             \
+  {                                                                  \
+    auto status = func;                                              \
+    if (status != NVRTC_SUCCESS) {                                   \
+      LOG(FATAL) << "NVRTC Error : " << nvrtcGetErrorString(status); \
+    }                                                                \
+  }
+
+namespace cinn {
+namespace backends {
+
+// CUDA syntax for thread axis.
+std::string cuda_thread_axis_name(int level);
+
+// CUDA syntax for block axis.
+std::string cuda_block_axis_name(int level);
+
+}  // namespace backends
+}  // namespace cinn
+
+#endif  // CINN_WITH_CUDA