2.5

992bec46 · “yuguo” · 0259837d · 992bec46 · 992bec46 · 992bec46
Commit 992bec46 authored Oct 08, 2023 by “yuguo”
20 changed files
--- a/paddle/cinn/auto_schedule/task/task_creator_test.cc
+++ b/paddle/cinn/auto_schedule/task/task_creator_test.cc
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/cinn/auto_schedule/task/task_creator.h"
+
+#include <gtest/gtest.h>
+
+#include <memory>
+#include <vector>
+
+#include "paddle/cinn/common/target.h"
+#include "paddle/cinn/frontend/net_builder.h"
+#include "paddle/cinn/frontend/syntax.h"
+#include "paddle/cinn/hlir/framework/graph.h"
+#include "paddle/cinn/hlir/framework/graph_compiler.h"
+#include "paddle/cinn/hlir/framework/node.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+using ::cinn::frontend::NetBuilder;
+using ::cinn::frontend::Program;
+using ::cinn::hlir::framework::Graph;
+using ::cinn::hlir::framework::Node;
+
+// Builds a program with two inputs "A" and "B" of shape {32, 24} and two
+// chained Add ops: A + B, then A + (A + B).
+Program CreateAddProgram() {
+  constexpr int kRows = 32;
+  constexpr int kCols = 24;
+
+  NetBuilder net("net_builder");
+  auto lhs = net.CreateInput(Float(32), {kRows, kCols}, "A");
+  auto rhs = net.CreateInput(Float(32), {kRows, kCols}, "B");
+  auto first_sum = net.Add(lhs, rhs);
+  // The returned handle is not read afterwards.
+  auto second_sum = net.Add(lhs, first_sum);
+  auto built = net.Build();
+
+  return built;
+}
+
+// Checks that op-level task creation on the program from CreateAddProgram
+// yields two tasks, each wrapping a single elementwise_add node.
+TEST(TaskCreator, Basic) {
+#ifdef CINN_WITH_CUDA
+  Target target = common::DefaultNVGPUTarget();
+#else
+  Target target = common::DefaultHostTarget();
+#endif
+  Program program = CreateAddProgram();
+  auto graph = std::make_shared<Graph>(program, target);
+
+  TaskCreator creator;
+  std::vector<TuneTask> tune_tasks =
+      creator.CreateTuneTaskOpLevel(graph.get());
+
+  ASSERT_EQ(tune_tasks.size(), 2UL);
+  for (TuneTask& tune_task : tune_tasks) {
+    std::shared_ptr<Graph::Group> group = tune_task.subgraph;
+    ASSERT_EQ(group->CollectNodes().size(), 1UL);
+    ASSERT_EQ(group->nodes[0]->op()->name, "elementwise_add");
+  }
+}
+
+}  // namespace auto_schedule
+}  // namespace cinn
--- a/paddle/cinn/auto_schedule/task/task_optimizer.cc
+++ b/paddle/cinn/auto_schedule/task/task_optimizer.cc
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/cinn/auto_schedule/task/task_optimizer.h"
+
+#include <glog/logging.h>
+
+#include <algorithm>
+#include <functional>
+#include <limits>
+
+#include "paddle/cinn/auto_schedule/analysis/analyze_ir.h"
+#include "paddle/cinn/auto_schedule/cost_model/expr_cost_model.h"
+#include "paddle/cinn/auto_schedule/measure/measure.h"
+#include "paddle/cinn/auto_schedule/search_strategy/evolutionary_search.h"
+#include "paddle/cinn/common/target.h"
+#include "paddle/cinn/hlir/framework/op_lowering.h"
+#include "paddle/cinn/hlir/op/external_api_registry.h"
+#include "paddle/cinn/ir/buffer.h"
+#include "paddle/cinn/ir/ir.h"
+#include "paddle/cinn/ir/ir_base.h"
+#include "paddle/cinn/ir/schedule/ir_schedule.h"
+#include "paddle/cinn/ir/utils/ir_copy.h"
+#include "paddle/cinn/optim/transform_gpu_forloop.h"
+#include "paddle/cinn/runtime/flags.h"
+#include "paddle/cinn/utils/string.h"
+#ifdef CINN_WITH_CUDA
+#include <cuda_runtime_api.h>
+
+#include "paddle/cinn/backends/cuda_util.h"
+#endif
+
+DECLARE_bool(auto_schedule_use_cost_model);
+
+namespace cinn {
+namespace auto_schedule {
+
+using cinn::hlir::op::ExternalApiRegistry;
+
+// *** forward declarations of auxiliary functions to be used in this file only
+// returns true when a scheduled lowered function is invalid for the target
+// and therefore has to be pruned
+bool PruneInvalid(const ir::LoweredFunc& lowered_func,
+                  const common::Target& target);
+// exclude some special tasks
+bool IsForbiddenToTune(const TuneTask* task);
+// tell whether the task has been wrapped by custom_call in
+// TransToCustomCallPass
+bool IsWrappedByCustomCall(const TuneTask* task);
+// tell whether the task has registered external api
+bool HasExternalApi(const TuneTask* task);
+
+// Stores the task, measurer and database as raw pointers, default-constructs
+// the cost model and normalizes the random seed.
+TaskOptimizer::TaskOptimizer(TuneTask* task,
+                             ScheduleMeasurer* schedule_measurer,
+                             Database* database,
+                             utils::LinearRandomEngine::StateType rand_seed)
+    // Initializers are listed in member declaration order (cost_model_ is
+    // declared before database_), which is the order they actually run in.
+    : task_(task),
+      schedule_measurer_(schedule_measurer),
+      cost_model_(),
+      database_(database),
+      rand_seed_(utils::LinearRandomEngine::NormalizeState(rand_seed)) {}
+
+// Returns the lowered functions to use for the task. Tasks that are forbidden
+// to tune or already wrapped as custom_call are lowered directly; otherwise
+// the evolution, manual and (when registered) external-api candidates are
+// produced and the one with the lowest cost is returned.
+FunctionGroup TaskOptimizer::Optimize(const TuningOptions& options) {
+  CHECK(task_->subgraph != nullptr) << "subgraph can't be empty";
+  // task with forbidden or custom_call ops can't be tuned
+  if (IsForbiddenToTune(task_) || IsWrappedByCustomCall(task_)) {
+    return task_->op_lowerer->Lower(task_->subgraph);
+  }
+  // TODO(CtfGo): the input/output names of a Graph::Group will be changed in
+  // Lowering by OpLowerer currently, so we should revert them after following
+  // different lower methods, remove this hard code by fixing the decoupling
+  // between lowering and BuildInstructions
+  auto initial_input_names = task_->subgraph->input_names;
+  auto initial_output_names = task_->subgraph->output_names;
+
+  std::vector<TaskOptimizer::Result> candidates;
+  candidates.emplace_back(OptimizeByEvolution(options));
+  candidates.emplace_back(OptimizeByManual(options.num_measure_trials > 0));
+  if (HasExternalApi(task_)) {
+    candidates.emplace_back(OptimizeByExternal(options.num_measure_trials > 0));
+  }
+  // Only the cheapest candidate is needed, so select it directly instead of
+  // sorting the whole vector; on equal costs the earliest candidate wins.
+  const auto best = std::min_element(
+      candidates.begin(),
+      candidates.end(),
+      [](const auto& lhs, const auto& rhs) { return lhs.cost < rhs.cost; });
+  VLOG(4) << "Total candidates=" << candidates.size()
+          << ", the best from=" << best->from << ", cost=" << best->cost;
+
+  // revert input/output names
+  task_->subgraph->input_names = initial_input_names;
+  task_->subgraph->output_names = initial_output_names;
+  return best->functions;
+}
+
+// Lowers the subgraph through op_lowerer and reports the result as the
+// "Manual" candidate. Its cost defaults to 0.0. When `need_measured` is true
+// and the database holds no record under the manual-specific key, the lowered
+// functions are measured once and recorded; a stored record, if present,
+// replaces the default cost.
+TaskOptimizer::Result TaskOptimizer::OptimizeByManual(bool need_measured) {
+  // Pointer to const char: a string literal must not bind to a `char*`.
+  static constexpr const char* kManualMeasuredKeyPrefix =
+      "@ManualMeasured:\n";
+  TaskOptimizer::Result result("Manual");
+  result.functions = task_->op_lowerer->Lower(task_->subgraph);
+
+  // pack functions body
+  std::vector<ir::Expr> func_bodys;
+  for (const ir::LoweredFunc& func : result.functions) {
+    func_bodys.push_back(func->body);
+  }
+
+  SearchState state(ir::IRSchedule(ir::ModuleExpr(std::move(func_bodys))));
+  // the manual is regarded as the second best in default, so we set its cost
+  // 0.0
+  result.cost = 0.0;
+
+  // add the specific prefix in front of serialized_key to be store/load
+  // measured record for manual schedule
+  std::string measured_key = kManualMeasuredKeyPrefix + task_->serialized_key;
+  if (need_measured && database_->Count(measured_key) == 0) {
+    std::vector<MeasureInput> inputs(1);
+    inputs.back().task = task_;
+    inputs.back().lowered_funcs = result.functions;
+    VLOG(4) << "Measure manual schedule";
+    std::vector<MeasureResult> measure_outputs =
+        schedule_measurer_->Measure(inputs);
+    // Fail loudly instead of indexing an empty result below.
+    CHECK_EQ(measure_outputs.size(), inputs.size())
+        << "ScheduleMeasurer didn't output same number of MeasureOutput of "
+           "inputs in OptimizeByManual";
+    database_->AddRecord(
+        TuningRecord(measured_key, state, measure_outputs[0].execution_cost));
+  }
+
+  auto measured_records = database_->LookUp(measured_key);
+  if (!measured_records.empty()) {  // update result.cost by measured if exists
+    result.cost = measured_records[0].execution_cost;
+  }
+  return result;
+}
+
+// Rewrites the first node of the subgraph into a custom_call op (keeping the
+// original op name in the "original_op" attribute), lowers it and reports the
+// result as the "External" candidate. Its cost defaults to -1.0. When
+// `need_measured` is true and the database holds no record under the
+// external-specific key, the lowered functions are measured once and
+// recorded; a stored record, if present, replaces the default cost.
+TaskOptimizer::Result TaskOptimizer::OptimizeByExternal(bool need_measured) {
+  // Pointer to const char: a string literal must not bind to a `char*`.
+  static constexpr const char* kExternalMeasuredKeyPrefix =
+      "@ExternalMeasured:\n";
+  TaskOptimizer::Result result("External");
+  auto nodes = task_->subgraph->CollectNodes();
+  auto* first_node = nodes.front();
+
+  // set the necessary field for lowering with external api
+  std::string original_op = first_node->op()->name;
+  first_node->attrs.attr_store["original_op"] = original_op;
+  first_node->attrs.op = hlir::framework::Operator::Get("custom_call");
+  result.functions = task_->op_lowerer->Lower(task_->subgraph);
+
+  // add the specific prefix in front of serialized_key to be store/load
+  // measured record for external api
+  result.cost = -1.0;  // the external is regarded as the best in default, so we
+                       // set its cost -1.0
+  std::string measured_key = kExternalMeasuredKeyPrefix + task_->serialized_key;
+  if (need_measured && database_->Count(measured_key) == 0) {
+    std::vector<MeasureInput> inputs(1);
+    inputs.back().task = task_;
+    inputs.back().lowered_funcs = result.functions;
+    VLOG(4) << "Measure external api";
+    std::vector<MeasureResult> measure_outputs =
+        schedule_measurer_->Measure(inputs);
+    // Fail loudly instead of indexing an empty result below.
+    CHECK_EQ(measure_outputs.size(), inputs.size())
+        << "ScheduleMeasurer didn't output same number of MeasureOutput of "
+           "inputs in OptimizeByExternal";
+    // the SearchState of external is invalid and will not be used, so we just
+    // put a temporary one
+    database_->AddRecord(TuningRecord(measured_key,
+                                      SearchState(ir::IRSchedule()),
+                                      measure_outputs[0].execution_cost));
+  }
+
+  auto measured_records = database_->LookUp(measured_key);
+  if (!measured_records.empty()) {  // update result.cost by measured if exists
+    result.cost = measured_records[0].execution_cost;
+  }
+  return result;
+}
+
+// Returns true only when the task's subgraph holds exactly one node and that
+// node's op name is in the links-changed list below.
+bool IsForbiddenToTune(const TuneTask* task) {
+  // TODO(CtfGo): some operators may change its linked edges in
+  // TransToCustomCallPass, like conv2d, we will skip these ops in auto-schedule
+  // because they can't revert original links for no schedule and manual
+  // schedule lowering.
+  static const std::unordered_set<std::string> links_changed_ops = {"conv2d"};
+  auto nodes = task->subgraph->CollectNodes();
+  // Test the size before touching front(): calling front() on an empty
+  // container is undefined behaviour.
+  if (nodes.size() != 1) {
+    return false;
+  }
+
+  const auto& op_name = nodes.front()->op()->name;
+  if (links_changed_ops.count(op_name)) {
+    VLOG(5) << "Op:" << op_name << " is forbidden to call external_api";
+    return true;
+  }
+
+  return false;
+}
+
+// Returns true only when the task's subgraph holds exactly one node and an
+// external api is registered for that node's op on the task's target.
+bool HasExternalApi(const TuneTask* task) {
+  auto nodes = task->subgraph->CollectNodes();
+  // Test the size before touching front(): calling front() on an empty
+  // container is undefined behaviour.
+  if (nodes.size() != 1) {
+    return false;
+  }
+  auto* first_node = nodes.front();
+  return ExternalApiRegistry::Global()->Has(first_node->op()->name,
+                                            task->target);
+}
+
+// Returns true only when the task's subgraph holds exactly one node and that
+// node's op is "custom_call". Such a node is required to carry the name of
+// the op it replaced in its "original_op" attribute.
+bool IsWrappedByCustomCall(const TuneTask* task) {
+  auto nodes = task->subgraph->CollectNodes();
+  // Test the size before touching front(): calling front() on an empty
+  // container is undefined behaviour.
+  if (nodes.size() != 1) {
+    return false;
+  }
+
+  auto* first_node = nodes.front();
+  if (first_node->op()->name == "custom_call") {
+    CHECK(first_node->attrs.attr_store.count("original_op"))
+        << "a custom_call op must store its original op name";
+    std::string op_name =
+        absl::get<std::string>(first_node->attrs.attr_store.at("original_op"));
+    VLOG(5) << "Op:" << op_name << " was wrapped as custom_call";
+    return true;
+  }
+
+  return false;
+}
+
+// Searches schedules for the task with EvolutionarySearch and reports the
+// result as the "Evolution" candidate, starting from a copy of the task's
+// initial lowered functions.
+//
+// With num_measure_trials == 0 one search round is run and its first
+// candidate is returned unmeasured; the cost is only set when the cost model
+// flag is on. Otherwise rounds are searched, measured and recorded in the
+// database until num_measure_trials states have been measured, or until the
+// search comes back empty more than kMaxRetryContinuousEmpty_ times in a row.
+TaskOptimizer::Result TaskOptimizer::OptimizeByEvolution(
+    const TuningOptions& options) {
+  // Reject a non-positive divisor first: the modulo below would otherwise be
+  // undefined behaviour when num_samples_per_iteration is 0.
+  CHECK_GT(options.num_samples_per_iteration, 0)
+      << "TuningOptions.num_samples_per_iteration must be greater than 0.";
+  CHECK_EQ(options.num_measure_trials % options.num_samples_per_iteration, 0)
+      << "TuningOptions.num_measure_trials % "
+         "TuningOptions.num_samples_per_iteration must be 0.";
+
+  VLOG(4) << "Optimizing TuneTask with num_measure_trials:"
+          << options.num_measure_trials
+          << ", LoweredFunc before optimization is:";
+  VLOG(4) << "lowered function size = " << task_->lowered_funcs.size();
+  for (size_t i = 0; i < task_->lowered_funcs.size(); ++i) {
+    VLOG(4) << "lowered_funcs[" << i << "] detail:\n"
+            << task_->lowered_funcs[i];
+  }
+
+  // The search object is created lazily on the first call and then reused.
+  if (evolutionary_search_ == nullptr) {
+    // TODO(zhhsplendid): check whether the options is same as previous,
+    // if not, we should create new EvolutionarySearch
+    evolutionary_search_ = std::make_unique<EvolutionarySearch>(
+        *task_, cost_model_, database_, utils::ForkRandomState(&rand_seed_));
+  }
+
+  TaskOptimizer::Result result("Evolution");
+  auto& optimized_funcs = result.functions;
+  auto& best_cost = result.cost;
+  // use initial lowered function as default result
+  optimized_funcs = optim::IRCopy(task_->lowered_funcs);
+  if (options.num_measure_trials ==
+      0) {  // no need to measure and simply return the best searched
+    std::vector<MeasureInput> measure_candidates;
+    std::vector<SearchState> states =
+        SearchOneRound(options, &measure_candidates);
+    if (!states.empty()) {
+      if (FLAGS_auto_schedule_use_cost_model) {
+        best_cost = cost_model_.Predict(states.front()->ir_schedule.GetModule(),
+                                        task_->target);
+      }
+      optimized_funcs = measure_candidates[0].lowered_funcs;
+    } else {
+      LOG(WARNING) << "No valid candidate searched, will return initial state";
+    }
+    return result;
+  }
+
+  int measured_count = 0;
+  uint32_t continuous_empty_cnt = 0;
+  while (measured_count < options.num_measure_trials) {
+    VLOG(4) << "Launch a new search, current measured_count:" << measured_count;
+    std::vector<MeasureInput> measure_inputs;
+    std::vector<SearchState> states = SearchOneRound(options, &measure_inputs);
+    if (states.empty()) {  // no new valid candidate achieved
+      ++continuous_empty_cnt;
+      if (continuous_empty_cnt <= kMaxRetryContinuousEmpty_) {
+        VLOG(4) << "No valid state searched, continuous_empty_cnt="
+                << continuous_empty_cnt;
+        continue;
+      } else {
+        LOG(WARNING) << "OptimizeByEvolution will be exited in advance due to "
+                        "continuous invalid search, final measured_count="
+                     << measured_count;
+        break;
+      }
+    }
+    continuous_empty_cnt = 0;  // reset if get valid candidates
+
+    VLOG(4) << "ScheduleMeasurer start with input size="
+            << measure_inputs.size();
+    std::vector<MeasureResult> measure_outputs =
+        schedule_measurer_->Measure(measure_inputs);
+    CHECK_EQ(measure_outputs.size(), states.size())
+        << "ScheduleMeasurer didn't output same number of MeasureOutput of "
+           "states in TaskOptimizer";
+    // record to database
+    for (size_t i = 0; i < states.size(); ++i) {
+      database_->AddRecord(TuningRecord(measure_inputs[i].task->serialized_key,
+                                        states[i],
+                                        measure_outputs[i].execution_cost));
+    }
+
+    // update cost model
+    if (FLAGS_auto_schedule_use_cost_model) {
+      std::vector<const ir::ModuleExpr*> cost_model_samples(states.size());
+      std::vector<float> cost_model_labels(states.size());
+      for (size_t i = 0; i < states.size(); ++i) {
+        cost_model_samples[i] = &(states[i]->ir_schedule.GetModule());
+        cost_model_labels[i] = measure_outputs[i].execution_cost;
+      }
+      VLOG(4) << utils::StringFormat(
+          "Update CostModel with samples size=%lu,labels size=%lu",
+          cost_model_samples.size(),
+          cost_model_labels.size());
+      cost_model_.Update(cost_model_samples, cost_model_labels, task_->target);
+    }
+
+    // update the best
+    for (size_t i = 0; i < measure_outputs.size(); ++i) {
+      if (measure_outputs[i].execution_cost < best_cost) {
+        VLOG(4) << "Update best candidate with execution_cost:"
+                << measure_outputs[i].execution_cost << "us";
+        best_cost = measure_outputs[i].execution_cost;
+        optimized_funcs = measure_inputs[i].lowered_funcs;
+      }
+    }
+
+    // count result size
+    measured_count += states.size();
+  }
+  return result;
+}
+
+// Runs one round of evolutionary search and filters its states: a state is
+// kept only if every function rebuilt from its exprs passes PruneInvalid.
+// Kept states are returned in their original order, and one MeasureInput per
+// kept state is appended to `measure_candidates`.
+std::vector<SearchState> TaskOptimizer::SearchOneRound(
+    const TuningOptions& options,
+    std::vector<MeasureInput>* measure_candidates) {
+  std::vector<SearchState> states =
+      evolutionary_search_->SearchModuleExprEpsGreedy(options);
+  VLOG(4) << JoinStatesDebugString("TaskOptimizer::EvolutionarySearch-Result",
+                                   states,
+                                   /*verbose=*/VLOG_IS_ON(5));
+
+  // Kept states are compacted towards the front of `states`; `kept` is both
+  // the number kept so far and the next slot to fill.
+  size_t kept = 0;
+  for (size_t idx = 0; idx < states.size(); ++idx) {
+    std::vector<ir::Expr> exprs =
+        states[idx]->ir_schedule.GetModule().GetExprs();
+    CHECK_EQ(exprs.size(), task_->lowered_funcs.size())
+        << "RuntimeError: Expr size is not equal to LoweredFunc size in "
+           "TaskOptimizer";
+    auto copied_funcs = optim::IRCopy(task_->lowered_funcs);
+    std::vector<ir::LoweredFunc> checked_funcs;
+    for (size_t k = 0; k < exprs.size(); ++k) {
+      auto rebuilt =
+          UpdateFuncWithNewBody(task_->target, copied_funcs[k], exprs[k]);
+      if (PruneInvalid(rebuilt, task_->target)) {
+        VLOG(4) << "PruneInvalid states-" << idx;
+        break;
+      }
+      checked_funcs.emplace_back(rebuilt);
+    }
+
+    // a shorter list means the loop above stopped early on an invalid function
+    if (checked_funcs.size() != copied_funcs.size()) {
+      continue;
+    }
+
+    // all functions are validated, collect this state to be measured
+    states[kept] = states[idx];
+    ++kept;
+    measure_candidates->emplace_back(MeasureInput());
+    MeasureInput& candidate = measure_candidates->back();
+    candidate.task = task_;
+    candidate.lowered_funcs = std::move(checked_funcs);
+  }
+
+  states.erase(states.begin() + kept, states.end());
+  CHECK_EQ(states.size(), measure_candidates->size())
+      << "result size of states not equal to measure_candidates";
+  VLOG(4) << "EvolutionarySearch return size=" << states.size()
+          << ", valid count=" << kept;
+  VLOG(4) << JoinStatesDebugString("TaskOptimizer::SearchOneRound-Result",
+                                   states,
+                                   /*verbose=*/VLOG_IS_ON(5));
+  return states;
+}
+
+// Detects the limit of available shared memory per block on the current NVGPU
+// with the CUDA runtime. Returns 0 when built without CINN_WITH_CUDA.
+size_t GetGPUSharedMemoryLimit() {
+#ifdef CINN_WITH_CUDA
+  int device_id = 0;
+  CUDA_CALL(cudaGetDevice(&device_id));
+  cudaDeviceProp prop;
+  CUDA_CALL(cudaGetDeviceProperties(&prop, device_id));
+  // sharedMemPerBlock is a size_t, so it has to be formatted with %zu; %d
+  // would read it as an int.
+  VLOG(4) << utils::StringFormat(
+      "GPU-%d GPUSharedMemoryLimit=%zu", device_id, prop.sharedMemPerBlock);
+  return prop.sharedMemPerBlock;
+#else
+  return 0;
+#endif
+}
+
+// Detects the limit of available local/stack memory on the current NVGPU with
+// the CUDA runtime, computed as
+//   totalGlobalMem / multiProcessorCount / maxThreadsPerMultiProcessor.
+// Returns 0 when built without CINN_WITH_CUDA.
+size_t GetGPULocalStackLimit() {
+#ifdef CINN_WITH_CUDA
+  int device_id = 0;
+  CUDA_CALL(cudaGetDevice(&device_id));
+  cudaDeviceProp prop;
+  CUDA_CALL(cudaGetDeviceProperties(&prop, device_id));
+  size_t limit = prop.totalGlobalMem / prop.multiProcessorCount /
+                 prop.maxThreadsPerMultiProcessor;
+  // The arguments follow the order of the labels in the format string, so
+  // each value is printed under its own name.
+  VLOG(4) << utils::StringFormat(
+      "GPU-%d "
+      "totalGlobalMem=%zu,maxThreadsPerMultiProcessor=%d,multiProcessorCount=%"
+      "d, calculated "
+      "GPULocalStackLimit=%zu",
+      device_id,
+      prop.totalGlobalMem,
+      prop.maxThreadsPerMultiProcessor,
+      prop.multiProcessorCount,
+      limit);
+  return limit;
+#else
+  return 0;
+#endif
+}
+
+// Sums the bytes (numel * dtype bytes) of the temp buffers of `lowered_func`
+// whose memory type is `used_memory_type`, counting each buffer name once,
+// and reports whether the total reaches `limit_bytes`. The comparison is
+// `>=`, so a total equal to the limit also counts as exceeding it.
+bool IsGPUMemoryUsageExceedLimit(const ir::LoweredFunc& lowered_func,
+                                 const ir::MemoryType& used_memory_type,
+                                 const size_t limit_bytes) {
+  std::unordered_set<std::string> counted_names;
+  size_t total_bytes = 0;
+  for (auto&& buffer : lowered_func->temp_bufs) {
+    VLOG(5) << "temp buf name=" << buffer->name
+            << ", numel=" << buffer->numel() << ",dtype=" << buffer->dtype;
+    if (!(buffer->memory_type == used_memory_type)) {
+      continue;
+    }
+    if (counted_names.count(buffer->name)) {
+      continue;  // a buffer with this name was already added to the total
+    }
+    total_bytes += buffer->numel() * buffer->dtype.bytes();
+    counted_names.insert(buffer->name);
+  }
+  VLOG(5) << "total used_bytes_cnt=" << total_bytes;
+  return total_bytes >= limit_bytes;
+}
+
+// Returns true when `lowered_func` has to be pruned. Only the default NVGPU
+// target is checked: the function is pruned when its GPUShared or GPULocal
+// temp buffers reach the corresponding limit. Both limits are queried once,
+// on the first call, and cached in function-local statics.
+bool PruneInvalid(const ir::LoweredFunc& lowered_func,
+                  const common::Target& target) {
+  static const size_t kGPUSharedMemoryLimitBytes = GetGPUSharedMemoryLimit();
+  static const size_t kGPULocalStackLimitBytes = GetGPULocalStackLimit();
+
+  if (!(target == common::DefaultNVGPUTarget())) {
+    return false;
+  }
+
+  const bool shared_exceeded = IsGPUMemoryUsageExceedLimit(
+      lowered_func, ir::MemoryType::GPUShared, kGPUSharedMemoryLimitBytes);
+  if (shared_exceeded) {
+    VLOG(5) << ir::MemoryType::GPUShared
+            << " memory usage exceeds limit, func:\n"
+            << lowered_func;
+    return true;
+  }
+
+  const bool local_exceeded = IsGPUMemoryUsageExceedLimit(
+      lowered_func, ir::MemoryType::GPULocal, kGPULocalStackLimitBytes);
+  if (local_exceeded) {
+    VLOG(5) << ir::MemoryType::GPULocal
+            << " memory usage exceeds limit, func:\n"
+            << lowered_func;
+    return true;
+  }
+  return false;
+}
+
+}  // namespace auto_schedule
+}  // namespace cinn
--- a/paddle/cinn/auto_schedule/task/task_optimizer.h
+++ b/paddle/cinn/auto_schedule/task/task_optimizer.h
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "paddle/cinn/auto_schedule/cost_model/expr_cost_model.h"
+#include "paddle/cinn/auto_schedule/database/database.h"
+#include "paddle/cinn/auto_schedule/measure/schedule_measurer.h"
+#include "paddle/cinn/auto_schedule/search_strategy/evolutionary_search.h"
+#include "paddle/cinn/auto_schedule/task/tune_task.h"
+#include "paddle/cinn/auto_schedule/tuning.h"
+#include "paddle/cinn/ir/lowered_func.h"
+#include "paddle/cinn/utils/random_engine.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+// Tunes a single TuneTask: it combines the schedule measurer, the tuning
+// record database, the cost model and the evolutionary search to pick the
+// lowest-cost lowered functions for the task.
+class TaskOptimizer {
+ public:
+  // `task`, `schedule_measurer` and `database` are stored as raw pointers.
+  TaskOptimizer(TuneTask* task,
+                ScheduleMeasurer* schedule_measurer,
+                Database* database,
+                utils::LinearRandomEngine::StateType rand_seed = -1);
+
+  // Returns the lowered functions selected for the task.
+  FunctionGroup Optimize(const TuningOptions& options);
+
+ private:
+  // One candidate outcome: where it came from, its cost and its functions.
+  // The cost starts at the largest double until a real value is assigned.
+  struct Result {
+    std::string from;
+    double cost = std::numeric_limits<double>::max();
+    FunctionGroup functions;
+    explicit Result(const std::string& from_type) : from(from_type) {}
+  };
+
+  // Candidate producers, one per lowering strategy.
+  Result OptimizeByManual(bool need_measure);
+  Result OptimizeByExternal(bool need_measure);
+  Result OptimizeByEvolution(const TuningOptions& options);
+
+  // call search candidates once by EvolutionarySearch and prune invalid ones
+  std::vector<SearchState> SearchOneRound(
+      const TuningOptions& options,
+      std::vector<MeasureInput>* measure_candidates);
+
+  // the max retry times if continuously get empty result
+  static constexpr uint32_t kMaxRetryContinuousEmpty_ = 3;
+  TuneTask* task_;
+  ScheduleMeasurer* schedule_measurer_;
+  // stays null until the first evolutionary search is requested
+  std::unique_ptr<EvolutionarySearch> evolutionary_search_ = nullptr;
+  ExprCostModel cost_model_;
+  Database* database_;
+  utils::LinearRandomEngine::StateType rand_seed_;
+};
+
+}  // namespace auto_schedule
+}  // namespace cinn
--- a/paddle/cinn/auto_schedule/task/task_registry.h
+++ b/paddle/cinn/auto_schedule/task/task_registry.h
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <gflags/gflags.h>
+
+#include <mutex>
+#include <string>
+
+#include "paddle/cinn/ir/schedule/ir_schedule.h"
+#include "paddle/cinn/ir/utils/ir_copy.h"
+#include "paddle/cinn/utils/registry.h"
+
+namespace cinn {
+
+namespace auto_schedule {
+
+// A registry entry: the key of a task together with a ModuleExpr for it.
+struct InitialTaskInfo {
+  std::string task_key;
+  ir::ModuleExpr module_expr;
+
+  // Copies both arguments into the entry.
+  InitialTaskInfo(const std::string& key, const ir::ModuleExpr& expr)
+      : task_key(key), module_expr(expr) {}
+};
+
+// Global registry that keeps, per task key, a copy of the task's initial
+// ModuleExpr.
+class InitialTaskRegistry : public Registry<InitialTaskInfo> {
+ public:
+  // Returns the single shared instance, held in a function-local static.
+  static InitialTaskRegistry* Global() {
+    static InitialTaskRegistry instance;
+    return &instance;
+  }
+
+  // Returns the entry registered under `task_key`; CHECK-fails when there is
+  // none.
+  inline const InitialTaskInfo* Get(const std::string& task_key) {
+    const InitialTaskInfo* found = Registry<InitialTaskInfo>::Find(task_key);
+    CHECK(found) << "InitialTaskInfo [" << task_key << "] is not registered";
+    return found;
+  }
+
+  // Tells whether an entry is registered under `task_key`.
+  inline const bool Has(const std::string& task_key) {
+    return Registry<InitialTaskInfo>::Find(task_key) != nullptr;
+  }
+
+  // Registers a copy of `module_expr` under `task_key` while holding
+  // registering_mutex. A key that is already present is left untouched.
+  inline void Regist(const std::string& task_key,
+                     const ir::ModuleExpr& module_expr) {
+    std::lock_guard<std::mutex> guard(registering_mutex);
+    if (fmap_.count(task_key) != 0) {
+      return;
+    }
+    InitialTaskInfo* entry =
+        new InitialTaskInfo(task_key, optim::IRCopy(module_expr));
+    __REGISTER__(task_key, entry);
+  }
+
+ private:
+  InitialTaskRegistry() = default;
+  CINN_DISALLOW_COPY_AND_ASSIGN(InitialTaskRegistry);
+
+  // Stores `task_info` in the key map and in both entry lists, then returns
+  // it.
+  inline InitialTaskInfo* __REGISTER__(const std::string& task_key,
+                                       InitialTaskInfo* task_info) {
+    fmap_[task_key] = task_info;
+    const_list_.push_back(task_info);
+    entry_list_.push_back(task_info);
+    return task_info;
+  }
+};
+
+}  // namespace auto_schedule
+}  // namespace cinn
--- a/paddle/cinn/auto_schedule/task/task_registry_test.cc
+++ b/paddle/cinn/auto_schedule/task/task_registry_test.cc
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/cinn/auto_schedule/task/task_registry.h"
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+#include <cstdlib>
+
+#include "paddle/cinn/auto_schedule/task/task_creator.h"
+#include "paddle/cinn/auto_schedule/task/tune_task.h"
+#include "paddle/cinn/frontend/net_builder.h"
+#include "paddle/cinn/hlir/framework/graph.h"
+#include "paddle/cinn/hlir/framework/graph_compiler.h"
+#include "paddle/cinn/hlir/framework/op_lowering.h"
+#include "paddle/cinn/utils/string.h"
+#include "paddle/cinn/utils/type_defs.h"
+
+DECLARE_bool(auto_schedule_use_cost_model);
+
+namespace cinn {
+namespace auto_schedule {
+
+// Creates op-level tune tasks for `graph` and initializes each of them with
+// the graph's "infershape"/"inferdtype" attributes and a freshly built
+// OpLowerer.
+// NOTE(review): the OpLowerer is owned by a local unique_ptr and is destroyed
+// when this function returns, while TuneTask::Initialize keeps the raw
+// pointer in `op_lowerer`. The returned tasks should therefore not be lowered
+// again through that pointer -- confirm this is acceptable for the test.
+std::vector<TuneTask> CreateTasks(hlir::framework::Graph* graph,
+                                  const common::Target& target) {
+  using DtypeDict = absl::flat_hash_map<std::string, common::Type>;
+  using ShapeDict = absl::flat_hash_map<std::string, hlir::framework::shape_t>;
+
+  // create tasks
+  TaskCreator creator;
+  std::vector<TuneTask> tune_tasks = creator.CreateTuneTaskOpLevel(graph);
+
+  const auto& dtypes = graph->GetAttrs<DtypeDict>("inferdtype");
+  const auto& shapes = graph->GetAttrs<ShapeDict>("infershape");
+
+  auto lowerer =
+      std::make_unique<hlir::framework::OpLowerer>(dtypes, shapes, target);
+  for (TuneTask& tune_task : tune_tasks) {
+    tune_task.Initialize(shapes, dtypes, lowerer.get());
+    VLOG(3) << "Add a task with serialized_key:\n" << tune_task.serialized_key;
+  }
+
+  return tune_tasks;
+}
+
+// Builds a graph for `target` from a program that adds input "B" of shape
+// {64} to input "A" of shape {1, 64, 112, 112}, passing 1 as the third
+// argument of Add.
+std::shared_ptr<hlir::framework::Graph> CreateAddProgram(
+    const common::Target& target) {
+  frontend::NetBuilder net("test");
+
+  auto feature = net.CreateInput(Float(32), {1, 64, 112, 112}, "A");
+  auto bias = net.CreateInput(Float(32), {64}, "B");
+  // The returned handle is not read afterwards.
+  auto sum = net.Add(feature, bias, 1);
+
+  return std::make_shared<hlir::framework::Graph>(net.Build(), target);
+}
+
+// Registers the initial ModuleExpr of every task and checks that the registry
+// returns, for each key, exprs that print identically to the registered ones.
+// Also checks Has() for a present and an absent key.
+TEST(TestTaskRegistry, basic) {
+  FLAGS_auto_schedule_use_cost_model = true;
+
+#ifdef CINN_WITH_CUDA
+  Target target = common::DefaultNVGPUTarget();
+#else
+  Target target = common::DefaultHostTarget();
+#endif
+  std::shared_ptr<hlir::framework::Graph> graph = CreateAddProgram(target);
+  std::vector<TuneTask> tasks = CreateTasks(graph.get(), target);
+  // tasks[0] is read below, so fail cleanly rather than index an empty vector.
+  ASSERT_FALSE(tasks.empty());
+
+  InitialTaskRegistry* task_registry = InitialTaskRegistry::Global();
+
+  std::vector<ir::ModuleExpr> module_exprs;
+  module_exprs.reserve(tasks.size());
+  for (const TuneTask& task : tasks) {
+    module_exprs.emplace_back(task.GetLoweredFuncBodyExprs());
+    task_registry->Regist(task.serialized_key, module_exprs.back());
+  }
+
+  // size_t indices avoid comparing a signed int against size().
+  for (size_t i = 0; i < tasks.size(); ++i) {
+    const std::string& key = tasks[i].serialized_key;
+    VLOG(3) << "serialized_key = " << key;
+    const auto registered_exprs =
+        task_registry->Get(key)->module_expr.GetExprs();
+    const auto expected_exprs = module_exprs[i].GetExprs();
+
+    ASSERT_EQ(registered_exprs.size(), expected_exprs.size());
+    for (size_t j = 0; j < registered_exprs.size(); ++j) {
+      VLOG(3) << "expr " << j << " of task " << key << " : "
+              << registered_exprs.at(j);
+      ASSERT_EQ(utils::GetStreamCnt(registered_exprs.at(j)),
+                utils::GetStreamCnt(expected_exprs.at(j)));
+    }
+  }
+
+  ASSERT_TRUE(task_registry->Has(tasks[0].serialized_key));
+  ASSERT_FALSE(task_registry->Has("not_exist"));
+}
+
+}  // namespace auto_schedule
+}  // namespace cinn
--- a/paddle/cinn/auto_schedule/task/tune_task.cc
+++ b/paddle/cinn/auto_schedule/task/tune_task.cc
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/cinn/auto_schedule/task/tune_task.h"
+
+#include <glog/logging.h>
+
+#include <iostream>
+#include <vector>
+
+#include "paddle/cinn/auto_schedule/analysis/analyze_ir.h"
+#include "paddle/cinn/hlir/framework/node.h"
+#include "paddle/cinn/hlir/framework/op_lowering.h"
+#include "paddle/cinn/ir/ir_base.h"
+#include "paddle/cinn/ir/lowered_func.h"
+#include "paddle/cinn/ir/schedule/ir_schedule.h"
+#include "paddle/cinn/utils/string.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+// Binds the lowering handler, lowers the sub-graph with both schedule stages
+// disabled, then derives the output names and the serialized key of the task.
+//
+// shape_dict / dtype_dict: shape and dtype of variables, looked up by id when
+//   building the serialized key.
+// lower_handler: must not be nullptr; stored in `op_lowerer` (not owned).
+void TuneTask::Initialize(
+    const absl::flat_hash_map<std::string, hlir::framework::shape_t>&
+        shape_dict,
+    const absl::flat_hash_map<std::string, cinn::common::Type>& dtype_dict,
+    hlir::framework::OpLowerer* lower_handler) {
+  CHECK(lower_handler != nullptr) << "op_lowerer can't be nullptr";
+  op_lowerer = lower_handler;
+
+  // Neither the op schedule nor the group schedule is applied while lowering.
+  constexpr bool kApplyOpSchedule = false;
+  constexpr bool kApplyGroupSchedule = false;
+  lowered_funcs =
+      lower_handler->Lower(subgraph, kApplyOpSchedule, kApplyGroupSchedule);
+  output_names = GetOutputNamesFromLoweredFunc(lowered_funcs);
+  serialized_key = SerializeToString(shape_dict, dtype_dict);
+}
+
+// Returns the body expression of every LoweredFunc in `lowered_funcs`, in the
+// same order.
+std::vector<ir::Expr> TuneTask::GetLoweredFuncBodyExprs() const {
+  std::vector<ir::Expr> result;
+  // The final size is known up front, so allocate once.
+  result.reserve(lowered_funcs.size());
+  for (const ir::LoweredFunc& func : lowered_funcs) {
+    result.push_back(func->body);
+  }
+  return result;
+}
+
+// Builds the textual key of this task: the target, followed by one line per
+// node of the subgraph in the form
+//   (outputs) = op_name(inputs)
+// where every variable is printed as `id->dtype[shape]`.
+// Aborts (CHECK) when a variable has no entry in shape_dict or dtype_dict.
+std::string TuneTask::SerializeToString(
+    const absl::flat_hash_map<std::string, hlir::framework::shape_t>&
+        shape_dict,
+    const absl::flat_hash_map<std::string, cinn::common::Type>& dtype_dict) {
+  std::stringstream ss;
+  ss << target << "\n\n";  // the key starts with the target
+
+  // Appends a comma-separated list of the variables attached to `edges`;
+  // `from_source` selects the source end (inputs) or the sink end (outputs).
+  auto append_vars =
+      [&](const std::vector<common::Shared<common::GraphEdge>>& edges,
+          bool from_source) {
+        bool is_first = true;
+        for (auto&& link : edges) {
+          const auto* var =
+              from_source
+                  ? link->source()->safe_as<hlir::framework::NodeData>()
+                  : link->sink()->safe_as<hlir::framework::NodeData>();
+          CHECK(var) << "var node invalid";
+          auto shape_it = shape_dict.find(var->id());
+          CHECK(shape_it != shape_dict.end())
+              << "can't find shape of variable:" << var->id();
+          auto dtype_it = dtype_dict.find(var->id());
+          CHECK(dtype_it != dtype_dict.end())
+              << "can't find dtype of variable:" << var->id();
+          if (!is_first) {
+            ss << ", ";
+          }
+          is_first = false;
+          // TODO(CtfGo): the Lower process uses the ids of input/output
+          // NodeData as arguments of the LoweredFunc, so two Nodes of the
+          // same operator still yield different LoweredFuncs. The variable id
+          // is therefore part of the key; otherwise AutoTuner would fetch
+          // wrong TuningRecords from the database. Once the name-related
+          // limit in the Lower process is removed, identical operators should
+          // stop producing duplicate tuning tasks.
+          ss << var->id() << "->" << cinn::common::Type2Str(dtype_it->second)
+             << "[" << utils::Join(shape_it->second, ",") << "]";
+        }
+      };
+
+  // One line per node of the subgraph.
+  ss << "Group {\n";
+  for (auto&& node : subgraph->CollectNodes()) {
+    ss << "  (";
+    append_vars(node->outlinks_in_order(), /*from_source=*/false);
+    ss << ") = " << node->op()->name << "(";
+    append_vars(node->inlinks_in_order(), /*from_source=*/true);
+    ss << ")\n";
+  }
+  ss << "}\n";
+
+  return ss.str();
+}
+
+}  // namespace auto_schedule
+}  // namespace cinn
--- a/paddle/cinn/auto_schedule/task/tune_task.h
+++ b/paddle/cinn/auto_schedule/task/tune_task.h
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <absl/container/flat_hash_map.h>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "paddle/cinn/common/target.h"
+#include "paddle/cinn/common/type.h"
+#include "paddle/cinn/hlir/framework/graph.h"
+#include "paddle/cinn/hlir/framework/node.h"
+#include "paddle/cinn/hlir/framework/op_lowering.h"
+#include "paddle/cinn/ir/ir.h"
+#include "paddle/cinn/ir/ir_base.h"
+#include "paddle/cinn/ir/lowered_func.h"
+#include "paddle/cinn/ir/schedule/ir_schedule.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+// A tuning task: one (possibly fused) sub-graph together with its target, its
+// initial lowered functions and a serialized key describing it.
+class TuneTask {
+ public:
+  TuneTask() = default;
+  explicit TuneTask(std::shared_ptr<hlir::framework::Graph::Group> group)
+      : subgraph(group) {}
+  // Initialize a task: stores `lower_handler`, lowers `subgraph` and fills
+  // `lowered_funcs`, `output_names` and `serialized_key`.
+  // `lower_handler` must not be nullptr.
+  void Initialize(
+      const absl::flat_hash_map<std::string, hlir::framework::shape_t>&
+          shape_dict,
+      const absl::flat_hash_map<std::string, cinn::common::Type>& dtype_dict,
+      hlir::framework::OpLowerer* lower_handler);
+  // Extract the bodies of `lowered_funcs` and return them
+  std::vector<ir::Expr> GetLoweredFuncBodyExprs() const;
+
+  // In CINN, we use hlir::framework::Graph::Group to represent a fused
+  // sub-graph (if an op won't be fused, it will be a Group with size=1).
+  std::shared_ptr<hlir::framework::Graph::Group> subgraph;
+  // Lower handler, not owned. Null until Initialize() is called; the explicit
+  // initializer keeps a default-initialized task from holding an
+  // indeterminate pointer.
+  hlir::framework::OpLowerer* op_lowerer = nullptr;
+  // target of this task
+  common::Target target;
+  // stores the initial (un-optimized) LoweredFuncs
+  std::vector<ir::LoweredFunc> lowered_funcs;
+  // names of the output arguments of lowered_funcs
+  std::unordered_set<std::string> output_names;
+  // serialized string of this task, it contains struct,shape,dtype,input/output
+  // variable name of the subgraph and can be further used to hash
+  std::string serialized_key;
+
+ private:
+  // Serialize this task as a string that contains specific fields of it
+  std::string SerializeToString(
+      const absl::flat_hash_map<std::string, hlir::framework::shape_t>&
+          shape_dict,
+      const absl::flat_hash_map<std::string, cinn::common::Type>& dtype_dict);
+};
+
+}  // namespace auto_schedule
+}  // namespace cinn
--- a/paddle/cinn/auto_schedule/task/tune_task_test.cc
+++ b/paddle/cinn/auto_schedule/task/tune_task_test.cc
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/cinn/auto_schedule/task/tune_task.h"
+
+#include <gtest/gtest.h>
+
+#include <iostream>
+#include <memory>
+#include <vector>
+
+#include "paddle/cinn/auto_schedule/task/task_creator.h"
+#include "paddle/cinn/common/context.h"
+#include "paddle/cinn/common/target.h"
+#include "paddle/cinn/frontend/net_builder.h"
+#include "paddle/cinn/frontend/syntax.h"
+#include "paddle/cinn/hlir/framework/graph.h"
+#include "paddle/cinn/hlir/framework/node.h"
+#include "paddle/cinn/hlir/framework/op_lowering.h"
+#include "paddle/cinn/hlir/framework/pass.h"
+#include "paddle/cinn/hlir/framework/scope.h"
+#include "paddle/cinn/ir/ir_base.h"
+#include "paddle/cinn/ir/schedule/ir_schedule.h"
+#include "paddle/cinn/ir/utils/ir_printer.h"
+#include "paddle/cinn/utils/string.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+using ::cinn::frontend::NetBuilder;
+using ::cinn::frontend::Program;
+using ::cinn::hlir::framework::OpLowerer;
+
+// Builds a program with two float32 inputs "A" and "B" of shape [32, 24] and
+// two chained additions: A + B, then A + (A + B).
+Program CreateAddProgram() {
+  constexpr int kRows = 32;
+  constexpr int kCols = 24;
+
+  NetBuilder builder("net_builder");
+  auto lhs = builder.CreateInput(Float(32), {kRows, kCols}, "A");
+  auto rhs = builder.CreateInput(Float(32), {kRows, kCols}, "B");
+  auto first_sum = builder.Add(lhs, rhs);
+  // The handle is unused, but the call records the second add in the builder.
+  auto second_sum = builder.Add(lhs, first_sum);
+
+  return builder.Build();
+}
+
+// Without any graph pass, each add op becomes its own task; the printed bodies
+// of the lowered functions of both tasks are compared with the expected IR.
+TEST(TuneTask, GraphToUnoptLoweredFunc_NoPass) {
+  // Reset the global name counter before the graph is built; the expected
+  // strings below contain generated names such as var_1 / root_0.
+  Context::Global().ResetNameId();
+#ifdef CINN_WITH_CUDA
+  Target target = common::DefaultNVGPUTarget();
+#else
+  Target target = common::DefaultHostTarget();
+#endif
+  Program prog = CreateAddProgram();
+  auto graph = std::make_shared<hlir::framework::Graph>(prog, target);
+
+  TaskCreator task_creator;
+  std::vector<TuneTask> tasks = task_creator.CreateTuneTaskOpLevel(graph.get());
+  // Two add ops, no fusion: two tasks.
+  ASSERT_EQ(tasks.size(), 2UL);
+
+  const auto& shape_dict = graph->GetAttrs<
+      absl::flat_hash_map<std::string, hlir::framework::shape_t>>("infershape");
+  const auto& dtype_dict =
+      graph->GetAttrs<absl::flat_hash_map<std::string, common::Type>>(
+          "inferdtype");
+  OpLowerer op_lowerer(dtype_dict, shape_dict, target);
+
+  // Concatenate the printed body of every lowered function, one per line.
+  std::stringstream ss;
+  for (TuneTask& task : tasks) {
+    task.Initialize(shape_dict, dtype_dict, &op_lowerer);
+
+    std::vector<ir::Expr> exprs = task.GetLoweredFuncBodyExprs();
+    VLOG(6) << "ir:Expr is: ";
+    for (const ir::Expr& e : exprs) {
+      VLOG(6) << e;
+      ss << e << std::endl;
+    }
+  }
+
+  std::string expr_str = ss.str();
+  // The expected text differs between the CUDA and the host build: the CUDA
+  // variant indexes tensors with the loop variables (i, j), the host variant
+  // with the bound axis variables (i0, i1).
+#ifdef CINN_WITH_CUDA
+  std::string target_str = R"ROC(
+{
+  ScheduleBlock(root)
+  {
+    serial for (i, 0, 32)
+    {
+      serial for (j, 0, 24)
+      {
+        ScheduleBlock(var_1)
+        {
+          i0, i1 = axis.bind(i, j)
+          var_1[i, j] = (A[i, j] + B[i, j])
+        }
+      }
+    }
+  }
+}
+{
+  ScheduleBlock(root_0)
+  {
+    serial for (i, 0, 32)
+    {
+      serial for (j, 0, 24)
+      {
+        ScheduleBlock(var_2)
+        {
+          i0_0, i1_0 = axis.bind(i, j)
+          var_2[i, j] = (A[i, j] + var_1[i, j])
+        }
+      }
+    }
+  }
+}
+)ROC";
+#else
+  std::string target_str = R"ROC(
+{
+  ScheduleBlock(root)
+  {
+    serial for (i, 0, 32)
+    {
+      serial for (j, 0, 24)
+      {
+        ScheduleBlock(var_1)
+        {
+          i0, i1 = axis.bind(i, j)
+          var_1[i0, i1] = (A[i0, i1] + B[i0, i1])
+        }
+      }
+    }
+  }
+}
+{
+  ScheduleBlock(root_0)
+  {
+    serial for (i, 0, 32)
+    {
+      serial for (j, 0, 24)
+      {
+        ScheduleBlock(var_2)
+        {
+          i0_0, i1_0 = axis.bind(i, j)
+          var_2[i0_0, i1_0] = (A[i0_0, i1_0] + var_1[i0_0, i1_0])
+        }
+      }
+    }
+  }
+}
+)ROC";
+#endif
+
+  // Both strings are trimmed before the comparison.
+  EXPECT_EQ(utils::Trim(target_str), utils::Trim(expr_str));
+}
+
+// After OpFusionPass the two add ops form a single task; the printed body of
+// its lowered function is compared with the expected IR.
+TEST(TuneTask, GraphToUnoptLoweredFunc_ApplyPass) {
+  // Reset the global name counter before the graph is built; the expected
+  // strings below contain generated names such as var_1 / i0_0.
+  Context::Global().ResetNameId();
+#ifdef CINN_WITH_CUDA
+  Target target = common::DefaultNVGPUTarget();
+#else
+  Target target = common::DefaultHostTarget();
+#endif
+  Program prog = CreateAddProgram();
+  auto graph = std::make_shared<hlir::framework::Graph>(prog, target);
+  ApplyPass(graph.get(), "OpFusionPass");
+
+  TaskCreator task_creator;
+  std::vector<TuneTask> tasks = task_creator.CreateTuneTaskOpLevel(graph.get());
+
+  // Both add ops end up in one task after the fusion pass.
+  ASSERT_EQ(tasks.size(), 1UL);
+
+  const auto& shape_dict = graph->GetAttrs<
+      absl::flat_hash_map<std::string, hlir::framework::shape_t>>("infershape");
+  const auto& dtype_dict =
+      graph->GetAttrs<absl::flat_hash_map<std::string, common::Type>>(
+          "inferdtype");
+
+  OpLowerer op_lowerer(dtype_dict, shape_dict, target);
+
+  // Concatenate the printed body of every lowered function, one per line.
+  std::stringstream ss;
+  for (TuneTask& task : tasks) {
+    task.Initialize(shape_dict, dtype_dict, &op_lowerer);
+
+    std::vector<ir::Expr> exprs = task.GetLoweredFuncBodyExprs();
+    VLOG(6) << "ir:Expr is: ";
+    for (const ir::Expr& e : exprs) {
+      VLOG(6) << e;
+      ss << e << std::endl;
+    }
+  }
+
+  std::string expr_str = ss.str();
+  // The expected text differs between the CUDA and the host build: the CUDA
+  // variant indexes tensors with the loop variables (i, j), the host variant
+  // with the bound axis variables (i0, i1).
+#ifdef CINN_WITH_CUDA
+  std::string target_str = R"ROC(
+{
+  ScheduleBlock(root)
+  {
+    {
+      serial for (i, 0, 32)
+      {
+        serial for (j, 0, 24)
+        {
+          ScheduleBlock(var_1)
+          {
+            i0, i1 = axis.bind(i, j)
+            var_1[i, j] = (A[i, j] + B[i, j])
+          }
+        }
+      }
+      serial for (i, 0, 32)
+      {
+        serial for (j, 0, 24)
+        {
+          ScheduleBlock(var_2)
+          {
+            i0_0, i1_0 = axis.bind(i, j)
+            var_2[i, j] = (A[i, j] + var_1[i, j])
+          }
+        }
+      }
+    }
+  }
+}
+)ROC";
+
+#else
+  std::string target_str = R"ROC(
+{
+  ScheduleBlock(root)
+  {
+    {
+      serial for (i, 0, 32)
+      {
+        serial for (j, 0, 24)
+        {
+          ScheduleBlock(var_1)
+          {
+            i0, i1 = axis.bind(i, j)
+            var_1[i0, i1] = (A[i0, i1] + B[i0, i1])
+          }
+        }
+      }
+      serial for (i, 0, 32)
+      {
+        serial for (j, 0, 24)
+        {
+          ScheduleBlock(var_2)
+          {
+            i0_0, i1_0 = axis.bind(i, j)
+            var_2[i0_0, i1_0] = (A[i0_0, i1_0] + var_1[i0_0, i1_0])
+          }
+        }
+      }
+    }
+  }
+}
+)ROC";
+#endif
+
+  // Both strings are trimmed before the comparison.
+  EXPECT_EQ(utils::Trim(target_str), utils::Trim(expr_str));
+}
+
+// Checks the serialized key of a single-op task and of the fused task created
+// after OpFusionPass.
+TEST(TuneTask, SerializeToString) {
+  // Reset the global name counter before the graph is built; the expected
+  // keys below contain generated names such as var_1 / var_2.
+  Context::Global().ResetNameId();
+#ifdef CINN_WITH_CUDA
+  Target target = common::DefaultNVGPUTarget();
+#else
+  Target target = common::DefaultHostTarget();
+#endif
+  Program prog = CreateAddProgram();
+  auto graph = std::make_shared<hlir::framework::Graph>(prog, target);
+
+  TaskCreator task_creator;
+  std::vector<TuneTask> single_tasks =
+      task_creator.CreateTuneTaskOpLevel(graph.get());
+
+  const auto& shape_dict = graph->GetAttrs<
+      absl::flat_hash_map<std::string, hlir::framework::shape_t>>("infershape");
+  const auto& dtype_dict =
+      graph->GetAttrs<absl::flat_hash_map<std::string, common::Type>>(
+          "inferdtype");
+  OpLowerer op_lowerer(dtype_dict, shape_dict, target);
+  // Before fusion: one task per add op.
+  ASSERT_EQ(single_tasks.size(), 2UL);
+  for (auto&& task : single_tasks) {
+    task.Initialize(shape_dict, dtype_dict, &op_lowerer);
+  }
+
+  // Only the target line of the expected key differs between the two builds.
+#ifdef CINN_WITH_CUDA
+  std::string single_add_str = R"ROC(Target<linux,nvgpu,64>
+
+Group {
+  (var_1->float32[32,24]) = elementwise_add(A->float32[32,24], B->float32[32,24])
+}
+)ROC";
+#else
+  std::string single_add_str = R"ROC(Target<linux,x86,64>
+
+Group {
+  (var_1->float32[32,24]) = elementwise_add(A->float32[32,24], B->float32[32,24])
+}
+)ROC";
+#endif
+  EXPECT_EQ(single_tasks[0].serialized_key, single_add_str);
+
+  // Fuse the graph and create the tasks again: a single task is expected,
+  // whose key lists both add ops.
+  ApplyPass(graph.get(), "OpFusionPass");
+  std::vector<TuneTask> fused_tasks =
+      task_creator.CreateTuneTaskOpLevel(graph.get());
+  ASSERT_EQ(fused_tasks.size(), 1UL);
+  fused_tasks[0].Initialize(shape_dict, dtype_dict, &op_lowerer);
+
+#ifdef CINN_WITH_CUDA
+  std::string fused_expected_str = R"ROC(Target<linux,nvgpu,64>
+
+Group {
+  (var_1->float32[32,24]) = elementwise_add(A->float32[32,24], B->float32[32,24])
+  (var_2->float32[32,24]) = elementwise_add(A->float32[32,24], var_1->float32[32,24])
+}
+)ROC";
+#else
+  std::string fused_expected_str = R"ROC(Target<linux,x86,64>
+
+Group {
+  (var_1->float32[32,24]) = elementwise_add(A->float32[32,24], B->float32[32,24])
+  (var_2->float32[32,24]) = elementwise_add(A->float32[32,24], var_1->float32[32,24])
+}
+)ROC";
+#endif
+  EXPECT_EQ(fused_tasks[0].serialized_key, fused_expected_str);
+}
+
+}  // namespace auto_schedule
+}  // namespace cinn
--- a/paddle/cinn/auto_schedule/task_scheduler/CMakeLists.txt
+++ b/paddle/cinn/auto_schedule/task_scheduler/CMakeLists.txt
+# Build rules for the task scheduler sources of auto_schedule.
+# NOTE(review): core_gather_headers, gather_srcs and cinn_cc_test are project
+# macros defined elsewhere; the notes below describe only the arguments.
+core_gather_headers()
+
+# Scheduler implementation files passed to gather_srcs for cinnapi_src.
+gather_srcs(cinnapi_src SRCS task_scheduler.cc round_robin.cc
+            efficiency_priority.cc)
+
+# Unit test target built from task_scheduler_test.cc, depending on cinncore.
+cinn_cc_test(test_task_scheduler SRCS task_scheduler_test.cc DEPS cinncore)
--- a/paddle/cinn/auto_schedule/task_scheduler/efficiency_priority.cc
+++ b/paddle/cinn/auto_schedule/task_scheduler/efficiency_priority.cc
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/cinn/auto_schedule/task_scheduler/efficiency_priority.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+// Scans forward from the current position and returns the id of the first
+// task accepted by IsTaskToTune, leaving the cursor just past it.
+// Returns -1 once the cursor has reached the end of the task list.
+int EfficiencyPriority::NextTaskId() {
+  for (; cur_task_id_ < tasks_->size(); ++cur_task_id_) {
+    const TuneTask& candidate = tasks_->at(cur_task_id_);
+    if (!IsTaskToTune(&candidate)) {
+      continue;
+    }
+    // Hand out this id and advance so the next call starts after it.
+    return cur_task_id_++;
+  }
+  return -1;
+}
+
+// Tells whether a task should be tuned. The decision currently depends only
+// on the configuration: `task` itself is not inspected.
+bool EfficiencyPriority::IsTaskToTune(const TuneTask* task) {
+  const bool threshold_is_positive = config_.minimum_gain_threshold > 0.0;
+  return threshold_is_positive;
+}
+
+}  // namespace auto_schedule
+}  // namespace cinn
--- a/paddle/cinn/auto_schedule/task_scheduler/efficiency_priority.h
+++ b/paddle/cinn/auto_schedule/task_scheduler/efficiency_priority.h
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+
+#include "paddle/cinn/auto_schedule/task_scheduler/task_scheduler.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+// Schedule tasks with efficiency_priority strategy, that
+// is picking a task with the maximum earnings ratio.
+// Scheduler registered under the strategy name "efficiency_priority".
+// NextTaskId() walks the tasks in order and hands out those accepted by
+// IsTaskToTune().
+class EfficiencyPriority : public TaskScheduler {
+ public:
+  // `tasks` is referenced by the base class, not copied.
+  EfficiencyPriority(const std::vector<TuneTask>& tasks, const Config& config)
+      : TaskScheduler(tasks, config) {}
+
+  // Name of this schedule strategy.
+  const char* Name() const override { return "efficiency_priority"; }
+
+  // Returns the id of the next task to tune, or -1 when none is left.
+  int NextTaskId() override;
+
+ private:
+  // Whether the given task should be tuned.
+  bool IsTaskToTune(const TuneTask* task);
+};
+
+}  // namespace auto_schedule
+}  // namespace cinn
--- a/paddle/cinn/auto_schedule/task_scheduler/round_robin.cc
+++ b/paddle/cinn/auto_schedule/task_scheduler/round_robin.cc
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/cinn/auto_schedule/task_scheduler/round_robin.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+// Hands out task ids one after another in list order; returns -1 once every
+// task has been handed out.
+int RoundRobin::NextTaskId() {
+  const bool exhausted = cur_task_id_ >= tasks_->size();
+  if (exhausted) {
+    return -1;
+  }
+  return cur_task_id_++;
+}
+
+}  // namespace auto_schedule
+}  // namespace cinn
--- a/paddle/cinn/auto_schedule/task_scheduler/round_robin.h
+++ b/paddle/cinn/auto_schedule/task_scheduler/round_robin.h
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+
+#include "paddle/cinn/auto_schedule/task_scheduler/task_scheduler.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+// Schedule tasks with round_robin strategy, that
+// is picking a task to tune once a time iteratively.
+// Scheduler registered under the strategy name "round_robin": tasks are
+// handed out one at a time in list order.
+class RoundRobin : public TaskScheduler {
+ public:
+  // `tasks` is referenced by the base class, not copied.
+  RoundRobin(const std::vector<TuneTask>& tasks, const Config& config)
+      : TaskScheduler(tasks, config) {}
+
+  // Name of this schedule strategy.
+  const char* Name() const override { return "round_robin"; }
+
+  // Returns the id of the next task to tune, or -1 when none is left.
+  int NextTaskId() override;
+};
+
+}  // namespace auto_schedule
+}  // namespace cinn
--- a/paddle/cinn/auto_schedule/task_scheduler/task_scheduler.cc
+++ b/paddle/cinn/auto_schedule/task_scheduler/task_scheduler.cc
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/cinn/auto_schedule/task_scheduler/task_scheduler.h"
+
+#include <algorithm>
+
+#include "paddle/cinn/auto_schedule/task/tune_task.h"
+#include "paddle/cinn/auto_schedule/task_scheduler/efficiency_priority.h"
+#include "paddle/cinn/auto_schedule/task_scheduler/round_robin.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+// Factory: creates the scheduler that implements `strategy`
+// ("round_robin" or "efficiency_priority").
+// Aborts when `tasks` is empty or the strategy name is unknown.
+std::unique_ptr<TaskScheduler> TaskScheduler::Make(
+    const std::vector<TuneTask>& tasks,
+    const Config& config,
+    const std::string& strategy) {
+  CHECK_GT(tasks.size(), 0) << "Empty task list";
+
+  if (strategy == "round_robin") {
+    return std::make_unique<RoundRobin>(tasks, config);
+  }
+  if (strategy == "efficiency_priority") {
+    return std::make_unique<EfficiencyPriority>(tasks, config);
+  }
+
+  // No known strategy matched.
+  LOG(FATAL) << "Unimplemented strategy:" << strategy;
+  return nullptr;
+}
+
+// Stores a copy of `config` and a pointer to `tasks`; scheduling starts at
+// task id 0. Only the address of `tasks` is kept, so the vector must outlive
+// this scheduler.
+// The initializers are listed in member declaration order (config_,
+// cur_task_id_, tasks_), which is the order they actually run in.
+TaskScheduler::TaskScheduler(const std::vector<TuneTask>& tasks,
+                             const Config& config)
+    : config_(config), cur_task_id_(0), tasks_(&tasks) {}
+
+// Moves the scheduling cursor back to the first task.
+void TaskScheduler::Reset() {
+  cur_task_id_ = 0;
+}
+
+}  // namespace auto_schedule
+}  // namespace cinn
--- a/paddle/cinn/auto_schedule/task_scheduler/task_scheduler.h
+++ b/paddle/cinn/auto_schedule/task_scheduler/task_scheduler.h
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <functional>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "paddle/cinn/auto_schedule/task/task_optimizer.h"
+#include "paddle/cinn/auto_schedule/task/tune_task.h"
+#include "paddle/cinn/auto_schedule/tuning.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+// Class for scheduling tasks to perform auto-tune
+// Class for scheduling tasks to perform auto-tune
+class TaskScheduler {
+ public:
+  // All configs for different schedule strategies
+  // will be defined here together.
+  struct Config {
+    // The minimum threshold of earnings ratio, used by EfficiencyPriority
+    float minimum_gain_threshold = 0.0;
+  };
+
+  // Schedulers are created by Make() and destroyed through a
+  // std::unique_ptr<TaskScheduler>, so the destructor must be virtual for
+  // deletion through the base pointer to be well-defined.
+  virtual ~TaskScheduler() = default;
+
+  // Create a TaskScheduler with the specific strategy name
+  // and necessary construct parameters.
+  // Only the address of `tasks` is stored, so `tasks` must outlive the
+  // returned scheduler.
+  static std::unique_ptr<TaskScheduler> Make(
+      const std::vector<TuneTask>& tasks,
+      const Config& config,
+      const std::string& strategy = "round_robin");
+
+  // Reset associated states to schedule at the beginning
+  void Reset();
+
+  // Return the name of schedule strategy
+  virtual const char* Name() const = 0;
+
+  // Select a task to tune; the visible strategies return -1 when no task
+  // is left.
+  virtual int NextTaskId() = 0;
+
+ protected:
+  // A taskScheduler object should be created with the static function Make
+  TaskScheduler(const std::vector<TuneTask>& tasks, const Config& config);
+
+  // The config for scheduling strategy
+  Config config_;
+  // The current task id to be estimated
+  int cur_task_id_;
+  // The pointer refers to all tasks (not owned)
+  const std::vector<TuneTask>* tasks_;
+};
+
+}  // namespace auto_schedule
+}  // namespace cinn
--- a/paddle/cinn/auto_schedule/task_scheduler/task_scheduler_test.cc
+++ b/paddle/cinn/auto_schedule/task_scheduler/task_scheduler_test.cc
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/cinn/auto_schedule/task_scheduler/task_scheduler.h"
+
+#include <gtest/gtest.h>
+
+#include <type_traits>
+
+#include "paddle/cinn/auto_schedule/task_scheduler/efficiency_priority.h"
+#include "paddle/cinn/auto_schedule/task_scheduler/round_robin.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+// Make() must build the scheduler matching the requested strategy name, with
+// "round_robin" used when no strategy is given.
+TEST(TaskScheduler, Make) {
+  std::vector<TuneTask> tasks(3);
+  TaskScheduler::Config config;
+
+  // No strategy argument: the default strategy is used.
+  std::unique_ptr<TaskScheduler> default_scheduler =
+      TaskScheduler::Make(tasks, config);
+  ASSERT_STREQ(default_scheduler->Name(), "round_robin");
+
+  std::unique_ptr<TaskScheduler> priority_scheduler =
+      TaskScheduler::Make(tasks, config, "efficiency_priority");
+  ASSERT_STREQ(priority_scheduler->Name(), "efficiency_priority");
+}
+
+// Round robin hands out ids in order, and Reset() restarts from id 0.
+TEST(RoundRobinScheduler, NextTaskId) {
+  std::vector<TuneTask> tasks(3);
+  TaskScheduler::Config config;
+  auto scheduler = TaskScheduler::Make(tasks, config);
+
+  // The first two picks are ids 0 and 1.
+  for (int expected_id = 0; expected_id < 2; ++expected_id) {
+    ASSERT_EQ(expected_id, scheduler->NextTaskId());
+  }
+
+  scheduler->Reset();
+  ASSERT_EQ(0, scheduler->NextTaskId());
+}
+
+// With a non-positive gain threshold no task qualifies, so the scheduler
+// reports -1 right away.
+TEST(EfficiencyPriorityScheduler, NextTaskId) {
+  std::vector<TuneTask> tasks(3);
+
+  TaskScheduler::Config config;
+  config.minimum_gain_threshold = -1.0;
+
+  auto scheduler = TaskScheduler::Make(tasks, config, "efficiency_priority");
+  ASSERT_EQ(-1, scheduler->NextTaskId());
+}
+
+}  // namespace auto_schedule
+}  // namespace cinn
--- a/paddle/cinn/auto_schedule/tests/CMakeLists.txt
+++ b/paddle/cinn/auto_schedule/tests/CMakeLists.txt
+# The performance comparison test is only registered when CUDA is enabled and
+# cuDNN is not. It is given the ResNet50 model directory under
+# THIRD_PARTY_PATH through --resnet50_model_dir.
+if(WITH_CUDA AND (NOT WITH_CUDNN))
+  cinn_cc_test(
+    test_performance_comparison
+    ARGS
+    "--resnet50_model_dir=${THIRD_PARTY_PATH}/ResNet50"
+    SRCS
+    performance_comparison_test.cc
+    DEPS
+    cinncore
+    test_program_builder)
+endif()
--- a/paddle/cinn/auto_schedule/tests/performance_comparison_test.cc
+++ b/paddle/cinn/auto_schedule/tests/performance_comparison_test.cc
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+#include <bitset>
+#include <iostream>
+
+#include "paddle/cinn/auto_schedule/auto_tuner.h"
+#include "paddle/cinn/common/target.h"
+#include "paddle/cinn/frontend/net_builder.h"
+#include "paddle/cinn/frontend/optimize.h"
+#include "paddle/cinn/frontend/paddle_model_convertor.h"
+#include "paddle/cinn/frontend/syntax.h"
+#include "paddle/cinn/hlir/framework/graph_compiler.h"
+#include "paddle/cinn/hlir/framework/node.h"
+#include "paddle/cinn/hlir/framework/pass.h"
+#include "paddle/cinn/ir/ir_base.h"
+#include "paddle/cinn/runtime/flags.h"
+#include "paddle/cinn/utils/data_util.h"
+#include "test/cpp/cinn/program_builder.h"
+
+/* This test is a tool to evaluate or compare the performance of 3 schedules
+ * (no schedule, manual schedule, auto-schedule). Which schedules are evaluated
+ * is selected through `FLAGS_evaluate_knobs`, and which operator or model is
+ * tested is selected through `--gtest_filter=PerformanceTester.xx`. For
+ * example,
+ * `FLAGS_evaluate_knobs=4 --gtest_filter=PerformanceTester.Matmul`
+ * evaluates the auto-schedule on the Matmul operator. Refer to the
+ * explanation of the following flags and parameters for more detail.
+ */
+
+// Directory of the Paddle ResNet50 model; defaults to "./ResNet50".
+DEFINE_string(resnet50_model_dir,
+              "./ResNet50",
+              "the path to paddle model resnet50.");
+// Bit mask that controls which schedule tests will be run:
+//   bit 0 (value 1 = "001") runs the no-schedule test,
+//   bit 1 (value 2 = "010") runs the manual-schedule test,
+//   bit 2 (value 4 = "100") runs the auto-schedule test.
+// The default value is -1, which means this flag is disabled and does not
+// set the options.
+DEFINE_int32(evaluate_knobs,
+             -1,
+             "the options to control which schedule tests will be run.");
+// Only declared in this file; the flag is defined elsewhere.
+DECLARE_double(cinn_infer_model_version);
+
+namespace cinn {
+namespace auto_schedule {
+
+using ::cinn::hlir::framework::BuildScope;
+using ::cinn::hlir::framework::Graph;
+using ::cinn::hlir::framework::GraphCompiler;
+using ::cinn::hlir::framework::Instruction;
+using ::cinn::hlir::framework::Scope;
+
+// Test fixture that compiles a frontend program through the no-schedule,
+// manual-schedule and auto-schedule build paths and optionally executes the
+// resulting runtime program.
+class PerformanceTester : public ::testing::Test {
+ public:
+  struct Options {
+    // number of times the compiled runtime program is executed.
+    int repeat_times = 2;
+    // value assigned to TuningOptions::num_tuning_rounds for auto tuning
+    int num_tuning_rounds = 2;
+    // one bit per schedule to be measured, refer to the explanation of
+    // FLAGS_evaluate_knobs
+    std::bitset<3> evaluate_knobs = 0UL;
+  };
+
+  // Builds `program` with every schedule selected by the knobs and executes
+  // each built program options_.repeat_times times. When no knob is set, the
+  // no/manual schedule programs are built but not executed.
+  void Evaluate(const frontend::Program& program) {
+    // a non-negative flag overrides the knobs stored in options_
+    if (FLAGS_evaluate_knobs >= 0) {
+      options_.evaluate_knobs = FLAGS_evaluate_knobs;
+    }
+    VLOG(3) << "evaluate_knobs = " << options_.evaluate_knobs;
+
+    // Builds a fresh graph for one schedule, compiles it through `builder`
+    // and, unless `run_program` is false, executes the result.
+    auto run_case = [this, &program](const std::string& case_name,
+                                     BuildRuntimeProgramFn builder,
+                                     bool run_program = true) {
+      Context::Global().ResetNameId();
+      VLOG(3) << "Initialize graph.";
+      auto ir_graph = std::make_shared<Graph>(program, target_);
+      VLOG(3) << "Apply graph pass.";
+      hlir::framework::ApplyPass(ir_graph.get(), "OpFusionPass");
+      VLOG(3) << "Build " << case_name << " program.";
+      auto var_scope = BuildScope(target_, ir_graph);
+      auto compiler =
+          std::make_unique<GraphCompiler>(target_, var_scope, ir_graph);
+      auto built_program = (this->*builder)(ir_graph.get(), compiler.get());
+      if (!run_program) {
+        return;
+      }
+      VLOG(3) << "Execute " << case_name << " program.";
+      built_program->ExecuteTest(options_.repeat_times);
+    };
+
+    const auto& knobs = options_.evaluate_knobs;
+    if (knobs.none()) {
+      // nothing is selected: only build the no/manual schedule cases to
+      // ensure their build functions are valid
+      run_case("no schedule",
+               &PerformanceTester::BuildNoScheduleProgram,
+               /* execute */ false);
+      run_case("manual schedule",
+               &PerformanceTester::BuildManualScheduleProgram,
+               /* execute */ false);
+      return;
+    }
+
+    if (knobs.test(0)) {
+      run_case("no schedule", &PerformanceTester::BuildNoScheduleProgram);
+    }
+    if (knobs.test(1)) {
+      run_case("manual schedule",
+               &PerformanceTester::BuildManualScheduleProgram);
+    }
+    if (knobs.test(2)) {
+      run_case("auto schedule", &PerformanceTester::BuildAutoScheduleProgram);
+    }
+  }
+
+ protected:
+  // Signature shared by the three Build*Program member functions below.
+  using BuildRuntimeProgramFn = std::unique_ptr<hlir::framework::Program> (
+      PerformanceTester::*)(Graph*, GraphCompiler*);
+
+  // Lowers every fusion group with both op schedule and group schedule
+  // disabled and compiles the lowered functions.
+  std::unique_ptr<hlir::framework::Program> BuildNoScheduleProgram(
+      Graph* graph, GraphCompiler* graph_compiler) {
+    using DtypeDict = absl::flat_hash_map<std::string, common::Type>;
+    using ShapeDict =
+        absl::flat_hash_map<std::string, hlir::framework::shape_t>;
+    const auto& dtypes = graph->GetAttrs<DtypeDict>("inferdtype");
+    const auto& shapes = graph->GetAttrs<ShapeDict>("infershape");
+
+    auto lowerer = std::make_shared<hlir::framework::OpLowerer>(
+        dtypes, shapes, target_);
+
+    GraphCompiler::CompileOptions build_options;
+    build_options.with_instantiate_variables = true;
+
+    // make sure the graph carries fusion groups before lowering them
+    if (graph->fusion_groups.empty()) {
+      hlir::framework::ApplyPasses(graph, {"BuildNonFusedGroupsPass"});
+    }
+    build_options.groups = graph->fusion_groups;
+
+    for (auto fusion_group : graph->fusion_groups) {
+      build_options.lowered_funcs.push_back(
+          lowerer->Lower(fusion_group,
+                         /*apply_op_schedule = */ false,
+                         /*apply_group_schedule=*/false));
+    }
+
+    VLOG(3) << "===========================No Schedule LoweredFunc "
+               "Begin===========================";
+    for (const auto& group_funcs : build_options.lowered_funcs) {
+      for (const auto& lowered : group_funcs) {
+        VLOG(3) << lowered;
+      }
+    }
+    VLOG(3) << "===========================No Schedule LoweredFunc "
+               "End=============================";
+
+    return graph_compiler->Build(build_options).runtime_program;
+  }
+
+  // Compiles with the argument-less GraphCompiler::Build(); `graph` is unused
+  // and only present to match BuildRuntimeProgramFn.
+  std::unique_ptr<hlir::framework::Program> BuildManualScheduleProgram(
+      Graph* graph, GraphCompiler* graph_compiler) {
+    return graph_compiler->Build();
+  }
+
+  // Tunes the graph with AutoTuner and compiles with the tuning result
+  // applied to the compile options.
+  std::unique_ptr<hlir::framework::Program> BuildAutoScheduleProgram(
+      Graph* graph, GraphCompiler* graph_compiler) {
+    TuningOptions search_options;
+    search_options.num_tuning_rounds = options_.num_tuning_rounds;
+    search_options.num_measure_trials = 2;
+    search_options.num_samples_per_iteration = 2;
+
+    AutoTuner::Config tuner_config;
+    auto auto_tuner = std::make_unique<AutoTuner>(target_, graph);
+    auto_tuner->Initialize(tuner_config, graph_compiler);
+    TuningResult tuned = auto_tuner->Tune(search_options);
+
+    GraphCompiler::CompileOptions build_options;
+    build_options.with_instantiate_variables = true;
+    build_options.Apply(tuned);
+
+    VLOG(3) << "===========================Auto Schedule LoweredFunc "
+               "Begin===========================";
+    for (const auto& group_funcs : build_options.lowered_funcs) {
+      for (const auto& lowered : group_funcs) {
+        VLOG(3) << lowered;
+      }
+    }
+    VLOG(3) << "===========================Auto Schedule LoweredFunc "
+               "End=============================";
+
+    return graph_compiler->Build(build_options).runtime_program;
+  }
+
+  // target every graph in this fixture is built and compiled for
+#ifdef CINN_WITH_CUDA
+  Target target_ = common::DefaultNVGPUTarget();
+#else
+  Target target_ = common::DefaultHostTarget();
+#endif
+  Options options_;
+};
+
+constexpr int batch_size = 2;  // leading dimension shared by the tests below
+
+// Evaluates the "mul" operator with X = {32, 16} and Y = {16, 32}.
+TEST_F(PerformanceTester, Mul) {
+  tests::OpBuilder op_builder("mul");
+  Evaluate(op_builder.Build({{"X", {32, 16}}, {"Y", {16, 32}}}));
+}
+
+// Evaluates "elementwise_add" on two {1, 56, 56, 256} inputs.
+TEST_F(PerformanceTester, Add) {
+  tests::OpBuilder op_builder("elementwise_add");
+  Evaluate(
+      op_builder.Build({{"X", {1, 56, 56, 256}}, {"Y", {1, 56, 56, 256}}}));
+}
+
+// Evaluates "matmul" with X = {batch_size, 2048} and Y = {2048, 1000}.
+TEST_F(PerformanceTester, Matmul) {
+  tests::OpBuilder op_builder("matmul");
+  Evaluate(
+      op_builder.Build({{"X", {batch_size, 2048}}, {"Y", {2048, 1000}}}));
+}
+
+// Evaluates "relu" on a {batch_size, 64, 56, 56} input.
+TEST_F(PerformanceTester, Relu) {
+  tests::OpBuilder op_builder("relu");
+  Evaluate(op_builder.Build({{"X", {batch_size, 64, 56, 56}}}));
+}
+
+// Evaluates "conv2d" with a {64, 3, 7, 7} weight on a
+// {batch_size, 3, 224, 224} input.
+TEST_F(PerformanceTester, Conv2d) {
+  std::vector<int> stride_attr = {2, 2};
+  std::vector<int> padding_attr = {3, 3};
+  std::vector<int> dilation_attr = {1, 1};
+  int group_count = 1;
+  std::string conv_kind = "forward";
+  std::string layout = "NCHW";
+  std::string padding_algo = "EXPLICIT";
+
+  tests::OpBuilder op_builder("conv2d");
+  Evaluate(op_builder.Build(
+      {{"X", {batch_size, 3, 224, 224}}, {"W", {64, 3, 7, 7}}},
+      {{"stride", stride_attr},
+       {"padding", padding_attr},
+       {"dilation", dilation_attr},
+       {"groups", group_count},
+       {"conv_type", conv_kind},
+       {"data_format", layout},
+       {"padding_algorithm", padding_algo}}));
+}
+
+// Evaluates a max "pool2d" with a 3x3 kernel and stride 2 on a
+// {batch_size, 64, 112, 112} input.
+TEST_F(PerformanceTester, Pool2d) {
+  // the input shape is spelled out directly in the Build() call below, so no
+  // separate shape vector is kept here
+  std::string pooling_type = "max";
+  std::vector<int> ksize{3, 3};
+  std::vector<int> strides{2, 2};
+  std::vector<int> paddings{1, 1, 1, 1};
+  bool ceil_mode = false;
+  bool exclusive = true;
+  bool global_pooling = false;
+  std::string data_format = "NCHW";
+  bool adaptive = false;
+  std::string padding_algorithm = "EXPLICIT";
+
+  Evaluate(tests::OpBuilder("pool2d").Build(
+      {{"X", {batch_size, 64, 112, 112}}},
+      {{"pool_type", pooling_type},
+       {"kernel_size", ksize},
+       {"stride_size", strides},
+       {"padding_size", paddings},
+       {"ceil_mode", ceil_mode},
+       {"exclusive", exclusive},
+       {"global_pooling", global_pooling},
+       {"data_format", data_format},
+       {"adaptive", adaptive},
+       {"padding_algorithm", padding_algorithm}}));
+}
+
+// Evaluates "batch_norm" on a {batch_size, 64, 112, 112} input with
+// 64-element scale, bias, mean and variance inputs.
+TEST_F(PerformanceTester, BatchNorm) {
+  // all shapes are spelled out directly in the Build() call below, so no
+  // separate shape vectors are kept here
+  float epsilon = 1e-5f;
+  float momentum = 0.9f;
+  // held by value rather than as a reference bound to a temporary
+  const std::string data_layout = "NCHW";
+
+  Evaluate(tests::OpBuilder("batch_norm")
+               .Build({{"X", {batch_size, 64, 112, 112}},
+                       {"scale", {64}},
+                       {"bias", {64}},
+                       {"mean", {64}},
+                       {"variance", {64}}},
+                      {{"epsilon", epsilon},
+                       {"momentum", momentum},
+                       {"data_layout", data_layout}}));
+}
+
+// Evaluates "reshape" from {batch_size, 2048, 1, 1} to {batch_size, 2048}.
+TEST_F(PerformanceTester, Reshape) {
+  std::vector<int32_t> target_shape = {batch_size, 2048};
+
+  tests::OpBuilder op_builder("reshape");
+  Evaluate(op_builder.Build({{"X", {batch_size, 2048, 1, 1}}},
+                            {{"shape", target_shape}}));
+}
+
+// Evaluates "softmax" over the last axis of a {batch_size, 1000} input.
+TEST_F(PerformanceTester, Softmax) {
+  std::vector<int> axes_attr = {-1};
+  std::string mode_attr = "fast";
+  std::string layout = "AnyLayout";
+
+  tests::OpBuilder op_builder("softmax");
+  Evaluate(op_builder.Build({{"X", {batch_size, 1000}}},
+                            {{"axes", axes_attr},
+                             {"mode", mode_attr},
+                             {"data_format", layout}}));
+}
+
+// Evaluates "scale" (scale 1.0, bias 0.0) on a {batch_size, 1000} input.
+TEST_F(PerformanceTester, Scale) {
+  float scale_attr = 1.0f;
+  float bias_attr = 0.0f;
+  bool bias_last = true;
+
+  tests::OpBuilder op_builder("scale");
+  Evaluate(op_builder.Build({{"X", {batch_size, 1000}}},
+                            {{"scale", scale_attr},
+                             {"bias", bias_attr},
+                             {"bias_after_scale", bias_last}}));
+}
+
+// Evaluates "lookup_table" with a {50001, 768} table and int64 ids of shape
+// {10, 128, 1}.
+TEST_F(PerformanceTester, LookupTable) {
+  int64_t pad_index = -1;
+
+  tests::OpBuilder op_builder("lookup_table");
+  Evaluate(op_builder.Build(
+      {{"table", {50001, 768}}, {"ids", {10, 128, 1}, common::Int(64)}},
+      {{"padding_idx", pad_index}}));
+}
+
+// Evaluates "gather" along axis 3 of a {10, 12, 128, 512} operand with an
+// int32 index of shape {1, 1, 1, 128}.
+TEST_F(PerformanceTester, Gather) {
+  int gather_axis = 3;
+
+  tests::OpBuilder op_builder("gather");
+  Evaluate(op_builder.Build({{"operand", {10, 12, 128, 512}},
+                             {"index", {1, 1, 1, 128}, common::Int(32)}},
+                            {{"axis", gather_axis}}));
+}
+
+// paddle model test
+// Loads the paddle ResNet50 model from FLAGS_resnet50_model_dir and evaluates
+// it with a {batch_size, 3, 224, 224} feed named "inputs".
+TEST_F(PerformanceTester, ResNet50) {
+  CHECK_NE(FLAGS_resnet50_model_dir, "");
+  FLAGS_cinn_infer_model_version = 1.0;
+  std::unordered_map<std::string, std::vector<int64_t>> feeds = {
+      {"inputs", {batch_size, 3, 224, 224}}};
+  // Convert the model for the same target the fixture builds and compiles
+  // for, instead of a hard-coded NVGPU target that disagrees with target_
+  // when CINN_WITH_CUDA is not defined.
+  Evaluate(cinn::frontend::PaddleModelConvertor(target_)
+               .LoadModel(FLAGS_resnet50_model_dir, true, feeds));
+}
+
+}  // namespace auto_schedule
+}  // namespace cinn
--- a/paddle/cinn/auto_schedule/tuning.h
+++ b/paddle/cinn/auto_schedule/tuning.h
+// Copyright (c) 2022 CINN Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <vector>
+
+#include "paddle/cinn/hlir/framework/graph.h"
+#include "paddle/cinn/hlir/framework/node.h"
+#include "paddle/cinn/ir/lowered_func.h"
+
+namespace cinn {
+namespace auto_schedule {
+
+// alias a LoweredFunc array as FunctionGroup
+using FunctionGroup = std::vector<ir::LoweredFunc>;
+// alias a shared pointer to a Graph::Group as SubGraphPtr
+using SubGraphPtr = std::shared_ptr<hlir::framework::Graph::Group>;
+
+// Options for the tuning process
+struct TuningOptions {
+  // The number of tuning rounds. Each round will tune several tasks, and
+  // each task involves TuningOptions.num_measure_trials measurements.
+  int num_tuning_rounds = 1;
+
+  // The number of measurement trials in a task. If it is 0,
+  // that means the tuner will return the best
+  // candidate of schedule config without measurement.
+  int num_measure_trials = 10;
+
+  // Every round TaskSchedule chooses some TuneTask(s) to optimize and runs
+  // several iterations of the search algorithm for a task to generate samples.
+  // Each iteration has num_samples_per_iteration samples.
+  //
+  // 1. if TuningOptions.num_measure_trials is 0, the autotune doesn't involve
+  // hardware measurements. It predicts performance by cost model.
+  //
+  // 2. num_measure_trials % num_samples_per_iteration must equal 0.
+  // In each round, autotune will run iterations until the number of iterations
+  // * num_samples_per_iteration equals num_measure_trials.
+  int num_samples_per_iteration = 10;
+
+  //////////////////////////////////////
+  // Evolutionary Search Related Options
+  //////////////////////////////////////
+
+  // The number of picks from the stored database in each iteration.
+  // These are the best performances recorded from previous generations.
+  //
+  // Note that this number of picks is not guaranteed when the
+  // database doesn't have enough data. Evolutionary Search would get
+  // as many as possible without throwing errors or warnings.
+  int evolution_pick_database_topk = 8;
+
+  // The number of initial populations at each generation. It contains
+  // the picks from the database plus randomly generated samples.
+  int evolution_init_population_num = 10;
+
+  // The number of samples generated by cross over
+  int evolution_cross_over_num = 0;
+
+  // The fraction of random samples in num_samples_per_iteration.
+  // So the num_samples_per_iteration would have (1 - eps_greedy) best
+  // samples from evolutionary search and eps_greedy random samples.
+  //
+  // It explores the cases evolutionary search won't predict precisely
+  float evolution_eps_greedy = 0.1f;
+};
+
+// Result of the tuning process
+struct TuningResult {
+  // Result of graph tuning: one SubGraphPtr (shared Graph::Group) per entry
+  std::vector<SubGraphPtr> subgraphs;
+  // Result of schedule tuning: one FunctionGroup (LoweredFunc array) per entry
+  std::vector<FunctionGroup> function_groups;
+};
+
+}  // namespace auto_schedule
+}  // namespace cinn
--- a/paddle/cinn/backends/CMakeLists.txt
+++ b/paddle/cinn/backends/CMakeLists.txt
+core_gather_headers()
+
+gather_srcs(
+  cinnapi_src
+  SRCS
+  outputs.cc
+  codegen_c.cc
+  codegen_c_x86.cc
+  codegen_cuda_host.cc
+  extern_func_emitter.cc
+  extern_func_emitter_builtin.cc
+  function_prototype.cc
+  extern_func_protos.cc
+  extern_func_jit_register.cc
+  modular.cc
+  compiler.cc) # unconditional backend sources; CUDA-only ones go through srcs
+
+if(WITH_CUDA)
+  add_subdirectory(nvrtc)
+  list(APPEND srcs cuda_util.cc codegen_cuda_dev.cc codegen_cuda_util.cc) # added to cinnapi_src by the foreach(cpp ${srcs}) loop further down
+endif()
+
+if(WITH_OPENMP)
+  cinn_cc_library(__x86_source_fake_lib SRCS _x86_builtin_source.cc)
+endif()
+add_subdirectory(llvm) # unconditional, unlike nvrtc above
+
+if(WITH_CUDA)
+  cinn_nv_test(test_raw_cuda_code SRCS raw_cuda_code_test.cu DEPS cinncore)
+endif()
+
+cinn_cc_test(
+  test_codegen_c
+  SRCS
+  codegen_c_test.cc
+  DEPS
+  cinncore
+  ARGS
+  ${global_test_args})
+cinn_cc_test(
+  test_codegen_c_x86
+  SRCS
+  codegen_c_x86_test.cc
+  DEPS
+  cinncore
+  ARGS
+  ${global_test_args})
+cinn_cc_test(test_generated1 SRCS generated_module1.cc DEPS cinn_runtime)
+add_run_test_dependency(test_generated1 test_codegen_c) # project helper; presumably orders the test runs - confirm
+cinn_cc_test(test_ir_schedule SRCS ir_schedule_test.cc DEPS cinncore)
+include_directories(${CMAKE_SOURCE_DIR}/paddle/cinn/runtime)
+if(TARGET test_generated1) # only add the build dependency when the test target was actually created
+  add_dependencies(test_generated1 test_codegen_c)
+endif()
+
+if(WITH_CUDA)
+  cinn_nv_test(test_codegen_cuda_generate SRCS codegen_cuda_generate_test.cc
+               DEPS cinncore)
+  cinn_nv_test(test_codegen_debug SRCS codegen_debug_test.cc DEPS cinncore)
+
+  if(WITH_TESTING)
+    if(CINN_ONLY) # same test either way; only the helper that registers it differs
+      cinn_nv_test(generated1_cuda SRCS generated1.cu DEPS cinncore)
+    else()
+      nv_test(
+        generated1_cuda
+        SRCS generated1.cu
+        DEPS cinncore)
+    endif()
+    add_run_test_dependency(generated1_cuda test_codegen_cuda_generate)
+  endif()
+
+  cinn_nv_test(test_compiler SRCS compiler_test.cc DEPS cinncore)
+else() # without CUDA the same compiler_test.cc is registered through cinn_cc_test
+  cinn_cc_test(test_compiler SRCS compiler_test.cc DEPS cinncore)
+endif()
+
+foreach(cpp ${srcs}) # append each entry of srcs to the cached cinnapi_src list
+  set(cinnapi_src
+      "${cinnapi_src};paddle/cinn/backends/${cpp}"
+      CACHE INTERNAL "")
+endforeach()
+
+file(
+  GLOB includes
+  LIST_DIRECTORIES false
+  RELATIVE ${CMAKE_SOURCE_DIR}
+  *.h) # *.h files of this directory, as paths relative to CMAKE_SOURCE_DIR
+
+foreach(header ${includes}) # append each globbed header to the cached core_includes list
+  set(core_includes
+      "${core_includes};${header}"
+      CACHE INTERNAL "")
+endforeach()