Commit 01a10755 authored by yuguo-Jack

2.5.2-dtk24.04

parent 63eb0da5
......@@ -23,12 +23,12 @@
#include "paddle/cinn/ir/buffer.h"
#include "paddle/cinn/ir/ir.h"
#include "paddle/cinn/ir/ir_base.h"
#include "paddle/cinn/ir/ir_printer.h"
#include "paddle/cinn/ir/lowered_func.h"
#include "paddle/cinn/ir/schedule/ir_schedule.h"
#include "paddle/cinn/ir/tensor.h"
#include "paddle/cinn/ir/utils/ir_copy.h"
#include "paddle/cinn/ir/utils/ir_nodes_collector.h"
#include "paddle/cinn/ir/utils/ir_printer.h"
#include "paddle/cinn/lang/lower.h"
#include "paddle/cinn/optim/optimize.h"
#include "paddle/cinn/optim/transform_gpu_forloop.h"
......@@ -41,7 +41,7 @@ std::vector<ir::Var> IndicesToVars(const std::vector<ir::Expr>& indices) {
for (const ir::Expr& e : indices) {
// Do we need to convert other types, such as const numbers, to Var?
if (e.As<ir::_Var_>() != nullptr) {
ir::Expr copy_e = optim::IRCopy(e);
ir::Expr copy_e = ir::ir_utils::IRCopy(e);
ir::_Var_* var_ref = copy_e.As<ir::_Var_>();
result.emplace_back(ir::Var(var_ref));
}
......@@ -54,7 +54,8 @@ void AnalyzeScheduleBlockReadWriteBuffer(ir::ScheduleBlock* sche_block) {
return;
}
ir::CollectIRNodesWithoutTensor(sche_block->body, [&](const Expr* x) {
ir::ir_utils::CollectIRNodesWithoutTensor(
sche_block->body, [&](const Expr* x) {
const ir::Load* load_expr = x->As<ir::Load>();
if (load_expr != nullptr) {
const ir::Tensor t = load_expr->tensor.as_tensor_ref();
......@@ -76,7 +77,7 @@ void AnalyzeScheduleBlockReadWriteBuffer(ir::ScheduleBlock* sche_block) {
bool ContainsNodeType(ir::Expr expr,
const std::unordered_set<ir::IrNodeTy>& node_types) {
std::set<ir::Expr> collection =
ir::CollectIRNodesWithoutTensor(expr, [&](const Expr* x) {
ir::ir_utils::CollectIRNodesWithoutTensor(expr, [&](const Expr* x) {
return node_types.find(x->node_type()) != node_types.end();
});
return !collection.empty();
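A hedged usage sketch of this helper; the node-type set here is illustrative, not from the patch:

// Hypothetical caller: test whether a lowered body still contains loop nodes.
bool has_loops = ContainsNodeType(
    funcs[0]->body, {ir::IrNodeTy::For, ir::IrNodeTy::PolyFor});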
......@@ -189,5 +190,40 @@ ir::LoweredFunc UpdateFuncWithNewBody(const common::Target& target,
return new_func;
}
std::unordered_set<std::string> GetReduceLoopVarNames(const ir::Expr block) {
const ir::ScheduleBlockRealize* block_realize =
block.As<ir::ScheduleBlockRealize>();
CHECK_NOTNULL(block_realize);
const ir::ScheduleBlock* block_node =
block_realize->schedule_block.As<ir::ScheduleBlock>();
CHECK_NOTNULL(block_node);
std::vector<ir::Expr> iter_values = block_realize->iter_values;
std::vector<ir::Var> iter_vars = block_node->iter_vars;
std::unordered_set<std::string> reduce_loop_var;
for (int i = 0; i < iter_vars.size(); ++i) {
if (iter_vars[i]->is_reduce_axis) {
ir::ir_utils::CollectIRNodesWithoutTensor(
iter_values[i], [&](const ir::Expr* x) {
if (x->as_var()) {
reduce_loop_var.insert(x->as_var_ref()->name);
}
return false;
});
}
}
return reduce_loop_var;
}
std::string GetBlockName(const ir::Expr block) {
const ir::ScheduleBlockRealize* block_realize =
block.As<ir::ScheduleBlockRealize>();
CHECK_NOTNULL(block_realize);
const ir::ScheduleBlock* block_node =
block_realize->schedule_block.As<ir::ScheduleBlock>();
CHECK_NOTNULL(block_node);
return block_node->name;
}
} // namespace auto_schedule
} // namespace cinn
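A hedged sketch of how the two new helpers might be driven from an IRSchedule; the schedule construction and block loop are illustrative, not part of the patch:

// Hypothetical usage: inspect every block of a schedule and log which
// loop vars feed its reduce axes.
ir::IRSchedule ir_sch(mod_expr);
for (ir::Expr block : ir_sch.GetAllBlocks()) {
  std::unordered_set<std::string> reduce_vars = GetReduceLoopVarNames(block);
  VLOG(6) << GetBlockName(block) << " uses " << reduce_vars.size()
          << " reduce loop var(s)";
}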
......@@ -48,5 +48,15 @@ ir::LoweredFunc UpdateFuncWithNewBody(const common::Target& target,
const ir::LoweredFunc& old_func,
ir::Expr& body); // NOLINT
/**
* Get the loop var names of the reduce axes
*/
std::unordered_set<std::string> GetReduceLoopVarNames(const ir::Expr block);
/**
* Get name of a ScheduleBlock
*/
std::string GetBlockName(const ir::Expr block);
} // namespace auto_schedule
} // namespace cinn
......@@ -20,6 +20,7 @@
#include <sstream>
#include <vector>
#include "paddle/cinn/ast_gen_ius/tensor_group.h"
#include "paddle/cinn/common/context.h"
#include "paddle/cinn/ir/ir.h"
#include "paddle/cinn/ir/ir_base.h"
......@@ -49,9 +50,9 @@ TEST(AnalyzeIr, AnalyzeScheduleBlockReadWriteBuffer_SimpleAssign) {
ir::Tensor B = lang::Compute(
{M, N}, [&](Var i, Var j) { return A(i, j); }, "B");
poly::StageMap stages = poly::CreateStages({A, B});
std::vector<ir::LoweredFunc> funcs = lang::LowerVec(
"SimpleAssign", stages, {A, B}, {}, {}, nullptr, target, true);
ast_gen_ius::TensorGroup tensor_group({A, B});
std::vector<ir::LoweredFunc> funcs =
lang::LowerToAstVec("SimpleAssign", {A, B}, &tensor_group, target);
ASSERT_FALSE(funcs.empty());
ir::Expr ast_expr = funcs[0]->body;
......@@ -115,9 +116,9 @@ TEST(AnalyzeIr, AnalyzeScheduleBlockReadWriteBuffer_AddDiffShape) {
ir::Tensor C = lang::Compute(
{M, N}, [&](Var i, Var j) { return A(i) + B(j); }, "C");
poly::StageMap stages = poly::CreateStages({C});
std::vector<ir::LoweredFunc> funcs = lang::LowerVec(
"AddDiffShape", stages, {C}, {}, {}, nullptr, target, true);
ast_gen_ius::TensorGroup tensor_group({C});
std::vector<ir::LoweredFunc> funcs =
lang::LowerToAstVec("AddDiffShape", {C}, &tensor_group, target);
ir::Expr ast_expr = funcs[0]->body;
VLOG(6) << "Expr before MultiLevelTiling: ";
......@@ -169,9 +170,9 @@ TEST(AnalyzeIr, ContainsNodeType) {
ir::Tensor B = lang::Compute(
{M, N}, [&](Var i, Var j) { return A(i, j); }, "B");
poly::StageMap stages = poly::CreateStages({A, B});
std::vector<ir::LoweredFunc> funcs = lang::LowerVec(
"SimpleAssign", stages, {A, B}, {}, {}, nullptr, target, true);
ast_gen_ius::TensorGroup tensor_group({A, B});
std::vector<ir::LoweredFunc> funcs =
lang::LowerToAstVec("SimpleAssign", {A, B}, &tensor_group, target);
ASSERT_FALSE(funcs.empty());
ir::Expr ast_expr = funcs[0]->body;
......
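The same lowering migration recurs in every test touched below; a hedged distillation of the pattern, with tensor and function names illustrative:

// Old: stage-based lowering.
//   poly::StageMap stages = poly::CreateStages({A, B});
//   auto funcs = lang::LowerVec(
//       "fn", stages, {A, B}, {}, {}, nullptr, target, true);
// New: group the tensors and lower them directly to an AST.
ast_gen_ius::TensorGroup tensor_group({A, B});
std::vector<ir::LoweredFunc> funcs =
    lang::LowerToAstVec("fn", {A, B}, &tensor_group, target);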
......@@ -63,8 +63,8 @@ void AutoTuner::Initialize(const Config& config,
const auto& shape_dict = graph_->GetAttrs<
absl::flat_hash_map<std::string, hlir::framework::shape_t>>("infershape");
op_lowerer_ = std::make_unique<hlir::framework::OpLowerer>(
dtype_dict, shape_dict, target_);
op_lowerer_ = std::make_unique<hlir::framework::OpLowerer<GroupPtr>>(
new hlir::framework::OpLowererImpl(dtype_dict, shape_dict, target_));
InitialTaskRegistry* task_registry = InitialTaskRegistry::Global();
for (auto i = 0; i < tasks_.size(); ++i) {
auto&& task = tasks_[i];
......
......@@ -30,11 +30,11 @@
namespace cinn {
namespace auto_schedule {
// This class is the entrance of auto-tune. Users can use it
// to tune a graph (not supported yet) and search for a series of schedules
// that are more likely to obtain better performance.
// Internally, it creates the necessary components and uses them to perform
// tuning.
using GroupPtr = hlir::framework::GroupPtr;
class AutoTuner {
public:
// configure how to perform auto-tune, such as
......@@ -58,7 +58,7 @@ class AutoTuner {
private:
const common::Target& target_;
hlir::framework::Graph* graph_;
std::unique_ptr<hlir::framework::OpLowerer> op_lowerer_;
std::unique_ptr<hlir::framework::OpLowerer<GroupPtr>> op_lowerer_;
// Tasks to tune
std::vector<TuneTask> tasks_;
......
......@@ -26,17 +26,19 @@
#include "paddle/cinn/frontend/syntax.h"
#include "paddle/cinn/hlir/framework/graph.h"
#include "paddle/cinn/hlir/framework/graph_compiler.h"
#include "paddle/cinn/hlir/framework/graph_compiler_util.h"
#include "paddle/cinn/hlir/framework/node.h"
#include "paddle/cinn/hlir/framework/pass.h"
#include "paddle/cinn/ir/ir_base.h"
#include "paddle/cinn/runtime/flags.h"
DECLARE_bool(auto_schedule_use_cost_model);
PD_DECLARE_bool(auto_schedule_use_cost_model);
namespace cinn {
namespace auto_schedule {
using ::cinn::hlir::framework::BuildScope;
using ::cinn::hlir::framework::CompilationContext;
using ::cinn::hlir::framework::Graph;
using ::cinn::hlir::framework::GraphCompiler;
using ::cinn::hlir::framework::Instruction;
......@@ -53,6 +55,7 @@ class TestAutoTuner : public ::testing::Test {
std::shared_ptr<Graph> graph;
std::shared_ptr<Scope> compiled_scope;
CompilationContext context;
std::unique_ptr<GraphCompiler> graph_compiler;
std::unique_ptr<AutoTuner> tuner;
......@@ -73,8 +76,10 @@ class TestAutoTuner : public ::testing::Test {
auto program = CreateAddReluProgram();
auto graph = cinn::frontend::Optimize(&program, fetch_ids, target);
compiled_scope = BuildScope(target, graph);
graph_compiler =
std::make_unique<GraphCompiler>(target, compiled_scope, graph);
context.graph = graph;
context.scope = compiled_scope;
context.target = target;
graph_compiler = std::make_unique<GraphCompiler>(context);
tuner = std::make_unique<AutoTuner>(target, graph.get());
}
......@@ -99,16 +104,14 @@ class TestAutoTuner : public ::testing::Test {
virtual void ApplyTunedAndRun(const TuningResult& result) {
// build runtime program with tuning result
GraphCompiler::CompileOptions compile_options;
compile_options.with_instantiate_variables = true;
compile_options.Apply(result);
ASSERT_EQ(1, compile_options.groups.size());
ASSERT_EQ(1, compile_options.lowered_funcs.size());
context.with_instantiate_variables = true;
context.ApplyTuningResult(result);
ASSERT_EQ(1, context.groups.size());
ASSERT_EQ(1, context.lowered_funcs.size());
VLOG(6) << "Print lowered_funcs before building";
VLOG(6) << compile_options.lowered_funcs[0][0];
VLOG(6) << compile_options.lowered_funcs[1][0];
auto runtime_program =
graph_compiler->Build(compile_options).runtime_program;
VLOG(6) << context.lowered_funcs[0][0];
VLOG(6) << context.lowered_funcs[1][0];
auto runtime_program = graph_compiler->Build(&context).runtime_program;
ASSERT_EQ(1, runtime_program->size());
runtime_program->Execute();
}
......
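Taken together, these hunks replace GraphCompiler::CompileOptions with a CompilationContext that is threaded through construction and Build; a hedged summary sketch assembled from the lines above:

// One context object now carries graph, scope, target and tuning results.
CompilationContext context(graph, scope, target);
context.with_instantiate_variables = true;
GraphCompiler graph_compiler(context);
context.ApplyTuningResult(result);  // fold tuned groups/lowered_funcs in
auto runtime_program = graph_compiler.Build(&context).runtime_program;
runtime_program->Execute();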
......@@ -3,7 +3,8 @@ core_gather_headers()
gather_srcs(cinnapi_src SRCS xgb_cost_model.cc expr_cost_model.cc feature.cc
feature_extractor.cc)
cinn_cc_test(test_xgb_cost_model SRCS xgb_cost_model_test.cc DEPS cinncore)
# TODO(zhhsplendid): enable this test again
#cinn_cc_test(test_xgb_cost_model SRCS xgb_cost_model_test.cc DEPS cinncore)
cinn_cc_test(test_feature_extractor SRCS feature_extractor_test.cc DEPS
cinncore)
cinn_cc_test(test_feature SRCS feature_test.cc DEPS cinncore)
......@@ -33,9 +33,9 @@
#include "paddle/cinn/common/type.h"
#include "paddle/cinn/ir/ir.h"
#include "paddle/cinn/ir/ir_base.h"
#include "paddle/cinn/ir/ir_printer.h"
#include "paddle/cinn/ir/schedule/ir_schedule.h"
#include "paddle/cinn/ir/utils/ir_copy.h"
#include "paddle/cinn/ir/utils/ir_printer.h"
#include "paddle/cinn/optim/transform_polyfor_to_for.h"
namespace cinn {
......@@ -82,6 +82,7 @@ VisitDoNothing(ScheduleBlockRealize);
VisitDoNothing(Ramp);
VisitDoNothing(_Buffer_);
VisitDoNothing(_BufferRange_);
VisitDoNothing(_Dim_);
#define NotVisitExprFields(NodeType) \
void FeatureExtractor::Visit(const NodeType *x) {}
......@@ -218,7 +219,7 @@ void FeatureExtractor::Visit(const For *x) {
}
void FeatureExtractor::Visit(const PolyFor *x) {
Expr copy = optim::IRCopy(Expr(x));
Expr copy = ir::ir_utils::IRCopy(Expr(x));
feature_.IntoLoopBlock();
optim::TransformPolyForToFor(&copy);
ir::For *loop = copy.As<For>();
......
......@@ -31,8 +31,8 @@
#include "paddle/cinn/common/target.h"
#include "paddle/cinn/ir/ir.h"
#include "paddle/cinn/ir/ir_base.h"
#include "paddle/cinn/ir/ir_visitor.h"
#include "paddle/cinn/ir/schedule/ir_schedule.h"
#include "paddle/cinn/ir/utils/ir_visitor.h"
namespace cinn {
namespace auto_schedule {
......
......@@ -21,6 +21,7 @@
#include <unordered_set>
#include <vector>
#include "paddle/cinn/ast_gen_ius/tensor_group.h"
#include "paddle/cinn/common/context.h"
#include "paddle/cinn/ir/ir.h"
#include "paddle/cinn/ir/ir_base.h"
......@@ -48,9 +49,9 @@ TEST(FeatureExtractor, SimpleAssign) {
ir::Tensor B = lang::Compute(
{M, N}, [&](Var i, Var j) { return A(i, j); }, "B");
poly::StageMap stages = poly::CreateStages({A, B});
std::vector<ir::LoweredFunc> funcs = lang::LowerVec(
"SimpleAssign", stages, {A, B}, {}, {}, nullptr, target, true);
ast_gen_ius::TensorGroup tensor_group({A, B});
std::vector<ir::LoweredFunc> funcs =
lang::LowerToAstVec("SimpleAssign", {A, B}, &tensor_group, target);
ir::Expr ast_expr = funcs[0]->body;
VLOG(6) << "Expr to test: " << ast_expr;
......@@ -88,6 +89,7 @@ TEST(FeatureExtractor, SimpleAssign) {
ASSERT_EQ(to_check[29], slog(3));
}
#ifdef CINN_WITH_CUDA
TEST(FeatureExtractor, MatrixMultiply) {
Context::Global().ResetNameId();
#ifdef CINN_WITH_CUDA
......@@ -109,9 +111,9 @@ TEST(FeatureExtractor, MatrixMultiply) {
[&](Var i, Var j) { return lang::ReduceSum(A(i, k) * B(k, j), {k}); },
"C");
poly::StageMap stages = poly::CreateStages({C});
std::vector<ir::LoweredFunc> funcs = lang::LowerVec(
"MatrixMultiply", stages, {C}, {}, {}, nullptr, target, true);
ast_gen_ius::TensorGroup tensor_group({C});
std::vector<ir::LoweredFunc> funcs =
lang::LowerToAstVec("SimpleAssign", {C}, &tensor_group, target);
std::vector<Expr> vec_ast{funcs[0]->body};
ir::ModuleExpr mod_expr(vec_ast);
......@@ -161,6 +163,6 @@ TEST(FeatureExtractor, MatrixMultiply) {
// GpuBind loop
ASSERT_EQ(to_check[37], slog(out_loop));
}
#endif
} // namespace auto_schedule
} // namespace cinn
......@@ -20,12 +20,13 @@
#include <fstream>
#include <vector>
#include "paddle/cinn/ast_gen_ius/tensor_group.h"
#include "paddle/cinn/auto_schedule/search_space/search_state.h"
#include "paddle/cinn/auto_schedule/task/task_registry.h"
#include "paddle/cinn/cinn.h"
#include "paddle/cinn/ir/ir_printer.h"
#include "paddle/cinn/ir/schedule/ir_schedule.h"
#include "paddle/cinn/ir/utils/ir_copy.h"
#include "paddle/cinn/ir/utils/ir_printer.h"
namespace cinn {
namespace auto_schedule {
......@@ -47,8 +48,8 @@ std::vector<ir::LoweredFunc> LowerCompute(const std::vector<int>& shape,
C = Compute(
domain, [&B](Var i, Var j) { return B(i, j); }, "C");
return cinn::lang::LowerVec(
"test_func", CreateStages({A, B}), {A, B}, {}, {}, nullptr, target, true);
ast_gen_ius::TensorGroup tensor_group({A, B});
return cinn::lang::LowerToAstVec("test_func", {A, B}, &tensor_group, target);
}
// Create a new IRSchedule with copied ir::LoweredFunc AST
......@@ -56,7 +57,7 @@ ir::IRSchedule MakeIRSchedule(const std::vector<ir::LoweredFunc>& lowered_funcs,
const std::string& task_key) {
std::vector<Expr> exprs;
for (auto&& func : lowered_funcs) {
exprs.emplace_back(optim::IRCopy(func->body));
exprs.emplace_back(ir::ir_utils::IRCopy(func->body));
}
InitialTaskRegistry* task_registry = InitialTaskRegistry::Global();
task_registry->Regist(task_key, ir::ModuleExpr(exprs));
......
......@@ -25,12 +25,15 @@
#include "paddle/cinn/frontend/optimize.h"
#include "paddle/cinn/frontend/syntax.h"
#include "paddle/cinn/hlir/framework/graph_compiler.h"
#include "paddle/cinn/hlir/framework/graph_compiler_util.h"
#include "paddle/cinn/hlir/framework/op_lowering.h"
#include "paddle/cinn/runtime/flags.h"
namespace cinn {
namespace auto_schedule {
using ::cinn::hlir::framework::BuildScope;
using ::cinn::hlir::framework::CompilationContext;
using ::cinn::hlir::framework::Graph;
using ::cinn::hlir::framework::GraphCompiler;
......@@ -62,7 +65,8 @@ class TestMeasurer : public ::testing::Test {
auto program = CreateAddReluProgram();
auto graph = cinn::frontend::Optimize(&program, fetch_ids, target);
auto scope = BuildScope(target, graph);
graph_compiler = std::make_unique<GraphCompiler>(target, scope, graph);
CompilationContext context(graph, scope, target);
graph_compiler = std::make_unique<GraphCompiler>(context);
TaskCreator task_creator;
tasks = task_creator.CreateTuneTaskOpLevel(graph.get());
const auto& dtype_dict =
......@@ -72,12 +76,12 @@ class TestMeasurer : public ::testing::Test {
absl::flat_hash_map<std::string, hlir::framework::shape_t>>(
"infershape");
auto op_lowerer = std::make_unique<hlir::framework::OpLowerer>(
dtype_dict, shape_dict, target);
auto op_lowerer =
hlir::framework::CreateOpLowerer(dtype_dict, shape_dict, target);
inputs.reserve(tasks.size());
for (int i = 0; i < tasks.size(); ++i) {
auto* task = &tasks[i];
task->Initialize(shape_dict, dtype_dict, op_lowerer.get());
task->Initialize(shape_dict, dtype_dict, &op_lowerer);
MeasureInput input;
input.task = task;
input.lowered_funcs = task->lowered_funcs;
......
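A hedged sketch of the factory-based wiring shown above; the dictionary names follow the diff:

// Build one lowerer via the factory and share its address across tasks.
auto op_lowerer =
    hlir::framework::CreateOpLowerer(dtype_dict, shape_dict, target);
for (auto& task : tasks) {
  task.Initialize(shape_dict, dtype_dict, &op_lowerer);
}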
......@@ -17,6 +17,8 @@
namespace cinn {
namespace auto_schedule {
using hlir::framework::CompilationContext;
using hlir::framework::CompilationResult;
using hlir::framework::GraphCompiler;
SimpleBuilder::SimpleBuilder(hlir::framework::GraphCompiler* graph_compiler)
......@@ -25,19 +27,18 @@ SimpleBuilder::SimpleBuilder(hlir::framework::GraphCompiler* graph_compiler)
BuildResult SimpleBuilder::Build(const MeasureInput& input) {
CHECK_NE(graph_compiler_, static_cast<GraphCompiler*>(nullptr))
<< "empty handle to GraphCompiler";
GraphCompiler::CompileOptions compile_options;
compile_options.groups.emplace_back(input.task->subgraph);
compile_options.lowered_funcs.emplace_back(input.lowered_funcs);
compile_options.remove_unused_variables = false;
CompilationContext& context = graph_compiler_->GetCompilationContext();
context.groups.emplace_back(input.task->subgraph);
context.lowered_funcs.emplace_back(input.lowered_funcs);
context.remove_unused_variables = false;
VLOG(5) << "call GraphCompiler to Build with Graph::Group size="
<< compile_options.groups.size() << ", lowered_funcs group size="
<< compile_options.lowered_funcs.size();
GraphCompiler::CompilationResult compiled_result =
graph_compiler_->Build(compile_options);
<< context.groups.size()
<< ", lowered_funcs group size=" << context.lowered_funcs.size();
CompilationResult compiled_result = graph_compiler_->Build(&context);
BuildResult build_result;
build_result.compiled_scope = graph_compiler_->GetScope().get();
build_result.runtime_program = std::move(compiled_result.runtime_program);
build_result.runtime_program = std::move(compiled_result.RuntimeProgram());
return build_result;
}
......
......@@ -16,6 +16,7 @@
#include "paddle/cinn/auto_schedule/measure/measure.h"
#include "paddle/cinn/hlir/framework/graph_compiler.h"
#include "paddle/cinn/hlir/framework/graph_compiler_util.h"
namespace cinn {
namespace auto_schedule {
......
......@@ -25,11 +25,13 @@
#include "paddle/cinn/frontend/optimize.h"
#include "paddle/cinn/frontend/syntax.h"
#include "paddle/cinn/hlir/framework/graph_compiler.h"
#include "paddle/cinn/hlir/framework/graph_compiler_util.h"
namespace cinn {
namespace auto_schedule {
using ::cinn::hlir::framework::BuildScope;
using ::cinn::hlir::framework::CompilationContext;
using ::cinn::hlir::framework::Graph;
using ::cinn::hlir::framework::GraphCompiler;
using ::cinn::hlir::framework::Instruction;
......@@ -56,8 +58,8 @@ class TestSimpleRunner : public ::testing::Test {
auto program = CreateAddReluProgram();
auto graph = cinn::frontend::Optimize(&program, fetch_ids, target);
compiled_scope = BuildScope(target, graph);
graph_compiler =
std::make_unique<GraphCompiler>(target, compiled_scope, graph);
CompilationContext context(graph, compiled_scope, target);
graph_compiler = std::make_unique<GraphCompiler>(context);
auto runtime_program = graph_compiler->Build();
const auto& instructions = runtime_program->GetRunInstructions();
ASSERT_EQ(1, instructions.size());
......@@ -123,8 +125,8 @@ TEST_F(TestSimpleRunner, TimeMeasured) {
"sleep_fn"));
instructions.back()->SetLoweredFunc(reinterpret_cast<void*>(sleep_fn));
instructions.back()->Finalize();
build_result.runtime_program.reset(
new hlir::framework::Program(nullptr, std::move(instructions)));
build_result.runtime_program = std::make_unique<hlir::framework::Program>(
nullptr, std::move(instructions));
// to skip the condition check of params in Instruction::PreparePodArgs
std::map<std::string, cinn_pod_value_t> preset_args;
......
......@@ -15,9 +15,9 @@
#include "paddle/cinn/auto_schedule/post_schedule_rule/cooperative_process.h"
#include "paddle/cinn/ir/ir.h"
#include "paddle/cinn/ir/ir_printer.h"
#include "paddle/cinn/ir/schedule/ir_schedule.h"
#include "paddle/cinn/ir/schedule/schedule_desc.h"
#include "paddle/cinn/ir/utils/ir_printer.h"
namespace cinn {
namespace auto_schedule {
......
......@@ -17,7 +17,7 @@
#include <gtest/gtest.h>
#include "paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.h"
#include "paddle/cinn/ir/utils/ir_printer.h"
#include "paddle/cinn/ir/ir_printer.h"
#include "test/cpp/cinn/program_builder.h"
namespace cinn {
......@@ -129,7 +129,7 @@ TEST_F(TestCooperativeProcess, Matmul) {
{
i0, i1 = axis.bind(((16 * i) + ((2 * i_0) + i_1)), ((16 * j) + ((8 * j_0) + j_1)))
{
temp_matmul_out__reduce_init[((16 * i) + ((2 * i_0) + i_1)), ((16 * j) + ((8 * j_0) + j_1))] = 0.00000000f
temp_matmul_out__reduce_init[i0, i1] = 0.00000000f
}
}
}
......@@ -181,7 +181,7 @@ TEST_F(TestCooperativeProcess, Matmul) {
{
i0_0, i1_0, i2 = axis.bind(((2 * (i_0_j_0_fused / 2)) + ((16 * (i_j_fused / 2)) + i_1)), ((8 * (i_0_j_0_fused % 2)) + ((16 * (i_j_fused % 2)) + j_1)), ((4 * reduce_k_0) + reduce_k_1))
{
temp_matmul_out[((2 * (i_0_j_0_fused / 2)) + ((16 * (i_j_fused / 2)) + i_1)), ((8 * (i_0_j_0_fused % 2)) + ((16 * (i_j_fused % 2)) + j_1))] = (temp_matmul_out[((2 * (i_0_j_0_fused / 2)) + ((16 * (i_j_fused / 2)) + i_1)), ((8 * (i_0_j_0_fused % 2)) + ((16 * (i_j_fused % 2)) + j_1))] + (X_reshape_shared_temp_buffer[((2 * (i_0_j_0_fused / 2)) + ((16 * (i_j_fused / 2)) + i_1)), ((4 * reduce_k_0) + reduce_k_1)] * Y_reshape_shared_temp_buffer[((4 * reduce_k_0) + reduce_k_1), ((8 * (i_0_j_0_fused % 2)) + ((16 * (i_j_fused % 2)) + j_1))]))
temp_matmul_out[i0_0, i1_0] = (temp_matmul_out[i0_0, i1_0] + (X_reshape_shared_temp_buffer[i0_0, i2] * Y_reshape_shared_temp_buffer[i2, i1_0]))
}
}
}
......
......@@ -8,7 +8,8 @@ gather_srcs(
auto_unroll.cc
multi_level_tiling.cc
skip_rule.cc
auto_bind.cc)
auto_bind.cc
reduction_factoring.cc)
if(WITH_TESTING)
cinn_cc_library(
......@@ -51,3 +52,11 @@ endif()
#cinn_cc_test(test_auto_inline SRCS auto_inline_test.cc DEPS cinncore auto_gen_rule_test_helper)
cinn_cc_test(test_skip_rule SRCS skip_rule_test.cc DEPS cinncore)
cinn_cc_test(test_auto_unroll SRCS auto_unroll_test.cc DEPS cinncore)
cinn_cc_test(
test_reduction_factoring
SRCS
reduction_factoring_test.cc
DEPS
cinncore
auto_gen_rule_test_helper
test_program_builder)
......@@ -16,10 +16,11 @@
#include <glog/logging.h>
#include "paddle/cinn/ir/ir_printer.h"
#include "paddle/cinn/ir/schedule/ir_schedule.h"
#include "paddle/cinn/ir/schedule_block_graph.h"
#include "paddle/cinn/ir/utils/ir_copy.h"
#include "paddle/cinn/ir/utils/ir_nodes_collector.h"
#include "paddle/cinn/ir/utils/ir_printer.h"
namespace cinn {
namespace auto_schedule {
......@@ -31,7 +32,7 @@ bool IsSpatialLoop(const ir::For* for_node) {
const auto& loop_var = for_node->loop_var;
// Collect cases where the loop_var is used in one of the reduce axes of the
// ScheduleBlock underneath.
auto used_for_reduce_axis = ir::CollectIRNodesWithoutTensor(
auto used_for_reduce_axis = ir::ir_utils::CollectIRNodesWithoutTensor(
for_node->body, [&loop_var](const Expr* x) {
const auto* block_realize = x->As<ir::ScheduleBlockRealize>();
if (!block_realize) return false;
......@@ -46,7 +47,7 @@ bool IsSpatialLoop(const ir::For* for_node) {
const ir::Expr& binding = block_realize->iter_values[i];
if (iter_var->is_reduce_axis ||
iter_var->name.substr(0, 6) == "reduce") {
auto used_exprs = ir::CollectIRNodesWithoutTensor(
auto used_exprs = ir::ir_utils::CollectIRNodesWithoutTensor(
binding, [&loop_var](const Expr* x) {
const ir::_Var_* var = x->As<ir::_Var_>();
if (var &&
......@@ -94,6 +95,8 @@ void BindGPUIndex(ir::IRSchedule* ir_schedule,
auto all_loops = ir_schedule->GetLoops(block_name);
CHECK_LE(num_loops_to_bind, all_loops.size())
<< "The number of loops to be bind is greater than size of all_loops";
CHECK_GE(num_loops_to_bind, 0)
<< "The number of loops to be bind should be greater than 0";
// check whether it is the case that threadIdx has been binded but blockIdx
// not, the threadIdx can only be binded in the first loop after
// num_loops_to_bind loops because we has excluded other cases in
......@@ -101,6 +104,17 @@ void BindGPUIndex(ir::IRSchedule* ir_schedule,
bool gpu_thread_has_binded =
num_loops_to_bind < all_loops.size() &&
all_loops[num_loops_to_bind].As<ir::For>()->is_gpu_thread_binded();
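// Also scan the statements ordered under the last loop to be bound: if any
// nested For there is already bound to a GPU thread axis, treat threadIdx as
// taken so it is not bound twice.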
ir::BlockOrderConstructor block_order_constructor;
std::map<std::vector<int>, ir::Expr> blocks_order_with_ctrl_stmt =
block_order_constructor(&all_loops[num_loops_to_bind - 1]);
for (auto& pair : blocks_order_with_ctrl_stmt) {
if (pair.first.size() == 2) {
ir::Expr stmt = pair.second;
if (stmt.As<ir::For>() && stmt.As<ir::For>()->is_gpu_thread_binded()) {
gpu_thread_has_binded = true;
}
}
}
Expr fused_loop = ir_schedule->Fuse(
{all_loops.begin(), all_loops.begin() + num_loops_to_bind});
int32_t extent = fused_loop.As<ir::For>()->extent.as_int32();
......@@ -181,5 +195,18 @@ std::vector<SearchState> AutoBind::ApplyOnBlock(SearchState state,
return {new_state};
}
void AutoBind::Apply(ir::IRSchedule* ir_schedule,
const std::string& block_name) {
int num_loop_can_bind =
CountLoopCanBinded(ir_schedule->GetLoops(block_name)[0].As<ir::For>());
if (num_loop_can_bind > 0) {
BindGPUIndex(ir_schedule,
block_name,
num_loop_can_bind,
kMaxBlocks,
target_->max_num_threads());
}
}
} // namespace auto_schedule
} // namespace cinn
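A hedged sketch of driving the new per-block entry point; constructing the rule from a target follows the existing AutoGenRule pattern, and the exact constructor signature is assumed:

// Hypothetical driver: apply AutoBind to one named block of a schedule.
AutoBind auto_bind(target);  // ctor signature assumed from AutoGenRule
auto_bind.Apply(&ir_schedule, "temp_matmul_out");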
......@@ -42,6 +42,8 @@ class AutoBind : public AutoGenRule {
std::vector<SearchState> ApplyOnBlock(SearchState state,
const std::string& block_name) override;
void Apply(ir::IRSchedule* ir_schedule, const std::string& block_name);
private:
std::vector<Expr> applicable_schedule_blocks_;
};
......