Commit 01a10755 authored by yuguo-Jack

2.5.2-dtk24.04

parent 63eb0da5
......@@ -23,12 +23,12 @@
#include "paddle/cinn/ir/buffer.h"
#include "paddle/cinn/ir/ir.h"
#include "paddle/cinn/ir/ir_base.h"
#include "paddle/cinn/ir/ir_printer.h"
#include "paddle/cinn/ir/lowered_func.h"
#include "paddle/cinn/ir/schedule/ir_schedule.h"
#include "paddle/cinn/ir/tensor.h"
#include "paddle/cinn/ir/utils/ir_copy.h"
#include "paddle/cinn/ir/utils/ir_nodes_collector.h"
#include "paddle/cinn/ir/utils/ir_printer.h"
#include "paddle/cinn/lang/lower.h"
#include "paddle/cinn/optim/optimize.h"
#include "paddle/cinn/optim/transform_gpu_forloop.h"
......@@ -41,7 +41,7 @@ std::vector<ir::Var> IndicesToVars(const std::vector<ir::Expr>& indices) {
for (const ir::Expr& e : indices) {
// Do we need to convert other types, such as const numbers, to Var?
if (e.As<ir::_Var_>() != nullptr) {
ir::Expr copy_e = optim::IRCopy(e);
ir::Expr copy_e = ir::ir_utils::IRCopy(e);
ir::_Var_* var_ref = copy_e.As<ir::_Var_>();
result.emplace_back(ir::Var(var_ref));
}
......@@ -54,7 +54,8 @@ void AnalyzeScheduleBlockReadWriteBuffer(ir::ScheduleBlock* sche_block) {
return;
}
ir::CollectIRNodesWithoutTensor(sche_block->body, [&](const Expr* x) {
ir::ir_utils::CollectIRNodesWithoutTensor(
sche_block->body, [&](const Expr* x) {
const ir::Load* load_expr = x->As<ir::Load>();
if (load_expr != nullptr) {
const ir::Tensor t = load_expr->tensor.as_tensor_ref();
......@@ -76,7 +77,7 @@ void AnalyzeScheduleBlockReadWriteBuffer(ir::ScheduleBlock* sche_block) {
bool ContainsNodeType(ir::Expr expr,
const std::unordered_set<ir::IrNodeTy>& node_types) {
std::set<ir::Expr> collection =
ir::CollectIRNodesWithoutTensor(expr, [&](const Expr* x) {
ir::ir_utils::CollectIRNodesWithoutTensor(expr, [&](const Expr* x) {
return node_types.find(x->node_type()) != node_types.end();
});
return !collection.empty();
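A hedged usage sketch of this helper; the node-type set here is illustrative, not from the patch:

// Hypothetical caller: test whether a lowered body still contains loop nodes.
bool has_loops = ContainsNodeType(
    funcs[0]->body, {ir::IrNodeTy::For, ir::IrNodeTy::PolyFor});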
......@@ -189,5 +190,40 @@ ir::LoweredFunc UpdateFuncWithNewBody(const common::Target& target,
return new_func;
}
std::unordered_set<std::string> GetReduceLoopVarNames(const ir::Expr block) {
const ir::ScheduleBlockRealize* block_realize =
block.As<ir::ScheduleBlockRealize>();
CHECK_NOTNULL(block_realize);
const ir::ScheduleBlock* block_node =
block_realize->schedule_block.As<ir::ScheduleBlock>();
CHECK_NOTNULL(block_node);
std::vector<ir::Expr> iter_values = block_realize->iter_values;
std::vector<ir::Var> iter_vars = block_node->iter_vars;
std::unordered_set<std::string> reduce_loop_var;
for (int i = 0; i < iter_vars.size(); ++i) {
if (iter_vars[i]->is_reduce_axis) {
ir::ir_utils::CollectIRNodesWithoutTensor(
iter_values[i], [&](const ir::Expr* x) {
if (x->as_var()) {
reduce_loop_var.insert(x->as_var_ref()->name);
}
return false;
});
}
}
return reduce_loop_var;
}
std::string GetBlockName(const ir::Expr block) {
const ir::ScheduleBlockRealize* block_realize =
block.As<ir::ScheduleBlockRealize>();
CHECK_NOTNULL(block_realize);
const ir::ScheduleBlock* block_node =
block_realize->schedule_block.As<ir::ScheduleBlock>();
CHECK_NOTNULL(block_node);
return block_node->name;
}
} // namespace auto_schedule
} // namespace cinn
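A hedged sketch of how the two new helpers might be driven from an IRSchedule; the schedule construction and block loop are illustrative, not part of the patch:

// Hypothetical usage: inspect every block of a schedule and log which
// loop vars feed its reduce axes.
ir::IRSchedule ir_sch(mod_expr);
for (ir::Expr block : ir_sch.GetAllBlocks()) {
  std::unordered_set<std::string> reduce_vars = GetReduceLoopVarNames(block);
  VLOG(6) << GetBlockName(block) << " uses " << reduce_vars.size()
          << " reduce loop var(s)";
}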
......@@ -48,5 +48,15 @@ ir::LoweredFunc UpdateFuncWithNewBody(const common::Target& target,
const ir::LoweredFunc& old_func,
ir::Expr& body); // NOLINT
/**
* Get the loop var names of the reduce axes
*/
std::unordered_set<std::string> GetReduceLoopVarNames(const ir::Expr block);
/**
* Get name of a ScheduleBlock
*/
std::string GetBlockName(const ir::Expr block);
} // namespace auto_schedule
} // namespace cinn
......@@ -20,6 +20,7 @@
#include <sstream>
#include <vector>
#include "paddle/cinn/ast_gen_ius/tensor_group.h"
#include "paddle/cinn/common/context.h"
#include "paddle/cinn/ir/ir.h"
#include "paddle/cinn/ir/ir_base.h"
......@@ -49,9 +50,9 @@ TEST(AnalyzeIr, AnalyzeScheduleBlockReadWriteBuffer_SimpleAssign) {
ir::Tensor B = lang::Compute(
{M, N}, [&](Var i, Var j) { return A(i, j); }, "B");
poly::StageMap stages = poly::CreateStages({A, B});
std::vector<ir::LoweredFunc> funcs = lang::LowerVec(
"SimpleAssign", stages, {A, B}, {}, {}, nullptr, target, true);
ast_gen_ius::TensorGroup tensor_group({A, B});
std::vector<ir::LoweredFunc> funcs =
lang::LowerToAstVec("SimpleAssign", {A, B}, &tensor_group, target);
ASSERT_FALSE(funcs.empty());
ir::Expr ast_expr = funcs[0]->body;
......@@ -115,9 +116,9 @@ TEST(AnalyzeIr, AnalyzeScheduleBlockReadWriteBuffer_AddDiffShape) {
ir::Tensor C = lang::Compute(
{M, N}, [&](Var i, Var j) { return A(i) + B(j); }, "C");
poly::StageMap stages = poly::CreateStages({C});
std::vector<ir::LoweredFunc> funcs = lang::LowerVec(
"AddDiffShape", stages, {C}, {}, {}, nullptr, target, true);
ast_gen_ius::TensorGroup tensor_group({C});
std::vector<ir::LoweredFunc> funcs =
lang::LowerToAstVec("AddDiffShape", {C}, &tensor_group, target);
ir::Expr ast_expr = funcs[0]->body;
VLOG(6) << "Expr before MultiLevelTiling: ";
......@@ -169,9 +170,9 @@ TEST(AnalyzeIr, ContainsNodeType) {
ir::Tensor B = lang::Compute(
{M, N}, [&](Var i, Var j) { return A(i, j); }, "B");
poly::StageMap stages = poly::CreateStages({A, B});
std::vector<ir::LoweredFunc> funcs = lang::LowerVec(
"SimpleAssign", stages, {A, B}, {}, {}, nullptr, target, true);
ast_gen_ius::TensorGroup tensor_group({A, B});
std::vector<ir::LoweredFunc> funcs =
lang::LowerToAstVec("SimpleAssign", {A, B}, &tensor_group, target);
ASSERT_FALSE(funcs.empty());
ir::Expr ast_expr = funcs[0]->body;
......
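The same lowering migration recurs in every test touched below; a hedged distillation of the pattern, with tensor and function names illustrative:

// Old: stage-based lowering.
//   poly::StageMap stages = poly::CreateStages({A, B});
//   auto funcs = lang::LowerVec(
//       "fn", stages, {A, B}, {}, {}, nullptr, target, true);
// New: group the tensors and lower them directly to an AST.
ast_gen_ius::TensorGroup tensor_group({A, B});
std::vector<ir::LoweredFunc> funcs =
    lang::LowerToAstVec("fn", {A, B}, &tensor_group, target);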
......@@ -63,8 +63,8 @@ void AutoTuner::Initialize(const Config& config,
const auto& shape_dict = graph_->GetAttrs<
absl::flat_hash_map<std::string, hlir::framework::shape_t>>("infershape");
op_lowerer_ = std::make_unique<hlir::framework::OpLowerer>(
dtype_dict, shape_dict, target_);
op_lowerer_ = std::make_unique<hlir::framework::OpLowerer<GroupPtr>>(
new hlir::framework::OpLowererImpl(dtype_dict, shape_dict, target_));
InitialTaskRegistry* task_registry = InitialTaskRegistry::Global();
for (auto i = 0; i < tasks_.size(); ++i) {
auto&& task = tasks_[i];
......
......@@ -30,11 +30,11 @@
namespace cinn {
namespace auto_schedule {
// This class is the entrance of auto-tune. Users can use it
// to tune a graph (not supported yet) and search for a series of schedules
// that are more likely to obtain better performance.
// Internally, it creates the necessary components and uses them to perform
// tuning.
using GroupPtr = hlir::framework::GroupPtr;
class AutoTuner {
public:
// configure how to perform auto-tune, such as
......@@ -58,7 +58,7 @@ class AutoTuner {
private:
const common::Target& target_;
hlir::framework::Graph* graph_;
std::unique_ptr<hlir::framework::OpLowerer> op_lowerer_;
std::unique_ptr<hlir::framework::OpLowerer<GroupPtr>> op_lowerer_;
// Tasks to tune
std::vector<TuneTask> tasks_;
......
......@@ -26,17 +26,19 @@
#include "paddle/cinn/frontend/syntax.h"
#include "paddle/cinn/hlir/framework/graph.h"
#include "paddle/cinn/hlir/framework/graph_compiler.h"
#include "paddle/cinn/hlir/framework/graph_compiler_util.h"
#include "paddle/cinn/hlir/framework/node.h"
#include "paddle/cinn/hlir/framework/pass.h"
#include "paddle/cinn/ir/ir_base.h"
#include "paddle/cinn/runtime/flags.h"
DECLARE_bool(auto_schedule_use_cost_model);
PD_DECLARE_bool(auto_schedule_use_cost_model);
namespace cinn {
namespace auto_schedule {
using ::cinn::hlir::framework::BuildScope;
using ::cinn::hlir::framework::CompilationContext;
using ::cinn::hlir::framework::Graph;
using ::cinn::hlir::framework::GraphCompiler;
using ::cinn::hlir::framework::Instruction;
......@@ -53,6 +55,7 @@ class TestAutoTuner : public ::testing::Test {
std::shared_ptr<Graph> graph;
std::shared_ptr<Scope> compiled_scope;
CompilationContext context;
std::unique_ptr<GraphCompiler> graph_compiler;
std::unique_ptr<AutoTuner> tuner;
......@@ -73,8 +76,10 @@ class TestAutoTuner : public ::testing::Test {
auto program = CreateAddReluProgram();
auto graph = cinn::frontend::Optimize(&program, fetch_ids, target);
compiled_scope = BuildScope(target, graph);
graph_compiler =
std::make_unique<GraphCompiler>(target, compiled_scope, graph);
context.graph = graph;
context.scope = compiled_scope;
context.target = target;
graph_compiler = std::make_unique<GraphCompiler>(context);
tuner = std::make_unique<AutoTuner>(target, graph.get());
}
......@@ -99,16 +104,14 @@ class TestAutoTuner : public ::testing::Test {
virtual void ApplyTunedAndRun(const TuningResult& result) {
// build runtime program with tuning result
GraphCompiler::CompileOptions compile_options;
compile_options.with_instantiate_variables = true;
compile_options.Apply(result);
ASSERT_EQ(1, compile_options.groups.size());
ASSERT_EQ(1, compile_options.lowered_funcs.size());
context.with_instantiate_variables = true;
context.ApplyTuningResult(result);
ASSERT_EQ(1, context.groups.size());
ASSERT_EQ(1, context.lowered_funcs.size());
VLOG(6) << "Print lowered_funcs before building";
VLOG(6) << compile_options.lowered_funcs[0][0];
VLOG(6) << compile_options.lowered_funcs[1][0];
auto runtime_program =
graph_compiler->Build(compile_options).runtime_program;
VLOG(6) << context.lowered_funcs[0][0];
VLOG(6) << context.lowered_funcs[1][0];
auto runtime_program = graph_compiler->Build(&context).runtime_program;
ASSERT_EQ(1, runtime_program->size());
runtime_program->Execute();
}
......
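Taken together, these hunks replace GraphCompiler::CompileOptions with a CompilationContext that is threaded through construction and Build; a hedged summary sketch assembled from the lines above:

// One context object now carries graph, scope, target and tuning results.
CompilationContext context(graph, scope, target);
context.with_instantiate_variables = true;
GraphCompiler graph_compiler(context);
context.ApplyTuningResult(result);  // fold tuned groups/lowered_funcs in
auto runtime_program = graph_compiler.Build(&context).runtime_program;
runtime_program->Execute();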
......@@ -3,7 +3,8 @@ core_gather_headers()
gather_srcs(cinnapi_src SRCS xgb_cost_model.cc expr_cost_model.cc feature.cc
feature_extractor.cc)
cinn_cc_test(test_xgb_cost_model SRCS xgb_cost_model_test.cc DEPS cinncore)
# TODO(zhhsplendid): enable this test again
#cinn_cc_test(test_xgb_cost_model SRCS xgb_cost_model_test.cc DEPS cinncore)
cinn_cc_test(test_feature_extractor SRCS feature_extractor_test.cc DEPS
cinncore)
cinn_cc_test(test_feature SRCS feature_test.cc DEPS cinncore)
......@@ -33,9 +33,9 @@
#include "paddle/cinn/common/type.h"
#include "paddle/cinn/ir/ir.h"
#include "paddle/cinn/ir/ir_base.h"
#include "paddle/cinn/ir/ir_printer.h"
#include "paddle/cinn/ir/schedule/ir_schedule.h"
#include "paddle/cinn/ir/utils/ir_copy.h"
#include "paddle/cinn/ir/utils/ir_printer.h"
#include "paddle/cinn/optim/transform_polyfor_to_for.h"
namespace cinn {
......@@ -82,6 +82,7 @@ VisitDoNothing(ScheduleBlockRealize);
VisitDoNothing(Ramp);
VisitDoNothing(_Buffer_);
VisitDoNothing(_BufferRange_);
VisitDoNothing(_Dim_);
#define NotVisitExprFields(NodeType) \
void FeatureExtractor::Visit(const NodeType *x) {}
......@@ -218,7 +219,7 @@ void FeatureExtractor::Visit(const For *x) {
}
void FeatureExtractor::Visit(const PolyFor *x) {
Expr copy = optim::IRCopy(Expr(x));
Expr copy = ir::ir_utils::IRCopy(Expr(x));
feature_.IntoLoopBlock();
optim::TransformPolyForToFor(&copy);
ir::For *loop = copy.As<For>();
......
......@@ -31,8 +31,8 @@
#include "paddle/cinn/common/target.h"
#include "paddle/cinn/ir/ir.h"
#include "paddle/cinn/ir/ir_base.h"
#include "paddle/cinn/ir/ir_visitor.h"
#include "paddle/cinn/ir/schedule/ir_schedule.h"
#include "paddle/cinn/ir/utils/ir_visitor.h"
namespace cinn {
namespace auto_schedule {
......
......@@ -21,6 +21,7 @@
#include <unordered_set>
#include <vector>
#include "paddle/cinn/ast_gen_ius/tensor_group.h"
#include "paddle/cinn/common/context.h"
#include "paddle/cinn/ir/ir.h"
#include "paddle/cinn/ir/ir_base.h"
......@@ -48,9 +49,9 @@ TEST(FeatureExtractor, SimpleAssign) {
ir::Tensor B = lang::Compute(
{M, N}, [&](Var i, Var j) { return A(i, j); }, "B");
poly::StageMap stages = poly::CreateStages({A, B});
std::vector<ir::LoweredFunc> funcs = lang::LowerVec(
"SimpleAssign", stages, {A, B}, {}, {}, nullptr, target, true);
ast_gen_ius::TensorGroup tensor_group({A, B});
std::vector<ir::LoweredFunc> funcs =
lang::LowerToAstVec("SimpleAssign", {A, B}, &tensor_group, target);
ir::Expr ast_expr = funcs[0]->body;
VLOG(6) << "Expr to test: " << ast_expr;
......@@ -88,6 +89,7 @@ TEST(FeatureExtractor, SimpleAssign) {
ASSERT_EQ(to_check[29], slog(3));
}
#ifdef CINN_WITH_CUDA
TEST(FeatureExtractor, MatrixMultiply) {
Context::Global().ResetNameId();
#ifdef CINN_WITH_CUDA
......@@ -109,9 +111,9 @@ TEST(FeatureExtractor, MatrixMultiply) {
[&](Var i, Var j) { return lang::ReduceSum(A(i, k) * B(k, j), {k}); },
"C");
poly::StageMap stages = poly::CreateStages({C});
std::vector<ir::LoweredFunc> funcs = lang::LowerVec(
"MatrixMultiply", stages, {C}, {}, {}, nullptr, target, true);
ast_gen_ius::TensorGroup tensor_group({C});
std::vector<ir::LoweredFunc> funcs =
lang::LowerToAstVec("SimpleAssign", {C}, &tensor_group, target);
std::vector<Expr> vec_ast{funcs[0]->body};
ir::ModuleExpr mod_expr(vec_ast);
......@@ -161,6 +163,6 @@ TEST(FeatureExtractor, MatrixMultiply) {
// GpuBind loop
ASSERT_EQ(to_check[37], slog(out_loop));
}
#endif
} // namespace auto_schedule
} // namespace cinn
......@@ -20,12 +20,13 @@
#include <fstream>
#include <vector>
#include "paddle/cinn/ast_gen_ius/tensor_group.h"
#include "paddle/cinn/auto_schedule/search_space/search_state.h"
#include "paddle/cinn/auto_schedule/task/task_registry.h"
#include "paddle/cinn/cinn.h"
#include "paddle/cinn/ir/ir_printer.h"
#include "paddle/cinn/ir/schedule/ir_schedule.h"
#include "paddle/cinn/ir/utils/ir_copy.h"
#include "paddle/cinn/ir/utils/ir_printer.h"
namespace cinn {
namespace auto_schedule {
......@@ -47,8 +48,8 @@ std::vector<ir::LoweredFunc> LowerCompute(const std::vector<int>& shape,
C = Compute(
domain, [&B](Var i, Var j) { return B(i, j); }, "C");
return cinn::lang::LowerVec(
"test_func", CreateStages({A, B}), {A, B}, {}, {}, nullptr, target, true);
ast_gen_ius::TensorGroup tensor_group({A, B});
return cinn::lang::LowerToAstVec("test_func", {A, B}, &tensor_group, target);
}
// Create a new IRSchedule with copied ir::LoweredFunc AST
......@@ -56,7 +57,7 @@ ir::IRSchedule MakeIRSchedule(const std::vector<ir::LoweredFunc>& lowered_funcs,
const std::string& task_key) {
std::vector<Expr> exprs;
for (auto&& func : lowered_funcs) {
exprs.emplace_back(optim::IRCopy(func->body));
exprs.emplace_back(ir::ir_utils::IRCopy(func->body));
}
InitialTaskRegistry* task_registry = InitialTaskRegistry::Global();
task_registry->Regist(task_key, ir::ModuleExpr(exprs));
......
......@@ -25,12 +25,15 @@
#include "paddle/cinn/frontend/optimize.h"
#include "paddle/cinn/frontend/syntax.h"
#include "paddle/cinn/hlir/framework/graph_compiler.h"
#include "paddle/cinn/hlir/framework/graph_compiler_util.h"
#include "paddle/cinn/hlir/framework/op_lowering.h"
#include "paddle/cinn/runtime/flags.h"
namespace cinn {
namespace auto_schedule {
using ::cinn::hlir::framework::BuildScope;
using ::cinn::hlir::framework::CompilationContext;
using ::cinn::hlir::framework::Graph;
using ::cinn::hlir::framework::GraphCompiler;
......@@ -62,7 +65,8 @@ class TestMeasurer : public ::testing::Test {
auto program = CreateAddReluProgram();
auto graph = cinn::frontend::Optimize(&program, fetch_ids, target);
auto scope = BuildScope(target, graph);
graph_compiler = std::make_unique<GraphCompiler>(target, scope, graph);
CompilationContext context(graph, scope, target);
graph_compiler = std::make_unique<GraphCompiler>(context);
TaskCreator task_creator;
tasks = task_creator.CreateTuneTaskOpLevel(graph.get());
const auto& dtype_dict =
......@@ -72,12 +76,12 @@ class TestMeasurer : public ::testing::Test {
absl::flat_hash_map<std::string, hlir::framework::shape_t>>(
"infershape");
auto op_lowerer = std::make_unique<hlir::framework::OpLowerer>(
dtype_dict, shape_dict, target);
auto op_lowerer =
hlir::framework::CreateOpLowerer(dtype_dict, shape_dict, target);
inputs.reserve(tasks.size());
for (int i = 0; i < tasks.size(); ++i) {
auto* task = &tasks[i];
task->Initialize(shape_dict, dtype_dict, op_lowerer.get());
task->Initialize(shape_dict, dtype_dict, &op_lowerer);
MeasureInput input;
input.task = task;
input.lowered_funcs = task->lowered_funcs;
......
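A hedged sketch of the factory-based wiring shown above; the dictionary names follow the diff:

// Build one lowerer via the factory and share its address across tasks.
auto op_lowerer =
    hlir::framework::CreateOpLowerer(dtype_dict, shape_dict, target);
for (auto& task : tasks) {
  task.Initialize(shape_dict, dtype_dict, &op_lowerer);
}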
......@@ -17,6 +17,8 @@
namespace cinn {
namespace auto_schedule {
using hlir::framework::CompilationContext;
using hlir::framework::CompilationResult;
using hlir::framework::GraphCompiler;
SimpleBuilder::SimpleBuilder(hlir::framework::GraphCompiler* graph_compiler)
......@@ -25,19 +27,18 @@ SimpleBuilder::SimpleBuilder(hlir::framework::GraphCompiler* graph_compiler)
BuildResult SimpleBuilder::Build(const MeasureInput& input) {
CHECK_NE(graph_compiler_, static_cast<GraphCompiler*>(nullptr))
<< "empty handle to GraphCompiler";
GraphCompiler::CompileOptions compile_options;
compile_options.groups.emplace_back(input.task->subgraph);
compile_options.lowered_funcs.emplace_back(input.lowered_funcs);
compile_options.remove_unused_variables = false;
CompilationContext& context = graph_compiler_->GetCompilationContext();
context.groups.emplace_back(input.task->subgraph);
context.lowered_funcs.emplace_back(input.lowered_funcs);
context.remove_unused_variables = false;
VLOG(5) << "call GraphCompiler to Build with Graph::Group size="
<< compile_options.groups.size() << ", lowered_funcs group size="
<< compile_options.lowered_funcs.size();
GraphCompiler::CompilationResult compiled_result =
graph_compiler_->Build(compile_options);
<< context.groups.size()
<< ", lowered_funcs group size=" << context.lowered_funcs.size();
CompilationResult compiled_result = graph_compiler_->Build(&context);
BuildResult build_result;
build_result.compiled_scope = graph_compiler_->GetScope().get();
build_result.runtime_program = std::move(compiled_result.runtime_program);
build_result.runtime_program = std::move(compiled_result.RuntimeProgram());
return build_result;
}
......
......@@ -16,6 +16,7 @@
#include "paddle/cinn/auto_schedule/measure/measure.h"
#include "paddle/cinn/hlir/framework/graph_compiler.h"
#include "paddle/cinn/hlir/framework/graph_compiler_util.h"
namespace cinn {
namespace auto_schedule {
......
......@@ -25,11 +25,13 @@
#include "paddle/cinn/frontend/optimize.h"
#include "paddle/cinn/frontend/syntax.h"
#include "paddle/cinn/hlir/framework/graph_compiler.h"
#include "paddle/cinn/hlir/framework/graph_compiler_util.h"
namespace cinn {
namespace auto_schedule {
using ::cinn::hlir::framework::BuildScope;
using ::cinn::hlir::framework::CompilationContext;
using ::cinn::hlir::framework::Graph;
using ::cinn::hlir::framework::GraphCompiler;
using ::cinn::hlir::framework::Instruction;
......@@ -56,8 +58,8 @@ class TestSimpleRunner : public ::testing::Test {
auto program = CreateAddReluProgram();
auto graph = cinn::frontend::Optimize(&program, fetch_ids, target);
compiled_scope = BuildScope(target, graph);
graph_compiler =
std::make_unique<GraphCompiler>(target, compiled_scope, graph);
CompilationContext context(graph, compiled_scope, target);
graph_compiler = std::make_unique<GraphCompiler>(context);
auto runtime_program = graph_compiler->Build();
const auto& instructions = runtime_program->GetRunInstructions();
ASSERT_EQ(1, instructions.size());
......@@ -123,8 +125,8 @@ TEST_F(TestSimpleRunner, TimeMeasured) {
"sleep_fn"));
instructions.back()->SetLoweredFunc(reinterpret_cast<void*>(sleep_fn));
instructions.back()->Finalize();
build_result.runtime_program.reset(
new hlir::framework::Program(nullptr, std::move(instructions)));
build_result.runtime_program = std::make_unique<hlir::framework::Program>(
nullptr, std::move(instructions));
// to skip the condition check of params in Instruction::PreparePodArgs
std::map<std::string, cinn_pod_value_t> preset_args;
......
......@@ -15,9 +15,9 @@
#include "paddle/cinn/auto_schedule/post_schedule_rule/cooperative_process.h"
#include "paddle/cinn/ir/ir.h"
#include "paddle/cinn/ir/ir_printer.h"
#include "paddle/cinn/ir/schedule/ir_schedule.h"
#include "paddle/cinn/ir/schedule/schedule_desc.h"
#include "paddle/cinn/ir/utils/ir_printer.h"
namespace cinn {
namespace auto_schedule {
......
......@@ -17,7 +17,7 @@
#include <gtest/gtest.h>
#include "paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.h"
#include "paddle/cinn/ir/utils/ir_printer.h"
#include "paddle/cinn/ir/ir_printer.h"
#include "test/cpp/cinn/program_builder.h"
namespace cinn {
......@@ -129,7 +129,7 @@ TEST_F(TestCooperativeProcess, Matmul) {
{
i0, i1 = axis.bind(((16 * i) + ((2 * i_0) + i_1)), ((16 * j) + ((8 * j_0) + j_1)))
{
temp_matmul_out__reduce_init[((16 * i) + ((2 * i_0) + i_1)), ((16 * j) + ((8 * j_0) + j_1))] = 0.00000000f
temp_matmul_out__reduce_init[i0, i1] = 0.00000000f
}
}
}
......@@ -181,7 +181,7 @@ TEST_F(TestCooperativeProcess, Matmul) {
{
i0_0, i1_0, i2 = axis.bind(((2 * (i_0_j_0_fused / 2)) + ((16 * (i_j_fused / 2)) + i_1)), ((8 * (i_0_j_0_fused % 2)) + ((16 * (i_j_fused % 2)) + j_1)), ((4 * reduce_k_0) + reduce_k_1))
{
temp_matmul_out[((2 * (i_0_j_0_fused / 2)) + ((16 * (i_j_fused / 2)) + i_1)), ((8 * (i_0_j_0_fused % 2)) + ((16 * (i_j_fused % 2)) + j_1))] = (temp_matmul_out[((2 * (i_0_j_0_fused / 2)) + ((16 * (i_j_fused / 2)) + i_1)), ((8 * (i_0_j_0_fused % 2)) + ((16 * (i_j_fused % 2)) + j_1))] + (X_reshape_shared_temp_buffer[((2 * (i_0_j_0_fused / 2)) + ((16 * (i_j_fused / 2)) + i_1)), ((4 * reduce_k_0) + reduce_k_1)] * Y_reshape_shared_temp_buffer[((4 * reduce_k_0) + reduce_k_1), ((8 * (i_0_j_0_fused % 2)) + ((16 * (i_j_fused % 2)) + j_1))]))
temp_matmul_out[i0_0, i1_0] = (temp_matmul_out[i0_0, i1_0] + (X_reshape_shared_temp_buffer[i0_0, i2] * Y_reshape_shared_temp_buffer[i2, i1_0]))
}
}
}
......
......@@ -8,7 +8,8 @@ gather_srcs(
auto_unroll.cc
multi_level_tiling.cc
skip_rule.cc
auto_bind.cc)
auto_bind.cc
reduction_factoring.cc)
if(WITH_TESTING)
cinn_cc_library(
......@@ -51,3 +52,11 @@ endif()
#cinn_cc_test(test_auto_inline SRCS auto_inline_test.cc DEPS cinncore auto_gen_rule_test_helper)
cinn_cc_test(test_skip_rule SRCS skip_rule_test.cc DEPS cinncore)
cinn_cc_test(test_auto_unroll SRCS auto_unroll_test.cc DEPS cinncore)
cinn_cc_test(
test_reduction_factoring
SRCS
reduction_factoring_test.cc
DEPS
cinncore
auto_gen_rule_test_helper
test_program_builder)
......@@ -16,10 +16,11 @@
#include <glog/logging.h>
#include "paddle/cinn/ir/ir_printer.h"
#include "paddle/cinn/ir/schedule/ir_schedule.h"
#include "paddle/cinn/ir/schedule_block_graph.h"
#include "paddle/cinn/ir/utils/ir_copy.h"
#include "paddle/cinn/ir/utils/ir_nodes_collector.h"
#include "paddle/cinn/ir/utils/ir_printer.h"
namespace cinn {
namespace auto_schedule {
......@@ -31,7 +32,7 @@ bool IsSpatialLoop(const ir::For* for_node) {
const auto& loop_var = for_node->loop_var;
// Collect cases where the loop_var is used in one of the reduce axes of the
// ScheduleBlock underneath.
auto used_for_reduce_axis = ir::CollectIRNodesWithoutTensor(
auto used_for_reduce_axis = ir::ir_utils::CollectIRNodesWithoutTensor(
for_node->body, [&loop_var](const Expr* x) {
const auto* block_realize = x->As<ir::ScheduleBlockRealize>();
if (!block_realize) return false;
......@@ -46,7 +47,7 @@ bool IsSpatialLoop(const ir::For* for_node) {
const ir::Expr& binding = block_realize->iter_values[i];
if (iter_var->is_reduce_axis ||
iter_var->name.substr(0, 6) == "reduce") {
auto used_exprs = ir::CollectIRNodesWithoutTensor(
auto used_exprs = ir::ir_utils::CollectIRNodesWithoutTensor(
binding, [&loop_var](const Expr* x) {
const ir::_Var_* var = x->As<ir::_Var_>();
if (var &&
......@@ -94,6 +95,8 @@ void BindGPUIndex(ir::IRSchedule* ir_schedule,
auto all_loops = ir_schedule->GetLoops(block_name);
CHECK_LE(num_loops_to_bind, all_loops.size())
<< "The number of loops to be bind is greater than size of all_loops";
CHECK_GE(num_loops_to_bind, 0)
<< "The number of loops to be bind should be greater than 0";
// check whether it is the case that threadIdx has been binded but blockIdx
// not, the threadIdx can only be binded in the first loop after
// num_loops_to_bind loops because we has excluded other cases in
......@@ -101,6 +104,17 @@ void BindGPUIndex(ir::IRSchedule* ir_schedule,
bool gpu_thread_has_binded =
num_loops_to_bind < all_loops.size() &&
all_loops[num_loops_to_bind].As<ir::For>()->is_gpu_thread_binded();
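// Also scan the statements ordered under the last loop to be bound: if any
// nested For there is already bound to a GPU thread axis, treat threadIdx as
// taken so it is not bound twice.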
ir::BlockOrderConstructor block_order_constructor;
std::map<std::vector<int>, ir::Expr> blocks_order_with_ctrl_stmt =
block_order_constructor(&all_loops[num_loops_to_bind - 1]);
for (auto& pair : blocks_order_with_ctrl_stmt) {
if (pair.first.size() == 2) {
ir::Expr stmt = pair.second;
if (stmt.As<ir::For>() && stmt.As<ir::For>()->is_gpu_thread_binded()) {
gpu_thread_has_binded = true;
}
}
}
Expr fused_loop = ir_schedule->Fuse(
{all_loops.begin(), all_loops.begin() + num_loops_to_bind});
int32_t extent = fused_loop.As<ir::For>()->extent.as_int32();
......@@ -181,5 +195,18 @@ std::vector<SearchState> AutoBind::ApplyOnBlock(SearchState state,
return {new_state};
}
void AutoBind::Apply(ir::IRSchedule* ir_schedule,
const std::string& block_name) {
int num_loop_can_bind =
CountLoopCanBinded(ir_schedule->GetLoops(block_name)[0].As<ir::For>());
if (num_loop_can_bind > 0) {
BindGPUIndex(ir_schedule,
block_name,
num_loop_can_bind,
kMaxBlocks,
target_->max_num_threads());
}
}
} // namespace auto_schedule
} // namespace cinn
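A hedged sketch of driving the new per-block entry point; constructing the rule from a target follows the existing AutoGenRule pattern, and the exact constructor signature is assumed:

// Hypothetical driver: apply AutoBind to one named block of a schedule.
AutoBind auto_bind(target);  // ctor signature assumed from AutoGenRule
auto_bind.Apply(&ir_schedule, "temp_matmul_out");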
......@@ -42,6 +42,8 @@ class AutoBind : public AutoGenRule {
std::vector<SearchState> ApplyOnBlock(SearchState state,
const std::string& block_name) override;
void Apply(ir::IRSchedule* ir_schedule, const std::string& block_name);
private:
std::vector<Expr> applicable_schedule_blocks_;
};
......