Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Paddle
Commits
01a10755
Commit
01a10755
authored
Mar 04, 2024
by
yuguo-Jack
Browse files
2.5.2-dtk24.04
parent
63eb0da5
Changes
565
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
186 additions
and
85 deletions
+186
-85
paddle/cinn/auto_schedule/analysis/analyze_ir.cc
paddle/cinn/auto_schedule/analysis/analyze_ir.cc
+56
-20
paddle/cinn/auto_schedule/analysis/analyze_ir.h
paddle/cinn/auto_schedule/analysis/analyze_ir.h
+10
-0
paddle/cinn/auto_schedule/analysis/analyze_ir_test.cc
paddle/cinn/auto_schedule/analysis/analyze_ir_test.cc
+10
-9
paddle/cinn/auto_schedule/auto_tuner.cc
paddle/cinn/auto_schedule/auto_tuner.cc
+2
-2
paddle/cinn/auto_schedule/auto_tuner.h
paddle/cinn/auto_schedule/auto_tuner.h
+2
-2
paddle/cinn/auto_schedule/auto_tuner_test.cc
paddle/cinn/auto_schedule/auto_tuner_test.cc
+15
-12
paddle/cinn/auto_schedule/cost_model/CMakeLists.txt
paddle/cinn/auto_schedule/cost_model/CMakeLists.txt
+2
-1
paddle/cinn/auto_schedule/cost_model/feature_extractor.cc
paddle/cinn/auto_schedule/cost_model/feature_extractor.cc
+3
-2
paddle/cinn/auto_schedule/cost_model/feature_extractor.h
paddle/cinn/auto_schedule/cost_model/feature_extractor.h
+1
-1
paddle/cinn/auto_schedule/cost_model/feature_extractor_test.cc
...e/cinn/auto_schedule/cost_model/feature_extractor_test.cc
+9
-7
paddle/cinn/auto_schedule/database/jsonfile_database_test.cc
paddle/cinn/auto_schedule/database/jsonfile_database_test.cc
+5
-4
paddle/cinn/auto_schedule/measure/measurer_test.cc
paddle/cinn/auto_schedule/measure/measurer_test.cc
+8
-4
paddle/cinn/auto_schedule/measure/simple_builder.cc
paddle/cinn/auto_schedule/measure/simple_builder.cc
+10
-9
paddle/cinn/auto_schedule/measure/simple_builder.h
paddle/cinn/auto_schedule/measure/simple_builder.h
+1
-0
paddle/cinn/auto_schedule/measure/simple_runner_test.cc
paddle/cinn/auto_schedule/measure/simple_runner_test.cc
+6
-4
paddle/cinn/auto_schedule/post_schedule_rule/cooperative_process.cc
...n/auto_schedule/post_schedule_rule/cooperative_process.cc
+1
-1
paddle/cinn/auto_schedule/post_schedule_rule/cooperative_process_test.cc
...o_schedule/post_schedule_rule/cooperative_process_test.cc
+3
-3
paddle/cinn/auto_schedule/search_space/auto_gen_rule/CMakeLists.txt
...n/auto_schedule/search_space/auto_gen_rule/CMakeLists.txt
+10
-1
paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.cc
...inn/auto_schedule/search_space/auto_gen_rule/auto_bind.cc
+30
-3
paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.h
...cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.h
+2
-0
No files found.
Too many changes to show.
To preserve performance only
565 of 565+
files are displayed.
Plain diff
Email patch
paddle/cinn/auto_schedule/analysis/analyze_ir.cc
View file @
01a10755
...
...
@@ -23,12 +23,12 @@
#include "paddle/cinn/ir/buffer.h"
#include "paddle/cinn/ir/ir.h"
#include "paddle/cinn/ir/ir_base.h"
#include "paddle/cinn/ir/ir_printer.h"
#include "paddle/cinn/ir/lowered_func.h"
#include "paddle/cinn/ir/schedule/ir_schedule.h"
#include "paddle/cinn/ir/tensor.h"
#include "paddle/cinn/ir/utils/ir_copy.h"
#include "paddle/cinn/ir/utils/ir_nodes_collector.h"
#include "paddle/cinn/ir/utils/ir_printer.h"
#include "paddle/cinn/lang/lower.h"
#include "paddle/cinn/optim/optimize.h"
#include "paddle/cinn/optim/transform_gpu_forloop.h"
...
...
@@ -41,7 +41,7 @@ std::vector<ir::Var> IndicesToVars(const std::vector<ir::Expr>& indices) {
for
(
const
ir
::
Expr
&
e
:
indices
)
{
// Whether we have to convert other types, like const numbers to Var?
if
(
e
.
As
<
ir
::
_Var_
>
()
!=
nullptr
)
{
ir
::
Expr
copy_e
=
optim
::
IRCopy
(
e
);
ir
::
Expr
copy_e
=
ir
::
ir_utils
::
IRCopy
(
e
);
ir
::
_Var_
*
var_ref
=
copy_e
.
As
<
ir
::
_Var_
>
();
result
.
emplace_back
(
ir
::
Var
(
var_ref
));
}
...
...
@@ -54,7 +54,8 @@ void AnalyzeScheduleBlockReadWriteBuffer(ir::ScheduleBlock* sche_block) {
return
;
}
ir
::
CollectIRNodesWithoutTensor
(
sche_block
->
body
,
[
&
](
const
Expr
*
x
)
{
ir
::
ir_utils
::
CollectIRNodesWithoutTensor
(
sche_block
->
body
,
[
&
](
const
Expr
*
x
)
{
const
ir
::
Load
*
load_expr
=
x
->
As
<
ir
::
Load
>
();
if
(
load_expr
!=
nullptr
)
{
const
ir
::
Tensor
t
=
load_expr
->
tensor
.
as_tensor_ref
();
...
...
@@ -76,7 +77,7 @@ void AnalyzeScheduleBlockReadWriteBuffer(ir::ScheduleBlock* sche_block) {
bool
ContainsNodeType
(
ir
::
Expr
expr
,
const
std
::
unordered_set
<
ir
::
IrNodeTy
>&
node_types
)
{
std
::
set
<
ir
::
Expr
>
collection
=
ir
::
CollectIRNodesWithoutTensor
(
expr
,
[
&
](
const
Expr
*
x
)
{
ir
::
ir_utils
::
CollectIRNodesWithoutTensor
(
expr
,
[
&
](
const
Expr
*
x
)
{
return
node_types
.
find
(
x
->
node_type
())
!=
node_types
.
end
();
});
return
!
collection
.
empty
();
...
...
@@ -189,5 +190,40 @@ ir::LoweredFunc UpdateFuncWithNewBody(const common::Target& target,
return
new_func
;
}
/**
 * Collect the names of all variables that appear in the bindings of the
 * reduce axes of a schedule block.
 *
 * @param block An Expr that must hold an ir::ScheduleBlockRealize whose
 *   schedule_block holds an ir::ScheduleBlock; both are enforced with
 *   CHECK_NOTNULL.
 * @return The names of every variable found inside iter_values[i], for each
 *   index i whose iter_vars[i] is flagged as is_reduce_axis.
 */
std::unordered_set<std::string> GetReduceLoopVarNames(const ir::Expr block) {
  const ir::ScheduleBlockRealize* block_realize =
      block.As<ir::ScheduleBlockRealize>();
  CHECK_NOTNULL(block_realize);
  const ir::ScheduleBlock* block_node =
      block_realize->schedule_block.As<ir::ScheduleBlock>();
  CHECK_NOTNULL(block_node);

  // Both vectors are only read below, so bind them by const reference
  // instead of copying them on every call.
  const std::vector<ir::Expr>& iter_values = block_realize->iter_values;
  const std::vector<ir::Var>& iter_vars = block_node->iter_vars;

  std::unordered_set<std::string> reduce_loop_var;
  // Use an unsigned index so the comparison against size() does not mix
  // signed and unsigned operands.
  for (std::size_t i = 0; i < iter_vars.size(); ++i) {
    if (!iter_vars[i]->is_reduce_axis) {
      continue;
    }
    // The visitor is used only for its side effect of recording variable
    // names; it always returns false and the collector's return value is
    // discarded.
    ir::ir_utils::CollectIRNodesWithoutTensor(
        iter_values[i], [&](const ir::Expr* x) {
          if (x->as_var()) {
            reduce_loop_var.insert(x->as_var_ref()->name);
          }
          return false;
        });
  }
  return reduce_loop_var;
}
/**
 * Return the name of the ScheduleBlock wrapped by a ScheduleBlockRealize.
 *
 * @param block An Expr that must hold an ir::ScheduleBlockRealize whose
 *   schedule_block holds an ir::ScheduleBlock; both are enforced with
 *   CHECK_NOTNULL.
 * @return A copy of the ScheduleBlock's name.
 */
std::string GetBlockName(const ir::Expr block) {
  const auto* realize = block.As<ir::ScheduleBlockRealize>();
  CHECK_NOTNULL(realize);

  const auto* inner_block = realize->schedule_block.As<ir::ScheduleBlock>();
  CHECK_NOTNULL(inner_block);

  return inner_block->name;
}
}
// namespace auto_schedule
}
// namespace cinn
paddle/cinn/auto_schedule/analysis/analyze_ir.h
View file @
01a10755
...
...
@@ -48,5 +48,15 @@ ir::LoweredFunc UpdateFuncWithNewBody(const common::Target& target,
const
ir
::
LoweredFunc
&
old_func
,
ir
::
Expr
&
body
);
// NOLINT
/**
* Get the names of the loop variables bound to reduce axes.
*
* `block` must hold a ScheduleBlockRealize. The result contains the name of
* every variable that appears in an iter_value whose matching iter_var is
* flagged as a reduce axis.
*/
std
::
unordered_set
<
std
::
string
>
GetReduceLoopVarNames
(
const
ir
::
Expr
block
);
/**
* Get the name of a ScheduleBlock.
*
* `block` must hold a ScheduleBlockRealize; the name returned is that of the
* ScheduleBlock it wraps.
*/
std
::
string
GetBlockName
(
const
ir
::
Expr
block
);
}
// namespace auto_schedule
}
// namespace cinn
paddle/cinn/auto_schedule/analysis/analyze_ir_test.cc
View file @
01a10755
...
...
@@ -20,6 +20,7 @@
#include <sstream>
#include <vector>
#include "paddle/cinn/ast_gen_ius/tensor_group.h"
#include "paddle/cinn/common/context.h"
#include "paddle/cinn/ir/ir.h"
#include "paddle/cinn/ir/ir_base.h"
...
...
@@ -49,9 +50,9 @@ TEST(AnalyzeIr, AnalyzeScheduleBlockReadWriteBuffer_SimpleAssign) {
ir
::
Tensor
B
=
lang
::
Compute
(
{
M
,
N
},
[
&
](
Var
i
,
Var
j
)
{
return
A
(
i
,
j
);
},
"B"
);
poly
::
StageMap
st
a
ge
s
=
poly
::
CreateStages
({
A
,
B
});
std
::
vector
<
ir
::
LoweredFunc
>
funcs
=
lang
::
LowerVec
(
"SimpleAssign"
,
stages
,
{
A
,
B
},
{},
{},
nullptr
,
target
,
true
);
a
st
_
ge
n_ius
::
TensorGroup
tensor_group
({
A
,
B
});
std
::
vector
<
ir
::
LoweredFunc
>
funcs
=
lang
::
LowerToAstVec
(
"SimpleAssign"
,
{
A
,
B
},
&
tensor_group
,
target
);
ASSERT_FALSE
(
funcs
.
empty
());
ir
::
Expr
ast_expr
=
funcs
[
0
]
->
body
;
...
...
@@ -115,9 +116,9 @@ TEST(AnalyzeIr, AnalyzeScheduleBlockReadWriteBuffer_AddDiffShape) {
ir
::
Tensor
C
=
lang
::
Compute
(
{
M
,
N
},
[
&
](
Var
i
,
Var
j
)
{
return
A
(
i
)
+
B
(
j
);
},
"C"
);
poly
::
StageMap
st
a
ge
s
=
poly
::
CreateStages
({
C
});
std
::
vector
<
ir
::
LoweredFunc
>
funcs
=
lang
::
LowerVec
(
"AddDiffShape"
,
stages
,
{
C
},
{},
{},
nullptr
,
target
,
true
);
a
st
_
ge
n_ius
::
TensorGroup
tensor_group
({
C
});
std
::
vector
<
ir
::
LoweredFunc
>
funcs
=
lang
::
LowerToAstVec
(
"AddDiffShape"
,
{
C
},
&
tensor_group
,
target
);
ir
::
Expr
ast_expr
=
funcs
[
0
]
->
body
;
VLOG
(
6
)
<<
"Expr before MultiLevelTiling: "
;
...
...
@@ -169,9 +170,9 @@ TEST(AnalyzeIr, ContainsNodeType) {
ir
::
Tensor
B
=
lang
::
Compute
(
{
M
,
N
},
[
&
](
Var
i
,
Var
j
)
{
return
A
(
i
,
j
);
},
"B"
);
poly
::
StageMap
st
a
ge
s
=
poly
::
CreateStages
({
A
,
B
});
std
::
vector
<
ir
::
LoweredFunc
>
funcs
=
lang
::
LowerVec
(
"SimpleAssign"
,
stages
,
{
A
,
B
},
{},
{},
nullptr
,
target
,
true
);
a
st
_
ge
n_ius
::
TensorGroup
tensor_group
({
A
,
B
});
std
::
vector
<
ir
::
LoweredFunc
>
funcs
=
lang
::
LowerToAstVec
(
"SimpleAssign"
,
{
A
,
B
},
&
tensor_group
,
target
);
ASSERT_FALSE
(
funcs
.
empty
());
ir
::
Expr
ast_expr
=
funcs
[
0
]
->
body
;
...
...
paddle/cinn/auto_schedule/auto_tuner.cc
View file @
01a10755
...
...
@@ -63,8 +63,8 @@ void AutoTuner::Initialize(const Config& config,
const
auto
&
shape_dict
=
graph_
->
GetAttrs
<
absl
::
flat_hash_map
<
std
::
string
,
hlir
::
framework
::
shape_t
>>
(
"infershape"
);
op_lowerer_
=
std
::
make_unique
<
hlir
::
framework
::
OpLowerer
>
(
dtype_dict
,
shape_dict
,
target_
);
op_lowerer_
=
std
::
make_unique
<
hlir
::
framework
::
OpLowerer
<
GroupPtr
>
>
(
new
hlir
::
framework
::
OpLowererImpl
(
dtype_dict
,
shape_dict
,
target_
)
)
;
InitialTaskRegistry
*
task_registry
=
InitialTaskRegistry
::
Global
();
for
(
auto
i
=
0
;
i
<
tasks_
.
size
();
++
i
)
{
auto
&&
task
=
tasks_
[
i
];
...
...
paddle/cinn/auto_schedule/auto_tuner.h
View file @
01a10755
...
...
@@ -30,11 +30,11 @@
namespace
cinn
{
namespace
auto_schedule
{
// This class is entrance of auto-tune, users can use it
// to tune graph (not supported yet) and search a series of schedules
// that maybe more likely to obtain better performance.
// Internally, it creates necessary components and use them to perform tuning.
using
GroupPtr
=
hlir
::
framework
::
GroupPtr
;
class
AutoTuner
{
public:
// configure how to perform auto-tune, such as
...
...
@@ -58,7 +58,7 @@ class AutoTuner {
private:
const
common
::
Target
&
target_
;
hlir
::
framework
::
Graph
*
graph_
;
std
::
unique_ptr
<
hlir
::
framework
::
OpLowerer
>
op_lowerer_
;
std
::
unique_ptr
<
hlir
::
framework
::
OpLowerer
<
GroupPtr
>
>
op_lowerer_
;
// Tasks to tune
std
::
vector
<
TuneTask
>
tasks_
;
...
...
paddle/cinn/auto_schedule/auto_tuner_test.cc
View file @
01a10755
...
...
@@ -26,17 +26,19 @@
#include "paddle/cinn/frontend/syntax.h"
#include "paddle/cinn/hlir/framework/graph.h"
#include "paddle/cinn/hlir/framework/graph_compiler.h"
#include "paddle/cinn/hlir/framework/graph_compiler_util.h"
#include "paddle/cinn/hlir/framework/node.h"
#include "paddle/cinn/hlir/framework/pass.h"
#include "paddle/cinn/ir/ir_base.h"
#include "paddle/cinn/runtime/flags.h"
DECLARE_bool
(
auto_schedule_use_cost_model
);
PD_
DECLARE_bool
(
auto_schedule_use_cost_model
);
namespace
cinn
{
namespace
auto_schedule
{
using
::
cinn
::
hlir
::
framework
::
BuildScope
;
using
::
cinn
::
hlir
::
framework
::
CompilationContext
;
using
::
cinn
::
hlir
::
framework
::
Graph
;
using
::
cinn
::
hlir
::
framework
::
GraphCompiler
;
using
::
cinn
::
hlir
::
framework
::
Instruction
;
...
...
@@ -53,6 +55,7 @@ class TestAutoTuner : public ::testing::Test {
std
::
shared_ptr
<
Graph
>
graph
;
std
::
shared_ptr
<
Scope
>
compiled_scope
;
CompilationContext
context
;
std
::
unique_ptr
<
GraphCompiler
>
graph_compiler
;
std
::
unique_ptr
<
AutoTuner
>
tuner
;
...
...
@@ -73,8 +76,10 @@ class TestAutoTuner : public ::testing::Test {
auto
program
=
CreateAddReluProgram
();
auto
graph
=
cinn
::
frontend
::
Optimize
(
&
program
,
fetch_ids
,
target
);
compiled_scope
=
BuildScope
(
target
,
graph
);
graph_compiler
=
std
::
make_unique
<
GraphCompiler
>
(
target
,
compiled_scope
,
graph
);
context
.
graph
=
graph
;
context
.
scope
=
compiled_scope
;
context
.
target
=
target
;
graph_compiler
=
std
::
make_unique
<
GraphCompiler
>
(
context
);
tuner
=
std
::
make_unique
<
AutoTuner
>
(
target
,
graph
.
get
());
}
...
...
@@ -99,16 +104,14 @@ class TestAutoTuner : public ::testing::Test {
virtual
void
ApplyTunedAndRun
(
const
TuningResult
&
result
)
{
// build runtime program with tuning result
GraphCompiler
::
CompileOptions
compile_options
;
compile_options
.
with_instantiate_variables
=
true
;
compile_options
.
Apply
(
result
);
ASSERT_EQ
(
1
,
compile_options
.
groups
.
size
());
ASSERT_EQ
(
1
,
compile_options
.
lowered_funcs
.
size
());
context
.
with_instantiate_variables
=
true
;
context
.
ApplyTuningResult
(
result
);
ASSERT_EQ
(
1
,
context
.
groups
.
size
());
ASSERT_EQ
(
1
,
context
.
lowered_funcs
.
size
());
VLOG
(
6
)
<<
"Print lowered_funcs before building"
;
VLOG
(
6
)
<<
compile_options
.
lowered_funcs
[
0
][
0
];
VLOG
(
6
)
<<
compile_options
.
lowered_funcs
[
1
][
0
];
auto
runtime_program
=
graph_compiler
->
Build
(
compile_options
).
runtime_program
;
VLOG
(
6
)
<<
context
.
lowered_funcs
[
0
][
0
];
VLOG
(
6
)
<<
context
.
lowered_funcs
[
1
][
0
];
auto
runtime_program
=
graph_compiler
->
Build
(
&
context
).
runtime_program
;
ASSERT_EQ
(
1
,
runtime_program
->
size
());
runtime_program
->
Execute
();
}
...
...
paddle/cinn/auto_schedule/cost_model/CMakeLists.txt
View file @
01a10755
...
...
@@ -3,7 +3,8 @@ core_gather_headers()
gather_srcs
(
cinnapi_src SRCS xgb_cost_model.cc expr_cost_model.cc feature.cc
feature_extractor.cc
)
cinn_cc_test
(
test_xgb_cost_model SRCS xgb_cost_model_test.cc DEPS cinncore
)
# TODO(zhhsplendid): enable this test again
#cinn_cc_test(test_xgb_cost_model SRCS xgb_cost_model_test.cc DEPS cinncore)
cinn_cc_test
(
test_feature_extractor SRCS feature_extractor_test.cc DEPS
cinncore
)
cinn_cc_test
(
test_feature SRCS feature_test.cc DEPS cinncore
)
paddle/cinn/auto_schedule/cost_model/feature_extractor.cc
View file @
01a10755
...
...
@@ -33,9 +33,9 @@
#include "paddle/cinn/common/type.h"
#include "paddle/cinn/ir/ir.h"
#include "paddle/cinn/ir/ir_base.h"
#include "paddle/cinn/ir/ir_printer.h"
#include "paddle/cinn/ir/schedule/ir_schedule.h"
#include "paddle/cinn/ir/utils/ir_copy.h"
#include "paddle/cinn/ir/utils/ir_printer.h"
#include "paddle/cinn/optim/transform_polyfor_to_for.h"
namespace
cinn
{
...
...
@@ -82,6 +82,7 @@ VisitDoNothing(ScheduleBlockRealize);
VisitDoNothing
(
Ramp
);
VisitDoNothing
(
_Buffer_
);
VisitDoNothing
(
_BufferRange_
);
VisitDoNothing
(
_Dim_
);
#define NotVisitExprFields(NodeType) \
void FeatureExtractor::Visit(const NodeType *x) {}
...
...
@@ -218,7 +219,7 @@ void FeatureExtractor::Visit(const For *x) {
}
void
FeatureExtractor
::
Visit
(
const
PolyFor
*
x
)
{
Expr
copy
=
optim
::
IRCopy
(
Expr
(
x
));
Expr
copy
=
ir
::
ir_utils
::
IRCopy
(
Expr
(
x
));
feature_
.
IntoLoopBlock
();
optim
::
TransformPolyForToFor
(
&
copy
);
ir
::
For
*
loop
=
copy
.
As
<
For
>
();
...
...
paddle/cinn/auto_schedule/cost_model/feature_extractor.h
View file @
01a10755
...
...
@@ -31,8 +31,8 @@
#include "paddle/cinn/common/target.h"
#include "paddle/cinn/ir/ir.h"
#include "paddle/cinn/ir/ir_base.h"
#include "paddle/cinn/ir/ir_visitor.h"
#include "paddle/cinn/ir/schedule/ir_schedule.h"
#include "paddle/cinn/ir/utils/ir_visitor.h"
namespace
cinn
{
namespace
auto_schedule
{
...
...
paddle/cinn/auto_schedule/cost_model/feature_extractor_test.cc
View file @
01a10755
...
...
@@ -21,6 +21,7 @@
#include <unordered_set>
#include <vector>
#include "paddle/cinn/ast_gen_ius/tensor_group.h"
#include "paddle/cinn/common/context.h"
#include "paddle/cinn/ir/ir.h"
#include "paddle/cinn/ir/ir_base.h"
...
...
@@ -48,9 +49,9 @@ TEST(FeatureExtractor, SimpleAssign) {
ir
::
Tensor
B
=
lang
::
Compute
(
{
M
,
N
},
[
&
](
Var
i
,
Var
j
)
{
return
A
(
i
,
j
);
},
"B"
);
poly
::
StageMap
st
a
ge
s
=
poly
::
CreateStages
({
A
,
B
});
std
::
vector
<
ir
::
LoweredFunc
>
funcs
=
lang
::
LowerVec
(
"SimpleAssign"
,
stages
,
{
A
,
B
},
{},
{},
nullptr
,
target
,
true
);
a
st
_
ge
n_ius
::
TensorGroup
tensor_group
({
A
,
B
});
std
::
vector
<
ir
::
LoweredFunc
>
funcs
=
lang
::
LowerToAstVec
(
"SimpleAssign"
,
{
A
,
B
},
&
tensor_group
,
target
);
ir
::
Expr
ast_expr
=
funcs
[
0
]
->
body
;
VLOG
(
6
)
<<
"Expr to test: "
<<
ast_expr
;
...
...
@@ -88,6 +89,7 @@ TEST(FeatureExtractor, SimpleAssign) {
ASSERT_EQ
(
to_check
[
29
],
slog
(
3
));
}
#ifdef CINN_WITH_CUDA
TEST
(
FeatureExtractor
,
MatrixMultiply
)
{
Context
::
Global
().
ResetNameId
();
#ifdef CINN_WITH_CUDA
...
...
@@ -109,9 +111,9 @@ TEST(FeatureExtractor, MatrixMultiply) {
[
&
](
Var
i
,
Var
j
)
{
return
lang
::
ReduceSum
(
A
(
i
,
k
)
*
B
(
k
,
j
),
{
k
});
},
"C"
);
poly
::
StageMap
st
a
ge
s
=
poly
::
CreateStages
({
C
});
std
::
vector
<
ir
::
LoweredFunc
>
funcs
=
lang
::
LowerVec
(
"MatrixMultiply"
,
stages
,
{
C
},
{},
{},
nullptr
,
target
,
true
);
a
st
_
ge
n_ius
::
TensorGroup
tensor_group
({
C
});
std
::
vector
<
ir
::
LoweredFunc
>
funcs
=
lang
::
LowerToAstVec
(
"SimpleAssign"
,
{
C
},
&
tensor_group
,
target
);
std
::
vector
<
Expr
>
vec_ast
{
funcs
[
0
]
->
body
};
ir
::
ModuleExpr
mod_expr
(
vec_ast
);
...
...
@@ -161,6 +163,6 @@ TEST(FeatureExtractor, MatrixMultiply) {
// GpuBind loop
ASSERT_EQ
(
to_check
[
37
],
slog
(
out_loop
));
}
#endif
}
// namespace auto_schedule
}
// namespace cinn
paddle/cinn/auto_schedule/database/jsonfile_database_test.cc
View file @
01a10755
...
...
@@ -20,12 +20,13 @@
#include <fstream>
#include <vector>
#include "paddle/cinn/ast_gen_ius/tensor_group.h"
#include "paddle/cinn/auto_schedule/search_space/search_state.h"
#include "paddle/cinn/auto_schedule/task/task_registry.h"
#include "paddle/cinn/cinn.h"
#include "paddle/cinn/ir/ir_printer.h"
#include "paddle/cinn/ir/schedule/ir_schedule.h"
#include "paddle/cinn/ir/utils/ir_copy.h"
#include "paddle/cinn/ir/utils/ir_printer.h"
namespace
cinn
{
namespace
auto_schedule
{
...
...
@@ -47,8 +48,8 @@ std::vector<ir::LoweredFunc> LowerCompute(const std::vector<int>& shape,
C
=
Compute
(
domain
,
[
&
B
](
Var
i
,
Var
j
)
{
return
B
(
i
,
j
);
},
"C"
);
return
cinn
::
lang
::
LowerVec
(
"test_func"
,
CreateStages
({
A
,
B
}),
{
A
,
B
},
{},
{},
nullptr
,
target
,
true
);
ast_gen_ius
::
TensorGroup
tensor_group
({
A
,
B
});
return
cinn
::
lang
::
LowerToAstVec
(
"test_func"
,
{
A
,
B
},
&
tensor_group
,
target
);
}
// Create a new IRSchedule with copied ir::LoweredFunc AST
...
...
@@ -56,7 +57,7 @@ ir::IRSchedule MakeIRSchedule(const std::vector<ir::LoweredFunc>& lowered_funcs,
const
std
::
string
&
task_key
)
{
std
::
vector
<
Expr
>
exprs
;
for
(
auto
&&
func
:
lowered_funcs
)
{
exprs
.
emplace_back
(
optim
::
IRCopy
(
func
->
body
));
exprs
.
emplace_back
(
ir
::
ir_utils
::
IRCopy
(
func
->
body
));
}
InitialTaskRegistry
*
task_registry
=
InitialTaskRegistry
::
Global
();
task_registry
->
Regist
(
task_key
,
ir
::
ModuleExpr
(
exprs
));
...
...
paddle/cinn/auto_schedule/measure/measurer_test.cc
View file @
01a10755
...
...
@@ -25,12 +25,15 @@
#include "paddle/cinn/frontend/optimize.h"
#include "paddle/cinn/frontend/syntax.h"
#include "paddle/cinn/hlir/framework/graph_compiler.h"
#include "paddle/cinn/hlir/framework/graph_compiler_util.h"
#include "paddle/cinn/hlir/framework/op_lowering.h"
#include "paddle/cinn/runtime/flags.h"
namespace
cinn
{
namespace
auto_schedule
{
using
::
cinn
::
hlir
::
framework
::
BuildScope
;
using
::
cinn
::
hlir
::
framework
::
CompilationContext
;
using
::
cinn
::
hlir
::
framework
::
Graph
;
using
::
cinn
::
hlir
::
framework
::
GraphCompiler
;
...
...
@@ -62,7 +65,8 @@ class TestMeasurer : public ::testing::Test {
auto
program
=
CreateAddReluProgram
();
auto
graph
=
cinn
::
frontend
::
Optimize
(
&
program
,
fetch_ids
,
target
);
auto
scope
=
BuildScope
(
target
,
graph
);
graph_compiler
=
std
::
make_unique
<
GraphCompiler
>
(
target
,
scope
,
graph
);
CompilationContext
context
(
graph
,
scope
,
target
);
graph_compiler
=
std
::
make_unique
<
GraphCompiler
>
(
context
);
TaskCreator
task_creator
;
tasks
=
task_creator
.
CreateTuneTaskOpLevel
(
graph
.
get
());
const
auto
&
dtype_dict
=
...
...
@@ -72,12 +76,12 @@ class TestMeasurer : public ::testing::Test {
absl
::
flat_hash_map
<
std
::
string
,
hlir
::
framework
::
shape_t
>>
(
"infershape"
);
auto
op_lowerer
=
std
::
make_unique
<
hlir
::
framework
::
OpLowerer
>
(
dtype_dict
,
shape_dict
,
target
);
auto
op_lowerer
=
hlir
::
framework
::
CreateOpLowerer
(
dtype_dict
,
shape_dict
,
target
);
inputs
.
reserve
(
tasks
.
size
());
for
(
int
i
=
0
;
i
<
tasks
.
size
();
++
i
)
{
auto
*
task
=
&
tasks
[
i
];
task
->
Initialize
(
shape_dict
,
dtype_dict
,
op_lowerer
.
get
()
);
task
->
Initialize
(
shape_dict
,
dtype_dict
,
&
op_lowerer
);
MeasureInput
input
;
input
.
task
=
task
;
input
.
lowered_funcs
=
task
->
lowered_funcs
;
...
...
paddle/cinn/auto_schedule/measure/simple_builder.cc
View file @
01a10755
...
...
@@ -17,6 +17,8 @@
namespace
cinn
{
namespace
auto_schedule
{
using
hlir
::
framework
::
CompilationContext
;
using
hlir
::
framework
::
CompilationResult
;
using
hlir
::
framework
::
GraphCompiler
;
SimpleBuilder
::
SimpleBuilder
(
hlir
::
framework
::
GraphCompiler
*
graph_compiler
)
...
...
@@ -25,19 +27,18 @@ SimpleBuilder::SimpleBuilder(hlir::framework::GraphCompiler* graph_compiler)
BuildResult
SimpleBuilder
::
Build
(
const
MeasureInput
&
input
)
{
CHECK_NE
(
graph_compiler_
,
static_cast
<
GraphCompiler
*>
(
nullptr
))
<<
"empty handle to GraphCompiler"
;
Graph
Compil
er
::
CompileOptions
compile_options
;
co
mpile_options
.
groups
.
emplace_back
(
input
.
task
->
subgraph
);
co
mpile_options
.
lowered_funcs
.
emplace_back
(
input
.
lowered_funcs
);
co
mpile_options
.
remove_unused_variables
=
false
;
Compil
ationContext
&
context
=
graph_compiler_
->
GetCompilationContext
()
;
co
ntext
.
groups
.
emplace_back
(
input
.
task
->
subgraph
);
co
ntext
.
lowered_funcs
.
emplace_back
(
input
.
lowered_funcs
);
co
ntext
.
remove_unused_variables
=
false
;
VLOG
(
5
)
<<
"call GraphCompiler to Build with Graph::Group size="
<<
compile_options
.
groups
.
size
()
<<
", lowered_funcs group size="
<<
compile_options
.
lowered_funcs
.
size
();
GraphCompiler
::
CompilationResult
compiled_result
=
graph_compiler_
->
Build
(
compile_options
);
<<
context
.
groups
.
size
()
<<
", lowered_funcs group size="
<<
context
.
lowered_funcs
.
size
();
CompilationResult
compiled_result
=
graph_compiler_
->
Build
(
&
context
);
BuildResult
build_result
;
build_result
.
compiled_scope
=
graph_compiler_
->
GetScope
().
get
();
build_result
.
runtime_program
=
std
::
move
(
compiled_result
.
r
untime
_p
rogram
);
build_result
.
runtime_program
=
std
::
move
(
compiled_result
.
R
untime
P
rogram
()
);
return
build_result
;
}
...
...
paddle/cinn/auto_schedule/measure/simple_builder.h
View file @
01a10755
...
...
@@ -16,6 +16,7 @@
#include "paddle/cinn/auto_schedule/measure/measure.h"
#include "paddle/cinn/hlir/framework/graph_compiler.h"
#include "paddle/cinn/hlir/framework/graph_compiler_util.h"
namespace
cinn
{
namespace
auto_schedule
{
...
...
paddle/cinn/auto_schedule/measure/simple_runner_test.cc
View file @
01a10755
...
...
@@ -25,11 +25,13 @@
#include "paddle/cinn/frontend/optimize.h"
#include "paddle/cinn/frontend/syntax.h"
#include "paddle/cinn/hlir/framework/graph_compiler.h"
#include "paddle/cinn/hlir/framework/graph_compiler_util.h"
namespace
cinn
{
namespace
auto_schedule
{
using
::
cinn
::
hlir
::
framework
::
BuildScope
;
using
::
cinn
::
hlir
::
framework
::
CompilationContext
;
using
::
cinn
::
hlir
::
framework
::
Graph
;
using
::
cinn
::
hlir
::
framework
::
GraphCompiler
;
using
::
cinn
::
hlir
::
framework
::
Instruction
;
...
...
@@ -56,8 +58,8 @@ class TestSimpleRunner : public ::testing::Test {
auto
program
=
CreateAddReluProgram
();
auto
graph
=
cinn
::
frontend
::
Optimize
(
&
program
,
fetch_ids
,
target
);
compiled_scope
=
BuildScope
(
target
,
graph
);
graph
_
compile
r
=
std
::
make_unique
<
GraphCompiler
>
(
target
,
compiled_scope
,
graph
);
CompilationContext
context
(
graph
,
compile
d_scope
,
target
);
graph_compiler
=
std
::
make_unique
<
GraphCompiler
>
(
context
);
auto
runtime_program
=
graph_compiler
->
Build
();
const
auto
&
instructions
=
runtime_program
->
GetRunInstructions
();
ASSERT_EQ
(
1
,
instructions
.
size
());
...
...
@@ -123,8 +125,8 @@ TEST_F(TestSimpleRunner, TimeMeasured) {
"sleep_fn"
));
instructions
.
back
()
->
SetLoweredFunc
(
reinterpret_cast
<
void
*>
(
sleep_fn
));
instructions
.
back
()
->
Finalize
();
build_result
.
runtime_program
.
reset
(
new
hlir
::
framework
::
Program
(
nullptr
,
std
::
move
(
instructions
))
)
;
build_result
.
runtime_program
=
std
::
make_unique
<
hlir
::
framework
::
Program
>
(
nullptr
,
std
::
move
(
instructions
));
// to skip the condition check of params in Instruction::PreparePodArgs
std
::
map
<
std
::
string
,
cinn_pod_value_t
>
preset_args
;
...
...
paddle/cinn/auto_schedule/post_schedule_rule/cooperative_process.cc
View file @
01a10755
...
...
@@ -15,9 +15,9 @@
#include "paddle/cinn/auto_schedule/post_schedule_rule/cooperative_process.h"
#include "paddle/cinn/ir/ir.h"
#include "paddle/cinn/ir/ir_printer.h"
#include "paddle/cinn/ir/schedule/ir_schedule.h"
#include "paddle/cinn/ir/schedule/schedule_desc.h"
#include "paddle/cinn/ir/utils/ir_printer.h"
namespace
cinn
{
namespace
auto_schedule
{
...
...
paddle/cinn/auto_schedule/post_schedule_rule/cooperative_process_test.cc
View file @
01a10755
...
...
@@ -17,7 +17,7 @@
#include <gtest/gtest.h>
#include "paddle/cinn/auto_schedule/search_space/auto_gen_rule/test_helper.h"
#include "paddle/cinn/ir/
utils/
ir_printer.h"
#include "paddle/cinn/ir/ir_printer.h"
#include "test/cpp/cinn/program_builder.h"
namespace
cinn
{
...
...
@@ -129,7 +129,7 @@ TEST_F(TestCooperativeProcess, Matmul) {
{
i0, i1 = axis.bind(((16 * i) + ((2 * i_0) + i_1)), ((16 * j) + ((8 * j_0) + j_1)))
{
temp_matmul_out__reduce_init[
((16 * i) + ((2 * i_0) + i_1)), ((16 * j) + ((8 * j_0) + j_1))
] = 0.00000000f
temp_matmul_out__reduce_init[
i0, i1
] = 0.00000000f
}
}
}
...
...
@@ -181,7 +181,7 @@ TEST_F(TestCooperativeProcess, Matmul) {
{
i0_0, i1_0, i2 = axis.bind(((2 * (i_0_j_0_fused / 2)) + ((16 * (i_j_fused / 2)) + i_1)), ((8 * (i_0_j_0_fused % 2)) + ((16 * (i_j_fused % 2)) + j_1)), ((4 * reduce_k_0) + reduce_k_1))
{
temp_matmul_out[
((2 * (i_0_j_0_fused / 2)) + ((16 * (i_j_fused / 2)) + i_1)), ((8 * (i_0_j_0_fused % 2)) + ((16 * (i_j_fused % 2)) + j_1))] = (temp_matmul_out[((2 * (i_0_j_0_fused / 2)) + ((16 * (i_j_fused / 2)) + i_1)), ((8 * (i_0_j_0_fused % 2)) + ((16 * (i_j_fused % 2)) + j_1))] + (X_reshape_shared_temp_buffer[((2 * (i_0_j_0_fused / 2)) + ((16 * (i_j_fused / 2)) + i_1)), ((4 * reduce_k_0) + reduce_k_1)] * Y_reshape_shared_temp_buffer[((4 * reduce_k_0) + reduce_k_1), ((8 * (i_0_j_0_fused % 2)) + ((16 * (i_j_fused % 2)) + j_1))
]))
temp_matmul_out[
i0_0, i1_0] = (temp_matmul_out[i0_0, i1_0] + (X_reshape_shared_temp_buffer[i0_0, i2] * Y_reshape_shared_temp_buffer[i2, i1_0
]))
}
}
}
...
...
paddle/cinn/auto_schedule/search_space/auto_gen_rule/CMakeLists.txt
View file @
01a10755
...
...
@@ -8,7 +8,8 @@ gather_srcs(
auto_unroll.cc
multi_level_tiling.cc
skip_rule.cc
auto_bind.cc
)
auto_bind.cc
reduction_factoring.cc
)
if
(
WITH_TESTING
)
cinn_cc_library
(
...
...
@@ -51,3 +52,11 @@ endif()
#cinn_cc_test(test_auto_inline SRCS auto_inline_test.cc DEPS cinncore auto_gen_rule_test_helper)
cinn_cc_test
(
test_skip_rule SRCS skip_rule_test.cc DEPS cinncore
)
cinn_cc_test
(
test_auto_unroll SRCS auto_unroll_test.cc DEPS cinncore
)
cinn_cc_test
(
test_reduction_factoring
SRCS
reduction_factoring_test.cc
DEPS
cinncore
auto_gen_rule_test_helper
test_program_builder
)
paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.cc
View file @
01a10755
...
...
@@ -16,10 +16,11 @@
#include <glog/logging.h>
#include "paddle/cinn/ir/ir_printer.h"
#include "paddle/cinn/ir/schedule/ir_schedule.h"
#include "paddle/cinn/ir/schedule_block_graph.h"
#include "paddle/cinn/ir/utils/ir_copy.h"
#include "paddle/cinn/ir/utils/ir_nodes_collector.h"
#include "paddle/cinn/ir/utils/ir_printer.h"
namespace
cinn
{
namespace
auto_schedule
{
...
...
@@ -31,7 +32,7 @@ bool IsSpatialLoop(const ir::For* for_node) {
const
auto
&
loop_var
=
for_node
->
loop_var
;
// collect cases where the loop_var used in one of reduce axis in underneath
// ScheduleBlock
auto
used_for_reduce_axis
=
ir
::
CollectIRNodesWithoutTensor
(
auto
used_for_reduce_axis
=
ir
::
ir_utils
::
CollectIRNodesWithoutTensor
(
for_node
->
body
,
[
&
loop_var
](
const
Expr
*
x
)
{
const
auto
*
block_realize
=
x
->
As
<
ir
::
ScheduleBlockRealize
>
();
if
(
!
block_realize
)
return
false
;
...
...
@@ -46,7 +47,7 @@ bool IsSpatialLoop(const ir::For* for_node) {
const
ir
::
Expr
&
binding
=
block_realize
->
iter_values
[
i
];
if
(
iter_var
->
is_reduce_axis
||
iter_var
->
name
.
substr
(
0
,
6
)
==
"reduce"
)
{
auto
used_exprs
=
ir
::
CollectIRNodesWithoutTensor
(
auto
used_exprs
=
ir
::
ir_utils
::
CollectIRNodesWithoutTensor
(
binding
,
[
&
loop_var
](
const
Expr
*
x
)
{
const
ir
::
_Var_
*
var
=
x
->
As
<
ir
::
_Var_
>
();
if
(
var
&&
...
...
@@ -94,6 +95,8 @@ void BindGPUIndex(ir::IRSchedule* ir_schedule,
auto
all_loops
=
ir_schedule
->
GetLoops
(
block_name
);
CHECK_LE
(
num_loops_to_bind
,
all_loops
.
size
())
<<
"The number of loops to be bind is greater than size of all_loops"
;
CHECK_GE
(
num_loops_to_bind
,
0
)
<<
"The number of loops to be bind should be greater than 0"
;
// check whether it is the case that threadIdx has been binded but blockIdx
// not, the threadIdx can only be binded in the first loop after
// num_loops_to_bind loops because we has excluded other cases in
...
...
@@ -101,6 +104,17 @@ void BindGPUIndex(ir::IRSchedule* ir_schedule,
bool
gpu_thread_has_binded
=
num_loops_to_bind
<
all_loops
.
size
()
&&
all_loops
[
num_loops_to_bind
].
As
<
ir
::
For
>
()
->
is_gpu_thread_binded
();
ir
::
BlockOrderConstructor
block_order_constructor
;
std
::
map
<
std
::
vector
<
int
>
,
ir
::
Expr
>
blocks_order_with_ctrl_stmt
=
block_order_constructor
(
&
all_loops
[
num_loops_to_bind
-
1
]);
for
(
auto
&
pair
:
blocks_order_with_ctrl_stmt
)
{
if
(
pair
.
first
.
size
()
==
2
)
{
ir
::
Expr
stmt
=
pair
.
second
;
if
(
stmt
.
As
<
ir
::
For
>
()
&&
stmt
.
As
<
ir
::
For
>
()
->
is_gpu_thread_binded
())
{
gpu_thread_has_binded
=
true
;
}
}
}
Expr
fused_loop
=
ir_schedule
->
Fuse
(
{
all_loops
.
begin
(),
all_loops
.
begin
()
+
num_loops_to_bind
});
int32_t
extent
=
fused_loop
.
As
<
ir
::
For
>
()
->
extent
.
as_int32
();
...
...
@@ -181,5 +195,18 @@ std::vector<SearchState> AutoBind::ApplyOnBlock(SearchState state,
return
{
new_state
};
}
/**
 * Bind the outer loops of a block to GPU indices directly on an IRSchedule.
 *
 * Counts how many loops, starting from the outermost loop of `block_name`,
 * can be bound, and calls BindGPUIndex when that count is positive.
 *
 * @param ir_schedule The schedule to modify in place.
 * @param block_name  Name of the schedule block whose loops are considered.
 */
void AutoBind::Apply(ir::IRSchedule* ir_schedule,
                     const std::string& block_name) {
  auto all_loops = ir_schedule->GetLoops(block_name);
  // A block without any enclosing loop has nothing to bind; indexing [0] on
  // an empty vector would be undefined behaviour.
  if (all_loops.empty()) {
    return;
  }

  int num_loop_can_bind = CountLoopCanBinded(all_loops[0].As<ir::For>());
  if (num_loop_can_bind > 0) {
    BindGPUIndex(ir_schedule,
                 block_name,
                 num_loop_can_bind,
                 kMaxBlocks,
                 target_->max_num_threads());
  }
}
}
// namespace auto_schedule
}
// namespace cinn
paddle/cinn/auto_schedule/search_space/auto_gen_rule/auto_bind.h
View file @
01a10755
...
...
@@ -42,6 +42,8 @@ class AutoBind : public AutoGenRule {
std
::
vector
<
SearchState
>
ApplyOnBlock
(
SearchState
state
,
const
std
::
string
&
block_name
)
override
;
void
Apply
(
ir
::
IRSchedule
*
ir_schedule
,
const
std
::
string
&
block_name
);
private:
std
::
vector
<
Expr
>
applicable_schedule_blocks_
;
};
...
...
Prev
1
…
4
5
6
7
8
9
10
11
12
…
29
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment