Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
Paddle
Commits
992bec46
Commit
992bec46
authored
Oct 08, 2023
by
“yuguo”
Browse files
2.5
parent
0259837d
Changes
357
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
2224 additions
and
0 deletions
+2224
-0
paddle/cinn/auto_schedule/task/task_creator_test.cc
paddle/cinn/auto_schedule/task/task_creator_test.cc
+72
-0
paddle/cinn/auto_schedule/task/task_optimizer.cc
paddle/cinn/auto_schedule/task/task_optimizer.cc
+470
-0
paddle/cinn/auto_schedule/task/task_optimizer.h
paddle/cinn/auto_schedule/task/task_optimizer.h
+73
-0
paddle/cinn/auto_schedule/task/task_registry.h
paddle/cinn/auto_schedule/task/task_registry.h
+87
-0
paddle/cinn/auto_schedule/task/task_registry_test.cc
paddle/cinn/auto_schedule/task/task_registry_test.cc
+111
-0
paddle/cinn/auto_schedule/task/tune_task.cc
paddle/cinn/auto_schedule/task/tune_task.cc
+112
-0
paddle/cinn/auto_schedule/task/tune_task.h
paddle/cinn/auto_schedule/task/tune_task.h
+74
-0
paddle/cinn/auto_schedule/task/tune_task_test.cc
paddle/cinn/auto_schedule/task/tune_task_test.cc
+344
-0
paddle/cinn/auto_schedule/task_scheduler/CMakeLists.txt
paddle/cinn/auto_schedule/task_scheduler/CMakeLists.txt
+6
-0
paddle/cinn/auto_schedule/task_scheduler/efficiency_priority.cc
.../cinn/auto_schedule/task_scheduler/efficiency_priority.cc
+35
-0
paddle/cinn/auto_schedule/task_scheduler/efficiency_priority.h
...e/cinn/auto_schedule/task_scheduler/efficiency_priority.h
+40
-0
paddle/cinn/auto_schedule/task_scheduler/round_robin.cc
paddle/cinn/auto_schedule/task_scheduler/round_robin.cc
+28
-0
paddle/cinn/auto_schedule/task_scheduler/round_robin.h
paddle/cinn/auto_schedule/task_scheduler/round_robin.h
+37
-0
paddle/cinn/auto_schedule/task_scheduler/task_scheduler.cc
paddle/cinn/auto_schedule/task_scheduler/task_scheduler.cc
+48
-0
paddle/cinn/auto_schedule/task_scheduler/task_scheduler.h
paddle/cinn/auto_schedule/task_scheduler/task_scheduler.h
+68
-0
paddle/cinn/auto_schedule/task_scheduler/task_scheduler_test.cc
.../cinn/auto_schedule/task_scheduler/task_scheduler_test.cc
+58
-0
paddle/cinn/auto_schedule/tests/CMakeLists.txt
paddle/cinn/auto_schedule/tests/CMakeLists.txt
+11
-0
paddle/cinn/auto_schedule/tests/performance_comparison_test.cc
...e/cinn/auto_schedule/tests/performance_comparison_test.cc
+365
-0
paddle/cinn/auto_schedule/tuning.h
paddle/cinn/auto_schedule/tuning.h
+91
-0
paddle/cinn/backends/CMakeLists.txt
paddle/cinn/backends/CMakeLists.txt
+94
-0
No files found.
Too many changes to show.
To preserve performance only
357 of 357+
files are displayed.
Plain diff
Email patch
paddle/cinn/auto_schedule/task/task_creator_test.cc
0 → 100644
View file @
992bec46
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/auto_schedule/task/task_creator.h"
#include <gtest/gtest.h>
#include <memory>
#include <vector>
#include "paddle/cinn/common/target.h"
#include "paddle/cinn/frontend/net_builder.h"
#include "paddle/cinn/frontend/syntax.h"
#include "paddle/cinn/hlir/framework/graph.h"
#include "paddle/cinn/hlir/framework/graph_compiler.h"
#include "paddle/cinn/hlir/framework/node.h"
namespace
cinn
{
namespace
auto_schedule
{
using
::
cinn
::
frontend
::
NetBuilder
;
using
::
cinn
::
frontend
::
Program
;
using
::
cinn
::
hlir
::
framework
::
Graph
;
using
::
cinn
::
hlir
::
framework
::
Node
;
// Builds a small frontend program used by the test below: two Float(32)
// inputs "A" and "B" of shape {32, 24}, followed by two chained additions
// (A + B, then A + (A + B)).
Program CreateAddProgram() {
  constexpr int kRows = 32;
  constexpr int kCols = 24;

  NetBuilder net("net_builder");
  auto lhs = net.CreateInput(Float(32), {kRows, kCols}, "A");
  auto rhs = net.CreateInput(Float(32), {kRows, kCols}, "B");
  auto first_sum = net.Add(lhs, rhs);
  // The handle of the second add is not needed afterwards; the call is kept
  // so that the op is still recorded in the builder.
  auto second_sum = net.Add(lhs, first_sum);
  (void)second_sum;

  return net.Build();
}
// Checks that op-level task creation splits the two-add program into two
// tasks, each holding exactly one "elementwise_add" node.
TEST(TaskCreator, Basic) {
#ifdef CINN_WITH_CUDA
  Target target = common::DefaultNVGPUTarget();
#else
  Target target = common::DefaultHostTarget();
#endif
  Program program = CreateAddProgram();
  auto graph = std::make_shared<Graph>(program, target);

  TaskCreator creator;
  std::vector<TuneTask> tune_tasks = creator.CreateTuneTaskOpLevel(graph.get());
  ASSERT_EQ(tune_tasks.size(), 2UL);

  for (TuneTask& tune_task : tune_tasks) {
    std::shared_ptr<Graph::Group> group = tune_task.subgraph;
    ASSERT_EQ(group->CollectNodes().size(), 1UL);
    ASSERT_EQ(group->nodes[0]->op()->name, "elementwise_add");
  }
}
}
// namespace auto_schedule
}
// namespace cinn
paddle/cinn/auto_schedule/task/task_optimizer.cc
0 → 100644
View file @
992bec46
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/auto_schedule/task/task_optimizer.h"
#include <glog/logging.h>
#include <functional>
#include <limits>
#include "paddle/cinn/auto_schedule/analysis/analyze_ir.h"
#include "paddle/cinn/auto_schedule/cost_model/expr_cost_model.h"
#include "paddle/cinn/auto_schedule/measure/measure.h"
#include "paddle/cinn/auto_schedule/search_strategy/evolutionary_search.h"
#include "paddle/cinn/common/target.h"
#include "paddle/cinn/hlir/framework/op_lowering.h"
#include "paddle/cinn/hlir/op/external_api_registry.h"
#include "paddle/cinn/ir/buffer.h"
#include "paddle/cinn/ir/ir.h"
#include "paddle/cinn/ir/ir_base.h"
#include "paddle/cinn/ir/schedule/ir_schedule.h"
#include "paddle/cinn/ir/utils/ir_copy.h"
#include "paddle/cinn/optim/transform_gpu_forloop.h"
#include "paddle/cinn/runtime/flags.h"
#include "paddle/cinn/utils/string.h"
#ifdef CINN_WITH_CUDA
#include <cuda_runtime_api.h>
#include "paddle/cinn/backends/cuda_util.h"
#endif
DECLARE_bool
(
auto_schedule_use_cost_model
);
namespace
cinn
{
namespace
auto_schedule
{
using
cinn
::
hlir
::
op
::
ExternalApiRegistry
;
// *** Forward declarations of auxiliary functions used in this file only ***

// Update a scheduled function with several post-processors.
ir::LoweredFunc FuncWithUpdatedBody(const common::Target& target,
                                    const ir::LoweredFunc& old_func,
                                    ir::Expr& body);  // NOLINT

// Validity check for a scheduled lowered function; returns true when the
// function has to be pruned.
bool PruneInvalid(const ir::LoweredFunc& lowered_func,
                  const common::Target& target);

// Excludes some special tasks from tuning.
bool IsForbiddenToTune(const TuneTask* task);

// Tells whether the task has been wrapped by custom_call in
// TransToCustomCallPass.
bool IsWrappedByCustomCall(const TuneTask* task);

// Tells whether the task has a registered external api.
bool HasExternalApi(const TuneTask* task);
// Binds the optimizer to the task to tune, the measurer used to time
// candidates and the database that stores tuning records. The random seed is
// passed through LinearRandomEngine::NormalizeState before being stored.
//
// The initializers are listed in the order the members are declared in the
// class (cost_model_ before database_); members are always initialized in
// declaration order, so keeping the list in sync avoids -Wreorder warnings
// and misleading reading order.
TaskOptimizer::TaskOptimizer(TuneTask* task,
                             ScheduleMeasurer* schedule_measurer,
                             Database* database,
                             utils::LinearRandomEngine::StateType rand_seed)
    : task_(task),
      schedule_measurer_(schedule_measurer),
      cost_model_(),
      database_(database),
      rand_seed_(utils::LinearRandomEngine::NormalizeState(rand_seed)) {}
// Produces the lowered functions to use for the bound task.
//
// Tasks that are forbidden to tune or already wrapped as custom_call are
// lowered directly. Otherwise candidates are collected from the evolutionary
// search, the manual schedule and (if registered) the external api, and the
// functions of the candidate with the smallest cost are returned.
FunctionGroup TaskOptimizer::Optimize(const TuningOptions& options) {
  CHECK(task_->subgraph != nullptr) << "subgraph can't be empty";
  // task with forbidden or custom_call ops can't be tuned
  if (IsForbiddenToTune(task_) || IsWrappedByCustomCall(task_)) {
    return task_->op_lowerer->Lower(task_->subgraph);
  }
  // TODO(CtfGo): the input/output names of a Graph::Group will be changed in
  // Lowering by OpLowerer currently, so we should revert them after following
  // different lower methods, remove this hard code by fixing the decoupling
  // between lowering and BuildInstructions
  auto initial_input_names = task_->subgraph->input_names;
  auto initial_output_names = task_->subgraph->output_names;

  // Measuring of the manual/external candidates is only requested when the
  // caller asked for at least one measure trial.
  const bool need_measured = options.num_measure_trials > 0;

  std::vector<TaskOptimizer::Result> candidates;
  candidates.emplace_back(OptimizeByEvolution(options));
  candidates.emplace_back(OptimizeByManual(need_measured));
  if (HasExternalApi(task_)) {
    candidates.emplace_back(OptimizeByExternal(need_measured));
  }

  // Qualified call: the unqualified `sort` only resolved through ADL on the
  // iterator type, which is not guaranteed for every standard library.
  std::sort(candidates.begin(),
            candidates.end(),
            [](const auto& lhs, const auto& rhs) { return lhs.cost < rhs.cost; });
  auto&& best = candidates.front();
  VLOG(4) << "Total candidates=" << candidates.size()
          << ", the best from=" << best.from << ", cost=" << best.cost;

  // revert input/output names
  task_->subgraph->input_names = initial_input_names;
  task_->subgraph->output_names = initial_output_names;
  return best.functions;
}
// Lowers the task with the default (manual) schedule and reports it as a
// candidate.
//
// need_measured: when true and the database holds no record under the manual
// key yet, the lowered functions are measured once and the result is stored.
// Returns a Result tagged "Manual" whose cost is the stored execution cost if
// a record exists, otherwise the default 0.0.
TaskOptimizer::Result TaskOptimizer::OptimizeByManual(bool need_measured) {
  // A string literal may only bind to a pointer-to-const since C++11.
  static constexpr const char* kManualMeasuredKeyPrefix = "@ManualMeasured:\n";
  TaskOptimizer::Result result("Manual");
  result.functions = task_->op_lowerer->Lower(task_->subgraph);

  // pack functions body
  std::vector<ir::Expr> func_bodys;
  for (const ir::LoweredFunc& func : result.functions) {
    func_bodys.push_back(func->body);
  }
  SearchState state(ir::IRSchedule(ir::ModuleExpr(std::move(func_bodys))));

  // the manual is regarded as the second best in default, so we set its cost
  // 0.0
  result.cost = 0.0;

  // add the specific prefix in front of serialized_key to be store/load
  // measured record for manual schedule
  std::string measured_key = kManualMeasuredKeyPrefix + task_->serialized_key;
  if (need_measured && database_->Count(measured_key) == 0) {
    std::vector<MeasureInput> inputs(1);
    inputs.back().task = task_;
    inputs.back().lowered_funcs = result.functions;
    VLOG(4) << "Measure manual schedule";
    std::vector<MeasureResult> measure_outputs =
        schedule_measurer_->Measure(inputs);
    // Same invariant as checked in OptimizeByEvolution; it also protects the
    // element access below.
    CHECK_EQ(measure_outputs.size(), inputs.size())
        << "ScheduleMeasurer didn't output same number of MeasureOutput of "
           "inputs in TaskOptimizer";
    database_->AddRecord(
        TuningRecord(measured_key, state, measure_outputs[0].execution_cost));
  }

  auto measured_records = database_->LookUp(measured_key);
  if (!measured_records.empty()) {
    // update result.cost by measured if exists
    result.cost = measured_records[0].execution_cost;
  }
  return result;
}
// Lowers the task through its registered external api and reports it as a
// candidate.
//
// The first node of the subgraph is rewritten in place: its original op name
// is saved in attr_store["original_op"] and its op is replaced by
// "custom_call" before lowering.
//
// need_measured: when true and the database holds no record under the
// external key yet, the lowered functions are measured once and the result is
// stored. Returns a Result tagged "External" whose cost is the stored
// execution cost if a record exists, otherwise the default -1.0.
TaskOptimizer::Result TaskOptimizer::OptimizeByExternal(bool need_measured) {
  // A string literal may only bind to a pointer-to-const since C++11.
  static constexpr const char* kExternalMeasuredKeyPrefix =
      "@ExternalMeasured:\n";
  TaskOptimizer::Result result("External");
  auto nodes = task_->subgraph->CollectNodes();
  auto* first_node = nodes.front();

  // set the necessary field for lowering with external api
  std::string original_op = first_node->op()->name;
  first_node->attrs.attr_store["original_op"] = original_op;
  first_node->attrs.op = hlir::framework::Operator::Get("custom_call");
  result.functions = task_->op_lowerer->Lower(task_->subgraph);

  // the external is regarded as the best in default, so we set its cost -1.0
  result.cost = -1.0;

  // add the specific prefix in front of serialized_key to be store/load
  // measured record for external api
  std::string measured_key = kExternalMeasuredKeyPrefix + task_->serialized_key;
  if (need_measured && database_->Count(measured_key) == 0) {
    std::vector<MeasureInput> inputs(1);
    inputs.back().task = task_;
    inputs.back().lowered_funcs = result.functions;
    VLOG(4) << "Measure external api";
    std::vector<MeasureResult> measure_outputs =
        schedule_measurer_->Measure(inputs);
    // Same invariant as checked in OptimizeByEvolution; it also protects the
    // element access below.
    CHECK_EQ(measure_outputs.size(), inputs.size())
        << "ScheduleMeasurer didn't output same number of MeasureOutput of "
           "inputs in TaskOptimizer";
    // the SearchState of external is invalid and will not be used, so we just
    // put a temporary one
    database_->AddRecord(TuningRecord(measured_key,
                                      SearchState(ir::IRSchedule()),
                                      measure_outputs[0].execution_cost));
  }

  auto measured_records = database_->LookUp(measured_key);
  if (!measured_records.empty()) {
    // update result.cost by measured if exists
    result.cost = measured_records[0].execution_cost;
  }
  return result;
}
// Returns true when the task consists of exactly one node whose op is listed
// as forbidden (currently only "conv2d"); false otherwise, including for an
// empty node list.
bool IsForbiddenToTune(const TuneTask* task) {
  // TODO(CtfGo): some operators may change its linked edges in
  // TransToCustomCallPass, like conv2d, we will skip these ops in auto-schedule
  // because they can't revert original links for no schedule and manual
  // schedule lowering.
  static const std::unordered_set<std::string> links_changed_ops = {"conv2d"};
  auto nodes = task->subgraph->CollectNodes();
  // Check the size first: front() on an empty container is undefined.
  if (nodes.size() != 1) {
    return false;
  }
  auto&& op_name = nodes.front()->op()->name;
  if (links_changed_ops.count(op_name)) {
    VLOG(5) << "Op:" << op_name << " is forbidden to call external_api";
    return true;
  }
  return false;
}
// Returns true when the task consists of exactly one node and
// ExternalApiRegistry has an entry for that node's op name on the task's
// target; false otherwise, including for an empty node list.
bool HasExternalApi(const TuneTask* task) {
  auto nodes = task->subgraph->CollectNodes();
  // Check the size first: front() on an empty container is undefined.
  if (nodes.size() != 1) {
    return false;
  }
  auto* first_node = nodes.front();
  if (ExternalApiRegistry::Global()->Has(first_node->op()->name,
                                         task->target)) {
    return true;
  }
  return false;
}
// Returns true when the task consists of exactly one node whose op is
// "custom_call"; such a node is required to carry its original op name in
// attr_store["original_op"]. Returns false otherwise, including for an empty
// node list.
bool IsWrappedByCustomCall(const TuneTask* task) {
  auto nodes = task->subgraph->CollectNodes();
  // Check the size first: front() on an empty container is undefined.
  if (nodes.size() != 1) {
    return false;
  }
  auto* first_node = nodes.front();
  if (first_node->op()->name == "custom_call") {
    CHECK(first_node->attrs.attr_store.count("original_op"))
        << "a custom_call op must store its original op name";
    std::string op_name = absl::get<std::string>(
        first_node->attrs.attr_store.at("original_op"));
    VLOG(5) << "Op:" << op_name << " was wrapped as custom_call";
    return true;
  }
  return false;
}
// Searches schedules with EvolutionarySearch and reports the best one found
// as a candidate tagged "Evolution".
//
// The result starts as a copy of the task's initial lowered functions with
// the default (maximum) cost.
//  - num_measure_trials == 0: one search round is run without measuring; the
//    first valid candidate is returned, and its cost is predicted by the cost
//    model only when FLAGS_auto_schedule_use_cost_model is set.
//  - otherwise: search rounds are repeated, every valid candidate is
//    measured, recorded in the database and (optionally) fed to the cost
//    model, until num_measure_trials candidates were measured or more than
//    kMaxRetryContinuousEmpty_ consecutive rounds produced nothing.
TaskOptimizer::Result TaskOptimizer::OptimizeByEvolution(
    const TuningOptions& options) {
  // Guard the modulo below against a zero divisor, which would be undefined
  // behavior instead of a readable failure.
  CHECK_GT(options.num_samples_per_iteration, 0)
      << "TuningOptions.num_samples_per_iteration must be greater than 0.";
  CHECK_EQ(options.num_measure_trials % options.num_samples_per_iteration, 0)
      << "TuningOptions.num_measure_trials % "
         "TuningOptions.num_samples_per_iteration must be 0.";

  VLOG(4) << "Optimizing TuneTask with num_measure_trials:"
          << options.num_measure_trials
          << ", LoweredFunc before optimization is:";
  VLOG(4) << "lowered function size = " << task_->lowered_funcs.size();
  for (size_t i = 0; i < task_->lowered_funcs.size(); ++i) {
    VLOG(4) << "lowered_funcs[" << i << "] detail:\n"
            << task_->lowered_funcs[i];
  }

  if (evolutionary_search_ == nullptr) {
    // TODO(zhhsplendid): check whether the options is same as previous,
    // if not, we should create new EvolutionarySearch
    evolutionary_search_ = std::make_unique<EvolutionarySearch>(
        *task_, cost_model_, database_, utils::ForkRandomState(&rand_seed_));
  }

  TaskOptimizer::Result result("Evolution");
  auto& optimized_funcs = result.functions;
  auto& best_cost = result.cost;
  // use initial lowered function as default result
  optimized_funcs = optim::IRCopy(task_->lowered_funcs);
  if (options.num_measure_trials == 0) {
    // no need to measure and simply return the best searched
    std::vector<MeasureInput> measure_candidates;
    std::vector<SearchState> states =
        SearchOneRound(options, &measure_candidates);
    if (!states.empty()) {
      if (FLAGS_auto_schedule_use_cost_model) {
        best_cost = cost_model_.Predict(
            states.front()->ir_schedule.GetModule(), task_->target);
      }
      optimized_funcs = measure_candidates[0].lowered_funcs;
    } else {
      LOG(WARNING) << "No valid candidate searched, will return initial state";
    }
    return result;
  }

  int measured_count = 0;
  uint32_t continuous_empty_cnt = 0;
  while (measured_count < options.num_measure_trials) {
    VLOG(4) << "Launch a new search, current measured_count:"
            << measured_count;
    std::vector<MeasureInput> measure_inputs;
    std::vector<SearchState> states = SearchOneRound(options, &measure_inputs);
    if (states.empty()) {  // no new valid candidate achieved
      ++continuous_empty_cnt;
      if (continuous_empty_cnt <= kMaxRetryContinuousEmpty_) {
        VLOG(4) << "No valid state searched, continuous_empty_cnt="
                << continuous_empty_cnt;
        continue;
      } else {
        LOG(WARNING)
            << "OptimizeByEvolution will be exited in advance due to "
               "continuous invalid search, final measured_count="
            << measured_count;
        break;
      }
    }
    continuous_empty_cnt = 0;  // reset if get valid candidates

    VLOG(4) << "ScheduleMeasurer start with input size="
            << measure_inputs.size();
    std::vector<MeasureResult> measure_outputs =
        schedule_measurer_->Measure(measure_inputs);
    CHECK_EQ(measure_outputs.size(), states.size())
        << "ScheduleMeasurer didn't output same number of MeasureOutput of "
           "states in TaskOptimizer";
    // record to database
    for (size_t i = 0; i < states.size(); ++i) {
      database_->AddRecord(TuningRecord(measure_inputs[i].task->serialized_key,
                                        states[i],
                                        measure_outputs[i].execution_cost));
    }

    // update cost model
    if (FLAGS_auto_schedule_use_cost_model) {
      std::vector<const ir::ModuleExpr*> cost_model_samples(states.size());
      std::vector<float> cost_model_labels(states.size());
      for (size_t i = 0; i < states.size(); ++i) {
        cost_model_samples[i] = &(states[i]->ir_schedule.GetModule());
        cost_model_labels[i] = measure_outputs[i].execution_cost;
      }
      VLOG(4) << utils::StringFormat(
          "Update CostModel with samples size=%lu,labels size=%lu",
          cost_model_samples.size(),
          cost_model_labels.size());
      cost_model_.Update(cost_model_samples, cost_model_labels, task_->target);
    }

    // update the best
    for (size_t i = 0; i < measure_outputs.size(); ++i) {
      if (measure_outputs[i].execution_cost < best_cost) {
        VLOG(4) << "Update best candidate with execution_cost:"
                << measure_outputs[i].execution_cost << "us";
        best_cost = measure_outputs[i].execution_cost;
        optimized_funcs = measure_inputs[i].lowered_funcs;
      }
    }

    // count result size
    measured_count += states.size();
  }
  return result;
}
// Runs one round of EvolutionarySearch and keeps only the states whose
// functions all pass PruneInvalid.
//
// options: forwarded to EvolutionarySearch::SearchModuleExprEpsGreedy.
// measure_candidates: output; one MeasureInput is appended per kept state, in
//   the same order as the returned states.
// Returns the kept states (possibly empty).
std::vector<SearchState> TaskOptimizer::SearchOneRound(
    const TuningOptions& options,
    std::vector<MeasureInput>* measure_candidates) {
  std::vector<SearchState> states =
      evolutionary_search_->SearchModuleExprEpsGreedy(options);
  VLOG(4) << JoinStatesDebugString("TaskOptimizer::EvolutionarySearch-Result",
                                   states,
                                   /*verbose=*/VLOG_IS_ON(5));
  // Remember how many states the search produced; `states` is compacted in
  // place below, so its size no longer tells this after pruning.
  const size_t searched_cnt = states.size();

  size_t valid_cnt = 0;
  for (size_t i = 0; i < states.size(); ++i) {
    std::vector<ir::Expr> best_exprs =
        states[i]->ir_schedule.GetModule().GetExprs();
    CHECK_EQ(best_exprs.size(), task_->lowered_funcs.size())
        << "RuntimeError: Expr size is not equal to LoweredFunc size in "
           "TaskOptimizer";
    auto init_funcs = optim::IRCopy(task_->lowered_funcs);
    std::vector<ir::LoweredFunc> valid_funcs;
    for (size_t j = 0; j < best_exprs.size(); ++j) {
      auto updated_f =
          UpdateFuncWithNewBody(task_->target, init_funcs[j], best_exprs[j]);
      if (PruneInvalid(updated_f, task_->target)) {
        VLOG(4) << "PruneInvalid states-" << i;
        break;
      }
      valid_funcs.emplace_back(updated_f);
    }

    // all functions are validated, collect this state to be measured
    if (valid_funcs.size() == init_funcs.size()) {
      states[valid_cnt++] = states[i];
      measure_candidates->emplace_back(MeasureInput());
      measure_candidates->back().task = task_;
      measure_candidates->back().lowered_funcs = std::move(valid_funcs);
    }
  }

  states.erase(states.begin() + valid_cnt, states.end());
  CHECK_EQ(states.size(), measure_candidates->size())
      << "result size of states not equal to measure_candidates";
  VLOG(4) << "EvolutionarySearch return size=" << searched_cnt
          << ", valid count=" << valid_cnt;
  VLOG(4) << JoinStatesDebugString("TaskOptimizer::SearchOneRound-Result",
                                   states,
                                   /*verbose=*/VLOG_IS_ON(5));
  return states;
}
// Detects the limit of available shared memory on the current NVGPU with the
// CUDA runtime: returns cudaDeviceProp::sharedMemPerBlock of the current
// device. Returns 0 when built without CINN_WITH_CUDA.
size_t GetGPUSharedMemoryLimit() {
#ifdef CINN_WITH_CUDA
  int device_id;
  CUDA_CALL(cudaGetDevice(&device_id));
  cudaDeviceProp prop;
  CUDA_CALL(cudaGetDeviceProperties(&prop, device_id));
  // sharedMemPerBlock is a size_t, so it needs %zu rather than %d.
  VLOG(4) << utils::StringFormat("GPU-%d GPUSharedMemoryLimit=%zu",
                                 device_id,
                                 prop.sharedMemPerBlock);
  return prop.sharedMemPerBlock;
#else
  return 0;
#endif
}
// Detects the limit of available local/stack memory on the current NVGPU with
// the CUDA runtime, computed as
//   totalGlobalMem / multiProcessorCount / maxThreadsPerMultiProcessor.
// Returns 0 when built without CINN_WITH_CUDA.
size_t GetGPULocalStackLimit() {
#ifdef CINN_WITH_CUDA
  int device_id;
  CUDA_CALL(cudaGetDevice(&device_id));
  cudaDeviceProp prop;
  CUDA_CALL(cudaGetDeviceProperties(&prop, device_id));
  size_t limit = prop.totalGlobalMem / prop.multiProcessorCount /
                 prop.maxThreadsPerMultiProcessor;
  // Arguments follow the order of the labels in the format string
  // (maxThreadsPerMultiProcessor first, then multiProcessorCount).
  VLOG(4) << utils::StringFormat(
      "GPU-%d "
      "totalGlobalMem=%lu,maxThreadsPerMultiProcessor=%d,multiProcessorCount=%"
      "d, calculated "
      "GPULocalStackLimit=%lu",
      device_id,
      prop.totalGlobalMem,
      prop.maxThreadsPerMultiProcessor,
      prop.multiProcessorCount,
      limit);
  return limit;
#else
  return 0;
#endif
}
// Sums the bytes (numel * dtype bytes) of the temp buffers of `lowered_func`
// that live in `used_memory_type`, counting each buffer name once, and
// reports whether the total reaches or exceeds `limit_bytes`.
bool IsGPUMemoryUsageExceedLimit(const ir::LoweredFunc& lowered_func,
                                 const ir::MemoryType& used_memory_type,
                                 const size_t limit_bytes) {
  std::unordered_set<std::string> counted_names;
  size_t total_bytes = 0;
  for (auto&& buffer : lowered_func->temp_bufs) {
    VLOG(5) << "temp buf name=" << buffer->name
            << ", numel=" << buffer->numel() << ",dtype=" << buffer->dtype;
    if (buffer->memory_type != used_memory_type) {
      continue;
    }
    if (counted_names.count(buffer->name) != 0) {
      continue;  // this name was already accounted for
    }
    total_bytes += buffer->numel() * buffer->dtype.bytes();
    counted_names.insert(buffer->name);
  }
  VLOG(5) << "total used_bytes_cnt=" << total_bytes;
  return total_bytes >= limit_bytes;
}
// Returns true when `lowered_func` must be discarded for `target`.
//
// Only the default NVGPU target is checked: the function is pruned when its
// GPUShared or GPULocal temp buffers reach the limits reported by
// GetGPUSharedMemoryLimit / GetGPULocalStackLimit. Any other target is never
// pruned.
bool PruneInvalid(const ir::LoweredFunc& lowered_func,
                  const common::Target& target) {
  if (target == common::DefaultNVGPUTarget()) {
    // The limits are queried once, and only when an NVGPU target is actually
    // validated, so other targets never touch the CUDA runtime here.
    static const size_t kGPUSharedMemoryLimitBytes = GetGPUSharedMemoryLimit();
    static const size_t kGPULocalStackLimitBytes = GetGPULocalStackLimit();

    if (IsGPUMemoryUsageExceedLimit(lowered_func,
                                    ir::MemoryType::GPUShared,
                                    kGPUSharedMemoryLimitBytes)) {
      VLOG(5) << ir::MemoryType::GPUShared
              << " memory usage exceeds limit, func:\n"
              << lowered_func;
      return true;
    }

    if (IsGPUMemoryUsageExceedLimit(lowered_func,
                                    ir::MemoryType::GPULocal,
                                    kGPULocalStackLimitBytes)) {
      VLOG(5) << ir::MemoryType::GPULocal
              << " memory usage exceeds limit, func:\n"
              << lowered_func;
      return true;
    }
  }
  return false;
}
}
// namespace auto_schedule
}
// namespace cinn
paddle/cinn/auto_schedule/task/task_optimizer.h
0 → 100644
View file @
992bec46
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include "paddle/cinn/auto_schedule/cost_model/expr_cost_model.h"
#include "paddle/cinn/auto_schedule/database/database.h"
#include "paddle/cinn/auto_schedule/measure/schedule_measurer.h"
#include "paddle/cinn/auto_schedule/search_strategy/evolutionary_search.h"
#include "paddle/cinn/auto_schedule/task/tune_task.h"
#include "paddle/cinn/auto_schedule/tuning.h"
#include "paddle/cinn/ir/lowered_func.h"
#include "paddle/cinn/utils/random_engine.h"
namespace
cinn
{
namespace
auto_schedule
{
// Tunes one specific task: it wires together the components needed to search
// for the optimal schedule of that task (measurer, database, cost model and
// evolutionary search).
class TaskOptimizer {
 public:
  // None of the pointers are owned by the optimizer.
  TaskOptimizer(TuneTask* task,
                ScheduleMeasurer* schedule_measurer,
                Database* database,
                utils::LinearRandomEngine::StateType rand_seed = -1);

  // Returns the functions chosen for the task under the given options.
  FunctionGroup Optimize(const TuningOptions& options);

 private:
  // One candidate produced by a single optimization method.
  struct Result {
    std::string from;         // label of the method that produced it
    double cost;              // starts at the maximum double value
    FunctionGroup functions;  // lowered functions of the candidate

    explicit Result(const std::string& from_type)
        : from(from_type), cost(std::numeric_limits<double>::max()) {}
  };

  Result OptimizeByManual(bool need_measure);
  Result OptimizeByExternal(bool need_measure);
  Result OptimizeByEvolution(const TuningOptions& options);

  // call search candidates once by EvolutionarySearch and prune invalid ones
  std::vector<SearchState> SearchOneRound(
      const TuningOptions& options,
      std::vector<MeasureInput>* measure_candidates);

 private:
  // the max retry times if continuously get empty result
  static constexpr uint32_t kMaxRetryContinuousEmpty_ = 3;

  TuneTask* task_;
  ScheduleMeasurer* schedule_measurer_;
  // Starts empty.
  std::unique_ptr<EvolutionarySearch> evolutionary_search_ = nullptr;
  ExprCostModel cost_model_;
  Database* database_;
  utils::LinearRandomEngine::StateType rand_seed_;
};
}
// namespace auto_schedule
}
// namespace cinn
paddle/cinn/auto_schedule/task/task_registry.h
0 → 100644
View file @
992bec46
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <gflags/gflags.h>
#include <mutex>
#include <string>
#include "paddle/cinn/ir/schedule/ir_schedule.h"
#include "paddle/cinn/ir/utils/ir_copy.h"
#include "paddle/cinn/utils/registry.h"
namespace
cinn
{
namespace
auto_schedule
{
// Registry entry that pairs a task key with a ModuleExpr stored by value.
struct InitialTaskInfo {
  std::string task_key;
  ir::ModuleExpr module_expr;

  // Both arguments are copied into the entry.
  InitialTaskInfo(const std::string& task_key,
                  const ir::ModuleExpr& module_expr)
      : task_key(task_key), module_expr(module_expr) {}
};
// Global task registry, used to save the initial ModuleExpr of each task.
//
// Access goes through the process-wide instance returned by Global().
// Regist() takes `registering_mutex`; Get() and Has() do not lock.
// NOTE(review): whether Registry<>::Find is safe to call concurrently with
// Regist() is not visible here — confirm before relying on it.
class InitialTaskRegistry : public Registry<InitialTaskInfo> {
 public:
  // Returns the singleton instance (function-local static).
  static InitialTaskRegistry* Global() {
    static InitialTaskRegistry x;
    return &x;
  }

  // Get the initial ModuleExpr of a task. Fails a CHECK when `task_key` is
  // not registered.
  const InitialTaskInfo* Get(const std::string& task_key) {
    const InitialTaskInfo* task_info =
        Registry<InitialTaskInfo>::Find(task_key);
    CHECK(task_info) << "InitialTaskInfo [" << task_key
                     << "] is not registered";
    return task_info;
  }

  // Check if the task info with task_key exists.
  // Returns plain `bool`: a top-level const on a by-value return type has no
  // effect and only triggers -Wignored-qualifiers.
  bool Has(const std::string& task_key) {
    return nullptr != Registry<InitialTaskInfo>::Find(task_key);
  }

  // Regist the initial ModuleExpr of a task into the map. A copy made with
  // optim::IRCopy is stored; a key that is already present is left untouched.
  void Regist(const std::string& task_key, const ir::ModuleExpr& module_expr) {
    std::lock_guard<std::mutex> guard(registering_mutex);
    if (fmap_.count(task_key) == 0) {
      InitialTaskInfo* task_info =
          new InitialTaskInfo(task_key, optim::IRCopy(module_expr));
      __REGISTER__(task_key, task_info);
    }
  }

 private:
  InitialTaskRegistry() = default;
  CINN_DISALLOW_COPY_AND_ASSIGN(InitialTaskRegistry);

  // Regist the initial ModuleExpr of a task: stores the entry in the key map
  // and in both entry lists of the base registry.
  InitialTaskInfo* __REGISTER__(const std::string& task_key,
                                InitialTaskInfo* task_info) {
    fmap_[task_key] = task_info;
    const_list_.push_back(task_info);
    entry_list_.push_back(task_info);
    return task_info;
  }
};
}
// namespace auto_schedule
}
// namespace cinn
paddle/cinn/auto_schedule/task/task_registry_test.cc
0 → 100644
View file @
992bec46
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/auto_schedule/task/task_registry.h"
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <cstdlib>
#include "paddle/cinn/auto_schedule/task/task_creator.h"
#include "paddle/cinn/auto_schedule/task/tune_task.h"
#include "paddle/cinn/frontend/net_builder.h"
#include "paddle/cinn/hlir/framework/graph.h"
#include "paddle/cinn/hlir/framework/graph_compiler.h"
#include "paddle/cinn/hlir/framework/op_lowering.h"
#include "paddle/cinn/utils/string.h"
#include "paddle/cinn/utils/type_defs.h"
DECLARE_bool
(
auto_schedule_use_cost_model
);
namespace
cinn
{
namespace
auto_schedule
{
// Creates op-level tune tasks for `graph` and initializes each of them with
// the graph's "infershape"/"inferdtype" attributes and an OpLowerer built for
// `target`.
//
// NOTE(review): the OpLowerer is owned by a local unique_ptr and only its raw
// pointer is handed to TuneTask::Initialize, so it is destroyed when this
// function returns — confirm callers never lower through the returned tasks.
std::vector<TuneTask> CreateTasks(hlir::framework::Graph* graph,
                                  const common::Target& target) {
  using DtypeDict = absl::flat_hash_map<std::string, common::Type>;
  using ShapeDict = absl::flat_hash_map<std::string, hlir::framework::shape_t>;

  // create tasks
  TaskCreator creator;
  std::vector<TuneTask> tune_tasks = creator.CreateTuneTaskOpLevel(graph);

  const auto& dtype_dict = graph->GetAttrs<DtypeDict>("inferdtype");
  const auto& shape_dict = graph->GetAttrs<ShapeDict>("infershape");

  auto lowerer = std::make_unique<hlir::framework::OpLowerer>(
      dtype_dict, shape_dict, target);
  for (TuneTask& tune_task : tune_tasks) {
    tune_task.Initialize(shape_dict, dtype_dict, lowerer.get());
    VLOG(3) << "Add a task with serialized_key:\n" << tune_task.serialized_key;
  }

  return tune_tasks;
}
// Builds a graph for `target` from a program with a single add: input "A" of
// shape {1, 64, 112, 112} plus input "B" of shape {64}, with 1 passed as the
// third argument of Add.
std::shared_ptr<hlir::framework::Graph> CreateAddProgram(
    const common::Target& target) {
  frontend::NetBuilder net("test");

  auto lhs = net.CreateInput(Float(32), {1, 64, 112, 112}, "A");
  auto rhs = net.CreateInput(Float(32), {64}, "B");
  // The returned handle is not needed; the call records the op in the builder.
  auto sum = net.Add(lhs, rhs, 1);
  (void)sum;

  auto program = net.Build();
  return std::make_shared<hlir::framework::Graph>(program, target);
}
// Registers the lowered-function bodies of every task in the global
// InitialTaskRegistry, then checks that what is read back has the same number
// of expressions and the same printed form, and that Has() reports registered
// and unknown keys correctly.
TEST(TestTaskRegistry, basic) {
  FLAGS_auto_schedule_use_cost_model = true;

#ifdef CINN_WITH_CUDA
  Target target = common::DefaultNVGPUTarget();
#else
  Target target = common::DefaultHostTarget();
#endif
  std::shared_ptr<hlir::framework::Graph> graph = CreateAddProgram(target);
  std::vector<TuneTask> tasks = CreateTasks(graph.get(), target);
  // tasks[0] is accessed at the end of the test.
  ASSERT_FALSE(tasks.empty());

  InitialTaskRegistry* task_registry = InitialTaskRegistry::Global();

  std::vector<ir::ModuleExpr> module_exprs;
  for (const TuneTask& task : tasks) {
    module_exprs.emplace_back(task.GetLoweredFuncBodyExprs());
    task_registry->Regist(task.serialized_key, module_exprs.back());
  }

  // size_t indices: avoids comparing a signed int against size().
  for (size_t i = 0; i < tasks.size(); ++i) {
    std::string key = tasks[i].serialized_key;
    VLOG(3) << "serialized_key = " << key;
    ir::ModuleExpr new_expr = task_registry->Get(key)->module_expr;

    ASSERT_EQ(new_expr.GetExprs().size(), module_exprs[i].GetExprs().size());
    for (size_t j = 0; j < new_expr.GetExprs().size(); ++j) {
      VLOG(3) << "expr " << j << " of task " << key << " : "
              << new_expr.GetExprs().at(j);
      ASSERT_EQ(utils::GetStreamCnt(new_expr.GetExprs().at(j)),
                utils::GetStreamCnt(module_exprs[i].GetExprs().at(j)));
    }
  }

  bool flag = task_registry->Has(tasks[0].serialized_key);
  ASSERT_TRUE(flag);

  flag = task_registry->Has("not_exist");
  ASSERT_FALSE(flag);
}
}
// namespace auto_schedule
}
// namespace cinn
paddle/cinn/auto_schedule/task/tune_task.cc
0 → 100644
View file @
992bec46
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/auto_schedule/task/tune_task.h"
#include <glog/logging.h>
#include <iostream>
#include <vector>
#include "paddle/cinn/auto_schedule/analysis/analyze_ir.h"
#include "paddle/cinn/hlir/framework/node.h"
#include "paddle/cinn/hlir/framework/op_lowering.h"
#include "paddle/cinn/ir/ir_base.h"
#include "paddle/cinn/ir/lowered_func.h"
#include "paddle/cinn/ir/schedule/ir_schedule.h"
#include "paddle/cinn/utils/string.h"
namespace
cinn
{
namespace
auto_schedule
{
// Prepares this task for tuning: remembers the (non-owned) lowerer, lowers the
// subgraph without applying any schedule, derives the output argument names
// from the lowered functions, and computes the serialized key.
// `lower_handler` must not be null.
void TuneTask::Initialize(
    const absl::flat_hash_map<std::string, hlir::framework::shape_t>&
        shape_dict,
    const absl::flat_hash_map<std::string, cinn::common::Type>& dtype_dict,
    hlir::framework::OpLowerer* lower_handler) {
  CHECK(lower_handler != nullptr) << "op_lowerer can't be nullptr";
  op_lowerer = lower_handler;

  // Set lowered_funcs and analyze output names.
  const bool apply_op_schedule = false;
  const bool apply_group_schedule = false;
  lowered_funcs =
      op_lowerer->Lower(subgraph, apply_op_schedule, apply_group_schedule);
  output_names = GetOutputNamesFromLoweredFunc(lowered_funcs);

  serialized_key = SerializeToString(shape_dict, dtype_dict);
}
std
::
vector
<
ir
::
Expr
>
TuneTask
::
GetLoweredFuncBodyExprs
()
const
{
std
::
vector
<
ir
::
Expr
>
result
;
for
(
const
ir
::
LoweredFunc
&
func
:
lowered_funcs
)
{
result
.
push_back
(
func
->
body
);
}
return
result
;
}
// Builds the textual key of this task: the target, followed by a "Group { }"
// section holding one line per node of the subgraph in the form
//   (outputs) = op_name(inputs)
// where each variable is written as `id->dtype[dims]`. Shapes and dtypes are
// looked up in `shape_dict` / `dtype_dict`; a missing entry fails a CHECK.
std::string TuneTask::SerializeToString(
    const absl::flat_hash_map<std::string, hlir::framework::shape_t>&
        shape_dict,
    const absl::flat_hash_map<std::string, cinn::common::Type>& dtype_dict) {
  std::stringstream key_stream;
  key_stream << target << "\n\n";  // print target

  // Local helper that writes dtype and shape of the variables on one side of a
  // node: the edge sources when `is_input` is true, the edge sinks otherwise.
  auto append_links =
      [&key_stream, &shape_dict, &dtype_dict](
          const std::vector<common::Shared<common::GraphEdge>>& links,
          bool is_input) {
        bool is_first = true;
        for (auto&& edge : links) {
          const hlir::framework::NodeData* var_node = nullptr;
          if (is_input) {
            var_node = edge->source()->safe_as<hlir::framework::NodeData>();
          } else {
            var_node = edge->sink()->safe_as<hlir::framework::NodeData>();
          }
          CHECK(var_node) << "var node invalid";

          auto shape_it = shape_dict.find(var_node->id());
          CHECK(shape_it != shape_dict.end())
              << "can't find shape of variable:" << var_node->id();
          auto dtype_it = dtype_dict.find(var_node->id());
          CHECK(dtype_it != dtype_dict.end())
              << "can't find dtype of variable:" << var_node->id();

          // Entries after the first one are separated by ", ".
          if (!is_first) {
            key_stream << ", ";
          }
          is_first = false;

          // TODO(CtfGo): CINN uses the names of input/output NodeData ids as
          // arguments of the LoweredFunc in the Lower process, so it will
          // result in different LoweredFuncs for two Nodes even though they
          // represents the same operator. Here we add `var_node->id()` into the
          // serialized_key to distinguish them, otherwise AutoTuner will get
          // wrong TuningRecords when querying cached results from database. In
          // the future, we should remove name-related limit in Lower process,
          // to avoid duplicate tuning tasks with same operators.
          key_stream << var_node->id() << "->"
                     << cinn::common::Type2Str(dtype_it->second) << "["
                     << utils::Join(shape_it->second, ",") << "]";
        }
      };

  // print each node of the subgraph
  key_stream << "Group {\n";
  for (auto&& node : subgraph->CollectNodes()) {
    key_stream << " (";
    append_links(node->outlinks_in_order(), false);
    key_stream << ") = " << node->op()->name << "(";
    append_links(node->inlinks_in_order(), true);
    key_stream << ")\n";
  }
  key_stream << "}\n";
  return key_stream.str();
}
}
// namespace auto_schedule
}
// namespace cinn
paddle/cinn/auto_schedule/task/tune_task.h
0 → 100644
View file @
992bec46
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <absl/container/flat_hash_map.h>
#include <memory>
#include <string>
#include <vector>
#include "paddle/cinn/common/target.h"
#include "paddle/cinn/common/type.h"
#include "paddle/cinn/hlir/framework/graph.h"
#include "paddle/cinn/hlir/framework/node.h"
#include "paddle/cinn/hlir/framework/op_lowering.h"
#include "paddle/cinn/ir/ir.h"
#include "paddle/cinn/ir/ir_base.h"
#include "paddle/cinn/ir/lowered_func.h"
#include "paddle/cinn/ir/schedule/ir_schedule.h"
namespace
cinn
{
namespace
auto_schedule
{
// A unit of auto-tuning work: one (possibly fused) sub-graph together with
// its un-optimized lowered functions and the key that identifies it.
class TuneTask {
 public:
  TuneTask() = default;
  explicit TuneTask(std::shared_ptr<hlir::framework::Graph::Group> group)
      : subgraph(group) {}

  // Initialize a task: lowers the subgraph through `lower_handler` and fills
  // lowered_funcs, output_names and serialized_key.
  void Initialize(
      const absl::flat_hash_map<std::string, hlir::framework::shape_t>&
          shape_dict,
      const absl::flat_hash_map<std::string, cinn::common::Type>& dtype_dict,
      hlir::framework::OpLowerer* lower_handler);

  // Extract bodies in lowered_funcs and return them
  std::vector<ir::Expr> GetLoweredFuncBodyExprs() const;

  // In CINN, we use hlir::framework::Graph::Group to represent a fused
  // sub-graph (if an op won't be fused, it will be a Group with size=1).
  std::shared_ptr<hlir::framework::Graph::Group> subgraph;
  // Lower handler, not owned. Starts as nullptr so that a default-constructed
  // task never carries an indeterminate pointer; set by Initialize().
  hlir::framework::OpLowerer* op_lowerer = nullptr;
  // target of this task
  common::Target target;
  // stores the initial (un-optimized) LoweredFuncs
  std::vector<ir::LoweredFunc> lowered_funcs;
  // names of the output arguments of lowered_funcs
  std::unordered_set<std::string> output_names;
  // serialized string of this task, it contains struct,shape,dtype,input/output
  // variable name of the subgraph and can be further used to hash
  std::string serialized_key;

 private:
  // Serialize this task as a string that contains specific fields of it
  std::string SerializeToString(
      const absl::flat_hash_map<std::string, hlir::framework::shape_t>&
          shape_dict,
      const absl::flat_hash_map<std::string, cinn::common::Type>& dtype_dict);
};
}
// namespace auto_schedule
}
// namespace cinn
paddle/cinn/auto_schedule/task/tune_task_test.cc
0 → 100644
View file @
992bec46
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/auto_schedule/task/tune_task.h"
#include <gtest/gtest.h>
#include <iostream>
#include <memory>
#include <vector>
#include "paddle/cinn/auto_schedule/task/task_creator.h"
#include "paddle/cinn/common/context.h"
#include "paddle/cinn/common/target.h"
#include "paddle/cinn/frontend/net_builder.h"
#include "paddle/cinn/frontend/syntax.h"
#include "paddle/cinn/hlir/framework/graph.h"
#include "paddle/cinn/hlir/framework/node.h"
#include "paddle/cinn/hlir/framework/op_lowering.h"
#include "paddle/cinn/hlir/framework/pass.h"
#include "paddle/cinn/hlir/framework/scope.h"
#include "paddle/cinn/ir/ir_base.h"
#include "paddle/cinn/ir/schedule/ir_schedule.h"
#include "paddle/cinn/ir/utils/ir_printer.h"
#include "paddle/cinn/utils/string.h"
namespace
cinn
{
namespace
auto_schedule
{
using
::
cinn
::
frontend
::
NetBuilder
;
using
::
cinn
::
frontend
::
Program
;
using
::
cinn
::
hlir
::
framework
::
OpLowerer
;
// Builds a program with two chained element-wise adds on 32x24 float32
// inputs: c = A + B, followed by A + c.
Program CreateAddProgram() {
  constexpr int M = 32;
  constexpr int N = 24;

  NetBuilder builder("net_builder");
  auto a = builder.CreateInput(Float(32), {M, N}, "A");
  auto b = builder.CreateInput(Float(32), {M, N}, "B");
  auto c = builder.Add(a, b);
  // The result of the second add is not referenced again, so it is not named.
  builder.Add(a, c);
  return builder.Build();
}
// Creates op-level tasks from the add program without running any graph pass
// (two tasks are expected), lowers each of them and compares the printed
// function bodies with the expected IR text for the active target.
TEST(TuneTask, GraphToUnoptLoweredFunc_NoPass) {
  Context::Global().ResetNameId();
#ifdef CINN_WITH_CUDA
  Target target = common::DefaultNVGPUTarget();
#else
  Target target = common::DefaultHostTarget();
#endif
  Program add_program = CreateAddProgram();
  auto graph = std::make_shared<hlir::framework::Graph>(add_program, target);

  TaskCreator creator;
  std::vector<TuneTask> tune_tasks = creator.CreateTuneTaskOpLevel(graph.get());
  ASSERT_EQ(tune_tasks.size(), 2UL);

  const auto& shape_dict = graph->GetAttrs<
      absl::flat_hash_map<std::string, hlir::framework::shape_t>>("infershape");
  const auto& dtype_dict =
      graph->GetAttrs<absl::flat_hash_map<std::string, common::Type>>(
          "inferdtype");
  OpLowerer op_lowerer(dtype_dict, shape_dict, target);

  // Print every lowered body, one per line, into a single string.
  std::stringstream printed_ir;
  for (TuneTask& tune_task : tune_tasks) {
    tune_task.Initialize(shape_dict, dtype_dict, &op_lowerer);

    std::vector<ir::Expr> bodies = tune_task.GetLoweredFuncBodyExprs();
    VLOG(6) << "ir:Expr is: ";
    for (const ir::Expr& body : bodies) {
      VLOG(6) << body;
      printed_ir << body << std::endl;
    }
  }
  std::string expr_str = printed_ir.str();

#ifdef CINN_WITH_CUDA
  std::string target_str = R"ROC(
{
ScheduleBlock(root)
{
serial for (i, 0, 32)
{
serial for (j, 0, 24)
{
ScheduleBlock(var_1)
{
i0, i1 = axis.bind(i, j)
var_1[i, j] = (A[i, j] + B[i, j])
}
}
}
}
}
{
ScheduleBlock(root_0)
{
serial for (i, 0, 32)
{
serial for (j, 0, 24)
{
ScheduleBlock(var_2)
{
i0_0, i1_0 = axis.bind(i, j)
var_2[i, j] = (A[i, j] + var_1[i, j])
}
}
}
}
}
)ROC";
#else
  std::string target_str = R"ROC(
{
ScheduleBlock(root)
{
serial for (i, 0, 32)
{
serial for (j, 0, 24)
{
ScheduleBlock(var_1)
{
i0, i1 = axis.bind(i, j)
var_1[i0, i1] = (A[i0, i1] + B[i0, i1])
}
}
}
}
}
{
ScheduleBlock(root_0)
{
serial for (i, 0, 32)
{
serial for (j, 0, 24)
{
ScheduleBlock(var_2)
{
i0_0, i1_0 = axis.bind(i, j)
var_2[i0_0, i1_0] = (A[i0_0, i1_0] + var_1[i0_0, i1_0])
}
}
}
}
}
)ROC";
#endif

  // Leading/trailing whitespace is ignored on both sides of the comparison.
  EXPECT_EQ(utils::Trim(target_str), utils::Trim(expr_str));
}
// Same flow as the no-pass variant, but OpFusionPass is applied to the graph
// first, so a single task is expected and its printed body contains both adds
// under one root ScheduleBlock.
TEST(TuneTask, GraphToUnoptLoweredFunc_ApplyPass) {
  Context::Global().ResetNameId();
#ifdef CINN_WITH_CUDA
  Target target = common::DefaultNVGPUTarget();
#else
  Target target = common::DefaultHostTarget();
#endif
  Program add_program = CreateAddProgram();
  auto graph = std::make_shared<hlir::framework::Graph>(add_program, target);
  ApplyPass(graph.get(), "OpFusionPass");

  TaskCreator creator;
  std::vector<TuneTask> tune_tasks = creator.CreateTuneTaskOpLevel(graph.get());
  ASSERT_EQ(tune_tasks.size(), 1UL);

  const auto& shape_dict = graph->GetAttrs<
      absl::flat_hash_map<std::string, hlir::framework::shape_t>>("infershape");
  const auto& dtype_dict =
      graph->GetAttrs<absl::flat_hash_map<std::string, common::Type>>(
          "inferdtype");
  OpLowerer op_lowerer(dtype_dict, shape_dict, target);

  // Print every lowered body, one per line, into a single string.
  std::stringstream printed_ir;
  for (TuneTask& tune_task : tune_tasks) {
    tune_task.Initialize(shape_dict, dtype_dict, &op_lowerer);

    std::vector<ir::Expr> bodies = tune_task.GetLoweredFuncBodyExprs();
    VLOG(6) << "ir:Expr is: ";
    for (const ir::Expr& body : bodies) {
      VLOG(6) << body;
      printed_ir << body << std::endl;
    }
  }
  std::string expr_str = printed_ir.str();

#ifdef CINN_WITH_CUDA
  std::string target_str = R"ROC(
{
ScheduleBlock(root)
{
{
serial for (i, 0, 32)
{
serial for (j, 0, 24)
{
ScheduleBlock(var_1)
{
i0, i1 = axis.bind(i, j)
var_1[i, j] = (A[i, j] + B[i, j])
}
}
}
serial for (i, 0, 32)
{
serial for (j, 0, 24)
{
ScheduleBlock(var_2)
{
i0_0, i1_0 = axis.bind(i, j)
var_2[i, j] = (A[i, j] + var_1[i, j])
}
}
}
}
}
}
)ROC";
#else
  std::string target_str = R"ROC(
{
ScheduleBlock(root)
{
{
serial for (i, 0, 32)
{
serial for (j, 0, 24)
{
ScheduleBlock(var_1)
{
i0, i1 = axis.bind(i, j)
var_1[i0, i1] = (A[i0, i1] + B[i0, i1])
}
}
}
serial for (i, 0, 32)
{
serial for (j, 0, 24)
{
ScheduleBlock(var_2)
{
i0_0, i1_0 = axis.bind(i, j)
var_2[i0_0, i1_0] = (A[i0_0, i1_0] + var_1[i0_0, i1_0])
}
}
}
}
}
}
)ROC";
#endif

  // Leading/trailing whitespace is ignored on both sides of the comparison.
  EXPECT_EQ(utils::Trim(target_str), utils::Trim(expr_str));
}
// Checks serialized_key twice: for the first op-level task of the unfused
// graph, and for the single task produced after OpFusionPass.
TEST(TuneTask, SerializeToString) {
  Context::Global().ResetNameId();
#ifdef CINN_WITH_CUDA
  Target target = common::DefaultNVGPUTarget();
#else
  Target target = common::DefaultHostTarget();
#endif
  Program add_program = CreateAddProgram();
  auto graph = std::make_shared<hlir::framework::Graph>(add_program, target);

  TaskCreator task_creator;
  std::vector<TuneTask> single_tasks =
      task_creator.CreateTuneTaskOpLevel(graph.get());

  const auto& shape_dict = graph->GetAttrs<
      absl::flat_hash_map<std::string, hlir::framework::shape_t>>("infershape");
  const auto& dtype_dict =
      graph->GetAttrs<absl::flat_hash_map<std::string, common::Type>>(
          "inferdtype");
  OpLowerer op_lowerer(dtype_dict, shape_dict, target);

  // Part 1: one task per op before fusion.
  ASSERT_EQ(single_tasks.size(), 2UL);
  for (auto&& single_task : single_tasks) {
    single_task.Initialize(shape_dict, dtype_dict, &op_lowerer);
  }

#ifdef CINN_WITH_CUDA
  std::string single_add_str = R"ROC(Target<linux,nvgpu,64>
Group {
(var_1->float32[32,24]) = elementwise_add(A->float32[32,24], B->float32[32,24])
}
)ROC";
#else
  std::string single_add_str = R"ROC(Target<linux,x86,64>
Group {
(var_1->float32[32,24]) = elementwise_add(A->float32[32,24], B->float32[32,24])
}
)ROC";
#endif
  EXPECT_EQ(single_tasks[0].serialized_key, single_add_str);

  // Part 2: after fusion both adds belong to the same task.
  ApplyPass(graph.get(), "OpFusionPass");
  std::vector<TuneTask> fused_tasks =
      task_creator.CreateTuneTaskOpLevel(graph.get());
  ASSERT_EQ(fused_tasks.size(), 1UL);
  fused_tasks[0].Initialize(shape_dict, dtype_dict, &op_lowerer);

#ifdef CINN_WITH_CUDA
  std::string fused_expected_str = R"ROC(Target<linux,nvgpu,64>
Group {
(var_1->float32[32,24]) = elementwise_add(A->float32[32,24], B->float32[32,24])
(var_2->float32[32,24]) = elementwise_add(A->float32[32,24], var_1->float32[32,24])
}
)ROC";
#else
  std::string fused_expected_str = R"ROC(Target<linux,x86,64>
Group {
(var_1->float32[32,24]) = elementwise_add(A->float32[32,24], B->float32[32,24])
(var_2->float32[32,24]) = elementwise_add(A->float32[32,24], var_1->float32[32,24])
}
)ROC";
#endif
  EXPECT_EQ(fused_tasks[0].serialized_key, fused_expected_str);
}
}
// namespace auto_schedule
}
// namespace cinn
paddle/cinn/auto_schedule/task_scheduler/CMakeLists.txt
0 → 100644
View file @
992bec46
# Build rules for the task_scheduler directory.
# NOTE(review): core_gather_headers / gather_srcs / cinn_cc_test are project
# macros defined elsewhere; their exact behavior is not visible here.
core_gather_headers
()
# Adds the scheduler implementation files to the cinnapi_src source list.
gather_srcs
(
cinnapi_src SRCS task_scheduler.cc round_robin.cc
efficiency_priority.cc
)
# Unit test target for the schedulers; depends on cinncore.
cinn_cc_test
(
test_task_scheduler SRCS task_scheduler_test.cc DEPS cinncore
)
paddle/cinn/auto_schedule/task_scheduler/efficiency_priority.cc
0 → 100644
View file @
992bec46
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/auto_schedule/task_scheduler/efficiency_priority.h"
namespace
cinn
{
namespace
auto_schedule
{
// Returns the id of the next task, at or after the current position, that
// IsTaskToTune() accepts, and advances past it. Tasks that are rejected are
// skipped. Returns -1 once every task has been visited.
int EfficiencyPriority::NextTaskId() {
  // Convert once so the loop compares two ints rather than int vs size_t.
  const int task_count = static_cast<int>(tasks_->size());
  for (; cur_task_id_ < task_count; ++cur_task_id_) {
    if (IsTaskToTune(&tasks_->at(cur_task_id_))) {
      // Hand out this id and leave the cursor on the following task.
      return cur_task_id_++;
    }
  }
  return -1;
}
// Decides whether a task should be tuned. The current implementation does not
// inspect `task`: the answer is true exactly when the configured
// minimum_gain_threshold is greater than zero.
bool EfficiencyPriority::IsTaskToTune(const TuneTask* task) {
  const bool threshold_is_positive = config_.minimum_gain_threshold > 0.0;
  return threshold_is_positive;
}
}
// namespace auto_schedule
}
// namespace cinn
paddle/cinn/auto_schedule/task_scheduler/efficiency_priority.h
0 → 100644
View file @
992bec46
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/cinn/auto_schedule/task_scheduler/task_scheduler.h"
namespace
cinn
{
namespace
auto_schedule
{
// Scheduler for the "efficiency_priority" strategy, described by its author
// as picking the task with the maximum earnings ratio.
class EfficiencyPriority : public TaskScheduler {
 public:
  EfficiencyPriority(const std::vector<TuneTask>& tasks, const Config& config)
      : TaskScheduler(tasks, config) {}

  // Strategy name reported by this scheduler.
  const char* Name() const override { return "efficiency_priority"; }

  // Selects the next task to tune.
  int NextTaskId() override;

 private:
  // Tells whether the given task should be tuned.
  bool IsTaskToTune(const TuneTask* task);
};
}
// namespace auto_schedule
}
// namespace cinn
paddle/cinn/auto_schedule/task_scheduler/round_robin.cc
0 → 100644
View file @
992bec46
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/auto_schedule/task_scheduler/round_robin.h"
namespace
cinn
{
namespace
auto_schedule
{
// Hands out task ids in order, one per call: 0, 1, 2, ... Returns -1 once
// every task has been handed out (until Reset() rewinds the cursor).
int RoundRobin::NextTaskId() {
  // Compare as ints to avoid mixing the signed cursor with size_t.
  const int task_count = static_cast<int>(tasks_->size());
  if (cur_task_id_ >= task_count) {
    return -1;
  }
  return cur_task_id_++;
}
}
// namespace auto_schedule
}
// namespace cinn
paddle/cinn/auto_schedule/task_scheduler/round_robin.h
0 → 100644
View file @
992bec46
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/cinn/auto_schedule/task_scheduler/task_scheduler.h"
namespace
cinn
{
namespace
auto_schedule
{
// Scheduler for the "round_robin" strategy: tasks are picked to tune one at
// a time, iteratively.
class RoundRobin : public TaskScheduler {
 public:
  RoundRobin(const std::vector<TuneTask>& tasks, const Config& config)
      : TaskScheduler(tasks, config) {}

  // Strategy name reported by this scheduler.
  const char* Name() const override { return "round_robin"; }

  // Selects the next task to tune.
  int NextTaskId() override;
};
}
// namespace auto_schedule
}
// namespace cinn
paddle/cinn/auto_schedule/task_scheduler/task_scheduler.cc
0 → 100644
View file @
992bec46
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/auto_schedule/task_scheduler/task_scheduler.h"
#include <algorithm>
#include "paddle/cinn/auto_schedule/task/tune_task.h"
#include "paddle/cinn/auto_schedule/task_scheduler/efficiency_priority.h"
#include "paddle/cinn/auto_schedule/task_scheduler/round_robin.h"
namespace
cinn
{
namespace
auto_schedule
{
// Factory for schedulers. `strategy` selects the implementation:
// "round_robin" or "efficiency_priority"; any other name is a fatal error.
// `tasks` must not be empty. The scheduler keeps a pointer to `tasks`, so the
// vector has to outlive the returned object.
std::unique_ptr<TaskScheduler> TaskScheduler::Make(
    const std::vector<TuneTask>& tasks,
    const Config& config,
    const std::string& strategy) {
  // Unsigned literal keeps both sides of the comparison the same signedness.
  CHECK_GT(tasks.size(), 0UL) << "Empty task list";

  if (strategy == "round_robin") {
    return std::make_unique<RoundRobin>(tasks, config);
  }
  if (strategy == "efficiency_priority") {
    return std::make_unique<EfficiencyPriority>(tasks, config);
  }

  LOG(FATAL) << "Unimplemented strategy:" << strategy;
  return nullptr;
}
// Stores a copy of `config` and a pointer to `tasks` (not a copy, so the
// vector must outlive this scheduler), and starts scheduling from task 0.
// Initializers are listed in member declaration order (config_, cur_task_id_,
// tasks_), which is the order in which they actually run.
TaskScheduler::TaskScheduler(const std::vector<TuneTask>& tasks,
                             const Config& config)
    : config_(config), cur_task_id_(0), tasks_(&tasks) {}
// Rewinds the cursor so that scheduling starts again from task 0.
void TaskScheduler::Reset() { cur_task_id_ = 0; }
}
// namespace auto_schedule
}
// namespace cinn
paddle/cinn/auto_schedule/task_scheduler/task_scheduler.h
0 → 100644
View file @
992bec46
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <functional>
#include <memory>
#include <string>
#include <vector>
#include "paddle/cinn/auto_schedule/task/task_optimizer.h"
#include "paddle/cinn/auto_schedule/task/tune_task.h"
#include "paddle/cinn/auto_schedule/tuning.h"
namespace
cinn
{
namespace
auto_schedule
{
// Class for scheduling tasks to perform auto-tune
class TaskScheduler {
 public:
  // All configs for different schedule strategies
  // will be defined here together.
  struct Config {
    // The minimum threshold of earnings ratio, used by EfficiencyPriority
    float minimum_gain_threshold = 0.0;
  };

  // Create a TaskScheduler with the specific strategy name
  // and necessary construct parameters.
  static std::unique_ptr<TaskScheduler> Make(
      const std::vector<TuneTask>& tasks,
      const Config& config,
      const std::string& strategy = "round_robin");

  // Make() returns derived schedulers through std::unique_ptr<TaskScheduler>,
  // so they are destroyed via a base-class pointer; the destructor has to be
  // virtual for that to be well-defined.
  virtual ~TaskScheduler() = default;

  // Reset associated states to schedule at the beginning
  void Reset();

  // Return the name of schedule strategy
  virtual const char* Name() const = 0;

  // Select a task to tune
  virtual int NextTaskId() = 0;

 protected:
  // A taskScheduler object should be created with the static function Make
  TaskScheduler(const std::vector<TuneTask>& tasks, const Config& config);

  // The config for scheduling strategy
  Config config_;
  // The current task id to be estimated
  int cur_task_id_;
  // The pointer refers to all tasks (not owned)
  const std::vector<TuneTask>* tasks_;
};
}
// namespace auto_schedule
}
// namespace cinn
paddle/cinn/auto_schedule/task_scheduler/task_scheduler_test.cc
0 → 100644
View file @
992bec46
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/auto_schedule/task_scheduler/task_scheduler.h"
#include <gtest/gtest.h>
#include <type_traits>
#include "paddle/cinn/auto_schedule/task_scheduler/efficiency_priority.h"
#include "paddle/cinn/auto_schedule/task_scheduler/round_robin.h"
namespace
cinn
{
namespace
auto_schedule
{
// The factory returns the default (round_robin) scheduler when no strategy is
// given and the efficiency_priority scheduler when asked for it by name.
TEST(TaskScheduler, Make) {
  std::vector<TuneTask> tune_tasks(3);
  TaskScheduler::Config scheduler_config;

  auto default_scheduler = TaskScheduler::Make(tune_tasks, scheduler_config);
  ASSERT_STREQ(default_scheduler->Name(), "round_robin");

  auto priority_scheduler =
      TaskScheduler::Make(tune_tasks, scheduler_config, "efficiency_priority");
  ASSERT_STREQ(priority_scheduler->Name(), "efficiency_priority");
}
// Ids are handed out in order, and Reset() restarts the sequence from 0.
TEST(RoundRobinScheduler, NextTaskId) {
  std::vector<TuneTask> tune_tasks(3);
  TaskScheduler::Config scheduler_config;
  auto scheduler = TaskScheduler::Make(tune_tasks, scheduler_config);

  ASSERT_EQ(0, scheduler->NextTaskId());
  ASSERT_EQ(1, scheduler->NextTaskId());

  scheduler->Reset();
  ASSERT_EQ(0, scheduler->NextTaskId());
}
// With a negative minimum_gain_threshold no task is selected, so the first
// call already returns -1.
TEST(EfficiencyPriorityScheduler, NextTaskId) {
  std::vector<TuneTask> tune_tasks(3);
  TaskScheduler::Config scheduler_config;
  scheduler_config.minimum_gain_threshold = -1.0;

  auto scheduler =
      TaskScheduler::Make(tune_tasks, scheduler_config, "efficiency_priority");
  ASSERT_EQ(-1, scheduler->NextTaskId());
}
}
// namespace auto_schedule
}
// namespace cinn
paddle/cinn/auto_schedule/tests/CMakeLists.txt
0 → 100644
View file @
992bec46
# The performance comparison test is only added when CUDA is enabled and
# cuDNN is not.
if
(
WITH_CUDA
AND
(
NOT WITH_CUDNN
))
# The ResNet50 model directory under THIRD_PARTY_PATH is passed to the test
# binary through its --resnet50_model_dir flag.
cinn_cc_test
(
test_performance_comparison
ARGS
"--resnet50_model_dir=
${
THIRD_PARTY_PATH
}
/ResNet50"
SRCS
performance_comparison_test.cc
DEPS
cinncore
test_program_builder
)
endif
()
paddle/cinn/auto_schedule/tests/performance_comparison_test.cc
0 → 100644
View file @
992bec46
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <bitset>
#include <iostream>
#include "paddle/cinn/auto_schedule/auto_tuner.h"
#include "paddle/cinn/common/target.h"
#include "paddle/cinn/frontend/net_builder.h"
#include "paddle/cinn/frontend/optimize.h"
#include "paddle/cinn/frontend/paddle_model_convertor.h"
#include "paddle/cinn/frontend/syntax.h"
#include "paddle/cinn/hlir/framework/graph_compiler.h"
#include "paddle/cinn/hlir/framework/node.h"
#include "paddle/cinn/hlir/framework/pass.h"
#include "paddle/cinn/ir/ir_base.h"
#include "paddle/cinn/runtime/flags.h"
#include "paddle/cinn/utils/data_util.h"
#include "test/cpp/cinn/program_builder.h"
/* This test is a tool for evaluating and comparing the performance of three
 * schedules (no schedule, manual schedule, auto-schedule). Select which
 * schedules to evaluate with `FLAGS_evaluate_knobs`, and which operator or
 * model to run with `--gtest_filter=PerformanceTester.xx`. For example,
 * `FLAGS_evaluate_knobs=4 --gtest_filter=PerformanceTester.Matmul` evaluates
 * the auto-schedule on the Matmul operator. See the explanations of the flags
 * and parameters below for more detail.
 */
// Path of the Paddle ResNet50 model directory; defaults to "./ResNet50".
DEFINE_string
(
resnet50_model_dir
,
"./ResNet50"
,
"the path to paddle model resnet50."
);
// Bit mask that selects which schedule tests are run:
//   bit 0 (value 1 = "001"): no-schedule test
//   bit 1 (value 2 = "010"): manual-schedule test
//   bit 2 (value 4 = "100"): auto-schedule test
// The default value is -1, which means the flag is disabled and does not
// override the options.
DEFINE_int32
(
evaluate_knobs
,
-
1
,
"the options to control which schedule tests will be run."
);
// Declaration only; the flag is presumably defined in another translation
// unit of CINN - verify where.
DECLARE_double
(
cinn_infer_model_version
);
namespace
cinn
{
namespace
auto_schedule
{
using
::
cinn
::
hlir
::
framework
::
BuildScope
;
using
::
cinn
::
hlir
::
framework
::
Graph
;
using
::
cinn
::
hlir
::
framework
::
GraphCompiler
;
using
::
cinn
::
hlir
::
framework
::
Instruction
;
using
::
cinn
::
hlir
::
framework
::
Scope
;
// Test fixture that builds a runtime program from a frontend::Program with up
// to three strategies ("no schedule", "manual schedule", "auto schedule") and
// optionally executes it. The strategies are selected by
// Options::evaluate_knobs, which FLAGS_evaluate_knobs overrides whenever the
// flag is non-negative.
class PerformanceTester : public ::testing::Test {
 public:
  struct Options {
    // times of compiled runtime program will be executed repeatedly.
    int repeat_times = 2;
    // the num_tuning_rounds for auto tuning
    int num_tuning_rounds = 2;
    // knobs to control which schedules will be measured, refer to
    // FLAGS_evaluate_knobs explanation
    std::bitset<3> evaluate_knobs = 0UL;
  };

  // Builds `program` once per selected schedule and, for selected schedules,
  // calls ExecuteTest(options_.repeat_times) on the built runtime program.
  // When no knob is set, the no-schedule and manual-schedule programs are
  // built but not executed.
  void Evaluate(const frontend::Program& program) {
    // A negative flag value means "not set": keep the in-code options.
    if (FLAGS_evaluate_knobs >= 0) {
      options_.evaluate_knobs = FLAGS_evaluate_knobs;
    }
    VLOG(3) << "evaluate_knobs = " << options_.evaluate_knobs;

    // Creates a fresh graph/scope/compiler for one schedule, builds the
    // runtime program through `build_fn` and runs it when `execute` is true.
    auto worker_fn = [this, &program](const std::string& schedule_name,
                                      BuildRuntimeProgramFn build_fn,
                                      bool execute = true) {
      Context::Global().ResetNameId();
      VLOG(3) << "Initialize graph.";
      auto graph = std::make_shared<hlir::framework::Graph>(program, target_);
      VLOG(3) << "Apply graph pass.";
      hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
      VLOG(3) << "Build " << schedule_name << " program.";
      auto scope = BuildScope(target_, graph);
      auto graph_compiler =
          std::make_unique<GraphCompiler>(target_, scope, graph);
      auto runtime_program =
          (this->*build_fn)(graph.get(), graph_compiler.get());
      if (execute) {
        VLOG(3) << "Execute " << schedule_name << " program.";
        runtime_program->ExecuteTest(options_.repeat_times);
      }
    };

    // if no one is set, build no/manual schedule cases to ensure their build
    // functions are valid
    if (options_.evaluate_knobs.none()) {
      worker_fn("no schedule",
                &PerformanceTester::BuildNoScheduleProgram,
                /* execute */ false);
      worker_fn("manual schedule",
                &PerformanceTester::BuildManualScheduleProgram,
                /* execute */ false);
    } else {
      if (options_.evaluate_knobs.test(0)) {
        worker_fn("no schedule", &PerformanceTester::BuildNoScheduleProgram);
      }
      if (options_.evaluate_knobs.test(1)) {
        worker_fn("manual schedule",
                  &PerformanceTester::BuildManualScheduleProgram);
      }
      if (options_.evaluate_knobs.test(2)) {
        worker_fn("auto schedule",
                  &PerformanceTester::BuildAutoScheduleProgram);
      }
    }
  }

 protected:
  // Pointer-to-member type shared by the three Build*Program functions.
  using BuildRuntimeProgramFn = std::unique_ptr<hlir::framework::Program> (
      PerformanceTester::*)(Graph*, GraphCompiler*);

  // Lowers every fusion group with both op and group schedules turned off and
  // compiles the resulting functions into a runtime program.
  std::unique_ptr<hlir::framework::Program> BuildNoScheduleProgram(
      Graph* graph, GraphCompiler* graph_compiler) {
    const auto& dtype_dict =
        graph->GetAttrs<absl::flat_hash_map<std::string, common::Type>>(
            "inferdtype");
    const auto& shape_dict = graph->GetAttrs<
        absl::flat_hash_map<std::string, hlir::framework::shape_t>>(
        "infershape");

    // The lowerer is only used inside this function, so it is uniquely owned.
    auto op_lowerer = std::make_unique<hlir::framework::OpLowerer>(
        dtype_dict, shape_dict, target_);

    GraphCompiler::CompileOptions compile_options;
    compile_options.with_instantiate_variables = true;

    // Without fusion groups there is nothing to lower, so create the
    // non-fused groups first.
    if (graph->fusion_groups.empty()) {
      hlir::framework::ApplyPasses(graph, {"BuildNonFusedGroupsPass"});
    }
    compile_options.groups = graph->fusion_groups;

    for (auto group : graph->fusion_groups) {
      compile_options.lowered_funcs.push_back(
          op_lowerer->Lower(group,
                            /*apply_op_schedule = */ false,
                            /*apply_group_schedule=*/false));
    }

    VLOG(3) << "===========================No Schedule LoweredFunc "
               "Begin===========================";
    for (const auto& funcvec : compile_options.lowered_funcs) {
      for (const auto& func : funcvec) {
        VLOG(3) << func;
      }
    }
    VLOG(3) << "===========================No Schedule LoweredFunc "
               "End=============================";

    return graph_compiler->Build(compile_options).runtime_program;
  }

  // Builds the program through GraphCompiler::Build() with no explicit
  // compile options.
  std::unique_ptr<hlir::framework::Program> BuildManualScheduleProgram(
      Graph* graph, GraphCompiler* graph_compiler) {
    return graph_compiler->Build();
  }

  // Tunes the graph with AutoTuner, applies the tuning result to the compile
  // options and builds the runtime program from them.
  std::unique_ptr<hlir::framework::Program> BuildAutoScheduleProgram(
      Graph* graph, GraphCompiler* graph_compiler) {
    auto tuner = std::make_unique<AutoTuner>(target_, graph);

    AutoTuner::Config tuning_config;
    TuningOptions tuning_options;
    tuning_options.num_tuning_rounds = options_.num_tuning_rounds;
    tuning_options.num_measure_trials = 2;
    tuning_options.num_samples_per_iteration = 2;

    tuner->Initialize(tuning_config, graph_compiler);
    TuningResult tuning_result = tuner->Tune(tuning_options);

    GraphCompiler::CompileOptions compile_options;
    compile_options.with_instantiate_variables = true;
    compile_options.Apply(tuning_result);

    VLOG(3) << "===========================Auto Schedule LoweredFunc "
               "Begin===========================";
    for (const auto& funcvec : compile_options.lowered_funcs) {
      for (const auto& func : funcvec) {
        VLOG(3) << func;
      }
    }
    VLOG(3) << "===========================Auto Schedule LoweredFunc "
               "End=============================";

    return graph_compiler->Build(compile_options).runtime_program;
  }

  // Target every graph, scope and compiler in this fixture is created for.
#ifdef CINN_WITH_CUDA
  Target target_ = common::DefaultNVGPUTarget();
#else
  Target target_ = common::DefaultHostTarget();
#endif
  Options options_;
};
// Leading (batch) dimension shared by several operator and model cases below.
constexpr int batch_size{2};
// Evaluates a single "mul" op with inputs X:{32, 16} and Y:{16, 32}.
TEST_F(PerformanceTester, Mul) {
  tests::OpBuilder builder("mul");
  Evaluate(builder.Build({{"X", {32, 16}}, {"Y", {16, 32}}}));
}
// Evaluates a single "elementwise_add" op on two {1, 56, 56, 256} inputs.
TEST_F(PerformanceTester, Add) {
  tests::OpBuilder builder("elementwise_add");
  Evaluate(builder.Build({{"X", {1, 56, 56, 256}}, {"Y", {1, 56, 56, 256}}}));
}
// Evaluates a single "matmul" op with X:{batch_size, 2048}, Y:{2048, 1000}.
TEST_F(PerformanceTester, Matmul) {
  tests::OpBuilder builder("matmul");
  Evaluate(builder.Build({{"X", {batch_size, 2048}}, {"Y", {2048, 1000}}}));
}
// Evaluates a single "relu" op on an input of {batch_size, 64, 56, 56}.
TEST_F(PerformanceTester, Relu) {
  tests::OpBuilder builder("relu");
  Evaluate(builder.Build({{"X", {batch_size, 64, 56, 56}}}));
}
// Evaluates a single "conv2d" op: X:{batch_size, 3, 224, 224} with weights
// W:{64, 3, 7, 7}. The attribute variables keep their explicit types because
// those types are what is stored in the attribute map.
TEST_F(PerformanceTester, Conv2d) {
  std::vector<int> strides = {2, 2};
  std::vector<int> paddings = {3, 3};
  std::vector<int> dilations = {1, 1};
  int groups = 1;
  std::string conv_type("forward");
  std::string data_format("NCHW");
  std::string padding_algorithm("EXPLICIT");

  tests::OpBuilder builder("conv2d");
  Evaluate(builder.Build(
      {{"X", {batch_size, 3, 224, 224}}, {"W", {64, 3, 7, 7}}},
      {{"stride", strides},
       {"padding", paddings},
       {"dilation", dilations},
       {"groups", groups},
       {"conv_type", conv_type},
       {"data_format", data_format},
       {"padding_algorithm", padding_algorithm}}));
}
// Evaluates a single "pool2d" op (max pooling) on an input of
// {batch_size, 64, 112, 112}. The attribute variables keep their explicit
// types because those types are what is stored in the attribute map.
TEST_F(PerformanceTester, Pool2d) {
  std::string pooling_type = "max";
  std::vector<int> ksize{3, 3};
  std::vector<int> strides{2, 2};
  std::vector<int> paddings{1, 1, 1, 1};
  bool ceil_mode = false;
  bool exclusive = true;
  bool global_pooling = false;
  std::string data_format = "NCHW";
  bool adaptive = false;
  std::string padding_algorithm = "EXPLICIT";

  Evaluate(tests::OpBuilder("pool2d").Build(
      {{"X", {batch_size, 64, 112, 112}}},
      {{"pool_type", pooling_type},
       {"kernel_size", ksize},
       {"stride_size", strides},
       {"padding_size", paddings},
       {"ceil_mode", ceil_mode},
       {"exclusive", exclusive},
       {"global_pooling", global_pooling},
       {"data_format", data_format},
       {"adaptive", adaptive},
       {"padding_algorithm", padding_algorithm}}));
}
// Evaluates a single "batch_norm" op on an input of
// {batch_size, 64, 112, 112} with per-channel scale/bias/mean/variance of
// size 64. The attribute variables keep their explicit types because those
// types are what is stored in the attribute map.
TEST_F(PerformanceTester, BatchNorm) {
  float epsilon = 1e-5f;
  float momentum = 0.9f;
  std::string data_layout = "NCHW";

  Evaluate(tests::OpBuilder("batch_norm")
               .Build({{"X", {batch_size, 64, 112, 112}},
                       {"scale", {64}},
                       {"bias", {64}},
                       {"mean", {64}},
                       {"variance", {64}}},
                      {{"epsilon", epsilon},
                       {"momentum", momentum},
                       {"data_layout", data_layout}}));
}
// Evaluates a single "reshape" op that turns {batch_size, 2048, 1, 1} into
// {batch_size, 2048}.
TEST_F(PerformanceTester, Reshape) {
  std::vector<int32_t> output_shape = {batch_size, 2048};

  tests::OpBuilder builder("reshape");
  Evaluate(builder.Build({{"X", {batch_size, 2048, 1, 1}}},
                         {{"shape", output_shape}}));
}
// Evaluates a single "softmax" op over the last axis of a
// {batch_size, 1000} input.
TEST_F(PerformanceTester, Softmax) {
  std::vector<int> axes{-1};
  std::string mode("fast");
  std::string data_format("AnyLayout");

  tests::OpBuilder builder("softmax");
  Evaluate(builder.Build(
      {{"X", {batch_size, 1000}}},
      {{"axes", axes}, {"mode", mode}, {"data_format", data_format}}));
}
// Evaluates a single "scale" op on a {batch_size, 1000} input. The attribute
// variables keep their explicit types because those types are what is stored
// in the attribute map.
TEST_F(PerformanceTester, Scale) {
  float scale = 1.0f;
  float bias = 0.0f;
  bool bias_after_scale = true;

  Evaluate(tests::OpBuilder("scale").Build(
      {{"X", {batch_size, 1000}}},
      {{"scale", scale},
       {"bias", bias},
       {"bias_after_scale", bias_after_scale}}));
}
// Evaluates a single "lookup_table" op with table:{50001, 768} and 64-bit
// integer ids:{10, 128, 1}.
TEST_F(PerformanceTester, LookupTable) {
  int64_t padding_idx{-1};

  tests::OpBuilder builder("lookup_table");
  Evaluate(builder.Build(
      {{"table", {50001, 768}}, {"ids", {10, 128, 1}, common::Int(64)}},
      {{"padding_idx", padding_idx}}));
}
// Evaluates a single "gather" op along axis 3 with operand:{10, 12, 128, 512}
// and 32-bit integer index:{1, 1, 1, 128}.
TEST_F(PerformanceTester, Gather) {
  int axis{3};

  tests::OpBuilder builder("gather");
  Evaluate(builder.Build({{"operand", {10, 12, 128, 512}},
                          {"index", {1, 1, 1, 128}, common::Int(32)}},
                         {{"axis", axis}}));
}
// paddle model test
// Loads the paddle ResNet50 model from FLAGS_resnet50_model_dir and evaluates
// it with an input of {batch_size, 3, 224, 224}.
TEST_F(PerformanceTester, ResNet50) {
  CHECK_NE(FLAGS_resnet50_model_dir, "");
  FLAGS_cinn_infer_model_version = 1.0;
  std::unordered_map<std::string, std::vector<int64_t>> feeds = {
      {"inputs", {batch_size, 3, 224, 224}}};
  // Convert the model for the same target the fixture compiles for, so that
  // builds without CINN_WITH_CUDA do not request an NVGPU target here.
  Evaluate(cinn::frontend::PaddleModelConvertor(target_).LoadModel(
      FLAGS_resnet50_model_dir, true, feeds));
}
}
// namespace auto_schedule
}
// namespace cinn
paddle/cinn/auto_schedule/tuning.h
0 → 100644
View file @
992bec46
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <vector>
#include "paddle/cinn/hlir/framework/graph.h"
#include "paddle/cinn/hlir/framework/node.h"
#include "paddle/cinn/ir/lowered_func.h"
namespace
cinn
{
namespace
auto_schedule
{
// FunctionGroup: a vector of ir::LoweredFunc.
using FunctionGroup = std::vector<ir::LoweredFunc>;

// SubGraphPtr: a shared pointer to a single hlir::framework::Graph::Group.
using SubGraphPtr = std::shared_ptr<hlir::framework::Graph::Group>;
// Options for tuning process
struct TuningOptions {
  // The number of tuning rounds. Each round tunes several tasks, and each
  // task involves TuningOptions.num_measure_trials measurements.
  int num_tuning_rounds = 1;

  // The number of measurement trials in a task. If it is 0, the tuner
  // returns the best candidate schedule config without measurement.
  int num_measure_trials = 10;

  // Every round TaskSchedule chooses some TuneTask(s) to optimize and runs
  // several iterations of the search algorithm for a task to generate
  // samples. Each iteration has num_samples_per_iteration samples.
  //
  // 1. If TuningOptions.num_measure_trials is 0, the autotune doesn't involve
  //    hardware measurements. It predicts performance by cost model.
  //
  // 2. num_measure_trials % num_samples_per_iteration must equal 0.
  //    In each round, autotune will run iterations until number of iterations
  //    * num_samples_per_iteration equals num_measure_trials.
  int num_samples_per_iteration = 10;

  //////////////////////////////////////
  // Evolutionary Search Related Options
  //////////////////////////////////////

  // The number of picks from the stored database in each iteration.
  // These are the best performances recorded from previous generations.
  //
  // Note that this many picks are not guaranteed when the database doesn't
  // have enough data. Evolutionary Search gets as many as possible without
  // throwing errors or warnings.
  int evolution_pick_database_topk = 8;

  // The number of initial populations at each generation. It contains
  // the picks from the database plus randomly generated samples.
  int evolution_init_population_num = 10;

  // The number of samples generated by cross over
  int evolution_cross_over_num = 0;

  // The fraction of random samples in num_samples_per_iteration.
  // So the num_samples_per_iteration would have (1 - eps_greedy) best
  // samples from evolutionary search and eps_greedy random samples.
  //
  // It explores the cases evolutionary search won't predict precisely
  float evolution_eps_greedy = 0.1f;
};
// Output of a tuning run: one list per kind of tuning.
struct TuningResult {
  // Sub-graphs produced by graph tuning.
  std::vector<SubGraphPtr> subgraphs;

  // Function groups produced by schedule tuning.
  std::vector<FunctionGroup> function_groups;
};
}
// namespace auto_schedule
}
// namespace cinn
paddle/cinn/backends/CMakeLists.txt
0 → 100644
View file @
992bec46
core_gather_headers()

# Backend sources that are always part of cinnapi_src.
gather_srcs(
  cinnapi_src
  SRCS
  outputs.cc
  codegen_c.cc
  codegen_c_x86.cc
  codegen_cuda_host.cc
  extern_func_emitter.cc
  extern_func_emitter_builtin.cc
  function_prototype.cc
  extern_func_protos.cc
  extern_func_jit_register.cc
  modular.cc
  compiler.cc)

# CUDA-only sources are collected in `srcs`; the foreach near the end of this
# file appends them to cinnapi_src.
if(WITH_CUDA)
  add_subdirectory(nvrtc)
  list(APPEND srcs cuda_util.cc codegen_cuda_dev.cc codegen_cuda_util.cc)
endif()

if(WITH_OPENMP)
  cinn_cc_library(__x86_source_fake_lib SRCS _x86_builtin_source.cc)
endif()

add_subdirectory(llvm)

if(WITH_CUDA)
  cinn_nv_test(test_raw_cuda_code SRCS raw_cuda_code_test.cu DEPS cinncore)
endif()

# C code generation tests.
cinn_cc_test(test_codegen_c SRCS codegen_c_test.cc DEPS cinncore ARGS
             ${global_test_args})
cinn_cc_test(test_codegen_c_x86 SRCS codegen_c_x86_test.cc DEPS cinncore ARGS
             ${global_test_args})

cinn_cc_test(test_generated1 SRCS generated_module1.cc DEPS cinn_runtime)
add_run_test_dependency(test_generated1 test_codegen_c)

cinn_cc_test(test_ir_schedule SRCS ir_schedule_test.cc DEPS cinncore)

include_directories(${CMAKE_SOURCE_DIR}/paddle/cinn/runtime)

# Only add the build dependency when the test_generated1 target exists.
if(TARGET test_generated1)
  add_dependencies(test_generated1 test_codegen_c)
endif()

if(WITH_CUDA)
  cinn_nv_test(test_codegen_cuda_generate SRCS codegen_cuda_generate_test.cc
               DEPS cinncore)

  cinn_nv_test(test_codegen_debug SRCS codegen_debug_test.cc DEPS cinncore)

  if(WITH_TESTING)
    if(CINN_ONLY)
      cinn_nv_test(generated1_cuda SRCS generated1.cu DEPS cinncore)
    else()
      nv_test(
        generated1_cuda
        SRCS generated1.cu
        DEPS cinncore)
    endif()
    add_run_test_dependency(generated1_cuda test_codegen_cuda_generate)
  endif()

  cinn_nv_test(test_compiler SRCS compiler_test.cc DEPS cinncore)
else()
  cinn_cc_test(test_compiler SRCS compiler_test.cc DEPS cinncore)
endif()

# Append each entry of `srcs`, prefixed with this directory's path, to the
# cached cinnapi_src list.
foreach(cpp ${srcs})
  set(cinnapi_src
      "${cinnapi_src};paddle/cinn/backends/${cpp}"
      CACHE INTERNAL "")
endforeach()

# Append the headers matched by *.h (paths relative to CMAKE_SOURCE_DIR) to the
# cached core_includes list.
file(
  GLOB includes
  LIST_DIRECTORIES false
  RELATIVE ${CMAKE_SOURCE_DIR}
  *.h)

foreach(header ${includes})
  set(core_includes
      "${core_includes};${header}"
      CACHE INTERNAL "")
endforeach()
Prev
1
…
7
8
9
10
11
12
13
14
15
…
18
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment