Commit 992bec46 authored by “yuguo”'s avatar “yuguo”
Browse files

2.5

parent 0259837d
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/auto_schedule/task/task_creator.h"
#include <gtest/gtest.h>
#include <memory>
#include <vector>
#include "paddle/cinn/common/target.h"
#include "paddle/cinn/frontend/net_builder.h"
#include "paddle/cinn/frontend/syntax.h"
#include "paddle/cinn/hlir/framework/graph.h"
#include "paddle/cinn/hlir/framework/graph_compiler.h"
#include "paddle/cinn/hlir/framework/node.h"
namespace cinn {
namespace auto_schedule {
using ::cinn::frontend::NetBuilder;
using ::cinn::frontend::Program;
using ::cinn::hlir::framework::Graph;
using ::cinn::hlir::framework::Node;
// Builds a small frontend program over two Float(32) inputs "A" and "B" of
// shape {32, 24}. The program contains two chained Add instructions:
// A + B, followed by A + (A + B).
Program CreateAddProgram() {
  constexpr int kRows = 32;
  constexpr int kCols = 24;

  NetBuilder net("net_builder");
  auto lhs = net.CreateInput(Float(32), {kRows, kCols}, "A");
  auto rhs = net.CreateInput(Float(32), {kRows, kCols}, "B");
  auto sum = net.Add(lhs, rhs);
  // The second Add is only emitted so that the program holds two
  // instructions; its resulting variable is not needed here.
  net.Add(lhs, sum);
  return net.Build();
}
// Verifies that op-level task creation on the two-Add program produces two
// tasks, each wrapping a single "elementwise_add" node.
TEST(TaskCreator, Basic) {
#ifdef CINN_WITH_CUDA
  Target target = common::DefaultNVGPUTarget();
#else
  Target target = common::DefaultHostTarget();
#endif
  Program program = CreateAddProgram();
  auto graph = std::make_shared<Graph>(program, target);

  TaskCreator creator;
  std::vector<TuneTask> tasks = creator.CreateTuneTaskOpLevel(graph.get());
  ASSERT_EQ(tasks.size(), 2UL);

  for (const TuneTask& task : tasks) {
    const std::shared_ptr<Graph::Group>& group = task.subgraph;
    ASSERT_EQ(group->CollectNodes().size(), 1UL);
    ASSERT_EQ(group->nodes[0]->op()->name, "elementwise_add");
  }
}
} // namespace auto_schedule
} // namespace cinn
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/auto_schedule/task/task_optimizer.h"
#include <glog/logging.h>
#include <functional>
#include <limits>
#include "paddle/cinn/auto_schedule/analysis/analyze_ir.h"
#include "paddle/cinn/auto_schedule/cost_model/expr_cost_model.h"
#include "paddle/cinn/auto_schedule/measure/measure.h"
#include "paddle/cinn/auto_schedule/search_strategy/evolutionary_search.h"
#include "paddle/cinn/common/target.h"
#include "paddle/cinn/hlir/framework/op_lowering.h"
#include "paddle/cinn/hlir/op/external_api_registry.h"
#include "paddle/cinn/ir/buffer.h"
#include "paddle/cinn/ir/ir.h"
#include "paddle/cinn/ir/ir_base.h"
#include "paddle/cinn/ir/schedule/ir_schedule.h"
#include "paddle/cinn/ir/utils/ir_copy.h"
#include "paddle/cinn/optim/transform_gpu_forloop.h"
#include "paddle/cinn/runtime/flags.h"
#include "paddle/cinn/utils/string.h"
#ifdef CINN_WITH_CUDA
#include <cuda_runtime_api.h>
#include "paddle/cinn/backends/cuda_util.h"
#endif
DECLARE_bool(auto_schedule_use_cost_model);
namespace cinn {
namespace auto_schedule {
using cinn::hlir::op::ExternalApiRegistry;
// Forward declarations of auxiliary functions that are used in this file
// only.
//
// Declared to update a scheduled function with several post-processors.
// NOTE(review): neither a definition nor a call of FuncWithUpdatedBody is
// visible in this part of the file (SearchOneRound calls
// UpdateFuncWithNewBody instead) -- confirm whether this declaration is
// stale.
ir::LoweredFunc FuncWithUpdatedBody(const common::Target& target,
const ir::LoweredFunc& old_func,
ir::Expr& body); // NOLINT
// Check whether a scheduled lowered function is valid for `target`; returns
// true when the function is invalid and has to be pruned.
bool PruneInvalid(const ir::LoweredFunc& lowered_func,
const common::Target& target);
// Tell whether the task belongs to the special tasks excluded from tuning.
bool IsForbiddenToTune(const TuneTask* task);
// Tell whether the task has been wrapped by custom_call in
// TransToCustomCallPass.
bool IsWrappedByCustomCall(const TuneTask* task);
// Tell whether the task has a registered external api.
bool HasExternalApi(const TuneTask* task);
// Creates an optimizer for `task`. The measurer and the database are stored
// as plain pointers, and the random seed is normalized through
// LinearRandomEngine::NormalizeState before being stored.
//
// The initializers are listed in the declaration order of the members
// (task_, schedule_measurer_, cost_model_, database_, rand_seed_), which is
// the order in which they are actually initialized; this avoids -Wreorder
// diagnostics.
TaskOptimizer::TaskOptimizer(TuneTask* task,
                             ScheduleMeasurer* schedule_measurer,
                             Database* database,
                             utils::LinearRandomEngine::StateType rand_seed)
    : task_(task),
      schedule_measurer_(schedule_measurer),
      cost_model_(),
      database_(database),
      rand_seed_(utils::LinearRandomEngine::NormalizeState(rand_seed)) {}
// Produces the lowered functions to be used for the task.
//
// Tasks that are forbidden to tune or already wrapped by custom_call are
// lowered directly. Otherwise the evolutionary search, the manual schedule
// and (when an external api is registered) the external api each contribute
// one candidate, and the functions of the candidate with the lowest cost are
// returned. When several candidates share the lowest cost, the one that was
// added first is chosen.
FunctionGroup TaskOptimizer::Optimize(const TuningOptions& options) {
  CHECK(task_->subgraph != nullptr) << "subgraph can't be empty";
  // task with forbidden or custom_call ops can't be tuned
  if (IsForbiddenToTune(task_) || IsWrappedByCustomCall(task_)) {
    return task_->op_lowerer->Lower(task_->subgraph);
  }
  // TODO(CtfGo): the input/output names of a Graph::Group will be changed in
  // Lowering by OpLowerer currently, so we should revert them after following
  // different lower methods, remove this hard code by fixing the decoupling
  // between lowering and BuildInstructions
  const auto initial_input_names = task_->subgraph->input_names;
  const auto initial_output_names = task_->subgraph->output_names;

  const bool need_measured = options.num_measure_trials > 0;
  std::vector<TaskOptimizer::Result> candidates;
  candidates.emplace_back(OptimizeByEvolution(options));
  candidates.emplace_back(OptimizeByManual(need_measured));
  if (HasExternalApi(task_)) {
    candidates.emplace_back(OptimizeByExternal(need_measured));
  }

  // Only the cheapest candidate is needed, so a single linear scan replaces
  // sorting the whole vector. `candidates` always holds at least two entries
  // here, so the returned iterator is dereferenceable.
  const auto best = std::min_element(
      candidates.begin(),
      candidates.end(),
      [](const auto& lhs, const auto& rhs) { return lhs.cost < rhs.cost; });
  VLOG(4) << "Total candidates=" << candidates.size()
          << ", the best from=" << best->from << ", cost=" << best->cost;

  // revert input/output names
  task_->subgraph->input_names = initial_input_names;
  task_->subgraph->output_names = initial_output_names;
  return best->functions;
}
// Lowers the subgraph through the op lowerer (the "Manual" candidate).
//
// The default cost is 0.0. When `need_measured` is true and the database has
// no record under the manual key yet, the lowered functions are measured once
// and the result is stored. If a record exists under that key, its
// execution cost replaces the default cost.
TaskOptimizer::Result TaskOptimizer::OptimizeByManual(bool need_measured) {
  // A string literal may only bind to a pointer-to-const, hence `const char*`.
  static constexpr const char* kManualMeasuredKeyPrefix = "@ManualMeasured:\n";
  TaskOptimizer::Result result("Manual");
  result.functions = task_->op_lowerer->Lower(task_->subgraph);
  // pack functions body
  std::vector<ir::Expr> func_bodys;
  for (const ir::LoweredFunc& func : result.functions) {
    func_bodys.push_back(func->body);
  }
  SearchState state(ir::IRSchedule(ir::ModuleExpr(std::move(func_bodys))));
  // the manual is regarded as the second best in default, so we set its cost
  // 0.0
  result.cost = 0.0;

  // add the specific prefix in front of serialized_key to be store/load
  // measured record for manual schedule
  std::string measured_key = kManualMeasuredKeyPrefix + task_->serialized_key;
  if (need_measured && database_->Count(measured_key) == 0) {
    std::vector<MeasureInput> inputs(1);
    inputs.back().task = task_;
    inputs.back().lowered_funcs = result.functions;
    VLOG(4) << "Measure manual schedule";
    std::vector<MeasureResult> measure_outputs =
        schedule_measurer_->Measure(inputs);
    // fail loudly instead of indexing into an empty result
    CHECK(!measure_outputs.empty())
        << "ScheduleMeasurer returned no result for the manual schedule";
    database_->AddRecord(
        TuningRecord(measured_key, state, measure_outputs[0].execution_cost));
  }

  auto measured_records = database_->LookUp(measured_key);
  if (!measured_records.empty()) {  // update result.cost by measured if exists
    result.cost = measured_records[0].execution_cost;
  }
  return result;
}
// Lowers the subgraph as a custom_call to a registered external api (the
// "External" candidate).
//
// The first node of the subgraph is rewritten in place: its original op name
// is saved in the "original_op" attribute and its op is replaced by
// "custom_call". The default cost is -1.0. When `need_measured` is true and
// the database has no record under the external key yet, the lowered
// functions are measured once and the result is stored. If a record exists
// under that key, its execution cost replaces the default cost.
//
// NOTE(review): the node rewrite is not reverted in this function -- confirm
// that callers expect the node to stay a custom_call even when another
// candidate is selected.
TaskOptimizer::Result TaskOptimizer::OptimizeByExternal(bool need_measured) {
  // A string literal may only bind to a pointer-to-const, hence `const char*`.
  static constexpr const char* kExternalMeasuredKeyPrefix =
      "@ExternalMeasured:\n";
  TaskOptimizer::Result result("External");
  auto nodes = task_->subgraph->CollectNodes();
  CHECK(!nodes.empty()) << "subgraph of an external task can't be empty";
  auto* first_node = nodes.front();

  // set the necessary field for lowering with external api
  std::string original_op = first_node->op()->name;
  first_node->attrs.attr_store["original_op"] = original_op;
  first_node->attrs.op = hlir::framework::Operator::Get("custom_call");
  result.functions = task_->op_lowerer->Lower(task_->subgraph);

  // the external is regarded as the best in default, so we set its cost -1.0
  result.cost = -1.0;

  // add the specific prefix in front of serialized_key to be store/load
  // measured record for external api
  std::string measured_key = kExternalMeasuredKeyPrefix + task_->serialized_key;
  if (need_measured && database_->Count(measured_key) == 0) {
    std::vector<MeasureInput> inputs(1);
    inputs.back().task = task_;
    inputs.back().lowered_funcs = result.functions;
    VLOG(4) << "Measure external api";
    std::vector<MeasureResult> measure_outputs =
        schedule_measurer_->Measure(inputs);
    // fail loudly instead of indexing into an empty result
    CHECK(!measure_outputs.empty())
        << "ScheduleMeasurer returned no result for the external api";
    // the SearchState of external is invalid and will not be used, so we just
    // put a temporary one
    database_->AddRecord(TuningRecord(measured_key,
                                      SearchState(ir::IRSchedule()),
                                      measure_outputs[0].execution_cost));
  }

  auto measured_records = database_->LookUp(measured_key);
  if (!measured_records.empty()) {  // update result.cost by measured if exists
    result.cost = measured_records[0].execution_cost;
  }
  return result;
}
// Returns true when the task consists of exactly one node whose op is listed
// as changing its linked edges; such tasks are skipped by auto-schedule.
// Groups with zero or several nodes are never forbidden.
bool IsForbiddenToTune(const TuneTask* task) {
  // TODO(CtfGo): some operators may change its linked edges in
  // TransToCustomCallPass, like conv2d, we will skip these ops in auto-schedule
  // because they can't revert original links for no schedule and manual
  // schedule lowering.
  static const std::unordered_set<std::string> links_changed_ops = {"conv2d"};

  const auto nodes = task->subgraph->CollectNodes();
  // check the size first: front() must not be touched on an empty group
  if (nodes.size() != 1) {
    return false;
  }
  const auto& op_name = nodes.front()->op()->name;
  if (links_changed_ops.count(op_name) == 0) {
    return false;
  }
  VLOG(5) << "Op:" << op_name << " is forbidden to call external_api";
  return true;
}
// Returns true when the task consists of exactly one node and the global
// ExternalApiRegistry has an entry for that node's op on the task's target.
bool HasExternalApi(const TuneTask* task) {
  const auto nodes = task->subgraph->CollectNodes();
  // check the size first: front() must not be touched on an empty group
  if (nodes.size() != 1) {
    return false;
  }
  return ExternalApiRegistry::Global()->Has(nodes.front()->op()->name,
                                            task->target);
}
// Returns true when the task consists of exactly one node whose op is
// "custom_call". Such a node is required to carry the name of the op it
// replaced in its "original_op" attribute.
bool IsWrappedByCustomCall(const TuneTask* task) {
  const auto nodes = task->subgraph->CollectNodes();
  // check the size first: front() must not be touched on an empty group
  if (nodes.size() != 1) {
    return false;
  }
  auto* first_node = nodes.front();
  if (first_node->op()->name != "custom_call") {
    return false;
  }
  CHECK(first_node->attrs.attr_store.count("original_op"))
      << "a custom_call op must store its original op name";
  std::string op_name =
      absl::get<std::string>(first_node->attrs.attr_store.at("original_op"));
  VLOG(5) << "Op:" << op_name << " was wrapped as custom_call";
  return true;
}
// Searches a schedule for the task with EvolutionarySearch and returns the
// "Evolution" candidate.
//
// The result starts as a copy of the task's initial lowered functions with
// the default cost set by the Result constructor.
// - options.num_measure_trials == 0: a single search round is run without
//   measuring; the first valid candidate (if any) is returned, and its cost
//   is predicted by the cost model only when
//   FLAGS_auto_schedule_use_cost_model is set.
// - otherwise: search rounds are repeated until num_measure_trials candidates
//   have been measured, or until more than kMaxRetryContinuousEmpty_
//   consecutive rounds returned no valid candidate. Every measured candidate
//   is recorded in the database, optionally fed to the cost model, and the
//   one with the lowest measured execution cost is kept.
//
// options.num_measure_trials must be a multiple of
// options.num_samples_per_iteration.
// NOTE(review): the CHECK below evaluates a modulo by
// num_samples_per_iteration -- confirm that it can never be 0.
TaskOptimizer::Result TaskOptimizer::OptimizeByEvolution(
const TuningOptions& options) {
CHECK_EQ(options.num_measure_trials % options.num_samples_per_iteration, 0)
<< "TuningOptions.num_measure_trials % "
"TuningOptions.num_samples_per_iteration must be 0.";
VLOG(4) << "Optimizing TuneTask with num_measure_trials:"
<< options.num_measure_trials
<< ", LoweredFunc before optimization is:";
VLOG(4) << "lowered function size = " << task_->lowered_funcs.size();
for (size_t i = 0; i < task_->lowered_funcs.size(); ++i) {
VLOG(4) << "lowered_funcs[" << i << "] detail:\n"
<< task_->lowered_funcs[i];
}
// the searcher is created lazily on the first call and reused afterwards
if (evolutionary_search_ == nullptr) {
// TODO(zhhsplendid): check whether the options is same as previous,
// if not, we should create new EvolutionarySearch
evolutionary_search_ = std::make_unique<EvolutionarySearch>(
*task_, cost_model_, database_, utils::ForkRandomState(&rand_seed_));
}
TaskOptimizer::Result result("Evolution");
// aliases into `result`, so updating them updates the returned value
auto& optimized_funcs = result.functions;
auto& best_cost = result.cost;
// use initial lowered function as default result
optimized_funcs = optim::IRCopy(task_->lowered_funcs);
if (options.num_measure_trials ==
0) { // no need to measure and simply return the best searched
std::vector<MeasureInput> measure_candidates;
std::vector<SearchState> states =
SearchOneRound(options, &measure_candidates);
if (!states.empty()) {
if (FLAGS_auto_schedule_use_cost_model) {
best_cost = cost_model_.Predict(states.front()->ir_schedule.GetModule(),
task_->target);
}
// SearchOneRound checks that states and measure_candidates have the same
// size, so index 0 is valid here
optimized_funcs = measure_candidates[0].lowered_funcs;
} else {
LOG(WARNING) << "No valid candidate searched, will return initial state";
}
return result;
}
int measured_count = 0;
// number of consecutive search rounds that produced no valid candidate
uint32_t continuous_empty_cnt = 0;
while (measured_count < options.num_measure_trials) {
VLOG(4) << "Launch a new search, current measured_count:" << measured_count;
std::vector<MeasureInput> measure_inputs;
std::vector<SearchState> states = SearchOneRound(options, &measure_inputs);
if (states.empty()) { // no new valid candidate achieved
++continuous_empty_cnt;
if (continuous_empty_cnt <= kMaxRetryContinuousEmpty_) {
VLOG(4) << "No valid state searched, continuous_empty_cnt="
<< continuous_empty_cnt;
continue;
} else {
LOG(WARNING) << "OptimizeByEvolution will be exited in advance due to "
"continuous invalid search, final measured_count="
<< measured_count;
break;
}
}
continuous_empty_cnt = 0; // reset if get valid candidates
VLOG(4) << "ScheduleMeasurer start with input size="
<< measure_inputs.size();
std::vector<MeasureResult> measure_outputs =
schedule_measurer_->Measure(measure_inputs);
CHECK_EQ(measure_outputs.size(), states.size())
<< "ScheduleMeasurer didn't output same number of MeasureOutput of "
"states in TaskOptimizer";
// record to database
for (size_t i = 0; i < states.size(); ++i) {
database_->AddRecord(TuningRecord(measure_inputs[i].task->serialized_key,
states[i],
measure_outputs[i].execution_cost));
}
// update cost model
if (FLAGS_auto_schedule_use_cost_model) {
std::vector<const ir::ModuleExpr*> cost_model_samples(states.size());
std::vector<float> cost_model_labels(states.size());
for (size_t i = 0; i < states.size(); ++i) {
cost_model_samples[i] = &(states[i]->ir_schedule.GetModule());
cost_model_labels[i] = measure_outputs[i].execution_cost;
}
VLOG(4) << utils::StringFormat(
"Update CostModel with samples size=%lu,labels size=%lu",
cost_model_samples.size(),
cost_model_labels.size());
cost_model_.Update(cost_model_samples, cost_model_labels, task_->target);
}
// update the best
for (size_t i = 0; i < measure_outputs.size(); ++i) {
if (measure_outputs[i].execution_cost < best_cost) {
VLOG(4) << "Update best candidate with execution_cost:"
<< measure_outputs[i].execution_cost << "us";
best_cost = measure_outputs[i].execution_cost;
optimized_funcs = measure_inputs[i].lowered_funcs;
}
}
// count result size
measured_count += states.size();
}
return result;
}
std::vector<SearchState> TaskOptimizer::SearchOneRound(
const TuningOptions& options,
std::vector<MeasureInput>* measure_candidates) {
std::vector<SearchState> states =
evolutionary_search_->SearchModuleExprEpsGreedy(options);
VLOG(4) << JoinStatesDebugString("TaskOptimizer::EvolutionarySearch-Result",
states,
/*verbose=*/VLOG_IS_ON(5));
size_t valid_cnt = 0;
for (size_t i = 0; i < states.size(); ++i) {
std::vector<ir::Expr> best_exprs =
states[i]->ir_schedule.GetModule().GetExprs();
CHECK_EQ(best_exprs.size(), task_->lowered_funcs.size())
<< "RuntimeError: Expr size is not equal to LoweredFunc size in "
"TaskOptimizer";
auto init_funcs = optim::IRCopy(task_->lowered_funcs);
std::vector<ir::LoweredFunc> valid_funcs;
for (size_t j = 0; j < best_exprs.size(); ++j) {
auto updated_f =
UpdateFuncWithNewBody(task_->target, init_funcs[j], best_exprs[j]);
if (PruneInvalid(updated_f, task_->target)) {
VLOG(4) << "PruneInvalid states-" << i;
break;
}
valid_funcs.emplace_back(updated_f);
}
// all functions are validated, collect this state to be measured
if (valid_funcs.size() == init_funcs.size()) {
states[valid_cnt++] = states[i];
measure_candidates->emplace_back(MeasureInput());
measure_candidates->back().task = task_;
measure_candidates->back().lowered_funcs = std::move(valid_funcs);
}
}
states.erase(states.begin() + valid_cnt, states.end());
CHECK_EQ(states.size(), measure_candidates->size())
<< "result size of states not equal to measure_candidates";
VLOG(4) << "EvolutionarySearch return size=" << states.size()
<< ", valid count=" << valid_cnt;
VLOG(4) << JoinStatesDebugString("TaskOptimizer::SearchOneRound-Result",
states,
/*verbose=*/VLOG_IS_ON(5));
return states;
}
// Detects the limit of available shared memory on the current NVGPU with the
// CUDA runtime.
//
// Returns the sharedMemPerBlock property of the current device, or 0 when
// built without CINN_WITH_CUDA.
size_t GetGPUSharedMemoryLimit() {
#ifdef CINN_WITH_CUDA
  int device_id = 0;
  CUDA_CALL(cudaGetDevice(&device_id));
  cudaDeviceProp prop;
  CUDA_CALL(cudaGetDeviceProperties(&prop, device_id));
  // sharedMemPerBlock is a size_t, so it has to be printed with %zu; passing
  // it for a %d conversion is undefined behavior in printf-style formatting
  VLOG(4) << utils::StringFormat(
      "GPU-%d GPUSharedMemoryLimit=%zu", device_id, prop.sharedMemPerBlock);
  return prop.sharedMemPerBlock;
#else
  return 0;
#endif
}
// Detects the limit of available local/stack memory on the current NVGPU
// with the CUDA runtime.
//
// The limit is calculated as
//   totalGlobalMem / multiProcessorCount / maxThreadsPerMultiProcessor
// from the properties of the current device. Returns 0 when built without
// CINN_WITH_CUDA.
size_t GetGPULocalStackLimit() {
#ifdef CINN_WITH_CUDA
  int device_id = 0;
  CUDA_CALL(cudaGetDevice(&device_id));
  cudaDeviceProp prop;
  CUDA_CALL(cudaGetDeviceProperties(&prop, device_id));
  size_t limit = prop.totalGlobalMem / prop.multiProcessorCount /
                 prop.maxThreadsPerMultiProcessor;
  // the arguments follow the order of the labels in the format string:
  // maxThreadsPerMultiProcessor first, then multiProcessorCount
  VLOG(4) << utils::StringFormat(
      "GPU-%d "
      "totalGlobalMem=%zu,maxThreadsPerMultiProcessor=%d,multiProcessorCount=%"
      "d, calculated "
      "GPULocalStackLimit=%zu",
      device_id,
      prop.totalGlobalMem,
      prop.maxThreadsPerMultiProcessor,
      prop.multiProcessorCount,
      limit);
  return limit;
#else
  return 0;
#endif
}
// Checks whether the temp buffers of `lowered_func` that live in
// `used_memory_type` need at least `limit_bytes` bytes in total.
//
// Each buffer name is counted once, with numel() * dtype.bytes() bytes.
// Note that a usage equal to the limit is reported as exceeding it as well.
bool IsGPUMemoryUsageExceedLimit(const ir::LoweredFunc& lowered_func,
                                 const ir::MemoryType& used_memory_type,
                                 const size_t limit_bytes) {
  std::unordered_set<std::string> counted_names;
  size_t total_bytes = 0;
  for (auto&& buffer : lowered_func->temp_bufs) {
    VLOG(5) << "temp buf name=" << buffer->name
            << ", numel=" << buffer->numel() << ",dtype=" << buffer->dtype;
    const bool type_matches = buffer->memory_type == used_memory_type;
    // insert() reports whether the name is new, so a buffer name is only
    // registered (and counted) when its memory type matches
    if (type_matches && counted_names.insert(buffer->name).second) {
      total_bytes += buffer->numel() * buffer->dtype.bytes();
    }
  }
  VLOG(5) << "total used_bytes_cnt=" << total_bytes;
  return total_bytes >= limit_bytes;
}
// Returns true when `lowered_func` has to be pruned for `target`.
//
// Only the default NVGPU target is checked: the function is invalid when its
// GPUShared or GPULocal temp buffers reach the corresponding hardware limit.
// Both limits are queried once, on the first call, and cached.
bool PruneInvalid(const ir::LoweredFunc& lowered_func,
                  const common::Target& target) {
  static const size_t kGPUSharedMemoryLimitBytes = GetGPUSharedMemoryLimit();
  static const size_t kGPULocalStackLimitBytes = GetGPULocalStackLimit();

  // nothing to validate for targets other than the default NVGPU one
  if (!(target == common::DefaultNVGPUTarget())) {
    return false;
  }

  const bool shared_exceeded = IsGPUMemoryUsageExceedLimit(
      lowered_func, ir::MemoryType::GPUShared, kGPUSharedMemoryLimitBytes);
  if (shared_exceeded) {
    VLOG(5) << ir::MemoryType::GPUShared
            << " memory usage exceeds limit, func:\n"
            << lowered_func;
    return true;
  }

  const bool local_exceeded = IsGPUMemoryUsageExceedLimit(
      lowered_func, ir::MemoryType::GPULocal, kGPULocalStackLimitBytes);
  if (local_exceeded) {
    VLOG(5) << ir::MemoryType::GPULocal
            << " memory usage exceeds limit, func:\n"
            << lowered_func;
    return true;
  }
  return false;
}
} // namespace auto_schedule
} // namespace cinn
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include "paddle/cinn/auto_schedule/cost_model/expr_cost_model.h"
#include "paddle/cinn/auto_schedule/database/database.h"
#include "paddle/cinn/auto_schedule/measure/schedule_measurer.h"
#include "paddle/cinn/auto_schedule/search_strategy/evolutionary_search.h"
#include "paddle/cinn/auto_schedule/task/tune_task.h"
#include "paddle/cinn/auto_schedule/tuning.h"
#include "paddle/cinn/ir/lowered_func.h"
#include "paddle/cinn/utils/random_engine.h"
namespace cinn {
namespace auto_schedule {
// This class is responsible for tuning a specific task,
// it will integrate necessary components to search the
// optimal schedule for the task.
//
// The task, the measurer and the database are held as plain pointers;
// presumably they are not owned and must outlive the optimizer -- TODO
// confirm against the callers.
class TaskOptimizer {
public:
// Binds the optimizer to a task, a measurer and a database. `rand_seed`
// seeds the random state of the search; it is normalized by the
// constructor.
TaskOptimizer(TuneTask* task,
ScheduleMeasurer* schedule_measurer,
Database* database,
utils::LinearRandomEngine::StateType rand_seed = -1);
// Returns the lowered functions selected for the task under `options`.
FunctionGroup Optimize(const TuningOptions& options);
private:
// One candidate produced by an optimization method.
struct Result {
// name of the method that produced this candidate
std::string from;
// cost of the candidate, the lower the better; defaults to the maximum
// double value
double cost;
// lowered functions of the candidate
FunctionGroup functions;
explicit Result(const std::string& from_type)
: from(from_type), cost(std::numeric_limits<double>::max()) {}
};
// candidate lowered through the op lowerer
Result OptimizeByManual(bool need_measure);
// candidate lowered as a custom_call to an external api
Result OptimizeByExternal(bool need_measure);
// candidate found by EvolutionarySearch
Result OptimizeByEvolution(const TuningOptions& options);
// call search candidates once by EvolutionarySearch and prune invalid ones
std::vector<SearchState> SearchOneRound(
const TuningOptions& options,
std::vector<MeasureInput>* measure_candidates);
private:
// the max retry times if continuously get empty result
static constexpr uint32_t kMaxRetryContinuousEmpty_ = 3;
// the task to be tuned
TuneTask* task_;
// measures the execution cost of candidates
ScheduleMeasurer* schedule_measurer_;
// created lazily on the first OptimizeByEvolution call
std::unique_ptr<EvolutionarySearch> evolutionary_search_ = nullptr;
// cost model used by the search, updated with measured results
ExprCostModel cost_model_;
// stores and serves tuning records
Database* database_;
// random state forked for the evolutionary search
utils::LinearRandomEngine::StateType rand_seed_;
};
} // namespace auto_schedule
} // namespace cinn
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <gflags/gflags.h>
#include <mutex>
#include <string>
#include "paddle/cinn/ir/schedule/ir_schedule.h"
#include "paddle/cinn/ir/utils/ir_copy.h"
#include "paddle/cinn/utils/registry.h"
namespace cinn {
namespace auto_schedule {
struct InitialTaskInfo {
std::string task_key;
ir::ModuleExpr module_expr;
InitialTaskInfo(const std::string& task_key,
const ir::ModuleExpr& module_expr)
: task_key(task_key), module_expr(module_expr) {}
};
// Global task registry, used to save the initial ModuleExpr of each task.
//
// Entries are added through Regist(), which takes the registering mutex.
// NOTE(review): Get() and Has() do not take that mutex -- confirm that
// lookups never run concurrently with registrations.
class InitialTaskRegistry : public Registry<InitialTaskInfo> {
 public:
  // Returns the process-wide singleton instance.
  static InitialTaskRegistry* Global() {
    static InitialTaskRegistry x;
    return &x;
  }

  // Get the initial ModuleExpr of a task. Fails a CHECK when no entry is
  // registered under `task_key`.
  inline const InitialTaskInfo* Get(const std::string& task_key) {
    const InitialTaskInfo* task_info =
        Registry<InitialTaskInfo>::Find(task_key);
    CHECK(task_info) << "InitialTaskInfo [" << task_key
                     << "] is not registered";
    return task_info;
  }

  // Check if the task info with task_key exists.
  // (Returns plain `bool`: a top-level const on a by-value return type has
  // no effect.)
  inline bool Has(const std::string& task_key) {
    return nullptr != Registry<InitialTaskInfo>::Find(task_key);
  }

  // Regist the initial ModuleExpr of a task into the map. A copy of
  // `module_expr` made by optim::IRCopy is stored; a key that is already
  // registered is left untouched.
  inline void Regist(const std::string& task_key,
                     const ir::ModuleExpr& module_expr) {
    std::lock_guard<std::mutex> guard(registering_mutex);
    if (fmap_.count(task_key) == 0) {
      InitialTaskInfo* task_info =
          new InitialTaskInfo(task_key, optim::IRCopy(module_expr));
      __REGISTER__(task_key, task_info);
    }
  }

 private:
  InitialTaskRegistry() = default;
  CINN_DISALLOW_COPY_AND_ASSIGN(InitialTaskRegistry);

  // Regist the initial ModuleExpr of a task: stores `task_info` in the
  // lookup map and in both entry lists. The caller must hold the
  // registering mutex.
  inline InitialTaskInfo* __REGISTER__(const std::string& task_key,
                                       InitialTaskInfo* task_info) {
    fmap_[task_key] = task_info;
    const_list_.push_back(task_info);
    entry_list_.push_back(task_info);
    return task_info;
  }
};
} // namespace auto_schedule
} // namespace cinn
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/auto_schedule/task/task_registry.h"
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <cstdlib>
#include "paddle/cinn/auto_schedule/task/task_creator.h"
#include "paddle/cinn/auto_schedule/task/tune_task.h"
#include "paddle/cinn/frontend/net_builder.h"
#include "paddle/cinn/hlir/framework/graph.h"
#include "paddle/cinn/hlir/framework/graph_compiler.h"
#include "paddle/cinn/hlir/framework/op_lowering.h"
#include "paddle/cinn/utils/string.h"
#include "paddle/cinn/utils/type_defs.h"
DECLARE_bool(auto_schedule_use_cost_model);
namespace cinn {
namespace auto_schedule {
// Creates the op-level tune tasks of `graph` and initializes each of them
// with the shape/dtype dictionaries stored in the graph attributes.
//
// The OpLowerer used for initialization only lives inside this function, so
// the non-owning `op_lowerer` pointer of every task is reset before
// returning; otherwise the returned tasks would carry a dangling pointer.
// Callers may use the lowered functions and the serialized key of the tasks,
// but must not lower through them again.
std::vector<TuneTask> CreateTasks(hlir::framework::Graph* graph,
                                  const common::Target& target) {
  // create tasks
  TaskCreator task_creator;
  std::vector<TuneTask> tasks = task_creator.CreateTuneTaskOpLevel(graph);

  const auto& dtype_dict =
      graph->GetAttrs<absl::flat_hash_map<std::string, common::Type>>(
          "inferdtype");
  const auto& shape_dict = graph->GetAttrs<
      absl::flat_hash_map<std::string, hlir::framework::shape_t>>("infershape");

  hlir::framework::OpLowerer op_lowerer(dtype_dict, shape_dict, target);
  for (TuneTask& task : tasks) {
    task.Initialize(shape_dict, dtype_dict, &op_lowerer);
    // the lowerer is destroyed when this function returns
    task.op_lowerer = nullptr;
    VLOG(3) << "Add a task with serialized_key:\n" << task.serialized_key;
  }
  return tasks;
}
// Builds a graph for `target` from a program that contains a single Add of
// input "A" (shape {1, 64, 112, 112}) and input "B" (shape {64}).
std::shared_ptr<hlir::framework::Graph> CreateAddProgram(
    const common::Target& target) {
  frontend::NetBuilder net("test");
  auto input = net.CreateInput(Float(32), {1, 64, 112, 112}, "A");
  auto bias = net.CreateInput(Float(32), {64}, "B");
  // presumably the third argument is the broadcast axis -- TODO confirm; the
  // resulting variable itself is not needed here
  net.Add(input, bias, 1);
  return std::make_shared<hlir::framework::Graph>(net.Build(), target);
}
// Registers the initial ModuleExpr of every task and verifies that the
// registry returns an expression-wise identical (by printed form) copy for
// each key, and that Has() reports registered and unknown keys correctly.
TEST(TestTaskRegistry, basic) {
  FLAGS_auto_schedule_use_cost_model = true;
#ifdef CINN_WITH_CUDA
  Target target = common::DefaultNVGPUTarget();
#else
  Target target = common::DefaultHostTarget();
#endif
  std::shared_ptr<hlir::framework::Graph> graph = CreateAddProgram(target);
  std::vector<TuneTask> tasks = CreateTasks(graph.get(), target);
  // tasks[0] is accessed below, so make sure it exists
  ASSERT_FALSE(tasks.empty());

  InitialTaskRegistry* task_registry = InitialTaskRegistry::Global();
  std::vector<ir::ModuleExpr> module_exprs;
  for (const TuneTask& task : tasks) {
    module_exprs.emplace_back(task.GetLoweredFuncBodyExprs());
    task_registry->Regist(task.serialized_key, module_exprs.back());
  }

  for (size_t i = 0; i < tasks.size(); ++i) {
    const std::string& key = tasks[i].serialized_key;
    VLOG(3) << "serialized_key = " << key;
    ir::ModuleExpr new_expr = task_registry->Get(key)->module_expr;
    // fetch both expression lists once instead of on every access
    const std::vector<ir::Expr> registered_exprs = new_expr.GetExprs();
    const std::vector<ir::Expr> expected_exprs = module_exprs[i].GetExprs();
    ASSERT_EQ(registered_exprs.size(), expected_exprs.size());
    for (size_t j = 0; j < registered_exprs.size(); ++j) {
      VLOG(3) << "expr " << j << " of task " << key << " : "
              << registered_exprs.at(j);
      ASSERT_EQ(utils::GetStreamCnt(registered_exprs.at(j)),
                utils::GetStreamCnt(expected_exprs.at(j)));
    }
  }

  ASSERT_TRUE(task_registry->Has(tasks[0].serialized_key));
  ASSERT_FALSE(task_registry->Has("not_exist"));
}
} // namespace auto_schedule
} // namespace cinn
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/auto_schedule/task/tune_task.h"
#include <glog/logging.h>
#include <iostream>
#include <vector>
#include "paddle/cinn/auto_schedule/analysis/analyze_ir.h"
#include "paddle/cinn/hlir/framework/node.h"
#include "paddle/cinn/hlir/framework/op_lowering.h"
#include "paddle/cinn/ir/ir_base.h"
#include "paddle/cinn/ir/lowered_func.h"
#include "paddle/cinn/ir/schedule/ir_schedule.h"
#include "paddle/cinn/utils/string.h"
namespace cinn {
namespace auto_schedule {
// Initializes the task with the given lower handler (must not be null):
// stores the handler, lowers the subgraph with both op schedule and group
// schedule disabled, collects the output names of the lowered functions and
// builds the serialized key from the shape/dtype dictionaries.
void TuneTask::Initialize(
    const absl::flat_hash_map<std::string, hlir::framework::shape_t>&
        shape_dict,
    const absl::flat_hash_map<std::string, cinn::common::Type>& dtype_dict,
    hlir::framework::OpLowerer* lower_handler) {
  CHECK(lower_handler != nullptr) << "op_lowerer can't be nullptr";
  op_lowerer = lower_handler;

  // Set lowered_funcs and analyze output names.
  constexpr bool kApplyOpSchedule = false;
  constexpr bool kApplyGroupSchedule = false;
  lowered_funcs =
      op_lowerer->Lower(subgraph, kApplyOpSchedule, kApplyGroupSchedule);
  output_names = GetOutputNamesFromLoweredFunc(lowered_funcs);
  serialized_key = SerializeToString(shape_dict, dtype_dict);
}
std::vector<ir::Expr> TuneTask::GetLoweredFuncBodyExprs() const {
std::vector<ir::Expr> result;
for (const ir::LoweredFunc& func : lowered_funcs) {
result.push_back(func->body);
}
return result;
}
// Serializes this task into a string made of the target followed by a
// "Group { ... }" section with one line per node of the subgraph:
//   (<outputs>) = <op name>(<inputs>)
// where every variable is printed as <id>-><dtype>[<shape>] and variables
// are separated by ", ". The shape and dtype of every variable must be
// present in `shape_dict` and `dtype_dict`.
std::string TuneTask::SerializeToString(
    const absl::flat_hash_map<std::string, hlir::framework::shape_t>&
        shape_dict,
    const absl::flat_hash_map<std::string, cinn::common::Type>& dtype_dict) {
  std::stringstream out;
  out << target << "\n\n";  // print target

  // local function to print dtype,shape of out/in variables of the specified
  // node
  auto append_variables =
      [&](const std::vector<common::Shared<common::GraphEdge>>& edges,
          bool is_input) {
        bool need_separator = false;
        for (auto&& edge : edges) {
          // the variable sits at the source of an input edge and at the sink
          // of an output edge
          const auto* variable =
              is_input ? edge->source()->safe_as<hlir::framework::NodeData>()
                       : edge->sink()->safe_as<hlir::framework::NodeData>();
          CHECK(variable) << "var node invalid";
          auto shape_it = shape_dict.find(variable->id());
          CHECK(shape_it != shape_dict.end())
              << "can't find shape of variable:" << variable->id();
          auto dtype_it = dtype_dict.find(variable->id());
          CHECK(dtype_it != dtype_dict.end())
              << "can't find dtype of variable:" << variable->id();

          if (need_separator) {
            out << ", ";
          }
          need_separator = true;
          // TODO(CtfGo): CINN uses the names of input/output NodeData ids as
          // arguments of the LoweredFunc in the Lower process, so it will
          // result in different LoweredFuncs for two Nodes even though they
          // represents the same operator. Here we add `var_node->id()` into
          // the serialized_key to distinguish them, otherwise AutoTuner will
          // get wrong TuningRecords when querying cached results from
          // database. In the future, we should remove name-related limit in
          // Lower process, to avoid duplicate tuning tasks with same
          // operators.
          out << variable->id() << "->"
              << cinn::common::Type2Str(dtype_it->second) << "["
              << utils::Join(shape_it->second, ",") << "]";
        }
      };

  // print each node of the subgraph
  out << "Group {\n";
  for (auto&& node : subgraph->CollectNodes()) {
    out << "  (";
    append_variables(node->outlinks_in_order(), false);
    out << ") = " << node->op()->name << "(";
    append_variables(node->inlinks_in_order(), true);
    out << ")\n";
  }
  out << "}\n";
  return out.str();
}
} // namespace auto_schedule
} // namespace cinn
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <absl/container/flat_hash_map.h>
#include <memory>
#include <string>
#include <unordered_set>
#include <vector>
#include "paddle/cinn/common/target.h"
#include "paddle/cinn/common/type.h"
#include "paddle/cinn/hlir/framework/graph.h"
#include "paddle/cinn/hlir/framework/node.h"
#include "paddle/cinn/hlir/framework/op_lowering.h"
#include "paddle/cinn/ir/ir.h"
#include "paddle/cinn/ir/ir_base.h"
#include "paddle/cinn/ir/lowered_func.h"
#include "paddle/cinn/ir/schedule/ir_schedule.h"
namespace cinn {
namespace auto_schedule {
// A single unit of work for the auto-tuner: one (possibly fused) sub-graph
// plus the data derived from it (lowered functions, output names and a
// serialized key).
class TuneTask {
 public:
  TuneTask() = default;
  explicit TuneTask(std::shared_ptr<hlir::framework::Graph::Group> group)
      : subgraph(group) {}
  // Initialize a task.
  //   shape_dict    - shape of every variable, keyed by variable id
  //   dtype_dict    - data type of every variable, keyed by variable id
  //   lower_handler - lowerer used for this task; not owned
  // NOTE(review): the definition is outside this header; presumably it fills
  // the derived fields below (lowered_funcs, serialized_key, ...) - confirm
  // against the .cc file.
  void Initialize(
      const absl::flat_hash_map<std::string, hlir::framework::shape_t>&
          shape_dict,
      const absl::flat_hash_map<std::string, cinn::common::Type>& dtype_dict,
      hlir::framework::OpLowerer* lower_handler);
  // Extract bodies in lowered_funcs and return them
  std::vector<ir::Expr> GetLoweredFuncBodyExprs() const;
  // In CINN, we use hlir::framework::Graph::Group to represent a fused
  // sub-graph (if an op won't be fused, it will be a Group with size=1).
  std::shared_ptr<hlir::framework::Graph::Group> subgraph;
  // Lower handler, not owned. Defaults to nullptr so that a task that has not
  // been initialized never carries an indeterminate pointer.
  hlir::framework::OpLowerer* op_lowerer = nullptr;
  // target of this task
  common::Target target;
  // stores the initial (un-optimized) LoweredFuncs
  std::vector<ir::LoweredFunc> lowered_funcs;
  // names of the output arguments of lowered_funcs
  std::unordered_set<std::string> output_names;
  // serialized string of this task, it contains struct,shape,dtype,input/output
  // variable name of the subgraph and can be further used to hash
  std::string serialized_key;

 private:
  // Serialize this task as a string that contains specific fields of it
  std::string SerializeToString(
      const absl::flat_hash_map<std::string, hlir::framework::shape_t>&
          shape_dict,
      const absl::flat_hash_map<std::string, cinn::common::Type>& dtype_dict);
};
} // namespace auto_schedule
} // namespace cinn
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/auto_schedule/task/tune_task.h"
#include <gtest/gtest.h>
#include <iostream>
#include <memory>
#include <vector>
#include "paddle/cinn/auto_schedule/task/task_creator.h"
#include "paddle/cinn/common/context.h"
#include "paddle/cinn/common/target.h"
#include "paddle/cinn/frontend/net_builder.h"
#include "paddle/cinn/frontend/syntax.h"
#include "paddle/cinn/hlir/framework/graph.h"
#include "paddle/cinn/hlir/framework/node.h"
#include "paddle/cinn/hlir/framework/op_lowering.h"
#include "paddle/cinn/hlir/framework/pass.h"
#include "paddle/cinn/hlir/framework/scope.h"
#include "paddle/cinn/ir/ir_base.h"
#include "paddle/cinn/ir/schedule/ir_schedule.h"
#include "paddle/cinn/ir/utils/ir_printer.h"
#include "paddle/cinn/utils/string.h"
namespace cinn {
namespace auto_schedule {
using ::cinn::frontend::NetBuilder;
using ::cinn::frontend::Program;
using ::cinn::hlir::framework::OpLowerer;
// Builds a tiny frontend program made of two chained element-wise additions
// over float32 inputs "A" and "B" of shape 32x24: first A + B, then A added
// to that intermediate result.
Program CreateAddProgram() {
  constexpr int kRows = 32;
  constexpr int kCols = 24;
  NetBuilder net("net_builder");
  auto lhs = net.CreateInput(Float(32), {kRows, kCols}, "A");
  auto rhs = net.CreateInput(Float(32), {kRows, kCols}, "B");
  auto first_sum = net.Add(lhs, rhs);
  // The second addition is recorded in the builder; its handle is not needed.
  auto second_sum = net.Add(lhs, first_sum);
  return net.Build();
}
// Without any fusion pass, the add program yields one task per operator.
// Each task is initialized, the bodies of its lowered functions are printed
// into one string, and the (trimmed) text is compared with the expected IR.
TEST(TuneTask, GraphToUnoptLoweredFunc_NoPass) {
// Reset the global name counter before building the graph.
Context::Global().ResetNameId();
#ifdef CINN_WITH_CUDA
Target target = common::DefaultNVGPUTarget();
#else
Target target = common::DefaultHostTarget();
#endif
Program prog = CreateAddProgram();
auto graph = std::make_shared<hlir::framework::Graph>(prog, target);
TaskCreator task_creator;
std::vector<TuneTask> tasks = task_creator.CreateTuneTaskOpLevel(graph.get());
// Two add operators, no fusion applied -> two tasks.
ASSERT_EQ(tasks.size(), 2UL);
const auto& shape_dict = graph->GetAttrs<
absl::flat_hash_map<std::string, hlir::framework::shape_t>>("infershape");
const auto& dtype_dict =
graph->GetAttrs<absl::flat_hash_map<std::string, common::Type>>(
"inferdtype");
OpLowerer op_lowerer(dtype_dict, shape_dict, target);
// Accumulates the printed body of every lowered function, one per line.
std::stringstream ss;
for (TuneTask& task : tasks) {
task.Initialize(shape_dict, dtype_dict, &op_lowerer);
std::vector<ir::Expr> exprs = task.GetLoweredFuncBodyExprs();
VLOG(6) << "ir:Expr is: ";
for (const ir::Expr& e : exprs) {
VLOG(6) << e;
ss << e << std::endl;
}
}
std::string expr_str = ss.str();
// The expected text differs per target: the CUDA variant indexes tensors with
// the loop variables (i, j), the host variant with the bound axis variables.
#ifdef CINN_WITH_CUDA
std::string target_str = R"ROC(
{
ScheduleBlock(root)
{
serial for (i, 0, 32)
{
serial for (j, 0, 24)
{
ScheduleBlock(var_1)
{
i0, i1 = axis.bind(i, j)
var_1[i, j] = (A[i, j] + B[i, j])
}
}
}
}
}
{
ScheduleBlock(root_0)
{
serial for (i, 0, 32)
{
serial for (j, 0, 24)
{
ScheduleBlock(var_2)
{
i0_0, i1_0 = axis.bind(i, j)
var_2[i, j] = (A[i, j] + var_1[i, j])
}
}
}
}
}
)ROC";
#else
std::string target_str = R"ROC(
{
ScheduleBlock(root)
{
serial for (i, 0, 32)
{
serial for (j, 0, 24)
{
ScheduleBlock(var_1)
{
i0, i1 = axis.bind(i, j)
var_1[i0, i1] = (A[i0, i1] + B[i0, i1])
}
}
}
}
}
{
ScheduleBlock(root_0)
{
serial for (i, 0, 32)
{
serial for (j, 0, 24)
{
ScheduleBlock(var_2)
{
i0_0, i1_0 = axis.bind(i, j)
var_2[i0_0, i1_0] = (A[i0_0, i1_0] + var_1[i0_0, i1_0])
}
}
}
}
}
)ROC";
#endif
// Both sides are trimmed so surrounding whitespace does not affect the result.
EXPECT_EQ(utils::Trim(target_str), utils::Trim(expr_str));
}
// After "OpFusionPass" the add program yields a single task. Its lowered
// function body is printed and compared (after trimming) with the expected
// IR, in which both additions live under one root ScheduleBlock.
TEST(TuneTask, GraphToUnoptLoweredFunc_ApplyPass) {
// Reset the global name counter before building the graph.
Context::Global().ResetNameId();
#ifdef CINN_WITH_CUDA
Target target = common::DefaultNVGPUTarget();
#else
Target target = common::DefaultHostTarget();
#endif
Program prog = CreateAddProgram();
auto graph = std::make_shared<hlir::framework::Graph>(prog, target);
ApplyPass(graph.get(), "OpFusionPass");
TaskCreator task_creator;
std::vector<TuneTask> tasks = task_creator.CreateTuneTaskOpLevel(graph.get());
// With the fusion pass applied only one task is expected.
ASSERT_EQ(tasks.size(), 1UL);
const auto& shape_dict = graph->GetAttrs<
absl::flat_hash_map<std::string, hlir::framework::shape_t>>("infershape");
const auto& dtype_dict =
graph->GetAttrs<absl::flat_hash_map<std::string, common::Type>>(
"inferdtype");
OpLowerer op_lowerer(dtype_dict, shape_dict, target);
// Accumulates the printed body of every lowered function, one per line.
std::stringstream ss;
for (TuneTask& task : tasks) {
task.Initialize(shape_dict, dtype_dict, &op_lowerer);
std::vector<ir::Expr> exprs = task.GetLoweredFuncBodyExprs();
VLOG(6) << "ir:Expr is: ";
for (const ir::Expr& e : exprs) {
VLOG(6) << e;
ss << e << std::endl;
}
}
std::string expr_str = ss.str();
// The expected text differs per target: the CUDA variant indexes tensors with
// the loop variables (i, j), the host variant with the bound axis variables.
#ifdef CINN_WITH_CUDA
std::string target_str = R"ROC(
{
ScheduleBlock(root)
{
{
serial for (i, 0, 32)
{
serial for (j, 0, 24)
{
ScheduleBlock(var_1)
{
i0, i1 = axis.bind(i, j)
var_1[i, j] = (A[i, j] + B[i, j])
}
}
}
serial for (i, 0, 32)
{
serial for (j, 0, 24)
{
ScheduleBlock(var_2)
{
i0_0, i1_0 = axis.bind(i, j)
var_2[i, j] = (A[i, j] + var_1[i, j])
}
}
}
}
}
}
)ROC";
#else
std::string target_str = R"ROC(
{
ScheduleBlock(root)
{
{
serial for (i, 0, 32)
{
serial for (j, 0, 24)
{
ScheduleBlock(var_1)
{
i0, i1 = axis.bind(i, j)
var_1[i0, i1] = (A[i0, i1] + B[i0, i1])
}
}
}
serial for (i, 0, 32)
{
serial for (j, 0, 24)
{
ScheduleBlock(var_2)
{
i0_0, i1_0 = axis.bind(i, j)
var_2[i0_0, i1_0] = (A[i0_0, i1_0] + var_1[i0_0, i1_0])
}
}
}
}
}
}
)ROC";
#endif
// Both sides are trimmed so surrounding whitespace does not affect the result.
EXPECT_EQ(utils::Trim(target_str), utils::Trim(expr_str));
}
// Checks the serialized_key of initialized tasks, first for the un-fused
// graph (one task per add operator) and then for the same graph after
// "OpFusionPass" (a single task containing both operators).
TEST(TuneTask, SerializeToString) {
// Reset the global name counter before building the graph.
Context::Global().ResetNameId();
#ifdef CINN_WITH_CUDA
Target target = common::DefaultNVGPUTarget();
#else
Target target = common::DefaultHostTarget();
#endif
Program prog = CreateAddProgram();
auto graph = std::make_shared<hlir::framework::Graph>(prog, target);
TaskCreator task_creator;
std::vector<TuneTask> single_tasks =
task_creator.CreateTuneTaskOpLevel(graph.get());
const auto& shape_dict = graph->GetAttrs<
absl::flat_hash_map<std::string, hlir::framework::shape_t>>("infershape");
const auto& dtype_dict =
graph->GetAttrs<absl::flat_hash_map<std::string, common::Type>>(
"inferdtype");
OpLowerer op_lowerer(dtype_dict, shape_dict, target);
ASSERT_EQ(single_tasks.size(), 2UL);
for (auto&& task : single_tasks) {
task.Initialize(shape_dict, dtype_dict, &op_lowerer);
}
// Expected key of the first un-fused task; only the target line differs
// between the CUDA and host builds.
#ifdef CINN_WITH_CUDA
std::string single_add_str = R"ROC(Target<linux,nvgpu,64>
Group {
(var_1->float32[32,24]) = elementwise_add(A->float32[32,24], B->float32[32,24])
}
)ROC";
#else
std::string single_add_str = R"ROC(Target<linux,x86,64>
Group {
(var_1->float32[32,24]) = elementwise_add(A->float32[32,24], B->float32[32,24])
}
)ROC";
#endif
EXPECT_EQ(single_tasks[0].serialized_key, single_add_str);
// Fuse the operators on the same graph and re-create the tasks.
ApplyPass(graph.get(), "OpFusionPass");
std::vector<TuneTask> fused_tasks =
task_creator.CreateTuneTaskOpLevel(graph.get());
ASSERT_EQ(fused_tasks.size(), 1UL);
fused_tasks[0].Initialize(shape_dict, dtype_dict, &op_lowerer);
// Expected key of the fused task: both operators listed inside one Group.
#ifdef CINN_WITH_CUDA
std::string fused_expected_str = R"ROC(Target<linux,nvgpu,64>
Group {
(var_1->float32[32,24]) = elementwise_add(A->float32[32,24], B->float32[32,24])
(var_2->float32[32,24]) = elementwise_add(A->float32[32,24], var_1->float32[32,24])
}
)ROC";
#else
std::string fused_expected_str = R"ROC(Target<linux,x86,64>
Group {
(var_1->float32[32,24]) = elementwise_add(A->float32[32,24], B->float32[32,24])
(var_2->float32[32,24]) = elementwise_add(A->float32[32,24], var_1->float32[32,24])
}
)ROC";
#endif
EXPECT_EQ(fused_tasks[0].serialized_key, fused_expected_str);
}
} // namespace auto_schedule
} // namespace cinn
# Build rules for the task_scheduler directory.
# NOTE(review): core_gather_headers/gather_srcs/cinn_cc_test are project
# macros defined elsewhere; presumably they collect this directory's headers
# and sources into the core library and register a unit test - confirm.
core_gather_headers()
gather_srcs(cinnapi_src SRCS task_scheduler.cc round_robin.cc
efficiency_priority.cc)
# Unit test for the schedulers; depends on cinncore.
cinn_cc_test(test_task_scheduler SRCS task_scheduler_test.cc DEPS cinncore)
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/auto_schedule/task_scheduler/efficiency_priority.h"
namespace cinn {
namespace auto_schedule {
int EfficiencyPriority::NextTaskId() {
while (cur_task_id_ < tasks_->size()) {
if (IsTaskToTune(&tasks_->at(cur_task_id_))) {
return cur_task_id_++;
}
++cur_task_id_;
}
return -1;
}
// Decides whether a task should be tuned. The decision currently depends only
// on the configured minimum gain threshold being strictly positive; the task
// argument itself is not inspected.
bool EfficiencyPriority::IsTaskToTune(const TuneTask* task) {
  const bool threshold_is_positive = config_.minimum_gain_threshold > 0.0;
  return threshold_is_positive;
}
} // namespace auto_schedule
} // namespace cinn
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/cinn/auto_schedule/task_scheduler/task_scheduler.h"
namespace cinn {
namespace auto_schedule {
// Task scheduler for the "efficiency_priority" strategy, which is meant to
// pick the task with the maximum earnings ratio.
class EfficiencyPriority : public TaskScheduler {
 public:
  // Forwards the task list and the scheduling config to the base class.
  EfficiencyPriority(const std::vector<TuneTask>& tasks, const Config& config)
      : TaskScheduler(tasks, config) {}

  // Select a task to tune
  int NextTaskId() override;

  // Return the name of this schedule strategy
  const char* Name() const override {
    return "efficiency_priority";
  }

 private:
  // Tells whether the given task should be selected for tuning
  bool IsTaskToTune(const TuneTask* task);
};
} // namespace auto_schedule
} // namespace cinn
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/auto_schedule/task_scheduler/round_robin.h"
namespace cinn {
namespace auto_schedule {
// Returns the current task id and advances the cursor by one, or returns -1
// when the cursor has already moved past the last task.
int RoundRobin::NextTaskId() {
  const bool exhausted = !(cur_task_id_ < tasks_->size());
  if (exhausted) {
    return -1;
  }
  const int picked = cur_task_id_;
  ++cur_task_id_;
  return picked;
}
} // namespace auto_schedule
} // namespace cinn
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/cinn/auto_schedule/task_scheduler/task_scheduler.h"
namespace cinn {
namespace auto_schedule {
// Task scheduler for the "round_robin" strategy: tasks are handed out one at
// a time, in order.
class RoundRobin : public TaskScheduler {
 public:
  // Forwards the task list and the scheduling config to the base class.
  RoundRobin(const std::vector<TuneTask>& tasks, const Config& config)
      : TaskScheduler(tasks, config) {}

  // Select a task to tune
  int NextTaskId() override;

  // Return the name of this schedule strategy
  const char* Name() const override {
    return "round_robin";
  }
};
} // namespace auto_schedule
} // namespace cinn
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/auto_schedule/task_scheduler/task_scheduler.h"
#include <algorithm>
#include "paddle/cinn/auto_schedule/task/tune_task.h"
#include "paddle/cinn/auto_schedule/task_scheduler/efficiency_priority.h"
#include "paddle/cinn/auto_schedule/task_scheduler/round_robin.h"
namespace cinn {
namespace auto_schedule {
// Factory for schedulers. `strategy` selects the concrete class
// ("round_robin" or "efficiency_priority"); an empty task list or an unknown
// strategy name is reported through a fatal check/log.
std::unique_ptr<TaskScheduler> TaskScheduler::Make(
    const std::vector<TuneTask>& tasks,
    const Config& config,
    const std::string& strategy) {
  CHECK_GT(tasks.size(), 0) << "Empty task list";

  std::unique_ptr<TaskScheduler> scheduler;
  if (strategy == "round_robin") {
    scheduler = std::make_unique<RoundRobin>(tasks, config);
  } else if (strategy == "efficiency_priority") {
    scheduler = std::make_unique<EfficiencyPriority>(tasks, config);
  } else {
    LOG(FATAL) << "Unimplemented strategy:" << strategy;
  }
  // Still null here only if the fatal log above did not terminate the process.
  return scheduler;
}
// Stores a copy of the config and a pointer to the caller's task list (the
// list itself is not copied, so it must outlive the scheduler), and starts
// scheduling from task 0. The initializers are written in member declaration
// order (config_, cur_task_id_, tasks_), which is the order in which they run.
TaskScheduler::TaskScheduler(const std::vector<TuneTask>& tasks,
                             const Config& config)
    : config_(config), cur_task_id_(0), tasks_(&tasks) {}
// Rewinds the scheduling cursor to the first task.
void TaskScheduler::Reset() {
  cur_task_id_ = 0;
}
} // namespace auto_schedule
} // namespace cinn
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <functional>
#include <memory>
#include <string>
#include <vector>
#include "paddle/cinn/auto_schedule/task/task_optimizer.h"
#include "paddle/cinn/auto_schedule/task/tune_task.h"
#include "paddle/cinn/auto_schedule/tuning.h"
namespace cinn {
namespace auto_schedule {
// Class for scheduling tasks to perform auto-tune.
// Concrete strategies derive from this class and are created through Make(),
// which hands them out as std::unique_ptr<TaskScheduler>.
class TaskScheduler {
 public:
  // All configs for different schedule strategies
  // will be defined here together.
  struct Config {
    // The minimum threshold of earnings ratio, used by EfficiencyPriority
    float minimum_gain_threshold = 0.0;
  };
  // Create a TaskScheduler with the specific strategy name
  // and necessary construct parameters.
  static std::unique_ptr<TaskScheduler> Make(
      const std::vector<TuneTask>& tasks,
      const Config& config,
      const std::string& strategy = "round_robin");
  // Schedulers are destroyed through a pointer to this base class (see Make),
  // so the destructor must be virtual.
  virtual ~TaskScheduler() = default;
  // Reset associated states to schedule at the beginning
  void Reset();
  // Return the name of schedule strategy
  virtual const char* Name() const = 0;
  // Select a task to tune
  virtual int NextTaskId() = 0;

 protected:
  // A taskScheduler object should be created with the static function Make
  TaskScheduler(const std::vector<TuneTask>& tasks, const Config& config);
  // The config for scheduling strategy
  Config config_;
  // The current task id to be estimated
  int cur_task_id_;
  // The pointer refers to all tasks; the task list is owned by the caller
  const std::vector<TuneTask>* tasks_;
};
} // namespace auto_schedule
} // namespace cinn
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/auto_schedule/task_scheduler/task_scheduler.h"
#include <gtest/gtest.h>
#include <type_traits>
#include "paddle/cinn/auto_schedule/task_scheduler/efficiency_priority.h"
#include "paddle/cinn/auto_schedule/task_scheduler/round_robin.h"
namespace cinn {
namespace auto_schedule {
// Make() returns the round-robin scheduler by default and the
// efficiency-priority scheduler when asked for by name.
TEST(TaskScheduler, Make) {
  std::vector<TuneTask> dummy_tasks(3);
  TaskScheduler::Config default_config;

  auto default_scheduler = TaskScheduler::Make(dummy_tasks, default_config);
  ASSERT_STREQ(default_scheduler->Name(), "round_robin");

  auto named_scheduler =
      TaskScheduler::Make(dummy_tasks, default_config, "efficiency_priority");
  ASSERT_STREQ(named_scheduler->Name(), "efficiency_priority");
}
// The round-robin scheduler hands out ids in order and starts over from 0
// after Reset().
TEST(RoundRobinScheduler, NextTaskId) {
  std::vector<TuneTask> dummy_tasks(3);
  TaskScheduler::Config default_config;
  auto scheduler = TaskScheduler::Make(dummy_tasks, default_config);

  ASSERT_EQ(0, scheduler->NextTaskId());
  ASSERT_EQ(1, scheduler->NextTaskId());

  scheduler->Reset();
  ASSERT_EQ(0, scheduler->NextTaskId());
}
// With a negative gain threshold the efficiency-priority scheduler selects no
// task and reports -1.
TEST(EfficiencyPriorityScheduler, NextTaskId) {
  std::vector<TuneTask> dummy_tasks(3);
  TaskScheduler::Config negative_threshold_config;
  negative_threshold_config.minimum_gain_threshold = -1.0;

  auto scheduler = TaskScheduler::Make(
      dummy_tasks, negative_threshold_config, "efficiency_priority");
  ASSERT_EQ(-1, scheduler->NextTaskId());
}
} // namespace auto_schedule
} // namespace cinn
# The performance comparison test is only registered for CUDA builds without
# cuDNN. The ResNet50 model directory is passed to the test binary through the
# --resnet50_model_dir argument.
if(WITH_CUDA AND (NOT WITH_CUDNN))
cinn_cc_test(
test_performance_comparison
ARGS
"--resnet50_model_dir=${THIRD_PARTY_PATH}/ResNet50"
SRCS
performance_comparison_test.cc
DEPS
cinncore
test_program_builder)
endif()
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <bitset>
#include <iostream>
#include "paddle/cinn/auto_schedule/auto_tuner.h"
#include "paddle/cinn/common/target.h"
#include "paddle/cinn/frontend/net_builder.h"
#include "paddle/cinn/frontend/optimize.h"
#include "paddle/cinn/frontend/paddle_model_convertor.h"
#include "paddle/cinn/frontend/syntax.h"
#include "paddle/cinn/hlir/framework/graph_compiler.h"
#include "paddle/cinn/hlir/framework/node.h"
#include "paddle/cinn/hlir/framework/pass.h"
#include "paddle/cinn/ir/ir_base.h"
#include "paddle/cinn/runtime/flags.h"
#include "paddle/cinn/utils/data_util.h"
#include "test/cpp/cinn/program_builder.h"
/* This test is a tool for evaluating or comparing the performance of three
 * kinds of schedule (no schedule, manual schedule, auto-schedule). Use
 * `FLAGS_evaluate_knobs` to choose which schedules are evaluated and
 * `--gtest_filter=PerformanceTester.xx` to choose the operator or model.
 * For example, `FLAGS_evaluate_knobs=4
 * --gtest_filter=PerformanceTester.Matmul` evaluates auto-schedule
 * on the Matmul operator. See the explanation of the flags and parameters
 * below for more detail.
 */
// Directory of the Paddle ResNet50 model loaded by the ResNet50 test.
DEFINE_string(resnet50_model_dir,
"./ResNet50",
"the path to paddle model resnet50.");
// Bit mask that controls which schedule tests will be run:
//   bit 0 (value 1 = "001") runs the no-schedule test,
//   bit 1 (value 2 = "010") runs the manual-schedule test,
//   bit 2 (value 4 = "100") runs the auto-schedule test.
// The default value is -1, which means the flag is ignored and the options
// of the test fixture are left as they are.
DEFINE_int32(evaluate_knobs,
-1,
"the options to control which schedule tests will be run.");
// Defined elsewhere; set by the ResNet50 test before loading the model.
DECLARE_double(cinn_infer_model_version);
namespace cinn {
namespace auto_schedule {
using ::cinn::hlir::framework::BuildScope;
using ::cinn::hlir::framework::Graph;
using ::cinn::hlir::framework::GraphCompiler;
using ::cinn::hlir::framework::Instruction;
using ::cinn::hlir::framework::Scope;
// Test fixture that builds (and optionally executes) a frontend program under
// up to three scheduling modes: no schedule, manual schedule and
// auto-schedule. Which modes run is selected by Options::evaluate_knobs,
// which FLAGS_evaluate_knobs can override.
class PerformanceTester : public ::testing::Test {
public:
struct Options {
// times of compiled runtime program will be executed repeatedly.
int repeat_times = 2;
// the num_tuning_rounds for auto tuning
int num_tuning_rounds = 2;
// knobs to control which schedules will be measured, refer to
// FLAGS_evaluate_knobs explanation
std::bitset<3> evaluate_knobs = 0UL;
};
// Builds `program` for every selected schedule mode and executes it
// options_.repeat_times times. If no knob is set, the no-schedule and
// manual-schedule programs are only built, not executed.
void Evaluate(const frontend::Program& program) {
// A non-negative flag value overrides the fixture's knobs.
if (FLAGS_evaluate_knobs >= 0) {
options_.evaluate_knobs = FLAGS_evaluate_knobs;
}
VLOG(3) << "evaluate_knobs = " << options_.evaluate_knobs;
// Runs one schedule mode from scratch: fresh name ids, fresh graph with
// OpFusionPass applied, fresh scope and compiler, then `build_fn` creates the
// runtime program, which is executed unless `execute` is false.
auto worker_fn = [this, &program](const std::string& schedule_name,
BuildRuntimeProgramFn build_fn,
bool execute = true) {
Context::Global().ResetNameId();
VLOG(3) << "Initialize graph.";
auto graph = std::make_shared<hlir::framework::Graph>(program, target_);
VLOG(3) << "Apply graph pass.";
hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
VLOG(3) << "Build " << schedule_name << " program.";
auto scope = BuildScope(target_, graph);
auto graph_compiler =
std::make_unique<GraphCompiler>(target_, scope, graph);
auto runtime_program =
(this->*build_fn)(graph.get(), graph_compiler.get());
if (execute) {
VLOG(3) << "Execute " << schedule_name << " program.";
runtime_program->ExecuteTest(options_.repeat_times);
}
};
// if no one is set, build no/manual schedule cases to ensure their build
// functions are valid
if (options_.evaluate_knobs.none()) {
worker_fn("no schedule",
&PerformanceTester::BuildNoScheduleProgram,
/* execute */ false);
worker_fn("manual schedule",
&PerformanceTester::BuildManualScheduleProgram,
/* execute */ false);
} else {
if (options_.evaluate_knobs.test(0)) {
worker_fn("no schedule", &PerformanceTester::BuildNoScheduleProgram);
}
if (options_.evaluate_knobs.test(1)) {
worker_fn("manual schedule",
&PerformanceTester::BuildManualScheduleProgram);
}
if (options_.evaluate_knobs.test(2)) {
worker_fn("auto schedule",
&PerformanceTester::BuildAutoScheduleProgram);
}
}
}
protected:
// Pointer to one of the Build*Program member functions below.
using BuildRuntimeProgramFn = std::unique_ptr<hlir::framework::Program> (
PerformanceTester::*)(Graph*, GraphCompiler*);
// Lowers every fusion group with both the op schedule and the group schedule
// disabled and builds a runtime program from those lowered functions.
std::unique_ptr<hlir::framework::Program> BuildNoScheduleProgram(
Graph* graph, GraphCompiler* graph_compiler) {
const auto& dtype_dict =
graph->GetAttrs<absl::flat_hash_map<std::string, common::Type>>(
"inferdtype");
const auto& shape_dict = graph->GetAttrs<
absl::flat_hash_map<std::string, hlir::framework::shape_t>>(
"infershape");
std::shared_ptr<hlir::framework::OpLowerer> op_lowerer =
std::make_unique<hlir::framework::OpLowerer>(
dtype_dict, shape_dict, target_);
GraphCompiler::CompileOptions compile_options;
compile_options.with_instantiate_variables = true;
// Make sure there are groups to lower even if no fusion groups exist yet.
if (graph->fusion_groups.empty()) {
hlir::framework::ApplyPasses(graph, {"BuildNonFusedGroupsPass"});
}
compile_options.groups = graph->fusion_groups;
for (auto group : graph->fusion_groups) {
compile_options.lowered_funcs.push_back(
op_lowerer->Lower(group,
/*apply_op_schedule = */ false,
/*apply_group_schedule=*/false));
}
VLOG(3) << "===========================No Schedule LoweredFunc "
"Begin===========================";
for (const auto& funcvec : compile_options.lowered_funcs) {
for (const auto& func : funcvec) {
VLOG(3) << func;
}
}
VLOG(3) << "===========================No Schedule LoweredFunc "
"End=============================";
return graph_compiler->Build(compile_options).runtime_program;
}
// Builds the runtime program with the graph compiler's default Build().
// `graph` is unused here; the parameter keeps the signature compatible with
// BuildRuntimeProgramFn.
std::unique_ptr<hlir::framework::Program> BuildManualScheduleProgram(
Graph* graph, GraphCompiler* graph_compiler) {
return graph_compiler->Build();
}
// Tunes the graph with AutoTuner, applies the tuning result to the compile
// options and builds the runtime program from them.
std::unique_ptr<hlir::framework::Program> BuildAutoScheduleProgram(
Graph* graph, GraphCompiler* graph_compiler) {
auto tuner = std::make_unique<AutoTuner>(target_, graph);
AutoTuner::Config tuning_config;
TuningOptions tuning_options;
tuning_options.num_tuning_rounds = options_.num_tuning_rounds;
tuning_options.num_measure_trials = 2;
tuning_options.num_samples_per_iteration = 2;
tuner->Initialize(tuning_config, graph_compiler);
TuningResult tuning_result = tuner->Tune(tuning_options);
GraphCompiler::CompileOptions compile_options;
compile_options.with_instantiate_variables = true;
compile_options.Apply(tuning_result);
VLOG(3) << "===========================Auto Schedule LoweredFunc "
"Begin===========================";
for (const auto& funcvec : compile_options.lowered_funcs) {
for (const auto& func : funcvec) {
VLOG(3) << func;
}
}
VLOG(3) << "===========================Auto Schedule LoweredFunc "
"End=============================";
return graph_compiler->Build(compile_options).runtime_program;
}
// Target used for every graph built by this fixture.
#ifdef CINN_WITH_CUDA
Target target_ = common::DefaultNVGPUTarget();
#else
Target target_ = common::DefaultHostTarget();
#endif
Options options_;
};
// Batch dimension shared by the operator and model tests below.
constexpr int batch_size = 2;
// Evaluates a single "mul" operator with X of shape 32x16 and Y of 16x32.
TEST_F(PerformanceTester, Mul) {
  const auto program =
      tests::OpBuilder("mul").Build({{"X", {32, 16}}, {"Y", {16, 32}}});
  Evaluate(program);
}
// Evaluates a single "elementwise_add" operator on two 1x56x56x256 inputs.
TEST_F(PerformanceTester, Add) {
  const auto program =
      tests::OpBuilder("elementwise_add")
          .Build({{"X", {1, 56, 56, 256}}, {"Y", {1, 56, 56, 256}}});
  Evaluate(program);
}
// Evaluates a single "matmul" operator: X is batch_size x 2048, Y is
// 2048 x 1000.
TEST_F(PerformanceTester, Matmul) {
  const auto program = tests::OpBuilder("matmul").Build(
      {{"X", {batch_size, 2048}}, {"Y", {2048, 1000}}});
  Evaluate(program);
}
// Evaluates a single "relu" operator on a batch_size x 64 x 56 x 56 input.
TEST_F(PerformanceTester, Relu) {
  const auto program =
      tests::OpBuilder("relu").Build({{"X", {batch_size, 64, 56, 56}}});
  Evaluate(program);
}
// Evaluates a single forward "conv2d" operator: a 7x7 kernel with 64 output
// channels applied to a batch_size x 3 x 224 x 224 NCHW input, stride 2,
// explicit padding 3.
TEST_F(PerformanceTester, Conv2d) {
  // Operator attributes.
  std::vector<int> stride_hw{2, 2};
  std::vector<int> padding_hw{3, 3};
  std::vector<int> dilation_hw{1, 1};
  int group_count = 1;
  std::string direction = "forward";
  std::string layout = "NCHW";
  std::string padding_mode = "EXPLICIT";

  const auto program = tests::OpBuilder("conv2d").Build(
      {{"X", {batch_size, 3, 224, 224}}, {"W", {64, 3, 7, 7}}},
      {{"stride", stride_hw},
       {"padding", padding_hw},
       {"dilation", dilation_hw},
       {"groups", group_count},
       {"conv_type", direction},
       {"data_format", layout},
       {"padding_algorithm", padding_mode}});
  Evaluate(program);
}
// Evaluates a single max "pool2d" operator (3x3 kernel, stride 2, explicit
// padding 1) on a batch_size x 64 x 112 x 112 NCHW input.
TEST_F(PerformanceTester, Pool2d) {
  // Operator attributes. The input shape is given directly in the Build call.
  std::string pooling_type = "max";
  std::vector<int> ksize{3, 3};
  std::vector<int> strides{2, 2};
  std::vector<int> paddings{1, 1, 1, 1};
  bool ceil_mode = false;
  bool exclusive = true;
  bool global_pooling = false;
  std::string data_format = "NCHW";
  bool adaptive = false;
  std::string padding_algorithm = "EXPLICIT";
  Evaluate(tests::OpBuilder("pool2d").Build(
      {{"X", {batch_size, 64, 112, 112}}},
      {{"pool_type", pooling_type},
       {"kernel_size", ksize},
       {"stride_size", strides},
       {"padding_size", paddings},
       {"ceil_mode", ceil_mode},
       {"exclusive", exclusive},
       {"global_pooling", global_pooling},
       {"data_format", data_format},
       {"adaptive", adaptive},
       {"padding_algorithm", padding_algorithm}}));
}
// Evaluates a single "batch_norm" operator on a batch_size x 64 x 112 x 112
// NCHW input with 64-element scale, bias, mean and variance inputs.
TEST_F(PerformanceTester, BatchNorm) {
  // Operator attributes. Input shapes are given directly in the Build call.
  float epsilon = 1e-5f;
  float momentum = 0.9f;
  const std::string data_layout = "NCHW";
  Evaluate(tests::OpBuilder("batch_norm")
               .Build({{"X", {batch_size, 64, 112, 112}},
                       {"scale", {64}},
                       {"bias", {64}},
                       {"mean", {64}},
                       {"variance", {64}}},
                      {{"epsilon", epsilon},
                       {"momentum", momentum},
                       {"data_layout", data_layout}}));
}
// Evaluates a single "reshape" operator that turns a
// batch_size x 2048 x 1 x 1 input into batch_size x 2048.
TEST_F(PerformanceTester, Reshape) {
  std::vector<int32_t> target_shape{batch_size, 2048};
  const auto program = tests::OpBuilder("reshape").Build(
      {{"X", {batch_size, 2048, 1, 1}}}, {{"shape", target_shape}});
  Evaluate(program);
}
// Evaluates a single "softmax" operator over the last axis of a
// batch_size x 1000 input.
TEST_F(PerformanceTester, Softmax) {
  // Operator attributes.
  std::vector<int> softmax_axes = {-1};
  std::string softmax_mode = "fast";
  std::string layout = "AnyLayout";

  const auto program = tests::OpBuilder("softmax").Build(
      {{"X", {batch_size, 1000}}},
      {{"axes", softmax_axes}, {"mode", softmax_mode}, {"data_format", layout}});
  Evaluate(program);
}
// Evaluates a single "scale" operator (scale 1.0, bias 0.0, bias applied
// after scaling) on a batch_size x 1000 input.
TEST_F(PerformanceTester, Scale) {
  // Operator attributes.
  float scale_factor = 1.0f;
  float bias_value = 0.0f;
  bool add_bias_after_scale = true;

  const auto program = tests::OpBuilder("scale").Build(
      {{"X", {batch_size, 1000}}},
      {{"scale", scale_factor},
       {"bias", bias_value},
       {"bias_after_scale", add_bias_after_scale}});
  Evaluate(program);
}
// Evaluates a single "lookup_table" operator: a 50001 x 768 table indexed by
// int64 ids of shape 10 x 128 x 1, with padding_idx -1.
TEST_F(PerformanceTester, LookupTable) {
  int64_t pad_index = -1;
  const auto program =
      tests::OpBuilder("lookup_table")
          .Build({{"table", {50001, 768}},
                  {"ids", {10, 128, 1}, common::Int(64)}},
                 {{"padding_idx", pad_index}});
  Evaluate(program);
}
// Evaluates a single "gather" operator along axis 3 of a 10x12x128x512
// operand, using int32 indices of shape 1x1x1x128.
TEST_F(PerformanceTester, Gather) {
  int gather_axis = 3;
  const auto program = tests::OpBuilder("gather").Build(
      {{"operand", {10, 12, 128, 512}},
       {"index", {1, 1, 1, 128}, common::Int(32)}},
      {{"axis", gather_axis}});
  Evaluate(program);
}
// Paddle model test: loads ResNet50 from FLAGS_resnet50_model_dir with a
// batch_size x 3 x 224 x 224 "inputs" feed and evaluates the whole model.
TEST_F(PerformanceTester, ResNet50) {
  CHECK_NE(FLAGS_resnet50_model_dir, "");
  FLAGS_cinn_infer_model_version = 1.0;

  std::unordered_map<std::string, std::vector<int64_t>> feeds = {
      {"inputs", {batch_size, 3, 224, 224}}};
  // The model is converted for the default NVGPU target.
  cinn::frontend::PaddleModelConvertor convertor(common::DefaultNVGPUTarget());
  Evaluate(convertor.LoadModel(FLAGS_resnet50_model_dir, true, feeds));
}
} // namespace auto_schedule
} // namespace cinn
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <vector>
#include "paddle/cinn/hlir/framework/graph.h"
#include "paddle/cinn/hlir/framework/node.h"
#include "paddle/cinn/ir/lowered_func.h"
namespace cinn {
namespace auto_schedule {
// FunctionGroup: a list (std::vector) of lowered functions, ir::LoweredFunc.
using FunctionGroup = std::vector<ir::LoweredFunc>;
// SubGraphPtr: a shared pointer to a single hlir::framework::Graph::Group
// (one group, not an array of groups).
using SubGraphPtr = std::shared_ptr<hlir::framework::Graph::Group>;
// Knobs that configure the tuning process.
struct TuningOptions {
  // How many tuning rounds to run. Every round tunes several tasks, and
  // every task performs TuningOptions.num_measure_trials measurements.
  int num_tuning_rounds{1};

  // How many measurement trials a task performs. A value of 0 means the
  // tuner returns the best schedule-config candidate without measuring.
  int num_measure_trials{10};

  // In each round TaskSchedule picks some TuneTask(s) to optimize and runs
  // several iterations of the search algorithm per task to generate samples;
  // each iteration yields num_samples_per_iteration samples.
  //
  // 1. When TuningOptions.num_measure_trials is 0, autotune involves no
  //    hardware measurements and predicts performance with the cost model.
  //
  // 2. num_measure_trials % num_samples_per_iteration must equal 0.
  //    Within a round, autotune keeps iterating until
  //    (number of iterations * num_samples_per_iteration) equals
  //    num_measure_trials.
  int num_samples_per_iteration{10};

  //////////////////////////////////////
  // Evolutionary Search Related Options
  //////////////////////////////////////

  // How many entries to pick from the stored database in each iteration.
  // These are the best-performing records from previous generations.
  //
  // Note: this count is not guaranteed to be reached when the database does
  // not hold enough data. Evolutionary Search takes as many as are available
  // without throwing errors or warnings.
  int evolution_pick_database_topk{8};

  // Size of the initial population of each generation. It is made up of the
  // database picks plus randomly generated samples.
  int evolution_init_population_num{10};

  // How many samples are generated by cross over.
  int evolution_cross_over_num{0};

  // Fraction of random samples within num_samples_per_iteration: the
  // iteration holds a (1 - eps_greedy) share of the best samples from
  // evolutionary search and an eps_greedy share of random samples.
  //
  // This explores cases that evolutionary search would not predict precisely.
  float evolution_eps_greedy{0.1f};
};
// Output of the tuning process.
struct TuningResult {
// Result of graph tuning; each element is a SubGraphPtr, i.e. a shared
// pointer to a hlir::framework::Graph::Group.
std::vector<SubGraphPtr> subgraphs;
// Result of schedule tuning; each element is a FunctionGroup, i.e. a vector
// of ir::LoweredFunc.
std::vector<FunctionGroup> function_groups;
};
} // namespace auto_schedule
} // namespace cinn
# Project helper defined outside this file.
# NOTE(review): presumably collects this directory's headers - confirm.
core_gather_headers()
# Backend sources that are built regardless of configuration. They are handed
# to the project helper gather_srcs together with the list name cinnapi_src.
# NOTE(review): gather_srcs presumably appends them to cinnapi_src - confirm.
gather_srcs(
cinnapi_src
SRCS
outputs.cc
codegen_c.cc
codegen_c_x86.cc
codegen_cuda_host.cc
extern_func_emitter.cc
extern_func_emitter_builtin.cc
function_prototype.cc
extern_func_protos.cc
extern_func_jit_register.cc
modular.cc
compiler.cc)
# CUDA builds also descend into nvrtc/ and append the CUDA-specific sources
# to `srcs`; the foreach(cpp ${srcs}) loop later in this file adds every entry
# of `srcs` to the cached cinnapi_src list.
if(WITH_CUDA)
add_subdirectory(nvrtc)
list(APPEND srcs cuda_util.cc codegen_cuda_dev.cc codegen_cuda_util.cc)
endif()
# This library is only declared when OpenMP support is enabled.
if(WITH_OPENMP)
cinn_cc_library(__x86_source_fake_lib SRCS _x86_builtin_source.cc)
endif()
# The llvm/ subdirectory is processed unconditionally.
add_subdirectory(llvm)
# Test built from a raw .cu source; only declared for CUDA builds.
if(WITH_CUDA)
cinn_nv_test(test_raw_cuda_code SRCS raw_cuda_code_test.cu DEPS cinncore)
endif()
# C code generation tests; both are passed ${global_test_args} as ARGS.
cinn_cc_test(
test_codegen_c
SRCS
codegen_c_test.cc
DEPS
cinncore
ARGS
${global_test_args})
cinn_cc_test(
test_codegen_c_x86
SRCS
codegen_c_x86_test.cc
DEPS
cinncore
ARGS
${global_test_args})
# test_generated1 links against cinn_runtime rather than cinncore.
# NOTE(review): add_run_test_dependency is a project helper; presumably it
# makes test_generated1 run after test_codegen_c - confirm.
cinn_cc_test(test_generated1 SRCS generated_module1.cc DEPS cinn_runtime)
add_run_test_dependency(test_generated1 test_codegen_c)
cinn_cc_test(test_ir_schedule SRCS ir_schedule_test.cc DEPS cinncore)
# Add paddle/cinn/runtime to the include path of this directory.
include_directories(${CMAKE_SOURCE_DIR}/paddle/cinn/runtime)
# Build-order dependency on test_codegen_c; guarded because the
# test_generated1 target is not guaranteed to exist.
if(TARGET test_generated1)
add_dependencies(test_generated1 test_codegen_c)
endif()
# CUDA builds get the CUDA codegen tests and build test_compiler with
# cinn_nv_test; other builds only get test_compiler, built with cinn_cc_test.
if(WITH_CUDA)
cinn_nv_test(test_codegen_cuda_generate SRCS codegen_cuda_generate_test.cc
DEPS cinncore)
cinn_nv_test(test_codegen_debug SRCS codegen_debug_test.cc DEPS cinncore)
# generated1_cuda is only declared when testing is enabled; the helper used
# to declare it depends on whether this is a CINN_ONLY build.
if(WITH_TESTING)
if(CINN_ONLY)
cinn_nv_test(generated1_cuda SRCS generated1.cu DEPS cinncore)
else()
nv_test(
generated1_cuda
SRCS generated1.cu
DEPS cinncore)
endif()
add_run_test_dependency(generated1_cuda test_codegen_cuda_generate)
endif()
cinn_nv_test(test_compiler SRCS compiler_test.cc DEPS cinncore)
else()
cinn_cc_test(test_compiler SRCS compiler_test.cc DEPS cinncore)
endif()
# Append every entry of `srcs` to the cached cinnapi_src list, prefixed with
# this directory's path (paddle/cinn/backends/). The list is stored as an
# INTERNAL cache entry, so it is not limited to this directory's scope.
foreach(cpp ${srcs})
set(cinnapi_src
"${cinnapi_src};paddle/cinn/backends/${cpp}"
CACHE INTERNAL "")
endforeach()
# Collect the *.h files of this directory (directories excluded), with paths
# expressed relative to ${CMAKE_SOURCE_DIR}.
file(
GLOB includes
LIST_DIRECTORIES false
RELATIVE ${CMAKE_SOURCE_DIR}
*.h)
# Append each collected header to the cached core_includes list.
foreach(header ${includes})
set(core_includes
"${core_includes};${header}"
CACHE INTERNAL "")
endforeach()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment