Commit 992bec46 authored by “yuguo”'s avatar “yuguo”
Browse files

2.5

parent 0259837d
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// Predefined utilities in CINN BEGIN(
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#include <immintrin.h>
#include <math.h>
#include <omp.h>
#include <stdint.h>
#include <vector>
#include "paddle/cinn/runtime/cpu/thread_backend.h"
#ifndef _CINN_X86_BUILTIN_SOURCE_
#define _CINN_X86_BUILTIN_SOURCE_
//! Vector in stack, this can only used in generated .cc file.
template <typename T, size_t Num>
struct StackVec {
typedef T value_type;
typedef StackVec<T, Num> self_type;
self_type& operator=(const StackVec& src) {
if (this != &src) {
memcpy(data_, src.data_, num_bytes());
}
return *this;
}
StackVec() { memset(data_, 0, num_bytes()); }
explicit StackVec(const T* externl) : external_data_(externl) {}
static self_type Broadcast(const value_type& v) {
self_type res;
for (size_t i = 0; i < Num; i++) res.data_[i] = v;
return res;
}
static self_type Ramp(const value_type& base, const value_type& stride) {
self_type res;
for (size_t i = 0; i < Num; i++) {
res.data_[i] = base + stride * i;
}
}
static self_type Load(const void* base, int32_t offset) {
self_type res;
memcpy(&res.data_[0], (const value_type*)base + offset, num_bytes());
}
static self_type Load(const void* base,
const StackVec<int32_t, Num>& offset) {
self_type res;
for (size_t i = 0; i < Num; i++) {
res.data_[i] = ((const value_type*)base)[offset[i]];
}
}
void Store(void* base, int32_t offset) const {
mempcpy((value_type*)base + offset, &data_[0], num_bytes()); // NOLINT
}
inline value_type& operator[](size_t i) { return data_[i]; }
inline value_type operator[](size_t i) const { return data_[i]; }
// binary operator between two vectors
// @{
#define __(op__) \
friend self_type operator op__(const self_type& a, const self_type& b) { \
self_type res; \
for (size_t i = 0; i < Num; i++) { \
res.data_[i] = a[i] op__ b[i]; \
} \
return res; \
}
__(+)
__(-)
__(*)
__(/)
__(%)
// @}
#undef __
// binary operator between a vector and a scalar
// @{
#define __(op__) \
friend self_type operator op__(const self_type& a, const value_type& b) { \
self_type res; \
for (size_t i = 0; i < Num; i++) { \
res.data_[i] = a[i] op__ b; \
} \
return res; \
}
__(+)
__(-)
__(*)
__(/)
__(%)
#undef __
// @}
static constexpr size_t num_bytes() { return sizeof(data_); }
private:
T data_[Num];
T* external_data_{nullptr};
};
/**
* The vector with external data.
*/
template <typename T, size_t Num>
struct ExternalVec {
typedef T value_type;
typedef ExternalVec<T, Num> self_type;
explicit ExternalVec(T* data) : data_(data) {}
self_type& operator=(const self_type& src) {
if (data_ != src.data_) {
memcpy(data_, src.data_, num_bytes());
}
return *this;
}
static self_type Load(const void* base, int32_t offset) {
self_type res((T*)base + offset); // NOLINT
return res;
}
static constexpr size_t num_bytes() { return sizeof(value_type) * Num; }
private:
T* data_{nullptr};
};
// AVX256 load
//@{
inline __m256 cinn_avx256_load(const float* dst) { return _mm256_load_ps(dst); }
inline __m256d cinn_avx256_load(const double* dst) {
return _mm256_load_pd(dst);
}
//@}
// AVX512 load
//@{
inline __m512 cinn_avx512_load(const float* dst) { return _mm512_load_ps(dst); }
inline __m512d cinn_avx512_load(const double* dst) {
return _mm512_load_pd(dst);
}
//@}
// FP32x8 * FP32x8
// @{
inline void cinn_avx256_add(float* dst, float* a, float* b) {
_mm256_store_ps(dst, _mm256_add_ps(_mm256_load_ps(a), _mm256_load_ps(b)));
}
inline void cinn_avx256_sub(float* dst, float* a, float* b) {
_mm256_store_ps(dst, _mm256_sub_ps(_mm256_load_ps(a), _mm256_load_ps(b)));
}
inline void cinn_avx256_mul(float* dst, float* a, float* b) {
_mm256_store_ps(dst, _mm256_mul_ps(_mm256_load_ps(a), _mm256_load_ps(b)));
}
inline void cinn_avx256_div(float* dst, float* a, float* b) {
_mm256_store_ps(dst, _mm256_div_ps(_mm256_load_ps(a), _mm256_load_ps(b)));
}
// @}
// FP32x4 * float
// @{
inline void cinn_avx256_add(float* dst, float* a, float b) {
_mm256_store_ps(dst, _mm256_add_ps(_mm256_load_ps(a), _mm256_set1_ps(b)));
}
inline void cinn_avx256_sub(float* dst, float* a, float b) {
_mm256_store_ps(dst, _mm256_sub_ps(_mm256_load_ps(a), _mm256_set1_ps(b)));
}
inline void cinn_avx256_mul(float* dst, float* a, float b) {
_mm256_store_ps(dst, _mm256_mul_ps(_mm256_load_ps(a), _mm256_set1_ps(b)));
}
inline void cinn_avx256_div(float* dst, float* a, float b) {
_mm256_store_ps(dst, _mm256_div_ps(_mm256_load_ps(a), _mm256_set1_ps(b)));
}
// @}
// float * FP32x4
// @{
inline void cinn_avx256_add(float* dst, float a, float* b) {
_mm256_store_ps(dst, _mm256_add_ps(_mm256_set1_ps(a), _mm256_load_ps(b)));
}
inline void cinn_avx256_sub(float* dst, float a, float* b) {
_mm256_store_ps(dst, _mm256_sub_ps(_mm256_set1_ps(a), _mm256_load_ps(b)));
}
inline void cinn_avx256_mul(float* dst, float a, float* b) {
_mm256_store_ps(dst, _mm256_mul_ps(_mm256_set1_ps(a), _mm256_load_ps(b)));
}
inline void cinn_avx256_div(float* dst, float a, float* b) {
_mm256_store_ps(dst, _mm256_div_ps(_mm256_set1_ps(a), _mm256_load_ps(b)));
}
// @}
// 4 x float64
// @{
inline void cinn_avx256_add(double* dst, double* a, double* b) {
_mm256_store_pd(dst, _mm256_add_pd(_mm256_load_pd(a), _mm256_load_pd(b)));
}
inline void cinn_avx256_sub(double* dst, double* a, double* b) {
_mm256_store_pd(dst, _mm256_sub_pd(_mm256_load_pd(a), _mm256_load_pd(b)));
}
inline void cinn_avx256_mul(double* dst, double* a, double* b) {
_mm256_store_pd(dst, _mm256_mul_pd(_mm256_load_pd(a), _mm256_load_pd(b)));
}
inline void cinn_avx256_div(double* dst, double* a, double* b) {
_mm256_store_pd(dst, _mm256_div_pd(_mm256_load_pd(a), _mm256_load_pd(b)));
}
// @}
// FP32x4 * FP64
// @{
inline void cinn_avx256_add(double* dst, double* a, double b) {
_mm256_store_pd(dst, _mm256_add_pd(_mm256_load_pd(a), _mm256_set1_pd(b)));
}
inline void cinn_avx256_sub(double* dst, double* a, double b) {
_mm256_store_pd(dst, _mm256_sub_pd(_mm256_load_pd(a), _mm256_set1_pd(b)));
}
inline void cinn_avx256_mul(double* dst, double* a, double b) {
_mm256_store_pd(dst, _mm256_mul_pd(_mm256_load_pd(a), _mm256_set1_pd(b)));
}
inline void cinn_avx256_div(double* dst, double* a, double b) {
_mm256_store_pd(dst, _mm256_div_pd(_mm256_load_pd(a), _mm256_set1_pd(b)));
}
// @}
// float * FP32x4
// @{
inline void cinn_avx256_add(double* dst, double a, double* b) {
_mm256_store_pd(dst, _mm256_add_pd(_mm256_set1_pd(a), _mm256_load_pd(b)));
}
inline void cinn_avx256_sub(double* dst, double a, double* b) {
_mm256_store_pd(dst, _mm256_sub_pd(_mm256_set1_pd(a), _mm256_load_pd(b)));
}
inline void cinn_avx256_mul(double* dst, double a, double* b) {
_mm256_store_pd(dst, _mm256_mul_pd(_mm256_set1_pd(a), _mm256_load_pd(b)));
}
inline void cinn_avx256_div(double* dst, double a, double* b) {
_mm256_store_pd(dst, _mm256_div_pd(_mm256_set1_pd(a), _mm256_load_pd(b)));
}
// @}
//! 32 x float32 operations.
// @{
inline void cinn_avx512_add(float* dst, float* a, float* b) {
_mm512_store_ps(dst, _mm512_add_ps(_mm512_load_ps(a), _mm512_load_ps(b)));
}
inline void cinn_avx512_sub(float* dst, float* a, float* b) {
_mm512_store_ps(dst, _mm512_sub_ps(_mm512_load_ps(a), _mm512_load_ps(b)));
}
inline void cinn_avx512_mul(float* dst, float* a, float* b) {
_mm512_store_ps(dst, _mm512_mul_ps(_mm512_load_ps(a), _mm512_load_ps(b)));
}
inline void cinn_avx512_div(float* dst, float* a, float* b) {
_mm512_store_ps(dst, _mm512_div_ps(_mm512_load_ps(a), _mm512_load_ps(b)));
}
// @}
// FP32x4 * FP64
// @{
inline void cinn_avx512_add(float* dst, float* a, float b) {
_mm512_store_pd(dst, _mm512_add_pd(_mm512_load_pd(a), _mm512_set1_pd(b)));
}
inline void cinn_avx512_sub(float* dst, float* a, float b) {
_mm512_store_pd(dst, _mm512_sub_pd(_mm512_load_pd(a), _mm512_set1_pd(b)));
}
inline void cinn_avx512_mul(float* dst, float* a, float b) {
_mm512_store_pd(dst, _mm512_mul_pd(_mm512_load_pd(a), _mm512_set1_pd(b)));
}
inline void cinn_avx512_div(float* dst, float* a, float b) {
_mm512_store_pd(dst, _mm512_div_pd(_mm512_load_pd(a), _mm512_set1_pd(b)));
}
// @}
// float * FP32x4
// @{
inline void cinn_avx512_add(float* dst, float a, float* b) {
_mm512_store_pd(dst, _mm512_add_pd(_mm512_set1_pd(a), _mm512_load_pd(b)));
}
inline void cinn_avx512_sub(float* dst, float a, float* b) {
_mm512_store_pd(dst, _mm512_sub_pd(_mm512_set1_pd(a), _mm512_load_pd(b)));
}
inline void cinn_avx512_mul(float* dst, float a, float* b) {
_mm512_store_pd(dst, _mm512_mul_pd(_mm512_set1_pd(a), _mm512_load_pd(b)));
}
inline void cinn_avx512_div(float* dst, float a, float* b) {
_mm512_store_pd(dst, _mm512_div_pd(_mm512_set1_pd(a), _mm512_load_pd(b)));
}
// @}
//! 16 x float32 operations.
// @{
inline void cinn_avx512_add(double* dst, double* a, double* b) {
_mm512_store_pd(dst, _mm512_add_pd(_mm512_load_pd(a), _mm512_load_pd(b)));
}
inline void cinn_avx512_sub(double* dst, double* a, double* b) {
_mm512_store_pd(dst, _mm512_sub_pd(_mm512_load_pd(a), _mm512_load_pd(b)));
}
inline void cinn_avx512_mul(double* dst, double* a, double* b) {
_mm512_store_pd(dst, _mm512_mul_pd(_mm512_load_pd(a), _mm512_load_pd(b)));
}
inline void cinn_avx512_div(double* dst, double* a, double* b) {
_mm512_store_pd(dst, _mm512_div_pd(_mm512_load_pd(a), _mm512_load_pd(b)));
}
// @}
inline __m512 cinn_avx512_add(const __m512& a, const __m512& b);
inline __m256 cinn_avx256_add_float(const __m256& a, const __m256& b) {
return _mm256_add_ps(a, b);
}
inline __m256d cinn_avx256_add_double(const __m256d& a, const __m256d& b) {
return _mm256_add_pd(a, b);
}
inline __m512 cinn_avx512_add_float(const __m512& a, const __m512& b) {
return _mm512_add_ps(a, b);
}
inline __m512d cinn_avx512_add_double(const __m512d& a, const __m512d& b) {
return _mm512_add_pd(a, b);
}
//! set1
// @{
inline __m256 cinn_avx256_set1(float value) { return _mm256_set1_ps(value); }
inline __m256d cinn_avx256_set1(double value) { return _mm256_set1_pd(value); }
inline __m512 cinn_avx512_set1(float value) { return _mm512_set1_ps(value); }
inline __m512d cinn_avx512_set1(double value) { return _mm512_set1_pd(value); }
// @}
//! store
// @{
inline void cinn_avx512_store(float* dst, const __m512& x) {
_mm512_store_ps(dst, x);
}
inline void cinn_avx512_store(double* dst, const __m512d& x) {
_mm512_store_pd(dst, x);
}
inline void cinn_avx256_store(float* dst, const __m256& x) {
_mm256_store_ps(dst, x);
}
inline void cinn_avx256_store(double* dst, const __m256d& x) {
_mm256_store_pd(dst, x);
}
// @}
//! add
// @{
inline __m256 cinn_avx256_add(const __m256& a, const __m256& b) {
return _mm256_add_ps(a, b);
}
inline __m256d cinn_avx256_add(const __m256d& a, const __m256d& b) {
return _mm256_add_pd(a, b);
}
inline __m512 cinn_avx512_add(const __m512& a, const __m512& b) {
return _mm512_add_ps(a, b);
}
inline __m512d cinn_avx512_add(const __m512d& a, const __m512d& b) {
return _mm512_add_pd(a, b);
}
// @}
//! mul
// @{
inline __m256 cinn_avx256_mul(const __m256& a, const __m256& b) {
return _mm256_mul_ps(a, b);
}
inline __m256d cinn_avx256_mul(const __m256d& a, const __m256d& b) {
return _mm256_mul_pd(a, b);
}
inline __m512 cinn_avx512_mul(const __m512& a, const __m512& b) {
return _mm512_mul_ps(a, b);
}
inline __m512d cinn_avx512_mul(const __m512d& a, const __m512d& b) {
return _mm512_mul_pd(a, b);
}
// @}
//! fma
// @{
inline __m128 cinn_avx128_fma(const __m128& a,
const __m128& b,
const __m128& c) {
return _mm_fmadd_ps(a, b, c);
}
inline __m128d cinn_avx128_fma(const __m128d& a,
const __m128d& b,
const __m128d& c) {
return _mm_fmadd_pd(a, b, c);
}
inline __m256 cinn_avx256_fma(const __m256& a,
const __m256& b,
const __m256& c) {
return _mm256_fmadd_ps(a, b, c);
}
inline __m256d cinn_avx256_fma(const __m256d& a,
const __m256d& b,
const __m256d& c) {
return _mm256_fmadd_pd(a, b, c);
}
inline __m512 cinn_avx512_fma(const __m512& a,
const __m512& b,
const __m512& c) {
return _mm512_fmadd_ps(a, b, c);
}
inline __m512d cinn_avx512_fma(const __m512d& a,
const __m512d& b,
const __m512d& c) {
return _mm512_fmadd_pd(a, b, c);
}
// @}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/// )END Predefined utilities in CINN
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
#endif // _CINN_X86_BUILTIN_SOURCE_
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/backends/codegen_c.h"
#include <fstream>
#include <string>
#include "paddle/cinn/backends/extern_func_emitter.h"
#include "paddle/cinn/backends/extern_func_emitter_builtin.h"
#include "paddle/cinn/ir/lowered_func.h"
#include "paddle/cinn/ir/op/ir_operators.h"
#include "paddle/cinn/ir/utils/ir_verify.h"
#include "paddle/cinn/optim/ir_simplify.h"
#include "paddle/cinn/optim/remove_nested_block.h"
#include "paddle/cinn/runtime/cpu/thread_backend.h"
#include "paddle/cinn/runtime/intrinsic.h"
#include "paddle/cinn/utils/string.h"
//! Root of the builtin code.
DECLARE_string(cinn_x86_builtin_code_root);
namespace cinn {
namespace backends {
using namespace utils; // NOLINT
using cinn::common::float16;
const char *kCKeywordRestrict = "__restrict__";
void CodeGenC::Compile(const ir::Module &module, const Outputs &outputs) {
ir::IrVerify(Expr(module));
if (!outputs.c_header_name.empty()) {
auto source = Compile(module, OutputKind::CHeader);
str_ = "";
std::ofstream file(outputs.c_header_name);
CHECK(file.is_open()) << "failed to open file " << outputs.c_header_name;
file << source;
file.close();
LOG(WARNING) << "Output C header to file " << outputs.c_header_name;
}
if (!outputs.c_source_name.empty()) {
auto source = Compile(module, OutputKind::CImpl);
str_ = "";
std::ofstream file(outputs.c_source_name);
CHECK(file.is_open()) << "failed to open file " << outputs.c_source_name;
file << source;
file.close();
LOG(WARNING) << "Output C source to file " << outputs.c_source_name;
}
}
CodeGenC::CodeGenC(Target target) : ir::IrPrinter(ss_) {}
std::string CodeGenC::Compile(const ir::Module &module,
OutputKind output_kind) {
if (output_kind == OutputKind::CHeader) {
GenerateHeaderFile(module);
} else if (output_kind == OutputKind::CImpl) {
PrintIncludes();
if (inline_builtin_codes_) PrintBuiltinCodes();
for (auto &func : module.functions()) {
Compile(func);
}
} else {
LOG(FATAL) << "Not supported OutputKind";
}
return str_;
}
// TODO(LiuYang): Here the Ret type seems unuseful
void CodeGenC::Compile(const ir::LoweredFunc &function) {
CHECK(function.defined());
IrPrinter::Visit(function);
str_ += "\n\n";
}
std::string CodeGenC::GetTypeName(Type type) {
// common scalar type
#define GET_SCALAR_TYPE(pred_expr, scalar_name) \
if (pred_expr) { \
return scalar_name; \
}
GET_SCALAR_TYPE(type.is_void(), "void");
GET_SCALAR_TYPE(type.is_bool(), "bool");
GET_SCALAR_TYPE(type.is_int(8), "int8_t");
GET_SCALAR_TYPE(type.is_int(16), "int16_t");
GET_SCALAR_TYPE(type.is_int(32), "int32_t");
GET_SCALAR_TYPE(type.is_int(64), "int64_t");
GET_SCALAR_TYPE(type.is_uint(8), "uint8_t");
GET_SCALAR_TYPE(type.is_uint(16), "uint16_t");
GET_SCALAR_TYPE(type.is_uint(32), "uint32_t");
GET_SCALAR_TYPE(type.is_uint(64), "uint64_t");
GET_SCALAR_TYPE(type.is_bfloat16(), "bfloat16");
GET_SCALAR_TYPE(type.is_float16(), "float16");
GET_SCALAR_TYPE(type.is_float(32), "float")
GET_SCALAR_TYPE(type.is_float(64), "double")
#undef GET_SCALAR_TYPE
// customized_type
if (type.is_customized_type()) {
CHECK(!type.customized_type().empty()) << "customized_type can't be empty.";
auto customized_name = type.customized_type();
// get name of a cuda built-in vector type, it is started with a
// 'CudaVectorType::' prefix
if (utils::Startswith(customized_name,
common::customized_type::kcuda_builtin_vector_t)) {
customized_name.erase(
0, strlen(common::customized_type::kcuda_builtin_vector_t));
}
return customized_name;
}
// other types are not implemented yet
CINN_NOT_IMPLEMENTED
return "";
}
std::string CodeGenC::GetTypeRepr(Type type) {
std::string str;
if (type.is_cpp_const()) {
str = "const ";
}
str += GetTypeName(type);
if (type.is_cpp_handle()) {
str += "*";
} else if (type.is_cpp_handle2()) {
str += "**";
}
return str;
}
void CodeGenC::Visit(const ir::IntImm *op) { IrPrinter::Visit(op); }
void CodeGenC::Visit(const ir::UIntImm *op) { IrPrinter::Visit(op); }
void CodeGenC::Visit(const ir::FloatImm *op) { IrPrinter::Visit(op); }
void CodeGenC::Visit(const ir::StringImm *op) { IrPrinter::Visit(op); }
void CodeGenC::Visit(const ir::Add *op) { IrPrinter::Visit(op); }
void CodeGenC::Visit(const ir::Sub *op) { IrPrinter::Visit(op); }
void CodeGenC::Visit(const ir::Mul *op) { IrPrinter::Visit(op); }
void CodeGenC::Visit(const ir::Div *op) { IrPrinter::Visit(op); }
void CodeGenC::Visit(const ir::Mod *op) {
auto copied = op->b();
optim::Simplify(&copied);
if (copied.is_constant()) {
int temp = static_cast<int>(copied.get_constant());
if ((temp & (temp - 1)) == 0) {
str_ += "(";
IrPrinter::Visit(op->a());
str_ += " & ";
str_ += std::to_string(temp - 1);
str_ += ")";
return;
}
}
PrintBinaryOp("%", op);
}
void CodeGenC::Visit(const ir::EQ *op) { IrPrinter::Visit(op); }
void CodeGenC::Visit(const ir::NE *op) { IrPrinter::Visit(op); }
void CodeGenC::Visit(const ir::LT *op) { IrPrinter::Visit(op); }
void CodeGenC::Visit(const ir::LE *op) { IrPrinter::Visit(op); }
void CodeGenC::Visit(const ir::GT *op) { IrPrinter::Visit(op); }
void CodeGenC::Visit(const ir::GE *op) { IrPrinter::Visit(op); }
void CodeGenC::Visit(const ir::And *op) { PrintBinaryOp("&&", op); }
void CodeGenC::Visit(const ir::Or *op) { PrintBinaryOp("||", op); }
void CodeGenC::Visit(const ir::Min *op) { IrPrinter::Visit(op); }
void CodeGenC::Visit(const ir::Max *op) { IrPrinter::Visit(op); }
void CodeGenC::Visit(const ir::Minus *op) { IrPrinter::Visit(op); }
void CodeGenC::Visit(const ir::Not *op) {
str_ += "(!";
IrPrinter::Visit(op->v());
str_ += ")";
}
void CodeGenC::Visit(const ir::Cast *op) { PrintCastExpr(op->type(), op->v()); }
void CodeGenC::Visit(const ir::For *op) {
Expr extent = op->extent;
Expr min = op->min;
int num_task = 1;
if (op->is_parallel()) {
str_ += "int num_task = max_concurrency();\n";
DoIndent();
str_ += "omp_set_num_threads(num_task);\n";
DoIndent();
str_ += "auto flambda = [=](int task_id, int num_task) -> int {\n";
IncIndent();
DoIndent();
str_ += "int n_per_task = ";
Expr num_task_var = Var("num_task");
IrPrinter::Visit((op->extent + num_task_var - 1) / num_task_var);
str_ += ";\n";
CHECK_EQ(min.as_int32(), 0);
auto task_id = Var("task_id");
auto n_per_task = Var("n_per_task");
min = task_id * n_per_task;
extent = (task_id + 1) * n_per_task;
DoIndent();
}
str_ += "for (";
str_ += GetTypeRepr(Int(32));
str_ += " ";
str_ += op->loop_var->name;
str_ += " = ";
IrPrinter::Visit(min);
str_ += "; ";
str_ += op->loop_var->name;
str_ += " < ";
IrPrinter::Visit(op->extent);
if (op->is_parallel()) {
str_ += " && ";
str_ += op->loop_var->name;
str_ += " < ";
IrPrinter::Visit(extent);
}
str_ += "; ";
str_ += op->loop_var->name;
str_ += " += 1";
str_ += ") ";
IrPrinter::Visit(op->body);
if (op->is_parallel()) {
str_ += "\n";
DoIndent();
str_ += "return 0;\n";
DecIndent();
DoIndent();
str_ += "};\n";
str_ += "#pragma omp parallel num_threads(num_task)\n";
DoIndent();
str_ += "{\n";
IncIndent();
DoIndent();
str_ += "int task_id = omp_get_thread_num();\n";
DoIndent();
str_ += "flambda(task_id, num_task);\n";
DecIndent();
DoIndent();
str_ += "}";
}
}
void CodeGenC::Visit(const ir::PolyFor *op) {
str_ += "for (";
str_ += GetTypeRepr(Int(32));
str_ += " ";
str_ += op->iterator->name;
str_ += " = ";
IrPrinter::Visit(op->init);
str_ += "; ";
IrPrinter::Visit(op->condition);
str_ += "; ";
str_ += op->iterator->name;
str_ += " += ";
IrPrinter::Visit(op->inc);
str_ += ") ";
IrPrinter::Visit(op->body);
}
void CodeGenC::Visit(const ir::Select *op) {
str_ += "(";
str_ += "(";
IrPrinter::Visit(op->condition);
str_ += ") ? ";
IrPrinter::Visit(op->true_value);
str_ += " : ";
IrPrinter::Visit(op->false_value);
str_ += ")";
}
void CodeGenC::Visit(const ir::IfThenElse *op) {
str_ += "if (";
IrPrinter::Visit(op->condition);
str_ += ") {\n";
if (!op->true_case.As<ir::Block>()) IncIndent();
DoIndent();
IrPrinter::Visit(op->true_case);
if (!op->true_case.As<ir::Block>()) str_ += ";";
str_ += "\n";
if (!op->true_case.As<ir::Block>()) DecIndent();
DoIndent();
str_ += "}";
if (op->false_case.defined()) {
str_ += " else {\n";
if (!op->true_case.As<ir::Block>()) IncIndent();
DoIndent();
IrPrinter::Visit(op->false_case);
if (!op->false_case.As<ir::Block>()) str_ += ";";
str_ += "\n";
if (!op->true_case.As<ir::Block>()) DecIndent();
DoIndent();
str_ += "}";
}
}
void CodeGenC::Visit(const ir::Block *op) {
str_ += "{\n";
IncIndent();
for (int i = 0; i < op->stmts.size() - 1; i++) {
DoIndent();
IrPrinter::Visit(op->stmts[i]);
str_ += ";\n";
}
if (op->stmts.size() >= 1) {
DoIndent();
IrPrinter::Visit(op->stmts.back());
str_ += ";";
}
DecIndent();
str_ += "\n";
DoIndent();
str_ += "}";
}
void CodeGenC::Visit(const ir::Call *op) {
if (op->name == runtime::intrinsic::buffer_malloc) {
PrintCall_buffer_malloc(op);
} else if (op->name == runtime::intrinsic::pod_values_to_array_repr) {
PrintCall_pod_values_to_array(op);
} else if (op->is_intrinsic_call()) {
str_ += op->name;
str_ += "(";
PrintCallArgs(op);
str_ += ")";
} else if (op->is_cinn_call()) { // call CINN LoweredFunc
str_ += op->name;
str_ += "(";
PrintCallArgs(op);
str_ += ")";
} else if (op->is_extern_call()) {
const auto &fn_name = ExternFunctionEmitterRegistry::Global().Lookup(
ExternFuncID{backend_C, op->name.c_str()});
if (!fn_name.empty()) {
ExternFunctionLLVMEmitter emitter(fn_name);
emitter.BindCodeGen(this);
emitter.Emit(op);
} else {
CHECK(!op->read_args.empty() || !op->write_args.empty());
str_ += op->name;
str_ += "(";
PrintCallArgs(op);
str_ += ")";
}
} else {
CINN_NOT_IMPLEMENTED
}
}
void CodeGenC::PrintCallArgs(const ir::Call *op) {
if (!op->read_args.empty()) {
for (int i = 0; i < op->read_args.size() - 1; i++) {
IrPrinter::Visit(op->read_args[i]);
str_ += ", ";
}
IrPrinter::Visit(op->read_args.back());
}
if (!op->write_args.empty()) {
if (!op->read_args.empty()) str_ += ", ";
for (int i = 0; i < op->write_args.size() - 1; i++) {
IrPrinter::Visit(op->write_args[i]);
str_ += ", ";
}
IrPrinter::Visit(op->write_args.back());
}
}
void CodeGenC::PrintCall_buffer_malloc(const ir::Call *op) {
CHECK_EQ(op->read_args.size(), 2UL);
str_ += op->name;
str_ += "(";
PrintCastExpr("void*", op->read_args[0]);
str_ += ", ";
IrPrinter::Visit(op->read_args[1]);
str_ += ")";
}
void CodeGenC::PrintCall_cinn_pod_value_to_(const ir::Call *op) {
CHECK_EQ(op->read_args.size(), 1UL);
str_ += op->name;
str_ += "(";
str_ += "&(";
IrPrinter::Visit(op->read_args[0]);
str_ += ")";
str_ += ")";
}
void CodeGenC::PrintCall_get_address(const ir::Call *op) {
CHECK_EQ(op->read_args.size(), 1UL);
CHECK(op->write_args.empty());
auto *read_var = op->read_args.front().as_var();
auto *read_buf = op->read_args.front().as_buffer();
CHECK(read_var || read_buf) << "Only Var or Buffer can get address";
if (read_var) {
if (read_var->type().lanes() <= 1) str_ += "&";
str_ += read_var->name;
} else if (read_buf) {
if (read_buf->type().lanes() <= 1) str_ += "&";
str_ += read_buf->name;
} else {
CINN_NOT_IMPLEMENTED
}
}
void CodeGenC::PrintCall_pod_values_to_array(const ir::Call *op) {
CHECK(!op->read_args.empty());
CHECK_EQ(op->write_args.size(), 1UL);
auto output_var = op->write_args.front().as_var_ref();
CHECK(output_var.defined());
std::vector<std::string> arg_names;
for (auto &arg : op->read_args) {
auto arg_var = arg.as_var();
CHECK(arg_var);
arg_names.push_back(arg_var->name);
}
str_ += "cinn_pod_value_t ";
str_ += output_var->name;
str_ += "[] = ";
str_ += "{ ";
str_ += utils::Join(arg_names, ", ");
str_ += " }";
}
void CodeGenC::Visit(const ir::_Module_ *op) { CINN_NOT_IMPLEMENTED }
void CodeGenC::Visit(const ir::_Var_ *op) { str_ += op->name; }
void CodeGenC::Visit(const ir::Load *op) {
Expr dense_strided_ramp = detail::StridedRampBase(op->index(), 1);
if (dense_strided_ramp.defined()) { // Loading a continuous Ramp address.
CHECK(op->type().is_vector());
PrintStackVecType(op->type().ElementOf(), op->index().type().lanes());
str_ += "::";
str_ += "Load(";
str_ += op->tensor.As<ir::_Tensor_>()->name;
str_ += ",";
IrPrinter::Visit(dense_strided_ramp);
str_ += ")";
} else if (op->index().type().is_vector()) {
// gather
CHECK(op->type().is_vector());
PrintStackVecType(op->type().ElementOf(), op->index().type().lanes());
str_ += "::Load(";
str_ += op->tensor.As<ir::_Tensor_>()->name;
str_ += ",";
IrPrinter::Visit(op->index());
str_ += ")";
} else if (op->is_addr_tensor()) {
auto *tensor = op->tensor.As<ir::_Tensor_>();
str_ += tensor->name;
str_ += "[";
IrPrinter::Visit(op->index());
str_ += "]";
} else {
IrPrinter::Visit(op);
}
}
void CodeGenC::Visit(const ir::Store *op) {
CHECK(op->is_addr_tensor());
auto *tensor = op->tensor.As<ir::_Tensor_>();
CHECK(tensor);
str_ += tensor->name;
str_ += "[";
IrPrinter::Visit(op->index());
str_ += "]";
str_ += " = ";
IrPrinter::Visit(op->value);
}
void CodeGenC::Visit(const ir::Alloc *op) {
str_ += runtime::intrinsic::buffer_malloc;
str_ += "(";
str_ += "(void*)(0), ";
auto *buffer = op->destination.As<ir::_Buffer_>();
str_ += buffer->name;
str_ += ")";
}
void CodeGenC::Visit(const ir::Free *op) {
str_ += runtime::intrinsic::buffer_free;
str_ += "(";
str_ += "(void*)(0), ";
auto *buffer = op->destination.As<ir::_Buffer_>();
str_ += buffer->name;
str_ += ")";
}
void CodeGenC::Visit(const ir::_Buffer_ *op) { str_ += op->name; }
void CodeGenC::Visit(const ir::_Tensor_ *op) { str_ += op->buffer->name; }
void CodeGenC::Visit(const ir::Let *op) {
bool is_vec = false;
CHECK(op->type().valid());
if (op->body.defined() && op->body.As<ir::Broadcast>()) {
// broadcast's type is hard to print, so use c++11 auto instead.
str_ += "auto";
is_vec = true;
} else {
str_ += GetTypeRepr(op->type());
}
str_ += " ";
IrPrinter::Visit(op->symbol);
// native C array.
if (op->type().lanes() > 1 && !is_vec) {
str_ += "[";
str_ += std::to_string(op->type().lanes());
str_ += "]";
}
if (op->body.defined()) {
str_ += " = ";
IrPrinter::Visit(op->body);
}
}
void CodeGenC::Visit(const ir::Reduce *op) {
LOG(FATAL) << "Reduce IR is just for internal representation, should not be "
"used for CodeGen.";
}
void CodeGenC::Visit(const ir::Ramp *op) {
str_ += "StackVec<";
str_ += std::to_string(op->lanes);
str_ += ",";
str_ += GetTypeRepr(op->type().ElementOf());
str_ += ">::Ramp(";
IrPrinter::Visit(op->base);
str_ += ", ";
IrPrinter::Visit(op->stride);
str_ += ", ";
str_ += std::to_string(op->lanes);
str_ += ")";
}
void CodeGenC::Visit(const ir::Broadcast *op) {
str_ += "StackVec<";
str_ += std::to_string(op->lanes);
str_ += ",";
str_ += GetTypeRepr(op->type().ElementOf());
str_ += ">::Broadcast(";
IrPrinter::Visit(op->value);
str_ += ", ";
str_ += std::to_string(op->lanes);
str_ += ")";
}
void CodeGenC::Visit(const ir::FracOp *op) { ir::IrPrinter::Visit(op); }
void CodeGenC::Visit(const ir::Sum *op) { ir::IrPrinter::Visit(op); }
void CodeGenC::Visit(const ir::Product *op) { ir::IrPrinter::Visit(op); }
void CodeGenC::PrintCastExpr(const Type &type, Expr e) {
str_ += "((";
str_ += GetTypeRepr(type);
str_ += ")";
str_ += "(";
IrPrinter::Visit(e);
str_ += "))";
}
void CodeGenC::PrintCastExpr(const std::string &type, Expr e) {
str_ += "(";
str_ += type;
str_ += ")";
str_ += "(";
IrPrinter::Visit(e);
str_ += ")";
}
void CodeGenC::PrintShape(const std::vector<Expr> &shape,
char leftb,
char rightb) {
str_ += leftb;
str_ += " ";
for (int i = 0; i < shape.size() - 1; i++) {
IrPrinter::Visit(shape[i]);
str_ += ", ";
}
if (shape.size() > 1) IrPrinter::Visit(shape.back());
str_ += " ";
str_ += rightb;
}
void CodeGenC::Visit(const ir::_LoweredFunc_ *op) {
PrintFunctionDeclaration(op);
str_ += "\n";
DoIndent();
CHECK_EQ(op->alloc_output_buffer_exprs.size(),
op->dealloc_output_buffer_exprs.size())
<< "the count of allocation and deallocaton expressions is not match";
std::vector<Expr> new_body;
std::vector<Expr> create_temp_buffers = op->PrepareCreateTempBufferExprs();
std::vector<Expr> alloca_temp_buffers = op->PrepareAllocTempBufferExprs();
std::vector<Expr> dealloca_temp_buffers = op->PrepareDeallocTempBufferExprs();
#define APPEND_TO_NEW_BODY(field__) \
new_body.insert( \
std::end(new_body), std::begin(op->field__), std::end(op->field__));
APPEND_TO_NEW_BODY(argument_prepare_exprs)
new_body.insert(std::end(new_body),
std::begin(create_temp_buffers),
std::end(create_temp_buffers));
APPEND_TO_NEW_BODY(alloc_output_buffer_exprs)
new_body.insert(std::end(new_body),
std::begin(alloca_temp_buffers),
std::end(alloca_temp_buffers));
APPEND_TO_NEW_BODY(buffer_data_cast_exprs)
new_body.push_back(op->body);
new_body.insert(std::end(new_body),
std::begin(dealloca_temp_buffers),
std::end(dealloca_temp_buffers));
APPEND_TO_NEW_BODY(dealloc_output_buffer_exprs)
Expr func_body = ir::Block::Make(new_body);
optim::RemoveNestedBlock(&func_body);
IrPrinter::Visit(func_body);
}
void CodeGenC::PrintIncludes() {
str_ += "#include <cinn_runtime.h>\n";
str_ += "#include <stdio.h>\n";
str_ += "\n";
}
void CodeGenC::PrintFileGuardOpen(const std::string &name) {
str_ += utils::StringFormat("#ifndef _%s_CINN_H_\n", Uppercase(name).c_str());
str_ += utils::StringFormat("#define _%s_CINN_H_\n", Uppercase(name).c_str());
str_ += "\n";
}
void CodeGenC::PrintFileGuardClose(const std::string &module_name) {
str_ += utils::StringFormat("#endif // _%s_CINN_H_\n",
Uppercase(module_name).c_str());
}
void CodeGenC::PrintBufferCreation(const std::vector<ir::Buffer> &buffers) {
for (auto &buffer : buffers) {
// Ignore the buffer in other devices.
if (!buffer->is_on_host()) continue;
DoIndent();
auto buffer_ptr_type =
Type()
.set_customized_type(common::customized_type::kbuffer_t)
.set_cpp_handle();
Var variable = ir::_Var_::Make(buffer->name, buffer_ptr_type);
auto expr = ir::intrinsics::BufferCreate::Make(buffer);
expr = ir::Let::Make(variable, expr);
IrPrinter::Visit(expr);
str_ += ";\n";
}
}
void CodeGenC::PrintBufferDestroy(const std::vector<ir::Buffer> &buffers) {
for (auto &buffer : buffers) {
DoIndent();
IrPrinter::Visit(buffer.DestroyExpr());
str_ += ";\n";
}
}
void CodeGenC::GenerateHeaderFile(const ir::Module &module) {
PrintFileGuardOpen(module.name());
PrintIncludes();
for (auto &func : module.functions()) {
PrintFunctionDeclaration(func.As<ir::_LoweredFunc_>());
str_ += ";\n";
str_ += "\n\n";
}
PrintFileGuardClose(module.name());
}
void CodeGenC::PrintFuncArg(const ir::Argument &arg) {
if (arg.is_buffer()) {
if (arg.is_input()) {
str_ += "const struct cinn_buffer_t *";
} else {
str_ += "struct cinn_buffer_t *";
}
} else if (arg.is_var()) {
str_ += GetTypeRepr(arg.type());
str_ += " ";
str_ += arg.name();
} else {
CINN_NOT_IMPLEMENTED
}
str_ += arg.name();
}
void CodeGenC::PrintRuntimeType(const cinn_type_t &type) {
if (type == cinn_bool_t()) {
str_ += "cinn_bool_t()";
} else if (type == cinn_int8_t()) {
str_ += "cinn_int8_t()";
} else if (type == cinn_int16_t()) {
str_ += "cinn_int16_t()";
} else if (type == cinn_int32_t()) {
str_ += "cinn_int32_t()";
} else if (type == cinn_int64_t()) {
str_ += "cinn_int64_t()";
} else if (type == cinn_uint8_t()) {
str_ += "cinn_uint8_t()";
} else if (type == cinn_uint16_t()) {
str_ += "cinn_uint16_t()";
} else if (type == cinn_uint32_t()) {
str_ += "cinn_uint32_t()";
} else if (type == cinn_uint64_t()) {
str_ += "cinn_uint64_t()";
} else if (type == cinn_bfloat16_t()) {
str_ += "cinn_bfloat16_t()";
} else if (type == cinn_float16_t()) {
str_ += "cinn_float16_t()";
} else if (type == cinn_float32_t()) {
str_ += "cinn_float32_t()";
} else if (type == cinn_float64_t()) {
str_ += "cinn_float64_t()";
} else {
LOG(FATAL) << "Unknown type is not supported to print";
}
}
void CodeGenC::PrintStackVecType(Type type, int lanes) {
str_ += "StackedVec<";
str_ += GetTypeRepr(type);
str_ += ",";
str_ += std::to_string(lanes);
str_ += ">";
}
void CodeGenC::Visit(const ir::PrimitiveNode *op) { CINN_NOT_IMPLEMENTED }
void CodeGenC::Visit(const ir::_BufferRange_ *op) { CINN_NOT_IMPLEMENTED }
void CodeGenC::Visit(const ir::ScheduleBlock *op) { CINN_NOT_IMPLEMENTED }
void CodeGenC::Visit(const ir::ScheduleBlockRealize *op) {
CINN_NOT_IMPLEMENTED
}
void CodeGenC::Visit(const ir::IntrinsicOp *op) {
switch (op->getKind()) {
#define __(x) \
case ir::IntrinsicKind::k##x: \
Visit(llvm::dyn_cast<ir::intrinsics::x>(op)); \
break;
INTRINSIC_KIND_FOR_EACH(__)
#undef __
}
}
void CodeGenC::Visit(const ir::intrinsics::BufferGetDataHandle *op) {
str_ += op->buffer.as_buffer()->name;
str_ += "->";
str_ += "memory";
}
void CodeGenC::Visit(const ir::intrinsics::BufferGetDataConstHandle *op) {
str_ += op->buffer.as_buffer()->name;
str_ += "->";
str_ += "memory";
}
void CodeGenC::Visit(const ir::intrinsics::PodValueToX *op) {
auto to_type = op->GetOutputType(0);
if (to_type == type_of<float>()) {
str_ += runtime::intrinsic::pod_value_to_float;
} else if (to_type == type_of<double>()) {
str_ += runtime::intrinsic::pod_value_to_double;
} else if (to_type == type_of<float16>()) {
str_ += runtime::intrinsic::pod_value_to_float16;
} else if (to_type == type_of<bool>()) {
str_ += runtime::intrinsic::pod_value_to_bool;
} else if (to_type == type_of<int8_t>()) {
str_ += runtime::intrinsic::pod_value_to_int8;
} else if (to_type == type_of<int16_t>()) {
str_ += runtime::intrinsic::pod_value_to_int16;
} else if (to_type == type_of<int32_t>()) {
str_ += runtime::intrinsic::pod_value_to_int32;
} else if (to_type == type_of<int64_t>()) {
str_ += runtime::intrinsic::pod_value_to_int64;
} else if (to_type == type_of<uint8_t>()) {
str_ += runtime::intrinsic::pod_value_to_uint8;
} else if (to_type == type_of<uint16_t>()) {
str_ += runtime::intrinsic::pod_value_to_uint16;
} else if (to_type == type_of<uint32_t>()) {
str_ += runtime::intrinsic::pod_value_to_uint32;
} else if (to_type == type_of<uint64_t>()) {
str_ += runtime::intrinsic::pod_value_to_uint64;
} else if (to_type == type_of<void *>()) {
str_ += runtime::intrinsic::pod_value_to_void_p;
} else if (to_type == type_of<cinn_buffer_t *>()) {
str_ += runtime::intrinsic::pod_value_to_buffer_p;
} else {
LOG(FATAL) << "Not supported type: " << to_type;
}
str_ += "(";
IrPrinter::Visit(op->pod_value_ptr);
str_ += ")";
}
void CodeGenC::Visit(const ir::intrinsics::BufferCreate *op) {
const ir::_Buffer_ *buffer_arg = op->buffer.as_buffer();
CHECK(buffer_arg);
str_ += runtime::intrinsic::buffer_create;
str_ += "(";
PrintCastExpr("cinn_device_kind_t", Expr(buffer_arg->target.runtime_arch()));
str_ += "/*target*/, ";
PrintRuntimeType(runtime::ToRuntimeType(buffer_arg->dtype.ElementOf()));
str_ += ", ";
PrintShape(buffer_arg->shape);
if (buffer_arg->data_alignment > 0) {
str_ += ", ";
str_ += std::to_string(buffer_arg->data_alignment);
str_ += "/*align*/";
}
str_ += ")";
}
void CodeGenC::Visit(const ir::intrinsics::GetAddr *op) {
if (op->data.as_buffer()) {
str_ += "&";
str_ += op->data.as_buffer()->name;
} else if (op->data.as_var()) {
str_ += "&";
str_ += op->data.as_var()->name;
} else {
str_ += "&(";
IrPrinter::Visit(op->data);
str_ += ")";
}
}
void CodeGenC::Visit(const ir::intrinsics::ArgsConstruct *op) {
str_ += runtime::intrinsic::args_construct_repr;
str_ += "(";
str_ += op->var->name;
str_ += ", ";
str_ += std::to_string(op->args.size());
str_ += ", ";
for (int i = 0; i < op->args.size() - 1; i++) {
IrPrinter::Visit(op->args[i]);
str_ += ", ";
}
if (!op->args.empty()) {
IrPrinter::Visit(op->args.back());
}
str_ += ")";
}
void CodeGenC::Visit(const ir::intrinsics::BuiltinIntrin *op) {
str_ += op->name;
str_ += "(";
if (!op->args.empty()) {
for (int i = 0; i < op->args.size() - 1; i++) {
IrPrinter::Visit(op->args[i]);
str_ += ", ";
}
IrPrinter::Visit(op->args.back());
}
str_ += ")";
}
std::string ReadWholeFile(const std::string &path) {
CHECK(!path.empty());
std::ifstream file(path);
CHECK(file.is_open()) << "Failed to open file: " << path;
std::stringstream ss;
ss << file.rdbuf();
return ss.str();
}
void CodeGenC::PrintBuiltinCodes() {
CHECK(!FLAGS_cinn_x86_builtin_code_root.empty())
<< "The flag cinn_x86_builtin_code_root should be set first";
const std::string x86_code_file = "_x86_builtin_source.cc";
auto source =
ReadWholeFile(FLAGS_cinn_x86_builtin_code_root + "/" + x86_code_file);
str_ += source;
str_ += "\n";
}
namespace detail {
Expr StridedRampBase(Expr e, int stride) {
auto *ramp_n = e.As<ir::Ramp>();
if (ramp_n) {
auto *iv = ramp_n->stride.As<ir::IntImm>();
if (iv && iv->value == stride) return ramp_n->base;
}
return Expr();
}
} // namespace detail
} // namespace backends
} // namespace cinn
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <gflags/gflags.h>
#include <string>
#include <vector>
#include "paddle/cinn/common/common.h"
#include "paddle/cinn/ir/intrinsic_ops.h"
#include "paddle/cinn/ir/ir.h"
#include "paddle/cinn/ir/lowered_func.h"
#include "paddle/cinn/ir/module.h"
#include "paddle/cinn/ir/utils/ir_printer.h"
#include "paddle/cinn/lang/packed_func.h"
#include "paddle/cinn/runtime/cinn_runtime.h"
namespace cinn {
namespace ir {
class Module;
} // namespace ir
namespace backends {
//! keyword of __restrict__.
extern const char* kCKeywordRestrict;
class CodeGenC : public ir::IrPrinter {
public:
enum class OutputKind {
CHeader, //! output the C header file.
CImpl, //! output the C implementation file.
};
explicit CodeGenC(Target target);
void Compile(const ir::Module& module, const Outputs& outputs);
virtual std::string Compile(const ir::Module& module, OutputKind output_kind);
//! Disable inline the builtin codes(too large) for simpler string comparison.
void SetInlineBuiltinCodes(bool x = true) { inline_builtin_codes_ = x; }
protected:
void Compile(const ir::LoweredFunc& function);
void GenerateHeaderFile(const ir::Module& module);
std::string GetTypeName(Type type);
std::string GetTypeRepr(Type type);
//! type cast, print like "int(x)"
// @{
void PrintCastExpr(const Type& type, Expr e);
void PrintCastExpr(const std::string& type, Expr e);
// @}
void PrintFunctionDeclaration(const ir::_LoweredFunc_* op) {
str_ += "void ";
str_ += op->name;
str_ += "(";
str_ += "void* _args, int32_t num_args";
str_ += ")";
}
void PrintShape(const std::vector<Expr>& shape,
char leftb = '{',
char rightb = '}');
virtual void PrintIncludes();
void PrintBuiltinCodes();
void PrintFileGuardOpen(const std::string& module_name);
void PrintFileGuardClose(const std::string& module_name);
//! Create the buffers in global scope(just creation without allocating them).
void PrintBufferCreation(const std::vector<ir::Buffer>& buffers);
void PrintBufferDestroy(const std::vector<ir::Buffer>& buffers);
void PrintRuntimeType(const cinn_type_t& type);
//! Print different kinds of Calls.
// @{
void PrintCallArgs(const ir::Call* call);
void PrintCall_buffer_malloc(const ir::Call* op);
void PrintCall_cinn_pod_value_to_(const ir::Call* op);
void PrintCall_get_address(const ir::Call* op);
void PrintCall_pod_values_to_array(const ir::Call* op);
// @}
#define __DEFINE_VISIT(op__) void Visit(const ir::op__* op) override;
NODETY_FORALL(__DEFINE_VISIT)
#undef __DEFINE_VISIT
#define __DEFINE_VISIT(op__) \
void Visit(const ir::intrinsics::op__* op) override;
INTRINSIC_KIND_FOR_EACH(__DEFINE_VISIT)
#undef __DEFINE_VISIT
void PrintFuncArg(const ir::Argument& arg);
void PrintStackVecType(Type type, int lanes);
friend class ExternFunctionEmitter;
protected:
Target target_;
std::stringstream ss_;
bool inline_builtin_codes_{true};
};
namespace detail {
Expr StridedRampBase(Expr e, int stride);
} // namespace detail
} // namespace backends
} // namespace cinn
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/backends/codegen_c.h"
#include <gtest/gtest.h>
#include <sstream>
#include <tuple>
#include "paddle/cinn/cinn.h"
#include "paddle/cinn/ir/ir.h"
#include "paddle/cinn/ir/module.h"
#include "paddle/cinn/lang/builtin.h"
#include "paddle/cinn/lang/compute.h"
#include "paddle/cinn/lang/lower.h"
#include "paddle/cinn/lang/placeholder.h"
#include "paddle/cinn/optim/ir_simplify.h"
#include "paddle/cinn/runtime/cpu/use_extern_funcs.h"
namespace cinn {
namespace backends {
using ir::Module;
using lang::Compute;
using lang::Lower;
using lang::Placeholder;
using utils::StringFormat;
using utils::Trim;
std::tuple<ir::Tensor, ir::Tensor, ir::Tensor, lang::Buffer> CreateTensor1() {
Expr M(100);
Expr N(20);
Placeholder<float> A("A", {M, N});
Placeholder<float> B("B", {M, N});
lang::Buffer C_buf(Float(32));
auto C = Compute(
{M, N}, [&](Var i, Var j) { return A(i, j) + B(i, j); }, "C");
C->Bind(C_buf);
return std::make_tuple(A, B, C, C_buf);
}
TEST(CodeGenC, module) {
ir::Tensor A, B, C;
lang::Buffer C_buf(Float(32));
std::tie(A, B, C, C_buf) = CreateTensor1();
LOG(INFO) << "C.body: " << C->get_compute_op()->body.front();
Target target;
target.arch = Target::Arch ::X86;
target.bits = Target::Bit ::k32;
target.os = Target::OS ::Linux;
Module::Builder builder("module1", target);
auto stages = CreateStages({A, B, C});
auto func = Lower("add1", stages, {A, B, C});
builder.AddFunction(func);
{
CodeGenC codegen(target);
codegen.SetInlineBuiltinCodes(false);
auto out = codegen.Compile(builder.Build(), CodeGenC::OutputKind::CImpl);
std::cout << "codegen C:" << std::endl << out << std::endl;
std::string target_str = R"ROC(
#include <cinn_runtime.h>
#include <stdio.h>
void add1(void* _args, int32_t num_args)
{
const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0]));
const cinn_buffer_t* _B = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1]));
cinn_buffer_t* _C = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[2]));
cinn_buffer_malloc((void*)(0), _C);
const float* A = ((const float*)(_A->memory));
const float* B = ((const float*)(_B->memory));
float* C = ((float*)(_C->memory));
for (int32_t i = 0; i < 100; i += 1) {
for (int32_t j = 0; j < 20; j += 1) {
C[((20 * i) + j)] = (A[((20 * i) + j)] + B[((20 * i) + j)]);
};
};
cinn_buffer_free((void*)(0), _C);
}
)ROC";
EXPECT_EQ(utils::Trim(target_str), utils::Trim(out));
}
{
CodeGenC compiler(target);
auto out = compiler.Compile(builder.Build(), CodeGenC::OutputKind::CHeader);
std::cout << "header:\n" << out << std::endl;
auto target_str = R"ROC(
#ifndef _MODULE1_CINN_H_
#define _MODULE1_CINN_H_
#include <cinn_runtime.h>
#include <stdio.h>
void add1(void* _args, int32_t num_args);
#endif // _MODULE1_CINN_H_
)ROC";
EXPECT_EQ(utils::Trim(out), utils::Trim(target_str));
}
{
CodeGenC compiler(target);
compiler.SetInlineBuiltinCodes(false);
Outputs outputs;
outputs = outputs.c_header("./generated_module1.h")
.c_source("./_generated_module1.cc");
compiler.Compile(builder.Build(), outputs);
}
}
TEST(CodeGenC, matmul) {
using namespace ir; // NOLINT
Context::Global().ResetNameId();
Placeholder<float> A("A", {Expr(100), Expr(20)});
Placeholder<float> B("B", {Expr(20), Expr(50)});
Target target{};
Module::Builder builder("module1", target);
// C = A * B
Var k(20, "k0");
Tensor C = Compute(
{Expr(100), Expr(50)},
[&](Var i, Var j) { return lang::ReduceSum(A(i, k) * B(k, j), {k}); },
"C");
auto stages = CreateStages({A, B, C});
// Code gen
auto func = Lower("matmul", stages, {A, B, C});
builder.AddFunction(func);
builder.AddBuffer(C->buffer);
{ // main
std::vector<lang::ReturnType> returns(
{lang::ReturnType{Float(32), C->shape, C->name}});
auto tensors = lang::CallLowered("matmul", {A, B}, returns);
auto C = tensors[0];
C->WithBuffer();
LOG(INFO) << "C.body: " << C->body();
auto stages = CreateStages({C});
auto f = Lower("main", stages, {A, B, C}, {});
std::cout << "f\n" << Expr(f) << std::endl;
builder.AddFunction(f);
}
CodeGenC codegen(target);
codegen.SetInlineBuiltinCodes(false);
auto out = codegen.Compile(builder.Build(), CodeGenC::OutputKind::CImpl);
std::cout << "codegen C:" << std::endl << out << std::endl;
auto tgt = R"ROC(
#include <cinn_runtime.h>
#include <stdio.h>
void matmul(void* _args, int32_t num_args)
{
const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0]));
const cinn_buffer_t* _B = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1]));
cinn_buffer_t* _C = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[2]));
cinn_buffer_malloc((void*)(0), _C);
const float* A = ((const float*)(_A->memory));
const float* B = ((const float*)(_B->memory));
float* C = ((float*)(_C->memory));
float* C__reduce_init = ((float*)(_C->memory));
for (int32_t i = 0; i < 100; i += 1) {
for (int32_t j = 0; j < 50; j += 1) {
C__reduce_init[((50 * i) + j)] = 0.00000000f;
for (int32_t k0 = 0; k0 < 20; k0 += 1) {
C[((50 * i) + j)] = (C[((50 * i) + j)] + (A[((20 * i) + k0)] * B[((50 * k0) + j)]));
};
};
};
cinn_buffer_free((void*)(0), _C);
}
void main(void* _args, int32_t num_args)
{
const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0]));
const cinn_buffer_t* _B = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1]));
cinn_buffer_t* _C = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[2]));
cinn_buffer_malloc((void*)(0), _C);
const float* A = ((const float*)(_A->memory));
const float* B = ((const float*)(_B->memory));
float* C = ((float*)(_C->memory));
cinn_pod_value_t _pod_val_;
buffer_p_to_cinn_pod_value(_A, &_pod_val_);
cinn_pod_value_t _pod_val__0;
buffer_p_to_cinn_pod_value(_B, &_pod_val__0);
cinn_pod_value_t _pod_val__1;
buffer_p_to_cinn_pod_value(_C, &_pod_val__1);
cinn_pod_value_t _pod_arr[3];
cinn_args_construct(_pod_arr, 3, &_pod_val_, &_pod_val__0, &_pod_val__1);
matmul(_pod_arr, 3);
cinn_buffer_free((void*)(0), _C);
}
)ROC";
ASSERT_EQ(Trim(tgt), Trim(out));
}
// This matches output of competitor.
TEST(CodeGenC, matmul_tile) {
using namespace ir; // NOLINT
Expr M(100);
Expr K(200);
Expr N(500);
Expr bn(32);
Placeholder<float> A("A", {M, K});
Placeholder<float> B("B", {K, N});
// C = A * B
Var k(K.as_int32(), "k0");
Tensor C_init = Compute(
{M, N}, [&](Var i, Var j) { return Expr(0.f); }, "C_init");
Tensor C = Compute(
{M, N},
[&](Var i, Var j) { return lang::ReduceSum(A(i, k) * B(k, j), {k}); },
"C");
auto stages = CreateStages({C, C_init});
stages[C]->ShareBufferWith(stages[C_init]);
{
auto _i_outer_i_inner_j_outer_j_inner_ =
stages[C_init]->Tile(0, 1, bn.as_int32(), bn.as_int32()); // NOLINT
auto &i_outer = std::get<0>(_i_outer_i_inner_j_outer_j_inner_);
auto &i_inner = std::get<1>(_i_outer_i_inner_j_outer_j_inner_);
auto &j_outer = std::get<2>(_i_outer_i_inner_j_outer_j_inner_);
auto &j_inner = std::get<3>(_i_outer_i_inner_j_outer_j_inner_);
stages[C_init]->Reorder({i_outer, j_outer, i_inner, j_inner});
}
{
auto _i_outer_i_inner_j_outer_j_inner_ =
stages[C]->Tile(0, 1, bn.as_int32(), bn.as_int32()); // NOLINT
auto &i_outer = std::get<0>(_i_outer_i_inner_j_outer_j_inner_);
auto &i_inner = std::get<1>(_i_outer_i_inner_j_outer_j_inner_);
auto &j_outer = std::get<2>(_i_outer_i_inner_j_outer_j_inner_);
auto &j_inner = std::get<3>(_i_outer_i_inner_j_outer_j_inner_);
auto _k_outer_k_inner_ =
stages[C]->Split(poly::Iterator("k0"), 4); // NOLINT
auto &k_outer = std::get<0>(_k_outer_k_inner_);
auto &k_inner = std::get<1>(_k_outer_k_inner_);
stages[C]->Reorder({i_outer, j_outer, i_inner, j_inner, k_outer, k_inner});
}
stages[C_init]->ComputeAtSchedule(
stages[C], 3, poly::Stage::kComputeAtBefore);
// Code gen
auto func = Lower("matmul", stages, {A, B, C});
Target target = common::DefaultHostTarget();
Module::Builder builder("module1", target);
builder.AddFunction(func);
builder.AddBuffer(C_init->buffer);
CodeGenC codegen(target);
codegen.SetInlineBuiltinCodes(false);
auto out = codegen.Compile(builder.Build(), CodeGenC::OutputKind::CImpl);
std::cout << "codegen C:" << std::endl << out << std::endl;
auto target_out = R"ROC(
#include <cinn_runtime.h>
#include <stdio.h>
void matmul(void* _args, int32_t num_args)
{
const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0]));
const cinn_buffer_t* _B = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1]));
cinn_buffer_t* _C = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[2]));
cinn_buffer_malloc((void*)(0), _C);
const float* A = ((const float*)(_A->memory));
const float* B = ((const float*)(_B->memory));
float* C = ((float*)(_C->memory));
float* C__reduce_init = ((float*)(_C->memory));
float* C_init = ((float*)(_C->memory));
for (int32_t i_outer = 0; i_outer < 4; i_outer += 1) {
for (int32_t j_outer = 0; j_outer < 16; j_outer += 1) {
for (int32_t i_inner = 0; i_inner < cinn_min(32, (100 + (-32 * i_outer))); i_inner += 1) {
for (int32_t j_inner = 0; j_inner < cinn_min(32, (500 + (-32 * j_outer))); j_inner += 1) {
C__reduce_init[((500 * i_inner) + ((16000 * i_outer) + ((32 * j_outer) + j_inner)))] = 0.00000000f;
C_init[((500 * i_inner) + ((16000 * i_outer) + ((32 * j_outer) + j_inner)))] = 0.00000000f;
for (int32_t k0_outer = 0; k0_outer < 50; k0_outer += 1) {
for (int32_t k0_inner = 0; k0_inner < 4; k0_inner += 1) {
C[((500 * i_inner) + ((16000 * i_outer) + ((32 * j_outer) + j_inner)))] = fma(A[((200 * i_inner) + ((6400 * i_outer) + ((4 * k0_outer) + k0_inner)))], B[((32 * j_outer) + ((500 * k0_inner) + ((2000 * k0_outer) + j_inner)))], C[((500 * i_inner) + ((16000 * i_outer) + ((32 * j_outer) + j_inner)))]);
};
};
};
};
};
};
cinn_buffer_free((void*)(0), _C);
}
)ROC";
ASSERT_EQ(Trim(target_out), Trim(out));
}
TEST(CodeGenC, matmul_packed) {
Expr M(100);
Expr K(200);
Expr N(500);
Expr bn(32);
Placeholder<float> A("A", {M, K});
Placeholder<float> B("B", {K, N});
// TODO(Superjomn) Make sure the domain works.
Var k(K.as_int32(), "k0");
auto packedB = Compute(
{N / bn, K, bn},
[&](Expr x, Expr y, Expr z) { return B(y, x * bn + z); },
"PackedB");
auto C = Compute(
{M, N},
[&](Expr i, Expr j) {
return ReduceSum(A(i, k) * packedB(j / bn, k, j % bn), {k});
},
"C");
auto stages = CreateStages({packedB, C});
{
auto _i_outer_i_inner_j_outer_j_inner_ =
stages[C]->Tile(0, 1, bn.as_int32(), bn.as_int32());
auto &i_outer = std::get<0>(_i_outer_i_inner_j_outer_j_inner_);
auto &i_inner = std::get<1>(_i_outer_i_inner_j_outer_j_inner_);
auto &j_outer = std::get<2>(_i_outer_i_inner_j_outer_j_inner_);
auto &j_inner = std::get<3>(_i_outer_i_inner_j_outer_j_inner_);
auto _k_outer_k_inner_ = stages[C]->Split(poly::Iterator("k0"), 4);
auto &k_outer = std::get<0>(_k_outer_k_inner_);
auto &k_inner = std::get<1>(_k_outer_k_inner_);
stages[C]->Reorder({i_outer, j_outer, i_inner, j_inner, k_outer, k_inner});
}
// Code gen
auto func = Lower("matmul_with_packing", stages, {A, B, packedB, C});
Target target = common::DefaultHostTarget();
Module::Builder builder("module1", target);
builder.AddFunction(func);
builder.AddBuffer(C->buffer);
builder.AddBuffer(packedB->buffer);
CodeGenC codegen(target);
codegen.SetInlineBuiltinCodes(false);
auto out = codegen.Compile(builder.Build(), CodeGenC::OutputKind::CImpl);
std::cout << "codegen C:" << std::endl << out << std::endl;
auto target_out = R"ROC(
#include <cinn_runtime.h>
#include <stdio.h>
void matmul_with_packing(void* _args, int32_t num_args)
{
const cinn_buffer_t* _A = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[0]));
const cinn_buffer_t* _B = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[1]));
cinn_buffer_t* _PackedB = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[2]));
cinn_buffer_t* _C = cinn_pod_value_to_buffer_p(&(((cinn_pod_value_t*)(_args))[3]));
cinn_buffer_malloc((void*)(0), _PackedB);
cinn_buffer_malloc((void*)(0), _C);
const float* A = ((const float*)(_A->memory));
const float* B = ((const float*)(_B->memory));
float* C = ((float*)(_C->memory));
float* C__reduce_init = ((float*)(_C->memory));
float* PackedB = ((float*)(_PackedB->memory));
for (int32_t i = 0; i < 15; i += 1) {
for (int32_t j = 0; j < 200; j += 1) {
for (int32_t k = 0; k < 32; k += 1) {
PackedB[((6400 * i) + ((32 * j) + k))] = B[((32 * i) + ((500 * j) + k))];
};
};
};
for (int32_t i_outer = 0; i_outer < 4; i_outer += 1) {
for (int32_t j_outer = 0; j_outer < 16; j_outer += 1) {
for (int32_t i_inner = 0; i_inner < cinn_min(32, (100 + (-32 * i_outer))); i_inner += 1) {
for (int32_t j_inner = 0; j_inner < cinn_min(32, (500 + (-32 * j_outer))); j_inner += 1) {
C__reduce_init[((500 * i_inner) + ((16000 * i_outer) + ((32 * j_outer) + j_inner)))] = 0;
for (int32_t k0_outer = 0; k0_outer < 50; k0_outer += 1) {
for (int32_t k0_inner = 0; k0_inner < 4; k0_inner += 1) {
C[((500 * i_inner) + ((16000 * i_outer) + ((32 * j_outer) + j_inner)))] = fma(A[((200 * i_inner) + ((6400 * i_outer) + ((4 * k0_outer) + k0_inner)))], PackedB[((6400 * (j_inner / 32)) + ((j_inner & 31) + ((6400 * j_outer) + ((32 * k0_inner) + (128 * k0_outer)))))], C[((500 * i_inner) + ((16000 * i_outer) + ((32 * j_outer) + j_inner)))]);
};
};
};
};
};
};
cinn_buffer_free((void*)(0), _PackedB);
cinn_buffer_free((void*)(0), _C);
}
)ROC";
// ToDo @haoze @wangyue Check Codegen
// ASSERT_EQ(utils::Trim(target_out), utils::Trim(out));
}
TEST(CodeGenC, call_extern) {
Expr M(100);
Placeholder<float> x("x", {M});
ir::Tensor y = Compute(
{M},
[=](Var i) -> Expr { return lang::CallExtern("tanh", {x(i)}); },
"y");
auto stages = CreateStages({y});
auto yexpr = Lower("yy", stages, {y});
Module::Builder builder("module0", common::DefaultHostTarget());
builder.AddFunction(yexpr);
CodeGenC codegen(common::DefaultHostTarget());
codegen.SetInlineBuiltinCodes(false);
auto out = codegen.Compile(builder.Build(), CodeGenC::OutputKind::CImpl);
std::cout << "codegen C:" << std::endl << out << std::endl;
}
} // namespace backends
} // namespace cinn
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/backends/codegen_c_x86.h"
namespace cinn {
namespace backends {
void CodeGenCX86::Visit(const ir::Add *op) {
VisitBinaryOp(op, op->a(), op->b(), "add");
}
void CodeGenCX86::Visit(const ir::Sub *op) {
VisitBinaryOp(op, op->a(), op->b(), "sub");
}
void CodeGenCX86::Visit(const ir::Mul *op) {
VisitBinaryOp(op, op->a(), op->b(), "mul");
}
void CodeGenCX86::Visit(const ir::Div *op) {
VisitBinaryOp(op, op->a(), op->b(), "div");
}
void CodeGenCX86::Visit(const ir::Load *op) {
Expr dense_strided_ramp = detail::StridedRampBase(op->index(), 1);
if (dense_strided_ramp.defined()) { // Loading a continuous Ramp address.
CHECK(op->type().is_vector());
int bits = op->type().bits() * op->type().lanes();
if (SupportsAVX512() && bits == 512) {
str_ += "cinn_avx512_load(";
PrintAbsAddr(op);
str_ += ")";
} else if (SupportsAVX256() && bits == 256) {
str_ += "cinn_avx256_load(";
PrintAbsAddr(op);
str_ += ")";
} else {
CodeGenC::Visit(op);
}
} else {
CodeGenC::Visit(op);
}
}
void CodeGenCX86::Visit(const ir::Broadcast *op) {
CHECK_GT(op->type().lanes(), 1);
int bits = op->type().bits() * op->type().lanes();
if (SupportsAVX512() && bits == 512) {
str_ += "cinn_avx512_set1(";
PrintCastExpr(op->value.type().ElementOf(), op->value);
str_ += ")";
} else if (SupportsAVX256() && bits == 256) {
str_ += "cinn_avx256_set1(";
PrintCastExpr(op->value.type().ElementOf(), op->value);
str_ += ")";
} else {
CodeGenC::Visit(op);
}
}
void CodeGenCX86::Visit(const ir::Store *op) {
if (op->type().lanes() == 1) {
CodeGenC::Visit(op);
return;
}
int bits = op->type().bits() * op->type().lanes();
if (SupportsAVX512() && bits == 512) {
str_ += "cinn_avx512_store(";
PrintAbsAddr(op);
str_ += ", ";
IrPrinter::Visit(op->value);
str_ += ")";
} else if (SupportsAVX256() && bits == 256) {
str_ += "cinn_avx256_store(";
PrintAbsAddr(op);
str_ += ", ";
IrPrinter::Visit(op->value);
str_ += ")";
} else {
CodeGenC::Visit(op);
}
}
void CodeGenCX86::PrintVecInputArgument(const Expr *op) {
int bits = op->type().bits() * op->type().lanes();
auto *broadcast_n = op->As<ir::Broadcast>();
if (op->type().lanes() == 1 || broadcast_n) {
Expr value = op->type().lanes() == 1 ? *op : broadcast_n->value;
if (SupportsAVX512()) {
str_ += "cinn_avx512_set1(";
IrPrinter::Visit(value);
str_ += ")";
} else if (SupportsAVX256()) {
str_ += "cinn_avx256_set1(";
IrPrinter::Visit(value);
str_ += ")";
} else {
CINN_NOT_IMPLEMENTED
}
} else {
IrPrinter::Visit(*op);
}
}
void CodeGenCX86::Visit(const ir::intrinsics::BuiltinIntrin *op) {
if (op->type().lanes() == 1) {
CodeGenC::Visit(op);
return;
}
int bits = op->type().bits() * op->type().lanes();
if (SupportsAVX512() && bits == 512) {
str_ += "cinn_avx512_";
str_ += op->name;
str_ += "(";
if (!op->args.empty()) {
for (int i = 0; i < op->args.size() - 1; i++) {
PrintVecInputArgument(&op->args[i]);
str_ += ", ";
}
IrPrinter::Visit(op->args.back());
}
str_ += ")";
} else if (SupportsAVX256() && bits == 256) {
str_ += "cinn_avx256_";
str_ += op->name;
str_ += "(";
if (!op->args.empty()) {
for (int i = 0; i < op->args.size() - 1; i++) {
PrintVecInputArgument(&op->args[i]);
str_ += ", ";
}
PrintVecInputArgument(&op->args.back());
}
str_ += ")";
} else if (bits == 128) {
str_ += "cinn_avx128_";
str_ += op->name;
str_ += "(";
if (!op->args.empty()) {
for (int i = 0; i < op->args.size() - 1; i++) {
PrintVecInputArgument(&op->args[i]);
str_ += ", ";
}
PrintVecInputArgument(&op->args.back());
}
str_ += ")";
} else {
CodeGenC::Visit(op);
}
}
} // namespace backends
} // namespace cinn
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include "paddle/cinn/backends/codegen_c.h"
#include "paddle/cinn/ir/intrinsic_ops.h"
namespace cinn {
namespace backends {
/**
* C code generation with X86 instruction or math library support.
*/
class CodeGenCX86 : public CodeGenC {
public:
//! The X86 CPU supports some following features. We use SSE or AVX to
//! accelerate the basic operations if forloop is vectorized.
enum class Feature : int {
None = 0,
SSE = 1, //! support SSE instruction set.
AVX256 = 1 << 1, // ! support AVX256 instruction set.
AVX512 = 1 << 2, // ! support AVX512 instruction set.
BLAS = 1 << 3, // ! support BLAS library.
};
Feature feature{Feature::None};
/**
* constructor.
* @param target The device.
* @param features Features it supported.
*/
CodeGenCX86(Target target, Feature feature)
: CodeGenC(target), feature(feature) {}
protected:
void Visit(const ir::Add *op) override;
void Visit(const ir::Sub *op) override;
void Visit(const ir::Mul *op) override;
void Visit(const ir::Div *op) override;
void Visit(const ir::Mod *op) override { CodeGenC::Visit(op); }
void Visit(const ir::EQ *op) override { CodeGenC::Visit(op); }
void Visit(const ir::NE *op) override { CodeGenC::Visit(op); }
void Visit(const ir::LT *op) override { CodeGenC::Visit(op); }
void Visit(const ir::LE *op) override { CodeGenC::Visit(op); }
void Visit(const ir::GT *op) override { CodeGenC::Visit(op); }
void Visit(const ir::GE *op) override { CodeGenC::Visit(op); }
void Visit(const ir::And *op) override { CodeGenC::Visit(op); }
void Visit(const ir::Or *op) override { CodeGenC::Visit(op); }
void Visit(const ir::Load *op) override;
void Visit(const ir::Store *op) override;
void Visit(const ir::Broadcast *op) override;
void Visit(const ir::intrinsics::BuiltinIntrin *op);
//! Check the features.
// @{
bool SupportsSSE() {
return static_cast<int>(feature) & static_cast<int>(Feature::SSE);
}
bool SupportsAVX256() {
return static_cast<int>(feature) & static_cast<int>(Feature::AVX256);
}
bool SupportsAVX512() {
return static_cast<int>(feature) & static_cast<int>(Feature::AVX512);
}
bool SupportsBLAS() {
return static_cast<int>(feature) & static_cast<int>(Feature::BLAS);
}
// @}
//! Print (and prepare) a argument in vectorize type, for example:
// 3. -> set1(3.)
// a[i:j] -> load_ps(a+i)
void PrintVecInputArgument(const Expr *op);
//! The output argument, such as the destination for Load.
void PrintVecOutputArgument(const Expr *op);
template <typename Op>
void PrintAbsAddr(const Op *op) {
str_ += op->tensor.template As<ir::_Tensor_>()->name;
str_ += " + ";
auto index = op->index();
auto *ramp_n = index.template As<ir::Ramp>();
if (ramp_n) {
CHECK(!ramp_n->base.template As<ir::Ramp>())
<< "base of a Ramp node should not be Ramp type";
IrPrinter::Visit(ramp_n->base);
} else {
IrPrinter::Visit(op->index());
}
}
template <typename Op>
void VisitBinaryOp(const Op *op, Expr a, Expr b, const std::string &op_repr);
};
template <typename Op>
void CodeGenCX86::VisitBinaryOp(const Op *op,
Expr a,
Expr b,
const std::string &op_repr) {
CHECK_EQ(a.type(), b.type()) << " a is : " << a << ", and b is : " << b
<< ". op_repr is : " << op_repr;
// scalar.
if (a.type().lanes() == 1) {
CodeGenC::Visit(op);
return;
}
// TODO(Superjomn) Consider support BLAS.
int bits = a.type().bits() * a.type().lanes();
if (SupportsAVX512() && bits == 512) {
str_ += "cinn_avx512_";
str_ += op_repr;
str_ += "(";
PrintVecInputArgument(&a);
str_ += ", ";
PrintVecInputArgument(&b);
str_ += ")";
} else if (SupportsAVX256() && bits == 256) {
str_ += "cinn_avx256_";
str_ += op_repr;
str_ += "(";
PrintVecInputArgument(&a);
str_ += ", ";
PrintVecInputArgument(&b);
str_ += ")";
} else {
CodeGenC::Visit(op);
}
}
} // namespace backends
} // namespace cinn
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/backends/codegen_c_x86.h"
#include <gtest/gtest.h>
#include "paddle/cinn/cinn.h"
#include "paddle/cinn/ir/module.h"
#include "paddle/cinn/lang/builtin.h"
#include "paddle/cinn/lang/compute.h"
#include "paddle/cinn/lang/lower.h"
#include "paddle/cinn/lang/placeholder.h"
#include "paddle/cinn/optim/ir_simplify.h"
#include "paddle/cinn/optim/transform_polyfor_to_for.h"
#include "paddle/cinn/optim/vectorize_loops.h"
namespace cinn {
namespace backends {
TEST(CodeGenCX86, basic) {
// create two forloops, check only one forloop is marked Vectorize.
Context::info_rgt().Clear();
using namespace ir; // NOLINT
const int M = 100;
const int K = 200;
const int N = 500;
const int bn = 32;
Target target;
target.arch = Target::Arch ::X86;
target.bits = Target::Bit ::k32;
target.os = Target::OS ::Linux;
Placeholder<float> A("A", {M, N});
Placeholder<float> B("B", {M, N});
// C = A * B
Tensor C = Compute(
{Expr(M), Expr(N)}, [&](Var i, Var j) { return A(i, j) * B(i, j); }, "C");
Tensor D = Compute(
{Expr(M), Expr(N)}, [&](Var i, Var j) { return A(i, j) * B(i, j); }, "D");
auto stages = CreateStages({C, D});
// vectorize C, not D
stages[C]->Vectorize(1, 16);
stages[C]->Unroll(1);
auto func = Lower("matmul", stages, {A, B, C, D});
std::cout << "before optim\n" << func->body << std::endl;
ir::Module::Builder builder("module1", target);
builder.AddFunction(func);
CodeGenCX86 codegen(target, CodeGenCX86::Feature::AVX512);
codegen.SetInlineBuiltinCodes(false);
auto out = codegen.Compile(builder.Build(), CodeGenC::OutputKind::CImpl);
std::cout << "out:\n" << out;
}
} // namespace backends
} // namespace cinn
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/backends/codegen_cuda_dev.h"
#include <glog/logging.h>
#include <paddle/cinn/utils/string.h>
#include <fstream>
#include <set>
#include <unordered_set>
#include "paddle/cinn/ir/op/ir_operators.h"
#include "paddle/cinn/ir/utils/ir_verify.h"
#include "paddle/cinn/optim/ir_simplify.h"
#include "paddle/cinn/optim/remove_nested_block.h"
namespace cinn {
namespace backends {
const std::string CodeGenCUDA_Dev::source_header_ = // NOLINT
R"(#include <cstdint>
#define CINN_WITH_CUDA
#include "bfloat16.h"
#include "float16.h"
using cinn::common::bfloat16;
using cinn::common::float16;
using cinn::common::half4;
using cinn::common::half8;
using cinn::common::float8;
#include "cinn_cuda_runtime_source.cuh"
)";
const std::string &CodeGenCUDA_Dev::GetSourceHeader() { return source_header_; }
CodeGenCUDA_Dev::CodeGenCUDA_Dev(Target target) : CodeGenC(target) {}
std::string CodeGenCUDA_Dev::Compile(const ir::Module &module, bool for_nvrtc) {
for_nvrtc_ = for_nvrtc;
auto source = Compile(module, OutputKind::CImpl);
return source;
}
void CodeGenCUDA_Dev::Compile(const ir::Module &module,
const Outputs &outputs) {
ir::IrVerify(Expr(module));
CodeGenC::inline_builtin_codes_ = false;
if (!outputs.c_header_name.empty()) {
auto source = Compile(module, OutputKind::CHeader);
str_ = "";
std::ofstream file(outputs.c_header_name);
CHECK(file.is_open()) << "failed to open file " << outputs.c_header_name;
file << source;
file.close();
LOG(WARNING) << "Output C header to file " << outputs.c_header_name;
}
if (!outputs.cuda_source_name.empty()) {
auto source = Compile(module, OutputKind::CImpl);
str_ = "";
std::ofstream file(outputs.cuda_source_name);
CHECK(file.is_open()) << "failed to open file " << outputs.cuda_source_name;
file << source;
file.close();
LOG(WARNING) << "Output C source to file " << outputs.cuda_source_name;
}
}
void CodeGenCUDA_Dev::Compile(const ir::LoweredFunc &func) {
IrPrinter::Visit(Expr(func));
}
std::vector<Expr> CodeGenCUDA_Dev::GenerateBufferAliasExprs(
const ir::_LoweredFunc_ *op, const std::vector<ir::Buffer> &temp_buffers) {
std::set<ir::Buffer> temp_buffer_set(temp_buffers.begin(),
temp_buffers.end());
// prepare temp buffer alias
std::vector<Expr> buffer_alias;
auto tensors = ir::CollectIRNodes(op->body, [&](const Expr *x) {
return x->as_tensor() && x->as_tensor()->buffer.defined() &&
temp_buffer_set.count(x->as_tensor()->buffer);
});
// unique tensors
std::set<ir::Tensor> unique_tensors;
for (auto &e : tensors) {
unique_tensors.insert(e.as_tensor_ref());
}
for (auto &t : unique_tensors) {
auto data_type = t->type();
auto data_ptr_type = data_type;
data_ptr_type.set_cpp_handle();
Var t_var(t->name, data_ptr_type);
Var buf_var(t->buffer->name, data_ptr_type);
buffer_alias.push_back(ir::Let::Make(t_var, buf_var));
}
return buffer_alias;
}
void CodeGenCUDA_Dev::Visit(const ir::_LoweredFunc_ *op) {
// clear names valid within scope when enter a new function
vectorized_tensor_names_.clear();
str_ += "__global__\n";
PrintFunctionDeclaration(op);
str_ += "\n";
DoIndent();
std::vector<Expr> new_body;
auto alloca_temp_buffers = op->PrepareAllocTempBufferExprs();
auto temp_buffer_alias = GenerateBufferAliasExprs(op, op->temp_bufs);
auto alis_var_exprs = op->CudaAliasVarExprs();
#define APPEND_TO_NEW_BODY(field__) \
new_body.insert(std::end(new_body), std::begin(field__), std::end(field__));
APPEND_TO_NEW_BODY(alloca_temp_buffers)
APPEND_TO_NEW_BODY(temp_buffer_alias)
APPEND_TO_NEW_BODY(alis_var_exprs)
new_body.push_back(op->body);
Expr func_body = ir::Block::Make(new_body);
optim::RemoveNestedBlock(&func_body);
// Make sure that the function's body is wrapped by a block
if (!func_body.As<ir::Block>()) {
func_body = ir::Block::Make({func_body});
}
IrPrinter::Visit(func_body);
}
void CodeGenCUDA_Dev::Visit(const ir::_Var_ *op) {
if (utils::Startswith(op->name, "threadIdx") ||
utils::Startswith(op->name, "blockIdx")) {
str_ += "(int)";
str_ += op->name;
} else {
str_ += op->name;
}
}
void CodeGenCUDA_Dev::Visit(const ir::Alloc *op) {
CHECK(op->destination.as_buffer());
PrintTempBufferCreation(op->destination.as_buffer_ref());
}
void CodeGenCUDA_Dev::Visit(const ir::Min *op) {
str_ += "min(";
IrPrinter::Visit(op->a());
str_ += ", ";
IrPrinter::Visit(op->b());
str_ += ")";
}
void CodeGenCUDA_Dev::Visit(const ir::Max *op) {
str_ += "max(";
IrPrinter::Visit(op->a());
str_ += ", ";
IrPrinter::Visit(op->b());
str_ += ")";
}
void CodeGenCUDA_Dev::PrintFunctionDeclaration(const ir::_LoweredFunc_ *op) {
str_ += "void ";
if (op->cuda_axis_info.valid()) {
int thread_num = 1;
for (int i = 0; i < 3; i++) {
thread_num *= op->cuda_axis_info.block_dim(i);
}
str_ += "__launch_bounds__(";
str_ += std::to_string(thread_num);
str_ += ") ";
}
str_ += op->name;
str_ += "(";
for (int i = 0; i < op->args.size() - 1; i++) {
auto &arg = op->args[i];
PrintFuncArg(arg);
str_ += ", ";
}
if (!op->args.empty()) {
PrintFuncArg(op->args.back());
}
str_ += ")";
}
void CodeGenCUDA_Dev::PrintFuncArg(const ir::Argument &arg) {
if (arg.is_buffer()) {
// In CUDA kernel, only primitive type is supported, so we replace the
// buffer with T*j
if (arg.is_input()) str_ += "const ";
str_ += GetTypeRepr(arg.buffer_arg()->dtype);
str_ += "* ";
str_ += kCKeywordRestrict;
str_ += " ";
str_ += ir::BufferGetTensorName(arg.buffer_arg().As<ir::_Buffer_>());
} else if (arg.is_var()) {
if (arg.var_arg()->type().is_cpp_handle()) {
str_ += kCKeywordRestrict;
}
str_ += GetTypeRepr(arg.type());
str_ += " ";
str_ += arg.name();
} else {
CINN_NOT_IMPLEMENTED
}
}
void CodeGenCUDA_Dev::PrintBuiltinCodes() {}
std::string CodeGenCUDA_Dev::Compile(const ir::Module &module,
CodeGenC::OutputKind output_kind) {
if (output_kind == OutputKind::CHeader) {
GenerateHeaderFile(module);
} else if (output_kind == OutputKind::CImpl) {
PrintIncludes();
if (for_nvrtc_) {
str_ += "\nextern \"C\" {\n\n";
}
PrintBuiltinCodes();
for (auto &func : module.functions()) {
Compile(func);
}
} else {
LOG(FATAL) << "Not supported OutputKind";
}
if (for_nvrtc_) {
str_ += "\n\n}";
}
return str_;
}
void CodeGenCUDA_Dev::PrintIncludes() { str_ += GetSourceHeader(); }
void CodeGenCUDA_Dev::PrintTempBufferCreation(const ir::Buffer &buffer) {
CHECK_NE(buffer->type(), Void());
auto print_gpu_memory = [&](const std::string &mark) {
str_ += mark;
str_ += GetTypeRepr(buffer->dtype);
str_ += " ";
str_ += buffer->name;
str_ += " ";
str_ += "[ ";
Expr buffer_size(1);
for (int i = 0; i < buffer->shape.size(); i++) {
buffer_size = buffer_size * buffer->shape[i];
}
optim::Simplify(&buffer_size);
IrPrinter::Visit(buffer_size);
str_ += " ]";
};
switch (buffer->memory_type) {
case ir::MemoryType::GPUShared:
print_gpu_memory("__shared__ ");
break;
case ir::MemoryType::GPULocal:
print_gpu_memory("");
break;
default:
LOG(FATAL) << "CUDA device codegen not support memory " << buffer->name
<< ", type " << buffer->memory_type;
}
}
void CodeGenCUDA_Dev::Visit(const ir::Call *op) {
str_ += op->name;
str_ += "(";
if (!op->read_args.empty()) {
for (int i = 0; i < op->read_args.size() - 1; i++) {
auto &arg = op->read_args[i];
if (arg.as_tensor()) {
str_ += arg.as_tensor()->name;
str_ += ", ";
} else {
IrPrinter::Visit(arg);
str_ += ", ";
}
}
if (op->read_args.back().as_tensor()) {
str_ += op->read_args.back().as_tensor()->name;
} else {
IrPrinter::Visit(op->read_args.back());
}
}
if (!op->write_args.empty()) {
str_ += ", ";
for (int i = 0; i < op->write_args.size() - 1; i++) {
auto &arg = op->write_args[i];
if (arg.as_tensor()) {
str_ += arg.as_tensor()->name;
str_ += ", ";
} else {
IrPrinter::Visit(arg);
str_ += ", ";
}
}
if (op->write_args.back().as_tensor()) {
str_ += op->write_args.back().as_tensor()->name;
} else {
IrPrinter::Visit(op->write_args.back());
}
}
str_ += ")";
}
void CodeGenCUDA_Dev::Visit(const ir::Let *op) {
CHECK(op->type().valid());
// identify vectorized tensors by checking their dtypes are customized_type
// with customized_type::kcuda_builtin_vector_t prefix, and save their names
if (op->type().is_customized() &&
utils::Startswith(op->type().customized_type(),
common::customized_type::kcuda_builtin_vector_t)) {
str_ += GetTypeRepr(op->type());
if (op->type().is_cpp_handle()) {
str_ += " ";
str_ += kCKeywordRestrict;
}
str_ += " ";
IrPrinter::Visit(op->symbol);
vectorized_tensor_names_.insert(utils::GetStreamCnt(op->symbol));
// skip "=0" in "half8 temp = 0;" sincethe operator= of half8 may not
// overloaded.
if (op->body.As<ir::IntImm>() && op->body.As<ir::IntImm>()->value == 0) {
return;
}
str_ += " = ";
IrPrinter::Visit(op->body);
} else {
CodeGenC::Visit(op);
}
}
bool CodeGenCUDA_Dev::PrintBuiltinVectorAccess(const ir::LoadStoreAddrMnger *op,
ir::Expr index_expr,
bool is_store) {
static constexpr char index2suffix[8] = {
'x', 'y', 'z', 'w', 'v', 'u', 't', 's'};
// addr of op should be a place of tensor and the index is simple int number
if (!op->is_addr_tensor() || !index_expr.As<ir::IntImm>()) {
return false;
}
auto *tensor = op->tensor.As<ir::_Tensor_>();
CHECK(tensor);
// identify vectorized tensors by their names
if (!vectorized_tensor_names_.count(tensor->name)) {
return false;
}
// the index can't exceed the range of cuda built-in vector type
int index = index_expr.As<ir::IntImm>()->value;
if (index < 0 || index >= 8) {
return false;
}
if (is_store && tensor->type().is_cpp_handle()) {
str_ += tensor->name;
str_ += "[";
str_ += std::to_string(index);
str_ += "]";
} else {
str_ += tensor->name;
str_ += (tensor->type().is_cpp_handle() ? "->" : ".");
str_ += index2suffix[index];
}
return true;
}
void CodeGenCUDA_Dev::Visit(const ir::Load *op) {
// overload this visit function to especially deal with the case when it
// accesses element at a cuda built-in vector, others still resolve to
// CodeGenC
if (!PrintBuiltinVectorAccess(op, op->index(), false)) {
CodeGenC::Visit(op);
}
}
void CodeGenCUDA_Dev::Visit(const ir::Store *op) {
// overload this visit function to especially deal with the case when it
// accesses element at a cuda built-in vector, others still resolve to
// CodeGenC
if (PrintBuiltinVectorAccess(op, op->index(), true)) {
str_ += " = ";
IrPrinter::Visit(op->value);
} else {
CodeGenC::Visit(op);
}
}
} // namespace backends
} // namespace cinn
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <unordered_set>
#include <vector>
#include "paddle/cinn/backends/codegen_c.h"
#include "paddle/cinn/common/common.h"
#include "paddle/cinn/ir/ir.h"
#include "paddle/cinn/ir/lowered_func.h"
#include "paddle/cinn/ir/module.h"
#include "paddle/cinn/ir/utils/ir_printer.h"
#include "paddle/cinn/lang/packed_func.h"
#include "paddle/cinn/runtime/cinn_runtime.h"
namespace cinn::ir {
class Module;
} // namespace cinn::ir
namespace cinn {
namespace backends {
/**
* CUDA device code generator.
*
* It generates the device function, e.g, the function called "myadd" will have
* a __global__ functon called "myadd_kernel", different from codegen_c, the
* declaration of the "myadd_kernel" function has an expanded argument list,
* which finally similar to `__global__ void myadd(float* __restrict__ A, float*
* __restrict__ B, int n);`
*/
class CodeGenCUDA_Dev : public CodeGenC {
public:
explicit CodeGenCUDA_Dev(Target target);
/**
* Compile the \p module to \p outputs.
*/
void Compile(const ir::Module& module, const Outputs& outputs);
//! Compile on NVRTC.
std::string Compile(const ir::Module& module, bool for_nvrtc = true);
void Compile(const ir::LoweredFunc& func);
/**
* \brief Print a function argument in CUDA syntax. Currently, just some
* decoration of __restrict__.
* @param arg the argument.
* @return the representation in CUDA syntax.
*
* We make it a static to make the test easier.
*/
void PrintFuncArg(const ir::Argument& arg);
std::string Compile(const ir::Module& module, OutputKind output_kind);
static const std::string& GetSourceHeader();
protected:
void Visit(const ir::_Var_* op) override;
void Visit(const ir::_LoweredFunc_* op) override;
void Visit(const ir::Min* op) override;
void Visit(const ir::Max* op) override;
void Visit(const ir::Alloc* op) override;
void Visit(const ir::Call* op) override;
void Visit(const ir::Load* op) override;
void Visit(const ir::Store* op) override;
void Visit(const ir::Let* op) override;
// Print element access at a cuda built-in vector on a load/store node
bool PrintBuiltinVectorAccess(const ir::LoadStoreAddrMnger* op,
ir::Expr index,
bool is_store);
void PrintBuiltinCodes();
void PrintIncludes() override;
void PrintTempBufferCreation(const ir::Buffer& buffer);
void PrintTempBufferAliasDefinition(const ir::Buffer& buffer);
std::vector<Expr> GenerateBufferAliasExprs(
const ir::_LoweredFunc_* op, const std::vector<ir::Buffer>& temp_buffers);
/**
* Print the function declaration, this is different from C, we expand the
* arguments and get something like
* `__global__ void myadd(float* __restrict__ A, float* __restrict__ B, int
* n);`
*/
void PrintFunctionDeclaration(const ir::_LoweredFunc_* op);
private:
Target target_;
bool for_nvrtc_{false};
// names of vectorized tensors from `Let` statments where dtypes of the
// tensors are customized_type with customized_type::kcuda_builtin_vector_t
// prefix
std::unordered_set<std::string> vectorized_tensor_names_;
static const std::string source_header_;
};
} // namespace backends
} // namespace cinn
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <stdlib.h>
#include <fstream>
#include <tuple>
#include <vector>
#include "paddle/cinn/backends/codegen_cuda_dev.h"
#include "paddle/cinn/backends/codegen_cuda_host.h"
#include "paddle/cinn/backends/codegen_cuda_util.h"
#include "paddle/cinn/backends/extern_func_jit_register.h"
#include "paddle/cinn/backends/llvm/execution_engine.h"
#include "paddle/cinn/backends/llvm/simple_jit.h"
#include "paddle/cinn/cinn.h"
#include "paddle/cinn/common/ir_util.h"
#include "paddle/cinn/common/test_helper.h"
#include "paddle/cinn/hlir/pe/nn.h"
#include "paddle/cinn/hlir/pe/schedule.h"
#include "paddle/cinn/ir/schedule/ir_schedule.h"
#include "paddle/cinn/ir/utils/ir_printer.h"
#include "paddle/cinn/lang/lower.h"
#include "paddle/cinn/optim/ir_simplify.h"
#include "paddle/cinn/utils/timer.h"
namespace cinn {
namespace backends {
TEST(CUDAFile, Module_output) {
std::string cuda_source_name = "_generated1.cu";
std::string cuda_source_code = R"ROC(
extern "C" {
__global__
void __launch_bounds__(200) elementwise_mul(const float* __restrict__ A, const float* __restrict__ B, float* __restrict__ C)
{
if (((int)blockIdx.x < 100)) {
if (((int)threadIdx.x < 200)) {
C[((200 * (int)blockIdx.x) + (int)threadIdx.x)] = (A[((200 * (int)blockIdx.x) + (int)threadIdx.x)] * B[((200 * (int)blockIdx.x) + (int)threadIdx.x)]);
};
};
}
}
)ROC";
std::ofstream file(cuda_source_name);
CHECK(file.is_open()) << "failed to open file " << cuda_source_name;
file << CodeGenCUDA_Dev::GetSourceHeader();
file << cuda_source_code;
file.close();
LOG(WARNING) << "Output C source to file " << cuda_source_name;
}
} // namespace backends
} // namespace cinn
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/backends/codegen_cuda_host.h"
#include <algorithm>
#include <string>
#include <unordered_map>
#include "paddle/cinn/backends/codegen_cuda_util.h"
#include "paddle/cinn/backends/extern_func_emitter_builtin.h"
#include "paddle/cinn/backends/extern_func_jit_register.h"
#include "paddle/cinn/backends/llvm/llvm_util.h"
#include "paddle/cinn/runtime/intrinsic.h"
namespace cinn {
namespace backends {
using cinn::common::bfloat16;
using cinn::common::float16;
const int kArgsArrayMaxLen = 20;
llvm::Value* CodeGenCUDA_Host::LowerGPUKernelLauncher(
const ir::_LoweredFunc_* func) {
auto body = func->body;
auto* call_ir = body.As<ir::Call>();
CHECK(call_ir);
// Create the function
// @{
auto* function_type = GenFunctionTypeFromCinnFunction(func, true);
llvm::Function* function = llvm::Function::Create(
function_type, llvm::Function::ExternalLinkage, func->name, m_);
function->setCallingConv(llvm::CallingConv::C);
function->setHasUWTable();
std::vector<llvm::Value*> ll_function_args;
std::transform(function->arg_begin(),
function->arg_end(),
std::back_inserter(ll_function_args),
[](auto& arg) { return std::addressof(arg); });
// @}
llvm::BasicBlock* entry = llvm::BasicBlock::Create(
/*Context=*/b_->getContext(),
/*Name=*/"entry",
/*Parent=*/function,
/*InsertBefore=*/nullptr);
b_->SetInsertPoint(entry);
auto* kernel_args = ll_function_args[0];
auto* kernel_args_count = ll_function_args[1];
llvm::Value* kernel_stream = nullptr;
if (ll_function_args.size() == 3) {
kernel_stream = ll_function_args[2];
CHECK_EQ(kernel_stream->getType(), ll_void_p_ty()); // void* stream
}
CHECK_EQ(kernel_args->getType(), ll_void_p_ty()); // void* args
CHECK_EQ(kernel_args_count->getType(), ll_int32_ty()); // int32
std::unordered_map<std::string, llvm::Value*> global_args = {
{KERNEL_ARGS, kernel_args},
{KERNEL_ARGS_NUM, kernel_args_count},
{KERNEL_STREAM, kernel_stream}};
auto ret_type = CinnTypeToLLVMType(Void(), m_);
std::vector<llvm::Type*> args_type;
for (auto r_arg : call_ir->read_args) {
if (r_arg.is_var()) {
if (r_arg.as_var()->type().is_cpp_handle() ||
r_arg.as_var()->type().is_string()) {
args_type.push_back(CinnTypeToLLVMType(type_of<void*>(), m_));
} else if (r_arg.as_var()->type().is_int(32)) {
args_type.push_back(CinnTypeToLLVMType(type_of<int32_t>(), m_));
} else {
CINN_NOT_IMPLEMENTED;
}
} else {
if (r_arg.type().is_bool()) {
args_type.push_back(CinnTypeToLLVMType(type_of<bool>(), m_));
} else if (r_arg.type().is_uint(8)) {
args_type.push_back(CinnTypeToLLVMType(type_of<uint8_t>(), m_));
} else if (r_arg.type().is_uint(16)) {
args_type.push_back(CinnTypeToLLVMType(type_of<uint16_t>(), m_));
} else if (r_arg.type().is_uint(32)) {
args_type.push_back(CinnTypeToLLVMType(type_of<uint32_t>(), m_));
} else if (r_arg.type().is_uint(64)) {
args_type.push_back(CinnTypeToLLVMType(type_of<uint64_t>(), m_));
} else if (r_arg.type().is_int(8)) {
args_type.push_back(CinnTypeToLLVMType(type_of<int8_t>(), m_));
} else if (r_arg.type().is_int(16)) {
args_type.push_back(CinnTypeToLLVMType(type_of<int16_t>(), m_));
} else if (r_arg.type().is_int(32)) {
args_type.push_back(CinnTypeToLLVMType(type_of<int32_t>(), m_));
} else if (r_arg.type().is_int(64)) {
args_type.push_back(CinnTypeToLLVMType(type_of<int64_t>(), m_));
} else if (r_arg.type().is_float(32)) {
args_type.push_back(CinnTypeToLLVMType(type_of<float>(), m_));
} else if (r_arg.type().is_float(64)) {
args_type.push_back(CinnTypeToLLVMType(type_of<double>(), m_));
} else if (r_arg.type().is_bfloat16()) {
args_type.push_back(CinnTypeToLLVMType(type_of<bfloat16>(), m_));
} else if (r_arg.type().is_float16()) {
args_type.push_back(CinnTypeToLLVMType(type_of<float16>(), m_));
} else {
CINN_NOT_IMPLEMENTED;
}
}
}
auto func_type = llvm::FunctionType::get(ret_type, args_type, false);
auto call_func = m_->getOrInsertFunction(call_ir->name, func_type);
std::vector<llvm::Value*> call_args;
for (auto& r_arg : call_ir->read_args) {
if (r_arg.is_var()) {
if (r_arg.as_var()->type().is_string()) {
auto kvalue = m_->getOrInsertGlobal(r_arg.as_var()->name + "_ptr_",
b_->getInt8PtrTy());
call_args.push_back(b_->CreateLoad(
b_->getInt8PtrTy(), kvalue, r_arg.as_var()->name + "_ptr_load"));
} else if (r_arg.as_var()->type().is_cpp_handle() ||
r_arg.as_var()->type().is_int(32)) {
CHECK(global_args.count(r_arg.as_var()->name));
call_args.push_back(global_args[r_arg.as_var()->name]);
} else {
CINN_NOT_IMPLEMENTED;
}
} else {
if (r_arg.type().is_bool()) {
call_args.push_back(b_->getInt1(r_arg.as_bool()));
} else if (r_arg.type().is_int(8)) {
call_args.push_back(b_->getInt8(r_arg.as_int8()));
} else if (r_arg.type().is_int(16)) {
call_args.push_back(b_->getInt16(r_arg.as_int16()));
} else if (r_arg.type().is_int(32)) {
call_args.push_back(b_->getInt32(r_arg.as_int32()));
} else if (r_arg.type().is_int(64)) {
call_args.push_back(b_->getInt64(r_arg.as_int64()));
} else if (r_arg.type().is_uint(8)) {
call_args.push_back(b_->getInt8(r_arg.as_uint8()));
} else if (r_arg.type().is_uint(16)) {
call_args.push_back(b_->getInt16(r_arg.as_uint16()));
} else if (r_arg.type().is_uint(32)) {
call_args.push_back(b_->getInt32(r_arg.as_uint32()));
} else if (r_arg.type().is_uint(64)) {
call_args.push_back(b_->getInt64(r_arg.as_uint64()));
} else if (r_arg.type().is_float(32)) {
call_args.push_back(llvm::ConstantFP::get(
b_->getFloatTy(), llvm::APFloat(r_arg.as_float())));
} else if (r_arg.type().is_float(64)) {
call_args.push_back(llvm::ConstantFP::get(
b_->getDoubleTy(), llvm::APFloat(r_arg.as_double())));
} else if (r_arg.type().is_bfloat16()) {
call_args.push_back(llvm::ConstantFP::get(
b_->getBFloatTy(),
llvm::APFloat(static_cast<float>(r_arg.as_bfloat16()))));
} else if (r_arg.type().is_float16()) {
call_args.push_back(llvm::ConstantFP::get(
b_->getHalfTy(),
llvm::APFloat(static_cast<float>(r_arg.as_float16()))));
} else {
CINN_NOT_IMPLEMENTED;
}
}
}
b_->CreateCall(call_func, call_args);
RetVoid();
return function;
}
} // namespace backends
} // namespace cinn
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <absl/container/flat_hash_map.h>
#include <memory>
#include <string>
#include <tuple>
#include <vector>
#include "paddle/cinn/backends/llvm/codegen_llvm.h"
namespace cinn {
namespace backends {
/**
* CodeGenCUDA takes a CINN Module with host functions and output a LLVM module.
*/
class CodeGenCUDA_Host : public CodeGenLLVM {
public:
explicit CodeGenCUDA_Host(llvm::Module *m,
llvm::IRBuilder<> *b,
const std::shared_ptr<SymbolTable> &vars = nullptr)
: CodeGenLLVM(m, b, vars) {}
using CodeGenLLVM::Visit;
llvm::Value *Visit(const ir::_LoweredFunc_ *func) override {
return LowerGPUKernelLauncher(func);
}
private:
/**
* Lower a CUDA kernel launcher.
*
* We launch a CUDA kernel in the following way:
*
* 1. a GPU function (called fn) will compiled to PTX and lower by CUDA driver
* to a function pointer, which we store as a `void*` type global variable
* [fn_kernel_ptr] in LLVM module.
* 2. when lower the host launcher, we replace the Call of the original kernel
* [fn] to a Call of `cinn_call_cuda_kernel` method which is registered as an
* external function.
*
*/
llvm::Value *LowerGPUKernelLauncher(const ir::_LoweredFunc_ *func);
};
} // namespace backends
} // namespace cinn
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/backends/codegen_cuda_util.h"
#include "paddle/cinn/backends/cuda_util.h"
#include "paddle/cinn/ir/utils/ir_mutator.h"
namespace cinn {
namespace backends {
std::tuple<ir::Module, ir::Module> SplitCudaAndHostModule(ir::Module module) {
detail::CollectHostFunctionVisitor visitor(module->name);
Expr expr(module);
return visitor(&expr);
}
} // namespace backends
} // namespace cinn
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <absl/container/flat_hash_map.h>
#include <string>
#include <tuple>
#include <vector>
#include "paddle/cinn/cinn.h"
#include "paddle/cinn/ir/ir.h"
#include "paddle/cinn/ir/utils/ir_copy.h"
#include "paddle/cinn/ir/utils/ir_mutator.h"
namespace cinn {
namespace backends {
#define KERNEL_ARGS "kernel_args"
#define KERNEL_ARGS_NUM "kernel_args_num"
#define KERNEL_STREAM "kernel_stream"
/**
* Split a CINN Module into two separate modules, one cantains the host
* functions, the other contains the device kernels.
*
* This contains some process:
*
* - replace the original kernel function with a Call node and add it to the
* first module, add a device kernel function to the second module.
*/
std::tuple<ir::Module, ir::Module> SplitCudaAndHostModule(ir::Module module);
namespace detail {
struct CollectHostFunctionVisitor : public ir::IRMutator<> {
explicit CollectHostFunctionVisitor(const std::string& module_name)
: host_module_builder(module_name + "_host", common::DefaultHostTarget()),
device_module_builder(module_name + "_gpu_device",
common::DefaultNVGPUTarget()) {}
std::tuple<ir::Module, ir::Module> operator()(Expr* expr) {
ir::IRMutator<>::Visit(expr, expr);
return std::make_tuple(host_module_builder.Build(),
device_module_builder.Build());
}
private:
void Visit(const ir::_LoweredFunc_* op, Expr* expr) override {
if (op->body.As<ir::Call>()) {
host_module_builder.AddFunctionWithoutOptim(expr->as_lowered_func_ref());
} else {
if (!op->cuda_axis_info.valid()) {
expr->as_lowered_func_ref()->cuda_axis_info.set_valid(true);
}
auto host_func = CreateHostFunctionGivenDeviceKernel(op);
host_module_builder.AddFunctionWithoutOptim(
host_func.as_lowered_func_ref());
device_module_builder.AddFunctionWithoutOptim(
CreateDeviceFunctionGivenDeviceKernel(*expr).as_lowered_func_ref());
}
}
/**
* Create a wrapper function for a kernel.
*
* For example, we get a kernel function:
*
* \code
* __global__
* void fn (float* a, float* out) { ... }
* \endcode
*
* A host wrapper function will generate for it
*
* \code
* void fn (cinn_buffer_t* a, cinn_buffer_t* out) {
* Call(fn_kernel);
* }
* \endcode
*/
Expr CreateHostFunctionGivenDeviceKernel(const ir::_LoweredFunc_* func) {
// std::vector<Expr> args;
// NOTE the suffix `__ptr` makes this argument lower to a pointer in LLVM
// backend. args.push_back(Var("args__ptr", type_of<cinn_pod_value_t*>()));
// args.push_back(Var("num_args", type_of<int32_t>()));
ir::Var kernel_ptr(GenDeviceKernelName(func->name), type_of<std::string>());
ir::Var kernel_args(KERNEL_ARGS, type_of<void*>());
ir::Var kernel_args_num(KERNEL_ARGS_NUM, type_of<int>());
ir::Var kernel_stream(KERNEL_STREAM, type_of<void*>());
auto call_extern_api =
ir::Call::Make(Void(),
runtime::intrinsic::call_cuda_kernel,
{kernel_ptr,
kernel_args,
kernel_args_num,
Expr(func->cuda_axis_info.grid_dim(0)), // grid_x
Expr(func->cuda_axis_info.grid_dim(1)), // grid_y
Expr(func->cuda_axis_info.grid_dim(2)), // grid_z
Expr(func->cuda_axis_info.block_dim(0)), // block_x
Expr(func->cuda_axis_info.block_dim(1)), // block_y
Expr(func->cuda_axis_info.block_dim(2)), // block_z
kernel_stream},
{},
ir::CallType::Extern,
ir::FunctionRef(),
0);
std::vector<ir::Argument> arguments = {
ir::Argument(kernel_args, ir::Argument::IO::kOutput),
ir::Argument(kernel_args_num, ir::Argument::IO::kInput),
ir::Argument(kernel_stream, ir::Argument::IO::kOutput)};
return ir::_LoweredFunc_::Make(func->name, arguments, call_extern_api, {});
}
Expr CreateDeviceFunctionGivenDeviceKernel(Expr expr) {
auto copied = optim::IRCopy(expr);
auto* lowered_func = copied.as_lowered_func();
lowered_func->name = GenDeviceKernelName(lowered_func->name);
return copied;
}
inline std::string GenDeviceKernelName(const std::string& fn) {
return fn + "_kernel";
}
private:
ir::Module::Builder host_module_builder;
ir::Module::Builder device_module_builder;
};
} // namespace detail
} // namespace backends
} // namespace cinn
// Copyright (c) 2022 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <iostream>
#include <vector>
#include "paddle/cinn/backends/codegen_cuda_dev.h"
#include "paddle/cinn/backends/nvrtc/nvrtc_util.h"
#include "paddle/cinn/common/context.h"
#include "paddle/cinn/runtime/cuda/cuda_module.h"
namespace cinn {
namespace backends {
/**
* This file is not a common test, it is used as a util for developers to
* write source CUDA code to debug whether it runs correctly during runtime
*/
using runtime::cuda::CUDAModule;
/**
* Utility function to create cuda memory of non-empty shape.
*
* @param shape: a non-empty shape for the created cuda memory
* @param data: the data to initialize the cuda memory. Function doesn't
* initailize if it is nullptr
* @return the CUdeviceptr pointing to the created memory
*/
template <typename T>
CUdeviceptr CreateCudaMemory(const std::vector<int>& shape, const T* data) {
CHECK(!shape.empty()) << "Couldn't create CUDA memory for empty shape";
CUDA_CALL(cudaDeviceSynchronize());
int numel = 1;
for (int s : shape) {
numel = numel * s;
}
CUdeviceptr cuda_ptr = cuMemAlloc(&cuda_ptr, numel * sizeof(T));
if (data != nullptr) {
CUDA_CALL(cudaMemcpy(reinterpret_cast<void*>(cuda_ptr),
data,
numel * sizeof(T),
cudaMemcpyHostToDevice));
}
return cuda_ptr;
}
TEST(CodeGenDebug, RunCudaSourceCode) {
common::Context::Global().ResetNameId();
std::string source_code = R"ROC(
extern "C" {
__global__
void __launch_bounds__(512) fn_relu_1_kernel(const float* __restrict__ var_1, float* __restrict__ Relu_output)
{
for (int32_t j_0 = 0; j_0 < 8; j_0 += 1) {
for (int32_t j_1 = 0; j_1 < 1; j_1 += 1) {
for (int32_t j_2 = 0; j_2 < 1; j_2 += 1) {
for (int32_t j_3 = 0; j_3 < 8; j_3 += 1) {
for (int32_t j_4 = 0; j_4 < 1; j_4 += 1) {
for (int32_t k_0 = 0; k_0 < 1; k_0 += 1) {
for (int32_t k_1 = 0; k_1 < 7; k_1 += 1) {
for (int32_t k_2 = 0; k_2 < 4; k_2 += 1) {
for (int32_t k_3 = 0; k_3 < 4; k_3 += 1) {
for (int32_t k_4 = 0; k_4 < 1; k_4 += 1) {
for (int32_t a_0 = 0; a_0 < 16; a_0 += 1) {
for (int32_t a_1 = 0; a_1 < 1; a_1 += 1) {
for (int32_t a_2 = 0; a_2 < 1; a_2 += 1) {
for (int32_t a_3 = 0; a_3 < 1; a_3 += 1) {
for (int32_t a_4 = 0; a_4 < 7; a_4 += 1) {
Relu_output[((7 * a_0) + ((7 * a_1) + ((7 * a_2) + ((7 * a_3) + ((100352 * j_0) + ((100352 * j_1) + ((100352 * j_2) + ((12544 * j_3) + ((12544 * j_4) + ((12544 * k_0) + ((1792 * k_1) + ((448 * k_2) + ((112 * k_3) + ((112 * k_4) + a_4))))))))))))))] = max(var_1[((7 * a_0) + ((7 * a_1) + ((7 * a_2) + ((7 * a_3) + ((100352 * j_0) + ((100352 * j_1) + ((100352 * j_2) + ((12544 * j_3) + ((12544 * j_4) + ((12544 * k_0) + ((1792 * k_1) + ((448 * k_2) + ((112 * k_3) + ((112 * k_4) + a_4))))))))))))))], 0.00000000f);
};
};
};
};
};
};
};
};
};
};
};
};
};
};
};
}
}
)ROC";
backends::nvrtc::Compiler compiler;
std::string ptx = compiler(CodeGenCUDA_Dev::GetSourceHeader() + source_code);
ASSERT_FALSE(ptx.empty());
CUDAModule cuda_module(ptx, CUDAModule::Kind::PTX);
CUdeviceptr var =
CreateCudaMemory<float>(/* shape */ {64 * 112 * 112}, /* data */ nullptr);
CUdeviceptr out =
CreateCudaMemory<float>(/* shape */ {64 * 112 * 112}, /* data */ nullptr);
void* args[] = {&var, &out};
dim3 grid(512, 1, 1);
dim3 block(512, 1, 1);
cuda_module.LaunchKernel(
/*device_id*/ 0, "fn_relu_1_kernel", grid, block, args);
}
} // namespace backends
} // namespace cinn
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/backends/compiler.h"
#include <fstream>
#include "paddle/cinn/backends/llvm/runtime_symbol_registry.h"
#include "paddle/cinn/common/context.h"
#include "paddle/cinn/hlir/framework/visualize_helper.h"
#ifdef CINN_WITH_CUDA
#include "paddle/cinn/backends/codegen_cuda_dev.h"
#include "paddle/cinn/backends/codegen_cuda_host.h"
#include "paddle/cinn/backends/codegen_cuda_util.h"
#include "paddle/cinn/backends/nvrtc/nvrtc_util.h"
#include "paddle/cinn/runtime/cuda/cuda_module.h"
#include "paddle/cinn/runtime/cuda/cuda_util.h"
#include "paddle/cinn/runtime/flags.h"
#endif
DECLARE_string(cinn_source_code_save_path);
DECLARE_string(cinn_dump_group_lowered_func);
DECLARE_string(cinn_dump_group_source_code);
DECLARE_string(cinn_dump_group_ptx);
DECLARE_string(cinn_dump_group_instruction);
namespace cinn {
namespace backends {
using ir::Module;
static constexpr int DebugLogMaxLen = 30000;
void CompilationInfoDumper::DumpLoweredFunc() {
if (FLAGS_cinn_dump_group_lowered_func.empty()) {
return;
}
for (int idx = 0; idx < info_.lowered_funcs.size(); ++idx) {
std::stringstream content;
content << info_.lowered_funcs[idx].front();
Dump(FLAGS_cinn_dump_group_lowered_func,
idx,
"lowered_function.txt",
content.str());
}
}
void CompilationInfoDumper::DumpSourceCode() {
if (FLAGS_cinn_dump_group_source_code.empty()) {
return;
}
for (int idx = 0; idx < info_.source_codes.size(); ++idx) {
Dump(FLAGS_cinn_dump_group_source_code,
idx,
"source_code.cu",
info_.source_codes[idx]);
}
}
void CompilationInfoDumper::DumpPtxCode() {
if (FLAGS_cinn_dump_group_ptx.empty()) {
return;
}
for (int idx = 0; idx < info_.source_ptxs.size(); ++idx) {
Dump(FLAGS_cinn_dump_group_ptx,
idx,
"source_ptx.ptx",
info_.source_ptxs[idx]);
}
}
void CompilationInfoDumper::DumpInstruction() {
if (FLAGS_cinn_dump_group_instruction.empty()) {
return;
}
for (int idx = 0; idx < info_.instructions.size(); ++idx) {
Dump(FLAGS_cinn_dump_group_instruction,
idx,
"instruction.txt",
info_.instructions[idx]->DumpInstruction());
}
}
void CompilationInfoDumper::Dump(const std::string& base_path,
const int idx,
const std::string& file_name,
const std::string& content) {
auto dump_path =
utils::StringFormat("%s/fusion_group_%d", base_path.c_str(), idx);
if (!hlir::framework::MakeDirectory(
dump_path, S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH)) {
LOG(WARNING) << "Failed to make directory: \"" << dump_path
<< "\", the instruction for this group will not dump.";
} else {
auto dump_file =
utils::StringFormat("%s/%s", dump_path.c_str(), file_name.c_str());
VLOG(7) << "Dump instruction to: " << dump_file;
std::ofstream of(dump_file, std::ios_base::out);
if (of.is_open()) {
of << content;
of.close();
} else {
LOG(WARNING) << "Failed to open file: " << dump_file
<< ", please check your path.";
}
}
}
SourceCodePrint::SourceCodePrint() {
if (!FLAGS_cinn_source_code_save_path.empty()) {
LOG(INFO)
<< "The CINN auto generated source code will writing into file: \""
<< FLAGS_cinn_source_code_save_path << "\"";
of.open(FLAGS_cinn_source_code_save_path, std::ios_base::out);
}
}
SourceCodePrint::~SourceCodePrint() {
if (of.is_open()) {
of.close();
}
}
void SourceCodePrint::write(const std::string& source_code) {
std::lock_guard<std::mutex> guard(mtx_);
if (of.is_open()) {
of << source_code << std::endl;
} else if (!FLAGS_cinn_source_code_save_path.empty()) {
LOG(WARNING) << "Failed to open \"" << FLAGS_cinn_source_code_save_path
<< "\", source code will print.";
if (source_code.size() > DebugLogMaxLen) {
LOG(INFO) << "[CUDA] source code-0:\n"
<< source_code.substr(0, DebugLogMaxLen);
for (int i = 1; i * DebugLogMaxLen < source_code.size(); ++i) {
LOG(INFO) << "[CUDA] source code-" << i << ":\n"
<< source_code.substr(DebugLogMaxLen * i, DebugLogMaxLen);
}
} else {
LOG(INFO) << "[CUDA] source code:\n" << source_code;
}
}
}
void Compiler::Build(const Module& module, const std::string& code) {
if (target_.arch == Target::Arch::NVGPU) {
CompileCudaModule(module, code);
} else if (target_.arch == Target::Arch::X86) {
CompileX86Module(module);
} else {
CINN_NOT_IMPLEMENTED
}
}
std::string Compiler::GetSourceCode(const ir::Module& module) {
if (target_.arch == Target::Arch::NVGPU) {
#ifdef CINN_WITH_CUDA
auto _host_module_device_module_ =
SplitCudaAndHostModule(module); // NOLINT
auto& host_module = std::get<0>(_host_module_device_module_);
auto& device_module = std::get<1>(_host_module_device_module_);
CodeGenCUDA_Dev codegen(target_);
auto source_code = codegen.Compile(device_module);
return source_code;
#else
CINN_NOT_IMPLEMENTED
#endif
} else {
CINN_NOT_IMPLEMENTED
}
}
void Compiler::BuildDefault(const Module& module) {
if (target_.arch == Target::Arch::NVGPU) {
CompileCudaModule(module);
} else if (target_.arch == Target::Arch::X86) {
CompileX86Module(module);
} else {
CINN_NOT_IMPLEMENTED
}
}
void Compiler::CompileCudaModule(const Module& module,
const std::string& code) {
#ifdef CINN_WITH_CUDA
auto _host_module_device_module_ = SplitCudaAndHostModule(module); // NOLINT
auto& host_module = std::get<0>(_host_module_device_module_);
auto& device_module = std::get<1>(_host_module_device_module_);
VLOG(3) << "[CUDA] host module:\n" << host_module;
VLOG(3) << "[CUDA] device module:\n" << device_module;
std::string source_code;
if (code.empty()) {
CodeGenCUDA_Dev codegen(target_);
source_code = codegen.Compile(device_module);
} else {
source_code = code;
}
CHECK(!source_code.empty())
<< "Compile CUDA C code failed from device module:\n"
<< device_module;
VLOG(3) << "[CUDA] C:\n" << source_code;
SourceCodePrint::GetInstance()->write(source_code);
using runtime::cuda::CUDAModule;
nvrtc::Compiler compiler;
auto ptx = compiler(source_code);
CHECK(!ptx.empty()) << "Compile PTX failed from source code:\n"
<< source_code;
cuda_module_.reset(new CUDAModule(ptx,
compiler.compile_to_cubin()
? CUDAModule::Kind::CUBIN
: CUDAModule::Kind::PTX));
RuntimeSymbols symbols;
for (auto& fn : device_module.functions()) {
std::string kernel_fn_name = fn->name;
auto fn_kernel = cuda_module_->GetFunction(0, kernel_fn_name);
CHECK(fn_kernel);
symbols.RegisterVar(kernel_fn_name + "_ptr_",
reinterpret_cast<void*>(fn_kernel));
}
engine_ = ExecutionEngine::Create(ExecutionOptions(), std::move(symbols));
engine_->Link<CodeGenCUDA_Host>(host_module);
#else
CINN_NOT_IMPLEMENTED
#endif
}
void Compiler::CompileX86Module(const Module& module) {
engine_->Link<CodeGenX86>(module);
}
void Compiler::ExportObject(const std::string& path) {
engine_->ExportObject(path);
}
void* Compiler::Lookup(absl::string_view fn_name) {
CHECK(engine_);
if (engine_->Lookup(fn_name) != nullptr) {
return engine_->Lookup(fn_name);
}
return nullptr;
}
} // namespace backends
} // namespace cinn
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <absl/strings/string_view.h>
#include <fstream>
#include <memory>
#include <mutex>
#include <string>
#include "paddle/cinn/backends/llvm/codegen_llvm.h"
#include "paddle/cinn/backends/llvm/execution_engine.h"
#include "paddle/cinn/backends/llvm/simple_jit.h"
#include "paddle/cinn/hlir/framework/parallel_compiler.h"
#include "paddle/cinn/lang/packed_func.h"
#ifdef CINN_WITH_CUDA
#include "paddle/cinn/runtime/cuda/cuda_module.h"
#endif
namespace cinn {
namespace backends {
/**
* A class for dumping the code after compilation.
* Use FLAGS_cinn_dump_group_lowered_func to specify the directory to dump
* lowered function. Use FLAGS_cinn_dump_group_source_code to specify the
* directory to dump the source code. Use FLAGS_cinn_dump_group_ptx to specify
* the directory to dump ptx. Use FLAGS_cinn_dump_group_instruction to specify
* the directory to dump instruction.
*/
class CompilationInfoDumper {
public:
explicit CompilationInfoDumper(
const hlir::framework::ParallelCompiler::CompilationResult& info)
: info_(info) {
DumpLoweredFunc();
DumpSourceCode();
DumpPtxCode();
DumpInstruction();
}
private:
void DumpLoweredFunc();
void DumpSourceCode();
void DumpPtxCode();
void DumpInstruction();
void Dump(const std::string& base_path,
const int idx,
const std::string& file_name,
const std::string& content);
const hlir::framework::ParallelCompiler::CompilationResult& info_;
};
class SourceCodePrint {
public:
static SourceCodePrint* GetInstance() {
static SourceCodePrint print;
return &print;
}
void write(const std::string& source_code);
private:
SourceCodePrint();
~SourceCodePrint();
std::ofstream of;
std::mutex mtx_;
};
class Compiler final {
public:
static std::unique_ptr<Compiler> Create(const Target& target) {
return std::unique_ptr<Compiler>(new Compiler(target));
}
/**
* Compile and link to a CINN module.
*/
void Build(const ir::Module& module, const std::string& code = "");
void ExportObject(const std::string& path);
std::string GetSourceCode(const ir::Module& module);
void BuildDefault(const ir::Module& module);
/**
* Retrieve a function by \p fn_name.
* @return function address or null if not exists.
*/
void* Lookup(absl::string_view fn_name);
private:
void CompileCudaModule(const ir::Module& module,
const std::string& code = "");
void CompileX86Module(const ir::Module& module);
explicit Compiler(const Target& target)
: target_(target), engine_(ExecutionEngine::Create(ExecutionOptions())) {}
CINN_DISALLOW_COPY_AND_ASSIGN(Compiler);
private:
Target target_;
std::unique_ptr<ExecutionEngine> engine_;
#ifdef CINN_WITH_CUDA
std::unique_ptr<runtime::cuda::CUDAModule> cuda_module_;
#endif
};
} // namespace backends
} // namespace cinn
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/backends/compiler.h"
#include <gtest/gtest.h>
#include <vector>
#include "paddle/cinn/cinn.h"
#include "paddle/cinn/common/test_helper.h"
#include "paddle/cinn/hlir/pe/elementwise.h"
#include "paddle/cinn/hlir/pe/nn.h"
#include "paddle/cinn/runtime/use_extern_funcs.h"
#include "paddle/cinn/utils/timer.h"
namespace cinn {
namespace backends {
TEST(Compiler, x86) {
Expr M(1024), N(1024);
auto create_module = [&]() {
Placeholder<float> A("A", {M, N});
Placeholder<float> B("B", {M, N});
auto C = Compute(
{M, N}, [=](Expr i, Expr j) { return A(i, j) + B(i, j); }, "C");
return std::make_tuple(A, B, C);
};
{ // test x86
auto _A_B_C_ = create_module(); // NOLINT
auto& A = std::get<0>(_A_B_C_);
auto& B = std::get<1>(_A_B_C_);
auto& C = std::get<2>(_A_B_C_);
auto stages = CreateStages({C});
auto fn = Lower("fn", stages, {A, B, C});
ir::Module::Builder builder("some_module", common::DefaultHostTarget());
builder.AddFunction(fn);
auto compiler = Compiler::Create(common::DefaultHostTarget());
compiler->Build(builder.Build());
auto* fnp = compiler->Lookup("fn");
ASSERT_TRUE(fnp);
auto* Ab = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()})
.set_random()
.Build();
auto* Bb = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()})
.set_random()
.Build();
auto* Cb = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()})
.set_zero()
.Build();
auto args = common::ArgsBuilder().Add(Ab).Add(Bb).Add(Cb).Build();
reinterpret_cast<void (*)(void*, int)>(fnp)(args.data(), args.size());
// test result
auto* Ad = reinterpret_cast<float*>(Ab->memory);
auto* Bd = reinterpret_cast<float*>(Bb->memory);
auto* Cd = reinterpret_cast<float*>(Cb->memory);
for (int i = 0; i < Ab->num_elements(); i++) {
ASSERT_NEAR(Ad[i] + Bd[i], Cd[i], 1e-5);
}
}
}
#ifdef CINN_WITH_CUDA
TEST(Compiler, cuda) {
Expr M(1024), N(1024);
auto create_module = [&]() {
Placeholder<float> A("A", {M, N});
Placeholder<float> B("B", {M, N});
auto C = Compute(
{M, N}, [=](Expr i, Expr j) { return A(i, j) + B(i, j); }, "C");
return std::make_tuple(A, B, C);
};
{ // cuda
auto _A_B_C_ = create_module(); // NOLINT
auto& A = std::get<0>(_A_B_C_);
auto& B = std::get<1>(_A_B_C_);
auto& C = std::get<2>(_A_B_C_);
auto stages = CreateStages({C});
stages[C]->Bind(0, "blockIdx.x");
stages[C]->Bind(1, "threadIdx.x");
auto fn = Lower("fn", stages, {A, B, C});
ir::Module::Builder builder("some_module", common::DefaultHostTarget());
builder.AddFunction(fn);
auto compiler = Compiler::Create(common::DefaultNVGPUTarget());
compiler->Build(builder.Build());
auto* fnp = compiler->Lookup("fn");
ASSERT_TRUE(fnp);
auto* Ab = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()})
.set_random()
.Build();
auto* Bb = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()})
.set_random()
.Build();
auto* Cb = common::BufferBuilder(Float(32), {M.as_int32(), N.as_int32()})
.set_zero()
.Build();
// allocate CUDA buffer
void *Ag, *Bg, *Cg;
const int num_bytes = Ab->num_elements() * sizeof(float);
cudaMalloc(&Ag, num_bytes);
cudaMalloc(&Bg, num_bytes);
cudaMalloc(&Cg, num_bytes);
CUDA_CALL(cudaMemcpy(Ag, Ab->memory, num_bytes, cudaMemcpyHostToDevice));
CUDA_CALL(cudaMemcpy(Bg, Bb->memory, num_bytes, cudaMemcpyHostToDevice));
CUDA_CALL(cudaMemcpy(Cg, Cb->memory, num_bytes, cudaMemcpyHostToDevice));
cinn_buffer_t Abb;
Abb.memory = reinterpret_cast<uint8_t*>(Ag);
cinn_buffer_t Bbb;
Bbb.memory = reinterpret_cast<uint8_t*>(Bg);
cinn_buffer_t Cbb;
Cbb.memory = reinterpret_cast<uint8_t*>(Cg);
auto args = common::ArgsBuilder().Add(&Abb).Add(&Bbb).Add(&Cbb).Build();
utils::Timer timer;
timer.Start();
void* stream = nullptr;
for (int i = 0; i < 1000; i++) {
reinterpret_cast<void (*)(void*, int, void*)>(fnp)(
args.data(), args.size(), stream);
}
CUDA_CALL(cudaDeviceSynchronize());
float latency = timer.Stop();
LOG(INFO) << "latency: " << latency / 1000;
std::vector<float> ch(M.as_int32() * N.as_int32(), 0.f);
CUDA_CALL(cudaMemcpy(
ch.data(), Cg, ch.size() * sizeof(float), cudaMemcpyDeviceToHost));
auto* Ad = reinterpret_cast<float*>(Ab->memory);
auto* Bd = reinterpret_cast<float*>(Bb->memory);
for (int i = 0; i < Ab->num_elements(); i++) {
ASSERT_NEAR(Ad[i] + Bd[i], ch[i], 1e-5);
}
}
}
#endif
TEST(Compiler, sqrt) {
Expr N(100);
Expr C(10);
Expr H(10);
Expr W(10);
Placeholder<float> input("input", {N, C, H, W});
Placeholder<float> mean("mean", {C});
Placeholder<float> scale("scale", {C});
Placeholder<float> variance("variance", {C});
Placeholder<float> bias("bias", {C});
float epsilon = 0.1f;
auto A = Compute(
{N, C, H, W},
[=](Expr n, Expr c, Expr h, Expr w) {
return (input(n, c, h, w) - mean(c)) * scale(c) /
lang::Sqrt(variance(c) + Expr(epsilon)) +
bias(c);
},
"A");
auto B = hlir::pe::Pool2d(
input, {3, 3}, {1, 1}, {1, 1, 1, 1}, "max", false, false);
auto BB = hlir::pe::BatchNorm_NCHW(
input, scale, bias, mean, variance, epsilon, "batchnorm");
auto stages = CreateStages({input, mean, scale, variance, A, bias, B[0], BB});
auto fn =
Lower("fn", stages, {input, mean, scale, bias, variance, A, B[0], BB});
Module::Builder builder("some", common::DefaultHostTarget());
builder.AddFunction(fn);
auto compiler = Compiler::Create(common::DefaultHostTarget());
compiler->Build(builder.Build());
}
} // namespace backends
} // namespace cinn
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/backends/cuda_util.h"
#include <glog/logging.h>
#include "paddle/cinn/backends/extern_func_jit_register.h"
#include "paddle/cinn/common/target.h"
namespace cinn {
namespace backends {
std::string cuda_thread_axis_name(int level) {
switch (level) {
case 0:
return "threadIdx.x";
break;
case 1:
return "threadIdx.y";
break;
case 2:
return "threadIdx.z";
break;
}
return "";
}
std::string cuda_block_axis_name(int level) {
switch (level) {
case 0:
return "blockIdx.x";
break;
case 1:
return "blockIdx.y";
break;
case 2:
return "blockIdx.z";
break;
}
return "";
}
} // namespace backends
} // namespace cinn
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#ifdef CINN_WITH_CUDA
#include <cuda.h>
#include <cuda_runtime.h>
#include <cudnn.h>
#include <curand.h>
#include <glog/logging.h>
#include <string>
#include <tuple>
#include <vector>
#include "paddle/cinn/runtime/cinn_runtime.h"
#define CUDA_DRIVER_CALL(func) \
{ \
auto status = func; \
if (status != CUDA_SUCCESS) { \
const char* msg; \
cuGetErrorString(status, &msg); \
LOG(FATAL) << "CUDA Driver Error: " #func " failed with error: " << msg; \
} \
}
#define CUDA_CALL(func) \
{ \
auto status = func; \
if (status != cudaSuccess) { \
LOG(FATAL) << "CUDA Error : " << cudaGetErrorString(status); \
} \
}
#define CURAND_CALL(func) \
{ \
auto status = func; \
if (status != CURAND_STATUS_SUCCESS) { \
LOG(FATAL) << "CURAND Error : " << status; \
} \
}
#define CUSOLVER_CALL(func) \
{ \
auto status = func; \
if (status != CUSOLVER_STATUS_SUCCESS) { \
LOG(FATAL) << "CUSOLVER Error: " << status; \
} \
}
#define CUBLAS_CALL(func) \
{ \
auto status = func; \
if (status != CUBLAS_STATUS_SUCCESS) { \
LOG(FATAL) << "CUBLAS Error!"; \
} \
}
#define CUDNN_CALL(func) \
{ \
auto status = func; \
if (status != CUDNN_STATUS_SUCCESS) { \
LOG(FATAL) << "CUDNN Error : " << cudnnGetErrorString(status); \
} \
}
#define NVRTC_CALL(func) \
{ \
auto status = func; \
if (status != NVRTC_SUCCESS) { \
LOG(FATAL) << "NVRTC Error : " << nvrtcGetErrorString(status); \
} \
}
namespace cinn {
namespace backends {
// CUDA syntax for thread axis.
std::string cuda_thread_axis_name(int level);
// CUDA syntax for block axis.
std::string cuda_block_axis_name(int level);
} // namespace backends
} // namespace cinn
#endif // CINN_WITH_CUDA
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment