Commit 992bec46 authored by “yuguo”'s avatar “yuguo”
Browse files

2.5

parent 0259837d
// Copyright (c) 2023 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <array>
#include <functional>
#include <queue>
#include <unordered_set>
namespace cinn {
namespace common {
// Visits the nodes of a graph in topological order.
//
// The graph itself is never materialized: it is described by two callbacks,
// one that enumerates the predecessors of a node and one that enumerates its
// successors. Starting from the given source nodes, a successor is scheduled
// as soon as every one of its predecessors has already been scheduled.
template <typename NodeType>
class TopoWalker final {
 public:
  TopoWalker(const TopoWalker&) = delete;
  TopoWalker(TopoWalker&&) = delete;

  // Callback invoked once for each visited node.
  using NodeHandlerType = std::function<void(NodeType)>;
  // Callback that feeds every neighbour of a node to the given handler.
  using NodesVisitorType =
      std::function<void(NodeType, const NodeHandlerType&)>;

  TopoWalker(const NodesVisitorType& VisitPrevNodes,
             const NodesVisitorType& VisitNextNodes)
      : VisitPrevNodes_(VisitPrevNodes), VisitNextNodes_(VisitNextNodes) {}

  // Walks the graph starting from a single source node.
  void operator()(NodeType node, const NodeHandlerType& NodeHandler) const {
    std::array<NodeType, 1> single_source{node};
    (*this)(single_source.begin(), single_source.end(), NodeHandler);
  }

  // Walks the graph starting from every node in [begin, end).
  // NodeHandler is called exactly once per reached node.
  template <typename NodeIt>
  void operator()(NodeIt begin,
                  NodeIt end,
                  const NodeHandlerType& NodeHandler) const {
    std::queue<NodeType> pending;
    std::unordered_set<NodeType> scheduled;

    // Pushes a node onto the work queue unless it has been pushed before.
    auto schedule_once = [&pending, &scheduled](NodeType candidate) {
      if (scheduled.find(candidate) != scheduled.end()) return;
      pending.push(candidate);
      scheduled.insert(candidate);
    };

    for (NodeIt it = begin; it != end; ++it) {
      schedule_once(*it);
    }

    while (!pending.empty()) {
      NodeType current = pending.front();
      pending.pop();
      NodeHandler(current);
      VisitNextNodes_(current, [&](NodeType successor) {
        // A successor becomes ready once none of its inputs is still
        // waiting to be scheduled.
        size_t unscheduled_inputs = 0;
        VisitPrevNodes_(successor, [&](NodeType predecessor) {
          if (scheduled.count(predecessor) == 0) {
            ++unscheduled_inputs;
          }
        });
        if (unscheduled_inputs == 0) {
          schedule_once(successor);
        }
      });
    }
  }

 private:
  NodesVisitorType VisitPrevNodes_;
  NodesVisitorType VisitNextNodes_;
};
} // namespace common
} // namespace cinn
// Copyright (c) 2023 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/common/topo_walker.h"
#include <glog/logging.h>
#include <gtest/gtest.h>
namespace cinn {
namespace common {
// Walks a small five-node DAG from its two sources and checks the visit order.
TEST(TopoWalker, simple) {
  // Directed edges stored as (from, to).
  std::vector<std::pair<int, int>> edges{
      {0, 3}, {1, 2}, {1, 3}, {2, 3}, {3, 4}};
  using Handler = std::function<void(int)>;

  // Reports every node that has an edge pointing at `node`.
  const auto visit_inputs = [&](int node, const Handler& NodeHandler) {
    for (const auto& edge : edges) {
      if (edge.second != node) continue;
      NodeHandler(edge.first);
    }
  };
  // Reports every node that `node` has an edge pointing at.
  const auto visit_outputs = [&](int node, const Handler& NodeHandler) {
    for (const auto& edge : edges) {
      if (edge.first != node) continue;
      NodeHandler(edge.second);
    }
  };
  TopoWalker<int> visitor(visit_inputs, visit_outputs);

  std::vector<int> sources{0, 1};
  std::vector<int> outputs;
  const auto record = [&](int node) { outputs.push_back(node); };
  visitor(sources.begin(), sources.end(), record);

  const std::vector<int> expected{0, 1, 2, 3, 4};
  EXPECT_TRUE((outputs == expected));
}
} // namespace common
} // namespace cinn
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/common/type.h"
#include <functional>
#include <string>
#include <unordered_map>
#include <utility>
namespace cinn {
namespace common {
// Heap-allocated payload of a Type; holds every field that describes it.
struct Type::Storage {
  Storage() = default;
  // The initializer list follows the declaration order of the members below,
  // which is the order in which they are actually initialized.
  Storage(type_t t, int b, int w, specific_type_t st)
      : type_(t), specific_type_(st), bits_(b), lanes_(w) {}

  //! Basic kind of the type (Int, UInt, Float, ...).
  type_t type_{type_t::Unk};
  // distinguish FP16/BF16, or E5M2/E4M3 (when FP8 is supported)
  specific_type_t specific_type_{specific_type_t::None};
  //! Bit set of C++ decorators (const / pointer / pointer-to-pointer).
  cpp_type_t cpp_type_{cpp_type_t::None};
  //! How many bits per element.
  int bits_{0};
  //! How many elements(if a vector type), for scalar types, it should be 1.
  int lanes_{1};
  //! Name of the customized type.
  std::string customized_type_;
};
// Defined here, where Storage is a complete type.
Type::~Type() = default;

// Renders the type as text: optional "const " prefix, the base name, an
// optional "<lanes>" suffix for vectors, then "*" and/or "**" for handles.
std::string Type::to_string() const {
  std::string text = is_cpp_const() ? "const " : "";
  text += Type2Str(*this);
  const int lane_count = lanes();
  if (lane_count > 1) {
    text += "<" + std::to_string(lane_count) + ">";
  }
  if (is_cpp_handle()) {
    text += "*";
  }
  if (is_cpp_handle2()) {
    text += "**";
  }
  return text;
}

// Streams the textual form produced by Type::to_string().
std::ostream &operator<<(std::ostream &os, const Type &t) {
  return os << t.to_string();
}
// Streams the enumerator name of a Type::type_t value.
// Every enumerator, including String, has its own case; a value outside the
// enumeration prints nothing.
std::ostream &operator<<(std::ostream &os, Type::type_t t) {
  switch (t) {
    case Type::type_t::Void:
      os << "Void";
      break;
    case Type::type_t::UInt:
      os << "UInt";
      break;
    case Type::type_t::Int:
      os << "Int";
      break;
    case Type::type_t::Float:
      os << "Float";
      break;
    case Type::type_t::String:
      os << "String";
      break;
    case Type::type_t::Unk:
      os << "Unk";
      break;
    case Type::type_t::Customized:
      os << "Customized";
      break;
  }
  return os;
}
// Marks (x == true) or unmarks (x == false) this type as a single-level
// pointer. Both handle bits are cleared first, so any pointer-to-pointer mark
// is dropped; the const bit is left untouched.
Type &Type::set_cpp_handle(bool x) {
  constexpr uint8_t kHandleBit = static_cast<uint8_t>(cpp_type_t::Handle);
  constexpr uint8_t kHandle2Bit =
      static_cast<uint8_t>(cpp_type_t::HandleHandle);

  Storage &storage = GetStorage();
  uint8_t flags = static_cast<uint8_t>(storage.cpp_type_);
  flags &= static_cast<uint8_t>(~(kHandleBit | kHandle2Bit));
  if (x) {
    flags |= kHandleBit;
  }
  storage.cpp_type_ = static_cast<cpp_type_t>(flags);
  return *this;
}
// Marks (x == true) or unmarks (x == false) this type as a pointer-to-pointer.
// Both handle bits are cleared first, so any single-level pointer mark is
// dropped; the const bit is left untouched.
Type &Type::set_cpp_handle2(bool x) {
  constexpr uint8_t kHandleBit = static_cast<uint8_t>(cpp_type_t::Handle);
  constexpr uint8_t kHandle2Bit =
      static_cast<uint8_t>(cpp_type_t::HandleHandle);

  Storage &storage = GetStorage();
  uint8_t flags = static_cast<uint8_t>(storage.cpp_type_);
  flags &= static_cast<uint8_t>(~(kHandleBit | kHandle2Bit));
  if (x) {
    flags |= kHandle2Bit;
  }
  storage.cpp_type_ = static_cast<cpp_type_t>(flags);
  return *this;
}
// Builds a fresh type with the same kind, bit width and specific type as this
// one but with `w` lanes. C++ decorators and customized name are not carried.
Type Type::VectorOf(int w) const {
  CheckTypeValid();
  const type_t kind = type();
  const int bit_width = bits();
  const specific_type_t specific = specific_type();
  return Type(kind, bit_width, w, specific);
}

// Deep-copies the storage of `other`; a storage-less `other` yields a
// storage-less copy.
Type::Type(const Type &other)
    : storage_(other.storage_ ? new Storage(*other.storage_) : nullptr) {}

// Returns a copy of this type with the lane count forced to one.
Type Type::ElementOf() const {
  CheckTypeValid();
  Type element = *this;
  element.storage_->lanes_ = 1;
  return element;
}
void Type::CheckTypeValid() const {
CHECK_NE(GetStorage().type_, type_t::Unk);
if (GetStorage().type_ == type_t::Float && GetStorage().bits_ == 16) {
CHECK(GetStorage().specific_type_ == specific_type_t::FP16 ||
GetStorage().specific_type_ == specific_type_t::BF16)
<< "When creating a 16 bits Float, the specific_type_t must be FP16 or "
"BF16.";
}
}
// Returns the address type of this type: a plain type becomes a pointer and a
// pointer becomes a pointer-to-pointer. A third level is rejected by CHECK.
Type Type::PointerOf() const {
  CheckTypeValid();
  Type pointer = *this;
  CHECK(!pointer.is_cpp_handle2()) << "Not support three level of PointerOf";
  const bool already_pointer = pointer.is_cpp_handle();
  if (already_pointer) {
    pointer.set_cpp_handle2();
  } else {
    pointer.set_cpp_handle();
  }
  return pointer;
}

// Returns a copy of this type with the const decorator set.
Type Type::ConstOf() const {
  CheckTypeValid();
  Type const_type = *this;
  const_type.set_cpp_const();
  return const_type;
}
bool Type::is_supported() const {
return this->is_float(32) || this->is_float16() || this->is_bfloat16() ||
this->is_float(64) || this->is_bool() || this->is_int(8) ||
this->is_int(16) || this->is_int(32) || this->is_int(64) ||
this->is_uint(8) || this->is_uint(16) || this->is_uint(32) ||
this->is_uint(64);
}
// Returns a copy of this type with the const decorator cleared.
Type Type::IgnoreConst() const {
  CheckTypeValid();
  Type stripped = *this;
  stripped.set_cpp_const(false);
  return stripped;
}

// Returns a copy of this (primitive) type with the bit width replaced by `x`.
Type Type::with_bits(int x) const {
  CHECK(is_primitive());
  Type result = *this;
  result.GetStorage().bits_ = x;
  return result;
}

// Returns a copy of this type with the kind replaced by `x`.
Type Type::with_type(Type::type_t x) const {
  Type result = *this;
  result.GetStorage().type_ = x;
  return result;
}

// Returns a copy of this (valid) type with the lane count replaced by `x`.
Type Type::with_lanes(int x) const {
  CHECK(valid());
  Type result = *this;
  result.GetStorage().lanes_ = x;
  return result;
}

// Returns a copy of this type with the const decorator set to `x`.
Type Type::with_cpp_const(bool x) const {
  Type result = *this;
  result.set_cpp_const(x);
  return result;
}
// Sets or clears the const decorator bit; the handle bits are left untouched.
Type &Type::set_cpp_const(bool is_const) {
  constexpr uint8_t kConstBit = static_cast<uint8_t>(cpp_type_t::Const);

  Storage &storage = GetStorage();
  const uint8_t current = static_cast<uint8_t>(storage.cpp_type_);
  const uint8_t updated = is_const
                              ? static_cast<uint8_t>(current | kConstBit)
                              : static_cast<uint8_t>(current & ~kConstBit);
  storage.cpp_type_ = static_cast<cpp_type_t>(updated);
  return *this;
}

// Turns this type into a customized type named `t`.
Type &Type::set_customized_type(const std::string &t) {
  Storage &storage = GetStorage();
  storage.type_ = type_t::Customized;
  storage.customized_type_ = t;
  return *this;
}
bool Type::valid() const {
if (is_unk()) return false;
if (is_customized()) {
return !GetStorage().customized_type_.empty();
}
if (is_float() && GetStorage().bits_ == 16) {
return (GetStorage().specific_type_ == specific_type_t::FP16 ||
GetStorage().specific_type_ == specific_type_t::BF16);
}
if (is_primitive()) {
return bits() != 0;
}
return true;
}
// Builds a type of kind `t` with `b` bits per element and `w` lanes.
// A 16-bit float must state through `st` whether it is FP16 or BF16.
Type::Type(Type::type_t t, int b, int w, specific_type_t st)
    : storage_(new Storage(t, b, w, st)) {
  const bool is_half_width_float = (t == Type::type_t::Float) && (b == 16);
  if (!is_half_width_float) return;

  CHECK(st == specific_type_t::FP16 || st == specific_type_t::BF16)
      << "When creating a 16 bits Float, the specific_type_t must be FP16 or "
         "BF16.";
}
bool Type::is_primitive() const {
return !is_unk() && type() != type_t::Customized;
}
bool Type::is_customized() const {
return !is_unk() && type() == type_t::Customized;
}
bool Type::is_unk() const { return type() == type_t::Unk; }
bool Type::is_bool() const { return type() == type_t::UInt && bits() == 1; }
bool Type::is_void() const { return type() == type_t::Void; }
bool Type::is_vector() const { return lanes() > 1; }
bool Type::is_scalar() const { return lanes() == 1; }
// Tests whether this is a float, optionally of a given bit width (a negative
// `bits` matches any width).
// Note: for a float type, is_float(16) requires 'st' to say whether FP16 or
// BF16 is meant; passing specific_type_t::None aborts via CHECK. Prefer
// is_float16()/is_bfloat16() for short.
bool Type::is_float(int bits, specific_type_t st) const {
  if (type() != type_t::Float) {
    return false;
  }
  if (bits == 16) {
    CHECK(st != specific_type_t::None)
        << "when calling is_float(16), 'st' can't be specific_type_t::None to "
           "distinguish FP16/BF16, or use is_float16()/is_bfloat16() for short";
    return st == this->specific_type();
  }
  return bits < 0 || bits == this->bits();
}

// Shorthand for is_float(16, FP16).
bool Type::is_float16() const {
  return is_float(16, specific_type_t::FP16);
}

// Shorthand for is_float(16, BF16).
bool Type::is_bfloat16() const {
  return is_float(16, specific_type_t::BF16);
}
// Unsigned integer, optionally of the given width (negative matches any).
bool Type::is_uint(int bits) const {
  if (type() != type_t::UInt) return false;
  return bits < 0 || bits == this->bits();
}

// Signed integer, optionally of the given width (negative matches any).
bool Type::is_int(int bits) const {
  if (type() != type_t::Int) return false;
  return bits < 0 || bits == this->bits();
}

// Signed or unsigned integer, optionally of the given width.
bool Type::is_integer(int bits) const {
  const bool integral = type() == type_t::Int || type() == type_t::UInt;
  if (!integral) return false;
  return bits < 0 || bits == this->bits();
}

// Index types are scalar signed integers of 32 or 64 bits.
bool Type::is_index_type() {
  if (!is_int()) return false;
  if (lanes() != 1) return false;
  return bits() == 32 || bits() == 64;
}
// True when the single-level pointer bit is set.
bool Type::is_cpp_handle() const {
  const uint8_t flags = static_cast<uint8_t>(GetStorage().cpp_type_);
  return (flags & static_cast<uint8_t>(cpp_type_t::Handle)) != 0;
}

// True when the pointer-to-pointer bit is set.
bool Type::is_cpp_handle2() const {
  const uint8_t flags = static_cast<uint8_t>(GetStorage().cpp_type_);
  return (flags & static_cast<uint8_t>(cpp_type_t::HandleHandle)) != 0;
}

// True when the const bit is set.
bool Type::is_cpp_const() const {
  const uint8_t flags = static_cast<uint8_t>(GetStorage().cpp_type_);
  return (flags & static_cast<uint8_t>(cpp_type_t::Const)) != 0;
}
const std::string &Type::customized_type() const {
return GetStorage().customized_type_;
}
bool Type::is_customized_type() const {
return !GetStorage().customized_type_.empty();
}
Type::type_t Type::type() const { return GetStorage().type_; }
Type::specific_type_t Type::specific_type() const {
return GetStorage().specific_type_;
}
int Type::bits() const { return GetStorage().bits_; }
int Type::lanes() const { return GetStorage().lanes_; }
Type::cpp_type_t Type::cpp_type() const { return GetStorage().cpp_type_; }
// Two types are equal when every stored field matches: kind, specific type,
// bit width, lane count, C++ decorators and customized name.
bool Type::operator==(const Type &other) const {
  if (type() != other.type()) return false;
  if (specific_type() != other.specific_type()) return false;
  if (bits() != other.bits()) return false;
  if (lanes() != other.lanes()) return false;
  if (GetStorage().cpp_type_ != other.GetStorage().cpp_type_) return false;
  return customized_type() == other.customized_type();
}

// True when the kind is String.
bool Type::is_string() const { return type_t::String == type(); }
// Copy-assigns the description of `other` into this type.
//
// Self-assignment is a no-op. Without the guard, resetting storage_ would
// destroy the very object the remaining fields are read from, and the C++
// decorators and customized name would silently be lost.
//
// When `other` has no storage (e.g. it was moved from), this type is left
// unchanged.
Type &Type::operator=(const Type &other) {
  if (this == &other) {
    return *this;
  }
  if (other.storage_) {
    // Storage's copy constructor duplicates every field in one step.
    storage_.reset(new Storage(*other.storage_));
  }
  return *this;
}
// Mutable access to the storage; aborts via CHECK when there is none.
Type::Storage &Type::GetStorage() {
  CHECK(storage_ != nullptr) << "The type not initializated! Please check.";
  return *storage_.get();
}

// Read-only access to the storage; aborts via CHECK when there is none.
const Type::Storage &Type::GetStorage() const {
  CHECK(storage_ != nullptr) << "The type not initializated! Please check.";
  return *storage_.get();
}

// A default-constructed type owns a storage holding the default field values.
Type::Type() : storage_(new Storage()) {}

// Takes over the storage of `other`, which is left without one.
Type::Type(Type &&other) : storage_(std::move(other.storage_)) {}
// Builtin native types. Each accessor builds its Type once, on first use, and
// hands out a reference to that single instance afterwards.

// 16-bit brain floating point.
const Type &BF16() {
  static const Type instance = Float(16, 1, Type::specific_type_t::BF16);
  return instance;
}

// 16-bit IEEE half precision floating point.
const Type &F16() {
  static const Type instance = Float(16, 1, Type::specific_type_t::FP16);
  return instance;
}

// 32-bit floating point.
const Type &F32() {
  static const Type instance = Float(32);
  return instance;
}

// 64-bit floating point.
const Type &F64() {
  static const Type instance = Float(64);
  return instance;
}

// Signed integers.
const Type &I8() {
  static const Type instance = Int(8);
  return instance;
}

const Type &I16() {
  static const Type instance = Int(16);
  return instance;
}

const Type &I32() {
  static const Type instance = Int(32);
  return instance;
}

const Type &I64() {
  static const Type instance = Int(64);
  return instance;
}

// Unsigned integers.
const Type &UI8() {
  static const Type instance = UInt(8);
  return instance;
}

const Type &UI16() {
  static const Type instance = UInt(16);
  return instance;
}

const Type &UI32() {
  static const Type instance = UInt(32);
  return instance;
}

const Type &UI64() {
  static const Type instance = UInt(64);
  return instance;
}

// Single-bit signed and unsigned integers.
const Type &I1() {
  static const Type instance = Int(1);
  return instance;
}

const Type &UI1() {
  static const Type instance = UInt(1);
  return instance;
}
// Hash functor for Type: concatenates the decimal form of every field (plus
// the customized name, when present) and hashes the resulting string.
struct TypeHash {
  size_t operator()(const Type &type) const {
    std::string key = std::to_string(static_cast<int>(type.type()));
    key.append(std::to_string(static_cast<int>(type.specific_type())));
    key.append(std::to_string(type.bits()));
    key.append(std::to_string(type.lanes()));
    key.append(std::to_string(static_cast<int>(type.cpp_type())));
    if (type.is_customized_type()) {
      key.append(type.customized_type());
    }
    const std::hash<std::string> hasher;
    return hasher(key);
  }
};
int Type::bytes() const {
// if the type is a pointer
auto cpp_type = this->cpp_type();
if (cpp_type == Type::cpp_type_t::Handle ||
cpp_type == Type::cpp_type_t::HandleHandle) {
return sizeof(void *);
}
// if the type is an known pod type
#define GET_TYPE_SIZE_PAIR(TYPE) \
{ type_of<TYPE>(), sizeof(TYPE) }
static std::unordered_map<Type, int, TypeHash> type_bytes = {
GET_TYPE_SIZE_PAIR(bfloat16),
GET_TYPE_SIZE_PAIR(float16),
GET_TYPE_SIZE_PAIR(float),
GET_TYPE_SIZE_PAIR(double),
GET_TYPE_SIZE_PAIR(char),
GET_TYPE_SIZE_PAIR(signed char),
GET_TYPE_SIZE_PAIR(unsigned char),
GET_TYPE_SIZE_PAIR(int8_t),
GET_TYPE_SIZE_PAIR(int16_t),
GET_TYPE_SIZE_PAIR(int32_t),
GET_TYPE_SIZE_PAIR(int64_t),
GET_TYPE_SIZE_PAIR(uint8_t),
GET_TYPE_SIZE_PAIR(uint16_t),
GET_TYPE_SIZE_PAIR(uint32_t),
GET_TYPE_SIZE_PAIR(uint64_t),
GET_TYPE_SIZE_PAIR(bool),
};
#undef GET_TYPE_SIZE_PAIR
if (type_bytes.count(*this)) {
return type_bytes.at(*this);
}
// else get size by bits size
auto bit_size = this->bits();
return (bit_size + 7) / 8;
}
// Parses a textual type name into a Type.
//
// Accepts the spellings listed in the table below (C-style names, fixed-width
// names, and the `*` / `_p` pointer forms). An unknown name aborts via CHECK.
Type Str2Type(const std::string &type) {
  static const std::unordered_map<std::string, Type> str2type_map = {
      {"unk", Type()},
      {"void", Void()},
      {"bool", Bool()},
      {"unsigned char", UI8()},

      {"char", I8()},
      {"signed char", I8()},

      {"string", String()},

      {"bit", I1()},
      {"signed bit", I1()},
      {"int1", I1()},
      {"int1_t", I1()},

      {"ubit", UI1()},
      {"unsigned bit", UI1()},
      {"uint1", UI1()},
      {"uint1_t", UI1()},

      {"int8", I8()},
      {"int8_t", I8()},

      {"int16", I16()},
      {"int16_t", I16()},

      {"int", I32()},
      {"int32", I32()},
      {"int32_t", I32()},

      {"int64", I64()},
      {"int64_t", I64()},

      {"uint8", UI8()},
      {"uint8_t", UI8()},

      {"uint16", UI16()},
      {"uint16_t", UI16()},

      {"uint", UI32()},
      {"uint32", UI32()},
      {"uint32_t", UI32()},

      {"uint64", UI64()},
      {"uint64_t", UI64()},

      {"bfloat16", BF16()},
      {"float16", F16()},
      {"half", F16()},

      {"float", F32()},
      {"float32", F32()},

      {"float64", F64()},
      {"double", F64()},

      {"void*", type_of<void *>()},
      {"void_p", type_of<void *>()},
      {"void**", type_of<void **>()},
      {"void_p_p", type_of<void **>()},

      {"int8*", type_of<int8_t *>()},
      {"int8_p", type_of<int8_t *>()},
      {"int8_t*", type_of<int8_t *>()},

      {"uint8*", type_of<uint8_t *>()},
      {"uint8_p", type_of<uint8_t *>()},
      {"uint8_t*", type_of<uint8_t *>()},

      {"bfloat16*", type_of<bfloat16 *>()},
      {"float16*", type_of<float16 *>()},
      {"half*", type_of<float16 *>()},
      {"bfloat16_p", type_of<bfloat16 *>()},
      {"float16_p", type_of<float16 *>()},
      {"half_p", type_of<float16 *>()},

      {"float*", type_of<float *>()},
      {"float32*", type_of<float *>()},
      {"float_p", type_of<float *>()},
      {"float32_p", type_of<float *>()},

      {"double*", type_of<double *>()},
      {"float64*", type_of<double *>()},
      {"double_p", type_of<double *>()},
      {"float64_p", type_of<double *>()},

      {"cinn_buffer", type_of<cinn_buffer_t>()},
      // The `*` spelling is a pointer, exactly like `cinn_buffer_p` and like
      // the `cinn_pod_value*` entry below.
      {"cinn_buffer*", type_of<cinn_buffer_t *>()},
      {"cinn_buffer_p", type_of<cinn_buffer_t *>()},

      {"const cinn_buffer*", type_of<const cinn_buffer_t *>()},
      {"const_cinn_buffer_p", type_of<const cinn_buffer_t *>()},

      {"cinn_pod_value", type_of<cinn_pod_value_t>()},
      {"cinn_pod_value*", type_of<cinn_pod_value_t *>()},
      {"cinn_pod_value_p", type_of<cinn_pod_value_t *>()},
  };

  // Single lookup; the iterator is reused for the return value.
  const auto found = str2type_map.find(type);
  CHECK(found != str2type_map.end())
      << "Not support type [" << type << "] ! Please Check.\n";
  return found->second;
}
// Returns the base name of a type, without const, lane or pointer decoration:
// "int<bits>", "uint<bits>" ("bool" for one bit), "float<bits>", "bfloat16",
// "float16", "void", "string", "unk", or the customized type's own name.
std::string Type2Str(const Type &type) {
  switch (type.type()) {
    case Type::type_t::Int:
      return "int" + std::to_string(type.bits());

    case Type::type_t::UInt:
      if (type.bits() == 1) {
        return "bool";
      } else {
        return "uint" + std::to_string(type.bits());
      }

    case Type::type_t::Float:
      switch (type.specific_type()) {
        case Type::specific_type_t::None:
          return "float" + std::to_string(type.bits());
        case Type::specific_type_t::BF16:
          return "bfloat16";
        case Type::specific_type_t::FP16:
          return "float16";
        default:
          break;
      }
      // An unrecognized specific type must not fall through into the Void
      // case below and be reported as "void"; leave the switch instead so the
      // generic "unk" is returned.
      break;

    case Type::type_t::Void:
      return "void";

    case Type::type_t::Customized:
      return type.customized_type();

    case Type::type_t::String:
      return "string";

    case Type::type_t::Unk:
      return "unk";

    default:
      // Print the raw enumerator value. Streaming `type` itself would go
      // through Type::to_string(), which calls Type2Str() again and would
      // recurse without end.
      LOG(FATAL) << "Not support type [" << static_cast<int>(type.type())
                 << "] ! Please Check.\n";
  }
  return "unk";
}
} // namespace common
} // namespace cinn
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <glog/logging.h>
#include <memory>
#include <string>
#include "paddle/cinn/common/bfloat16.h"
#include "paddle/cinn/common/float16.h"
#include "paddle/cinn/common/float16_bfloat16_utils.h"
#include "paddle/cinn/common/macros.h"
#include "paddle/cinn/runtime/cinn_runtime.h"
//! Much of the concepts are borrowed from Halide project.
namespace cinn {
namespace common {
/**
* Types in the CINN type system. They can be ints, unsigned ints, or floats of
* various bit-widths. They can also be vectors of the same (by setting the
* `lanes` field to something larger than one). NOTE: Front-end code other than
* vectorize shouldn't use vector types.
*/
struct Type {
//! Basic kind of a type.
enum class type_t {
Unk = -1,
Int,
UInt,
Float,
String,
Void,
// stupid idea to mix the Customized with other primitive types, large
// refactor needs here.
Customized, // Customized type
};
// CINN use type_t and bits to distinguish data types, like is_float(64) for
// double, is_float(32) for float, but for Float16 and BFloat16, the bits are
// both 16, so we need some other info to distinguish them.
enum class specific_type_t {
// None for some cases we only care about the bits, e.g. vectorize for
// hardwares
None = -1,
FP16,
BF16,
// for FP8 in future
// E5M2,
// E4M3,
};
//! Type decorators in C++; the values are bit flags and can be used together.
enum class cpp_type_t : uint8_t {
None = 0, // None information.
Const = 1, // const.
Handle = 1 << 1, // pointer type, such as `cinn_buffer_t*`.
HandleHandle = 1 << 2, // pointer of pointer, such as `cinn_buffer_t**`.
};
//! Constructors and assignment.
// @{
Type();
// \p t is the kind, \p b the bits per element, \p w the number of lanes and
// \p st the FP16/BF16 tag.
Type(type_t t, int b, int w, specific_type_t st = specific_type_t::None);
Type(const Type& other);
explicit Type(Type&& other);
Type& operator=(const Type& other);
// @}
CINN_NODISCARD bool is_primitive() const;
CINN_NODISCARD bool is_customized() const;
CINN_NODISCARD bool valid() const;
//! Some helper functions to check a type.
// For the predicates taking \p bits, the default of -1 matches any bit width.
// @{
CINN_NODISCARD bool is_unk() const;
CINN_NODISCARD bool is_void() const;
CINN_NODISCARD bool is_bool() const;
CINN_NODISCARD bool is_vector() const;
CINN_NODISCARD bool is_scalar() const;
CINN_NODISCARD bool is_float(
int bits = -1, specific_type_t st = specific_type_t::None) const;
CINN_NODISCARD bool is_float16() const;
CINN_NODISCARD bool is_bfloat16() const;
CINN_NODISCARD bool is_int(int bits = -1) const;
CINN_NODISCARD bool is_integer(int bits = -1) const;
CINN_NODISCARD bool is_uint(int bits = -1) const;
CINN_NODISCARD bool is_string() const;
// NOTE(review): unlike the other predicates, this one is not const-qualified.
CINN_NODISCARD bool is_index_type();
// @}
//! Setters and queries for the C++ decorators; setters return *this.
// @{
Type& set_cpp_handle(bool x = true);
CINN_NODISCARD bool is_cpp_handle() const;
Type& set_cpp_handle2(bool x = true);
CINN_NODISCARD bool is_cpp_handle2() const;
Type& set_cpp_const(bool is_const = true);
CINN_NODISCARD bool is_cpp_const() const;
// @}
//! Customized (named) types.
// @{
Type& set_customized_type(const std::string& t);
const std::string& customized_type() const;
CINN_NODISCARD bool is_customized_type() const;
// @}
// Get a new type with bits set to \p x.
Type with_bits(int x) const;
// Get a new type with type set to \p x.
Type with_type(type_t x) const;
// Get a new type with lanes set to \p x.
Type with_lanes(int x) const;
// Get a new type with cpp_const set to \p x.
Type with_cpp_const(bool x = true) const;
//! Getters
// @{
type_t type() const;
specific_type_t specific_type() const;
int bits() const;
int lanes() const;
cpp_type_t cpp_type() const;
int bytes() const;
// @}
//! Compare two types for equality.
bool operator==(const Type& other) const;
//! Compare two types for inequality.
bool operator!=(const Type& other) const { return !(*this == other); }
//! Generate a vector of this type, with `w` elements.
Type VectorOf(int w) const;
//! Generate a element type of this type.
Type ElementOf() const;
//! Generate the address type.
Type PointerOf() const;
//! Ignore const.
Type IgnoreConst() const;
//! Add const.
Type ConstOf() const;
//! Check if a dtype is supported in CINN yet.
bool is_supported() const;
//! Get the textual representation of this type.
std::string to_string() const;
friend std::ostream& operator<<(std::ostream& os, const Type& t);
~Type();
private:
void CheckTypeValid() const;
// Only declared here; the struct is defined outside this declaration.
struct Storage;
Storage& GetStorage();
const Storage& GetStorage() const;
std::unique_ptr<Storage> storage_;
}; // struct Type
// Factory helpers that build a Type from a bit width and a lane count.

// The void type (constructed with 1 bit and 0 lanes).
inline Type Void() {
  return Type(Type::type_t::Void, /*b=*/1, /*w=*/0);
}

// Signed integer of `bits` bits.
inline Type Int(int bits, int lanes = 1) {
  return Type(Type::type_t::Int, bits, lanes);
}

// Unsigned integer of `bits` bits.
inline Type UInt(int bits, int lanes = 1) {
  return Type(Type::type_t::UInt, bits, lanes);
}

// 16-bit brain floating point.
inline Type BFloat16(int lanes = 1) {
  return Type(Type::type_t::Float, 16, lanes, Type::specific_type_t::BF16);
}

// 16-bit half precision floating point.
inline Type Float16(int lanes = 1) {
  return Type(Type::type_t::Float, 16, lanes, Type::specific_type_t::FP16);
}

// Floating point of `bits` bits; a 16-bit float must state through `st`
// whether it is FP16 or BF16.
inline Type Float(int bits,
                  int lanes = 1,
                  Type::specific_type_t st = Type::specific_type_t::None) {
  const bool needs_specific_type = (bits == 16);
  CHECK(!needs_specific_type || st == Type::specific_type_t::FP16 ||
        st == Type::specific_type_t::BF16)
      << "When creating a 16 bits Float, the specific_type_t must be FP16 or "
         "BF16.";
  return Type(Type::type_t::Float, bits, lanes, st);
}

// bool is a 1-bit unsigned integer.
inline Type Bool(int lanes = 1) {
  return Type(Type::type_t::UInt, /*b=*/1, lanes);
}

// The string type (constructed with 1 bit and 1 lane).
inline Type String() {
  return Type(Type::type_t::String, /*b=*/1, /*w=*/1);
}
//! Builtin native types as global singletons.
// Each accessor returns a const reference to one shared Type instance.
// F* are floating-point types, I* signed integers and UI* unsigned integers;
// the number is the bit width. BF16 and F16 are the two 16-bit float flavours.
// @{
const Type& BF16();
const Type& F16();
const Type& F32();
const Type& F64();
const Type& I8();
const Type& I16();
const Type& I32();
const Type& I64();
const Type& UI8();
const Type& UI16();
const Type& UI32();
const Type& UI64();
const Type& I1();
const Type& UI1();
// @}
// Maps a C++ type to the matching CINN Type. Only the explicit
// specializations are defined; the primary template is declared but has no
// definition here.
template <typename T>
Type type_of();
// Scalar specializations. Note that bool maps to the 1-bit unsigned integer
// and char to the 8-bit signed integer.
// clang-format off
template <> inline Type type_of<void>() { return Void(); }
template <> inline Type type_of<bfloat16>() { return BF16(); }
template <> inline Type type_of<float16>() { return F16(); }
template <> inline Type type_of<float>() { return F32(); }
template <> inline Type type_of<double>() { return F64(); }
template <> inline Type type_of<bool>() { return UI1(); }
template <> inline Type type_of<char>() { return I8(); }
// The signed/unsigned char specializations are disabled.
// NOTE(review): presumably because they would clash with int8_t/uint8_t where
// those are aliases of the char types - confirm.
// template <> inline Type type_of<signed char>() { return I8(); }
// template <> inline Type type_of<unsigned char>() { return UI8(); }
template <> inline Type type_of<std::string>() { return String(); }
template <> inline Type type_of<int8_t>() { return I8(); }
template <> inline Type type_of<int16_t>() { return I16(); }
template <> inline Type type_of<int32_t>() { return I32(); }
template <> inline Type type_of<int64_t>() { return I64(); }
template <> inline Type type_of<uint8_t>() { return UI8(); }
template <> inline Type type_of<uint16_t>() { return UI16(); }
template <> inline Type type_of<uint32_t>() { return UI32(); }
template <> inline Type type_of<uint64_t>() { return UI64(); }
// clang-format on
// Pointer specializations: the pointee type with the handle decorator set.

// int8_t* is an 8-bit signed integer marked as a pointer.
template <>
inline Type type_of<int8_t*>() {
  Type pointer = Int(8);
  return pointer.set_cpp_handle();
}

// uint8_t* is an 8-bit unsigned integer marked as a pointer.
template <>
inline Type type_of<uint8_t*>() {
  Type pointer = UInt(8);
  return pointer.set_cpp_handle();
}

// void* is the void type marked as a pointer.
template <>
inline Type type_of<void*>() {
  Type pointer = type_of<void>();
  return pointer.set_cpp_handle();
}

// void** is the void type marked as a pointer-to-pointer.
template <>
inline Type type_of<void**>() {
  Type pointer = type_of<void>();
  return pointer.set_cpp_handle2();
}
// bfloat16* is the bfloat16 type marked as a pointer. The pointee must be
// bfloat16 itself; building on float16 would make this specialization
// indistinguishable from type_of<float16*>().
template <>
inline Type type_of<bfloat16*>() {
  Type x = type_of<bfloat16>();
  x.set_cpp_handle();
  return x;
}
// float16* is the float16 type marked as a pointer.
template <>
inline Type type_of<float16*>() {
  Type pointer = type_of<float16>();
  return pointer.set_cpp_handle();
}

// float* is the 32-bit float type marked as a pointer.
template <>
inline Type type_of<float*>() {
  Type pointer = type_of<float>();
  return pointer.set_cpp_handle();
}

// double* is the 64-bit float type marked as a pointer.
template <>
inline Type type_of<double*>() {
  Type pointer = type_of<double>();
  return pointer.set_cpp_handle();
}
// Streams a Type::type_t value.
std::ostream& operator<<(std::ostream& os, Type::type_t t);
// Names used for customized (non-primitive) types.
// These are `static` variables in a header, so every translation unit that
// includes it gets its own copy of each pointer.
namespace customized_type {
static const char* kArgs_type_repr = "Args";
static const char* kArgValue_type_repr = "ArgValue";
static const char* kbuffer_t = "cinn_buffer_t";
static const char* kpod_value_t = "cinn_pod_value_t";
// NOTE(review): the trailing "::" suggests this is a name prefix - confirm.
static const char* kcuda_builtin_vector_t = "CudaVectorType::";
} // namespace customized_type
// Runtime structs are exposed as customized types, identified by name.

// cinn_buffer_t by value.
template <>
inline Type type_of<cinn_buffer_t>() {
  Type buffer;
  buffer.set_customized_type(customized_type::kbuffer_t);
  return buffer;
}

// cinn_buffer_t*.
template <>
inline Type type_of<cinn_buffer_t*>() {
  Type buffer;
  buffer.set_customized_type(customized_type::kbuffer_t);
  buffer.set_cpp_handle();
  return buffer;
}

// const cinn_buffer_t*.
template <>
inline Type type_of<const cinn_buffer_t*>() {
  Type buffer;
  buffer.set_customized_type(customized_type::kbuffer_t);
  buffer.set_cpp_handle();
  buffer.set_cpp_const();
  return buffer;
}

// cinn_pod_value_t by value.
template <>
inline Type type_of<cinn_pod_value_t>() {
  Type pod_value;
  pod_value.set_customized_type(customized_type::kpod_value_t);
  return pod_value;
}

// cinn_pod_value_t*.
template <>
inline Type type_of<cinn_pod_value_t*>() {
  Type pod_value;
  pod_value.set_customized_type(customized_type::kpod_value_t);
  pod_value.set_cpp_handle();
  return pod_value;
}
// Converts a textual type name into a Type.
Type Str2Type(const std::string& type);
// Converts a Type into its base textual name.
std::string Type2Str(const Type& type);
// Layout tags; kUnk is the zero value.
// NOTE(review): NCHW/NHWC presumably name tensor dimension orders - confirm.
enum class Layout {
kUnk = 0,
kNCHW,
kNHWC,
};
} // namespace common
} // namespace cinn
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/common/type.h"
#include <gtest/gtest.h>
namespace cinn::common {
// Smoke test: logs a few builtin types. It contains no assertions, so it only
// fails if constructing, copying or printing a Type crashes.
TEST(Type, basic) {
LOG(INFO) << I32();
// Copies the singleton, exercising the Type copy constructor.
auto i32 = I32();
LOG(INFO) << I32();
LOG(INFO) << F32();
LOG(INFO) << type_of<float>();
}
} // namespace cinn::common
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/common/union_find.h"
namespace cinn {
namespace common {
// Type tag of UnionFindNode; safe_as() compares tags with strcmp.
const char* UnionFindNode::__type_info__ = "UnionFindNode";
// Returns the type tag above.
const char* UnionFindNode::type_info() const { return __type_info__; }
} // namespace common
} // namespace cinn
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/**
* \file This file implements a general UnionFind algorithm to help cluster
* something.
*/
#pragma once
#include <cstring>
#include <map>
#include <string>
#include <tuple>
#include <vector>
#include "paddle/cinn/common/object.h"
#include "paddle/cinn/common/shared.h"
namespace cinn {
namespace common {
// A node of the union-find forest. Nodes that reach the same root by
// following `parent` links belong to the same cluster.
struct UnionFindNode : public Object {
  // Parent in the forest; null for a root.
  UnionFindNode* parent{};
  std::string cluster_info;

  // Follows the parent links up to the root.
  // Returns the root and the number of links walked to reach it.
  std::tuple<UnionFindNode*, int /*height*/> GetRoot() {
    UnionFindNode* root = this;
    int hops = 0;
    for (; root->parent != nullptr; root = root->parent) {
      ++hops;
    }
    return std::make_tuple(root, hops);
  }

  // Merges the cluster of this node with the cluster of `other`.
  // If this node is strictly closer to its root than `other` is to its own,
  // the root of `other` is attached under this node's root; otherwise this
  // node's root is attached under the root of `other`.
  void Union(UnionFindNode* other) {
    UnionFindNode* my_root = nullptr;
    int my_hops = 0;
    std::tie(my_root, my_hops) = GetRoot();

    UnionFindNode* other_root = nullptr;
    int other_hops = 0;
    std::tie(other_root, other_hops) = other->GetRoot();

    if (my_root == other_root) {
      return;  // Already in the same cluster.
    }
    if (my_hops < other_hops) {
      other_root->parent = my_root;
    } else {
      my_root->parent = other_root;
    }
  }

  // Casts this node to T after checking that the type tags match.
  template <typename T>
  T* safe_as() {
    CHECK_EQ(std::strcmp(T::__type_info__, type_info()), 0)
        << "Want a " << T::__type_info__ << " but get a " << type_info();
    return reinterpret_cast<T*>(this);
  }

  const char* type_info() const override;

  static const char* __type_info__;
};
struct UnionFind {
UnionFindNode* AddNode(UnionFindNode* node) {
nodes.emplace_back(node);
return node;
}
std::vector<std::vector<UnionFindNode*>> GetClusters() {
std::map<UnionFindNode* /*root*/, std::vector<UnionFindNode*>> clusters;
for (auto& n : nodes) {
auto _root_l_ = n->GetRoot(); // NOLINT
auto& root = std::get<0>(_root_l_);
auto& l = std::get<1>(_root_l_);
clusters[root].push_back(n.get());
}
std::vector<std::vector<UnionFindNode*>> res;
for (auto& item : clusters) {
res.push_back(item.second);
}
return res;
}
std::vector<common::Shared<UnionFindNode>> nodes;
};
} // namespace common
} // namespace cinn
# Build rules for the frontend directory.
# NOTE(review): core_gather_headers, gather_srcs, cinn_cc_test and cinn_nv_test
# are project helper macros defined elsewhere; their exact behaviour is not
# visible here.
core_gather_headers()
# Frontend sources passed to gather_srcs under the cinnapi_src name.
gather_srcs(
cinnapi_src
SRCS
computation.cc
syntax.cc
paddle_model_to_program.cc
interpreter.cc
net_builder.cc
op_mapper_registry.cc
paddle_model_convertor.cc
program_pass.cc
optimize.cc)
# The syntax test is registered through cinn_cc_test without CUDA and through
# cinn_nv_test with CUDA; the interpreter test is only enabled in the CUDA
# branch (its non-CUDA registration is commented out below).
if(NOT WITH_CUDA)
cinn_cc_test(
test_frontend_syntax
ARGS
"--model_dir=${THIRD_PARTY_PATH}/naive_mul_model"
SRCS
syntax_test.cc
DEPS
cinncore)
# cinn_cc_test(test_frontend_interpreter
# ARGS --model_dir=${THIRD_PARTY_PATH}/naive_mul_model
# SRCS interpreter_test.cc DEPS cinncore)
else()
cinn_nv_test(
test_frontend_syntax
ARGS
"--model_dir=${THIRD_PARTY_PATH}/naive_mul_model"
SRCS
syntax_test.cc
DEPS
cinncore)
cinn_nv_test(
test_frontend_interpreter
ARGS
--model_dir=${THIRD_PARTY_PATH}/naive_mul_model
SRCS
interpreter_test.cc
DEPS
cinncore)
endif()
# Currently disabled tests (kept for reference).
#cinn_cc_test(test_paddle_model_convertor
# ARGS --model_dir=${THIRD_PARTY_PATH}/resnet_model
# SRCS paddle_model_convertor_test.cc DEPS cinncore decomposer_test_helper)
#cinn_cc_test(test_computation
# ARGS "--model_dir=${THIRD_PARTY_PATH}/naive_mul_model"
# SRCS computation_test.cc DEPS cinncore)
# Tests that take no model directory argument.
cinn_cc_test(test_net_builder SRCS net_builder_test.cc DEPS cinncore)
cinn_cc_test(test_decomposer_registry SRCS decomposer_registry_test.cc DEPS
cinncore)
# Sub-directories with their own build rules.
add_subdirectory(paddle)
add_subdirectory(decomposer)
add_subdirectory(op_mappers)
add_subdirectory(pass)
cinn_cc_test(test_op_mapper_registry SRCS op_mapper_registry_test.cc DEPS
cinncore)
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/frontend/computation.h"
#include "paddle/cinn/frontend/optimize.h"
#include "paddle/cinn/frontend/program_pass.h"
#include "paddle/cinn/hlir/framework/graph.h"
#include "paddle/cinn/hlir/framework/graph_compiler.h"
#include "paddle/cinn/hlir/framework/pass.h"
#include "paddle/cinn/hlir/framework/scope.h"
namespace cinn {
namespace frontend {
/**
 * State shared by all CinnComputation methods once a program is compiled.
 * It is filled in by CompileProgram() and CompilePaddleModel().
 */
struct ComputationContext {
  // Target the program was compiled for.
  Target target;
  // Opaque stream handle forwarded to Program::Execute(); defaults to null so
  // a default-initialized context never carries an indeterminate pointer.
  void *stream = nullptr;
  std::shared_ptr<hlir::framework::Graph> graph;
  std::shared_ptr<hlir::framework::Scope> scope;
  std::shared_ptr<hlir::framework::Program> program;
  std::shared_ptr<hlir::framework::GraphCompiler> graph_compiler;
  // Copy of the options the computation was compiled with.
  CinnComputation::CompileOptions compile_options;
  // Tensors looked up in `scope` for the program inputs / requested outputs.
  std::vector<hlir::framework::Tensor> inputs;
  std::vector<hlir::framework::Tensor> outputs;
  // Only populated by CompilePaddleModel(): name -> Variable, and the name
  // mapping that GetTensor() falls back to when a name is not in `scope`.
  std::unordered_map<std::string, Variable> varmap;
  std::unordered_map<std::string, std::string> varmap_paddle2program;
};
/**
 * Run the whole compilation pipeline for \p program and collect the result in
 * a fresh ComputationContext.
 *
 * Steps, in order: optional "Decomposer" program pass, graph construction,
 * optional default graph passes, user-requested graph passes, scope
 * construction, graph compilation, optional pre-run, and finally the lookup
 * of input and output tensors in the scope.
 *
 * @param target   target to compile for
 * @param program  program to compile; may be rewritten by the Decomposer pass
 * @param outputs  variables to fetch after execution
 * @param scope    scope handed to BuildScope (may be null)
 * @param options  compilation switches
 * @param stream   opaque stream handle stored in the context
 */
std::shared_ptr<ComputationContext> CompileProgram(
    const Target &target,
    Program &program,  // NOLINT
    const std::vector<Variable> &outputs,
    std::shared_ptr<hlir::framework::Scope> scope,
    const CinnComputation::CompileOptions &options,
    void *stream) {
  namespace fw = hlir::framework;

  auto ctx = std::make_shared<ComputationContext>();
  ctx->stream = stream;
  ctx->target = target;
  ctx->compile_options = options;
  const auto &stored_options = ctx->compile_options;

  // Program-level rewriting happens before the graph is built.
  if (stored_options.use_decomposer) {
    ProgramPass::Apply(&program, {}, target, {"Decomposer"});
  }

  ctx->graph.reset(new fw::Graph(program, target));
  fw::Graph *graph = ctx->graph.get();

  if (stored_options.use_default_passes) {
    fw::ApplyPass(graph, "InferShape");
#ifndef CINN_WITH_CUDA
    if (target.arch == Target::Arch::X86) {
      fw::ApplyPass(graph, "AlterLayout");
    }
#endif
    fw::ApplyPass(graph, "ConstPropagate");
    fw::ApplyPasses(graph, DefaultOpFusionPasses());
  }
  // Extra passes requested by the caller run after the default ones.
  for (auto &extra_pass : stored_options.passes) {
    fw::ApplyPass(graph, extra_pass);
  }

  ctx->scope = fw::BuildScope(target, ctx->graph, scope);
  ctx->graph_compiler.reset(
      new fw::GraphCompiler(target, ctx->scope, ctx->graph));

  std::unordered_set<std::string> fetch_var_ids;
  for (auto &fetched : outputs) {
    fetch_var_ids.insert(fetched->id);
  }
  ctx->program = ctx->graph_compiler->Build(options, std::move(fetch_var_ids))
                     .runtime_program;
  if (stored_options.do_prerun) {
    ctx->program->PreRun();
  }

  // Cache the tensors callers will read from / write to.
  for (auto &input_var : program.GetInputs()) {
    ctx->inputs.push_back(ctx->scope->GetTensor(input_var->id));
  }
  for (auto &output_var : outputs) {
    ctx->outputs.push_back(ctx->scope->GetTensor(output_var->id));
  }
  return ctx;
}
// Returns a std::string copy of every variable name reported by the scope.
std::vector<std::string> CinnComputation::GetAllTensorNames() {
  std::vector<std::string> names;
  for (auto &var_name : context_->scope->var_names()) {
    names.emplace_back(std::string(var_name));
  }
  return names;
}
/**
 * Load a Paddle model from \p model_path, convert it to a program and compile
 * it.
 *
 * \p input_names and \p input_shapes are parallel arrays and must have the
 * same length. The fetch names reported by LoadPaddleProgram become the
 * outputs of the computation. The variable maps returned by the loader are
 * copied into the context so that GetTensor() can resolve names through
 * `varmap_paddle2program`.
 */
std::shared_ptr<CinnComputation> CinnComputation::CompilePaddleModel(
    const Target &target,
    const std::string &model_path,
    const std::vector<std::string> &input_names,
    const std::vector<hlir::framework::shape_t> &input_shapes,
    bool params_combined,
    const CompileOptions &options,
    void *stream) {
  CHECK_EQ(input_names.size(), input_shapes.size())
      << "Each input name needs exactly one input shape";
  auto scope = std::make_shared<hlir::framework::Scope>();
  std::unordered_map<std::string, std::vector<int>> input_shape_map;
  // size_t index: avoids comparing a signed counter with an unsigned size.
  for (size_t idx = 0; idx < input_names.size(); ++idx) {
    input_shape_map[input_names[idx]] = input_shapes[idx];
  }
  auto loadedProgram = LoadPaddleProgram(
      model_path, scope.get(), input_shape_map, params_combined, target);
  auto &program = std::get<0>(loadedProgram);
  auto &varmap = std::get<1>(loadedProgram);
  auto &varmap_paddle2program = std::get<2>(loadedProgram);
  auto &fetch_names = std::get<3>(loadedProgram);
  // std::vector<Variable> input_vars;
  // for (int i = 0; i < input_names.size(); i++) {
  //   auto &name = input_names[i];
  //   auto &var = varmap.at(name);
  //   var->shape = input_shapes[i];
  //   input_vars.push_back(var);
  // }
  // program->SetInputs({input_vars});
  // program->Validate();
  VLOG(3) << "program:\n" << *program;

  std::vector<Variable> output_vars;
  for (auto &name : fetch_names) {
    output_vars.push_back(varmap.at(name));
  }

  std::shared_ptr<ComputationContext> ctx =
      CompileProgram(target, *program, output_vars, scope, options, stream);
  for (auto &v : varmap) {
    ctx->varmap[v.first] = v.second;
  }
  for (auto &v : varmap_paddle2program) {
    ctx->varmap_paddle2program[v.first] = v.second;
  }

  auto computation = std::make_shared<CinnComputation>();
  computation->context_ = std::move(ctx);
  return computation;
}
std::shared_ptr<CinnComputation> CinnComputation::BuildAndCompile(
const Target &target,
NetBuilder &builder,
const CompileOptions &options,
const std::vector<Variable> &outputs,
void *stream) {
auto program = builder.Build();
return Compile(target, program, options, outputs, stream);
}
std::shared_ptr<CinnComputation> CinnComputation::Compile(
const Target &target,
Program &program,
const CompileOptions &options,
const std::vector<Variable> &outputs,
void *stream) {
std::vector<Variable> output_vars = outputs;
if (output_vars.empty()) {
output_vars.push_back(program[program.size() - 1].GetOutput(0));
}
std::shared_ptr<ComputationContext> ctx =
CompileProgram(target, program, output_vars, nullptr, options, stream);
auto computation = std::make_shared<CinnComputation>();
computation->context_ = std::move(ctx);
return computation;
}
void CinnComputation::SetTensorData(const std::string &tname,
void *data,
size_t size) {
hlir::framework::Tensor t = GetTensor(tname);
SetTensorData(t, data, size);
}
/**
 * Copy `size` bytes from the caller's buffer `data` into tensor `t`.
 *
 * `size` must equal numel * element-bytes of the tensor. X86 targets use
 * memcpy; NVGPU targets use a host-to-device cudaMemcpy when built with
 * CINN_WITH_CUDA. Every other combination hits CINN_NOT_IMPLEMENTED.
 */
void CinnComputation::SetTensorData(hlir::framework::Tensor &t,
                                    void *data,
                                    size_t size) {
  void *dst = t->mutable_data(context_->target, t->type());
  CHECK_EQ(size, t->shape().numel() * t->type().bytes());

  const auto arch = context_->target.arch;
  if (arch == Target::Arch::X86) {
    memcpy(dst, data, size);
    return;
  }
  if (arch == Target::Arch::NVGPU) {
#ifdef CINN_WITH_CUDA
    CUDA_CALL(cudaMemcpy(dst, data, size, cudaMemcpyHostToDevice));
#else
    CINN_NOT_IMPLEMENTED
#endif
    return;
  }
  CINN_NOT_IMPLEMENTED
}
/**
 * Copy `size` bytes out of tensor `t` into the caller's buffer `data`.
 *
 * `size` must equal numel * element-bytes of the tensor. X86 targets use
 * memcpy; NVGPU targets use a device-to-host cudaMemcpy when built with
 * CINN_WITH_CUDA. Every other combination hits CINN_NOT_IMPLEMENTED.
 */
void CinnComputation::GetTensorData(hlir::framework::Tensor &t,
                                    void *data,
                                    size_t size) {
  void *src = t->mutable_data(context_->target, t->type());
  CHECK_EQ(size, t->shape().numel() * t->type().bytes());

  const auto arch = context_->target.arch;
  if (arch == Target::Arch::X86) {
    memcpy(data, src, size);
    return;
  }
  if (arch == Target::Arch::NVGPU) {
#ifdef CINN_WITH_CUDA
    CUDA_CALL(cudaMemcpy(data, src, size, cudaMemcpyDeviceToHost));
#else
    CINN_NOT_IMPLEMENTED
#endif
    return;
  }
  CINN_NOT_IMPLEMENTED
}
void CinnComputation::GetTensorData(const std::string &tname,
void *data,
size_t size) {
hlir::framework::Tensor t = GetTensor(tname);
GetTensorData(t, data, size);
}
// Returns a copy of the input tensor list cached at compile time.
std::vector<hlir::framework::Tensor> CinnComputation::GetInputTensors() {
  const auto &cached_inputs = context_->inputs;
  return cached_inputs;
}
// Returns a copy of the output tensor list cached at compile time.
std::vector<hlir::framework::Tensor> CinnComputation::GetOutputTensors() {
  const auto &cached_outputs = context_->outputs;
  return cached_outputs;
}
/**
 * Look a tensor up by name.
 *
 * The name is first tried directly against the scope. If the scope has no
 * such variable, it is translated through `varmap_paddle2program`; a name
 * missing from both places is a fatal error that lists the existing vars.
 */
hlir::framework::Tensor CinnComputation::GetTensor(const std::string &tname) {
  const auto &scope = context_->scope;
  if (scope->FindVar(tname)) {
    return scope->GetTensor(tname);
  }

  const auto &paddle2program = context_->varmap_paddle2program;
  const auto mapped = paddle2program.find(tname);
  if (mapped == paddle2program.end()) {
    LOG(FATAL) << "No variable called [" << tname
               << "] found in computation\nThe existing vars: "
               << utils::Join(scope->var_names(), ", ");
  }
  return scope->GetTensor(mapped->second);
}
// Runs the compiled runtime program, forwarding the optional argument map and
// the stream handle stored in the context.
void CinnComputation::Execute(
    const std::map<std::string, cinn_pod_value_t> *name2podargs) {
  const auto &runtime_program = context_->program;
  runtime_program->Execute(name2podargs, context_->stream);
}
} // namespace frontend
} // namespace cinn
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <iostream>
#include "paddle/cinn/frontend/net_builder.h"
#include "paddle/cinn/frontend/syntax.h"
#include "paddle/cinn/hlir/framework/graph_compiler.h"
#include "paddle/cinn/hlir/framework/tensor.h"
namespace cinn {
namespace frontend {
struct ComputationContext;
/**
 * Facade that compiles a program (or a Paddle model) and lets the caller feed
 * inputs, run the compiled program and read tensors back.
 */
class CinnComputation {
 public:
  /**
   * Compilation switches, on top of those of GraphCompiler::CompileOptions.
   */
  struct CompileOptions
      : public hlir::framework::GraphCompiler::CompileOptions {
    // Apply the "Decomposer" program pass before building the graph.
    bool use_decomposer = false;
    // Call PreRun() on the runtime program right after compilation.
    bool do_prerun = true;
    // Apply the built-in graph passes before the user-specified ones.
    bool use_default_passes = true;
    // Extra graph passes, applied in order.
    std::vector<std::string> passes;
  };

  /**
   * Options used when the caller passes none: the in-class defaults above,
   * with `with_instantiate_variables` switched on.
   */
  inline static CompileOptions DefaultCompileOptions() {
    // use_decomposer, do_prerun, use_default_passes and passes already carry
    // the desired values through their default member initializers.
    CompileOptions defaults;
    defaults.with_instantiate_variables = true;
    return defaults;
  }

  /**
   * Build the program from a NetBuilder, then compile it.
   * @param target the target to run the program
   * @param builder program builder
   * @param options CompileOptions, config the compilation steps
   * @param outputs program output variables; if empty, the output variable
   * of the last instruction of the program is used
   * @param stream CUDA stream, only meaningful when target is NVGPU
   * @return shared_ptr pointing to CinnComputation instance
   */
  static std::shared_ptr<CinnComputation> BuildAndCompile(
      const Target &target,
      NetBuilder &builder,  // NOLINT
      const CompileOptions &options = DefaultCompileOptions(),
      const std::vector<Variable> &outputs = {},
      void *stream = nullptr);

  /**
   * Compile a program.
   * @param target the target to run the program
   * @param program program (usually generated by a Builder, or converted
   * from a Paddle model)
   * @param options CompileOptions, config the compilation steps
   * @param outputs program output variables; if empty, the output variable
   * of the last instruction of the program is used
   * @param stream CUDA stream, only meaningful when target is NVGPU
   * @return shared_ptr pointing to CinnComputation instance
   */
  static std::shared_ptr<CinnComputation> Compile(
      const Target &target,
      Program &program,  // NOLINT
      const CompileOptions &options = DefaultCompileOptions(),
      const std::vector<Variable> &outputs = {},
      void *stream = nullptr);

  /**
   * Convert a Paddle model to a program, then compile it.
   * @param target the target to run the program
   * @param model_path the path of the Paddle model
   * @param input_names input variable names of the Paddle model
   * @param input_shapes input variable shapes, parallel to input_names
   * @param params_combined whether params are stored combined
   * @param options CompileOptions, config the compilation steps
   * @param stream CUDA stream, only meaningful when target is NVGPU
   * @return shared_ptr pointing to CinnComputation instance
   */
  static std::shared_ptr<CinnComputation> CompilePaddleModel(
      const Target &target,
      const std::string &model_path,
      const std::vector<std::string> &input_names,
      const std::vector<hlir::framework::shape_t> &input_shapes,
      bool params_combined,
      const CompileOptions &options = DefaultCompileOptions(),
      void *stream = nullptr);

  /**
   * Get all variable names in the program.
   */
  std::vector<std::string> GetAllTensorNames();

  /**
   * Get a tensor by name.
   * @param name tensor name
   */
  hlir::framework::Tensor GetTensor(const std::string &name);

  /**
   * Get the input tensors.
   */
  std::vector<hlir::framework::Tensor> GetInputTensors();

  /**
   * Get the output tensors.
   */
  std::vector<hlir::framework::Tensor> GetOutputTensors();

  /**
   * Fill a tensor from a user-specified buffer.
   * If the tensor is in NVGPU device memory, cudaMemcpy is used.
   * @param t the tensor
   * @param data address of the buffer holding the data to copy in
   * @param size size of the buffer in bytes; must match the tensor's size
   */
  void SetTensorData(hlir::framework::Tensor &t,  // NOLINT
                     void *data,
                     size_t size);

  /**
   * Fill a tensor (specified by its name) from a user-specified buffer.
   * If the tensor is in NVGPU device memory, cudaMemcpy is used.
   * @param tname name of the tensor
   * @param data address of the buffer holding the data to copy in
   * @param size size of the buffer in bytes; must match the tensor's size
   */
  void SetTensorData(const std::string &tname, void *data, size_t size);

  /**
   * Copy the data of a tensor to a user-specified buffer.
   * If the tensor is in NVGPU device memory, cudaMemcpy is used.
   * @param t the tensor
   * @param data address of the buffer that receives the tensor's data
   * @param size size of the buffer in bytes; must match the tensor's size
   */
  void GetTensorData(hlir::framework::Tensor &t,  // NOLINT
                     void *data,
                     size_t size);

  /**
   * Copy the data of a tensor (specified by its name) to a user-specified
   * buffer. If the tensor is in NVGPU device memory, cudaMemcpy is used.
   * @param tname name of the tensor
   * @param data address of the buffer that receives the tensor's data
   * @param size size of the buffer in bytes; must match the tensor's size
   */
  void GetTensorData(const std::string &tname, void *data, size_t size);

  /**
   * Run the compiled program.
   * @param name2podargs optional argument map forwarded to the runtime
   * program
   */
  void Execute(
      const std::map<std::string, cinn_pod_value_t> *name2podargs = nullptr);

 private:
  std::shared_ptr<ComputationContext> context_;
};
} // namespace frontend
} // namespace cinn
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/frontend/computation.h"
#include <gtest/gtest.h>
#include "paddle/cinn/common/target.h"
#include "paddle/cinn/frontend/decomposer/use_decomposer.h"
#include "paddle/cinn/frontend/decomposer_registry.h"
#include "paddle/cinn/frontend/net_builder.h"
#include "paddle/cinn/frontend/pass/use_program_pass.h"
#include "paddle/cinn/frontend/program_pass.h"
// Command-line flag --model_dir: the fc_execute_* tests below assert that it
// is non-empty and pass it to CinnComputation::CompilePaddleModel.
DEFINE_string(model_dir, "", "");
namespace cinn {
namespace frontend {
// Builds a program with two {M, N/2} float inputs ("A" and "B") that chains
// transpose, reshape, element-wise, concat, broadcast, select and reduce ops.
// The last instruction added is the ReduceSum.
Program CreateTestProgram() {
  constexpr int B = 8;
  constexpr int M = 32;
  constexpr int N = 24;

  NetBuilder builder("net_builder");
  auto in_a = builder.CreateInput(Float(32), {M, N / 2}, "A");
  auto in_b = builder.CreateInput(Float(32), {M, N / 2}, "B");

  auto b_transposed = builder.Transpose(in_b, {1, 0});
  auto b_reshaped = builder.Reshape(b_transposed, {M, N / 2});
  auto sum_ab = builder.Add(in_a, b_reshaped);
  auto quot_ab = builder.Divide(in_a, in_b);
  auto lhs_concat = builder.Concat({sum_ab, quot_ab}, 1);
  auto lhs = builder.BroadcastTo(lhs_concat, {B, M, N}, {1, 2});
  auto rhs_concat = builder.Concat({in_a, in_b}, 1);
  auto rhs = builder.BroadcastTo(rhs_concat, {B, M, N}, {1, 2});
  auto diff = builder.Subtract(lhs, rhs);
  auto upper = builder.Max(lhs, diff);
  auto lower = builder.Min(lhs, diff);
  auto product = builder.Multiply(upper, lower);
  auto cond_scalar = builder.Constant<bool>(1, "condition");
  auto cond = builder.BroadcastTo(cond_scalar, {B, M, N}, {0});
  auto selected = builder.Select(cond, lower, product);
  builder.ReduceSum(selected, {0, 1, 2});

  return builder.Build();
}
// Builds a tiny program: relu of the first {M, N} float input added to the
// second one.
Program CreateAddProgram() {
  constexpr int M = 32;
  constexpr int N = 24;

  NetBuilder builder("net_builder");
  auto first = builder.CreateInput(Float(32), {M, N});
  auto second = builder.CreateInput(Float(32), {M, N});
  auto activated = builder.Relu(first);
  builder.Add(second, activated);

  return builder.Build();
}
// Compiles d = a + (a + b) for the host target, feeds random data through the
// name-based SetTensorData/GetTensorData API and checks d == 2 * a + b.
TEST(cinn_computation, basic_cpu) {
  NetBuilder builder("basic");
  constexpr int M = 32;
  constexpr int N = 24;
  auto a = builder.CreateInput(Float(32), {M, N}, "A");
  auto b = builder.CreateInput(Float(32), {M, N}, "B");
  auto c = builder.Add(a, b);
  auto d = builder.Add(a, c);
  auto target = common::DefaultHostTarget();
  auto comp = CinnComputation::BuildAndCompile(target, builder);

  std::vector<float> hostA(M * N);
  std::vector<float> hostB(M * N);
  std::vector<float> hostD(M * N);
  std::vector<float> hostD_expected(M * N);
  for (int i = 0; i < M * N; i++) {
    // rand() yields values in [0, RAND_MAX]; dividing by RAND_MAX (not
    // INT_MAX) keeps the inputs in [0, 1] on every platform.
    hostA[i] = static_cast<float>(rand()) / RAND_MAX;  // NOLINT
    hostB[i] = static_cast<float>(rand()) / RAND_MAX;  // NOLINT
    hostD_expected[i] = hostA[i] * 2 + hostB[i];
  }

  comp->SetTensorData("A",
                      reinterpret_cast<void *>(hostA.data()),
                      hostA.size() * sizeof(float));
  comp->SetTensorData("B",
                      reinterpret_cast<void *>(hostB.data()),
                      hostB.size() * sizeof(float));
  comp->Execute();
  comp->GetTensorData(d->id,
                      reinterpret_cast<void *>(hostD.data()),
                      hostD.size() * sizeof(float));

  for (size_t i = 0; i < hostD.size(); i++) {
    ASSERT_NEAR(hostD[i], hostD_expected[i], 1e-5);
  }
}
#ifdef CINN_WITH_CUDA
// Same scenario as basic_cpu, compiled for the default NVGPU target: host
// buffers are copied to and from the device through Set/GetTensorData.
TEST(cinn_computation, basic_gpu) {
  NetBuilder builder("basic");
  constexpr int M = 32;
  constexpr int N = 24;
  auto a = builder.CreateInput(Float(32), {M, N}, "A");
  auto b = builder.CreateInput(Float(32), {M, N}, "B");
  auto c = builder.Add(a, b);
  auto d = builder.Add(a, c);
  auto target = common::DefaultNVGPUTarget();
  auto comp = CinnComputation::BuildAndCompile(target, builder);

  std::vector<float> hostA(M * N);
  std::vector<float> hostB(M * N);
  std::vector<float> hostD(M * N);
  std::vector<float> hostD_expected(M * N);
  for (int i = 0; i < M * N; i++) {
    // rand() yields values in [0, RAND_MAX]; dividing by RAND_MAX (not
    // INT_MAX) keeps the inputs in [0, 1] on every platform.
    hostA[i] = static_cast<float>(rand()) / RAND_MAX;  // NOLINT
    hostB[i] = static_cast<float>(rand()) / RAND_MAX;  // NOLINT
    hostD_expected[i] = hostA[i] * 2 + hostB[i];
  }

  comp->SetTensorData("A",
                      reinterpret_cast<void *>(hostA.data()),
                      hostA.size() * sizeof(float));
  comp->SetTensorData("B",
                      reinterpret_cast<void *>(hostB.data()),
                      hostB.size() * sizeof(float));
  comp->Execute();
  comp->GetTensorData(d->id,
                      reinterpret_cast<void *>(hostD.data()),
                      hostD.size() * sizeof(float));

  for (size_t i = 0; i < hostD.size(); i++) {
    ASSERT_NEAR(hostD[i], hostD_expected[i], 1e-5);
  }
}
#endif
// Compiles the program from CreateTestProgram() for the host target, checks
// the number and size of input/output tensors and runs it ten times with
// fresh random inputs written straight into the tensors' host memory.
TEST(cinn_computation, net_builder_cpu) {
  auto program = CreateTestProgram();
  auto target = common::DefaultHostTarget();
  auto compute = CinnComputation::Compile(target, program);

  auto inputs = compute->GetInputTensors();
  ASSERT_EQ(inputs.size(), 2);
  auto tensorA = inputs[0];
  auto tensorB = inputs[1];
  ASSERT_EQ(tensorA->shape().numel(), 32 * 24 / 2);
  ASSERT_EQ(tensorB->shape().numel(), 32 * 24 / 2);

  auto outputs = compute->GetOutputTensors();
  ASSERT_EQ(outputs.size(), 1);

  auto load_input = [=](hlir::framework::Tensor t) {
    float *ptr = t->mutable_data<float>(target);
    for (int i = 0; i < t->shape().numel(); i++) {
      // RAND_MAX is the actual upper bound of rand(); INT_MAX is not
      // guaranteed to match it.
      ptr[i] = static_cast<float>(rand()) / RAND_MAX;  // NOLINT
    }
  };

  // run inference for 10 times
  for (int i = 0; i < 10; i++) {
    // load data directly to tensor's host memory
    load_input(tensorA);
    load_input(tensorB);
    // execute engine
    compute->Execute();
    // get outputs (ignored)
  }
}
#ifdef CINN_WITH_CUDA
// GPU variant of net_builder_cpu: demonstrates feeding one input directly in
// device memory and the other through a synchronous host-to-device copy, then
// copying the output back to the host.
TEST(cinn_computation, net_builder_gpu) {
  auto program = CreateTestProgram();
  auto target = common::DefaultNVGPUTarget();
  auto compute = CinnComputation::Compile(target, program);

  auto inputs = compute->GetInputTensors();
  ASSERT_EQ(inputs.size(), 2);
  auto tensorA = inputs[0];
  auto tensorB = inputs[1];
  ASSERT_EQ(tensorA->shape().numel(), 32 * 24 / 2);
  ASSERT_EQ(tensorB->shape().numel(), 32 * 24 / 2);

  auto outputs = compute->GetOutputTensors();
  ASSERT_EQ(outputs.size(), 1);
  auto tensorOut = outputs[0];

  // run inference for 10 times
  for (int i = 0; i < 10; i++) {
    // load data directly to tensor's host memory
    // assume tensorA is generated in GPU directly
    // (the pointer must come from tensorA, not from the output tensor)
    float *device_ptrA = tensorA->mutable_data<float>(target);
    (void)device_ptrA;
    // ... generated data directly in device memory via gpu kernels
    // ... or async copy to device memory
    // ... not showed here
    // assume tensorB is generated in host memory, needs copy to GPU memory
    // (sync.)
    std::vector<float> hostB(32 * 24 / 2);
    compute->SetTensorData(tensorB,
                           reinterpret_cast<void *>(hostB.data()),
                           hostB.size() * sizeof(float));
    // execute engine
    compute->Execute();
    // get outputs
    std::vector<float> hostOut(tensorOut->shape().numel());
    compute->GetTensorData(tensorOut,
                           reinterpret_cast<void *>(hostOut.data()),
                           hostOut.size() * sizeof(float));
  }
}
#endif
// Compiles the Paddle model found at --model_dir for the host target with a
// single {1, 30} input named "A" and executes it once.
TEST(cinn_computation, fc_execute_cpu) {
  auto target = common::DefaultHostTarget();
  ASSERT_NE(FLAGS_model_dir, "");
  auto compute = CinnComputation::CompilePaddleModel(
      target, FLAGS_model_dir, {"A"}, {{1, 30}}, false);

  auto inputs = compute->GetInputTensors();
  ASSERT_EQ(inputs.size(), 1);
  auto A = inputs[0];
  ASSERT_EQ(A->shape().numel(), 1 * 30);

  float *ptrA = A->mutable_data<float>(target);
  // RAND_MAX is the actual upper bound of rand(); INT_MAX is not guaranteed
  // to match it.
  for (int i = 0; i < 30; i++)
    ptrA[i] = static_cast<float>(rand()) / RAND_MAX;  // NOLINT
  // NOTE(review): the random values above are overwritten with zeros right
  // away, so the model actually runs on an all-zero input.
  for (int i = 0; i < 30; i++) ptrA[i] = static_cast<float>(0);

  compute->Execute();
}
#ifdef CINN_WITH_CUDA
// GPU variant of fc_execute_cpu: the input is staged in host memory, copied
// to the device, and the single output is copied back after execution.
TEST(cinn_computation, fc_execute_gpu) {
  auto target = common::DefaultNVGPUTarget();
  ASSERT_NE(FLAGS_model_dir, "");
  auto compute = CinnComputation::CompilePaddleModel(
      target, FLAGS_model_dir, {"A"}, {{1, 30}}, false);

  auto inputs = compute->GetInputTensors();
  ASSERT_EQ(inputs.size(), 1);
  auto A = inputs[0];
  ASSERT_EQ(A->shape().numel(), 1 * 30);

  auto outputs = compute->GetOutputTensors();
  ASSERT_EQ(outputs.size(), 1);
  auto out = outputs[0];

  std::vector<float> hostA(30);
  // RAND_MAX is the actual upper bound of rand(); INT_MAX is not guaranteed
  // to match it.
  for (float &v : hostA) v = static_cast<float>(rand()) / RAND_MAX;  // NOLINT
  compute->SetTensorData(
      A, reinterpret_cast<void *>(hostA.data()), hostA.size() * sizeof(float));

  compute->Execute();

  std::vector<float> hostOut(30);
  compute->GetTensorData(out,
                         reinterpret_cast<void *>(hostOut.data()),
                         hostOut.size() * sizeof(float));
}
#endif
// API usage demo: compiles the add program once with and once without the
// Decomposer program pass.
TEST(cinn_computation, decomposer_cpu) {
  // The Decomposer pass has to be registered for the second half to work.
  ASSERT_NE(cinn::frontend::ProgramPassRegistry::Global()->Find("Decomposer"),
            nullptr);

  // Decomposer switched off: exactly three tensors are expected.
  {
    auto add_program = CreateAddProgram();
    auto host = common::DefaultHostTarget();
    auto compile_options = CinnComputation::DefaultCompileOptions();
    compile_options.use_decomposer = false;
    auto computation =
        CinnComputation::Compile(host, add_program, compile_options);
    auto tensor_names = computation->GetAllTensorNames();
    ASSERT_EQ(tensor_names.size(), 3);
  }

  // Decomposer switched on: only checks that compilation goes through.
  {
    auto add_program = CreateAddProgram();
    auto host = common::DefaultHostTarget();
    auto compile_options = CinnComputation::DefaultCompileOptions();
    compile_options.use_decomposer = true;
    auto computation =
        CinnComputation::Compile(host, add_program, compile_options);
    auto tensor_names = computation->GetAllTensorNames();
  }
}
#ifdef CINN_WITH_CUDA
// API usage demo: compiles and runs the add program on a user-created CUDA
// stream. The stream is created with an error check, drained after the run
// and destroyed again so the test does not leak it.
TEST(cinn_computation, gpu_stream) {
  auto target = common::DefaultNVGPUTarget();
  auto prog = CreateAddProgram();
  auto options = CinnComputation::DefaultCompileOptions();

  cudaStream_t stream = nullptr;
  ASSERT_EQ(cudaStreamCreate(&stream), cudaSuccess);
  {
    // Scope the computation so it is released before the stream goes away.
    auto compute = CinnComputation::Compile(
        target, prog, options, {}, static_cast<void *>(stream));
    compute->Execute();
    // Wait for any work still queued on the stream before tearing it down.
    EXPECT_EQ(cudaStreamSynchronize(stream), cudaSuccess);
  }
  EXPECT_EQ(cudaStreamDestroy(stream), cudaSuccess);
}
#endif
// API usage demo: compiles the add program with with_instantiate_variables
// turned off. Execution with an explicit argument map is left commented out.
TEST(cinn_computation, without_instantiate_variables) {
  auto host = common::DefaultHostTarget();
  auto add_program = CreateAddProgram();

  auto compile_options = CinnComputation::DefaultCompileOptions();
  compile_options.with_instantiate_variables = false;

  auto computation =
      CinnComputation::Compile(host, add_program, compile_options);
  auto tensor_names = computation->GetAllTensorNames();

  std::map<std::string, cinn_pod_value_t> pod2args;
  // computation->Execute(&pod2args);
}
} // namespace frontend
} // namespace cinn
# Build rules for the decomposer directory.
# NOTE(review): core_gather_headers, gather_srcs, cinn_cc_library and
# cinn_cc_test are project helper macros defined elsewhere; their exact
# behaviour is not visible here.
core_gather_headers()
# Decomposer sources passed to gather_srcs under the cinnapi_src name.
gather_srcs(
cinnapi_src
SRCS
activation.cc
elementwise.cc
broadcast.cc
batch_norm.cc
top_k.cc)
# Helper library that every decomposer test below lists in DEPS.
cinn_cc_library(decomposer_test_helper SRCS test_helper.cc DEPS cinncore)
# The decomposer tests are only registered when building with CUDA.
if(WITH_CUDA)
cinn_cc_test(test_activation_decomposer SRCS activation_test.cc DEPS cinncore
decomposer_test_helper)
cinn_cc_test(test_elementwise_decomposer SRCS elementwise_test.cc DEPS
cinncore decomposer_test_helper)
cinn_cc_test(test_broadcast_decomposer SRCS broadcast_test.cc DEPS cinncore
decomposer_test_helper)
cinn_cc_test(test_batch_norm_decomposer SRCS batch_norm_test.cc DEPS cinncore
decomposer_test_helper)
cinn_cc_test(test_top_k_decomposer SRCS top_k_test.cc DEPS cinncore
decomposer_test_helper)
endif()
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/frontend/decomposer_registry.h"
#include "paddle/cinn/frontend/syntax.h"
namespace cinn {
namespace frontend {
namespace decomposer {
// Decomposes relu(x) into max(x, 0), where the zero operand is a constant
// tensor with the shape and type of x.
void relu(const Instruction& instr, const DecomposerContext& context) {
  CHECK_EQ(instr->inputs.size(), 1UL)
      << " 1 input tensor for " << instr->op_type;
  CHECK_EQ(instr->outputs.size(), 1UL)
      << "1 output tensor for " << instr->op_type;

  auto input = instr->inputs[0];
  auto origin_out = instr->outputs[0];
  auto* net = context.builder();

  auto zeros = net->FillConstant(input->shape,
                                 0.0f,
                                 common::UniqName("zero"),
                                 common::Type2Str(input->type));
  auto clipped = net->Max(input, zeros);

  // Map the output of the decomposed operator to the original one.
  context.MapOutToOrigin(clipped, origin_out);
}
// Decomposes relu_grad(dout, out) into select(out > 0, dout, 0), where the
// zero operand is a constant tensor with the shape and type of out.
void relu_grad(const Instruction& instr, const DecomposerContext& context) {
  CHECK_EQ(instr->inputs.size(), 2UL)
      << " 2 input tensors for " << instr->op_type;
  CHECK_EQ(instr->outputs.size(), 1UL)
      << "1 output tensor for " << instr->op_type;

  auto grad_out = instr->inputs[0];
  auto forward_out = instr->inputs[1];
  auto grad_in = instr->outputs[0];
  auto* net = context.builder();

  auto zeros = net->FillConstant(forward_out->shape,
                                 0.0f,
                                 common::UniqName("zero"),
                                 common::Type2Str(forward_out->type));
  auto is_positive = net->GreaterThan(forward_out, zeros);
  auto masked_grad = net->Select(is_positive, grad_out, zeros);

  // Map the output of the decomposed operator to the original one.
  context.MapOutToOrigin(masked_grad, grad_in);
}
// Decomposes gelu(x) into x * (0.5 + 0.5 * erf(sqrt(0.5) * x)) using constant
// tensors with the shape and type of x.
void gelu(const Instruction& instr, const DecomposerContext& context) {
  CHECK_EQ(instr->inputs.size(), 1UL)
      << " 1 input tensor for " << instr->op_type;
  CHECK_EQ(instr->outputs.size(), 1UL)
      << "1 output tensor for " << instr->op_type;

  auto input = instr->inputs[0];
  auto origin_out = instr->outputs[0];
  auto* net = context.builder();

  // Constants of the formula: 0.5 and sqrt(0.5).
  auto half = net->FillConstant(input->shape,
                                0.5f,
                                common::UniqName("p_5"),
                                common::Type2Str(input->type));
  auto inv_sqrt2 = net->FillConstant(input->shape,
                                     std::sqrt(0.5),
                                     common::UniqName("p_7"),
                                     common::Type2Str(input->type));

  auto scaled = net->Multiply(input, inv_sqrt2);
  auto erf_term = net->Erf(scaled);
  auto half_erf = net->Multiply(half, erf_term);
  auto cdf = net->Add(half, half_erf);
  auto result = net->Multiply(input, cdf);

  // Map the output of the decomposed operator to the original one.
  context.MapOutToOrigin(result, origin_out);
}
/**
 * Decomposes softmax over the "axes" attribute into exp / reduce / broadcast
 * / divide operators.
 *
 * "mode" == "fast" computes exp(x) / sum(exp(x)) directly; any other mode
 * first subtracts the maximum along the reduced axes, i.e. computes
 * exp(x - max(x)) / sum(exp(x - max(x))).
 */
void softmax(const Instruction& instr, const DecomposerContext& context) {
  CHECK_EQ(instr->inputs.size(), 1UL)
      << " 1 input tensor for " << instr->op_type;
  CHECK_EQ(instr->outputs.size(), 1UL)
      << "1 output tensor for " << instr->op_type;
  auto x = instr->inputs[0];
  auto output = instr->outputs[0];
  auto* builder = context.builder();

  // Work with a signed rank so that neither the negative-axis normalisation
  // nor the loop below mixes signed and unsigned arithmetic.
  const int rank = static_cast<int>(x->shape.size());

  auto axes = instr.GetAttrs<std::vector<int>>("axes");
  CHECK(axes.size());
  for (auto& axis : axes) {
    if (axis < 0) {
      axis += rank;
    }
  }

  // Broadcast axes are the dimensions that are NOT reduced.
  std::vector<int> b_axes;
  for (int idx = 0; idx < rank; ++idx) {
    if (std::find(axes.begin(), axes.end(), idx) == axes.end()) {
      b_axes.push_back(idx);
    }
  }
  // When every dimension of x is reduced (e.g. x has rank 1), the broadcast
  // axes would be empty, so the last dim is inserted as broadcast axis.
  if (b_axes.empty()) {
    b_axes.emplace_back(-1);
  }

  auto mode = instr.GetAttrs<std::string>("mode");
  if (mode == "fast") {
    // x_sum = sum(exp(x))
    auto x_sum = builder->BroadcastTo(
        builder->ReduceSum(builder->Exp(x), axes), x->shape, b_axes);
    // x_exp / x_sum
    auto out = builder->Divide(builder->Exp(x), x_sum);
    // Map the output of the decomposed operator to the original one.
    context.MapOutToOrigin(out, output);
  } else {
    // x_max = max(x)
    auto x_max =
        builder->BroadcastTo(builder->ReduceMax(x, axes), x->shape, b_axes);
    // x_exp = exp(x - x_max)
    auto x_exp = builder->Exp(builder->Subtract(x, x_max));
    // x_sum = sum(x_exp)
    auto x_sum =
        builder->BroadcastTo(builder->ReduceSum(x_exp, axes), x->shape, b_axes);
    // x_exp / x_sum; exp(x - x_max) is emitted a second time here instead of
    // reusing x_exp, so the generated instruction sequence stays unchanged.
    auto out =
        builder->Divide(builder->Exp(builder->Subtract(x, x_max)), x_sum);
    // Map the output of the decomposed operator to the original one.
    context.MapOutToOrigin(out, output);
  }
}
} // namespace decomposer
} // namespace frontend
} // namespace cinn
// Registration helpers: each one binds an operator name to the decomposer
// function of the same name defined above in cinn::frontend::decomposer.
CINN_REGISTER_HELPER(relu_decomposers) {
CINN_DECOMPOSER_REGISTER(relu, cinn::frontend::decomposer::relu);
return true;
}
// relu_grad -> decomposer::relu_grad
CINN_REGISTER_HELPER(relu_grad_decomposers) {
CINN_DECOMPOSER_REGISTER(relu_grad, cinn::frontend::decomposer::relu_grad);
return true;
}
// gelu -> decomposer::gelu
CINN_REGISTER_HELPER(gelu_decomposers) {
CINN_DECOMPOSER_REGISTER(gelu, cinn::frontend::decomposer::gelu);
return true;
}
// softmax -> decomposer::softmax
CINN_REGISTER_HELPER(softmax_decomposers) {
CINN_DECOMPOSER_REGISTER(softmax, cinn::frontend::decomposer::softmax);
return true;
}
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include "paddle/cinn/frontend/decomposer/test_helper.h"
namespace cinn::frontend {
// Builds a network containing a single Relu over a 20x10 float32 input and
// hands it to RunAndCheck together with a scalar CPU reference kernel.
TEST(Decomposer, relu) {
  NetBuilder builder("relu");
  auto x = builder.CreateInput(Float(32), {20, 10}, "x");
  auto out = builder.Relu(x);

  // Reference kernel. ptrs[0] is read as the input buffer, ptrs[1] is written
  // as the output buffer, and lengths[0] is the number of elements processed.
  auto relu_cpu = [](const std::vector<size_t>& lengths,
                     const std::vector<void*>& ptrs) {
    float* src = static_cast<float*>(ptrs[0]);
    float* dst = static_cast<float*>(ptrs[1]);
    const size_t count = lengths[0];
    for (size_t idx = 0; idx != count; ++idx) {
      const float value = src[idx];
      if (value > 0) {
        dst[idx] = value;
      } else {
        dst[idx] = 0;
      }
    }
  };

  std::vector<std::string> input_names = {x.id().data()};
  std::vector<std::string> output_names = {out->id};
  std::vector<std::vector<int>> output_shapes = {{20, 10}};
  // NOTE(review): the trailing `-1, 1` are presumably the bounds of the random
  // input values -- confirm against RunAndCheck in test_helper.h.
  RunAndCheck<float>(
      &builder, input_names, output_names, output_shapes, relu_cpu, -1, 1);
}
// Builds a network containing a single ReluGrad over two 20x10 float32 inputs
// (dout, out) and hands it to RunAndCheck with a scalar CPU reference kernel.
TEST(Decomposer, relu_grad) {
  NetBuilder builder("relu_grad");
  auto dout = builder.CreateInput(Float(32), {20, 10}, "dout");
  auto out = builder.CreateInput(Float(32), {20, 10}, "out");
  auto dx = builder.ReluGrad(dout, out);

  // Reference kernel. ptrs[0] = upstream gradient, ptrs[1] = forward output,
  // ptrs[2] = gradient buffer to fill; lengths[0] is the element count.
  auto relu_grad_cpu = [](const std::vector<size_t>& lengths,
                          const std::vector<void*>& ptrs) {
    float* upstream = static_cast<float*>(ptrs[0]);
    float* forward_out = static_cast<float*>(ptrs[1]);
    float* grad = static_cast<float*>(ptrs[2]);
    const size_t count = lengths[0];
    for (size_t idx = 0; idx != count; ++idx) {
      // The gradient passes through only where the forward output is positive.
      if (forward_out[idx] > 0) {
        grad[idx] = upstream[idx];
      } else {
        grad[idx] = 0;
      }
    }
  };

  std::vector<std::string> input_names = {dout.id().data(), out.id().data()};
  std::vector<std::string> output_names = {dx->id};
  std::vector<std::vector<int>> output_shapes = {{20, 10}};
  // NOTE(review): the trailing `-1, 1` are presumably the bounds of the random
  // input values -- confirm against RunAndCheck in test_helper.h.
  RunAndCheck<float>(
      &builder, input_names, output_names, output_shapes, relu_grad_cpu, -1, 1);
}
// Builds a softmax over axes {1, 2, 3} of a [16, 128, 14, 14] float32 input,
// runs the decomposer plus the OpFusionPass / FusionMergePass graph passes,
// compiles the graph and executes it once on random input.
// This test performs no numerical comparison; it only exercises the
// build / compile / execute path.
TEST(Decomposer, softmax_decomposer) {
  int n = 16;
  int c = 128;
  int h = 14;
  int w = 14;
  std::vector<int> axes = {1, 2, 3};

  NetBuilder net_builder("softmax_decomposer");
  std::unordered_set<std::string> output_names;
  {
    auto x = net_builder.CreateInput(Float(32), {n, c, h, w}, "x");
    auto y = net_builder.Softmax(x, axes);
    output_names.insert(y->id);
  }

  // Decompose and lower the program.
  auto program = net_builder.Build();
  auto target = common::DefaultTarget();
  RunDecomposer(&program, target);
  auto graph =
      std::make_shared<hlir::framework::Graph>(program, output_names, target);
  hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
  hlir::framework::ApplyPass(graph.get(), "FusionMergePass");
  auto scope = BuildScope(target, graph);
  hlir::framework::GraphCompiler gc(target, scope, graph);
  auto run_program = gc.Build();

  // Feed random data into the single input tensor.
  std::vector<float> x(n * c * h * w);
  InitRandomVector<float>(&x, n * c * h * w, 0.0f, 1.0f, 1e-3);
  std::vector<std::pair<std::string, std::vector<float>>> inputs = {{"x", x}};
  for (auto& [var_name, host_values] : inputs) {
    scope->Var<hlir::framework::Tensor>(var_name);
    auto tensor = scope->GetTensor(var_name);
    // The returned pointer is not needed; the call is kept for its effect on
    // the tensor before the copy below.
    tensor->mutable_data<float>(target);
    CopyFromVector(host_values, tensor, target);
  }

  run_program->Execute();
}
} // namespace cinn::frontend
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/frontend/decomposer_registry.h"
#include "paddle/cinn/frontend/syntax.h"
namespace cinn {
namespace frontend {
namespace decomposer {
struct BatchNormHelper {
BatchNormHelper(NetBuilder* net_builder,
const std::vector<int>& arg_x_shape,
const std::vector<int>& arg_param_shape,
std::string data_layout,
std::string bn_op_type) {
CHECK_EQ(arg_x_shape.size(), 4UL)
<< "Only 4-D input tensor is supported, but get " << arg_x_shape.size()
<< "-D input tensor.";
builder = net_builder;
x_shape = arg_x_shape;
param_shape = arg_param_shape;
if (data_layout == "NCHW") {
channel_dim = 1;
reduce_dim = {0, 2, 3};
element_count = x_shape[0] * x_shape[2] * x_shape[3];
} else if (data_layout == "NHWC") {
channel_dim = 3;
reduce_dim = {0, 1, 2};
element_count = x_shape[0] * x_shape[1] * x_shape[2];
} else {
LOG(FATAL) << data_layout << " setting is not support!";
}
num_instructions = builder->size();
op_type = bn_op_type;
}
~BatchNormHelper() {
VLOG(4) << op_type << " is decomposed to "
<< builder->size() - num_instructions << " instructions.";
}
std::vector<Variable> MeanAndVariance(Variable x) {
auto mean = Mean(x);
// variance = reduce_sum(x * x) / nhw - mean * mean, shape = [c], simplified
// by equation: E(x^2) - [E(x)]^2
auto variance = Variance(x, mean);
return {mean, variance};
}
std::vector<Variable> GradBiasAndScale(Variable x,
Variable x_mean,
Variable y_grad) {
auto mean_4d = builder->BroadcastTo(x_mean, x->shape, {channel_dim});
auto x_mean_diff = builder->Subtract(x, mean_4d);
// bias_grad = reduce_sum(y_grad), shape = [c]
auto bias_grad = Reduce(y_grad);
auto sum_of_y_grad_mul_x_mean_diff =
Reduce(builder->Multiply(y_grad, x_mean_diff));
return {bias_grad, sum_of_y_grad_mul_x_mean_diff};
}
// mean = reduce_sum(x) / nhw
Variable Mean(Variable x) {
auto sum = Reduce(x);
auto element_count_1d =
builder->FillConstant(sum->shape,
element_count,
common::UniqName("element_count"),
common::Type2Str(sum->type));
auto mean = builder->Divide(sum, element_count_1d);
return mean;
}
// variance = reduce_sum(x * x) / nhw - mean * mean
Variable Variance(Variable x, Variable mean) {
auto x_square = builder->Multiply(x, builder->Identity(x));
auto x_square_sum = Reduce(x_square);
auto element_count_1d =
builder->FillConstant(x_square_sum->shape,
element_count,
common::UniqName("element_count"),
common::Type2Str(x_square_sum->type));
auto x_square_mean = builder->Divide(x_square_sum, element_count_1d);
auto variance = builder->Subtract(
x_square_mean, builder->Multiply(mean, builder->Identity(mean)));
return variance;
}
// std_variance_inv = rsqrt(variance + epsilon)
Variable StdVarianceInv1d(Variable variance, float epsilon) {
auto epsilon_1d = builder->FillConstant(variance->shape,
epsilon,
common::UniqName("epsilon"),
common::Type2Str(variance->type));
auto std_variance_inv = builder->Rsqrt(builder->Add(variance, epsilon_1d));
return std_variance_inv;
}
// std_variance_inv = rsqrt(variance + epsilon)
Variable StdVarianceInv4d(Variable variance, float epsilon) {
auto variance_4d = builder->BroadcastTo(variance, x_shape, {channel_dim});
auto epsilon_4d =
builder->FillConstant(variance_4d->shape,
epsilon,
common::UniqName("epsilon"),
common::Type2Str(variance_4d->type));
auto std_variance_inv_4d =
builder->Rsqrt(builder->Add(variance_4d, epsilon_4d));
return std_variance_inv_4d;
}
// moving_value = moving_value * momentum + (1.0 - momentum) * saved_value
// value maybe mean and variance.
Variable UpdateMeanVariance(Variable moving_value,
Variable saved_value,
float momentum) {
auto factor_0 = builder->FillConstant(moving_value->shape,
momentum,
common::UniqName("factor_0"),
common::Type2Str(moving_value->type));
auto factor_1 = builder->FillConstant(saved_value->shape,
1.0f - momentum,
common::UniqName("factor_1"),
common::Type2Str(saved_value->type));
auto new_moving_value =
builder->Add(builder->Multiply(moving_value, factor_0),
builder->Multiply(saved_value, factor_1));
return new_moving_value;
}
Variable Reduce(Variable x) { return builder->ReduceSum(x, reduce_dim); }
NetBuilder* builder{nullptr};
std::vector<int> x_shape;
std::vector<int> param_shape;
std::vector<int> reduce_dim;
float element_count{0};
int channel_dim{0};
std::string op_type;
int num_instructions{0};
};
// Decomposes a training-mode batch-norm instruction into primitive ops.
//
// Inputs  (5): x, scale, bias, moving_mean, moving_variance.
// Outputs (5): y, saved mean, saved variance, new moving mean, new moving
//              variance -- mapped in this order onto instr->outputs[0..4].
// Attributes read: "epsilon", "momentum", "data_layout".
void batch_norm_train(const Instruction& instr,
                      const DecomposerContext& context) {
  CHECK_EQ(instr->inputs.size(), 5UL)
      << "The number of the given inputs is not equal to the required for op "
      << instr->op_type;
  CHECK_EQ(instr->outputs.size(), 5UL)
      << "The number of the given outputs is not equal to the required for op "
      << instr->op_type;

  auto& x = instr->inputs[0];
  auto& scale = instr->inputs[1];
  auto& bias = instr->inputs[2];
  auto& moving_mean = instr->inputs[3];
  auto& moving_variance = instr->inputs[4];

  // All per-channel parameters must share one element type.
  CHECK_EQ(scale->type, bias->type);
  CHECK_EQ(scale->type, moving_mean->type);
  CHECK_EQ(scale->type, moving_variance->type);

  float epsilon = instr.GetAttrs<float>("epsilon");
  float momentum = instr.GetAttrs<float>("momentum");
  std::string layout = instr.GetAttrs<std::string>("data_layout");

  NetBuilder* builder = context.builder();
  BatchNormHelper helper(
      builder, x->shape, scale->shape, layout, "batch_norm_train");

  // Batch statistics, shape = [c].
  auto stats = helper.MeanAndVariance(x);
  auto mean = stats[0];
  auto variance = stats[1];

  auto mean_4d = builder->BroadcastTo(mean, x->shape, {helper.channel_dim});
  // std_variance_inv = rsqrt(variance + epsilon), broadcast to the shape of x
  auto inv_std_4d = helper.StdVarianceInv4d(variance, epsilon);

  // y = scale * (x - mean) * std_variance_inv + bias, shape = [n, c, h, w]
  auto scale_4d = builder->BroadcastTo(scale, x->shape, {helper.channel_dim});
  auto bias_4d = builder->BroadcastTo(bias, x->shape, {helper.channel_dim});
  auto centered = builder->Subtract(x, mean_4d);
  auto normalized = builder->Multiply(centered, inv_std_4d);
  auto scaled = builder->Multiply(normalized, scale_4d);
  auto y = builder->Add(scaled, bias_4d);

  // moving_mean = moving_mean * momentum + (1.0 - momentum) * mean,
  // shape = [c]
  auto new_moving_mean = helper.UpdateMeanVariance(moving_mean, mean, momentum);
  // moving_variance = moving_variance * momentum + (1.0 - momentum) * variance,
  // shape = [c]
  auto new_moving_variance =
      helper.UpdateMeanVariance(moving_variance, variance, momentum);

  // Bind the decomposed results to the original instruction outputs.
  context.MapOutToOrigin(y, instr->outputs[0]);
  context.MapOutToOrigin(mean, instr->outputs[1]);
  context.MapOutToOrigin(variance, instr->outputs[2]);
  context.MapOutToOrigin(new_moving_mean, instr->outputs[3]);
  context.MapOutToOrigin(new_moving_variance, instr->outputs[4]);
}
// Decomposes a batch-norm gradient instruction into primitive ops.
//
// Inputs  (5): y_grad, x, scale, save_mean, save_variance.
// Outputs (3): x_grad, scale_grad, bias_grad -- mapped in this order onto
//              instr->outputs[0..2].
// Attributes read: "epsilon", "data_layout".
//
// The sequence of builder calls below determines the order of the emitted
// instructions, so statements must not be reordered casually.
void batch_norm_grad(const Instruction& instr,
                     const DecomposerContext& context) {
  CHECK_EQ(instr->inputs.size(), 5UL)
      << " The number of the given inputs is not equal to the required "
      << instr->op_type;
  CHECK_EQ(instr->outputs.size(), 3UL)
      << " The number of the given outputs is not equal to the required"
      << instr->op_type;
  auto& y_grad = instr->inputs[0];
  auto& x = instr->inputs[1];
  auto& scale = instr->inputs[2];
  auto& save_mean = instr->inputs[3];
  auto& save_variance = instr->inputs[4];
  // The gradient must match x in type; the per-channel tensors must match
  // each other.
  CHECK_EQ(y_grad->type, x->type);
  CHECK_EQ(scale->type, save_mean->type);
  CHECK_EQ(scale->type, save_variance->type);
  auto epsilon = instr.GetAttrs<float>("epsilon");
  auto layout = instr.GetAttrs<std::string>("data_layout");
  NetBuilder* builder = context.builder();
  BatchNormHelper helper(
      builder, x->shape, scale->shape, layout, "batch_norm_grad");
  // vars[0] = reduce_sum(y_grad), vars[1] = reduce_sum(y_grad * (x - mean)).
  auto vars = helper.GradBiasAndScale(x, save_mean, y_grad);
  auto bias_grad = vars[0];
  auto sum_of_y_grad_mul_x_mean_diff = vars[1];
  // scale_grad = reduce_sum(y_grad * (x - mean)) * rsqrt(variance + epsilon),
  // shape = [c]
  auto scale_grad =
      builder->Multiply(sum_of_y_grad_mul_x_mean_diff,
                        helper.StdVarianceInv1d(save_variance, epsilon));
  // x_grad = 1/nhw * scale * rsqrt(variance + epsilon) *
  //   (nhw * y_grad - reduce_sum(y_grad) - (x - mean) * reduce_sum(y_grad * (x
  //   - mean)) / (variance + epsilon))
  // => x_grad = tmp0 * (tmp1 - tmp2 - tmp3)
  // NOTE(review): StdVarianceInv1d is called a second time here, so the
  // rsqrt(variance + epsilon) instructions are emitted again rather than
  // reusing the value computed for scale_grad.
  auto scaled_std_variance_inv =
      builder->Multiply(scale, helper.StdVarianceInv1d(save_variance, epsilon));
  auto element_count_1d =
      builder->FillConstant(scaled_std_variance_inv->shape,
                            helper.element_count,
                            common::UniqName("element_count_1d"),
                            common::Type2Str(scaled_std_variance_inv->type));
  // tmp0 = scale * rsqrt(variance + epsilon) / nhw, broadcast to x's shape.
  auto tmp0 = builder->BroadcastTo(
      builder->Divide(scaled_std_variance_inv, element_count_1d),
      x->shape,
      {helper.channel_dim});
  auto element_count_4d =
      builder->FillConstant(y_grad->shape,
                            helper.element_count,
                            common::UniqName("element_count_4d"),
                            common::Type2Str(y_grad->type));
  // tmp1 = nhw * y_grad
  auto tmp1 = builder->Multiply(y_grad, element_count_4d);
  // tmp2 = reduce_sum(y_grad), broadcast to x's shape.
  auto tmp2 = builder->BroadcastTo(bias_grad, x->shape, {helper.channel_dim});
  auto mean_4d =
      builder->BroadcastTo(save_mean, x->shape, {helper.channel_dim});
  auto x_mean_diff = builder->Subtract(x, mean_4d);
  auto sum_of_y_grad_mul_x_mean_diff_4d = builder->BroadcastTo(
      sum_of_y_grad_mul_x_mean_diff, x->shape, {helper.channel_dim});
  // tmp3_0 = (x - mean) * reduce_sum(y_grad * (x - mean))
  auto tmp3_0 =
      builder->Multiply(x_mean_diff, sum_of_y_grad_mul_x_mean_diff_4d);
  auto epsilon_1d =
      builder->FillConstant(save_variance->shape,
                            epsilon,
                            common::UniqName("epsilon"),
                            common::Type2Str(save_variance->type));
  auto variance_add_eps = builder->Add(save_variance, epsilon_1d);
  auto variance_add_eps_4d =
      builder->BroadcastTo(variance_add_eps, x->shape, {helper.channel_dim});
  // tmp3 = tmp3_0 / (variance + epsilon)
  auto tmp3 = builder->Divide(tmp3_0, variance_add_eps_4d);
  auto x_grad = builder->Multiply(
      tmp0, builder->Subtract(builder->Subtract(tmp1, tmp2), tmp3));
  // Bind the decomposed results to the original instruction outputs.
  context.MapOutToOrigin(x_grad, instr->outputs[0]);
  context.MapOutToOrigin(scale_grad, instr->outputs[1]);
  context.MapOutToOrigin(bias_grad, instr->outputs[2]);
}
// Decomposes an inference-mode batch-norm instruction into primitive ops.
//
// Inputs  (5): x, scale, bias, moving_mean, moving_variance.
// Outputs (1): y, mapped onto instr->outputs[0].
// Attributes read: "epsilon", "momentum", "data_layout". The momentum value is
// fetched but not otherwise used in this function.
void batch_norm(const Instruction& instr, const DecomposerContext& context) {
  CHECK_EQ(instr->inputs.size(), 5UL)
      << "The number of the given inputs is not equal to the required for op "
      << instr->op_type;
  CHECK_EQ(instr->outputs.size(), 1UL)
      << "The number of the given outputs is not equal to the required for op "
      << instr->op_type;

  auto& x = instr->inputs[0];
  auto& scale = instr->inputs[1];
  auto& bias = instr->inputs[2];
  auto& moving_mean = instr->inputs[3];
  auto& moving_variance = instr->inputs[4];

  // All per-channel parameters must share one element type.
  CHECK_EQ(scale->type, bias->type);
  CHECK_EQ(scale->type, moving_mean->type);
  CHECK_EQ(scale->type, moving_variance->type);

  float epsilon = instr.GetAttrs<float>("epsilon");
  float momentum = instr.GetAttrs<float>("momentum");
  std::string layout = instr.GetAttrs<std::string>("data_layout");

  NetBuilder* builder = context.builder();
  BatchNormHelper helper(builder, x->shape, scale->shape, layout, "batch_norm");

  // The stored moving statistics take the place of batch statistics here.
  auto mean_4d =
      builder->BroadcastTo(moving_mean, x->shape, {helper.channel_dim});
  // std_variance_inv = rsqrt(variance + epsilon), broadcast to the shape of x
  auto inv_std_4d = helper.StdVarianceInv4d(moving_variance, epsilon);

  // y = scale * (x - mean) * std_variance_inv + bias, shape = [n, c, h, w]
  auto scale_4d = builder->BroadcastTo(scale, x->shape, {helper.channel_dim});
  auto bias_4d = builder->BroadcastTo(bias, x->shape, {helper.channel_dim});
  auto centered = builder->Subtract(x, mean_4d);
  auto normalized = builder->Multiply(centered, inv_std_4d);
  auto scaled = builder->Multiply(normalized, scale_4d);
  auto y = builder->Add(scaled, bias_4d);

  context.MapOutToOrigin(y, instr->outputs[0]);
}
} // namespace decomposer
} // namespace frontend
} // namespace cinn
// Registration helper `batch_norm_decomposer`: passes the op name
// `batch_norm` together with cinn::frontend::decomposer::batch_norm to
// CINN_DECOMPOSER_REGISTER. The helper body always returns true.
CINN_REGISTER_HELPER(batch_norm_decomposer) {
  CINN_DECOMPOSER_REGISTER(batch_norm, cinn::frontend::decomposer::batch_norm);
  return true;
}
// Registration helper `batch_norm_train_decomposer`: passes the op name
// `batch_norm_train` together with
// cinn::frontend::decomposer::batch_norm_train to CINN_DECOMPOSER_REGISTER.
// The helper body always returns true.
CINN_REGISTER_HELPER(batch_norm_train_decomposer) {
  CINN_DECOMPOSER_REGISTER(batch_norm_train,
                           cinn::frontend::decomposer::batch_norm_train);
  return true;
}
// Registration helper `batch_norm_grad_decomposer`: passes the op name
// `batch_norm_grad` together with
// cinn::frontend::decomposer::batch_norm_grad to CINN_DECOMPOSER_REGISTER.
// The helper body always returns true.
CINN_REGISTER_HELPER(batch_norm_grad_decomposer) {
  CINN_DECOMPOSER_REGISTER(batch_norm_grad,
                           cinn::frontend::decomposer::batch_norm_grad);
  return true;
}
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include "paddle/cinn/frontend/decomposer/test_helper.h"
namespace cinn {
namespace frontend {
namespace {
// Maps a 4-D (n, c, h, w) index onto a flat row-major offset for a tensor
// whose extents are stored in the members n, c, h and w.
struct Offset {
  int n;
  int c;
  int h;
  int w;

  Offset(int arg_n, int arg_c, int arg_h, int arg_w)
      : n(arg_n), c(arg_c), h(arg_h), w(arg_w) {}

  // Returns idx_n * (c*h*w) + idx_c * (h*w) + idx_h * w + idx_w, evaluated in
  // nested (Horner) form. The extent `n` does not take part in the result.
  int operator()(int idx_n, int idx_c, int idx_h, int idx_w) const {
    const int plane = idx_n * c + idx_c;
    const int row = plane * h + idx_h;
    return row * w + idx_w;
  }
};
// Invokes func(in, ic, ih, iw) once for every index of an n x c x h x w grid.
// `in` varies slowest and `iw` fastest, i.e. the same visiting order as four
// nested loops over n, c, h, w. Nothing is called if any extent is <= 0.
template <typename FuncType>
void Loop(FuncType func, const int n, const int c, const int h, const int w) {
  if (n <= 0 || c <= 0 || h <= 0 || w <= 0) {
    return;
  }
  // Walk one flat counter and split it back into the four coordinates; a
  // 64-bit counter keeps the product of the extents from overflowing.
  const long long total = 1LL * n * c * h * w;
  for (long long flat = 0; flat < total; ++flat) {
    long long rest = flat;
    int idx_w = static_cast<int>(rest % w);
    rest /= w;
    int idx_h = static_cast<int>(rest % h);
    rest /= h;
    int idx_c = static_cast<int>(rest % c);
    rest /= c;
    int idx_n = static_cast<int>(rest);
    func(idx_n, idx_c, idx_h, idx_w);
  }
}
// CPU reference for training-mode batch normalization over a tensor indexed
// as (n, c, h, w) through Offset; used to validate the decomposed operator.
//
// Inputs : x holds n*c*h*w values; scale, bias, moving_mean and
//          moving_variance hold c values each.
// Outputs: *y (n*c*h*w values) and *saved_mean, *saved_variance,
//          *new_moving_mean, *new_moving_variance (c values each). The output
//          vectors are written in place and must already have those sizes.
// epsilon is added to the variance before the square root; momentum weights
// the previous moving statistics.
//
// Intermediate buffers are kept in float regardless of T, as before.
template <typename T>
void ComputeBatchNormTrainRef(const std::vector<T>& x,
                              const std::vector<T>& scale,
                              const std::vector<T>& bias,
                              const std::vector<T>& moving_mean,
                              const std::vector<T>& moving_variance,
                              const int n,
                              const int c,
                              const int h,
                              const int w,
                              std::vector<T>* y,
                              std::vector<T>* saved_mean,
                              std::vector<T>* saved_variance,
                              std::vector<T>* new_moving_mean,
                              std::vector<T>* new_moving_variance,
                              const float epsilon,
                              const float momentum) {
  Offset offset(n, c, h, w);

  // sum
  // Zero the accumulator element by element (type-safe and bounds-checked)
  // instead of memset-ing raw bytes.
  for (int ic = 0; ic < c; ++ic) {
    saved_mean->at(ic) = T(0);
  }
  // Capture by reference: a by-value capture would copy the whole input
  // vector into the closure, and again when Loop takes the functor by value.
  auto func_sum_x = [&](int in, int ic, int ih, int iw) {
    saved_mean->at(ic) += x[offset(in, ic, ih, iw)];
  };
  Loop(func_sum_x, n, c, h, w);

  // saved mean
  float element_count = static_cast<float>(n * h * w);
  for (int ic = 0; ic < c; ++ic) {
    // Checking result of saved_mean:
    //   output[saved_mean], var_name=var_5, shape={32}
    //    - Total 0 different results, offset=0, 0.00527001 vs 0.00527001,
    //    maximum_relative_diff=0(absolute_diff=0)
    saved_mean->at(ic) /= element_count;
  }

  // square_sum
  std::vector<float> x_square_mean(c, 0);
  auto func_sum_square_x = [&](int in, int ic, int ih, int iw) {
    x_square_mean.at(ic) +=
        x[offset(in, ic, ih, iw)] * x[offset(in, ic, ih, iw)];
  };
  Loop(func_sum_square_x, n, c, h, w);
  for (int ic = 0; ic < c; ++ic) {
    x_square_mean[ic] /= element_count;
  }

  // saved variance, according to equation: E(x^2) - [E(x)]^2
  std::vector<float> std_variance(c);
  for (int ic = 0; ic < c; ++ic) {
    // Checking results of saved_variance and std_variance:
    //   output[saved_variance], var_name=var_6, shape={32}
    //    - Total 0 different results, offset=0, 0.336347 vs 0.336347,
    //    maximum_relative_diff=0(absolute_diff=0)
    //   output[std_variance], var_name=std_variance, shape={32}
    //    - Total 0 different results, offset=0, 0.579963 vs 0.579963,
    //    maximum_relative_diff=0(absolute_diff=0)
    saved_variance->at(ic) =
        x_square_mean[ic] - (saved_mean->at(ic) * saved_mean->at(ic));
    std_variance[ic] = sqrt(saved_variance->at(ic) + epsilon);
  }

  // compute output
  std::vector<float> y_nobias(n * c * h * w);
  auto func_y_nobias = [&](int in, int ic, int ih, int iw) {
    int idx = offset(in, ic, ih, iw);
    // Checking result of y_nobias:
    //   output[y_nobias], var_name=y_nobias, shape={16, 32, 16, 16}
    //    - Total 0 different results, offset=32104, -0.000488288 vs
    //    -0.000488288,
    //    maximum_relative_diff=1.19208e-07(absolute_diff=5.82077e-11)
    y_nobias[idx] =
        (x[idx] - saved_mean->at(ic)) * scale[ic] / std_variance[ic];
  };
  Loop(func_y_nobias, n, c, h, w);

  auto func_y = [&](int in, int ic, int ih, int iw) {
    int idx = offset(in, ic, ih, iw);
    // Checking result of y:
    //   output[y], var_name=var_4, shape={16, 32, 16, 16}
    //    - Total 80 different results, offset=126409, 1.81794e-06 vs
    //    1.80304e-06,
    //    maximum_relative_diff=0.00826446(absolute_diff=1.49012e-08)
    // For the following case:
    //   idx=126409, y[idx]=1.80304e-06, y_nobias[idx]=0.2033332,
    //   bias[ic]=-0.2033314
    // The computing result of CPU and GPU may have some difference, like
    //   i=126409, 1.8179417e-06 vs 1.8030405e-06,
    //   relative_diff=0.0082644625, absolute_diff=1.4901161e-08
    // This case is considered reasonable.
    y->at(idx) = y_nobias[idx] + bias[ic];
  };
  Loop(func_y, n, c, h, w);

  // new moving mean and variance
  float factor_0 = momentum;
  float factor_1 = static_cast<float>(1.0f - momentum);
  for (int ic = 0; ic < c; ++ic) {
    // Checking result of new_moving_mean and new_moving_variance:
    //   output[new_moving_mean], var_name=var_7, shape={32}
    //    - Total 0 different results, offset=9, 0.00123065 vs 0.00123065,
    //    maximum_relative_diff=9.45967e-08(absolute_diff=1.16415e-10)
    //   output[new_moving_variance], var_name=var_8, shape={32}
    //    - Total 0 different results, offset=16, -0.00140787 vs -0.00140787,
    //    maximum_relative_diff=5.29211e-06(absolute_diff=7.45058e-09)
    new_moving_mean->at(ic) =
        moving_mean[ic] * factor_0 + saved_mean->at(ic) * factor_1;
    new_moving_variance->at(ic) =
        moving_variance[ic] * factor_0 + saved_variance->at(ic) * factor_1;
  }
}
// End-to-end check of the training-mode batch-norm decomposition: builds a
// BatchNorm network on a [16, 128, 14, 14] float32 NCHW input, decomposes,
// fuses, compiles and runs it, then compares all five outputs against
// ComputeBatchNormTrainRef.
TEST(Decomposer, BatchNormTrain) {
  int n = 16;
  int c = 128;
  int h = 14;
  int w = 14;
  float epsilon = 1e-5;
  float momentum = 0.9f;
  std::string data_layout = "NCHW";
  bool is_test = false;

  // ---- build the network ----
  NetBuilder net_builder("batch_norm_train");
  std::vector<std::string> output_names;
  {
    auto x = net_builder.CreateInput(Float(32), {n, c, h, w}, "x");
    auto scale = net_builder.CreateInput(Float(32), {c}, "scale");
    auto bias = net_builder.CreateInput(Float(32), {c}, "bias");
    auto moving_mean = net_builder.CreateInput(Float(32), {c}, "moving_mean");
    auto moving_variance =
        net_builder.CreateInput(Float(32), {c}, "moving_variance");
    auto outputs = net_builder.BatchNorm(x,
                                         scale,
                                         bias,
                                         moving_mean,
                                         moving_variance,
                                         epsilon,
                                         momentum,
                                         data_layout,
                                         is_test);
    for (auto& out_var : outputs) {
      output_names.push_back(out_var->id);
    }
  }

  // ---- decompose, fuse and compile ----
  auto program = net_builder.Build();
  auto target = common::DefaultTarget();
  RunDecomposer(&program,
                target,
                cinn::frontend::DefaultTrainingOptimizeOptions().program_passes,
                output_names);
  auto graph = std::make_shared<hlir::framework::Graph>(program, target);
  hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
  hlir::framework::ApplyPass(graph.get(), "FusionMergePass");
  auto scope = BuildScope(target, graph);
  hlir::framework::GraphCompiler gc(target, scope, graph);
  auto run_program = gc.Build();

  // ---- random inputs ----
  float precision = 1e-3;
  std::vector<float> x(n * c * h * w);
  std::vector<float> scale(c);
  std::vector<float> bias(c);
  std::vector<float> moving_mean(c);
  std::vector<float> moving_variance(c);
  InitRandomVector<float>(&x, n * c * h * w, 0.0f, 1.0f, precision);
  InitRandomVector<float>(&scale, c, 0.0f, 1.0f, precision);
  InitRandomVector<float>(&bias, c, 10.0f, 20.0f, precision);
  InitRandomVector<float>(&moving_mean, c, 0.0f, 1.0f, precision);
  InitRandomVector<float>(&moving_variance, c, 0.0f, 1.0f, precision);

  // ---- CPU reference ----
  std::vector<float> y(n * c * h * w);
  std::vector<float> new_moving_mean(c);
  std::vector<float> new_moving_variance(c);
  std::vector<float> saved_mean(c);
  std::vector<float> saved_variance(c);
  ComputeBatchNormTrainRef<float>(x,
                                  scale,
                                  bias,
                                  moving_mean,
                                  moving_variance,
                                  n,
                                  c,
                                  h,
                                  w,
                                  &y,
                                  &saved_mean,
                                  &saved_variance,
                                  &new_moving_mean,
                                  &new_moving_variance,
                                  epsilon,
                                  momentum);

  // ---- upload inputs and execute ----
  std::vector<std::pair<std::string, std::vector<float>>> inputs = {
      {"x", x},
      {"scale", scale},
      {"bias", bias},
      {"moving_mean", moving_mean},
      {"moving_variance", moving_variance}};
  for (auto& [var_name, host_values] : inputs) {
    scope->Var<hlir::framework::Tensor>(var_name);
    auto tensor = scope->GetTensor(var_name);
    // The returned pointer is not needed; the call is kept for its effect on
    // the tensor before the copy below.
    tensor->mutable_data<float>(target);
    CopyFromVector(host_values, tensor, target);
  }
  run_program->Execute();

  // ---- compare every output with its reference ----
  // Key: human-readable label. Value: {variable name in the scope, reference}.
  std::unordered_map<std::string, std::pair<std::string, std::vector<float>>>
      outputs_ref = {
          {"new_moving_variance", {output_names[4], new_moving_variance}},
          {"new_moving_mean", {output_names[3], new_moving_mean}},
          {"saved_variance", {output_names[2], saved_variance}},
          {"saved_mean", {output_names[1], saved_mean}},
          {"y", {output_names[0], y}}};
  for (auto& [label, expected] : outputs_ref) {
    auto tensor = scope->GetTensor(expected.first);
    std::vector<float> data(tensor->shape().numel());
    CopyToVector(tensor, &data);
    LOG(INFO) << "output[" << label << "], var_name=" << expected.first
              << ", shape=" << tensor->shape().data();
    CheckOutput<float>(data, expected.second, 1e-8, 1e-4);
  }
}
// CPU reference for the batch-norm backward pass over a tensor indexed as
// (n, c, h, w) through Offset; used to validate the decomposed operator.
//
// Inputs : y_grad and x hold n*c*h*w values; scale, save_mean and
//          save_variance hold c values each.
// Outputs: *x_grad (n*c*h*w values), *scale_grad and *bias_grad (c values
//          each). The output vectors are written in place and must already
//          have those sizes.
// epsilon is added to the variance wherever it is used (default 1e-5).
template <typename T>
void ComputeBatchNormGradRef(const std::vector<T>& y_grad,
                             const std::vector<T>& x,
                             const std::vector<T>& scale,
                             const std::vector<T>& save_mean,
                             const std::vector<T>& save_variance,
                             const int n,
                             const int c,
                             const int h,
                             const int w,
                             std::vector<T>* x_grad,
                             std::vector<T>* scale_grad,
                             std::vector<T>* bias_grad,
                             const float epsilon = 1e-5) {
  Offset offset(n, c, h, w);

  // All lambdas below capture by reference: a by-value capture would copy
  // every referenced vector into the closure, and again when Loop takes the
  // functor by value.

  // bias_grad = sum over (n, h, w) of y_grad
  // Zeroed element by element (type-safe and bounds-checked) instead of
  // memset-ing raw bytes.
  for (int ic = 0; ic < c; ++ic) {
    bias_grad->at(ic) = T(0);
  }
  auto func_bias_grad = [&](int in, int ic, int ih, int iw) {
    bias_grad->at(ic) += y_grad[offset(in, ic, ih, iw)];
  };
  Loop(func_bias_grad, n, c, h, w);

  // std_variance = sqrt(variance + epsilon)
  std::vector<T> std_variance(c);
  for (int ic = 0; ic < c; ++ic) {
    std_variance[ic] = sqrt(save_variance[ic] + epsilon);
  }

  // grad scale = sum(y_grad * (x - mean)) / std_variance
  for (int ic = 0; ic < c; ++ic) {
    scale_grad->at(ic) = T(0);
  }
  auto func_scale_grad = [&](int in, int ic, int ih, int iw) {
    int idx = offset(in, ic, ih, iw);
    scale_grad->at(ic) += y_grad[idx] * (x[idx] - save_mean[ic]);
  };
  Loop(func_scale_grad, n, c, h, w);
  for (int ic = 0; ic < c; ++ic) {
    scale_grad->at(ic) /= std_variance[ic];
  }

  // std_norm_grad = y_grad * scale
  std::vector<T> std_norm_grad(n * c * h * w);
  auto func_std_norm_grad = [&](int in, int ic, int ih, int iw) {
    int idx = offset(in, ic, ih, iw);
    std_norm_grad[idx] = y_grad[idx] * scale[ic];
  };
  Loop(func_std_norm_grad, n, c, h, w);

  // x_mean_diff_grad = std_norm_grad / std_variance
  std::vector<T> x_mean_diff_grad(n * c * h * w);
  auto func_x_mean_diff_grad = [&](int in, int ic, int ih, int iw) {
    int idx = offset(in, ic, ih, iw);
    x_mean_diff_grad[idx] = std_norm_grad[idx] / std_variance[ic];
  };
  Loop(func_x_mean_diff_grad, n, c, h, w);

  // std_variance_grad = -sum(std_norm_grad * (x - mean)) / (variance + eps)
  std::vector<T> std_variance_grad(c, 0);
  auto func_std_variance_grad = [&](int in, int ic, int ih, int iw) {
    int idx = offset(in, ic, ih, iw);
    std_variance_grad[ic] += -1.0f * std_norm_grad[idx] *
                             (x[idx] - save_mean[ic]) /
                             (save_variance[ic] + epsilon);
  };
  Loop(func_std_variance_grad, n, c, h, w);

  // variance_grad_without_mul = std_variance_grad / std_variance
  std::vector<T> variance_grad_without_mul(c);
  for (int ic = 0; ic < c; ++ic) {
    variance_grad_without_mul[ic] = std_variance_grad[ic] / std_variance[ic];
  }

  // x_grad_0 = x * variance_grad_without_mul / nhw
  float element_count = static_cast<float>(n * h * w);
  std::vector<T> x_grad_0(n * c * h * w);
  auto func_x_grad_0 = [&](int in, int ic, int ih, int iw) {
    int idx = offset(in, ic, ih, iw);
    x_grad_0[idx] = x[idx] * (variance_grad_without_mul[ic] / element_count);
  };
  Loop(func_x_grad_0, n, c, h, w);

  // minus_mean_grad =
  //   (sum(x_mean_diff_grad) + variance_grad_without_mul * mean) / nhw
  std::vector<T> minus_mean_grad(c, 0);
  auto func_minus_mean_grad = [&](int in, int ic, int ih, int iw) {
    minus_mean_grad[ic] += x_mean_diff_grad[offset(in, ic, ih, iw)];
  };
  Loop(func_minus_mean_grad, n, c, h, w);
  for (int ic = 0; ic < c; ++ic) {
    minus_mean_grad[ic] += variance_grad_without_mul[ic] * save_mean[ic];
    minus_mean_grad[ic] /= element_count;
  }

  // x_grad = x_mean_diff_grad + x_grad_0 - minus_mean_grad
  auto func_x_grad = [&](int in, int ic, int ih, int iw) {
    int idx = offset(in, ic, ih, iw);
    x_grad->at(idx) =
        x_mean_diff_grad[idx] + x_grad_0[idx] - minus_mean_grad[ic];
  };
  Loop(func_x_grad, n, c, h, w);
}
// End-to-end check of the batch-norm gradient decomposition: builds a
// BatchNormGrad network on [16, 128, 14, 14] float32 tensors, decomposes,
// fuses, compiles and runs it, then compares scale_grad and bias_grad against
// ComputeBatchNormGradRef. The x_grad comparison is currently disabled (see
// the TODO below).
TEST(Decomposer, BatchNormGrad) {
  int n = 16;
  int c = 128;
  int h = 14;
  int w = 14;
  int num = n * c * h * w;
  float epsilon = 1e-5;

  // ---- build the network ----
  NetBuilder net_builder("batch_norm_grad");
  std::vector<std::string> output_names;
  {
    auto y_grad = net_builder.CreateInput(Float(32), {n, c, h, w}, "y_grad");
    auto x = net_builder.CreateInput(Float(32), {n, c, h, w}, "x");
    auto scale = net_builder.CreateInput(Float(32), {c}, "scale");
    auto saved_mean = net_builder.CreateInput(Float(32), {c}, "saved_mean");
    auto saved_variance =
        net_builder.CreateInput(Float(32), {c}, "saved_variance");
    auto outputs = net_builder.BatchNormGrad(
        y_grad, x, scale, saved_mean, saved_variance, epsilon);
    for (auto& out_var : outputs) {
      output_names.push_back(out_var->id);
    }
  }

  // ---- decompose, fuse and compile ----
  auto program = net_builder.Build();
  auto target = common::DefaultTarget();
  RunDecomposer(&program,
                target,
                cinn::frontend::DefaultTrainingOptimizeOptions().program_passes,
                output_names);
  auto graph = std::make_shared<hlir::framework::Graph>(program, target);
  hlir::framework::ApplyPass(graph.get(), "OpFusionPass");
  hlir::framework::ApplyPass(graph.get(), "FusionMergePass");
  auto scope = BuildScope(target, graph);
  hlir::framework::GraphCompiler gc(target, scope, graph);
  auto run_program = gc.Build();

  // ---- random inputs ----
  float precision = 1e-3;
  std::vector<float> y_grad(num);
  std::vector<float> x(num);
  std::vector<float> scale(c);
  std::vector<float> saved_mean(c, 0);
  std::vector<float> saved_variance(c, 0);
  InitRandomVector(&y_grad, num, 0.0f, 1.0f, precision);
  InitRandomVector(&x, num, 0.0f, 1.0f, precision);
  InitRandomVector(&scale, c, 0.0f, 1.0f, precision);

  // The saved statistics are derived from x so that they are consistent with
  // it: mean = E(x), variance = E(x^2) - [E(x)]^2 per channel.
  Offset offset(n, c, h, w);
  auto accumulate_stats = [&](int in, int ic, int ih, int iw) {
    int idx = offset(in, ic, ih, iw);
    saved_mean[ic] += x[idx];
    saved_variance[ic] += x[idx] * x[idx];
  };
  Loop(accumulate_stats, n, c, h, w);
  float element_count = static_cast<float>(n * h * w);
  for (int ic = 0; ic < c; ++ic) {
    saved_mean[ic] /= element_count;
    saved_variance[ic] =
        saved_variance[ic] / element_count - saved_mean[ic] * saved_mean[ic];
  }

  // ---- upload inputs and execute ----
  std::vector<std::pair<std::string, std::vector<float>>> inputs = {
      {"y_grad", y_grad},
      {"x", x},
      {"scale", scale},
      {"saved_mean", saved_mean},
      {"saved_variance", saved_variance}};
  for (auto& [var_name, host_values] : inputs) {
    scope->Var<hlir::framework::Tensor>(var_name);
    auto tensor = scope->GetTensor(var_name);
    CopyFromVector(host_values, tensor, target);
  }
  run_program->Execute();

  // ---- CPU reference ----
  std::vector<float> x_grad(num);
  std::vector<float> scale_grad(c);
  std::vector<float> bias_grad(c);
  ComputeBatchNormGradRef(y_grad,
                          x,
                          scale,
                          saved_mean,
                          saved_variance,
                          n,
                          c,
                          h,
                          w,
                          &x_grad,
                          &scale_grad,
                          &bias_grad,
                          epsilon);

  // ---- compare outputs with their references ----
  // Key: human-readable label. Value: {variable name in the scope, reference}.
  std::unordered_map<std::string, std::pair<std::string, std::vector<float>>>
      output_refs = {{"bias_grad", {output_names[2], bias_grad}},
                     {"scale_grad", {output_names[1], scale_grad}},
                     {"x_grad", {output_names[0], x_grad}}};
  for (auto& [label, expected] : output_refs) {
    auto tensor = scope->GetTensor(expected.first);
    std::vector<float> data(tensor->shape().numel());
    CopyToVector(tensor, &data);
    LOG(INFO) << "output[" << label << "], var_name=" << expected.first
              << ", shape=" << tensor->shape().data();
    if (label == "x_grad") {
      // TODO(Xreki): fix the precision check of x_grad.
      // CheckOutput<float>(data, expected.second, 1e-8, 1e-1);
      continue;
    }
    if (label == "scale_grad") {
      CheckOutput<float>(data, expected.second, 1e-8, 1e-2);
    } else {
      CheckOutput<float>(data, expected.second);
    }
  }
}
} // namespace
} // namespace frontend
} // namespace cinn
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/cinn/frontend/decomposer_registry.h"
#include "paddle/cinn/frontend/syntax.h"
namespace cinn {
namespace frontend {
namespace decomposer {
// Appends to *reduce_dims every axis on which dx has extent 1 while dout does
// not, i.e. the axes along which x was broadcast.
// e.g., dx_shape = [4, 1, 3], dout_shape = [4, 2, 3], reduce_dims=[1]
// dx_shape is indexed with every axis of dout_shape, so it must have at least
// as many entries as dout_shape.
void GetReduceDimsForX(const std::vector<int>& dx_shape,
                       const std::vector<int>& dout_shape,
                       std::vector<int>* reduce_dims) {
  const size_t rank = dout_shape.size();
  for (size_t dim = 0; dim != rank; ++dim) {
    const bool was_broadcast = dx_shape[dim] == 1 && dout_shape[dim] != 1;
    if (was_broadcast) {
      reduce_dims->push_back(dim);
    }
  }
  VLOG(3) << "The reduce_dims for X: " << utils::Join(*reduce_dims, ",");
}
// Appends to *reduce_dims every axis of dout that has to be summed to recover
// the gradient of y, where y is aligned to dout starting at `axis`:
//   - axes of dout lying outside [axis, axis + rank(y)), and
//   - axes inside that window where y has extent 1 but dout does not.
// e.g., dy_shape = [3, 1, 4], dout_shape = [2, 3, 4, 4, 5], axis = 1
//       reduce_dims=[0, 2, 4]
// NOTE(review): `axis` is compared against unsigned indices, so a negative
// value converts to a very large unsigned number; callers presumably pass an
// already-resolved, non-negative axis -- confirm.
void GetReduceDimsForY(const std::vector<int>& dy_shape,
                       const std::vector<int>& dout_shape,
                       int axis,
                       std::vector<int>* reduce_dims) {
  // Same unsigned conversion the mixed int/size_t comparisons perform.
  const size_t window_begin = static_cast<size_t>(axis);
  const size_t window_end = window_begin + dy_shape.size();
  const size_t rank = dout_shape.size();
  for (size_t dim = 0; dim != rank; ++dim) {
    const bool outside_window = dim < window_begin || dim >= window_end;
    if (outside_window ||
        (dy_shape[dim - window_begin] == 1 && dout_shape[dim] != 1)) {
      reduce_dims->push_back(dim);
    }
  }
  VLOG(3) << "The reduce_dims for Y: " << utils::Join(*reduce_dims, ",");
}
void elementwise_add(const Instruction& instr,
const DecomposerContext& context) {
CHECK_EQ(instr->inputs.size(), 2UL)
<< " 2 input tensors for " << instr->op_type;
CHECK_EQ(instr->outputs.size(), 1UL)
<< "1 output tensor for " << instr->op_type;
auto x = instr->inputs[0];
auto y = instr->inputs[1];
auto output = instr->outputs[0];
int axis = -1;
if (instr->attrs.find("axis") != instr->attrs.end()) {
axis = instr.GetAttrs<int>("axis");
}
if (x->shape.size() >= y->shape.size()) {
axis = axis >= 0 ? axis : x->shape.size() - y->shape.size();
auto* builder = context.builder();
Variable out;
Variable bcast_x = x;
Variable bcast_y = y;
// e.g., x.shape = [4, 1, 3], y.shape = [2, 3], aixs = 1 out.shape = [4, 2,
// 3] bcast_axes_x = [0, 1, 2], bcast_axes_y = [1, 2]
if (x->shape != output->shape) {
std::vector<int> bcast_axes_x(x->shape.size());
std::iota(bcast_axes_x.begin(), bcast_axes_x.end(), 0);
bcast_x = builder->BroadcastTo(x, output->shape, bcast_axes_x);
}
// if y.shape=[1], y does not need to be broadcast
if (y->shape != output->shape && y->shape != std::vector<int>(1, 1)) {
std::vector<int> bcast_axes_y(y->shape.size());
std::iota(bcast_axes_y.begin(), bcast_axes_y.end(), axis);
bcast_y = builder->BroadcastTo(y, output->shape, bcast_axes_y);
}
out = builder->Add(bcast_x, bcast_y);
// map the the output of decomposed operator to the original.
context.MapOutToOrigin(out, output);
} else {
axis = axis >= 0 ? axis : y->shape.size() - x->shape.size();
auto* builder = context.builder();
Variable out;
Variable bcast_x = x;
Variable bcast_y = y;
if (y->shape != output->shape) {
std::vector<int> bcast_axes_y(y->shape.size());
std::iota(bcast_axes_y.begin(), bcast_axes_y.end(), 0);
bcast_y = builder->BroadcastTo(y, output->shape, bcast_axes_y);
}
if (x->shape != output->shape && x->shape != std::vector<int>(1, 1)) {
std::vector<int> bcast_axes_x(x->shape.size());
std::iota(bcast_axes_x.begin(), bcast_axes_x.end(), axis);
bcast_x = builder->BroadcastTo(x, output->shape, bcast_axes_x);
}
out = builder->Add(bcast_x, bcast_y);
// map the the output of decomposed operator to the original.
context.MapOutToOrigin(out, output);
}
}
// Decomposes an `elementwise_add_grad` instruction into Identity, ReduceSum
// and Reshape operations applied to `dout`.
//
// The instruction must carry three inputs, of which only inputs[0] (dout) is
// read here, and two outputs (dx, dy). The "axis" attribute is read
// unconditionally; a negative value is replaced by rank(dx) - rank(dy), which
// is only accepted when rank(dx) >= rank(dy).
void elementwise_add_grad(const Instruction& instr,
                          const DecomposerContext& context) {
  CHECK_EQ(instr->inputs.size(), 3UL)
      << " 3 input tensors for " << instr->op_type;
  CHECK_EQ(instr->outputs.size(), 2UL)
      << "2 output tensors for " << instr->op_type;

  auto dout = instr->inputs[0];
  auto dx = instr->outputs[0];
  auto dy = instr->outputs[1];

  int axis = instr.GetAttrs<int>("axis");
  if (axis < 0) {
    if (dx->shape.size() < dy->shape.size()) {
      LOG(FATAL) << "Please make sure x'rank greater than or equal to y'rank "
                    "when axis = -1";
    }
    axis = dx->shape.size() - dy->shape.size();
  }

  auto* builder = context.builder();

  // Gradient of X.
  Variable dx_t;
  if (dx->shape != dout->shape) {
    std::vector<int> x_axes;
    GetReduceDimsForX(dx->shape, dout->shape, &x_axes);
    // dx is expected to have the same rank as dout, so the reduced axes are
    // kept (keep_dim = true).
    dx_t = builder->ReduceSum(dout, x_axes, true);
  } else {
    dx_t = builder->Identity(dout);
    // NOTE(review): this call passes (dx, dout), whereas the mapping at the
    // end of the function passes (dx_t, dx) - confirm the order is intended.
    context.MapOutToOrigin(dx, dout);
  }

  // Gradient of Y.
  Variable dy_t;
  if (dy->shape != dout->shape) {
    std::vector<int> y_axes;
    GetReduceDimsForY(dy->shape, dout->shape, axis, &y_axes);
    // The rank of dy is less than or equal to that of dout. Since the reduced
    // axes are kept, the result may carry extra "1" dimensions compared to
    // dy, so it is reshaped to dy's shape.
    auto reduced = builder->ReduceSum(dout, y_axes, true);
    dy_t = builder->Reshape(reduced, dy->shape);
  } else {
    dy_t = builder->Identity(dout);
    // NOTE(review): same (dy, dout) argument order as for dx above - confirm.
    context.MapOutToOrigin(dy, dout);
  }

  // Map the outputs of the decomposed operators to the original ones.
  context.MapOutToOrigin(dx_t, dx);
  context.MapOutToOrigin(dy_t, dy);
}
} // namespace decomposer
} // namespace frontend
} // namespace cinn
// Registration helper `broadcast_decomposers`: passes the name
// `elementwise_add` together with
// cinn::frontend::decomposer::elementwise_add to CINN_DECOMPOSER_REGISTER and
// returns true.
// NOTE(review): both macros are defined outside this file; presumably they
// add the function to the decomposer registry under that name - confirm.
CINN_REGISTER_HELPER(broadcast_decomposers) {
CINN_DECOMPOSER_REGISTER(elementwise_add,
cinn::frontend::decomposer::elementwise_add);
return true;
}
// Registration helper `broadcast_grad_decomposers`: passes the name
// `elementwise_add_grad` together with
// cinn::frontend::decomposer::elementwise_add_grad to
// CINN_DECOMPOSER_REGISTER and returns true.
// NOTE(review): both macros are defined outside this file; presumably they
// add the function to the decomposer registry under that name - confirm.
CINN_REGISTER_HELPER(broadcast_grad_decomposers) {
CINN_DECOMPOSER_REGISTER(elementwise_add_grad,
cinn::frontend::decomposer::elementwise_add_grad);
return true;
}
// Copyright (c) 2021 CINN Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include "paddle/cinn/frontend/decomposer/test_helper.h"
namespace cinn::frontend {
// Shape-only case: x of shape [4, 1, 20, 10] plus y of shape [10, 20] with
// axis = 1; the expected output shape handed to RunAndCheckShape is
// [4, 10, 20, 10].
TEST(Decomposer, elementwise_add_bcast0) {
  NetBuilder builder("elementwise_add");
  auto lhs = builder.CreateInput(Float(32), {4, 1, 20, 10});
  auto rhs = builder.CreateInput(Float(32), {10, 20});
  auto sum = builder.Add(lhs, rhs, 1);

  std::vector<std::string> feed_names = {lhs.id().data(), rhs.id().data()};
  std::vector<std::string> fetch_names = {sum->id};
  std::vector<std::vector<int>> expected_shapes = {{4, 10, 20, 10}};
  RunAndCheckShape<float>(&builder, feed_names, fetch_names, expected_shapes);
}
// Shape-only case with the operands of elementwise_add_bcast0 swapped: x of
// shape [10, 20] plus y of shape [4, 1, 20, 10] with axis = 1; the expected
// output shape handed to RunAndCheckShape is [4, 10, 20, 10].
TEST(Decomposer, elementwise_add_bcase1) {
  NetBuilder builder("elementwise_add");
  auto lhs = builder.CreateInput(Float(32), {10, 20});
  auto rhs = builder.CreateInput(Float(32), {4, 1, 20, 10});
  auto sum = builder.Add(lhs, rhs, 1);

  std::vector<std::string> feed_names = {lhs.id().data(), rhs.id().data()};
  std::vector<std::string> fetch_names = {sum->id};
  std::vector<std::vector<int>> expected_shapes = {{4, 10, 20, 10}};
  RunAndCheckShape<float>(&builder, feed_names, fetch_names, expected_shapes);
}
// Shape-only case for the gradient: dout of shape [4, 10, 20, 10], x of shape
// [4, 1, 20, 10], y of shape [10, 20], axis = 1; the expected shapes handed
// to RunAndCheckShape are [4, 1, 20, 10] and [10, 20]. Only dout is listed as
// an input.
TEST(Decomposer, elementwise_add_grad_bcast0) {
  NetBuilder builder("elementwise_add_grad");
  auto grad_out = builder.CreateInput(Float(32), {4, 10, 20, 10});
  auto lhs = builder.CreateInput(Float(32), {4, 1, 20, 10});
  auto rhs = builder.CreateInput(Float(32), {10, 20});
  auto grads = builder.ElementwiseAddGrad(grad_out, lhs, rhs, 1);

  std::vector<std::string> feed_names = {grad_out.id().data()};
  std::vector<std::string> fetch_names = {grads[0]->id, grads[1]->id};
  std::vector<std::vector<int>> expected_shapes = {{4, 1, 20, 10}, {10, 20}};
  RunAndCheckShape<float>(&builder, feed_names, fetch_names, expected_shapes);
}
// Value check: x of shape [32, 64, 32, 32] plus y of shape [64] with axis = 1.
// The reference kernel adds y[j] to every element whose index along axis 1
// is j.
TEST(Decomposer, elementwise_add_bcast1) {
  NetBuilder builder("elementwise_add");
  auto lhs = builder.CreateInput(Float(32), {32, 64, 32, 32});
  auto rhs = builder.CreateInput(Float(32), {64});
  auto sum = builder.Add(lhs, rhs, 1);

  auto reference_add = [](const std::vector<size_t>& lengths,
                          const std::vector<void*>& ptrs) {
    constexpr size_t kBatch = 32;
    constexpr size_t kChannels = 64;
    constexpr size_t kInner = 32 * 32;
    const float* x_data = static_cast<float*>(ptrs[0]);
    const float* y_data = static_cast<float*>(ptrs[1]);
    float* out_data = static_cast<float*>(ptrs[2]);
    // A flat index visits the elements in the same order as the nested
    // (batch, channel, inner) loops would.
    for (size_t idx = 0; idx < kBatch * kChannels * kInner; ++idx) {
      const size_t channel = (idx / kInner) % kChannels;
      out_data[idx] = x_data[idx] + y_data[channel];
    }
  };

  std::vector<std::string> feed_names = {lhs.id().data(), rhs.id().data()};
  std::vector<std::string> fetch_names = {sum->id};
  std::vector<std::vector<int>> expected_shapes = {{32, 64, 32, 32}};
  RunAndCheck<float>(
      &builder, feed_names, fetch_names, expected_shapes, reference_add);
}
// Value check with the operands of elementwise_add_bcast1 swapped: x of shape
// [64] plus y of shape [32, 64, 32, 32] with axis = 1. The reference kernel
// adds x[j] to every element of y whose index along axis 1 is j.
TEST(Decomposer, elementwise_add_bcast1_2) {
  NetBuilder builder("elementwise_add");
  auto lhs = builder.CreateInput(Float(32), {64});
  auto rhs = builder.CreateInput(Float(32), {32, 64, 32, 32});
  auto sum = builder.Add(lhs, rhs, 1);

  auto reference_add = [](const std::vector<size_t>& lengths,
                          const std::vector<void*>& ptrs) {
    constexpr size_t kBatch = 32;
    constexpr size_t kChannels = 64;
    constexpr size_t kInner = 32 * 32;
    const float* x_data = static_cast<float*>(ptrs[0]);
    const float* y_data = static_cast<float*>(ptrs[1]);
    float* out_data = static_cast<float*>(ptrs[2]);
    // A flat index visits the elements in the same order as the nested
    // (batch, channel, inner) loops would.
    for (size_t idx = 0; idx < kBatch * kChannels * kInner; ++idx) {
      const size_t channel = (idx / kInner) % kChannels;
      out_data[idx] = y_data[idx] + x_data[channel];
    }
  };

  std::vector<std::string> feed_names = {lhs.id().data(), rhs.id().data()};
  std::vector<std::string> fetch_names = {sum->id};
  std::vector<std::vector<int>> expected_shapes = {{32, 64, 32, 32}};
  RunAndCheck<float>(
      &builder, feed_names, fetch_names, expected_shapes, reference_add);
}
// Value check for the gradient: dout and x of shape [32, 64, 32, 32], y of
// shape [64], axis = 1. The reference kernel copies dout into dx and sums
// dout over every axis except axis 1 into dy.
TEST(Decomposer, elementwise_add_grad_bcast1) {
  NetBuilder builder("elementwise_add_grad");
  auto grad_out = builder.CreateInput(Float(32), {32, 64, 32, 32});
  auto lhs = builder.CreateInput(Float(32), {32, 64, 32, 32});
  auto rhs = builder.CreateInput(Float(32), {64});
  auto grads = builder.ElementwiseAddGrad(grad_out, lhs, rhs, 1);

  auto reference_grad = [](const std::vector<size_t>& lengths,
                           const std::vector<void*>& ptrs) {
    constexpr size_t kBatch = 32;
    constexpr size_t kChannels = 64;
    constexpr size_t kInner = 32 * 32;
    const float* dout_data = static_cast<float*>(ptrs[0]);
    float* dx_data = static_cast<float*>(ptrs[1]);
    float* dy_data = static_cast<float*>(ptrs[2]);
    // dy is an accumulator, so it is cleared before summing.
    for (size_t channel = 0; channel < kChannels; ++channel) {
      dy_data[channel] = 0;
    }
    // The flat index keeps the (batch, channel, inner) visiting order, so
    // each dy entry is accumulated in the same order as with nested loops.
    for (size_t idx = 0; idx < kBatch * kChannels * kInner; ++idx) {
      const size_t channel = (idx / kInner) % kChannels;
      dx_data[idx] = dout_data[idx];
      dy_data[channel] = dy_data[channel] + dout_data[idx];
    }
  };

  std::vector<std::string> feed_names = {grad_out.id().data()};
  std::vector<std::string> fetch_names = {grads[0]->id, grads[1]->id};
  std::vector<std::vector<int>> expected_shapes = {{32, 64, 32, 32}, {64}};
  RunAndCheck<float>(
      &builder, feed_names, fetch_names, expected_shapes, reference_grad);
}
// Value check: x of shape [32, 16] plus y of shape [1], default axis. The
// reference kernel adds y[0] to the first lengths[0] elements of x.
TEST(Decomposer, elementwise_add_bcast2) {
  NetBuilder builder("elementwise_add");
  auto lhs = builder.CreateInput(Float(32), {32, 16});
  auto rhs = builder.CreateInput(Float(32), {1});
  auto sum = builder.Add(lhs, rhs);

  auto reference_add = [](const std::vector<size_t>& lengths,
                          const std::vector<void*>& ptrs) {
    const size_t count = lengths[0];
    const float* x_data = static_cast<float*>(ptrs[0]);
    const float addend = static_cast<float*>(ptrs[1])[0];
    float* out_data = static_cast<float*>(ptrs[2]);
    for (size_t pos = 0; pos < count; ++pos) {
      out_data[pos] = x_data[pos] + addend;
    }
  };

  std::vector<std::string> feed_names = {lhs.id().data(), rhs.id().data()};
  std::vector<std::string> fetch_names = {sum->id};
  std::vector<std::vector<int>> expected_shapes = {{32, 16}};
  RunAndCheck<float>(
      &builder, feed_names, fetch_names, expected_shapes, reference_add);
}
// Value check with the operands of elementwise_add_bcast2 swapped: x of shape
// [1] plus y of shape [32, 16], default axis. The reference kernel adds x[0]
// to each of the 32 * 16 elements of y.
TEST(Decomposer, elementwise_add_bcast2_2) {
  NetBuilder builder("elementwise_add");
  auto lhs = builder.CreateInput(Float(32), {1});
  auto rhs = builder.CreateInput(Float(32), {32, 16});
  auto sum = builder.Add(lhs, rhs);

  auto reference_add = [](const std::vector<size_t>& lengths,
                          const std::vector<void*>& ptrs) {
    // The element count is hard-coded instead of read from `lengths`.
    // NOTE(review): presumably because lengths[0] describes x, which holds a
    // single element - confirm against RunAndCheck.
    const size_t count = 32 * 16;
    const float addend = static_cast<float*>(ptrs[0])[0];
    const float* y_data = static_cast<float*>(ptrs[1]);
    float* out_data = static_cast<float*>(ptrs[2]);
    for (size_t pos = 0; pos < count; ++pos) {
      out_data[pos] = y_data[pos] + addend;
    }
  };

  std::vector<std::string> feed_names = {lhs.id().data(), rhs.id().data()};
  std::vector<std::string> fetch_names = {sum->id};
  std::vector<std::vector<int>> expected_shapes = {{32, 16}};
  RunAndCheck<float>(
      &builder, feed_names, fetch_names, expected_shapes, reference_add);
}
// Same scenario as elementwise_add_bcast2 (x of shape [32, 16] plus y of
// shape [1]) with inputs created as Int(64) and buffers read as int64_t.
TEST(Decomposer, elementwise_add_bcast2_3) {
  constexpr int kLength = 64;
  using int_ty = int64_t;
  NetBuilder builder("elementwise_add");
  auto lhs = builder.CreateInput(Int(kLength), {32, 16});
  auto rhs = builder.CreateInput(Int(kLength), {1});
  auto sum = builder.Add(lhs, rhs);

  auto reference_add = [](const std::vector<size_t>& lengths,
                          const std::vector<void*>& ptrs) {
    const size_t count = lengths[0];
    const int_ty* x_data = static_cast<int_ty*>(ptrs[0]);
    const int_ty addend = static_cast<int_ty*>(ptrs[1])[0];
    int_ty* out_data = static_cast<int_ty*>(ptrs[2]);
    for (size_t pos = 0; pos < count; ++pos) {
      out_data[pos] = x_data[pos] + addend;
    }
  };

  std::vector<std::string> feed_names = {lhs.id().data(), rhs.id().data()};
  std::vector<std::string> fetch_names = {sum->id};
  std::vector<std::vector<int>> expected_shapes = {{32, 16}};
  RunAndCheck<int_ty>(
      &builder, feed_names, fetch_names, expected_shapes, reference_add);
}
// Value check for the gradient: dout and x of shape [32, 16], y of shape [1],
// default axis. The reference kernel copies dout into dx and sums the first
// lengths[0] elements of dout into dy[0].
TEST(Decomposer, elementwise_add_grad_bcast2) {
  NetBuilder builder("elementwise_add_grad");
  auto dout = builder.CreateInput(Float(32), {32, 16});
  auto x = builder.CreateInput(Float(32), {32, 16});
  auto y = builder.CreateInput(Float(32), {1});
  auto out_grads = builder.ElementwiseAddGrad(dout, x, y);

  auto add_grad_cpu = [](const std::vector<size_t>& lengths,
                         const std::vector<void*>& ptrs) {
    size_t n = lengths[0];
    float* dout = static_cast<float*>(ptrs[0]);
    float* dx = static_cast<float*>(ptrs[1]);
    float* dy = static_cast<float*>(ptrs[2]);
    // dy[0] is an accumulator: clear it explicitly instead of relying on the
    // buffer being handed over zero-filled (elementwise_add_grad_bcast1 does
    // the same for its dy).
    dy[0] = 0;
    for (size_t i = 0; i < n; ++i) {
      float tmp = dout[i];
      dx[i] = tmp;
      dy[0] += tmp;
    }
  };

  std::vector<std::string> input_names = {dout.id().data()};
  std::vector<std::string> output_names = {out_grads[0]->id, out_grads[1]->id};
  std::vector<std::vector<int>> output_shapes = {{32, 16}, {1}};
  RunAndCheck<float>(
      &builder, input_names, output_names, output_shapes, add_grad_cpu);
}
// Value check without broadcasting: x and y both of shape [32, 16]. The
// reference kernel adds the first lengths[0] elements pairwise.
TEST(Decomposer, elementwise_add_same_dims) {
  NetBuilder builder("elementwise_add");
  auto lhs = builder.CreateInput(Float(32), {32, 16});
  auto rhs = builder.CreateInput(Float(32), {32, 16});
  auto sum = builder.Add(lhs, rhs);

  auto reference_add = [](const std::vector<size_t>& lengths,
                          const std::vector<void*>& ptrs) {
    const size_t count = lengths[0];
    const float* x_data = static_cast<float*>(ptrs[0]);
    const float* y_data = static_cast<float*>(ptrs[1]);
    float* out_data = static_cast<float*>(ptrs[2]);
    for (size_t pos = 0; pos < count; ++pos) {
      out_data[pos] = x_data[pos] + y_data[pos];
    }
  };

  std::vector<std::string> feed_names = {lhs.id().data(), rhs.id().data()};
  std::vector<std::string> fetch_names = {sum->id};
  std::vector<std::vector<int>> expected_shapes = {{32, 16}};
  RunAndCheck<float>(
      &builder, feed_names, fetch_names, expected_shapes, reference_add);
}
// Value check for the gradient without broadcasting: dout, x and y all of
// shape [32, 16]. The reference kernel copies the first lengths[0] elements
// of dout into both dx and dy.
TEST(Decomposer, elementwise_add_grad_same_dims) {
  NetBuilder builder("elementwise_add_grad");
  auto grad_out = builder.CreateInput(Float(32), {32, 16});
  auto lhs = builder.CreateInput(Float(32), {32, 16});
  auto rhs = builder.CreateInput(Float(32), {32, 16});
  auto grads = builder.ElementwiseAddGrad(grad_out, lhs, rhs);

  auto reference_grad = [](const std::vector<size_t>& lengths,
                           const std::vector<void*>& ptrs) {
    const size_t count = lengths[0];
    const float* dout_data = static_cast<float*>(ptrs[0]);
    float* dx_data = static_cast<float*>(ptrs[1]);
    float* dy_data = static_cast<float*>(ptrs[2]);
    for (size_t pos = 0; pos < count; ++pos) {
      const float grad = dout_data[pos];
      dx_data[pos] = grad;
      dy_data[pos] = grad;
    }
  };

  std::vector<std::string> feed_names = {grad_out.id().data()};
  std::vector<std::string> fetch_names = {grads[0]->id, grads[1]->id};
  std::vector<std::vector<int>> expected_shapes = {{32, 16}, {32, 16}};
  RunAndCheck<float>(
      &builder, feed_names, fetch_names, expected_shapes, reference_grad);
}
} // namespace cinn::frontend
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment