// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================

// Utils for working with aligned memory blocks.  The DRAGNN runtime requires
// aligned memory for use in vectorized math.  Do not rely on any particular
// value of the alignment requirement, because it will vary over time and in
// different build configurations.

#ifndef DRAGNN_RUNTIME_ALIGNMENT_H_
#define DRAGNN_RUNTIME_ALIGNMENT_H_

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#include <memory>
#include <type_traits>
#include <vector>

#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/platform/logging.h"

// bfloat16 is a type that has some private methods (so it is not POD), but is
// known to be trivially destructible.  Ergo, it gets special handling below so
// that IsAlignable() returns true for it.
namespace tensorflow {
struct bfloat16;
}

namespace syntaxnet {
namespace dragnn {
namespace runtime {

// Returns true if |T| can be used in an aligned memory block.
template <class T>
constexpr bool IsAlignable();

// Returns OK iff the |pointer| satisfies the alignment requirement.
tensorflow::Status OkIfAligned(const void *pointer);

// Returns the next alignment boundary at or after the |byte_offset|.
size_t PadToAlignment(size_t byte_offset);

// As above, but for pointers.
template <class T>
T *PadToAlignment(T *pointer);

// Returns the number of bytes required to store a sequence of |num_arrays|
// aligned arrays of |array_size| bytes, including alignment padding.  See
// (Mutable)AlignedArea below.
size_t ComputeAlignedAreaSize(size_t num_arrays, size_t array_size);

// Returns the number of bytes required to store a sequence of byte arrays of
// the given |sizes|, including alignment padding after each array.
size_t ComputeTotalBytesWithAlignmentPadding(const std::vector<size_t> &sizes);
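
// For illustration only (the values below assume the current 32-byte
// alignment, which is an internal detail and may change):
//   PadToAlignment(0)   == 0
//   PadToAlignment(1)   == 32
//   PadToAlignment(32)  == 32
//   PadToAlignment(33)  == 64
//   ComputeAlignedAreaSize(3, 40) == 3 * 64 == 192
//   ComputeTotalBytesWithAlignmentPadding({10, 40}) == 32 + 64 == 96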

// Forward-declared for friendship below.
class Operands;
class UniqueAlignedArray;
enum class BlockedMatrixFormat;

namespace internal {

// A non-owning view of an aligned byte array.  Templated so const and mutable
// versions can share implementation.  Do not use this class directly; instead
// use (Mutable)AlignedView below.
template <class Byte>
class AlignedViewImpl {
 public:
  static_assert(sizeof(Byte) == 1, "Byte must be byte-sized");

  // Creates an empty view.
  AlignedViewImpl() = default;

  // Points this at the same bytes as |that|, possibly reinterpreting type.
  template <class OtherByte>
  explicit AlignedViewImpl(AlignedViewImpl<OtherByte> that);
  template <class OtherByte>
  AlignedViewImpl &operator=(AlignedViewImpl<OtherByte> that);

  // Points this at [|data|,|data|+|size|).  On error, returns non-OK and
  // modifies nothing.
  tensorflow::Status Reset(Byte *data, size_t size);

  // Splits this into a list of |views| of the |sizes|, possibly reinterpreting
  // type.  The |views| need not completely cover all bytes of this.  Requires
  // that this spans ComputeTotalBytesWithAlignmentPadding(|sizes|) bytes.  On
  // error, returns non-OK and modifies nothing.
  template <class OtherByte>
  tensorflow::Status Split(
      const std::vector<size_t> &sizes,
      std::vector<AlignedViewImpl<OtherByte>> *views) const;

  // Accessors.
  Byte *data() const { return data_; }
  size_t size() const { return size_; }
  bool empty() const { return size() == 0; }

 private:
  template <class OtherByte>
  friend class AlignedViewImpl;
  template <class OtherByte>
  friend class AlignedAreaImpl;
  friend Operands;
  friend UniqueAlignedArray;

  // Directly creates an aligned view, bypassing alignment checks.
  AlignedViewImpl(Byte *data, size_t size);

  // Pointer to the start of the view.
  Byte *data_ = nullptr;

  // Number of bytes in the view.
  size_t size_ = 0;
};

// A non-owning view of an aligned, 2-dimensional byte array.  Templated so
// const and mutable versions can share implementation.  Do not use this class
// directly; instead use (Mutable)AlignedArea below.
template <class Byte>
class AlignedAreaImpl {
 public:
  static_assert(sizeof(Byte) == 1, "Byte must be byte-sized");

  // Creates an empty area.
  AlignedAreaImpl() = default;

  // Points this at the same bytes as |that|, possibly reinterpreting type.
  template <class OtherByte>
  explicit AlignedAreaImpl(AlignedAreaImpl<OtherByte> that);
  template <class OtherByte>
  AlignedAreaImpl &operator=(AlignedAreaImpl<OtherByte> that);

  // Resets this to a sequence of |num_views| aligned sub-views of the |view|,
  // each |view_size| bytes wide.  The first sub-view covers [0,|view_size|) of
  // |view|, and each subsequent sub-view starts at the next alignment
  // boundary.  Requires that |view| spans
  // ComputeAlignedAreaSize(|num_views|,|view_size|) bytes or more.  On error,
  // returns non-OK and modifies nothing.
  template <class OtherByte>
  tensorflow::Status Reset(AlignedViewImpl<OtherByte> view, size_t num_views,
                           size_t view_size);

  // Accessors.
  AlignedViewImpl<Byte> view(size_t index) const;
  Byte *data() const { return data_; }
  size_t num_views() const { return num_views_; }
  size_t view_size() const { return view_size_; }
  size_t view_stride() const { return view_stride_; }
  bool empty() const { return num_views() == 0; }

 private:
  template <class OtherByte>
  friend class AlignedAreaImpl;
  friend Operands;

  // Directly creates an aligned area, bypassing alignment checks.
  AlignedAreaImpl(Byte *data, size_t num_views, size_t view_size,
                  size_t view_stride);

  // Pointer to the start of the first view.
  Byte *data_ = nullptr;

  // Number of views in the area.
  size_t num_views_ = 0;

  // Size of each view in bytes, excluding alignment padding.
  size_t view_size_ = 0;

  // Number of bytes between the starts of consecutive views.  NB: This is not
  // necessarily equal to PadToAlignment(|view_size_|).
  size_t view_stride_ = 0;
};

}  // namespace internal

// Public aliases; use these.
using AlignedView = internal::AlignedViewImpl<const char>;
using AlignedArea = internal::AlignedAreaImpl<const char>;
using MutableAlignedView = internal::AlignedViewImpl<char>;
using MutableAlignedArea = internal::AlignedAreaImpl<char>;

// A uniquely-owned aligned byte array.
class UniqueAlignedArray {
 public:
  // Creates an empty byte array.
  UniqueAlignedArray() = default;

  // Reallocates this to |new_size| bytes, and discards the current byte array.
  // Contents are uninitialized.
  void Reset(size_t new_size);

  // Like Reset(), but only reallocates if |new_size| is more than the current
  // capacity.  NB: Does not preserve current content when reallocation occurs;
  // use Resize() if that is desired.
  void Reserve(size_t new_size);

  // Resizes this to contain |new_size| bytes, preserving current content.  If
  // |new_size| exceeds the current size, the added bytes are uninitialized.
  // If |new_size| exceeds the current capacity, reallocates and copies current
  // content.  Returns true if reallocation occurred.
  bool Resize(size_t new_size);

  // Returns the aligned byte array.
  MutableAlignedView view() const { return view_; }

 private:
  // Underlying byte array, which is padded for alignment.
  std::unique_ptr<char[]> padded_array_;

  // Size of the aligned portion of |padded_array_|.
  size_t capacity_ = 0;

  // Active range of the |padded_array_|.
  MutableAlignedView view_;
};
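
// Typical usage (illustrative sketch only; the concrete sizes are arbitrary,
// and error handling via TF_CHECK_OK is just for brevity):
//
//   UniqueAlignedArray array;
//   array.Reset(ComputeAlignedAreaSize(/*num_arrays=*/3, /*array_size=*/100));
//
//   // View the storage as 3 aligned sub-views of 100 bytes each.
//   MutableAlignedArea area;
//   TF_CHECK_OK(area.Reset(array.view(), /*num_views=*/3, /*view_size=*/100));
//   MutableAlignedView second_row = area.view(1);
//
//   // Alternatively, split the storage into differently-sized views.
//   std::vector<MutableAlignedView> views;
//   TF_CHECK_OK(array.view().Split({100, 200}, &views));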

// Implementation details below.

namespace internal {

// Required alignment for memory blocks.  Only the runtime framework should use
// this; otherwise, DO NOT access or otherwise depend on this value.
enum : size_t { kAlignmentBytes = 32 };

}  // namespace internal

template <class T>
constexpr bool IsAlignable() {
  // Either T is divisible into alignment windows, or an alignment window is
  // divisible into Ts.  Likewise for T's alignment requirement.  Finally, T
  // must be POD because we won't call its constructor or destructor.
  return (sizeof(T) % internal::kAlignmentBytes == 0 ||
          internal::kAlignmentBytes % sizeof(T) == 0) &&
         (alignof(T) % internal::kAlignmentBytes == 0 ||
          internal::kAlignmentBytes % alignof(T) == 0) &&
         (std::is_pod<T>::value ||
          std::is_same<T, tensorflow::bfloat16>::value);
}
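
// For example, under the current 32-byte alignment, IsAlignable<float>() and
// IsAlignable<char>() are true (their sizes and alignments divide 32), while a
// 12-byte POD struct would not be alignable (12 neither divides nor is
// divisible by 32).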

inline tensorflow::Status OkIfAligned(const void *pointer) {
  const uintptr_t address = reinterpret_cast<uintptr_t>(pointer);
  if (address % internal::kAlignmentBytes != 0) {
    return tensorflow::errors::InvalidArgument(
        "Pointer fails alignment requirement: ", address, " vs required ",
        internal::kAlignmentBytes);
  }
  return tensorflow::Status::OK();
}

inline size_t PadToAlignment(size_t byte_offset) {
  // Round up to the next alignment boundary by adding kAlignmentBytes - 1 and
  // then rounding down.  The bitmask clears the low-order bits of the sum,
  // effectively rounding down to the previous alignment boundary.
  return (byte_offset + internal::kAlignmentBytes - 1) &
         ~(internal::kAlignmentBytes - 1);
}

template <class T>
T *PadToAlignment(T *pointer) {
  static_assert(IsAlignable<T>(), "T is not alignable");
  uintptr_t address = reinterpret_cast<uintptr_t>(pointer);
  address = (address + internal::kAlignmentBytes - 1) &
            ~(internal::kAlignmentBytes - 1);
  return reinterpret_cast<T *>(address);
}

inline size_t ComputeAlignedAreaSize(size_t num_arrays, size_t array_size) {
  return num_arrays * PadToAlignment(array_size);
}

inline size_t ComputeTotalBytesWithAlignmentPadding(
    const std::vector<size_t> &sizes) {
  size_t total = 0;
  for (const size_t size : sizes) total += PadToAlignment(size);
  return total;
}

namespace internal {

template <class Byte>
template <class OtherByte>
AlignedViewImpl<Byte>::AlignedViewImpl(AlignedViewImpl<OtherByte> that)
    : data_(reinterpret_cast<Byte *>(that.data())), size_(that.size()) {}

template <class Byte>
template <class OtherByte>
AlignedViewImpl<Byte> &AlignedViewImpl<Byte>::operator=(
    AlignedViewImpl<OtherByte> that) {
  data_ = reinterpret_cast<Byte *>(that.data());
  size_ = that.size();
  return *this;
}

template <class Byte>
tensorflow::Status AlignedViewImpl<Byte>::Reset(Byte *data, size_t size) {
  TF_RETURN_IF_ERROR(OkIfAligned(data));

  // Success; make modifications.
  data_ = data;
  size_ = size;
  return tensorflow::Status::OK();
}

template <class Byte>
template <class OtherByte>
tensorflow::Status AlignedViewImpl<Byte>::Split(
    const std::vector<size_t> &sizes,
    std::vector<AlignedViewImpl<OtherByte>> *views) const {
  const size_t total_bytes = ComputeTotalBytesWithAlignmentPadding(sizes);
  if (size() < total_bytes) {
    return tensorflow::errors::InvalidArgument(
        "View is too small to be split into sizes [",
        tensorflow::str_util::Join(sizes, ", "), "]: need ", total_bytes,
        " bytes but have ", size(), " bytes");
  }

  // Success; make modifications.
  views->clear();
  views->reserve(sizes.size());
  Byte *base = data();
  for (const size_t size : sizes) {
    views->push_back(AlignedViewImpl<OtherByte>(base, size));
    base = PadToAlignment(base + size);
  }
  DCHECK_EQ(base - data(), total_bytes);
  return tensorflow::Status::OK();
}

template <class Byte>
AlignedViewImpl<Byte>::AlignedViewImpl(Byte *data, size_t size)
    : data_(data), size_(size) {
  TF_DCHECK_OK(OkIfAligned(data_));
}

template <class Byte>
template <class OtherByte>
AlignedAreaImpl<Byte>::AlignedAreaImpl(AlignedAreaImpl<OtherByte> that)
    : data_(reinterpret_cast<Byte *>(that.data_)),
      num_views_(that.num_views()),
      view_size_(that.view_size()),
      view_stride_(that.view_stride_) {}

template <class Byte>
template <class OtherByte>
AlignedAreaImpl<Byte> &AlignedAreaImpl<Byte>::operator=(
    AlignedAreaImpl<OtherByte> that) {
  data_ = reinterpret_cast<Byte *>(that.data_);
  num_views_ = that.num_views();
  view_size_ = that.view_size();
  view_stride_ = that.view_stride_;
  return *this;
}

template <class Byte>
template <class OtherByte>
tensorflow::Status AlignedAreaImpl<Byte>::Reset(AlignedViewImpl<OtherByte> view,
                                                size_t num_views,
                                                size_t view_size) {
  const size_t total_bytes = ComputeAlignedAreaSize(num_views, view_size);
  if (view.size() < total_bytes) {
    return tensorflow::errors::InvalidArgument(
        "View is too small for area of ", num_views, " views of ", view_size,
        " bytes: need ", total_bytes, " bytes but got ", view.size(),
        " bytes");
  }

  // Success; make modifications.
  data_ = reinterpret_cast<Byte *>(view.data());
  num_views_ = num_views;
  view_size_ = view_size;
  view_stride_ = PadToAlignment(view_size_);
  return tensorflow::Status::OK();
}

template <class Byte>
AlignedViewImpl<Byte> AlignedAreaImpl<Byte>::view(size_t index) const {
  DCHECK_LT(index, num_views());
  return AlignedViewImpl<Byte>(data_ + view_stride_ * index, view_size_);
}

template <class Byte>
AlignedAreaImpl<Byte>::AlignedAreaImpl(Byte *data, size_t num_views,
                                       size_t view_size, size_t view_stride)
    : data_(data),
      num_views_(num_views),
      view_size_(view_size),
      view_stride_(view_stride) {
  TF_DCHECK_OK(OkIfAligned(data_));
  TF_DCHECK_OK(OkIfAligned(static_cast<Byte *>(nullptr) + view_stride_));
}

}  // namespace internal

inline void UniqueAlignedArray::Reset(size_t new_size) {
  // Pad the |new_size| to the next alignment boundary, so the final bytes of
  // the array are still in a full alignment window.  E.g., if we resize to 48
  // bytes with 32-byte alignment, then we allocate 64 bytes so the final 16
  // bytes are still part of a full 32-byte alignment window.
  const size_t aligned_size = PadToAlignment(new_size);

  // To obtain an aligned address, allocate a sufficiently-padded byte array
  // and find an aligned address near the start of the block.
  //
  // TODO(googleuser): Alternatively, we could use library functions such as
  // memalign(), posix_memalign(), or aligned_alloc(), but those may not be
  // present on all platforms.  Consider adding some #ifs to allow use of those
  // library functions when available.
  padded_array_.reset(new char[aligned_size + internal::kAlignmentBytes - 1]);
  capacity_ = aligned_size;
  view_.size_ = new_size;
  view_.data_ = PadToAlignment(padded_array_.get());
  TF_DCHECK_OK(OkIfAligned(view_.data_));
}

inline void UniqueAlignedArray::Reserve(size_t new_size) {
  if (new_size > capacity_) {
    Reset(new_size);
  } else {
    view_.size_ = new_size;
  }
}
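
// Illustrative contrast of the growth operations (the exact capacities depend
// on internal details such as the reallocation factor and alignment, so treat
// the numbers as a sketch, not a contract):
//
//   UniqueAlignedArray array;
//   array.Resize(100);    // first allocation; contents uninitialized
//   array.Resize(50);     // fits in capacity; first 50 bytes preserved
//   array.Resize(1000);   // exceeds capacity; first 50 bytes copied over
//   array.Reserve(5000);  // exceeds capacity; contents NOT preserved
//   array.Reset(100);     // always reallocates; contents discarded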

inline bool UniqueAlignedArray::Resize(size_t new_size) {
  // Avoid reallocation, if possible.
  if (new_size <= capacity_) {
    view_.size_ = new_size;
    return false;
  }

  // Reallocate and copy.  Extend the life of the old array until it is copied.
  //
  // Note: C realloc() can extend a byte array in place (i.e., without
  // copying).  Unfortunately, there is no aligned version of realloc().
  // Moreover, adding alignment padding could cause double-copying: first, when
  // realloc() copies the data to the new buffer, and second, if the amount of
  // padding required at the new address is not the same as before.
  const std::unique_ptr<char[]> old_array = std::move(padded_array_);
  const MutableAlignedView old_view = view_;
  Reset(2 * new_size);
  memcpy(view_.data(), old_view.data(), old_view.size());
  view_.size_ = new_size;
  return true;
}

}  // namespace runtime
}  // namespace dragnn
}  // namespace syntaxnet

#endif  // DRAGNN_RUNTIME_ALIGNMENT_H_