Commit a4bb31d0 authored by Terry Koo's avatar Terry Koo
Browse files

Export @195097388.

parent dea7ecf6
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#ifndef DRAGNN_RUNTIME_MATH_ARITHMETIC_AVX_H_
#define DRAGNN_RUNTIME_MATH_ARITHMETIC_AVX_H_
#if defined(__AVX2__)
#include <stddef.h>
#include "dragnn/runtime/math/arithmetic_common.h"
#include "dragnn/runtime/math/types.h"
#include "tensorflow/core/platform/logging.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
// TODO(googleuser): Leaving this empty means that the definitions
// from arithmetic_common.h carry through. Provide template specializations
// that use architecture-specific intrinsics.
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
#endif // defined(__AVX2__)
#endif // DRAGNN_RUNTIME_MATH_ARITHMETIC_AVX_H_
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
// Declarations of arithmetic operations and trivial generic implementations.
// Architecture-specific implementations should include this header and define
// template specializations that override the generic implementations.
#ifndef DRAGNN_RUNTIME_MATH_ARITHMETIC_COMMON_H_
#define DRAGNN_RUNTIME_MATH_ARITHMETIC_COMMON_H_
#include <stddef.h>
#include <algorithm>
#include "dragnn/runtime/math/types.h"
#include "tensorflow/core/platform/logging.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
// Performs output = scale * input. Dimensions must match.
template <class T>
void ScaleElements(Vector<T> input, T scale, MutableVector<T> output);
// Performs output += scale * input. Dimensions must match.
template <class T>
void AddScaledElements(Vector<T> input, T scale, MutableVector<T> output);
// Performs values = max(minimum, values) in place.
template <class T>
void MaxElements(T minimum, MutableVector<T> values);
// Performs output = matrix * input. All vectors are interpreted as column
// vectors. Dimensions must match.
template <class T>
void MultiplyMatrixAndVector(Matrix<T> matrix, Vector<T> input,
MutableVector<T> output);
// Performs output = bias + matrix * input. All vectors are interpreted as
// column vectors. Dimensions must match.
template <class T>
void MultiplyMatrixAndVectorWithBias(Matrix<T> matrix, Vector<T> bias,
Vector<T> input, MutableVector<T> output);
// Implementation details below.
template <class T>
void ScaleElements(T scale, Vector<T> input, MutableVector<T> output) {
DCHECK_EQ(input.size(), output.size());
for (size_t i = 0; i < input.size(); ++i) output[i] = scale * input[i];
}
template <class T>
void AddScaledElements(T scale, Vector<T> input, MutableVector<T> output) {
DCHECK_EQ(input.size(), output.size());
for (size_t i = 0; i < input.size(); ++i) output[i] += scale * input[i];
}
template <class T>
void MaxElements(T minimum, MutableVector<T> values) {
for (T &value : values) value = std::max(minimum, value);
}
namespace internal {
// Like MultiplyMatrixAndVectorWithBias(), but if |ignore_bias| is true, then
// the |bias| is treated as zero and its dimensions are not checked.
template <bool ignore_bias, class T>
void MultiplyMatrixAndVectorImpl(Matrix<T> matrix, Vector<T> bias,
                                 Vector<T> input, MutableVector<T> output) {
  DCHECK_EQ(matrix.num_columns(), input.size());
  if (!ignore_bias) DCHECK_EQ(matrix.num_rows(), bias.size());
  DCHECK_EQ(matrix.num_rows(), output.size());
  const size_t num_rows = matrix.num_rows();
  for (size_t row_index = 0; row_index < num_rows; ++row_index) {
    const Vector<T> row = matrix.row(row_index);
    DCHECK_EQ(row.size(), input.size());
    // Start from the bias (or zero, when the bias is ignored) and accumulate
    // the dot product of this row with the input column vector.
    T accumulator = ignore_bias ? T() : bias[row_index];
    const size_t num_columns = row.size();
    for (size_t column = 0; column < num_columns; ++column) {
      accumulator += row[column] * input[column];
    }
    output[row_index] = accumulator;
  }
}
} // namespace internal
// Forwards to the shared implementation with the bias disabled; the bias
// argument is an unused empty placeholder.
template <class T>
void MultiplyMatrixAndVector(Matrix<T> matrix, Vector<T> input,
                             MutableVector<T> output) {
  internal::MultiplyMatrixAndVectorImpl<true>(matrix, Vector<T>(), input,
                                              output);
}
// Forwards to the shared implementation with the bias enabled, so |bias| is
// added to each element of the matrix-vector product.
template <class T>
void MultiplyMatrixAndVectorWithBias(Matrix<T> matrix, Vector<T> bias,
                                     Vector<T> input, MutableVector<T> output) {
  internal::MultiplyMatrixAndVectorImpl<false>(matrix, bias, input, output);
}
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
#endif // DRAGNN_RUNTIME_MATH_ARITHMETIC_COMMON_H_
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#ifndef DRAGNN_RUNTIME_MATH_ARITHMETIC_NEON_H_
#define DRAGNN_RUNTIME_MATH_ARITHMETIC_NEON_H_
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <stddef.h>
#include "dragnn/runtime/math/arithmetic_common.h"
#include "dragnn/runtime/math/types.h"
#include "tensorflow/core/platform/logging.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
// TODO(googleuser): Leaving this empty means that the definitions
// from arithmetic_common.h carry through. Provide template specializations
// that use architecture-specific intrinsics.
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
#endif // defined(__ARM_NEON) || defined(__ARM_NEON__)
#endif // DRAGNN_RUNTIME_MATH_ARITHMETIC_NEON_H_
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#ifndef DRAGNN_RUNTIME_MATH_ARITHMETIC_SSE_H_
#define DRAGNN_RUNTIME_MATH_ARITHMETIC_SSE_H_
#if defined(__SSE4_2__)
#include <stddef.h>
#include "dragnn/runtime/math/arithmetic_common.h"
#include "dragnn/runtime/math/types.h"
#include "tensorflow/core/platform/logging.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
// TODO(googleuser): Leaving this empty means that the definitions
// from arithmetic_common.h carry through. Provide template specializations
// that use architecture-specific intrinsics.
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
#endif // defined(__SSE4_2__)
#endif // DRAGNN_RUNTIME_MATH_ARITHMETIC_SSE_H_
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/runtime/math/arithmetic.h"
#include <stddef.h>
#include <vector>
#include "dragnn/runtime/math/types.h"
#include "dragnn/runtime/test/helpers.h"
#include "tensorflow/core/platform/test.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
namespace {
// ScaleElements() on zero-length vectors should be a harmless no-op.
TEST(ScaleElementsTest, Empty) {
  Vector<float> empty_input;
  MutableVector<float> empty_output;
  ScaleElements(1.5f, empty_input, empty_output);
}
// ScaleElements() should write scale * input[i] into output[i], replacing the
// prior contents of the output vector.
TEST(ScaleElementsTest, Populated) {
  UniqueVector<float> source({-2.0f, -3.0f, 5.0f});
  UniqueVector<float> destination({7.0f, 11.0f, 13.0f});  // gets overwritten
  ScaleElements(1.5f, Vector<float>(*source), *destination);
  const double expected[] = {1.5 * -2.0, 1.5 * -3.0, 1.5 * 5.0};
  for (int i = 0; i < 3; ++i) EXPECT_EQ((*destination)[i], expected[i]);
}
// AddScaledElements() on zero-length vectors should be a harmless no-op.
TEST(AddScaledElementsTest, Empty) {
  Vector<float> empty_input;
  MutableVector<float> empty_output;
  AddScaledElements(1.5f, empty_input, empty_output);
}
// AddScaledElements() should accumulate scale * input[i] into output[i],
// preserving the output's prior contents as the starting value.
TEST(AddScaledElementsTest, Populated) {
  UniqueVector<float> source({-2.0f, -3.0f, 5.0f});
  UniqueVector<float> accumulator({7.0f, 11.0f, 13.0f});  // gets added to
  AddScaledElements(1.5f, Vector<float>(*source), *accumulator);
  const double expected[] = {1.5 * -2.0 + 7.0, 1.5 * -3.0 + 11.0,
                             1.5 * 5.0 + 13.0};
  for (int i = 0; i < 3; ++i) EXPECT_EQ((*accumulator)[i], expected[i]);
}
// MaxElements() on a zero-length vector should be a harmless no-op.
TEST(MaxElementsTest, Empty) {
  MutableVector<float> no_values;
  MaxElements(1.5f, no_values);
}
// MaxElements() should clamp each element from below by the minimum, in place.
TEST(MaxElementsTest, Populated) {
  UniqueVector<float> values({-1.0f, 2.0f, 0.25f, -0.5f, 0.375f});
  MaxElements(0.125f, *values);
  const double expected[] = {0.125, 2.0, 0.25, 0.125, 0.375};
  for (int i = 0; i < 5; ++i) EXPECT_EQ((*values)[i], expected[i]);
}
// MultiplyMatrixAndVector() on empty operands should be a harmless no-op.
TEST(MultiplyMatrixAndVectorTest, Empty) {
  Matrix<float> empty_matrix;
  Vector<float> empty_input;
  MutableVector<float> empty_output;
  MultiplyMatrixAndVector(empty_matrix, empty_input, empty_output);
}
// MultiplyMatrixAndVector() should compute matrix * input, overwriting the
// output vector.
TEST(MultiplyMatrixAndVectorTest, Populated) {
  UniqueMatrix<float> weights({{2.0f, 3.0f},  //
                               {5.0f, 7.0f},  //
                               {11.0f, 13.0f}});
  UniqueVector<float> column({-0.5f, 2.0f});
  UniqueVector<float> product({9.8f, 7.6f, 5.4f});  // gets overwritten
  MultiplyMatrixAndVector(Matrix<float>(*weights), Vector<float>(*column),
                          *product);
  const double expected[] = {2.0 * -0.5 + 3.0 * 2.0,  //
                             5.0 * -0.5 + 7.0 * 2.0,  //
                             11.0 * -0.5 + 13.0 * 2.0};
  for (int i = 0; i < 3; ++i) EXPECT_EQ((*product)[i], expected[i]);
}
// MultiplyMatrixAndVectorWithBias() on empty operands should be a no-op.
TEST(MultiplyMatrixAndVectorWithBiasTest, Empty) {
  Matrix<float> empty_matrix;
  Vector<float> empty_bias;
  Vector<float> empty_input;
  MutableVector<float> empty_output;
  MultiplyMatrixAndVectorWithBias(empty_matrix, empty_bias, empty_input,
                                  empty_output);
}
// MultiplyMatrixAndVectorWithBias() should compute bias + matrix * input,
// overwriting the output vector.
TEST(MultiplyMatrixAndVectorWithBiasTest, Populated) {
  UniqueMatrix<float> weights({{2.0f, 3.0f},  //
                               {5.0f, 7.0f},  //
                               {11.0f, 13.0f}});
  UniqueVector<float> offsets({100.5f, 200.25f, 300.75f});
  UniqueVector<float> column({-0.5f, 2.0f});
  UniqueVector<float> result({9.8f, 7.6f, 5.4f});  // gets overwritten
  MultiplyMatrixAndVectorWithBias(Matrix<float>(*weights),
                                  Vector<float>(*offsets),
                                  Vector<float>(*column), *result);
  const double expected[] = {100.5 + 2.0 * -0.5 + 3.0 * 2.0,
                             200.25 + 5.0 * -0.5 + 7.0 * 2.0,
                             300.75 + 11.0 * -0.5 + 13.0 * 2.0};
  for (int i = 0; i < 3; ++i) EXPECT_EQ((*result)[i], expected[i]);
}
// A dummy type for the specializations below. Specializing on this unique
// dummy type ensures we don't conflict with any existing specialization.
struct Foo {
  // Arbitrary payload; written by the dummy ScaleElements() specialization.
  float value;
};
} // namespace
// Dummy specialization for use in the subsequent tests. Ignores its inputs
// entirely and stamps a sentinel value into every output element.
template <>
void ScaleElements(Foo scale, Vector<Foo> input, MutableVector<Foo> output) {
  for (size_t i = 0; i < output.size(); ++i) output[i].value = 777.0;
}
namespace {
// The explicit specialization above should take precedence over the generic
// implementation when the element type is Foo.
TEST(ScaleElementsTest, OverriddenByTemplateSpecialization) {
  // The element values are uninitialized, but the specialization never reads
  // them, so this is safe.
  UniqueVector<Foo> input(3);
  UniqueVector<Foo> output(3);
  ScaleElements(Foo(), Vector<Foo>(*input), *output);
  for (int i = 0; i < 3; ++i) EXPECT_EQ((*output)[i].value, 777.0);
}
} // namespace
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
// Contains logic for activation functions and more-complex elementwise
// vectorized operations.
//
// Uses operator overloading to express computation that looks like regular
// code. Currently, overloaded operators are scoped away in an "internal"
// namespace so they won't be accidentally used.
#ifndef DRAGNN_RUNTIME_MATH_AVX_ACTIVATION_FUNCTIONS_H_
#define DRAGNN_RUNTIME_MATH_AVX_ACTIVATION_FUNCTIONS_H_
#if defined(__AVX2__)
#include <immintrin.h>
#endif
#include "dragnn/runtime/math/avx_vector_array.h"
#define DRAGNN_AVXAF_ATTRIBUTE_ALWAYS_INLINE __attribute__((always_inline))
#ifdef __clang__
#define DRAGNN_AVXAF_GCC_UNROLL
#else
#define DRAGNN_AVXAF_GCC_UNROLL __attribute__((optimize("unroll-loops")))
#endif
namespace syntaxnet {
namespace dragnn {
namespace runtime {
// Public API. Each function operates element-wise on one AVX vector of
// floats; definitions follow below.
namespace activations {
// Calculates elementwise exp(x).
inline AvxFloatVec DRAGNN_AVXAF_ATTRIBUTE_ALWAYS_INLINE DRAGNN_AVXAF_GCC_UNROLL
Exponential(AvxFloatVec x);
// Calculates elementwise sigmoid(x) = 1/(1+exp(-x)).
inline AvxFloatVec DRAGNN_AVXAF_ATTRIBUTE_ALWAYS_INLINE Sigmoid(AvxFloatVec x);
// Calculates elementwise tanh(x).
inline AvxFloatVec DRAGNN_AVXAF_ATTRIBUTE_ALWAYS_INLINE Tanh(AvxFloatVec x);
}  // namespace activations
namespace activations {
// Calculates e^x by representing x = m * ln(2) + r. It does a polynomial
// expansion of e^r, and then multiplies in e^(m * ln(2)) = 2^m.
//
inline AvxFloatVec Exponential(AvxFloatVec x) {
  // EDSL-like helpers for writing vectorized code.
  auto Const = AvxFloatVec::Const;
  // Bounds beyond which single-precision exp() saturates; inputs are clamped
  // to this range below.
  constexpr float explo = -88.3762626647949f;
  constexpr float exphi = 88.3762626647950f;
  // Polynomial coefficients (from the Cephes math library, per the naming) for
  // the expansion of exp(r) on the reduced range.
  const float cephes_exp_factors[] = {
      1.9875691500e-4f, 1.3981999507e-3f, 8.3334519073e-3f,
      4.1665795894e-2f, 1.6666665459e-1f, 5.0000001201e-1f,
  };
  // Clamp the input. i.e. assume exp(-88) is close to zero and exp(88) is
  // close to infinity.
  x.Clamp(explo, exphi);
  // Calculate `m = floor(x/ln(2) + 0.5)`.
  constexpr float inv_log2e = 1.44269504088896341f;
  AvxFloatVec m = Const(0.5f);
  m += Const(inv_log2e) * x;
  m.Floor();
  // Calculate `r = x - m*ln(2)` (see function-level comment). The subtraction
  // is phrased as adding m * (-ln(2)) so it can compile to an FMA.
  constexpr float neg_ln2 = -0.6931471805599453f;
  AvxFloatVec r = x;
  r += m * Const(neg_ln2);
  // Calculate a polynomial expansion of y = exp(r), via Horner's method.
  AvxFloatVec r_squared(r * r);
  AvxFloatVec y = Const(cephes_exp_factors[0]);
  for (int i = 1; i < 6; ++i) {
    y = y * r + Const(cephes_exp_factors[i]);
  }
  y = y * r_squared + r;
  y += Const(1.0f);
  // Calculate `emm0 = 2^m`. This is done by converting emm0 into an integer,
  // and shifting it into the exponent bits of the desired floating-point
  // result. Recall that the exponent is unsigned with 127 representing 2^0.
  AvxFloatVec emm0 = m;
  emm0 += Const(127.0f);
  AvxIntVec emm0_i(emm0);
  emm0_i.LeftShift(23);  // 23 = number of mantissa bits in an IEEE-754 float
  // The final result is `2^m * exp(r)`.
  return AvxFloatVec(emm0_i.ReinterpretCastFloat() * y);
}
// Computes tanh(x) as a rational approximation: a degree-13 odd polynomial
// numerator divided by a degree-6 even polynomial denominator.
inline AvxFloatVec Tanh(AvxFloatVec x) {
  // EDSL-like helpers for writing vectorized code.
  auto Const = AvxFloatVec::Const;
  // Coefficients of the numerator polynomial (evaluated in x^2, then scaled
  // by x below), highest order first.
  const float numerator_coefficients[] = {
      -2.76076847742355e-16f, 2.00018790482477e-13f, -8.60467152213735e-11f,
      5.12229709037114e-08f, 1.48572235717979e-05f, 6.37261928875436e-04f,
      4.89352455891786e-03f,
  };
  // Coefficients of the denominator polynomial (evaluated in x^2), highest
  // order first.
  const float denominator_coefficients[] = {
      1.19825839466702e-06f,
      1.18534705686654e-04f,
      2.26843463243900e-03f,
      4.89352518554385e-03f,
  };
  // Clamp the inputs to the range [-9, 9] since anything outside this range
  // is +/-1.0 in single-precision.
  x.Clamp(-9.0f, 9.0f);
  // Compute x^2.
  AvxFloatVec x_squared(x * x);
  // Compute the numerator polynomial via Horner's method.
  AvxFloatVec p = Const(numerator_coefficients[0]);
  for (int i = 1; i < 7; ++i) {
    // p = p * x^2 + numerator_coefficients[i]
    p = p * x_squared + Const(numerator_coefficients[i]);
  }
  // p = p * x
  p = AvxFloatVec(p * x);
  // Compute the denominator polynomial via Horner's method.
  AvxFloatVec q = Const(denominator_coefficients[0]);
  for (int i = 1; i < 4; ++i) {
    // q = q * x^2 + denominator_coefficients[i]
    q = q * x_squared + Const(denominator_coefficients[i]);
  }
  // Divide the numerator by the denominator.
  return p / q;
}
// Computes sigmoid(x) via the identity sigmoid(x) = 0.5 * tanh(x/2) + 0.5,
// reusing the Tanh() approximation above.
inline AvxFloatVec Sigmoid(AvxFloatVec x) {
  const AvxFloatVec half = AvxFloatVec::Const(0.5);
  AvxFloatVec half_x(half * x);
  return half * Tanh(half_x) + half;
}
} // namespace activations
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
#undef DRAGNN_AVXAF_ATTRIBUTE_ALWAYS_INLINE
#undef DRAGNN_AVXAF_GCC_UNROLL
#endif // DRAGNN_RUNTIME_MATH_AVX_ACTIVATION_FUNCTIONS_H_
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/runtime/math/avx_activation_functions.h"
#include <cmath>
#include <chrono>
#include "dragnn/runtime/test/helpers.h"
#include "syntaxnet/base.h"
#include "tensorflow/core/platform/test.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
namespace {
// Verifies Exponential() by inverting each output with log() and comparing
// against the original input.
TEST(AvxActivationFunctionsTest, ExponentialTest) {
  const auto apply_exponential = [](AvxFloatVec *vec) {
    *vec = activations::Exponential(*vec);
  };
  const auto check_element = [](float input_value, float actual) {
    const float inverted = log(actual);
    EXPECT_NEAR(input_value, inverted, 1e-6)
        << "exp(" << input_value << ") = " << actual
        << ", log(actual) = " << inverted;
  };
  AvxVectorFuzzTest(apply_exponential, check_element);
}
// Verifies Sigmoid() against a scalar 1/(1+exp(-x)) reference.
TEST(AvxActivationFunctionsTest, SigmoidTest) {
  const auto apply_sigmoid = [](AvxFloatVec *vec) {
    *vec = activations::Sigmoid(*vec);
  };
  const auto check_element = [](float input_value, float actual) {
    const float expected = 1.0f / (1.0f + exp(-input_value));
    EXPECT_NEAR(actual, expected, 1e-6)
        << "sigmoid(" << input_value << ") = " << actual
        << ", expected = " << expected;
  };
  AvxVectorFuzzTest(apply_sigmoid, check_element);
}
template <int batch_size, class Function>
void RunPerformanceTest(Function activation, int flops) {
constexpr uint64 kIterations = 1000000;
UniqueVector<float> input(batch_size);
UniqueVector<float> output(batch_size);
InitRandomVector(*input);
InitRandomVector(*output);
AvxFloatVecArray<batch_size / kAvxWidth> array;
auto start_time = std::chrono::system_clock::now();
for (int i = 0; i < kIterations; ++i) {
array.Load(input->data());
array.Apply(activation);
array.Store(output->data());
}
auto end_time = std::chrono::system_clock::now();
std::chrono::duration<double> elapsed_seconds = end_time - start_time;
double elapsed = elapsed_seconds.count();
double exp_ops = kIterations * batch_size;
double macro_gops = exp_ops / 1e9 / elapsed;
VLOG(0) << "For batch_size " << batch_size
<< " macro-GOPS (giga-ops per sec): " << macro_gops
<< ", raw arithmetic: " << flops * macro_gops;
}
// Logs Sigmoid() throughput for a range of batch sizes; 26 is the estimated
// number of arithmetic ops per element. This "test" only prints numbers and
// never fails.
TEST(AvxActivationFunctionsTest, SigmoidPerformanceTest) {
  RunPerformanceTest<8>(activations::Sigmoid, 26);
  RunPerformanceTest<16>(activations::Sigmoid, 26);
  RunPerformanceTest<32>(activations::Sigmoid, 26);
  RunPerformanceTest<48>(activations::Sigmoid, 26);
  RunPerformanceTest<64>(activations::Sigmoid, 26);
  RunPerformanceTest<128>(activations::Sigmoid, 26);
}
// Verifies Tanh() against the scalar tanh() reference.
TEST(AvxActivationFunctionsTest, TanhTest) {
  const auto apply_tanh = [](AvxFloatVec *vec) {
    *vec = activations::Tanh(*vec);
  };
  const auto check_element = [](float input_value, float actual) {
    const float expected = tanh(input_value);
    EXPECT_NEAR(actual, expected, 1e-6)
        << "tanh(" << input_value << ") = " << actual
        << ", expected = " << expected;
  };
  AvxVectorFuzzTest(apply_tanh, check_element);
}
// Logs Tanh() throughput for a range of batch sizes; 23 is the estimated
// number of arithmetic ops per element. This "test" only prints numbers and
// never fails.
//
// Fixed: the batch-size-8 and batch-size-16 cases previously benchmarked
// Sigmoid instead of Tanh, so their results were mislabeled.
TEST(AvxActivationFunctionsTest, TanhPerformanceTest) {
  RunPerformanceTest<8>(activations::Tanh, 23);
  RunPerformanceTest<16>(activations::Tanh, 23);
  RunPerformanceTest<32>(activations::Tanh, 23);
  RunPerformanceTest<48>(activations::Tanh, 23);
  RunPerformanceTest<64>(activations::Tanh, 23);
  RunPerformanceTest<128>(activations::Tanh, 23);
}
} // namespace
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
// Wraps AVX vectors into convenient helper classes. This contains a class
// wrapping a single AVX register, AvxFloatVec, and a class to manipulate a
// batch of registers, AvxFloatVecArray. Use of the latter is recommended where
// applicable, since it will be unrolled into more vectorizable code.
#ifndef DRAGNN_RUNTIME_MATH_AVX_VECTOR_ARRAY_H_
#define DRAGNN_RUNTIME_MATH_AVX_VECTOR_ARRAY_H_
#include <cmath>
#if defined(__AVX__)
#include <immintrin.h>
#elif defined(__SSE4_2__)
#include <nmmintrin.h>
#endif
#include "dragnn/runtime/math/float16_types.h"
#define DRAGNN_AVXVA_ALWAYS_INLINE inline __attribute__((always_inline))
#ifdef __clang__
// Clang doesn't support __attribute__((optimize(...))).
#define DRAGNN_AVXVA_INLINED_UNROLLED inline __attribute__((always_inline))
#else
// Assume we're using GCC, which does.
#define DRAGNN_AVXVA_INLINED_UNROLLED \
inline __attribute__((always_inline)) \
__attribute__((optimize("unroll-loops")))
#endif
namespace syntaxnet {
namespace dragnn {
namespace runtime {
// Number of single-precision floating point numbers that fit into a single SSE
// / AVX2 register (which are 128 and 256 bits respectively).
constexpr int kSseWidth = 128 / 32;  // = 4
constexpr int kAvxWidth = 256 / 32;  // = 8
// The same, for half-precision (16-bit) values.
constexpr int kSseWidthHalfPrecision = 128 / 16;  // = 8
constexpr int kAvxWidthHalfPrecision = 256 / 16;  // = 16
class AvxFloatVec;
namespace internal {
// This struct should always be eliminated by the compiler; it only exists so we
// can write `foo += bar * baz`, and have that compiled into a single FMA
// operation.
struct AvxMultiplyExpr {
  // The two factors. These are references to the operands of operator*, so
  // this type is only valid within the enclosing full expression.
  const AvxFloatVec &a;
  const AvxFloatVec &b;
};
} // namespace internal
// Allows EDSL-like programming with AVX vectors.
// Multiplication yields a lazy AvxMultiplyExpr; combining it with addition
// lets the implementation emit a fused multiply-add.
inline internal::AvxMultiplyExpr operator*(const AvxFloatVec &a,
                                           const AvxFloatVec &b);
inline AvxFloatVec operator+(const internal::AvxMultiplyExpr &expr,
                             const AvxFloatVec &v);
// Plain element-wise addition, division, and subtraction.
inline AvxFloatVec operator+(const AvxFloatVec &a, const AvxFloatVec &b);
inline AvxFloatVec operator/(const AvxFloatVec &a, const AvxFloatVec &b);
inline AvxFloatVec operator-(const AvxFloatVec &a, const AvxFloatVec &b);
// API over a single AVX vector (register). The implementation will either use
// a real AVX vector, or a fixed array of floats for compatibility.
//
// Note that we include the "inline" directive in declarations, not just
// definitions, because it is necessary for the "always_inline" directive.
struct AvxFloatVec {
 public:
  // Leaves the vector uninitialized.
  AvxFloatVec() {}
  // Evaluates an AvxMultiplyExpr intermediary without adding anything. This is
  // not an implicit cast, because typically when we write `a * b` we want to
  // add it to something and use an FMA operation.
  explicit AvxFloatVec(const internal::AvxMultiplyExpr &expr);
  // Loads from an aligned region of memory.
  inline void Load(const float *source);
  // Loads a constant value into every element.
  inline void LoadConstVector(const float val);
  // Stores to an aligned region of memory.
  inline void Store(float *dst) const;
  // Adds `a * b` to this value, using a fused multiply-add operation.
  inline void AddProductOf(const AvxFloatVec &a, const AvxFloatVec &b);
  // Element-wise floor.
  inline void Floor();
  // Element-wise clamps values between a min and max value.
  inline void Clamp(const float min_value, const float max_value);
  // Convenience method for more complex calculations: returns a vector with
  // |value| broadcast into every element.
  static DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec Const(const float value) {
    AvxFloatVec result;
    result.LoadConstVector(value);
    return result;
  }
  // Syntactic sugar for computing an FMA operation.
  inline AvxFloatVec &operator+=(const internal::AvxMultiplyExpr &to_add);
  // Adds another vector element-wise.
  inline AvxFloatVec &operator+=(const AvxFloatVec &vec);
  // Subtracts another vector element-wise.
  inline AvxFloatVec &operator-=(const AvxFloatVec &vec);
  // Divides by another vector element-wise.
  inline AvxFloatVec &operator/=(const AvxFloatVec &vec);
  // Underlying storage: one AVX register, two SSE registers, or a plain float
  // array for compatibility builds.
#if defined(__AVX__)
  __m256 ymm;
#elif defined(__SSE4_2__)
  __m128 xmm[2];
#else
  float ymm[8];
#endif
};
// Small wrapper around integer AVX vectors, exposing only methods we need for
// implementing the activation functions.
//
// As above, `inline` is specified here for the always_inline directive.
class AvxIntVec {
 public:
  // Constructs an AVX integer vector, by converting (truncating) the
  // floating-point values of |v|.
  inline explicit AvxIntVec(const AvxFloatVec &v);
  // Left-shifts integer values.
  inline void LeftShift(int bits);
  // Reinterprets the register as a floating-point register, for bitwise tricks.
  inline AvxFloatVec ReinterpretCastFloat();

 private:
  // Underlying register (or plain int array for compatibility builds).
#if defined(__AVX__)
  __m256i ymm_;
#elif defined(__SSE4_2__)
  __m128i xmm_[2];
#else
  int ymm_[8];
#endif
};
// Implements the index permutation that is effectively applied by the
// _mm256_unpack instructions; equivalent to swapping the 3rd and 4th bits of
// the index. See the PermutationFunctionIsEqualToTable test for the effective
// permutation that this encodes.
//
// We haven't done performance testing, but hopefully this is sufficiently fast
// for the compatibility routine. Hopefully in its use below, the compiler will
// determine it is being called with a constant (post-unrolling) and inline it.
DRAGNN_AVXVA_ALWAYS_INLINE int FastUnpackPermutation(int original_idx) {
  // Adding 0b0100 carries into bit 3 exactly when bit 2 is set, so |swap_bit|
  // has bit 3 (0b1000) set iff exactly one of bits 2 and 3 of |original_idx|
  // is set — i.e. iff the two bits should be swapped.
  const int swap_bit = (original_idx + 4) & 8;
  // Expand into the mask 0b1100 (or zero). XOR-ing it in flips both bits,
  // turning the pattern 0b0100 into 0b1000 and vice versa.
  const int swap_mask = swap_bit | (swap_bit >> 1);
  return original_idx ^ swap_mask;
}
// API over an array of AVX vectors (registers). The methods on this class are
// annotated such that the compiler should unroll them.
template <int N>
struct AvxFloatVecArray {
 public:
  // Loads N consecutive AVX vectors (8 floats each) from aligned |source|.
  DRAGNN_AVXVA_INLINED_UNROLLED void Load(const float *source) {
    for (int i = 0; i < N; i++) {
      vectors[i].Load(source + 8 * i);
    }
  }
  // Like Load() above, but only reads the first |max_idx| vectors; the rest
  // are left untouched (or zeroed under a memory sanitizer; see below).
  DRAGNN_AVXVA_INLINED_UNROLLED void Load(const float *source, int max_idx) {
    for (int i = 0; i < N; i++) {
      if (i < max_idx) {
        vectors[i].Load(source + 8 * i);
      } else {
        // When testing with a memory sanitizer, we make sure not to read
        // uninitialized values. This is usually safe in normal operation
        // because such results are never stored (via corresponding
        // store-masking logic), but of course each algorithm must be tested to
        // ensure correct operation.
        //
        // It is also worth pointing out that exceptional values (NaN, etc.) can
        // slow down AVX/FMA floating point operations considerably. So we
        // should investigate whether this is worth enabling in all cases (and
        // forcing algorithms to provide a default).
#if defined(MEMORY_SANITIZER)
        vectors[i].LoadConstVector(0);
#endif
      }
    }
  }
  // Reads and unpacks truncated half-precision values.
  //
  // Currently, only matrix coefficients use compressed/half-precision values,
  // so it's not yet necessary to support max_idx masking (which will get a bit
  // more complicated).
  DRAGNN_AVXVA_INLINED_UNROLLED void Load(const TruncatedFloat16 *source);
#if defined(__F16C__)
  // Reads and unpacks IEEE-754 half-precision values.
  //
  // Currently, only matrix coefficients use compressed/half-precision values,
  // so it's not yet necessary to support max_idx masking (which will get a bit
  // more complicated).
  //
  // TODO(googleuser): Either add non-F16C compatibility support from Eigen,
  // or delete this code if it turns out not to be helpful.
  DRAGNN_AVXVA_INLINED_UNROLLED void Load(const IeeeFloat16 *source);
#endif
  // Broadcasts |val| into every element of every vector.
  DRAGNN_AVXVA_INLINED_UNROLLED void LoadConstVector(const float val) {
    for (int i = 0; i < N; i++) {
      vectors[i].LoadConstVector(val);
    }
  }
  // Stores all N vectors to aligned |dst|.
  DRAGNN_AVXVA_INLINED_UNROLLED void Store(float *dst) {
    for (int i = 0; i < N; i++) {
      vectors[i].Store(dst + 8 * i);
    }
  }
  // Like Store() above, but only writes the first |max_idx| vectors.
  DRAGNN_AVXVA_INLINED_UNROLLED void Store(float *dst, int max_idx) {
    for (int i = 0; i < N; i++) {
      // This is equivalent to writing `i < N && i < max_idx` above, but forces
      // the compiler to produce more efficient code (it's still creating jump
      // instructions, but the branching is probably more predictable, and the
      // loops are unrolled). In the future we could switch to VMASKMOV if
      // necessary.
      if (i < max_idx) {
        vectors[i].Store(dst + 8 * i);
      }
    }
  }
  // Applies |fcn| (an AvxFloatVec -> AvxFloatVec callable) to each vector.
  template <class Function>
  DRAGNN_AVXVA_INLINED_UNROLLED void Apply(const Function &fcn) {
    for (int i = 0; i < N; i++) {
      vectors[i] = fcn(vectors[i]);
    }
  }
  // The underlying vectors.
  AvxFloatVec vectors[N];
};
// Implementation details.
#if defined(__AVX__)
// AVX implementations of the AvxFloatVec methods declared above.
DRAGNN_AVXVA_ALWAYS_INLINE
AvxFloatVec::AvxFloatVec(const internal::AvxMultiplyExpr &expr) {
  // Evaluate the deferred multiplication.
  ymm = _mm256_mul_ps(expr.a.ymm, expr.b.ymm);
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Load(const float *source) {
  // Aligned load: |source| must be 32-byte aligned.
  ymm = _mm256_load_ps(source);
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::LoadConstVector(const float val) {
  ymm = _mm256_set1_ps(val);
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Store(float *dst) const {
  // Aligned store: |dst| must be 32-byte aligned.
  _mm256_store_ps(dst, ymm);
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::AddProductOf(
    const AvxFloatVec &a, const AvxFloatVec &b) {
#if defined(__AVX2__) && defined(__FMA__)
  // Single fused multiply-add instruction.
  ymm = _mm256_fmadd_ps(a.ymm, b.ymm, ymm);
#else
  // Compatibility fallback: separate multiply then add.
  *this += AvxFloatVec(a * b);
#endif
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Floor() {
ymm = _mm256_floor_ps(ymm);
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Clamp(const float min_value,
const float max_value) {
ymm = _mm256_min_ps(ymm, Const(max_value).ymm);
ymm = _mm256_max_ps(ymm, Const(min_value).ymm);
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator+=(
const AvxFloatVec &vec) {
ymm = _mm256_add_ps(vec.ymm, ymm);
return *this;
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator-=(
const AvxFloatVec &vec) {
ymm = _mm256_sub_ps(ymm, vec.ymm);
return *this;
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator/=(
const AvxFloatVec &vec) {
ymm = _mm256_div_ps(ymm, vec.ymm);
return *this;
}
// Converts each float lane to a 32-bit int by truncation toward zero.
DRAGNN_AVXVA_ALWAYS_INLINE AvxIntVec::AvxIntVec(const AvxFloatVec &v)
    : ymm_(_mm256_cvttps_epi32(v.ymm)) {}
// Shifts each 32-bit lane left by |bits|.
DRAGNN_AVXVA_ALWAYS_INLINE void AvxIntVec::LeftShift(int bits) {
#if defined(__AVX2__)
  ymm_ = _mm256_slli_epi32(ymm_, bits);
#else
  // Convert to SSE and back again. This is pretty slow, so don't use this code
  // except for compatibility purposes. (AVX1 has no 256-bit integer shift, so
  // shift each 128-bit half separately and re-combine.)
  __m256i upper_bits = _mm256_permute2f128_si256(ymm_, ymm_, 1);
  __m128i first = _mm256_castsi256_si128(ymm_);         // Lower bits as SSE
  __m128i second = _mm256_castsi256_si128(upper_bits);  // Upper bits as SSE
  first = _mm_slli_epi32(first, bits);
  second = _mm_slli_epi32(second, bits);
  ymm_ = _mm256_permute2f128_si256(_mm256_castsi128_si256(first),
                                   _mm256_castsi128_si256(second), (2 << 4));
#endif
}
AvxFloatVec DRAGNN_AVXVA_ALWAYS_INLINE AvxIntVec::ReinterpretCastFloat() {
AvxFloatVec result;
result.ymm = _mm256_castsi256_ps(ymm_);
return result;
}
// Unpacks 16-bit truncated floats (bfloat16) into the float vectors. Each
// 16-bit value is the high half of a 32-bit float with the low significand
// bits zeroed, so unpacking is just interleaving zeros below each value.
// NOTE: The in-lane unpack instructions produce a permuted element order; the
// packing side must store elements in the matching FastUnpackPermutation()
// order (see the compatibility implementation and tests).
template <int N>
DRAGNN_AVXVA_INLINED_UNROLLED void AvxFloatVecArray<N>::Load(
    const TruncatedFloat16 *source) {
  static_assert(N % 2 == 0,
                "Load() from half floats requires even-sized vector arrays.");
  for (int i = 0; i < N / 2; i++) {
#if defined(__AVX2__)
    // One 256-bit load supplies 16 half-width values, i.e. two full vectors.
    const __m256i input = _mm256_load_si256(
        reinterpret_cast<__m256i const *>(source + kAvxWidthHalfPrecision * i));
    vectors[2 * i].ymm = _mm256_castsi256_ps(
        _mm256_unpacklo_epi16(_mm256_setzero_si256(), input));
    vectors[2 * i + 1].ymm = _mm256_castsi256_ps(
        _mm256_unpackhi_epi16(_mm256_setzero_si256(), input));
#else
    // Compatibility AVX (not AVX2) implementation.
    __m128i input[2];
    input[0] = _mm_load_si128(
        reinterpret_cast<__m128i const *>(source + kAvxWidthHalfPrecision * i));
    input[1] = _mm_load_si128(reinterpret_cast<__m128i const *>(
        source + kAvxWidthHalfPrecision * i + kSseWidthHalfPrecision));
    // Unpack. This permutation is kinda cryptic and, to be honest, derived by
    // simply trying many combinations.
    vectors[2 * i].ymm = _mm256_insertf128_ps(
        _mm256_castps128_ps256(_mm_castsi128_ps(
            _mm_unpacklo_epi16(_mm_setzero_si128(), input[0]))),
        _mm_castsi128_ps(_mm_unpacklo_epi16(_mm_setzero_si128(), input[1])), 1);
    vectors[2 * i + 1].ymm = _mm256_insertf128_ps(
        _mm256_castps128_ps256(_mm_castsi128_ps(
            _mm_unpackhi_epi16(_mm_setzero_si128(), input[0]))),
        _mm_castsi128_ps(_mm_unpackhi_epi16(_mm_setzero_si128(), input[1])), 1);
#endif
  }
}
#if defined(__F16C__)
// Unpacks IEEE-754 half-precision floats via the F16C conversion instruction
// (_mm256_cvtph_ps). Unlike the TruncatedFloat16 overload, elements come out
// in memory order (see the LoadIeeeFloat16 test).
template <int N>
DRAGNN_AVXVA_INLINED_UNROLLED void AvxFloatVecArray<N>::Load(
    const IeeeFloat16 *source) {
  static_assert(N % 2 == 0,
                "Load() from half floats requires even-sized vector arrays.");
  for (int i = 0; i < N / 2; i++) {
    // TODO(googleuser): Experiment with doing a single AVX2 load and
    // dividing the result.
    __m128i first_half = _mm_load_si128(
        reinterpret_cast<__m128i const *>(source + kAvxWidthHalfPrecision * i));
    __m128i second_half = _mm_load_si128(reinterpret_cast<__m128i const *>(
        source + kAvxWidthHalfPrecision * i + kAvxWidth));
    vectors[2 * i].ymm = _mm256_cvtph_ps(first_half);
    vectors[2 * i + 1].ymm = _mm256_cvtph_ps(second_half);
  }
}
#endif
#elif defined(__SSE4_2__)
// SSE 4.2 implementations: each logical 8-float vector is emulated with a
// pair of 4-float __m128 registers; xmm[0] holds elements 0-3 and xmm[1]
// holds elements 4-7.
DRAGNN_AVXVA_ALWAYS_INLINE
AvxFloatVec::AvxFloatVec(const internal::AvxMultiplyExpr &expr) {
  for (int i = 0; i < 2; ++i) {
    xmm[i] = _mm_mul_ps(expr.a.xmm[i], expr.b.xmm[i]);
  }
}
// Loads 8 floats from aligned memory (two aligned 128-bit loads).
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Load(const float *source) {
  for (int i = 0; i < 2; ++i) {
    xmm[i] = _mm_load_ps(&source[i * kSseWidth]);
  }
}
// Broadcasts |val| into all 8 elements.
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::LoadConstVector(const float val) {
  xmm[1] = xmm[0] = _mm_set1_ps(val);
}
// Stores 8 floats to aligned memory (two aligned 128-bit stores).
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Store(float *dst) const {
  for (int i = 0; i < 2; ++i) {
    _mm_store_ps(&dst[i * kSseWidth], xmm[i]);
  }
}
// Accumulates a * b; there is no FMA below AVX2, so multiply then add.
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::AddProductOf(
    const AvxFloatVec &a, const AvxFloatVec &b) {
  *this += AvxFloatVec(a * b);
}
// Rounds each element down to the nearest integer-valued float.
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Floor() {
  for (int i = 0; i < 2; ++i) {
    xmm[i] = _mm_floor_ps(xmm[i]);
  }
}
// Clamps each element into [min_value, max_value].
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Clamp(const float min_value,
                                                   const float max_value) {
  for (int i = 0; i < 2; ++i) {
    xmm[i] = _mm_min_ps(xmm[i], Const(max_value).xmm[i]);
    xmm[i] = _mm_max_ps(xmm[i], Const(min_value).xmm[i]);
  }
}
// Element-wise compound assignment operators.
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator+=(
    const AvxFloatVec &vec) {
  for (int i = 0; i < 2; ++i) {
    xmm[i] = _mm_add_ps(vec.xmm[i], xmm[i]);
  }
  return *this;
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator-=(
    const AvxFloatVec &vec) {
  for (int i = 0; i < 2; ++i) {
    xmm[i] = _mm_sub_ps(xmm[i], vec.xmm[i]);
  }
  return *this;
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator/=(
    const AvxFloatVec &vec) {
  for (int i = 0; i < 2; ++i) {
    xmm[i] = _mm_div_ps(xmm[i], vec.xmm[i]);
  }
  return *this;
}
// Converts each float element to a 32-bit int by truncation toward zero.
DRAGNN_AVXVA_ALWAYS_INLINE AvxIntVec::AvxIntVec(const AvxFloatVec &v) {
  xmm_[0] = _mm_cvttps_epi32(v.xmm[0]);
  xmm_[1] = _mm_cvttps_epi32(v.xmm[1]);
}
// Shifts each 32-bit element left by |bits|.
DRAGNN_AVXVA_ALWAYS_INLINE void AvxIntVec::LeftShift(int bits) {
  for (int i = 0; i < 2; ++i) {
    xmm_[i] = _mm_slli_epi32(xmm_[i], bits);
  }
}
AvxFloatVec DRAGNN_AVXVA_ALWAYS_INLINE AvxIntVec::ReinterpretCastFloat() {
AvxFloatVec result;
for (int i = 0; i < 2; ++i) {
result.xmm[i] = _mm_castsi128_ps(xmm_[i]);
}
return result;
}
// Unpacks bfloat16 values with SSE: interleaving zeros below each 16-bit
// value turns it into the float whose high half is that value. The unpack
// instructions produce the same permuted element order as the AVX version
// (see FastUnpackPermutation()).
template <int N>
DRAGNN_AVXVA_INLINED_UNROLLED void AvxFloatVecArray<N>::Load(
    const TruncatedFloat16 *source) {
  static_assert(N % 2 == 0,
                "Load() from half floats requires even-sized vector arrays.");
  for (int i = 0; i < N / 2; i++) {
    __m128i input[2];
    input[0] = _mm_load_si128(
        reinterpret_cast<__m128i const *>(source + kAvxWidthHalfPrecision * i));
    input[1] = _mm_load_si128(reinterpret_cast<__m128i const *>(
        source + kAvxWidthHalfPrecision * i + kSseWidthHalfPrecision));
    vectors[2 * i].xmm[0] =
        _mm_castsi128_ps(_mm_unpacklo_epi16(_mm_setzero_si128(), input[0]));
    vectors[2 * i + 1].xmm[0] =
        _mm_castsi128_ps(_mm_unpackhi_epi16(_mm_setzero_si128(), input[0]));
    vectors[2 * i].xmm[1] =
        _mm_castsi128_ps(_mm_unpacklo_epi16(_mm_setzero_si128(), input[1]));
    vectors[2 * i + 1].xmm[1] =
        _mm_castsi128_ps(_mm_unpackhi_epi16(_mm_setzero_si128(), input[1]));
  }
}
#if defined(__F16C__)
// Unpacks IEEE-754 half floats with the 128-bit F16C conversion: each 128-bit
// load holds 8 half floats, but _mm_cvtph_ps only converts the low 64 bits,
// so a shuffle moves the upper half down for a second conversion.
template <int N>
DRAGNN_AVXVA_INLINED_UNROLLED void AvxFloatVecArray<N>::Load(
    const IeeeFloat16 *source) {
  static_assert(N % 2 == 0,
                "Load() from half floats requires even-sized vector arrays.");
  for (int i = 0; i < N / 2; i++) {
    __m128i first_half = _mm_load_si128(
        reinterpret_cast<__m128i const *>(source + kAvxWidthHalfPrecision * i));
    __m128i second_half = _mm_load_si128(reinterpret_cast<__m128i const *>(
        source + kAvxWidthHalfPrecision * i + kAvxWidth));
    vectors[2 * i].xmm[0] = _mm_cvtph_ps(first_half);
    vectors[2 * i + 1].xmm[0] = _mm_cvtph_ps(second_half);
    // Move the upper 64 bits down so the next conversion sees them.
    first_half = _mm_shuffle_epi32(first_half, _MM_SHUFFLE(0, 1, 3, 2));
    second_half = _mm_shuffle_epi32(second_half, _MM_SHUFFLE(0, 1, 3, 2));
    vectors[2 * i].xmm[1] = _mm_cvtph_ps(first_half);
    vectors[2 * i + 1].xmm[1] = _mm_cvtph_ps(second_half);
  }
}
#endif
#else
// Compatibility implementations. If you compile with -ftree-vectorize and
// -msse2 flags, you should still get decent performance (maybe 1/4 of the
// AVX/FMA version).
//
// See the class above for method documentation.
// Evaluates a lazy multiplication expression: this = expr.a * expr.b.
DRAGNN_AVXVA_ALWAYS_INLINE
AvxFloatVec::AvxFloatVec(const internal::AvxMultiplyExpr &expr) {
  for (int i = 0; i < 8; i++) {
    ymm[i] = expr.a.ymm[i] * expr.b.ymm[i];
  }
}
// Copies 8 floats from |source|.
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Load(const float *source) {
  for (int i = 0; i < 8; i++) {
    ymm[i] = source[i];
  }
}
// Sets all 8 elements to |val|.
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::LoadConstVector(const float val) {
  for (int i = 0; i < 8; i++) {
    ymm[i] = val;
  }
}
// Copies 8 floats to |dst|.
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Store(float *dst) const {
  for (int i = 0; i < 8; i++) {
    dst[i] = ymm[i];
  }
}
// Accumulates a * b element-wise.
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::AddProductOf(
    const AvxFloatVec &a, const AvxFloatVec &b) {
  for (int i = 0; i < 8; i++) {
    ymm[i] += a.ymm[i] * b.ymm[i];
  }
}
// Rounds each element down to the nearest integer-valued float.
// NOTE(review): floor/fmin/fmax below are unqualified; presumably resolved
// from <math.h> via an existing include -- confirm.
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Floor() {
  for (int i = 0; i < 8; i++) {
    ymm[i] = floor(ymm[i]);
  }
}
// Clamps each element into [min_value, max_value].
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Clamp(const float min_value,
                                                   const float max_value) {
  for (int i = 0; i < 8; i++) {
    ymm[i] = fmin(fmax(ymm[i], min_value), max_value);
  }
}
// Element-wise compound assignment operators.
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator+=(
    const AvxFloatVec &vec) {
  for (int i = 0; i < 8; i++) {
    ymm[i] += vec.ymm[i];
  }
  return *this;
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator-=(
    const AvxFloatVec &vec) {
  for (int i = 0; i < 8; i++) {
    ymm[i] -= vec.ymm[i];
  }
  return *this;
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator/=(
    const AvxFloatVec &vec) {
  for (int i = 0; i < 8; i++) {
    ymm[i] /= vec.ymm[i];
  }
  return *this;
}
// Converts each float element to int by truncation toward zero.
DRAGNN_AVXVA_ALWAYS_INLINE AvxIntVec::AvxIntVec(const AvxFloatVec &v) {
  for (int i = 0; i < 8; i++) {
    ymm_[i] = static_cast<int>(v.ymm[i]);
  }
}
// Shifts each element left by |bits|.
DRAGNN_AVXVA_ALWAYS_INLINE void AvxIntVec::LeftShift(int bits) {
  for (int i = 0; i < 8; i++) {
    ymm_[i] = ymm_[i] << bits;
  }
}
// Reinterprets each integer element's bit pattern as a float (a bit cast,
// not a numeric conversion).
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec AvxIntVec::ReinterpretCastFloat() {
  AvxFloatVec result;
  static_assert(sizeof(result.ymm[0]) == sizeof(ymm_[0]),
                "Bit cast requires matching element sizes.");
  for (int i = 0; i < 8; i++) {
    // memcpy-style bit copy instead of `reinterpret_cast<float &>(ymm_[i])`,
    // which reads int storage through a float lvalue and violates strict
    // aliasing. The builtin compiles to the same single move.
    __builtin_memcpy(&result.ymm[i], &ymm_[i], sizeof(result.ymm[i]));
  }
  return result;
}
// Scalar unpack of bfloat16 values. Elements are scattered into the same
// permuted order (FastUnpackPermutation()) that the SIMD unpack instructions
// produce, so all implementations agree on the in-register layout.
template <int N>
DRAGNN_AVXVA_INLINED_UNROLLED void AvxFloatVecArray<N>::Load(
    const TruncatedFloat16 *source) {
  static_assert(N % 2 == 0,
                "Load() from half floats requires even-sized vector arrays.");
  // Iterate through mock AVX vectors, each composed of 16 half-floats.
  for (int vec_idx = 0; vec_idx < N / 2; vec_idx++) {
    // Making this code a bit more verbose, by reading in-order to a temporary
    // array, results in faster performance. The compatibility version is still
    // pretty slow though.
    TruncatedFloat16 tmp[16];
    for (int i = 0; i < kAvxWidthHalfPrecision; ++i) {
      tmp[i] = source[i + kAvxWidthHalfPrecision * vec_idx];
    }
    float unpacked[16];
    for (int i = 0; i < kAvxWidthHalfPrecision; ++i) {
      unpacked[i] = tmp[i].DebugToFloat();
    }
    // Scatter into the permuted element order used by the SIMD unpacks.
    for (int i = 0; i < kAvxWidthHalfPrecision; ++i) {
      int permuted = FastUnpackPermutation(i);
      vectors[2 * vec_idx + (i / 8)].ymm[i % 8] = unpacked[permuted];
    }
  }
}
#if defined(__F16C__)
// Scalar unpack of IEEE-754 half floats, in memory order.
template <int N>
DRAGNN_AVXVA_INLINED_UNROLLED void AvxFloatVecArray<N>::Load(
    const IeeeFloat16 *source) {
  // Not actually required for the compatibility implementation, but it'd be
  // rather non-uniform if this API succeeded, and then compilation failed when
  // AVX2 was turned on.
  static_assert(N % 2 == 0,
                "Load() from half floats requires even-sized vector arrays.");
  // Unpack each of the N * 8 half-floats, in order, into the mock vectors.
  for (int i = 0; i < N * kAvxWidth; ++i) {
    vectors[i / 8].ymm[i % 8] = source[i].DebugToFloat();
  }
}
#endif
#endif
// The following operations are mostly syntax sugar, so they do not need
// architecture-specific implementations.
// Accumulates a lazy product expression: this += to_add.a * to_add.b,
// routed through AddProductOf() so FMA is used where available.
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator+=(
    const internal::AvxMultiplyExpr &to_add) {
  AddProductOf(to_add.a, to_add.b);
  return *this;
}
// Builds a lazy multiplication expression; evaluation is deferred until the
// expression is assigned or accumulated.
DRAGNN_AVXVA_ALWAYS_INLINE internal::AvxMultiplyExpr operator*(
    const AvxFloatVec &a, const AvxFloatVec &b) {
  return internal::AvxMultiplyExpr{a, b};
}
// Binary operators, each expressed as copy-then-compound-assign.
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec
operator+(const internal::AvxMultiplyExpr &expr, const AvxFloatVec &v) {
  AvxFloatVec sum(v);
  sum += expr;
  return sum;
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec operator+(const AvxFloatVec &a,
                                                 const AvxFloatVec &b) {
  AvxFloatVec sum(a);
  sum += b;
  return sum;
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec operator/(const AvxFloatVec &a,
                                                 const AvxFloatVec &b) {
  AvxFloatVec quotient(a);
  quotient /= b;
  return quotient;
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec operator-(const AvxFloatVec &a,
                                                 const AvxFloatVec &b) {
  AvxFloatVec difference(a);
  difference -= b;
  return difference;
}
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
#undef DRAGNN_AVXVA_ALWAYS_INLINE
#undef DRAGNN_AVXVA_INLINED_UNROLLED
#endif // DRAGNN_RUNTIME_MATH_AVX_VECTOR_ARRAY_H_
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/runtime/math/avx_vector_array.h"
#include <cmath>
#include "dragnn/runtime/test/helpers.h"
#include "tensorflow/core/platform/test.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
namespace {
// Loading 8 floats and storing them back should reproduce the input exactly.
TEST(AvxVectorTest, LoadAndStore) {
  UniqueVector<float> src(kAvxWidth);
  UniqueVector<float> dst(kAvxWidth);
  InitRandomVector(*src);
  InitRandomVector(*dst);  // Pre-fill so a no-op Store() can't pass.
  AvxFloatVec vec;
  vec.Load(src->data());
  vec.Store(dst->data());
  for (int i = 0; i < kAvxWidth; ++i) {
    EXPECT_EQ((*src)[i], (*dst)[i]);
  }
}
// Test flooring with assignment, just to make the compiler not erase aliases.
TEST(AvxVectorTest, AssignmentAndFloor) {
  UniqueVector<float> input(kAvxWidth);
  UniqueVector<float> output(kAvxWidth);
  UniqueVector<float> floored(kAvxWidth);
  InitRandomVector(*input);
  InitRandomVector(*output);
  AvxFloatVec vec;
  vec.Load(input->data());
  // Copy before flooring so we can check that the copy is unaffected.
  AvxFloatVec vec2 = vec;
  vec.Floor();
  vec.Store(floored->data());
  vec2.Store(output->data());
  for (int i = 0; i < kAvxWidth; ++i) {
    EXPECT_EQ((*input)[i], (*output)[i]);
    EXPECT_EQ(floor((*input)[i]), (*floored)[i]);
  }
}
// Verifies Clamp() against scalar fmin/fmax, and that the random inputs
// actually exercised clamping (some value fell outside the range).
TEST(AvxVectorTest, ClampTest) {
  bool modified = false;  // check that some value was clamped.
  AvxVectorFuzzTest(
      [](AvxFloatVec *vec) { vec->Clamp(-0.314f, 0.314f); },
      [&modified](float input_value, float output_value) {
        modified = modified || input_value < -0.314 || input_value > 0.314;
        EXPECT_EQ(fmax(-0.314f, fmin(0.314f, input_value)), output_value);
      });
  EXPECT_TRUE(modified) << "No values fell outside test range for ClampTest().";
}
// LoadConstVector() should broadcast the scalar into all 8 lanes.
TEST(AvxVectorTest, LoadConstAndStore) {
  UniqueVector<float> stored(kAvxWidth);
  InitRandomVector(*stored);  // Pre-fill so a no-op Store() can't pass.
  AvxFloatVec vec;
  vec.LoadConstVector(3.14f);
  vec.Store(stored->data());
  for (int i = 0; i < kAvxWidth; ++i) {
    EXPECT_EQ((*stored)[i], 3.14f);
  }
}
// Verifies element-wise addition via self-addition (doubling each lane).
TEST(AvxVectorTest, AddTest) {
  AvxVectorFuzzTest(  //
      [](AvxFloatVec *vec) { (*vec) += *vec; },
      [](float input_value, float output_value) {
        EXPECT_EQ(input_value * 2, output_value);
      });
}
// Verifies element-wise subtraction by subtracting a constant vector of 1s.
TEST(AvxVectorTest, SubtractTest) {
  AvxVectorFuzzTest(
      [](AvxFloatVec *vec) {
        AvxFloatVec one;
        one.LoadConstVector(1.0f);
        (*vec) -= one;
      },
      [](float input_value, float output_value) {
        EXPECT_EQ(input_value - 1.0f, output_value);
      });
}
// Verifies element-wise division by computing reciprocals (1 / input).
TEST(AvxVectorTest, DivideTest) {
  AvxVectorFuzzTest(
      [](AvxFloatVec *vec) {
        AvxFloatVec result;
        result.LoadConstVector(1.0f);
        result /= *vec;
        *vec = result;
      },
      [](float input_value, float output_value) {
        EXPECT_EQ(1.0f / input_value, output_value);
      });
}
// Basic round-trip test for AvxFloatVecArray<1>. Half of the purpose is to
// ensure that the float API still compiles for odd-sized arrays; adding a
// call to array.Load(TruncatedFloat16 *source) here should produce a compiler
// error (those loads require even N).
TEST(AvxFloatVecArrayTest, SingletonArrayLoadsAndStores) {
  AvxFloatVecArray<1> array;
  UniqueVector<float> src(kAvxWidth);
  UniqueVector<float> dst(kAvxWidth);
  InitRandomVector(*src);
  InitRandomVector(*dst);  // Pre-fill so a no-op Store() can't pass.
  array.Load(src->data());
  array.Store(dst->data());
  for (int i = 0; i < kAvxWidth; ++i) {
    EXPECT_EQ((*src)[i], (*dst)[i]);
  }
}
// Verifies the bfloat16 load: values are written pre-permuted with
// FastUnpackPermutation() so that, after the permuted SIMD unpack, the
// decompressed floats come out in natural order.
TEST(AvxFloatVecArrayTest, LoadTruncatedFloat16) {
  AvxFloatVecArray<2> array;
  UniqueVector<TruncatedFloat16> values(2 * kAvxWidth);
  UniqueVector<float> decompressed(2 * kAvxWidth);
  for (int i = 0; i < 2 * kAvxWidth; ++i) {
    int permuted = FastUnpackPermutation(i);
    (*values)[i] = TruncatedFloat16::DebugFromFloat(permuted / 10.0);
  }
  // Ensure that state persisted from other tests won't cause this test to
  // erroneously pass.
  array.LoadConstVector(-1.0f);
  array.Load(values->data());
  array.Store(decompressed->data());
  for (int i = 0; i < 2 * kAvxWidth; ++i) {
    ASSERT_NEAR((*decompressed)[i], i / 10.0, 0.01);
  }
}
// Verifies the IEEE half-precision load, which unpacks in memory order.
// Logs and skips when the binary was not compiled with F16C support.
TEST(AvxFloatVecArrayTest, LoadIeeeFloat16) {
#if defined(__F16C__)
  AvxFloatVecArray<2> array;
  UniqueVector<IeeeFloat16> values(2 * kAvxWidth);
  UniqueVector<float> decompressed(2 * kAvxWidth);
  for (int i = 0; i < 2 * kAvxWidth; ++i) {
    (*values)[i] = IeeeFloat16::DebugFromFloat(i / 10.0);
  }
  // Ensure that state persisted from other tests won't cause this test to
  // erroneously pass.
  array.LoadConstVector(-1.0f);
  array.Load(values->data());
  array.Store(decompressed->data());
  for (int i = 0; i < 2 * kAvxWidth; ++i) {
    ASSERT_NEAR((*decompressed)[i], i / 10.0, 0.01);
  }
#else
  LOG(INFO) << "Test binary wasn't compiled with F16C support, so skipping "
            << "this test.";
#endif
}
// FastUnpackPermutation() must match the known 16-element unpack order.
TEST(AvxFloatVecArrayTest, PermutationFunctionIsEqualToTable) {
  const int kExpected[] = {0, 1, 2, 3, 8,  9,  10, 11,
                           4, 5, 6, 7, 12, 13, 14, 15};
  for (int i = 0; i < kAvxWidthHalfPrecision; ++i) {
    EXPECT_EQ(FastUnpackPermutation(i), kExpected[i]);
  }
}
} // namespace
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
// Copyright 2018 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
// Compatibility support for Eigen.
#ifndef DRAGNN_RUNTIME_MATH_EIGEN_H_
#define DRAGNN_RUNTIME_MATH_EIGEN_H_
#include "dragnn/runtime/alignment.h"
#include "dragnn/runtime/math/types.h"
#include "third_party/eigen3/Eigen/Core"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
namespace internal {
// Returns a combination of bit-options for Eigen matrices. Row-major to
// match the runtime's row-oriented matrix accessors.
constexpr int GetEigenMatrixOptions() {
  return Eigen::AutoAlign | Eigen::RowMajor;
}
// Returns a combination of bit-options for Eigen maps of runtime types.
constexpr int GetEigenMapOptions() {
  // Runtime allocations are at least as strictly aligned as Eigen requires,
  // so maps may assume aligned data.
  static_assert(kAlignmentBytes >= EIGEN_MAX_ALIGN_BYTES,
                "Runtime alignment is not compatible with Eigen alignment.");
  return Eigen::Aligned;
}
// Eigen matrix and (row) vector types. Don't use these directly; instead use
// the public Map types and functions below to wrap runtime types.
template <class T>
using EigenVector =
    Eigen::Matrix<T, 1, Eigen::Dynamic, GetEigenMatrixOptions()>;
template <class T>
using EigenMatrix =
    Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, GetEigenMatrixOptions()>;
// Eigen stride for matrix types: the outer (row) stride is dynamic, while
// the inner stride is 1 because elements within a row are contiguous.
using EigenMatrixStride = Eigen::Stride<Eigen::Dynamic, 1>;
// Returns the Eigen stride associated with the |matrix|.
template <class T>
EigenMatrixStride GetEigenMatrixStride(MatrixImpl<T> matrix) {
  return EigenMatrixStride(matrix.row_stride(), 1);
}
} // namespace internal
// Eigen wrappers around a runtime-allocated matrix or (row) vector. These are
// views over the runtime storage; no data is copied.
template <class T>
using EigenVectorMap =
    Eigen::Map<const internal::EigenVector<T>, internal::GetEigenMapOptions()>;
template <class T>
using MutableEigenVectorMap =
    Eigen::Map<internal::EigenVector<T>, internal::GetEigenMapOptions()>;
template <class T>
using EigenMatrixMap =
    Eigen::Map<const internal::EigenMatrix<T>, internal::GetEigenMapOptions(),
               internal::EigenMatrixStride>;
template <class T>
using MutableEigenMatrixMap =
    Eigen::Map<internal::EigenMatrix<T>, internal::GetEigenMapOptions(),
               internal::EigenMatrixStride>;
// Returns an Eigen wrapper around the |vector| or |matrix|.
template <class T>
EigenVectorMap<T> AsEigenMap(Vector<T> vector) {
  return EigenVectorMap<T>(vector.data(), vector.size());
}
template <class T>
MutableEigenVectorMap<T> AsEigenMap(MutableVector<T> vector) {
  return MutableEigenVectorMap<T>(vector.data(), vector.size());
}
template <class T>
EigenMatrixMap<T> AsEigenMap(Matrix<T> matrix) {
  // The row stride may differ from num_columns() (e.g. alignment padding),
  // so pass it explicitly.
  return EigenMatrixMap<T>(matrix.data(), matrix.num_rows(),
                           matrix.num_columns(),
                           internal::GetEigenMatrixStride(matrix));
}
template <class T>
MutableEigenMatrixMap<T> AsEigenMap(MutableMatrix<T> matrix) {
  return MutableEigenMatrixMap<T>(matrix.data(), matrix.num_rows(),
                                  matrix.num_columns(),
                                  internal::GetEigenMatrixStride(matrix));
}
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
#endif // DRAGNN_RUNTIME_MATH_EIGEN_H_
// Copyright 2018 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/runtime/math/eigen.h"
#include <vector>
#include "dragnn/core/test/generic.h"
#include "dragnn/runtime/math/types.h"
#include "dragnn/runtime/test/helpers.h"
#include "tensorflow/core/platform/test.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
namespace {
// Expects that two pointers point to the same address.
void ExpectSameAddress(const void *ptr1, const void *ptr2) {
  EXPECT_EQ(ptr1, ptr2);
}
// Expects that the |vector| has the |values|.
void ExpectValues(MutableVector<float> vector,
                  const std::vector<float> &values) {
  ASSERT_EQ(vector.size(), values.size());
  for (int i = 0; i < values.size(); ++i) {
    EXPECT_EQ(vector[i], values[i]);
  }
}
// Expects that the Eigen |matrix| has the |values|, given as row-major nested
// vectors (outer index = row, inner index = column).
template <class EigenMatrix>
void ExpectValues(const EigenMatrix &matrix,
                  const std::vector<std::vector<float>> &values) {
  ASSERT_EQ(matrix.rows(), values.size());
  for (int row = 0; row < matrix.rows(); ++row) {
    ASSERT_EQ(matrix.cols(), values[row].size());
    for (int column = 0; column < matrix.cols(); ++column) {
      EXPECT_EQ(matrix(row, column), values[row][column]);
    }
  }
}
// Tests that an Eigen vector map references the same memory as the underlying
// runtime vector: writes through either view are visible through the other.
TEST(EigenTest, Vector) {
  UniqueVector<float> vector({1.0, 2.0, 3.0, 4.0});
  EigenVectorMap<float> const_eigen_vector = AsEigenMap(Vector<float>(*vector));
  ExpectSameAddress(const_eigen_vector.data(), vector->data());
  ExpectValues(const_eigen_vector, {{1.0, 2.0, 3.0, 4.0}});
  MutableEigenVectorMap<float> mutable_eigen_vector = AsEigenMap(*vector);
  ExpectSameAddress(mutable_eigen_vector.data(), vector->data());
  ExpectValues(mutable_eigen_vector, {{1.0, 2.0, 3.0, 4.0}});
  // Write into the runtime vector and read from the other views.
  (*vector)[0] = 10.0;
  (*vector)[1] = 20.0;
  (*vector)[2] = 30.0;
  (*vector)[3] = 40.0;
  ExpectValues(const_eigen_vector, {{10.0, 20.0, 30.0, 40.0}});
  ExpectValues(mutable_eigen_vector, {{10.0, 20.0, 30.0, 40.0}});
  // Write into the mutable Eigen vector and read from the other views.
  mutable_eigen_vector << 100.0, 200.0, 300.0, 400.0;
  ExpectValues(const_eigen_vector, {{100.0, 200.0, 300.0, 400.0}});
  ExpectValues(*vector, {100.0, 200.0, 300.0, 400.0});
}
// Tests that an Eigen matrix map references the same memory as the underlying
// runtime matrix: writes through either view are visible through the other.
TEST(EigenTest, Matrix) {
  UniqueMatrix<float> matrix({{1.0, 2.0, 3.0},  //
                              {4.0, 5.0, 6.0},  //
                              {7.0, 8.0, 9.0}});
  EigenMatrixMap<float> const_eigen_matrix = AsEigenMap(Matrix<float>(*matrix));
  ExpectSameAddress(const_eigen_matrix.data(), matrix->row(0).data());
  ExpectValues(const_eigen_matrix, {{1.0, 2.0, 3.0},  //
                                    {4.0, 5.0, 6.0},  //
                                    {7.0, 8.0, 9.0}});
  MutableEigenMatrixMap<float> mutable_eigen_matrix = AsEigenMap(*matrix);
  ExpectSameAddress(mutable_eigen_matrix.data(), matrix->row(0).data());
  ExpectValues(mutable_eigen_matrix, {{1.0, 2.0, 3.0},  //
                                      {4.0, 5.0, 6.0},  //
                                      {7.0, 8.0, 9.0}});
  // Write into the runtime matrix and read from the other views.
  matrix->row(0)[0] = 10.0;
  matrix->row(0)[1] = 20.0;
  matrix->row(0)[2] = 30.0;
  matrix->row(1)[0] = 40.0;
  matrix->row(1)[1] = 50.0;
  matrix->row(1)[2] = 60.0;
  matrix->row(2)[0] = 70.0;
  matrix->row(2)[1] = 80.0;
  matrix->row(2)[2] = 90.0;
  ExpectValues(const_eigen_matrix, {{10.0, 20.0, 30.0},  //
                                    {40.0, 50.0, 60.0},  //
                                    {70.0, 80.0, 90.0}});
  ExpectValues(mutable_eigen_matrix, {{10.0, 20.0, 30.0},  //
                                      {40.0, 50.0, 60.0},  //
                                      {70.0, 80.0, 90.0}});
  // Write into the mutable Eigen matrix and read from the other views.
  mutable_eigen_matrix << 100.0, 200.0, 300.0,
                          400.0, 500.0, 600.0,
                          700.0, 800.0, 900.0;
  ExpectValues(const_eigen_matrix, {{100.0, 200.0, 300.0},  //
                                    {400.0, 500.0, 600.0},  //
                                    {700.0, 800.0, 900.0}});
  ExpectValues(matrix->row(0), {100.0, 200.0, 300.0});
  ExpectValues(matrix->row(1), {400.0, 500.0, 600.0});
  ExpectValues(matrix->row(2), {700.0, 800.0, 900.0});
}
} // namespace
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
// Declares 16-bit floating point types.
#ifndef DRAGNN_RUNTIME_MATH_FLOAT16_TYPES_H_
#define DRAGNN_RUNTIME_MATH_FLOAT16_TYPES_H_
#if defined(__F16C__)
// <immintrin.h> declares the F16C scalar conversion intrinsics
// (_cvtsh_ss and _cvtss_sh); <emmintrin.h> alone only provides SSE2.
#include <emmintrin.h>
#include <immintrin.h>
#endif
#include "syntaxnet/base.h"
#include "tensorflow/core/lib/core/casts.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
// Represents a truncated 16-bit floating point value. This corresponds to
// `bfloat16` in TensorFlow. It just chops the last 16 least-significant bits
// off the significand of a 32-bit floating point value, leaving 7 significand
// bits, 8 exponent bits, and 1 sign bit.
struct TruncatedFloat16 {
// Slow unpacking routine. Use avx_vector_array.h for normal operation.
float DebugToFloat() const {
uint32 upcast = bits;
upcast <<= 16;
return tensorflow::bit_cast<float>(upcast);
}
// Slow packing routine. Use avx_vector_array.h for normal operation.
static TruncatedFloat16 DebugFromFloat(float value) {
uint32 float_bits = tensorflow::bit_cast<uint32>(value);
return TruncatedFloat16{static_cast<uint16>(float_bits >> 16)};
}
uint16 bits;
};
static_assert(sizeof(TruncatedFloat16) == sizeof(uint16), "Bad struct size");
// Currently, only CPUs with the F16C instruction set are supported. All use of
// this struct should be flag-guarded.
//
// If this becomes a problem, we can implement this method with Eigen's
// CUDA/Half.h.
#if defined(__F16C__)
// Represents an IEEE-754 16-bit floating point value. This has 10 significand
// bits, 5 exponent bits, and 1 sign bit.
//
// TODO(googleuser): Either add compatibility support, or delete this code if
// it turns out not to be helpful.
struct IeeeFloat16 {
  // Slow unpacking routine. Use avx_vector_array.h for normal operation.
  float DebugToFloat() const { return _cvtsh_ss(bits); }
  // Slow packing routine. Use avx_vector_array.h for normal operation.
  // The second argument (0) selects the default rounding mode.
  static IeeeFloat16 DebugFromFloat(float value) {
    return IeeeFloat16{_cvtss_sh(value, 0)};
  }
  // Raw IEEE-754 binary16 bit pattern.
  uint16 bits;
};
static_assert(sizeof(IeeeFloat16) == sizeof(uint16), "Bad struct size");
#endif
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
#endif // DRAGNN_RUNTIME_MATH_FLOAT16_TYPES_H_
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/runtime/math/float16_types.h"
#include "tensorflow/core/platform/test.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
namespace {
// C++11 doesn't support binary literals like 0b01001, so this helper parses a
// 16-character string of '0'/'1' into the corresponding uint16 (MSB first).
uint16 ParseBinaryString(const string &bits) {
  CHECK_EQ(bits.size(), 16) << "ParseBinaryString expects full 16-bit values";
  uint16 value = 0;
  for (const char bit : bits) {
    CHECK(bit == '0' || bit == '1') << "String must be 0's and 1's.";
    value = (value << 1) | (bit == '1' ? 1 : 0);
  }
  return value;
}
// Round-trips values through IEEE half precision: each value must come back
// within tolerance, and (with only 10 significand bits) at least one value
// must not round-trip exactly. Skipped when not compiled with F16C.
TEST(Float16TypesTest, IeeeFloat16Accuracy) {
#if defined(__F16C__)
  bool some_not_exact = false;
  for (int i = -100; i < 100; ++i) {
    float value = i / 10.0f;
    IeeeFloat16 half = IeeeFloat16::DebugFromFloat(value);
    float unpacked = half.DebugToFloat();
    EXPECT_NEAR(value, unpacked, 0.01);
    some_not_exact = some_not_exact || (value != unpacked);
  }
  EXPECT_TRUE(some_not_exact);
#else
  LOG(INFO) << "Test binary wasn't compiled with F16C support, so skipping "
            << "this test.";
#endif
}
// Same round-trip check for bfloat16, with a looser tolerance because it
// keeps only 7 significand bits.
TEST(Float16TypesTest, TruncatedAccuracy) {
  bool some_not_exact = false;
  for (int i = -100; i < 100; ++i) {
    float value = i / 10.0f;
    TruncatedFloat16 half = TruncatedFloat16::DebugFromFloat(value);
    float unpacked = half.DebugToFloat();
    EXPECT_NEAR(value, unpacked, 0.06);
    some_not_exact = some_not_exact || (value != unpacked);
  }
  EXPECT_TRUE(some_not_exact);
}
// Verifies +/-1.0 against hand-written bfloat16 bit patterns
// (1 sign bit, 8 exponent bits, 7 significand bits).
TEST(Float16TypesTest, TruncatedKnownBinaryRepresentation) {
  uint16 neg_1 = ParseBinaryString("1011111110000000");
  uint16 one = ParseBinaryString("0011111110000000");
  EXPECT_EQ((TruncatedFloat16{neg_1}).DebugToFloat(), -1.0f);
  EXPECT_EQ((TruncatedFloat16{one}).DebugToFloat(), 1.0f);
}
// Verifies +/-1.0 against hand-written IEEE-754 half bit patterns
// (1 sign bit, 5 exponent bits, 10 significand bits).
TEST(Float16TypesTest, IeeeFloat16KnownBinaryRepresentation) {
#if defined(__F16C__)
  uint16 neg_1 = ParseBinaryString("1011110000000000");
  uint16 one = ParseBinaryString("0011110000000000");
  EXPECT_EQ((IeeeFloat16{neg_1}).DebugToFloat(), -1.0f);
  EXPECT_EQ((IeeeFloat16{one}).DebugToFloat(), 1.0f);
#else
  LOG(INFO) << "Test binary wasn't compiled with F16C support, so skipping "
            << "this test.";
#endif
}
} // namespace
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
// Computes `[y_1, y_2, ...] = M * [v_1, v_2, ...] + [b_1, b_2, ...]`, where
//
// M is a `m x n` dense matrix.
// v_i are `n`-dimensional dense vectors.
// b_i and y_i are `m`-dimensional dense vectors.
//
// Unfortunately even larger (e.g. 128x128) matrix sizes are not sufficient to
// hide the latency of a function call. So the entire implementation needs to
// live in this header file. Please make sure to use all of the optimization
// flags mentioned in the BUILD file in any client libraries.
#ifndef DRAGNN_RUNTIME_MATH_SGEMVV_H_
#define DRAGNN_RUNTIME_MATH_SGEMVV_H_
#if defined(__SSE2__)
#include <xmmintrin.h>
#endif
#include "dragnn/runtime/math/avx_vector_array.h"
#include "dragnn/runtime/math/types.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
#define DRAGNN_SGEMVV_ATTRIBUTE_ALWAYS_INLINE __attribute__((always_inline))
#ifdef __clang__
#define DRAGNN_SGEMVV_GCC_UNROLL
#else
#define DRAGNN_SGEMVV_GCC_UNROLL __attribute__((optimize("unroll-loops")))
#endif
namespace syntaxnet {
namespace dragnn {
namespace runtime {
// Represents `v, b` from one operation `y = M * v + b`.
//
// All `num_ops` operations share the same matrix M but have their own input
// and initial (bias) vectors; the arrays below are indexed by operation.
template <int num_ops>
struct SgemvInputBatch {
  // Dense input vectors `v_i`, each with one value per matrix column.
  const float *input[num_ops];
  // Initial vectors `b_i` added into the product, one value per matrix row.
  // May be null for the "NoInitial" product variant, which never reads them.
  const float *initial[num_ops];
};
// Output vectors `y_i` from a batch of `num_ops` operations `y = M * v + b`,
// indexed by operation; each holds one value per matrix row.
template <int num_ops>
struct SgemvOutputBatch {
  // Destination buffers, one per operation.
  float *output[num_ops];
};
// Matrix argument for the SGEMV/SGEMVV operation. Based on row-batched
// column-major matrices, but pulls the batch size into a template argument
// so code can be compiled more efficiently.
//
// `sse_batch_size` is the row-block size of the underlying blocked matrix and
// must be a multiple of kAvxWidth (8 floats); see the static_assert in
// MatrixMultiVectorProductImpl().
template <int sse_batch_size, typename ElementType = float>
class SgemvMatrix final {
 public:
  // Convenience type alias.
  using MatrixType =
      BlockedMatrix<ElementType, BlockedMatrixFormat::kRowBlockedColumnMajor>;
  // Creates an empty SgemvMatrix.
  SgemvMatrix() = default;
  // Initializes the new matrix. Returns an InvalidArgumentError if the block
  // size of `matrix` is not equal to `sse_batch_size`.
  ::tensorflow::Status Initialize(const MatrixType &matrix);
  // Computes the matrix-vector product with a set of other inputs. See
  // top-level comment for the general algorithm. Writes all matrix rows; use
  // the masked variants below for partial outputs.
  template <int num_ops, int lookahead_1 = 8, int lookahead_2 = 8>
  void DRAGNN_SGEMVV_ATTRIBUTE_ALWAYS_INLINE DRAGNN_SGEMVV_GCC_UNROLL
  MatrixMultiVectorProduct(const SgemvInputBatch<num_ops> &inputs,
                           SgemvOutputBatch<num_ops> *outputs) const {
    // The -1 sentinel for output_vector_elements is never read because
    // `mask_input_output` is false.
    MatrixMultiVectorProductImpl<num_ops, /*mask_input_output=*/false,
                                 /*read_initial=*/true, lookahead_1,
                                 lookahead_2>(inputs, -1, outputs);
  }
  // Computes the matrix-vector product with a set of other inputs. See
  // top-level comment for the general algorithm. This variant allows another
  // parameter, `output_vector_elements`, to write to outputs which are a
  // multiple of kAvxWidth (8 floats, or 32 bytes) but not necessarily
  // sse_batch_size. It is slightly slower, and the slowdown is probably more
  // than measurement noise.
  //
  // |lookahead_1| and |lookahead_2| parameters control prefetching, and should
  // usually be tuned via a script. They issue prefetch instructions that are
  // `lookahead_1 * sse_batch_size` values ahead of the current matrix entry
  // being read, if `lookahead_1 != 0` (and `(lookahead_1 + lookahead_2) *
  // sse_batch_size` values, if lookahead_2 != 0). To reiterate, all prefetching
  // can be disabled by setting |lookahead_1| to 0, or the second prefetch can
  // be disabled by setting |lookahead_2| to 0.
  template <int num_ops, int lookahead_1 = 8, int lookahead_2 = 8>
  void DRAGNN_SGEMVV_ATTRIBUTE_ALWAYS_INLINE DRAGNN_SGEMVV_GCC_UNROLL
  MaskedMatrixMultiVectorProduct(const SgemvInputBatch<num_ops> &inputs,
                                 int output_vector_elements,
                                 SgemvOutputBatch<num_ops> *outputs) const {
    MatrixMultiVectorProductImpl<num_ops, /*mask_input_output=*/true,
                                 /*read_initial=*/true, lookahead_1,
                                 lookahead_2>(inputs, output_vector_elements,
                                              outputs);
  }
  // Like the above, but assumes existing values are zero instead of reading
  // them. The `initial` pointers in |inputs| are never dereferenced.
  template <int num_ops>
  void DRAGNN_SGEMVV_ATTRIBUTE_ALWAYS_INLINE DRAGNN_SGEMVV_GCC_UNROLL
  MaskedMatrixMultiVectorProductNoInitial(
      const SgemvInputBatch<num_ops> &inputs, int output_vector_elements,
      SgemvOutputBatch<num_ops> *outputs) const {
    MatrixMultiVectorProductImpl<num_ops, /*mask_input_output=*/true,
                                 /*read_initial=*/false>(
        inputs, output_vector_elements, outputs);
  }
  // Read-only accessor.
  const MatrixType &matrix() const { return matrix_; }
 private:
  // Shared implementation of all product variants above; see the public
  // methods for the parameter semantics.
  template <int num_ops, bool mask_input_output, bool read_initial,
            int lookahead_1 = 8, int lookahead_2 = 8>
  DRAGNN_SGEMVV_ATTRIBUTE_ALWAYS_INLINE DRAGNN_SGEMVV_GCC_UNROLL void
  MatrixMultiVectorProductImpl(const SgemvInputBatch<num_ops> &inputs,
                               int output_vector_elements,
                               SgemvOutputBatch<num_ops> *outputs) const;
  // The underlying row-blocked column-major matrix data.
  MatrixType matrix_;
};
// Implementation details.
//
// Processes the matrix in blocks of `sse_batch_size` output rows. For each
// row block it maintains `num_ops` AVX accumulator arrays, streams the
// column-major block data once, and broadcasts each input coefficient across
// an AVX vector — so all `num_ops` products share a single pass over the
// matrix data.
template <int sse_batch_size, typename ElementType>
template <int num_ops, bool mask_input_output, bool read_initial,
          int lookahead_1, int lookahead_2>
inline void DRAGNN_SGEMVV_ATTRIBUTE_ALWAYS_INLINE DRAGNN_SGEMVV_GCC_UNROLL
SgemvMatrix<sse_batch_size, ElementType>::MatrixMultiVectorProductImpl(
    const SgemvInputBatch<num_ops> &inputs, int output_vector_elements,
    SgemvOutputBatch<num_ops> *outputs) const {
  static_assert(sse_batch_size % kAvxWidth == 0,
                "sse_batch_size must be a multiple of kAvxWidth (8).");
  if (mask_input_output) {
    DCHECK_EQ(output_vector_elements % kAvxWidth, 0)
        << "output_vector_elements must be padded to alignment";
  }
  // The blocked matrix's vectors are streamed through a single running
  // pointer, advanced by one column block (sse_batch_size values) per column.
  const ElementType *curr_matrix_ptr = matrix_.vector(0).data();
  // Loop over blocks of output rows. Each block of output rows will get a
  // partial sum of the [matrix-vector] dot product, where the range of that
  // partial sum is designated by start_col and end_col.
  for (int row_start = 0; row_start < matrix_.num_rows();
       row_start += sse_batch_size) {
    // Number of this block's AVX vectors that fall inside the valid output
    // range; only meaningful (and only used) when mask_input_output is true.
    const int load_store_max_idx =
        (output_vector_elements - row_start) / kAvxWidth;
    AvxFloatVecArray<sse_batch_size / kAvxWidth> accumulators[num_ops];
    // Read inputs.
    for (int op = 0; op < num_ops; ++op) {
      if (read_initial) {
        if (mask_input_output) {
          accumulators[op].Load(&inputs.initial[op][row_start],
                                load_store_max_idx);
        } else {
          accumulators[op].Load(&inputs.initial[op][row_start]);
        }
      } else {
        // No initial vector: start the accumulation from zero.
        accumulators[op].LoadConstVector(0.0f);
      }
    }
    // Compute matrix-vector product.
    for (int col = 0; col < matrix_.num_columns(); ++col) {
      if (lookahead_1 != 0) {
#if defined(__SSE2__)
        // Prefetch matrix data lookahead_1 (and, optionally,
        // lookahead_1 + lookahead_2) column blocks ahead of the cursor.
        _mm_prefetch(curr_matrix_ptr + lookahead_1 * sse_batch_size,
                     _MM_HINT_T0);
        if (lookahead_2 != 0) {
          _mm_prefetch(
              curr_matrix_ptr + (lookahead_1 + lookahead_2) * sse_batch_size,
              _MM_HINT_T0);
        }
#endif
      }
      // These are the coefficients from each vector at column `col` (just
      // broadcast over the whole AVX array).
      AvxFloatVec weights[num_ops];
      for (int op = 0; op < num_ops; ++op) {
        weights[op].LoadConstVector(inputs.input[op][col]);
      }
      // Loop over each AVX vector and add the current sub-product.
      AvxFloatVecArray<sse_batch_size / kAvxWidth> matrix_block;
      matrix_block.Load(curr_matrix_ptr);
      curr_matrix_ptr += sse_batch_size;
      for (int row_offset = 0; row_offset < sse_batch_size / kAvxWidth;
           row_offset++) {
        for (int op = 0; op < num_ops; ++op) {
          accumulators[op].vectors[row_offset].AddProductOf(
              weights[op], matrix_block.vectors[row_offset]);
        }
      }
    }
    // Save the results.
    for (int op = 0; op < num_ops; ++op) {
      if (mask_input_output) {
        // Masked store: rows at or beyond output_vector_elements are left
        // untouched.
        accumulators[op].Store(&outputs->output[op][row_start],
                               load_store_max_idx);
      } else {
        accumulators[op].Store(&outputs->output[op][row_start]);
      }
    }
  }
}
// Adopts |matrix| after validating that its block size matches the
// compile-time sse_batch_size; otherwise returns InvalidArgument.
template <int sse_batch_size, typename ElementType>
::tensorflow::Status SgemvMatrix<sse_batch_size, ElementType>::Initialize(
    const SgemvMatrix<sse_batch_size, ElementType>::MatrixType &matrix) {
  if (matrix.block_size() == sse_batch_size) {
    matrix_ = matrix;
    return ::tensorflow::Status::OK();
  }
  return ::tensorflow::errors::InvalidArgument(
      "Blocked matrix block_size (", matrix.block_size(),
      ") must be equal to sse_batch_size (", sse_batch_size, ")");
}
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
#undef DRAGNN_SGEMVV_ATTRIBUTE_ALWAYS_INLINE
#undef DRAGNN_SGEMVV_GCC_UNROLL
#endif // DRAGNN_RUNTIME_MATH_SGEMVV_H_
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/runtime/math/sgemvv.h"
#include <chrono>
#include <random>
#include "dragnn/core/test/generic.h"
#include "dragnn/runtime/math/arithmetic.h"
#include "dragnn/runtime/math/transformations.h"
#include "dragnn/runtime/math/types.h"
#include "dragnn/runtime/test/helpers.h"
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/test.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
// Reference implementation of `y = M * v + b`, used to validate the
// vectorized SGEMV kernels. Dimensions come from |matrix|: v has
// num_columns() entries; b and y have num_rows() entries.
void naive_sgemv(const MutableMatrix<float> &matrix, const float *v,
                 const float *b, float *y) {
  const int rows = matrix.num_rows();
  const int cols = matrix.num_columns();
  for (int r = 0; r < rows; ++r) {
    // Accumulate in the same order as the original (bias first, then columns
    // left to right) so the floating-point result is bit-identical.
    float accumulator = b[r];
    for (int c = 0; c < cols; ++c) {
      accumulator += matrix.row(r)[c] * v[c];
    }
    y[r] = accumulator;
  }
}
// Everything except floats requires copying.
//
// Compile-time predicate used by ConvertToSgemv() below: float matrix data
// can be used in place, while the half-precision element types must first be
// converted element-by-element through a temporary float matrix.
template <class ElementType>
constexpr bool RequiresCopy();
template <>
constexpr bool RequiresCopy<TruncatedFloat16>() {
  return true;
}
#if defined(__F16C__)
// The IeeeFloat16 specialization exists only when built with F16C support.
template <>
constexpr bool RequiresCopy<IeeeFloat16>() {
  return true;
}
#endif
template <>
constexpr bool RequiresCopy<float>() {
  return false;
}
// Converts one blocked vector of float values into |output|'s element type.
// The float specialization is a no-op because float data is used in place
// (see RequiresCopy<float>() above).
template <class ElementType>
void ConvertRow(Vector<float> input, MutableVector<ElementType> output);
template <>
void ConvertRow<float>(Vector<float> input, MutableVector<float> output) {}
// The truncated-float16 conversion also permutes each 16-element group via
// FastUnpackPermutation.
// NOTE(review): presumably the permutation matches the order in which the
// vectorized unpack reads values — confirm against FastUnpackPermutation.
template <>
void ConvertRow<TruncatedFloat16>(Vector<float> input,
                                  MutableVector<TruncatedFloat16> output) {
  CHECK_EQ(input.size() % 16, 0);
  CHECK_EQ(input.size(), output.size());
  for (int i = 0; i < input.size(); ++i) {
    // Destination index i is filled from the permuted source index within the
    // same 16-element group.
    int i_permuted = (i / 16) * 16 + FastUnpackPermutation(i % 16);
    output[i] = TruncatedFloat16::DebugFromFloat(input[i_permuted]);
  }
}
#if defined(__F16C__)
// IEEE float16 conversion is a straight element-wise pack, no permutation.
template <>
void ConvertRow<IeeeFloat16>(Vector<float> input,
                             MutableVector<IeeeFloat16> output) {
  CHECK_EQ(input.size() % 16, 0);
  CHECK_EQ(input.size(), output.size());
  for (int i = 0; i < input.size(); ++i) {
    output[i] = IeeeFloat16::DebugFromFloat(input[i]);
  }
}
#endif
// Converts a matrix to SGEMV. If the element type is not float, copies the
// matrix and then converts it.
//
// |sgemv_storage| is the caller-owned backing storage for the returned
// matrix's blocked data; the returned SgemvMatrix points into it and does not
// own it. This helper reports failures via TF_EXPECT_OK, so it is only
// suitable for use in tests.
template <int sse_batch_size, typename ElementType = float>
SgemvMatrix<sse_batch_size, ElementType> ConvertToSgemv(
    const Matrix<float> &matrix, UniqueMatrix<ElementType> *sgemv_storage) {
  MutableBlockedMatrix<ElementType, BlockedMatrixFormat::kRowBlockedColumnMajor>
      blocked;
  TF_EXPECT_OK(blocked.Reset(sgemv_storage->area(), matrix.num_rows(),
                             matrix.num_columns()));
  // TODO(googleuser): Clean this up when we can use C++17's `if constexpr`
  // ... then we will not have to introduce this raw pointer, which is either
  // an actual new variable or alias to `sgemv_storage`.
  UniqueMatrix<float> *uncompressed;
  if (RequiresCopy<ElementType>()) {
    // Temporary float staging matrix; deleted below once converted.
    uncompressed = new UniqueMatrix<float>((*sgemv_storage)->num_rows(),
                                           (*sgemv_storage)->num_columns());
  } else {
    // NOTE: Because we don't have C++17's `if constexpr`, we need to add a
    // reinterpret_cast, so this code can compile when ElementType != float.
    uncompressed = reinterpret_cast<UniqueMatrix<float> *>(sgemv_storage);
  }
  // Copy to the uncompressed matrix. If ElementType == float, this is just
  // the output, otherwise it's the temporary array.
  MutableBlockedMatrix<float, BlockedMatrixFormat::kRowBlockedColumnMajor>
      uncompressed_matrix;
  TF_EXPECT_OK(uncompressed_matrix.Reset(
      uncompressed->area(), matrix.num_rows(), matrix.num_columns()));
  TF_EXPECT_OK(CopyMatrix(matrix, &uncompressed_matrix));
  if (RequiresCopy<ElementType>()) {
    // Convert each blocked vector into the compressed element type, then
    // release the staging matrix (only owned in this branch).
    for (int i = 0; i < blocked.num_vectors(); ++i) {
      ConvertRow<ElementType>(Vector<float>(uncompressed_matrix.vector(i)),
                              blocked.vector(i));
    }
    delete uncompressed;
  }
  SgemvMatrix<sse_batch_size, ElementType> sgemv_matrix;
  TF_EXPECT_OK(sgemv_matrix.Initialize(blocked.AsConst()));
  return sgemv_matrix;
}
// Fills |vector| with samples from a standard normal distribution.
void InitRandomVector(MutableVector<float> vector) {
  // clock() is updated less frequently than a cycle counter, so keep around
  // the RNG just in case we initialize some vectors in less than a clock tick.
  static std::mt19937 *rng = new std::mt19937(clock());
  std::normal_distribution<float> standard_normal(0, 1);
  for (float &value : vector) {
    value = standard_normal(*rng);
  }
}
// Fills |matrix| with samples from a standard normal distribution, drawing in
// row-major order (the same order GenerateMatrix visits elements).
void InitRandomMatrix(MutableMatrix<float> matrix) {
  // See InitRandomVector comment.
  static std::mt19937 *rng = new std::mt19937(clock());
  std::normal_distribution<float> standard_normal(0, 1);
  for (int row = 0; row < matrix.num_rows(); ++row) {
    for (int column = 0; column < matrix.num_columns(); ++column) {
      matrix.row(row)[column] = standard_normal(*rng);
    }
  }
}
// Verifies MaskedMatrixMultiVectorProductNoInitial() against a naive
// matrix-vector product: only the first |output_size| of the 32 output rows
// are written and compared, and no initial (bias) vector is read.
TEST(SgemvvTest, MatmulNoBias) {
  constexpr int sse_batch_size = 32;
  constexpr int num_rows = 32;
  constexpr int num_columns = 15;
  constexpr int output_size = 8;
  // Number of blocked vectors needed to hold the matrix data.
  constexpr int sgemv_views = num_rows * num_columns / sse_batch_size;
  static_assert(num_rows * num_columns % sse_batch_size == 0,
                "Bad matrix size");
  ASSERT_EQ(output_size % 8, 0) << "Output size must still be 32-byte aligned.";
  UniqueMatrix<float> matrix(num_rows, num_columns);
  UniqueMatrix<float> sgemv_storage(sgemv_views, sse_batch_size);
  UniqueVector<float> input_vector(num_columns);
  UniqueVector<float> output(num_rows);
  UniqueVector<float> expected(num_rows);
  // Random initialization for all variables/values.
  InitRandomMatrix(*matrix);
  InitRandomVector(*output);
  InitRandomVector(*expected);
  InitRandomVector(*input_vector);
  // Layout SGEMV matrix.
  SgemvMatrix<sse_batch_size> sgemv_matrix =
      ConvertToSgemv<sse_batch_size>(Matrix<float>(*matrix), &sgemv_storage);
  // SGEMV multiplication. The initial-vector slot is nullptr because the
  // NoInitial variant never dereferences it.
  SgemvInputBatch<1> inputs = {{input_vector->data()}, {nullptr}};
  SgemvOutputBatch<1> outputs = {{output->data()}};
  sgemv_matrix.MaskedMatrixMultiVectorProductNoInitial(inputs, output_size,
                                                       &outputs);
  // Naive algorithm.
  MultiplyMatrixAndVector<float>(Matrix<float>(*matrix),
                                 Vector<float>(*input_vector), *expected);
  // Check that results are equal.
  for (int i = 0; i < output_size; i++) {
    EXPECT_NEAR(output->data()[i], expected->data()[i], 1e-5);
  }
}
// Initializing an SgemvMatrix<32> from a blocked matrix whose block size is
// not 32 must fail, even when that block size is 32-byte (8-float) alignable.
TEST(SgemvvTest, ErrorsWithBadMultiple) {
  for (const int num_rows : {8, 16, 24}) {
    // Lay out a single-column blocked matrix whose block size is |num_rows|.
    UniqueMatrix<float> sgemv_storage(1, num_rows);
    MutableBlockedMatrix<float, BlockedMatrixFormat::kRowBlockedColumnMajor>
        blocked;
    TF_EXPECT_OK(blocked.Reset(sgemv_storage.area(), num_rows, 1));
    // The block size does not match sse_batch_size == 32, so this must error.
    SgemvMatrix<32> matrix;
    EXPECT_THAT(matrix.Initialize(blocked.AsConst()),
                test::IsErrorWithSubstr("must be equal to sse_batch_size"));
  }
}
// Returns a human-readable name for ElementType, used to label the benchmark
// output lines in RunPerformanceTest().
template <typename ElementType>
string TypenameString();
template <>
string TypenameString<float>() {
  return "float32";
}
template <>
string TypenameString<TruncatedFloat16>() {
  return "bfloat16";
}
#if defined(__F16C__)
template <>
string TypenameString<IeeeFloat16>() {
  return "float16";
}
#endif
// Accuracy tolerance for comparing a 128-row product computed in ElementType
// arithmetic against the float reference. RunPerformanceTest() scales this
// linearly by (num_rows / 128.0).
template <typename ElementType>
float ToleranceAt128();
template <>
float ToleranceAt128<float>() {
  return 1e-5;
}
// Truncated (bfloat16-style) storage keeps only 7 mantissa bits, hence the
// much looser tolerance.
template <>
float ToleranceAt128<TruncatedFloat16>() {
  return 1;
}
#if defined(__F16C__)
template <>
float ToleranceAt128<IeeeFloat16>() {
  return 1e-1;
}
#endif
// Benchmarks and accuracy-checks a `num_rows x num_cols` two-operation SGEMVV
// using ElementType matrix storage. Only the first |output_size| rows are
// written and compared; when output_size != num_rows the masked variant is
// exercised and the remaining rows must be left untouched. GFLOPS figures are
// reported via VLOG(0).
template <int sse_batch_size, int num_rows, int num_cols, typename ElementType>
void RunPerformanceTest(int output_size) {
  constexpr int sgemv_views = num_rows * num_cols / sse_batch_size;
  static_assert(num_rows * num_cols % sse_batch_size == 0, "Bad matrix size");
  ASSERT_EQ(output_size % 8, 0) << "Output size must still be 32-byte aligned.";
  UniqueMatrix<float> matrix(num_rows, num_cols);
  UniqueMatrix<ElementType> sgemv_storage(sgemv_views, sse_batch_size);
  UniqueVector<float> initial_1(num_rows);
  UniqueVector<float> initial_2(num_rows);
  UniqueVector<float> vector_1(num_cols);
  UniqueVector<float> vector_2(num_cols);
  UniqueVector<float> output_1(num_rows);
  UniqueVector<float> output_2(num_rows);
  UniqueVector<float> expected_output_1(num_rows);
  UniqueVector<float> expected_output_2(num_rows);
  UniqueVector<float> untouched_output_1(num_rows);
  UniqueVector<float> untouched_output_2(num_rows);
  // Random initialization for all variables/values.
  InitRandomMatrix(*matrix);
  InitRandomVector(*initial_1);
  InitRandomVector(*initial_2);
  InitRandomVector(*output_1);
  InitRandomVector(*output_2);
  InitRandomVector(*expected_output_1);
  InitRandomVector(*expected_output_2);
  InitRandomVector(*vector_1);
  InitRandomVector(*vector_2);
  // Snapshot the (randomized) outputs so we can verify that masked products
  // leave rows beyond |output_size| untouched.
  for (int i = 0; i < num_rows; i++) {
    (*untouched_output_1)[i] = (*output_1)[i];
    (*untouched_output_2)[i] = (*output_2)[i];
  }
  // Layout SGEMV matrix.
  SgemvMatrix<sse_batch_size, ElementType> sgemv_matrix =
      ConvertToSgemv<sse_batch_size, ElementType>(Matrix<float>(*matrix),
                                                  &sgemv_storage);
  naive_sgemv(*matrix, vector_1->data(), initial_1->data(),
              expected_output_1->data());
  naive_sgemv(*matrix, vector_2->data(), initial_2->data(),
              expected_output_2->data());
  // Scale the iteration count so each timed loop performs roughly 4e9 raw
  // floating-point operations regardless of matrix size.
  double raw_flops_per_iteration = 2.0 * 2.0 * num_rows * num_cols;
  const uint64 iterations =
      static_cast<uint64>(std::round(4e9 / raw_flops_per_iteration));
  auto start_time = std::chrono::system_clock::now();
  SgemvInputBatch<2> inputs = {
      {vector_1->data(), vector_2->data()},
      {initial_1->data(), initial_2->data()},
  };
  SgemvOutputBatch<2> outputs = {{output_1->data(), output_2->data()}};
  if (num_rows == output_size) {
    // Full output: use the unmasked product with prefetching disabled.
    for (int iter = 0; iter < iterations; iter++) {
      sgemv_matrix.template MatrixMultiVectorProduct<2, 0, 0>(inputs, &outputs);
    }
  } else {
    for (int iter = 0; iter < iterations; iter++) {
      sgemv_matrix.template MaskedMatrixMultiVectorProduct<2>(
          inputs, output_size, &outputs);
    }
  }
  auto end_time = std::chrono::system_clock::now();
  std::chrono::duration<double> elapsed_seconds = end_time - start_time;
  double elapsed = elapsed_seconds.count();
  // Each MatrixMultiVectorProduct does 2 Matrix-vector ops, and each op does a
  // multiply and an add (2 floating-point operations) for each entry in the
  // matrix. "Effective" GFLOPS counts only the |output_size| rows actually
  // produced; "raw" (reported for masked runs) counts all num_rows rows.
  string raw_gflops = "";
  if (num_rows != output_size) {
    raw_gflops = ::tensorflow::strings::StrCat(
        ", ", raw_flops_per_iteration * iterations / 1e9 / elapsed, " raw");
  }
  VLOG(0) << " ElementType " << TypenameString<ElementType>() << " GFLOPS: "
          << (2.0 * 2.0 * output_size * num_cols * iterations) / 1e9 / elapsed
          << " effective" << raw_gflops;
  // Tolerance grows linearly with the number of accumulated rows.
  const float tolerance =
      ToleranceAt128<ElementType>() * (num_rows / 128.0) + 1e-5;
  for (int i = 0; i < output_size; i++) {
    EXPECT_NEAR(output_1->data()[i], expected_output_1->data()[i], tolerance);
    EXPECT_NEAR(output_2->data()[i], expected_output_2->data()[i], tolerance);
  }
  // Check that any non-output items are untouched.
  for (int i = output_size; i < num_rows; i++) {
    EXPECT_EQ((*output_1)[i], (*untouched_output_1)[i]);
    EXPECT_EQ((*output_2)[i], (*untouched_output_2)[i]);
  }
}
// Runs RunPerformanceTest() over several matrix shapes, batch sizes, and
// element types, checking accuracy along the way.
TEST(SgemvvTest, PerformanceAndAccuracyTest) {
  // Benchmarking is hard. Sometimes results vary between test runs, or are just
  // unreliable. This could be in part from CPU frequency scaling, and also how
  // favorably the memory allocator places data (coherence, etc.), so each
  // configuration is run several times.
  constexpr int kNumBatches = 3;
  VLOG(0) << "64x64 32-batch-size test";
  for (int batch = 0; batch < kNumBatches; ++batch) {
    RunPerformanceTest<32, 64, 64, float>(64);
#if defined(__F16C__)
    RunPerformanceTest<32, 64, 64, IeeeFloat16>(64);
#endif
  }
  VLOG(0) << "128x128 32-batch-size test";
  for (int batch = 0; batch < kNumBatches; ++batch) {
    RunPerformanceTest<32, 128, 128, float>(128);
  }
  VLOG(0) << "256x256 32-batch-size test";
  for (int batch = 0; batch < kNumBatches; ++batch) {
    RunPerformanceTest<32, 256, 256, float>(256);
#if defined(__F16C__)
    RunPerformanceTest<32, 256, 256, IeeeFloat16>(256);
#endif
    RunPerformanceTest<32, 256, 256, TruncatedFloat16>(256);
  }
  VLOG(0) << "96x96 48-batch-size test";
  for (int batch = 0; batch < kNumBatches; ++batch) {
    RunPerformanceTest<48, 96, 96, float>(96);
  }
  VLOG(0) << "48x96 48-batch-size test";
  for (int batch = 0; batch < kNumBatches; ++batch) {
    RunPerformanceTest<48, 48, 96, float>(48);
  }
  VLOG(0) << "40x96 48-batch-size test";
  for (int batch = 0; batch < kNumBatches; ++batch) {
    // 48x96 matrix, but only the first 40 output rows are written (masked).
    RunPerformanceTest<48, 48, 96, float>(40);
  }
  // These larger matrices are about the same amount of computation as one
  // 96-dimensional LSTM cell (without output softmax).
  VLOG(0) << "480x96 48-batch-size test";
  for (int batch = 0; batch < kNumBatches; ++batch) {
    RunPerformanceTest<48, 480, 96, float>(480);
#if defined(__F16C__)
    RunPerformanceTest<48, 480, 96, IeeeFloat16>(480);
#endif
    RunPerformanceTest<48, 480, 96, TruncatedFloat16>(480);
  }
  VLOG(0) << "472x96 48-batch-size test";
  for (int batch = 0; batch < kNumBatches; ++batch) {
    // 480-row matrix with a masked 472-row output.
    RunPerformanceTest<48, 480, 96, float>(472);
#if defined(__F16C__)
    RunPerformanceTest<48, 480, 96, IeeeFloat16>(472);
#endif
    RunPerformanceTest<48, 480, 96, TruncatedFloat16>(472);
  }
}
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
// Utility functions that can transform different matrix types. This includes
// non-trivial transposes, and converting vectors/etc. to the matrix types. This
// library should NOT be used for any performance-critical work, and should NOT
// be included at all in the mobile runtime.
#ifndef DRAGNN_RUNTIME_MATH_TRANSFORMATIONS_H_
#define DRAGNN_RUNTIME_MATH_TRANSFORMATIONS_H_
#include "dragnn/runtime/math/types.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
namespace internal {
// Puts a format-agnostic API on matrix-like data types. This is convenient, but
// has the downside of potential confusing compiler errors (when a
// specialization does not exist), and isn't suitable for optimizations like
// blocked transformations.
//
// Each overload maps a logical (row, col) coordinate to the element's storage
// location, for plain row-major matrices and for both blocked formats. The
// overloads are found from GenerateMatrix() via argument-dependent lookup on
// the matrix argument's type.
// Plain row-major matrix, mutable access.
template <class T>
T *GetMatrixElement(int row, int col, MatrixImpl<T> *matrix) {
  return &matrix->row(row)[col];
}
// Plain row-major matrix, read-only access.
template <class T>
const T &GetMatrixElement(int row, int col, const MatrixImpl<T> &matrix) {
  return matrix.row(row)[col];
}
// Row-blocked column-major: rows are grouped into blocks of block_size()
// rows; within block `row / block_size()`, column `col` is stored in vector
// `sub_matrix_idx * num_columns() + col` at offset `row % block_size()`.
template <class T>
T *GetMatrixElement(
    int row, int col,
    BlockedMatrixImpl<T, BlockedMatrixFormat::kRowBlockedColumnMajor> *matrix) {
  int sub_matrix_idx = row / matrix->block_size();
  int vector_idx = sub_matrix_idx * matrix->num_columns() + col;
  int element_idx = row % matrix->block_size();
  return &matrix->vector(vector_idx)[element_idx];
}
// Read-only variant of the above.
template <class T>
const T &GetMatrixElement(
    int row, int col,
    const BlockedMatrixImpl<T, BlockedMatrixFormat::kRowBlockedColumnMajor>
        &matrix) {
  int sub_matrix_idx = row / matrix.block_size();
  int vector_idx = sub_matrix_idx * matrix.num_columns() + col;
  int element_idx = row % matrix.block_size();
  return matrix.vector(vector_idx)[element_idx];
}
// Column-blocked row-major: the transpose layout, indexed symmetrically
// (columns grouped into blocks, row-major within each block).
template <class T>
T *GetMatrixElement(
    int row, int col,
    BlockedMatrixImpl<T, BlockedMatrixFormat::kColumnBlockedRowMajor> *matrix) {
  int sub_matrix_idx = col / matrix->block_size();
  int vector_idx = sub_matrix_idx * matrix->num_rows() + row;
  int element_idx = col % matrix->block_size();
  return &matrix->vector(vector_idx)[element_idx];
}
// Read-only variant of the above.
template <class T>
const T &GetMatrixElement(
    int row, int col,
    const BlockedMatrixImpl<T, BlockedMatrixFormat::kColumnBlockedRowMajor>
        &matrix) {
  int sub_matrix_idx = col / matrix.block_size();
  int vector_idx = sub_matrix_idx * matrix.num_rows() + row;
  int element_idx = col % matrix.block_size();
  return matrix.vector(vector_idx)[element_idx];
}
}  // namespace internal
// Generates values for a matrix, by calling a provided function on each
// row/column index. Thanks to the magic of templating, the function call should
// be inlined and not cause too much overhead being "called" on each index.
//
// |get_value| is invoked as `get_value(row, column)` in row-major order, and
// its result is assigned to the corresponding element of |output_matrix| via
// the GetMatrixElement() overloads above.
template <class Function, class OutputMatrix>
void GenerateMatrix(int num_rows, int num_columns, const Function &get_value,
                    OutputMatrix *output_matrix) {
  // Use int indices to match the int dimension parameters (and the int-typed
  // GetMatrixElement API); the previous size_t indices mixed signed and
  // unsigned comparison and would loop out of bounds if a negative dimension
  // were ever passed.
  for (int row = 0; row < num_rows; ++row) {
    for (int column = 0; column < num_columns; ++column) {
      *(GetMatrixElement(row, column, output_matrix)) = get_value(row, column);
    }
  }
}
// Copies the first |num_rows| rows and |num_columns| columns of input_matrix to
// output_matrix.
template <class InputMatrix, class OutputMatrix>
void CopyMatrixPrefix(const InputMatrix &input_matrix, int num_rows,
                      int num_columns, OutputMatrix *output_matrix) {
  // Capture by reference: the lambda is consumed synchronously by
  // GenerateMatrix() below, and capturing by value needlessly copied the
  // whole |input_matrix| object.
  const auto &get_value = [&input_matrix](int row, int column) {
    return GetMatrixElement(row, column, input_matrix);
  };
  GenerateMatrix(num_rows, num_columns, get_value, output_matrix);
}
// Copies matrices. The matrices can be of different types, but must have the
// same dimensions; dimension mismatches yield an InvalidArgument error.
template <class InputMatrix, class OutputMatrix>
tensorflow::Status CopyMatrix(const InputMatrix &input_matrix,
                              OutputMatrix *output_matrix) {
  const auto rows = input_matrix.num_rows();
  const auto columns = input_matrix.num_columns();
  if (rows != output_matrix->num_rows()) {
    return tensorflow::errors::InvalidArgument(
        "Input matrix num_rows (", rows,
        ") != output matrix num_rows (", output_matrix->num_rows(), ")");
  }
  if (columns != output_matrix->num_columns()) {
    return tensorflow::errors::InvalidArgument(
        "Input matrix num_columns (", columns,
        ") != output matrix num_columns (", output_matrix->num_columns(), ")");
  }
  CopyMatrixPrefix(input_matrix, rows, columns, output_matrix);
  return tensorflow::Status::OK();
}
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
#endif // DRAGNN_RUNTIME_MATH_TRANSFORMATIONS_H_
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/runtime/math/transformations.h"
#include "dragnn/runtime/test/helpers.h"
#include <gmock/gmock.h>
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/platform/test.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
namespace {
// Generates a matrix where each value is of the form `aa.bb`, where `aa` is
// the row index and `bb` is the column index, and checks the result.
TEST(TransformationsTest, GenerateRowColIdxMatrix) {
  UniqueMatrix<float> generated(5, 5);
  const auto encode_indices = [](int row, int col) {
    return static_cast<float>(row) + (col / 100.0f);
  };
  GenerateMatrix(5, 5, encode_indices, generated.get());
  ExpectMatrix(Matrix<float>(*generated),
               {{0.0f, 0.01f, 0.02f, 0.03f, 0.04f},
                {1.0f, 1.01f, 1.02f, 1.03f, 1.04f},
                {2.0f, 2.01f, 2.02f, 2.03f, 2.04f},
                {3.0f, 3.01f, 3.02f, 3.03f, 3.04f},
                {4.0f, 4.01f, 4.02f, 4.03f, 4.04f}});
}
// CopyMatrix between two plain (unblocked) matrices of equal shape simply
// overwrites the destination's contents.
TEST(TransformationsTest, CopiesMatrix) {
  UniqueMatrix<float> source({{1, 2}}), destination({{3, 4}});
  TF_EXPECT_OK(CopyMatrix(*source, destination.get()));
  EXPECT_EQ(destination->row(0)[0], 1);
  EXPECT_EQ(destination->row(0)[1], 2);
}
// Copies an 8x3 row-major matrix into a row-blocked column-major matrix and
// checks the raw backing storage. With 24 elements in a 6x4 backing matrix,
// the block size is 4: each group of 4 source rows becomes one column-major
// sub-matrix, and the sub-matrices are concatenated.
TEST(TransformationsTest, CopiesRowBlockedMatrix) {
  UniqueMatrix<double> source({{1, 2, 3},     //
                               {4, 5, 6},     //
                               {7, 8, 9},     //
                               {10, 11, 12},  //
                               {13, 14, 15},  //
                               {16, 17, 18},  //
                               {19, 20, 21},  //
                               {22, 23, 24}});
  UniqueMatrix<double> dst_mem(6, 4);
  MutableBlockedMatrix<double, BlockedMatrixFormat::kRowBlockedColumnMajor>
      blocked;
  TF_EXPECT_OK(blocked.Reset(dst_mem.area(), 8, 3));
  TF_EXPECT_OK(CopyMatrix(*source, &blocked));
  // Each backing row below is one column of a 4x3 sub-matrix.
  ExpectMatrix(Matrix<double>(*dst_mem), {{1, 4, 7, 10},    //
                                          {2, 5, 8, 11},    //
                                          {3, 6, 9, 12},    //
                                          {13, 16, 19, 22},  //
                                          {14, 17, 20, 23},  //
                                          {15, 18, 21, 24}});
}
// This test is the same as the above, except everything is transposed: a 3x8
// source is copied into a MutableBlockedMatrix declared without an explicit
// format (so it uses the default, which — per the expected layout below — is
// the column-blocked row-major format), and the raw storage must come out
// identical to the row-blocked case.
TEST(TransformationsTest, CopiesColumnBlockedMatrix) {
  UniqueMatrix<double> source(  //
      {{1, 4, 7, 10, 13, 16, 19, 22},  //
       {2, 5, 8, 11, 14, 17, 20, 23},  //
       {3, 6, 9, 12, 15, 18, 21, 24}});
  UniqueMatrix<double> dst_mem(6, 4);
  MutableBlockedMatrix<double> blocked;
  TF_EXPECT_OK(blocked.Reset(dst_mem.area(), 3, 8));
  TF_EXPECT_OK(CopyMatrix(*source, &blocked));
  ExpectMatrix(Matrix<double>(*dst_mem), {{1, 4, 7, 10},    //
                                          {2, 5, 8, 11},    //
                                          {3, 6, 9, 12},    //
                                          {13, 16, 19, 22},  //
                                          {14, 17, 20, 23},  //
                                          {15, 18, 21, 24}});
}
} // namespace
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
// Mathematical types.
#ifndef DRAGNN_RUNTIME_MATH_TYPES_H_
#define DRAGNN_RUNTIME_MATH_TYPES_H_
#include <stddef.h>
#include <limits>
#include "dragnn/runtime/alignment.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/logging.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
// Blocked matrix formats, for fast inference routines.
//
// The enum only selects which dimension is blocked and the within-block
// element order; the block size itself is a runtime property of the blocked
// matrix types (e.g. block_size() on the BlockedMatrixImpl types below).
enum class BlockedMatrixFormat {
  // Represents a row-blocked block-column-major matrix. In other words, first
  // split a matrix M into
  //
  //   [ M_1
  //     ...
  //     M_m ]
  //
  // sub-matrices, where each M_i is a `block_size x n` sub-matrix. Then each
  // M_i is formatted in column-major order, and the sub-matrices' data is
  // concatenated together.
  kRowBlockedColumnMajor,
  // Represents a column-blocked block-row-major matrix. This is the
  // transpose of the above. A matrix M is split into
  //
  //   [ M_1 ... M_n ]
  //
  // sub-matrices, where each M_i is a `m x block_size` sub-matrix. Then each
  // M_i is formatted in row-major order, and the sub-matrices' data is
  // concatenated together.
  kColumnBlockedRowMajor,
};
namespace internal {
// An aligned vector of values. Do not use this class directly, instead use
// (Mutable)Vector below.
//
// NOTE: this is a non-owning view — it holds only a raw pointer and size, so
// the underlying storage must outlive the VectorImpl.
template <class T>
class VectorImpl {
 public:
  static_assert(IsAlignable<T>(), "T must be alignable");
  // Creates an empty vector.
  VectorImpl() = default;
  // Points this at the |view|, which must be evenly divisible into Ts.
  template <class Byte>
  explicit VectorImpl(AlignedViewImpl<Byte> view);
  // Points this at a prefix of the |view| containing |size| Ts. The |view|
  // must span at least |size| * sizeof(T) bytes.
  template <class Byte>
  VectorImpl(AlignedViewImpl<Byte> view, size_t size);
  // Points this at the same values as |that|, possibly reinterpreting type.
  template <class U>
  explicit VectorImpl(VectorImpl<U> that);
  template <class U>
  VectorImpl &operator=(VectorImpl<U> that);
  // Enables range-based for loops.
  T *begin() const { return data(); }
  T *end() const { return begin() + size(); }
  // Accessors.
  T *data() const { return data_; }
  size_t size() const { return size_; }
  bool empty() const { return size() == 0; }
  T &operator[](size_t index) const;
  // Gets a sub-vector starting at |start| with |size| elements.
  VectorImpl<T> Subsequence(size_t start, size_t size) const;
 private:
  // Matrix types build row/sub-vectors directly via the private unchecked
  // constructor below.
  template <class U>
  friend class MatrixImpl;
  template <class U, BlockedMatrixFormat format>
  friend class BlockedMatrixImpl;
  // Points this at [|data|,|data|+|size|), bypassing alignment checks.
  VectorImpl(T *data, size_t size);
  // Pointer to the start of the vector.
  T *data_ = nullptr;
  // Number of values in the vector.
  size_t size_ = 0;
};
// Returns the format corresponding to the transpose of the |format|.
// Constexpr so it can be used in template arguments (see Transpose() below).
constexpr BlockedMatrixFormat TransposeBlockedMatrixFormat(
    BlockedMatrixFormat format);
// A row-major matrix where each row or column is aligned. Do not use this
// class directly, instead use (Mutable)Matrix below.
//
// Like VectorImpl, this is a non-owning view. Rows may be padded: the
// distance between consecutive rows is |row_stride_|, which can exceed
// |num_columns_|.
template <class T>
class MatrixImpl {
 public:
  static_assert(IsAlignable<T>(), "T must be alignable");

  // Creates an empty matrix.
  MatrixImpl() = default;

  // Points each row of this matrix at the corresponding sub-view of the |area|.
  // Each view in the |area| must be evenly divisible into Ts.
  template <class Byte>
  explicit MatrixImpl(AlignedAreaImpl<Byte> area);

  // Creates a matrix from a single vector. Assumes that the vector's stride is
  // the minimum alignment padding.
  explicit MatrixImpl(VectorImpl<T> single_vector);

  // Points this at the same values as |that|.
  template <class U>
  explicit MatrixImpl(MatrixImpl<U> that);
  template <class U>
  MatrixImpl &operator=(MatrixImpl<U> that);

  // Accessors.
  T *data() const { return data_; }
  size_t num_rows() const { return num_rows_; }
  size_t num_columns() const { return num_columns_; }
  size_t row_stride() const { return row_stride_; }
  VectorImpl<T> row(size_t index) const;

 private:
  template <class U>
  friend class MatrixImpl;

  // Pointer to the start of the matrix.
  T *data_ = nullptr;

  // Number of rows and columns in the matrix.
  size_t num_rows_ = 0;
  size_t num_columns_ = 0;

  // Distance between the starts of consecutive rows, in Ts (not bytes).
  size_t row_stride_ = 0;
};
// Blocked matrix representation. See BlockedMatrixFormat for details.
// Non-owning view, like the other types in this header.
template <class T, BlockedMatrixFormat format>
class BlockedMatrixImpl {
 public:
  static_assert(IsAlignable<T>(), "T must be alignable");

  // These aliases allow templated code to reach back in and get template
  // parameters, like std::vector<T>::iterator::value aliases.
  using ElementType = T;
  static constexpr bool IsRowBlocked() {
    return format == BlockedMatrixFormat::kRowBlockedColumnMajor;
  }

  // Creates an empty matrix.
  BlockedMatrixImpl() = default;

  // Creates a copy of this matrix, using the same values (underlying area), but
  // possibly re-interpreting the type. The new type U must be the same size,
  // and `T *` must be implicitly convertible to `U *` (usually just adding
  // "const" qualifiers, but theoretically it could be a superclass).
  template <class U>
  explicit BlockedMatrixImpl(BlockedMatrixImpl<U, format> that);
  template <class U>
  BlockedMatrixImpl &operator=(BlockedMatrixImpl<U, format> that);

  // Creates a new view that's const-qualified, in particular converting
  // MutableBlockedMatrix to BlockedMatrix.
  BlockedMatrixImpl<const T, format> AsConst() const {
    return BlockedMatrixImpl<const T, format>(*this);
  }

  // Initializes the matrix. Raises errors if the matrix dimensions are
  // incompatible with the underlying area, namely if the number of views in
  // `area` do not cover the whole matrix, and also if the matrix cannot be
  // blocked according to (template parameter) `format`.
  //
  // Further, because this class is used for (delicate / specialized) optimized
  // inference routines, it is also required that no padding is present, i.e.
  // that the block size is divisible by kAlignmentBytes (currently 32).
  template <class Byte>
  tensorflow::Status Reset(AlignedAreaImpl<Byte> area, size_t num_rows,
                           size_t num_columns);

  // Returns the transpose of this.
  BlockedMatrixImpl<T, TransposeBlockedMatrixFormat(format)> Transpose() const;

  // Accessors.
  size_t num_rows() const { return num_rows_; }
  size_t num_columns() const { return num_columns_; }
  size_t block_size() const { return block_size_; }
  size_t num_vectors() const { return num_vectors_; }
  VectorImpl<T> vector(size_t index) const;

 private:
  template <class U, BlockedMatrixFormat other_format>
  friend class BlockedMatrixImpl;

  // This is the same as calling Reset(), except the area is not checked.
  template <class Byte>
  explicit BlockedMatrixImpl(AlignedAreaImpl<Byte> area, int num_rows,
                             int num_columns);

  // Pointer to the start of the matrix.
  T *data_ = nullptr;

  // Number of rows and columns in the matrix. Unlike MatrixImpl, there is no
  // API for directly accessing rows and columns, but it's necessary for any
  // algorithm (e.g. matrix-vector multiplication) to know the logical shape.
  size_t num_rows_ = 0;
  size_t num_columns_ = 0;
  size_t block_size_ = 0;   // in T's
  size_t num_vectors_ = 0;  // = num_rows * num_columns / block_size
};
} // namespace internal
// Public aliases; use these. The plain variants are read-only views (const
// element type); the Mutable* variants allow writes. All are non-owning.
template <class T>
using Vector = internal::VectorImpl<const T>;
template <class T>
using Matrix = internal::MatrixImpl<const T>;
template <class T, BlockedMatrixFormat format =
                       BlockedMatrixFormat::kColumnBlockedRowMajor>
using BlockedMatrix = internal::BlockedMatrixImpl<const T, format>;
template <class T>
using MutableVector = internal::VectorImpl<T>;
template <class T>
using MutableMatrix = internal::MatrixImpl<T>;
template <class T, BlockedMatrixFormat format =
                       BlockedMatrixFormat::kColumnBlockedRowMajor>
using MutableBlockedMatrix = internal::BlockedMatrixImpl<T, format>;
// Implementation details below.
namespace internal {
// Reinterprets the bytes of the entire |view| as Ts. Debug builds check that
// the view divides evenly into Ts; the truncating division is exact then.
template <class T>
template <class Byte>
VectorImpl<T>::VectorImpl(AlignedViewImpl<Byte> view)
    : data_(reinterpret_cast<T *>(view.data())),
      size_(view.size() / sizeof(T)) {
  DCHECK_EQ(view.size() % sizeof(T), 0);
}
// Takes a |size|-element prefix of the |view|. Debug builds check that the
// view is large enough to hold the prefix.
template <class T>
template <class Byte>
VectorImpl<T>::VectorImpl(AlignedViewImpl<Byte> view, size_t size)
    : data_(reinterpret_cast<T *>(view.data())), size_(size) {
  DCHECK_LE(size * sizeof(T), view.size());
}
// Type-reinterpreting copy constructor; typically converts T* to const T*.
// Requires equal sizes so the element count stays meaningful.
template <class T>
template <class U>
VectorImpl<T>::VectorImpl(VectorImpl<U> that)
    : data_(that.data()), size_(that.size()) {
  static_assert(sizeof(T) == sizeof(U), "T and U must be the same size");
}
// Type-reinterpreting assignment; shallow-copies the pointer and size.
template <class T>
template <class U>
VectorImpl<T> &VectorImpl<T>::operator=(VectorImpl<U> that) {
  static_assert(sizeof(T) == sizeof(U), "T and U must be the same size");
  data_ = that.data();
  size_ = that.size();
  return *this;
}
// Element access; bounds-checked in debug builds only.
template <class T>
T &VectorImpl<T>::operator[](size_t index) const {
  DCHECK_LT(index, size());
  return data_[index];
}
// Private raw-pointer constructor used by the matrix types; still verifies
// alignment of |data| in debug builds.
template <class T>
VectorImpl<T>::VectorImpl(T *data, size_t size) : data_(data), size_(size) {
  TF_DCHECK_OK(OkIfAligned(data));
}
// Returns the sub-vector [|start|, |start| + |size|). Debug builds check that
// the range lies within this vector. Note that the result is only created via
// the alignment-checking private constructor, so |start| must also preserve
// alignment in debug builds.
template <class T>
VectorImpl<T> VectorImpl<T>::Subsequence(size_t start, size_t size) const {
  DCHECK_LE(start + size, size_);
  return VectorImpl<T>(&data_[start], size);
}
// Swaps the two blocked formats: row-blocked <-> column-blocked.
constexpr BlockedMatrixFormat TransposeBlockedMatrixFormat(
    BlockedMatrixFormat format) {
  return format != BlockedMatrixFormat::kRowBlockedColumnMajor
             ? BlockedMatrixFormat::kRowBlockedColumnMajor
             : BlockedMatrixFormat::kColumnBlockedRowMajor;
}
// Wraps a single vector as a 1-row matrix. The row stride is the vector size
// rounded up to the alignment boundary (in Ts), matching the minimum padding
// an aligned area would use.
template <class T>
MatrixImpl<T>::MatrixImpl(VectorImpl<T> single_vector)
    : data_(single_vector.data()),
      num_rows_(1),
      num_columns_(single_vector.size()),
      row_stride_(PadToAlignment(single_vector.size() * sizeof(T)) /
                  sizeof(T)) {}
// Wraps an aligned area as a matrix: one row per view. Debug builds check
// that both the view size and stride divide evenly into Ts.
template <class T>
template <class Byte>
MatrixImpl<T>::MatrixImpl(AlignedAreaImpl<Byte> area)
    : data_(reinterpret_cast<T *>(area.data())),
      num_rows_(area.num_views()),
      num_columns_(area.view_size() / sizeof(T)),
      row_stride_(area.view_stride() / sizeof(T)) {
  DCHECK_EQ(area.view_size() % sizeof(T), 0);
  DCHECK_EQ(area.view_stride() % sizeof(T), 0);
}
// Type-reinterpreting copy constructor; typically converts T* to const T*.
template <class T>
template <class U>
MatrixImpl<T>::MatrixImpl(MatrixImpl<U> that)
    : data_(that.data_),
      num_rows_(that.num_rows()),
      num_columns_(that.num_columns()),
      row_stride_(that.row_stride_) {
  static_assert(sizeof(T) == sizeof(U), "T and U must be the same size");
}
// Type-reinterpreting assignment; shallow-copies pointer and dimensions.
template <class T>
template <class U>
MatrixImpl<T> &MatrixImpl<T>::operator=(MatrixImpl<U> that) {
  static_assert(sizeof(T) == sizeof(U), "T and U must be the same size");
  data_ = that.data_;
  num_rows_ = that.num_rows();
  num_columns_ = that.num_columns();
  row_stride_ = that.row_stride_;
  return *this;
}
// Returns a vector view of row |index|; bounds-checked in debug builds.
template <class T>
VectorImpl<T> MatrixImpl<T>::row(size_t index) const {
  DCHECK_LT(index, num_rows());
  // Note that |row_stride_|, not |num_columns_|, determines the start of the
  // row. The former is aligned and may stride over a wider span than normal
  // when this is a "slice" of a larger matrix.
  return VectorImpl<T>(data_ + row_stride_ * index, num_columns());
}
// Type-reinterpreting copy constructor; typically converts T* to const T*.
// The format must match; only the element type may be reinterpreted.
template <class T, BlockedMatrixFormat format>
template <class U>
BlockedMatrixImpl<T, format>::BlockedMatrixImpl(
    BlockedMatrixImpl<U, format> that)
    : data_(that.data_),
      num_rows_(that.num_rows()),
      num_columns_(that.num_columns()),
      block_size_(that.block_size()),
      num_vectors_(that.num_vectors()) {
  static_assert(sizeof(T) == sizeof(U), "T and U must be the same size");
}
// Type-reinterpreting assignment; shallow-copies pointer and dimensions.
template <class T, BlockedMatrixFormat format>
template <class U>
BlockedMatrixImpl<T, format> &BlockedMatrixImpl<T, format>::operator=(
    BlockedMatrixImpl<U, format> that) {
  static_assert(sizeof(T) == sizeof(U), "T and U must be the same size");
  data_ = that.data_;
  num_rows_ = that.num_rows();
  num_columns_ = that.num_columns();
  block_size_ = that.block_size();
  num_vectors_ = that.num_vectors();
  return *this;
}
// Points this matrix at the |area| with the given logical dimensions.
//
// All validation happens before any member is modified, so |this| is left
// unchanged when an error is returned. (The previous version assigned members
// first and could also divide by zero when the view was smaller than one T.)
template <class T, BlockedMatrixFormat format>
template <class Byte>
tensorflow::Status BlockedMatrixImpl<T, format>::Reset(
    AlignedAreaImpl<Byte> area, size_t num_rows, size_t num_columns) {
  // The optimized inference routines built on this class require that blocks
  // are packed back-to-back with no alignment padding between them.
  if (area.view_stride() != area.view_size()) {
    return tensorflow::errors::InvalidArgument(
        "Padding is not supported for blocked matrix formats. Underlying area "
        "has size ",
        area.view_size(), " which is padded to stride ", area.view_stride(),
        ".");
  }
  if (area.view_size() % sizeof(T) != 0) {
    return tensorflow::errors::InvalidArgument(
        "View size ", area.view_size(),
        " is not a multiple of the templated type's size, ", sizeof(T));
  }
  const size_t block_size = area.view_size() / sizeof(T);
  if (block_size == 0) {
    // Guards the divisions below against division by zero.
    return tensorflow::errors::InvalidArgument(
        "View size ", area.view_size(), " yields an empty block");
  }
  const size_t num_vectors = num_rows * num_columns / block_size;
  if (num_vectors != area.num_views()) {
    return tensorflow::errors::InvalidArgument("Area has ", area.num_views(),
                                               " views, but should have ",
                                               num_vectors);
  }
  // The block dimension must divide rows or columns evenly.
  const size_t divided_dimension = IsRowBlocked() ? num_rows : num_columns;
  if (divided_dimension % block_size != 0) {
    return tensorflow::errors::InvalidArgument(
        IsRowBlocked() ? "row" : "column",
        "-blocked matrix has major dimension ", divided_dimension,
        " which is not divisible by the block size, ", block_size);
  }

  // Success; commit the new configuration.
  data_ = reinterpret_cast<T *>(area.view(0).data());
  num_rows_ = num_rows;
  num_columns_ = num_columns;
  block_size_ = block_size;
  num_vectors_ = num_vectors;
  return tensorflow::Status::OK();
}
// Returns the |index|-th block vector; blocks are laid out back-to-back (no
// padding; see Reset()), so the offset is simply block_size_ * index.
// Bounds-checked in debug builds.
template <class T, BlockedMatrixFormat format>
VectorImpl<T> BlockedMatrixImpl<T, format>::vector(size_t index) const {
  DCHECK_LT(index, num_vectors_);
  return VectorImpl<T>(data_ + block_size_ * index, block_size_);
}
// Returns the transpose as a view over the same data: the format flips (see
// TransposeBlockedMatrixFormat) and rows/columns swap, while the underlying
// blocks are untouched. No data is copied.
template <class T, BlockedMatrixFormat format>
BlockedMatrixImpl<T, TransposeBlockedMatrixFormat(format)>
BlockedMatrixImpl<T, format>::Transpose() const {
  BlockedMatrixImpl<T, TransposeBlockedMatrixFormat(format)> result;
  result.data_ = data_;
  result.num_columns_ = num_rows_;
  result.num_rows_ = num_columns_;
  result.block_size_ = block_size_;
  result.num_vectors_ = num_vectors_;
  return result;
}
} // namespace internal
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
#endif // DRAGNN_RUNTIME_MATH_TYPES_H_
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/runtime/math/types.h"
#include <stddef.h>
#include <string.h>
#include <set>
#include "dragnn/core/test/generic.h"
#include "dragnn/runtime/alignment.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/platform/test.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
namespace {
// Creates an aligned pointer that is invalid. This is useful for creating
// proxy areas for testing, whose real data should never be accessed. We
// manually tested that if this pointer is dereferenced, a segmentation fault
// will be thrown.
char *InvalidAlignedPointer() {
  return reinterpret_cast<char *>(3 * internal::kAlignmentBytes);
}
// Expects that two pointers point to the same address. Taking void* lets
// callers compare pointers of unrelated element types.
void ExpectSameAddress(const void *ptr1, const void *ptr2) {
  EXPECT_EQ(ptr1, ptr2);
}
// Returns true if |a| and |b| are byte-wise identical. Used to verify the
// shallow-copy semantics of the view types, whose members are plain pointers
// and sizes.
template <class A, class B>
bool StructsEqual(const A &a, const B &b) {
  static_assert(sizeof(A) == sizeof(B),
                "StructsEqual must be given structs of the same size.");
  return memcmp(&a, &b, sizeof(A)) == 0;
}
// Tests that both vector flavors are empty when default-constructed.
TEST(VectorTest, EmptyByDefault) {
  const Vector<int> readonly_vector;
  const MutableVector<int> writable_vector;
  EXPECT_TRUE(readonly_vector.empty());
  EXPECT_EQ(readonly_vector.size(), 0);
  EXPECT_TRUE(writable_vector.empty());
  EXPECT_EQ(writable_vector.size(), 0);
}
// Tests that both vector flavors can wrap an entire aligned view.
TEST(VectorTest, ConstructFromView) {
  char *base = InvalidAlignedPointer();
  MutableAlignedView view;
  TF_ASSERT_OK(view.Reset(base, 10 * sizeof(int)));

  const Vector<int> readonly_vector(view);
  EXPECT_EQ(readonly_vector.size(), 10);
  EXPECT_FALSE(readonly_vector.empty());
  ExpectSameAddress(readonly_vector.data(), base);

  const MutableVector<int> writable_vector(view);
  EXPECT_EQ(writable_vector.size(), 10);
  EXPECT_FALSE(writable_vector.empty());
  ExpectSameAddress(writable_vector.data(), base);
}
// Tests that (Mutable)Vector can be initialized from a prefix of a view.
// The view holds 10 ints; the vectors deliberately use fewer.
TEST(VectorTest, ConstructFromViewPrefix) {
  MutableAlignedView view;
  char *ptr = InvalidAlignedPointer();
  TF_ASSERT_OK(view.Reset(ptr, 10 * sizeof(int)));

  // Use a prefix of 3 of the 10 available ints in the |view|.
  const Vector<int> vector1(view, 3);
  ExpectSameAddress(vector1.data(), ptr);
  EXPECT_EQ(vector1.size(), 3);
  EXPECT_FALSE(vector1.empty());

  // Use a prefix of 5 of the 10 available ints in the |view|.
  const MutableVector<int> vector2(view, 5);
  ExpectSameAddress(vector2.data(), ptr);
  EXPECT_EQ(vector2.size(), 5);
  EXPECT_FALSE(vector2.empty());
}
// Tests that (Mutable)Vector supports copy-construction and assignment with
// shallow-copy semantics, and reinterprets from T* to const T*. All copies
// alias the same underlying storage; no element data is touched.
TEST(VectorTest, CopyAndAssign) {
  MutableAlignedView view;
  char *ptr = InvalidAlignedPointer();
  TF_ASSERT_OK(view.Reset(ptr, 10 * sizeof(int)));
  const MutableVector<int> vector1(view);

  // Copy-construct from another vector.
  MutableVector<int> vector2(vector1);
  ExpectSameAddress(vector2.data(), ptr);
  EXPECT_EQ(vector2.size(), 10);
  EXPECT_FALSE(vector2.empty());

  // Assign from an empty vector, effectively clearing it.
  vector2 = MutableVector<int>();
  EXPECT_EQ(vector2.size(), 0);
  EXPECT_TRUE(vector2.empty());

  // Assign from the original vector.
  vector2 = vector1;
  ExpectSameAddress(vector2.data(), ptr);
  EXPECT_EQ(vector2.size(), 10);
  EXPECT_FALSE(vector2.empty());

  // Copy-construct from another vector. Note that this reinterprets type.
  Vector<int> vector3(vector1);
  ExpectSameAddress(vector3.data(), ptr);
  EXPECT_EQ(vector3.size(), 10);
  EXPECT_FALSE(vector3.empty());

  // Assign from an empty vector, effectively clearing it.
  vector3 = Vector<int>();
  EXPECT_EQ(vector3.size(), 0);
  EXPECT_TRUE(vector3.empty());

  // Assign from another vector. Note that this reinterprets type.
  vector3 = vector2;
  ExpectSameAddress(vector3.data(), ptr);
  EXPECT_EQ(vector3.size(), 10);
  EXPECT_FALSE(vector3.empty());
}
// Tests that (Mutable)Vector supports access via operator[]. Uses a real
// allocation (UniqueAlignedArray) since elements are actually dereferenced.
TEST(VectorTest, Subscript) {
  UniqueAlignedArray array;
  array.Reset(10 * sizeof(float));

  // Write into a mutable vector.
  const MutableVector<float> mutable_vector(array.view());
  ASSERT_EQ(mutable_vector.size(), 10);
  for (int i = 0; i < 10; ++i) mutable_vector[i] = i;

  // Read from a const vector that points at the same values.
  const Vector<float> const_vector(array.view());
  ASSERT_EQ(const_vector.size(), 10);
  for (int i = 0; i < 10; ++i) EXPECT_EQ(const_vector[i], i);
}
// Tests the subsequence operator by splitting a vector into two aligned
// halves and checking each half's contents.
TEST(VectorTest, Subsequence) {
  // Debug checks will fail if either of the constructed vectors is not aligned.
  constexpr int numAlignedFloats = internal::kAlignmentBytes / sizeof(float);
  UniqueAlignedArray array;
  array.Reset(2 * numAlignedFloats * sizeof(float));

  // Write into a mutable vector.
  const MutableVector<float> mutable_vector(array.view());
  for (int i = 0; i < 2 * numAlignedFloats; ++i) mutable_vector[i] = i;

  // Subscript beginning.
  Vector<float> first_half(mutable_vector.Subsequence(0, numAlignedFloats));
  ASSERT_EQ(first_half.size(), numAlignedFloats);
  for (int i = 0; i < numAlignedFloats; ++i) {
    EXPECT_EQ(first_half[i], i);
  }

  // Subscript end.
  Vector<float> second_half(
      mutable_vector.Subsequence(numAlignedFloats, numAlignedFloats));
  ASSERT_EQ(second_half.size(), numAlignedFloats);
  for (int i = 0; i < numAlignedFloats; ++i) {
    EXPECT_EQ(second_half[i], i + numAlignedFloats);
  }
}
// Tests that (Mutable)Vector supports access via range-based for loops,
// exercising begin()/end() for both writing and reading.
TEST(VectorTest, RangeBasedFor) {
  UniqueAlignedArray array;
  array.Reset(10 * sizeof(float));

  // Write into a mutable vector.
  const MutableVector<float> mutable_vector(array.view());
  ASSERT_EQ(mutable_vector.size(), 10);
  float counter = 0.0;
  for (float &value : mutable_vector) value = counter++;

  // Read from a const vector that points at the same values.
  const Vector<float> const_vector(array.view());
  ASSERT_EQ(const_vector.size(), 10);
  counter = 0.0;
  for (const float &value : const_vector) EXPECT_EQ(value, counter++);
}
// Tests that both matrix flavors have zero dimensions when
// default-constructed.
TEST(MatrixTest, EmptyByDefault) {
  const Matrix<int> readonly_matrix;
  const MutableMatrix<int> writable_matrix;
  EXPECT_EQ(readonly_matrix.num_rows(), 0);
  EXPECT_EQ(readonly_matrix.num_columns(), 0);
  EXPECT_EQ(readonly_matrix.row_stride(), 0);
  EXPECT_EQ(writable_matrix.num_rows(), 0);
  EXPECT_EQ(writable_matrix.num_columns(), 0);
  EXPECT_EQ(writable_matrix.row_stride(), 0);
}
// Tests that (Mutable)Matrix can be constructed from an area, checking that
// dimensions and row stride (which includes alignment padding) are derived
// correctly from the area layout.
TEST(MatrixTest, ConstructFromArea) {
  MutableAlignedView view;
  char *ptr = InvalidAlignedPointer();
  const size_t kNumRows = 11;
  const size_t kNumColumns = 13;
  const size_t kRowBytes = kNumColumns * sizeof(int);
  const size_t kRowStride = PadToAlignment(kRowBytes) / sizeof(int);
  const size_t bytes = ComputeAlignedAreaSize(kNumRows, kRowBytes);
  TF_ASSERT_OK(view.Reset(ptr, bytes));
  MutableAlignedArea area;
  TF_ASSERT_OK(area.Reset(view, kNumRows, kRowBytes));

  const Matrix<int> matrix1(area);
  EXPECT_EQ(matrix1.num_rows(), kNumRows);
  EXPECT_EQ(matrix1.num_columns(), kNumColumns);
  EXPECT_EQ(matrix1.row_stride(), kRowStride);
  ExpectSameAddress(matrix1.row(0).data(), ptr);
  ExpectSameAddress(matrix1.data(), ptr);

  const MutableMatrix<int> matrix2(area);
  EXPECT_EQ(matrix2.num_rows(), kNumRows);
  EXPECT_EQ(matrix2.num_columns(), kNumColumns);
  EXPECT_EQ(matrix2.row_stride(), kRowStride);
  ExpectSameAddress(matrix2.row(0).data(), ptr);
  ExpectSameAddress(matrix2.data(), ptr);
}
// Tests that (Mutable)Matrix supports copy-construction and assignment with
// shallow-copy semantics, and reinterprets from T* to const T*. All copies
// alias the same underlying storage; no element data is touched.
TEST(MatrixTest, CopyAndAssign) {
  MutableAlignedView view;
  char *ptr = InvalidAlignedPointer();
  const size_t kNumRows = 11;
  const size_t kNumColumns = 13;
  const size_t kRowBytes = kNumColumns * sizeof(int);
  const size_t kRowStride = PadToAlignment(kRowBytes) / sizeof(int);
  const size_t bytes = ComputeAlignedAreaSize(kNumRows, kRowBytes);
  TF_ASSERT_OK(view.Reset(ptr, bytes));
  MutableAlignedArea area;
  TF_ASSERT_OK(area.Reset(view, kNumRows, kRowBytes));
  const MutableMatrix<int> matrix1(area);
  EXPECT_EQ(matrix1.num_rows(), kNumRows);
  EXPECT_EQ(matrix1.num_columns(), kNumColumns);
  EXPECT_EQ(matrix1.row_stride(), kRowStride);
  ExpectSameAddress(matrix1.row(0).data(), ptr);
  ExpectSameAddress(matrix1.data(), ptr);

  // Copy-construct from another matrix.
  MutableMatrix<int> matrix2(matrix1);
  EXPECT_EQ(matrix2.num_rows(), kNumRows);
  EXPECT_EQ(matrix2.num_columns(), kNumColumns);
  EXPECT_EQ(matrix2.row_stride(), kRowStride);
  ExpectSameAddress(matrix2.row(0).data(), ptr);
  ExpectSameAddress(matrix2.data(), ptr);

  // Assign from an empty matrix, effectively clearing it.
  matrix2 = MutableMatrix<int>();
  EXPECT_EQ(matrix2.num_rows(), 0);
  EXPECT_EQ(matrix2.num_columns(), 0);
  EXPECT_EQ(matrix2.row_stride(), 0);

  // Assign from the original matrix.
  matrix2 = matrix1;
  EXPECT_EQ(matrix2.num_rows(), kNumRows);
  EXPECT_EQ(matrix2.num_columns(), kNumColumns);
  EXPECT_EQ(matrix2.row_stride(), kRowStride);
  ExpectSameAddress(matrix2.row(0).data(), ptr);
  ExpectSameAddress(matrix2.data(), ptr);

  // Copy-construct from another matrix. Note that this reinterprets type.
  Matrix<int> matrix3(matrix2);
  EXPECT_EQ(matrix3.num_rows(), kNumRows);
  EXPECT_EQ(matrix3.num_columns(), kNumColumns);
  EXPECT_EQ(matrix3.row_stride(), kRowStride);
  ExpectSameAddress(matrix3.row(0).data(), ptr);
  ExpectSameAddress(matrix3.data(), ptr);

  // Assign from an empty matrix, effectively clearing it.
  matrix3 = Matrix<int>();
  EXPECT_EQ(matrix3.num_rows(), 0);
  EXPECT_EQ(matrix3.num_columns(), 0);
  EXPECT_EQ(matrix3.row_stride(), 0);

  // Assign from the original matrix. Note that this reinterprets type.
  matrix3 = matrix1;
  EXPECT_EQ(matrix3.num_rows(), kNumRows);
  EXPECT_EQ(matrix3.num_columns(), kNumColumns);
  EXPECT_EQ(matrix3.row_stride(), kRowStride);
  ExpectSameAddress(matrix3.row(0).data(), ptr);
  ExpectSameAddress(matrix3.data(), ptr);
}
// Tests that (Mutable)Matrix supports row access. Writes a distinct value
// into every cell via a mutable matrix, then reads the values back through a
// const matrix aliasing the same storage.
TEST(MatrixTest, Rows) {
  const size_t kNumRows = 11;
  const size_t kNumColumns = 13;
  const size_t bytes =
      ComputeAlignedAreaSize(kNumRows, kNumColumns * sizeof(float));
  UniqueAlignedArray array;
  array.Reset(bytes);
  MutableAlignedArea area;
  TF_ASSERT_OK(area.Reset(array.view(), kNumRows, kNumColumns * sizeof(float)));

  // Write to a mutable matrix.
  const MutableMatrix<float> mutable_matrix(area);
  ASSERT_EQ(mutable_matrix.num_rows(), kNumRows);
  ASSERT_EQ(mutable_matrix.num_columns(), kNumColumns);
  for (size_t row = 0; row < kNumRows; ++row) {
    for (size_t column = 0; column < kNumColumns; ++column) {
      mutable_matrix.row(row)[column] = row * 1000.0 + column;
    }
  }

  // Read from a const matrix that points at the same values.
  const Matrix<float> const_matrix(area);
  ASSERT_EQ(const_matrix.num_rows(), kNumRows);
  ASSERT_EQ(const_matrix.num_columns(), kNumColumns);
  for (size_t row = 0; row < kNumRows; ++row) {
    for (size_t column = 0; column < kNumColumns; ++column) {
      EXPECT_EQ(const_matrix.row(row)[column], row * 1000.0 + column);
    }
  }
}
// Tests the single-vector matrix constructor: the result is a 1-row matrix
// aliasing the vector's data. Sweeps column counts from 0 (empty vector)
// upward to cover various amounts of alignment padding.
TEST(MatrixTest, MatrixFromVector) {
  for (int cols = 0; cols < 100; ++cols) {
    MutableAlignedView view;
    char *ptr = InvalidAlignedPointer();
    TF_ASSERT_OK(view.Reset(ptr, cols * sizeof(int)));
    const MutableVector<int> vector(view);
    const MutableMatrix<int> matrix(vector);
    ASSERT_EQ(matrix.row(0).data(), vector.data());
    ExpectSameAddress(matrix.data(), vector.data());
    ASSERT_EQ(matrix.num_rows(), 1);
    ASSERT_EQ(matrix.num_columns(), vector.size());
  }
}
// Typed test fixture: each blocked-matrix test below runs once per format
// (row- and column-blocked) and per element type (float and int64).
template <class MatrixType>
class BlockedMatrixTest : public ::testing::Test {};
typedef ::testing::Types<
    BlockedMatrix<float, BlockedMatrixFormat::kRowBlockedColumnMajor>,
    BlockedMatrix<float, BlockedMatrixFormat::kColumnBlockedRowMajor>,
    BlockedMatrix<int64, BlockedMatrixFormat::kRowBlockedColumnMajor>,
    BlockedMatrix<int64, BlockedMatrixFormat::kColumnBlockedRowMajor>>
    BlockedRowAndColumnTypes;
TYPED_TEST_CASE(BlockedMatrixTest, BlockedRowAndColumnTypes);
// Tests that Reset() rejects areas whose views are padded: a block size of 5
// elements does not fill an alignment boundary, so the area's stride exceeds
// its view size.
TYPED_TEST(BlockedMatrixTest, PaddingNotAllowed) {
  MutableAlignedView view;
  MutableAlignedArea area;
  constexpr size_t kNumRows = 10;
  constexpr size_t kNumColumns = 10;
  constexpr size_t kBlockSize = 5;
  constexpr size_t kNumViews = (kNumRows * kNumColumns) / kBlockSize;
  constexpr size_t kBlockSizeBytes =
      kBlockSize * sizeof(typename TypeParam::ElementType);
  const size_t bytes = ComputeAlignedAreaSize(kNumViews, kBlockSizeBytes);
  TF_ASSERT_OK(view.Reset(InvalidAlignedPointer(), bytes));
  TF_ASSERT_OK(area.Reset(view, kNumViews, kBlockSizeBytes));

  // 5 is usually relatively prime to the alignment size, but you may have to
  // update this test if kAlignmentBytes changes.
  ASSERT_NE(PadToAlignment(kBlockSizeBytes), kBlockSizeBytes);
  TypeParam matrix;
  EXPECT_THAT(matrix.Reset(area, kNumRows, kNumColumns),
              test::IsErrorWithSubstr(
                  "Padding is not supported for blocked matrix formats."));
}
// Tests accessors, and the size of matrices after allocation. Also covers the
// Reset() error paths for a mismatched view count and for a major dimension
// that the block size does not divide.
TYPED_TEST(BlockedMatrixTest, Accessors) {
  MutableAlignedView view;
  MutableAlignedArea area;
  char *ptr = InvalidAlignedPointer();
  constexpr size_t kNumRows = 48;
  constexpr size_t kNumColumns = 24;
  constexpr size_t kBlockSize = 8;
  constexpr size_t kNumViews = (kNumRows * kNumColumns) / kBlockSize;
  constexpr size_t kBlockSizeBytes =
      kBlockSize * sizeof(typename TypeParam::ElementType);
  const size_t bytes = ComputeAlignedAreaSize(kNumViews, kBlockSizeBytes);
  TF_ASSERT_OK(view.Reset(ptr, bytes));
  TF_ASSERT_OK(area.Reset(view, kNumViews, kBlockSizeBytes));
  TypeParam matrix;

  // If the view size is wrong, it should fail.
  EXPECT_THAT(
      matrix.Reset(area, kNumRows + 1, kNumColumns),
      test::IsErrorWithSubstr("Area has 144 views, but should have 147"));

  // If the blocking scheme cannot divide the matrix evenly, an error should
  // be raised. The choice of 12 and 96 is a bit non-trivial: they are numbers
  // that conveniently result in the correct area (so other errors won't be
  // raised), but an incompatible division of the vectors.
  if (TypeParam::IsRowBlocked()) {
    EXPECT_THAT(
        matrix.Reset(area, 12, 96),
        test::IsErrorWithSubstr("row-blocked matrix has major dimension 12 "
                                "which is not divisible by the block "
                                "size, 8"));
  } else {
    EXPECT_THAT(
        matrix.Reset(area, 96, 12),
        test::IsErrorWithSubstr("column-blocked matrix has major dimension "
                                "12 which is not divisible by the block "
                                "size, 8"));
  }

  TF_EXPECT_OK(matrix.Reset(area, kNumRows, kNumColumns));
  EXPECT_EQ(matrix.vector(0).data(),
            reinterpret_cast<typename TypeParam::ElementType *>(ptr));
  EXPECT_EQ(matrix.num_rows(), kNumRows);
  EXPECT_EQ(matrix.num_columns(), kNumColumns);
  EXPECT_EQ(matrix.block_size(), kBlockSize);
  EXPECT_EQ(matrix.num_vectors(), kNumViews);
}
// Tests that copying, const-casting, and transposing a blocked matrix all
// preserve (or, for transpose, intentionally change) the byte-wise state and
// keep aliasing the same block vectors.
TYPED_TEST(BlockedMatrixTest, CopyCastTranspose) {
  MutableAlignedView view;
  MutableAlignedArea area;
  constexpr size_t kNumRows = 48;
  constexpr size_t kNumColumns = 24;
  constexpr size_t kBlockSize = 8;
  constexpr size_t kNumViews = (kNumRows * kNumColumns) / kBlockSize;
  constexpr size_t kBlockSizeBytes =
      kBlockSize * sizeof(typename TypeParam::ElementType);
  const size_t bytes = ComputeAlignedAreaSize(kNumViews, kBlockSizeBytes);
  TF_ASSERT_OK(view.Reset(InvalidAlignedPointer(), bytes));
  TF_ASSERT_OK(area.Reset(view, kNumViews, kBlockSizeBytes));
  TypeParam matrix;
  TF_EXPECT_OK(matrix.Reset(area, kNumRows, kNumColumns));

  // Test both copying and casting to const.
  TypeParam matrix_copy = matrix;
  auto readonly = matrix.AsConst();
  EXPECT_TRUE(StructsEqual(matrix, matrix_copy));
  EXPECT_TRUE(StructsEqual(matrix, readonly));
  // Use size_t to match kNumViews and avoid a signed/unsigned comparison.
  for (size_t i = 0; i < kNumViews; ++i) {
    EXPECT_EQ(matrix.vector(i).data(), matrix_copy.vector(i).data());
    EXPECT_EQ(matrix.vector(i).data(), readonly.vector(i).data());
  }

  // Transpose matrix. The transpose swaps rows and columns, so it must not be
  // byte-wise equal, but transposing mutable and const views must agree.
  auto transposed = matrix.Transpose();
  auto readonly_transposed = readonly.Transpose();
  EXPECT_FALSE(StructsEqual(matrix, transposed));
  EXPECT_FALSE(StructsEqual(readonly, readonly_transposed));
  EXPECT_TRUE(StructsEqual(transposed, readonly_transposed));
}
} // namespace
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/runtime/mmap.h"
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <utility>
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/gtl/cleanup.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/logging.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
// Opens |path| read-only; returns the file descriptor, or -1 on failure.
int UniqueAlignedMmap::Syscalls::Open(const string &path) const {
  return open(path.c_str(), O_RDONLY);
}
// Closes the |file_descriptor|; returns 0 on success, like close(2).
int UniqueAlignedMmap::Syscalls::Close(int file_descriptor) const {
  return close(file_descriptor);
}
// Maps |size| bytes of the |file_descriptor| (from offset 0) as a read-only
// shared mapping; returns MAP_FAILED on error, like mmap(2).
void *UniqueAlignedMmap::Syscalls::Mmap(int file_descriptor,
                                        size_t size) const {
  return mmap(nullptr, size, PROT_READ, MAP_SHARED, file_descriptor, 0);
}
// Unmaps the region [|data|, |data| + |size|); returns 0 on success, like
// munmap(2).
int UniqueAlignedMmap::Syscalls::Munmap(void *data, size_t size) const {
  return munmap(data, size);
}
// Injects the |syscalls| implementation; tests pass a mock, production code
// uses the default wrappers above.
UniqueAlignedMmap::UniqueAlignedMmap(std::unique_ptr<Syscalls> syscalls)
    : syscalls_(std::move(syscalls)) {}
// Move constructor: takes over |that|'s mapping and clears |that| so its
// destructor does not unmap the transferred region.
UniqueAlignedMmap::UniqueAlignedMmap(UniqueAlignedMmap &&that)
    : syscalls_(std::move(that.syscalls_)) {
  view_ = that.view_;
  path_ = that.path_;
  that.view_ = MutableAlignedView();
  that.path_.clear();
}
// Move assignment: releases any mapping this object currently owns, then
// takes over |that|'s mapping and clears |that| so its destructor does not
// unmap the transferred region. Self-move is a no-op.
UniqueAlignedMmap &UniqueAlignedMmap::operator=(UniqueAlignedMmap &&that) {
  if (this != &that) {
    // Unmap the current region before overwriting |view_|; otherwise it would
    // leak. This must happen before |syscalls_| is replaced below.
    UnmapIfNonEmpty(view_.data(), view_.size(), path_);
    syscalls_ = std::move(that.syscalls_);
    view_ = that.view_;
    path_ = that.path_;
    that.view_ = MutableAlignedView();
    that.path_.clear();
  }
  return *this;
}
// Releases the owned mapping, if any; UnmapIfNonEmpty() is a no-op for an
// empty view (e.g. after a move or a Reset() on an empty file).
UniqueAlignedMmap::~UniqueAlignedMmap() {
  UnmapIfNonEmpty(view_.data(), view_.size(), path_);
}
// Maps the file at |path| into memory, replacing (and unmapping) any mapping
// this object previously owned. On error, the previous mapping is retained
// and the object is unchanged. (The previous version overwrote |view_|
// without unmapping, leaking the old region on repeated Reset() calls.)
tensorflow::Status UniqueAlignedMmap::Reset(const string &path) {
  uint64 size = 0;
  TF_RETURN_IF_ERROR(tensorflow::Env::Default()->GetFileSize(path, &size));

  // Since mmap() cannot map 0 bytes, we skip the call on empty files. This is
  // OK because UnmapIfNonEmpty() also skips munmap() on an empty region.
  if (size == 0) {
    UnmapIfNonEmpty(view_.data(), view_.size(), path_);  // release old mapping
    view_ = MutableAlignedView();
    path_ = path;
    return tensorflow::Status::OK();
  }

  const int file_descriptor = syscalls_->Open(path);
  if (file_descriptor == -1) {
    // TODO(googleuser): Use strerror_r() to export the system error message.
    return tensorflow::errors::Unknown("Failed to open '", path, "'");
  }

  // In case we error out.
  auto ensure_closed = tensorflow::gtl::MakeCleanup([&] {
    if (syscalls_->Close(file_descriptor) != 0) {
      LOG(ERROR) << "Failed to close '" << path << "'";
    }
  });

  void *mmap_result = syscalls_->Mmap(file_descriptor, size);
  if (mmap_result == MAP_FAILED) {
    return tensorflow::errors::Unknown("Failed to mmap '", path, "'");
  }

  // In case we error out.
  auto ensure_unmapped = tensorflow::gtl::MakeCleanup(
      [&] { UnmapIfNonEmpty(mmap_result, size, path); });

  // Since mmap() increments the refcount of the |file_descriptor|, it must be
  // closed to prevent a leak.
  ensure_closed.release();  // going to close it manually
  if (syscalls_->Close(file_descriptor) != 0) {
    return tensorflow::errors::Unknown("Failed to close '", path, "'");
  }

  // Most implementations of mmap() place the mapped region on a page boundary,
  // which is plenty of alignment. Since this is so unlikely to fail, we don't
  // try to recover if the address is misaligned. A potential recovery method
  // is to allocate a UniqueAlignedArray and read the file normally.
  MutableAlignedView data;
  TF_RETURN_IF_ERROR(data.Reset(reinterpret_cast<char *>(mmap_result), size));

  // Success; release the previous mapping, then make modifications.
  UnmapIfNonEmpty(view_.data(), view_.size(), path_);
  view_ = data;
  path_ = path;
  ensure_unmapped.release();  // this has taken ownership of the mapped file
  return tensorflow::Status::OK();
}
// Unmaps [|data|, |data| + |size|) unless the region is empty; |path| is only
// used for error logging. Errors are logged, not returned, since this is also
// called from the destructor.
void UniqueAlignedMmap::UnmapIfNonEmpty(void *data, size_t size,
                                        const string &path) const {
  if (size == 0) return;
  if (syscalls_->Munmap(data, size) != 0) {
    LOG(ERROR) << "Failed to munmap() file '" << path << "'";
  }
}
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment