Commit a4bb31d0 authored by Terry Koo's avatar Terry Koo
Browse files

Export @195097388.

parent dea7ecf6
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#ifndef DRAGNN_RUNTIME_MATH_ARITHMETIC_AVX_H_
#define DRAGNN_RUNTIME_MATH_ARITHMETIC_AVX_H_
#if defined(__AVX2__)
#include <stddef.h>
#include "dragnn/runtime/math/arithmetic_common.h"
#include "dragnn/runtime/math/types.h"
#include "tensorflow/core/platform/logging.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
// TODO(googleuser): Leaving this empty means that the definitions
// from arithmetic_common.h carry through. Provide template specializations
// that use architecture-specific intrinsics.
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
#endif // defined(__AVX2__)
#endif // DRAGNN_RUNTIME_MATH_ARITHMETIC_AVX_H_
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
// Declarations of arithmetic operations and trivial generic implementations.
// Architecture-specific implementations should include this header and define
// template specializations that override the generic implementations.
#ifndef DRAGNN_RUNTIME_MATH_ARITHMETIC_COMMON_H_
#define DRAGNN_RUNTIME_MATH_ARITHMETIC_COMMON_H_
#include <stddef.h>
#include <algorithm>
#include "dragnn/runtime/math/types.h"
#include "tensorflow/core/platform/logging.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
// Performs output = scale * input. Dimensions must match.
template <class T>
void ScaleElements(Vector<T> input, T scale, MutableVector<T> output);
// Performs output += scale * input. Dimensions must match.
template <class T>
void AddScaledElements(Vector<T> input, T scale, MutableVector<T> output);
// Performs values = max(minimum, values) in place.
template <class T>
void MaxElements(T minimum, MutableVector<T> values);
// Performs output = matrix * input. All vectors are interpreted as column
// vectors. Dimensions must match.
template <class T>
void MultiplyMatrixAndVector(Matrix<T> matrix, Vector<T> input,
MutableVector<T> output);
// Performs output = bias + matrix * input. All vectors are interpreted as
// column vectors. Dimensions must match.
template <class T>
void MultiplyMatrixAndVectorWithBias(Matrix<T> matrix, Vector<T> bias,
Vector<T> input, MutableVector<T> output);
// Implementation details below.
template <class T>
void ScaleElements(T scale, Vector<T> input, MutableVector<T> output) {
DCHECK_EQ(input.size(), output.size());
for (size_t i = 0; i < input.size(); ++i) output[i] = scale * input[i];
}
template <class T>
void AddScaledElements(T scale, Vector<T> input, MutableVector<T> output) {
DCHECK_EQ(input.size(), output.size());
for (size_t i = 0; i < input.size(); ++i) output[i] += scale * input[i];
}
template <class T>
void MaxElements(T minimum, MutableVector<T> values) {
for (T &value : values) value = std::max(minimum, value);
}
namespace internal {
// Like MultiplyMatrixAndVectorWithBias(), but if |ignore_bias| is true, then
// the |bias| is treated as zero and its dimensions are not checked.
template <bool ignore_bias, class T>
void MultiplyMatrixAndVectorImpl(Matrix<T> matrix, Vector<T> bias,
                                 Vector<T> input, MutableVector<T> output) {
  DCHECK_EQ(matrix.num_columns(), input.size());
  if (!ignore_bias) DCHECK_EQ(matrix.num_rows(), bias.size());
  DCHECK_EQ(matrix.num_rows(), output.size());
  const size_t num_rows = matrix.num_rows();
  for (size_t row_index = 0; row_index < num_rows; ++row_index) {
    const Vector<T> row = matrix.row(row_index);
    DCHECK_EQ(row.size(), input.size());
    // Start from the bias (or zero, when the bias is ignored) and accumulate
    // the dot product of this row with the input column vector.
    T accumulator = ignore_bias ? T() : bias[row_index];
    const size_t num_columns = row.size();
    for (size_t column = 0; column < num_columns; ++column) {
      accumulator += row[column] * input[column];
    }
    output[row_index] = accumulator;
  }
}
} // namespace internal
// Forwards to the shared implementation with the bias disabled; the bias
// argument is an unused empty placeholder.
template <class T>
void MultiplyMatrixAndVector(Matrix<T> matrix, Vector<T> input,
                             MutableVector<T> output) {
  internal::MultiplyMatrixAndVectorImpl<true>(matrix, Vector<T>(), input,
                                              output);
}
// Forwards to the shared implementation with the bias enabled, so |bias| is
// added to each element of the matrix-vector product.
template <class T>
void MultiplyMatrixAndVectorWithBias(Matrix<T> matrix, Vector<T> bias,
                                     Vector<T> input, MutableVector<T> output) {
  internal::MultiplyMatrixAndVectorImpl<false>(matrix, bias, input, output);
}
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
#endif // DRAGNN_RUNTIME_MATH_ARITHMETIC_COMMON_H_
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#ifndef DRAGNN_RUNTIME_MATH_ARITHMETIC_NEON_H_
#define DRAGNN_RUNTIME_MATH_ARITHMETIC_NEON_H_
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <stddef.h>
#include "dragnn/runtime/math/arithmetic_common.h"
#include "dragnn/runtime/math/types.h"
#include "tensorflow/core/platform/logging.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
// TODO(googleuser): Leaving this empty means that the definitions
// from arithmetic_common.h carry through. Provide template specializations
// that use architecture-specific intrinsics.
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
#endif // defined(__ARM_NEON) || defined(__ARM_NEON__)
#endif // DRAGNN_RUNTIME_MATH_ARITHMETIC_NEON_H_
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#ifndef DRAGNN_RUNTIME_MATH_ARITHMETIC_SSE_H_
#define DRAGNN_RUNTIME_MATH_ARITHMETIC_SSE_H_
#if defined(__SSE4_2__)
#include <stddef.h>
#include "dragnn/runtime/math/arithmetic_common.h"
#include "dragnn/runtime/math/types.h"
#include "tensorflow/core/platform/logging.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
// TODO(googleuser): Leaving this empty means that the definitions
// from arithmetic_common.h carry through. Provide template specializations
// that use architecture-specific intrinsics.
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
#endif // defined(__SSE4_2__)
#endif // DRAGNN_RUNTIME_MATH_ARITHMETIC_SSE_H_
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/runtime/math/arithmetic.h"
#include <stddef.h>
#include <vector>
#include "dragnn/runtime/math/types.h"
#include "dragnn/runtime/test/helpers.h"
#include "tensorflow/core/platform/test.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
namespace {
// ScaleElements() on zero-length vectors should be a harmless no-op.
TEST(ScaleElementsTest, Empty) {
  Vector<float> empty_input;
  MutableVector<float> empty_output;
  ScaleElements(1.5f, empty_input, empty_output);
}
// ScaleElements() should write scale * input[i] into output[i], replacing the
// prior contents of the output vector.
TEST(ScaleElementsTest, Populated) {
  UniqueVector<float> source({-2.0f, -3.0f, 5.0f});
  UniqueVector<float> destination({7.0f, 11.0f, 13.0f});  // gets overwritten
  ScaleElements(1.5f, Vector<float>(*source), *destination);
  const double expected[] = {1.5 * -2.0, 1.5 * -3.0, 1.5 * 5.0};
  for (int i = 0; i < 3; ++i) EXPECT_EQ((*destination)[i], expected[i]);
}
// AddScaledElements() on zero-length vectors should be a harmless no-op.
TEST(AddScaledElementsTest, Empty) {
  Vector<float> empty_input;
  MutableVector<float> empty_output;
  AddScaledElements(1.5f, empty_input, empty_output);
}
// AddScaledElements() should accumulate scale * input[i] into output[i],
// preserving the output's prior contents as the starting value.
TEST(AddScaledElementsTest, Populated) {
  UniqueVector<float> source({-2.0f, -3.0f, 5.0f});
  UniqueVector<float> accumulator({7.0f, 11.0f, 13.0f});  // gets added to
  AddScaledElements(1.5f, Vector<float>(*source), *accumulator);
  const double expected[] = {1.5 * -2.0 + 7.0, 1.5 * -3.0 + 11.0,
                             1.5 * 5.0 + 13.0};
  for (int i = 0; i < 3; ++i) EXPECT_EQ((*accumulator)[i], expected[i]);
}
// MaxElements() on a zero-length vector should be a harmless no-op.
TEST(MaxElementsTest, Empty) {
  MutableVector<float> no_values;
  MaxElements(1.5f, no_values);
}
// MaxElements() should clamp each element from below by the minimum, in place.
TEST(MaxElementsTest, Populated) {
  UniqueVector<float> values({-1.0f, 2.0f, 0.25f, -0.5f, 0.375f});
  MaxElements(0.125f, *values);
  const double expected[] = {0.125, 2.0, 0.25, 0.125, 0.375};
  for (int i = 0; i < 5; ++i) EXPECT_EQ((*values)[i], expected[i]);
}
// MultiplyMatrixAndVector() on empty operands should be a harmless no-op.
TEST(MultiplyMatrixAndVectorTest, Empty) {
  Matrix<float> empty_matrix;
  Vector<float> empty_input;
  MutableVector<float> empty_output;
  MultiplyMatrixAndVector(empty_matrix, empty_input, empty_output);
}
// MultiplyMatrixAndVector() should compute matrix * input, overwriting the
// output vector.
TEST(MultiplyMatrixAndVectorTest, Populated) {
  UniqueMatrix<float> weights({{2.0f, 3.0f},  //
                               {5.0f, 7.0f},  //
                               {11.0f, 13.0f}});
  UniqueVector<float> column({-0.5f, 2.0f});
  UniqueVector<float> product({9.8f, 7.6f, 5.4f});  // gets overwritten
  MultiplyMatrixAndVector(Matrix<float>(*weights), Vector<float>(*column),
                          *product);
  const double expected[] = {2.0 * -0.5 + 3.0 * 2.0,  //
                             5.0 * -0.5 + 7.0 * 2.0,  //
                             11.0 * -0.5 + 13.0 * 2.0};
  for (int i = 0; i < 3; ++i) EXPECT_EQ((*product)[i], expected[i]);
}
// MultiplyMatrixAndVectorWithBias() on empty operands should be a no-op.
TEST(MultiplyMatrixAndVectorWithBiasTest, Empty) {
  Matrix<float> empty_matrix;
  Vector<float> empty_bias;
  Vector<float> empty_input;
  MutableVector<float> empty_output;
  MultiplyMatrixAndVectorWithBias(empty_matrix, empty_bias, empty_input,
                                  empty_output);
}
// MultiplyMatrixAndVectorWithBias() should compute bias + matrix * input,
// overwriting the output vector.
TEST(MultiplyMatrixAndVectorWithBiasTest, Populated) {
  UniqueMatrix<float> weights({{2.0f, 3.0f},  //
                               {5.0f, 7.0f},  //
                               {11.0f, 13.0f}});
  UniqueVector<float> offsets({100.5f, 200.25f, 300.75f});
  UniqueVector<float> column({-0.5f, 2.0f});
  UniqueVector<float> result({9.8f, 7.6f, 5.4f});  // gets overwritten
  MultiplyMatrixAndVectorWithBias(Matrix<float>(*weights),
                                  Vector<float>(*offsets),
                                  Vector<float>(*column), *result);
  const double expected[] = {100.5 + 2.0 * -0.5 + 3.0 * 2.0,
                             200.25 + 5.0 * -0.5 + 7.0 * 2.0,
                             300.75 + 11.0 * -0.5 + 13.0 * 2.0};
  for (int i = 0; i < 3; ++i) EXPECT_EQ((*result)[i], expected[i]);
}
// A dummy type for the specializations below. Specializing on this unique
// dummy type ensures we don't conflict with any existing specialization.
struct Foo {
  // Arbitrary payload; written by the dummy ScaleElements() specialization.
  float value;
};
} // namespace
// Dummy specialization for use in the subsequent tests. Ignores its inputs
// entirely and stamps a sentinel value into every output element.
template <>
void ScaleElements(Foo scale, Vector<Foo> input, MutableVector<Foo> output) {
  for (size_t i = 0; i < output.size(); ++i) output[i].value = 777.0;
}
namespace {
// The explicit specialization above should take precedence over the generic
// implementation when the element type is Foo.
TEST(ScaleElementsTest, OverriddenByTemplateSpecialization) {
  // The element values are uninitialized, but the specialization never reads
  // them, so this is safe.
  UniqueVector<Foo> input(3);
  UniqueVector<Foo> output(3);
  ScaleElements(Foo(), Vector<Foo>(*input), *output);
  for (int i = 0; i < 3; ++i) EXPECT_EQ((*output)[i].value, 777.0);
}
} // namespace
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
// Contains logic for activation functions and more-complex elementwise
// vectorized operations.
//
// Uses operator overloading to express computation that looks like regular
// code. Currently, overloaded operators are scoped away in an "internal"
// namespace so they won't be accidentally used.
#ifndef DRAGNN_RUNTIME_MATH_AVX_ACTIVATION_FUNCTIONS_H_
#define DRAGNN_RUNTIME_MATH_AVX_ACTIVATION_FUNCTIONS_H_
#if defined(__AVX2__)
#include <immintrin.h>
#endif
#include "dragnn/runtime/math/avx_vector_array.h"
#define DRAGNN_AVXAF_ATTRIBUTE_ALWAYS_INLINE __attribute__((always_inline))
#ifdef __clang__
#define DRAGNN_AVXAF_GCC_UNROLL
#else
#define DRAGNN_AVXAF_GCC_UNROLL __attribute__((optimize("unroll-loops")))
#endif
namespace syntaxnet {
namespace dragnn {
namespace runtime {
// Public API. Each function operates element-wise on one AVX vector of
// floats; definitions follow below.
namespace activations {
// Calculates elementwise exp(x).
inline AvxFloatVec DRAGNN_AVXAF_ATTRIBUTE_ALWAYS_INLINE DRAGNN_AVXAF_GCC_UNROLL
Exponential(AvxFloatVec x);
// Calculates elementwise sigmoid(x) = 1/(1+exp(-x)).
inline AvxFloatVec DRAGNN_AVXAF_ATTRIBUTE_ALWAYS_INLINE Sigmoid(AvxFloatVec x);
// Calculates elementwise tanh(x).
inline AvxFloatVec DRAGNN_AVXAF_ATTRIBUTE_ALWAYS_INLINE Tanh(AvxFloatVec x);
}  // namespace activations
namespace activations {
// Calculates e^x by representing x = m * ln(2) + r. It does a polynomial
// expansion of e^r, and then multiplies in e^(m * ln(2)) = 2^m.
//
inline AvxFloatVec Exponential(AvxFloatVec x) {
  // EDSL-like helpers for writing vectorized code.
  auto Const = AvxFloatVec::Const;
  // Bounds beyond which single-precision exp() saturates; inputs are clamped
  // to this range below.
  constexpr float explo = -88.3762626647949f;
  constexpr float exphi = 88.3762626647950f;
  // Polynomial coefficients (from the Cephes math library, per the naming) for
  // the expansion of exp(r) on the reduced range.
  const float cephes_exp_factors[] = {
      1.9875691500e-4f, 1.3981999507e-3f, 8.3334519073e-3f,
      4.1665795894e-2f, 1.6666665459e-1f, 5.0000001201e-1f,
  };
  // Clamp the input. i.e. assume exp(-88) is close to zero and exp(88) is
  // close to infinity.
  x.Clamp(explo, exphi);
  // Calculate `m = floor(x/ln(2) + 0.5)`.
  constexpr float inv_log2e = 1.44269504088896341f;
  AvxFloatVec m = Const(0.5f);
  m += Const(inv_log2e) * x;
  m.Floor();
  // Calculate `r = x - m*ln(2)` (see function-level comment). The subtraction
  // is phrased as adding m * (-ln(2)) so it can compile to an FMA.
  constexpr float neg_ln2 = -0.6931471805599453f;
  AvxFloatVec r = x;
  r += m * Const(neg_ln2);
  // Calculate a polynomial expansion of y = exp(r), via Horner's method.
  AvxFloatVec r_squared(r * r);
  AvxFloatVec y = Const(cephes_exp_factors[0]);
  for (int i = 1; i < 6; ++i) {
    y = y * r + Const(cephes_exp_factors[i]);
  }
  y = y * r_squared + r;
  y += Const(1.0f);
  // Calculate `emm0 = 2^m`. This is done by converting emm0 into an integer,
  // and shifting it into the exponent bits of the desired floating-point
  // result. Recall that the exponent is unsigned with 127 representing 2^0.
  AvxFloatVec emm0 = m;
  emm0 += Const(127.0f);
  AvxIntVec emm0_i(emm0);
  emm0_i.LeftShift(23);  // 23 = number of mantissa bits in an IEEE-754 float
  // The final result is `2^m * exp(r)`.
  return AvxFloatVec(emm0_i.ReinterpretCastFloat() * y);
}
// Computes tanh(x) as a rational approximation: a degree-13 odd polynomial
// numerator divided by a degree-6 even polynomial denominator.
inline AvxFloatVec Tanh(AvxFloatVec x) {
  // EDSL-like helpers for writing vectorized code.
  auto Const = AvxFloatVec::Const;
  // Coefficients of the numerator polynomial (evaluated in x^2, then scaled
  // by x below), highest order first.
  const float numerator_coefficients[] = {
      -2.76076847742355e-16f, 2.00018790482477e-13f, -8.60467152213735e-11f,
      5.12229709037114e-08f, 1.48572235717979e-05f, 6.37261928875436e-04f,
      4.89352455891786e-03f,
  };
  // Coefficients of the denominator polynomial (evaluated in x^2), highest
  // order first.
  const float denominator_coefficients[] = {
      1.19825839466702e-06f,
      1.18534705686654e-04f,
      2.26843463243900e-03f,
      4.89352518554385e-03f,
  };
  // Clamp the inputs to the range [-9, 9] since anything outside this range
  // is +/-1.0 in single-precision.
  x.Clamp(-9.0f, 9.0f);
  // Compute x^2.
  AvxFloatVec x_squared(x * x);
  // Compute the numerator polynomial via Horner's method.
  AvxFloatVec p = Const(numerator_coefficients[0]);
  for (int i = 1; i < 7; ++i) {
    // p = p * x^2 + numerator_coefficients[i]
    p = p * x_squared + Const(numerator_coefficients[i]);
  }
  // p = p * x
  p = AvxFloatVec(p * x);
  // Compute the denominator polynomial via Horner's method.
  AvxFloatVec q = Const(denominator_coefficients[0]);
  for (int i = 1; i < 4; ++i) {
    // q = q * x^2 + denominator_coefficients[i]
    q = q * x_squared + Const(denominator_coefficients[i]);
  }
  // Divide the numerator by the denominator.
  return p / q;
}
// Computes sigmoid(x) via the identity sigmoid(x) = 0.5 * tanh(x/2) + 0.5,
// reusing the Tanh() approximation above.
inline AvxFloatVec Sigmoid(AvxFloatVec x) {
  const AvxFloatVec half = AvxFloatVec::Const(0.5);
  AvxFloatVec half_x(half * x);
  return half * Tanh(half_x) + half;
}
} // namespace activations
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
#undef DRAGNN_AVXAF_ATTRIBUTE_ALWAYS_INLINE
#undef DRAGNN_AVXAF_GCC_UNROLL
#endif // DRAGNN_RUNTIME_MATH_AVX_ACTIVATION_FUNCTIONS_H_
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/runtime/math/avx_activation_functions.h"
#include <cmath>
#include <chrono>
#include "dragnn/runtime/test/helpers.h"
#include "syntaxnet/base.h"
#include "tensorflow/core/platform/test.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
namespace {
// Verifies Exponential() by inverting each output with log() and comparing
// against the original input.
TEST(AvxActivationFunctionsTest, ExponentialTest) {
  const auto apply_exponential = [](AvxFloatVec *vec) {
    *vec = activations::Exponential(*vec);
  };
  const auto check_element = [](float input_value, float actual) {
    const float inverted = log(actual);
    EXPECT_NEAR(input_value, inverted, 1e-6)
        << "exp(" << input_value << ") = " << actual
        << ", log(actual) = " << inverted;
  };
  AvxVectorFuzzTest(apply_exponential, check_element);
}
// Verifies Sigmoid() against a scalar 1/(1+exp(-x)) reference.
TEST(AvxActivationFunctionsTest, SigmoidTest) {
  const auto apply_sigmoid = [](AvxFloatVec *vec) {
    *vec = activations::Sigmoid(*vec);
  };
  const auto check_element = [](float input_value, float actual) {
    const float expected = 1.0f / (1.0f + exp(-input_value));
    EXPECT_NEAR(actual, expected, 1e-6)
        << "sigmoid(" << input_value << ") = " << actual
        << ", expected = " << expected;
  };
  AvxVectorFuzzTest(apply_sigmoid, check_element);
}
template <int batch_size, class Function>
void RunPerformanceTest(Function activation, int flops) {
constexpr uint64 kIterations = 1000000;
UniqueVector<float> input(batch_size);
UniqueVector<float> output(batch_size);
InitRandomVector(*input);
InitRandomVector(*output);
AvxFloatVecArray<batch_size / kAvxWidth> array;
auto start_time = std::chrono::system_clock::now();
for (int i = 0; i < kIterations; ++i) {
array.Load(input->data());
array.Apply(activation);
array.Store(output->data());
}
auto end_time = std::chrono::system_clock::now();
std::chrono::duration<double> elapsed_seconds = end_time - start_time;
double elapsed = elapsed_seconds.count();
double exp_ops = kIterations * batch_size;
double macro_gops = exp_ops / 1e9 / elapsed;
VLOG(0) << "For batch_size " << batch_size
<< " macro-GOPS (giga-ops per sec): " << macro_gops
<< ", raw arithmetic: " << flops * macro_gops;
}
// Logs Sigmoid() throughput for a range of batch sizes; 26 is the estimated
// number of arithmetic ops per element. This "test" only prints numbers and
// never fails.
TEST(AvxActivationFunctionsTest, SigmoidPerformanceTest) {
  RunPerformanceTest<8>(activations::Sigmoid, 26);
  RunPerformanceTest<16>(activations::Sigmoid, 26);
  RunPerformanceTest<32>(activations::Sigmoid, 26);
  RunPerformanceTest<48>(activations::Sigmoid, 26);
  RunPerformanceTest<64>(activations::Sigmoid, 26);
  RunPerformanceTest<128>(activations::Sigmoid, 26);
}
// Verifies Tanh() against the scalar tanh() reference.
TEST(AvxActivationFunctionsTest, TanhTest) {
  const auto apply_tanh = [](AvxFloatVec *vec) {
    *vec = activations::Tanh(*vec);
  };
  const auto check_element = [](float input_value, float actual) {
    const float expected = tanh(input_value);
    EXPECT_NEAR(actual, expected, 1e-6)
        << "tanh(" << input_value << ") = " << actual
        << ", expected = " << expected;
  };
  AvxVectorFuzzTest(apply_tanh, check_element);
}
// Logs Tanh() throughput for a range of batch sizes; 23 is the estimated
// number of arithmetic ops per element. This "test" only prints numbers and
// never fails.
//
// Fixed: the batch-size-8 and batch-size-16 cases previously benchmarked
// Sigmoid instead of Tanh, so their results were mislabeled.
TEST(AvxActivationFunctionsTest, TanhPerformanceTest) {
  RunPerformanceTest<8>(activations::Tanh, 23);
  RunPerformanceTest<16>(activations::Tanh, 23);
  RunPerformanceTest<32>(activations::Tanh, 23);
  RunPerformanceTest<48>(activations::Tanh, 23);
  RunPerformanceTest<64>(activations::Tanh, 23);
  RunPerformanceTest<128>(activations::Tanh, 23);
}
} // namespace
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
// Wraps AVX vectors into convenient helper classes. This contains a class
// wrapping a single AVX register, AvxFloatVec, and a class to manipulate a
// batch of registers, AvxFloatVecArray. Use of the latter is recommended where
// applicable, since it will be unrolled into more vectorizable code.
#ifndef DRAGNN_RUNTIME_MATH_AVX_VECTOR_ARRAY_H_
#define DRAGNN_RUNTIME_MATH_AVX_VECTOR_ARRAY_H_
#include <cmath>
#if defined(__AVX__)
#include <immintrin.h>
#elif defined(__SSE4_2__)
#include <nmmintrin.h>
#endif
#include "dragnn/runtime/math/float16_types.h"
#define DRAGNN_AVXVA_ALWAYS_INLINE inline __attribute__((always_inline))
#ifdef __clang__
// Clang doesn't support __attribute__((optimize(...))).
#define DRAGNN_AVXVA_INLINED_UNROLLED inline __attribute__((always_inline))
#else
// Assume we're using GCC, which does.
#define DRAGNN_AVXVA_INLINED_UNROLLED \
inline __attribute__((always_inline)) \
__attribute__((optimize("unroll-loops")))
#endif
namespace syntaxnet {
namespace dragnn {
namespace runtime {
// Number of single-precision floating point numbers that fit into a single SSE
// / AVX2 register (which are 128 and 256 bits respectively).
constexpr int kSseWidth = 128 / 32;  // = 4
constexpr int kAvxWidth = 256 / 32;  // = 8
// The same, for half-precision (16-bit) values.
constexpr int kSseWidthHalfPrecision = 128 / 16;  // = 8
constexpr int kAvxWidthHalfPrecision = 256 / 16;  // = 16
class AvxFloatVec;
namespace internal {
// This struct should always be eliminated by the compiler; it only exists so we
// can write `foo += bar * baz`, and have that compiled into a single FMA
// operation.
struct AvxMultiplyExpr {
  // The two factors. These are references to the operands of operator*, so
  // this type is only valid within the enclosing full expression.
  const AvxFloatVec &a;
  const AvxFloatVec &b;
};
} // namespace internal
// Allows EDSL-like programming with AVX vectors.
// Multiplication yields a lazy AvxMultiplyExpr; combining it with addition
// lets the implementation emit a fused multiply-add.
inline internal::AvxMultiplyExpr operator*(const AvxFloatVec &a,
                                           const AvxFloatVec &b);
inline AvxFloatVec operator+(const internal::AvxMultiplyExpr &expr,
                             const AvxFloatVec &v);
// Plain element-wise addition, division, and subtraction.
inline AvxFloatVec operator+(const AvxFloatVec &a, const AvxFloatVec &b);
inline AvxFloatVec operator/(const AvxFloatVec &a, const AvxFloatVec &b);
inline AvxFloatVec operator-(const AvxFloatVec &a, const AvxFloatVec &b);
// API over a single AVX vector (register). The implementation will either use
// a real AVX vector, or a fixed array of floats for compatibility.
//
// Note that we include the "inline" directive in declarations, not just
// definitions, because it is necessary for the "always_inline" directive.
struct AvxFloatVec {
 public:
  // Leaves the vector uninitialized.
  AvxFloatVec() {}
  // Evaluates an AvxMultiplyExpr intermediary without adding anything. This is
  // not an implicit cast, because typically when we write `a * b` we want to
  // add it to something and use an FMA operation.
  explicit AvxFloatVec(const internal::AvxMultiplyExpr &expr);
  // Loads from an aligned region of memory.
  inline void Load(const float *source);
  // Loads a constant value into every element.
  inline void LoadConstVector(const float val);
  // Stores to an aligned region of memory.
  inline void Store(float *dst) const;
  // Adds `a * b` to this value, using a fused multiply-add operation.
  inline void AddProductOf(const AvxFloatVec &a, const AvxFloatVec &b);
  // Element-wise floor.
  inline void Floor();
  // Element-wise clamps values between a min and max value.
  inline void Clamp(const float min_value, const float max_value);
  // Convenience method for more complex calculations: returns a vector with
  // |value| broadcast into every element.
  static DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec Const(const float value) {
    AvxFloatVec result;
    result.LoadConstVector(value);
    return result;
  }
  // Syntactic sugar for computing an FMA operation.
  inline AvxFloatVec &operator+=(const internal::AvxMultiplyExpr &to_add);
  // Adds another vector element-wise.
  inline AvxFloatVec &operator+=(const AvxFloatVec &vec);
  // Subtracts another vector element-wise.
  inline AvxFloatVec &operator-=(const AvxFloatVec &vec);
  // Divides by another vector element-wise.
  inline AvxFloatVec &operator/=(const AvxFloatVec &vec);
  // Underlying storage: one AVX register, two SSE registers, or a plain float
  // array for compatibility builds.
#if defined(__AVX__)
  __m256 ymm;
#elif defined(__SSE4_2__)
  __m128 xmm[2];
#else
  float ymm[8];
#endif
};
// Small wrapper around integer AVX vectors, exposing only methods we need for
// implementing the activation functions.
//
// As above, `inline` is specified here for the always_inline directive.
class AvxIntVec {
 public:
  // Constructs an AVX integer vector, by converting (truncating) the
  // floating-point values of |v|.
  inline explicit AvxIntVec(const AvxFloatVec &v);
  // Left-shifts integer values.
  inline void LeftShift(int bits);
  // Reinterprets the register as a floating-point register, for bitwise tricks.
  inline AvxFloatVec ReinterpretCastFloat();

 private:
  // Underlying register (or plain int array for compatibility builds).
#if defined(__AVX__)
  __m256i ymm_;
#elif defined(__SSE4_2__)
  __m128i xmm_[2];
#else
  int ymm_[8];
#endif
};
// Implements the index permutation that is effectively applied by the
// _mm256_unpack instructions; equivalent to swapping the 3rd and 4th bits of
// the index. See the PermutationFunctionIsEqualToTable test for the effective
// permutation that this encodes.
//
// We haven't done performance testing, but hopefully this is sufficiently fast
// for the compatibility routine. Hopefully in its use below, the compiler will
// determine it is being called with a constant (post-unrolling) and inline it.
DRAGNN_AVXVA_ALWAYS_INLINE int FastUnpackPermutation(int original_idx) {
  // Adding 0b0100 carries into bit 3 exactly when bit 2 is set, so |swap_bit|
  // has bit 3 (0b1000) set iff exactly one of bits 2 and 3 of |original_idx|
  // is set — i.e. iff the two bits should be swapped.
  const int swap_bit = (original_idx + 4) & 8;
  // Expand into the mask 0b1100 (or zero). XOR-ing it in flips both bits,
  // turning the pattern 0b0100 into 0b1000 and vice versa.
  const int swap_mask = swap_bit | (swap_bit >> 1);
  return original_idx ^ swap_mask;
}
// API over an array of AVX vectors (registers). The methods on this class are
// annotated such that the compiler should unroll them.
template <int N>
struct AvxFloatVecArray {
 public:
  // Loads N consecutive AVX vectors (8 floats each) from aligned |source|.
  DRAGNN_AVXVA_INLINED_UNROLLED void Load(const float *source) {
    for (int i = 0; i < N; i++) {
      vectors[i].Load(source + 8 * i);
    }
  }
  // Like Load() above, but only reads the first |max_idx| vectors; the rest
  // are left untouched (or zeroed under a memory sanitizer; see below).
  DRAGNN_AVXVA_INLINED_UNROLLED void Load(const float *source, int max_idx) {
    for (int i = 0; i < N; i++) {
      if (i < max_idx) {
        vectors[i].Load(source + 8 * i);
      } else {
        // When testing with a memory sanitizer, we make sure not to read
        // uninitialized values. This is usually safe in normal operation
        // because such results are never stored (via corresponding
        // store-masking logic), but of course each algorithm must be tested to
        // ensure correct operation.
        //
        // It is also worth pointing out that exceptional values (NaN, etc.) can
        // slow down AVX/FMA floating point operations considerably. So we
        // should investigate whether this is worth enabling in all cases (and
        // forcing algorithms to provide a default).
#if defined(MEMORY_SANITIZER)
        vectors[i].LoadConstVector(0);
#endif
      }
    }
  }
  // Reads and unpacks truncated half-precision values.
  //
  // Currently, only matrix coefficients use compressed/half-precision values,
  // so it's not yet necessary to support max_idx masking (which will get a bit
  // more complicated).
  DRAGNN_AVXVA_INLINED_UNROLLED void Load(const TruncatedFloat16 *source);
#if defined(__F16C__)
  // Reads and unpacks IEEE-754 half-precision values.
  //
  // Currently, only matrix coefficients use compressed/half-precision values,
  // so it's not yet necessary to support max_idx masking (which will get a bit
  // more complicated).
  //
  // TODO(googleuser): Either add non-F16C compatibility support from Eigen,
  // or delete this code if it turns out not to be helpful.
  DRAGNN_AVXVA_INLINED_UNROLLED void Load(const IeeeFloat16 *source);
#endif
  // Broadcasts |val| into every element of every vector.
  DRAGNN_AVXVA_INLINED_UNROLLED void LoadConstVector(const float val) {
    for (int i = 0; i < N; i++) {
      vectors[i].LoadConstVector(val);
    }
  }
  // Stores all N vectors to aligned |dst|.
  DRAGNN_AVXVA_INLINED_UNROLLED void Store(float *dst) {
    for (int i = 0; i < N; i++) {
      vectors[i].Store(dst + 8 * i);
    }
  }
  // Like Store() above, but only writes the first |max_idx| vectors.
  DRAGNN_AVXVA_INLINED_UNROLLED void Store(float *dst, int max_idx) {
    for (int i = 0; i < N; i++) {
      // This is equivalent to writing `i < N && i < max_idx` above, but forces
      // the compiler to produce more efficient code (it's still creating jump
      // instructions, but the branching is probably more predictable, and the
      // loops are unrolled). In the future we could switch to VMASKMOV if
      // necessary.
      if (i < max_idx) {
        vectors[i].Store(dst + 8 * i);
      }
    }
  }
  // Applies |fcn| (an AvxFloatVec -> AvxFloatVec callable) to each vector.
  template <class Function>
  DRAGNN_AVXVA_INLINED_UNROLLED void Apply(const Function &fcn) {
    for (int i = 0; i < N; i++) {
      vectors[i] = fcn(vectors[i]);
    }
  }
  // The underlying vectors.
  AvxFloatVec vectors[N];
};
// Implementation details.
#if defined(__AVX__)
// AVX implementations of the AvxFloatVec methods declared above.
DRAGNN_AVXVA_ALWAYS_INLINE
AvxFloatVec::AvxFloatVec(const internal::AvxMultiplyExpr &expr) {
  // Evaluate the deferred multiplication.
  ymm = _mm256_mul_ps(expr.a.ymm, expr.b.ymm);
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Load(const float *source) {
  // Aligned load: |source| must be 32-byte aligned.
  ymm = _mm256_load_ps(source);
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::LoadConstVector(const float val) {
  ymm = _mm256_set1_ps(val);
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Store(float *dst) const {
  // Aligned store: |dst| must be 32-byte aligned.
  _mm256_store_ps(dst, ymm);
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::AddProductOf(
    const AvxFloatVec &a, const AvxFloatVec &b) {
#if defined(__AVX2__) && defined(__FMA__)
  // Single fused multiply-add instruction.
  ymm = _mm256_fmadd_ps(a.ymm, b.ymm, ymm);
#else
  // Compatibility fallback: separate multiply then add.
  *this += AvxFloatVec(a * b);
#endif
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Floor() {
ymm = _mm256_floor_ps(ymm);
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Clamp(const float min_value,
const float max_value) {
ymm = _mm256_min_ps(ymm, Const(max_value).ymm);
ymm = _mm256_max_ps(ymm, Const(min_value).ymm);
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator+=(
const AvxFloatVec &vec) {
ymm = _mm256_add_ps(vec.ymm, ymm);
return *this;
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator-=(
const AvxFloatVec &vec) {
ymm = _mm256_sub_ps(ymm, vec.ymm);
return *this;
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator/=(
const AvxFloatVec &vec) {
ymm = _mm256_div_ps(ymm, vec.ymm);
return *this;
}
// Converts each float lane to a 32-bit int by truncation toward zero.
DRAGNN_AVXVA_ALWAYS_INLINE AvxIntVec::AvxIntVec(const AvxFloatVec &v)
    : ymm_(_mm256_cvttps_epi32(v.ymm)) {}
// Shifts each 32-bit lane left by |bits|.
DRAGNN_AVXVA_ALWAYS_INLINE void AvxIntVec::LeftShift(int bits) {
#if defined(__AVX2__)
  ymm_ = _mm256_slli_epi32(ymm_, bits);
#else
  // Convert to SSE and back again. This is pretty slow, so don't use this code
  // except for compatibility purposes. (AVX1 has no 256-bit integer shift, so
  // shift each 128-bit half separately and re-combine.)
  __m256i upper_bits = _mm256_permute2f128_si256(ymm_, ymm_, 1);
  __m128i first = _mm256_castsi256_si128(ymm_);         // Lower bits as SSE
  __m128i second = _mm256_castsi256_si128(upper_bits);  // Upper bits as SSE
  first = _mm_slli_epi32(first, bits);
  second = _mm_slli_epi32(second, bits);
  ymm_ = _mm256_permute2f128_si256(_mm256_castsi128_si256(first),
                                   _mm256_castsi128_si256(second), (2 << 4));
#endif
}
AvxFloatVec DRAGNN_AVXVA_ALWAYS_INLINE AvxIntVec::ReinterpretCastFloat() {
AvxFloatVec result;
result.ymm = _mm256_castsi256_ps(ymm_);
return result;
}
// Unpacks 16-bit truncated floats (bfloat16) into the float vectors. Each
// 16-bit value is the high half of a 32-bit float with the low significand
// bits zeroed, so unpacking is just interleaving zeros below each value.
// NOTE: The in-lane unpack instructions produce a permuted element order; the
// packing side must store elements in the matching FastUnpackPermutation()
// order (see the compatibility implementation and tests).
template <int N>
DRAGNN_AVXVA_INLINED_UNROLLED void AvxFloatVecArray<N>::Load(
    const TruncatedFloat16 *source) {
  static_assert(N % 2 == 0,
                "Load() from half floats requires even-sized vector arrays.");
  for (int i = 0; i < N / 2; i++) {
#if defined(__AVX2__)
    // One 256-bit load supplies 16 half-width values, i.e. two full vectors.
    const __m256i input = _mm256_load_si256(
        reinterpret_cast<__m256i const *>(source + kAvxWidthHalfPrecision * i));
    vectors[2 * i].ymm = _mm256_castsi256_ps(
        _mm256_unpacklo_epi16(_mm256_setzero_si256(), input));
    vectors[2 * i + 1].ymm = _mm256_castsi256_ps(
        _mm256_unpackhi_epi16(_mm256_setzero_si256(), input));
#else
    // Compatibility AVX (not AVX2) implementation.
    __m128i input[2];
    input[0] = _mm_load_si128(
        reinterpret_cast<__m128i const *>(source + kAvxWidthHalfPrecision * i));
    input[1] = _mm_load_si128(reinterpret_cast<__m128i const *>(
        source + kAvxWidthHalfPrecision * i + kSseWidthHalfPrecision));
    // Unpack. This permutation is kinda cryptic and, to be honest, derived by
    // simply trying many combinations.
    vectors[2 * i].ymm = _mm256_insertf128_ps(
        _mm256_castps128_ps256(_mm_castsi128_ps(
            _mm_unpacklo_epi16(_mm_setzero_si128(), input[0]))),
        _mm_castsi128_ps(_mm_unpacklo_epi16(_mm_setzero_si128(), input[1])), 1);
    vectors[2 * i + 1].ymm = _mm256_insertf128_ps(
        _mm256_castps128_ps256(_mm_castsi128_ps(
            _mm_unpackhi_epi16(_mm_setzero_si128(), input[0]))),
        _mm_castsi128_ps(_mm_unpackhi_epi16(_mm_setzero_si128(), input[1])), 1);
#endif
  }
}
#if defined(__F16C__)
// Unpacks IEEE-754 half-precision floats via the F16C conversion instruction
// (_mm256_cvtph_ps). Unlike the TruncatedFloat16 overload, elements come out
// in memory order (see the LoadIeeeFloat16 test).
template <int N>
DRAGNN_AVXVA_INLINED_UNROLLED void AvxFloatVecArray<N>::Load(
    const IeeeFloat16 *source) {
  static_assert(N % 2 == 0,
                "Load() from half floats requires even-sized vector arrays.");
  for (int i = 0; i < N / 2; i++) {
    // TODO(googleuser): Experiment with doing a single AVX2 load and
    // dividing the result.
    __m128i first_half = _mm_load_si128(
        reinterpret_cast<__m128i const *>(source + kAvxWidthHalfPrecision * i));
    __m128i second_half = _mm_load_si128(reinterpret_cast<__m128i const *>(
        source + kAvxWidthHalfPrecision * i + kAvxWidth));
    vectors[2 * i].ymm = _mm256_cvtph_ps(first_half);
    vectors[2 * i + 1].ymm = _mm256_cvtph_ps(second_half);
  }
}
#endif
#elif defined(__SSE4_2__)
// SSE 4.2 implementations: each logical 8-float vector is emulated with a
// pair of 4-float __m128 registers; xmm[0] holds elements 0-3 and xmm[1]
// holds elements 4-7.
DRAGNN_AVXVA_ALWAYS_INLINE
AvxFloatVec::AvxFloatVec(const internal::AvxMultiplyExpr &expr) {
  for (int i = 0; i < 2; ++i) {
    xmm[i] = _mm_mul_ps(expr.a.xmm[i], expr.b.xmm[i]);
  }
}
// Loads 8 floats from aligned memory (two aligned 128-bit loads).
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Load(const float *source) {
  for (int i = 0; i < 2; ++i) {
    xmm[i] = _mm_load_ps(&source[i * kSseWidth]);
  }
}
// Broadcasts |val| into all 8 elements.
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::LoadConstVector(const float val) {
  xmm[1] = xmm[0] = _mm_set1_ps(val);
}
// Stores 8 floats to aligned memory (two aligned 128-bit stores).
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Store(float *dst) const {
  for (int i = 0; i < 2; ++i) {
    _mm_store_ps(&dst[i * kSseWidth], xmm[i]);
  }
}
// Accumulates a * b; there is no FMA below AVX2, so multiply then add.
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::AddProductOf(
    const AvxFloatVec &a, const AvxFloatVec &b) {
  *this += AvxFloatVec(a * b);
}
// Rounds each element down to the nearest integer-valued float.
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Floor() {
  for (int i = 0; i < 2; ++i) {
    xmm[i] = _mm_floor_ps(xmm[i]);
  }
}
// Clamps each element into [min_value, max_value].
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Clamp(const float min_value,
                                                   const float max_value) {
  for (int i = 0; i < 2; ++i) {
    xmm[i] = _mm_min_ps(xmm[i], Const(max_value).xmm[i]);
    xmm[i] = _mm_max_ps(xmm[i], Const(min_value).xmm[i]);
  }
}
// Element-wise compound assignment operators.
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator+=(
    const AvxFloatVec &vec) {
  for (int i = 0; i < 2; ++i) {
    xmm[i] = _mm_add_ps(vec.xmm[i], xmm[i]);
  }
  return *this;
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator-=(
    const AvxFloatVec &vec) {
  for (int i = 0; i < 2; ++i) {
    xmm[i] = _mm_sub_ps(xmm[i], vec.xmm[i]);
  }
  return *this;
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator/=(
    const AvxFloatVec &vec) {
  for (int i = 0; i < 2; ++i) {
    xmm[i] = _mm_div_ps(xmm[i], vec.xmm[i]);
  }
  return *this;
}
// Converts each float element to a 32-bit int by truncation toward zero.
DRAGNN_AVXVA_ALWAYS_INLINE AvxIntVec::AvxIntVec(const AvxFloatVec &v) {
  xmm_[0] = _mm_cvttps_epi32(v.xmm[0]);
  xmm_[1] = _mm_cvttps_epi32(v.xmm[1]);
}
// Shifts each 32-bit element left by |bits|.
DRAGNN_AVXVA_ALWAYS_INLINE void AvxIntVec::LeftShift(int bits) {
  for (int i = 0; i < 2; ++i) {
    xmm_[i] = _mm_slli_epi32(xmm_[i], bits);
  }
}
AvxFloatVec DRAGNN_AVXVA_ALWAYS_INLINE AvxIntVec::ReinterpretCastFloat() {
AvxFloatVec result;
for (int i = 0; i < 2; ++i) {
result.xmm[i] = _mm_castsi128_ps(xmm_[i]);
}
return result;
}
// Unpacks bfloat16 values with SSE: interleaving zeros below each 16-bit
// value turns it into the float whose high half is that value. The unpack
// instructions produce the same permuted element order as the AVX version
// (see FastUnpackPermutation()).
template <int N>
DRAGNN_AVXVA_INLINED_UNROLLED void AvxFloatVecArray<N>::Load(
    const TruncatedFloat16 *source) {
  static_assert(N % 2 == 0,
                "Load() from half floats requires even-sized vector arrays.");
  for (int i = 0; i < N / 2; i++) {
    __m128i input[2];
    input[0] = _mm_load_si128(
        reinterpret_cast<__m128i const *>(source + kAvxWidthHalfPrecision * i));
    input[1] = _mm_load_si128(reinterpret_cast<__m128i const *>(
        source + kAvxWidthHalfPrecision * i + kSseWidthHalfPrecision));
    vectors[2 * i].xmm[0] =
        _mm_castsi128_ps(_mm_unpacklo_epi16(_mm_setzero_si128(), input[0]));
    vectors[2 * i + 1].xmm[0] =
        _mm_castsi128_ps(_mm_unpackhi_epi16(_mm_setzero_si128(), input[0]));
    vectors[2 * i].xmm[1] =
        _mm_castsi128_ps(_mm_unpacklo_epi16(_mm_setzero_si128(), input[1]));
    vectors[2 * i + 1].xmm[1] =
        _mm_castsi128_ps(_mm_unpackhi_epi16(_mm_setzero_si128(), input[1]));
  }
}
#if defined(__F16C__)
// Unpacks IEEE-754 half floats with the 128-bit F16C conversion: each 128-bit
// load holds 8 half floats, but _mm_cvtph_ps only converts the low 64 bits,
// so a shuffle moves the upper half down for a second conversion.
template <int N>
DRAGNN_AVXVA_INLINED_UNROLLED void AvxFloatVecArray<N>::Load(
    const IeeeFloat16 *source) {
  static_assert(N % 2 == 0,
                "Load() from half floats requires even-sized vector arrays.");
  for (int i = 0; i < N / 2; i++) {
    __m128i first_half = _mm_load_si128(
        reinterpret_cast<__m128i const *>(source + kAvxWidthHalfPrecision * i));
    __m128i second_half = _mm_load_si128(reinterpret_cast<__m128i const *>(
        source + kAvxWidthHalfPrecision * i + kAvxWidth));
    vectors[2 * i].xmm[0] = _mm_cvtph_ps(first_half);
    vectors[2 * i + 1].xmm[0] = _mm_cvtph_ps(second_half);
    // Move the upper 64 bits down so the next conversion sees them.
    first_half = _mm_shuffle_epi32(first_half, _MM_SHUFFLE(0, 1, 3, 2));
    second_half = _mm_shuffle_epi32(second_half, _MM_SHUFFLE(0, 1, 3, 2));
    vectors[2 * i].xmm[1] = _mm_cvtph_ps(first_half);
    vectors[2 * i + 1].xmm[1] = _mm_cvtph_ps(second_half);
  }
}
#endif
#else
// Compatibility implementations. If you compile with -ftree-vectorize and
// -msse2 flags, you should still get decent performance (maybe 1/4 of the
// AVX/FMA version).
//
// See the class above for method documentation.
// Evaluates a lazy multiplication expression: this = expr.a * expr.b.
DRAGNN_AVXVA_ALWAYS_INLINE
AvxFloatVec::AvxFloatVec(const internal::AvxMultiplyExpr &expr) {
  for (int i = 0; i < 8; i++) {
    ymm[i] = expr.a.ymm[i] * expr.b.ymm[i];
  }
}
// Copies 8 floats from |source|.
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Load(const float *source) {
  for (int i = 0; i < 8; i++) {
    ymm[i] = source[i];
  }
}
// Sets all 8 elements to |val|.
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::LoadConstVector(const float val) {
  for (int i = 0; i < 8; i++) {
    ymm[i] = val;
  }
}
// Copies 8 floats to |dst|.
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Store(float *dst) const {
  for (int i = 0; i < 8; i++) {
    dst[i] = ymm[i];
  }
}
// Accumulates a * b element-wise.
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::AddProductOf(
    const AvxFloatVec &a, const AvxFloatVec &b) {
  for (int i = 0; i < 8; i++) {
    ymm[i] += a.ymm[i] * b.ymm[i];
  }
}
// Rounds each element down to the nearest integer-valued float.
// NOTE(review): floor/fmin/fmax below are unqualified; presumably resolved
// from <math.h> via an existing include -- confirm.
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Floor() {
  for (int i = 0; i < 8; i++) {
    ymm[i] = floor(ymm[i]);
  }
}
// Clamps each element into [min_value, max_value].
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Clamp(const float min_value,
                                                   const float max_value) {
  for (int i = 0; i < 8; i++) {
    ymm[i] = fmin(fmax(ymm[i], min_value), max_value);
  }
}
// Element-wise compound assignment operators.
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator+=(
    const AvxFloatVec &vec) {
  for (int i = 0; i < 8; i++) {
    ymm[i] += vec.ymm[i];
  }
  return *this;
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator-=(
    const AvxFloatVec &vec) {
  for (int i = 0; i < 8; i++) {
    ymm[i] -= vec.ymm[i];
  }
  return *this;
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator/=(
    const AvxFloatVec &vec) {
  for (int i = 0; i < 8; i++) {
    ymm[i] /= vec.ymm[i];
  }
  return *this;
}
// Converts each float element to int by truncation toward zero.
DRAGNN_AVXVA_ALWAYS_INLINE AvxIntVec::AvxIntVec(const AvxFloatVec &v) {
  for (int i = 0; i < 8; i++) {
    ymm_[i] = static_cast<int>(v.ymm[i]);
  }
}
// Shifts each element left by |bits|.
DRAGNN_AVXVA_ALWAYS_INLINE void AvxIntVec::LeftShift(int bits) {
  for (int i = 0; i < 8; i++) {
    ymm_[i] = ymm_[i] << bits;
  }
}
// Reinterprets each integer element's bit pattern as a float (a bit cast,
// not a numeric conversion).
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec AvxIntVec::ReinterpretCastFloat() {
  AvxFloatVec result;
  static_assert(sizeof(result.ymm[0]) == sizeof(ymm_[0]),
                "Bit cast requires matching element sizes.");
  for (int i = 0; i < 8; i++) {
    // memcpy-style bit copy instead of `reinterpret_cast<float &>(ymm_[i])`,
    // which reads int storage through a float lvalue and violates strict
    // aliasing. The builtin compiles to the same single move.
    __builtin_memcpy(&result.ymm[i], &ymm_[i], sizeof(result.ymm[i]));
  }
  return result;
}
// Scalar unpack of bfloat16 values. Elements are scattered into the same
// permuted order (FastUnpackPermutation()) that the SIMD unpack instructions
// produce, so all implementations agree on the in-register layout.
template <int N>
DRAGNN_AVXVA_INLINED_UNROLLED void AvxFloatVecArray<N>::Load(
    const TruncatedFloat16 *source) {
  static_assert(N % 2 == 0,
                "Load() from half floats requires even-sized vector arrays.");
  // Iterate through mock AVX vectors, each composed of 16 half-floats.
  for (int vec_idx = 0; vec_idx < N / 2; vec_idx++) {
    // Making this code a bit more verbose, by reading in-order to a temporary
    // array, results in faster performance. The compatibility version is still
    // pretty slow though.
    TruncatedFloat16 tmp[16];
    for (int i = 0; i < kAvxWidthHalfPrecision; ++i) {
      tmp[i] = source[i + kAvxWidthHalfPrecision * vec_idx];
    }
    float unpacked[16];
    for (int i = 0; i < kAvxWidthHalfPrecision; ++i) {
      unpacked[i] = tmp[i].DebugToFloat();
    }
    // Scatter into the permuted element order used by the SIMD unpacks.
    for (int i = 0; i < kAvxWidthHalfPrecision; ++i) {
      int permuted = FastUnpackPermutation(i);
      vectors[2 * vec_idx + (i / 8)].ymm[i % 8] = unpacked[permuted];
    }
  }
}
#if defined(__F16C__)
// Scalar unpack of IEEE-754 half floats, in memory order.
template <int N>
DRAGNN_AVXVA_INLINED_UNROLLED void AvxFloatVecArray<N>::Load(
    const IeeeFloat16 *source) {
  // Not actually required for the compatibility implementation, but it'd be
  // rather non-uniform if this API succeeded, and then compilation failed when
  // AVX2 was turned on.
  static_assert(N % 2 == 0,
                "Load() from half floats requires even-sized vector arrays.");
  // Unpack each of the N * 8 half-floats, in order, into the mock vectors.
  for (int i = 0; i < N * kAvxWidth; ++i) {
    vectors[i / 8].ymm[i % 8] = source[i].DebugToFloat();
  }
}
#endif
#endif
// The following operations are mostly syntax sugar, so they do not need
// architecture-specific implementations.
// Accumulates a lazy product expression: this += to_add.a * to_add.b,
// routed through AddProductOf() so FMA is used where available.
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator+=(
    const internal::AvxMultiplyExpr &to_add) {
  AddProductOf(to_add.a, to_add.b);
  return *this;
}
// Builds a lazy multiplication expression; evaluation is deferred until the
// expression is assigned or accumulated.
DRAGNN_AVXVA_ALWAYS_INLINE internal::AvxMultiplyExpr operator*(
    const AvxFloatVec &a, const AvxFloatVec &b) {
  return internal::AvxMultiplyExpr{a, b};
}
// Binary operators, each expressed as copy-then-compound-assign.
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec
operator+(const internal::AvxMultiplyExpr &expr, const AvxFloatVec &v) {
  AvxFloatVec sum(v);
  sum += expr;
  return sum;
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec operator+(const AvxFloatVec &a,
                                                 const AvxFloatVec &b) {
  AvxFloatVec sum(a);
  sum += b;
  return sum;
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec operator/(const AvxFloatVec &a,
                                                 const AvxFloatVec &b) {
  AvxFloatVec quotient(a);
  quotient /= b;
  return quotient;
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec operator-(const AvxFloatVec &a,
                                                 const AvxFloatVec &b) {
  AvxFloatVec difference(a);
  difference -= b;
  return difference;
}
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
#undef DRAGNN_AVXVA_ALWAYS_INLINE
#undef DRAGNN_AVXVA_INLINED_UNROLLED
#endif // DRAGNN_RUNTIME_MATH_AVX_VECTOR_ARRAY_H_
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/runtime/math/avx_vector_array.h"
#include <cmath>
#include "dragnn/runtime/test/helpers.h"
#include "tensorflow/core/platform/test.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
namespace {
// Loading 8 floats and storing them back should reproduce the input exactly.
TEST(AvxVectorTest, LoadAndStore) {
  UniqueVector<float> src(kAvxWidth);
  UniqueVector<float> dst(kAvxWidth);
  InitRandomVector(*src);
  InitRandomVector(*dst);  // Pre-fill so a no-op Store() can't pass.
  AvxFloatVec vec;
  vec.Load(src->data());
  vec.Store(dst->data());
  for (int i = 0; i < kAvxWidth; ++i) {
    EXPECT_EQ((*src)[i], (*dst)[i]);
  }
}
// Test flooring with assignment, just to make the compiler not erase aliases.
TEST(AvxVectorTest, AssignmentAndFloor) {
  UniqueVector<float> input(kAvxWidth);
  UniqueVector<float> output(kAvxWidth);
  UniqueVector<float> floored(kAvxWidth);
  InitRandomVector(*input);
  InitRandomVector(*output);
  AvxFloatVec vec;
  vec.Load(input->data());
  // Copy before flooring so we can check that the copy is unaffected.
  AvxFloatVec vec2 = vec;
  vec.Floor();
  vec.Store(floored->data());
  vec2.Store(output->data());
  for (int i = 0; i < kAvxWidth; ++i) {
    EXPECT_EQ((*input)[i], (*output)[i]);
    EXPECT_EQ(floor((*input)[i]), (*floored)[i]);
  }
}
// Verifies Clamp() against scalar fmin/fmax, and that the random inputs
// actually exercised clamping (some value fell outside the range).
TEST(AvxVectorTest, ClampTest) {
  bool modified = false;  // check that some value was clamped.
  AvxVectorFuzzTest(
      [](AvxFloatVec *vec) { vec->Clamp(-0.314f, 0.314f); },
      [&modified](float input_value, float output_value) {
        modified = modified || input_value < -0.314 || input_value > 0.314;
        EXPECT_EQ(fmax(-0.314f, fmin(0.314f, input_value)), output_value);
      });
  EXPECT_TRUE(modified) << "No values fell outside test range for ClampTest().";
}
// LoadConstVector() should broadcast the scalar into all 8 lanes.
TEST(AvxVectorTest, LoadConstAndStore) {
  UniqueVector<float> stored(kAvxWidth);
  InitRandomVector(*stored);  // Pre-fill so a no-op Store() can't pass.
  AvxFloatVec vec;
  vec.LoadConstVector(3.14f);
  vec.Store(stored->data());
  for (int i = 0; i < kAvxWidth; ++i) {
    EXPECT_EQ((*stored)[i], 3.14f);
  }
}
// Verifies element-wise addition via self-addition (doubling each lane).
TEST(AvxVectorTest, AddTest) {
  AvxVectorFuzzTest(  //
      [](AvxFloatVec *vec) { (*vec) += *vec; },
      [](float input_value, float output_value) {
        EXPECT_EQ(input_value * 2, output_value);
      });
}
// Verifies element-wise subtraction by subtracting a constant vector of 1s.
TEST(AvxVectorTest, SubtractTest) {
  AvxVectorFuzzTest(
      [](AvxFloatVec *vec) {
        AvxFloatVec one;
        one.LoadConstVector(1.0f);
        (*vec) -= one;
      },
      [](float input_value, float output_value) {
        EXPECT_EQ(input_value - 1.0f, output_value);
      });
}
// Verifies element-wise division by computing reciprocals (1 / input).
TEST(AvxVectorTest, DivideTest) {
  AvxVectorFuzzTest(
      [](AvxFloatVec *vec) {
        AvxFloatVec result;
        result.LoadConstVector(1.0f);
        result /= *vec;
        *vec = result;
      },
      [](float input_value, float output_value) {
        EXPECT_EQ(1.0f / input_value, output_value);
      });
}
// Basic round-trip test for AvxFloatVecArray<1>. Half of the purpose is to
// ensure that the float API still compiles for odd-sized arrays; adding a
// call to array.Load(TruncatedFloat16 *source) here should produce a compiler
// error (those loads require even N).
TEST(AvxFloatVecArrayTest, SingletonArrayLoadsAndStores) {
  AvxFloatVecArray<1> array;
  UniqueVector<float> src(kAvxWidth);
  UniqueVector<float> dst(kAvxWidth);
  InitRandomVector(*src);
  InitRandomVector(*dst);  // Pre-fill so a no-op Store() can't pass.
  array.Load(src->data());
  array.Store(dst->data());
  for (int i = 0; i < kAvxWidth; ++i) {
    EXPECT_EQ((*src)[i], (*dst)[i]);
  }
}
// Verifies the bfloat16 load: values are written pre-permuted with
// FastUnpackPermutation() so that, after the permuted SIMD unpack, the
// decompressed floats come out in natural order.
TEST(AvxFloatVecArrayTest, LoadTruncatedFloat16) {
  AvxFloatVecArray<2> array;
  UniqueVector<TruncatedFloat16> values(2 * kAvxWidth);
  UniqueVector<float> decompressed(2 * kAvxWidth);
  for (int i = 0; i < 2 * kAvxWidth; ++i) {
    int permuted = FastUnpackPermutation(i);
    (*values)[i] = TruncatedFloat16::DebugFromFloat(permuted / 10.0);
  }
  // Ensure that state persisted from other tests won't cause this test to
  // erroneously pass.
  array.LoadConstVector(-1.0f);
  array.Load(values->data());
  array.Store(decompressed->data());
  for (int i = 0; i < 2 * kAvxWidth; ++i) {
    ASSERT_NEAR((*decompressed)[i], i / 10.0, 0.01);
  }
}
// Verifies the IEEE half-precision load, which unpacks in memory order.
// Logs and skips when the binary was not compiled with F16C support.
TEST(AvxFloatVecArrayTest, LoadIeeeFloat16) {
#if defined(__F16C__)
  AvxFloatVecArray<2> array;
  UniqueVector<IeeeFloat16> values(2 * kAvxWidth);
  UniqueVector<float> decompressed(2 * kAvxWidth);
  for (int i = 0; i < 2 * kAvxWidth; ++i) {
    (*values)[i] = IeeeFloat16::DebugFromFloat(i / 10.0);
  }
  // Ensure that state persisted from other tests won't cause this test to
  // erroneously pass.
  array.LoadConstVector(-1.0f);
  array.Load(values->data());
  array.Store(decompressed->data());
  for (int i = 0; i < 2 * kAvxWidth; ++i) {
    ASSERT_NEAR((*decompressed)[i], i / 10.0, 0.01);
  }
#else
  LOG(INFO) << "Test binary wasn't compiled with F16C support, so skipping "
            << "this test.";
#endif
}
// FastUnpackPermutation() must match the known 16-element unpack order.
TEST(AvxFloatVecArrayTest, PermutationFunctionIsEqualToTable) {
  const int kExpected[] = {0, 1, 2, 3, 8,  9,  10, 11,
                           4, 5, 6, 7, 12, 13, 14, 15};
  for (int i = 0; i < kAvxWidthHalfPrecision; ++i) {
    EXPECT_EQ(FastUnpackPermutation(i), kExpected[i]);
  }
}
} // namespace
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
// Copyright 2018 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
// Compatibility support for Eigen.
#ifndef DRAGNN_RUNTIME_MATH_EIGEN_H_
#define DRAGNN_RUNTIME_MATH_EIGEN_H_
#include "dragnn/runtime/alignment.h"
#include "dragnn/runtime/math/types.h"
#include "third_party/eigen3/Eigen/Core"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
namespace internal {
// Returns a combination of bit-options for Eigen matrices. Row-major to
// match the runtime's row-oriented matrix accessors.
constexpr int GetEigenMatrixOptions() {
  return Eigen::AutoAlign | Eigen::RowMajor;
}
// Returns a combination of bit-options for Eigen maps of runtime types.
constexpr int GetEigenMapOptions() {
  // Runtime allocations are at least as strictly aligned as Eigen requires,
  // so maps may assume aligned data.
  static_assert(kAlignmentBytes >= EIGEN_MAX_ALIGN_BYTES,
                "Runtime alignment is not compatible with Eigen alignment.");
  return Eigen::Aligned;
}
// Eigen matrix and (row) vector types. Don't use these directly; instead use
// the public Map types and functions below to wrap runtime types.
template <class T>
using EigenVector =
    Eigen::Matrix<T, 1, Eigen::Dynamic, GetEigenMatrixOptions()>;
template <class T>
using EigenMatrix =
    Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, GetEigenMatrixOptions()>;
// Eigen stride for matrix types: the outer (row) stride is dynamic, while
// the inner stride is 1 because elements within a row are contiguous.
using EigenMatrixStride = Eigen::Stride<Eigen::Dynamic, 1>;
// Returns the Eigen stride associated with the |matrix|.
template <class T>
EigenMatrixStride GetEigenMatrixStride(MatrixImpl<T> matrix) {
  return EigenMatrixStride(matrix.row_stride(), 1);
}
} // namespace internal
// Eigen wrappers around a runtime-allocated matrix or (row) vector. These are
// views over the runtime storage; no data is copied.
template <class T>
using EigenVectorMap =
    Eigen::Map<const internal::EigenVector<T>, internal::GetEigenMapOptions()>;
template <class T>
using MutableEigenVectorMap =
    Eigen::Map<internal::EigenVector<T>, internal::GetEigenMapOptions()>;
template <class T>
using EigenMatrixMap =
    Eigen::Map<const internal::EigenMatrix<T>, internal::GetEigenMapOptions(),
               internal::EigenMatrixStride>;
template <class T>
using MutableEigenMatrixMap =
    Eigen::Map<internal::EigenMatrix<T>, internal::GetEigenMapOptions(),
               internal::EigenMatrixStride>;
// Returns an Eigen wrapper around the |vector| or |matrix|.
template <class T>
EigenVectorMap<T> AsEigenMap(Vector<T> vector) {
  return EigenVectorMap<T>(vector.data(), vector.size());
}
template <class T>
MutableEigenVectorMap<T> AsEigenMap(MutableVector<T> vector) {
  return MutableEigenVectorMap<T>(vector.data(), vector.size());
}
template <class T>
EigenMatrixMap<T> AsEigenMap(Matrix<T> matrix) {
  // The row stride may differ from num_columns() (e.g. alignment padding),
  // so pass it explicitly.
  return EigenMatrixMap<T>(matrix.data(), matrix.num_rows(),
                           matrix.num_columns(),
                           internal::GetEigenMatrixStride(matrix));
}
template <class T>
MutableEigenMatrixMap<T> AsEigenMap(MutableMatrix<T> matrix) {
  return MutableEigenMatrixMap<T>(matrix.data(), matrix.num_rows(),
                                  matrix.num_columns(),
                                  internal::GetEigenMatrixStride(matrix));
}
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
#endif // DRAGNN_RUNTIME_MATH_EIGEN_H_
// Copyright 2018 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/runtime/math/eigen.h"
#include <vector>
#include "dragnn/core/test/generic.h"
#include "dragnn/runtime/math/types.h"
#include "dragnn/runtime/test/helpers.h"
#include "tensorflow/core/platform/test.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
namespace {
// Expects that two pointers point to the same address.
void ExpectSameAddress(const void *ptr1, const void *ptr2) {
  EXPECT_EQ(ptr1, ptr2);
}
// Expects that the |vector| has the |values|.
void ExpectValues(MutableVector<float> vector,
                  const std::vector<float> &values) {
  ASSERT_EQ(vector.size(), values.size());
  for (int i = 0; i < values.size(); ++i) {
    EXPECT_EQ(vector[i], values[i]);
  }
}
// Expects that the Eigen |matrix| has the |values|, given as row-major nested
// vectors (outer index = row, inner index = column).
template <class EigenMatrix>
void ExpectValues(const EigenMatrix &matrix,
                  const std::vector<std::vector<float>> &values) {
  ASSERT_EQ(matrix.rows(), values.size());
  for (int row = 0; row < matrix.rows(); ++row) {
    ASSERT_EQ(matrix.cols(), values[row].size());
    for (int column = 0; column < matrix.cols(); ++column) {
      EXPECT_EQ(matrix(row, column), values[row][column]);
    }
  }
}
// Tests that an Eigen vector map references the same memory as the underlying
// runtime vector: writes through either view are visible through the other.
TEST(EigenTest, Vector) {
  UniqueVector<float> vector({1.0, 2.0, 3.0, 4.0});
  EigenVectorMap<float> const_eigen_vector = AsEigenMap(Vector<float>(*vector));
  ExpectSameAddress(const_eigen_vector.data(), vector->data());
  ExpectValues(const_eigen_vector, {{1.0, 2.0, 3.0, 4.0}});
  MutableEigenVectorMap<float> mutable_eigen_vector = AsEigenMap(*vector);
  ExpectSameAddress(mutable_eigen_vector.data(), vector->data());
  ExpectValues(mutable_eigen_vector, {{1.0, 2.0, 3.0, 4.0}});
  // Write into the runtime vector and read from the other views.
  (*vector)[0] = 10.0;
  (*vector)[1] = 20.0;
  (*vector)[2] = 30.0;
  (*vector)[3] = 40.0;
  ExpectValues(const_eigen_vector, {{10.0, 20.0, 30.0, 40.0}});
  ExpectValues(mutable_eigen_vector, {{10.0, 20.0, 30.0, 40.0}});
  // Write into the mutable Eigen vector and read from the other views.
  mutable_eigen_vector << 100.0, 200.0, 300.0, 400.0;
  ExpectValues(const_eigen_vector, {{100.0, 200.0, 300.0, 400.0}});
  ExpectValues(*vector, {100.0, 200.0, 300.0, 400.0});
}
// Tests that an Eigen matrix map references the same memory as the underlying
// runtime matrix: writes through either view are visible through the other.
TEST(EigenTest, Matrix) {
  UniqueMatrix<float> matrix({{1.0, 2.0, 3.0},  //
                              {4.0, 5.0, 6.0},  //
                              {7.0, 8.0, 9.0}});
  EigenMatrixMap<float> const_eigen_matrix = AsEigenMap(Matrix<float>(*matrix));
  ExpectSameAddress(const_eigen_matrix.data(), matrix->row(0).data());
  ExpectValues(const_eigen_matrix, {{1.0, 2.0, 3.0},  //
                                    {4.0, 5.0, 6.0},  //
                                    {7.0, 8.0, 9.0}});
  MutableEigenMatrixMap<float> mutable_eigen_matrix = AsEigenMap(*matrix);
  ExpectSameAddress(mutable_eigen_matrix.data(), matrix->row(0).data());
  ExpectValues(mutable_eigen_matrix, {{1.0, 2.0, 3.0},  //
                                      {4.0, 5.0, 6.0},  //
                                      {7.0, 8.0, 9.0}});
  // Write into the runtime matrix and read from the other views.
  matrix->row(0)[0] = 10.0;
  matrix->row(0)[1] = 20.0;
  matrix->row(0)[2] = 30.0;
  matrix->row(1)[0] = 40.0;
  matrix->row(1)[1] = 50.0;
  matrix->row(1)[2] = 60.0;
  matrix->row(2)[0] = 70.0;
  matrix->row(2)[1] = 80.0;
  matrix->row(2)[2] = 90.0;
  ExpectValues(const_eigen_matrix, {{10.0, 20.0, 30.0},  //
                                    {40.0, 50.0, 60.0},  //
                                    {70.0, 80.0, 90.0}});
  ExpectValues(mutable_eigen_matrix, {{10.0, 20.0, 30.0},  //
                                      {40.0, 50.0, 60.0},  //
                                      {70.0, 80.0, 90.0}});
  // Write into the mutable Eigen matrix and read from the other views.
  mutable_eigen_matrix << 100.0, 200.0, 300.0,
                          400.0, 500.0, 600.0,
                          700.0, 800.0, 900.0;
  ExpectValues(const_eigen_matrix, {{100.0, 200.0, 300.0},  //
                                    {400.0, 500.0, 600.0},  //
                                    {700.0, 800.0, 900.0}});
  ExpectValues(matrix->row(0), {100.0, 200.0, 300.0});
  ExpectValues(matrix->row(1), {400.0, 500.0, 600.0});
  ExpectValues(matrix->row(2), {700.0, 800.0, 900.0});
}
} // namespace
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
// Declares 16-bit floating point types.
#ifndef DRAGNN_RUNTIME_MATH_FLOAT16_TYPES_H_
#define DRAGNN_RUNTIME_MATH_FLOAT16_TYPES_H_
#if defined(__F16C__)
// <immintrin.h> declares the F16C scalar conversion intrinsics
// (_cvtsh_ss and _cvtss_sh); <emmintrin.h> alone only provides SSE2.
#include <emmintrin.h>
#include <immintrin.h>
#endif
#include "syntaxnet/base.h"
#include "tensorflow/core/lib/core/casts.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
// Represents a truncated 16-bit floating point value. This corresponds to
// `bfloat16` in TensorFlow. It just chops the last 16 least-significant bits
// off the significand of a 32-bit floating point value, leaving 7 significand
// bits, 8 exponent bits, and 1 sign bit.
struct TruncatedFloat16 {
// Slow unpacking routine. Use avx_vector_array.h for normal operation.
float DebugToFloat() const {
uint32 upcast = bits;
upcast <<= 16;
return tensorflow::bit_cast<float>(upcast);
}
// Slow packing routine. Use avx_vector_array.h for normal operation.
static TruncatedFloat16 DebugFromFloat(float value) {
uint32 float_bits = tensorflow::bit_cast<uint32>(value);
return TruncatedFloat16{static_cast<uint16>(float_bits >> 16)};
}
uint16 bits;
};
static_assert(sizeof(TruncatedFloat16) == sizeof(uint16), "Bad struct size");
// Currently, only CPUs with the F16C instruction set are supported. All use of
// this struct should be flag-guarded.
//
// If this becomes a problem, we can implement this method with Eigen's
// CUDA/Half.h.
#if defined(__F16C__)
// Represents an IEEE-754 16-bit floating point value. This has 10 significand
// bits, 5 exponent bits, and 1 sign bit.
//
// TODO(googleuser): Either add compatibility support, or delete this code if
// it turns out not to be helpful.
struct IeeeFloat16 {
  // Slow unpacking routine. Use avx_vector_array.h for normal operation.
  float DebugToFloat() const { return _cvtsh_ss(bits); }
  // Slow packing routine. Use avx_vector_array.h for normal operation.
  // The second argument (0) selects the default rounding mode.
  static IeeeFloat16 DebugFromFloat(float value) {
    return IeeeFloat16{_cvtss_sh(value, 0)};
  }
  // Raw IEEE-754 binary16 bit pattern.
  uint16 bits;
};
static_assert(sizeof(IeeeFloat16) == sizeof(uint16), "Bad struct size");
#endif
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
#endif // DRAGNN_RUNTIME_MATH_FLOAT16_TYPES_H_
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/runtime/math/float16_types.h"
#include "tensorflow/core/platform/test.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
namespace {
// C++11 doesn't support binary literals like 0b01001, so this helper parses a
// 16-character string of '0'/'1' into the corresponding uint16 (MSB first).
uint16 ParseBinaryString(const string &bits) {
  CHECK_EQ(bits.size(), 16) << "ParseBinaryString expects full 16-bit values";
  uint16 value = 0;
  for (const char bit : bits) {
    CHECK(bit == '0' || bit == '1') << "String must be 0's and 1's.";
    value = (value << 1) | (bit == '1' ? 1 : 0);
  }
  return value;
}
// Round-trips values through IEEE half precision: each value must come back
// within tolerance, and (with only 10 significand bits) at least one value
// must not round-trip exactly. Skipped when not compiled with F16C.
TEST(Float16TypesTest, IeeeFloat16Accuracy) {
#if defined(__F16C__)
  bool some_not_exact = false;
  for (int i = -100; i < 100; ++i) {
    float value = i / 10.0f;
    IeeeFloat16 half = IeeeFloat16::DebugFromFloat(value);
    float unpacked = half.DebugToFloat();
    EXPECT_NEAR(value, unpacked, 0.01);
    some_not_exact = some_not_exact || (value != unpacked);
  }
  EXPECT_TRUE(some_not_exact);
#else
  LOG(INFO) << "Test binary wasn't compiled with F16C support, so skipping "
            << "this test.";
#endif
}
// Same round-trip check for bfloat16, with a looser tolerance because it
// keeps only 7 significand bits.
TEST(Float16TypesTest, TruncatedAccuracy) {
  bool some_not_exact = false;
  for (int i = -100; i < 100; ++i) {
    float value = i / 10.0f;
    TruncatedFloat16 half = TruncatedFloat16::DebugFromFloat(value);
    float unpacked = half.DebugToFloat();
    EXPECT_NEAR(value, unpacked, 0.06);
    some_not_exact = some_not_exact || (value != unpacked);
  }
  EXPECT_TRUE(some_not_exact);
}
// Verifies +/-1.0 against hand-written bfloat16 bit patterns
// (1 sign bit, 8 exponent bits, 7 significand bits).
TEST(Float16TypesTest, TruncatedKnownBinaryRepresentation) {
  uint16 neg_1 = ParseBinaryString("1011111110000000");
  uint16 one = ParseBinaryString("0011111110000000");
  EXPECT_EQ((TruncatedFloat16{neg_1}).DebugToFloat(), -1.0f);
  EXPECT_EQ((TruncatedFloat16{one}).DebugToFloat(), 1.0f);
}
// Verifies +/-1.0 against hand-written IEEE-754 half bit patterns
// (1 sign bit, 5 exponent bits, 10 significand bits).
TEST(Float16TypesTest, IeeeFloat16KnownBinaryRepresentation) {
#if defined(__F16C__)
  uint16 neg_1 = ParseBinaryString("1011110000000000");
  uint16 one = ParseBinaryString("0011110000000000");
  EXPECT_EQ((IeeeFloat16{neg_1}).DebugToFloat(), -1.0f);
  EXPECT_EQ((IeeeFloat16{one}).DebugToFloat(), 1.0f);
#else
  LOG(INFO) << "Test binary wasn't compiled with F16C support, so skipping "
            << "this test.";
#endif
}
} // namespace
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
// Computes `[y_1, y_2, ...] = M * [v_1, v_2, ...] + [b_1, b_2, ...]`, where
//
// M is a `m x n` dense matrix.
// v_i are `n`-dimensional dense vectors.
// b_i and y_i are `m`-dimensional dense vectors.
//
// Unfortunately even larger (e.g. 128x128) matrix sizes are not sufficient to
// hide the latency of a function call. So the entire implementation needs to
// live in this header file. Please make sure to use all of the optimization
// flags mentioned in the BUILD file in any client libraries.
#ifndef DRAGNN_RUNTIME_MATH_SGEMVV_H_
#define DRAGNN_RUNTIME_MATH_SGEMVV_H_
#if defined(__SSE2__)
#include <xmmintrin.h>
#endif
#include "dragnn/runtime/math/avx_vector_array.h"
#include "dragnn/runtime/math/types.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
#define DRAGNN_SGEMVV_ATTRIBUTE_ALWAYS_INLINE __attribute__((always_inline))
#ifdef __clang__
#define DRAGNN_SGEMVV_GCC_UNROLL
#else
#define DRAGNN_SGEMVV_GCC_UNROLL __attribute__((optimize("unroll-loops")))
#endif
namespace syntaxnet {
namespace dragnn {
namespace runtime {
// Represents `v, b` from one operation `y = M * v + b`.
//
// All `num_ops` operations share the same matrix M but have their own input
// and initial (bias) vectors; the arrays below are indexed by operation.
template <int num_ops>
struct SgemvInputBatch {
  // Dense input vectors `v_i`, each with one value per matrix column.
  const float *input[num_ops];
  // Initial vectors `b_i` added into the product, one value per matrix row.
  // May be null for the "NoInitial" product variant, which never reads them.
  const float *initial[num_ops];
};
// Output vectors `y_i` from a batch of `num_ops` operations `y = M * v + b`,
// indexed by operation; each holds one value per matrix row.
template <int num_ops>
struct SgemvOutputBatch {
  // Destination buffers, one per operation.
  float *output[num_ops];
};
// Matrix argument for the SGEMV/SGEMVV operation. Based on row-batched
// column-major matrices, but pulls the batch size into a template argument
// so code can be compiled more efficiently.
//
// `sse_batch_size` is the row-block size of the underlying blocked matrix and
// must be a multiple of kAvxWidth (8 floats); see the static_assert in
// MatrixMultiVectorProductImpl().
template <int sse_batch_size, typename ElementType = float>
class SgemvMatrix final {
 public:
  // Convenience type alias.
  using MatrixType =
      BlockedMatrix<ElementType, BlockedMatrixFormat::kRowBlockedColumnMajor>;
  // Creates an empty SgemvMatrix.
  SgemvMatrix() = default;
  // Initializes the new matrix. Returns an InvalidArgumentError if the block
  // size of `matrix` is not equal to `sse_batch_size`.
  ::tensorflow::Status Initialize(const MatrixType &matrix);
  // Computes the matrix-vector product with a set of other inputs. See
  // top-level comment for the general algorithm. Writes all matrix rows; use
  // the masked variants below for partial outputs.
  template <int num_ops, int lookahead_1 = 8, int lookahead_2 = 8>
  void DRAGNN_SGEMVV_ATTRIBUTE_ALWAYS_INLINE DRAGNN_SGEMVV_GCC_UNROLL
  MatrixMultiVectorProduct(const SgemvInputBatch<num_ops> &inputs,
                           SgemvOutputBatch<num_ops> *outputs) const {
    // The -1 sentinel for output_vector_elements is never read because
    // `mask_input_output` is false.
    MatrixMultiVectorProductImpl<num_ops, /*mask_input_output=*/false,
                                 /*read_initial=*/true, lookahead_1,
                                 lookahead_2>(inputs, -1, outputs);
  }
  // Computes the matrix-vector product with a set of other inputs. See
  // top-level comment for the general algorithm. This variant allows another
  // parameter, `output_vector_elements`, to write to outputs which are a
  // multiple of kAvxWidth (8 floats, or 32 bytes) but not necessarily
  // sse_batch_size. It is slightly slower, and the slowdown is probably more
  // than measurement noise.
  //
  // |lookahead_1| and |lookahead_2| parameters control prefetching, and should
  // usually be tuned via a script. They issue prefetch instructions that are
  // `lookahead_1 * sse_batch_size` values ahead of the current matrix entry
  // being read, if `lookahead_1 != 0` (and `(lookahead_1 + lookahead_2) *
  // sse_batch_size` values, if lookahead_2 != 0). To reiterate, all prefetching
  // can be disabled by setting |lookahead_1| to 0, or the second prefetch can
  // be disabled by setting |lookahead_2| to 0.
  template <int num_ops, int lookahead_1 = 8, int lookahead_2 = 8>
  void DRAGNN_SGEMVV_ATTRIBUTE_ALWAYS_INLINE DRAGNN_SGEMVV_GCC_UNROLL
  MaskedMatrixMultiVectorProduct(const SgemvInputBatch<num_ops> &inputs,
                                 int output_vector_elements,
                                 SgemvOutputBatch<num_ops> *outputs) const {
    MatrixMultiVectorProductImpl<num_ops, /*mask_input_output=*/true,
                                 /*read_initial=*/true, lookahead_1,
                                 lookahead_2>(inputs, output_vector_elements,
                                              outputs);
  }
  // Like the above, but assumes existing values are zero instead of reading
  // them. The `initial` pointers in |inputs| are never dereferenced.
  template <int num_ops>
  void DRAGNN_SGEMVV_ATTRIBUTE_ALWAYS_INLINE DRAGNN_SGEMVV_GCC_UNROLL
  MaskedMatrixMultiVectorProductNoInitial(
      const SgemvInputBatch<num_ops> &inputs, int output_vector_elements,
      SgemvOutputBatch<num_ops> *outputs) const {
    MatrixMultiVectorProductImpl<num_ops, /*mask_input_output=*/true,
                                 /*read_initial=*/false>(
        inputs, output_vector_elements, outputs);
  }
  // Read-only accessor.
  const MatrixType &matrix() const { return matrix_; }
 private:
  // Shared implementation of all product variants above; see the public
  // methods for the parameter semantics.
  template <int num_ops, bool mask_input_output, bool read_initial,
            int lookahead_1 = 8, int lookahead_2 = 8>
  DRAGNN_SGEMVV_ATTRIBUTE_ALWAYS_INLINE DRAGNN_SGEMVV_GCC_UNROLL void
  MatrixMultiVectorProductImpl(const SgemvInputBatch<num_ops> &inputs,
                               int output_vector_elements,
                               SgemvOutputBatch<num_ops> *outputs) const;
  // The underlying row-blocked column-major matrix data.
  MatrixType matrix_;
};
// Implementation details.
//
// Processes the matrix in blocks of `sse_batch_size` output rows. For each
// row block it maintains `num_ops` AVX accumulator arrays, streams the
// column-major block data once, and broadcasts each input coefficient across
// an AVX vector — so all `num_ops` products share a single pass over the
// matrix data.
template <int sse_batch_size, typename ElementType>
template <int num_ops, bool mask_input_output, bool read_initial,
          int lookahead_1, int lookahead_2>
inline void DRAGNN_SGEMVV_ATTRIBUTE_ALWAYS_INLINE DRAGNN_SGEMVV_GCC_UNROLL
SgemvMatrix<sse_batch_size, ElementType>::MatrixMultiVectorProductImpl(
    const SgemvInputBatch<num_ops> &inputs, int output_vector_elements,
    SgemvOutputBatch<num_ops> *outputs) const {
  static_assert(sse_batch_size % kAvxWidth == 0,
                "sse_batch_size must be a multiple of kAvxWidth (8).");
  if (mask_input_output) {
    DCHECK_EQ(output_vector_elements % kAvxWidth, 0)
        << "output_vector_elements must be padded to alignment";
  }
  // The blocked matrix's vectors are streamed through a single running
  // pointer, advanced by one column block (sse_batch_size values) per column.
  const ElementType *curr_matrix_ptr = matrix_.vector(0).data();
  // Loop over blocks of output rows. Each block of output rows will get a
  // partial sum of the [matrix-vector] dot product, where the range of that
  // partial sum is designated by start_col and end_col.
  for (int row_start = 0; row_start < matrix_.num_rows();
       row_start += sse_batch_size) {
    // Number of this block's AVX vectors that fall inside the valid output
    // range; only meaningful (and only used) when mask_input_output is true.
    const int load_store_max_idx =
        (output_vector_elements - row_start) / kAvxWidth;
    AvxFloatVecArray<sse_batch_size / kAvxWidth> accumulators[num_ops];
    // Read inputs.
    for (int op = 0; op < num_ops; ++op) {
      if (read_initial) {
        if (mask_input_output) {
          accumulators[op].Load(&inputs.initial[op][row_start],
                                load_store_max_idx);
        } else {
          accumulators[op].Load(&inputs.initial[op][row_start]);
        }
      } else {
        // No initial vector: start the accumulation from zero.
        accumulators[op].LoadConstVector(0.0f);
      }
    }
    // Compute matrix-vector product.
    for (int col = 0; col < matrix_.num_columns(); ++col) {
      if (lookahead_1 != 0) {
#if defined(__SSE2__)
        // Prefetch matrix data lookahead_1 (and, optionally,
        // lookahead_1 + lookahead_2) column blocks ahead of the cursor.
        _mm_prefetch(curr_matrix_ptr + lookahead_1 * sse_batch_size,
                     _MM_HINT_T0);
        if (lookahead_2 != 0) {
          _mm_prefetch(
              curr_matrix_ptr + (lookahead_1 + lookahead_2) * sse_batch_size,
              _MM_HINT_T0);
        }
#endif
      }
      // These are the coefficients from each vector at column `col` (just
      // broadcast over the whole AVX array).
      AvxFloatVec weights[num_ops];
      for (int op = 0; op < num_ops; ++op) {
        weights[op].LoadConstVector(inputs.input[op][col]);
      }
      // Loop over each AVX vector and add the current sub-product.
      AvxFloatVecArray<sse_batch_size / kAvxWidth> matrix_block;
      matrix_block.Load(curr_matrix_ptr);
      curr_matrix_ptr += sse_batch_size;
      for (int row_offset = 0; row_offset < sse_batch_size / kAvxWidth;
           row_offset++) {
        for (int op = 0; op < num_ops; ++op) {
          accumulators[op].vectors[row_offset].AddProductOf(
              weights[op], matrix_block.vectors[row_offset]);
        }
      }
    }
    // Save the results.
    for (int op = 0; op < num_ops; ++op) {
      if (mask_input_output) {
        // Masked store: rows at or beyond output_vector_elements are left
        // untouched.
        accumulators[op].Store(&outputs->output[op][row_start],
                               load_store_max_idx);
      } else {
        accumulators[op].Store(&outputs->output[op][row_start]);
      }
    }
  }
}
// Adopts |matrix| after validating that its block size matches the
// compile-time sse_batch_size; otherwise returns InvalidArgument.
template <int sse_batch_size, typename ElementType>
::tensorflow::Status SgemvMatrix<sse_batch_size, ElementType>::Initialize(
    const SgemvMatrix<sse_batch_size, ElementType>::MatrixType &matrix) {
  if (matrix.block_size() == sse_batch_size) {
    matrix_ = matrix;
    return ::tensorflow::Status::OK();
  }
  return ::tensorflow::errors::InvalidArgument(
      "Blocked matrix block_size (", matrix.block_size(),
      ") must be equal to sse_batch_size (", sse_batch_size, ")");
}
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
#undef DRAGNN_SGEMVV_ATTRIBUTE_ALWAYS_INLINE
#undef DRAGNN_SGEMVV_GCC_UNROLL
#endif // DRAGNN_RUNTIME_MATH_SGEMVV_H_
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/runtime/math/sgemvv.h"
#include <chrono>
#include <random>
#include "dragnn/core/test/generic.h"
#include "dragnn/runtime/math/arithmetic.h"
#include "dragnn/runtime/math/transformations.h"
#include "dragnn/runtime/math/types.h"
#include "dragnn/runtime/test/helpers.h"
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/test.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
// Reference implementation of `y = M * v + b`, used to validate the
// vectorized SGEMV kernels. Dimensions come from |matrix|: v has
// num_columns() entries; b and y have num_rows() entries.
void naive_sgemv(const MutableMatrix<float> &matrix, const float *v,
                 const float *b, float *y) {
  const int rows = matrix.num_rows();
  const int cols = matrix.num_columns();
  for (int r = 0; r < rows; ++r) {
    // Accumulate in the same order as the original (bias first, then columns
    // left to right) so the floating-point result is bit-identical.
    float accumulator = b[r];
    for (int c = 0; c < cols; ++c) {
      accumulator += matrix.row(r)[c] * v[c];
    }
    y[r] = accumulator;
  }
}
// Everything except floats requires copying.
//
// Compile-time predicate used by ConvertToSgemv() below: float matrix data
// can be used in place, while the half-precision element types must first be
// converted element-by-element through a temporary float matrix.
template <class ElementType>
constexpr bool RequiresCopy();
template <>
constexpr bool RequiresCopy<TruncatedFloat16>() {
  return true;
}
#if defined(__F16C__)
// The IeeeFloat16 specialization exists only when built with F16C support.
template <>
constexpr bool RequiresCopy<IeeeFloat16>() {
  return true;
}
#endif
template <>
constexpr bool RequiresCopy<float>() {
  return false;
}
// Converts one blocked vector of float values into |output|'s element type.
// The float specialization is a no-op because float data is used in place
// (see RequiresCopy<float>() above).
template <class ElementType>
void ConvertRow(Vector<float> input, MutableVector<ElementType> output);
template <>
void ConvertRow<float>(Vector<float> input, MutableVector<float> output) {}
// The truncated-float16 conversion also permutes each 16-element group via
// FastUnpackPermutation.
// NOTE(review): presumably the permutation matches the order in which the
// vectorized unpack reads values — confirm against FastUnpackPermutation.
template <>
void ConvertRow<TruncatedFloat16>(Vector<float> input,
                                  MutableVector<TruncatedFloat16> output) {
  CHECK_EQ(input.size() % 16, 0);
  CHECK_EQ(input.size(), output.size());
  for (int i = 0; i < input.size(); ++i) {
    // Destination index i is filled from the permuted source index within the
    // same 16-element group.
    int i_permuted = (i / 16) * 16 + FastUnpackPermutation(i % 16);
    output[i] = TruncatedFloat16::DebugFromFloat(input[i_permuted]);
  }
}
#if defined(__F16C__)
// IEEE float16 conversion is a straight element-wise pack, no permutation.
template <>
void ConvertRow<IeeeFloat16>(Vector<float> input,
                             MutableVector<IeeeFloat16> output) {
  CHECK_EQ(input.size() % 16, 0);
  CHECK_EQ(input.size(), output.size());
  for (int i = 0; i < input.size(); ++i) {
    output[i] = IeeeFloat16::DebugFromFloat(input[i]);
  }
}
#endif
// Converts a matrix to SGEMV. If the element type is not float, copies the
// matrix and then converts it.
//
// |sgemv_storage| is the caller-owned backing storage for the returned
// matrix's blocked data; the returned SgemvMatrix points into it and does not
// own it. This helper reports failures via TF_EXPECT_OK, so it is only
// suitable for use in tests.
template <int sse_batch_size, typename ElementType = float>
SgemvMatrix<sse_batch_size, ElementType> ConvertToSgemv(
    const Matrix<float> &matrix, UniqueMatrix<ElementType> *sgemv_storage) {
  MutableBlockedMatrix<ElementType, BlockedMatrixFormat::kRowBlockedColumnMajor>
      blocked;
  TF_EXPECT_OK(blocked.Reset(sgemv_storage->area(), matrix.num_rows(),
                             matrix.num_columns()));
  // TODO(googleuser): Clean this up when we can use C++17's `if constexpr`
  // ... then we will not have to introduce this raw pointer, which is either
  // an actual new variable or alias to `sgemv_storage`.
  UniqueMatrix<float> *uncompressed;
  if (RequiresCopy<ElementType>()) {
    // Temporary float staging matrix; deleted below once converted.
    uncompressed = new UniqueMatrix<float>((*sgemv_storage)->num_rows(),
                                           (*sgemv_storage)->num_columns());
  } else {
    // NOTE: Because we don't have C++17's `if constexpr`, we need to add a
    // reinterpret_cast, so this code can compile when ElementType != float.
    uncompressed = reinterpret_cast<UniqueMatrix<float> *>(sgemv_storage);
  }
  // Copy to the uncompressed matrix. If ElementType == float, this is just
  // the output, otherwise it's the temporary array.
  MutableBlockedMatrix<float, BlockedMatrixFormat::kRowBlockedColumnMajor>
      uncompressed_matrix;
  TF_EXPECT_OK(uncompressed_matrix.Reset(
      uncompressed->area(), matrix.num_rows(), matrix.num_columns()));
  TF_EXPECT_OK(CopyMatrix(matrix, &uncompressed_matrix));
  if (RequiresCopy<ElementType>()) {
    // Convert each blocked vector into the compressed element type, then
    // release the staging matrix (only owned in this branch).
    for (int i = 0; i < blocked.num_vectors(); ++i) {
      ConvertRow<ElementType>(Vector<float>(uncompressed_matrix.vector(i)),
                              blocked.vector(i));
    }
    delete uncompressed;
  }
  SgemvMatrix<sse_batch_size, ElementType> sgemv_matrix;
  TF_EXPECT_OK(sgemv_matrix.Initialize(blocked.AsConst()));
  return sgemv_matrix;
}
// Fills |vector| with samples from a standard normal distribution.
void InitRandomVector(MutableVector<float> vector) {
  // clock() is updated less frequently than a cycle counter, so keep around
  // the RNG just in case we initialize some vectors in less than a clock tick.
  static std::mt19937 *rng = new std::mt19937(clock());
  std::normal_distribution<float> standard_normal(0, 1);
  for (float &value : vector) {
    value = standard_normal(*rng);
  }
}
// Fills |matrix| with samples from a standard normal distribution, drawing in
// row-major order (the same order GenerateMatrix visits elements).
void InitRandomMatrix(MutableMatrix<float> matrix) {
  // See InitRandomVector comment.
  static std::mt19937 *rng = new std::mt19937(clock());
  std::normal_distribution<float> standard_normal(0, 1);
  for (int row = 0; row < matrix.num_rows(); ++row) {
    for (int column = 0; column < matrix.num_columns(); ++column) {
      matrix.row(row)[column] = standard_normal(*rng);
    }
  }
}
// Verifies MaskedMatrixMultiVectorProductNoInitial() against a naive
// matrix-vector product: only the first |output_size| of the 32 output rows
// are written and compared, and no initial (bias) vector is read.
TEST(SgemvvTest, MatmulNoBias) {
  constexpr int sse_batch_size = 32;
  constexpr int num_rows = 32;
  constexpr int num_columns = 15;
  constexpr int output_size = 8;
  // Number of blocked vectors needed to hold the matrix data.
  constexpr int sgemv_views = num_rows * num_columns / sse_batch_size;
  static_assert(num_rows * num_columns % sse_batch_size == 0,
                "Bad matrix size");
  ASSERT_EQ(output_size % 8, 0) << "Output size must still be 32-byte aligned.";
  UniqueMatrix<float> matrix(num_rows, num_columns);
  UniqueMatrix<float> sgemv_storage(sgemv_views, sse_batch_size);
  UniqueVector<float> input_vector(num_columns);
  UniqueVector<float> output(num_rows);
  UniqueVector<float> expected(num_rows);
  // Random initialization for all variables/values.
  InitRandomMatrix(*matrix);
  InitRandomVector(*output);
  InitRandomVector(*expected);
  InitRandomVector(*input_vector);
  // Layout SGEMV matrix.
  SgemvMatrix<sse_batch_size> sgemv_matrix =
      ConvertToSgemv<sse_batch_size>(Matrix<float>(*matrix), &sgemv_storage);
  // SGEMV multiplication. The initial-vector slot is nullptr because the
  // NoInitial variant never dereferences it.
  SgemvInputBatch<1> inputs = {{input_vector->data()}, {nullptr}};
  SgemvOutputBatch<1> outputs = {{output->data()}};
  sgemv_matrix.MaskedMatrixMultiVectorProductNoInitial(inputs, output_size,
                                                       &outputs);
  // Naive algorithm.
  MultiplyMatrixAndVector<float>(Matrix<float>(*matrix),
                                 Vector<float>(*input_vector), *expected);
  // Check that results are equal.
  for (int i = 0; i < output_size; i++) {
    EXPECT_NEAR(output->data()[i], expected->data()[i], 1e-5);
  }
}
// Initializing an SgemvMatrix<32> from a blocked matrix whose block size is
// not 32 must fail, even when that block size is 32-byte (8-float) alignable.
TEST(SgemvvTest, ErrorsWithBadMultiple) {
  for (const int num_rows : {8, 16, 24}) {
    // Lay out a single-column blocked matrix whose block size is |num_rows|.
    UniqueMatrix<float> sgemv_storage(1, num_rows);
    MutableBlockedMatrix<float, BlockedMatrixFormat::kRowBlockedColumnMajor>
        blocked;
    TF_EXPECT_OK(blocked.Reset(sgemv_storage.area(), num_rows, 1));
    // The block size does not match sse_batch_size == 32, so this must error.
    SgemvMatrix<32> matrix;
    EXPECT_THAT(matrix.Initialize(blocked.AsConst()),
                test::IsErrorWithSubstr("must be equal to sse_batch_size"));
  }
}
// Returns a human-readable name for ElementType, used to label the benchmark
// output lines in RunPerformanceTest().
template <typename ElementType>
string TypenameString();
template <>
string TypenameString<float>() {
  return "float32";
}
template <>
string TypenameString<TruncatedFloat16>() {
  return "bfloat16";
}
#if defined(__F16C__)
template <>
string TypenameString<IeeeFloat16>() {
  return "float16";
}
#endif
// Accuracy tolerance for comparing a 128-row product computed in ElementType
// arithmetic against the float reference. RunPerformanceTest() scales this
// linearly by (num_rows / 128.0).
template <typename ElementType>
float ToleranceAt128();
template <>
float ToleranceAt128<float>() {
  return 1e-5;
}
// Truncated (bfloat16-style) storage keeps only 7 mantissa bits, hence the
// much looser tolerance.
template <>
float ToleranceAt128<TruncatedFloat16>() {
  return 1;
}
#if defined(__F16C__)
template <>
float ToleranceAt128<IeeeFloat16>() {
  return 1e-1;
}
#endif
// Benchmarks and accuracy-checks a `num_rows x num_cols` two-operation SGEMVV
// using ElementType matrix storage. Only the first |output_size| rows are
// written and compared; when output_size != num_rows the masked variant is
// exercised and the remaining rows must be left untouched. GFLOPS figures are
// reported via VLOG(0).
template <int sse_batch_size, int num_rows, int num_cols, typename ElementType>
void RunPerformanceTest(int output_size) {
  constexpr int sgemv_views = num_rows * num_cols / sse_batch_size;
  static_assert(num_rows * num_cols % sse_batch_size == 0, "Bad matrix size");
  ASSERT_EQ(output_size % 8, 0) << "Output size must still be 32-byte aligned.";
  UniqueMatrix<float> matrix(num_rows, num_cols);
  UniqueMatrix<ElementType> sgemv_storage(sgemv_views, sse_batch_size);
  UniqueVector<float> initial_1(num_rows);
  UniqueVector<float> initial_2(num_rows);
  UniqueVector<float> vector_1(num_cols);
  UniqueVector<float> vector_2(num_cols);
  UniqueVector<float> output_1(num_rows);
  UniqueVector<float> output_2(num_rows);
  UniqueVector<float> expected_output_1(num_rows);
  UniqueVector<float> expected_output_2(num_rows);
  UniqueVector<float> untouched_output_1(num_rows);
  UniqueVector<float> untouched_output_2(num_rows);
  // Random initialization for all variables/values.
  InitRandomMatrix(*matrix);
  InitRandomVector(*initial_1);
  InitRandomVector(*initial_2);
  InitRandomVector(*output_1);
  InitRandomVector(*output_2);
  InitRandomVector(*expected_output_1);
  InitRandomVector(*expected_output_2);
  InitRandomVector(*vector_1);
  InitRandomVector(*vector_2);
  // Snapshot the (randomized) outputs so we can verify that masked products
  // leave rows beyond |output_size| untouched.
  for (int i = 0; i < num_rows; i++) {
    (*untouched_output_1)[i] = (*output_1)[i];
    (*untouched_output_2)[i] = (*output_2)[i];
  }
  // Layout SGEMV matrix.
  SgemvMatrix<sse_batch_size, ElementType> sgemv_matrix =
      ConvertToSgemv<sse_batch_size, ElementType>(Matrix<float>(*matrix),
                                                  &sgemv_storage);
  naive_sgemv(*matrix, vector_1->data(), initial_1->data(),
              expected_output_1->data());
  naive_sgemv(*matrix, vector_2->data(), initial_2->data(),
              expected_output_2->data());
  // Scale the iteration count so each timed loop performs roughly 4e9 raw
  // floating-point operations regardless of matrix size.
  double raw_flops_per_iteration = 2.0 * 2.0 * num_rows * num_cols;
  const uint64 iterations =
      static_cast<uint64>(std::round(4e9 / raw_flops_per_iteration));
  auto start_time = std::chrono::system_clock::now();
  SgemvInputBatch<2> inputs = {
      {vector_1->data(), vector_2->data()},
      {initial_1->data(), initial_2->data()},
  };
  SgemvOutputBatch<2> outputs = {{output_1->data(), output_2->data()}};
  if (num_rows == output_size) {
    // Full output: use the unmasked product with prefetching disabled.
    for (int iter = 0; iter < iterations; iter++) {
      sgemv_matrix.template MatrixMultiVectorProduct<2, 0, 0>(inputs, &outputs);
    }
  } else {
    for (int iter = 0; iter < iterations; iter++) {
      sgemv_matrix.template MaskedMatrixMultiVectorProduct<2>(
          inputs, output_size, &outputs);
    }
  }
  auto end_time = std::chrono::system_clock::now();
  std::chrono::duration<double> elapsed_seconds = end_time - start_time;
  double elapsed = elapsed_seconds.count();
  // Each MatrixMultiVectorProduct does 2 Matrix-vector ops, and each op does a
  // multiply and an add (2 floating-point operations) for each entry in the
  // matrix. "Effective" GFLOPS counts only the |output_size| rows actually
  // produced; "raw" (reported for masked runs) counts all num_rows rows.
  string raw_gflops = "";
  if (num_rows != output_size) {
    raw_gflops = ::tensorflow::strings::StrCat(
        ", ", raw_flops_per_iteration * iterations / 1e9 / elapsed, " raw");
  }
  VLOG(0) << " ElementType " << TypenameString<ElementType>() << " GFLOPS: "
          << (2.0 * 2.0 * output_size * num_cols * iterations) / 1e9 / elapsed
          << " effective" << raw_gflops;
  // Tolerance grows linearly with the number of accumulated rows.
  const float tolerance =
      ToleranceAt128<ElementType>() * (num_rows / 128.0) + 1e-5;
  for (int i = 0; i < output_size; i++) {
    EXPECT_NEAR(output_1->data()[i], expected_output_1->data()[i], tolerance);
    EXPECT_NEAR(output_2->data()[i], expected_output_2->data()[i], tolerance);
  }
  // Check that any non-output items are untouched.
  for (int i = output_size; i < num_rows; i++) {
    EXPECT_EQ((*output_1)[i], (*untouched_output_1)[i]);
    EXPECT_EQ((*output_2)[i], (*untouched_output_2)[i]);
  }
}
// Runs RunPerformanceTest() over several matrix shapes, batch sizes, and
// element types, checking accuracy along the way.
TEST(SgemvvTest, PerformanceAndAccuracyTest) {
  // Benchmarking is hard. Sometimes results vary between test runs, or are just
  // unreliable. This could be in part from CPU frequency scaling, and also how
  // favorably the memory allocator places data (coherence, etc.), so each
  // configuration is run several times.
  constexpr int kNumBatches = 3;
  VLOG(0) << "64x64 32-batch-size test";
  for (int batch = 0; batch < kNumBatches; ++batch) {
    RunPerformanceTest<32, 64, 64, float>(64);
#if defined(__F16C__)
    RunPerformanceTest<32, 64, 64, IeeeFloat16>(64);
#endif
  }
  VLOG(0) << "128x128 32-batch-size test";
  for (int batch = 0; batch < kNumBatches; ++batch) {
    RunPerformanceTest<32, 128, 128, float>(128);
  }
  VLOG(0) << "256x256 32-batch-size test";
  for (int batch = 0; batch < kNumBatches; ++batch) {
    RunPerformanceTest<32, 256, 256, float>(256);
#if defined(__F16C__)
    RunPerformanceTest<32, 256, 256, IeeeFloat16>(256);
#endif
    RunPerformanceTest<32, 256, 256, TruncatedFloat16>(256);
  }
  VLOG(0) << "96x96 48-batch-size test";
  for (int batch = 0; batch < kNumBatches; ++batch) {
    RunPerformanceTest<48, 96, 96, float>(96);
  }
  VLOG(0) << "48x96 48-batch-size test";
  for (int batch = 0; batch < kNumBatches; ++batch) {
    RunPerformanceTest<48, 48, 96, float>(48);
  }
  VLOG(0) << "40x96 48-batch-size test";
  for (int batch = 0; batch < kNumBatches; ++batch) {
    // 48x96 matrix, but only the first 40 output rows are written (masked).
    RunPerformanceTest<48, 48, 96, float>(40);
  }
  // These larger matrices are about the same amount of computation as one
  // 96-dimensional LSTM cell (without output softmax).
  VLOG(0) << "480x96 48-batch-size test";
  for (int batch = 0; batch < kNumBatches; ++batch) {
    RunPerformanceTest<48, 480, 96, float>(480);
#if defined(__F16C__)
    RunPerformanceTest<48, 480, 96, IeeeFloat16>(480);
#endif
    RunPerformanceTest<48, 480, 96, TruncatedFloat16>(480);
  }
  VLOG(0) << "472x96 48-batch-size test";
  for (int batch = 0; batch < kNumBatches; ++batch) {
    // 480-row matrix with a masked 472-row output.
    RunPerformanceTest<48, 480, 96, float>(472);
#if defined(__F16C__)
    RunPerformanceTest<48, 480, 96, IeeeFloat16>(472);
#endif
    RunPerformanceTest<48, 480, 96, TruncatedFloat16>(472);
  }
}
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
// Utility functions that can transform different matrix types. This includes
// non-trivial transposes, and converting vectors/etc. to the matrix types. This
// library should NOT be used for any performance-critical work, and should NOT
// be included at all in the mobile runtime.
#ifndef DRAGNN_RUNTIME_MATH_TRANSFORMATIONS_H_
#define DRAGNN_RUNTIME_MATH_TRANSFORMATIONS_H_
#include "dragnn/runtime/math/types.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
namespace internal {
// Puts a format-agnostic API on matrix-like data types. This is convenient, but
// has the downside of potential confusing compiler errors (when a
// specialization does not exist), and isn't suitable for optimizations like
// blocked transformations.
//
// Each overload maps a logical (row, col) coordinate to the element's storage
// location, for plain row-major matrices and for both blocked formats. The
// overloads are found from GenerateMatrix() via argument-dependent lookup on
// the matrix argument's type.
// Plain row-major matrix, mutable access.
template <class T>
T *GetMatrixElement(int row, int col, MatrixImpl<T> *matrix) {
  return &matrix->row(row)[col];
}
// Plain row-major matrix, read-only access.
template <class T>
const T &GetMatrixElement(int row, int col, const MatrixImpl<T> &matrix) {
  return matrix.row(row)[col];
}
// Row-blocked column-major: rows are grouped into blocks of block_size()
// rows; within block `row / block_size()`, column `col` is stored in vector
// `sub_matrix_idx * num_columns() + col` at offset `row % block_size()`.
template <class T>
T *GetMatrixElement(
    int row, int col,
    BlockedMatrixImpl<T, BlockedMatrixFormat::kRowBlockedColumnMajor> *matrix) {
  int sub_matrix_idx = row / matrix->block_size();
  int vector_idx = sub_matrix_idx * matrix->num_columns() + col;
  int element_idx = row % matrix->block_size();
  return &matrix->vector(vector_idx)[element_idx];
}
// Read-only variant of the above.
template <class T>
const T &GetMatrixElement(
    int row, int col,
    const BlockedMatrixImpl<T, BlockedMatrixFormat::kRowBlockedColumnMajor>
        &matrix) {
  int sub_matrix_idx = row / matrix.block_size();
  int vector_idx = sub_matrix_idx * matrix.num_columns() + col;
  int element_idx = row % matrix.block_size();
  return matrix.vector(vector_idx)[element_idx];
}
// Column-blocked row-major: the transpose layout, indexed symmetrically
// (columns grouped into blocks, row-major within each block).
template <class T>
T *GetMatrixElement(
    int row, int col,
    BlockedMatrixImpl<T, BlockedMatrixFormat::kColumnBlockedRowMajor> *matrix) {
  int sub_matrix_idx = col / matrix->block_size();
  int vector_idx = sub_matrix_idx * matrix->num_rows() + row;
  int element_idx = col % matrix->block_size();
  return &matrix->vector(vector_idx)[element_idx];
}
// Read-only variant of the above.
template <class T>
const T &GetMatrixElement(
    int row, int col,
    const BlockedMatrixImpl<T, BlockedMatrixFormat::kColumnBlockedRowMajor>
        &matrix) {
  int sub_matrix_idx = col / matrix.block_size();
  int vector_idx = sub_matrix_idx * matrix.num_rows() + row;
  int element_idx = col % matrix.block_size();
  return matrix.vector(vector_idx)[element_idx];
}
}  // namespace internal
// Generates values for a matrix, by calling a provided function on each
// row/column index. Thanks to the magic of templating, the function call should
// be inlined and not cause too much overhead being "called" on each index.
//
// |get_value| is invoked as `get_value(row, column)` in row-major order, and
// its result is assigned to the corresponding element of |output_matrix| via
// the GetMatrixElement() overloads above.
template <class Function, class OutputMatrix>
void GenerateMatrix(int num_rows, int num_columns, const Function &get_value,
                    OutputMatrix *output_matrix) {
  // Use int indices to match the int dimension parameters (and the int-typed
  // GetMatrixElement API); the previous size_t indices mixed signed and
  // unsigned comparison and would loop out of bounds if a negative dimension
  // were ever passed.
  for (int row = 0; row < num_rows; ++row) {
    for (int column = 0; column < num_columns; ++column) {
      *(GetMatrixElement(row, column, output_matrix)) = get_value(row, column);
    }
  }
}
// Copies the first |num_rows| rows and |num_columns| columns of input_matrix to
// output_matrix.
template <class InputMatrix, class OutputMatrix>
void CopyMatrixPrefix(const InputMatrix &input_matrix, int num_rows,
                      int num_columns, OutputMatrix *output_matrix) {
  // Capture by reference: the lambda is consumed synchronously by
  // GenerateMatrix() below, and capturing by value needlessly copied the
  // whole |input_matrix| object.
  const auto &get_value = [&input_matrix](int row, int column) {
    return GetMatrixElement(row, column, input_matrix);
  };
  GenerateMatrix(num_rows, num_columns, get_value, output_matrix);
}
// Copies matrices. The matrices can be of different types, but must have the
// same dimensions; dimension mismatches yield an InvalidArgument error.
template <class InputMatrix, class OutputMatrix>
tensorflow::Status CopyMatrix(const InputMatrix &input_matrix,
                              OutputMatrix *output_matrix) {
  const auto rows = input_matrix.num_rows();
  const auto columns = input_matrix.num_columns();
  if (rows != output_matrix->num_rows()) {
    return tensorflow::errors::InvalidArgument(
        "Input matrix num_rows (", rows,
        ") != output matrix num_rows (", output_matrix->num_rows(), ")");
  }
  if (columns != output_matrix->num_columns()) {
    return tensorflow::errors::InvalidArgument(
        "Input matrix num_columns (", columns,
        ") != output matrix num_columns (", output_matrix->num_columns(), ")");
  }
  CopyMatrixPrefix(input_matrix, rows, columns, output_matrix);
  return tensorflow::Status::OK();
}
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
#endif // DRAGNN_RUNTIME_MATH_TRANSFORMATIONS_H_
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/runtime/math/transformations.h"
#include "dragnn/runtime/test/helpers.h"
#include <gmock/gmock.h>
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/platform/test.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
namespace {
// Generates a matrix where each value is of the form `aa.bb`, where `aa` is
// the row index and `bb` is the column index, and checks the result.
TEST(TransformationsTest, GenerateRowColIdxMatrix) {
  UniqueMatrix<float> generated(5, 5);
  const auto encode_indices = [](int row, int col) {
    return static_cast<float>(row) + (col / 100.0f);
  };
  GenerateMatrix(5, 5, encode_indices, generated.get());
  ExpectMatrix(Matrix<float>(*generated),
               {{0.0f, 0.01f, 0.02f, 0.03f, 0.04f},
                {1.0f, 1.01f, 1.02f, 1.03f, 1.04f},
                {2.0f, 2.01f, 2.02f, 2.03f, 2.04f},
                {3.0f, 3.01f, 3.02f, 3.03f, 3.04f},
                {4.0f, 4.01f, 4.02f, 4.03f, 4.04f}});
}
// CopyMatrix between two plain (unblocked) matrices of equal shape simply
// overwrites the destination's contents.
TEST(TransformationsTest, CopiesMatrix) {
  UniqueMatrix<float> source({{1, 2}}), destination({{3, 4}});
  TF_EXPECT_OK(CopyMatrix(*source, destination.get()));
  EXPECT_EQ(destination->row(0)[0], 1);
  EXPECT_EQ(destination->row(0)[1], 2);
}
// Copies an 8x3 row-major matrix into a row-blocked column-major matrix and
// checks the raw backing storage. With 24 elements in a 6x4 backing matrix,
// the block size is 4: each group of 4 source rows becomes one column-major
// sub-matrix, and the sub-matrices are concatenated.
TEST(TransformationsTest, CopiesRowBlockedMatrix) {
  UniqueMatrix<double> source({{1, 2, 3},     //
                               {4, 5, 6},     //
                               {7, 8, 9},     //
                               {10, 11, 12},  //
                               {13, 14, 15},  //
                               {16, 17, 18},  //
                               {19, 20, 21},  //
                               {22, 23, 24}});
  UniqueMatrix<double> dst_mem(6, 4);
  MutableBlockedMatrix<double, BlockedMatrixFormat::kRowBlockedColumnMajor>
      blocked;
  TF_EXPECT_OK(blocked.Reset(dst_mem.area(), 8, 3));
  TF_EXPECT_OK(CopyMatrix(*source, &blocked));
  // Each backing row below is one column of a 4x3 sub-matrix.
  ExpectMatrix(Matrix<double>(*dst_mem), {{1, 4, 7, 10},    //
                                          {2, 5, 8, 11},    //
                                          {3, 6, 9, 12},    //
                                          {13, 16, 19, 22},  //
                                          {14, 17, 20, 23},  //
                                          {15, 18, 21, 24}});
}
// This test is the same as the above, except everything is transposed: a 3x8
// source is copied into a MutableBlockedMatrix declared without an explicit
// format (so it uses the default, which — per the expected layout below — is
// the column-blocked row-major format), and the raw storage must come out
// identical to the row-blocked case.
TEST(TransformationsTest, CopiesColumnBlockedMatrix) {
  UniqueMatrix<double> source(  //
      {{1, 4, 7, 10, 13, 16, 19, 22},  //
       {2, 5, 8, 11, 14, 17, 20, 23},  //
       {3, 6, 9, 12, 15, 18, 21, 24}});
  UniqueMatrix<double> dst_mem(6, 4);
  MutableBlockedMatrix<double> blocked;
  TF_EXPECT_OK(blocked.Reset(dst_mem.area(), 3, 8));
  TF_EXPECT_OK(CopyMatrix(*source, &blocked));
  ExpectMatrix(Matrix<double>(*dst_mem), {{1, 4, 7, 10},    //
                                          {2, 5, 8, 11},    //
                                          {3, 6, 9, 12},    //
                                          {13, 16, 19, 22},  //
                                          {14, 17, 20, 23},  //
                                          {15, 18, 21, 24}});
}
} // namespace
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
// Mathematical types.
#ifndef DRAGNN_RUNTIME_MATH_TYPES_H_
#define DRAGNN_RUNTIME_MATH_TYPES_H_
#include <stddef.h>
#include <limits>
#include "dragnn/runtime/alignment.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/logging.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
// Blocked matrix formats, for fast inference routines.
//
// The enum only selects which dimension is blocked and the within-block
// element order; the block size itself is a runtime property of the blocked
// matrix types (e.g. block_size() on the BlockedMatrixImpl types below).
enum class BlockedMatrixFormat {
  // Represents a row-blocked block-column-major matrix. In other words, first
  // split a matrix M into
  //
  //   [ M_1
  //     ...
  //     M_m ]
  //
  // sub-matrices, where each M_i is a `block_size x n` sub-matrix. Then each
  // M_i is formatted in column-major order, and the sub-matrices' data is
  // concatenated together.
  kRowBlockedColumnMajor,
  // Represents a column-blocked block-row-major matrix. This is the
  // transpose of the above. A matrix M is split into
  //
  //   [ M_1 ... M_n ]
  //
  // sub-matrices, where each M_i is a `m x block_size` sub-matrix. Then each
  // M_i is formatted in row-major order, and the sub-matrices' data is
  // concatenated together.
  kColumnBlockedRowMajor,
};
namespace internal {
// An aligned vector of values. Do not use this class directly, instead use
// (Mutable)Vector below.
//
// NOTE: this is a non-owning view — it holds only a raw pointer and size, so
// the underlying storage must outlive the VectorImpl.
template <class T>
class VectorImpl {
 public:
  static_assert(IsAlignable<T>(), "T must be alignable");
  // Creates an empty vector.
  VectorImpl() = default;
  // Points this at the |view|, which must be evenly divisible into Ts.
  template <class Byte>
  explicit VectorImpl(AlignedViewImpl<Byte> view);
  // Points this at a prefix of the |view| containing |size| Ts. The |view|
  // must span at least |size| * sizeof(T) bytes.
  template <class Byte>
  VectorImpl(AlignedViewImpl<Byte> view, size_t size);
  // Points this at the same values as |that|, possibly reinterpreting type.
  template <class U>
  explicit VectorImpl(VectorImpl<U> that);
  template <class U>
  VectorImpl &operator=(VectorImpl<U> that);
  // Enables range-based for loops.
  T *begin() const { return data(); }
  T *end() const { return begin() + size(); }
  // Accessors.
  T *data() const { return data_; }
  size_t size() const { return size_; }
  bool empty() const { return size() == 0; }
  T &operator[](size_t index) const;
  // Gets a sub-vector starting at |start| with |size| elements.
  VectorImpl<T> Subsequence(size_t start, size_t size) const;
 private:
  // Matrix types build row/sub-vectors directly via the private unchecked
  // constructor below.
  template <class U>
  friend class MatrixImpl;
  template <class U, BlockedMatrixFormat format>
  friend class BlockedMatrixImpl;
  // Points this at [|data|,|data|+|size|), bypassing alignment checks.
  VectorImpl(T *data, size_t size);
  // Pointer to the start of the vector.
  T *data_ = nullptr;
  // Number of values in the vector.
  size_t size_ = 0;
};
// Returns the format corresponding to the transpose of the |format|.
// Constexpr so it can be used in template arguments (see Transpose() below).
constexpr BlockedMatrixFormat TransposeBlockedMatrixFormat(
    BlockedMatrixFormat format);
// A row-major matrix where each row or column is aligned. Do not use this
// class directly, instead use (Mutable)Matrix below.
//
// Like VectorImpl, this is a non-owning view. Rows may be padded: the
// distance between consecutive rows is |row_stride_|, which can exceed
// |num_columns_|.
template <class T>
class MatrixImpl {
 public:
  static_assert(IsAlignable<T>(), "T must be alignable");

  // Creates an empty matrix.
  MatrixImpl() = default;

  // Points each row of this matrix at the corresponding sub-view of the |area|.
  // Each view in the |area| must be evenly divisible into Ts.
  template <class Byte>
  explicit MatrixImpl(AlignedAreaImpl<Byte> area);

  // Creates a matrix from a single vector. Assumes that the vector's stride is
  // the minimum alignment padding.
  explicit MatrixImpl(VectorImpl<T> single_vector);

  // Points this at the same values as |that|.
  template <class U>
  explicit MatrixImpl(MatrixImpl<U> that);
  template <class U>
  MatrixImpl &operator=(MatrixImpl<U> that);

  // Accessors.
  T *data() const { return data_; }
  size_t num_rows() const { return num_rows_; }
  size_t num_columns() const { return num_columns_; }
  size_t row_stride() const { return row_stride_; }
  VectorImpl<T> row(size_t index) const;

 private:
  template <class U>
  friend class MatrixImpl;

  // Pointer to the start of the matrix.
  T *data_ = nullptr;

  // Number of rows and columns in the matrix.
  size_t num_rows_ = 0;
  size_t num_columns_ = 0;

  // Distance between the starts of consecutive rows, in Ts (not bytes).
  size_t row_stride_ = 0;
};
// Blocked matrix representation. See BlockedMatrixFormat for details.
// Non-owning view, like the other types in this header.
template <class T, BlockedMatrixFormat format>
class BlockedMatrixImpl {
 public:
  static_assert(IsAlignable<T>(), "T must be alignable");

  // These aliases allow templated code to reach back in and get template
  // parameters, like std::vector<T>::iterator::value aliases.
  using ElementType = T;
  static constexpr bool IsRowBlocked() {
    return format == BlockedMatrixFormat::kRowBlockedColumnMajor;
  }

  // Creates an empty matrix.
  BlockedMatrixImpl() = default;

  // Creates a copy of this matrix, using the same values (underlying area), but
  // possibly re-interpreting the type. The new type U must be the same size,
  // and `T *` must be implicitly convertible to `U *` (usually just adding
  // "const" qualifiers, but theoretically it could be a superclass).
  template <class U>
  explicit BlockedMatrixImpl(BlockedMatrixImpl<U, format> that);
  template <class U>
  BlockedMatrixImpl &operator=(BlockedMatrixImpl<U, format> that);

  // Creates a new view that's const-qualified, in particular converting
  // MutableBlockedMatrix to BlockedMatrix.
  BlockedMatrixImpl<const T, format> AsConst() const {
    return BlockedMatrixImpl<const T, format>(*this);
  }

  // Initializes the matrix. Raises errors if the matrix dimensions are
  // incompatible with the underlying area, namely if the number of views in
  // `area` do not cover the whole matrix, and also if the matrix cannot be
  // blocked according to (template parameter) `format`.
  //
  // Further, because this class is used for (delicate / specialized) optimized
  // inference routines, it is also required that no padding is present, i.e.
  // that the block size is divisible by kAlignmentBytes (currently 32).
  template <class Byte>
  tensorflow::Status Reset(AlignedAreaImpl<Byte> area, size_t num_rows,
                           size_t num_columns);

  // Returns the transpose of this.
  BlockedMatrixImpl<T, TransposeBlockedMatrixFormat(format)> Transpose() const;

  // Accessors.
  size_t num_rows() const { return num_rows_; }
  size_t num_columns() const { return num_columns_; }
  size_t block_size() const { return block_size_; }
  size_t num_vectors() const { return num_vectors_; }
  VectorImpl<T> vector(size_t index) const;

 private:
  template <class U, BlockedMatrixFormat other_format>
  friend class BlockedMatrixImpl;

  // This is the same as calling Reset(), except the area is not checked.
  template <class Byte>
  explicit BlockedMatrixImpl(AlignedAreaImpl<Byte> area, int num_rows,
                             int num_columns);

  // Pointer to the start of the matrix.
  T *data_ = nullptr;

  // Number of rows and columns in the matrix. Unlike MatrixImpl, there is no
  // API for directly accessing rows and columns, but it's necessary for any
  // algorithm (e.g. matrix-vector multiplication) to know the logical shape.
  size_t num_rows_ = 0;
  size_t num_columns_ = 0;
  size_t block_size_ = 0;   // in T's
  size_t num_vectors_ = 0;  // = num_rows * num_columns / block_size
};
} // namespace internal
// Public aliases; use these. The plain variants are read-only views (const
// element type); the Mutable* variants allow writes. All are non-owning.
template <class T>
using Vector = internal::VectorImpl<const T>;
template <class T>
using Matrix = internal::MatrixImpl<const T>;
template <class T, BlockedMatrixFormat format =
                       BlockedMatrixFormat::kColumnBlockedRowMajor>
using BlockedMatrix = internal::BlockedMatrixImpl<const T, format>;
template <class T>
using MutableVector = internal::VectorImpl<T>;
template <class T>
using MutableMatrix = internal::MatrixImpl<T>;
template <class T, BlockedMatrixFormat format =
                       BlockedMatrixFormat::kColumnBlockedRowMajor>
using MutableBlockedMatrix = internal::BlockedMatrixImpl<T, format>;
// Implementation details below.
namespace internal {
// Reinterprets the bytes of the entire |view| as Ts. Debug builds check that
// the view divides evenly into Ts; the truncating division is exact then.
template <class T>
template <class Byte>
VectorImpl<T>::VectorImpl(AlignedViewImpl<Byte> view)
    : data_(reinterpret_cast<T *>(view.data())),
      size_(view.size() / sizeof(T)) {
  DCHECK_EQ(view.size() % sizeof(T), 0);
}
// Takes a |size|-element prefix of the |view|. Debug builds check that the
// view is large enough to hold the prefix.
template <class T>
template <class Byte>
VectorImpl<T>::VectorImpl(AlignedViewImpl<Byte> view, size_t size)
    : data_(reinterpret_cast<T *>(view.data())), size_(size) {
  DCHECK_LE(size * sizeof(T), view.size());
}
// Type-reinterpreting copy constructor; typically converts T* to const T*.
// Requires equal sizes so the element count stays meaningful.
template <class T>
template <class U>
VectorImpl<T>::VectorImpl(VectorImpl<U> that)
    : data_(that.data()), size_(that.size()) {
  static_assert(sizeof(T) == sizeof(U), "T and U must be the same size");
}
// Type-reinterpreting assignment; shallow-copies the pointer and size.
template <class T>
template <class U>
VectorImpl<T> &VectorImpl<T>::operator=(VectorImpl<U> that) {
  static_assert(sizeof(T) == sizeof(U), "T and U must be the same size");
  data_ = that.data();
  size_ = that.size();
  return *this;
}
// Element access; bounds-checked in debug builds only.
template <class T>
T &VectorImpl<T>::operator[](size_t index) const {
  DCHECK_LT(index, size());
  return data_[index];
}
// Private raw-pointer constructor used by the matrix types; still verifies
// alignment of |data| in debug builds.
template <class T>
VectorImpl<T>::VectorImpl(T *data, size_t size) : data_(data), size_(size) {
  TF_DCHECK_OK(OkIfAligned(data));
}
// Returns the sub-vector [|start|, |start| + |size|). Debug builds check that
// the range lies within this vector. Note that the result is only created via
// the alignment-checking private constructor, so |start| must also preserve
// alignment in debug builds.
template <class T>
VectorImpl<T> VectorImpl<T>::Subsequence(size_t start, size_t size) const {
  DCHECK_LE(start + size, size_);
  return VectorImpl<T>(&data_[start], size);
}
// Swaps the two blocked formats: row-blocked <-> column-blocked.
constexpr BlockedMatrixFormat TransposeBlockedMatrixFormat(
    BlockedMatrixFormat format) {
  return format != BlockedMatrixFormat::kRowBlockedColumnMajor
             ? BlockedMatrixFormat::kRowBlockedColumnMajor
             : BlockedMatrixFormat::kColumnBlockedRowMajor;
}
// Wraps a single vector as a 1-row matrix. The row stride is the vector size
// rounded up to the alignment boundary (in Ts), matching the minimum padding
// an aligned area would use.
template <class T>
MatrixImpl<T>::MatrixImpl(VectorImpl<T> single_vector)
    : data_(single_vector.data()),
      num_rows_(1),
      num_columns_(single_vector.size()),
      row_stride_(PadToAlignment(single_vector.size() * sizeof(T)) /
                  sizeof(T)) {}
// Wraps an aligned area as a matrix: one row per view. Debug builds check
// that both the view size and stride divide evenly into Ts.
template <class T>
template <class Byte>
MatrixImpl<T>::MatrixImpl(AlignedAreaImpl<Byte> area)
    : data_(reinterpret_cast<T *>(area.data())),
      num_rows_(area.num_views()),
      num_columns_(area.view_size() / sizeof(T)),
      row_stride_(area.view_stride() / sizeof(T)) {
  DCHECK_EQ(area.view_size() % sizeof(T), 0);
  DCHECK_EQ(area.view_stride() % sizeof(T), 0);
}
// Type-reinterpreting copy constructor; typically converts T* to const T*.
template <class T>
template <class U>
MatrixImpl<T>::MatrixImpl(MatrixImpl<U> that)
    : data_(that.data_),
      num_rows_(that.num_rows()),
      num_columns_(that.num_columns()),
      row_stride_(that.row_stride_) {
  static_assert(sizeof(T) == sizeof(U), "T and U must be the same size");
}
// Type-reinterpreting assignment; shallow-copies pointer and dimensions.
template <class T>
template <class U>
MatrixImpl<T> &MatrixImpl<T>::operator=(MatrixImpl<U> that) {
  static_assert(sizeof(T) == sizeof(U), "T and U must be the same size");
  data_ = that.data_;
  num_rows_ = that.num_rows();
  num_columns_ = that.num_columns();
  row_stride_ = that.row_stride_;
  return *this;
}
// Returns a vector view of row |index|; bounds-checked in debug builds.
template <class T>
VectorImpl<T> MatrixImpl<T>::row(size_t index) const {
  DCHECK_LT(index, num_rows());
  // Note that |row_stride_|, not |num_columns_|, determines the start of the
  // row. The former is aligned and may stride over a wider span than normal
  // when this is a "slice" of a larger matrix.
  return VectorImpl<T>(data_ + row_stride_ * index, num_columns());
}
// Type-reinterpreting copy constructor; typically converts T* to const T*.
// The format must match; only the element type may be reinterpreted.
template <class T, BlockedMatrixFormat format>
template <class U>
BlockedMatrixImpl<T, format>::BlockedMatrixImpl(
    BlockedMatrixImpl<U, format> that)
    : data_(that.data_),
      num_rows_(that.num_rows()),
      num_columns_(that.num_columns()),
      block_size_(that.block_size()),
      num_vectors_(that.num_vectors()) {
  static_assert(sizeof(T) == sizeof(U), "T and U must be the same size");
}
// Type-reinterpreting assignment; shallow-copies pointer and dimensions.
template <class T, BlockedMatrixFormat format>
template <class U>
BlockedMatrixImpl<T, format> &BlockedMatrixImpl<T, format>::operator=(
    BlockedMatrixImpl<U, format> that) {
  static_assert(sizeof(T) == sizeof(U), "T and U must be the same size");
  data_ = that.data_;
  num_rows_ = that.num_rows();
  num_columns_ = that.num_columns();
  block_size_ = that.block_size();
  num_vectors_ = that.num_vectors();
  return *this;
}
// Points this matrix at the |area| with the given logical dimensions.
//
// All validation happens before any member is modified, so |this| is left
// unchanged when an error is returned. (The previous version assigned members
// first and could also divide by zero when the view was smaller than one T.)
template <class T, BlockedMatrixFormat format>
template <class Byte>
tensorflow::Status BlockedMatrixImpl<T, format>::Reset(
    AlignedAreaImpl<Byte> area, size_t num_rows, size_t num_columns) {
  // The optimized inference routines built on this class require that blocks
  // are packed back-to-back with no alignment padding between them.
  if (area.view_stride() != area.view_size()) {
    return tensorflow::errors::InvalidArgument(
        "Padding is not supported for blocked matrix formats. Underlying area "
        "has size ",
        area.view_size(), " which is padded to stride ", area.view_stride(),
        ".");
  }
  if (area.view_size() % sizeof(T) != 0) {
    return tensorflow::errors::InvalidArgument(
        "View size ", area.view_size(),
        " is not a multiple of the templated type's size, ", sizeof(T));
  }
  const size_t block_size = area.view_size() / sizeof(T);
  if (block_size == 0) {
    // Guards the divisions below against division by zero.
    return tensorflow::errors::InvalidArgument(
        "View size ", area.view_size(), " yields an empty block");
  }
  const size_t num_vectors = num_rows * num_columns / block_size;
  if (num_vectors != area.num_views()) {
    return tensorflow::errors::InvalidArgument("Area has ", area.num_views(),
                                               " views, but should have ",
                                               num_vectors);
  }
  // The block dimension must divide rows or columns evenly.
  const size_t divided_dimension = IsRowBlocked() ? num_rows : num_columns;
  if (divided_dimension % block_size != 0) {
    return tensorflow::errors::InvalidArgument(
        IsRowBlocked() ? "row" : "column",
        "-blocked matrix has major dimension ", divided_dimension,
        " which is not divisible by the block size, ", block_size);
  }

  // Success; commit the new configuration.
  data_ = reinterpret_cast<T *>(area.view(0).data());
  num_rows_ = num_rows;
  num_columns_ = num_columns;
  block_size_ = block_size;
  num_vectors_ = num_vectors;
  return tensorflow::Status::OK();
}
// Returns the |index|-th block vector; blocks are laid out back-to-back (no
// padding; see Reset()), so the offset is simply block_size_ * index.
// Bounds-checked in debug builds.
template <class T, BlockedMatrixFormat format>
VectorImpl<T> BlockedMatrixImpl<T, format>::vector(size_t index) const {
  DCHECK_LT(index, num_vectors_);
  return VectorImpl<T>(data_ + block_size_ * index, block_size_);
}
// Returns the transpose as a view over the same data: the format flips (see
// TransposeBlockedMatrixFormat) and rows/columns swap, while the underlying
// blocks are untouched. No data is copied.
template <class T, BlockedMatrixFormat format>
BlockedMatrixImpl<T, TransposeBlockedMatrixFormat(format)>
BlockedMatrixImpl<T, format>::Transpose() const {
  BlockedMatrixImpl<T, TransposeBlockedMatrixFormat(format)> result;
  result.data_ = data_;
  result.num_columns_ = num_rows_;
  result.num_rows_ = num_columns_;
  result.block_size_ = block_size_;
  result.num_vectors_ = num_vectors_;
  return result;
}
} // namespace internal
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
#endif // DRAGNN_RUNTIME_MATH_TYPES_H_
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/runtime/math/types.h"
#include <stddef.h>
#include <string.h>
#include <set>
#include "dragnn/core/test/generic.h"
#include "dragnn/runtime/alignment.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/platform/test.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
namespace {
// Creates an aligned pointer that is invalid. This is useful for creating
// proxy areas for testing, whose real data should never be accessed. We
// manually tested that if this pointer is dereferenced, a segmentation fault
// will be thrown.
char *InvalidAlignedPointer() {
  return reinterpret_cast<char *>(3 * internal::kAlignmentBytes);
}
// Expects that two pointers point to the same address. Taking void* lets
// callers compare pointers of unrelated element types.
void ExpectSameAddress(const void *ptr1, const void *ptr2) {
  EXPECT_EQ(ptr1, ptr2);
}
// Returns true if |a| and |b| are byte-wise identical. Used to verify the
// shallow-copy semantics of the view types, whose members are plain pointers
// and sizes.
template <class A, class B>
bool StructsEqual(const A &a, const B &b) {
  static_assert(sizeof(A) == sizeof(B),
                "StructsEqual must be given structs of the same size.");
  return memcmp(&a, &b, sizeof(A)) == 0;
}
// Tests that both vector flavors are empty when default-constructed.
TEST(VectorTest, EmptyByDefault) {
  const Vector<int> readonly_vector;
  const MutableVector<int> writable_vector;
  EXPECT_TRUE(readonly_vector.empty());
  EXPECT_EQ(readonly_vector.size(), 0);
  EXPECT_TRUE(writable_vector.empty());
  EXPECT_EQ(writable_vector.size(), 0);
}
// Tests that both vector flavors can wrap an entire aligned view.
TEST(VectorTest, ConstructFromView) {
  char *base = InvalidAlignedPointer();
  MutableAlignedView view;
  TF_ASSERT_OK(view.Reset(base, 10 * sizeof(int)));

  const Vector<int> readonly_vector(view);
  EXPECT_EQ(readonly_vector.size(), 10);
  EXPECT_FALSE(readonly_vector.empty());
  ExpectSameAddress(readonly_vector.data(), base);

  const MutableVector<int> writable_vector(view);
  EXPECT_EQ(writable_vector.size(), 10);
  EXPECT_FALSE(writable_vector.empty());
  ExpectSameAddress(writable_vector.data(), base);
}
// Tests that (Mutable)Vector can be initialized from a prefix of a view.
// The view holds 10 ints; the vectors deliberately use fewer.
TEST(VectorTest, ConstructFromViewPrefix) {
  MutableAlignedView view;
  char *ptr = InvalidAlignedPointer();
  TF_ASSERT_OK(view.Reset(ptr, 10 * sizeof(int)));

  // Use a prefix of 3 of the 10 available ints in the |view|.
  const Vector<int> vector1(view, 3);
  ExpectSameAddress(vector1.data(), ptr);
  EXPECT_EQ(vector1.size(), 3);
  EXPECT_FALSE(vector1.empty());

  // Use a prefix of 5 of the 10 available ints in the |view|.
  const MutableVector<int> vector2(view, 5);
  ExpectSameAddress(vector2.data(), ptr);
  EXPECT_EQ(vector2.size(), 5);
  EXPECT_FALSE(vector2.empty());
}
// Tests that (Mutable)Vector supports copy-construction and assignment with
// shallow-copy semantics, and reinterprets from T* to const T*. All copies
// alias the same underlying storage; no element data is touched.
TEST(VectorTest, CopyAndAssign) {
  MutableAlignedView view;
  char *ptr = InvalidAlignedPointer();
  TF_ASSERT_OK(view.Reset(ptr, 10 * sizeof(int)));
  const MutableVector<int> vector1(view);

  // Copy-construct from another vector.
  MutableVector<int> vector2(vector1);
  ExpectSameAddress(vector2.data(), ptr);
  EXPECT_EQ(vector2.size(), 10);
  EXPECT_FALSE(vector2.empty());

  // Assign from an empty vector, effectively clearing it.
  vector2 = MutableVector<int>();
  EXPECT_EQ(vector2.size(), 0);
  EXPECT_TRUE(vector2.empty());

  // Assign from the original vector.
  vector2 = vector1;
  ExpectSameAddress(vector2.data(), ptr);
  EXPECT_EQ(vector2.size(), 10);
  EXPECT_FALSE(vector2.empty());

  // Copy-construct from another vector. Note that this reinterprets type.
  Vector<int> vector3(vector1);
  ExpectSameAddress(vector3.data(), ptr);
  EXPECT_EQ(vector3.size(), 10);
  EXPECT_FALSE(vector3.empty());

  // Assign from an empty vector, effectively clearing it.
  vector3 = Vector<int>();
  EXPECT_EQ(vector3.size(), 0);
  EXPECT_TRUE(vector3.empty());

  // Assign from another vector. Note that this reinterprets type.
  vector3 = vector2;
  ExpectSameAddress(vector3.data(), ptr);
  EXPECT_EQ(vector3.size(), 10);
  EXPECT_FALSE(vector3.empty());
}
// Tests that (Mutable)Vector supports access via operator[]. Uses a real
// allocation (UniqueAlignedArray) since elements are actually dereferenced.
TEST(VectorTest, Subscript) {
  UniqueAlignedArray array;
  array.Reset(10 * sizeof(float));

  // Write into a mutable vector.
  const MutableVector<float> mutable_vector(array.view());
  ASSERT_EQ(mutable_vector.size(), 10);
  for (int i = 0; i < 10; ++i) mutable_vector[i] = i;

  // Read from a const vector that points at the same values.
  const Vector<float> const_vector(array.view());
  ASSERT_EQ(const_vector.size(), 10);
  for (int i = 0; i < 10; ++i) EXPECT_EQ(const_vector[i], i);
}
// Tests the subsequence operator by splitting a vector into two aligned
// halves and checking each half's contents.
TEST(VectorTest, Subsequence) {
  // Debug checks will fail if either of the constructed vectors is not aligned.
  constexpr int numAlignedFloats = internal::kAlignmentBytes / sizeof(float);
  UniqueAlignedArray array;
  array.Reset(2 * numAlignedFloats * sizeof(float));

  // Write into a mutable vector.
  const MutableVector<float> mutable_vector(array.view());
  for (int i = 0; i < 2 * numAlignedFloats; ++i) mutable_vector[i] = i;

  // Subscript beginning.
  Vector<float> first_half(mutable_vector.Subsequence(0, numAlignedFloats));
  ASSERT_EQ(first_half.size(), numAlignedFloats);
  for (int i = 0; i < numAlignedFloats; ++i) {
    EXPECT_EQ(first_half[i], i);
  }

  // Subscript end.
  Vector<float> second_half(
      mutable_vector.Subsequence(numAlignedFloats, numAlignedFloats));
  ASSERT_EQ(second_half.size(), numAlignedFloats);
  for (int i = 0; i < numAlignedFloats; ++i) {
    EXPECT_EQ(second_half[i], i + numAlignedFloats);
  }
}
// Tests that (Mutable)Vector supports access via range-based for loops,
// exercising begin()/end() for both writing and reading.
TEST(VectorTest, RangeBasedFor) {
  UniqueAlignedArray array;
  array.Reset(10 * sizeof(float));

  // Write into a mutable vector.
  const MutableVector<float> mutable_vector(array.view());
  ASSERT_EQ(mutable_vector.size(), 10);
  float counter = 0.0;
  for (float &value : mutable_vector) value = counter++;

  // Read from a const vector that points at the same values.
  const Vector<float> const_vector(array.view());
  ASSERT_EQ(const_vector.size(), 10);
  counter = 0.0;
  for (const float &value : const_vector) EXPECT_EQ(value, counter++);
}
// Tests that both matrix flavors have zero dimensions when
// default-constructed.
TEST(MatrixTest, EmptyByDefault) {
  const Matrix<int> readonly_matrix;
  const MutableMatrix<int> writable_matrix;
  EXPECT_EQ(readonly_matrix.num_rows(), 0);
  EXPECT_EQ(readonly_matrix.num_columns(), 0);
  EXPECT_EQ(readonly_matrix.row_stride(), 0);
  EXPECT_EQ(writable_matrix.num_rows(), 0);
  EXPECT_EQ(writable_matrix.num_columns(), 0);
  EXPECT_EQ(writable_matrix.row_stride(), 0);
}
// Tests that (Mutable)Matrix can be constructed from an area, checking that
// dimensions and row stride (which includes alignment padding) are derived
// correctly from the area layout.
TEST(MatrixTest, ConstructFromArea) {
  MutableAlignedView view;
  char *ptr = InvalidAlignedPointer();
  const size_t kNumRows = 11;
  const size_t kNumColumns = 13;
  const size_t kRowBytes = kNumColumns * sizeof(int);
  const size_t kRowStride = PadToAlignment(kRowBytes) / sizeof(int);
  const size_t bytes = ComputeAlignedAreaSize(kNumRows, kRowBytes);
  TF_ASSERT_OK(view.Reset(ptr, bytes));
  MutableAlignedArea area;
  TF_ASSERT_OK(area.Reset(view, kNumRows, kRowBytes));

  const Matrix<int> matrix1(area);
  EXPECT_EQ(matrix1.num_rows(), kNumRows);
  EXPECT_EQ(matrix1.num_columns(), kNumColumns);
  EXPECT_EQ(matrix1.row_stride(), kRowStride);
  ExpectSameAddress(matrix1.row(0).data(), ptr);
  ExpectSameAddress(matrix1.data(), ptr);

  const MutableMatrix<int> matrix2(area);
  EXPECT_EQ(matrix2.num_rows(), kNumRows);
  EXPECT_EQ(matrix2.num_columns(), kNumColumns);
  EXPECT_EQ(matrix2.row_stride(), kRowStride);
  ExpectSameAddress(matrix2.row(0).data(), ptr);
  ExpectSameAddress(matrix2.data(), ptr);
}
// Tests that (Mutable)Matrix supports copy-construction and assignment with
// shallow-copy semantics, and reinterprets from T* to const T*. All copies
// alias the same underlying storage; no element data is touched.
TEST(MatrixTest, CopyAndAssign) {
  MutableAlignedView view;
  char *ptr = InvalidAlignedPointer();
  const size_t kNumRows = 11;
  const size_t kNumColumns = 13;
  const size_t kRowBytes = kNumColumns * sizeof(int);
  const size_t kRowStride = PadToAlignment(kRowBytes) / sizeof(int);
  const size_t bytes = ComputeAlignedAreaSize(kNumRows, kRowBytes);
  TF_ASSERT_OK(view.Reset(ptr, bytes));
  MutableAlignedArea area;
  TF_ASSERT_OK(area.Reset(view, kNumRows, kRowBytes));
  const MutableMatrix<int> matrix1(area);
  EXPECT_EQ(matrix1.num_rows(), kNumRows);
  EXPECT_EQ(matrix1.num_columns(), kNumColumns);
  EXPECT_EQ(matrix1.row_stride(), kRowStride);
  ExpectSameAddress(matrix1.row(0).data(), ptr);
  ExpectSameAddress(matrix1.data(), ptr);

  // Copy-construct from another matrix.
  MutableMatrix<int> matrix2(matrix1);
  EXPECT_EQ(matrix2.num_rows(), kNumRows);
  EXPECT_EQ(matrix2.num_columns(), kNumColumns);
  EXPECT_EQ(matrix2.row_stride(), kRowStride);
  ExpectSameAddress(matrix2.row(0).data(), ptr);
  ExpectSameAddress(matrix2.data(), ptr);

  // Assign from an empty matrix, effectively clearing it.
  matrix2 = MutableMatrix<int>();
  EXPECT_EQ(matrix2.num_rows(), 0);
  EXPECT_EQ(matrix2.num_columns(), 0);
  EXPECT_EQ(matrix2.row_stride(), 0);

  // Assign from the original matrix.
  matrix2 = matrix1;
  EXPECT_EQ(matrix2.num_rows(), kNumRows);
  EXPECT_EQ(matrix2.num_columns(), kNumColumns);
  EXPECT_EQ(matrix2.row_stride(), kRowStride);
  ExpectSameAddress(matrix2.row(0).data(), ptr);
  ExpectSameAddress(matrix2.data(), ptr);

  // Copy-construct from another matrix. Note that this reinterprets type.
  Matrix<int> matrix3(matrix2);
  EXPECT_EQ(matrix3.num_rows(), kNumRows);
  EXPECT_EQ(matrix3.num_columns(), kNumColumns);
  EXPECT_EQ(matrix3.row_stride(), kRowStride);
  ExpectSameAddress(matrix3.row(0).data(), ptr);
  ExpectSameAddress(matrix3.data(), ptr);

  // Assign from an empty matrix, effectively clearing it.
  matrix3 = Matrix<int>();
  EXPECT_EQ(matrix3.num_rows(), 0);
  EXPECT_EQ(matrix3.num_columns(), 0);
  EXPECT_EQ(matrix3.row_stride(), 0);

  // Assign from the original matrix. Note that this reinterprets type.
  matrix3 = matrix1;
  EXPECT_EQ(matrix3.num_rows(), kNumRows);
  EXPECT_EQ(matrix3.num_columns(), kNumColumns);
  EXPECT_EQ(matrix3.row_stride(), kRowStride);
  ExpectSameAddress(matrix3.row(0).data(), ptr);
  ExpectSameAddress(matrix3.data(), ptr);
}
// Tests that (Mutable)Matrix supports row access. Writes a distinct value
// into every cell via a mutable matrix, then reads the values back through a
// const matrix aliasing the same storage.
TEST(MatrixTest, Rows) {
  const size_t kNumRows = 11;
  const size_t kNumColumns = 13;
  const size_t bytes =
      ComputeAlignedAreaSize(kNumRows, kNumColumns * sizeof(float));
  UniqueAlignedArray array;
  array.Reset(bytes);
  MutableAlignedArea area;
  TF_ASSERT_OK(area.Reset(array.view(), kNumRows, kNumColumns * sizeof(float)));

  // Write to a mutable matrix.
  const MutableMatrix<float> mutable_matrix(area);
  ASSERT_EQ(mutable_matrix.num_rows(), kNumRows);
  ASSERT_EQ(mutable_matrix.num_columns(), kNumColumns);
  for (size_t row = 0; row < kNumRows; ++row) {
    for (size_t column = 0; column < kNumColumns; ++column) {
      mutable_matrix.row(row)[column] = row * 1000.0 + column;
    }
  }

  // Read from a const matrix that points at the same values.
  const Matrix<float> const_matrix(area);
  ASSERT_EQ(const_matrix.num_rows(), kNumRows);
  ASSERT_EQ(const_matrix.num_columns(), kNumColumns);
  for (size_t row = 0; row < kNumRows; ++row) {
    for (size_t column = 0; column < kNumColumns; ++column) {
      EXPECT_EQ(const_matrix.row(row)[column], row * 1000.0 + column);
    }
  }
}
// Tests the single-vector matrix constructor: the result is a 1-row matrix
// aliasing the vector's data. Sweeps column counts from 0 (empty vector)
// upward to cover various amounts of alignment padding.
TEST(MatrixTest, MatrixFromVector) {
  for (int cols = 0; cols < 100; ++cols) {
    MutableAlignedView view;
    char *ptr = InvalidAlignedPointer();
    TF_ASSERT_OK(view.Reset(ptr, cols * sizeof(int)));
    const MutableVector<int> vector(view);
    const MutableMatrix<int> matrix(vector);
    ASSERT_EQ(matrix.row(0).data(), vector.data());
    ExpectSameAddress(matrix.data(), vector.data());
    ASSERT_EQ(matrix.num_rows(), 1);
    ASSERT_EQ(matrix.num_columns(), vector.size());
  }
}
// Typed test fixture: each blocked-matrix test below runs once per format
// (row- and column-blocked) and per element type (float and int64).
template <class MatrixType>
class BlockedMatrixTest : public ::testing::Test {};
typedef ::testing::Types<
    BlockedMatrix<float, BlockedMatrixFormat::kRowBlockedColumnMajor>,
    BlockedMatrix<float, BlockedMatrixFormat::kColumnBlockedRowMajor>,
    BlockedMatrix<int64, BlockedMatrixFormat::kRowBlockedColumnMajor>,
    BlockedMatrix<int64, BlockedMatrixFormat::kColumnBlockedRowMajor>>
    BlockedRowAndColumnTypes;
TYPED_TEST_CASE(BlockedMatrixTest, BlockedRowAndColumnTypes);
// Tests that Reset() rejects areas whose views are padded: a block size of 5
// elements does not fill an alignment boundary, so the area's stride exceeds
// its view size.
TYPED_TEST(BlockedMatrixTest, PaddingNotAllowed) {
  MutableAlignedView view;
  MutableAlignedArea area;
  constexpr size_t kNumRows = 10;
  constexpr size_t kNumColumns = 10;
  constexpr size_t kBlockSize = 5;
  constexpr size_t kNumViews = (kNumRows * kNumColumns) / kBlockSize;
  constexpr size_t kBlockSizeBytes =
      kBlockSize * sizeof(typename TypeParam::ElementType);
  const size_t bytes = ComputeAlignedAreaSize(kNumViews, kBlockSizeBytes);
  TF_ASSERT_OK(view.Reset(InvalidAlignedPointer(), bytes));
  TF_ASSERT_OK(area.Reset(view, kNumViews, kBlockSizeBytes));

  // 5 is usually relatively prime to the alignment size, but you may have to
  // update this test if kAlignmentBytes changes.
  ASSERT_NE(PadToAlignment(kBlockSizeBytes), kBlockSizeBytes);
  TypeParam matrix;
  EXPECT_THAT(matrix.Reset(area, kNumRows, kNumColumns),
              test::IsErrorWithSubstr(
                  "Padding is not supported for blocked matrix formats."));
}
// Tests accessors, and the size of matrices after allocation. Also covers the
// Reset() error paths for a mismatched view count and for a major dimension
// that the block size does not divide.
TYPED_TEST(BlockedMatrixTest, Accessors) {
  MutableAlignedView view;
  MutableAlignedArea area;
  char *ptr = InvalidAlignedPointer();
  constexpr size_t kNumRows = 48;
  constexpr size_t kNumColumns = 24;
  constexpr size_t kBlockSize = 8;
  constexpr size_t kNumViews = (kNumRows * kNumColumns) / kBlockSize;
  constexpr size_t kBlockSizeBytes =
      kBlockSize * sizeof(typename TypeParam::ElementType);
  const size_t bytes = ComputeAlignedAreaSize(kNumViews, kBlockSizeBytes);
  TF_ASSERT_OK(view.Reset(ptr, bytes));
  TF_ASSERT_OK(area.Reset(view, kNumViews, kBlockSizeBytes));
  TypeParam matrix;

  // If the view size is wrong, it should fail.
  EXPECT_THAT(
      matrix.Reset(area, kNumRows + 1, kNumColumns),
      test::IsErrorWithSubstr("Area has 144 views, but should have 147"));

  // If the blocking scheme cannot divide the matrix evenly, an error should
  // be raised. The choice of 12 and 96 is a bit non-trivial: they are numbers
  // that conveniently result in the correct area (so other errors won't be
  // raised), but an incompatible division of the vectors.
  if (TypeParam::IsRowBlocked()) {
    EXPECT_THAT(
        matrix.Reset(area, 12, 96),
        test::IsErrorWithSubstr("row-blocked matrix has major dimension 12 "
                                "which is not divisible by the block "
                                "size, 8"));
  } else {
    EXPECT_THAT(
        matrix.Reset(area, 96, 12),
        test::IsErrorWithSubstr("column-blocked matrix has major dimension "
                                "12 which is not divisible by the block "
                                "size, 8"));
  }

  TF_EXPECT_OK(matrix.Reset(area, kNumRows, kNumColumns));
  EXPECT_EQ(matrix.vector(0).data(),
            reinterpret_cast<typename TypeParam::ElementType *>(ptr));
  EXPECT_EQ(matrix.num_rows(), kNumRows);
  EXPECT_EQ(matrix.num_columns(), kNumColumns);
  EXPECT_EQ(matrix.block_size(), kBlockSize);
  EXPECT_EQ(matrix.num_vectors(), kNumViews);
}
// Tests that copying, const-casting, and transposing a blocked matrix all
// preserve (or, for transpose, intentionally change) the byte-wise state and
// keep aliasing the same block vectors.
TYPED_TEST(BlockedMatrixTest, CopyCastTranspose) {
  MutableAlignedView view;
  MutableAlignedArea area;
  constexpr size_t kNumRows = 48;
  constexpr size_t kNumColumns = 24;
  constexpr size_t kBlockSize = 8;
  constexpr size_t kNumViews = (kNumRows * kNumColumns) / kBlockSize;
  constexpr size_t kBlockSizeBytes =
      kBlockSize * sizeof(typename TypeParam::ElementType);
  const size_t bytes = ComputeAlignedAreaSize(kNumViews, kBlockSizeBytes);
  TF_ASSERT_OK(view.Reset(InvalidAlignedPointer(), bytes));
  TF_ASSERT_OK(area.Reset(view, kNumViews, kBlockSizeBytes));
  TypeParam matrix;
  TF_EXPECT_OK(matrix.Reset(area, kNumRows, kNumColumns));

  // Test both copying and casting to const.
  TypeParam matrix_copy = matrix;
  auto readonly = matrix.AsConst();
  EXPECT_TRUE(StructsEqual(matrix, matrix_copy));
  EXPECT_TRUE(StructsEqual(matrix, readonly));
  // Use size_t to match kNumViews and avoid a signed/unsigned comparison.
  for (size_t i = 0; i < kNumViews; ++i) {
    EXPECT_EQ(matrix.vector(i).data(), matrix_copy.vector(i).data());
    EXPECT_EQ(matrix.vector(i).data(), readonly.vector(i).data());
  }

  // Transpose matrix. The transpose swaps rows and columns, so it must not be
  // byte-wise equal, but transposing mutable and const views must agree.
  auto transposed = matrix.Transpose();
  auto readonly_transposed = readonly.Transpose();
  EXPECT_FALSE(StructsEqual(matrix, transposed));
  EXPECT_FALSE(StructsEqual(readonly, readonly_transposed));
  EXPECT_TRUE(StructsEqual(transposed, readonly_transposed));
}
} // namespace
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/runtime/mmap.h"
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <utility>
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/gtl/cleanup.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/logging.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
// Opens |path| read-only; returns the file descriptor, or -1 on failure.
int UniqueAlignedMmap::Syscalls::Open(const string &path) const {
  return open(path.c_str(), O_RDONLY);
}
// Closes the |file_descriptor|; returns 0 on success, like close(2).
int UniqueAlignedMmap::Syscalls::Close(int file_descriptor) const {
  return close(file_descriptor);
}
// Maps |size| bytes of the |file_descriptor| (from offset 0) as a read-only
// shared mapping; returns MAP_FAILED on error, like mmap(2).
void *UniqueAlignedMmap::Syscalls::Mmap(int file_descriptor,
                                        size_t size) const {
  return mmap(nullptr, size, PROT_READ, MAP_SHARED, file_descriptor, 0);
}
// Unmaps the region [|data|, |data| + |size|); returns 0 on success, like
// munmap(2).
int UniqueAlignedMmap::Syscalls::Munmap(void *data, size_t size) const {
  return munmap(data, size);
}
// Injects the |syscalls| implementation; tests pass a mock, production code
// uses the default wrappers above.
UniqueAlignedMmap::UniqueAlignedMmap(std::unique_ptr<Syscalls> syscalls)
    : syscalls_(std::move(syscalls)) {}
// Move constructor: takes over |that|'s mapping and clears |that| so its
// destructor does not unmap the transferred region.
UniqueAlignedMmap::UniqueAlignedMmap(UniqueAlignedMmap &&that)
    : syscalls_(std::move(that.syscalls_)) {
  view_ = that.view_;
  path_ = that.path_;
  that.view_ = MutableAlignedView();
  that.path_.clear();
}
// Move assignment: releases any mapping this object currently owns, then
// takes over |that|'s mapping and clears |that| so its destructor does not
// unmap the transferred region. Self-move is a no-op.
UniqueAlignedMmap &UniqueAlignedMmap::operator=(UniqueAlignedMmap &&that) {
  if (this != &that) {
    // Unmap the current region before overwriting |view_|; otherwise it would
    // leak. This must happen before |syscalls_| is replaced below.
    UnmapIfNonEmpty(view_.data(), view_.size(), path_);
    syscalls_ = std::move(that.syscalls_);
    view_ = that.view_;
    path_ = that.path_;
    that.view_ = MutableAlignedView();
    that.path_.clear();
  }
  return *this;
}
// Releases the owned mapping, if any; UnmapIfNonEmpty() is a no-op for an
// empty view (e.g. after a move or a Reset() on an empty file).
UniqueAlignedMmap::~UniqueAlignedMmap() {
  UnmapIfNonEmpty(view_.data(), view_.size(), path_);
}
// Maps the file at |path| into memory, replacing (and unmapping) any mapping
// this object previously owned. On error, the previous mapping is retained
// and the object is unchanged. (The previous version overwrote |view_|
// without unmapping, leaking the old region on repeated Reset() calls.)
tensorflow::Status UniqueAlignedMmap::Reset(const string &path) {
  uint64 size = 0;
  TF_RETURN_IF_ERROR(tensorflow::Env::Default()->GetFileSize(path, &size));

  // Since mmap() cannot map 0 bytes, we skip the call on empty files. This is
  // OK because UnmapIfNonEmpty() also skips munmap() on an empty region.
  if (size == 0) {
    UnmapIfNonEmpty(view_.data(), view_.size(), path_);  // release old mapping
    view_ = MutableAlignedView();
    path_ = path;
    return tensorflow::Status::OK();
  }

  const int file_descriptor = syscalls_->Open(path);
  if (file_descriptor == -1) {
    // TODO(googleuser): Use strerror_r() to export the system error message.
    return tensorflow::errors::Unknown("Failed to open '", path, "'");
  }

  // In case we error out.
  auto ensure_closed = tensorflow::gtl::MakeCleanup([&] {
    if (syscalls_->Close(file_descriptor) != 0) {
      LOG(ERROR) << "Failed to close '" << path << "'";
    }
  });

  void *mmap_result = syscalls_->Mmap(file_descriptor, size);
  if (mmap_result == MAP_FAILED) {
    return tensorflow::errors::Unknown("Failed to mmap '", path, "'");
  }

  // In case we error out.
  auto ensure_unmapped = tensorflow::gtl::MakeCleanup(
      [&] { UnmapIfNonEmpty(mmap_result, size, path); });

  // Since mmap() increments the refcount of the |file_descriptor|, it must be
  // closed to prevent a leak.
  ensure_closed.release();  // going to close it manually
  if (syscalls_->Close(file_descriptor) != 0) {
    return tensorflow::errors::Unknown("Failed to close '", path, "'");
  }

  // Most implementations of mmap() place the mapped region on a page boundary,
  // which is plenty of alignment. Since this is so unlikely to fail, we don't
  // try to recover if the address is misaligned. A potential recovery method
  // is to allocate a UniqueAlignedArray and read the file normally.
  MutableAlignedView data;
  TF_RETURN_IF_ERROR(data.Reset(reinterpret_cast<char *>(mmap_result), size));

  // Success; release the previous mapping, then make modifications.
  UnmapIfNonEmpty(view_.data(), view_.size(), path_);
  view_ = data;
  path_ = path;
  ensure_unmapped.release();  // this has taken ownership of the mapped file
  return tensorflow::Status::OK();
}
// Unmaps [|data|, |data| + |size|) unless the region is empty; |path| is only
// used for error logging. Errors are logged, not returned, since this is also
// called from the destructor.
void UniqueAlignedMmap::UnmapIfNonEmpty(void *data, size_t size,
                                        const string &path) const {
  if (size == 0) return;
  if (syscalls_->Munmap(data, size) != 0) {
    LOG(ERROR) << "Failed to munmap() file '" << path << "'";
  }
}
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment