Commit edea2b67 authored by Terry Koo

Remove runtime because reasons.

parent a4bb31d0
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/core/test/generic.h"
#include "dragnn/protos/runtime.pb.h"
#include "dragnn/runtime/flexible_matrix_kernel.h"
#include "dragnn/runtime/lstm_cell/cell_function.h"
#include "dragnn/runtime/network_unit.h"
#include "dragnn/runtime/test/network_test_base.h"
#include "dragnn/runtime/variable_store.h"
#include <gmock/gmock.h>
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/platform/test.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
namespace {
using ::testing::Invoke;
using ::testing::_;
class LstmNetworkTest : public NetworkTestBase {
protected:
// Adds a blocked weight matrix with the |name|, the given dimensions, and the
// |fill_value|. If |is_flexible_matrix| is true, the variable is set up for
// use by the FlexibleMatrixKernel.
void AddWeights(const string &name, size_t input_dim, size_t output_dim,
float fill_value, bool is_flexible_matrix = false) {
constexpr int kBatchSize = LstmCellFunction<>::kBatchSize;
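// Pad the output dimension up to a multiple of the cell batch size and derive
// the number of column blocks in the blocked matrix. For example, assuming
// kBatchSize is 48 (as the "blocked48" suffix below suggests), output_dim = 30
// pads to 48 and yields input_dim blocks.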
size_t output_padded =
kBatchSize * ((output_dim + kBatchSize - 1) / kBatchSize);
size_t num_views = (output_padded / kBatchSize) * input_dim;
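// Build the variable name; e.g., in the non-flexible case, weights named
// "x_to_ico" become "test_component/x_to_ico/matrix/blocked48".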
string var_name = tensorflow::strings::StrCat(
kTestComponentName, "/", name,
is_flexible_matrix ? FlexibleMatrixKernel::kSuffix
: "/matrix/blocked48");
const std::vector<float> block(kBatchSize, fill_value);
const std::vector<std::vector<float>> blocks(num_views, block);
variable_store_.AddOrDie(
var_name, blocks, VariableSpec::FORMAT_COLUMN_BLOCKED_ROW_MAJOR_MATRIX);
variable_store_.SetBlockedDimensionOverride(
var_name, {input_dim, output_padded, kBatchSize});
}
// Adds a bias vector with the |name|, the given |dimension|, and the
// |fill_value|.
void AddBiases(const string &name, size_t dimension, float fill_value) {
const string biases_name =
tensorflow::strings::StrCat(kTestComponentName, "/", name);
AddVectorVariable(biases_name, dimension, fill_value);
}
// Creates a network unit, initializes it based on the |component_spec_text|,
// and evaluates it. On error, returns non-OK.
tensorflow::Status Run(const string &component_spec_text) {
ComponentSpec component_spec;
CHECK(TextFormat::ParseFromString(component_spec_text, &component_spec));
component_spec.set_name(kTestComponentName);
// Since LSTMNetwork uses the concatenated input, it is insensitive
// to the particular fixed or linked embedding inputs. For simplicity, the
// tests use a trivial network structure and a single fixed embedding.
AddComponent(kTestComponentName);
TF_RETURN_IF_ERROR(
NetworkUnit::CreateOrError("LSTMNetwork", &network_unit_));
TF_RETURN_IF_ERROR(network_unit_->Initialize(
component_spec, &variable_store_, &network_state_manager_,
&extension_manager_));
network_states_.Reset(&network_state_manager_);
StartComponent(1); // only evaluate the first step
session_state_.extensions.Reset(&extension_manager_);
TF_RETURN_IF_ERROR(
network_unit_->Evaluate(0, &session_state_, &compute_session_));
return tensorflow::Status::OK();
}
// Returns the activation vector of the first step of the layer named
// |layer_name| in the current component.
Vector<float> GetActivations(const string &layer_name) const {
Matrix<float> layer(GetLayer(kTestComponentName, layer_name));
return layer.row(0);
}
std::unique_ptr<NetworkUnit> network_unit_;
};
// Tests that the LSTMNetwork does not produce logits when omit_logits is
// true, even if there are actions.
TEST_F(LstmNetworkTest, NoLogitsOrSoftmaxWhenOmitLogitsTrue) {
constexpr size_t input_dim = 32;
constexpr int kHiddenDim = LstmCellFunction<>::kBatchSize;
const string kSpec = R"(fixed_feature {
vocabulary_size: 50
embedding_dim: 32
size: 1
}
network_unit {
parameters {
key: 'hidden_layer_sizes'
value: '48'
}
parameters {
key: 'omit_logits'
value: 'true'
}
}
num_actions: 10)";
const float kEmbedding = 1.25;
const float kFeature = 0.5;
const float kWeight = 1.5;
AddFixedEmbeddingMatrix(0, 50, input_dim, kEmbedding);
// No "softmax" weights or biases.
AddWeights("x_to_ico", input_dim, 3 * kHiddenDim, kWeight);
AddWeights("h_to_ico", kHiddenDim, 3 * kHiddenDim, kWeight);
AddWeights("c2i", kHiddenDim, kHiddenDim, kWeight);
AddWeights("c2o", kHiddenDim, kHiddenDim, kWeight);
AddBiases("ico_bias", 3 * kHiddenDim, kWeight);
EXPECT_CALL(compute_session_, GetInputFeatures(_, _, _, _, _))
.WillOnce(Invoke(ExtractFeatures(0, {{1, kFeature}})));
TF_EXPECT_OK(Run(kSpec));
// No specified logits layer.
EXPECT_TRUE(network_unit_->GetLogitsName().empty());
// No "logits" layer.
size_t unused_dimension = 0;
LayerHandle<float> unused_handle;
EXPECT_THAT(
network_state_manager_.LookupLayer(kTestComponentName, "logits",
&unused_dimension, &unused_handle),
test::IsErrorWithSubstr(
"Unknown layer 'logits' in component 'test_component'"));
}
TEST_F(LstmNetworkTest, NormalOperationSmallHidden) {
constexpr size_t input_dim = 32;
constexpr int kHiddenDim = 8;
constexpr int num_actions = 10;
const string kSpec = R"(fixed_feature {
vocabulary_size: 50
embedding_dim: 32
size: 1
}
network_unit {
parameters {
key: 'hidden_layer_sizes'
value: '8'
}
}
num_actions: 10)";
const float kEmbedding = 1.25;
const float kFeature = 0.5;
const float kWeight = 1.5;
AddFixedEmbeddingMatrix(0, 50, input_dim, kEmbedding);
// Same as above, with "softmax" weights and biases.
AddWeights("x_to_ico", input_dim, 3 * kHiddenDim, kWeight);
AddWeights("h_to_ico", kHiddenDim, 3 * kHiddenDim, kWeight);
AddWeights("c2i", kHiddenDim, kHiddenDim, kWeight);
AddWeights("c2o", kHiddenDim, kHiddenDim, kWeight);
AddWeights("weights_softmax", kHiddenDim, num_actions, kWeight,
/*is_flexible_matrix=*/true);
AddBiases("ico_bias", 3 * kHiddenDim, kWeight);
AddBiases("bias_softmax", num_actions, kWeight);
EXPECT_CALL(compute_session_, GetInputFeatures(_, _, _, _, _))
.WillOnce(Invoke(ExtractFeatures(0, {{1, kFeature}})));
TF_EXPECT_OK(Run(kSpec));
// Logits should exist.
EXPECT_EQ(network_unit_->GetLogitsName(), "logits");
// The logits dimension matches "num_actions" above. We don't test the values
// very precisely here, so feel free to update them if the cell function
// changes. Most value tests should be in lstm_cell/cell_function_test.cc.
Vector<float> logits = GetActivations("logits");
EXPECT_EQ(logits.size(), num_actions);
EXPECT_NEAR(logits[0], 10.6391, 0.1);
for (int i = 1; i < 10; ++i) {
EXPECT_EQ(logits[i], logits[0])
<< "With uniform weights, all logits should be equal.";
}
}
TEST_F(LstmNetworkTest, ErrorWithTooSmallHidden) {
constexpr size_t input_dim = 32;
constexpr int kHiddenDim = 4;
const string kSpec = R"(fixed_feature {
vocabulary_size: 50
embedding_dim: 32
size: 1
}
network_unit {
parameters {
key: 'hidden_layer_sizes'
value: '4'
}
}
num_actions: 0)";
const float kEmbedding = 1.25;
const float kWeight = 1.5;
AddFixedEmbeddingMatrix(0, 50, input_dim, kEmbedding);
// Same as above, with "softmax" weights and biases.
AddWeights("x_to_ico", input_dim, 3 * kHiddenDim, kWeight);
AddWeights("h_to_ico", kHiddenDim, 3 * kHiddenDim, kWeight);
AddWeights("c2i", kHiddenDim, kHiddenDim, kWeight);
AddWeights("c2o", kHiddenDim, kHiddenDim, kWeight);
AddBiases("ico_bias", 3 * kHiddenDim, kWeight);
EXPECT_THAT(
Run(kSpec),
test::IsErrorWithSubstr(
"Expected hidden size (4) to be a multiple of the AVX width (8)"));
}
} // namespace
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/runtime/master.h"
#include <utility>
#include <vector>
#include "dragnn/protos/runtime.pb.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/gtl/cleanup.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
namespace {
constexpr int kMaxBeamSize = 1;
// Combines, using MergeFrom(), each step trace in the |source| with the
// corresponding step of the |target|. If |source| has more steps, then
// |target| is extended to match.
void MergeTraces(const ComponentTrace &source, ComponentTrace *target) {
while (target->step_trace_size() < source.step_trace_size()) {
target->add_step_trace();
}
for (int i = 0; i < source.step_trace_size(); ++i) {
target->mutable_step_trace(i)->MergeFrom(source.step_trace(i));
}
}
// Combines, using MergeTraces(), each component trace in the |source| with the
// corresponding component of the |target|. If |source| has more components,
// then |target| is extended to match.
void MergeTraces(const MasterTrace &source, MasterTrace *target) {
while (target->component_trace_size() < source.component_trace_size()) {
target->add_component_trace();
}
for (int i = 0; i < source.component_trace_size(); ++i) {
MergeTraces(source.component_trace(i), target->mutable_component_trace(i));
}
}
} // namespace
tensorflow::Status Master::Initialize(
const MasterSpec &master_spec,
std::unique_ptr<VariableStore> variable_store) {
if (variable_store_ != nullptr) {
return tensorflow::errors::FailedPrecondition("Can't initialize twice");
}
if (variable_store == nullptr) {
return tensorflow::errors::InvalidArgument("No VariableStore");
}
variable_store_ = std::move(variable_store);
const auto &master_performance_settings = master_spec.GetExtension(
MasterPerformanceSettings::master_spec_extension);
session_state_pool_.reset(new SessionStatePool(
master_performance_settings.session_state_pool_max_free_states()));
components_.reserve(master_spec.component_size());
for (const ComponentSpec &component_spec : master_spec.component()) {
const auto &component_performance_settings = component_spec.GetExtension(
ComponentPerformanceSettings::component_spec_extension);
components_.emplace_back();
ComponentConfig &component = components_.back();
component.name = component_spec.name();
component.pre_allocate_num_steps =
component_performance_settings.pre_allocate_num_steps();
TF_RETURN_IF_ERROR(
network_state_manager_.AddComponent(component_spec.name()));
const string component_type =
GetNormalizedComponentBuilderName(component_spec);
TF_RETURN_IF_ERROR(
Component::CreateOrError(component_type, &component.instance));
TF_RETURN_IF_ERROR(component.instance->Initialize(
component_spec, variable_store_.get(), &network_state_manager_,
&extension_manager_));
}
return variable_store_->Close();
}
tensorflow::Status Master::Evaluate(ComputeSession *compute_session,
MasterTrace *master_trace) const {
if (variable_store_ == nullptr) {
return tensorflow::errors::FailedPrecondition("Not initialized");
}
if (compute_session == nullptr) {
return tensorflow::errors::InvalidArgument("No ComputeSession");
}
if (master_trace != nullptr) {
master_trace->Clear();
compute_session->SetTracing(true);
}
const auto ensure_tracing_disabled = tensorflow::gtl::MakeCleanup([=] {
if (master_trace != nullptr) compute_session->SetTracing(false);
});
const ScopedSessionState session_state(session_state_pool_.get());
session_state->network_states.Reset(&network_state_manager_);
session_state->extensions.Reset(&extension_manager_);
for (const ComponentConfig &component : components_) {
// TODO(googleuser): Generically trace all layers?
ComponentTrace *component_trace = nullptr;
if (master_trace != nullptr) {
component_trace = master_trace->add_component_trace();
component_trace->set_name(component.name);
}
compute_session->InitializeComponentData(component.name, kMaxBeamSize);
TF_RETURN_IF_ERROR(session_state->network_states.StartNextComponent(
component.pre_allocate_num_steps));
TF_RETURN_IF_ERROR(component.instance->Evaluate(
session_state.get(), compute_session, component_trace));
compute_session->FinalizeData(component.name);
}
if (master_trace != nullptr) {
// Use only the first trace from the compute session.
const std::vector<MasterTrace> traces = compute_session->GetTraceProtos();
if (!traces.empty()) MergeTraces(traces[0], master_trace);
}
return tensorflow::Status::OK();
}
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#ifndef DRAGNN_RUNTIME_MASTER_H_
#define DRAGNN_RUNTIME_MASTER_H_
#include <memory>
#include <string>
#include <vector>
#include "dragnn/core/compute_session.h"
#include "dragnn/protos/spec.pb.h"
#include "dragnn/protos/trace.pb.h"
#include "dragnn/runtime/component.h"
#include "dragnn/runtime/extensions.h"
#include "dragnn/runtime/network_states.h"
#include "dragnn/runtime/session_state.h"
#include "dragnn/runtime/session_state_pool.h"
#include "dragnn/runtime/variable_store.h"
#include "syntaxnet/base.h"
#include "tensorflow/core/lib/core/status.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
// A DRAGNN master, which evaluates a series of components.
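//
// A minimal usage sketch (assumes an already-populated ComputeSession and a
// concrete VariableStore implementation; error handling is elided):
//
//   Master master;
//   TF_CHECK_OK(master.Initialize(master_spec, std::move(variable_store)));
//   MasterTrace master_trace;
//   TF_CHECK_OK(master.Evaluate(&compute_session, &master_trace));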
class Master {
public:
// Creates an uninitialized master. Call Initialize() before use.
Master() = default;
// Initializes the components in this based on the |master_spec|, which may
// have performance tuning settings attached (see runtime.proto). Retrieves
// pre-trained variables from the |variable_store|, which must not be closed.
// On error, returns non-OK.
tensorflow::Status Initialize(const MasterSpec &master_spec,
std::unique_ptr<VariableStore> variable_store);
// Evaluates the pipeline of components on the |compute_session|, which must
// be based on the same MasterSpec as this and populated with input data. If
// |master_trace| is non-null, overwrites it with extracted traces. On error,
// returns non-OK.
tensorflow::Status Evaluate(ComputeSession *compute_session,
MasterTrace *master_trace) const;
private:
// A Component with some associated configuration.
struct ComponentConfig {
// Name of the component.
string name;
// Number of steps to pre-allocate operands for the component.
size_t pre_allocate_num_steps = 0;
// Component instance to initialize and evaluate.
std::unique_ptr<Component> instance;
};
// Store of pre-trained variables used by the |components_|. Must be declared
// before the |components_| to ensure it outlives them.
std::unique_ptr<VariableStore> variable_store_;
// Manager for the network states in the |components_|.
NetworkStateManager network_state_manager_;
// Manager for SessionState extensions.
ExtensionManager extension_manager_;
// Ordered list of components to evaluate.
std::vector<ComponentConfig> components_;
// Pool of session states used when evaluating the |components_|. This must
// be destroyed before the |components_|, in case there are state extensions
// that depend on the |components_|. Declaring this after the |components_|
// ensures the proper destructor ordering.
std::unique_ptr<SessionStatePool> session_state_pool_;
};
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
#endif // DRAGNN_RUNTIME_MASTER_H_
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/runtime/master.h"
#include <stddef.h>
#include <algorithm>
#include <memory>
#include <string>
#include <vector>
#include "dragnn/core/test/generic.h"
#include "dragnn/core/test/mock_compute_session.h"
#include "dragnn/protos/spec.pb.h"
#include "dragnn/protos/trace.pb.h"
#include "dragnn/runtime/alignment.h"
#include "dragnn/runtime/component.h"
#include "dragnn/runtime/extensions.h"
#include "dragnn/runtime/network_states.h"
#include "dragnn/runtime/session_state.h"
#include "dragnn/runtime/test/fake_variable_store.h"
#include "dragnn/runtime/variable_store.h"
#include "syntaxnet/base.h"
#include <gmock/gmock.h>
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/test.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
namespace {
using ::testing::_;
using ::testing::InSequence;
using ::testing::Invoke;
using ::testing::Return;
// Number of steps to take in each component.
constexpr size_t kNumSteps = 123;
// Outputs a layer of all 1s.
class Ones : public Component {
public:
// Implements Component.
tensorflow::Status Initialize(const ComponentSpec &component_spec,
VariableStore *variable_store,
NetworkStateManager *network_state_manager,
ExtensionManager *extension_manager) override {
return network_state_manager->AddLayer("ones", 1, &output_handle_);
}
tensorflow::Status Evaluate(SessionState *session_state,
ComputeSession *compute_session,
ComponentTrace *component_trace) const override {
NetworkStates *network_states = &session_state->network_states;
for (size_t step = 0; step < kNumSteps; ++step) {
network_states->AddStep();
network_states->GetLayer(output_handle_).row(step)[0] = 1.0;
}
return tensorflow::Status::OK();
}
bool Supports(const ComponentSpec &spec,
const string &normalized_builder_name) const override {
return normalized_builder_name == "Ones";
}
bool PreferredTo(const Component &other) const override { return false; }
private:
// Handle to the output layer.
LayerHandle<float> output_handle_;
};
DRAGNN_RUNTIME_REGISTER_COMPONENT(Ones);
// Extends its input layer with the step-wise cumulative sum of the final entry
// in each row of the input. E.g.,
//   [[0, 1],      [[0, 1, 1 (= 1)],
//    [2, 3],  =>   [2, 3, 4 (= 1 + 3)],
//    [4, 5]]       [4, 5, 9 (= 1 + 3 + 5)]]
class ExtendWithCumulativeSum : public Component {
public:
// Implements Component.
tensorflow::Status Initialize(const ComponentSpec &component_spec,
VariableStore *variable_store,
NetworkStateManager *network_state_manager,
ExtensionManager *extension_manager) override {
// NB: In a real Component implementation, linked embeddings are accessed
// using the LinkedEmbeddingManager and LinkedEmbeddings. Here, we set up
// the link manually because it's simple and makes the test self-contained.
CHECK_EQ(component_spec.linked_feature_size(), 1);
const LinkedFeatureChannel &link = component_spec.linked_feature(0);
size_t dimension = 0;
TF_RETURN_IF_ERROR(network_state_manager->LookupLayer(
link.source_component(), link.source_layer(), &dimension,
&input_handle_));
CHECK_GT(dimension, 0);
return network_state_manager->AddLayer("sums", dimension + 1,
&output_handle_);
}
tensorflow::Status Evaluate(SessionState *session_state,
ComputeSession *compute_session,
ComponentTrace *component_trace) const override {
NetworkStates *network_states = &session_state->network_states;
float sum = 0.0;
for (size_t step = 0; step < kNumSteps; ++step) {
network_states->AddStep();
const Vector<float> inputs(
network_states->GetLayer(input_handle_).row(step));
const MutableVector<float> outputs(
network_states->GetLayer(output_handle_).row(step));
CHECK_EQ(outputs.size(), inputs.size() + 1);
sum += inputs[inputs.size() - 1];
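// Copy the input row into the first |inputs.size()| output columns; std::copy
// returns an iterator one past the copied range, so this assignment writes the
// running sum into the extra final column.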
*std::copy(inputs.begin(), inputs.end(), outputs.begin()) = sum;
}
return tensorflow::Status::OK();
}
bool Supports(const ComponentSpec &spec,
const string &normalized_builder_name) const override {
return normalized_builder_name == "ExtendWithCumulativeSum";
}
bool PreferredTo(const Component &other) const override { return false; }
private:
// Handles to the input and output layers.
LayerHandle<float> input_handle_;
LayerHandle<float> output_handle_;
};
DRAGNN_RUNTIME_REGISTER_COMPONENT(ExtendWithCumulativeSum);
// Makes predictions using its inputs.
class MakePredictions : public Component {
public:
// Implements Component.
tensorflow::Status Initialize(const ComponentSpec &component_spec,
VariableStore *variable_store,
NetworkStateManager *network_state_manager,
ExtensionManager *extension_manager) override {
name_ = component_spec.name();
CHECK_EQ(component_spec.linked_feature_size(), 1);
const LinkedFeatureChannel &link = component_spec.linked_feature(0);
size_t dimension = 0;
return network_state_manager->LookupLayer(link.source_component(),
link.source_layer(), &dimension,
&input_handle_);
}
tensorflow::Status Evaluate(SessionState *session_state,
ComputeSession *compute_session,
ComponentTrace *component_trace) const override {
NetworkStates *network_states = &session_state->network_states;
Matrix<float> inputs(network_states->GetLayer(input_handle_));
for (size_t step = 0; step < kNumSteps; ++step) {
const Vector<float> logits = inputs.row(step);
if (!compute_session->AdvanceFromPrediction(name_, logits.data(), 1,
logits.size())) {
return tensorflow::errors::Internal(
"Error in ComputeSession::AdvanceFromPrediction() at step ", step);
}
}
return tensorflow::Status::OK();
}
bool Supports(const ComponentSpec &spec,
const string &normalized_builder_name) const override {
return normalized_builder_name == "MakePredictions";
}
bool PreferredTo(const Component &other) const override { return false; }
private:
// Name of this component.
string name_;
// Handle to the input layer, which is treated as prediction logits.
LayerHandle<float> input_handle_;
};
DRAGNN_RUNTIME_REGISTER_COMPONENT(MakePredictions);
// Component whose Evaluate() always fails.
class AlwaysFails : public Component {
public:
// Implements Component.
tensorflow::Status Initialize(const ComponentSpec &component_spec,
VariableStore *variable_store,
NetworkStateManager *network_state_manager,
ExtensionManager *extension_manager) override {
return tensorflow::Status::OK();
}
tensorflow::Status Evaluate(SessionState *session_state,
ComputeSession *compute_session,
ComponentTrace *component_trace) const override {
return tensorflow::errors::Internal("I always fail!");
}
bool Supports(const ComponentSpec &spec,
const string &normalized_builder_name) const override {
return normalized_builder_name == "AlwaysFails";
}
bool PreferredTo(const Component &other) const override { return false; }
};
DRAGNN_RUNTIME_REGISTER_COMPONENT(AlwaysFails);
class MasterTest : public ::testing::Test {
protected:
// Returns a new VariableStore.
static std::unique_ptr<VariableStore> NewVariableStore() {
// None of the tests or components look at the pre-trained variables, so
// return an empty store.
return std::unique_ptr<VariableStore>(new FakeVariableStore());
}
// Initializes and runs the |master_| using the text-format MasterSpec in
// |master_spec_text|. The |master_trace| is overwritten with traces, if
// specified. If |expect_success| is false, then EXPECT_CALLs that assume
// success are disabled. On error, returns non-OK.
tensorflow::Status TryRun(const string &master_spec_text, bool expect_success,
MasterTrace *master_trace = nullptr) {
MasterSpec master_spec;
CHECK(TextFormat::ParseFromString(master_spec_text, &master_spec));
TF_RETURN_IF_ERROR(master_.Initialize(master_spec, NewVariableStore()));
{ // Add call expectations for initializing each component, in order.
InSequence ordered_calls;
for (const ComponentSpec &component_spec : master_spec.component()) {
EXPECT_CALL(compute_session_,
InitializeComponentData(component_spec.name(), 1))
.Times(1);
}
}
// If applicable, add call expectations for making "predictions" in the
// final component that capture the prediction logits for inspection.
if (master_spec.component_size() > 0 && expect_success) {
const string &last_component_name =
master_spec.component(master_spec.component_size() - 1).name();
EXPECT_CALL(compute_session_,
AdvanceFromPrediction(last_component_name, _, 1, _))
.Times(kNumSteps)
.WillRepeatedly(
Invoke([this](const string &, const float *data, int, int size) {
logits_.emplace_back(data, data + size);
return true;
}));
}
// Add call expectations for finalizing data in all components.
if (expect_success) {
for (const ComponentSpec &component_spec : master_spec.component()) {
EXPECT_CALL(compute_session_, FinalizeData(component_spec.name()))
.Times(1);
}
}
return master_.Evaluate(&compute_session_, master_trace);
}
// As above, but asserts that all operations succeed.
void Run(const string &master_spec_text,
MasterTrace *master_trace = nullptr) {
TF_ASSERT_OK(
TryRun(master_spec_text, /*expect_success=*/true, master_trace));
}
::testing::StrictMock<MockComputeSession> compute_session_;
std::vector<std::vector<float>> logits_;
Master master_;
};
// Tests that Master cannot be initialized multiple times.
TEST_F(MasterTest, InitializeTwice) {
TF_ASSERT_OK(master_.Initialize(MasterSpec(), NewVariableStore()));
EXPECT_THAT(master_.Initialize(MasterSpec(), NewVariableStore()),
test::IsErrorWithSubstr("Can't initialize twice"));
}
// Tests that Master requires a variable store.
TEST_F(MasterTest, NoVariableStore) {
EXPECT_THAT(master_.Initialize(MasterSpec(), nullptr),
test::IsErrorWithSubstr("No VariableStore"));
}
// Tests that Master must be initialized before evaluation.
TEST_F(MasterTest, EvaluateWithoutInitializing) {
EXPECT_THAT(master_.Evaluate(&compute_session_, nullptr),
test::IsErrorWithSubstr("Not initialized"));
}
// Tests that Master requires a compute session.
TEST_F(MasterTest, NoComputeSession) {
TF_ASSERT_OK(master_.Initialize(MasterSpec(), NewVariableStore()));
EXPECT_THAT(master_.Evaluate(nullptr, nullptr),
test::IsErrorWithSubstr("No ComputeSession"));
}
// Tests that Master works with an empty spec and does nothing (StrictMock would
// raise an error if any methods on the ComputeSession were called).
TEST_F(MasterTest, EmptySpec) {
Run("");
EXPECT_TRUE(logits_.empty());
}
// Tests that Master can run a simple pipeline that generates ones.
TEST_F(MasterTest, Ones) {
Run(R"(component {
name: 'component1'
component_builder {
registered_name: 'Ones'
}
}
component {
name: 'component2'
component_builder {
registered_name: 'MakePredictions'
}
linked_feature {
source_component: 'component1'
source_layer: 'ones'
}
})");
EXPECT_EQ(logits_.size(), kNumSteps);
const std::vector<float> expected_row = {1.0};
for (const auto &row : logits_) EXPECT_EQ(row, expected_row);
}
// Tests that Master can run a pipeline with a cumulative summation.
TEST_F(MasterTest, SingleSummation) {
Run(R"(component {
name: 'component1'
component_builder {
registered_name: 'Ones'
}
}
component {
name: 'component2'
component_builder {
registered_name: 'ExtendWithCumulativeSum'
}
linked_feature {
source_component: 'component1'
source_layer: 'ones'
}
}
component {
name: 'component3'
component_builder {
registered_name: 'MakePredictions'
}
linked_feature {
source_component: 'component2'
source_layer: 'sums'
}
})");
EXPECT_EQ(logits_.size(), kNumSteps);
float sum = 0.0;
for (const auto &row : logits_) {
++sum;
const std::vector<float> expected_row = {1.0, sum};
EXPECT_EQ(row, expected_row);
}
}
// Tests that Master can run a pipeline with multiple summations.
TEST_F(MasterTest, MultiSummation) {
Run(R"(component {
name: 'component1'
component_builder {
registered_name: 'Ones'
}
}
component {
name: 'component2'
component_builder {
registered_name: 'ExtendWithCumulativeSum'
}
linked_feature {
source_component: 'component1'
source_layer: 'ones'
}
}
component {
name: 'component3'
component_builder {
registered_name: 'ExtendWithCumulativeSum'
}
linked_feature {
source_component: 'component2'
source_layer: 'sums'
}
}
component {
name: 'component4'
component_builder {
registered_name: 'ExtendWithCumulativeSum'
}
linked_feature {
source_component: 'component3'
source_layer: 'sums'
}
}
component {
name: 'component5'
component_builder {
registered_name: 'MakePredictions'
}
linked_feature {
source_component: 'component4'
source_layer: 'sums'
}
})");
EXPECT_EQ(logits_.size(), kNumSteps);
float sum1 = 0.0, sum2 = 0.0, sum3 = 0.0;
for (const auto &row : logits_) {
sum3 += sum2 += ++sum1;
const std::vector<float> expected_row = {1.0, sum1, sum2, sum3};
EXPECT_EQ(row, expected_row);
}
}
// Tests that Master can run a pipeline with tracing.
TEST_F(MasterTest, SingleSummationWithTracing) {
{ // Expect to enable and then disable tracing, in that order.
InSequence ordered_calls;
EXPECT_CALL(compute_session_, SetTracing(true));
EXPECT_CALL(compute_session_, SetTracing(false));
}
// Build a set of traces for the compute session to return.
std::vector<MasterTrace> traces(1);
traces.back().add_component_trace()->add_step_trace()->set_caption("A");
traces.back().add_component_trace()->add_step_trace()->set_caption("B");
traces.back().add_component_trace()->add_step_trace()->set_caption("C");
traces.back().add_component_trace()->add_step_trace()->set_caption("D");
EXPECT_CALL(compute_session_, GetTraceProtos()).WillOnce(Return(traces));
MasterTrace master_trace;
Run(R"(component {
name: 'component1'
component_builder {
registered_name: 'Ones'
}
}
component {
name: 'component2'
component_builder {
registered_name: 'ExtendWithCumulativeSum'
}
linked_feature {
source_component: 'component1'
source_layer: 'ones'
}
}
component {
name: 'component3'
component_builder {
registered_name: 'MakePredictions'
}
linked_feature {
source_component: 'component2'
source_layer: 'sums'
}
})",
&master_trace);
const string kExpectedTraceText = R"(
component_trace { name: 'component1' step_trace { caption: 'A' } }
component_trace { name: 'component2' step_trace { caption: 'B' } }
component_trace { name: 'component3' step_trace { caption: 'C' } }
component_trace { step_trace { caption: 'D' } }
)";
MasterTrace expected_trace;
ASSERT_TRUE(TextFormat::ParseFromString(kExpectedTraceText, &expected_trace));
EXPECT_THAT(master_trace, test::EqualsProto(expected_trace));
}
// Tests that Master disables tracing even on error.
TEST_F(MasterTest, DisablesTracingOnFailure) {
{ // Expect to enable and then disable tracing, in that order.
InSequence ordered_calls;
EXPECT_CALL(compute_session_, SetTracing(true));
EXPECT_CALL(compute_session_, SetTracing(false));
}
const string kMasterSpec = R"(component {
name: 'component1'
component_builder {
registered_name: 'AlwaysFails'
}
})";
MasterTrace master_trace;
EXPECT_THAT(TryRun(kMasterSpec, /*expect_success=*/false, &master_trace),
test::IsErrorWithSubstr("I always fail!"));
const string kExpectedTraceText = "component_trace { name: 'component1' }";
MasterTrace expected_trace;
ASSERT_TRUE(TextFormat::ParseFromString(kExpectedTraceText, &expected_trace));
EXPECT_THAT(master_trace, test::EqualsProto(expected_trace));
}
} // namespace
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
package(
default_visibility = ["//visibility:public"],
)
load(
"@org_tensorflow//tensorflow:tensorflow.bzl",
"if_linux_x86_64",
)
load(
"//dragnn/runtime:multiarch.bzl",
"dragnn_cc_multiarch_test",
)
FAST_MATH_COPTS = if_linux_x86_64([
"-O3",
"-msse4.2",
"-ffast-math",
"-ftree-vectorize",
])
cc_library(
name = "avx_vector_array",
hdrs = ["avx_vector_array.h"],
deps = [":float16_types"],
)
cc_test(
name = "avx_vector_array_test",
srcs = ["avx_vector_array_test.cc"],
deps = [
":avx_vector_array",
"//dragnn/runtime/test:helpers",
"@org_tensorflow//tensorflow/core:test",
],
)
cc_library(
name = "avx_activation_functions",
hdrs = ["avx_activation_functions.h"],
deps = [
":avx_vector_array",
],
)
dragnn_cc_multiarch_test(
name = "avx_activation_functions_test",
srcs = ["avx_activation_functions_test.cc"],
copts = FAST_MATH_COPTS,
deps = [
":avx_activation_functions",
"//dragnn/runtime/test:helpers",
"//syntaxnet:base",
"@org_tensorflow//tensorflow/core:test",
],
)
cc_library(
name = "float16_types",
hdrs = ["float16_types.h"],
deps = [
"//syntaxnet:base",
"@org_tensorflow//tensorflow/core:lib",
],
)
cc_test(
name = "float16_types_test",
srcs = ["float16_types_test.cc"],
deps = [
":float16_types",
"//syntaxnet:test_main",
"@org_tensorflow//tensorflow/core:test",
],
)
cc_library(
name = "sgemvv",
hdrs = ["sgemvv.h"],
deps = [
":avx_vector_array",
":types",
"@org_tensorflow//tensorflow/core:lib",
],
)
cc_test(
name = "sgemvv_test",
srcs = ["sgemvv_test.cc"],
copts = [
"-O3",
"-mavx2",
"-mfma",
],
tags = [
"manual",
],
deps = [
":arithmetic",
":sgemvv",
":transformations",
":types",
"//dragnn/core/test:generic",
"//dragnn/runtime/test:helpers",
"@org_tensorflow//tensorflow/core:lib",
"@org_tensorflow//tensorflow/core:test",
],
)
cc_test(
name = "sgemvv_compatibility_test",
srcs = ["sgemvv_test.cc"],
copts = [
"-O3",
"-ftree-vectorize",
"-ffast-math",
],
deps = [
":arithmetic",
":sgemvv",
":transformations",
":types",
"//dragnn/core/test:generic",
"//dragnn/runtime/test:helpers",
"@org_tensorflow//tensorflow/core:lib",
"@org_tensorflow//tensorflow/core:test",
],
)
cc_library(
name = "transformations",
hdrs = ["transformations.h"],
deps = [
":types",
"@org_tensorflow//tensorflow/core:lib",
],
)
cc_test(
name = "transformations_test",
srcs = ["transformations_test.cc"],
deps = [
":transformations",
"//dragnn/runtime/test:helpers",
"@org_tensorflow//tensorflow/core:lib",
"@org_tensorflow//tensorflow/core:test",
],
)
cc_library(
name = "types",
hdrs = ["types.h"],
deps = [
"//dragnn/runtime:alignment",
"@org_tensorflow//tensorflow/core:lib",
],
)
cc_test(
name = "types_test",
size = "small",
srcs = ["types_test.cc"],
deps = [
":types",
"//dragnn/core/test:generic",
"//dragnn/runtime:alignment",
"@org_tensorflow//tensorflow/core:lib",
"@org_tensorflow//tensorflow/core:test",
],
)
cc_library(
name = "eigen",
hdrs = ["eigen.h"],
deps = [
":types",
"//dragnn/runtime:alignment",
"@org_tensorflow//third_party/eigen3",
],
)
cc_test(
name = "eigen_test",
size = "small",
srcs = ["eigen_test.cc"],
deps = [
":eigen",
":types",
"//dragnn/core/test:generic",
"//dragnn/runtime/test:helpers",
"@org_tensorflow//tensorflow/core:test",
],
)
cc_library(
name = "arithmetic",
srcs = [
"arithmetic_avx.h",
"arithmetic_common.h",
"arithmetic_neon.h",
"arithmetic_sse.h",
],
hdrs = ["arithmetic.h"],
deps = [
":types",
"@org_tensorflow//tensorflow/core:lib",
],
)
cc_test(
name = "arithmetic_test",
size = "small",
srcs = ["arithmetic_test.cc"],
deps = [
":arithmetic",
":types",
"//dragnn/runtime/test:helpers",
"//syntaxnet:test_main",
"@org_tensorflow//tensorflow/core:lib",
"@org_tensorflow//tensorflow/core:test",
],
)
cc_test(
name = "arithmetic_avx_test",
size = "small",
srcs = ["arithmetic_test.cc"],
copts = [
"-mavx2",
"-mfma",
],
tags = [
"manual",
],
deps = [
":arithmetic",
":types",
"//dragnn/runtime/test:helpers",
"//syntaxnet:test_main",
"@org_tensorflow//tensorflow/core:lib",
"@org_tensorflow//tensorflow/core:test",
],
)
cc_test(
name = "arithmetic_sse_test",
size = "small",
srcs = ["arithmetic_test.cc"],
copts = ["-msse4.2"],
deps = [
":arithmetic",
":types",
"//dragnn/runtime/test:helpers",
"//syntaxnet:test_main",
"@org_tensorflow//tensorflow/core:lib",
"@org_tensorflow//tensorflow/core:test",
],
)
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
// Top-level organizational header for arithmetic operations. Users should
// include this instead of directly including the sub-headers below. See
// arithmetic_common.h for function declarations and comments.
//
// NB: If you wish to use an architecture-specific implementation, make sure to
// add the relevant copts to the cc_library whose .cc file includes this header.
#ifndef DRAGNN_RUNTIME_MATH_ARITHMETIC_H_
#define DRAGNN_RUNTIME_MATH_ARITHMETIC_H_
// Select an architecture-specific implementation, if possible, or fall back to
// the trivial generic implementations. The order of the clauses is important:
// in cases where architectures may overlap the newer version should be checked
// first (e.g., AVX before SSE).
#if defined(__AVX2__)
#include "dragnn/runtime/math/arithmetic_avx.h"
#elif defined(__SSE4_2__)
#include "dragnn/runtime/math/arithmetic_sse.h"
#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
#include "dragnn/runtime/math/arithmetic_neon.h"
#else // no architecture-specific implementation
#include "dragnn/runtime/math/arithmetic_common.h"
#endif
#endif // DRAGNN_RUNTIME_MATH_ARITHMETIC_H_
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#ifndef DRAGNN_RUNTIME_MATH_ARITHMETIC_AVX_H_
#define DRAGNN_RUNTIME_MATH_ARITHMETIC_AVX_H_
#if defined(__AVX2__)
#include <stddef.h>
#include "dragnn/runtime/math/arithmetic_common.h"
#include "dragnn/runtime/math/types.h"
#include "tensorflow/core/platform/logging.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
// TODO(googleuser): Leaving this empty means that the definitions
// from arithmetic_common.h carry through. Provide template specializations
// that use architecture-specific intrinsics.
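//
// A specialization might look like the following sketch (illustrative only; it
// assumes AVX2 intrinsics from <immintrin.h> and <algorithm> for the scalar
// tail):
//
//   template <>
//   inline void MaxElements(float minimum, MutableVector<float> values) {
//     const __m256 min_vec = _mm256_set1_ps(minimum);
//     size_t i = 0;
//     for (; i + 8 <= values.size(); i += 8) {
//       const __m256 v = _mm256_loadu_ps(&values[i]);
//       _mm256_storeu_ps(&values[i], _mm256_max_ps(v, min_vec));
//     }
//     for (; i < values.size(); ++i) values[i] = std::max(minimum, values[i]);
//   }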
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
#endif // defined(__AVX2__)
#endif // DRAGNN_RUNTIME_MATH_ARITHMETIC_AVX_H_
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
// Declarations of arithmetic operations and trivial generic implementations.
// Architecture-specific implementations should include this header and define
// template specializations that override the generic implementations.
#ifndef DRAGNN_RUNTIME_MATH_ARITHMETIC_COMMON_H_
#define DRAGNN_RUNTIME_MATH_ARITHMETIC_COMMON_H_
#include <stddef.h>
#include <algorithm>
#include "dragnn/runtime/math/types.h"
#include "tensorflow/core/platform/logging.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
// Performs output = scale * input. Dimensions must match.
template <class T>
void ScaleElements(T scale, Vector<T> input, MutableVector<T> output);
// Performs output += scale * input. Dimensions must match.
template <class T>
void AddScaledElements(T scale, Vector<T> input, MutableVector<T> output);
// Performs values = max(minimum, values) in place.
template <class T>
void MaxElements(T minimum, MutableVector<T> values);
// Performs output = matrix * input. All vectors are interpreted as column
// vectors. Dimensions must match.
template <class T>
void MultiplyMatrixAndVector(Matrix<T> matrix, Vector<T> input,
MutableVector<T> output);
// Performs output = bias + matrix * input. All vectors are interpreted as
// column vectors. Dimensions must match.
template <class T>
void MultiplyMatrixAndVectorWithBias(Matrix<T> matrix, Vector<T> bias,
Vector<T> input, MutableVector<T> output);
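//
// For example, given matrix = [[2, 3], [5, 7]], input = [x, y], and bias =
// [b0, b1], MultiplyMatrixAndVector() writes [2x + 3y, 5x + 7y] into output,
// and MultiplyMatrixAndVectorWithBias() writes [b0 + 2x + 3y, b1 + 5x + 7y].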
// Implementation details below.
template <class T>
void ScaleElements(T scale, Vector<T> input, MutableVector<T> output) {
DCHECK_EQ(input.size(), output.size());
for (size_t i = 0; i < input.size(); ++i) output[i] = scale * input[i];
}
template <class T>
void AddScaledElements(T scale, Vector<T> input, MutableVector<T> output) {
DCHECK_EQ(input.size(), output.size());
for (size_t i = 0; i < input.size(); ++i) output[i] += scale * input[i];
}
template <class T>
void MaxElements(T minimum, MutableVector<T> values) {
for (T &value : values) value = std::max(minimum, value);
}
namespace internal {
// Like MultiplyMatrixAndVectorWithBias(), but if |ignore_bias| is true, then
// the |bias| is treated as zero and its dimensions are not checked.
template <bool ignore_bias, class T>
void MultiplyMatrixAndVectorImpl(Matrix<T> matrix, Vector<T> bias,
Vector<T> input, MutableVector<T> output) {
DCHECK_EQ(matrix.num_columns(), input.size());
if (!ignore_bias) DCHECK_EQ(matrix.num_rows(), bias.size());
DCHECK_EQ(matrix.num_rows(), output.size());
for (size_t i = 0; i < matrix.num_rows(); ++i) {
const Vector<T> row = matrix.row(i);
DCHECK_EQ(row.size(), input.size());
T sum = ignore_bias ? T() : bias[i];
for (size_t j = 0; j < row.size(); ++j) sum += row[j] * input[j];
output[i] = sum;
}
}
} // namespace internal
template <class T>
void MultiplyMatrixAndVector(Matrix<T> matrix, Vector<T> input,
MutableVector<T> output) {
internal::MultiplyMatrixAndVectorImpl<true>(matrix, {}, input, output);
}
template <class T>
void MultiplyMatrixAndVectorWithBias(Matrix<T> matrix, Vector<T> bias,
Vector<T> input, MutableVector<T> output) {
internal::MultiplyMatrixAndVectorImpl<false>(matrix, bias, input, output);
}
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
#endif // DRAGNN_RUNTIME_MATH_ARITHMETIC_COMMON_H_
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#ifndef DRAGNN_RUNTIME_MATH_ARITHMETIC_NEON_H_
#define DRAGNN_RUNTIME_MATH_ARITHMETIC_NEON_H_
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <stddef.h>
#include "dragnn/runtime/math/arithmetic_common.h"
#include "dragnn/runtime/math/types.h"
#include "tensorflow/core/platform/logging.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
// TODO(googleuser): Leaving this empty means that the definitions
// from arithmetic_common.h carry through. Provide template specializations
// that use architecture-specific intrinsics.
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
#endif // defined(__ARM_NEON) || defined(__ARM_NEON__)
#endif // DRAGNN_RUNTIME_MATH_ARITHMETIC_NEON_H_
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#ifndef DRAGNN_RUNTIME_MATH_ARITHMETIC_SSE_H_
#define DRAGNN_RUNTIME_MATH_ARITHMETIC_SSE_H_
#if defined(__SSE4_2__)
#include <stddef.h>
#include "dragnn/runtime/math/arithmetic_common.h"
#include "dragnn/runtime/math/types.h"
#include "tensorflow/core/platform/logging.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
// TODO(googleuser): Leaving this empty means that the definitions
// from arithmetic_common.h carry through. Provide template specializations
// that use architecture-specific intrinsics.
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
#endif // defined(__SSE4_2__)
#endif // DRAGNN_RUNTIME_MATH_ARITHMETIC_SSE_H_
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/runtime/math/arithmetic.h"
#include <stddef.h>
#include <vector>
#include "dragnn/runtime/math/types.h"
#include "dragnn/runtime/test/helpers.h"
#include "tensorflow/core/platform/test.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
namespace {
// Tests that ScaleElements() doesn't crash on empty vectors.
TEST(ScaleElementsTest, Empty) {
Vector<float> input;
MutableVector<float> output;
ScaleElements(1.5f, input, output);
}
// Tests that ScaleElements() copies scaled values from one vector to another.
TEST(ScaleElementsTest, Populated) {
UniqueVector<float> input({-2.0f, -3.0f, 5.0f});
UniqueVector<float> output({7.0f, 11.0f, 13.0f}); // gets overwritten
ScaleElements(1.5f, Vector<float>(*input), *output);
EXPECT_EQ((*output)[0], 1.5 * -2.0);
EXPECT_EQ((*output)[1], 1.5 * -3.0);
EXPECT_EQ((*output)[2], 1.5 * 5.0);
}
// Tests that AddScaledElements() doesn't crash on empty vectors.
TEST(AddScaledElementsTest, Empty) {
Vector<float> input;
MutableVector<float> output;
AddScaledElements(1.5f, input, output);
}
// Tests that AddScaledElements() adds scaled values from one vector to another.
TEST(AddScaledElementsTest, Populated) {
UniqueVector<float> input({-2.0f, -3.0f, 5.0f});
UniqueVector<float> output({7.0f, 11.0f, 13.0f}); // gets added to
AddScaledElements(1.5f, Vector<float>(*input), *output);
EXPECT_EQ((*output)[0], 1.5 * -2.0 + 7.0);
EXPECT_EQ((*output)[1], 1.5 * -3.0 + 11.0);
EXPECT_EQ((*output)[2], 1.5 * 5.0 + 13.0);
}
// Tests that MaxElements() doesn't crash on empty vectors.
TEST(MaxElementsTest, Empty) {
MutableVector<float> values;
MaxElements(1.5f, values);
}
// Tests that MaxElements() performs an in-place element-wise maximum.
TEST(MaxElementsTest, Populated) {
UniqueVector<float> values({-1.0f, 2.0f, 0.25f, -0.5f, 0.375f});
MaxElements(0.125f, *values);
EXPECT_EQ((*values)[0], 0.125);
EXPECT_EQ((*values)[1], 2.0);
EXPECT_EQ((*values)[2], 0.25);
EXPECT_EQ((*values)[3], 0.125);
EXPECT_EQ((*values)[4], 0.375);
}
// Tests that MultiplyMatrixAndVector() doesn't crash on empty inputs.
TEST(MultiplyMatrixAndVectorTest, Empty) {
Matrix<float> matrix;
Vector<float> input;
MutableVector<float> output;
MultiplyMatrixAndVector(matrix, input, output);
}
// Tests that MultiplyMatrixAndVector() computes a matrix-vector product.
TEST(MultiplyMatrixAndVectorTest, Populated) {
UniqueMatrix<float> matrix({{2.0f, 3.0f}, //
{5.0f, 7.0f}, //
{11.0f, 13.0f}});
UniqueVector<float> input({-0.5f, 2.0f});
UniqueVector<float> output({9.8f, 7.6f, 5.4f}); // gets overwritten
MultiplyMatrixAndVector(Matrix<float>(*matrix), Vector<float>(*input),
*output);
EXPECT_EQ((*output)[0], 2.0 * -0.5 + 3.0 * 2.0);
EXPECT_EQ((*output)[1], 5.0 * -0.5 + 7.0 * 2.0);
EXPECT_EQ((*output)[2], 11.0 * -0.5 + 13.0 * 2.0);
}
// Tests that MultiplyMatrixAndVectorWithBias() doesn't crash on empty inputs.
TEST(MultiplyMatrixAndVectorWithBiasTest, Empty) {
Matrix<float> matrix;
Vector<float> bias;
Vector<float> input;
MutableVector<float> output;
MultiplyMatrixAndVectorWithBias(matrix, bias, input, output);
}
// Tests that MultiplyMatrixAndVectorWithBias() computes a matrix-vector product
// with an additive bias.
TEST(MultiplyMatrixAndVectorWithBiasTest, Populated) {
UniqueMatrix<float> matrix({{2.0f, 3.0f}, //
{5.0f, 7.0f}, //
{11.0f, 13.0f}});
UniqueVector<float> bias({100.5f, 200.25f, 300.75f});
UniqueVector<float> input({-0.5f, 2.0f});
UniqueVector<float> output({9.8f, 7.6f, 5.4f}); // gets overwritten
MultiplyMatrixAndVectorWithBias(Matrix<float>(*matrix), Vector<float>(*bias),
Vector<float>(*input), *output);
EXPECT_EQ((*output)[0], 100.5 + 2.0 * -0.5 + 3.0 * 2.0);
EXPECT_EQ((*output)[1], 200.25 + 5.0 * -0.5 + 7.0 * 2.0);
EXPECT_EQ((*output)[2], 300.75 + 11.0 * -0.5 + 13.0 * 2.0);
}
// A dummy type for the specializations below. Specializing on this unique
// dummy type ensures we don't conflict with any existing specialization.
struct Foo {
float value;
};
} // namespace
// Dummy specializations for use in the subsequent tests.
template <>
void ScaleElements(Foo scale, Vector<Foo> input, MutableVector<Foo> output) {
for (Foo &foo : output) foo.value = 777.0;
}
namespace {
// Tests that the template specialization overrides the generic implementation.
TEST(ScaleElementsTest, OverriddenByTemplateSpecialization) {
// These values are uninitialized, but it doesn't matter because the
// specialization never looks at them.
UniqueVector<Foo> input(3);
UniqueVector<Foo> output(3);
ScaleElements(Foo(), Vector<Foo>(*input), *output);
EXPECT_EQ((*output)[0].value, 777.0);
EXPECT_EQ((*output)[1].value, 777.0);
EXPECT_EQ((*output)[2].value, 777.0);
}
} // namespace
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
// Contains logic for activation functions and more-complex elementwise
// vectorized operations.
//
// Uses operator overloading to express computation that looks like regular
// code. Currently, overloaded operators are scoped away in an "internal"
// namespace so they won't be accidentally used.
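//
// A minimal usage sketch, based on avx_activation_functions_test.cc (assumes
// that |input| and |output| point to at least kAvxWidth floats):
//
//   AvxFloatVecArray<1> array;  // one AVX register of kAvxWidth floats
//   array.Load(input);
//   array.Apply(activations::Sigmoid);
//   array.Store(output);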
#ifndef DRAGNN_RUNTIME_MATH_AVX_ACTIVATION_FUNCTIONS_H_
#define DRAGNN_RUNTIME_MATH_AVX_ACTIVATION_FUNCTIONS_H_
#if defined(__AVX2__)
#include <immintrin.h>
#endif
#include "dragnn/runtime/math/avx_vector_array.h"
#define DRAGNN_AVXAF_ATTRIBUTE_ALWAYS_INLINE __attribute__((always_inline))
#ifdef __clang__
#define DRAGNN_AVXAF_GCC_UNROLL
#else
#define DRAGNN_AVXAF_GCC_UNROLL __attribute__((optimize("unroll-loops")))
#endif
namespace syntaxnet {
namespace dragnn {
namespace runtime {
// Public API
namespace activations {
// Calculates elementwise exp(x).
inline AvxFloatVec DRAGNN_AVXAF_ATTRIBUTE_ALWAYS_INLINE DRAGNN_AVXAF_GCC_UNROLL
Exponential(AvxFloatVec x);
// Calculates elementwise sigmoid(x) = 1/(1+exp(-x)).
inline AvxFloatVec DRAGNN_AVXAF_ATTRIBUTE_ALWAYS_INLINE Sigmoid(AvxFloatVec x);
// Calculates elementwise tanh(x).
inline AvxFloatVec DRAGNN_AVXAF_ATTRIBUTE_ALWAYS_INLINE Tanh(AvxFloatVec x);
} // namespace activations
namespace activations {
// Calculates e^x by representing x = m * ln(2) + r, so that e^x = 2^m * e^r.
// A polynomial expansion approximates e^r, which stays accurate because |r| is
// at most ln(2)/2, and the result is then scaled by 2^m.
inline AvxFloatVec Exponential(AvxFloatVec x) {
// EDSL-like helpers for writing vectorized code.
auto Const = AvxFloatVec::Const;
constexpr float explo = -88.3762626647949f;
constexpr float exphi = 88.3762626647950f;
const float cephes_exp_factors[] = {
1.9875691500e-4f, 1.3981999507e-3f, 8.3334519073e-3f,
4.1665795894e-2f, 1.6666665459e-1f, 5.0000001201e-1f,
};
// Clamp the input, i.e., assume exp(-88) is close to zero and exp(88) is
// close to infinity.
x.Clamp(explo, exphi);
// Calculate `m = floor(x/ln(2) + 0.5)`.
constexpr float inv_log2e = 1.44269504088896341f;
AvxFloatVec m = Const(0.5f);
m += Const(inv_log2e) * x;
m.Floor();
// Calculate `r = x - m*ln(2)` (see function-level comment).
constexpr float neg_ln2 = -0.6931471805599453f;
AvxFloatVec r = x;
r += m * Const(neg_ln2);
// Calculate a polynomial expansion of y = exp(r).
AvxFloatVec r_squared(r * r);
AvxFloatVec y = Const(cephes_exp_factors[0]);
for (int i = 1; i < 6; ++i) {
y = y * r + Const(cephes_exp_factors[i]);
}
y = y * r_squared + r;
y += Const(1.0f);
// Calculate `emm0 = 2^m`. This is done by converting emm0 into an integer,
// and shifting it into the exponent bits of the desired floating-point
// result. Recall that the exponent is unsigned with 127 representing 2^0.
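// For example, m = 3 yields exponent bits 127 + 3 = 130, i.e., the float 8.0f.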
AvxFloatVec emm0 = m;
emm0 += Const(127.0f);
AvxIntVec emm0_i(emm0);
emm0_i.LeftShift(23);
// The final result is `2^m * exp(r)`.
return AvxFloatVec(emm0_i.ReinterpretCastFloat() * y);
}
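// Approximates tanh(x) with a rational function p(x) / q(x), where p is an odd
// polynomial (degree 13) and q is an even polynomial (degree 6), both
// evaluated via Horner's rule in x^2.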
inline AvxFloatVec Tanh(AvxFloatVec x) {
// EDSL-like helpers for writing vectorized code.
auto Const = AvxFloatVec::Const;
const float numerator_coefficients[] = {
-2.76076847742355e-16f, 2.00018790482477e-13f, -8.60467152213735e-11f,
5.12229709037114e-08f, 1.48572235717979e-05f, 6.37261928875436e-04f,
4.89352455891786e-03f,
};
const float denominator_coefficients[] = {
1.19825839466702e-06f,
1.18534705686654e-04f,
2.26843463243900e-03f,
4.89352518554385e-03f,
};
// Clamp the inputs to the range [-9, 9] since anything outside this range
// is +/-1.0 in single-precision.
x.Clamp(-9.0f, 9.0f);
// Compute x^2.
AvxFloatVec x_squared(x * x);
// Compute the numerator polynomial.
AvxFloatVec p = Const(numerator_coefficients[0]);
for (int i = 1; i < 7; ++i) {
// p = p * x^2 + numerator_coefficients_i
p = p * x_squared + Const(numerator_coefficients[i]);
}
// p = p * x
p = AvxFloatVec(p * x);
// Compute the denominator polynomial.
AvxFloatVec q = Const(denominator_coefficients[0]);
for (int i = 1; i < 4; ++i) {
// q = q * x^2 + denominator_coefficients_i
q = q * x_squared + Const(denominator_coefficients[i]);
}
// Divide the numerator by the denominator.
return p / q;
}
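// Computes sigmoid(x) via the identity sigmoid(x) = 0.5 * tanh(x / 2) + 0.5.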
inline AvxFloatVec Sigmoid(AvxFloatVec x) {
AvxFloatVec half = AvxFloatVec::Const(0.5);
return half * Tanh(AvxFloatVec(half * x)) + half;
}
} // namespace activations
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
#undef DRAGNN_AVXAF_ATTRIBUTE_ALWAYS_INLINE
#undef DRAGNN_AVXAF_GCC_UNROLL
#endif // DRAGNN_RUNTIME_MATH_AVX_ACTIVATION_FUNCTIONS_H_
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/runtime/math/avx_activation_functions.h"
#include <cmath>
#include <chrono>
#include "dragnn/runtime/test/helpers.h"
#include "syntaxnet/base.h"
#include "tensorflow/core/platform/test.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
namespace {
TEST(AvxActivationFunctionsTest, ExponentialTest) {
AvxVectorFuzzTest(
[](AvxFloatVec *vec) { *vec = activations::Exponential(*vec); },
[](float input_value, float actual) {
const float inverted = log(actual);
EXPECT_NEAR(input_value, inverted, 1e-6)
<< "exp(" << input_value << ") = " << actual
<< ", log(actual) = " << inverted;
});
}
TEST(AvxActivationFunctionsTest, SigmoidTest) {
AvxVectorFuzzTest( //
[](AvxFloatVec *vec) { *vec = activations::Sigmoid(*vec); },
[](float input_value, float actual) {
const float expected = 1.0f / (1.0f + exp(-input_value));
EXPECT_NEAR(actual, expected, 1e-6)
<< "sigmoid(" << input_value << ") = " << actual
<< ", expected = " << expected;
});
}
template <int batch_size, class Function>
void RunPerformanceTest(Function activation, int flops) {
constexpr uint64 kIterations = 1000000;
UniqueVector<float> input(batch_size);
UniqueVector<float> output(batch_size);
InitRandomVector(*input);
InitRandomVector(*output);
AvxFloatVecArray<batch_size / kAvxWidth> array;
auto start_time = std::chrono::system_clock::now();
for (int i = 0; i < kIterations; ++i) {
array.Load(input->data());
array.Apply(activation);
array.Store(output->data());
}
auto end_time = std::chrono::system_clock::now();
std::chrono::duration<double> elapsed_seconds = end_time - start_time;
double elapsed = elapsed_seconds.count();
double exp_ops = kIterations * batch_size;
double macro_gops = exp_ops / 1e9 / elapsed;
VLOG(0) << "For batch_size " << batch_size
<< " macro-GOPS (giga-ops per sec): " << macro_gops
<< ", raw arithmetic: " << flops * macro_gops;
}
TEST(AvxActivationFunctionsTest, SigmoidPerformanceTest) {
RunPerformanceTest<8>(activations::Sigmoid, 26);
RunPerformanceTest<16>(activations::Sigmoid, 26);
RunPerformanceTest<32>(activations::Sigmoid, 26);
RunPerformanceTest<48>(activations::Sigmoid, 26);
RunPerformanceTest<64>(activations::Sigmoid, 26);
RunPerformanceTest<128>(activations::Sigmoid, 26);
}
TEST(AvxActivationFunctionsTest, TanhTest) {
AvxVectorFuzzTest([](AvxFloatVec *vec) { *vec = activations::Tanh(*vec); },
[](float input_value, float actual) {
const float expected = tanh(input_value);
EXPECT_NEAR(actual, expected, 1e-6)
<< "tanh(" << input_value << ") = " << actual
<< ", expected = " << expected;
});
}
TEST(AvxActivationFunctionsTest, TanhPerformanceTest) {
RunPerformanceTest<8>(activations::Tanh, 23);
RunPerformanceTest<16>(activations::Tanh, 23);
RunPerformanceTest<32>(activations::Tanh, 23);
RunPerformanceTest<48>(activations::Tanh, 23);
RunPerformanceTest<64>(activations::Tanh, 23);
RunPerformanceTest<128>(activations::Tanh, 23);
}
} // namespace
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
// Wraps AVX vectors into convenient helper classes. This contains a class
// wrapping a single AVX register, AvxFloatVec, and a class to manipulate a
// batch of registers, AvxFloatVecArray. Use of the latter is recommended where
// applicable, since it will be unrolled into more vectorizable code.
#ifndef DRAGNN_RUNTIME_MATH_AVX_VECTOR_ARRAY_H_
#define DRAGNN_RUNTIME_MATH_AVX_VECTOR_ARRAY_H_
#include <cmath>
#if defined(__AVX__)
#include <immintrin.h>
#elif defined(__SSE4_2__)
#include <nmmintrin.h>
#endif
#include "dragnn/runtime/math/float16_types.h"
#define DRAGNN_AVXVA_ALWAYS_INLINE inline __attribute__((always_inline))
#ifdef __clang__
// Clang doesn't support __attribute__((optimize(...))).
#define DRAGNN_AVXVA_INLINED_UNROLLED inline __attribute__((always_inline))
#else
// Assume we're using GCC, which does.
#define DRAGNN_AVXVA_INLINED_UNROLLED \
inline __attribute__((always_inline)) \
__attribute__((optimize("unroll-loops")))
#endif
namespace syntaxnet {
namespace dragnn {
namespace runtime {
// Number of single-precision floating point numbers that fit into a single SSE
// / AVX2 register (which are 128 and 256 bits respectively).
constexpr int kSseWidth = 128 / 32; // = 4
constexpr int kAvxWidth = 256 / 32; // = 8
constexpr int kSseWidthHalfPrecision = 128 / 16; // = 8
constexpr int kAvxWidthHalfPrecision = 256 / 16; // = 16
class AvxFloatVec;
namespace internal {
// This struct should always be eliminated by the compiler; it only exists so we
// can write `foo += bar * baz`, and have that compiled into a single FMA
// operation.
struct AvxMultiplyExpr {
const AvxFloatVec &a;
const AvxFloatVec &b;
};
} // namespace internal
// Allows EDSL-like programming with AVX vectors.
inline internal::AvxMultiplyExpr operator*(const AvxFloatVec &a,
const AvxFloatVec &b);
inline AvxFloatVec operator+(const internal::AvxMultiplyExpr &expr,
const AvxFloatVec &v);
inline AvxFloatVec operator+(const AvxFloatVec &a, const AvxFloatVec &b);
inline AvxFloatVec operator/(const AvxFloatVec &a, const AvxFloatVec &b);
inline AvxFloatVec operator-(const AvxFloatVec &a, const AvxFloatVec &b);
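// For example, `acc += w * x` evaluates as a single fused multiply-add where
// available, whereas `AvxFloatVec y(w * x)` evaluates the bare product without
// an addition.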
// API over a single AVX vector (register). The implementation will either use
// a real AVX vector, or a fixed array of floats for compatibility.
//
// Note that we include the "inline" directive in declarations, not just
// definitions, because it is necessary for the "always_inline" directive.
struct AvxFloatVec {
public:
AvxFloatVec() {}
// Evaluates an AvxMultiplyExpr intermediary without adding anything. This is
// not an implicit cast, because typically when we write `a * b` we want to
// add it to something and use an FMA operation.
explicit AvxFloatVec(const internal::AvxMultiplyExpr &expr);
// Loads from an aligned region of memory.
inline void Load(const float *source);
// Loads a constant value.
inline void LoadConstVector(const float val);
// Stores to an aligned region of memory.
inline void Store(float *dst) const;
// Adds `a * b` to this value, using a fused multiply-add operation.
inline void AddProductOf(const AvxFloatVec &a, const AvxFloatVec &b);
// Element-wise floor.
inline void Floor();
// Element-wise clamps values between a min and max value.
inline void Clamp(const float min_value, const float max_value);
// Convenience method for more complex calculations.
static DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec Const(const float value) {
AvxFloatVec result;
result.LoadConstVector(value);
return result;
}
// Syntactic sugar for computing an FMA operation.
inline AvxFloatVec &operator+=(const internal::AvxMultiplyExpr &to_add);
// Adds another vector element-wise.
inline AvxFloatVec &operator+=(const AvxFloatVec &vec);
// Subtracts another vector element-wise.
inline AvxFloatVec &operator-=(const AvxFloatVec &vec);
// Divides another vector element-wise.
inline AvxFloatVec &operator/=(const AvxFloatVec &vec);
#if defined(__AVX__)
__m256 ymm;
#elif defined(__SSE4_2__)
__m128 xmm[2];
#else
float ymm[8];
#endif
};
// Small wrapper around integer AVX vectors, exposing only methods we need for
// implementing the activation functions.
//
// As above, `inline` is specified here for the always_inline directive.
class AvxIntVec {
public:
// Constructs an AVX integer vector, by converting floating-point values.
inline explicit AvxIntVec(const AvxFloatVec &v);
// Left-shifts integer values.
inline void LeftShift(int bits);
// Reinterprets the register as a floating-point register, for bitwise tricks.
inline AvxFloatVec ReinterpretCastFloat();
private:
// Underlying register.
#if defined(__AVX__)
__m256i ymm_;
#elif defined(__SSE4_2__)
__m128i xmm_[2];
#else
int ymm_[8];
#endif
};
// Implements the index permutation that is effectively applied by the
// _mm256_unpack instructions. This permutation is equivalent to swapping the
// 3rd and 4th bits. See the PermutationFunctionIsEqualToTable test for the
// effective permutation that this encodes.
//
// We haven't done performance testing, but this should be sufficiently fast
// for the compatibility routine. In its use below, the compiler will hopefully
// determine that it is being called with a constant (post-unrolling) and
// inline it.
DRAGNN_AVXVA_ALWAYS_INLINE int FastUnpackPermutation(int original_idx) {
// Bit in the 4th index if the 3rd and 4th bits should be swapped.
int should_swap = (original_idx + /* 0b0100 */ 4) & /* 0b1000 */ 8;
// If should_swap is zero, leaves original_idx untouched. Otherwise, does an
// xor with 0b1100, which will flip 10 to 01 and 01 to 10.
return (should_swap | (should_swap >> 1)) ^ original_idx;
}
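// For example, FastUnpackPermutation(4) == 8 and FastUnpackPermutation(8) == 4,
// while indices 0-3 and 12-15 map to themselves.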
// API over an array of AVX vectors (registers). The methods on this class are
// annotated such that the compiler should unroll them.
template <int N>
struct AvxFloatVecArray {
public:
DRAGNN_AVXVA_INLINED_UNROLLED void Load(const float *source) {
for (int i = 0; i < N; i++) {
vectors[i].Load(source + 8 * i);
}
}
DRAGNN_AVXVA_INLINED_UNROLLED void Load(const float *source, int max_idx) {
for (int i = 0; i < N; i++) {
if (i < max_idx) {
vectors[i].Load(source + 8 * i);
} else {
// When testing with a memory sanitizer, we make sure not to read
// uninitialized values. This is usually safe in normal operation
// because such results are never stored (via corresponding
// store-masking logic), but of course each algorithm must be tested to
// ensure correct operation.
//
// It is also worth pointing out that exceptional values (NaN, etc.) can
// slow down AVX/FMA floating point operations considerably. So we
// should investigate whether this is worth enabling in all cases (and
// forcing algorithms to provide a default).
#if defined(MEMORY_SANITIZER)
vectors[i].LoadConstVector(0);
#endif
}
}
}
// Reads and unpacks truncated half-precision values.
//
// Currently, only matrix coefficients use compressed/half-precision values,
// so it's not yet necessary to support max_idx masking (which will get a bit
// more complicated).
DRAGNN_AVXVA_INLINED_UNROLLED void Load(const TruncatedFloat16 *source);
#if defined(__F16C__)
// Reads and unpacks IEEE-754 half-precision values.
//
// Currently, only matrix coefficients use compressed/half-precision values,
// so it's not yet necessary to support max_idx masking (which will get a bit
// more complicated).
//
// TODO(googleuser): Either add non-F16C compatibility support from Eigen,
// or delete this code if it turns out not to be helpful.
DRAGNN_AVXVA_INLINED_UNROLLED void Load(const IeeeFloat16 *source);
#endif
DRAGNN_AVXVA_INLINED_UNROLLED void LoadConstVector(const float val) {
for (int i = 0; i < N; i++) {
vectors[i].LoadConstVector(val);
}
}
DRAGNN_AVXVA_INLINED_UNROLLED void Store(float *dst) {
for (int i = 0; i < N; i++) {
vectors[i].Store(dst + 8 * i);
}
}
DRAGNN_AVXVA_INLINED_UNROLLED void Store(float *dst, int max_idx) {
for (int i = 0; i < N; i++) {
// This is equivalent to writing `i < N && i < max_idx` above, but forces
// the compiler to produce more efficient code (it's still creating jump
// instructions, but the branching is probably more predictable, and the
// loops are unrolled). In the future we could switch to VMASKMOV if
// necessary.
if (i < max_idx) {
vectors[i].Store(dst + 8 * i);
}
}
}
template <class Function>
DRAGNN_AVXVA_INLINED_UNROLLED void Apply(const Function &fcn) {
for (int i = 0; i < N; i++) {
vectors[i] = fcn(vectors[i]);
}
}
AvxFloatVec vectors[N];
};
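// Example usage (a minimal sketch; |src| and |dst| are hypothetical 32-byte
// aligned buffers holding at least 16 floats):
//
//   AvxFloatVecArray<2> array;  // two AVX registers = 16 floats
//   array.Load(src);
//   array.Apply([](AvxFloatVec v) {
//     v += AvxFloatVec::Const(1.0f);  // element-wise add 1.0
//     return v;
//   });
//   array.Store(dst);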
// Implementation details.
#if defined(__AVX__)
DRAGNN_AVXVA_ALWAYS_INLINE
AvxFloatVec::AvxFloatVec(const internal::AvxMultiplyExpr &expr) {
ymm = _mm256_mul_ps(expr.a.ymm, expr.b.ymm);
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Load(const float *source) {
ymm = _mm256_load_ps(source);
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::LoadConstVector(const float val) {
ymm = _mm256_set1_ps(val);
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Store(float *dst) const {
_mm256_store_ps(dst, ymm);
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::AddProductOf(
const AvxFloatVec &a, const AvxFloatVec &b) {
#if defined(__AVX2__) && defined(__FMA__)
ymm = _mm256_fmadd_ps(a.ymm, b.ymm, ymm);
#else
*this += AvxFloatVec(a * b);
#endif
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Floor() {
ymm = _mm256_floor_ps(ymm);
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Clamp(const float min_value,
const float max_value) {
ymm = _mm256_min_ps(ymm, Const(max_value).ymm);
ymm = _mm256_max_ps(ymm, Const(min_value).ymm);
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator+=(
const AvxFloatVec &vec) {
ymm = _mm256_add_ps(vec.ymm, ymm);
return *this;
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator-=(
const AvxFloatVec &vec) {
ymm = _mm256_sub_ps(ymm, vec.ymm);
return *this;
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator/=(
const AvxFloatVec &vec) {
ymm = _mm256_div_ps(ymm, vec.ymm);
return *this;
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxIntVec::AvxIntVec(const AvxFloatVec &v)
: ymm_(_mm256_cvttps_epi32(v.ymm)) {}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxIntVec::LeftShift(int bits) {
#if defined(__AVX2__)
ymm_ = _mm256_slli_epi32(ymm_, bits);
#else
// Convert to SSE and back again. This is pretty slow, so don't use this code
// except for compatibility purposes.
__m256i upper_bits = _mm256_permute2f128_si256(ymm_, ymm_, 1);
__m128i first = _mm256_castsi256_si128(ymm_); // Lower bits as SSE
__m128i second = _mm256_castsi256_si128(upper_bits); // Upper bits as SSE
first = _mm_slli_epi32(first, bits);
second = _mm_slli_epi32(second, bits);
ymm_ = _mm256_permute2f128_si256(_mm256_castsi128_si256(first),
_mm256_castsi128_si256(second), (2 << 4));
#endif
}
AvxFloatVec DRAGNN_AVXVA_ALWAYS_INLINE AvxIntVec::ReinterpretCastFloat() {
AvxFloatVec result;
result.ymm = _mm256_castsi256_ps(ymm_);
return result;
}
template <int N>
DRAGNN_AVXVA_INLINED_UNROLLED void AvxFloatVecArray<N>::Load(
const TruncatedFloat16 *source) {
static_assert(N % 2 == 0,
"Load() from half floats requires even-sized vector arrays.");
for (int i = 0; i < N / 2; i++) {
#if defined(__AVX2__)
const __m256i input = _mm256_load_si256(
reinterpret_cast<__m256i const *>(source + kAvxWidthHalfPrecision * i));
vectors[2 * i].ymm = _mm256_castsi256_ps(
_mm256_unpacklo_epi16(_mm256_setzero_si256(), input));
vectors[2 * i + 1].ymm = _mm256_castsi256_ps(
_mm256_unpackhi_epi16(_mm256_setzero_si256(), input));
#else
// Compatibility AVX (not AVX2) implementation.
__m128i input[2];
input[0] = _mm_load_si128(
reinterpret_cast<__m128i const *>(source + kAvxWidthHalfPrecision * i));
input[1] = _mm_load_si128(reinterpret_cast<__m128i const *>(
source + kAvxWidthHalfPrecision * i + kSseWidthHalfPrecision));
// Unpack. This permutation is kinda cryptic and, to be honest, derived by
// simply trying many combinations.
vectors[2 * i].ymm = _mm256_insertf128_ps(
_mm256_castps128_ps256(_mm_castsi128_ps(
_mm_unpacklo_epi16(_mm_setzero_si128(), input[0]))),
_mm_castsi128_ps(_mm_unpacklo_epi16(_mm_setzero_si128(), input[1])), 1);
vectors[2 * i + 1].ymm = _mm256_insertf128_ps(
_mm256_castps128_ps256(_mm_castsi128_ps(
_mm_unpackhi_epi16(_mm_setzero_si128(), input[0]))),
_mm_castsi128_ps(_mm_unpackhi_epi16(_mm_setzero_si128(), input[1])), 1);
#endif
}
}
#if defined(__F16C__)
template <int N>
DRAGNN_AVXVA_INLINED_UNROLLED void AvxFloatVecArray<N>::Load(
const IeeeFloat16 *source) {
static_assert(N % 2 == 0,
"Load() from half floats requires even-sized vector arrays.");
for (int i = 0; i < N / 2; i++) {
// TODO(googleuser): Experiment with doing a single AVX2 load and
// dividing the result.
__m128i first_half = _mm_load_si128(
reinterpret_cast<__m128i const *>(source + kAvxWidthHalfPrecision * i));
__m128i second_half = _mm_load_si128(reinterpret_cast<__m128i const *>(
source + kAvxWidthHalfPrecision * i + kAvxWidth));
vectors[2 * i].ymm = _mm256_cvtph_ps(first_half);
vectors[2 * i + 1].ymm = _mm256_cvtph_ps(second_half);
}
}
#endif
#elif defined(__SSE4_2__)
DRAGNN_AVXVA_ALWAYS_INLINE
AvxFloatVec::AvxFloatVec(const internal::AvxMultiplyExpr &expr) {
for (int i = 0; i < 2; ++i) {
xmm[i] = _mm_mul_ps(expr.a.xmm[i], expr.b.xmm[i]);
}
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Load(const float *source) {
for (int i = 0; i < 2; ++i) {
xmm[i] = _mm_load_ps(&source[i * kSseWidth]);
}
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::LoadConstVector(const float val) {
xmm[1] = xmm[0] = _mm_set1_ps(val);
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Store(float *dst) const {
for (int i = 0; i < 2; ++i) {
_mm_store_ps(&dst[i * kSseWidth], xmm[i]);
}
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::AddProductOf(
const AvxFloatVec &a, const AvxFloatVec &b) {
*this += AvxFloatVec(a * b);
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Floor() {
for (int i = 0; i < 2; ++i) {
xmm[i] = _mm_floor_ps(xmm[i]);
}
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Clamp(const float min_value,
const float max_value) {
for (int i = 0; i < 2; ++i) {
xmm[i] = _mm_min_ps(xmm[i], Const(max_value).xmm[i]);
xmm[i] = _mm_max_ps(xmm[i], Const(min_value).xmm[i]);
}
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator+=(
const AvxFloatVec &vec) {
for (int i = 0; i < 2; ++i) {
xmm[i] = _mm_add_ps(vec.xmm[i], xmm[i]);
}
return *this;
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator-=(
const AvxFloatVec &vec) {
for (int i = 0; i < 2; ++i) {
xmm[i] = _mm_sub_ps(xmm[i], vec.xmm[i]);
}
return *this;
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator/=(
const AvxFloatVec &vec) {
for (int i = 0; i < 2; ++i) {
xmm[i] = _mm_div_ps(xmm[i], vec.xmm[i]);
}
return *this;
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxIntVec::AvxIntVec(const AvxFloatVec &v) {
xmm_[0] = _mm_cvttps_epi32(v.xmm[0]);
xmm_[1] = _mm_cvttps_epi32(v.xmm[1]);
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxIntVec::LeftShift(int bits) {
for (int i = 0; i < 2; ++i) {
xmm_[i] = _mm_slli_epi32(xmm_[i], bits);
}
}
AvxFloatVec DRAGNN_AVXVA_ALWAYS_INLINE AvxIntVec::ReinterpretCastFloat() {
AvxFloatVec result;
for (int i = 0; i < 2; ++i) {
result.xmm[i] = _mm_castsi128_ps(xmm_[i]);
}
return result;
}
template <int N>
DRAGNN_AVXVA_INLINED_UNROLLED void AvxFloatVecArray<N>::Load(
const TruncatedFloat16 *source) {
static_assert(N % 2 == 0,
"Load() from half floats requires even-sized vector arrays.");
for (int i = 0; i < N / 2; i++) {
__m128i input[2];
input[0] = _mm_load_si128(
reinterpret_cast<__m128i const *>(source + kAvxWidthHalfPrecision * i));
input[1] = _mm_load_si128(reinterpret_cast<__m128i const *>(
source + kAvxWidthHalfPrecision * i + kSseWidthHalfPrecision));
vectors[2 * i].xmm[0] =
_mm_castsi128_ps(_mm_unpacklo_epi16(_mm_setzero_si128(), input[0]));
vectors[2 * i + 1].xmm[0] =
_mm_castsi128_ps(_mm_unpackhi_epi16(_mm_setzero_si128(), input[0]));
vectors[2 * i].xmm[1] =
_mm_castsi128_ps(_mm_unpacklo_epi16(_mm_setzero_si128(), input[1]));
vectors[2 * i + 1].xmm[1] =
_mm_castsi128_ps(_mm_unpackhi_epi16(_mm_setzero_si128(), input[1]));
}
}
#if defined(__F16C__)
template <int N>
DRAGNN_AVXVA_INLINED_UNROLLED void AvxFloatVecArray<N>::Load(
const IeeeFloat16 *source) {
static_assert(N % 2 == 0,
"Load() from half floats requires even-sized vector arrays.");
for (int i = 0; i < N / 2; i++) {
__m128i first_half = _mm_load_si128(
reinterpret_cast<__m128i const *>(source + kAvxWidthHalfPrecision * i));
__m128i second_half = _mm_load_si128(reinterpret_cast<__m128i const *>(
source + kAvxWidthHalfPrecision * i + kAvxWidth));
vectors[2 * i].xmm[0] = _mm_cvtph_ps(first_half);
vectors[2 * i + 1].xmm[0] = _mm_cvtph_ps(second_half);
first_half = _mm_shuffle_epi32(first_half, _MM_SHUFFLE(0, 1, 3, 2));
second_half = _mm_shuffle_epi32(second_half, _MM_SHUFFLE(0, 1, 3, 2));
vectors[2 * i].xmm[1] = _mm_cvtph_ps(first_half);
vectors[2 * i + 1].xmm[1] = _mm_cvtph_ps(second_half);
}
}
#endif
#else
// Compatibility implementations. If you compile with -ftree-vectorize and
// -msse2 flags, you should still get decent performance (maybe 1/4 of the
// AVX/FMA version).
//
// See the class above for method documentation.
DRAGNN_AVXVA_ALWAYS_INLINE
AvxFloatVec::AvxFloatVec(const internal::AvxMultiplyExpr &expr) {
for (int i = 0; i < 8; i++) {
ymm[i] = expr.a.ymm[i] * expr.b.ymm[i];
}
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Load(const float *source) {
for (int i = 0; i < 8; i++) {
ymm[i] = source[i];
}
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::LoadConstVector(const float val) {
for (int i = 0; i < 8; i++) {
ymm[i] = val;
}
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Store(float *dst) const {
for (int i = 0; i < 8; i++) {
dst[i] = ymm[i];
}
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::AddProductOf(
const AvxFloatVec &a, const AvxFloatVec &b) {
for (int i = 0; i < 8; i++) {
ymm[i] += a.ymm[i] * b.ymm[i];
}
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Floor() {
for (int i = 0; i < 8; i++) {
ymm[i] = floor(ymm[i]);
}
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Clamp(const float min_value,
const float max_value) {
for (int i = 0; i < 8; i++) {
ymm[i] = fmin(fmax(ymm[i], min_value), max_value);
}
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator+=(
const AvxFloatVec &vec) {
for (int i = 0; i < 8; i++) {
ymm[i] += vec.ymm[i];
}
return *this;
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator-=(
const AvxFloatVec &vec) {
for (int i = 0; i < 8; i++) {
ymm[i] -= vec.ymm[i];
}
return *this;
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator/=(
const AvxFloatVec &vec) {
for (int i = 0; i < 8; i++) {
ymm[i] /= vec.ymm[i];
}
return *this;
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxIntVec::AvxIntVec(const AvxFloatVec &v) {
for (int i = 0; i < 8; i++) {
ymm_[i] = static_cast<int>(v.ymm[i]);
}
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxIntVec::LeftShift(int bits) {
for (int i = 0; i < 8; i++) {
ymm_[i] = ymm_[i] << bits;
}
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec AvxIntVec::ReinterpretCastFloat() {
AvxFloatVec result;
for (int i = 0; i < 8; i++) {
result.ymm[i] = reinterpret_cast<float &>(ymm_[i]);
}
return result;
}
template <int N>
DRAGNN_AVXVA_INLINED_UNROLLED void AvxFloatVecArray<N>::Load(
const TruncatedFloat16 *source) {
static_assert(N % 2 == 0,
"Load() from half floats requires even-sized vector arrays.");
// Iterate through mock AVX vectors, each composed of 16 half-floats.
for (int vec_idx = 0; vec_idx < N / 2; vec_idx++) {
// Making this code a bit more verbose, by reading in-order to a temporary
// array, results in faster performance. The compatibility version is still
// pretty slow though.
TruncatedFloat16 tmp[16];
for (int i = 0; i < kAvxWidthHalfPrecision; ++i) {
tmp[i] = source[i + kAvxWidthHalfPrecision * vec_idx];
}
float unpacked[16];
for (int i = 0; i < kAvxWidthHalfPrecision; ++i) {
unpacked[i] = tmp[i].DebugToFloat();
}
for (int i = 0; i < kAvxWidthHalfPrecision; ++i) {
int permuted = FastUnpackPermutation(i);
vectors[2 * vec_idx + (i / 8)].ymm[i % 8] = unpacked[permuted];
}
}
}
#if defined(__F16C__)
template <int N>
DRAGNN_AVXVA_INLINED_UNROLLED void AvxFloatVecArray<N>::Load(
const IeeeFloat16 *source) {
// Not actually required for the compatibility implementation, but it'd be
// rather non-uniform if this API succeeded, and then compilation failed when
// AVX2 was turned on.
static_assert(N % 2 == 0,
"Load() from half floats requires even-sized vector arrays.");
// Iterate element-wise over the mock AVX vectors, converting one half-float
// at a time.
for (int i = 0; i < N * kAvxWidth; ++i) {
vectors[i / 8].ymm[i % 8] = source[i].DebugToFloat();
}
}
#endif
#endif
// The following operations are mostly syntactic sugar, so they do not need
// architecture-specific implementations.
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator+=(
const internal::AvxMultiplyExpr &to_add) {
AddProductOf(to_add.a, to_add.b);
return *this;
}
DRAGNN_AVXVA_ALWAYS_INLINE internal::AvxMultiplyExpr operator*(
const AvxFloatVec &a, const AvxFloatVec &b) {
return internal::AvxMultiplyExpr{a, b};
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec
operator+(const internal::AvxMultiplyExpr &expr, const AvxFloatVec &v) {
AvxFloatVec result = v;
result += expr;
return result;
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec operator+(const AvxFloatVec &a,
const AvxFloatVec &b) {
AvxFloatVec result = a;
result += b;
return result;
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec operator/(const AvxFloatVec &a,
const AvxFloatVec &b) {
AvxFloatVec result = a;
result /= b;
return result;
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec operator-(const AvxFloatVec &a,
const AvxFloatVec &b) {
AvxFloatVec result = a;
result -= b;
return result;
}
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
#undef DRAGNN_AVXVA_ALWAYS_INLINE
#undef DRAGNN_AVXVA_INLINED_UNROLLED
#endif // DRAGNN_RUNTIME_MATH_AVX_VECTOR_ARRAY_H_
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/runtime/math/avx_vector_array.h"
#include <cmath>
#include "dragnn/runtime/test/helpers.h"
#include "tensorflow/core/platform/test.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
namespace {
TEST(AvxVectorTest, LoadAndStore) {
UniqueVector<float> input(kAvxWidth);
UniqueVector<float> output(kAvxWidth);
InitRandomVector(*input);
InitRandomVector(*output);
AvxFloatVec vec;
vec.Load(input->data());
vec.Store(output->data());
for (int i = 0; i < kAvxWidth; ++i) {
EXPECT_EQ((*input)[i], (*output)[i]);
}
}
// Test flooring with assignment, just to make the compiler not erase aliases.
TEST(AvxVectorTest, AssignmentAndFloor) {
UniqueVector<float> input(kAvxWidth);
UniqueVector<float> output(kAvxWidth);
UniqueVector<float> floored(kAvxWidth);
InitRandomVector(*input);
InitRandomVector(*output);
AvxFloatVec vec;
vec.Load(input->data());
AvxFloatVec vec2 = vec;
vec.Floor();
vec.Store(floored->data());
vec2.Store(output->data());
for (int i = 0; i < kAvxWidth; ++i) {
EXPECT_EQ((*input)[i], (*output)[i]);
EXPECT_EQ(floor((*input)[i]), (*floored)[i]);
}
}
TEST(AvxVectorTest, ClampTest) {
bool modified = false; // check that some value was clamped.
AvxVectorFuzzTest(
[](AvxFloatVec *vec) { vec->Clamp(-0.314f, 0.314f); },
[&modified](float input_value, float output_value) {
modified = modified || input_value < -0.314 || input_value > 0.314;
EXPECT_EQ(fmax(-0.314f, fmin(0.314f, input_value)), output_value);
});
EXPECT_TRUE(modified) << "No values fell outside test range for ClampTest().";
}
TEST(AvxVectorTest, LoadConstAndStore) {
UniqueVector<float> output(kAvxWidth);
InitRandomVector(*output);
AvxFloatVec vec;
vec.LoadConstVector(3.14f);
vec.Store(output->data());
for (int i = 0; i < kAvxWidth; ++i) {
EXPECT_EQ((*output)[i], 3.14f);
}
}
TEST(AvxVectorTest, AddTest) {
AvxVectorFuzzTest( //
[](AvxFloatVec *vec) { (*vec) += *vec; },
[](float input_value, float output_value) {
EXPECT_EQ(input_value * 2, output_value);
});
}
TEST(AvxVectorTest, SubtractTest) {
AvxVectorFuzzTest(
[](AvxFloatVec *vec) {
AvxFloatVec one;
one.LoadConstVector(1.0f);
(*vec) -= one;
},
[](float input_value, float output_value) {
EXPECT_EQ(input_value - 1.0f, output_value);
});
}
TEST(AvxVectorTest, DivideTest) {
AvxVectorFuzzTest(
[](AvxFloatVec *vec) {
AvxFloatVec result;
result.LoadConstVector(1.0f);
result /= *vec;
*vec = result;
},
[](float input_value, float output_value) {
EXPECT_EQ(1.0f / input_value, output_value);
});
}
// This is a really basic test; half of the purpose is to ensure that the float
// API is still OK (i.e. compiles) for odd-sized arrays. If you try to add a
// call to array.Load(TruncatedFloat16 *source), it should produce a compiler
// error.
TEST(AvxFloatVecArrayTest, SingletonArrayLoadsAndStores) {
AvxFloatVecArray<1> array;
UniqueVector<float> input(kAvxWidth);
UniqueVector<float> output(kAvxWidth);
InitRandomVector(*input);
InitRandomVector(*output);
array.Load(input->data());
array.Store(output->data());
for (int i = 0; i < kAvxWidth; ++i) {
EXPECT_EQ((*input)[i], (*output)[i]);
}
}
TEST(AvxFloatVecArrayTest, LoadTruncatedFloat16) {
AvxFloatVecArray<2> array;
UniqueVector<TruncatedFloat16> values(2 * kAvxWidth);
UniqueVector<float> decompressed(2 * kAvxWidth);
for (int i = 0; i < 2 * kAvxWidth; ++i) {
int permuted = FastUnpackPermutation(i);
(*values)[i] = TruncatedFloat16::DebugFromFloat(permuted / 10.0);
}
// Ensure that state persisted from other tests won't cause this test to
// erroneously pass.
array.LoadConstVector(-1.0f);
array.Load(values->data());
array.Store(decompressed->data());
for (int i = 0; i < 2 * kAvxWidth; ++i) {
ASSERT_NEAR((*decompressed)[i], i / 10.0, 0.01);
}
}
TEST(AvxFloatVecArrayTest, LoadIeeeFloat16) {
#if defined(__F16C__)
AvxFloatVecArray<2> array;
UniqueVector<IeeeFloat16> values(2 * kAvxWidth);
UniqueVector<float> decompressed(2 * kAvxWidth);
for (int i = 0; i < 2 * kAvxWidth; ++i) {
(*values)[i] = IeeeFloat16::DebugFromFloat(i / 10.0);
}
// Ensure that state persisted from other tests won't cause this test to
// erroneously pass.
array.LoadConstVector(-1.0f);
array.Load(values->data());
array.Store(decompressed->data());
for (int i = 0; i < 2 * kAvxWidth; ++i) {
ASSERT_NEAR((*decompressed)[i], i / 10.0, 0.01);
}
#else
LOG(INFO) << "Test binary wasn't compiled with F16C support, so skipping "
<< "this test.";
#endif
}
TEST(AvxFloatVecArrayTest, PermutationFunctionIsEqualToTable) {
std::vector<int> permutation = {0, 1, 2, 3, 8, 9, 10, 11,
4, 5, 6, 7, 12, 13, 14, 15};
for (int i = 0; i < kAvxWidthHalfPrecision; ++i) {
EXPECT_EQ(FastUnpackPermutation(i), permutation[i]);
}
}
} // namespace
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
// Copyright 2018 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
// Compatibility support for Eigen.
#ifndef DRAGNN_RUNTIME_MATH_EIGEN_H_
#define DRAGNN_RUNTIME_MATH_EIGEN_H_
#include "dragnn/runtime/alignment.h"
#include "dragnn/runtime/math/types.h"
#include "third_party/eigen3/Eigen/Core"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
namespace internal {
// Returns a combination of bit-options for Eigen matrices.
constexpr int GetEigenMatrixOptions() {
return Eigen::AutoAlign | Eigen::RowMajor;
}
// Returns a combination of bit-options for Eigen maps of runtime types.
constexpr int GetEigenMapOptions() {
static_assert(kAlignmentBytes >= EIGEN_MAX_ALIGN_BYTES,
"Runtime alignment is not compatible with Eigen alignment.");
return Eigen::Aligned;
}
// Eigen matrix and (row) vector types. Don't use these directly; instead use
// the public Map types and functions below to wrap runtime types.
template <class T>
using EigenVector =
Eigen::Matrix<T, 1, Eigen::Dynamic, GetEigenMatrixOptions()>;
template <class T>
using EigenMatrix =
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, GetEigenMatrixOptions()>;
// Eigen stride for matrix types.
using EigenMatrixStride = Eigen::Stride<Eigen::Dynamic, 1>;
// Returns the Eigen stride associated with the |matrix|.
template <class T>
EigenMatrixStride GetEigenMatrixStride(MatrixImpl<T> matrix) {
return EigenMatrixStride(matrix.row_stride(), 1);
}
} // namespace internal
// Eigen wrappers around a runtime-allocated matrix or (row) vector.
template <class T>
using EigenVectorMap =
Eigen::Map<const internal::EigenVector<T>, internal::GetEigenMapOptions()>;
template <class T>
using MutableEigenVectorMap =
Eigen::Map<internal::EigenVector<T>, internal::GetEigenMapOptions()>;
template <class T>
using EigenMatrixMap =
Eigen::Map<const internal::EigenMatrix<T>, internal::GetEigenMapOptions(),
internal::EigenMatrixStride>;
template <class T>
using MutableEigenMatrixMap =
Eigen::Map<internal::EigenMatrix<T>, internal::GetEigenMapOptions(),
internal::EigenMatrixStride>;
// Returns an Eigen wrapper around the |vector| or |matrix|.
template <class T>
EigenVectorMap<T> AsEigenMap(Vector<T> vector) {
return EigenVectorMap<T>(vector.data(), vector.size());
}
template <class T>
MutableEigenVectorMap<T> AsEigenMap(MutableVector<T> vector) {
return MutableEigenVectorMap<T>(vector.data(), vector.size());
}
template <class T>
EigenMatrixMap<T> AsEigenMap(Matrix<T> matrix) {
return EigenMatrixMap<T>(matrix.data(), matrix.num_rows(),
matrix.num_columns(),
internal::GetEigenMatrixStride(matrix));
}
template <class T>
MutableEigenMatrixMap<T> AsEigenMap(MutableMatrix<T> matrix) {
return MutableEigenMatrixMap<T>(matrix.data(), matrix.num_rows(),
matrix.num_columns(),
internal::GetEigenMatrixStride(matrix));
}
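// Example usage (a minimal sketch; |weights|, |input|, and |output| are
// hypothetical Matrix<float>, Vector<float>, and MutableVector<float> objects
// with compatible dimensions):
//
//   AsEigenMap(output).noalias() =
//       AsEigenMap(input) * AsEigenMap(weights).transpose();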
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
#endif // DRAGNN_RUNTIME_MATH_EIGEN_H_
// Copyright 2018 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/runtime/math/eigen.h"
#include <vector>
#include "dragnn/core/test/generic.h"
#include "dragnn/runtime/math/types.h"
#include "dragnn/runtime/test/helpers.h"
#include "tensorflow/core/platform/test.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
namespace {
// Expects that two pointers point to the same address.
void ExpectSameAddress(const void *ptr1, const void *ptr2) {
EXPECT_EQ(ptr1, ptr2);
}
// Expects that the |vector| has the |values|.
void ExpectValues(MutableVector<float> vector,
const std::vector<float> &values) {
ASSERT_EQ(vector.size(), values.size());
for (int i = 0; i < values.size(); ++i) {
EXPECT_EQ(vector[i], values[i]);
}
}
// Expects that the Eigen |matrix| has the |values|.
template <class EigenMatrix>
void ExpectValues(const EigenMatrix &matrix,
const std::vector<std::vector<float>> &values) {
ASSERT_EQ(matrix.rows(), values.size());
for (int row = 0; row < matrix.rows(); ++row) {
ASSERT_EQ(matrix.cols(), values[row].size());
for (int column = 0; column < matrix.cols(); ++column) {
EXPECT_EQ(matrix(row, column), values[row][column]);
}
}
}
// Tests that an Eigen vector map references the same memory as the underlying
// runtime vector.
TEST(EigenTest, Vector) {
UniqueVector<float> vector({1.0, 2.0, 3.0, 4.0});
EigenVectorMap<float> const_eigen_vector = AsEigenMap(Vector<float>(*vector));
ExpectSameAddress(const_eigen_vector.data(), vector->data());
ExpectValues(const_eigen_vector, {{1.0, 2.0, 3.0, 4.0}});
MutableEigenVectorMap<float> mutable_eigen_vector = AsEigenMap(*vector);
ExpectSameAddress(mutable_eigen_vector.data(), vector->data());
ExpectValues(mutable_eigen_vector, {{1.0, 2.0, 3.0, 4.0}});
// Write into the runtime vector and read from the other views.
(*vector)[0] = 10.0;
(*vector)[1] = 20.0;
(*vector)[2] = 30.0;
(*vector)[3] = 40.0;
ExpectValues(const_eigen_vector, {{10.0, 20.0, 30.0, 40.0}});
ExpectValues(mutable_eigen_vector, {{10.0, 20.0, 30.0, 40.0}});
// Write into the mutable Eigen vector and read from the other views.
mutable_eigen_vector << 100.0, 200.0, 300.0, 400.0;
ExpectValues(const_eigen_vector, {{100.0, 200.0, 300.0, 400.0}});
ExpectValues(*vector, {100.0, 200.0, 300.0, 400.0});
}
// Tests that an Eigen matrix map references the same memory as the underlying
// runtime matrix.
TEST(EigenTest, Matrix) {
UniqueMatrix<float> matrix({{1.0, 2.0, 3.0}, //
{4.0, 5.0, 6.0}, //
{7.0, 8.0, 9.0}});
EigenMatrixMap<float> const_eigen_matrix = AsEigenMap(Matrix<float>(*matrix));
ExpectSameAddress(const_eigen_matrix.data(), matrix->row(0).data());
ExpectValues(const_eigen_matrix, {{1.0, 2.0, 3.0}, //
{4.0, 5.0, 6.0}, //
{7.0, 8.0, 9.0}});
MutableEigenMatrixMap<float> mutable_eigen_matrix = AsEigenMap(*matrix);
ExpectSameAddress(mutable_eigen_matrix.data(), matrix->row(0).data());
ExpectValues(mutable_eigen_matrix, {{1.0, 2.0, 3.0}, //
{4.0, 5.0, 6.0}, //
{7.0, 8.0, 9.0}});
// Write into the runtime matrix and read from the other views.
matrix->row(0)[0] = 10.0;
matrix->row(0)[1] = 20.0;
matrix->row(0)[2] = 30.0;
matrix->row(1)[0] = 40.0;
matrix->row(1)[1] = 50.0;
matrix->row(1)[2] = 60.0;
matrix->row(2)[0] = 70.0;
matrix->row(2)[1] = 80.0;
matrix->row(2)[2] = 90.0;
ExpectValues(const_eigen_matrix, {{10.0, 20.0, 30.0}, //
{40.0, 50.0, 60.0}, //
{70.0, 80.0, 90.0}});
ExpectValues(mutable_eigen_matrix, {{10.0, 20.0, 30.0}, //
{40.0, 50.0, 60.0}, //
{70.0, 80.0, 90.0}});
// Write into the mutable Eigen matrix and read from the other views.
mutable_eigen_matrix << 100.0, 200.0, 300.0,
400.0, 500.0, 600.0,
700.0, 800.0, 900.0;
ExpectValues(const_eigen_matrix, {{100.0, 200.0, 300.0}, //
{400.0, 500.0, 600.0}, //
{700.0, 800.0, 900.0}});
ExpectValues(matrix->row(0), {100.0, 200.0, 300.0});
ExpectValues(matrix->row(1), {400.0, 500.0, 600.0});
ExpectValues(matrix->row(2), {700.0, 800.0, 900.0});
}
} // namespace
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
// Declares 16-bit floating point types.
#ifndef DRAGNN_RUNTIME_MATH_FLOAT16_TYPES_H_
#define DRAGNN_RUNTIME_MATH_FLOAT16_TYPES_H_
#if defined(__F16C__)
#include <emmintrin.h>
#endif
#include "syntaxnet/base.h"
#include "tensorflow/core/lib/core/casts.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
// Represents a truncated 16-bit floating point value. This corresponds to
// `bfloat16` in TensorFlow. It just chops the last 16 least-significant bits
// off the significand of a 32-bit floating point value, leaving 7 significand
// bits, 8 exponent bits, and 1 sign bit.
struct TruncatedFloat16 {
// Slow unpacking routine. Use avx_vector_array.h for normal operation.
float DebugToFloat() const {
uint32 upcast = bits;
upcast <<= 16;
return tensorflow::bit_cast<float>(upcast);
}
// Slow packing routine. Use avx_vector_array.h for normal operation.
static TruncatedFloat16 DebugFromFloat(float value) {
uint32 float_bits = tensorflow::bit_cast<uint32>(value);
return TruncatedFloat16{static_cast<uint16>(float_bits >> 16)};
}
uint16 bits;
};
static_assert(sizeof(TruncatedFloat16) == sizeof(uint16), "Bad struct size");
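// For example, 1.0f has bit pattern 0x3F800000, so its truncated form stores
// the upper half 0x3F80, and DebugToFloat() recovers exactly 1.0f.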
// Currently, only CPUs with the F16C instruction set are supported. All use of
// this struct should be flag-guarded.
//
// If this becomes a problem, we can implement this method with Eigen's
// CUDA/Half.h.
#if defined(__F16C__)
// Represents an IEEE-754 16-bit floating point value. This has 10 significand
// bits, 5 exponent bits, and 1 sign bit.
//
// TODO(googleuser): Either add compatibility support, or delete this code if
// it turns out not to be helpful.
struct IeeeFloat16 {
// Slow unpacking routine. Use avx_vector_array.h for normal operation.
float DebugToFloat() const { return _cvtsh_ss(bits); }
// Slow packing routine. Use avx_vector_array.h for normal operation.
static IeeeFloat16 DebugFromFloat(float value) {
return IeeeFloat16{_cvtss_sh(value, 0)};
}
uint16 bits;
};
static_assert(sizeof(IeeeFloat16) == sizeof(uint16), "Bad struct size");
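// For example, 1.0f packs to 0x3C00 (sign 0, biased exponent 15, mantissa 0),
// as exercised in the float16_types test below.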
#endif
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
#endif // DRAGNN_RUNTIME_MATH_FLOAT16_TYPES_H_
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/runtime/math/float16_types.h"
#include "tensorflow/core/platform/test.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
namespace {
// C++11 doesn't support binary literals like 0b01001, so add a helper. :(
uint16 ParseBinaryString(const string &bits) {
CHECK_EQ(bits.size(), 16) << "ParseBinaryString expects full 16-bit values";
uint16 value = 0;
for (const char bit : bits) {
CHECK(bit == '0' || bit == '1') << "String must be 0's and 1's.";
value = (value << 1) + (bit == '0' ? 0 : 1);
}
return value;
}
TEST(Float16TypesTest, IeeeFloat16Accuracy) {
#if defined(__F16C__)
bool some_not_exact = false;
for (int i = -100; i < 100; ++i) {
float value = i / 10.0f;
IeeeFloat16 half = IeeeFloat16::DebugFromFloat(value);
float unpacked = half.DebugToFloat();
EXPECT_NEAR(value, unpacked, 0.01);
some_not_exact = some_not_exact || (value != unpacked);
}
EXPECT_TRUE(some_not_exact);
#else
LOG(INFO) << "Test binary wasn't compiled with F16C support, so skipping "
<< "this test.";
#endif
}
TEST(Float16TypesTest, TruncatedAccuracy) {
bool some_not_exact = false;
for (int i = -100; i < 100; ++i) {
float value = i / 10.0f;
TruncatedFloat16 half = TruncatedFloat16::DebugFromFloat(value);
float unpacked = half.DebugToFloat();
EXPECT_NEAR(value, unpacked, 0.06);
some_not_exact = some_not_exact || (value != unpacked);
}
EXPECT_TRUE(some_not_exact);
}
TEST(Float16TypesTest, TruncatedKnownBinaryRepresentation) {
uint16 neg_1 = ParseBinaryString("1011111110000000");
uint16 one = ParseBinaryString("0011111110000000");
EXPECT_EQ((TruncatedFloat16{neg_1}).DebugToFloat(), -1.0f);
EXPECT_EQ((TruncatedFloat16{one}).DebugToFloat(), 1.0f);
}
TEST(Float16TypesTest, IeeeFloat16KnownBinaryRepresentation) {
#if defined(__F16C__)
uint16 neg_1 = ParseBinaryString("1011110000000000");
uint16 one = ParseBinaryString("0011110000000000");
EXPECT_EQ((IeeeFloat16{neg_1}).DebugToFloat(), -1.0f);
EXPECT_EQ((IeeeFloat16{one}).DebugToFloat(), 1.0f);
#else
LOG(INFO) << "Test binary wasn't compiled with F16C support, so skipping "
<< "this test.";
#endif
}
} // namespace
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
// Computes `[y_1, y_2, ...] = M * [v_1, v_2, ...] + [b_1, b_2, ...]`, where
//
// M is a `m x n` dense matrix.
// v_i are `n`-dimensional dense vectors.
// b_i and y_i are `m`-dimensional dense vectors.
//
// Unfortunately even larger (e.g. 128x128) matrix sizes are not sufficient to
// hide the latency of a function call. So the entire implementation needs to
// live in this header file. Please make sure to use all of the optimization
// flags mentioned in the BUILD file in any client libraries.
#ifndef DRAGNN_RUNTIME_MATH_SGEMVV_H_
#define DRAGNN_RUNTIME_MATH_SGEMVV_H_
#if defined(__SSE2__)
#include <xmmintrin.h>
#endif
#include "dragnn/runtime/math/avx_vector_array.h"
#include "dragnn/runtime/math/types.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
#define DRAGNN_SGEMVV_ATTRIBUTE_ALWAYS_INLINE __attribute__((always_inline))
#ifdef __clang__
#define DRAGNN_SGEMVV_GCC_UNROLL
#else
#define DRAGNN_SGEMVV_GCC_UNROLL __attribute__((optimize("unroll-loops")))
#endif
namespace syntaxnet {
namespace dragnn {
namespace runtime {
// Represents `v, b` from one operation `y = M * v + b`.
template <int num_ops>
struct SgemvInputBatch {
const float *input[num_ops];
const float *initial[num_ops];
};
template <int num_ops>
struct SgemvOutputBatch {
float *output[num_ops];
};
// Matrix argument for the SGEMV/SGEMVV operation. Based on row-batched
// column-major matrices, but pulls the batch size into a template argument
// so code can be compiled more efficiently.
template <int sse_batch_size, typename ElementType = float>
class SgemvMatrix final {
public:
// Convenience type alias.
using MatrixType =
BlockedMatrix<ElementType, BlockedMatrixFormat::kRowBlockedColumnMajor>;
// Creates an empty SgemvMatrix.
SgemvMatrix() = default;
// Initializes the new matrix. Returns an InvalidArgumentError if the block
// size of `matrix` is not equal to `sse_batch_size`.
::tensorflow::Status Initialize(const MatrixType &matrix);
// Computes the matrix-vector product with a set of other inputs. See
// top-level comment for the general algorithm.
template <int num_ops, int lookahead_1 = 8, int lookahead_2 = 8>
void DRAGNN_SGEMVV_ATTRIBUTE_ALWAYS_INLINE DRAGNN_SGEMVV_GCC_UNROLL
MatrixMultiVectorProduct(const SgemvInputBatch<num_ops> &inputs,
SgemvOutputBatch<num_ops> *outputs) const {
MatrixMultiVectorProductImpl<num_ops, /*mask_input_output=*/false,
/*read_initial=*/true, lookahead_1,
lookahead_2>(inputs, -1, outputs);
}
// Computes the matrix-vector product with a set of other inputs. See
// top-level comment for the general algorithm. This variant allows another
// parameter, `output_vector_elements`, to write to outputs which are a
// multiple of kAvxWidth (8 floats, or 32 bytes) but not necessarily
// sse_batch_size. It is slightly slower; the slowdown is small but probably
// more than measurement noise.
//
// |lookahead_1| and |lookahead_2| parameters control prefetching, and should
// usually be tuned via a script. They issue prefetch instructions that are
// `lookahead_1 * sse_batch_size` values ahead of the current matrix entry
// being read, if `lookahead_1 != 0` (and `(lookahead_1 + lookahead_2) *
// sse_batch_size` values, if lookahead_2 != 0). To reiterate, all prefetching
// can be disabled by setting |lookahead_1| to 0, or the second prefetch can
// be disabled by setting |lookahead_2| to 0.
template <int num_ops, int lookahead_1 = 8, int lookahead_2 = 8>
void DRAGNN_SGEMVV_ATTRIBUTE_ALWAYS_INLINE DRAGNN_SGEMVV_GCC_UNROLL
MaskedMatrixMultiVectorProduct(const SgemvInputBatch<num_ops> &inputs,
int output_vector_elements,
SgemvOutputBatch<num_ops> *outputs) const {
MatrixMultiVectorProductImpl<num_ops, /*mask_input_output=*/true,
/*read_initial=*/true, lookahead_1,
lookahead_2>(inputs, output_vector_elements,
outputs);
}
// Like the above, but assumes existing values are zero instead of reading
// them.
template <int num_ops>
void DRAGNN_SGEMVV_ATTRIBUTE_ALWAYS_INLINE DRAGNN_SGEMVV_GCC_UNROLL
MaskedMatrixMultiVectorProductNoInitial(
const SgemvInputBatch<num_ops> &inputs, int output_vector_elements,
SgemvOutputBatch<num_ops> *outputs) const {
MatrixMultiVectorProductImpl<num_ops, /*mask_input_output=*/true,
/*read_initial=*/false>(
inputs, output_vector_elements, outputs);
}
// Read-only accessor.
const MatrixType &matrix() const { return matrix_; }
private:
template <int num_ops, bool mask_input_output, bool read_initial,
int lookahead_1 = 8, int lookahead_2 = 8>
DRAGNN_SGEMVV_ATTRIBUTE_ALWAYS_INLINE DRAGNN_SGEMVV_GCC_UNROLL void
MatrixMultiVectorProductImpl(const SgemvInputBatch<num_ops> &inputs,
int output_vector_elements,
SgemvOutputBatch<num_ops> *outputs) const;
MatrixType matrix_;
};
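// Example usage (a minimal sketch; |weights| is a hypothetical
// BlockedMatrix<float, BlockedMatrixFormat::kRowBlockedColumnMajor> with
// block_size() == 48, and |input|, |bias|, and |output| are hypothetical
// aligned float buffers of compatible sizes):
//
//   SgemvMatrix<48> matrix;
//   TF_CHECK_OK(matrix.Initialize(weights));
//   SgemvInputBatch<1> inputs = {{input}, {bias}};
//   SgemvOutputBatch<1> outputs = {{output}};
//   matrix.MatrixMultiVectorProduct<1>(inputs, &outputs);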
// Implementation details.
template <int sse_batch_size, typename ElementType>
template <int num_ops, bool mask_input_output, bool read_initial,
int lookahead_1, int lookahead_2>
inline void DRAGNN_SGEMVV_ATTRIBUTE_ALWAYS_INLINE DRAGNN_SGEMVV_GCC_UNROLL
SgemvMatrix<sse_batch_size, ElementType>::MatrixMultiVectorProductImpl(
const SgemvInputBatch<num_ops> &inputs, int output_vector_elements,
SgemvOutputBatch<num_ops> *outputs) const {
static_assert(sse_batch_size % kAvxWidth == 0,
"sse_batch_size must be a multiple of kAvxWidth (8).");
if (mask_input_output) {
DCHECK_EQ(output_vector_elements % kAvxWidth, 0)
<< "output_vector_elements must be padded to alignment";
}
const ElementType *curr_matrix_ptr = matrix_.vector(0).data();
// Loop over blocks of output rows. Each block of output rows will get a
// partial sum of the [matrix-vector] dot product, where the range of that
// partial sum is designated by start_col and end_col.
for (int row_start = 0; row_start < matrix_.num_rows();
row_start += sse_batch_size) {
const int load_store_max_idx =
(output_vector_elements - row_start) / kAvxWidth;
AvxFloatVecArray<sse_batch_size / kAvxWidth> accumulators[num_ops];
// Read inputs.
for (int op = 0; op < num_ops; ++op) {
if (read_initial) {
if (mask_input_output) {
accumulators[op].Load(&inputs.initial[op][row_start],
load_store_max_idx);
} else {
accumulators[op].Load(&inputs.initial[op][row_start]);
}
} else {
accumulators[op].LoadConstVector(0.0f);
}
}
// Compute matrix-vector product.
for (int col = 0; col < matrix_.num_columns(); ++col) {
if (lookahead_1 != 0) {
#if defined(__SSE2__)
_mm_prefetch(curr_matrix_ptr + lookahead_1 * sse_batch_size,
_MM_HINT_T0);
if (lookahead_2 != 0) {
_mm_prefetch(
curr_matrix_ptr + (lookahead_1 + lookahead_2) * sse_batch_size,
_MM_HINT_T0);
}
#endif
}
// These are the coefficients from each vector at column `col` (just
// broadcast over the whole AVX array).
AvxFloatVec weights[num_ops];
for (int op = 0; op < num_ops; ++op) {
weights[op].LoadConstVector(inputs.input[op][col]);
}
// Loop over each AVX vector and add the current sub-product.
AvxFloatVecArray<sse_batch_size / kAvxWidth> matrix_block;
matrix_block.Load(curr_matrix_ptr);
curr_matrix_ptr += sse_batch_size;
for (int row_offset = 0; row_offset < sse_batch_size / kAvxWidth;
row_offset++) {
for (int op = 0; op < num_ops; ++op) {
accumulators[op].vectors[row_offset].AddProductOf(
weights[op], matrix_block.vectors[row_offset]);
}
}
}
// Save the results.
for (int op = 0; op < num_ops; ++op) {
if (mask_input_output) {
accumulators[op].Store(&outputs->output[op][row_start],
load_store_max_idx);
} else {
accumulators[op].Store(&outputs->output[op][row_start]);
}
}
}
}
template <int sse_batch_size, typename ElementType>
::tensorflow::Status SgemvMatrix<sse_batch_size, ElementType>::Initialize(
const SgemvMatrix<sse_batch_size, ElementType>::MatrixType &matrix) {
if (matrix.block_size() != sse_batch_size) {
return ::tensorflow::errors::InvalidArgument(
"Blocked matrix block_size (", matrix.block_size(),
") must be equal to sse_batch_size (", sse_batch_size, ")");
}
matrix_ = matrix;
return ::tensorflow::Status::OK();
}
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
#undef DRAGNN_SGEMVV_ATTRIBUTE_ALWAYS_INLINE
#undef DRAGNN_SGEMVV_GCC_UNROLL
#endif // DRAGNN_RUNTIME_MATH_SGEMVV_H_