Commit edea2b67 authored by Terry Koo

Remove runtime because reasons.

parent a4bb31d0
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/core/test/generic.h"
#include "dragnn/protos/runtime.pb.h"
#include "dragnn/runtime/flexible_matrix_kernel.h"
#include "dragnn/runtime/lstm_cell/cell_function.h"
#include "dragnn/runtime/network_unit.h"
#include "dragnn/runtime/test/network_test_base.h"
#include "dragnn/runtime/variable_store.h"
#include <gmock/gmock.h>
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/platform/test.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
namespace {
using ::testing::Invoke;
using ::testing::_;
class LstmNetworkTest : public NetworkTestBase {
protected:
// Adds a blocked weight matrix with the |name|, the given dimensions, and the
// |fill_value|. If |is_flexible_matrix| is true, the variable is set up for
// use by the FlexibleMatrixKernel.
void AddWeights(const string &name, size_t input_dim, size_t output_dim,
float fill_value, bool is_flexible_matrix = false) {
constexpr int kBatchSize = LstmCellFunction<>::kBatchSize;
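// Pad the output dimension up to a multiple of the cell batch size and derive
// the number of column blocks in the blocked matrix. For example, assuming
// kBatchSize is 48 (as the "blocked48" suffix below suggests), output_dim = 30
// pads to 48 and yields input_dim blocks.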
size_t output_padded =
kBatchSize * ((output_dim + kBatchSize - 1) / kBatchSize);
size_t num_views = (output_padded / kBatchSize) * input_dim;
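// Build the variable name; e.g., in the non-flexible case, weights named
// "x_to_ico" become "test_component/x_to_ico/matrix/blocked48".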
string var_name = tensorflow::strings::StrCat(
kTestComponentName, "/", name,
is_flexible_matrix ? FlexibleMatrixKernel::kSuffix
: "/matrix/blocked48");
const std::vector<float> block(kBatchSize, fill_value);
const std::vector<std::vector<float>> blocks(num_views, block);
variable_store_.AddOrDie(
var_name, blocks, VariableSpec::FORMAT_COLUMN_BLOCKED_ROW_MAJOR_MATRIX);
variable_store_.SetBlockedDimensionOverride(
var_name, {input_dim, output_padded, kBatchSize});
}
// Adds a bias vector with the |name|, the given |dimension|, and the
// |fill_value|.
void AddBiases(const string &name, size_t dimension, float fill_value) {
const string biases_name =
tensorflow::strings::StrCat(kTestComponentName, "/", name);
AddVectorVariable(biases_name, dimension, fill_value);
}
// Creates a network unit, initializes it based on the |component_spec_text|,
// and evaluates it. On error, returns non-OK.
tensorflow::Status Run(const string &component_spec_text) {
ComponentSpec component_spec;
CHECK(TextFormat::ParseFromString(component_spec_text, &component_spec));
component_spec.set_name(kTestComponentName);
// Since LSTMNetwork uses the concatenated input, it is insensitive
// to the particular fixed or linked embedding inputs. For simplicity, the
// tests use a trivial network structure and a single fixed embedding.
AddComponent(kTestComponentName);
TF_RETURN_IF_ERROR(
NetworkUnit::CreateOrError("LSTMNetwork", &network_unit_));
TF_RETURN_IF_ERROR(network_unit_->Initialize(
component_spec, &variable_store_, &network_state_manager_,
&extension_manager_));
network_states_.Reset(&network_state_manager_);
StartComponent(1); // only evaluate the first step
session_state_.extensions.Reset(&extension_manager_);
TF_RETURN_IF_ERROR(
network_unit_->Evaluate(0, &session_state_, &compute_session_));
return tensorflow::Status::OK();
}
// Returns the activation vector of the first step of the layer named
// |layer_name| in the current component.
Vector<float> GetActivations(const string &layer_name) const {
Matrix<float> layer(GetLayer(kTestComponentName, layer_name));
return layer.row(0);
}
std::unique_ptr<NetworkUnit> network_unit_;
};
// Tests that the LSTMNetwork does not produce logits when omit_logits is
// true, even if there are actions.
TEST_F(LstmNetworkTest, NoLogitsOrSoftmaxWhenOmitLogitsTrue) {
constexpr size_t input_dim = 32;
constexpr int kHiddenDim = LstmCellFunction<>::kBatchSize;
const string kSpec = R"(fixed_feature {
vocabulary_size: 50
embedding_dim: 32
size: 1
}
network_unit {
parameters {
key: 'hidden_layer_sizes'
value: '48'
}
parameters {
key: 'omit_logits'
value: 'true'
}
}
num_actions: 10)";
const float kEmbedding = 1.25;
const float kFeature = 0.5;
const float kWeight = 1.5;
AddFixedEmbeddingMatrix(0, 50, input_dim, kEmbedding);
// No "softmax" weights or biases.
AddWeights("x_to_ico", input_dim, 3 * kHiddenDim, kWeight);
AddWeights("h_to_ico", kHiddenDim, 3 * kHiddenDim, kWeight);
AddWeights("c2i", kHiddenDim, kHiddenDim, kWeight);
AddWeights("c2o", kHiddenDim, kHiddenDim, kWeight);
AddBiases("ico_bias", 3 * kHiddenDim, kWeight);
EXPECT_CALL(compute_session_, GetInputFeatures(_, _, _, _, _))
.WillOnce(Invoke(ExtractFeatures(0, {{1, kFeature}})));
TF_EXPECT_OK(Run(kSpec));
// No specified logits layer.
EXPECT_TRUE(network_unit_->GetLogitsName().empty());
// No "logits" layer.
size_t unused_dimension = 0;
LayerHandle<float> unused_handle;
EXPECT_THAT(
network_state_manager_.LookupLayer(kTestComponentName, "logits",
&unused_dimension, &unused_handle),
test::IsErrorWithSubstr(
"Unknown layer 'logits' in component 'test_component'"));
}
TEST_F(LstmNetworkTest, NormalOperationSmallHidden) {
constexpr size_t input_dim = 32;
constexpr int kHiddenDim = 8;
constexpr int num_actions = 10;
const string kSpec = R"(fixed_feature {
vocabulary_size: 50
embedding_dim: 32
size: 1
}
network_unit {
parameters {
key: 'hidden_layer_sizes'
value: '8'
}
}
num_actions: 10)";
const float kEmbedding = 1.25;
const float kFeature = 0.5;
const float kWeight = 1.5;
AddFixedEmbeddingMatrix(0, 50, input_dim, kEmbedding);
// Same as above, with "softmax" weights and biases.
AddWeights("x_to_ico", input_dim, 3 * kHiddenDim, kWeight);
AddWeights("h_to_ico", kHiddenDim, 3 * kHiddenDim, kWeight);
AddWeights("c2i", kHiddenDim, kHiddenDim, kWeight);
AddWeights("c2o", kHiddenDim, kHiddenDim, kWeight);
AddWeights("weights_softmax", kHiddenDim, num_actions, kWeight,
/*is_flexible_matrix=*/true);
AddBiases("ico_bias", 3 * kHiddenDim, kWeight);
AddBiases("bias_softmax", num_actions, kWeight);
EXPECT_CALL(compute_session_, GetInputFeatures(_, _, _, _, _))
.WillOnce(Invoke(ExtractFeatures(0, {{1, kFeature}})));
TF_EXPECT_OK(Run(kSpec));
// Logits should exist.
EXPECT_EQ(network_unit_->GetLogitsName(), "logits");
// The logits dimension matches "num_actions" above. We don't test the values
// very precisely here, so feel free to update them if the cell function
// changes. Most value tests should be in lstm_cell/cell_function_test.cc.
Vector<float> logits = GetActivations("logits");
EXPECT_EQ(logits.size(), num_actions);
EXPECT_NEAR(logits[0], 10.6391, 0.1);
for (int i = 1; i < 10; ++i) {
EXPECT_EQ(logits[i], logits[0])
<< "With uniform weights, all logits should be equal.";
}
}
TEST_F(LstmNetworkTest, ErrorWithTooSmallHidden) {
constexpr size_t input_dim = 32;
constexpr int kHiddenDim = 4;
const string kSpec = R"(fixed_feature {
vocabulary_size: 50
embedding_dim: 32
size: 1
}
network_unit {
parameters {
key: 'hidden_layer_sizes'
value: '4'
}
}
num_actions: 0)";
const float kEmbedding = 1.25;
const float kWeight = 1.5;
AddFixedEmbeddingMatrix(0, 50, input_dim, kEmbedding);
// Same as above, with "softmax" weights and biases.
AddWeights("x_to_ico", input_dim, 3 * kHiddenDim, kWeight);
AddWeights("h_to_ico", kHiddenDim, 3 * kHiddenDim, kWeight);
AddWeights("c2i", kHiddenDim, kHiddenDim, kWeight);
AddWeights("c2o", kHiddenDim, kHiddenDim, kWeight);
AddBiases("ico_bias", 3 * kHiddenDim, kWeight);
EXPECT_THAT(
Run(kSpec),
test::IsErrorWithSubstr(
"Expected hidden size (4) to be a multiple of the AVX width (8)"));
}
} // namespace
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/runtime/master.h"
#include <utility>
#include <vector>
#include "dragnn/protos/runtime.pb.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/gtl/cleanup.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
namespace {
constexpr int kMaxBeamSize = 1;
// Combines, using MergeFrom(), each step trace in the |source| with the
// corresponding step of the |target|. If |source| has more steps, then
// |target| is extended to match.
void MergeTraces(const ComponentTrace &source, ComponentTrace *target) {
while (target->step_trace_size() < source.step_trace_size()) {
target->add_step_trace();
}
for (int i = 0; i < source.step_trace_size(); ++i) {
target->mutable_step_trace(i)->MergeFrom(source.step_trace(i));
}
}
// Combines, using MergeTraces(), each component trace in the |source| with the
// corresponding component of the |target|. If |source| has more components,
// then |target| is extended to match.
void MergeTraces(const MasterTrace &source, MasterTrace *target) {
while (target->component_trace_size() < source.component_trace_size()) {
target->add_component_trace();
}
for (int i = 0; i < source.component_trace_size(); ++i) {
MergeTraces(source.component_trace(i), target->mutable_component_trace(i));
}
}
} // namespace
tensorflow::Status Master::Initialize(
const MasterSpec &master_spec,
std::unique_ptr<VariableStore> variable_store) {
if (variable_store_ != nullptr) {
return tensorflow::errors::FailedPrecondition("Can't initialize twice");
}
if (variable_store == nullptr) {
return tensorflow::errors::InvalidArgument("No VariableStore");
}
variable_store_ = std::move(variable_store);
const auto &master_performance_settings = master_spec.GetExtension(
MasterPerformanceSettings::master_spec_extension);
session_state_pool_.reset(new SessionStatePool(
master_performance_settings.session_state_pool_max_free_states()));
components_.reserve(master_spec.component_size());
for (const ComponentSpec &component_spec : master_spec.component()) {
const auto &component_performance_settings = component_spec.GetExtension(
ComponentPerformanceSettings::component_spec_extension);
components_.emplace_back();
ComponentConfig &component = components_.back();
component.name = component_spec.name();
component.pre_allocate_num_steps =
component_performance_settings.pre_allocate_num_steps();
TF_RETURN_IF_ERROR(
network_state_manager_.AddComponent(component_spec.name()));
const string component_type =
GetNormalizedComponentBuilderName(component_spec);
TF_RETURN_IF_ERROR(
Component::CreateOrError(component_type, &component.instance));
TF_RETURN_IF_ERROR(component.instance->Initialize(
component_spec, variable_store_.get(), &network_state_manager_,
&extension_manager_));
}
return variable_store_->Close();
}
tensorflow::Status Master::Evaluate(ComputeSession *compute_session,
MasterTrace *master_trace) const {
if (variable_store_ == nullptr) {
return tensorflow::errors::FailedPrecondition("Not initialized");
}
if (compute_session == nullptr) {
return tensorflow::errors::InvalidArgument("No ComputeSession");
}
if (master_trace != nullptr) {
master_trace->Clear();
compute_session->SetTracing(true);
}
const auto ensure_tracing_disabled = tensorflow::gtl::MakeCleanup([=] {
if (master_trace != nullptr) compute_session->SetTracing(false);
});
const ScopedSessionState session_state(session_state_pool_.get());
session_state->network_states.Reset(&network_state_manager_);
session_state->extensions.Reset(&extension_manager_);
for (const ComponentConfig &component : components_) {
// TODO(googleuser): Generically trace all layers?
ComponentTrace *component_trace = nullptr;
if (master_trace != nullptr) {
component_trace = master_trace->add_component_trace();
component_trace->set_name(component.name);
}
compute_session->InitializeComponentData(component.name, kMaxBeamSize);
TF_RETURN_IF_ERROR(session_state->network_states.StartNextComponent(
component.pre_allocate_num_steps));
TF_RETURN_IF_ERROR(component.instance->Evaluate(
session_state.get(), compute_session, component_trace));
compute_session->FinalizeData(component.name);
}
if (master_trace != nullptr) {
// Use only the first trace from the compute session.
const std::vector<MasterTrace> traces = compute_session->GetTraceProtos();
if (!traces.empty()) MergeTraces(traces[0], master_trace);
}
return tensorflow::Status::OK();
}
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#ifndef DRAGNN_RUNTIME_MASTER_H_
#define DRAGNN_RUNTIME_MASTER_H_
#include <memory>
#include <string>
#include <vector>
#include "dragnn/core/compute_session.h"
#include "dragnn/protos/spec.pb.h"
#include "dragnn/protos/trace.pb.h"
#include "dragnn/runtime/component.h"
#include "dragnn/runtime/extensions.h"
#include "dragnn/runtime/network_states.h"
#include "dragnn/runtime/session_state.h"
#include "dragnn/runtime/session_state_pool.h"
#include "dragnn/runtime/variable_store.h"
#include "syntaxnet/base.h"
#include "tensorflow/core/lib/core/status.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
// A DRAGNN master, which evaluates a series of components.
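//
// A minimal usage sketch (assumes an already-populated ComputeSession and a
// concrete VariableStore implementation; error handling is elided):
//
//   Master master;
//   TF_CHECK_OK(master.Initialize(master_spec, std::move(variable_store)));
//   MasterTrace master_trace;
//   TF_CHECK_OK(master.Evaluate(&compute_session, &master_trace));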
class Master {
public:
// Creates an uninitialized master. Call Initialize() before use.
Master() = default;
// Initializes the components in this based on the |master_spec|, which may
// have performance tuning settings attached (see runtime.proto). Retrieves
// pre-trained variables from the |variable_store|, which must not be closed.
// On error, returns non-OK.
tensorflow::Status Initialize(const MasterSpec &master_spec,
std::unique_ptr<VariableStore> variable_store);
// Evaluates the pipeline of components on the |compute_session|, which must
// be based on the same MasterSpec as this and populated with input data. If
// |master_trace| is non-null, overwrites it with extracted traces. On error,
// returns non-OK.
tensorflow::Status Evaluate(ComputeSession *compute_session,
MasterTrace *master_trace) const;
private:
// A Component with some associated configuration.
struct ComponentConfig {
// Name of the component.
string name;
// Number of steps to pre-allocate operands for the component.
size_t pre_allocate_num_steps = 0;
// Component instance to initialize and evaluate.
std::unique_ptr<Component> instance;
};
// Store of pre-trained variables used by the |components_|. Must be declared
// before the |components_| to ensure it outlives them.
std::unique_ptr<VariableStore> variable_store_;
// Manager for the network states in the |components_|.
NetworkStateManager network_state_manager_;
// Manager for SessionState extensions.
ExtensionManager extension_manager_;
// Ordered list of components to evaluate.
std::vector<ComponentConfig> components_;
// Pool of session states used when evaluating the |components_|. This must
// be destroyed before the |components_|, in case there are state extensions
// that depend on the |components_|. Declaring this after the |components_|
// ensures the proper destructor ordering.
std::unique_ptr<SessionStatePool> session_state_pool_;
};
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
#endif // DRAGNN_RUNTIME_MASTER_H_
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/runtime/master.h"
#include <stddef.h>
#include <algorithm>
#include <memory>
#include <string>
#include <vector>
#include "dragnn/core/test/generic.h"
#include "dragnn/core/test/mock_compute_session.h"
#include "dragnn/protos/spec.pb.h"
#include "dragnn/protos/trace.pb.h"
#include "dragnn/runtime/alignment.h"
#include "dragnn/runtime/component.h"
#include "dragnn/runtime/extensions.h"
#include "dragnn/runtime/network_states.h"
#include "dragnn/runtime/session_state.h"
#include "dragnn/runtime/test/fake_variable_store.h"
#include "dragnn/runtime/variable_store.h"
#include "syntaxnet/base.h"
#include <gmock/gmock.h>
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/test.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
namespace {
using ::testing::_;
using ::testing::InSequence;
using ::testing::Invoke;
using ::testing::Return;
// Number of steps to take in each component.
constexpr size_t kNumSteps = 123;
// Outputs a layer of all 1s.
class Ones : public Component {
public:
// Implements Component.
tensorflow::Status Initialize(const ComponentSpec &component_spec,
VariableStore *variable_store,
NetworkStateManager *network_state_manager,
ExtensionManager *extension_manager) override {
return network_state_manager->AddLayer("ones", 1, &output_handle_);
}
tensorflow::Status Evaluate(SessionState *session_state,
ComputeSession *compute_session,
ComponentTrace *component_trace) const override {
NetworkStates *network_states = &session_state->network_states;
for (size_t step = 0; step < kNumSteps; ++step) {
network_states->AddStep();
network_states->GetLayer(output_handle_).row(step)[0] = 1.0;
}
return tensorflow::Status::OK();
}
bool Supports(const ComponentSpec &spec,
const string &normalized_builder_name) const override {
return normalized_builder_name == "Ones";
}
bool PreferredTo(const Component &other) const override { return false; }
private:
// Handle to the output layer.
LayerHandle<float> output_handle_;
};
DRAGNN_RUNTIME_REGISTER_COMPONENT(Ones);
// Extends its input layer with the step-wise cumulative sum of the final entry
// in each row of the input. E.g.,
//   [[0, 1],      [[0, 1, 1 (= 1)],
//    [2, 3],  =>   [2, 3, 4 (= 1 + 3)],
//    [4, 5]]       [4, 5, 9 (= 1 + 3 + 5)]]
class ExtendWithCumulativeSum : public Component {
public:
// Implements Component.
tensorflow::Status Initialize(const ComponentSpec &component_spec,
VariableStore *variable_store,
NetworkStateManager *network_state_manager,
ExtensionManager *extension_manager) override {
// NB: In a real Component implementation, linked embeddings are accessed
// using the LinkedEmbeddingManager and LinkedEmbeddings. Here, we set up
// the link manually because it's simple and makes the test self-contained.
CHECK_EQ(component_spec.linked_feature_size(), 1);
const LinkedFeatureChannel &link = component_spec.linked_feature(0);
size_t dimension = 0;
TF_RETURN_IF_ERROR(network_state_manager->LookupLayer(
link.source_component(), link.source_layer(), &dimension,
&input_handle_));
CHECK_GT(dimension, 0);
return network_state_manager->AddLayer("sums", dimension + 1,
&output_handle_);
}
tensorflow::Status Evaluate(SessionState *session_state,
ComputeSession *compute_session,
ComponentTrace *component_trace) const override {
NetworkStates *network_states = &session_state->network_states;
float sum = 0.0;
for (size_t step = 0; step < kNumSteps; ++step) {
network_states->AddStep();
const Vector<float> inputs(
network_states->GetLayer(input_handle_).row(step));
const MutableVector<float> outputs(
network_states->GetLayer(output_handle_).row(step));
CHECK_EQ(outputs.size(), inputs.size() + 1);
sum += inputs[inputs.size() - 1];
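// Copy the input row into the first |inputs.size()| output columns; std::copy
// returns an iterator one past the copied range, so this assignment writes the
// running sum into the extra final column.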
*std::copy(inputs.begin(), inputs.end(), outputs.begin()) = sum;
}
return tensorflow::Status::OK();
}
bool Supports(const ComponentSpec &spec,
const string &normalized_builder_name) const override {
return normalized_builder_name == "ExtendWithCumulativeSum";
}
bool PreferredTo(const Component &other) const override { return false; }
private:
// Handles to the input and output layers.
LayerHandle<float> input_handle_;
LayerHandle<float> output_handle_;
};
DRAGNN_RUNTIME_REGISTER_COMPONENT(ExtendWithCumulativeSum);
// Makes predictions using its inputs.
class MakePredictions : public Component {
public:
// Implements Component.
tensorflow::Status Initialize(const ComponentSpec &component_spec,
VariableStore *variable_store,
NetworkStateManager *network_state_manager,
ExtensionManager *extension_manager) override {
name_ = component_spec.name();
CHECK_EQ(component_spec.linked_feature_size(), 1);
const LinkedFeatureChannel &link = component_spec.linked_feature(0);
size_t dimension = 0;
return network_state_manager->LookupLayer(link.source_component(),
link.source_layer(), &dimension,
&input_handle_);
}
tensorflow::Status Evaluate(SessionState *session_state,
ComputeSession *compute_session,
ComponentTrace *component_trace) const override {
NetworkStates *network_states = &session_state->network_states;
Matrix<float> inputs(network_states->GetLayer(input_handle_));
for (size_t step = 0; step < kNumSteps; ++step) {
const Vector<float> logits = inputs.row(step);
if (!compute_session->AdvanceFromPrediction(name_, logits.data(), 1,
logits.size())) {
return tensorflow::errors::Internal(
"Error in ComputeSession::AdvanceFromPrediction() at step ", step);
}
}
return tensorflow::Status::OK();
}
bool Supports(const ComponentSpec &spec,
const string &normalized_builder_name) const override {
return normalized_builder_name == "MakePredictions";
}
bool PreferredTo(const Component &other) const override { return false; }
private:
// Name of this component.
string name_;
// Handle to the input layer, which is treated as prediction logits.
LayerHandle<float> input_handle_;
};
DRAGNN_RUNTIME_REGISTER_COMPONENT(MakePredictions);
// Component whose Evaluate() always fails.
class AlwaysFails : public Component {
public:
// Implements Component.
tensorflow::Status Initialize(const ComponentSpec &component_spec,
VariableStore *variable_store,
NetworkStateManager *network_state_manager,
ExtensionManager *extension_manager) override {
return tensorflow::Status::OK();
}
tensorflow::Status Evaluate(SessionState *session_state,
ComputeSession *compute_session,
ComponentTrace *component_trace) const override {
return tensorflow::errors::Internal("I always fail!");
}
bool Supports(const ComponentSpec &spec,
const string &normalized_builder_name) const override {
return normalized_builder_name == "AlwaysFails";
}
bool PreferredTo(const Component &other) const override { return false; }
};
DRAGNN_RUNTIME_REGISTER_COMPONENT(AlwaysFails);
class MasterTest : public ::testing::Test {
protected:
// Returns a new VariableStore.
static std::unique_ptr<VariableStore> NewVariableStore() {
// None of the tests or components look at the pre-trained variables, so
// return an empty store.
return std::unique_ptr<VariableStore>(new FakeVariableStore());
}
// Initializes and runs the |master_| using the text-format MasterSpec in
// |master_spec_text|. The |master_trace| is overwritten with traces, if
// specified. If |expect_success| is false, then EXPECT_CALLs that assume
// success are disabled. On error, returns non-OK.
tensorflow::Status TryRun(const string &master_spec_text, bool expect_success,
MasterTrace *master_trace = nullptr) {
MasterSpec master_spec;
CHECK(TextFormat::ParseFromString(master_spec_text, &master_spec));
TF_RETURN_IF_ERROR(master_.Initialize(master_spec, NewVariableStore()));
{ // Add call expectations for initializing each component, in order.
InSequence ordered_calls;
for (const ComponentSpec &component_spec : master_spec.component()) {
EXPECT_CALL(compute_session_,
InitializeComponentData(component_spec.name(), 1))
.Times(1);
}
}
// If applicable, add call expectations for making "predictions" in the
// final component that capture the prediction logits for inspection.
if (master_spec.component_size() > 0 && expect_success) {
const string &last_component_name =
master_spec.component(master_spec.component_size() - 1).name();
EXPECT_CALL(compute_session_,
AdvanceFromPrediction(last_component_name, _, 1, _))
.Times(kNumSteps)
.WillRepeatedly(
Invoke([this](const string &, const float *data, int, int size) {
logits_.emplace_back(data, data + size);
return true;
}));
}
// Add call expectations for finalizing data in all components.
if (expect_success) {
for (const ComponentSpec &component_spec : master_spec.component()) {
EXPECT_CALL(compute_session_, FinalizeData(component_spec.name()))
.Times(1);
}
}
return master_.Evaluate(&compute_session_, master_trace);
}
// As above, but asserts that all operations succeed.
void Run(const string &master_spec_text,
MasterTrace *master_trace = nullptr) {
TF_ASSERT_OK(
TryRun(master_spec_text, /*expect_success=*/true, master_trace));
}
::testing::StrictMock<MockComputeSession> compute_session_;
std::vector<std::vector<float>> logits_;
Master master_;
};
// Tests that Master cannot be initialized multiple times.
TEST_F(MasterTest, InitializeTwice) {
TF_ASSERT_OK(master_.Initialize(MasterSpec(), NewVariableStore()));
EXPECT_THAT(master_.Initialize(MasterSpec(), NewVariableStore()),
test::IsErrorWithSubstr("Can't initialize twice"));
}
// Tests that Master requires a variable store.
TEST_F(MasterTest, NoVariableStore) {
EXPECT_THAT(master_.Initialize(MasterSpec(), nullptr),
test::IsErrorWithSubstr("No VariableStore"));
}
// Tests that Master must be initialized before evaluation.
TEST_F(MasterTest, EvaluateWithoutInitializing) {
EXPECT_THAT(master_.Evaluate(&compute_session_, nullptr),
test::IsErrorWithSubstr("Not initialized"));
}
// Tests that Master requires a compute session.
TEST_F(MasterTest, NoComputeSession) {
TF_ASSERT_OK(master_.Initialize(MasterSpec(), NewVariableStore()));
EXPECT_THAT(master_.Evaluate(nullptr, nullptr),
test::IsErrorWithSubstr("No ComputeSession"));
}
// Tests that Master works with an empty spec and does nothing (StrictMock would
// raise an error if any methods on the ComputeSession were called).
TEST_F(MasterTest, EmptySpec) {
Run("");
EXPECT_TRUE(logits_.empty());
}
// Tests that Master can run a simple pipeline that generates ones.
TEST_F(MasterTest, Ones) {
Run(R"(component {
name: 'component1'
component_builder {
registered_name: 'Ones'
}
}
component {
name: 'component2'
component_builder {
registered_name: 'MakePredictions'
}
linked_feature {
source_component: 'component1'
source_layer: 'ones'
}
})");
EXPECT_EQ(logits_.size(), kNumSteps);
const std::vector<float> expected_row = {1.0};
for (const auto &row : logits_) EXPECT_EQ(row, expected_row);
}
// Tests that Master can run a pipeline with a cumulative summation.
TEST_F(MasterTest, SingleSummation) {
Run(R"(component {
name: 'component1'
component_builder {
registered_name: 'Ones'
}
}
component {
name: 'component2'
component_builder {
registered_name: 'ExtendWithCumulativeSum'
}
linked_feature {
source_component: 'component1'
source_layer: 'ones'
}
}
component {
name: 'component3'
component_builder {
registered_name: 'MakePredictions'
}
linked_feature {
source_component: 'component2'
source_layer: 'sums'
}
})");
EXPECT_EQ(logits_.size(), kNumSteps);
float sum = 0.0;
for (const auto &row : logits_) {
++sum;
const std::vector<float> expected_row = {1.0, sum};
EXPECT_EQ(row, expected_row);
}
}
// Tests that Master can run a pipeline with multiple summations.
TEST_F(MasterTest, MultiSummation) {
Run(R"(component {
name: 'component1'
component_builder {
registered_name: 'Ones'
}
}
component {
name: 'component2'
component_builder {
registered_name: 'ExtendWithCumulativeSum'
}
linked_feature {
source_component: 'component1'
source_layer: 'ones'
}
}
component {
name: 'component3'
component_builder {
registered_name: 'ExtendWithCumulativeSum'
}
linked_feature {
source_component: 'component2'
source_layer: 'sums'
}
}
component {
name: 'component4'
component_builder {
registered_name: 'ExtendWithCumulativeSum'
}
linked_feature {
source_component: 'component3'
source_layer: 'sums'
}
}
component {
name: 'component5'
component_builder {
registered_name: 'MakePredictions'
}
linked_feature {
source_component: 'component4'
source_layer: 'sums'
}
})");
EXPECT_EQ(logits_.size(), kNumSteps);
float sum1 = 0.0, sum2 = 0.0, sum3 = 0.0;
for (const auto &row : logits_) {
sum3 += sum2 += ++sum1;
const std::vector<float> expected_row = {1.0, sum1, sum2, sum3};
EXPECT_EQ(row, expected_row);
}
}
// Tests that Master can run a pipeline with tracing.
TEST_F(MasterTest, SingleSummationWithTracing) {
{ // Expect to enable and then disable tracing, in that order.
InSequence ordered_calls;
EXPECT_CALL(compute_session_, SetTracing(true));
EXPECT_CALL(compute_session_, SetTracing(false));
}
// Build a set of traces for the compute session to return.
std::vector<MasterTrace> traces(1);
traces.back().add_component_trace()->add_step_trace()->set_caption("A");
traces.back().add_component_trace()->add_step_trace()->set_caption("B");
traces.back().add_component_trace()->add_step_trace()->set_caption("C");
traces.back().add_component_trace()->add_step_trace()->set_caption("D");
EXPECT_CALL(compute_session_, GetTraceProtos()).WillOnce(Return(traces));
MasterTrace master_trace;
Run(R"(component {
name: 'component1'
component_builder {
registered_name: 'Ones'
}
}
component {
name: 'component2'
component_builder {
registered_name: 'ExtendWithCumulativeSum'
}
linked_feature {
source_component: 'component1'
source_layer: 'ones'
}
}
component {
name: 'component3'
component_builder {
registered_name: 'MakePredictions'
}
linked_feature {
source_component: 'component2'
source_layer: 'sums'
}
})",
&master_trace);
const string kExpectedTraceText = R"(
component_trace { name: 'component1' step_trace { caption: 'A' } }
component_trace { name: 'component2' step_trace { caption: 'B' } }
component_trace { name: 'component3' step_trace { caption: 'C' } }
component_trace { step_trace { caption: 'D' } }
)";
MasterTrace expected_trace;
ASSERT_TRUE(TextFormat::ParseFromString(kExpectedTraceText, &expected_trace));
EXPECT_THAT(master_trace, test::EqualsProto(expected_trace));
}
// Tests that Master disables tracing even on error.
TEST_F(MasterTest, DisablesTracingOnFailure) {
{ // Expect to enable and then disable tracing, in that order.
InSequence ordered_calls;
EXPECT_CALL(compute_session_, SetTracing(true));
EXPECT_CALL(compute_session_, SetTracing(false));
}
const string kMasterSpec = R"(component {
name: 'component1'
component_builder {
registered_name: 'AlwaysFails'
}
})";
MasterTrace master_trace;
EXPECT_THAT(TryRun(kMasterSpec, /*expect_success=*/false, &master_trace),
test::IsErrorWithSubstr("I always fail!"));
const string kExpectedTraceText = "component_trace { name: 'component1' }";
MasterTrace expected_trace;
ASSERT_TRUE(TextFormat::ParseFromString(kExpectedTraceText, &expected_trace));
EXPECT_THAT(master_trace, test::EqualsProto(expected_trace));
}
} // namespace
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
package(
default_visibility = ["//visibility:public"],
)
load(
"@org_tensorflow//tensorflow:tensorflow.bzl",
"if_linux_x86_64",
)
load(
"//dragnn/runtime:multiarch.bzl",
"dragnn_cc_multiarch_test",
)
FAST_MATH_COPTS = if_linux_x86_64([
"-O3",
"-msse4.2",
"-ffast-math",
"-ftree-vectorize",
])
cc_library(
name = "avx_vector_array",
hdrs = ["avx_vector_array.h"],
deps = [":float16_types"],
)
cc_test(
name = "avx_vector_array_test",
srcs = ["avx_vector_array_test.cc"],
deps = [
":avx_vector_array",
"//dragnn/runtime/test:helpers",
"@org_tensorflow//tensorflow/core:test",
],
)
cc_library(
name = "avx_activation_functions",
hdrs = ["avx_activation_functions.h"],
deps = [
":avx_vector_array",
],
)
dragnn_cc_multiarch_test(
name = "avx_activation_functions_test",
srcs = ["avx_activation_functions_test.cc"],
copts = FAST_MATH_COPTS,
deps = [
":avx_activation_functions",
"//dragnn/runtime/test:helpers",
"//syntaxnet:base",
"@org_tensorflow//tensorflow/core:test",
],
)
cc_library(
name = "float16_types",
hdrs = ["float16_types.h"],
deps = [
"//syntaxnet:base",
"@org_tensorflow//tensorflow/core:lib",
],
)
cc_test(
name = "float16_types_test",
srcs = ["float16_types_test.cc"],
deps = [
":float16_types",
"//syntaxnet:test_main",
"@org_tensorflow//tensorflow/core:test",
],
)
cc_library(
name = "sgemvv",
hdrs = ["sgemvv.h"],
deps = [
":avx_vector_array",
":types",
"@org_tensorflow//tensorflow/core:lib",
],
)
cc_test(
name = "sgemvv_test",
srcs = ["sgemvv_test.cc"],
copts = [
"-O3",
"-mavx2",
"-mfma",
],
tags = [
"manual",
],
deps = [
":arithmetic",
":sgemvv",
":transformations",
":types",
"//dragnn/core/test:generic",
"//dragnn/runtime/test:helpers",
"@org_tensorflow//tensorflow/core:lib",
"@org_tensorflow//tensorflow/core:test",
],
)
cc_test(
name = "sgemvv_compatibility_test",
srcs = ["sgemvv_test.cc"],
copts = [
"-O3",
"-ftree-vectorize",
"-ffast-math",
],
deps = [
":arithmetic",
":sgemvv",
":transformations",
":types",
"//dragnn/core/test:generic",
"//dragnn/runtime/test:helpers",
"@org_tensorflow//tensorflow/core:lib",
"@org_tensorflow//tensorflow/core:test",
],
)
cc_library(
name = "transformations",
hdrs = ["transformations.h"],
deps = [
":types",
"@org_tensorflow//tensorflow/core:lib",
],
)
cc_test(
name = "transformations_test",
srcs = ["transformations_test.cc"],
deps = [
":transformations",
"//dragnn/runtime/test:helpers",
"@org_tensorflow//tensorflow/core:lib",
"@org_tensorflow//tensorflow/core:test",
],
)
cc_library(
name = "types",
hdrs = ["types.h"],
deps = [
"//dragnn/runtime:alignment",
"@org_tensorflow//tensorflow/core:lib",
],
)
cc_test(
name = "types_test",
size = "small",
srcs = ["types_test.cc"],
deps = [
":types",
"//dragnn/core/test:generic",
"//dragnn/runtime:alignment",
"@org_tensorflow//tensorflow/core:lib",
"@org_tensorflow//tensorflow/core:test",
],
)
cc_library(
name = "eigen",
hdrs = ["eigen.h"],
deps = [
":types",
"//dragnn/runtime:alignment",
"@org_tensorflow//third_party/eigen3",
],
)
cc_test(
name = "eigen_test",
size = "small",
srcs = ["eigen_test.cc"],
deps = [
":eigen",
":types",
"//dragnn/core/test:generic",
"//dragnn/runtime/test:helpers",
"@org_tensorflow//tensorflow/core:test",
],
)
cc_library(
name = "arithmetic",
srcs = [
"arithmetic_avx.h",
"arithmetic_common.h",
"arithmetic_neon.h",
"arithmetic_sse.h",
],
hdrs = ["arithmetic.h"],
deps = [
":types",
"@org_tensorflow//tensorflow/core:lib",
],
)
cc_test(
name = "arithmetic_test",
size = "small",
srcs = ["arithmetic_test.cc"],
deps = [
":arithmetic",
":types",
"//dragnn/runtime/test:helpers",
"//syntaxnet:test_main",
"@org_tensorflow//tensorflow/core:lib",
"@org_tensorflow//tensorflow/core:test",
],
)
cc_test(
name = "arithmetic_avx_test",
size = "small",
srcs = ["arithmetic_test.cc"],
copts = [
"-mavx2",
"-mfma",
],
tags = [
"manual",
],
deps = [
":arithmetic",
":types",
"//dragnn/runtime/test:helpers",
"//syntaxnet:test_main",
"@org_tensorflow//tensorflow/core:lib",
"@org_tensorflow//tensorflow/core:test",
],
)
cc_test(
name = "arithmetic_sse_test",
size = "small",
srcs = ["arithmetic_test.cc"],
copts = ["-msse4.2"],
deps = [
":arithmetic",
":types",
"//dragnn/runtime/test:helpers",
"//syntaxnet:test_main",
"@org_tensorflow//tensorflow/core:lib",
"@org_tensorflow//tensorflow/core:test",
],
)
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
// Top-level organizational header for arithmetic operations. Users should
// include this instead of directly including the sub-headers below. See
// arithmetic_common.h for function declarations and comments.
//
// NB: If you wish to use an architecture-specific implementation, make sure to
// add the relevant copts to the cc_library whose .cc file includes this header.
#ifndef DRAGNN_RUNTIME_MATH_ARITHMETIC_H_
#define DRAGNN_RUNTIME_MATH_ARITHMETIC_H_
// Select an architecture-specific implementation, if possible, or fall back to
// the trivial generic implementations. The order of the clauses is important:
// in cases where architectures may overlap the newer version should be checked
// first (e.g., AVX before SSE).
#if defined(__AVX2__)
#include "dragnn/runtime/math/arithmetic_avx.h"
#elif defined(__SSE4_2__)
#include "dragnn/runtime/math/arithmetic_sse.h"
#elif defined(__ARM_NEON) || defined(__ARM_NEON__)
#include "dragnn/runtime/math/arithmetic_neon.h"
#else // no architecture-specific implementation
#include "dragnn/runtime/math/arithmetic_common.h"
#endif
#endif // DRAGNN_RUNTIME_MATH_ARITHMETIC_H_
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#ifndef DRAGNN_RUNTIME_MATH_ARITHMETIC_AVX_H_
#define DRAGNN_RUNTIME_MATH_ARITHMETIC_AVX_H_
#if defined(__AVX2__)
#include <stddef.h>
#include "dragnn/runtime/math/arithmetic_common.h"
#include "dragnn/runtime/math/types.h"
#include "tensorflow/core/platform/logging.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
// TODO(googleuser): Leaving this empty means that the definitions
// from arithmetic_common.h carry through. Provide template specializations
// that use architecture-specific intrinsics.
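//
// A specialization might look like the following sketch (illustrative only; it
// assumes AVX2 intrinsics from <immintrin.h> and <algorithm> for the scalar
// tail):
//
//   template <>
//   inline void MaxElements(float minimum, MutableVector<float> values) {
//     const __m256 min_vec = _mm256_set1_ps(minimum);
//     size_t i = 0;
//     for (; i + 8 <= values.size(); i += 8) {
//       const __m256 v = _mm256_loadu_ps(&values[i]);
//       _mm256_storeu_ps(&values[i], _mm256_max_ps(v, min_vec));
//     }
//     for (; i < values.size(); ++i) values[i] = std::max(minimum, values[i]);
//   }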
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
#endif // defined(__AVX2__)
#endif // DRAGNN_RUNTIME_MATH_ARITHMETIC_AVX_H_
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
// Declarations of arithmetic operations and trivial generic implementations.
// Architecture-specific implementations should include this header and define
// template specializations that override the generic implementations.
#ifndef DRAGNN_RUNTIME_MATH_ARITHMETIC_COMMON_H_
#define DRAGNN_RUNTIME_MATH_ARITHMETIC_COMMON_H_
#include <stddef.h>
#include <algorithm>
#include "dragnn/runtime/math/types.h"
#include "tensorflow/core/platform/logging.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
// Performs output = scale * input. Dimensions must match.
template <class T>
void ScaleElements(T scale, Vector<T> input, MutableVector<T> output);
// Performs output += scale * input. Dimensions must match.
template <class T>
void AddScaledElements(T scale, Vector<T> input, MutableVector<T> output);
// Performs values = max(minimum, values) in place.
template <class T>
void MaxElements(T minimum, MutableVector<T> values);
// Performs output = matrix * input. All vectors are interpreted as column
// vectors. Dimensions must match.
template <class T>
void MultiplyMatrixAndVector(Matrix<T> matrix, Vector<T> input,
MutableVector<T> output);
// Performs output = bias + matrix * input. All vectors are interpreted as
// column vectors. Dimensions must match.
template <class T>
void MultiplyMatrixAndVectorWithBias(Matrix<T> matrix, Vector<T> bias,
Vector<T> input, MutableVector<T> output);
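//
// For example, given matrix = [[2, 3], [5, 7]], input = [x, y], and bias =
// [b0, b1], MultiplyMatrixAndVector() writes [2x + 3y, 5x + 7y] into output,
// and MultiplyMatrixAndVectorWithBias() writes [b0 + 2x + 3y, b1 + 5x + 7y].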
// Implementation details below.
template <class T>
void ScaleElements(T scale, Vector<T> input, MutableVector<T> output) {
DCHECK_EQ(input.size(), output.size());
for (size_t i = 0; i < input.size(); ++i) output[i] = scale * input[i];
}
template <class T>
void AddScaledElements(T scale, Vector<T> input, MutableVector<T> output) {
DCHECK_EQ(input.size(), output.size());
for (size_t i = 0; i < input.size(); ++i) output[i] += scale * input[i];
}
template <class T>
void MaxElements(T minimum, MutableVector<T> values) {
for (T &value : values) value = std::max(minimum, value);
}
namespace internal {
// Like MultiplyMatrixAndVectorWithBias(), but if |ignore_bias| is true, then
// the |bias| is treated as zero and its dimensions are not checked.
template <bool ignore_bias, class T>
void MultiplyMatrixAndVectorImpl(Matrix<T> matrix, Vector<T> bias,
Vector<T> input, MutableVector<T> output) {
DCHECK_EQ(matrix.num_columns(), input.size());
if (!ignore_bias) DCHECK_EQ(matrix.num_rows(), bias.size());
DCHECK_EQ(matrix.num_rows(), output.size());
for (size_t i = 0; i < matrix.num_rows(); ++i) {
const Vector<T> row = matrix.row(i);
DCHECK_EQ(row.size(), input.size());
T sum = ignore_bias ? T() : bias[i];
for (size_t j = 0; j < row.size(); ++j) sum += row[j] * input[j];
output[i] = sum;
}
}
} // namespace internal
template <class T>
void MultiplyMatrixAndVector(Matrix<T> matrix, Vector<T> input,
MutableVector<T> output) {
internal::MultiplyMatrixAndVectorImpl<true>(matrix, {}, input, output);
}
template <class T>
void MultiplyMatrixAndVectorWithBias(Matrix<T> matrix, Vector<T> bias,
Vector<T> input, MutableVector<T> output) {
internal::MultiplyMatrixAndVectorImpl<false>(matrix, bias, input, output);
}
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
#endif // DRAGNN_RUNTIME_MATH_ARITHMETIC_COMMON_H_
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#ifndef DRAGNN_RUNTIME_MATH_ARITHMETIC_NEON_H_
#define DRAGNN_RUNTIME_MATH_ARITHMETIC_NEON_H_
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <stddef.h>
#include "dragnn/runtime/math/arithmetic_common.h"
#include "dragnn/runtime/math/types.h"
#include "tensorflow/core/platform/logging.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
// TODO(googleuser): Leaving this empty means that the definitions
// from arithmetic_common.h carry through. Provide template specializations
// that use architecture-specific intrinsics.
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
#endif // defined(__ARM_NEON) || defined(__ARM_NEON__)
#endif // DRAGNN_RUNTIME_MATH_ARITHMETIC_NEON_H_
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#ifndef DRAGNN_RUNTIME_MATH_ARITHMETIC_SSE_H_
#define DRAGNN_RUNTIME_MATH_ARITHMETIC_SSE_H_
#if defined(__SSE4_2__)
#include <stddef.h>
#include "dragnn/runtime/math/arithmetic_common.h"
#include "dragnn/runtime/math/types.h"
#include "tensorflow/core/platform/logging.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
// TODO(googleuser): Leaving this empty means that the definitions
// from arithmetic_common.h carry through. Provide template specializations
// that use architecture-specific intrinsics.
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
#endif // defined(__SSE4_2__)
#endif // DRAGNN_RUNTIME_MATH_ARITHMETIC_SSE_H_
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/runtime/math/arithmetic.h"
#include <stddef.h>
#include <vector>
#include "dragnn/runtime/math/types.h"
#include "dragnn/runtime/test/helpers.h"
#include "tensorflow/core/platform/test.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
namespace {
// Tests that ScaleElements() doesn't crash on empty vectors.
TEST(ScaleElementsTest, Empty) {
Vector<float> input;
MutableVector<float> output;
ScaleElements(1.5f, input, output);
}
// Tests that ScaleElements() copies scaled values from one vector to another.
TEST(ScaleElementsTest, Populated) {
UniqueVector<float> input({-2.0f, -3.0f, 5.0f});
UniqueVector<float> output({7.0f, 11.0f, 13.0f}); // gets overwritten
ScaleElements(1.5f, Vector<float>(*input), *output);
EXPECT_EQ((*output)[0], 1.5 * -2.0);
EXPECT_EQ((*output)[1], 1.5 * -3.0);
EXPECT_EQ((*output)[2], 1.5 * 5.0);
}
// Tests that AddScaledElements() doesn't crash on empty vectors.
TEST(AddScaledElementsTest, Empty) {
Vector<float> input;
MutableVector<float> output;
AddScaledElements(1.5f, input, output);
}
// Tests that AddScaledElements() adds scaled values from one vector to another.
TEST(AddScaledElementsTest, Populated) {
UniqueVector<float> input({-2.0f, -3.0f, 5.0f});
UniqueVector<float> output({7.0f, 11.0f, 13.0f}); // gets added to
AddScaledElements(1.5f, Vector<float>(*input), *output);
EXPECT_EQ((*output)[0], 1.5 * -2.0 + 7.0);
EXPECT_EQ((*output)[1], 1.5 * -3.0 + 11.0);
EXPECT_EQ((*output)[2], 1.5 * 5.0 + 13.0);
}
// Tests that MaxElements() doesn't crash on empty vectors.
TEST(MaxElementsTest, Empty) {
MutableVector<float> values;
MaxElements(1.5f, values);
}
// Tests that MaxElements() performs an in-place element-wise maximum.
TEST(MaxElementsTest, Populated) {
UniqueVector<float> values({-1.0f, 2.0f, 0.25f, -0.5f, 0.375f});
MaxElements(0.125f, *values);
EXPECT_EQ((*values)[0], 0.125);
EXPECT_EQ((*values)[1], 2.0);
EXPECT_EQ((*values)[2], 0.25);
EXPECT_EQ((*values)[3], 0.125);
EXPECT_EQ((*values)[4], 0.375);
}
// Tests that MultiplyMatrixAndVector() doesn't crash on empty inputs.
TEST(MultiplyMatrixAndVectorTest, Empty) {
Matrix<float> matrix;
Vector<float> input;
MutableVector<float> output;
MultiplyMatrixAndVector(matrix, input, output);
}
// Tests that MultiplyMatrixAndVector() computes a matrix-vector product.
TEST(MultiplyMatrixAndVectorTest, Populated) {
UniqueMatrix<float> matrix({{2.0f, 3.0f}, //
{5.0f, 7.0f}, //
{11.0f, 13.0f}});
UniqueVector<float> input({-0.5f, 2.0f});
UniqueVector<float> output({9.8f, 7.6f, 5.4f}); // gets overwritten
MultiplyMatrixAndVector(Matrix<float>(*matrix), Vector<float>(*input),
*output);
EXPECT_EQ((*output)[0], 2.0 * -0.5 + 3.0 * 2.0);
EXPECT_EQ((*output)[1], 5.0 * -0.5 + 7.0 * 2.0);
EXPECT_EQ((*output)[2], 11.0 * -0.5 + 13.0 * 2.0);
}
// Tests that MultiplyMatrixAndVectorWithBias() doesn't crash on empty inputs.
TEST(MultiplyMatrixAndVectorWithBiasTest, Empty) {
Matrix<float> matrix;
Vector<float> bias;
Vector<float> input;
MutableVector<float> output;
MultiplyMatrixAndVectorWithBias(matrix, bias, input, output);
}
// Tests that MultiplyMatrixAndVectorWithBias() computes a matrix-vector product
// with an additive bias.
TEST(MultiplyMatrixAndVectorWithBiasTest, Populated) {
UniqueMatrix<float> matrix({{2.0f, 3.0f}, //
{5.0f, 7.0f}, //
{11.0f, 13.0f}});
UniqueVector<float> bias({100.5f, 200.25f, 300.75f});
UniqueVector<float> input({-0.5f, 2.0f});
UniqueVector<float> output({9.8f, 7.6f, 5.4f}); // gets overwritten
MultiplyMatrixAndVectorWithBias(Matrix<float>(*matrix), Vector<float>(*bias),
Vector<float>(*input), *output);
EXPECT_EQ((*output)[0], 100.5 + 2.0 * -0.5 + 3.0 * 2.0);
EXPECT_EQ((*output)[1], 200.25 + 5.0 * -0.5 + 7.0 * 2.0);
EXPECT_EQ((*output)[2], 300.75 + 11.0 * -0.5 + 13.0 * 2.0);
}
// A dummy type for the specializations below. Specializing on this unique
// dummy type ensures we don't conflict with any existing specialization.
struct Foo {
float value;
};
} // namespace
// Dummy specializations for use in the subsequent tests.
template <>
void ScaleElements(Foo scale, Vector<Foo> input, MutableVector<Foo> output) {
for (Foo &foo : output) foo.value = 777.0;
}
namespace {
// Tests that the template specialization overrides the generic implementation.
TEST(ScaleElementsTest, OverriddenByTemplateSpecialization) {
// These values are uninitialized, but it doesn't matter because the
// specialization never looks at them.
UniqueVector<Foo> input(3);
UniqueVector<Foo> output(3);
ScaleElements(Foo(), Vector<Foo>(*input), *output);
EXPECT_EQ((*output)[0].value, 777.0);
EXPECT_EQ((*output)[1].value, 777.0);
EXPECT_EQ((*output)[2].value, 777.0);
}
} // namespace
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
// Contains logic for activation functions and more-complex elementwise
// vectorized operations.
//
// Uses operator overloading to express computation that looks like regular
// code. Currently, overloaded operators are scoped away in an "internal"
// namespace so they won't be accidentally used.
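//
// A minimal usage sketch, based on avx_activation_functions_test.cc (assumes
// that |input| and |output| point to at least kAvxWidth floats):
//
//   AvxFloatVecArray<1> array;  // one AVX register of kAvxWidth floats
//   array.Load(input);
//   array.Apply(activations::Sigmoid);
//   array.Store(output);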
#ifndef DRAGNN_RUNTIME_MATH_AVX_ACTIVATION_FUNCTIONS_H_
#define DRAGNN_RUNTIME_MATH_AVX_ACTIVATION_FUNCTIONS_H_
#if defined(__AVX2__)
#include <immintrin.h>
#endif
#include "dragnn/runtime/math/avx_vector_array.h"
#define DRAGNN_AVXAF_ATTRIBUTE_ALWAYS_INLINE __attribute__((always_inline))
#ifdef __clang__
#define DRAGNN_AVXAF_GCC_UNROLL
#else
#define DRAGNN_AVXAF_GCC_UNROLL __attribute__((optimize("unroll-loops")))
#endif
namespace syntaxnet {
namespace dragnn {
namespace runtime {
// Public API
namespace activations {
// Calculates elementwise exp(x).
inline AvxFloatVec DRAGNN_AVXAF_ATTRIBUTE_ALWAYS_INLINE DRAGNN_AVXAF_GCC_UNROLL
Exponential(AvxFloatVec x);
// Calculates elementwise sigmoid(x) = 1/(1+exp(-x)).
inline AvxFloatVec DRAGNN_AVXAF_ATTRIBUTE_ALWAYS_INLINE Sigmoid(AvxFloatVec x);
// Calculates elementwise tanh(x).
inline AvxFloatVec DRAGNN_AVXAF_ATTRIBUTE_ALWAYS_INLINE Tanh(AvxFloatVec x);
} // namespace activations
namespace activations {
// Calculates e^x by representing x = m * ln(2) + r, so that e^x = 2^m * e^r.
// A polynomial expansion approximates e^r, which stays accurate because |r| is
// at most ln(2)/2, and the result is then scaled by 2^m.
inline AvxFloatVec Exponential(AvxFloatVec x) {
// EDSL-like helpers for writing vectorized code.
auto Const = AvxFloatVec::Const;
constexpr float explo = -88.3762626647949f;
constexpr float exphi = 88.3762626647950f;
const float cephes_exp_factors[] = {
1.9875691500e-4f, 1.3981999507e-3f, 8.3334519073e-3f,
4.1665795894e-2f, 1.6666665459e-1f, 5.0000001201e-1f,
};
// Clamp the input, i.e., assume exp(-88) is close to zero and exp(88) is
// close to infinity.
x.Clamp(explo, exphi);
// Calculate `m = floor(x/ln(2) + 0.5)`.
constexpr float inv_log2e = 1.44269504088896341f;
AvxFloatVec m = Const(0.5f);
m += Const(inv_log2e) * x;
m.Floor();
// Calculate `r = x - m*ln(2)` (see function-level comment).
constexpr float neg_ln2 = -0.6931471805599453f;
AvxFloatVec r = x;
r += m * Const(neg_ln2);
// Calculate a polynomial expansion of y = exp(r).
AvxFloatVec r_squared(r * r);
AvxFloatVec y = Const(cephes_exp_factors[0]);
for (int i = 1; i < 6; ++i) {
y = y * r + Const(cephes_exp_factors[i]);
}
y = y * r_squared + r;
y += Const(1.0f);
// Calculate `emm0 = 2^m`. This is done by converting emm0 into an integer,
// and shifting it into the exponent bits of the desired floating-point
// result. Recall that the exponent is unsigned with 127 representing 2^0.
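// For example, m = 3 yields exponent bits 127 + 3 = 130, i.e., the float 8.0f.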
AvxFloatVec emm0 = m;
emm0 += Const(127.0f);
AvxIntVec emm0_i(emm0);
emm0_i.LeftShift(23);
// The final result is `2^m * exp(r)`.
return AvxFloatVec(emm0_i.ReinterpretCastFloat() * y);
}
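// Approximates tanh(x) with a rational function p(x) / q(x), where p is an odd
// polynomial (degree 13) and q is an even polynomial (degree 6), both
// evaluated via Horner's rule in x^2.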
inline AvxFloatVec Tanh(AvxFloatVec x) {
// EDSL-like helpers for writing vectorized code.
auto Const = AvxFloatVec::Const;
const float numerator_coefficients[] = {
-2.76076847742355e-16f, 2.00018790482477e-13f, -8.60467152213735e-11f,
5.12229709037114e-08f, 1.48572235717979e-05f, 6.37261928875436e-04f,
4.89352455891786e-03f,
};
const float denominator_coefficients[] = {
1.19825839466702e-06f,
1.18534705686654e-04f,
2.26843463243900e-03f,
4.89352518554385e-03f,
};
// Clamp the inputs to the range [-9, 9] since anything outside this range
// is +/-1.0 in single-precision.
x.Clamp(-9.0f, 9.0f);
// Compute x^2.
AvxFloatVec x_squared(x * x);
// Compute the numerator polynomial.
AvxFloatVec p = Const(numerator_coefficients[0]);
for (int i = 1; i < 7; ++i) {
// p = p * x^2 + numerator_coefficients_i
p = p * x_squared + Const(numerator_coefficients[i]);
}
// p = p * x
p = AvxFloatVec(p * x);
// Compute the denominator polynomial.
AvxFloatVec q = Const(denominator_coefficients[0]);
for (int i = 1; i < 4; ++i) {
// q = q * x^2 + denominator_coefficients_i
q = q * x_squared + Const(denominator_coefficients[i]);
}
// Divide the numerator by the denominator.
return p / q;
}
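// Computes sigmoid(x) via the identity sigmoid(x) = 0.5 * tanh(x / 2) + 0.5.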
inline AvxFloatVec Sigmoid(AvxFloatVec x) {
AvxFloatVec half = AvxFloatVec::Const(0.5);
return half * Tanh(AvxFloatVec(half * x)) + half;
}
} // namespace activations
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
#undef DRAGNN_AVXAF_ATTRIBUTE_ALWAYS_INLINE
#undef DRAGNN_AVXAF_GCC_UNROLL
#endif // DRAGNN_RUNTIME_MATH_AVX_ACTIVATION_FUNCTIONS_H_
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/runtime/math/avx_activation_functions.h"
#include <cmath>
#include <chrono>
#include "dragnn/runtime/test/helpers.h"
#include "syntaxnet/base.h"
#include "tensorflow/core/platform/test.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
namespace {
TEST(AvxActivationFunctionsTest, ExponentialTest) {
AvxVectorFuzzTest(
[](AvxFloatVec *vec) { *vec = activations::Exponential(*vec); },
[](float input_value, float actual) {
const float inverted = log(actual);
EXPECT_NEAR(input_value, inverted, 1e-6)
<< "exp(" << input_value << ") = " << actual
<< ", log(actual) = " << inverted;
});
}
TEST(AvxActivationFunctionsTest, SigmoidTest) {
AvxVectorFuzzTest( //
[](AvxFloatVec *vec) { *vec = activations::Sigmoid(*vec); },
[](float input_value, float actual) {
const float expected = 1.0f / (1.0f + exp(-input_value));
EXPECT_NEAR(actual, expected, 1e-6)
<< "sigmoid(" << input_value << ") = " << actual
<< ", expected = " << expected;
});
}
template <int batch_size, class Function>
void RunPerformanceTest(Function activation, int flops) {
constexpr uint64 kIterations = 1000000;
UniqueVector<float> input(batch_size);
UniqueVector<float> output(batch_size);
InitRandomVector(*input);
InitRandomVector(*output);
AvxFloatVecArray<batch_size / kAvxWidth> array;
auto start_time = std::chrono::system_clock::now();
for (int i = 0; i < kIterations; ++i) {
array.Load(input->data());
array.Apply(activation);
array.Store(output->data());
}
auto end_time = std::chrono::system_clock::now();
std::chrono::duration<double> elapsed_seconds = end_time - start_time;
double elapsed = elapsed_seconds.count();
double exp_ops = kIterations * batch_size;
double macro_gops = exp_ops / 1e9 / elapsed;
VLOG(0) << "For batch_size " << batch_size
<< " macro-GOPS (giga-ops per sec): " << macro_gops
<< ", raw arithmetic: " << flops * macro_gops;
}
TEST(AvxActivationFunctionsTest, SigmoidPerformanceTest) {
RunPerformanceTest<8>(activations::Sigmoid, 26);
RunPerformanceTest<16>(activations::Sigmoid, 26);
RunPerformanceTest<32>(activations::Sigmoid, 26);
RunPerformanceTest<48>(activations::Sigmoid, 26);
RunPerformanceTest<64>(activations::Sigmoid, 26);
RunPerformanceTest<128>(activations::Sigmoid, 26);
}
TEST(AvxActivationFunctionsTest, TanhTest) {
AvxVectorFuzzTest([](AvxFloatVec *vec) { *vec = activations::Tanh(*vec); },
[](float input_value, float actual) {
const float expected = tanh(input_value);
EXPECT_NEAR(actual, expected, 1e-6)
<< "tanh(" << input_value << ") = " << actual
<< ", expected = " << expected;
});
}
TEST(AvxActivationFunctionsTest, TanhPerformanceTest) {
RunPerformanceTest<8>(activations::Tanh, 23);
RunPerformanceTest<16>(activations::Tanh, 23);
RunPerformanceTest<32>(activations::Tanh, 23);
RunPerformanceTest<48>(activations::Tanh, 23);
RunPerformanceTest<64>(activations::Tanh, 23);
RunPerformanceTest<128>(activations::Tanh, 23);
}
} // namespace
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
// Wraps AVX vectors into convenient helper classes. This contains a class
// wrapping a single AVX register, AvxFloatVec, and a class to manipulate a
// batch of registers, AvxFloatVecArray. Use of the latter is recommended where
// applicable, since it will be unrolled into more vectorizable code.
#ifndef DRAGNN_RUNTIME_MATH_AVX_VECTOR_ARRAY_H_
#define DRAGNN_RUNTIME_MATH_AVX_VECTOR_ARRAY_H_
#include <cmath>
#if defined(__AVX__)
#include <immintrin.h>
#elif defined(__SSE4_2__)
#include <nmmintrin.h>
#endif
#include "dragnn/runtime/math/float16_types.h"
#define DRAGNN_AVXVA_ALWAYS_INLINE inline __attribute__((always_inline))
#ifdef __clang__
// Clang doesn't support __attribute__((optimize(...))).
#define DRAGNN_AVXVA_INLINED_UNROLLED inline __attribute__((always_inline))
#else
// Assume we're using GCC, which does.
#define DRAGNN_AVXVA_INLINED_UNROLLED \
inline __attribute__((always_inline)) \
__attribute__((optimize("unroll-loops")))
#endif
namespace syntaxnet {
namespace dragnn {
namespace runtime {
// Number of single-precision floating point numbers that fit into a single SSE
// / AVX2 register (which are 128 and 256 bits respectively).
constexpr int kSseWidth = 128 / 32; // = 4
constexpr int kAvxWidth = 256 / 32; // = 8
constexpr int kSseWidthHalfPrecision = 128 / 16; // = 8
constexpr int kAvxWidthHalfPrecision = 256 / 16; // = 16
class AvxFloatVec;
namespace internal {
// This struct should always be eliminated by the compiler; it only exists so we
// can write `foo += bar * baz`, and have that compiled into a single FMA
// operation.
struct AvxMultiplyExpr {
const AvxFloatVec &a;
const AvxFloatVec &b;
};
} // namespace internal
// Allows EDSL-like programming with AVX vectors.
inline internal::AvxMultiplyExpr operator*(const AvxFloatVec &a,
const AvxFloatVec &b);
inline AvxFloatVec operator+(const internal::AvxMultiplyExpr &expr,
const AvxFloatVec &v);
inline AvxFloatVec operator+(const AvxFloatVec &a, const AvxFloatVec &b);
inline AvxFloatVec operator/(const AvxFloatVec &a, const AvxFloatVec &b);
inline AvxFloatVec operator-(const AvxFloatVec &a, const AvxFloatVec &b);
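// For example, `acc += w * x` evaluates as a single fused multiply-add where
// available, whereas `AvxFloatVec y(w * x)` evaluates the bare product without
// an addition.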
// API over a single AVX vector (register). The implementation will either use
// a real AVX vector, or a fixed array of floats for compatibility.
//
// Note that we include the "inline" directive in declarations, not just
// definitions, because it is necessary for the "always_inline" directive.
struct AvxFloatVec {
public:
AvxFloatVec() {}
// Evaluates an AvxMultiplyExpr intermediary without adding anything. This is
// not an implicit cast, because typically when we write `a * b` we want to
// add it to something and use an FMA operation.
explicit AvxFloatVec(const internal::AvxMultiplyExpr &expr);
// Loads from an aligned region of memory.
inline void Load(const float *source);
// Loads a constant value.
inline void LoadConstVector(const float val);
// Stores to an aligned region of memory.
inline void Store(float *dst) const;
// Adds `a * b` to this value, using a fused multiply-add operation.
inline void AddProductOf(const AvxFloatVec &a, const AvxFloatVec &b);
// Element-wise floor.
inline void Floor();
// Element-wise clamps values between a min and max value.
inline void Clamp(const float min_value, const float max_value);
// Convenience method for more complex calculations.
static DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec Const(const float value) {
AvxFloatVec result;
result.LoadConstVector(value);
return result;
}
// Syntactic sugar for computing an FMA operation.
inline AvxFloatVec &operator+=(const internal::AvxMultiplyExpr &to_add);
// Adds another vector element-wise.
inline AvxFloatVec &operator+=(const AvxFloatVec &vec);
// Subtracts another vector element-wise.
inline AvxFloatVec &operator-=(const AvxFloatVec &vec);
// Divides another vector element-wise.
inline AvxFloatVec &operator/=(const AvxFloatVec &vec);
#if defined(__AVX__)
__m256 ymm;
#elif defined(__SSE4_2__)
__m128 xmm[2];
#else
float ymm[8];
#endif
};
// Small wrapper around integer AVX vectors, exposing only methods we need for
// implementing the activation functions.
//
// As above, `inline` is specified here for the always_inline directive.
class AvxIntVec {
public:
// Constructs an AVX integer vector, by converting floating-point values.
inline explicit AvxIntVec(const AvxFloatVec &v);
// Left-shifts integer values.
inline void LeftShift(int bits);
// Reinterprets the register as a floating-point register, for bitwise tricks.
inline AvxFloatVec ReinterpretCastFloat();
private:
// Underlying register.
#if defined(__AVX__)
__m256i ymm_;
#elif defined(__SSE4_2__)
__m128i xmm_[2];
#else
int ymm_[8];
#endif
};
// Implements the index permutation that is effectively applied by the
// _mm256_unpack instructions. This permutation is equivalent to swapping the
// 3rd and 4th bits. See the PermutationFunctionIsEqualToTable test for the
// effective permutation that this encodes.
//
// We haven't done performance testing, but this should be sufficiently fast
// for the compatibility routine. In its use below, the compiler will hopefully
// determine that it is being called with a constant (post-unrolling) and
// inline it.
DRAGNN_AVXVA_ALWAYS_INLINE int FastUnpackPermutation(int original_idx) {
// Bit in the 4th index if the 3rd and 4th bits should be swapped.
int should_swap = (original_idx + /* 0b0100 */ 4) & /* 0b1000 */ 8;
// If should_swap is zero, leaves original_idx untouched. Otherwise, does an
// xor with 0b1100, which will flip 10 to 01 and 01 to 10.
return (should_swap | (should_swap >> 1)) ^ original_idx;
}
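// For example, FastUnpackPermutation(4) == 8 and FastUnpackPermutation(8) == 4,
// while indices 0-3 and 12-15 map to themselves.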
// API over an array of AVX vectors (registers). The methods on this class are
// annotated such that the compiler should unroll them.
template <int N>
struct AvxFloatVecArray {
public:
DRAGNN_AVXVA_INLINED_UNROLLED void Load(const float *source) {
for (int i = 0; i < N; i++) {
vectors[i].Load(source + 8 * i);
}
}
DRAGNN_AVXVA_INLINED_UNROLLED void Load(const float *source, int max_idx) {
for (int i = 0; i < N; i++) {
if (i < max_idx) {
vectors[i].Load(source + 8 * i);
} else {
// When testing with a memory sanitizer, we make sure not to read
// uninitialized values. This is usually safe in normal operation
// because such results are never stored (via corresponding
// store-masking logic), but of course each algorithm must be tested to
// ensure correct operation.
//
// It is also worth pointing out that exceptional values (NaN, etc.) can
// slow down AVX/FMA floating point operations considerably. So we
// should investigate whether this is worth enabling in all cases (and
// forcing algorithms to provide a default).
#if defined(MEMORY_SANITIZER)
vectors[i].LoadConstVector(0);
#endif
}
}
}
// Reads and unpacks truncated half-precision values.
//
// Currently, only matrix coefficients use compressed/half-precision values,
// so it's not yet necessary to support max_idx masking (which will get a bit
// more complicated).
DRAGNN_AVXVA_INLINED_UNROLLED void Load(const TruncatedFloat16 *source);
#if defined(__F16C__)
// Reads and unpacks IEEE-754 half-precision values.
//
// Currently, only matrix coefficients use compressed/half-precision values,
// so it's not yet necessary to support max_idx masking (which will get a bit
// more complicated).
//
// TODO(googleuser): Either add non-F16C compatibility support from Eigen,
// or delete this code if it turns out not to be helpful.
DRAGNN_AVXVA_INLINED_UNROLLED void Load(const IeeeFloat16 *source);
#endif
DRAGNN_AVXVA_INLINED_UNROLLED void LoadConstVector(const float val) {
for (int i = 0; i < N; i++) {
vectors[i].LoadConstVector(val);
}
}
DRAGNN_AVXVA_INLINED_UNROLLED void Store(float *dst) {
for (int i = 0; i < N; i++) {
vectors[i].Store(dst + 8 * i);
}
}
DRAGNN_AVXVA_INLINED_UNROLLED void Store(float *dst, int max_idx) {
for (int i = 0; i < N; i++) {
// This is equivalent to writing `i < N && i < max_idx` above, but forces
// the compiler to produce more efficient code (it's still creating jump
// instructions, but the branching is probably more predictable, and the
// loops are unrolled). In the future we could switch to VMASKMOV if
// necessary.
if (i < max_idx) {
vectors[i].Store(dst + 8 * i);
}
}
}
template <class Function>
DRAGNN_AVXVA_INLINED_UNROLLED void Apply(const Function &fcn) {
for (int i = 0; i < N; i++) {
vectors[i] = fcn(vectors[i]);
}
}
AvxFloatVec vectors[N];
};
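// Example usage (a minimal sketch; |src| and |dst| are hypothetical 32-byte
// aligned buffers holding at least 16 floats):
//
//   AvxFloatVecArray<2> array;  // two AVX registers = 16 floats
//   array.Load(src);
//   array.Apply([](AvxFloatVec v) {
//     v += AvxFloatVec::Const(1.0f);  // element-wise add 1.0
//     return v;
//   });
//   array.Store(dst);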
// Implementation details.
#if defined(__AVX__)
DRAGNN_AVXVA_ALWAYS_INLINE
AvxFloatVec::AvxFloatVec(const internal::AvxMultiplyExpr &expr) {
ymm = _mm256_mul_ps(expr.a.ymm, expr.b.ymm);
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Load(const float *source) {
ymm = _mm256_load_ps(source);
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::LoadConstVector(const float val) {
ymm = _mm256_set1_ps(val);
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Store(float *dst) const {
_mm256_store_ps(dst, ymm);
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::AddProductOf(
const AvxFloatVec &a, const AvxFloatVec &b) {
#if defined(__AVX2__) && defined(__FMA__)
ymm = _mm256_fmadd_ps(a.ymm, b.ymm, ymm);
#else
*this += AvxFloatVec(a * b);
#endif
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Floor() {
ymm = _mm256_floor_ps(ymm);
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Clamp(const float min_value,
const float max_value) {
ymm = _mm256_min_ps(ymm, Const(max_value).ymm);
ymm = _mm256_max_ps(ymm, Const(min_value).ymm);
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator+=(
const AvxFloatVec &vec) {
ymm = _mm256_add_ps(vec.ymm, ymm);
return *this;
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator-=(
const AvxFloatVec &vec) {
ymm = _mm256_sub_ps(ymm, vec.ymm);
return *this;
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator/=(
const AvxFloatVec &vec) {
ymm = _mm256_div_ps(ymm, vec.ymm);
return *this;
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxIntVec::AvxIntVec(const AvxFloatVec &v)
: ymm_(_mm256_cvttps_epi32(v.ymm)) {}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxIntVec::LeftShift(int bits) {
#if defined(__AVX2__)
ymm_ = _mm256_slli_epi32(ymm_, bits);
#else
// Convert to SSE and back again. This is pretty slow, so don't use this code
// except for compatibility purposes.
__m256i upper_bits = _mm256_permute2f128_si256(ymm_, ymm_, 1);
__m128i first = _mm256_castsi256_si128(ymm_); // Lower bits as SSE
__m128i second = _mm256_castsi256_si128(upper_bits); // Upper bits as SSE
first = _mm_slli_epi32(first, bits);
second = _mm_slli_epi32(second, bits);
ymm_ = _mm256_permute2f128_si256(_mm256_castsi128_si256(first),
_mm256_castsi128_si256(second), (2 << 4));
#endif
}
AvxFloatVec DRAGNN_AVXVA_ALWAYS_INLINE AvxIntVec::ReinterpretCastFloat() {
AvxFloatVec result;
result.ymm = _mm256_castsi256_ps(ymm_);
return result;
}
template <int N>
DRAGNN_AVXVA_INLINED_UNROLLED void AvxFloatVecArray<N>::Load(
const TruncatedFloat16 *source) {
static_assert(N % 2 == 0,
"Load() from half floats requires even-sized vector arrays.");
for (int i = 0; i < N / 2; i++) {
#if defined(__AVX2__)
const __m256i input = _mm256_load_si256(
reinterpret_cast<__m256i const *>(source + kAvxWidthHalfPrecision * i));
vectors[2 * i].ymm = _mm256_castsi256_ps(
_mm256_unpacklo_epi16(_mm256_setzero_si256(), input));
vectors[2 * i + 1].ymm = _mm256_castsi256_ps(
_mm256_unpackhi_epi16(_mm256_setzero_si256(), input));
#else
// Compatibility AVX (not AVX2) implementation.
__m128i input[2];
input[0] = _mm_load_si128(
reinterpret_cast<__m128i const *>(source + kAvxWidthHalfPrecision * i));
input[1] = _mm_load_si128(reinterpret_cast<__m128i const *>(
source + kAvxWidthHalfPrecision * i + kSseWidthHalfPrecision));
// Unpack. This permutation is kinda cryptic and, to be honest, derived by
// simply trying many combinations.
vectors[2 * i].ymm = _mm256_insertf128_ps(
_mm256_castps128_ps256(_mm_castsi128_ps(
_mm_unpacklo_epi16(_mm_setzero_si128(), input[0]))),
_mm_castsi128_ps(_mm_unpacklo_epi16(_mm_setzero_si128(), input[1])), 1);
vectors[2 * i + 1].ymm = _mm256_insertf128_ps(
_mm256_castps128_ps256(_mm_castsi128_ps(
_mm_unpackhi_epi16(_mm_setzero_si128(), input[0]))),
_mm_castsi128_ps(_mm_unpackhi_epi16(_mm_setzero_si128(), input[1])), 1);
#endif
}
}
#if defined(__F16C__)
template <int N>
DRAGNN_AVXVA_INLINED_UNROLLED void AvxFloatVecArray<N>::Load(
const IeeeFloat16 *source) {
static_assert(N % 2 == 0,
"Load() from half floats requires even-sized vector arrays.");
for (int i = 0; i < N / 2; i++) {
// TODO(googleuser): Experiment with doing a single AVX2 load and
// dividing the result.
__m128i first_half = _mm_load_si128(
reinterpret_cast<__m128i const *>(source + kAvxWidthHalfPrecision * i));
__m128i second_half = _mm_load_si128(reinterpret_cast<__m128i const *>(
source + kAvxWidthHalfPrecision * i + kAvxWidth));
vectors[2 * i].ymm = _mm256_cvtph_ps(first_half);
vectors[2 * i + 1].ymm = _mm256_cvtph_ps(second_half);
}
}
#endif
#elif defined(__SSE4_2__)
DRAGNN_AVXVA_ALWAYS_INLINE
AvxFloatVec::AvxFloatVec(const internal::AvxMultiplyExpr &expr) {
for (int i = 0; i < 2; ++i) {
xmm[i] = _mm_mul_ps(expr.a.xmm[i], expr.b.xmm[i]);
}
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Load(const float *source) {
for (int i = 0; i < 2; ++i) {
xmm[i] = _mm_load_ps(&source[i * kSseWidth]);
}
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::LoadConstVector(const float val) {
xmm[1] = xmm[0] = _mm_set1_ps(val);
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Store(float *dst) const {
for (int i = 0; i < 2; ++i) {
_mm_store_ps(&dst[i * kSseWidth], xmm[i]);
}
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::AddProductOf(
const AvxFloatVec &a, const AvxFloatVec &b) {
*this += AvxFloatVec(a * b);
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Floor() {
for (int i = 0; i < 2; ++i) {
xmm[i] = _mm_floor_ps(xmm[i]);
}
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Clamp(const float min_value,
const float max_value) {
for (int i = 0; i < 2; ++i) {
xmm[i] = _mm_min_ps(xmm[i], Const(max_value).xmm[i]);
xmm[i] = _mm_max_ps(xmm[i], Const(min_value).xmm[i]);
}
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator+=(
const AvxFloatVec &vec) {
for (int i = 0; i < 2; ++i) {
xmm[i] = _mm_add_ps(vec.xmm[i], xmm[i]);
}
return *this;
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator-=(
const AvxFloatVec &vec) {
for (int i = 0; i < 2; ++i) {
xmm[i] = _mm_sub_ps(xmm[i], vec.xmm[i]);
}
return *this;
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator/=(
const AvxFloatVec &vec) {
for (int i = 0; i < 2; ++i) {
xmm[i] = _mm_div_ps(xmm[i], vec.xmm[i]);
}
return *this;
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxIntVec::AvxIntVec(const AvxFloatVec &v) {
xmm_[0] = _mm_cvttps_epi32(v.xmm[0]);
xmm_[1] = _mm_cvttps_epi32(v.xmm[1]);
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxIntVec::LeftShift(int bits) {
for (int i = 0; i < 2; ++i) {
xmm_[i] = _mm_slli_epi32(xmm_[i], bits);
}
}
AvxFloatVec DRAGNN_AVXVA_ALWAYS_INLINE AvxIntVec::ReinterpretCastFloat() {
AvxFloatVec result;
for (int i = 0; i < 2; ++i) {
result.xmm[i] = _mm_castsi128_ps(xmm_[i]);
}
return result;
}
template <int N>
DRAGNN_AVXVA_INLINED_UNROLLED void AvxFloatVecArray<N>::Load(
const TruncatedFloat16 *source) {
static_assert(N % 2 == 0,
"Load() from half floats requires even-sized vector arrays.");
for (int i = 0; i < N / 2; i++) {
__m128i input[2];
input[0] = _mm_load_si128(
reinterpret_cast<__m128i const *>(source + kAvxWidthHalfPrecision * i));
input[1] = _mm_load_si128(reinterpret_cast<__m128i const *>(
source + kAvxWidthHalfPrecision * i + kSseWidthHalfPrecision));
vectors[2 * i].xmm[0] =
_mm_castsi128_ps(_mm_unpacklo_epi16(_mm_setzero_si128(), input[0]));
vectors[2 * i + 1].xmm[0] =
_mm_castsi128_ps(_mm_unpackhi_epi16(_mm_setzero_si128(), input[0]));
vectors[2 * i].xmm[1] =
_mm_castsi128_ps(_mm_unpacklo_epi16(_mm_setzero_si128(), input[1]));
vectors[2 * i + 1].xmm[1] =
_mm_castsi128_ps(_mm_unpackhi_epi16(_mm_setzero_si128(), input[1]));
}
}
#if defined(__F16C__)
template <int N>
DRAGNN_AVXVA_INLINED_UNROLLED void AvxFloatVecArray<N>::Load(
const IeeeFloat16 *source) {
static_assert(N % 2 == 0,
"Load() from half floats requires even-sized vector arrays.");
for (int i = 0; i < N / 2; i++) {
__m128i first_half = _mm_load_si128(
reinterpret_cast<__m128i const *>(source + kAvxWidthHalfPrecision * i));
__m128i second_half = _mm_load_si128(reinterpret_cast<__m128i const *>(
source + kAvxWidthHalfPrecision * i + kAvxWidth));
vectors[2 * i].xmm[0] = _mm_cvtph_ps(first_half);
vectors[2 * i + 1].xmm[0] = _mm_cvtph_ps(second_half);
first_half = _mm_shuffle_epi32(first_half, _MM_SHUFFLE(0, 1, 3, 2));
second_half = _mm_shuffle_epi32(second_half, _MM_SHUFFLE(0, 1, 3, 2));
vectors[2 * i].xmm[1] = _mm_cvtph_ps(first_half);
vectors[2 * i + 1].xmm[1] = _mm_cvtph_ps(second_half);
}
}
#endif
#else
// Compatibility implementations. If you compile with -ftree-vectorize and
// -msse2 flags, you should still get decent performance (maybe 1/4 of the
// AVX/FMA version).
//
// See the class above for method documentation.
DRAGNN_AVXVA_ALWAYS_INLINE
AvxFloatVec::AvxFloatVec(const internal::AvxMultiplyExpr &expr) {
for (int i = 0; i < 8; i++) {
ymm[i] = expr.a.ymm[i] * expr.b.ymm[i];
}
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Load(const float *source) {
for (int i = 0; i < 8; i++) {
ymm[i] = source[i];
}
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::LoadConstVector(const float val) {
for (int i = 0; i < 8; i++) {
ymm[i] = val;
}
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Store(float *dst) const {
for (int i = 0; i < 8; i++) {
dst[i] = ymm[i];
}
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::AddProductOf(
const AvxFloatVec &a, const AvxFloatVec &b) {
for (int i = 0; i < 8; i++) {
ymm[i] += a.ymm[i] * b.ymm[i];
}
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Floor() {
for (int i = 0; i < 8; i++) {
ymm[i] = floor(ymm[i]);
}
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Clamp(const float min_value,
const float max_value) {
for (int i = 0; i < 8; i++) {
ymm[i] = fmin(fmax(ymm[i], min_value), max_value);
}
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator+=(
const AvxFloatVec &vec) {
for (int i = 0; i < 8; i++) {
ymm[i] += vec.ymm[i];
}
return *this;
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator-=(
const AvxFloatVec &vec) {
for (int i = 0; i < 8; i++) {
ymm[i] -= vec.ymm[i];
}
return *this;
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator/=(
const AvxFloatVec &vec) {
for (int i = 0; i < 8; i++) {
ymm[i] /= vec.ymm[i];
}
return *this;
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxIntVec::AvxIntVec(const AvxFloatVec &v) {
for (int i = 0; i < 8; i++) {
ymm_[i] = static_cast<int>(v.ymm[i]);
}
}
DRAGNN_AVXVA_ALWAYS_INLINE void AvxIntVec::LeftShift(int bits) {
for (int i = 0; i < 8; i++) {
ymm_[i] = ymm_[i] << bits;
}
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec AvxIntVec::ReinterpretCastFloat() {
AvxFloatVec result;
for (int i = 0; i < 8; i++) {
result.ymm[i] = reinterpret_cast<float &>(ymm_[i]);
}
return result;
}
template <int N>
DRAGNN_AVXVA_INLINED_UNROLLED void AvxFloatVecArray<N>::Load(
const TruncatedFloat16 *source) {
static_assert(N % 2 == 0,
"Load() from half floats requires even-sized vector arrays.");
// Iterate through mock AVX vectors, each composed of 16 half-floats.
for (int vec_idx = 0; vec_idx < N / 2; vec_idx++) {
// Making this code a bit more verbose, by reading in-order to a temporary
// array, results in faster performance. The compatibility version is still
// pretty slow though.
TruncatedFloat16 tmp[16];
for (int i = 0; i < kAvxWidthHalfPrecision; ++i) {
tmp[i] = source[i + kAvxWidthHalfPrecision * vec_idx];
}
float unpacked[16];
for (int i = 0; i < kAvxWidthHalfPrecision; ++i) {
unpacked[i] = tmp[i].DebugToFloat();
}
for (int i = 0; i < kAvxWidthHalfPrecision; ++i) {
int permuted = FastUnpackPermutation(i);
vectors[2 * vec_idx + (i / 8)].ymm[i % 8] = unpacked[permuted];
}
}
}
#if defined(__F16C__)
template <int N>
DRAGNN_AVXVA_INLINED_UNROLLED void AvxFloatVecArray<N>::Load(
const IeeeFloat16 *source) {
// Not actually required for the compatibility implementation, but it'd be
// rather non-uniform if this API succeeded, and then compilation failed when
// AVX2 was turned on.
static_assert(N % 2 == 0,
"Load() from half floats requires even-sized vector arrays.");
// Iterate element-wise over the mock AVX vectors, converting one half-float
// at a time.
for (int i = 0; i < N * kAvxWidth; ++i) {
vectors[i / 8].ymm[i % 8] = source[i].DebugToFloat();
}
}
#endif
#endif
// The following operations are mostly syntactic sugar, so they do not need
// architecture-specific implementations.
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator+=(
const internal::AvxMultiplyExpr &to_add) {
AddProductOf(to_add.a, to_add.b);
return *this;
}
DRAGNN_AVXVA_ALWAYS_INLINE internal::AvxMultiplyExpr operator*(
const AvxFloatVec &a, const AvxFloatVec &b) {
return internal::AvxMultiplyExpr{a, b};
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec
operator+(const internal::AvxMultiplyExpr &expr, const AvxFloatVec &v) {
AvxFloatVec result = v;
result += expr;
return result;
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec operator+(const AvxFloatVec &a,
const AvxFloatVec &b) {
AvxFloatVec result = a;
result += b;
return result;
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec operator/(const AvxFloatVec &a,
const AvxFloatVec &b) {
AvxFloatVec result = a;
result /= b;
return result;
}
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec operator-(const AvxFloatVec &a,
const AvxFloatVec &b) {
AvxFloatVec result = a;
result -= b;
return result;
}
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
#undef DRAGNN_AVXVA_ALWAYS_INLINE
#undef DRAGNN_AVXVA_INLINED_UNROLLED
#endif // DRAGNN_RUNTIME_MATH_AVX_VECTOR_ARRAY_H_
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/runtime/math/avx_vector_array.h"
#include <cmath>
#include "dragnn/runtime/test/helpers.h"
#include "tensorflow/core/platform/test.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
namespace {
TEST(AvxVectorTest, LoadAndStore) {
UniqueVector<float> input(kAvxWidth);
UniqueVector<float> output(kAvxWidth);
InitRandomVector(*input);
InitRandomVector(*output);
AvxFloatVec vec;
vec.Load(input->data());
vec.Store(output->data());
for (int i = 0; i < kAvxWidth; ++i) {
EXPECT_EQ((*input)[i], (*output)[i]);
}
}
// Test flooring with assignment, just to make the compiler not erase aliases.
TEST(AvxVectorTest, AssignmentAndFloor) {
UniqueVector<float> input(kAvxWidth);
UniqueVector<float> output(kAvxWidth);
UniqueVector<float> floored(kAvxWidth);
InitRandomVector(*input);
InitRandomVector(*output);
AvxFloatVec vec;
vec.Load(input->data());
AvxFloatVec vec2 = vec;
vec.Floor();
vec.Store(floored->data());
vec2.Store(output->data());
for (int i = 0; i < kAvxWidth; ++i) {
EXPECT_EQ((*input)[i], (*output)[i]);
EXPECT_EQ(floor((*input)[i]), (*floored)[i]);
}
}
TEST(AvxVectorTest, ClampTest) {
bool modified = false; // check that some value was clamped.
AvxVectorFuzzTest(
[](AvxFloatVec *vec) { vec->Clamp(-0.314f, 0.314f); },
[&modified](float input_value, float output_value) {
modified = modified || input_value < -0.314 || input_value > 0.314;
EXPECT_EQ(fmax(-0.314f, fmin(0.314f, input_value)), output_value);
});
EXPECT_TRUE(modified) << "No values fell outside test range for ClampTest().";
}
TEST(AvxVectorTest, LoadConstAndStore) {
UniqueVector<float> output(kAvxWidth);
InitRandomVector(*output);
AvxFloatVec vec;
vec.LoadConstVector(3.14f);
vec.Store(output->data());
for (int i = 0; i < kAvxWidth; ++i) {
EXPECT_EQ((*output)[i], 3.14f);
}
}
TEST(AvxVectorTest, AddTest) {
AvxVectorFuzzTest( //
[](AvxFloatVec *vec) { (*vec) += *vec; },
[](float input_value, float output_value) {
EXPECT_EQ(input_value * 2, output_value);
});
}
TEST(AvxVectorTest, SubtractTest) {
AvxVectorFuzzTest(
[](AvxFloatVec *vec) {
AvxFloatVec one;
one.LoadConstVector(1.0f);
(*vec) -= one;
},
[](float input_value, float output_value) {
EXPECT_EQ(input_value - 1.0f, output_value);
});
}
TEST(AvxVectorTest, DivideTest) {
AvxVectorFuzzTest(
[](AvxFloatVec *vec) {
AvxFloatVec result;
result.LoadConstVector(1.0f);
result /= *vec;
*vec = result;
},
[](float input_value, float output_value) {
EXPECT_EQ(1.0f / input_value, output_value);
});
}
// This is a really basic test; half of the purpose is to ensure that the float
// API is still OK (i.e. compiles) for odd-sized arrays. If you try to add a
// call to array.Load(TruncatedFloat16 *source), it should produce a compiler
// error.
TEST(AvxFloatVecArrayTest, SingletonArrayLoadsAndStores) {
AvxFloatVecArray<1> array;
UniqueVector<float> input(kAvxWidth);
UniqueVector<float> output(kAvxWidth);
InitRandomVector(*input);
InitRandomVector(*output);
array.Load(input->data());
array.Store(output->data());
for (int i = 0; i < kAvxWidth; ++i) {
EXPECT_EQ((*input)[i], (*output)[i]);
}
}
TEST(AvxFloatVecArrayTest, LoadTruncatedFloat16) {
AvxFloatVecArray<2> array;
UniqueVector<TruncatedFloat16> values(2 * kAvxWidth);
UniqueVector<float> decompressed(2 * kAvxWidth);
for (int i = 0; i < 2 * kAvxWidth; ++i) {
int permuted = FastUnpackPermutation(i);
(*values)[i] = TruncatedFloat16::DebugFromFloat(permuted / 10.0);
}
// Ensure that state persisted from other tests won't cause this test to
// erroneously pass.
array.LoadConstVector(-1.0f);
array.Load(values->data());
array.Store(decompressed->data());
for (int i = 0; i < 2 * kAvxWidth; ++i) {
ASSERT_NEAR((*decompressed)[i], i / 10.0, 0.01);
}
}
TEST(AvxFloatVecArrayTest, LoadIeeeFloat16) {
#if defined(__F16C__)
AvxFloatVecArray<2> array;
UniqueVector<IeeeFloat16> values(2 * kAvxWidth);
UniqueVector<float> decompressed(2 * kAvxWidth);
for (int i = 0; i < 2 * kAvxWidth; ++i) {
(*values)[i] = IeeeFloat16::DebugFromFloat(i / 10.0);
}
// Ensure that state persisted from other tests won't cause this test to
// erroneously pass.
array.LoadConstVector(-1.0f);
array.Load(values->data());
array.Store(decompressed->data());
for (int i = 0; i < 2 * kAvxWidth; ++i) {
ASSERT_NEAR((*decompressed)[i], i / 10.0, 0.01);
}
#else
LOG(INFO) << "Test binary wasn't compiled with F16C support, so skipping "
<< "this test.";
#endif
}
TEST(AvxFloatVecArrayTest, PermutationFunctionIsEqualToTable) {
std::vector<int> permutation = {0, 1, 2, 3, 8, 9, 10, 11,
4, 5, 6, 7, 12, 13, 14, 15};
for (int i = 0; i < kAvxWidthHalfPrecision; ++i) {
EXPECT_EQ(FastUnpackPermutation(i), permutation[i]);
}
}
} // namespace
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
// Copyright 2018 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
// Compatibility support for Eigen.
#ifndef DRAGNN_RUNTIME_MATH_EIGEN_H_
#define DRAGNN_RUNTIME_MATH_EIGEN_H_
#include "dragnn/runtime/alignment.h"
#include "dragnn/runtime/math/types.h"
#include "third_party/eigen3/Eigen/Core"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
namespace internal {
// Returns a combination of bit-options for Eigen matrices.
constexpr int GetEigenMatrixOptions() {
return Eigen::AutoAlign | Eigen::RowMajor;
}
// Returns a combination of bit-options for Eigen maps of runtime types.
constexpr int GetEigenMapOptions() {
static_assert(kAlignmentBytes >= EIGEN_MAX_ALIGN_BYTES,
"Runtime alignment is not compatible with Eigen alignment.");
return Eigen::Aligned;
}
// Eigen matrix and (row) vector types. Don't use these directly; instead use
// the public Map types and functions below to wrap runtime types.
template <class T>
using EigenVector =
Eigen::Matrix<T, 1, Eigen::Dynamic, GetEigenMatrixOptions()>;
template <class T>
using EigenMatrix =
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, GetEigenMatrixOptions()>;
// Eigen stride for matrix types.
using EigenMatrixStride = Eigen::Stride<Eigen::Dynamic, 1>;
// Returns the Eigen stride associated with the |matrix|.
template <class T>
EigenMatrixStride GetEigenMatrixStride(MatrixImpl<T> matrix) {
return EigenMatrixStride(matrix.row_stride(), 1);
}
} // namespace internal
// Eigen wrappers around a runtime-allocated matrix or (row) vector.
template <class T>
using EigenVectorMap =
Eigen::Map<const internal::EigenVector<T>, internal::GetEigenMapOptions()>;
template <class T>
using MutableEigenVectorMap =
Eigen::Map<internal::EigenVector<T>, internal::GetEigenMapOptions()>;
template <class T>
using EigenMatrixMap =
Eigen::Map<const internal::EigenMatrix<T>, internal::GetEigenMapOptions(),
internal::EigenMatrixStride>;
template <class T>
using MutableEigenMatrixMap =
Eigen::Map<internal::EigenMatrix<T>, internal::GetEigenMapOptions(),
internal::EigenMatrixStride>;
// Returns an Eigen wrapper around the |vector| or |matrix|.
template <class T>
EigenVectorMap<T> AsEigenMap(Vector<T> vector) {
return EigenVectorMap<T>(vector.data(), vector.size());
}
template <class T>
MutableEigenVectorMap<T> AsEigenMap(MutableVector<T> vector) {
return MutableEigenVectorMap<T>(vector.data(), vector.size());
}
template <class T>
EigenMatrixMap<T> AsEigenMap(Matrix<T> matrix) {
return EigenMatrixMap<T>(matrix.data(), matrix.num_rows(),
matrix.num_columns(),
internal::GetEigenMatrixStride(matrix));
}
template <class T>
MutableEigenMatrixMap<T> AsEigenMap(MutableMatrix<T> matrix) {
return MutableEigenMatrixMap<T>(matrix.data(), matrix.num_rows(),
matrix.num_columns(),
internal::GetEigenMatrixStride(matrix));
}
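// Example usage (a minimal sketch; |weights|, |input|, and |output| are
// hypothetical Matrix<float>, Vector<float>, and MutableVector<float> objects
// with compatible dimensions):
//
//   AsEigenMap(output).noalias() =
//       AsEigenMap(input) * AsEigenMap(weights).transpose();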
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
#endif // DRAGNN_RUNTIME_MATH_EIGEN_H_
// Copyright 2018 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/runtime/math/eigen.h"
#include <vector>
#include "dragnn/core/test/generic.h"
#include "dragnn/runtime/math/types.h"
#include "dragnn/runtime/test/helpers.h"
#include "tensorflow/core/platform/test.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
namespace {
// Expects that two pointers point to the same address.
void ExpectSameAddress(const void *ptr1, const void *ptr2) {
EXPECT_EQ(ptr1, ptr2);
}
// Expects that the |vector| has the |values|.
void ExpectValues(MutableVector<float> vector,
const std::vector<float> &values) {
ASSERT_EQ(vector.size(), values.size());
for (int i = 0; i < values.size(); ++i) {
EXPECT_EQ(vector[i], values[i]);
}
}
// Expects that the Eigen |matrix| has the |values|.
template <class EigenMatrix>
void ExpectValues(const EigenMatrix &matrix,
const std::vector<std::vector<float>> &values) {
ASSERT_EQ(matrix.rows(), values.size());
for (int row = 0; row < matrix.rows(); ++row) {
ASSERT_EQ(matrix.cols(), values[row].size());
for (int column = 0; column < matrix.cols(); ++column) {
EXPECT_EQ(matrix(row, column), values[row][column]);
}
}
}
// Tests that an Eigen vector map references the same memory as the underlying
// runtime vector.
TEST(EigenTest, Vector) {
UniqueVector<float> vector({1.0, 2.0, 3.0, 4.0});
EigenVectorMap<float> const_eigen_vector = AsEigenMap(Vector<float>(*vector));
ExpectSameAddress(const_eigen_vector.data(), vector->data());
ExpectValues(const_eigen_vector, {{1.0, 2.0, 3.0, 4.0}});
MutableEigenVectorMap<float> mutable_eigen_vector = AsEigenMap(*vector);
ExpectSameAddress(mutable_eigen_vector.data(), vector->data());
ExpectValues(mutable_eigen_vector, {{1.0, 2.0, 3.0, 4.0}});
// Write into the runtime vector and read from the other views.
(*vector)[0] = 10.0;
(*vector)[1] = 20.0;
(*vector)[2] = 30.0;
(*vector)[3] = 40.0;
ExpectValues(const_eigen_vector, {{10.0, 20.0, 30.0, 40.0}});
ExpectValues(mutable_eigen_vector, {{10.0, 20.0, 30.0, 40.0}});
// Write into the mutable Eigen vector and read from the other views.
mutable_eigen_vector << 100.0, 200.0, 300.0, 400.0;
ExpectValues(const_eigen_vector, {{100.0, 200.0, 300.0, 400.0}});
ExpectValues(*vector, {100.0, 200.0, 300.0, 400.0});
}
// Tests that an Eigen matrix map references the same memory as the underlying
// runtime matrix.
TEST(EigenTest, Matrix) {
UniqueMatrix<float> matrix({{1.0, 2.0, 3.0}, //
{4.0, 5.0, 6.0}, //
{7.0, 8.0, 9.0}});
EigenMatrixMap<float> const_eigen_matrix = AsEigenMap(Matrix<float>(*matrix));
ExpectSameAddress(const_eigen_matrix.data(), matrix->row(0).data());
ExpectValues(const_eigen_matrix, {{1.0, 2.0, 3.0}, //
{4.0, 5.0, 6.0}, //
{7.0, 8.0, 9.0}});
MutableEigenMatrixMap<float> mutable_eigen_matrix = AsEigenMap(*matrix);
ExpectSameAddress(mutable_eigen_matrix.data(), matrix->row(0).data());
ExpectValues(mutable_eigen_matrix, {{1.0, 2.0, 3.0}, //
{4.0, 5.0, 6.0}, //
{7.0, 8.0, 9.0}});
// Write into the runtime matrix and read from the other views.
matrix->row(0)[0] = 10.0;
matrix->row(0)[1] = 20.0;
matrix->row(0)[2] = 30.0;
matrix->row(1)[0] = 40.0;
matrix->row(1)[1] = 50.0;
matrix->row(1)[2] = 60.0;
matrix->row(2)[0] = 70.0;
matrix->row(2)[1] = 80.0;
matrix->row(2)[2] = 90.0;
ExpectValues(const_eigen_matrix, {{10.0, 20.0, 30.0}, //
{40.0, 50.0, 60.0}, //
{70.0, 80.0, 90.0}});
ExpectValues(mutable_eigen_matrix, {{10.0, 20.0, 30.0}, //
{40.0, 50.0, 60.0}, //
{70.0, 80.0, 90.0}});
// Write into the mutable Eigen matrix and read from the other views.
mutable_eigen_matrix << 100.0, 200.0, 300.0,
400.0, 500.0, 600.0,
700.0, 800.0, 900.0;
ExpectValues(const_eigen_matrix, {{100.0, 200.0, 300.0}, //
{400.0, 500.0, 600.0}, //
{700.0, 800.0, 900.0}});
ExpectValues(matrix->row(0), {100.0, 200.0, 300.0});
ExpectValues(matrix->row(1), {400.0, 500.0, 600.0});
ExpectValues(matrix->row(2), {700.0, 800.0, 900.0});
}
} // namespace
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
// Declares 16-bit floating point types.
#ifndef DRAGNN_RUNTIME_MATH_FLOAT16_TYPES_H_
#define DRAGNN_RUNTIME_MATH_FLOAT16_TYPES_H_
#if defined(__F16C__)
#include <emmintrin.h>
#endif
#include "syntaxnet/base.h"
#include "tensorflow/core/lib/core/casts.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
// Represents a truncated 16-bit floating point value. This corresponds to
// `bfloat16` in TensorFlow. It just chops the last 16 least-significant bits
// off the significand of a 32-bit floating point value, leaving 7 significand
// bits, 8 exponent bits, and 1 sign bit.
struct TruncatedFloat16 {
// Slow unpacking routine. Use avx_vector_array.h for normal operation.
float DebugToFloat() const {
uint32 upcast = bits;
upcast <<= 16;
return tensorflow::bit_cast<float>(upcast);
}
// Slow packing routine. Use avx_vector_array.h for normal operation.
static TruncatedFloat16 DebugFromFloat(float value) {
uint32 float_bits = tensorflow::bit_cast<uint32>(value);
return TruncatedFloat16{static_cast<uint16>(float_bits >> 16)};
}
uint16 bits;
};
static_assert(sizeof(TruncatedFloat16) == sizeof(uint16), "Bad struct size");
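// For example, 1.0f has bit pattern 0x3F800000, so its truncated form stores
// the upper half 0x3F80, and DebugToFloat() recovers exactly 1.0f.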
// Currently, only CPUs with the F16C instruction set are supported. All use of
// this struct should be flag-guarded.
//
// If this becomes a problem, we can implement this method with Eigen's
// CUDA/Half.h.
#if defined(__F16C__)
// Represents an IEEE-754 16-bit floating point value. This has 10 significand
// bits, 5 exponent bits, and 1 sign bit.
//
// TODO(googleuser): Either add compatibility support, or delete this code if
// it turns out not to be helpful.
struct IeeeFloat16 {
// Slow unpacking routine. Use avx_vector_array.h for normal operation.
float DebugToFloat() const { return _cvtsh_ss(bits); }
// Slow packing routine. Use avx_vector_array.h for normal operation.
static IeeeFloat16 DebugFromFloat(float value) {
return IeeeFloat16{_cvtss_sh(value, 0)};
}
uint16 bits;
};
static_assert(sizeof(IeeeFloat16) == sizeof(uint16), "Bad struct size");
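// For example, 1.0f packs to 0x3C00 (sign 0, biased exponent 15, mantissa 0),
// as exercised in the float16_types test below.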
#endif
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
#endif // DRAGNN_RUNTIME_MATH_FLOAT16_TYPES_H_
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/runtime/math/float16_types.h"
#include "tensorflow/core/platform/test.h"
namespace syntaxnet {
namespace dragnn {
namespace runtime {
namespace {
// C++11 doesn't support binary literals like 0b01001, so add a helper. :(
uint16 ParseBinaryString(const string &bits) {
CHECK_EQ(bits.size(), 16) << "ParseBinaryString expects full 16-bit values";
uint16 value = 0;
for (const char bit : bits) {
CHECK(bit == '0' || bit == '1') << "String must be 0's and 1's.";
value = (value << 1) + (bit == '0' ? 0 : 1);
}
return value;
}
TEST(Float16TypesTest, IeeeFloat16Accuracy) {
#if defined(__F16C__)
bool some_not_exact = false;
for (int i = -100; i < 100; ++i) {
float value = i / 10.0f;
IeeeFloat16 half = IeeeFloat16::DebugFromFloat(value);
float unpacked = half.DebugToFloat();
EXPECT_NEAR(value, unpacked, 0.01);
some_not_exact = some_not_exact || (value != unpacked);
}
EXPECT_TRUE(some_not_exact);
#else
LOG(INFO) << "Test binary wasn't compiled with F16C support, so skipping "
<< "this test.";
#endif
}
TEST(Float16TypesTest, TruncatedAccuracy) {
bool some_not_exact = false;
for (int i = -100; i < 100; ++i) {
float value = i / 10.0f;
TruncatedFloat16 half = TruncatedFloat16::DebugFromFloat(value);
float unpacked = half.DebugToFloat();
EXPECT_NEAR(value, unpacked, 0.06);
some_not_exact = some_not_exact || (value != unpacked);
}
EXPECT_TRUE(some_not_exact);
}
TEST(Float16TypesTest, TruncatedKnownBinaryRepresentation) {
uint16 neg_1 = ParseBinaryString("1011111110000000");
uint16 one = ParseBinaryString("0011111110000000");
EXPECT_EQ((TruncatedFloat16{neg_1}).DebugToFloat(), -1.0f);
EXPECT_EQ((TruncatedFloat16{one}).DebugToFloat(), 1.0f);
}
TEST(Float16TypesTest, IeeeFloat16KnownBinaryRepresentation) {
#if defined(__F16C__)
uint16 neg_1 = ParseBinaryString("1011110000000000");
uint16 one = ParseBinaryString("0011110000000000");
EXPECT_EQ((IeeeFloat16{neg_1}).DebugToFloat(), -1.0f);
EXPECT_EQ((IeeeFloat16{one}).DebugToFloat(), 1.0f);
#else
LOG(INFO) << "Test binary wasn't compiled with F16C support, so skipping "
<< "this test.";
#endif
}
} // namespace
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
// Computes `[y_1, y_2, ...] = M * [v_1, v_2, ...] + [b_1, b_2, ...]`, where
//
// M is a `m x n` dense matrix.
// v_i are `n`-dimensional dense vectors.
// b_i and y_i are `m`-dimensional dense vectors.
//
// Unfortunately even larger (e.g. 128x128) matrix sizes are not sufficient to
// hide the latency of a function call. So the entire implementation needs to
// live in this header file. Please make sure to use all of the optimization
// flags mentioned in the BUILD file in any client libraries.
#ifndef DRAGNN_RUNTIME_MATH_SGEMVV_H_
#define DRAGNN_RUNTIME_MATH_SGEMVV_H_
#if defined(__SSE2__)
#include <xmmintrin.h>
#endif
#include "dragnn/runtime/math/avx_vector_array.h"
#include "dragnn/runtime/math/types.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
#define DRAGNN_SGEMVV_ATTRIBUTE_ALWAYS_INLINE __attribute__((always_inline))
#ifdef __clang__
#define DRAGNN_SGEMVV_GCC_UNROLL
#else
#define DRAGNN_SGEMVV_GCC_UNROLL __attribute__((optimize("unroll-loops")))
#endif
namespace syntaxnet {
namespace dragnn {
namespace runtime {
// Represents `v, b` from one operation `y = M * v + b`.
template <int num_ops>
struct SgemvInputBatch {
const float *input[num_ops];
const float *initial[num_ops];
};
template <int num_ops>
struct SgemvOutputBatch {
float *output[num_ops];
};
// Matrix argument for the SGEMV/SGEMVV operation. Based on row-batched
// column-major matrices, but pulls the batch size into a template argument
// so code can be compiled more efficiently.
template <int sse_batch_size, typename ElementType = float>
class SgemvMatrix final {
public:
// Convenience type alias.
using MatrixType =
BlockedMatrix<ElementType, BlockedMatrixFormat::kRowBlockedColumnMajor>;
// Creates an empty SgemvMatrix.
SgemvMatrix() = default;
// Initializes the new matrix. Returns an InvalidArgumentError if the block
// size of `matrix` is not equal to `sse_batch_size`.
::tensorflow::Status Initialize(const MatrixType &matrix);
// Computes the matrix-vector product with a set of other inputs. See
// top-level comment for the general algorithm.
template <int num_ops, int lookahead_1 = 8, int lookahead_2 = 8>
void DRAGNN_SGEMVV_ATTRIBUTE_ALWAYS_INLINE DRAGNN_SGEMVV_GCC_UNROLL
MatrixMultiVectorProduct(const SgemvInputBatch<num_ops> &inputs,
SgemvOutputBatch<num_ops> *outputs) const {
MatrixMultiVectorProductImpl<num_ops, /*mask_input_output=*/false,
/*read_initial=*/true, lookahead_1,
lookahead_2>(inputs, -1, outputs);
}
// Computes the matrix-vector product with a set of other inputs. See
// top-level comment for the general algorithm. This variant allows another
// parameter, `output_vector_elements`, to write to outputs which are a
// multiple of kAvxWidth (8 floats, or 32 bytes) but not necessarily
// sse_batch_size. It is slightly slower; the slowdown is small but probably
// more than measurement noise.
//
// |lookahead_1| and |lookahead_2| parameters control prefetching, and should
// usually be tuned via a script. They issue prefetch instructions that are
// `lookahead_1 * sse_batch_size` values ahead of the current matrix entry
// being read, if `lookahead_1 != 0` (and `(lookahead_1 + lookahead_2) *
// sse_batch_size` values, if lookahead_2 != 0). To reiterate, all prefetching
// can be disabled by setting |lookahead_1| to 0, or the second prefetch can
// be disabled by setting |lookahead_2| to 0.
template <int num_ops, int lookahead_1 = 8, int lookahead_2 = 8>
void DRAGNN_SGEMVV_ATTRIBUTE_ALWAYS_INLINE DRAGNN_SGEMVV_GCC_UNROLL
MaskedMatrixMultiVectorProduct(const SgemvInputBatch<num_ops> &inputs,
int output_vector_elements,
SgemvOutputBatch<num_ops> *outputs) const {
MatrixMultiVectorProductImpl<num_ops, /*mask_input_output=*/true,
/*read_initial=*/true, lookahead_1,
lookahead_2>(inputs, output_vector_elements,
outputs);
}
// Like the above, but assumes existing values are zero instead of reading
// them.
template <int num_ops>
void DRAGNN_SGEMVV_ATTRIBUTE_ALWAYS_INLINE DRAGNN_SGEMVV_GCC_UNROLL
MaskedMatrixMultiVectorProductNoInitial(
const SgemvInputBatch<num_ops> &inputs, int output_vector_elements,
SgemvOutputBatch<num_ops> *outputs) const {
MatrixMultiVectorProductImpl<num_ops, /*mask_input_output=*/true,
/*read_initial=*/false>(
inputs, output_vector_elements, outputs);
}
// Read-only accessor.
const MatrixType &matrix() const { return matrix_; }
private:
template <int num_ops, bool mask_input_output, bool read_initial,
int lookahead_1 = 8, int lookahead_2 = 8>
DRAGNN_SGEMVV_ATTRIBUTE_ALWAYS_INLINE DRAGNN_SGEMVV_GCC_UNROLL void
MatrixMultiVectorProductImpl(const SgemvInputBatch<num_ops> &inputs,
int output_vector_elements,
SgemvOutputBatch<num_ops> *outputs) const;
MatrixType matrix_;
};
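// Example usage (a minimal sketch; |weights| is a hypothetical
// BlockedMatrix<float, BlockedMatrixFormat::kRowBlockedColumnMajor> with
// block_size() == 48, and |input|, |bias|, and |output| are hypothetical
// aligned float buffers of compatible sizes):
//
//   SgemvMatrix<48> matrix;
//   TF_CHECK_OK(matrix.Initialize(weights));
//   SgemvInputBatch<1> inputs = {{input}, {bias}};
//   SgemvOutputBatch<1> outputs = {{output}};
//   matrix.MatrixMultiVectorProduct<1>(inputs, &outputs);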
// Implementation details.
template <int sse_batch_size, typename ElementType>
template <int num_ops, bool mask_input_output, bool read_initial,
int lookahead_1, int lookahead_2>
inline void DRAGNN_SGEMVV_ATTRIBUTE_ALWAYS_INLINE DRAGNN_SGEMVV_GCC_UNROLL
SgemvMatrix<sse_batch_size, ElementType>::MatrixMultiVectorProductImpl(
const SgemvInputBatch<num_ops> &inputs, int output_vector_elements,
SgemvOutputBatch<num_ops> *outputs) const {
static_assert(sse_batch_size % kAvxWidth == 0,
"sse_batch_size must be a multiple of kAvxWidth (8).");
if (mask_input_output) {
DCHECK_EQ(output_vector_elements % kAvxWidth, 0)
<< "output_vector_elements must be padded to alignment";
}
const ElementType *curr_matrix_ptr = matrix_.vector(0).data();
// Loop over blocks of output rows. Each block of output rows will get a
// partial sum of the [matrix-vector] dot product, where the range of that
// partial sum is designated by start_col and end_col.
for (int row_start = 0; row_start < matrix_.num_rows();
row_start += sse_batch_size) {
const int load_store_max_idx =
(output_vector_elements - row_start) / kAvxWidth;
AvxFloatVecArray<sse_batch_size / kAvxWidth> accumulators[num_ops];
// Read inputs.
for (int op = 0; op < num_ops; ++op) {
if (read_initial) {
if (mask_input_output) {
accumulators[op].Load(&inputs.initial[op][row_start],
load_store_max_idx);
} else {
accumulators[op].Load(&inputs.initial[op][row_start]);
}
} else {
accumulators[op].LoadConstVector(0.0f);
}
}
// Compute matrix-vector product.
for (int col = 0; col < matrix_.num_columns(); ++col) {
if (lookahead_1 != 0) {
#if defined(__SSE2__)
_mm_prefetch(curr_matrix_ptr + lookahead_1 * sse_batch_size,
_MM_HINT_T0);
if (lookahead_2 != 0) {
_mm_prefetch(
curr_matrix_ptr + (lookahead_1 + lookahead_2) * sse_batch_size,
_MM_HINT_T0);
}
#endif
}
// These are the coefficients from each vector at column `col` (just
// broadcast over the whole AVX array).
AvxFloatVec weights[num_ops];
for (int op = 0; op < num_ops; ++op) {
weights[op].LoadConstVector(inputs.input[op][col]);
}
// Loop over each AVX vector and add the current sub-product.
AvxFloatVecArray<sse_batch_size / kAvxWidth> matrix_block;
matrix_block.Load(curr_matrix_ptr);
curr_matrix_ptr += sse_batch_size;
for (int row_offset = 0; row_offset < sse_batch_size / kAvxWidth;
row_offset++) {
for (int op = 0; op < num_ops; ++op) {
accumulators[op].vectors[row_offset].AddProductOf(
weights[op], matrix_block.vectors[row_offset]);
}
}
}
// Save the results.
for (int op = 0; op < num_ops; ++op) {
if (mask_input_output) {
accumulators[op].Store(&outputs->output[op][row_start],
load_store_max_idx);
} else {
accumulators[op].Store(&outputs->output[op][row_start]);
}
}
}
}
template <int sse_batch_size, typename ElementType>
::tensorflow::Status SgemvMatrix<sse_batch_size, ElementType>::Initialize(
const SgemvMatrix<sse_batch_size, ElementType>::MatrixType &matrix) {
if (matrix.block_size() != sse_batch_size) {
return ::tensorflow::errors::InvalidArgument(
"Blocked matrix block_size (", matrix.block_size(),
") must be equal to sse_batch_size (", sse_batch_size, ")");
}
matrix_ = matrix;
return ::tensorflow::Status::OK();
}
} // namespace runtime
} // namespace dragnn
} // namespace syntaxnet
#undef DRAGNN_SGEMVV_ATTRIBUTE_ALWAYS_INLINE
#undef DRAGNN_SGEMVV_GCC_UNROLL
#endif // DRAGNN_RUNTIME_MATH_SGEMVV_H_