Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
ResNet50_tensorflow
Commits
a4bb31d0
Commit
a4bb31d0
authored
May 02, 2018
by
Terry Koo
Browse files
Export @195097388.
parent
dea7ecf6
Changes
294
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
3997 additions
and
0 deletions
+3997
-0
research/syntaxnet/dragnn/runtime/math/arithmetic_avx.h
research/syntaxnet/dragnn/runtime/math/arithmetic_avx.h
+39
-0
research/syntaxnet/dragnn/runtime/math/arithmetic_common.h
research/syntaxnet/dragnn/runtime/math/arithmetic_common.h
+113
-0
research/syntaxnet/dragnn/runtime/math/arithmetic_neon.h
research/syntaxnet/dragnn/runtime/math/arithmetic_neon.h
+39
-0
research/syntaxnet/dragnn/runtime/math/arithmetic_sse.h
research/syntaxnet/dragnn/runtime/math/arithmetic_sse.h
+39
-0
research/syntaxnet/dragnn/runtime/math/arithmetic_test.cc
research/syntaxnet/dragnn/runtime/math/arithmetic_test.cc
+176
-0
research/syntaxnet/dragnn/runtime/math/avx_activation_functions.h
.../syntaxnet/dragnn/runtime/math/avx_activation_functions.h
+167
-0
research/syntaxnet/dragnn/runtime/math/avx_activation_functions_test.cc
...xnet/dragnn/runtime/math/avx_activation_functions_test.cc
+110
-0
research/syntaxnet/dragnn/runtime/math/avx_vector_array.h
research/syntaxnet/dragnn/runtime/math/avx_vector_array.h
+732
-0
research/syntaxnet/dragnn/runtime/math/avx_vector_array_test.cc
...ch/syntaxnet/dragnn/runtime/math/avx_vector_array_test.cc
+198
-0
research/syntaxnet/dragnn/runtime/math/eigen.h
research/syntaxnet/dragnn/runtime/math/eigen.h
+104
-0
research/syntaxnet/dragnn/runtime/math/eigen_test.cc
research/syntaxnet/dragnn/runtime/math/eigen_test.cc
+135
-0
research/syntaxnet/dragnn/runtime/math/float16_types.h
research/syntaxnet/dragnn/runtime/math/float16_types.h
+87
-0
research/syntaxnet/dragnn/runtime/math/float16_types_test.cc
research/syntaxnet/dragnn/runtime/math/float16_types_test.cc
+87
-0
research/syntaxnet/dragnn/runtime/math/sgemvv.h
research/syntaxnet/dragnn/runtime/math/sgemvv.h
+246
-0
research/syntaxnet/dragnn/runtime/math/sgemvv_test.cc
research/syntaxnet/dragnn/runtime/math/sgemvv_test.cc
+409
-0
research/syntaxnet/dragnn/runtime/math/transformations.h
research/syntaxnet/dragnn/runtime/math/transformations.h
+140
-0
research/syntaxnet/dragnn/runtime/math/transformations_test.cc
...rch/syntaxnet/dragnn/runtime/math/transformations_test.cc
+101
-0
research/syntaxnet/dragnn/runtime/math/types.h
research/syntaxnet/dragnn/runtime/math/types.h
+455
-0
research/syntaxnet/dragnn/runtime/math/types_test.cc
research/syntaxnet/dragnn/runtime/math/types_test.cc
+482
-0
research/syntaxnet/dragnn/runtime/mmap.cc
research/syntaxnet/dragnn/runtime/mmap.cc
+138
-0
No files found.
Too many changes to show.
To preserve performance only
294 of 294+
files are displayed.
Plain diff
Email patch
research/syntaxnet/dragnn/runtime/math/arithmetic_avx.h
0 → 100644
View file @
a4bb31d0
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#ifndef DRAGNN_RUNTIME_MATH_ARITHMETIC_AVX_H_
#define DRAGNN_RUNTIME_MATH_ARITHMETIC_AVX_H_

#if defined(__AVX2__)

#include <stddef.h>

#include "dragnn/runtime/math/arithmetic_common.h"
#include "dragnn/runtime/math/types.h"
#include "tensorflow/core/platform/logging.h"

namespace syntaxnet {
namespace dragnn {
namespace runtime {

// TODO(googleuser): Leaving this namespace empty means that the generic
// definitions from arithmetic_common.h carry through.  Provide template
// specializations that use AVX2 intrinsics.

}  // namespace runtime
}  // namespace dragnn
}  // namespace syntaxnet

#endif  // defined(__AVX2__)
#endif  // DRAGNN_RUNTIME_MATH_ARITHMETIC_AVX_H_
research/syntaxnet/dragnn/runtime/math/arithmetic_common.h
0 → 100644
View file @
a4bb31d0
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
// Declarations of arithmetic operations and trivial generic implementations.
// Architecture-specific implementations should include this header and define
// template specializations that override the generic implementations.
#ifndef DRAGNN_RUNTIME_MATH_ARITHMETIC_COMMON_H_
#define DRAGNN_RUNTIME_MATH_ARITHMETIC_COMMON_H_
#include <stddef.h>
#include <algorithm>
#include "dragnn/runtime/math/types.h"
#include "tensorflow/core/platform/logging.h"
namespace
syntaxnet
{
namespace
dragnn
{
namespace
runtime
{
// Performs output = scale * input. Dimensions must match.
template
<
class
T
>
void
ScaleElements
(
Vector
<
T
>
input
,
T
scale
,
MutableVector
<
T
>
output
);
// Performs output += scale * input. Dimensions must match.
template
<
class
T
>
void
AddScaledElements
(
Vector
<
T
>
input
,
T
scale
,
MutableVector
<
T
>
output
);
// Performs values = max(minimum, values) in place.
template
<
class
T
>
void
MaxElements
(
T
minimum
,
MutableVector
<
T
>
values
);
// Performs output = matrix * input. All vectors are interpreted as column
// vectors. Dimensions must match.
template
<
class
T
>
void
MultiplyMatrixAndVector
(
Matrix
<
T
>
matrix
,
Vector
<
T
>
input
,
MutableVector
<
T
>
output
);
// Performs output = bias + matrix * input. All vectors are interpreted as
// column vectors. Dimensions must match.
template
<
class
T
>
void
MultiplyMatrixAndVectorWithBias
(
Matrix
<
T
>
matrix
,
Vector
<
T
>
bias
,
Vector
<
T
>
input
,
MutableVector
<
T
>
output
);
// Implementation details below.

// Writes scale * input[i] into output[i] for every element.  The sizes of
// |input| and |output| must agree (checked in debug builds only).
template <class T>
void ScaleElements(T scale, Vector<T> input, MutableVector<T> output) {
  DCHECK_EQ(input.size(), output.size());
  const size_t num_elements = input.size();
  for (size_t index = 0; index < num_elements; ++index) {
    output[index] = scale * input[index];
  }
}
// Accumulates scale * input[i] into output[i] for every element.  The sizes
// of |input| and |output| must agree (checked in debug builds only).
template <class T>
void AddScaledElements(T scale, Vector<T> input, MutableVector<T> output) {
  DCHECK_EQ(input.size(), output.size());
  const size_t num_elements = input.size();
  for (size_t index = 0; index < num_elements; ++index) {
    output[index] += scale * input[index];
  }
}
template
<
class
T
>
void
MaxElements
(
T
minimum
,
MutableVector
<
T
>
values
)
{
for
(
T
&
value
:
values
)
value
=
std
::
max
(
minimum
,
value
);
}
namespace internal {

// Like MultiplyMatrixAndVectorWithBias(), but if |ignore_bias| is true, then
// the |bias| is treated as zero and its dimensions are not checked.
template <bool ignore_bias, class T>
void MultiplyMatrixAndVectorImpl(Matrix<T> matrix, Vector<T> bias,
                                 Vector<T> input, MutableVector<T> output) {
  DCHECK_EQ(matrix.num_columns(), input.size());
  if (!ignore_bias) DCHECK_EQ(matrix.num_rows(), bias.size());
  DCHECK_EQ(matrix.num_rows(), output.size());

  const size_t num_rows = matrix.num_rows();
  for (size_t row_index = 0; row_index < num_rows; ++row_index) {
    const Vector<T> row = matrix.row(row_index);
    DCHECK_EQ(row.size(), input.size());

    // Seed the accumulator with the bias (or a value-initialized zero when
    // the bias is disabled), then add the row-vector dot product.
    T accumulator = ignore_bias ? T() : bias[row_index];
    for (size_t column = 0; column < row.size(); ++column) {
      accumulator += row[column] * input[column];
    }
    output[row_index] = accumulator;
  }
}

}  // namespace internal
// Computes output = matrix * input by delegating to the shared implementation
// with the bias disabled.  The empty braced argument is never read because
// |ignore_bias| is true.
template <class T>
void MultiplyMatrixAndVector(Matrix<T> matrix, Vector<T> input,
                             MutableVector<T> output) {
  internal::MultiplyMatrixAndVectorImpl<true>(matrix, {}, input, output);
}
// Computes output = bias + matrix * input via the shared implementation, with
// the bias enabled.
template <class T>
void MultiplyMatrixAndVectorWithBias(Matrix<T> matrix, Vector<T> bias,
                                     Vector<T> input, MutableVector<T> output) {
  internal::MultiplyMatrixAndVectorImpl<false>(matrix, bias, input, output);
}
}
// namespace runtime
}
// namespace dragnn
}
// namespace syntaxnet
#endif // DRAGNN_RUNTIME_MATH_ARITHMETIC_COMMON_H_
research/syntaxnet/dragnn/runtime/math/arithmetic_neon.h
0 → 100644
View file @
a4bb31d0
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#ifndef DRAGNN_RUNTIME_MATH_ARITHMETIC_NEON_H_
#define DRAGNN_RUNTIME_MATH_ARITHMETIC_NEON_H_

#if defined(__ARM_NEON) || defined(__ARM_NEON__)

#include <stddef.h>

#include "dragnn/runtime/math/arithmetic_common.h"
#include "dragnn/runtime/math/types.h"
#include "tensorflow/core/platform/logging.h"

namespace syntaxnet {
namespace dragnn {
namespace runtime {

// TODO(googleuser): Leaving this namespace empty means that the generic
// definitions from arithmetic_common.h carry through.  Provide template
// specializations that use NEON intrinsics.

}  // namespace runtime
}  // namespace dragnn
}  // namespace syntaxnet

#endif  // defined(__ARM_NEON) || defined(__ARM_NEON__)
#endif  // DRAGNN_RUNTIME_MATH_ARITHMETIC_NEON_H_
research/syntaxnet/dragnn/runtime/math/arithmetic_sse.h
0 → 100644
View file @
a4bb31d0
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#ifndef DRAGNN_RUNTIME_MATH_ARITHMETIC_SSE_H_
#define DRAGNN_RUNTIME_MATH_ARITHMETIC_SSE_H_

#if defined(__SSE4_2__)

#include <stddef.h>

#include "dragnn/runtime/math/arithmetic_common.h"
#include "dragnn/runtime/math/types.h"
#include "tensorflow/core/platform/logging.h"

namespace syntaxnet {
namespace dragnn {
namespace runtime {

// TODO(googleuser): Leaving this namespace empty means that the generic
// definitions from arithmetic_common.h carry through.  Provide template
// specializations that use SSE intrinsics.

}  // namespace runtime
}  // namespace dragnn
}  // namespace syntaxnet

#endif  // defined(__SSE4_2__)
#endif  // DRAGNN_RUNTIME_MATH_ARITHMETIC_SSE_H_
research/syntaxnet/dragnn/runtime/math/arithmetic_test.cc
0 → 100644
View file @
a4bb31d0
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/runtime/math/arithmetic.h"
#include <stddef.h>
#include <vector>
#include "dragnn/runtime/math/types.h"
#include "dragnn/runtime/test/helpers.h"
#include "tensorflow/core/platform/test.h"
namespace
syntaxnet
{
namespace
dragnn
{
namespace
runtime
{
namespace
{
// Tests that ScaleElements() doesn't crash on empty vectors.
TEST(ScaleElementsTest, Empty) {
  Vector<float> input;
  MutableVector<float> output;
  ScaleElements(1.5f, input, output);
}

// Tests that ScaleElements() copies scaled values from one vector to another.
TEST(ScaleElementsTest, Populated) {
  UniqueVector<float> input({-2.0f, -3.0f, 5.0f});
  UniqueVector<float> output({7.0f, 11.0f, 13.0f});  // gets overwritten

  ScaleElements(1.5f, Vector<float>(*input), *output);

  EXPECT_EQ((*output)[0], 1.5 * -2.0);
  EXPECT_EQ((*output)[1], 1.5 * -3.0);
  EXPECT_EQ((*output)[2], 1.5 * 5.0);
}
// Tests that AddScaledElements() doesn't crash on empty vectors.
TEST(AddScaledElementsTest, Empty) {
  Vector<float> input;
  MutableVector<float> output;
  AddScaledElements(1.5f, input, output);
}

// Tests that AddScaledElements() adds scaled values from one vector to
// another.
TEST(AddScaledElementsTest, Populated) {
  UniqueVector<float> input({-2.0f, -3.0f, 5.0f});
  UniqueVector<float> output({7.0f, 11.0f, 13.0f});  // gets added to

  AddScaledElements(1.5f, Vector<float>(*input), *output);

  EXPECT_EQ((*output)[0], 1.5 * -2.0 + 7.0);
  EXPECT_EQ((*output)[1], 1.5 * -3.0 + 11.0);
  EXPECT_EQ((*output)[2], 1.5 * 5.0 + 13.0);
}
// Tests that MaxElements() doesn't crash on empty vectors.
TEST(MaxElementsTest, Empty) {
  MutableVector<float> values;
  MaxElements(1.5f, values);
}

// Tests that MaxElements() performs an in-place element-wise maximum.
TEST(MaxElementsTest, Populated) {
  UniqueVector<float> values({-1.0f, 2.0f, 0.25f, -0.5f, 0.375f});

  MaxElements(0.125f, *values);

  EXPECT_EQ((*values)[0], 0.125);
  EXPECT_EQ((*values)[1], 2.0);
  EXPECT_EQ((*values)[2], 0.25);
  EXPECT_EQ((*values)[3], 0.125);
  EXPECT_EQ((*values)[4], 0.375);
}
// Tests that MultiplyMatrixAndVector() doesn't crash on empty inputs.
TEST(MultiplyMatrixAndVectorTest, Empty) {
  Matrix<float> matrix;
  Vector<float> input;
  MutableVector<float> output;
  MultiplyMatrixAndVector(matrix, input, output);
}

// Tests that MultiplyMatrixAndVector() computes a matrix-vector product.
TEST(MultiplyMatrixAndVectorTest, Populated) {
  UniqueMatrix<float> matrix({{2.0f, 3.0f},    //
                              {5.0f, 7.0f},    //
                              {11.0f, 13.0f}});
  UniqueVector<float> input({-0.5f, 2.0f});
  UniqueVector<float> output({9.8f, 7.6f, 5.4f});  // gets overwritten

  MultiplyMatrixAndVector(Matrix<float>(*matrix), Vector<float>(*input),
                          *output);

  EXPECT_EQ((*output)[0], 2.0 * -0.5 + 3.0 * 2.0);
  EXPECT_EQ((*output)[1], 5.0 * -0.5 + 7.0 * 2.0);
  EXPECT_EQ((*output)[2], 11.0 * -0.5 + 13.0 * 2.0);
}
// Tests that MultiplyMatrixAndVectorWithBias() doesn't crash on empty inputs.
TEST(MultiplyMatrixAndVectorWithBiasTest, Empty) {
  Matrix<float> matrix;
  Vector<float> bias;
  Vector<float> input;
  MutableVector<float> output;
  MultiplyMatrixAndVectorWithBias(matrix, bias, input, output);
}

// Tests that MultiplyMatrixAndVectorWithBias() computes a matrix-vector
// product with an additive bias.
TEST(MultiplyMatrixAndVectorWithBiasTest, Populated) {
  UniqueMatrix<float> matrix({{2.0f, 3.0f},    //
                              {5.0f, 7.0f},    //
                              {11.0f, 13.0f}});
  UniqueVector<float> bias({100.5f, 200.25f, 300.75f});
  UniqueVector<float> input({-0.5f, 2.0f});
  UniqueVector<float> output({9.8f, 7.6f, 5.4f});  // gets overwritten

  MultiplyMatrixAndVectorWithBias(Matrix<float>(*matrix), Vector<float>(*bias),
                                  Vector<float>(*input), *output);

  EXPECT_EQ((*output)[0], 100.5 + 2.0 * -0.5 + 3.0 * 2.0);
  EXPECT_EQ((*output)[1], 200.25 + 5.0 * -0.5 + 7.0 * 2.0);
  EXPECT_EQ((*output)[2], 300.75 + 11.0 * -0.5 + 13.0 * 2.0);
}
// A dummy type for the specializations below.  Specializing on this unique
// dummy type ensures we don't conflict with any existing specialization.
struct Foo {
  float value;
};

}  // namespace

// Dummy specialization for use in the subsequent tests.  It ignores its
// inputs entirely and stamps a sentinel value over the output.
template <>
void ScaleElements(Foo scale, Vector<Foo> input, MutableVector<Foo> output) {
  for (Foo &element : output) {
    element.value = 777.0;
  }
}
namespace {

// Tests that the template specialization overrides the generic implementation.
TEST(ScaleElementsTest, OverriddenByTemplateSpecialization) {
  // These values are uninitialized, but it doesn't matter because the
  // specialization never looks at them.
  UniqueVector<Foo> input(3);
  UniqueVector<Foo> output(3);

  ScaleElements(Foo(), Vector<Foo>(*input), *output);

  EXPECT_EQ((*output)[0].value, 777.0);
  EXPECT_EQ((*output)[1].value, 777.0);
  EXPECT_EQ((*output)[2].value, 777.0);
}

}  // namespace
}
// namespace runtime
}
// namespace dragnn
}
// namespace syntaxnet
research/syntaxnet/dragnn/runtime/math/avx_activation_functions.h
0 → 100644
View file @
a4bb31d0
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
// Contains logic for activation functions and more-complex elementwise
// vectorized operations.
//
// Uses operator overloading to express computation that looks like regular
// code. Currently, overloaded operators are scoped away in an "internal"
// namespace so they won't be accidentally used.
#ifndef DRAGNN_RUNTIME_MATH_AVX_ACTIVATION_FUNCTIONS_H_
#define DRAGNN_RUNTIME_MATH_AVX_ACTIVATION_FUNCTIONS_H_
#if defined(__AVX2__)
#include <immintrin.h>
#endif
#include "dragnn/runtime/math/avx_vector_array.h"
#define DRAGNN_AVXAF_ATTRIBUTE_ALWAYS_INLINE __attribute__((always_inline))
#ifdef __clang__
#define DRAGNN_AVXAF_GCC_UNROLL
#else
#define DRAGNN_AVXAF_GCC_UNROLL __attribute__((optimize("unroll-loops")))
#endif
namespace
syntaxnet
{
namespace
dragnn
{
namespace
runtime
{
// Public API.
namespace activations {

// Calculates elementwise exp(x).
inline AvxFloatVec DRAGNN_AVXAF_ATTRIBUTE_ALWAYS_INLINE DRAGNN_AVXAF_GCC_UNROLL
Exponential(AvxFloatVec x);

// Calculates elementwise sigmoid(x) = 1/(1+exp(-x)).
inline AvxFloatVec DRAGNN_AVXAF_ATTRIBUTE_ALWAYS_INLINE Sigmoid(AvxFloatVec x);

// Calculates elementwise tanh(x).
inline AvxFloatVec DRAGNN_AVXAF_ATTRIBUTE_ALWAYS_INLINE Tanh(AvxFloatVec x);

}  // namespace activations
namespace activations {

// Calculates e^x by representing x = m * ln(2) + r.  It does a polynomial
// expansion of e^r, and then multiplies in e^(m * ln(2)) = 2^m.
inline AvxFloatVec Exponential(AvxFloatVec x) {
  // EDSL-like helpers for writing vectorized code.
  auto Const = AvxFloatVec::Const;

  // Saturation bounds: beyond these, single-precision exp() underflows to
  // zero / overflows to infinity.
  constexpr float explo = -88.3762626647949f;
  constexpr float exphi = 88.3762626647950f;
  // Degree-6 polynomial coefficients for e^r on the reduced range, highest
  // degree first (Cephes-style minimax coefficients).
  const float cephes_exp_factors[] = {
      1.9875691500e-4f, 1.3981999507e-3f, 8.3334519073e-3f,
      4.1665795894e-2f, 1.6666665459e-1f, 5.0000001201e-1f,
  };

  // Clamp the input. i.e. assume exp(-88) is close to zero and exp(88) is
  // close to infinity.
  x.Clamp(explo, exphi);

  // Calculate `m = floor(x/ln(2) + 0.5)`.
  constexpr float inv_log2e = 1.44269504088896341f;
  AvxFloatVec m = Const(0.5f);
  m += Const(inv_log2e) * x;  // FMA: m = 0.5 + x/ln(2)
  m.Floor();

  // Calculate `r = x - m*ln(2)` (see function-level comment).
  constexpr float neg_ln2 = -0.6931471805599453f;
  AvxFloatVec r = x;
  r += m * Const(neg_ln2);  // FMA: r = x + m * (-ln(2))

  // Calculate a polynomial expansion of y = exp(r), via Horner's scheme.
  AvxFloatVec r_squared(r * r);
  AvxFloatVec y = Const(cephes_exp_factors[0]);
  for (int i = 1; i < 6; ++i) {
    y = y * r + Const(cephes_exp_factors[i]);
  }
  y = y * r_squared + r;
  y += Const(1.0f);

  // Calculate `emm0 = 2^m`. This is done by converting emm0 into an integer,
  // and shifting it into the exponent bits of the desired floating-point
  // result. Recall that the exponent is unsigned with 127 representing 2^0.
  AvxFloatVec emm0 = m;
  emm0 += Const(127.0f);
  AvxIntVec emm0_i(emm0);
  emm0_i.LeftShift(23);  // 23 = number of mantissa bits in an IEEE float

  // The final result is `2^m * exp(r)`.
  return AvxFloatVec(emm0_i.ReinterpretCastFloat() * y);
}
// Calculates elementwise tanh(x) as a rational approximation p(x) / q(x),
// where p and q are polynomials in x^2 (p additionally carries a factor of x
// so that the result is odd, like tanh).
inline AvxFloatVec Tanh(AvxFloatVec x) {
  // EDSL-like helpers for writing vectorized code.
  auto Const = AvxFloatVec::Const;

  // Numerator polynomial coefficients, highest degree first.
  const float numerator_coefficients[] = {
      -2.76076847742355e-16f, 2.00018790482477e-13f, -8.60467152213735e-11f,
      5.12229709037114e-08f,  1.48572235717979e-05f, 6.37261928875436e-04f,
      4.89352455891786e-03f,
  };
  // Denominator polynomial coefficients, highest degree first.
  const float denominator_coefficients[] = {
      1.19825839466702e-06f, 1.18534705686654e-04f, 2.26843463243900e-03f,
      4.89352518554385e-03f,
  };

  // Clamp the inputs to the range [-9, 9] since anything outside this range
  // is +/-1.0 in single-precision.
  x.Clamp(-9.0f, 9.0f);

  // Compute x^2.
  AvxFloatVec x_squared(x * x);

  // Compute the numerator polynomial via Horner's scheme.
  AvxFloatVec p = Const(numerator_coefficients[0]);
  for (int i = 1; i < 7; ++i) {
    // p = p * x^2 + numerator_coefficients[i]
    p = p * x_squared + Const(numerator_coefficients[i]);
  }
  // p = p * x, making the numerator an odd function of x.
  p = AvxFloatVec(p * x);

  // Compute the denominator polynomial via Horner's scheme.
  AvxFloatVec q = Const(denominator_coefficients[0]);
  for (int i = 1; i < 4; ++i) {
    // q = q * x^2 + denominator_coefficients[i]
    q = q * x_squared + Const(denominator_coefficients[i]);
  }

  // Divide the numerator by the denominator.
  return p / q;
}
// Computes sigmoid(x) through the identity
// sigmoid(x) = 0.5 * tanh(0.5 * x) + 0.5, reusing the Tanh approximation.
inline AvxFloatVec Sigmoid(AvxFloatVec x) {
  const AvxFloatVec one_half = AvxFloatVec::Const(0.5);
  AvxFloatVec scaled(one_half * x);
  return one_half * Tanh(scaled) + one_half;
}

}  // namespace activations
}
// namespace runtime
}
// namespace dragnn
}
// namespace syntaxnet
#undef DRAGNN_AVXAF_ATTRIBUTE_ALWAYS_INLINE
#undef DRAGNN_AVXAF_GCC_UNROLL
#endif // DRAGNN_RUNTIME_MATH_AVX_ACTIVATION_FUNCTIONS_H_
research/syntaxnet/dragnn/runtime/math/avx_activation_functions_test.cc
0 → 100644
View file @
a4bb31d0
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/runtime/math/avx_activation_functions.h"
#include <cmath>
#include <chrono>
#include "dragnn/runtime/test/helpers.h"
#include "syntaxnet/base.h"
#include "tensorflow/core/platform/test.h"
namespace
syntaxnet
{
namespace
dragnn
{
namespace
runtime
{
namespace
{
// Fuzzes Exponential() and checks it by inverting with log(): for each input
// x, log(Exponential(x)) should recover x to within tolerance.
TEST(AvxActivationFunctionsTest, ExponentialTest) {
  AvxVectorFuzzTest(
      [](AvxFloatVec *vec) { *vec = activations::Exponential(*vec); },
      [](float input_value, float actual) {
        const float inverted = log(actual);
        EXPECT_NEAR(input_value, inverted, 1e-6)
            << "exp(" << input_value << ") = " << actual
            << ", log(actual) = " << inverted;
      });
}
// Fuzzes Sigmoid() against the scalar reference 1/(1+exp(-x)).
TEST(AvxActivationFunctionsTest, SigmoidTest) {
  AvxVectorFuzzTest(  //
      [](AvxFloatVec *vec) { *vec = activations::Sigmoid(*vec); },
      [](float input_value, float actual) {
        const float expected = 1.0f / (1.0f + exp(-input_value));
        EXPECT_NEAR(actual, expected, 1e-6)
            << "sigmoid(" << input_value << ") = " << actual
            << ", expected = " << expected;
      });
}
template
<
int
batch_size
,
class
Function
>
void
RunPerformanceTest
(
Function
activation
,
int
flops
)
{
constexpr
uint64
kIterations
=
1000000
;
UniqueVector
<
float
>
input
(
batch_size
);
UniqueVector
<
float
>
output
(
batch_size
);
InitRandomVector
(
*
input
);
InitRandomVector
(
*
output
);
AvxFloatVecArray
<
batch_size
/
kAvxWidth
>
array
;
auto
start_time
=
std
::
chrono
::
system_clock
::
now
();
for
(
int
i
=
0
;
i
<
kIterations
;
++
i
)
{
array
.
Load
(
input
->
data
());
array
.
Apply
(
activation
);
array
.
Store
(
output
->
data
());
}
auto
end_time
=
std
::
chrono
::
system_clock
::
now
();
std
::
chrono
::
duration
<
double
>
elapsed_seconds
=
end_time
-
start_time
;
double
elapsed
=
elapsed_seconds
.
count
();
double
exp_ops
=
kIterations
*
batch_size
;
double
macro_gops
=
exp_ops
/
1e9
/
elapsed
;
VLOG
(
0
)
<<
"For batch_size "
<<
batch_size
<<
" macro-GOPS (giga-ops per sec): "
<<
macro_gops
<<
", raw arithmetic: "
<<
flops
*
macro_gops
;
}
// Benchmarks Sigmoid() across a range of batch sizes; 26 is the approximate
// flop count per evaluation.
TEST(AvxActivationFunctionsTest, SigmoidPerformanceTest) {
  RunPerformanceTest<8>(activations::Sigmoid, 26);
  RunPerformanceTest<16>(activations::Sigmoid, 26);
  RunPerformanceTest<32>(activations::Sigmoid, 26);
  RunPerformanceTest<48>(activations::Sigmoid, 26);
  RunPerformanceTest<64>(activations::Sigmoid, 26);
  RunPerformanceTest<128>(activations::Sigmoid, 26);
}
// Fuzzes Tanh() against the scalar std::tanh reference.
TEST(AvxActivationFunctionsTest, TanhTest) {
  AvxVectorFuzzTest([](AvxFloatVec *vec) { *vec = activations::Tanh(*vec); },
                    [](float input_value, float actual) {
                      const float expected = tanh(input_value);
                      EXPECT_NEAR(actual, expected, 1e-6)
                          << "tanh(" << input_value << ") = " << actual
                          << ", expected = " << expected;
                    });
}
// Benchmarks Tanh() across a range of batch sizes; 23 is the approximate flop
// count per evaluation.
//
// FIX: the original benchmarked activations::Sigmoid for batch sizes 8 and
// 16 — a copy-paste slip from SigmoidPerformanceTest — so the reported
// numbers for those sizes were not Tanh throughput at all.  All batch sizes
// now benchmark Tanh.
TEST(AvxActivationFunctionsTest, TanhPerformanceTest) {
  RunPerformanceTest<8>(activations::Tanh, 23);
  RunPerformanceTest<16>(activations::Tanh, 23);
  RunPerformanceTest<32>(activations::Tanh, 23);
  RunPerformanceTest<48>(activations::Tanh, 23);
  RunPerformanceTest<64>(activations::Tanh, 23);
  RunPerformanceTest<128>(activations::Tanh, 23);
}
}
// namespace
}
// namespace runtime
}
// namespace dragnn
}
// namespace syntaxnet
research/syntaxnet/dragnn/runtime/math/avx_vector_array.h
0 → 100644
View file @
a4bb31d0
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
// Wraps AVX vectors into convenient helper classes. This contains a class
// wrapping a single AVX register, AvxFloatVec, and a class to manipulate a
// batch of registers, AvxFloatVecArray. Use of the latter is recommended where
// applicable, since it will be unrolled into more vectorizable code.
#ifndef DRAGNN_RUNTIME_MATH_AVX_VECTOR_ARRAY_H_
#define DRAGNN_RUNTIME_MATH_AVX_VECTOR_ARRAY_H_
#include <cmath>
#if defined(__AVX__)
#include <immintrin.h>
#elif defined(__SSE4_2__)
#include <nmmintrin.h>
#endif
#include "dragnn/runtime/math/float16_types.h"
#define DRAGNN_AVXVA_ALWAYS_INLINE inline __attribute__((always_inline))
#ifdef __clang__
// Clang doesn't support __attribute__((optimize(...))).
#define DRAGNN_AVXVA_INLINED_UNROLLED inline __attribute__((always_inline))
#else
// Assume we're using GCC, which does.
#define DRAGNN_AVXVA_INLINED_UNROLLED \
inline __attribute__((always_inline)) \
__attribute__((optimize("unroll-loops")))
#endif
namespace
syntaxnet
{
namespace
dragnn
{
namespace
runtime
{
// Number of single-precision floating point numbers that fit into a single
// SSE / AVX2 register (which are 128 and 256 bits wide, respectively).
constexpr int kSseWidth = 128 / 32;               // = 4
constexpr int kAvxWidth = 256 / 32;               // = 8
// Corresponding lane counts for half-precision (16-bit) values.
constexpr int kSseWidthHalfPrecision = 128 / 16;  // = 8
constexpr int kAvxWidthHalfPrecision = 256 / 16;  // = 16

class AvxFloatVec;
namespace internal {

// This struct should always be eliminated by the compiler; it only exists so
// we can write `foo += bar * baz`, and have that compiled into a single FMA
// operation.
struct AvxMultiplyExpr {
  const AvxFloatVec &a;
  const AvxFloatVec &b;
};

}  // namespace internal

// Allows EDSL-like programming with AVX vectors.
inline internal::AvxMultiplyExpr operator*(const AvxFloatVec &a,
                                           const AvxFloatVec &b);
inline AvxFloatVec operator+(const internal::AvxMultiplyExpr &expr,
                             const AvxFloatVec &v);
inline AvxFloatVec operator+(const AvxFloatVec &a, const AvxFloatVec &b);
inline AvxFloatVec operator/(const AvxFloatVec &a, const AvxFloatVec &b);
inline AvxFloatVec operator-(const AvxFloatVec &a, const AvxFloatVec &b);
// API over a single AVX vector (register). The implementation will either use
// a real AVX vector, or a fixed array of floats for compatibility.
//
// Note that we include the "inline" directive in declarations, not just
// definitions, because it is necessary for the "always_inline" directive.
struct AvxFloatVec {
 public:
  // Leaves the vector uninitialized.
  AvxFloatVec() {}

  // Evaluates an AvxMultiplyExpr intermediary without adding anything. This is
  // not an implicit cast, because typically when we write `a * b` we want to
  // add it to something and use an FMA operation.
  explicit AvxFloatVec(const internal::AvxMultiplyExpr &expr);

  // Loads from an aligned region of memory.
  inline void Load(const float *source);

  // Loads a constant value (broadcast into every lane).
  inline void LoadConstVector(const float val);

  // Stores to an aligned region of memory.
  inline void Store(float *dst) const;

  // Adds `a * b` to this value, using a fused multiply-add operation.
  inline void AddProductOf(const AvxFloatVec &a, const AvxFloatVec &b);

  // Element-wise floor.
  inline void Floor();

  // Element-wise clamps values between a min and max value.
  inline void Clamp(const float min_value, const float max_value);

  // Convenience method for more complex calculations: returns a vector with
  // every lane set to |value|.
  static DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec Const(const float value) {
    AvxFloatVec result;
    result.LoadConstVector(value);
    return result;
  }

  // Syntactic sugar for computing an FMA operation.
  inline AvxFloatVec &operator+=(const internal::AvxMultiplyExpr &to_add);

  // Adds another vector element-wise.
  inline AvxFloatVec &operator+=(const AvxFloatVec &vec);

  // Subtracts another vector element-wise.
  inline AvxFloatVec &operator-=(const AvxFloatVec &vec);

  // Divides another vector element-wise.
  inline AvxFloatVec &operator/=(const AvxFloatVec &vec);

  // Underlying storage: one AVX register, two SSE registers, or a plain
  // float array for the portable fallback.
#if defined(__AVX__)
  __m256 ymm;
#elif defined(__SSE4_2__)
  __m128 xmm[2];
#else
  float ymm[8];
#endif
};
// Small wrapper around integer AVX vectors, exposing only methods we need for
// implementing the activation functions.
//
// As above, `inline` is specified here for the always_inline directive.
class AvxIntVec {
 public:
  // Constructs an AVX integer vector, by converting (truncating) the
  // floating-point values in |v|.
  inline explicit AvxIntVec(const AvxFloatVec &v);

  // Left-shifts integer values.
  inline void LeftShift(int bits);

  // Reinterprets the register as a floating-point register, for bitwise
  // tricks (the bits are preserved, not converted).
  inline AvxFloatVec ReinterpretCastFloat();

 private:
  // Underlying register(s); mirrors the storage variants of AvxFloatVec.
#if defined(__AVX__)
  __m256i ymm_;
#elif defined(__SSE4_2__)
  __m128i xmm_[2];
#else
  int ymm_[8];
#endif
};
// Implements the index permutation that is effectively applied by the
// _mm256_unpack instructions. Equivalent to swapping bit 2 and bit 3 of the
// index; see the PermutationFunctionIsEqualToTable test for the full table.
//
// We haven't done performance testing, but hopefully this is sufficiently fast
// for the compatibility routine; post-unrolling, the compiler should see a
// constant argument and fold the whole computation away.
DRAGNN_AVXVA_ALWAYS_INLINE int FastUnpackPermutation(int original_idx) {
  // Non-zero (== 0b1000) exactly when bits 2 and 3 of the index differ, i.e.
  // when the two bits need to be exchanged.
  const int swap_flag = (original_idx + /* 0b0100 */ 4) & /* 0b1000 */ 8;
  // 0b1100 when swapping, 0 otherwise; XOR-ing with it flips 01 <-> 10 in the
  // two affected bit positions and leaves other indices untouched.
  const int swap_mask = swap_flag | (swap_flag >> 1);
  return original_idx ^ swap_mask;
}
// API over an array of N AVX vectors (registers), i.e. 8*N floats. The methods
// on this class are annotated such that the compiler should unroll them.
template <int N>
struct AvxFloatVecArray {
 public:
  // Loads 8*N floats from the aligned region |source|.
  DRAGNN_AVXVA_INLINED_UNROLLED void Load(const float *source) {
    for (int i = 0; i < N; i++) {
      vectors[i].Load(source + 8 * i);
    }
  }

  // Loads only the first |max_idx| vectors (8*max_idx floats) from |source|;
  // vectors at or beyond |max_idx| are left untouched (see below).
  DRAGNN_AVXVA_INLINED_UNROLLED void Load(const float *source, int max_idx) {
    for (int i = 0; i < N; i++) {
      if (i < max_idx) {
        vectors[i].Load(source + 8 * i);
      } else {
        // When testing with a memory sanitizer, we make sure not to read
        // uninitialized values. This is usually safe in normal operation
        // because such results are never stored (via corresponding
        // store-masking logic), but of course each algorithm must be tested to
        // ensure correct operation.
        //
        // It is also worth pointing out that exceptional values (NaN, etc.) can
        // slow down AVX/FMA floating point operations considerably. So we
        // should investigate whether this is worth enabling in all cases (and
        // forcing algorithms to provide a default).
#if defined(MEMORY_SANITIZER)
        vectors[i].LoadConstVector(0);
#endif
      }
    }
  }

  // Reads and unpacks truncated half-precision values.
  //
  // Currently, only matrix coefficients use compressed/half-precision values,
  // so it's not yet necessary to support max_idx masking (which will get a bit
  // more complicated).
  DRAGNN_AVXVA_INLINED_UNROLLED void Load(const TruncatedFloat16 *source);

#if defined(__F16C__)
  // Reads and unpacks IEEE-754 half-precision values.
  //
  // Currently, only matrix coefficients use compressed/half-precision values,
  // so it's not yet necessary to support max_idx masking (which will get a bit
  // more complicated).
  //
  // TODO(googleuser): Either add non-F16C compatibility support from Eigen,
  // or delete this code if it turns out not to be helpful.
  DRAGNN_AVXVA_INLINED_UNROLLED void Load(const IeeeFloat16 *source);
#endif

  // Loads |val| into every lane of every vector.
  DRAGNN_AVXVA_INLINED_UNROLLED void LoadConstVector(const float val) {
    for (int i = 0; i < N; i++) {
      vectors[i].LoadConstVector(val);
    }
  }

  // Stores 8*N floats to the aligned region |dst|.
  DRAGNN_AVXVA_INLINED_UNROLLED void Store(float *dst) {
    for (int i = 0; i < N; i++) {
      vectors[i].Store(dst + 8 * i);
    }
  }

  // Stores only the first |max_idx| vectors to |dst|.
  DRAGNN_AVXVA_INLINED_UNROLLED void Store(float *dst, int max_idx) {
    for (int i = 0; i < N; i++) {
      // This is equivalent to writing `i < N && i < max_idx` above, but forces
      // the compiler to produce more efficient code (it's still creating jump
      // instructions, but the branching is probably more predictable, and the
      // loops are unrolled). In the future we could switch to VMASKMOV if
      // necessary.
      if (i < max_idx) {
        vectors[i].Store(dst + 8 * i);
      }
    }
  }

  // Replaces each vector with fcn(vector); |fcn| maps AvxFloatVec to
  // AvxFloatVec.
  template <class Function>
  DRAGNN_AVXVA_INLINED_UNROLLED void Apply(const Function &fcn) {
    for (int i = 0; i < N; i++) {
      vectors[i] = fcn(vectors[i]);
    }
  }

  // The underlying vectors, directly accessible by algorithms.
  AvxFloatVec vectors[N];
};
// Implementation details.
#if defined(__AVX__)
// ===== AVX implementations (one 256-bit register per AvxFloatVec). =====

// Materializes `a * b` via a plain vector multiply (no accumulation).
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec::AvxFloatVec(
    const internal::AvxMultiplyExpr &expr) {
  ymm = _mm256_mul_ps(expr.a.ymm, expr.b.ymm);
}

// Aligned 8-float load.
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Load(const float *source) {
  ymm = _mm256_load_ps(source);
}

// Broadcasts |val| to all lanes.
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::LoadConstVector(const float val) {
  ymm = _mm256_set1_ps(val);
}

// Aligned 8-float store.
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Store(float *dst) const {
  _mm256_store_ps(dst, ymm);
}

// Accumulates `a * b`; uses a true FMA instruction when available, otherwise
// falls back to an explicit multiply followed by an add.
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::AddProductOf(
    const AvxFloatVec &a, const AvxFloatVec &b) {
#if defined(__AVX2__) && defined(__FMA__)
  ymm = _mm256_fmadd_ps(a.ymm, b.ymm, ymm);
#else
  *this += AvxFloatVec(a * b);
#endif
}

// Element-wise floor.
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Floor() {
  ymm = _mm256_floor_ps(ymm);
}

// Clamp via min-with-max-bound then max-with-min-bound.
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Clamp(const float min_value,
                                                   const float max_value) {
  ymm = _mm256_min_ps(ymm, Const(max_value).ymm);
  ymm = _mm256_max_ps(ymm, Const(min_value).ymm);
}

// Element-wise addition.
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator+=(
    const AvxFloatVec &vec) {
  ymm = _mm256_add_ps(vec.ymm, ymm);
  return *this;
}

// Element-wise subtraction.
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator-=(
    const AvxFloatVec &vec) {
  ymm = _mm256_sub_ps(ymm, vec.ymm);
  return *this;
}

// Element-wise division.
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator/=(
    const AvxFloatVec &vec) {
  ymm = _mm256_div_ps(ymm, vec.ymm);
  return *this;
}

// Truncating float-to-int conversion of each lane.
DRAGNN_AVXVA_ALWAYS_INLINE AvxIntVec::AvxIntVec(const AvxFloatVec &v)
    : ymm_(_mm256_cvttps_epi32(v.ymm)) {}

// Per-lane left shift. AVX1 has no 256-bit integer shift, so the fallback
// splits into two SSE registers, shifts each, and reassembles.
DRAGNN_AVXVA_ALWAYS_INLINE void AvxIntVec::LeftShift(int bits) {
#if defined(__AVX2__)
  ymm_ = _mm256_slli_epi32(ymm_, bits);
#else
  // Convert to SSE and back again. This is pretty slow, so don't use this code
  // except for compatibility purposes.
  __m256i upper_bits = _mm256_permute2f128_si256(ymm_, ymm_, 1);
  __m128i first = _mm256_castsi256_si128(ymm_);  // Lower bits as SSE
  __m128i second = _mm256_castsi256_si128(upper_bits);  // Upper bits as SSE
  first = _mm_slli_epi32(first, bits);
  second = _mm_slli_epi32(second, bits);
  ymm_ = _mm256_permute2f128_si256(_mm256_castsi128_si256(first),
                                   _mm256_castsi128_si256(second), (2 << 4));
#endif
}

// Bitwise reinterpretation of the integer register as floats (no conversion).
AvxFloatVec DRAGNN_AVXVA_ALWAYS_INLINE AvxIntVec::ReinterpretCastFloat() {
  AvxFloatVec result;
  result.ymm = _mm256_castsi256_ps(ymm_);
  return result;
}

// Unpacks 16 truncated half-floats per iteration into two full AvxFloatVecs.
// Widening is done by zero-filling the low 16 bits of each float (unpack with
// a zero register), which restores the truncated mantissa bits as zeros. Note
// the unpack instructions interleave within 128-bit halves, which applies the
// FastUnpackPermutation index shuffle; the encoder is expected to have stored
// coefficients in that permuted order.
template <int N>
DRAGNN_AVXVA_INLINED_UNROLLED void AvxFloatVecArray<N>::Load(
    const TruncatedFloat16 *source) {
  static_assert(N % 2 == 0,
                "Load() from half floats requires even-sized vector arrays.");
  for (int i = 0; i < N / 2; i++) {
#if defined(__AVX2__)
    const __m256i input = _mm256_load_si256(reinterpret_cast<__m256i const *>(
        source + kAvxWidthHalfPrecision * i));
    vectors[2 * i].ymm = _mm256_castsi256_ps(
        _mm256_unpacklo_epi16(_mm256_setzero_si256(), input));
    vectors[2 * i + 1].ymm = _mm256_castsi256_ps(
        _mm256_unpackhi_epi16(_mm256_setzero_si256(), input));
#else
    // Compatibility AVX (not AVX2) implementation.
    __m128i input[2];
    input[0] = _mm_load_si128(reinterpret_cast<__m128i const *>(
        source + kAvxWidthHalfPrecision * i));
    input[1] = _mm_load_si128(reinterpret_cast<__m128i const *>(
        source + kAvxWidthHalfPrecision * i + kSseWidthHalfPrecision));
    // Unpack. This permutation is kinda cryptic and, to be honest, derived by
    // simply trying many combinations.
    vectors[2 * i].ymm = _mm256_insertf128_ps(
        _mm256_castps128_ps256(_mm_castsi128_ps(
            _mm_unpacklo_epi16(_mm_setzero_si128(), input[0]))),
        _mm_castsi128_ps(_mm_unpacklo_epi16(_mm_setzero_si128(), input[1])),
        1);
    vectors[2 * i + 1].ymm = _mm256_insertf128_ps(
        _mm256_castps128_ps256(_mm_castsi128_ps(
            _mm_unpackhi_epi16(_mm_setzero_si128(), input[0]))),
        _mm_castsi128_ps(_mm_unpackhi_epi16(_mm_setzero_si128(), input[1])),
        1);
#endif
  }
}

#if defined(__F16C__)
// Unpacks 16 IEEE half-floats per iteration using the F16C conversion
// instruction; each _mm256_cvtph_ps converts 8 halfs into one full vector.
template <int N>
DRAGNN_AVXVA_INLINED_UNROLLED void AvxFloatVecArray<N>::Load(
    const IeeeFloat16 *source) {
  static_assert(N % 2 == 0,
                "Load() from half floats requires even-sized vector arrays.");
  for (int i = 0; i < N / 2; i++) {
    // TODO(googleuser): Experiment with doing a single AVX2 load and
    // dividing the result.
    __m128i first_half = _mm_load_si128(reinterpret_cast<__m128i const *>(
        source + kAvxWidthHalfPrecision * i));
    __m128i second_half = _mm_load_si128(reinterpret_cast<__m128i const *>(
        source + kAvxWidthHalfPrecision * i + kAvxWidth));
    vectors[2 * i].ymm = _mm256_cvtph_ps(first_half);
    vectors[2 * i + 1].ymm = _mm256_cvtph_ps(second_half);
  }
}
#endif
#elif defined(__SSE4_2__)
// ===== SSE4.2 implementations (two 128-bit registers per AvxFloatVec). =====

// Materializes `a * b` via two SSE multiplies.
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec::AvxFloatVec(
    const internal::AvxMultiplyExpr &expr) {
  for (int i = 0; i < 2; ++i) {
    xmm[i] = _mm_mul_ps(expr.a.xmm[i], expr.b.xmm[i]);
  }
}

// Aligned load of 8 floats as two 4-float halves.
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Load(const float *source) {
  for (int i = 0; i < 2; ++i) {
    xmm[i] = _mm_load_ps(&source[i * kSseWidth]);
  }
}

// Broadcasts |val| into both halves.
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::LoadConstVector(const float val) {
  xmm[1] = xmm[0] = _mm_set1_ps(val);
}

// Aligned store of both halves.
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Store(float *dst) const {
  for (int i = 0; i < 2; ++i) {
    _mm_store_ps(&dst[i * kSseWidth], xmm[i]);
  }
}

// No FMA on SSE4.2: multiply then add.
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::AddProductOf(
    const AvxFloatVec &a, const AvxFloatVec &b) {
  *this += AvxFloatVec(a * b);
}

// Element-wise floor, per half.
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Floor() {
  for (int i = 0; i < 2; ++i) {
    xmm[i] = _mm_floor_ps(xmm[i]);
  }
}

// Clamp via min-with-max-bound then max-with-min-bound, per half.
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Clamp(const float min_value,
                                                   const float max_value) {
  for (int i = 0; i < 2; ++i) {
    xmm[i] = _mm_min_ps(xmm[i], Const(max_value).xmm[i]);
    xmm[i] = _mm_max_ps(xmm[i], Const(min_value).xmm[i]);
  }
}

// Element-wise addition, per half.
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator+=(
    const AvxFloatVec &vec) {
  for (int i = 0; i < 2; ++i) {
    xmm[i] = _mm_add_ps(vec.xmm[i], xmm[i]);
  }
  return *this;
}

// Element-wise subtraction, per half.
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator-=(
    const AvxFloatVec &vec) {
  for (int i = 0; i < 2; ++i) {
    xmm[i] = _mm_sub_ps(xmm[i], vec.xmm[i]);
  }
  return *this;
}

// Element-wise division, per half.
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator/=(
    const AvxFloatVec &vec) {
  for (int i = 0; i < 2; ++i) {
    xmm[i] = _mm_div_ps(xmm[i], vec.xmm[i]);
  }
  return *this;
}

// Truncating float-to-int conversion of each half.
DRAGNN_AVXVA_ALWAYS_INLINE AvxIntVec::AvxIntVec(const AvxFloatVec &v) {
  xmm_[0] = _mm_cvttps_epi32(v.xmm[0]);
  xmm_[1] = _mm_cvttps_epi32(v.xmm[1]);
}

// Per-lane left shift, per half.
DRAGNN_AVXVA_ALWAYS_INLINE void AvxIntVec::LeftShift(int bits) {
  for (int i = 0; i < 2; ++i) {
    xmm_[i] = _mm_slli_epi32(xmm_[i], bits);
  }
}

// Bitwise reinterpretation of the integer halves as floats (no conversion).
AvxFloatVec DRAGNN_AVXVA_ALWAYS_INLINE AvxIntVec::ReinterpretCastFloat() {
  AvxFloatVec result;
  for (int i = 0; i < 2; ++i) {
    result.xmm[i] = _mm_castsi128_ps(xmm_[i]);
  }
  return result;
}

// Unpacks 16 truncated half-floats per iteration. Zero-interleaving restores
// the low mantissa bits as zeros; the unpacklo/unpackhi lane split matches the
// FastUnpackPermutation ordering used by the AVX implementation above.
template <int N>
DRAGNN_AVXVA_INLINED_UNROLLED void AvxFloatVecArray<N>::Load(
    const TruncatedFloat16 *source) {
  static_assert(N % 2 == 0,
                "Load() from half floats requires even-sized vector arrays.");
  for (int i = 0; i < N / 2; i++) {
    __m128i input[2];
    input[0] = _mm_load_si128(reinterpret_cast<__m128i const *>(
        source + kAvxWidthHalfPrecision * i));
    input[1] = _mm_load_si128(reinterpret_cast<__m128i const *>(
        source + kAvxWidthHalfPrecision * i + kSseWidthHalfPrecision));
    vectors[2 * i].xmm[0] = _mm_castsi128_ps(
        _mm_unpacklo_epi16(_mm_setzero_si128(), input[0]));
    vectors[2 * i + 1].xmm[0] = _mm_castsi128_ps(
        _mm_unpackhi_epi16(_mm_setzero_si128(), input[0]));
    vectors[2 * i].xmm[1] = _mm_castsi128_ps(
        _mm_unpacklo_epi16(_mm_setzero_si128(), input[1]));
    vectors[2 * i + 1].xmm[1] = _mm_castsi128_ps(
        _mm_unpackhi_epi16(_mm_setzero_si128(), input[1]));
  }
}

#if defined(__F16C__)
// Unpacks 16 IEEE half-floats per iteration. Each _mm_cvtph_ps converts the
// low 4 halfs of a register; the shuffle moves the upper 4 halfs into the low
// 64 bits so they can be converted for the second SSE half.
template <int N>
DRAGNN_AVXVA_INLINED_UNROLLED void AvxFloatVecArray<N>::Load(
    const IeeeFloat16 *source) {
  static_assert(N % 2 == 0,
                "Load() from half floats requires even-sized vector arrays.");
  for (int i = 0; i < N / 2; i++) {
    __m128i first_half = _mm_load_si128(reinterpret_cast<__m128i const *>(
        source + kAvxWidthHalfPrecision * i));
    __m128i second_half = _mm_load_si128(reinterpret_cast<__m128i const *>(
        source + kAvxWidthHalfPrecision * i + kAvxWidth));
    vectors[2 * i].xmm[0] = _mm_cvtph_ps(first_half);
    vectors[2 * i + 1].xmm[0] = _mm_cvtph_ps(second_half);
    first_half = _mm_shuffle_epi32(first_half, _MM_SHUFFLE(0, 1, 3, 2));
    second_half = _mm_shuffle_epi32(second_half, _MM_SHUFFLE(0, 1, 3, 2));
    vectors[2 * i].xmm[1] = _mm_cvtph_ps(first_half);
    vectors[2 * i + 1].xmm[1] = _mm_cvtph_ps(second_half);
  }
}
#endif
#else
// Compatibility implementations. If you compile with -ftree-vectorize and
// -msse2 flags, you should still get decent performance (maybe 1/4 of the
// AVX/FMA version).
//
// See the class above for method documentation. Here `ymm` is a plain array
// of 8 floats (or 8 ints for AvxIntVec).
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec::AvxFloatVec(
    const internal::AvxMultiplyExpr &expr) {
  for (int i = 0; i < 8; i++) {
    ymm[i] = expr.a.ymm[i] * expr.b.ymm[i];
  }
}

// Scalar copy-in of 8 floats.
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Load(const float *source) {
  for (int i = 0; i < 8; i++) {
    ymm[i] = source[i];
  }
}

// Fills all 8 lanes with |val|.
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::LoadConstVector(const float val) {
  for (int i = 0; i < 8; i++) {
    ymm[i] = val;
  }
}

// Scalar copy-out of 8 floats.
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Store(float *dst) const {
  for (int i = 0; i < 8; i++) {
    dst[i] = ymm[i];
  }
}

// Scalar multiply-accumulate per lane.
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::AddProductOf(
    const AvxFloatVec &a, const AvxFloatVec &b) {
  for (int i = 0; i < 8; i++) {
    ymm[i] += a.ymm[i] * b.ymm[i];
  }
}

// Scalar floor per lane.
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Floor() {
  for (int i = 0; i < 8; i++) {
    ymm[i] = floor(ymm[i]);
  }
}

// Scalar clamp per lane.
DRAGNN_AVXVA_ALWAYS_INLINE void AvxFloatVec::Clamp(const float min_value,
                                                   const float max_value) {
  for (int i = 0; i < 8; i++) {
    ymm[i] = fmin(fmax(ymm[i], min_value), max_value);
  }
}

// Scalar element-wise addition.
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator+=(
    const AvxFloatVec &vec) {
  for (int i = 0; i < 8; i++) {
    ymm[i] += vec.ymm[i];
  }
  return *this;
}

// Scalar element-wise subtraction.
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator-=(
    const AvxFloatVec &vec) {
  for (int i = 0; i < 8; i++) {
    ymm[i] -= vec.ymm[i];
  }
  return *this;
}

// Scalar element-wise division.
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator/=(
    const AvxFloatVec &vec) {
  for (int i = 0; i < 8; i++) {
    ymm[i] /= vec.ymm[i];
  }
  return *this;
}

// Truncating float-to-int conversion per lane.
DRAGNN_AVXVA_ALWAYS_INLINE AvxIntVec::AvxIntVec(const AvxFloatVec &v) {
  for (int i = 0; i < 8; i++) {
    ymm_[i] = static_cast<int>(v.ymm[i]);
  }
}

// Scalar left shift per lane.
DRAGNN_AVXVA_ALWAYS_INLINE void AvxIntVec::LeftShift(int bits) {
  for (int i = 0; i < 8; i++) {
    ymm_[i] = ymm_[i] << bits;
  }
}

// Bitwise reinterpretation of each int lane as a float.
// NOTE(review): reinterpret_cast-based type punning is technically UB in
// strict C++ (memcpy/bit_cast would be well-defined); presumably relied upon
// to compile as intended on the supported toolchains — confirm.
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec AvxIntVec::ReinterpretCastFloat() {
  AvxFloatVec result;
  for (int i = 0; i < 8; i++) {
    result.ymm[i] = reinterpret_cast<float &>(ymm_[i]);
  }
  return result;
}

// Scalar unpack of truncated half-floats, applying FastUnpackPermutation so
// that the in-memory (permuted) coefficient order matches what the intrinsic
// unpack instructions would produce.
template <int N>
DRAGNN_AVXVA_INLINED_UNROLLED void AvxFloatVecArray<N>::Load(
    const TruncatedFloat16 *source) {
  static_assert(N % 2 == 0,
                "Load() from half floats requires even-sized vector arrays.");
  // Iterate through mock AVX vectors, each composed of 16 half-floats.
  for (int vec_idx = 0; vec_idx < N / 2; vec_idx++) {
    // Making this code a bit more verbose, by reading in-order to a temporary
    // array, results in faster performance. The compatibility version is still
    // pretty slow though.
    TruncatedFloat16 tmp[16];
    for (int i = 0; i < kAvxWidthHalfPrecision; ++i) {
      tmp[i] = source[i + kAvxWidthHalfPrecision * vec_idx];
    }
    float unpacked[16];
    for (int i = 0; i < kAvxWidthHalfPrecision; ++i) {
      unpacked[i] = tmp[i].DebugToFloat();
    }
    for (int i = 0; i < kAvxWidthHalfPrecision; ++i) {
      int permuted = FastUnpackPermutation(i);
      vectors[2 * vec_idx + (i / 8)].ymm[i % 8] = unpacked[permuted];
    }
  }
}

#if defined(__F16C__)
// Scalar unpack of IEEE half-floats (no permutation needed in this format).
template <int N>
DRAGNN_AVXVA_INLINED_UNROLLED void AvxFloatVecArray<N>::Load(
    const IeeeFloat16 *source) {
  // Not actually required for the compatibility implementation, but it'd be
  // rather non-uniform if this API succeeded, and then compilation failed when
  // AVX2 was turned on.
  static_assert(N % 2 == 0,
                "Load() from half floats requires even-sized vector arrays.");
  // Iterate through mock AVX vectors, each composed of 16 half-floats.
  for (int i = 0; i < N * kAvxWidth; ++i) {
    vectors[i / 8].ymm[i % 8] = source[i].DebugToFloat();
  }
}
#endif
#endif
// The following operations are mostly syntax sugar, so they do not need
// architecture-specific implementations.
// Folds a pending `a * b` product into this vector; lowers to AddProductOf(),
// which uses a fused multiply-add where the hardware provides one.
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec &AvxFloatVec::operator+=(
    const internal::AvxMultiplyExpr &product) {
  this->AddProductOf(product.a, product.b);
  return *this;
}
// Builds a lazy multiply expression; evaluation is deferred so that a
// following `+=` can fuse the multiply and add into one operation.
DRAGNN_AVXVA_ALWAYS_INLINE internal::AvxMultiplyExpr operator*(
    const AvxFloatVec &lhs, const AvxFloatVec &rhs) {
  internal::AvxMultiplyExpr pending{lhs, rhs};
  return pending;
}
// Evaluates `a * b + v`, routing through the FMA-capable `+=` overload.
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec operator+(
    const internal::AvxMultiplyExpr &expr, const AvxFloatVec &v) {
  AvxFloatVec accumulator = v;
  accumulator += expr;
  return accumulator;
}
// Element-wise sum of two vectors.
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec operator+(const AvxFloatVec &lhs,
                                                 const AvxFloatVec &rhs) {
  AvxFloatVec sum = lhs;
  sum += rhs;
  return sum;
}
// Element-wise quotient of two vectors.
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec operator/(const AvxFloatVec &numerator,
                                                 const AvxFloatVec &denominator) {
  AvxFloatVec quotient = numerator;
  quotient /= denominator;
  return quotient;
}
// Element-wise difference of two vectors.
DRAGNN_AVXVA_ALWAYS_INLINE AvxFloatVec operator-(const AvxFloatVec &lhs,
                                                 const AvxFloatVec &rhs) {
  AvxFloatVec difference = lhs;
  difference -= rhs;
  return difference;
}
}
// namespace runtime
}
// namespace dragnn
}
// namespace syntaxnet
#undef DRAGNN_AVXVA_ALWAYS_INLINE
#undef DRAGNN_AVXVA_INLINED_UNROLLED
#endif // DRAGNN_RUNTIME_MATH_AVX_VECTOR_ARRAY_H_
research/syntaxnet/dragnn/runtime/math/avx_vector_array_test.cc
0 → 100644
View file @
a4bb31d0
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/runtime/math/avx_vector_array.h"
#include <cmath>
#include "dragnn/runtime/test/helpers.h"
#include "tensorflow/core/platform/test.h"
namespace
syntaxnet
{
namespace
dragnn
{
namespace
runtime
{
namespace
{
// Verifies that Load() followed by Store() round-trips 8 floats exactly.
TEST(AvxVectorTest, LoadAndStore) {
  UniqueVector<float> input(kAvxWidth);
  UniqueVector<float> output(kAvxWidth);
  InitRandomVector(*input);
  InitRandomVector(*output);  // pre-fill so a no-op Store would be detected.
  AvxFloatVec vec;
  vec.Load(input->data());
  vec.Store(output->data());
  for (int i = 0; i < kAvxWidth; ++i) {
    EXPECT_EQ((*input)[i], (*output)[i]);
  }
}

// Test flooring with assignment, just to make the compiler not erase aliases:
// a copied vector must retain the pre-Floor() values.
TEST(AvxVectorTest, AssignmentAndFloor) {
  UniqueVector<float> input(kAvxWidth);
  UniqueVector<float> output(kAvxWidth);
  UniqueVector<float> floored(kAvxWidth);
  InitRandomVector(*input);
  InitRandomVector(*output);
  AvxFloatVec vec;
  vec.Load(input->data());
  AvxFloatVec vec2 = vec;  // copy taken before flooring.
  vec.Floor();
  vec.Store(floored->data());
  vec2.Store(output->data());
  for (int i = 0; i < kAvxWidth; ++i) {
    EXPECT_EQ((*input)[i], (*output)[i]);
    EXPECT_EQ(floor((*input)[i]), (*floored)[i]);
  }
}

// Fuzz-tests Clamp() against the scalar fmax/fmin equivalent.
TEST(AvxVectorTest, ClampTest) {
  bool modified = false;  // check that some value was clamped.
  AvxVectorFuzzTest(
      [](AvxFloatVec *vec) { vec->Clamp(-0.314f, 0.314f); },
      [&modified](float input_value, float output_value) {
        modified = modified || input_value < -0.314 || input_value > 0.314;
        EXPECT_EQ(fmax(-0.314f, fmin(0.314f, input_value)), output_value);
      });
  EXPECT_TRUE(modified)
      << "No values fell outside test range for ClampTest().";
}

// Verifies that LoadConstVector() broadcasts the constant to every lane.
TEST(AvxVectorTest, LoadConstAndStore) {
  UniqueVector<float> output(kAvxWidth);
  InitRandomVector(*output);
  AvxFloatVec vec;
  vec.LoadConstVector(3.14f);
  vec.Store(output->data());
  for (int i = 0; i < kAvxWidth; ++i) {
    EXPECT_EQ((*output)[i], 3.14f);
  }
}
// Fuzz-tests `vec += vec` (self-addition doubles every lane).
TEST(AvxVectorTest, AddTest) {
  AvxVectorFuzzTest(  //
      [](AvxFloatVec *vec) { (*vec) += *vec; },
      [](float input_value, float output_value) {
        EXPECT_EQ(input_value * 2, output_value);
      });
}

// Fuzz-tests `vec -= 1.0f` per lane.
TEST(AvxVectorTest, SubtractTest) {
  AvxVectorFuzzTest(
      [](AvxFloatVec *vec) {
        AvxFloatVec one;
        one.LoadConstVector(1.0f);
        (*vec) -= one;
      },
      [](float input_value, float output_value) {
        EXPECT_EQ(input_value - 1.0f, output_value);
      });
}

// Fuzz-tests division by computing the reciprocal `1.0f / vec` per lane.
TEST(AvxVectorTest, DivideTest) {
  AvxVectorFuzzTest(
      [](AvxFloatVec *vec) {
        AvxFloatVec result;
        result.LoadConstVector(1.0f);
        result /= *vec;
        *vec = result;
      },
      [](float input_value, float output_value) {
        EXPECT_EQ(1.0f / input_value, output_value);
      });
}
// This is a really basic test; half of the purpose is to ensure that the float
// API is still OK (i.e. compiles) for odd-sized arrays. If you try to add a
// call to array.Load(TruncatedFloat16 *source), it should produce a compiler
// error (the half-float loaders static_assert on even N).
TEST(AvxFloatVecArrayTest, SingletonArrayLoadsAndStores) {
  AvxFloatVecArray<1> array;
  UniqueVector<float> input(kAvxWidth);
  UniqueVector<float> output(kAvxWidth);
  InitRandomVector(*input);
  InitRandomVector(*output);
  array.Load(input->data());
  array.Store(output->data());
  for (int i = 0; i < kAvxWidth; ++i) {
    EXPECT_EQ((*input)[i], (*output)[i]);
  }
}

// Verifies that loading truncated half-floats undoes the unpack permutation:
// values are written in permuted order and must be read back in order.
TEST(AvxFloatVecArrayTest, LoadTruncatedFloat16) {
  AvxFloatVecArray<2> array;
  UniqueVector<TruncatedFloat16> values(2 * kAvxWidth);
  UniqueVector<float> decompressed(2 * kAvxWidth);
  for (int i = 0; i < 2 * kAvxWidth; ++i) {
    int permuted = FastUnpackPermutation(i);
    (*values)[i] = TruncatedFloat16::DebugFromFloat(permuted / 10.0);
  }
  // Ensure that state persisted from other tests won't cause this test to
  // erroneously pass.
  array.LoadConstVector(-1.0f);
  array.Load(values->data());
  array.Store(decompressed->data());
  for (int i = 0; i < 2 * kAvxWidth; ++i) {
    ASSERT_NEAR((*decompressed)[i], i / 10.0, 0.01);
  }
}

// Verifies round-trip of IEEE half-floats (F16C builds only; no permutation
// is involved for this format).
TEST(AvxFloatVecArrayTest, LoadIeeeFloat16) {
#if defined(__F16C__)
  AvxFloatVecArray<2> array;
  UniqueVector<IeeeFloat16> values(2 * kAvxWidth);
  UniqueVector<float> decompressed(2 * kAvxWidth);
  for (int i = 0; i < 2 * kAvxWidth; ++i) {
    (*values)[i] = IeeeFloat16::DebugFromFloat(i / 10.0);
  }
  // Ensure that state persisted from other tests won't cause this test to
  // erroneously pass.
  array.LoadConstVector(-1.0f);
  array.Load(values->data());
  array.Store(decompressed->data());
  for (int i = 0; i < 2 * kAvxWidth; ++i) {
    ASSERT_NEAR((*decompressed)[i], i / 10.0, 0.01);
  }
#else
  LOG(INFO) << "Test binary wasn't compiled with F16C support, so skipping "
            << "this test.";
#endif
}

// Checks FastUnpackPermutation() against the explicit 16-entry table.
TEST(AvxFloatVecArrayTest, PermutationFunctionIsEqualToTable) {
  std::vector<int> permutation = {0, 1, 2,  3,  8,  9,  10, 11,
                                  4, 5, 6,  7,  12, 13, 14, 15};
  for (int i = 0; i < kAvxWidthHalfPrecision; ++i) {
    EXPECT_EQ(FastUnpackPermutation(i), permutation[i]);
  }
}
}
// namespace
}
// namespace runtime
}
// namespace dragnn
}
// namespace syntaxnet
research/syntaxnet/dragnn/runtime/math/eigen.h
0 → 100644
View file @
a4bb31d0
// Copyright 2018 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
// Compatibility support for Eigen.
#ifndef DRAGNN_RUNTIME_MATH_EIGEN_H_
#define DRAGNN_RUNTIME_MATH_EIGEN_H_
#include "dragnn/runtime/alignment.h"
#include "dragnn/runtime/math/types.h"
#include "third_party/eigen3/Eigen/Core"
namespace
syntaxnet
{
namespace
dragnn
{
namespace
runtime
{
namespace
internal
{
// Returns a combination of bit-options for Eigen matrices. Runtime matrices
// are row-major, so the Eigen wrappers must be too.
constexpr int GetEigenMatrixOptions() {
  return Eigen::AutoAlign | Eigen::RowMajor;
}

// Returns a combination of bit-options for Eigen maps of runtime types. The
// static_assert guarantees that runtime buffers satisfy Eigen's alignment
// requirement, so the maps may claim Eigen::Aligned.
constexpr int GetEigenMapOptions() {
  static_assert(kAlignmentBytes >= EIGEN_MAX_ALIGN_BYTES,
                "Runtime alignment is not compatible with Eigen alignment.");
  return Eigen::Aligned;
}

// Eigen matrix and (row) vector types. Don't use these directly; instead use
// the public Map types and functions below to wrap runtime types.
template <class T>
using EigenVector =
    Eigen::Matrix<T, 1, Eigen::Dynamic, GetEigenMatrixOptions()>;
template <class T>
using EigenMatrix = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic,
                                  GetEigenMatrixOptions()>;

// Eigen stride for matrix types: dynamic outer (row) stride, contiguous
// elements within a row.
using EigenMatrixStride = Eigen::Stride<Eigen::Dynamic, 1>;

// Returns the Eigen stride associated with the |matrix|.
template <class T>
EigenMatrixStride GetEigenMatrixStride(MatrixImpl<T> matrix) {
  return EigenMatrixStride(matrix.row_stride(), 1);
}
}
// namespace internal
// Eigen wrappers around a runtime-allocated matrix or (row) vector. The const
// variants view the data read-only; the Mutable variants allow writes. All of
// them alias the runtime buffer — no data is copied.
template <class T>
using EigenVectorMap = Eigen::Map<const internal::EigenVector<T>,
                                  internal::GetEigenMapOptions()>;
template <class T>
using MutableEigenVectorMap =
    Eigen::Map<internal::EigenVector<T>, internal::GetEigenMapOptions()>;
template <class T>
using EigenMatrixMap =
    Eigen::Map<const internal::EigenMatrix<T>, internal::GetEigenMapOptions(),
               internal::EigenMatrixStride>;
template <class T>
using MutableEigenMatrixMap =
    Eigen::Map<internal::EigenMatrix<T>, internal::GetEigenMapOptions(),
               internal::EigenMatrixStride>;

// Returns an Eigen wrapper around the |vector| or |matrix|.
template <class T>
EigenVectorMap<T> AsEigenMap(Vector<T> vector) {
  return EigenVectorMap<T>(vector.data(), vector.size());
}
template <class T>
MutableEigenVectorMap<T> AsEigenMap(MutableVector<T> vector) {
  return MutableEigenVectorMap<T>(vector.data(), vector.size());
}
template <class T>
EigenMatrixMap<T> AsEigenMap(Matrix<T> matrix) {
  return EigenMatrixMap<T>(matrix.data(), matrix.num_rows(),
                           matrix.num_columns(),
                           internal::GetEigenMatrixStride(matrix));
}
template <class T>
MutableEigenMatrixMap<T> AsEigenMap(MutableMatrix<T> matrix) {
  return MutableEigenMatrixMap<T>(matrix.data(), matrix.num_rows(),
                                  matrix.num_columns(),
                                  internal::GetEigenMatrixStride(matrix));
}
}
// namespace runtime
}
// namespace dragnn
}
// namespace syntaxnet
#endif // DRAGNN_RUNTIME_MATH_EIGEN_H_
research/syntaxnet/dragnn/runtime/math/eigen_test.cc
0 → 100644
View file @
a4bb31d0
// Copyright 2018 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/runtime/math/eigen.h"
#include <vector>
#include "dragnn/core/test/generic.h"
#include "dragnn/runtime/math/types.h"
#include "dragnn/runtime/test/helpers.h"
#include "tensorflow/core/platform/test.h"
namespace
syntaxnet
{
namespace
dragnn
{
namespace
runtime
{
namespace
{
// Expects that two pointers point to the same address (i.e. the Eigen map
// aliases the runtime buffer rather than copying it).
void ExpectSameAddress(const void *ptr1, const void *ptr2) {
  EXPECT_EQ(ptr1, ptr2);
}
// Expects that the |vector| has the |values|, element by element.
void ExpectValues(MutableVector<float> vector,
                  const std::vector<float> &values) {
  ASSERT_EQ(vector.size(), values.size());
  for (int i = 0; i < values.size(); ++i) {
    EXPECT_EQ(vector[i], values[i]);
  }
}
// Expects that the Eigen |matrix| has the |values|, cell by cell. Templated so
// it accepts both const and mutable Eigen map types.
template <class EigenMatrix>
void ExpectValues(const EigenMatrix &matrix,
                  const std::vector<std::vector<float>> &values) {
  ASSERT_EQ(matrix.rows(), values.size());
  for (int row = 0; row < matrix.rows(); ++row) {
    ASSERT_EQ(matrix.cols(), values[row].size());
    for (int column = 0; column < matrix.cols(); ++column) {
      EXPECT_EQ(matrix(row, column), values[row][column]);
    }
  }
}
// Tests that an Eigen vector map references the same memory as the underlying
// runtime vector: writes through any view are visible through all views.
TEST(EigenTest, Vector) {
  UniqueVector<float> vector({1.0, 2.0, 3.0, 4.0});
  EigenVectorMap<float> const_eigen_vector =
      AsEigenMap(Vector<float>(*vector));
  ExpectSameAddress(const_eigen_vector.data(), vector->data());
  ExpectValues(const_eigen_vector, {{1.0, 2.0, 3.0, 4.0}});
  MutableEigenVectorMap<float> mutable_eigen_vector = AsEigenMap(*vector);
  ExpectSameAddress(mutable_eigen_vector.data(), vector->data());
  ExpectValues(mutable_eigen_vector, {{1.0, 2.0, 3.0, 4.0}});

  // Write into the runtime vector and read from the other views.
  (*vector)[0] = 10.0;
  (*vector)[1] = 20.0;
  (*vector)[2] = 30.0;
  (*vector)[3] = 40.0;
  ExpectValues(const_eigen_vector, {{10.0, 20.0, 30.0, 40.0}});
  ExpectValues(mutable_eigen_vector, {{10.0, 20.0, 30.0, 40.0}});

  // Write into the mutable Eigen vector and read from the other views.
  mutable_eigen_vector << 100.0, 200.0, 300.0, 400.0;
  ExpectValues(const_eigen_vector, {{100.0, 200.0, 300.0, 400.0}});
  ExpectValues(*vector, {100.0, 200.0, 300.0, 400.0});
}
// Tests that an Eigen matrix map references the same memory as the underlying
// runtime vector.
TEST(EigenTest, Matrix) {
  UniqueMatrix<float> matrix({{1.0, 2.0, 3.0},  //
                              {4.0, 5.0, 6.0},  //
                              {7.0, 8.0, 9.0}});

  // A const Eigen view over the runtime matrix; must alias its storage.
  // NOTE(review): the address comparison uses row(0).data(), which presumably
  // is the start of the underlying storage — confirm against Matrix's layout.
  EigenMatrixMap<float> const_eigen_matrix =
      AsEigenMap(Matrix<float>(*matrix));
  ExpectSameAddress(const_eigen_matrix.data(), matrix->row(0).data());
  ExpectValues(const_eigen_matrix, {{1.0, 2.0, 3.0},  //
                                    {4.0, 5.0, 6.0},  //
                                    {7.0, 8.0, 9.0}});

  // A mutable Eigen view over the same storage.
  MutableEigenMatrixMap<float> mutable_eigen_matrix = AsEigenMap(*matrix);
  ExpectSameAddress(mutable_eigen_matrix.data(), matrix->row(0).data());
  ExpectValues(mutable_eigen_matrix, {{1.0, 2.0, 3.0},  //
                                      {4.0, 5.0, 6.0},  //
                                      {7.0, 8.0, 9.0}});

  // Write into the runtime matrix and read from the other views.
  matrix->row(0)[0] = 10.0;
  matrix->row(0)[1] = 20.0;
  matrix->row(0)[2] = 30.0;
  matrix->row(1)[0] = 40.0;
  matrix->row(1)[1] = 50.0;
  matrix->row(1)[2] = 60.0;
  matrix->row(2)[0] = 70.0;
  matrix->row(2)[1] = 80.0;
  matrix->row(2)[2] = 90.0;
  ExpectValues(const_eigen_matrix, {{10.0, 20.0, 30.0},  //
                                    {40.0, 50.0, 60.0},  //
                                    {70.0, 80.0, 90.0}});
  ExpectValues(mutable_eigen_matrix, {{10.0, 20.0, 30.0},  //
                                      {40.0, 50.0, 60.0},  //
                                      {70.0, 80.0, 90.0}});

  // Write into the mutable Eigen matrix and read from the other views.
  mutable_eigen_matrix << 100.0, 200.0, 300.0, 400.0, 500.0, 600.0, 700.0,
      800.0, 900.0;
  ExpectValues(const_eigen_matrix, {{100.0, 200.0, 300.0},  //
                                    {400.0, 500.0, 600.0},  //
                                    {700.0, 800.0, 900.0}});
  ExpectValues(matrix->row(0), {100.0, 200.0, 300.0});
  ExpectValues(matrix->row(1), {400.0, 500.0, 600.0});
  ExpectValues(matrix->row(2), {700.0, 800.0, 900.0});
}
}
// namespace
}
// namespace runtime
}
// namespace dragnn
}
// namespace syntaxnet
research/syntaxnet/dragnn/runtime/math/float16_types.h
0 → 100644
View file @
a4bb31d0
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
// Declares 16-bit floating point types.
#ifndef DRAGNN_RUNTIME_MATH_FLOAT16_TYPES_H_
#define DRAGNN_RUNTIME_MATH_FLOAT16_TYPES_H_
#if defined(__F16C__)
#include <emmintrin.h>
#endif
#include "syntaxnet/base.h"
#include "tensorflow/core/lib/core/casts.h"
namespace
syntaxnet
{
namespace
dragnn
{
namespace
runtime
{
// Represents a truncated 16-bit floating point value. This corresponds to
// `bfloat16` in TensorFlow. It just chops the last 16 least-significant bits
// off the significand of a 32-bit floating point value, leaving 7 significand
// bits, 8 exponent bits, and 1 sign bit.
struct
TruncatedFloat16
{
// Slow unpacking routine. Use avx_vector_array.h for normal operation.
float
DebugToFloat
()
const
{
uint32
upcast
=
bits
;
upcast
<<=
16
;
return
tensorflow
::
bit_cast
<
float
>
(
upcast
);
}
// Slow packing routine. Use avx_vector_array.h for normal operation.
static
TruncatedFloat16
DebugFromFloat
(
float
value
)
{
uint32
float_bits
=
tensorflow
::
bit_cast
<
uint32
>
(
value
);
return
TruncatedFloat16
{
static_cast
<
uint16
>
(
float_bits
>>
16
)};
}
uint16
bits
;
};
static_assert
(
sizeof
(
TruncatedFloat16
)
==
sizeof
(
uint16
),
"Bad struct size"
);
// Currently, only CPUs with the F16C instruction set are supported. All use of
// this struct should be flag-guarded.
//
// If this becomes a problem, we can implement this method with Eigen's
// CUDA/Half.h.
#if defined(__F16C__)
// Represents an IEEE-754 16-bit floating point value. This has 10 significand
// bits, 5 exponent bits, and 1 sign bit.
//
// TODO(googleuser): Either add compatibility support, or delete this code if
// it turns out not to be helpful.
struct
IeeeFloat16
{
// Slow unpacking routine. Use avx_vector_array.h for normal operation.
float
DebugToFloat
()
const
{
return
_cvtsh_ss
(
bits
);
}
// Slow packing routine. Use avx_vector_array.h for normal operation.
static
IeeeFloat16
DebugFromFloat
(
float
value
)
{
return
IeeeFloat16
{
_cvtss_sh
(
value
,
0
)};
}
uint16
bits
;
};
static_assert
(
sizeof
(
IeeeFloat16
)
==
sizeof
(
uint16
),
"Bad struct size"
);
#endif
}
// namespace runtime
}
// namespace dragnn
}
// namespace syntaxnet
#endif // DRAGNN_RUNTIME_MATH_FLOAT16_TYPES_H_
research/syntaxnet/dragnn/runtime/math/float16_types_test.cc
0 → 100644
View file @
a4bb31d0
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/runtime/math/float16_types.h"
#include "tensorflow/core/platform/test.h"
namespace
syntaxnet
{
namespace
dragnn
{
namespace
runtime
{
namespace
{
// C++11 doesn't support binary literals like 0b01001, so add a helper. :(
// Parses |bits|, a string of exactly 16 '0'/'1' characters (most-significant
// bit first), into the corresponding uint16. CHECK-fails on bad input.
uint16 ParseBinaryString(const string &bits) {
  CHECK_EQ(bits.size(), 16) << "ParseBinaryString expects full 16-bit values";
  uint16 result = 0;
  for (const char digit : bits) {
    CHECK(digit == '0' || digit == '1') << "String must be 0's and 1's.";
    result <<= 1;
    if (digit == '1') result |= 1;
  }
  return result;
}
// Checks that a float round-tripped through IeeeFloat16 stays within the
// expected half-precision error, and that at least one value is inexact
// (i.e. packing really does lose precision).
TEST(Float16TypesTest, IeeeFloat16Accuracy) {
#if defined(__F16C__)
  bool some_not_exact = false;
  // Sweep -10.0 .. 9.9 in steps of 0.1.
  for (int i = -100; i < 100; ++i) {
    float value = i / 10.0f;
    IeeeFloat16 half = IeeeFloat16::DebugFromFloat(value);
    float unpacked = half.DebugToFloat();
    EXPECT_NEAR(value, unpacked, 0.01);
    some_not_exact = some_not_exact || (value != unpacked);
  }
  // If every value round-tripped exactly, the packing is suspect.
  EXPECT_TRUE(some_not_exact);
#else
  LOG(INFO) << "Test binary wasn't compiled with F16C support, so skipping "
            << "this test.";
#endif
}
// Checks that a float round-tripped through TruncatedFloat16 (bfloat16) stays
// within the expected error for a 7-bit significand, and that at least one
// value is inexact. The 0.06 tolerance is looser than the IEEE-half test
// because bfloat16 keeps fewer significand bits.
TEST(Float16TypesTest, TruncatedAccuracy) {
  bool some_not_exact = false;
  // Sweep -10.0 .. 9.9 in steps of 0.1.
  for (int i = -100; i < 100; ++i) {
    float value = i / 10.0f;
    TruncatedFloat16 half = TruncatedFloat16::DebugFromFloat(value);
    float unpacked = half.DebugToFloat();
    EXPECT_NEAR(value, unpacked, 0.06);
    some_not_exact = some_not_exact || (value != unpacked);
  }
  // If every value round-tripped exactly, the packing is suspect.
  EXPECT_TRUE(some_not_exact);
}
// Spot-checks known bfloat16 bit patterns: +/-1.0 have a zeroed significand,
// exponent 127 (0111'1111), and the sign bit set for the negative value.
TEST(Float16TypesTest, TruncatedKnownBinaryRepresentation) {
  uint16 neg_1 = ParseBinaryString("1011111110000000");
  uint16 one = ParseBinaryString("0011111110000000");
  EXPECT_EQ((TruncatedFloat16{neg_1}).DebugToFloat(), -1.0f);
  EXPECT_EQ((TruncatedFloat16{one}).DebugToFloat(), 1.0f);
}
// Spot-checks known IEEE-754 binary16 bit patterns: +/-1.0 have a zeroed
// significand, exponent 15 (01111), and the sign bit set for the negative
// value.
TEST(Float16TypesTest, IeeeFloat16KnownBinaryRepresentation) {
#if defined(__F16C__)
  uint16 neg_1 = ParseBinaryString("1011110000000000");
  uint16 one = ParseBinaryString("0011110000000000");
  EXPECT_EQ((IeeeFloat16{neg_1}).DebugToFloat(), -1.0f);
  EXPECT_EQ((IeeeFloat16{one}).DebugToFloat(), 1.0f);
#else
  LOG(INFO) << "Test binary wasn't compiled with F16C support, so skipping "
            << "this test.";
#endif
}
}
// namespace
}
// namespace runtime
}
// namespace dragnn
}
// namespace syntaxnet
research/syntaxnet/dragnn/runtime/math/sgemvv.h
0 → 100644
View file @
a4bb31d0
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
// Computes `[y_1, y_2, ...] = M * [v_1, v_2, ...] + [b_1, b_2, ...]`, where
//
// M is a `m x n` dense matrix.
// v_i are `n`-dimensional dense vectors.
// b_i and y_i are `m`-dimensional dense vectors.
//
// Unfortunately even larger (e.g. 128x128) matrix sizes are not sufficient to
// hide the latency of a function call. So the entire implementation needs to
// live in this header file. Please make sure to use all of the optimization
// flags mentioned in the BUILD file in any client libraries.
#ifndef DRAGNN_RUNTIME_MATH_SGEMVV_H_
#define DRAGNN_RUNTIME_MATH_SGEMVV_H_
#if defined(__SSE2__)
#include <xmmintrin.h>
#endif
#include "dragnn/runtime/math/avx_vector_array.h"
#include "dragnn/runtime/math/types.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
#define DRAGNN_SGEMVV_ATTRIBUTE_ALWAYS_INLINE __attribute__((always_inline))
#ifdef __clang__
#define DRAGNN_SGEMVV_GCC_UNROLL
#else
#define DRAGNN_SGEMVV_GCC_UNROLL __attribute__((optimize("unroll-loops")))
#endif
namespace
syntaxnet
{
namespace
dragnn
{
namespace
runtime
{
// Represents `v, b` from one operation `y = M * v + b`.
template
<
int
num_ops
>
struct
SgemvInputBatch
{
const
float
*
input
[
num_ops
];
const
float
*
initial
[
num_ops
];
};
template
<
int
num_ops
>
struct
SgemvOutputBatch
{
float
*
output
[
num_ops
];
};
// Matrix argument for the SGEMV/SGEMVV operation. Based on row-batched
// column-major matrices, but pulls the batch size into a template argument
// so code can be compiled more efficiently.
template <int sse_batch_size, typename ElementType = float>
class SgemvMatrix final {
 public:
  // Convenience type alias.
  using MatrixType =
      BlockedMatrix<ElementType, BlockedMatrixFormat::kRowBlockedColumnMajor>;

  // Creates an empty SgemvMatrix.
  SgemvMatrix() = default;

  // Initializes the new matrix. Returns an InvalidArgumentError if the block
  // size of `matrix` is not equal to `sse_batch_size.
  ::tensorflow::Status Initialize(const MatrixType &matrix);

  // Computes the matrix-vector product with a set of other inputs. See
  // top-level comment for the general algorithm.
  // The -1 passed for output_vector_elements is ignored on this path because
  // mask_input_output is false.
  template <int num_ops, int lookahead_1 = 8, int lookahead_2 = 8>
  void DRAGNN_SGEMVV_ATTRIBUTE_ALWAYS_INLINE DRAGNN_SGEMVV_GCC_UNROLL
  MatrixMultiVectorProduct(const SgemvInputBatch<num_ops> &inputs,
                           SgemvOutputBatch<num_ops> *outputs) const {
    MatrixMultiVectorProductImpl<num_ops, /*mask_input_output=*/false,
                                 /*read_initial=*/true, lookahead_1,
                                 lookahead_2>(inputs, -1, outputs);
  }

  // Computes the matrix-vector product with a set of other inputs. See
  // top-level comment for the general algorithm. This variant allows another
  // parameter, `output_vector_elements`, to write to outputs which are a
  // multiple of kAvxWidth (8 floats, or 32 bytes) but not necessarily
  // sse_batch_size. It is slightly slower, but probably more than noise.
  //
  // |lookahead_1| and |lookahead_2| parameters control prefetching, and should
  // usually be tuned via a script. They issue prefetch instructions that are
  // `lookahead_1 * sse_batch_size` values ahead of the current matrix entry
  // being read, if `lookahead_1 != 0` (and `(lookahead_1 + lookahead_2) *
  // sse_batch_size` values, if lookahead_2 != 0). To reiterate, all prefetching
  // can be disabled by setting |lookahead_1| to 0, or the second prefetch can
  // be disabled by setting |lookahead_2| to 0.
  template <int num_ops, int lookahead_1 = 8, int lookahead_2 = 8>
  void DRAGNN_SGEMVV_ATTRIBUTE_ALWAYS_INLINE DRAGNN_SGEMVV_GCC_UNROLL
  MaskedMatrixMultiVectorProduct(const SgemvInputBatch<num_ops> &inputs,
                                 int output_vector_elements,
                                 SgemvOutputBatch<num_ops> *outputs) const {
    MatrixMultiVectorProductImpl<num_ops, /*mask_input_output=*/true,
                                 /*read_initial=*/true, lookahead_1,
                                 lookahead_2>(inputs, output_vector_elements,
                                              outputs);
  }

  // Like the above, but assumes existing values are zero instead of reading
  // them.
  template <int num_ops>
  void DRAGNN_SGEMVV_ATTRIBUTE_ALWAYS_INLINE DRAGNN_SGEMVV_GCC_UNROLL
  MaskedMatrixMultiVectorProductNoInitial(
      const SgemvInputBatch<num_ops> &inputs, int output_vector_elements,
      SgemvOutputBatch<num_ops> *outputs) const {
    MatrixMultiVectorProductImpl<num_ops, /*mask_input_output=*/true,
                                 /*read_initial=*/false>(
        inputs, output_vector_elements, outputs);
  }

  // Read-only accessor.
  const MatrixType &matrix() const { return matrix_; }

 private:
  // Shared implementation backing the three public product methods; template
  // flags select masking and initial-value behavior at compile time.
  template <int num_ops, bool mask_input_output, bool read_initial,
            int lookahead_1 = 8, int lookahead_2 = 8>
  DRAGNN_SGEMVV_ATTRIBUTE_ALWAYS_INLINE DRAGNN_SGEMVV_GCC_UNROLL void
  MatrixMultiVectorProductImpl(const SgemvInputBatch<num_ops> &inputs,
                               int output_vector_elements,
                               SgemvOutputBatch<num_ops> *outputs) const;

  // The underlying row-blocked column-major matrix.
  MatrixType matrix_;
};
// Implementation details.
template <int sse_batch_size, typename ElementType>
template <int num_ops, bool mask_input_output, bool read_initial,
          int lookahead_1, int lookahead_2>
inline void DRAGNN_SGEMVV_ATTRIBUTE_ALWAYS_INLINE DRAGNN_SGEMVV_GCC_UNROLL
SgemvMatrix<sse_batch_size, ElementType>::MatrixMultiVectorProductImpl(
    const SgemvInputBatch<num_ops> &inputs, int output_vector_elements,
    SgemvOutputBatch<num_ops> *outputs) const {
  static_assert(sse_batch_size % kAvxWidth == 0,
                "sse_batch_size must be a multiple of kAvxWidth (8).");
  if (mask_input_output) {
    DCHECK_EQ(output_vector_elements % kAvxWidth, 0)
        << "output_vector_elements must be padded to alignment";
  }
  // Walks linearly through the blocked matrix storage as (row block, column)
  // advances; each column step consumes one sse_batch_size-sized block.
  const ElementType *curr_matrix_ptr = matrix_.vector(0).data();
  // Loop over blocks of output rows. Each block of output rows will get a
  // partial sum of the [matrix-vector] dot product, where the range of that
  // partial sum is designated by start_col and end_col.
  for (int row_start = 0; row_start < matrix_.num_rows();
       row_start += sse_batch_size) {
    // Number of AVX vectors of this row block that fall inside the masked
    // output range; only meaningful when mask_input_output is true.
    const int load_store_max_idx =
        (output_vector_elements - row_start) / kAvxWidth;
    AvxFloatVecArray<sse_batch_size / kAvxWidth> accumulators[num_ops];
    // Read inputs.
    for (int op = 0; op < num_ops; ++op) {
      if (read_initial) {
        if (mask_input_output) {
          accumulators[op].Load(&inputs.initial[op][row_start],
                                load_store_max_idx);
        } else {
          accumulators[op].Load(&inputs.initial[op][row_start]);
        }
      } else {
        // No initial values: start the accumulators at zero.
        accumulators[op].LoadConstVector(0.0f);
      }
    }
    // Compute matrix-vector product.
    for (int col = 0; col < matrix_.num_columns(); ++col) {
      if (lookahead_1 != 0) {
#if defined(__SSE2__)
        // Prefetch upcoming matrix blocks; distances are tunable via the
        // lookahead template parameters (see the public method comment).
        _mm_prefetch(curr_matrix_ptr + lookahead_1 * sse_batch_size,
                     _MM_HINT_T0);
        if (lookahead_2 != 0) {
          _mm_prefetch(
              curr_matrix_ptr + (lookahead_1 + lookahead_2) * sse_batch_size,
              _MM_HINT_T0);
        }
#endif
      }
      // These are the coefficients from each vector at column `col` (just
      // broadcast over the whole AVX array).
      AvxFloatVec weights[num_ops];
      for (int op = 0; op < num_ops; ++op) {
        weights[op].LoadConstVector(inputs.input[op][col]);
      }
      // Loop over each AVX vector and add the current sub-product.
      AvxFloatVecArray<sse_batch_size / kAvxWidth> matrix_block;
      matrix_block.Load(curr_matrix_ptr);
      curr_matrix_ptr += sse_batch_size;
      for (int row_offset = 0; row_offset < sse_batch_size / kAvxWidth;
           row_offset++) {
        for (int op = 0; op < num_ops; ++op) {
          accumulators[op].vectors[row_offset].AddProductOf(
              weights[op], matrix_block.vectors[row_offset]);
        }
      }
    }
    // Save the results.
    for (int op = 0; op < num_ops; ++op) {
      if (mask_input_output) {
        // Masked store: only the first load_store_max_idx AVX vectors are
        // written, leaving the rest of the output untouched.
        accumulators[op].Store(&outputs->output[op][row_start],
                               load_store_max_idx);
      } else {
        accumulators[op].Store(&outputs->output[op][row_start]);
      }
    }
  }
}
// Adopts |matrix| as the underlying storage. Fails with InvalidArgument when
// the blocked matrix's block size does not match the compile-time
// sse_batch_size, since the product kernels assume that stride.
template <int sse_batch_size, typename ElementType>
::tensorflow::Status SgemvMatrix<sse_batch_size, ElementType>::Initialize(
    const SgemvMatrix<sse_batch_size, ElementType>::MatrixType &matrix) {
  const auto actual_block_size = matrix.block_size();
  if (actual_block_size == sse_batch_size) {
    matrix_ = matrix;
    return ::tensorflow::Status::OK();
  }
  return ::tensorflow::errors::InvalidArgument(
      "Blocked matrix block_size (", actual_block_size,
      ") must be equal to sse_batch_size (", sse_batch_size, ")");
}
}
// namespace runtime
}
// namespace dragnn
}
// namespace syntaxnet
#undef DRAGNN_SGEMVV_ATTRIBUTE_ALWAYS_INLINE
#undef DRAGNN_SGEMVV_GCC_UNROLL
#endif // DRAGNN_RUNTIME_MATH_SGEMVV_H_
research/syntaxnet/dragnn/runtime/math/sgemvv_test.cc
0 → 100644
View file @
a4bb31d0
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/runtime/math/sgemvv.h"
#include <chrono>
#include <random>
#include "dragnn/core/test/generic.h"
#include "dragnn/runtime/math/arithmetic.h"
#include "dragnn/runtime/math/transformations.h"
#include "dragnn/runtime/math/types.h"
#include "dragnn/runtime/test/helpers.h"
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/test.h"
namespace
syntaxnet
{
namespace
dragnn
{
namespace
runtime
{
// Reference implementation of y = matrix * v + b, computed row by row with
// scalar arithmetic. Used to validate the vectorized SGEMV kernels.
void naive_sgemv(const MutableMatrix<float> &matrix, const float *v,
                 const float *b, float *y) {
  const int rows = matrix.num_rows();
  const int cols = matrix.num_columns();
  for (int r = 0; r < rows; ++r) {
    y[r] = b[r];
    for (int c = 0; c < cols; ++c) {
      y[r] += matrix.row(r)[c] * v[c];
    }
  }
}
// Everything except floats require copying.
// Compile-time trait: true when ConvertToSgemv must stage the matrix in a
// temporary float buffer before packing into ElementType.
template <class ElementType>
constexpr bool RequiresCopy();
template <>
constexpr bool RequiresCopy<TruncatedFloat16>() {
  return true;
}
#if defined(__F16C__)
template <>
constexpr bool RequiresCopy<IeeeFloat16>() {
  return true;
}
#endif
template <>
constexpr bool RequiresCopy<float>() {
  return false;
}
// Packs one row of float values from |input| into |output|'s element type.
// Only specializations are defined; the primary template is declaration-only.
template <class ElementType>
void ConvertRow(Vector<float> input, MutableVector<ElementType> output);
// float -> float is the identity: the caller already copied the data, so
// there is nothing to convert.
template <>
void ConvertRow<float>(Vector<float> input, MutableVector<float> output) {}
// Packs floats into bfloat16, permuting within each group of 16 elements.
// NOTE(review): the permutation presumably mirrors the interleaved order that
// the AVX fast-unpack routine (FastUnpackPermutation, avx_vector_array.h)
// reads back — confirm against that header.
template <>
void ConvertRow<TruncatedFloat16>(Vector<float> input,
                                  MutableVector<TruncatedFloat16> output) {
  CHECK_EQ(input.size() % 16, 0);
  CHECK_EQ(input.size(), output.size());
  for (int i = 0; i < input.size(); ++i) {
    // Source index within the current 16-element group, per the unpack
    // permutation.
    int i_permuted = (i / 16) * 16 + FastUnpackPermutation(i % 16);
    output[i] = TruncatedFloat16::DebugFromFloat(input[i_permuted]);
  }
}
#if defined(__F16C__)
template
<
>
void
ConvertRow
<
IeeeFloat16
>
(
Vector
<
float
>
input
,
MutableVector
<
IeeeFloat16
>
output
)
{
CHECK_EQ
(
input
.
size
()
%
16
,
0
);
CHECK_EQ
(
input
.
size
(),
output
.
size
());
for
(
int
i
=
0
;
i
<
input
.
size
();
++
i
)
{
output
[
i
]
=
IeeeFloat16
::
DebugFromFloat
(
input
[
i
]);
}
}
#endif
// Converts a matrix to SGEMV. If the element type is not float, copies the
// matrix and then converts it.
// |sgemv_storage| provides the backing memory for the returned matrix and
// must outlive it.
template <int sse_batch_size, typename ElementType = float>
SgemvMatrix<sse_batch_size, ElementType> ConvertToSgemv(
    const Matrix<float> &matrix, UniqueMatrix<ElementType> *sgemv_storage) {
  // The final blocked view over |sgemv_storage|.
  MutableBlockedMatrix<ElementType, BlockedMatrixFormat::kRowBlockedColumnMajor>
      blocked;
  TF_EXPECT_OK(blocked.Reset(sgemv_storage->area(), matrix.num_rows(),
                             matrix.num_columns()));
  // TODO(googleuser): Clean this up when we can use C++17's `if constexpr`
  // ... then we will not have to introduce this raw pointer, which is either
  // an actual new variable or alias to `sgemv_storage`.
  UniqueMatrix<float> *uncompressed;
  if (RequiresCopy<ElementType>()) {
    // Temporary float staging buffer; deleted below after conversion.
    uncompressed = new UniqueMatrix<float>((*sgemv_storage)->num_rows(),
                                           (*sgemv_storage)->num_columns());
  } else {
    // NOTE: Because we don't have C++17's `if constexpr`, we need to add a
    // reinterpret_cast, so this code can compile when ElementType != float.
    uncompressed = reinterpret_cast<UniqueMatrix<float> *>(sgemv_storage);
  }
  // Copy to the uncompressed matrix. If ElementType == float, this is just
  // the output, otherwise it's the temporary array.
  MutableBlockedMatrix<float, BlockedMatrixFormat::kRowBlockedColumnMajor>
      uncompressed_matrix;
  TF_EXPECT_OK(uncompressed_matrix.Reset(
      uncompressed->area(), matrix.num_rows(), matrix.num_columns()));
  TF_EXPECT_OK(CopyMatrix(matrix, &uncompressed_matrix));
  if (RequiresCopy<ElementType>()) {
    // Pack each blocked vector from the float staging buffer into
    // ElementType, then release the staging buffer.
    for (int i = 0; i < blocked.num_vectors(); ++i) {
      ConvertRow<ElementType>(Vector<float>(uncompressed_matrix.vector(i)),
                              blocked.vector(i));
    }
    delete uncompressed;
  }
  SgemvMatrix<sse_batch_size, ElementType> sgemv_matrix;
  TF_EXPECT_OK(sgemv_matrix.Initialize(blocked.AsConst()));
  return sgemv_matrix;
}
// Fills |vector| with samples drawn from a standard normal distribution.
void InitRandomVector(MutableVector<float> vector) {
  // clock() is updated less frequently than a cycle counter, so keep around the
  // RNG just in case we initialize some vectors in less than a clock tick.
  static std::mt19937 *rng = new std::mt19937(clock());
  std::normal_distribution<float> gaussian(0, 1);
  for (int index = 0; index < vector.size(); ++index) {
    vector[index] = gaussian(*rng);
  }
}
// Fills |matrix| with samples drawn from a standard normal distribution.
void InitRandomMatrix(MutableMatrix<float> matrix) {
  // See InitRandomVector comment.
  static std::mt19937 *rng = new std::mt19937(clock());
  std::normal_distribution<float> distribution(0, 1);
  // Delegate the iteration to the GenerateMatrix helper; the generator lambda
  // ignores its (row, col) arguments and just draws a fresh sample.
  GenerateMatrix(matrix.num_rows(), matrix.num_columns(),
                 [&distribution](int row, int col) {
                   return distribution(*rng);
                 },
                 &matrix);
}
// Exercises the no-initial (bias-free, zero-start) masked product on a small
// matrix and compares against the naive reference, checking only the first
// output_size entries (the masked region).
TEST(SgemvvTest, MatmulNoBias) {
  constexpr int sse_batch_size = 32;
  constexpr int num_rows = 32;
  constexpr int num_columns = 15;
  constexpr int output_size = 8;
  // Number of sse_batch_size-sized blocked vectors needed for the storage.
  constexpr int sgemv_views = num_rows * num_columns / sse_batch_size;
  static_assert(num_rows * num_columns % sse_batch_size == 0,
                "Bad matrix size");
  ASSERT_EQ(output_size % 8, 0) << "Output size must still be 32-byte aligned.";
  UniqueMatrix<float> matrix(num_rows, num_columns);
  UniqueMatrix<float> sgemv_storage(sgemv_views, sse_batch_size);
  UniqueVector<float> input_vector(num_columns);
  UniqueVector<float> output(num_rows);
  UniqueVector<float> expected(num_rows);
  // Random initialization for all variables/values.
  InitRandomMatrix(*matrix);
  InitRandomVector(*output);
  InitRandomVector(*expected);
  InitRandomVector(*input_vector);
  // Layout SGEMV matrix.
  SgemvMatrix<sse_batch_size> sgemv_matrix =
      ConvertToSgemv<sse_batch_size>(Matrix<float>(*matrix), &sgemv_storage);
  // SGEMV multiplication. The initial (bias) pointer is nullptr because the
  // NoInitial variant never reads it.
  SgemvInputBatch<1> inputs = {{input_vector->data()}, {nullptr}};
  SgemvOutputBatch<1> outputs = {{output->data()}};
  sgemv_matrix.MaskedMatrixMultiVectorProductNoInitial(inputs, output_size,
                                                       &outputs);
  // Naive algorithm.
  MultiplyMatrixAndVector<float>(Matrix<float>(*matrix),
                                 Vector<float>(*input_vector), *expected);
  // Check that results are equal.
  for (int i = 0; i < output_size; i++) {
    EXPECT_NEAR(output->data()[i], expected->data()[i], 1e-5);
  }
}
// Verifies that SgemvMatrix::Initialize rejects blocked matrices whose block
// size does not match the compile-time sse_batch_size.
TEST(SgemvvTest, ErrorsWithBadMultiple) {
  // Pick num_rows which is (32-byte) alignable, but not a multiple of
  // sse_batch_size (32 floats). These should return errors.
  for (int num_rows = 8; num_rows < 32; num_rows += 8) {
    // Layout blocked matrix.
    UniqueMatrix<float> sgemv_storage(1, num_rows);
    MutableBlockedMatrix<float, BlockedMatrixFormat::kRowBlockedColumnMajor>
        blocked;
    TF_EXPECT_OK(blocked.Reset(sgemv_storage.area(), num_rows, 1));
    // Initialize SgemvvMatrix.
    SgemvMatrix<32> matrix;
    EXPECT_THAT(matrix.Initialize(blocked.AsConst()),
                test::IsErrorWithSubstr("must be equal to sse_batch_size"));
  }
}
// Human-readable element-type names, used only for benchmark log output.
// Only specializations are defined; the primary template is declaration-only.
template <typename ElementType>
string TypenameString();
template <>
string TypenameString<float>() {
  return "float32";
}
template <>
string TypenameString<TruncatedFloat16>() {
  return "bfloat16";
}
#if defined(__F16C__)
template <>
string TypenameString<IeeeFloat16>() {
  return "float16";
}
#endif
// Per-element-type accuracy tolerance for a 128-row matrix; scaled linearly
// by row count in RunPerformanceTest. Lower-precision types get looser
// tolerances. Only specializations are defined.
template <typename ElementType>
float ToleranceAt128();
template <>
float ToleranceAt128<float>() {
  return 1e-5;
}
template <>
float ToleranceAt128<TruncatedFloat16>() {
  return 1;
}
#if defined(__F16C__)
template <>
float ToleranceAt128<IeeeFloat16>() {
  return 1e-1;
}
#endif
// Benchmarks and validates a two-operation SGEMVV product for the given
// compile-time matrix shape and element type. |output_size| selects the
// masked output range; when it equals num_rows the unmasked fast path is
// benchmarked instead. Logs effective (and, when masked, raw) GFLOPS, then
// checks results against the naive reference and that masked-out entries are
// untouched.
template <int sse_batch_size, int num_rows, int num_cols, typename ElementType>
void RunPerformanceTest(int output_size) {
  constexpr int sgemv_views = num_rows * num_cols / sse_batch_size;
  static_assert(num_rows * num_cols % sse_batch_size == 0, "Bad matrix size");
  ASSERT_EQ(output_size % 8, 0) << "Output size must still be 32-byte aligned.";
  UniqueMatrix<float> matrix(num_rows, num_cols);
  UniqueMatrix<ElementType> sgemv_storage(sgemv_views, sse_batch_size);
  UniqueVector<float> initial_1(num_rows);
  UniqueVector<float> initial_2(num_rows);
  UniqueVector<float> vector_1(num_cols);
  UniqueVector<float> vector_2(num_cols);
  UniqueVector<float> output_1(num_rows);
  UniqueVector<float> output_2(num_rows);
  UniqueVector<float> expected_output_1(num_rows);
  UniqueVector<float> expected_output_2(num_rows);
  UniqueVector<float> untouched_output_1(num_rows);
  UniqueVector<float> untouched_output_2(num_rows);
  // Random initialization for all variables/values.
  InitRandomMatrix(*matrix);
  InitRandomVector(*initial_1);
  InitRandomVector(*initial_2);
  InitRandomVector(*output_1);
  InitRandomVector(*output_2);
  InitRandomVector(*expected_output_1);
  InitRandomVector(*expected_output_2);
  InitRandomVector(*vector_1);
  InitRandomVector(*vector_2);
  // Snapshot the pre-call output contents so the masked region check below
  // can prove that out-of-range entries were not written.
  for (int i = 0; i < num_rows; i++) {
    (*untouched_output_1)[i] = (*output_1)[i];
    (*untouched_output_2)[i] = (*output_2)[i];
  }
  // Layout SGEMV matrix.
  SgemvMatrix<sse_batch_size, ElementType> sgemv_matrix =
      ConvertToSgemv<sse_batch_size, ElementType>(Matrix<float>(*matrix),
                                                  &sgemv_storage);
  // Reference results for both operations.
  naive_sgemv(*matrix, vector_1->data(), initial_1->data(),
              expected_output_1->data());
  naive_sgemv(*matrix, vector_2->data(), initial_2->data(),
              expected_output_2->data());
  // 2 ops per call, 2 FLOPs (multiply + add) per matrix entry.
  double raw_flops_per_iteration = 2.0 * 2.0 * num_rows * num_cols;
  // Size the iteration count for roughly 4e9 raw FLOPs total.
  const uint64 iterations =
      static_cast<uint64>(std::round(4e9 / raw_flops_per_iteration));
  auto start_time = std::chrono::system_clock::now();
  SgemvInputBatch<2> inputs = {
      {vector_1->data(), vector_2->data()},
      {initial_1->data(), initial_2->data()},
  };
  SgemvOutputBatch<2> outputs = {{output_1->data(), output_2->data()}};
  if (num_rows == output_size) {
    // Full output: benchmark the unmasked path (prefetching disabled).
    for (int iter = 0; iter < iterations; iter++) {
      sgemv_matrix.template MatrixMultiVectorProduct<2, 0, 0>(inputs, &outputs);
    }
  } else {
    // Partial output: benchmark the masked path.
    for (int iter = 0; iter < iterations; iter++) {
      sgemv_matrix.template MaskedMatrixMultiVectorProduct<2>(
          inputs, output_size, &outputs);
    }
  }
  auto end_time = std::chrono::system_clock::now();
  std::chrono::duration<double> elapsed_seconds = end_time - start_time;
  double elapsed = elapsed_seconds.count();
  // Each MatrixVectorVectorProduct does 2 Matrix-vector ops, and each op does a
  // multiply and an add (2 floating-point operations) for each entry in the
  // matrix.
  string raw_gflops = "";
  if (num_rows != output_size) {
    raw_gflops = ::tensorflow::strings::StrCat(
        ", ", raw_flops_per_iteration * iterations / 1e9 / elapsed, " raw");
  }
  VLOG(0) << " ElementType " << TypenameString<ElementType>()
          << " GFLOPS: "
          << (2.0 * 2.0 * output_size * num_cols * iterations) / 1e9 / elapsed
          << " effective" << raw_gflops;
  // Error grows with accumulation length, so scale the per-type tolerance by
  // the row count relative to the 128-row baseline.
  const float tolerance =
      ToleranceAt128<ElementType>() * (num_rows / 128.0) + 1e-5;
  for (int i = 0; i < output_size; i++) {
    EXPECT_NEAR(output_1->data()[i], expected_output_1->data()[i], tolerance);
    EXPECT_NEAR(output_2->data()[i], expected_output_2->data()[i], tolerance);
  }
  // Check that any non-output items are untouched.
  for (int i = output_size; i < num_rows; i++) {
    EXPECT_EQ((*output_1)[i], (*untouched_output_1)[i]);
    EXPECT_EQ((*output_2)[i], (*untouched_output_2)[i]);
  }
}
// Runs RunPerformanceTest over a grid of matrix shapes, batch sizes, and
// element types, repeating each configuration to smooth out benchmark noise.
TEST(SgemvvTest, PerformanceAndAccuracyTest) {
  // Benchmarking is hard. Sometimes results vary between test runs, or are just
  // unreliable. This could be in part from CPU frequency scaling, and also how
  // favorably the memory allocator places data (coherence, etc.).
  constexpr int kNumBatches = 3;
  VLOG(0) << "64x64 32-batch-size test";
  for (int batch = 0; batch < kNumBatches; ++batch) {
    RunPerformanceTest<32, 64, 64, float>(64);
#if defined(__F16C__)
    RunPerformanceTest<32, 64, 64, IeeeFloat16>(64);
#endif
  }
  VLOG(0) << "128x128 32-batch-size test";
  for (int batch = 0; batch < kNumBatches; ++batch) {
    RunPerformanceTest<32, 128, 128, float>(128);
  }
  VLOG(0) << "256x256 32-batch-size test";
  for (int batch = 0; batch < kNumBatches; ++batch) {
    RunPerformanceTest<32, 256, 256, float>(256);
#if defined(__F16C__)
    RunPerformanceTest<32, 256, 256, IeeeFloat16>(256);
#endif
    RunPerformanceTest<32, 256, 256, TruncatedFloat16>(256);
  }
  VLOG(0) << "96x96 48-batch-size test";
  for (int batch = 0; batch < kNumBatches; ++batch) {
    RunPerformanceTest<48, 96, 96, float>(96);
  }
  VLOG(0) << "48x96 48-batch-size test";
  for (int batch = 0; batch < kNumBatches; ++batch) {
    RunPerformanceTest<48, 48, 96, float>(48);
  }
  // Masked variant: only 40 of the 48 rows are written.
  VLOG(0) << "40x96 48-batch-size test";
  for (int batch = 0; batch < kNumBatches; ++batch) {
    RunPerformanceTest<48, 48, 96, float>(40);
  }
  // These larger matrices are about the same amount of computation as one
  // 96-dimensional LSTM cell (without output softmax).
  VLOG(0) << "480x96 48-batch-size test";
  for (int batch = 0; batch < kNumBatches; ++batch) {
    RunPerformanceTest<48, 480, 96, float>(480);
#if defined(__F16C__)
    RunPerformanceTest<48, 480, 96, IeeeFloat16>(480);
#endif
    RunPerformanceTest<48, 480, 96, TruncatedFloat16>(480);
  }
  // Masked variant of the LSTM-sized case: 472 of 480 rows written.
  VLOG(0) << "472x96 48-batch-size test";
  for (int batch = 0; batch < kNumBatches; ++batch) {
    RunPerformanceTest<48, 480, 96, float>(472);
#if defined(__F16C__)
    RunPerformanceTest<48, 480, 96, IeeeFloat16>(472);
#endif
    RunPerformanceTest<48, 480, 96, TruncatedFloat16>(472);
  }
}
}
// namespace runtime
}
// namespace dragnn
}
// namespace syntaxnet
research/syntaxnet/dragnn/runtime/math/transformations.h
0 → 100644
View file @
a4bb31d0
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
// Utility functions that can transform different matrix types. This includes
// non-trivial transposes, and converting vectors/etc. to the matrix types. This
// library should NOT be used for any performance-critical work, and should NOT
// be included at all in the mobile runtime.
#ifndef DRAGNN_RUNTIME_MATH_TRANSFORMATIONS_H_
#define DRAGNN_RUNTIME_MATH_TRANSFORMATIONS_H_
#include "dragnn/runtime/math/types.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
namespace
syntaxnet
{
namespace
dragnn
{
namespace
runtime
{
namespace
internal
{
// Puts a format-agnostic API on matrix-like data types. This is convenient, but
// has the downside of potential confusing compiler errors (when a
// specialization does not exist), and isn't suitable for optimizations like
// blocked transformations.
// Returns a mutable pointer to element (row, col) of a row-major matrix.
template <class T>
T *GetMatrixElement(int row, int col, MatrixImpl<T> *matrix) {
  // row() returns a shallow view; the address points into the matrix storage.
  auto row_view = matrix->row(row);
  return &row_view[col];
}
// Returns a const reference to element (row, col) of a row-major matrix.
template <class T>
const T &GetMatrixElement(int row, int col, const MatrixImpl<T> &matrix) {
  // Index directly into the row view; the reference aliases matrix storage.
  return matrix.row(row)[col];
}
// Returns a mutable pointer to element (row, col) of a row-blocked,
// block-column-major matrix.
template <class T>
T *GetMatrixElement(
    int row, int col,
    BlockedMatrixImpl<T, BlockedMatrixFormat::kRowBlockedColumnMajor> *matrix) {
  const int block = matrix->block_size();
  // Which `block_size x num_columns` sub-matrix the row falls into, and the
  // row offset within that sub-matrix.
  const int block_row = row / block;
  const int offset_in_block = row % block;
  // Each sub-matrix is stored as num_columns() consecutive column vectors.
  const int column_vector = block_row * matrix->num_columns() + col;
  return &matrix->vector(column_vector)[offset_in_block];
}
// Returns a const reference to element (row, col) of a row-blocked,
// block-column-major matrix.
template <class T>
const T &GetMatrixElement(
    int row, int col,
    const BlockedMatrixImpl<T, BlockedMatrixFormat::kRowBlockedColumnMajor>
        &matrix) {
  const int block = matrix.block_size();
  // Which `block_size x num_columns` sub-matrix the row falls into, and the
  // row offset within that sub-matrix.
  const int block_row = row / block;
  const int offset_in_block = row % block;
  // Each sub-matrix is stored as num_columns() consecutive column vectors.
  const int column_vector = block_row * matrix.num_columns() + col;
  return matrix.vector(column_vector)[offset_in_block];
}
// Returns a mutable pointer to element (row, col) of a column-blocked,
// block-row-major matrix.
template <class T>
T *GetMatrixElement(
    int row, int col,
    BlockedMatrixImpl<T, BlockedMatrixFormat::kColumnBlockedRowMajor> *matrix) {
  const int block = matrix->block_size();
  // Which `num_rows x block_size` sub-matrix the column falls into, and the
  // column offset within that sub-matrix.
  const int block_col = col / block;
  const int offset_in_block = col % block;
  // Each sub-matrix is stored as num_rows() consecutive row vectors.
  const int row_vector = block_col * matrix->num_rows() + row;
  return &matrix->vector(row_vector)[offset_in_block];
}
// Returns a const reference to element (row, col) of a column-blocked,
// block-row-major matrix.
template <class T>
const T &GetMatrixElement(
    int row, int col,
    const BlockedMatrixImpl<T, BlockedMatrixFormat::kColumnBlockedRowMajor>
        &matrix) {
  const int block = matrix.block_size();
  // Which `num_rows x block_size` sub-matrix the column falls into, and the
  // column offset within that sub-matrix.
  const int block_col = col / block;
  const int offset_in_block = col % block;
  // Each sub-matrix is stored as num_rows() consecutive row vectors.
  const int row_vector = block_col * matrix.num_rows() + row;
  return matrix.vector(row_vector)[offset_in_block];
}
}
// namespace internal
// Generates values for a matrix, by calling a provided function on each
// row/column index.  Thanks to the magic of templating, the function call
// should be inlined and not cause too much overhead being "called" on each
// index.
//
// |get_value| is invoked as get_value(row, column) and its result is assigned
// to the corresponding element of |output_matrix|.  The element is located via
// GetMatrixElement(), which is found by argument-dependent lookup on the
// matrix type.
template <class Function, class OutputMatrix>
void GenerateMatrix(int num_rows, int num_columns, const Function &get_value,
                    OutputMatrix *output_matrix) {
  // Use int counters to match the int bounds and the int row/col parameters of
  // GetMatrixElement().  (The previous size_t counters triggered
  // signed/unsigned comparisons against the int bounds and were narrowed on
  // each GetMatrixElement() call.)
  for (int row = 0; row < num_rows; ++row) {
    for (int column = 0; column < num_columns; ++column) {
      *GetMatrixElement(row, column, output_matrix) = get_value(row, column);
    }
  }
}
// Copies the first |num_rows| rows and |num_columns| columns of
// |input_matrix| to |output_matrix|.
template <class InputMatrix, class OutputMatrix>
void CopyMatrixPrefix(const InputMatrix &input_matrix, int num_rows,
                      int num_columns, OutputMatrix *output_matrix) {
  // Reads one element of the input; GetMatrixElement() is resolved by
  // argument-dependent lookup on the input matrix type.
  const auto read_input = [&input_matrix](int row, int column) {
    return GetMatrixElement(row, column, input_matrix);
  };
  GenerateMatrix(num_rows, num_columns, read_input, output_matrix);
}
// Copies matrices.  The matrices can be of different types, but must have the
// same dimensions; returns InvalidArgument if either dimension differs.
template <class InputMatrix, class OutputMatrix>
tensorflow::Status CopyMatrix(const InputMatrix &input_matrix,
                              OutputMatrix *output_matrix) {
  const auto input_rows = input_matrix.num_rows();
  const auto input_columns = input_matrix.num_columns();
  if (input_rows != output_matrix->num_rows()) {
    return tensorflow::errors::InvalidArgument(
        "Input matrix num_rows (", input_rows,
        ") != output matrix num_rows (", output_matrix->num_rows(), ")");
  }
  if (input_columns != output_matrix->num_columns()) {
    return tensorflow::errors::InvalidArgument(
        "Input matrix num_columns (", input_columns,
        ") != output matrix num_columns (", output_matrix->num_columns(), ")");
  }
  // Dimensions agree, so copying the full "prefix" copies the whole matrix.
  CopyMatrixPrefix(input_matrix, input_rows, input_columns, output_matrix);
  return tensorflow::Status::OK();
}
}
// namespace runtime
}
// namespace dragnn
}
// namespace syntaxnet
#endif // DRAGNN_RUNTIME_MATH_TRANSFORMATIONS_H_
research/syntaxnet/dragnn/runtime/math/transformations_test.cc
0 → 100644
View file @
a4bb31d0
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/runtime/math/transformations.h"
#include "dragnn/runtime/test/helpers.h"
#include <gmock/gmock.h>
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/platform/test.h"
namespace
syntaxnet
{
namespace
dragnn
{
namespace
runtime
{
namespace
{
// Generates a matrix where each value is of the form `aa.bb`, where `aa` is the
// column index and `bb` is the row index.
// NOTE(review): per the generator below, the integer part is actually the ROW
// index and the fractional part the COLUMN index.
TEST(TransformationsTest, GenerateRowColIdxMatrix) {
  UniqueMatrix<float> row_col_matrix(5, 5);
  // Fill each element with row + col/100.
  GenerateMatrix(5, 5,
                 [](int row, int col) {
                   return static_cast<float>(row) + (col / 100.0f);
                 },
                 row_col_matrix.get());
  // Every element should encode its own (row, col) position.
  ExpectMatrix(Matrix<float>(*row_col_matrix),
               {{0.0f, 0.01f, 0.02f, 0.03f, 0.04f},
                {1.0f, 1.01f, 1.02f, 1.03f, 1.04f},
                {2.0f, 2.01f, 2.02f, 2.03f, 2.04f},
                {3.0f, 3.01f, 3.02f, 3.03f, 3.04f},
                {4.0f, 4.01f, 4.02f, 4.03f, 4.04f}});
}
// Tests that CopyMatrix() overwrites the destination with the source values
// for plain (row-major) matrices of equal dimensions.
TEST(TransformationsTest, CopiesMatrix) {
  UniqueMatrix<float> a({{1, 2}}), b({{3, 4}});
  TF_EXPECT_OK(CopyMatrix(*a, b.get()));
  // |b|'s original contents {3, 4} should be replaced by |a|'s {1, 2}.
  EXPECT_EQ(b->row(0)[0], 1);
  EXPECT_EQ(b->row(0)[1], 2);
}
// Tests copying a plain 8x3 matrix into a row-blocked column-major blocked
// matrix backed by a 6x4 buffer, and checks the resulting physical layout.
TEST(TransformationsTest, CopiesRowBlockedMatrix) {
  UniqueMatrix<double> source({{1, 2, 3},     //
                               {4, 5, 6},     //
                               {7, 8, 9},     //
                               {10, 11, 12},  //
                               {13, 14, 15},  //
                               {16, 17, 18},  //
                               {19, 20, 21},  //
                               {22, 23, 24}});
  // Backing storage for the blocked matrix: 6 views of 4 doubles each.
  UniqueMatrix<double> dst_mem(6, 4);
  MutableBlockedMatrix<double, BlockedMatrixFormat::kRowBlockedColumnMajor>
      blocked;
  // Logical shape is 8x3; the implied block size is 4 rows per block.
  TF_EXPECT_OK(blocked.Reset(dst_mem.area(), 8, 3));
  TF_EXPECT_OK(CopyMatrix(*source, &blocked));
  // Each 4x3 row-block is laid out column-major in the backing buffer.
  ExpectMatrix(Matrix<double>(*dst_mem), {{1, 4, 7, 10},     //
                                          {2, 5, 8, 11},     //
                                          {3, 6, 9, 12},     //
                                          {13, 16, 19, 22},  //
                                          {14, 17, 20, 23},  //
                                          {15, 18, 21, 24}});
}
// This test is the same as the above, except everything is transposed.
// Copies a plain 3x8 matrix into a column-blocked row-major blocked matrix
// (the default format of MutableBlockedMatrix) and checks the physical layout.
TEST(TransformationsTest, CopiesColumnBlockedMatrix) {
  UniqueMatrix<double> source(           //
      {{1, 4, 7, 10, 13, 16, 19, 22},    //
       {2, 5, 8, 11, 14, 17, 20, 23},    //
       {3, 6, 9, 12, 15, 18, 21, 24}});
  // Backing storage for the blocked matrix: 6 views of 4 doubles each.
  UniqueMatrix<double> dst_mem(6, 4);
  MutableBlockedMatrix<double> blocked;
  // Logical shape is 3x8; the implied block size is 4 columns per block.
  TF_EXPECT_OK(blocked.Reset(dst_mem.area(), 3, 8));
  TF_EXPECT_OK(CopyMatrix(*source, &blocked));
  // Each 3x4 column-block is laid out row-major in the backing buffer.
  ExpectMatrix(Matrix<double>(*dst_mem), {{1, 4, 7, 10},     //
                                          {2, 5, 8, 11},     //
                                          {3, 6, 9, 12},     //
                                          {13, 16, 19, 22},  //
                                          {14, 17, 20, 23},  //
                                          {15, 18, 21, 24}});
}
}
// namespace
}
// namespace runtime
}
// namespace dragnn
}
// namespace syntaxnet
research/syntaxnet/dragnn/runtime/math/types.h
0 → 100644
View file @
a4bb31d0
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
// Mathematical types.
#ifndef DRAGNN_RUNTIME_MATH_TYPES_H_
#define DRAGNN_RUNTIME_MATH_TYPES_H_
#include <stddef.h>
#include <limits>
#include "dragnn/runtime/alignment.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/logging.h"
namespace
syntaxnet
{
namespace
dragnn
{
namespace
runtime
{
// Blocked matrix formats, for fast inference routines.
enum
class
BlockedMatrixFormat
{
// Represents a row-blocked block-column-major matrix. In other words, first
// split a matrix M into
//
// [ M_1
// ...
// M_m ]
//
// sub-matrices, where each M_i is a `block_size x n` sub-matrix. Then each
// M_i is formatted in column-major order, and the sub-matrices' data is
// concatenated together.
kRowBlockedColumnMajor
,
// Represents a column-blocked block-row-major matrix. This is the
// transpose of the above. A matrix M is split into
//
// [ M_1 ... M_n ]
//
// sub-matrices, where each M_i is a `m x block_size` sub-matrix. Then each
// M_i is formatted in row-major order, and the sub-matrices' data is
// concatenated together.
kColumnBlockedRowMajor
,
};
namespace
internal
{
// An aligned vector of values.  Do not use this class directly, instead use
// (Mutable)Vector below.
//
// This is a non-owning, shallow view: copying a VectorImpl copies the pointer
// and size, not the underlying data.
template <class T>
class VectorImpl {
 public:
  static_assert(IsAlignable<T>(), "T must be alignable");

  // Creates an empty vector.
  VectorImpl() = default;

  // Points this at the |view|, which must be evenly divisible into Ts.
  template <class Byte>
  explicit VectorImpl(AlignedViewImpl<Byte> view);

  // Points this at a prefix of the |view| containing |size| Ts.  The |view|
  // must span at least |size| * sizeof(T) bytes.
  template <class Byte>
  VectorImpl(AlignedViewImpl<Byte> view, size_t size);

  // Points this at the same values as |that|, possibly reinterpreting type
  // (T and U must have the same size).
  template <class U>
  explicit VectorImpl(VectorImpl<U> that);
  template <class U>
  VectorImpl &operator=(VectorImpl<U> that);

  // Enables range-based for loops.
  T *begin() const { return data(); }
  T *end() const { return begin() + size(); }

  // Accessors.
  T *data() const { return data_; }
  size_t size() const { return size_; }
  bool empty() const { return size() == 0; }
  T &operator[](size_t index) const;

  // Gets a sub-vector starting at |start| with |size| elements.
  VectorImpl<T> Subsequence(size_t start, size_t size) const;

 private:
  // Matrix types construct row/block vectors via the private constructor.
  template <class U>
  friend class MatrixImpl;
  template <class U, BlockedMatrixFormat format>
  friend class BlockedMatrixImpl;

  // Points this at [|data|,|data|+|size|), bypassing alignment checks.
  VectorImpl(T *data, size_t size);

  // Pointer to the start of the vector.
  T *data_ = nullptr;

  // Number of values in the vector.
  size_t size_ = 0;
};
// Returns the format corresponding to the transpose of the |format|.
// Defined below; declared here so BlockedMatrixImpl::Transpose() can use it.
constexpr BlockedMatrixFormat TransposeBlockedMatrixFormat(
    BlockedMatrixFormat format);
// A row-major matrix where each row or column is aligned.  Do not use this
// class directly, instead use (Mutable)Matrix below.
//
// Like VectorImpl, this is a non-owning, shallow view over aligned storage.
template <class T>
class MatrixImpl {
 public:
  static_assert(IsAlignable<T>(), "T must be alignable");

  // Creates an empty matrix.
  MatrixImpl() = default;

  // Points each row of this matrix at the corresponding sub-view of the
  // |area|.  Each view in the |area| must be evenly divisible into Ts.
  template <class Byte>
  explicit MatrixImpl(AlignedAreaImpl<Byte> area);

  // Creates a matrix from a single vector.  Assumes that the vector's stride
  // is the minimum alignment padding.
  explicit MatrixImpl(VectorImpl<T> single_vector);

  // Points this at the same values as |that| (T and U must have the same
  // size).
  template <class U>
  explicit MatrixImpl(MatrixImpl<U> that);
  template <class U>
  MatrixImpl &operator=(MatrixImpl<U> that);

  // Accessors.
  T *data() const { return data_; }
  size_t num_rows() const { return num_rows_; }
  size_t num_columns() const { return num_columns_; }
  size_t row_stride() const { return row_stride_; }
  VectorImpl<T> row(size_t index) const;

 private:
  template <class U>
  friend class MatrixImpl;

  // Pointer to the start of the matrix.
  T *data_ = nullptr;

  // Number of rows and columns in the matrix.
  size_t num_rows_ = 0;
  size_t num_columns_ = 0;

  // Distance between the starts of consecutive rows, in units of T (derived
  // from the area's byte stride divided by sizeof(T)).
  size_t row_stride_ = 0;
};
// Blocked matrix representation.  See BlockedMatrixFormat for details.
//
// A non-owning, shallow view; the data is exposed as a sequence of
// block-sized vectors rather than rows.
template <class T, BlockedMatrixFormat format>
class BlockedMatrixImpl {
 public:
  static_assert(IsAlignable<T>(), "T must be alignable");

  // These aliases allow templated code to reach back in and get template
  // parameters, like std::vector<T>::iterator::value aliases.
  using ElementType = T;
  static constexpr bool IsRowBlocked() {
    return format == BlockedMatrixFormat::kRowBlockedColumnMajor;
  }

  // Creates an empty matrix.
  BlockedMatrixImpl() = default;

  // Creates a copy of this matrix, using the same values (underlying area),
  // but possibly re-interpreting the type.  The new type U must be the same
  // size, and `T *` must be implictly convertible to `U *` (usually just
  // adding "const" qualifiers, but theoretically it could be a superclass).
  template <class U>
  explicit BlockedMatrixImpl(BlockedMatrixImpl<U, format> that);
  template <class U>
  BlockedMatrixImpl &operator=(BlockedMatrixImpl<U, format> that);

  // Creates a new view that's const-qualified, in particular converting
  // MutableBlockedMatrix to BlockedMatrix.
  BlockedMatrixImpl<const T, format> AsConst() const {
    return BlockedMatrixImpl<const T, format>(*this);
  }

  // Initializes the matrix.  Raises errors if the matrix dimensions are
  // incompatible with the underlying area, namely if the number of views in
  // `area` do not cover the whole matrix, and also if the matrix cannot be
  // blocked according to (template parameter) `format`.
  //
  // Further, because this class is used for (delicate / specialized)
  // optimized inference routines, it is also required that no padding is
  // present, i.e. that the block size is divisible by kAlignmentBytes
  // (currently 32).
  template <class Byte>
  tensorflow::Status Reset(AlignedAreaImpl<Byte> area, size_t num_rows,
                           size_t num_columns);

  // Returns the transpose of this.
  BlockedMatrixImpl<T, TransposeBlockedMatrixFormat(format)> Transpose() const;

  // Accessors.
  size_t num_rows() const { return num_rows_; }
  size_t num_columns() const { return num_columns_; }
  size_t block_size() const { return block_size_; }
  size_t num_vectors() const { return num_vectors_; }
  VectorImpl<T> vector(size_t index) const;

 private:
  template <class U, BlockedMatrixFormat other_format>
  friend class BlockedMatrixImpl;

  // This is the same as calling Reset(), except the area is not checked.
  template <class Byte>
  explicit BlockedMatrixImpl(AlignedAreaImpl<Byte> area, int num_rows,
                             int num_columns);

  // Pointer to the start of the matrix.
  T *data_ = nullptr;

  // Number of rows and columns in the matrix.  Unlike MatrixImpl, there is no
  // API for directly accessing rows and columns, but it's necessary for any
  // algorithm (e.g. matrix-vector multiplication) to know the logical shape.
  size_t num_rows_ = 0;
  size_t num_columns_ = 0;
  size_t block_size_ = 0;   // in T's
  size_t num_vectors_ = 0;  // = num_rows * num_columns / block_size
};
}
// namespace internal
// Public aliases; use these.
//
// The const-qualified aliases (Vector, Matrix, BlockedMatrix) are read-only
// views; the Mutable* aliases allow element modification.
template <class T>
using Vector = internal::VectorImpl<const T>;
template <class T>
using Matrix = internal::MatrixImpl<const T>;
template <class T, BlockedMatrixFormat format =
                       BlockedMatrixFormat::kColumnBlockedRowMajor>
using BlockedMatrix = internal::BlockedMatrixImpl<const T, format>;

template <class T>
using MutableVector = internal::VectorImpl<T>;
template <class T>
using MutableMatrix = internal::MatrixImpl<T>;
template <class T, BlockedMatrixFormat format =
                       BlockedMatrixFormat::kColumnBlockedRowMajor>
using MutableBlockedMatrix = internal::BlockedMatrixImpl<T, format>;
// Implementation details below.
namespace
internal
{
// Constructs a vector spanning the whole |view|; the view's byte size must be
// an exact multiple of sizeof(T).
template <class T>
template <class Byte>
VectorImpl<T>::VectorImpl(AlignedViewImpl<Byte> view)
    : data_(reinterpret_cast<T *>(view.data())),
      size_(view.size() / sizeof(T)) {
  DCHECK_EQ(view.size() % sizeof(T), 0);
}

// Constructs a vector over the first |size| Ts of the |view|; the view must
// be large enough.
template <class T>
template <class Byte>
VectorImpl<T>::VectorImpl(AlignedViewImpl<Byte> view, size_t size)
    : data_(reinterpret_cast<T *>(view.data())), size_(size) {
  DCHECK_LE(size * sizeof(T), view.size());
}

// Shallow type-reinterpreting copy; enforced to be size-preserving.
template <class T>
template <class U>
VectorImpl<T>::VectorImpl(VectorImpl<U> that)
    : data_(that.data()), size_(that.size()) {
  static_assert(sizeof(T) == sizeof(U), "T and U must be the same size");
}

// Shallow type-reinterpreting assignment; enforced to be size-preserving.
template <class T>
template <class U>
VectorImpl<T> &VectorImpl<T>::operator=(VectorImpl<U> that) {
  static_assert(sizeof(T) == sizeof(U), "T and U must be the same size");
  data_ = that.data();
  size_ = that.size();
  return *this;
}

// Bounds-checked (debug-only) element access.
template <class T>
T &VectorImpl<T>::operator[](size_t index) const {
  DCHECK_LT(index, size());
  return data_[index];
}

// Private raw constructor; only checks alignment (in debug builds), used by
// the matrix classes to build row/block vectors.
template <class T>
VectorImpl<T>::VectorImpl(T *data, size_t size) : data_(data), size_(size) {
  TF_DCHECK_OK(OkIfAligned(data));
}

// Returns the sub-vector [start, start + size); debug-checks that it fits.
// NOTE(review): the private constructor debug-checks alignment, so the
// sub-vector's start must itself be aligned.
template <class T>
VectorImpl<T> VectorImpl<T>::Subsequence(size_t start, size_t size) const {
  DCHECK_LE(start + size, size_);
  return VectorImpl<T>(&data_[start], size);
}

// Swaps between the two blocked formats; transposing a row-blocked matrix
// yields a column-blocked one and vice versa.
constexpr BlockedMatrixFormat TransposeBlockedMatrixFormat(
    BlockedMatrixFormat format) {
  return format == BlockedMatrixFormat::kRowBlockedColumnMajor
             ? BlockedMatrixFormat::kColumnBlockedRowMajor
             : BlockedMatrixFormat::kRowBlockedColumnMajor;
}
// Wraps a single vector as a 1-row matrix.  The stride is the vector's size
// padded up to the alignment boundary, expressed in Ts.
template <class T>
MatrixImpl<T>::MatrixImpl(VectorImpl<T> single_vector)
    : data_(single_vector.data()),
      num_rows_(1),
      num_columns_(single_vector.size()),
      row_stride_(PadToAlignment(single_vector.size() * sizeof(T)) /
                  sizeof(T)) {}

// Builds a matrix over an aligned area: one row per view.  Both the view size
// and the view stride must be exact multiples of sizeof(T).
template <class T>
template <class Byte>
MatrixImpl<T>::MatrixImpl(AlignedAreaImpl<Byte> area)
    : data_(reinterpret_cast<T *>(area.data())),
      num_rows_(area.num_views()),
      num_columns_(area.view_size() / sizeof(T)),
      row_stride_(area.view_stride() / sizeof(T)) {
  DCHECK_EQ(area.view_size() % sizeof(T), 0);
  DCHECK_EQ(area.view_stride() % sizeof(T), 0);
}

// Shallow type-reinterpreting copy; enforced to be size-preserving.
template <class T>
template <class U>
MatrixImpl<T>::MatrixImpl(MatrixImpl<U> that)
    : data_(that.data_),
      num_rows_(that.num_rows()),
      num_columns_(that.num_columns()),
      row_stride_(that.row_stride_) {
  static_assert(sizeof(T) == sizeof(U), "T and U must be the same size");
}

// Shallow type-reinterpreting assignment; enforced to be size-preserving.
template <class T>
template <class U>
MatrixImpl<T> &MatrixImpl<T>::operator=(MatrixImpl<U> that) {
  static_assert(sizeof(T) == sizeof(U), "T and U must be the same size");
  data_ = that.data_;
  num_rows_ = that.num_rows();
  num_columns_ = that.num_columns();
  row_stride_ = that.row_stride_;
  return *this;
}

// Returns a vector view of the |index|th row (debug bounds-checked).
template <class T>
VectorImpl<T> MatrixImpl<T>::row(size_t index) const {
  DCHECK_LT(index, num_rows());
  // Note that |row_stride_|, not |num_columns_|, determines the start of the
  // row.  The former is aligned and may stride over a wider span than normal
  // when this is a "slice" of a larger matrix.
  return VectorImpl<T>(data_ + row_stride_ * index, num_columns());
}
// Shallow type-reinterpreting copy; enforced to be size-preserving.
template <class T, BlockedMatrixFormat format>
template <class U>
BlockedMatrixImpl<T, format>::BlockedMatrixImpl(
    BlockedMatrixImpl<U, format> that)
    : data_(that.data_),
      num_rows_(that.num_rows()),
      num_columns_(that.num_columns()),
      block_size_(that.block_size()),
      num_vectors_(that.num_vectors()) {
  static_assert(sizeof(T) == sizeof(U), "T and U must be the same size");
}

// Shallow type-reinterpreting assignment; enforced to be size-preserving.
template <class T, BlockedMatrixFormat format>
template <class U>
BlockedMatrixImpl<T, format> &BlockedMatrixImpl<T, format>::operator=(
    BlockedMatrixImpl<U, format> that) {
  static_assert(sizeof(T) == sizeof(U), "T and U must be the same size");
  data_ = that.data_;
  num_rows_ = that.num_rows();
  num_columns_ = that.num_columns();
  block_size_ = that.block_size();
  num_vectors_ = that.num_vectors();
  return *this;
}

// Points this matrix at the |area| and validates the configuration.  Note
// that the fields are assigned before validation; on error the matrix should
// not be used.
template <class T, BlockedMatrixFormat format>
template <class Byte>
tensorflow::Status BlockedMatrixImpl<T, format>::Reset(
    AlignedAreaImpl<Byte> area, size_t num_rows, size_t num_columns) {
  data_ = reinterpret_cast<T *>(area.view(0).data());
  num_rows_ = num_rows;
  num_columns_ = num_columns;
  // Each view of the area holds exactly one block of Ts.
  block_size_ = area.view_size() / sizeof(T);
  num_vectors_ = num_rows * num_columns / block_size_;
  // Reject padded areas: the fast inference routines require the blocks to be
  // contiguous (stride == size).
  if (area.view_stride() != area.view_size()) {
    return tensorflow::errors::InvalidArgument(
        "Padding is not supported for blocked matrix formats. Underlying area "
        "has size ",
        area.view_size(), " which is padded to stride ", area.view_stride(),
        ".");
  }
  if (area.view_size() % sizeof(T) != 0) {
    return tensorflow::errors::InvalidArgument(
        "View size ", area.view_size(),
        " is not a multiple of the templated type's size, ", sizeof(T));
  }
  // The area's views must exactly tile the logical matrix.
  if (num_vectors_ != area.num_views()) {
    return tensorflow::errors::InvalidArgument("Area has ", area.num_views(),
                                               " views, but should have ",
                                               num_vectors_);
  }
  // The block dimension must divide rows or columns evenly.
  size_t divided_dimension = IsRowBlocked() ? num_rows : num_columns;
  if (divided_dimension % block_size_ != 0) {
    return tensorflow::errors::InvalidArgument(
        IsRowBlocked() ? "row" : "column",
        "-blocked matrix has major dimension ", divided_dimension,
        " which is not divisible by the block size, ", block_size_);
  }
  return tensorflow::Status::OK();
}

// Returns a view of the |index|th block-sized vector (debug bounds-checked).
template <class T, BlockedMatrixFormat format>
VectorImpl<T> BlockedMatrixImpl<T, format>::vector(size_t index) const {
  DCHECK_LT(index, num_vectors_);
  return VectorImpl<T>(data_ + block_size_ * index, block_size_);
}

// Returns the transpose.  No data is moved: the same underlying vectors are
// reinterpreted in the opposite blocked format, with rows/columns swapped.
template <class T, BlockedMatrixFormat format>
BlockedMatrixImpl<T, TransposeBlockedMatrixFormat(format)>
BlockedMatrixImpl<T, format>::Transpose() const {
  BlockedMatrixImpl<T, TransposeBlockedMatrixFormat(format)> result;
  result.data_ = data_;
  result.num_columns_ = num_rows_;
  result.num_rows_ = num_columns_;
  result.block_size_ = block_size_;
  result.num_vectors_ = num_vectors_;
  return result;
}
}
// namespace internal
}
// namespace runtime
}
// namespace dragnn
}
// namespace syntaxnet
#endif // DRAGNN_RUNTIME_MATH_TYPES_H_
research/syntaxnet/dragnn/runtime/math/types_test.cc
0 → 100644
View file @
a4bb31d0
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/runtime/math/types.h"
#include <stddef.h>
#include <string.h>
#include <set>
#include "dragnn/core/test/generic.h"
#include "dragnn/runtime/alignment.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/platform/test.h"
namespace
syntaxnet
{
namespace
dragnn
{
namespace
runtime
{
namespace
{
// Creates a pointer that is be invalid.  This is useful for creating proxy
// areas for testing, whose real data should never be accessed.  We manually
// tested that if this pointer is dereferenced, a segmentation fault will be
// thrown.
//
// The address is a small multiple of the alignment constant, so it passes the
// library's alignment checks without pointing at usable memory.
char *InvalidAlignedPointer() {
  return reinterpret_cast<char *>(3 * internal::kAlignmentBytes);
}
// Expects that two pointers point to the same address.  Taking void* lets
// callers compare pointers of unrelated types without casting.
void ExpectSameAddress(const void *ptr1, const void *ptr2) {
  EXPECT_EQ(ptr1, ptr2);
}
// Returns true if the two structs are bytewise identical.  A and B must have
// the same size; the comparison covers every byte, including any padding.
template <class A, class B>
bool StructsEqual(const A &lhs, const B &rhs) {
  static_assert(sizeof(A) == sizeof(B),
                "StructsEqual must be given structs of the same size.");
  const int comparison = memcmp(&lhs, &rhs, sizeof(A));
  return comparison == 0;
}
// Tests that (Mutable)Vector is empty by default.
TEST(VectorTest, EmptyByDefault) {
  // Default-constructed const view.
  const Vector<int> vector1;
  EXPECT_EQ(vector1.size(), 0);
  EXPECT_TRUE(vector1.empty());

  // Default-constructed mutable view.
  const MutableVector<int> vector2;
  EXPECT_EQ(vector2.size(), 0);
  EXPECT_TRUE(vector2.empty());
}
// Tests that (Mutable)Vector can be initialized from a view.
TEST(VectorTest, ConstructFromView) {
  MutableAlignedView view;
  // The data is never dereferenced, so an invalid (but aligned) pointer works.
  char *ptr = InvalidAlignedPointer();
  TF_ASSERT_OK(view.Reset(ptr, 10 * sizeof(int)));

  const Vector<int> vector1(view);
  ExpectSameAddress(vector1.data(), ptr);
  EXPECT_EQ(vector1.size(), 10);
  EXPECT_FALSE(vector1.empty());

  const MutableVector<int> vector2(view);
  ExpectSameAddress(vector2.data(), ptr);
  EXPECT_EQ(vector2.size(), 10);
  EXPECT_FALSE(vector2.empty());
}
// Tests that (Mutable)Vector can be initialized from a prefix of a view.
TEST(VectorTest, ConstructFromViewPrefix) {
  MutableAlignedView view;
  // The data is never dereferenced, so an invalid (but aligned) pointer works.
  char *ptr = InvalidAlignedPointer();
  TF_ASSERT_OK(view.Reset(ptr, 10 * sizeof(int)));

  // Use a prefix of 3 of the 10 available ints in the |view|.
  const Vector<int> vector1(view, 3);
  ExpectSameAddress(vector1.data(), ptr);
  EXPECT_EQ(vector1.size(), 3);
  EXPECT_FALSE(vector1.empty());

  // Use a prefix of 5 of the 10 available ints in the |view|.
  const MutableVector<int> vector2(view, 5);
  ExpectSameAddress(vector2.data(), ptr);
  EXPECT_EQ(vector2.size(), 5);
  EXPECT_FALSE(vector2.empty());
}
// Tests that (Mutable)Vector supports copy-construction and assignment with
// shallow-copy semantics, and reinterprets from T* to const T*.
TEST(VectorTest, CopyAndAssign) {
  MutableAlignedView view;
  // The data is never dereferenced, so an invalid (but aligned) pointer works.
  char *ptr = InvalidAlignedPointer();
  TF_ASSERT_OK(view.Reset(ptr, 10 * sizeof(int)));
  const MutableVector<int> vector1(view);

  // Copy-construct from another vector.
  MutableVector<int> vector2(vector1);
  ExpectSameAddress(vector2.data(), ptr);
  EXPECT_EQ(vector2.size(), 10);
  EXPECT_FALSE(vector2.empty());

  // Assign from an empty vector, effectively clearing it.
  vector2 = MutableVector<int>();
  EXPECT_EQ(vector2.size(), 0);
  EXPECT_TRUE(vector2.empty());

  // Assign from the original vector.
  vector2 = vector1;
  ExpectSameAddress(vector2.data(), ptr);
  EXPECT_EQ(vector2.size(), 10);
  EXPECT_FALSE(vector2.empty());

  // Copy-construct from another vector.  Note that this reinterprets type.
  Vector<int> vector3(vector1);
  ExpectSameAddress(vector3.data(), ptr);
  EXPECT_EQ(vector3.size(), 10);
  EXPECT_FALSE(vector3.empty());

  // Assign from an empty vector, effectively clearing it.
  vector3 = Vector<int>();
  EXPECT_EQ(vector3.size(), 0);
  EXPECT_TRUE(vector3.empty());

  // Assign from another vector.  Note that this reinterprets type.
  vector3 = vector2;
  ExpectSameAddress(vector3.data(), ptr);
  EXPECT_EQ(vector3.size(), 10);
  EXPECT_FALSE(vector3.empty());
}
// Tests that (Mutable)Vector supports access via operator[].
TEST(VectorTest, Subscript) {
  UniqueAlignedArray array;
  array.Reset(10 * sizeof(float));

  // Write into a mutable vector.
  const MutableVector<float> mutable_vector(array.view());
  ASSERT_EQ(mutable_vector.size(), 10);
  for (int i = 0; i < 10; ++i) mutable_vector[i] = i;

  // Read from a const vector that points at the same values.
  const Vector<float> const_vector(array.view());
  ASSERT_EQ(const_vector.size(), 10);
  for (int i = 0; i < 10; ++i) EXPECT_EQ(const_vector[i], i);
}
// Tests the subsequence operator.
TEST(VectorTest, Subsequence) {
  // Debug checks will fail if either of the constructed vectors is not
  // aligned, so split exactly at an alignment boundary.
  constexpr int numAlignedFloats = internal::kAlignmentBytes / sizeof(float);
  UniqueAlignedArray array;
  array.Reset(2 * numAlignedFloats * sizeof(float));

  // Write into a mutable vector.
  const MutableVector<float> mutable_vector(array.view());
  for (int i = 0; i < 2 * numAlignedFloats; ++i) mutable_vector[i] = i;

  // Subscript beginning.
  Vector<float> first_half(mutable_vector.Subsequence(0, numAlignedFloats));
  ASSERT_EQ(first_half.size(), numAlignedFloats);
  for (int i = 0; i < numAlignedFloats; ++i) {
    EXPECT_EQ(first_half[i], i);
  }

  // Subscript end.
  Vector<float> second_half(
      mutable_vector.Subsequence(numAlignedFloats, numAlignedFloats));
  ASSERT_EQ(second_half.size(), numAlignedFloats);
  for (int i = 0; i < numAlignedFloats; ++i) {
    EXPECT_EQ(second_half[i], i + numAlignedFloats);
  }
}
// Tests that (Mutable)Vector supports access via range-based for loops.
TEST(VectorTest, RangeBasedFor) {
  UniqueAlignedArray array;
  array.Reset(10 * sizeof(float));

  // Write into a mutable vector.
  const MutableVector<float> mutable_vector(array.view());
  ASSERT_EQ(mutable_vector.size(), 10);
  float counter = 0.0;
  for (float &value : mutable_vector) value = counter++;

  // Read from a const vector that points at the same values.
  const Vector<float> const_vector(array.view());
  ASSERT_EQ(const_vector.size(), 10);
  counter = 0.0;
  for (const float &value : const_vector) EXPECT_EQ(value, counter++);
}
// Tests that (Mutable)Matrix is empty by default.
TEST(MatrixTest, EmptyByDefault) {
  // Default-constructed const view.
  const Matrix<int> matrix1;
  EXPECT_EQ(matrix1.num_rows(), 0);
  EXPECT_EQ(matrix1.num_columns(), 0);
  EXPECT_EQ(matrix1.row_stride(), 0);

  // Default-constructed mutable view.
  const MutableMatrix<int> matrix2;
  EXPECT_EQ(matrix2.num_rows(), 0);
  EXPECT_EQ(matrix2.num_columns(), 0);
  EXPECT_EQ(matrix2.row_stride(), 0);
}
// Tests that (Mutable)Matrix can be constructed from an area.
TEST(MatrixTest, ConstructFromArea) {
  const size_t kNumRows = 11;
  const size_t kNumColumns = 13;
  const size_t kRowBytes = kNumColumns * sizeof(int);
  const size_t kRowStride = PadToAlignment(kRowBytes) / sizeof(int);
  const size_t bytes = ComputeAlignedAreaSize(kNumRows, kRowBytes);
  char *ptr = InvalidAlignedPointer();
  MutableAlignedView view;
  TF_ASSERT_OK(view.Reset(ptr, bytes));
  MutableAlignedArea area;
  TF_ASSERT_OK(area.Reset(view, kNumRows, kRowBytes));

  // A read-only matrix over the area reports the area's dimensions and points
  // at the area's storage.
  const Matrix<int> const_matrix(area);
  EXPECT_EQ(const_matrix.num_rows(), kNumRows);
  EXPECT_EQ(const_matrix.num_columns(), kNumColumns);
  EXPECT_EQ(const_matrix.row_stride(), kRowStride);
  ExpectSameAddress(const_matrix.row(0).data(), ptr);
  ExpectSameAddress(const_matrix.data(), ptr);

  // The mutable variant behaves identically.
  const MutableMatrix<int> mutable_matrix(area);
  EXPECT_EQ(mutable_matrix.num_rows(), kNumRows);
  EXPECT_EQ(mutable_matrix.num_columns(), kNumColumns);
  EXPECT_EQ(mutable_matrix.row_stride(), kRowStride);
  ExpectSameAddress(mutable_matrix.row(0).data(), ptr);
  ExpectSameAddress(mutable_matrix.data(), ptr);
}
// Tests that (Mutable)Matrix supports copy-construction and assignment with
// shallow-copy semantics, and reinterprets from T* to const T*.
TEST(MatrixTest, CopyAndAssign) {
  const size_t kNumRows = 11;
  const size_t kNumColumns = 13;
  const size_t kRowBytes = kNumColumns * sizeof(int);
  const size_t kRowStride = PadToAlignment(kRowBytes) / sizeof(int);
  const size_t bytes = ComputeAlignedAreaSize(kNumRows, kRowBytes);
  char *ptr = InvalidAlignedPointer();
  MutableAlignedView view;
  TF_ASSERT_OK(view.Reset(ptr, bytes));
  MutableAlignedArea area;
  TF_ASSERT_OK(area.Reset(view, kNumRows, kRowBytes));

  // The source matrix, constructed directly from the area.
  const MutableMatrix<int> original(area);
  EXPECT_EQ(original.num_rows(), kNumRows);
  EXPECT_EQ(original.num_columns(), kNumColumns);
  EXPECT_EQ(original.row_stride(), kRowStride);
  ExpectSameAddress(original.row(0).data(), ptr);
  ExpectSameAddress(original.data(), ptr);

  // Copy-construct from another matrix.  The copy is shallow: it points at
  // the same storage.
  MutableMatrix<int> mutable_copy(original);
  EXPECT_EQ(mutable_copy.num_rows(), kNumRows);
  EXPECT_EQ(mutable_copy.num_columns(), kNumColumns);
  EXPECT_EQ(mutable_copy.row_stride(), kRowStride);
  ExpectSameAddress(mutable_copy.row(0).data(), ptr);
  ExpectSameAddress(mutable_copy.data(), ptr);

  // Assign from an empty matrix, effectively clearing it.
  mutable_copy = MutableMatrix<int>();
  EXPECT_EQ(mutable_copy.num_rows(), 0);
  EXPECT_EQ(mutable_copy.num_columns(), 0);
  EXPECT_EQ(mutable_copy.row_stride(), 0);

  // Assign from the original matrix.
  mutable_copy = original;
  EXPECT_EQ(mutable_copy.num_rows(), kNumRows);
  EXPECT_EQ(mutable_copy.num_columns(), kNumColumns);
  EXPECT_EQ(mutable_copy.row_stride(), kRowStride);
  ExpectSameAddress(mutable_copy.row(0).data(), ptr);
  ExpectSameAddress(mutable_copy.data(), ptr);

  // Copy-construct from another matrix. Note that this reinterprets type
  // (int* becomes const int*).
  Matrix<int> const_copy(mutable_copy);
  EXPECT_EQ(const_copy.num_rows(), kNumRows);
  EXPECT_EQ(const_copy.num_columns(), kNumColumns);
  EXPECT_EQ(const_copy.row_stride(), kRowStride);
  ExpectSameAddress(const_copy.row(0).data(), ptr);
  ExpectSameAddress(const_copy.data(), ptr);

  // Assign from an empty matrix, effectively clearing it.
  const_copy = Matrix<int>();
  EXPECT_EQ(const_copy.num_rows(), 0);
  EXPECT_EQ(const_copy.num_columns(), 0);
  EXPECT_EQ(const_copy.row_stride(), 0);

  // Assign from the original matrix. Note that this reinterprets type.
  const_copy = original;
  EXPECT_EQ(const_copy.num_rows(), kNumRows);
  EXPECT_EQ(const_copy.num_columns(), kNumColumns);
  EXPECT_EQ(const_copy.row_stride(), kRowStride);
  ExpectSameAddress(const_copy.row(0).data(), ptr);
  ExpectSameAddress(const_copy.data(), ptr);
}
// Tests that (Mutable)Matrix supports row access.
TEST(MatrixTest, Rows) {
  const size_t kNumRows = 11;
  const size_t kNumColumns = 13;
  const size_t kRowBytes = kNumColumns * sizeof(float);
  const size_t bytes = ComputeAlignedAreaSize(kNumRows, kRowBytes);
  UniqueAlignedArray array;
  array.Reset(bytes);
  MutableAlignedArea area;
  TF_ASSERT_OK(area.Reset(array.view(), kNumRows, kRowBytes));

  // Write a distinct value into every cell via mutable row views.
  const MutableMatrix<float> mutable_matrix(area);
  ASSERT_EQ(mutable_matrix.num_rows(), kNumRows);
  ASSERT_EQ(mutable_matrix.num_columns(), kNumColumns);
  for (size_t i = 0; i < kNumRows; ++i) {
    const auto row_view = mutable_matrix.row(i);
    for (size_t j = 0; j < kNumColumns; ++j) {
      row_view[j] = i * 1000.0 + j;
    }
  }

  // Read from a const matrix that points at the same values.
  const Matrix<float> const_matrix(area);
  ASSERT_EQ(const_matrix.num_rows(), kNumRows);
  ASSERT_EQ(const_matrix.num_columns(), kNumColumns);
  for (size_t i = 0; i < kNumRows; ++i) {
    const auto row_view = const_matrix.row(i);
    for (size_t j = 0; j < kNumColumns; ++j) {
      EXPECT_EQ(row_view[j], i * 1000.0 + j);
    }
  }
}
// Tests that a matrix built from a vector is a single row aliasing the
// vector's storage, for a range of vector sizes.
TEST(MatrixTest, MatrixFromVector) {
  for (int num_columns = 0; num_columns < 100; ++num_columns) {
    char *ptr = InvalidAlignedPointer();
    MutableAlignedView view;
    TF_ASSERT_OK(view.Reset(ptr, num_columns * sizeof(int)));
    const MutableVector<int> vector(view);
    const MutableMatrix<int> matrix(vector);
    ASSERT_EQ(matrix.row(0).data(), vector.data());
    ExpectSameAddress(matrix.data(), vector.data());
    ASSERT_EQ(matrix.num_rows(), 1);
    ASSERT_EQ(matrix.num_columns(), vector.size());
  }
}
// Typed test fixture; instantiated below for each BlockedMatrix element type
// and blocking format under test.
template <class MatrixType>
class BlockedMatrixTest : public ::testing::Test {};
// The cross-product of element types (float, int64) and blocking formats
// exercised by the typed tests below.  Uses a `using` alias rather than
// `typedef`, matching modern C++ style.
using BlockedRowAndColumnTypes = ::testing::Types<
    BlockedMatrix<float, BlockedMatrixFormat::kRowBlockedColumnMajor>,
    BlockedMatrix<float, BlockedMatrixFormat::kColumnBlockedRowMajor>,
    BlockedMatrix<int64, BlockedMatrixFormat::kRowBlockedColumnMajor>,
    BlockedMatrix<int64, BlockedMatrixFormat::kColumnBlockedRowMajor>>;
TYPED_TEST_CASE(BlockedMatrixTest, BlockedRowAndColumnTypes);
// Tests that resetting a blocked matrix fails when the block size requires
// alignment padding.
TYPED_TEST(BlockedMatrixTest, PaddingNotAllowed) {
  constexpr size_t kNumRows = 10;
  constexpr size_t kNumColumns = 10;
  constexpr size_t kBlockSize = 5;
  constexpr size_t kNumViews = (kNumRows * kNumColumns) / kBlockSize;
  constexpr size_t kBlockSizeBytes =
      kBlockSize * sizeof(typename TypeParam::ElementType);
  const size_t bytes = ComputeAlignedAreaSize(kNumViews, kBlockSizeBytes);

  MutableAlignedView view;
  TF_ASSERT_OK(view.Reset(InvalidAlignedPointer(), bytes));
  MutableAlignedArea area;
  TF_ASSERT_OK(area.Reset(view, kNumViews, kBlockSizeBytes));

  // 5 is usually relatively prime to the alignment size, but you may have to
  // update this test if kAlignmentBytes changes.
  ASSERT_NE(PadToAlignment(kBlockSizeBytes), kBlockSizeBytes);

  TypeParam matrix;
  EXPECT_THAT(matrix.Reset(area, kNumRows, kNumColumns),
              test::IsErrorWithSubstr(
                  "Padding is not supported for blocked matrix formats."));
}
// Tests accessors, and the size of matrices after allocation.
TYPED_TEST(BlockedMatrixTest, Accessors) {
  using ElementType = typename TypeParam::ElementType;
  constexpr size_t kNumRows = 48;
  constexpr size_t kNumColumns = 24;
  constexpr size_t kBlockSize = 8;
  constexpr size_t kNumViews = (kNumRows * kNumColumns) / kBlockSize;
  constexpr size_t kBlockSizeBytes = kBlockSize * sizeof(ElementType);
  const size_t bytes = ComputeAlignedAreaSize(kNumViews, kBlockSizeBytes);

  char *ptr = InvalidAlignedPointer();
  MutableAlignedView view;
  TF_ASSERT_OK(view.Reset(ptr, bytes));
  MutableAlignedArea area;
  TF_ASSERT_OK(area.Reset(view, kNumViews, kBlockSizeBytes));

  TypeParam matrix;

  // If the view size is wrong, it should fail.
  EXPECT_THAT(
      matrix.Reset(area, kNumRows + 1, kNumColumns),
      test::IsErrorWithSubstr("Area has 144 views, but should have 147"));

  // If the blocking scheme cannot divide the matrix evenly, an error should
  // be raised. The choice of 12 and 96 is a bit non-trivial: they are numbers
  // that conveniently result in the correct area (so other errors won't be
  // raised), but an incompatible division of the vectors.
  if (TypeParam::IsRowBlocked()) {
    EXPECT_THAT(matrix.Reset(area, 12, 96),
                test::IsErrorWithSubstr(
                    "row-blocked matrix has major dimension 12 which is not "
                    "divisible by the block size, 8"));
  } else {
    EXPECT_THAT(matrix.Reset(area, 96, 12),
                test::IsErrorWithSubstr(
                    "column-blocked matrix has major dimension 12 which is "
                    "not divisible by the block size, 8"));
  }

  // A well-formed Reset() succeeds and the accessors report the configured
  // geometry.
  TF_EXPECT_OK(matrix.Reset(area, kNumRows, kNumColumns));
  EXPECT_EQ(matrix.vector(0).data(), reinterpret_cast<ElementType *>(ptr));
  EXPECT_EQ(matrix.num_rows(), kNumRows);
  EXPECT_EQ(matrix.num_columns(), kNumColumns);
  EXPECT_EQ(matrix.block_size(), kBlockSize);
  EXPECT_EQ(matrix.num_vectors(), kNumViews);
}
// Tests that blocked matrices can be copied, cast to const, and transposed,
// and that copies and const views alias the same underlying vectors.
TYPED_TEST(BlockedMatrixTest, CopyCastTranspose) {
  MutableAlignedView view;
  MutableAlignedArea area;
  constexpr size_t kNumRows = 48;
  constexpr size_t kNumColumns = 24;
  constexpr size_t kBlockSize = 8;
  constexpr size_t kNumViews = (kNumRows * kNumColumns) / kBlockSize;
  constexpr size_t kBlockSizeBytes =
      kBlockSize * sizeof(typename TypeParam::ElementType);
  const size_t bytes = ComputeAlignedAreaSize(kNumViews, kBlockSizeBytes);
  TF_ASSERT_OK(view.Reset(InvalidAlignedPointer(), bytes));
  TF_ASSERT_OK(area.Reset(view, kNumViews, kBlockSizeBytes));
  TypeParam matrix;
  TF_EXPECT_OK(matrix.Reset(area, kNumRows, kNumColumns));

  // Test both copying and casting to const.
  TypeParam matrix_copy = matrix;
  auto readonly = matrix.AsConst();
  EXPECT_TRUE(StructsEqual(matrix, matrix_copy));
  EXPECT_TRUE(StructsEqual(matrix, readonly));
  // size_t index matches kNumViews and avoids a signed/unsigned comparison
  // (the original used int).
  for (size_t i = 0; i < kNumViews; ++i) {
    EXPECT_EQ(matrix.vector(i).data(), matrix_copy.vector(i).data());
    EXPECT_EQ(matrix.vector(i).data(), readonly.vector(i).data());
  }

  // Transpose matrix.  Transposition changes the struct contents, but
  // transposing the const view must agree with transposing the original.
  auto transposed = matrix.Transpose();
  auto readonly_transposed = readonly.Transpose();
  EXPECT_FALSE(StructsEqual(matrix, transposed));
  EXPECT_FALSE(StructsEqual(readonly, readonly_transposed));
  EXPECT_TRUE(StructsEqual(transposed, readonly_transposed));
}
}
// namespace
}
// namespace runtime
}
// namespace dragnn
}
// namespace syntaxnet
research/syntaxnet/dragnn/runtime/mmap.cc
0 → 100644
View file @
a4bb31d0
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// =============================================================================
#include "dragnn/runtime/mmap.h"
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <utility>
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/gtl/cleanup.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/logging.h"
namespace
syntaxnet
{
namespace
dragnn
{
namespace
runtime
{
// Opens the file at |path| read-only via open(2).  Returns the new file
// descriptor, or -1 on failure (per the open() contract).
int UniqueAlignedMmap::Syscalls::Open(const string &path) const {
  return open(path.c_str(), O_RDONLY);
}
// Closes |file_descriptor| via close(2).  Returns 0 on success, nonzero on
// failure.
int UniqueAlignedMmap::Syscalls::Close(int file_descriptor) const {
  return close(file_descriptor);
}
// Maps the first |size| bytes of |file_descriptor| as a shared, read-only
// region at a kernel-chosen address.  Returns MAP_FAILED on error (per the
// mmap() contract).
void *UniqueAlignedMmap::Syscalls::Mmap(int file_descriptor,
                                        size_t size) const {
  return mmap(nullptr, size, PROT_READ, MAP_SHARED, file_descriptor, 0);
}
// Unmaps the |size|-byte region at |data| via munmap(2).  Returns 0 on
// success, nonzero on failure.
int UniqueAlignedMmap::Syscalls::Munmap(void *data, size_t size) const {
  return munmap(data, size);
}
// Takes ownership of |syscalls|, the syscall wrapper this instance will use
// for open/close/mmap/munmap.  (Injectable — presumably to allow tests to
// substitute fake syscalls; confirm against callers.)
UniqueAlignedMmap::UniqueAlignedMmap(std::unique_ptr<Syscalls> syscalls)
    : syscalls_(std::move(syscalls)) {}
// Move constructor.  Transfers the syscall wrapper, the mapped view, and the
// path from |that|, leaving |that| with an empty view and path so its
// destructor will not unmap the transferred region.
UniqueAlignedMmap::UniqueAlignedMmap(UniqueAlignedMmap &&that)
    : syscalls_(std::move(that.syscalls_)) {
  view_ = that.view_;
  path_ = std::move(that.path_);  // was a needless string copy
  that.view_ = MutableAlignedView();
  that.path_.clear();  // guarantees |that| ends with an empty path
}
// Move assignment.  Releases any mapping this instance already holds, then
// transfers |that|'s state, leaving |that| empty.
UniqueAlignedMmap &UniqueAlignedMmap::operator=(UniqueAlignedMmap &&that) {
  if (this != &that) {  // guard self-move
    // Unmap the current region first; otherwise it would leak, since the
    // destructor only unmaps the final value of view_.
    UnmapIfNonEmpty(view_.data(), view_.size(), path_);
    syscalls_ = std::move(that.syscalls_);
    view_ = that.view_;
    path_ = std::move(that.path_);  // was a needless string copy
    that.view_ = MutableAlignedView();
    that.path_.clear();
  }
  return *this;
}
// Unmaps the owned region, if any.  Failures are logged by UnmapIfNonEmpty(),
// not thrown or returned.
UniqueAlignedMmap::~UniqueAlignedMmap() {
  UnmapIfNonEmpty(view_.data(), view_.size(), path_);
}
// Memory-maps the file at |path| read-only and points view_ at the mapping.
// On any failure, partially-acquired resources (file descriptor, mapping) are
// released by the scoped cleanups below and this object's state is unchanged.
// NOTE(review): an existing mapping in view_ is overwritten on success
// without being unmapped first — calling Reset() twice appears to leak the
// first mapping; confirm intended usage.
tensorflow::Status UniqueAlignedMmap::Reset(const string &path) {
  uint64 size = 0;
  TF_RETURN_IF_ERROR(tensorflow::Env::Default()->GetFileSize(path, &size));

  // Since mmap() cannot map 0 bytes, we skip the call on empty files. This is
  // OK because UnmapIfNonEmpty() also skips munmap() on an empty region.
  if (size == 0) {
    view_ = MutableAlignedView();
    path_ = path;
    return tensorflow::Status::OK();
  }

  const int file_descriptor = syscalls_->Open(path);
  if (file_descriptor == -1) {
    // TODO(googleuser): Use strerror_r() to export the system error message.
    return tensorflow::errors::Unknown("Failed to open '", path, "'");
  }

  // In case we error out.
  auto ensure_closed = tensorflow::gtl::MakeCleanup([&] {
    if (syscalls_->Close(file_descriptor) != 0) {
      LOG(ERROR) << "Failed to close '" << path << "'";
    }
  });

  void *mmap_result = syscalls_->Mmap(file_descriptor, size);
  if (mmap_result == MAP_FAILED) {
    return tensorflow::errors::Unknown("Failed to mmap '", path, "'");
  }

  // In case we error out.
  auto ensure_unmapped = tensorflow::gtl::MakeCleanup(
      [&] { UnmapIfNonEmpty(mmap_result, size, path); });

  // Since mmap() increments the refcount of the |file_descriptor|, it must be
  // closed to prevent a leak.
  ensure_closed.release();  // going to close it manually
  if (syscalls_->Close(file_descriptor) != 0) {
    return tensorflow::errors::Unknown("Failed to close '", path, "'");
  }

  // Most implementations of mmap() place the mapped region on a page boundary,
  // which is plenty of alignment. Since this is so unlikely to fail, we don't
  // try to recover if the address is misaligned. A potential recovery method
  // is to allocate a UniqueAlignedArray and read the file normally.
  MutableAlignedView data;
  TF_RETURN_IF_ERROR(data.Reset(reinterpret_cast<char *>(mmap_result), size));

  // Success; make modifications.
  view_ = data;
  path_ = path;
  ensure_unmapped.release();  // this has taken ownership of the mapped file
  return tensorflow::Status::OK();
}
void
UniqueAlignedMmap
::
UnmapIfNonEmpty
(
void
*
data
,
size_t
size
,
const
string
&
path
)
const
{
if
(
size
==
0
)
return
;
if
(
syscalls_
->
Munmap
(
data
,
size
)
!=
0
)
{
LOG
(
ERROR
)
<<
"Failed to munmap() file '"
<<
path
<<
"'"
;
}
}
}
// namespace runtime
}
// namespace dragnn
}
// namespace syntaxnet
Prev
1
…
6
7
8
9
10
11
12
13
14
15
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment