More optimizations to CPU platform

8d0fee51 · peastman · 40947735 · 8d0fee51 · 8d0fee51 · 8d0fee51
Commit 8d0fee51 authored Oct 01, 2015 by peastman
10 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -82,7 +82,7 @@ ENDIF(${CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT})
 # The source is organized into subdirectories, but we handle them all from
 # this CMakeLists file rather than letting CMake visit them as SUBDIRS.
-SET(OPENMM_SOURCE_SUBDIRS . openmmapi olla libraries/jama libraries/quern libraries/lepton libraries/sfmt libraries/lbfgs libraries/hilbert libraries/csha1 platforms/reference serialization libraries/validate libraries/irrxml)
+SET(OPENMM_SOURCE_SUBDIRS . openmmapi olla libraries/jama libraries/quern libraries/lepton libraries/sfmt libraries/lbfgs libraries/hilbert libraries/csha1 platforms/reference serialization libraries/validate libraries/irrxml libraries/vecmath)
 IF(WIN32)
    SET(OPENMM_SOURCE_SUBDIRS ${OPENMM_SOURCE_SUBDIRS} libraries/pthreads)
 ELSE(WIN32)

--- a/libraries/vecmath/include/neon_mathfun.h
+++ b/libraries/vecmath/include/neon_mathfun.h
+/* NEON implementation of sin, cos, exp and log
+   Inspired by Intel Approximate Math library, and based on the
+   corresponding algorithms of the cephes math library
+*/
+/* Copyright (C) 2011  Julien Pommier
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+  (this is the zlib license)
+*/
+#include <arm_neon.h>
+typedef float32x4_t v4sf;  // vector of 4 float
+typedef uint32x4_t v4su;  // vector of 4 uint32
+typedef int32x4_t v4si;  // vector of 4 uint32
+#define c_inv_mant_mask ~0x7f800000u
+#define c_cephes_SQRTHF 0.707106781186547524
+#define c_cephes_log_p0 7.0376836292E-2
+#define c_cephes_log_p1 - 1.1514610310E-1
+#define c_cephes_log_p2 1.1676998740E-1
+#define c_cephes_log_p3 - 1.2420140846E-1
+#define c_cephes_log_p4 + 1.4249322787E-1
+#define c_cephes_log_p5 - 1.6668057665E-1
+#define c_cephes_log_p6 + 2.0000714765E-1
+#define c_cephes_log_p7 - 2.4999993993E-1
+#define c_cephes_log_p8 + 3.3333331174E-1
+#define c_cephes_log_q1 -2.12194440e-4
+#define c_cephes_log_q2 0.693359375
+/* natural logarithm computed for 4 simultaneous float 
+   return NaN for x <= 0
+*/
+v4sf log_ps(v4sf x) {
+  v4sf one = vdupq_n_f32(1);
+  x = vmaxq_f32(x, vdupq_n_f32(0)); /* force flush to zero on denormal values */
+  v4su invalid_mask = vcleq_f32(x, vdupq_n_f32(0));
+  v4si ux = vreinterpretq_s32_f32(x);
+  v4si emm0 = vshrq_n_s32(ux, 23);
+  /* keep only the fractional part */
+  ux = vandq_s32(ux, vdupq_n_s32(c_inv_mant_mask));
+  ux = vorrq_s32(ux, vreinterpretq_s32_f32(vdupq_n_f32(0.5f)));
+  x = vreinterpretq_f32_s32(ux);
+  emm0 = vsubq_s32(emm0, vdupq_n_s32(0x7f));
+  v4sf e = vcvtq_f32_s32(emm0);
+  e = vaddq_f32(e, one);
+  /* part2: 
+     if( x < SQRTHF ) {
+       e -= 1;
+       x = x + x - 1.0;
+     } else { x = x - 1.0; }
+  */
+  v4su mask = vcltq_f32(x, vdupq_n_f32(c_cephes_SQRTHF));
+  v4sf tmp = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(x), mask));
+  x = vsubq_f32(x, one);
+  e = vsubq_f32(e, vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(one), mask)));
+  x = vaddq_f32(x, tmp);
+  v4sf z = vmulq_f32(x,x);
+  v4sf y = vdupq_n_f32(c_cephes_log_p0);
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p1));
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p2));
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p3));
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p4));
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p5));
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p6));
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p7));
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, vdupq_n_f32(c_cephes_log_p8));
+  y = vmulq_f32(y, x);
+  y = vmulq_f32(y, z);
+  tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q1));
+  y = vaddq_f32(y, tmp);
+  tmp = vmulq_f32(z, vdupq_n_f32(0.5f));
+  y = vsubq_f32(y, tmp);
+  tmp = vmulq_f32(e, vdupq_n_f32(c_cephes_log_q2));
+  x = vaddq_f32(x, y);
+  x = vaddq_f32(x, tmp);
+  x = vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(x), invalid_mask)); // negative arg will be NAN
+  return x;
+}
+#define c_exp_hi 88.3762626647949f
+#define c_exp_lo -88.3762626647949f
+#define c_cephes_LOG2EF 1.44269504088896341
+#define c_cephes_exp_C1 0.693359375
+#define c_cephes_exp_C2 -2.12194440e-4
+#define c_cephes_exp_p0 1.9875691500E-4
+#define c_cephes_exp_p1 1.3981999507E-3
+#define c_cephes_exp_p2 8.3334519073E-3
+#define c_cephes_exp_p3 4.1665795894E-2
+#define c_cephes_exp_p4 1.6666665459E-1
+#define c_cephes_exp_p5 5.0000001201E-1
+/* exp() computed for 4 float at once */
+v4sf exp_ps(v4sf x) {
+  v4sf tmp, fx;
+  v4sf one = vdupq_n_f32(1);
+  x = vminq_f32(x, vdupq_n_f32(c_exp_hi));
+  x = vmaxq_f32(x, vdupq_n_f32(c_exp_lo));
+  /* express exp(x) as exp(g + n*log(2)) */
+  fx = vmlaq_f32(vdupq_n_f32(0.5f), x, vdupq_n_f32(c_cephes_LOG2EF));
+  /* perform a floorf */
+  tmp = vcvtq_f32_s32(vcvtq_s32_f32(fx));
+  /* if greater, substract 1 */
+  v4su mask = vcgtq_f32(tmp, fx);    
+  mask = vandq_u32(mask, vreinterpretq_u32_f32(one));
+  fx = vsubq_f32(tmp, vreinterpretq_f32_u32(mask));
+  tmp = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C1));
+  v4sf z = vmulq_f32(fx, vdupq_n_f32(c_cephes_exp_C2));
+  x = vsubq_f32(x, tmp);
+  x = vsubq_f32(x, z);
+  static const float cephes_exp_p[6] = { c_cephes_exp_p0, c_cephes_exp_p1, c_cephes_exp_p2, c_cephes_exp_p3, c_cephes_exp_p4, c_cephes_exp_p5 };
+  v4sf y = vld1q_dup_f32(cephes_exp_p+0);
+  v4sf c1 = vld1q_dup_f32(cephes_exp_p+1); 
+  v4sf c2 = vld1q_dup_f32(cephes_exp_p+2); 
+  v4sf c3 = vld1q_dup_f32(cephes_exp_p+3); 
+  v4sf c4 = vld1q_dup_f32(cephes_exp_p+4); 
+  v4sf c5 = vld1q_dup_f32(cephes_exp_p+5);
+  y = vmulq_f32(y, x);
+  z = vmulq_f32(x,x);
+  y = vaddq_f32(y, c1);
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, c2);
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, c3);
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, c4);
+  y = vmulq_f32(y, x);
+  y = vaddq_f32(y, c5);
+  y = vmulq_f32(y, z);
+  y = vaddq_f32(y, x);
+  y = vaddq_f32(y, one);
+  /* build 2^n */
+  int32x4_t mm;
+  mm = vcvtq_s32_f32(fx);
+  mm = vaddq_s32(mm, vdupq_n_s32(0x7f));
+  mm = vshlq_n_s32(mm, 23);
+  v4sf pow2n = vreinterpretq_f32_s32(mm);
+  y = vmulq_f32(y, pow2n);
+  return y;
+}
+#define c_minus_cephes_DP1 -0.78515625
+#define c_minus_cephes_DP2 -2.4187564849853515625e-4
+#define c_minus_cephes_DP3 -3.77489497744594108e-8
+#define c_sincof_p0 -1.9515295891E-4
+#define c_sincof_p1  8.3321608736E-3
+#define c_sincof_p2 -1.6666654611E-1
+#define c_coscof_p0  2.443315711809948E-005
+#define c_coscof_p1 -1.388731625493765E-003
+#define c_coscof_p2  4.166664568298827E-002
+#define c_cephes_FOPI 1.27323954473516 // 4 / M_PI
+/* evaluation of 4 sines & cosines at once.
+   The code is the exact rewriting of the cephes sinf function.
+   Precision is excellent as long as x < 8192 (I did not bother to
+   take into account the special handling they have for greater values
+   -- it does not return garbage for arguments over 8192, though, but
+   the extra precision is missing).
+   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
+   surprising but correct result.
+   Note also that when you compute sin(x), cos(x) is available at
+   almost no extra price so both sin_ps and cos_ps make use of
+   sincos_ps..
+  */
+void sincos_ps(v4sf x, v4sf *ysin, v4sf *ycos) { // any x
+  v4sf xmm1, xmm2, xmm3, y;
+  v4su emm2;
+  v4su sign_mask_sin, sign_mask_cos;
+  sign_mask_sin = vcltq_f32(x, vdupq_n_f32(0));
+  x = vabsq_f32(x);
+  /* scale by 4/Pi */
+  y = vmulq_f32(x, vdupq_n_f32(c_cephes_FOPI));
+  /* store the integer part of y in mm0 */
+  emm2 = vcvtq_u32_f32(y);
+  /* j=(j+1) & (~1) (see the cephes sources) */
+  emm2 = vaddq_u32(emm2, vdupq_n_u32(1));
+  emm2 = vandq_u32(emm2, vdupq_n_u32(~1));
+  y = vcvtq_f32_u32(emm2);
+  /* get the polynom selection mask 
+     there is one polynom for 0 <= x <= Pi/4
+     and another one for Pi/4<x<=Pi/2
+     Both branches will be computed.
+  */
+  v4su poly_mask = vtstq_u32(emm2, vdupq_n_u32(2));
+  /* The magic pass: "Extended precision modular arithmetic" 
+     x = ((x - y * DP1) - y * DP2) - y * DP3; */
+  xmm1 = vmulq_n_f32(y, c_minus_cephes_DP1);
+  xmm2 = vmulq_n_f32(y, c_minus_cephes_DP2);
+  xmm3 = vmulq_n_f32(y, c_minus_cephes_DP3);
+  x = vaddq_f32(x, xmm1);
+  x = vaddq_f32(x, xmm2);
+  x = vaddq_f32(x, xmm3);
+  sign_mask_sin = veorq_u32(sign_mask_sin, vtstq_u32(emm2, vdupq_n_u32(4)));
+  sign_mask_cos = vtstq_u32(vsubq_u32(emm2, vdupq_n_u32(2)), vdupq_n_u32(4));
+  /* Evaluate the first polynom  (0 <= x <= Pi/4) in y1, 
+     and the second polynom      (Pi/4 <= x <= 0) in y2 */
+  v4sf z = vmulq_f32(x,x);
+  v4sf y1, y2;
+  y1 = vmulq_n_f32(z, c_coscof_p0);
+  y2 = vmulq_n_f32(z, c_sincof_p0);
+  y1 = vaddq_f32(y1, vdupq_n_f32(c_coscof_p1));
+  y2 = vaddq_f32(y2, vdupq_n_f32(c_sincof_p1));
+  y1 = vmulq_f32(y1, z);
+  y2 = vmulq_f32(y2, z);
+  y1 = vaddq_f32(y1, vdupq_n_f32(c_coscof_p2));
+  y2 = vaddq_f32(y2, vdupq_n_f32(c_sincof_p2));
+  y1 = vmulq_f32(y1, z);
+  y2 = vmulq_f32(y2, z);
+  y1 = vmulq_f32(y1, z);
+  y2 = vmulq_f32(y2, x);
+  y1 = vsubq_f32(y1, vmulq_f32(z, vdupq_n_f32(0.5f)));
+  y2 = vaddq_f32(y2, x);
+  y1 = vaddq_f32(y1, vdupq_n_f32(1));
+  /* select the correct result from the two polynoms */  
+  v4sf ys = vbslq_f32(poly_mask, y1, y2);
+  v4sf yc = vbslq_f32(poly_mask, y2, y1);
+  *ysin = vbslq_f32(sign_mask_sin, vnegq_f32(ys), ys);
+  *ycos = vbslq_f32(sign_mask_cos, yc, vnegq_f32(yc));
+}
+v4sf sin_ps(v4sf x) {
+  v4sf ysin, ycos; 
+  sincos_ps(x, &ysin, &ycos); 
+  return ysin;
+}
+v4sf cos_ps(v4sf x) {
+  v4sf ysin, ycos; 
+  sincos_ps(x, &ysin, &ycos); 
+  return ycos;
+}
--- a/libraries/vecmath/include/sse_mathfun.h
+++ b/libraries/vecmath/include/sse_mathfun.h
--- a/libraries/vecmath/src/vecmath.cpp
+++ b/libraries/vecmath/src/vecmath.cpp
+#if defined(__ANDROID__)
+    #include "neon_mathfun.h"
+#else
+    #if !defined(__PNACL__)
+        #include "sse_mathfun.h"
+    #endif
+#endif
--- a/openmmapi/include/openmm/internal/vectorize_neon.h
+++ b/openmmapi/include/openmm/internal/vectorize_neon.h
@@ -40,6 +40,10 @@ typedef int int32_t;
 // This file defines classes and functions to simplify vectorizing code with NEON.
+// These two functions are defined in the vecmath library, which is linked into OpenMM.
+float32x4_t exp_ps(float32x4_t);
+float32x4_t log_ps(float32x4_t);
 /**
 * Determine whether ivec4 and fvec4 are supported on this processor.
 */
@@ -262,6 +266,14 @@ static inline fvec4 sqrt(const fvec4& v) {
    return rsqrt(v)*v;
 }
+static inline fvec4 exp(const fvec4& v) {
+    return fvec4(exp_ps(v.val));
+}
+static inline fvec4 log(const fvec4& v) {
+    return fvec4(log_ps(v.val));
+}
 static inline float dot3(const fvec4& v1, const fvec4& v2) {
    fvec4 result = v1*v2;
    return vgetq_lane_f32(result, 0) + vgetq_lane_f32(result, 1) + vgetq_lane_f32(result, 2);

--- a/openmmapi/include/openmm/internal/vectorize_pnacl.h
+++ b/openmmapi/include/openmm/internal/vectorize_pnacl.h
@@ -233,6 +233,14 @@ static inline fvec4 abs(const fvec4& v) {
    return v&(__m128) ivec4(0x7FFFFFFF);
 }
+static inline fvec4 exp(const fvec4& v) {
+    return fvec4(expf(v[0]), expf(v[1]), expf(v[2]), expf(v[3]));
+}
+static inline fvec4 log(const fvec4& v) {
+    return fvec4(logf(v[0]), logf(v[1]), logf(v[2]), logf(v[3]));
+}
 static inline float dot3(const fvec4& v1, const fvec4& v2) {
    fvec4 r = v1*v2;
    return r[0]+r[1]+r[2];

--- a/openmmapi/include/openmm/internal/vectorize_sse.h
+++ b/openmmapi/include/openmm/internal/vectorize_sse.h
@@ -37,6 +37,10 @@
 // This file defines classes and functions to simplify vectorizing code with SSE.
+// These two functions are defined in the vecmath library, which is linked into OpenMM.
+__m128 exp_ps(__m128);
+__m128 log_ps(__m128);
 /**
 * Determine whether ivec4 and fvec4 are supported on this processor.
 */
@@ -253,6 +257,14 @@ static inline fvec4 rsqrt(const fvec4& v) {
    return y;
 }
+static inline fvec4 exp(const fvec4& v) {
+    return fvec4(exp_ps(v.val));
+}
+static inline fvec4 log(const fvec4& v) {
+    return fvec4(log_ps(v.val));
+}
 static inline float dot3(const fvec4& v1, const fvec4& v2) {
    return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0x71));
 }

--- a/platforms/cpu/src/CpuGBSAOBCForce.cpp
+++ b/platforms/cpu/src/CpuGBSAOBCForce.cpp
@@ -279,7 +279,7 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
            fvec4 r = sqrt(r2);
            fvec4 alpha2_ij = radii*bornRadii[atomJ];
            fvec4 D_ij = r2/(4.0f*alpha2_ij);
-            fvec4 expTerm(expf(-D_ij[0]), expf(-D_ij[1]), expf(-D_ij[2]), expf(-D_ij[3]));
+            fvec4 expTerm = exp(-D_ij);
            fvec4 denominator2 = r2 + alpha2_ij*expTerm;
            fvec4 denominator = sqrt(denominator2);
            fvec4 Gpol = (partialChargeI*posJ[3])/denominator; 

--- a/platforms/cpu/src/CpuNonbondedForce.cpp
+++ b/platforms/cpu/src/CpuNonbondedForce.cpp
@@ -322,6 +322,14 @@ void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const
    threads.execute(task);
    threads.waitForThreads();
+    // Signal the threads to subtract the exclusions.
+    if (ewald || pme) {
+        gmx_atomic_set(&counter, 0);
+        threads.resumeThreads();
+        threads.waitForThreads();
+    }
    // Combine the energies from all the threads.
    if (totalEnergy != NULL) {
@@ -354,28 +362,37 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
        // Now subtract off the exclusions, since they were implicitly included in the reciprocal space sum.
-        for (int i = threadIndex; i < numberOfAtoms; i += numThreads) {
+        threads.syncThreads();
-            fvec4 posI((float) atomCoordinates[i][0], (float) atomCoordinates[i][1], (float) atomCoordinates[i][2], 0.0f);
+        const int groupSize = max(1, numberOfAtoms/(10*numThreads));
-            for (set<int>::const_iterator iter = exclusions[i].begin(); iter != exclusions[i].end(); ++iter) {
+        while (true) {
-                if (*iter > i) {
+            int start = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), groupSize);
-                    int j = *iter;
+            if (start >= numberOfAtoms)
-                    fvec4 deltaR;
+                break;
-                    fvec4 posJ((float) atomCoordinates[j][0], (float) atomCoordinates[j][1], (float) atomCoordinates[j][2], 0.0f);
+            int end = min(start+groupSize, numberOfAtoms);
-                    float r2;
+            for (int i = start; i < end; i++) {
-                    getDeltaR(posJ, posI, deltaR, r2, false, boxSize, invBoxSize);
+                fvec4 posI(posq[4*i], posq[4*i+1], posq[4*i+2], 0.0f);
-                    float r = sqrtf(r2);
+                float scaledChargeI = (float) (ONE_4PI_EPS0*posq[4*i+3]);
-                    float inverseR = 1/r;
+                for (set<int>::const_iterator iter = exclusions[i].begin(); iter != exclusions[i].end(); ++iter) {
-                    float chargeProd = ONE_4PI_EPS0*posq[4*i+3]*posq[4*j+3];
+                    if (*iter > i) {
-                    float alphaR = alphaEwald*r;
+                        int j = *iter;
-                    float erfAlphaR = erf(alphaR);
+                        fvec4 deltaR;
-                    if (erfAlphaR > 1e-6f) {
+                        fvec4 posJ(posq[4*j], posq[4*j+1], posq[4*j+2], 0.0f);
-                        float dEdR = (float) (chargeProd * inverseR * inverseR * inverseR);
+                        float r2;
-                        dEdR = (float) (dEdR * (erfAlphaR-TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR)));
+                        getDeltaR(posJ, posI, deltaR, r2, false, boxSize, invBoxSize);
-                        fvec4 result = deltaR*dEdR;
+                        float r = sqrtf(r2);
-                        (fvec4(forces+4*i)-result).store(forces+4*i);
+                        float alphaR = alphaEwald*r;
-                        (fvec4(forces+4*j)+result).store(forces+4*j);
+                        float erfAlphaR = erf(alphaR);
-                        if (includeEnergy)
+                        if (erfAlphaR > 1e-6f) {
-                            threadEnergy[threadIndex] -= chargeProd*inverseR*erfAlphaR;
+                            float inverseR = 1/r;
+                            float chargeProdOverR = scaledChargeI*posq[4*j+3]*inverseR;
+                            float dEdR = chargeProdOverR*inverseR*inverseR;
+                            dEdR = dEdR * (erfAlphaR-(float)TWO_OVER_SQRT_PI*alphaR*(float)exp(-alphaR*alphaR));
+                            fvec4 result = deltaR*dEdR;
+                            (fvec4(forces+4*i)-result).store(forces+4*i);
+                            (fvec4(forces+4*j)+result).store(forces+4*j);
+                            if (includeEnergy)
+                                threadEnergy[threadIndex] -= chargeProdOverR*erfAlphaR;
+                        }
                    }
                }
            }

--- a/tests/TestVectorize.cpp
+++ b/tests/TestVectorize.cpp
@@ -149,6 +149,8 @@ void testMathFunctions() {
    ASSERT_VEC4_EQUAL(max(f1, f2), 1.1, 1.9, 1.3, -3.8);
    ASSERT_VEC4_EQUAL(sqrt(fvec4(1.5, 3.1, 4.0, 15.0)), sqrt(1.5), sqrt(3.1), sqrt(4.0), sqrt(15.0));
    ASSERT_VEC4_EQUAL(rsqrt(fvec4(1.5, 3.1, 4.0, 15.0)), 1.0/sqrt(1.5), 1.0/sqrt(3.1), 1.0/sqrt(4.0), 1.0/sqrt(15.0));
+    ASSERT_VEC4_EQUAL(exp(fvec4(-2.1, -0.5, 1.5, 3.1)), expf(-2.1), expf(-0.5), expf(1.5), expf(3.1));
+    ASSERT_VEC4_EQUAL(log(fvec4(1.5, 3.1, 4.0, 15.0)), logf(1.5), logf(3.1), logf(4.0), logf(15.0));
    ASSERT_EQUAL_TOL(f1[0]*f2[0]+f1[1]*f2[1]+f1[2]*f2[2], dot3(f1, f2), 1e-6);
    ASSERT_EQUAL_TOL(f1[0]*f2[0]+f1[1]*f2[1]+f1[2]*f2[2]+f1[3]*f2[3], dot4(f1, f2), 1e-6);
    ASSERT(any(f1 > 0.5));