Enforce memory alignment to improve performance of vector operations. Also...

Enforce memory alignment to improve performance of vector operations. Also fixed bugs in an earlier optimization.

Enforce memory alignment to improve performance of vector operations. Also...
Enforce memory alignment to improve performance of vector operations. Also fixed bugs in an earlier optimization.
b1be68d8 · peastman · be04ec57 · b1be68d8 · b1be68d8 · b1be68d8
Commit b1be68d8 authored Dec 04, 2013 by peastman
11 changed files
--- a/platforms/cpu/include/AlignedArray.h
+++ b/platforms/cpu/include/AlignedArray.h
+#ifndef OPENMM_ALIGNEDARRAY_H_
+#define OPENMM_ALIGNEDARRAY_H_
+
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+
+namespace OpenMM {
+
+/**
+ * This class represents an array in memory whose starting point is guaranteed to
+ * be aligned with a 16 byte boundary.  This can improve the performance of vectorized
+ * code, since loads and stores are more efficient.
+ */
+template <class T>
+class AlignedArray {
+public:
+    /**
+     * Default constructor, to allow AlignedArrays to be used inside collections.
+     */
+    AlignedArray() : dataSize(0), baseData(0), data(0) {
+    }
+    /**
+     * Create an Aligned array that contains a specified number of elements.
+     */
+    AlignedArray(int size) {
+        allocate(size);
+    }
+    ~AlignedArray() {
+        if (baseData != 0)
+            delete[] baseData;
+    }
+    /**
+     * Get the number of elements in the array.
+     */
+    int size() const {
+        return dataSize;
+    }
+    /**
+     * Change the size of the array.  This may cause all contents to be lost.
+     */
+    void resize(int size) {
+        if (dataSize == size)
+            return;
+        if (baseData != 0)
+            delete[] baseData;
+        allocate(size);
+    }
+    /**
+     * Get a reference to an element of the array.
+     */
+    T& operator[](int i) {
+        return data[i];
+    }
+    /**
+     * Get a const reference to an element of the array.
+     */
+    const T& operator[](int i) const {
+        return data[i];
+    }
+private:
+    void allocate(int size) {
+        dataSize = size;
+        baseData = new char[size*sizeof(T)+16];
+        char* offsetData = baseData+15;
+        offsetData -= (long long)offsetData&0xF;
+        data = (T*) offsetData;
+    }
+    int dataSize;
+    char* baseData;
+    T* data;
+};
+
+} // namespace OpenMM
+
+#endif /*OPENMM_ALIGNEDARRAY_H_*/
+
--- a/platforms/cpu/include/CpuGBSAOBCForce.h
+++ b/platforms/cpu/include/CpuGBSAOBCForce.h
@@ -25,6 +25,7 @@
 #ifndef OPENMM_CPU_GBSAOBC_FORCE_H__
 #define OPENMM_CPU_GBSAOBC_FORCE_H__

+#include "AlignedArray.h"
 #include "openmm/internal/ThreadPool.h"
 #include "openmm/internal/vectorize.h"
 #include <set>
@@ -84,7 +85,7 @@ public:
     * @param totalEnergy      total energy
     * @param threads          the thread pool to use
     */
-    void computeForce(const std::vector<float>& posq, std::vector<std::vector<float> >& threadForce, double* totalEnergy, ThreadPool& threads);
+    void computeForce(const AlignedArray<float>& posq, std::vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads);

    /**
     * This routine contains the code executed by each thread.
@@ -105,11 +106,13 @@ private:
    float logDX, logDXInv;
    // The following variables are used to make information accessible to the individual threads.
    float const* posq;
-    std::vector<std::vector<float> >* threadForce;
+    std::vector<AlignedArray<float> >* threadForce;
    bool includeEnergy;
    void* atomicCounter;
  
    static const int NUM_TABLE_POINTS;
+    static const float TABLE_MIN;
+    static const float TABLE_MAX;

    /**
     * Compute the displacement and squared distance between a collection of points, optionally using

--- a/platforms/cpu/include/CpuNeighborList.h
+++ b/platforms/cpu/include/CpuNeighborList.h
@@ -32,6 +32,7 @@
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */

+#include "AlignedArray.h"
 #include "windowsExportCpu.h"
 #include "openmm/internal/ThreadPool.h"
 #include <set>
@@ -46,7 +47,7 @@ public:
    class Voxels;
    static const int BlockSize;
    CpuNeighborList();
-    void computeNeighborList(int numAtoms, const std::vector<float>& atomLocations, const std::vector<std::set<int> >& exclusions,
+    void computeNeighborList(int numAtoms, const AlignedArray<float>& atomLocations, const std::vector<std::set<int> >& exclusions,
            const float* periodicBoxSize, bool usePeriodic, float maxDistance, ThreadPool& threads);
    int getNumBlocks() const;
    const std::vector<int>& getSortedAtoms() const;

--- a/platforms/cpu/include/CpuNonbondedForce.h
+++ b/platforms/cpu/include/CpuNonbondedForce.h
@@ -25,6 +25,7 @@
 #ifndef OPENMM_CPU_NONBONDED_FORCE_H__
 #define OPENMM_CPU_NONBONDED_FORCE_H__

+#include "AlignedArray.h"
 #include "CpuNeighborList.h"
 #include "ReferencePairIxn.h"
 #include "openmm/internal/ThreadPool.h"
@@ -143,7 +144,7 @@ class CpuNonbondedForce {
         --------------------------------------------------------------------------------------- */
          
      void calculateDirectIxn(int numberOfAtoms, float* posq, const std::vector<RealVec>& atomCoordinates, const std::vector<std::pair<float, float> >& atomParameters,
-            const std::vector<std::set<int> >& exclusions, std::vector<std::vector<float> >& threadForce, double* totalEnergy, ThreadPool& threads);
+            const std::vector<std::set<int> >& exclusions, std::vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads);

    /**
     * This routine contains the code executed by each thread.
@@ -173,7 +174,7 @@ private:
        RealVec const* atomCoordinates;
        std::pair<float, float> const* atomParameters;        
        std::set<int> const* exclusions;
-        std::vector<std::vector<float> >* threadForce;
+        std::vector<AlignedArray<float> >* threadForce;
        bool includeEnergy;
        void* atomicCounter;


--- a/platforms/cpu/include/CpuPlatform.h
+++ b/platforms/cpu/include/CpuPlatform.h
@@ -32,6 +32,7 @@
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */

+#include "AlignedArray.h"
 #include "ReferencePlatform.h"
 #include "openmm/internal/ContextImpl.h"
 #include "openmm/internal/ThreadPool.h"
@@ -69,8 +70,8 @@ private:
 class CpuPlatform::PlatformData {
 public:
    PlatformData(int numParticles);
-    std::vector<float> posq;
-    std::vector<std::vector<float> > threadForce;
+    AlignedArray<float> posq;
+    std::vector<AlignedArray<float> > threadForce;
    ThreadPool threads;
    bool isPeriodic;
 };

--- a/platforms/cpu/src/CpuGBSAOBCForce.cpp
+++ b/platforms/cpu/src/CpuGBSAOBCForce.cpp
@@ -31,7 +31,9 @@
 using namespace std;
 using namespace OpenMM;

-const int CpuGBSAOBCForce::NUM_TABLE_POINTS = 2048;
+const int CpuGBSAOBCForce::NUM_TABLE_POINTS = 4096;
+const float CpuGBSAOBCForce::TABLE_MIN = 0.25f;
+const float CpuGBSAOBCForce::TABLE_MAX = 1.5f;

 class CpuGBSAOBCForce::ComputeTask : public ThreadPool::Task {
 public:
@@ -44,11 +46,11 @@ public:
 };

 CpuGBSAOBCForce::CpuGBSAOBCForce() : cutoff(false), periodic(false) {
-    logDX = 0.5/NUM_TABLE_POINTS;
+    logDX = (TABLE_MAX-TABLE_MIN)/NUM_TABLE_POINTS;
    logDXInv = 1.0f/logDX;
    logTable.resize(NUM_TABLE_POINTS+1);
    for (int i = 0; i < NUM_TABLE_POINTS+1; i++) {
-        double x = 0.5+i*logDX;
+        double x = TABLE_MIN+i*logDX;
        logTable[i] = log(x);
    }
 }
@@ -83,7 +85,7 @@ void CpuGBSAOBCForce::setParticleParameters(const std::vector<std::pair<float, f
    obcChain.resize(params.size()+3);
 }

-void CpuGBSAOBCForce::computeForce(const std::vector<float>& posq, vector<vector<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
+void CpuGBSAOBCForce::computeForce(const AlignedArray<float>& posq, vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
    // Record the parameters for the threads.
    
    this->posq = &posq[0];
@@ -393,14 +395,15 @@ void CpuGBSAOBCForce::getDeltaR(const fvec4& posI, const fvec4& x, const fvec4&
 fvec4 CpuGBSAOBCForce::fastLog(fvec4 x) {
    // Evaluate log(x) using a lookup table for speed.

-    fvec4 x1 = (x-0.5f)*logDXInv;
+    if (any(x < TABLE_MIN) || any(x >= TABLE_MAX))
+        return fvec4(logf(x[0]), logf(x[1]), logf(x[2]), logf(x[3]));
+    fvec4 x1 = (x-TABLE_MIN)*logDXInv;
    ivec4 index = floor(x1);
    fvec4 coeff2 = x1-index;
    fvec4 coeff1 = 1.0f-coeff2;
    float table1[4], table2[4];
    for (int i = 0; i < 4; i++) {
        int tableIndex = index[i];
-        if (tableIndex < NUM_TABLE_POINTS)
        table1[i] = logTable[tableIndex];
        table2[i] = logTable[tableIndex+1];
    }

--- a/platforms/cpu/src/CpuKernels.cpp
+++ b/platforms/cpu/src/CpuKernels.cpp
@@ -81,7 +81,7 @@ void CpuCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool i
    
    // Convert the positions to single precision and apply periodic boundary conditions
    
-    vector<float>& posq = data.posq;
+    AlignedArray<float>& posq = data.posq;
    vector<RealVec>& posData = extractPositions(context);
    RealVec boxSize = extractBoxSize(context);
    float floatBoxSize[3] = {(float) boxSize[0], (float) boxSize[1], (float) boxSize[2]};
@@ -255,7 +255,7 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
            }
        }
    }
-    vector<float>& posq = data.posq;
+    AlignedArray<float>& posq = data.posq;
    vector<RealVec>& posData = extractPositions(context);
    vector<RealVec>& forceData = extractForces(context);
    RealVec boxSize = extractBoxSize(context);

--- a/platforms/cpu/src/CpuNeighborList.cpp
+++ b/platforms/cpu/src/CpuNeighborList.cpp
@@ -330,7 +330,7 @@ public:
 CpuNeighborList::CpuNeighborList() {
 }

-void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& atomLocations, const vector<set<int> >& exclusions,
+void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float>& atomLocations, const vector<set<int> >& exclusions,
            const float* periodicBoxSize, bool usePeriodic, float maxDistance, ThreadPool& threads) {
    int numBlocks = (numAtoms+BlockSize-1)/BlockSize;
    blockNeighbors.resize(numBlocks);

--- a/platforms/cpu/src/CpuNonbondedForce.cpp
+++ b/platforms/cpu/src/CpuNonbondedForce.cpp
@@ -288,7 +288,7 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, c


 void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const vector<RealVec>& atomCoordinates, const vector<pair<float, float> >& atomParameters,
-                const vector<set<int> >& exclusions, vector<vector<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
+                const vector<set<int> >& exclusions, vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
    // Record the parameters for the threads.
    
    this->numberOfAtoms = numberOfAtoms;
@@ -701,9 +701,10 @@ fvec4 CpuNonbondedForce::ewaldScaleFunction(fvec4 x) {
    float table1[4], table2[4];
    for (int i = 0; i < 4; i++) {
        int tableIndex = index[i];
-        if (tableIndex < NUM_TABLE_POINTS)
+        if (tableIndex < NUM_TABLE_POINTS) {
            table1[i] = ewaldScaleTable[tableIndex];
            table2[i] = ewaldScaleTable[tableIndex+1];
        }
+    }
    return coeff1*fvec4(table1) + coeff2*fvec4(table2);
 }
--- a/platforms/cpu/src/CpuPlatform.cpp
+++ b/platforms/cpu/src/CpuPlatform.cpp
@@ -89,8 +89,7 @@ CpuPlatform::PlatformData& CpuPlatform::getPlatformData(ContextImpl& context) {
    return *contextData[&context];
 }

-CpuPlatform::PlatformData::PlatformData(int numParticles) {
-    posq.resize(4*numParticles);
+CpuPlatform::PlatformData::PlatformData(int numParticles) : posq(4*numParticles) {
    int numThreads = threads.getNumThreads();
    threadForce.resize(numThreads);
    for (int i = 0; i < numThreads; i++)

--- a/platforms/cpu/tests/TestCpuNeighborList.cpp
+++ b/platforms/cpu/tests/TestCpuNeighborList.cpp
@@ -35,6 +35,7 @@

 #include "openmm/internal/AssertionUtilities.h"
 #include "openmm/internal/ThreadPool.h"
+#include "AlignedArray.h"
 #include "CpuNeighborList.h"
 #include "CpuPlatform.h"
 #include "sfmt/SFMT.h"
@@ -52,7 +53,7 @@ void testNeighborList(bool periodic) {
    const float boxSize[3] = {20.0f, 15.0f, 22.0f};
    OpenMM_SFMT::SFMT sfmt;
    init_gen_rand(0, sfmt);
-    vector<float> positions(4*numParticles);
+    AlignedArray<float> positions(4*numParticles);
    for (int i = 0; i < 4*numParticles; i++)
        if (i%4 < 3)
            positions[i] = boxSize[i%4]*genrand_real2(sfmt);