Merge pull request #224 from peastman/master

Enforce memory alignment to improve performance of vector operations. Also fixed bugs in an earlier optimization.

Merge pull request #224 from peastman/master
Enforce memory alignment to improve performance of vector operations. Also fixed bugs in an earlier optimization.
6f3d9fd5 · peastman · 3732e0e6 · b1be68d8 · 6f3d9fd5 · 6f3d9fd5
Commit 6f3d9fd5 authored Dec 04, 2013 by peastman
11 changed files
--- a/platforms/cpu/include/AlignedArray.h
+++ b/platforms/cpu/include/AlignedArray.h
+#ifndef OPENMM_ALIGNEDARRAY_H_
+#define OPENMM_ALIGNEDARRAY_H_
+/* -------------------------------------------------------------------------- *
+ *                                   OpenMM                                   *
+ * -------------------------------------------------------------------------- *
+ * This is part of the OpenMM molecular simulation toolkit originating from   *
+ * Simbios, the NIH National Center for Physics-Based Simulation of           *
+ * Biological Structures at Stanford, funded under the NIH Roadmap for        *
+ * Medical Research, grant U54 GM072970. See https://simtk.org.               *
+ *                                                                            *
+ * Portions copyright (c) 2013 Stanford University and the Authors.           *
+ * Authors: Peter Eastman                                                     *
+ * Contributors:                                                              *
+ *                                                                            *
+ * Permission is hereby granted, free of charge, to any person obtaining a    *
+ * copy of this software and associated documentation files (the "Software"), *
+ * to deal in the Software without restriction, including without limitation  *
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,   *
+ * and/or sell copies of the Software, and to permit persons to whom the      *
+ * Software is furnished to do so, subject to the following conditions:       *
+ *                                                                            *
+ * The above copyright notice and this permission notice shall be included in *
+ * all copies or substantial portions of the Software.                        *
+ *                                                                            *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,   *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL    *
+ * THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,    *
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR      *
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE  *
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
+ * -------------------------------------------------------------------------- */
+namespace OpenMM {
+/**
+ * This class represents an array in memory whose starting point is guaranteed to
+ * be aligned with a 16 byte boundary.  This can improve the performance of vectorized
+ * code, since loads and stores are more efficient.
+ */
+template <class T>
+class AlignedArray {
+public:
+    /**
+     * Default constructor, to allow AlignedArrays to be used inside collections.
+     */
+    AlignedArray() : dataSize(0), baseData(0), data(0) {
+    }
+    /**
+     * Create an Aligned array that contains a specified number of elements.
+     */
+    AlignedArray(int size) {
+        allocate(size);
+    }
+    ~AlignedArray() {
+        if (baseData != 0)
+            delete[] baseData;
+    }
+    /**
+     * Get the number of elements in the array.
+     */
+    int size() const {
+        return dataSize;
+    }
+    /**
+     * Change the size of the array.  This may cause all contents to be lost.
+     */
+    void resize(int size) {
+        if (dataSize == size)
+            return;
+        if (baseData != 0)
+            delete[] baseData;
+        allocate(size);
+    }
+    /**
+     * Get a reference to an element of the array.
+     */
+    T& operator[](int i) {
+        return data[i];
+    }
+    /**
+     * Get a const reference to an element of the array.
+     */
+    const T& operator[](int i) const {
+        return data[i];
+    }
+private:
+    void allocate(int size) {
+        dataSize = size;
+        baseData = new char[size*sizeof(T)+16];
+        char* offsetData = baseData+15;
+        offsetData -= (long long)offsetData&0xF;
+        data = (T*) offsetData;
+    }
+    int dataSize;
+    char* baseData;
+    T* data;
+};
+} // namespace OpenMM
+#endif /*OPENMM_ALIGNEDARRAY_H_*/
--- a/platforms/cpu/include/CpuGBSAOBCForce.h
+++ b/platforms/cpu/include/CpuGBSAOBCForce.h
@@ -25,6 +25,7 @@
 #ifndef OPENMM_CPU_GBSAOBC_FORCE_H__
 #define OPENMM_CPU_GBSAOBC_FORCE_H__
+#include "AlignedArray.h"
 #include "openmm/internal/ThreadPool.h"
 #include "openmm/internal/vectorize.h"
 #include <set>
@@ -84,7 +85,7 @@ public:
     * @param totalEnergy      total energy
     * @param threads          the thread pool to use
     */
-    void computeForce(const std::vector<float>& posq, std::vector<std::vector<float> >& threadForce, double* totalEnergy, ThreadPool& threads);
+    void computeForce(const AlignedArray<float>& posq, std::vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads);
    /**
     * This routine contains the code executed by each thread.
@@ -105,11 +106,13 @@ private:
    float logDX, logDXInv;
    // The following variables are used to make information accessible to the individual threads.
    float const* posq;
-    std::vector<std::vector<float> >* threadForce;
+    std::vector<AlignedArray<float> >* threadForce;
    bool includeEnergy;
    void* atomicCounter;
    static const int NUM_TABLE_POINTS;
+    static const float TABLE_MIN;
+    static const float TABLE_MAX;
    /**
     * Compute the displacement and squared distance between a collection of points, optionally using

--- a/platforms/cpu/include/CpuNeighborList.h
+++ b/platforms/cpu/include/CpuNeighborList.h
@@ -32,6 +32,7 @@
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */
+#include "AlignedArray.h"
 #include "windowsExportCpu.h"
 #include "openmm/internal/ThreadPool.h"
 #include <set>
@@ -46,7 +47,7 @@ public:
    class Voxels;
    static const int BlockSize;
    CpuNeighborList();
-    void computeNeighborList(int numAtoms, const std::vector<float>& atomLocations, const std::vector<std::set<int> >& exclusions,
+    void computeNeighborList(int numAtoms, const AlignedArray<float>& atomLocations, const std::vector<std::set<int> >& exclusions,
            const float* periodicBoxSize, bool usePeriodic, float maxDistance, ThreadPool& threads);
    int getNumBlocks() const;
    const std::vector<int>& getSortedAtoms() const;

--- a/platforms/cpu/include/CpuNonbondedForce.h
+++ b/platforms/cpu/include/CpuNonbondedForce.h
@@ -25,6 +25,7 @@
 #ifndef OPENMM_CPU_NONBONDED_FORCE_H__
 #define OPENMM_CPU_NONBONDED_FORCE_H__
+#include "AlignedArray.h"
 #include "CpuNeighborList.h"
 #include "ReferencePairIxn.h"
 #include "openmm/internal/ThreadPool.h"
@@ -143,7 +144,7 @@ class CpuNonbondedForce {
         --------------------------------------------------------------------------------------- */
      void calculateDirectIxn(int numberOfAtoms, float* posq, const std::vector<RealVec>& atomCoordinates, const std::vector<std::pair<float, float> >& atomParameters,
-            const std::vector<std::set<int> >& exclusions, std::vector<std::vector<float> >& threadForce, double* totalEnergy, ThreadPool& threads);
+            const std::vector<std::set<int> >& exclusions, std::vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads);
    /**
     * This routine contains the code executed by each thread.
@@ -173,7 +174,7 @@ private:
        RealVec const* atomCoordinates;
        std::pair<float, float> const* atomParameters;        
        std::set<int> const* exclusions;
-        std::vector<std::vector<float> >* threadForce;
+        std::vector<AlignedArray<float> >* threadForce;
        bool includeEnergy;
        void* atomicCounter;

--- a/platforms/cpu/include/CpuPlatform.h
+++ b/platforms/cpu/include/CpuPlatform.h
@@ -32,6 +32,7 @@
 * USE OR OTHER DEALINGS IN THE SOFTWARE.                                     *
 * -------------------------------------------------------------------------- */
+#include "AlignedArray.h"
 #include "ReferencePlatform.h"
 #include "openmm/internal/ContextImpl.h"
 #include "openmm/internal/ThreadPool.h"
@@ -69,8 +70,8 @@ private:
 class CpuPlatform::PlatformData {
 public:
    PlatformData(int numParticles);
-    std::vector<float> posq;
+    AlignedArray<float> posq;
-    std::vector<std::vector<float> > threadForce;
+    std::vector<AlignedArray<float> > threadForce;
    ThreadPool threads;
    bool isPeriodic;
 };

--- a/platforms/cpu/src/CpuGBSAOBCForce.cpp
+++ b/platforms/cpu/src/CpuGBSAOBCForce.cpp
@@ -31,7 +31,9 @@
 using namespace std;
 using namespace OpenMM;
-const int CpuGBSAOBCForce::NUM_TABLE_POINTS = 2048;
+const int CpuGBSAOBCForce::NUM_TABLE_POINTS = 4096;
+const float CpuGBSAOBCForce::TABLE_MIN = 0.25f;
+const float CpuGBSAOBCForce::TABLE_MAX = 1.5f;
 class CpuGBSAOBCForce::ComputeTask : public ThreadPool::Task {
 public:
@@ -44,11 +46,11 @@ public:
 };
 CpuGBSAOBCForce::CpuGBSAOBCForce() : cutoff(false), periodic(false) {
-    logDX = 0.5/NUM_TABLE_POINTS;
+    logDX = (TABLE_MAX-TABLE_MIN)/NUM_TABLE_POINTS;
    logDXInv = 1.0f/logDX;
    logTable.resize(NUM_TABLE_POINTS+1);
    for (int i = 0; i < NUM_TABLE_POINTS+1; i++) {
-        double x = 0.5+i*logDX;
+        double x = TABLE_MIN+i*logDX;
        logTable[i] = log(x);
    }
 }
@@ -83,7 +85,7 @@ void CpuGBSAOBCForce::setParticleParameters(const std::vector<std::pair<float, f
    obcChain.resize(params.size()+3);
 }
-void CpuGBSAOBCForce::computeForce(const std::vector<float>& posq, vector<vector<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
+void CpuGBSAOBCForce::computeForce(const AlignedArray<float>& posq, vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
    // Record the parameters for the threads.
    this->posq = &posq[0];
@@ -393,14 +395,15 @@ void CpuGBSAOBCForce::getDeltaR(const fvec4& posI, const fvec4& x, const fvec4&
 fvec4 CpuGBSAOBCForce::fastLog(fvec4 x) {
    // Evaluate log(x) using a lookup table for speed.
-    fvec4 x1 = (x-0.5f)*logDXInv;
+    if (any(x < TABLE_MIN) || any(x >= TABLE_MAX))
+        return fvec4(logf(x[0]), logf(x[1]), logf(x[2]), logf(x[3]));
+    fvec4 x1 = (x-TABLE_MIN)*logDXInv;
    ivec4 index = floor(x1);
    fvec4 coeff2 = x1-index;
    fvec4 coeff1 = 1.0f-coeff2;
    float table1[4], table2[4];
    for (int i = 0; i < 4; i++) {
        int tableIndex = index[i];
-        if (tableIndex < NUM_TABLE_POINTS)
        table1[i] = logTable[tableIndex];
        table2[i] = logTable[tableIndex+1];
    }

--- a/platforms/cpu/src/CpuKernels.cpp
+++ b/platforms/cpu/src/CpuKernels.cpp
@@ -81,7 +81,7 @@ void CpuCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool i
    // Convert the positions to single precision and apply periodic boundary conditions
-    vector<float>& posq = data.posq;
+    AlignedArray<float>& posq = data.posq;
    vector<RealVec>& posData = extractPositions(context);
    RealVec boxSize = extractBoxSize(context);
    float floatBoxSize[3] = {(float) boxSize[0], (float) boxSize[1], (float) boxSize[2]};
@@ -255,7 +255,7 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
            }
        }
    }
-    vector<float>& posq = data.posq;
+    AlignedArray<float>& posq = data.posq;
    vector<RealVec>& posData = extractPositions(context);
    vector<RealVec>& forceData = extractForces(context);
    RealVec boxSize = extractBoxSize(context);

--- a/platforms/cpu/src/CpuNeighborList.cpp
+++ b/platforms/cpu/src/CpuNeighborList.cpp
@@ -330,7 +330,7 @@ public:
 CpuNeighborList::CpuNeighborList() {
 }
-void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& atomLocations, const vector<set<int> >& exclusions,
+void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float>& atomLocations, const vector<set<int> >& exclusions,
            const float* periodicBoxSize, bool usePeriodic, float maxDistance, ThreadPool& threads) {
    int numBlocks = (numAtoms+BlockSize-1)/BlockSize;
    blockNeighbors.resize(numBlocks);

--- a/platforms/cpu/src/CpuNonbondedForce.cpp
+++ b/platforms/cpu/src/CpuNonbondedForce.cpp
@@ -288,7 +288,7 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, c
 void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const vector<RealVec>& atomCoordinates, const vector<pair<float, float> >& atomParameters,
-                const vector<set<int> >& exclusions, vector<vector<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
+                const vector<set<int> >& exclusions, vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
    // Record the parameters for the threads.
    this->numberOfAtoms = numberOfAtoms;
@@ -701,9 +701,10 @@ fvec4 CpuNonbondedForce::ewaldScaleFunction(fvec4 x) {
    float table1[4], table2[4];
    for (int i = 0; i < 4; i++) {
        int tableIndex = index[i];
-        if (tableIndex < NUM_TABLE_POINTS)
+        if (tableIndex < NUM_TABLE_POINTS) {
            table1[i] = ewaldScaleTable[tableIndex];
            table2[i] = ewaldScaleTable[tableIndex+1];
        }
+    }
    return coeff1*fvec4(table1) + coeff2*fvec4(table2);
 }
--- a/platforms/cpu/src/CpuPlatform.cpp
+++ b/platforms/cpu/src/CpuPlatform.cpp
@@ -89,8 +89,7 @@ CpuPlatform::PlatformData& CpuPlatform::getPlatformData(ContextImpl& context) {
    return *contextData[&context];
 }
-CpuPlatform::PlatformData::PlatformData(int numParticles) {
+CpuPlatform::PlatformData::PlatformData(int numParticles) : posq(4*numParticles) {
-    posq.resize(4*numParticles);
    int numThreads = threads.getNumThreads();
    threadForce.resize(numThreads);
    for (int i = 0; i < numThreads; i++)

--- a/platforms/cpu/tests/TestCpuNeighborList.cpp
+++ b/platforms/cpu/tests/TestCpuNeighborList.cpp
@@ -35,6 +35,7 @@
 #include "openmm/internal/AssertionUtilities.h"
 #include "openmm/internal/ThreadPool.h"
+#include "AlignedArray.h"
 #include "CpuNeighborList.h"
 #include "CpuPlatform.h"
 #include "sfmt/SFMT.h"
@@ -52,7 +53,7 @@ void testNeighborList(bool periodic) {
    const float boxSize[3] = {20.0f, 15.0f, 22.0f};
    OpenMM_SFMT::SFMT sfmt;
    init_gen_rand(0, sfmt);
-    vector<float> positions(4*numParticles);
+    AlignedArray<float> positions(4*numParticles);
    for (int i = 0; i < 4*numParticles; i++)
        if (i%4 < 3)
            positions[i] = boxSize[i%4]*genrand_real2(sfmt);