Commit 6f3d9fd5 authored by peastman's avatar peastman
Browse files

Merge pull request #224 from peastman/master

Enforce memory alignment to improve performance of vector operations.  Also fixed bugs in an earlier optimization.
parents 3732e0e6 b1be68d8
#ifndef OPENMM_ALIGNEDARRAY_H_
#define OPENMM_ALIGNEDARRAY_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
namespace OpenMM {
/**
* This class represents an array in memory whose starting point is guaranteed to
* be aligned with a 16 byte boundary. This can improve the performance of vectorized
* code, since loads and stores are more efficient.
*/
template <class T>
class AlignedArray {
public:
/**
* Default constructor, to allow AlignedArrays to be used inside collections.
*/
AlignedArray() : dataSize(0), baseData(0), data(0) {
}
/**
* Create an Aligned array that contains a specified number of elements.
*/
AlignedArray(int size) {
allocate(size);
}
~AlignedArray() {
if (baseData != 0)
delete[] baseData;
}
/**
* Get the number of elements in the array.
*/
int size() const {
return dataSize;
}
/**
* Change the size of the array. This may cause all contents to be lost.
*/
void resize(int size) {
if (dataSize == size)
return;
if (baseData != 0)
delete[] baseData;
allocate(size);
}
/**
* Get a reference to an element of the array.
*/
T& operator[](int i) {
return data[i];
}
/**
* Get a const reference to an element of the array.
*/
const T& operator[](int i) const {
return data[i];
}
private:
void allocate(int size) {
dataSize = size;
baseData = new char[size*sizeof(T)+16];
char* offsetData = baseData+15;
offsetData -= (long long)offsetData&0xF;
data = (T*) offsetData;
}
int dataSize;
char* baseData;
T* data;
};
} // namespace OpenMM
#endif /*OPENMM_ALIGNEDARRAY_H_*/
...@@ -25,6 +25,7 @@ ...@@ -25,6 +25,7 @@
#ifndef OPENMM_CPU_GBSAOBC_FORCE_H__ #ifndef OPENMM_CPU_GBSAOBC_FORCE_H__
#define OPENMM_CPU_GBSAOBC_FORCE_H__ #define OPENMM_CPU_GBSAOBC_FORCE_H__
#include "AlignedArray.h"
#include "openmm/internal/ThreadPool.h" #include "openmm/internal/ThreadPool.h"
#include "openmm/internal/vectorize.h" #include "openmm/internal/vectorize.h"
#include <set> #include <set>
...@@ -84,7 +85,7 @@ public: ...@@ -84,7 +85,7 @@ public:
* @param totalEnergy total energy * @param totalEnergy total energy
* @param threads the thread pool to use * @param threads the thread pool to use
*/ */
void computeForce(const std::vector<float>& posq, std::vector<std::vector<float> >& threadForce, double* totalEnergy, ThreadPool& threads); void computeForce(const AlignedArray<float>& posq, std::vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads);
/** /**
* This routine contains the code executed by each thread. * This routine contains the code executed by each thread.
...@@ -105,11 +106,13 @@ private: ...@@ -105,11 +106,13 @@ private:
float logDX, logDXInv; float logDX, logDXInv;
// The following variables are used to make information accessible to the individual threads. // The following variables are used to make information accessible to the individual threads.
float const* posq; float const* posq;
std::vector<std::vector<float> >* threadForce; std::vector<AlignedArray<float> >* threadForce;
bool includeEnergy; bool includeEnergy;
void* atomicCounter; void* atomicCounter;
static const int NUM_TABLE_POINTS; static const int NUM_TABLE_POINTS;
static const float TABLE_MIN;
static const float TABLE_MAX;
/** /**
* Compute the displacement and squared distance between a collection of points, optionally using * Compute the displacement and squared distance between a collection of points, optionally using
......
...@@ -32,6 +32,7 @@ ...@@ -32,6 +32,7 @@
* USE OR OTHER DEALINGS IN THE SOFTWARE. * * USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */ * -------------------------------------------------------------------------- */
#include "AlignedArray.h"
#include "windowsExportCpu.h" #include "windowsExportCpu.h"
#include "openmm/internal/ThreadPool.h" #include "openmm/internal/ThreadPool.h"
#include <set> #include <set>
...@@ -46,7 +47,7 @@ public: ...@@ -46,7 +47,7 @@ public:
class Voxels; class Voxels;
static const int BlockSize; static const int BlockSize;
CpuNeighborList(); CpuNeighborList();
void computeNeighborList(int numAtoms, const std::vector<float>& atomLocations, const std::vector<std::set<int> >& exclusions, void computeNeighborList(int numAtoms, const AlignedArray<float>& atomLocations, const std::vector<std::set<int> >& exclusions,
const float* periodicBoxSize, bool usePeriodic, float maxDistance, ThreadPool& threads); const float* periodicBoxSize, bool usePeriodic, float maxDistance, ThreadPool& threads);
int getNumBlocks() const; int getNumBlocks() const;
const std::vector<int>& getSortedAtoms() const; const std::vector<int>& getSortedAtoms() const;
......
...@@ -25,6 +25,7 @@ ...@@ -25,6 +25,7 @@
#ifndef OPENMM_CPU_NONBONDED_FORCE_H__ #ifndef OPENMM_CPU_NONBONDED_FORCE_H__
#define OPENMM_CPU_NONBONDED_FORCE_H__ #define OPENMM_CPU_NONBONDED_FORCE_H__
#include "AlignedArray.h"
#include "CpuNeighborList.h" #include "CpuNeighborList.h"
#include "ReferencePairIxn.h" #include "ReferencePairIxn.h"
#include "openmm/internal/ThreadPool.h" #include "openmm/internal/ThreadPool.h"
...@@ -143,7 +144,7 @@ class CpuNonbondedForce { ...@@ -143,7 +144,7 @@ class CpuNonbondedForce {
--------------------------------------------------------------------------------------- */ --------------------------------------------------------------------------------------- */
void calculateDirectIxn(int numberOfAtoms, float* posq, const std::vector<RealVec>& atomCoordinates, const std::vector<std::pair<float, float> >& atomParameters, void calculateDirectIxn(int numberOfAtoms, float* posq, const std::vector<RealVec>& atomCoordinates, const std::vector<std::pair<float, float> >& atomParameters,
const std::vector<std::set<int> >& exclusions, std::vector<std::vector<float> >& threadForce, double* totalEnergy, ThreadPool& threads); const std::vector<std::set<int> >& exclusions, std::vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads);
/** /**
* This routine contains the code executed by each thread. * This routine contains the code executed by each thread.
...@@ -173,7 +174,7 @@ private: ...@@ -173,7 +174,7 @@ private:
RealVec const* atomCoordinates; RealVec const* atomCoordinates;
std::pair<float, float> const* atomParameters; std::pair<float, float> const* atomParameters;
std::set<int> const* exclusions; std::set<int> const* exclusions;
std::vector<std::vector<float> >* threadForce; std::vector<AlignedArray<float> >* threadForce;
bool includeEnergy; bool includeEnergy;
void* atomicCounter; void* atomicCounter;
......
...@@ -32,6 +32,7 @@ ...@@ -32,6 +32,7 @@
* USE OR OTHER DEALINGS IN THE SOFTWARE. * * USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */ * -------------------------------------------------------------------------- */
#include "AlignedArray.h"
#include "ReferencePlatform.h" #include "ReferencePlatform.h"
#include "openmm/internal/ContextImpl.h" #include "openmm/internal/ContextImpl.h"
#include "openmm/internal/ThreadPool.h" #include "openmm/internal/ThreadPool.h"
...@@ -69,8 +70,8 @@ private: ...@@ -69,8 +70,8 @@ private:
class CpuPlatform::PlatformData { class CpuPlatform::PlatformData {
public: public:
PlatformData(int numParticles); PlatformData(int numParticles);
std::vector<float> posq; AlignedArray<float> posq;
std::vector<std::vector<float> > threadForce; std::vector<AlignedArray<float> > threadForce;
ThreadPool threads; ThreadPool threads;
bool isPeriodic; bool isPeriodic;
}; };
......
...@@ -31,7 +31,9 @@ ...@@ -31,7 +31,9 @@
using namespace std; using namespace std;
using namespace OpenMM; using namespace OpenMM;
const int CpuGBSAOBCForce::NUM_TABLE_POINTS = 2048; const int CpuGBSAOBCForce::NUM_TABLE_POINTS = 4096;
const float CpuGBSAOBCForce::TABLE_MIN = 0.25f;
const float CpuGBSAOBCForce::TABLE_MAX = 1.5f;
class CpuGBSAOBCForce::ComputeTask : public ThreadPool::Task { class CpuGBSAOBCForce::ComputeTask : public ThreadPool::Task {
public: public:
...@@ -44,11 +46,11 @@ public: ...@@ -44,11 +46,11 @@ public:
}; };
CpuGBSAOBCForce::CpuGBSAOBCForce() : cutoff(false), periodic(false) { CpuGBSAOBCForce::CpuGBSAOBCForce() : cutoff(false), periodic(false) {
logDX = 0.5/NUM_TABLE_POINTS; logDX = (TABLE_MAX-TABLE_MIN)/NUM_TABLE_POINTS;
logDXInv = 1.0f/logDX; logDXInv = 1.0f/logDX;
logTable.resize(NUM_TABLE_POINTS+1); logTable.resize(NUM_TABLE_POINTS+1);
for (int i = 0; i < NUM_TABLE_POINTS+1; i++) { for (int i = 0; i < NUM_TABLE_POINTS+1; i++) {
double x = 0.5+i*logDX; double x = TABLE_MIN+i*logDX;
logTable[i] = log(x); logTable[i] = log(x);
} }
} }
...@@ -83,7 +85,7 @@ void CpuGBSAOBCForce::setParticleParameters(const std::vector<std::pair<float, f ...@@ -83,7 +85,7 @@ void CpuGBSAOBCForce::setParticleParameters(const std::vector<std::pair<float, f
obcChain.resize(params.size()+3); obcChain.resize(params.size()+3);
} }
void CpuGBSAOBCForce::computeForce(const std::vector<float>& posq, vector<vector<float> >& threadForce, double* totalEnergy, ThreadPool& threads) { void CpuGBSAOBCForce::computeForce(const AlignedArray<float>& posq, vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
// Record the parameters for the threads. // Record the parameters for the threads.
this->posq = &posq[0]; this->posq = &posq[0];
...@@ -393,14 +395,15 @@ void CpuGBSAOBCForce::getDeltaR(const fvec4& posI, const fvec4& x, const fvec4& ...@@ -393,14 +395,15 @@ void CpuGBSAOBCForce::getDeltaR(const fvec4& posI, const fvec4& x, const fvec4&
fvec4 CpuGBSAOBCForce::fastLog(fvec4 x) { fvec4 CpuGBSAOBCForce::fastLog(fvec4 x) {
// Evaluate log(x) using a lookup table for speed. // Evaluate log(x) using a lookup table for speed.
fvec4 x1 = (x-0.5f)*logDXInv; if (any(x < TABLE_MIN) || any(x >= TABLE_MAX))
return fvec4(logf(x[0]), logf(x[1]), logf(x[2]), logf(x[3]));
fvec4 x1 = (x-TABLE_MIN)*logDXInv;
ivec4 index = floor(x1); ivec4 index = floor(x1);
fvec4 coeff2 = x1-index; fvec4 coeff2 = x1-index;
fvec4 coeff1 = 1.0f-coeff2; fvec4 coeff1 = 1.0f-coeff2;
float table1[4], table2[4]; float table1[4], table2[4];
for (int i = 0; i < 4; i++) { for (int i = 0; i < 4; i++) {
int tableIndex = index[i]; int tableIndex = index[i];
if (tableIndex < NUM_TABLE_POINTS)
table1[i] = logTable[tableIndex]; table1[i] = logTable[tableIndex];
table2[i] = logTable[tableIndex+1]; table2[i] = logTable[tableIndex+1];
} }
......
...@@ -81,7 +81,7 @@ void CpuCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool i ...@@ -81,7 +81,7 @@ void CpuCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool i
// Convert the positions to single precision and apply periodic boundary conditions // Convert the positions to single precision and apply periodic boundary conditions
vector<float>& posq = data.posq; AlignedArray<float>& posq = data.posq;
vector<RealVec>& posData = extractPositions(context); vector<RealVec>& posData = extractPositions(context);
RealVec boxSize = extractBoxSize(context); RealVec boxSize = extractBoxSize(context);
float floatBoxSize[3] = {(float) boxSize[0], (float) boxSize[1], (float) boxSize[2]}; float floatBoxSize[3] = {(float) boxSize[0], (float) boxSize[1], (float) boxSize[2]};
...@@ -255,7 +255,7 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo ...@@ -255,7 +255,7 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
} }
} }
} }
vector<float>& posq = data.posq; AlignedArray<float>& posq = data.posq;
vector<RealVec>& posData = extractPositions(context); vector<RealVec>& posData = extractPositions(context);
vector<RealVec>& forceData = extractForces(context); vector<RealVec>& forceData = extractForces(context);
RealVec boxSize = extractBoxSize(context); RealVec boxSize = extractBoxSize(context);
......
...@@ -330,7 +330,7 @@ public: ...@@ -330,7 +330,7 @@ public:
CpuNeighborList::CpuNeighborList() { CpuNeighborList::CpuNeighborList() {
} }
void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& atomLocations, const vector<set<int> >& exclusions, void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float>& atomLocations, const vector<set<int> >& exclusions,
const float* periodicBoxSize, bool usePeriodic, float maxDistance, ThreadPool& threads) { const float* periodicBoxSize, bool usePeriodic, float maxDistance, ThreadPool& threads) {
int numBlocks = (numAtoms+BlockSize-1)/BlockSize; int numBlocks = (numAtoms+BlockSize-1)/BlockSize;
blockNeighbors.resize(numBlocks); blockNeighbors.resize(numBlocks);
......
...@@ -288,7 +288,7 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, c ...@@ -288,7 +288,7 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, c
void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const vector<RealVec>& atomCoordinates, const vector<pair<float, float> >& atomParameters, void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const vector<RealVec>& atomCoordinates, const vector<pair<float, float> >& atomParameters,
const vector<set<int> >& exclusions, vector<vector<float> >& threadForce, double* totalEnergy, ThreadPool& threads) { const vector<set<int> >& exclusions, vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
// Record the parameters for the threads. // Record the parameters for the threads.
this->numberOfAtoms = numberOfAtoms; this->numberOfAtoms = numberOfAtoms;
...@@ -701,9 +701,10 @@ fvec4 CpuNonbondedForce::ewaldScaleFunction(fvec4 x) { ...@@ -701,9 +701,10 @@ fvec4 CpuNonbondedForce::ewaldScaleFunction(fvec4 x) {
float table1[4], table2[4]; float table1[4], table2[4];
for (int i = 0; i < 4; i++) { for (int i = 0; i < 4; i++) {
int tableIndex = index[i]; int tableIndex = index[i];
if (tableIndex < NUM_TABLE_POINTS) if (tableIndex < NUM_TABLE_POINTS) {
table1[i] = ewaldScaleTable[tableIndex]; table1[i] = ewaldScaleTable[tableIndex];
table2[i] = ewaldScaleTable[tableIndex+1]; table2[i] = ewaldScaleTable[tableIndex+1];
} }
}
return coeff1*fvec4(table1) + coeff2*fvec4(table2); return coeff1*fvec4(table1) + coeff2*fvec4(table2);
} }
...@@ -89,8 +89,7 @@ CpuPlatform::PlatformData& CpuPlatform::getPlatformData(ContextImpl& context) { ...@@ -89,8 +89,7 @@ CpuPlatform::PlatformData& CpuPlatform::getPlatformData(ContextImpl& context) {
return *contextData[&context]; return *contextData[&context];
} }
CpuPlatform::PlatformData::PlatformData(int numParticles) { CpuPlatform::PlatformData::PlatformData(int numParticles) : posq(4*numParticles) {
posq.resize(4*numParticles);
int numThreads = threads.getNumThreads(); int numThreads = threads.getNumThreads();
threadForce.resize(numThreads); threadForce.resize(numThreads);
for (int i = 0; i < numThreads; i++) for (int i = 0; i < numThreads; i++)
......
...@@ -35,6 +35,7 @@ ...@@ -35,6 +35,7 @@
#include "openmm/internal/AssertionUtilities.h" #include "openmm/internal/AssertionUtilities.h"
#include "openmm/internal/ThreadPool.h" #include "openmm/internal/ThreadPool.h"
#include "AlignedArray.h"
#include "CpuNeighborList.h" #include "CpuNeighborList.h"
#include "CpuPlatform.h" #include "CpuPlatform.h"
#include "sfmt/SFMT.h" #include "sfmt/SFMT.h"
...@@ -52,7 +53,7 @@ void testNeighborList(bool periodic) { ...@@ -52,7 +53,7 @@ void testNeighborList(bool periodic) {
const float boxSize[3] = {20.0f, 15.0f, 22.0f}; const float boxSize[3] = {20.0f, 15.0f, 22.0f};
OpenMM_SFMT::SFMT sfmt; OpenMM_SFMT::SFMT sfmt;
init_gen_rand(0, sfmt); init_gen_rand(0, sfmt);
vector<float> positions(4*numParticles); AlignedArray<float> positions(4*numParticles);
for (int i = 0; i < 4*numParticles; i++) for (int i = 0; i < 4*numParticles; i++)
if (i%4 < 3) if (i%4 < 3)
positions[i] = boxSize[i%4]*genrand_real2(sfmt); positions[i] = boxSize[i%4]*genrand_real2(sfmt);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment