"vscode:/vscode.git/clone" did not exist on "564fe013e5d6421abc55ed31b3a6e09e092f82d9"
Commit 6f3d9fd5 authored by peastman's avatar peastman
Browse files

Merge pull request #224 from peastman/master

Enforce memory alignment to improve performance of vector operations.  Also fixed bugs in an earlier optimization.
parents 3732e0e6 b1be68d8
#ifndef OPENMM_ALIGNEDARRAY_H_
#define OPENMM_ALIGNEDARRAY_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
namespace OpenMM {
/**
* This class represents an array in memory whose starting point is guaranteed to
* be aligned with a 16 byte boundary. This can improve the performance of vectorized
* code, since loads and stores are more efficient.
*/
template <class T>
class AlignedArray {
public:
/**
* Default constructor, to allow AlignedArrays to be used inside collections.
*/
AlignedArray() : dataSize(0), baseData(0), data(0) {
}
/**
* Create an Aligned array that contains a specified number of elements.
*/
AlignedArray(int size) {
allocate(size);
}
~AlignedArray() {
if (baseData != 0)
delete[] baseData;
}
/**
* Get the number of elements in the array.
*/
int size() const {
return dataSize;
}
/**
* Change the size of the array. This may cause all contents to be lost.
*/
void resize(int size) {
if (dataSize == size)
return;
if (baseData != 0)
delete[] baseData;
allocate(size);
}
/**
* Get a reference to an element of the array.
*/
T& operator[](int i) {
return data[i];
}
/**
* Get a const reference to an element of the array.
*/
const T& operator[](int i) const {
return data[i];
}
private:
void allocate(int size) {
dataSize = size;
baseData = new char[size*sizeof(T)+16];
char* offsetData = baseData+15;
offsetData -= (long long)offsetData&0xF;
data = (T*) offsetData;
}
int dataSize;
char* baseData;
T* data;
};
} // namespace OpenMM
#endif /*OPENMM_ALIGNEDARRAY_H_*/
......@@ -25,6 +25,7 @@
#ifndef OPENMM_CPU_GBSAOBC_FORCE_H__
#define OPENMM_CPU_GBSAOBC_FORCE_H__
#include "AlignedArray.h"
#include "openmm/internal/ThreadPool.h"
#include "openmm/internal/vectorize.h"
#include <set>
......@@ -84,7 +85,7 @@ public:
* @param totalEnergy total energy
* @param threads the thread pool to use
*/
void computeForce(const std::vector<float>& posq, std::vector<std::vector<float> >& threadForce, double* totalEnergy, ThreadPool& threads);
void computeForce(const AlignedArray<float>& posq, std::vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads);
/**
* This routine contains the code executed by each thread.
......@@ -105,11 +106,13 @@ private:
float logDX, logDXInv;
// The following variables are used to make information accessible to the individual threads.
float const* posq;
std::vector<std::vector<float> >* threadForce;
std::vector<AlignedArray<float> >* threadForce;
bool includeEnergy;
void* atomicCounter;
static const int NUM_TABLE_POINTS;
static const float TABLE_MIN;
static const float TABLE_MAX;
/**
* Compute the displacement and squared distance between a collection of points, optionally using
......
......@@ -32,6 +32,7 @@
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "AlignedArray.h"
#include "windowsExportCpu.h"
#include "openmm/internal/ThreadPool.h"
#include <set>
......@@ -46,7 +47,7 @@ public:
class Voxels;
static const int BlockSize;
CpuNeighborList();
void computeNeighborList(int numAtoms, const std::vector<float>& atomLocations, const std::vector<std::set<int> >& exclusions,
void computeNeighborList(int numAtoms, const AlignedArray<float>& atomLocations, const std::vector<std::set<int> >& exclusions,
const float* periodicBoxSize, bool usePeriodic, float maxDistance, ThreadPool& threads);
int getNumBlocks() const;
const std::vector<int>& getSortedAtoms() const;
......
......@@ -25,6 +25,7 @@
#ifndef OPENMM_CPU_NONBONDED_FORCE_H__
#define OPENMM_CPU_NONBONDED_FORCE_H__
#include "AlignedArray.h"
#include "CpuNeighborList.h"
#include "ReferencePairIxn.h"
#include "openmm/internal/ThreadPool.h"
......@@ -143,7 +144,7 @@ class CpuNonbondedForce {
--------------------------------------------------------------------------------------- */
void calculateDirectIxn(int numberOfAtoms, float* posq, const std::vector<RealVec>& atomCoordinates, const std::vector<std::pair<float, float> >& atomParameters,
const std::vector<std::set<int> >& exclusions, std::vector<std::vector<float> >& threadForce, double* totalEnergy, ThreadPool& threads);
const std::vector<std::set<int> >& exclusions, std::vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads);
/**
* This routine contains the code executed by each thread.
......@@ -173,7 +174,7 @@ private:
RealVec const* atomCoordinates;
std::pair<float, float> const* atomParameters;
std::set<int> const* exclusions;
std::vector<std::vector<float> >* threadForce;
std::vector<AlignedArray<float> >* threadForce;
bool includeEnergy;
void* atomicCounter;
......
......@@ -32,6 +32,7 @@
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "AlignedArray.h"
#include "ReferencePlatform.h"
#include "openmm/internal/ContextImpl.h"
#include "openmm/internal/ThreadPool.h"
......@@ -69,8 +70,8 @@ private:
class CpuPlatform::PlatformData {
public:
PlatformData(int numParticles);
std::vector<float> posq;
std::vector<std::vector<float> > threadForce;
AlignedArray<float> posq;
std::vector<AlignedArray<float> > threadForce;
ThreadPool threads;
bool isPeriodic;
};
......
......@@ -31,7 +31,9 @@
using namespace std;
using namespace OpenMM;
const int CpuGBSAOBCForce::NUM_TABLE_POINTS = 2048;
const int CpuGBSAOBCForce::NUM_TABLE_POINTS = 4096;
const float CpuGBSAOBCForce::TABLE_MIN = 0.25f;
const float CpuGBSAOBCForce::TABLE_MAX = 1.5f;
class CpuGBSAOBCForce::ComputeTask : public ThreadPool::Task {
public:
......@@ -44,11 +46,11 @@ public:
};
CpuGBSAOBCForce::CpuGBSAOBCForce() : cutoff(false), periodic(false) {
logDX = 0.5/NUM_TABLE_POINTS;
logDX = (TABLE_MAX-TABLE_MIN)/NUM_TABLE_POINTS;
logDXInv = 1.0f/logDX;
logTable.resize(NUM_TABLE_POINTS+1);
for (int i = 0; i < NUM_TABLE_POINTS+1; i++) {
double x = 0.5+i*logDX;
double x = TABLE_MIN+i*logDX;
logTable[i] = log(x);
}
}
......@@ -83,7 +85,7 @@ void CpuGBSAOBCForce::setParticleParameters(const std::vector<std::pair<float, f
obcChain.resize(params.size()+3);
}
void CpuGBSAOBCForce::computeForce(const std::vector<float>& posq, vector<vector<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
void CpuGBSAOBCForce::computeForce(const AlignedArray<float>& posq, vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
// Record the parameters for the threads.
this->posq = &posq[0];
......@@ -393,16 +395,17 @@ void CpuGBSAOBCForce::getDeltaR(const fvec4& posI, const fvec4& x, const fvec4&
fvec4 CpuGBSAOBCForce::fastLog(fvec4 x) {
// Evaluate log(x) using a lookup table for speed.
fvec4 x1 = (x-0.5f)*logDXInv;
if (any(x < TABLE_MIN) || any(x >= TABLE_MAX))
return fvec4(logf(x[0]), logf(x[1]), logf(x[2]), logf(x[3]));
fvec4 x1 = (x-TABLE_MIN)*logDXInv;
ivec4 index = floor(x1);
fvec4 coeff2 = x1-index;
fvec4 coeff1 = 1.0f-coeff2;
float table1[4], table2[4];
for (int i = 0; i < 4; i++) {
int tableIndex = index[i];
if (tableIndex < NUM_TABLE_POINTS)
table1[i] = logTable[tableIndex];
table2[i] = logTable[tableIndex+1];
table1[i] = logTable[tableIndex];
table2[i] = logTable[tableIndex+1];
}
return coeff1*fvec4(table1) + coeff2*fvec4(table2);
}
......@@ -81,7 +81,7 @@ void CpuCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool i
// Convert the positions to single precision and apply periodic boundary conditions
vector<float>& posq = data.posq;
AlignedArray<float>& posq = data.posq;
vector<RealVec>& posData = extractPositions(context);
RealVec boxSize = extractBoxSize(context);
float floatBoxSize[3] = {(float) boxSize[0], (float) boxSize[1], (float) boxSize[2]};
......@@ -255,7 +255,7 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
}
}
}
vector<float>& posq = data.posq;
AlignedArray<float>& posq = data.posq;
vector<RealVec>& posData = extractPositions(context);
vector<RealVec>& forceData = extractForces(context);
RealVec boxSize = extractBoxSize(context);
......
......@@ -330,7 +330,7 @@ public:
CpuNeighborList::CpuNeighborList() {
}
void CpuNeighborList::computeNeighborList(int numAtoms, const vector<float>& atomLocations, const vector<set<int> >& exclusions,
void CpuNeighborList::computeNeighborList(int numAtoms, const AlignedArray<float>& atomLocations, const vector<set<int> >& exclusions,
const float* periodicBoxSize, bool usePeriodic, float maxDistance, ThreadPool& threads) {
int numBlocks = (numAtoms+BlockSize-1)/BlockSize;
blockNeighbors.resize(numBlocks);
......
......@@ -288,7 +288,7 @@ void CpuNonbondedForce::calculateReciprocalIxn(int numberOfAtoms, float* posq, c
void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const vector<RealVec>& atomCoordinates, const vector<pair<float, float> >& atomParameters,
const vector<set<int> >& exclusions, vector<vector<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
const vector<set<int> >& exclusions, vector<AlignedArray<float> >& threadForce, double* totalEnergy, ThreadPool& threads) {
// Record the parameters for the threads.
this->numberOfAtoms = numberOfAtoms;
......@@ -701,9 +701,10 @@ fvec4 CpuNonbondedForce::ewaldScaleFunction(fvec4 x) {
float table1[4], table2[4];
for (int i = 0; i < 4; i++) {
int tableIndex = index[i];
if (tableIndex < NUM_TABLE_POINTS)
if (tableIndex < NUM_TABLE_POINTS) {
table1[i] = ewaldScaleTable[tableIndex];
table2[i] = ewaldScaleTable[tableIndex+1];
}
}
return coeff1*fvec4(table1) + coeff2*fvec4(table2);
}
......@@ -89,8 +89,7 @@ CpuPlatform::PlatformData& CpuPlatform::getPlatformData(ContextImpl& context) {
return *contextData[&context];
}
CpuPlatform::PlatformData::PlatformData(int numParticles) {
posq.resize(4*numParticles);
CpuPlatform::PlatformData::PlatformData(int numParticles) : posq(4*numParticles) {
int numThreads = threads.getNumThreads();
threadForce.resize(numThreads);
for (int i = 0; i < numThreads; i++)
......
......@@ -35,6 +35,7 @@
#include "openmm/internal/AssertionUtilities.h"
#include "openmm/internal/ThreadPool.h"
#include "AlignedArray.h"
#include "CpuNeighborList.h"
#include "CpuPlatform.h"
#include "sfmt/SFMT.h"
......@@ -52,7 +53,7 @@ void testNeighborList(bool periodic) {
const float boxSize[3] = {20.0f, 15.0f, 22.0f};
OpenMM_SFMT::SFMT sfmt;
init_gen_rand(0, sfmt);
vector<float> positions(4*numParticles);
AlignedArray<float> positions(4*numParticles);
for (int i = 0; i < 4*numParticles; i++)
if (i%4 < 3)
positions[i] = boxSize[i%4]*genrand_real2(sfmt);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment