Commit 79fd69c3 authored by peastman's avatar peastman
Browse files

Merge pull request #182 from peastman/master

Lots of optimizations to CPU platform
parents b5da7380 062a2db0
...@@ -252,6 +252,7 @@ FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS}) ...@@ -252,6 +252,7 @@ FOREACH(subdir ${OPENMM_SOURCE_SUBDIRS})
## OpenMM was previously installed there. ## OpenMM was previously installed there.
INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include) INCLUDE_DIRECTORIES(BEFORE ${CMAKE_CURRENT_SOURCE_DIR}/${subdir}/include)
ENDFOREACH(subdir) ENDFOREACH(subdir)
SET_SOURCE_FILES_PROPERTIES(${CMAKE_SOURCE_DIR}/libraries/sfmt/src/SFMT.cpp PROPERTIES COMPILE_FLAGS "-msse2 -DHAVE_SSE2=1")
# If API wrappers are being generated, and add them to the build. # If API wrappers are being generated, and add them to the build.
FIND_PROGRAM(GCCXML_PATH gccxml PATH FIND_PROGRAM(GCCXML_PATH gccxml PATH
......
...@@ -49,22 +49,23 @@ PRE_ALWAYS static __m128i mm_recursion(__m128i *a, __m128i *b, ...@@ -49,22 +49,23 @@ PRE_ALWAYS static __m128i mm_recursion(__m128i *a, __m128i *b,
* This function fills the internal state array with pseudorandom * This function fills the internal state array with pseudorandom
* integers. * integers.
*/ */
inline static void gen_rand_all(void) { inline static void gen_rand_all(SFMT& sfmt) {
int i; int i;
__m128i r, r1, r2, mask; __m128i r, r1, r2, mask;
mask = _mm_set_epi32(MSK4, MSK3, MSK2, MSK1); mask = _mm_set_epi32(MSK4, MSK3, MSK2, MSK1);
r1 = _mm_load_si128(&sfmt[N - 2].si); SFMTData& data = *sfmt.data;
r2 = _mm_load_si128(&sfmt[N - 1].si); r1 = _mm_load_si128(&data.sfmt[N - 2].si);
r2 = _mm_load_si128(&data.sfmt[N - 1].si);
for (i = 0; i < N - POS1; i++) { for (i = 0; i < N - POS1; i++) {
r = mm_recursion(&sfmt[i].si, &sfmt[i + POS1].si, r1, r2, mask); r = mm_recursion(&data.sfmt[i].si, &data.sfmt[i + POS1].si, r1, r2, mask);
_mm_store_si128(&sfmt[i].si, r); _mm_store_si128(&data.sfmt[i].si, r);
r1 = r2; r1 = r2;
r2 = r; r2 = r;
} }
for (; i < N; i++) { for (; i < N; i++) {
r = mm_recursion(&sfmt[i].si, &sfmt[i + POS1 - N].si, r1, r2, mask); r = mm_recursion(&data.sfmt[i].si, &data.sfmt[i + POS1 - N].si, r1, r2, mask);
_mm_store_si128(&sfmt[i].si, r); _mm_store_si128(&data.sfmt[i].si, r);
r1 = r2; r1 = r2;
r2 = r; r2 = r;
} }
...@@ -77,21 +78,22 @@ inline static void gen_rand_all(void) { ...@@ -77,21 +78,22 @@ inline static void gen_rand_all(void) {
* @param array an 128-bit array to be filled by pseudorandom numbers. * @param array an 128-bit array to be filled by pseudorandom numbers.
* @param size number of 128-bit pesudorandom numbers to be generated. * @param size number of 128-bit pesudorandom numbers to be generated.
*/ */
inline static void gen_rand_array(w128_t *array, int size) { inline static void gen_rand_array(w128_t *array, int size, SFMT& sfmt) {
int i, j; int i, j;
__m128i r, r1, r2, mask; __m128i r, r1, r2, mask;
mask = _mm_set_epi32(MSK4, MSK3, MSK2, MSK1); mask = _mm_set_epi32(MSK4, MSK3, MSK2, MSK1);
r1 = _mm_load_si128(&sfmt[N - 2].si); SFMTData& data = *sfmt.data;
r2 = _mm_load_si128(&sfmt[N - 1].si); r1 = _mm_load_si128(&data.sfmt[N - 2].si);
r2 = _mm_load_si128(&data.sfmt[N - 1].si);
for (i = 0; i < N - POS1; i++) { for (i = 0; i < N - POS1; i++) {
r = mm_recursion(&sfmt[i].si, &sfmt[i + POS1].si, r1, r2, mask); r = mm_recursion(&data.sfmt[i].si, &data.sfmt[i + POS1].si, r1, r2, mask);
_mm_store_si128(&array[i].si, r); _mm_store_si128(&array[i].si, r);
r1 = r2; r1 = r2;
r2 = r; r2 = r;
} }
for (; i < N; i++) { for (; i < N; i++) {
r = mm_recursion(&sfmt[i].si, &array[i + POS1 - N].si, r1, r2, mask); r = mm_recursion(&data.sfmt[i].si, &array[i + POS1 - N].si, r1, r2, mask);
_mm_store_si128(&array[i].si, r); _mm_store_si128(&array[i].si, r);
r1 = r2; r1 = r2;
r2 = r; r2 = r;
...@@ -106,13 +108,13 @@ inline static void gen_rand_array(w128_t *array, int size) { ...@@ -106,13 +108,13 @@ inline static void gen_rand_array(w128_t *array, int size) {
} }
for (j = 0; j < 2 * N - size; j++) { for (j = 0; j < 2 * N - size; j++) {
r = _mm_load_si128(&array[j + size - N].si); r = _mm_load_si128(&array[j + size - N].si);
_mm_store_si128(&sfmt[j].si, r); _mm_store_si128(&data.sfmt[j].si, r);
} }
for (; i < size; i++) { for (; i < size; i++) {
r = mm_recursion(&array[i - N].si, &array[i + POS1 - N].si, r1, r2, r = mm_recursion(&array[i - N].si, &array[i + POS1 - N].si, r1, r2,
mask); mask);
_mm_store_si128(&array[i].si, r); _mm_store_si128(&array[i].si, r);
_mm_store_si128(&sfmt[j++].si, r); _mm_store_si128(&data.sfmt[j++].si, r);
r1 = r2; r1 = r2;
r2 = r; r2 = r;
} }
......
...@@ -144,9 +144,9 @@ inline static void swap(w128_t *array, int size); ...@@ -144,9 +144,9 @@ inline static void swap(w128_t *array, int size);
#endif #endif
#if defined(HAVE_ALTIVEC) #if defined(HAVE_ALTIVEC)
#include "SFMT-alti.h" #include "sfmt/SFMT-alti.h"
#elif defined(HAVE_SSE2) #elif defined(HAVE_SSE2)
#include "SFMT-sse2.h" #include "sfmt/SFMT-sse2.h"
#endif #endif
/** /**
......
#ifndef OPENMM_VECTORIZE_H_
#define OPENMM_VECTORIZE_H_
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include <smmintrin.h>
// This file defines classes and functions to simplify vectorizing code with SSE.
class ivec4;
/**
* A four element vector of floats.
*/
class fvec4 {
public:
__m128 val;
fvec4() {}
fvec4(float v) : val(_mm_set1_ps(v)) {}
fvec4(float v1, float v2, float v3, float v4) : val(_mm_set_ps(v4, v3, v2, v1)) {}
fvec4(__m128 v) : val(v) {}
fvec4(const float* v) : val(_mm_loadu_ps(v)) {}
operator __m128() const {
return val;
}
float operator[](int i) const {
int resultBits = _mm_extract_ps(val, i);
return *((float*) &resultBits);
}
void store(float* v) const {
_mm_storeu_ps(v, val);
}
fvec4 operator+(fvec4 other) const {
return _mm_add_ps(val, other);
}
fvec4 operator-(fvec4 other) const {
return _mm_sub_ps(val, other);
}
fvec4 operator*(fvec4 other) const {
return _mm_mul_ps(val, other);
}
fvec4 operator/(fvec4 other) const {
return _mm_div_ps(val, other);
}
void operator+=(fvec4 other) {
val = _mm_add_ps(val, other);
}
void operator-=(fvec4 other) {
val = _mm_sub_ps(val, other);
}
void operator*=(fvec4 other) {
val = _mm_mul_ps(val, other);
}
void operator/=(fvec4 other) {
val = _mm_div_ps(val, other);
}
fvec4 operator-() const {
return _mm_sub_ps(_mm_set1_ps(0.0f), val);
}
fvec4 operator&(fvec4 other) const {
return _mm_and_ps(val, other);
}
fvec4 operator==(fvec4 other) const {
return _mm_cmpeq_ps(val, other);
}
fvec4 operator!=(fvec4 other) const {
return _mm_cmpneq_ps(val, other);
}
fvec4 operator>(fvec4 other) const {
return _mm_cmpgt_ps(val, other);
}
fvec4 operator<(fvec4 other) const {
return _mm_cmplt_ps(val, other);
}
fvec4 operator>=(fvec4 other) const {
return _mm_cmpge_ps(val, other);
}
fvec4 operator<=(fvec4 other) const {
return _mm_cmple_ps(val, other);
}
operator ivec4() const;
};
/**
* A four element vector of ints.
*/
class ivec4 {
public:
__m128i val;
ivec4() {}
ivec4(int v) : val(_mm_set1_epi32(v)) {}
ivec4(int v1, int v2, int v3, int v4) : val(_mm_set_epi32(v4, v3, v2, v1)) {}
ivec4(__m128i v) : val(v) {}
ivec4(const int* v) : val(_mm_loadu_si128((const __m128i*) v)) {}
operator __m128i() const {
return val;
}
int operator[](int i) const {
return _mm_extract_epi32(val, i);
}
void store(int* v) const {
_mm_storeu_si128((__m128i*) v, val);
}
ivec4 operator+(ivec4 other) const {
return _mm_add_epi32(val, other);
}
ivec4 operator-(ivec4 other) const {
return _mm_sub_epi32(val, other);
}
ivec4 operator*(ivec4 other) const {
return _mm_mul_epi32(val, other);
}
void operator+=(ivec4 other) {
val = _mm_add_epi32(val, other);
}
void operator-=(ivec4 other) {
val = _mm_sub_epi32(val, other);
}
void operator*=(ivec4 other) {
val = _mm_mul_epi32(val, other);
}
ivec4 operator-() const {
return _mm_sub_epi32(_mm_set1_epi32(0), val);
}
ivec4 operator&(ivec4 other) const {
return _mm_and_si128(val, other);
}
ivec4 operator==(ivec4 other) const {
return _mm_cmpeq_epi32(val, other);
}
operator fvec4() const;
};
// Conversion operators.
inline fvec4::operator ivec4() const {
return _mm_cvttps_epi32(val);
}
inline ivec4::operator fvec4() const {
return _mm_cvtepi32_ps(val);
}
// Functions that operate on fvec4s.
static inline fvec4 floor(fvec4 v) {
return fvec4(_mm_floor_ps(v.val));
}
static inline fvec4 ceil(fvec4 v) {
return fvec4(_mm_ceil_ps(v.val));
}
static inline fvec4 round(fvec4 v) {
return fvec4(_mm_round_ps(v.val, _MM_FROUND_TO_NEAREST_INT));
}
static inline fvec4 min(fvec4 v1, fvec4 v2) {
return fvec4(_mm_min_ps(v1.val, v2.val));
}
static inline fvec4 max(fvec4 v1, fvec4 v2) {
return fvec4(_mm_max_ps(v1.val, v2.val));
}
static inline fvec4 abs(fvec4 v) {
static const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
return fvec4(_mm_and_ps(v.val, mask));
}
static inline fvec4 sqrt(fvec4 v) {
return fvec4(_mm_sqrt_ps(v.val));
}
static inline float dot3(fvec4 v1, fvec4 v2) {
return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0x71));
}
static inline float dot4(fvec4 v1, fvec4 v2) {
return _mm_cvtss_f32(_mm_dp_ps(v1, v2, 0xF1));
}
// Functions that operate on ivec4s.
static inline ivec4 min(ivec4 v1, ivec4 v2) {
return ivec4(_mm_min_epi32(v1.val, v2.val));
}
static inline ivec4 max(ivec4 v1, ivec4 v2) {
return ivec4(_mm_max_epi32(v1.val, v2.val));
}
static inline ivec4 abs(ivec4 v) {
return ivec4(_mm_abs_epi32(v.val));
}
// Mathematical operators involving a scalar and a vector.
static inline fvec4 operator+(float v1, fvec4 v2) {
return fvec4(v1)+v2;
}
static inline fvec4 operator-(float v1, fvec4 v2) {
return fvec4(v1)-v2;
}
static inline fvec4 operator*(float v1, fvec4 v2) {
return fvec4(v1)*v2;
}
static inline fvec4 operator/(float v1, fvec4 v2) {
return fvec4(v1)/v2;
}
#endif /*OPENMM_VECTORIZE_H_*/
...@@ -12,26 +12,42 @@ namespace OpenMM { ...@@ -12,26 +12,42 @@ namespace OpenMM {
class OPENMM_EXPORT_CPU CpuNeighborList { class OPENMM_EXPORT_CPU CpuNeighborList {
public: public:
class ThreadData; class ThreadData;
class VoxelHash; class Voxels;
static const int BlockSize;
CpuNeighborList(); CpuNeighborList();
~CpuNeighborList(); ~CpuNeighborList();
void computeNeighborList(int numAtoms, const std::vector<float>& atomLocations, const std::vector<std::set<int> >& exclusions, void computeNeighborList(int numAtoms, const std::vector<float>& atomLocations, const std::vector<std::set<int> >& exclusions,
const float* periodicBoxSize, bool usePeriodic, float maxDistance); const float* periodicBoxSize, bool usePeriodic, float maxDistance);
const std::vector<std::pair<int, int> >& getNeighbors(); int getNumBlocks() const;
const std::vector<int>& getSortedAtoms() const;
const std::vector<int>& getBlockNeighbors(int blockIndex) const;
const std::vector<char>& getBlockExclusions(int blockIndex) const;
/** /**
* This routine contains the code executed by each thread. * This routine contains the code executed by each thread.
*/ */
void runThread(int index, std::vector<std::pair<int, int> >& threadNeighbors); void runThread(int index);
private: private:
/**
* This is called by the worker threads to wait until the master thread instructs them to advance.
*/
void threadWait();
/**
* This is called by the master thread to instruct all the worker threads to advance.
*/
void advanceThreads();
bool isDeleted; bool isDeleted;
int numThreads, waitCount; int numThreads, waitCount;
std::vector<std::pair<int, int> > neighbors; std::vector<int> sortedAtoms;
std::vector<std::vector<int> > blockNeighbors;
std::vector<std::vector<char> > blockExclusions;
std::vector<pthread_t> thread; std::vector<pthread_t> thread;
std::vector<ThreadData*> threadData; std::vector<ThreadData*> threadData;
pthread_cond_t startCondition, endCondition; pthread_cond_t startCondition, endCondition;
pthread_mutex_t lock; pthread_mutex_t lock;
// The following variables are used to make information accessible to the individual threads. // The following variables are used to make information accessible to the individual threads.
VoxelHash* voxelHash; float minx, maxx, miny, maxy, minz, maxz;
std::vector<std::pair<int, int> > atomBins;
Voxels* voxels;
const std::vector<std::set<int> >* exclusions; const std::vector<std::set<int> >* exclusions;
const float* atomLocations; const float* atomLocations;
const float* periodicBoxSize; const float* periodicBoxSize;
......
...@@ -25,14 +25,17 @@ ...@@ -25,14 +25,17 @@
#ifndef OPENMM_CPU_NONBONDED_FORCE_H__ #ifndef OPENMM_CPU_NONBONDED_FORCE_H__
#define OPENMM_CPU_NONBONDED_FORCE_H__ #define OPENMM_CPU_NONBONDED_FORCE_H__
#include "CpuNeighborList.h"
#include "ReferencePairIxn.h" #include "ReferencePairIxn.h"
#include "openmm/internal/vectorize.h"
#include <pthread.h> #include <pthread.h>
#include <set> #include <set>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include <smmintrin.h>
// --------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------
namespace OpenMM {
class CpuNonbondedForce { class CpuNonbondedForce {
public: public:
class ThreadData; class ThreadData;
...@@ -63,7 +66,7 @@ class CpuNonbondedForce { ...@@ -63,7 +66,7 @@ class CpuNonbondedForce {
--------------------------------------------------------------------------------------- */ --------------------------------------------------------------------------------------- */
void setUseCutoff(float distance, const std::vector<std::pair<int, int> >& neighbors, float solventDielectric); void setUseCutoff(float distance, const CpuNeighborList& neighbors, float solventDielectric);
/**--------------------------------------------------------------------------------------- /**---------------------------------------------------------------------------------------
...@@ -127,9 +130,9 @@ class CpuNonbondedForce { ...@@ -127,9 +130,9 @@ class CpuNonbondedForce {
--------------------------------------------------------------------------------------- */ --------------------------------------------------------------------------------------- */
void calculateReciprocalIxn(int numberOfAtoms, float* posq, std::vector<OpenMM::RealVec>& atomCoordinates, void calculateReciprocalIxn(int numberOfAtoms, float* posq, std::vector<RealVec>& atomCoordinates,
const std::vector<std::pair<float, float> >& atomParameters, const std::vector<std::set<int> >& exclusions, const std::vector<std::pair<float, float> >& atomParameters, const std::vector<std::set<int> >& exclusions,
std::vector<OpenMM::RealVec>& forces, float* totalEnergy) const; std::vector<RealVec>& forces, float* totalEnergy) const;
/**--------------------------------------------------------------------------------------- /**---------------------------------------------------------------------------------------
...@@ -159,14 +162,14 @@ private: ...@@ -159,14 +162,14 @@ private:
bool periodic; bool periodic;
bool ewald; bool ewald;
bool pme; bool pme;
const std::vector<std::pair<int, int> >* neighborList; const CpuNeighborList* neighborList;
float periodicBoxSize[3]; float periodicBoxSize[3];
float cutoffDistance, switchingDistance; float cutoffDistance, switchingDistance;
float krf, crf; float krf, crf;
float alphaEwald; float alphaEwald;
int numRx, numRy, numRz; int numRx, numRy, numRz;
int meshDim[3]; int meshDim[3];
std::vector<float> ewaldScaleX, ewaldScaleY, ewaldScaleDeriv; std::vector<float> ewaldScaleTable;
float ewaldDX, ewaldDXInv; float ewaldDX, ewaldDXInv;
bool isDeleted; bool isDeleted;
int numThreads, waitCount; int numThreads, waitCount;
...@@ -195,31 +198,42 @@ private: ...@@ -195,31 +198,42 @@ private:
--------------------------------------------------------------------------------------- */ --------------------------------------------------------------------------------------- */
void calculateOneIxn(int atom1, int atom2, float* forces, double* totalEnergy, const __m128& boxSize, const __m128& invBoxSize); void calculateOneIxn(int atom1, int atom2, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
/**--------------------------------------------------------------------------------------- /**---------------------------------------------------------------------------------------
Calculate LJ Coulomb pair ixn between two atoms Calculate all the interactions for one atom block.
@param atom1 the index of the first atom @param blockIndex the index of the atom block
@param atom2 the index of the second atom @param forces force array (forces added)
@param totalEnergy total energy
--------------------------------------------------------------------------------------- */
void calculateBlockIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
/**---------------------------------------------------------------------------------------
Calculate all the interactions for one atom block.
@param blockIndex the index of the atom block
@param forces force array (forces added) @param forces force array (forces added)
@param totalEnergy total energy @param totalEnergy total energy
--------------------------------------------------------------------------------------- */ --------------------------------------------------------------------------------------- */
void calculateOneEwaldIxn(int atom1, int atom2, float* forces, double* totalEnergy, const __m128& boxSize, const __m128& invBoxSize); void calculateBlockEwaldIxn(int blockIndex, float* forces, double* totalEnergy, const fvec4& boxSize, const fvec4& invBoxSize);
/** /**
* Compute the displacement and squared distance between two points, optionally using * Compute the displacement and squared distance between two points, optionally using
* periodic boundary conditions. * periodic boundary conditions.
*/ */
void getDeltaR(const __m128& posI, const __m128& posJ, __m128& deltaR, float& r2, bool periodic, const __m128& boxSize, const __m128& invBoxSize) const; void getDeltaR(const fvec4& posI, const fvec4& posJ, fvec4& deltaR, float& r2, bool periodic, const fvec4& boxSize, const fvec4& invBoxSize) const;
/** /**
* Compute a fast approximation to erfc(x). * Compute a fast approximation to erfc(x).
*/ */
static float erfcApprox(float x); static fvec4 erfcApprox(fvec4 x);
/** /**
* Create a lookup table for the scale factor used with Ewald and PME. * Create a lookup table for the scale factor used with Ewald and PME.
...@@ -229,9 +243,11 @@ private: ...@@ -229,9 +243,11 @@ private:
/** /**
* Evaluate the scale factor used with Ewald and PME: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI) * Evaluate the scale factor used with Ewald and PME: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
*/ */
float ewaldScaleFunction(float x); fvec4 ewaldScaleFunction(fvec4 x);
}; };
} // namespace OpenMM
// --------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------
#endif // OPENMM_CPU_NONBONDED_FORCE_H__ #endif // OPENMM_CPU_NONBONDED_FORCE_H__
...@@ -221,20 +221,48 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo ...@@ -221,20 +221,48 @@ double CpuCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeFo
if (nonbondedMethod != NoCutoff) { if (nonbondedMethod != NoCutoff) {
// Determine whether we need to recompute the neighbor list. // Determine whether we need to recompute the neighbor list.
double padding = 0.1*nonbondedCutoff; double padding = 0.15*nonbondedCutoff;
bool needRecompute = false; bool needRecompute = false;
double closeCutoff2 = 0.25*padding*padding;
double farCutoff2 = 0.5*padding*padding;
int maxNumMoved = numParticles/10;
vector<int> moved;
for (int i = 0; i < numParticles; i++) { for (int i = 0; i < numParticles; i++) {
RealVec delta = posData[i]-lastPositions[i]; RealVec delta = posData[i]-lastPositions[i];
if (delta.dot(delta) > 0.25*padding*padding) { double dist2 = delta.dot(delta);
if (dist2 > closeCutoff2) {
moved.push_back(i);
if (dist2 > farCutoff2 || moved.size() > maxNumMoved) {
needRecompute = true; needRecompute = true;
break; break;
} }
} }
}
if (!needRecompute && moved.size() > 0) {
// Some particles have moved further than half the padding distance. Look for pairs
// that are missing from the neighbor list.
int numMoved = moved.size();
double cutoff2 = nonbondedCutoff*nonbondedCutoff;
for (int i = 1; i < numMoved && !needRecompute; i++)
for (int j = 0; j < i; j++) {
RealVec delta = posData[moved[i]]-posData[moved[j]];
if (delta.dot(delta) < cutoff2) {
// These particles should interact. See if they are in the neighbor list.
RealVec oldDelta = lastPositions[moved[i]]-lastPositions[moved[j]];
if (oldDelta.dot(oldDelta) > cutoff2) {
needRecompute = true;
break;
}
}
}
}
if (needRecompute) { if (needRecompute) {
neighborList.computeNeighborList(numParticles, posq, exclusions, floatBoxSize, periodic || ewald || pme, nonbondedCutoff+padding); neighborList.computeNeighborList(numParticles, posq, exclusions, floatBoxSize, periodic || ewald || pme, nonbondedCutoff+padding);
lastPositions = posData; lastPositions = posData;
} }
nonbonded.setUseCutoff(nonbondedCutoff, neighborList.getNeighbors(), rfDielectric); nonbonded.setUseCutoff(nonbondedCutoff, neighborList, rfDielectric);
} }
if (periodic || ewald || pme) { if (periodic || ewald || pme) {
double minAllowedSize = 1.999999*nonbondedCutoff; double minAllowedSize = 1.999999*nonbondedCutoff;
......
This diff is collapsed.
This diff is collapsed.
...@@ -39,6 +39,7 @@ ...@@ -39,6 +39,7 @@
#include "sfmt/SFMT.h" #include "sfmt/SFMT.h"
#include <iostream> #include <iostream>
#include <set> #include <set>
#include <utility>
#include <vector> #include <vector>
using namespace OpenMM; using namespace OpenMM;
...@@ -68,11 +69,20 @@ void testNeighborList(bool periodic) { ...@@ -68,11 +69,20 @@ void testNeighborList(bool periodic) {
// Convert the neighbor list to a set for faster lookup. // Convert the neighbor list to a set for faster lookup.
set<pair<int, int> > neighbors; set<pair<int, int> > neighbors;
for (int i = 0; i < (int) neighborList.getNeighbors().size(); i++) { for (int i = 0; i < (int) neighborList.getSortedAtoms().size(); i++) {
pair<int, int> entry = neighborList.getNeighbors()[i]; int blockIndex = i/CpuNeighborList::BlockSize;
int indexInBlock = i-blockIndex*CpuNeighborList::BlockSize;
char mask = 1<<indexInBlock;
for (int j = 0; j < (int) neighborList.getBlockExclusions(blockIndex).size(); j++) {
if ((neighborList.getBlockExclusions(blockIndex)[j] & mask) == 0) {
int atom1 = neighborList.getSortedAtoms()[i];
int atom2 = neighborList.getBlockNeighbors(blockIndex)[j];
pair<int, int> entry = make_pair(min(atom1, atom2), max(atom1, atom2));
ASSERT(neighbors.find(entry) == neighbors.end() && neighbors.find(make_pair(entry.second, entry.first)) == neighbors.end()); // No duplicates ASSERT(neighbors.find(entry) == neighbors.end() && neighbors.find(make_pair(entry.second, entry.first)) == neighbors.end()); // No duplicates
neighbors.insert(entry); neighbors.insert(entry);
} }
}
}
// Check each particle pair and figure out whether they should be in the neighbor list. // Check each particle pair and figure out whether they should be in the neighbor list.
...@@ -90,7 +100,8 @@ void testNeighborList(bool periodic) { ...@@ -90,7 +100,8 @@ void testNeighborList(bool periodic) {
if (dx*dx + dy*dy + dz*dz > cutoff*cutoff) if (dx*dx + dy*dy + dz*dz > cutoff*cutoff)
shouldInclude = false; shouldInclude = false;
bool isIncluded = (neighbors.find(make_pair(i, j)) != neighbors.end() || neighbors.find(make_pair(j, i)) != neighbors.end()); bool isIncluded = (neighbors.find(make_pair(i, j)) != neighbors.end() || neighbors.find(make_pair(j, i)) != neighbors.end());
ASSERT_EQUAL(shouldInclude, isIncluded); if (shouldInclude)
ASSERT(isIncluded);
} }
} }
......
...@@ -35,9 +35,9 @@ ...@@ -35,9 +35,9 @@
#include "CpuPmeKernels.h" #include "CpuPmeKernels.h"
#include "SimTKOpenMMRealType.h" #include "SimTKOpenMMRealType.h"
#include "openmm/internal/hardware.h" #include "openmm/internal/hardware.h"
#include "openmm/internal/vectorize.h"
#include <cmath> #include <cmath>
#include <cstring> #include <cstring>
#include <smmintrin.h>
using namespace OpenMM; using namespace OpenMM;
using namespace std; using namespace std;
...@@ -47,52 +47,50 @@ static const int PME_ORDER = 5; ...@@ -47,52 +47,50 @@ static const int PME_ORDER = 5;
bool CpuCalcPmeReciprocalForceKernel::hasInitializedThreads = false; bool CpuCalcPmeReciprocalForceKernel::hasInitializedThreads = false;
int CpuCalcPmeReciprocalForceKernel::numThreads = 0; int CpuCalcPmeReciprocalForceKernel::numThreads = 0;
#define EXTRACT_FLOAT(v, element) _mm_cvtss_f32(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, element)))
static void spreadCharge(int start, int end, float* posq, float* grid, int gridx, int gridy, int gridz, int numParticles, Vec3 periodicBoxSize) { static void spreadCharge(int start, int end, float* posq, float* grid, int gridx, int gridy, int gridz, int numParticles, Vec3 periodicBoxSize) {
float temp[4]; float temp[4];
__m128 boxSize = _mm_set_ps(0, (float) periodicBoxSize[2], (float) periodicBoxSize[1], (float) periodicBoxSize[0]); fvec4 boxSize((float) periodicBoxSize[0], (float) periodicBoxSize[1], (float) periodicBoxSize[2], 0);
__m128 invBoxSize = _mm_set_ps(0, (float) (1/periodicBoxSize[2]), (float) (1/periodicBoxSize[1]), (float) (1/periodicBoxSize[0])); fvec4 invBoxSize((float) (1/periodicBoxSize[0]), (float) (1/periodicBoxSize[1]), (float) (1/periodicBoxSize[2]), 0);
__m128 gridSize = _mm_set_ps(0, gridz, gridy, gridx); fvec4 gridSize(gridx, gridy, gridz, 0);
__m128i gridSizeInt = _mm_set_epi32(0, gridz, gridy, gridx); ivec4 gridSizeInt(gridx, gridy, gridz, 0);
__m128 one = _mm_set1_ps(1); fvec4 one(1);
__m128 scale = _mm_set1_ps(1.0f/(PME_ORDER-1)); fvec4 scale(1.0f/(PME_ORDER-1));
const float epsilonFactor = sqrt(ONE_4PI_EPS0); const float epsilonFactor = sqrt(ONE_4PI_EPS0);
memset(grid, 0, sizeof(float)*gridx*gridy*gridz); memset(grid, 0, sizeof(float)*gridx*gridy*gridz);
for (int i = start; i < end; i++) { for (int i = start; i < end; i++) {
// Find the position relative to the nearest grid point. // Find the position relative to the nearest grid point.
__m128 pos = _mm_loadu_ps(&posq[4*i]); fvec4 pos(&posq[4*i]);
__m128 posFloor = _mm_floor_ps(_mm_mul_ps(pos, invBoxSize)); fvec4 posFloor = floor(pos*invBoxSize);
__m128 posInBox = _mm_sub_ps(pos, _mm_mul_ps(boxSize, posFloor)); fvec4 posInBox = pos-boxSize*posFloor;
__m128 t = _mm_mul_ps(_mm_mul_ps(posInBox, invBoxSize), gridSize); fvec4 t = posInBox*invBoxSize*gridSize;
__m128i ti = _mm_cvttps_epi32(t); ivec4 ti = t;
__m128 dr = _mm_sub_ps(t, _mm_cvtepi32_ps(ti)); fvec4 dr = t-ti;
__m128i gridIndex = _mm_sub_epi32(ti, _mm_and_si128(gridSizeInt, _mm_cmpeq_epi32(ti, gridSizeInt))); ivec4 gridIndex = ti-(gridSizeInt&ti==gridSizeInt);
// Compute the B-spline coefficients. // Compute the B-spline coefficients.
__m128 data[PME_ORDER]; fvec4 data[PME_ORDER];
data[PME_ORDER-1] = _mm_setzero_ps(); data[PME_ORDER-1] = 0.0f;
data[1] = dr; data[1] = dr;
data[0] = _mm_sub_ps(one, dr); data[0] = one-dr;
for (int j = 3; j < PME_ORDER; j++) { for (int j = 3; j < PME_ORDER; j++) {
__m128 div = _mm_set1_ps(1.0f/(j-1)); fvec4 div(1.0f/(j-1));
data[j-1] = _mm_mul_ps(_mm_mul_ps(div, dr), data[j-2]); data[j-1] = div*dr*data[j-2];
for (int k = 1; k < j-1; k++) for (int k = 1; k < j-1; k++)
data[j-k-1] = _mm_mul_ps(div, _mm_add_ps(_mm_mul_ps(_mm_add_ps(dr, _mm_set1_ps(k)), data[j-k-2]), _mm_mul_ps(_mm_sub_ps(_mm_set1_ps(j-k), dr), data[j-k-1]))); data[j-k-1] = div*((dr+k)*data[j-k-2]+(fvec4(j-k)-dr)*data[j-k-1]);
data[0] = _mm_mul_ps(_mm_mul_ps(div, _mm_sub_ps(one, dr)), data[0]); data[0] = div*(one-dr)*data[0];
} }
data[PME_ORDER-1] = _mm_mul_ps(_mm_mul_ps(scale, dr), data[PME_ORDER-2]); data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
for (int j = 1; j < (PME_ORDER-1); j++) for (int j = 1; j < (PME_ORDER-1); j++)
data[PME_ORDER-j-1] = _mm_mul_ps(scale, _mm_add_ps(_mm_mul_ps(_mm_add_ps(dr, _mm_set1_ps(j)), data[PME_ORDER-j-2]), _mm_mul_ps(_mm_sub_ps(_mm_set1_ps(PME_ORDER-j), dr), data[PME_ORDER-j-1]))); data[PME_ORDER-j-1] = scale*((dr+j)*data[PME_ORDER-j-2]+(fvec4(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
data[0] = _mm_mul_ps(_mm_mul_ps(scale, _mm_sub_ps(one, dr)), data[0]); data[0] = scale*(one-dr)*data[0];
// Spread the charges. // Spread the charges.
int gridIndexX = _mm_extract_epi32(gridIndex, 0); int gridIndexX = gridIndex[0];
int gridIndexY = _mm_extract_epi32(gridIndex, 1); int gridIndexY = gridIndex[1];
int gridIndexZ = _mm_extract_epi32(gridIndex, 2); int gridIndexZ = gridIndex[2];
if (gridIndexX < 0) if (gridIndexX < 0)
return; // This happens when a simulation blows up and coordinates become NaN. return; // This happens when a simulation blows up and coordinates become NaN.
int zindex[PME_ORDER]; int zindex[PME_ORDER];
...@@ -101,21 +99,21 @@ static void spreadCharge(int start, int end, float* posq, float* grid, int gridx ...@@ -101,21 +99,21 @@ static void spreadCharge(int start, int end, float* posq, float* grid, int gridx
zindex[j] -= (zindex[j] >= gridz ? gridz : 0); zindex[j] -= (zindex[j] >= gridz ? gridz : 0);
} }
float charge = epsilonFactor*posq[4*i+3]; float charge = epsilonFactor*posq[4*i+3];
__m128 zdata0to3 = _mm_set_ps(EXTRACT_FLOAT(data[3], 2), EXTRACT_FLOAT(data[2], 2), EXTRACT_FLOAT(data[1], 2), EXTRACT_FLOAT(data[0], 2)); fvec4 zdata0to3(data[0][2], data[1][2], data[2][2], data[3][2]);
float zdata4 = EXTRACT_FLOAT(data[4], 2); float zdata4 = data[4][2];
if (gridIndexZ+4 < gridz) { if (gridIndexZ+4 < gridz) {
for (int ix = 0; ix < PME_ORDER; ix++) { for (int ix = 0; ix < PME_ORDER; ix++) {
int xbase = gridIndexX+ix; int xbase = gridIndexX+ix;
xbase -= (xbase >= gridx ? gridx : 0); xbase -= (xbase >= gridx ? gridx : 0);
xbase = xbase*gridy*gridz; xbase = xbase*gridy*gridz;
float xdata = charge*EXTRACT_FLOAT(data[ix], 0); float xdata = charge*data[ix][0];
for (int iy = 0; iy < PME_ORDER; iy++) { for (int iy = 0; iy < PME_ORDER; iy++) {
int ybase = gridIndexY+iy; int ybase = gridIndexY+iy;
ybase -= (ybase >= gridy ? gridy : 0); ybase -= (ybase >= gridy ? gridy : 0);
ybase = xbase + ybase*gridz; ybase = xbase + ybase*gridz;
float multiplier = xdata*EXTRACT_FLOAT(data[iy], 1); float multiplier = xdata*data[iy][1];
__m128 add0to3 = _mm_mul_ps(zdata0to3, _mm_set1_ps(multiplier)); fvec4 add0to3 = zdata0to3*multiplier;
_mm_storeu_ps(&grid[ybase+gridIndexZ], _mm_add_ps(_mm_loadu_ps(&grid[ybase+gridIndexZ]), add0to3)); (fvec4(&grid[ybase+gridIndexZ])+add0to3).store(&grid[ybase+gridIndexZ]);
grid[ybase+zindex[4]] += multiplier*zdata4; grid[ybase+zindex[4]] += multiplier*zdata4;
} }
} }
...@@ -125,14 +123,14 @@ static void spreadCharge(int start, int end, float* posq, float* grid, int gridx ...@@ -125,14 +123,14 @@ static void spreadCharge(int start, int end, float* posq, float* grid, int gridx
int xbase = gridIndexX+ix; int xbase = gridIndexX+ix;
xbase -= (xbase >= gridx ? gridx : 0); xbase -= (xbase >= gridx ? gridx : 0);
xbase = xbase*gridy*gridz; xbase = xbase*gridy*gridz;
float xdata = charge*EXTRACT_FLOAT(data[ix], 0); float xdata = charge*data[ix][0];
for (int iy = 0; iy < PME_ORDER; iy++) { for (int iy = 0; iy < PME_ORDER; iy++) {
int ybase = gridIndexY+iy; int ybase = gridIndexY+iy;
ybase -= (ybase >= gridy ? gridy : 0); ybase -= (ybase >= gridy ? gridy : 0);
ybase = xbase + ybase*gridz; ybase = xbase + ybase*gridz;
float multiplier = xdata*EXTRACT_FLOAT(data[iy], 1); float multiplier = xdata*data[iy][1];
__m128 add0to3 = _mm_mul_ps(zdata0to3, _mm_set1_ps(multiplier)); fvec4 add0to3 = zdata0to3*multiplier;
_mm_store_ps(temp, add0to3); add0to3.store(temp);
grid[ybase+zindex[0]] += temp[0]; grid[ybase+zindex[0]] += temp[0];
grid[ybase+zindex[1]] += temp[1]; grid[ybase+zindex[1]] += temp[1];
grid[ybase+zindex[2]] += temp[2]; grid[ybase+zindex[2]] += temp[2];
...@@ -245,51 +243,51 @@ static void reciprocalConvolution(int start, int end, fftwf_complex* grid, int g ...@@ -245,51 +243,51 @@ static void reciprocalConvolution(int start, int end, fftwf_complex* grid, int g
} }
static void interpolateForces(int start, int end, float* posq, float* force, float* grid, int gridx, int gridy, int gridz, int numParticles, Vec3 periodicBoxSize) { static void interpolateForces(int start, int end, float* posq, float* force, float* grid, int gridx, int gridy, int gridz, int numParticles, Vec3 periodicBoxSize) {
__m128 boxSize = _mm_set_ps(0, (float) periodicBoxSize[2], (float) periodicBoxSize[1], (float) periodicBoxSize[0]); fvec4 boxSize((float) periodicBoxSize[0], (float) periodicBoxSize[1], (float) periodicBoxSize[2], 0);
__m128 invBoxSize = _mm_set_ps(0, (float) (1/periodicBoxSize[2]), (float) (1/periodicBoxSize[1]), (float) (1/periodicBoxSize[0])); fvec4 invBoxSize((float) (1/periodicBoxSize[0]), (float) (1/periodicBoxSize[1]), (float) (1/periodicBoxSize[2]), 0);
__m128 gridSize = _mm_set_ps(0, gridz, gridy, gridx); fvec4 gridSize(gridx, gridy, gridz, 0);
__m128i gridSizeInt = _mm_set_epi32(0, gridz, gridy, gridx); ivec4 gridSizeInt(gridx, gridy, gridz, 0);
__m128 one = _mm_set1_ps(1); fvec4 one(1);
__m128 scale = _mm_set1_ps(1.0f/(PME_ORDER-1)); fvec4 scale(1.0f/(PME_ORDER-1));
const float epsilonFactor = sqrt(ONE_4PI_EPS0); const float epsilonFactor = sqrt(ONE_4PI_EPS0);
for (int i = start; i < end; i++) { for (int i = start; i < end; i++) {
// Find the position relative to the nearest grid point. // Find the position relative to the nearest grid point.
__m128 pos = _mm_loadu_ps(&posq[4*i]); fvec4 pos(&posq[4*i]);
__m128 posFloor = _mm_floor_ps(_mm_mul_ps(pos, invBoxSize)); fvec4 posFloor = floor(pos*invBoxSize);
__m128 posInBox = _mm_sub_ps(pos, _mm_mul_ps(boxSize, posFloor)); fvec4 posInBox = pos-boxSize*posFloor;
__m128 t = _mm_mul_ps(_mm_mul_ps(posInBox, invBoxSize), gridSize); fvec4 t = posInBox*invBoxSize*gridSize;
__m128i ti = _mm_cvttps_epi32(t); ivec4 ti = t;
__m128 dr = _mm_sub_ps(t, _mm_cvtepi32_ps(ti)); fvec4 dr = t-ti;
__m128i gridIndex = _mm_sub_epi32(ti, _mm_and_si128(gridSizeInt, _mm_cmpeq_epi32(ti, gridSizeInt))); ivec4 gridIndex = ti-(gridSizeInt&ti==gridSizeInt);
// Compute the B-spline coefficients. // Compute the B-spline coefficients.
__m128 data[PME_ORDER]; fvec4 data[PME_ORDER];
__m128 ddata[PME_ORDER]; fvec4 ddata[PME_ORDER];
data[PME_ORDER-1] = _mm_setzero_ps(); data[PME_ORDER-1] = 0.0f;
data[1] = dr; data[1] = dr;
data[0] = _mm_sub_ps(one, dr); data[0] = one-dr;
for (int j = 3; j < PME_ORDER; j++) { for (int j = 3; j < PME_ORDER; j++) {
__m128 div = _mm_set1_ps(1.0f/(j-1)); fvec4 div(1.0f/(j-1));
data[j-1] = _mm_mul_ps(_mm_mul_ps(div, dr), data[j-2]); data[j-1] = div*dr*data[j-2];
for (int k = 1; k < j-1; k++) for (int k = 1; k < j-1; k++)
data[j-k-1] = _mm_mul_ps(div, _mm_add_ps(_mm_mul_ps(_mm_add_ps(dr, _mm_set1_ps(k)), data[j-k-2]), _mm_mul_ps(_mm_sub_ps(_mm_set1_ps(j-k), dr), data[j-k-1]))); data[j-k-1] = div*((dr+k)*data[j-k-2]+(fvec4(j-k)-dr)*data[j-k-1]);
data[0] = _mm_mul_ps(_mm_mul_ps(div, _mm_sub_ps(one, dr)), data[0]); data[0] = div*(one-dr)*data[0];
} }
ddata[0] = _mm_sub_ps(_mm_set1_ps(0), data[0]); ddata[0] = -data[0];
for (int j = 1; j < PME_ORDER; j++) for (int j = 1; j < PME_ORDER; j++)
ddata[j] = _mm_sub_ps(data[j-1], data[j]); ddata[j] = data[j-1]-data[j];
data[PME_ORDER-1] = _mm_mul_ps(_mm_mul_ps(scale, dr), data[PME_ORDER-2]); data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
for (int j = 1; j < (PME_ORDER-1); j++) for (int j = 1; j < (PME_ORDER-1); j++)
data[PME_ORDER-j-1] = _mm_mul_ps(scale, _mm_add_ps(_mm_mul_ps(_mm_add_ps(dr, _mm_set1_ps(j)), data[PME_ORDER-j-2]), _mm_mul_ps(_mm_sub_ps(_mm_set1_ps(PME_ORDER-j), dr), data[PME_ORDER-j-1]))); data[PME_ORDER-j-1] = scale*((dr+j)*data[PME_ORDER-j-2]+(fvec4(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
data[0] = _mm_mul_ps(_mm_mul_ps(scale, _mm_sub_ps(one, dr)), data[0]); data[0] = scale*(one-dr)*data[0];
// Compute the force on this atom. // Compute the force on this atom.
int gridIndexX = _mm_extract_epi32(gridIndex, 0); int gridIndexX = gridIndex[0];
int gridIndexY = _mm_extract_epi32(gridIndex, 1); int gridIndexY = gridIndex[1];
int gridIndexZ = _mm_extract_epi32(gridIndex, 2); int gridIndexZ = gridIndex[2];
if (gridIndexX < 0) if (gridIndexX < 0)
return; // This happens when a simulation blows up and coordinates become NaN. return; // This happens when a simulation blows up and coordinates become NaN.
int zindex[PME_ORDER]; int zindex[PME_ORDER];
...@@ -297,34 +295,34 @@ static void interpolateForces(int start, int end, float* posq, float* force, flo ...@@ -297,34 +295,34 @@ static void interpolateForces(int start, int end, float* posq, float* force, flo
zindex[j] = gridIndexZ+j; zindex[j] = gridIndexZ+j;
zindex[j] -= (zindex[j] >= gridz ? gridz : 0); zindex[j] -= (zindex[j] >= gridz ? gridz : 0);
} }
__m128 zdata[PME_ORDER]; fvec4 zdata[PME_ORDER];
for (int j = 0; j < PME_ORDER; j++) for (int j = 0; j < PME_ORDER; j++)
zdata[j] = _mm_set_ps(0, EXTRACT_FLOAT(ddata[j], 2), EXTRACT_FLOAT(data[j], 2), EXTRACT_FLOAT(data[j], 2)); zdata[j] = fvec4(data[j][2], data[j][2], ddata[j][2], 0);
__m128 f = _mm_set1_ps(0); fvec4 f = 0.0f;
for (int ix = 0; ix < PME_ORDER; ix++) { for (int ix = 0; ix < PME_ORDER; ix++) {
int xbase = gridIndexX+ix; int xbase = gridIndexX+ix;
xbase -= (xbase >= gridx ? gridx : 0); xbase -= (xbase >= gridx ? gridx : 0);
xbase = xbase*gridy*gridz; xbase = xbase*gridy*gridz;
float dx = EXTRACT_FLOAT(data[ix], 0); float dx = data[ix][0];
float ddx = EXTRACT_FLOAT(ddata[ix], 0); float ddx = ddata[ix][0];
__m128 xdata = _mm_set_ps(0, dx, dx, ddx); fvec4 xdata(ddx, dx, dx, 0);
for (int iy = 0; iy < PME_ORDER; iy++) { for (int iy = 0; iy < PME_ORDER; iy++) {
int ybase = gridIndexY+iy; int ybase = gridIndexY+iy;
ybase -= (ybase >= gridy ? gridy : 0); ybase -= (ybase >= gridy ? gridy : 0);
ybase = xbase + ybase*gridz; ybase = xbase + ybase*gridz;
float dy = EXTRACT_FLOAT(data[iy], 1); float dy = data[iy][1];
float ddy = EXTRACT_FLOAT(ddata[iy], 1); float ddy = ddata[iy][1];
__m128 xydata = _mm_mul_ps(xdata, _mm_set_ps(0, dy, ddy, dy)); fvec4 xydata = xdata*fvec4(dy, ddy, dy, 0);
for (int iz = 0; iz < PME_ORDER; iz++) { for (int iz = 0; iz < PME_ORDER; iz++) {
__m128 gridValue = _mm_set1_ps(grid[ybase+zindex[iz]]); fvec4 gridValue(grid[ybase+zindex[iz]]);
f = _mm_add_ps(f, _mm_mul_ps(xydata, _mm_mul_ps(zdata[iz], gridValue))); f = f+xydata*zdata[iz]*gridValue;
} }
} }
} }
f = _mm_mul_ps(invBoxSize, _mm_mul_ps(gridSize, _mm_mul_ps(f, _mm_set1_ps(-epsilonFactor*posq[4*i+3])))); f = invBoxSize*gridSize*f*(-epsilonFactor*posq[4*i+3]);
_mm_store_ps(&force[4*i], f); f.store(&force[4*i]);
} }
} }
...@@ -509,10 +507,10 @@ void CpuCalcPmeReciprocalForceKernel::runThread(int index) { ...@@ -509,10 +507,10 @@ void CpuCalcPmeReciprocalForceKernel::runThread(int index) {
threadWait(); threadWait();
int numGrids = threadData.size(); int numGrids = threadData.size();
for (int i = gridStart; i < gridEnd; i += 4) { for (int i = gridStart; i < gridEnd; i += 4) {
__m128 sum = _mm_load_ps(&realGrid[i]); fvec4 sum(&realGrid[i]);
for (int j = 1; j < numGrids; j++) for (int j = 1; j < numGrids; j++)
sum = _mm_add_ps(sum, _mm_load_ps(&threadData[j]->tempGrid[i])); sum += fvec4(&threadData[j]->tempGrid[i]);
_mm_store_ps(&realGrid[i], sum); sum.store(&realGrid[i]);
} }
threadWait(); threadWait();
if (lastBoxSize != periodicBoxSize) { if (lastBoxSize != periodicBoxSize) {
......
...@@ -145,6 +145,8 @@ SKIP_METHODS = [('State',), ...@@ -145,6 +145,8 @@ SKIP_METHODS = [('State',),
('IntegrateDrudeSCFStepKernel',), ('IntegrateDrudeSCFStepKernel',),
('XmlSerializer', 'serialize'), ('XmlSerializer', 'serialize'),
('XmlSerializer', 'deserialize'), ('XmlSerializer', 'deserialize'),
('fvec4',),
('ivec4',),
] ]
# The build script assumes method args that are non-const references are # The build script assumes method args that are non-const references are
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment