"platforms/cuda/vscode:/vscode.git/clone" did not exist on "f2548616f290d6a6a500f9f0328eb4ea93374fef"
Commit be04ec57 authored by peastman
Browse files

Optimizations to CPU nonbonded forces: better load balancing between threads,...

Optimizations to CPU nonbonded forces: better load balancing between threads, use linear splines instead of cubic
parent 56cf0fde
...@@ -107,6 +107,7 @@ private: ...@@ -107,6 +107,7 @@ private:
float const* posq; float const* posq;
std::vector<std::vector<float> >* threadForce; std::vector<std::vector<float> >* threadForce;
bool includeEnergy; bool includeEnergy;
void* atomicCounter;
static const int NUM_TABLE_POINTS; static const int NUM_TABLE_POINTS;
......
...@@ -156,6 +156,7 @@ private: ...@@ -156,6 +156,7 @@ private:
bool periodic; bool periodic;
bool ewald; bool ewald;
bool pme; bool pme;
bool tableIsValid;
const CpuNeighborList* neighborList; const CpuNeighborList* neighborList;
float periodicBoxSize[3]; float periodicBoxSize[3];
float cutoffDistance, switchingDistance; float cutoffDistance, switchingDistance;
...@@ -174,6 +175,7 @@ private: ...@@ -174,6 +175,7 @@ private:
std::set<int> const* exclusions; std::set<int> const* exclusions;
std::vector<std::vector<float> >* threadForce; std::vector<std::vector<float> >* threadForce;
bool includeEnergy; bool includeEnergy;
void* atomicCounter;
static const float TWO_OVER_SQRT_PI; static const float TWO_OVER_SQRT_PI;
static const int NUM_TABLE_POINTS; static const int NUM_TABLE_POINTS;
......
...@@ -24,14 +24,14 @@ ...@@ -24,14 +24,14 @@
#include "CpuGBSAOBCForce.h" #include "CpuGBSAOBCForce.h"
#include "SimTKOpenMMRealType.h" #include "SimTKOpenMMRealType.h"
#include "openmm/internal/SplineFitter.h"
#include "openmm/internal/vectorize.h" #include "openmm/internal/vectorize.h"
#include "gmx_atomic.h"
#include <cmath> #include <cmath>
using namespace std; using namespace std;
using namespace OpenMM; using namespace OpenMM;
const int CpuGBSAOBCForce::NUM_TABLE_POINTS = 1025; const int CpuGBSAOBCForce::NUM_TABLE_POINTS = 2048;
class CpuGBSAOBCForce::ComputeTask : public ThreadPool::Task { class CpuGBSAOBCForce::ComputeTask : public ThreadPool::Task {
public: public:
...@@ -46,20 +46,10 @@ public: ...@@ -46,20 +46,10 @@ public:
CpuGBSAOBCForce::CpuGBSAOBCForce() : cutoff(false), periodic(false) { CpuGBSAOBCForce::CpuGBSAOBCForce() : cutoff(false), periodic(false) {
logDX = 0.5/NUM_TABLE_POINTS; logDX = 0.5/NUM_TABLE_POINTS;
logDXInv = 1.0f/logDX; logDXInv = 1.0f/logDX;
vector<double> x(NUM_TABLE_POINTS+1); logTable.resize(NUM_TABLE_POINTS+1);
vector<double> y(NUM_TABLE_POINTS+1);
vector<double> deriv;
for (int i = 0; i < NUM_TABLE_POINTS+1; i++) { for (int i = 0; i < NUM_TABLE_POINTS+1; i++) {
x[i] = 0.5+i*0.5/NUM_TABLE_POINTS; double x = 0.5+i*logDX;
y[i] = log(x[i]); logTable[i] = log(x);
}
SplineFitter::createNaturalSpline(x, y, deriv);
logTable.resize(4*NUM_TABLE_POINTS);
for (int i = 0; i < NUM_TABLE_POINTS; i++) {
logTable[4*i] = (float) y[i];
logTable[4*i+1] = (float) y[i+1];
logTable[4*i+2] = (float) (deriv[i]*logDX*logDX/6);
logTable[4*i+3] = (float) (deriv[i+1]*logDX*logDX/6);
} }
} }
...@@ -104,16 +94,22 @@ void CpuGBSAOBCForce::computeForce(const std::vector<float>& posq, vector<vector ...@@ -104,16 +94,22 @@ void CpuGBSAOBCForce::computeForce(const std::vector<float>& posq, vector<vector
threadBornForces.resize(numThreads); threadBornForces.resize(numThreads);
for (int i = 0; i < numThreads; i++) for (int i = 0; i < numThreads; i++)
threadBornForces[i].resize(particleParams.size()+3); threadBornForces[i].resize(particleParams.size()+3);
gmx_atomic_t counter;
this->atomicCounter = &counter;
// Signal the threads to start running and wait for them to finish. // Signal the threads to start running and wait for them to finish.
ComputeTask task(*this); ComputeTask task(*this);
gmx_atomic_set(&counter, 0);
threads.execute(task); threads.execute(task);
threads.waitForThreads(); // Compute Born radii threads.waitForThreads(); // Compute Born radii
gmx_atomic_set(&counter, 0);
threads.resumeThreads(); threads.resumeThreads();
threads.waitForThreads(); // Compute surface area term threads.waitForThreads(); // Compute surface area term
gmx_atomic_set(&counter, 0);
threads.resumeThreads(); threads.resumeThreads();
threads.waitForThreads(); // First loop threads.waitForThreads(); // First loop
gmx_atomic_set(&counter, 0);
threads.resumeThreads(); threads.resumeThreads();
threads.waitForThreads(); // Second loop threads.waitForThreads(); // Second loop
...@@ -141,8 +137,11 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) { ...@@ -141,8 +137,11 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
// Calculate Born radii // Calculate Born radii
for (int blockStart = start; blockStart < end; blockStart += 4) { while (true) {
int numInBlock = min(4, end-blockStart); int blockStart = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 4);
if (blockStart >= numParticles)
break;
int numInBlock = min(4, numParticles-blockStart);
ivec4 blockAtomIndex(blockStart, blockStart+1, blockStart+2, blockStart+3); ivec4 blockAtomIndex(blockStart, blockStart+1, blockStart+2, blockStart+3);
float atomRadius[4], atomx[4], atomy[4], atomz[4]; float atomRadius[4], atomx[4], atomy[4], atomz[4];
int blockMask[4] = {0, 0, 0, 0}; int blockMask[4] = {0, 0, 0, 0};
...@@ -213,7 +212,10 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) { ...@@ -213,7 +212,10 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
vector<float>& bornForces = threadBornForces[threadIndex]; vector<float>& bornForces = threadBornForces[threadIndex];
for (int i = 0; i < numParticles; i++) for (int i = 0; i < numParticles; i++)
bornForces[i] = 0.0f; bornForces[i] = 0.0f;
for (int atomI = start; atomI < end; atomI++) { while (true) {
int atomI = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
if (atomI >= numParticles)
break;
if (bornRadii[atomI] > 0) { if (bornRadii[atomI] > 0) {
float radiusI = particleParams[atomI].first + dielectricOffset; float radiusI = particleParams[atomI].first + dielectricOffset;
float r = radiusI + probeRadius; float r = radiusI + probeRadius;
...@@ -235,8 +237,11 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) { ...@@ -235,8 +237,11 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
preFactor = ONE_4PI_EPS0*((1.0f/solventDielectric) - (1.0f/soluteDielectric)); preFactor = ONE_4PI_EPS0*((1.0f/solventDielectric) - (1.0f/soluteDielectric));
else else
preFactor = 0.0f; preFactor = 0.0f;
for (int blockStart = start; blockStart < end; blockStart += 4) { while (true) {
int numInBlock = min(4, end-blockStart); int blockStart = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 4);
if (blockStart >= numParticles)
break;
int numInBlock = min(4, numParticles-blockStart);
ivec4 blockAtomIndex(blockStart, blockStart+1, blockStart+2, blockStart+3); ivec4 blockAtomIndex(blockStart, blockStart+1, blockStart+2, blockStart+3);
float atomCharge[4], atomx[4], atomy[4], atomz[4]; float atomCharge[4], atomx[4], atomy[4], atomz[4];
int blockMask[4] = {0, 0, 0, 0}; int blockMask[4] = {0, 0, 0, 0};
...@@ -303,13 +308,16 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) { ...@@ -303,13 +308,16 @@ void CpuGBSAOBCForce::threadComputeForce(ThreadPool& threads, int threadIndex) {
// Second loop of Born energy computation. // Second loop of Born energy computation.
for (int blockStart = start; blockStart < end; blockStart += 4) { while (true) {
int blockStart = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 4);
if (blockStart >= numParticles)
break;
fvec4 bornForce(0.0f); fvec4 bornForce(0.0f);
for (int i = 0; i < numThreads; i++) for (int i = 0; i < numThreads; i++)
bornForce += fvec4(&threadBornForces[i][blockStart]); bornForce += fvec4(&threadBornForces[i][blockStart]);
fvec4 radii(&bornRadii[blockStart]); fvec4 radii(&bornRadii[blockStart]);
bornForce *= radii*radii*fvec4(&obcChain[blockStart]); bornForce *= radii*radii*fvec4(&obcChain[blockStart]);
int numInBlock = min(4, end-blockStart); int numInBlock = min(4, numParticles-blockStart);
ivec4 blockAtomIndex(blockStart, blockStart+1, blockStart+2, blockStart+3); ivec4 blockAtomIndex(blockStart, blockStart+1, blockStart+2, blockStart+3);
float atomRadius[4], atomx[4], atomy[4], atomz[4]; float atomRadius[4], atomx[4], atomy[4], atomz[4];
int blockMask[4] = {0, 0, 0, 0}; int blockMask[4] = {0, 0, 0, 0};
...@@ -385,21 +393,16 @@ void CpuGBSAOBCForce::getDeltaR(const fvec4& posI, const fvec4& x, const fvec4& ...@@ -385,21 +393,16 @@ void CpuGBSAOBCForce::getDeltaR(const fvec4& posI, const fvec4& x, const fvec4&
fvec4 CpuGBSAOBCForce::fastLog(fvec4 x) { fvec4 CpuGBSAOBCForce::fastLog(fvec4 x) {
// Evaluate log(x) using a lookup table for speed. // Evaluate log(x) using a lookup table for speed.
float y[4];
fvec4 x1 = (x-0.5f)*logDXInv; fvec4 x1 = (x-0.5f)*logDXInv;
ivec4 index = floor(x1); ivec4 index = floor(x1);
fvec4 coeff[4]; fvec4 coeff2 = x1-index;
coeff[1] = x1-index; fvec4 coeff1 = 1.0f-coeff2;
coeff[0] = 1.0f-coeff[1]; float table1[4], table2[4];
coeff[2] = coeff[0]*coeff[0]*coeff[0]-coeff[0];
coeff[3] = coeff[1]*coeff[1]*coeff[1]-coeff[1];
transpose(coeff[0], coeff[1], coeff[2], coeff[3]);
static float maxdiff = 0.0f;
for (int i = 0; i < 4; i++) { for (int i = 0; i < 4; i++) {
if (index[i] >= 0 && index[i] < NUM_TABLE_POINTS) int tableIndex = index[i];
y[i] = dot4(coeff[i], fvec4(&logTable[4*index[i]])); if (tableIndex < NUM_TABLE_POINTS)
else table1[i] = logTable[tableIndex];
y[i] = logf(x[i]); table2[i] = logTable[tableIndex+1];
} }
return fvec4(y); return coeff1*fvec4(table1) + coeff2*fvec4(table2);
} }
...@@ -29,8 +29,8 @@ ...@@ -29,8 +29,8 @@
#include "CpuNonbondedForce.h" #include "CpuNonbondedForce.h"
#include "ReferenceForce.h" #include "ReferenceForce.h"
#include "ReferencePME.h" #include "ReferencePME.h"
#include "openmm/internal/SplineFitter.h"
#include "openmm/internal/vectorize.h" #include "openmm/internal/vectorize.h"
#include "gmx_atomic.h"
// In case we're using some primitive version of Visual Studio this will // In case we're using some primitive version of Visual Studio this will
// make sure that erf() and erfc() are defined. // make sure that erf() and erfc() are defined.
...@@ -40,7 +40,7 @@ using namespace std; ...@@ -40,7 +40,7 @@ using namespace std;
using namespace OpenMM; using namespace OpenMM;
const float CpuNonbondedForce::TWO_OVER_SQRT_PI = (float) (2/sqrt(PI_M)); const float CpuNonbondedForce::TWO_OVER_SQRT_PI = (float) (2/sqrt(PI_M));
const int CpuNonbondedForce::NUM_TABLE_POINTS = 1025; const int CpuNonbondedForce::NUM_TABLE_POINTS = 2048;
class CpuNonbondedForce::ComputeDirectTask : public ThreadPool::Task { class CpuNonbondedForce::ComputeDirectTask : public ThreadPool::Task {
public: public:
...@@ -58,10 +58,10 @@ public: ...@@ -58,10 +58,10 @@ public:
--------------------------------------------------------------------------------------- */ --------------------------------------------------------------------------------------- */
CpuNonbondedForce::CpuNonbondedForce() : cutoff(false), useSwitch(false), periodic(false), ewald(false), pme(false) { CpuNonbondedForce::CpuNonbondedForce() : cutoff(false), useSwitch(false), periodic(false), ewald(false), pme(false), tableIsValid(false) {
} }
/**--------------------------------------------------------------------------------------- /**---------------------------------------------------------------------------------------
Set the force to use a cutoff. Set the force to use a cutoff.
...@@ -71,8 +71,9 @@ CpuNonbondedForce::CpuNonbondedForce() : cutoff(false), useSwitch(false), period ...@@ -71,8 +71,9 @@ CpuNonbondedForce::CpuNonbondedForce() : cutoff(false), useSwitch(false), period
--------------------------------------------------------------------------------------- */ --------------------------------------------------------------------------------------- */
void CpuNonbondedForce::setUseCutoff(float distance, const CpuNeighborList& neighbors, float solventDielectric) { void CpuNonbondedForce::setUseCutoff(float distance, const CpuNeighborList& neighbors, float solventDielectric) {
if (distance != cutoffDistance)
tableIsValid = false;
cutoff = true; cutoff = true;
cutoffDistance = distance; cutoffDistance = distance;
neighborList = &neighbors; neighborList = &neighbors;
...@@ -127,6 +128,8 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) { ...@@ -127,6 +128,8 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
--------------------------------------------------------------------------------------- */ --------------------------------------------------------------------------------------- */
void CpuNonbondedForce::setUseEwald(float alpha, int kmaxx, int kmaxy, int kmaxz) { void CpuNonbondedForce::setUseEwald(float alpha, int kmaxx, int kmaxy, int kmaxz) {
if (alpha != alphaEwald)
tableIsValid = false;
alphaEwald = alpha; alphaEwald = alpha;
numRx = kmaxx; numRx = kmaxx;
numRy = kmaxy; numRy = kmaxy;
...@@ -145,6 +148,8 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) { ...@@ -145,6 +148,8 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
--------------------------------------------------------------------------------------- */ --------------------------------------------------------------------------------------- */
void CpuNonbondedForce::setUsePME(float alpha, int meshSize[3]) { void CpuNonbondedForce::setUsePME(float alpha, int meshSize[3]) {
if (alpha != alphaEwald)
tableIsValid = false;
alphaEwald = alpha; alphaEwald = alpha;
meshDim[0] = meshSize[0]; meshDim[0] = meshSize[0];
meshDim[1] = meshSize[1]; meshDim[1] = meshSize[1];
...@@ -155,24 +160,16 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) { ...@@ -155,24 +160,16 @@ void CpuNonbondedForce::setUseSwitchingFunction(float distance) {
void CpuNonbondedForce::tabulateEwaldScaleFactor() { void CpuNonbondedForce::tabulateEwaldScaleFactor() {
if (tableIsValid)
return;
tableIsValid = true;
ewaldDX = cutoffDistance/(NUM_TABLE_POINTS-2); ewaldDX = cutoffDistance/(NUM_TABLE_POINTS-2);
ewaldDXInv = 1.0f/ewaldDX; ewaldDXInv = 1.0f/ewaldDX;
vector<double> x(NUM_TABLE_POINTS+1); ewaldScaleTable.resize(NUM_TABLE_POINTS+1);
vector<double> y(NUM_TABLE_POINTS+1);
vector<double> deriv;
for (int i = 0; i < NUM_TABLE_POINTS+1; i++) { for (int i = 0; i < NUM_TABLE_POINTS+1; i++) {
double r = i*cutoffDistance/(NUM_TABLE_POINTS-2); double r = i*ewaldDX;
double alphaR = alphaEwald*r; double alphaR = alphaEwald*r;
x[i] = r; ewaldScaleTable[i] = erfc(alphaR) + TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR);
y[i] = erfc(alphaR) + TWO_OVER_SQRT_PI*alphaR*exp(-alphaR*alphaR);
}
SplineFitter::createNaturalSpline(x, y, deriv);
ewaldScaleTable.resize(4*NUM_TABLE_POINTS);
for (int i = 0; i < NUM_TABLE_POINTS; i++) {
ewaldScaleTable[4*i] = (float) y[i];
ewaldScaleTable[4*i+1] = (float) y[i+1];
ewaldScaleTable[4*i+2] = (float) (deriv[i]*ewaldDX*ewaldDX/6);
ewaldScaleTable[4*i+3] = (float) (deriv[i+1]*ewaldDX*ewaldDX/6);
} }
} }
...@@ -302,6 +299,9 @@ void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const ...@@ -302,6 +299,9 @@ void CpuNonbondedForce::calculateDirectIxn(int numberOfAtoms, float* posq, const
this->threadForce = &threadForce; this->threadForce = &threadForce;
includeEnergy = (totalEnergy != NULL); includeEnergy = (totalEnergy != NULL);
threadEnergy.resize(threads.getNumThreads()); threadEnergy.resize(threads.getNumThreads());
gmx_atomic_t counter;
gmx_atomic_set(&counter, 0);
this->atomicCounter = &counter;
// Signal the threads to start running and wait for them to finish. // Signal the threads to start running and wait for them to finish.
...@@ -332,8 +332,12 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex ...@@ -332,8 +332,12 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
if (ewald || pme) { if (ewald || pme) {
// Compute the interactions from the neighbor list. // Compute the interactions from the neighbor list.
for (int i = threadIndex; i < neighborList->getNumBlocks(); i += numThreads) while (true) {
calculateBlockEwaldIxn(i, forces, energyPtr, boxSize, invBoxSize); int nextBlock = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
if (nextBlock >= neighborList->getNumBlocks())
break;
calculateBlockEwaldIxn(nextBlock, forces, energyPtr, boxSize, invBoxSize);
}
// Now subtract off the exclusions, since they were implicitly included in the reciprocal space sum. // Now subtract off the exclusions, since they were implicitly included in the reciprocal space sum.
...@@ -367,13 +371,20 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex ...@@ -367,13 +371,20 @@ void CpuNonbondedForce::threadComputeDirect(ThreadPool& threads, int threadIndex
else if (cutoff) { else if (cutoff) {
// Compute the interactions from the neighbor list. // Compute the interactions from the neighbor list.
for (int i = threadIndex; i < neighborList->getNumBlocks(); i += numThreads) while (true) {
calculateBlockIxn(i, forces, energyPtr, boxSize, invBoxSize); int nextBlock = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
if (nextBlock >= neighborList->getNumBlocks())
break;
calculateBlockIxn(nextBlock, forces, energyPtr, boxSize, invBoxSize);
}
} }
else { else {
// Loop over all atom pairs // Loop over all atom pairs
for (int i = threadIndex; i < numberOfAtoms; i += numThreads){ while (true) {
int i = gmx_atomic_fetch_add(reinterpret_cast<gmx_atomic_t*>(atomicCounter), 1);
if (i >= numberOfAtoms)
break;
for (int j = i+1; j < numberOfAtoms; j++) for (int j = i+1; j < numberOfAtoms; j++)
if (exclusions[j].find(i) == exclusions[j].end()) if (exclusions[j].find(i) == exclusions[j].end())
calculateOneIxn(i, j, forces, energyPtr, boxSize, invBoxSize); calculateOneIxn(i, j, forces, energyPtr, boxSize, invBoxSize);
...@@ -609,10 +620,10 @@ void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, do ...@@ -609,10 +620,10 @@ void CpuNonbondedForce::calculateBlockEwaldIxn(int blockIndex, float* forces, do
fvec4 sig2 = inverseR*sig; fvec4 sig2 = inverseR*sig;
sig2 *= sig2; sig2 *= sig2;
fvec4 sig6 = sig2*sig2*sig2; fvec4 sig6 = sig2*sig2*sig2;
fvec4 eps = blockAtomEpsilon*atomParameters[atom].second; fvec4 epsSig6 = blockAtomEpsilon*atomParameters[atom].second*sig6;
dEdR += switchValue*eps*(12.0f*sig6 - 6.0f)*sig6; dEdR += switchValue*epsSig6*(12.0f*sig6 - 6.0f);
dEdR *= inverseR*inverseR; dEdR *= inverseR*inverseR;
fvec4 energy = eps*(sig6-1.0f)*sig6; fvec4 energy = epsSig6*(sig6-1.0f);
if (useSwitch) { if (useSwitch) {
dEdR -= energy*switchDeriv*inverseR; dEdR -= energy*switchDeriv*inverseR;
energy *= switchValue; energy *= switchValue;
...@@ -683,18 +694,16 @@ fvec4 CpuNonbondedForce::erfcApprox(fvec4 x) { ...@@ -683,18 +694,16 @@ fvec4 CpuNonbondedForce::erfcApprox(fvec4 x) {
fvec4 CpuNonbondedForce::ewaldScaleFunction(fvec4 x) { fvec4 CpuNonbondedForce::ewaldScaleFunction(fvec4 x) {
// Compute the tabulated Ewald scale factor: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI) // Compute the tabulated Ewald scale factor: erfc(alpha*r) + 2*alpha*r*exp(-alpha*alpha*r*r)/sqrt(PI)
float y[4];
fvec4 x1 = x*ewaldDXInv; fvec4 x1 = x*ewaldDXInv;
ivec4 index = floor(x1); ivec4 index = floor(x1);
fvec4 coeff[4]; fvec4 coeff2 = x1-index;
coeff[1] = x1-index; fvec4 coeff1 = 1.0f-coeff2;
coeff[0] = 1.0f-coeff[1]; float table1[4], table2[4];
coeff[2] = coeff[0]*coeff[0]*coeff[0]-coeff[0];
coeff[3] = coeff[1]*coeff[1]*coeff[1]-coeff[1];
transpose(coeff[0], coeff[1], coeff[2], coeff[3]);
for (int i = 0; i < 4; i++) { for (int i = 0; i < 4; i++) {
if (index[i] < NUM_TABLE_POINTS) int tableIndex = index[i];
y[i] = dot4(coeff[i], fvec4(&ewaldScaleTable[4*index[i]])); if (tableIndex < NUM_TABLE_POINTS)
table1[i] = ewaldScaleTable[tableIndex];
table2[i] = ewaldScaleTable[tableIndex+1];
} }
return fvec4(y); return coeff1*fvec4(table1) + coeff2*fvec4(table2);
} }
This diff is collapsed.
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment