Commit fb62fecb authored by Peter Eastman's avatar Peter Eastman
Browse files

Beginning to implement PME reciprocal space calculation

parent cf335495
...@@ -29,6 +29,9 @@ ...@@ -29,6 +29,9 @@
* USE OR OTHER DEALINGS IN THE SOFTWARE. * * USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */ * -------------------------------------------------------------------------- */
#ifdef WIN32
#define _USE_MATH_DEFINES /* M_PI */
#endif
#include "openmm/OpenMMException.h" #include "openmm/OpenMMException.h"
#include "cudaKernels.h" #include "cudaKernels.h"
#include "amoebaCudaKernels.h" #include "amoebaCudaKernels.h"
...@@ -37,9 +40,11 @@ ...@@ -37,9 +40,11 @@
extern void OPENMMCUDA_EXPORT SetCalculateObcGbsaForces2Sim(gpuContext gpu); extern void OPENMMCUDA_EXPORT SetCalculateObcGbsaForces2Sim(gpuContext gpu);
extern void OPENMMCUDA_EXPORT SetForcesSim(gpuContext gpu); extern void OPENMMCUDA_EXPORT SetForcesSim(gpuContext gpu);
#include <cmath>
#include <sstream> #include <sstream>
#include <limits> #include <limits>
#include <cstring> #include <cstring>
#include <vector>
#ifdef WIN32 #ifdef WIN32
// #include <windows.h> // #include <windows.h>
...@@ -49,6 +54,8 @@ extern void OPENMMCUDA_EXPORT SetForcesSim(gpuContext gpu); ...@@ -49,6 +54,8 @@ extern void OPENMMCUDA_EXPORT SetForcesSim(gpuContext gpu);
//#define AMOEBA_DEBUG //#define AMOEBA_DEBUG
#undef AMOEBA_DEBUG #undef AMOEBA_DEBUG
using std::vector;
extern "C" extern "C"
amoebaGpuContext amoebaGpuInit( _gpuContext* gpu ) amoebaGpuContext amoebaGpuInit( _gpuContext* gpu )
{ {
...@@ -68,6 +75,12 @@ amoebaGpuContext amoebaGpuInit( _gpuContext* gpu ) ...@@ -68,6 +75,12 @@ amoebaGpuContext amoebaGpuInit( _gpuContext* gpu )
amoebaGpu->amoebaSim.numberOfAtoms = gpu->natoms; amoebaGpu->amoebaSim.numberOfAtoms = gpu->natoms;
amoebaGpu->amoebaSim.paddedNumberOfAtoms = gpu->sim.paddedNumberOfAtoms; amoebaGpu->amoebaSim.paddedNumberOfAtoms = gpu->sim.paddedNumberOfAtoms;
amoebaGpu->psThetai1 = NULL;
amoebaGpu->psThetai2 = NULL;
amoebaGpu->psThetai3 = NULL;
amoebaGpu->psQfac = NULL;
amoebaGpu->psIgrid = NULL;
return amoebaGpu; return amoebaGpu;
} }
...@@ -2176,6 +2189,118 @@ void gpuSetAmoebaVdwParameters( amoebaGpuContext amoebaGpu, ...@@ -2176,6 +2189,118 @@ void gpuSetAmoebaVdwParameters( amoebaGpuContext amoebaGpu,
} }
extern "C"
void gpuSetAmoebaPMEParameters(amoebaGpuContext amoebaGpu, float alpha, int gridSizeX, int gridSizeY, int gridSizeZ)
{
gpuContext gpu = amoebaGpu->gpuContext;
gpu->sim.alphaEwald = alpha;
int3 gridSize = make_int3(gridSizeX, gridSizeY, gridSizeZ);
gpu->sim.pmeGridSize = gridSize;
int3 groupSize = make_int3(2, 4, 4);
gpu->sim.pmeGroupSize = groupSize;
const int3 numGroups = make_int3((gridSize.x+groupSize.x-1)/groupSize.x, (gridSize.y+groupSize.y-1)/groupSize.y, (gridSize.z+groupSize.z-1)/groupSize.z);
const unsigned int totalGroups = numGroups.x*numGroups.y*numGroups.z;
cufftPlan3d(&gpu->fftplan, gridSize.x, gridSize.y, gridSize.z, CUFFT_C2C);
gpu->psPmeGrid = new CUDAStream<cufftComplex>(gridSize.x*gridSize.y*gridSize.z, 1, "PmeGrid");
gpu->sim.pPmeGrid = gpu->psPmeGrid->_pDevData;
gpu->psPmeBsplineModuli[0] = new CUDAStream<float>(gridSize.x, 1, "PmeBsplineModuli0");
gpu->sim.pPmeBsplineModuli[0] = gpu->psPmeBsplineModuli[0]->_pDevData;
gpu->psPmeBsplineModuli[1] = new CUDAStream<float>(gridSize.y, 1, "PmeBsplineModuli1");
gpu->sim.pPmeBsplineModuli[1] = gpu->psPmeBsplineModuli[1]->_pDevData;
gpu->psPmeBsplineModuli[2] = new CUDAStream<float>(gridSize.z, 1, "PmeBsplineModuli2");
gpu->sim.pPmeBsplineModuli[2] = gpu->psPmeBsplineModuli[2]->_pDevData;
amoebaGpu->psQfac = new CUDAStream<float>(gridSize.x*gridSize.y*gridSize.z, 1, "qfac");
amoebaGpu->amoebaSim.pQfac = amoebaGpu->psQfac->_pDevData;
amoebaGpu->psThetai1 = new CUDAStream<float4>(AMOEBA_PME_ORDER*gpu->natoms, 1, "thetai1");
amoebaGpu->amoebaSim.pThetai1 = amoebaGpu->psThetai1->_pDevData;
amoebaGpu->psThetai2 = new CUDAStream<float4>(AMOEBA_PME_ORDER*gpu->natoms, 1, "thetai2");
amoebaGpu->amoebaSim.pThetai2 = amoebaGpu->psThetai2->_pDevData;
amoebaGpu->psThetai3 = new CUDAStream<float4>(AMOEBA_PME_ORDER*gpu->natoms, 1, "thetai3");
amoebaGpu->amoebaSim.pThetai3 = amoebaGpu->psThetai3->_pDevData;
amoebaGpu->psIgrid = new CUDAStream<int4>(gpu->natoms, 1, "igrid");
amoebaGpu->amoebaSim.pIgrid = amoebaGpu->psIgrid->_pDevData;
gpu->psPmeAtomGridIndex = new CUDAStream<int2>(gpu->natoms, 1, "PmeAtomGridIndex");
gpu->sim.pPmeAtomGridIndex = gpu->psPmeAtomGridIndex->_pDevData;
gpu->psPmeBsplineTheta = new CUDAStream<float4>(1, 1, "PmeBsplineTheta"); // Not actually uesd
gpu->psPmeBsplineDtheta = new CUDAStream<float4>(1, 1, "PmeBsplineDtheta"); // Not actually used
// Initialize the b-spline moduli.
double array[AMOEBA_PME_ORDER];
double x = 0.0;
array[0] = 1.0 - x;
array[1] = x;
for (int k = 2; k < AMOEBA_PME_ORDER; k++) {
double denom = 1.0/k;
array[k] = x*array[k-1]*denom;
for (int i = 1; i < k; i++)
array[k-i] = ((x+i)*array[k-i-1] + ((k-i+1)-x)*array[k-i])*denom;
array[0] = (1.0-x)*array[0]*denom;
}
int maxSize = std::max(std::max(gridSize.x, gridSize.y), gridSize.z);
vector<double> bsarray(maxSize+1, 0.0);
for (int i = 2; i <= AMOEBA_PME_ORDER+1; i++)
bsarray[i] = array[i-2];
for (int dim = 0; dim < 3; dim++) {
float* bsmod = gpu->psPmeBsplineModuli[dim]->_pSysData;
int size = gpu->psPmeBsplineModuli[dim]->_length;
// get the modulus of the discrete Fourier transform
double factor = 2.0*M_PI/size;
for (int i = 0; i < size; i++) {
double sum1 = 0.0;
double sum2 = 0.0;
for (int j = 1; j <= size; j++) {
double arg = factor*i*(j-1);
sum1 = sum1 + bsarray[j]*cos(arg);
sum2 = sum2 + bsarray[j]*sin(arg);
}
bsmod[i] = sum1*sum1 + sum2*sum2;
}
// fix for exponential Euler spline interpolation failure
double eps = 1.0e-7;
if (bsmod[0] < eps)
bsmod[0] = 0.5 * bsmod[1];
for (int i = 1; i < size-1; i++)
if (bsmod[i] < eps)
bsmod[i] = 0.5*(bsmod[i-1]+bsmod[i+1]);
if (bsmod[size-1] < eps)
bsmod[size-1] = 0.5*bsmod[size-2];
// compute and apply the optimal zeta coefficient
int jcut = 50;
for (int i = 1; i <= size; i++) {
int k = i - 1;
if (i > size/2)
k = k - size;
double zeta;
if (k == 0)
zeta = 1.0;
else {
double sum1 = 1.0;
double sum2 = 1.0;
factor = M_PI*k/size;
for (int j = 1; j <= jcut; j++) {
double arg = factor/(factor+M_PI*j);
sum1 = sum1 + pow(arg, AMOEBA_PME_ORDER);
sum2 = sum2 + pow(arg, 2*AMOEBA_PME_ORDER);
}
for (int j = 1; j <= jcut; j++) {
double arg = factor/(factor-M_PI*j);
sum1 = sum1 + pow(arg, AMOEBA_PME_ORDER);
sum2 = sum2 + pow(arg, 2*AMOEBA_PME_ORDER);
}
zeta = sum2/sum1;
}
bsmod[i-1] = bsmod[i-1]*zeta*zeta;
}
}
}
extern "C" extern "C"
void amoebaGpuBuildVdwExclusionList( amoebaGpuContext amoebaGpu, const std::vector< std::vector<int> >& exclusions ) void amoebaGpuBuildVdwExclusionList( amoebaGpuContext amoebaGpu, const std::vector< std::vector<int> >& exclusions )
{ {
...@@ -2573,6 +2698,14 @@ void amoebaGpuShutDown(amoebaGpuContext gpu) ...@@ -2573,6 +2698,14 @@ void amoebaGpuShutDown(amoebaGpuContext gpu)
delete gpu->ps_P_ScaleIndices; delete gpu->ps_P_ScaleIndices;
delete gpu->ps_M_ScaleIndices; delete gpu->ps_M_ScaleIndices;
if (gpu->psThetai1) {
delete gpu->psThetai1;
delete gpu->psThetai2;
delete gpu->psThetai3;
delete gpu->psQfac;
delete gpu->psIgrid;
}
if( gpu->pMapArray ){ if( gpu->pMapArray ){
for( unsigned int ii = 0; ii < gpu->paddedNumberOfAtoms; ii++ ){ for( unsigned int ii = 0; ii < gpu->paddedNumberOfAtoms; ii++ ){
delete gpu->pMapArray[ii]; delete gpu->pMapArray[ii];
...@@ -2626,6 +2759,7 @@ void amoebaGpuSetConstants(amoebaGpuContext amoebaGpu) ...@@ -2626,6 +2759,7 @@ void amoebaGpuSetConstants(amoebaGpuContext amoebaGpu)
SetCalculateAmoebaCudaFixedEAndGKFieldsSim( amoebaGpu ); SetCalculateAmoebaCudaFixedEAndGKFieldsSim( amoebaGpu );
SetCalculateAmoebaCudaMutualInducedAndGkFieldsSim( amoebaGpu ); SetCalculateAmoebaCudaMutualInducedAndGkFieldsSim( amoebaGpu );
SetCalculateObcGbsaForces2Sim( amoebaGpu->gpuContext ); SetCalculateObcGbsaForces2Sim( amoebaGpu->gpuContext );
SetCalculateAmoebaPMESim( amoebaGpu );
} }
extern "C" extern "C"
......
...@@ -156,5 +156,9 @@ extern unsigned int getThreadsPerBlock( amoebaGpuContext amoebaGpu, unsigned int ...@@ -156,5 +156,9 @@ extern unsigned int getThreadsPerBlock( amoebaGpuContext amoebaGpu, unsigned int
//extern int isNanOrInfinity( double number ); //extern int isNanOrInfinity( double number );
extern void trackMutualInducedIterations( amoebaGpuContext amoebaGpu, int iteration); extern void trackMutualInducedIterations( amoebaGpuContext amoebaGpu, int iteration);
// PME
extern void SetCalculateAmoebaPMESim( amoebaGpuContext amoebaGpu );
#endif //__AMOEBA_GPU_TYPES_H__ #endif //__AMOEBA_GPU_TYPES_H__
...@@ -47,6 +47,8 @@ enum CudaAmoebaNonbondedMethod ...@@ -47,6 +47,8 @@ enum CudaAmoebaNonbondedMethod
AMOEBA_PARTICLE_MESH_EWALD AMOEBA_PARTICLE_MESH_EWALD
}; };
static const int AMOEBA_PME_ORDER = 5;
struct cudaAmoebaGmxSimulation { struct cudaAmoebaGmxSimulation {
// Constants // Constants
...@@ -177,6 +179,13 @@ struct cudaAmoebaGmxSimulation { ...@@ -177,6 +179,13 @@ struct cudaAmoebaGmxSimulation {
float fd; // electric * 2.0f * (1.0f-dwater)/(1.0f+2.0f*dwater); float fd; // electric * 2.0f * (1.0f-dwater)/(1.0f+2.0f*dwater);
float fq; // electric * 3.0f * (1.0f-dwater)/(2.0f+3.0f*dwater); float fq; // electric * 3.0f * (1.0f-dwater)/(2.0f+3.0f*dwater);
// PME arrays
float* pQfac;
float4* pThetai1;
float4* pThetai2;
float4* pThetai3;
int4* pIgrid;
}; };
#endif #endif
...@@ -219,6 +219,14 @@ struct _amoebaGpuContext { ...@@ -219,6 +219,14 @@ struct _amoebaGpuContext {
// Wca dispersion fields // Wca dispersion fields
CUDAStream<float2>* psWcaDispersionRadiusEpsilon; CUDAStream<float2>* psWcaDispersionRadiusEpsilon;
// PME fields
CUDAStream<float>* psQfac;
CUDAStream<float4>* psThetai1;
CUDAStream<float4>* psThetai2;
CUDAStream<float4>* psThetai3;
CUDAStream<int4>* psIgrid;
}; };
typedef struct _amoebaGpuContext *amoebaGpuContext; typedef struct _amoebaGpuContext *amoebaGpuContext;
......
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
#include "amoebaGpuTypes.h"
#include "amoebaCudaKernels.h"
//#define AMOEBA_DEBUG
static __constant__ cudaGmxSimulation cSim;
static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;
void SetCalculateAmoebaPMESim(amoebaGpuContext amoebaGpu)
{
cudaError_t status;
gpuContext gpu = amoebaGpu->gpuContext;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "SetCalculateAmoebaPMESim: cudaMemcpyToSymbol: SetSim copy to cSim failed");
status = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation));
RTERROR(status, "SetCalculateAmoebaPMESim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed");
}
#define ARRAY(x,y) array[(x)-1+((y)-1)*AMOEBA_PME_ORDER]
/**
* This is called from computeBsplines(). It calculates the spline coefficients for a single atom along a single axis.
*/
__device__ void computeBSplinePoint(float4* thetai, float w, float* array)
{
// real*8 thetai(4,AMOEBA_PME_ORDER)
// real*8 array(AMOEBA_PME_ORDER,AMOEBA_PME_ORDER)
// initialization to get to 2nd order recursion
ARRAY(2,2) = w;
ARRAY(2,1) = 1.0f - w;
// perform one pass to get to 3rd order recursion
ARRAY(3,3) = 0.5f * w * ARRAY(2,2);
ARRAY(3,2) = 0.5f * ((1.0f+w)*ARRAY(2,1)+(2.0f-w)*ARRAY(2,2));
ARRAY(3,1) = 0.5f * (1.0f-w) * ARRAY(2,1);
// compute standard B-spline recursion to desired order
for (int i = 4; i <= AMOEBA_PME_ORDER; i++)
{
int k = i - 1;
float denom = 1.0f / k;
ARRAY(i,i) = denom * w * ARRAY(k,k);
for (int j = 1; j <= i-2; j++)
ARRAY(i,i-j) = denom * ((w+j)*ARRAY(k,i-j-1)+(i-j-w)*ARRAY(k,i-j));
ARRAY(i,1) = denom * (1.0f-w) * ARRAY(k,1);
}
// get coefficients for the B-spline first derivative
int k = AMOEBA_PME_ORDER - 1;
ARRAY(k,AMOEBA_PME_ORDER) = ARRAY(k,AMOEBA_PME_ORDER-1);
for (int i = AMOEBA_PME_ORDER-1; i >= 2; i--)
ARRAY(k,i) = ARRAY(k,i-1) - ARRAY(k,i);
ARRAY(k,1) = -ARRAY(k,1);
// get coefficients for the B-spline second derivative
k = AMOEBA_PME_ORDER - 2;
ARRAY(k,AMOEBA_PME_ORDER-1) = ARRAY(k,AMOEBA_PME_ORDER-2);
for (int i = AMOEBA_PME_ORDER-2; i >= 2; i--)
ARRAY(k,i) = ARRAY(k,i-1) - ARRAY(k,i);
ARRAY(k,1) = -ARRAY(k,1);
ARRAY(k,AMOEBA_PME_ORDER) = ARRAY(k,AMOEBA_PME_ORDER-1);
for (int i = AMOEBA_PME_ORDER-1; i >= 2; i--)
ARRAY(k,i) = ARRAY(k,i-1) - ARRAY(k,i);
ARRAY(k,1) = -ARRAY(k,1);
// get coefficients for the B-spline third derivative
k = AMOEBA_PME_ORDER - 3;
ARRAY(k,AMOEBA_PME_ORDER-2) = ARRAY(k,AMOEBA_PME_ORDER-3);
for (int i = AMOEBA_PME_ORDER-3; i >= 2; i--)
ARRAY(k,i) = ARRAY(k,i-1) - ARRAY(k,i);
ARRAY(k,1) = -ARRAY(k,1);
ARRAY(k,AMOEBA_PME_ORDER-1) = ARRAY(k,AMOEBA_PME_ORDER-2);
for (int i = AMOEBA_PME_ORDER-2; i >= 2; i--)
ARRAY(k,i) = ARRAY(k,i-1) - ARRAY(k,i);
ARRAY(k,1) = -ARRAY(k,1);
ARRAY(k,AMOEBA_PME_ORDER) = ARRAY(k,AMOEBA_PME_ORDER-1);
for (int i = AMOEBA_PME_ORDER-1; i >= 2; i--)
ARRAY(k,i) = ARRAY(k,i-1) - ARRAY(k,i);
ARRAY(k,1) = -ARRAY(k,1);
// copy coefficients from temporary to permanent storage
for (int i = 1; i <= AMOEBA_PME_ORDER; i++)
thetai[i] = make_float4(ARRAY(AMOEBA_PME_ORDER,i), ARRAY(AMOEBA_PME_ORDER-1,i), ARRAY(AMOEBA_PME_ORDER-2,i), ARRAY(AMOEBA_PME_ORDER-3,i));
}
/**
* Compute bspline coefficients.
*/
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(256, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(256, 1)
#else
__launch_bounds__(128, 1)
#endif
void computeBsplines_kernel()
{
extern __shared__ float bsplines_cache[]; // size = block_size*pme_order*pme_order
float* array = &bsplines_cache[threadIdx.x*AMOEBA_PME_ORDER*AMOEBA_PME_ORDER];
// get the B-spline coefficients for each multipole site
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < cSim.atoms; i += blockDim.x*gridDim.x) {
float4 posq = cSim.pPosq[i];
posq.x -= floor(posq.x*cSim.invPeriodicBoxSizeX)*cSim.periodicBoxSizeX;
posq.y -= floor(posq.y*cSim.invPeriodicBoxSizeY)*cSim.periodicBoxSizeY;
posq.z -= floor(posq.z*cSim.invPeriodicBoxSizeZ)*cSim.periodicBoxSizeZ;
float3 t = make_float3((posq.x*cSim.invPeriodicBoxSizeX)*cSim.pmeGridSize.x,
(posq.y*cSim.invPeriodicBoxSizeY)*cSim.pmeGridSize.y,
(posq.z*cSim.invPeriodicBoxSizeZ)*cSim.pmeGridSize.z);
// First axis.
float w = posq.x*cSim.invPeriodicBoxSizeX;
float fr = cSim.pmeGridSize.x*(w-(int)(w+0.5f)+0.5f);
int ifr = (int) fr;
w = fr - ifr;
int igrid1 = ifr - AMOEBA_PME_ORDER;
computeBSplinePoint(&cAmoebaSim.pThetai1[i*AMOEBA_PME_ORDER], w, array);
// Second axis.
w = posq.y*cSim.invPeriodicBoxSizeY;
fr = cSim.pmeGridSize.y*(w-(int)(w+0.5f)+0.5f);
ifr = (int) fr;
w = fr - ifr;
int igrid2 = ifr - AMOEBA_PME_ORDER;
computeBSplinePoint(&cAmoebaSim.pThetai2[i*AMOEBA_PME_ORDER], w, array);
// Third axis.
w = posq.z*cSim.invPeriodicBoxSizeZ;
fr = cSim.pmeGridSize.z*(w-(int)(w+0.5f)+0.5f);
ifr = (int) fr;
w = fr - ifr;
int igrid3 = ifr - AMOEBA_PME_ORDER;
computeBSplinePoint(&cAmoebaSim.pThetai3[i*AMOEBA_PME_ORDER], w, array);
cAmoebaSim.pIgrid[i] = make_int4(igrid1, igrid2, igrid3, 0);
cSim.pPmeAtomGridIndex[i] = make_int2(i, igrid1*cSim.pmeGridSize.y*cSim.pmeGridSize.z+igrid2*cSim.pmeGridSize.z+igrid3);
}
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(1024, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(512, 1)
#else
__launch_bounds__(256, 1)
#endif
void kGridSpreadMultipoles_kernel()
{
unsigned int numGridPoints = cSim.pmeGridSize.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z;
unsigned int numThreads = gridDim.x*blockDim.x;
for (int gridIndex = blockIdx.x*blockDim.x+threadIdx.x; gridIndex < numGridPoints; gridIndex += numThreads)
{
int3 gridPoint;
gridPoint.x = gridIndex/(cSim.pmeGridSize.y*cSim.pmeGridSize.z);
int remainder = gridIndex-gridPoint.x*cSim.pmeGridSize.y*cSim.pmeGridSize.z;
gridPoint.y = remainder/cSim.pmeGridSize.z;
gridPoint.z = remainder-gridPoint.y*cSim.pmeGridSize.z;
float result = 0.0f;
for (int ix = 0; ix < PME_ORDER; ++ix)
{
int x = gridPoint.x-ix+(gridPoint.x >= ix ? 0 : cSim.pmeGridSize.x);
for (int iy = 0; iy < PME_ORDER; ++iy)
{
int y = gridPoint.y-iy+(gridPoint.y >= iy ? 0 : cSim.pmeGridSize.y);
int z1 = gridPoint.z-PME_ORDER+1;
z1 += (z1 >= 0 ? 0 : cSim.pmeGridSize.z);
int z2 = (z1 < gridPoint.z ? gridPoint.z : cSim.pmeGridSize.z-1);
int gridIndex1 = x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+y*cSim.pmeGridSize.z+z1;
int gridIndex2 = x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+y*cSim.pmeGridSize.z+z2;
int firstAtom = cSim.pPmeAtomRange[gridIndex1];
int lastAtom = cSim.pPmeAtomRange[gridIndex2+1];
for (int i = firstAtom; i < lastAtom; ++i)
{
int2 atomData = cSim.pPmeAtomGridIndex[i];
int atomIndex = atomData.x;
int z = atomData.y;
int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : cSim.pmeGridSize.z);
float atomCharge = cSim.pPosq[atomIndex].w;
float atomDipoleX = cAmoebaSim.pLabFrameDipole[atomIndex+3];
float atomDipoleY = cAmoebaSim.pLabFrameDipole[atomIndex+3+1];
float atomDipoleZ = cAmoebaSim.pLabFrameDipole[atomIndex+3+2];
float atomQuadrupoleXX = cAmoebaSim.pLabFrameQuadrupole[atomIndex+9];
float atomQuadrupoleXY = 2*cAmoebaSim.pLabFrameQuadrupole[atomIndex+9+1];
float atomQuadrupoleXZ = 2*cAmoebaSim.pLabFrameQuadrupole[atomIndex+9+2];
float atomQuadrupoleYY = cAmoebaSim.pLabFrameQuadrupole[atomIndex+9+4];
float atomQuadrupoleYZ = 2*cAmoebaSim.pLabFrameQuadrupole[atomIndex+9+5];
float atomQuadrupoleZZ = cAmoebaSim.pLabFrameQuadrupole[atomIndex+9+8];
float4 t = cAmoebaSim.pThetai1[atomIndex+ix*cSim.atoms];
float4 u = cAmoebaSim.pThetai2[atomIndex+iy*cSim.atoms];
float4 v = cAmoebaSim.pThetai3[atomIndex+iz*cSim.atoms];
float term0 = atomCharge*u.x*v.x + atomDipoleY*u.y*v.x + atomDipoleZ*u.x*v.y + atomQuadrupoleYY*u.z*v.x + atomQuadrupoleZZ*u.x*v.z + atomQuadrupoleYZ*u.y*v.y;
float term1 = atomDipoleX*u.x*v.x + atomQuadrupoleXY*u.y*v.x + atomQuadrupoleXZ*u.x*v.y;
float term2 = atomQuadrupoleXX * u.x * v.x;
result += term0*t.x + term1*t.y + term2*t.z;
}
if (z1 > gridPoint.z)
{
gridIndex1 = x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+y*cSim.pmeGridSize.z;
gridIndex2 = x*cSim.pmeGridSize.y*cSim.pmeGridSize.z+y*cSim.pmeGridSize.z+gridPoint.z;
firstAtom = cSim.pPmeAtomRange[gridIndex1];
lastAtom = cSim.pPmeAtomRange[gridIndex2+1];
for (int i = firstAtom; i < lastAtom; ++i)
{
int2 atomData = cSim.pPmeAtomGridIndex[i];
int atomIndex = atomData.x;
int z = atomData.y;
int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : cSim.pmeGridSize.z);
float atomCharge = cSim.pPosq[atomIndex].w;
float atomDipoleX = cAmoebaSim.pLabFrameDipole[atomIndex+3];
float atomDipoleY = cAmoebaSim.pLabFrameDipole[atomIndex+3+1];
float atomDipoleZ = cAmoebaSim.pLabFrameDipole[atomIndex+3+2];
float atomQuadrupoleXX = cAmoebaSim.pLabFrameQuadrupole[atomIndex+9];
float atomQuadrupoleXY = 2*cAmoebaSim.pLabFrameQuadrupole[atomIndex+9+1];
float atomQuadrupoleXZ = 2*cAmoebaSim.pLabFrameQuadrupole[atomIndex+9+2];
float atomQuadrupoleYY = cAmoebaSim.pLabFrameQuadrupole[atomIndex+9+4];
float atomQuadrupoleYZ = 2*cAmoebaSim.pLabFrameQuadrupole[atomIndex+9+5];
float atomQuadrupoleZZ = cAmoebaSim.pLabFrameQuadrupole[atomIndex+9+8];
float4 t = cAmoebaSim.pThetai1[atomIndex+ix*cSim.atoms];
float4 u = cAmoebaSim.pThetai2[atomIndex+iy*cSim.atoms];
float4 v = cAmoebaSim.pThetai3[atomIndex+iz*cSim.atoms];
float term0 = atomCharge*u.x*v.x + atomDipoleY*u.y*v.x + atomDipoleZ*u.x*v.y + atomQuadrupoleYY*u.z*v.x + atomQuadrupoleZZ*u.x*v.z + atomQuadrupoleYZ*u.y*v.y;
float term1 = atomDipoleX*u.x*v.x + atomQuadrupoleXY*u.y*v.x + atomQuadrupoleXZ*u.x*v.y;
float term2 = atomQuadrupoleXX * u.x * v.x;
result += term0*t.x + term1*t.y + term2*t.z;
}
}
}
}
cSim.pPmeGrid[gridIndex] = make_cuComplex(result*sqrt(cSim.epsfac), 0.0f);
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment