"devtools/vscode:/vscode.git/clone" did not exist on "3be0bf11d90561aa6f64f1c97772e07516f7c51a"
Commit c22f00cf authored by Mark Friedrichs's avatar Mark Friedrichs
Browse files

AmoebaCudaPmeMutualInducedField

parent 3763b76b
...@@ -2763,6 +2763,7 @@ void amoebaGpuSetConstants(amoebaGpuContext amoebaGpu) ...@@ -2763,6 +2763,7 @@ void amoebaGpuSetConstants(amoebaGpuContext amoebaGpu)
SetCalculateAmoebaCudaVdw14_7Sim( amoebaGpu ); SetCalculateAmoebaCudaVdw14_7Sim( amoebaGpu );
SetCalculateAmoebaCudaWcaDispersionSim( amoebaGpu ); SetCalculateAmoebaCudaWcaDispersionSim( amoebaGpu );
SetCalculateAmoebaCudaMutualInducedFieldSim( amoebaGpu ); SetCalculateAmoebaCudaMutualInducedFieldSim( amoebaGpu );
SetCalculateAmoebaCudaPmeMutualInducedFieldSim( amoebaGpu );
SetCalculateAmoebaCudaPmeFixedEFieldSim( amoebaGpu ); SetCalculateAmoebaCudaPmeFixedEFieldSim( amoebaGpu );
SetCalculateAmoebaElectrostaticSim( amoebaGpu ); SetCalculateAmoebaElectrostaticSim( amoebaGpu );
SetCalculateAmoebaRealSpaceEwaldSim( amoebaGpu ); SetCalculateAmoebaRealSpaceEwaldSim( amoebaGpu );
......
...@@ -88,6 +88,10 @@ extern void SetCalculateAmoebaCudaMutualInducedFieldSim(amoebaGpuContext gpu); ...@@ -88,6 +88,10 @@ extern void SetCalculateAmoebaCudaMutualInducedFieldSim(amoebaGpuContext gpu);
extern void GetCalculateAmoebaCudaMutualInducedFieldSim(amoebaGpuContext gpu); extern void GetCalculateAmoebaCudaMutualInducedFieldSim(amoebaGpuContext gpu);
extern void cudaComputeAmoebaMutualInducedField( amoebaGpuContext gpu); extern void cudaComputeAmoebaMutualInducedField( amoebaGpuContext gpu);
extern void SetCalculateAmoebaCudaPmeMutualInducedFieldSim(amoebaGpuContext gpu);
extern void GetCalculateAmoebaCudaPmeMutualInducedFieldSim(amoebaGpuContext gpu);
extern void cudaComputeAmoebaPmeMutualInducedField( amoebaGpuContext gpu);
// mutual induced and Gk // mutual induced and Gk
extern void SetCalculateAmoebaCudaMutualInducedAndGkFieldsSim(amoebaGpuContext amoebaGpu); extern void SetCalculateAmoebaCudaMutualInducedAndGkFieldsSim(amoebaGpuContext amoebaGpu);
......
...@@ -36,14 +36,11 @@ void GetCalculateAmoebaCudaMutualInducedAndGkFieldsSim(amoebaGpuContext amoebaGp ...@@ -36,14 +36,11 @@ void GetCalculateAmoebaCudaMutualInducedAndGkFieldsSim(amoebaGpuContext amoebaGp
//#define AMOEBA_DEBUG //#define AMOEBA_DEBUG
#undef AMOEBA_DEBUG #undef AMOEBA_DEBUG
__device__ void calculateMutualInducedAndGkFieldsPairIxn_kernel( float4 atomCoordinatesI, float4 atomCoordinatesJ, #define GK
float dampingFactorI, float dampingFactorJ, #include "kCalculateAmoebaCudaMutualInducedParticle.h"
float tholeI, float tholeJ, #undef GK
float* inducedDipoleI, float* inducedDipolePolarI,
float* inducedDipoleJ, float* inducedDipolePolarJ, __device__ void calculateMutualInducedAndGkFieldsPairIxn_kernel( MutualInducedParticle& atomI, MutualInducedParticle& atomJ,
float* inducedDipoleSI, float* inducedDipolePolarSI,
float* inducedDipoleSJ, float* inducedDipolePolarSJ,
float scalingDistanceCutoff,
float fields[8][3] float fields[8][3]
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
...@@ -59,9 +56,9 @@ __device__ void calculateMutualInducedAndGkFieldsPairIxn_kernel( float4 atomCoor ...@@ -59,9 +56,9 @@ __device__ void calculateMutualInducedAndGkFieldsPairIxn_kernel( float4 atomCoor
// get deltaR, and r between 2 atoms // get deltaR, and r between 2 atoms
deltaR[0] = atomCoordinatesJ.x - atomCoordinatesI.x; deltaR[0] = atomJ.x - atomI.x;
deltaR[1] = atomCoordinatesJ.y - atomCoordinatesI.y; deltaR[1] = atomJ.y - atomI.y;
deltaR[2] = atomCoordinatesJ.z - atomCoordinatesI.z; deltaR[2] = atomJ.z - atomI.z;
float r = sqrtf( deltaR[0]*deltaR[0] + deltaR[1]*deltaR[1] + deltaR[2]*deltaR[2] ); float r = sqrtf( deltaR[0]*deltaR[0] + deltaR[1]*deltaR[1] + deltaR[2]*deltaR[2] );
float rI = 1.0f/r; float rI = 1.0f/r;
...@@ -69,112 +66,59 @@ __device__ void calculateMutualInducedAndGkFieldsPairIxn_kernel( float4 atomCoor ...@@ -69,112 +66,59 @@ __device__ void calculateMutualInducedAndGkFieldsPairIxn_kernel( float4 atomCoor
float rr3 = -rI*r2I; float rr3 = -rI*r2I;
float rr5 = -3.0f*rr3*r2I; float rr5 = -3.0f*rr3*r2I;
float dampProd = dampingFactorI*dampingFactorJ; float dampProd = atomI.damp*atomJ.damp;
float ratio = (dampProd != 0.0f) ? (r/dampProd) : 1.0f; float ratio = (dampProd != 0.0f) ? (r/dampProd) : 1.0f;
float pGamma = tholeJ > tholeI ? tholeI : tholeJ; float pGamma = atomI.thole > atomJ.thole ? atomJ.thole: atomI.thole;
float damp = ratio*ratio*ratio*pGamma; float damp = ratio*ratio*ratio*pGamma;
float dampExp = ( (dampProd != 0.0f) && (r < scalingDistanceCutoff) ) ? expf( -damp ) : 0.0f; float dampExp = ( (dampProd != 0.0f) && (r < cAmoebaSim.scalingDistanceCutoff) ) ? expf( -damp ) : 0.0f;
rr3 *= (1.0f - dampExp); rr3 *= (1.0f - dampExp);
rr5 *= (1.0f - ( 1.0f + damp )*dampExp); rr5 *= (1.0f - ( 1.0f + damp )*dampExp);
float* dipole = inducedDipoleJ; float dDotDelta = rr5*(deltaR[0]*atomJ.inducedDipole[0] + deltaR[1]*atomJ.inducedDipole[1] + deltaR[2]*atomJ.inducedDipole[2] );
float dDotDelta = rr5*(deltaR[0]*dipole[0] + deltaR[1]*dipole[1] + deltaR[2]*dipole[2] ); fields[0][0] = rr3*atomJ.inducedDipole[0] + dDotDelta*deltaR[0];
fields[0][0] = rr3*dipole[0] + dDotDelta*deltaR[0]; fields[0][1] = rr3*atomJ.inducedDipole[1] + dDotDelta*deltaR[1];
fields[0][1] = rr3*dipole[1] + dDotDelta*deltaR[1]; fields[0][2] = rr3*atomJ.inducedDipole[2] + dDotDelta*deltaR[2];
fields[0][2] = rr3*dipole[2] + dDotDelta*deltaR[2];
dipole = inducedDipolePolarJ; dDotDelta = rr5*(deltaR[0]*atomJ.inducedDipolePolar[0] + deltaR[1]*atomJ.inducedDipolePolar[1] + deltaR[2]*atomJ.inducedDipolePolar[2] );
dDotDelta = rr5*(deltaR[0]*dipole[0] + deltaR[1]*dipole[1] + deltaR[2]*dipole[2] ); fields[1][0] = rr3*atomJ.inducedDipolePolar[0] + dDotDelta*deltaR[0];
fields[1][0] = rr3*dipole[0] + dDotDelta*deltaR[0]; fields[1][1] = rr3*atomJ.inducedDipolePolar[1] + dDotDelta*deltaR[1];
fields[1][1] = rr3*dipole[1] + dDotDelta*deltaR[1]; fields[1][2] = rr3*atomJ.inducedDipolePolar[2] + dDotDelta*deltaR[2];
fields[1][2] = rr3*dipole[2] + dDotDelta*deltaR[2];
dipole = inducedDipoleI; dDotDelta = rr5*(deltaR[0]*atomI.inducedDipole[0] + deltaR[1]*atomI.inducedDipole[1] + deltaR[2]*atomI.inducedDipole[2] );
dDotDelta = rr5*(deltaR[0]*dipole[0] + deltaR[1]*dipole[1] + deltaR[2]*dipole[2] ); fields[2][0] = rr3*atomI.inducedDipole[0] + dDotDelta*deltaR[0];
fields[2][0] = rr3*dipole[0] + dDotDelta*deltaR[0]; fields[2][1] = rr3*atomI.inducedDipole[1] + dDotDelta*deltaR[1];
fields[2][1] = rr3*dipole[1] + dDotDelta*deltaR[1]; fields[2][2] = rr3*atomI.inducedDipole[2] + dDotDelta*deltaR[2];
fields[2][2] = rr3*dipole[2] + dDotDelta*deltaR[2];
dipole = inducedDipolePolarI; dDotDelta = rr5*(deltaR[0]*atomI.inducedDipolePolar[0] + deltaR[1]*atomI.inducedDipolePolar[1] + deltaR[2]*atomI.inducedDipolePolar[2] );
dDotDelta = rr5*(deltaR[0]*dipole[0] + deltaR[1]*dipole[1] + deltaR[2]*dipole[2] ); fields[3][0] = rr3*atomI.inducedDipolePolar[0] + dDotDelta*deltaR[0];
fields[3][0] = rr3*dipole[0] + dDotDelta*deltaR[0]; fields[3][1] = rr3*atomI.inducedDipolePolar[1] + dDotDelta*deltaR[1];
fields[3][1] = rr3*dipole[1] + dDotDelta*deltaR[1]; fields[3][2] = rr3*atomI.inducedDipolePolar[2] + dDotDelta*deltaR[2];
fields[3][2] = rr3*dipole[2] + dDotDelta*deltaR[2];
dDotDelta = rr5*(deltaR[0]*atomJ.inducedDipoleS[0] + deltaR[1]*atomJ.inducedDipoleS[1] + deltaR[2]*atomJ.inducedDipoleS[2] );
dipole = inducedDipoleSJ; fields[4][0] = rr3*atomJ.inducedDipoleS[0] + dDotDelta*deltaR[0];
dDotDelta = rr5*(deltaR[0]*dipole[0] + deltaR[1]*dipole[1] + deltaR[2]*dipole[2] ); fields[4][1] = rr3*atomJ.inducedDipoleS[1] + dDotDelta*deltaR[1];
fields[4][0] = rr3*dipole[0] + dDotDelta*deltaR[0]; fields[4][2] = rr3*atomJ.inducedDipoleS[2] + dDotDelta*deltaR[2];
fields[4][1] = rr3*dipole[1] + dDotDelta*deltaR[1];
fields[4][2] = rr3*dipole[2] + dDotDelta*deltaR[2];
dipole = inducedDipolePolarSJ; dDotDelta = rr5*(deltaR[0]*atomJ.inducedDipolePolarS[0] + deltaR[1]*atomJ.inducedDipolePolarS[1] + deltaR[2]*atomJ.inducedDipolePolarS[2] );
dDotDelta = rr5*(deltaR[0]*dipole[0] + deltaR[1]*dipole[1] + deltaR[2]*dipole[2] ); fields[5][0] = rr3*atomJ.inducedDipolePolarS[0] + dDotDelta*deltaR[0];
fields[5][0] = rr3*dipole[0] + dDotDelta*deltaR[0]; fields[5][1] = rr3*atomJ.inducedDipolePolarS[1] + dDotDelta*deltaR[1];
fields[5][1] = rr3*dipole[1] + dDotDelta*deltaR[1]; fields[5][2] = rr3*atomJ.inducedDipolePolarS[2] + dDotDelta*deltaR[2];
fields[5][2] = rr3*dipole[2] + dDotDelta*deltaR[2];
dipole = inducedDipoleSI; dDotDelta = rr5*(deltaR[0]*atomI.inducedDipoleS[0] + deltaR[1]*atomI.inducedDipoleS[1] + deltaR[2]*atomI.inducedDipoleS[2] );
dDotDelta = rr5*(deltaR[0]*dipole[0] + deltaR[1]*dipole[1] + deltaR[2]*dipole[2] ); fields[6][0] = rr3*atomI.inducedDipoleS[0] + dDotDelta*deltaR[0];
fields[6][0] = rr3*dipole[0] + dDotDelta*deltaR[0]; fields[6][1] = rr3*atomI.inducedDipoleS[1] + dDotDelta*deltaR[1];
fields[6][1] = rr3*dipole[1] + dDotDelta*deltaR[1]; fields[6][2] = rr3*atomI.inducedDipoleS[2] + dDotDelta*deltaR[2];
fields[6][2] = rr3*dipole[2] + dDotDelta*deltaR[2];
dipole = inducedDipolePolarSI; dDotDelta = rr5*(deltaR[0]*atomI.inducedDipolePolarS[0] + deltaR[1]*atomI.inducedDipolePolarS[1] + deltaR[2]*atomI.inducedDipolePolarS[2] );
dDotDelta = rr5*(deltaR[0]*dipole[0] + deltaR[1]*dipole[1] + deltaR[2]*dipole[2] ); fields[7][0] = rr3*atomI.inducedDipolePolarS[0] + dDotDelta*deltaR[0];
fields[7][0] = rr3*dipole[0] + dDotDelta*deltaR[0]; fields[7][1] = rr3*atomI.inducedDipolePolarS[1] + dDotDelta*deltaR[1];
fields[7][1] = rr3*dipole[1] + dDotDelta*deltaR[1]; fields[7][2] = rr3*atomI.inducedDipolePolarS[2] + dDotDelta*deltaR[2];
fields[7][2] = rr3*dipole[2] + dDotDelta*deltaR[2];
#ifdef AMOEBA_DEBUGX
//int n2 = numberOfAtoms*numberOfAtoms;
//int debugIndex = atomI*numberOfAtoms + atomJ;
if( atomI == 0 ){
int debugIndex = atomJ;
debugArray[debugIndex].x = rr3;
debugArray[debugIndex].y = rr5;
dipole = &inducedDipole[atomJ*3];
debugArray[debugIndex].z = (deltaR[0]*dipole[0] + deltaR[1]*dipole[1] + deltaR[2]*dipole[2] );
dipole = &inducedDipole[atomI*3];
debugArray[debugIndex].w = (deltaR[0]*dipole[0] + deltaR[1]*dipole[1] + deltaR[2]*dipole[2] );
debugIndex += numberOfAtoms;
dipole = &inducedDipolePolar[atomJ*3];
debugArray[debugIndex].x = (deltaR[0]*dipole[0] + deltaR[1]*dipole[1] + deltaR[2]*dipole[2] );
dipole = &inducedDipolePolar[atomI*3];
debugArray[debugIndex].y = (deltaR[0]*dipole[0] + deltaR[1]*dipole[1] + deltaR[2]*dipole[2] );
debugIndex += numberOfAtoms;
debugArray[debugIndex].x = fields1[atomI*numberOfAtoms*3 + atomJ*3 +0];
debugArray[debugIndex].y = fields1[atomI*numberOfAtoms*3 + atomJ*3 +1];
debugArray[debugIndex].z = fields1[atomI*numberOfAtoms*3 + atomJ*3 +2];
debugIndex += numberOfAtoms;
debugArray[debugIndex].x = fields1[atomJ*numberOfAtoms*3 + atomI*3 +0];
debugArray[debugIndex].y = fields1[atomJ*numberOfAtoms*3 + atomI*3 +1];
debugArray[debugIndex].z = fields1[atomJ*numberOfAtoms*3 + atomI*3 +2];
debugIndex += numberOfAtoms;
debugArray[debugIndex].x = fields2[atomI*numberOfAtoms*3 + atomJ*3 +0];
debugArray[debugIndex].y = fields2[atomI*numberOfAtoms*3 + atomJ*3 +1];
debugArray[debugIndex].z = fields2[atomI*numberOfAtoms*3 + atomJ*3 +2];
debugIndex += numberOfAtoms;
debugArray[debugIndex].x = fields2[atomJ*numberOfAtoms*3 + atomI*3 +0];
debugArray[debugIndex].y = fields2[atomJ*numberOfAtoms*3 + atomI*3 +1];
debugArray[debugIndex].z = fields2[atomJ*numberOfAtoms*3 + atomI*3 +2];
}
#endif
} }
__device__ void calculateMutualInducedAndGkFieldsGkPairIxn_kernel( float4 atomCoordinatesI, float4 atomCoordinatesJ, float rb2, __device__ void calculateMutualInducedAndGkFieldsGkPairIxn_kernel( MutualInducedParticle& atomI, MutualInducedParticle& atomJ,
float* inducedDipoleSI, float* inducedDipolePolarSI,
float* inducedDipoleSJ, float* inducedDipolePolarSJ,
float gkField[8][3] float gkField[8][3]
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
...@@ -191,14 +135,16 @@ __device__ void calculateMutualInducedAndGkFieldsGkPairIxn_kernel( float4 atomCo ...@@ -191,14 +135,16 @@ __device__ void calculateMutualInducedAndGkFieldsGkPairIxn_kernel( float4 atomCo
// --------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------
float xr = atomCoordinatesJ.x - atomCoordinatesI.x; float xr = atomJ.x - atomI.x;
float yr = atomCoordinatesJ.y - atomCoordinatesI.y; float yr = atomJ.y - atomI.y;
float zr = atomCoordinatesJ.z - atomCoordinatesI.z; float zr = atomJ.z - atomI.z;
float xr2 = xr*xr; float xr2 = xr*xr;
float yr2 = yr*yr; float yr2 = yr*yr;
float zr2 = zr*zr; float zr2 = zr*zr;
float rb2 = atomI.bornRadius*atomJ.bornRadius;
float r2 = xr2 + yr2 + zr2; float r2 = xr2 + yr2 + zr2;
float expterm = expf(-r2/(cAmoebaSim.gkc*rb2)); float expterm = expf(-r2/(cAmoebaSim.gkc*rb2));
float expc = expterm /cAmoebaSim.gkc; float expc = expterm /cAmoebaSim.gkc;
...@@ -209,21 +155,21 @@ __device__ void calculateMutualInducedAndGkFieldsGkPairIxn_kernel( float4 atomCo ...@@ -209,21 +155,21 @@ __device__ void calculateMutualInducedAndGkFieldsGkPairIxn_kernel( float4 atomCo
float gf3 = gf2 * gf; float gf3 = gf2 * gf;
float gf5 = gf3 * gf2; float gf5 = gf3 * gf2;
float duixs = inducedDipoleSI[0]; float duixs = atomI.inducedDipoleS[0];
float duiys = inducedDipoleSI[1]; float duiys = atomI.inducedDipoleS[1];
float duizs = inducedDipoleSI[2]; float duizs = atomI.inducedDipoleS[2];
float puixs = inducedDipolePolarSI[0]; float puixs = atomI.inducedDipolePolarS[0];
float puiys = inducedDipolePolarSI[1]; float puiys = atomI.inducedDipolePolarS[1];
float puizs = inducedDipolePolarSI[2]; float puizs = atomI.inducedDipolePolarS[2];
float dukxs = inducedDipoleSJ[0]; float dukxs = atomJ.inducedDipoleS[0];
float dukys = inducedDipoleSJ[1]; float dukys = atomJ.inducedDipoleS[1];
float dukzs = inducedDipoleSJ[2]; float dukzs = atomJ.inducedDipoleS[2];
float pukxs = inducedDipolePolarSJ[0]; float pukxs = atomJ.inducedDipolePolarS[0];
float pukys = inducedDipolePolarSJ[1]; float pukys = atomJ.inducedDipolePolarS[1];
float pukzs = inducedDipolePolarSJ[2]; float pukzs = atomJ.inducedDipolePolarS[2];
// reaction potential auxiliary terms // reaction potential auxiliary terms
...@@ -280,9 +226,6 @@ __device__ static int debugAccumulate( int index, float4* debugArray, float* fie ...@@ -280,9 +226,6 @@ __device__ static int debugAccumulate( int index, float4* debugArray, float* fie
} }
#endif #endif
#define GK
#include "kCalculateAmoebaCudaMutualInducedParticle.h"
#undef GK
// Include versions of the kernels for N^2 calculations. // Include versions of the kernels for N^2 calculations.
...@@ -587,24 +530,16 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon ...@@ -587,24 +530,16 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
} }
if (gpu->bOutputBufferPerWarp){ if (gpu->bOutputBufferPerWarp){
#if 0 kCalculateAmoebaMutualInducedAndGkFieldsN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
(void) fprintf( amoebaGpu->log, "N2 warp\n" ); (void) fflush( amoebaGpu->log );
kCalculateAmoebaMutualInducedAndGkFieldsN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks,
amoebaGpu->nonbondThreadsPerBlock,
sizeof(MutualInducedParticle)*amoebaGpu->nonbondThreadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevStream[0], amoebaGpu->psWorkUnit->_pDevStream[0],
gpu->psPosq4->_pDevStream[0],
amoebaGpu->psInducedDipole->_pDevStream[0],
amoebaGpu->psInducedDipolePolar->_pDevStream[0],
amoebaGpu->psWorkArray_3_1->_pDevStream[0], amoebaGpu->psWorkArray_3_1->_pDevStream[0],
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevStream[0], amoebaGpu->psWorkArray_3_2->_pDevStream[0],
amoebaGpu->psWorkArray_3_3->_pDevStream[0],
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_4->_pDevStream[0],
debugArray->_pDevStream[0], targetAtom ); debugArray->_pDevStream[0], targetAtom );
#else #else
amoebaGpu->psWorkArray_3_2->_pDevStream[0] ); amoebaGpu->psWorkArray_3_4->_pDevStream[0] );
#endif
#endif #endif
} else { } else {
...@@ -619,12 +554,6 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon ...@@ -619,12 +554,6 @@ static void cudaComputeAmoebaMutualInducedAndGkFieldMatrixMultiply( amoebaGpuCon
kCalculateAmoebaMutualInducedAndGkFieldsN2_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>( kCalculateAmoebaMutualInducedAndGkFieldsN2_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevStream[0], amoebaGpu->psWorkUnit->_pDevStream[0],
gpu->psPosq4->_pDevStream[0],
gpu->psBornRadii->_pDevStream[0],
amoebaGpu->psInducedDipole->_pDevStream[0],
amoebaGpu->psInducedDipolePolar->_pDevStream[0],
amoebaGpu->psInducedDipoleS->_pDevStream[0],
amoebaGpu->psInducedDipolePolarS->_pDevStream[0],
amoebaGpu->psWorkArray_3_1->_pDevStream[0], amoebaGpu->psWorkArray_3_1->_pDevStream[0],
amoebaGpu->psWorkArray_3_2->_pDevStream[0], amoebaGpu->psWorkArray_3_2->_pDevStream[0],
amoebaGpu->psWorkArray_3_3->_pDevStream[0], amoebaGpu->psWorkArray_3_3->_pDevStream[0],
......
...@@ -36,12 +36,6 @@ __launch_bounds__(64, 1) ...@@ -36,12 +36,6 @@ __launch_bounds__(64, 1)
#endif #endif
void METHOD_NAME(kCalculateAmoebaMutualInducedAndGkFields, _kernel)( void METHOD_NAME(kCalculateAmoebaMutualInducedAndGkFields, _kernel)(
unsigned int* workUnit, unsigned int* workUnit,
float4* atomCoord,
float* bornRadii,
float* inducedDipole,
float* inducedDipolePolar,
float* inducedDipoleS,
float* inducedDipolePolarS,
float* outputField, float* outputField,
float* outputFieldPolar, float* outputFieldPolar,
float* outputFieldS, float* outputFieldS,
...@@ -60,13 +54,6 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedAndGkFields, _kernel)( ...@@ -60,13 +54,6 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedAndGkFields, _kernel)(
unsigned int end = (warp+1)*numWorkUnits/totalWarps; unsigned int end = (warp+1)*numWorkUnits/totalWarps;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
float4 jCoord;
float jBornRadius;
float jDipole[3];
float jDipolePolar[3];
float jDipoleS[3];
float jDipolePolarS[3];
while (pos < end) while (pos < end)
{ {
...@@ -84,7 +71,8 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedAndGkFields, _kernel)( ...@@ -84,7 +71,8 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedAndGkFields, _kernel)(
MutualInducedParticle* psA = &sA[tbx]; MutualInducedParticle* psA = &sA[tbx];
unsigned int atomI = x + tgx; unsigned int atomI = x + tgx;
float4 iCoord = atomCoord[atomI]; MutualInducedParticle localParticle;
loadMutualInducedShared( &localParticle, atomI );
float fieldSum[3]; float fieldSum[3];
float fieldPolarSum[3]; float fieldPolarSum[3];
...@@ -115,9 +103,7 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedAndGkFields, _kernel)( ...@@ -115,9 +103,7 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedAndGkFields, _kernel)(
// load shared data // load shared data
loadMutualInducedShared( &(sA[threadIdx.x]), atomI, loadMutualInducedShared( &(sA[threadIdx.x]), atomI );
atomCoord, inducedDipole, inducedDipolePolar, cAmoebaSim.pDampingFactorAndThole,
bornRadii, inducedDipoleS, inducedDipolePolarS );
for (unsigned int j = 0; j < GRID; j++) for (unsigned int j = 0; j < GRID; j++)
{ {
...@@ -126,16 +112,7 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedAndGkFields, _kernel)( ...@@ -126,16 +112,7 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedAndGkFields, _kernel)(
// load coords, charge, ... // load coords, charge, ...
loadMutualInducedData( &(psA[j]), &jCoord, jDipole, jDipolePolar, &jBornRadius, jDipoleS, jDipolePolarS ); calculateMutualInducedAndGkFieldsPairIxn_kernel( localParticle, psA[j], ijField
calculateMutualInducedAndGkFieldsPairIxn_kernel( iCoord, jCoord,
cAmoebaSim.pDampingFactorAndThole[atomI].x, psA[j].damp,
cAmoebaSim.pDampingFactorAndThole[atomI].y, psA[j].thole,
&(inducedDipole[atomI*3]), &(inducedDipolePolar[atomI*3]),
jDipole, jDipolePolar,
&(inducedDipoleS[atomI*3]), &(inducedDipolePolarS[atomI*3]),
jDipoleS, jDipolePolarS,
cAmoebaSim.scalingDistanceCutoff, ijField
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
, debugArray , debugArray
#endif #endif
...@@ -177,10 +154,7 @@ if( atomI == targetAtom ){ ...@@ -177,10 +154,7 @@ if( atomI == targetAtom ){
index = debugAccumulate( index, debugArray, ijField[5], mask, 4.0f ); index = debugAccumulate( index, debugArray, ijField[5], mask, 4.0f );
} }
#endif #endif
calculateMutualInducedAndGkFieldsGkPairIxn_kernel( iCoord, jCoord, bornRadii[atomI]*jBornRadius, calculateMutualInducedAndGkFieldsGkPairIxn_kernel( localParticle, psA[j], ijField
&(inducedDipoleS[atomI*3]), &(inducedDipolePolarS[atomI*3]),
jDipoleS, jDipolePolarS,
ijField
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
, debugArray , debugArray
#endif #endif
...@@ -247,9 +221,7 @@ if( atomI == targetAtom ){ ...@@ -247,9 +221,7 @@ if( atomI == targetAtom ){
{ {
// load coordinates, charge, ... // load coordinates, charge, ...
loadMutualInducedShared( &(sA[threadIdx.x]), (y+tgx), loadMutualInducedShared( &(sA[threadIdx.x]), (y+tgx) );
atomCoord, inducedDipole,
inducedDipolePolar, cAmoebaSim.pDampingFactorAndThole, bornRadii, inducedDipoleS, inducedDipolePolarS );
} }
// zero shared fields // zero shared fields
...@@ -263,16 +235,7 @@ if( atomI == targetAtom ){ ...@@ -263,16 +235,7 @@ if( atomI == targetAtom ){
// load coords, charge, ... // load coords, charge, ...
loadMutualInducedData( &(psA[tj]), &jCoord, jDipole, jDipolePolar, &jBornRadius, jDipoleS, jDipolePolarS ); calculateMutualInducedAndGkFieldsPairIxn_kernel( localParticle, psA[tj], ijField
calculateMutualInducedAndGkFieldsPairIxn_kernel( iCoord, jCoord,
cAmoebaSim.pDampingFactorAndThole[atomI].x, psA[tj].damp,
cAmoebaSim.pDampingFactorAndThole[atomI].y, psA[tj].thole,
&(inducedDipole[atomI*3]), &(inducedDipolePolar[atomI*3]),
jDipole, jDipolePolar,
&(inducedDipoleS[atomI*3]), &(inducedDipolePolarS[atomI*3]),
jDipoleS, jDipolePolarS,
cAmoebaSim.scalingDistanceCutoff, ijField
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
, debugArray , debugArray
#endif #endif
...@@ -344,10 +307,7 @@ if( atomI == targetAtom || (y + tj) == targetAtom ){ ...@@ -344,10 +307,7 @@ if( atomI == targetAtom || (y + tj) == targetAtom ){
index = debugAccumulate( index, debugArray, ijField[indexI+5], maskD, -4.0f ); index = debugAccumulate( index, debugArray, ijField[indexI+5], maskD, -4.0f );
} }
#endif #endif
calculateMutualInducedAndGkFieldsGkPairIxn_kernel( iCoord, jCoord, bornRadii[atomI]*jBornRadius, calculateMutualInducedAndGkFieldsGkPairIxn_kernel( localParticle, psA[tj], ijField
&(inducedDipoleS[atomI*3]), &(inducedDipolePolarS[atomI*3]),
jDipoleS, jDipolePolarS,
ijField
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
, debugArray , debugArray
#endif #endif
......
...@@ -36,12 +36,9 @@ void GetCalculateAmoebaCudaMutualInducedFieldSim(amoebaGpuContext amoebaGpu) ...@@ -36,12 +36,9 @@ void GetCalculateAmoebaCudaMutualInducedFieldSim(amoebaGpuContext amoebaGpu)
//#define AMOEBA_DEBUG //#define AMOEBA_DEBUG
#undef AMOEBA_DEBUG #undef AMOEBA_DEBUG
__device__ void calculateMutualInducedFieldPairIxn_kernel( float4 atomCoordinatesI, float4 atomCoordinatesJ, #include "kCalculateAmoebaCudaMutualInducedParticle.h"
float dampingFactorI, float dampingFactorJ,
float tholeI, float tholeJ, __device__ void calculateMutualInducedFieldPairIxn_kernel( MutualInducedParticle& atomI, MutualInducedParticle& atomJ,
float* inducedDipoleI, float* inducedDipolePolarI,
float* inducedDipoleJ, float* inducedDipolePolarJ,
float scalingDistanceCutoff,
float fields[4][3] float fields[4][3]
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
...@@ -57,9 +54,9 @@ __device__ void calculateMutualInducedFieldPairIxn_kernel( float4 atomCoordinate ...@@ -57,9 +54,9 @@ __device__ void calculateMutualInducedFieldPairIxn_kernel( float4 atomCoordinate
// get deltaR, and r between 2 atoms // get deltaR, and r between 2 atoms
deltaR[0] = atomCoordinatesJ.x - atomCoordinatesI.x; deltaR[0] = atomJ.x - atomI.x;
deltaR[1] = atomCoordinatesJ.y - atomCoordinatesI.y; deltaR[1] = atomJ.y - atomI.y;
deltaR[2] = atomCoordinatesJ.z - atomCoordinatesI.z; deltaR[2] = atomJ.z - atomI.z;
float r = sqrtf( deltaR[0]*deltaR[0] + deltaR[1]*deltaR[1] + deltaR[2]*deltaR[2] ); float r = sqrtf( deltaR[0]*deltaR[0] + deltaR[1]*deltaR[1] + deltaR[2]*deltaR[2] );
float rI = 1.0f/r; float rI = 1.0f/r;
...@@ -67,87 +64,36 @@ __device__ void calculateMutualInducedFieldPairIxn_kernel( float4 atomCoordinate ...@@ -67,87 +64,36 @@ __device__ void calculateMutualInducedFieldPairIxn_kernel( float4 atomCoordinate
float rr3 = -rI*r2I; float rr3 = -rI*r2I;
float rr5 = -3.0f*rr3*r2I; float rr5 = -3.0f*rr3*r2I;
float dampProd = dampingFactorI*dampingFactorJ; float dampProd = atomI.damp*atomJ.damp;
float ratio = (dampProd != 0.0f) ? (r/dampProd) : 1.0f; float ratio = (dampProd != 0.0f) ? (r/dampProd) : 1.0f;
float pGamma = tholeJ > tholeI ? tholeI : tholeJ; float pGamma = atomJ.thole > atomI.thole ? atomI.thole: atomJ.thole;
float damp = ratio*ratio*ratio*pGamma; float damp = ratio*ratio*ratio*pGamma;
float dampExp = ( (dampProd != 0.0f) && (r < scalingDistanceCutoff) ) ? expf( -damp ) : 0.0f; float dampExp = ( (dampProd != 0.0f) && (r < cAmoebaSim.scalingDistanceCutoff) ) ? expf( -damp ) : 0.0f;
rr3 *= (1.0f - dampExp); rr3 *= (1.0f - dampExp);
rr5 *= (1.0f - ( 1.0f + damp )*dampExp); rr5 *= (1.0f - ( 1.0f + damp )*dampExp);
float* dipole = inducedDipoleJ; float dDotDelta = rr5*(deltaR[0]*atomJ.inducedDipole[0] + deltaR[1]*atomJ.inducedDipole[1] + deltaR[2]*atomJ.inducedDipole[2] );
float dDotDelta = rr5*(deltaR[0]*dipole[0] + deltaR[1]*dipole[1] + deltaR[2]*dipole[2] ); fields[0][0] = rr3*atomJ.inducedDipole[0] + dDotDelta*deltaR[0];
fields[0][0] = rr3*dipole[0] + dDotDelta*deltaR[0]; fields[0][1] = rr3*atomJ.inducedDipole[1] + dDotDelta*deltaR[1];
fields[0][1] = rr3*dipole[1] + dDotDelta*deltaR[1]; fields[0][2] = rr3*atomJ.inducedDipole[2] + dDotDelta*deltaR[2];
fields[0][2] = rr3*dipole[2] + dDotDelta*deltaR[2];
dipole = inducedDipolePolarJ; dDotDelta = rr5*(deltaR[0]*atomJ.inducedDipolePolar[0] + deltaR[1]*atomJ.inducedDipolePolar[1] + deltaR[2]*atomJ.inducedDipolePolar[2] );
dDotDelta = rr5*(deltaR[0]*dipole[0] + deltaR[1]*dipole[1] + deltaR[2]*dipole[2] ); fields[1][0] = rr3*atomJ.inducedDipolePolar[0] + dDotDelta*deltaR[0];
fields[1][0] = rr3*dipole[0] + dDotDelta*deltaR[0]; fields[1][1] = rr3*atomJ.inducedDipolePolar[1] + dDotDelta*deltaR[1];
fields[1][1] = rr3*dipole[1] + dDotDelta*deltaR[1]; fields[1][2] = rr3*atomJ.inducedDipolePolar[2] + dDotDelta*deltaR[2];
fields[1][2] = rr3*dipole[2] + dDotDelta*deltaR[2];
dipole = inducedDipoleI; dDotDelta = rr5*(deltaR[0]*atomI.inducedDipole[0] + deltaR[1]*atomI.inducedDipole[1] + deltaR[2]*atomI.inducedDipole[2] );
dDotDelta = rr5*(deltaR[0]*dipole[0] + deltaR[1]*dipole[1] + deltaR[2]*dipole[2] ); fields[2][0] = rr3*atomI.inducedDipole[0] + dDotDelta*deltaR[0];
fields[2][0] = rr3*dipole[0] + dDotDelta*deltaR[0]; fields[2][1] = rr3*atomI.inducedDipole[1] + dDotDelta*deltaR[1];
fields[2][1] = rr3*dipole[1] + dDotDelta*deltaR[1]; fields[2][2] = rr3*atomI.inducedDipole[2] + dDotDelta*deltaR[2];
fields[2][2] = rr3*dipole[2] + dDotDelta*deltaR[2];
dipole = inducedDipolePolarI; dDotDelta = rr5*(deltaR[0]*atomI.inducedDipolePolar[0] + deltaR[1]*atomI.inducedDipolePolar[1] + deltaR[2]*atomI.inducedDipolePolar[2] );
dDotDelta = rr5*(deltaR[0]*dipole[0] + deltaR[1]*dipole[1] + deltaR[2]*dipole[2] ); fields[3][0] = rr3*atomI.inducedDipolePolar[0] + dDotDelta*deltaR[0];
fields[3][0] = rr3*dipole[0] + dDotDelta*deltaR[0]; fields[3][1] = rr3*atomI.inducedDipolePolar[1] + dDotDelta*deltaR[1];
fields[3][1] = rr3*dipole[1] + dDotDelta*deltaR[1]; fields[3][2] = rr3*atomI.inducedDipolePolar[2] + dDotDelta*deltaR[2];
fields[3][2] = rr3*dipole[2] + dDotDelta*deltaR[2];
#ifdef AMOEBA_DEBUGX
//int n2 = numberOfAtoms*numberOfAtoms;
//int debugIndex = atomI*numberOfAtoms + atomJ;
if( atomI == 0 ){
int debugIndex = atomJ;
debugArray[debugIndex].x = rr3;
debugArray[debugIndex].y = rr5;
dipole = &inducedDipole[atomJ*3];
debugArray[debugIndex].z = (deltaR[0]*dipole[0] + deltaR[1]*dipole[1] + deltaR[2]*dipole[2] );
dipole = &inducedDipole[atomI*3];
debugArray[debugIndex].w = (deltaR[0]*dipole[0] + deltaR[1]*dipole[1] + deltaR[2]*dipole[2] );
debugIndex += numberOfAtoms;
dipole = &inducedDipolePolar[atomJ*3];
debugArray[debugIndex].x = (deltaR[0]*dipole[0] + deltaR[1]*dipole[1] + deltaR[2]*dipole[2] );
dipole = &inducedDipolePolar[atomI*3];
debugArray[debugIndex].y = (deltaR[0]*dipole[0] + deltaR[1]*dipole[1] + deltaR[2]*dipole[2] );
debugIndex += numberOfAtoms;
debugArray[debugIndex].x = fields1[atomI*numberOfAtoms*3 + atomJ*3 +0];
debugArray[debugIndex].y = fields1[atomI*numberOfAtoms*3 + atomJ*3 +1];
debugArray[debugIndex].z = fields1[atomI*numberOfAtoms*3 + atomJ*3 +2];
debugIndex += numberOfAtoms;
debugArray[debugIndex].x = fields1[atomJ*numberOfAtoms*3 + atomI*3 +0];
debugArray[debugIndex].y = fields1[atomJ*numberOfAtoms*3 + atomI*3 +1];
debugArray[debugIndex].z = fields1[atomJ*numberOfAtoms*3 + atomI*3 +2];
debugIndex += numberOfAtoms;
debugArray[debugIndex].x = fields2[atomI*numberOfAtoms*3 + atomJ*3 +0];
debugArray[debugIndex].y = fields2[atomI*numberOfAtoms*3 + atomJ*3 +1];
debugArray[debugIndex].z = fields2[atomI*numberOfAtoms*3 + atomJ*3 +2];
debugIndex += numberOfAtoms;
debugArray[debugIndex].x = fields2[atomJ*numberOfAtoms*3 + atomI*3 +0];
debugArray[debugIndex].y = fields2[atomJ*numberOfAtoms*3 + atomI*3 +1];
debugArray[debugIndex].z = fields2[atomJ*numberOfAtoms*3 + atomI*3 +2];
}
#endif
} }
#include "kCalculateAmoebaCudaMutualInducedParticle.h"
// Include versions of the kernels for N^2 calculations. // Include versions of the kernels for N^2 calculations.
#define METHOD_NAME(a, b) a##N2##b #define METHOD_NAME(a, b) a##N2##b
...@@ -230,18 +176,9 @@ void kReduceMutualInducedFieldDelta_kernel(int numberOfEntries, float* arrayOfDe ...@@ -230,18 +176,9 @@ void kReduceMutualInducedFieldDelta_kernel(int numberOfEntries, float* arrayOfDe
{ {
epsilon[0] = delta[0].x > delta[0].y ? delta[0].x : delta[0].y; epsilon[0] = delta[0].x > delta[0].y ? delta[0].x : delta[0].y;
epsilon[0] = 4.8033324f*sqrtf( epsilon[0]/( (float) (numberOfEntries/3)) ); epsilon[0] = 4.8033324f*sqrtf( epsilon[0]/( (float) (numberOfEntries/3)) );
#ifdef AMOEBA_DEBUG
epsilon[1] = 4.8033324f*sqrtf( delta[0].x/( (float) (numberOfEntries/3)) );
epsilon[2] = 4.8033324f*sqrtf( delta[0].y/( (float) (numberOfEntries/3)) );
#endif
} }
} }
/**
matrixProduct/matrixProductP contains epsilon**2 on output
*/
__global__ __global__
#if (__CUDA_ARCH__ >= 200) #if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1) __launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
...@@ -291,46 +228,6 @@ static void kReduceMutualInducedFields(amoebaGpuContext amoebaGpu, CUDAStream<fl ...@@ -291,46 +228,6 @@ static void kReduceMutualInducedFields(amoebaGpuContext amoebaGpu, CUDAStream<fl
LAUNCHERROR("kReduceMI_Fields2"); LAUNCHERROR("kReduceMI_Fields2");
} }
#ifdef AMOEBA_DEBUG
#if 0
static void printMiFieldBuffer( amoebaGpuContext amoebaGpu, unsigned int bufferIndex )
{
(void) fprintf( amoebaGpu->log, "MI Field Buffer %u\n", bufferIndex );
unsigned int start = bufferIndex*3*amoebaGpu->paddedNumberOfAtoms;
unsigned int stop = (bufferIndex+1)*3*amoebaGpu->paddedNumberOfAtoms;
for( unsigned int ii = start; ii < stop; ii += 3 ){
unsigned int ii3Index = ii/3;
unsigned int bufferIndex = ii3Index/(amoebaGpu->paddedNumberOfAtoms);
unsigned int particleIndex = ii3Index - bufferIndex*(amoebaGpu->paddedNumberOfAtoms);
(void) fprintf( amoebaGpu->log, " %6u %3u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n",
ii/3, bufferIndex, particleIndex,
amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii],
amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii+1],
amoebaGpu->psWorkArray_3_1->_pSysStream[0][ii+2],
amoebaGpu->psWorkArray_3_2->_pSysStream[0][ii],
amoebaGpu->psWorkArray_3_2->_pSysStream[0][ii+1],
amoebaGpu->psWorkArray_3_2->_pSysStream[0][ii+2] );
}
}
static void printMiFieldAtomBuffers( amoebaGpuContext amoebaGpu, unsigned int targetAtom )
{
(void) fprintf( amoebaGpu->log, "MI Field atom %u\n", targetAtom );
for( unsigned int ii = 0; ii < amoebaGpu->outputBuffers; ii++ ){
unsigned int particleIndex = 3*(targetAtom + ii*amoebaGpu->paddedNumberOfAtoms);
(void) fprintf( amoebaGpu->log, " %2u %6u [%14.6e %14.6e %14.6e] [%14.6e %14.6e %14.6e]\n",
ii, particleIndex,
amoebaGpu->psWorkArray_3_1->_pSysStream[0][particleIndex],
amoebaGpu->psWorkArray_3_1->_pSysStream[0][particleIndex+1],
amoebaGpu->psWorkArray_3_1->_pSysStream[0][particleIndex+2],
amoebaGpu->psWorkArray_3_2->_pSysStream[0][particleIndex],
amoebaGpu->psWorkArray_3_2->_pSysStream[0][particleIndex+1],
amoebaGpu->psWorkArray_3_2->_pSysStream[0][particleIndex+2] );
}
}
#endif
#endif
/**--------------------------------------------------------------------------------------- /**---------------------------------------------------------------------------------------
Compute mutual induce field Compute mutual induce field
...@@ -367,9 +264,6 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext ...@@ -367,9 +264,6 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext
amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->nonbondThreadsPerBlock,
sizeof(MutualInducedParticle)*amoebaGpu->nonbondThreadsPerBlock>>>( sizeof(MutualInducedParticle)*amoebaGpu->nonbondThreadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevStream[0], amoebaGpu->psWorkUnit->_pDevStream[0],
gpu->psPosq4->_pDevStream[0],
amoebaGpu->psInducedDipole->_pDevStream[0],
amoebaGpu->psInducedDipolePolar->_pDevStream[0],
amoebaGpu->psWorkArray_3_1->_pDevStream[0], amoebaGpu->psWorkArray_3_1->_pDevStream[0],
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevStream[0], amoebaGpu->psWorkArray_3_2->_pDevStream[0],
...@@ -392,9 +286,6 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext ...@@ -392,9 +286,6 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext
amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->nonbondThreadsPerBlock,
sizeof(MutualInducedParticle)*amoebaGpu->nonbondThreadsPerBlock>>>( sizeof(MutualInducedParticle)*amoebaGpu->nonbondThreadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevStream[0], amoebaGpu->psWorkUnit->_pDevStream[0],
gpu->psPosq4->_pDevStream[0],
amoebaGpu->psInducedDipole->_pDevStream[0],
amoebaGpu->psInducedDipolePolar->_pDevStream[0],
amoebaGpu->psWorkArray_3_1->_pDevStream[0], amoebaGpu->psWorkArray_3_1->_pDevStream[0],
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevStream[0], amoebaGpu->psWorkArray_3_2->_pDevStream[0],
...@@ -410,17 +301,8 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext ...@@ -410,17 +301,8 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext
kReduceMutualInducedFields( amoebaGpu, outputArray, outputPolarArray ); kReduceMutualInducedFields( amoebaGpu, outputArray, outputPolarArray );
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_1->Download(); amoebaGpu->psWorkArray_3_1->Download();
amoebaGpu->psWorkArray_3_2->Download(); amoebaGpu->psWorkArray_3_2->Download();
//printMiFieldAtomBuffers( amoebaGpu, (targetAtom + 0) );
//printMiFieldAtomBuffers( amoebaGpu, (targetAtom + 1) );
//printMiFieldAtomBuffers( amoebaGpu, 100 );
//printMiFieldBuffer( amoebaGpu, 0 );
//printMiFieldBuffer( amoebaGpu, 1 );
//printMiFieldBuffer( amoebaGpu, 37 );
//printMiFieldBuffer( amoebaGpu, 38 );
if( amoebaGpu->log && iteration == -1 ){ if( amoebaGpu->log && iteration == -1 ){
(void) fprintf( amoebaGpu->log, "Finished MI kernel execution %d\n", iteration ); (void) fflush( amoebaGpu->log ); (void) fprintf( amoebaGpu->log, "Finished MI kernel execution %d\n", iteration ); (void) fflush( amoebaGpu->log );
...@@ -523,50 +405,6 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext ...@@ -523,50 +405,6 @@ static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext
} }
(void) fprintf( amoebaGpu->log,"\n" ); (void) fprintf( amoebaGpu->log,"\n" );
count++; count++;
#if 0
//debugIndex += gpu->natoms;
debugIndex += paddedNumberOfAtoms;
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e]\n",
debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
debugArray->_pSysStream[0][debugIndex].z );
int index = 0;
sums[index][0] += debugArray->_pSysStream[0][debugIndex].x;
sums[index][1] += debugArray->_pSysStream[0][debugIndex].y;
sums[index][2] += debugArray->_pSysStream[0][debugIndex].z;
debugIndex += gpu->natoms;
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e]\n",
debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
debugArray->_pSysStream[0][debugIndex].z );
index++;
sums[index][0] += debugArray->_pSysStream[0][debugIndex].x;
sums[index][1] += debugArray->_pSysStream[0][debugIndex].y;
sums[index][2] += debugArray->_pSysStream[0][debugIndex].z;
debugIndex += gpu->natoms;
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e]\n",
debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
debugArray->_pSysStream[0][debugIndex].z );
index++;
sums[index][0] += debugArray->_pSysStream[0][debugIndex].x;
sums[index][1] += debugArray->_pSysStream[0][debugIndex].y;
sums[index][2] += debugArray->_pSysStream[0][debugIndex].z;
debugIndex += gpu->natoms;
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w );
index++;
sums[index][0] += debugArray->_pSysStream[0][debugIndex].x;
sums[index][1] += debugArray->_pSysStream[0][debugIndex].y;
sums[index][2] += debugArray->_pSysStream[0][debugIndex].z;
#endif
} }
(void) fprintf( amoebaGpu->log,"\n" ); (void) fprintf( amoebaGpu->log,"\n" );
...@@ -693,43 +531,6 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu ...@@ -693,43 +531,6 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpu, amoebaGpu->psWorkVector[0], amoebaGpu->psWorkVector[1] ); cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpu, amoebaGpu->psWorkVector[0], amoebaGpu->psWorkVector[1] );
LAUNCHERROR("cudaComputeAmoebaMutualInducedFieldMatrixMultiply Loop\n"); LAUNCHERROR("cudaComputeAmoebaMutualInducedFieldMatrixMultiply Loop\n");
//#ifdef AMOEBA_DEBUG
#if 0
if( 0 && amoebaGpu->log ){
amoebaGpu->psInducedDipole->Download();
amoebaGpu->psInducedDipolePolar->Download();
(void) fprintf( amoebaGpu->log, "%s Post cudaComputeAmoebaMutualInducedFieldMatrixMultiply\n", methodName );
int offset = 0;
int maxPrint = 20;
for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%4d ", ii );
(void) fprintf( amoebaGpu->log," MMi[%14.6e %14.6e %14.6e] ",
amoebaGpu->psInducedDipole->_pSysStream[0][offset], amoebaGpu->psInducedDipole->_pSysStream[0][offset+1], amoebaGpu->psInducedDipole->_pSysStream[0][offset+2] );
(void) fprintf( amoebaGpu->log,"MMip[%14.6e %14.6e %14.6e]\n",
amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset], amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset+1], amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset+2] );
if( ii == maxPrint && (ii < (gpu->natoms - maxPrint) ) ){
ii = (gpu->natoms - maxPrint);
offset = ii*3;
} else {
offset += 3;
}
}
(void) fflush( amoebaGpu->log );
// write results to file
if( iteration == -1 ){
fileId[1] = iteration;
cudaWriteFloat1AndFloat1ArraysToFile( gpu->natoms, "CudaMI_MatrixProduct1", fileId, 3, amoebaGpu->psWorkVector[0],
3, amoebaGpu->psWorkVector[1] );
}
}
#endif
// ---------------------------------------------------------------------------------------
// post matrix multiply // post matrix multiply
kSorUpdateMutualInducedField_kernel<<< numBlocks, numThreads >>>( kSorUpdateMutualInducedField_kernel<<< numBlocks, numThreads >>>(
...@@ -763,18 +564,8 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu ...@@ -763,18 +564,8 @@ static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu
if( amoebaGpu->log ){ if( amoebaGpu->log ){
amoebaGpu->psInducedDipole->Download(); amoebaGpu->psInducedDipole->Download();
amoebaGpu->psInducedDipolePolar->Download(); amoebaGpu->psInducedDipolePolar->Download();
#if 1 (void) fprintf( amoebaGpu->log, "%s iteration=%3d eps %14.6e done=%d\n",
(void) fprintf( amoebaGpu->log, "%s iteration=%3d eps %14.6e [%14.6e %14.6e] done=%d\n", methodName, iteration, amoebaGpu->mutualInducedCurrentEpsilon, done );
methodName, iteration, amoebaGpu->mutualInducedCurrentEpsilon,
amoebaGpu->psCurrentEpsilon->_pSysStream[0][1],
amoebaGpu->psCurrentEpsilon->_pSysStream[0][2], done );
#else
(void) fprintf( amoebaGpu->log, "%s iteration=%3d eps %14.6e %14.6e crrntEps=%14.6e %14.6e %14.6e %14.6e done=%d\n",
methodName, iteration, sum1, sum2, amoebaGpu->mutualInducedCurrentEpsilon,
amoebaGpu->psCurrentEpsilon->_pSysStream[0][0],
amoebaGpu->psCurrentEpsilon->_pSysStream[0][1],
amoebaGpu->psCurrentEpsilon->_pSysStream[0][2], done );
#endif
(void) fflush( amoebaGpu->log ); (void) fflush( amoebaGpu->log );
int offset = 0; int offset = 0;
......
...@@ -36,11 +36,7 @@ __launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1) ...@@ -36,11 +36,7 @@ __launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
#endif #endif
void METHOD_NAME(kCalculateAmoebaMutualInducedField, _kernel)( void METHOD_NAME(kCalculateAmoebaMutualInducedField, _kernel)(
unsigned int* workUnit, unsigned int* workUnit,
float4* atomCoord, float* outputField, float* outputFieldPolar
float* inducedDipole,
float* inducedDipolePolar,
float* outputField,
float* outputFieldPolar
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
, float4* debugArray, unsigned int targetAtom , float4* debugArray, unsigned int targetAtom
#endif #endif
...@@ -55,10 +51,6 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedField, _kernel)( ...@@ -55,10 +51,6 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedField, _kernel)(
unsigned int end = (warp+1)*numWorkUnits/totalWarps; unsigned int end = (warp+1)*numWorkUnits/totalWarps;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
float4 jCoord;
float jDipole[3];
float jDipolePolar[3];
while (pos < end) while (pos < end)
{ {
...@@ -76,7 +68,8 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedField, _kernel)( ...@@ -76,7 +68,8 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedField, _kernel)(
MutualInducedParticle* psA = &sA[tbx]; MutualInducedParticle* psA = &sA[tbx];
unsigned int atomI = x + tgx; unsigned int atomI = x + tgx;
float4 iCoord = atomCoord[atomI]; MutualInducedParticle localParticle;
loadMutualInducedShared( &localParticle, atomI );
float fieldSum[3]; float fieldSum[3];
float fieldPolarSum[3]; float fieldPolarSum[3];
...@@ -97,9 +90,7 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedField, _kernel)( ...@@ -97,9 +90,7 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedField, _kernel)(
// load shared data // load shared data
loadMutualInducedShared( &(sA[threadIdx.x]), atomI, loadMutualInducedShared( &(sA[threadIdx.x]), atomI );
atomCoord, inducedDipole,
inducedDipolePolar, cAmoebaSim.pDampingFactorAndThole );
for (unsigned int j = 0; j < GRID; j++) for (unsigned int j = 0; j < GRID; j++)
{ {
...@@ -108,14 +99,7 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedField, _kernel)( ...@@ -108,14 +99,7 @@ void METHOD_NAME(kCalculateAmoebaMutualInducedField, _kernel)(
// load coords, charge, ... // load coords, charge, ...
loadMutualInducedData( &(psA[j]), &jCoord, jDipole, jDipolePolar ); calculateMutualInducedFieldPairIxn_kernel( localParticle, psA[j], ijField
calculateMutualInducedFieldPairIxn_kernel( iCoord, jCoord,
cAmoebaSim.pDampingFactorAndThole[atomI].x, psA[j].damp,
cAmoebaSim.pDampingFactorAndThole[atomI].y, psA[j].thole,
&(inducedDipole[atomI*3]), &(inducedDipolePolar[atomI*3]),
jDipole, jDipolePolar,
cAmoebaSim.scalingDistanceCutoff, ijField
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
, debugArray , debugArray
#endif #endif
...@@ -187,9 +171,7 @@ if( atomI == targetAtom ){ ...@@ -187,9 +171,7 @@ if( atomI == targetAtom ){
// load coordinates, charge, ... // load coordinates, charge, ...
loadMutualInducedShared( &(sA[threadIdx.x]), atomJ, loadMutualInducedShared( &(sA[threadIdx.x]), atomJ );
atomCoord, inducedDipole,
inducedDipolePolar, cAmoebaSim.pDampingFactorAndThole );
} }
// zero shared fields // zero shared fields
...@@ -203,14 +185,7 @@ if( atomI == targetAtom ){ ...@@ -203,14 +185,7 @@ if( atomI == targetAtom ){
// load coords, charge, ... // load coords, charge, ...
loadMutualInducedData( &(psA[tj]), &jCoord, jDipole, jDipolePolar ); calculateMutualInducedFieldPairIxn_kernel( localParticle, psA[tj], ijField
calculateMutualInducedFieldPairIxn_kernel( iCoord, jCoord,
cAmoebaSim.pDampingFactorAndThole[atomI].x, psA[tj].damp,
cAmoebaSim.pDampingFactorAndThole[atomI].y, psA[tj].thole,
&(inducedDipole[atomI*3]), &(inducedDipolePolar[atomI*3]),
jDipole, jDipolePolar,
cAmoebaSim.scalingDistanceCutoff, ijField
#ifdef AMOEBA_DEBUG #ifdef AMOEBA_DEBUG
, debugArray , debugArray
#endif #endif
......
...@@ -26,58 +26,50 @@ struct MutualInducedParticle { ...@@ -26,58 +26,50 @@ struct MutualInducedParticle {
#endif #endif
}; };
__device__ void loadMutualInducedShared( struct MutualInducedParticle* sA, unsigned int atomI, __device__ static void loadMutualInducedShared( MutualInducedParticle* sA, unsigned int atomI )
float4* atomCoord,
float* inducedDipole, float* inducedDipolePolar,
float2* dampingFactorAndThole
#ifdef GK
, float* bornRadii, float* inducedDipoleS, float* inducedDipolePolarS
#endif
)
{ {
// coordinates & charge // coordinates & charge
sA->x = atomCoord[atomI].x; sA->x = cSim.pPosq[atomI].x;
sA->y = atomCoord[atomI].y; sA->y = cSim.pPosq[atomI].y;
sA->z = atomCoord[atomI].z; sA->z = cSim.pPosq[atomI].z;
sA->q = atomCoord[atomI].w; sA->q = cSim.pPosq[atomI].w;
// dipole // dipole
sA->inducedDipole[0] = inducedDipole[atomI*3]; sA->inducedDipole[0] = cAmoebaSim.pInducedDipole[atomI*3];
sA->inducedDipole[1] = inducedDipole[atomI*3+1]; sA->inducedDipole[1] = cAmoebaSim.pInducedDipole[atomI*3+1];
sA->inducedDipole[2] = inducedDipole[atomI*3+2]; sA->inducedDipole[2] = cAmoebaSim.pInducedDipole[atomI*3+2];
// dipole polar // dipole polar
sA->inducedDipolePolar[0] = inducedDipolePolar[atomI*3]; sA->inducedDipolePolar[0] = cAmoebaSim.pInducedDipolePolar[atomI*3];
sA->inducedDipolePolar[1] = inducedDipolePolar[atomI*3+1]; sA->inducedDipolePolar[1] = cAmoebaSim.pInducedDipolePolar[atomI*3+1];
sA->inducedDipolePolar[2] = inducedDipolePolar[atomI*3+2]; sA->inducedDipolePolar[2] = cAmoebaSim.pInducedDipolePolar[atomI*3+2];
sA->damp = dampingFactorAndThole[atomI].x; sA->damp = cAmoebaSim.pDampingFactorAndThole[atomI].x;
sA->thole = dampingFactorAndThole[atomI].y; sA->thole = cAmoebaSim.pDampingFactorAndThole[atomI].y;
#ifdef GK #ifdef GK
sA->bornRadius = bornRadii[atomI]; sA->bornRadius = cSim.pBornRadii[atomI];
// dipoleS // dipoleS
sA->inducedDipoleS[0] = inducedDipoleS[atomI*3]; sA->inducedDipoleS[0] = cAmoebaSim.pInducedDipoleS[atomI*3];
sA->inducedDipoleS[1] = inducedDipoleS[atomI*3+1]; sA->inducedDipoleS[1] = cAmoebaSim.pInducedDipoleS[atomI*3+1];
sA->inducedDipoleS[2] = inducedDipoleS[atomI*3+2]; sA->inducedDipoleS[2] = cAmoebaSim.pInducedDipoleS[atomI*3+2];
// dipole polar S // dipole polar S
sA->inducedDipolePolarS[0] = inducedDipolePolarS[atomI*3]; sA->inducedDipolePolarS[0] = cAmoebaSim.pInducedDipolePolarS[atomI*3];
sA->inducedDipolePolarS[1] = inducedDipolePolarS[atomI*3+1]; sA->inducedDipolePolarS[1] = cAmoebaSim.pInducedDipolePolarS[atomI*3+1];
sA->inducedDipolePolarS[2] = inducedDipolePolarS[atomI*3+2]; sA->inducedDipolePolarS[2] = cAmoebaSim.pInducedDipolePolarS[atomI*3+2];
#endif #endif
} }
__device__ static void zeroMutualInducedParticleSharedField( struct MutualInducedParticle* sA ) __device__ static void zeroMutualInducedParticleSharedField( MutualInducedParticle* sA )
{ {
// zero shared fields // zero shared fields
...@@ -101,44 +93,3 @@ __device__ static void zeroMutualInducedParticleSharedField( struct MutualInduce ...@@ -101,44 +93,3 @@ __device__ static void zeroMutualInducedParticleSharedField( struct MutualInduce
#endif #endif
} }
// load struct and arrays w/ shared data in sA
__device__ void loadMutualInducedData( struct MutualInducedParticle* sA, float4* jCoord,
float* jInducedDipole, float* jInducedDipolePolar
#ifdef GK
, float* jBornRadius, float* jInducedDipoleS, float* jInducedDipolePolarS
#endif
)
{
// load coords, charge, ...
jCoord->x = sA->x;
jCoord->y = sA->y;
jCoord->z = sA->z;
jCoord->w = sA->q;
jInducedDipole[0] = sA->inducedDipole[0];
jInducedDipole[1] = sA->inducedDipole[1];
jInducedDipole[2] = sA->inducedDipole[2];
jInducedDipolePolar[0] = sA->inducedDipolePolar[0];
jInducedDipolePolar[1] = sA->inducedDipolePolar[1];
jInducedDipolePolar[2] = sA->inducedDipolePolar[2];
#ifdef GK
*jBornRadius = sA->bornRadius;
jInducedDipoleS[0] = sA->inducedDipoleS[0];
jInducedDipoleS[1] = sA->inducedDipoleS[1];
jInducedDipoleS[2] = sA->inducedDipoleS[2];
jInducedDipolePolarS[0] = sA->inducedDipolePolarS[0];
jInducedDipolePolarS[1] = sA->inducedDipolePolarS[1];
jInducedDipolePolarS[2] = sA->inducedDipolePolarS[2];
#endif
}
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
#include "amoebaGpuTypes.h"
#include "amoebaCudaKernels.h"
#include "kCalculateAmoebaCudaUtilities.h"
#include <stdio.h>
using namespace std;
static __constant__ cudaGmxSimulation cSim;
static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;
void SetCalculateAmoebaCudaPmeMutualInducedFieldSim(amoebaGpuContext amoebaGpu)
{
cudaError_t status;
gpuContext gpu = amoebaGpu->gpuContext;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "SetCalculateAmoebaCudaPmeMutualInducedFieldSim: cudaMemcpyToSymbol: SetSim copy to cSim failed");
status = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation));
RTERROR(status, "SetCalculateAmoebaCudaPmeMutualInducedFieldSim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed");
}
void GetCalculateAmoebaCudaPmeMutualInducedFieldSim(amoebaGpuContext amoebaGpu)
{
cudaError_t status;
gpuContext gpu = amoebaGpu->gpuContext;
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
RTERROR(status, "GetCalculateAmoebaCudaPmeMutualInducedFieldSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed");
status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));
RTERROR(status, "GetCalculateAmoebaCudaPmeMutualInducedFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
}
//#define AMOEBA_DEBUG
#undef AMOEBA_DEBUG
#include "kCalculateAmoebaCudaMutualInducedParticle.h"
// file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field
__device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInducedParticle& atomI, MutualInducedParticle& atomJ,
float uscale, float fields[4][3]
#ifdef AMOEBA_DEBUG
, float4* pullBack
#endif
){
// compute the real space portion of the Ewald summation
float xr = atomJ.x - atomI.x;
float yr = atomJ.y - atomI.y;
float zr = atomJ.z - atomI.z;
// periodic boundary conditions
xr -= floor(xr*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
yr -= floor(yr*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
zr -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
float r2 = xr*xr + yr* yr + zr*zr;
float r = sqrtf(r2);
// calculate the error function damping terms
float ralpha = cSim.alphaEwald*r;
float bn[3];
bn[0] = erfc(ralpha)/r;
float alsq2 = 2.0f*cSim.alphaEwald*cSim.alphaEwald;
float alsq2n = 1.0f/(cAmoebaSim.sqrtPi*cSim.alphaEwald);
float exp2a = exp(-(ralpha*ralpha));
alsq2n *= alsq2;
bn[1] = (bn[0]+alsq2n*exp2a)/r2;
alsq2n *= alsq2;
bn[2] = (3.0f*bn[1]+alsq2n*exp2a)/r2;
// compute the error function scaled and unscaled terms
float scale3 = 1.0f;
float scale5 = 1.0f;
float damp = atomI.damp*atomJ.damp;
if( damp != 0.0f ){
float ratio = (r/damp);
ratio = ratio*ratio*ratio;
float pgamma = atomI.thole < atomJ.thole ? atomI.thole : atomJ.thole;
damp = -pgamma*ratio;
if( damp > -50.0f) {
float expdamp = exp(damp);
scale3 = 1.0f - expdamp;
scale5 = 1.0f - expdamp*(1.0f-damp);
}
}
float dsc3 = uscale*scale3;
float dsc5 = uscale*scale5;
float r3 = (r*r2);
float r5 = (r3*r2);
float rr3 = (1.0f-dsc3)/r3;
float rr5 = 3.0f * (1.0f-dsc5)/r5;
float duir = atomI.inducedDipole[0]*xr + atomI.inducedDipole[1]*yr + atomI.inducedDipole[2]*zr;
float dukr = atomJ.inducedDipole[0]*xr + atomJ.inducedDipole[1]*yr + atomJ.inducedDipole[2]*zr;
float puir = atomI.inducedDipolePolar[0]*xr + atomI.inducedDipolePolar[1]*yr + atomI.inducedDipolePolar[2]*zr;
float pukr = atomJ.inducedDipolePolar[0]*xr + atomJ.inducedDipolePolar[1]*yr + atomJ.inducedDipolePolar[2]*zr;
float fimd[3],fkmd[3];
float fimp[3],fkmp[3];
float fid[3],fkd[3];
float fip[3],fkp[3];
bn[1] *= -1.0f;
fimd[0] = bn[1]*atomJ.inducedDipole[0] + bn[2]*dukr*xr;
fimd[1] = bn[1]*atomJ.inducedDipole[1] + bn[2]*dukr*yr;
fimd[2] = bn[1]*atomJ.inducedDipole[2] + bn[2]*dukr*zr;
fkmd[0] = bn[1]*atomI.inducedDipole[0] + bn[2]*duir*xr;
fkmd[1] = bn[1]*atomI.inducedDipole[1] + bn[2]*duir*yr;
fkmd[2] = bn[1]*atomI.inducedDipole[2] + bn[2]*duir*zr;
fimp[0] = bn[1]*atomJ.inducedDipolePolar[0] + bn[2]*pukr*xr;
fimp[1] = bn[1]*atomJ.inducedDipolePolar[1] + bn[2]*pukr*yr;
fimp[2] = bn[1]*atomJ.inducedDipolePolar[2] + bn[2]*pukr*zr;
fkmp[0] = bn[1]*atomI.inducedDipolePolar[0] + bn[2]*puir*xr;
fkmp[1] = bn[1]*atomI.inducedDipolePolar[1] + bn[2]*puir*yr;
fkmp[2] = bn[1]*atomI.inducedDipolePolar[2] + bn[2]*puir*zr;
rr3 *= -1.0f;;
fid[0] = rr3*atomJ.inducedDipole[0] + rr5*dukr*xr;
fid[1] = rr3*atomJ.inducedDipole[1] + rr5*dukr*yr;
fid[2] = rr3*atomJ.inducedDipole[2] + rr5*dukr*zr;
fkd[0] = rr3*atomI.inducedDipole[0] + rr5*duir*xr;
fkd[1] = rr3*atomI.inducedDipole[1] + rr5*duir*yr;
fkd[2] = rr3*atomI.inducedDipole[2] + rr5*duir*zr;
fip[0] = rr3*atomJ.inducedDipolePolar[0] + rr5*pukr*xr;
fip[1] = rr3*atomJ.inducedDipolePolar[1] + rr5*pukr*yr;
fip[2] = rr3*atomJ.inducedDipolePolar[2] + rr5*pukr*zr;
fkp[0] = rr3*atomI.inducedDipolePolar[0] + rr5*puir*xr;
fkp[1] = rr3*atomI.inducedDipolePolar[1] + rr5*puir*yr;
fkp[2] = rr3*atomI.inducedDipolePolar[2] + rr5*puir*zr;
// increment the field at each site due to this interaction
if( r2 <= cAmoebaSim.cutoffDistance2 ){
fields[0][0] = fimd[0] - fid[0];
fields[1][0] = fkmd[0] - fkd[0];
fields[2][0] = fimp[0] - fip[0];
fields[3][0] = fkmp[0] - fkp[0];
fields[0][1] = fimd[1] - fid[1];
fields[1][1] = fkmd[1] - fkd[1];
fields[2][1] = fimp[1] - fip[1];
fields[3][1] = fkmp[1] - fkp[1];
fields[0][2] = fimd[2] - fid[2];
fields[1][2] = fkmd[2] - fkd[2];
fields[2][2] = fimp[2] - fip[2];
fields[3][2] = fkmp[2] - fkp[2];
} else {
fields[0][0] = 0.0f;
fields[1][0] = 0.0f;
fields[2][0] = 0.0f;
fields[3][0] = 0.0f;
fields[0][1] = 0.0f;
fields[1][1] = 0.0f;
fields[2][1] = 0.0f;
fields[3][1] = 0.0f;
fields[0][2] = 0.0f;
fields[1][2] = 0.0f;
fields[2][2] = 0.0f;
fields[3][2] = 0.0f;
}
#ifdef AMOEBA_DEBUG
pullBack[0].x = xr;
pullBack[0].y = yr;
pullBack[0].z = zr;
pullBack[0].w = r2;
pullBack[1].x = alsq2;
pullBack[1].y = bn[0];
pullBack[1].z = bn[2];
pullBack[1].w = exp2a;
/*
pullBack[1].x = atomJ.x - atomI.x;
pullBack[1].y = atomJ.y - atomI.y;
pullBack[1].z = atomJ.z - atomI.z;
pullBack[1].w = (atomJ.x - atomI.x)*(atomJ.x - atomI.x) + (atomJ.y - atomI.y)*(atomJ.y - atomI.y)+ (atomJ.z - atomI.z)*(atomJ.z - atomI.z);
pullBack[1].x = scale3;
pullBack[1].y = scale5;
pullBack[1].z = scale7;
*/
#endif
}
// Include versions of the kernels for N^2 calculations.
#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateAmoebaCudaPmeMutualInducedField.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateAmoebaCudaPmeMutualInducedField.h"
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
static void kInitializeMutualInducedField_kernel(
int numberOfAtoms,
float* fixedEField,
float* fixedEFieldPolar,
float* polarizability,
float* inducedDipole,
float* inducedDipolePolar )
{
int threadId = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
if( threadId >= 3*numberOfAtoms )return;
fixedEField[threadId] *= polarizability[threadId];
inducedDipole[threadId] = fixedEField[threadId];
fixedEFieldPolar[threadId] *= polarizability[threadId];
inducedDipolePolar[threadId] = fixedEFieldPolar[threadId];
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
static void kReduceMutualInducedFieldDelta_kernel(int numberOfEntries, float* arrayOfDeltas1, float* arrayOfDeltas2, float* epsilon )
{
extern __shared__ float2 delta[];
delta[threadIdx.x].x = 0.0f;
delta[threadIdx.x].y = 0.0f;
unsigned int pos = threadIdx.x;
// load deltas
while( pos < numberOfEntries )
{
delta[threadIdx.x].x += arrayOfDeltas1[pos];
delta[threadIdx.x].y += arrayOfDeltas2[pos];
pos += blockDim.x*gridDim.x;
}
__syncthreads();
// sum the deltas
for (int offset = 1; offset < blockDim.x; offset *= 2 )
{
if (threadIdx.x + offset < blockDim.x && (threadIdx.x & (2*offset-1)) == 0)
{
delta[threadIdx.x].x += delta[threadIdx.x+offset].x;
delta[threadIdx.x].y += delta[threadIdx.x+offset].y;
}
__syncthreads();
}
// set epsilons
if (threadIdx.x == 0)
{
epsilon[0] = delta[0].x > delta[0].y ? delta[0].x : delta[0].y;
epsilon[0] = 4.8033324f*sqrtf( epsilon[0]/( (float) (numberOfEntries/3)) );
#ifdef AMOEBA_DEBUG
epsilon[1] = 4.8033324f*sqrtf( delta[0].x/( (float) (numberOfEntries/3)) );
epsilon[2] = 4.8033324f*sqrtf( delta[0].y/( (float) (numberOfEntries/3)) );
#endif
}
}
/**
matrixProduct/matrixProductP contains epsilon**2 on output
*/
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
static void kSorUpdateMutualInducedField_kernel(
int numberOfEntries, float* polarizability,
float* inducedDipole, float* inducedDipoleP,
float* fixedEField, float* fixedEFieldP,
float* matrixProduct, float* matrixProductP )
{
int threadId = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
if( threadId >= 3*numberOfEntries )return;
float previousDipole = inducedDipole[threadId];
float previousDipoleP = inducedDipoleP[threadId];
// add self terms to fields
const float term = (4.0f/3.0f)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi;
matrixProduct[threadId] += term*previousDipole;
matrixProductP[threadId] += term*previousDipoleP;
inducedDipole[threadId] = fixedEField[threadId] + polarizability[threadId]*matrixProduct[threadId];
inducedDipoleP[threadId] = fixedEFieldP[threadId] + polarizability[threadId]*matrixProductP[threadId];
const float polarSOR = 0.70f;
inducedDipole[threadId] = previousDipole + polarSOR*( inducedDipole[threadId] - previousDipole );
inducedDipoleP[threadId] = previousDipoleP + polarSOR*( inducedDipoleP[threadId] - previousDipoleP );
matrixProduct[threadId] = ( inducedDipole[threadId] - previousDipole )*( inducedDipole[threadId] - previousDipole );
matrixProductP[threadId] = ( inducedDipoleP[threadId] - previousDipoleP )*( inducedDipoleP[threadId] - previousDipoleP );
}
// reduce psWorkArray_3_1
// reduce psWorkArray_3_2
static void kReduceMutualInducedFields(amoebaGpuContext amoebaGpu, CUDAStream<float>* outputArray, CUDAStream<float>* outputPolarArray )
{
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevStream[0], outputArray->_pDevStream[0] );
LAUNCHERROR("kReducePmeMI_Fields1");
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_3_2->_pDevStream[0], outputPolarArray->_pDevStream[0] );
LAUNCHERROR("kReducePmeMI_Fields2");
}
/**---------------------------------------------------------------------------------------
Compute mutual induce field
@param amoebaGpu amoebaGpu context
--------------------------------------------------------------------------------------- */
static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuContext amoebaGpu,
CUDAStream<float>* outputArray, CUDAStream<float>* outputPolarArray )
{
gpuContext gpu = amoebaGpu->gpuContext;
#ifdef AMOEBA_DEBUG
int targetAtom = 0;
static const char* methodName = "cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply";
static int iteration = 1;
if( 1 && amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s: scalingDistanceCutoff=%.5f\n",
methodName, amoebaGpu->scalingDistanceCutoff );
(void) fflush( amoebaGpu->log );
}
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
CUDAStream<float4>* debugArray = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
memset( debugArray->_pSysStream[0], 0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
debugArray->Upload();
#endif
kClearFields_3( amoebaGpu, 2 );
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaPmeMutualInducedFieldN2ByWarp_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, sizeof(MutualInducedParticle)*amoebaGpu->nonbondThreadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevStream[0],
amoebaGpu->psWorkArray_3_1->_pDevStream[0],
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevStream[0],
debugArray->_pDevStream[0], targetAtom );
#else
amoebaGpu->psWorkArray_3_2->_pDevStream[0] );
#endif
} else {
#ifdef AMOEBA_DEBUG
(void) fprintf( amoebaGpu->log, "N2 no warp\n" );
(void) fprintf( amoebaGpu->log, "AmoebaN2Forces_kernel numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n",
amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*amoebaGpu->nonbondThreadsPerBlock,
amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log );
#endif
kCalculateAmoebaPmeMutualInducedFieldN2_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->nonbondThreadsPerBlock,
sizeof(MutualInducedParticle)*amoebaGpu->nonbondThreadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevStream[0],
amoebaGpu->psWorkArray_3_1->_pDevStream[0],
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevStream[0],
debugArray->_pDevStream[0], targetAtom );
#else
amoebaGpu->psWorkArray_3_2->_pDevStream[0] );
#endif
}
LAUNCHERROR("kCalculateAmoebaPmeMutualInducedField");
kReduceMutualInducedFields( amoebaGpu, outputArray, outputPolarArray );
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log && iteration == 1 ){
(void) fprintf( amoebaGpu->log, "Finished maxtrixMultiply kernel execution %d\n", iteration ); (void) fflush( amoebaGpu->log );
outputArray->Download();
outputPolarArray->Download();
debugArray->Download();
int maxPrint = 5;
for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%5d ", ii);
int indexOffset = ii*3;
// MI
(void) fprintf( amoebaGpu->log,"Mult[%16.9e %16.9e %16.9e] ",
outputArray->_pSysStream[0][indexOffset],
outputArray->_pSysStream[0][indexOffset+1],
outputArray->_pSysStream[0][indexOffset+2] );
// MI polar
(void) fprintf( amoebaGpu->log,"MultP[%16.9e %16.9e %16.9e]\n",
outputPolarArray->_pSysStream[0][indexOffset],
outputPolarArray->_pSysStream[0][indexOffset+1],
outputPolarArray->_pSysStream[0][indexOffset+2] );
if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
ii = gpu->natoms - maxPrint;
}
}
/*
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
for( int jj = 0; jj < gpu->natoms; jj++ ){
int debugIndex = jj;
(void) fprintf( amoebaGpu->log,"%5d PmeMIMult\n", jj );
for( int kk = 0; kk < 7; kk++ ){
(void) fprintf( amoebaGpu->log,"[%16.9e %16.9e %16.9e %16.9e]\n",
debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w );
debugIndex += paddedNumberOfAtoms;
}
(void) fprintf( amoebaGpu->log,"\n" );
}
*/
(void) fflush( amoebaGpu->log );
iteration++;
}
delete debugArray;
#endif
}
/**---------------------------------------------------------------------------------------
Compute mutual induce field
@param amoebaGpu amoebaGpu context
--------------------------------------------------------------------------------------- */
//#define GET_EFIELD_FROM_FILE
#ifdef GET_EFIELD_FROM_FILE
#include <stdlib.h>
#endif
static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu )
{
// ---------------------------------------------------------------------------------------
#ifdef AMOEBA_DEBUG
static const char* methodName = "cudaComputeAmoebaPmeMutualInducedFieldBySOR";
static int timestep = 0;
std::vector<int> fileId;
timestep++;
fileId.resize( 2 );
fileId[0] = timestep;
fileId[1] = 1;
#endif
// ---------------------------------------------------------------------------------------
int done;
int iteration;
gpuContext gpu = amoebaGpu->gpuContext;
int numOfElems = gpu->natoms*3;
int numThreads = min( THREADS_PER_BLOCK, numOfElems );
int numBlocks = numOfElems/numThreads;
if( (numOfElems % numThreads) != 0 )numBlocks++;
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s %d numOfElems=%d numThreads=%d numBlocks=%d "
"maxIterations=%d targetEpsilon=%.3e\n",
methodName, gpu->natoms, numOfElems, numThreads, numBlocks,
amoebaGpu->mutualInducedMaxIterations, amoebaGpu->mutualInducedTargetEpsilon);
(void) fflush( amoebaGpu->log );
}
#endif
#ifdef GET_EFIELD_FROM_FILE
std::string fileName = "waterEField.txt";
StringVectorVector fileContents;
readFile( fileName, fileContents );
unsigned int offset = 0;
(void) fprintf( amoebaGpu->log, "Read file: %s %u\n", fileName.c_str(), fileContents.size() ); fflush( amoebaGpu->log );
for( unsigned int ii = 1; ii < fileContents.size()-1; ii++ ){
StringVector lineTokens = fileContents[ii];
unsigned int lineTokenIndex = 1;
// (void) fprintf( amoebaGpu->log, " %u %s %s\n", ii, lineTokens[0].c_str(), lineTokens[lineTokenIndex].c_str() ); fflush( amoebaGpu->log );
amoebaGpu->psE_Field->_pSysStream[0][offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
amoebaGpu->psE_Field->_pSysStream[0][offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
amoebaGpu->psE_Field->_pSysStream[0][offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
offset -= 3;
amoebaGpu->psE_FieldPolar->_pSysStream[0][offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
amoebaGpu->psE_FieldPolar->_pSysStream[0][offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
amoebaGpu->psE_FieldPolar->_pSysStream[0][offset++] = static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
}
float conversion = 100.0f;
for( int ii = 0; ii < 3*gpu->natoms; ii++ ){
amoebaGpu->psE_Field->_pSysStream[0][ii] *= conversion;
amoebaGpu->psE_FieldPolar->_pSysStream[0][ii] *= conversion;
}
amoebaGpu->psE_Field->Upload();
amoebaGpu->psE_FieldPolar->Upload();
#endif
// ---------------------------------------------------------------------------------------
// set E_Field & E_FieldPolar] to [ E_Field & E_FieldPolar]*Polarizability
// initialize [ InducedDipole & InducedDipolePolar ] to [ E_Field & E_FieldPolar]*Polarizability
kInitializeMutualInducedField_kernel<<< numBlocks, numThreads >>>(
gpu->natoms,
amoebaGpu->psE_Field->_pDevStream[0],
amoebaGpu->psE_FieldPolar->_pDevStream[0],
amoebaGpu->psPolarizability->_pDevStream[0],
amoebaGpu->psInducedDipole->_pDevStream[0],
amoebaGpu->psInducedDipolePolar->_pDevStream[0] );
LAUNCHERROR("AmoebaPmeMutualInducedFieldSetup");
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
amoebaGpu->psE_Field->Download();
amoebaGpu->psE_FieldPolar->Download();
amoebaGpu->psInducedDipole->Download(),
amoebaGpu->psInducedDipolePolar->Download();
amoebaGpu->psPolarizability->Download();
(void) fprintf( amoebaGpu->log, "%s Initial setup for matrix multiply\n", methodName );
int offset = 0;
int maxPrint = 20000;
for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%4d pol=%12.4e ", ii,
amoebaGpu->psPolarizability->_pSysStream[0][offset] );
if( amoebaGpu->psPolarizability->_pSysStream[0][offset] != amoebaGpu->psPolarizability->_pSysStream[0][offset+1] ||
amoebaGpu->psPolarizability->_pSysStream[0][offset] != amoebaGpu->psPolarizability->_pSysStream[0][offset+2] ){
(void) fprintf( amoebaGpu->log, "PolX!!! %12.4e %12.4e ", amoebaGpu->psPolarizability->_pSysStream[0][offset+1], amoebaGpu->psPolarizability->_pSysStream[0][offset+2] );
}
(void) fprintf( amoebaGpu->log," E[%14.6e %14.6e %14.6e] Mi[%14.6e %14.6e %14.6e] ",
amoebaGpu->psE_Field->_pSysStream[0][offset], amoebaGpu->psE_Field->_pSysStream[0][offset+1], amoebaGpu->psE_Field->_pSysStream[0][offset+2],
amoebaGpu->psInducedDipole->_pSysStream[0][offset], amoebaGpu->psInducedDipole->_pSysStream[0][offset+1], amoebaGpu->psInducedDipole->_pSysStream[0][offset+2] );
(void) fprintf( amoebaGpu->log,"Ep[%14.6e %14.6e %14.6e] Mip[%14.6e %14.6e %14.6e]\n",
amoebaGpu->psE_FieldPolar->_pSysStream[0][offset], amoebaGpu->psE_FieldPolar->_pSysStream[0][offset+1], amoebaGpu->psE_FieldPolar->_pSysStream[0][offset+2],
amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset], amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset+1], amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset+2] );
offset += 3;
if( ii == maxPrint && (ii < (gpu->natoms - maxPrint) ) )ii = (gpu->natoms - maxPrint);
}
(void) fflush( amoebaGpu->log );
}
#endif
// ---------------------------------------------------------------------------------------
done = 0;
iteration = 1;
while( !done ){
// matrix multiply
cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpu, amoebaGpu->psWorkVector[0], amoebaGpu->psWorkVector[1] );
LAUNCHERROR("cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply Loop\n");
#ifdef GET_EFIELD_FROM_FILE
{
std::string fileName = "waterInduceRecip.txt";
StringVectorVector fileContents;
readFile( fileName, fileContents );
unsigned int offset = 0;
amoebaGpu->psWorkVector[0]->Download();
amoebaGpu->psWorkVector[1]->Download();
(void) fprintf( amoebaGpu->log, "Read file: %s %u\n", fileName.c_str(), fileContents.size() ); fflush( amoebaGpu->log );
float conversion = 100.0f;
for( unsigned int ii = 1; ii < fileContents.size()-1; ii++ ){
StringVector lineTokens = fileContents[ii];
unsigned int lineTokenIndex = 1;
// (void) fprintf( amoebaGpu->log, " %u %s %s\n", ii, lineTokens[0].c_str(), lineTokens[lineTokenIndex].c_str() ); fflush( amoebaGpu->log );
amoebaGpu->psWorkVector[0]->_pSysStream[0][offset++] += conversion*static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
amoebaGpu->psWorkVector[0]->_pSysStream[0][offset++] += conversion*static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
amoebaGpu->psWorkVector[0]->_pSysStream[0][offset++] += conversion*static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
offset -= 3;
amoebaGpu->psWorkVector[1]->_pSysStream[0][offset++] += conversion*static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
amoebaGpu->psWorkVector[1]->_pSysStream[0][offset++] += conversion*static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
amoebaGpu->psWorkVector[1]->_pSysStream[0][offset++] += conversion*static_cast<float>(atof(lineTokens[lineTokenIndex++].c_str()));
}
amoebaGpu->psWorkVector[0]->Upload();
amoebaGpu->psWorkVector[1]->Upload();
}
#endif
// post matrix multiply
kSorUpdateMutualInducedField_kernel<<< numBlocks, numThreads >>>(
gpu->natoms, amoebaGpu->psPolarizability->_pDevStream[0],
amoebaGpu->psInducedDipole->_pDevStream[0], amoebaGpu->psInducedDipolePolar->_pDevStream[0],
amoebaGpu->psE_Field->_pDevStream[0], amoebaGpu->psE_FieldPolar->_pDevStream[0],
amoebaGpu->psWorkVector[0]->_pDevStream[0], amoebaGpu->psWorkVector[1]->_pDevStream[0] );
LAUNCHERROR("kSorUpdatePmeMutualInducedField");
// get total epsilon -- performing sums on gpu
kReduceMutualInducedFieldDelta_kernel<<<1, amoebaGpu->epsilonThreadsPerBlock, 2*sizeof(float)*amoebaGpu->epsilonThreadsPerBlock>>>(
3*gpu->natoms, amoebaGpu->psWorkVector[0]->_pDevStream[0], amoebaGpu->psWorkVector[1]->_pDevStream[0],
amoebaGpu->psCurrentEpsilon->_pDevStream[0] );
LAUNCHERROR("kReducePmeMutualInducedFieldDelta");
if( amoebaGpu->log ){
trackMutualInducedIterations( amoebaGpu, iteration);
}
// Debye=4.8033324f
amoebaGpu->psCurrentEpsilon->Download();
float currentEpsilon = amoebaGpu->psCurrentEpsilon->_pSysStream[0][0];
amoebaGpu->mutualInducedCurrentEpsilon = currentEpsilon;
if( iteration > amoebaGpu->mutualInducedMaxIterations || amoebaGpu->mutualInducedCurrentEpsilon < amoebaGpu->mutualInducedTargetEpsilon ){
done = 1;
}
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
amoebaGpu->psInducedDipole->Download();
amoebaGpu->psInducedDipolePolar->Download();
#if 1
(void) fprintf( amoebaGpu->log, "%s iteration=%3d eps %14.6e [%14.6e %14.6e] done=%d\n",
methodName, iteration, amoebaGpu->mutualInducedCurrentEpsilon,
amoebaGpu->psCurrentEpsilon->_pSysStream[0][1],
amoebaGpu->psCurrentEpsilon->_pSysStream[0][2], done );
#else
(void) fprintf( amoebaGpu->log, "%s iteration=%3d eps %14.6e %14.6e crrntEps=%14.6e %14.6e %14.6e %14.6e done=%d\n",
methodName, iteration, sum1, sum2, amoebaGpu->mutualInducedCurrentEpsilon,
amoebaGpu->psCurrentEpsilon->_pSysStream[0][0],
amoebaGpu->psCurrentEpsilon->_pSysStream[0][1],
amoebaGpu->psCurrentEpsilon->_pSysStream[0][2], done );
#endif
(void) fflush( amoebaGpu->log );
int offset = 0;
int maxPrint = 20;
for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%4d ", ii );
(void) fprintf( amoebaGpu->log," Mi[%14.6e %14.6e %14.6e] ",
amoebaGpu->psInducedDipole->_pSysStream[0][offset], amoebaGpu->psInducedDipole->_pSysStream[0][offset+1], amoebaGpu->psInducedDipole->_pSysStream[0][offset+2] );
(void) fprintf( amoebaGpu->log,"Mip[%14.6e %14.6e %14.6e]\n",
amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset], amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset+1], amoebaGpu->psInducedDipolePolar->_pSysStream[0][offset+2] );
if( ii == maxPrint && (ii < (gpu->natoms - maxPrint) ) ){
ii = (gpu->natoms - maxPrint);
offset = 3*(ii+1);
} else {
offset += 3;
}
}
(void) fflush( amoebaGpu->log );
}
#endif
iteration++;
}
amoebaGpu->mutualInducedDone = done;
amoebaGpu->mutualInducedConverged = ( !done || iteration > amoebaGpu->mutualInducedMaxIterations ) ? 0 : 1;
#ifdef AMOEBA_DEBUG
/*
if( 0 ){
std::vector<int> fileId;
//fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipole, outputVector );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psInducedDipolePolar, outputVector );
cudaWriteVectorOfDoubleVectorsToFile( "CudaPmeMI", fileId, outputVector );
}
*/
#endif
// ---------------------------------------------------------------------------------------
}
void cudaComputeAmoebaPmeMutualInducedField( amoebaGpuContext amoebaGpu )
{
if( amoebaGpu->mutualInducedIterativeMethod == 0 ){
cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpu );
}
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "amoebaScaleFactors.h"
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
#endif
void METHOD_NAME(kCalculateAmoebaPmeMutualInducedField, _kernel)(
unsigned int* workUnit,
float* outputField, float* outputFieldPolar
#ifdef AMOEBA_DEBUG
, float4* debugArray, unsigned int targetAtom
#endif
){
extern __shared__ MutualInducedParticle sA[];
unsigned int totalWarps = gridDim.x*blockDim.x/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int numWorkUnits = cSim.pInteractionCount[0];
unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps;
unsigned int lasty = 0xFFFFFFFF;
const float uscale = 1.0f;
#ifdef AMOEBA_DEBUG
float4 pullBack[4];
#endif
while (pos < end)
{
unsigned int x;
unsigned int y;
bool bExclusionFlag;
// Extract cell coordinates
decodeCell( workUnit[pos], &x, &y, &bExclusionFlag );
unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int tbx = threadIdx.x - tgx;
unsigned int tj = tgx;
MutualInducedParticle* psA = &sA[tbx];
unsigned int atomI = x + tgx;
MutualInducedParticle localParticle;
loadMutualInducedShared( &localParticle, atomI );
float fieldSum[3];
float fieldPolarSum[3];
// 0: field at i due to j
// 1: field at i due to j polar
fieldSum[0] = 0.0f;
fieldSum[1] = 0.0f;
fieldSum[2] = 0.0f;
fieldPolarSum[0] = 0.0f;
fieldPolarSum[1] = 0.0f;
fieldPolarSum[2] = 0.0f;
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// load shared data
loadMutualInducedShared( &(sA[threadIdx.x]), atomI );
for (unsigned int j = 0; j < GRID; j++)
{
float ijField[4][3];
// load coords, charge, ...
calculatePmeDirectMutualInducedFieldPairIxn_kernel( localParticle, psA[j], uscale, ijField
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
unsigned int mask = ( (atomI == (y + j)) || (atomI >= cAmoebaSim.numberOfAtoms) || ((y+j) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
// add to field at atomI the field due atomJ's dipole
fieldSum[0] += mask ? ijField[0][0] : 0.0f;
fieldSum[1] += mask ? ijField[0][1] : 0.0f;
fieldSum[2] += mask ? ijField[0][2] : 0.0f;
fieldPolarSum[0] += mask ? ijField[2][0] : 0.0f;
fieldPolarSum[1] += mask ? ijField[2][1] : 0.0f;
fieldPolarSum[2] += mask ? ijField[2][2] : 0.0f;
#ifdef AMOEBA_DEBUG
if( atomI == targetAtom || (y+j) == targetAtom ){
unsigned int index = atomI == targetAtom ? (y+j) : atomI;
unsigned int pullBackIndex = 0;
unsigned int indexI = 0;
unsigned int indexJ = indexI ? 0 : 2;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + j);
debugArray[index].z = cAmoebaSim.cutoffDistance2;
debugArray[index].w = 6.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
debugArray[index].w = pullBack[pullBackIndex].w;
pullBackIndex++;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
debugArray[index].w = pullBack[pullBackIndex].w;
index += cAmoebaSim.paddedNumberOfAtoms;
float flag = 6.0f;
debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2];
debugArray[index].w = flag;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ][0];
debugArray[index].y = ijField[indexJ][1];
debugArray[index].z = ijField[indexJ][2];
debugArray[index].w = flag;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI+1][0];
debugArray[index].y = ijField[indexI+1][1];
debugArray[index].z = ijField[indexI+1][2];
debugArray[index].w = flag;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ+1][0];
debugArray[index].y = ijField[indexJ+1][1];
debugArray[index].z = ijField[indexJ+1][2];
debugArray[index].w = flag;
/*
index += cAmoebaSim.paddedNumberOfAtoms;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : ijField[indexI][0];
debugArray[index].y = match ? 0.0f : ijField[indexI][1];
debugArray[index].z = match ? 0.0f : ijField[indexI][2];
index += cAmoebaSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
debugArray[index].x = (float) pScaleIndex;
debugArray[index].y = scaleMask.x & mask ? 1.0f : -1.0f;
debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f;
debugArray[index].w = + 10.0f;
*/
}
#endif
}
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
load3dArrayBufferPerWarp( offset, fieldSum, outputField );
load3dArrayBufferPerWarp( offset, fieldPolarSum, outputFieldPolar);
#else
unsigned int offset = 3*(x + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
load3dArray( offset, fieldSum, outputField );
load3dArray( offset, fieldPolarSum, outputFieldPolar);
#endif
}
else // 100% utilization
{
// Read fixed atom data into registers and GRF
if (lasty != y)
{
unsigned int atomJ = y + tgx;
// load coordinates, charge, ...
loadMutualInducedShared( &(sA[threadIdx.x]), atomJ );
}
// zero shared fields
zeroMutualInducedParticleSharedField( &(sA[threadIdx.x]) );
for (unsigned int j = 0; j < GRID; j++)
{
float ijField[4][3];
// load coords, charge, ...
calculatePmeDirectMutualInducedFieldPairIxn_kernel( localParticle, psA[tj], uscale, ijField
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
unsigned int mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || ((y+tj) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
// add to field at atomI the field due atomJ's dipole
fieldSum[0] += mask ? ijField[0][0] : 0.0f;
fieldSum[1] += mask ? ijField[0][1] : 0.0f;
fieldSum[2] += mask ? ijField[0][2] : 0.0f;
// add to polar field at atomI the field due atomJ's dipole
fieldPolarSum[0] += mask ? ijField[2][0] : 0.0f;
fieldPolarSum[1] += mask ? ijField[2][1] : 0.0f;
fieldPolarSum[2] += mask ? ijField[2][2] : 0.0f;
// add to field at atomJ the field due atomI's dipole
psA[tj].field[0] += mask ? ijField[1][0] : 0.0f;
psA[tj].field[1] += mask ? ijField[1][1] : 0.0f;
psA[tj].field[2] += mask ? ijField[1][2] : 0.0f;
// add to polar field at atomJ the field due atomI's dipole
psA[tj].fieldPolar[0] += mask ? ijField[3][0] : 0.0f;
psA[tj].fieldPolar[1] += mask ? ijField[3][1] : 0.0f;
psA[tj].fieldPolar[2] += mask ? ijField[3][2] : 0.0f;
#ifdef AMOEBA_DEBUG
if( atomI == targetAtom || (y+tj) == targetAtom ){
unsigned int index = atomI == targetAtom ? (y+tj) : atomI;
unsigned int pullBackIndex = 0;
unsigned int indexI = 0;
unsigned int indexJ = indexI ? 0 : 2;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + tj);
debugArray[index].z = cAmoebaSim.cutoffDistance2;
debugArray[index].w = 7.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
debugArray[index].w = pullBack[pullBackIndex].w;
pullBackIndex++;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullBackIndex].x;
debugArray[index].y = pullBack[pullBackIndex].y;
debugArray[index].z = pullBack[pullBackIndex].z;
debugArray[index].w = pullBack[pullBackIndex].w;
index += cAmoebaSim.paddedNumberOfAtoms;
float flag = 7.0f;
debugArray[index].x = ijField[indexI][0];
debugArray[index].y = ijField[indexI][1];
debugArray[index].z = ijField[indexI][2];
debugArray[index].w = flag;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ][0];
debugArray[index].y = ijField[indexJ][1];
debugArray[index].z = ijField[indexJ][2];
debugArray[index].w = flag;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexI+1][0];
debugArray[index].y = ijField[indexI+1][1];
debugArray[index].z = ijField[indexI+1][2];
debugArray[index].w = flag;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = ijField[indexJ+1][0];
debugArray[index].y = ijField[indexJ+1][1];
debugArray[index].z = ijField[indexJ+1][2];
debugArray[index].w = flag;
/*
index += cAmoebaSim.paddedNumberOfAtoms;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : ijField[indexI][0];
debugArray[index].y = match ? 0.0f : ijField[indexI][1];
debugArray[index].z = match ? 0.0f : ijField[indexI][2];
index += cAmoebaSim.paddedNumberOfAtoms;
unsigned int mask = 1 << j;
unsigned int pScaleIndex = (scaleMask.x & mask) ? 1 : 0;
pScaleIndex += (scaleMask.y & mask) ? 2 : 0;
debugArray[index].x = (float) pScaleIndex;
debugArray[index].y = scaleMask.x & mask ? 1.0f : -1.0f;
debugArray[index].z = scaleMask.y & mask ? 1.0f : -1.0f;
debugArray[index].w = + 10.0f;
*/
}
#endif
tj = (tj + 1) & (GRID - 1);
}
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
load3dArrayBufferPerWarp( offset, fieldSum, outputField );
load3dArrayBufferPerWarp( offset, fieldPolarSum, outputFieldPolar);
offset = 3*(y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
load3dArrayBufferPerWarp( offset, sA[threadIdx.x].field, outputField );
load3dArrayBufferPerWarp( offset, sA[threadIdx.x].fieldPolar, outputFieldPolar);
#else
unsigned int offset = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
load3dArray( offset, fieldSum, outputField );
load3dArray( offset, fieldPolarSum, outputFieldPolar);
offset = 3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
load3dArray( offset, sA[threadIdx.x].field, outputField );
load3dArray( offset, sA[threadIdx.x].fieldPolar, outputFieldPolar);
#endif
lasty = y;
}
pos++;
}
}
...@@ -373,6 +373,7 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG ...@@ -373,6 +373,7 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG
cudaComputeAmoebaMutualInducedField( amoebaGpu ); cudaComputeAmoebaMutualInducedField( amoebaGpu );
} else { } else {
cudaComputeAmoebaPmeFixedEField( amoebaGpu ); cudaComputeAmoebaPmeFixedEField( amoebaGpu );
cudaComputeAmoebaPmeMutualInducedField( amoebaGpu );
} }
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment