Commit af4d503a authored by Mark Friedrichs's avatar Mark Friedrichs
Browse files

Minor optimizations

parent d4559528
//-----------------------------------------------------------------------------------------
///-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
......@@ -115,7 +115,7 @@ __device__ static void debugSetup( unsigned int atomI, unsigned int atomJ,
__device__ static void calculatePmeSelfEnergyElectrostaticPairIxn_kernel( PmeDirectElectrostaticParticle& atomI, float* energy)
{
float term = 2.0f*cSim.alphaEwald*cSim.alphaEwald;
float fterm = -(cAmoebaSim.electric/cAmoebaSim.dielec)*cSim.alphaEwald/cAmoebaSim.sqrtPi;
float fterm = -cSim.alphaEwald/cAmoebaSim.sqrtPi;
float cii = atomI.q*atomI.q;
......@@ -141,7 +141,7 @@ __device__ static void calculatePmeSelfEnergyElectrostaticPairIxn_kernel( PmeDir
__device__ static void calculatePmeSelfTorqueElectrostaticPairIxn_kernel( PmeDirectElectrostaticParticle& atomI)
{
float term = (4.0f/3.0f)*(cAmoebaSim.electric/cAmoebaSim.dielec)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi;
float term = (4.0f/3.0f)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi;
float uix = 0.5f*(atomI.inducedDipole[0] + atomI.inducedDipoleP[0]);
float uiy = 0.5f*(atomI.inducedDipole[1] + atomI.inducedDipoleP[1]);
......@@ -175,8 +175,6 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
float r = sqrt(r2);
float ck = atomJ.q;
float conversionFactor = (-cAmoebaSim.electric/cAmoebaSim.dielec);
// set the permanent multipole and induced dipole values;
float ci = atomI.q;
......@@ -209,7 +207,7 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
float qk8 = atomJ.labFrameQuadrupole[7];
float qk9 = atomJ.labFrameQuadrupole[8];
// calculate the real space error function terms;
// calculate the real space error function terms
float ralpha = cSim.alphaEwald*r;
float bn0 = erfc(ralpha)/r;
......@@ -236,7 +234,7 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
alsq2n *= alsq2;
float bn5 = (9.0f*bn4+alsq2n*exp2a)/r2;
// apply Thole polarization damping to scale factors;
// apply Thole polarization damping to scale factors
float rr1 = 1.0f/r;
float rr3 = rr1 / r2;
......@@ -511,7 +509,7 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
e = e - (1.0f-scalingFactors[MScaleIndex])*erl;
ei = ei - erli;
forceTorqueEnergy[0].w = -conversionFactor*(e + ei);
forceTorqueEnergy[0].w = (e + ei);
// increment the total intramolecular energy; assumes;
// intramolecular distances are less than half of cell;
......@@ -843,18 +841,17 @@ __device__ void calculatePmeDirectElectrostaticPairIxn_kernel( PmeDirectElectros
// increment gradient due to force and torque on first site;
forceTorqueEnergy[0].x = conversionFactor*(ftm21 + ftm2i1);
forceTorqueEnergy[0].y = conversionFactor*(ftm22 + ftm2i2);
forceTorqueEnergy[0].z = conversionFactor*(ftm23 + ftm2i3);
forceTorqueEnergy[0].x = (ftm21 + ftm2i1);
forceTorqueEnergy[0].y = (ftm22 + ftm2i2);
forceTorqueEnergy[0].z = (ftm23 + ftm2i3);
conversionFactor *= -1.0;
forceTorqueEnergy[1].x = conversionFactor*(ttm21 + ttm2i1);
forceTorqueEnergy[1].y = conversionFactor*(ttm22 + ttm2i2);
forceTorqueEnergy[1].z = conversionFactor*(ttm23 + ttm2i3);
forceTorqueEnergy[1].x = (ttm21 + ttm2i1);
forceTorqueEnergy[1].y = (ttm22 + ttm2i2);
forceTorqueEnergy[1].z = (ttm23 + ttm2i3);
forceTorqueEnergy[2].x = conversionFactor*(ttm31 + ttm3i1);
forceTorqueEnergy[2].y = conversionFactor*(ttm32 + ttm3i2);
forceTorqueEnergy[2].z = conversionFactor*(ttm33 + ttm3i3);
forceTorqueEnergy[2].x = (ttm31 + ttm3i1);
forceTorqueEnergy[2].y = (ttm32 + ttm3i2);
forceTorqueEnergy[2].z = (ttm33 + ttm3i3);
#ifdef AMOEBA_DEBUG
int debugIndex = 0;
......@@ -1138,7 +1135,7 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaPmeDirectElectrostaticCutoffForces: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u gpu->nonbond_threads_per_block=%u\n",
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaPmeDirectElectrostaticCutoffForces: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u gpu->nonbond_threads_per_block=%u\n",
gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(PmeDirectElectrostaticParticle), (sizeof(PmeDirectElectrostaticParticle))*threadsPerBlock,
(*gpu->psInteractionCount)[0], gpu->sim.workUnits, gpu->sim.nonbond_threads_per_block );
......@@ -1159,6 +1156,12 @@ void cudaComputeAmoebaPmeDirectElectrostatic( amoebaGpuContext amoebaGpu )
} else {
/*
if (gpu->sm_version >= SM_20)
cudaFuncSetCacheConfig(kCalculateAmoebaPmeDirectElectrostaticCutoffForces_kernel, cudaFuncCachePreferL1 );
//cudaFuncSetCacheConfig(kCalculateAmoebaPmeDirectElectrostaticCutoffForces_kernel, cudaFuncCachePreferShared );
*/
kCalculateAmoebaPmeDirectElectrostaticCutoffForces_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(PmeDirectElectrostaticParticle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit,
#ifdef AMOEBA_DEBUG
......
......@@ -58,6 +58,7 @@ void METHOD_NAME(kCalculateAmoebaPmeDirectElectrostatic, Forces_kernel)(
float4 forceTorqueEnergy[3];
float scalingFactors[LastScalingIndex];
float conversionFactor = (-cAmoebaSim.electric/cAmoebaSim.dielec);
while (pos < end)
{
......@@ -203,6 +204,14 @@ if( atomI == targetAtom || atomJ == targetAtom ){
totalEnergy += energy;
}
localParticle.force[0] *= conversionFactor;
localParticle.force[1] *= conversionFactor;
localParticle.force[2] *= conversionFactor;
localParticle.torque[0] *= -conversionFactor;
localParticle.torque[1] *= -conversionFactor;
localParticle.torque[2] *= -conversionFactor;
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
......@@ -386,6 +395,22 @@ if( atomI == targetAtom || atomJ == targetAtom ){
} // end of j-loop
localParticle.force[0] *= conversionFactor;
localParticle.force[1] *= conversionFactor;
localParticle.force[2] *= conversionFactor;
localParticle.torque[0] *= -conversionFactor;
localParticle.torque[1] *= -conversionFactor;
localParticle.torque[2] *= -conversionFactor;
sA[threadIdx.x].force[0] *= conversionFactor;
sA[threadIdx.x].force[1] *= conversionFactor;
sA[threadIdx.x].force[2] *= conversionFactor;
sA[threadIdx.x].torque[0] *= -conversionFactor;
sA[threadIdx.x].torque[1] *= -conversionFactor;
sA[threadIdx.x].torque[2] *= -conversionFactor;
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
......@@ -415,5 +440,5 @@ if( atomI == targetAtom || atomJ == targetAtom ){
}
pos++;
}
cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += totalEnergy;
cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] -= conversionFactor*totalEnergy;
}
......@@ -82,12 +82,12 @@ __device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInduce
float ralpha = cSim.alphaEwald*r;
float bn0 = erfc(ralpha)/r;
float bn0 = erfc(ralpha)/r;
float alsq2 = 2.0f*cSim.alphaEwald*cSim.alphaEwald;
float alsq2n = 1.0f/(cAmoebaSim.sqrtPi*cSim.alphaEwald);
float exp2a = exp(-(ralpha*ralpha));
alsq2n *= alsq2;
float bn1 = (bn0+alsq2n*exp2a)/r2;
float bn1 = (bn0+alsq2n*exp2a)/r2;
alsq2n *= alsq2;
float bn2 = (3.0f*bn1+alsq2n*exp2a)/r2;
......@@ -116,66 +116,41 @@ __device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInduce
float r3 = (r*r2);
float r5 = (r3*r2);
float rr3 = (1.0f-dsc3)/r3;
float rr5 = 3.0f * (1.0f-dsc5)/r5;
float rr5 = 3.0f*(1.0f-dsc5)/r5;
float duir = atomI.inducedDipole[0]*xr + atomI.inducedDipole[1]*yr + atomI.inducedDipole[2]*zr;
float dukr = atomJ.inducedDipole[0]*xr + atomJ.inducedDipole[1]*yr + atomJ.inducedDipole[2]*zr;
float puir = atomI.inducedDipolePolar[0]*xr + atomI.inducedDipolePolar[1]*yr + atomI.inducedDipolePolar[2]*zr;
float pukr = atomJ.inducedDipolePolar[0]*xr + atomJ.inducedDipolePolar[1]*yr + atomJ.inducedDipolePolar[2]*zr;
float preFactor1 = rr3 - bn1;
float preFactor2 = bn2 - rr5;
bn1 *= -1.0f;
float dukr = atomJ.inducedDipole[0]*xr + atomJ.inducedDipole[1]*yr + atomJ.inducedDipole[2]*zr;
float preFactor3 = preFactor2*dukr;
float fimd0 = bn1*atomJ.inducedDipole[0] + bn2*dukr*xr;
float fimd1 = bn1*atomJ.inducedDipole[1] + bn2*dukr*yr;
float fimd2 = bn1*atomJ.inducedDipole[2] + bn2*dukr*zr;
fields[0].x = preFactor3*xr + preFactor1*atomJ.inducedDipole[0];
fields[1].x = preFactor3*yr + preFactor1*atomJ.inducedDipole[1];
fields[2].x = preFactor3*zr + preFactor1*atomJ.inducedDipole[2];
float fkmd0 = bn1*atomI.inducedDipole[0] + bn2*duir*xr;
float fkmd1 = bn1*atomI.inducedDipole[1] + bn2*duir*yr;
float fkmd2 = bn1*atomI.inducedDipole[2] + bn2*duir*zr;
float fimp0 = bn1*atomJ.inducedDipolePolar[0] + bn2*pukr*xr;
float fimp1 = bn1*atomJ.inducedDipolePolar[1] + bn2*pukr*yr;
float fimp2 = bn1*atomJ.inducedDipolePolar[2] + bn2*pukr*zr;
float duir = atomI.inducedDipole[0]*xr + atomI.inducedDipole[1]*yr + atomI.inducedDipole[2]*zr;
preFactor3 = preFactor2*duir;
float fkmp0 = bn1*atomI.inducedDipolePolar[0] + bn2*puir*xr;
float fkmp1 = bn1*atomI.inducedDipolePolar[1] + bn2*puir*yr;
float fkmp2 = bn1*atomI.inducedDipolePolar[2] + bn2*puir*zr;
fields[0].y = preFactor3*xr + preFactor1*atomI.inducedDipole[0];
fields[1].y = preFactor3*yr + preFactor1*atomI.inducedDipole[1];
fields[2].y = preFactor3*zr + preFactor1*atomI.inducedDipole[2];
rr3 *= -1.0f;
float fid0 = rr3*atomJ.inducedDipole[0] + rr5*dukr*xr;
float fid1 = rr3*atomJ.inducedDipole[1] + rr5*dukr*yr;
float fid2 = rr3*atomJ.inducedDipole[2] + rr5*dukr*zr;
float fkd0 = rr3*atomI.inducedDipole[0] + rr5*duir*xr;
float fkd1 = rr3*atomI.inducedDipole[1] + rr5*duir*yr;
float fkd2 = rr3*atomI.inducedDipole[2] + rr5*duir*zr;
float pukr = atomJ.inducedDipolePolar[0]*xr + atomJ.inducedDipolePolar[1]*yr + atomJ.inducedDipolePolar[2]*zr;
preFactor3 = preFactor2*pukr;
float fip0 = rr3*atomJ.inducedDipolePolar[0] + rr5*pukr*xr;
float fip1 = rr3*atomJ.inducedDipolePolar[1] + rr5*pukr*yr;
float fip2 = rr3*atomJ.inducedDipolePolar[2] + rr5*pukr*zr;
fields[0].z = preFactor3*xr + preFactor1*atomJ.inducedDipolePolar[0];
fields[1].z = preFactor3*yr + preFactor1*atomJ.inducedDipolePolar[1];
fields[2].z = preFactor3*zr + preFactor1*atomJ.inducedDipolePolar[2];
float fkp0 = rr3*atomI.inducedDipolePolar[0] + rr5*puir*xr;
float fkp1 = rr3*atomI.inducedDipolePolar[1] + rr5*puir*yr;
float fkp2 = rr3*atomI.inducedDipolePolar[2] + rr5*puir*zr;
// increment the field at each site due to this interaction
float puir = atomI.inducedDipolePolar[0]*xr + atomI.inducedDipolePolar[1]*yr + atomI.inducedDipolePolar[2]*zr;
preFactor3 = preFactor2*puir;
fields[0].w = preFactor3*xr + preFactor1*atomI.inducedDipolePolar[0];
fields[1].w = preFactor3*yr + preFactor1*atomI.inducedDipolePolar[1];
fields[2].w = preFactor3*zr + preFactor1*atomI.inducedDipolePolar[2];
fields[0].x = fimd0 - fid0;
fields[0].y = fkmd0 - fkd0;
fields[0].z = fimp0 - fip0;
fields[0].w = fkmp0 - fkp0;
fields[1].x = fimd1 - fid1;
fields[1].y = fkmd1 - fkd1;
fields[1].z = fimp1 - fip1;
fields[1].w = fkmp1 - fkp1;
fields[2].x = fimd2 - fid2;
fields[2].y = fkmd2 - fkd2;
fields[2].z = fimp2 - fip2;
fields[2].w = fkmp2 - fkp2;
} else {
fields[0].x = 0.0f;
......@@ -240,7 +215,7 @@ static void kInitializeMutualInducedField_kernel(
float* polarizability )
{
int pos = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
int pos = blockIdx.x*blockDim.x + threadIdx.x;
while( pos < 3*cSim.atoms )
{
fixedEField[pos] *= polarizability[pos];
......@@ -323,7 +298,7 @@ static void kSorUpdateMutualInducedField_kernel(
float* matrixProduct, float* matrixProductP )
{
int pos = __mul24(blockIdx.x,blockDim.x) + threadIdx.x;
int pos = blockIdx.x*blockDim.x + threadIdx.x;
while( pos < 3*cSim.atoms )
{
......@@ -415,9 +390,8 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "Cutoff -- use warp\n" );
(void) fprintf( amoebaGpu->log, "%s numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u ixnCt=%u workUnits=%u\n",
methodName, gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
(void) fprintf( amoebaGpu->log, "cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%lu shrd=%lu ixnCt=%lu workUnits=%u\n",
gpu->sim.nonbond_blocks, threadsPerBlock, gpu->bOutputBufferPerWarp,
sizeof(MutualInducedParticle), sizeof(MutualInducedParticle)*threadsPerBlock,
(*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log );
......@@ -426,8 +400,6 @@ static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuConte
if (gpu->bOutputBufferPerWarp){
//gpu->sim.pInteractingWorkUnit,
//amoebaGpu->psWorkUnit->_pDevData,
kCalculateAmoebaPmeMutualInducedFieldCutoffByWarp_kernel<<<gpu->sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkArray_3_1->_pDevData,
......@@ -591,7 +563,6 @@ static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoeba
cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpu, amoebaGpu->psWorkVector[0], amoebaGpu->psWorkVector[1] );
kCalculateAmoebaPMEInducedDipoleField( amoebaGpu );
LAUNCHERROR("cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply Loop\n");
// post matrix multiply
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment