/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see . *
* -------------------------------------------------------------------------- */
#include "amoebaGpuTypes.h"
#include "amoebaCudaKernels.h"
#include "kCalculateAmoebaCudaUtilities.h"
#include "openmm/OpenMMException.h"
#include
using namespace std;
static __constant__ cudaGmxSimulation cSim;
static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;
void SetCalculateAmoebaCudaPmeMutualInducedFieldSim(amoebaGpuContext amoebaGpu)
{
cudaError_t status;
gpuContext gpu = amoebaGpu->gpuContext;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "SetCalculateAmoebaCudaPmeMutualInducedFieldSim: cudaMemcpyToSymbol: SetSim copy to cSim failed");
status = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation));
RTERROR(status, "SetCalculateAmoebaCudaPmeMutualInducedFieldSim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed");
}
void GetCalculateAmoebaCudaPmeMutualInducedFieldSim(amoebaGpuContext amoebaGpu)
{
cudaError_t status;
gpuContext gpu = amoebaGpu->gpuContext;
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
RTERROR(status, "GetCalculateAmoebaCudaPmeMutualInducedFieldSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed");
status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));
RTERROR(status, "GetCalculateAmoebaCudaPmeMutualInducedFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
}
#undef INCLUDE_MI_FIELD_BUFFERS
#define INCLUDE_MI_FIELD_BUFFERS
#include "kCalculateAmoebaCudaMutualInducedParticle.h"
#ifdef INCLUDE_MI_FIELD_BUFFERS
__device__ void sumTempBuffer( MutualInducedParticle& atomI, MutualInducedParticle& atomJ ){
atomI.tempBuffer[0] += atomJ.tempBuffer[0];
atomI.tempBuffer[1] += atomJ.tempBuffer[1];
atomI.tempBuffer[2] += atomJ.tempBuffer[2];
atomI.tempBufferP[0] += atomJ.tempBufferP[0];
atomI.tempBufferP[1] += atomJ.tempBufferP[1];
atomI.tempBufferP[2] += atomJ.tempBufferP[2];
}
#endif
// file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field
__device__ void setupMutualInducedFieldPairIxn_kernel( const MutualInducedParticle& atomI, const MutualInducedParticle& atomJ,
const float uscale, float4* delta, float* preFactor2 ) {
// compute thedelta->xeal space portion of the Ewald summation
delta->x = atomJ.x - atomI.x;
delta->y = atomJ.y - atomI.y;
delta->z = atomJ.z - atomI.z;
// pdelta->xiodic boundary conditions
delta->x -= floorf(delta->x*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
delta->y -= floorf(delta->y*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
delta->z -= floorf(delta->z*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
float r2 = (delta->x*delta->x) + (delta->y*delta->y) + (delta->z*delta->z);
if( r2 <= cSim.nonbondedCutoffSqr ){
float r = sqrtf(r2);
// calculate the error function damping terms
float ralpha = cSim.alphaEwald*r;
float bn0 = erfcf(ralpha)/r;
float alsq2 = 2.0f*cSim.alphaEwald*cSim.alphaEwald;
float alsq2n = 1.0f/(cAmoebaSim.sqrtPi*cSim.alphaEwald);
float exp2a = expf(-(ralpha*ralpha));
alsq2n *= alsq2;
float bn1 = (bn0+alsq2n*exp2a)/r2;
alsq2n *= alsq2;
float bn2 = (3.0f*bn1+alsq2n*exp2a)/r2;
// compute the error function scaled and unscaled terms
float scale3 = 1.0f;
float scale5 = 1.0f;
float damp = atomI.damp*atomJ.damp;
if( damp != 0.0f ){
float ratio = (r/damp);
ratio = ratio*ratio*ratio;
float pgamma = atomI.thole < atomJ.thole ? atomI.thole : atomJ.thole;
damp = -pgamma*ratio;
if( damp > -50.0f) {
float expdamp = expf(damp);
scale3 = 1.0f - expdamp;
scale5 = 1.0f - expdamp*(1.0f-damp);
}
}
float dsc3 = uscale*scale3;
float dsc5 = uscale*scale5;
float r3 = (r*r2);
float r5 = (r3*r2);
float rr3 = (1.0f-dsc3)/r3;
float rr5 = 3.0f*(1.0f-dsc5)/r5;
delta->w = rr3 - bn1;
*preFactor2 = bn2 - rr5;
} else {
delta->w = *preFactor2 = 0.0f;
}
}
__device__ void calculateMutualInducedFieldPairIxn_kernel( const float inducedDipole[3], const float4 delta, const float preFactor2, float fieldSum[3] ) {
float preFactor3 = preFactor2*(inducedDipole[0]*delta.x + inducedDipole[1]*delta.y + inducedDipole[2]*delta.z);
fieldSum[0] += preFactor3*delta.x + delta.w*inducedDipole[0];
fieldSum[1] += preFactor3*delta.y + delta.w*inducedDipole[1];
fieldSum[2] += preFactor3*delta.z + delta.w*inducedDipole[2];
}
__device__ void calculateMutualInducedFieldPairIxnNoAdd_kernel( const float inducedDipole[3], const float4 delta, const float preFactor2, float fieldSum[3] ) {
float preFactor3 = preFactor2*(inducedDipole[0]*delta.x + inducedDipole[1]*delta.y + inducedDipole[2]*delta.z);
fieldSum[0] = preFactor3*delta.x + delta.w*inducedDipole[0];
fieldSum[1] = preFactor3*delta.y + delta.w*inducedDipole[1];
fieldSum[2] = preFactor3*delta.z + delta.w*inducedDipole[2];
}
// file includes FixedFieldParticle struct definition/load/unload struct and body kernel for fixed E-field
__device__ void calculatePmeDirectMutualInducedFieldPairIxn_kernel( MutualInducedParticle& atomI, MutualInducedParticle& atomJ,
float uscale, float4 fields[3] ){
// compute the real space portion of the Ewald summation
float xr = atomJ.x - atomI.x;
float yr = atomJ.y - atomI.y;
float zr = atomJ.z - atomI.z;
// periodic boundary conditions
xr -= floorf(xr*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
yr -= floorf(yr*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
zr -= floorf(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
float r2 = xr*xr + yr* yr + zr*zr;
if( r2 <= cSim.nonbondedCutoffSqr ){
float r = sqrtf(r2);
// calculate the error function damping terms
float ralpha = cSim.alphaEwald*r;
float bn0 = erfcf(ralpha)/r;
float alsq2 = 2.0f*cSim.alphaEwald*cSim.alphaEwald;
float alsq2n = 1.0f/(cAmoebaSim.sqrtPi*cSim.alphaEwald);
float exp2a = expf(-(ralpha*ralpha));
alsq2n *= alsq2;
float bn1 = (bn0+alsq2n*exp2a)/r2;
alsq2n *= alsq2;
float bn2 = (3.0f*bn1+alsq2n*exp2a)/r2;
// compute the error function scaled and unscaled terms
float scale3 = 1.0f;
float scale5 = 1.0f;
float damp = atomI.damp*atomJ.damp;
if( damp != 0.0f ){
float ratio = (r/damp);
ratio = ratio*ratio*ratio;
float pgamma = atomI.thole < atomJ.thole ? atomI.thole : atomJ.thole;
damp = -pgamma*ratio;
if( damp > -50.0f) {
float expdamp = expf(damp);
scale3 = 1.0f - expdamp;
scale5 = 1.0f - expdamp*(1.0f-damp);
}
}
float dsc3 = uscale*scale3;
float dsc5 = uscale*scale5;
float r3 = (r*r2);
float r5 = (r3*r2);
float rr3 = (1.0f-dsc3)/r3;
float rr5 = 3.0f*(1.0f-dsc5)/r5;
float preFactor1 = rr3 - bn1;
float preFactor2 = bn2 - rr5;
float dukr = atomJ.inducedDipole[0]*xr + atomJ.inducedDipole[1]*yr + atomJ.inducedDipole[2]*zr;
float preFactor3 = preFactor2*dukr;
fields[0].x = preFactor3*xr + preFactor1*atomJ.inducedDipole[0];
fields[1].x = preFactor3*yr + preFactor1*atomJ.inducedDipole[1];
fields[2].x = preFactor3*zr + preFactor1*atomJ.inducedDipole[2];
float duir = atomI.inducedDipole[0]*xr + atomI.inducedDipole[1]*yr + atomI.inducedDipole[2]*zr;
preFactor3 = preFactor2*duir;
fields[0].y = preFactor3*xr + preFactor1*atomI.inducedDipole[0];
fields[1].y = preFactor3*yr + preFactor1*atomI.inducedDipole[1];
fields[2].y = preFactor3*zr + preFactor1*atomI.inducedDipole[2];
float pukr = atomJ.inducedDipolePolar[0]*xr + atomJ.inducedDipolePolar[1]*yr + atomJ.inducedDipolePolar[2]*zr;
preFactor3 = preFactor2*pukr;
fields[0].z = preFactor3*xr + preFactor1*atomJ.inducedDipolePolar[0];
fields[1].z = preFactor3*yr + preFactor1*atomJ.inducedDipolePolar[1];
fields[2].z = preFactor3*zr + preFactor1*atomJ.inducedDipolePolar[2];
float puir = atomI.inducedDipolePolar[0]*xr + atomI.inducedDipolePolar[1]*yr + atomI.inducedDipolePolar[2]*zr;
preFactor3 = preFactor2*puir;
fields[0].w = preFactor3*xr + preFactor1*atomI.inducedDipolePolar[0];
fields[1].w = preFactor3*yr + preFactor1*atomI.inducedDipolePolar[1];
fields[2].w = preFactor3*zr + preFactor1*atomI.inducedDipolePolar[2];
} else {
fields[0].x = 0.0f;
fields[0].y = 0.0f;
fields[0].z = 0.0f;
fields[0].w = 0.0f;
fields[1].x = 0.0f;
fields[1].y = 0.0f;
fields[1].z = 0.0f;
fields[1].w = 0.0f;
fields[2].x = 0.0f;
fields[2].y = 0.0f;
fields[2].z = 0.0f;
fields[2].w = 0.0f;
}
}
// Include versions of the kernels for N^2 calculations.
#define METHOD_NAME(a, b) a##Cutoff##b
#include "kCalculateAmoebaCudaPmeMutualInducedField.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##CutoffByWarp##b
#include "kCalculateAmoebaCudaPmeMutualInducedField.h"
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
static void kInitializeMutualInducedField_kernel(
int numberOfAtoms,
float* fixedEField,
float* fixedEFieldPolar,
float* polarizability )
{
int pos = blockIdx.x*blockDim.x + threadIdx.x;
while( pos < 3*cSim.atoms )
{
fixedEField[pos] *= polarizability[pos];
fixedEFieldPolar[pos] *= polarizability[pos];
pos += blockDim.x*gridDim.x;
}
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
static void kReduceMutualInducedFieldDelta_kernel(int numberOfEntries, float* arrayOfDeltas1, float* arrayOfDeltas2, float* epsilon )
{
extern __shared__ float2 delta[];
delta[threadIdx.x].x = 0.0f;
delta[threadIdx.x].y = 0.0f;
unsigned int pos = threadIdx.x;
// load deltas
while( pos < numberOfEntries )
{
delta[threadIdx.x].x += arrayOfDeltas1[pos];
delta[threadIdx.x].y += arrayOfDeltas2[pos];
pos += blockDim.x*gridDim.x;
}
__syncthreads();
// sum the deltas
for (int offset = 1; offset < blockDim.x; offset *= 2 )
{
if (threadIdx.x + offset < blockDim.x && (threadIdx.x & (2*offset-1)) == 0)
{
delta[threadIdx.x].x += delta[threadIdx.x+offset].x;
delta[threadIdx.x].y += delta[threadIdx.x+offset].y;
}
__syncthreads();
}
// set epsilons
if (threadIdx.x == 0)
{
epsilon[0] = delta[0].x > delta[0].y ? delta[0].x : delta[0].y;
epsilon[0] = 48.033324f*sqrtf( epsilon[0]/( (float) (numberOfEntries/3)) );
}
}
/**
matrixProduct/matrixProductP contains epsilon**2 on output
*/
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
static void kSorUpdateMutualInducedField_kernel(
float* polarizability,
float* inducedDipole, float* inducedDipoleP,
float* fixedEField, float* fixedEFieldP,
float* matrixProduct, float* matrixProductP )
{
int pos = blockIdx.x*blockDim.x + threadIdx.x;
const float term = (4.0f/3.0f)*(cSim.alphaEwald*cSim.alphaEwald*cSim.alphaEwald)/cAmoebaSim.sqrtPi;
const float polarSOR = 0.55f;
while( pos < 3*cSim.atoms )
{
float previousDipole = inducedDipole[pos];
float previousDipoleP = inducedDipoleP[pos];
// add self terms to fields
float mProd = matrixProduct[pos];
float mProdP = matrixProductP[pos];
mProd += term*previousDipole;
mProdP += term*previousDipoleP;
float inducedDipoleI = fixedEField[pos] + polarizability[pos]*mProd;
float inducedDipoleIP = fixedEFieldP[pos] + polarizability[pos]*mProdP;
inducedDipole[pos] = previousDipole + polarSOR*( inducedDipoleI - previousDipole );
inducedDipoleP[pos] = previousDipoleP + polarSOR*( inducedDipoleIP - previousDipoleP );
matrixProduct[pos] = ( inducedDipole[pos] - previousDipole )*( inducedDipole[pos] - previousDipole );
matrixProductP[pos] = ( inducedDipoleP[pos] - previousDipoleP )*( inducedDipoleP[pos] - previousDipoleP );
pos += blockDim.x*gridDim.x;
}
}
// reduce psWorkArray_3_1
// reduce psWorkArray_3_2
static void kReduceMutualInducedFields(amoebaGpuContext amoebaGpu, CUDAStream* outputArray, CUDAStream* outputPolarArray )
{
gpuContext gpu = amoebaGpu->gpuContext;
kReduceFields_kernel<<sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData, 0 );
LAUNCHERROR("kReducePmeMI_Fields1");
kReduceFields_kernel<<sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_2->_pDevData, outputPolarArray->_pDevData, 0 );
LAUNCHERROR("kReducePmeMI_Fields2");
}
/**---------------------------------------------------------------------------------------
Compute mutual induce field
@param amoebaGpu amoebaGpu context
--------------------------------------------------------------------------------------- */
static void cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpuContext amoebaGpu,
CUDAStream* outputArray, CUDAStream* outputPolarArray )
{
static unsigned int threadsPerBlock = 0;
gpuContext gpu = amoebaGpu->gpuContext;
kClearFields_3( amoebaGpu, 2 );
// on first pass, set threads/block
if( threadsPerBlock == 0 ){
unsigned int maxThreads;
if (gpu->sm_version >= SM_20)
maxThreads = 384;
else if (gpu->sm_version >= SM_12)
maxThreads = 128;
else
maxThreads = 64;
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(MutualInducedParticle), gpu->sharedMemoryPerBlock ), maxThreads);
}
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaPmeMutualInducedFieldCutoffByWarp_kernel<<sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkArray_3_1->_pDevData,
amoebaGpu->psWorkArray_3_2->_pDevData );
} else {
kCalculateAmoebaPmeMutualInducedFieldCutoff_kernel<<sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
gpu->sim.pInteractingWorkUnit,
amoebaGpu->psWorkArray_3_1->_pDevData,
amoebaGpu->psWorkArray_3_2->_pDevData );
}
LAUNCHERROR("kCalculateAmoebaPmeMutualInducedField");
kReduceMutualInducedFields( amoebaGpu, outputArray, outputPolarArray );
}
/**---------------------------------------------------------------------------------------
Compute mutual induce field
@param amoebaGpu amoebaGpu context
--------------------------------------------------------------------------------------- */
static void cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu )
{
// ---------------------------------------------------------------------------------------
int done;
int iteration;
gpuContext gpu = amoebaGpu->gpuContext;
// ---------------------------------------------------------------------------------------
// set E_Field & E_FieldPolar] to [ E_Field & E_FieldPolar]*Polarizability
// initialize [ InducedDipole & InducedDipolePolar ] to [ E_Field & E_FieldPolar]*Polarizability
kInitializeMutualInducedField_kernel<<< gpu->sim.blocks, gpu->sim.threads_per_block >>>(
gpu->natoms,
amoebaGpu->psE_Field->_pDevData,
amoebaGpu->psE_FieldPolar->_pDevData,
amoebaGpu->psPolarizability->_pDevData );
LAUNCHERROR("AmoebaPmeMutualInducedFieldSetup");
cudaMemcpy( amoebaGpu->psInducedDipole->_pDevData, amoebaGpu->psE_Field->_pDevData, 3*gpu->sim.paddedNumberOfAtoms*sizeof( float ), cudaMemcpyDeviceToDevice );
cudaMemcpy( amoebaGpu->psInducedDipolePolar->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData, 3*gpu->sim.paddedNumberOfAtoms*sizeof( float ), cudaMemcpyDeviceToDevice );
// if polarization type is direct, set flags signalling done and return
if( amoebaGpu->amoebaSim.polarizationType )
{
amoebaGpu->mutualInducedDone = 1;
amoebaGpu->mutualInducedConverged = 1;
kCalculateAmoebaPMEInducedDipoleField( amoebaGpu );
return;
}
// ---------------------------------------------------------------------------------------
done = 0;
iteration = 1;
while( !done ){
// apply SOR
cudaComputeAmoebaPmeMutualInducedFieldMatrixMultiply( amoebaGpu, amoebaGpu->psWorkVector[0], amoebaGpu->psWorkVector[1] );
kCalculateAmoebaPMEInducedDipoleField( amoebaGpu );
// post matrix multiply
kSorUpdateMutualInducedField_kernel<<< gpu->sim.blocks, gpu->sim.threads_per_block >>>(
amoebaGpu->psPolarizability->_pDevData,
amoebaGpu->psInducedDipole->_pDevData, amoebaGpu->psInducedDipolePolar->_pDevData,
amoebaGpu->psE_Field->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData,
amoebaGpu->psWorkVector[0]->_pDevData, amoebaGpu->psWorkVector[1]->_pDevData );
LAUNCHERROR("kSorUpdatePmeMutualInducedField");
// get total epsilon -- performing sums on gpu
kReduceMutualInducedFieldDelta_kernel<<<1, amoebaGpu->epsilonThreadsPerBlock, 2*sizeof(float)*amoebaGpu->epsilonThreadsPerBlock>>>(
3*gpu->natoms, amoebaGpu->psWorkVector[0]->_pDevData, amoebaGpu->psWorkVector[1]->_pDevData,
amoebaGpu->psCurrentEpsilon->_pDevData );
LAUNCHERROR("kReducePmeMutualInducedFieldDelta");
// Debye=48.033324f
amoebaGpu->psCurrentEpsilon->Download();
float currentEpsilon = amoebaGpu->psCurrentEpsilon->_pSysData[0];
amoebaGpu->mutualInducedCurrentEpsilon = currentEpsilon;
if( iteration > amoebaGpu->mutualInducedMaxIterations || amoebaGpu->mutualInducedCurrentEpsilon < amoebaGpu->mutualInducedTargetEpsilon ){
done = 1;
}
// throw exception if nan detected
if( amoebaGpu->mutualInducedCurrentEpsilon != amoebaGpu->mutualInducedCurrentEpsilon ){
throw OpenMM::OpenMMException("PME induced dipole calculation detected nans." );
}
iteration++;
}
amoebaGpu->mutualInducedDone = done;
amoebaGpu->mutualInducedConverged = ( !done || iteration > amoebaGpu->mutualInducedMaxIterations ) ? 0 : 1;
}
void cudaComputeAmoebaPmeMutualInducedField( amoebaGpuContext amoebaGpu )
{
if( amoebaGpu->mutualInducedIterativeMethod == 0 ){
cudaComputeAmoebaPmeMutualInducedFieldBySOR( amoebaGpu );
}
}