/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see . *
* -------------------------------------------------------------------------- */
#include "amoebaGpuTypes.h"
#include "amoebaCudaKernels.h"
#include "kCalculateAmoebaCudaUtilities.h"
#include
using namespace std;
static __constant__ cudaGmxSimulation cSim;
static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;
void SetCalculateAmoebaCudaMutualInducedFieldSim(amoebaGpuContext amoebaGpu)
{
cudaError_t status;
gpuContext gpu = amoebaGpu->gpuContext;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "SetCalculateAmoebaCudaMutualInducedFieldSim: cudaMemcpyToSymbol: SetSim copy to cSim failed");
status = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation));
RTERROR(status, "SetCalculateAmoebaCudaMutualInducedFieldSim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed");
}
void GetCalculateAmoebaCudaMutualInducedFieldSim(amoebaGpuContext amoebaGpu)
{
cudaError_t status;
gpuContext gpu = amoebaGpu->gpuContext;
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
RTERROR(status, "GetCalculateAmoebaCudaMutualInducedFieldSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed");
status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));
RTERROR(status, "GetCalculateAmoebaCudaMutualInducedFieldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
}
#include "kCalculateAmoebaCudaMutualInducedParticle.h"
__device__ void calculateMutualInducedFieldPairIxn_kernel( MutualInducedParticle& atomI, MutualInducedParticle& atomJ,
float fields[4][3] )
{
float deltaR[3];
// ---------------------------------------------------------------------------------------
// get deltaR, and r between 2 atoms
deltaR[0] = atomJ.x - atomI.x;
deltaR[1] = atomJ.y - atomI.y;
deltaR[2] = atomJ.z - atomI.z;
float r = sqrtf( deltaR[0]*deltaR[0] + deltaR[1]*deltaR[1] + deltaR[2]*deltaR[2] );
float rI = 1.0f/r;
float r2I = rI*rI;
float rr3 = -rI*r2I;
float rr5 = -3.0f*rr3*r2I;
float dampProd = atomI.damp*atomJ.damp;
float ratio = (dampProd != 0.0f) ? (r/dampProd) : 1.0f;
float pGamma = atomJ.thole > atomI.thole ? atomI.thole: atomJ.thole;
float damp = ratio*ratio*ratio*pGamma;
float dampExp = ( (dampProd != 0.0f) && (r < cAmoebaSim.scalingDistanceCutoff) ) ? expf( -damp ) : 0.0f;
rr3 *= (1.0f - dampExp);
rr5 *= (1.0f - ( 1.0f + damp )*dampExp);
float dDotDelta = rr5*(deltaR[0]*atomJ.inducedDipole[0] + deltaR[1]*atomJ.inducedDipole[1] + deltaR[2]*atomJ.inducedDipole[2] );
fields[0][0] = rr3*atomJ.inducedDipole[0] + dDotDelta*deltaR[0];
fields[0][1] = rr3*atomJ.inducedDipole[1] + dDotDelta*deltaR[1];
fields[0][2] = rr3*atomJ.inducedDipole[2] + dDotDelta*deltaR[2];
dDotDelta = rr5*(deltaR[0]*atomJ.inducedDipolePolar[0] + deltaR[1]*atomJ.inducedDipolePolar[1] + deltaR[2]*atomJ.inducedDipolePolar[2] );
fields[1][0] = rr3*atomJ.inducedDipolePolar[0] + dDotDelta*deltaR[0];
fields[1][1] = rr3*atomJ.inducedDipolePolar[1] + dDotDelta*deltaR[1];
fields[1][2] = rr3*atomJ.inducedDipolePolar[2] + dDotDelta*deltaR[2];
dDotDelta = rr5*(deltaR[0]*atomI.inducedDipole[0] + deltaR[1]*atomI.inducedDipole[1] + deltaR[2]*atomI.inducedDipole[2] );
fields[2][0] = rr3*atomI.inducedDipole[0] + dDotDelta*deltaR[0];
fields[2][1] = rr3*atomI.inducedDipole[1] + dDotDelta*deltaR[1];
fields[2][2] = rr3*atomI.inducedDipole[2] + dDotDelta*deltaR[2];
dDotDelta = rr5*(deltaR[0]*atomI.inducedDipolePolar[0] + deltaR[1]*atomI.inducedDipolePolar[1] + deltaR[2]*atomI.inducedDipolePolar[2] );
fields[3][0] = rr3*atomI.inducedDipolePolar[0] + dDotDelta*deltaR[0];
fields[3][1] = rr3*atomI.inducedDipolePolar[1] + dDotDelta*deltaR[1];
fields[3][2] = rr3*atomI.inducedDipolePolar[2] + dDotDelta*deltaR[2];
}
// Include versions of the kernels for N^2 calculations.
#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateAmoebaCudaMutualInducedField.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateAmoebaCudaMutualInducedField.h"
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kInitializeMutualInducedField_kernel(
int numberOfAtoms,
float* fixedEField,
float* fixedEFieldPolar,
float* polarizability )
{
int pos = blockIdx.x*blockDim.x + threadIdx.x;
while( pos < 3*cSim.atoms )
{
fixedEField[pos] *= polarizability[pos];
fixedEFieldPolar[pos] *= polarizability[pos];
pos += blockDim.x*gridDim.x;
}
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kReduceMutualInducedFieldDelta_kernel(int numberOfEntries, float* arrayOfDeltas1, float* arrayOfDeltas2, float* epsilon )
{
extern __shared__ float2 delta[];
delta[threadIdx.x].x = 0.0f;
delta[threadIdx.x].y = 0.0f;
unsigned int pos = threadIdx.x;
// load deltas
while( pos < numberOfEntries )
{
delta[threadIdx.x].x += arrayOfDeltas1[pos];
delta[threadIdx.x].y += arrayOfDeltas2[pos];
pos += blockDim.x*gridDim.x;
}
__syncthreads();
// sum the deltas
for (int offset = 1; offset < blockDim.x; offset *= 2 )
{
if (threadIdx.x + offset < blockDim.x && (threadIdx.x & (2*offset-1)) == 0)
{
delta[threadIdx.x].x += delta[threadIdx.x+offset].x;
delta[threadIdx.x].y += delta[threadIdx.x+offset].y;
}
__syncthreads();
}
// set epsilons
if (threadIdx.x == 0)
{
epsilon[0] = delta[0].x > delta[0].y ? delta[0].x : delta[0].y;
epsilon[0] = 48.033324f*sqrtf( epsilon[0]/( (float) (numberOfEntries/3)) );
}
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(GT2XX_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_THREADS_PER_BLOCK, 1)
#endif
void kSorUpdateMutualInducedField_kernel(
int numberOfEntries, float* polarizability,
float* inducedDipole, float* inducedDipoleP,
float* fixedEField, float* fixedEFieldP,
float* matrixProduct, float* matrixProductP )
{
float polarSOR = 0.70f;
int pos = blockIdx.x*blockDim.x + threadIdx.x;
while( pos < 3*cSim.atoms )
{
float previousDipole = inducedDipole[pos];
float previousDipoleP = inducedDipoleP[pos];
inducedDipole[pos] = fixedEField[pos] + polarizability[pos]*matrixProduct[pos];
inducedDipoleP[pos] = fixedEFieldP[pos] + polarizability[pos]*matrixProductP[pos];
inducedDipole[pos] = previousDipole + polarSOR*( inducedDipole[pos] - previousDipole );
inducedDipoleP[pos] = previousDipoleP + polarSOR*( inducedDipoleP[pos] - previousDipoleP );
matrixProduct[pos] = ( inducedDipole[pos] - previousDipole )*( inducedDipole[pos] - previousDipole );
matrixProductP[pos] = ( inducedDipoleP[pos] - previousDipoleP )*( inducedDipoleP[pos] - previousDipoleP );
pos += blockDim.x*gridDim.x;
}
}
// reduce psWorkArray_3_1
// reduce psWorkArray_3_2
static void kReduceMutualInducedFields(amoebaGpuContext amoebaGpu, CUDAStream* outputArray, CUDAStream* outputPolarArray )
{
gpuContext gpu = amoebaGpu->gpuContext;
kReduceFields_kernel<<sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevData, outputArray->_pDevData, 0 );
LAUNCHERROR("kReduceMI_Fields1");
kReduceFields_kernel<<sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block>>>(
gpu->sim.paddedNumberOfAtoms*3, gpu->sim.outputBuffers,
amoebaGpu->psWorkArray_3_2->_pDevData, outputPolarArray->_pDevData, 0 );
LAUNCHERROR("kReduceMI_Fields2");
}
/**---------------------------------------------------------------------------------------
Compute mutual induce field
@param amoebaGpu amoebaGpu context
--------------------------------------------------------------------------------------- */
static void cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpuContext amoebaGpu,
CUDAStream* outputArray, CUDAStream* outputPolarArray )
{
// ---------------------------------------------------------------------------------------
static unsigned int threadsPerBlock = 0;
// ---------------------------------------------------------------------------------------
gpuContext gpu = amoebaGpu->gpuContext;
kClearFields_3( amoebaGpu, 2 );
if( threadsPerBlock == 0 ){
unsigned int maxThreads;
if (gpu->sm_version >= SM_20)
maxThreads = 512;
else if (gpu->sm_version >= SM_12)
maxThreads = 128;
else
maxThreads = 64;
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(MutualInducedParticle), gpu->sharedMemoryPerBlock ), maxThreads);
}
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaMutualInducedFieldN2ByWarp_kernel<<sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
gpu->psWorkUnit->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData,
amoebaGpu->psWorkArray_3_2->_pDevData );
} else {
kCalculateAmoebaMutualInducedFieldN2_kernel<<sim.nonbond_blocks, threadsPerBlock, sizeof(MutualInducedParticle)*threadsPerBlock>>>(
gpu->psWorkUnit->_pDevData,
amoebaGpu->psWorkArray_3_1->_pDevData,
amoebaGpu->psWorkArray_3_2->_pDevData );
}
LAUNCHERROR("kCalculateAmoebaMutualInducedField");
kReduceMutualInducedFields( amoebaGpu, outputArray, outputPolarArray );
}
/**---------------------------------------------------------------------------------------
Compute mutual induce field
@param amoebaGpu amoebaGpu context
--------------------------------------------------------------------------------------- */
static void cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpuContext amoebaGpu )
{
// ---------------------------------------------------------------------------------------
int done;
int iteration;
gpuContext gpu = amoebaGpu->gpuContext;
// ---------------------------------------------------------------------------------------
// set E_Field & E_FieldPolar] to [ E_Field & E_FieldPolar]*Polarizability
// initialize [ InducedDipole & InducedDipolePolar ] to [ E_Field & E_FieldPolar]*Polarizability
kInitializeMutualInducedField_kernel<<< gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block >>>(
gpu->natoms,
amoebaGpu->psE_Field->_pDevData,
amoebaGpu->psE_FieldPolar->_pDevData,
amoebaGpu->psPolarizability->_pDevData );
LAUNCHERROR("AmoebaMutualInducedFieldSetup");
cudaMemcpy( amoebaGpu->psInducedDipole->_pDevData, amoebaGpu->psE_Field->_pDevData, 3*gpu->sim.paddedNumberOfAtoms*sizeof( float ), cudaMemcpyDeviceToDevice );
cudaMemcpy( amoebaGpu->psInducedDipolePolar->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData, 3*gpu->sim.paddedNumberOfAtoms*sizeof( float ), cudaMemcpyDeviceToDevice );
// if polarization type is direct, set flags signalling done and return
if( amoebaGpu->amoebaSim.polarizationType )
{
amoebaGpu->mutualInducedDone = 1;
amoebaGpu->mutualInducedConverged = 1;
return;
}
// ---------------------------------------------------------------------------------------
done = 0;
iteration = 1;
while( !done ){
// matrix multiply
cudaComputeAmoebaMutualInducedFieldMatrixMultiply( amoebaGpu, amoebaGpu->psWorkVector[0], amoebaGpu->psWorkVector[1] );
LAUNCHERROR("cudaComputeAmoebaMutualInducedFieldMatrixMultiply Loop\n");
// post matrix multiply
kSorUpdateMutualInducedField_kernel<<< gpu->sim.nonbond_blocks, gpu->sim.bsf_reduce_threads_per_block >>>(
gpu->natoms, amoebaGpu->psPolarizability->_pDevData,
amoebaGpu->psInducedDipole->_pDevData, amoebaGpu->psInducedDipolePolar->_pDevData,
amoebaGpu->psE_Field->_pDevData, amoebaGpu->psE_FieldPolar->_pDevData,
amoebaGpu->psWorkVector[0]->_pDevData, amoebaGpu->psWorkVector[1]->_pDevData );
LAUNCHERROR("kSorUpdateMutualInducedField");
// get total epsilon -- performing sums on gpu
kReduceMutualInducedFieldDelta_kernel<<<1, amoebaGpu->epsilonThreadsPerBlock, 2*sizeof(float)*amoebaGpu->epsilonThreadsPerBlock>>>(
3*gpu->natoms, amoebaGpu->psWorkVector[0]->_pDevData, amoebaGpu->psWorkVector[1]->_pDevData,
amoebaGpu->psCurrentEpsilon->_pDevData );
LAUNCHERROR("kReduceMutualInducedFieldDelta");
// Debye=48.033324f
amoebaGpu->psCurrentEpsilon->Download();
float currentEpsilon = amoebaGpu->psCurrentEpsilon->_pSysData[0];
amoebaGpu->mutualInducedCurrentEpsilon = currentEpsilon;
if( iteration > amoebaGpu->mutualInducedMaxIterations || amoebaGpu->mutualInducedCurrentEpsilon < amoebaGpu->mutualInducedTargetEpsilon ){
done = 1;
}
iteration++;
}
amoebaGpu->mutualInducedDone = done;
amoebaGpu->mutualInducedConverged = ( !done || iteration > amoebaGpu->mutualInducedMaxIterations ) ? 0 : 1;
}
void cudaComputeAmoebaMutualInducedField( amoebaGpuContext amoebaGpu )
{
if( amoebaGpu->mutualInducedIterativeMethod == 0 ){
cudaComputeAmoebaMutualInducedFieldBySOR( amoebaGpu );
}
}