Commit 3bcfe998 authored by Mark Friedrichs's avatar Mark Friedrichs
Browse files

Initial RealSpaceEwald code for AmoebaMultipoleForce

Removal SASA code
Versioning introduced in AmoebaTinkerParameterFile
Decomposition of PME forces included in prm files 
parent f981842a
......@@ -50,6 +50,19 @@ namespace OpenMM {
class OPENMM_EXPORT AmoebaMultipoleForce : public Force {
public:
enum AmoebaNonbondedMethod {
/**
* No cutoff is applied to nonbonded interactions. The full set of N^2 interactions is computed exactly.
* This necessarily means that periodic boundary conditions cannot be used. This is the default.
*/
NoCutoff = 0,
/**
* Periodic boundary conditions are used, and Particle-Mesh Ewald (PME) summation is used to compute the interaction of each particle
* with all periodic copies of every other particle.
*/
PME = 1
};
enum MultipoleAxisTypes { ZThenX, Bisector };
// Algorithm used to converge mutual induced dipoles:
......@@ -74,6 +87,46 @@ public:
return multipoles.size();
}
/**
* Get the method used for handling long range nonbonded interactions.
*/
AmoebaNonbondedMethod getNonbondedMethod( void ) const;
/**
* Set the method used for handling long range nonbonded interactions.
*/
void setNonbondedMethod(AmoebaNonbondedMethod method);
/**
* Get the cutoff distance (in nm) being used for nonbonded interactions. If the NonbondedMethod in use
* is NoCutoff, this value will have no effect.
*
* @return the cutoff distance, measured in nm
*/
double getCutoffDistance( void ) const;
/**
* Set the cutoff distance (in nm) being used for nonbonded interactions. If the NonbondedMethod in use
* is NoCutoff, this value will have no effect.
*
* @param distance the cutoff distance, measured in nm
*/
void setCutoffDistance(double distance);
/**
* Get the aEwald parameter
*
* @return the Ewald parameter
*/
double getAEwald() const;
/**
* Set the aEwald parameter
*
* @param Ewald parameter
*/
void setAEwald(double aewald);
/**
* Add multipole-related info for a particle
*
......@@ -224,6 +277,9 @@ protected:
ForceImpl* createImpl();
private:
AmoebaNonbondedMethod nonbondedMethod;
double cutoffDistance;
double aewald;
MutualInducedIterationMethod mutualInducedIterationMethod;
int mutualInducedMaxIterations;
double mutualInducedTargetEpsilon;
......
......@@ -36,15 +36,32 @@
using namespace OpenMM;
AmoebaMultipoleForce::AmoebaMultipoleForce() {
AmoebaMultipoleForce::AmoebaMultipoleForce() : nonbondedMethod(NoCutoff), cutoffDistance(1.0), aewald(0.0), mutualInducedIterationMethod(SOR), mutualInducedMaxIterations(60),
mutualInducedTargetEpsilon(1.0e-05), scalingDistanceCutoff(100.0), electricConstant(138.9354558456) {
}
AmoebaMultipoleForce::AmoebaNonbondedMethod AmoebaMultipoleForce::getNonbondedMethod( void ) const {
return nonbondedMethod;
}
void AmoebaMultipoleForce::setNonbondedMethod( AmoebaMultipoleForce::AmoebaNonbondedMethod method) {
nonbondedMethod = method;
}
mutualInducedIterationMethod = SOR;
mutualInducedMaxIterations = 60;
mutualInducedTargetEpsilon = 1.0e-06;
scalingDistanceCutoff = 100.0;
double AmoebaMultipoleForce::getCutoffDistance( void ) const {
return cutoffDistance;
}
void AmoebaMultipoleForce::setCutoffDistance(double distance) {
cutoffDistance = distance;
}
double AmoebaMultipoleForce::getAEwald() const {
return aewald;
}
// ONE_4PI_EPS0
electricConstant = 138.9354558456;
void AmoebaMultipoleForce::setAEwald(double inputAewald ) {
aewald = inputAewald;
}
AmoebaMultipoleForce::MutualInducedIterationMethod AmoebaMultipoleForce::getMutualInducedIterationMethod( void ) const {
......
......@@ -30,7 +30,6 @@
#include "CudaPlatform.h"
#include "kernels/amoebaGpuTypes.h"
#include "kernels/cudaKernels.h"
#include "kernels/amoebaCudaKernels.h"
#include "openmm/KernelImpl.h"
namespace OpenMM {
......
......@@ -575,12 +575,20 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
throw OpenMMException("Iterative method for mutual induced dipoles not recognized.\n");
}
int nonbondedMethod = static_cast<int>(force.getNonbondedMethod());
if( nonbondedMethod != 0 && nonbondedMethod != 1 ){
throw OpenMMException("AmoebaMultipoleForce nonbonded method not recognized.\n");
}
gpuSetAmoebaMultipoleParameters(data.getAmoebaGpu(), charges, dipoles, quadrupoles, axisTypes, multipoleAtomId1s, multipoleAtomId2s,
tholes, scalingDistanceCutoff, dampingFactors, polarity,
multipoleAtomCovalentInfo, covalentDegree, minCovalentIndices, minCovalentPolarizationIndices, (maxCovalentRange+2),
static_cast<int>(force.getMutualInducedIterationMethod()),
force.getMutualInducedMaxIterations(),
static_cast<float>( force.getMutualInducedTargetEpsilon()),
nonbondedMethod,
static_cast<float>( force.getCutoffDistance()),
static_cast<float>( force.getAEwald()),
static_cast<float>( force.getElectricConstant()) );
}
......
......@@ -29,6 +29,7 @@
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "openmm/OpenMMException.h"
#include "cudaKernels.h"
#include "amoebaCudaKernels.h"
......@@ -332,7 +333,11 @@ void gpuPrintCudaAmoebaGmxSimulation(amoebaGpuContext amoebaGpu, FILE* log )
(void) fprintf( log, " pD_ScaleIndices %p\n", amoebaGpu->amoebaSim.pD_ScaleIndices );
(void) fprintf( log, " pP_ScaleIndices %p\n", amoebaGpu->amoebaSim.pP_ScaleIndices );
(void) fprintf( log, " pM_ScaleIndices %p\n", amoebaGpu->amoebaSim.pM_ScaleIndices );
(void) fprintf( log, " sqrtPi %15.7e\n", amoebaGpu->amoebaSim.sqrtPi );
(void) fprintf( log, " cutoffDistance2 %15.7e\n", amoebaGpu->amoebaSim.cutoffDistance2 );
(void) fprintf( log, " aewald %15.7e\n", amoebaGpu->amoebaSim.aewald );
(void) fprintf( log, " electric %15.7e\n", amoebaGpu->amoebaSim.electric );
(void) fprintf( log, " box %15.7e %15.7e %15.7e\n", gpu->sim.periodicBoxSizeX, gpu->sim.periodicBoxSizeY, gpu->sim.periodicBoxSizeZ);
(void) fprintf( log, " gkc %15.7e\n", amoebaGpu->amoebaSim.gkc );
(void) fprintf( log, " dielec %15.7e\n", amoebaGpu->amoebaSim.dielec );
(void) fprintf( log, " dwater %15.7e\n", amoebaGpu->amoebaSim.dwater );
......@@ -1429,7 +1434,8 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
const std::vector<float>& tholes, float scalingDistanceCutoff,const std::vector<float>& dampingFactors, const std::vector<float>& polarity,
const std::vector< std::vector< std::vector<int> > >& multipoleParticleCovalentInfo, const std::vector<int>& covalentDegree,
const std::vector<int>& minCovalentIndices, const std::vector<int>& minCovalentPolarizationIndices, int maxCovalentRange,
int mutualInducedIterativeMethod, int mutualInducedMaxIterations, float mutualInducedTargetEpsilon, float electricConstant ){
int mutualInducedIterativeMethod, int mutualInducedMaxIterations, float mutualInducedTargetEpsilon,
int nonbondedMethod, float cutoffDistance, float aewald, float electricConstant ){
// ---------------------------------------------------------------------------------------
......@@ -1514,6 +1520,16 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
amoebaGpu->psMultipoleParticlesIdsAndAxisType->_pSysStream[0][ii].z = ii;
}
if( nonbondedMethod == 0 ){
amoebaGpu->multipoleNonbondedMethod = AMOEBA_NO_CUTOFF;
} else if( nonbondedMethod == 1 ){
amoebaGpu->multipoleNonbondedMethod = AMOEBA_PARTICLE_MESH_EWALD;
} else {
throw OpenMM::OpenMMException("multipoleNonbondedMethod not recognzied.\n" );
}
amoebaGpu->amoebaSim.cutoffDistance2 = cutoffDistance*cutoffDistance;
amoebaGpu->amoebaSim.sqrtPi = sqrt( 3.1415926535897932384626433832795 );
amoebaGpu->amoebaSim.aewald = aewald;
amoebaGpu->amoebaSim.electric = electricConstant;
if( amoebaGpu->amoebaSim.dielec < 1.0e-05 ){
amoebaGpu->amoebaSim.dielec = 1.0f;
......@@ -2432,73 +2448,6 @@ void gpuSetAmoebaWcaDispersionParameters( amoebaGpuContext amoebaGpu,
}
extern "C"
void gpuSetAmoebaSASAParameters( amoebaGpuContext amoebaGpu, float probeRadius,
const std::vector<float>& radii, const std::vector<float>& weights )
{
// ---------------------------------------------------------------------------------------
static const char* methodName = "gpuSetAmoebaSASAParameters";
static const int maxarc = 500;
// ---------------------------------------------------------------------------------------
gpuContext gpu = amoebaGpu->gpuContext;
amoebaGpu->paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
unsigned int particles = radii.size();
if( particles < 1 ){
(void) fprintf( stderr, "%s no particles\n", methodName );
return;
}
(void) fprintf( stderr, "%s radius converted Ang !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n", methodName );
float scaleRadius = 10.0f;
amoebaGpu->amoebaSim.probeRadius = probeRadius;
amoebaGpu->amoebaSim.maxarc = maxarc;
amoebaGpu->psSASA_Radii = new CUDAStream<float>(amoebaGpu->paddedNumberOfAtoms, 1, "SASARadii");
amoebaGpu->psSASA_Weights = new CUDAStream<float>(amoebaGpu->paddedNumberOfAtoms, 1, "SASAWeights");
for (unsigned int ii = 0; ii < particles; ii++)
{
amoebaGpu->psSASA_Radii->_pSysStream[0][ii] = scaleRadius*( radii[ii] + probeRadius );
amoebaGpu->psSASA_Weights->_pSysStream[0][ii] = weights[ii];
}
// Dummy out extra particles data
for (unsigned int ii = particles; ii < amoebaGpu->paddedNumberOfAtoms; ii++)
{
amoebaGpu->psSASA_Radii->_pSysStream[0][ii] = 1.0f;
amoebaGpu->psSASA_Weights->_pSysStream[0][ii] = 0.0f;
}
for (unsigned int ii = 0; ii < 4; ii++)
{
amoebaGpu->psIntWorkArray[ii] = new CUDAStream<int>(amoebaGpu->amoebaSim.maxarc*amoebaGpu->paddedNumberOfAtoms, 1, "SASAIntWorkArray");
}
amoebaGpu->psFloatWorkArray = new CUDAStream<float>(amoebaGpu->amoebaSim.maxarc*amoebaGpu->paddedNumberOfAtoms, 1, "SASAFloatWorkArray");
amoebaGpu->psIoListCount = new CUDAStream<int>(amoebaGpu->paddedNumberOfAtoms, 1, "SASAIoCount");
amoebaGpu->psDoneAtom = new CUDAStream<int>(amoebaGpu->paddedNumberOfAtoms, 1, "SASADoneAtom");
amoebaGpu->psSASA_Radii->Upload();
amoebaGpu->psSASA_Weights->Upload();
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
unsigned int maxPrint = 32;
(void) fprintf( amoebaGpu->log, "%s probeRadius=%12.3f\n", methodName, probeRadius );
for (unsigned int ii = 0; ii < gpu->natoms; ii++)
{
(void) fprintf( amoebaGpu->log, "%5u %15.7e %15.7e\n", ii, radii[ii], weights[ii] );
if( ii == maxPrint && ii < (amoebaGpu->paddedNumberOfAtoms - maxPrint) )
{
ii = (amoebaGpu->paddedNumberOfAtoms - maxPrint);
}
}
(void) fflush( amoebaGpu->log );
}
#endif
}
extern "C"
void amoebaGpuShutDown(amoebaGpuContext gpu)
{
......@@ -2584,16 +2533,6 @@ void amoebaGpuShutDown(amoebaGpuContext gpu)
delete gpu->psWcaDispersionRadiusEpsilon;
delete gpu->psSASA_Radii;
delete gpu->psSASA_Weights;
//delete gpu->psSASA_WeightIntWorkArray[0];
//delete gpu->psSASA_WeightIntWorkArray[1];
//delete gpu->psSASA_WeightIntWorkArray[2];
//delete gpu->psSASA_WeightIntWorkArray[3];
delete gpu->psDoneAtom;
delete gpu->psIoListCount;
delete gpu->psFloatWorkArray;
delete gpu->psWorkArray_3_1;
delete gpu->psWorkArray_3_2;
delete gpu->psWorkArray_3_3;
......@@ -2648,7 +2587,6 @@ void amoebaGpuSetConstants(amoebaGpuContext amoebaGpu)
gpuSetAmoebaBondOffsets( amoebaGpu );
SetCalculateAmoebaLocalForcesSim( amoebaGpu );
//SetCalculateAmoebaCudaSASAForcesSim( amoebaGpu );
SetForcesSim( amoebaGpu->gpuContext );
SetCalculateAmoebaMultipoleForcesSim( amoebaGpu );
SetCalculateAmoebaCudaFixedEFieldSim( amoebaGpu );
......@@ -2656,12 +2594,12 @@ void amoebaGpuSetConstants(amoebaGpuContext amoebaGpu)
SetCalculateAmoebaCudaWcaDispersionSim( amoebaGpu );
SetCalculateAmoebaCudaMutualInducedFieldSim( amoebaGpu );
SetCalculateAmoebaElectrostaticSim( amoebaGpu );
SetCalculateAmoebaRealSpaceEwaldSim( amoebaGpu );
SetCalculateAmoebaCudaMapTorquesSim( amoebaGpu );
SetCalculateAmoebaKirkwoodSim( amoebaGpu );
SetCalculateAmoebaKirkwoodEDiffSim( amoebaGpu );
SetCalculateAmoebaCudaFixedEAndGKFieldsSim( amoebaGpu );
SetCalculateAmoebaCudaMutualInducedAndGkFieldsSim( amoebaGpu );
//SetCalculateAmoebaObcGbsaForces2Sim( amoebaGpu );
SetCalculateObcGbsaForces2Sim( amoebaGpu->gpuContext );
}
......
......@@ -97,6 +97,10 @@ extern void SetCalculateAmoebaElectrostaticSim( amoebaGpuContext amoebaGpu );
extern void GetCalculateAmoebaElectrostaticSim( amoebaGpuContext amoebaGpu );
extern void cudaComputeAmoebaElectrostatic( amoebaGpuContext amoebaGpu );
extern void SetCalculateAmoebaRealSpaceEwaldSim( amoebaGpuContext amoebaGpu );
extern void GetCalculateAmoebaRealSpaceEwaldSim( amoebaGpuContext amoebaGpu );
extern void cudaComputeAmoebaRealSpaceEwald( amoebaGpuContext amoebaGpu );
extern void SetCalculateAmoebaCudaMapTorquesSim(amoebaGpuContext gpu);
extern void GetCalculateAmoebaCudaMapTorquesSim(amoebaGpuContext gpu);
extern void cudaComputeAmoebaMapTorques( amoebaGpuContext gpu, CUDAStream<float>* psTorque, CUDAStream<float>* psForce);
......@@ -144,7 +148,7 @@ extern void kClearFields_1( amoebaGpuContext amoebaGpu );
extern void kClearFields_3( amoebaGpuContext amoebaGpu, unsigned int numberToClear );
extern unsigned int getThreadsPerBlock( amoebaGpuContext amoebaGpu, unsigned int sharedMemoryPerThread );
extern int isNanOrInfinity( double number );
//extern int isNanOrInfinity( double number );
extern void trackMutualInducedIterations( amoebaGpuContext amoebaGpu, int iteration);
#endif //__AMOEBA_GPU_TYPES_H__
......
......@@ -41,6 +41,12 @@
#include <builtin_types.h>
#include <vector_functions.h>
enum CudaAmoebaNonbondedMethod
{
AMOEBA_NO_CUTOFF,
AMOEBA_PARTICLE_MESH_EWALD
};
struct cudaAmoebaGmxSimulation {
// Constants
......@@ -118,6 +124,9 @@ struct cudaAmoebaGmxSimulation {
unsigned int numberOfAtoms; // number of atoms
unsigned int paddedNumberOfAtoms; // padded number of atoms
float cutoffDistance2; // cutoff distance squared for PME
float sqrtPi; // sqrt(PI)
float aewald; // aewald parameter
float scalingDistanceCutoff; // scaling cutoff
float2* pDampingFactorAndThole; // Thole & damping factors
......@@ -168,89 +177,6 @@ struct cudaAmoebaGmxSimulation {
float fd; // electric * 2.0f * (1.0f-dwater)/(1.0f+2.0f*dwater);
float fq; // electric * 3.0f * (1.0f-dwater)/(2.0f+3.0f*dwater);
// SASA probe radius
float probeRadius;
int maxarc;
// texture<float4,2,cudaReadModeElementType> texTorTorGrid;
#if 0
unsigned int dihedrals; // Number of dihedrals
int4* pDihedralID1; // Dihedral IDs
int4* pDihedralID2; // Dihedral output buffer IDs
float4* pDihedralParameter; // Dihedral parameters
unsigned int rb_dihedrals; // Number of Ryckaert Bellemans dihedrals
int4* pRbDihedralID1; // Ryckaert Bellemans Dihedral IDs
int4* pRbDihedralID2; // Ryckaert Bellemans Dihedral output buffer IDs
float4* pRbDihedralParameter1; // Ryckaert Bellemans Dihedral parameters
float2* pRbDihedralParameter2; // Ryckaert Bellemans Dihedral parameters
unsigned int LJ14s; // Number of Lennard Jones 1-4 interactions
int4* pLJ14ID; // Lennard Jones 1-4 atom and output buffer IDs
float4* pLJ14Parameter; // Lennard Jones 1-4 parameters
float inverseTotalMass; // Used in linear momentum removal
unsigned int ShakeConstraints; // Total number of Shake constraints
unsigned int settleConstraints; // Total number of Settle constraints
unsigned int ccmaConstraints; // Total number of CCMA constraints.
unsigned int rigidClusters; // Total number of rigid clusters
unsigned int maxRigidClusterSize; // The size of the largest rigid cluster
unsigned int clusterShakeBlockSize; // The number of threads to process each rigid cluster
unsigned int NonShakeConstraints; // Total number of NonShake atoms
unsigned int maxShakeIterations; // Maximum shake iterations
unsigned int degreesOfFreedom; // Number of degrees of freedom in system
float shakeTolerance; // Shake tolerance
float InvMassJ; // Shake inverse mass for hydrogens
int* pNonShakeID; // Not Shaking atoms
int4* pShakeID; // Shake atoms and phase
float4* pShakeParameter; // Shake parameters
int4* pSettleID; // Settle atoms
float2* pSettleParameter; // Settle parameters
unsigned int* pExclusion; // Nonbond exclusion data
unsigned int* pExclusionIndex; // Index of exclusion data for each work unit
unsigned int dihedral_offset; // Offset to end of dihedrals
unsigned int rb_dihedral_offset; // Offset to end of Ryckaert Bellemans dihedrals
unsigned int LJ14_offset; // Offset to end of Lennard Jones 1-4 parameters
int* pAtomIndex; // The original index of each atom
float4* pGridBoundingBox; // The size of each grid cell
float4* pGridCenter; // The center of each grid cell
int2* pCcmaAtoms; // The atoms connected by each CCMA constraint
float4* pCcmaDistance; // The displacement vector (x, y, z) and constraint distance (w) for each CCMA constraint
float* pCcmaDelta1; // Workspace for CCMA
float* pCcmaDelta2; // Workspace for CCMA
int* pCcmaAtomConstraints; // The indices of constraints involving each atom
int* pCcmaNumAtomConstraints; // The number of constraints involving each atom
short* pSyncCounter; // Used for global thread synchronization
unsigned int* pRequiredIterations; // Used by CCMA to communicate whether iteration has converged
float* pCcmaReducedMass; // The reduced mass for each CCMA constraint
unsigned int* pConstraintMatrixColumn; // The column of each element in the constraint matrix.
float* pConstraintMatrixValue; // The value of each element in the constraint matrix.
// Mutable stuff
float4* pPosq; // Pointer to atom positions and charges
float4* pPosqP; // Pointer to mid-integration atom positions
float4* pOldPosq; // Pointer to old atom positions
float4* pVelm4; // Pointer to atom velocity and inverse mass
float4* pvVector4; // Pointer to atom v Vector
float4* pxVector4; // Pointer to atom x Vector
float* pBornForce; // Pointer to Born force data
float* pBornSum; // Pointer to Born Radii calculation output buffers
float* pBornRadii; // Pointer to Born Radii
float* pObcChain; // Pointer to OBC chain data
float4* pLinearMomentum; // Pointer to linear momentum
// Random numbers
float4* pRandom4a; // Pointer to first set of 4 random numbers
float4* pRandom4b; // Pointer to second set of 4 random numbers
float2* pRandom2a; // Pointer to first set of 2 random numbers
float2* pRandom2b; // Pointer to second set of 2 random numbers
uint4* pRandomSeed; // Pointer to random seeds
int* pRandomPosition; // Pointer to random number positions
unsigned int randoms; // Number of randoms
unsigned int totalRandoms; // Number of randoms plus overflow.
unsigned int totalRandomsTimesTwo; // Used for generating randoms
unsigned int randomIterations; // Number of iterations before regenerating randoms
unsigned int randomFrames; // Number of frames of random numbers
#endif
};
#endif
......@@ -152,6 +152,10 @@ struct _amoebaGpuContext {
CUDAStream<float>* psE_Field;
CUDAStream<float>* psE_FieldPolar;
int multipoleNonbondedMethod;
double cutoffDistance;
double aewald;
// mutual induced field
int mutualInducedIterativeMethod;
......@@ -215,17 +219,6 @@ struct _amoebaGpuContext {
// Wca dispersion fields
CUDAStream<float2>* psWcaDispersionRadiusEpsilon;
// SASA fields
CUDAStream<float>* psSASA_Radii;
CUDAStream<float>* psSASA_Weights;
CUDAStream<int>* psIntWorkArray[4];
CUDAStream<int>* psDoneAtom;
CUDAStream<int>* psIoListCount;
CUDAStream<float>* psFloatWorkArray;
float sasaArea;
};
typedef struct _amoebaGpuContext *amoebaGpuContext;
......@@ -303,7 +296,8 @@ void gpuSetAmoebaMultipoleParameters(amoebaGpuContext amoebaGpu, const std::vect
const std::vector<float>& tholes, float scalingDistanceCutoff,const std::vector<float>& dampingFactors, const std::vector<float>& polarity,
const std::vector< std::vector< std::vector<int> > >& multipoleAtomCovalentInfo, const std::vector<int>& covalentDegree,
const std::vector<int>& minCovalentIndices, const std::vector<int>& minCovalentPolarizationIndices, int maxCovalentRange,
int mutualInducedIterationMethod, int mutualInducedMaxIterations, float mutualInducedTargetEpsilon, float electricConstant );
int mutualInducedIterationMethod, int mutualInducedMaxIterations, float mutualInducedTargetEpsilon,
int nonbondedMethod, float cutoffDistance, float aewald, float electricConstant );
extern "C"
......@@ -332,9 +326,6 @@ void gpuSetAmoebaWcaDispersionParameters( amoebaGpuContext amoebaGpu,
const float epso, const float epsh, const float rmino, const float rminh,
const float awater, const float shctd, const float dispoff );
extern "C"
void gpuSetAmoebaSASAParameters( amoebaGpuContext amoebaGpu , float probeRadius, const std::vector<float>& radii, const std::vector<float>& weights );
extern "C"
void amoebaGpuSetConstants(amoebaGpuContext gpu);
......
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
#include "amoebaGpuTypes.h"
#include "amoebaCudaKernels.h"
#include "kCalculateAmoebaCudaUtilities.h"
//#define AMOEBA_DEBUG
static __constant__ cudaGmxSimulation cSim;
static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;
void SetCalculateAmoebaRealSpaceEwaldSim(amoebaGpuContext amoebaGpu)
{
cudaError_t status;
gpuContext gpu = amoebaGpu->gpuContext;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "SetCalculateAmoebaRealSpaceEwaldSim: cudaMemcpyToSymbol: SetSim copy to cSim failed");
status = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation));
RTERROR(status, "SetCalculateAmoebaRealSpaceEwaldSim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed");
}
void GetCalculateAmoebaRealSpaceEwaldSim(amoebaGpuContext amoebaGpu)
{
cudaError_t status;
gpuContext gpu = amoebaGpu->gpuContext;
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
RTERROR(status, "GetCalculateAmoebaRealSpaceEwaldSim: cudaMemcpyFromSymbol: SetSim copy from cSim failed");
status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));
RTERROR(status, "GetCalculateAmoebaRealSpaceEwaldSim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
}
texture<float, 1, cudaReadModeElementType> tabulatedErfcRef;
static __device__ float fastErfc(float r)
{
float normalized = cSim.tabulatedErfcScale*r;
int index = (int) normalized;
float fract2 = normalized-index;
float fract1 = 1.0f-fract2;
return fract1*tex1Dfetch(tabulatedErfcRef, index) + fract2*tex1Dfetch(tabulatedErfcRef, index+1);
}
static int const PScaleIndex = 0;
static int const DScaleIndex = 1;
static int const UScaleIndex = 2;
static int const MScaleIndex = 3;
//static int const Scale3Index = 4;
//static int const Scale5Index = 5;
//static int const Scale7Index = 6;
//static int const Scale9Index = 7;
//static int const Ddsc30Index = 8;
//static int const Ddsc31Index = 9;
//static int const Ddsc32Index = 10;
//static int const Ddsc50Index = 11;
//static int const Ddsc51Index = 12;
//static int const Ddsc52Index = 13;
//static int const Ddsc70Index = 14;
//static int const Ddsc71Index = 15;
//static int const Ddsc72Index = 16;
static int const LastScalingIndex = 4;
struct RealSpaceEwaldParticle {
// coordinates charge
float x;
float y;
float z;
float q;
// lab frame dipole
float labFrameDipole[3];
// lab frame quadrupole
float labFrameQuadrupole[9];
// induced dipole
float inducedDipole[3];
// polar induced dipole
float inducedDipoleP[3];
// scaling factors
float thole;
float damp;
float force[3];
float torque[3];
float padding;
};
__device__ void calculateRealSpaceEwaldPairIxn_kernel( RealSpaceEwaldParticle& atomI, RealSpaceEwaldParticle& atomJ,
float scalingDistanceCutoff, float* scalingFactors,
float* outputForce, float outputTorque[2][3],
float* energy
#ifdef AMOEBA_DEBUG
,float4* debugArray
#endif
){
float e,ei,f;
//float gfd,gfdr;
//float xix,yix,zix;
//float xiy,yiy,ziy;
//float xiz,yiz,ziz;
//float xkx,ykx,zkx;
//float xky,yky,zky;
//float xkz,ykz,zkz;
//float rr1,rr3;
//float rr5,rr7,rr9,rr11;
float erl,erli;
//float frcxi[4],frcxk[4];
//float frcyi[4],frcyk[4];
//float frczi[4],frczk[4];
float di[4],qi[10];
float dk[4],qk[10];
float fridmp[4],findmp[4];
float ftm2[4],ftm2i[4];
float ftm2r[4],ftm2ri[4];
float ttm2[4],ttm3[4];
float ttm2i[4],ttm3i[4];
float ttm2r[4],ttm3r[4];
float ttm2ri[4],ttm3ri[4];
//float fdir[4]
float dixdk[4];
float dkxui[4],dixuk[4];
float dixukp[4],dkxuip[4];
float uixqkr[4],ukxqir[4];
float uixqkrp[4],ukxqirp[4];
float qiuk[4],qkui[4];
float qiukp[4],qkuip[4];
float rxqiuk[4],rxqkui[4];
float rxqiukp[4],rxqkuip[4];
float qidk[4],qkdi[4];
float qir[4],qkr[4];
float qiqkr[4],qkqir[4];
float qixqk[4],rxqir[4];
float dixr[4],dkxr[4];
float dixqkr[4],dkxqir[4];
float rxqkr[4],qkrxqir[4];
float rxqikr[4],rxqkir[4];
float rxqidk[4],rxqkdi[4];
float ddsc3[4],ddsc5[4];
float ddsc7[4];
float bn[6];
float sc[11],gl[9];
float sci[9],scip[9];
float gli[8],glip[8];
float gf[8],gfi[7];
float gfr[8],gfri[7];
float gti[7],gtri[7];
f = (cAmoebaSim.electric/cAmoebaSim.dielec);
// set the permanent multipole and induced dipole values;
float pdi = atomI.damp;
float pti = atomI.thole;
float ci = atomI.q;
di[1] = atomI.labFrameDipole[0];
di[2] = atomI.labFrameDipole[1];
di[3] = atomI.labFrameDipole[2];
qi[1] = atomI.labFrameQuadrupole[0];
qi[2] = atomI.labFrameQuadrupole[1];
qi[3] = atomI.labFrameQuadrupole[2];
qi[4] = atomI.labFrameQuadrupole[3];
qi[5] = atomI.labFrameQuadrupole[4];
qi[6] = atomI.labFrameQuadrupole[5];
qi[7] = atomI.labFrameQuadrupole[6];
qi[8] = atomI.labFrameQuadrupole[7];
qi[9] = atomI.labFrameQuadrupole[8];
float xr = atomJ.x - atomI.x;
float yr = atomJ.y - atomI.y;
float zr = atomJ.z - atomI.z;
// periodic box
xr -= floor(xr*cSim.invPeriodicBoxSizeX+0.5f)*cSim.periodicBoxSizeX;
yr -= floor(yr*cSim.invPeriodicBoxSizeY+0.5f)*cSim.periodicBoxSizeY;
zr -= floor(zr*cSim.invPeriodicBoxSizeZ+0.5f)*cSim.periodicBoxSizeZ;
float r2 = xr*xr + yr*yr + zr*zr;
if( r2 <= cAmoebaSim.cutoffDistance2 ){
float r = sqrt(r2);
float ck = atomJ.q;
dk[1] = atomJ.labFrameDipole[0];
dk[2] = atomJ.labFrameDipole[1];
dk[3] = atomJ.labFrameDipole[2];
qk[1] = atomJ.labFrameQuadrupole[0];
qk[2] = atomJ.labFrameQuadrupole[1];
qk[3] = atomJ.labFrameQuadrupole[2];
qk[4] = atomJ.labFrameQuadrupole[3];
qk[5] = atomJ.labFrameQuadrupole[4];
qk[6] = atomJ.labFrameQuadrupole[5];
qk[7] = atomJ.labFrameQuadrupole[6];
qk[8] = atomJ.labFrameQuadrupole[7];
qk[9] = atomJ.labFrameQuadrupole[8];
// calculate the real space error function terms;
float ralpha = cAmoebaSim.aewald*r;
bn[0] = fastErfc(ralpha)/r;
float alsq2 = 2.0f*cAmoebaSim.aewald*cAmoebaSim.aewald;
float alsq2n = 0.0f;
if( cAmoebaSim.aewald > 0.0f){
alsq2n = 1.0f/(cAmoebaSim.sqrtPi*cAmoebaSim.aewald);
}
float exp2a = exp(-(ralpha*ralpha));
alsq2n *= alsq2;
bn[1] = (bn[0]+alsq2n*exp2a)/r2;
alsq2n *= alsq2;
bn[2] = (3.0f*bn[1]+alsq2n*exp2a)/r2;
alsq2n *= alsq2;
bn[3] = (5.0f*bn[2]+alsq2n*exp2a)/r2;
alsq2n *= alsq2;
bn[4] = (7.0f*bn[3]+alsq2n*exp2a)/r2;
alsq2n *= alsq2;
bn[5] = (9.0f*bn[4]+alsq2n*exp2a)/r2;
// apply Thole polarization damping to scale factors;
float rr1 = 1.0f/r;
float rr3 = rr1 / r2;
float rr5 = 3.0f * rr3 / r2;
float rr7 = 5.0f * rr5 / r2;
float rr9 = 7.0f * rr7 / r2;
float rr11 = 9.0f * rr9 / r2;
float scale3 = 1.0f;
float scale5 = 1.0f;
float scale7 = 1.0f;
ddsc3[0] = 0.0f;
ddsc3[1] = 0.0f;
ddsc3[2] = 0.0f;
ddsc5[0] = 0.0f;
ddsc5[1] = 0.0f;
ddsc5[2] = 0.0f;
ddsc7[0] = 0.0f;
ddsc7[1] = 0.0f;
ddsc7[2] = 0.0f;
float pdk = atomJ.damp;
float ptk = atomJ.thole;
float damp = pdi*pdk;
if( damp != 0.0f ){
float pgamma = pti < ptk ? pti : ptk;
float ratio = r/damp;
damp = -pgamma * ratio*ratio*ratio;
if( damp > -50.0f ){
float expdamp = exp(damp);
scale3 = 1.0f - expdamp;
scale5 = 1.0f - (1.0f-damp)*expdamp;
scale7 = 1.0f - (1.0f-damp+0.6f*damp*damp)*expdamp;
float temp3 = -3.0f * damp * expdamp / r2;
float temp5 = -damp;
float temp7 = -0.2f - 0.6f*damp;
ddsc3[1] = temp3 * xr;
ddsc3[2] = temp3 * yr;
ddsc3[3] = temp3 * zr;
ddsc5[1] = temp5 * ddsc3[1];
ddsc5[2] = temp5 * ddsc3[2];
ddsc5[3] = temp5 * ddsc3[3];
ddsc7[1] = temp7 * ddsc5[1];
ddsc7[2] = temp7 * ddsc5[2];
ddsc7[3] = temp7 * ddsc5[3];
}
}
float dsc3 = 1.0f - scale3*scalingFactors[DScaleIndex];
float dsc5 = 1.0f - scale5*scalingFactors[DScaleIndex];
float dsc7 = 1.0f - scale7*scalingFactors[DScaleIndex];
float psc3 = 1.0f - scale3*scalingFactors[PScaleIndex];
float psc5 = 1.0f - scale5*scalingFactors[PScaleIndex];
float psc7 = 1.0f - scale7*scalingFactors[PScaleIndex];
float usc3 = 1.0f - scale3*scalingFactors[UScaleIndex];
float usc5 = 1.0f - scale5*scalingFactors[UScaleIndex];
// construct necessary auxiliary vectors
dixdk[1] = di[2]*dk[3] - di[3]*dk[2];
dixdk[2] = di[3]*dk[1] - di[1]*dk[3];
dixdk[3] = di[1]*dk[2] - di[2]*dk[1];
dixuk[1] = di[2]*atomJ.inducedDipole[2] - di[3]*atomJ.inducedDipole[1];
dixuk[2] = di[3]*atomJ.inducedDipole[0] - di[1]*atomJ.inducedDipole[2];
dixuk[3] = di[1]*atomJ.inducedDipole[1] - di[2]*atomJ.inducedDipole[0];
dkxui[1] = dk[2]*atomI.inducedDipole[2] - dk[3]*atomI.inducedDipole[1];
dkxui[2] = dk[3]*atomI.inducedDipole[0] - dk[1]*atomI.inducedDipole[2];
dkxui[3] = dk[1]*atomI.inducedDipole[1] - dk[2]*atomI.inducedDipole[0];
dixukp[1] = di[2]*atomJ.inducedDipoleP[2] - di[3]*atomJ.inducedDipoleP[1];
dixukp[2] = di[3]*atomJ.inducedDipoleP[0] - di[1]*atomJ.inducedDipoleP[2];
dixukp[3] = di[1]*atomJ.inducedDipoleP[1] - di[2]*atomJ.inducedDipoleP[0];
dkxuip[1] = dk[2]*atomI.inducedDipoleP[2] - dk[3]*atomI.inducedDipoleP[1];
dkxuip[2] = dk[3]*atomI.inducedDipoleP[0] - dk[1]*atomI.inducedDipoleP[2];
dkxuip[3] = dk[1]*atomI.inducedDipoleP[1] - dk[2]*atomI.inducedDipoleP[0];
dixr[1] = di[2]*zr - di[3]*yr;
dixr[2] = di[3]*xr - di[1]*zr;
dixr[3] = di[1]*yr - di[2]*xr;
dkxr[1] = dk[2]*zr - dk[3]*yr;
dkxr[2] = dk[3]*xr - dk[1]*zr;
dkxr[3] = dk[1]*yr - dk[2]*xr;
qir[1] = qi[1]*xr + qi[4]*yr + qi[7]*zr;
qir[2] = qi[2]*xr + qi[5]*yr + qi[8]*zr;
qir[3] = qi[3]*xr + qi[6]*yr + qi[9]*zr;
qkr[1] = qk[1]*xr + qk[4]*yr + qk[7]*zr;
qkr[2] = qk[2]*xr + qk[5]*yr + qk[8]*zr;
qkr[3] = qk[3]*xr + qk[6]*yr + qk[9]*zr;
qiqkr[1] = qi[1]*qkr[1] + qi[4]*qkr[2] + qi[7]*qkr[3];
qiqkr[2] = qi[2]*qkr[1] + qi[5]*qkr[2] + qi[8]*qkr[3];
qiqkr[3] = qi[3]*qkr[1] + qi[6]*qkr[2] + qi[9]*qkr[3];
qkqir[1] = qk[1]*qir[1] + qk[4]*qir[2] + qk[7]*qir[3];
qkqir[2] = qk[2]*qir[1] + qk[5]*qir[2] + qk[8]*qir[3];
qkqir[3] = qk[3]*qir[1] + qk[6]*qir[2] + qk[9]*qir[3];
qixqk[1] = qi[2]*qk[3] + qi[5]*qk[6] + qi[8]*qk[9]
- qi[3]*qk[2] - qi[6]*qk[5] - qi[9]*qk[8];
qixqk[2] = qi[3]*qk[1] + qi[6]*qk[4] + qi[9]*qk[7]
- qi[1]*qk[3] - qi[4]*qk[6] - qi[7]*qk[9];
qixqk[3] = qi[1]*qk[2] + qi[4]*qk[5] + qi[7]*qk[8]
- qi[2]*qk[1] - qi[5]*qk[4] - qi[8]*qk[7];
rxqir[1] = yr*qir[3] - zr*qir[2];
rxqir[2] = zr*qir[1] - xr*qir[3];
rxqir[3] = xr*qir[2] - yr*qir[1];
rxqkr[1] = yr*qkr[3] - zr*qkr[2];
rxqkr[2] = zr*qkr[1] - xr*qkr[3];
rxqkr[3] = xr*qkr[2] - yr*qkr[1];
rxqikr[1] = yr*qiqkr[3] - zr*qiqkr[2];
rxqikr[2] = zr*qiqkr[1] - xr*qiqkr[3];
rxqikr[3] = xr*qiqkr[2] - yr*qiqkr[1];
rxqkir[1] = yr*qkqir[3] - zr*qkqir[2];
rxqkir[2] = zr*qkqir[1] - xr*qkqir[3];
rxqkir[3] = xr*qkqir[2] - yr*qkqir[1];
qkrxqir[1] = qkr[2]*qir[3] - qkr[3]*qir[2];
qkrxqir[2] = qkr[3]*qir[1] - qkr[1]*qir[3];
qkrxqir[3] = qkr[1]*qir[2] - qkr[2]*qir[1];
qidk[1] = qi[1]*dk[1] + qi[4]*dk[2] + qi[7]*dk[3];
qidk[2] = qi[2]*dk[1] + qi[5]*dk[2] + qi[8]*dk[3];
qidk[3] = qi[3]*dk[1] + qi[6]*dk[2] + qi[9]*dk[3];
qkdi[1] = qk[1]*di[1] + qk[4]*di[2] + qk[7]*di[3];
qkdi[2] = qk[2]*di[1] + qk[5]*di[2] + qk[8]*di[3];
qkdi[3] = qk[3]*di[1] + qk[6]*di[2] + qk[9]*di[3];
qiuk[1] = qi[1]*atomJ.inducedDipole[0] + qi[4]*atomJ.inducedDipole[1]
+ qi[7]*atomJ.inducedDipole[2];
qiuk[2] = qi[2]*atomJ.inducedDipole[0] + qi[5]*atomJ.inducedDipole[1]
+ qi[8]*atomJ.inducedDipole[2];
qiuk[3] = qi[3]*atomJ.inducedDipole[0] + qi[6]*atomJ.inducedDipole[1]
+ qi[9]*atomJ.inducedDipole[2];
qkui[1] = qk[1]*atomI.inducedDipole[0] + qk[4]*atomI.inducedDipole[1]
+ qk[7]*atomI.inducedDipole[2];
qkui[2] = qk[2]*atomI.inducedDipole[0] + qk[5]*atomI.inducedDipole[1]
+ qk[8]*atomI.inducedDipole[2];
qkui[3] = qk[3]*atomI.inducedDipole[0] + qk[6]*atomI.inducedDipole[1]
+ qk[9]*atomI.inducedDipole[2];
qiukp[1] = qi[1]*atomJ.inducedDipoleP[0] + qi[4]*atomJ.inducedDipoleP[1]
+ qi[7]*atomJ.inducedDipoleP[2];
qiukp[2] = qi[2]*atomJ.inducedDipoleP[0] + qi[5]*atomJ.inducedDipoleP[1]
+ qi[8]*atomJ.inducedDipoleP[2];
qiukp[3] = qi[3]*atomJ.inducedDipoleP[0] + qi[6]*atomJ.inducedDipoleP[1]
+ qi[9]*atomJ.inducedDipoleP[2];
qkuip[1] = qk[1]*atomI.inducedDipoleP[0] + qk[4]*atomI.inducedDipoleP[1]
+ qk[7]*atomI.inducedDipoleP[2];
qkuip[2] = qk[2]*atomI.inducedDipoleP[0] + qk[5]*atomI.inducedDipoleP[1]
+ qk[8]*atomI.inducedDipoleP[2];
qkuip[3] = qk[3]*atomI.inducedDipoleP[0] + qk[6]*atomI.inducedDipoleP[1]
+ qk[9]*atomI.inducedDipoleP[2];
dixqkr[1] = di[2]*qkr[3] - di[3]*qkr[2];
dixqkr[2] = di[3]*qkr[1] - di[1]*qkr[3];
dixqkr[3] = di[1]*qkr[2] - di[2]*qkr[1];
dkxqir[1] = dk[2]*qir[3] - dk[3]*qir[2];
dkxqir[2] = dk[3]*qir[1] - dk[1]*qir[3];
dkxqir[3] = dk[1]*qir[2] - dk[2]*qir[1];
uixqkr[1] = atomI.inducedDipole[1]*qkr[3] - atomI.inducedDipole[2]*qkr[2];
uixqkr[2] = atomI.inducedDipole[2]*qkr[1] - atomI.inducedDipole[0]*qkr[3];
uixqkr[3] = atomI.inducedDipole[0]*qkr[2] - atomI.inducedDipole[1]*qkr[1];
ukxqir[1] = atomJ.inducedDipole[1]*qir[3] - atomJ.inducedDipole[2]*qir[2];
ukxqir[2] = atomJ.inducedDipole[2]*qir[1] - atomJ.inducedDipole[0]*qir[3];
ukxqir[3] = atomJ.inducedDipole[0]*qir[2] - atomJ.inducedDipole[1]*qir[1];
uixqkrp[1] = atomI.inducedDipoleP[1]*qkr[3] - atomI.inducedDipoleP[2]*qkr[2];
uixqkrp[2] = atomI.inducedDipoleP[2]*qkr[1] - atomI.inducedDipoleP[0]*qkr[3];
uixqkrp[3] = atomI.inducedDipoleP[0]*qkr[2] - atomI.inducedDipoleP[1]*qkr[1];
ukxqirp[1] = atomJ.inducedDipoleP[1]*qir[3] - atomJ.inducedDipoleP[2]*qir[2];
ukxqirp[2] = atomJ.inducedDipoleP[2]*qir[1] - atomJ.inducedDipoleP[0]*qir[3];
ukxqirp[3] = atomJ.inducedDipoleP[0]*qir[2] - atomJ.inducedDipoleP[1]*qir[1];
rxqidk[1] = yr*qidk[3] - zr*qidk[2];
rxqidk[2] = zr*qidk[1] - xr*qidk[3];
rxqidk[3] = xr*qidk[2] - yr*qidk[1];
rxqkdi[1] = yr*qkdi[3] - zr*qkdi[2];
rxqkdi[2] = zr*qkdi[1] - xr*qkdi[3];
rxqkdi[3] = xr*qkdi[2] - yr*qkdi[1];
rxqiuk[1] = yr*qiuk[3] - zr*qiuk[2];
rxqiuk[2] = zr*qiuk[1] - xr*qiuk[3];
rxqiuk[3] = xr*qiuk[2] - yr*qiuk[1];
rxqkui[1] = yr*qkui[3] - zr*qkui[2];
rxqkui[2] = zr*qkui[1] - xr*qkui[3];
rxqkui[3] = xr*qkui[2] - yr*qkui[1];
rxqiukp[1] = yr*qiukp[3] - zr*qiukp[2];
rxqiukp[2] = zr*qiukp[1] - xr*qiukp[3];
rxqiukp[3] = xr*qiukp[2] - yr*qiukp[1];
rxqkuip[1] = yr*qkuip[3] - zr*qkuip[2];
rxqkuip[2] = zr*qkuip[1] - xr*qkuip[3];
rxqkuip[3] = xr*qkuip[2] - yr*qkuip[1];
// calculate the scalar products for permanent components
sc[2] = di[1]*dk[1] + di[2]*dk[2] + di[3]*dk[3];
sc[3] = di[1]*xr + di[2]*yr + di[3]*zr;
sc[4] = dk[1]*xr + dk[2]*yr + dk[3]*zr;
sc[5] = qir[1]*xr + qir[2]*yr + qir[3]*zr;
sc[6] = qkr[1]*xr + qkr[2]*yr + qkr[3]*zr;
sc[7] = qir[1]*dk[1] + qir[2]*dk[2] + qir[3]*dk[3];
sc[8] = qkr[1]*di[1] + qkr[2]*di[2] + qkr[3]*di[3];
sc[9] = qir[1]*qkr[1] + qir[2]*qkr[2] + qir[3]*qkr[3];
sc[10] = qi[1]*qk[1] + qi[2]*qk[2] + qi[3]*qk[3]
+ qi[4]*qk[4] + qi[5]*qk[5] + qi[6]*qk[6]
+ qi[7]*qk[7] + qi[8]*qk[8] + qi[9]*qk[9];
// calculate the scalar products for induced components
sci[1] = atomI.inducedDipole[0]*dk[1] + atomI.inducedDipole[1]*dk[2]
+ atomI.inducedDipole[2]*dk[3] + di[1]*atomJ.inducedDipole[0]
+ di[2]*atomJ.inducedDipole[1] + di[3]*atomJ.inducedDipole[2];
sci[2] = atomI.inducedDipole[0]*atomJ.inducedDipole[0] + atomI.inducedDipole[1]*atomJ.inducedDipole[1]
+ atomI.inducedDipole[2]*atomJ.inducedDipole[2];
sci[3] = atomI.inducedDipole[0]*xr + atomI.inducedDipole[1]*yr + atomI.inducedDipole[2]*zr;
sci[4] = atomJ.inducedDipole[0]*xr + atomJ.inducedDipole[1]*yr + atomJ.inducedDipole[2]*zr;
sci[7] = qir[1]*atomJ.inducedDipole[0] + qir[2]*atomJ.inducedDipole[1]
+ qir[3]*atomJ.inducedDipole[2];
sci[8] = qkr[1]*atomI.inducedDipole[0] + qkr[2]*atomI.inducedDipole[1]
+ qkr[3]*atomI.inducedDipole[2];
scip[1] = atomI.inducedDipoleP[0]*dk[1] + atomI.inducedDipoleP[1]*dk[2]
+ atomI.inducedDipoleP[2]*dk[3] + di[1]*atomJ.inducedDipoleP[0]
+ di[2]*atomJ.inducedDipoleP[1] + di[3]*atomJ.inducedDipoleP[2];
scip[2] = atomI.inducedDipole[0]*atomJ.inducedDipoleP[0]+atomI.inducedDipole[1]*atomJ.inducedDipoleP[1]
+ atomI.inducedDipole[2]*atomJ.inducedDipoleP[2]+atomI.inducedDipoleP[0]*atomJ.inducedDipole[0]
+ atomI.inducedDipoleP[1]*atomJ.inducedDipole[1]+atomI.inducedDipoleP[2]*atomJ.inducedDipole[2];
scip[3] = atomI.inducedDipoleP[0]*xr + atomI.inducedDipoleP[1]*yr + atomI.inducedDipoleP[2]*zr;
scip[4] = atomJ.inducedDipoleP[0]*xr + atomJ.inducedDipoleP[1]*yr + atomJ.inducedDipoleP[2]*zr;
scip[7] = qir[1]*atomJ.inducedDipoleP[0] + qir[2]*atomJ.inducedDipoleP[1]
+ qir[3]*atomJ.inducedDipoleP[2];
scip[8] = qkr[1]*atomI.inducedDipoleP[0] + qkr[2]*atomI.inducedDipoleP[1]
+ qkr[3]*atomI.inducedDipoleP[2];
// calculate the gl functions for permanent components
gl[0] = ci*ck;
gl[1] = ck*sc[3] - ci*sc[4];
gl[2] = ci*sc[6] + ck*sc[5] - sc[3]*sc[4];
gl[3] = sc[3]*sc[6] - sc[4]*sc[5];
gl[4] = sc[5]*sc[6];
gl[5] = -4.0f * sc[9];
gl[6] = sc[2];
gl[7] = 2.0f * (sc[7]-sc[8]);
gl[8] = 2.0f * sc[10];
// calculate the gl functions for induced components
gli[1] = ck*sci[3] - ci*sci[4];
gli[2] = -sc[3]*sci[4] - sci[3]*sc[4];
gli[3] = sci[3]*sc[6] - sci[4]*sc[5];
gli[6] = sci[1];
gli[7] = 2.0f * (sci[7]-sci[8]);
glip[1] = ck*scip[3] - ci*scip[4];
glip[2] = -sc[3]*scip[4] - scip[3]*sc[4];
glip[3] = scip[3]*sc[6] - scip[4]*sc[5];
glip[6] = scip[1];
glip[7] = 2.0f * (scip[7]-scip[8]);
// compute the energy contributions for this interaction
e = bn[0]*gl[0] + bn[1]*(gl[1]+gl[6])
+ bn[2]*(gl[2]+gl[7]+gl[8])
+ bn[3]*(gl[3]+gl[5]) + bn[4]*gl[4];
ei = 0.5f * (bn[1]*(gli[1]+gli[6])
+ bn[2]*(gli[2]+gli[7]) + bn[3]*gli[3]);
// get the real energy without any screening function
erl = rr1*gl[0] + rr3*(gl[1]+gl[6])
+ rr5*(gl[2]+gl[7]+gl[8])
+ rr7*(gl[3]+gl[5]) + rr9*gl[4];
erli = 0.5f*(rr3*(gli[1]+gli[6])*psc3
+ rr5*(gli[2]+gli[7])*psc5
+ rr7*gli[3]*psc7);
e = e - (1.0f-scalingFactors[MScaleIndex])*erl;
ei = ei - erli;
e = f * e;
ei = f * ei;
// em = em + e;
// ep = ep + ei;
// increment the total intramolecular energy; assumes;
// intramolecular distances are less than half of cell;
// length and less than the ewald cutoff;
/*
if (molcule(ii) .eq. molcule(kk)) {
eintra = eintra + mscale(kk)*erl*f;
eintra = eintra + 0.5f*pscale(kk);
& * (rr3*(gli[1]+gli[6])*scale3;
& + rr5*(gli[2]+gli[7])*scale5;
& + rr7*gli[3]*scale7);
}
*/
// intermediate variables for permanent force terms
gf[1] = bn[1]*gl[0] + bn[2]*(gl[1]+gl[6])
+ bn[3]*(gl[2]+gl[7]+gl[8])
+ bn[4]*(gl[3]+gl[5]) + bn[5]*gl[4];
gf[2] = -ck*bn[1] + sc[4]*bn[2] - sc[6]*bn[3];
gf[3] = ci*bn[1] + sc[3]*bn[2] + sc[5]*bn[3];
gf[4] = 2.0f * bn[2];
gf[5] = 2.0f * (-ck*bn[2]+sc[4]*bn[3]-sc[6]*bn[4]);
gf[6] = 2.0f * (-ci*bn[2]-sc[3]*bn[3]-sc[5]*bn[4]);
gf[7] = 4.0f * bn[3];
gfr[1] = rr3*gl[0] + rr5*(gl[1]+gl[6])
+ rr7*(gl[2]+gl[7]+gl[8])
+ rr9*(gl[3]+gl[5]) + rr11*gl[4];
gfr[2] = -ck*rr3 + sc[4]*rr5 - sc[6]*rr7;
gfr[3] = ci*rr3 + sc[3]*rr5 + sc[5]*rr7;
gfr[4] = 2.0f * rr5;
gfr[5] = 2.0f * (-ck*rr5+sc[4]*rr7-sc[6]*rr9);
gfr[6] = 2.0f * (-ci*rr5-sc[3]*rr7-sc[5]*rr9);
gfr[7] = 4.0f * rr7;
// intermediate variables for induced force terms
gfi[1] = 0.5f*bn[2]*(gli[1]+glip[1]+gli[6]+glip[6])
+ 0.5f*bn[2]*scip[2]
+ 0.5f*bn[3]*(gli[2]+glip[2]+gli[7]+glip[7])
- 0.5f*bn[3]*(sci[3]*scip[4]+scip[3]*sci[4])
+ 0.5f*bn[4]*(gli[3]+glip[3]);
gfi[2] = -ck*bn[1] + sc[4]*bn[2] - sc[6]*bn[3];
gfi[3] = ci*bn[1] + sc[3]*bn[2] + sc[5]*bn[3];
gfi[4] = 2.0f * bn[2];
gfi[5] = bn[3] * (sci[4]+scip[4]);
gfi[6] = -bn[3] * (sci[3]+scip[3]);
gfri[1] = 0.5f*rr5*((gli[1]+gli[6])*psc3
+ (glip[1]+glip[6])*dsc3
+ scip[2]*usc3)
+ 0.5f*rr7*((gli[7]+gli[2])*psc5
+ (glip[7]+glip[2])*dsc5
- (sci[3]*scip[4]+scip[3]*sci[4])*usc5)
+ 0.5f*rr9*(gli[3]*psc7+glip[3]*dsc7);
gfri[2] = -rr3*ck + rr5*sc[4] - rr7*sc[6];
gfri[3] = rr3*ci + rr5*sc[3] + rr7*sc[5];
gfri[4] = 2.0f * rr5;
gfri[5] = rr7 * (sci[4]*psc7+scip[4]*dsc7);
gfri[6] = -rr7 * (sci[3]*psc7+scip[3]*dsc7);
// get the permanent force with screening
ftm2[1] = gf[1]*xr + gf[2]*di[1] + gf[3]*dk[1]
+ gf[4]*(qkdi[1]-qidk[1]) + gf[5]*qir[1]
+ gf[6]*qkr[1] + gf[7]*(qiqkr[1]+qkqir[1]);
ftm2[2] = gf[1]*yr + gf[2]*di[2] + gf[3]*dk[2]
+ gf[4]*(qkdi[2]-qidk[2]) + gf[5]*qir[2]
+ gf[6]*qkr[2] + gf[7]*(qiqkr[2]+qkqir[2]);
ftm2[3] = gf[1]*zr + gf[2]*di[3] + gf[3]*dk[3]
+ gf[4]*(qkdi[3]-qidk[3]) + gf[5]*qir[3]
+ gf[6]*qkr[3] + gf[7]*(qiqkr[3]+qkqir[3]);
// get the permanent force without screening
ftm2r[1] = gfr[1]*xr + gfr[2]*di[1] + gfr[3]*dk[1]
+ gfr[4]*(qkdi[1]-qidk[1]) + gfr[5]*qir[1]
+ gfr[6]*qkr[1] + gfr[7]*(qiqkr[1]+qkqir[1]);
ftm2r[2] = gfr[1]*yr + gfr[2]*di[2] + gfr[3]*dk[2]
+ gfr[4]*(qkdi[2]-qidk[2]) + gfr[5]*qir[2]
+ gfr[6]*qkr[2] + gfr[7]*(qiqkr[2]+qkqir[2]);
ftm2r[3] = gfr[1]*zr + gfr[2]*di[3] + gfr[3]*dk[3]
+ gfr[4]*(qkdi[3]-qidk[3]) + gfr[5]*qir[3]
+ gfr[6]*qkr[3] + gfr[7]*(qiqkr[3]+qkqir[3]);
// get the induced force with screening
ftm2i[1] = gfi[1]*xr + 0.5f*
(gfi[2]*(atomI.inducedDipole[0]+atomI.inducedDipoleP[0])
+ bn[2]*(sci[4]*atomI.inducedDipoleP[0]+scip[4]*atomI.inducedDipole[0])
+ gfi[3]*(atomJ.inducedDipole[0]+atomJ.inducedDipoleP[0])
+ bn[2]*(sci[3]*atomJ.inducedDipoleP[0]+scip[3]*atomJ.inducedDipole[0])
+ (sci[4]+scip[4])*bn[2]*di[1]
+ (sci[3]+scip[3])*bn[2]*dk[1]
+ gfi[4]*(qkui[1]+qkuip[1]-qiuk[1]-qiukp[1]))
+ gfi[5]*qir[1] + gfi[6]*qkr[1];
ftm2i[2] = gfi[1]*yr + 0.5f*
(gfi[2]*(atomI.inducedDipole[1]+atomI.inducedDipoleP[1])
+ bn[2]*(sci[4]*atomI.inducedDipoleP[1]+scip[4]*atomI.inducedDipole[1])
+ gfi[3]*(atomJ.inducedDipole[1]+atomJ.inducedDipoleP[1])
+ bn[2]*(sci[3]*atomJ.inducedDipoleP[1]+scip[3]*atomJ.inducedDipole[1])
+ (sci[4]+scip[4])*bn[2]*di[2]
+ (sci[3]+scip[3])*bn[2]*dk[2]
+ gfi[4]*(qkui[2]+qkuip[2]-qiuk[2]-qiukp[2]))
+ gfi[5]*qir[2] + gfi[6]*qkr[2];
ftm2i[3] = gfi[1]*zr + 0.5f*
(gfi[2]*(atomI.inducedDipole[2]+atomI.inducedDipoleP[2])
+ bn[2]*(sci[4]*atomI.inducedDipoleP[2]+scip[4]*atomI.inducedDipole[2])
+ gfi[3]*(atomJ.inducedDipole[2]+atomJ.inducedDipoleP[2])
+ bn[2]*(sci[3]*atomJ.inducedDipoleP[2]+scip[3]*atomJ.inducedDipole[2])
+ (sci[4]+scip[4])*bn[2]*di[3]
+ (sci[3]+scip[3])*bn[2]*dk[3]
+ gfi[4]*(qkui[3]+qkuip[3]-qiuk[3]-qiukp[3]))
+ gfi[5]*qir[3] + gfi[6]*qkr[3];
// get the induced force without screening
ftm2ri[1] = gfri[1]*xr + 0.5f*
(- rr3*ck*(atomI.inducedDipole[0]*psc3+atomI.inducedDipoleP[0]*dsc3)
+ rr5*sc[4]*(atomI.inducedDipole[0]*psc5+atomI.inducedDipoleP[0]*dsc5)
- rr7*sc[6]*(atomI.inducedDipole[0]*psc7+atomI.inducedDipoleP[0]*dsc7))
+ (rr3*ci*(atomJ.inducedDipole[0]*psc3+atomJ.inducedDipoleP[0]*dsc3)
+ rr5*sc[3]*(atomJ.inducedDipole[0]*psc5+atomJ.inducedDipoleP[0]*dsc5)
+ rr7*sc[5]*(atomJ.inducedDipole[0]*psc7+atomJ.inducedDipoleP[0]*dsc7))*0.5f
+ rr5*usc5*(sci[4]*atomI.inducedDipoleP[0]+scip[4]*atomI.inducedDipole[0]
+ sci[3]*atomJ.inducedDipoleP[0]+scip[3]*atomJ.inducedDipole[0])*0.5f
+ 0.5f*(sci[4]*psc5+scip[4]*dsc5)*rr5*di[1]
+ 0.5f*(sci[3]*psc5+scip[3]*dsc5)*rr5*dk[1]
+ 0.5f*gfri[4]*((qkui[1]-qiuk[1])*psc5
+ (qkuip[1]-qiukp[1])*dsc5)
+ gfri[5]*qir[1] + gfri[6]*qkr[1];
ftm2ri[2] = gfri[1]*yr + 0.5f*
(- rr3*ck*(atomI.inducedDipole[1]*psc3+atomI.inducedDipoleP[1]*dsc3)
+ rr5*sc[4]*(atomI.inducedDipole[1]*psc5+atomI.inducedDipoleP[1]*dsc5)
- rr7*sc[6]*(atomI.inducedDipole[1]*psc7+atomI.inducedDipoleP[1]*dsc7))
+ (rr3*ci*(atomJ.inducedDipole[1]*psc3+atomJ.inducedDipoleP[1]*dsc3)
+ rr5*sc[3]*(atomJ.inducedDipole[1]*psc5+atomJ.inducedDipoleP[1]*dsc5)
+ rr7*sc[5]*(atomJ.inducedDipole[1]*psc7+atomJ.inducedDipoleP[1]*dsc7))*0.5f
+ rr5*usc5*(sci[4]*atomI.inducedDipoleP[1]+scip[4]*atomI.inducedDipole[1]
+ sci[3]*atomJ.inducedDipoleP[1]+scip[3]*atomJ.inducedDipole[1])*0.5f
+ 0.5f*(sci[4]*psc5+scip[4]*dsc5)*rr5*di[2]
+ 0.5f*(sci[3]*psc5+scip[3]*dsc5)*rr5*dk[2]
+ 0.5f*gfri[4]*((qkui[2]-qiuk[2])*psc5
+ (qkuip[2]-qiukp[2])*dsc5)
+ gfri[5]*qir[2] + gfri[6]*qkr[2];
ftm2ri[3] = gfri[1]*zr + 0.5f*
(- rr3*ck*(atomI.inducedDipole[2]*psc3+atomI.inducedDipoleP[2]*dsc3)
+ rr5*sc[4]*(atomI.inducedDipole[2]*psc5+atomI.inducedDipoleP[2]*dsc5)
- rr7*sc[6]*(atomI.inducedDipole[2]*psc7+atomI.inducedDipoleP[2]*dsc7))
+ (rr3*ci*(atomJ.inducedDipole[2]*psc3+atomJ.inducedDipoleP[2]*dsc3)
+ rr5*sc[3]*(atomJ.inducedDipole[2]*psc5+atomJ.inducedDipoleP[2]*dsc5)
+ rr7*sc[5]*(atomJ.inducedDipole[2]*psc7+atomJ.inducedDipoleP[2]*dsc7))*0.5f
+ rr5*usc5*(sci[4]*atomI.inducedDipoleP[2]+scip[4]*atomI.inducedDipole[2]
+ sci[3]*atomJ.inducedDipoleP[2]+scip[3]*atomJ.inducedDipole[2])*0.5f
+ 0.5f*(sci[4]*psc5+scip[4]*dsc5)*rr5*di[3]
+ 0.5f*(sci[3]*psc5+scip[3]*dsc5)*rr5*dk[3]
+ 0.5f*gfri[4]*((qkui[3]-qiuk[3])*psc5
+ (qkuip[3]-qiukp[3])*dsc5)
+ gfri[5]*qir[3] + gfri[6]*qkr[3];
// account for partially excluded induced interactions
float temp3 = 0.5f * rr3 * ((gli[1]+gli[6])*scalingFactors[PScaleIndex]
+(glip[1]+glip[6])*scalingFactors[DScaleIndex]);
float temp5 = 0.5f * rr5 * ((gli[2]+gli[7])*scalingFactors[PScaleIndex]
+(glip[2]+glip[7])*scalingFactors[DScaleIndex]);
float temp7 = 0.5f * rr7 * (gli[3]*scalingFactors[PScaleIndex]
+glip[3]*scalingFactors[DScaleIndex]);
fridmp[1] = temp3*ddsc3[1] + temp5*ddsc5[1]
+ temp7*ddsc7[1];
fridmp[2] = temp3*ddsc3[2] + temp5*ddsc5[2]
+ temp7*ddsc7[2];
fridmp[3] = temp3*ddsc3[3] + temp5*ddsc5[3]
+ temp7*ddsc7[3];
// find some scaling terms for induced-induced force
temp3 = 0.5f * rr3 * scalingFactors[UScaleIndex] * scip[2];
temp5 = -0.5f * rr5 * scalingFactors[UScaleIndex] * (sci[3]*scip[4]+scip[3]*sci[4]);
findmp[1] = temp3*ddsc3[1] + temp5*ddsc5[1];
findmp[2] = temp3*ddsc3[2] + temp5*ddsc5[2];
findmp[3] = temp3*ddsc3[3] + temp5*ddsc5[3];
// modify the forces for partially excluded interactions
ftm2i[1] = ftm2i[1] - fridmp[1] - findmp[1];
ftm2i[2] = ftm2i[2] - fridmp[2] - findmp[2];
ftm2i[3] = ftm2i[3] - fridmp[3] - findmp[3];
// correction to convert mutual to direct polarization force
/*
if (poltyp .eq. 'DIRECT') {
gfd = 0.5f * (bn[2]*scip[2];
& - bn[3]*(scip[3]*sci[4]+sci[3]*scip[4]));
gfdr = 0.5f * (rr5*scip[2]*usc3;
& - rr7*(scip[3]*sci[4];
& +sci[3]*scip[4])*usc5);
ftm2i[1] = ftm2i[1] - gfd*xr - 0.5f*bn[2]*;
& (sci[4]*atomI.inducedDipoleP[0]+scip[4]*atomI.inducedDipole[0];
& +sci[3]*atomJ.inducedDipoleP[0]+scip[3]*atomJ.inducedDipole[0]);
ftm2i[2] = ftm2i[2] - gfd*yr - 0.5f*bn[2]*;
& (sci[4]*atomI.inducedDipoleP[1]+scip[4]*atomI.inducedDipole[1];
& +sci[3]*atomJ.inducedDipoleP[1]+scip[3]*atomJ.inducedDipole[1]);
ftm2i[3] = ftm2i[3] - gfd*zr - 0.5f*bn[2]*;
& (sci[4]*atomI.inducedDipoleP[2]+scip[4]*atomI.inducedDipole[2];
& +sci[3]*atomJ.inducedDipoleP[2]+scip[3]*atomJ.inducedDipole[2]);
fdir[1] = gfdr*xr + 0.5f*usc5*rr5*;
& (sci[4]*atomI.inducedDipoleP[0]+scip[4]*atomI.inducedDipole[0];
& + sci[3]*atomJ.inducedDipoleP[0]+scip[3]*atomJ.inducedDipole[0]);
fdir[2] = gfdr*yr + 0.5f*usc5*rr5*;
& (sci[4]*atomI.inducedDipoleP[1]+scip[4]*atomI.inducedDipole[1];
& + sci[3]*atomJ.inducedDipoleP[1]+scip[3]*atomJ.inducedDipole[1]);
fdir[3] = gfdr*zr + 0.5f*usc5*rr5*;
& (sci[4]*atomI.inducedDipoleP[2]+scip[4]*atomI.inducedDipole[2];
& + sci[3]*atomJ.inducedDipoleP[2]+scip[3]*atomJ.inducedDipole[2]);
ftm2i[1] = ftm2i[1] + fdir[1] + findmp[1];
ftm2i[2] = ftm2i[2] + fdir[2] + findmp[2];
ftm2i[3] = ftm2i[3] + fdir[3] + findmp[3];
}
*/
// intermediate variables for induced torque terms
gti[2] = 0.5f * bn[2] * (sci[4]+scip[4]);
gti[3] = 0.5f * bn[2] * (sci[3]+scip[3]);
gti[4] = gfi[4];
gti[5] = gfi[5];
gti[6] = gfi[6];
gtri[2] = 0.5f * rr5 * (sci[4]*psc5+scip[4]*dsc5);
gtri[3] = 0.5f * rr5 * (sci[3]*psc5+scip[3]*dsc5);
gtri[4] = gfri[4];
gtri[5] = gfri[5];
gtri[6] = gfri[6];
// get the permanent torque with screening
ttm2[1] = -bn[1]*dixdk[1] + gf[2]*dixr[1]
+ gf[4]*(dixqkr[1]+dkxqir[1]+rxqidk[1]-2.0f*qixqk[1])
- gf[5]*rxqir[1] - gf[7]*(rxqikr[1]+qkrxqir[1]);
ttm2[2] = -bn[1]*dixdk[2] + gf[2]*dixr[2]
+ gf[4]*(dixqkr[2]+dkxqir[2]+rxqidk[2]-2.0f*qixqk[2])
- gf[5]*rxqir[2] - gf[7]*(rxqikr[2]+qkrxqir[2]);
ttm2[3] = -bn[1]*dixdk[3] + gf[2]*dixr[3]
+ gf[4]*(dixqkr[3]+dkxqir[3]+rxqidk[3]-2.0f*qixqk[3])
- gf[5]*rxqir[3] - gf[7]*(rxqikr[3]+qkrxqir[3]);
ttm3[1] = bn[1]*dixdk[1] + gf[3]*dkxr[1]
- gf[4]*(dixqkr[1]+dkxqir[1]+rxqkdi[1]-2.0f*qixqk[1])
- gf[6]*rxqkr[1] - gf[7]*(rxqkir[1]-qkrxqir[1]);
ttm3[2] = bn[1]*dixdk[2] + gf[3]*dkxr[2]
- gf[4]*(dixqkr[2]+dkxqir[2]+rxqkdi[2]-2.0f*qixqk[2])
- gf[6]*rxqkr[2] - gf[7]*(rxqkir[2]-qkrxqir[2]);
ttm3[3] = bn[1]*dixdk[3] + gf[3]*dkxr[3]
- gf[4]*(dixqkr[3]+dkxqir[3]+rxqkdi[3]-2.0f*qixqk[3])
- gf[6]*rxqkr[3] - gf[7]*(rxqkir[3]-qkrxqir[3]);
// get the permanent torque without screening
ttm2r[1] = -rr3*dixdk[1] + gfr[2]*dixr[1]-gfr[5]*rxqir[1]
+ gfr[4]*(dixqkr[1]+dkxqir[1]+rxqidk[1]-2.0f*qixqk[1])
- gfr[7]*(rxqikr[1]+qkrxqir[1]);
ttm2r[2] = -rr3*dixdk[2] + gfr[2]*dixr[2]-gfr[5]*rxqir[2]
+ gfr[4]*(dixqkr[2]+dkxqir[2]+rxqidk[2]-2.0f*qixqk[2])
- gfr[7]*(rxqikr[2]+qkrxqir[2]);
ttm2r[3] = -rr3*dixdk[3] + gfr[2]*dixr[3]-gfr[5]*rxqir[3]
+ gfr[4]*(dixqkr[3]+dkxqir[3]+rxqidk[3]-2.0f*qixqk[3])
- gfr[7]*(rxqikr[3]+qkrxqir[3]);
ttm3r[1] = rr3*dixdk[1] + gfr[3]*dkxr[1] -gfr[6]*rxqkr[1]
- gfr[4]*(dixqkr[1]+dkxqir[1]+rxqkdi[1]-2.0f*qixqk[1])
- gfr[7]*(rxqkir[1]-qkrxqir[1]);
ttm3r[2] = rr3*dixdk[2] + gfr[3]*dkxr[2] -gfr[6]*rxqkr[2]
- gfr[4]*(dixqkr[2]+dkxqir[2]+rxqkdi[2]-2.0f*qixqk[2])
- gfr[7]*(rxqkir[2]-qkrxqir[2]);
ttm3r[3] = rr3*dixdk[3] + gfr[3]*dkxr[3] -gfr[6]*rxqkr[3]
- gfr[4]*(dixqkr[3]+dkxqir[3]+rxqkdi[3]-2.0f*qixqk[3])
- gfr[7]*(rxqkir[3]-qkrxqir[3]);
// get the induced torque with screening
ttm2i[1] = -bn[1]*(dixuk[1]+dixukp[1])*0.5f
+ gti[2]*dixr[1] + gti[4]*(ukxqir[1]+rxqiuk[1]
+ ukxqirp[1]+rxqiukp[1])*0.5f - gti[5]*rxqir[1];
ttm2i[2] = -bn[1]*(dixuk[2]+dixukp[2])*0.5f
+ gti[2]*dixr[2] + gti[4]*(ukxqir[2]+rxqiuk[2]
+ ukxqirp[2]+rxqiukp[2])*0.5f - gti[5]*rxqir[2];
ttm2i[3] = -bn[1]*(dixuk[3]+dixukp[3])*0.5f
+ gti[2]*dixr[3] + gti[4]*(ukxqir[3]+rxqiuk[3]
+ ukxqirp[3]+rxqiukp[3])*0.5f - gti[5]*rxqir[3];
ttm3i[1] = -bn[1]*(dkxui[1]+dkxuip[1])*0.5f
+ gti[3]*dkxr[1] - gti[4]*(uixqkr[1]+rxqkui[1]
+ uixqkrp[1]+rxqkuip[1])*0.5f - gti[6]*rxqkr[1];
ttm3i[2] = -bn[1]*(dkxui[2]+dkxuip[2])*0.5f
+ gti[3]*dkxr[2] - gti[4]*(uixqkr[2]+rxqkui[2]
+ uixqkrp[2]+rxqkuip[2])*0.5f - gti[6]*rxqkr[2];
ttm3i[3] = -bn[1]*(dkxui[3]+dkxuip[3])*0.5f
+ gti[3]*dkxr[3] - gti[4]*(uixqkr[3]+rxqkui[3]
+ uixqkrp[3]+rxqkuip[3])*0.5f - gti[6]*rxqkr[3];
// get the induced torque without screening
ttm2ri[1] = -rr3*(dixuk[1]*psc3+dixukp[1]*dsc3)*0.5f
+ gtri[2]*dixr[1] + gtri[4]*((ukxqir[1]+rxqiuk[1])*psc5
+(ukxqirp[1]+rxqiukp[1])*dsc5)*0.5f - gtri[5]*rxqir[1];
ttm2ri[2] = -rr3*(dixuk[2]*psc3+dixukp[2]*dsc3)*0.5f
+ gtri[2]*dixr[2] + gtri[4]*((ukxqir[2]+rxqiuk[2])*psc5
+(ukxqirp[2]+rxqiukp[2])*dsc5)*0.5f - gtri[5]*rxqir[2];
ttm2ri[3] = -rr3*(dixuk[3]*psc3+dixukp[3]*dsc3)*0.5f
+ gtri[2]*dixr[3] + gtri[4]*((ukxqir[3]+rxqiuk[3])*psc5
+(ukxqirp[3]+rxqiukp[3])*dsc5)*0.5f - gtri[5]*rxqir[3];
ttm3ri[1] = -rr3*(dkxui[1]*psc3+dkxuip[1]*dsc3)*0.5f
+ gtri[3]*dkxr[1] - gtri[4]*((uixqkr[1]+rxqkui[1])*psc5
+(uixqkrp[1]+rxqkuip[1])*dsc5)*0.5f - gtri[6]*rxqkr[1];
ttm3ri[2] = -rr3*(dkxui[2]*psc3+dkxuip[2]*dsc3)*0.5f
+ gtri[3]*dkxr[2] - gtri[4]*((uixqkr[2]+rxqkui[2])*psc5
+(uixqkrp[2]+rxqkuip[2])*dsc5)*0.5f - gtri[6]*rxqkr[2];
ttm3ri[3] = -rr3*(dkxui[3]*psc3+dkxuip[3]*dsc3)*0.5f
+ gtri[3]*dkxr[3] - gtri[4]*((uixqkr[3]+rxqkui[3])*psc5
+(uixqkrp[3]+rxqkuip[3])*dsc5)*0.5f - gtri[6]*rxqkr[3];
// handle the case where scaling is used
for( int j = 1; j <= 3; j++ ){
ftm2[j] = f * (ftm2[j]-(1.0f-scalingFactors[MScaleIndex])*ftm2r[j]);
ftm2i[j] = f * (ftm2i[j]-ftm2ri[j]);
ttm2[j] = f * (ttm2[j]-(1.0f-scalingFactors[MScaleIndex])*ttm2r[j]);
ttm2i[j] = f * (ttm2i[j]-ttm2ri[j]);
ttm3[j] = f * (ttm3[j]-(1.0f-scalingFactors[MScaleIndex])*ttm3r[j]);
ttm3i[j] = f * (ttm3i[j]-ttm3ri[j]);
}
// increment gradient due to force and torque on first site;
/*
dem(1,ii) = dem(1,ii) + ftm2[1];
dem(2,ii) = dem(2,ii) + ftm2[2];
dem(3,ii) = dem(3,ii) + ftm2[3];
dep(1,ii) = dep(1,ii) + ftm2i[1];
dep(2,ii) = dep(2,ii) + ftm2i[2];
dep(3,ii) = dep(3,ii) + ftm2i[3];
call torque (i,ttm2,ttm2i,frcxi,frcyi,frczi);
// increment gradient due to force and torque on second site;
dem(1,kk) = dem(1,kk) - ftm2[1];
dem(2,kk) = dem(2,kk) - ftm2[2];
dem(3,kk) = dem(3,kk) - ftm2[3];
dep(1,kk) = dep(1,kk) - ftm2i[1];
dep(2,kk) = dep(2,kk) - ftm2i[2];
dep(3,kk) = dep(3,kk) - ftm2i[3];
call torque (k,ttm3,ttm3i,frcxk,frcyk,frczk);
*/
}
return;
}
__device__ void loadRealSpaceEwaldShared( struct RealSpaceEwaldParticle* sA, unsigned int atomI,
float4* atomCoord, float* labFrameDipoleJ, float* labQuadrupole,
float* inducedDipole, float* inducedDipolePolar, float2* dampingFactorAndThole )
{
// coordinates & charge
sA->x = atomCoord[atomI].x;
sA->y = atomCoord[atomI].y;
sA->z = atomCoord[atomI].z;
sA->q = atomCoord[atomI].w;
// lab dipole
sA->labFrameDipole[0] = labFrameDipoleJ[atomI*3];
sA->labFrameDipole[1] = labFrameDipoleJ[atomI*3+1];
sA->labFrameDipole[2] = labFrameDipoleJ[atomI*3+2];
// lab quadrupole
sA->labFrameQuadrupole[0] = labQuadrupole[atomI*9];
sA->labFrameQuadrupole[1] = labQuadrupole[atomI*9+1];
sA->labFrameQuadrupole[2] = labQuadrupole[atomI*9+2];
sA->labFrameQuadrupole[3] = labQuadrupole[atomI*9+3];
sA->labFrameQuadrupole[4] = labQuadrupole[atomI*9+4];
sA->labFrameQuadrupole[5] = labQuadrupole[atomI*9+5];
sA->labFrameQuadrupole[6] = labQuadrupole[atomI*9+6];
sA->labFrameQuadrupole[7] = labQuadrupole[atomI*9+7];
sA->labFrameQuadrupole[8] = labQuadrupole[atomI*9+8];
// induced dipole
sA->inducedDipole[0] = inducedDipole[atomI*3];
sA->inducedDipole[1] = inducedDipole[atomI*3+1];
sA->inducedDipole[2] = inducedDipole[atomI*3+2];
// induced dipole polar
sA->inducedDipoleP[0] = inducedDipolePolar[atomI*3];
sA->inducedDipoleP[1] = inducedDipolePolar[atomI*3+1];
sA->inducedDipoleP[2] = inducedDipolePolar[atomI*3+2];
sA->damp = dampingFactorAndThole[atomI].x;
sA->thole = dampingFactorAndThole[atomI].y;
}
// Include versions of the kernels for N^2 calculations.
#undef USE_OUTPUT_BUFFER_PER_WARP
#define METHOD_NAME(a, b) a##N2##b
#include "kCalculateAmoebaCudaRealSpaceEwald.h"
#define USE_OUTPUT_BUFFER_PER_WARP
#undef METHOD_NAME
#define METHOD_NAME(a, b) a##N2ByWarp##b
#include "kCalculateAmoebaCudaRealSpaceEwald.h"
// reduce psWorkArray_3_1 -> force
// reduce psWorkArray_3_2 -> torque
static void kReduceForceTorque(amoebaGpuContext amoebaGpu )
{
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_3_1->_pDevStream[0], amoebaGpu->psForce->_pDevStream[0] );
LAUNCHERROR("kReduceRealSpaceEwaldForce");
kReduceFields_kernel<<<amoebaGpu->nonbondBlocks, amoebaGpu->fieldReduceThreadsPerBlock>>>(
amoebaGpu->paddedNumberOfAtoms*3, amoebaGpu->outputBuffers,
amoebaGpu->psWorkArray_3_2->_pDevStream[0], amoebaGpu->psTorque->_pDevStream[0] );
LAUNCHERROR("kReduceRealSpaceEwaldTorque");
}
/**---------------------------------------------------------------------------------------
Compute Amoeba electrostatic force & torque
@param amoebaGpu amoebaGpu context
@param gpu OpenMM gpu Cuda context
--------------------------------------------------------------------------------------- */
void cudaComputeAmoebaRealSpaceEwald( amoebaGpuContext amoebaGpu )
{
// ---------------------------------------------------------------------------------------
static unsigned int threadsPerBlock = 0;
#ifdef AMOEBA_DEBUG
static const char* methodName = "cudaComputeAmoebaRealSpaceEwald";
static int timestep = 0;
std::vector<int> fileId;
timestep++;
fileId.resize( 2 );
fileId[0] = timestep;
fileId[1] = 1;
#endif
// ---------------------------------------------------------------------------------------
gpuContext gpu = amoebaGpu->gpuContext;
// apparently debug array can take up nontrivial no. registers
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s %d maxCovalentDegreeSz=%d"
" gamma=%.3e scalingDistanceCutoff=%.3f ZZZ\n",
methodName, gpu->natoms,
amoebaGpu->maxCovalentDegreeSz, amoebaGpu->pGamma,
amoebaGpu->scalingDistanceCutoff );
}
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
CUDAStream<float4>* debugArray = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
memset( debugArray->_pSysStream[0], 0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
debugArray->Upload();
unsigned int targetAtom = 0;
#endif
// on first pass, set threads/block
if( threadsPerBlock == 0 ){
unsigned int maxThreads;
if (gpu->sm_version >= SM_20)
maxThreads = 384;
else if (gpu->sm_version >= SM_12)
maxThreads = 128;
else
maxThreads = 64;
threadsPerBlock = std::min(getThreadsPerBlock(amoebaGpu, sizeof(RealSpaceEwaldParticle)), maxThreads);
}
kClearFields_3( amoebaGpu, 2 );
if (gpu->bOutputBufferPerWarp){
kCalculateAmoebaCudaRealSpaceEwaldN2ByWarpForces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(RealSpaceEwaldParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevStream[0],
gpu->psPosq4->_pDevStream[0],
amoebaGpu->psLabFrameDipole->_pDevStream[0],
amoebaGpu->psLabFrameQuadrupole->_pDevStream[0],
amoebaGpu->psInducedDipole->_pDevStream[0],
amoebaGpu->psInducedDipolePolar->_pDevStream[0],
amoebaGpu->psWorkArray_3_1->_pDevStream[0],
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevStream[0],
debugArray->_pDevStream[0], targetAtom );
#else
amoebaGpu->psWorkArray_3_2->_pDevStream[0] );
#endif
} else {
#ifdef AMOEBA_DEBUG
(void) fprintf( amoebaGpu->log, "kCalculateAmoebaCudaRealSpaceEwaldN2Forces no warp: numBlocks=%u numThreads=%u bufferPerWarp=%u atm=%u shrd=%u Ebuf=%u ixnCt=%u workUnits=%u\n",
amoebaGpu->nonbondBlocks, threadsPerBlock, amoebaGpu->bOutputBufferPerWarp,
sizeof(RealSpaceEwaldParticle), sizeof(RealSpaceEwaldParticle)*threadsPerBlock, amoebaGpu->energyOutputBuffers, (*gpu->psInteractionCount)[0], gpu->sim.workUnits );
(void) fflush( amoebaGpu->log );
#endif
kCalculateAmoebaCudaRealSpaceEwaldN2Forces_kernel<<<amoebaGpu->nonbondBlocks, threadsPerBlock, sizeof(RealSpaceEwaldParticle)*threadsPerBlock>>>(
amoebaGpu->psWorkUnit->_pDevStream[0],
gpu->psPosq4->_pDevStream[0],
amoebaGpu->psLabFrameDipole->_pDevStream[0],
amoebaGpu->psLabFrameQuadrupole->_pDevStream[0],
amoebaGpu->psInducedDipole->_pDevStream[0],
amoebaGpu->psInducedDipolePolar->_pDevStream[0],
amoebaGpu->psWorkArray_3_1->_pDevStream[0],
#ifdef AMOEBA_DEBUG
amoebaGpu->psWorkArray_3_2->_pDevStream[0],
debugArray->_pDevStream[0], targetAtom );
#else
amoebaGpu->psWorkArray_3_2->_pDevStream[0] );
#endif
}
LAUNCHERROR("kCalculateAmoebaCudaRealSpaceEwaldN2Forces");
kReduceForceTorque( amoebaGpu );
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
amoebaGpu->psForce->Download();
amoebaGpu->psTorque->Download();
debugArray->Download();
(void) fprintf( amoebaGpu->log, "Finished RealSpaceEwald kernel execution\n" ); (void) fflush( amoebaGpu->log );
int maxPrint = 1400;
for( int ii = 0; ii < gpu->natoms; ii++ ){
(void) fprintf( amoebaGpu->log, "%5d ", ii);
int indexOffset = ii*3;
// force
(void) fprintf( amoebaGpu->log,"RealSpaceEwaldF [%16.9e %16.9e %16.9e] ",
amoebaGpu->psForce->_pSysStream[0][indexOffset],
amoebaGpu->psForce->_pSysStream[0][indexOffset+1],
amoebaGpu->psForce->_pSysStream[0][indexOffset+2] );
// torque
(void) fprintf( amoebaGpu->log,"RealSpaceEwaldT [%16.9e %16.9e %16.9e] ",
amoebaGpu->psTorque->_pSysStream[0][indexOffset],
amoebaGpu->psTorque->_pSysStream[0][indexOffset+1],
amoebaGpu->psTorque->_pSysStream[0][indexOffset+2] );
// coords
#if 0
(void) fprintf( amoebaGpu->log,"x[%16.9e %16.9e %16.9e] ",
gpu->psPosq4->_pSysStream[0][ii].x,
gpu->psPosq4->_pSysStream[0][ii].y,
gpu->psPosq4->_pSysStream[0][ii].z);
for( int jj = 0; jj < gpu->natoms && jj < 5; jj++ ){
int debugIndex = jj*gpu->natoms + ii;
float xx = gpu->psPosq4->_pSysStream[0][jj].x - gpu->psPosq4->_pSysStream[0][ii].x;
float yy = gpu->psPosq4->_pSysStream[0][jj].y - gpu->psPosq4->_pSysStream[0][ii].y;
float zz = gpu->psPosq4->_pSysStream[0][jj].z - gpu->psPosq4->_pSysStream[0][ii].z;
(void) fprintf( amoebaGpu->log,"\n%4d %4d delta [%16.9e %16.9e %16.9e] [%16.9e %16.9e %16.9e] ",
ii, jj, xx, yy, zz,
debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y, debugArray->_pSysStream[0][debugIndex].z );
}
#endif
(void) fprintf( amoebaGpu->log,"\n" );
if( ii == maxPrint && (gpu->natoms - maxPrint) > ii ){
ii = gpu->natoms - maxPrint;
}
}
if( 1 ){
(void) fprintf( amoebaGpu->log,"DebugElec\n" );
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
for( int jj = 0; jj < gpu->natoms; jj++ ){
int debugIndex = jj;
for( int kk = 0; kk < 5; kk++ ){
(void) fprintf( amoebaGpu->log,"%5d %5d [%16.9e %16.9e %16.9e %16.9e] E11\n", targetAtom, jj,
debugArray->_pSysStream[0][debugIndex].x, debugArray->_pSysStream[0][debugIndex].y,
debugArray->_pSysStream[0][debugIndex].z, debugArray->_pSysStream[0][debugIndex].w );
debugIndex += paddedNumberOfAtoms;
}
(void) fprintf( amoebaGpu->log,"\n" );
}
}
(void) fflush( amoebaGpu->log );
if( 0 ){
(void) fprintf( amoebaGpu->log, "%s Tiled F & T\n", methodName ); fflush( amoebaGpu->log );
int maxPrint = 12;
for( int ii = 0; ii < gpu->natoms; ii++ ){
// print cpu & gpu reductions
int offset = 3*ii;
(void) fprintf( amoebaGpu->log,"%6d F[%16.7e %16.7e %16.7e] T[%16.7e %16.7e %16.7e]\n", ii,
amoebaGpu->psForce->_pSysStream[0][offset],
amoebaGpu->psForce->_pSysStream[0][offset+1],
amoebaGpu->psForce->_pSysStream[0][offset+2],
amoebaGpu->psTorque->_pSysStream[0][offset],
amoebaGpu->psTorque->_pSysStream[0][offset+1],
amoebaGpu->psTorque->_pSysStream[0][offset+2] );
if( (ii == maxPrint) && (ii < (gpu->natoms - maxPrint)) )ii = gpu->natoms - maxPrint;
}
}
if( 1 ){
std::vector<int> fileId;
//fileId.push_back( 0 );
VectorOfDoubleVectors outputVector;
cudaLoadCudaFloat4Array( gpu->natoms, 3, gpu->psPosq4, outputVector );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psForce, outputVector );
cudaLoadCudaFloatArray( gpu->natoms, 3, amoebaGpu->psTorque, outputVector);
cudaWriteVectorOfDoubleVectorsToFile( "CudaForceTorque", fileId, outputVector );
}
}
delete debugArray;
#endif
// ---------------------------------------------------------------------------------------
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
#include "amoebaScaleFactors.h"
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(384, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(128, 1)
#else
__launch_bounds__(64, 1)
#endif
void METHOD_NAME(kCalculateAmoebaCudaRealSpaceEwald, Forces_kernel)(
unsigned int* workUnit,
float4* atomCoord,
float* labFrameDipole,
float* labFrameQuadrupole,
float* inducedDipole,
float* inducedDipolePolar,
float* outputForce,
float* outputTorque
#ifdef AMOEBA_DEBUG
, float4* debugArray, unsigned int targetAtom
#endif
){
#ifdef AMOEBA_DEBUG
float4 pullBack[20];
#endif
extern __shared__ RealSpaceEwaldParticle sA[];
unsigned int totalWarps = gridDim.x*blockDim.x/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int numWorkUnits = cSim.pInteractionCount[0];
unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps;
unsigned int lasty = 0xFFFFFFFF;
float totalEnergy = 0.0f;
float scalingFactors[LastScalingIndex];
while (pos < end)
{
unsigned int x;
unsigned int y;
bool bExclusionFlag;
// Extract cell coordinates
decodeCell( workUnit[pos], &x, &y, &bExclusionFlag );
unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int tbx = threadIdx.x - tgx;
unsigned int tj = tgx;
RealSpaceEwaldParticle* psA = &sA[tbx];
unsigned int atomI = x + tgx;
RealSpaceEwaldParticle localParticle;
loadRealSpaceEwaldShared(&localParticle, atomI,
atomCoord, labFrameDipole, labFrameQuadrupole,
inducedDipole, inducedDipolePolar, cAmoebaSim.pDampingFactorAndThole );
localParticle.force[0] = 0.0f;
localParticle.force[1] = 0.0f;
localParticle.force[2] = 0.0f;
localParticle.torque[0] = 0.0f;
localParticle.torque[1] = 0.0f;
localParticle.torque[2] = 0.0f;
scalingFactors[PScaleIndex] = 1.0f;
scalingFactors[DScaleIndex] = 1.0f;
scalingFactors[UScaleIndex] = 1.0f;
scalingFactors[MScaleIndex] = 1.0f;
if (x == y) // Handle diagonals uniquely at 50% efficiency
{
// load shared data
loadRealSpaceEwaldShared( &(sA[threadIdx.x]), atomI,
atomCoord, labFrameDipole, labFrameQuadrupole,
inducedDipole, inducedDipolePolar, cAmoebaSim.pDampingFactorAndThole );
if (!bExclusionFlag)
{
// this branch is never exercised since it includes the
// interaction between atomI and itself which is always excluded
for (unsigned int j = 0; j < GRID; j++)
{
float force[3];
float torque[2][3];
float energy;
calculateRealSpaceEwaldPairIxn_kernel( localParticle, psA[j],
cAmoebaSim.scalingDistanceCutoff, scalingFactors,
force, torque, &energy
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
unsigned int mask = ( (atomI == (y + j)) || (atomI >= cAmoebaSim.numberOfAtoms) || ((y+j) >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
// add to field at atomI the field due atomJ's charge/dipole/quadrupole
localParticle.force[0] += mask ? force[0] : 0.0f;
localParticle.force[1] += mask ? force[1] : 0.0f;
localParticle.force[2] += mask ? force[2] : 0.0f;
localParticle.torque[0] += mask ? torque[0][0] : 0.0f;
localParticle.torque[1] += mask ? torque[0][1] : 0.0f;
localParticle.torque[2] += mask ? torque[0][2] : 0.0f;
totalEnergy += mask ? 0.5*energy : 0.0f;
}
}
else // bExclusion
{
unsigned int xi = x >> GRIDBITS;
unsigned int cell = xi + xi*cAmoebaSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
int dScaleMask = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
int2 pScaleMask = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
int2 mScaleMask = cAmoebaSim.pM_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
for (unsigned int j = 0; j < GRID; j++)
{
float force[3];
float torque[2][3];
unsigned int atomJ = y + j;
// set scale factors
getMaskedDScaleFactor( j, dScaleMask, scalingFactors + DScaleIndex );
getMaskedPScaleFactor( j, pScaleMask, scalingFactors + PScaleIndex );
getMaskedMScaleFactor( j, mScaleMask, scalingFactors + MScaleIndex );
// force
float energy;
calculateRealSpaceEwaldPairIxn_kernel( localParticle, psA[j],
cAmoebaSim.scalingDistanceCutoff, scalingFactors,
force, torque, &energy
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
// nan*0.0 = nan not 0.0, so explicitly exclude (atomI == atomJ) contribution
// by setting match flag
unsigned int mask = ( (atomI == atomJ) || (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
// add to field at atomI the field due atomJ's charge/dipole/quadrupole
localParticle.force[0] += mask ? force[0] : 0.0f;
localParticle.force[1] += mask ? force[1] : 0.0f;
localParticle.force[2] += mask ? force[2] : 0.0f;
localParticle.torque[0] += mask ? torque[0][0] : 0.0f;
localParticle.torque[1] += mask ? torque[0][1] : 0.0f;
localParticle.torque[2] += mask ? torque[0][2] : 0.0f;
totalEnergy += mask ? 0.5*energy : 0.0f;
#ifdef AMOEBA_DEBUG
if( atomI == targetAtom ){
unsigned int index = (atomI == targetAtom) ? atomJ : atomI;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) atomJ;
debugArray[index].z = 1.0f;
debugArray[index].w = (float) y;
/*
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? force[0] : 0.0f;
debugArray[index].y = mask ? force[1] : 0.0f;
debugArray[index].z = mask ? force[2] : 0.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? torque[0][0] : 0.0f;
debugArray[index].y = mask ? torque[0][1] : 0.0f;
debugArray[index].z = mask ? torque[0][2] : 0.0f;
*/
index += cAmoebaSim.paddedNumberOfAtoms;
int pullIndex = 0;
debugArray[index].x = pullBack[pullIndex].x;
debugArray[index].y = pullBack[pullIndex].y;
debugArray[index].z = pullBack[pullIndex].z;
debugArray[index].w = pullBack[pullIndex].w;
index += cAmoebaSim.paddedNumberOfAtoms;
pullIndex++;
debugArray[index].x = pullBack[pullIndex].x;
debugArray[index].y = pullBack[pullIndex].y;
debugArray[index].z = pullBack[pullIndex].z;
debugArray[index].w = pullBack[pullIndex].w;
index += cAmoebaSim.paddedNumberOfAtoms;
pullIndex++;
debugArray[index].x = pullBack[pullIndex].x;
debugArray[index].y = pullBack[pullIndex].y;
debugArray[index].z = pullBack[pullIndex].z;
debugArray[index].w = pullBack[pullIndex].w;
index += cAmoebaSim.paddedNumberOfAtoms;
pullIndex++;
debugArray[index].x = pullBack[pullIndex].x;
debugArray[index].y = pullBack[pullIndex].y;
debugArray[index].z = pullBack[pullIndex].z;
debugArray[index].w = pullBack[pullIndex].w;
}
#endif
}
}
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
float of;
unsigned int offset = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
of = outputForce[offset];
of += localParticle.force[0];
outputForce[offset] = of;
of = outputForce[offset+1];
of += localParticle.force[1];
outputForce[offset+1] = of;
of = outputForce[offset+2];
of += localParticle.force[2];
outputForce[offset+2] = of;
of = outputTorque[offset];
of += localParticle.torque[0];
outputTorque[offset] = of;
of = outputTorque[offset+1];
of += localParticle.torque[1];
outputTorque[offset+1] = of;
of = outputTorque[offset+2];
of += localParticle.torque[2];
outputTorque[offset+2] = of;
#else
unsigned int offset = 3*(x + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
outputForce[offset] = localParticle.force[0];
outputForce[offset+1] = localParticle.force[1];
outputForce[offset+2] = localParticle.force[2];
outputTorque[offset] = localParticle.torque[0];
outputTorque[offset+1] = localParticle.torque[1];
outputTorque[offset+2] = localParticle.torque[2];
#endif
}
else // 100% utilization
{
// Read fixed atom data into registers and GRF
if (lasty != y)
{
// load shared data
loadRealSpaceEwaldShared( &(sA[threadIdx.x]), (y+tgx),
atomCoord, labFrameDipole, labFrameQuadrupole,
inducedDipole, inducedDipolePolar, cAmoebaSim.pDampingFactorAndThole );
}
sA[threadIdx.x].force[0] = 0.0f;
sA[threadIdx.x].force[1] = 0.0f;
sA[threadIdx.x].force[2] = 0.0f;
sA[threadIdx.x].torque[0] = 0.0f;
sA[threadIdx.x].torque[1] = 0.0f;
sA[threadIdx.x].torque[2] = 0.0f;
if (!bExclusionFlag)
{
for (unsigned int j = 0; j < GRID; j++)
{
float force[3];
float torque[2][3];
unsigned int atomJ = y + tj;
float energy;
calculateRealSpaceEwaldPairIxn_kernel( localParticle, psA[tj],
cAmoebaSim.scalingDistanceCutoff, scalingFactors,
force, torque, &energy
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
unsigned int mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || ( atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
// add force and torque to atom I due atom J
localParticle.force[0] += mask ? force[0] : 0.0f;
localParticle.force[1] += mask ? force[1] : 0.0f;
localParticle.force[2] += mask ? force[2] : 0.0f;
localParticle.torque[0] += mask ? torque[0][0] : 0.0f;
localParticle.torque[1] += mask ? torque[0][1] : 0.0f;
localParticle.torque[2] += mask ? torque[0][2] : 0.0f;
totalEnergy += mask ? energy : 0.0f;
// add force and torque to atom J due atom I
psA[tj].force[0] -= mask ? force[0] : 0.0f;
psA[tj].force[1] -= mask ? force[1] : 0.0f;
psA[tj].force[2] -= mask ? force[2] : 0.0f;
psA[tj].torque[0] += mask ? torque[1][0] : 0.0f;
psA[tj].torque[1] += mask ? torque[1][1] : 0.0f;
psA[tj].torque[2] += mask ? torque[1][2] : 0.0f;
#ifdef AMOEBA_DEBUG
if( atomI == targetAtom || atomJ == targetAtom ){
unsigned int index = (atomI == targetAtom) ? atomJ : atomI;
unsigned int indexI = (atomI == targetAtom) ? 0 : 1;
unsigned int indexJ = (atomI == targetAtom) ? 1 : 0;
float forceSign = (atomI == targetAtom) ? 1.0f : -1.0f;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) atomJ;
debugArray[index].z = 2.0f;
debugArray[index].w = (float) y;
/*
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? forceSign*force[0] : 0.0f;
debugArray[index].y = mask ? forceSign*force[1] : 0.0f;
debugArray[index].z = mask ? forceSign*force[2] : 0.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? torque[indexI][0] : 0.0f;
debugArray[index].y = mask ? torque[indexI][1] : 0.0f;
debugArray[index].z = mask ? torque[indexI][2] : 0.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? torque[indexJ][0] : 0.0f;
debugArray[index].y = mask ? torque[indexJ][1] : 0.0f;
debugArray[index].z = mask ? torque[indexJ][2] : 0.0f;
*/
index += cAmoebaSim.paddedNumberOfAtoms;
int pullIndex = 0;
debugArray[index].x = pullBack[pullIndex].x;
debugArray[index].y = pullBack[pullIndex].y;
debugArray[index].z = pullBack[pullIndex].z;
debugArray[index].w = pullBack[pullIndex].w;
index += cAmoebaSim.paddedNumberOfAtoms;
pullIndex++;
debugArray[index].x = pullBack[pullIndex].x;
debugArray[index].y = pullBack[pullIndex].y;
debugArray[index].z = pullBack[pullIndex].z;
debugArray[index].w = pullBack[pullIndex].w;
index += cAmoebaSim.paddedNumberOfAtoms;
pullIndex++;
debugArray[index].x = pullBack[pullIndex].x;
debugArray[index].y = pullBack[pullIndex].y;
debugArray[index].z = pullBack[pullIndex].z;
debugArray[index].w = pullBack[pullIndex].w;
index += cAmoebaSim.paddedNumberOfAtoms;
pullIndex++;
debugArray[index].x = pullBack[pullIndex].x;
debugArray[index].y = pullBack[pullIndex].y;
debugArray[index].z = pullBack[pullIndex].z;
debugArray[index].w = pullBack[pullIndex].w;
}
#endif
tj = (tj + 1) & (GRID - 1);
}
}
else // bExclusion
{
// Read fixed atom data into registers and GRF
unsigned int xi = x >> GRIDBITS;
unsigned int yi = y >> GRIDBITS;
unsigned int cell = xi+yi*cAmoebaSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
int dScaleMask = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
int2 pScaleMask = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
int2 mScaleMask = cAmoebaSim.pM_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
for (unsigned int j = 0; j < GRID; j++)
{
float force[3];
float torque[2][3];
unsigned int atomJ = y + tj;
// set scale factors
getMaskedDScaleFactor( tj, dScaleMask, scalingFactors + DScaleIndex );
getMaskedPScaleFactor( tj, pScaleMask, scalingFactors + PScaleIndex );
getMaskedMScaleFactor( tj, mScaleMask, scalingFactors + MScaleIndex );
// force
float energy;
calculateRealSpaceEwaldPairIxn_kernel( localParticle, psA[tj],
cAmoebaSim.scalingDistanceCutoff, scalingFactors,
force, torque, &energy
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
// check if atoms out-of-bounds
unsigned int mask = ( (atomI >= cAmoebaSim.numberOfAtoms) || (atomJ >= cAmoebaSim.numberOfAtoms) ) ? 0 : 1;
// add force and torque to atom I due atom J
localParticle.force[0] += mask ? force[0] : 0.0f;
localParticle.force[1] += mask ? force[1] : 0.0f;
localParticle.force[2] += mask ? force[2] : 0.0f;
localParticle.torque[0] += mask ? torque[0][0] : 0.0f;
localParticle.torque[1] += mask ? torque[0][1] : 0.0f;
localParticle.torque[2] += mask ? torque[0][2] : 0.0f;
totalEnergy += mask ? energy : 0.0f;
// add force and torque to atom J due atom I
psA[tj].force[0] -= mask ? force[0] : 0.0f;
psA[tj].force[1] -= mask ? force[1] : 0.0f;
psA[tj].force[2] -= mask ? force[2] : 0.0f;
psA[tj].torque[0] += mask ? torque[1][0] : 0.0f;
psA[tj].torque[1] += mask ? torque[1][1] : 0.0f;
psA[tj].torque[2] += mask ? torque[1][2] : 0.0f;
#ifdef AMOEBA_DEBUG
if( atomI == targetAtom || atomJ == targetAtom ){
unsigned int index = (atomI == targetAtom) ? atomJ : atomI;
unsigned int indexI = (atomI == targetAtom) ? 0 : 1;
unsigned int indexJ = (atomI == targetAtom) ? 1 : 0;
float forceSign = (atomI == targetAtom) ? 1.0f : -1.0f;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) atomJ;
debugArray[index].z = 3.0f;
debugArray[index].w = (float) y;
/*
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? forceSign*force[0] : 0.0f;
debugArray[index].y = mask ? forceSign*force[1] : 0.0f;
debugArray[index].z = mask ? forceSign*force[2] : 0.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? torque[indexI][0] : 0.0f;
debugArray[index].y = mask ? torque[indexI][1] : 0.0f;
debugArray[index].z = mask ? torque[indexI][2] : 0.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = mask ? torque[indexJ][0] : 0.0f;
debugArray[index].y = mask ? torque[indexJ][1] : 0.0f;
debugArray[index].z = mask ? torque[indexJ][2] : 0.0f;
*/
index += cAmoebaSim.paddedNumberOfAtoms;
int pullIndex = 0;
debugArray[index].x = pullBack[pullIndex].x;
debugArray[index].y = pullBack[pullIndex].y;
debugArray[index].z = pullBack[pullIndex].z;
debugArray[index].w = pullBack[pullIndex].w;
index += cAmoebaSim.paddedNumberOfAtoms;
pullIndex++;
debugArray[index].x = pullBack[pullIndex].x;
debugArray[index].y = pullBack[pullIndex].y;
debugArray[index].z = pullBack[pullIndex].z;
debugArray[index].w = pullBack[pullIndex].w;
index += cAmoebaSim.paddedNumberOfAtoms;
pullIndex++;
debugArray[index].x = pullBack[pullIndex].x;
debugArray[index].y = pullBack[pullIndex].y;
debugArray[index].z = pullBack[pullIndex].z;
debugArray[index].w = pullBack[pullIndex].w;
index += cAmoebaSim.paddedNumberOfAtoms;
pullIndex++;
debugArray[index].x = pullBack[pullIndex].x;
debugArray[index].y = pullBack[pullIndex].y;
debugArray[index].z = pullBack[pullIndex].z;
debugArray[index].w = pullBack[pullIndex].w;
#if 0
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = scalingFactors[DScaleIndex];
debugArray[index].y = dScaleVal;
debugArray[index].z = scalingFactors[PScaleIndex];
debugArray[index].w = pScaleVal;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = scalingFactors[MScaleIndex];
debugArray[index].y = mScaleVal;
for( int pIndex = 0; pIndex < 14; pIndex++ ){
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pIndex].x;
debugArray[index].y = pullBack[pIndex].y;
debugArray[index].z = pullBack[pIndex].z;
debugArray[index].w = pullBack[pIndex].w;
}
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = labFrameDipole[3*atomI];
debugArray[index].y = labFrameDipole[3*atomI+1];
debugArray[index].z = labFrameDipole[3*atomI+2];
debugArray[index].w = 25.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = labFrameDipole[3*atomJ];
debugArray[index].y = labFrameDipole[3*atomJ+1];
debugArray[index].z = labFrameDipole[3*atomJ+2];
debugArray[index].w = 26.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = jDipole[0];
debugArray[index].y = jDipole[1];
debugArray[index].z = jDipole[2];
debugArray[index].w = 27.0f;
#endif
}
#endif
tj = (tj + 1) & (GRID - 1);
}
}
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
float of;
unsigned int offset = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
of = outputForce[offset];
of += localParticle.force[0];
outputForce[offset] = of;
of = outputForce[offset+1];
of += localParticle.force[1];
outputForce[offset+1] = of;
of = outputForce[offset+2];
of += localParticle.force[2];
outputForce[offset+2] = of;
of = outputTorque[offset];
of += localParticle.torque[0];
outputTorque[offset] = of;
of = outputTorque[offset+1];
of += localParticle.torque[1];
outputTorque[offset+1] = of;
of = outputTorque[offset+2];
of += localParticle.torque[2];
outputTorque[offset+2] = of;
offset = 3*(y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
of = outputForce[offset];
of += sA[threadIdx.x].force[0];
outputForce[offset] = of;
of = outputForce[offset+1];
of += sA[threadIdx.x].force[1];
outputForce[offset+1] = of;
of = outputForce[offset+2];
of += sA[threadIdx.x].force[2];
outputForce[offset+2] = of;
of = outputTorque[offset];
of += sA[threadIdx.x].torque[0];
outputTorque[offset] = of;
of = outputTorque[offset+1];
of += sA[threadIdx.x].torque[1];
outputTorque[offset+1] = of;
of = outputTorque[offset+2];
of += sA[threadIdx.x].torque[2];
outputTorque[offset+2] = of;
#else
unsigned int offset = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
outputForce[offset] = localParticle.force[0];
outputForce[offset+1] = localParticle.force[1];
outputForce[offset+2] = localParticle.force[2];
outputTorque[offset] = localParticle.torque[0];
outputTorque[offset+1] = localParticle.torque[1];
outputTorque[offset+2] = localParticle.torque[2];
offset = 3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
outputForce[offset] = sA[threadIdx.x].force[0];
outputForce[offset+1] = sA[threadIdx.x].force[1];
outputForce[offset+2] = sA[threadIdx.x].force[2];
outputTorque[offset] = sA[threadIdx.x].torque[0];
outputTorque[offset+1] = sA[threadIdx.x].torque[1];
outputTorque[offset+2] = sA[threadIdx.x].torque[2];
#endif
lasty = y;
}
pos++;
}
cSim.pEnergy[blockIdx.x * blockDim.x + threadIdx.x] += totalEnergy;
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see <http://www.gnu.org/licenses/>. *
* -------------------------------------------------------------------------- */
//-----------------------------------------------------------------------------------------
//-----------------------------------------------------------------------------------------
#include "amoebaCudaKernels.h"
#include "kCalculateAmoebaCudaUtilities.h"
#include <time.h>
//#define AMOEBA_DEBUG
static __constant__ cudaGmxSimulation cSim;
static __constant__ cudaAmoebaGmxSimulation cAmoebaSim;
void SetCalculateAmoebaCudaSASAForcesSim(amoebaGpuContext amoebaGpu)
{
cudaError_t status;
gpuContext gpu = amoebaGpu->gpuContext;
status = cudaMemcpyToSymbol(cSim, &gpu->sim, sizeof(cudaGmxSimulation));
RTERROR(status, "SetCalculateAmoebaCudaSASASim: cudaMemcpyToSymbol: SetSim copy to cSim failed");
status = cudaMemcpyToSymbol(cAmoebaSim, &amoebaGpu->amoebaSim, sizeof(cudaAmoebaGmxSimulation));
RTERROR(status, "SetCalculateAmoebaCudaSASASim: cudaMemcpyToSymbol: SetSim copy to cAmoebaSim failed");
}
void GetCalculateAmoebaCudaSASAForcesSim(amoebaGpuContext amoebaGpu)
{
cudaError_t status;
gpuContext gpu = amoebaGpu->gpuContext;
status = cudaMemcpyFromSymbol(&gpu->sim, cSim, sizeof(cudaGmxSimulation));
RTERROR(status, "GetCalculateAmoebaCudaSASASim: cudaMemcpyFromSymbol: SetSim copy from cSim failed");
status = cudaMemcpyFromSymbol(&amoebaGpu->amoebaSim, cAmoebaSim, sizeof(cudaAmoebaGmxSimulation));
RTERROR(status, "GetCalculateAmoebaCudaSASASim: cudaMemcpyFromSymbol: SetSim copy from cAmoebaSim failed");
}
__device__ void insertionSort( unsigned int length, float* A, int* P)
{
for( unsigned int ii = 1; ii < length; ii++ ){
float value = A[ii];
int index = P[ii];
int jj = ii;
while( jj > 0 && A[jj-1] > value ){
A[jj] = A[jj-1];
P[jj] = P[jj-1];
jj--;
}
A[jj] = value;
P[jj] = index;
}
}
__device__ void insertionSortArc( unsigned int length, float4* arcList )
{
for( unsigned int ii = 1; ii < length; ii++ ){
float4 value = arcList[ii];
int jj = ii;
while( jj > 0 && arcList[jj-1].x > value.x ){
arcList[jj] = arcList[jj-1];
jj--;
}
arcList[jj] = value;
}
}
__device__ int setOmitListOrig( float4 atomCoordI, float4* atomCoord, float* grArray, unsigned int ioListLength, int* ioList,
int* omitList, float delta )
{
float pi_f = 3.1415926535897f;
for( unsigned int jj = 0; jj < ioListLength-1; jj++ ){
int atomJ = ioList[jj];
float txj = atomCoord[atomJ].x - atomCoordI.x;
float tyj = atomCoord[atomJ].y - atomCoordI.y;
float tzj = atomCoord[atomJ].z - atomCoordI.z;
float bj = sqrtf( txj*txj + tyj*tyj + tzj*tzj );
float therj = 0.5f*pi_f - asin(min(1.0f,max(-1.0f,grArray[jj])));
for( unsigned int kk = jj + 1; kk < ioListLength; kk++ ){
int atomK = ioList[kk];
float txk = atomCoord[atomK].x - atomCoordI.x;
float tyk = atomCoord[atomK].y - atomCoordI.y;
float tzk = atomCoord[atomK].z - atomCoordI.z;
float bk = sqrt( txk*txk + tyk*tyk + tzk*tzk );
float therk = 0.5f*pi_f - asin(min(1.0f,max(-1.0f,grArray[kk])));
// check to see if kk circle is intersecting jj circle
// get distance between circle centers and sum of radii
float cc = (txj*txk + tyj*tyk + tzj*tzk)/(bj*bk);
cc = acos(min(1.0f,max(-1.0f,cc)));
float td = therj + therk;
// check to see if circles enclose separate regions
int mask1 = omitList[jj] == 0 && omitList[kk] == 0 && cc < td ? 1 : 0;
int mask2 = (cc + therk) < therj ? 1 : 0;
int mask3 = cc <= delta ? 1 : 0;
omitList[kk] = omitList[kk] || (mask1 && (mask2 || mask3)) ? 1 : 0;
if( (cc > delta) && ( (2.0f*pi_f - cc) <= td) ){
return 1; // done atom
}
}
}
return 0;
}
__device__ int setOmitList( float4 atomCoordI, float4* atomCoord, float* grArray, unsigned int ioListLength, int* ioList,
int* omitList, float delta )
{
float pi_f = 3.1415926535897f;
unsigned int outputIncludeCount = 0;
ioList[outputIncludeCount++] = ioList[0];
for( unsigned int jj = 0; jj < ioListLength-1; jj++ ){
int atomJ = ioList[jj];
float txj = atomCoord[atomJ].x - atomCoordI.x;
float tyj = atomCoord[atomJ].y - atomCoordI.y;
float tzj = atomCoord[atomJ].z - atomCoordI.z;
float bj = txj*txj + tyj*tyj + tzj*tzj;
float therj = 0.5f*pi_f - asin(min(1.0f,max(-1.0f,grArray[jj])));
for( unsigned int kk = jj + 1; kk < ioListLength; kk++ ){
int atomK = ioList[kk];
float txk = atomCoord[atomK].x - atomCoordI.x;
float tyk = atomCoord[atomK].y - atomCoordI.y;
float tzk = atomCoord[atomK].z - atomCoordI.z;
float bk = txk*txk + tyk*tyk + tzk*tzk;
float therk = 0.5f*pi_f - asin(min(1.0f,max(-1.0f,grArray[kk])));
// check to see if kk circle is intersecting jj circle
// get distance between circle centers and sum of radii
float cc = (txj*txk + tyj*tyk + tzj*tzk)/sqrtf(bj*bk);
cc = acos(min(1.0f,max(-1.0f,cc)));
float td = therj + therk;
// check to see if circles enclose separate regions
int mask1 = omitList[jj] == 0 && omitList[kk] == 0 && cc < td ? 1 : 0;
int mask2 = (cc + therk) < therj ? 1 : 0;
int mask3 = cc <= delta ? 1 : 0;
omitList[kk] = omitList[kk] || (mask1 && (mask2 || mask3)) ? 1 : 0;
if( (cc > delta) && ( (2.0f*pi_f - cc) <= td) ){
return 0; // done atom
}
}
if( omitList[jj+1] == 0 ){
ioList[outputIncludeCount++] = atomJ;
}
}
return outputIncludeCount;
}
__device__ int setArcList( float4 atomCoordI, float4* atomCoord, float* grArray, unsigned int ioListLength, int* ioList,
int* omitList, float delta )
{
float pi_f = 3.1415926535897f;
unsigned int outputIncludeCount = 0;
ioList[outputIncludeCount++] = ioList[0];
for( unsigned int jj = 0; jj < ioListLength-1; jj++ ){
int atomJ = ioList[jj];
float txj = atomCoord[atomJ].x - atomCoordI.x;
float tyj = atomCoord[atomJ].y - atomCoordI.y;
float tzj = atomCoord[atomJ].z - atomCoordI.z;
float bj = sqrtf( txj*txj + tyj*tyj + tzj*tzj );
float therj = 0.5f*pi_f - asin(min(1.0f,max(-1.0f,grArray[jj])));
for( unsigned int kk = jj + 1; kk < ioListLength; kk++ ){
int atomK = ioList[kk];
float txk = atomCoord[atomK].x - atomCoordI.x;
float tyk = atomCoord[atomK].y - atomCoordI.y;
float tzk = atomCoord[atomK].z - atomCoordI.z;
float bk = sqrt( txk*txk + tyk*tyk + tzk*tzk );
float therk = 0.5f*pi_f - asin(min(1.0f,max(-1.0f,grArray[kk])));
// check to see if kk circle is intersecting jj circle
// get distance between circle centers and sum of radii
float cc = (txj*txk + tyj*tyk + tzj*tzk)/(bj*bk);
cc = acos(min(1.0f,max(-1.0f,cc)));
float td = therj + therk;
// check to see if circles enclose separate regions
int mask1 = omitList[jj] == 0 && omitList[kk] == 0 && cc < td ? 1 : 0;
int mask2 = (cc + therk) < therj ? 1 : 0;
int mask3 = cc <= delta ? 1 : 0;
omitList[kk] = omitList[kk] || (mask1 && (mask2 || mask3)) ? 1 : 0;
if( (cc > delta) && ( (2.0f*pi_f - cc) <= td) ){
return 0; // done atom
}
}
if( omitList[jj+1] == 0 ){
ioList[outputIncludeCount++] = atomJ;
}
}
return outputIncludeCount;
}
void insertionSortCpu( unsigned int length, float* A, int* P)
{
for( unsigned int ii = 1; ii < length; ii++ ){
float value = A[ii];
int index = P[ii];
int jj = ii;
while( jj > 0 && A[jj-1] > value ){
A[jj] = A[jj-1];
P[jj] = P[jj-1];
jj--;
}
A[jj] = value;
P[jj] = index;
}
}
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(GF1XX_NONBOND_THREADS_PER_BLOCK, 1)
#elif (__CUDA_ARCH__ >= 130)
__launch_bounds__(GT2XX_NONBOND_THREADS_PER_BLOCK, 1)
#else
__launch_bounds__(G8X_NONBOND_THREADS_PER_BLOCK, 1)
#endif
void kCalculateAmoebaCudaSASA( float4* atomCoord, float* rPlusProbe,
int* doneAtom, int* ioListCount,
int* workI1Array, int* workI2Array, int* workI3Array, int* workI4Array,
float* workFArray
#ifdef AMOEBA_DEBUG
,float4* debugArray
#endif
)
{
unsigned int atomI = blockIdx.x*blockDim.x+threadIdx.x;
if( atomI >= cAmoebaSim.numberOfAtoms)
{
return;
}
unsigned int arrayOffset = atomI*cAmoebaSim.maxarc;
float* grArray = workFArray + arrayOffset;
int* ioList = workI1Array + arrayOffset;
int* omitList = workI2Array + arrayOffset;
unsigned int atomJ = 0;
float4 coordI = atomCoord[atomI];
float delta = 1.0e-08f;
float rPlusProbeI = rPlusProbe[atomI];
unsigned int io = 0;
while (atomJ < cAmoebaSim.numberOfAtoms)
{
float4 coordJ = atomCoord[atomJ];
float rPlusProbeJ = rPlusProbe[atomJ];
float rplus = rPlusProbeI + rPlusProbeJ;
float tx = coordJ.x - coordI.x;
float ty = coordJ.y - coordI.y;
float tz = coordJ.z - coordI.z;
float ccsq = tx*tx + ty*ty + tz*tz;
float cc = sqrtf( ccsq );
float rminus = rPlusProbeI - rPlusProbeJ;
int mask1 = (atomI != atomJ) && (fabsf( tx ) < rplus) && (fabsf( ty ) < rplus) && (fabsf( tz ) < rplus) ? 1 : 0;
int mask2 = ((rplus-cc) > delta) ? mask1 : 0;
int mask3 = (cc-fabsf(rminus) > delta) ? mask2 : 0;
if( mask3 ){
ioList[io] = atomJ;
omitList[io] = 0;
grArray[io] = (ccsq+rplus*rminus) / (2.0f*rPlusProbeI*cc);
io++;
#ifdef AMOEBA_DEBUG
debugArray[arrayOffset+io].x = tx;
debugArray[arrayOffset+io].y = ty;
debugArray[arrayOffset+io].z = rplus;
debugArray[arrayOffset+io].w = rPlusProbe[atomJ];
#endif
}
atomJ++;
}
// ioListCount[atomI] = io;
insertionSort( io, grArray, ioList );
// set omit/doneAtom values
io = setOmitList( coordI, atomCoord, grArray, io, ioList, omitList, delta );
doneAtom[atomI] = (io == 0) ? 1 : 0;
}
/**---------------------------------------------------------------------------------------
Compute SASA force/energy
@param amoebaGpu amoebaGpu context
--------------------------------------------------------------------------------------- */
void kCalculateAmoebaSASAForces(amoebaGpuContext amoebaGpu )
{
// ---------------------------------------------------------------------------------------
#ifdef AMOEBA_DEBUG
static const char* methodName = "kCalculateAmoebaSASAForces";
static int timestep = 0;
std::vector<int> fileId;
timestep++;
fileId.resize( 2 );
fileId[0] = timestep;
fileId[1] = 1;
#endif
if( amoebaGpu )return;
// ---------------------------------------------------------------------------------------
gpuContext gpu = amoebaGpu->gpuContext;
int numOfElems = amoebaGpu->paddedNumberOfAtoms;
int numThreads = min( THREADS_PER_BLOCK, numOfElems );
int numBlocks = numOfElems/numThreads;
if( (numOfElems % numThreads) != 0 )numBlocks++;
#ifdef AMOEBA_DEBUG
CUDAStream<float4>* debugArray = NULL;
if( amoebaGpu->log ){
(void) fprintf( amoebaGpu->log, "%s %d numOfElems=%d numThreads=%d numBlocks=%d maxarc=%d\n",
methodName, gpu->natoms, numOfElems, numThreads, numBlocks, amoebaGpu->amoebaSim.maxarc );
(void) fflush( amoebaGpu->log );
int paddedNumberOfAtoms = amoebaGpu->gpuContext->sim.paddedNumberOfAtoms;
debugArray = new CUDAStream<float4>(paddedNumberOfAtoms*paddedNumberOfAtoms, 1, "DebugArray");
memset( debugArray->_pSysStream[0], 0, sizeof( float )*4*paddedNumberOfAtoms*paddedNumberOfAtoms);
debugArray->Upload();
}
#endif
// initialize [ InducedDipole & InducedDipolePolar ] to [ E_Field & E_FieldPolar]*Polarizability
kCalculateAmoebaCudaSASA<<< numBlocks, numThreads >>>(
gpu->psPosq4->_pDevStream[0],
amoebaGpu->psSASA_Radii->_pDevStream[0],
amoebaGpu->psDoneAtom->_pDevStream[0],
amoebaGpu->psIoListCount->_pDevStream[0],
amoebaGpu->psIntWorkArray[0]->_pDevStream[0],
amoebaGpu->psIntWorkArray[1]->_pDevStream[0],
amoebaGpu->psIntWorkArray[2]->_pDevStream[0],
amoebaGpu->psIntWorkArray[3]->_pDevStream[0],
amoebaGpu->psFloatWorkArray->_pDevStream[0]
#ifdef AMOEBA_DEBUG
, debugArray->_pDevStream[0]
#endif
);
LAUNCHERROR("kCalculateAmoebaCudaSASA");
#ifdef AMOEBA_DEBUG
if( amoebaGpu->log ){
debugArray->Download();
amoebaGpu->psIntWorkArray[0]->Download();
amoebaGpu->psIntWorkArray[1]->Download();
amoebaGpu->psIoListCount->Download();
amoebaGpu->psDoneAtom->Download();
amoebaGpu->psFloatWorkArray->Download();
(void) fprintf( amoebaGpu->log, "Post kCalculateAmoebaCudaSASA \n" );
for( unsigned int ii = 0; ii < amoebaGpu->paddedNumberOfAtoms; ii++)
{
int index = ii*amoebaGpu->amoebaSim.maxarc;
unsigned int length = (unsigned int) amoebaGpu->psIoListCount->_pSysStream[0][ii];
int omitSum = 0;
for( unsigned int jj = 0; jj < length; jj++)
{
omitSum += amoebaGpu->psIntWorkArray[1]->_pSysStream[0][index+jj];
}
(void) fprintf( amoebaGpu->log, "%5u %5d omitSum=%5d done=%d\n", ii, length, omitSum, amoebaGpu->psDoneAtom->_pSysStream[0][ii] );
if( ii == 0 ){
for( unsigned int jj = 0; jj < length; jj++)
{
(void) fprintf( amoebaGpu->log, " %5d omt=%5d %14.6e %14.6e %14.6e %14.6e %14.6e \n",
amoebaGpu->psIntWorkArray[0]->_pSysStream[0][index+jj],
amoebaGpu->psIntWorkArray[1]->_pSysStream[0][index+jj],
amoebaGpu->psFloatWorkArray->_pSysStream[0][index+jj],
debugArray->_pSysStream[0][index+jj].x, debugArray->_pSysStream[0][index+jj].y, debugArray->_pSysStream[0][index+jj].z, debugArray->_pSysStream[0][index+jj].w);
}
/*
int* permutation = (int*) malloc( sizeof( int )*length );
float* values = (float*) malloc( sizeof( float )*length );
for( unsigned int jj = 1; jj <= length; jj++)
{
permutation[jj-1] = amoebaGpu->psIntWorkArray[0]->_pSysStream[0][index+jj];
values[jj-1] = amoebaGpu->psFloatWorkArray->_pSysStream[0][index+jj];
}
insertionSortCpu( length, values, permutation);
for( unsigned int jj = 0; jj < length; jj++)
{
(void) fprintf( amoebaGpu->log, "Sortd: %5u %5d %14.6e\n", jj, permutation[jj], values[jj] );
}
free( permutation );
free( values );
*/
}
}
(void) fflush( amoebaGpu->log );
}
#endif
if( 0 )
{
unsigned int iterations = 5000;
time_t start = clock();
for( unsigned int ii = 0; ii < iterations; ii++)
{
kCalculateAmoebaCudaSASA<<< numBlocks, numThreads >>>(
gpu->psPosq4->_pDevStream[0],
amoebaGpu->psSASA_Radii->_pDevStream[0],
amoebaGpu->psDoneAtom->_pDevStream[0],
amoebaGpu->psIoListCount->_pDevStream[0],
amoebaGpu->psIntWorkArray[0]->_pDevStream[0],
amoebaGpu->psIntWorkArray[1]->_pDevStream[0],
amoebaGpu->psIntWorkArray[2]->_pDevStream[0],
amoebaGpu->psIntWorkArray[3]->_pDevStream[0],
amoebaGpu->psFloatWorkArray->_pDevStream[0]
#ifdef AMOEBA_DEBUG
, debugArray->_pDevStream[0]
#endif
);
LAUNCHERROR("kCalculateAmoebaCudaSASA");
}
float sasaTime = (float)(clock()-start)/CLOCKS_PER_SEC;
(void) fprintf( amoebaGpu->log, "Total sasa kernel time=%12.5e time/iter=%12.5e %u %u\n", sasaTime, sasaTime/(float) iterations, (unsigned int) start, (unsigned int) clock());
}
#ifdef AMOEBA_DEBUG
delete debugArray;
#endif
}
......@@ -382,7 +382,11 @@ void kCalculateAmoebaMultipoleForces(amoebaGpuContext amoebaGpu, bool hasAmoebaG
// calculate electrostatic forces
if( amoebaGpu->multipoleNonbondedMethod == AMOEBA_NO_CUTOFF ){
cudaComputeAmoebaElectrostatic( amoebaGpu );
} else {
cudaComputeAmoebaRealSpaceEwald( amoebaGpu );
}
// map torques to forces
......
......@@ -40,6 +40,7 @@
#include <sys/time.h>
#endif
extern int isNanOrInfinity( double number );
using namespace std;
......@@ -1937,6 +1938,7 @@ static void readAmoebaMultipoleCovalent( FILE* filePtr, AmoebaMultipoleForce* mu
Read Amoeba multipole parameters
@param filePtr file pointer to parameter file
@param version version id for file
@param forceMap map of forces to be included
@param tokens array of strings from first line of parameter file for this block of parameters
@param system System reference
......@@ -1949,7 +1951,7 @@ static void readAmoebaMultipoleCovalent( FILE* filePtr, AmoebaMultipoleForce* mu
--------------------------------------------------------------------------------------- */
static int readAmoebaMultipoleParameters( FILE* filePtr, MapStringInt& forceMap, const StringVector& tokens,
static int readAmoebaMultipoleParameters( FILE* filePtr, int version, MapStringInt& forceMap, const StringVector& tokens,
System& system, int useOpenMMUnits, MapStringVectorOfVectors& supplementary,
MapStringString& inputArgumentMap, int* lineCount, FILE* log ){
......@@ -1984,8 +1986,27 @@ static int readAmoebaMultipoleParameters( FILE* filePtr, MapStringInt& forceMap,
// load in parameters
int numberOfMultipoles = atoi( tokens[1].c_str() );
int usePme = 0;
double aewald = 0.0;
double cutoffDistance = 0.0;
// usePme, aewald, cutoffDistance added w/ Version 1
if( version > 0 ){
usePme = atoi( tokens[2].c_str() );
aewald = atof( tokens[3].c_str() );
cutoffDistance = atof( tokens[4].c_str() );
}
if( usePme ){
multipoleForce->setNonbondedMethod( AmoebaMultipoleForce::PME );
} else {
multipoleForce->setNonbondedMethod( AmoebaMultipoleForce::NoCutoff );
}
multipoleForce->setAEwald( aewald );
multipoleForce->setCutoffDistance( cutoffDistance );
if( log ){
(void) fprintf( log, "%s number of MultipoleParameter terms=%d\n", methodName.c_str(), numberOfMultipoles );
(void) fprintf( log, "%s number of MultipoleParameter terms=%d usePme=%d aewald=%15.7e cutoffDistance=%12.4f\n",
methodName.c_str(), numberOfMultipoles, usePme, aewald, cutoffDistance );
(void) fflush( log );
}
for( int ii = 0; ii < numberOfMultipoles; ii++ ){
......@@ -3380,7 +3401,7 @@ Integrator* readAmoebaParameterFile( const std::string& inputParameterFile, MapS
}
int lineCount = 0;
std::string version = "0.1";
int version = 0;
int isNotEof = 1;
Integrator* returnIntegrator = NULL;
......@@ -3404,9 +3425,9 @@ Integrator* readAmoebaParameterFile( const std::string& inputParameterFile, MapS
if( field == "Version" ){
if( tokens.size() > 1 ){
version = tokens[1];
version = atoi( tokens[1].c_str() );
if( log ){
(void) fprintf( log, "Version=<%s> at line=%d\n", version.c_str(), lineCount );
(void) fprintf( log, "Version=%d at line=%d\n", version, lineCount );
}
}
} else if( field == "Masses" ){
......@@ -3522,13 +3543,28 @@ Integrator* readAmoebaParameterFile( const std::string& inputParameterFile, MapS
// AmoebaMultipole
} else if( field == "AmoebaMultipoleParameters" ){
readAmoebaMultipoleParameters( filePtr, forceMap, tokens, system, useOpenMMUnits, supplementary, inputArgumentMap, &lineCount, log );
} else if( field == "AmoebaMultipoleForce" ){
readAmoebaMultipoleParameters( filePtr, version, forceMap, tokens, system, useOpenMMUnits, supplementary, inputArgumentMap, &lineCount, log );
} else if( field == "AmoebaMultipoleForce" || field == "AmoebaPmeForce" ){
readVec3( filePtr, tokens, forces[AMOEBA_MULTIPOLE_FORCE], &lineCount, field, log );
} else if( field == "AmoebaMultipoleEnergy" ){
} else if( field == "AmoebaMultipoleEnergy" || field == "AmoebaPmeEnergy" ){
if( tokens.size() > 1 ){
potentialEnergy[AMOEBA_MULTIPOLE_FORCE] = atof( tokens[1].c_str() );
}
} else if( field == "AmoebaRealPmeForce" ||
field == "AmoebaKSpacePmeForce" ||
field == "AmoebaSelfPmeForce" ){
std::vector< std::vector<double> > vectorOfDoubleVectors;
readVectorOfDoubleVectors( filePtr, tokens, vectorOfDoubleVectors, &lineCount, field, log );
supplementary[field] = vectorOfDoubleVectors;
} else if( field == "AmoebaRealPmeEnergy" ||
field == "AmoebaKSpacePmeEnergy" ||
field == "AmoebaSelfPmeEnergy" ){
double value = atof( tokens[1].c_str() );
std::vector< std::vector<double> > vectorOfDoubleVectors;
std::vector<double> doubleVectors;
doubleVectors.push_back( value );
vectorOfDoubleVectors.push_back( doubleVectors );
supplementary[field] = vectorOfDoubleVectors;
// Amoeba GK
......@@ -3677,7 +3713,6 @@ void initializeForceMap( MapStringInt& forceMap, int initialValue ){
forceMap[AMOEBA_GK_FORCE] = initialValue;
forceMap[AMOEBA_VDW_FORCE] = initialValue;
forceMap[AMOEBA_WCA_DISPERSION_FORCE] = initialValue;
forceMap[AMOEBA_SASA_FORCE] = 0;
return;
......@@ -5830,8 +5865,7 @@ Double "simulationTimeBetweenReportsRatio" simulationTimeBetweenReportsRat
key == AMOEBA_MULTIPOLE_FORCE ||
key == AMOEBA_GK_FORCE ||
key == AMOEBA_VDW_FORCE ||
key == AMOEBA_WCA_DISPERSION_FORCE ||
key == AMOEBA_SASA_FORCE ){
key == AMOEBA_WCA_DISPERSION_FORCE ){
forceMap[key] = atoi( value.c_str() );
} else {
inputArgumentMap[key] = value;
......
......@@ -77,7 +77,6 @@ static std::string AMOEBA_GK_FORCE = "AmoebaG
static std::string AMOEBA_GK_CAVITY_FORCE = "AmoebaGkAndCavity";
static std::string AMOEBA_VDW_FORCE = "AmoebaVdw";
static std::string AMOEBA_WCA_DISPERSION_FORCE = "AmoebaWcaDispersion";
static std::string AMOEBA_SASA_FORCE = "AmoebaSASA";
static std::string ALL_FORCES = "AllForces";
static std::string AMOEBA_MULTIPOLE_ROTATION_MATRICES = "AmoebaMultipoleRotationMatrices";
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment