Commit d0426ba9 authored by Peter Eastman's avatar Peter Eastman
Browse files

Continuing converting AMOEBA to the new CUDA platform: starting on AmoebaMultipoleForce

parent a99a2d0e
......@@ -78,3 +78,14 @@ void CudaArray::download(void* data, bool blocking) const {
throw OpenMMException(str.str());
}
}
void CudaArray::copyTo(CudaArray& dest) const {
if (dest.getSize() != size || dest.getElementSize() != elementSize)
throw OpenMMException("Error copying array "+name+" to "+dest.getName()+": The destination array does not match the size of the array");
CUresult result = cuMemcpyDtoDAsync(dest.getDevicePointer(), pointer, size*elementSize, 0);
if (result != CUDA_SUCCESS) {
std::stringstream str;
str<<"Error copying array "<<name<<" to "<<dest.getName()<<": "<<CudaContext::getErrorString(result)<<" ("<<result<<")";
throw OpenMMException(str.str());
}
}
......@@ -127,6 +127,12 @@ public:
* the destination array must be in page-locked memory.
*/
void download(void* data, bool blocking = true) const;
/**
* Copy the values in the device memory to a second array.
*
* @param dest the destination array to copy to
*/
void copyTo(CudaArray& dest) const;
private:
CudaContext& context;
CUdeviceptr pointer;
......
......@@ -468,6 +468,10 @@ void CudaContext::clearBuffer(CUdeviceptr memory, int size) {
executeKernel(clearBufferKernel, args, size, 128);
}
void CudaContext::addAutoclearBuffer(CudaArray& array) {
addAutoclearBuffer(array.getDevicePointer(), array.getSize()*array.getElementSize());
}
void CudaContext::addAutoclearBuffer(CUdeviceptr memory, int size) {
autoclearBuffers.push_back(memory);
autoclearBufferSizes.push_back(size/4);
......
......@@ -231,6 +231,10 @@ public:
* @param size the size of the buffer in bytes
*/
void clearBuffer(CUdeviceptr memory, int size);
/**
* Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
*/
void addAutoclearBuffer(CudaArray& array);
/**
* Register a buffer that should be automatically cleared (all elements set to 0) at the start of each force or energy computation.
*
......
......@@ -1454,7 +1454,7 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
int elementSize = (cu.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
pmeGrid = new CudaArray(cu, gridSizeX*gridSizeY*gridSizeZ, 2*elementSize, "pmeGrid");
cu.addAutoclearBuffer(pmeGrid->getDevicePointer(), pmeGrid->getSize()*2*elementSize);
cu.addAutoclearBuffer(*pmeGrid);
pmeBsplineModuliX = new CudaArray(cu, gridSizeX, elementSize, "pmeBsplineModuliX");
pmeBsplineModuliY = new CudaArray(cu, gridSizeY, elementSize, "pmeBsplineModuliY");
pmeBsplineModuliZ = new CudaArray(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
......@@ -1928,8 +1928,8 @@ void CudaCalcGBSAOBCForceKernel::initialize(const System& system, const GBSAOBCF
}
bornSum = CudaArray::create<long long>(cu, cu.getPaddedNumAtoms(), "bornSum");
bornForce = CudaArray::create<long long>(cu, cu.getPaddedNumAtoms(), "bornForce");
cu.addAutoclearBuffer(bornSum->getDevicePointer(), bornSum->getSize()*sizeof(long long));
cu.addAutoclearBuffer(bornForce->getDevicePointer(), bornForce->getSize()*sizeof(long long));
cu.addAutoclearBuffer(*bornSum);
cu.addAutoclearBuffer(*bornForce);
CudaArray& posq = cu.getPosq();
float4* posqf = (float4*) cu.getPinnedBuffer();
double4* posqd = (double4*) cu.getPinnedBuffer();
......@@ -2757,7 +2757,7 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
cu.getNonbondedUtilities().addArgument(arguments[i]);
}
cu.addForce(new CudaCustomGBForceInfo(force));
cu.addAutoclearBuffer(longEnergyDerivs->getDevicePointer(), sizeof(long long)*longEnergyDerivs->getSize());
cu.addAutoclearBuffer(*longEnergyDerivs);
}
double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
......@@ -2766,7 +2766,7 @@ double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeFo
hasInitializedKernels = true;
maxTiles = (nb.getUseCutoff() ? nb.getInteractingTiles().getSize() : cu.getNumAtomBlocks()*(cu.getNumAtomBlocks()+1)/2);
valueBuffers = CudaArray::create<long long>(cu, cu.getPaddedNumAtoms(), "customGBValueBuffers");
cu.addAutoclearBuffer(valueBuffers->getDevicePointer(), sizeof(long long)*valueBuffers->getSize());
cu.addAutoclearBuffer(*valueBuffers);
cu.clearBuffer(valueBuffers->getDevicePointer(), sizeof(long long)*valueBuffers->getSize());
pairValueArgs.push_back(&cu.getPosq().getDevicePointer());
pairValueArgs.push_back(&cu.getNonbondedUtilities().getExclusions().getDevicePointer());
......
......@@ -177,9 +177,7 @@ void CudaNonbondedUtilities::initialize(const System& system) {
// Record the exclusion data.
exclusions = CudaArray::create<unsigned int>(context, tilesWithExclusions.size()*CudaContext::TileSize, "exclusions");
vector<unsigned int> exclusionVec(exclusions->getSize());
for (int i = 0; i < exclusions->getSize(); ++i)
exclusionVec[i] = 0xFFFFFFFF;
vector<unsigned int> exclusionVec(exclusions->getSize(), 0xFFFFFFFF);
for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) {
int x = atom1/CudaContext::TileSize;
int offset1 = atom1-x*CudaContext::TileSize;
......@@ -249,6 +247,7 @@ void CudaNonbondedUtilities::initialize(const System& system) {
// Create kernels.
if (kernelSource.size() > 0)
forceKernel = createInteractionKernel(kernelSource, parameters, arguments, true, true);
if (useCutoff) {
map<string, string> defines;
......@@ -291,6 +290,8 @@ void CudaNonbondedUtilities::initialize(const System& system) {
}
int CudaNonbondedUtilities::findExclusionIndex(int x, int y, const vector<unsigned int>& exclusionIndices, const vector<unsigned int>& exclusionRowIndices) {
if (x < y)
throw OpenMMException("Internal error: called findExclusionIndex with x<y");
int start = exclusionRowIndices[x];
int end = exclusionRowIndices[x+1];
for (int i = start; i < end; i++)
......@@ -317,7 +318,7 @@ void CudaNonbondedUtilities::prepareInteractions() {
}
void CudaNonbondedUtilities::computeInteractions() {
if (cutoff != -1.0)
if (kernelSource.size() > 0)
context.executeKernel(forceKernel, &forceArgs[0], numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
}
......
......@@ -232,8 +232,20 @@ public:
* @param isSymmetric specifies whether the interaction is symmetric
*/
CUfunction createInteractionKernel(const std::string& source, std::vector<ParameterInfo>& params, std::vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric);
private:
/**
* This is a utility routine for locating data in the exclusions array. It takes the (x,y) indices of a tile,
* and returns the location in the array where the data for that tile begins.
*
* This routine requires that x >= y. If not, it will throw an exception.
*
* @param x the x index of the tile
* @param y the y index of the tile
* @param exclusionIndices the content of the exclusionIndices array
* @param exclusionRowIndices the content of the exclusionRowIndices array
* @return the index in the exclusions array at which the data for that tile begins
*/
static int findExclusionIndex(int x, int y, const std::vector<unsigned int>& exclusionIndices, const std::vector<unsigned int>& exclusionRowIndices);
private:
CudaContext& context;
CUfunction forceKernel;
CUfunction findBlockBoundsKernel;
......
......@@ -97,9 +97,9 @@ KernelImpl* AmoebaCudaKernelFactory::createKernelImpl(std::string name, const Pl
if (name == CalcAmoebaTorsionTorsionForceKernel::Name())
return new CudaCalcAmoebaTorsionTorsionForceKernel(name, platform, cu, context.getSystem());
// if (name == CalcAmoebaMultipoleForceKernel::Name())
// return new CudaCalcAmoebaMultipoleForceKernel(name, platform, cu, context.getSystem());
//
if (name == CalcAmoebaMultipoleForceKernel::Name())
return new CudaCalcAmoebaMultipoleForceKernel(name, platform, cu, context.getSystem());
// if (name == CalcAmoebaGeneralizedKirkwoodForceKernel::Name())
// return new CudaCalcAmoebaGeneralizedKirkwoodForceKernel(name, platform, cu, context.getSystem());
......
......@@ -30,8 +30,10 @@
#include "openmm/amoebaKernels.h"
#include "openmm/kernels.h"
#include "openmm/System.h"
#include "CudaContext.h"
#include "CudaArray.h"
#include "CudaContext.h"
#include "CudaSort.h"
#include <cufft.h>
namespace OpenMM {
......@@ -371,9 +373,46 @@ public:
private:
class ForceInfo;
void initializeScaleFactors();
int numMultipoles;
bool hasInitializedScaleFactors;
CudaContext& cu;
System& system;
std::vector<int3> covalentFlagValues;
std::vector<int2> polarizationFlagValues;
CudaArray* multipoleParticles;
CudaArray* torqueBufferIndices;
CudaArray* molecularDipoles;
CudaArray* molecularQuadrupoles;
CudaArray* labFrameDipoles;
CudaArray* labFrameQuadrupoles;
CudaArray* field;
CudaArray* fieldPolar;
CudaArray* dampingAndThole;
CudaArray* inducedDipole;
CudaArray* inducedDipolePolar;
CudaArray* currentEpsilon;
CudaArray* polarizability;
CudaArray* covalentFlags;
CudaArray* polarizationGroupFlags;
CudaArray* pmeGrid;
CudaArray* pmeBsplineModuliX;
CudaArray* pmeBsplineModuliY;
CudaArray* pmeBsplineModuliZ;
CudaArray* pmeTheta1;
CudaArray* pmeTheta2;
CudaArray* pmeIgrid;
CudaArray* pmePhi;
CudaArray* pmePhid;
CudaArray* pmePhip;
CudaArray* pmePhidp;
CudaArray* pmeBsplineTheta;
CudaArray* pmeBsplineDTheta;
CudaArray* pmeAtomRange;
CudaArray* pmeAtomGridIndex;
CudaSort* sort;
cufftHandle fft;
CUfunction computeMomentsKernel, computeFixedFieldKernel;
};
/**
......
#define TILE_SIZE 32
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
typedef struct {
real4 posq;
real3 field, fieldPolar, dipole;
real quadrupoleXX, quadrupoleXY, quadrupoleXZ;
real quadrupoleYY, quadrupoleYZ, quadrupoleZZ;
float thole, damp;
} AtomData;
inline __device__ void loadAtomData(AtomData& data, int atom, const real4* __restrict__ posq, const real* __restrict__ labFrameDipole, const real* __restrict__ labFrameQuadrupole, const float2* __restrict__ dampingAndThole) {
data.posq = posq[atom];
data.dipole.x = labFrameDipole[atom*3];
data.dipole.y = labFrameDipole[atom*3+1];
data.dipole.z = labFrameDipole[atom*3+2];
data.quadrupoleXX = labFrameQuadrupole[atom*9];
data.quadrupoleXY = labFrameQuadrupole[atom*9+1];
data.quadrupoleXZ = labFrameQuadrupole[atom*9+2];
data.quadrupoleYY = labFrameQuadrupole[atom*9+4];
data.quadrupoleYZ = labFrameQuadrupole[atom*9+5];
data.quadrupoleZZ = labFrameQuadrupole[atom*9+8];
float2 temp = dampingAndThole[atom];
data.damp = temp.x;
data.thole = temp.y;
}
__device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real3 deltaR, real3& field1, real3& field2) {
real rI = RSQRT(dot(deltaR, deltaR));
real r = RECIP(rI);
real r2I = rI*rI;
real rr3 = rI*r2I;
real rr5 = 3*rr3*r2I;
real rr7 = 5*rr5*r2I;
// get scaling factors, if needed
float damp = atom1.damp*atom2.damp;
real dampExp;
if (damp != 0 && r < SCALING_DISTANCE_CUTOFF) {
// get scaling factors
real ratio = r/damp;
float pGamma = atom2.thole > atom1.thole ? atom1.thole : atom2.thole;
damp = ratio*ratio*ratio*pGamma;
dampExp = EXP(-damp);
}
else
dampExp = 0;
rr3 *= 1 - dampExp;
rr5 *= 1 - (1+damp)*dampExp;
rr7 *= 1 - (1+damp+(0.6f*damp*damp))*dampExp;
real rr5_2 = 2*rr5;
real3 qDotDelta;
qDotDelta.x = deltaR.x*atom2.quadrupoleXX + deltaR.y*atom2.quadrupoleXY + deltaR.z*atom2.quadrupoleXZ;
qDotDelta.y = deltaR.x*atom2.quadrupoleXY + deltaR.y*atom2.quadrupoleYY + deltaR.z*atom2.quadrupoleYZ;
qDotDelta.z = deltaR.x*atom2.quadrupoleXZ + deltaR.y*atom2.quadrupoleYZ + deltaR.z*atom2.quadrupoleZZ;
real dotdd = dot(deltaR, atom2.dipole);
real dotqd = dot(deltaR, qDotDelta);
real factor = -rr3*atom2.posq.w + rr5*dotdd - rr7*dotqd;
field1 = deltaR*factor - rr3*atom2.dipole + rr5_2*qDotDelta;
qDotDelta.x = deltaR.x*atom1.quadrupoleXX + deltaR.y*atom1.quadrupoleXY + deltaR.z*atom1.quadrupoleXZ;
qDotDelta.y = deltaR.x*atom1.quadrupoleXY + deltaR.y*atom1.quadrupoleYY + deltaR.z*atom1.quadrupoleYZ;
qDotDelta.z = deltaR.x*atom1.quadrupoleXZ + deltaR.y*atom1.quadrupoleYZ + deltaR.z*atom1.quadrupoleZZ;
dotdd = dot(deltaR, atom1.dipole);
dotqd = dot(deltaR, qDotDelta);
factor = rr3*atom1.posq.w + rr5*dotdd + rr7*dotqd;
field2 = deltaR*factor - rr3*atom1.dipole - rr5_2*qDotDelta;
}
__device__ real computeDScaleFactor(unsigned int polarizationGroup) {
return (polarizationGroup & 1 ? 0 : 1);
}
__device__ float computeDScaleFactor(uint2 covalent) {
// int f1 = (value == 0 || value == 1 ? 1 : 0);
// int f2 = (value == 0 || value == 2 ? 1 : 0);
// 0 = 12 or 13: x and y: 0
// 1 = 14: x: 0.4
// 2 = 15: y: 0.8
bool x = (covalent.x & 1);
bool y = (covalent.y & 1);
return (x ? (y ? 0.0f : 0.4f) : (y ? 0.8f : 1.0f));
}
__device__ float computePScaleFactor(uint2 covalent, unsigned int polarizationGroup) {
bool x = (covalent.x & 1);
bool y = (covalent.y & 1);
bool p = (polarizationGroup & 1);
return (x && y ? 0.0f : (x && p ? 0.5f : 1.0f));
}
/**
* Compute nonbonded interactions.
*/
extern "C" __global__ void computeFixedField(
unsigned long long* __restrict__ fieldBuffers, unsigned long long* __restrict__ fieldPolarBuffers, const real4* __restrict__ posq,
const unsigned int* __restrict__ exclusions, const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
const uint2* __restrict__ covalentFlags, const unsigned int* __restrict__ polarizationGroupFlags, unsigned int startTileIndex, unsigned int numTileIndices,
#ifdef USE_CUTOFF
const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags,
#endif
const real* __restrict__ labFrameDipole, const real* __restrict__ labFrameQuadrupole, const float2* __restrict__ dampingAndThole) {
unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
#ifdef USE_CUTOFF
const unsigned int numTiles = interactionCount[0];
unsigned int pos = (numTiles > maxTiles ? startTileIndex+warp*numTileIndices/totalWarps : warp*numTiles/totalWarps);
unsigned int end = (numTiles > maxTiles ? startTileIndex+(warp+1)*numTileIndices/totalWarps : (warp+1)*numTiles/totalWarps);
#else
const unsigned int numTiles = numTileIndices;
unsigned int pos = startTileIndex+warp*numTiles/totalWarps;
unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps;
#endif
__shared__ AtomData localData[THREAD_BLOCK_SIZE];
__shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
__shared__ int exclusionIndex[WARPS_PER_GROUP];
#ifndef ENABLE_SHUFFLE
__shared__ real tempBuffer[3*THREAD_BLOCK_SIZE];
#endif
do {
// Extract the coordinates of this tile
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx;
const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
unsigned int x, y;
AtomData data;
data.field = make_real3(0);
data.fieldPolar = make_real3(0);
if (pos < end) {
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
y = tileIndices.y;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
unsigned int atom1 = x*TILE_SIZE + tgx;
loadAtomData(data, atom1, posq, labFrameDipole, labFrameQuadrupole, dampingAndThole);
// Locate the exclusion data for this tile.
if (tgx < 2)
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
if (pos >= end)
; // This warp is done.
else if (x == y) {
// This tile is on the diagonal.
const unsigned int localAtomIndex = threadIdx.x;
localData[localAtomIndex].posq = data.posq;
localData[localAtomIndex].dipole = data.dipole;
localData[localAtomIndex].quadrupoleXX = data.quadrupoleXX;
localData[localAtomIndex].quadrupoleXY = data.quadrupoleXY;
localData[localAtomIndex].quadrupoleXZ = data.quadrupoleXZ;
localData[localAtomIndex].quadrupoleYY = data.quadrupoleYY;
localData[localAtomIndex].quadrupoleYZ = data.quadrupoleYZ;
localData[localAtomIndex].quadrupoleZZ = data.quadrupoleZZ;
localData[localAtomIndex].thole = data.thole; // IS THIS CORRECT?
localData[localAtomIndex].damp = data.damp; // IS THIS CORRECT?
unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
uint2 covalent = covalentFlags[exclusionIndex[localGroupIndex]+tgx];
unsigned int polarizationGroup = polarizationGroupFlags[exclusionIndex[localGroupIndex]+tgx];
for (unsigned int j = 0; j < TILE_SIZE; j++) {
bool isExcluded = !(excl & 0x1);
int atom2 = tbx+j;
real3 delta = make_real3(localData[atom2].posq.x-data.posq.x, localData[atom2].posq.y-data.posq.y, localData[atom2].posq.z-data.posq.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real3 field1;
real3 field2;
computeOneInteraction(data, localData[atom2], delta, field1, field2);
if (!isExcluded) {
float d = computeDScaleFactor(covalent);
data.field += d*field1;
float p = computePScaleFactor(covalent, polarizationGroup);
data.fieldPolar += p*field1;
}
excl >>= 1;
covalent.x >>= 1;
covalent.y >>= 1;
polarizationGroup >>= 1;
}
}
else {
// This is an off-diagonal tile.
const unsigned int localAtomIndex = threadIdx.x;
unsigned int j = y*TILE_SIZE + tgx;
loadAtomData(localData[localAtomIndex], j, posq, labFrameDipole, labFrameQuadrupole, dampingAndThole);
localData[localAtomIndex].field = make_real3(0);
localData[localAtomIndex].fieldPolar = make_real3(0);
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (!hasExclusions && flags != 0xFFFFFFFF) {
if (flags == 0) {
// No interactions in this tile.
}
else {
// Compute only a subset of the interactions in this tile.
for (j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
bool isExcluded = false;
int atom2 = tbx+j;
int bufferIndex = 3*threadIdx.x;
real3 dEdR1 = make_real3(0);
real3 dEdR2 = make_real3(0);
real3 delta = make_real3(localData[atom2].posq.x-data.posq.x, localData[atom2].posq.y-data.posq.y, localData[atom2].posq.z-data.posq.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
COMPUTE_INTERACTION
#ifdef USE_CUTOFF
}
#endif
#ifdef ENABLE_SHUFFLE
force.x -= dEdR1.x;
force.y -= dEdR1.y;
force.z -= dEdR1.z;
for (int i = 16; i >= 1; i /= 2) {
dEdR2.x += __shfl_xor(dEdR2.x, i, 32);
dEdR2.y += __shfl_xor(dEdR2.y, i, 32);
dEdR2.z += __shfl_xor(dEdR2.z, i, 32);
}
if (tgx == 0) {
localData[tbx+j].fx += dEdR2.x;
localData[tbx+j].fy += dEdR2.y;
localData[tbx+j].fz += dEdR2.z;
}
#else
force.x -= dEdR1.x;
force.y -= dEdR1.y;
force.z -= dEdR1.z;
tempBuffer[bufferIndex] = dEdR2.x;
tempBuffer[bufferIndex+1] = dEdR2.y;
tempBuffer[bufferIndex+2] = dEdR2.z;
// Sum the forces on atom2.
if (tgx % 4 == 0) {
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11];
}
if (tgx == 0) {
localData[tbx+j].fx += tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
localData[tbx+j].fy += tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
localData[tbx+j].fz += tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
}
#endif
}
}
}
}
else
#endif
{
// Compute the full set of interactions in this tile.
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
uint2 covalent = (hasExclusions ? covalentFlags[exclusionIndex[localGroupIndex]+tgx] : make_uint2(0, 0));
unsigned int polarizationGroup = (hasExclusions ? polarizationGroupFlags[exclusionIndex[localGroupIndex]+tgx] : 0);
excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
covalent.x = (covalent.x >> tgx) | (covalent.x << (TILE_SIZE - tgx));
covalent.y = (covalent.y >> tgx) | (covalent.y << (TILE_SIZE - tgx));
polarizationGroup = (polarizationGroup >> tgx) | (polarizationGroup << (TILE_SIZE - tgx));
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
bool isExcluded = !(excl & 0x1);
int atom2 = tbx+tj;
real3 delta = make_real3(localData[atom2].posq.x-data.posq.x, localData[atom2].posq.y-data.posq.y, localData[atom2].posq.z-data.posq.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real3 field1;
real3 field2;
computeOneInteraction(data, localData[atom2], delta, field1, field2);
if (!isExcluded) {
float d = computeDScaleFactor(covalent);
data.field += d*field1;
localData[atom2].field += d*field2;
float p = computePScaleFactor(covalent, polarizationGroup);
data.fieldPolar += p*field1;
localData[atom2].fieldPolar += p*field2;
}
excl >>= 1;
covalent.x >>= 1;
covalent.y >>= 1;
polarizationGroup >>= 1;
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
}
}
// Write results.
if (pos < end) {
const unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&fieldBuffers[offset], static_cast<unsigned long long>((long long) (data.field.x*0xFFFFFFFF)));
atomicAdd(&fieldBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.y*0xFFFFFFFF)));
atomicAdd(&fieldBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.z*0xFFFFFFFF)));
atomicAdd(&fieldPolarBuffers[offset], static_cast<unsigned long long>((long long) (data.fieldPolar.x*0xFFFFFFFF)));
atomicAdd(&fieldPolarBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolar.y*0xFFFFFFFF)));
atomicAdd(&fieldPolarBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolar.z*0xFFFFFFFF)));
}
if (pos < end && x != y) {
const unsigned int offset = y*TILE_SIZE + tgx;
atomicAdd(&fieldBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.x*0xFFFFFFFF)));
atomicAdd(&fieldBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.y*0xFFFFFFFF)));
atomicAdd(&fieldBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.z*0xFFFFFFFF)));
atomicAdd(&fieldPolarBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.x*0xFFFFFFFF)));
atomicAdd(&fieldPolarBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.y*0xFFFFFFFF)));
atomicAdd(&fieldPolarBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.z*0xFFFFFFFF)));
}
pos++;
} while (pos < end);
}
extern "C" __global__ void computeLabFrameMoments(real4* __restrict__ posq, int4* __restrict__ multipoleParticles, float* __restrict__ molecularDipoles,
float* __restrict__ molecularQuadrupoles, real* __restrict__ labFrameDipoles, real* __restrict__ labFrameQuadrupoles) {
// get coordinates of this atom and the z & x axis atoms
// compute the vector between the atoms and 1/sqrt(d2), d2 is distance between
// this atom and the axis atom
// this atom is referred to as the k-atom in notes below
// code common to ZThenX and Bisector
for (int particleIndex = blockIdx.x*blockDim.x+threadIdx.x; particleIndex < NUM_ATOMS; particleIndex += gridDim.x*blockDim.x) {
int4 particles = multipoleParticles[particleIndex];
if (particles.x >= 0 && particles.z >= 0) {
real4 thisParticlePos = posq[particleIndex];
real4 posZ = posq[particles.z];
real3 vectorZ = make_real3(posZ.x-thisParticlePos.x, posZ.y-thisParticlePos.y, posZ.z-thisParticlePos.z);
real4 posX = posq[particles.x];
real3 vectorX = make_real3(posX.x-thisParticlePos.x, posX.y-thisParticlePos.y, posX.z-thisParticlePos.z);
int axisType = particles.w;
/*
z-only
(1) norm z
(2) select random x
(3) x = x - (x.z)z
(4) norm x
z-then-x
(1) norm z
(2) norm x (not needed)
(3) x = x - (x.z)z
(4) norm x
bisector
(1) norm z
(2) norm x
(3) z = x + z
(4) norm z
(5) x = x - (x.z)z
(6) norm x
z-bisect
(1) norm z
(2) norm x
(3) norm y
(3) x = x + y
(4) norm x
(5) x = x - (x.z)z
(6) norm x
3-fold
(1) norm z
(2) norm x
(3) norm y
(4) z = x + y + z
(5) norm z
(6) x = x - (x.z)z
(7) norm x
*/
// branch based on axis type
vectorZ = normalize(vectorZ);
if (axisType == 1) {
// bisector
vectorX = normalize(vectorX);
vectorZ += vectorX;
vectorZ = normalize(vectorZ);
}
else if (axisType == 2 || axisType == 3) {
// z-bisect
if (particles.y >= 0 && particles.y < NUM_ATOMS) {
real4 posY = posq[particles.y];
real3 vectorY = make_real3(posY.x-thisParticlePos.x, posY.y-thisParticlePos.y, posY.z-thisParticlePos.z);
vectorY = normalize(vectorY);
vectorX = normalize(vectorX);
if (axisType == 2) {
vectorX += vectorY;
vectorX = normalize(vectorX);
}
else {
// 3-fold
vectorZ += vectorX + vectorY;
vectorZ = normalize(vectorZ);
}
}
}
else if (axisType >= 4)
vectorX = make_real3((real) 0.1f);
// x = x - (x.z)z
vectorX -= dot(vectorZ, vectorX)*vectorZ;
vectorX = normalize(vectorX);
real3 vectorY = cross(vectorZ, vectorX);
// use identity rotation matrix for unrecognized axis types
if (axisType < 0 || axisType > 4) {
vectorX.x = 1;
vectorX.y = 0;
vectorX.z = 0;
vectorY.x = 0;
vectorY.y = 1;
vectorY.z = 0;
vectorZ.x = 0;
vectorZ.y = 0;
vectorZ.z = 1;
}
// Check the chirality and see whether it needs to be reversed
bool reverse = false;
if (axisType != 0 && particles.x >= 0 && particles.y >=0 && particles.z >= 0) {
real4 posY = posq[particles.y];
real delta[4][3];
delta[0][0] = thisParticlePos.x - posY.x;
delta[0][1] = thisParticlePos.y - posY.y;
delta[0][2] = thisParticlePos.z - posY.z;
delta[1][0] = posZ.x - posY.x;
delta[1][1] = posZ.y - posY.y;
delta[1][2] = posZ.z - posY.z;
delta[2][0] = posX.x - posY.x;
delta[2][1] = posX.y - posY.y;
delta[2][2] = posX.z - posY.z;
delta[3][0] = delta[1][1]*delta[2][2] - delta[1][2]*delta[2][1];
delta[3][1] = delta[2][1]*delta[0][2] - delta[2][2]*delta[0][1];
delta[3][2] = delta[0][1]*delta[1][2] - delta[0][2]*delta[1][1];
real volume = delta[3][0]*delta[0][0] + delta[3][1]*delta[1][0] + delta[3][2]*delta[2][0];
reverse = (volume < 0);
}
// Transform the dipole
unsigned int offset = 3*particleIndex;
real molDipole[3];
molDipole[0] = molecularDipoles[offset];
molDipole[1] = molecularDipoles[offset+1];
molDipole[2] = molecularDipoles[offset+2];
if (reverse)
molDipole[1] *= -1;
labFrameDipoles[offset] = molDipole[0]*vectorX.x + molDipole[1]*vectorY.x + molDipole[2]*vectorZ.x;
labFrameDipoles[offset+1] = molDipole[0]*vectorX.y + molDipole[1]*vectorY.y + molDipole[2]*vectorZ.y;
labFrameDipoles[offset+2] = molDipole[0]*vectorX.z + molDipole[1]*vectorY.z + molDipole[2]*vectorZ.z;
// ---------------------------------------------------------------------------------------
// Transform the quadrupole
real mPole[3][3];
offset = 9*particleIndex;
mPole[0][0] = molecularQuadrupoles[offset];
mPole[0][1] = molecularQuadrupoles[offset+1];
mPole[0][2] = molecularQuadrupoles[offset+2];
mPole[1][0] = molecularQuadrupoles[offset+3];
mPole[1][1] = molecularQuadrupoles[offset+4];
mPole[1][2] = molecularQuadrupoles[offset+5];
mPole[2][0] = molecularQuadrupoles[offset+6];
mPole[2][1] = molecularQuadrupoles[offset+7];
mPole[2][2] = molecularQuadrupoles[offset+8];
if (reverse) {
mPole[0][1] *= -1;
mPole[1][0] *= -1;
mPole[1][2] *= -1;
mPole[2][1] *= -1;
}
labFrameQuadrupoles[offset+8] = vectorX.z*(vectorX.z*mPole[0][0] + vectorY.z*mPole[0][1] + vectorZ.z*mPole[0][2]);
labFrameQuadrupoles[offset+8] += vectorY.z*(vectorX.z*mPole[1][0] + vectorY.z*mPole[1][1] + vectorZ.z*mPole[1][2]);
labFrameQuadrupoles[offset+8] += vectorZ.z*(vectorX.z*mPole[2][0] + vectorY.z*mPole[2][1] + vectorZ.z*mPole[2][2]);
labFrameQuadrupoles[offset+4] = vectorX.y*(vectorX.y*mPole[0][0] + vectorY.y*mPole[0][1] + vectorZ.y*mPole[0][2]);
labFrameQuadrupoles[offset+4] += vectorY.y*(vectorX.y*mPole[1][0] + vectorY.y*mPole[1][1] + vectorZ.y*mPole[1][2]);
labFrameQuadrupoles[offset+4] += vectorZ.y*(vectorX.y*mPole[2][0] + vectorY.y*mPole[2][1] + vectorZ.y*mPole[2][2]);
labFrameQuadrupoles[offset+5] = vectorX.y*(vectorX.z*mPole[0][0] + vectorY.z*mPole[0][1] + vectorZ.z*mPole[0][2]);
labFrameQuadrupoles[offset+5] += vectorY.y*(vectorX.z*mPole[1][0] + vectorY.z*mPole[1][1] + vectorZ.z*mPole[1][2]);
labFrameQuadrupoles[offset+5] += vectorZ.y*(vectorX.z*mPole[2][0] + vectorY.z*mPole[2][1] + vectorZ.z*mPole[2][2]);
labFrameQuadrupoles[offset] = vectorX.x*(vectorX.x*mPole[0][0] + vectorY.x*mPole[0][1] + vectorZ.x*mPole[0][2]);
labFrameQuadrupoles[offset] += vectorY.x*(vectorX.x*mPole[1][0] + vectorY.x*mPole[1][1] + vectorZ.x*mPole[1][2]);
labFrameQuadrupoles[offset] += vectorZ.x*(vectorX.x*mPole[2][0] + vectorY.x*mPole[2][1] + vectorZ.x*mPole[2][2]);
labFrameQuadrupoles[offset+1] = vectorX.x*(vectorX.y*mPole[0][0] + vectorY.y*mPole[0][1] + vectorZ.y*mPole[0][2]);
labFrameQuadrupoles[offset+1] += vectorY.x*(vectorX.y*mPole[1][0] + vectorY.y*mPole[1][1] + vectorZ.y*mPole[1][2]);
labFrameQuadrupoles[offset+1] += vectorZ.x*(vectorX.y*mPole[2][0] + vectorY.y*mPole[2][1] + vectorZ.y*mPole[2][2]);
labFrameQuadrupoles[offset+2] = vectorX.x*(vectorX.z*mPole[0][0] + vectorY.z*mPole[0][1] + vectorZ.z*mPole[0][2]);
labFrameQuadrupoles[offset+2] += vectorY.x*(vectorX.z*mPole[1][0] + vectorY.z*mPole[1][1] + vectorZ.z*mPole[1][2]);
labFrameQuadrupoles[offset+2] += vectorZ.x*(vectorX.z*mPole[2][0] + vectorY.z*mPole[2][1] + vectorZ.z*mPole[2][2]);
labFrameQuadrupoles[offset+3] = labFrameQuadrupoles[offset+1];
labFrameQuadrupoles[offset+6] = labFrameQuadrupoles[offset+2];
labFrameQuadrupoles[offset+7] = labFrameQuadrupoles[offset+5];
}
}
}
/* -------------------------------------------------------------------------- *
* OpenMMAmoeba *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2010 Stanford University and the Authors. *
* Authors: Mark Friedrichs *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
/**
* This tests the CUDA implementation of AmoebaMultipoleForce.
*/
#include "openmm/internal/AssertionUtilities.h"
#include "AmoebaTinkerParameterFile.h"
#include "openmm/Context.h"
#include "CudaPlatform.h"
#include "openmm/AmoebaMultipoleForce.h"
#include "openmm/System.h"
#include "openmm/VerletIntegrator.h"
#include "../src/SimTKUtilities/SimTKOpenMMRealType.h"
#include <iostream>
#include <vector>
using namespace OpenMM;
using namespace std;
const double TOL = 1e-5;
void testPMEWater() {
Platform& platform = Platform::getPlatformByName("CUDA");
System system;
system.addParticle(16);
system.addParticle(1);
system.addParticle(1);
VerletIntegrator integrator(0.01);
AmoebaMultipoleForce* mp = new AmoebaMultipoleForce();
mp->setNonbondedMethod(AmoebaMultipoleForce::PME);
vector<double> dipole(3, 0.0);
dipole[2] = 7.556121361e-2;
vector<double> quadrupole(9, 0.0);
quadrupole[0] = 3.540307211e-2;
quadrupole[4] = -3.902570771e-2;
quadrupole[8] = 3.622635596e-3;
double damp = 9.707801995e-01*sqrt(0.1);
double polarity = 0.837*0.001;
mp->addParticle(-0.51966, dipole, quadrupole, 1, 1, 2, -1, 0.39, damp, polarity);
dipole[0] = -2.042094848e-2;
dipole[2] = -3.078753000e-2;
quadrupole[0] = -3.428482490e-3;
quadrupole[2] = -1.894859639e-4;
quadrupole[4] = -1.002408752e-2;
quadrupole[6] = -1.894859639e-4;
quadrupole[8] = 1.345257001e-2;
damp = 8.897068742e-01*sqrt(0.1);
polarity = 0.496*0.001;
mp->addParticle(0.25983, dipole, quadrupole, 0, 0, 2, -1, 0.39, damp, polarity);
mp->addParticle(0.25983, dipole, quadrupole, 0, 0, 1, -1, 0.39, damp, polarity);
mp->setCutoffDistance(1.0);
std::vector<int> intVector;
intVector.push_back( 0 );
intVector.push_back( 1 );
intVector.push_back( 2 );
mp->setCovalentMap( 0, AmoebaMultipoleForce::PolarizationCovalent11, intVector );
mp->setCovalentMap( 1, AmoebaMultipoleForce::PolarizationCovalent11, intVector );
mp->setCovalentMap( 2, AmoebaMultipoleForce::PolarizationCovalent11, intVector );
intVector.resize(0); intVector.push_back( 1 ); intVector.push_back( 2 );
mp->setCovalentMap( 0, AmoebaMultipoleForce::Covalent12, intVector );
intVector.resize(0); intVector.push_back( 0 ); intVector.push_back( 2 );
mp->setCovalentMap( 1, AmoebaMultipoleForce::Covalent12, intVector );
intVector.resize(0); intVector.push_back( 0 ); intVector.push_back( 1 );
mp->setCovalentMap( 2, AmoebaMultipoleForce::Covalent12, intVector );
mp->setEwaldErrorTolerance(TOL);
system.setDefaultPeriodicBoxVectors(Vec3(2, 0, 0), Vec3(0, 2, 0), Vec3(0, 0, 2));
system.addForce(mp);
Context context(system, integrator, platform);
vector<Vec3> positions(3);
const double angle = 109.47*M_PI/180;
const double dOH = 0.1;
positions[0] = Vec3();
positions[1] = Vec3(dOH, 0, 0);
positions[2] = Vec3(dOH*std::cos(angle), dOH*std::sin(angle), 0);
context.setPositions(positions);
State state = context.getState(State::Forces | State::Energy);
const vector<Vec3>& forces = state.getForces();
#ifdef AMOEBA_DEBUG
(void) fprintf( stderr, "PME forces\n" );
for( unsigned int ii = 0; ii < forces.size(); ii++ ){
(void) fprintf( stderr, "%6u [%14.7e %14.7e %14.7e]\n", ii,
forces[ii][0], forces[ii][1], forces[ii][2] );
}
(void) fflush( stderr );
#endif
}
int main() {
try {
registerAmoebaCudaKernelFactories();
testPMEWater();
} catch(const exception& e) {
cout << "exception: " << e.what() << endl;
return 1;
}
std::cout << "Done" << std::endl;
return 0;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment