/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009 Stanford University and the Authors. *
* Authors: Scott Le Grand, Peter Eastman *
* Contributors: *
* *
* This program is free software: you can redistribute it and/or modify *
* it under the terms of the GNU Lesser General Public License as published *
* by the Free Software Foundation, either version 3 of the License, or *
* (at your option) any later version. *
* *
* This program is distributed in the hope that it will be useful, *
* but WITHOUT ANY WARRANTY; without even the implied warranty of *
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
* GNU Lesser General Public License for more details. *
* *
* You should have received a copy of the GNU Lesser General Public License *
* along with this program. If not, see . *
* -------------------------------------------------------------------------- */
#include "amoebaScaleFactors.h"
__global__
#if (__CUDA_ARCH__ >= 200)
__launch_bounds__(384, 1)
#elif (__CUDA_ARCH__ >= 120)
__launch_bounds__(192, 1)
#else
__launch_bounds__(64, 1)
#endif
void METHOD_NAME(kCalculateAmoebaPmeDirectFixedE_Field, _kernel)(
unsigned int* workUnit,
float* outputEField,
float* outputEFieldPolar
#ifdef AMOEBA_DEBUG
, float4* debugArray, unsigned int targetAtom
#endif
){
#ifdef AMOEBA_DEBUG
int maxPullIndex = 1;
float4 pullBack[12];
#endif
extern __shared__ FixedFieldParticle sA[];
unsigned int totalWarps = gridDim.x*blockDim.x/GRID;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/GRID;
unsigned int numWorkUnits = cSim.pInteractionCount[0];
unsigned int pos = warp*numWorkUnits/totalWarps;
unsigned int end = (warp+1)*numWorkUnits/totalWarps;
unsigned int lasty = 0xFFFFFFFF;
while (pos < end)
{
unsigned int x;
unsigned int y;
bool bExclusionFlag;
float dScaleValue;
float pScaleValue;
int dScaleMask;
int2 pScaleMask;
// extract cell coordinates
decodeCell( workUnit[pos], &x, &y, &bExclusionFlag );
unsigned int tgx = threadIdx.x & (GRID - 1);
unsigned int tbx = threadIdx.x - tgx;
unsigned int tj = tgx;
FixedFieldParticle* psA = &sA[tbx];
unsigned int atomI = x + tgx;
FixedFieldParticle localParticle;
loadFixedFieldShared( &localParticle, atomI );
float fieldSum[3];
float fieldPolarSum[3];
fieldSum[0] = 0.0f;
fieldSum[1] = 0.0f;
fieldSum[2] = 0.0f;
fieldPolarSum[0] = 0.0f;
fieldPolarSum[1] = 0.0f;
fieldPolarSum[2] = 0.0f;
if (x == y)
{
// load coordinates, charge, ...
loadFixedFieldShared( &(sA[threadIdx.x]), atomI );
if( bExclusionFlag ){
unsigned int xi = x >> GRIDBITS;
unsigned int cell = xi + xi*cAmoebaSim.paddedNumberOfAtoms/GRID-xi*(xi+1)/2;
dScaleMask = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
pScaleMask = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
} else {
dScaleValue = pScaleValue = 1.0f;
}
for (unsigned int j = 0; j < GRID; j++)
{
if( bExclusionFlag ){
getMaskedDScaleFactor( j, dScaleMask, &dScaleValue );
getMaskedPScaleFactor( j, pScaleMask, &pScaleValue );
}
float4 ijField[3];
calculateFixedFieldRealSpacePairIxn_kernel( localParticle, psA[j], dScaleValue, pScaleValue, ijField
#ifdef AMOEBA_DEBUG
, pullBack
#endif
);
// nan*0.0 = nan not 0.0, so explicitly exclude (atomI == atomJ) contribution
// by setting match flag
unsigned int match = ( (atomI == (y + j)) || (atomI >= cAmoebaSim.numberOfAtoms) || ((y+j) >= cAmoebaSim.numberOfAtoms) ) ? 1 : 0;
// add to field at atomI the field due atomJ's charge/dipole/quadrupole
fieldSum[0] += match ? 0.0f : ijField[0].x;
fieldSum[1] += match ? 0.0f : ijField[1].x;
fieldSum[2] += match ? 0.0f : ijField[2].x;
fieldPolarSum[0] += match ? 0.0f : ijField[0].z;
fieldPolarSum[1] += match ? 0.0f : ijField[1].z;
fieldPolarSum[2] += match ? 0.0f : ijField[2].z;
#ifdef AMOEBA_DEBUG
if( atomI == targetAtom || targetAtom == (y+j) ){
unsigned int index = atomI == targetAtom ? (y + j) : atomI;
unsigned int indexI = 0;
unsigned int indexJ = indexI ? 0 : 2;
unsigned int indices[4] = { indexI, indexJ, indexI+1, indexJ+1 };
float flag = 7.0f;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + j);
debugArray[index].z = dScaleValue;
debugArray[index].w = pScaleValue;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = (float) bExclusionFlag;
debugArray[index].y = (float) (tgx);
debugArray[index].z = (float) j;
debugArray[index].w = flag;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = (float) dScaleMask;
debugArray[index].y = (float) pScaleMask.x;
debugArray[index].z = (float) pScaleMask.y;
debugArray[index].w = flag;
for( int ii = 0; ii < 4; ii++ ){
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = match ? 0.0f : ijField[indices[ii]][0];
debugArray[index].y = match ? 0.0f : ijField[indices[ii]][1];
debugArray[index].z = match ? 0.0f : ijField[indices[ii]][2];
debugArray[index].w = flag;
}
for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullIndex].x;
debugArray[index].y = pullBack[pullIndex].y;
debugArray[index].z = pullBack[pullIndex].z;
debugArray[index].w = pullBack[pullIndex].w;
}
}
#endif
}
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
load3dArrayBufferPerWarp( offset, fieldSum, outputEField );
load3dArrayBufferPerWarp( offset, fieldPolarSum, outputEFieldPolar );
#else
unsigned int offset = 3*(x + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
load3dArray( offset, fieldSum, outputEField );
load3dArray( offset, fieldPolarSum, outputEFieldPolar );
#endif
} else {
if (lasty != y ) {
// load coordinates, charge, ...
loadFixedFieldShared( &(sA[threadIdx.x]), (y+tgx) );
}
unsigned int flags = cSim.pInteractionFlag[pos];
if (flags == 0) {
// No interactions in this block.
} else {
// zero shared fields
zeroFixedFieldParticleSharedField( &(sA[threadIdx.x]) );
if( bExclusionFlag ) {
unsigned int xi = x >> GRIDBITS;
unsigned int yi = y >> GRIDBITS;
unsigned int cell = xi+yi*cAmoebaSim.paddedNumberOfAtoms/GRID-yi*(yi+1)/2;
dScaleMask = cAmoebaSim.pD_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
pScaleMask = cAmoebaSim.pP_ScaleIndices[cAmoebaSim.pScaleIndicesIndex[cell]+tgx];
} else {
dScaleValue = pScaleValue = 1.0f;
}
for (unsigned int j = 0; j < GRID; j++){
if ((flags&(1<= cAmoebaSim.numberOfAtoms) || ((y+jIdx) >= cAmoebaSim.numberOfAtoms) ) ? 1 : 0;
// add to field at atomI the field due atomJ's charge/dipole/quadrupole
fieldSum[0] += outOfBounds ? 0.0f : ijField[0].x;
fieldSum[1] += outOfBounds ? 0.0f : ijField[1].x;
fieldSum[2] += outOfBounds ? 0.0f : ijField[2].x;
fieldPolarSum[0] += outOfBounds ? 0.0f : ijField[0].z;
fieldPolarSum[1] += outOfBounds ? 0.0f : ijField[1].z;
fieldPolarSum[2] += outOfBounds ? 0.0f : ijField[2].z;
if( flags == 0xFFFFFFFF ){
// add to field at atomJ the field due atomI's charge/dipole/quadrupole
psA[jIdx].eField[0] += outOfBounds ? 0.0f : ijField[0].y;
psA[jIdx].eField[1] += outOfBounds ? 0.0f : ijField[1].y;
psA[jIdx].eField[2] += outOfBounds ? 0.0f : ijField[2].y;
psA[jIdx].eFieldP[0] += outOfBounds ? 0.0f : ijField[0].w;
psA[jIdx].eFieldP[1] += outOfBounds ? 0.0f : ijField[1].w;
psA[jIdx].eFieldP[2] += outOfBounds ? 0.0f : ijField[2].w;
} else {
sA[threadIdx.x].tempBuffer[0] = outOfBounds ? 0.0f : ijField[0].y;
sA[threadIdx.x].tempBuffer[1] = outOfBounds ? 0.0f : ijField[1].y;
sA[threadIdx.x].tempBuffer[2] = outOfBounds ? 0.0f : ijField[2].y;
sA[threadIdx.x].tempBufferP[0] = outOfBounds ? 0.0f : ijField[0].w;
sA[threadIdx.x].tempBufferP[1] = outOfBounds ? 0.0f : ijField[1].w;
sA[threadIdx.x].tempBufferP[2] = outOfBounds ? 0.0f : ijField[2].w;
if( tgx % 2 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+1] );
}
if( tgx % 4 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+2] );
}
if( tgx % 8 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+4] );
}
if( tgx % 16 == 0 ){
sumTempBuffer( sA[threadIdx.x], sA[threadIdx.x+8] );
}
if (tgx == 0)
{
psA[jIdx].eField[0] += sA[threadIdx.x].tempBuffer[0] + sA[threadIdx.x+16].tempBuffer[0];
psA[jIdx].eField[1] += sA[threadIdx.x].tempBuffer[1] + sA[threadIdx.x+16].tempBuffer[1];
psA[jIdx].eField[2] += sA[threadIdx.x].tempBuffer[2] + sA[threadIdx.x+16].tempBuffer[2];
psA[jIdx].eFieldP[0] += sA[threadIdx.x].tempBufferP[0] + sA[threadIdx.x+16].tempBufferP[0];
psA[jIdx].eFieldP[1] += sA[threadIdx.x].tempBufferP[1] + sA[threadIdx.x+16].tempBufferP[1];
psA[jIdx].eFieldP[2] += sA[threadIdx.x].tempBufferP[2] + sA[threadIdx.x+16].tempBufferP[2];
}
}
#ifdef AMOEBA_DEBUG
if( (atomI == targetAtom || (y + jIdx) == targetAtom) ){
unsigned int index = (atomI == targetAtom) ? (y + jIdx) : atomI;
unsigned int indexI = (atomI == targetAtom) ? 0 : 2;
unsigned int indexJ = (atomI == targetAtom) ? 2 : 0;
debugArray[index].x = (float) atomI;
debugArray[index].y = (float) (y + jIdx);
debugArray[index].z = dScaleValue;
debugArray[index].w = pScaleValue;
float flag = 9.0f;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = (float) bExclusionFlag;
debugArray[index].y = (float) (tgx);
debugArray[index].z = (float) j;
debugArray[index].w = jIdx;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = (float) dScaleMask;
debugArray[index].y = (float) pScaleMask.x;
debugArray[index].z = (float) pScaleMask.y;
debugArray[index].w = (float) flags;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = outOfBounds ? 0.0f : ijField[indexI][0];
debugArray[index].y = outOfBounds ? 0.0f : ijField[indexI][1];
debugArray[index].z = outOfBounds ? 0.0f : ijField[indexI][2];
debugArray[index].w = flag;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = outOfBounds ? 0.0f : ijField[indexJ][0];
debugArray[index].y = outOfBounds ? 0.0f : ijField[indexJ][1];
debugArray[index].z = outOfBounds ? 0.0f : ijField[indexJ][2];
debugArray[index].w = flag;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = outOfBounds ? 0.0f : ijField[indexI+1][0];
debugArray[index].y = outOfBounds ? 0.0f : ijField[indexI+1][1];
debugArray[index].z = outOfBounds ? 0.0f : ijField[indexI+1][2];
debugArray[index].w = flag;
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = outOfBounds ? 0.0f : ijField[indexJ+1][0];
debugArray[index].y = outOfBounds ? 0.0f : ijField[indexJ+1][1];
debugArray[index].z = outOfBounds ? 0.0f : ijField[indexJ+1][2];
debugArray[index].w = flag;
for( int pullIndex = 0; pullIndex < maxPullIndex; pullIndex++ ){
index += cAmoebaSim.paddedNumberOfAtoms;
debugArray[index].x = pullBack[pullIndex].x;
debugArray[index].y = pullBack[pullIndex].y;
debugArray[index].z = pullBack[pullIndex].z;
debugArray[index].w = pullBack[pullIndex].w;
}
}
#endif
}
tj = (tj + 1) & (GRID - 1);
} // j-loop block
// Write results
#ifdef USE_OUTPUT_BUFFER_PER_WARP
unsigned int offset = 3*(x + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
load3dArrayBufferPerWarp( offset, fieldSum, outputEField );
load3dArrayBufferPerWarp( offset, fieldPolarSum, outputEFieldPolar );
offset = 3*(y + tgx + warp*cAmoebaSim.paddedNumberOfAtoms);
load3dArrayBufferPerWarp( offset, sA[threadIdx.x].eField, outputEField );
load3dArrayBufferPerWarp( offset, sA[threadIdx.x].eFieldP, outputEFieldPolar );
#else
unsigned int offset = 3*(x + tgx + (y >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
load3dArray( offset, fieldSum, outputEField );
load3dArray( offset, fieldPolarSum, outputEFieldPolar );
offset = 3*(y + tgx + (x >> GRIDBITS) * cAmoebaSim.paddedNumberOfAtoms);
load3dArray( offset, sA[threadIdx.x].eField, outputEField );
load3dArray( offset, sA[threadIdx.x].eFieldP, outputEFieldPolar );
#endif
} // end of pInteractionFlag block
lasty = y;
} // x == y block
pos++;
}
}