"plugins/rpmd/tests/TestRpmd.h" did not exist on "644cc275ab5a549a82cce4d49680da729c2051e6"
Commit 93c467b2 authored by Peter Eastman's avatar Peter Eastman
Browse files

Merged 5.1Optimizations branch back to trunk

parent f6d4557d
......@@ -606,181 +606,255 @@ __device__ float computePScaleFactor(uint2 covalent, unsigned int polarizationGr
*/
extern "C" __global__ void computeEDiffForce(
unsigned long long* __restrict__ forceBuffers, unsigned long long* __restrict__ torqueBuffers, real* __restrict__ energyBuffer,
const real4* __restrict__ posq, const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
const uint2* __restrict__ covalentFlags, const unsigned int* __restrict__ polarizationGroupFlags, unsigned int startTileIndex, unsigned int numTileIndices,
const real4* __restrict__ posq, const uint2* __restrict__ covalentFlags, const unsigned int* __restrict__ polarizationGroupFlags,
const ushort2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices,
const real* __restrict__ labFrameDipole, const real* __restrict__ labFrameQuadrupole, const real* __restrict__ inducedDipole,
const real* __restrict__ inducedDipolePolar, const real* __restrict__ inducedDipoleS, const real* __restrict__ inducedDipolePolarS,
const float2* __restrict__ dampingAndThole) {
unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
const unsigned int numTiles = numTileIndices;
unsigned int pos = startTileIndex+warp*numTiles/totalWarps;
unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps;
const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx;
real energy = 0;
__shared__ AtomData4 localData[EDIFF_THREAD_BLOCK_SIZE];
__shared__ unsigned int exclusionRange[2*(EDIFF_THREAD_BLOCK_SIZE/TILE_SIZE)];
__shared__ int exclusionIndex[EDIFF_THREAD_BLOCK_SIZE/TILE_SIZE];
// First loop: process tiles that contain exclusions.
do {
// Extract the coordinates of this tile
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx;
const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
unsigned int x, y;
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
AtomData4 data;
if (pos < end) {
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
data.force = make_real3(0);
unsigned int atom1 = x*TILE_SIZE + tgx;
loadAtomData4(data, atom1, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, inducedDipoleS, inducedDipolePolarS, dampingAndThole);
uint2 covalent = covalentFlags[pos*TILE_SIZE+tgx];
unsigned int polarizationGroup = polarizationGroupFlags[pos*TILE_SIZE+tgx];
if (x == y) {
// This tile is on the diagonal.
localData[threadIdx.x].pos = data.pos;
localData[threadIdx.x].q = data.q;
localData[threadIdx.x].dipole = data.dipole;
localData[threadIdx.x].quadrupoleXX = data.quadrupoleXX;
localData[threadIdx.x].quadrupoleXY = data.quadrupoleXY;
localData[threadIdx.x].quadrupoleXZ = data.quadrupoleXZ;
localData[threadIdx.x].quadrupoleYY = data.quadrupoleYY;
localData[threadIdx.x].quadrupoleYZ = data.quadrupoleYZ;
localData[threadIdx.x].quadrupoleZZ = data.quadrupoleZZ;
localData[threadIdx.x].inducedDipole = data.inducedDipole;
localData[threadIdx.x].inducedDipolePolar = data.inducedDipolePolar;
localData[threadIdx.x].inducedDipoleS = data.inducedDipoleS;
localData[threadIdx.x].inducedDipolePolarS = data.inducedDipolePolarS;
localData[threadIdx.x].thole = data.thole;
localData[threadIdx.x].damp = data.damp;
// Compute forces.
for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = y*TILE_SIZE+j;
if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 tempForce;
real tempEnergy;
float d = computeDScaleFactor(polarizationGroup, j);
float p = computePScaleFactor(covalent, polarizationGroup, j);
computeOneEDiffInteractionF1(data, localData[tbx+j], d, p, tempEnergy, tempForce);
energy += 0.25f*tempEnergy;
data.force += tempForce;
}
}
unsigned int atom1 = x*TILE_SIZE + tgx;
loadAtomData4(data, atom1, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, inducedDipoleS, inducedDipolePolarS, dampingAndThole);
data.force = make_real3(0);
// Locate the exclusion data for this tile.
if (tgx < 2)
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
if (pos >= end)
; // This warp is done.
else if (x == y) {
// This tile is on the diagonal.
data.force *= ENERGY_SCALE_FACTOR;
atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
localData[threadIdx.x].pos = data.pos;
localData[threadIdx.x].q = data.q;
localData[threadIdx.x].dipole = data.dipole;
localData[threadIdx.x].quadrupoleXX = data.quadrupoleXX;
localData[threadIdx.x].quadrupoleXY = data.quadrupoleXY;
localData[threadIdx.x].quadrupoleXZ = data.quadrupoleXZ;
localData[threadIdx.x].quadrupoleYY = data.quadrupoleYY;
localData[threadIdx.x].quadrupoleYZ = data.quadrupoleYZ;
localData[threadIdx.x].quadrupoleZZ = data.quadrupoleZZ;
localData[threadIdx.x].inducedDipole = data.inducedDipole;
localData[threadIdx.x].inducedDipolePolar = data.inducedDipolePolar;
localData[threadIdx.x].inducedDipoleS = data.inducedDipoleS;
localData[threadIdx.x].inducedDipolePolarS = data.inducedDipolePolarS;
localData[threadIdx.x].thole = data.thole;
localData[threadIdx.x].damp = data.damp;
uint2 covalent = covalentFlags[exclusionIndex[localGroupIndex]+tgx];
unsigned int polarizationGroup = polarizationGroupFlags[exclusionIndex[localGroupIndex]+tgx];
// Compute forces.
for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = y*TILE_SIZE+j;
if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 tempForce;
real tempEnergy;
float d = computeDScaleFactor(polarizationGroup, j);
float p = computePScaleFactor(covalent, polarizationGroup, j);
computeOneEDiffInteractionF1(data, localData[tbx+j], d, p, tempEnergy, tempForce);
energy += 0.25f*tempEnergy;
data.force += tempForce;
}
// Compute torques.
data.force = make_real3(0);
for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = y*TILE_SIZE+j;
if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 tempTorque;
float d = computeDScaleFactor(polarizationGroup, j);
float p = computePScaleFactor(covalent, polarizationGroup, j);
computeOneEDiffInteractionT1(data, localData[tbx+j], d, p, tempTorque);
data.force += tempTorque;
}
data.force *= ENERGY_SCALE_FACTOR;
atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
}
data.force *= ENERGY_SCALE_FACTOR;
atomicAdd(&torqueBuffers[atom1], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&torqueBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&torqueBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
}
else {
// This is an off-diagonal tile.
unsigned int j = y*TILE_SIZE + tgx;
loadAtomData4(localData[threadIdx.x], j, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, inducedDipoleS, inducedDipolePolarS, dampingAndThole);
localData[threadIdx.x].force = make_real3(0);
// Compute forces.
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = y*TILE_SIZE+tj;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 tempForce;
real tempEnergy;
float d = computeDScaleFactor(polarizationGroup, tj);
float p = computePScaleFactor(covalent, polarizationGroup, tj);
computeOneEDiffInteractionF1(data, localData[tbx+tj], d, p, tempEnergy, tempForce);
energy += 0.5f*tempEnergy;
data.force += tempForce;
localData[tbx+tj].force -= tempForce;
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
data.force *= ENERGY_SCALE_FACTOR;
localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
offset = y*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
// Compute torques.
// Compute torques.
data.force = make_real3(0);
for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = y*TILE_SIZE+j;
if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 tempTorque;
float d = computeDScaleFactor(polarizationGroup, j);
float p = computePScaleFactor(covalent, polarizationGroup, j);
computeOneEDiffInteractionT1(data, localData[tbx+j], d, p, tempTorque);
data.force += tempTorque;
}
data.force = make_real3(0);
localData[threadIdx.x].force = make_real3(0);
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = y*TILE_SIZE+tj;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 tempTorque;
float d = computeDScaleFactor(polarizationGroup, tj);
float p = computePScaleFactor(covalent, polarizationGroup, tj);
computeOneEDiffInteractionT1(data, localData[tbx+tj], d, p, tempTorque);
data.force += tempTorque;
computeOneEDiffInteractionT3(data, localData[tbx+tj], d, p, tempTorque);
localData[tbx+tj].force += tempTorque;
}
data.force *= ENERGY_SCALE_FACTOR;
atomicAdd(&torqueBuffers[atom1], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&torqueBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&torqueBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
tj = (tj + 1) & (TILE_SIZE - 1);
}
else {
// This is an off-diagonal tile.
data.force *= ENERGY_SCALE_FACTOR;
localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
offset = x*TILE_SIZE + tgx;
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
offset = y*TILE_SIZE + tgx;
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
}
}
unsigned int j = y*TILE_SIZE + tgx;
loadAtomData4(localData[threadIdx.x], j, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, inducedDipoleS, inducedDipolePolarS, dampingAndThole);
localData[threadIdx.x].force = make_real3(0);
uint2 covalent = (hasExclusions ? covalentFlags[exclusionIndex[localGroupIndex]+tgx] : make_uint2(0, 0));
unsigned int polarizationGroup = (hasExclusions ? polarizationGroupFlags[exclusionIndex[localGroupIndex]+tgx] : 0);
// Second loop: tiles without exclusions (by enumerating all of them, since there's no cutoff).
// Compute forces.
const unsigned int numTiles = numTileIndices;
int pos = startTileIndex+warp*numTiles/totalWarps;
int end = startTileIndex+(warp+1)*numTiles/totalWarps;
int skipBase = 0;
int currentSkipIndex = tbx;
__shared__ int skipTiles[EDIFF_THREAD_BLOCK_SIZE];
skipTiles[threadIdx.x] = -1;
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = y*TILE_SIZE+tj;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 tempForce;
real tempEnergy;
float d = computeDScaleFactor(polarizationGroup, tj);
float p = computePScaleFactor(covalent, polarizationGroup, tj);
computeOneEDiffInteractionF1(data, localData[tbx+tj], d, p, tempEnergy, tempForce);
energy += 0.5f*tempEnergy;
data.force += tempForce;
localData[tbx+tj].force -= tempForce;
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
data.force *= ENERGY_SCALE_FACTOR;
localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
if (pos < end) {
unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
offset = y*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
}
while (pos < end) {
// Extract the coordinates of this tile.
// Compute torques.
unsigned int x, y;
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
data.force = make_real3(0);
localData[threadIdx.x].force = make_real3(0);
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = y*TILE_SIZE+tj;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 tempTorque;
float d = computeDScaleFactor(polarizationGroup, tj);
float p = computePScaleFactor(covalent, polarizationGroup, tj);
computeOneEDiffInteractionT1(data, localData[tbx+tj], d, p, tempTorque);
data.force += tempTorque;
computeOneEDiffInteractionT3(data, localData[tbx+tj], d, p, tempTorque);
localData[tbx+tj].force += tempTorque;
}
tj = (tj + 1) & (TILE_SIZE - 1);
// Skip over tiles that have exclusions, since they were already processed.
while (skipTiles[tbx+TILE_SIZE-1] < pos) {
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx];
skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
skipTiles[threadIdx.x] = end;
skipBase += TILE_SIZE;
currentSkipIndex = tbx;
}
while (skipTiles[currentSkipIndex] < pos)
currentSkipIndex++;
bool includeTile = (skipTiles[currentSkipIndex] != pos);
if (includeTile) {
unsigned int atom1 = x*TILE_SIZE + tgx;
// Load atom data for this tile.
AtomData4 data;
data.force = make_real3(0);
loadAtomData4(data, atom1, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, inducedDipoleS, inducedDipolePolarS, dampingAndThole);
loadAtomData4(localData[threadIdx.x], atom1, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, inducedDipoleS, inducedDipolePolarS, dampingAndThole);
unsigned int j = y*TILE_SIZE + tgx;
loadAtomData4(localData[threadIdx.x], j, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, inducedDipoleS, inducedDipolePolarS, dampingAndThole);
localData[threadIdx.x].force = make_real3(0);
// Compute forces.
unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = y*TILE_SIZE+tj;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 tempForce;
real tempEnergy;
computeOneEDiffInteractionF1(data, localData[tbx+tj], 1, 1, tempEnergy, tempForce);
energy += 0.5f*tempEnergy;
data.force += tempForce;
localData[tbx+tj].force -= tempForce;
}
data.force *= ENERGY_SCALE_FACTOR;
localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
if (pos < end) {
unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
offset = y*TILE_SIZE + tgx;
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
tj = (tj + 1) & (TILE_SIZE - 1);
}
data.force *= ENERGY_SCALE_FACTOR;
localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
offset = y*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
// Compute torques.
data.force = make_real3(0);
localData[threadIdx.x].force = make_real3(0);
for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = y*TILE_SIZE+tj;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 tempTorque;
computeOneEDiffInteractionT1(data, localData[tbx+tj], 1, 1, tempTorque);
data.force += tempTorque;
computeOneEDiffInteractionT3(data, localData[tbx+tj], 1, 1, tempTorque);
localData[tbx+tj].force += tempTorque;
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
data.force *= ENERGY_SCALE_FACTOR;
localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
offset = x*TILE_SIZE + tgx;
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
offset = y*TILE_SIZE + tgx;
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
}
pos++;
} while (pos < end);
}
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy*ENERGY_SCALE_FACTOR;
}
#define TILE_SIZE 32
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
typedef struct {
......@@ -59,331 +58,286 @@ __device__ float computePScaleFactor(uint2 covalent, unsigned int polarizationGr
*/
extern "C" __global__ void computeElectrostatics(
unsigned long long* __restrict__ forceBuffers, unsigned long long* __restrict__ torqueBuffers, real* __restrict__ energyBuffer,
const real4* __restrict__ posq, const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
const uint2* __restrict__ covalentFlags, const unsigned int* __restrict__ polarizationGroupFlags, unsigned int startTileIndex, unsigned int numTileIndices,
const real4* __restrict__ posq, const uint2* __restrict__ covalentFlags, const unsigned int* __restrict__ polarizationGroupFlags,
const ushort2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices,
#ifdef USE_CUTOFF
const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags,
const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const real4* __restrict__ blockCenter, const unsigned int* __restrict__ interactingAtoms,
#endif
const real* __restrict__ labFrameDipole, const real* __restrict__ labFrameQuadrupole, const real* __restrict__ inducedDipole,
const real* __restrict__ inducedDipolePolar, const float2* __restrict__ dampingAndThole) {
unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx;
real energy = 0;
__shared__ AtomData localData[THREAD_BLOCK_SIZE];
// First loop: process tiles that contain exclusions.
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
AtomData data;
unsigned int atom1 = x*TILE_SIZE + tgx;
loadAtomData(data, atom1, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
data.force = make_real3(0);
uint2 covalent = covalentFlags[pos*TILE_SIZE+tgx];
unsigned int polarizationGroup = polarizationGroupFlags[pos*TILE_SIZE+tgx];
if (x == y) {
// This tile is on the diagonal.
localData[threadIdx.x].posq = data.posq;
localData[threadIdx.x].dipole = data.dipole;
localData[threadIdx.x].quadrupoleXX = data.quadrupoleXX;
localData[threadIdx.x].quadrupoleXY = data.quadrupoleXY;
localData[threadIdx.x].quadrupoleXZ = data.quadrupoleXZ;
localData[threadIdx.x].quadrupoleYY = data.quadrupoleYY;
localData[threadIdx.x].quadrupoleYZ = data.quadrupoleYZ;
localData[threadIdx.x].inducedDipole = data.inducedDipole;
localData[threadIdx.x].inducedDipolePolar = data.inducedDipolePolar;
localData[threadIdx.x].thole = data.thole;
localData[threadIdx.x].damp = data.damp;
// Compute forces.
for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = y*TILE_SIZE+j;
if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 tempForce;
real tempEnergy;
float d = computeDScaleFactor(polarizationGroup, j);
float p = computePScaleFactor(covalent, polarizationGroup, j);
float m = computeMScaleFactor(covalent, j);
computeOneInteractionF1(data, localData[tbx+j], d, p, m, tempEnergy, tempForce);
data.force += tempForce;
energy += 0.5f*tempEnergy;
}
}
data.force *= ENERGY_SCALE_FACTOR;
atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
// Compute torques.
data.force = make_real3(0);
for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = y*TILE_SIZE+j;
if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 tempForce;
float d = computeDScaleFactor(polarizationGroup, j);
float p = computePScaleFactor(covalent, polarizationGroup, j);
float m = computeMScaleFactor(covalent, j);
computeOneInteractionT1(data, localData[tbx+j], d, p, m, tempForce);
data.force += tempForce;
}
}
data.force *= ENERGY_SCALE_FACTOR;
atomicAdd(&torqueBuffers[atom1], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&torqueBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&torqueBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
}
else {
// This is an off-diagonal tile.
unsigned int j = y*TILE_SIZE + tgx;
loadAtomData(localData[threadIdx.x], j, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
localData[threadIdx.x].force = make_real3(0);
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = y*TILE_SIZE+tj;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 tempForce;
real tempEnergy;
float d = computeDScaleFactor(polarizationGroup, tj);
float p = computePScaleFactor(covalent, polarizationGroup, tj);
float m = computeMScaleFactor(covalent, tj);
computeOneInteractionF1(data, localData[tbx+tj], d, p, m, tempEnergy, tempForce);
data.force += tempForce;
localData[tbx+tj].force -= tempForce;
energy += tempEnergy;
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
data.force *= ENERGY_SCALE_FACTOR;
localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
offset = y*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
// Compute torques.
data.force = make_real3(0);
localData[threadIdx.x].force = make_real3(0);
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = y*TILE_SIZE+tj;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 tempForce;
float d = computeDScaleFactor(polarizationGroup, tj);
float p = computePScaleFactor(covalent, polarizationGroup, tj);
float m = computeMScaleFactor(covalent, tj);
computeOneInteractionT1(data, localData[tbx+tj], d, p, m, tempForce);
data.force += tempForce;
computeOneInteractionT3(data, localData[tbx+tj], d, p, m, tempForce);
localData[tbx+tj].force += tempForce;
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
data.force *= ENERGY_SCALE_FACTOR;
localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
offset = x*TILE_SIZE + tgx;
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
offset = y*TILE_SIZE + tgx;
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
const unsigned int numTiles = interactionCount[0];
unsigned int pos = (numTiles > maxTiles ? startTileIndex+warp*numTileIndices/totalWarps : warp*numTiles/totalWarps);
unsigned int end = (numTiles > maxTiles ? startTileIndex+(warp+1)*numTileIndices/totalWarps : (warp+1)*numTiles/totalWarps);
int pos = (numTiles > maxTiles ? startTileIndex+warp*numTileIndices/totalWarps : warp*numTiles/totalWarps);
int end = (numTiles > maxTiles ? startTileIndex+(warp+1)*numTileIndices/totalWarps : (warp+1)*numTiles/totalWarps);
#else
const unsigned int numTiles = numTileIndices;
unsigned int pos = startTileIndex+warp*numTiles/totalWarps;
unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps;
#endif
real energy = 0;
__shared__ AtomData localData[THREAD_BLOCK_SIZE];
__shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
__shared__ int exclusionIndex[WARPS_PER_GROUP];
#ifndef ENABLE_SHUFFLE
__shared__ real tempBuffer[3*THREAD_BLOCK_SIZE];
int pos = startTileIndex+warp*numTiles/totalWarps;
int end = startTileIndex+(warp+1)*numTiles/totalWarps;
#endif
int skipBase = 0;
int currentSkipIndex = tbx;
__shared__ int atomIndices[THREAD_BLOCK_SIZE];
__shared__ int skipTiles[THREAD_BLOCK_SIZE];
skipTiles[threadIdx.x] = -1;
do {
// Extract the coordinates of this tile
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx;
const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
while (pos < end) {
bool includeTile = true;
// Extract the coordinates of this tile.
unsigned int x, y;
AtomData data;
if (pos < end) {
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
y = tileIndices.y;
}
else
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
// Skip over tiles that have exclusions, since they were already processed.
while (skipTiles[tbx+TILE_SIZE-1] < pos) {
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx];
skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
skipTiles[threadIdx.x] = end;
skipBase += TILE_SIZE;
currentSkipIndex = tbx;
}
while (skipTiles[currentSkipIndex] < pos)
currentSkipIndex++;
includeTile = (skipTiles[currentSkipIndex] != pos);
}
if (includeTile) {
unsigned int atom1 = x*TILE_SIZE + tgx;
// Load atom data for this tile.
AtomData data;
loadAtomData(data, atom1, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
data.force = make_real3(0);
// Locate the exclusion data for this tile.
#ifdef USE_CUTOFF
unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
#else
unsigned int j = y*TILE_SIZE + tgx;
#endif
atomIndices[threadIdx.x] = j;
loadAtomData(localData[threadIdx.x], j, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
localData[threadIdx.x].force = make_real3(0);
if (tgx < 2)
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
if (pos >= end)
; // This warp is done.
else if (x == y) {
// This tile is on the diagonal.
// Compute forces.
localData[threadIdx.x].posq = data.posq;
localData[threadIdx.x].dipole = data.dipole;
localData[threadIdx.x].quadrupoleXX = data.quadrupoleXX;
localData[threadIdx.x].quadrupoleXY = data.quadrupoleXY;
localData[threadIdx.x].quadrupoleXZ = data.quadrupoleXZ;
localData[threadIdx.x].quadrupoleYY = data.quadrupoleYY;
localData[threadIdx.x].quadrupoleYZ = data.quadrupoleYZ;
localData[threadIdx.x].inducedDipole = data.inducedDipole;
localData[threadIdx.x].inducedDipolePolar = data.inducedDipolePolar;
localData[threadIdx.x].thole = data.thole;
localData[threadIdx.x].damp = data.damp;
uint2 covalent = covalentFlags[exclusionIndex[localGroupIndex]+tgx];
unsigned int polarizationGroup = polarizationGroupFlags[exclusionIndex[localGroupIndex]+tgx];
// Compute forces.
for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = y*TILE_SIZE+j;
if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 tempForce;
real tempEnergy;
float d = computeDScaleFactor(polarizationGroup, j);
float p = computePScaleFactor(covalent, polarizationGroup, j);
float m = computeMScaleFactor(covalent, j);
computeOneInteractionF1(data, localData[tbx+j], d, p, m, tempEnergy, tempForce);
data.force += tempForce;
energy += 0.5f*tempEnergy;
}
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = atomIndices[tbx+tj];
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 tempForce;
real tempEnergy;
computeOneInteractionF1(data, localData[tbx+tj], 1, 1, 1, tempEnergy, tempForce);
data.force += tempForce;
localData[tbx+tj].force -= tempForce;
energy += tempEnergy;
}
data.force *= ENERGY_SCALE_FACTOR;
atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
// Compute torques.
data.force = make_real3(0);
for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = y*TILE_SIZE+j;
if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 tempForce;
float d = computeDScaleFactor(polarizationGroup, j);
float p = computePScaleFactor(covalent, polarizationGroup, j);
float m = computeMScaleFactor(covalent, j);
computeOneInteractionT1(data, localData[tbx+j], d, p, m, tempForce);
data.force += tempForce;
}
}
data.force *= ENERGY_SCALE_FACTOR;
atomicAdd(&torqueBuffers[atom1], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&torqueBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&torqueBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
tj = (tj + 1) & (TILE_SIZE - 1);
}
else {
// This is an off-diagonal tile.
unsigned int j = y*TILE_SIZE + tgx;
loadAtomData(localData[threadIdx.x], j, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
localData[threadIdx.x].force = make_real3(0);
data.force *= ENERGY_SCALE_FACTOR;
localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (!hasExclusions && flags != 0xFFFFFFFF) {
if (flags == 0) {
// No interactions in this tile.
}
else {
// Compute only a subset of the interactions in this tile.
for (j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
int atom2 = tbx+j;
real3 delta = make_real3(localData[atom2].posq.x-data.posq.x, localData[atom2].posq.y-data.posq.y, localData[atom2].posq.z-data.posq.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real3 tempForce;
real tempEnergy;
computeOneInteractionF1(data, localData[atom2], 1, 1, 1, tempEnergy, tempForce);
data.force += tempForce;
localData[atom2].force -= tempForce;
energy += tempEnergy;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#ifdef ENABLE_SHUFFLE
for (int i = 16; i >= 1; i /= 2) {
tempForce.x += __shfl_xor(tempForce.x, i, 32);
tempForce.y += __shfl_xor(tempForce.y, i, 32);
tempForce.z += __shfl_xor(tempForce.z, i, 32);
}
if (tgx == 0)
localData[atom2].force -= tempForce;
offset = atomIndices[threadIdx.x];
#else
int bufferIndex = 3*threadIdx.x;
tempBuffer[bufferIndex] = tempForce.x;
tempBuffer[bufferIndex+1] = tempForce.y;
tempBuffer[bufferIndex+2] = tempForce.z;
if (tgx % 4 == 0) {
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11];
}
if (tgx == 0) {
localData[atom2].force.x -= tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
localData[atom2].force.y -= tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
localData[atom2].force.z -= tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
}
offset = y*TILE_SIZE + tgx;
#endif
}
}
}
data.force *= ENERGY_SCALE_FACTOR;
localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
if (pos < end) {
unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
offset = y*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
}
// Compute torques.
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
data.force = make_real3(0);
localData[threadIdx.x].force = make_real3(0);
for (j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
int atom2 = tbx+j;
real3 delta = make_real3(localData[atom2].posq.x-data.posq.x, localData[atom2].posq.y-data.posq.y, localData[atom2].posq.z-data.posq.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real3 tempForce;
computeOneInteractionT1(data, localData[atom2], 1, 1, 1, tempForce);
data.force += tempForce;
computeOneInteractionT3(data, localData[atom2], 1, 1, 1, tempForce);
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#ifdef ENABLE_SHUFFLE
for (int i = 16; i >= 1; i /= 2) {
tempForce.x += __shfl_xor(tempForce.x, i, 32);
tempForce.y += __shfl_xor(tempForce.y, i, 32);
tempForce.z += __shfl_xor(tempForce.z, i, 32);
}
if (tgx == 0)
localData[atom2].force -= tempForce;
#else
int bufferIndex = 3*threadIdx.x;
tempBuffer[bufferIndex] = tempForce.x;
tempBuffer[bufferIndex+1] = tempForce.y;
tempBuffer[bufferIndex+2] = tempForce.z;
if (tgx % 4 == 0) {
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11];
}
if (tgx == 0) {
localData[atom2].force.x += tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
localData[atom2].force.y += tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
localData[atom2].force.z += tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
}
#endif
}
}
}
data.force *= ENERGY_SCALE_FACTOR;
localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
if (pos < end) {
unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
offset = y*TILE_SIZE + tgx;
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
}
}
}
else
#endif
{
// Compute the full set of interactions in this tile.
// Compute torques.
uint2 covalent = (hasExclusions ? covalentFlags[exclusionIndex[localGroupIndex]+tgx] : make_uint2(0, 0));
unsigned int polarizationGroup = (hasExclusions ? polarizationGroupFlags[exclusionIndex[localGroupIndex]+tgx] : 0);
// Compute forces.
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = y*TILE_SIZE+tj;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 tempForce;
real tempEnergy;
float d = computeDScaleFactor(polarizationGroup, tj);
float p = computePScaleFactor(covalent, polarizationGroup, tj);
float m = computeMScaleFactor(covalent, tj);
computeOneInteractionF1(data, localData[tbx+tj], d, p, m, tempEnergy, tempForce);
data.force += tempForce;
localData[tbx+tj].force -= tempForce;
energy += tempEnergy;
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
data.force *= ENERGY_SCALE_FACTOR;
localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
if (pos < end) {
unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
offset = y*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
}
// Compute torques.
data.force = make_real3(0);
localData[threadIdx.x].force = make_real3(0);
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = y*TILE_SIZE+tj;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 tempForce;
float d = computeDScaleFactor(polarizationGroup, tj);
float p = computePScaleFactor(covalent, polarizationGroup, tj);
float m = computeMScaleFactor(covalent, tj);
computeOneInteractionT1(data, localData[tbx+tj], d, p, m, tempForce);
data.force += tempForce;
computeOneInteractionT3(data, localData[tbx+tj], d, p, m, tempForce);
localData[tbx+tj].force += tempForce;
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
data.force *= ENERGY_SCALE_FACTOR;
localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
if (pos < end) {
unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
offset = y*TILE_SIZE + tgx;
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
}
data.force = make_real3(0);
localData[threadIdx.x].force = make_real3(0);
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = y*TILE_SIZE+tj;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 tempForce;
computeOneInteractionT1(data, localData[tbx+tj], 1, 1, 1, tempForce);
data.force += tempForce;
computeOneInteractionT3(data, localData[tbx+tj], 1, 1, 1, tempForce);
localData[tbx+tj].force += tempForce;
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
data.force *= ENERGY_SCALE_FACTOR;
localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
offset = x*TILE_SIZE + tgx;
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
#ifdef USE_CUTOFF
offset = atomIndices[threadIdx.x];
#else
offset = y*TILE_SIZE + tgx;
#endif
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
}
pos++;
} while (pos < end);
}
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy*ENERGY_SCALE_FACTOR;
}
#define TILE_SIZE 32
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
typedef struct {
......@@ -398,245 +397,268 @@ __device__ float computePScaleFactor(uint2 covalent, unsigned int polarizationGr
*/
extern "C" __global__ void computeFixedField(
unsigned long long* __restrict__ fieldBuffers, unsigned long long* __restrict__ fieldPolarBuffers, const real4* __restrict__ posq,
const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
const uint2* __restrict__ covalentFlags, const unsigned int* __restrict__ polarizationGroupFlags, unsigned int startTileIndex, unsigned int numTileIndices,
const uint2* __restrict__ covalentFlags, const unsigned int* __restrict__ polarizationGroupFlags, const ushort2* __restrict__ exclusionTiles,
unsigned int startTileIndex, unsigned int numTileIndices,
#ifdef USE_CUTOFF
const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags,
const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const real4* __restrict__ blockCenter, const unsigned int* __restrict__ interactingAtoms,
#elif defined USE_GK
const real* __restrict__ bornRadii, unsigned long long* __restrict__ gkFieldBuffers,
#endif
const real* __restrict__ labFrameDipole, const real* __restrict__ labFrameQuadrupole, const float2* __restrict__ dampingAndThole) {
unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
#ifdef USE_CUTOFF
const unsigned int numTiles = interactionCount[0];
unsigned int pos = (numTiles > maxTiles ? startTileIndex+warp*numTileIndices/totalWarps : warp*numTiles/totalWarps);
unsigned int end = (numTiles > maxTiles ? startTileIndex+(warp+1)*numTileIndices/totalWarps : (warp+1)*numTiles/totalWarps);
#else
const unsigned int numTiles = numTileIndices;
unsigned int pos = startTileIndex+warp*numTiles/totalWarps;
unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps;
#endif
const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx;
__shared__ AtomData localData[THREAD_BLOCK_SIZE];
__shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
__shared__ int exclusionIndex[WARPS_PER_GROUP];
#ifndef ENABLE_SHUFFLE
__shared__ real tempBuffer[3*THREAD_BLOCK_SIZE];
#endif
// First loop: process tiles that contain exclusions.
do {
// Extract the coordinates of this tile
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx;
const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
unsigned int x, y;
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
AtomData data;
data.field = make_real3(0);
data.fieldPolar = make_real3(0);
#ifdef USE_GK
data.gkField = make_real3(0);
#endif
if (pos < end) {
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
y = tileIndices.y;
}
else
unsigned int atom1 = x*TILE_SIZE + tgx;
loadAtomData(data, atom1, posq, labFrameDipole, labFrameQuadrupole, dampingAndThole);
#ifdef USE_GK
data.bornRadius = bornRadii[atom1];
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
uint2 covalent = covalentFlags[pos*TILE_SIZE+tgx];
unsigned int polarizationGroup = polarizationGroupFlags[pos*TILE_SIZE+tgx];
if (x == y) {
// This tile is on the diagonal.
const unsigned int localAtomIndex = threadIdx.x;
localData[localAtomIndex].posq = data.posq;
localData[localAtomIndex].dipole = data.dipole;
localData[localAtomIndex].quadrupoleXX = data.quadrupoleXX;
localData[localAtomIndex].quadrupoleXY = data.quadrupoleXY;
localData[localAtomIndex].quadrupoleXZ = data.quadrupoleXZ;
localData[localAtomIndex].quadrupoleYY = data.quadrupoleYY;
localData[localAtomIndex].quadrupoleYZ = data.quadrupoleYZ;
localData[localAtomIndex].quadrupoleZZ = data.quadrupoleZZ;
localData[localAtomIndex].thole = data.thole;
localData[localAtomIndex].damp = data.damp;
#ifdef USE_GK
localData[localAtomIndex].bornRadius = data.bornRadius;
#endif
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real3 delta = trimTo3(localData[tbx+j].posq-data.posq);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
int atom2 = y*TILE_SIZE+j;
if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 fields[4];
float d = computeDScaleFactor(polarizationGroup, j);
float p = computePScaleFactor(covalent, polarizationGroup, j);
computeOneInteraction(data, localData[tbx+j], delta, d, p, fields);
data.field += fields[0];
data.fieldPolar += fields[1];
}
}
unsigned int atom1 = x*TILE_SIZE + tgx;
loadAtomData(data, atom1, posq, labFrameDipole, labFrameQuadrupole, dampingAndThole);
#ifdef USE_GK
data.bornRadius = bornRadii[atom1];
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 fields[2];
computeOneGkInteraction(data, localData[tbx+j], delta, fields);
data.gkField += fields[0];
}
#endif
// Locate the exclusion data for this tile.
if (tgx < 2)
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
if (pos >= end)
; // This warp is done.
else if (x == y) {
// This tile is on the diagonal.
const unsigned int localAtomIndex = threadIdx.x;
localData[localAtomIndex].posq = data.posq;
localData[localAtomIndex].dipole = data.dipole;
localData[localAtomIndex].quadrupoleXX = data.quadrupoleXX;
localData[localAtomIndex].quadrupoleXY = data.quadrupoleXY;
localData[localAtomIndex].quadrupoleXZ = data.quadrupoleXZ;
localData[localAtomIndex].quadrupoleYY = data.quadrupoleYY;
localData[localAtomIndex].quadrupoleYZ = data.quadrupoleYZ;
localData[localAtomIndex].quadrupoleZZ = data.quadrupoleZZ;
localData[localAtomIndex].thole = data.thole;
localData[localAtomIndex].damp = data.damp;
}
}
else {
// This is an off-diagonal tile.
const unsigned int localAtomIndex = threadIdx.x;
unsigned int j = y*TILE_SIZE + tgx;
loadAtomData(localData[localAtomIndex], j, posq, labFrameDipole, labFrameQuadrupole, dampingAndThole);
localData[localAtomIndex].field = make_real3(0);
localData[localAtomIndex].fieldPolar = make_real3(0);
#ifdef USE_GK
localData[localAtomIndex].bornRadius = data.bornRadius;
localData[localAtomIndex].bornRadius = bornRadii[j];
localData[localAtomIndex].gkField = make_real3(0);
#endif
uint2 covalent = covalentFlags[exclusionIndex[localGroupIndex]+tgx];
unsigned int polarizationGroup = polarizationGroupFlags[exclusionIndex[localGroupIndex]+tgx];
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real3 delta = trimTo3(localData[tbx+j].posq-data.posq);
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
real3 delta = trimTo3(localData[tbx+tj].posq-data.posq);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
int atom2 = y*TILE_SIZE+j;
if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 fields[4];
float d = computeDScaleFactor(polarizationGroup, j);
float p = computePScaleFactor(covalent, polarizationGroup, j);
computeOneInteraction(data, localData[tbx+j], delta, d, p, fields);
data.field += fields[0];
data.fieldPolar += fields[1];
}
int atom2 = y*TILE_SIZE+tj;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 fields[4];
float d = computeDScaleFactor(polarizationGroup, tj);
float p = computePScaleFactor(covalent, polarizationGroup, tj);
computeOneInteraction(data, localData[tbx+tj], delta, d, p, fields);
data.field += fields[0];
data.fieldPolar += fields[1];
localData[tbx+tj].field += fields[2];
localData[tbx+tj].fieldPolar += fields[3];
#ifdef USE_GK
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 fields[2];
computeOneGkInteraction(data, localData[tbx+j], delta, fields);
data.gkField += fields[0];
}
computeOneGkInteraction(data, localData[tbx+tj], delta, fields);
data.gkField += fields[0];
localData[tbx+tj].gkField += fields[1];
#endif
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
else {
// This is an off-diagonal tile.
const unsigned int localAtomIndex = threadIdx.x;
unsigned int j = y*TILE_SIZE + tgx;
loadAtomData(localData[localAtomIndex], j, posq, labFrameDipole, labFrameQuadrupole, dampingAndThole);
localData[localAtomIndex].field = make_real3(0);
localData[localAtomIndex].fieldPolar = make_real3(0);
}
// Write results.
unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&fieldBuffers[offset], static_cast<unsigned long long>((long long) (data.field.x*0x100000000)));
atomicAdd(&fieldBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.y*0x100000000)));
atomicAdd(&fieldBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.z*0x100000000)));
atomicAdd(&fieldPolarBuffers[offset], static_cast<unsigned long long>((long long) (data.fieldPolar.x*0x100000000)));
atomicAdd(&fieldPolarBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolar.y*0x100000000)));
atomicAdd(&fieldPolarBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolar.z*0x100000000)));
#ifdef USE_GK
localData[localAtomIndex].bornRadius = bornRadii[j];
localData[localAtomIndex].gkField = make_real3(0);
atomicAdd(&gkFieldBuffers[offset], static_cast<unsigned long long>((long long) (data.gkField.x*0x100000000)));
atomicAdd(&gkFieldBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.gkField.y*0x100000000)));
atomicAdd(&gkFieldBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.gkField.z*0x100000000)));
#endif
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (!hasExclusions && flags == 0) { // TODO: Why doesn't the flags != 0 block work?
// if (!hasExclusions && flags != 0xFFFFFFFF) {
if (flags == 0) {
// No interactions in this tile.
}
else {
// Compute only a subset of the interactions in this tile.
for (j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
int atom2 = tbx+j;
real3 delta = make_real3(localData[atom2].posq.x-data.posq.x, localData[atom2].posq.y-data.posq.y, localData[atom2].posq.z-data.posq.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
if (x != y) {
offset = y*TILE_SIZE + tgx;
atomicAdd(&fieldBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.x*0x100000000)));
atomicAdd(&fieldBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.y*0x100000000)));
atomicAdd(&fieldBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.z*0x100000000)));
atomicAdd(&fieldPolarBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.x*0x100000000)));
atomicAdd(&fieldPolarBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.y*0x100000000)));
atomicAdd(&fieldPolarBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.z*0x100000000)));
#ifdef USE_GK
atomicAdd(&gkFieldBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].gkField.x*0x100000000)));
atomicAdd(&gkFieldBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].gkField.y*0x100000000)));
atomicAdd(&gkFieldBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].gkField.z*0x100000000)));
#endif
real3 fields[4];
computeOneInteraction(data, localData[atom2], delta, 1, 1, fields);
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#ifdef ENABLE_SHUFFLE
for (int i = 16; i >= 1; i /= 2) {
fields[2].x += __shfl_xor(fields[2].x, i, 32);
fields[2].y += __shfl_xor(fields[2].y, i, 32);
fields[2].z += __shfl_xor(fields[2].z, i, 32);
fields[3].x += __shfl_xor(fields[3].x, i, 32);
fields[3].y += __shfl_xor(fields[3].y, i, 32);
fields[3].z += __shfl_xor(fields[3].z, i, 32);
}
if (tgx == 0) {
localData[atom2].field += fields[2];
localData[atom2].fieldPolar += fields[3];
}
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
const unsigned int numTiles = interactionCount[0];
int pos = (numTiles > maxTiles ? startTileIndex+warp*numTileIndices/totalWarps : warp*numTiles/totalWarps);
int end = (numTiles > maxTiles ? startTileIndex+(warp+1)*numTileIndices/totalWarps : (warp+1)*numTiles/totalWarps);
#else
int bufferIndex = 3*threadIdx.x;
tempBuffer[bufferIndex] = fields[2].x;
tempBuffer[bufferIndex+1] = fields[2].y;
tempBuffer[bufferIndex+2] = fields[2].z;
if (tgx % 4 == 0) {
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11];
}
if (tgx == 0) {
localData[atom2].field.x += tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
localData[atom2].field.y += tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
localData[atom2].field.z += tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
}
tempBuffer[bufferIndex] = fields[3].x;
tempBuffer[bufferIndex+1] = fields[3].y;
tempBuffer[bufferIndex+2] = fields[3].z;
if (tgx % 4 == 0) {
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11];
}
if (tgx == 0) {
localData[atom2].fieldPolar.x += tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
localData[atom2].fieldPolar.y += tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
localData[atom2].fieldPolar.z += tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
}
const unsigned int numTiles = numTileIndices;
int pos = startTileIndex+warp*numTiles/totalWarps;
int end = startTileIndex+(warp+1)*numTiles/totalWarps;
#endif
}
}
}
}
int skipBase = 0;
int currentSkipIndex = tbx;
__shared__ int atomIndices[THREAD_BLOCK_SIZE];
__shared__ int skipTiles[THREAD_BLOCK_SIZE];
skipTiles[threadIdx.x] = -1;
while (pos < end) {
bool includeTile = true;
// Extract the coordinates of this tile.
unsigned int x, y;
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
// Skip over tiles that have exclusions, since they were already processed.
while (skipTiles[tbx+TILE_SIZE-1] < pos) {
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx];
skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
skipTiles[threadIdx.x] = end;
skipBase += TILE_SIZE;
currentSkipIndex = tbx;
}
while (skipTiles[currentSkipIndex] < pos)
currentSkipIndex++;
includeTile = (skipTiles[currentSkipIndex] != pos);
}
if (includeTile) {
unsigned int atom1 = x*TILE_SIZE + tgx;
// Load atom data for this tile.
AtomData data;
data.field = make_real3(0);
data.fieldPolar = make_real3(0);
#ifdef USE_GK
data.gkField = make_real3(0);
#endif
loadAtomData(data, atom1, posq, labFrameDipole, labFrameQuadrupole, dampingAndThole);
#ifdef USE_GK
data.bornRadius = bornRadii[atom1];
#endif
#ifdef USE_CUTOFF
unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
#else
unsigned int j = y*TILE_SIZE + tgx;
#endif
{
// Compute the full set of interactions in this tile.
uint2 covalent = (hasExclusions ? covalentFlags[exclusionIndex[localGroupIndex]+tgx] : make_uint2(0, 0));
unsigned int polarizationGroup = (hasExclusions ? polarizationGroupFlags[exclusionIndex[localGroupIndex]+tgx] : 0);
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
real3 delta = trimTo3(localData[tbx+tj].posq-data.posq);
atomIndices[threadIdx.x] = j;
const unsigned int localAtomIndex = threadIdx.x;
loadAtomData(localData[localAtomIndex], j, posq, labFrameDipole, labFrameQuadrupole, dampingAndThole);
localData[localAtomIndex].field = make_real3(0);
localData[localAtomIndex].fieldPolar = make_real3(0);
#ifdef USE_GK
localData[localAtomIndex].bornRadius = bornRadii[j];
localData[localAtomIndex].gkField = make_real3(0);
#endif
// Compute the full set of interactions in this tile.
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
real3 delta = trimTo3(localData[tbx+tj].posq-data.posq);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
int atom2 = y*TILE_SIZE+tj;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 fields[4];
float d = computeDScaleFactor(polarizationGroup, tj);
float p = computePScaleFactor(covalent, polarizationGroup, tj);
computeOneInteraction(data, localData[tbx+tj], delta, d, p, fields);
data.field += fields[0];
data.fieldPolar += fields[1];
localData[tbx+tj].field += fields[2];
localData[tbx+tj].fieldPolar += fields[3];
int atom2 = atomIndices[tbx+tj];
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 fields[4];
computeOneInteraction(data, localData[tbx+tj], delta, 1, 1, fields);
data.field += fields[0];
data.fieldPolar += fields[1];
localData[tbx+tj].field += fields[2];
localData[tbx+tj].fieldPolar += fields[3];
#ifdef USE_GK
computeOneGkInteraction(data, localData[tbx+tj], delta, fields);
data.gkField += fields[0];
localData[tbx+tj].gkField += fields[1];
computeOneGkInteraction(data, localData[tbx+tj], delta, fields);
data.gkField += fields[0];
localData[tbx+tj].gkField += fields[1];
#endif
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
// Write results.
if (pos < end) {
const unsigned int offset = x*TILE_SIZE + tgx;
// Write results.
unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&fieldBuffers[offset], static_cast<unsigned long long>((long long) (data.field.x*0x100000000)));
atomicAdd(&fieldBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.y*0x100000000)));
atomicAdd(&fieldBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.z*0x100000000)));
......@@ -648,9 +670,11 @@ extern "C" __global__ void computeFixedField(
atomicAdd(&gkFieldBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.gkField.y*0x100000000)));
atomicAdd(&gkFieldBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.gkField.z*0x100000000)));
#endif
}
if (pos < end && x != y) {
const unsigned int offset = y*TILE_SIZE + tgx;
#ifdef USE_CUTOFF
offset = atomIndices[threadIdx.x];
#else
offset = y*TILE_SIZE + tgx;
#endif
atomicAdd(&fieldBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.x*0x100000000)));
atomicAdd(&fieldBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.y*0x100000000)));
atomicAdd(&fieldBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.z*0x100000000)));
......@@ -664,5 +688,5 @@ extern "C" __global__ void computeFixedField(
#endif
}
pos++;
} while (pos < end);
}
}
#define TILE_SIZE 32
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
typedef struct {
......@@ -199,194 +198,221 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real3 de
* Compute the mutual induced field.
*/
extern "C" __global__ void computeInducedField(
unsigned long long* __restrict__ field, unsigned long long* __restrict__ fieldPolar, const real4* __restrict__ posq,
unsigned long long* __restrict__ field, unsigned long long* __restrict__ fieldPolar, const real4* __restrict__ posq, const ushort2* __restrict__ exclusionTiles,
const real* __restrict__ inducedDipole, const real* __restrict__ inducedDipolePolar, unsigned int startTileIndex, unsigned int numTileIndices,
#ifdef USE_CUTOFF
const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags,
const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const real4* __restrict__ blockCenter, const unsigned int* __restrict__ interactingAtoms,
#elif defined USE_GK
unsigned long long* __restrict__ fieldS, unsigned long long* __restrict__ fieldPolarS, const real* __restrict__ inducedDipoleS,
const real* __restrict__ inducedDipolePolarS, const real* __restrict__ bornRadii,
#endif
const float2* __restrict__ dampingAndThole) {
unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
#ifdef USE_CUTOFF
const unsigned int numTiles = interactionCount[0];
unsigned int pos = (numTiles > maxTiles ? startTileIndex+warp*numTileIndices/totalWarps : warp*numTiles/totalWarps);
unsigned int end = (numTiles > maxTiles ? startTileIndex+(warp+1)*numTileIndices/totalWarps : (warp+1)*numTiles/totalWarps);
#else
const unsigned int numTiles = numTileIndices;
unsigned int pos = startTileIndex+warp*numTiles/totalWarps;
unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps;
#endif
const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx;
__shared__ AtomData localData[THREAD_BLOCK_SIZE];
#ifndef ENABLE_SHUFFLE
// __shared__ real tempBuffer[3*THREAD_BLOCK_SIZE];
#endif
// First loop: process tiles that contain exclusions.
do {
// Extract the coordinates of this tile
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx;
unsigned int x, y;
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
AtomData data;
zeroAtomData(data);
if (pos < end) {
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
y = tileIndices.y;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
unsigned int atom1 = x*TILE_SIZE + tgx;
unsigned int atom1 = x*TILE_SIZE + tgx;
#ifdef USE_GK
loadAtomData(data, atom1, posq, inducedDipole, inducedDipolePolar, dampingAndThole, inducedDipoleS, inducedDipolePolarS, bornRadii);
loadAtomData(data, atom1, posq, inducedDipole, inducedDipolePolar, dampingAndThole, inducedDipoleS, inducedDipolePolarS, bornRadii);
#else
loadAtomData(data, atom1, posq, inducedDipole, inducedDipolePolar, dampingAndThole);
loadAtomData(data, atom1, posq, inducedDipole, inducedDipolePolar, dampingAndThole);
#endif
if (pos >= end)
; // This warp is done.
else if (x == y) {
// This tile is on the diagonal.
localData[threadIdx.x].pos = data.pos;
localData[threadIdx.x].inducedDipole = data.inducedDipole;
localData[threadIdx.x].inducedDipolePolar = data.inducedDipolePolar;
localData[threadIdx.x].thole = data.thole;
localData[threadIdx.x].damp = data.damp;
if (x == y) {
// This tile is on the diagonal.
localData[threadIdx.x].pos = data.pos;
localData[threadIdx.x].inducedDipole = data.inducedDipole;
localData[threadIdx.x].inducedDipolePolar = data.inducedDipolePolar;
localData[threadIdx.x].thole = data.thole;
localData[threadIdx.x].damp = data.damp;
#ifdef USE_GK
localData[threadIdx.x].inducedDipoleS = data.inducedDipoleS;
localData[threadIdx.x].inducedDipolePolarS = data.inducedDipolePolarS;
localData[threadIdx.x].bornRadius = data.bornRadius;
localData[threadIdx.x].inducedDipoleS = data.inducedDipoleS;
localData[threadIdx.x].inducedDipolePolarS = data.inducedDipolePolarS;
localData[threadIdx.x].bornRadius = data.bornRadius;
#endif
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real3 delta = localData[tbx+j].pos-data.pos;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real3 delta = localData[tbx+j].pos-data.pos;
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
int atom2 = y*TILE_SIZE+j;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS)
computeOneInteraction(data, localData[tbx+j], delta, atom1 == atom2);
}
int atom2 = y*TILE_SIZE+j;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS)
computeOneInteraction(data, localData[tbx+j], delta, atom1 == atom2);
}
else {
// This is an off-diagonal tile.
}
else {
// This is an off-diagonal tile.
#ifdef USE_GK
loadAtomData(localData[threadIdx.x], y*TILE_SIZE+tgx, posq, inducedDipole, inducedDipolePolar, dampingAndThole, inducedDipoleS, inducedDipolePolarS, bornRadii);
loadAtomData(localData[threadIdx.x], y*TILE_SIZE+tgx, posq, inducedDipole, inducedDipolePolar, dampingAndThole, inducedDipoleS, inducedDipolePolarS, bornRadii);
#else
loadAtomData(localData[threadIdx.x], y*TILE_SIZE+tgx, posq, inducedDipole, inducedDipolePolar, dampingAndThole);
loadAtomData(localData[threadIdx.x], y*TILE_SIZE+tgx, posq, inducedDipole, inducedDipolePolar, dampingAndThole);
#endif
zeroAtomData(localData[threadIdx.x]);
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (flags == 0) { // TODO: Figure out what the flags != 0 case doesn't work!!!
// if (flags != 0xFFFFFFFF) {
if (flags == 0) {
// No interactions in this tile.
}
/* else {
// Compute only a subset of the interactions in this tile.
for (int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
int atom2 = tbx+j;
real3 delta = localData[atom2].pos-data.pos;
zeroAtomData(localData[threadIdx.x]);
unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real3 delta = localData[tbx+tj].pos-data.pos;
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
int atom2 = y*TILE_SIZE+j;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS)
computeOneInteraction(data, localData[tbx+tj], delta, false);
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
// Write results.
unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&field[offset], static_cast<unsigned long long>((long long) (data.field.x*0x100000000)));
atomicAdd(&field[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.y*0x100000000)));
atomicAdd(&field[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.z*0x100000000)));
atomicAdd(&fieldPolar[offset], static_cast<unsigned long long>((long long) (data.fieldPolar.x*0x100000000)));
atomicAdd(&fieldPolar[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolar.y*0x100000000)));
atomicAdd(&fieldPolar[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolar.z*0x100000000)));
#ifdef USE_GK
atomicAdd(&fieldS[offset], static_cast<unsigned long long>((long long) (data.fieldS.x*0x100000000)));
atomicAdd(&fieldS[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldS.y*0x100000000)));
atomicAdd(&fieldS[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldS.z*0x100000000)));
atomicAdd(&fieldPolarS[offset], static_cast<unsigned long long>((long long) (data.fieldPolarS.x*0x100000000)));
atomicAdd(&fieldPolarS[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolarS.y*0x100000000)));
atomicAdd(&fieldPolarS[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolarS.z*0x100000000)));
#endif
if (x != y) {
offset = y*TILE_SIZE + tgx;
atomicAdd(&field[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.x*0x100000000)));
atomicAdd(&field[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.y*0x100000000)));
atomicAdd(&field[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.z*0x100000000)));
atomicAdd(&fieldPolar[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.x*0x100000000)));
atomicAdd(&fieldPolar[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.y*0x100000000)));
atomicAdd(&fieldPolar[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.z*0x100000000)));
#ifdef USE_GK
atomicAdd(&fieldS[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldS.x*0x100000000)));
atomicAdd(&fieldS[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldS.y*0x100000000)));
atomicAdd(&fieldS[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldS.z*0x100000000)));
atomicAdd(&fieldPolarS[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolarS.x*0x100000000)));
atomicAdd(&fieldPolarS[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolarS.y*0x100000000)));
atomicAdd(&fieldPolarS[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolarS.z*0x100000000)));
#endif
real3 fields[4];
computeOneInteraction(data, localData[atom2], delta, fields);
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
data.field += fields[0];
data.fieldPolar += fields[1];
#ifdef ENABLE_SHUFFLE
for (int i = 16; i >= 1; i /= 2) {
fields[2].x += __shfl_xor(fields[2].x, i, 32);
fields[2].y += __shfl_xor(fields[2].y, i, 32);
fields[2].z += __shfl_xor(fields[2].z, i, 32);
fields[3].x += __shfl_xor(fields[3].x, i, 32);
fields[3].y += __shfl_xor(fields[3].y, i, 32);
fields[3].z += __shfl_xor(fields[3].z, i, 32);
}
if (tgx == 0) {
localData[atom2].field += fields[2];
localData[atom2].fieldPolar += fields[3];
}
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
const unsigned int numTiles = interactionCount[0];
int pos = (numTiles > maxTiles ? startTileIndex+warp*numTileIndices/totalWarps : warp*numTiles/totalWarps);
int end = (numTiles > maxTiles ? startTileIndex+(warp+1)*numTileIndices/totalWarps : (warp+1)*numTiles/totalWarps);
#else
int bufferIndex = 3*threadIdx.x;
tempBuffer[bufferIndex] = fields[2].x;
tempBuffer[bufferIndex+1] = fields[2].y;
tempBuffer[bufferIndex+2] = fields[2].z;
if (tgx % 4 == 0) {
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11];
}
if (tgx == 0) {
localData[atom2].field.x += tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
localData[atom2].field.y += tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
localData[atom2].field.z += tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
}
tempBuffer[bufferIndex] = fields[3].x;
tempBuffer[bufferIndex+1] = fields[3].y;
tempBuffer[bufferIndex+2] = fields[3].z;
if (tgx % 4 == 0) {
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11];
}
if (tgx == 0) {
localData[atom2].fieldPolar.x += tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
localData[atom2].fieldPolar.y += tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
localData[atom2].fieldPolar.z += tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
}
const unsigned int numTiles = numTileIndices;
int pos = startTileIndex+warp*numTiles/totalWarps;
int end = startTileIndex+(warp+1)*numTiles/totalWarps;
#endif
int skipBase = 0;
int currentSkipIndex = tbx;
__shared__ int atomIndices[THREAD_BLOCK_SIZE];
__shared__ int skipTiles[THREAD_BLOCK_SIZE];
skipTiles[threadIdx.x] = -1;
while (pos < end) {
bool includeTile = true;
// Extract the coordinates of this tile.
unsigned int x, y;
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
}
else
#endif
}
}
}
}*/
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
// Skip over tiles that have exclusions, since they were already processed.
while (skipTiles[tbx+TILE_SIZE-1] < pos) {
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx];
skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
skipTiles[threadIdx.x] = end;
skipBase += TILE_SIZE;
currentSkipIndex = tbx;
}
while (skipTiles[currentSkipIndex] < pos)
currentSkipIndex++;
includeTile = (skipTiles[currentSkipIndex] != pos);
}
if (includeTile) {
unsigned int atom1 = x*TILE_SIZE + tgx;
// Load atom data for this tile.
AtomData data;
zeroAtomData(data);
#ifdef USE_GK
loadAtomData(data, atom1, posq, inducedDipole, inducedDipolePolar, dampingAndThole, inducedDipoleS, inducedDipolePolarS, bornRadii);
#else
loadAtomData(data, atom1, posq, inducedDipole, inducedDipolePolar, dampingAndThole);
#endif
#ifdef USE_CUTOFF
unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
#else
unsigned int j = y*TILE_SIZE + tgx;
#endif
{
// Compute the full set of interactions in this tile.
atomIndices[threadIdx.x] = j;
#ifdef USE_GK
loadAtomData(localData[threadIdx.x], j, posq, inducedDipole, inducedDipolePolar, dampingAndThole, inducedDipoleS, inducedDipolePolarS, bornRadii);
#else
loadAtomData(localData[threadIdx.x], j, posq, inducedDipole, inducedDipolePolar, dampingAndThole);
#endif
zeroAtomData(localData[threadIdx.x]);
unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real3 delta = localData[tbx+tj].pos-data.pos;
// Compute the full set of interactions in this tile.
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
real3 delta = localData[tbx+tj].pos-data.pos;
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
int atom2 = y*TILE_SIZE+j;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS)
computeOneInteraction(data, localData[tbx+tj], delta, false);
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
int atom2 = atomIndices[tbx+tj];
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS)
computeOneInteraction(data, localData[tbx+tj], delta, false);
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
// Write results.
if (pos < end) {
const unsigned int offset = x*TILE_SIZE + tgx;
// Write results.
unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&field[offset], static_cast<unsigned long long>((long long) (data.field.x*0x100000000)));
atomicAdd(&field[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.y*0x100000000)));
atomicAdd(&field[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.z*0x100000000)));
......@@ -401,9 +427,11 @@ extern "C" __global__ void computeInducedField(
atomicAdd(&fieldPolarS[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolarS.y*0x100000000)));
atomicAdd(&fieldPolarS[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolarS.z*0x100000000)));
#endif
}
if (pos < end && x != y) {
const unsigned int offset = y*TILE_SIZE + tgx;
#ifdef USE_CUTOFF
offset = atomIndices[threadIdx.x];
#else
offset = y*TILE_SIZE + tgx;
#endif
atomicAdd(&field[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.x*0x100000000)));
atomicAdd(&field[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.y*0x100000000)));
atomicAdd(&field[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.z*0x100000000)));
......@@ -420,7 +448,7 @@ extern "C" __global__ void computeInducedField(
#endif
}
pos++;
} while (pos < end);
}
}
extern "C" __global__ void updateInducedFieldBySOR(const long long* __restrict__ fixedField, const long long* __restrict__ fixedFieldPolar,
......
#define ARRAY(x,y) array[(x)-1+((y)-1)*PME_ORDER]
/**
* This is called from updateBsplines(). It calculates the spline coefficients for a single atom along a single axis.
* Calculate the spline coefficients for a single atom along a single axis.
*/
__device__ void computeBSplinePoint(real4* thetai, real w, real* array) {
// initialization to get to 2nd order recursion
......@@ -70,15 +70,10 @@ __device__ void computeBSplinePoint(real4* thetai, real w, real* array) {
}
/**
* Compute bspline coefficients.
* Compute the index of the grid point each atom is associated with.
*/
extern "C" __global__ void updateBsplines(const real4* __restrict__ posq, int4* __restrict__ igrid, int2* __restrict__ pmeAtomGridIndex,
real4* __restrict__ theta1, real4* __restrict__ theta2, real4* __restrict__ theta3, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
extern __shared__ real bsplines_cache[]; // size = block_size*pme_order*pme_order
real* array = &bsplines_cache[threadIdx.x*PME_ORDER*PME_ORDER];
// get the B-spline coefficients for each multipole site
extern "C" __global__ void findAtomGridIndex(const real4* __restrict__ posq, int2* __restrict__ pmeAtomGridIndex,
real4 periodicBoxSize, real4 invPeriodicBoxSize) {
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
real4 pos = posq[i];
pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
......@@ -90,256 +85,226 @@ extern "C" __global__ void updateBsplines(const real4* __restrict__ posq, int4*
real w = pos.x*invPeriodicBoxSize.x;
real fr = GRID_SIZE_X*(w-(int)(w+0.5f)+0.5f);
int ifr = (int) fr;
w = fr - ifr;
int igrid1 = ifr-PME_ORDER+1;
computeBSplinePoint(&theta1[i*PME_ORDER], w, array);
// Second axis.
w = pos.y*invPeriodicBoxSize.y;
fr = GRID_SIZE_Y*(w-(int)(w+0.5f)+0.5f);
ifr = (int) fr;
w = fr - ifr;
int igrid2 = ifr-PME_ORDER+1;
computeBSplinePoint(&theta2[i*PME_ORDER], w, array);
// Third axis.
w = pos.z*invPeriodicBoxSize.z;
fr = GRID_SIZE_Z*(w-(int)(w+0.5f)+0.5f);
ifr = (int) fr;
w = fr - ifr;
int igrid3 = ifr-PME_ORDER+1;
computeBSplinePoint(&theta3[i*PME_ORDER], w, array);
// Record the grid point.
igrid1 += (igrid1 < 0 ? GRID_SIZE_X : 0);
igrid2 += (igrid2 < 0 ? GRID_SIZE_Y : 0);
igrid3 += (igrid3 < 0 ? GRID_SIZE_Z : 0);
igrid[i] = make_int4(igrid1, igrid2, igrid3, 0);
pmeAtomGridIndex[i] = make_int2(i, igrid1*GRID_SIZE_Y*GRID_SIZE_Z+igrid2*GRID_SIZE_Z+igrid3);
}
}
/**
* For each grid point, find the range of sorted atoms associated with that point.
*/
extern "C" __global__ void findAtomRangeForGrid(int2* __restrict__ pmeAtomGridIndex, int* __restrict__ pmeAtomRange,
const real4* __restrict__ posq, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
int thread = blockIdx.x*blockDim.x+threadIdx.x;
int start = (NUM_ATOMS*thread)/(blockDim.x*gridDim.x);
int end = (NUM_ATOMS*(thread+1))/(blockDim.x*gridDim.x);
int last = (start == 0 ? -1 : pmeAtomGridIndex[start-1].y);
for (int i = start; i < end; ++i) {
int2 atomData = pmeAtomGridIndex[i];
int gridIndex = atomData.y;
if (gridIndex != last) {
for (int j = last+1; j <= gridIndex; ++j)
pmeAtomRange[j] = i;
last = gridIndex;
}
}
// Fill in values beyond the last atom.
if (thread == blockDim.x*gridDim.x-1) {
int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
for (int j = last+1; j <= gridSize; ++j)
pmeAtomRange[j] = NUM_ATOMS;
}
}
/**
* The grid index won't be needed again. Reuse that component to hold the z index, thus saving
* some work in the charge spreading kernel.
*/
extern "C" __global__ void recordZIndex(int2* __restrict__ pmeAtomGridIndex, const real4* __restrict__ posq, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
int thread = blockIdx.x*blockDim.x+threadIdx.x;
int start = (NUM_ATOMS*thread)/(blockDim.x*gridDim.x);
int end = (NUM_ATOMS*(thread+1))/(blockDim.x*gridDim.x);
for (int i = start; i < end; ++i) {
real posz = posq[pmeAtomGridIndex[i].x].z;
posz -= floor(posz*invPeriodicBoxSize.z)*periodicBoxSize.z;
real w = posz*invPeriodicBoxSize.z;
real fr = GRID_SIZE_Z*(w-(int)(w+0.5f)+0.5f);
int z = ((int) fr)-PME_ORDER+1;
pmeAtomGridIndex[i].y = z;
}
}
extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ posq, const real* __restrict__ labFrameDipole,
const real* __restrict__ labFrameQuadrupole, real2* __restrict__ pmeGrid, int2* __restrict__ pmeAtomGridIndex, int* __restrict__ pmeAtomRange,
const real4* __restrict__ theta1, const real4* __restrict__ theta2, const real4* __restrict__ theta3, real4 invPeriodicBoxSize) {
const real* __restrict__ labFrameQuadrupole, real2* __restrict__ pmeGrid, int2* __restrict__ pmeAtomGridIndex,
real4 periodicBoxSize, real4 invPeriodicBoxSize) {
const real xscale = GRID_SIZE_X*invPeriodicBoxSize.x;
const real yscale = GRID_SIZE_Y*invPeriodicBoxSize.y;
const real zscale = GRID_SIZE_Z*invPeriodicBoxSize.z;
unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
unsigned int numThreads = gridDim.x*blockDim.x;
for (int gridIndex = blockIdx.x*blockDim.x+threadIdx.x; gridIndex < numGridPoints; gridIndex += numThreads) {
int3 gridPoint;
gridPoint.x = gridIndex/(GRID_SIZE_Y*GRID_SIZE_Z);
int remainder = gridIndex-gridPoint.x*GRID_SIZE_Y*GRID_SIZE_Z;
gridPoint.y = remainder/GRID_SIZE_Z;
gridPoint.z = remainder-gridPoint.y*GRID_SIZE_Z;
real result = 0;
for (int ix = 0; ix < PME_ORDER; ++ix) {
int x = gridPoint.x-ix+(gridPoint.x >= ix ? 0 : GRID_SIZE_X);
for (int iy = 0; iy < PME_ORDER; ++iy) {
int y = gridPoint.y-iy+(gridPoint.y >= iy ? 0 : GRID_SIZE_Y);
int z1 = gridPoint.z-PME_ORDER+1;
z1 += (z1 >= 0 ? 0 : GRID_SIZE_Z);
int z2 = (z1 < gridPoint.z ? gridPoint.z : GRID_SIZE_Z-1);
int gridIndex1 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z1;
int gridIndex2 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z2;
int firstAtom = pmeAtomRange[gridIndex1];
int lastAtom = pmeAtomRange[gridIndex2+1];
for (int i = firstAtom; i < lastAtom; ++i) {
int2 atomData = pmeAtomGridIndex[i];
int atomIndex = atomData.x;
int z = atomData.y;
int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
if (iz >= GRID_SIZE_Z)
iz -= GRID_SIZE_Z;
real atomCharge = posq[atomIndex].w;
real atomDipoleX = xscale*labFrameDipole[atomIndex*3];
real atomDipoleY = yscale*labFrameDipole[atomIndex*3+1];
real atomDipoleZ = zscale*labFrameDipole[atomIndex*3+2];
real atomQuadrupoleXX = xscale*xscale*labFrameQuadrupole[atomIndex*5];
real atomQuadrupoleXY = 2*xscale*yscale*labFrameQuadrupole[atomIndex*5+1];
real atomQuadrupoleXZ = 2*xscale*zscale*labFrameQuadrupole[atomIndex*5+2];
real atomQuadrupoleYY = yscale*yscale*labFrameQuadrupole[atomIndex*5+3];
real atomQuadrupoleYZ = 2*yscale*zscale*labFrameQuadrupole[atomIndex*5+4];
real atomQuadrupoleZZ = -zscale*zscale*(labFrameQuadrupole[atomIndex*5]+labFrameQuadrupole[atomIndex*5+3]);
real4 t = theta1[atomIndex*PME_ORDER+ix];
real4 u = theta2[atomIndex*PME_ORDER+iy];
real4 v = theta3[atomIndex*PME_ORDER+iz];
real array[PME_ORDER*PME_ORDER];
real4 theta1[PME_ORDER];
real4 theta2[PME_ORDER];
real4 theta3[PME_ORDER];
// Process the atoms in spatially sorted order. This improves cache performance when loading
// the grid values.
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
int m = pmeAtomGridIndex[i].x;
real4 pos = posq[m];
pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
// Since we need the full set of thetas, it's faster to compute them here than load them
// from global memory.
real w = pos.x*invPeriodicBoxSize.x;
real fr = GRID_SIZE_X*(w-(int)(w+0.5f)+0.5f);
int ifr = (int) fr;
w = fr - ifr;
int igrid1 = ifr-PME_ORDER+1;
computeBSplinePoint(theta1, w, array);
w = pos.y*invPeriodicBoxSize.y;
fr = GRID_SIZE_Y*(w-(int)(w+0.5f)+0.5f);
ifr = (int) fr;
w = fr - ifr;
int igrid2 = ifr-PME_ORDER+1;
computeBSplinePoint(theta2, w, array);
w = pos.z*invPeriodicBoxSize.z;
fr = GRID_SIZE_Z*(w-(int)(w+0.5f)+0.5f);
ifr = (int) fr;
w = fr - ifr;
int igrid3 = ifr-PME_ORDER+1;
computeBSplinePoint(theta3, w, array);
igrid1 += (igrid1 < 0 ? GRID_SIZE_X : 0);
igrid2 += (igrid2 < 0 ? GRID_SIZE_Y : 0);
igrid3 += (igrid3 < 0 ? GRID_SIZE_Z : 0);
// Spread the charge from this atom onto each grid point.
for (int ix = 0; ix < PME_ORDER; ix++) {
int xbase = igrid1+ix;
xbase -= (xbase >= GRID_SIZE_X ? GRID_SIZE_X : 0);
xbase = xbase*GRID_SIZE_Y*GRID_SIZE_Z;
real4 t = theta1[ix];
for (int iy = 0; iy < PME_ORDER; iy++) {
int ybase = igrid2+iy;
ybase -= (ybase >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
ybase = xbase + ybase*GRID_SIZE_Z;
real4 u = theta2[iy];
for (int iz = 0; iz < PME_ORDER; iz++) {
int zindex = igrid3+iz;
zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
int index = ybase + zindex;
real4 v = theta3[iz];
real atomCharge = pos.w;
real atomDipoleX = xscale*labFrameDipole[m*3];
real atomDipoleY = yscale*labFrameDipole[m*3+1];
real atomDipoleZ = zscale*labFrameDipole[m*3+2];
real atomQuadrupoleXX = xscale*xscale*labFrameQuadrupole[m*5];
real atomQuadrupoleXY = 2*xscale*yscale*labFrameQuadrupole[m*5+1];
real atomQuadrupoleXZ = 2*xscale*zscale*labFrameQuadrupole[m*5+2];
real atomQuadrupoleYY = yscale*yscale*labFrameQuadrupole[m*5+3];
real atomQuadrupoleYZ = 2*yscale*zscale*labFrameQuadrupole[m*5+4];
real atomQuadrupoleZZ = -zscale*zscale*(labFrameQuadrupole[m*5]+labFrameQuadrupole[m*5+3]);
real term0 = atomCharge*u.x*v.x + atomDipoleY*u.y*v.x + atomDipoleZ*u.x*v.y + atomQuadrupoleYY*u.z*v.x + atomQuadrupoleZZ*u.x*v.z + atomQuadrupoleYZ*u.y*v.y;
real term1 = atomDipoleX*u.x*v.x + atomQuadrupoleXY*u.y*v.x + atomQuadrupoleXZ*u.x*v.y;
real term2 = atomQuadrupoleXX * u.x * v.x;
result += term0*t.x + term1*t.y + term2*t.z;
}
if (z1 > gridPoint.z) {
gridIndex1 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z;
gridIndex2 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+gridPoint.z;
firstAtom = pmeAtomRange[gridIndex1];
lastAtom = pmeAtomRange[gridIndex2+1];
for (int i = firstAtom; i < lastAtom; ++i) {
int2 atomData = pmeAtomGridIndex[i];
int atomIndex = atomData.x;
int z = atomData.y;
int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
if (iz >= GRID_SIZE_Z)
iz -= GRID_SIZE_Z;
real atomCharge = posq[atomIndex].w;
real atomDipoleX = xscale*labFrameDipole[atomIndex*3];
real atomDipoleY = yscale*labFrameDipole[atomIndex*3+1];
real atomDipoleZ = zscale*labFrameDipole[atomIndex*3+2];
real atomQuadrupoleXX = xscale*xscale*labFrameQuadrupole[atomIndex*5];
real atomQuadrupoleXY = 2*xscale*yscale*labFrameQuadrupole[atomIndex*5+1];
real atomQuadrupoleXZ = 2*xscale*zscale*labFrameQuadrupole[atomIndex*5+2];
real atomQuadrupoleYY = yscale*yscale*labFrameQuadrupole[atomIndex*5+3];
real atomQuadrupoleYZ = 2*yscale*zscale*labFrameQuadrupole[atomIndex*5+4];
real atomQuadrupoleZZ = -zscale*zscale*(labFrameQuadrupole[atomIndex*5]+labFrameQuadrupole[atomIndex*5+3]);
real4 t = theta1[atomIndex*PME_ORDER+ix];
real4 u = theta2[atomIndex*PME_ORDER+iy];
real4 v = theta3[atomIndex*PME_ORDER+iz];
real term0 = atomCharge*u.x*v.x + atomDipoleY*u.y*v.x + atomDipoleZ*u.x*v.y + atomQuadrupoleYY*u.z*v.x + atomQuadrupoleZZ*u.x*v.z + atomQuadrupoleYZ*u.y*v.y;
real term1 = atomDipoleX*u.x*v.x + atomQuadrupoleXY*u.y*v.x + atomQuadrupoleXZ*u.x*v.y;
real term2 = atomQuadrupoleXX * u.x * v.x;
result += term0*t.x + term1*t.y + term2*t.z;
}
real add = term0*t.x + term1*t.y + term2*t.z;
#ifdef USE_DOUBLE_PRECISION
unsigned long long * ulonglong_p = (unsigned long long *) pmeGrid;
atomicAdd(&ulonglong_p[2*index], static_cast<unsigned long long>((long long) (add*0x100000000)));
#else
atomicAdd(&pmeGrid[index].x, add);
#endif
}
}
}
pmeGrid[gridIndex] = make_real2(result, 0);
}
}
extern "C" __global__ void gridSpreadInducedDipoles(const real4* __restrict__ posq, const real* __restrict__ inducedDipole,
const real* __restrict__ inducedDipolePolar, real2* __restrict__ pmeGrid, int2* __restrict__ pmeAtomGridIndex, int* __restrict__ pmeAtomRange,
const real4* __restrict__ theta1, const real4* __restrict__ theta2, const real4* __restrict__ theta3, real4 invPeriodicBoxSize) {
const real* __restrict__ inducedDipolePolar, real2* __restrict__ pmeGrid, int2* __restrict__ pmeAtomGridIndex,
real4 periodicBoxSize, real4 invPeriodicBoxSize) {
const real xscale = GRID_SIZE_X*invPeriodicBoxSize.x;
const real yscale = GRID_SIZE_Y*invPeriodicBoxSize.y;
const real zscale = GRID_SIZE_Z*invPeriodicBoxSize.z;
unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
unsigned int numThreads = gridDim.x*blockDim.x;
for (int gridIndex = blockIdx.x*blockDim.x+threadIdx.x; gridIndex < numGridPoints; gridIndex += numThreads) {
int3 gridPoint;
gridPoint.x = gridIndex/(GRID_SIZE_Y*GRID_SIZE_Z);
int remainder = gridIndex-gridPoint.x*GRID_SIZE_Y*GRID_SIZE_Z;
gridPoint.y = remainder/GRID_SIZE_Z;
gridPoint.z = remainder-gridPoint.y*GRID_SIZE_Z;
real2 result = make_real2(0, 0);
for (int ix = 0; ix < PME_ORDER; ++ix) {
int x = gridPoint.x-ix+(gridPoint.x >= ix ? 0 : GRID_SIZE_X);
for (int iy = 0; iy < PME_ORDER; ++iy) {
int y = gridPoint.y-iy+(gridPoint.y >= iy ? 0 : GRID_SIZE_Y);
int z1 = gridPoint.z-PME_ORDER+1;
z1 += (z1 >= 0 ? 0 : GRID_SIZE_Z);
int z2 = (z1 < gridPoint.z ? gridPoint.z : GRID_SIZE_Z-1);
int gridIndex1 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z1;
int gridIndex2 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z2;
int firstAtom = pmeAtomRange[gridIndex1];
int lastAtom = pmeAtomRange[gridIndex2+1];
for (int i = firstAtom; i < lastAtom; ++i) {
int2 atomData = pmeAtomGridIndex[i];
int atomIndex = atomData.x;
int z = atomData.y;
int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
if (iz >= GRID_SIZE_Z)
iz -= GRID_SIZE_Z;
real inducedDipoleX = xscale*inducedDipole[atomIndex*3];
real inducedDipoleY = yscale*inducedDipole[atomIndex*3+1];
real inducedDipoleZ = zscale*inducedDipole[atomIndex*3+2];
real inducedDipolePolarX = xscale*inducedDipolePolar[atomIndex*3];
real inducedDipolePolarY = yscale*inducedDipolePolar[atomIndex*3+1];
real inducedDipolePolarZ = zscale*inducedDipolePolar[atomIndex*3+2];
real4 t = theta1[atomIndex*PME_ORDER+ix];
real4 u = theta2[atomIndex*PME_ORDER+iy];
real4 v = theta3[atomIndex*PME_ORDER+iz];
real array[PME_ORDER*PME_ORDER];
real4 theta1[PME_ORDER];
real4 theta2[PME_ORDER];
real4 theta3[PME_ORDER];
// Process the atoms in spatially sorted order. This improves cache performance when loading
// the grid values.
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
int m = pmeAtomGridIndex[i].x;
real4 pos = posq[m];
pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
// Since we need the full set of thetas, it's faster to compute them here than load them
// from global memory.
real w = pos.x*invPeriodicBoxSize.x;
real fr = GRID_SIZE_X*(w-(int)(w+0.5f)+0.5f);
int ifr = (int) fr;
w = fr - ifr;
int igrid1 = ifr-PME_ORDER+1;
computeBSplinePoint(theta1, w, array);
w = pos.y*invPeriodicBoxSize.y;
fr = GRID_SIZE_Y*(w-(int)(w+0.5f)+0.5f);
ifr = (int) fr;
w = fr - ifr;
int igrid2 = ifr-PME_ORDER+1;
computeBSplinePoint(theta2, w, array);
w = pos.z*invPeriodicBoxSize.z;
fr = GRID_SIZE_Z*(w-(int)(w+0.5f)+0.5f);
ifr = (int) fr;
w = fr - ifr;
int igrid3 = ifr-PME_ORDER+1;
computeBSplinePoint(theta3, w, array);
igrid1 += (igrid1 < 0 ? GRID_SIZE_X : 0);
igrid2 += (igrid2 < 0 ? GRID_SIZE_Y : 0);
igrid3 += (igrid3 < 0 ? GRID_SIZE_Z : 0);
// Spread the charge from this atom onto each grid point.
for (int ix = 0; ix < PME_ORDER; ix++) {
int xbase = igrid1+ix;
xbase -= (xbase >= GRID_SIZE_X ? GRID_SIZE_X : 0);
xbase = xbase*GRID_SIZE_Y*GRID_SIZE_Z;
real4 t = theta1[ix];
for (int iy = 0; iy < PME_ORDER; iy++) {
int ybase = igrid2+iy;
ybase -= (ybase >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
ybase = xbase + ybase*GRID_SIZE_Z;
real4 u = theta2[iy];
for (int iz = 0; iz < PME_ORDER; iz++) {
int zindex = igrid3+iz;
zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
int index = ybase + zindex;
real4 v = theta3[iz];
real inducedDipoleX = xscale*inducedDipole[m*3];
real inducedDipoleY = yscale*inducedDipole[m*3+1];
real inducedDipoleZ = zscale*inducedDipole[m*3+2];
real inducedDipolePolarX = xscale*inducedDipolePolar[m*3];
real inducedDipolePolarY = yscale*inducedDipolePolar[m*3+1];
real inducedDipolePolarZ = zscale*inducedDipolePolar[m*3+2];
real term01 = inducedDipoleY*u.y*v.x + inducedDipoleZ*u.x*v.y;
real term11 = inducedDipoleX*u.x*v.x;
real term02 = inducedDipolePolarY*u.y*v.x + inducedDipolePolarZ*u.x*v.y;
real term12 = inducedDipolePolarX*u.x*v.x;
result.x += term01*t.x + term11*t.y;
result.y += term02*t.x + term12*t.y;
}
if (z1 > gridPoint.z) {
gridIndex1 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z;
gridIndex2 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+gridPoint.z;
firstAtom = pmeAtomRange[gridIndex1];
lastAtom = pmeAtomRange[gridIndex2+1];
for (int i = firstAtom; i < lastAtom; ++i) {
int2 atomData = pmeAtomGridIndex[i];
int atomIndex = atomData.x;
int z = atomData.y;
int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
if (iz >= GRID_SIZE_Z)
iz -= GRID_SIZE_Z;
real inducedDipoleX = xscale*inducedDipole[atomIndex*3];
real inducedDipoleY = yscale*inducedDipole[atomIndex*3+1];
real inducedDipoleZ = zscale*inducedDipole[atomIndex*3+2];
real inducedDipolePolarX = xscale*inducedDipolePolar[atomIndex*3];
real inducedDipolePolarY = yscale*inducedDipolePolar[atomIndex*3+1];
real inducedDipolePolarZ = zscale*inducedDipolePolar[atomIndex*3+2];
real4 t = theta1[atomIndex*PME_ORDER+ix];
real4 u = theta2[atomIndex*PME_ORDER+iy];
real4 v = theta3[atomIndex*PME_ORDER+iz];
real term01 = inducedDipoleY*u.y*v.x + inducedDipoleZ*u.x*v.y;
real term11 = inducedDipoleX*u.x*v.x;
real term02 = inducedDipolePolarY*u.y*v.x + inducedDipolePolarZ*u.x*v.y;
real term12 = inducedDipolePolarX*u.x*v.x;
result.x += term01*t.x + term11*t.y;
result.y += term02*t.x + term12*t.y;
}
real add1 = term01*t.x + term11*t.y;
real add2 = term02*t.x + term12*t.y;
#ifdef USE_DOUBLE_PRECISION
unsigned long long * ulonglong_p = (unsigned long long *) pmeGrid;
atomicAdd(&ulonglong_p[2*index], static_cast<unsigned long long>((long long) (add1*0x100000000)));
atomicAdd(&ulonglong_p[2*index+1], static_cast<unsigned long long>((long long) (add2*0x100000000)));
#else
atomicAdd(&pmeGrid[index].x, add1);
atomicAdd(&pmeGrid[index].y, add2);
#endif
}
}
}
pmeGrid[gridIndex] = result;
}
}
/**
* In double precision, we have to use fixed point to accumulate the grid values, so convert them to floating point.
*/
extern "C" __global__ void finishSpreadCharge(long long* __restrict__ pmeGrid) {
real* floatGrid = (real*) pmeGrid;
const unsigned int gridSize = 2*GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
real scale = 1/(real) 0x100000000;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x)
floatGrid[index] = scale*pmeGrid[index];
}
extern "C" __global__ void reciprocalConvolution(real2* __restrict__ pmeGrid, const real* __restrict__ pmeBsplineModuliX,
const real* __restrict__ pmeBsplineModuliY, const real* __restrict__ pmeBsplineModuliZ, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
......@@ -372,12 +337,50 @@ extern "C" __global__ void reciprocalConvolution(real2* __restrict__ pmeGrid, co
}
extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict__ pmeGrid, real* __restrict__ phi,
long long* __restrict__ fieldBuffers, long long* __restrict__ fieldPolarBuffers, const int4* __restrict__ igrid, const real4* __restrict__ theta1,
const real4* __restrict__ theta2, const real4* __restrict__ theta3, const real* __restrict__ labFrameDipole, real4 invPeriodicBoxSize) {
// extract the permanent multipole field at each site
long long* __restrict__ fieldBuffers, long long* __restrict__ fieldPolarBuffers, const real4* __restrict__ posq,
const real* __restrict__ labFrameDipole, real4 periodicBoxSize, real4 invPeriodicBoxSize, int2* __restrict__ pmeAtomGridIndex) {
real array[PME_ORDER*PME_ORDER];
real4 theta1[PME_ORDER];
real4 theta2[PME_ORDER];
real4 theta3[PME_ORDER];
// Process the atoms in spatially sorted order. This improves cache performance when loading
// the grid values.
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
int m = pmeAtomGridIndex[i].x;
real4 pos = posq[m];
pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
// Since we need the full set of thetas, it's faster to compute them here than load them
// from global memory.
real w = pos.x*invPeriodicBoxSize.x;
real fr = GRID_SIZE_X*(w-(int)(w+0.5f)+0.5f);
int ifr = (int) fr;
w = fr - ifr;
int igrid1 = ifr-PME_ORDER+1;
computeBSplinePoint(theta1, w, array);
w = pos.y*invPeriodicBoxSize.y;
fr = GRID_SIZE_Y*(w-(int)(w+0.5f)+0.5f);
ifr = (int) fr;
w = fr - ifr;
int igrid2 = ifr-PME_ORDER+1;
computeBSplinePoint(theta2, w, array);
w = pos.z*invPeriodicBoxSize.z;
fr = GRID_SIZE_Z*(w-(int)(w+0.5f)+0.5f);
ifr = (int) fr;
w = fr - ifr;
int igrid3 = ifr-PME_ORDER+1;
computeBSplinePoint(theta3, w, array);
igrid1 += (igrid1 < 0 ? GRID_SIZE_X : 0);
igrid2 += (igrid2 < 0 ? GRID_SIZE_Y : 0);
igrid3 += (igrid3 < 0 ? GRID_SIZE_Z : 0);
// Compute the potential from this grid point.
for (int m = blockIdx.x*blockDim.x+threadIdx.x; m < NUM_ATOMS; m += blockDim.x*gridDim.x) {
int4 gridPoint = igrid[m];
real tuv000 = 0;
real tuv001 = 0;
real tuv010 = 0;
......@@ -399,8 +402,8 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
real tuv012 = 0;
real tuv111 = 0;
for (int iz = 0; iz < PME_ORDER; iz++) {
int k = gridPoint.z+iz-(gridPoint.z+iz >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
real4 v = theta3[m*PME_ORDER+iz];
int k = igrid3+iz-(igrid3+iz >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
real4 v = theta3[iz];
real tu00 = 0;
real tu10 = 0;
real tu01 = 0;
......@@ -412,14 +415,14 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
real tu12 = 0;
real tu03 = 0;
for (int iy = 0; iy < PME_ORDER; iy++) {
int j = gridPoint.y+iy-(gridPoint.y+iy >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
real4 u = theta2[m*PME_ORDER+iy];
int j = igrid2+iy-(igrid2+iy >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
real4 u = theta2[iy];
real4 t = make_real4(0, 0, 0, 0);
for (int ix = 0; ix < PME_ORDER; ix++) {
int i = gridPoint.x+ix-(gridPoint.x+ix >= GRID_SIZE_X ? GRID_SIZE_X : 0);
int i = igrid1+ix-(igrid1+ix >= GRID_SIZE_X ? GRID_SIZE_X : 0);
int gridIndex = i*GRID_SIZE_Y*GRID_SIZE_Z + j*GRID_SIZE_Z + k;
real tq = pmeGrid[gridIndex].x;
real4 tadd = theta1[m*PME_ORDER+ix];
real4 tadd = theta1[ix];
t.x += tq*tadd.x;
t.y += tq*tadd.y;
t.z += tq*tadd.z;
......@@ -491,12 +494,50 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
}
extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restrict__ pmeGrid, real* __restrict__ phid,
real* __restrict__ phip, real* __restrict__ phidp, const int4* __restrict__ igrid, const real4* __restrict__ theta1,
const real4* __restrict__ theta2, const real4* __restrict__ theta3, real4 invPeriodicBoxSize) {
// extract the induced dipole field at each site
real* __restrict__ phip, real* __restrict__ phidp, const real4* __restrict__ posq,
real4 periodicBoxSize, real4 invPeriodicBoxSize, int2* __restrict__ pmeAtomGridIndex) {
real array[PME_ORDER*PME_ORDER];
real4 theta1[PME_ORDER];
real4 theta2[PME_ORDER];
real4 theta3[PME_ORDER];
// Process the atoms in spatially sorted order. This improves cache performance when loading
// the grid values.
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
int m = pmeAtomGridIndex[i].x;
real4 pos = posq[m];
pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
// Since we need the full set of thetas, it's faster to compute them here than load them
// from global memory.
real w = pos.x*invPeriodicBoxSize.x;
real fr = GRID_SIZE_X*(w-(int)(w+0.5f)+0.5f);
int ifr = (int) fr;
w = fr - ifr;
int igrid1 = ifr-PME_ORDER+1;
computeBSplinePoint(theta1, w, array);
w = pos.y*invPeriodicBoxSize.y;
fr = GRID_SIZE_Y*(w-(int)(w+0.5f)+0.5f);
ifr = (int) fr;
w = fr - ifr;
int igrid2 = ifr-PME_ORDER+1;
computeBSplinePoint(theta2, w, array);
w = pos.z*invPeriodicBoxSize.z;
fr = GRID_SIZE_Z*(w-(int)(w+0.5f)+0.5f);
ifr = (int) fr;
w = fr - ifr;
int igrid3 = ifr-PME_ORDER+1;
computeBSplinePoint(theta3, w, array);
igrid1 += (igrid1 < 0 ? GRID_SIZE_X : 0);
igrid2 += (igrid2 < 0 ? GRID_SIZE_Y : 0);
igrid3 += (igrid3 < 0 ? GRID_SIZE_Z : 0);
// Compute the potential from this grid point.
for (int m = blockIdx.x*blockDim.x+threadIdx.x; m < NUM_ATOMS; m += blockDim.x*gridDim.x) {
int4 gridPoint = igrid[m];
real tuv100_1 = 0;
real tuv010_1 = 0;
real tuv001_1 = 0;
......@@ -536,8 +577,8 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
real tuv012 = 0;
real tuv111 = 0;
for (int iz = 0; iz < PME_ORDER; iz++) {
int k = gridPoint.z+iz-(gridPoint.z+iz >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
real4 v = theta3[m*PME_ORDER+iz];
int k = igrid3+iz-(igrid3+iz >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
real4 v = theta3[iz];
real tu00_1 = 0;
real tu01_1 = 0;
real tu10_1 = 0;
......@@ -561,8 +602,8 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
real tu12 = 0;
real tu03 = 0;
for (int iy = 0; iy < PME_ORDER; iy++) {
int j = gridPoint.y+iy-(gridPoint.y+iy >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
real4 u = theta2[m*PME_ORDER+iy];
int j = igrid2+iy-(igrid2+iy >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
real4 u = theta2[iy];
real t0_1 = 0;
real t1_1 = 0;
real t2_1 = 0;
......@@ -571,10 +612,10 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
real t2_2 = 0;
real t3 = 0;
for (int ix = 0; ix < PME_ORDER; ix++) {
int i = gridPoint.x+ix-(gridPoint.x+ix >= GRID_SIZE_X ? GRID_SIZE_X : 0);
int i = igrid1+ix-(igrid1+ix >= GRID_SIZE_X ? GRID_SIZE_X : 0);
int gridIndex = i*GRID_SIZE_Y*GRID_SIZE_Z + j*GRID_SIZE_Z + k;
real2 tq = pmeGrid[gridIndex];
real4 tadd = theta1[m*PME_ORDER+ix];
real4 tadd = theta1[ix];
t0_1 += tq.x*tadd.x;
t1_1 += tq.x*tadd.y;
t2_1 += tq.x*tadd.z;
......
#define TILE_SIZE 32
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
typedef struct {
......@@ -182,253 +181,223 @@ __device__ void computeSelfEnergyAndTorque(AtomData& atom1, real& energy) {
*/
extern "C" __global__ void computeElectrostatics(
unsigned long long* __restrict__ forceBuffers, unsigned long long* __restrict__ torqueBuffers, real* __restrict__ energyBuffer,
const real4* __restrict__ posq, const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
const uint2* __restrict__ covalentFlags, const unsigned int* __restrict__ polarizationGroupFlags, unsigned int startTileIndex, unsigned int numTileIndices,
const real4* __restrict__ posq, const uint2* __restrict__ covalentFlags, const unsigned int* __restrict__ polarizationGroupFlags,
const ushort2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices,
#ifdef USE_CUTOFF
const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags,
const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const real4* __restrict__ blockCenter, const unsigned int* __restrict__ interactingAtoms,
#endif
const real* __restrict__ labFrameDipole, const real* __restrict__ labFrameQuadrupole, const real* __restrict__ inducedDipole,
const real* __restrict__ inducedDipolePolar, const float2* __restrict__ dampingAndThole) {
unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx;
real energy = 0;
__shared__ AtomData localData[THREAD_BLOCK_SIZE];
// First loop: process tiles that contain exclusions.
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
AtomData data;
unsigned int atom1 = x*TILE_SIZE + tgx;
loadAtomData(data, atom1, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
data.force = make_real3(0);
data.torque = make_real3(0);
uint2 covalent = covalentFlags[pos*TILE_SIZE+tgx];
unsigned int polarizationGroup = polarizationGroupFlags[pos*TILE_SIZE+tgx];
if (x == y) {
// This tile is on the diagonal.
localData[threadIdx.x].pos = data.pos;
localData[threadIdx.x].q = data.q;
localData[threadIdx.x].dipole = data.dipole;
localData[threadIdx.x].quadrupoleXX = data.quadrupoleXX;
localData[threadIdx.x].quadrupoleXY = data.quadrupoleXY;
localData[threadIdx.x].quadrupoleXZ = data.quadrupoleXZ;
localData[threadIdx.x].quadrupoleYY = data.quadrupoleYY;
localData[threadIdx.x].quadrupoleYZ = data.quadrupoleYZ;
localData[threadIdx.x].inducedDipole = data.inducedDipole;
localData[threadIdx.x].inducedDipolePolar = data.inducedDipolePolar;
localData[threadIdx.x].thole = data.thole;
localData[threadIdx.x].damp = data.damp;
// Compute forces.
for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = y*TILE_SIZE+j;
if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
float d = computeDScaleFactor(polarizationGroup, j);
float p = computePScaleFactor(covalent, polarizationGroup, j);
float m = computeMScaleFactor(covalent, j);
computeOneInteraction(data, localData[tbx+j], true, d, p, m, 0.5f, energy, periodicBoxSize, invPeriodicBoxSize);
}
}
if (atom1 < NUM_ATOMS)
computeSelfEnergyAndTorque(data, energy);
data.force *= -ENERGY_SCALE_FACTOR;
data.torque *= ENERGY_SCALE_FACTOR;
atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
atomicAdd(&torqueBuffers[atom1], static_cast<unsigned long long>((long long) (data.torque.x*0x100000000)));
atomicAdd(&torqueBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.y*0x100000000)));
atomicAdd(&torqueBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.z*0x100000000)));
}
else {
// This is an off-diagonal tile.
unsigned int j = y*TILE_SIZE + tgx;
loadAtomData(localData[threadIdx.x], j, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
localData[threadIdx.x].force = make_real3(0);
localData[threadIdx.x].torque = make_real3(0);
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = y*TILE_SIZE+tj;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
float d = computeDScaleFactor(polarizationGroup, tj);
float p = computePScaleFactor(covalent, polarizationGroup, tj);
float m = computeMScaleFactor(covalent, tj);
computeOneInteraction(data, localData[tbx+tj], true, d, p, m, 1, energy, periodicBoxSize, invPeriodicBoxSize);
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
data.force *= -ENERGY_SCALE_FACTOR;
data.torque *= ENERGY_SCALE_FACTOR;
localData[threadIdx.x].force *= -ENERGY_SCALE_FACTOR;
localData[threadIdx.x].torque *= ENERGY_SCALE_FACTOR;
unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (data.torque.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.y*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.z*0x100000000)));
offset = y*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.y*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.z*0x100000000)));
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
const unsigned int numTiles = interactionCount[0];
unsigned int pos = (numTiles > maxTiles ? startTileIndex+warp*numTileIndices/totalWarps : warp*numTiles/totalWarps);
unsigned int end = (numTiles > maxTiles ? startTileIndex+(warp+1)*numTileIndices/totalWarps : (warp+1)*numTiles/totalWarps);
int pos = (numTiles > maxTiles ? startTileIndex+warp*numTileIndices/totalWarps : warp*numTiles/totalWarps);
int end = (numTiles > maxTiles ? startTileIndex+(warp+1)*numTileIndices/totalWarps : (warp+1)*numTiles/totalWarps);
#else
const unsigned int numTiles = numTileIndices;
unsigned int pos = startTileIndex+warp*numTiles/totalWarps;
unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps;
#endif
real energy = 0;
__shared__ AtomData localData[THREAD_BLOCK_SIZE];
__shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
__shared__ int exclusionIndex[WARPS_PER_GROUP];
#ifndef ENABLE_SHUFFLE
__shared__ real tempBuffer[3*THREAD_BLOCK_SIZE];
int pos = startTileIndex+warp*numTiles/totalWarps;
int end = startTileIndex+(warp+1)*numTiles/totalWarps;
#endif
int skipBase = 0;
int currentSkipIndex = tbx;
__shared__ int atomIndices[THREAD_BLOCK_SIZE];
__shared__ int skipTiles[THREAD_BLOCK_SIZE];
skipTiles[threadIdx.x] = -1;
do {
// Extract the coordinates of this tile
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx;
const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
while (pos < end) {
bool includeTile = true;
// Extract the coordinates of this tile.
unsigned int x, y;
AtomData data;
if (pos < end) {
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
y = tileIndices.y;
}
else
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
// Skip over tiles that have exclusions, since they were already processed.
while (skipTiles[tbx+TILE_SIZE-1] < pos) {
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx];
skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
skipTiles[threadIdx.x] = end;
skipBase += TILE_SIZE;
currentSkipIndex = tbx;
}
while (skipTiles[currentSkipIndex] < pos)
currentSkipIndex++;
includeTile = (skipTiles[currentSkipIndex] != pos);
}
if (includeTile) {
unsigned int atom1 = x*TILE_SIZE + tgx;
// Load atom data for this tile.
AtomData data;
loadAtomData(data, atom1, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
data.force = make_real3(0);
data.torque = make_real3(0);
// Locate the exclusion data for this tile.
if (tgx < 2)
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
if (pos >= end)
; // This warp is done.
else if (x == y) {
// This tile is on the diagonal.
localData[threadIdx.x].pos = data.pos;
localData[threadIdx.x].q = data.q;
localData[threadIdx.x].dipole = data.dipole;
localData[threadIdx.x].quadrupoleXX = data.quadrupoleXX;
localData[threadIdx.x].quadrupoleXY = data.quadrupoleXY;
localData[threadIdx.x].quadrupoleXZ = data.quadrupoleXZ;
localData[threadIdx.x].quadrupoleYY = data.quadrupoleYY;
localData[threadIdx.x].quadrupoleYZ = data.quadrupoleYZ;
localData[threadIdx.x].inducedDipole = data.inducedDipole;
localData[threadIdx.x].inducedDipolePolar = data.inducedDipolePolar;
localData[threadIdx.x].thole = data.thole;
localData[threadIdx.x].damp = data.damp;
uint2 covalent = covalentFlags[exclusionIndex[localGroupIndex]+tgx];
unsigned int polarizationGroup = polarizationGroupFlags[exclusionIndex[localGroupIndex]+tgx];
// Compute forces.
for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = y*TILE_SIZE+j;
if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
float d = computeDScaleFactor(polarizationGroup, j);
float p = computePScaleFactor(covalent, polarizationGroup, j);
float m = computeMScaleFactor(covalent, j);
computeOneInteraction(data, localData[tbx+j], hasExclusions, d, p, m, 0.5f, energy, periodicBoxSize, invPeriodicBoxSize);
}
}
if (atom1 < NUM_ATOMS)
computeSelfEnergyAndTorque(data, energy);
data.force *= -ENERGY_SCALE_FACTOR;
data.torque *= ENERGY_SCALE_FACTOR;
atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
atomicAdd(&torqueBuffers[atom1], static_cast<unsigned long long>((long long) (data.torque.x*0x100000000)));
atomicAdd(&torqueBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.y*0x100000000)));
atomicAdd(&torqueBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.z*0x100000000)));
}
else {
// This is an off-diagonal tile.
unsigned int j = y*TILE_SIZE + tgx;
loadAtomData(localData[threadIdx.x], j, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
localData[threadIdx.x].force = make_real3(0);
localData[threadIdx.x].torque = make_real3(0);
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (!hasExclusions && flags == 0) { // TODO: Why doesn't the flags != 0 block work?
// if (!hasExclusions && flags != 0xFFFFFFFF) {
if (flags == 0) {
// No interactions in this tile.
}
else {
// Compute only a subset of the interactions in this tile.
for (j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
int atom2 = tbx+j;
real3 oldForce = localData[atom2].force;
real3 oldTorque = localData[atom2].torque;
localData[atom2].force = make_real3(0);
localData[atom2].torque = make_real3(0);
computeOneInteraction(data, localData[tbx+j], false, 1, 1, 1, 1, energy, periodicBoxSize, invPeriodicBoxSize);
real3 newForce = localData[atom2].force;
real3 newTorque = localData[atom2].torque;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#ifdef ENABLE_SHUFFLE
for (int i = 16; i >= 1; i /= 2) {
newForce.x += __shfl_xor(newForce.x, i, 32);
newForce.y += __shfl_xor(newForce.y, i, 32);
newForce.z += __shfl_xor(newForce.z, i, 32);
newTorque.x += __shfl_xor(newTorque.x, i, 32);
newTorque.y += __shfl_xor(newTorque.y, i, 32);
newTorque.z += __shfl_xor(newTorque.z, i, 32);
}
if (tgx == 0) {
localData[atom2].force -= newForce;
localData[atom2].torque -= newTorque;
}
unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
#else
int bufferIndex = 3*threadIdx.x;
tempBuffer[bufferIndex] = newForce.x;
tempBuffer[bufferIndex+1] = newForce.y;
tempBuffer[bufferIndex+2] = newForce.z;
if (tgx % 4 == 0) {
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11];
}
if (tgx == 0) {
localData[atom2].force.x -= tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
localData[atom2].force.y -= tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
localData[atom2].force.z -= tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
}
tempBuffer[bufferIndex] = newTorque.x;
tempBuffer[bufferIndex+1] = newTorque.y;
tempBuffer[bufferIndex+2] = newTorque.z;
if (tgx % 4 == 0) {
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11];
}
if (tgx == 0) {
localData[atom2].torque.x -= tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
localData[atom2].torque.y -= tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
localData[atom2].torque.z -= tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
}
#endif
}
}
}
data.force *= -ENERGY_SCALE_FACTOR;
data.torque *= -ENERGY_SCALE_FACTOR;
localData[threadIdx.x].force *= -ENERGY_SCALE_FACTOR;
localData[threadIdx.x].torque *= -ENERGY_SCALE_FACTOR;
if (pos < end) {
unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (data.torque.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.y*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.z*0x100000000)));
offset = y*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.y*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.z*0x100000000)));
}
}
}
else
unsigned int j = y*TILE_SIZE + tgx;
#endif
{
// Compute the full set of interactions in this tile.
uint2 covalent = (hasExclusions ? covalentFlags[exclusionIndex[localGroupIndex]+tgx] : make_uint2(0, 0));
unsigned int polarizationGroup = (hasExclusions ? polarizationGroupFlags[exclusionIndex[localGroupIndex]+tgx] : 0);
// Compute forces.
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = y*TILE_SIZE+tj;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
float d = computeDScaleFactor(polarizationGroup, tj);
float p = computePScaleFactor(covalent, polarizationGroup, tj);
float m = computeMScaleFactor(covalent, tj);
computeOneInteraction(data, localData[tbx+tj], hasExclusions, d, p, m, 1, energy, periodicBoxSize, invPeriodicBoxSize);
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
data.force *= -ENERGY_SCALE_FACTOR;
data.torque *= ENERGY_SCALE_FACTOR;
localData[threadIdx.x].force *= -ENERGY_SCALE_FACTOR;
localData[threadIdx.x].torque *= ENERGY_SCALE_FACTOR;
if (pos < end) {
unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (data.torque.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.y*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.z*0x100000000)));
offset = y*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.y*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.z*0x100000000)));
}
atomIndices[threadIdx.x] = j;
loadAtomData(localData[threadIdx.x], j, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
localData[threadIdx.x].force = make_real3(0);
localData[threadIdx.x].torque = make_real3(0);
// Compute forces.
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = atomIndices[tbx+tj];
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
computeOneInteraction(data, localData[tbx+tj], false, 1, 1, 1, 1, energy, periodicBoxSize, invPeriodicBoxSize);
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
data.force *= -ENERGY_SCALE_FACTOR;
data.torque *= ENERGY_SCALE_FACTOR;
localData[threadIdx.x].force *= -ENERGY_SCALE_FACTOR;
localData[threadIdx.x].torque *= ENERGY_SCALE_FACTOR;
// Write results.
unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (data.torque.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.y*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.z*0x100000000)));
#ifdef USE_CUTOFF
offset = atomIndices[threadIdx.x];
#else
offset = y*TILE_SIZE + tgx;
#endif
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.y*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.z*0x100000000)));
}
pos++;
} while (pos < end);
}
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy*ENERGY_SCALE_FACTOR;
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment