"src/webui/vscode:/vscode.git/clone" did not exist on "a2b2c56d93e1c65ba540fb0667e53a3f6cdc412c"
Commit 93c467b2 authored by Peter Eastman's avatar Peter Eastman
Browse files

Merged 5.1Optimizations branch back to trunk

parent f6d4557d
...@@ -606,181 +606,255 @@ __device__ float computePScaleFactor(uint2 covalent, unsigned int polarizationGr ...@@ -606,181 +606,255 @@ __device__ float computePScaleFactor(uint2 covalent, unsigned int polarizationGr
*/ */
extern "C" __global__ void computeEDiffForce( extern "C" __global__ void computeEDiffForce(
unsigned long long* __restrict__ forceBuffers, unsigned long long* __restrict__ torqueBuffers, real* __restrict__ energyBuffer, unsigned long long* __restrict__ forceBuffers, unsigned long long* __restrict__ torqueBuffers, real* __restrict__ energyBuffer,
const real4* __restrict__ posq, const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices, const real4* __restrict__ posq, const uint2* __restrict__ covalentFlags, const unsigned int* __restrict__ polarizationGroupFlags,
const uint2* __restrict__ covalentFlags, const unsigned int* __restrict__ polarizationGroupFlags, unsigned int startTileIndex, unsigned int numTileIndices, const ushort2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices,
const real* __restrict__ labFrameDipole, const real* __restrict__ labFrameQuadrupole, const real* __restrict__ inducedDipole, const real* __restrict__ labFrameDipole, const real* __restrict__ labFrameQuadrupole, const real* __restrict__ inducedDipole,
const real* __restrict__ inducedDipolePolar, const real* __restrict__ inducedDipoleS, const real* __restrict__ inducedDipolePolarS, const real* __restrict__ inducedDipolePolar, const real* __restrict__ inducedDipoleS, const real* __restrict__ inducedDipolePolarS,
const float2* __restrict__ dampingAndThole) { const float2* __restrict__ dampingAndThole) {
unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE; const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
const unsigned int numTiles = numTileIndices; const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
unsigned int pos = startTileIndex+warp*numTiles/totalWarps; const unsigned int tbx = threadIdx.x - tgx;
unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps;
real energy = 0; real energy = 0;
__shared__ AtomData4 localData[EDIFF_THREAD_BLOCK_SIZE]; __shared__ AtomData4 localData[EDIFF_THREAD_BLOCK_SIZE];
__shared__ unsigned int exclusionRange[2*(EDIFF_THREAD_BLOCK_SIZE/TILE_SIZE)];
__shared__ int exclusionIndex[EDIFF_THREAD_BLOCK_SIZE/TILE_SIZE]; // First loop: process tiles that contain exclusions.
do { const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
// Extract the coordinates of this tile const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const unsigned int tbx = threadIdx.x - tgx; const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE; const unsigned int x = tileIndices.x;
unsigned int x, y; const unsigned int y = tileIndices.y;
AtomData4 data; AtomData4 data;
if (pos < end) { data.force = make_real3(0);
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos)); unsigned int atom1 = x*TILE_SIZE + tgx;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2); loadAtomData4(data, atom1, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, inducedDipoleS, inducedDipolePolarS, dampingAndThole);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error. uint2 covalent = covalentFlags[pos*TILE_SIZE+tgx];
y += (x < y ? -1 : 1); unsigned int polarizationGroup = polarizationGroupFlags[pos*TILE_SIZE+tgx];
x = (pos-y*NUM_BLOCKS+y*(y+1)/2); if (x == y) {
// This tile is on the diagonal.
localData[threadIdx.x].pos = data.pos;
localData[threadIdx.x].q = data.q;
localData[threadIdx.x].dipole = data.dipole;
localData[threadIdx.x].quadrupoleXX = data.quadrupoleXX;
localData[threadIdx.x].quadrupoleXY = data.quadrupoleXY;
localData[threadIdx.x].quadrupoleXZ = data.quadrupoleXZ;
localData[threadIdx.x].quadrupoleYY = data.quadrupoleYY;
localData[threadIdx.x].quadrupoleYZ = data.quadrupoleYZ;
localData[threadIdx.x].quadrupoleZZ = data.quadrupoleZZ;
localData[threadIdx.x].inducedDipole = data.inducedDipole;
localData[threadIdx.x].inducedDipolePolar = data.inducedDipolePolar;
localData[threadIdx.x].inducedDipoleS = data.inducedDipoleS;
localData[threadIdx.x].inducedDipolePolarS = data.inducedDipolePolarS;
localData[threadIdx.x].thole = data.thole;
localData[threadIdx.x].damp = data.damp;
// Compute forces.
for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = y*TILE_SIZE+j;
if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 tempForce;
real tempEnergy;
float d = computeDScaleFactor(polarizationGroup, j);
float p = computePScaleFactor(covalent, polarizationGroup, j);
computeOneEDiffInteractionF1(data, localData[tbx+j], d, p, tempEnergy, tempForce);
energy += 0.25f*tempEnergy;
data.force += tempForce;
}
} }
unsigned int atom1 = x*TILE_SIZE + tgx; data.force *= ENERGY_SCALE_FACTOR;
loadAtomData4(data, atom1, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, inducedDipoleS, inducedDipolePolarS, dampingAndThole); atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
data.force = make_real3(0); atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
// Locate the exclusion data for this tile.
if (tgx < 2)
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
if (pos >= end)
; // This warp is done.
else if (x == y) {
// This tile is on the diagonal.
localData[threadIdx.x].pos = data.pos; // Compute torques.
localData[threadIdx.x].q = data.q;
localData[threadIdx.x].dipole = data.dipole; data.force = make_real3(0);
localData[threadIdx.x].quadrupoleXX = data.quadrupoleXX; for (unsigned int j = 0; j < TILE_SIZE; j++) {
localData[threadIdx.x].quadrupoleXY = data.quadrupoleXY; int atom2 = y*TILE_SIZE+j;
localData[threadIdx.x].quadrupoleXZ = data.quadrupoleXZ; if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
localData[threadIdx.x].quadrupoleYY = data.quadrupoleYY; real3 tempTorque;
localData[threadIdx.x].quadrupoleYZ = data.quadrupoleYZ; float d = computeDScaleFactor(polarizationGroup, j);
localData[threadIdx.x].quadrupoleZZ = data.quadrupoleZZ; float p = computePScaleFactor(covalent, polarizationGroup, j);
localData[threadIdx.x].inducedDipole = data.inducedDipole; computeOneEDiffInteractionT1(data, localData[tbx+j], d, p, tempTorque);
localData[threadIdx.x].inducedDipolePolar = data.inducedDipolePolar; data.force += tempTorque;
localData[threadIdx.x].inducedDipoleS = data.inducedDipoleS;
localData[threadIdx.x].inducedDipolePolarS = data.inducedDipolePolarS;
localData[threadIdx.x].thole = data.thole;
localData[threadIdx.x].damp = data.damp;
uint2 covalent = covalentFlags[exclusionIndex[localGroupIndex]+tgx];
unsigned int polarizationGroup = polarizationGroupFlags[exclusionIndex[localGroupIndex]+tgx];
// Compute forces.
for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = y*TILE_SIZE+j;
if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 tempForce;
real tempEnergy;
float d = computeDScaleFactor(polarizationGroup, j);
float p = computePScaleFactor(covalent, polarizationGroup, j);
computeOneEDiffInteractionF1(data, localData[tbx+j], d, p, tempEnergy, tempForce);
energy += 0.25f*tempEnergy;
data.force += tempForce;
}
} }
data.force *= ENERGY_SCALE_FACTOR; }
atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (data.force.x*0x100000000))); data.force *= ENERGY_SCALE_FACTOR;
atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000))); atomicAdd(&torqueBuffers[atom1], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000))); atomicAdd(&torqueBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&torqueBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
}
else {
// This is an off-diagonal tile.
unsigned int j = y*TILE_SIZE + tgx;
loadAtomData4(localData[threadIdx.x], j, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, inducedDipoleS, inducedDipolePolarS, dampingAndThole);
localData[threadIdx.x].force = make_real3(0);
// Compute forces.
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = y*TILE_SIZE+tj;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 tempForce;
real tempEnergy;
float d = computeDScaleFactor(polarizationGroup, tj);
float p = computePScaleFactor(covalent, polarizationGroup, tj);
computeOneEDiffInteractionF1(data, localData[tbx+tj], d, p, tempEnergy, tempForce);
energy += 0.5f*tempEnergy;
data.force += tempForce;
localData[tbx+tj].force -= tempForce;
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
data.force *= ENERGY_SCALE_FACTOR;
localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
offset = y*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
// Compute torques.
data.force = make_real3(0);
// Compute torques. localData[threadIdx.x].force = make_real3(0);
for (j = 0; j < TILE_SIZE; j++) {
data.force = make_real3(0); int atom2 = y*TILE_SIZE+tj;
for (unsigned int j = 0; j < TILE_SIZE; j++) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
int atom2 = y*TILE_SIZE+j; real3 tempTorque;
if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { float d = computeDScaleFactor(polarizationGroup, tj);
real3 tempTorque; float p = computePScaleFactor(covalent, polarizationGroup, tj);
float d = computeDScaleFactor(polarizationGroup, j); computeOneEDiffInteractionT1(data, localData[tbx+tj], d, p, tempTorque);
float p = computePScaleFactor(covalent, polarizationGroup, j); data.force += tempTorque;
computeOneEDiffInteractionT1(data, localData[tbx+j], d, p, tempTorque); computeOneEDiffInteractionT3(data, localData[tbx+tj], d, p, tempTorque);
data.force += tempTorque; localData[tbx+tj].force += tempTorque;
}
} }
data.force *= ENERGY_SCALE_FACTOR; tj = (tj + 1) & (TILE_SIZE - 1);
atomicAdd(&torqueBuffers[atom1], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&torqueBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&torqueBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
} }
else { data.force *= ENERGY_SCALE_FACTOR;
// This is an off-diagonal tile. localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
offset = x*TILE_SIZE + tgx;
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
offset = y*TILE_SIZE + tgx;
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
}
}
unsigned int j = y*TILE_SIZE + tgx; // Second loop: tiles without exclusions (by enumerating all of them, since there's no cutoff).
loadAtomData4(localData[threadIdx.x], j, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, inducedDipoleS, inducedDipolePolarS, dampingAndThole);
localData[threadIdx.x].force = make_real3(0);
uint2 covalent = (hasExclusions ? covalentFlags[exclusionIndex[localGroupIndex]+tgx] : make_uint2(0, 0));
unsigned int polarizationGroup = (hasExclusions ? polarizationGroupFlags[exclusionIndex[localGroupIndex]+tgx] : 0);
// Compute forces. const unsigned int numTiles = numTileIndices;
int pos = startTileIndex+warp*numTiles/totalWarps;
int end = startTileIndex+(warp+1)*numTiles/totalWarps;
int skipBase = 0;
int currentSkipIndex = tbx;
__shared__ int skipTiles[EDIFF_THREAD_BLOCK_SIZE];
skipTiles[threadIdx.x] = -1;
unsigned int tj = tgx; while (pos < end) {
for (j = 0; j < TILE_SIZE; j++) { // Extract the coordinates of this tile.
int atom2 = y*TILE_SIZE+tj;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 tempForce;
real tempEnergy;
float d = computeDScaleFactor(polarizationGroup, tj);
float p = computePScaleFactor(covalent, polarizationGroup, tj);
computeOneEDiffInteractionF1(data, localData[tbx+tj], d, p, tempEnergy, tempForce);
energy += 0.5f*tempEnergy;
data.force += tempForce;
localData[tbx+tj].force -= tempForce;
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
data.force *= ENERGY_SCALE_FACTOR;
localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
if (pos < end) {
unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
offset = y*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
}
// Compute torques. unsigned int x, y;
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
data.force = make_real3(0); // Skip over tiles that have exclusions, since they were already processed.
localData[threadIdx.x].force = make_real3(0);
for (j = 0; j < TILE_SIZE; j++) { while (skipTiles[tbx+TILE_SIZE-1] < pos) {
int atom2 = y*TILE_SIZE+tj; if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { ushort2 tile = exclusionTiles[skipBase+tgx];
real3 tempTorque; skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
float d = computeDScaleFactor(polarizationGroup, tj); }
float p = computePScaleFactor(covalent, polarizationGroup, tj); else
computeOneEDiffInteractionT1(data, localData[tbx+tj], d, p, tempTorque); skipTiles[threadIdx.x] = end;
data.force += tempTorque; skipBase += TILE_SIZE;
computeOneEDiffInteractionT3(data, localData[tbx+tj], d, p, tempTorque); currentSkipIndex = tbx;
localData[tbx+tj].force += tempTorque; }
} while (skipTiles[currentSkipIndex] < pos)
tj = (tj + 1) & (TILE_SIZE - 1); currentSkipIndex++;
bool includeTile = (skipTiles[currentSkipIndex] != pos);
if (includeTile) {
unsigned int atom1 = x*TILE_SIZE + tgx;
// Load atom data for this tile.
AtomData4 data;
data.force = make_real3(0);
loadAtomData4(data, atom1, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, inducedDipoleS, inducedDipolePolarS, dampingAndThole);
loadAtomData4(localData[threadIdx.x], atom1, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, inducedDipoleS, inducedDipolePolarS, dampingAndThole);
unsigned int j = y*TILE_SIZE + tgx;
loadAtomData4(localData[threadIdx.x], j, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, inducedDipoleS, inducedDipolePolarS, dampingAndThole);
localData[threadIdx.x].force = make_real3(0);
// Compute forces.
unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = y*TILE_SIZE+tj;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 tempForce;
real tempEnergy;
computeOneEDiffInteractionF1(data, localData[tbx+tj], 1, 1, tempEnergy, tempForce);
energy += 0.5f*tempEnergy;
data.force += tempForce;
localData[tbx+tj].force -= tempForce;
} }
data.force *= ENERGY_SCALE_FACTOR; tj = (tj + 1) & (TILE_SIZE - 1);
localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR; }
if (pos < end) { data.force *= ENERGY_SCALE_FACTOR;
unsigned int offset = x*TILE_SIZE + tgx; localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000))); unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000))); atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000))); atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
offset = y*TILE_SIZE + tgx; atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000))); offset = y*TILE_SIZE + tgx;
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000))); atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000))); atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
// Compute torques.
data.force = make_real3(0);
localData[threadIdx.x].force = make_real3(0);
for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = y*TILE_SIZE+tj;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 tempTorque;
computeOneEDiffInteractionT1(data, localData[tbx+tj], 1, 1, tempTorque);
data.force += tempTorque;
computeOneEDiffInteractionT3(data, localData[tbx+tj], 1, 1, tempTorque);
localData[tbx+tj].force += tempTorque;
} }
tj = (tj + 1) & (TILE_SIZE - 1);
} }
data.force *= ENERGY_SCALE_FACTOR;
localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
offset = x*TILE_SIZE + tgx;
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
offset = y*TILE_SIZE + tgx;
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
} }
pos++; pos++;
} while (pos < end); }
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy*ENERGY_SCALE_FACTOR; energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy*ENERGY_SCALE_FACTOR;
} }
#define TILE_SIZE 32
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE) #define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
typedef struct { typedef struct {
...@@ -59,331 +58,286 @@ __device__ float computePScaleFactor(uint2 covalent, unsigned int polarizationGr ...@@ -59,331 +58,286 @@ __device__ float computePScaleFactor(uint2 covalent, unsigned int polarizationGr
*/ */
extern "C" __global__ void computeElectrostatics( extern "C" __global__ void computeElectrostatics(
unsigned long long* __restrict__ forceBuffers, unsigned long long* __restrict__ torqueBuffers, real* __restrict__ energyBuffer, unsigned long long* __restrict__ forceBuffers, unsigned long long* __restrict__ torqueBuffers, real* __restrict__ energyBuffer,
const real4* __restrict__ posq, const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices, const real4* __restrict__ posq, const uint2* __restrict__ covalentFlags, const unsigned int* __restrict__ polarizationGroupFlags,
const uint2* __restrict__ covalentFlags, const unsigned int* __restrict__ polarizationGroupFlags, unsigned int startTileIndex, unsigned int numTileIndices, const ushort2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags, const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const real4* __restrict__ blockCenter, const unsigned int* __restrict__ interactingAtoms,
#endif #endif
const real* __restrict__ labFrameDipole, const real* __restrict__ labFrameQuadrupole, const real* __restrict__ inducedDipole, const real* __restrict__ labFrameDipole, const real* __restrict__ labFrameQuadrupole, const real* __restrict__ inducedDipole,
const real* __restrict__ inducedDipolePolar, const float2* __restrict__ dampingAndThole) { const real* __restrict__ inducedDipolePolar, const float2* __restrict__ dampingAndThole) {
unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE; const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx;
real energy = 0;
__shared__ AtomData localData[THREAD_BLOCK_SIZE];
// First loop: process tiles that contain exclusions.
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
AtomData data;
unsigned int atom1 = x*TILE_SIZE + tgx;
loadAtomData(data, atom1, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
data.force = make_real3(0);
uint2 covalent = covalentFlags[pos*TILE_SIZE+tgx];
unsigned int polarizationGroup = polarizationGroupFlags[pos*TILE_SIZE+tgx];
if (x == y) {
// This tile is on the diagonal.
localData[threadIdx.x].posq = data.posq;
localData[threadIdx.x].dipole = data.dipole;
localData[threadIdx.x].quadrupoleXX = data.quadrupoleXX;
localData[threadIdx.x].quadrupoleXY = data.quadrupoleXY;
localData[threadIdx.x].quadrupoleXZ = data.quadrupoleXZ;
localData[threadIdx.x].quadrupoleYY = data.quadrupoleYY;
localData[threadIdx.x].quadrupoleYZ = data.quadrupoleYZ;
localData[threadIdx.x].inducedDipole = data.inducedDipole;
localData[threadIdx.x].inducedDipolePolar = data.inducedDipolePolar;
localData[threadIdx.x].thole = data.thole;
localData[threadIdx.x].damp = data.damp;
// Compute forces.
for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = y*TILE_SIZE+j;
if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 tempForce;
real tempEnergy;
float d = computeDScaleFactor(polarizationGroup, j);
float p = computePScaleFactor(covalent, polarizationGroup, j);
float m = computeMScaleFactor(covalent, j);
computeOneInteractionF1(data, localData[tbx+j], d, p, m, tempEnergy, tempForce);
data.force += tempForce;
energy += 0.5f*tempEnergy;
}
}
data.force *= ENERGY_SCALE_FACTOR;
atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
// Compute torques.
data.force = make_real3(0);
for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = y*TILE_SIZE+j;
if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 tempForce;
float d = computeDScaleFactor(polarizationGroup, j);
float p = computePScaleFactor(covalent, polarizationGroup, j);
float m = computeMScaleFactor(covalent, j);
computeOneInteractionT1(data, localData[tbx+j], d, p, m, tempForce);
data.force += tempForce;
}
}
data.force *= ENERGY_SCALE_FACTOR;
atomicAdd(&torqueBuffers[atom1], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&torqueBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&torqueBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
}
else {
// This is an off-diagonal tile.
unsigned int j = y*TILE_SIZE + tgx;
loadAtomData(localData[threadIdx.x], j, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
localData[threadIdx.x].force = make_real3(0);
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = y*TILE_SIZE+tj;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 tempForce;
real tempEnergy;
float d = computeDScaleFactor(polarizationGroup, tj);
float p = computePScaleFactor(covalent, polarizationGroup, tj);
float m = computeMScaleFactor(covalent, tj);
computeOneInteractionF1(data, localData[tbx+tj], d, p, m, tempEnergy, tempForce);
data.force += tempForce;
localData[tbx+tj].force -= tempForce;
energy += tempEnergy;
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
data.force *= ENERGY_SCALE_FACTOR;
localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
offset = y*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
// Compute torques.
data.force = make_real3(0);
localData[threadIdx.x].force = make_real3(0);
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = y*TILE_SIZE+tj;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 tempForce;
float d = computeDScaleFactor(polarizationGroup, tj);
float p = computePScaleFactor(covalent, polarizationGroup, tj);
float m = computeMScaleFactor(covalent, tj);
computeOneInteractionT1(data, localData[tbx+tj], d, p, m, tempForce);
data.force += tempForce;
computeOneInteractionT3(data, localData[tbx+tj], d, p, m, tempForce);
localData[tbx+tj].force += tempForce;
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
data.force *= ENERGY_SCALE_FACTOR;
localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
offset = x*TILE_SIZE + tgx;
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
offset = y*TILE_SIZE + tgx;
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
const unsigned int numTiles = interactionCount[0]; const unsigned int numTiles = interactionCount[0];
unsigned int pos = (numTiles > maxTiles ? startTileIndex+warp*numTileIndices/totalWarps : warp*numTiles/totalWarps); int pos = (numTiles > maxTiles ? startTileIndex+warp*numTileIndices/totalWarps : warp*numTiles/totalWarps);
unsigned int end = (numTiles > maxTiles ? startTileIndex+(warp+1)*numTileIndices/totalWarps : (warp+1)*numTiles/totalWarps); int end = (numTiles > maxTiles ? startTileIndex+(warp+1)*numTileIndices/totalWarps : (warp+1)*numTiles/totalWarps);
#else #else
const unsigned int numTiles = numTileIndices; const unsigned int numTiles = numTileIndices;
unsigned int pos = startTileIndex+warp*numTiles/totalWarps; int pos = startTileIndex+warp*numTiles/totalWarps;
unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps; int end = startTileIndex+(warp+1)*numTiles/totalWarps;
#endif
real energy = 0;
__shared__ AtomData localData[THREAD_BLOCK_SIZE];
__shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
__shared__ int exclusionIndex[WARPS_PER_GROUP];
#ifndef ENABLE_SHUFFLE
__shared__ real tempBuffer[3*THREAD_BLOCK_SIZE];
#endif #endif
int skipBase = 0;
int currentSkipIndex = tbx;
__shared__ int atomIndices[THREAD_BLOCK_SIZE];
__shared__ int skipTiles[THREAD_BLOCK_SIZE];
skipTiles[threadIdx.x] = -1;
do { while (pos < end) {
// Extract the coordinates of this tile bool includeTile = true;
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx; // Extract the coordinates of this tile.
const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
unsigned int x, y; unsigned int x, y;
AtomData data;
if (pos < end) {
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (numTiles <= maxTiles) { if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos]; ushort2 tileIndices = tiles[pos];
x = tileIndices.x; x = tileIndices.x;
y = tileIndices.y; }
} else
else
#endif #endif
{ {
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos)); y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2); x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error. }
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2); // Skip over tiles that have exclusions, since they were already processed.
while (skipTiles[tbx+TILE_SIZE-1] < pos) {
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx];
skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
} }
else
skipTiles[threadIdx.x] = end;
skipBase += TILE_SIZE;
currentSkipIndex = tbx;
} }
while (skipTiles[currentSkipIndex] < pos)
currentSkipIndex++;
includeTile = (skipTiles[currentSkipIndex] != pos);
}
if (includeTile) {
unsigned int atom1 = x*TILE_SIZE + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
// Load atom data for this tile.
AtomData data;
loadAtomData(data, atom1, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole); loadAtomData(data, atom1, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
data.force = make_real3(0); data.force = make_real3(0);
#ifdef USE_CUTOFF
// Locate the exclusion data for this tile. unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
#else
unsigned int j = y*TILE_SIZE + tgx;
#endif
atomIndices[threadIdx.x] = j;
loadAtomData(localData[threadIdx.x], j, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
localData[threadIdx.x].force = make_real3(0);
if (tgx < 2) // Compute forces.
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
if (pos >= end)
; // This warp is done.
else if (x == y) {
// This tile is on the diagonal.
localData[threadIdx.x].posq = data.posq; unsigned int tj = tgx;
localData[threadIdx.x].dipole = data.dipole; for (j = 0; j < TILE_SIZE; j++) {
localData[threadIdx.x].quadrupoleXX = data.quadrupoleXX; int atom2 = atomIndices[tbx+tj];
localData[threadIdx.x].quadrupoleXY = data.quadrupoleXY; if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
localData[threadIdx.x].quadrupoleXZ = data.quadrupoleXZ; real3 tempForce;
localData[threadIdx.x].quadrupoleYY = data.quadrupoleYY; real tempEnergy;
localData[threadIdx.x].quadrupoleYZ = data.quadrupoleYZ; computeOneInteractionF1(data, localData[tbx+tj], 1, 1, 1, tempEnergy, tempForce);
localData[threadIdx.x].inducedDipole = data.inducedDipole; data.force += tempForce;
localData[threadIdx.x].inducedDipolePolar = data.inducedDipolePolar; localData[tbx+tj].force -= tempForce;
localData[threadIdx.x].thole = data.thole; energy += tempEnergy;
localData[threadIdx.x].damp = data.damp;
uint2 covalent = covalentFlags[exclusionIndex[localGroupIndex]+tgx];
unsigned int polarizationGroup = polarizationGroupFlags[exclusionIndex[localGroupIndex]+tgx];
// Compute forces.
for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = y*TILE_SIZE+j;
if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 tempForce;
real tempEnergy;
float d = computeDScaleFactor(polarizationGroup, j);
float p = computePScaleFactor(covalent, polarizationGroup, j);
float m = computeMScaleFactor(covalent, j);
computeOneInteractionF1(data, localData[tbx+j], d, p, m, tempEnergy, tempForce);
data.force += tempForce;
energy += 0.5f*tempEnergy;
}
} }
data.force *= ENERGY_SCALE_FACTOR; tj = (tj + 1) & (TILE_SIZE - 1);
atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
// Compute torques.
data.force = make_real3(0);
for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = y*TILE_SIZE+j;
if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 tempForce;
float d = computeDScaleFactor(polarizationGroup, j);
float p = computePScaleFactor(covalent, polarizationGroup, j);
float m = computeMScaleFactor(covalent, j);
computeOneInteractionT1(data, localData[tbx+j], d, p, m, tempForce);
data.force += tempForce;
}
}
data.force *= ENERGY_SCALE_FACTOR;
atomicAdd(&torqueBuffers[atom1], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&torqueBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&torqueBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
} }
else { data.force *= ENERGY_SCALE_FACTOR;
// This is an off-diagonal tile. localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
unsigned int offset = x*TILE_SIZE + tgx;
unsigned int j = y*TILE_SIZE + tgx; atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
loadAtomData(localData[threadIdx.x], j, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole); atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
localData[threadIdx.x].force = make_real3(0); atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF); offset = atomIndices[threadIdx.x];
if (!hasExclusions && flags != 0xFFFFFFFF) {
if (flags == 0) {
// No interactions in this tile.
}
else {
// Compute only a subset of the interactions in this tile.
for (j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
int atom2 = tbx+j;
real3 delta = make_real3(localData[atom2].posq.x-data.posq.x, localData[atom2].posq.y-data.posq.y, localData[atom2].posq.z-data.posq.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real3 tempForce;
real tempEnergy;
computeOneInteractionF1(data, localData[atom2], 1, 1, 1, tempEnergy, tempForce);
data.force += tempForce;
localData[atom2].force -= tempForce;
energy += tempEnergy;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#ifdef ENABLE_SHUFFLE
for (int i = 16; i >= 1; i /= 2) {
tempForce.x += __shfl_xor(tempForce.x, i, 32);
tempForce.y += __shfl_xor(tempForce.y, i, 32);
tempForce.z += __shfl_xor(tempForce.z, i, 32);
}
if (tgx == 0)
localData[atom2].force -= tempForce;
#else #else
int bufferIndex = 3*threadIdx.x; offset = y*TILE_SIZE + tgx;
tempBuffer[bufferIndex] = tempForce.x;
tempBuffer[bufferIndex+1] = tempForce.y;
tempBuffer[bufferIndex+2] = tempForce.z;
if (tgx % 4 == 0) {
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11];
}
if (tgx == 0) {
localData[atom2].force.x -= tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
localData[atom2].force.y -= tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
localData[atom2].force.z -= tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
}
#endif #endif
} atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
} atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
} atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
data.force *= ENERGY_SCALE_FACTOR;
localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
if (pos < end) {
unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
offset = y*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
}
// Compute torques.
data.force = make_real3(0); // Compute torques.
localData[threadIdx.x].force = make_real3(0);
for (j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
int atom2 = tbx+j;
real3 delta = make_real3(localData[atom2].posq.x-data.posq.x, localData[atom2].posq.y-data.posq.y, localData[atom2].posq.z-data.posq.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real3 tempForce;
computeOneInteractionT1(data, localData[atom2], 1, 1, 1, tempForce);
data.force += tempForce;
computeOneInteractionT3(data, localData[atom2], 1, 1, 1, tempForce);
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#ifdef ENABLE_SHUFFLE
for (int i = 16; i >= 1; i /= 2) {
tempForce.x += __shfl_xor(tempForce.x, i, 32);
tempForce.y += __shfl_xor(tempForce.y, i, 32);
tempForce.z += __shfl_xor(tempForce.z, i, 32);
}
if (tgx == 0)
localData[atom2].force -= tempForce;
#else
int bufferIndex = 3*threadIdx.x;
tempBuffer[bufferIndex] = tempForce.x;
tempBuffer[bufferIndex+1] = tempForce.y;
tempBuffer[bufferIndex+2] = tempForce.z;
if (tgx % 4 == 0) {
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11];
}
if (tgx == 0) {
localData[atom2].force.x += tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
localData[atom2].force.y += tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
localData[atom2].force.z += tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
}
#endif
}
}
}
data.force *= ENERGY_SCALE_FACTOR;
localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
if (pos < end) {
unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
offset = y*TILE_SIZE + tgx;
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
}
}
}
else
#endif
{
// Compute the full set of interactions in this tile.
uint2 covalent = (hasExclusions ? covalentFlags[exclusionIndex[localGroupIndex]+tgx] : make_uint2(0, 0)); data.force = make_real3(0);
unsigned int polarizationGroup = (hasExclusions ? polarizationGroupFlags[exclusionIndex[localGroupIndex]+tgx] : 0); localData[threadIdx.x].force = make_real3(0);
for (j = 0; j < TILE_SIZE; j++) {
// Compute forces. int atom2 = y*TILE_SIZE+tj;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
unsigned int tj = tgx; real3 tempForce;
for (j = 0; j < TILE_SIZE; j++) { computeOneInteractionT1(data, localData[tbx+tj], 1, 1, 1, tempForce);
int atom2 = y*TILE_SIZE+tj; data.force += tempForce;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { computeOneInteractionT3(data, localData[tbx+tj], 1, 1, 1, tempForce);
real3 tempForce; localData[tbx+tj].force += tempForce;
real tempEnergy;
float d = computeDScaleFactor(polarizationGroup, tj);
float p = computePScaleFactor(covalent, polarizationGroup, tj);
float m = computeMScaleFactor(covalent, tj);
computeOneInteractionF1(data, localData[tbx+tj], d, p, m, tempEnergy, tempForce);
data.force += tempForce;
localData[tbx+tj].force -= tempForce;
energy += tempEnergy;
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
data.force *= ENERGY_SCALE_FACTOR;
localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
if (pos < end) {
unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
offset = y*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
}
// Compute torques.
data.force = make_real3(0);
localData[threadIdx.x].force = make_real3(0);
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = y*TILE_SIZE+tj;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 tempForce;
float d = computeDScaleFactor(polarizationGroup, tj);
float p = computePScaleFactor(covalent, polarizationGroup, tj);
float m = computeMScaleFactor(covalent, tj);
computeOneInteractionT1(data, localData[tbx+tj], d, p, m, tempForce);
data.force += tempForce;
computeOneInteractionT3(data, localData[tbx+tj], d, p, m, tempForce);
localData[tbx+tj].force += tempForce;
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
data.force *= ENERGY_SCALE_FACTOR;
localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
if (pos < end) {
unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
offset = y*TILE_SIZE + tgx;
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
}
} }
tj = (tj + 1) & (TILE_SIZE - 1);
} }
data.force *= ENERGY_SCALE_FACTOR;
localData[threadIdx.x].force *= ENERGY_SCALE_FACTOR;
offset = x*TILE_SIZE + tgx;
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
#ifdef USE_CUTOFF
offset = atomIndices[threadIdx.x];
#else
offset = y*TILE_SIZE + tgx;
#endif
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
} }
pos++; pos++;
} while (pos < end); }
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy*ENERGY_SCALE_FACTOR; energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy*ENERGY_SCALE_FACTOR;
} }
#define TILE_SIZE 32
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE) #define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
typedef struct { typedef struct {
...@@ -398,245 +397,268 @@ __device__ float computePScaleFactor(uint2 covalent, unsigned int polarizationGr ...@@ -398,245 +397,268 @@ __device__ float computePScaleFactor(uint2 covalent, unsigned int polarizationGr
*/ */
extern "C" __global__ void computeFixedField( extern "C" __global__ void computeFixedField(
unsigned long long* __restrict__ fieldBuffers, unsigned long long* __restrict__ fieldPolarBuffers, const real4* __restrict__ posq, unsigned long long* __restrict__ fieldBuffers, unsigned long long* __restrict__ fieldPolarBuffers, const real4* __restrict__ posq,
const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices, const uint2* __restrict__ covalentFlags, const unsigned int* __restrict__ polarizationGroupFlags, const ushort2* __restrict__ exclusionTiles,
const uint2* __restrict__ covalentFlags, const unsigned int* __restrict__ polarizationGroupFlags, unsigned int startTileIndex, unsigned int numTileIndices, unsigned int startTileIndex, unsigned int numTileIndices,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags, const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const real4* __restrict__ blockCenter, const unsigned int* __restrict__ interactingAtoms,
#elif defined USE_GK #elif defined USE_GK
const real* __restrict__ bornRadii, unsigned long long* __restrict__ gkFieldBuffers, const real* __restrict__ bornRadii, unsigned long long* __restrict__ gkFieldBuffers,
#endif #endif
const real* __restrict__ labFrameDipole, const real* __restrict__ labFrameQuadrupole, const float2* __restrict__ dampingAndThole) { const real* __restrict__ labFrameDipole, const real* __restrict__ labFrameQuadrupole, const float2* __restrict__ dampingAndThole) {
unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE; const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
#ifdef USE_CUTOFF const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int numTiles = interactionCount[0]; const unsigned int tbx = threadIdx.x - tgx;
unsigned int pos = (numTiles > maxTiles ? startTileIndex+warp*numTileIndices/totalWarps : warp*numTiles/totalWarps);
unsigned int end = (numTiles > maxTiles ? startTileIndex+(warp+1)*numTileIndices/totalWarps : (warp+1)*numTiles/totalWarps);
#else
const unsigned int numTiles = numTileIndices;
unsigned int pos = startTileIndex+warp*numTiles/totalWarps;
unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps;
#endif
__shared__ AtomData localData[THREAD_BLOCK_SIZE]; __shared__ AtomData localData[THREAD_BLOCK_SIZE];
__shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
__shared__ int exclusionIndex[WARPS_PER_GROUP]; // First loop: process tiles that contain exclusions.
#ifndef ENABLE_SHUFFLE
__shared__ real tempBuffer[3*THREAD_BLOCK_SIZE];
#endif
do { const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
// Extract the coordinates of this tile const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const unsigned int tbx = threadIdx.x - tgx; const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE; const unsigned int x = tileIndices.x;
unsigned int x, y; const unsigned int y = tileIndices.y;
AtomData data; AtomData data;
data.field = make_real3(0); data.field = make_real3(0);
data.fieldPolar = make_real3(0); data.fieldPolar = make_real3(0);
#ifdef USE_GK #ifdef USE_GK
data.gkField = make_real3(0); data.gkField = make_real3(0);
#endif #endif
if (pos < end) { unsigned int atom1 = x*TILE_SIZE + tgx;
#ifdef USE_CUTOFF loadAtomData(data, atom1, posq, labFrameDipole, labFrameQuadrupole, dampingAndThole);
if (numTiles <= maxTiles) { #ifdef USE_GK
ushort2 tileIndices = tiles[pos]; data.bornRadius = bornRadii[atom1];
x = tileIndices.x;
y = tileIndices.y;
}
else
#endif #endif
{ uint2 covalent = covalentFlags[pos*TILE_SIZE+tgx];
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos)); unsigned int polarizationGroup = polarizationGroupFlags[pos*TILE_SIZE+tgx];
x = (pos-y*NUM_BLOCKS+y*(y+1)/2); if (x == y) {
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error. // This tile is on the diagonal.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2); const unsigned int localAtomIndex = threadIdx.x;
localData[localAtomIndex].posq = data.posq;
localData[localAtomIndex].dipole = data.dipole;
localData[localAtomIndex].quadrupoleXX = data.quadrupoleXX;
localData[localAtomIndex].quadrupoleXY = data.quadrupoleXY;
localData[localAtomIndex].quadrupoleXZ = data.quadrupoleXZ;
localData[localAtomIndex].quadrupoleYY = data.quadrupoleYY;
localData[localAtomIndex].quadrupoleYZ = data.quadrupoleYZ;
localData[localAtomIndex].quadrupoleZZ = data.quadrupoleZZ;
localData[localAtomIndex].thole = data.thole;
localData[localAtomIndex].damp = data.damp;
#ifdef USE_GK
localData[localAtomIndex].bornRadius = data.bornRadius;
#endif
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real3 delta = trimTo3(localData[tbx+j].posq-data.posq);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
int atom2 = y*TILE_SIZE+j;
if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 fields[4];
float d = computeDScaleFactor(polarizationGroup, j);
float p = computePScaleFactor(covalent, polarizationGroup, j);
computeOneInteraction(data, localData[tbx+j], delta, d, p, fields);
data.field += fields[0];
data.fieldPolar += fields[1];
} }
}
unsigned int atom1 = x*TILE_SIZE + tgx;
loadAtomData(data, atom1, posq, labFrameDipole, labFrameQuadrupole, dampingAndThole);
#ifdef USE_GK #ifdef USE_GK
data.bornRadius = bornRadii[atom1]; if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 fields[2];
computeOneGkInteraction(data, localData[tbx+j], delta, fields);
data.gkField += fields[0];
}
#endif #endif
}
// Locate the exclusion data for this tile. }
else {
if (tgx < 2) // This is an off-diagonal tile.
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0) const unsigned int localAtomIndex = threadIdx.x;
exclusionIndex[localGroupIndex] = -1; unsigned int j = y*TILE_SIZE + tgx;
for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE) loadAtomData(localData[localAtomIndex], j, posq, labFrameDipole, labFrameQuadrupole, dampingAndThole);
if (exclusionIndices[i] == y) localData[localAtomIndex].field = make_real3(0);
exclusionIndex[localGroupIndex] = i*TILE_SIZE; localData[localAtomIndex].fieldPolar = make_real3(0);
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
if (pos >= end)
; // This warp is done.
else if (x == y) {
// This tile is on the diagonal.
const unsigned int localAtomIndex = threadIdx.x;
localData[localAtomIndex].posq = data.posq;
localData[localAtomIndex].dipole = data.dipole;
localData[localAtomIndex].quadrupoleXX = data.quadrupoleXX;
localData[localAtomIndex].quadrupoleXY = data.quadrupoleXY;
localData[localAtomIndex].quadrupoleXZ = data.quadrupoleXZ;
localData[localAtomIndex].quadrupoleYY = data.quadrupoleYY;
localData[localAtomIndex].quadrupoleYZ = data.quadrupoleYZ;
localData[localAtomIndex].quadrupoleZZ = data.quadrupoleZZ;
localData[localAtomIndex].thole = data.thole;
localData[localAtomIndex].damp = data.damp;
#ifdef USE_GK #ifdef USE_GK
localData[localAtomIndex].bornRadius = data.bornRadius; localData[localAtomIndex].bornRadius = bornRadii[j];
localData[localAtomIndex].gkField = make_real3(0);
#endif #endif
uint2 covalent = covalentFlags[exclusionIndex[localGroupIndex]+tgx]; unsigned int tj = tgx;
unsigned int polarizationGroup = polarizationGroupFlags[exclusionIndex[localGroupIndex]+tgx]; for (j = 0; j < TILE_SIZE; j++) {
for (unsigned int j = 0; j < TILE_SIZE; j++) { real3 delta = trimTo3(localData[tbx+tj].posq-data.posq);
real3 delta = trimTo3(localData[tbx+j].posq-data.posq);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
int atom2 = y*TILE_SIZE+j; int atom2 = y*TILE_SIZE+tj;
if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 fields[4]; real3 fields[4];
float d = computeDScaleFactor(polarizationGroup, j); float d = computeDScaleFactor(polarizationGroup, tj);
float p = computePScaleFactor(covalent, polarizationGroup, j); float p = computePScaleFactor(covalent, polarizationGroup, tj);
computeOneInteraction(data, localData[tbx+j], delta, d, p, fields); computeOneInteraction(data, localData[tbx+tj], delta, d, p, fields);
data.field += fields[0]; data.field += fields[0];
data.fieldPolar += fields[1]; data.fieldPolar += fields[1];
} localData[tbx+tj].field += fields[2];
localData[tbx+tj].fieldPolar += fields[3];
#ifdef USE_GK #ifdef USE_GK
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { computeOneGkInteraction(data, localData[tbx+tj], delta, fields);
real3 fields[2]; data.gkField += fields[0];
computeOneGkInteraction(data, localData[tbx+j], delta, fields); localData[tbx+tj].gkField += fields[1];
data.gkField += fields[0];
}
#endif #endif
} }
tj = (tj + 1) & (TILE_SIZE - 1);
} }
else { }
// This is an off-diagonal tile.
// Write results.
const unsigned int localAtomIndex = threadIdx.x;
unsigned int j = y*TILE_SIZE + tgx; unsigned int offset = x*TILE_SIZE + tgx;
loadAtomData(localData[localAtomIndex], j, posq, labFrameDipole, labFrameQuadrupole, dampingAndThole); atomicAdd(&fieldBuffers[offset], static_cast<unsigned long long>((long long) (data.field.x*0x100000000)));
localData[localAtomIndex].field = make_real3(0); atomicAdd(&fieldBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.y*0x100000000)));
localData[localAtomIndex].fieldPolar = make_real3(0); atomicAdd(&fieldBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.z*0x100000000)));
atomicAdd(&fieldPolarBuffers[offset], static_cast<unsigned long long>((long long) (data.fieldPolar.x*0x100000000)));
atomicAdd(&fieldPolarBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolar.y*0x100000000)));
atomicAdd(&fieldPolarBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolar.z*0x100000000)));
#ifdef USE_GK #ifdef USE_GK
localData[localAtomIndex].bornRadius = bornRadii[j]; atomicAdd(&gkFieldBuffers[offset], static_cast<unsigned long long>((long long) (data.gkField.x*0x100000000)));
localData[localAtomIndex].gkField = make_real3(0); atomicAdd(&gkFieldBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.gkField.y*0x100000000)));
atomicAdd(&gkFieldBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.gkField.z*0x100000000)));
#endif #endif
#ifdef USE_CUTOFF if (x != y) {
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF); offset = y*TILE_SIZE + tgx;
if (!hasExclusions && flags == 0) { // TODO: Why doesn't the flags != 0 block work? atomicAdd(&fieldBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.x*0x100000000)));
// if (!hasExclusions && flags != 0xFFFFFFFF) { atomicAdd(&fieldBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.y*0x100000000)));
if (flags == 0) { atomicAdd(&fieldBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.z*0x100000000)));
// No interactions in this tile. atomicAdd(&fieldPolarBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.x*0x100000000)));
} atomicAdd(&fieldPolarBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.y*0x100000000)));
else { atomicAdd(&fieldPolarBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.z*0x100000000)));
// Compute only a subset of the interactions in this tile. #ifdef USE_GK
atomicAdd(&gkFieldBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].gkField.x*0x100000000)));
for (j = 0; j < TILE_SIZE; j++) { atomicAdd(&gkFieldBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].gkField.y*0x100000000)));
if ((flags&(1<<j)) != 0) { atomicAdd(&gkFieldBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].gkField.z*0x100000000)));
int atom2 = tbx+j;
real3 delta = make_real3(localData[atom2].posq.x-data.posq.x, localData[atom2].posq.y-data.posq.y, localData[atom2].posq.z-data.posq.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
real3 fields[4]; }
computeOneInteraction(data, localData[atom2], delta, 1, 1, fields); }
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#ifdef ENABLE_SHUFFLE // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
for (int i = 16; i >= 1; i /= 2) { // of them (no cutoff).
fields[2].x += __shfl_xor(fields[2].x, i, 32);
fields[2].y += __shfl_xor(fields[2].y, i, 32); #ifdef USE_CUTOFF
fields[2].z += __shfl_xor(fields[2].z, i, 32); const unsigned int numTiles = interactionCount[0];
fields[3].x += __shfl_xor(fields[3].x, i, 32); int pos = (numTiles > maxTiles ? startTileIndex+warp*numTileIndices/totalWarps : warp*numTiles/totalWarps);
fields[3].y += __shfl_xor(fields[3].y, i, 32); int end = (numTiles > maxTiles ? startTileIndex+(warp+1)*numTileIndices/totalWarps : (warp+1)*numTiles/totalWarps);
fields[3].z += __shfl_xor(fields[3].z, i, 32);
}
if (tgx == 0) {
localData[atom2].field += fields[2];
localData[atom2].fieldPolar += fields[3];
}
#else #else
int bufferIndex = 3*threadIdx.x; const unsigned int numTiles = numTileIndices;
tempBuffer[bufferIndex] = fields[2].x; int pos = startTileIndex+warp*numTiles/totalWarps;
tempBuffer[bufferIndex+1] = fields[2].y; int end = startTileIndex+(warp+1)*numTiles/totalWarps;
tempBuffer[bufferIndex+2] = fields[2].z;
if (tgx % 4 == 0) {
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11];
}
if (tgx == 0) {
localData[atom2].field.x += tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
localData[atom2].field.y += tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
localData[atom2].field.z += tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
}
tempBuffer[bufferIndex] = fields[3].x;
tempBuffer[bufferIndex+1] = fields[3].y;
tempBuffer[bufferIndex+2] = fields[3].z;
if (tgx % 4 == 0) {
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11];
}
if (tgx == 0) {
localData[atom2].fieldPolar.x += tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
localData[atom2].fieldPolar.y += tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
localData[atom2].fieldPolar.z += tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
}
#endif #endif
} int skipBase = 0;
} int currentSkipIndex = tbx;
} __shared__ int atomIndices[THREAD_BLOCK_SIZE];
} __shared__ int skipTiles[THREAD_BLOCK_SIZE];
skipTiles[threadIdx.x] = -1;
while (pos < end) {
bool includeTile = true;
// Extract the coordinates of this tile.
unsigned int x, y;
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
// Skip over tiles that have exclusions, since they were already processed.
while (skipTiles[tbx+TILE_SIZE-1] < pos) {
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx];
skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
} }
else else
skipTiles[threadIdx.x] = end;
skipBase += TILE_SIZE;
currentSkipIndex = tbx;
}
while (skipTiles[currentSkipIndex] < pos)
currentSkipIndex++;
includeTile = (skipTiles[currentSkipIndex] != pos);
}
if (includeTile) {
unsigned int atom1 = x*TILE_SIZE + tgx;
// Load atom data for this tile.
AtomData data;
data.field = make_real3(0);
data.fieldPolar = make_real3(0);
#ifdef USE_GK
data.gkField = make_real3(0);
#endif
loadAtomData(data, atom1, posq, labFrameDipole, labFrameQuadrupole, dampingAndThole);
#ifdef USE_GK
data.bornRadius = bornRadii[atom1];
#endif
#ifdef USE_CUTOFF
unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
#else
unsigned int j = y*TILE_SIZE + tgx;
#endif #endif
{ atomIndices[threadIdx.x] = j;
// Compute the full set of interactions in this tile. const unsigned int localAtomIndex = threadIdx.x;
loadAtomData(localData[localAtomIndex], j, posq, labFrameDipole, labFrameQuadrupole, dampingAndThole);
uint2 covalent = (hasExclusions ? covalentFlags[exclusionIndex[localGroupIndex]+tgx] : make_uint2(0, 0)); localData[localAtomIndex].field = make_real3(0);
unsigned int polarizationGroup = (hasExclusions ? polarizationGroupFlags[exclusionIndex[localGroupIndex]+tgx] : 0); localData[localAtomIndex].fieldPolar = make_real3(0);
unsigned int tj = tgx; #ifdef USE_GK
for (j = 0; j < TILE_SIZE; j++) { localData[localAtomIndex].bornRadius = bornRadii[j];
real3 delta = trimTo3(localData[tbx+tj].posq-data.posq); localData[localAtomIndex].gkField = make_real3(0);
#endif
// Compute the full set of interactions in this tile.
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
real3 delta = trimTo3(localData[tbx+tj].posq-data.posq);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
int atom2 = y*TILE_SIZE+tj; int atom2 = atomIndices[tbx+tj];
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 fields[4]; real3 fields[4];
float d = computeDScaleFactor(polarizationGroup, tj); computeOneInteraction(data, localData[tbx+tj], delta, 1, 1, fields);
float p = computePScaleFactor(covalent, polarizationGroup, tj); data.field += fields[0];
computeOneInteraction(data, localData[tbx+tj], delta, d, p, fields); data.fieldPolar += fields[1];
data.field += fields[0]; localData[tbx+tj].field += fields[2];
data.fieldPolar += fields[1]; localData[tbx+tj].fieldPolar += fields[3];
localData[tbx+tj].field += fields[2];
localData[tbx+tj].fieldPolar += fields[3];
#ifdef USE_GK #ifdef USE_GK
computeOneGkInteraction(data, localData[tbx+tj], delta, fields); computeOneGkInteraction(data, localData[tbx+tj], delta, fields);
data.gkField += fields[0]; data.gkField += fields[0];
localData[tbx+tj].gkField += fields[1]; localData[tbx+tj].gkField += fields[1];
#endif #endif
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
} }
tj = (tj + 1) & (TILE_SIZE - 1);
} }
}
// Write results.
// Write results.
unsigned int offset = x*TILE_SIZE + tgx;
if (pos < end) {
const unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&fieldBuffers[offset], static_cast<unsigned long long>((long long) (data.field.x*0x100000000))); atomicAdd(&fieldBuffers[offset], static_cast<unsigned long long>((long long) (data.field.x*0x100000000)));
atomicAdd(&fieldBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.y*0x100000000))); atomicAdd(&fieldBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.y*0x100000000)));
atomicAdd(&fieldBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.z*0x100000000))); atomicAdd(&fieldBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.z*0x100000000)));
...@@ -648,9 +670,11 @@ extern "C" __global__ void computeFixedField( ...@@ -648,9 +670,11 @@ extern "C" __global__ void computeFixedField(
atomicAdd(&gkFieldBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.gkField.y*0x100000000))); atomicAdd(&gkFieldBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.gkField.y*0x100000000)));
atomicAdd(&gkFieldBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.gkField.z*0x100000000))); atomicAdd(&gkFieldBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.gkField.z*0x100000000)));
#endif #endif
} #ifdef USE_CUTOFF
if (pos < end && x != y) { offset = atomIndices[threadIdx.x];
const unsigned int offset = y*TILE_SIZE + tgx; #else
offset = y*TILE_SIZE + tgx;
#endif
atomicAdd(&fieldBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.x*0x100000000))); atomicAdd(&fieldBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.x*0x100000000)));
atomicAdd(&fieldBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.y*0x100000000))); atomicAdd(&fieldBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.y*0x100000000)));
atomicAdd(&fieldBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.z*0x100000000))); atomicAdd(&fieldBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.z*0x100000000)));
...@@ -664,5 +688,5 @@ extern "C" __global__ void computeFixedField( ...@@ -664,5 +688,5 @@ extern "C" __global__ void computeFixedField(
#endif #endif
} }
pos++; pos++;
} while (pos < end); }
} }
#define TILE_SIZE 32
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE) #define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
typedef struct { typedef struct {
...@@ -199,194 +198,221 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real3 de ...@@ -199,194 +198,221 @@ __device__ void computeOneInteraction(AtomData& atom1, AtomData& atom2, real3 de
* Compute the mutual induced field. * Compute the mutual induced field.
*/ */
extern "C" __global__ void computeInducedField( extern "C" __global__ void computeInducedField(
unsigned long long* __restrict__ field, unsigned long long* __restrict__ fieldPolar, const real4* __restrict__ posq, unsigned long long* __restrict__ field, unsigned long long* __restrict__ fieldPolar, const real4* __restrict__ posq, const ushort2* __restrict__ exclusionTiles,
const real* __restrict__ inducedDipole, const real* __restrict__ inducedDipolePolar, unsigned int startTileIndex, unsigned int numTileIndices, const real* __restrict__ inducedDipole, const real* __restrict__ inducedDipolePolar, unsigned int startTileIndex, unsigned int numTileIndices,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags, const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const real4* __restrict__ blockCenter, const unsigned int* __restrict__ interactingAtoms,
#elif defined USE_GK #elif defined USE_GK
unsigned long long* __restrict__ fieldS, unsigned long long* __restrict__ fieldPolarS, const real* __restrict__ inducedDipoleS, unsigned long long* __restrict__ fieldS, unsigned long long* __restrict__ fieldPolarS, const real* __restrict__ inducedDipoleS,
const real* __restrict__ inducedDipolePolarS, const real* __restrict__ bornRadii, const real* __restrict__ inducedDipolePolarS, const real* __restrict__ bornRadii,
#endif #endif
const float2* __restrict__ dampingAndThole) { const float2* __restrict__ dampingAndThole) {
unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE; const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
#ifdef USE_CUTOFF const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int numTiles = interactionCount[0]; const unsigned int tbx = threadIdx.x - tgx;
unsigned int pos = (numTiles > maxTiles ? startTileIndex+warp*numTileIndices/totalWarps : warp*numTiles/totalWarps);
unsigned int end = (numTiles > maxTiles ? startTileIndex+(warp+1)*numTileIndices/totalWarps : (warp+1)*numTiles/totalWarps);
#else
const unsigned int numTiles = numTileIndices;
unsigned int pos = startTileIndex+warp*numTiles/totalWarps;
unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps;
#endif
__shared__ AtomData localData[THREAD_BLOCK_SIZE]; __shared__ AtomData localData[THREAD_BLOCK_SIZE];
#ifndef ENABLE_SHUFFLE
// __shared__ real tempBuffer[3*THREAD_BLOCK_SIZE]; // First loop: process tiles that contain exclusions.
#endif
do { const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
// Extract the coordinates of this tile const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const unsigned int tbx = threadIdx.x - tgx; const ushort2 tileIndices = exclusionTiles[pos];
unsigned int x, y; const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
AtomData data; AtomData data;
zeroAtomData(data); zeroAtomData(data);
if (pos < end) { unsigned int atom1 = x*TILE_SIZE + tgx;
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
y = tileIndices.y;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
unsigned int atom1 = x*TILE_SIZE + tgx;
#ifdef USE_GK #ifdef USE_GK
loadAtomData(data, atom1, posq, inducedDipole, inducedDipolePolar, dampingAndThole, inducedDipoleS, inducedDipolePolarS, bornRadii); loadAtomData(data, atom1, posq, inducedDipole, inducedDipolePolar, dampingAndThole, inducedDipoleS, inducedDipolePolarS, bornRadii);
#else #else
loadAtomData(data, atom1, posq, inducedDipole, inducedDipolePolar, dampingAndThole); loadAtomData(data, atom1, posq, inducedDipole, inducedDipolePolar, dampingAndThole);
#endif #endif
if (pos >= end) if (x == y) {
; // This warp is done. // This tile is on the diagonal.
else if (x == y) {
// This tile is on the diagonal. localData[threadIdx.x].pos = data.pos;
localData[threadIdx.x].inducedDipole = data.inducedDipole;
localData[threadIdx.x].pos = data.pos; localData[threadIdx.x].inducedDipolePolar = data.inducedDipolePolar;
localData[threadIdx.x].inducedDipole = data.inducedDipole; localData[threadIdx.x].thole = data.thole;
localData[threadIdx.x].inducedDipolePolar = data.inducedDipolePolar; localData[threadIdx.x].damp = data.damp;
localData[threadIdx.x].thole = data.thole;
localData[threadIdx.x].damp = data.damp;
#ifdef USE_GK #ifdef USE_GK
localData[threadIdx.x].inducedDipoleS = data.inducedDipoleS; localData[threadIdx.x].inducedDipoleS = data.inducedDipoleS;
localData[threadIdx.x].inducedDipolePolarS = data.inducedDipolePolarS; localData[threadIdx.x].inducedDipolePolarS = data.inducedDipolePolarS;
localData[threadIdx.x].bornRadius = data.bornRadius; localData[threadIdx.x].bornRadius = data.bornRadius;
#endif #endif
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
real3 delta = localData[tbx+j].pos-data.pos; real3 delta = localData[tbx+j].pos-data.pos;
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
int atom2 = y*TILE_SIZE+j; int atom2 = y*TILE_SIZE+j;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS)
computeOneInteraction(data, localData[tbx+j], delta, atom1 == atom2); computeOneInteraction(data, localData[tbx+j], delta, atom1 == atom2);
}
} }
else { }
// This is an off-diagonal tile. else {
// This is an off-diagonal tile.
#ifdef USE_GK #ifdef USE_GK
loadAtomData(localData[threadIdx.x], y*TILE_SIZE+tgx, posq, inducedDipole, inducedDipolePolar, dampingAndThole, inducedDipoleS, inducedDipolePolarS, bornRadii); loadAtomData(localData[threadIdx.x], y*TILE_SIZE+tgx, posq, inducedDipole, inducedDipolePolar, dampingAndThole, inducedDipoleS, inducedDipolePolarS, bornRadii);
#else #else
loadAtomData(localData[threadIdx.x], y*TILE_SIZE+tgx, posq, inducedDipole, inducedDipolePolar, dampingAndThole); loadAtomData(localData[threadIdx.x], y*TILE_SIZE+tgx, posq, inducedDipole, inducedDipolePolar, dampingAndThole);
#endif #endif
zeroAtomData(localData[threadIdx.x]); zeroAtomData(localData[threadIdx.x]);
#ifdef USE_CUTOFF unsigned int tj = tgx;
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF); for (unsigned int j = 0; j < TILE_SIZE; j++) {
if (flags == 0) { // TODO: Figure out what the flags != 0 case doesn't work!!! real3 delta = localData[tbx+tj].pos-data.pos;
// if (flags != 0xFFFFFFFF) {
if (flags == 0) {
// No interactions in this tile.
}
/* else {
// Compute only a subset of the interactions in this tile.
for (int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
int atom2 = tbx+j;
real3 delta = localData[atom2].pos-data.pos;
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
int atom2 = y*TILE_SIZE+j;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS)
computeOneInteraction(data, localData[tbx+tj], delta, false);
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
// Write results.
unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&field[offset], static_cast<unsigned long long>((long long) (data.field.x*0x100000000)));
atomicAdd(&field[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.y*0x100000000)));
atomicAdd(&field[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.z*0x100000000)));
atomicAdd(&fieldPolar[offset], static_cast<unsigned long long>((long long) (data.fieldPolar.x*0x100000000)));
atomicAdd(&fieldPolar[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolar.y*0x100000000)));
atomicAdd(&fieldPolar[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolar.z*0x100000000)));
#ifdef USE_GK
atomicAdd(&fieldS[offset], static_cast<unsigned long long>((long long) (data.fieldS.x*0x100000000)));
atomicAdd(&fieldS[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldS.y*0x100000000)));
atomicAdd(&fieldS[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldS.z*0x100000000)));
atomicAdd(&fieldPolarS[offset], static_cast<unsigned long long>((long long) (data.fieldPolarS.x*0x100000000)));
atomicAdd(&fieldPolarS[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolarS.y*0x100000000)));
atomicAdd(&fieldPolarS[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolarS.z*0x100000000)));
#endif
if (x != y) {
offset = y*TILE_SIZE + tgx;
atomicAdd(&field[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.x*0x100000000)));
atomicAdd(&field[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.y*0x100000000)));
atomicAdd(&field[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.z*0x100000000)));
atomicAdd(&fieldPolar[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.x*0x100000000)));
atomicAdd(&fieldPolar[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.y*0x100000000)));
atomicAdd(&fieldPolar[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolar.z*0x100000000)));
#ifdef USE_GK
atomicAdd(&fieldS[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldS.x*0x100000000)));
atomicAdd(&fieldS[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldS.y*0x100000000)));
atomicAdd(&fieldS[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldS.z*0x100000000)));
atomicAdd(&fieldPolarS[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolarS.x*0x100000000)));
atomicAdd(&fieldPolarS[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolarS.y*0x100000000)));
atomicAdd(&fieldPolarS[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fieldPolarS.z*0x100000000)));
#endif #endif
real3 fields[4]; }
computeOneInteraction(data, localData[atom2], delta, fields); }
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
data.field += fields[0]; // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
data.fieldPolar += fields[1]; // of them (no cutoff).
#ifdef ENABLE_SHUFFLE
for (int i = 16; i >= 1; i /= 2) { #ifdef USE_CUTOFF
fields[2].x += __shfl_xor(fields[2].x, i, 32); const unsigned int numTiles = interactionCount[0];
fields[2].y += __shfl_xor(fields[2].y, i, 32); int pos = (numTiles > maxTiles ? startTileIndex+warp*numTileIndices/totalWarps : warp*numTiles/totalWarps);
fields[2].z += __shfl_xor(fields[2].z, i, 32); int end = (numTiles > maxTiles ? startTileIndex+(warp+1)*numTileIndices/totalWarps : (warp+1)*numTiles/totalWarps);
fields[3].x += __shfl_xor(fields[3].x, i, 32);
fields[3].y += __shfl_xor(fields[3].y, i, 32);
fields[3].z += __shfl_xor(fields[3].z, i, 32);
}
if (tgx == 0) {
localData[atom2].field += fields[2];
localData[atom2].fieldPolar += fields[3];
}
#else #else
int bufferIndex = 3*threadIdx.x; const unsigned int numTiles = numTileIndices;
tempBuffer[bufferIndex] = fields[2].x; int pos = startTileIndex+warp*numTiles/totalWarps;
tempBuffer[bufferIndex+1] = fields[2].y; int end = startTileIndex+(warp+1)*numTiles/totalWarps;
tempBuffer[bufferIndex+2] = fields[2].z; #endif
if (tgx % 4 == 0) { int skipBase = 0;
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9]; int currentSkipIndex = tbx;
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10]; __shared__ int atomIndices[THREAD_BLOCK_SIZE];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11]; __shared__ int skipTiles[THREAD_BLOCK_SIZE];
} skipTiles[threadIdx.x] = -1;
if (tgx == 0) {
localData[atom2].field.x += tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84]; while (pos < end) {
localData[atom2].field.y += tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85]; bool includeTile = true;
localData[atom2].field.z += tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
} // Extract the coordinates of this tile.
tempBuffer[bufferIndex] = fields[3].x;
tempBuffer[bufferIndex+1] = fields[3].y; unsigned int x, y;
tempBuffer[bufferIndex+2] = fields[3].z; #ifdef USE_CUTOFF
if (tgx % 4 == 0) { if (numTiles <= maxTiles) {
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9]; ushort2 tileIndices = tiles[pos];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10]; x = tileIndices.x;
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11]; }
} else
if (tgx == 0) {
localData[atom2].fieldPolar.x += tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
localData[atom2].fieldPolar.y += tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
localData[atom2].fieldPolar.z += tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
}
#endif #endif
} {
} y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
} x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}*/ if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
// Skip over tiles that have exclusions, since they were already processed.
while (skipTiles[tbx+TILE_SIZE-1] < pos) {
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx];
skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
} }
else else
skipTiles[threadIdx.x] = end;
skipBase += TILE_SIZE;
currentSkipIndex = tbx;
}
while (skipTiles[currentSkipIndex] < pos)
currentSkipIndex++;
includeTile = (skipTiles[currentSkipIndex] != pos);
}
if (includeTile) {
unsigned int atom1 = x*TILE_SIZE + tgx;
// Load atom data for this tile.
AtomData data;
zeroAtomData(data);
#ifdef USE_GK
loadAtomData(data, atom1, posq, inducedDipole, inducedDipolePolar, dampingAndThole, inducedDipoleS, inducedDipolePolarS, bornRadii);
#else
loadAtomData(data, atom1, posq, inducedDipole, inducedDipolePolar, dampingAndThole);
#endif
#ifdef USE_CUTOFF
unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
#else
unsigned int j = y*TILE_SIZE + tgx;
#endif #endif
{ atomIndices[threadIdx.x] = j;
// Compute the full set of interactions in this tile. #ifdef USE_GK
loadAtomData(localData[threadIdx.x], j, posq, inducedDipole, inducedDipolePolar, dampingAndThole, inducedDipoleS, inducedDipolePolarS, bornRadii);
#else
loadAtomData(localData[threadIdx.x], j, posq, inducedDipole, inducedDipolePolar, dampingAndThole);
#endif
zeroAtomData(localData[threadIdx.x]);
unsigned int tj = tgx; // Compute the full set of interactions in this tile.
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real3 delta = localData[tbx+tj].pos-data.pos; unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
real3 delta = localData[tbx+tj].pos-data.pos;
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
int atom2 = y*TILE_SIZE+j; int atom2 = atomIndices[tbx+tj];
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS)
computeOneInteraction(data, localData[tbx+tj], delta, false); computeOneInteraction(data, localData[tbx+tj], delta, false);
tj = (tj + 1) & (TILE_SIZE - 1); tj = (tj + 1) & (TILE_SIZE - 1);
}
}
} }
}
// Write results.
// Write results.
unsigned int offset = x*TILE_SIZE + tgx;
if (pos < end) {
const unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&field[offset], static_cast<unsigned long long>((long long) (data.field.x*0x100000000))); atomicAdd(&field[offset], static_cast<unsigned long long>((long long) (data.field.x*0x100000000)));
atomicAdd(&field[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.y*0x100000000))); atomicAdd(&field[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.y*0x100000000)));
atomicAdd(&field[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.z*0x100000000))); atomicAdd(&field[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.field.z*0x100000000)));
...@@ -401,9 +427,11 @@ extern "C" __global__ void computeInducedField( ...@@ -401,9 +427,11 @@ extern "C" __global__ void computeInducedField(
atomicAdd(&fieldPolarS[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolarS.y*0x100000000))); atomicAdd(&fieldPolarS[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolarS.y*0x100000000)));
atomicAdd(&fieldPolarS[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolarS.z*0x100000000))); atomicAdd(&fieldPolarS[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.fieldPolarS.z*0x100000000)));
#endif #endif
} #ifdef USE_CUTOFF
if (pos < end && x != y) { offset = atomIndices[threadIdx.x];
const unsigned int offset = y*TILE_SIZE + tgx; #else
offset = y*TILE_SIZE + tgx;
#endif
atomicAdd(&field[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.x*0x100000000))); atomicAdd(&field[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.x*0x100000000)));
atomicAdd(&field[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.y*0x100000000))); atomicAdd(&field[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.y*0x100000000)));
atomicAdd(&field[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.z*0x100000000))); atomicAdd(&field[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].field.z*0x100000000)));
...@@ -420,7 +448,7 @@ extern "C" __global__ void computeInducedField( ...@@ -420,7 +448,7 @@ extern "C" __global__ void computeInducedField(
#endif #endif
} }
pos++; pos++;
} while (pos < end); }
} }
extern "C" __global__ void updateInducedFieldBySOR(const long long* __restrict__ fixedField, const long long* __restrict__ fixedFieldPolar, extern "C" __global__ void updateInducedFieldBySOR(const long long* __restrict__ fixedField, const long long* __restrict__ fixedFieldPolar,
......
#define ARRAY(x,y) array[(x)-1+((y)-1)*PME_ORDER] #define ARRAY(x,y) array[(x)-1+((y)-1)*PME_ORDER]
/** /**
* This is called from updateBsplines(). It calculates the spline coefficients for a single atom along a single axis. * Calculate the spline coefficients for a single atom along a single axis.
*/ */
__device__ void computeBSplinePoint(real4* thetai, real w, real* array) { __device__ void computeBSplinePoint(real4* thetai, real w, real* array) {
// initialization to get to 2nd order recursion // initialization to get to 2nd order recursion
...@@ -70,15 +70,10 @@ __device__ void computeBSplinePoint(real4* thetai, real w, real* array) { ...@@ -70,15 +70,10 @@ __device__ void computeBSplinePoint(real4* thetai, real w, real* array) {
} }
/** /**
* Compute bspline coefficients. * Compute the index of the grid point each atom is associated with.
*/ */
extern "C" __global__ void updateBsplines(const real4* __restrict__ posq, int4* __restrict__ igrid, int2* __restrict__ pmeAtomGridIndex, extern "C" __global__ void findAtomGridIndex(const real4* __restrict__ posq, int2* __restrict__ pmeAtomGridIndex,
real4* __restrict__ theta1, real4* __restrict__ theta2, real4* __restrict__ theta3, real4 periodicBoxSize, real4 invPeriodicBoxSize) { real4 periodicBoxSize, real4 invPeriodicBoxSize) {
extern __shared__ real bsplines_cache[]; // size = block_size*pme_order*pme_order
real* array = &bsplines_cache[threadIdx.x*PME_ORDER*PME_ORDER];
// get the B-spline coefficients for each multipole site
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) { for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
real4 pos = posq[i]; real4 pos = posq[i];
pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x; pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
...@@ -90,256 +85,226 @@ extern "C" __global__ void updateBsplines(const real4* __restrict__ posq, int4* ...@@ -90,256 +85,226 @@ extern "C" __global__ void updateBsplines(const real4* __restrict__ posq, int4*
real w = pos.x*invPeriodicBoxSize.x; real w = pos.x*invPeriodicBoxSize.x;
real fr = GRID_SIZE_X*(w-(int)(w+0.5f)+0.5f); real fr = GRID_SIZE_X*(w-(int)(w+0.5f)+0.5f);
int ifr = (int) fr; int ifr = (int) fr;
w = fr - ifr;
int igrid1 = ifr-PME_ORDER+1; int igrid1 = ifr-PME_ORDER+1;
computeBSplinePoint(&theta1[i*PME_ORDER], w, array);
// Second axis. // Second axis.
w = pos.y*invPeriodicBoxSize.y; w = pos.y*invPeriodicBoxSize.y;
fr = GRID_SIZE_Y*(w-(int)(w+0.5f)+0.5f); fr = GRID_SIZE_Y*(w-(int)(w+0.5f)+0.5f);
ifr = (int) fr; ifr = (int) fr;
w = fr - ifr;
int igrid2 = ifr-PME_ORDER+1; int igrid2 = ifr-PME_ORDER+1;
computeBSplinePoint(&theta2[i*PME_ORDER], w, array);
// Third axis. // Third axis.
w = pos.z*invPeriodicBoxSize.z; w = pos.z*invPeriodicBoxSize.z;
fr = GRID_SIZE_Z*(w-(int)(w+0.5f)+0.5f); fr = GRID_SIZE_Z*(w-(int)(w+0.5f)+0.5f);
ifr = (int) fr; ifr = (int) fr;
w = fr - ifr;
int igrid3 = ifr-PME_ORDER+1; int igrid3 = ifr-PME_ORDER+1;
computeBSplinePoint(&theta3[i*PME_ORDER], w, array);
// Record the grid point. // Record the grid point.
igrid1 += (igrid1 < 0 ? GRID_SIZE_X : 0); igrid1 += (igrid1 < 0 ? GRID_SIZE_X : 0);
igrid2 += (igrid2 < 0 ? GRID_SIZE_Y : 0); igrid2 += (igrid2 < 0 ? GRID_SIZE_Y : 0);
igrid3 += (igrid3 < 0 ? GRID_SIZE_Z : 0); igrid3 += (igrid3 < 0 ? GRID_SIZE_Z : 0);
igrid[i] = make_int4(igrid1, igrid2, igrid3, 0);
pmeAtomGridIndex[i] = make_int2(i, igrid1*GRID_SIZE_Y*GRID_SIZE_Z+igrid2*GRID_SIZE_Z+igrid3); pmeAtomGridIndex[i] = make_int2(i, igrid1*GRID_SIZE_Y*GRID_SIZE_Z+igrid2*GRID_SIZE_Z+igrid3);
} }
} }
/**
* For each grid point, find the range of sorted atoms associated with that point.
*/
extern "C" __global__ void findAtomRangeForGrid(int2* __restrict__ pmeAtomGridIndex, int* __restrict__ pmeAtomRange,
const real4* __restrict__ posq, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
int thread = blockIdx.x*blockDim.x+threadIdx.x;
int start = (NUM_ATOMS*thread)/(blockDim.x*gridDim.x);
int end = (NUM_ATOMS*(thread+1))/(blockDim.x*gridDim.x);
int last = (start == 0 ? -1 : pmeAtomGridIndex[start-1].y);
for (int i = start; i < end; ++i) {
int2 atomData = pmeAtomGridIndex[i];
int gridIndex = atomData.y;
if (gridIndex != last) {
for (int j = last+1; j <= gridIndex; ++j)
pmeAtomRange[j] = i;
last = gridIndex;
}
}
// Fill in values beyond the last atom.
if (thread == blockDim.x*gridDim.x-1) {
int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
for (int j = last+1; j <= gridSize; ++j)
pmeAtomRange[j] = NUM_ATOMS;
}
}
/**
* The grid index won't be needed again. Reuse that component to hold the z index, thus saving
* some work in the charge spreading kernel.
*/
extern "C" __global__ void recordZIndex(int2* __restrict__ pmeAtomGridIndex, const real4* __restrict__ posq, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
int thread = blockIdx.x*blockDim.x+threadIdx.x;
int start = (NUM_ATOMS*thread)/(blockDim.x*gridDim.x);
int end = (NUM_ATOMS*(thread+1))/(blockDim.x*gridDim.x);
for (int i = start; i < end; ++i) {
real posz = posq[pmeAtomGridIndex[i].x].z;
posz -= floor(posz*invPeriodicBoxSize.z)*periodicBoxSize.z;
real w = posz*invPeriodicBoxSize.z;
real fr = GRID_SIZE_Z*(w-(int)(w+0.5f)+0.5f);
int z = ((int) fr)-PME_ORDER+1;
pmeAtomGridIndex[i].y = z;
}
}
extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ posq, const real* __restrict__ labFrameDipole, extern "C" __global__ void gridSpreadFixedMultipoles(const real4* __restrict__ posq, const real* __restrict__ labFrameDipole,
const real* __restrict__ labFrameQuadrupole, real2* __restrict__ pmeGrid, int2* __restrict__ pmeAtomGridIndex, int* __restrict__ pmeAtomRange, const real* __restrict__ labFrameQuadrupole, real2* __restrict__ pmeGrid, int2* __restrict__ pmeAtomGridIndex,
const real4* __restrict__ theta1, const real4* __restrict__ theta2, const real4* __restrict__ theta3, real4 invPeriodicBoxSize) { real4 periodicBoxSize, real4 invPeriodicBoxSize) {
const real xscale = GRID_SIZE_X*invPeriodicBoxSize.x; const real xscale = GRID_SIZE_X*invPeriodicBoxSize.x;
const real yscale = GRID_SIZE_Y*invPeriodicBoxSize.y; const real yscale = GRID_SIZE_Y*invPeriodicBoxSize.y;
const real zscale = GRID_SIZE_Z*invPeriodicBoxSize.z; const real zscale = GRID_SIZE_Z*invPeriodicBoxSize.z;
unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z; real array[PME_ORDER*PME_ORDER];
unsigned int numThreads = gridDim.x*blockDim.x; real4 theta1[PME_ORDER];
for (int gridIndex = blockIdx.x*blockDim.x+threadIdx.x; gridIndex < numGridPoints; gridIndex += numThreads) { real4 theta2[PME_ORDER];
int3 gridPoint; real4 theta3[PME_ORDER];
gridPoint.x = gridIndex/(GRID_SIZE_Y*GRID_SIZE_Z);
int remainder = gridIndex-gridPoint.x*GRID_SIZE_Y*GRID_SIZE_Z; // Process the atoms in spatially sorted order. This improves cache performance when loading
gridPoint.y = remainder/GRID_SIZE_Z; // the grid values.
gridPoint.z = remainder-gridPoint.y*GRID_SIZE_Z;
real result = 0; for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
for (int ix = 0; ix < PME_ORDER; ++ix) { int m = pmeAtomGridIndex[i].x;
int x = gridPoint.x-ix+(gridPoint.x >= ix ? 0 : GRID_SIZE_X); real4 pos = posq[m];
for (int iy = 0; iy < PME_ORDER; ++iy) { pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
int y = gridPoint.y-iy+(gridPoint.y >= iy ? 0 : GRID_SIZE_Y); pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
int z1 = gridPoint.z-PME_ORDER+1; pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
z1 += (z1 >= 0 ? 0 : GRID_SIZE_Z);
int z2 = (z1 < gridPoint.z ? gridPoint.z : GRID_SIZE_Z-1); // Since we need the full set of thetas, it's faster to compute them here than load them
int gridIndex1 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z1; // from global memory.
int gridIndex2 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z2;
int firstAtom = pmeAtomRange[gridIndex1]; real w = pos.x*invPeriodicBoxSize.x;
int lastAtom = pmeAtomRange[gridIndex2+1]; real fr = GRID_SIZE_X*(w-(int)(w+0.5f)+0.5f);
for (int i = firstAtom; i < lastAtom; ++i) { int ifr = (int) fr;
int2 atomData = pmeAtomGridIndex[i]; w = fr - ifr;
int atomIndex = atomData.x; int igrid1 = ifr-PME_ORDER+1;
int z = atomData.y; computeBSplinePoint(theta1, w, array);
int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z); w = pos.y*invPeriodicBoxSize.y;
if (iz >= GRID_SIZE_Z) fr = GRID_SIZE_Y*(w-(int)(w+0.5f)+0.5f);
iz -= GRID_SIZE_Z; ifr = (int) fr;
real atomCharge = posq[atomIndex].w; w = fr - ifr;
real atomDipoleX = xscale*labFrameDipole[atomIndex*3]; int igrid2 = ifr-PME_ORDER+1;
real atomDipoleY = yscale*labFrameDipole[atomIndex*3+1]; computeBSplinePoint(theta2, w, array);
real atomDipoleZ = zscale*labFrameDipole[atomIndex*3+2]; w = pos.z*invPeriodicBoxSize.z;
real atomQuadrupoleXX = xscale*xscale*labFrameQuadrupole[atomIndex*5]; fr = GRID_SIZE_Z*(w-(int)(w+0.5f)+0.5f);
real atomQuadrupoleXY = 2*xscale*yscale*labFrameQuadrupole[atomIndex*5+1]; ifr = (int) fr;
real atomQuadrupoleXZ = 2*xscale*zscale*labFrameQuadrupole[atomIndex*5+2]; w = fr - ifr;
real atomQuadrupoleYY = yscale*yscale*labFrameQuadrupole[atomIndex*5+3]; int igrid3 = ifr-PME_ORDER+1;
real atomQuadrupoleYZ = 2*yscale*zscale*labFrameQuadrupole[atomIndex*5+4]; computeBSplinePoint(theta3, w, array);
real atomQuadrupoleZZ = -zscale*zscale*(labFrameQuadrupole[atomIndex*5]+labFrameQuadrupole[atomIndex*5+3]); igrid1 += (igrid1 < 0 ? GRID_SIZE_X : 0);
real4 t = theta1[atomIndex*PME_ORDER+ix]; igrid2 += (igrid2 < 0 ? GRID_SIZE_Y : 0);
real4 u = theta2[atomIndex*PME_ORDER+iy]; igrid3 += (igrid3 < 0 ? GRID_SIZE_Z : 0);
real4 v = theta3[atomIndex*PME_ORDER+iz];
// Spread the charge from this atom onto each grid point.
for (int ix = 0; ix < PME_ORDER; ix++) {
int xbase = igrid1+ix;
xbase -= (xbase >= GRID_SIZE_X ? GRID_SIZE_X : 0);
xbase = xbase*GRID_SIZE_Y*GRID_SIZE_Z;
real4 t = theta1[ix];
for (int iy = 0; iy < PME_ORDER; iy++) {
int ybase = igrid2+iy;
ybase -= (ybase >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
ybase = xbase + ybase*GRID_SIZE_Z;
real4 u = theta2[iy];
for (int iz = 0; iz < PME_ORDER; iz++) {
int zindex = igrid3+iz;
zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
int index = ybase + zindex;
real4 v = theta3[iz];
real atomCharge = pos.w;
real atomDipoleX = xscale*labFrameDipole[m*3];
real atomDipoleY = yscale*labFrameDipole[m*3+1];
real atomDipoleZ = zscale*labFrameDipole[m*3+2];
real atomQuadrupoleXX = xscale*xscale*labFrameQuadrupole[m*5];
real atomQuadrupoleXY = 2*xscale*yscale*labFrameQuadrupole[m*5+1];
real atomQuadrupoleXZ = 2*xscale*zscale*labFrameQuadrupole[m*5+2];
real atomQuadrupoleYY = yscale*yscale*labFrameQuadrupole[m*5+3];
real atomQuadrupoleYZ = 2*yscale*zscale*labFrameQuadrupole[m*5+4];
real atomQuadrupoleZZ = -zscale*zscale*(labFrameQuadrupole[m*5]+labFrameQuadrupole[m*5+3]);
real term0 = atomCharge*u.x*v.x + atomDipoleY*u.y*v.x + atomDipoleZ*u.x*v.y + atomQuadrupoleYY*u.z*v.x + atomQuadrupoleZZ*u.x*v.z + atomQuadrupoleYZ*u.y*v.y; real term0 = atomCharge*u.x*v.x + atomDipoleY*u.y*v.x + atomDipoleZ*u.x*v.y + atomQuadrupoleYY*u.z*v.x + atomQuadrupoleZZ*u.x*v.z + atomQuadrupoleYZ*u.y*v.y;
real term1 = atomDipoleX*u.x*v.x + atomQuadrupoleXY*u.y*v.x + atomQuadrupoleXZ*u.x*v.y; real term1 = atomDipoleX*u.x*v.x + atomQuadrupoleXY*u.y*v.x + atomQuadrupoleXZ*u.x*v.y;
real term2 = atomQuadrupoleXX * u.x * v.x; real term2 = atomQuadrupoleXX * u.x * v.x;
result += term0*t.x + term1*t.y + term2*t.z; real add = term0*t.x + term1*t.y + term2*t.z;
} #ifdef USE_DOUBLE_PRECISION
if (z1 > gridPoint.z) { unsigned long long * ulonglong_p = (unsigned long long *) pmeGrid;
gridIndex1 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z; atomicAdd(&ulonglong_p[2*index], static_cast<unsigned long long>((long long) (add*0x100000000)));
gridIndex2 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+gridPoint.z; #else
firstAtom = pmeAtomRange[gridIndex1]; atomicAdd(&pmeGrid[index].x, add);
lastAtom = pmeAtomRange[gridIndex2+1]; #endif
for (int i = firstAtom; i < lastAtom; ++i) {
int2 atomData = pmeAtomGridIndex[i];
int atomIndex = atomData.x;
int z = atomData.y;
int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
if (iz >= GRID_SIZE_Z)
iz -= GRID_SIZE_Z;
real atomCharge = posq[atomIndex].w;
real atomDipoleX = xscale*labFrameDipole[atomIndex*3];
real atomDipoleY = yscale*labFrameDipole[atomIndex*3+1];
real atomDipoleZ = zscale*labFrameDipole[atomIndex*3+2];
real atomQuadrupoleXX = xscale*xscale*labFrameQuadrupole[atomIndex*5];
real atomQuadrupoleXY = 2*xscale*yscale*labFrameQuadrupole[atomIndex*5+1];
real atomQuadrupoleXZ = 2*xscale*zscale*labFrameQuadrupole[atomIndex*5+2];
real atomQuadrupoleYY = yscale*yscale*labFrameQuadrupole[atomIndex*5+3];
real atomQuadrupoleYZ = 2*yscale*zscale*labFrameQuadrupole[atomIndex*5+4];
real atomQuadrupoleZZ = -zscale*zscale*(labFrameQuadrupole[atomIndex*5]+labFrameQuadrupole[atomIndex*5+3]);
real4 t = theta1[atomIndex*PME_ORDER+ix];
real4 u = theta2[atomIndex*PME_ORDER+iy];
real4 v = theta3[atomIndex*PME_ORDER+iz];
real term0 = atomCharge*u.x*v.x + atomDipoleY*u.y*v.x + atomDipoleZ*u.x*v.y + atomQuadrupoleYY*u.z*v.x + atomQuadrupoleZZ*u.x*v.z + atomQuadrupoleYZ*u.y*v.y;
real term1 = atomDipoleX*u.x*v.x + atomQuadrupoleXY*u.y*v.x + atomQuadrupoleXZ*u.x*v.y;
real term2 = atomQuadrupoleXX * u.x * v.x;
result += term0*t.x + term1*t.y + term2*t.z;
}
} }
} }
} }
pmeGrid[gridIndex] = make_real2(result, 0);
} }
} }
extern "C" __global__ void gridSpreadInducedDipoles(const real4* __restrict__ posq, const real* __restrict__ inducedDipole, extern "C" __global__ void gridSpreadInducedDipoles(const real4* __restrict__ posq, const real* __restrict__ inducedDipole,
const real* __restrict__ inducedDipolePolar, real2* __restrict__ pmeGrid, int2* __restrict__ pmeAtomGridIndex, int* __restrict__ pmeAtomRange, const real* __restrict__ inducedDipolePolar, real2* __restrict__ pmeGrid, int2* __restrict__ pmeAtomGridIndex,
const real4* __restrict__ theta1, const real4* __restrict__ theta2, const real4* __restrict__ theta3, real4 invPeriodicBoxSize) { real4 periodicBoxSize, real4 invPeriodicBoxSize) {
const real xscale = GRID_SIZE_X*invPeriodicBoxSize.x; const real xscale = GRID_SIZE_X*invPeriodicBoxSize.x;
const real yscale = GRID_SIZE_Y*invPeriodicBoxSize.y; const real yscale = GRID_SIZE_Y*invPeriodicBoxSize.y;
const real zscale = GRID_SIZE_Z*invPeriodicBoxSize.z; const real zscale = GRID_SIZE_Z*invPeriodicBoxSize.z;
unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z; real array[PME_ORDER*PME_ORDER];
unsigned int numThreads = gridDim.x*blockDim.x; real4 theta1[PME_ORDER];
for (int gridIndex = blockIdx.x*blockDim.x+threadIdx.x; gridIndex < numGridPoints; gridIndex += numThreads) { real4 theta2[PME_ORDER];
int3 gridPoint; real4 theta3[PME_ORDER];
gridPoint.x = gridIndex/(GRID_SIZE_Y*GRID_SIZE_Z);
int remainder = gridIndex-gridPoint.x*GRID_SIZE_Y*GRID_SIZE_Z; // Process the atoms in spatially sorted order. This improves cache performance when loading
gridPoint.y = remainder/GRID_SIZE_Z; // the grid values.
gridPoint.z = remainder-gridPoint.y*GRID_SIZE_Z;
real2 result = make_real2(0, 0); for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
for (int ix = 0; ix < PME_ORDER; ++ix) { int m = pmeAtomGridIndex[i].x;
int x = gridPoint.x-ix+(gridPoint.x >= ix ? 0 : GRID_SIZE_X); real4 pos = posq[m];
for (int iy = 0; iy < PME_ORDER; ++iy) { pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
int y = gridPoint.y-iy+(gridPoint.y >= iy ? 0 : GRID_SIZE_Y); pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
int z1 = gridPoint.z-PME_ORDER+1; pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
z1 += (z1 >= 0 ? 0 : GRID_SIZE_Z);
int z2 = (z1 < gridPoint.z ? gridPoint.z : GRID_SIZE_Z-1); // Since we need the full set of thetas, it's faster to compute them here than load them
int gridIndex1 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z1; // from global memory.
int gridIndex2 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z2;
int firstAtom = pmeAtomRange[gridIndex1]; real w = pos.x*invPeriodicBoxSize.x;
int lastAtom = pmeAtomRange[gridIndex2+1]; real fr = GRID_SIZE_X*(w-(int)(w+0.5f)+0.5f);
for (int i = firstAtom; i < lastAtom; ++i) { int ifr = (int) fr;
int2 atomData = pmeAtomGridIndex[i]; w = fr - ifr;
int atomIndex = atomData.x; int igrid1 = ifr-PME_ORDER+1;
int z = atomData.y; computeBSplinePoint(theta1, w, array);
int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z); w = pos.y*invPeriodicBoxSize.y;
if (iz >= GRID_SIZE_Z) fr = GRID_SIZE_Y*(w-(int)(w+0.5f)+0.5f);
iz -= GRID_SIZE_Z; ifr = (int) fr;
real inducedDipoleX = xscale*inducedDipole[atomIndex*3]; w = fr - ifr;
real inducedDipoleY = yscale*inducedDipole[atomIndex*3+1]; int igrid2 = ifr-PME_ORDER+1;
real inducedDipoleZ = zscale*inducedDipole[atomIndex*3+2]; computeBSplinePoint(theta2, w, array);
real inducedDipolePolarX = xscale*inducedDipolePolar[atomIndex*3]; w = pos.z*invPeriodicBoxSize.z;
real inducedDipolePolarY = yscale*inducedDipolePolar[atomIndex*3+1]; fr = GRID_SIZE_Z*(w-(int)(w+0.5f)+0.5f);
real inducedDipolePolarZ = zscale*inducedDipolePolar[atomIndex*3+2]; ifr = (int) fr;
real4 t = theta1[atomIndex*PME_ORDER+ix]; w = fr - ifr;
real4 u = theta2[atomIndex*PME_ORDER+iy]; int igrid3 = ifr-PME_ORDER+1;
real4 v = theta3[atomIndex*PME_ORDER+iz]; computeBSplinePoint(theta3, w, array);
igrid1 += (igrid1 < 0 ? GRID_SIZE_X : 0);
igrid2 += (igrid2 < 0 ? GRID_SIZE_Y : 0);
igrid3 += (igrid3 < 0 ? GRID_SIZE_Z : 0);
// Spread the charge from this atom onto each grid point.
for (int ix = 0; ix < PME_ORDER; ix++) {
int xbase = igrid1+ix;
xbase -= (xbase >= GRID_SIZE_X ? GRID_SIZE_X : 0);
xbase = xbase*GRID_SIZE_Y*GRID_SIZE_Z;
real4 t = theta1[ix];
for (int iy = 0; iy < PME_ORDER; iy++) {
int ybase = igrid2+iy;
ybase -= (ybase >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
ybase = xbase + ybase*GRID_SIZE_Z;
real4 u = theta2[iy];
for (int iz = 0; iz < PME_ORDER; iz++) {
int zindex = igrid3+iz;
zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
int index = ybase + zindex;
real4 v = theta3[iz];
real inducedDipoleX = xscale*inducedDipole[m*3];
real inducedDipoleY = yscale*inducedDipole[m*3+1];
real inducedDipoleZ = zscale*inducedDipole[m*3+2];
real inducedDipolePolarX = xscale*inducedDipolePolar[m*3];
real inducedDipolePolarY = yscale*inducedDipolePolar[m*3+1];
real inducedDipolePolarZ = zscale*inducedDipolePolar[m*3+2];
real term01 = inducedDipoleY*u.y*v.x + inducedDipoleZ*u.x*v.y; real term01 = inducedDipoleY*u.y*v.x + inducedDipoleZ*u.x*v.y;
real term11 = inducedDipoleX*u.x*v.x; real term11 = inducedDipoleX*u.x*v.x;
real term02 = inducedDipolePolarY*u.y*v.x + inducedDipolePolarZ*u.x*v.y; real term02 = inducedDipolePolarY*u.y*v.x + inducedDipolePolarZ*u.x*v.y;
real term12 = inducedDipolePolarX*u.x*v.x; real term12 = inducedDipolePolarX*u.x*v.x;
result.x += term01*t.x + term11*t.y; real add1 = term01*t.x + term11*t.y;
result.y += term02*t.x + term12*t.y; real add2 = term02*t.x + term12*t.y;
} #ifdef USE_DOUBLE_PRECISION
if (z1 > gridPoint.z) { unsigned long long * ulonglong_p = (unsigned long long *) pmeGrid;
gridIndex1 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z; atomicAdd(&ulonglong_p[2*index], static_cast<unsigned long long>((long long) (add1*0x100000000)));
gridIndex2 = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+gridPoint.z; atomicAdd(&ulonglong_p[2*index+1], static_cast<unsigned long long>((long long) (add2*0x100000000)));
firstAtom = pmeAtomRange[gridIndex1]; #else
lastAtom = pmeAtomRange[gridIndex2+1]; atomicAdd(&pmeGrid[index].x, add1);
for (int i = firstAtom; i < lastAtom; ++i) { atomicAdd(&pmeGrid[index].y, add2);
int2 atomData = pmeAtomGridIndex[i]; #endif
int atomIndex = atomData.x;
int z = atomData.y;
int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
if (iz >= GRID_SIZE_Z)
iz -= GRID_SIZE_Z;
real inducedDipoleX = xscale*inducedDipole[atomIndex*3];
real inducedDipoleY = yscale*inducedDipole[atomIndex*3+1];
real inducedDipoleZ = zscale*inducedDipole[atomIndex*3+2];
real inducedDipolePolarX = xscale*inducedDipolePolar[atomIndex*3];
real inducedDipolePolarY = yscale*inducedDipolePolar[atomIndex*3+1];
real inducedDipolePolarZ = zscale*inducedDipolePolar[atomIndex*3+2];
real4 t = theta1[atomIndex*PME_ORDER+ix];
real4 u = theta2[atomIndex*PME_ORDER+iy];
real4 v = theta3[atomIndex*PME_ORDER+iz];
real term01 = inducedDipoleY*u.y*v.x + inducedDipoleZ*u.x*v.y;
real term11 = inducedDipoleX*u.x*v.x;
real term02 = inducedDipolePolarY*u.y*v.x + inducedDipolePolarZ*u.x*v.y;
real term12 = inducedDipolePolarX*u.x*v.x;
result.x += term01*t.x + term11*t.y;
result.y += term02*t.x + term12*t.y;
}
} }
} }
} }
pmeGrid[gridIndex] = result;
} }
} }
/**
* In double precision, we have to use fixed point to accumulate the grid values, so convert them to floating point.
*/
extern "C" __global__ void finishSpreadCharge(long long* __restrict__ pmeGrid) {
real* floatGrid = (real*) pmeGrid;
const unsigned int gridSize = 2*GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
real scale = 1/(real) 0x100000000;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x)
floatGrid[index] = scale*pmeGrid[index];
}
extern "C" __global__ void reciprocalConvolution(real2* __restrict__ pmeGrid, const real* __restrict__ pmeBsplineModuliX, extern "C" __global__ void reciprocalConvolution(real2* __restrict__ pmeGrid, const real* __restrict__ pmeBsplineModuliX,
const real* __restrict__ pmeBsplineModuliY, const real* __restrict__ pmeBsplineModuliZ, real4 periodicBoxSize, real4 invPeriodicBoxSize) { const real* __restrict__ pmeBsplineModuliY, const real* __restrict__ pmeBsplineModuliZ, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z; const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
...@@ -372,12 +337,50 @@ extern "C" __global__ void reciprocalConvolution(real2* __restrict__ pmeGrid, co ...@@ -372,12 +337,50 @@ extern "C" __global__ void reciprocalConvolution(real2* __restrict__ pmeGrid, co
} }
extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict__ pmeGrid, real* __restrict__ phi, extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict__ pmeGrid, real* __restrict__ phi,
long long* __restrict__ fieldBuffers, long long* __restrict__ fieldPolarBuffers, const int4* __restrict__ igrid, const real4* __restrict__ theta1, long long* __restrict__ fieldBuffers, long long* __restrict__ fieldPolarBuffers, const real4* __restrict__ posq,
const real4* __restrict__ theta2, const real4* __restrict__ theta3, const real* __restrict__ labFrameDipole, real4 invPeriodicBoxSize) { const real* __restrict__ labFrameDipole, real4 periodicBoxSize, real4 invPeriodicBoxSize, int2* __restrict__ pmeAtomGridIndex) {
// extract the permanent multipole field at each site real array[PME_ORDER*PME_ORDER];
real4 theta1[PME_ORDER];
real4 theta2[PME_ORDER];
real4 theta3[PME_ORDER];
// Process the atoms in spatially sorted order. This improves cache performance when loading
// the grid values.
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
int m = pmeAtomGridIndex[i].x;
real4 pos = posq[m];
pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
// Since we need the full set of thetas, it's faster to compute them here than load them
// from global memory.
real w = pos.x*invPeriodicBoxSize.x;
real fr = GRID_SIZE_X*(w-(int)(w+0.5f)+0.5f);
int ifr = (int) fr;
w = fr - ifr;
int igrid1 = ifr-PME_ORDER+1;
computeBSplinePoint(theta1, w, array);
w = pos.y*invPeriodicBoxSize.y;
fr = GRID_SIZE_Y*(w-(int)(w+0.5f)+0.5f);
ifr = (int) fr;
w = fr - ifr;
int igrid2 = ifr-PME_ORDER+1;
computeBSplinePoint(theta2, w, array);
w = pos.z*invPeriodicBoxSize.z;
fr = GRID_SIZE_Z*(w-(int)(w+0.5f)+0.5f);
ifr = (int) fr;
w = fr - ifr;
int igrid3 = ifr-PME_ORDER+1;
computeBSplinePoint(theta3, w, array);
igrid1 += (igrid1 < 0 ? GRID_SIZE_X : 0);
igrid2 += (igrid2 < 0 ? GRID_SIZE_Y : 0);
igrid3 += (igrid3 < 0 ? GRID_SIZE_Z : 0);
// Compute the potential from this grid point.
for (int m = blockIdx.x*blockDim.x+threadIdx.x; m < NUM_ATOMS; m += blockDim.x*gridDim.x) {
int4 gridPoint = igrid[m];
real tuv000 = 0; real tuv000 = 0;
real tuv001 = 0; real tuv001 = 0;
real tuv010 = 0; real tuv010 = 0;
...@@ -399,8 +402,8 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict ...@@ -399,8 +402,8 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
real tuv012 = 0; real tuv012 = 0;
real tuv111 = 0; real tuv111 = 0;
for (int iz = 0; iz < PME_ORDER; iz++) { for (int iz = 0; iz < PME_ORDER; iz++) {
int k = gridPoint.z+iz-(gridPoint.z+iz >= GRID_SIZE_Z ? GRID_SIZE_Z : 0); int k = igrid3+iz-(igrid3+iz >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
real4 v = theta3[m*PME_ORDER+iz]; real4 v = theta3[iz];
real tu00 = 0; real tu00 = 0;
real tu10 = 0; real tu10 = 0;
real tu01 = 0; real tu01 = 0;
...@@ -412,14 +415,14 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict ...@@ -412,14 +415,14 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
real tu12 = 0; real tu12 = 0;
real tu03 = 0; real tu03 = 0;
for (int iy = 0; iy < PME_ORDER; iy++) { for (int iy = 0; iy < PME_ORDER; iy++) {
int j = gridPoint.y+iy-(gridPoint.y+iy >= GRID_SIZE_Y ? GRID_SIZE_Y : 0); int j = igrid2+iy-(igrid2+iy >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
real4 u = theta2[m*PME_ORDER+iy]; real4 u = theta2[iy];
real4 t = make_real4(0, 0, 0, 0); real4 t = make_real4(0, 0, 0, 0);
for (int ix = 0; ix < PME_ORDER; ix++) { for (int ix = 0; ix < PME_ORDER; ix++) {
int i = gridPoint.x+ix-(gridPoint.x+ix >= GRID_SIZE_X ? GRID_SIZE_X : 0); int i = igrid1+ix-(igrid1+ix >= GRID_SIZE_X ? GRID_SIZE_X : 0);
int gridIndex = i*GRID_SIZE_Y*GRID_SIZE_Z + j*GRID_SIZE_Z + k; int gridIndex = i*GRID_SIZE_Y*GRID_SIZE_Z + j*GRID_SIZE_Z + k;
real tq = pmeGrid[gridIndex].x; real tq = pmeGrid[gridIndex].x;
real4 tadd = theta1[m*PME_ORDER+ix]; real4 tadd = theta1[ix];
t.x += tq*tadd.x; t.x += tq*tadd.x;
t.y += tq*tadd.y; t.y += tq*tadd.y;
t.z += tq*tadd.z; t.z += tq*tadd.z;
...@@ -491,12 +494,50 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict ...@@ -491,12 +494,50 @@ extern "C" __global__ void computeFixedPotentialFromGrid(const real2* __restrict
} }
extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restrict__ pmeGrid, real* __restrict__ phid, extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restrict__ pmeGrid, real* __restrict__ phid,
real* __restrict__ phip, real* __restrict__ phidp, const int4* __restrict__ igrid, const real4* __restrict__ theta1, real* __restrict__ phip, real* __restrict__ phidp, const real4* __restrict__ posq,
const real4* __restrict__ theta2, const real4* __restrict__ theta3, real4 invPeriodicBoxSize) { real4 periodicBoxSize, real4 invPeriodicBoxSize, int2* __restrict__ pmeAtomGridIndex) {
// extract the induced dipole field at each site real array[PME_ORDER*PME_ORDER];
real4 theta1[PME_ORDER];
real4 theta2[PME_ORDER];
real4 theta3[PME_ORDER];
// Process the atoms in spatially sorted order. This improves cache performance when loading
// the grid values.
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
int m = pmeAtomGridIndex[i].x;
real4 pos = posq[m];
pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
// Since we need the full set of thetas, it's faster to compute them here than load them
// from global memory.
real w = pos.x*invPeriodicBoxSize.x;
real fr = GRID_SIZE_X*(w-(int)(w+0.5f)+0.5f);
int ifr = (int) fr;
w = fr - ifr;
int igrid1 = ifr-PME_ORDER+1;
computeBSplinePoint(theta1, w, array);
w = pos.y*invPeriodicBoxSize.y;
fr = GRID_SIZE_Y*(w-(int)(w+0.5f)+0.5f);
ifr = (int) fr;
w = fr - ifr;
int igrid2 = ifr-PME_ORDER+1;
computeBSplinePoint(theta2, w, array);
w = pos.z*invPeriodicBoxSize.z;
fr = GRID_SIZE_Z*(w-(int)(w+0.5f)+0.5f);
ifr = (int) fr;
w = fr - ifr;
int igrid3 = ifr-PME_ORDER+1;
computeBSplinePoint(theta3, w, array);
igrid1 += (igrid1 < 0 ? GRID_SIZE_X : 0);
igrid2 += (igrid2 < 0 ? GRID_SIZE_Y : 0);
igrid3 += (igrid3 < 0 ? GRID_SIZE_Z : 0);
// Compute the potential from this grid point.
for (int m = blockIdx.x*blockDim.x+threadIdx.x; m < NUM_ATOMS; m += blockDim.x*gridDim.x) {
int4 gridPoint = igrid[m];
real tuv100_1 = 0; real tuv100_1 = 0;
real tuv010_1 = 0; real tuv010_1 = 0;
real tuv001_1 = 0; real tuv001_1 = 0;
...@@ -536,8 +577,8 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri ...@@ -536,8 +577,8 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
real tuv012 = 0; real tuv012 = 0;
real tuv111 = 0; real tuv111 = 0;
for (int iz = 0; iz < PME_ORDER; iz++) { for (int iz = 0; iz < PME_ORDER; iz++) {
int k = gridPoint.z+iz-(gridPoint.z+iz >= GRID_SIZE_Z ? GRID_SIZE_Z : 0); int k = igrid3+iz-(igrid3+iz >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
real4 v = theta3[m*PME_ORDER+iz]; real4 v = theta3[iz];
real tu00_1 = 0; real tu00_1 = 0;
real tu01_1 = 0; real tu01_1 = 0;
real tu10_1 = 0; real tu10_1 = 0;
...@@ -561,8 +602,8 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri ...@@ -561,8 +602,8 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
real tu12 = 0; real tu12 = 0;
real tu03 = 0; real tu03 = 0;
for (int iy = 0; iy < PME_ORDER; iy++) { for (int iy = 0; iy < PME_ORDER; iy++) {
int j = gridPoint.y+iy-(gridPoint.y+iy >= GRID_SIZE_Y ? GRID_SIZE_Y : 0); int j = igrid2+iy-(igrid2+iy >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
real4 u = theta2[m*PME_ORDER+iy]; real4 u = theta2[iy];
real t0_1 = 0; real t0_1 = 0;
real t1_1 = 0; real t1_1 = 0;
real t2_1 = 0; real t2_1 = 0;
...@@ -571,10 +612,10 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri ...@@ -571,10 +612,10 @@ extern "C" __global__ void computeInducedPotentialFromGrid(const real2* __restri
real t2_2 = 0; real t2_2 = 0;
real t3 = 0; real t3 = 0;
for (int ix = 0; ix < PME_ORDER; ix++) { for (int ix = 0; ix < PME_ORDER; ix++) {
int i = gridPoint.x+ix-(gridPoint.x+ix >= GRID_SIZE_X ? GRID_SIZE_X : 0); int i = igrid1+ix-(igrid1+ix >= GRID_SIZE_X ? GRID_SIZE_X : 0);
int gridIndex = i*GRID_SIZE_Y*GRID_SIZE_Z + j*GRID_SIZE_Z + k; int gridIndex = i*GRID_SIZE_Y*GRID_SIZE_Z + j*GRID_SIZE_Z + k;
real2 tq = pmeGrid[gridIndex]; real2 tq = pmeGrid[gridIndex];
real4 tadd = theta1[m*PME_ORDER+ix]; real4 tadd = theta1[ix];
t0_1 += tq.x*tadd.x; t0_1 += tq.x*tadd.x;
t1_1 += tq.x*tadd.y; t1_1 += tq.x*tadd.y;
t2_1 += tq.x*tadd.z; t2_1 += tq.x*tadd.z;
......
#define TILE_SIZE 32
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE) #define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
typedef struct { typedef struct {
...@@ -182,253 +181,223 @@ __device__ void computeSelfEnergyAndTorque(AtomData& atom1, real& energy) { ...@@ -182,253 +181,223 @@ __device__ void computeSelfEnergyAndTorque(AtomData& atom1, real& energy) {
*/ */
extern "C" __global__ void computeElectrostatics( extern "C" __global__ void computeElectrostatics(
unsigned long long* __restrict__ forceBuffers, unsigned long long* __restrict__ torqueBuffers, real* __restrict__ energyBuffer, unsigned long long* __restrict__ forceBuffers, unsigned long long* __restrict__ torqueBuffers, real* __restrict__ energyBuffer,
const real4* __restrict__ posq, const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices, const real4* __restrict__ posq, const uint2* __restrict__ covalentFlags, const unsigned int* __restrict__ polarizationGroupFlags,
const uint2* __restrict__ covalentFlags, const unsigned int* __restrict__ polarizationGroupFlags, unsigned int startTileIndex, unsigned int numTileIndices, const ushort2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags, const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const real4* __restrict__ blockCenter, const unsigned int* __restrict__ interactingAtoms,
#endif #endif
const real* __restrict__ labFrameDipole, const real* __restrict__ labFrameQuadrupole, const real* __restrict__ inducedDipole, const real* __restrict__ labFrameDipole, const real* __restrict__ labFrameQuadrupole, const real* __restrict__ inducedDipole,
const real* __restrict__ inducedDipolePolar, const float2* __restrict__ dampingAndThole) { const real* __restrict__ inducedDipolePolar, const float2* __restrict__ dampingAndThole) {
unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE; const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx;
real energy = 0;
__shared__ AtomData localData[THREAD_BLOCK_SIZE];
// First loop: process tiles that contain exclusions.
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
AtomData data;
unsigned int atom1 = x*TILE_SIZE + tgx;
loadAtomData(data, atom1, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
data.force = make_real3(0);
data.torque = make_real3(0);
uint2 covalent = covalentFlags[pos*TILE_SIZE+tgx];
unsigned int polarizationGroup = polarizationGroupFlags[pos*TILE_SIZE+tgx];
if (x == y) {
// This tile is on the diagonal.
localData[threadIdx.x].pos = data.pos;
localData[threadIdx.x].q = data.q;
localData[threadIdx.x].dipole = data.dipole;
localData[threadIdx.x].quadrupoleXX = data.quadrupoleXX;
localData[threadIdx.x].quadrupoleXY = data.quadrupoleXY;
localData[threadIdx.x].quadrupoleXZ = data.quadrupoleXZ;
localData[threadIdx.x].quadrupoleYY = data.quadrupoleYY;
localData[threadIdx.x].quadrupoleYZ = data.quadrupoleYZ;
localData[threadIdx.x].inducedDipole = data.inducedDipole;
localData[threadIdx.x].inducedDipolePolar = data.inducedDipolePolar;
localData[threadIdx.x].thole = data.thole;
localData[threadIdx.x].damp = data.damp;
// Compute forces.
for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = y*TILE_SIZE+j;
if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
float d = computeDScaleFactor(polarizationGroup, j);
float p = computePScaleFactor(covalent, polarizationGroup, j);
float m = computeMScaleFactor(covalent, j);
computeOneInteraction(data, localData[tbx+j], true, d, p, m, 0.5f, energy, periodicBoxSize, invPeriodicBoxSize);
}
}
if (atom1 < NUM_ATOMS)
computeSelfEnergyAndTorque(data, energy);
data.force *= -ENERGY_SCALE_FACTOR;
data.torque *= ENERGY_SCALE_FACTOR;
atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
atomicAdd(&torqueBuffers[atom1], static_cast<unsigned long long>((long long) (data.torque.x*0x100000000)));
atomicAdd(&torqueBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.y*0x100000000)));
atomicAdd(&torqueBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.z*0x100000000)));
}
else {
// This is an off-diagonal tile.
unsigned int j = y*TILE_SIZE + tgx;
loadAtomData(localData[threadIdx.x], j, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
localData[threadIdx.x].force = make_real3(0);
localData[threadIdx.x].torque = make_real3(0);
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = y*TILE_SIZE+tj;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
float d = computeDScaleFactor(polarizationGroup, tj);
float p = computePScaleFactor(covalent, polarizationGroup, tj);
float m = computeMScaleFactor(covalent, tj);
computeOneInteraction(data, localData[tbx+tj], true, d, p, m, 1, energy, periodicBoxSize, invPeriodicBoxSize);
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
data.force *= -ENERGY_SCALE_FACTOR;
data.torque *= ENERGY_SCALE_FACTOR;
localData[threadIdx.x].force *= -ENERGY_SCALE_FACTOR;
localData[threadIdx.x].torque *= ENERGY_SCALE_FACTOR;
unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (data.torque.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.y*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.z*0x100000000)));
offset = y*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.y*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.z*0x100000000)));
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
const unsigned int numTiles = interactionCount[0]; const unsigned int numTiles = interactionCount[0];
unsigned int pos = (numTiles > maxTiles ? startTileIndex+warp*numTileIndices/totalWarps : warp*numTiles/totalWarps); int pos = (numTiles > maxTiles ? startTileIndex+warp*numTileIndices/totalWarps : warp*numTiles/totalWarps);
unsigned int end = (numTiles > maxTiles ? startTileIndex+(warp+1)*numTileIndices/totalWarps : (warp+1)*numTiles/totalWarps); int end = (numTiles > maxTiles ? startTileIndex+(warp+1)*numTileIndices/totalWarps : (warp+1)*numTiles/totalWarps);
#else #else
const unsigned int numTiles = numTileIndices; const unsigned int numTiles = numTileIndices;
unsigned int pos = startTileIndex+warp*numTiles/totalWarps; int pos = startTileIndex+warp*numTiles/totalWarps;
unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps; int end = startTileIndex+(warp+1)*numTiles/totalWarps;
#endif
real energy = 0;
__shared__ AtomData localData[THREAD_BLOCK_SIZE];
__shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
__shared__ int exclusionIndex[WARPS_PER_GROUP];
#ifndef ENABLE_SHUFFLE
__shared__ real tempBuffer[3*THREAD_BLOCK_SIZE];
#endif #endif
int skipBase = 0;
int currentSkipIndex = tbx;
__shared__ int atomIndices[THREAD_BLOCK_SIZE];
__shared__ int skipTiles[THREAD_BLOCK_SIZE];
skipTiles[threadIdx.x] = -1;
do { while (pos < end) {
// Extract the coordinates of this tile bool includeTile = true;
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx; // Extract the coordinates of this tile.
const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
unsigned int x, y; unsigned int x, y;
AtomData data;
if (pos < end) {
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (numTiles <= maxTiles) { if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos]; ushort2 tileIndices = tiles[pos];
x = tileIndices.x; x = tileIndices.x;
y = tileIndices.y; }
} else
else
#endif #endif
{ {
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos)); y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2); x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error. }
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2); // Skip over tiles that have exclusions, since they were already processed.
while (skipTiles[tbx+TILE_SIZE-1] < pos) {
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx];
skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
} }
else
skipTiles[threadIdx.x] = end;
skipBase += TILE_SIZE;
currentSkipIndex = tbx;
} }
while (skipTiles[currentSkipIndex] < pos)
currentSkipIndex++;
includeTile = (skipTiles[currentSkipIndex] != pos);
}
if (includeTile) {
unsigned int atom1 = x*TILE_SIZE + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
// Load atom data for this tile.
AtomData data;
loadAtomData(data, atom1, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole); loadAtomData(data, atom1, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
data.force = make_real3(0); data.force = make_real3(0);
data.torque = make_real3(0); data.torque = make_real3(0);
// Locate the exclusion data for this tile.
if (tgx < 2)
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
if (pos >= end)
; // This warp is done.
else if (x == y) {
// This tile is on the diagonal.
localData[threadIdx.x].pos = data.pos;
localData[threadIdx.x].q = data.q;
localData[threadIdx.x].dipole = data.dipole;
localData[threadIdx.x].quadrupoleXX = data.quadrupoleXX;
localData[threadIdx.x].quadrupoleXY = data.quadrupoleXY;
localData[threadIdx.x].quadrupoleXZ = data.quadrupoleXZ;
localData[threadIdx.x].quadrupoleYY = data.quadrupoleYY;
localData[threadIdx.x].quadrupoleYZ = data.quadrupoleYZ;
localData[threadIdx.x].inducedDipole = data.inducedDipole;
localData[threadIdx.x].inducedDipolePolar = data.inducedDipolePolar;
localData[threadIdx.x].thole = data.thole;
localData[threadIdx.x].damp = data.damp;
uint2 covalent = covalentFlags[exclusionIndex[localGroupIndex]+tgx];
unsigned int polarizationGroup = polarizationGroupFlags[exclusionIndex[localGroupIndex]+tgx];
// Compute forces.
for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = y*TILE_SIZE+j;
if (atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
float d = computeDScaleFactor(polarizationGroup, j);
float p = computePScaleFactor(covalent, polarizationGroup, j);
float m = computeMScaleFactor(covalent, j);
computeOneInteraction(data, localData[tbx+j], hasExclusions, d, p, m, 0.5f, energy, periodicBoxSize, invPeriodicBoxSize);
}
}
if (atom1 < NUM_ATOMS)
computeSelfEnergyAndTorque(data, energy);
data.force *= -ENERGY_SCALE_FACTOR;
data.torque *= ENERGY_SCALE_FACTOR;
atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
atomicAdd(&torqueBuffers[atom1], static_cast<unsigned long long>((long long) (data.torque.x*0x100000000)));
atomicAdd(&torqueBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.y*0x100000000)));
atomicAdd(&torqueBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.z*0x100000000)));
}
else {
// This is an off-diagonal tile.
unsigned int j = y*TILE_SIZE + tgx;
loadAtomData(localData[threadIdx.x], j, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
localData[threadIdx.x].force = make_real3(0);
localData[threadIdx.x].torque = make_real3(0);
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF); unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
if (!hasExclusions && flags == 0) { // TODO: Why doesn't the flags != 0 block work?
// if (!hasExclusions && flags != 0xFFFFFFFF) {
if (flags == 0) {
// No interactions in this tile.
}
else {
// Compute only a subset of the interactions in this tile.
for (j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
int atom2 = tbx+j;
real3 oldForce = localData[atom2].force;
real3 oldTorque = localData[atom2].torque;
localData[atom2].force = make_real3(0);
localData[atom2].torque = make_real3(0);
computeOneInteraction(data, localData[tbx+j], false, 1, 1, 1, 1, energy, periodicBoxSize, invPeriodicBoxSize);
real3 newForce = localData[atom2].force;
real3 newTorque = localData[atom2].torque;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#ifdef ENABLE_SHUFFLE
for (int i = 16; i >= 1; i /= 2) {
newForce.x += __shfl_xor(newForce.x, i, 32);
newForce.y += __shfl_xor(newForce.y, i, 32);
newForce.z += __shfl_xor(newForce.z, i, 32);
newTorque.x += __shfl_xor(newTorque.x, i, 32);
newTorque.y += __shfl_xor(newTorque.y, i, 32);
newTorque.z += __shfl_xor(newTorque.z, i, 32);
}
if (tgx == 0) {
localData[atom2].force -= newForce;
localData[atom2].torque -= newTorque;
}
#else #else
int bufferIndex = 3*threadIdx.x; unsigned int j = y*TILE_SIZE + tgx;
tempBuffer[bufferIndex] = newForce.x;
tempBuffer[bufferIndex+1] = newForce.y;
tempBuffer[bufferIndex+2] = newForce.z;
if (tgx % 4 == 0) {
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11];
}
if (tgx == 0) {
localData[atom2].force.x -= tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
localData[atom2].force.y -= tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
localData[atom2].force.z -= tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
}
tempBuffer[bufferIndex] = newTorque.x;
tempBuffer[bufferIndex+1] = newTorque.y;
tempBuffer[bufferIndex+2] = newTorque.z;
if (tgx % 4 == 0) {
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11];
}
if (tgx == 0) {
localData[atom2].torque.x -= tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
localData[atom2].torque.y -= tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
localData[atom2].torque.z -= tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
}
#endif
}
}
}
data.force *= -ENERGY_SCALE_FACTOR;
data.torque *= -ENERGY_SCALE_FACTOR;
localData[threadIdx.x].force *= -ENERGY_SCALE_FACTOR;
localData[threadIdx.x].torque *= -ENERGY_SCALE_FACTOR;
if (pos < end) {
unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (data.torque.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.y*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.z*0x100000000)));
offset = y*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.y*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.z*0x100000000)));
}
}
}
else
#endif #endif
{ atomIndices[threadIdx.x] = j;
// Compute the full set of interactions in this tile. loadAtomData(localData[threadIdx.x], j, posq, labFrameDipole, labFrameQuadrupole, inducedDipole, inducedDipolePolar, dampingAndThole);
localData[threadIdx.x].force = make_real3(0);
uint2 covalent = (hasExclusions ? covalentFlags[exclusionIndex[localGroupIndex]+tgx] : make_uint2(0, 0)); localData[threadIdx.x].torque = make_real3(0);
unsigned int polarizationGroup = (hasExclusions ? polarizationGroupFlags[exclusionIndex[localGroupIndex]+tgx] : 0);
// Compute forces.
// Compute forces.
unsigned int tj = tgx;
unsigned int tj = tgx; for (j = 0; j < TILE_SIZE; j++) {
for (j = 0; j < TILE_SIZE; j++) { int atom2 = atomIndices[tbx+tj];
int atom2 = y*TILE_SIZE+tj; if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { computeOneInteraction(data, localData[tbx+tj], false, 1, 1, 1, 1, energy, periodicBoxSize, invPeriodicBoxSize);
float d = computeDScaleFactor(polarizationGroup, tj);
float p = computePScaleFactor(covalent, polarizationGroup, tj);
float m = computeMScaleFactor(covalent, tj);
computeOneInteraction(data, localData[tbx+tj], hasExclusions, d, p, m, 1, energy, periodicBoxSize, invPeriodicBoxSize);
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
data.force *= -ENERGY_SCALE_FACTOR;
data.torque *= ENERGY_SCALE_FACTOR;
localData[threadIdx.x].force *= -ENERGY_SCALE_FACTOR;
localData[threadIdx.x].torque *= ENERGY_SCALE_FACTOR;
if (pos < end) {
unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (data.torque.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.y*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.z*0x100000000)));
offset = y*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.y*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.z*0x100000000)));
}
} }
tj = (tj + 1) & (TILE_SIZE - 1);
} }
data.force *= -ENERGY_SCALE_FACTOR;
data.torque *= ENERGY_SCALE_FACTOR;
localData[threadIdx.x].force *= -ENERGY_SCALE_FACTOR;
localData[threadIdx.x].torque *= ENERGY_SCALE_FACTOR;
// Write results.
unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (data.force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.force.z*0x100000000)));
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (data.torque.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.y*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (data.torque.z*0x100000000)));
#ifdef USE_CUTOFF
offset = atomIndices[threadIdx.x];
#else
offset = y*TILE_SIZE + tgx;
#endif
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
atomicAdd(&torqueBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.x*0x100000000)));
atomicAdd(&torqueBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.y*0x100000000)));
atomicAdd(&torqueBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].torque.z*0x100000000)));
} }
pos++; pos++;
} while (pos < end); }
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy*ENERGY_SCALE_FACTOR; energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy*ENERGY_SCALE_FACTOR;
} }
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment