// Commit 5a06df78, authored by tic20
// Parents: 8dd60914 a9223eea
/**
 * Compute the center of each group.
 *
 * Each thread block processes one group at a time, striding over the groups by gridDim.x.
 * The threads of the block accumulate weight*position over the group's particles and then
 * combine their partial sums through a 64-entry shared buffer.  The reduction starts at an
 * offset of 32 into that buffer, so the kernel expects to be launched with 64 threads per block.
 *
 * @param posq             particle positions (only x, y and z are read)
 * @param groupParticles   particle indices of all groups, stored group after group
 * @param groupWeights     weight for each entry of groupParticles
 * @param groupOffsets     entry g is the first index of group g in groupParticles/groupWeights,
 *                         entry g+1 is one past its last index
 * @param centerPositions  output: the weighted sum for each group in x, y, z (w is set to 0)
 */
extern "C" __global__ void computeGroupCenters(const real4* __restrict__ posq, const int* __restrict__ groupParticles,
        const real* __restrict__ groupWeights, const int* __restrict__ groupOffsets, real4* __restrict__ centerPositions) {
    __shared__ real3 temp[64];
    for (int group = blockIdx.x; group < NUM_GROUPS; group += gridDim.x) {
        // The threads in this block work together to compute the center of one group.

        int firstIndex = groupOffsets[group];
        int lastIndex = groupOffsets[group+1];
        real3 center = make_real3(0, 0, 0);
        for (int index = threadIdx.x; index < lastIndex-firstIndex; index += blockDim.x) {
            int atom = groupParticles[firstIndex+index];
            real weight = groupWeights[firstIndex+index];
            real4 pos = posq[atom];
            center.x += weight*pos.x;
            center.y += weight*pos.y;
            center.z += weight*pos.z;
        }

        // Sum the values with a tree reduction.  Every step is separated by a block-wide barrier:
        // this removes any reliance on the threads of a warp running in lockstep, and the barrier
        // that ends the last step also guarantees that nobody is still reading another thread's
        // slot when the next group iteration overwrites the buffer.  The loop bounds depend only
        // on blockIdx/gridDim, so every thread of the block reaches every barrier.

        int thread = threadIdx.x;
        temp[thread].x = center.x;
        temp[thread].y = center.y;
        temp[thread].z = center.z;
        __syncthreads();
        for (int step = 32; step > 0; step >>= 1) {
            if (thread < step) {
                temp[thread].x += temp[thread+step].x;
                temp[thread].y += temp[thread+step].y;
                temp[thread].z += temp[thread+step].z;
            }
            __syncthreads();
        }

        // Only thread 0 writes temp[0], so it can read it back without a further barrier.

        if (thread == 0)
            centerPositions[group] = make_real4(temp[0].x, temp[0].y, temp[0].z, 0);
    }
}
/**
 * Drop the w component of a real4 and return its x, y and z as a real3.
 */
inline __device__ real3 trim(real4 v) {
    const real3 xyz = make_real3(v.x, v.y, v.z);
    return xyz;
}
/**
 * Return vec1-vec2 in x, y, z with the squared length of that difference in w.
 * If periodic is true, APPLY_PERIODIC_TO_DELTA is applied to the difference before its
 * squared length is evaluated.
 */
inline __device__ real4 delta(real4 vec1, real4 vec2, bool periodic, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
    real4 diff;
    diff.x = vec1.x-vec2.x;
    diff.y = vec1.y-vec2.y;
    diff.z = vec1.z-vec2.z;
    diff.w = 0;
    if (periodic)
        APPLY_PERIODIC_TO_DELTA(diff);
    diff.w = diff.x*diff.x + diff.y*diff.y + diff.z*diff.z;
    return diff;
}
/**
 * Return the angle between vec1 and vec2.  Both vectors must carry their squared length in w.
 */
__device__ real computeAngle(real4 vec1, real4 vec2) {
    const real dotProduct = vec1.x*vec2.x + vec1.y*vec2.y + vec1.z*vec2.z;
    const real cosine = dotProduct*RSQRT(vec1.w*vec2.w);
    const bool nearSingularity = (cosine > 0.99f || cosine < -0.99f);
    if (!nearSingularity)
        return ACOS(cosine);

    // Close to the singularity of acos(): derive the angle from the cross product with asin() instead.

    real3 crossProduct = cross(vec1, vec2);
    const real scale = vec1.w*vec2.w;
    real angle = ASIN(SQRT(dot(crossProduct, crossProduct)/scale));
    if (cosine < 0.0f)
        angle = M_PI-angle;
    return angle;
}
/**
 * Return the cross product of vec1 and vec2 in x, y, z, with its squared length in w.
 */
inline __device__ real4 computeCross(real4 vec1, real4 vec2) {
    const real3 c = cross(vec1, vec2);
    const real lengthSquared = c.x*c.x+c.y*c.y+c.z*c.z;
    return make_real4(c.x, c.y, c.z, lengthSquared);
}
/**
 * Compute the forces on groups based on the bonds.
 *
 * Bonds are distributed over all threads of the grid with a grid-stride loop.  The per-bond work
 * is supplied entirely by the COMPUTE_FORCE macro; EXTRA_ARGS, INIT_PARAM_DERIVS and
 * SAVE_PARAM_DERIVS are likewise defined outside this block, so their expansions are not visible here.
 * The kernel arguments that are not referenced below (groupForce, centerPositions, bondGroups and the
 * periodic box vectors) are presumably consumed by those macros - confirm against the code generator.
 *
 * Each thread adds its accumulated energy to its own slot of energyBuffer, so energyBuffer needs
 * at least blockDim.x*gridDim.x entries.
 */
extern "C" __global__ void computeGroupForces(unsigned long long* __restrict__ groupForce, mixed* __restrict__ energyBuffer, const real4* __restrict__ centerPositions,
const int* __restrict__ bondGroups, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
EXTRA_ARGS) {
// Per-thread energy accumulator; written to energyBuffer once at the end.
mixed energy = 0;
INIT_PARAM_DERIVS
// index identifies the bond handled in this iteration.
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_BONDS; index += blockDim.x*gridDim.x) {
COMPUTE_FORCE
}
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
SAVE_PARAM_DERIVS
}
/**
 * Distribute the force acting on each group center to the atoms that make up the group.
 *
 * Blocks stride over the groups; within a block the threads stride over the group's members.
 * groupForce stores x, y and z in three consecutive sections of NUM_GROUPS entries, atomForce in
 * three sections of PADDED_NUM_ATOMS entries.  Every member receives the group force multiplied
 * by its weight, added atomically as a 64-bit integer.
 */
extern "C" __global__ void applyForcesToAtoms(const int* __restrict__ groupParticles, const real* __restrict__ groupWeights, const int* __restrict__ groupOffsets,
        const long long* __restrict__ groupForce, unsigned long long* __restrict__ atomForce) {
    int group = blockIdx.x;
    while (group < NUM_GROUPS) {
        const long long forceX = groupForce[group];
        const long long forceY = groupForce[group+NUM_GROUPS];
        const long long forceZ = groupForce[group+NUM_GROUPS*2];
        const int begin = groupOffsets[group];
        const int end = groupOffsets[group+1];
        for (int entry = begin+(int) threadIdx.x; entry < end; entry += blockDim.x) {
            const int atom = groupParticles[entry];
            const real weight = groupWeights[entry];
            const long long shareX = (long long) (forceX*weight);
            const long long shareY = (long long) (forceY*weight);
            const long long shareZ = (long long) (forceZ*weight);
            atomicAdd(&atomForce[atom], static_cast<unsigned long long>(shareX));
            atomicAdd(&atomForce[atom+PADDED_NUM_ATOMS], static_cast<unsigned long long>(shareY));
            atomicAdd(&atomForce[atom+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>(shareZ));
        }
        group += gridDim.x;
    }
}
// Helpers that flush one accumulated derivative into derivBuffers.  Both expect `derivBuffers` and
// `offset` to be in scope at the point of use.  The value is multiplied by 0x100000000 and cast to a
// 64-bit integer before being added atomically to slot offset+(INDEX-1)*PADDED_NUM_ATOMS.
// STORE_DERIVATIVE_1 reads the local variable deriv<INDEX>_1; STORE_DERIVATIVE_2 reads the field
// deriv<INDEX> of this thread's entry in localData.
#define STORE_DERIVATIVE_1(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (deriv##INDEX##_1*0x100000000)));
#define STORE_DERIVATIVE_2(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].deriv##INDEX*0x100000000)));
// Per-thread record kept in shared memory by computeN2Energy (one entry per thread of the block).
typedef struct {
// Position of the atom held in this slot.
real3 pos;
// Force accumulated on the atom held in this slot while a tile is processed.
real3 force;
// Additional fields are injected by this macro, which is defined outside this block.
ATOM_PARAMETER_DATA
#ifdef NEED_PADDING
// Extra float appended only when NEED_PADDING is defined.
float padding;
#endif
} AtomData;
/**
 * Compute a force based on pair interactions.
 *
 * Work is divided between groups of TILE_SIZE consecutive threads; each group processes one tile
 * (a TILE_SIZE x TILE_SIZE set of atom pairs) at a time.  A first loop handles the tiles listed in
 * exclusionTiles, a second loop handles the remaining tiles, which come from the neighbor list when
 * USE_CUTOFF is defined and are enumerated directly otherwise.
 *
 * The pair interaction itself is provided by the COMPUTE_INTERACTION macro; it and the other
 * upper-case macros used here (LOAD_*, RECORD_*, STORE_*, DECLARE_*, CLEAR_*, INIT_/SAVE_PARAM_DERIVS,
 * PARAMETER_ARGUMENTS) are defined outside this block.
 *
 * Forces are multiplied by 0x100000000, cast to 64-bit integers and added atomically to forceBuffers,
 * which stores x, y and z in three sections of PADDED_NUM_ATOMS entries.  When needEnergy is true the
 * pair energies are summed per thread and added to that thread's slot of energyBuffer.
 *
 * With USE_CUTOFF, the kernel returns before the second loop - and before energyBuffer is updated -
 * if interactionCount[0] exceeds maxTiles.
 *
 * NOTE(review): entries of localData written by one thread are read (and, for the force field,
 * updated) by the other threads of the same tile with no barrier in between.  This looks like it
 * relies on the threads of a tile executing in lockstep - confirm for the target architecture.
 */
extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer,
const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions, const ushort2* __restrict__ exclusionTiles, bool needEnergy,
#ifdef USE_CUTOFF
const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
const real4* __restrict__ blockSize, const unsigned int* __restrict__ interactingAtoms
#else
unsigned int numTiles
#endif
PARAMETER_ARGUMENTS) {
// totalWarps: number of TILE_SIZE-wide thread groups in the grid; warp: index of this thread's group.
// tgx: position of this thread inside its group; tbx: index within the block of the group's first thread.
const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx;
mixed energy = 0;
INIT_PARAM_DERIVS
__shared__ AtomData localData[THREAD_BLOCK_SIZE];
// First loop: process tiles that contain exclusions.
// The range [FIRST_EXCLUSION_TILE, LAST_EXCLUSION_TILE) is split evenly between the thread groups.
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
real3 force = make_real3(0);
DECLARE_ATOM1_DERIVATIVES
unsigned int atom1 = x*TILE_SIZE + tgx;
real4 pos1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS
// One bit per atom of the other block; a cleared bit marks the pair as excluded (see isExcluded below).
unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
#endif
if (x == y) {
// This tile is on the diagonal.
// Every thread visits all TILE_SIZE atoms of the same block, so each pair is seen from both
// sides; that is why interactionScale and the energy contribution carry a factor of 0.5.
const unsigned int localAtomIndex = threadIdx.x;
localData[localAtomIndex].pos = make_real3(pos1.x, pos1.y, pos1.z);
LOAD_LOCAL_PARAMETERS_FROM_1
for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+j;
real3 pos2 = localData[atom2].pos;
real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = r2*invR;
LOAD_ATOM2_PARAMETERS
// From here on atom2 is the global atom index rather than the index into localData.
atom2 = y*TILE_SIZE+j;
real dEdR = 0;
real tempEnergy = 0;
const real interactionScale = 0.5f;
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
COMPUTE_INTERACTION
dEdR /= -r;
}
if (needEnergy)
energy += 0.5f*tempEnergy;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
}
}
else {
// This is an off-diagonal tile.
const unsigned int localAtomIndex = threadIdx.x;
unsigned int j = y*TILE_SIZE + tgx;
real4 tempPosq = posq[j];
localData[localAtomIndex].pos = make_real3(tempPosq.x, tempPosq.y, tempPosq.z);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData[localAtomIndex].force = make_real3(0);
CLEAR_LOCAL_DERIVATIVES
#ifdef USE_EXCLUSIONS
// Rotate the exclusion bits by tgx so that bit 0 belongs to the first atom this thread visits
// (tj starts at tgx below).
excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
#endif
// Each thread starts at a different atom of the other block (tj = tgx) and wraps around, so at any
// step the threads of the group address distinct localData slots.
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj;
real3 pos2 = localData[atom2].pos;
real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = r2*invR;
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+tj;
real dEdR = 0;
real tempEnergy = 0;
const real interactionScale = 1;
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_INTERACTION
dEdR /= -r;
}
if (needEnergy)
energy += tempEnergy;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
// Switch atom2 back to the localData index to accumulate the opposite force on the second atom.
atom2 = tbx+tj;
localData[atom2].force.x += delta.x;
localData[atom2].force.y += delta.y;
localData[atom2].force.z += delta.z;
RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
// Write results.
// Forces are converted to 64-bit fixed point (scaled by 0x100000000) before the atomic add.
unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
STORE_DERIVATIVES_1
if (x != y) {
offset = y*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
STORE_DERIVATIVES_2
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
if (numTiles > maxTiles)
return; // There wasn't enough memory for the neighbor list.
// NOTE(review): the case numTiles > maxTiles has already returned above, so the first arm of the
// conditionals on the next two lines cannot be selected here.
int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
#else
int pos = (int) (warp*(long long)numTiles/totalWarps);
int end = (int) ((warp+1)*(long long)numTiles/totalWarps);
#endif
// skipTiles caches a window of TILE_SIZE linearized indices of tiles with exclusions per thread group;
// skipBase is the start of the next window and currentSkipIndex the cursor inside the current one.
int skipBase = 0;
int currentSkipIndex = tbx;
__shared__ int atomIndices[THREAD_BLOCK_SIZE];
__shared__ volatile int skipTiles[THREAD_BLOCK_SIZE];
skipTiles[threadIdx.x] = -1;
while (pos < end) {
const bool isExcluded = false;
real3 force = make_real3(0);
DECLARE_ATOM1_DERIVATIVES
bool includeTile = true;
// Extract the coordinates of this tile.
int x, y;
bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
x = tiles[pos];
real4 blockSizeX = blockSize[x];
// True when, in every dimension, half the box length minus the block's extent is at least CUTOFF.
singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
#else
// Recover the (x, y) block pair from the linear tile index pos.
y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
// Skip over tiles that have exclusions, since they were already processed.
// Refill the window until its last entry reaches pos; slots past the end of the exclusion tile
// list are filled with `end` so the scan below always terminates.
while (skipTiles[tbx+TILE_SIZE-1] < pos) {
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx];
skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
skipTiles[threadIdx.x] = end;
skipBase += TILE_SIZE;
currentSkipIndex = tbx;
}
while (skipTiles[currentSkipIndex] < pos)
currentSkipIndex++;
includeTile = (skipTiles[currentSkipIndex] != pos);
#endif
if (includeTile) {
unsigned int atom1 = x*TILE_SIZE + tgx;
// Load atom data for this tile.
real4 pos1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
const unsigned int localAtomIndex = threadIdx.x;
#ifdef USE_CUTOFF
unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
#else
unsigned int j = y*TILE_SIZE + tgx;
#endif
// Remember which global atom occupies each localData slot; entries >= PADDED_NUM_ATOMS are not loaded.
atomIndices[threadIdx.x] = j;
if (j < PADDED_NUM_ATOMS) {
real4 tempPosq = posq[j];
localData[localAtomIndex].pos = make_real3(tempPosq.x, tempPosq.y, tempPosq.z);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData[localAtomIndex].force = make_real3(0);
CLEAR_LOCAL_DERIVATIVES
}
#ifdef USE_PERIODIC
if (singlePeriodicCopy) {
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
real4 blockCenterX = blockCenter[x];
APPLY_PERIODIC_TO_POS_WITH_CENTER(pos1, blockCenterX)
APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[threadIdx.x].pos, blockCenterX)
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj;
real3 pos2 = localData[atom2].pos;
real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = r2*invR;
LOAD_ATOM2_PARAMETERS
atom2 = atomIndices[tbx+tj];
real dEdR = 0;
real tempEnergy = 0;
const real interactionScale = 1;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_INTERACTION
dEdR /= -r;
}
if (needEnergy)
energy += tempEnergy;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
atom2 = tbx+tj;
localData[atom2].force.x += delta.x;
localData[atom2].force.y += delta.y;
localData[atom2].force.z += delta.z;
RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
}
#endif
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
else
#endif
{
// We need to apply periodic boundary conditions separately for each interaction.
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj;
real3 pos2 = localData[atom2].pos;
real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = r2*invR;
LOAD_ATOM2_PARAMETERS
atom2 = atomIndices[tbx+tj];
real dEdR = 0;
real tempEnergy = 0;
const real interactionScale = 1;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_INTERACTION
dEdR /= -r;
}
if (needEnergy)
energy += tempEnergy;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
atom2 = tbx+tj;
localData[atom2].force.x += delta.x;
localData[atom2].force.y += delta.y;
localData[atom2].force.z += delta.z;
RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
}
#endif
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
// Write results.
atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
unsigned int offset = atom1;
STORE_DERIVATIVES_1
#ifdef USE_CUTOFF
unsigned int atom2 = atomIndices[threadIdx.x];
#else
unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
if (atom2 < PADDED_NUM_ATOMS) {
atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
offset = atom2;
STORE_DERIVATIVES_2
}
}
pos++;
}
// Each thread adds its accumulated energy to its own slot of the energy buffer.
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
SAVE_PARAM_DERIVS
}
/**
 * Reduce the derivatives computed in the N^2 energy kernel, and compute all per-particle energy terms.
 *
 * Atoms are distributed over all threads of the grid with a grid-stride loop.  The actual work is
 * injected through the LOAD_DERIVATIVES and COMPUTE_ENERGY macros, which are defined outside this
 * block; forceBuffers is not referenced directly here and is presumably used by those macros - confirm
 * against the code generator.  Each thread adds its accumulated energy to its own slot of energyBuffer.
 */
extern "C" __global__ void computePerParticleEnergy(long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq
PARAMETER_ARGUMENTS) {
// Per-thread energy accumulator; written to energyBuffer once at the end.
mixed energy = 0;
INIT_PARAM_DERIVS
for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
// Load the derivatives
LOAD_DERIVATIVES
// Now calculate the per-particle energy terms.
// pos and force are declared here for use by the generated code in COMPUTE_ENERGY.
real4 pos = posq[index];
real3 force = make_real3(0, 0, 0);
COMPUTE_ENERGY
}
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
SAVE_PARAM_DERIVS
}
// Per-thread record kept in shared memory by computeN2Value (one entry per thread of the block).
typedef struct {
// Position of the atom held in this slot.
real3 pos;
// Value accumulated for the atom held in this slot while a tile is processed.
real value;
// Additional fields are injected by this macro, which is defined outside this block.
ATOM_PARAMETER_DATA
#ifdef NEED_PADDING
// Extra float appended only when NEED_PADDING is defined.
float padding;
#endif
} AtomData;
/**
 * Compute a value based on pair interactions.
 *
 * Work is divided between groups of TILE_SIZE consecutive threads; each group processes one tile
 * (a TILE_SIZE x TILE_SIZE set of atom pairs) at a time.  A first loop handles the tiles listed in
 * exclusionTiles, a second loop handles the remaining tiles, which come from the neighbor list when
 * USE_CUTOFF is defined and are enumerated directly otherwise.
 *
 * The pair term is provided by the COMPUTE_VALUE macro, which fills tempValue1 (added to the value of
 * the first atom) and tempValue2 (added to the value of the second atom).  It and the other upper-case
 * macros used here are defined outside this block.
 *
 * Per-atom sums are multiplied by 0x100000000, cast to 64-bit integers and added atomically to
 * global_value.  With USE_CUTOFF, the kernel returns before the second loop if interactionCount[0]
 * exceeds maxTiles.
 *
 * NOTE(review): entries of localData written by one thread are read (and, for the value field,
 * updated) by the other threads of the same tile with no barrier in between.  This looks like it
 * relies on the threads of a tile executing in lockstep - confirm for the target architecture.
 */
extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions,
const ushort2* __restrict__ exclusionTiles, unsigned long long* __restrict__ global_value,
#ifdef USE_CUTOFF
const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
const real4* __restrict__ blockSize, const unsigned int* __restrict__ interactingAtoms
#else
unsigned int numTiles
#endif
PARAMETER_ARGUMENTS) {
// totalWarps: number of TILE_SIZE-wide thread groups in the grid; warp: index of this thread's group.
// tgx: position of this thread inside its group; tbx: index within the block of the group's first thread.
const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx;
__shared__ AtomData localData[THREAD_BLOCK_SIZE];
// First loop: process tiles that contain exclusions.
// The range [FIRST_EXCLUSION_TILE, LAST_EXCLUSION_TILE) is split evenly between the thread groups.
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
real value = 0;
unsigned int atom1 = x*TILE_SIZE + tgx;
real4 pos1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS
// One bit per atom of the other block; a cleared bit marks the pair as excluded (see isExcluded below).
unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
#endif
if (x == y) {
// This tile is on the diagonal.
// Only the value of atom1 is accumulated here; the pair is visited again from the other side by
// the thread that owns the second atom.
const unsigned int localAtomIndex = threadIdx.x;
localData[localAtomIndex].pos = make_real3(pos1.x, pos1.y, pos1.z);
LOAD_LOCAL_PARAMETERS_FROM_1
for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+j;
real3 pos2 = localData[atom2].pos;
real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = r2*invR;
LOAD_ATOM2_PARAMETERS
// From here on atom2 is the global atom index rather than the index into localData.
atom2 = y*TILE_SIZE+j;
real tempValue1 = 0;
real tempValue2 = 0;
#ifdef USE_EXCLUSIONS
bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
if (!isExcluded && atom1 != atom2) {
#else
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#endif
COMPUTE_VALUE
}
value += tempValue1;
ADD_TEMP_DERIVS1
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
}
}
else {
// This is an off-diagonal tile.
const unsigned int localAtomIndex = threadIdx.x;
unsigned int j = y*TILE_SIZE + tgx;
real4 tempPosq = posq[j];
localData[localAtomIndex].pos = make_real3(tempPosq.x, tempPosq.y, tempPosq.z);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData[localAtomIndex].value = 0;
#ifdef USE_EXCLUSIONS
// Rotate the exclusion bits by tgx so that bit 0 belongs to the first atom this thread visits
// (tj starts at tgx below).
excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
#endif
// Each thread starts at a different atom of the other block (tj = tgx) and wraps around, so at any
// step the threads of the group address distinct localData slots.
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj;
real3 pos2 = localData[atom2].pos;
real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = r2*invR;
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+tj;
real tempValue1 = 0;
real tempValue2 = 0;
#ifdef USE_EXCLUSIONS
bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
if (!isExcluded) {
#else
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#endif
COMPUTE_VALUE
}
value += tempValue1;
localData[tbx+tj].value += tempValue2;
ADD_TEMP_DERIVS1
ADD_TEMP_DERIVS2
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
// Write results.
// Values are converted to 64-bit fixed point (scaled by 0x100000000) before the atomic add.
unsigned int offset1 = x*TILE_SIZE + tgx;
atomicAdd(&global_value[offset1], static_cast<unsigned long long>((long long) (value*0x100000000)));
STORE_PARAM_DERIVS1
if (x != y) {
unsigned int offset2 = y*TILE_SIZE + tgx;
atomicAdd(&global_value[offset2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].value*0x100000000)));
STORE_PARAM_DERIVS2
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
if (numTiles > maxTiles)
return; // There wasn't enough memory for the neighbor list.
// NOTE(review): the case numTiles > maxTiles has already returned above, so the first arm of the
// conditionals on the next two lines cannot be selected here.
int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
#else
int pos = (int) (warp*(long long)numTiles/totalWarps);
int end = (int) ((warp+1)*(long long)numTiles/totalWarps);
#endif
// skipTiles caches a window of TILE_SIZE linearized indices of tiles with exclusions per thread group;
// skipBase is the start of the next window and currentSkipIndex the cursor inside the current one.
int skipBase = 0;
int currentSkipIndex = tbx;
__shared__ int atomIndices[THREAD_BLOCK_SIZE];
__shared__ volatile int skipTiles[THREAD_BLOCK_SIZE];
skipTiles[threadIdx.x] = -1;
while (pos < end) {
real value = 0;
bool includeTile = true;
// Extract the coordinates of this tile.
int x, y;
bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
x = tiles[pos];
real4 blockSizeX = blockSize[x];
// True when, in every dimension, half the box length minus the block's extent is at least CUTOFF.
singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
#else
// Recover the (x, y) block pair from the linear tile index pos.
y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
// Skip over tiles that have exclusions, since they were already processed.
// Refill the window until its last entry reaches pos; slots past the end of the exclusion tile
// list are filled with `end` so the scan below always terminates.
while (skipTiles[tbx+TILE_SIZE-1] < pos) {
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx];
skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
skipTiles[threadIdx.x] = end;
skipBase += TILE_SIZE;
currentSkipIndex = tbx;
}
while (skipTiles[currentSkipIndex] < pos)
currentSkipIndex++;
includeTile = (skipTiles[currentSkipIndex] != pos);
#endif
if (includeTile) {
unsigned int atom1 = x*TILE_SIZE + tgx;
// Load atom data for this tile.
real4 pos1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
const unsigned int localAtomIndex = threadIdx.x;
#ifdef USE_CUTOFF
unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
#else
unsigned int j = y*TILE_SIZE + tgx;
#endif
// Remember which global atom occupies each localData slot; entries >= PADDED_NUM_ATOMS are not loaded.
atomIndices[threadIdx.x] = j;
if (j < PADDED_NUM_ATOMS) {
real4 tempPosq = posq[j];
localData[localAtomIndex].pos = make_real3(tempPosq.x, tempPosq.y, tempPosq.z);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData[localAtomIndex].value = 0;
}
#ifdef USE_PERIODIC
if (singlePeriodicCopy) {
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
real4 blockCenterX = blockCenter[x];
APPLY_PERIODIC_TO_POS_WITH_CENTER(pos1, blockCenterX)
APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[threadIdx.x].pos, blockCenterX)
unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj;
real3 pos2 = localData[atom2].pos;
real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
if (r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2);
real r = r2*invR;
LOAD_ATOM2_PARAMETERS
atom2 = atomIndices[tbx+tj];
real tempValue1 = 0;
real tempValue2 = 0;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_VALUE
}
value += tempValue1;
localData[tbx+tj].value += tempValue2;
ADD_TEMP_DERIVS1
ADD_TEMP_DERIVS2
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
else
#endif
{
// We need to apply periodic boundary conditions separately for each interaction.
unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj;
real3 pos2 = localData[atom2].pos;
real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = r2*invR;
LOAD_ATOM2_PARAMETERS
atom2 = atomIndices[tbx+tj];
real tempValue1 = 0;
real tempValue2 = 0;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_VALUE
}
value += tempValue1;
localData[tbx+tj].value += tempValue2;
ADD_TEMP_DERIVS1
ADD_TEMP_DERIVS2
#ifdef USE_CUTOFF
}
#endif
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
// Write results.
unsigned int offset1 = atom1;
atomicAdd(&global_value[offset1], static_cast<unsigned long long>((long long) (value*0x100000000)));
STORE_PARAM_DERIVS1
#ifdef USE_CUTOFF
unsigned int atom2 = atomIndices[threadIdx.x];
#else
unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
if (atom2 < PADDED_NUM_ATOMS) {
unsigned int offset2 = atom2;
atomicAdd(&global_value[offset2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].value*0x100000000)));
STORE_PARAM_DERIVS2
}
}
pos++;
}
}
/**
 * Reduce a pairwise computed value, and compute per-particle values.
 *
 * Atoms are distributed over all threads of the grid with a grid-stride loop.  For each atom the
 * 64-bit fixed-point entry of valueBuffers is converted back to a real by dividing by 0x100000000;
 * the remaining work is injected through the REDUCE_PARAM0_DERIV and COMPUTE_VALUES macros, which are
 * defined outside this block.
 */
extern "C" __global__ void computePerParticleValues(real4* posq, long long* valueBuffers
PARAMETER_ARGUMENTS) {
for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) {
// Load the pairwise value
real sum = valueBuffers[index]/(real) 0x100000000;
REDUCE_PARAM0_DERIV
// Now calculate other values
// sum and pos are declared here for use by the generated code in COMPUTE_VALUES.
real4 pos = posq[index];
COMPUTE_VALUES
}
}
/**
 * Drop the w component of a real4 and return its x, y and z as a real3.
 */
inline __device__ real3 trim(real4 v) {
    const real3 xyz = make_real3(v.x, v.y, v.z);
    return xyz;
}
/**
 * Identity overload for values that already are a real3.  It exists only so that generated code
 * can call trim() without knowing the width of its argument.
 */
inline __device__ real3 trim(real3 v) {
    return make_real3(v.x, v.y, v.z);
}
/**
 * Return vec1-vec2 in x, y, z with the squared length of that difference in w.
 * When USE_PERIODIC is defined, APPLY_PERIODIC_TO_DELTA is applied to the difference before its
 * squared length is evaluated.
 */
inline __device__ real4 delta(real4 vec1, real4 vec2, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
    real4 diff;
    diff.x = vec1.x-vec2.x;
    diff.y = vec1.y-vec2.y;
    diff.z = vec1.z-vec2.z;
    diff.w = 0.0f;
#ifdef USE_PERIODIC
    APPLY_PERIODIC_TO_DELTA(diff)
#endif
    diff.w = diff.x*diff.x + diff.y*diff.y + diff.z*diff.z;
    return diff;
}
/**
 * Return the angle between vec1 and vec2.  Both vectors must carry their squared length in w.
 */
inline __device__ real computeAngle(real4 vec1, real4 vec2) {
    const real dotProduct = vec1.x*vec2.x + vec1.y*vec2.y + vec1.z*vec2.z;
    const real cosine = dotProduct*RSQRT(vec1.w*vec2.w);
    const bool nearSingularity = (cosine > 0.99f || cosine < -0.99f);
    if (!nearSingularity)
        return ACOS(cosine);

    // Close to the singularity of acos(): derive the angle from the cross product with asin() instead.

    real3 crossProduct = cross(vec1, vec2);
    const real scale = vec1.w*vec2.w;
    real angle = ASIN(SQRT(dot(crossProduct, crossProduct)/scale));
    if (cosine < 0.0f)
        angle = M_PI-angle;
    return angle;
}
/**
 * Return the cross product of vec1 and vec2 in x, y, z, with its squared length in w.
 */
inline __device__ real4 computeCross(real4 vec1, real4 vec2) {
    const real3 c = cross(vec1, vec2);
    const real lengthSquared = c.x*c.x + c.y*c.y + c.z*c.z;
    return make_real4(c.x, c.y, c.z, lengthSquared);
}
/**
 * Compute forces on donors.
 *
 * Donors are distributed over all threads of the grid with a grid-stride loop; every thread handles
 * one donor per iteration.  For each donor, the acceptors are streamed through shared memory in
 * batches of blockDim.x, and the interaction of the donor with every acceptor of the batch is
 * evaluated by the COMPUTE_DONOR_FORCE macro (defined outside this block), which accumulates into
 * f1, f2, f3 and energy.
 *
 * The force accumulators are declared inside the donor loop so that they start from zero for every
 * donor; otherwise a thread that processes several donors would add the forces of its earlier
 * donors to the atoms of the later ones.
 *
 * posBuffer is dynamically sized shared memory and must hold 3*blockDim.x real4 values.
 * Forces are multiplied by 0x100000000, cast to 64-bit integers and added atomically to force, which
 * stores x, y and z in three sections of PADDED_NUM_ATOMS entries.  Each thread adds its accumulated
 * energy to its own slot of energyBuffer.
 *
 * An atom index of -1 in donorAtoms/acceptorAtoms marks an unused slot: its position is taken as
 * zero and no force is written for it.
 */
extern "C" __global__ void computeDonorForces(unsigned long long* __restrict__ force, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq,
        const int4* __restrict__ exclusions, const int4* __restrict__ donorAtoms, const int4* __restrict__ acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
        PARAMETER_ARGUMENTS) {
    extern __shared__ real4 posBuffer[];
    mixed energy = 0;
    for (int donorStart = 0; donorStart < NUM_DONORS; donorStart += blockDim.x*gridDim.x) {
        // Forces on the three atoms of the donor handled in this iteration.

        real3 f1 = make_real3(0);
        real3 f2 = make_real3(0);
        real3 f3 = make_real3(0);

        // Load information about the donor this thread will compute forces on.

        int donorIndex = donorStart+blockIdx.x*blockDim.x+threadIdx.x;
        int4 atoms, exclusionIndices;
        real4 d1, d2, d3;
        if (donorIndex < NUM_DONORS) {
            atoms = donorAtoms[donorIndex];
            d1 = (atoms.x > -1 ? posq[atoms.x] : make_real4(0));
            d2 = (atoms.y > -1 ? posq[atoms.y] : make_real4(0));
            d3 = (atoms.z > -1 ? posq[atoms.z] : make_real4(0));
#ifdef USE_EXCLUSIONS
            exclusionIndices = exclusions[donorIndex];
#endif
        }
        else
            atoms = make_int4(-1, -1, -1, -1);
        for (int acceptorStart = 0; acceptorStart < NUM_ACCEPTORS; acceptorStart += blockDim.x) {
            // Load the next block of acceptors into local memory.  The first barrier keeps the
            // buffer from being overwritten while other threads still read the previous batch.
            // Threads without a donor still take part in the load and in both barriers.

            __syncthreads();
            int blockSize = min((int) blockDim.x, NUM_ACCEPTORS-acceptorStart);
            if (threadIdx.x < blockSize) {
                int4 atoms2 = acceptorAtoms[acceptorStart+threadIdx.x];
                posBuffer[3*threadIdx.x] = (atoms2.x > -1 ? posq[atoms2.x] : make_real4(0));
                posBuffer[3*threadIdx.x+1] = (atoms2.y > -1 ? posq[atoms2.y] : make_real4(0));
                posBuffer[3*threadIdx.x+2] = (atoms2.z > -1 ? posq[atoms2.z] : make_real4(0));
            }
            __syncthreads();
            if (donorIndex < NUM_DONORS) {
                for (int index = 0; index < blockSize; index++) {
                    int acceptorIndex = acceptorStart+index;
#ifdef USE_EXCLUSIONS
                    if (acceptorIndex == exclusionIndices.x || acceptorIndex == exclusionIndices.y || acceptorIndex == exclusionIndices.z || acceptorIndex == exclusionIndices.w)
                        continue;
#endif
                    // Compute the interaction between a donor and an acceptor.

                    real4 a1 = posBuffer[3*index];
                    real4 a2 = posBuffer[3*index+1];
                    real4 a3 = posBuffer[3*index+2];
                    real4 deltaD1A1 = delta(d1, a1, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);
#ifdef USE_CUTOFF
                    if (deltaD1A1.w < CUTOFF_SQUARED) {
#endif
                        COMPUTE_DONOR_FORCE
#ifdef USE_CUTOFF
                    }
#endif
                }
            }
        }

        // Write results

        if (donorIndex < NUM_DONORS) {
            if (atoms.x > -1) {
                atomicAdd(&force[atoms.x], static_cast<unsigned long long>((long long) (f1.x*0x100000000)));
                atomicAdd(&force[atoms.x+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f1.y*0x100000000)));
                atomicAdd(&force[atoms.x+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f1.z*0x100000000)));
                __threadfence_block();
            }
            if (atoms.y > -1) {
                atomicAdd(&force[atoms.y], static_cast<unsigned long long>((long long) (f2.x*0x100000000)));
                atomicAdd(&force[atoms.y+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f2.y*0x100000000)));
                atomicAdd(&force[atoms.y+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f2.z*0x100000000)));
                __threadfence_block();
            }
            if (atoms.z > -1) {
                atomicAdd(&force[atoms.z], static_cast<unsigned long long>((long long) (f3.x*0x100000000)));
                atomicAdd(&force[atoms.z+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f3.y*0x100000000)));
                atomicAdd(&force[atoms.z+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f3.z*0x100000000)));
                __threadfence_block();
            }
        }
    }
    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
}
/**
 * Compute forces on acceptors.
 *
 * Each thread handles one acceptor per pass of the outer loop and accumulates the forces on that
 * acceptor's (up to three) atoms from every donor.  Donor positions are staged through shared memory
 * in chunks of blockDim.x donors, three real4 values per donor, so the launch must supply at least
 * 3*blockDim.x*sizeof(real4) bytes of dynamic shared memory.
 *
 * @param force           force accumulators: three consecutive planes (x, y, z) of PADDED_NUM_ATOMS
 *                        64 bit fixed point values, scaled by 0x100000000
 * @param energyBuffer    not written by the code visible in this kernel
 * @param posq            atom positions
 * @param exclusions      per acceptor, up to four donor indices to skip (read only with USE_EXCLUSIONS)
 * @param donorAtoms      per donor, the indices of its atoms in x, y, z (-1 marks an unused slot)
 * @param acceptorAtoms   per acceptor, the indices of its atoms in x, y, z (-1 marks an unused slot)
 * @param periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX/Y/Z   periodic box description passed to delta()
 */
extern "C" __global__ void computeAcceptorForces(unsigned long long* __restrict__ force, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq,
        const int4* __restrict__ exclusions, const int4* __restrict__ donorAtoms, const int4* __restrict__ acceptorAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize,
        real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
        PARAMETER_ARGUMENTS) {
    extern __shared__ real4 posBuffer[];
    for (int acceptorStart = 0; acceptorStart < NUM_ACCEPTORS; acceptorStart += blockDim.x*gridDim.x) {
        // Load information about the acceptor this thread will compute forces on.
        int acceptorIndex = acceptorStart+blockIdx.x*blockDim.x+threadIdx.x;
        int4 atoms, exclusionIndices;
        real4 a1, a2, a3;
        // The accumulators must start from zero for every acceptor.  When the grid is smaller than
        // NUM_ACCEPTORS this loop runs more than once per thread, and forces left over from the
        // previous acceptor would otherwise be added to the atoms of the next one.
        real3 f1 = make_real3(0);
        real3 f2 = make_real3(0);
        real3 f3 = make_real3(0);
        if (acceptorIndex < NUM_ACCEPTORS) {
            atoms = acceptorAtoms[acceptorIndex];
            a1 = (atoms.x > -1 ? posq[atoms.x] : make_real4(0));
            a2 = (atoms.y > -1 ? posq[atoms.y] : make_real4(0));
            a3 = (atoms.z > -1 ? posq[atoms.z] : make_real4(0));
#ifdef USE_EXCLUSIONS
            exclusionIndices = exclusions[acceptorIndex];
#endif
        }
        else
            atoms = make_int4(-1, -1, -1, -1);
        for (int donorStart = 0; donorStart < NUM_DONORS; donorStart += blockDim.x) {
            // Load the next block of donors into local memory.  The barrier before the load keeps
            // threads that are still reading the previous chunk from seeing it overwritten.
            __syncthreads();
            int blockSize = min((int) blockDim.x, NUM_DONORS-donorStart);
            if (threadIdx.x < blockSize) {
                int4 atoms2 = donorAtoms[donorStart+threadIdx.x];
                posBuffer[3*threadIdx.x] = (atoms2.x > -1 ? posq[atoms2.x] : make_real4(0));
                posBuffer[3*threadIdx.x+1] = (atoms2.y > -1 ? posq[atoms2.y] : make_real4(0));
                posBuffer[3*threadIdx.x+2] = (atoms2.z > -1 ? posq[atoms2.z] : make_real4(0));
            }
            __syncthreads();
            if (acceptorIndex < NUM_ACCEPTORS) {
                for (int index = 0; index < blockSize; index++) {
                    int donorIndex = donorStart+index;
#ifdef USE_EXCLUSIONS
                    if (donorIndex == exclusionIndices.x || donorIndex == exclusionIndices.y || donorIndex == exclusionIndices.z || donorIndex == exclusionIndices.w)
                        continue;
#endif
                    // Compute the interaction between a donor and an acceptor.
                    real4 d1 = posBuffer[3*index];
                    real4 d2 = posBuffer[3*index+1];
                    real4 d3 = posBuffer[3*index+2];
                    real4 deltaD1A1 = delta(d1, a1, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);
#ifdef USE_CUTOFF
                    if (deltaD1A1.w < CUTOFF_SQUARED) {
#endif
                        COMPUTE_ACCEPTOR_FORCE
#ifdef USE_CUTOFF
                    }
#endif
                }
            }
        }
        // Write results
        if (acceptorIndex < NUM_ACCEPTORS) {
            if (atoms.x > -1) {
                atomicAdd(&force[atoms.x], static_cast<unsigned long long>((long long) (f1.x*0x100000000)));
                atomicAdd(&force[atoms.x+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f1.y*0x100000000)));
                atomicAdd(&force[atoms.x+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f1.z*0x100000000)));
                __threadfence_block();
            }
            if (atoms.y > -1) {
                atomicAdd(&force[atoms.y], static_cast<unsigned long long>((long long) (f2.x*0x100000000)));
                atomicAdd(&force[atoms.y+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f2.y*0x100000000)));
                atomicAdd(&force[atoms.y+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f2.z*0x100000000)));
                __threadfence_block();
            }
            if (atoms.z > -1) {
                atomicAdd(&force[atoms.z], static_cast<unsigned long long>((long long) (f3.x*0x100000000)));
                atomicAdd(&force[atoms.z+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f3.y*0x100000000)));
                atomicAdd(&force[atoms.z+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (f3.z*0x100000000)));
                __threadfence_block();
            }
        }
    }
}
/**
 * Read the position of one particle.
 *
 * With USE_MIXED_PRECISION the x, y and z components are the sum of the entry in posq and the
 * matching entry in posqCorrection, evaluated in the mixed type; w is taken from posq alone.
 * Otherwise the entry in posq is returned as is and posqCorrection is not read.
 */
inline __device__ mixed4 loadPos(const real4* __restrict__ posq, const real4* __restrict__ posqCorrection, int index) {
#ifndef USE_MIXED_PRECISION
    return posq[index];
#else
    const real4 coarse = posq[index];
    const real4 fine = posqCorrection[index];
    return make_mixed4(coarse.x+(mixed)fine.x,
                       coarse.y+(mixed)fine.y,
                       coarse.z+(mixed)fine.z,
                       coarse.w);
#endif
}
/**
 * Write the position of one particle.
 *
 * With USE_MIXED_PRECISION the value is split in two: posq receives each component rounded to
 * real, and posqCorrection receives what the rounding of x, y and z discarded (its w is 0).
 * Otherwise pos is stored in posq unchanged and posqCorrection is not written.
 */
inline __device__ void storePos(real4* __restrict__ posq, real4* __restrict__ posqCorrection, int index, mixed4 pos) {
#ifndef USE_MIXED_PRECISION
    posq[index] = pos;
#else
    const real4 rounded = make_real4((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
    posq[index] = rounded;
    // Whatever did not fit into the rounded value goes into the correction array.
    posqCorrection[index] = make_real4(pos.x-rounded.x, pos.y-rounded.y, pos.z-rounded.z, 0);
#endif
}
/**
 * Widen a float4 to a double4, component by component.
 */
inline __device__ double4 convertToDouble4(float4 a) {
    double4 widened;
    widened.x = a.x;
    widened.y = a.y;
    widened.z = a.z;
    widened.w = a.w;
    return widened;
}
/**
 * Overload for values that are already double4; returns the argument unchanged so callers
 * can use the same call for either precision.
 */
inline __device__ double4 convertToDouble4(double4 a) {
    const double4 unchanged = a;
    return unchanged;
}
/**
 * Convert a double4 to the mixed4 type, component by component.
 */
inline __device__ mixed4 convertFromDouble4(double4 a) {
    const mixed4 converted = make_mixed4(a.x, a.y, a.z, a.w);
    return converted;
}
/**
 * Apply the build-supplied COMPUTE_STEP expression to every particle.
 *
 * Threads walk the particles with a stride of blockDim.x*gridDim.x.  For each particle the
 * position, velocity and force are converted to double and exposed to COMPUTE_STEP through the
 * locals declared below (position, velocity, f, mass, stepSize, gaussianIndex, uniformIndex),
 * so those names must not be changed.
 *
 * @param posq, posqCorrection   particle positions, read through loadPos()
 * @param posDelta               added to the loaded position when LOAD_POS_AS_DELTA is defined
 * @param velm                   velocities; w is used as the inverse mass, and particles whose
 *                               w is 0 are skipped
 * @param force                  three consecutive planes (x, y, z) of PADDED_NUM_ATOMS 64 bit
 *                               fixed point values
 * @param dt                     dt[0].y is used as the step size
 * @param gaussianBaseIndex      initial value of gaussianIndex for every particle
 *
 * globals, sum, gaussianValues, uniformValues, energy and energyParamDerivs are not referenced by
 * the code visible here; presumably they are consumed by COMPUTE_STEP.
 */
extern "C" __global__ void computePerDof(real4* __restrict__ posq, real4* __restrict__ posqCorrection, mixed4* __restrict__ posDelta,
        mixed4* __restrict__ velm, const long long* __restrict__ force, const mixed2* __restrict__ dt, const mixed* __restrict__ globals,
        mixed* __restrict__ sum, const float4* __restrict__ gaussianValues, unsigned int gaussianBaseIndex, const float4* __restrict__ uniformValues,
        const mixed energy, mixed* __restrict__ energyParamDerivs
        PARAMETER_ARGUMENTS) {
    double3 stepSize = make_double3(dt[0].y);
    int index = blockIdx.x*blockDim.x+threadIdx.x;
    // The force kernels accumulate fixed point values multiplied by 0x100000000 (2^32), so the
    // inverse of exactly that factor converts them back.  Dividing by 0xFFFFFFFF (2^32-1) would
    // scale every force by a slightly wrong amount.
    const double forceScale = 1.0/0x100000000;
    while (index < NUM_ATOMS) {
#ifdef LOAD_POS_AS_DELTA
        double4 position = convertToDouble4(loadPos(posq, posqCorrection, index)+posDelta[index]);
#else
        double4 position = convertToDouble4(loadPos(posq, posqCorrection, index));
#endif
        double4 velocity = convertToDouble4(velm[index]);
        double4 f = make_double4(forceScale*force[index], forceScale*force[index+PADDED_NUM_ATOMS], forceScale*force[index+PADDED_NUM_ATOMS*2], 0.0);
        // velocity.w holds the inverse mass; the value computed here is only used inside the
        // guarded section below, where velocity.w is known to be non-zero.
        double3 mass = make_double3(1.0/velocity.w);
        if (velocity.w != 0.0) {
            int gaussianIndex = gaussianBaseIndex;
            int uniformIndex = 0;
            COMPUTE_STEP
        }
        index += blockDim.x*gridDim.x;
    }
}
/**
 * Add a force to the accumulators of one atom in global memory.
 *
 * Each component is multiplied by 0x100000000, truncated to a 64 bit integer and added atomically.
 * The x, y and z accumulators of an atom are PADDED_NUM_ATOMS entries apart in forceBuffers.
 */
inline __device__ void storeForce(int atom, real3 force, unsigned long long* __restrict__ forceBuffers) {
    const unsigned long long fixedX = static_cast<unsigned long long>((long long) (force.x*0x100000000));
    const unsigned long long fixedY = static_cast<unsigned long long>((long long) (force.y*0x100000000));
    const unsigned long long fixedZ = static_cast<unsigned long long>((long long) (force.z*0x100000000));
    atomicAdd(&forceBuffers[atom], fixedX);
    atomicAdd(&forceBuffers[atom+PADDED_NUM_ATOMS], fixedY);
    atomicAdd(&forceBuffers[atom+2*PADDED_NUM_ATOMS], fixedZ);
}
/**
 * Drop the w component of a real4, keeping x, y and z.
 */
inline __device__ real3 trim(real4 v) {
    real3 xyz;
    xyz.x = v.x;
    xyz.y = v.y;
    xyz.z = v.z;
    return xyz;
}
/**
 * Return vec1-vec2 in x, y, z and the squared length of that difference in w.
 * When USE_PERIODIC is defined the difference is first passed through APPLY_PERIODIC_TO_DELTA,
 * which uses the periodic box arguments.
 */
inline __device__ real4 delta(real3 vec1, real3 vec2, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
    real4 separation = make_real4(vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0.0f);
#ifdef USE_PERIODIC
    APPLY_PERIODIC_TO_DELTA(separation)
#endif
    // The squared length is taken after any periodic wrapping.
    const real lengthSquared = separation.x*separation.x + separation.y*separation.y + separation.z*separation.z;
    separation.w = lengthSquared;
    return separation;
}
/**
 * Return the angle between two vectors.  Each vector must carry its squared magnitude in w.
 */
__device__ real computeAngle(real4 vec1, real4 vec2) {
    real dotProduct = vec1.x*vec2.x + vec1.y*vec2.y + vec1.z*vec2.z;
    real cosine = dotProduct*RSQRT(vec1.w*vec2.w);
    const bool nearSingularity = (cosine > 0.99f || cosine < -0.99f);
    if (!nearSingularity)
        return ACOS(cosine);

    // acos() loses accuracy this close to +/-1, so recover the angle from the cross product
    // with asin() and move it to the second quadrant when the cosine is negative.
    real3 normal = cross(vec1, vec2);
    real magnitudeProduct = vec1.w*vec2.w;
    real angle = ASIN(SQRT(dot(normal, normal)/magnitudeProduct));
    if (cosine < 0.0f)
        angle = M_PI-angle;
    return angle;
}
/**
 * Return the cross product of two vectors in x, y, z with its squared magnitude in w.
 */
inline __device__ real4 computeCross(real4 vec1, real4 vec2) {
    const real3 product = cross(vec1, vec2);
    const real squaredMagnitude = product.x*product.x+product.y*product.y+product.z*product.z;
    return make_real4(product.x, product.y, product.z, squaredMagnitude);
}
/**
 * Report whether the pair (atom1, atom2) appears in the exclusion list.
 *
 * The pair is looked up under the smaller of the two indices: the entries for atom a occupy
 * exclusions[exclusionStartIndex[a]] up to, but not including, exclusions[exclusionStartIndex[a+1]].
 * The entries are scanned from the last one backwards, and the scan gives up as soon as it meets
 * an entry that is not greater than the smaller index.
 */
inline __device__ bool isInteractionExcluded(int atom1, int atom2, const int* __restrict__ exclusions, const int* __restrict__ exclusionStartIndex) {
    const int lower = (atom1 > atom2 ? atom2 : atom1);
    const int upper = (atom1 > atom2 ? atom1 : atom2);
    const int begin = exclusionStartIndex[lower];
    int cursor = exclusionStartIndex[lower+1];
    while (--cursor >= begin) {
        const int candidate = exclusions[cursor];
        if (candidate == upper)
            return true;
        if (candidate <= lower)
            break;
    }
    return false;
}
// Array of NUM_GLOBALS floats in constant memory.  It is not referenced by name in the kernel
// text below; presumably the macro-expanded interaction code reads it -- confirm against the
// code that generates those macros.
__constant__ float globals[NUM_GLOBALS];
/**
 * Compute the interaction.
 *
 * Blocks stride over the first particle p1 (one p1 per block per pass) and the threads of a block
 * stride over the NUM_CANDIDATE_COMBINATIONS candidate combinations for that p1.  A combination is
 * evaluated only if it passes IS_VALID_COMBINATION and, where enabled, the cutoff, filter and
 * exclusion checks.  Almost all of the work is done by build-supplied macros that read and write
 * the locals declared here (p1, a1, firstNeighbor, numNeighbors, index, includeInteraction,
 * order, energy, ...), so those names are part of the contract with the macros.
 *
 * The per-thread energy is added to energyBuffer at the thread's global index.  forceBuffers and
 * posq are not touched by the code visible here; presumably the macros use them.
 */
extern "C" __global__ void computeInteraction(
unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
#ifdef USE_CUTOFF
, const int* __restrict__ neighbors, const int* __restrict__ neighborStartIndex
#endif
#ifdef USE_FILTERS
, int* __restrict__ particleTypes, int* __restrict__ orderIndex, int* __restrict__ particleOrder
#endif
#ifdef USE_EXCLUSIONS
, int* __restrict__ exclusions, int* __restrict__ exclusionStartIndex
#endif
PARAMETER_ARGUMENTS) {
mixed energy = 0;
// Loop over particles to be the first one in the set.
for (int p1 = blockIdx.x; p1 < NUM_ATOMS; p1 += gridDim.x) {
// a1 is p1 when a central particle is used and 0 otherwise; it is not read by the code visible
// here (presumably by the macros below).
#ifdef USE_CENTRAL_PARTICLE
const int a1 = p1;
#else
const int a1 = 0;
#endif
// Number of candidate partners for p1: taken from the neighbor list with a cutoff, otherwise
// every atom (central particle) or only the atoms that follow p1.
#ifdef USE_CUTOFF
int firstNeighbor = neighborStartIndex[p1];
int numNeighbors = neighborStartIndex[p1+1]-firstNeighbor;
#else
#ifdef USE_CENTRAL_PARTICLE
int numNeighbors = NUM_ATOMS;
#else
int numNeighbors = NUM_ATOMS-p1-1;
#endif
#endif
int numCombinations = NUM_CANDIDATE_COMBINATIONS;
// The threads of the block share out the candidate combinations for this p1.
for (int index = threadIdx.x; index < numCombinations; index += blockDim.x) {
FIND_ATOMS_FOR_COMBINATION_INDEX;
bool includeInteraction = IS_VALID_COMBINATION;
#ifdef USE_CUTOFF
if (includeInteraction) {
VERIFY_CUTOFF;
}
#endif
#ifdef USE_FILTERS
// An entry of -1 in orderIndex rejects the combination.
int order = orderIndex[COMPUTE_TYPE_INDEX];
if (order == -1)
includeInteraction = false;
#endif
#ifdef USE_EXCLUSIONS
if (includeInteraction) {
VERIFY_EXCLUSIONS;
}
#endif
if (includeInteraction) {
PERMUTE_ATOMS;
LOAD_PARTICLE_DATA;
COMPUTE_INTERACTION;
}
}
}
// One energy slot per thread of the launch.
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
}
/**
 * Compute an axis-aligned bounding box for every block of TILE_SIZE consecutive atoms.
 *
 * Each thread handles whole blocks.  For a block it stores the box centre in blockCenter and the
 * box half-widths in blockBoundingBox.  With USE_PERIODIC each position is wrapped before it is
 * merged into the box: the first one with APPLY_PERIODIC_TO_POS, the others relative to the
 * centre of the box built so far.  Thread 0 of block 0 also resets *numNeighborPairs to 0.
 */
extern "C" __global__ void findBlockBounds(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        const real4* __restrict__ posq, real4* __restrict__ blockCenter, real4* __restrict__ blockBoundingBox, int* __restrict__ numNeighborPairs) {
    for (int block = blockIdx.x*blockDim.x+threadIdx.x; block*TILE_SIZE < NUM_ATOMS; block += blockDim.x*gridDim.x) {
        const int firstAtom = block*TILE_SIZE;
        const int endAtom = min(firstAtom+TILE_SIZE, NUM_ATOMS);

        // Seed the box with the first atom of the block.
        real4 current = posq[firstAtom];
#ifdef USE_PERIODIC
        APPLY_PERIODIC_TO_POS(current)
#endif
        real4 lower = current;
        real4 upper = current;

        // Grow the box to take in the remaining atoms.
        for (int atom = firstAtom+1; atom < endAtom; atom++) {
            current = posq[atom];
#ifdef USE_PERIODIC
            real4 midpoint = 0.5f*(upper+lower);
            APPLY_PERIODIC_TO_POS_WITH_CENTER(current, midpoint)
#endif
            lower = make_real4(min(lower.x,current.x), min(lower.y,current.y), min(lower.z,current.z), 0);
            upper = make_real4(max(upper.x,current.x), max(upper.y,current.y), max(upper.z,current.z), 0);
        }
        real4 halfWidths = 0.5f*(upper-lower);
        blockBoundingBox[block] = halfWidths;
        blockCenter[block] = 0.5f*(upper+lower);
    }
    const bool isFirstThread = (blockIdx.x == 0 && threadIdx.x == 0);
    if (isFirstThread)
        *numNeighborPairs = 0;
}
/**
 * Find a list of neighbors for each atom.
 *
 * Each thread owns one atom (atom1) per pass of the outer loop, and the 32 threads of a warp
 * cooperate: they first test 32 candidate blocks in parallel against the bounding box of atom1's
 * block, then walk the blocks that passed one at a time, staging that block's 32 positions in
 * shared memory so that every lane can compare its own atom against all of them.
 *
 * Pairs are appended to neighborPairs at an offset reserved by an atomic add on *numNeighborPairs.
 * If the reservation would run past maxNeighborPairs nothing is written for it, but the counter
 * still grows, so the overflow can be detected from its final value.  The number of neighbors found
 * for each atom is stored in numNeighborsForAtom.
 *
 * The warp-wide BALLOT and the warp barriers below require all 32 lanes of a warp to go round the
 * loops together; the code relies on atom1 < PADDED_NUM_ATOMS being the same for a whole warp.
 */
extern "C" __global__ void findNeighbors(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
        const real4* __restrict__ posq, const real4* __restrict__ blockCenter, const real4* __restrict__ blockBoundingBox, int2* __restrict__ neighborPairs,
        int* __restrict__ numNeighborPairs, int* __restrict__ numNeighborsForAtom, int maxNeighborPairs
#ifdef USE_EXCLUSIONS
        , const int* __restrict__ exclusions, const int* __restrict__ exclusionStartIndex
#endif
        ) {
    __shared__ real3 positionCache[FIND_NEIGHBORS_WORKGROUP_SIZE];
    int indexInWarp = threadIdx.x%32;
    for (int atom1 = blockIdx.x*blockDim.x+threadIdx.x; atom1 < PADDED_NUM_ATOMS; atom1 += blockDim.x*gridDim.x) {
        // Load data for this atom.  Note that all threads in a warp are processing atoms from the same block.
        real3 pos1 = trim(posq[atom1]);
        int block1 = atom1/TILE_SIZE;
        real4 blockCenter1 = blockCenter[block1];
        real4 blockSize1 = blockBoundingBox[block1];
        int totalNeighborsForAtom1 = 0;
        // Loop over atom blocks to search for neighbors.  The threads in a warp compare block1 against 32
        // other blocks in parallel.
#ifdef USE_CENTRAL_PARTICLE
        int startBlock = 0;
#else
        int startBlock = block1;
#endif
        for (int block2Base = startBlock; block2Base < NUM_BLOCKS; block2Base += 32) {
            int block2 = block2Base+indexInWarp;
            bool includeBlock2 = (block2 < NUM_BLOCKS);
            if (includeBlock2) {
                real4 blockCenter2 = blockCenter[block2];
                real4 blockSize2 = blockBoundingBox[block2];
                real4 blockDelta = blockCenter1-blockCenter2;
#ifdef USE_PERIODIC
                APPLY_PERIODIC_TO_DELTA(blockDelta)
#endif
                // Distance between the two boxes along each axis (0 where they overlap).
                blockDelta.x = max(0.0f, fabs(blockDelta.x)-blockSize1.x-blockSize2.x);
                blockDelta.y = max(0.0f, fabs(blockDelta.y)-blockSize1.y-blockSize2.y);
                blockDelta.z = max(0.0f, fabs(blockDelta.z)-blockSize1.z-blockSize2.z);
                includeBlock2 &= (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z < CUTOFF_SQUARED);
            }
            // Loop over any blocks we identified as potentially containing neighbors.
            int includeBlockFlags = BALLOT(includeBlock2);
            while (includeBlockFlags != 0) {
                // Take the lowest set bit and clear it.
                int i = __ffs(includeBlockFlags)-1;
                includeBlockFlags &= includeBlockFlags-1;
                int block2 = block2Base+i;
                // Loop over atoms in this block.
                int start = block2*TILE_SIZE;
                int included[TILE_SIZE];
                int numIncluded = 0;
                positionCache[threadIdx.x] = trim(posq[start+indexInWarp]);
                // Each lane is about to read the entries written by the other lanes of its warp.
                // From Volta on, lanes are not guaranteed to run in lockstep, so an explicit warp
                // barrier is needed to make those writes visible first.
#if __CUDA_ARCH__ >= 700
                __syncwarp();
#endif
                if (atom1 < NUM_ATOMS) {
                    for (int j = 0; j < 32; j++) {
                        int atom2 = start+j;
                        real3 pos2 = positionCache[threadIdx.x-indexInWarp+j];
                        // Decide whether to include this atom pair in the neighbor list.
                        real4 atomDelta = delta(pos1, pos2, periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ);
#ifdef USE_CENTRAL_PARTICLE
                        bool includeAtom = (atom2 != atom1 && atom2 < NUM_ATOMS && atomDelta.w < CUTOFF_SQUARED);
#else
                        bool includeAtom = (atom2 > atom1 && atom2 < NUM_ATOMS && atomDelta.w < CUTOFF_SQUARED);
#endif
#ifdef USE_EXCLUSIONS
                        if (includeAtom)
                            includeAtom &= !isInteractionExcluded(atom1, atom2, exclusions, exclusionStartIndex);
#endif
                        if (includeAtom)
                            included[numIncluded++] = atom2;
                    }
                }
                // Hold every lane here until the whole warp has finished reading, so that a lane
                // that runs ahead cannot overwrite the cache with the next block's positions.
#if __CUDA_ARCH__ >= 700
                __syncwarp();
#endif
                // If we found any neighbors, store them to the neighbor list.
                if (numIncluded > 0) {
                    int baseIndex = atomicAdd(numNeighborPairs, numIncluded);
                    if (baseIndex+numIncluded <= maxNeighborPairs)
                        for (int j = 0; j < numIncluded; j++)
                            neighborPairs[baseIndex+j] = make_int2(atom1, included[j]);
                    totalNeighborsForAtom1 += numIncluded;
                }
            }
        }
        if (atom1 < NUM_ATOMS)
            numNeighborsForAtom[atom1] = totalNeighborsForAtom1;
    }
}
/**
 * Turn the per-atom neighbor counts into start offsets: neighborStartIndex[0] is 0 and
 * neighborStartIndex[i+1] is the total number of neighbors of atoms 0..i.  The counts in
 * numNeighborsForAtom are reset to 0 on the way.
 *
 * The kernel is meant to run as a single work group.  It processes the atoms in chunks of
 * blockDim.x with an inclusive prefix sum in shared memory, so it needs at least
 * blockDim.x*sizeof(unsigned int) bytes of dynamic shared memory.
 *
 * If *numNeighborPairs exceeds maxNeighborPairs, every start index is set to 0 instead.
 */
extern "C" __global__ void computeNeighborStartIndices(int* __restrict__ numNeighborsForAtom, int* __restrict__ neighborStartIndex,
        int* __restrict__ numNeighborPairs, int maxNeighborPairs) {
    extern __shared__ unsigned int posBuffer[];
    const bool listOverflowed = (*numNeighborPairs > maxNeighborPairs);
    if (listOverflowed) {
        // The pair list did not fit in memory and has to be rebuilt, so report that no atom
        // has any neighbors.
        for (int entry = threadIdx.x; entry <= NUM_ATOMS; entry += blockDim.x)
            neighborStartIndex[entry] = 0;
        return;
    }
    unsigned int runningTotal = 0;
    for (unsigned int chunkStart = 0; chunkStart < NUM_ATOMS; chunkStart += blockDim.x) {
        // Bring this chunk's counts into shared memory, padding past the end with zeros.
        unsigned int atom = chunkStart+threadIdx.x;
        posBuffer[threadIdx.x] = (atom < NUM_ATOMS ? numNeighborsForAtom[atom] : 0);
        __syncthreads();

        // Inclusive prefix sum over the chunk.  Every thread reads its partner's value, waits
        // for the others to do the same, and only then updates its own slot.
        unsigned int distance = 1;
        while (distance < blockDim.x) {
            unsigned int incoming = (threadIdx.x >= distance ? posBuffer[threadIdx.x-distance] : 0);
            __syncthreads();
            posBuffer[threadIdx.x] += incoming;
            __syncthreads();
            distance *= 2;
        }

        // Publish the offsets for this chunk.
        if (atom < NUM_ATOMS) {
            neighborStartIndex[atom+1] = posBuffer[threadIdx.x]+runningTotal;
            numNeighborsForAtom[atom] = 0; // Cleared so that the next kernel can use it as a counter
        }
        runningTotal += posBuffer[blockDim.x-1];
        __syncthreads();
    }
    if (threadIdx.x == 0)
        neighborStartIndex[0] = 0;
}
/**
 * Scatter the (atom, neighbor) pairs into the final neighbor list.
 *
 * For each pair, the second atom is written into the section of neighbors that starts at
 * neighborStartIndex of the first atom; the position inside that section is claimed with an
 * atomic increment of the first atom's entry in numNeighborsForAtom.  The kernel does nothing
 * when *numNeighborPairs exceeds maxNeighborPairs.
 */
extern "C" __global__ void copyPairsToNeighborList(const int2* __restrict__ neighborPairs, int* __restrict__ neighbors, int* __restrict__ numNeighborPairs,
        int maxNeighborPairs, int* __restrict__ numNeighborsForAtom, const int* __restrict__ neighborStartIndex) {
    int pairCount = *numNeighborPairs;
    if (pairCount > maxNeighborPairs) {
        // The pair list overflowed; it will have to be rebuilt before it can be used.
        return;
    }
    for (unsigned int slot = blockDim.x*blockIdx.x+threadIdx.x; slot < pairCount; slot += blockDim.x*gridDim.x) {
        const int2 entry = neighborPairs[slot];
        const int sectionStart = neighborStartIndex[entry.x];
        const int positionInSection = atomicAdd(numNeighborsForAtom+entry.x, 1);
        neighbors[sectionStart+positionInSection] = entry.y;
    }
}
/**
 * Per-thread record held in shared memory by computeInteractionGroups.  Each thread stores the
 * data of its second atom here so that the other lanes of the warp can read it and add to its
 * force.
 */
typedef struct {
// Position of the atom (x, y, z of its posq entry).
real x, y, z;
// The w component of the atom's posq entry.
real q;
// Force accumulated on the atom.
real fx, fy, fz;
// Further per-atom fields supplied by the build.
ATOM_PARAMETER_DATA
#ifndef PARAMETER_SIZE_IS_EVEN
// Extra slot present only when PARAMETER_SIZE_IS_EVEN is not defined; presumably it rounds the
// struct up to an even number of values -- confirm against the code that defines the macro.
real padding;
#endif
} AtomData;
/**
 * Return to every thread of a warp the largest value passed in by any of them.
 *
 * The exchange is only compiled for compute capability 7.0 (Volta) and later, where it is
 * needed; on earlier architectures the argument is returned unchanged.  All 32 lanes of the
 * warp must make the call together, because the shuffle uses the full-warp mask.
 */
__device__ int reduceMax(int val) {
#if __CUDA_ARCH__ >= 700
    int laneDistance = 16;
    while (laneDistance > 0) {
        const int partnerValue = __shfl_xor_sync(0xffffffff, val, laneDistance);
        val = max(val, partnerValue);
        laneDistance /= 2;
    }
#endif
    return val;
}
/**
 * Compute the interactions listed in groupData, one tile of TILE_SIZE entries at a time.
 *
 * Every warp is given a contiguous range of tiles.  Within a tile each lane reads one int4 entry:
 * x and y are the two atoms the lane is responsible for, z packs a range of lanes as
 * rangeStart (low 16 bits) and rangeEnd (next 16 bits), and w is a bit mask in which a cleared
 * bit tj marks the partner held by lane tj as excluded.  The lane's second atom is placed in
 * shared memory so that the other lanes of the warp can interact with it.
 *
 * When useNeighborList is true the number of tiles is read from numGroupTiles[0]; otherwise the
 * tiles FIRST_TILE to LAST_TILE are used.
 *
 * Forces are added to forceBuffers as 64 bit fixed point values scaled by 0x100000000, with the
 * x, y and z planes PADDED_NUM_ATOMS entries apart.  The per-thread energy is added to
 * energyBuffer at the thread's global index.
 *
 * The build-supplied macros (INIT_DERIVATIVES, LOAD_*_PARAMETERS, COMPUTE_INTERACTION,
 * SAVE_DERIVATIVES) read and write the locals declared here, so their names must not change.
 */
extern "C" __global__ void computeInteractionGroups(
unsigned long long* __restrict__ forceBuffers, mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const int4* __restrict__ groupData,
const int* __restrict__ numGroupTiles, bool useNeighborList,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
PARAMETER_ARGUMENTS) {
const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; // global warpIndex
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // index within the warp
const unsigned int tbx = threadIdx.x - tgx; // block warpIndex
mixed energy = 0;
INIT_DERIVATIVES
__shared__ AtomData localData[LOCAL_MEMORY_SIZE];
// Split the tiles evenly between the warps of the launch.
const unsigned int startTile = (useNeighborList ? warp*numGroupTiles[0]/totalWarps : FIRST_TILE+warp*(LAST_TILE-FIRST_TILE)/totalWarps);
const unsigned int endTile = (useNeighborList ? (warp+1)*numGroupTiles[0]/totalWarps : FIRST_TILE+(warp+1)*(LAST_TILE-FIRST_TILE)/totalWarps);
for (int tile = startTile; tile < endTile; tile++) {
// Unpack this lane's entry of the tile.
const int4 atomData = groupData[TILE_SIZE*tile+tgx];
const int atom1 = atomData.x;
const int atom2 = atomData.y;
const int rangeStart = atomData.z&0xFFFF;
const int rangeEnd = (atomData.z>>16)&0xFFFF;
const int exclusions = atomData.w;
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
real3 force = make_real3(0);
// Publish the second atom in shared memory, with its force accumulator cleared.
real4 posq2 = posq[atom2];
localData[threadIdx.x].x = posq2.x;
localData[threadIdx.x].y = posq2.y;
localData[threadIdx.x].z = posq2.z;
localData[threadIdx.x].q = posq2.w;
LOAD_LOCAL_PARAMETERS
localData[threadIdx.x].fx = 0.0f;
localData[threadIdx.x].fy = 0.0f;
localData[threadIdx.x].fz = 0.0f;
int tj = tgx;
// Use the longest range found in the warp as the loop bound, so that every lane goes round
// the loop, and through SYNC_WARPS, the same number of times.
int rangeStop = rangeStart + reduceMax(rangeEnd-rangeStart);
SYNC_WARPS;
for (int j = rangeStart; j < rangeStop; j++) {
if (j < rangeEnd) {
// A cleared bit marks the partner held by lane tj as excluded.  Here the flag is only tested
// when USE_CUTOFF is defined; presumably COMPUTE_INTERACTION also reads it -- confirm.
bool isExcluded = (((exclusions>>tj)&1) == 0);
int localIndex = tbx+tj;
posq2 = make_real4(localData[localIndex].x, localData[localIndex].y, localData[localIndex].z, localData[localIndex].q);
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (!isExcluded && r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = r2*invR;
LOAD_ATOM2_PARAMETERS
real dEdR = 0.0f;
real tempEnergy = 0.0f;
const real interactionScale = 1.0f;
COMPUTE_INTERACTION
energy += tempEnergy;
// Scale the displacement by dEdR, then apply it with opposite signs to the two atoms.
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
localData[localIndex].fx += delta.x;
localData[localIndex].fy += delta.y;
localData[localIndex].fz += delta.z;
#ifdef USE_CUTOFF
}
#endif
// Move on to the next partner, wrapping round inside [rangeStart, rangeEnd).
tj = (tj == rangeEnd-1 ? rangeStart : tj+1);
}
SYNC_WARPS;
}
// The force on atom1 is written only when the exclusion mask has at least one bit set; the force
// gathered in shared memory for atom2 is always written.
if (exclusions != 0) {
atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
}
atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
// Barrier before the next tile reuses the shared memory slots.
SYNC_WARPS;
}
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
SAVE_DERIVATIVES
}
/**
 * Reset the tile count to 0 when the rebuild flag is 1, ready for the neighbor list to be
 * rebuilt.  This kernel is executed by a single thread.
 */
extern "C" __global__ void prepareToBuildNeighborList(int* __restrict__ rebuildNeighborList, int* __restrict__ numGroupTiles) {
    const bool rebuildRequested = (*rebuildNeighborList == 1);
    if (rebuildRequested)
        *numGroupTiles = 0;
}
/**
 * Filter the list of tiles to include only ones that have interactions within the
 * padded cutoff.
 *
 * Every warp scans a contiguous share of the NUM_TILES tiles in groupData, using the same entry
 * layout as the interaction kernel (x, y: atoms; z: packed rangeStart/rangeEnd; w: exclusion
 * mask).  As soon as any lane finds a pair that is not excluded and is closer than
 * PADDED_CUTOFF_SQUARED, the warp stops scanning the tile, reserves a slot by atomically
 * incrementing numGroupTiles[0], and copies the tile's TILE_SIZE entries to that slot of
 * filteredGroupData.
 *
 * The kernel returns at once when rebuildNeighborList[0] is 0.
 */
extern "C" __global__ void buildNeighborList(int* __restrict__ rebuildNeighborList, int* __restrict__ numGroupTiles,
const real4* __restrict__ posq, const int4* __restrict__ groupData, int4* __restrict__ filteredGroupData,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
// If the neighbor list doesn't need to be rebuilt on this step, return immediately.
if (rebuildNeighborList[0] == 0)
return;
const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; // global warpIndex
const unsigned int local_warp = threadIdx.x/TILE_SIZE; // local warpIndex
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // index within the warp
const unsigned int tbx = threadIdx.x - tgx; // block warpIndex
__shared__ real4 localPos[LOCAL_MEMORY_SIZE];
// One flag and one output slot per warp of the block.  Both are volatile so that each access
// goes to shared memory.
__shared__ volatile bool anyInteraction[WARPS_IN_BLOCK];
__shared__ volatile int tileIndex[WARPS_IN_BLOCK];
// Split the tiles evenly between the warps of the launch.
const unsigned int startTile = warp*NUM_TILES/totalWarps;
const unsigned int endTile = (warp+1)*NUM_TILES/totalWarps;
for (int tile = startTile; tile < endTile; tile++) {
// Unpack this lane's entry of the tile.
const int4 atomData = groupData[TILE_SIZE*tile+tgx];
const int atom1 = atomData.x;
const int atom2 = atomData.y;
const int rangeStart = atomData.z&0xFFFF;
const int rangeEnd = (atomData.z>>16)&0xFFFF;
const int exclusions = atomData.w;
real4 posq1 = posq[atom1];
// Publish the second atom's position for the other lanes of the warp.
localPos[threadIdx.x] = posq[atom2];
// The first lane of the warp clears the flag for this tile.
if (tgx == 0)
anyInteraction[local_warp] = false;
int tj = tgx;
// Use the longest range found in the warp as the loop bound, so that every lane goes round
// the loop, and through SYNC_WARPS, the same number of times.
int rangeStop = rangeStart + reduceMax(rangeEnd-rangeStart);
SYNC_WARPS;
// Stop early once any lane has found an interaction.  The barriers at the top and bottom of
// the body separate the reads of the flag in the loop condition from the writes inside.
for (int j = rangeStart; j < rangeStop && !anyInteraction[local_warp]; j++) {
SYNC_WARPS;
if (j < rangeEnd && tj < rangeEnd) {
// A cleared bit marks the partner held by lane tj as excluded.
bool isExcluded = (((exclusions>>tj)&1) == 0);
int localIndex = tbx+tj;
real3 delta = make_real3(localPos[localIndex].x-posq1.x, localPos[localIndex].y-posq1.y, localPos[localIndex].z-posq1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
if (!isExcluded && r2 < PADDED_CUTOFF_SQUARED)
anyInteraction[local_warp] = true;
}
// Move on to the next partner, wrapping round inside [rangeStart, rangeEnd).
tj = (tj == rangeEnd-1 ? rangeStart : tj+1);
SYNC_WARPS;
}
if (anyInteraction[local_warp]) {
SYNC_WARPS;
// The first lane reserves an output slot; after the barrier every lane copies its own entry.
if (tgx == 0)
tileIndex[local_warp] = atomicAdd(numGroupTiles, 1);
SYNC_WARPS;
filteredGroupData[TILE_SIZE*tileIndex[local_warp]+tgx] = atomData;
}
}
}
// Amount added to a particle's offset radius to obtain its non-offset radius.  Units are not
// stated here; presumably nanometers -- confirm.
#define DIELECTRIC_OFFSET 0.009f
// Amount added to the non-offset radius in the surface area term of reduceBornForce.  Units are
// not stated here; presumably nanometers -- confirm.
#define PROBE_RADIUS 0.14f
// Number of TILE_SIZE-wide warps in one force work group.
#define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE)
/**
 * Convert the accumulated Born sums into Born radii and the matching chain rule terms.
 *
 * For every atom the fixed point sum is scaled back by 1/0x100000000 and multiplied by half the
 * offset radius (params[atom].x).  The Born radius and the chain term are then derived from
 * tanh(alpha*s - beta*s^2 + gamma*s^3) of that value and stored in bornRadii and obcChain.
 */
extern "C" __global__ void reduceBornSum(float alpha, float beta, float gamma, const long long* __restrict__ bornSum,
        const float2* __restrict__ params, real* __restrict__ bornRadii, real* __restrict__ obcChain) {
    unsigned int atom = blockIdx.x*blockDim.x+threadIdx.x;
    while (atom < NUM_ATOMS) {
        // Recover the summed value from its fixed point form.
        real scaledSum = RECIP(0x100000000)*bornSum[atom];

        // Powers of the scaled sum feed both the tanh argument and the chain term.
        float offsetRadius = params[atom].x;
        scaledSum *= 0.5f*offsetRadius;
        const real scaledSum2 = scaledSum*scaledSum;
        const real scaledSum3 = scaledSum*scaledSum2;
        const real tanhValue = tanh(alpha*scaledSum - beta*scaledSum2 + gamma*scaledSum3);
        const real fullRadius = offsetRadius + DIELECTRIC_OFFSET;

        const real bornRadius = RECIP(RECIP(offsetRadius) - tanhValue/fullRadius);
        real chainTerm = offsetRadius*(alpha - 2.0f*beta*scaledSum + 3.0f*gamma*scaledSum2);
        chainTerm = (1-tanhValue*tanhValue)*chainTerm / fullRadius;

        bornRadii[atom] = bornRadius;
        obcChain[atom] = chainTerm;
        atom += blockDim.x*gridDim.x;
    }
}
/**
 * Finish the Born force of every atom and accumulate the surface area energy.
 *
 * For each atom the fixed point value in bornForce is scaled back by 1/0x100000000, the surface
 * area contribution divided by the Born radius is added, and the result is multiplied by
 * bornRadius^2 * obcChain before being written back to bornForce in fixed point form.
 * The surface area terms handled by a thread are summed, divided by -6, and added to the
 * thread's entry of energyBuffer.
 */
extern "C" __global__ void reduceBornForce(long long* __restrict__ bornForce, mixed* __restrict__ energyBuffer,
        const float2* __restrict__ params, const real* __restrict__ bornRadii, const real* __restrict__ obcChain) {
    mixed surfaceEnergy = 0;
    unsigned int atom = blockIdx.x*blockDim.x+threadIdx.x;
    while (atom < NUM_ATOMS) {
        // Recover the summed value from its fixed point form.
        real total = RECIP(0x100000000)*bornForce[atom];

        // Surface area term for this atom.
        float offsetRadius = params[atom].x;
        real bornRadius = bornRadii[atom];
        real probedRadius = offsetRadius+DIELECTRIC_OFFSET+PROBE_RADIUS;
        real ratio6 = POW((offsetRadius+DIELECTRIC_OFFSET)/bornRadius, 6);
        real saTerm = SURFACE_AREA_FACTOR*probedRadius*probedRadius*ratio6;
        total += saTerm/bornRadius;
        surfaceEnergy += saTerm;

        // Apply the chain rule factor and store the result back in fixed point form.
        total *= bornRadius*bornRadius*obcChain[atom];
        bornForce[atom] = (long long) (total*0x100000000);
        atom += blockDim.x*gridDim.x;
    }
    energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += surfaceEnergy/-6;
}
/**
 * Per-thread record held in shared memory by computeBornSum.  Each thread stores the data of
 * one atom of the tile here so that the other lanes of the warp can read it.
 */
typedef struct {
// Position of the atom.
real x, y, z;
// Charge of the atom.
real q;
// The two per-atom parameters (x and y of the atom's params entry).
float radius, scaledRadius;
// Born sum contribution accumulated for this atom by the other lanes.
real bornSum;
} AtomData1;
/**
* Compute the Born sum.
*/
extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ global_bornSum, const real4* __restrict__ posq, const real* __restrict__ charge, const float2* __restrict__ global_params,
#ifdef USE_CUTOFF
const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
const real4* __restrict__ blockSize, const unsigned int* __restrict__ interactingAtoms,
#else
unsigned int numTiles,
#endif
const ushort2* __restrict__ exclusionTiles) {
const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx;
__shared__ AtomData1 localData[FORCE_WORK_GROUP_SIZE];
// First loop: process tiles that contain exclusions.
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
real bornSum = 0;
unsigned int atom1 = x*TILE_SIZE + tgx;
real4 posq1 = posq[atom1];
real charge1 = charge[atom1];
float2 params1 = global_params[atom1];
if (x == y) {
// This tile is on the diagonal.
localData[threadIdx.x].x = posq1.x;
localData[threadIdx.x].y = posq1.y;
localData[threadIdx.x].z = posq1.z;
localData[threadIdx.x].q = charge1;
localData[threadIdx.x].radius = params1.x;
localData[threadIdx.x].scaledRadius = params1.y;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real3 delta = make_real3(localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
#endif
real invR = RSQRT(r2);
real r = r2*invR;
float2 params2 = make_float2(localData[tbx+j].radius, localData[tbx+j].scaledRadius);
real rScaledRadiusJ = r+params2.y;
if ((j != tgx) && (params1.x < rScaledRadiusJ)) {
real l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
real u_ij = RECIP(rScaledRadiusJ);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
(params2.y*params2.y*invR)*(l_ij2-u_ij2));
bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
}
}
}
}
else {
// This is an off-diagonal tile.
unsigned int j = y*TILE_SIZE + tgx;
real4 tempPosq = posq[j];
localData[threadIdx.x].x = tempPosq.x;
localData[threadIdx.x].y = tempPosq.y;
localData[threadIdx.x].z = tempPosq.z;
localData[threadIdx.x].q = charge[j];
float2 tempParams = global_params[j];
localData[threadIdx.x].radius = tempParams.x;
localData[threadIdx.x].scaledRadius = tempParams.y;
localData[threadIdx.x].bornSum = 0.0f;
// Compute the full set of interactions in this tile.
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
real3 delta = make_real3(localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
#endif
real invR = RSQRT(r2);
real r = r2*invR;
float2 params2 = make_float2(localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
real rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) {
real l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
real u_ij = RECIP(rScaledRadiusJ);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
(params2.y*params2.y*invR)*(l_ij2-u_ij2));
bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
}
real rScaledRadiusI = r+params1.y;
if (params2.x < rScaledRadiusI) {
real l_ij = RECIP(max(params2.x, fabs(r-params1.y)));
real u_ij = RECIP(rScaledRadiusI);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
(params1.y*params1.y*invR)*(l_ij2-u_ij2));
term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ij) : 0);
localData[tbx+tj].bornSum += term;
}
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
// Write results.
unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&global_bornSum[offset], static_cast<unsigned long long>((long long) (bornSum*0x100000000)));
if (x != y) {
offset = y*TILE_SIZE + tgx;
atomicAdd(&global_bornSum[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].bornSum*0x100000000)));
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
if (numTiles > maxTiles)
return; // There wasn't enough memory for the neighbor list.
int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
#else
int pos = (int) (warp*(long long)numTiles/totalWarps);
int end = (int) ((warp+1)*(long long)numTiles/totalWarps);
#endif
int skipBase = 0;
int currentSkipIndex = tbx;
__shared__ int atomIndices[FORCE_WORK_GROUP_SIZE];
__shared__ volatile int skipTiles[FORCE_WORK_GROUP_SIZE];
skipTiles[threadIdx.x] = -1;
while (pos < end) {
real bornSum = 0;
bool includeTile = true;
// Extract the coordinates of this tile.
int x, y;
bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
x = tiles[pos];
real4 blockSizeX = blockSize[x];
singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
#else
y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
// Skip over tiles that have exclusions, since they were already processed.
while (skipTiles[tbx+TILE_SIZE-1] < pos) {
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx];
skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
skipTiles[threadIdx.x] = end;
skipBase += TILE_SIZE;
currentSkipIndex = tbx;
}
while (skipTiles[currentSkipIndex] < pos)
currentSkipIndex++;
includeTile = (skipTiles[currentSkipIndex] != pos);
#endif
if (includeTile) {
unsigned int atom1 = x*TILE_SIZE + tgx;
// Load atom data for this tile.
real4 posq1 = posq[atom1];
real charge1 = charge[atom1];
float2 params1 = global_params[atom1];
#ifdef USE_CUTOFF
unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
#else
unsigned int j = y*TILE_SIZE + tgx;
#endif
atomIndices[threadIdx.x] = j;
if (j < PADDED_NUM_ATOMS) {
real4 tempPosq = posq[j];
localData[threadIdx.x].x = tempPosq.x;
localData[threadIdx.x].y = tempPosq.y;
localData[threadIdx.x].z = tempPosq.z;
localData[threadIdx.x].q = charge[j];
float2 tempParams = global_params[j];
localData[threadIdx.x].radius = tempParams.x;
localData[threadIdx.x].scaledRadius = tempParams.y;
localData[threadIdx.x].bornSum = 0.0f;
}
#ifdef USE_PERIODIC
if (singlePeriodicCopy) {
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
real4 blockCenterX = blockCenter[x];
APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[threadIdx.x], blockCenterX)
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
real3 delta = make_real3(localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
int atom2 = atomIndices[tbx+tj];
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2);
real r = r2*invR;
float2 params2 = make_float2(localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
real rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) {
real l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
real u_ij = RECIP(rScaledRadiusJ);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
(params2.y*params2.y*invR)*(l_ij2-u_ij2));
bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
}
real rScaledRadiusI = r+params1.y;
if (params2.x < rScaledRadiusI) {
real l_ij = RECIP(max(params2.x, fabs(r-params1.y)));
real u_ij = RECIP(rScaledRadiusI);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
(params1.y*params1.y*invR)*(l_ij2-u_ij2));
term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ij) : 0);
localData[tbx+tj].bornSum += term;
}
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
else
#endif
{
// We need to apply periodic boundary conditions separately for each interaction.
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
real3 delta = make_real3(localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
int atom2 = atomIndices[tbx+tj];
#ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#endif
real invR = RSQRT(r2);
real r = r2*invR;
float2 params2 = make_float2(localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
real rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) {
real l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
real u_ij = RECIP(rScaledRadiusJ);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
(params2.y*params2.y*invR)*(l_ij2-u_ij2));
bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
}
real rScaledRadiusI = r+params1.y;
if (params2.x < rScaledRadiusI) {
real l_ij = RECIP(max(params2.x, fabs(r-params1.y)));
real u_ij = RECIP(rScaledRadiusI);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
(params1.y*params1.y*invR)*(l_ij2-u_ij2));
term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ij) : 0);
localData[tbx+tj].bornSum += term;
}
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
// Write results.
atomicAdd(&global_bornSum[atom1], static_cast<unsigned long long>((long long) (bornSum*0x100000000)));
#ifdef USE_CUTOFF
unsigned int atom2 = atomIndices[threadIdx.x];
#else
unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
if (atom2 < PADDED_NUM_ATOMS)
atomicAdd(&global_bornSum[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].bornSum*0x100000000)));
}
pos++;
}
}
/**
 * Per-thread record that computeGBSAForce1 keeps in shared memory for the second atom of each
 * pair: its position and charge, the force accumulated on it, and its Born radius.
 */
typedef struct {
    // Position, copied from posq.
    real x;
    real y;
    real z;
    // Charge, copied from the charge array.
    real q;
    // Accumulated Cartesian force (fx, fy, fz) and the fourth component that is added to the
    // Born force buffer (fw).
    real fx;
    real fy;
    real fz;
    real fw;
    // Born radius, copied from global_bornRadii.
    real bornRadius;
} AtomData2;
/**
* First part of computing the GBSA interaction.
*
* Work is split among groups of TILE_SIZE consecutive threads (called "warp" below). Each group
* processes a contiguous range of tiles; a tile pairs the TILE_SIZE atoms of block x with
* TILE_SIZE partner atoms. For every atom pair that passes the bounds test (and, when USE_CUTOFF
* is defined, the distance test) the kernel accumulates the Cartesian force, a fourth component
* that goes to global_bornForce, and optionally the energy.
*
* Outputs:
* - forceBuffers: x, y and z force components, stored in three consecutive sections of
*   PADDED_NUM_ATOMS entries and accumulated with atomicAdd as 64 bit fixed point values
*   (scaled by 0x100000000).
* - global_bornForce: the fourth force component, accumulated the same way.
* - energyBuffer: one slot per thread, incremented by the energy that thread accumulated.
*   Nothing is added unless needEnergy is true.
*
* Inputs: posq, charge and global_bornRadii are indexed by atom. exclusionTiles lists the (x, y)
* coordinates of the tiles handled by the first loop. With USE_CUTOFF the second loop takes its
* tiles from tiles / interactingAtoms, with interactionCount[0] giving their number; otherwise it
* enumerates numTiles tiles and skips those listed in exclusionTiles.
*/
extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ forceBuffers, unsigned long long* __restrict__ global_bornForce,
mixed* __restrict__ energyBuffer, const real4* __restrict__ posq, const real* __restrict__ charge, const real* __restrict__ global_bornRadii, bool needEnergy,
#ifdef USE_CUTOFF
const int* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, const real4* __restrict__ blockCenter,
const real4* __restrict__ blockSize, const unsigned int* __restrict__ interactingAtoms,
#else
unsigned int numTiles,
#endif
const ushort2* __restrict__ exclusionTiles) {
// totalWarps / warp: number and index of TILE_SIZE-thread groups across the whole grid.
const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
// tgx: this thread's lane within its group. tbx: index of the group's first thread in the block.
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx;
mixed energy = 0;
// One shared slot per thread; a group only touches slots tbx .. tbx+TILE_SIZE-1.
__shared__ AtomData2 localData[FORCE_WORK_GROUP_SIZE];
// First loop: process tiles that contain exclusions.
// The range [FIRST_EXCLUSION_TILE, LAST_EXCLUSION_TILE) is divided evenly among the groups.
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
// force.x/y/z accumulate the Cartesian force on atom1; force.w accumulates its Born force term.
real4 force = make_real4(0);
unsigned int atom1 = x*TILE_SIZE + tgx;
real4 posq1 = posq[atom1];
real charge1 = charge[atom1];
real bornRadius1 = global_bornRadii[atom1];
if (x == y) {
// This tile is on the diagonal.
// Publish this lane's atom so the other lanes of the group can read it.
// NOTE(review): no barrier separates these shared-memory writes from the reads below;
// this presumably relies on the TILE_SIZE threads running in lock-step - confirm.
localData[threadIdx.x].x = posq1.x;
localData[threadIdx.x].y = posq1.y;
localData[threadIdx.x].z = posq1.z;
localData[threadIdx.x].q = charge1;
localData[threadIdx.x].bornRadius = bornRadius1;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
// Skip padding atoms beyond NUM_ATOMS.
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
real3 pos2 = make_real3(localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z);
real charge2 = localData[tbx+j].q;
real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = r2*invR;
real bornRadius2 = localData[tbx+j].bornRadius;
real alpha2_ij = bornRadius1*bornRadius2;
real D_ij = r2*RECIP(4.0f*alpha2_ij);
real expTerm = EXP(-D_ij);
real denominator2 = r2 + alpha2_ij*expTerm;
real denominator = SQRT(denominator2);
real scaledChargeProduct = PREFACTOR*charge1*charge2;
real tempEnergy = scaledChargeProduct*RECIP(denominator);
real Gpol = tempEnergy*RECIP(denominator2);
real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
real dEdR = Gpol*(1.0f - 0.25f*expTerm);
force.w += dGpol_dalpha2_ij*bornRadius2;
#ifdef USE_CUTOFF
// The 1/CUTOFF shift is not applied when the atom is paired with itself (j == tgx here).
if (atom1 != y*TILE_SIZE+j)
tempEnergy -= scaledChargeProduct/CUTOFF;
#endif
// Lane i evaluates (i, j) and lane j evaluates (j, i), so each visit adds half the energy.
if (needEnergy)
energy += 0.5f*tempEnergy;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
#ifdef USE_CUTOFF
}
#endif
}
}
}
else {
// This is an off-diagonal tile.
// Load the partner atom for this lane from block y and zero its force accumulators.
unsigned int j = y*TILE_SIZE + tgx;
real4 tempPosq = posq[j];
localData[threadIdx.x].x = tempPosq.x;
localData[threadIdx.x].y = tempPosq.y;
localData[threadIdx.x].z = tempPosq.z;
localData[threadIdx.x].q = charge[j];
localData[threadIdx.x].bornRadius = global_bornRadii[j];
localData[threadIdx.x].fx = 0.0f;
localData[threadIdx.x].fy = 0.0f;
localData[threadIdx.x].fz = 0.0f;
localData[threadIdx.x].fw = 0.0f;
// tj starts at this lane's own index and rotates through the tile, so at any step each lane
// addresses a different shared slot (presumably to avoid conflicting updates - confirm).
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
real3 pos2 = make_real3(localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z);
real charge2 = localData[tbx+tj].q;
real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = r2*invR;
real bornRadius2 = localData[tbx+tj].bornRadius;
real alpha2_ij = bornRadius1*bornRadius2;
real D_ij = r2*RECIP(4.0f*alpha2_ij);
real expTerm = EXP(-D_ij);
real denominator2 = r2 + alpha2_ij*expTerm;
real denominator = SQRT(denominator2);
real scaledChargeProduct = PREFACTOR*charge1*charge2;
real tempEnergy = scaledChargeProduct*RECIP(denominator);
real Gpol = tempEnergy*RECIP(denominator2);
real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
real dEdR = Gpol*(1.0f - 0.25f*expTerm);
force.w += dGpol_dalpha2_ij*bornRadius2;
#ifdef USE_CUTOFF
tempEnergy -= scaledChargeProduct/CUTOFF;
#endif
// Each pair of an off-diagonal tile is visited once, so the full energy is added.
if (needEnergy)
energy += tempEnergy;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
// Equal and opposite force on the partner atom, plus its Born force term.
localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z;
localData[tbx+tj].fw += dGpol_dalpha2_ij*bornRadius1;
#ifdef USE_CUTOFF
}
#endif
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
// Write results.
// Values are converted to 64 bit fixed point (scaled by 0x100000000) and added atomically.
unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
atomicAdd(&global_bornForce[offset], static_cast<unsigned long long>((long long) (force.w*0x100000000)));
// Only off-diagonal tiles accumulated forces on the partner atoms in shared memory.
if (x != y) {
offset = y*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
atomicAdd(&global_bornForce[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fw*0x100000000)));
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
// Note that this early return also skips the energyBuffer update at the end of the kernel.
if (numTiles > maxTiles)
return; // There wasn't enough memory for the neighbor list.
// NOTE(review): because of the return above, the "numTiles > maxTiles" branch of the two
// ternaries below can never be taken; the range is always [0, numTiles) split among the groups.
int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
#else
int pos = (int) (warp*(long long)numTiles/totalWarps);
int end = (int) ((warp+1)*(long long)numTiles/totalWarps);
#endif
// skipTiles holds, per group, a window of TILE_SIZE linear indices of tiles with exclusions;
// skipBase is the position in exclusionTiles of the next window to load and currentSkipIndex
// is the cursor inside the current window. They are only consulted when USE_CUTOFF is not defined.
int skipBase = 0;
int currentSkipIndex = tbx;
// atomIndices records, per lane, the index of the partner atom loaded into localData.
__shared__ int atomIndices[FORCE_WORK_GROUP_SIZE];
__shared__ volatile int skipTiles[FORCE_WORK_GROUP_SIZE];
skipTiles[threadIdx.x] = -1;
while (pos < end) {
real4 force = make_real4(0);
bool includeTile = true;
// Extract the coordinates of this tile.
int x, y;
bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
// With a cutoff only x comes from the tile list; the partner atoms are read from
// interactingAtoms below and y is not used.
x = tiles[pos];
real4 blockSizeX = blockSize[x];
// True when, in every dimension, half the box size minus the block size is at least CUTOFF.
singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
#else
// Recover (x, y) from the linear tile index pos; this inverts the mapping
// x + y*NUM_BLOCKS - y*(y+1)/2 that is used for skipTiles below.
y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
// Skip over tiles that have exclusions, since they were already processed.
// Reload the window until its last entry is at or beyond pos; lanes past the end of
// exclusionTiles store "end" as a sentinel.
while (skipTiles[tbx+TILE_SIZE-1] < pos) {
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx];
skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
skipTiles[threadIdx.x] = end;
skipBase += TILE_SIZE;
currentSkipIndex = tbx;
}
// Advance the cursor to the first window entry that is not below pos.
while (skipTiles[currentSkipIndex] < pos)
currentSkipIndex++;
includeTile = (skipTiles[currentSkipIndex] != pos);
#endif
if (includeTile) {
unsigned int atom1 = x*TILE_SIZE + tgx;
// Load atom data for this tile.
real4 posq1 = posq[atom1];
real charge1 = charge[atom1];
real bornRadius1 = global_bornRadii[atom1];
#ifdef USE_CUTOFF
unsigned int j = interactingAtoms[pos*TILE_SIZE+tgx];
#else
unsigned int j = y*TILE_SIZE + tgx;
#endif
atomIndices[threadIdx.x] = j;
// Lanes whose partner index is out of range leave their shared slot untouched.
if (j < PADDED_NUM_ATOMS) {
real4 tempPosq = posq[j];
localData[threadIdx.x].x = tempPosq.x;
localData[threadIdx.x].y = tempPosq.y;
localData[threadIdx.x].z = tempPosq.z;
localData[threadIdx.x].q = charge[j];
localData[threadIdx.x].bornRadius = global_bornRadii[j];
localData[threadIdx.x].fx = 0.0f;
localData[threadIdx.x].fy = 0.0f;
localData[threadIdx.x].fz = 0.0f;
localData[threadIdx.x].fw = 0.0f;
}
#ifdef USE_PERIODIC
if (singlePeriodicCopy) {
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
real4 blockCenterX = blockCenter[x];
APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[threadIdx.x], blockCenterX)
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = atomIndices[tbx+tj];
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 pos2 = make_real3(localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z);
real charge2 = localData[tbx+tj].q;
real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
if (r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2);
real r = r2*invR;
real bornRadius2 = localData[tbx+tj].bornRadius;
real alpha2_ij = bornRadius1*bornRadius2;
real D_ij = r2*RECIP(4.0f*alpha2_ij);
real expTerm = EXP(-D_ij);
real denominator2 = r2 + alpha2_ij*expTerm;
real denominator = SQRT(denominator2);
real scaledChargeProduct = PREFACTOR*charge1*charge2;
real tempEnergy = scaledChargeProduct*RECIP(denominator);
real Gpol = tempEnergy*RECIP(denominator2);
real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
real dEdR = Gpol*(1.0f - 0.25f*expTerm);
force.w += dGpol_dalpha2_ij*bornRadius2;
#ifdef USE_CUTOFF
tempEnergy -= scaledChargeProduct/CUTOFF;
#endif
if (needEnergy)
energy += tempEnergy;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z;
localData[tbx+tj].fw += dGpol_dalpha2_ij*bornRadius1;
}
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
else
#endif
{
// We need to apply periodic boundary conditions separately for each interaction.
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = atomIndices[tbx+tj];
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 pos2 = make_real3(localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z);
real charge2 = localData[tbx+tj].q;
real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = r2*invR;
real bornRadius2 = localData[tbx+tj].bornRadius;
real alpha2_ij = bornRadius1*bornRadius2;
real D_ij = r2*RECIP(4.0f*alpha2_ij);
real expTerm = EXP(-D_ij);
real denominator2 = r2 + alpha2_ij*expTerm;
real denominator = SQRT(denominator2);
real scaledChargeProduct = PREFACTOR*charge1*charge2;
real tempEnergy = scaledChargeProduct*RECIP(denominator);
real Gpol = tempEnergy*RECIP(denominator2);
real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
real dEdR = Gpol*(1.0f - 0.25f*expTerm);
force.w += dGpol_dalpha2_ij*bornRadius2;
#ifdef USE_CUTOFF
tempEnergy -= scaledChargeProduct/CUTOFF;
#endif
if (needEnergy)
energy += tempEnergy;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z;
localData[tbx+tj].fw += dGpol_dalpha2_ij*bornRadius1;
#ifdef USE_CUTOFF
}
#endif
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
// Write results.
// Same 64 bit fixed point accumulation as in the first loop.
atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
atomicAdd(&global_bornForce[atom1], static_cast<unsigned long long>((long long) (force.w*0x100000000)));
#ifdef USE_CUTOFF
unsigned int atom2 = atomIndices[threadIdx.x];
#else
unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
// Lanes without a valid partner atom have nothing to write back.
if (atom2 < PADDED_NUM_ATOMS) {
atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
atomicAdd(&global_bornForce[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fw*0x100000000)));
}
}
pos++;
}
// Each thread adds the energy it accumulated to its own slot of the energy buffer.
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
}
float4 exceptionParams = PARAMS[index];
real3 delta = make_real3(pos2.x-pos1.x, pos2.y-pos1.y, pos2.z-pos1.z);
#if APPLY_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real invR = RSQRT(r2);
real sig2 = invR*exceptionParams.y;
......
#include <initializer_list>
/**
 * Advance one Nose-Hoover chain and compute the velocity scale factor that results from it.
 *
 * chainType selects the component to work on: 0 reads energySum[0].x and writes scaleFactor[0].x,
 * any other value uses the .y components. The selected scale factor is reset to 1 and, unless the
 * kinetic energy is below 1e-8, multiplied by one exponential factor per inner step.
 * chainData[i].x and chainData[i].y are updated in place as the position and velocity of bead i.
 * chainMasses and chainForces are overwritten and must hold at least chainLength entries.
 *
 * The body does not use any thread index, so every launched thread performs the same updates.
 */
extern "C" __global__ void propagateNoseHooverChain(mixed2* __restrict__ chainData, const mixed2 * __restrict__ energySum, mixed2* __restrict__ scaleFactor,
        mixed* __restrict__ chainMasses, mixed* __restrict__ chainForces,
        int chainType, int chainLength, int numMTS, int numDOFs, float timeStep,
        mixed kT, float frequency) {
    const mixed& kineticEnergy = chainType ? energySum[0].y : energySum[0].x;
    mixed& scale = chainType ? scaleFactor[0].y : scaleFactor[0].x;
    scale = (mixed) 1;
    if (kineticEnergy < 1e-8)
        return;

    // All beads share the same mass, except the first one which is weighted by numDOFs.
    for (int bead = 0; bead < chainLength; ++bead)
        chainMasses[bead] = kT / (frequency * frequency);
    chainMasses[0] *= numDOFs;

    const int tailBead = chainLength - 1;
    mixed twiceKineticEnergy = 2.0f * kineticEnergy;
    mixed subStep = timeStep / numMTS;

    // Initial forces: the first bead is driven by the particles, every other bead by its predecessor.
    chainForces[0] = (twiceKineticEnergy - numDOFs * kT) / chainMasses[0];
    for (int bead = 1; bead < chainLength; ++bead) {
        const int prev = bead - 1;
        chainForces[bead] = (chainMasses[prev] * chainData[prev].y * chainData[prev].y - kT) / chainMasses[bead];
    }

    for (int mts = 0; mts < numMTS; ++mts) {
        BEGIN_YS_LOOP
        mixed stepWidth = ys * subStep;

        // First sweep over the thermostat velocities, from the end of the chain to its start.
        chainData[tailBead].y += 0.25f * stepWidth * chainForces[tailBead];
        for (int bead = tailBead - 1; bead >= 0; --bead) {
            mixed damping = MIXEDEXP(-0.125f * stepWidth * chainData[bead + 1].y);
            chainData[bead].y = damping * (chainData[bead].y * damping + 0.25f * stepWidth * chainForces[bead]);
        }

        // Accumulate the factor by which the particle velocities are scaled.
        mixed velocityFactor = MIXEDEXP(-0.5f * stepWidth * chainData[0].y);
        scale *= velocityFactor;

        // Advance the thermostat positions.
        for (int bead = 0; bead < chainLength; ++bead) {
            chainData[bead].x += 0.5f * chainData[bead].y * stepWidth;
        }

        // Recompute the force on the first bead from the scaled kinetic energy.
        chainForces[0] = (scale * scale * twiceKineticEnergy - numDOFs * kT) / chainMasses[0];

        // Second sweep over the thermostat velocities, from the start of the chain to its end,
        // refreshing the force on each successor bead along the way.
        for (int bead = 0; bead < tailBead; ++bead) {
            mixed damping = MIXEDEXP(-0.125f * stepWidth * chainData[bead + 1].y);
            chainData[bead].y = damping * (damping * chainData[bead].y + 0.25f * stepWidth * chainForces[bead]);
            chainForces[bead + 1] = (chainMasses[bead] * chainData[bead].y * chainData[bead].y - kT) / chainMasses[bead + 1];
        }
        chainData[tailBead].y += 0.25f * stepWidth * chainForces[tailBead];
        END_YS_LOOP
    } // MTS loop
}
/**
 * Add the total (kinetic + potential) energy of the Nose-Hoover beads to heatBathEnergy[0].
 *
 * The result is accumulated rather than assigned, so the caller has to zero heatBathEnergy[0]
 * before the first call. chainData[i].x is read as the position and chainData[i].y as the
 * velocity of bead i; the first bead is weighted by numDOFs, all others by 1.
 */
extern "C" __global__ void computeHeatBathEnergy(mixed* __restrict__ heatBathEnergy, int chainLength, int numDOFs,
        mixed kT, float frequency, const mixed2* __restrict__ chainData) {
    mixed& total = heatBathEnergy[0];
    for (int bead = 0; bead < chainLength; ++bead) {
        const mixed weight = (bead == 0) ? numDOFs : 1;
        const mixed beadMass = weight * kT / (frequency * frequency);

        // Kinetic energy of this bead.
        const mixed beadVelocity = chainData[bead].y;
        total += 0.5f * beadMass * beadVelocity * beadVelocity;

        // Potential energy of this bead.
        const mixed beadPosition = chainData[bead].x;
        total += weight * kT * beadPosition;
    }
}
/**
 * Compute the kinetic energy of the atoms listed in atoms.
 *
 * Each thread sums 0.5*m*v^2 over the entries it visits and stores the result in the .x
 * component of its own energyBuffer slot; the .y component is set to 0. velm[atom].w is read as
 * the inverse mass, and a value of 0 makes the atom contribute no energy.
 */
extern "C" __global__ void computeAtomsKineticEnergy(mixed2 * __restrict__ energyBuffer, int numAtoms,
        const mixed4* __restrict__ velm, const int *__restrict__ atoms) {
    const unsigned int globalThread = blockIdx.x*blockDim.x+threadIdx.x;
    const unsigned int totalThreads = blockDim.x*gridDim.x;
    mixed kinetic = 0;
    for (int index = globalThread; index < numAtoms; index += totalThreads) {
        const mixed4 velocity = velm[atoms[index]];
        mixed mass;
        if (velocity.w == 0)
            mass = 0;
        else
            mass = 1 / velocity.w;
        kinetic += 0.5f * mass * (velocity.x*velocity.x + velocity.y*velocity.y + velocity.z*velocity.z);
    }
    // This kernel assigns its slot; it does not accumulate into it.
    energyBuffer[globalThread] = make_mixed2(kinetic, 0);
}
/**
 * Compute the kinetic energy of the atom pairs listed in pairs, split into two parts.
 *
 * For every pair, the energy of the centre-of-mass motion is added to the .x component and the
 * energy of the relative motion (with the reduced mass) to the .y component of the calling
 * thread's energyBuffer slot. velm[atom].w is read as the inverse mass; 0 is treated as mass 0.
 */
extern "C" __global__ void computePairsKineticEnergy(mixed2 * __restrict__ energyBuffer, int numPairs,
        const mixed4* __restrict__ velm, const int2 *__restrict__ pairs) {
    const unsigned int globalThread = blockIdx.x*blockDim.x+threadIdx.x;
    const unsigned int totalThreads = blockDim.x*gridDim.x;
    mixed centerEnergy = 0;
    mixed relativeEnergy = 0;
    for (int index = globalThread; index < numPairs; index += totalThreads) {
        const int2 pair = pairs[index];
        const mixed4 velA = velm[pair.x];
        const mixed4 velB = velm[pair.y];
        const mixed massA = velA.w == 0 ? 0 : 1 / velA.w;
        const mixed massB = velB.w == 0 ? 0 : 1 / velB.w;
        const mixed totalMass = massA + massB;

        // Velocity of the pair's centre of mass.
        const mixed comX = (massA*velA.x + massB*velB.x) / totalMass;
        const mixed comY = (massA*velA.y + massB*velB.y) / totalMass;
        const mixed comZ = (massA*velA.z + massB*velB.z) / totalMass;

        // Velocity of the second atom relative to the first.
        const mixed relX = velB.x - velA.x;
        const mixed relY = velB.y - velA.y;
        const mixed relZ = velB.z - velA.z;

        centerEnergy += 0.5f * totalMass * (comX*comX + comY*comY + comZ*comZ);
        relativeEnergy += 0.5f * (massA * massB / totalMass) * (relX*relX + relY*relY + relZ*relZ);
    }
    // The atoms version of this has been called already, so accumulate instead of assigning here
    energyBuffer[globalThread].x += centerEnergy;
    energyBuffer[globalThread].y += relativeEnergy;
}
/**
 * Multiply the x, y and z velocity components of every atom listed in atoms by scaleFactor[0].x.
 * The w component of velm is left untouched.
 */
extern "C" __global__ void scaleAtomsVelocities(mixed2* __restrict__ scaleFactor, int numAtoms,
        mixed4* __restrict__ velm, const int *__restrict__ atoms) {
    const mixed& factor = scaleFactor[0].x;
    const unsigned int totalThreads = blockDim.x*gridDim.x;
    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += totalThreads) {
        const int atom = atoms[index];
        velm[atom].x *= factor;
        velm[atom].y *= factor;
        velm[atom].z *= factor;
    }
}
/**
 * Rescale the velocities of the atom pairs listed in pairs.
 *
 * Each pair's motion is split into the velocity of its centre of mass and the velocity of the
 * second atom relative to the first. The centre-of-mass part is scaled by scaleFactor[0].x, the
 * relative part by scaleFactor[0].y, and both atoms' velocities are rebuilt from the two scaled
 * parts. velm[atom].w is read as the inverse mass (0 is treated as mass 0) and is written back
 * unchanged.
 */
extern "C" __global__ void scalePairsVelocities(mixed2 * __restrict__ scaleFactor, int numPairs,
        mixed4* __restrict__ velm, const int2 *__restrict__ pairs) {
    const mixed& centerFactor = scaleFactor[0].x;
    const mixed& relativeFactor = scaleFactor[0].y;
    const unsigned int totalThreads = blockDim.x*gridDim.x;
    for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numPairs; index += totalThreads) {
        const int first = pairs[index].x;
        const int second = pairs[index].y;
        mixed4 velA = velm[first];
        mixed4 velB = velm[second];
        const mixed massA = velA.w == 0 ? 0 : 1 / velA.w;
        const mixed massB = velB.w == 0 ? 0 : 1 / velB.w;
        const mixed totalMass = massA + massB;

        // Velocity of the pair's centre of mass.
        const mixed comX = (massA*velA.x + massB*velB.x) / totalMass;
        const mixed comY = (massA*velA.y + massB*velB.y) / totalMass;
        const mixed comZ = (massA*velA.z + massB*velB.z) / totalMass;

        // Velocity of the second atom relative to the first.
        const mixed relX = velB.x - velA.x;
        const mixed relY = velB.y - velA.y;
        const mixed relZ = velB.z - velA.z;

        // Rebuild both velocities from the scaled parts.
        velA.x = centerFactor * comX - relativeFactor * relX * massB / totalMass;
        velA.y = centerFactor * comY - relativeFactor * relY * massB / totalMass;
        velA.z = centerFactor * comZ - relativeFactor * relZ * massB / totalMass;
        velB.x = centerFactor * comX + relativeFactor * relX * massA / totalMass;
        velB.y = centerFactor * comY + relativeFactor * relY * massA / totalMass;
        velB.z = centerFactor * comZ + relativeFactor * relZ * massA / totalMass;
        velm[first] = velA;
        velm[second] = velB;
    }
}
/**
 * Sum an energy buffer whose entries are pairs of energies stored as mixed2, and write the two
 * totals to *result. This is copied from utilities.cu with small modifications.
 *
 * Only threadIdx.x is used for indexing, so the kernel works on a single block's worth of threads;
 * workGroupSize is the number of partial sums that are combined and must not exceed
 * WORK_GROUP_SIZE, the size of the shared scratch array.
 */
extern "C" __global__ void reduceEnergyPair(const mixed2* __restrict__ energyBuffer, mixed2* __restrict__ result, int bufferSize, int workGroupSize) {
    __shared__ mixed2 tempBuffer[WORK_GROUP_SIZE];
    const unsigned int thread = threadIdx.x;

    // Each thread first sums a strided slice of the buffer.
    mixed2 partial = make_mixed2(0,0);
    for (unsigned int idx = thread; idx < bufferSize; idx += blockDim.x) {
        const mixed2 entry = energyBuffer[idx];
        partial.x += entry.x;
        partial.y += entry.y;
    }
    tempBuffer[thread] = partial;

    // Combine the partial sums pairwise, doubling the distance between partners each round.
    for (int stride = 1; stride < workGroupSize; stride *= 2) {
        __syncthreads();
        const unsigned int partner = thread + stride;
        const bool isReceiver = (thread%(stride*2) == 0);
        if (isReceiver && partner < workGroupSize) {
            tempBuffer[thread].x += tempBuffer[partner].x;
            tempBuffer[thread].y += tempBuffer[partner].y;
        }
    }
    if (thread == 0)
        *result = tempBuffer[0];
}
/**
 * Calculate the center of mass momentum.
 *
 * Every thread sums velocity/velm.w (momentum, with velm.w read as the inverse mass) over the
 * atoms it visits; atoms with velm.w == 0 are skipped. The per-thread sums are then reduced in
 * shared memory and thread 0 writes the block's total to cmMomentum[blockIdx.x] (w is set to 0).
 *
 * Launch requirements: the reduction combines exactly the first 64 per-thread sums, so the kernel
 * expects 64 threads per block and dynamic shared memory for at least blockDim.x float3 values.
 */
extern "C" __global__ void calcCenterOfMassMomentum(int numAtoms, const mixed4* __restrict__ velm, float4* __restrict__ cmMomentum) {
    extern __shared__ volatile float3 temp[];
    float3 cm = make_float3(0, 0, 0);
    for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
        mixed4 velocity = velm[index];
        if (velocity.w != 0) {
            mixed mass = RECIP(velocity.w);
            cm.x += (float) (velocity.x*mass);
            cm.y += (float) (velocity.y*mass);
            cm.z += (float) (velocity.z*mass);
        }
    }

    // Sum the threads in this group.
    int thread = threadIdx.x;
    temp[thread].x = cm.x;
    temp[thread].y = cm.y;
    temp[thread].z = cm.z;
    __syncthreads();

    // Tree reduction over the 64 partial sums. A block-wide barrier follows every step because
    // the threads of a warp are not guaranteed to run in lock-step, so a thread must not read
    // its partner's slot before the previous step's write to it has completed. The barrier is
    // outside the conditional so that every thread of the block reaches it.
    for (int offset = 32; offset >= 2; offset >>= 1) {
        if (thread < offset) {
            temp[thread].x += temp[thread+offset].x;
            temp[thread].y += temp[thread+offset].y;
            temp[thread].z += temp[thread+offset].z;
        }
        __syncthreads();
    }
    if (thread == 0) {
        // The last two partial sums are combined while writing the result.
        float3 sum = make_float3(temp[thread].x+temp[thread+1].x, temp[thread].y+temp[thread+1].y, temp[thread].z+temp[thread+1].z);
        cmMomentum[blockIdx.x] = make_float4(sum.x, sum.y, sum.z, 0.0f);
    }
}
/**
 * Remove center of mass motion.
 *
 * Every block first sums the per-block momenta in cmMomentum[0 .. gridDim.x-1], multiplies the
 * total by INVERSE_TOTAL_MASS to obtain the center of mass velocity, and then subtracts that
 * velocity from each atom it visits.
 *
 * Launch requirements: the reduction combines exactly the first 64 per-thread sums, so the kernel
 * expects 64 threads per block and dynamic shared memory for at least blockDim.x float3 values.
 * cmMomentum is read up to index gridDim.x-1, so it presumably has to be launched with the same
 * grid size as the kernel that filled cmMomentum - confirm against the caller.
 */
extern "C" __global__ void removeCenterOfMassMomentum(unsigned int numAtoms, mixed4* __restrict__ velm, const float4* __restrict__ cmMomentum) {
    // First sum all of the momenta that were calculated by individual groups.
    // The scratch array must be declared __shared__: it is the block's dynamically sized shared
    // memory, through which the threads exchange their partial sums.
    extern __shared__ volatile float3 temp[];
    float3 cm = make_float3(0, 0, 0);
    for (unsigned int index = threadIdx.x; index < gridDim.x; index += blockDim.x) {
        float4 momentum = cmMomentum[index];
        cm.x += momentum.x;
        cm.y += momentum.y;
        cm.z += momentum.z;
    }
    int thread = threadIdx.x;
    temp[thread].x = cm.x;
    temp[thread].y = cm.y;
    temp[thread].z = cm.z;
    __syncthreads();

    // Tree reduction over the 64 partial sums. A block-wide barrier follows every step because
    // the threads of a warp are not guaranteed to run in lock-step, so a thread must not read
    // its partner's slot before the previous step's write to it has completed. The barrier is
    // outside the conditional so that every thread of the block reaches it; the barrier after
    // the last step also makes temp[0] and temp[1] visible to all threads.
    for (int offset = 32; offset >= 2; offset >>= 1) {
        if (thread < offset) {
            temp[thread].x += temp[thread+offset].x;
            temp[thread].y += temp[thread+offset].y;
            temp[thread].z += temp[thread+offset].z;
        }
        __syncthreads();
    }
    cm = make_float3(INVERSE_TOTAL_MASS*(temp[0].x+temp[1].x), INVERSE_TOTAL_MASS*(temp[0].y+temp[1].y), INVERSE_TOTAL_MASS*(temp[0].z+temp[1].z));

    // Now remove the center of mass velocity from each atom.
    for (unsigned int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
        mixed4 velocity = velm[index];
        velocity.x -= cm.x;
        velocity.y -= cm.y;
        velocity.z -= cm.z;
        velm[index] = velocity;
    }
}
......@@ -99,4 +99,4 @@ __global__ void setCharges(real* __restrict__ charges, real4* __restrict__ posq,
for (int i = blockDim.x*blockIdx.x+threadIdx.x; i < numAtoms; i += blockDim.x*gridDim.x)
posq[i].w = charges[atomOrder[i]];
}
}
\ No newline at end of file
}
......@@ -554,16 +554,16 @@ inline __device__ float3 cross(float3 a, float3 b) {
return make_float3(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x);
}
inline __device__ float3 cross(float4 a, float4 b) {
return make_float3(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x);
inline __device__ float4 cross(float4 a, float4 b) {
    // Cross product of the x/y/z parts of a and b; the w component of the result is zero.
    const float cx = a.y*b.z-a.z*b.y;
    const float cy = a.z*b.x-a.x*b.z;
    const float cz = a.x*b.y-a.y*b.x;
    return make_float4(cx, cy, cz, 0.0f);
}
inline __device__ double3 cross(double3 a, double3 b) {
    // Component-wise evaluation of the cross product a x b.
    const double cx = a.y*b.z-a.z*b.y;
    const double cy = a.z*b.x-a.x*b.z;
    const double cz = a.x*b.y-a.y*b.x;
    return make_double3(cx, cy, cz);
}
inline __device__ double3 cross(double4 a, double4 b) {
return make_double3(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x);
inline __device__ double4 cross(double4 a, double4 b) {
    // Cross product of the x/y/z parts of a and b; the w component of the result is zero.
    const double cx = a.y*b.z-a.z*b.y;
    const double cy = a.z*b.x-a.x*b.z;
    const double cz = a.x*b.y-a.y*b.x;
    return make_double4(cx, cy, cz, 0.0);
}
// Normalize a vector
......@@ -594,6 +594,14 @@ inline __device__ double4 normalize(double4 a) {
// Strip off the fourth component of a vector.
inline __device__ short3 trimTo3(short4 v) {
    // Keep x, y and z; the w component is discarded.
    short3 xyz;
    xyz.x = v.x;
    xyz.y = v.y;
    xyz.z = v.z;
    return xyz;
}
inline __device__ int3 trimTo3(int4 v) {
    // Keep x, y and z; the w component is discarded.
    int3 xyz;
    xyz.x = v.x;
    xyz.y = v.y;
    xyz.z = v.z;
    return xyz;
}
inline __device__ float3 trimTo3(float4 v) {
    // Keep x, y and z; the w component is discarded.
    float3 xyz;
    xyz.x = v.x;
    xyz.y = v.y;
    xyz.z = v.z;
    return xyz;
}
......
/**
* Perform the first step of Velocity Verlet integration.
*
* Updates the velocities (velm) with the current forces and stores the resulting
* displacements velocity*dt in posDelta.  Positions (posq) are only read here.
*
* Two sets of particles are processed:
*  - atomList: numAtoms individual atoms, each updated on its own;
*  - pairList: numPairs atom pairs, updated through their center-of-mass and relative motion.
*
* dt[0].y is used as the step for positions; the velocity step is the average of dt[0].x
* and dt[0].y.  velm.w is treated as the inverse mass; entries with w == 0 are never written.
* The force array stores x, y and z components at offsets 0, paddedNumAtoms and
* 2*paddedNumAtoms, as integers that are divided by 2^32 here (presumably fixed point with
* 32 fractional bits -- confirm against the code that accumulates forces).
*/
extern "C" __global__ void integrateVelocityVerletPart1(int numAtoms, int numPairs, int paddedNumAtoms, const mixed2* __restrict__ dt, const real4* __restrict__ posq,
const real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, const long long* __restrict__ force, mixed4* __restrict__ posDelta,
const int* __restrict__ atomList, const int2* __restrict__ pairList) {
const mixed2 stepSize = dt[0];
const mixed dtPos = stepSize.y;
const mixed dtVel = 0.5f*(stepSize.x+stepSize.y);
// Half of the velocity step, combined with the 2^32 divisor applied to the integer forces.
const mixed scale = 0.5f*dtVel/(mixed) 0x100000000;
// Individual atoms: each thread handles every (blockDim.x*gridDim.x)-th entry of atomList.
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
int atom = atomList[index];
mixed4 velocity = velm[atom];
if (velocity.w != 0.0) {
// The position is loaded only so that its w component is carried into posDelta;
// x, y and z are overwritten with the displacement below.
#ifdef USE_MIXED_PRECISION
real4 pos1 = posq[atom];
real4 pos2 = posqCorrection[atom];
mixed4 pos = make_mixed4(pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
#else
real4 pos = posq[atom];
#endif
velocity.x += scale*force[atom]*velocity.w;
velocity.y += scale*force[atom+paddedNumAtoms]*velocity.w;
velocity.z += scale*force[atom+paddedNumAtoms*2]*velocity.w;
pos.x = velocity.x*dtPos;
pos.y = velocity.y*dtPos;
pos.z = velocity.z*dtPos;
posDelta[atom] = pos;
velm[atom] = velocity;
}
}
// Atom pairs: the two velocities are split into center-of-mass and relative parts,
// each part is advanced with the matching force, and the parts are recombined.
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numPairs; index += blockDim.x*gridDim.x) {
int atom1 = pairList[index].x;
int atom2 = pairList[index].y;
mixed4 v1 = velm[atom1];
mixed4 v2 = velm[atom2];
// Masses are recovered from the inverse masses in velm.w; a zero w yields mass 0.
mixed m1 = v1.w == 0.0f ? 0.0f : 1.0f / v1.w;
mixed m2 = v2.w == 0.0f ? 0.0f : 1.0f / v2.w;
// If both masses are zero these fractions are 0/0, but then neither write branch below runs.
mixed mass1fract = m1 / (m1 + m2);
mixed mass2fract = m2 / (m1 + m2);
mixed invRedMass = (m1 * m2 != 0.0f) ? (m1 + m2)/(m1 * m2) : 0.0f;
mixed invTotMass = (m1 + m2 != 0.0f) ? 1.0f /(m1 + m2) : 0.0f;
mixed3 comVel;
comVel.x= v1.x*mass1fract + v2.x*mass2fract;
comVel.y= v1.y*mass1fract + v2.y*mass2fract;
comVel.z= v1.z*mass1fract + v2.z*mass2fract;
mixed3 relVel;
relVel.x= v2.x - v1.x;
relVel.y= v2.y - v1.y;
relVel.z= v2.z - v1.z;
// Forces acting on the center of mass and on the relative coordinate.
mixed3 comFrc;
comFrc.x = force[atom1] + force[atom2];
comFrc.y = force[atom1 + paddedNumAtoms] + force[atom2 + paddedNumAtoms];
comFrc.z = force[atom1 + paddedNumAtoms*2] + force[atom2 + paddedNumAtoms*2];
mixed3 relFrc;
relFrc.x = mass1fract*force[atom2] - mass2fract*force[atom1];
relFrc.y = mass1fract*force[atom2+paddedNumAtoms] - mass2fract*force[atom1+paddedNumAtoms];
relFrc.z = mass1fract*force[atom2+paddedNumAtoms*2] - mass2fract*force[atom1+paddedNumAtoms*2];
comVel.x += comFrc.x * scale * invTotMass;
comVel.y += comFrc.y * scale * invTotMass;
comVel.z += comFrc.z * scale * invTotMass;
relVel.x += relFrc.x * scale * invRedMass;
relVel.y += relFrc.y * scale * invRedMass;
relVel.z += relFrc.z * scale * invRedMass;
// As above, the positions are loaded only for their w component.
#ifdef USE_MIXED_PRECISION
real4 posv1 = posq[atom1];
real4 posv2 = posq[atom2];
real4 posc1 = posqCorrection[atom1];
real4 posc2 = posqCorrection[atom2];
mixed4 pos1 = make_mixed4(posv1.x+(mixed)posc1.x, posv1.y+(mixed)posc1.y, posv1.z+(mixed)posc1.z, posv1.w);
mixed4 pos2 = make_mixed4(posv2.x+(mixed)posc2.x, posv2.y+(mixed)posc2.y, posv2.z+(mixed)posc2.z, posv2.w);
#else
real4 pos1 = posq[atom1];
real4 pos2 = posq[atom2];
#endif
// Recombine into per-atom velocities and store velocity and displacement.
if (v1.w != 0.0f) {
v1.x = comVel.x - relVel.x*mass2fract;
v1.y = comVel.y - relVel.y*mass2fract;
v1.z = comVel.z - relVel.z*mass2fract;
pos1.x = v1.x*dtPos;
pos1.y = v1.y*dtPos;
pos1.z = v1.z*dtPos;
posDelta[atom1] = pos1;
velm[atom1] = v1;
}
if (v2.w != 0.0f) {
v2.x = comVel.x + relVel.x*mass1fract;
v2.y = comVel.y + relVel.y*mass1fract;
v2.z = comVel.z + relVel.z*mass1fract;
pos2.x = v2.x*dtPos;
pos2.y = v2.y*dtPos;
pos2.z = v2.z*dtPos;
posDelta[atom2] = pos2;
velm[atom2] = v2;
}
}
}
/**
 * Perform the second step of Velocity Verlet integration.
 *
 * Adds the displacement stored in posDelta to the position of every atom whose velm.w is
 * non-zero (intended to run after constraints have been enforced on the displacements).
 * With USE_MIXED_PRECISION the position is assembled from posq + posqCorrection and split
 * back into a rounded value and its remainder when it is stored.
 *
 * The thread with global index 0 also copies dt[0].y into dt[0].x.
 */
extern "C" __global__ void integrateVelocityVerletPart2(int numAtoms, mixed2* __restrict__ dt, real4* __restrict__ posq,
        real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, const mixed4* __restrict__ posDelta) {
    const mixed2 currentStep = dt[0];
    const int firstAtom = blockIdx.x*blockDim.x+threadIdx.x;
    const unsigned int stride = blockDim.x*gridDim.x;
    if (firstAtom == 0)
        dt[0].x = currentStep.y;

    for (int atom = firstAtom; atom < numAtoms; atom += stride) {
        // Atoms with a zero w component in velm are not moved.
        if (velm[atom].w == 0.0)
            continue;

        // Load the current position.
#ifdef USE_MIXED_PRECISION
        const real4 coarse = posq[atom];
        const real4 fine = posqCorrection[atom];
        mixed4 pos = make_mixed4(coarse.x+(mixed)fine.x, coarse.y+(mixed)fine.y, coarse.z+(mixed)fine.z, coarse.w);
#else
        real4 pos = posq[atom];
#endif

        // Apply the displacement.
        const mixed4 shift = posDelta[atom];
        pos.x += shift.x;
        pos.y += shift.y;
        pos.z += shift.z;

        // Store the new position.
#ifdef USE_MIXED_PRECISION
        const real4 rounded = make_real4((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
        posq[atom] = rounded;
        posqCorrection[atom] = make_real4(pos.x-rounded.x, pos.y-rounded.y, pos.z-rounded.z, 0);
#else
        posq[atom] = pos;
#endif
    }
}
/**
* Perform the third step of Velocity Verlet integration.
*
* Modifies the velocities (velm) after the force update.  Each velocity receives a second
* force contribution (same scale factor as in the first step) plus a correction
* (posDelta - velocity*dt)/dt taken from the displacement stored in posDelta.
*
* As in the first step, atomList entries are updated individually and pairList entries
* through their center-of-mass and relative motion.  velm.w is treated as the inverse mass
* and entries with w == 0 are never written.  posq and posqCorrection are not accessed
* by this kernel.
*
* NOTE(review): thread 0 overwrites dt[0].x while every thread reads dt[0] at the top of the
* kernel; threads that start later may see the new value, which changes dtVel unless
* dt[0].x already equals dt[0].y at launch -- confirm this is guaranteed by the caller.
*/
extern "C" __global__ void integrateVelocityVerletPart3(int numAtoms, int numPairs, int paddedNumAtoms, mixed2* __restrict__ dt, real4* __restrict__ posq,
real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, const long long* __restrict__ force, const mixed4* __restrict__ posDelta,
const int* __restrict__ atomList, const int2* __restrict__ pairList) {
mixed2 stepSize = dt[0];
// 1/dt is computed in double where the architecture check allows it; otherwise a float
// reciprocal plus a first-order correction term is used.
#if __CUDA_ARCH__ >= 130
double oneOverDt = 1.0/stepSize.y;
#else
float oneOverDt = 1.0f/stepSize.y;
float correction = (1.0f-oneOverDt*stepSize.y)/stepSize.y;
#endif
const mixed dtVel = 0.5f*(stepSize.x+stepSize.y);
const mixed scale = 0.5f*dtVel/(mixed) 0x100000000;
// This outer index is only used for the dt update; both loops below declare their own.
int index = blockIdx.x*blockDim.x+threadIdx.x;
if (index == 0)
dt[0].x = stepSize.y;
// Individual atoms.
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
int atom = atomList[index];
mixed4 velocity = velm[atom];
if (velocity.w != 0.0) {
mixed4 deltaXconstrained = posDelta[atom];
// Force contribution plus the difference between the stored displacement and velocity*dt.
velocity.x += scale*force[atom]*velocity.w + (deltaXconstrained.x - velocity.x*stepSize.y)*oneOverDt;
velocity.y += scale*force[atom+paddedNumAtoms]*velocity.w + (deltaXconstrained.y - velocity.y*stepSize.y)*oneOverDt;
velocity.z += scale*force[atom+paddedNumAtoms*2]*velocity.w + (deltaXconstrained.z - velocity.z*stepSize.y)*oneOverDt;
// NOTE(review): in this branch the correction is evaluated with the already-updated velocity.
#if __CUDA_ARCH__ < 130
velocity.x += (deltaXconstrained.x - velocity.x*stepSize.y)*correction;
velocity.y += (deltaXconstrained.y - velocity.y*stepSize.y)*correction;
velocity.z += (deltaXconstrained.z - velocity.z*stepSize.y)*correction;
#endif
velm[atom] = velocity;
}
}
// Atom pairs: split into center-of-mass and relative velocities, advance both, recombine.
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numPairs; index += blockDim.x*gridDim.x) {
int atom1 = pairList[index].x;
int atom2 = pairList[index].y;
mixed4 v1 = velm[atom1];
mixed4 v2 = velm[atom2];
// Masses are recovered from the inverse masses in velm.w; a zero w yields mass 0.
mixed m1 = v1.w == 0.0f ? 0.0f : 1.0f / v1.w;
mixed m2 = v2.w == 0.0f ? 0.0f : 1.0f / v2.w;
// If both masses are zero these fractions are 0/0, but then neither write branch below runs.
mixed mass1fract = m1 / (m1 + m2);
mixed mass2fract = m2 / (m1 + m2);
mixed invRedMass = (m1 * m2 != 0.0f) ? (m1 + m2)/(m1 * m2) : 0.0f;
mixed invTotMass = (m1 + m2 != 0.0f) ? 1.0f /(m1 + m2) : 0.0f;
mixed3 comVel;
comVel.x= v1.x*mass1fract + v2.x*mass2fract;
comVel.y= v1.y*mass1fract + v2.y*mass2fract;
comVel.z= v1.z*mass1fract + v2.z*mass2fract;
mixed3 relVel;
relVel.x= v2.x - v1.x;
relVel.y= v2.y - v1.y;
relVel.z= v2.z - v1.z;
// Forces acting on the center of mass and on the relative coordinate.
mixed3 comFrc;
comFrc.x = force[atom1] + force[atom2];
comFrc.y = force[atom1 + paddedNumAtoms] + force[atom2 + paddedNumAtoms];
comFrc.z = force[atom1 + paddedNumAtoms*2] + force[atom2 + paddedNumAtoms*2];
mixed3 relFrc;
relFrc.x = mass1fract*force[atom2] - mass2fract*force[atom1];
relFrc.y = mass1fract*force[atom2+paddedNumAtoms] - mass2fract*force[atom1+paddedNumAtoms];
relFrc.z = mass1fract*force[atom2+paddedNumAtoms*2] - mass2fract*force[atom1+paddedNumAtoms*2];
comVel.x += comFrc.x * scale * invTotMass;
comVel.y += comFrc.y * scale * invTotMass;
comVel.z += comFrc.z * scale * invTotMass;
relVel.x += relFrc.x * scale * invRedMass;
relVel.y += relFrc.y * scale * invRedMass;
relVel.z += relFrc.z * scale * invRedMass;
// Recombine; the displacement correction on the right-hand side uses the velocity loaded
// at the top of the iteration (v1/v2 before they are overwritten).
if (v1.w != 0.0f) {
mixed4 deltaXconstrained = posDelta[atom1];
v1.x = comVel.x - relVel.x*mass2fract + (deltaXconstrained.x - v1.x*stepSize.y)*oneOverDt;
v1.y = comVel.y - relVel.y*mass2fract + (deltaXconstrained.y - v1.y*stepSize.y)*oneOverDt;
v1.z = comVel.z - relVel.z*mass2fract + (deltaXconstrained.z - v1.z*stepSize.y)*oneOverDt;
#if __CUDA_ARCH__ < 130
v1.x += (deltaXconstrained.x - v1.x*stepSize.y)*correction;
v1.y += (deltaXconstrained.y - v1.y*stepSize.y)*correction;
v1.z += (deltaXconstrained.z - v1.z*stepSize.y)*correction;
#endif
velm[atom1] = v1;
}
if (v2.w != 0.0f) {
mixed4 deltaXconstrained = posDelta[atom2];
v2.x = comVel.x + relVel.x*mass1fract + (deltaXconstrained.x - v2.x*stepSize.y)*oneOverDt;
v2.y = comVel.y + relVel.y*mass1fract + (deltaXconstrained.y - v2.y*stepSize.y)*oneOverDt;
v2.z = comVel.z + relVel.z*mass1fract + (deltaXconstrained.z - v2.z*stepSize.y)*oneOverDt;
#if __CUDA_ARCH__ < 130
v2.x += (deltaXconstrained.x - v2.x*stepSize.y)*correction;
v2.y += (deltaXconstrained.y - v2.y*stepSize.y)*correction;
v2.z += (deltaXconstrained.z - v2.z*stepSize.y)*correction;
#endif
velm[atom2] = v2;
}
}
}
/**
* Apply the hard wall constraint
*
* For every pair in pairList whose separation exceeds maxPairDistance[0], the positions are
* moved back along the bond direction and the velocity components along the bond are
* replaced by values derived from sqrt(pairTemperature[index]*BOLTZ); the components
* perpendicular to the bond are kept.  Nothing is done when maxPairDistance[0] <= 0.
*
* velm.w is treated as the inverse mass.  If the second atom of the pair has w == 0 only the
* first atom is moved; otherwise both are moved.
*/
extern "C" __global__ void integrateVelocityVerletHardWall(int numPairs, const float* __restrict__ maxPairDistance, mixed2* __restrict__ dt, real4* __restrict__ posq,
real4* __restrict__ posqCorrection, mixed4* __restrict__ velm,
const int2* __restrict__ pairList, const float* __restrict__ pairTemperature) {
mixed dtPos = dt[0].y;
mixed maxDelta = (mixed) maxPairDistance[0];
// Apply hard wall constraints.
if (maxDelta > 0) {
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numPairs; index += blockDim.x*gridDim.x) {
const mixed hardWallScale = sqrt( ((mixed) pairTemperature[index]) * ((mixed) BOLTZ));
int2 atom = make_int2(pairList[index].x, pairList[index].y);
// Load both positions (posq + posqCorrection when mixed precision is enabled).
#ifdef USE_MIXED_PRECISION
real4 posv1 = posq[atom.x];
real4 posc1 = posqCorrection[atom.x];
mixed4 pos1 = make_mixed4(posv1.x+(mixed)posc1.x, posv1.y+(mixed)posc1.y, posv1.z+(mixed)posc1.z, posv1.w);
real4 posv2 = posq[atom.y];
real4 posc2 = posqCorrection[atom.y];
mixed4 pos2 = make_mixed4(posv2.x+(mixed)posc2.x, posv2.y+(mixed)posc2.y, posv2.z+(mixed)posc2.z, posv2.w);
#else
real4 pos1 = posq[atom.x];
real4 pos2 = posq[atom.y];
#endif
mixed3 delta = make_mixed3(
mixed (pos1.x - pos2.x),
mixed (pos1.y - pos2.y),
mixed (pos1.z - pos2.z)
);
mixed r = sqrt(delta.x*delta.x + delta.y*delta.y + delta.z*delta.z);
mixed rInv = 1/r;
// Equivalent to r > maxDelta; for r == 0 rInv is infinite and the test fails.
if (rInv*maxDelta < 1.0) {
// The constraint has been violated, so make the inter-particle distance "bounce"
// off the hard wall.
mixed3 bondDir = make_mixed3(delta.x * rInv, delta.y * rInv, delta.z * rInv);
mixed3 vel1 = make_mixed3(velm[atom.x].x, velm[atom.x].y, velm[atom.x].z);
mixed3 vel2 = make_mixed3(velm[atom.y].x, velm[atom.y].y, velm[atom.y].z);
// Masses recovered from the inverse masses in velm.w; a zero w yields mass 0.
mixed m1 = velm[atom.x].w != 0.0 ? 1.0/velm[atom.x].w : 0.0;
mixed m2 = velm[atom.y].w != 0.0 ? 1.0/velm[atom.y].w : 0.0;
mixed invTotMass = (m1 + m2 != 0.0) ? 1.0 /(m1 + m2) : 0.0;
// Distance by which the pair has overshot the wall.
mixed deltaR = r-maxDelta;
mixed deltaT = dtPos;
// Note: this local shadows the kernel parameter dt for the rest of this scope.
mixed dt = dtPos;
// Split the velocity of atom 1 into a part along the bond (vb1) and the remainder (vp1).
mixed dotvr1 = vel1.x*bondDir.x + vel1.y*bondDir.y + vel1.z*bondDir.z;
mixed3 vb1 = make_mixed3(bondDir.x*dotvr1, bondDir.y*dotvr1, bondDir.z*dotvr1);
mixed3 vp1 = make_mixed3(vel1.x-vb1.x, vel1.y-vb1.y, vel1.z-vb1.z);
if (m2 == 0) {
// The parent particle is massless, so move only the Drude particle.
// deltaT = overshoot distance / bond-direction speed, capped at the step size.
if (dotvr1 != 0.0)
deltaT = deltaR/fabs(dotvr1);
if (deltaT > dtPos)
deltaT = dtPos;
// Reverse the sign of the bond-direction velocity and set its magnitude to hardWallScale/sqrt(m1).
// NOTE(review): if dotvr1 is exactly zero this evaluates 0/0.
dotvr1 = -dotvr1*hardWallScale/(fabs(dotvr1)*sqrt(m1));
mixed dr = -deltaR + deltaT*dotvr1;
pos1.x += bondDir.x*dr;
pos1.y += bondDir.y*dr;
pos1.z += bondDir.z*dr;
velm[atom.x] = make_mixed4(vp1.x + bondDir.x*dotvr1, vp1.y + bondDir.y*dotvr1, vp1.z + bondDir.z*dotvr1, velm[atom.x].w);
#ifdef USE_MIXED_PRECISION
posq[atom.x] = make_real4((real) pos1.x, (real) pos1.y, (real) pos1.z, (real) pos1.w);
posqCorrection[atom.x] = make_real4(pos1.x-(real) pos1.x, pos1.y-(real) pos1.y, pos1.z-(real) pos1.z, 0);
#else
posq[atom.x] = pos1;
#endif
}
else {
// Move both particles.
mixed dotvr2 = vel2.x*bondDir.x + vel2.y*bondDir.y + vel2.z*bondDir.z;
mixed3 vb2 = make_mixed3(bondDir.x*dotvr2, bondDir.y*dotvr2, bondDir.z*dotvr2);
mixed3 vp2 = make_mixed3(vel2.x-vb2.x, vel2.y-vb2.y, vel2.z-vb2.z);
// Work relative to the center-of-mass velocity along the bond; it is added back below.
mixed vbCMass = (m1*dotvr1 + m2*dotvr2)*invTotMass;
dotvr1 -= vbCMass;
dotvr2 -= vbCMass;
if (dotvr1 != dotvr2)
deltaT = deltaR/fabs(dotvr1-dotvr2);
if (deltaT > dt)
deltaT = dt;
mixed vBond = hardWallScale/sqrt(m1);
// Reverse the sign of each bond-direction velocity and rescale it by the other atom's mass fraction.
// NOTE(review): each line evaluates 0/0 if the corresponding dotvr is exactly zero.
dotvr1 = -dotvr1*vBond*m2*invTotMass/fabs(dotvr1);
dotvr2 = -dotvr2*vBond*m1*invTotMass/fabs(dotvr2);
// The overshoot is removed in proportion to the other atom's mass fraction.
mixed dr1 = -deltaR*m2*invTotMass + deltaT*dotvr1;
mixed dr2 = deltaR*m1*invTotMass + deltaT*dotvr2;
dotvr1 += vbCMass;
dotvr2 += vbCMass;
pos1.x += bondDir.x*dr1;
pos1.y += bondDir.y*dr1;
pos1.z += bondDir.z*dr1;
pos2.x += bondDir.x*dr2;
pos2.y += bondDir.y*dr2;
pos2.z += bondDir.z*dr2;
velm[atom.x] = make_mixed4(vp1.x + bondDir.x*dotvr1, vp1.y + bondDir.y*dotvr1, vp1.z + bondDir.z*dotvr1, velm[atom.x].w);
velm[atom.y] = make_mixed4(vp2.x + bondDir.x*dotvr2, vp2.y + bondDir.y*dotvr2, vp2.z + bondDir.z*dotvr2, velm[atom.y].w);
#ifdef USE_MIXED_PRECISION
posq[atom.x] = make_real4((real) pos1.x, (real) pos1.y, (real) pos1.z, (real) pos1.w);
posq[atom.y] = make_real4((real) pos2.x, (real) pos2.y, (real) pos2.z, (real) pos2.w);
posqCorrection[atom.x] = make_real4(pos1.x-(real) pos1.x, pos1.y-(real) pos1.y, pos1.z-(real) pos1.z, 0);
posqCorrection[atom.y] = make_real4(pos2.x-(real) pos2.x, pos2.y-(real) pos2.y, pos2.z-(real) pos2.z, 0);
#else
posq[atom.x] = pos1;
posq[atom.y] = pos2;
#endif
}
}
}
} /* end of hard wall constraint part */
}
......@@ -4,17 +4,17 @@
INCLUDE(FindCUDA)
INCLUDE_DIRECTORIES(${CUDA_TOOLKIT_INCLUDE})
FILE(GLOB CUDA_KERNELS ${CUDA_SOURCE_DIR}/kernels/*.cu)
ADD_CUSTOM_COMMAND(OUTPUT ${CUDA_KERNELS_CPP} ${CUDA_KERNELS_H}
FILE(GLOB CUDA_KERNELS ${KERNEL_SOURCE_DIR}/kernels/*.cu)
ADD_CUSTOM_COMMAND(OUTPUT ${KERNELS_CPP} ${KERNELS_H}
COMMAND ${CMAKE_COMMAND}
ARGS -D CUDA_SOURCE_DIR=${CUDA_SOURCE_DIR} -D CUDA_KERNELS_CPP=${CUDA_KERNELS_CPP} -D CUDA_KERNELS_H=${CUDA_KERNELS_H} -D CUDA_SOURCE_CLASS=${CUDA_SOURCE_CLASS} -P ${CMAKE_CURRENT_SOURCE_DIR}/../EncodeCUDAFiles.cmake
ARGS -D KERNEL_SOURCE_DIR=${KERNEL_SOURCE_DIR} -D KERNELS_CPP=${KERNELS_CPP} -D KERNELS_H=${KERNELS_H} -D KERNEL_SOURCE_CLASS=${KERNEL_SOURCE_CLASS} -D KERNEL_FILE_EXTENSION=cu -P ${CMAKE_SOURCE_DIR}/cmake_modules/EncodeKernelFiles.cmake
DEPENDS ${CUDA_KERNELS}
)
SET_SOURCE_FILES_PROPERTIES(${CUDA_KERNELS_CPP} ${CUDA_KERNELS_H} PROPERTIES GENERATED TRUE)
SET_SOURCE_FILES_PROPERTIES(${KERNELS_CPP} ${KERNELS_H} PROPERTIES GENERATED TRUE)
ADD_LIBRARY(${STATIC_TARGET} STATIC ${SOURCE_FILES} ${SOURCE_INCLUDE_FILES} ${API_ABS_INCLUDE_FILES})
TARGET_LINK_LIBRARIES(${STATIC_TARGET} ${OPENMM_LIBRARY_NAME} ${CUDA_CUDA_LIBRARY} ${CUDA_cufft_LIBRARY} ${PTHREADS_LIB_STATIC})
SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_CUDA_BUILDING_STATIC_LIBRARY")
SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES COMPILE_FLAGS "${EXTRA_COMPILE_FLAGS} -DOPENMM_COMMON_BUILDING_STATIC_LIBRARY")
IF (APPLE)
SET_TARGET_PROPERTIES(${STATIC_TARGET} PROPERTIES LINK_FLAGS "${EXTRA_COMPILE_FLAGS} -F/Library/Frameworks -framework CUDA")
ELSE (APPLE)
......
......@@ -30,7 +30,7 @@
* -------------------------------------------------------------------------- */
#include "CudaTests.h"
#include "TestBAOABLangevinIntegrator.h"
#include "TestLangevinMiddleIntegrator.h"
// Empty body: this file adds no platform-specific tests of its own
// (presumably the shared tests come from the included Test*.h header -- confirm).
void runPlatformTests() {
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2019 Stanford University and the Authors. *
* Authors: Andreas Krämer and Andrew C. Simmonett *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "CudaTests.h"
#include "TestNoseHooverIntegrator.h"
// Empty body: this file adds no platform-specific tests of its own
// (presumably the shared tests come from the included Test*.h header -- confirm).
void runPlatformTests() {
}
/* -------------------------------------------------------------------------- *
* OpenMM *
* -------------------------------------------------------------------------- *
* This is part of the OpenMM molecular simulation toolkit originating from *
* Simbios, the NIH National Center for Physics-Based Simulation of *
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2019 Stanford University and the Authors. *
* Authors: Andreas Krämer and Andrew C. Simmonett *
* Contributors: *
* *
* Permission is hereby granted, free of charge, to any person obtaining a *
* copy of this software and associated documentation files (the "Software"), *
* to deal in the Software without restriction, including without limitation *
* the rights to use, copy, modify, merge, publish, distribute, sublicense, *
* and/or sell copies of the Software, and to permit persons to whom the *
* Software is furnished to do so, subject to the following conditions: *
* *
* The above copyright notice and this permission notice shall be included in *
* all copies or substantial portions of the Software. *
* *
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR *
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, *
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL *
* THE AUTHORS, CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, *
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR *
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE *
* USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */
#include "CudaTests.h"
#include "TestNoseHooverThermostat.h"
// Empty body: this file adds no platform-specific tests of its own
// (presumably the shared tests come from the included Test*.h header -- confirm).
void runPlatformTests() {
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment