Commit 5a06df78 authored by tic20's avatar tic20
Browse files
parents 8dd60914 a9223eea
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif
typedef struct { typedef struct {
real x, y, z; real x, y, z;
real q; real q;
...@@ -16,60 +12,69 @@ typedef struct { ...@@ -16,60 +12,69 @@ typedef struct {
* Find the maximum of a value across all threads in a warp, and return that to * Find the maximum of a value across all threads in a warp, and return that to
* every thread. * every thread.
*/ */
int reduceMax(int val, __local int* temp) { DEVICE int reduceMax(int val, LOCAL_ARG int* temp) {
int indexInWarp = get_local_id(0)%32; #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
temp[get_local_id(0)] = val; // CUDA lets us do this slightly more efficiently by using shuffle operations.
for (int mask = 16; mask > 0; mask /= 2)
val = max(val, __shfl_xor_sync(0xffffffff, val, mask));
return val;
#else
int indexInWarp = LOCAL_ID%32;
temp[LOCAL_ID] = val;
SYNC_WARPS; SYNC_WARPS;
for (int offset = 16; offset > 0; offset /= 2) { for (int offset = 16; offset > 0; offset /= 2) {
if (offset < indexInWarp) if (indexInWarp < offset)
temp[get_local_id(0)] = max(temp[get_local_id(0)], temp[get_local_id(0)+offset]); temp[LOCAL_ID] = max(temp[LOCAL_ID], temp[LOCAL_ID+offset]);
SYNC_WARPS; SYNC_WARPS;
} }
return temp[get_local_id(0)-indexInWarp]; return temp[LOCAL_ID-indexInWarp];
#endif
} }
#ifndef SUPPORTS_64_BIT_ATOMICS
/** /**
* This function is used on devices that don't support 64 bit atomics. Multiple threads within * This function is used on devices that don't support 64 bit atomics. Multiple threads within
* a single tile might have computed forces on the same atom. This loops over them and makes sure * a single tile might have computed forces on the same atom. This loops over them and makes sure
* that only one thread updates the force on any given atom. * that only one thread updates the force on any given atom.
*/ */
void writeForces(__global real4* forceBuffers,__local AtomData* localData, int atomIndex) { void writeForces(GLOBAL real4* forceBuffers, LOCAL AtomData* localData, int atomIndex) {
localData[get_local_id(0)].x = atomIndex; localData[LOCAL_ID].x = atomIndex;
SYNC_WARPS; SYNC_WARPS;
real4 forceSum = (real4) 0; real4 forceSum = make_real4(0);
int start = (get_local_id(0)/TILE_SIZE)*TILE_SIZE; int start = (LOCAL_ID/TILE_SIZE)*TILE_SIZE;
int end = start+32; int end = start+32;
bool isFirst = true; bool isFirst = true;
for (int i = start; i < end; i++) for (int i = start; i < end; i++)
if (localData[i].x == atomIndex) { if (localData[i].x == atomIndex) {
forceSum += (real4) (localData[i].fx, localData[i].fy, localData[i].fz, 0); forceSum += (real4) (localData[i].fx, localData[i].fy, localData[i].fz, 0);
isFirst &= (i >= get_local_id(0)); isFirst &= (i >= LOCAL_ID);
} }
const unsigned int warp = get_global_id(0)/TILE_SIZE; const unsigned int warp = GLOBAL_ID/TILE_SIZE;
unsigned int offset = atomIndex + warp*PADDED_NUM_ATOMS; unsigned int offset = atomIndex + warp*PADDED_NUM_ATOMS;
if (isFirst) if (isFirst)
forceBuffers[offset] += forceSum; forceBuffers[offset] += forceSum;
SYNC_WARPS; SYNC_WARPS;
} }
#endif
__kernel void computeInteractionGroups( KERNEL void computeInteractionGroups(
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers, GLOBAL mm_ulong* RESTRICT forceBuffers,
#else #else
__global real4* restrict forceBuffers, GLOBAL real4* RESTRICT forceBuffers,
#endif #endif
__global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const int4* restrict groupData, GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const int4* RESTRICT groupData,
__global int* restrict numGroupTiles, int useNeighborList, GLOBAL const int* RESTRICT numGroupTiles, int useNeighborList,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
const unsigned int totalWarps = get_global_size(0)/TILE_SIZE; const unsigned int totalWarps = GLOBAL_SIZE/TILE_SIZE;
const unsigned int warp = get_global_id(0)/TILE_SIZE; // global warpIndex const unsigned int warp = GLOBAL_ID/TILE_SIZE; // global warpIndex
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); // index within the warp const unsigned int tgx = LOCAL_ID & (TILE_SIZE-1); // index within the warp
const unsigned int tbx = get_local_id(0) - tgx; // block warpIndex const unsigned int tbx = LOCAL_ID - tgx; // block warpIndex
mixed energy = 0; mixed energy = 0;
INIT_DERIVATIVES INIT_DERIVATIVES
__local AtomData localData[LOCAL_MEMORY_SIZE]; LOCAL AtomData localData[LOCAL_MEMORY_SIZE];
__local int reductionBuffer[LOCAL_MEMORY_SIZE]; LOCAL int reductionBuffer[LOCAL_MEMORY_SIZE];
const unsigned int startTile = (useNeighborList ? warp*numGroupTiles[0]/totalWarps : FIRST_TILE+warp*(LAST_TILE-FIRST_TILE)/totalWarps); const unsigned int startTile = (useNeighborList ? warp*numGroupTiles[0]/totalWarps : FIRST_TILE+warp*(LAST_TILE-FIRST_TILE)/totalWarps);
const unsigned int endTile = (useNeighborList ? (warp+1)*numGroupTiles[0]/totalWarps : FIRST_TILE+(warp+1)*(LAST_TILE-FIRST_TILE)/totalWarps); const unsigned int endTile = (useNeighborList ? (warp+1)*numGroupTiles[0]/totalWarps : FIRST_TILE+(warp+1)*(LAST_TILE-FIRST_TILE)/totalWarps);
...@@ -82,16 +87,16 @@ __kernel void computeInteractionGroups( ...@@ -82,16 +87,16 @@ __kernel void computeInteractionGroups(
const int exclusions = atomData.w; const int exclusions = atomData.w;
real4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
real4 force = (real4) (0); real3 force = make_real3(0);
real4 posq2 = posq[atom2]; real4 posq2 = posq[atom2];
localData[get_local_id(0)].x = posq2.x; localData[LOCAL_ID].x = posq2.x;
localData[get_local_id(0)].y = posq2.y; localData[LOCAL_ID].y = posq2.y;
localData[get_local_id(0)].z = posq2.z; localData[LOCAL_ID].z = posq2.z;
localData[get_local_id(0)].q = posq2.w; localData[LOCAL_ID].q = posq2.w;
LOAD_LOCAL_PARAMETERS LOAD_LOCAL_PARAMETERS
localData[get_local_id(0)].fx = 0.0f; localData[LOCAL_ID].fx = 0.0f;
localData[get_local_id(0)].fy = 0.0f; localData[LOCAL_ID].fy = 0.0f;
localData[get_local_id(0)].fz = 0.0f; localData[LOCAL_ID].fz = 0.0f;
int tj = tgx; int tj = tgx;
int rangeStop = rangeStart + reduceMax(rangeEnd-rangeStart, reductionBuffer); int rangeStop = rangeStart + reduceMax(rangeEnd-rangeStart, reductionBuffer);
SYNC_WARPS; SYNC_WARPS;
...@@ -99,8 +104,8 @@ __kernel void computeInteractionGroups( ...@@ -99,8 +104,8 @@ __kernel void computeInteractionGroups(
if (j < rangeEnd) { if (j < rangeEnd) {
bool isExcluded = (((exclusions>>tj)&1) == 0); bool isExcluded = (((exclusions>>tj)&1) == 0);
int localIndex = tbx+tj; int localIndex = tbx+tj;
posq2 = (real4) (localData[localIndex].x, localData[localIndex].y, localData[localIndex].z, localData[localIndex].q); posq2 = make_real4(localData[localIndex].x, localData[localIndex].y, localData[localIndex].z, localData[localIndex].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0); real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
...@@ -117,35 +122,38 @@ __kernel void computeInteractionGroups( ...@@ -117,35 +122,38 @@ __kernel void computeInteractionGroups(
COMPUTE_INTERACTION COMPUTE_INTERACTION
energy += tempEnergy; energy += tempEnergy;
delta *= dEdR; delta *= dEdR;
force.xyz -= delta.xyz; force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
localData[localIndex].fx += delta.x; localData[localIndex].fx += delta.x;
localData[localIndex].fy += delta.y; localData[localIndex].fy += delta.y;
localData[localIndex].fz += delta.z; localData[localIndex].fz += delta.z;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
} }
#endif #endif
tj = (tj == rangeEnd-1 ? rangeStart : tj+1);
} }
tj = (tj == rangeEnd-1 ? rangeStart : tj+1);
SYNC_WARPS; SYNC_WARPS;
} }
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
if (exclusions != 0) { if (exclusions != 0) {
atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000)); ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) ((mm_long) (force.x*0x100000000)));
atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000)); ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.y*0x100000000)));
atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000)); ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
} }
atom_add(&forceBuffers[atom2], (long) (localData[get_local_id(0)].fx*0x100000000)); ATOMIC_ADD(&forceBuffers[atom2], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fx*0x100000000)));
atom_add(&forceBuffers[atom2+PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fy*0x100000000)); ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fy*0x100000000)));
atom_add(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fz*0x100000000)); ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fz*0x100000000)));
SYNC_WARPS;
#else #else
writeForces(forceBuffers, localData, atom2); writeForces(forceBuffers, localData, atom2);
localData[get_local_id(0)].fx = force.x; localData[LOCAL_ID].fx = force.x;
localData[get_local_id(0)].fy = force.y; localData[LOCAL_ID].fy = force.y;
localData[get_local_id(0)].fz = force.z; localData[LOCAL_ID].fz = force.z;
writeForces(forceBuffers, localData, atom1); writeForces(forceBuffers, localData, atom1);
#endif #endif
} }
energyBuffer[get_global_id(0)] += energy; energyBuffer[GLOBAL_ID] += energy;
SAVE_DERIVATIVES SAVE_DERIVATIVES
} }
...@@ -153,7 +161,7 @@ __kernel void computeInteractionGroups( ...@@ -153,7 +161,7 @@ __kernel void computeInteractionGroups(
* If the neighbor list needs to be rebuilt, reset the number of tiles to 0. This is * If the neighbor list needs to be rebuilt, reset the number of tiles to 0. This is
* executed by a single thread. * executed by a single thread.
*/ */
__kernel void prepareToBuildNeighborList(__global int* restrict rebuildNeighborList, __global int* restrict numGroupTiles) { KERNEL void prepareToBuildNeighborList(GLOBAL int* RESTRICT rebuildNeighborList, GLOBAL int* RESTRICT numGroupTiles) {
if (rebuildNeighborList[0] == 1) if (rebuildNeighborList[0] == 1)
numGroupTiles[0] = 0; numGroupTiles[0] = 0;
} }
...@@ -162,8 +170,8 @@ __kernel void prepareToBuildNeighborList(__global int* restrict rebuildNeighborL ...@@ -162,8 +170,8 @@ __kernel void prepareToBuildNeighborList(__global int* restrict rebuildNeighborL
* Filter the list of tiles to include only ones that have interactions within the * Filter the list of tiles to include only ones that have interactions within the
* padded cutoff. * padded cutoff.
*/ */
__kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __global int* restrict numGroupTiles, KERNEL void buildNeighborList(GLOBAL int* RESTRICT rebuildNeighborList, GLOBAL int* RESTRICT numGroupTiles,
__global const real4* restrict posq, __global const int4* restrict groupData, __global int4* restrict filteredGroupData, GLOBAL const real4* RESTRICT posq, GLOBAL const int4* RESTRICT groupData, GLOBAL int4* RESTRICT filteredGroupData,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) { real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
// If the neighbor list doesn't need to be rebuilt on this step, return immediately. // If the neighbor list doesn't need to be rebuilt on this step, return immediately.
...@@ -171,15 +179,15 @@ __kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __gl ...@@ -171,15 +179,15 @@ __kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __gl
if (rebuildNeighborList[0] == 0) if (rebuildNeighborList[0] == 0)
return; return;
const unsigned int totalWarps = get_global_size(0)/TILE_SIZE; const unsigned int totalWarps = GLOBAL_SIZE/TILE_SIZE;
const unsigned int warp = get_global_id(0)/TILE_SIZE; // global warpIndex const unsigned int warp = GLOBAL_ID/TILE_SIZE; // global warpIndex
const unsigned int local_warp = get_local_id(0)/TILE_SIZE; // local warpIndex const unsigned int local_warp = LOCAL_ID/TILE_SIZE; // local warpIndex
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); // index within the warp const unsigned int tgx = LOCAL_ID & (TILE_SIZE-1); // index within the warp
const unsigned int tbx = get_local_id(0) - tgx; // block warpIndex const unsigned int tbx = LOCAL_ID - tgx; // block warpIndex
__local real4 localPos[LOCAL_MEMORY_SIZE]; LOCAL real4 localPos[LOCAL_MEMORY_SIZE];
__local volatile bool anyInteraction[WARPS_IN_BLOCK]; LOCAL volatile bool anyInteraction[WARPS_IN_BLOCK];
__local volatile int tileIndex[WARPS_IN_BLOCK]; LOCAL volatile int tileIndex[WARPS_IN_BLOCK];
__local int reductionBuffer[LOCAL_MEMORY_SIZE]; LOCAL int reductionBuffer[LOCAL_MEMORY_SIZE];
const unsigned int startTile = warp*NUM_TILES/totalWarps; const unsigned int startTile = warp*NUM_TILES/totalWarps;
const unsigned int endTile = (warp+1)*NUM_TILES/totalWarps; const unsigned int endTile = (warp+1)*NUM_TILES/totalWarps;
...@@ -191,7 +199,7 @@ __kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __gl ...@@ -191,7 +199,7 @@ __kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __gl
const int rangeEnd = (atomData.z>>16)&0xFFFF; const int rangeEnd = (atomData.z>>16)&0xFFFF;
const int exclusions = atomData.w; const int exclusions = atomData.w;
real4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
localPos[get_local_id(0)] = posq[atom2]; localPos[LOCAL_ID] = posq[atom2];
if (tgx == 0) if (tgx == 0)
anyInteraction[local_warp] = false; anyInteraction[local_warp] = false;
int tj = tgx; int tj = tgx;
...@@ -199,10 +207,10 @@ __kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __gl ...@@ -199,10 +207,10 @@ __kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __gl
SYNC_WARPS; SYNC_WARPS;
for (int j = rangeStart; j < rangeStop && !anyInteraction[local_warp]; j++) { for (int j = rangeStart; j < rangeStop && !anyInteraction[local_warp]; j++) {
SYNC_WARPS; SYNC_WARPS;
if (j < rangeEnd) { if (j < rangeEnd && tj < rangeEnd) {
bool isExcluded = (((exclusions>>tj)&1) == 0); bool isExcluded = (((exclusions>>tj)&1) == 0);
int localIndex = tbx+tj; int localIndex = tbx+tj;
real4 delta = (real4) (localPos[localIndex].xyz - posq1.xyz, 0); real3 delta = make_real3(localPos[localIndex].x-posq1.x, localPos[localIndex].y-posq1.y, localPos[localIndex].z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
...@@ -216,7 +224,7 @@ __kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __gl ...@@ -216,7 +224,7 @@ __kernel void buildNeighborList(__global int* restrict rebuildNeighborList, __gl
if (anyInteraction[local_warp]) { if (anyInteraction[local_warp]) {
SYNC_WARPS; SYNC_WARPS;
if (tgx == 0) if (tgx == 0)
tileIndex[local_warp] = atomic_add(numGroupTiles, 1); tileIndex[local_warp] = ATOMIC_ADD(numGroupTiles, 1);
SYNC_WARPS; SYNC_WARPS;
filteredGroupData[TILE_SIZE*tileIndex[local_warp]+tgx] = atomData; filteredGroupData[TILE_SIZE*tileIndex[local_warp]+tgx] = atomData;
} }
......
...@@ -4,10 +4,10 @@ ...@@ -4,10 +4,10 @@
/** /**
* Calculate the ellipsoid coordinate frames and associated matrices. * Calculate the ellipsoid coordinate frames and associated matrices.
*/ */
extern "C" __global__ void computeEllipsoidFrames(int numParticles, const real4* __restrict__ posq, int2* const __restrict__ axisParticleIndices, KERNEL void computeEllipsoidFrames(int numParticles, GLOBAL const real4* RESTRICT posq, GLOBAL int2* const RESTRICT axisParticleIndices,
const float4* __restrict__ sigParams, const float4* __restrict__ scale, real* __restrict__ aMatrix, GLOBAL const float4* RESTRICT sigParams, GLOBAL const float4* RESTRICT scale, GLOBAL real* RESTRICT aMatrix,
real* __restrict__ bMatrix, real* __restrict__ gMatrix, const int* sortedParticles) { GLOBAL real* RESTRICT bMatrix, GLOBAL real* RESTRICT gMatrix, GLOBAL const int* sortedParticles) {
for (int sortedIndex = blockIdx.x*blockDim.x+threadIdx.x; sortedIndex < numParticles; sortedIndex += blockDim.x*gridDim.x) { for (int sortedIndex = GLOBAL_ID; sortedIndex < numParticles; sortedIndex += GLOBAL_SIZE) {
// Compute the local coordinate system of the ellipsoid; // Compute the local coordinate system of the ellipsoid;
int originalIndex = sortedParticles[sortedIndex]; int originalIndex = sortedParticles[sortedIndex];
...@@ -36,9 +36,9 @@ extern "C" __global__ void computeEllipsoidFrames(int numParticles, const real4* ...@@ -36,9 +36,9 @@ extern "C" __global__ void computeEllipsoidFrames(int numParticles, const real4*
// Compute matrices we will need later. // Compute matrices we will need later.
real (*a)[3] = (real (*)[3]) (aMatrix+sortedIndex*9); GLOBAL real (*a)[3] = (GLOBAL real (*)[3]) (aMatrix+sortedIndex*9);
real (*b)[3] = (real (*)[3]) (bMatrix+sortedIndex*9); GLOBAL real (*b)[3] = (GLOBAL real (*)[3]) (bMatrix+sortedIndex*9);
real (*g)[3] = (real (*)[3]) (gMatrix+sortedIndex*9); GLOBAL real (*g)[3] = (GLOBAL real (*)[3]) (gMatrix+sortedIndex*9);
a[0][0] = xdir.x; a[0][0] = xdir.x;
a[0][1] = xdir.y; a[0][1] = xdir.y;
a[0][2] = xdir.z; a[0][2] = xdir.z;
...@@ -62,10 +62,10 @@ extern "C" __global__ void computeEllipsoidFrames(int numParticles, const real4* ...@@ -62,10 +62,10 @@ extern "C" __global__ void computeEllipsoidFrames(int numParticles, const real4*
/** /**
* Find a bounding box for the atoms in each block. * Find a bounding box for the atoms in each block.
*/ */
extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, KERNEL void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
const int* sortedAtoms, const real4* __restrict__ posq, real4* __restrict__ sortedPos, real4* __restrict__ blockCenter, GLOBAL const int* sortedAtoms, GLOBAL const real4* RESTRICT posq, GLOBAL real4* RESTRICT sortedPos, GLOBAL real4* RESTRICT blockCenter,
real4* __restrict__ blockBoundingBox, int* __restrict__ neighborBlockCount) { GLOBAL real4* RESTRICT blockBoundingBox, GLOBAL int* RESTRICT neighborBlockCount) {
int index = blockIdx.x*blockDim.x+threadIdx.x; int index = GLOBAL_ID;
int base = index*TILE_SIZE; int base = index*TILE_SIZE;
while (base < numAtoms) { while (base < numAtoms) {
real4 pos = posq[sortedAtoms[base]]; real4 pos = posq[sortedAtoms[base]];
...@@ -89,19 +89,19 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, ...@@ -89,19 +89,19 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize,
real4 blockSize = 0.5f*(maxPos-minPos); real4 blockSize = 0.5f*(maxPos-minPos);
blockBoundingBox[index] = blockSize; blockBoundingBox[index] = blockSize;
blockCenter[index] = 0.5f*(maxPos+minPos); blockCenter[index] = 0.5f*(maxPos+minPos);
index += blockDim.x*gridDim.x; index += GLOBAL_SIZE;
base = index*TILE_SIZE; base = index*TILE_SIZE;
} }
if (blockIdx.x*blockDim.x+threadIdx.x == 0) if (GLOBAL_ID == 0)
*neighborBlockCount = 0; *neighborBlockCount = 0;
} }
/** /**
* This is called by findNeighbors() to write a block to the neighbor list. * This is called by findNeighbors() to write a block to the neighbor list.
*/ */
__device__ void storeNeighbors(int atom1, int* neighborBuffer, int numAtomsInBuffer, int maxNeighborBlocks, int* __restrict__ neighbors, DEVICE void storeNeighbors(int atom1, int* neighborBuffer, int numAtomsInBuffer, int maxNeighborBlocks, GLOBAL int* RESTRICT neighbors,
int* __restrict__ neighborIndex, int* __restrict__ neighborBlockCount) { GLOBAL int* RESTRICT neighborIndex, GLOBAL int* RESTRICT neighborBlockCount) {
int blockIndex = atomicAdd(neighborBlockCount, 1); int blockIndex = ATOMIC_ADD(neighborBlockCount, 1);
if (blockIndex >= maxNeighborBlocks) if (blockIndex >= maxNeighborBlocks)
return; // We don't have enough room for the neighbor list. return; // We don't have enough room for the neighbor list.
neighborIndex[blockIndex] = atom1; neighborIndex[blockIndex] = atom1;
...@@ -115,12 +115,12 @@ __device__ void storeNeighbors(int atom1, int* neighborBuffer, int numAtomsInBuf ...@@ -115,12 +115,12 @@ __device__ void storeNeighbors(int atom1, int* neighborBuffer, int numAtomsInBuf
/** /**
* Build a list of neighbors for each atom. * Build a list of neighbors for each atom.
*/ */
extern "C" __global__ void findNeighbors(int numAtoms, int maxNeighborBlocks, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, KERNEL void findNeighbors(int numAtoms, int maxNeighborBlocks, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
real4* __restrict__ sortedPos, real4* __restrict__ blockCenter, real4* __restrict__ blockBoundingBox, int* __restrict__ neighbors, GLOBAL real4* RESTRICT sortedPos, GLOBAL real4* RESTRICT blockCenter, GLOBAL real4* RESTRICT blockBoundingBox, GLOBAL int* RESTRICT neighbors,
int* __restrict__ neighborIndex, int* __restrict__ neighborBlockCount, const int* __restrict__ exclusions, const int* __restrict__ exclusionStartIndex) { GLOBAL int* RESTRICT neighborIndex, GLOBAL int* RESTRICT neighborBlockCount, GLOBAL const int* RESTRICT exclusions, GLOBAL const int* RESTRICT exclusionStartIndex) {
const int numBlocks = (numAtoms+TILE_SIZE-1)/TILE_SIZE; const int numBlocks = (numAtoms+TILE_SIZE-1)/TILE_SIZE;
int neighborBuffer[NEIGHBOR_BLOCK_SIZE]; int neighborBuffer[NEIGHBOR_BLOCK_SIZE];
for (int atom1 = blockIdx.x*blockDim.x+threadIdx.x; atom1 < numAtoms; atom1 += blockDim.x*gridDim.x) { for (int atom1 = GLOBAL_ID; atom1 < numAtoms; atom1 += GLOBAL_SIZE) {
int nextExclusion = exclusionStartIndex[atom1]; int nextExclusion = exclusionStartIndex[atom1];
int lastExclusion = exclusionStartIndex[atom1+1]; int lastExclusion = exclusionStartIndex[atom1+1];
real4 pos = sortedPos[atom1]; real4 pos = sortedPos[atom1];
...@@ -178,8 +178,8 @@ typedef struct { ...@@ -178,8 +178,8 @@ typedef struct {
real a[3][3], b[3][3], g[3][3]; real a[3][3], b[3][3], g[3][3];
} AtomData; } AtomData;
__device__ void loadAtomData(AtomData* data, int sortedIndex, int originalIndex, const real4* __restrict__ pos, const float4* __restrict__ sigParams, DEVICE void loadAtomData(AtomData* data, int sortedIndex, int originalIndex, GLOBAL const real4* RESTRICT pos, GLOBAL const float4* RESTRICT sigParams,
const float2* __restrict__ epsParams, const real* __restrict__ aMatrix, const real* __restrict__ bMatrix, const real* __restrict__ gMatrix) { GLOBAL const float2* RESTRICT epsParams, GLOBAL const real* RESTRICT aMatrix, GLOBAL const real* RESTRICT bMatrix, GLOBAL const real* RESTRICT gMatrix) {
data->sig = sigParams[originalIndex]; data->sig = sigParams[originalIndex];
data->eps = epsParams[originalIndex]; data->eps = epsParams[originalIndex];
data->pos = trimTo3(pos[sortedIndex]); data->pos = trimTo3(pos[sortedIndex]);
...@@ -192,19 +192,19 @@ __device__ void loadAtomData(AtomData* data, int sortedIndex, int originalIndex, ...@@ -192,19 +192,19 @@ __device__ void loadAtomData(AtomData* data, int sortedIndex, int originalIndex,
} }
} }
inline __device__ real3 matrixVectorProduct(real (*m)[3], real3 v) { inline DEVICE real3 matrixVectorProduct(real (*m)[3], real3 v) {
return make_real3(m[0][0]*v.x + m[0][1]*v.y + m[0][2]*v.z, return make_real3(m[0][0]*v.x + m[0][1]*v.y + m[0][2]*v.z,
m[1][0]*v.x + m[1][1]*v.y + m[1][2]*v.z, m[1][0]*v.x + m[1][1]*v.y + m[1][2]*v.z,
m[2][0]*v.x + m[2][1]*v.y + m[2][2]*v.z); m[2][0]*v.x + m[2][1]*v.y + m[2][2]*v.z);
} }
inline __device__ real3 vectorMatrixProduct(real3 v, real (*m)[3]) { inline DEVICE real3 vectorMatrixProduct(real3 v, real (*m)[3]) {
return make_real3(m[0][0]*v.x + m[1][0]*v.y + m[2][0]*v.z, return make_real3(m[0][0]*v.x + m[1][0]*v.y + m[2][0]*v.z,
m[0][1]*v.x + m[1][1]*v.y + m[2][1]*v.z, m[0][1]*v.x + m[1][1]*v.y + m[2][1]*v.z,
m[0][2]*v.x + m[1][2]*v.y + m[2][2]*v.z); m[0][2]*v.x + m[1][2]*v.y + m[2][2]*v.z);
} }
inline __device__ void matrixSum(real (*result)[3], real (*a)[3], real (*b)[3]) { inline DEVICE void matrixSum(real (*result)[3], real (*a)[3], real (*b)[3]) {
result[0][0] = a[0][0]+b[0][0]; result[0][0] = a[0][0]+b[0][0];
result[0][1] = a[0][1]+b[0][1]; result[0][1] = a[0][1]+b[0][1];
result[0][2] = a[0][2]+b[0][2]; result[0][2] = a[0][2]+b[0][2];
...@@ -216,12 +216,12 @@ inline __device__ void matrixSum(real (*result)[3], real (*a)[3], real (*b)[3]) ...@@ -216,12 +216,12 @@ inline __device__ void matrixSum(real (*result)[3], real (*a)[3], real (*b)[3])
result[2][2] = a[2][2]+b[2][2]; result[2][2] = a[2][2]+b[2][2];
} }
inline __device__ real determinant(real (*m)[3]) { inline DEVICE real determinant(real (*m)[3]) {
return (m[0][0]*m[1][1]*m[2][2] + m[0][1]*m[1][2]*m[2][0] + m[0][2]*m[1][0]*m[2][1] - return (m[0][0]*m[1][1]*m[2][2] + m[0][1]*m[1][2]*m[2][0] + m[0][2]*m[1][0]*m[2][1] -
m[0][0]*m[1][2]*m[2][1] - m[0][1]*m[1][0]*m[2][2] - m[0][2]*m[1][1]*m[2][0]); m[0][0]*m[1][2]*m[2][1] - m[0][1]*m[1][0]*m[2][2] - m[0][2]*m[1][1]*m[2][0]);
} }
inline __device__ void matrixInverse(real (*result)[3], real (*m)[3]) { inline DEVICE void matrixInverse(real (*result)[3], real (*m)[3]) {
real invDet = RECIP(determinant(m)); real invDet = RECIP(determinant(m));
result[0][0] = invDet*(m[1][1]*m[2][2] - m[1][2]*m[2][1]); result[0][0] = invDet*(m[1][1]*m[2][2] - m[1][2]*m[2][1]);
result[1][0] = -invDet*(m[1][0]*m[2][2] - m[1][2]*m[2][0]); result[1][0] = -invDet*(m[1][0]*m[2][2] - m[1][2]*m[2][0]);
...@@ -234,7 +234,7 @@ inline __device__ void matrixInverse(real (*result)[3], real (*m)[3]) { ...@@ -234,7 +234,7 @@ inline __device__ void matrixInverse(real (*result)[3], real (*m)[3]) {
result[2][2] = invDet*(m[0][0]*m[1][1] - m[0][1]*m[1][0]); result[2][2] = invDet*(m[0][0]*m[1][1] - m[0][1]*m[1][0]);
} }
__device__ void computeOneInteraction(AtomData* data1, AtomData* data2, real sigma, real epsilon, real3 dr, real r2, real3* force1, real3* force2, real3* torque1, real3* torque2, mixed *totalEnergy) { DEVICE void computeOneInteraction(AtomData* data1, AtomData* data2, real sigma, real epsilon, real3 dr, real r2, real3* force1, real3* force2, real3* torque1, real3* torque2, mixed *totalEnergy) {
real rInv = RSQRT(r2); real rInv = RSQRT(r2);
real r = r2*rInv; real r = r2*rInv;
real3 drUnit = dr*rInv; real3 drUnit = dr*rInv;
...@@ -335,25 +335,25 @@ __device__ void computeOneInteraction(AtomData* data1, AtomData* data2, real sig ...@@ -335,25 +335,25 @@ __device__ void computeOneInteraction(AtomData* data1, AtomData* data2, real sig
/** /**
* Compute the interactions. * Compute the interactions.
*/ */
extern "C" __global__ void computeForce( KERNEL void computeForce(
unsigned long long* __restrict__ forceBuffers, unsigned long long* __restrict__ torqueBuffers, GLOBAL mm_ulong* RESTRICT forceBuffers, GLOBAL mm_ulong* RESTRICT torqueBuffers,
int numAtoms, int numExceptions, mixed* __restrict__ energyBuffer, const real4* __restrict__ pos, int numAtoms, int numExceptions, GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT pos,
const float4* __restrict__ sigParams, const float2* __restrict__ epsParams, const int* __restrict__ sortedAtoms, GLOBAL const float4* RESTRICT sigParams, GLOBAL const float2* RESTRICT epsParams, GLOBAL const int* RESTRICT sortedAtoms,
const real* __restrict__ aMatrix, const real* __restrict__ bMatrix, const real* __restrict__ gMatrix, GLOBAL const real* RESTRICT aMatrix, GLOBAL const real* RESTRICT bMatrix, GLOBAL const real* RESTRICT gMatrix,
const int* __restrict__ exclusions, const int* __restrict__ exclusionStartIndex, GLOBAL const int* RESTRICT exclusions, GLOBAL const int* RESTRICT exclusionStartIndex,
const int4* __restrict__ exceptionParticles, const float2* __restrict__ exceptionParams GLOBAL const int4* RESTRICT exceptionParticles, GLOBAL const float2* RESTRICT exceptionParams
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
, int maxNeighborBlocks, int* __restrict__ neighbors, int* __restrict__ neighborIndex, int* __restrict__ neighborBlockCount, , int maxNeighborBlocks, GLOBAL int* RESTRICT neighbors, GLOBAL int* RESTRICT neighborIndex, GLOBAL int* RESTRICT neighborBlockCount,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
#endif #endif
) { ) {
const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; const unsigned int warp = GLOBAL_ID/TILE_SIZE;
mixed energy = 0; mixed energy = 0;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
const int numBlocks = *neighborBlockCount; const int numBlocks = *neighborBlockCount;
if (numBlocks > maxNeighborBlocks) if (numBlocks > maxNeighborBlocks)
return; // There wasn't enough memory for the neighbor list. return; // There wasn't enough memory for the neighbor list.
for (int block = blockIdx.x*blockDim.x+threadIdx.x; block < numBlocks; block += blockDim.x*gridDim.x) { for (int block = GLOBAL_ID; block < numBlocks; block += GLOBAL_SIZE) {
// Load parameters for atom1. // Load parameters for atom1.
int atom1 = neighborIndex[block]; int atom1 = neighborIndex[block];
...@@ -384,22 +384,22 @@ extern "C" __global__ void computeForce( ...@@ -384,22 +384,22 @@ extern "C" __global__ void computeForce(
real sigma = data1.sig.x+data2.sig.x; real sigma = data1.sig.x+data2.sig.x;
real epsilon = data1.eps.x*data2.eps.x; real epsilon = data1.eps.x*data2.eps.x;
computeOneInteraction(&data1, &data2, sigma, epsilon, delta, r2, &force1, &force2, &torque1, &torque2, &energy); computeOneInteraction(&data1, &data2, sigma, epsilon, delta, r2, &force1, &force2, &torque1, &torque2, &energy);
atomicAdd(&forceBuffers[index2], static_cast<unsigned long long>((long long) (force2.x*0x100000000))); ATOMIC_ADD(&forceBuffers[index2], (mm_ulong) ((mm_long) (force2.x*0x100000000)));
atomicAdd(&forceBuffers[index2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force2.y*0x100000000))); ATOMIC_ADD(&forceBuffers[index2+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force2.y*0x100000000)));
atomicAdd(&forceBuffers[index2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force2.z*0x100000000))); ATOMIC_ADD(&forceBuffers[index2+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force2.z*0x100000000)));
atomicAdd(&torqueBuffers[index2], static_cast<unsigned long long>((long long) (torque2.x*0x100000000))); ATOMIC_ADD(&torqueBuffers[index2], (mm_ulong) ((mm_long) (torque2.x*0x100000000)));
atomicAdd(&torqueBuffers[index2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque2.y*0x100000000))); ATOMIC_ADD(&torqueBuffers[index2+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque2.y*0x100000000)));
atomicAdd(&torqueBuffers[index2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque2.z*0x100000000))); ATOMIC_ADD(&torqueBuffers[index2+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque2.z*0x100000000)));
} }
atomicAdd(&forceBuffers[index1], static_cast<unsigned long long>((long long) (force1.x*0x100000000))); ATOMIC_ADD(&forceBuffers[index1], (mm_ulong) ((mm_long) (force1.x*0x100000000)));
atomicAdd(&forceBuffers[index1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force1.y*0x100000000))); ATOMIC_ADD(&forceBuffers[index1+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force1.y*0x100000000)));
atomicAdd(&forceBuffers[index1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force1.z*0x100000000))); ATOMIC_ADD(&forceBuffers[index1+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force1.z*0x100000000)));
atomicAdd(&torqueBuffers[index1], static_cast<unsigned long long>((long long) (torque1.x*0x100000000))); ATOMIC_ADD(&torqueBuffers[index1], (mm_ulong) ((mm_long) (torque1.x*0x100000000)));
atomicAdd(&torqueBuffers[index1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque1.y*0x100000000))); ATOMIC_ADD(&torqueBuffers[index1+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque1.y*0x100000000)));
atomicAdd(&torqueBuffers[index1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque1.z*0x100000000))); ATOMIC_ADD(&torqueBuffers[index1+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque1.z*0x100000000)));
} }
#else #else
for (int atom1 = blockIdx.x*blockDim.x+threadIdx.x; atom1 < numAtoms; atom1 += blockDim.x*gridDim.x) { for (int atom1 = GLOBAL_ID; atom1 < numAtoms; atom1 += GLOBAL_SIZE) {
// Load parameters for atom1. // Load parameters for atom1.
int index1 = sortedAtoms[atom1]; int index1 = sortedAtoms[atom1];
...@@ -432,25 +432,25 @@ extern "C" __global__ void computeForce( ...@@ -432,25 +432,25 @@ extern "C" __global__ void computeForce(
real sigma = data1.sig.x+data2.sig.x; real sigma = data1.sig.x+data2.sig.x;
real epsilon = data1.eps.x*data2.eps.x; real epsilon = data1.eps.x*data2.eps.x;
computeOneInteraction(&data1, &data2, sigma, epsilon, delta, r2, &force1, &force2, &torque1, &torque2, &energy); computeOneInteraction(&data1, &data2, sigma, epsilon, delta, r2, &force1, &force2, &torque1, &torque2, &energy);
atomicAdd(&forceBuffers[index2], static_cast<unsigned long long>((long long) (force2.x*0x100000000))); ATOMIC_ADD(&forceBuffers[index2], (mm_ulong) ((mm_long) (force2.x*0x100000000)));
atomicAdd(&forceBuffers[index2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force2.y*0x100000000))); ATOMIC_ADD(&forceBuffers[index2+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force2.y*0x100000000)));
atomicAdd(&forceBuffers[index2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force2.z*0x100000000))); ATOMIC_ADD(&forceBuffers[index2+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force2.z*0x100000000)));
atomicAdd(&torqueBuffers[index2], static_cast<unsigned long long>((long long) (torque2.x*0x100000000))); ATOMIC_ADD(&torqueBuffers[index2], (mm_ulong) ((mm_long) (torque2.x*0x100000000)));
atomicAdd(&torqueBuffers[index2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque2.y*0x100000000))); ATOMIC_ADD(&torqueBuffers[index2+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque2.y*0x100000000)));
atomicAdd(&torqueBuffers[index2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque2.z*0x100000000))); ATOMIC_ADD(&torqueBuffers[index2+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque2.z*0x100000000)));
} }
atomicAdd(&forceBuffers[index1], static_cast<unsigned long long>((long long) (force1.x*0x100000000))); ATOMIC_ADD(&forceBuffers[index1], (mm_ulong) ((mm_long) (force1.x*0x100000000)));
atomicAdd(&forceBuffers[index1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force1.y*0x100000000))); ATOMIC_ADD(&forceBuffers[index1+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force1.y*0x100000000)));
atomicAdd(&forceBuffers[index1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force1.z*0x100000000))); ATOMIC_ADD(&forceBuffers[index1+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force1.z*0x100000000)));
atomicAdd(&torqueBuffers[index1], static_cast<unsigned long long>((long long) (torque1.x*0x100000000))); ATOMIC_ADD(&torqueBuffers[index1], (mm_ulong) ((mm_long) (torque1.x*0x100000000)));
atomicAdd(&torqueBuffers[index1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque1.y*0x100000000))); ATOMIC_ADD(&torqueBuffers[index1+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque1.y*0x100000000)));
atomicAdd(&torqueBuffers[index1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque1.z*0x100000000))); ATOMIC_ADD(&torqueBuffers[index1+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque1.z*0x100000000)));
} }
#endif #endif
// Now compute exceptions. // Now compute exceptions.
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numExceptions; index += blockDim.x*gridDim.x) { for (int index = GLOBAL_ID; index < numExceptions; index += GLOBAL_SIZE) {
int4 atomIndices = exceptionParticles[index]; int4 atomIndices = exceptionParticles[index];
float2 params = exceptionParams[index]; float2 params = exceptionParams[index];
int index1 = atomIndices.x, index2 = atomIndices.y; int index1 = atomIndices.x, index2 = atomIndices.y;
...@@ -466,34 +466,34 @@ extern "C" __global__ void computeForce( ...@@ -466,34 +466,34 @@ extern "C" __global__ void computeForce(
if (r2 < CUTOFF_SQUARED) { if (r2 < CUTOFF_SQUARED) {
#endif #endif
computeOneInteraction(&data1, &data2, params.x, params.y, delta, r2, &force1, &force2, &torque1, &torque2, &energy); computeOneInteraction(&data1, &data2, params.x, params.y, delta, r2, &force1, &force2, &torque1, &torque2, &energy);
atomicAdd(&forceBuffers[index1], static_cast<unsigned long long>((long long) (force1.x*0x100000000))); ATOMIC_ADD(&forceBuffers[index1], (mm_ulong) ((mm_long) (force1.x*0x100000000)));
atomicAdd(&forceBuffers[index1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force1.y*0x100000000))); ATOMIC_ADD(&forceBuffers[index1+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force1.y*0x100000000)));
atomicAdd(&forceBuffers[index1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force1.z*0x100000000))); ATOMIC_ADD(&forceBuffers[index1+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force1.z*0x100000000)));
atomicAdd(&forceBuffers[index2], static_cast<unsigned long long>((long long) (force2.x*0x100000000))); ATOMIC_ADD(&forceBuffers[index2], (mm_ulong) ((mm_long) (force2.x*0x100000000)));
atomicAdd(&forceBuffers[index2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force2.y*0x100000000))); ATOMIC_ADD(&forceBuffers[index2+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force2.y*0x100000000)));
atomicAdd(&forceBuffers[index2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force2.z*0x100000000))); ATOMIC_ADD(&forceBuffers[index2+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force2.z*0x100000000)));
atomicAdd(&torqueBuffers[index1], static_cast<unsigned long long>((long long) (torque1.x*0x100000000))); ATOMIC_ADD(&torqueBuffers[index1], (mm_ulong) ((mm_long) (torque1.x*0x100000000)));
atomicAdd(&torqueBuffers[index1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque1.y*0x100000000))); ATOMIC_ADD(&torqueBuffers[index1+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque1.y*0x100000000)));
atomicAdd(&torqueBuffers[index1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque1.z*0x100000000))); ATOMIC_ADD(&torqueBuffers[index1+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque1.z*0x100000000)));
atomicAdd(&torqueBuffers[index2], static_cast<unsigned long long>((long long) (torque2.x*0x100000000))); ATOMIC_ADD(&torqueBuffers[index2], (mm_ulong) ((mm_long) (torque2.x*0x100000000)));
atomicAdd(&torqueBuffers[index2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque2.y*0x100000000))); ATOMIC_ADD(&torqueBuffers[index2+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque2.y*0x100000000)));
atomicAdd(&torqueBuffers[index2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (torque2.z*0x100000000))); ATOMIC_ADD(&torqueBuffers[index2+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (torque2.z*0x100000000)));
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
} }
#endif #endif
} }
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy; energyBuffer[GLOBAL_ID] += energy;
} }
/** /**
* Convert the torques to forces on the connected particles. * Convert the torques to forces on the connected particles.
*/ */
extern "C" __global__ void applyTorques( KERNEL void applyTorques(
unsigned long long* __restrict__ forceBuffers, long long* __restrict__ torqueBuffers, GLOBAL mm_ulong* RESTRICT forceBuffers, GLOBAL const mm_long* RESTRICT torqueBuffers,
int numParticles, const real4* __restrict__ posq, int2* const __restrict__ axisParticleIndices, int numParticles, GLOBAL const real4* RESTRICT posq, GLOBAL int2* const RESTRICT axisParticleIndices,
const int* sortedParticles) { GLOBAL const int* sortedParticles) {
const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; const unsigned int warp = GLOBAL_ID/TILE_SIZE;
for (int sortedIndex = blockIdx.x*blockDim.x+threadIdx.x; sortedIndex < numParticles; sortedIndex += blockDim.x*gridDim.x) { for (int sortedIndex = GLOBAL_ID; sortedIndex < numParticles; sortedIndex += GLOBAL_SIZE) {
int originalIndex = sortedParticles[sortedIndex]; int originalIndex = sortedParticles[sortedIndex];
real3 pos = trimTo3(posq[originalIndex]); real3 pos = trimTo3(posq[originalIndex]);
int2 axisParticles = axisParticleIndices[originalIndex]; int2 axisParticles = axisParticleIndices[originalIndex];
...@@ -522,16 +522,16 @@ extern "C" __global__ void applyTorques( ...@@ -522,16 +522,16 @@ extern "C" __global__ void applyTorques(
yforce += f; yforce += f;
force -= f; force -= f;
} }
atomicAdd(&forceBuffers[originalIndex], static_cast<unsigned long long>((long long) (force.x*0x100000000))); ATOMIC_ADD(&forceBuffers[originalIndex], (mm_ulong) ((mm_long) (force.x*0x100000000)));
atomicAdd(&forceBuffers[originalIndex+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000))); ATOMIC_ADD(&forceBuffers[originalIndex+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.y*0x100000000)));
atomicAdd(&forceBuffers[originalIndex+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000))); ATOMIC_ADD(&forceBuffers[originalIndex+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
atomicAdd(&forceBuffers[axisParticles.x], static_cast<unsigned long long>((long long) (xforce.x*0x100000000))); ATOMIC_ADD(&forceBuffers[axisParticles.x], (mm_ulong) ((mm_long) (xforce.x*0x100000000)));
atomicAdd(&forceBuffers[axisParticles.x+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (xforce.y*0x100000000))); ATOMIC_ADD(&forceBuffers[axisParticles.x+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (xforce.y*0x100000000)));
atomicAdd(&forceBuffers[axisParticles.x+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (xforce.z*0x100000000))); ATOMIC_ADD(&forceBuffers[axisParticles.x+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (xforce.z*0x100000000)));
if (axisParticles.y != -1) { if (axisParticles.y != -1) {
atomicAdd(&forceBuffers[axisParticles.y], static_cast<unsigned long long>((long long) (yforce.x*0x100000000))); ATOMIC_ADD(&forceBuffers[axisParticles.y], (mm_ulong) ((mm_long) (yforce.x*0x100000000)));
atomicAdd(&forceBuffers[axisParticles.y+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (yforce.y*0x100000000))); ATOMIC_ADD(&forceBuffers[axisParticles.y+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (yforce.y*0x100000000)));
atomicAdd(&forceBuffers[axisParticles.y+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (yforce.z*0x100000000))); ATOMIC_ADD(&forceBuffers[axisParticles.y+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (yforce.z*0x100000000)));
} }
} }
} }
......
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif
#define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE) #define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE)
typedef struct { typedef struct {
...@@ -13,26 +10,26 @@ typedef struct { ...@@ -13,26 +10,26 @@ typedef struct {
/** /**
* Compute the Born sum. * Compute the Born sum.
*/ */
__kernel void computeBornSum( KERNEL void computeBornSum(
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict global_bornSum, GLOBAL mm_ulong* RESTRICT global_bornSum,
#else #else
__global real* restrict global_bornSum, GLOBAL real* RESTRICT global_bornSum,
#endif #endif
__global const real4* restrict posq, __global const real* restrict charge, __global const float2* restrict global_params, GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT charge, GLOBAL const float2* RESTRICT global_params,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
__global const real4* restrict blockSize, __global const int* restrict interactingAtoms, GLOBAL const real4* RESTRICT blockSize, GLOBAL const int* RESTRICT interactingAtoms,
#else #else
unsigned int numTiles, unsigned int numTiles,
#endif #endif
__global const ushort2* exclusionTiles) { GLOBAL const ushort2* RESTRICT exclusionTiles) {
const unsigned int totalWarps = get_global_size(0)/TILE_SIZE; const unsigned int totalWarps = GLOBAL_SIZE/TILE_SIZE;
const unsigned int warp = get_global_id(0)/TILE_SIZE; const unsigned int warp = GLOBAL_ID/TILE_SIZE;
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); const unsigned int tgx = LOCAL_ID & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx; const unsigned int tbx = LOCAL_ID - tgx;
__local AtomData1 localData[FORCE_WORK_GROUP_SIZE]; LOCAL AtomData1 localData[FORCE_WORK_GROUP_SIZE];
// First loop: process tiles that contain exclusions. // First loop: process tiles that contain exclusions.
...@@ -42,7 +39,7 @@ __kernel void computeBornSum( ...@@ -42,7 +39,7 @@ __kernel void computeBornSum(
const ushort2 tileIndices = exclusionTiles[pos]; const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x; const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y; const unsigned int y = tileIndices.y;
real bornSum = 0.0f; real bornSum = 0;
unsigned int atom1 = x*TILE_SIZE + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
real4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
real charge1 = charge[atom1]; real charge1 = charge[atom1];
...@@ -50,15 +47,15 @@ __kernel void computeBornSum( ...@@ -50,15 +47,15 @@ __kernel void computeBornSum(
if (x == y) { if (x == y) {
// This tile is on the diagonal. // This tile is on the diagonal.
localData[get_local_id(0)].x = posq1.x; localData[LOCAL_ID].x = posq1.x;
localData[get_local_id(0)].y = posq1.y; localData[LOCAL_ID].y = posq1.y;
localData[get_local_id(0)].z = posq1.z; localData[LOCAL_ID].z = posq1.z;
localData[get_local_id(0)].q = charge1; localData[LOCAL_ID].q = charge1;
localData[get_local_id(0)].radius = params1.x; localData[LOCAL_ID].radius = params1.x;
localData[get_local_id(0)].scaledRadius = params1.y; localData[LOCAL_ID].scaledRadius = params1.y;
SYNC_WARPS; SYNC_WARPS;
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
real4 delta = (real4) (localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z, 0); real3 delta = make_real3(localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
...@@ -70,7 +67,7 @@ __kernel void computeBornSum( ...@@ -70,7 +67,7 @@ __kernel void computeBornSum(
#endif #endif
real invR = RSQRT(r2); real invR = RSQRT(r2);
real r = r2*invR; real r = r2*invR;
float2 params2 = (float2) (localData[tbx+j].radius, localData[tbx+j].scaledRadius); float2 params2 = make_float2(localData[tbx+j].radius, localData[tbx+j].scaledRadius);
real rScaledRadiusJ = r+params2.y; real rScaledRadiusJ = r+params2.y;
if ((j != tgx) && (params1.x < rScaledRadiusJ)) { if ((j != tgx) && (params1.x < rScaledRadiusJ)) {
real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y))); real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
...@@ -91,21 +88,21 @@ __kernel void computeBornSum( ...@@ -91,21 +88,21 @@ __kernel void computeBornSum(
unsigned int j = y*TILE_SIZE + tgx; unsigned int j = y*TILE_SIZE + tgx;
real4 tempPosq = posq[j]; real4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x; localData[LOCAL_ID].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y; localData[LOCAL_ID].y = tempPosq.y;
localData[get_local_id(0)].z = tempPosq.z; localData[LOCAL_ID].z = tempPosq.z;
localData[get_local_id(0)].q = charge[j]; localData[LOCAL_ID].q = charge[j];
float2 tempParams = global_params[j]; float2 tempParams = global_params[j];
localData[get_local_id(0)].radius = tempParams.x; localData[LOCAL_ID].radius = tempParams.x;
localData[get_local_id(0)].scaledRadius = tempParams.y; localData[LOCAL_ID].scaledRadius = tempParams.y;
localData[get_local_id(0)].bornSum = 0.0f; localData[LOCAL_ID].bornSum = 0.0f;
SYNC_WARPS; SYNC_WARPS;
// Compute the full set of interactions in this tile. // Compute the full set of interactions in this tile.
unsigned int tj = tgx; unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) { for (j = 0; j < TILE_SIZE; j++) {
real4 delta = (real4) (localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z, 0); real3 delta = make_real3(localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
...@@ -117,7 +114,7 @@ __kernel void computeBornSum( ...@@ -117,7 +114,7 @@ __kernel void computeBornSum(
#endif #endif
real invR = RSQRT(r2); real invR = RSQRT(r2);
real r = r2*invR; real r = r2*invR;
float2 params2 = (float2) (localData[tbx+tj].radius, localData[tbx+tj].scaledRadius); float2 params2 = make_float2(localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
real rScaledRadiusJ = r+params2.y; real rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) { if (params1.x < rScaledRadiusJ) {
real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y))); real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
...@@ -151,17 +148,17 @@ __kernel void computeBornSum( ...@@ -151,17 +148,17 @@ __kernel void computeBornSum(
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = x*TILE_SIZE + tgx; unsigned int offset = x*TILE_SIZE + tgx;
atom_add(&global_bornSum[offset], (long) (bornSum*0x100000000)); ATOMIC_ADD(&global_bornSum[offset], (mm_ulong) ((mm_long) (bornSum*0x100000000)));
if (x != y) { if (x != y) {
offset = y*TILE_SIZE + tgx; offset = y*TILE_SIZE + tgx;
atom_add(&global_bornSum[offset], (long) (localData[get_local_id(0)].bornSum*0x100000000)); ATOMIC_ADD(&global_bornSum[offset], (mm_ulong) ((mm_long) (localData[LOCAL_ID].bornSum*0x100000000)));
} }
#else #else
unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS; unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS; unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
global_bornSum[offset1] += bornSum; global_bornSum[offset1] += bornSum;
if (x != y) if (x != y)
global_bornSum[offset2] += localData[get_local_id(0)].bornSum; global_bornSum[offset2] += localData[LOCAL_ID].bornSum;
#endif #endif
} }
...@@ -172,17 +169,17 @@ __kernel void computeBornSum( ...@@ -172,17 +169,17 @@ __kernel void computeBornSum(
unsigned int numTiles = interactionCount[0]; unsigned int numTiles = interactionCount[0];
if (numTiles > maxTiles) if (numTiles > maxTiles)
return; // There wasn't enough memory for the neighbor list. return; // There wasn't enough memory for the neighbor list.
int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps); int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : (mm_long)numTiles)/totalWarps);
int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps); int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : (mm_long)numTiles)/totalWarps);
#else #else
int pos = (int) (warp*(long)numTiles/totalWarps); int pos = (int) (warp*(mm_long)numTiles/totalWarps);
int end = (int) ((warp+1)*(long)numTiles/totalWarps); int end = (int) ((warp+1)*(mm_long)numTiles/totalWarps);
#endif #endif
int skipBase = 0; int skipBase = 0;
int currentSkipIndex = tbx; int currentSkipIndex = tbx;
__local int atomIndices[FORCE_WORK_GROUP_SIZE]; LOCAL int atomIndices[FORCE_WORK_GROUP_SIZE];
__local volatile int skipTiles[FORCE_WORK_GROUP_SIZE]; LOCAL volatile int skipTiles[FORCE_WORK_GROUP_SIZE];
skipTiles[get_local_id(0)] = -1; skipTiles[LOCAL_ID] = -1;
while (pos < end) { while (pos < end) {
real bornSum = 0; real bornSum = 0;
...@@ -213,10 +210,10 @@ __kernel void computeBornSum( ...@@ -213,10 +210,10 @@ __kernel void computeBornSum(
SYNC_WARPS; SYNC_WARPS;
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) { if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx]; ushort2 tile = exclusionTiles[skipBase+tgx];
skipTiles[get_local_id(0)] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2; skipTiles[LOCAL_ID] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
} }
else else
skipTiles[get_local_id(0)] = end; skipTiles[LOCAL_ID] = end;
skipBase += TILE_SIZE; skipBase += TILE_SIZE;
currentSkipIndex = tbx; currentSkipIndex = tbx;
SYNC_WARPS; SYNC_WARPS;
...@@ -238,17 +235,17 @@ __kernel void computeBornSum( ...@@ -238,17 +235,17 @@ __kernel void computeBornSum(
#else #else
unsigned int j = y*TILE_SIZE + tgx; unsigned int j = y*TILE_SIZE + tgx;
#endif #endif
atomIndices[get_local_id(0)] = j; atomIndices[LOCAL_ID] = j;
if (j < PADDED_NUM_ATOMS) { if (j < PADDED_NUM_ATOMS) {
real4 tempPosq = posq[j]; real4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x; localData[LOCAL_ID].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y; localData[LOCAL_ID].y = tempPosq.y;
localData[get_local_id(0)].z = tempPosq.z; localData[LOCAL_ID].z = tempPosq.z;
localData[get_local_id(0)].q = charge[j]; localData[LOCAL_ID].q = charge[j];
float2 tempParams = global_params[j]; float2 tempParams = global_params[j];
localData[get_local_id(0)].radius = tempParams.x; localData[LOCAL_ID].radius = tempParams.x;
localData[get_local_id(0)].scaledRadius = tempParams.y; localData[LOCAL_ID].scaledRadius = tempParams.y;
localData[get_local_id(0)].bornSum = 0.0f; localData[LOCAL_ID].bornSum = 0.0f;
} }
SYNC_WARPS; SYNC_WARPS;
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
...@@ -258,17 +255,17 @@ __kernel void computeBornSum( ...@@ -258,17 +255,17 @@ __kernel void computeBornSum(
real4 blockCenterX = blockCenter[x]; real4 blockCenterX = blockCenter[x];
APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX) APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[get_local_id(0)], blockCenterX) APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[LOCAL_ID], blockCenterX)
SYNC_WARPS; SYNC_WARPS;
unsigned int tj = tgx; unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) { for (j = 0; j < TILE_SIZE; j++) {
real4 delta = (real4) (localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z, 0); real3 delta = make_real3(localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
int atom2 = atomIndices[tbx+tj]; int atom2 = atomIndices[tbx+tj];
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2); real invR = RSQRT(r2);
real r = r2*invR; real r = r2*invR;
float2 params2 = (float2) (localData[tbx+tj].radius, localData[tbx+tj].scaledRadius); float2 params2 = make_float2(localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
real rScaledRadiusJ = r+params2.y; real rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) { if (params1.x < rScaledRadiusJ) {
real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y))); real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
...@@ -304,7 +301,7 @@ __kernel void computeBornSum( ...@@ -304,7 +301,7 @@ __kernel void computeBornSum(
unsigned int tj = tgx; unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) { for (j = 0; j < TILE_SIZE; j++) {
real4 delta = (real4) (localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z, 0); real3 delta = make_real3(localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
...@@ -317,7 +314,7 @@ __kernel void computeBornSum( ...@@ -317,7 +314,7 @@ __kernel void computeBornSum(
#endif #endif
real invR = RSQRT(r2); real invR = RSQRT(r2);
real r = r2*invR; real r = r2*invR;
float2 params2 = (float2) (localData[tbx+tj].radius, localData[tbx+tj].scaledRadius); float2 params2 = make_float2(localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
real rScaledRadiusJ = r+params2.y; real rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) { if (params1.x < rScaledRadiusJ) {
real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y))); real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
...@@ -350,20 +347,20 @@ __kernel void computeBornSum( ...@@ -350,20 +347,20 @@ __kernel void computeBornSum(
// Write results. // Write results.
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int atom2 = atomIndices[get_local_id(0)]; unsigned int atom2 = atomIndices[LOCAL_ID];
#else #else
unsigned int atom2 = y*TILE_SIZE + tgx; unsigned int atom2 = y*TILE_SIZE + tgx;
#endif #endif
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&global_bornSum[atom1], (long) (bornSum*0x100000000)); ATOMIC_ADD(&global_bornSum[atom1], (mm_ulong) ((mm_long) (bornSum*0x100000000)));
if (atom2 < PADDED_NUM_ATOMS) if (atom2 < PADDED_NUM_ATOMS)
atom_add(&global_bornSum[atom2], (long) (localData[get_local_id(0)].bornSum*0x100000000)); ATOMIC_ADD(&global_bornSum[atom2], (mm_ulong) ((mm_long) (localData[LOCAL_ID].bornSum*0x100000000)));
#else #else
unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS; unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS; unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS;
global_bornSum[offset1] += bornSum; global_bornSum[offset1] += bornSum;
if (atom2 < PADDED_NUM_ATOMS) if (atom2 < PADDED_NUM_ATOMS)
global_bornSum[offset2] += localData[get_local_id(0)].bornSum; global_bornSum[offset2] += localData[LOCAL_ID].bornSum;
#endif #endif
} }
pos++; pos++;
...@@ -381,28 +378,28 @@ typedef struct { ...@@ -381,28 +378,28 @@ typedef struct {
* First part of computing the GBSA interaction. * First part of computing the GBSA interaction.
*/ */
__kernel void computeGBSAForce1( KERNEL void computeGBSAForce1(
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers, __global long* restrict global_bornForce, GLOBAL mm_ulong* RESTRICT forceBuffers, GLOBAL mm_ulong* RESTRICT global_bornForce,
#else #else
__global real4* restrict forceBuffers, __global real* restrict global_bornForce, GLOBAL real4* RESTRICT forceBuffers, GLOBAL real* RESTRICT global_bornForce,
#endif #endif
__global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const real* restrict charge, GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT charge,
__global const real* restrict global_bornRadii, int needEnergy, GLOBAL const real* RESTRICT global_bornRadii, int needEnergy,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
__global const real4* restrict blockSize, __global const int* restrict interactingAtoms, GLOBAL const real4* RESTRICT blockSize, GLOBAL const int* RESTRICT interactingAtoms,
#else #else
unsigned int numTiles, unsigned int numTiles,
#endif #endif
__global const ushort2* exclusionTiles) { GLOBAL const ushort2* RESTRICT exclusionTiles) {
const unsigned int totalWarps = get_global_size(0)/TILE_SIZE; const unsigned int totalWarps = GLOBAL_SIZE/TILE_SIZE;
const unsigned int warp = get_global_id(0)/TILE_SIZE; const unsigned int warp = GLOBAL_ID/TILE_SIZE;
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); const unsigned int tgx = LOCAL_ID & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx; const unsigned int tbx = LOCAL_ID - tgx;
mixed energy = 0; mixed energy = 0;
__local AtomData2 localData[FORCE_WORK_GROUP_SIZE]; LOCAL AtomData2 localData[FORCE_WORK_GROUP_SIZE];
// First loop: process tiles that contain exclusions. // First loop: process tiles that contain exclusions.
...@@ -412,7 +409,7 @@ __kernel void computeGBSAForce1( ...@@ -412,7 +409,7 @@ __kernel void computeGBSAForce1(
const ushort2 tileIndices = exclusionTiles[pos]; const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x; const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y; const unsigned int y = tileIndices.y;
real4 force = 0.0f; real4 force = make_real4(0);
unsigned int atom1 = x*TILE_SIZE + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
real4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
real charge1 = charge[atom1]; real charge1 = charge[atom1];
...@@ -420,18 +417,17 @@ __kernel void computeGBSAForce1( ...@@ -420,18 +417,17 @@ __kernel void computeGBSAForce1(
if (x == y) { if (x == y) {
// This tile is on the diagonal. // This tile is on the diagonal.
const unsigned int localAtomIndex = get_local_id(0); localData[LOCAL_ID].x = posq1.x;
localData[localAtomIndex].x = posq1.x; localData[LOCAL_ID].y = posq1.y;
localData[localAtomIndex].y = posq1.y; localData[LOCAL_ID].z = posq1.z;
localData[localAtomIndex].z = posq1.z; localData[LOCAL_ID].q = charge1;
localData[localAtomIndex].q = charge1; localData[LOCAL_ID].bornRadius = bornRadius1;
localData[get_local_id(0)].bornRadius = bornRadius1;
SYNC_WARPS; SYNC_WARPS;
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) { if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
real3 pos2 = (real3) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z); real3 pos2 = make_real3(localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z);
real charge2 = localData[tbx+j].q; real charge2 = localData[tbx+j].q;
real4 delta = (real4) (pos2 - posq1.xyz, 0); real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
...@@ -459,8 +455,10 @@ __kernel void computeGBSAForce1( ...@@ -459,8 +455,10 @@ __kernel void computeGBSAForce1(
#endif #endif
if (needEnergy) if (needEnergy)
energy += 0.5f*tempEnergy; energy += 0.5f*tempEnergy;
delta.xyz *= dEdR; delta *= dEdR;
force.xyz -= delta.xyz; force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
} }
#endif #endif
...@@ -473,22 +471,22 @@ __kernel void computeGBSAForce1( ...@@ -473,22 +471,22 @@ __kernel void computeGBSAForce1(
unsigned int j = y*TILE_SIZE + tgx; unsigned int j = y*TILE_SIZE + tgx;
real4 tempPosq = posq[j]; real4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x; localData[LOCAL_ID].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y; localData[LOCAL_ID].y = tempPosq.y;
localData[get_local_id(0)].z = tempPosq.z; localData[LOCAL_ID].z = tempPosq.z;
localData[get_local_id(0)].q = charge[j]; localData[LOCAL_ID].q = charge[j];
localData[get_local_id(0)].bornRadius = global_bornRadii[j]; localData[LOCAL_ID].bornRadius = global_bornRadii[j];
localData[get_local_id(0)].fx = 0.0f; localData[LOCAL_ID].fx = 0.0f;
localData[get_local_id(0)].fy = 0.0f; localData[LOCAL_ID].fy = 0.0f;
localData[get_local_id(0)].fz = 0.0f; localData[LOCAL_ID].fz = 0.0f;
localData[get_local_id(0)].fw = 0.0f; localData[LOCAL_ID].fw = 0.0f;
SYNC_WARPS; SYNC_WARPS;
unsigned int tj = tgx; unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) { for (j = 0; j < TILE_SIZE; j++) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) { if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
real3 pos2 = (real3) (localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z); real3 pos2 = make_real3(localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z);
real charge2 = localData[tbx+tj].q; real charge2 = localData[tbx+tj].q;
real4 delta = (real4) (pos2 - posq1.xyz, 0); real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
...@@ -515,8 +513,10 @@ __kernel void computeGBSAForce1( ...@@ -515,8 +513,10 @@ __kernel void computeGBSAForce1(
#endif #endif
if (needEnergy) if (needEnergy)
energy += tempEnergy; energy += tempEnergy;
delta.xyz *= dEdR; delta *= dEdR;
force.xyz -= delta.xyz; force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
localData[tbx+tj].fx += delta.x; localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y; localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z; localData[tbx+tj].fz += delta.z;
...@@ -534,25 +534,25 @@ __kernel void computeGBSAForce1( ...@@ -534,25 +534,25 @@ __kernel void computeGBSAForce1(
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = x*TILE_SIZE + tgx; unsigned int offset = x*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset], (long) (force.x*0x100000000)); ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) ((mm_long) (force.x*0x100000000)));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000)); ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.y*0x100000000)));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000)); ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
atom_add(&global_bornForce[offset], (long) (force.w*0x100000000)); ATOMIC_ADD(&global_bornForce[offset], (mm_ulong) ((mm_long) (force.w*0x100000000)));
if (x != y) { if (x != y) {
offset = y*TILE_SIZE + tgx; offset = y*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset], (long) (localData[get_local_id(0)].fx*0x100000000)); ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fx*0x100000000)));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fy*0x100000000)); ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fy*0x100000000)));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fz*0x100000000)); ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fz*0x100000000)));
atom_add(&global_bornForce[offset], (long) (localData[get_local_id(0)].fw*0x100000000)); ATOMIC_ADD(&global_bornForce[offset], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fw*0x100000000)));
} }
#else #else
unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS; unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS; unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
forceBuffers[offset1].xyz += force.xyz; forceBuffers[offset1] += make_real4(force.x, force.y, force.z, 0);
global_bornForce[offset1] += force.w; global_bornForce[offset1] += force.w;
if (x != y) { if (x != y) {
forceBuffers[offset2] += (real4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0.0f); forceBuffers[offset2] += (real4) (localData[LOCAL_ID].fx, localData[LOCAL_ID].fy, localData[LOCAL_ID].fz, 0.0f);
global_bornForce[offset2] += localData[get_local_id(0)].fw; global_bornForce[offset2] += localData[LOCAL_ID].fw;
} }
#endif #endif
} }
...@@ -564,20 +564,20 @@ __kernel void computeGBSAForce1( ...@@ -564,20 +564,20 @@ __kernel void computeGBSAForce1(
unsigned int numTiles = interactionCount[0]; unsigned int numTiles = interactionCount[0];
if (numTiles > maxTiles) if (numTiles > maxTiles)
return; // There wasn't enough memory for the neighbor list. return; // There wasn't enough memory for the neighbor list.
int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps); int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : (mm_long)numTiles)/totalWarps);
int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps); int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : (mm_long)numTiles)/totalWarps);
#else #else
int pos = (int) (warp*(long)numTiles/totalWarps); int pos = (int) (warp*(mm_long)numTiles/totalWarps);
int end = (int) ((warp+1)*(long)numTiles/totalWarps); int end = (int) ((warp+1)*(mm_long)numTiles/totalWarps);
#endif #endif
int skipBase = 0; int skipBase = 0;
int currentSkipIndex = tbx; int currentSkipIndex = tbx;
__local int atomIndices[FORCE_WORK_GROUP_SIZE]; LOCAL int atomIndices[FORCE_WORK_GROUP_SIZE];
__local volatile int skipTiles[FORCE_WORK_GROUP_SIZE]; LOCAL volatile int skipTiles[FORCE_WORK_GROUP_SIZE];
skipTiles[get_local_id(0)] = -1; skipTiles[LOCAL_ID] = -1;
while (pos < end) { while (pos < end) {
real4 force = 0; real4 force = make_real4(0);
bool includeTile = true; bool includeTile = true;
// Extract the coordinates of this tile. // Extract the coordinates of this tile.
...@@ -605,10 +605,10 @@ __kernel void computeGBSAForce1( ...@@ -605,10 +605,10 @@ __kernel void computeGBSAForce1(
SYNC_WARPS; SYNC_WARPS;
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) { if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx]; ushort2 tile = exclusionTiles[skipBase+tgx];
skipTiles[get_local_id(0)] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2; skipTiles[LOCAL_ID] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
} }
else else
skipTiles[get_local_id(0)] = end; skipTiles[LOCAL_ID] = end;
skipBase += TILE_SIZE; skipBase += TILE_SIZE;
currentSkipIndex = tbx; currentSkipIndex = tbx;
SYNC_WARPS; SYNC_WARPS;
...@@ -630,18 +630,18 @@ __kernel void computeGBSAForce1( ...@@ -630,18 +630,18 @@ __kernel void computeGBSAForce1(
#else #else
unsigned int j = y*TILE_SIZE + tgx; unsigned int j = y*TILE_SIZE + tgx;
#endif #endif
atomIndices[get_local_id(0)] = j; atomIndices[LOCAL_ID] = j;
if (j < PADDED_NUM_ATOMS) { if (j < PADDED_NUM_ATOMS) {
real4 tempPosq = posq[j]; real4 tempPosq = posq[j];
localData[get_local_id(0)].x = tempPosq.x; localData[LOCAL_ID].x = tempPosq.x;
localData[get_local_id(0)].y = tempPosq.y; localData[LOCAL_ID].y = tempPosq.y;
localData[get_local_id(0)].z = tempPosq.z; localData[LOCAL_ID].z = tempPosq.z;
localData[get_local_id(0)].q = charge[j]; localData[LOCAL_ID].q = charge[j];
localData[get_local_id(0)].bornRadius = global_bornRadii[j]; localData[LOCAL_ID].bornRadius = global_bornRadii[j];
localData[get_local_id(0)].fx = 0.0f; localData[LOCAL_ID].fx = 0.0f;
localData[get_local_id(0)].fy = 0.0f; localData[LOCAL_ID].fy = 0.0f;
localData[get_local_id(0)].fz = 0.0f; localData[LOCAL_ID].fz = 0.0f;
localData[get_local_id(0)].fw = 0.0f; localData[LOCAL_ID].fw = 0.0f;
} }
SYNC_WARPS; SYNC_WARPS;
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
...@@ -651,15 +651,15 @@ __kernel void computeGBSAForce1( ...@@ -651,15 +651,15 @@ __kernel void computeGBSAForce1(
real4 blockCenterX = blockCenter[x]; real4 blockCenterX = blockCenter[x];
APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX) APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[get_local_id(0)], blockCenterX) APPLY_PERIODIC_TO_POS_WITH_CENTER(localData[LOCAL_ID], blockCenterX)
SYNC_WARPS; SYNC_WARPS;
unsigned int tj = tgx; unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) { for (j = 0; j < TILE_SIZE; j++) {
int atom2 = atomIndices[tbx+tj]; int atom2 = atomIndices[tbx+tj];
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 pos2 = (real3) (localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z); real3 pos2 = make_real3(localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z);
real charge2 = localData[tbx+tj].q; real charge2 = localData[tbx+tj].q;
real4 delta = (real4) (pos2 - posq1.xyz, 0); real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
if (r2 < CUTOFF_SQUARED) { if (r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2); real invR = RSQRT(r2);
...@@ -681,8 +681,10 @@ __kernel void computeGBSAForce1( ...@@ -681,8 +681,10 @@ __kernel void computeGBSAForce1(
#endif #endif
if (needEnergy) if (needEnergy)
energy += tempEnergy; energy += tempEnergy;
delta.xyz *= dEdR; delta *= dEdR;
force.xyz -= delta.xyz; force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
localData[tbx+tj].fx += delta.x; localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y; localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z; localData[tbx+tj].fz += delta.z;
...@@ -702,9 +704,9 @@ __kernel void computeGBSAForce1( ...@@ -702,9 +704,9 @@ __kernel void computeGBSAForce1(
for (j = 0; j < TILE_SIZE; j++) { for (j = 0; j < TILE_SIZE; j++) {
int atom2 = atomIndices[tbx+tj]; int atom2 = atomIndices[tbx+tj];
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real3 pos2 = (real3) (localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z); real3 pos2 = make_real3(localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z);
real charge2 = localData[tbx+tj].q; real charge2 = localData[tbx+tj].q;
real4 delta = (real4) (pos2 - posq1.xyz, 0); real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
...@@ -731,8 +733,10 @@ __kernel void computeGBSAForce1( ...@@ -731,8 +733,10 @@ __kernel void computeGBSAForce1(
#endif #endif
if (needEnergy) if (needEnergy)
energy += tempEnergy; energy += tempEnergy;
delta.xyz *= dEdR; delta *= dEdR;
force.xyz -= delta.xyz; force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
localData[tbx+tj].fx += delta.x; localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y; localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z; localData[tbx+tj].fz += delta.z;
...@@ -745,37 +749,37 @@ __kernel void computeGBSAForce1( ...@@ -745,37 +749,37 @@ __kernel void computeGBSAForce1(
SYNC_WARPS; SYNC_WARPS;
} }
} }
// Write results. // Write results.
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int atom2 = atomIndices[get_local_id(0)]; unsigned int atom2 = atomIndices[LOCAL_ID];
#else #else
unsigned int atom2 = y*TILE_SIZE + tgx; unsigned int atom2 = y*TILE_SIZE + tgx;
#endif #endif
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000)); ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) ((mm_long) (force.x*0x100000000)));
atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000)); ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.y*0x100000000)));
atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000)); ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (force.z*0x100000000)));
atom_add(&global_bornForce[atom1], (long) (force.w*0x100000000)); ATOMIC_ADD(&global_bornForce[atom1], (mm_ulong) ((mm_long) (force.w*0x100000000)));
if (atom2 < PADDED_NUM_ATOMS) { if (atom2 < PADDED_NUM_ATOMS) {
atom_add(&forceBuffers[atom2], (long) (localData[get_local_id(0)].fx*0x100000000)); ATOMIC_ADD(&forceBuffers[atom2], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fx*0x100000000)));
atom_add(&forceBuffers[atom2+PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fy*0x100000000)); ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fy*0x100000000)));
atom_add(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fz*0x100000000)); ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fz*0x100000000)));
atom_add(&global_bornForce[atom2], (long) (localData[get_local_id(0)].fw*0x100000000)); ATOMIC_ADD(&global_bornForce[atom2], (mm_ulong) ((mm_long) (localData[LOCAL_ID].fw*0x100000000)));
} }
#else #else
unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS; unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS; unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS;
forceBuffers[offset1].xyz += force.xyz; forceBuffers[offset1] += make_real4(force.x, force.y, force.z, 0);
global_bornForce[offset1] += force.w; global_bornForce[offset1] += force.w;
if (atom2 < PADDED_NUM_ATOMS) { if (atom2 < PADDED_NUM_ATOMS) {
forceBuffers[offset2] += (real4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0.0f); forceBuffers[offset2] += (real4) (localData[LOCAL_ID].fx, localData[LOCAL_ID].fy, localData[LOCAL_ID].fz, 0.0f);
global_bornForce[offset2] += localData[get_local_id(0)].fw; global_bornForce[offset2] += localData[LOCAL_ID].fw;
} }
#endif #endif
} }
pos++; pos++;
} }
energyBuffer[get_global_id(0)] += energy; energyBuffer[GLOBAL_ID] += energy;
} }
...@@ -2,8 +2,8 @@ ...@@ -2,8 +2,8 @@
real invRSquaredOver4 = 0.25f*invR*invR; real invRSquaredOver4 = 0.25f*invR*invR;
real rScaledRadiusJ = r+OBC_PARAMS2.y; real rScaledRadiusJ = r+OBC_PARAMS2.y;
real rScaledRadiusI = r+OBC_PARAMS1.y; real rScaledRadiusI = r+OBC_PARAMS1.y;
real l_ijJ = RECIP(max(OBC_PARAMS1.x, fabs(r-OBC_PARAMS2.y))); real l_ijJ = RECIP(max((real) OBC_PARAMS1.x, fabs(r-OBC_PARAMS2.y)));
real l_ijI = RECIP(max(OBC_PARAMS2.x, fabs(r-OBC_PARAMS1.y))); real l_ijI = RECIP(max((real) OBC_PARAMS2.x, fabs(r-OBC_PARAMS1.y)));
real u_ijJ = RECIP(rScaledRadiusJ); real u_ijJ = RECIP(rScaledRadiusJ);
real u_ijI = RECIP(rScaledRadiusI); real u_ijI = RECIP(rScaledRadiusI);
real l_ij2J = l_ijJ*l_ijJ; real l_ij2J = l_ijJ*l_ijJ;
...@@ -16,12 +16,17 @@ ...@@ -16,12 +16,17 @@
real t2I = (l_ij2I-u_ij2I); real t2I = (l_ij2I-u_ij2I);
real term1 = (0.5f*(0.25f+OBC_PARAMS2.y*OBC_PARAMS2.y*invRSquaredOver4)*t2J + t1J*invRSquaredOver4)*invR; real term1 = (0.5f*(0.25f+OBC_PARAMS2.y*OBC_PARAMS2.y*invRSquaredOver4)*t2J + t1J*invRSquaredOver4)*invR;
real term2 = (0.5f*(0.25f+OBC_PARAMS1.y*OBC_PARAMS1.y*invRSquaredOver4)*t2I + t1I*invRSquaredOver4)*invR; real term2 = (0.5f*(0.25f+OBC_PARAMS1.y*OBC_PARAMS1.y*invRSquaredOver4)*t2I + t1I*invRSquaredOver4)*invR;
#ifdef SUPPORTS_64_BIT_ATOMICS
real tempdEdR = (OBC_PARAMS1.x < rScaledRadiusJ ? BORN_FORCE1*term1/0x100000000 : 0); real tempdEdR = (OBC_PARAMS1.x < rScaledRadiusJ ? BORN_FORCE1*term1/0x100000000 : 0);
tempdEdR += (OBC_PARAMS2.x < rScaledRadiusI ? BORN_FORCE2*term2/0x100000000 : 0); tempdEdR += (OBC_PARAMS2.x < rScaledRadiusI ? BORN_FORCE2*term2/0x100000000 : 0);
#else
real tempdEdR = (OBC_PARAMS1.x < rScaledRadiusJ ? BORN_FORCE1*term1 : (real) 0);
tempdEdR += (OBC_PARAMS2.x < rScaledRadiusI ? BORN_FORCE2*term2 : (real) 0);
#endif
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int includeInteraction = (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2 && r2 < CUTOFF_SQUARED); unsigned int includeInteraction = (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2 && r2 < CUTOFF_SQUARED);
#else #else
unsigned int includeInteraction = (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2); unsigned int includeInteraction = (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2);
#endif #endif
dEdR += (includeInteraction ? tempdEdR : 0); dEdR += (includeInteraction ? tempdEdR : (real) 0);
} }
...@@ -5,22 +5,21 @@ ...@@ -5,22 +5,21 @@
* Reduce the Born sums to compute the Born radii. * Reduce the Born sums to compute the Born radii.
*/ */
__kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float beta, float gamma, KERNEL void reduceBornSum(float alpha, float beta, float gamma,
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
__global const long* restrict bornSum, GLOBAL const mm_long* RESTRICT bornSum,
#else #else
__global const real* restrict bornSum, GLOBAL const real* RESTRICT bornSum, int bufferSize, int numBuffers,
#endif #endif
__global const float2* restrict params, __global real* restrict bornRadii, __global real* restrict obcChain) { GLOBAL const float2* RESTRICT params, GLOBAL real* RESTRICT bornRadii, GLOBAL real* RESTRICT obcChain) {
unsigned int index = get_global_id(0); for (unsigned int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
while (index < NUM_ATOMS) {
// Get summed Born data // Get summed Born data
int totalSize = bufferSize*numBuffers;
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
real sum = (1/(real) 0x100000000)*bornSum[index]; real sum = RECIP((real) 0x100000000)*bornSum[index];
#else #else
real sum = bornSum[index]; real sum = bornSum[index];
int totalSize = bufferSize*numBuffers;
for (int i = index+bufferSize; i < totalSize; i += bufferSize) for (int i = index+bufferSize; i < totalSize; i += bufferSize)
sum += bornSum[i]; sum += bornSum[i];
#endif #endif
...@@ -33,12 +32,11 @@ __kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float b ...@@ -33,12 +32,11 @@ __kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float b
real sum3 = sum*sum2; real sum3 = sum*sum2;
real tanhSum = tanh(alpha*sum - beta*sum2 + gamma*sum3); real tanhSum = tanh(alpha*sum - beta*sum2 + gamma*sum3);
real nonOffsetRadius = offsetRadius + DIELECTRIC_OFFSET; real nonOffsetRadius = offsetRadius + DIELECTRIC_OFFSET;
real radius = 1/(1/offsetRadius - tanhSum/nonOffsetRadius); real radius = RECIP(RECIP(offsetRadius) - tanhSum/nonOffsetRadius);
real chain = offsetRadius*(alpha - 2*beta*sum + 3*gamma*sum2); real chain = offsetRadius*(alpha - 2*beta*sum + 3*gamma*sum2);
chain = (1-tanhSum*tanhSum)*chain / nonOffsetRadius; chain = (1-tanhSum*tanhSum)*chain / nonOffsetRadius;
bornRadii[index] = radius; bornRadii[index] = radius;
obcChain[index] = chain; obcChain[index] = chain;
index += get_global_size(0);
} }
} }
...@@ -46,21 +44,22 @@ __kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float b ...@@ -46,21 +44,22 @@ __kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float b
* Reduce the Born force. * Reduce the Born force.
*/ */
__kernel void reduceBornForce(int bufferSize, int numBuffers, __global real* bornForce, KERNEL void reduceBornForce(
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
__global const long* restrict bornForceIn, GLOBAL mm_long* RESTRICT bornForce,
#else
GLOBAL real* bornForce, int bufferSize, int numBuffers,
#endif #endif
__global mixed* restrict energyBuffer, __global const float2* restrict params, __global const real* restrict bornRadii, __global const real* restrict obcChain) { GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const float2* RESTRICT params, GLOBAL const real* RESTRICT bornRadii, GLOBAL const real* RESTRICT obcChain) {
mixed energy = 0; mixed energy = 0;
unsigned int index = get_global_id(0); for (unsigned int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
while (index < NUM_ATOMS) { // Get summed Born force
// Sum the Born force
int totalSize = bufferSize*numBuffers;
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
real force = (1/(real) 0x100000000)*bornForceIn[index]; real force = RECIP((real) 0x100000000)*bornForce[index];
#else #else
real force = bornForce[index]; real force = bornForce[index];
int totalSize = bufferSize*numBuffers;
for (int i = index+bufferSize; i < totalSize; i += bufferSize) for (int i = index+bufferSize; i < totalSize; i += bufferSize)
force += bornForce[i]; force += bornForce[i];
#endif #endif
...@@ -69,13 +68,16 @@ __kernel void reduceBornForce(int bufferSize, int numBuffers, __global real* bor ...@@ -69,13 +68,16 @@ __kernel void reduceBornForce(int bufferSize, int numBuffers, __global real* bor
float offsetRadius = params[index].x; float offsetRadius = params[index].x;
real bornRadius = bornRadii[index]; real bornRadius = bornRadii[index];
real r = offsetRadius+DIELECTRIC_OFFSET+PROBE_RADIUS; real r = offsetRadius+DIELECTRIC_OFFSET+PROBE_RADIUS;
real ratio6 = pow((offsetRadius+DIELECTRIC_OFFSET)/bornRadius, (real) 6); real ratio6 = POW((offsetRadius+DIELECTRIC_OFFSET)/bornRadius, (real) 6);
real saTerm = SURFACE_AREA_FACTOR*r*r*ratio6; real saTerm = SURFACE_AREA_FACTOR*r*r*ratio6;
force += saTerm/bornRadius; force += saTerm/bornRadius;
energy += saTerm; energy += saTerm;
force *= bornRadius*bornRadius*obcChain[index]; force *= bornRadius*bornRadius*obcChain[index];
#ifdef SUPPORTS_64_BIT_ATOMICS
bornForce[index] = (mm_long) (force*0x100000000);
#else
bornForce[index] = force; bornForce[index] = force;
index += get_global_size(0); #endif
} }
energyBuffer[get_global_id(0)] += energy/-6.0f; energyBuffer[GLOBAL_ID] += energy/-6;
} }
\ No newline at end of file
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif
typedef struct { typedef struct {
real x, y, z; real x, y, z;
real q; real q;
...@@ -12,27 +8,27 @@ typedef struct { ...@@ -12,27 +8,27 @@ typedef struct {
/** /**
* Compute the Born sum. * Compute the Born sum.
*/ */
__kernel void computeBornSum( KERNEL void computeBornSum(
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict global_bornSum, GLOBAL mm_long* RESTRICT global_bornSum,
#else #else
__global real* restrict global_bornSum, GLOBAL real* RESTRICT global_bornSum,
#endif #endif
__global const real4* restrict posq, __global const real* restrict charge, __global const float2* restrict global_params, GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT charge, GLOBAL const float2* RESTRICT global_params,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
__global const real4* restrict blockSize, __global const int* restrict interactingAtoms, GLOBAL const real4* RESTRICT blockSize, GLOBAL const int* RESTRICT interactingAtoms,
#else #else
unsigned int numTiles, unsigned int numTiles,
#endif #endif
__global const ushort2* exclusionTiles) { GLOBAL const ushort2* exclusionTiles) {
__local AtomData1 localData[TILE_SIZE]; LOCAL AtomData1 localData[TILE_SIZE];
// First loop: process tiles that contain exclusions. // First loop: process tiles that contain exclusions.
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+get_group_id(0)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0); const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+GROUP_ID*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(get_group_id(0)+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0); const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(GROUP_ID+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) { for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos]; const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x; const unsigned int x = tileIndices.x;
...@@ -56,17 +52,17 @@ __kernel void computeBornSum( ...@@ -56,17 +52,17 @@ __kernel void computeBornSum(
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) { for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx; unsigned int atom1 = x*TILE_SIZE+tgx;
real bornSum = 0.0f; real bornSum = 0;
real4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
float2 params1 = global_params[atom1]; float2 params1 = global_params[atom1];
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
real3 pos2 = (real3) (localData[j].x, localData[j].y, localData[j].z); real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
real charge2 = localData[j].q; real charge2 = localData[j].q;
real4 delta = (real4) (pos2 - posq1.xyz, 0); real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
real r2 = dot(delta.xyz, delta.xyz); real r2 = dot(trimTo3(delta), trimTo3(delta));
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) { if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else #else
...@@ -74,7 +70,7 @@ __kernel void computeBornSum( ...@@ -74,7 +70,7 @@ __kernel void computeBornSum(
#endif #endif
real invR = RSQRT(r2); real invR = RSQRT(r2);
real r = r2*invR; real r = r2*invR;
float2 params2 = (float2) (localData[j].radius, localData[j].scaledRadius); float2 params2 = make_float2(localData[j].radius, localData[j].scaledRadius);
real rScaledRadiusJ = r+params2.y; real rScaledRadiusJ = r+params2.y;
if ((j != tgx) && (params1.x < rScaledRadiusJ)) { if ((j != tgx) && (params1.x < rScaledRadiusJ)) {
real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y))); real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
...@@ -92,9 +88,9 @@ __kernel void computeBornSum( ...@@ -92,9 +88,9 @@ __kernel void computeBornSum(
// Write results. // Write results.
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&global_bornSum[atom1], (long) (bornSum*0x100000000)); ATOMIC_ADD(&global_bornSum[atom1], (mm_long) (bornSum*0x100000000));
#else #else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
global_bornSum[offset] += bornSum; global_bornSum[offset] += bornSum;
#endif #endif
} }
...@@ -110,9 +106,9 @@ __kernel void computeBornSum( ...@@ -110,9 +106,9 @@ __kernel void computeBornSum(
real4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
float2 params1 = global_params[atom1]; float2 params1 = global_params[atom1];
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
real3 pos2 = (real3) (localData[j].x, localData[j].y, localData[j].z); real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
real charge2 = localData[j].q; real charge2 = localData[j].q;
real4 delta = (real4) (pos2 - posq1.xyz, 0); real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
...@@ -124,7 +120,7 @@ __kernel void computeBornSum( ...@@ -124,7 +120,7 @@ __kernel void computeBornSum(
#endif #endif
real invR = RSQRT(r2); real invR = RSQRT(r2);
real r = r2*invR; real r = r2*invR;
float2 params2 = (float2) (localData[j].radius, localData[j].scaledRadius); float2 params2 = make_float2(localData[j].radius, localData[j].scaledRadius);
real rScaledRadiusJ = r+params2.y; real rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) { if (params1.x < rScaledRadiusJ) {
real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y))); real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
...@@ -154,9 +150,9 @@ __kernel void computeBornSum( ...@@ -154,9 +150,9 @@ __kernel void computeBornSum(
// Write results for atom1. // Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&global_bornSum[atom1], (long) (bornSum*0x100000000)); ATOMIC_ADD(&global_bornSum[atom1], (mm_long) (bornSum*0x100000000));
#else #else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
global_bornSum[offset] += bornSum; global_bornSum[offset] += bornSum;
#endif #endif
} }
...@@ -166,9 +162,9 @@ __kernel void computeBornSum( ...@@ -166,9 +162,9 @@ __kernel void computeBornSum(
for (int tgx = 0; tgx < TILE_SIZE; tgx++) { for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = y*TILE_SIZE + tgx; unsigned int offset = y*TILE_SIZE + tgx;
atom_add(&global_bornSum[offset], (long) (localData[tgx].bornSum*0x100000000)); ATOMIC_ADD(&global_bornSum[offset], (mm_long) (localData[tgx].bornSum*0x100000000));
#else #else
unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = y*TILE_SIZE+tgx + GROUP_ID*PADDED_NUM_ATOMS;
global_bornSum[offset] += localData[tgx].bornSum; global_bornSum[offset] += localData[tgx].bornSum;
#endif #endif
} }
...@@ -182,15 +178,15 @@ __kernel void computeBornSum( ...@@ -182,15 +178,15 @@ __kernel void computeBornSum(
unsigned int numTiles = interactionCount[0]; unsigned int numTiles = interactionCount[0];
if (numTiles > maxTiles) if (numTiles > maxTiles)
return; // There wasn't enough memory for the neighbor list. return; // There wasn't enough memory for the neighbor list.
int pos = (int) (get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0)); int pos = (int) (GROUP_ID*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : numTiles)/NUM_GROUPS);
int end = (int) ((get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0)); int end = (int) ((GROUP_ID+1)*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : numTiles)/NUM_GROUPS);
#else #else
int pos = (int) (get_group_id(0)*(long)numTiles/get_num_groups(0)); int pos = (int) (GROUP_ID*(mm_long)numTiles/NUM_GROUPS);
int end = (int) ((get_group_id(0)+1)*(long)numTiles/get_num_groups(0)); int end = (int) ((GROUP_ID+1)*(mm_long)numTiles/NUM_GROUPS);
#endif #endif
int nextToSkip = -1; int nextToSkip = -1;
int currentSkipIndex = 0; int currentSkipIndex = 0;
__local int atomIndices[TILE_SIZE]; LOCAL int atomIndices[TILE_SIZE];
while (pos < end) { while (pos < end) {
bool includeTile = true; bool includeTile = true;
...@@ -263,15 +259,15 @@ __kernel void computeBornSum( ...@@ -263,15 +259,15 @@ __kernel void computeBornSum(
APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX) APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
float2 params1 = global_params[atom1]; float2 params1 = global_params[atom1];
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
real3 pos2 = (real3) (localData[j].x, localData[j].y, localData[j].z); real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
real charge2 = localData[j].q; real charge2 = localData[j].q;
real4 delta = (real4) (pos2 - posq1.xyz, 0); real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
int atom2 = atomIndices[j]; int atom2 = atomIndices[j];
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2); real invR = RSQRT(r2);
real r = r2*invR; real r = r2*invR;
float2 params2 = (float2) (localData[j].radius, localData[j].scaledRadius); float2 params2 = make_float2(localData[j].radius, localData[j].scaledRadius);
real rScaledRadiusJ = r+params2.y; real rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) { if (params1.x < rScaledRadiusJ) {
real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y))); real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
...@@ -301,9 +297,9 @@ __kernel void computeBornSum( ...@@ -301,9 +297,9 @@ __kernel void computeBornSum(
// Write results for atom1. // Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&global_bornSum[atom1], (long) (bornSum*0x100000000)); ATOMIC_ADD(&global_bornSum[atom1], (mm_long) (bornSum*0x100000000));
#else #else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
global_bornSum[offset] += bornSum; global_bornSum[offset] += bornSum;
#endif #endif
} }
...@@ -319,9 +315,9 @@ __kernel void computeBornSum( ...@@ -319,9 +315,9 @@ __kernel void computeBornSum(
real4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
float2 params1 = global_params[atom1]; float2 params1 = global_params[atom1];
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
real3 pos2 = (real3) (localData[j].x, localData[j].y, localData[j].z); real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
real charge2 = localData[j].q; real charge2 = localData[j].q;
real4 delta = (real4) (pos2 - posq1.xyz, 0); real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
...@@ -334,7 +330,7 @@ __kernel void computeBornSum( ...@@ -334,7 +330,7 @@ __kernel void computeBornSum(
#endif #endif
real invR = RSQRT(r2); real invR = RSQRT(r2);
real r = r2*invR; real r = r2*invR;
float2 params2 = (float2) (localData[j].radius, localData[j].scaledRadius); float2 params2 = make_float2(localData[j].radius, localData[j].scaledRadius);
real rScaledRadiusJ = r+params2.y; real rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) { if (params1.x < rScaledRadiusJ) {
real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y))); real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
...@@ -364,9 +360,9 @@ __kernel void computeBornSum( ...@@ -364,9 +360,9 @@ __kernel void computeBornSum(
// Write results for atom1. // Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&global_bornSum[atom1], (long) (bornSum*0x100000000)); ATOMIC_ADD(&global_bornSum[atom1], (mm_long) (bornSum*0x100000000));
#else #else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
global_bornSum[offset] += bornSum; global_bornSum[offset] += bornSum;
#endif #endif
} }
...@@ -382,9 +378,9 @@ __kernel void computeBornSum( ...@@ -382,9 +378,9 @@ __kernel void computeBornSum(
#endif #endif
if (atom2 < PADDED_NUM_ATOMS) { if (atom2 < PADDED_NUM_ATOMS) {
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&global_bornSum[atom2], (long) (localData[tgx].bornSum*0x100000000)); ATOMIC_ADD(&global_bornSum[atom2], (mm_long) (localData[tgx].bornSum*0x100000000));
#else #else
unsigned int offset = atom2 + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = atom2 + GROUP_ID*PADDED_NUM_ATOMS;
global_bornSum[offset] += localData[tgx].bornSum; global_bornSum[offset] += localData[tgx].bornSum;
#endif #endif
} }
...@@ -405,29 +401,29 @@ typedef struct { ...@@ -405,29 +401,29 @@ typedef struct {
* First part of computing the GBSA interaction. * First part of computing the GBSA interaction.
*/ */
__kernel void computeGBSAForce1( KERNEL void computeGBSAForce1(
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers, __global long* restrict global_bornForce, GLOBAL mm_long* RESTRICT forceBuffers, GLOBAL mm_long* RESTRICT global_bornForce,
#else #else
__global real4* restrict forceBuffers, __global real* restrict global_bornForce, GLOBAL real4* RESTRICT forceBuffers, GLOBAL real* RESTRICT global_bornForce,
#endif #endif
__global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const real* restrict charge, GLOBAL mixed* RESTRICT energyBuffer, GLOBAL const real4* RESTRICT posq, GLOBAL const real* RESTRICT charge,
__global const real* restrict global_bornRadii, int needEnergy, GLOBAL const real* RESTRICT global_bornRadii, int needEnergy,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const int* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, GLOBAL const int* RESTRICT tiles, GLOBAL const unsigned int* RESTRICT interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, __global const real4* restrict blockCenter, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ, unsigned int maxTiles, GLOBAL const real4* RESTRICT blockCenter,
__global const real4* restrict blockSize, __global const int* restrict interactingAtoms, GLOBAL const real4* RESTRICT blockSize, GLOBAL const int* RESTRICT interactingAtoms,
#else #else
unsigned int numTiles, unsigned int numTiles,
#endif #endif
__global const ushort2* exclusionTiles) { GLOBAL const ushort2* exclusionTiles) {
mixed energy = 0; mixed energy = 0;
__local AtomData2 localData[TILE_SIZE]; LOCAL AtomData2 localData[TILE_SIZE];
// First loop: process tiles that contain exclusions. // First loop: process tiles that contain exclusions.
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+get_group_id(0)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0); const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+GROUP_ID*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(get_group_id(0)+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0); const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(GROUP_ID+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/NUM_GROUPS;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) { for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos]; const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x; const unsigned int x = tileIndices.x;
...@@ -449,14 +445,14 @@ __kernel void computeGBSAForce1( ...@@ -449,14 +445,14 @@ __kernel void computeGBSAForce1(
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) { for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx; unsigned int atom1 = x*TILE_SIZE+tgx;
real4 force = 0; real4 force = make_real4(0);
real4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
real charge1 = charge[atom1]; real charge1 = charge[atom1];
real bornRadius1 = global_bornRadii[atom1]; real bornRadius1 = global_bornRadii[atom1];
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
real3 pos2 = (real3) (localData[j].x, localData[j].y, localData[j].z); real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
real charge2 = localData[j].q; real charge2 = localData[j].q;
real4 delta = (real4) (pos2 - posq1.xyz, 0); real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
...@@ -485,21 +481,23 @@ __kernel void computeGBSAForce1( ...@@ -485,21 +481,23 @@ __kernel void computeGBSAForce1(
tempEnergy -= scaledChargeProduct/CUTOFF; tempEnergy -= scaledChargeProduct/CUTOFF;
#endif #endif
energy += 0.5f*tempEnergy; energy += 0.5f*tempEnergy;
delta.xyz *= dEdR; delta *= dEdR;
force.xyz -= delta.xyz; force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
} }
} }
// Write results. // Write results.
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000)); ATOMIC_ADD(&forceBuffers[atom1], (mm_long) (force.x*0x100000000));
atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000)); ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_long) (force.y*0x100000000));
atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000)); ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_long) (force.z*0x100000000));
atom_add(&global_bornForce[atom1], (long) (force.w*0x100000000)); ATOMIC_ADD(&global_bornForce[atom1], (mm_long) (force.w*0x100000000));
#else #else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz; forceBuffers[offset] += make_real4(force.x, force.y, force.z, 0);
global_bornForce[offset] += force.w; global_bornForce[offset] += force.w;
#endif #endif
} }
...@@ -515,14 +513,14 @@ __kernel void computeGBSAForce1( ...@@ -515,14 +513,14 @@ __kernel void computeGBSAForce1(
} }
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) { for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx; unsigned int atom1 = x*TILE_SIZE+tgx;
real4 force = 0; real4 force = make_real4(0);
real4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
real charge1 = charge[atom1]; real charge1 = charge[atom1];
real bornRadius1 = global_bornRadii[atom1]; real bornRadius1 = global_bornRadii[atom1];
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
real3 pos2 = (real3) (localData[j].x, localData[j].y, localData[j].z); real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
real charge2 = localData[j].q; real charge2 = localData[j].q;
real4 delta = (real4) (pos2 - posq1.xyz, 0); real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
...@@ -550,8 +548,10 @@ __kernel void computeGBSAForce1( ...@@ -550,8 +548,10 @@ __kernel void computeGBSAForce1(
tempEnergy -= scaledChargeProduct/CUTOFF; tempEnergy -= scaledChargeProduct/CUTOFF;
#endif #endif
energy += tempEnergy; energy += tempEnergy;
delta.xyz *= dEdR; delta *= dEdR;
force.xyz -= delta.xyz; force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
localData[j].fx += delta.x; localData[j].fx += delta.x;
localData[j].fy += delta.y; localData[j].fy += delta.y;
localData[j].fz += delta.z; localData[j].fz += delta.z;
...@@ -562,13 +562,13 @@ __kernel void computeGBSAForce1( ...@@ -562,13 +562,13 @@ __kernel void computeGBSAForce1(
// Write results for atom1. // Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000)); ATOMIC_ADD(&forceBuffers[atom1], (mm_long) (force.x*0x100000000));
atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000)); ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_long) (force.y*0x100000000));
atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000)); ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_long) (force.z*0x100000000));
atom_add(&global_bornForce[atom1], (long) (force.w*0x100000000)); ATOMIC_ADD(&global_bornForce[atom1], (mm_long) (force.w*0x100000000));
#else #else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz; forceBuffers[offset] += make_real4(force.x, force.y, force.z, 0);
global_bornForce[offset] += force.w; global_bornForce[offset] += force.w;
#endif #endif
} }
...@@ -578,12 +578,12 @@ __kernel void computeGBSAForce1( ...@@ -578,12 +578,12 @@ __kernel void computeGBSAForce1(
for (int tgx = 0; tgx < TILE_SIZE; tgx++) { for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = y*TILE_SIZE + tgx; unsigned int offset = y*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset], (long) (localData[tgx].fx*0x100000000)); ATOMIC_ADD(&forceBuffers[offset], (mm_long) (localData[tgx].fx*0x100000000));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (localData[tgx].fy*0x100000000)); ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_long) (localData[tgx].fy*0x100000000));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (localData[tgx].fz*0x100000000)); ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_long) (localData[tgx].fz*0x100000000));
atom_add(&global_bornForce[offset], (long) (localData[tgx].fw*0x100000000)); ATOMIC_ADD(&global_bornForce[offset], (mm_long) (localData[tgx].fw*0x100000000));
#else #else
unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = y*TILE_SIZE+tgx + GROUP_ID*PADDED_NUM_ATOMS;
real4 f = forceBuffers[offset]; real4 f = forceBuffers[offset];
f.x += localData[tgx].fx; f.x += localData[tgx].fx;
f.y += localData[tgx].fy; f.y += localData[tgx].fy;
...@@ -602,15 +602,15 @@ __kernel void computeGBSAForce1( ...@@ -602,15 +602,15 @@ __kernel void computeGBSAForce1(
unsigned int numTiles = interactionCount[0]; unsigned int numTiles = interactionCount[0];
if (numTiles > maxTiles) if (numTiles > maxTiles)
return; // There wasn't enough memory for the neighbor list. return; // There wasn't enough memory for the neighbor list.
int pos = (int) (get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0)); int pos = (int) (GROUP_ID*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : numTiles)/NUM_GROUPS);
int end = (int) ((get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0)); int end = (int) ((GROUP_ID+1)*(numTiles > maxTiles ? NUM_BLOCKS*((mm_long)NUM_BLOCKS+1)/2 : numTiles)/NUM_GROUPS);
#else #else
int pos = (int) (get_group_id(0)*(long)numTiles/get_num_groups(0)); int pos = (int) (GROUP_ID*(mm_long)numTiles/NUM_GROUPS);
int end = (int) ((get_group_id(0)+1)*(long)numTiles/get_num_groups(0)); int end = (int) ((GROUP_ID+1)*(mm_long)numTiles/NUM_GROUPS);
#endif #endif
int nextToSkip = -1; int nextToSkip = -1;
int currentSkipIndex = 0; int currentSkipIndex = 0;
__local int atomIndices[TILE_SIZE]; LOCAL int atomIndices[TILE_SIZE];
while (pos < end) { while (pos < end) {
bool includeTile = true; bool includeTile = true;
...@@ -679,15 +679,15 @@ __kernel void computeGBSAForce1( ...@@ -679,15 +679,15 @@ __kernel void computeGBSAForce1(
} }
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) { for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx; unsigned int atom1 = x*TILE_SIZE+tgx;
real4 force = 0; real4 force = make_real4(0);
real4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
real charge1 = charge[atom1]; real charge1 = charge[atom1];
APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX) APPLY_PERIODIC_TO_POS_WITH_CENTER(posq1, blockCenterX)
float bornRadius1 = global_bornRadii[atom1]; float bornRadius1 = global_bornRadii[atom1];
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
real3 pos2 = (real3) (localData[j].x, localData[j].y, localData[j].z); real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
real charge2 = localData[j].q; real charge2 = localData[j].q;
real4 delta = (real4) (pos2 - posq1.xyz, 0); real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
int atom2 = atomIndices[j]; int atom2 = atomIndices[j];
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
...@@ -709,8 +709,10 @@ __kernel void computeGBSAForce1( ...@@ -709,8 +709,10 @@ __kernel void computeGBSAForce1(
tempEnergy -= scaledChargeProduct/CUTOFF; tempEnergy -= scaledChargeProduct/CUTOFF;
#endif #endif
energy += tempEnergy; energy += tempEnergy;
delta.xyz *= dEdR; delta *= dEdR;
force.xyz -= delta.xyz; force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
localData[j].fx += delta.x; localData[j].fx += delta.x;
localData[j].fy += delta.y; localData[j].fy += delta.y;
localData[j].fz += delta.z; localData[j].fz += delta.z;
...@@ -721,13 +723,13 @@ __kernel void computeGBSAForce1( ...@@ -721,13 +723,13 @@ __kernel void computeGBSAForce1(
// Write results for atom1. // Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000)); ATOMIC_ADD(&forceBuffers[atom1], (mm_long) (force.x*0x100000000));
atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000)); ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_long) (force.y*0x100000000));
atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000)); ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_long) (force.z*0x100000000));
atom_add(&global_bornForce[atom1], (long) (force.w*0x100000000)); ATOMIC_ADD(&global_bornForce[atom1], (mm_long) (force.w*0x100000000));
#else #else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz; forceBuffers[offset] += make_real4(force.x, force.y, force.z, 0);
global_bornForce[offset] += force.w; global_bornForce[offset] += force.w;
#endif #endif
} }
...@@ -739,14 +741,14 @@ __kernel void computeGBSAForce1( ...@@ -739,14 +741,14 @@ __kernel void computeGBSAForce1(
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) { for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx; unsigned int atom1 = x*TILE_SIZE+tgx;
real4 force = 0; real4 force = make_real4(0);
real4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
real charge1 = charge[atom1]; real charge1 = charge[atom1];
float bornRadius1 = global_bornRadii[atom1]; float bornRadius1 = global_bornRadii[atom1];
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
real3 pos2 = (real3) (localData[j].x, localData[j].y, localData[j].z); real3 pos2 = make_real3(localData[j].x, localData[j].y, localData[j].z);
real charge2 = localData[j].q; real charge2 = localData[j].q;
real4 delta = (real4) (pos2 - posq1.xyz, 0); real3 delta = make_real3(pos2.x-posq1.x, pos2.y-posq1.y, pos2.z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta) APPLY_PERIODIC_TO_DELTA(delta)
#endif #endif
...@@ -775,8 +777,10 @@ __kernel void computeGBSAForce1( ...@@ -775,8 +777,10 @@ __kernel void computeGBSAForce1(
tempEnergy -= scaledChargeProduct/CUTOFF; tempEnergy -= scaledChargeProduct/CUTOFF;
#endif #endif
energy += tempEnergy; energy += tempEnergy;
delta.xyz *= dEdR; delta *= dEdR;
force.xyz -= delta.xyz; force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
localData[j].fx += delta.x; localData[j].fx += delta.x;
localData[j].fy += delta.y; localData[j].fy += delta.y;
localData[j].fz += delta.z; localData[j].fz += delta.z;
...@@ -787,13 +791,13 @@ __kernel void computeGBSAForce1( ...@@ -787,13 +791,13 @@ __kernel void computeGBSAForce1(
// Write results for atom1. // Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000)); ATOMIC_ADD(&forceBuffers[atom1], (mm_long) (force.x*0x100000000));
atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000)); ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_long) (force.y*0x100000000));
atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000)); ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_long) (force.z*0x100000000));
atom_add(&global_bornForce[atom1], (long) (force.w*0x100000000)); ATOMIC_ADD(&global_bornForce[atom1], (mm_long) (force.w*0x100000000));
#else #else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = atom1 + GROUP_ID*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz; forceBuffers[offset] += make_real4(force.x, force.y, force.z, 0);
global_bornForce[offset] += force.w; global_bornForce[offset] += force.w;
#endif #endif
} }
...@@ -809,12 +813,12 @@ __kernel void computeGBSAForce1( ...@@ -809,12 +813,12 @@ __kernel void computeGBSAForce1(
#endif #endif
if (atom2 < PADDED_NUM_ATOMS) { if (atom2 < PADDED_NUM_ATOMS) {
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[atom2], (long) (localData[tgx].fx*0x100000000)); ATOMIC_ADD(&forceBuffers[atom2], (mm_long) (localData[tgx].fx*0x100000000));
atom_add(&forceBuffers[atom2+PADDED_NUM_ATOMS], (long) (localData[tgx].fy*0x100000000)); ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], (mm_long) (localData[tgx].fy*0x100000000));
atom_add(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (long) (localData[tgx].fz*0x100000000)); ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_long) (localData[tgx].fz*0x100000000));
atom_add(&global_bornForce[atom2], (long) (localData[tgx].fw*0x100000000)); ATOMIC_ADD(&global_bornForce[atom2], (mm_long) (localData[tgx].fw*0x100000000));
#else #else
unsigned int offset = atom2 + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = atom2 + GROUP_ID*PADDED_NUM_ATOMS;
real4 f = forceBuffers[offset]; real4 f = forceBuffers[offset];
f.x += localData[tgx].fx; f.x += localData[tgx].fx;
f.y += localData[tgx].fy; f.y += localData[tgx].fy;
...@@ -827,5 +831,5 @@ __kernel void computeGBSAForce1( ...@@ -827,5 +831,5 @@ __kernel void computeGBSAForce1(
} }
pos++; pos++;
} }
energyBuffer[get_global_id(0)] += energy; energyBuffer[GLOBAL_ID] += energy;
} }
/** /**
* Generate random numbers * Generate random numbers
*/ */
extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restrict__ random, uint4* __restrict__ seed) { KERNEL void generateRandomNumbers(int numValues, GLOBAL float4* RESTRICT random, GLOBAL uint4* RESTRICT seed) {
int index = blockIdx.x*blockDim.x+threadIdx.x; int index = GLOBAL_ID;
uint4 state = seed[index]; uint4 state = seed[index];
unsigned int carry = 0; unsigned int carry = 0;
while (index < numValues) { while (index < numValues) {
...@@ -63,15 +63,15 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri ...@@ -63,15 +63,15 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
// Record the values. // Record the values.
random[index] = value; random[index] = value;
index += blockDim.x*gridDim.x; index += GLOBAL_SIZE;
} }
seed[blockIdx.x*blockDim.x+threadIdx.x] = state; seed[GLOBAL_ID] = state;
} }
/** /**
* Load the position of a particle. * Load the position of a particle.
*/ */
inline __device__ mixed4 loadPos(const real4* __restrict__ posq, const real4* __restrict__ posqCorrection, int index) { inline DEVICE mixed4 loadPos(GLOBAL const real4* RESTRICT posq, GLOBAL const real4* RESTRICT posqCorrection, int index) {
#ifdef USE_MIXED_PRECISION #ifdef USE_MIXED_PRECISION
real4 pos1 = posq[index]; real4 pos1 = posq[index];
real4 pos2 = posqCorrection[index]; real4 pos2 = posqCorrection[index];
...@@ -84,7 +84,7 @@ inline __device__ mixed4 loadPos(const real4* __restrict__ posq, const real4* __ ...@@ -84,7 +84,7 @@ inline __device__ mixed4 loadPos(const real4* __restrict__ posq, const real4* __
/** /**
* Store the position of a particle. * Store the position of a particle.
*/ */
inline __device__ void storePos(real4* __restrict__ posq, real4* __restrict__ posqCorrection, int index, mixed4 pos) { inline DEVICE void storePos(GLOBAL real4* RESTRICT posq, GLOBAL real4* RESTRICT posqCorrection, int index, mixed4 pos) {
#ifdef USE_MIXED_PRECISION #ifdef USE_MIXED_PRECISION
posq[index] = make_real4((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w); posq[index] = make_real4((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
posqCorrection[index] = make_real4(pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0); posqCorrection[index] = make_real4(pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
...@@ -96,16 +96,24 @@ inline __device__ void storePos(real4* __restrict__ posq, real4* __restrict__ po ...@@ -96,16 +96,24 @@ inline __device__ void storePos(real4* __restrict__ posq, real4* __restrict__ po
/** /**
* Enforce constraints on SHAKE clusters * Enforce constraints on SHAKE clusters
*/ */
extern "C" __global__ void applyShakeToPositions(int numClusters, mixed tol, const real4* __restrict__ oldPos, real4* __restrict__ posCorrection, mixed4* __restrict__ posDelta, const int4* __restrict__ clusterAtoms, const float4* __restrict__ clusterParams) { KERNEL void applyShakeToPositions(int numClusters, mixed tol, GLOBAL const real4* RESTRICT oldPos,
int index = blockIdx.x*blockDim.x+threadIdx.x; GLOBAL mixed4* RESTRICT posDelta, GLOBAL const int4* RESTRICT clusterAtoms, GLOBAL const float4* RESTRICT clusterParams
#ifdef USE_MIXED_PRECISION
, GLOBAL const real4* RESTRICT posqCorrection
#endif
) {
#ifndef USE_MIXED_PRECISION
GLOBAL real4* posqCorrection = 0;
#endif
int index = GLOBAL_ID;
while (index < numClusters) { while (index < numClusters) {
// Load the data for this cluster. // Load the data for this cluster.
int4 atoms = clusterAtoms[index]; int4 atoms = clusterAtoms[index];
float4 params = clusterParams[index]; float4 params = clusterParams[index];
mixed4 pos = loadPos(oldPos, posCorrection, atoms.x); mixed4 pos = loadPos(oldPos, posqCorrection, atoms.x);
mixed4 xpi = posDelta[atoms.x]; mixed4 xpi = posDelta[atoms.x];
mixed4 pos1 = loadPos(oldPos, posCorrection, atoms.y); mixed4 pos1 = loadPos(oldPos, posqCorrection, atoms.y);
mixed4 xpj1 = posDelta[atoms.y]; mixed4 xpj1 = posDelta[atoms.y];
mixed4 pos2 = make_mixed4(0); mixed4 pos2 = make_mixed4(0);
mixed4 xpj2 = make_mixed4(0); mixed4 xpj2 = make_mixed4(0);
...@@ -114,13 +122,13 @@ extern "C" __global__ void applyShakeToPositions(int numClusters, mixed tol, con ...@@ -114,13 +122,13 @@ extern "C" __global__ void applyShakeToPositions(int numClusters, mixed tol, con
float d2 = params.z; float d2 = params.z;
float invMassPeripheral = params.w; float invMassPeripheral = params.w;
if (atoms.z != -1) { if (atoms.z != -1) {
pos2 = loadPos(oldPos, posCorrection, atoms.z); pos2 = loadPos(oldPos, posqCorrection, atoms.z);
xpj2 = posDelta[atoms.z]; xpj2 = posDelta[atoms.z];
} }
mixed4 pos3 = make_mixed4(0); mixed4 pos3 = make_mixed4(0);
mixed4 xpj3 = make_mixed4(0); mixed4 xpj3 = make_mixed4(0);
if (atoms.w != -1) { if (atoms.w != -1) {
pos3 = loadPos(oldPos, posCorrection, atoms.w); pos3 = loadPos(oldPos, posqCorrection, atoms.w);
xpj3 = posDelta[atoms.w]; xpj3 = posDelta[atoms.w];
} }
...@@ -202,23 +210,31 @@ extern "C" __global__ void applyShakeToPositions(int numClusters, mixed tol, con ...@@ -202,23 +210,31 @@ extern "C" __global__ void applyShakeToPositions(int numClusters, mixed tol, con
posDelta[atoms.z] = xpj2; posDelta[atoms.z] = xpj2;
if (atoms.w != -1) if (atoms.w != -1)
posDelta[atoms.w] = xpj3; posDelta[atoms.w] = xpj3;
index += blockDim.x*gridDim.x; index += GLOBAL_SIZE;
} }
} }
/** /**
* Enforce velocity constraints on SHAKE clusters * Enforce velocity constraints on SHAKE clusters
*/ */
extern "C" __global__ void applyShakeToVelocities(int numClusters, mixed tol, const real4* __restrict__ oldPos, real4* __restrict__ posCorrection, mixed4* __restrict__ posDelta, const int4* __restrict__ clusterAtoms, const float4* __restrict__ clusterParams) { KERNEL void applyShakeToVelocities(int numClusters, mixed tol, GLOBAL const real4* RESTRICT oldPos,
int index = blockIdx.x*blockDim.x+threadIdx.x; GLOBAL mixed4* RESTRICT posDelta, GLOBAL const int4* RESTRICT clusterAtoms, GLOBAL const float4* RESTRICT clusterParams
#ifdef USE_MIXED_PRECISION
, GLOBAL const real4* RESTRICT posqCorrection
#endif
) {
#ifndef USE_MIXED_PRECISION
GLOBAL real4* posqCorrection = 0;
#endif
int index = GLOBAL_ID;
while (index < numClusters) { while (index < numClusters) {
// Load the data for this cluster. // Load the data for this cluster.
int4 atoms = clusterAtoms[index]; int4 atoms = clusterAtoms[index];
float4 params = clusterParams[index]; float4 params = clusterParams[index];
mixed4 pos = loadPos(oldPos, posCorrection, atoms.x); mixed4 pos = loadPos(oldPos, posqCorrection, atoms.x);
mixed4 xpi = posDelta[atoms.x]; mixed4 xpi = posDelta[atoms.x];
mixed4 pos1 = loadPos(oldPos, posCorrection, atoms.y); mixed4 pos1 = loadPos(oldPos, posqCorrection, atoms.y);
mixed4 xpj1 = posDelta[atoms.y]; mixed4 xpj1 = posDelta[atoms.y];
mixed4 pos2 = make_mixed4(0); mixed4 pos2 = make_mixed4(0);
mixed4 xpj2 = make_mixed4(0); mixed4 xpj2 = make_mixed4(0);
...@@ -226,13 +242,13 @@ extern "C" __global__ void applyShakeToVelocities(int numClusters, mixed tol, co ...@@ -226,13 +242,13 @@ extern "C" __global__ void applyShakeToVelocities(int numClusters, mixed tol, co
float avgMass = params.y; float avgMass = params.y;
float invMassPeripheral = params.w; float invMassPeripheral = params.w;
if (atoms.z != -1) { if (atoms.z != -1) {
pos2 = loadPos(oldPos, posCorrection, atoms.z); pos2 = loadPos(oldPos, posqCorrection, atoms.z);
xpj2 = posDelta[atoms.z]; xpj2 = posDelta[atoms.z];
} }
mixed4 pos3 = make_mixed4(0); mixed4 pos3 = make_mixed4(0);
mixed4 xpj3 = make_mixed4(0); mixed4 xpj3 = make_mixed4(0);
if (atoms.w != -1) { if (atoms.w != -1) {
pos3 = loadPos(oldPos, posCorrection, atoms.w); pos3 = loadPos(oldPos, posqCorrection, atoms.w);
xpj3 = posDelta[atoms.w]; xpj3 = posDelta[atoms.w];
} }
...@@ -302,25 +318,34 @@ extern "C" __global__ void applyShakeToVelocities(int numClusters, mixed tol, co ...@@ -302,25 +318,34 @@ extern "C" __global__ void applyShakeToVelocities(int numClusters, mixed tol, co
posDelta[atoms.z] = xpj2; posDelta[atoms.z] = xpj2;
if (atoms.w != -1) if (atoms.w != -1)
posDelta[atoms.w] = xpj3; posDelta[atoms.w] = xpj3;
index += blockDim.x*gridDim.x; index += GLOBAL_SIZE;
} }
} }
/** /**
* Enforce constraints on SETTLE clusters * Enforce constraints on SETTLE clusters
*/ */
extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, const real4* __restrict__ oldPos, real4* __restrict__ posCorrection, mixed4* __restrict__ posDelta, const mixed4* __restrict__ velm, const int4* __restrict__ clusterAtoms, const float2* __restrict__ clusterParams) { KERNEL void applySettleToPositions(int numClusters, mixed tol, GLOBAL const real4* RESTRICT oldPos,
int index = blockIdx.x*blockDim.x+threadIdx.x; GLOBAL mixed4* RESTRICT posDelta, GLOBAL const mixed4* RESTRICT velm, GLOBAL const int4* RESTRICT clusterAtoms,
GLOBAL const float2* RESTRICT clusterParams
#ifdef USE_MIXED_PRECISION
, GLOBAL const real4* RESTRICT posqCorrection
#endif
) {
#ifndef USE_MIXED_PRECISION
GLOBAL real4* posqCorrection = 0;
#endif
int index = GLOBAL_ID;
while (index < numClusters) { while (index < numClusters) {
// Load the data for this cluster. // Load the data for this cluster.
int4 atoms = clusterAtoms[index]; int4 atoms = clusterAtoms[index];
float2 params = clusterParams[index]; float2 params = clusterParams[index];
mixed4 apos0 = loadPos(oldPos, posCorrection, atoms.x); mixed4 apos0 = loadPos(oldPos, posqCorrection, atoms.x);
mixed4 xp0 = posDelta[atoms.x]; mixed4 xp0 = posDelta[atoms.x];
mixed4 apos1 = loadPos(oldPos, posCorrection, atoms.y); mixed4 apos1 = loadPos(oldPos, posqCorrection, atoms.y);
mixed4 xp1 = posDelta[atoms.y]; mixed4 xp1 = posDelta[atoms.y];
mixed4 apos2 = loadPos(oldPos, posCorrection, atoms.z); mixed4 apos2 = loadPos(oldPos, posqCorrection, atoms.z);
mixed4 xp2 = posDelta[atoms.z]; mixed4 xp2 = posDelta[atoms.z];
mixed m0 = 1/velm[atoms.x].w; mixed m0 = 1/velm[atoms.x].w;
mixed m1 = 1/velm[atoms.y].w; mixed m1 = 1/velm[atoms.y].w;
...@@ -454,21 +479,30 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co ...@@ -454,21 +479,30 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
posDelta[atoms.x] = xp0; posDelta[atoms.x] = xp0;
posDelta[atoms.y] = xp1; posDelta[atoms.y] = xp1;
posDelta[atoms.z] = xp2; posDelta[atoms.z] = xp2;
index += blockDim.x*gridDim.x; index += GLOBAL_SIZE;
} }
} }
/** /**
* Enforce velocity constraints on SETTLE clusters * Enforce velocity constraints on SETTLE clusters
*/ */
extern "C" __global__ void applySettleToVelocities(int numClusters, mixed tol, const real4* __restrict__ oldPos, real4* __restrict__ posCorrection, mixed4* __restrict__ posDelta, mixed4* __restrict__ velm, const int4* __restrict__ clusterAtoms, const float2* __restrict__ clusterParams) { KERNEL void applySettleToVelocities(int numClusters, mixed tol, GLOBAL const real4* RESTRICT oldPos,
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numClusters; index += blockDim.x*gridDim.x) { GLOBAL mixed4* RESTRICT posDelta, GLOBAL mixed4* RESTRICT velm, GLOBAL const int4* RESTRICT clusterAtoms,
GLOBAL const float2* RESTRICT clusterParams
#ifdef USE_MIXED_PRECISION
, GLOBAL const real4* RESTRICT posqCorrection
#endif
) {
#ifndef USE_MIXED_PRECISION
GLOBAL real4* posqCorrection = 0;
#endif
for (int index = GLOBAL_ID; index < numClusters; index += GLOBAL_SIZE) {
// Load the data for this cluster. // Load the data for this cluster.
int4 atoms = clusterAtoms[index]; int4 atoms = clusterAtoms[index];
mixed4 apos0 = loadPos(oldPos, posCorrection, atoms.x); mixed4 apos0 = loadPos(oldPos, posqCorrection, atoms.x);
mixed4 apos1 = loadPos(oldPos, posCorrection, atoms.y); mixed4 apos1 = loadPos(oldPos, posqCorrection, atoms.y);
mixed4 apos2 = loadPos(oldPos, posCorrection, atoms.z); mixed4 apos2 = loadPos(oldPos, posqCorrection, atoms.z);
mixed4 v0 = velm[atoms.x]; mixed4 v0 = velm[atoms.x];
mixed4 v1 = velm[atoms.y]; mixed4 v1 = velm[atoms.y];
mixed4 v2 = velm[atoms.z]; mixed4 v2 = velm[atoms.z];
...@@ -522,9 +556,16 @@ extern "C" __global__ void applySettleToVelocities(int numClusters, mixed tol, c ...@@ -522,9 +556,16 @@ extern "C" __global__ void applySettleToVelocities(int numClusters, mixed tol, c
/** /**
* Compute the direction each CCMA constraint is pointing in. This is called once at the beginning of constraint evaluation. * Compute the direction each CCMA constraint is pointing in. This is called once at the beginning of constraint evaluation.
*/ */
extern "C" __global__ void computeCCMAConstraintDirections(const int2* __restrict__ constraintAtoms, mixed4* __restrict__ constraintDistance, KERNEL void computeCCMAConstraintDirections(GLOBAL const int2* RESTRICT constraintAtoms, GLOBAL mixed4* RESTRICT constraintDistance,
const real4* __restrict__ atomPositions, const real4* __restrict__ posqCorrection, int* __restrict__ converged) { GLOBAL const real4* RESTRICT atomPositions, GLOBAL int* RESTRICT converged
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_CCMA_CONSTRAINTS; index += blockDim.x*gridDim.x) { #ifdef USE_MIXED_PRECISION
, GLOBAL const real4* RESTRICT posqCorrection
#endif
) {
#ifndef USE_MIXED_PRECISION
GLOBAL real4* posqCorrection = 0;
#endif
for (int index = GLOBAL_ID; index < NUM_CCMA_CONSTRAINTS; index += GLOBAL_SIZE) {
// Compute the direction for this constraint. // Compute the direction for this constraint.
int2 atoms = constraintAtoms[index]; int2 atoms = constraintAtoms[index];
...@@ -536,7 +577,7 @@ extern "C" __global__ void computeCCMAConstraintDirections(const int2* __restric ...@@ -536,7 +577,7 @@ extern "C" __global__ void computeCCMAConstraintDirections(const int2* __restric
dir.z = oldPos1.z-oldPos2.z; dir.z = oldPos1.z-oldPos2.z;
constraintDistance[index] = dir; constraintDistance[index] = dir;
} }
if (threadIdx.x == 0 && blockIdx.x == 0) { if (GLOBAL_ID == 0) {
converged[0] = 1; converged[0] = 1;
converged[1] = 0; converged[1] = 0;
} }
...@@ -545,23 +586,24 @@ extern "C" __global__ void computeCCMAConstraintDirections(const int2* __restric ...@@ -545,23 +586,24 @@ extern "C" __global__ void computeCCMAConstraintDirections(const int2* __restric
/** /**
* Compute the force applied by each CCMA position constraint. * Compute the force applied by each CCMA position constraint.
*/ */
extern "C" __global__ void computeCCMAPositionConstraintForce(const int2* __restrict__ constraintAtoms, const mixed4* __restrict__ constraintDistance, const mixed4* __restrict__ atomPositions, KERNEL void computeCCMAPositionConstraintForce(GLOBAL const int2* RESTRICT constraintAtoms, GLOBAL const mixed4* RESTRICT constraintDistance,
const mixed* __restrict__ reducedMass, mixed* __restrict__ delta1, int* __restrict__ converged, int* __restrict__ hostConvergedFlag, mixed tol, int iteration) { GLOBAL const mixed4* RESTRICT atomPositions, GLOBAL const mixed* RESTRICT reducedMass, GLOBAL mixed* RESTRICT delta1,
__shared__ int groupConverged; GLOBAL int* RESTRICT converged, GLOBAL int* RESTRICT hostConvergedFlag, mixed tol, int iteration) {
LOCAL int groupConverged;
if (converged[1-iteration%2]) { if (converged[1-iteration%2]) {
if (blockIdx.x == 0 && threadIdx.x == 0) { if (GLOBAL_ID == 0) {
converged[iteration%2] = 1; converged[iteration%2] = 1;
hostConvergedFlag[0] = 1; hostConvergedFlag[0] = 1;
} }
return; // The constraint iteration has already converged. return; // The constraint iteration has already converged.
} }
if (threadIdx.x == 0) if (LOCAL_ID == 0)
groupConverged = 1; groupConverged = 1;
__syncthreads(); SYNC_THREADS;
mixed lowerTol = 1-2*tol+tol*tol; mixed lowerTol = 1-2*tol+tol*tol;
mixed upperTol = 1+2*tol+tol*tol; mixed upperTol = 1+2*tol+tol*tol;
bool threadConverged = true; bool threadConverged = true;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_CCMA_CONSTRAINTS; index += blockDim.x*gridDim.x) { for (int index = GLOBAL_ID; index < NUM_CCMA_CONSTRAINTS; index += GLOBAL_SIZE) {
// Compute the force due to this constraint. // Compute the force due to this constraint.
int2 atoms = constraintAtoms[index]; int2 atoms = constraintAtoms[index];
...@@ -580,28 +622,29 @@ extern "C" __global__ void computeCCMAPositionConstraintForce(const int2* __rest ...@@ -580,28 +622,29 @@ extern "C" __global__ void computeCCMAPositionConstraintForce(const int2* __rest
} }
if (groupConverged && !threadConverged) if (groupConverged && !threadConverged)
groupConverged = 0; groupConverged = 0;
__syncthreads(); SYNC_THREADS;
if (threadIdx.x == 0 && !groupConverged) if (LOCAL_ID == 0 && !groupConverged)
converged[iteration%2] = 0; converged[iteration%2] = 0;
} }
/** /**
* Compute the force applied by each CCMA velocity constraint. * Compute the force applied by each CCMA velocity constraint.
*/ */
extern "C" __global__ void computeCCMAVelocityConstraintForce(const int2* __restrict__ constraintAtoms, const mixed4* __restrict__ constraintDistance, const mixed4* __restrict__ atomPositions, KERNEL void computeCCMAVelocityConstraintForce(GLOBAL const int2* RESTRICT constraintAtoms, GLOBAL const mixed4* RESTRICT constraintDistance,
const mixed* __restrict__ reducedMass, mixed* __restrict__ delta1, int* __restrict__ converged, int* __restrict__ hostConvergedFlag, mixed tol, int iteration) { GLOBAL const mixed4* RESTRICT atomPositions, GLOBAL const mixed* RESTRICT reducedMass, GLOBAL mixed* RESTRICT delta1,
__shared__ int groupConverged; GLOBAL int* RESTRICT converged, GLOBAL int* RESTRICT hostConvergedFlag, mixed tol, int iteration) {
LOCAL int groupConverged;
if (converged[1-iteration%2]) { if (converged[1-iteration%2]) {
if (blockIdx.x == 0 && threadIdx.x == 0) { if (GROUP_ID == 0 && LOCAL_ID == 0) {
converged[iteration%2] = 1; converged[iteration%2] = 1;
hostConvergedFlag[0] = 1; hostConvergedFlag[0] = 1;
} }
return; // The constraint iteration has already converged. return; // The constraint iteration has already converged.
} }
if (threadIdx.x == 0) if (LOCAL_ID == 0)
groupConverged = 1; groupConverged = 1;
__syncthreads(); SYNC_THREADS;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_CCMA_CONSTRAINTS; index += blockDim.x*gridDim.x) { for (int index = GLOBAL_ID; index < NUM_CCMA_CONSTRAINTS; index += GLOBAL_SIZE) {
// Compute the force due to this constraint. // Compute the force due to this constraint.
int2 atoms = constraintAtoms[index]; int2 atoms = constraintAtoms[index];
...@@ -623,14 +666,14 @@ extern "C" __global__ void computeCCMAVelocityConstraintForce(const int2* __rest ...@@ -623,14 +666,14 @@ extern "C" __global__ void computeCCMAVelocityConstraintForce(const int2* __rest
/** /**
* Multiply the vector of CCMA constraint forces by the constraint matrix. * Multiply the vector of CCMA constraint forces by the constraint matrix.
*/ */
extern "C" __global__ void multiplyByCCMAConstraintMatrix(const mixed* __restrict__ delta1, mixed* __restrict__ delta2, const int* __restrict__ constraintMatrixColumn, KERNEL void multiplyByCCMAConstraintMatrix(GLOBAL const mixed* RESTRICT delta1, GLOBAL mixed* RESTRICT delta2, GLOBAL const int* RESTRICT constraintMatrixColumn,
const mixed* __restrict__ constraintMatrixValue, const int* __restrict__ converged, int iteration) { GLOBAL const mixed* RESTRICT constraintMatrixValue, GLOBAL const int* RESTRICT converged, int iteration) {
if (converged[iteration%2]) if (converged[iteration%2])
return; // The constraint iteration has already converged. return; // The constraint iteration has already converged.
// Multiply by the inverse constraint matrix. // Multiply by the inverse constraint matrix.
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_CCMA_CONSTRAINTS; index += blockDim.x*gridDim.x) { for (int index = GLOBAL_ID; index < NUM_CCMA_CONSTRAINTS; index += GLOBAL_SIZE) {
mixed sum = 0; mixed sum = 0;
for (int i = 0; ; i++) { for (int i = 0; ; i++) {
int element = index+i*NUM_CCMA_CONSTRAINTS; int element = index+i*NUM_CCMA_CONSTRAINTS;
...@@ -646,14 +689,15 @@ extern "C" __global__ void multiplyByCCMAConstraintMatrix(const mixed* __restric ...@@ -646,14 +689,15 @@ extern "C" __global__ void multiplyByCCMAConstraintMatrix(const mixed* __restric
/** /**
* Update the atom positions based on CCMA constraint forces. * Update the atom positions based on CCMA constraint forces.
*/ */
extern "C" __global__ void updateCCMAAtomPositions(const int* __restrict__ numAtomConstraints, const int* __restrict__ atomConstraints, const mixed4* __restrict__ constraintDistance, KERNEL void updateCCMAAtomPositions(GLOBAL const int* RESTRICT numAtomConstraints, GLOBAL const int* RESTRICT atomConstraints,
mixed4* __restrict__ atomPositions, const mixed4* __restrict__ velm, const mixed* __restrict__ delta1, const mixed* __restrict__ delta2, int* __restrict__ converged, int iteration) { GLOBAL const mixed4* RESTRICT constraintDistance, GLOBAL mixed4* RESTRICT atomPositions, GLOBAL const mixed4* RESTRICT velm,
if (blockIdx.x == 0 && threadIdx.x == 0) GLOBAL const mixed* RESTRICT delta1, GLOBAL const mixed* RESTRICT delta2, GLOBAL int* RESTRICT converged, int iteration) {
if (GROUP_ID == 0 && LOCAL_ID == 0)
converged[1-iteration%2] = 1; converged[1-iteration%2] = 1;
if (converged[iteration%2]) if (converged[iteration%2])
return; // The constraint iteration has already converged. return; // The constraint iteration has already converged.
mixed damping = (iteration < 2 ? 0.5f : 1.0f); mixed damping = (iteration < 2 ? 0.5f : 1.0f);
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) { for (int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
// Compute the new position of this atom. // Compute the new position of this atom.
mixed4 atomPos = atomPositions[index]; mixed4 atomPos = atomPositions[index];
...@@ -677,16 +721,17 @@ extern "C" __global__ void updateCCMAAtomPositions(const int* __restrict__ numAt ...@@ -677,16 +721,17 @@ extern "C" __global__ void updateCCMAAtomPositions(const int* __restrict__ numAt
/** /**
* Compute the positions of virtual sites * Compute the positions of virtual sites
*/ */
extern "C" __global__ void computeVirtualSites(real4* __restrict__ posq, real4* __restrict__ posqCorrection, const int4* __restrict__ avg2Atoms, const real2* __restrict__ avg2Weights, KERNEL void computeVirtualSites(GLOBAL real4* RESTRICT posq, GLOBAL real4* RESTRICT posqCorrection,
const int4* __restrict__ avg3Atoms, const real4* __restrict__ avg3Weights, GLOBAL const int4* RESTRICT avg2Atoms, GLOBAL const real2* RESTRICT avg2Weights,
const int4* __restrict__ outOfPlaneAtoms, const real4* __restrict__ outOfPlaneWeights, GLOBAL const int4* RESTRICT avg3Atoms, GLOBAL const real4* RESTRICT avg3Weights,
const int* __restrict__ localCoordsIndex, const int* __restrict__ localCoordsAtoms, GLOBAL const int4* RESTRICT outOfPlaneAtoms, GLOBAL const real4* RESTRICT outOfPlaneWeights,
const real* __restrict__ localCoordsWeights, const real4* __restrict__ localCoordsPos, GLOBAL const int* RESTRICT localCoordsIndex, GLOBAL const int* RESTRICT localCoordsAtoms,
const int* __restrict__ localCoordsStartIndex) { GLOBAL const real* RESTRICT localCoordsWeights, GLOBAL const real4* RESTRICT localCoordsPos,
GLOBAL const int* RESTRICT localCoordsStartIndex) {
// Two particle average sites. // Two particle average sites.
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_2_AVERAGE; index += blockDim.x*gridDim.x) { for (int index = GLOBAL_ID; index < NUM_2_AVERAGE; index += GLOBAL_SIZE) {
int4 atoms = avg2Atoms[index]; int4 atoms = avg2Atoms[index];
real2 weights = avg2Weights[index]; real2 weights = avg2Weights[index];
mixed4 pos = loadPos(posq, posqCorrection, atoms.x); mixed4 pos = loadPos(posq, posqCorrection, atoms.x);
...@@ -700,7 +745,7 @@ extern "C" __global__ void computeVirtualSites(real4* __restrict__ posq, real4* ...@@ -700,7 +745,7 @@ extern "C" __global__ void computeVirtualSites(real4* __restrict__ posq, real4*
// Three particle average sites. // Three particle average sites.
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_3_AVERAGE; index += blockDim.x*gridDim.x) { for (int index = GLOBAL_ID; index < NUM_3_AVERAGE; index += GLOBAL_SIZE) {
int4 atoms = avg3Atoms[index]; int4 atoms = avg3Atoms[index];
real4 weights = avg3Weights[index]; real4 weights = avg3Weights[index];
mixed4 pos = loadPos(posq, posqCorrection, atoms.x); mixed4 pos = loadPos(posq, posqCorrection, atoms.x);
...@@ -715,7 +760,7 @@ extern "C" __global__ void computeVirtualSites(real4* __restrict__ posq, real4* ...@@ -715,7 +760,7 @@ extern "C" __global__ void computeVirtualSites(real4* __restrict__ posq, real4*
// Out of plane sites. // Out of plane sites.
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_OUT_OF_PLANE; index += blockDim.x*gridDim.x) { for (int index = GLOBAL_ID; index < NUM_OUT_OF_PLANE; index += GLOBAL_SIZE) {
int4 atoms = outOfPlaneAtoms[index]; int4 atoms = outOfPlaneAtoms[index];
real4 weights = outOfPlaneWeights[index]; real4 weights = outOfPlaneWeights[index];
mixed4 pos = loadPos(posq, posqCorrection, atoms.x); mixed4 pos = loadPos(posq, posqCorrection, atoms.x);
...@@ -724,7 +769,7 @@ extern "C" __global__ void computeVirtualSites(real4* __restrict__ posq, real4* ...@@ -724,7 +769,7 @@ extern "C" __global__ void computeVirtualSites(real4* __restrict__ posq, real4*
mixed4 pos3 = loadPos(posq, posqCorrection, atoms.w); mixed4 pos3 = loadPos(posq, posqCorrection, atoms.w);
mixed4 v12 = pos2-pos1; mixed4 v12 = pos2-pos1;
mixed4 v13 = pos3-pos1; mixed4 v13 = pos3-pos1;
mixed3 cr = cross(v12, v13); mixed4 cr = cross(v12, v13);
pos.x = pos1.x + v12.x*weights.x + v13.x*weights.y + cr.x*weights.z; pos.x = pos1.x + v12.x*weights.x + v13.x*weights.y + cr.x*weights.z;
pos.y = pos1.y + v12.y*weights.x + v13.y*weights.y + cr.y*weights.z; pos.y = pos1.y + v12.y*weights.x + v13.y*weights.y + cr.y*weights.z;
pos.z = pos1.z + v12.z*weights.x + v13.z*weights.y + cr.z*weights.z; pos.z = pos1.z + v12.z*weights.x + v13.z*weights.y + cr.z*weights.z;
...@@ -733,7 +778,7 @@ extern "C" __global__ void computeVirtualSites(real4* __restrict__ posq, real4* ...@@ -733,7 +778,7 @@ extern "C" __global__ void computeVirtualSites(real4* __restrict__ posq, real4*
// Local coordinates sites. // Local coordinates sites.
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_LOCAL_COORDS; index += blockDim.x*gridDim.x) { for (int index = GLOBAL_ID; index < NUM_LOCAL_COORDS; index += GLOBAL_SIZE) {
int siteAtomIndex = localCoordsIndex[index]; int siteAtomIndex = localCoordsIndex[index];
int start = localCoordsStartIndex[index]; int start = localCoordsStartIndex[index];
int end = localCoordsStartIndex[index+1]; int end = localCoordsStartIndex[index+1];
...@@ -762,32 +807,38 @@ extern "C" __global__ void computeVirtualSites(real4* __restrict__ posq, real4* ...@@ -762,32 +807,38 @@ extern "C" __global__ void computeVirtualSites(real4* __restrict__ posq, real4*
} }
} }
inline __device__ real3 loadForce(int index, long long* __restrict__ force) { inline DEVICE real3 loadForce(int index, GLOBAL const mm_long* RESTRICT force) {
real scale = 1/((real) 0x100000000); real scale = 1/((real) 0x100000000);
return make_real3(scale*force[index], scale*force[index+PADDED_NUM_ATOMS], scale*force[index+PADDED_NUM_ATOMS*2]); return make_real3(scale*force[index], scale*force[index+PADDED_NUM_ATOMS], scale*force[index+PADDED_NUM_ATOMS*2]);
} }
inline __device__ void addForce(int index, long long* __restrict__ force, real3 value) { inline DEVICE void addForce(int index, GLOBAL mm_long* RESTRICT force, real3 value) {
unsigned long long* f = (unsigned long long*) force; GLOBAL mm_ulong* f = (GLOBAL mm_ulong*) force;
atomicAdd(&f[index], static_cast<unsigned long long>((long long) (value.x*0x100000000))); #ifdef HAS_OVERLAPPING_VSITES
atomicAdd(&f[index+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (value.y*0x100000000))); ATOMIC_ADD(&f[index], (mm_ulong) ((mm_long) (value.x*0x100000000)));
atomicAdd(&f[index+PADDED_NUM_ATOMS*2], static_cast<unsigned long long>((long long) (value.z*0x100000000))); ATOMIC_ADD(&f[index+PADDED_NUM_ATOMS], (mm_ulong) ((mm_long) (value.y*0x100000000)));
ATOMIC_ADD(&f[index+PADDED_NUM_ATOMS*2], (mm_ulong) ((mm_long) (value.z*0x100000000)));
#else
f[index] += (mm_ulong) ((mm_long) (value.x*0x100000000));
f[index+PADDED_NUM_ATOMS] += (mm_ulong) ((mm_long) (value.y*0x100000000));
f[index+PADDED_NUM_ATOMS*2] += (mm_ulong) ((mm_long) (value.z*0x100000000));
#endif
} }
/** /**
* Distribute forces from virtual sites to the atoms they are based on. * Distribute forces from virtual sites to the atoms they are based on.
*/ */
extern "C" __global__ void distributeVirtualSiteForces(const real4* __restrict__ posq, const real4* __restrict__ posqCorrection, long long* __restrict__ force, KERNEL void distributeVirtualSiteForces(GLOBAL const real4* RESTRICT posq, GLOBAL const real4* RESTRICT posqCorrection, GLOBAL mm_long* RESTRICT force,
const int4* __restrict__ avg2Atoms, const real2* __restrict__ avg2Weights, GLOBAL const int4* RESTRICT avg2Atoms, GLOBAL const real2* RESTRICT avg2Weights,
const int4* __restrict__ avg3Atoms, const real4* __restrict__ avg3Weights, GLOBAL const int4* RESTRICT avg3Atoms, GLOBAL const real4* RESTRICT avg3Weights,
const int4* __restrict__ outOfPlaneAtoms, const real4* __restrict__ outOfPlaneWeights, GLOBAL const int4* RESTRICT outOfPlaneAtoms, GLOBAL const real4* RESTRICT outOfPlaneWeights,
const int* __restrict__ localCoordsIndex, const int* __restrict__ localCoordsAtoms, GLOBAL const int* RESTRICT localCoordsIndex, GLOBAL const int* RESTRICT localCoordsAtoms,
const real* __restrict__ localCoordsWeights, const real4* __restrict__ localCoordsPos, GLOBAL const real* RESTRICT localCoordsWeights, GLOBAL const real4* RESTRICT localCoordsPos,
const int* __restrict__ localCoordsStartIndex) { GLOBAL const int* RESTRICT localCoordsStartIndex) {
// Two particle average sites. // Two particle average sites.
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_2_AVERAGE; index += blockDim.x*gridDim.x) { for (int index = GLOBAL_ID; index < NUM_2_AVERAGE; index += GLOBAL_SIZE) {
int4 atoms = avg2Atoms[index]; int4 atoms = avg2Atoms[index];
real2 weights = avg2Weights[index]; real2 weights = avg2Weights[index];
real3 f = loadForce(atoms.x, force); real3 f = loadForce(atoms.x, force);
...@@ -797,7 +848,7 @@ extern "C" __global__ void distributeVirtualSiteForces(const real4* __restrict__ ...@@ -797,7 +848,7 @@ extern "C" __global__ void distributeVirtualSiteForces(const real4* __restrict__
// Three particle average sites. // Three particle average sites.
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_3_AVERAGE; index += blockDim.x*gridDim.x) { for (int index = GLOBAL_ID; index < NUM_3_AVERAGE; index += GLOBAL_SIZE) {
int4 atoms = avg3Atoms[index]; int4 atoms = avg3Atoms[index];
real4 weights = avg3Weights[index]; real4 weights = avg3Weights[index];
real3 f = loadForce(atoms.x, force); real3 f = loadForce(atoms.x, force);
...@@ -808,7 +859,7 @@ extern "C" __global__ void distributeVirtualSiteForces(const real4* __restrict__ ...@@ -808,7 +859,7 @@ extern "C" __global__ void distributeVirtualSiteForces(const real4* __restrict__
// Out of plane sites. // Out of plane sites.
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_OUT_OF_PLANE; index += blockDim.x*gridDim.x) { for (int index = GLOBAL_ID; index < NUM_OUT_OF_PLANE; index += GLOBAL_SIZE) {
int4 atoms = outOfPlaneAtoms[index]; int4 atoms = outOfPlaneAtoms[index];
real4 weights = outOfPlaneWeights[index]; real4 weights = outOfPlaneWeights[index];
mixed4 pos1 = loadPos(posq, posqCorrection, atoms.y); mixed4 pos1 = loadPos(posq, posqCorrection, atoms.y);
...@@ -830,7 +881,7 @@ extern "C" __global__ void distributeVirtualSiteForces(const real4* __restrict__ ...@@ -830,7 +881,7 @@ extern "C" __global__ void distributeVirtualSiteForces(const real4* __restrict__
// Local coordinates sites. // Local coordinates sites.
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_LOCAL_COORDS; index += blockDim.x*gridDim.x) { for (int index = GLOBAL_ID; index < NUM_LOCAL_COORDS; index += GLOBAL_SIZE) {
int siteAtomIndex = localCoordsIndex[index]; int siteAtomIndex = localCoordsIndex[index];
int start = localCoordsStartIndex[index]; int start = localCoordsStartIndex[index];
int end = localCoordsStartIndex[index+1]; int end = localCoordsStartIndex[index+1];
...@@ -884,12 +935,22 @@ extern "C" __global__ void distributeVirtualSiteForces(const real4* __restrict__ ...@@ -884,12 +935,22 @@ extern "C" __global__ void distributeVirtualSiteForces(const real4* __restrict__
} }
} }
/**
* Copy the distributed forces from the long buffer back to the float buffer.
*/
KERNEL void saveDistributedForces(GLOBAL const mm_long* RESTRICT longForces, GLOBAL real4* RESTRICT forces) {
for (int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
real3 f = loadForce(index, longForces);
forces[index] = make_real4(f.x, f.y, f.z, 0);
}
}
/** /**
* Apply a time shift to the velocities before computing kinetic energy. * Apply a time shift to the velocities before computing kinetic energy.
*/ */
extern "C" __global__ void timeShiftVelocities(mixed4* __restrict__ velm, const long long* __restrict__ force, real timeShift) { KERNEL void timeShiftVelocities(GLOBAL mixed4* RESTRICT velm, GLOBAL const mm_long* RESTRICT force, real timeShift) {
const mixed scale = timeShift/(mixed) 0x100000000; const mixed scale = timeShift/(mixed) 0x100000000;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_ATOMS; index += blockDim.x*gridDim.x) { for (int index = GLOBAL_ID; index < NUM_ATOMS; index += GLOBAL_SIZE) {
mixed4 velocity = velm[index]; mixed4 velocity = velm[index];
if (velocity.w != 0.0) { if (velocity.w != 0.0) {
velocity.x += scale*force[index]*velocity.w; velocity.x += scale*force[index]*velocity.w;
......
...@@ -4,13 +4,13 @@ enum {VelScale, ForceScale, NoiseScale, MaxParams}; ...@@ -4,13 +4,13 @@ enum {VelScale, ForceScale, NoiseScale, MaxParams};
* Perform the first step of Langevin integration. * Perform the first step of Langevin integration.
*/ */
extern "C" __global__ void integrateLangevinPart1(int numAtoms, int paddedNumAtoms, mixed4* __restrict__ velm, const long long* __restrict__ force, mixed4* __restrict__ posDelta, KERNEL void integrateLangevinPart1(int numAtoms, int paddedNumAtoms, GLOBAL mixed4* RESTRICT velm, GLOBAL const mm_long* RESTRICT force, GLOBAL mixed4* RESTRICT posDelta,
const mixed* __restrict__ paramBuffer, const mixed2* __restrict__ dt, const float4* __restrict__ random, unsigned int randomIndex) { GLOBAL const mixed* RESTRICT paramBuffer, GLOBAL const mixed2* RESTRICT dt, GLOBAL const float4* RESTRICT random, unsigned int randomIndex) {
mixed vscale = paramBuffer[VelScale]; mixed vscale = paramBuffer[VelScale];
mixed fscale = paramBuffer[ForceScale]/(mixed) 0x100000000; mixed fscale = paramBuffer[ForceScale]/(mixed) 0x100000000;
mixed noisescale = paramBuffer[NoiseScale]; mixed noisescale = paramBuffer[NoiseScale];
mixed stepSize = dt[0].y; mixed stepSize = dt[0].y;
int index = blockIdx.x*blockDim.x+threadIdx.x; int index = GLOBAL_ID;
randomIndex += index; randomIndex += index;
while (index < numAtoms) { while (index < numAtoms) {
mixed4 velocity = velm[index]; mixed4 velocity = velm[index];
...@@ -22,8 +22,8 @@ extern "C" __global__ void integrateLangevinPart1(int numAtoms, int paddedNumAto ...@@ -22,8 +22,8 @@ extern "C" __global__ void integrateLangevinPart1(int numAtoms, int paddedNumAto
velm[index] = velocity; velm[index] = velocity;
posDelta[index] = make_mixed4(stepSize*velocity.x, stepSize*velocity.y, stepSize*velocity.z, 0); posDelta[index] = make_mixed4(stepSize*velocity.x, stepSize*velocity.y, stepSize*velocity.z, 0);
} }
randomIndex += blockDim.x*gridDim.x; randomIndex += GLOBAL_SIZE;
index += blockDim.x*gridDim.x; index += GLOBAL_SIZE;
} }
} }
...@@ -31,14 +31,18 @@ extern "C" __global__ void integrateLangevinPart1(int numAtoms, int paddedNumAto ...@@ -31,14 +31,18 @@ extern "C" __global__ void integrateLangevinPart1(int numAtoms, int paddedNumAto
* Perform the second step of Langevin integration. * Perform the second step of Langevin integration.
*/ */
extern "C" __global__ void integrateLangevinPart2(int numAtoms, real4* __restrict__ posq, real4* __restrict__ posqCorrection, const mixed4* __restrict__ posDelta, mixed4* __restrict__ velm, const mixed2* __restrict__ dt) { KERNEL void integrateLangevinPart2(int numAtoms, GLOBAL real4* RESTRICT posq, GLOBAL const mixed4* RESTRICT posDelta, GLOBAL mixed4* RESTRICT velm, GLOBAL const mixed2* RESTRICT dt
#if __CUDA_ARCH__ >= 130 #ifdef USE_MIXED_PRECISION
, GLOBAL real4* RESTRICT posqCorrection
#endif
) {
#ifdef SUPPORTS_DOUBLE_PRECISION
double invStepSize = 1.0/dt[0].y; double invStepSize = 1.0/dt[0].y;
#else #else
float invStepSize = 1.0f/dt[0].y; float invStepSize = 1.0f/dt[0].y;
float correction = (1.0f-invStepSize*dt[0].y)/dt[0].y; float correction = (1.0f-invStepSize*dt[0].y)/dt[0].y;
#endif #endif
int index = blockIdx.x*blockDim.x+threadIdx.x; int index = GLOBAL_ID;
while (index < numAtoms) { while (index < numAtoms) {
mixed4 vel = velm[index]; mixed4 vel = velm[index];
if (vel.w != 0) { if (vel.w != 0) {
...@@ -53,7 +57,7 @@ extern "C" __global__ void integrateLangevinPart2(int numAtoms, real4* __restric ...@@ -53,7 +57,7 @@ extern "C" __global__ void integrateLangevinPart2(int numAtoms, real4* __restric
pos.x += delta.x; pos.x += delta.x;
pos.y += delta.y; pos.y += delta.y;
pos.z += delta.z; pos.z += delta.z;
#if __CUDA_ARCH__ >= 130 #ifdef SUPPORTS_DOUBLE_PRECISION
vel.x = (mixed) (invStepSize*delta.x); vel.x = (mixed) (invStepSize*delta.x);
vel.y = (mixed) (invStepSize*delta.y); vel.y = (mixed) (invStepSize*delta.y);
vel.z = (mixed) (invStepSize*delta.z); vel.z = (mixed) (invStepSize*delta.z);
...@@ -70,7 +74,7 @@ extern "C" __global__ void integrateLangevinPart2(int numAtoms, real4* __restric ...@@ -70,7 +74,7 @@ extern "C" __global__ void integrateLangevinPart2(int numAtoms, real4* __restric
#endif #endif
velm[index] = vel; velm[index] = vel;
} }
index += blockDim.x*gridDim.x; index += GLOBAL_SIZE;
} }
} }
...@@ -78,32 +82,30 @@ extern "C" __global__ void integrateLangevinPart2(int numAtoms, real4* __restric ...@@ -78,32 +82,30 @@ extern "C" __global__ void integrateLangevinPart2(int numAtoms, real4* __restric
* Select the step size to use for the next step. * Select the step size to use for the next step.
*/ */
extern "C" __global__ void selectLangevinStepSize(int numAtoms, int paddedNumAtoms, mixed maxStepSize, mixed errorTol, mixed friction, mixed kT, mixed2* __restrict__ dt, KERNEL void selectLangevinStepSize(int numAtoms, int paddedNumAtoms, mixed maxStepSize, mixed errorTol, mixed friction, mixed kT, GLOBAL mixed2* RESTRICT dt,
const mixed4* __restrict__ velm, const long long* __restrict__ force, mixed* __restrict__ paramBuffer) { GLOBAL const mixed4* RESTRICT velm, GLOBAL const mm_long* RESTRICT force, GLOBAL mixed* RESTRICT paramBuffer) {
// Calculate the error. // Calculate the error.
extern __shared__ mixed params[]; LOCAL mixed error[256];
mixed* error = &params[MaxParams]; LOCAL mixed params[MaxParams];
mixed err = 0; mixed err = 0;
unsigned int index = threadIdx.x;
const mixed scale = RECIP((mixed) 0x100000000); const mixed scale = RECIP((mixed) 0x100000000);
while (index < numAtoms) { for (int index = LOCAL_ID; index < numAtoms; index += LOCAL_SIZE) {
mixed3 f = make_mixed3(scale*force[index], scale*force[index+paddedNumAtoms], scale*force[index+paddedNumAtoms*2]); mixed3 f = make_mixed3(scale*force[index], scale*force[index+paddedNumAtoms], scale*force[index+paddedNumAtoms*2]);
mixed invMass = velm[index].w; mixed invMass = velm[index].w;
err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass*invMass; err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass*invMass;
index += blockDim.x*gridDim.x;
} }
error[threadIdx.x] = err; error[LOCAL_ID] = err;
__syncthreads(); SYNC_THREADS;
// Sum the errors from all threads. // Sum the errors from all threads.
for (unsigned int offset = 1; offset < blockDim.x; offset *= 2) { for (unsigned int offset = 1; offset < LOCAL_SIZE; offset *= 2) {
if (threadIdx.x+offset < blockDim.x && (threadIdx.x&(2*offset-1)) == 0) if (LOCAL_ID+offset < LOCAL_SIZE && (LOCAL_ID&(2*offset-1)) == 0)
error[threadIdx.x] += error[threadIdx.x+offset]; error[LOCAL_ID] += error[LOCAL_ID+offset];
__syncthreads(); SYNC_THREADS;
} }
if (blockIdx.x*blockDim.x+threadIdx.x == 0) { if (GLOBAL_ID == 0) {
// Select the new step size. // Select the new step size.
mixed totalError = SQRT(error[0]/(numAtoms*3)); mixed totalError = SQRT(error[0]/(numAtoms*3));
...@@ -126,7 +128,7 @@ extern "C" __global__ void selectLangevinStepSize(int numAtoms, int paddedNumAto ...@@ -126,7 +128,7 @@ extern "C" __global__ void selectLangevinStepSize(int numAtoms, int paddedNumAto
params[ForceScale] = fscale; params[ForceScale] = fscale;
params[NoiseScale] = noisescale; params[NoiseScale] = noisescale;
} }
__syncthreads(); SYNC_THREADS;
if (threadIdx.x < MaxParams) if (LOCAL_ID < MaxParams)
paramBuffer[threadIdx.x] = params[threadIdx.x]; paramBuffer[LOCAL_ID] = params[LOCAL_ID];
} }
enum {VelScale, NoiseScale}; enum {VelScale, NoiseScale};
/** /**
* Perform the first part of BAOAB integration: velocity half step, then position half step. * Perform the first part of integration: velocity step.
*/ */
extern "C" __global__ void integrateBAOABPart1(int numAtoms, int paddedNumAtoms, mixed4* __restrict__ velm, const long long* __restrict__ force, mixed4* __restrict__ posDelta, KERNEL void integrateLangevinMiddlePart1(int numAtoms, int paddedNumAtoms, GLOBAL mixed4* RESTRICT velm, GLOBAL const mm_long* RESTRICT force,
mixed4* __restrict__ oldDelta, const mixed2* __restrict__ dt) { GLOBAL const mixed2* RESTRICT dt) {
mixed halfdt = 0.5*dt[0].y; mixed fscale = dt[0].y/(mixed) 0x100000000;
mixed fscale = halfdt/(mixed) 0x100000000; for (int index = GLOBAL_ID; index < numAtoms; index += GLOBAL_SIZE) {
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
mixed4 velocity = velm[index]; mixed4 velocity = velm[index];
if (velocity.w != 0.0) { if (velocity.w != 0.0) {
velocity.x += fscale*velocity.w*force[index]; velocity.x += fscale*velocity.w*force[index];
velocity.y += fscale*velocity.w*force[index+paddedNumAtoms]; velocity.y += fscale*velocity.w*force[index+paddedNumAtoms];
velocity.z += fscale*velocity.w*force[index+paddedNumAtoms*2]; velocity.z += fscale*velocity.w*force[index+paddedNumAtoms*2];
velm[index] = velocity; velm[index] = velocity;
mixed4 delta = make_mixed4(halfdt*velocity.x, halfdt*velocity.y, halfdt*velocity.z, 0);
posDelta[index] = delta;
oldDelta[index] = delta;
} }
} }
} }
/** /**
* Perform the second part of BAOAB integration: apply constraint forces to velocities, then interact with heat bath, * Perform the second part of integration: position half step, then interact with heat bath,
* then position half step. * then another position half step.
*/ */
extern "C" __global__ void integrateBAOABPart2(int numAtoms, real4* __restrict__ posq, real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, mixed4* __restrict__ posDelta, KERNEL void integrateLangevinMiddlePart2(int numAtoms, GLOBAL mixed4* RESTRICT velm, GLOBAL mixed4* RESTRICT posDelta,
mixed4* __restrict__ oldDelta, const mixed* __restrict__ paramBuffer, const mixed2* __restrict__ dt, const float4* __restrict__ random, unsigned int randomIndex) { GLOBAL mixed4* RESTRICT oldDelta, GLOBAL const mixed* RESTRICT paramBuffer, GLOBAL const mixed2* RESTRICT dt, GLOBAL const float4* RESTRICT random, unsigned int randomIndex
) {
mixed vscale = paramBuffer[VelScale]; mixed vscale = paramBuffer[VelScale];
mixed noisescale = paramBuffer[NoiseScale]; mixed noisescale = paramBuffer[NoiseScale];
mixed halfdt = 0.5*dt[0].y; mixed halfdt = 0.5*dt[0].y;
mixed invHalfdt = 1/halfdt; int index = GLOBAL_ID;
int index = blockIdx.x*blockDim.x+threadIdx.x;
randomIndex += index; randomIndex += index;
while (index < numAtoms) { while (index < numAtoms) {
mixed4 velocity = velm[index]; mixed4 velocity = velm[index];
if (velocity.w != 0.0) { if (velocity.w != 0.0) {
mixed4 delta = posDelta[index]; mixed4 delta = make_mixed4(halfdt*velocity.x, halfdt*velocity.y, halfdt*velocity.z, 0);
mixed sqrtInvMass = SQRT(velocity.w); mixed sqrtInvMass = SQRT(velocity.w);
velocity.x += (delta.x-oldDelta[index].x)*invHalfdt;
velocity.y += (delta.y-oldDelta[index].y)*invHalfdt;
velocity.z += (delta.z-oldDelta[index].z)*invHalfdt;
velocity.x = vscale*velocity.x + noisescale*sqrtInvMass*random[randomIndex].x; velocity.x = vscale*velocity.x + noisescale*sqrtInvMass*random[randomIndex].x;
velocity.y = vscale*velocity.y + noisescale*sqrtInvMass*random[randomIndex].y; velocity.y = vscale*velocity.y + noisescale*sqrtInvMass*random[randomIndex].y;
velocity.z = vscale*velocity.z + noisescale*sqrtInvMass*random[randomIndex].z; velocity.z = vscale*velocity.z + noisescale*sqrtInvMass*random[randomIndex].z;
velm[index] = velocity; velm[index] = velocity;
#ifdef USE_MIXED_PRECISION delta += make_mixed4(halfdt*velocity.x, halfdt*velocity.y, halfdt*velocity.z, 0);
real4 pos1 = posq[index];
real4 pos2 = posqCorrection[index];
mixed4 pos = make_mixed4(pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
#else
real4 pos = posq[index];
#endif
pos.x += delta.x;
pos.y += delta.y;
pos.z += delta.z;
#ifdef USE_MIXED_PRECISION
posq[index] = make_real4((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
posqCorrection[index] = make_real4(pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
#else
posq[index] = pos;
#endif
delta = make_mixed4(halfdt*velocity.x, halfdt*velocity.y, halfdt*velocity.z, 0);
posDelta[index] = delta; posDelta[index] = delta;
oldDelta[index] = delta; oldDelta[index] = delta;
} }
randomIndex += blockDim.x*gridDim.x; randomIndex += GLOBAL_SIZE;
index += blockDim.x*gridDim.x; index += GLOBAL_SIZE;
} }
} }
/** /**
* Perform the third part of BAOAB integration: apply constraint forces to velocities, then record * Perform the third part of integration: apply constraint forces to velocities, then record
* the constrained positions in preparation for computing forces. * the constrained positions.
*/ */
extern "C" __global__ void integrateBAOABPart3(int numAtoms, real4* __restrict__ posq, real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, KERNEL void integrateLangevinMiddlePart3(int numAtoms, GLOBAL real4* RESTRICT posq, GLOBAL mixed4* RESTRICT velm,
mixed4* __restrict__ posDelta, mixed4* __restrict__ oldDelta, const mixed2* __restrict__ dt) { GLOBAL mixed4* RESTRICT posDelta, GLOBAL mixed4* RESTRICT oldDelta, GLOBAL const mixed2* RESTRICT dt
mixed halfdt = 0.5*dt[0].y; #ifdef USE_MIXED_PRECISION
mixed invHalfdt = 1/halfdt; , GLOBAL real4* RESTRICT posqCorrection
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) { #endif
) {
mixed invDt = 1/dt[0].y;
for (int index = GLOBAL_ID; index < numAtoms; index += GLOBAL_SIZE) {
mixed4 velocity = velm[index]; mixed4 velocity = velm[index];
if (velocity.w != 0.0) { if (velocity.w != 0.0) {
mixed4 delta = posDelta[index]; mixed4 delta = posDelta[index];
velocity.x += (delta.x-oldDelta[index].x)*invHalfdt; velocity.x += (delta.x-oldDelta[index].x)*invDt;
velocity.y += (delta.y-oldDelta[index].y)*invHalfdt; velocity.y += (delta.y-oldDelta[index].y)*invDt;
velocity.z += (delta.z-oldDelta[index].z)*invHalfdt; velocity.z += (delta.z-oldDelta[index].z)*invDt;
velm[index] = velocity; velm[index] = velocity;
#ifdef USE_MIXED_PRECISION #ifdef USE_MIXED_PRECISION
real4 pos1 = posq[index]; real4 pos1 = posq[index];
...@@ -108,22 +88,3 @@ extern "C" __global__ void integrateBAOABPart3(int numAtoms, real4* __restrict__ ...@@ -108,22 +88,3 @@ extern "C" __global__ void integrateBAOABPart3(int numAtoms, real4* __restrict__
} }
} }
} }
/**
* Perform the fourth part of BAOAB integration: velocity half step.
*/
extern "C" __global__ void integrateBAOABPart4(int numAtoms, int paddedNumAtoms, mixed4* __restrict__ velm,
const long long* __restrict__ force, const mixed2* __restrict__ dt) {
mixed halfdt = 0.5*dt[0].y;
mixed fscale = halfdt/(mixed) 0x100000000;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) {
mixed4 velocity = velm[index];
if (velocity.w != 0.0) {
velocity.x += fscale*velocity.w*force[index];
velocity.y += fscale*velocity.w*force[index+paddedNumAtoms];
velocity.z += fscale*velocity.w*force[index+paddedNumAtoms*2];
velm[index] = velocity;
}
}
}
...@@ -2,111 +2,79 @@ ...@@ -2,111 +2,79 @@
* Calculate the center of mass momentum. * Calculate the center of mass momentum.
*/ */
__kernel void calcCenterOfMassMomentum(int numAtoms, __global const mixed4* restrict velm, __global float4* restrict cmMomentum, __local volatile float4* restrict temp) { KERNEL void calcCenterOfMassMomentum(int numAtoms, GLOBAL const mixed4* RESTRICT velm, GLOBAL float3* RESTRICT cmMomentum) {
int index = get_global_id(0); LOCAL float3 temp[64];
float4 cm = 0.0f; float3 cm = make_float3(0, 0, 0);
while (index < numAtoms) { for (int index = GLOBAL_ID; index < numAtoms; index += GLOBAL_SIZE) {
mixed4 velocity = velm[index]; mixed4 velocity = velm[index];
if (velocity.w != 0) { if (velocity.w != 0) {
cm.x += velocity.x/velocity.w; mixed mass = RECIP(velocity.w);
cm.y += velocity.y/velocity.w; cm.x += (float) (velocity.x*mass);
cm.z += velocity.z/velocity.w; cm.y += (float) (velocity.y*mass);
cm.z += (float) (velocity.z*mass);
} }
index += get_global_size(0);
} }
// Sum the threads in this group. // Sum the threads in this group.
int thread = get_local_id(0); int thread = LOCAL_ID;
temp[thread] = cm; temp[thread] = cm;
barrier(CLK_LOCAL_MEM_FENCE); SYNC_THREADS;
#ifdef WARPS_ARE_ATOMIC
if (thread < 32) {
temp[thread] += temp[thread+32];
if (thread < 16)
temp[thread] += temp[thread+16];
if (thread < 8)
temp[thread] += temp[thread+8];
if (thread < 4)
temp[thread] += temp[thread+4];
if (thread < 2)
temp[thread] += temp[thread+2];
}
#else
if (thread < 32) if (thread < 32)
temp[thread] += temp[thread+32]; temp[thread] += temp[thread+32];
barrier(CLK_LOCAL_MEM_FENCE); SYNC_THREADS;
if (thread < 16) if (thread < 16)
temp[thread] += temp[thread+16]; temp[thread] += temp[thread+16];
barrier(CLK_LOCAL_MEM_FENCE); SYNC_THREADS;
if (thread < 8) if (thread < 8)
temp[thread] += temp[thread+8]; temp[thread] += temp[thread+8];
barrier(CLK_LOCAL_MEM_FENCE); SYNC_THREADS;
if (thread < 4) if (thread < 4)
temp[thread] += temp[thread+4]; temp[thread] += temp[thread+4];
barrier(CLK_LOCAL_MEM_FENCE); SYNC_THREADS;
if (thread < 2) if (thread < 2)
temp[thread] += temp[thread+2]; temp[thread] += temp[thread+2];
barrier(CLK_LOCAL_MEM_FENCE); SYNC_THREADS;
#endif
if (thread == 0) if (thread == 0)
cmMomentum[get_group_id(0)] = temp[thread]+temp[thread+1]; cmMomentum[GROUP_ID] = temp[thread]+temp[thread+1];
} }
/** /**
* Remove center of mass motion. * Remove center of mass motion.
*/ */
__kernel void removeCenterOfMassMomentum(unsigned int numAtoms, __global mixed4* restrict velm, __global const float4* restrict cmMomentum, __local volatile float4* restrict temp) { KERNEL void removeCenterOfMassMomentum(int numAtoms, GLOBAL mixed4* RESTRICT velm, GLOBAL const float3* RESTRICT cmMomentum) {
// First sum all of the momenta that were calculated by individual groups. // First sum all of the momenta that were calculated by individual groups.
unsigned int index = get_local_id(0); LOCAL float3 temp[64];
float4 cm = 0.0f; float3 cm = make_float3(0, 0, 0);
while (index < get_num_groups(0)) { for (int index = LOCAL_ID; index < NUM_GROUPS; index += LOCAL_SIZE)
cm += cmMomentum[index]; cm += cmMomentum[index];
index += get_local_size(0); int thread = LOCAL_ID;
}
int thread = get_local_id(0);
temp[thread] = cm; temp[thread] = cm;
barrier(CLK_LOCAL_MEM_FENCE); SYNC_THREADS;
#ifdef WARPS_ARE_ATOMIC
if (thread < 32) {
temp[thread] += temp[thread+32];
if (thread < 16)
temp[thread] += temp[thread+16];
if (thread < 8)
temp[thread] += temp[thread+8];
if (thread < 4)
temp[thread] += temp[thread+4];
if (thread < 2)
temp[thread] += temp[thread+2];
}
#else
if (thread < 32) if (thread < 32)
temp[thread] += temp[thread+32]; temp[thread] += temp[thread+32];
barrier(CLK_LOCAL_MEM_FENCE); SYNC_THREADS;
if (thread < 16) if (thread < 16)
temp[thread] += temp[thread+16]; temp[thread] += temp[thread+16];
barrier(CLK_LOCAL_MEM_FENCE); SYNC_THREADS;
if (thread < 8) if (thread < 8)
temp[thread] += temp[thread+8]; temp[thread] += temp[thread+8];
barrier(CLK_LOCAL_MEM_FENCE); SYNC_THREADS;
if (thread < 4) if (thread < 4)
temp[thread] += temp[thread+4]; temp[thread] += temp[thread+4];
barrier(CLK_LOCAL_MEM_FENCE); SYNC_THREADS;
if (thread < 2) if (thread < 2)
temp[thread] += temp[thread+2]; temp[thread] += temp[thread+2];
#endif SYNC_THREADS;
barrier(CLK_LOCAL_MEM_FENCE); cm = make_float3(INVERSE_TOTAL_MASS*(temp[0].x+temp[1].x), INVERSE_TOTAL_MASS*(temp[0].y+temp[1].y), INVERSE_TOTAL_MASS*(temp[0].z+temp[1].z));
cm = (float4) (INVERSE_TOTAL_MASS*(temp[0].x+temp[1].x), INVERSE_TOTAL_MASS*(temp[0].y+temp[1].y), INVERSE_TOTAL_MASS*(temp[0].z+temp[1].z), 0);
// Now remove the center of mass velocity from each atom. // Now remove the center of mass velocity from each atom.
index = get_global_id(0); for (int index = GLOBAL_ID; index < numAtoms; index += GLOBAL_SIZE) {
while (index < numAtoms) {
velm[index].x -= cm.x; velm[index].x -= cm.x;
velm[index].y -= cm.y; velm[index].y -= cm.y;
velm[index].z -= cm.z; velm[index].z -= cm.z;
index += get_global_size(0);
} }
} }
...@@ -4,20 +4,20 @@ ...@@ -4,20 +4,20 @@
/** /**
* Sum a value over all threads. * Sum a value over all threads.
*/ */
__device__ real reduceValue(real value, volatile real* temp) { DEVICE real reduceValue(real value, LOCAL_ARG volatile real* temp) {
const int thread = threadIdx.x; const int thread = LOCAL_ID;
__syncthreads(); SYNC_THREADS;
temp[thread] = value; temp[thread] = value;
__syncthreads(); SYNC_THREADS;
for (unsigned int step = 1; step < 32; step *= 2) { for (int step = 1; step < 32; step *= 2) {
if (thread+step < blockDim.x && thread%(2*step) == 0) if (thread+step < LOCAL_SIZE && thread%(2*step) == 0)
temp[thread] = temp[thread] + temp[thread+step]; temp[thread] = temp[thread] + temp[thread+step];
SYNC_WARPS SYNC_WARPS;
} }
for (unsigned int step = 32; step < blockDim.x; step *= 2) { for (int step = 32; step < LOCAL_SIZE; step *= 2) {
if (thread+step < blockDim.x && thread%(2*step) == 0) if (thread+step < LOCAL_SIZE && thread%(2*step) == 0)
temp[thread] = temp[thread] + temp[thread+step]; temp[thread] = temp[thread] + temp[thread+step];
__syncthreads(); SYNC_THREADS;
} }
return temp[0]; return temp[0];
} }
...@@ -25,14 +25,14 @@ __device__ real reduceValue(real value, volatile real* temp) { ...@@ -25,14 +25,14 @@ __device__ real reduceValue(real value, volatile real* temp) {
/** /**
* Perform the first step of computing the RMSD. This is executed as a single work group. * Perform the first step of computing the RMSD. This is executed as a single work group.
*/ */
extern "C" __global__ void computeRMSDPart1(int numParticles, const real4* __restrict__ posq, const real4* __restrict__ referencePos, KERNEL void computeRMSDPart1(int numParticles, GLOBAL const real4* RESTRICT posq, GLOBAL const real4* RESTRICT referencePos,
const int* __restrict__ particles, real* buffer) { GLOBAL const int* RESTRICT particles, GLOBAL real* buffer) {
extern __shared__ volatile real temp[]; LOCAL volatile real temp[THREAD_BLOCK_SIZE];
// Compute the center of the particle positions. // Compute the center of the particle positions.
real3 center = make_real3(0); real3 center = make_real3(0);
for (int i = threadIdx.x; i < numParticles; i += blockDim.x) for (int i = LOCAL_ID; i < numParticles; i += LOCAL_SIZE)
center += trimTo3(posq[particles[i]]); center += trimTo3(posq[particles[i]]);
center.x = reduceValue(center.x, temp)/numParticles; center.x = reduceValue(center.x, temp)/numParticles;
center.y = reduceValue(center.y, temp)/numParticles; center.y = reduceValue(center.y, temp)/numParticles;
...@@ -42,7 +42,7 @@ extern "C" __global__ void computeRMSDPart1(int numParticles, const real4* __res ...@@ -42,7 +42,7 @@ extern "C" __global__ void computeRMSDPart1(int numParticles, const real4* __res
real R[3][3] = {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}}; real R[3][3] = {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}};
real sum = 0; real sum = 0;
for (int i = threadIdx.x; i < numParticles; i += blockDim.x) { for (int i = LOCAL_ID; i < numParticles; i += LOCAL_SIZE) {
int index = particles[i]; int index = particles[i];
real3 pos = trimTo3(posq[index]) - center; real3 pos = trimTo3(posq[index]) - center;
real3 refPos = trimTo3(referencePos[index]); real3 refPos = trimTo3(referencePos[index]);
...@@ -64,7 +64,7 @@ extern "C" __global__ void computeRMSDPart1(int numParticles, const real4* __res ...@@ -64,7 +64,7 @@ extern "C" __global__ void computeRMSDPart1(int numParticles, const real4* __res
// Copy everything into the output buffer to send back to the host. // Copy everything into the output buffer to send back to the host.
if (threadIdx.x == 0) { if (LOCAL_ID == 0) {
for (int i = 0; i < 3; i++) for (int i = 0; i < 3; i++)
for (int j = 0; j < 3; j++) for (int j = 0; j < 3; j++)
buffer[3*i+j] = R[i][j]; buffer[3*i+j] = R[i][j];
...@@ -78,11 +78,11 @@ extern "C" __global__ void computeRMSDPart1(int numParticles, const real4* __res ...@@ -78,11 +78,11 @@ extern "C" __global__ void computeRMSDPart1(int numParticles, const real4* __res
/** /**
* Apply forces based on the RMSD. * Apply forces based on the RMSD.
*/ */
extern "C" __global__ void computeRMSDForces(int numParticles, int paddedNumAtoms, const real4* __restrict__ posq, const real4* __restrict__ referencePos, KERNEL void computeRMSDForces(int numParticles, int paddedNumAtoms, GLOBAL const real4* RESTRICT posq, GLOBAL const real4* RESTRICT referencePos,
const int* __restrict__ particles, const real* buffer, unsigned long long* __restrict__ forceBuffers) { GLOBAL const int* RESTRICT particles, GLOBAL const real* buffer, GLOBAL mm_long* RESTRICT forceBuffers) {
real3 center = make_real3(buffer[10], buffer[11], buffer[12]); real3 center = make_real3(buffer[10], buffer[11], buffer[12]);
real scale = 1 / (real) (buffer[9]*numParticles); real scale = 1 / (real) (buffer[9]*numParticles);
for (int i = blockDim.x*blockIdx.x+threadIdx.x; i < numParticles; i += blockDim.x*gridDim.x) { for (int i = GLOBAL_ID; i < numParticles; i += GLOBAL_SIZE) {
int index = particles[i]; int index = particles[i];
real3 pos = trimTo3(posq[index]) - center; real3 pos = trimTo3(posq[index]) - center;
real3 refPos = trimTo3(referencePos[index]); real3 refPos = trimTo3(referencePos[index]);
...@@ -90,8 +90,8 @@ extern "C" __global__ void computeRMSDForces(int numParticles, int paddedNumAtom ...@@ -90,8 +90,8 @@ extern "C" __global__ void computeRMSDForces(int numParticles, int paddedNumAtom
buffer[1]*refPos.x + buffer[4]*refPos.y + buffer[7]*refPos.z, buffer[1]*refPos.x + buffer[4]*refPos.y + buffer[7]*refPos.z,
buffer[2]*refPos.x + buffer[5]*refPos.y + buffer[8]*refPos.z); buffer[2]*refPos.x + buffer[5]*refPos.y + buffer[8]*refPos.z);
real3 force = (rotatedRef-pos)*scale; real3 force = (rotatedRef-pos)*scale;
atomicAdd(&forceBuffers[index], static_cast<unsigned long long>((long long) (force.x*0x100000000))); forceBuffers[index] += (mm_long) (force.x*0x100000000);
atomicAdd(&forceBuffers[index+paddedNumAtoms], static_cast<unsigned long long>((long long) (force.y*0x100000000))); forceBuffers[index+paddedNumAtoms] += (mm_long) (force.y*0x100000000);
atomicAdd(&forceBuffers[index+2*paddedNumAtoms], static_cast<unsigned long long>((long long) (force.z*0x100000000))); forceBuffers[index+2*paddedNumAtoms] += (mm_long) (force.z*0x100000000);
} }
} }
const real PI = (real) 3.14159265358979323846;
real3 v0 = make_real3(pos1.x-pos2.x, pos1.y-pos2.y, pos1.z-pos2.z);
real3 v1 = make_real3(pos3.x-pos2.x, pos3.y-pos2.y, pos3.z-pos2.z);
real3 v2 = make_real3(pos3.x-pos4.x, pos3.y-pos4.y, pos3.z-pos4.z);
#if APPLY_PERIODIC
APPLY_PERIODIC_TO_DELTA(v0)
APPLY_PERIODIC_TO_DELTA(v1)
APPLY_PERIODIC_TO_DELTA(v2)
#endif
real3 cp0 = cross(v0, v1);
real3 cp1 = cross(v1, v2);
real cosangle = dot(normalize(cp0), normalize(cp1));
real theta;
if (cosangle > 0.99f || cosangle < -0.99f) {
// We're close to the singularity in acos(), so take the cross product and use asin() instead.
real3 cross_prod = cross(cp0, cp1);
real scale = dot(cp0, cp0)*dot(cp1, cp1);
theta = ASIN(SQRT(dot(cross_prod, cross_prod)/scale));
if (cosangle < 0)
theta = PI-theta;
}
else
theta = ACOS(cosangle);
theta = (dot(v0, cp1) >= 0 ? theta : -theta);
COMPUTE_FORCE
real normCross1 = dot(cp0, cp0);
real normSqrBC = dot(v1, v1);
real normBC = SQRT(normSqrBC);
real normCross2 = dot(cp1, cp1);
real dp = RECIP(normSqrBC);
real4 ff = make_real4((-dEdAngle*normBC)/normCross1, dot(v0, v1)*dp, dot(v2, v1)*dp, (dEdAngle*normBC)/normCross2);
real3 force1 = ff.x*cp0;
real3 force4 = ff.w*cp1;
real3 s = ff.y*force1 - ff.z*force4;
real3 force2 = s-force1;
real3 force3 = -s-force4;
...@@ -2,13 +2,17 @@ ...@@ -2,13 +2,17 @@
* Perform the first step of Verlet integration. * Perform the first step of Verlet integration.
*/ */
extern "C" __global__ void integrateVerletPart1(int numAtoms, int paddedNumAtoms, const mixed2* __restrict__ dt, const real4* __restrict__ posq, KERNEL void integrateVerletPart1(int numAtoms, int paddedNumAtoms, GLOBAL const mixed2* RESTRICT dt, GLOBAL const real4* RESTRICT posq,
const real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, const long long* __restrict__ force, mixed4* __restrict__ posDelta) { GLOBAL mixed4* RESTRICT velm, GLOBAL const mm_long* RESTRICT force, GLOBAL mixed4* RESTRICT posDelta
#ifdef USE_MIXED_PRECISION
, GLOBAL const real4* RESTRICT posqCorrection
#endif
) {
const mixed2 stepSize = dt[0]; const mixed2 stepSize = dt[0];
const mixed dtPos = stepSize.y; const mixed dtPos = stepSize.y;
const mixed dtVel = 0.5f*(stepSize.x+stepSize.y); const mixed dtVel = 0.5f*(stepSize.x+stepSize.y);
const mixed scale = dtVel/(mixed) 0x100000000; const mixed scale = dtVel/(mixed) 0x100000000;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) { for (int index = GLOBAL_ID; index < numAtoms; index += GLOBAL_SIZE) {
mixed4 velocity = velm[index]; mixed4 velocity = velm[index];
if (velocity.w != 0.0) { if (velocity.w != 0.0) {
#ifdef USE_MIXED_PRECISION #ifdef USE_MIXED_PRECISION
...@@ -34,19 +38,24 @@ extern "C" __global__ void integrateVerletPart1(int numAtoms, int paddedNumAtoms ...@@ -34,19 +38,24 @@ extern "C" __global__ void integrateVerletPart1(int numAtoms, int paddedNumAtoms
* Perform the second step of Verlet integration. * Perform the second step of Verlet integration.
*/ */
extern "C" __global__ void integrateVerletPart2(int numAtoms, mixed2* __restrict__ dt, real4* __restrict__ posq, KERNEL void integrateVerletPart2(int numAtoms, GLOBAL mixed2* RESTRICT dt, GLOBAL real4* RESTRICT posq,
real4* __restrict__ posqCorrection, mixed4* __restrict__ velm, const mixed4* __restrict__ posDelta) { GLOBAL mixed4* RESTRICT velm, GLOBAL const mixed4* RESTRICT posDelta
#ifdef USE_MIXED_PRECISION
, GLOBAL real4* RESTRICT posqCorrection
#endif
) {
mixed2 stepSize = dt[0]; mixed2 stepSize = dt[0];
#if __CUDA_ARCH__ >= 130 #ifdef SUPPORTS_DOUBLE_PRECISION
double oneOverDt = 1.0/stepSize.y; double oneOverDt = 1.0/stepSize.y;
#else #else
float oneOverDt = 1.0f/stepSize.y; float oneOverDt = 1.0f/stepSize.y;
float correction = (1.0f-oneOverDt*stepSize.y)/stepSize.y; float correction = (1.0f-oneOverDt*stepSize.y)/stepSize.y;
#endif #endif
int index = blockIdx.x*blockDim.x+threadIdx.x; if (GLOBAL_ID == 0)
if (index == 0)
dt[0].x = stepSize.y; dt[0].x = stepSize.y;
for (; index < numAtoms; index += blockDim.x*gridDim.x) { SYNC_THREADS;
int index = GLOBAL_ID;
for (; index < numAtoms; index += GLOBAL_SIZE) {
mixed4 velocity = velm[index]; mixed4 velocity = velm[index];
if (velocity.w != 0.0) { if (velocity.w != 0.0) {
#ifdef USE_MIXED_PRECISION #ifdef USE_MIXED_PRECISION
...@@ -60,7 +69,7 @@ extern "C" __global__ void integrateVerletPart2(int numAtoms, mixed2* __restrict ...@@ -60,7 +69,7 @@ extern "C" __global__ void integrateVerletPart2(int numAtoms, mixed2* __restrict
pos.x += delta.x; pos.x += delta.x;
pos.y += delta.y; pos.y += delta.y;
pos.z += delta.z; pos.z += delta.z;
#if __CUDA_ARCH__ >= 130 #ifdef SUPPORTS_DOUBLE_PRECISION
velocity = make_mixed4((mixed) (delta.x*oneOverDt), (mixed) (delta.y*oneOverDt), (mixed) (delta.z*oneOverDt), velocity.w); velocity = make_mixed4((mixed) (delta.x*oneOverDt), (mixed) (delta.y*oneOverDt), (mixed) (delta.z*oneOverDt), velocity.w);
#else #else
velocity = make_mixed4((mixed) (delta.x*oneOverDt+delta.x*correction), (mixed) (delta.y*oneOverDt+delta.y*correction), (mixed) (delta.z*oneOverDt+delta.z*correction), velocity.w); velocity = make_mixed4((mixed) (delta.x*oneOverDt+delta.x*correction), (mixed) (delta.y*oneOverDt+delta.y*correction), (mixed) (delta.z*oneOverDt+delta.z*correction), velocity.w);
...@@ -80,28 +89,28 @@ extern "C" __global__ void integrateVerletPart2(int numAtoms, mixed2* __restrict ...@@ -80,28 +89,28 @@ extern "C" __global__ void integrateVerletPart2(int numAtoms, mixed2* __restrict
* Select the step size to use for the next step. * Select the step size to use for the next step.
*/ */
extern "C" __global__ void selectVerletStepSize(int numAtoms, int paddedNumAtoms, mixed maxStepSize, mixed errorTol, mixed2* __restrict__ dt, const mixed4* __restrict__ velm, const long long* __restrict__ force) { KERNEL void selectVerletStepSize(int numAtoms, int paddedNumAtoms, mixed maxStepSize, mixed errorTol, GLOBAL mixed2* RESTRICT dt, GLOBAL const mixed4* RESTRICT velm, GLOBAL const mm_long* RESTRICT force) {
// Calculate the error. // Calculate the error.
extern __shared__ mixed error[]; LOCAL mixed error[256];
mixed err = 0.0f; mixed err = 0;
const mixed scale = RECIP((mixed) 0x100000000); const mixed scale = RECIP((mixed) 0x100000000);
for (int index = threadIdx.x; index < numAtoms; index += blockDim.x*gridDim.x) { for (int index = LOCAL_ID; index < numAtoms; index += LOCAL_SIZE) {
mixed3 f = make_mixed3(scale*force[index], scale*force[index+paddedNumAtoms], scale*force[index+paddedNumAtoms*2]); mixed3 f = make_mixed3(scale*force[index], scale*force[index+paddedNumAtoms], scale*force[index+paddedNumAtoms*2]);
mixed invMass = velm[index].w; mixed invMass = velm[index].w;
err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass*invMass; err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass*invMass;
} }
error[threadIdx.x] = err; error[LOCAL_ID] = err;
__syncthreads(); SYNC_THREADS;
// Sum the errors from all threads. // Sum the errors from all threads.
for (unsigned int offset = 1; offset < blockDim.x; offset *= 2) { for (unsigned int offset = 1; offset < LOCAL_SIZE; offset *= 2) {
if (threadIdx.x+offset < blockDim.x && (threadIdx.x&(2*offset-1)) == 0) if (LOCAL_ID+offset < LOCAL_SIZE && (LOCAL_ID&(2*offset-1)) == 0)
error[threadIdx.x] += error[threadIdx.x+offset]; error[LOCAL_ID] += error[LOCAL_ID+offset];
__syncthreads(); SYNC_THREADS;
} }
if (threadIdx.x == 0) { if (LOCAL_ID == 0) {
mixed totalError = SQRT(error[0]/(numAtoms*3)); mixed totalError = SQRT(error[0]/(numAtoms*3));
mixed newStepSize = SQRT(errorTol/totalError); mixed newStepSize = SQRT(errorTol/totalError);
mixed oldStepSize = dt[0].y; mixed oldStepSize = dt[0].y;
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2013-2019 Stanford University and the Authors. * * Portions copyright (c) 2013-2020 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -32,7 +32,6 @@ ...@@ -32,7 +32,6 @@
* USE OR OTHER DEALINGS IN THE SOFTWARE. * * USE OR OTHER DEALINGS IN THE SOFTWARE. *
* -------------------------------------------------------------------------- */ * -------------------------------------------------------------------------- */
#include "CpuBAOABDynamics.h"
#include "CpuBondForce.h" #include "CpuBondForce.h"
#include "CpuCustomGBForce.h" #include "CpuCustomGBForce.h"
#include "CpuCustomManyParticleForce.h" #include "CpuCustomManyParticleForce.h"
...@@ -40,6 +39,7 @@ ...@@ -40,6 +39,7 @@
#include "CpuGayBerneForce.h" #include "CpuGayBerneForce.h"
#include "CpuGBSAOBCForce.h" #include "CpuGBSAOBCForce.h"
#include "CpuLangevinDynamics.h" #include "CpuLangevinDynamics.h"
#include "CpuLangevinMiddleDynamics.h"
#include "CpuNeighborList.h" #include "CpuNeighborList.h"
#include "CpuNonbondedForce.h" #include "CpuNonbondedForce.h"
#include "CpuPlatform.h" #include "CpuPlatform.h"
...@@ -274,7 +274,7 @@ private: ...@@ -274,7 +274,7 @@ private:
std::vector<std::vector<double> > bonded14ParamArray; std::vector<std::vector<double> > bonded14ParamArray;
double nonbondedCutoff, switchingDistance, rfDielectric, ewaldAlpha, ewaldDispersionAlpha, ewaldSelfEnergy, dispersionCoefficient; double nonbondedCutoff, switchingDistance, rfDielectric, ewaldAlpha, ewaldDispersionAlpha, ewaldSelfEnergy, dispersionCoefficient;
int kmax[3], gridSize[3], dispersionGridSize[3]; int kmax[3], gridSize[3], dispersionGridSize[3];
bool useSwitchingFunction, useOptimizedPme, hasInitializedPme, hasInitializedDispersionPme, hasParticleOffsets, hasExceptionOffsets; bool useSwitchingFunction, exceptionsArePeriodic, useOptimizedPme, hasInitializedPme, hasInitializedDispersionPme, hasParticleOffsets, hasExceptionOffsets;
std::vector<std::set<int> > exclusions; std::vector<std::set<int> > exclusions;
std::vector<std::pair<float, float> > particleParams; std::vector<std::pair<float, float> > particleParams;
std::vector<float> C6params; std::vector<float> C6params;
...@@ -538,42 +538,38 @@ private: ...@@ -538,42 +538,38 @@ private:
}; };
/** /**
* This kernel is invoked by BAOABLangevinIntegrator to take one time step. * This kernel is invoked by LangevinMiddleIntegrator to take one time step.
*/ */
class CpuIntegrateBAOABStepKernel : public IntegrateBAOABStepKernel { class CpuIntegrateLangevinMiddleStepKernel : public IntegrateLangevinMiddleStepKernel {
public: public:
CpuIntegrateBAOABStepKernel(std::string name, const Platform& platform, CpuPlatform::PlatformData& data) : IntegrateBAOABStepKernel(name, platform), CpuIntegrateLangevinMiddleStepKernel(std::string name, const Platform& platform, CpuPlatform::PlatformData& data) : IntegrateLangevinMiddleStepKernel(name, platform),
data(data), dynamics(0) { data(data), dynamics(0) {
} }
~CpuIntegrateBAOABStepKernel(); ~CpuIntegrateLangevinMiddleStepKernel();
/** /**
* Initialize the kernel, setting up the particle masses. * Initialize the kernel, setting up the particle masses.
* *
* @param system the System this kernel will be applied to * @param system the System this kernel will be applied to
* @param integrator the BAOABLangevinIntegrator this kernel will be used for * @param integrator the LangevinMiddleIntegrator this kernel will be used for
*/ */
void initialize(const System& system, const BAOABLangevinIntegrator& integrator); void initialize(const System& system, const LangevinMiddleIntegrator& integrator);
/** /**
* Execute the kernel. * Execute the kernel.
* *
* @param context the context in which to execute this kernel * @param context the context in which to execute this kernel
* @param integrator the BAOABLangevinIntegrator this kernel is being used for * @param integrator the LangevinMiddleIntegrator this kernel is being used for
* @param forcesAreValid if the context has been modified since the last time step, this will be
* false to show that cached forces are invalid and must be recalculated.
* On exit, this should specify whether the cached forces are valid at the
* end of the step.
*/ */
void execute(ContextImpl& context, const BAOABLangevinIntegrator& integrator, bool& forcesAreValid); void execute(ContextImpl& context, const LangevinMiddleIntegrator& integrator);
/** /**
* Compute the kinetic energy. * Compute the kinetic energy.
* *
* @param context the context in which to execute this kernel * @param context the context in which to execute this kernel
* @param integrator the BAOABLangevinIntegrator this kernel is being used for * @param integrator the LangevinMiddleIntegrator this kernel is being used for
*/ */
double computeKineticEnergy(ContextImpl& context, const BAOABLangevinIntegrator& integrator); double computeKineticEnergy(ContextImpl& context, const LangevinMiddleIntegrator& integrator);
private: private:
CpuPlatform::PlatformData& data; CpuPlatform::PlatformData& data;
CpuBAOABDynamics* dynamics; CpuLangevinMiddleDynamics* dynamics;
std::vector<double> masses; std::vector<double> masses;
double prevTemp, prevFriction, prevStepSize; double prevTemp, prevFriction, prevStepSize;
}; };
......
/* Portions copyright (c) 2013-2019 Stanford University and Simbios. /* Portions copyright (c) 2013-2020 Stanford University and Simbios.
* Authors: Peter Eastman * Authors: Peter Eastman
* Contributors: * Contributors:
* *
...@@ -23,17 +23,17 @@ ...@@ -23,17 +23,17 @@
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/ */
#ifndef __CPU_BAOAB_DYNAMICS_H__ #ifndef __CPU_LANGEVIN_MIDDLE_DYNAMICS_H__
#define __CPU_BAOAB_DYNAMICS_H__ #define __CPU_LANGEVIN_MIDDLE_DYNAMICS_H__
#include "ReferenceBAOABDynamics.h" #include "ReferenceLangevinMiddleDynamics.h"
#include "CpuRandom.h" #include "CpuRandom.h"
#include "openmm/internal/ThreadPool.h" #include "openmm/internal/ThreadPool.h"
#include "sfmt/SFMT.h" #include "sfmt/SFMT.h"
namespace OpenMM { namespace OpenMM {
class CpuBAOABDynamics : public ReferenceBAOABDynamics { class CpuLangevinMiddleDynamics : public ReferenceLangevinMiddleDynamics {
public: public:
/** /**
* Constructor. * Constructor.
...@@ -45,25 +45,22 @@ public: ...@@ -45,25 +45,22 @@ public:
* @param threads thread pool for parallelizing computation * @param threads thread pool for parallelizing computation
* @param random random number generator * @param random random number generator
*/ */
CpuBAOABDynamics(int numberOfAtoms, double deltaT, double friction, double temperature, OpenMM::ThreadPool& threads, OpenMM::CpuRandom& random); CpuLangevinMiddleDynamics(int numberOfAtoms, double deltaT, double friction, double temperature, OpenMM::ThreadPool& threads, OpenMM::CpuRandom& random);
/** /**
* Destructor. * Destructor.
*/ */
~CpuBAOABDynamics(); ~CpuLangevinMiddleDynamics();
/** /**
* First update step. * First update step.
* *
* @param numberOfAtoms number of atoms * @param numberOfAtoms number of atoms
* @param atomCoordinates atom coordinates
* @param velocities velocities * @param velocities velocities
* @param forces forces * @param forces forces
* @param inverseMasses inverse atom masses * @param inverseMasses inverse atom masses
* @param xPrime xPrime
*/ */
void updatePart1(int numberOfAtoms, std::vector<OpenMM::Vec3>& atomCoordinates, std::vector<OpenMM::Vec3>& velocities, void updatePart1(int numberOfAtoms, std::vector<OpenMM::Vec3>& velocities, std::vector<OpenMM::Vec3>& forces, std::vector<double>& inverseMasses);
std::vector<OpenMM::Vec3>& forces, std::vector<double>& inverseMasses, std::vector<OpenMM::Vec3>& xPrime);
/** /**
* Second update step. * Second update step.
...@@ -94,7 +91,6 @@ private: ...@@ -94,7 +91,6 @@ private:
void threadUpdate1(int threadIndex); void threadUpdate1(int threadIndex);
void threadUpdate2(int threadIndex); void threadUpdate2(int threadIndex);
void threadUpdate3(int threadIndex); void threadUpdate3(int threadIndex);
void threadUpdate4(int threadIndex);
OpenMM::ThreadPool& threads; OpenMM::ThreadPool& threads;
OpenMM::CpuRandom& random; OpenMM::CpuRandom& random;
std::vector<OpenMM_SFMT::SFMT> threadRandom; std::vector<OpenMM_SFMT::SFMT> threadRandom;
...@@ -109,4 +105,4 @@ private: ...@@ -109,4 +105,4 @@ private:
} // namespace OpenMM } // namespace OpenMM
#endif // __CPU_BAOAB_DYNAMICS_H__ #endif // __CPU_LANGEVIN_MIDDLE_DYNAMICS_H__
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment