"plugins/vscode:/vscode.git/clone" did not exist on "f794f818b20e2322d1feecfd6f4338c4e0b551aa"
Commit 93c467b2 authored by Peter Eastman
Browse files

Merged 5.1Optimizations branch back to trunk

parent f6d4557d
// Enable the 32 bit atomic operations on global memory.
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#ifdef SUPPORTS_64_BIT_ATOMICS
// Enable 64 bit atomics; when this symbol is defined the kernels below call atom_add() on long values.
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif
// Number of atoms in one block, and number of work-items that cooperate on one tile.
#define TILE_SIZE 32
// Number of TILE_SIZE-wide groups of work-items in one work-group.
// FORCE_WORK_GROUP_SIZE is not defined in this section; it must be supplied when the program is built.
#define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE)
/**
 * Per-atom record staged in local memory by computeBornSum.
 */
typedef struct {
    // Position and charge, copied from the x, y, z and w components of a posq entry.
    real x;
    real y;
    real z;
    real q;
    // The two components of the atom's global_params entry.
    float radius;
    float scaledRadius;
    // Running total of the Born sum terms contributed to this atom within the current tile.
    real bornSum;
} AtomData1;
/**
 * Compute the Born sum.
 *
 * Work-items are grouped into "warps" of TILE_SIZE consecutive work-items, and the tiles to
 * process are divided evenly among the warps. A tile pairs atom block x with atom block y
 * (TILE_SIZE atoms each). Work-item tgx of a warp owns atom x*TILE_SIZE+tgx and keeps its sum
 * in a private variable, while the atoms of block y are staged in local memory together with
 * the sums they receive.
 *
 * @param global_bornSum      output accumulator. With SUPPORTS_64_BIT_ATOMICS it holds 64 bit
 *                            fixed point values (the sum multiplied by 2^32) updated with
 *                            atom_add(); otherwise every work-group adds into its own slice of
 *                            PADDED_NUM_ATOMS reals.
 * @param posq                per-atom (x, y, z, q)
 * @param global_params       per-atom (radius, scaledRadius)
 * @param tiles               (USE_CUTOFF) the (x, y) block indices of each tile to process
 * @param interactionCount    (USE_CUTOFF) element 0 is the number of tiles in the list
 * @param periodicBoxSize     (USE_CUTOFF) box size, used only when USE_PERIODIC is defined
 * @param invPeriodicBoxSize  (USE_CUTOFF) factor applied to displacements before rounding;
 *                            presumably 1/periodicBoxSize - confirm against the caller
 * @param maxTiles            (USE_CUTOFF) if interactionCount[0] exceeds this value the tile list
 *                            is ignored and all NUM_BLOCKS*(NUM_BLOCKS+1)/2 tiles are enumerated
 * @param interactionFlags    (USE_CUTOFF) per-tile bit mask; bit j selects atom j of block y
 * @param numTiles            (no cutoff) the number of tiles to enumerate
 * @param exclusionIndices    block indices; the entries in the range given by
 *                            exclusionRowIndices[x] and exclusionRowIndices[x+1] are searched for y
 * @param exclusionRowIndices start offsets into exclusionIndices, indexed by block x
 */
__kernel void computeBornSum(
#ifdef SUPPORTS_64_BIT_ATOMICS
        __global long* restrict global_bornSum,
#else
        __global real* restrict global_bornSum,
#endif
        __global const real4* restrict posq, __global const float2* restrict global_params,
#ifdef USE_CUTOFF
        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags,
#else
        unsigned int numTiles,
#endif
        __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices) {
    unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
    unsigned int warp = get_global_id(0)/TILE_SIZE;
#ifdef USE_CUTOFF
    unsigned int numTiles = interactionCount[0];
    // Fall back to enumerating every tile when the tile list overflowed.
    const unsigned int tilesToProcess = (numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles);
#else
    const unsigned int tilesToProcess = numTiles;
#endif
    // The product warp*tilesToProcess can exceed 32 bits for large systems, so form it in
    // 64 bit arithmetic. The quotient never exceeds tilesToProcess, so it fits in 32 bits.
    unsigned int pos = (unsigned int) ((ulong) warp*tilesToProcess/totalWarps);
    unsigned int end = (unsigned int) ((ulong) (warp+1)*tilesToProcess/totalWarps);
    unsigned int lasty = 0xFFFFFFFF;
    __local AtomData1 localData[FORCE_WORK_GROUP_SIZE];
    __local real tempBuffer[FORCE_WORK_GROUP_SIZE];
    __local int2 reservedBlocks[WARPS_PER_GROUP];
    // The exclusion range scratch space shares storage with reservedBlocks (two entries per warp).
    __local unsigned int* exclusionRange = (__local unsigned int*) reservedBlocks;
    __local int exclusionIndex[WARPS_PER_GROUP];
    // NOTE(review): the loop body runs at least once per warp and, without 64 bit atomics, contains
    // barrier() calls; warps of one work-group may have tile ranges of different length - confirm
    // that the barrier usage is valid on the target devices.
    do {
        // Extract the coordinates of this tile
        const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);     // index within the warp
        const unsigned int tbx = get_local_id(0) - tgx;               // first local index of the warp
        const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
        unsigned int x, y;
        real bornSum = 0.0f;
        if (pos < end) {
#ifdef USE_CUTOFF
            if (numTiles <= maxTiles) {
                ushort2 tileIndices = tiles[pos];
                x = tileIndices.x;
                y = tileIndices.y;
            }
            else
#endif
            {
                // Convert the linear tile index into (x, y) with y <= x < NUM_BLOCKS.
                y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
                if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
                    y += (x < y ? -1 : 1);
                    x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
                }
            }
            unsigned int atom1 = x*TILE_SIZE + tgx;
            real4 posq1 = posq[atom1];
            float2 params1 = global_params[atom1];
            if (x == y) {
                // This tile is on the diagonal.
                localData[get_local_id(0)].x = posq1.x;
                localData[get_local_id(0)].y = posq1.y;
                localData[get_local_id(0)].z = posq1.z;
                localData[get_local_id(0)].q = posq1.w;
                localData[get_local_id(0)].radius = params1.x;
                localData[get_local_id(0)].scaledRadius = params1.y;
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
                    real4 delta = (real4) (localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z, 0);
#ifdef USE_PERIODIC
                    delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                    delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                    delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
                    real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                    if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
                    if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
#endif
                        real invR = RSQRT(r2);
                        real r = RECIP(invR);
                        float2 params2 = (float2) (localData[tbx+j].radius, localData[tbx+j].scaledRadius);
                        real rScaledRadiusJ = r+params2.y;
                        // j == tgx is the atom paired with itself, which is skipped.
                        if ((j != tgx) && (params1.x < rScaledRadiusJ)) {
                            real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
                            real u_ij = RECIP(rScaledRadiusJ);
                            real l_ij2 = l_ij*l_ij;
                            real u_ij2 = u_ij*u_ij;
                            real ratio = LOG(u_ij * RECIP(l_ij));
                            bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                             (params2.y*params2.y*invR)*(l_ij2-u_ij2));
                            if (params1.x < params2.y-r)
                                bornSum += 2.0f*(RECIP(params1.x)-l_ij);
                        }
                    }
                }
            }
            else {
                // This is an off-diagonal tile.
                if (lasty != y) {
                    // Block y is not the one already held in local memory, so load it.
                    unsigned int j = y*TILE_SIZE + tgx;
                    real4 tempPosq = posq[j];
                    localData[get_local_id(0)].x = tempPosq.x;
                    localData[get_local_id(0)].y = tempPosq.y;
                    localData[get_local_id(0)].z = tempPosq.z;
                    localData[get_local_id(0)].q = tempPosq.w;
                    float2 tempParams = global_params[j];
                    localData[get_local_id(0)].radius = tempParams.x;
                    localData[get_local_id(0)].scaledRadius = tempParams.y;
                }
                localData[get_local_id(0)].bornSum = 0.0f;
#ifdef USE_CUTOFF
                unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
                bool computeSubset = false;
                if (flags != 0xFFFFFFFF) {
                    // Search the exclusion list of block x for y. The subset path is used only
                    // when the tile is not found there.
                    // NOTE(review): these local memory exchanges have no barrier; they appear to
                    // rely on the work-items of a warp executing in lockstep - confirm.
                    if (tgx < 2)
                        exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
                    if (tgx == 0)
                        exclusionIndex[localGroupIndex] = -1;
                    for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
                        if (exclusionIndices[i] == y)
                            exclusionIndex[localGroupIndex] = i*TILE_SIZE;
                    computeSubset = (exclusionIndex[localGroupIndex] == -1);
                }
                if (computeSubset) {
                    if (flags == 0) {
                        // No interactions in this tile.
                    }
                    else {
                        // Compute only a subset of the interactions in this tile.
                        for (unsigned int j = 0; j < TILE_SIZE; j++) {
                            if ((flags&(1u<<j)) != 0) {
                                real4 delta = (real4) (localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z, 0);
#ifdef USE_PERIODIC
                                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
                                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                                tempBuffer[get_local_id(0)] = 0.0f;
                                // This code is only compiled with USE_CUTOFF, so the cutoff test always applies.
                                if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
                                    real invR = RSQRT(r2);
                                    real r = RECIP(invR);
                                    float2 params2 = (float2) (localData[tbx+j].radius, localData[tbx+j].scaledRadius);
                                    real rScaledRadiusJ = r+params2.y;
                                    if (params1.x < rScaledRadiusJ) {
                                        real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
                                        real u_ij = RECIP(rScaledRadiusJ);
                                        real l_ij2 = l_ij*l_ij;
                                        real u_ij2 = u_ij*u_ij;
                                        real ratio = LOG(u_ij * RECIP(l_ij));
                                        bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                                         (params2.y*params2.y*invR)*(l_ij2-u_ij2));
                                        if (params1.x < params2.y-r)
                                            bornSum += 2.0f*(RECIP(params1.x)-l_ij);
                                    }
                                    real rScaledRadiusI = r+params1.y;
                                    if (params2.x < rScaledRadiusI) {
                                        real l_ij = RECIP(max((real) params2.x, fabs(r-params1.y)));
                                        real u_ij = RECIP(rScaledRadiusI);
                                        real l_ij2 = l_ij*l_ij;
                                        real u_ij2 = u_ij*u_ij;
                                        real ratio = LOG(u_ij * RECIP(l_ij));
                                        real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                                         (params1.y*params1.y*invR)*(l_ij2-u_ij2));
                                        if (params2.x < params1.y-r)
                                            term += 2.0f*(RECIP(params2.x)-l_ij);
                                        tempBuffer[get_local_id(0)] = term;
                                    }
                                }
                                // Sum the warp's contributions to atom j: every fourth work-item
                                // adds up four entries, then work-item 0 adds the eight partial sums.
                                if (tgx % 4 == 0)
                                    tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+1]+tempBuffer[get_local_id(0)+2]+tempBuffer[get_local_id(0)+3];
                                if (tgx == 0)
                                    localData[tbx+j].bornSum += tempBuffer[get_local_id(0)]+tempBuffer[get_local_id(0)+4]+tempBuffer[get_local_id(0)+8]+tempBuffer[get_local_id(0)+12]+tempBuffer[get_local_id(0)+16]+tempBuffer[get_local_id(0)+20]+tempBuffer[get_local_id(0)+24]+tempBuffer[get_local_id(0)+28];
                            }
                        }
                    }
                }
                else
#endif
                {
                    // Compute the full set of interactions in this tile. Each work-item starts at
                    // a different atom of block y and advances cyclically, so at any given step
                    // the work-items of a warp address distinct local entries.
                    unsigned int tj = tgx;
                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
                        real4 delta = (real4) (localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z, 0);
#ifdef USE_PERIODIC
                        delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                        delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                        delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                        if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
                        if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
#endif
                            real invR = RSQRT(r2);
                            real r = RECIP(invR);
                            float2 params2 = (float2) (localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
                            real rScaledRadiusJ = r+params2.y;
                            if (params1.x < rScaledRadiusJ) {
                                real l_ij = RECIP(max((real) params1.x, fabs(r-params2.y)));
                                real u_ij = RECIP(rScaledRadiusJ);
                                real l_ij2 = l_ij*l_ij;
                                real u_ij2 = u_ij*u_ij;
                                real ratio = LOG(u_ij * RECIP(l_ij));
                                bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                                 (params2.y*params2.y*invR)*(l_ij2-u_ij2));
                                if (params1.x < params2.y-r)
                                    bornSum += 2.0f*(RECIP(params1.x)-l_ij);
                            }
                            real rScaledRadiusI = r+params1.y;
                            if (params2.x < rScaledRadiusI) {
                                real l_ij = RECIP(max((real) params2.x, fabs(r-params1.y)));
                                real u_ij = RECIP(rScaledRadiusI);
                                real l_ij2 = l_ij*l_ij;
                                real u_ij2 = u_ij*u_ij;
                                real ratio = LOG(u_ij * RECIP(l_ij));
                                real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
                                                 (params1.y*params1.y*invR)*(l_ij2-u_ij2));
                                if (params2.x < params1.y-r)
                                    term += 2.0f*(RECIP(params2.x)-l_ij);
                                localData[tbx+tj].bornSum += term;
                            }
                        }
                        tj = (tj + 1) & (TILE_SIZE - 1);
                    }
                }
            }
        }
        // Write results. We need to coordinate between warps to make sure no two of them
        // ever try to write to the same piece of memory at the same time.
#ifdef SUPPORTS_64_BIT_ATOMICS
        // Sums are converted to 64 bit fixed point (scaled by 2^32) and added atomically.
        if (pos < end) {
            const unsigned int offset = x*TILE_SIZE + tgx;
            atom_add(&global_bornSum[offset], (long) (bornSum*0x100000000));
        }
        if (pos < end && x != y) {
            const unsigned int offset = y*TILE_SIZE + tgx;
            atom_add(&global_bornSum[offset], (long) (localData[get_local_id(0)].bornSum*0x100000000));
        }
#else
        // Each warp publishes the blocks it wants to write (-1 means none), then waits until
        // no lower-numbered warp of the work-group still reserves one of the same blocks.
        int writeX = (pos < end ? x : -1);
        int writeY = (pos < end && x != y ? y : -1);
        if (tgx == 0)
            reservedBlocks[localGroupIndex] = (int2)(writeX, writeY);
        bool done = false;
        int doneIndex = 0;
        int checkIndex = 0;
        while (true) {
            // See if any warp still needs to write its data.
            bool allDone = true;
            barrier(CLK_LOCAL_MEM_FENCE);
            while (doneIndex < WARPS_PER_GROUP && allDone) {
                if (reservedBlocks[doneIndex].x != -1)
                    allDone = false;
                else
                    doneIndex++;
            }
            if (allDone)
                break;
            if (!done) {
                // See whether this warp can write its data.  This requires that no previous warp
                // is trying to write to the same block of the buffer.
                bool canWrite = (writeX != -1);
                while (checkIndex < localGroupIndex && canWrite) {
                    if ((reservedBlocks[checkIndex].x == x || reservedBlocks[checkIndex].y == x) ||
                            (writeY != -1 && (reservedBlocks[checkIndex].x == y || reservedBlocks[checkIndex].y == y)))
                        canWrite = false;
                    else
                        checkIndex++;
                }
                if (canWrite) {
                    // Write the data to global memory, then mark this warp as done.
                    if (writeX > -1) {
                        const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
                        global_bornSum[offset] += bornSum;
                    }
                    if (writeY > -1) {
                        const unsigned int offset = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
                        global_bornSum[offset] += localData[get_local_id(0)].bornSum;
                    }
                    done = true;
                    if (tgx == 0)
                        reservedBlocks[localGroupIndex] = (int2)(-1, -1);
                }
            }
        }
#endif
        lasty = y;
        pos++;
    } while (pos < end);
}
/**
 * Per-atom record staged in local memory by computeGBSAForce1.
 */
typedef struct {
    // Position and charge, copied from the x, y, z and w components of a posq entry.
    real x;
    real y;
    real z;
    real q;
    // Force accumulated on this atom within the current tile; fw is later added to global_bornForce.
    real fx;
    real fy;
    real fz;
    real fw;
    // The atom's entry of global_bornRadii.
    real bornRadius;
} AtomData2;
/**
 * First part of computing the GBSA interaction.
 *
 * Work-items are grouped into "warps" of TILE_SIZE consecutive work-items, and the tiles to
 * process are divided evenly among the warps. A tile pairs atom block x with atom block y
 * (TILE_SIZE atoms each). Work-item tgx of a warp owns atom x*TILE_SIZE+tgx and keeps its force
 * in a private variable, while the atoms of block y are staged in local memory together with
 * the forces they receive.
 *
 * @param forceBuffers        force accumulator. With SUPPORTS_64_BIT_ATOMICS it holds 64 bit fixed
 *                            point values (scaled by 2^32) with the x, y and z components stored
 *                            PADDED_NUM_ATOMS elements apart; otherwise every work-group adds into
 *                            its own slice of PADDED_NUM_ATOMS real4 values.
 * @param global_bornForce    accumulator for the fourth force component, laid out like one
 *                            component of forceBuffers
 * @param energyBuffer        each work-item adds its energy to the element at its global id
 * @param posq                per-atom (x, y, z, q)
 * @param global_bornRadii    per-atom Born radius
 * @param tiles               (USE_CUTOFF) the (x, y) block indices of each tile to process
 * @param interactionCount    (USE_CUTOFF) element 0 is the number of tiles in the list
 * @param periodicBoxSize     (USE_CUTOFF) box size, used only when USE_PERIODIC is defined
 * @param invPeriodicBoxSize  (USE_CUTOFF) factor applied to displacements before rounding;
 *                            presumably 1/periodicBoxSize - confirm against the caller
 * @param maxTiles            (USE_CUTOFF) if interactionCount[0] exceeds this value the tile list
 *                            is ignored and all NUM_BLOCKS*(NUM_BLOCKS+1)/2 tiles are enumerated
 * @param interactionFlags    (USE_CUTOFF) per-tile bit mask; bit j selects atom j of block y
 * @param numTiles            (no cutoff) the number of tiles to enumerate
 * @param exclusionIndices    block indices; the entries in the range given by
 *                            exclusionRowIndices[x] and exclusionRowIndices[x+1] are searched for y
 * @param exclusionRowIndices start offsets into exclusionIndices, indexed by block x
 */
__kernel void computeGBSAForce1(
#ifdef SUPPORTS_64_BIT_ATOMICS
        __global long* restrict forceBuffers, __global long* restrict global_bornForce,
#else
        __global real4* restrict forceBuffers, __global real* restrict global_bornForce,
#endif
        __global real* restrict energyBuffer, __global const real4* restrict posq, __global const real* restrict global_bornRadii,
#ifdef USE_CUTOFF
        __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags,
#else
        unsigned int numTiles,
#endif
        __global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices) {
    unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
    unsigned int warp = get_global_id(0)/TILE_SIZE;
#ifdef USE_CUTOFF
    unsigned int numTiles = interactionCount[0];
    // Fall back to enumerating every tile when the tile list overflowed.
    const unsigned int tilesToProcess = (numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles);
#else
    const unsigned int tilesToProcess = numTiles;
#endif
    // The product warp*tilesToProcess can exceed 32 bits for large systems, so form it in
    // 64 bit arithmetic. The quotient never exceeds tilesToProcess, so it fits in 32 bits.
    unsigned int pos = (unsigned int) ((ulong) warp*tilesToProcess/totalWarps);
    unsigned int end = (unsigned int) ((ulong) (warp+1)*tilesToProcess/totalWarps);
    real energy = 0.0f;
    unsigned int lasty = 0xFFFFFFFF;
    __local AtomData2 localData[FORCE_WORK_GROUP_SIZE];
    __local real4 tempBuffer[FORCE_WORK_GROUP_SIZE];
    __local int2 reservedBlocks[WARPS_PER_GROUP];
    // The exclusion range scratch space shares storage with reservedBlocks (two entries per warp).
    __local unsigned int* exclusionRange = (__local unsigned int*) reservedBlocks;
    __local int exclusionIndex[WARPS_PER_GROUP];
    // NOTE(review): the loop body runs at least once per warp and, without 64 bit atomics, contains
    // barrier() calls; warps of one work-group may have tile ranges of different length - confirm
    // that the barrier usage is valid on the target devices.
    do {
        // Extract the coordinates of this tile
        const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);     // index within the warp
        const unsigned int tbx = get_local_id(0) - tgx;               // first local index of the warp
        const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
        unsigned int x, y;
        real4 force = 0.0f;
        if (pos < end) {
#ifdef USE_CUTOFF
            if (numTiles <= maxTiles) {
                ushort2 tileIndices = tiles[pos];
                x = tileIndices.x;
                y = tileIndices.y;
            }
            else
#endif
            {
                // Convert the linear tile index into (x, y) with y <= x < NUM_BLOCKS.
                y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
                x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
                if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
                    y += (x < y ? -1 : 1);
                    x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
                }
            }
            unsigned int atom1 = x*TILE_SIZE + tgx;
            real4 posq1 = posq[atom1];
            real bornRadius1 = global_bornRadii[atom1];
            if (x == y) {
                // This tile is on the diagonal.
                localData[get_local_id(0)].x = posq1.x;
                localData[get_local_id(0)].y = posq1.y;
                localData[get_local_id(0)].z = posq1.z;
                localData[get_local_id(0)].q = posq1.w;
                localData[get_local_id(0)].bornRadius = bornRadius1;
                for (unsigned int j = 0; j < TILE_SIZE; j++) {
                    if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
                        real4 posq2 = (real4) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
                        real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
                        delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                        delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                        delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
                        real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                        if (r2 < CUTOFF_SQUARED) {
#endif
                            real bornRadius2 = localData[tbx+j].bornRadius;
                            real alpha2_ij = bornRadius1*bornRadius2;
                            real D_ij = r2*RECIP(4.0f*alpha2_ij);
                            real expTerm = EXP(-D_ij);
                            real denominator2 = r2 + alpha2_ij*expTerm;
                            real denominator = SQRT(denominator2);
                            real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
                            real Gpol = tempEnergy*RECIP(denominator2);
                            real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
                            real dEdR = Gpol*(1.0f - 0.25f*expTerm);
                            force.w += dGpol_dalpha2_ij*bornRadius2;
                            // Half weight: every atom of the block loops over every atom of the
                            // same block, so each pair is visited from both sides.
                            energy += 0.5f*tempEnergy;
                            delta.xyz *= dEdR;
                            force.xyz -= delta.xyz;
#ifdef USE_CUTOFF
                        }
#endif
                    }
                }
            }
            else {
                // This is an off-diagonal tile.
                if (lasty != y) {
                    // Block y is not the one already held in local memory, so load it.
                    unsigned int j = y*TILE_SIZE + tgx;
                    real4 tempPosq = posq[j];
                    localData[get_local_id(0)].x = tempPosq.x;
                    localData[get_local_id(0)].y = tempPosq.y;
                    localData[get_local_id(0)].z = tempPosq.z;
                    localData[get_local_id(0)].q = tempPosq.w;
                    localData[get_local_id(0)].bornRadius = global_bornRadii[j];
                }
                localData[get_local_id(0)].fx = 0.0f;
                localData[get_local_id(0)].fy = 0.0f;
                localData[get_local_id(0)].fz = 0.0f;
                localData[get_local_id(0)].fw = 0.0f;
#ifdef USE_CUTOFF
                unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
                bool computeSubset = false;
#ifdef USE_APPLE_WORKAROUND
                computeSubset = (flags == 0); // Workaround for a compiler bug in Apple's OpenCL on Lion
#else
                if (flags != 0xFFFFFFFF) {
                    // Search the exclusion list of block x for y. The subset path is used only
                    // when the tile is not found there.
                    // NOTE(review): these local memory exchanges have no barrier; they appear to
                    // rely on the work-items of a warp executing in lockstep - confirm.
                    if (tgx < 2)
                        exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
                    if (tgx == 0)
                        exclusionIndex[localGroupIndex] = -1;
                    for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
                        if (exclusionIndices[i] == y)
                            exclusionIndex[localGroupIndex] = i*TILE_SIZE;
                    computeSubset = (exclusionIndex[localGroupIndex] == -1);
                }
#endif
                if (computeSubset) {
                    if (flags == 0) {
                        // No interactions in this tile.
                    }
                    else {
                        // Compute only a subset of the interactions in this tile.
                        for (unsigned int j = 0; j < TILE_SIZE; j++) {
                            if ((flags&(1u<<j)) != 0) {
                                real4 posq2 = (real4) (localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
                                real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
                                delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                                delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                                delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
                                real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
                                // This code is only compiled with USE_CUTOFF, so the cutoff test always applies.
                                if (r2 < CUTOFF_SQUARED) {
                                    real bornRadius2 = localData[tbx+j].bornRadius;
                                    real alpha2_ij = bornRadius1*bornRadius2;
                                    real D_ij = r2*RECIP(4.0f*alpha2_ij);
                                    real expTerm = EXP(-D_ij);
                                    real denominator2 = r2 + alpha2_ij*expTerm;
                                    real denominator = SQRT(denominator2);
                                    real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
                                    real Gpol = tempEnergy*RECIP(denominator2);
                                    real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
                                    real dEdR = Gpol*(1.0f - 0.25f*expTerm);
                                    // Pairs involving an index at or beyond NUM_ATOMS contribute nothing.
                                    if (atom1 >= NUM_ATOMS || y*TILE_SIZE+j >= NUM_ATOMS) {
                                        dEdR = 0.0f;
                                        dGpol_dalpha2_ij = 0.0f;
                                        tempEnergy = 0.0f;
                                    }
                                    energy += tempEnergy;
                                    force.w += dGpol_dalpha2_ij*bornRadius2;
                                    delta.xyz *= dEdR;
                                    force.xyz -= delta.xyz;
                                    tempBuffer[get_local_id(0)] = (real4) (delta.xyz, dGpol_dalpha2_ij*bornRadius1);
                                }
                                else
                                    tempBuffer[get_local_id(0)] = (real4) 0;
                                // Sum the forces on atom j: every fourth work-item adds up four
                                // entries, then work-item 0 adds the eight partial sums.
                                if (tgx % 4 == 0)
                                    tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+1]+tempBuffer[get_local_id(0)+2]+tempBuffer[get_local_id(0)+3];
                                if (tgx == 0) {
                                    real4 sum = tempBuffer[get_local_id(0)]+tempBuffer[get_local_id(0)+4]+tempBuffer[get_local_id(0)+8]+tempBuffer[get_local_id(0)+12]+tempBuffer[get_local_id(0)+16]+tempBuffer[get_local_id(0)+20]+tempBuffer[get_local_id(0)+24]+tempBuffer[get_local_id(0)+28];
                                    localData[tbx+j].fx += sum.x;
                                    localData[tbx+j].fy += sum.y;
                                    localData[tbx+j].fz += sum.z;
                                    localData[tbx+j].fw += sum.w;
                                }
                            }
                        }
                    }
                }
                else
#endif
                {
                    // Compute the full set of interactions in this tile. Each work-item starts at
                    // a different atom of block y and advances cyclically, so at any given step
                    // the work-items of a warp address distinct local entries.
                    unsigned int tj = tgx;
                    for (unsigned int j = 0; j < TILE_SIZE; j++) {
                        if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
                            real4 posq2 = (real4) (localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z, localData[tbx+tj].q);
                            real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
                            delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
                            delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
                            delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
                            real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
                            if (r2 < CUTOFF_SQUARED) {
#endif
                                real bornRadius2 = localData[tbx+tj].bornRadius;
                                real alpha2_ij = bornRadius1*bornRadius2;
                                real D_ij = r2*RECIP(4.0f*alpha2_ij);
                                real expTerm = EXP(-D_ij);
                                real denominator2 = r2 + alpha2_ij*expTerm;
                                real denominator = SQRT(denominator2);
                                real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
                                real Gpol = tempEnergy*RECIP(denominator2);
                                real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
                                real dEdR = Gpol*(1.0f - 0.25f*expTerm);
                                force.w += dGpol_dalpha2_ij*bornRadius2;
                                energy += tempEnergy;
                                delta.xyz *= dEdR;
                                force.xyz -= delta.xyz;
                                localData[tbx+tj].fx += delta.x;
                                localData[tbx+tj].fy += delta.y;
                                localData[tbx+tj].fz += delta.z;
                                localData[tbx+tj].fw += dGpol_dalpha2_ij*bornRadius1;
#ifdef USE_CUTOFF
                            }
#endif
                        }
                        tj = (tj + 1) & (TILE_SIZE - 1);
                    }
                }
            }
        }
        // Write results. We need to coordinate between warps to make sure no two of them
        // ever try to write to the same piece of memory at the same time.
#ifdef SUPPORTS_64_BIT_ATOMICS
        // Forces are converted to 64 bit fixed point (scaled by 2^32) and added atomically.
        if (pos < end) {
            const unsigned int offset = x*TILE_SIZE + tgx;
            atom_add(&forceBuffers[offset], (long) (force.x*0x100000000));
            atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
            atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
            atom_add(&global_bornForce[offset], (long) (force.w*0x100000000));
        }
        if (pos < end && x != y) {
            const unsigned int offset = y*TILE_SIZE + tgx;
            atom_add(&forceBuffers[offset], (long) (localData[get_local_id(0)].fx*0x100000000));
            atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fy*0x100000000));
            atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fz*0x100000000));
            atom_add(&global_bornForce[offset], (long) (localData[get_local_id(0)].fw*0x100000000));
        }
#else
        // Each warp publishes the blocks it wants to write (-1 means none), then waits until
        // no lower-numbered warp of the work-group still reserves one of the same blocks.
        int writeX = (pos < end ? x : -1);
        int writeY = (pos < end && x != y ? y : -1);
        if (tgx == 0)
            reservedBlocks[localGroupIndex] = (int2)(writeX, writeY);
        bool done = false;
        int doneIndex = 0;
        int checkIndex = 0;
        while (true) {
            // See if any warp still needs to write its data.
            bool allDone = true;
            barrier(CLK_LOCAL_MEM_FENCE);
            while (doneIndex < WARPS_PER_GROUP && allDone) {
                if (reservedBlocks[doneIndex].x != -1)
                    allDone = false;
                else
                    doneIndex++;
            }
            if (allDone)
                break;
            if (!done) {
                // See whether this warp can write its data.  This requires that no previous warp
                // is trying to write to the same block of the buffer.
                bool canWrite = (writeX != -1);
                while (checkIndex < localGroupIndex && canWrite) {
                    if ((reservedBlocks[checkIndex].x == x || reservedBlocks[checkIndex].y == x) ||
                            (writeY != -1 && (reservedBlocks[checkIndex].x == y || reservedBlocks[checkIndex].y == y)))
                        canWrite = false;
                    else
                        checkIndex++;
                }
                if (canWrite) {
                    // Write the data to global memory, then mark this warp as done.
                    if (writeX > -1) {
                        const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
                        forceBuffers[offset].xyz += force.xyz;
                        global_bornForce[offset] += force.w;
                    }
                    if (writeY > -1) {
                        const unsigned int offset = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
                        forceBuffers[offset] += (real4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0);
                        global_bornForce[offset] += localData[get_local_id(0)].fw;
                    }
                    done = true;
                    if (tgx == 0)
                        reservedBlocks[localGroupIndex] = (int2)(-1, -1);
                }
            }
        }
#endif
        lasty = y;
        pos++;
    } while (pos < end);
    energyBuffer[get_global_id(0)] += energy;
}
...@@ -15,7 +15,7 @@ __kernel void integrateLangevinPart1(__global mixed4* restrict velm, __global co ...@@ -15,7 +15,7 @@ __kernel void integrateLangevinPart1(__global mixed4* restrict velm, __global co
while (index < NUM_ATOMS) { while (index < NUM_ATOMS) {
mixed4 velocity = velm[index]; mixed4 velocity = velm[index];
if (velocity.w != 0.0) { if (velocity.w != 0.0) {
mixed sqrtInvMass = sqrt(velocity.w); mixed sqrtInvMass = SQRT(velocity.w);
velocity.x = vscale*velocity.x + fscale*velocity.w*force[index].x + noisescale*sqrtInvMass*random[randomIndex].x; velocity.x = vscale*velocity.x + fscale*velocity.w*force[index].x + noisescale*sqrtInvMass*random[randomIndex].x;
velocity.y = vscale*velocity.y + fscale*velocity.w*force[index].y + noisescale*sqrtInvMass*random[randomIndex].y; velocity.y = vscale*velocity.y + fscale*velocity.w*force[index].y + noisescale*sqrtInvMass*random[randomIndex].y;
velocity.z = vscale*velocity.z + fscale*velocity.w*force[index].z + noisescale*sqrtInvMass*random[randomIndex].z; velocity.z = vscale*velocity.z + fscale*velocity.w*force[index].z + noisescale*sqrtInvMass*random[randomIndex].z;
...@@ -96,8 +96,8 @@ __kernel void selectLangevinStepSize(mixed maxStepSize, mixed errorTol, mixed ta ...@@ -96,8 +96,8 @@ __kernel void selectLangevinStepSize(mixed maxStepSize, mixed errorTol, mixed ta
if (get_global_id(0) == 0) { if (get_global_id(0) == 0) {
// Select the new step size. // Select the new step size.
mixed totalError = sqrt(error[0]/(NUM_ATOMS*3)); mixed totalError = SQRT(error[0]/(NUM_ATOMS*3));
mixed newStepSize = sqrt(errorTol/totalError); mixed newStepSize = SQRT(errorTol/totalError);
mixed oldStepSize = dt[0].y; mixed oldStepSize = dt[0].y;
if (oldStepSize > 0.0f) if (oldStepSize > 0.0f)
newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase. newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase.
...@@ -109,9 +109,9 @@ __kernel void selectLangevinStepSize(mixed maxStepSize, mixed errorTol, mixed ta ...@@ -109,9 +109,9 @@ __kernel void selectLangevinStepSize(mixed maxStepSize, mixed errorTol, mixed ta
// Recalculate the integration parameters. // Recalculate the integration parameters.
mixed vscale = exp(-newStepSize/tau); mixed vscale = EXP(-newStepSize/tau);
mixed fscale = (1-vscale)*tau; mixed fscale = (1-vscale)*tau;
mixed noisescale = sqrt(2*kT/tau)*sqrt(0.5f*(1-vscale*vscale)*tau); mixed noisescale = SQRT(2*kT/tau)*SQRT(0.5f*(1-vscale*vscale)*tau);
params[VelScale] = vscale; params[VelScale] = vscale;
params[ForceScale] = fscale; params[ForceScale] = fscale;
params[NoiseScale] = noisescale; params[NoiseScale] = noisescale;
......
#ifdef SUPPORTS_64_BIT_ATOMICS
// Enable 64 bit atomics; when this symbol is defined the kernel below calls atom_add() on long values.
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif
// Number of TILE_SIZE-wide groups of work-items in one work-group.
// Neither FORCE_WORK_GROUP_SIZE nor TILE_SIZE is defined in this section; both must be supplied elsewhere.
#define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE)
/**
 * Per-atom record staged in local memory by computeNonbonded.
 */
typedef struct {
    // Position and charge, copied from the x, y, z and w components of a posq entry.
    real x;
    real y;
    real z;
    real q;
    // Force accumulated on this atom within the current tile.
    real fx;
    real fy;
    real fz;
    // Additional per-atom fields, supplied through a macro defined outside this section.
    ATOM_PARAMETER_DATA
#ifndef PARAMETER_SIZE_IS_EVEN
    // Extra element added unless PARAMETER_SIZE_IS_EVEN is defined.
    real padding;
#endif
} AtomData;
/**
* Compute nonbonded interactions.
*/
__kernel void computeNonbonded(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers,
#else
__global real4* restrict forceBuffers,
#endif
__global real* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions,
__global const ushort2* restrict exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices
#ifdef USE_CUTOFF
, __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const real4* restrict blockCenter, __global const int* restrict interactingAtoms
#endif
PARAMETER_ARGUMENTS) {
const unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
const unsigned int warp = get_global_id(0)/TILE_SIZE;
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx;
real energy = 0;
__local AtomData localData[FORCE_WORK_GROUP_SIZE];
// First loop: process tiles that contain exclusions.
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
real4 force = 0;
unsigned int atom1 = x*TILE_SIZE + tgx;
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
#endif
const bool hasExclusions = true;
if (x == y) {
// This tile is on the diagonal.
const unsigned int localAtomIndex = get_local_id(0);
localData[localAtomIndex].x = posq1.x;
localData[localAtomIndex].y = posq1.y;
localData[localAtomIndex].z = posq1.z;
localData[localAtomIndex].q = posq1.w;
LOAD_LOCAL_PARAMETERS_FROM_1
SYNC_WARPS;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+j;
real4 posq2 = (real4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
#ifdef USE_SYMMETRIC
real dEdR = 0;
#else
real4 dEdR1 = (real4) 0;
real4 dEdR2 = (real4) 0;
#endif
#ifdef USE_EXCLUSIONS
bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
#endif
real tempEnergy = 0;
COMPUTE_INTERACTION
energy += 0.5f*tempEnergy;
#ifdef USE_SYMMETRIC
force.xyz -= delta.xyz*dEdR;
#else
force.xyz -= dEdR1.xyz;
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
SYNC_WARPS;
}
}
else {
// This is an off-diagonal tile.
const unsigned int localAtomIndex = get_local_id(0);
unsigned int j = y*TILE_SIZE + tgx;
real4 tempPosq = posq[j];
localData[localAtomIndex].x = tempPosq.x;
localData[localAtomIndex].y = tempPosq.y;
localData[localAtomIndex].z = tempPosq.z;
localData[localAtomIndex].q = tempPosq.w;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData[localAtomIndex].fx = 0;
localData[localAtomIndex].fy = 0;
localData[localAtomIndex].fz = 0;
SYNC_WARPS;
#ifdef USE_EXCLUSIONS
excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
#endif
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj;
real4 posq2 = (real4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+tj;
#ifdef USE_SYMMETRIC
real dEdR = 0;
#else
real4 dEdR1 = (real4) 0;
real4 dEdR2 = (real4) 0;
#endif
#ifdef USE_EXCLUSIONS
bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
#endif
real tempEnergy = 0;
COMPUTE_INTERACTION
energy += tempEnergy;
#ifdef USE_SYMMETRIC
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z;
#else
force.xyz -= dEdR1.xyz;
localData[tbx+tj].fx += dEdR2.x;
localData[tbx+tj].fy += dEdR2.y;
localData[tbx+tj].fz += dEdR2.z;
#endif
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
tj = (tj + 1) & (TILE_SIZE - 1);
SYNC_WARPS;
}
}
// Write results.
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = x*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset], (long) (force.x*0x100000000));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
if (x != y) {
offset = y*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset], (long) (localData[get_local_id(0)].fx*0x100000000));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fy*0x100000000));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fz*0x100000000));
}
#else
unsigned int offset1 = x*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = y*TILE_SIZE + tgx + warp*PADDED_NUM_ATOMS;
forceBuffers[offset1].xyz += force.xyz;
if (x != y)
forceBuffers[offset2] += (real4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0.0f);
#endif
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
int pos = (numTiles > maxTiles ? startTileIndex+warp*numTileIndices/totalWarps : warp*numTiles/totalWarps);
int end = (numTiles > maxTiles ? startTileIndex+(warp+1)*numTileIndices/totalWarps : (warp+1)*numTiles/totalWarps);
#else
const unsigned int numTiles = numTileIndices;
int pos = startTileIndex+warp*numTiles/totalWarps;
int end = startTileIndex+(warp+1)*numTiles/totalWarps;
#endif
int skipBase = 0;
int currentSkipIndex = tbx;
__local int atomIndices[FORCE_WORK_GROUP_SIZE];
__local int skipTiles[FORCE_WORK_GROUP_SIZE];
skipTiles[get_local_id(0)] = -1;
while (pos < end) {
const bool hasExclusions = false;
real4 force = 0;
bool includeTile = true;
// Extract the coordinates of this tile.
unsigned int x, y;
bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
singlePeriodicCopy = tileIndices.y;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
// Skip over tiles that have exclusions, since they were already processed.
SYNC_WARPS;
while (skipTiles[tbx+TILE_SIZE-1] < pos) {
SYNC_WARPS;
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx];
skipTiles[get_local_id(0)] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
skipTiles[get_local_id(0)] = end;
skipBase += TILE_SIZE;
currentSkipIndex = tbx;
SYNC_WARPS;
}
while (skipTiles[currentSkipIndex] < pos)
currentSkipIndex++;
includeTile = (skipTiles[currentSkipIndex] != pos);
}
if (includeTile) {
unsigned int atom1 = x*TILE_SIZE + tgx;
// Load atom data for this tile.
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
const unsigned int localAtomIndex = get_local_id(0);
#ifdef USE_CUTOFF
unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
#else
unsigned int j = y*TILE_SIZE + tgx;
#endif
atomIndices[get_local_id(0)] = j;
if (j < PADDED_NUM_ATOMS) {
real4 tempPosq = posq[j];
localData[localAtomIndex].x = tempPosq.x;
localData[localAtomIndex].y = tempPosq.y;
localData[localAtomIndex].z = tempPosq.z;
localData[localAtomIndex].q = tempPosq.w;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData[localAtomIndex].fx = 0;
localData[localAtomIndex].fy = 0;
localData[localAtomIndex].fz = 0;
}
SYNC_WARPS;
#ifdef USE_PERIODIC
if (singlePeriodicCopy) {
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
real4 blockCenterX = blockCenter[x];
posq1.xyz -= floor((posq1.xyz-blockCenterX.xyz)*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
localData[localAtomIndex].x -= floor((localData[localAtomIndex].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
localData[localAtomIndex].y -= floor((localData[localAtomIndex].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
localData[localAtomIndex].z -= floor((localData[localAtomIndex].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
SYNC_WARPS;
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj;
real4 posq2 = (real4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
if (r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = atomIndices[tbx+tj];
#ifdef USE_SYMMETRIC
real dEdR = 0;
#else
real4 dEdR1 = (real4) 0;
real4 dEdR2 = (real4) 0;
#endif
#ifdef USE_EXCLUSIONS
bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS);
#endif
real tempEnergy = 0;
COMPUTE_INTERACTION
energy += tempEnergy;
#ifdef USE_SYMMETRIC
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z;
#else
force.xyz -= dEdR1.xyz;
localData[tbx+tj].fx += dEdR2.x;
localData[tbx+tj].fy += dEdR2.y;
localData[tbx+tj].fz += dEdR2.z;
#endif
}
tj = (tj + 1) & (TILE_SIZE - 1);
SYNC_WARPS;
}
}
else
#endif
{
// We need to apply periodic boundary conditions separately for each interaction.
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj;
real4 posq2 = (real4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = atomIndices[tbx+tj];
#ifdef USE_SYMMETRIC
real dEdR = 0;
#else
real4 dEdR1 = (real4) 0;
real4 dEdR2 = (real4) 0;
#endif
#ifdef USE_EXCLUSIONS
bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS);
#endif
real tempEnergy = 0;
COMPUTE_INTERACTION
energy += tempEnergy;
#ifdef USE_SYMMETRIC
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z;
#else
force.xyz -= dEdR1.xyz;
localData[tbx+tj].fx += dEdR2.x;
localData[tbx+tj].fy += dEdR2.y;
localData[tbx+tj].fz += dEdR2.z;
#endif
#ifdef USE_CUTOFF
}
#endif
tj = (tj + 1) & (TILE_SIZE - 1);
SYNC_WARPS;
}
}
// Write results.
#ifdef USE_CUTOFF
unsigned int atom2 = atomIndices[get_local_id(0)];
#else
unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000));
atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
if (atom2 < PADDED_NUM_ATOMS) {
atom_add(&forceBuffers[atom2], (long) (localData[get_local_id(0)].fx*0x100000000));
atom_add(&forceBuffers[atom2+PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fy*0x100000000));
atom_add(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fz*0x100000000));
}
#else
unsigned int offset1 = atom1 + warp*PADDED_NUM_ATOMS;
unsigned int offset2 = atom2 + warp*PADDED_NUM_ATOMS;
forceBuffers[offset1].xyz += force.xyz;
if (atom2 < PADDED_NUM_ATOMS)
forceBuffers[offset2] += (real4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0.0f);
#endif
}
pos++;
}
energyBuffer[get_global_id(0)] += energy;
}
#define TILE_SIZE 32 #ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif
typedef struct { typedef struct {
real x, y, z; real x, y, z;
...@@ -11,89 +13,54 @@ typedef struct { ...@@ -11,89 +13,54 @@ typedef struct {
* Compute nonbonded interactions. * Compute nonbonded interactions.
*/ */
__kernel void computeNonbonded(__global real4* restrict forceBuffers, __global real* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions, __kernel void computeNonbonded(
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, #ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int startTileIndex, unsigned int endTileIndex, __global long* restrict forceBuffers,
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
#else #else
unsigned int numTiles __global real4* restrict forceBuffers,
#endif #endif
PARAMETER_ARGUMENTS) { __global real* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions,
__global const ushort2* restrict exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0]; , __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const real4* restrict blockCenter, __global const int* restrict interactingAtoms
unsigned int pos = (numTiles > maxTiles ? startTileIndex+get_group_id(0)*(endTileIndex-startTileIndex)/get_num_groups(0) : get_group_id(0)*numTiles/get_num_groups(0));
unsigned int end = (numTiles > maxTiles ? startTileIndex+(get_group_id(0)+1)*(endTileIndex-startTileIndex)/get_num_groups(0) : (get_group_id(0)+1)*numTiles/get_num_groups(0));
#else
unsigned int pos = startTileIndex+get_group_id(0)*numTiles/get_num_groups(0);
unsigned int end = startTileIndex+(get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif #endif
PARAMETER_ARGUMENTS) {
real energy = 0; real energy = 0;
unsigned int lasty = 0xFFFFFFFF;
__local AtomData localData[TILE_SIZE]; __local AtomData localData[TILE_SIZE];
while (pos < end) { // First loop: process tiles that contain exclusions.
// Extract the coordinates of this tile
unsigned int x, y; const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+get_group_id(0)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
#ifdef USE_CUTOFF const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(get_group_id(0)+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/get_num_groups(0);
if (numTiles <= maxTiles) { for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
ushort2 tileIndices = tiles[pos]; const ushort2 tileIndices = exclusionTiles[pos];
x = tileIndices.x; const unsigned int x = tileIndices.x;
y = tileIndices.y; const unsigned int y = tileIndices.y;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
// Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS // Load the data for this tile.
unsigned int exclusionStart = exclusionRowIndices[x];
unsigned int exclusionEnd = exclusionRowIndices[x+1];
int exclusionIndex = -1;
for (int i = exclusionStart; i < exclusionEnd; i++)
if (exclusionIndices[i] == y) {
exclusionIndex = i*TILE_SIZE;
break;
}
bool hasExclusions = (exclusionIndex > -1);
#endif
// Load the data for this tile if we don't already have it cached.
if (lasty != y) { for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) { unsigned int j = y*TILE_SIZE + localAtomIndex;
unsigned int j = y*TILE_SIZE + localAtomIndex; real4 tempPosq = posq[j];
real4 tempPosq = posq[j]; localData[localAtomIndex].x = tempPosq.x;
localData[localAtomIndex].x = tempPosq.x; localData[localAtomIndex].y = tempPosq.y;
localData[localAtomIndex].y = tempPosq.y; localData[localAtomIndex].z = tempPosq.z;
localData[localAtomIndex].z = tempPosq.z; localData[localAtomIndex].q = tempPosq.w;
localData[localAtomIndex].q = tempPosq.w; LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
} }
const bool hasExclusions = true;
if (x == y) { if (x == y) {
// This tile is on the diagonal. // This tile is on the diagonal.
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) { for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndex+tgx]; unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
#endif #endif
unsigned int atom1 = x*TILE_SIZE+tgx; unsigned int atom1 = x*TILE_SIZE+tgx;
real4 force = 0; real4 force = 0;
real4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q); real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0); real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
...@@ -103,35 +70,46 @@ __kernel void computeNonbonded(__global real4* restrict forceBuffers, __global r ...@@ -103,35 +70,46 @@ __kernel void computeNonbonded(__global real4* restrict forceBuffers, __global r
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) { if (r2 < CUTOFF_SQUARED) {
#endif #endif
real invR = RSQRT(r2); real invR = RSQRT(r2);
real r = RECIP(invR); real r = RECIP(invR);
unsigned int atom2 = j; unsigned int atom2 = j;
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j; atom2 = y*TILE_SIZE+j;
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
real dEdR = 0; real dEdR = 0;
#else #else
real4 dEdR1 = (real4) 0; real4 dEdR1 = (real4) 0;
real4 dEdR2 = (real4) 0; real4 dEdR2 = (real4) 0;
#endif #endif
real tempEnergy = 0; #ifdef USE_EXCLUSIONS
COMPUTE_INTERACTION bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
energy += 0.5f*tempEnergy; #endif
real tempEnergy = 0;
COMPUTE_INTERACTION
energy += 0.5f*tempEnergy;
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
force.xyz -= delta.xyz*dEdR; force.xyz -= delta.xyz*dEdR;
#else #else
force.xyz -= dEdR1.xyz; force.xyz -= dEdR1.xyz;
#endif #endif
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
} }
#endif #endif
#ifdef USE_EXCLUSIONS
excl >>= 1; excl >>= 1;
#endif
} }
// Write results. // Write results.
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; #ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000));
atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
#else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz; forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
#endif
} }
} }
else { else {
...@@ -142,82 +120,244 @@ __kernel void computeNonbonded(__global real4* restrict forceBuffers, __global r ...@@ -142,82 +120,244 @@ __kernel void computeNonbonded(__global real4* restrict forceBuffers, __global r
localData[tgx].fy = 0; localData[tgx].fy = 0;
localData[tgx].fz = 0; localData[tgx].fz = 0;
} }
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx;
real4 force = 0;
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
#endif
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
real r2 = dot(delta.xyz, delta.xyz);
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int flags1 = (numTiles <= maxTiles ? interactionFlags[2*pos] : 0xFFFFFFFF); if (r2 < CUTOFF_SQUARED) {
unsigned int flags2 = (numTiles <= maxTiles ? interactionFlags[2*pos+1] : 0xFFFFFFFF); #endif
if (!hasExclusions && (flags1 != 0xFFFFFFFF || flags2 != 0xFFFFFFFF)) { real invR = RSQRT(r2);
// Compute only a subset of the interactions in this tile. real r = RECIP(invR);
unsigned int atom2 = j;
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
#ifdef USE_SYMMETRIC
real dEdR = 0;
#else
real4 dEdR1 = (real4) 0;
real4 dEdR2 = (real4) 0;
#endif
#ifdef USE_EXCLUSIONS
bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
#endif
real tempEnergy = 0;
COMPUTE_INTERACTION
energy += tempEnergy;
#ifdef USE_SYMMETRIC
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
localData[j].fx += delta.x;
localData[j].fy += delta.y;
localData[j].fz += delta.z;
#else
force.xyz -= dEdR1.xyz;
localData[j].fx += dEdR2.x;
localData[j].fy += dEdR2.y;
localData[j].fz += dEdR2.z;
#endif
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
}
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) { // Write results for atom1.
if ((flags2&(1<<tgx)) != 0) {
unsigned int atom1 = x*TILE_SIZE+tgx; #ifdef SUPPORTS_64_BIT_ATOMICS
real4 force = 0; atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000));
real4 posq1 = posq[atom1]; atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
LOAD_ATOM1_PARAMETERS atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
for (unsigned int j = 0; j < TILE_SIZE; j++) { #else
if ((flags1&(1<<j)) != 0) { unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
bool isExcluded = false; forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q); #endif
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0); }
// Write results.
for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = y*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset], (long) (localData[tgx].fx*0x100000000));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (localData[tgx].fy*0x100000000));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (localData[tgx].fz*0x100000000));
#else
unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
real4 f = forceBuffers[offset];
f.x += localData[tgx].fx;
f.y += localData[tgx].fy;
f.z += localData[tgx].fz;
forceBuffers[offset] = f;
#endif
}
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
const unsigned int numTiles = interactionCount[0];
int pos = (numTiles > maxTiles ? startTileIndex+get_group_id(0)*numTileIndices/get_num_groups(0) : get_group_id(0)*numTiles/get_num_groups(0));
int end = (numTiles > maxTiles ? startTileIndex+(get_group_id(0)+1)*numTileIndices/get_num_groups(0) : (get_group_id(0)+1)*numTiles/get_num_groups(0));
#else
const unsigned int numTiles = numTileIndices;
int pos = startTileIndex+get_group_id(0)*numTiles/get_num_groups(0);
int end = startTileIndex+(get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif
int nextToSkip = -1;
int currentSkipIndex = 0;
__local int atomIndices[TILE_SIZE];
while (pos < end) {
const bool hasExclusions = false;
bool includeTile = true;
// Extract the coordinates of this tile.
unsigned int x, y;
bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
singlePeriodicCopy = tileIndices.y;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
// Skip over tiles that have exclusions, since they were already processed.
while (nextToSkip < pos) {
if (currentSkipIndex < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[currentSkipIndex++];
nextToSkip = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
nextToSkip = end;
}
includeTile = (nextToSkip != pos);
}
if (includeTile) {
// Load the data for this tile.
for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
#ifdef USE_CUTOFF
unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+localAtomIndex] : y*TILE_SIZE+localAtomIndex);
#else
unsigned int j = y*TILE_SIZE+localAtomIndex;
#endif
atomIndices[localAtomIndex] = j;
if (j < PADDED_NUM_ATOMS) {
real4 tempPosq = posq[j];
localData[localAtomIndex].x = tempPosq.x;
localData[localAtomIndex].y = tempPosq.y;
localData[localAtomIndex].z = tempPosq.z;
localData[localAtomIndex].q = tempPosq.w;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData[localAtomIndex].fx = 0;
localData[localAtomIndex].fy = 0;
localData[localAtomIndex].fz = 0;
}
}
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz; if (singlePeriodicCopy) {
#endif // The box is small enough that we can just translate all the atoms into a single periodic
real r2 = dot(delta.xyz, delta.xyz); // box, then skip having to apply periodic boundary conditions later.
if (r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2); real4 blockCenterX = blockCenter[x];
real r = RECIP(invR); for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom2 = j; localData[tgx].x -= floor((localData[tgx].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
LOAD_ATOM2_PARAMETERS localData[tgx].y -= floor((localData[tgx].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
atom2 = y*TILE_SIZE+j; localData[tgx].z -= floor((localData[tgx].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
}
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx;
real4 force = 0;
real4 posq1 = posq[atom1];
posq1.xyz -= floor((posq1.xyz-blockCenterX.xyz)*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
LOAD_ATOM1_PARAMETERS
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
real r2 = dot(delta.xyz, delta.xyz);
if (r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2);
real r = RECIP(invR);
unsigned int atom2 = j;
LOAD_ATOM2_PARAMETERS
atom2 = atomIndices[j];
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
real dEdR = 0; real dEdR = 0;
#else #else
real4 dEdR1 = (real4) 0; real4 dEdR1 = (real4) 0;
real4 dEdR2 = (real4) 0; real4 dEdR2 = (real4) 0;
#endif
#ifdef USE_EXCLUSIONS
bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS);
#endif #endif
real tempEnergy = 0; real tempEnergy = 0;
COMPUTE_INTERACTION COMPUTE_INTERACTION
energy += tempEnergy; energy += tempEnergy;
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
delta.xyz *= dEdR; delta.xyz *= dEdR;
force.xyz -= delta.xyz; force.xyz -= delta.xyz;
localData[j].fx += delta.x; localData[j].fx += delta.x;
localData[j].fy += delta.y; localData[j].fy += delta.y;
localData[j].fz += delta.z; localData[j].fz += delta.z;
#else #else
force.xyz -= dEdR1.xyz; force.xyz -= dEdR1.xyz;
localData[j].fx += dEdR2.x; localData[j].fx += dEdR2.x;
localData[j].fy += dEdR2.y; localData[j].fy += dEdR2.y;
localData[j].fz += dEdR2.z; localData[j].fz += dEdR2.z;
#endif #endif
}
}
} }
}
// Write results for atom1. // Write results for atom1.
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS; #ifdef SUPPORTS_64_BIT_ATOMICS
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz; atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000));
} atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
#else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
#endif
} }
} }
else else
#endif #endif
{ {
// Compute the full set of interactions in this tile. // We need to apply periodic boundary conditions separately for each interaction.
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) { for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx; unsigned int atom1 = x*TILE_SIZE+tgx;
real4 force = 0; real4 force = 0;
real4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex+tgx] : 0xFFFFFFFF);
#endif
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q); real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0); real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
...@@ -227,59 +367,77 @@ __kernel void computeNonbonded(__global real4* restrict forceBuffers, __global r ...@@ -227,59 +367,77 @@ __kernel void computeNonbonded(__global real4* restrict forceBuffers, __global r
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) { if (r2 < CUTOFF_SQUARED) {
#endif #endif
real invR = RSQRT(r2); real invR = RSQRT(r2);
real r = RECIP(invR); real r = RECIP(invR);
unsigned int atom2 = j; unsigned int atom2 = j;
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j; atom2 = atomIndices[j];
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
real dEdR = 0; real dEdR = 0;
#else #else
real4 dEdR1 = (real4) 0; real4 dEdR1 = (real4) 0;
real4 dEdR2 = (real4) 0; real4 dEdR2 = (real4) 0;
#endif #endif
real tempEnergy = 0; #ifdef USE_EXCLUSIONS
COMPUTE_INTERACTION bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS);
energy += tempEnergy; #endif
real tempEnergy = 0;
COMPUTE_INTERACTION
energy += tempEnergy;
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
delta.xyz *= dEdR; delta.xyz *= dEdR;
force.xyz -= delta.xyz; force.xyz -= delta.xyz;
localData[j].fx += delta.x; localData[j].fx += delta.x;
localData[j].fy += delta.y; localData[j].fy += delta.y;
localData[j].fz += delta.z; localData[j].fz += delta.z;
#else #else
force.xyz -= dEdR1.xyz; force.xyz -= dEdR1.xyz;
localData[j].fx += dEdR2.x; localData[j].fx += dEdR2.x;
localData[j].fy += dEdR2.y; localData[j].fy += dEdR2.y;
localData[j].fz += dEdR2.z; localData[j].fz += dEdR2.z;
#endif #endif
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
} }
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif #endif
} }
// Write results for atom1. // Write results for atom1.
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[atom1], (long) (force.x*0x100000000));
atom_add(&forceBuffers[atom1+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
atom_add(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
#else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz; forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
#endif
} }
} }
// Write results. // Write results.
for (int tgx = 0; tgx < TILE_SIZE; tgx++) { for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS; #ifdef USE_CUTOFF
real4 f = forceBuffers[offset]; unsigned int atom2 = atomIndices[tgx];
f.x += localData[tgx].fx; #else
f.y += localData[tgx].fy; unsigned int atom2 = y*TILE_SIZE + tgx;
f.z += localData[tgx].fz; #endif
forceBuffers[offset] = f; if (atom2 < PADDED_NUM_ATOMS) {
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[atom2], (long) (localData[tgx].fx*0x100000000));
atom_add(&forceBuffers[atom2+PADDED_NUM_ATOMS], (long) (localData[tgx].fy*0x100000000));
atom_add(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (long) (localData[tgx].fz*0x100000000));
#else
unsigned int offset = atom2 + get_group_id(0)*PADDED_NUM_ATOMS;
real4 f = forceBuffers[offset];
f.x += localData[tgx].fx;
f.y += localData[tgx].fy;
f.z += localData[tgx].fz;
forceBuffers[offset] = f;
#endif
}
} }
} }
lasty = y;
pos++; pos++;
} }
energyBuffer[get_global_id(0)] += energy; energyBuffer[get_global_id(0)] += energy;
......
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif
#define TILE_SIZE 32
/**
 * Three-component vector of reals stored as three plain scalars.
 *
 * The built-in float3 type cannot be used here: OpenCL aligns it to 4 DWORDs,
 * so every element of a float3 array would carry one DWORD of padding. That
 * padding wastes space and, because the element stride is then no longer an
 * odd number of DWORDs, causes LDS bank conflicts.
 */
typedef struct {
    real x;
    real y;
    real z;
} UnalignedReal3;
/**
 * Per-atom record used for the entries of the kernel's local-memory tile array.
 */
typedef struct {
    // Filled from the x, y, z and w components of a posq entry, respectively.
    real x;
    real y;
    real z;
    real q;
    // Force components stored for this atom.
    real fx;
    real fy;
    real fz;
    // Further per-atom fields are injected by the macro on the next line; its
    // expansion is not visible in this part of the file.
    ATOM_PARAMETER_DATA
#if !defined(PARAMETER_SIZE_IS_EVEN)
    // NOTE(review): presumably an extra word so the struct size stays an odd
    // number of words when the injected fields are odd in count -- confirm.
    real padding;
#endif
} AtomData;
/**
* Compute nonbonded interactions.
*/
__kernel __attribute__((reqd_work_group_size(FORCE_WORK_GROUP_SIZE, 1, 1)))
void computeNonbonded(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers,
#else
__global real4* restrict forceBuffers,
#endif
__global real* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices,
unsigned int startTileIndex, unsigned int endTileIndex,
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
#else
unsigned int numTiles
#endif
PARAMETER_ARGUMENTS) {
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
unsigned int pos = (numTiles > maxTiles ? startTileIndex+get_group_id(0)*(endTileIndex-startTileIndex)/get_num_groups(0) : get_group_id(0)*numTiles/get_num_groups(0));
unsigned int end = (numTiles > maxTiles ? startTileIndex+(get_group_id(0)+1)*(endTileIndex-startTileIndex)/get_num_groups(0) : (get_group_id(0)+1)*numTiles/get_num_groups(0));
#else
unsigned int pos = startTileIndex+get_group_id(0)*numTiles/get_num_groups(0);
unsigned int end = startTileIndex+(get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif
real energy = 0;
unsigned int lasty = 0xFFFFFFFF;
__local AtomData localData[TILE_SIZE];
__local UnalignedReal3 localForce[FORCE_WORK_GROUP_SIZE];
#ifdef USE_EXCLUSIONS
__local unsigned int exclusionRange[2];
__local int exclusionIndex[1];
#endif
while (pos < end) {
// Extract the coordinates of this tile
unsigned int x, y;
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
y = tileIndices.y;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
unsigned int baseLocalAtom = (get_local_id(0) < TILE_SIZE ? 0 : TILE_SIZE/2);
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int localForceOffset = get_local_id(0) & ~(TILE_SIZE-1);
unsigned int atom1 = x*TILE_SIZE + tgx;
real4 force = 0;
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS
if (get_local_id(0) < 2)
exclusionRange[get_local_id(0)] = exclusionRowIndices[x+get_local_id(0)];
if (get_local_id(0) == 0)
exclusionIndex[0] = -1;
barrier(CLK_LOCAL_MEM_FENCE);
for (int i = exclusionRange[0]+get_local_id(0); i < exclusionRange[1]; i += FORCE_WORK_GROUP_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[0] = i*TILE_SIZE;
barrier(CLK_LOCAL_MEM_FENCE);
bool hasExclusions = (exclusionIndex[0] > -1);
#endif
if (x == y) {
// This tile is on the diagonal.
if (get_local_id(0) < TILE_SIZE) {
const unsigned int localAtomIndex = tgx;
localData[localAtomIndex].x = posq1.x;
localData[localAtomIndex].y = posq1.y;
localData[localAtomIndex].z = posq1.z;
localData[localAtomIndex].q = posq1.w;
LOAD_LOCAL_PARAMETERS_FROM_1
}
barrier(CLK_LOCAL_MEM_FENCE);
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndex[0]+tgx] >> baseLocalAtom;
#endif
for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
unsigned int atom2 = baseLocalAtom+j;
real4 posq2 = (real4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+baseLocalAtom+j;
#ifdef USE_SYMMETRIC
real dEdR = 0;
#else
real4 dEdR1 = (real4) 0;
real4 dEdR2 = (real4) 0;
#endif
real tempEnergy = 0;
COMPUTE_INTERACTION
energy += 0.5f*tempEnergy;
#ifdef USE_SYMMETRIC
force.xyz -= delta.xyz*dEdR;
#else
force.xyz -= dEdR1.xyz;
#endif
excl >>= 1;
}
// Sum the forces and write results.
if (get_local_id(0) >= TILE_SIZE) {
localData[tgx].fx = force.x;
localData[tgx].fy = force.y;
localData[tgx].fz = force.z;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
#ifdef SUPPORTS_64_BIT_ATOMICS
const unsigned int offset = x*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset], (long) ((force.x + localData[tgx].fx)*0x100000000));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) ((force.y + localData[tgx].fy)*0x100000000));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) ((force.z + localData[tgx].fz)*0x100000000));
#else
force.x += localData[tgx].fx;
force.y += localData[tgx].fy;
force.z += localData[tgx].fz;
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset = x*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
// Cheaper to load/store real4 than real3.
real4 sum = forceBuffers[offset];
sum.xyz += force.xyz;
forceBuffers[offset] = sum;
#endif
}
// barrier not required here as localData[*].temp is not accessed before encountering another barrier.
}
else {
// This is an off-diagonal tile.
if (lasty != y && get_local_id(0) < TILE_SIZE) {
const unsigned int localAtomIndex = tgx;
unsigned int j = y*TILE_SIZE + tgx;
real4 tempPosq = posq[j];
localData[localAtomIndex].x = tempPosq.x;
localData[localAtomIndex].y = tempPosq.y;
localData[localAtomIndex].z = tempPosq.z;
localData[localAtomIndex].q = tempPosq.w;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
localForce[get_local_id(0)].x = 0;
localForce[get_local_id(0)].y = 0;
localForce[get_local_id(0)].z = 0;
barrier(CLK_LOCAL_MEM_FENCE);
// Compute the full set of interactions in this tile.
unsigned int tj = (tgx+baseLocalAtom) & (TILE_SIZE-1);
#ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[0]+tgx] : 0xFFFFFFFF);
excl = (excl >> tj) | (excl << (TILE_SIZE - tj));
#endif
for (unsigned int j = 0; j < TILE_SIZE/2; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
real4 posq2 = (real4) (localData[tj].x, localData[tj].y, localData[tj].z, localData[tj].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real invR = RSQRT(r2);
real r = RECIP(invR);
int atom2 = tj;
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+tj;
#ifdef USE_SYMMETRIC
real dEdR = 0;
#else
real4 dEdR1 = (real4) 0;
real4 dEdR2 = (real4) 0;
#endif
real tempEnergy = 0;
COMPUTE_INTERACTION
energy += tempEnergy;
#ifdef USE_SYMMETRIC
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
localForce[tj+localForceOffset].x += delta.x;
localForce[tj+localForceOffset].y += delta.y;
localForce[tj+localForceOffset].z += delta.z;
#else
force.xyz -= dEdR1.xyz;
localForce[tj+localForceOffset].x += dEdR2.x;
localForce[tj+localForceOffset].y += dEdR2.y;
localForce[tj+localForceOffset].z += dEdR2.z;
#endif
barrier(CLK_LOCAL_MEM_FENCE);
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
tj = (tj+1) & (TILE_SIZE-1);
}
// Sum the forces and write results.
if (get_local_id(0) >= TILE_SIZE) {
localData[tgx].fx = force.x;
localData[tgx].fy = force.y;
localData[tgx].fz = force.z;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TILE_SIZE) {
#ifdef SUPPORTS_64_BIT_ATOMICS
const unsigned int offset1 = x*TILE_SIZE + tgx;
const unsigned int offset2 = y*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset1], (long) ((force.x + localData[tgx].fx)*0x100000000));
atom_add(&forceBuffers[offset1+PADDED_NUM_ATOMS], (long) ((force.y + localData[tgx].fy)*0x100000000));
atom_add(&forceBuffers[offset1+2*PADDED_NUM_ATOMS], (long) ((force.z + localData[tgx].fz)*0x100000000));
atom_add(&forceBuffers[offset2], (long) ((localForce[tgx].x + localForce[tgx+TILE_SIZE].x)*0x100000000));
atom_add(&forceBuffers[offset2+PADDED_NUM_ATOMS], (long) ((localForce[tgx].y + localForce[tgx+TILE_SIZE].y)*0x100000000));
atom_add(&forceBuffers[offset2+2*PADDED_NUM_ATOMS], (long) ((localForce[tgx].z + localForce[tgx+TILE_SIZE].z)*0x100000000));
#else
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
const unsigned int offset1 = x*TILE_SIZE + tgx + y*PADDED_NUM_ATOMS;
const unsigned int offset2 = y*TILE_SIZE + tgx + x*PADDED_NUM_ATOMS;
#else
const unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
const unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
// Cheaper to load/store real4 than real3. Do all loads before all stores to minimize store-load waits.
real4 sum1 = forceBuffers[offset1];
real4 sum2 = forceBuffers[offset2];
sum1.x += localData[tgx].fx + force.x;
sum1.y += localData[tgx].fy + force.y;
sum1.z += localData[tgx].fz + force.z;
sum2.x += localForce[tgx].x + localForce[tgx+TILE_SIZE].x;
sum2.y += localForce[tgx].y + localForce[tgx+TILE_SIZE].y;
sum2.z += localForce[tgx].z + localForce[tgx+TILE_SIZE].z;
forceBuffers[offset1] = sum1;
forceBuffers[offset2] = sum2;
#endif
}
barrier(CLK_LOCAL_MEM_FENCE);
}
lasty = y;
pos++;
}
energyBuffer[get_global_id(0)] += energy;
}
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif
#define TILE_SIZE 32
#define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE)
// Per-atom record used for the __local localData array in computeNonbonded
// (one entry per work-item of the work-group).
typedef struct {
// Position (x, y, z) and the w component (q) copied from a posq entry.
real x, y, z;
real q;
// Force accumulated on this atom while it plays the role of "atom2" in an
// off-diagonal tile; computeNonbonded zeroes these before each such tile and
// adds them to the force buffer entries of the y block afterwards.
real fx, fy, fz;
// Additional per-atom fields injected by the ATOM_PARAMETER_DATA macro
// (defined outside this chunk).
ATOM_PARAMETER_DATA
#ifndef PARAMETER_SIZE_IS_EVEN
// Extra unused element, present unless PARAMETER_SIZE_IS_EVEN is defined.
// NOTE(review): presumably keeps the struct size even for alignment/bank reasons - confirm.
real padding;
#endif
} AtomData;
/**
 * Compute nonbonded interactions.
 *
 * Atoms are handled in blocks of TILE_SIZE; a "tile" is a pair of blocks (x, y).
 * Every run of TILE_SIZE consecutive work-items (called a "warp" below) processes
 * a contiguous range of tiles, one tile per iteration of the outer do/while loop.
 * Within a tile, work-item tgx owns atom1 = x*TILE_SIZE+tgx and loops over the
 * atoms of block y, which are staged in __local memory.
 *
 * The per-pair physics comes from macros defined outside this chunk
 * (LOAD_ATOM1_PARAMETERS, LOAD_LOCAL_PARAMETERS_FROM_1, LOAD_LOCAL_PARAMETERS_FROM_GLOBAL,
 * LOAD_ATOM2_PARAMETERS, COMPUTE_INTERACTION, PARAMETER_ARGUMENTS).
 *
 * forceBuffers         With SUPPORTS_64_BIT_ATOMICS: long accumulators updated with atom_add,
 *                      values scaled by 0x100000000, x/y/z components stored PADDED_NUM_ATOMS
 *                      apart. Otherwise: real4 entries, with each work-group writing to its own
 *                      section starting at get_group_id(0)*PADDED_NUM_ATOMS.
 * energyBuffer         One entry per work-item; this work-item's energy is added to
 *                      energyBuffer[get_global_id(0)].
 * posq                 Per-atom real4; xyz is copied to AtomData x/y/z and w to AtomData q.
 * exclusions           One 32-bit mask per atom1 of a tile that has exclusion data; a cleared
 *                      bit marks the corresponding atom2 as excluded.
 * exclusionIndices     For block row x, entries [exclusionRowIndices[x], exclusionRowIndices[x+1])
 *                      list the y blocks that have exclusion data; entry i owns the masks starting
 *                      at exclusions[i*TILE_SIZE].
 * exclusionRowIndices  Row offsets into exclusionIndices (see above).
 * startTileIndex,
 * endTileIndex         Range of linear tile indices used when tile coordinates are computed
 *                      from the index rather than read from the tiles list.
 * tiles                (USE_CUTOFF) List of (x, y) tile coordinates.
 * interactionCount     (USE_CUTOFF) interactionCount[0] is the number of entries in tiles.
 * periodicBoxSize,
 * invPeriodicBoxSize   (USE_CUTOFF) Box dimensions and their inverses, used under USE_PERIODIC.
 * maxTiles             (USE_CUTOFF) If interactionCount[0] exceeds this, the tiles list is ignored
 *                      and the startTileIndex..endTileIndex range is processed instead.
 * interactionFlags     (USE_CUTOFF) Per-tile bit mask; bit j set means atom j of block y has to be
 *                      processed for this tile.
 * numTiles             (no USE_CUTOFF) Number of tiles to distribute over the warps.
 *
 * NOTE(review): several places exchange data through __local memory between the work-items of
 * one warp without a barrier (exclusionRange/exclusionIndex, localData, tempBuffer). This appears
 * to assume the TILE_SIZE work-items of a warp execute in lockstep - confirm for the target device.
 */
__kernel void computeNonbonded(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers,
#else
__global real4* restrict forceBuffers,
#endif
__global real* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices,
unsigned int startTileIndex, unsigned int endTileIndex,
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
#else
unsigned int numTiles
#endif
PARAMETER_ARGUMENTS) {
// Split the work into warps of TILE_SIZE work-items and give each warp the
// half-open range [pos, end) of tile indices.
unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
unsigned int warp = get_global_id(0)/TILE_SIZE;
#ifdef USE_CUTOFF
// When the tile list holds more entries than maxTiles it is not used; the
// warps then share the index range [startTileIndex, endTileIndex) instead.
unsigned int numTiles = interactionCount[0];
unsigned int pos = (numTiles > maxTiles ? startTileIndex+warp*(endTileIndex-startTileIndex)/totalWarps : warp*numTiles/totalWarps);
unsigned int end = (numTiles > maxTiles ? startTileIndex+(warp+1)*(endTileIndex-startTileIndex)/totalWarps : (warp+1)*numTiles/totalWarps);
#else
unsigned int pos = startTileIndex+warp*numTiles/totalWarps;
unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps;
#endif
real energy = 0;
// localData: one AtomData per work-item (atom2 data and atom2 force accumulators).
// tempBuffer: three reals per work-item, used for the force reduction in the flags path.
// exclusionRange / exclusionIndex: per-warp exclusion lookup state.
// reservedBlocks aliases exclusionRange and is only used in the non-atomic write path.
__local AtomData localData[FORCE_WORK_GROUP_SIZE];
__local real tempBuffer[3*FORCE_WORK_GROUP_SIZE];
__local unsigned int exclusionRange[2*WARPS_PER_GROUP];
__local int exclusionIndex[WARPS_PER_GROUP];
__local int2* reservedBlocks = (__local int2*) exclusionRange;
do {
// Extract the coordinates of this tile
// tgx: index within the warp; tbx: local id of the warp's first work-item;
// localGroupIndex: index of this warp within the work-group.
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx;
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
unsigned int x, y;
real4 force = 0;
if (pos < end) {
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
y = tileIndices.y;
}
else
#endif
{
// Convert the linear tile index into block coordinates with y <= x < NUM_BLOCKS.
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
unsigned int atom1 = x*TILE_SIZE + tgx;
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS
// The first two work-items of the warp fetch the bounds of row x; the warp then
// scans that row for an entry equal to y. exclusionIndex stays -1 if none is found.
// No barrier separates these local-memory writes from the reads that follow.
if (tgx < 2)
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
#else
bool hasExclusions = false;
#endif
// This test can never succeed here because we are inside "if (pos < end)".
if (pos >= end)
; // This warp is done.
else if (x == y) {
// This tile is on the diagonal.
// Block x serves as both atom1 and atom2, so the atom1 data is copied to local memory.
const unsigned int localAtomIndex = get_local_id(0);
localData[localAtomIndex].x = posq1.x;
localData[localAtomIndex].y = posq1.y;
localData[localAtomIndex].z = posq1.z;
localData[localAtomIndex].q = posq1.w;
LOAD_LOCAL_PARAMETERS_FROM_1
#ifdef USE_EXCLUSIONS
// Read unconditionally: no hasExclusions check is made for diagonal tiles.
unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
#endif
for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
// Bit 0 of excl describes atom j; a cleared bit means the pair is excluded.
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = tbx+j;
real4 posq2 = (real4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
// Shift each component of delta by a whole number of box lengths (nearest-image form).
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
// atom2 changes meaning here: from a localData index to a global atom index.
atom2 = y*TILE_SIZE+j;
#ifdef USE_SYMMETRIC
real dEdR = 0;
#else
real4 dEdR1 = (real4) 0;
real4 dEdR2 = (real4) 0;
#endif
real tempEnergy = 0;
COMPUTE_INTERACTION
// Every work-item loops over all atoms of the same block, so each pair is visited
// from both of its atoms; the energy is halved and only atom1's force is kept.
energy += 0.5f*tempEnergy;
#ifdef USE_SYMMETRIC
force.xyz -= delta.xyz*dEdR;
#else
force.xyz -= dEdR1.xyz;
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
}
}
else {
// This is an off-diagonal tile.
// Stage block y in local memory and clear its force accumulators.
const unsigned int localAtomIndex = get_local_id(0);
unsigned int j = y*TILE_SIZE + tgx;
real4 tempPosq = posq[j];
localData[localAtomIndex].x = tempPosq.x;
localData[localAtomIndex].y = tempPosq.y;
localData[localAtomIndex].z = tempPosq.z;
localData[localAtomIndex].q = tempPosq.w;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData[localAtomIndex].fx = 0;
localData[localAtomIndex].fy = 0;
localData[localAtomIndex].fz = 0;
#ifdef USE_CUTOFF
// Flags are only available while the tiles list is in use; otherwise all bits are
// treated as set. The sparse path is taken only for tiles without exclusion data.
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (!hasExclusions && flags != 0xFFFFFFFF) {
if (flags == 0) {
// No interactions in this tile.
}
else {
// Compute only a subset of the interactions in this tile.
// All work-items of the warp process the same atom2 (index j) in each iteration.
for (j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
bool isExcluded = false;
int atom2 = tbx+j;
int bufferIndex = 3*get_local_id(0);
#ifdef USE_SYMMETRIC
real dEdR = 0;
#else
real4 dEdR1 = (real4) 0;
real4 dEdR2 = (real4) 0;
#endif
real tempEnergy = 0;
real4 posq2 = (real4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
// This inner USE_CUTOFF test is always true here (we are already inside one).
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
COMPUTE_INTERACTION
energy += tempEnergy;
#ifdef USE_CUTOFF
}
#endif
// Beyond the cutoff dEdR/dEdR1/dEdR2 keep their zero initial values, so the
// updates below add nothing for such pairs.
#ifdef USE_SYMMETRIC
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
tempBuffer[bufferIndex] = delta.x;
tempBuffer[bufferIndex+1] = delta.y;
tempBuffer[bufferIndex+2] = delta.z;
#else
force.xyz -= dEdR1.xyz;
tempBuffer[bufferIndex] = dEdR2.x;
tempBuffer[bufferIndex+1] = dEdR2.y;
tempBuffer[bufferIndex+2] = dEdR2.z;
#endif
// Sum the forces on atom2.
// Stage 1: every fourth work-item adds the entries of the three work-items after it.
// Stage 2: work-item 0 of the warp adds the eight partial sums (12 reals apart)
// into atom2's accumulator. No barrier is issued between the stages.
if (tgx % 4 == 0) {
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11];
}
if (tgx == 0) {
localData[tbx+j].fx += tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
localData[tbx+j].fy += tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
localData[tbx+j].fz += tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
}
}
}
}
}
else
#endif
{
// Compute the full set of interactions in this tile.
#ifdef USE_EXCLUSIONS
// Rotate the mask right by tgx so that bit 0 lines up with this work-item's first atom2 (tj = tgx).
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
#endif
// Each work-item starts at a different atom2 (tj = tgx) and advances cyclically, so
// in any given iteration the work-items of a warp address distinct localData entries.
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = tbx+tj;
real4 posq2 = (real4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+tj;
#ifdef USE_SYMMETRIC
real dEdR = 0;
#else
real4 dEdR1 = (real4) 0;
real4 dEdR2 = (real4) 0;
#endif
real tempEnergy = 0;
COMPUTE_INTERACTION
energy += tempEnergy;
// atom1's force stays in the private variable; atom2's share goes to local memory.
#ifdef USE_SYMMETRIC
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z;
#else
force.xyz -= dEdR1.xyz;
localData[tbx+tj].fx += dEdR2.x;
localData[tbx+tj].fy += dEdR2.y;
localData[tbx+tj].fz += dEdR2.z;
#endif
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
}
}
// Write results. We need to coordinate between warps to make sure no two of them
// ever try to write to the same piece of memory at the same time.
#ifdef SUPPORTS_64_BIT_ATOMICS
// Atomic path: forces are converted to fixed point (scaled by 0x100000000) and added
// with atom_add, so no coordination between warps is needed.
if (pos < end) {
const unsigned int offset = x*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset], (long) (force.x*0x100000000));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
}
if (pos < end && x != y) {
const unsigned int offset = y*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset], (long) (localData[get_local_id(0)].fx*0x100000000));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fy*0x100000000));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fz*0x100000000));
}
#else
// Non-atomic path: each warp publishes the blocks it wants to update (-1 = nothing),
// then waits until no lower-indexed warp of the work-group still reserves one of them.
int writeX = (pos < end ? x : -1);
int writeY = (pos < end && x != y ? y : -1);
if (tgx == 0)
reservedBlocks[localGroupIndex] = (int2)(writeX, writeY);
bool done = false;
int doneIndex = 0;
int checkIndex = 0;
while (true) {
// See if any warp still needs to write its data.
bool allDone = true;
// NOTE(review): this barrier sits inside the per-tile do/while loop, so it assumes every
// warp of the work-group executes the same number of outer iterations - confirm.
barrier(CLK_LOCAL_MEM_FENCE);
while (doneIndex < WARPS_PER_GROUP && allDone) {
if (reservedBlocks[doneIndex].x != -1)
allDone = false;
else
doneIndex++;
}
if (allDone)
break;
if (!done) {
// See whether this warp can write its data. This requires that no previous warp
// is trying to write to the same block of the buffer.
bool canWrite = (writeX != -1);
while (checkIndex < localGroupIndex && canWrite) {
if ((reservedBlocks[checkIndex].x == x || reservedBlocks[checkIndex].y == x) ||
(writeY != -1 && (reservedBlocks[checkIndex].x == y || reservedBlocks[checkIndex].y == y)))
canWrite = false;
else
checkIndex++;
}
if (canWrite) {
// Write the data to global memory, then mark this warp as done.
// Each work-group updates its own section of forceBuffers.
if (writeX > -1) {
const unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += force.xyz;
}
if (writeY > -1) {
const unsigned int offset = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset] += (real4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0);
}
done = true;
if (tgx == 0)
reservedBlocks[localGroupIndex] = (int2)(-1, -1);
}
}
}
#endif
pos++;
} while (pos < end);
// Accumulate this work-item's share of the energy into its own slot.
energyBuffer[get_global_id(0)] += energy;
}
...@@ -15,6 +15,7 @@ __kernel void updateBsplines(__global const real4* restrict posq, __global real4 ...@@ -15,6 +15,7 @@ __kernel void updateBsplines(__global const real4* restrict posq, __global real4
((int) t.y) % GRID_SIZE_Y, ((int) t.y) % GRID_SIZE_Y,
((int) t.z) % GRID_SIZE_Z, 0); ((int) t.z) % GRID_SIZE_Z, 0);
pmeAtomGridIndex[i] = (int2) (i, gridIndex.x*GRID_SIZE_Y*GRID_SIZE_Z+gridIndex.y*GRID_SIZE_Z+gridIndex.z); pmeAtomGridIndex[i] = (int2) (i, gridIndex.x*GRID_SIZE_Y*GRID_SIZE_Z+gridIndex.y*GRID_SIZE_Z+gridIndex.z);
#ifndef SUPPORTS_64_BIT_ATOMICS
data[PME_ORDER-1] = 0.0f; data[PME_ORDER-1] = 0.0f;
data[1] = dr; data[1] = dr;
data[0] = 1.0f-dr; data[0] = 1.0f-dr;
...@@ -33,6 +34,7 @@ __kernel void updateBsplines(__global const real4* restrict posq, __global real4 ...@@ -33,6 +34,7 @@ __kernel void updateBsplines(__global const real4* restrict posq, __global real4
data[j].w = pos.w; // Storing the charge here improves cache coherency in the charge spreading kernel data[j].w = pos.w; // Storing the charge here improves cache coherency in the charge spreading kernel
pmeBsplineTheta[i+j*NUM_ATOMS] = data[j]; pmeBsplineTheta[i+j*NUM_ATOMS] = data[j];
} }
#endif
} }
} }
...@@ -80,56 +82,66 @@ __kernel void recordZIndex(__global int2* restrict pmeAtomGridIndex, __global co ...@@ -80,56 +82,66 @@ __kernel void recordZIndex(__global int2* restrict pmeAtomGridIndex, __global co
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#define BUFFER_SIZE (PME_ORDER*PME_ORDER*PME_ORDER) __kernel void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
__kernel __attribute__((reqd_work_group_size(BUFFER_SIZE, 1, 1)))
void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
__global long* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta, real4 periodicBoxSize, real4 invPeriodicBoxSize) { __global long* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
int ix = get_local_id(0)/(PME_ORDER*PME_ORDER); const real4 scale = 1/(real) (PME_ORDER-1);
int remainder = get_local_id(0)-ix*PME_ORDER*PME_ORDER; real4 data[PME_ORDER];
int iy = remainder/PME_ORDER;
int iz = remainder-iy*PME_ORDER; // Process the atoms in spatially sorted order. This improves efficiency when writing
__local real4 theta[PME_ORDER]; // the grid values.
__local real charge[BUFFER_SIZE];
__local int basex[BUFFER_SIZE]; for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) {
__local int basey[BUFFER_SIZE]; int atom = pmeAtomGridIndex[i].x;
__local int basez[BUFFER_SIZE]; real4 pos = posq[atom];
if (ix < PME_ORDER) { pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
for (int baseIndex = get_group_id(0)*BUFFER_SIZE; baseIndex < NUM_ATOMS; baseIndex += get_num_groups(0)*BUFFER_SIZE) { pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
// Load the next block of atoms into the buffers. pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
real4 t = (real4) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
(pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
(pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z, 0.0f);
int4 gridIndex = (int4) (((int) t.x) % GRID_SIZE_X,
((int) t.y) % GRID_SIZE_Y,
((int) t.z) % GRID_SIZE_Z, 0);
if (get_local_id(0) < BUFFER_SIZE) { // Since we need the full set of thetas, it's faster to compute them here than load them
int atomIndex = baseIndex+get_local_id(0); // from global memory.
if (atomIndex < NUM_ATOMS) {
real4 pos = posq[atomIndex]; real4 dr = (real4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f);
charge[get_local_id(0)] = pos.w; data[PME_ORDER-1] = 0.0f;
pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x; data[1] = dr;
pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y; data[0] = 1.0f-dr;
pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z; for (int j = 3; j < PME_ORDER; j++) {
basex[get_local_id(0)] = (int) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X); real div = RECIP(j-1.0f);
basey[get_local_id(0)] = (int) ((pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y); data[j-1] = div*dr*data[j-2];
basez[get_local_id(0)] = (int) ((pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z); for (int k = 1; k < (j-1); k++)
} data[j-k-1] = div*((dr+(real4) k) *data[j-k-2] + (-dr+(real4) (j-k))*data[j-k-1]);
} data[0] = div*(- dr+1.0f)*data[0];
barrier(CLK_LOCAL_MEM_FENCE); }
int lastIndex = min(BUFFER_SIZE, NUM_ATOMS-baseIndex); data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
for (int index = 0; index < lastIndex; index++) { for (int j = 1; j < (PME_ORDER-1); j++)
int atomIndex = index+baseIndex; data[PME_ORDER-j-1] = scale*((dr+(real4) j)*data[PME_ORDER-j-2] + (-dr+(real4) (PME_ORDER-j))*data[PME_ORDER-j-1]);
if (get_local_id(0) < PME_ORDER) data[0] = scale*(-dr+1.0f)*data[0];
theta[get_local_id(0)] = pmeBsplineTheta[atomIndex+get_local_id(0)*NUM_ATOMS];
barrier(CLK_LOCAL_MEM_FENCE); // Spread the charge from this atom onto each grid point.
real add = charge[index]*theta[ix].x*theta[iy].y*theta[iz].z;
int x = basex[index]+ix; for (int ix = 0; ix < PME_ORDER; ix++) {
int y = basey[index]+iy; int xindex = gridIndex.x+ix;
int z = basez[index]+iz; xindex -= (xindex >= GRID_SIZE_X ? GRID_SIZE_X : 0);
x -= (x >= GRID_SIZE_X ? GRID_SIZE_X : 0); for (int iy = 0; iy < PME_ORDER; iy++) {
y -= (y >= GRID_SIZE_Y ? GRID_SIZE_Y : 0); int yindex = gridIndex.y+iy;
z -= (z >= GRID_SIZE_Z ? GRID_SIZE_Z : 0); yindex -= (yindex >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
for (int iz = 0; iz < PME_ORDER; iz++) {
int zindex = gridIndex.z+iz;
zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
real add = pos.w*data[ix].x*data[iy].y*data[iz].z;
#ifdef USE_DOUBLE_PRECISION #ifdef USE_DOUBLE_PRECISION
atom_add(&pmeGrid[2*(x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z)], (long) (add*0x100000000)); atom_add(&pmeGrid[2*index], (long) (add*0x100000000));
#else #else
atom_add(&pmeGrid[x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z], (long) (add*0x100000000)); atom_add(&pmeGrid[index], (long) (add*0x100000000));
#endif #endif
}
} }
} }
} }
...@@ -149,6 +161,75 @@ __kernel void finishSpreadCharge(__global long* restrict pmeGrid) { ...@@ -149,6 +161,75 @@ __kernel void finishSpreadCharge(__global long* restrict pmeGrid) {
realGrid[index] = realValue; realGrid[index] = realValue;
} }
} }
#elif defined(DEVICE_IS_CPU)
/**
 * Spread the atomic charges onto the PME grid (DEVICE_IS_CPU variant).
 *
 * The grid is divided along x between the work-items: work-item g handles the
 * planes [slabStart, slabStop). Every work-item walks over all atoms and adds
 * only those contributions whose x index falls inside its own planes, so two
 * work-items never update the same grid element.
 *
 * Atoms are visited in plain index order; pmeAtomGridIndex, pmeAtomRange and
 * pmeBsplineTheta are not read by this variant. The grid is accumulated into
 * (+=) and is not cleared here.
 */
__kernel void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
        __global real2* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
    const int slabStart = get_global_id(0)*GRID_SIZE_X/get_global_size(0);
    const int slabStop = (get_global_id(0)+1)*GRID_SIZE_X/get_global_size(0);
    if (slabStart == slabStop)
        return; // This work-item owns no grid planes.
    const real4 scale = 1/(real) (PME_ORDER-1);
    real4 theta[PME_ORDER];

    for (int atom = 0; atom < NUM_ATOMS; atom++) {
        // Wrap the position into the periodic box and convert it to grid units.
        real4 pos = posq[atom];
        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
        pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
        pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
        real4 t = (real4) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
                           (pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
                           (pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z, 0.0f);
        int4 baseCell = (int4) (((int) t.x) % GRID_SIZE_X,
                                ((int) t.y) % GRID_SIZE_Y,
                                ((int) t.z) % GRID_SIZE_Z, 0);

        // The spline weights are only evaluated once we know that the atom
        // touches at least one plane owned by this work-item.
        bool thetasReady = false;
        for (int ix = 0; ix < PME_ORDER; ix++) {
            int gx = baseCell.x+ix;
            if (gx >= GRID_SIZE_X)
                gx -= GRID_SIZE_X;
            if (gx < slabStart || gx >= slabStop)
                continue;
            if (!thetasReady) {
                thetasReady = true;

                // Build the weights by recursion on the spline order, starting
                // from the fractional grid offset of the atom.
                real4 frac = (real4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f);
                theta[PME_ORDER-1] = 0.0f;
                theta[1] = frac;
                theta[0] = 1.0f-frac;
                for (int order = 3; order < PME_ORDER; order++) {
                    real invDenom = RECIP(order-1.0f);
                    theta[order-1] = invDenom*frac*theta[order-2];
                    for (int step = 1; step < (order-1); step++)
                        theta[order-step-1] = invDenom*((frac+(real4) step) *theta[order-step-2] + (-frac+(real4) (order-step))*theta[order-step-1]);
                    theta[0] = invDenom*(- frac+1.0f)*theta[0];
                }
                theta[PME_ORDER-1] = scale*frac*theta[PME_ORDER-2];
                for (int step = 1; step < (PME_ORDER-1); step++)
                    theta[PME_ORDER-step-1] = scale*((frac+(real4) step)*theta[PME_ORDER-step-2] + (-frac+(real4) (PME_ORDER-step))*theta[PME_ORDER-step-1]);
                theta[0] = scale*(-frac+1.0f)*theta[0];
            }

            // Add this atom's charge to every (y, z) point of plane gx it touches.
            for (int iy = 0; iy < PME_ORDER; iy++) {
                int gy = baseCell.y+iy;
                if (gy >= GRID_SIZE_Y)
                    gy -= GRID_SIZE_Y;
                for (int iz = 0; iz < PME_ORDER; iz++) {
                    int gz = baseCell.z+iz;
                    if (gz >= GRID_SIZE_Z)
                        gz -= GRID_SIZE_Z;
                    int cell = gx*GRID_SIZE_Y*GRID_SIZE_Z + gy*GRID_SIZE_Z + gz;
                    pmeGrid[cell].x += EPSILON_FACTOR*pos.w*theta[ix].x*theta[iy].y*theta[iz].z;
                }
            }
        }
    }
}
#else #else
__kernel void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange, __kernel void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
__global real2* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta) { __global real2* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta) {
...@@ -239,11 +320,16 @@ __kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global r ...@@ -239,11 +320,16 @@ __kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global r
} }
__kernel void gridInterpolateForce(__global const real4* restrict posq, __global real4* restrict forceBuffers, __global const real2* restrict pmeGrid, __kernel void gridInterpolateForce(__global const real4* restrict posq, __global real4* restrict forceBuffers, __global const real2* restrict pmeGrid,
real4 periodicBoxSize, real4 invPeriodicBoxSize, __local real4* restrict bsplinesCache) { real4 periodicBoxSize, real4 invPeriodicBoxSize, __global int2* restrict pmeAtomGridIndex) {
const real4 scale = 1/(real) (PME_ORDER-1); const real4 scale = 1/(real) (PME_ORDER-1);
__local real4* data = &bsplinesCache[get_local_id(0)*PME_ORDER]; real4 data[PME_ORDER];
__local real4* ddata = &bsplinesCache[get_local_id(0)*PME_ORDER + get_local_size(0)*PME_ORDER]; real4 ddata[PME_ORDER];
for (int atom = get_global_id(0); atom < NUM_ATOMS; atom += get_global_size(0)) {
// Process the atoms in spatially sorted order. This improves cache performance when loading
// the grid values.
for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) {
int atom = pmeAtomGridIndex[i].x;
real4 force = 0.0f; real4 force = 0.0f;
real4 pos = posq[atom]; real4 pos = posq[atom];
pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x; pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
...@@ -293,29 +379,10 @@ __kernel void gridInterpolateForce(__global const real4* restrict posq, __global ...@@ -293,29 +379,10 @@ __kernel void gridInterpolateForce(__global const real4* restrict posq, __global
real gridvalue = pmeGrid[index].x; real gridvalue = pmeGrid[index].x;
force.x += ddata[ix].x*data[iy].y*data[iz].z*gridvalue; force.x += ddata[ix].x*data[iy].y*data[iz].z*gridvalue;
force.y += data[ix].x*ddata[iy].y*data[iz].z*gridvalue; force.y += data[ix].x*ddata[iy].y*data[iz].z*gridvalue;
#ifndef MAC_AMD_WORKAROUND
force.z += data[ix].x*data[iy].y*ddata[iz].z*gridvalue;
#endif
}
}
}
#ifdef MAC_AMD_WORKAROUND
for (int ix = 0; ix < PME_ORDER; ix++) {
int xindex = gridIndex.x+ix;
xindex -= (xindex >= GRID_SIZE_X ? GRID_SIZE_X : 0);
for (int iy = 0; iy < PME_ORDER; iy++) {
int yindex = gridIndex.y+iy;
yindex -= (yindex >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
for (int iz = 0; iz < PME_ORDER; iz++) {
int zindex = gridIndex.z+iz;
zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
real gridvalue = pmeGrid[index].x;
force.z += data[ix].x*data[iy].y*ddata[iz].z*gridvalue; force.z += data[ix].x*data[iy].y*ddata[iz].z*gridvalue;
} }
} }
} }
#endif
real4 totalForce = forceBuffers[atom]; real4 totalForce = forceBuffers[atom];
real q = pos.w*EPSILON_FACTOR; real q = pos.w*EPSILON_FACTOR;
totalForce.x -= q*force.x*GRID_SIZE_X*invPeriodicBoxSize.x; totalForce.x -= q*force.x*GRID_SIZE_X*invPeriodicBoxSize.x;
......
/**
 * Evaluate the spline weights of every atom and store them in global memory.
 *
 * For each atom the fractional position inside its grid cell is computed, the
 * weights are built by recursion on the spline order, and the results are
 * written to pmeBsplineTheta[atom + j*NUM_ATOMS]. The differences of the
 * order-(PME_ORDER-1) weights are written to pmeBsplineDTheta with the same
 * layout.
 *
 * bsplinesCache is per-work-item scratch space in local memory; the indexing
 * below requires room for 2*get_local_size(0)*PME_ORDER real4 values.
 * pmeAtomGridIndex is not referenced by this implementation.
 */
__kernel void updateBsplines(__global const real4* restrict posq, __global real4* restrict pmeBsplineTheta, __local real4* restrict bsplinesCache, __global int2* restrict pmeAtomGridIndex, real4 periodicBoxSize, real4 invPeriodicBoxSize, __global real4* restrict pmeBsplineDTheta) {
    const real4 scale = 1.0f/(PME_ORDER-1);

    // Scratch arrays of this work-item: weights first, differences in the second half.
    __local real4* theta = &bsplinesCache[get_local_id(0)*PME_ORDER];
    __local real4* dtheta = &bsplinesCache[get_local_id(0)*PME_ORDER + get_local_size(0)*PME_ORDER];

    for (int atom = get_global_id(0); atom < NUM_ATOMS; atom += get_global_size(0)) {
        for (int n = 0; n < PME_ORDER; n++) {
            theta[n] = 0;
            dtheta[n] = 0;
        }

        // Wrap the position into the periodic box and convert it to grid units.
        real4 pos = posq[atom];
        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
        pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
        pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
        real4 t = (real4) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
                           (pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
                           (pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z, 0);
        real4 frac = (real4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0);

        // Recursion on the spline order, up to order PME_ORDER-1.
        theta[PME_ORDER-1] = 0;
        theta[1] = frac;
        theta[0] = 1.0f-frac;
        for (int order = 3; order < PME_ORDER; order++) {
            real invDenom = 1.0f/(order-1.0f);
            theta[order-1] = invDenom*frac*theta[order-2];
            for (int step = 1; step < (order-1); step++)
                theta[order-step-1] = invDenom*((frac+(real4) step) *theta[order-step-2] + (-frac+(real4) (order-step))*theta[order-step-1]);
            theta[0] = invDenom*(- frac+1.0f)*theta[0];
        }

        // The differences have to be taken before the final recursion step
        // overwrites the lower-order weights.
        dtheta[0] = -theta[0];
        for (int n = 1; n < PME_ORDER; n++)
            dtheta[n] = theta[n-1]-theta[n];

        // Final recursion step, giving the weights of order PME_ORDER.
        theta[PME_ORDER-1] = scale*frac*theta[PME_ORDER-2];
        for (int step = 1; step < (PME_ORDER-1); step++)
            theta[PME_ORDER-step-1] = scale*((frac+(real4) step)*theta[PME_ORDER-step-2] + (-frac+(real4) (PME_ORDER-step))*theta[PME_ORDER-step-1]);
        theta[0] = scale*(-frac+1.0f)*theta[0];

        // Publish both sets of values, coefficient-major.
        for (int n = 0; n < PME_ORDER; n++) {
            pmeBsplineTheta[atom+n*NUM_ATOMS] = theta[n];
            pmeBsplineDTheta[atom+n*NUM_ATOMS] = dtheta[n];
        }
    }
}
/**
 * This kernel is not actually used when running on a CPU.
 *
 * The body is intentionally empty: none of the arguments are read or written.
 * NOTE(review): presumably the definition is kept so that host code creating a
 * kernel with this name and argument list still works - confirm against the caller.
 */
__kernel void findAtomRangeForGrid(__global const int2* restrict pmeAtomGridIndex, __global int* restrict pmeAtomRange, __global const real4* restrict posq, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
}
/**
 * Spread the atomic charges onto the PME grid (CPU implementation).
 *
 * The grid is divided along x between the work-items: work-item g owns the
 * planes [firstx, lastx). Each work-item first clears its own planes and then
 * walks over all atoms, adding only the contributions whose x index lies in
 * its planes, so no two work-items ever write the same grid element.
 *
 * posq                 Per-atom position (xyz) and charge (w).
 * pmeAtomGridIndex     Not referenced by this implementation.
 * pmeAtomRange         Not referenced by this implementation.
 * pmeGrid              Output grid; the .x component receives the charge, scaled by EPSILON_FACTOR.
 * pmeBsplineTheta      Precomputed weights; coefficient j of atom a is at [a + j*NUM_ATOMS],
 *                      with the x/y/z components used for the respective axis.
 * periodicBoxSize,
 * invPeriodicBoxSize   Box dimensions and their inverses.
 */
__kernel void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange, __global real2* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
    const int firstx = get_global_id(0)*GRID_SIZE_X/get_global_size(0);
    const int lastx = (get_global_id(0)+1)*GRID_SIZE_X/get_global_size(0);

    // Clear the planes owned by this work-item.
    for (int gridIndex = firstx*GRID_SIZE_Y*GRID_SIZE_Z; gridIndex < lastx*GRID_SIZE_Y*GRID_SIZE_Z; gridIndex++)
        pmeGrid[gridIndex] = (real2) 0;

    for (int atom = 0; atom < NUM_ATOMS; atom++) {
        // Wrap the position into the periodic box and locate its base grid cell.
        real4 pos = posq[atom];
        pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
        pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
        pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
        real4 t = (real4) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
                           (pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
                           (pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z, 0);
        int4 gridIndex = (int4) (((int) t.x) % GRID_SIZE_X,
                                 ((int) t.y) % GRID_SIZE_Y,
                                 ((int) t.z) % GRID_SIZE_Z, 0);
        real atomCharge = pos.w*EPSILON_FACTOR;
        for (int ix = 0; ix < PME_ORDER; ix++) {
            int xindex = gridIndex.x+ix;
            xindex -= (xindex >= GRID_SIZE_X ? GRID_SIZE_X : 0);
            if (xindex < firstx || xindex >= lastx)
                continue; // Plane belongs to another work-item.

            // Factors that do not depend on the inner loops are evaluated once per
            // level. The multiplication order (charge*x*y*z) is kept unchanged.
            const real chargeX = atomCharge*pmeBsplineTheta[atom+ix*NUM_ATOMS].x;
            for (int iy = 0; iy < PME_ORDER; iy++) {
                int yindex = gridIndex.y+iy;
                yindex -= (yindex >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
                const real chargeXY = chargeX*pmeBsplineTheta[atom+iy*NUM_ATOMS].y;
                const int rowStart = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z;
                for (int iz = 0; iz < PME_ORDER; iz++) {
                    int zindex = gridIndex.z+iz;
                    zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
                    pmeGrid[rowStart + zindex].x += chargeXY*pmeBsplineTheta[atom+iz*NUM_ATOMS].z;
                }
            }
        }
    }
}
/**
 * Scale every element of the PME grid by its reciprocal-space factor and accumulate
 * the corresponding energy.
 *
 * Work-items stride over the flattened grid starting at their global id.  The element
 * with kx == ky == kz == 0 is left untouched.  Each work-item adds half of its
 * accumulated sum to its own slot of energyBuffer.
 *
 * pmeGrid              grid of two-component values, scaled in place
 *                      (presumably real/imaginary parts of the transformed grid -- confirm)
 * energyBuffer         per-work-item energy accumulator, indexed by global id
 * pmeBsplineModuliX/Y/Z  per-axis moduli, indexed by kx / ky / kz
 * invPeriodicBoxSize   reciprocal box dimensions
 * recipScaleFactor     overall prefactor applied to every term
 */
__kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global real* restrict energyBuffer, __global const real* restrict pmeBsplineModuliX,
        __global const real* restrict pmeBsplineModuliY, __global const real* restrict pmeBsplineModuliZ, real4 invPeriodicBoxSize, real recipScaleFactor) {
    const unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
    real energySum = 0;
    for (int index = get_global_id(0); index < numGridPoints; index += get_global_size(0)) {
        // Decompose the flat index into its three grid coordinates.
        const int kx = index/(GRID_SIZE_Y*GRID_SIZE_Z);
        const int inPlane = index-kx*GRID_SIZE_Y*GRID_SIZE_Z;
        const int ky = inPlane/GRID_SIZE_Z;
        const int kz = inPlane-ky*GRID_SIZE_Z;

        if (kx != 0 || ky != 0 || kz != 0) {
            // Indices in the upper half of each axis are shifted down by the grid size.
            int mx = kx;
            if (kx >= (GRID_SIZE_X+1)/2)
                mx = kx-GRID_SIZE_X;
            int my = ky;
            if (ky >= (GRID_SIZE_Y+1)/2)
                my = ky-GRID_SIZE_Y;
            int mz = kz;
            if (kz >= (GRID_SIZE_Z+1)/2)
                mz = kz-GRID_SIZE_Z;

            const real mhx = mx*invPeriodicBoxSize.x;
            const real mhy = my*invPeriodicBoxSize.y;
            const real mhz = mz*invPeriodicBoxSize.z;
            const real m2 = mhx*mhx+mhy*mhy+mhz*mhz;

            const real2 value = pmeGrid[index];
            const real denom = m2*pmeBsplineModuliX[kx]*pmeBsplineModuliY[ky]*pmeBsplineModuliZ[kz];
            const real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;

            pmeGrid[index] = (real2) (value.x*eterm, value.y*eterm);
            energySum += eterm*(value.x*value.x + value.y*value.y);
        }
    }
    energyBuffer[get_global_id(0)] += 0.5f*energySum;
}
/**
 * Interpolate the force on each atom from the PME grid.
 *
 * Work-items stride over the atoms starting at their global id.  For every atom the
 * PME_ORDER^3 neighbouring grid values (.x component) are combined with the stored
 * spline weights and their derivatives, and the result, scaled by the charge and by
 * GRID_SIZE*invPeriodicBoxSize per axis, is subtracted from forceBuffers[atom].
 *
 * posq                atom data; .xyz is the position, .w*EPSILON_FACTOR the charge factor
 * forceBuffers        per-atom force, updated in place (only .x/.y/.z change)
 * pmeGrid             grid to interpolate from
 * periodicBoxSize     box dimensions used to wrap positions into the box
 * invPeriodicBoxSize  reciprocal box dimensions
 * pmeBsplineTheta     spline weights, indexed as [atom + i*NUM_ATOMS]
 * pmeBsplineDTheta    spline weight derivatives, same layout
 */
__kernel void gridInterpolateForce(__global const real4* restrict posq, __global real4* restrict forceBuffers, __global const real2* restrict pmeGrid, real4 periodicBoxSize, real4 invPeriodicBoxSize, __global const real4* restrict pmeBsplineTheta, __global const real4* restrict pmeBsplineDTheta) {
    for (int atom = get_global_id(0); atom < NUM_ATOMS; atom += get_global_size(0)) {
        // Wrap the atom into the periodic box.
        real4 wrapped = posq[atom];
        wrapped.x -= floor(wrapped.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
        wrapped.y -= floor(wrapped.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
        wrapped.z -= floor(wrapped.z*invPeriodicBoxSize.z)*periodicBoxSize.z;

        // Position in grid units and the first grid point touched along each axis.
        const real gx = (wrapped.x*invPeriodicBoxSize.x)*GRID_SIZE_X;
        const real gy = (wrapped.y*invPeriodicBoxSize.y)*GRID_SIZE_Y;
        const real gz = (wrapped.z*invPeriodicBoxSize.z)*GRID_SIZE_Z;
        const int baseX = ((int) gx) % GRID_SIZE_X;
        const int baseY = ((int) gy) % GRID_SIZE_Y;
        const int baseZ = ((int) gz) % GRID_SIZE_Z;

        real fx = 0;
        real fy = 0;
        real fz = 0;
        for (int ix = 0; ix < PME_ORDER; ix++) {
            int xindex = baseX+ix;
            if (xindex >= GRID_SIZE_X)
                xindex -= GRID_SIZE_X;
            const real tx = pmeBsplineTheta[atom+ix*NUM_ATOMS].x;
            const real dtx = pmeBsplineDTheta[atom+ix*NUM_ATOMS].x;
            for (int iy = 0; iy < PME_ORDER; iy++) {
                int yindex = baseY+iy;
                if (yindex >= GRID_SIZE_Y)
                    yindex -= GRID_SIZE_Y;
                const real ty = pmeBsplineTheta[atom+iy*NUM_ATOMS].y;
                const real dty = pmeBsplineDTheta[atom+iy*NUM_ATOMS].y;
                for (int iz = 0; iz < PME_ORDER; iz++) {
                    int zindex = baseZ+iz;
                    if (zindex >= GRID_SIZE_Z)
                        zindex -= GRID_SIZE_Z;
                    const real tz = pmeBsplineTheta[atom+iz*NUM_ATOMS].z;
                    const real dtz = pmeBsplineDTheta[atom+iz*NUM_ATOMS].z;
                    const real gridvalue = pmeGrid[xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex].x;
                    fx += dtx*ty*tz*gridvalue;
                    fy += tx*dty*tz*gridvalue;
                    fz += tx*ty*dtz*gridvalue;
                }
            }
        }

        // Fold the interpolated force into the atom's existing force entry.
        const real q = wrapped.w*EPSILON_FACTOR;
        real4 updated = forceBuffers[atom];
        updated.x -= q*fx*GRID_SIZE_X*invPeriodicBoxSize.x;
        updated.y -= q*fy*GRID_SIZE_Y*invPeriodicBoxSize.y;
        updated.z -= q*fz*GRID_SIZE_Z*invPeriodicBoxSize.z;
        forceBuffers[atom] = updated;
    }
}
...@@ -25,7 +25,7 @@ __kernel void generateRandomNumbers(int numValues, __global float4* restrict ran ...@@ -25,7 +25,7 @@ __kernel void generateRandomNumbers(int numValues, __global float4* restrict ran
state.y ^= state.y << 13; state.y ^= state.y << 13;
state.y ^= state.y >> 17; state.y ^= state.y >> 17;
state.y ^= state.y << 5; state.y ^= state.y << 5;
x1 = sqrt(-2.0f * log(x1)); x1 = SQRT(-2.0f * LOG(x1));
k = (state.z >> 2) + (state.w >> 3) + (carry >> 2); k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
m = state.w + state.w + state.z + carry; m = state.w + state.w + state.z + carry;
state.z = state.w; state.z = state.w;
...@@ -50,7 +50,7 @@ __kernel void generateRandomNumbers(int numValues, __global float4* restrict ran ...@@ -50,7 +50,7 @@ __kernel void generateRandomNumbers(int numValues, __global float4* restrict ran
state.y ^= state.y << 13; state.y ^= state.y << 13;
state.y ^= state.y >> 17; state.y ^= state.y >> 17;
state.y ^= state.y << 5; state.y ^= state.y << 5;
x3 = sqrt(-2.0f * log(x3)); x3 = SQRT(-2.0f * LOG(x3));
k = (state.z >> 2) + (state.w >> 3) + (carry >> 2); k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
m = state.w + state.w + state.z + carry; m = state.w + state.w + state.z + carry;
state.z = state.w; state.z = state.w;
...@@ -75,7 +75,7 @@ __kernel void generateRandomNumbers(int numValues, __global float4* restrict ran ...@@ -75,7 +75,7 @@ __kernel void generateRandomNumbers(int numValues, __global float4* restrict ran
state.y ^= state.y << 13; state.y ^= state.y << 13;
state.y ^= state.y >> 17; state.y ^= state.y >> 17;
state.y ^= state.y << 5; state.y ^= state.y << 5;
x5 = sqrt(-2.0f * log(x5)); x5 = SQRT(-2.0f * LOG(x5));
k = (state.z >> 2) + (state.w >> 3) + (carry >> 2); k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
m = state.w + state.w + state.z + carry; m = state.w + state.w + state.z + carry;
state.z = state.w; state.z = state.w;
...@@ -100,7 +100,7 @@ __kernel void generateRandomNumbers(int numValues, __global float4* restrict ran ...@@ -100,7 +100,7 @@ __kernel void generateRandomNumbers(int numValues, __global float4* restrict ran
state.y ^= state.y << 13; state.y ^= state.y << 13;
state.y ^= state.y >> 17; state.y ^= state.y >> 17;
state.y ^= state.y << 5; state.y ^= state.y << 5;
x7 = sqrt(-2.0f * log(x7)); x7 = SQRT(-2.0f * LOG(x7));
k = (state.z >> 2) + (state.w >> 3) + (carry >> 2); k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
m = state.w + state.w + state.z + carry; m = state.w + state.w + state.z + carry;
state.z = state.w; state.z = state.w;
......
...@@ -63,9 +63,9 @@ __kernel void applySettle(int numClusters, mixed tol, __global const real4* rest ...@@ -63,9 +63,9 @@ __kernel void applySettle(int numClusters, mixed tol, __global const real4* rest
mixed yaksYd = zaksZd*xaksXd - xaksZd*zaksXd; mixed yaksYd = zaksZd*xaksXd - xaksZd*zaksXd;
mixed zaksYd = xaksZd*yaksXd - yaksZd*xaksXd; mixed zaksYd = xaksZd*yaksXd - yaksZd*xaksXd;
mixed axlng = sqrt(xaksXd*xaksXd + yaksXd*yaksXd + zaksXd*zaksXd); mixed axlng = SQRT(xaksXd*xaksXd + yaksXd*yaksXd + zaksXd*zaksXd);
mixed aylng = sqrt(xaksYd*xaksYd + yaksYd*yaksYd + zaksYd*zaksYd); mixed aylng = SQRT(xaksYd*xaksYd + yaksYd*yaksYd + zaksYd*zaksYd);
mixed azlng = sqrt(xaksZd*xaksZd + yaksZd*yaksZd + zaksZd*zaksZd); mixed azlng = SQRT(xaksZd*xaksZd + yaksZd*yaksZd + zaksZd*zaksZd);
mixed trns11 = xaksXd / axlng; mixed trns11 = xaksXd / axlng;
mixed trns21 = yaksXd / axlng; mixed trns21 = yaksXd / axlng;
mixed trns31 = zaksXd / axlng; mixed trns31 = zaksXd / axlng;
...@@ -91,13 +91,13 @@ __kernel void applySettle(int numClusters, mixed tol, __global const real4* rest ...@@ -91,13 +91,13 @@ __kernel void applySettle(int numClusters, mixed tol, __global const real4* rest
// --- Step2 A2' --- // --- Step2 A2' ---
float rc = 0.5*params.y; float rc = 0.5*params.y;
mixed rb = sqrt(params.x*params.x-rc*rc); mixed rb = SQRT(params.x*params.x-rc*rc);
mixed ra = rb*(m1+m2)*invTotalMass; mixed ra = rb*(m1+m2)*invTotalMass;
rb -= ra; rb -= ra;
mixed sinphi = za1d / ra; mixed sinphi = za1d / ra;
mixed cosphi = sqrt(1.0f - sinphi*sinphi); mixed cosphi = SQRT(1.0f - sinphi*sinphi);
mixed sinpsi = (zb1d - zc1d) / (2*rc*cosphi); mixed sinpsi = (zb1d - zc1d) / (2*rc*cosphi);
mixed cospsi = sqrt(1.0f - sinpsi*sinpsi); mixed cospsi = SQRT(1.0f - sinpsi*sinpsi);
mixed ya2d = ra*cosphi; mixed ya2d = ra*cosphi;
mixed xb2d = - rc*cospsi; mixed xb2d = - rc*cospsi;
...@@ -105,7 +105,7 @@ __kernel void applySettle(int numClusters, mixed tol, __global const real4* rest ...@@ -105,7 +105,7 @@ __kernel void applySettle(int numClusters, mixed tol, __global const real4* rest
mixed yc2d = - rb*cosphi + rc*sinpsi*sinphi; mixed yc2d = - rb*cosphi + rc*sinpsi*sinphi;
mixed xb2d2 = xb2d*xb2d; mixed xb2d2 = xb2d*xb2d;
mixed hh2 = 4.0f*xb2d2 + (yb2d-yc2d)*(yb2d-yc2d) + (zb1d-zc1d)*(zb1d-zc1d); mixed hh2 = 4.0f*xb2d2 + (yb2d-yc2d)*(yb2d-yc2d) + (zb1d-zc1d)*(zb1d-zc1d);
mixed deltx = 2.0f*xb2d + sqrt(4.0f*xb2d2 - hh2 + params.y*params.y); mixed deltx = 2.0f*xb2d + SQRT(4.0f*xb2d2 - hh2 + params.y*params.y);
xb2d -= deltx*0.5; xb2d -= deltx*0.5;
// --- Step3 al,be,ga --- // --- Step3 al,be,ga ---
...@@ -115,11 +115,11 @@ __kernel void applySettle(int numClusters, mixed tol, __global const real4* rest ...@@ -115,11 +115,11 @@ __kernel void applySettle(int numClusters, mixed tol, __global const real4* rest
mixed gamma = xb0d*yb1d - xb1d*yb0d + xc0d*yc1d - xc1d*yc0d; mixed gamma = xb0d*yb1d - xb1d*yb0d + xc0d*yc1d - xc1d*yc0d;
mixed al2be2 = alpha*alpha + beta*beta; mixed al2be2 = alpha*alpha + beta*beta;
mixed sintheta = (alpha*gamma - beta*sqrt(al2be2 - gamma*gamma)) / al2be2; mixed sintheta = (alpha*gamma - beta*SQRT(al2be2 - gamma*gamma)) / al2be2;
// --- Step4 A3' --- // --- Step4 A3' ---
mixed costheta = sqrt(1.0f - sintheta*sintheta); mixed costheta = SQRT(1.0f - sintheta*sintheta);
mixed xa3d = - ya2d*sintheta; mixed xa3d = - ya2d*sintheta;
mixed ya3d = ya2d*costheta; mixed ya3d = ya2d*costheta;
mixed za3d = za1d; mixed za3d = za1d;
...@@ -186,9 +186,9 @@ __kernel void constrainVelocities(int numClusters, mixed tol, __global const rea ...@@ -186,9 +186,9 @@ __kernel void constrainVelocities(int numClusters, mixed tol, __global const rea
mixed4 eAB = apos1-apos0; mixed4 eAB = apos1-apos0;
mixed4 eBC = apos2-apos1; mixed4 eBC = apos2-apos1;
mixed4 eCA = apos0-apos2; mixed4 eCA = apos0-apos2;
eAB.xyz /= sqrt(eAB.x*eAB.x + eAB.y*eAB.y + eAB.z*eAB.z); eAB.xyz /= SQRT(eAB.x*eAB.x + eAB.y*eAB.y + eAB.z*eAB.z);
eBC.xyz /= sqrt(eBC.x*eBC.x + eBC.y*eBC.y + eBC.z*eBC.z); eBC.xyz /= SQRT(eBC.x*eBC.x + eBC.y*eBC.y + eBC.z*eBC.z);
eCA.xyz /= sqrt(eCA.x*eCA.x + eCA.y*eCA.y + eCA.z*eCA.z); eCA.xyz /= SQRT(eCA.x*eCA.x + eCA.y*eCA.y + eCA.z*eCA.z);
mixed vAB = (v1.x-v0.x)*eAB.x + (v1.y-v0.y)*eAB.y + (v1.z-v0.z)*eAB.z; mixed vAB = (v1.x-v0.x)*eAB.x + (v1.y-v0.y)*eAB.y + (v1.z-v0.z)*eAB.z;
mixed vBC = (v2.x-v1.x)*eBC.x + (v2.y-v1.y)*eBC.y + (v2.z-v1.z)*eBC.z; mixed vBC = (v2.x-v1.x)*eBC.x + (v2.y-v1.y)*eBC.y + (v2.z-v1.z)*eBC.z;
mixed vCA = (v0.x-v2.x)*eCA.x + (v0.y-v2.y)*eCA.y + (v0.z-v2.z)*eCA.z; mixed vCA = (v0.x-v2.x)*eCA.x + (v0.y-v2.y)*eCA.y + (v0.z-v2.z)*eCA.z;
......
...@@ -4,6 +4,47 @@ KEY_TYPE getValue(DATA_TYPE value) { ...@@ -4,6 +4,47 @@ KEY_TYPE getValue(DATA_TYPE value) {
return SORT_KEY; return SORT_KEY;
} }
/**
 * Sort a list that is short enough to entirely fit in local memory.  This is executed as
 * a single thread block: all synchronization is done with work-group barriers only.
 *
 * data        the global array to sort; it is overwritten with the sorted elements
 * length      the number of elements in data
 * dataBuffer  local scratch space; elements 0..length-1 are used, so it must have room
 *             for at least length elements
 *
 * Elements are ordered by the key returned by getValue().
 */
__kernel void sortShortList(__global DATA_TYPE* restrict data, uint length, __local DATA_TYPE* dataBuffer) {
    // Load the data into local memory.

    for (unsigned int index = get_local_id(0); index < length; index += get_local_size(0))
        dataBuffer[index] = data[index];
    barrier(CLK_LOCAL_MEM_FENCE);

    // Perform a bitonic sort in local memory.  The k and j loop bounds depend only on
    // length, so every work-item reaches each barrier the same number of times.

    for (unsigned int k = 2; k < 2*length; k *= 2) {
        for (unsigned int j = k/2; j > 0; j /= 2) {
            for (unsigned int i = get_local_id(0); i < length; i += get_local_size(0)) {
                // Each pair (i, ixj) is handled once, by the work-item holding the lower index.
                unsigned int ixj = i^j;
                if (ixj > i && ixj < length) {
                    DATA_TYPE value1 = dataBuffer[i];
                    DATA_TYPE value2 = dataBuffer[ixj];
                    bool ascending = ((i&k) == 0);
                    // NOTE(review): this extra pass over the higher bits of i appears to
                    // adjust the sort direction so that lengths which are not a power of
                    // two still end up correctly ordered -- confirm.
                    for (unsigned int mask = k*2; mask < 2*length; mask *= 2)
                        ascending = ((i&mask) == 0 ? !ascending : ascending);
                    KEY_TYPE lowKey = (ascending ? getValue(value1) : getValue(value2));
                    KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
                    if (lowKey > highKey) {
                        dataBuffer[i] = value2;
                        dataBuffer[ixj] = value1;
                    }
                }
            }
            barrier(CLK_LOCAL_MEM_FENCE);
        }
    }

    // Write the data back to global memory.

    for (unsigned int index = get_local_id(0); index < length; index += get_local_size(0))
        data[index] = dataBuffer[index];
}
/** /**
* Calculate the minimum and maximum value in the array to be sorted. This kernel * Calculate the minimum and maximum value in the array to be sorted. This kernel
* is executed as a single work group. * is executed as a single work group.
......
...@@ -11,7 +11,7 @@ if (cosangle > 0.99f || cosangle < -0.99f) { ...@@ -11,7 +11,7 @@ if (cosangle > 0.99f || cosangle < -0.99f) {
real4 cross_prod = cross(cp0, cp1); real4 cross_prod = cross(cp0, cp1);
real scale = dot(cp0, cp0)*dot(cp1, cp1); real scale = dot(cp0, cp0)*dot(cp1, cp1);
theta = asin(sqrt(dot(cross_prod, cross_prod)/scale)); theta = asin(SQRT(dot(cross_prod, cross_prod)/scale));
if (cosangle < 0) if (cosangle < 0)
theta = PI-theta; theta = PI-theta;
} }
...@@ -21,7 +21,7 @@ theta = (dot(v0, cp1) >= 0 ? theta : -theta); ...@@ -21,7 +21,7 @@ theta = (dot(v0, cp1) >= 0 ? theta : -theta);
COMPUTE_FORCE COMPUTE_FORCE
real normCross1 = dot(cp0, cp0); real normCross1 = dot(cp0, cp0);
real normSqrBC = dot(v1, v1); real normSqrBC = dot(v1, v1);
real normBC = sqrt(normSqrBC); real normBC = SQRT(normSqrBC);
real normCross2 = dot(cp1, cp1); real normCross2 = dot(cp1, cp1);
real dp = 1.0f/normSqrBC; real dp = 1.0f/normSqrBC;
real4 ff = (real4) ((-dEdAngle*normBC)/normCross1, dot(v0, v1)*dp, dot(v2, v1)*dp, (dEdAngle*normBC)/normCross2); real4 ff = (real4) ((-dEdAngle*normBC)/normCross1, dot(v0, v1)*dp, dot(v2, v1)*dp, (dEdAngle*normBC)/normCross2);
......
...@@ -98,8 +98,8 @@ __kernel void selectVerletStepSize(int numAtoms, mixed maxStepSize, mixed errorT ...@@ -98,8 +98,8 @@ __kernel void selectVerletStepSize(int numAtoms, mixed maxStepSize, mixed errorT
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
} }
if (get_local_id(0) == 0) { if (get_local_id(0) == 0) {
mixed totalError = sqrt(error[0]/(numAtoms*3)); mixed totalError = SQRT(error[0]/(numAtoms*3));
mixed newStepSize = sqrt(errorTol/totalError); mixed newStepSize = SQRT(errorTol/totalError);
mixed oldStepSize = dt[0].y; mixed oldStepSize = dt[0].y;
if (oldStepSize > 0.0f) if (oldStepSize > 0.0f)
newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase. newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase.
......
...@@ -26,9 +26,16 @@ void storePos(__global real4* restrict posq, __global real4* restrict posqCorrec ...@@ -26,9 +26,16 @@ void storePos(__global real4* restrict posq, __global real4* restrict posqCorrec
/** /**
* Compute the positions of virtual sites * Compute the positions of virtual sites
*/ */
__kernel void computeVirtualSites(__global real4* restrict posq, __global real4* restrict posqCorrection, __global const int4* restrict avg2Atoms, __kernel void computeVirtualSites(__global real4* restrict posq,
__global const real2* restrict avg2Weights, __global const int4* restrict avg3Atoms, __global const real4* restrict avg3Weights, #ifdef USE_MIXED_PRECISION
__global real4* restrict posqCorrection,
#endif
__global const int4* restrict avg2Atoms, __global const real2* restrict avg2Weights,
__global const int4* restrict avg3Atoms, __global const real4* restrict avg3Weights,
__global const int4* restrict outOfPlaneAtoms, __global const real4* restrict outOfPlaneWeights) { __global const int4* restrict outOfPlaneAtoms, __global const real4* restrict outOfPlaneWeights) {
#ifndef USE_MIXED_PRECISION
__global real4* posqCorrection = 0;
#endif
// Two particle average sites. // Two particle average sites.
...@@ -74,11 +81,17 @@ __kernel void computeVirtualSites(__global real4* restrict posq, __global real4* ...@@ -74,11 +81,17 @@ __kernel void computeVirtualSites(__global real4* restrict posq, __global real4*
/** /**
* Distribute forces from virtual sites to the atoms they are based on. * Distribute forces from virtual sites to the atoms they are based on.
*/ */
__kernel void distributeForces(__global const real4* restrict posq, __global real4* restrict posqCorrection, __global real4* restrict force, __kernel void distributeForces(__global const real4* restrict posq, __global real4* restrict force,
#ifdef USE_MIXED_PRECISION
__global real4* restrict posqCorrection,
#endif
__global const int4* restrict avg2Atoms, __global const real2* restrict avg2Weights, __global const int4* restrict avg2Atoms, __global const real2* restrict avg2Weights,
__global const int4* restrict avg3Atoms, __global const real4* restrict avg3Weights, __global const int4* restrict avg3Atoms, __global const real4* restrict avg3Weights,
__global const int4* restrict outOfPlaneAtoms, __global const real4* restrict outOfPlaneWeights) { __global const int4* restrict outOfPlaneAtoms, __global const real4* restrict outOfPlaneWeights) {
#ifndef USE_MIXED_PRECISION
__global real4* posqCorrection = 0;
#endif
// Two particle average sites. // Two particle average sites.
for (int index = get_global_id(0); index < NUM_2_AVERAGE; index += get_global_size(0)) { for (int index = get_global_id(0); index < NUM_2_AVERAGE; index += get_global_size(0)) {
......
...@@ -59,7 +59,7 @@ void testTransform() { ...@@ -59,7 +59,7 @@ void testTransform() {
context.initialize(); context.initialize();
OpenMM_SFMT::SFMT sfmt; OpenMM_SFMT::SFMT sfmt;
init_gen_rand(0, sfmt); init_gen_rand(0, sfmt);
int xsize = 32, ysize = 25, zsize = 30; int xsize = 28, ysize = 25, zsize = 30;
vector<Real2> original(xsize*ysize*zsize); vector<Real2> original(xsize*ysize*zsize);
vector<t_complex> reference(original.size()); vector<t_complex> reference(original.size());
for (int i = 0; i < (int) original.size(); i++) { for (int i = 0; i < (int) original.size(); i++) {
...@@ -81,8 +81,8 @@ void testTransform() { ...@@ -81,8 +81,8 @@ void testTransform() {
fftpack_init_3d(&plan, xsize, ysize, zsize); fftpack_init_3d(&plan, xsize, ysize, zsize);
fftpack_exec_3d(plan, FFTPACK_FORWARD, &reference[0], &reference[0]); fftpack_exec_3d(plan, FFTPACK_FORWARD, &reference[0], &reference[0]);
for (int i = 0; i < (int) result.size(); ++i) { for (int i = 0; i < (int) result.size(); ++i) {
ASSERT_EQUAL_TOL(reference[i].re, result[i].x, 1e-4); ASSERT_EQUAL_TOL(reference[i].re, result[i].x, 1e-3);
ASSERT_EQUAL_TOL(reference[i].im, result[i].y, 1e-4); ASSERT_EQUAL_TOL(reference[i].im, result[i].y, 1e-3);
} }
fftpack_destroy(plan); fftpack_destroy(plan);
......
...@@ -438,7 +438,7 @@ void testLargeSystem() { ...@@ -438,7 +438,7 @@ void testLargeSystem() {
} }
ASSERT_EQUAL_TOL(clState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol); ASSERT_EQUAL_TOL(clState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol);
} }
/*
void testBlockInteractions(bool periodic) { void testBlockInteractions(bool periodic) {
const int blockSize = 32; const int blockSize = 32;
const int numBlocks = 100; const int numBlocks = 100;
...@@ -619,13 +619,13 @@ void testBlockInteractions(bool periodic) { ...@@ -619,13 +619,13 @@ void testBlockInteractions(bool periodic) {
} }
} }
} }
*/
void testDispersionCorrection() { void testDispersionCorrection() {
// Create a box full of identical particles. // Create a box full of identical particles.
int gridSize = 5; int gridSize = 5;
int numParticles = gridSize*gridSize*gridSize; int numParticles = gridSize*gridSize*gridSize;
double boxSize = gridSize*0.5; double boxSize = gridSize*0.7;
double cutoff = boxSize/3; double cutoff = boxSize/3;
System system; System system;
VerletIntegrator integrator(0.01); VerletIntegrator integrator(0.01);
...@@ -827,8 +827,8 @@ int main(int argc, char* argv[]) { ...@@ -827,8 +827,8 @@ int main(int argc, char* argv[]) {
testCutoff14(); testCutoff14();
testPeriodic(); testPeriodic();
testLargeSystem(); testLargeSystem();
testBlockInteractions(false); // testBlockInteractions(false);
testBlockInteractions(true); // testBlockInteractions(true);
testDispersionCorrection(); testDispersionCorrection();
testChangingParameters(); testChangingParameters();
testParallelComputation(false); testParallelComputation(false);
......
...@@ -48,15 +48,15 @@ using namespace std; ...@@ -48,15 +48,15 @@ using namespace std;
OpenCLPlatform platform; OpenCLPlatform platform;
struct SortTrait { class SortTrait : public OpenCLSort::SortTrait {
typedef cl_float DataType; int getDataSize() const {return 4;}
typedef cl_float KeyType; int getKeySize() const {return 4;}
static const char* clDataType() {return "float";} const char* getDataType() const {return "float";}
static const char* clKeyType() {return "float";} const char* getKeyType() const {return "float";}
static const char* clMinKey() {return "-MAXFLOAT";} const char* getMinKey() const {return "-MAXFLOAT";}
static const char* clMaxKey() {return "MAXFLOAT";} const char* getMaxKey() const {return "MAXFLOAT";}
static const char* clMaxValue() {return "MAXFLOAT";} const char* getMaxValue() const {return "MAXFLOAT";}
static const char* clSortKey() {return "value";} const char* getSortKey() const {return "value";}
}; };
void verifySorting(vector<float> array) { void verifySorting(vector<float> array) {
...@@ -69,7 +69,7 @@ void verifySorting(vector<float> array) { ...@@ -69,7 +69,7 @@ void verifySorting(vector<float> array) {
context.initialize(); context.initialize();
OpenCLArray data(context, array.size(), sizeof(float), "sortData"); OpenCLArray data(context, array.size(), sizeof(float), "sortData");
data.upload(array); data.upload(array);
OpenCLSort<SortTrait> sort(context, array.size()); OpenCLSort sort(context, new SortTrait(), array.size());
sort.sort(data); sort.sort(data);
vector<float> sorted; vector<float> sorted;
data.download(sorted); data.download(sorted);
...@@ -86,8 +86,7 @@ void verifySorting(vector<float> array) { ...@@ -86,8 +86,7 @@ void verifySorting(vector<float> array) {
ASSERT(elements1 == elements2); ASSERT(elements1 == elements2);
} }
void testUniformValues() void testUniformValues() {
{
OpenMM_SFMT::SFMT sfmt; OpenMM_SFMT::SFMT sfmt;
init_gen_rand(0, sfmt); init_gen_rand(0, sfmt);
...@@ -97,8 +96,7 @@ void testUniformValues() ...@@ -97,8 +96,7 @@ void testUniformValues()
verifySorting(array); verifySorting(array);
} }
void testLogValues() void testLogValues() {
{
OpenMM_SFMT::SFMT sfmt; OpenMM_SFMT::SFMT sfmt;
init_gen_rand(0, sfmt); init_gen_rand(0, sfmt);
...@@ -108,12 +106,23 @@ void testLogValues() ...@@ -108,12 +106,23 @@ void testLogValues()
verifySorting(array); verifySorting(array);
} }
void testShortList() {
OpenMM_SFMT::SFMT sfmt;
init_gen_rand(0, sfmt);
vector<float> array(500);
for (int i = 0; i < (int) array.size(); i++)
array[i] = (float) log(genrand_real2(sfmt));
verifySorting(array);
}
int main(int argc, char* argv[]) { int main(int argc, char* argv[]) {
try { try {
if (argc > 1) if (argc > 1)
platform.setPropertyDefaultValue("OpenCLPrecision", string(argv[1])); platform.setPropertyDefaultValue("OpenCLPrecision", string(argv[1]));
testUniformValues(); testUniformValues();
testLogValues(); testLogValues();
testShortList();
} }
catch(const exception& e) { catch(const exception& e) {
cout << "exception: " << e.what() << endl; cout << "exception: " << e.what() << endl;
......
...@@ -356,7 +356,7 @@ void testDispersionCorrection() { ...@@ -356,7 +356,7 @@ void testDispersionCorrection() {
int gridSize = 5; int gridSize = 5;
int numParticles = gridSize*gridSize*gridSize; int numParticles = gridSize*gridSize*gridSize;
double boxSize = gridSize*0.5; double boxSize = gridSize*0.7;
double cutoff = boxSize/3; double cutoff = boxSize/3;
ReferencePlatform platform; ReferencePlatform platform;
System system; System system;
......
...@@ -787,8 +787,8 @@ CudaCalcAmoebaMultipoleForceKernel::CudaCalcAmoebaMultipoleForceKernel(std::stri ...@@ -787,8 +787,8 @@ CudaCalcAmoebaMultipoleForceKernel::CudaCalcAmoebaMultipoleForceKernel(std::stri
multipoleParticles(NULL), molecularDipoles(NULL), molecularQuadrupoles(NULL), labFrameDipoles(NULL), labFrameQuadrupoles(NULL), multipoleParticles(NULL), molecularDipoles(NULL), molecularQuadrupoles(NULL), labFrameDipoles(NULL), labFrameQuadrupoles(NULL),
field(NULL), fieldPolar(NULL), inducedField(NULL), inducedFieldPolar(NULL), torque(NULL), dampingAndThole(NULL), field(NULL), fieldPolar(NULL), inducedField(NULL), inducedFieldPolar(NULL), torque(NULL), dampingAndThole(NULL),
inducedDipole(NULL), inducedDipolePolar(NULL), inducedDipoleErrors(NULL), polarizability(NULL), covalentFlags(NULL), polarizationGroupFlags(NULL), inducedDipole(NULL), inducedDipolePolar(NULL), inducedDipoleErrors(NULL), polarizability(NULL), covalentFlags(NULL), polarizationGroupFlags(NULL),
pmeGrid(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeTheta1(NULL), pmeTheta2(NULL), pmeTheta3(NULL), pmeGrid(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeIgrid(NULL), pmePhi(NULL),
pmeIgrid(NULL), pmePhi(NULL), pmePhid(NULL), pmePhip(NULL), pmePhidp(NULL), pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL), gkKernel(NULL) { pmePhid(NULL), pmePhip(NULL), pmePhidp(NULL), pmeAtomGridIndex(NULL), sort(NULL), gkKernel(NULL) {
} }
CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() { CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() {
...@@ -835,12 +835,6 @@ CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() { ...@@ -835,12 +835,6 @@ CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() {
delete pmeBsplineModuliY; delete pmeBsplineModuliY;
if (pmeBsplineModuliZ != NULL) if (pmeBsplineModuliZ != NULL)
delete pmeBsplineModuliZ; delete pmeBsplineModuliZ;
if (pmeTheta1 != NULL)
delete pmeTheta1;
if (pmeTheta2 != NULL)
delete pmeTheta2;
if (pmeTheta3 != NULL)
delete pmeTheta3;
if (pmeIgrid != NULL) if (pmeIgrid != NULL)
delete pmeIgrid; delete pmeIgrid;
if (pmePhi != NULL) if (pmePhi != NULL)
...@@ -851,8 +845,6 @@ CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() { ...@@ -851,8 +845,6 @@ CudaCalcAmoebaMultipoleForceKernel::~CudaCalcAmoebaMultipoleForceKernel() {
delete pmePhip; delete pmePhip;
if (pmePhidp != NULL) if (pmePhidp != NULL)
delete pmePhidp; delete pmePhidp;
if (pmeAtomRange != NULL)
delete pmeAtomRange;
if (pmeAtomGridIndex != NULL) if (pmeAtomGridIndex != NULL)
delete pmeAtomGridIndex; delete pmeAtomGridIndex;
if (sort != NULL) if (sort != NULL)
...@@ -987,6 +979,15 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const ...@@ -987,6 +979,15 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
if (find(atoms12.begin(), atoms12.end(), atoms[j]) == atoms12.end()) if (find(atoms12.begin(), atoms12.end(), atoms[j]) == atoms12.end())
polarizationFlagValues.push_back(make_int2(i, atoms[j])); polarizationFlagValues.push_back(make_int2(i, atoms[j]));
} }
set<pair<int, int> > tilesWithExclusions;
for (int atom1 = 0; atom1 < (int) exclusions.size(); ++atom1) {
int x = atom1/CudaContext::TileSize;
for (int j = 0; j < (int) exclusions[atom1].size(); ++j) {
int atom2 = exclusions[atom1][j];
int y = atom2/CudaContext::TileSize;
tilesWithExclusions.insert(make_pair(max(x, y), min(x, y)));
}
}
// Record other options. // Record other options.
...@@ -1024,6 +1025,14 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const ...@@ -1024,6 +1025,14 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
defines["DIRECT_POLARIZATION"] = ""; defines["DIRECT_POLARIZATION"] = "";
if (useShuffle) if (useShuffle)
defines["USE_SHUFFLE"] = ""; defines["USE_SHUFFLE"] = "";
defines["TILE_SIZE"] = cu.intToString(CudaContext::TileSize);
int numExclusionTiles = tilesWithExclusions.size();
defines["NUM_TILES_WITH_EXCLUSIONS"] = cu.intToString(numExclusionTiles);
int numContexts = cu.getPlatformData().contexts.size();
int startExclusionIndex = cu.getContextIndex()*numExclusionTiles/numContexts;
int endExclusionIndex = (cu.getContextIndex()+1)*numExclusionTiles/numContexts;
defines["FIRST_EXCLUSION_TILE"] = cu.intToString(startExclusionIndex);
defines["LAST_EXCLUSION_TILE"] = cu.intToString(endExclusionIndex);
double alpha = force.getAEwald(); double alpha = force.getAEwald();
int gridSizeX, gridSizeY, gridSizeZ; int gridSizeX, gridSizeY, gridSizeZ;
if (usePME) { if (usePME) {
...@@ -1128,17 +1137,20 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const ...@@ -1128,17 +1137,20 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
if (force.getPolarizationType() == AmoebaMultipoleForce::Direct) if (force.getPolarizationType() == AmoebaMultipoleForce::Direct)
pmeDefines["DIRECT_POLARIZATION"] = ""; pmeDefines["DIRECT_POLARIZATION"] = "";
CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipolePme, pmeDefines); CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaAmoebaKernelSources::multipolePme, pmeDefines);
pmeUpdateBsplinesKernel = cu.getKernel(module, "updateBsplines"); pmeGridIndexKernel = cu.getKernel(module, "findAtomGridIndex");
pmeAtomRangeKernel = cu.getKernel(module, "findAtomRangeForGrid");
pmeZIndexKernel = cu.getKernel(module, "recordZIndex");
pmeSpreadFixedMultipolesKernel = cu.getKernel(module, "gridSpreadFixedMultipoles"); pmeSpreadFixedMultipolesKernel = cu.getKernel(module, "gridSpreadFixedMultipoles");
pmeSpreadInducedDipolesKernel = cu.getKernel(module, "gridSpreadInducedDipoles"); pmeSpreadInducedDipolesKernel = cu.getKernel(module, "gridSpreadInducedDipoles");
pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
pmeConvolutionKernel = cu.getKernel(module, "reciprocalConvolution"); pmeConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
pmeFixedPotentialKernel = cu.getKernel(module, "computeFixedPotentialFromGrid"); pmeFixedPotentialKernel = cu.getKernel(module, "computeFixedPotentialFromGrid");
pmeInducedPotentialKernel = cu.getKernel(module, "computeInducedPotentialFromGrid"); pmeInducedPotentialKernel = cu.getKernel(module, "computeInducedPotentialFromGrid");
pmeFixedForceKernel = cu.getKernel(module, "computeFixedMultipoleForceAndEnergy"); pmeFixedForceKernel = cu.getKernel(module, "computeFixedMultipoleForceAndEnergy");
pmeInducedForceKernel = cu.getKernel(module, "computeInducedDipoleForceAndEnergy"); pmeInducedForceKernel = cu.getKernel(module, "computeInducedDipoleForceAndEnergy");
pmeRecordInducedFieldDipolesKernel = cu.getKernel(module, "recordInducedFieldDipoles"); pmeRecordInducedFieldDipolesKernel = cu.getKernel(module, "recordInducedFieldDipoles");
cuFuncSetCacheConfig(pmeSpreadFixedMultipolesKernel, CU_FUNC_CACHE_PREFER_L1);
cuFuncSetCacheConfig(pmeSpreadInducedDipolesKernel, CU_FUNC_CACHE_PREFER_L1);
cuFuncSetCacheConfig(pmeFixedPotentialKernel, CU_FUNC_CACHE_PREFER_L1);
cuFuncSetCacheConfig(pmeInducedPotentialKernel, CU_FUNC_CACHE_PREFER_L1);
// Create required data structures. // Create required data structures.
...@@ -1148,9 +1160,6 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const ...@@ -1148,9 +1160,6 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
pmeBsplineModuliX = new CudaArray(cu, gridSizeX, elementSize, "pmeBsplineModuliX"); pmeBsplineModuliX = new CudaArray(cu, gridSizeX, elementSize, "pmeBsplineModuliX");
pmeBsplineModuliY = new CudaArray(cu, gridSizeY, elementSize, "pmeBsplineModuliY"); pmeBsplineModuliY = new CudaArray(cu, gridSizeY, elementSize, "pmeBsplineModuliY");
pmeBsplineModuliZ = new CudaArray(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ"); pmeBsplineModuliZ = new CudaArray(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
pmeTheta1 = new CudaArray(cu, PmeOrder*numMultipoles, 4*elementSize, "pmeTheta1");
pmeTheta2 = new CudaArray(cu, PmeOrder*numMultipoles, 4*elementSize, "pmeTheta2");
pmeTheta3 = new CudaArray(cu, PmeOrder*numMultipoles, 4*elementSize, "pmeTheta3");
pmeIgrid = CudaArray::create<int4>(cu, numMultipoles, "pmeIgrid"); pmeIgrid = CudaArray::create<int4>(cu, numMultipoles, "pmeIgrid");
pmePhi = new CudaArray(cu, 20*numMultipoles, elementSize, "pmePhi"); pmePhi = new CudaArray(cu, 20*numMultipoles, elementSize, "pmePhi");
pmePhid = new CudaArray(cu, 10*numMultipoles, elementSize, "pmePhid"); pmePhid = new CudaArray(cu, 10*numMultipoles, elementSize, "pmePhid");
...@@ -1264,6 +1273,7 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const ...@@ -1264,6 +1273,7 @@ void CudaCalcAmoebaMultipoleForceKernel::initialize(const System& system, const
// just so that CudaNonbondedUtilities will build the exclusion flags and maintain the neighbor list. // just so that CudaNonbondedUtilities will build the exclusion flags and maintain the neighbor list.
cu.getNonbondedUtilities().addInteraction(usePME, usePME, true, force.getCutoffDistance(), exclusions, "", force.getForceGroup()); cu.getNonbondedUtilities().addInteraction(usePME, usePME, true, force.getCutoffDistance(), exclusions, "", force.getForceGroup());
cu.getNonbondedUtilities().setUsePadding(false);
cu.addForce(new ForceInfo(force)); cu.addForce(new ForceInfo(force));
} }
...@@ -1272,11 +1282,14 @@ void CudaCalcAmoebaMultipoleForceKernel::initializeScaleFactors() { ...@@ -1272,11 +1282,14 @@ void CudaCalcAmoebaMultipoleForceKernel::initializeScaleFactors() {
CudaNonbondedUtilities& nb = cu.getNonbondedUtilities(); CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
// Figure out the covalent flag values to use for each atom pair. // Figure out the covalent flag values to use for each atom pair.
vector<unsigned int> exclusionIndices; vector<ushort2> exclusionTiles;
vector<unsigned int> exclusionRowIndices; nb.getExclusionTiles().download(exclusionTiles);
nb.getExclusionIndices().download(exclusionIndices); map<pair<int, int>, int> exclusionTileMap;
nb.getExclusionRowIndices().download(exclusionRowIndices); for (int i = 0; i < (int) exclusionTiles.size(); i++) {
ushort2 tile = exclusionTiles[i];
exclusionTileMap[make_pair(tile.x, tile.y)] = i;
}
covalentFlags = CudaArray::create<uint2>(cu, nb.getExclusions().getSize(), "covalentFlags"); covalentFlags = CudaArray::create<uint2>(cu, nb.getExclusions().getSize(), "covalentFlags");
vector<uint2> covalentFlagsVec(nb.getExclusions().getSize(), make_uint2(0, 0)); vector<uint2> covalentFlagsVec(nb.getExclusions().getSize(), make_uint2(0, 0));
for (int i = 0; i < (int) covalentFlagValues.size(); i++) { for (int i = 0; i < (int) covalentFlagValues.size(); i++) {
...@@ -1290,19 +1303,19 @@ void CudaCalcAmoebaMultipoleForceKernel::initializeScaleFactors() { ...@@ -1290,19 +1303,19 @@ void CudaCalcAmoebaMultipoleForceKernel::initializeScaleFactors() {
int f1 = (value == 0 || value == 1 ? 1 : 0); int f1 = (value == 0 || value == 1 ? 1 : 0);
int f2 = (value == 0 || value == 2 ? 1 : 0); int f2 = (value == 0 || value == 2 ? 1 : 0);
if (x == y) { if (x == y) {
int index = CudaNonbondedUtilities::findExclusionIndex(x, y, exclusionIndices, exclusionRowIndices); int index = exclusionTileMap[make_pair(x, y)]*CudaContext::TileSize;
covalentFlagsVec[index+offset1].x |= f1<<offset2; covalentFlagsVec[index+offset1].x |= f1<<offset2;
covalentFlagsVec[index+offset1].y |= f2<<offset2; covalentFlagsVec[index+offset1].y |= f2<<offset2;
covalentFlagsVec[index+offset2].x |= f1<<offset1; covalentFlagsVec[index+offset2].x |= f1<<offset1;
covalentFlagsVec[index+offset2].y |= f2<<offset1; covalentFlagsVec[index+offset2].y |= f2<<offset1;
} }
else if (x > y) { else if (x > y) {
int index = CudaNonbondedUtilities::findExclusionIndex(x, y, exclusionIndices, exclusionRowIndices); int index = exclusionTileMap[make_pair(x, y)]*CudaContext::TileSize;
covalentFlagsVec[index+offset1].x |= f1<<offset2; covalentFlagsVec[index+offset1].x |= f1<<offset2;
covalentFlagsVec[index+offset1].y |= f2<<offset2; covalentFlagsVec[index+offset1].y |= f2<<offset2;
} }
else { else {
int index = CudaNonbondedUtilities::findExclusionIndex(y, x, exclusionIndices, exclusionRowIndices); int index = exclusionTileMap[make_pair(y, x)]*CudaContext::TileSize;
covalentFlagsVec[index+offset2].x |= f1<<offset1; covalentFlagsVec[index+offset2].x |= f1<<offset1;
covalentFlagsVec[index+offset2].y |= f2<<offset1; covalentFlagsVec[index+offset2].y |= f2<<offset1;
} }
...@@ -1321,16 +1334,16 @@ void CudaCalcAmoebaMultipoleForceKernel::initializeScaleFactors() { ...@@ -1321,16 +1334,16 @@ void CudaCalcAmoebaMultipoleForceKernel::initializeScaleFactors() {
int y = atom2/CudaContext::TileSize; int y = atom2/CudaContext::TileSize;
int offset2 = atom2-y*CudaContext::TileSize; int offset2 = atom2-y*CudaContext::TileSize;
if (x == y) { if (x == y) {
int index = CudaNonbondedUtilities::findExclusionIndex(x, y, exclusionIndices, exclusionRowIndices); int index = exclusionTileMap[make_pair(x, y)]*CudaContext::TileSize;
polarizationGroupFlagsVec[index+offset1] |= 1<<offset2; polarizationGroupFlagsVec[index+offset1] |= 1<<offset2;
polarizationGroupFlagsVec[index+offset2] |= 1<<offset1; polarizationGroupFlagsVec[index+offset2] |= 1<<offset1;
} }
else if (x > y) { else if (x > y) {
int index = CudaNonbondedUtilities::findExclusionIndex(x, y, exclusionIndices, exclusionRowIndices); int index = exclusionTileMap[make_pair(x, y)]*CudaContext::TileSize;
polarizationGroupFlagsVec[index+offset1] |= 1<<offset2; polarizationGroupFlagsVec[index+offset1] |= 1<<offset2;
} }
else { else {
int index = CudaNonbondedUtilities::findExclusionIndex(y, x, exclusionIndices, exclusionRowIndices); int index = exclusionTileMap[make_pair(y, x)]*CudaContext::TileSize;
polarizationGroupFlagsVec[index+offset2] |= 1<<offset1; polarizationGroupFlagsVec[index+offset2] |= 1<<offset1;
} }
} }
...@@ -1364,8 +1377,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1364,8 +1377,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
if (gkKernel == NULL) { if (gkKernel == NULL) {
void* computeFixedFieldArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(), &cu.getPosq().getDevicePointer(), void* computeFixedFieldArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(), &cu.getPosq().getDevicePointer(),
&nb.getExclusionIndices().getDevicePointer(), &nb.getExclusionRowIndices().getDevicePointer(), &covalentFlags->getDevicePointer(), &polarizationGroupFlags->getDevicePointer(), &nb.getExclusionTiles().getDevicePointer(), &startTileIndex, &numTileIndices,
&covalentFlags->getDevicePointer(), &polarizationGroupFlags->getDevicePointer(), &startTileIndex, &numTileIndices,
&labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &dampingAndThole->getDevicePointer()}; &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &dampingAndThole->getDevicePointer()};
cu.executeKernel(computeFixedFieldKernel, computeFixedFieldArgs, numForceThreadBlocks*fixedFieldThreads, fixedFieldThreads); cu.executeKernel(computeFixedFieldKernel, computeFixedFieldArgs, numForceThreadBlocks*fixedFieldThreads, fixedFieldThreads);
void* recordInducedDipolesArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(), void* recordInducedDipolesArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(),
...@@ -1375,8 +1387,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1375,8 +1387,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
else { else {
gkKernel->computeBornRadii(); gkKernel->computeBornRadii();
void* computeFixedFieldArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(), &cu.getPosq().getDevicePointer(), void* computeFixedFieldArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(), &cu.getPosq().getDevicePointer(),
&nb.getExclusionIndices().getDevicePointer(), &nb.getExclusionRowIndices().getDevicePointer(), &covalentFlags->getDevicePointer(), &polarizationGroupFlags->getDevicePointer(), &nb.getExclusionTiles().getDevicePointer(), &startTileIndex, &numTileIndices,
&covalentFlags->getDevicePointer(), &polarizationGroupFlags->getDevicePointer(), &startTileIndex, &numTileIndices,
&gkKernel->getBornRadii()->getDevicePointer(), &gkKernel->getField()->getDevicePointer(), &gkKernel->getBornRadii()->getDevicePointer(), &gkKernel->getField()->getDevicePointer(),
&labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &dampingAndThole->getDevicePointer()}; &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &dampingAndThole->getDevicePointer()};
cu.executeKernel(computeFixedFieldKernel, computeFixedFieldArgs, numForceThreadBlocks*fixedFieldThreads, fixedFieldThreads); cu.executeKernel(computeFixedFieldKernel, computeFixedFieldArgs, numForceThreadBlocks*fixedFieldThreads, fixedFieldThreads);
...@@ -1395,7 +1406,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1395,7 +1406,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
cu.clearBuffer(*inducedFieldPolar); cu.clearBuffer(*inducedFieldPolar);
if (gkKernel == NULL) { if (gkKernel == NULL) {
void* computeInducedFieldArgs[] = {&inducedField->getDevicePointer(), &inducedFieldPolar->getDevicePointer(), &cu.getPosq().getDevicePointer(), void* computeInducedFieldArgs[] = {&inducedField->getDevicePointer(), &inducedFieldPolar->getDevicePointer(), &cu.getPosq().getDevicePointer(),
&inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &startTileIndex, &numTileIndices, &nb.getExclusionTiles().getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &startTileIndex, &numTileIndices,
&dampingAndThole->getDevicePointer()}; &dampingAndThole->getDevicePointer()};
cu.executeKernel(computeInducedFieldKernel, computeInducedFieldArgs, numForceThreadBlocks*inducedFieldThreads, inducedFieldThreads); cu.executeKernel(computeInducedFieldKernel, computeInducedFieldArgs, numForceThreadBlocks*inducedFieldThreads, inducedFieldThreads);
} }
...@@ -1403,7 +1414,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1403,7 +1414,7 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
cu.clearBuffer(*gkKernel->getInducedField()); cu.clearBuffer(*gkKernel->getInducedField());
cu.clearBuffer(*gkKernel->getInducedFieldPolar()); cu.clearBuffer(*gkKernel->getInducedFieldPolar());
void* computeInducedFieldArgs[] = {&inducedField->getDevicePointer(), &inducedFieldPolar->getDevicePointer(), &cu.getPosq().getDevicePointer(), void* computeInducedFieldArgs[] = {&inducedField->getDevicePointer(), &inducedFieldPolar->getDevicePointer(), &cu.getPosq().getDevicePointer(),
&inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &startTileIndex, &numTileIndices, &nb.getExclusionTiles().getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &startTileIndex, &numTileIndices,
&gkKernel->getInducedField()->getDevicePointer(), &gkKernel->getInducedFieldPolar()->getDevicePointer(), &gkKernel->getInducedField()->getDevicePointer(), &gkKernel->getInducedFieldPolar()->getDevicePointer(),
&gkKernel->getInducedDipoles()->getDevicePointer(), &gkKernel->getInducedDipolesPolar()->getDevicePointer(), &gkKernel->getInducedDipoles()->getDevicePointer(), &gkKernel->getInducedDipolesPolar()->getDevicePointer(),
&gkKernel->getBornRadii()->getDevicePointer(), &dampingAndThole->getDevicePointer()}; &gkKernel->getBornRadii()->getDevicePointer(), &dampingAndThole->getDevicePointer()};
...@@ -1431,8 +1442,8 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1431,8 +1442,8 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
// Compute electrostatic force. // Compute electrostatic force.
void* electrostaticsArgs[] = {&cu.getForce().getDevicePointer(), &torque->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(), void* electrostaticsArgs[] = {&cu.getForce().getDevicePointer(), &torque->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
&cu.getPosq().getDevicePointer(), &nb.getExclusionIndices().getDevicePointer(), &nb.getExclusionRowIndices().getDevicePointer(), &cu.getPosq().getDevicePointer(), &covalentFlags->getDevicePointer(), &polarizationGroupFlags->getDevicePointer(),
&covalentFlags->getDevicePointer(), &polarizationGroupFlags->getDevicePointer(), &startTileIndex, &numTileIndices, &nb.getExclusionTiles().getDevicePointer(), &startTileIndex, &numTileIndices,
&labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &inducedDipole->getDevicePointer(), &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &inducedDipole->getDevicePointer(),
&inducedDipolePolar->getDevicePointer(), &dampingAndThole->getDevicePointer()}; &inducedDipolePolar->getDevicePointer(), &dampingAndThole->getDevicePointer()};
cu.executeKernel(electrostaticsKernel, electrostaticsArgs, numForceThreadBlocks*electrostaticsThreads, electrostaticsThreads); cu.executeKernel(electrostaticsKernel, electrostaticsArgs, numForceThreadBlocks*electrostaticsThreads, electrostaticsThreads);
...@@ -1443,20 +1454,16 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1443,20 +1454,16 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
// Reciprocal space calculation. // Reciprocal space calculation.
unsigned int maxTiles = nb.getInteractingTiles().getSize(); unsigned int maxTiles = nb.getInteractingTiles().getSize();
void* pmeUpdateBsplinesArgs[] = {&cu.getPosq().getDevicePointer(), &pmeIgrid->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(),
&pmeTheta1->getDevicePointer(), &pmeTheta2->getDevicePointer(), &pmeTheta3->getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
cu.getInvPeriodicBoxSizePointer()}; cu.executeKernel(pmeGridIndexKernel, gridIndexArgs, cu.getNumAtoms(), cu.ThreadBlockSize, cu.ThreadBlockSize*PmeOrder*PmeOrder*elementSize);
cu.executeKernel(pmeUpdateBsplinesKernel, pmeUpdateBsplinesArgs, cu.getNumAtoms(), cu.ThreadBlockSize, cu.ThreadBlockSize*PmeOrder*PmeOrder*elementSize);
sort->sort(*pmeAtomGridIndex); sort->sort(*pmeAtomGridIndex);
void* pmeAtomRangeArgs[] = {&pmeAtomGridIndex->getDevicePointer(), &pmeAtomRange->getDevicePointer(),
&cu.getPosq().getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
cu.executeKernel(pmeAtomRangeKernel, pmeAtomRangeArgs, cu.getNumAtoms());
void* pmeZIndexArgs[] = {&pmeAtomGridIndex->getDevicePointer(), &cu.getPosq().getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
cu.executeKernel(pmeZIndexKernel, pmeZIndexArgs, cu.getNumAtoms());
void* pmeSpreadFixedMultipolesArgs[] = {&cu.getPosq().getDevicePointer(), &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), void* pmeSpreadFixedMultipolesArgs[] = {&cu.getPosq().getDevicePointer(), &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(),
&pmeGrid->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), &pmeAtomRange->getDevicePointer(), &pmeGrid->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
&pmeTheta1->getDevicePointer(), &pmeTheta2->getDevicePointer(), &pmeTheta3->getDevicePointer(), cu.getInvPeriodicBoxSizePointer()};
cu.executeKernel(pmeSpreadFixedMultipolesKernel, pmeSpreadFixedMultipolesArgs, cu.getNumAtoms()); cu.executeKernel(pmeSpreadFixedMultipolesKernel, pmeSpreadFixedMultipolesArgs, cu.getNumAtoms());
void* finishSpreadArgs[] = {&pmeGrid->getDevicePointer()};
if (cu.getUseDoublePrecision())
cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, pmeGrid->getSize());
if (cu.getUseDoublePrecision()) if (cu.getUseDoublePrecision())
cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD); cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD);
else else
...@@ -1469,8 +1476,8 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1469,8 +1476,8 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
else else
cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE); cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE);
void* pmeFixedPotentialArgs[] = {&pmeGrid->getDevicePointer(), &pmePhi->getDevicePointer(), &field->getDevicePointer(), void* pmeFixedPotentialArgs[] = {&pmeGrid->getDevicePointer(), &pmePhi->getDevicePointer(), &field->getDevicePointer(),
&fieldPolar ->getDevicePointer(), &pmeIgrid->getDevicePointer(), &pmeTheta1->getDevicePointer(), &pmeTheta2->getDevicePointer(), &fieldPolar ->getDevicePointer(), &cu.getPosq().getDevicePointer(), &labFrameDipoles->getDevicePointer(),
&pmeTheta3->getDevicePointer(), &labFrameDipoles->getDevicePointer(), cu.getInvPeriodicBoxSizePointer()}; cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer(), &pmeAtomGridIndex->getDevicePointer()};
cu.executeKernel(pmeFixedPotentialKernel, pmeFixedPotentialArgs, cu.getNumAtoms()); cu.executeKernel(pmeFixedPotentialKernel, pmeFixedPotentialArgs, cu.getNumAtoms());
void* pmeFixedForceArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &torque->getDevicePointer(), void* pmeFixedForceArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &torque->getDevicePointer(),
&cu.getEnergyBuffer().getDevicePointer(), &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(), &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(),
...@@ -1480,10 +1487,9 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1480,10 +1487,9 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
// Direct space calculation. // Direct space calculation.
void* computeFixedFieldArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(), &cu.getPosq().getDevicePointer(), void* computeFixedFieldArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(), &cu.getPosq().getDevicePointer(),
&nb.getExclusionIndices().getDevicePointer(), &nb.getExclusionRowIndices().getDevicePointer(), &covalentFlags->getDevicePointer(), &polarizationGroupFlags->getDevicePointer(), &nb.getExclusionTiles().getDevicePointer(), &startTileIndex, &numTileIndices,
&covalentFlags->getDevicePointer(), &polarizationGroupFlags->getDevicePointer(), &startTileIndex, &numTileIndices,
&nb.getInteractingTiles().getDevicePointer(), &nb.getInteractionCount().getDevicePointer(), cu.getPeriodicBoxSizePointer(), &nb.getInteractingTiles().getDevicePointer(), &nb.getInteractionCount().getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), &maxTiles, &nb.getInteractionFlags().getDevicePointer(), cu.getInvPeriodicBoxSizePointer(), &maxTiles, &nb.getBlockCenters().getDevicePointer(), &nb.getInteractingAtoms().getDevicePointer(),
&labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &dampingAndThole->getDevicePointer()}; &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &dampingAndThole->getDevicePointer()};
cu.executeKernel(computeFixedFieldKernel, computeFixedFieldArgs, numForceThreadBlocks*fixedFieldThreads, fixedFieldThreads); cu.executeKernel(computeFixedFieldKernel, computeFixedFieldArgs, numForceThreadBlocks*fixedFieldThreads, fixedFieldThreads);
void* recordInducedDipolesArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(), void* recordInducedDipolesArgs[] = {&field->getDevicePointer(), &fieldPolar->getDevicePointer(),
...@@ -1492,10 +1498,12 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1492,10 +1498,12 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
// Reciprocal space calculation for the induced dipoles. // Reciprocal space calculation for the induced dipoles.
cu.clearBuffer(*pmeGrid);
void* pmeSpreadInducedDipolesArgs[] = {&cu.getPosq().getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), void* pmeSpreadInducedDipolesArgs[] = {&cu.getPosq().getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(),
&pmeGrid->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), &pmeAtomRange->getDevicePointer(), &pmeGrid->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
&pmeTheta1->getDevicePointer(), &pmeTheta2->getDevicePointer(), &pmeTheta3->getDevicePointer(), cu.getInvPeriodicBoxSizePointer()};
cu.executeKernel(pmeSpreadInducedDipolesKernel, pmeSpreadInducedDipolesArgs, cu.getNumAtoms()); cu.executeKernel(pmeSpreadInducedDipolesKernel, pmeSpreadInducedDipolesArgs, cu.getNumAtoms());
if (cu.getUseDoublePrecision())
cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, pmeGrid->getSize());
if (cu.getUseDoublePrecision()) if (cu.getUseDoublePrecision())
cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD); cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD);
else else
...@@ -1506,8 +1514,8 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1506,8 +1514,8 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
else else
cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE); cufftExecC2C(fft, (float2*) pmeGrid->getDevicePointer(), (float2*) pmeGrid->getDevicePointer(), CUFFT_INVERSE);
void* pmeInducedPotentialArgs[] = {&pmeGrid->getDevicePointer(), &pmePhid->getDevicePointer(), &pmePhip->getDevicePointer(), void* pmeInducedPotentialArgs[] = {&pmeGrid->getDevicePointer(), &pmePhid->getDevicePointer(), &pmePhip->getDevicePointer(),
&pmePhidp->getDevicePointer(), &pmeIgrid->getDevicePointer(), &pmeTheta1->getDevicePointer(), &pmeTheta2->getDevicePointer(), &pmePhidp->getDevicePointer(), &cu.getPosq().getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer(),
&pmeTheta3->getDevicePointer(), cu.getInvPeriodicBoxSizePointer()}; &pmeAtomGridIndex->getDevicePointer()};
cu.executeKernel(pmeInducedPotentialKernel, pmeInducedPotentialArgs, cu.getNumAtoms()); cu.executeKernel(pmeInducedPotentialKernel, pmeInducedPotentialArgs, cu.getNumAtoms());
// Iterate until the dipoles converge. // Iterate until the dipoles converge.
...@@ -1517,12 +1525,15 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1517,12 +1525,15 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
cu.clearBuffer(*inducedField); cu.clearBuffer(*inducedField);
cu.clearBuffer(*inducedFieldPolar); cu.clearBuffer(*inducedFieldPolar);
void* computeInducedFieldArgs[] = {&inducedField->getDevicePointer(), &inducedFieldPolar->getDevicePointer(), &cu.getPosq().getDevicePointer(), void* computeInducedFieldArgs[] = {&inducedField->getDevicePointer(), &inducedFieldPolar->getDevicePointer(), &cu.getPosq().getDevicePointer(),
&inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &startTileIndex, &numTileIndices, &nb.getExclusionTiles().getDevicePointer(), &inducedDipole->getDevicePointer(), &inducedDipolePolar->getDevicePointer(), &startTileIndex, &numTileIndices,
&nb.getInteractingTiles().getDevicePointer(), &nb.getInteractionCount().getDevicePointer(), cu.getPeriodicBoxSizePointer(), &nb.getInteractingTiles().getDevicePointer(), &nb.getInteractionCount().getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), &maxTiles, &nb.getInteractionFlags().getDevicePointer(), cu.getInvPeriodicBoxSizePointer(), &maxTiles, &nb.getBlockCenters().getDevicePointer(), &nb.getInteractingAtoms().getDevicePointer(),
&dampingAndThole->getDevicePointer()}; &dampingAndThole->getDevicePointer()};
cu.executeKernel(computeInducedFieldKernel, computeInducedFieldArgs, numForceThreadBlocks*inducedFieldThreads, inducedFieldThreads); cu.executeKernel(computeInducedFieldKernel, computeInducedFieldArgs, numForceThreadBlocks*inducedFieldThreads, inducedFieldThreads);
cu.clearBuffer(*pmeGrid);
cu.executeKernel(pmeSpreadInducedDipolesKernel, pmeSpreadInducedDipolesArgs, cu.getNumAtoms()); cu.executeKernel(pmeSpreadInducedDipolesKernel, pmeSpreadInducedDipolesArgs, cu.getNumAtoms());
if (cu.getUseDoublePrecision())
cu.executeKernel(pmeFinishSpreadChargeKernel, finishSpreadArgs, pmeGrid->getSize());
if (cu.getUseDoublePrecision()) if (cu.getUseDoublePrecision())
cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD); cufftExecZ2Z(fft, (double2*) pmeGrid->getDevicePointer(), (double2*) pmeGrid->getDevicePointer(), CUFFT_FORWARD);
else else
...@@ -1553,10 +1564,10 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in ...@@ -1553,10 +1564,10 @@ double CudaCalcAmoebaMultipoleForceKernel::execute(ContextImpl& context, bool in
// Compute electrostatic force. // Compute electrostatic force.
void* electrostaticsArgs[] = {&cu.getForce().getDevicePointer(), &torque->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(), void* electrostaticsArgs[] = {&cu.getForce().getDevicePointer(), &torque->getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
&cu.getPosq().getDevicePointer(), &nb.getExclusionIndices().getDevicePointer(), &nb.getExclusionRowIndices().getDevicePointer(), &cu.getPosq().getDevicePointer(), &covalentFlags->getDevicePointer(), &polarizationGroupFlags->getDevicePointer(),
&covalentFlags->getDevicePointer(), &polarizationGroupFlags->getDevicePointer(), &startTileIndex, &numTileIndices, &nb.getExclusionTiles().getDevicePointer(), &startTileIndex, &numTileIndices,
&nb.getInteractingTiles().getDevicePointer(), &nb.getInteractionCount().getDevicePointer(), &nb.getInteractingTiles().getDevicePointer(), &nb.getInteractionCount().getDevicePointer(),
cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer(), &maxTiles, &nb.getInteractionFlags().getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer(), &maxTiles, &nb.getBlockCenters().getDevicePointer(), &nb.getInteractingAtoms().getDevicePointer(),
&labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &inducedDipole->getDevicePointer(), &labFrameDipoles->getDevicePointer(), &labFrameQuadrupoles->getDevicePointer(), &inducedDipole->getDevicePointer(),
&inducedDipolePolar->getDevicePointer(), &dampingAndThole->getDevicePointer()}; &inducedDipolePolar->getDevicePointer(), &dampingAndThole->getDevicePointer()};
cu.executeKernel(electrostaticsKernel, electrostaticsArgs, numForceThreadBlocks*electrostaticsThreads, electrostaticsThreads); cu.executeKernel(electrostaticsKernel, electrostaticsArgs, numForceThreadBlocks*electrostaticsThreads, electrostaticsThreads);
...@@ -1811,7 +1822,7 @@ private: ...@@ -1811,7 +1822,7 @@ private:
}; };
CudaCalcAmoebaGeneralizedKirkwoodForceKernel::CudaCalcAmoebaGeneralizedKirkwoodForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CudaCalcAmoebaGeneralizedKirkwoodForceKernel::CudaCalcAmoebaGeneralizedKirkwoodForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) :
CalcAmoebaGeneralizedKirkwoodForceKernel(name, platform), cu(cu), system(system), params(NULL), bornRadii(NULL), field(NULL), CalcAmoebaGeneralizedKirkwoodForceKernel(name, platform), cu(cu), system(system), hasInitializedKernels(false), params(NULL), bornRadii(NULL), field(NULL),
inducedField(NULL), inducedFieldPolar(NULL), inducedDipoleS(NULL), inducedDipolePolarS(NULL), bornSum(NULL), bornForce(NULL) { inducedField(NULL), inducedFieldPolar(NULL), inducedDipoleS(NULL), inducedDipolePolarS(NULL), bornSum(NULL), bornForce(NULL) {
} }
...@@ -1892,9 +1903,8 @@ void CudaCalcAmoebaGeneralizedKirkwoodForceKernel::initialize(const System& syst ...@@ -1892,9 +1903,8 @@ void CudaCalcAmoebaGeneralizedKirkwoodForceKernel::initialize(const System& syst
chainRuleThreads = min(maxThreads, cu.computeThreadBlockSize(chainRuleThreadMemory)); chainRuleThreads = min(maxThreads, cu.computeThreadBlockSize(chainRuleThreadMemory));
ediffThreads = min(maxThreads, cu.computeThreadBlockSize(ediffThreadMemory)); ediffThreads = min(maxThreads, cu.computeThreadBlockSize(ediffThreadMemory));
// Create the kernels. // Set preprocessor macros we will use when we create the kernels.
map<string, string> defines;
defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms()); defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = cu.intToString(paddedNumAtoms); defines["PADDED_NUM_ATOMS"] = cu.intToString(paddedNumAtoms);
defines["BORN_SUM_THREAD_BLOCK_SIZE"] = cu.intToString(computeBornSumThreads); defines["BORN_SUM_THREAD_BLOCK_SIZE"] = cu.intToString(computeBornSumThreads);
...@@ -1918,42 +1928,6 @@ void CudaCalcAmoebaGeneralizedKirkwoodForceKernel::initialize(const System& syst ...@@ -1918,42 +1928,6 @@ void CudaCalcAmoebaGeneralizedKirkwoodForceKernel::initialize(const System& syst
defines["PROBE_RADIUS"] = cu.doubleToString(force.getProbeRadius()); defines["PROBE_RADIUS"] = cu.doubleToString(force.getProbeRadius());
defines["DIELECTRIC_OFFSET"] = cu.doubleToString(0.009); defines["DIELECTRIC_OFFSET"] = cu.doubleToString(0.009);
} }
stringstream forceSource;
forceSource << CudaKernelSources::vectorOps;
forceSource << CudaAmoebaKernelSources::amoebaGk;
forceSource << "#define F1\n";
forceSource << CudaAmoebaKernelSources::gkPairForce1;
forceSource << CudaAmoebaKernelSources::gkPairForce2;
forceSource << CudaAmoebaKernelSources::gkEDiffPairForce;
forceSource << "#undef F1\n";
forceSource << "#define F2\n";
forceSource << CudaAmoebaKernelSources::gkPairForce1;
forceSource << CudaAmoebaKernelSources::gkPairForce2;
forceSource << "#undef F2\n";
forceSource << "#define T1\n";
forceSource << CudaAmoebaKernelSources::gkPairForce1;
forceSource << CudaAmoebaKernelSources::gkPairForce2;
forceSource << CudaAmoebaKernelSources::gkEDiffPairForce;
forceSource << "#undef T1\n";
forceSource << "#define T2\n";
forceSource << CudaAmoebaKernelSources::gkPairForce1;
forceSource << CudaAmoebaKernelSources::gkPairForce2;
forceSource << "#undef T2\n";
forceSource << "#define T3\n";
forceSource << CudaAmoebaKernelSources::gkEDiffPairForce;
forceSource << "#undef T3\n";
forceSource << "#define B1\n";
forceSource << "#define B2\n";
forceSource << CudaAmoebaKernelSources::gkPairForce1;
forceSource << CudaAmoebaKernelSources::gkPairForce2;
CUmodule module = cu.createModule(forceSource.str(), defines);
computeBornSumKernel = cu.getKernel(module, "computeBornSum");
reduceBornSumKernel = cu.getKernel(module, "reduceBornSum");
gkForceKernel = cu.getKernel(module, "computeGKForces");
chainRuleKernel = cu.getKernel(module, "computeChainRuleForce");
ediffKernel = cu.getKernel(module, "computeEDiffForce");
if (includeSurfaceArea)
surfaceAreaKernel = cu.getKernel(module, "computeSurfaceAreaForce");
cu.addForce(new ForceInfo(force)); cu.addForce(new ForceInfo(force));
} }
...@@ -1964,6 +1938,55 @@ double CudaCalcAmoebaGeneralizedKirkwoodForceKernel::execute(ContextImpl& contex ...@@ -1964,6 +1938,55 @@ double CudaCalcAmoebaGeneralizedKirkwoodForceKernel::execute(ContextImpl& contex
} }
void CudaCalcAmoebaGeneralizedKirkwoodForceKernel::computeBornRadii() { void CudaCalcAmoebaGeneralizedKirkwoodForceKernel::computeBornRadii() {
if (!hasInitializedKernels) {
    // One-time setup, guarded by hasInitializedKernels: compile the GK module
    // and look up its kernels the first time this method runs.

    // Publish the exclusion-tile count and this context's slice of the
    // exclusion tiles as preprocessor definitions. The slice
    // [FIRST_EXCLUSION_TILE, LAST_EXCLUSION_TILE) is an even split of the tiles
    // by context index over the number of contexts.
    int numExclusionTiles = cu.getNonbondedUtilities().getExclusionTiles().getSize();
    defines["NUM_TILES_WITH_EXCLUSIONS"] = cu.intToString(numExclusionTiles);
    int numContexts = cu.getPlatformData().contexts.size();
    int startExclusionIndex = cu.getContextIndex()*numExclusionTiles/numContexts;
    int endExclusionIndex = (cu.getContextIndex()+1)*numExclusionTiles/numContexts;
    defines["FIRST_EXCLUSION_TILE"] = cu.intToString(startExclusionIndex);
    defines["LAST_EXCLUSION_TILE"] = cu.intToString(endExclusionIndex);

    // Assemble the module source. The pair-force sources are appended several
    // times, each time bracketed by a different macro (F1, F2, T1, T2, T3).
    // NOTE(review): presumably each macro selects a different variant of the
    // generated pair function - confirm against the kernel sources.
    stringstream forceSource;
    forceSource << CudaKernelSources::vectorOps;
    forceSource << CudaAmoebaKernelSources::amoebaGk;
    forceSource << "#define F1\n";
    forceSource << CudaAmoebaKernelSources::gkPairForce1;
    forceSource << CudaAmoebaKernelSources::gkPairForce2;
    forceSource << CudaAmoebaKernelSources::gkEDiffPairForce;
    forceSource << "#undef F1\n";
    forceSource << "#define F2\n";
    forceSource << CudaAmoebaKernelSources::gkPairForce1;
    forceSource << CudaAmoebaKernelSources::gkPairForce2;
    forceSource << "#undef F2\n";
    forceSource << "#define T1\n";
    forceSource << CudaAmoebaKernelSources::gkPairForce1;
    forceSource << CudaAmoebaKernelSources::gkPairForce2;
    forceSource << CudaAmoebaKernelSources::gkEDiffPairForce;
    forceSource << "#undef T1\n";
    forceSource << "#define T2\n";
    forceSource << CudaAmoebaKernelSources::gkPairForce1;
    forceSource << CudaAmoebaKernelSources::gkPairForce2;
    forceSource << "#undef T2\n";
    forceSource << "#define T3\n";
    forceSource << CudaAmoebaKernelSources::gkEDiffPairForce;
    forceSource << "#undef T3\n";
    // B1 and B2 are both defined for the final inclusion and are not undefined
    // afterwards; nothing is appended to the source after this point.
    forceSource << "#define B1\n";
    forceSource << "#define B2\n";
    forceSource << CudaAmoebaKernelSources::gkPairForce1;
    forceSource << CudaAmoebaKernelSources::gkPairForce2;

    // Compile the module and look up each kernel by name.
    CUmodule module = cu.createModule(forceSource.str(), defines);
    computeBornSumKernel = cu.getKernel(module, "computeBornSum");
    reduceBornSumKernel = cu.getKernel(module, "reduceBornSum");
    gkForceKernel = cu.getKernel(module, "computeGKForces");
    chainRuleKernel = cu.getKernel(module, "computeChainRuleForce");
    ediffKernel = cu.getKernel(module, "computeEDiffForce");
    // The surface-area kernel is only looked up when that term is enabled.
    if (includeSurfaceArea)
        surfaceAreaKernel = cu.getKernel(module, "computeSurfaceAreaForce");

    // Set the flag only after every kernel handle has been assigned. If
    // createModule() or getKernel() fails by throwing, the flag stays false and
    // a later call retries initialization instead of launching unassigned
    // kernel handles.
    hasInitializedKernels = true;
}
CudaNonbondedUtilities& nb = cu.getNonbondedUtilities(); CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
int numTiles = nb.getNumTiles(); int numTiles = nb.getNumTiles();
int numForceThreadBlocks = nb.getNumForceThreadBlocks(); int numForceThreadBlocks = nb.getNumForceThreadBlocks();
...@@ -2002,8 +2025,8 @@ void CudaCalcAmoebaGeneralizedKirkwoodForceKernel::finishComputation(CudaArray& ...@@ -2002,8 +2025,8 @@ void CudaCalcAmoebaGeneralizedKirkwoodForceKernel::finishComputation(CudaArray&
&params->getDevicePointer(), &bornRadii->getDevicePointer(), &bornForce->getDevicePointer()}; &params->getDevicePointer(), &bornRadii->getDevicePointer(), &bornForce->getDevicePointer()};
cu.executeKernel(chainRuleKernel, chainRuleArgs, numForceThreadBlocks*chainRuleThreads, chainRuleThreads); cu.executeKernel(chainRuleKernel, chainRuleArgs, numForceThreadBlocks*chainRuleThreads, chainRuleThreads);
void* ediffArgs[] = {&cu.getForce().getDevicePointer(), &torque.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(), void* ediffArgs[] = {&cu.getForce().getDevicePointer(), &torque.getDevicePointer(), &cu.getEnergyBuffer().getDevicePointer(),
&cu.getPosq().getDevicePointer(), &nb.getExclusionIndices().getDevicePointer(), &nb.getExclusionRowIndices().getDevicePointer(), &cu.getPosq().getDevicePointer(), &covalentFlags.getDevicePointer(), &polarizationGroupFlags.getDevicePointer(),
&covalentFlags.getDevicePointer(), &polarizationGroupFlags.getDevicePointer(), &startTileIndex, &numTileIndices, &nb.getExclusionTiles().getDevicePointer(), &startTileIndex, &numTileIndices,
&labFrameDipoles.getDevicePointer(), &labFrameQuadrupoles.getDevicePointer(), &inducedDipole.getDevicePointer(), &labFrameDipoles.getDevicePointer(), &labFrameQuadrupoles.getDevicePointer(), &inducedDipole.getDevicePointer(),
&inducedDipolePolar.getDevicePointer(), &inducedDipoleS->getDevicePointer(), &inducedDipolePolarS->getDevicePointer(), &inducedDipolePolar.getDevicePointer(), &inducedDipoleS->getDevicePointer(), &inducedDipolePolarS->getDevicePointer(),
&dampingAndThole.getDevicePointer()}; &dampingAndThole.getDevicePointer()};
......
...@@ -398,9 +398,6 @@ private: ...@@ -398,9 +398,6 @@ private:
CudaArray* pmeBsplineModuliX; CudaArray* pmeBsplineModuliX;
CudaArray* pmeBsplineModuliY; CudaArray* pmeBsplineModuliY;
CudaArray* pmeBsplineModuliZ; CudaArray* pmeBsplineModuliZ;
CudaArray* pmeTheta1;
CudaArray* pmeTheta2;
CudaArray* pmeTheta3;
CudaArray* pmeIgrid; CudaArray* pmeIgrid;
CudaArray* pmePhi; CudaArray* pmePhi;
CudaArray* pmePhid; CudaArray* pmePhid;
...@@ -411,8 +408,8 @@ private: ...@@ -411,8 +408,8 @@ private:
CudaSort* sort; CudaSort* sort;
cufftHandle fft; cufftHandle fft;
CUfunction computeMomentsKernel, recordInducedDipolesKernel, computeFixedFieldKernel, computeInducedFieldKernel, updateInducedFieldKernel, electrostaticsKernel, mapTorqueKernel; CUfunction computeMomentsKernel, recordInducedDipolesKernel, computeFixedFieldKernel, computeInducedFieldKernel, updateInducedFieldKernel, electrostaticsKernel, mapTorqueKernel;
CUfunction pmeUpdateBsplinesKernel, pmeAtomRangeKernel, pmeZIndexKernel, pmeSpreadFixedMultipolesKernel, pmeSpreadInducedDipolesKernel, pmeConvolutionKernel, pmeFixedPotentialKernel, pmeInducedPotentialKernel; CUfunction pmeGridIndexKernel, pmeSpreadFixedMultipolesKernel, pmeSpreadInducedDipolesKernel, pmeFinishSpreadChargeKernel, pmeConvolutionKernel;
CUfunction pmeFixedForceKernel, pmeInducedForceKernel, pmeRecordInducedFieldDipolesKernel, computePotentialKernel; CUfunction pmeFixedPotentialKernel, pmeInducedPotentialKernel, pmeFixedForceKernel, pmeInducedForceKernel, pmeRecordInducedFieldDipolesKernel, computePotentialKernel;
CudaCalcAmoebaGeneralizedKirkwoodForceKernel* gkKernel; CudaCalcAmoebaGeneralizedKirkwoodForceKernel* gkKernel;
static const int PmeOrder = 5; static const int PmeOrder = 5;
}; };
...@@ -477,8 +474,9 @@ private: ...@@ -477,8 +474,9 @@ private:
class ForceInfo; class ForceInfo;
CudaContext& cu; CudaContext& cu;
System& system; System& system;
bool includeSurfaceArea; bool includeSurfaceArea, hasInitializedKernels;
int computeBornSumThreads, gkForceThreads, chainRuleThreads, ediffThreads; int computeBornSumThreads, gkForceThreads, chainRuleThreads, ediffThreads;
std::map<std::string, std::string> defines;
CudaArray* params; CudaArray* params;
CudaArray* bornSum; CudaArray* bornSum;
CudaArray* bornRadii; CudaArray* bornRadii;
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment