Commit 93c467b2 authored by Peter Eastman's avatar Peter Eastman
Browse files

Merged 5.1Optimizations branch back to trunk

parent f6d4557d
......@@ -95,8 +95,8 @@ extern "C" __global__ void selectLangevinStepSize(mixed maxStepSize, mixed error
if (blockIdx.x*blockDim.x+threadIdx.x == 0) {
// Select the new step size.
mixed totalError = sqrt(error[0]/(NUM_ATOMS*3));
mixed newStepSize = sqrt(errorTol/totalError);
mixed totalError = SQRT(error[0]/(NUM_ATOMS*3));
mixed newStepSize = SQRT(errorTol/totalError);
mixed oldStepSize = dt[0].y;
if (oldStepSize > 0.0f)
newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase.
......@@ -108,9 +108,9 @@ extern "C" __global__ void selectLangevinStepSize(mixed maxStepSize, mixed error
// Recalculate the integration parameters.
mixed vscale = exp(-newStepSize/tau);
mixed vscale = EXP(-newStepSize/tau);
mixed fscale = (1-vscale)*tau;
mixed noisescale = sqrt(2*kT/tau)*sqrt(0.5f*(1-vscale*vscale)*tau);
mixed noisescale = SQRT(2*kT/tau)*SQRT(0.5f*(1-vscale*vscale)*tau);
params[VelScale] = vscale;
params[ForceScale] = fscale;
params[NoiseScale] = noisescale;
......
#define TILE_SIZE 32
#define WARPS_PER_GROUP (THREAD_BLOCK_SIZE/TILE_SIZE)
typedef struct {
......@@ -15,133 +14,245 @@ typedef struct {
* Compute nonbonded interactions.
*/
extern "C" __global__ void computeNonbonded(
unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions,
const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices,
unsigned int startTileIndex, unsigned int numTileIndices
unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, const real4* __restrict__ posq, const tileflags* __restrict__ exclusions,
const ushort2* __restrict__ exclusionTiles, unsigned int startTileIndex, unsigned int numTileIndices
#ifdef USE_CUTOFF
, const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags
, const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const real4* __restrict__ blockCenter, const unsigned int* __restrict__ interactingAtoms
#endif
PARAMETER_ARGUMENTS) {
unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
#ifdef USE_CUTOFF
const unsigned int numTiles = interactionCount[0];
unsigned int pos = (numTiles > maxTiles ? startTileIndex+warp*numTileIndices/totalWarps : warp*numTiles/totalWarps);
unsigned int end = (numTiles > maxTiles ? startTileIndex+(warp+1)*numTileIndices/totalWarps : (warp+1)*numTiles/totalWarps);
#else
const unsigned int numTiles = numTileIndices;
unsigned int pos = startTileIndex+warp*numTiles/totalWarps;
unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps;
#endif
const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx;
real energy = 0.0f;
__shared__ AtomData localData[THREAD_BLOCK_SIZE];
__shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
__shared__ int exclusionIndex[WARPS_PER_GROUP];
#ifndef ENABLE_SHUFFLE
__shared__ real tempBuffer[3*THREAD_BLOCK_SIZE];
#endif
// First loop: process tiles that contain exclusions.
do {
// Extract the coordinates of this tile
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx;
const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
unsigned int x, y;
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
real3 force = make_real3(0);
if (pos < end) {
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
y = tileIndices.y;
}
else
unsigned int atom1 = x*TILE_SIZE + tgx;
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS
tileflags excl = exclusions[pos*TILE_SIZE+tgx];
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
unsigned int atom1 = x*TILE_SIZE + tgx;
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
const bool hasExclusions = true;
if (x == y) {
// This tile is on the diagonal.
const unsigned int localAtomIndex = threadIdx.x;
localData[localAtomIndex].x = posq1.x;
localData[localAtomIndex].y = posq1.y;
localData[localAtomIndex].z = posq1.z;
localData[localAtomIndex].q = posq1.w;
LOAD_LOCAL_PARAMETERS_FROM_1
for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+j;
real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
#ifdef USE_SYMMETRIC
real dEdR = 0.0f;
#else
real3 dEdR1 = make_real3(0);
real3 dEdR2 = make_real3(0);
#endif
#ifdef USE_EXCLUSIONS
if (tgx < 2)
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
#endif
real tempEnergy = 0.0f;
COMPUTE_INTERACTION
energy += 0.5f*tempEnergy;
#ifdef USE_SYMMETRIC
force.x -= delta.x*dEdR;
force.y -= delta.y*dEdR;
force.z -= delta.z*dEdR;
#else
bool hasExclusions = false;
force.x -= dEdR1.x;
force.y -= dEdR1.y;
force.z -= dEdR1.z;
#endif
if (pos >= end)
; // This warp is done.
else if (x == y) {
// This tile is on the diagonal.
const unsigned int localAtomIndex = threadIdx.x;
localData[localAtomIndex].x = posq1.x;
localData[localAtomIndex].y = posq1.y;
localData[localAtomIndex].z = posq1.z;
localData[localAtomIndex].q = posq1.w;
LOAD_LOCAL_PARAMETERS_FROM_1
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
excl >>= 1;
#endif
for (unsigned int j = 0; j < TILE_SIZE; j++) {
}
}
else {
// This is an off-diagonal tile.
const unsigned int localAtomIndex = threadIdx.x;
unsigned int j = y*TILE_SIZE + tgx;
real4 tempPosq = posq[j];
localData[localAtomIndex].x = tempPosq.x;
localData[localAtomIndex].y = tempPosq.y;
localData[localAtomIndex].z = tempPosq.z;
localData[localAtomIndex].q = tempPosq.w;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData[localAtomIndex].fx = 0.0f;
localData[localAtomIndex].fy = 0.0f;
localData[localAtomIndex].fz = 0.0f;
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
#endif
int atom2 = tbx+j;
real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj;
real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
atom2 = y*TILE_SIZE+tj;
#ifdef USE_SYMMETRIC
real dEdR = 0.0f;
#else
real3 dEdR1 = make_real3(0);
real3 dEdR2 = make_real3(0);
#endif
#ifdef USE_EXCLUSIONS
bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
#endif
real tempEnergy = 0.0f;
COMPUTE_INTERACTION
energy += 0.5f*tempEnergy;
energy += tempEnergy;
#ifdef USE_SYMMETRIC
force.x -= delta.x*dEdR;
force.y -= delta.y*dEdR;
force.z -= delta.z*dEdR;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z;
#else
force.x -= dEdR1.x;
force.y -= dEdR1.y;
force.z -= dEdR1.z;
localData[tbx+tj].fx += dEdR2.x;
localData[tbx+tj].fy += dEdR2.y;
localData[tbx+tj].fz += dEdR2.z;
#endif
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
excl >>= 1;
#endif
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
// Write results.
unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
if (x != y) {
offset = y*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
const unsigned int numTiles = interactionCount[0];
int pos = (numTiles > maxTiles ? startTileIndex+warp*numTileIndices/totalWarps : warp*numTiles/totalWarps);
int end = (numTiles > maxTiles ? startTileIndex+(warp+1)*numTileIndices/totalWarps : (warp+1)*numTiles/totalWarps);
#else
const unsigned int numTiles = numTileIndices;
int pos = startTileIndex+warp*numTiles/totalWarps;
int end = startTileIndex+(warp+1)*numTiles/totalWarps;
#endif
int skipBase = 0;
int currentSkipIndex = tbx;
__shared__ int atomIndices[THREAD_BLOCK_SIZE];
__shared__ int skipTiles[THREAD_BLOCK_SIZE];
skipTiles[threadIdx.x] = -1;
while (pos < end) {
const bool hasExclusions = false;
real3 force = make_real3(0);
bool includeTile = true;
// Extract the coordinates of this tile.
unsigned int x, y;
bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
singlePeriodicCopy = tileIndices.y;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
// Skip over tiles that have exclusions, since they were already processed.
while (skipTiles[tbx+TILE_SIZE-1] < pos) {
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx];
skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
skipTiles[threadIdx.x] = end;
skipBase += TILE_SIZE;
currentSkipIndex = tbx;
}
else {
// This is an off-diagonal tile.
while (skipTiles[currentSkipIndex] < pos)
currentSkipIndex++;
includeTile = (skipTiles[currentSkipIndex] != pos);
}
if (includeTile) {
unsigned int atom1 = x*TILE_SIZE + tgx;
const unsigned int localAtomIndex = threadIdx.x;
unsigned int j = y*TILE_SIZE + tgx;
// Load atom data for this tile.
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
const unsigned int localAtomIndex = threadIdx.x;
#ifdef USE_CUTOFF
unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
#else
unsigned int j = y*TILE_SIZE + tgx;
#endif
atomIndices[threadIdx.x] = j;
if (j < PADDED_NUM_ATOMS) {
real4 tempPosq = posq[j];
localData[localAtomIndex].x = tempPosq.x;
localData[localAtomIndex].y = tempPosq.y;
......@@ -151,195 +262,137 @@ extern "C" __global__ void computeNonbonded(
localData[localAtomIndex].fx = 0.0f;
localData[localAtomIndex].fy = 0.0f;
localData[localAtomIndex].fz = 0.0f;
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (!hasExclusions && flags != 0xFFFFFFFF) {
if (flags == 0) {
// No interactions in this tile.
}
else {
// Compute only a subset of the interactions in this tile.
}
#ifdef USE_PERIODIC
if (singlePeriodicCopy) {
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
for (j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
bool isExcluded = false;
int atom2 = tbx+j;
int bufferIndex = 3*threadIdx.x;
real4 blockCenterX = blockCenter[x];
posq1.x -= floor((posq1.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
posq1.y -= floor((posq1.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
posq1.z -= floor((posq1.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
localData[localAtomIndex].x -= floor((localData[localAtomIndex].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
localData[localAtomIndex].y -= floor((localData[localAtomIndex].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
localData[localAtomIndex].z -= floor((localData[localAtomIndex].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj;
real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
if (r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = atomIndices[tbx+tj];
#ifdef USE_SYMMETRIC
real dEdR = 0;
real dEdR = 0.0f;
#else
real3 dEdR1 = make_real3(0);
real3 dEdR2 = make_real3(0);
#endif
real tempEnergy = 0.0f;
real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
real3 dEdR1 = make_real3(0);
real3 dEdR2 = make_real3(0);
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
COMPUTE_INTERACTION
energy += tempEnergy;
#ifdef USE_CUTOFF
}
#ifdef USE_EXCLUSIONS
bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS);
#endif
#ifdef ENABLE_SHUFFLE
#ifdef USE_SYMMETRIC
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
for (int i = 16; i >= 1; i /= 2) {
delta.x += __shfl_xor(delta.x, i, 32);
delta.y += __shfl_xor(delta.y, i, 32);
delta.z += __shfl_xor(delta.z, i, 32);
}
if (tgx == 0) {
localData[tbx+j].fx += delta.x;
localData[tbx+j].fy += delta.y;
localData[tbx+j].fz += delta.z;
}
#else
force.x -= dEdR1.x;
force.y -= dEdR1.y;
force.z -= dEdR1.z;
for (int i = 16; i >= 1; i /= 2) {
dEdR2.x += __shfl_xor(dEdR2.x, i, 32);
dEdR2.y += __shfl_xor(dEdR2.y, i, 32);
dEdR2.z += __shfl_xor(dEdR2.z, i, 32);
}
if (tgx == 0) {
localData[tbx+j].fx += dEdR2.x;
localData[tbx+j].fy += dEdR2.y;
localData[tbx+j].fz += dEdR2.z;
}
#endif
real tempEnergy = 0.0f;
COMPUTE_INTERACTION
energy += tempEnergy;
#ifdef USE_SYMMETRIC
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z;
#else
#ifdef USE_SYMMETRIC
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
tempBuffer[bufferIndex] = delta.x;
tempBuffer[bufferIndex+1] = delta.y;
tempBuffer[bufferIndex+2] = delta.z;
#else
force.x -= dEdR1.x;
force.y -= dEdR1.y;
force.z -= dEdR1.z;
tempBuffer[bufferIndex] = dEdR2.x;
tempBuffer[bufferIndex+1] = dEdR2.y;
tempBuffer[bufferIndex+2] = dEdR2.z;
#endif
// Sum the forces on atom2.
if (tgx % 4 == 0) {
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11];
}
if (tgx == 0) {
localData[tbx+j].fx += tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
localData[tbx+j].fy += tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
localData[tbx+j].fz += tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
}
force.x -= dEdR1.x;
force.y -= dEdR1.y;
force.z -= dEdR1.z;
localData[tbx+tj].fx += dEdR2.x;
localData[tbx+tj].fy += dEdR2.y;
localData[tbx+tj].fz += dEdR2.z;
#endif
}
}
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
else
}
else
#endif
{
// Compute the full set of interactions in this tile.
{
// We need to apply periodic boundary conditions separately for each interaction.
#ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
#endif
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = tbx+tj;
real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj;
real4 posq2 = make_real4(localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+tj;
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = atomIndices[tbx+tj];
#ifdef USE_SYMMETRIC
real dEdR = 0.0f;
real dEdR = 0.0f;
#else
real3 dEdR1 = make_real3(0);
real3 dEdR2 = make_real3(0);
real3 dEdR1 = make_real3(0);
real3 dEdR2 = make_real3(0);
#endif
real tempEnergy = 0.0f;
COMPUTE_INTERACTION
energy += tempEnergy;
#ifdef USE_EXCLUSIONS
bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS);
#endif
real tempEnergy = 0.0f;
COMPUTE_INTERACTION
energy += tempEnergy;
#ifdef USE_SYMMETRIC
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z;
#else
force.x -= dEdR1.x;
force.y -= dEdR1.y;
force.z -= dEdR1.z;
localData[tbx+tj].fx += dEdR2.x;
localData[tbx+tj].fy += dEdR2.y;
localData[tbx+tj].fz += dEdR2.z;
force.x -= dEdR1.x;
force.y -= dEdR1.y;
force.z -= dEdR1.z;
localData[tbx+tj].fx += dEdR2.x;
localData[tbx+tj].fy += dEdR2.y;
localData[tbx+tj].fz += dEdR2.z;
#endif
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
tj = (tj + 1) & (TILE_SIZE - 1);
}
#endif
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
}
// Write results.
if (pos < end) {
const unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
}
if (pos < end && x != y) {
const unsigned int offset = y*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
// Write results.
atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
#ifdef USE_CUTOFF
unsigned int atom2 = atomIndices[threadIdx.x];
#else
unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
if (atom2 < PADDED_NUM_ATOMS) {
atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
}
}
pos++;
} while (pos < end);
}
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
}
extern "C" __global__ void updateBsplines(const real4* __restrict__ posq, real4* __restrict__ pmeBsplineTheta, int2* __restrict__ pmeAtomGridIndex,
extern "C" __global__ void findAtomGridIndex(const real4* __restrict__ posq, int2* __restrict__ pmeAtomGridIndex,
real4 periodicBoxSize, real4 invPeriodicBoxSize) {
extern __shared__ real3 bsplinesCache[];
real3* data = &bsplinesCache[threadIdx.x*PME_ORDER];
const real3 scale = make_real3(RECIP(PME_ORDER-1));
// Compute the index of the grid point each atom is associated with.
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
real4 pos = posq[i];
pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
......@@ -11,11 +10,40 @@ extern "C" __global__ void updateBsplines(const real4* __restrict__ posq, real4*
real3 t = make_real3((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
(pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
(pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z);
real3 dr = make_real3(t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z);
int3 gridIndex = make_int3(((int) t.x) % GRID_SIZE_X,
((int) t.y) % GRID_SIZE_Y,
((int) t.z) % GRID_SIZE_Z);
pmeAtomGridIndex[i] = make_int2(i, gridIndex.x*GRID_SIZE_Y*GRID_SIZE_Z+gridIndex.y*GRID_SIZE_Z+gridIndex.z);
}
}
extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real* __restrict__ originalPmeGrid,
real4 periodicBoxSize, real4 invPeriodicBoxSize, const int2* __restrict__ pmeAtomGridIndex) {
real3 data[PME_ORDER];
const real scale = RECIP(PME_ORDER-1);
// Process the atoms in spatially sorted order. This improves efficiency when writing
// the grid values.
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
int atom = pmeAtomGridIndex[i].x;
real charge = posq[atom].w;
real3 force = make_real3(0);
real4 pos = posq[atom];
pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
real3 t = make_real3((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
(pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
(pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z);
int3 gridIndex = make_int3(((int) t.x) % GRID_SIZE_X,
((int) t.y) % GRID_SIZE_Y,
((int) t.z) % GRID_SIZE_Z);
// Since we need the full set of thetas, it's faster to compute them here than load them
// from global memory.
real3 dr = make_real3(t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z);
data[PME_ORDER-1] = make_real3(0);
data[1] = dr;
data[0] = make_real3(1)-dr;
......@@ -23,98 +51,46 @@ extern "C" __global__ void updateBsplines(const real4* __restrict__ posq, real4*
real div = RECIP(j-1);
data[j-1] = div*dr*data[j-2];
for (int k = 1; k < (j-1); k++)
data[j-k-1] = div*((dr+make_real3(k)) *data[j-k-2] + (make_real3(j-k)-dr)*data[j-k-1]);
data[j-k-1] = div*((dr+make_real3(k))*data[j-k-2] + (make_real3(j-k)-dr)*data[j-k-1]);
data[0] = div*(make_real3(1)-dr)*data[0];
}
data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
for (int j = 1; j < (PME_ORDER-1); j++)
data[PME_ORDER-j-1] = scale*((dr+make_real3(j))*data[PME_ORDER-j-2] + (make_real3(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
data[0] = scale*(make_real3(1)-dr)*data[0];
for (int j = 0; j < PME_ORDER; j++) {
real3 d = data[j]; // Copy it as a workaround for a bug in CUDA 5.0
pmeBsplineTheta[i+j*NUM_ATOMS] = make_real4(d.x, d.y, d.z, pos.w); // Storing the charge here improves cache coherency in the charge spreading kernel
}
}
}
/**
* For each grid point, find the range of sorted atoms associated with that point.
*/
extern "C" __global__ void findAtomRangeForGrid(int2* __restrict__ pmeAtomGridIndex, int* __restrict__ pmeAtomRange, const real4* __restrict__ posq, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
int start = (NUM_ATOMS*(blockIdx.x*blockDim.x+threadIdx.x))/(blockDim.x*gridDim.x);
int end = (NUM_ATOMS*(blockIdx.x*blockDim.x+threadIdx.x+1))/(blockDim.x*gridDim.x);
int last = (start == 0 ? -1 : pmeAtomGridIndex[start-1].y);
for (int i = start; i < end; ++i) {
int2 atomData = pmeAtomGridIndex[i];
int gridIndex = atomData.y;
if (gridIndex != last) {
for (int j = last+1; j <= gridIndex; ++j)
pmeAtomRange[j] = i;
last = gridIndex;
}
}
// Fill in values beyond the last atom.
if (blockIdx.x == gridDim.x-1 && threadIdx.x == blockDim.x-1) {
int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
for (int j = last+1; j <= gridSize; ++j)
pmeAtomRange[j] = NUM_ATOMS;
}
}
#define BUFFER_SIZE (PME_ORDER*PME_ORDER*PME_ORDER)
extern "C" __global__ void gridSpreadCharge(const real4* __restrict__ posq, real* __restrict__ originalPmeGrid,
const real4* __restrict__ pmeBsplineTheta, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
int ix = threadIdx.x/(PME_ORDER*PME_ORDER);
int remainder = threadIdx.x-ix*PME_ORDER*PME_ORDER;
int iy = remainder/PME_ORDER;
int iz = remainder-iy*PME_ORDER;
__shared__ real4 theta[PME_ORDER];
__shared__ real charge[BUFFER_SIZE];
__shared__ int basex[BUFFER_SIZE];
__shared__ int basey[BUFFER_SIZE];
__shared__ int basez[BUFFER_SIZE];
if (ix < PME_ORDER) {
for (int baseIndex = blockIdx.x*BUFFER_SIZE; baseIndex < NUM_ATOMS; baseIndex += gridDim.x*BUFFER_SIZE) {
// Load the next block of atoms into the buffers.
// Spread the charge from this atom onto each grid point.
for (int ix = 0; ix < PME_ORDER; ix++) {
int xbase = gridIndex.x+ix;
xbase -= (xbase >= GRID_SIZE_X ? GRID_SIZE_X : 0);
xbase = xbase*GRID_SIZE_Y*GRID_SIZE_Z;
real dx = data[ix].x;
for (int iy = 0; iy < PME_ORDER; iy++) {
int ybase = gridIndex.y+iy;
ybase -= (ybase >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
ybase = xbase + ybase*GRID_SIZE_Z;
real dy = data[iy].y;
for (int iz = 0; iz < PME_ORDER; iz++) {
int zindex = gridIndex.z+iz;
zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
int index = ybase + zindex;
int atomIndex = baseIndex+threadIdx.x;
if (atomIndex < NUM_ATOMS) {
real4 pos = posq[atomIndex];
charge[threadIdx.x] = pos.w;
pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
basex[threadIdx.x] = (int) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X);
basey[threadIdx.x] = (int) ((pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y);
basez[threadIdx.x] = (int) ((pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z);
}
__syncthreads();
int lastIndex = min(BUFFER_SIZE, NUM_ATOMS-baseIndex);
for (int index = 0; index < lastIndex; index++) {
int atomIndex = index+baseIndex;
if (threadIdx.x < PME_ORDER)
theta[threadIdx.x] = pmeBsplineTheta[atomIndex+threadIdx.x*NUM_ATOMS];
__syncthreads();
real add = charge[index]*theta[ix].x*theta[iy].y*theta[iz].z;
int x = basex[index]+ix;
int y = basey[index]+iy;
int z = basez[index]+iz;
x -= (x >= GRID_SIZE_X ? GRID_SIZE_X : 0);
y -= (y >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
z -= (z >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
real add = charge*dx*dy*data[iz].z;
#ifdef USE_DOUBLE_PRECISION
unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid;
atomicAdd(&ulonglong_p[x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z], static_cast<unsigned long long>((long long) (add*0x100000000)));
unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid;
atomicAdd(&ulonglong_p[index], static_cast<unsigned long long>((long long) (add*0x100000000)));
#elif __CUDA_ARCH__ < 200
unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid;
int gridIndex = x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z;
gridIndex = (gridIndex%2 == 0 ? gridIndex/2 : (gridIndex+GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z)/2);
atomicAdd(&ulonglong_p[gridIndex], static_cast<unsigned long long>((long long) (add*0x100000000)));
unsigned long long * ulonglong_p = (unsigned long long *) originalPmeGrid;
int gridIndex = index;
gridIndex = (gridIndex%2 == 0 ? gridIndex/2 : (gridIndex+GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z)/2);
atomicAdd(&ulonglong_p[gridIndex], static_cast<unsigned long long>((long long) (add*0x100000000)));
#else
atomicAdd(&originalPmeGrid[x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z], add*EPSILON_FACTOR);
atomicAdd(&originalPmeGrid[index], add*EPSILON_FACTOR);
#endif
}
}
}
}
......@@ -182,48 +158,52 @@ gridEvaluateEnergy(real2* __restrict__ halfcomplex_pmeGrid, real* __restrict__ e
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
const real recipScaleFactor = RECIP(M_PI*periodicBoxSize.x*periodicBoxSize.y*periodicBoxSize.z);
real energy = 0;
real energy = 0;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < gridSize; index += blockDim.x*gridDim.x) {
// real indices
int kx = index/(GRID_SIZE_Y*(GRID_SIZE_Z));
int remainder = index-kx*GRID_SIZE_Y*(GRID_SIZE_Z);
int ky = remainder/(GRID_SIZE_Z);
int kz = remainder-ky*(GRID_SIZE_Z);
int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
real mhx = mx*invPeriodicBoxSize.x;
real mhy = my*invPeriodicBoxSize.y;
real mhz = mz*invPeriodicBoxSize.z;
real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
real bx = pmeBsplineModuliX[kx];
real by = pmeBsplineModuliY[ky];
real bz = pmeBsplineModuliZ[kz];
real denom = m2*bx*by*bz;
real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
real mhx = mx*invPeriodicBoxSize.x;
real mhy = my*invPeriodicBoxSize.y;
real mhz = mz*invPeriodicBoxSize.z;
real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
real bx = pmeBsplineModuliX[kx];
real by = pmeBsplineModuliY[ky];
real bz = pmeBsplineModuliZ[kz];
real denom = m2*bx*by*bz;
real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
if(kz >= (GRID_SIZE_Z/2+1)) {
kx = ((kx == 0) ? kx : GRID_SIZE_X-kx);
ky = ((ky == 0) ? ky : GRID_SIZE_Y-ky);
kz = GRID_SIZE_Z-kz;
}
int indexInHalfComplexGrid = kz + ky*(GRID_SIZE_Z/2+1)+kx*(GRID_SIZE_Y*(GRID_SIZE_Z/2+1));
real2 grid = halfcomplex_pmeGrid[indexInHalfComplexGrid];
if (kx != 0 || ky != 0 || kz != 0) {
energy += eterm*(grid.x*grid.x + grid.y*grid.y);
}
if(kz >= (GRID_SIZE_Z/2+1)) {
kx = ((kx == 0) ? kx : GRID_SIZE_X-kx);
ky = ((ky == 0) ? ky : GRID_SIZE_Y-ky);
kz = GRID_SIZE_Z-kz;
}
int indexInHalfComplexGrid = kz + ky*(GRID_SIZE_Z/2+1)+kx*(GRID_SIZE_Y*(GRID_SIZE_Z/2+1));
real2 grid = halfcomplex_pmeGrid[indexInHalfComplexGrid];
if (kx != 0 || ky != 0 || kz != 0) {
energy += eterm*(grid.x*grid.x + grid.y*grid.y);
}
}
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += 0.5f*energy;
}
extern "C" __global__
void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __restrict__ forceBuffers, const real* __restrict__ originalPmeGrid,
real4 periodicBoxSize, real4 invPeriodicBoxSize) {
real4 periodicBoxSize, real4 invPeriodicBoxSize, const int2* __restrict__ pmeAtomGridIndex) {
real3 data[PME_ORDER];
real3 ddata[PME_ORDER];
const real scale = RECIP(PME_ORDER-1);
for (int atom = blockIdx.x*blockDim.x+threadIdx.x; atom < NUM_ATOMS; atom += blockDim.x*gridDim.x) {
// Process the atoms in spatially sorted order. This improves cache performance when loading
// the grid values.
for (int i = blockIdx.x*blockDim.x+threadIdx.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
int atom = pmeAtomGridIndex[i].x;
real3 force = make_real3(0);
real4 pos = posq[atom];
pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
......@@ -233,8 +213,8 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __
(pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
(pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z);
int3 gridIndex = make_int3(((int) t.x) % GRID_SIZE_X,
((int) t.y) % GRID_SIZE_Y,
((int) t.z) % GRID_SIZE_Z);
((int) t.y) % GRID_SIZE_Y,
((int) t.z) % GRID_SIZE_Z);
// Since we need the full set of thetas, it's faster to compute them here than load them
// from global memory.
......@@ -243,7 +223,6 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __
data[PME_ORDER-1] = make_real3(0);
data[1] = dr;
data[0] = make_real3(1)-dr;
for (int j = 3; j < PME_ORDER; j++) {
real div = RECIP(j-1);
data[j-1] = div*dr*data[j-2];
......@@ -252,15 +231,13 @@ void gridInterpolateForce(const real4* __restrict__ posq, unsigned long long* __
data[0] = div*(make_real3(1)-dr)*data[0];
}
ddata[0] = -data[0];
for (int j = 1; j < PME_ORDER; j++)
ddata[j] = data[j-1]-data[j];
data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
for (int j = 1; j < (PME_ORDER-1); j++)
data[PME_ORDER-j-1] = scale*((dr+make_real3(j))*data[PME_ORDER-j-2] + (make_real3(PME_ORDER-j)-dr)*data[PME_ORDER-j-1]);
data[0] = scale*(make_real3(1)-dr)*data[0];
// Compute the force on this atom.
for (int ix = 0; ix < PME_ORDER; ix++) {
......
......@@ -3,7 +3,49 @@ __device__ KEY_TYPE getValue(DATA_TYPE value) {
}
extern "C" {
/**
* Sort a list that is short enough to entirely fit in local memory. This is executed as
* a single thread block.
*/
__global__ void sortShortList(DATA_TYPE* __restrict__ data, unsigned int length) {
// Load the data into local memory.
extern __shared__ DATA_TYPE dataBuffer[];
for (int index = threadIdx.x; index < length; index += blockDim.x)
dataBuffer[index] = data[index];
__syncthreads();
// Perform a bitonic sort in local memory.
for (unsigned int k = 2; k < 2*length; k *= 2) {
for (unsigned int j = k/2; j > 0; j /= 2) {
for (unsigned int i = threadIdx.x; i < length; i += blockDim.x) {
int ixj = i^j;
if (ixj > i && ixj < length) {
DATA_TYPE value1 = dataBuffer[i];
DATA_TYPE value2 = dataBuffer[ixj];
bool ascending = ((i&k) == 0);
for (unsigned int mask = k*2; mask < 2*length; mask *= 2)
ascending = ((i&mask) == 0 ? !ascending : ascending);
KEY_TYPE lowKey = (ascending ? getValue(value1) : getValue(value2));
KEY_TYPE highKey = (ascending ? getValue(value2) : getValue(value1));
if (lowKey > highKey) {
dataBuffer[i] = value2;
dataBuffer[ixj] = value1;
}
}
}
__syncthreads();
}
}
// Write the data back to global memory.
for (int index = threadIdx.x; index < length; index += blockDim.x)
data[index] = dataBuffer[index];
}
/**
* Calculate the minimum and maximum value in the array to be sorted. This kernel
* is executed as a single work group.
......
......@@ -16,12 +16,12 @@ if (cosangle > 0.99f || cosangle < -0.99f) {
theta = PI-theta;
}
else
theta = acos(cosangle);
theta = ACOS(cosangle);
theta = (dot(v0, cp1) >= 0 ? theta : -theta);
COMPUTE_FORCE
real normCross1 = dot(cp0, cp0);
real normSqrBC = dot(v1, v1);
real normBC = sqrt(normSqrBC);
real normBC = SQRT(normSqrBC);
real normCross2 = dot(cp1, cp1);
real dp = RECIP(normSqrBC);
real4 ff = make_real4((-dEdAngle*normBC)/normCross1, dot(v0, v1)*dp, dot(v2, v1)*dp, (dEdAngle*normBC)/normCross2);
......
......@@ -93,8 +93,8 @@ extern "C" __global__ void selectVerletStepSize(mixed maxStepSize, mixed errorTo
__syncthreads();
}
if (threadIdx.x == 0) {
mixed totalError = sqrt(error[0]/(NUM_ATOMS*3));
mixed newStepSize = sqrt(errorTol/totalError);
mixed totalError = SQRT(error[0]/(NUM_ATOMS*3));
mixed newStepSize = SQRT(errorTol/totalError);
mixed oldStepSize = dt[0].y;
if (oldStepSize > 0.0f)
newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase.
......
......@@ -438,9 +438,9 @@ void testLargeSystem() {
}
ASSERT_EQUAL_TOL(cuState.getPotentialEnergy(), referenceState.getPotentialEnergy(), tol);
}
/*
void testBlockInteractions(bool periodic) {
const int blockSize = 32;
const int blockSize = CudaContext::TileSize;
const int numBlocks = 100;
const int numParticles = blockSize*numBlocks;
const double cutoff = 1.0;
......@@ -597,6 +597,8 @@ void testBlockInteractions(bool periodic) {
if (!hasInteractions[i]) {
unsigned int y = (unsigned int) std::floor(numBlocks+0.5-std::sqrt((numBlocks+0.5)*(numBlocks+0.5)-2*i));
unsigned int x = (i-y*numBlocks+y*(y+1)/2);
if (x == y)
continue; // This block has exclusions, so it will not be in the neighbor list.
for (int atom1 = 0; atom1 < blockSize; ++atom1) {
double4 pos1 = posq[x*blockSize+atom1];
for (int atom2 = 0; atom2 < blockSize; ++atom2) {
......@@ -613,14 +615,14 @@ void testBlockInteractions(bool periodic) {
}
}
}
}
}*/
void testDispersionCorrection() {
// Create a box full of identical particles.
int gridSize = 5;
int numParticles = gridSize*gridSize*gridSize;
double boxSize = gridSize*0.5;
double boxSize = gridSize*0.7;
double cutoff = boxSize/3;
System system;
VerletIntegrator integrator(0.01);
......@@ -822,8 +824,8 @@ int main(int argc, char* argv[]) {
testCutoff14();
testPeriodic();
testLargeSystem();
testBlockInteractions(false);
testBlockInteractions(true);
//testBlockInteractions(false);
//testBlockInteractions(true);
testDispersionCorrection();
testChangingParameters();
testParallelComputation(false);
......
......@@ -87,8 +87,7 @@ void verifySorting(vector<float> array) {
ASSERT(elements1 == elements2);
}
void testUniformValues()
{
void testUniformValues() {
OpenMM_SFMT::SFMT sfmt;
init_gen_rand(0, sfmt);
......@@ -98,8 +97,7 @@ void testUniformValues()
verifySorting(array);
}
void testLogValues()
{
void testLogValues() {
OpenMM_SFMT::SFMT sfmt;
init_gen_rand(0, sfmt);
......@@ -109,12 +107,23 @@ void testLogValues()
verifySorting(array);
}
void testShortList() {
OpenMM_SFMT::SFMT sfmt;
init_gen_rand(0, sfmt);
vector<float> array(500);
for (int i = 0; i < (int) array.size(); i++)
array[i] = (float) log(genrand_real2(sfmt));
verifySorting(array);
}
int main(int argc, char* argv[]) {
try {
if (argc > 1)
platform.setPropertyDefaultValue("CudaPrecision", string(argv[1]));
testUniformValues();
testLogValues();
testShortList();
}
catch(const exception& e) {
cout << "exception: " << e.what() << endl;
......
......@@ -99,43 +99,55 @@ void OpenCLBondedUtilities::initialize(const System& system) {
numBuffers[i] = max(numBuffers[i], bufferCounter[i][j]);
}
// Figure out how many force buffers will be required.
for (int i = 0; i < numForces; i++)
numForceBuffers = max(numForceBuffers, numBuffers[i]);
int bufferLimit = max(numForceBuffers, (int) context.getPlatformData().contexts.size());
if (context.getNonbondedUtilities().getHasInteractions())
bufferLimit = max(bufferLimit, context.getNonbondedUtilities().getNumForceBuffers());
// For efficiency, we want to merge multiple forces into a single kernel - but only if that
// won't increase the number of force buffers. Figure out sets of forces that can be merged.
// won't increase the number of force buffers.
vector<int> unmerged(numForces);
for (int i = 0; i < numForces; i++)
unmerged[i] = i;
for (int i = 0; i < numForces; i++)
for (int j = i-1; j >= 0; j--) {
if (numBuffers[unmerged[j]] <= numBuffers[unmerged[j+1]])
break;
int temp = unmerged[j+1];
unmerged[j+1] = unmerged[j];
unmerged[j] = temp;
}
while (unmerged.size() > 0) {
int sum = numBuffers[unmerged.back()];
int i;
for (i = 0; i < (int) unmerged.size()-1; i++) {
if (sum+numBuffers[unmerged[i]] > bufferLimit)
break;
sum += numBuffers[unmerged[i]];
}
if (context.getSupports64BitGlobalAtomics()) {
// Put all the forces in the same set.
numForceBuffers = 1;
forceSets.push_back(vector<int>());
for (int j = 0; j < i; j++)
forceSets.back().push_back(unmerged[j]);
forceSets.back().push_back(unmerged.back());
for (int j = 0; j < i; j++)
unmerged.erase(unmerged.begin());
unmerged.pop_back();
for (int i = 0; i < numForces; i++)
forceSets[0].push_back(i);
}
else {
// Figure out how many force buffers will be required.
for (int i = 0; i < numForces; i++)
numForceBuffers = max(numForceBuffers, numBuffers[i]);
int bufferLimit = max(numForceBuffers, (int) context.getPlatformData().contexts.size());
if (context.getNonbondedUtilities().getHasInteractions())
bufferLimit = max(bufferLimit, context.getNonbondedUtilities().getNumForceBuffers());
// Figure out sets of forces that can be merged.
vector<int> unmerged(numForces);
for (int i = 0; i < numForces; i++)
unmerged[i] = i;
for (int i = 0; i < numForces; i++)
for (int j = i-1; j >= 0; j--) {
if (numBuffers[unmerged[j]] <= numBuffers[unmerged[j+1]])
break;
int temp = unmerged[j+1];
unmerged[j+1] = unmerged[j];
unmerged[j] = temp;
}
while (unmerged.size() > 0) {
int sum = numBuffers[unmerged.back()];
int i;
for (i = 0; i < (int) unmerged.size()-1; i++) {
if (sum+numBuffers[unmerged[i]] > bufferLimit)
break;
sum += numBuffers[unmerged[i]];
}
forceSets.push_back(vector<int>());
for (int j = 0; j < i; j++)
forceSets.back().push_back(unmerged[j]);
forceSets.back().push_back(unmerged.back());
for (int j = 0; j < i; j++)
unmerged.erase(unmerged.begin());
unmerged.pop_back();
}
}
// Update the buffer indices based on merged sets.
......@@ -162,9 +174,13 @@ void OpenCLBondedUtilities::initialize(const System& system) {
const vector<int>& set = *iter;
int setSize = set.size();
stringstream s;
s<<"#ifdef SUPPORTS_64_BIT_ATOMICS\n";
s<<"#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable\n";
s<<"#endif\n";
for (int i = 0; i < (int) prefixCode.size(); i++)
s<<prefixCode[i];
s<<"__kernel void computeBondedForces(__global real4* restrict forceBuffers, __global real* restrict energyBuffer, __global const real4* restrict posq, int groups";
string bufferType = (context.getSupports64BitGlobalAtomics() ? "long" : "real4");
s<<"__kernel void computeBondedForces(__global "<<bufferType<<"* restrict forceBuffers, __global real* restrict energyBuffer, __global const real4* restrict posq, int groups";
for (int i = 0; i < setSize; i++) {
int force = set[i];
string indexType = "uint"+(indexWidth[force] == 1 ? "" : context.intToString(indexWidth[force]));
......@@ -219,10 +235,17 @@ string OpenCLBondedUtilities::createForceSource(int forceIndex, int numBonds, in
s<<computeForce<<"\n";
for (int i = 0; i < numAtoms; i++) {
s<<" {\n";
s<<" unsigned int offset = atom"<<(i+1)<<"+buffers"<<suffix[i]<<"*PADDED_NUM_ATOMS;\n";
s<<" real4 force = forceBuffers[offset];\n";
s<<" force.xyz += force"<<(i+1)<<".xyz;\n";
s<<" forceBuffers[offset] = force;\n";
if (context.getSupports64BitGlobalAtomics()) {
s<<" atom_add(&forceBuffers[atom"<<(i+1)<<"], (long) (force"<<(i+1)<<".x*0x100000000));\n";
s<<" atom_add(&forceBuffers[atom"<<(i+1)<<"+PADDED_NUM_ATOMS], (long) (force"<<(i+1)<<".y*0x100000000));\n";
s<<" atom_add(&forceBuffers[atom"<<(i+1)<<"+2*PADDED_NUM_ATOMS], (long) (force"<<(i+1)<<".z*0x100000000));\n";
}
else {
s<<" unsigned int offset = atom"<<(i+1)<<"+buffers"<<suffix[i]<<"*PADDED_NUM_ATOMS;\n";
s<<" real4 force = forceBuffers[offset];\n";
s<<" force.xyz += force"<<(i+1)<<".xyz;\n";
s<<" forceBuffers[offset] = force;\n";
}
s<<" }\n";
}
s<<"}\n";
......@@ -235,7 +258,10 @@ void OpenCLBondedUtilities::computeInteractions(int groups) {
for (int i = 0; i < (int) forceSets.size(); i++) {
int index = 0;
cl::Kernel& kernel = kernels[i];
kernel.setArg<cl::Buffer>(index++, context.getForceBuffers().getDeviceBuffer());
if (context.getSupports64BitGlobalAtomics())
kernel.setArg<cl::Buffer>(index++, context.getLongForceBuffer().getDeviceBuffer());
else
kernel.setArg<cl::Buffer>(index++, context.getForceBuffers().getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, context.getEnergyBuffer().getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, context.getPosq().getDeviceBuffer());
index++;
......
......@@ -97,6 +97,7 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
// Try to figure out which device is the fastest.
int bestSpeed = -1;
bool bestSupportsDouble = false;
for (int i = 0; i < (int) devices.size(); i++) {
if (platformVendor == "Apple" && devices[i].getInfo<CL_DEVICE_VENDOR>() == "AMD")
continue; // Don't use AMD GPUs on OS X due to serious bugs.
......@@ -135,9 +136,11 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
}
}
int speed = devices[i].getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>()*processingElementsPerComputeUnit*devices[i].getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>();
if (maxSize >= minThreadBlockSize && speed > bestSpeed) {
bool supportsDouble = (devices[i].getInfo<CL_DEVICE_EXTENSIONS>().find("cl_khr_fp64") != string::npos);
if (maxSize >= minThreadBlockSize && speed > bestSpeed && (supportsDouble || !bestSupportsDouble)) {
deviceIndex = i;
bestSpeed = speed;
bestSupportsDouble = supportsDouble;
}
}
}
......@@ -173,9 +176,6 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
}
}
else if (vendor.size() >= 28 && vendor.substr(0, 28) == "Advanced Micro Devices, Inc.") {
// Disable 64 bit atomics. A future version of the driver will support them, but until we can test that,
// it's safest not to use them.
supports64BitGlobalAtomics = false;
if (device.getInfo<CL_DEVICE_TYPE>() != CL_DEVICE_TYPE_GPU) {
/// \todo Is 6 a good value for the OpenCL CPU device?
// numThreadBlocksPerComputeUnit = ?;
......@@ -190,14 +190,11 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
// check for errors.
try {
#ifdef CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD
// AMD has both 32 and 64 width SIMDs. Can determine by using:
// simdWidth = device.getInfo<CL_DEVICE_WAVEFRONT_WIDTH_AMD>();
// Must catch cl:Error as will fail if runtime does not support queries.
// However, the 32 width NVIDIA kernels do not have all the necessary
// barriers and so will not work for AMD.
// So for now leave default of 1 which will use the default kernels.
cl_uint simdPerComputeUnit = device.getInfo<CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD>();
simdWidth = device.getInfo<CL_DEVICE_WAVEFRONT_WIDTH_AMD>();
// If the GPU has multiple SIMDs per compute unit then it is uses the scalar instruction
// set instead of the VLIW instruction set. It therefore needs more thread blocks per
// compute unit to hide memory latency.
......@@ -226,6 +223,10 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
compilationDefines["SUPPORTS_64_BIT_ATOMICS"] = "";
if (supportsDoublePrecision)
compilationDefines["SUPPORTS_DOUBLE_PRECISION"] = "";
if (simdWidth >= 32)
compilationDefines["SYNC_WARPS"] = "";
else
compilationDefines["SYNC_WARPS"] = "barrier(CLK_LOCAL_MEM_FENCE)";
vector<cl::Device> contextDevices;
contextDevices.push_back(device);
cl_context_properties cprops[] = {CL_CONTEXT_PLATFORM, (cl_context_properties) platforms[platformIndex](), 0};
......
......@@ -36,27 +36,24 @@ using namespace OpenMM;
using namespace std;
OpenCLFFT3D::OpenCLFFT3D(OpenCLContext& context, int xsize, int ysize, int zsize) : context(context), xsize(xsize), ysize(ysize), zsize(zsize) {
zkernel = createKernel(xsize, ysize, zsize);
xkernel = createKernel(ysize, zsize, xsize);
ykernel = createKernel(zsize, xsize, ysize);
zkernel = createKernel(xsize, ysize, zsize, zthreads);
xkernel = createKernel(ysize, zsize, xsize, xthreads);
ykernel = createKernel(zsize, xsize, ysize, ythreads);
}
void OpenCLFFT3D::execFFT(OpenCLArray& in, OpenCLArray& out, bool forward) {
int maxSize = xkernel.getWorkGroupInfo<CL_KERNEL_WORK_GROUP_SIZE>(context.getDevice());
if (context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU)
maxSize = 1;
zkernel.setArg<cl::Buffer>(0, in.getDeviceBuffer());
zkernel.setArg<cl::Buffer>(1, out.getDeviceBuffer());
zkernel.setArg<cl_int>(2, forward ? 1 : -1);
context.executeKernel(zkernel, xsize*ysize*zsize, min(zsize, (int) maxSize));
context.executeKernel(zkernel, xsize*ysize*zsize, zthreads);
xkernel.setArg<cl::Buffer>(0, out.getDeviceBuffer());
xkernel.setArg<cl::Buffer>(1, in.getDeviceBuffer());
xkernel.setArg<cl_int>(2, forward ? 1 : -1);
context.executeKernel(xkernel, xsize*ysize*zsize, min(xsize, (int) maxSize));
context.executeKernel(xkernel, xsize*ysize*zsize, xthreads);
ykernel.setArg<cl::Buffer>(0, in.getDeviceBuffer());
ykernel.setArg<cl::Buffer>(1, out.getDeviceBuffer());
ykernel.setArg<cl_int>(2, forward ? 1 : -1);
context.executeKernel(ykernel, xsize*ysize*zsize, min(ysize, (int) maxSize));
context.executeKernel(ykernel, xsize*ysize*zsize, ythreads);
}
int OpenCLFFT3D::findLegalDimension(int minimum) {
......@@ -66,7 +63,7 @@ int OpenCLFFT3D::findLegalDimension(int minimum) {
// Attempt to factor the current value.
int unfactored = minimum;
for (int factor = 2; factor < 6; factor++) {
for (int factor = 2; factor < 8; factor++) {
while (unfactored > 1 && unfactored%factor == 0)
unfactored /= factor;
}
......@@ -76,9 +73,10 @@ int OpenCLFFT3D::findLegalDimension(int minimum) {
}
}
cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize) {
cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize, int& threads) {
bool loopRequired = (context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
stringstream source;
int blocksPerGroup = (loopRequired ? 1 : max(1, 256/zsize));
int stage = 0;
int L = zsize;
int m = 1;
......@@ -88,22 +86,85 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize) {
while (L > 1) {
int input = stage%2;
int output = 1-input;
int radix;
if (L%7 == 0)
radix = 7;
else if (L%5 == 0)
radix = 5;
else if (L%4 == 0)
radix = 4;
else if (L%3 == 0)
radix = 3;
else if (L%2 == 0)
radix = 2;
else
throw OpenMMException("Illegal size for FFT: "+context.intToString(zsize));
source<<"{\n";
if (L%5 == 0) {
L = L/5;
source<<"// Pass "<<(stage+1)<<" (radix 5)\n";
if (loopRequired)
source<<"for (int i = get_local_id(0); i < "<<(L*m)<<"; i += get_local_size(0)) {\n";
else {
source<<"if (get_local_id(0) < "<<(L*m)<<") {\n";
source<<"int i = get_local_id(0);\n";
}
source<<"int j = i/"<<m<<";\n";
source<<"real2 c0 = data"<<input<<"[i];\n";
source<<"real2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
source<<"real2 c2 = data"<<input<<"[i+"<<(2*L*m)<<"];\n";
source<<"real2 c3 = data"<<input<<"[i+"<<(3*L*m)<<"];\n";
source<<"real2 c4 = data"<<input<<"[i+"<<(4*L*m)<<"];\n";
L = L/radix;
source<<"// Pass "<<(stage+1)<<" (radix "<<radix<<")\n";
if (loopRequired) {
source<<"for (int i = get_local_id(0); i < "<<(L*m)<<"; i += get_local_size(0)) {\n";
source<<"int base = i;\n";
}
else {
source<<"if (get_local_id(0) < "<<(blocksPerGroup*L*m)<<") {\n";
source<<"int block = get_local_id(0)/"<<(L*m)<<";\n";
source<<"int i = get_local_id(0)-block*"<<(L*m)<<";\n";
source<<"int base = i+block*"<<zsize<<";\n";
}
source<<"int j = i/"<<m<<";\n";
if (radix == 7) {
source<<"real2 c0 = data"<<input<<"[base];\n";
source<<"real2 c1 = data"<<input<<"[base+"<<(L*m)<<"];\n";
source<<"real2 c2 = data"<<input<<"[base+"<<(2*L*m)<<"];\n";
source<<"real2 c3 = data"<<input<<"[base+"<<(3*L*m)<<"];\n";
source<<"real2 c4 = data"<<input<<"[base+"<<(4*L*m)<<"];\n";
source<<"real2 c5 = data"<<input<<"[base+"<<(5*L*m)<<"];\n";
source<<"real2 c6 = data"<<input<<"[base+"<<(6*L*m)<<"];\n";
source<<"real2 d0 = c1+c6;\n";
source<<"real2 d1 = c1-c6;\n";
source<<"real2 d2 = c2+c5;\n";
source<<"real2 d3 = c2-c5;\n";
source<<"real2 d4 = c4+c3;\n";
source<<"real2 d5 = c4-c3;\n";
source<<"real2 d6 = d2+d0;\n";
source<<"real2 d7 = d5+d3;\n";
source<<"real2 b0 = c0+d6+d4;\n";
source<<"real2 b1 = "<<context.doubleToString((cos(2*M_PI/7)+cos(4*M_PI/7)+cos(6*M_PI/7))/3-1)<<"*(d6+d4);\n";
source<<"real2 b2 = "<<context.doubleToString((2*cos(2*M_PI/7)-cos(4*M_PI/7)-cos(6*M_PI/7))/3)<<"*(d0-d4);\n";
source<<"real2 b3 = "<<context.doubleToString((cos(2*M_PI/7)-2*cos(4*M_PI/7)+cos(6*M_PI/7))/3)<<"*(d4-d2);\n";
source<<"real2 b4 = "<<context.doubleToString((cos(2*M_PI/7)+cos(4*M_PI/7)-2*cos(6*M_PI/7))/3)<<"*(d2-d0);\n";
source<<"real2 b5 = -sign*"<<context.doubleToString((sin(2*M_PI/7)+sin(4*M_PI/7)-sin(6*M_PI/7))/3)<<"*(d7+d1);\n";
source<<"real2 b6 = -sign*"<<context.doubleToString((2*sin(2*M_PI/7)-sin(4*M_PI/7)+sin(6*M_PI/7))/3)<<"*(d1-d5);\n";
source<<"real2 b7 = -sign*"<<context.doubleToString((sin(2*M_PI/7)-2*sin(4*M_PI/7)-sin(6*M_PI/7))/3)<<"*(d5-d3);\n";
source<<"real2 b8 = -sign*"<<context.doubleToString((sin(2*M_PI/7)+sin(4*M_PI/7)+2*sin(6*M_PI/7))/3)<<"*(d3-d1);\n";
source<<"real2 t0 = b0+b1;\n";
source<<"real2 t1 = b2+b3;\n";
source<<"real2 t2 = b4-b3;\n";
source<<"real2 t3 = -b2-b4;\n";
source<<"real2 t4 = b6+b7;\n";
source<<"real2 t5 = b8-b7;\n";
source<<"real2 t6 = -b8-b6;\n";
source<<"real2 t7 = t0+t1;\n";
source<<"real2 t8 = t0+t2;\n";
source<<"real2 t9 = t0+t3;\n";
source<<"real2 t10 = (real2) (t4.y+b5.y, -(t4.x+b5.x));\n";
source<<"real2 t11 = (real2) (t5.y+b5.y, -(t5.x+b5.x));\n";
source<<"real2 t12 = (real2) (t6.y+b5.y, -(t6.x+b5.x));\n";
source<<"data"<<output<<"[base+6*j*"<<m<<"] = b0;\n";
source<<"data"<<output<<"[base+(6*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(7*L)<<"], t7-t10);\n";
source<<"data"<<output<<"[base+(6*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(7*L)<<"], t9-t12);\n";
source<<"data"<<output<<"[base+(6*j+3)*"<<m<<"] = multiplyComplex(w[j*"<<(3*zsize)<<"/"<<(7*L)<<"], t8+t11);\n";
source<<"data"<<output<<"[base+(6*j+4)*"<<m<<"] = multiplyComplex(w[j*"<<(4*zsize)<<"/"<<(7*L)<<"], t8-t11);\n";
source<<"data"<<output<<"[base+(6*j+5)*"<<m<<"] = multiplyComplex(w[j*"<<(5*zsize)<<"/"<<(7*L)<<"], t9+t12);\n";
source<<"data"<<output<<"[base+(6*j+6)*"<<m<<"] = multiplyComplex(w[j*"<<(6*zsize)<<"/"<<(7*L)<<"], t7+t10);\n";
}
else if (radix == 5) {
source<<"real2 c0 = data"<<input<<"[base];\n";
source<<"real2 c1 = data"<<input<<"[base+"<<(L*m)<<"];\n";
source<<"real2 c2 = data"<<input<<"[base+"<<(2*L*m)<<"];\n";
source<<"real2 c3 = data"<<input<<"[base+"<<(3*L*m)<<"];\n";
source<<"real2 c4 = data"<<input<<"[base+"<<(4*L*m)<<"];\n";
source<<"real2 d0 = c1+c4;\n";
source<<"real2 d1 = c2+c3;\n";
source<<"real2 d2 = "<<context.doubleToString(sin(0.4*M_PI))<<"*(c1-c4);\n";
......@@ -116,80 +177,45 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize) {
string coeff = context.doubleToString(sin(0.2*M_PI)/sin(0.4*M_PI));
source<<"real2 d9 = sign*(real2) (d2.y+"<<coeff<<"*d3.y, -d2.x-"<<coeff<<"*d3.x);\n";
source<<"real2 d10 = sign*(real2) ("<<coeff<<"*d2.y-d3.y, d3.x-"<<coeff<<"*d2.x);\n";
source<<"data"<<output<<"[i+4*j*"<<m<<"] = c0+d4;\n";
source<<"data"<<output<<"[i+(4*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(5*L)<<"], d7+d9);\n";
source<<"data"<<output<<"[i+(4*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(5*L)<<"], d8+d10);\n";
source<<"data"<<output<<"[i+(4*j+3)*"<<m<<"] = multiplyComplex(w[j*"<<(3*zsize)<<"/"<<(5*L)<<"], d8-d10);\n";
source<<"data"<<output<<"[i+(4*j+4)*"<<m<<"] = multiplyComplex(w[j*"<<(4*zsize)<<"/"<<(5*L)<<"], d7-d9);\n";
source<<"}\n";
m = m*5;
source<<"data"<<output<<"[base+4*j*"<<m<<"] = c0+d4;\n";
source<<"data"<<output<<"[base+(4*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(5*L)<<"], d7+d9);\n";
source<<"data"<<output<<"[base+(4*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(5*L)<<"], d8+d10);\n";
source<<"data"<<output<<"[base+(4*j+3)*"<<m<<"] = multiplyComplex(w[j*"<<(3*zsize)<<"/"<<(5*L)<<"], d8-d10);\n";
source<<"data"<<output<<"[base+(4*j+4)*"<<m<<"] = multiplyComplex(w[j*"<<(4*zsize)<<"/"<<(5*L)<<"], d7-d9);\n";
}
else if (L%4 == 0) {
L = L/4;
source<<"// Pass "<<(stage+1)<<" (radix 4)\n";
if (loopRequired)
source<<"for (int i = get_local_id(0); i < "<<(L*m)<<"; i += get_local_size(0)) {\n";
else {
source<<"if (get_local_id(0) < "<<(L*m)<<") {\n";
source<<"int i = get_local_id(0);\n";
}
source<<"int j = i/"<<m<<";\n";
source<<"real2 c0 = data"<<input<<"[i];\n";
source<<"real2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
source<<"real2 c2 = data"<<input<<"[i+"<<(2*L*m)<<"];\n";
source<<"real2 c3 = data"<<input<<"[i+"<<(3*L*m)<<"];\n";
else if (radix == 4) {
source<<"real2 c0 = data"<<input<<"[base];\n";
source<<"real2 c1 = data"<<input<<"[base+"<<(L*m)<<"];\n";
source<<"real2 c2 = data"<<input<<"[base+"<<(2*L*m)<<"];\n";
source<<"real2 c3 = data"<<input<<"[base+"<<(3*L*m)<<"];\n";
source<<"real2 d0 = c0+c2;\n";
source<<"real2 d1 = c0-c2;\n";
source<<"real2 d2 = c1+c3;\n";
source<<"real2 d3 = sign*(real2) (c1.y-c3.y, c3.x-c1.x);\n";
source<<"data"<<output<<"[i+3*j*"<<m<<"] = d0+d2;\n";
source<<"data"<<output<<"[i+(3*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(4*L)<<"], d1+d3);\n";
source<<"data"<<output<<"[i+(3*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(4*L)<<"], d0-d2);\n";
source<<"data"<<output<<"[i+(3*j+3)*"<<m<<"] = multiplyComplex(w[j*"<<(3*zsize)<<"/"<<(4*L)<<"], d1-d3);\n";
source<<"}\n";
m = m*4;
source<<"data"<<output<<"[base+3*j*"<<m<<"] = d0+d2;\n";
source<<"data"<<output<<"[base+(3*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(4*L)<<"], d1+d3);\n";
source<<"data"<<output<<"[base+(3*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(4*L)<<"], d0-d2);\n";
source<<"data"<<output<<"[base+(3*j+3)*"<<m<<"] = multiplyComplex(w[j*"<<(3*zsize)<<"/"<<(4*L)<<"], d1-d3);\n";
}
else if (L%3 == 0) {
L = L/3;
source<<"// Pass "<<(stage+1)<<" (radix 3)\n";
if (loopRequired)
source<<"for (int i = get_local_id(0); i < "<<(L*m)<<"; i += get_local_size(0)) {\n";
else {
source<<"if (get_local_id(0) < "<<(L*m)<<") {\n";
source<<"int i = get_local_id(0);\n";
}
source<<"int j = i/"<<m<<";\n";
source<<"real2 c0 = data"<<input<<"[i];\n";
source<<"real2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
source<<"real2 c2 = data"<<input<<"[i+"<<(2*L*m)<<"];\n";
else if (radix == 3) {
source<<"real2 c0 = data"<<input<<"[base];\n";
source<<"real2 c1 = data"<<input<<"[base+"<<(L*m)<<"];\n";
source<<"real2 c2 = data"<<input<<"[base+"<<(2*L*m)<<"];\n";
source<<"real2 d0 = c1+c2;\n";
source<<"real2 d1 = c0-0.5f*d0;\n";
source<<"real2 d2 = sign*"<<context.doubleToString(sin(M_PI/3.0))<<"*(real2) (c1.y-c2.y, c2.x-c1.x);\n";
source<<"data"<<output<<"[i+2*j*"<<m<<"] = c0+d0;\n";
source<<"data"<<output<<"[i+(2*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(3*L)<<"], d1+d2);\n";
source<<"data"<<output<<"[i+(2*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(3*L)<<"], d1-d2);\n";
source<<"}\n";
m = m*3;
source<<"data"<<output<<"[base+2*j*"<<m<<"] = c0+d0;\n";
source<<"data"<<output<<"[base+(2*j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(3*L)<<"], d1+d2);\n";
source<<"data"<<output<<"[base+(2*j+2)*"<<m<<"] = multiplyComplex(w[j*"<<(2*zsize)<<"/"<<(3*L)<<"], d1-d2);\n";
}
else if (L%2 == 0) {
L = L/2;
source<<"// Pass "<<(stage+1)<<" (radix 2)\n";
if (loopRequired)
source<<"for (int i = get_local_id(0); i < "<<(L*m)<<"; i += get_local_size(0)) {\n";
else {
source<<"if (get_local_id(0) < "<<(L*m)<<") {\n";
source<<"int i = get_local_id(0);\n";
}
source<<"int j = i/"<<m<<";\n";
source<<"real2 c0 = data"<<input<<"[i];\n";
source<<"real2 c1 = data"<<input<<"[i+"<<(L*m)<<"];\n";
source<<"data"<<output<<"[i+j*"<<m<<"] = c0+c1;\n";
source<<"data"<<output<<"[i+(j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(2*L)<<"], c0-c1);\n";
source<<"}\n";
m = m*2;
else if (radix == 2) {
source<<"real2 c0 = data"<<input<<"[base];\n";
source<<"real2 c1 = data"<<input<<"[base+"<<(L*m)<<"];\n";
source<<"data"<<output<<"[base+j*"<<m<<"] = c0+c1;\n";
source<<"data"<<output<<"[base+(j+1)*"<<m<<"] = multiplyComplex(w[j*"<<zsize<<"/"<<(2*L)<<"], c0-c1);\n";
}
else
throw OpenMMException("Illegal size for FFT: "+context.intToString(zsize));
source<<"}\n";
m = m*radix;
source<<"barrier(CLK_LOCAL_MEM_FENCE);\n";
source<<"}\n";
++stage;
......@@ -202,20 +228,22 @@ cl::Kernel OpenCLFFT3D::createKernel(int xsize, int ysize, int zsize) {
source<<"out[y*(ZSIZE*XSIZE)+z*XSIZE+x] = data"<<(stage%2)<<"[z];\n";
}
else
source<<"out[y*(ZSIZE*XSIZE)+get_local_id(0)*XSIZE+x] = data"<<(stage%2)<<"[get_local_id(0)];\n";
source<<"out[y*(ZSIZE*XSIZE)+(get_local_id(0)%ZSIZE)*XSIZE+x] = data"<<(stage%2)<<"[get_local_id(0)];\n";
source<<"barrier(CLK_GLOBAL_MEM_FENCE);";
map<string, string> replacements;
replacements["XSIZE"] = context.intToString(xsize);
replacements["YSIZE"] = context.intToString(ysize);
replacements["ZSIZE"] = context.intToString(zsize);
replacements["BLOCKS_PER_GROUP"] = context.intToString(blocksPerGroup);
replacements["M_PI"] = context.doubleToString(M_PI);
replacements["COMPUTE_FFT"] = source.str();
replacements["LOOP_REQUIRED"] = (loopRequired ? "1" : "0");
cl::Program program = context.createProgram(context.replaceStrings(OpenCLKernelSources::fft, replacements));
cl::Kernel kernel(program, "execFFT");
int bufferSize = zsize*(context.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2));
int bufferSize = blocksPerGroup*zsize*(context.getUseDoublePrecision() ? sizeof(mm_double2) : sizeof(mm_float2));
kernel.setArg(3, bufferSize, NULL);
kernel.setArg(4, bufferSize, NULL);
kernel.setArg(5, bufferSize, NULL);
threads = (loopRequired ? 1 : blocksPerGroup*zsize);
return kernel;
}
......@@ -81,8 +81,9 @@ public:
*/
static int findLegalDimension(int minimum);
private:
cl::Kernel createKernel(int xsize, int ysize, int zsize);
cl::Kernel createKernel(int xsize, int ysize, int zsize, int& threads);
int xsize, ysize, zsize;
int xthreads, ythreads, zthreads;
OpenCLContext& context;
cl::Kernel xkernel, ykernel, zkernel;
};
......
......@@ -99,7 +99,7 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
random(NULL), randomSeed(NULL), randomPos(0), stepSize(NULL), ccmaAtoms(NULL), ccmaDistance(NULL),
ccmaReducedMass(NULL), ccmaAtomConstraints(NULL), ccmaNumAtomConstraints(NULL), ccmaConstraintMatrixColumn(NULL),
ccmaConstraintMatrixValue(NULL), ccmaDelta1(NULL), ccmaDelta2(NULL), ccmaConverged(NULL),
ccmaConvergedBuffer(NULL), vsite2AvgAtoms(NULL), vsite2AvgWeights(NULL), vsite3AvgAtoms(NULL), vsite3AvgWeights(NULL),
vsite2AvgAtoms(NULL), vsite2AvgWeights(NULL), vsite3AvgAtoms(NULL), vsite3AvgWeights(NULL),
vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL), hasInitializedPosConstraintKernels(false), hasInitializedVelConstraintKernels(false) {
// Create workspace arrays.
......@@ -479,8 +479,6 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
ccmaNumAtomConstraints = OpenCLArray::create<cl_int>(context, numAtoms, "CcmaAtomConstraintsIndex");
ccmaConstraintMatrixColumn = OpenCLArray::create<cl_int>(context, numCCMA*maxRowElements, "ConstraintMatrixColumn");
ccmaConverged = OpenCLArray::create<cl_int>(context, 2, "CcmaConverged");
ccmaConvergedBuffer = new cl::Buffer(context.getContext(), CL_MEM_ALLOC_HOST_PTR, 2*sizeof(cl_int));
ccmaConvergedMemory = (cl_int*) context.getQueue().enqueueMapBuffer(*ccmaConvergedBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, 2*sizeof(cl_int));
vector<mm_int2> atomsVec(ccmaAtoms->getSize());
vector<cl_int> atomConstraintsVec(ccmaAtomConstraints->getSize());
vector<cl_int> numAtomConstraintsVec(ccmaNumAtomConstraints->getSize());
......@@ -660,24 +658,28 @@ OpenCLIntegrationUtilities::OpenCLIntegrationUtilities(OpenCLContext& context, c
defines["NUM_OUT_OF_PLANE"] = context.intToString(numOutOfPlane);
cl::Program vsiteProgram = context.createProgram(OpenCLKernelSources::virtualSites, defines);
vsitePositionKernel = cl::Kernel(vsiteProgram, "computeVirtualSites");
vsitePositionKernel.setArg<cl::Buffer>(0, context.getPosq().getDeviceBuffer());
setPosqCorrectionArg(context, vsitePositionKernel, 1);
vsitePositionKernel.setArg<cl::Buffer>(2, vsite2AvgAtoms->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(3, vsite2AvgWeights->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(4, vsite3AvgAtoms->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(5, vsite3AvgWeights->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(6, vsiteOutOfPlaneAtoms->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(7, vsiteOutOfPlaneWeights->getDeviceBuffer());
int index = 0;
vsitePositionKernel.setArg<cl::Buffer>(index++, context.getPosq().getDeviceBuffer());
if (context.getUseMixedPrecision())
vsitePositionKernel.setArg<cl::Buffer>(index++, context.getPosqCorrection().getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(index++, vsite2AvgAtoms->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(index++, vsite2AvgWeights->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(index++, vsite3AvgAtoms->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(index++, vsite3AvgWeights->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(index++, vsiteOutOfPlaneAtoms->getDeviceBuffer());
vsitePositionKernel.setArg<cl::Buffer>(index++, vsiteOutOfPlaneWeights->getDeviceBuffer());
vsiteForceKernel = cl::Kernel(vsiteProgram, "distributeForces");
vsiteForceKernel.setArg<cl::Buffer>(0, context.getPosq().getDeviceBuffer());
setPosqCorrectionArg(context, vsiteForceKernel, 1);
// Skip argument 2: the force array hasn't been created yet.
vsiteForceKernel.setArg<cl::Buffer>(3, vsite2AvgAtoms->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(4, vsite2AvgWeights->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(5, vsite3AvgAtoms->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(6, vsite3AvgWeights->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(7, vsiteOutOfPlaneAtoms->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(8, vsiteOutOfPlaneWeights->getDeviceBuffer());
index = 0;
vsiteForceKernel.setArg<cl::Buffer>(index++, context.getPosq().getDeviceBuffer());
index++; // Skip argument 1: the force array hasn't been created yet.
if (context.getUseMixedPrecision())
vsiteForceKernel.setArg<cl::Buffer>(index++, context.getPosqCorrection().getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(index++, vsite2AvgAtoms->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(index++, vsite2AvgWeights->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(index++, vsite3AvgAtoms->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(index++, vsite3AvgWeights->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(index++, vsiteOutOfPlaneAtoms->getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(index++, vsiteOutOfPlaneWeights->getDeviceBuffer());
numVsites = num2Avg+num3Avg+numOutOfPlane;
}
......@@ -718,8 +720,6 @@ OpenCLIntegrationUtilities::~OpenCLIntegrationUtilities() {
delete ccmaDelta2;
if (ccmaConverged != NULL)
delete ccmaConverged;
if (ccmaConvergedBuffer != NULL)
delete ccmaConvergedBuffer;
if (vsite2AvgAtoms != NULL)
delete vsite2AvgAtoms;
if (vsite2AvgWeights != NULL)
......@@ -807,6 +807,7 @@ void OpenCLIntegrationUtilities::applyConstraints(bool constrainVelocities, doub
ccmaDirectionsKernel.setArg<cl::Buffer>(3, context.getPosqCorrection().getDeviceBuffer());
else
ccmaDirectionsKernel.setArg<void*>(3, NULL);
ccmaDirectionsKernel.setArg<cl::Buffer>(4, ccmaConverged->getDeviceBuffer());
ccmaForceKernel.setArg<cl::Buffer>(0, ccmaAtoms->getDeviceBuffer());
ccmaForceKernel.setArg<cl::Buffer>(1, ccmaDistance->getDeviceBuffer());
ccmaForceKernel.setArg<cl::Buffer>(2, constrainVelocities ? context.getVelm().getDeviceBuffer() : posDelta->getDeviceBuffer());
......@@ -834,23 +835,19 @@ void OpenCLIntegrationUtilities::applyConstraints(bool constrainVelocities, doub
context.executeKernel(ccmaDirectionsKernel, ccmaAtoms->getSize());
const int checkInterval = 4;
cl::Event event;
int* converged = (int*) context.getPinnedBuffer();
for (int i = 0; i < 150; i++) {
ccmaForceKernel.setArg<cl_int>(7, i);
if (i == 0) {
ccmaConvergedMemory[0] = 1;
ccmaConvergedMemory[1] = 0;
context.getQueue().enqueueWriteBuffer(ccmaConverged->getDeviceBuffer(), CL_FALSE, 0, 2*sizeof(cl_int), ccmaConvergedMemory);
}
context.executeKernel(ccmaForceKernel, ccmaAtoms->getSize());
if ((i+1)%checkInterval == 0)
context.getQueue().enqueueReadBuffer(ccmaConverged->getDeviceBuffer(), CL_FALSE, 0, 2*sizeof(cl_int), ccmaConvergedMemory, NULL, &event);
context.getQueue().enqueueReadBuffer(ccmaConverged->getDeviceBuffer(), CL_FALSE, 0, 2*sizeof(cl_int), converged, NULL, &event);
ccmaMultiplyKernel.setArg<cl_int>(5, i);
context.executeKernel(ccmaMultiplyKernel, ccmaAtoms->getSize());
ccmaUpdateKernel.setArg<cl_int>(8, i);
context.executeKernel(ccmaUpdateKernel, context.getNumAtoms());
if ((i+1)%checkInterval == 0) {
event.wait();
if (ccmaConvergedMemory[i%2])
if (converged[i%2])
break;
}
}
......@@ -864,7 +861,7 @@ void OpenCLIntegrationUtilities::computeVirtualSites() {
void OpenCLIntegrationUtilities::distributeForcesFromVirtualSites() {
if (numVsites > 0) {
vsiteForceKernel.setArg<cl::Buffer>(2, context.getForce().getDeviceBuffer());
vsiteForceKernel.setArg<cl::Buffer>(1, context.getForce().getDeviceBuffer());
context.executeKernel(vsiteForceKernel, numVsites);
}
}
......
......@@ -141,8 +141,6 @@ private:
OpenCLArray* ccmaDelta1;
OpenCLArray* ccmaDelta2;
OpenCLArray* ccmaConverged;
cl::Buffer* ccmaConvergedBuffer;
cl_int* ccmaConvergedMemory;
OpenCLArray* vsite2AvgAtoms;
OpenCLArray* vsite2AvgWeights;
OpenCLArray* vsite3AvgAtoms;
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2010 Stanford University and the Authors. *
* Portions copyright (c) 2008-2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -1345,8 +1345,6 @@ OpenCLCalcNonbondedForceKernel::~OpenCLCalcNonbondedForceKernel() {
delete pmeBsplineModuliZ;
if (pmeBsplineTheta != NULL)
delete pmeBsplineTheta;
if (pmeBsplineDTheta != NULL)
delete pmeBsplineDTheta;
if (pmeAtomRange != NULL)
delete pmeAtomRange;
if (pmeAtomGridIndex != NULL)
......@@ -1468,6 +1466,9 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
pmeDefines["GRID_SIZE_Y"] = cl.intToString(gridSizeY);
pmeDefines["GRID_SIZE_Z"] = cl.intToString(gridSizeZ);
pmeDefines["EPSILON_FACTOR"] = cl.doubleToString(sqrt(ONE_4PI_EPS0));
bool deviceIsCpu = (cl.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
if (deviceIsCpu)
pmeDefines["DEVICE_IS_CPU"] = "1";
// Create required data structures.
......@@ -1479,12 +1480,9 @@ void OpenCLCalcNonbondedForceKernel::initialize(const System& system, const Nonb
pmeBsplineModuliY = new OpenCLArray(cl, gridSizeY, elementSize, "pmeBsplineModuliY");
pmeBsplineModuliZ = new OpenCLArray(cl, gridSizeZ, elementSize, "pmeBsplineModuliZ");
pmeBsplineTheta = new OpenCLArray(cl, PmeOrder*numParticles, 4*elementSize, "pmeBsplineTheta");
bool deviceIsCpu = (cl.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
if (deviceIsCpu)
pmeBsplineDTheta = new OpenCLArray(cl, PmeOrder*numParticles, 4*elementSize, "pmeBsplineDTheta");
pmeAtomRange = OpenCLArray::create<cl_int>(cl, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
pmeAtomGridIndex = OpenCLArray::create<mm_int2>(cl, numParticles, "pmeAtomGridIndex");
sort = new OpenCLSort<SortTrait>(cl, cl.getNumAtoms());
sort = new OpenCLSort(cl, new SortTrait(), cl.getNumAtoms());
fft = new OpenCLFFT3D(cl, gridSizeX, gridSizeY, gridSizeZ);
// Initialize the b-spline moduli.
......@@ -1608,12 +1606,10 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
ewaldForcesKernel.setArg<cl::Buffer>(2, cosSinSums->getDeviceBuffer());
}
if (pmeGrid != NULL) {
string file = (deviceIsCpu ? OpenCLKernelSources::pme_cpu : OpenCLKernelSources::pme);
cl::Program program = cl.createProgram(file, pmeDefines);
cl::Program program = cl.createProgram(OpenCLKernelSources::pme, pmeDefines);
pmeUpdateBsplinesKernel = cl::Kernel(program, "updateBsplines");
pmeAtomRangeKernel = cl::Kernel(program, "findAtomRangeForGrid");
if (!deviceIsCpu)
pmeZIndexKernel = cl::Kernel(program, "recordZIndex");
pmeZIndexKernel = cl::Kernel(program, "recordZIndex");
pmeSpreadChargeKernel = cl::Kernel(program, "gridSpreadCharge");
pmeConvolutionKernel = cl::Kernel(program, "reciprocalConvolution");
pmeInterpolateForceKernel = cl::Kernel(program, "gridInterpolateForce");
......@@ -1622,15 +1618,11 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeUpdateBsplinesKernel.setArg<cl::Buffer>(1, pmeBsplineTheta->getDeviceBuffer());
pmeUpdateBsplinesKernel.setArg(2, OpenCLContext::ThreadBlockSize*PmeOrder*elementSize, NULL);
pmeUpdateBsplinesKernel.setArg<cl::Buffer>(3, pmeAtomGridIndex->getDeviceBuffer());
if (deviceIsCpu)
pmeUpdateBsplinesKernel.setArg<cl::Buffer>(6, pmeBsplineDTheta->getDeviceBuffer());
pmeAtomRangeKernel.setArg<cl::Buffer>(0, pmeAtomGridIndex->getDeviceBuffer());
pmeAtomRangeKernel.setArg<cl::Buffer>(1, pmeAtomRange->getDeviceBuffer());
pmeAtomRangeKernel.setArg<cl::Buffer>(2, cl.getPosq().getDeviceBuffer());
if (!deviceIsCpu) {
pmeZIndexKernel.setArg<cl::Buffer>(0, pmeAtomGridIndex->getDeviceBuffer());
pmeZIndexKernel.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer());
}
pmeZIndexKernel.setArg<cl::Buffer>(0, pmeAtomGridIndex->getDeviceBuffer());
pmeZIndexKernel.setArg<cl::Buffer>(1, cl.getPosq().getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(1, pmeAtomGridIndex->getDeviceBuffer());
pmeSpreadChargeKernel.setArg<cl::Buffer>(2, pmeAtomRange->getDeviceBuffer());
......@@ -1641,16 +1633,10 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeConvolutionKernel.setArg<cl::Buffer>(2, pmeBsplineModuliX->getDeviceBuffer());
pmeConvolutionKernel.setArg<cl::Buffer>(3, pmeBsplineModuliY->getDeviceBuffer());
pmeConvolutionKernel.setArg<cl::Buffer>(4, pmeBsplineModuliZ->getDeviceBuffer());
interpolateForceThreads = (cl.getDevice().getInfo<CL_DEVICE_LOCAL_MEM_SIZE>() > 2*128*PmeOrder*elementSize ? 128 : 64);
pmeInterpolateForceKernel.setArg<cl::Buffer>(0, cl.getPosq().getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(1, cl.getForceBuffers().getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(2, pmeGrid->getDeviceBuffer());
if (deviceIsCpu) {
pmeInterpolateForceKernel.setArg<cl::Buffer>(5, pmeBsplineTheta->getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(6, pmeBsplineDTheta->getDeviceBuffer());
}
else
pmeInterpolateForceKernel.setArg(5, 2*interpolateForceThreads*PmeOrder*elementSize, NULL);
pmeInterpolateForceKernel.setArg<cl::Buffer>(5, pmeAtomGridIndex->getDeviceBuffer());
if (cl.getSupports64BitGlobalAtomics()) {
pmeFinishSpreadChargeKernel = cl::Kernel(program, "finishSpreadCharge");
pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(0, pmeGrid->getDeviceBuffer());
......@@ -1687,16 +1673,16 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
}
else {
sort->sort(*pmeAtomGridIndex);
setPeriodicBoxSizeArg(cl, pmeAtomRangeKernel, 3);
setInvPeriodicBoxSizeArg(cl, pmeAtomRangeKernel, 4);
cl.executeKernel(pmeAtomRangeKernel, cl.getNumAtoms());
if (cl.getSupports64BitGlobalAtomics()) {
setPeriodicBoxSizeArg(cl, pmeSpreadChargeKernel, 5);
setInvPeriodicBoxSizeArg(cl, pmeSpreadChargeKernel, 6);
cl.executeKernel(pmeSpreadChargeKernel, cl.getNumAtoms(), PmeOrder*PmeOrder*PmeOrder);
cl.executeKernel(pmeSpreadChargeKernel, cl.getNumAtoms());
cl.executeKernel(pmeFinishSpreadChargeKernel, pmeGrid->getSize());
}
else {
setPeriodicBoxSizeArg(cl, pmeAtomRangeKernel, 3);
setInvPeriodicBoxSizeArg(cl, pmeAtomRangeKernel, 4);
cl.executeKernel(pmeAtomRangeKernel, cl.getNumAtoms());
setPeriodicBoxSizeArg(cl, pmeZIndexKernel, 2);
setInvPeriodicBoxSizeArg(cl, pmeZIndexKernel, 3);
cl.executeKernel(pmeZIndexKernel, cl.getNumAtoms());
......@@ -1715,7 +1701,10 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
fft->execFFT(*pmeGrid2, *pmeGrid, false);
setPeriodicBoxSizeArg(cl, pmeInterpolateForceKernel, 3);
setInvPeriodicBoxSizeArg(cl, pmeInterpolateForceKernel, 4);
cl.executeKernel(pmeInterpolateForceKernel, cl.getNumAtoms(), interpolateForceThreads);
if (deviceIsCpu)
cl.executeKernel(pmeInterpolateForceKernel, 2*cl.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(), 1);
else
cl.executeKernel(pmeInterpolateForceKernel, cl.getNumAtoms());
}
double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0);
if (dispersionCoefficient != 0.0 && includeDirect) {
......@@ -2078,8 +2067,6 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
hasCreatedKernels = true;
maxTiles = (nb.getUseCutoff() ? nb.getInteractingTiles().getSize() : 0);
map<string, string> defines;
if (nb.getForceBufferPerAtomBlock())
defines["USE_OUTPUT_BUFFER_PER_BLOCK"] = "1";
if (nb.getUseCutoff())
defines["USE_CUTOFF"] = "1";
if (nb.getUsePeriodic())
......@@ -2090,18 +2077,24 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
defines["PADDED_NUM_ATOMS"] = cl.intToString(cl.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = cl.intToString(cl.getNumAtomBlocks());
defines["FORCE_WORK_GROUP_SIZE"] = cl.intToString(nb.getForceThreadBlockSize());
defines["TILE_SIZE"] = cl.intToString(OpenCLContext::TileSize);
int numExclusionTiles = nb.getExclusionTiles().getSize();
defines["NUM_TILES_WITH_EXCLUSIONS"] = cl.intToString(numExclusionTiles);
int numContexts = cl.getPlatformData().contexts.size();
int startExclusionIndex = cl.getContextIndex()*numExclusionTiles/numContexts;
int endExclusionIndex = (cl.getContextIndex()+1)*numExclusionTiles/numContexts;
defines["FIRST_EXCLUSION_TILE"] = cl.intToString(startExclusionIndex);
defines["LAST_EXCLUSION_TILE"] = cl.intToString(endExclusionIndex);
string platformVendor = cl::Platform(cl.getDevice().getInfo<CL_DEVICE_PLATFORM>()).getInfo<CL_PLATFORM_VENDOR>();
if (platformVendor == "Apple")
defines["USE_APPLE_WORKAROUND"] = "1";
string file;
if (deviceIsCpu)
file = OpenCLKernelSources::gbsaObc_cpu;
else if (cl.getSIMDWidth() == 32)
file = OpenCLKernelSources::gbsaObc_nvidia;
else
file = OpenCLKernelSources::gbsaObc_default;
file = OpenCLKernelSources::gbsaObc;
cl::Program program = cl.createProgram(file, defines);
bool useLong = (cl.getSupports64BitGlobalAtomics() && !deviceIsCpu);
bool useLong = cl.getSupports64BitGlobalAtomics();
int index = 0;
computeBornSumKernel = cl::Kernel(program, "computeBornSum");
computeBornSumKernel.setArg<cl::Buffer>(index++, (useLong ? longBornSum->getDeviceBuffer() : bornSum->getDeviceBuffer()));
......@@ -2112,15 +2105,12 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
index += 2; // The periodic box size arguments are set when the kernel is executed.
computeBornSumKernel.setArg<cl_uint>(index++, maxTiles);
if (cl.getSIMDWidth() == 32 || deviceIsCpu)
computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractionFlags().getDeviceBuffer());
computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getBlockCenters().getDeviceBuffer());
computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractingAtoms().getDeviceBuffer());
}
else
computeBornSumKernel.setArg<cl_uint>(index++, cl.getNumAtomBlocks()*(cl.getNumAtomBlocks()+1)/2);
if (cl.getSIMDWidth() == 32) {
computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getExclusionIndices().getDeviceBuffer());
computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getExclusionRowIndices().getDeviceBuffer());
}
computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getExclusionTiles().getDeviceBuffer());
force1Kernel = cl::Kernel(program, "computeGBSAForce1");
index = 0;
force1Kernel.setArg<cl::Buffer>(index++, (useLong ? cl.getLongForceBuffer().getDeviceBuffer() : cl.getForceBuffers().getDeviceBuffer()));
......@@ -2133,15 +2123,12 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
index += 2; // The periodic box size arguments are set when the kernel is executed.
force1Kernel.setArg<cl_uint>(index++, maxTiles);
if (cl.getSIMDWidth() == 32 || deviceIsCpu)
force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractionFlags().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(index++, nb.getBlockCenters().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractingAtoms().getDeviceBuffer());
}
else
force1Kernel.setArg<cl_uint>(index++, cl.getNumAtomBlocks()*(cl.getNumAtomBlocks()+1)/2);
if (cl.getSIMDWidth() == 32) {
force1Kernel.setArg<cl::Buffer>(index++, nb.getExclusionIndices().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(index++, nb.getExclusionRowIndices().getDeviceBuffer());
}
force1Kernel.setArg<cl::Buffer>(index++, nb.getExclusionTiles().getDeviceBuffer());
program = cl.createProgram(OpenCLKernelSources::gbsaObcReductions, defines);
reduceBornSumKernel = cl::Kernel(program, "reduceBornSum");
reduceBornSumKernel.setArg<cl_int>(0, cl.getPaddedNumAtoms());
......@@ -2174,12 +2161,10 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
maxTiles = nb.getInteractingTiles().getSize();
computeBornSumKernel.setArg<cl::Buffer>(3, nb.getInteractingTiles().getDeviceBuffer());
computeBornSumKernel.setArg<cl_uint>(7, maxTiles);
computeBornSumKernel.setArg<cl::Buffer>(9, nb.getInteractingAtoms().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(5, nb.getInteractingTiles().getDeviceBuffer());
force1Kernel.setArg<cl_uint>(9, maxTiles);
if (cl.getSIMDWidth() == 32 || deviceIsCpu) {
computeBornSumKernel.setArg<cl::Buffer>(8, nb.getInteractionFlags().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(10, nb.getInteractionFlags().getDeviceBuffer());
}
force1Kernel.setArg<cl::Buffer>(11, nb.getInteractingAtoms().getDeviceBuffer());
}
}
cl.executeKernel(computeBornSumKernel, nb.getNumForceThreadBlocks()*nb.getForceThreadBlockSize(), nb.getForceThreadBlockSize());
......@@ -2301,16 +2286,17 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
// Record parameters and exclusions.
int numParticles = force.getNumParticles();
params = new OpenCLParameterSet(cl, force.getNumPerParticleParameters(), numParticles, "customGBParameters", true);
computedValues = new OpenCLParameterSet(cl, force.getNumComputedValues(), numParticles, "customGBComputedValues", true, cl.getUseDoublePrecision());
int paddedNumParticles = cl.getPaddedNumAtoms();
int numParams = force.getNumPerParticleParameters();
params = new OpenCLParameterSet(cl, force.getNumPerParticleParameters(), paddedNumParticles, "customGBParameters", true);
computedValues = new OpenCLParameterSet(cl, force.getNumComputedValues(), paddedNumParticles, "customGBComputedValues", true, cl.getUseDoublePrecision());
if (force.getNumGlobalParameters() > 0)
globals = OpenCLArray::create<cl_float>(cl, force.getNumGlobalParameters(), "customGBGlobals", CL_MEM_READ_ONLY);
vector<vector<cl_float> > paramVector(numParticles);
vector<vector<cl_float> > paramVector(paddedNumParticles, vector<cl_float>(numParams, 0));
vector<vector<int> > exclusionList(numParticles);
for (int i = 0; i < numParticles; i++) {
vector<double> parameters;
force.getParticleParameters(i, parameters);
paramVector[i].resize(parameters.size());
for (int j = 0; j < (int) parameters.size(); j++)
paramVector[i][j] = (cl_float) parameters[j];
exclusionList[i].push_back(i);
......@@ -2402,7 +2388,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
}
}
bool deviceIsCpu = (cl.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
bool useLong = (cl.getSupports64BitGlobalAtomics() && !deviceIsCpu);
bool useLong = cl.getSupports64BitGlobalAtomics();
if (useLong) {
longEnergyDerivs = OpenCLArray::create<cl_long>(cl, force.getNumComputedValues()*cl.getPaddedNumAtoms(), "customGBLongEnergyDerivatives");
energyDerivs = new OpenCLParameterSet(cl, force.getNumComputedValues(), cl.getPaddedNumAtoms(), "customGBEnergyDerivatives", true);
......@@ -2465,30 +2451,24 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
replacements["LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"] = loadLocal2.str();
replacements["LOAD_ATOM1_PARAMETERS"] = load1.str();
replacements["LOAD_ATOM2_PARAMETERS"] = load2.str();
map<string, string> defines;
if (cl.getNonbondedUtilities().getForceBufferPerAtomBlock())
defines["USE_OUTPUT_BUFFER_PER_BLOCK"] = "1";
if (useCutoff)
defines["USE_CUTOFF"] = "1";
pairValueDefines["USE_CUTOFF"] = "1";
if (usePeriodic)
defines["USE_PERIODIC"] = "1";
pairValueDefines["USE_PERIODIC"] = "1";
if (useExclusionsForValue)
defines["USE_EXCLUSIONS"] = "1";
if (cl.getSIMDWidth() == 32)
defines["WARPS_PER_GROUP"] = cl.intToString(cl.getNonbondedUtilities().getForceThreadBlockSize()/OpenCLContext::TileSize);
defines["CUTOFF_SQUARED"] = cl.doubleToString(force.getCutoffDistance()*force.getCutoffDistance());
defines["NUM_ATOMS"] = cl.intToString(cl.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = cl.intToString(cl.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = cl.intToString(cl.getNumAtomBlocks());
pairValueDefines["USE_EXCLUSIONS"] = "1";
pairValueDefines["FORCE_WORK_GROUP_SIZE"] = cl.intToString(cl.getNonbondedUtilities().getForceThreadBlockSize());
pairValueDefines["CUTOFF_SQUARED"] = cl.doubleToString(force.getCutoffDistance()*force.getCutoffDistance());
pairValueDefines["NUM_ATOMS"] = cl.intToString(cl.getNumAtoms());
pairValueDefines["PADDED_NUM_ATOMS"] = cl.intToString(cl.getPaddedNumAtoms());
pairValueDefines["NUM_BLOCKS"] = cl.intToString(cl.getNumAtomBlocks());
pairValueDefines["TILE_SIZE"] = cl.intToString(OpenCLContext::TileSize);
string file;
if (deviceIsCpu)
file = OpenCLKernelSources::customGBValueN2_cpu;
else if (cl.getSIMDWidth() == 32)
file = OpenCLKernelSources::customGBValueN2_nvidia;
else
file = OpenCLKernelSources::customGBValueN2_default;
cl::Program program = cl.createProgram(cl.replaceStrings(file, replacements), defines);
pairValueKernel = cl::Kernel(program, "computeN2Value");
file = OpenCLKernelSources::customGBValueN2;
pairValueSrc = cl.replaceStrings(file, replacements);
if (useExclusionsForValue)
cl.getNonbondedUtilities().requestExclusions(exclusionList);
}
......@@ -2664,30 +2644,24 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
replacements["STORE_DERIVATIVES_2"] = storeDerivs2.str();
replacements["DECLARE_TEMP_BUFFERS"] = declareTemps.str();
replacements["SET_TEMP_BUFFERS"] = setTemps.str();
map<string, string> defines;
if (cl.getNonbondedUtilities().getForceBufferPerAtomBlock())
defines["USE_OUTPUT_BUFFER_PER_BLOCK"] = "1";
if (useCutoff)
defines["USE_CUTOFF"] = "1";
pairEnergyDefines["USE_CUTOFF"] = "1";
if (usePeriodic)
defines["USE_PERIODIC"] = "1";
pairEnergyDefines["USE_PERIODIC"] = "1";
if (anyExclusions)
defines["USE_EXCLUSIONS"] = "1";
if (cl.getSIMDWidth() == 32)
defines["WARPS_PER_GROUP"] = cl.intToString(cl.getNonbondedUtilities().getForceThreadBlockSize()/OpenCLContext::TileSize);
defines["CUTOFF_SQUARED"] = cl.doubleToString(force.getCutoffDistance()*force.getCutoffDistance());
defines["NUM_ATOMS"] = cl.intToString(cl.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = cl.intToString(cl.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = cl.intToString(cl.getNumAtomBlocks());
pairEnergyDefines["USE_EXCLUSIONS"] = "1";
pairEnergyDefines["FORCE_WORK_GROUP_SIZE"] = cl.intToString(cl.getNonbondedUtilities().getForceThreadBlockSize());
pairEnergyDefines["CUTOFF_SQUARED"] = cl.doubleToString(force.getCutoffDistance()*force.getCutoffDistance());
pairEnergyDefines["NUM_ATOMS"] = cl.intToString(cl.getNumAtoms());
pairEnergyDefines["PADDED_NUM_ATOMS"] = cl.intToString(cl.getPaddedNumAtoms());
pairEnergyDefines["NUM_BLOCKS"] = cl.intToString(cl.getNumAtomBlocks());
pairEnergyDefines["TILE_SIZE"] = cl.intToString(OpenCLContext::TileSize);
string file;
if (deviceIsCpu)
file = OpenCLKernelSources::customGBEnergyN2_cpu;
else if (cl.getSIMDWidth() == 32)
file = OpenCLKernelSources::customGBEnergyN2_nvidia;
else
file = OpenCLKernelSources::customGBEnergyN2_default;
cl::Program program = cl.createProgram(cl.replaceStrings(file, replacements), defines);
pairEnergyKernel = cl::Kernel(program, "computeN2Energy");
file = OpenCLKernelSources::customGBEnergyN2;
pairEnergySrc = cl.replaceStrings(file, replacements);
}
{
// Create the kernel to reduce the derivatives and calculate per-particle energy terms.
......@@ -2943,8 +2917,41 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
int elementSize = (cl.getUseDoublePrecision() ? sizeof(cl_double) : sizeof(cl_float));
if (!hasInitializedKernels) {
hasInitializedKernels = true;
// These two kernels can't be compiled in initialize(), because the nonbonded utilities object
// has not yet been initialized then.
{
int numExclusionTiles = nb.getExclusionTiles().getSize();
pairValueDefines["NUM_TILES_WITH_EXCLUSIONS"] = cl.intToString(numExclusionTiles);
int numContexts = cl.getPlatformData().contexts.size();
int startExclusionIndex = cl.getContextIndex()*numExclusionTiles/numContexts;
int endExclusionIndex = (cl.getContextIndex()+1)*numExclusionTiles/numContexts;
pairValueDefines["FIRST_EXCLUSION_TILE"] = cl.intToString(startExclusionIndex);
pairValueDefines["LAST_EXCLUSION_TILE"] = cl.intToString(endExclusionIndex);
cl::Program program = cl.createProgram(pairValueSrc, pairValueDefines);
pairValueKernel = cl::Kernel(program, "computeN2Value");
pairValueSrc = "";
pairValueDefines.clear();
}
{
int numExclusionTiles = nb.getExclusionTiles().getSize();
pairEnergyDefines["NUM_TILES_WITH_EXCLUSIONS"] = cl.intToString(numExclusionTiles);
int numContexts = cl.getPlatformData().contexts.size();
int startExclusionIndex = cl.getContextIndex()*numExclusionTiles/numContexts;
int endExclusionIndex = (cl.getContextIndex()+1)*numExclusionTiles/numContexts;
pairEnergyDefines["FIRST_EXCLUSION_TILE"] = cl.intToString(startExclusionIndex);
pairEnergyDefines["LAST_EXCLUSION_TILE"] = cl.intToString(endExclusionIndex);
cl::Program program = cl.createProgram(pairEnergySrc, pairEnergyDefines);
pairEnergyKernel = cl::Kernel(program, "computeN2Energy");
pairEnergySrc = "";
pairEnergyDefines.clear();
}
// Set arguments for kernels.
maxTiles = (nb.getUseCutoff() ? nb.getInteractingTiles().getSize() : 0);
bool useLong = (cl.getSupports64BitGlobalAtomics() && !deviceIsCpu);
bool useLong = cl.getSupports64BitGlobalAtomics();
if (useLong) {
longValueBuffers = OpenCLArray::create<cl_long>(cl, cl.getPaddedNumAtoms(), "customGBLongValueBuffers");
cl.addAutoclearBuffer(*longValueBuffers);
......@@ -2959,20 +2966,16 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
pairValueKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
pairValueKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*4*elementSize, NULL);
pairValueKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusions().getDeviceBuffer());
pairValueKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionIndices().getDeviceBuffer());
pairValueKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionRowIndices().getDeviceBuffer());
pairValueKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionTiles().getDeviceBuffer());
pairValueKernel.setArg<cl::Buffer>(index++, useLong ? longValueBuffers->getDeviceBuffer() : valueBuffers->getDeviceBuffer());
pairValueKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*elementSize, NULL);
/// \todo Eliminate this argument and make local to the kernel. For *_default.cl kernel can actually make it TileSize rather than getForceThreadBlockSize as only half the workgroup stores to it as was done with nonbonded_default.cl.
/// \todo Also make the previous __local argument local as was done with nonbonded_default.cl.
pairValueKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*elementSize, NULL);
if (nb.getUseCutoff()) {
pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
index += 2; // Periodic box size arguments are set when the kernel is executed.
pairValueKernel.setArg<cl_uint>(index++, maxTiles);
if (cl.getSIMDWidth() == 32 || deviceIsCpu)
pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractionFlags().getDeviceBuffer());
pairValueKernel.setArg<cl::Buffer>(index++, nb.getBlockCenters().getDeviceBuffer());
pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractingAtoms().getDeviceBuffer());
}
else
pairValueKernel.setArg<cl_uint>(index++, cl.getNumAtomBlocks()*(cl.getNumAtomBlocks()+1)/2);
......@@ -3013,18 +3016,14 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
pairEnergyKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : nb.getForceThreadBlockSize())*4*elementSize, NULL);
pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusions().getDeviceBuffer());
pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionIndices().getDeviceBuffer());
pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionRowIndices().getDeviceBuffer());
/// \todo Eliminate this argument and make local to the kernel. For *_default.cl kernel can actually make it TileSize rather than getForceThreadBlockSize as only half the workgroup stores to it as was done with nonbonded_default.cl.
/// \todo Also make the previous __local argument local as was done with nonbonded_default.cl.
pairEnergyKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : (cl.getSIMDWidth() == 32 ? 1 : nb.getForceThreadBlockSize()))*4*elementSize, NULL);
pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionTiles().getDeviceBuffer());
if (nb.getUseCutoff()) {
pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
index += 2; // Periodic box size arguments are set when the kernel is executed.
pairEnergyKernel.setArg<cl_uint>(index++, maxTiles);
if (cl.getSIMDWidth() == 32 || deviceIsCpu)
pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractionFlags().getDeviceBuffer());
pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getBlockCenters().getDeviceBuffer());
pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractingAtoms().getDeviceBuffer());
}
else
pairEnergyKernel.setArg<cl_uint>(index++, cl.getNumAtomBlocks()*(cl.getNumAtomBlocks()+1)/2);
......@@ -3108,20 +3107,18 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
globals->upload(globalParamValues);
}
if (nb.getUseCutoff()) {
setPeriodicBoxSizeArg(cl, pairValueKernel, 10);
setInvPeriodicBoxSizeArg(cl, pairValueKernel, 11);
setPeriodicBoxSizeArg(cl, pairEnergyKernel, 11);
setInvPeriodicBoxSizeArg(cl, pairEnergyKernel, 12);
setPeriodicBoxSizeArg(cl, pairValueKernel, 8);
setInvPeriodicBoxSizeArg(cl, pairValueKernel, 9);
setPeriodicBoxSizeArg(cl, pairEnergyKernel, 9);
setInvPeriodicBoxSizeArg(cl, pairEnergyKernel, 10);
if (maxTiles < nb.getInteractingTiles().getSize()) {
maxTiles = nb.getInteractingTiles().getSize();
pairValueKernel.setArg<cl::Buffer>(8, nb.getInteractingTiles().getDeviceBuffer());
pairValueKernel.setArg<cl_uint>(12, maxTiles);
pairEnergyKernel.setArg<cl::Buffer>(9, nb.getInteractingTiles().getDeviceBuffer());
pairEnergyKernel.setArg<cl_uint>(13, maxTiles);
if (cl.getSIMDWidth() == 32 || deviceIsCpu) {
pairValueKernel.setArg<cl::Buffer>(13, nb.getInteractionFlags().getDeviceBuffer());
pairEnergyKernel.setArg<cl::Buffer>(14, nb.getInteractionFlags().getDeviceBuffer());
}
pairValueKernel.setArg<cl::Buffer>(6, nb.getInteractingTiles().getDeviceBuffer());
pairValueKernel.setArg<cl_uint>(11, maxTiles);
pairValueKernel.setArg<cl::Buffer>(12, nb.getInteractingAtoms().getDeviceBuffer());
pairEnergyKernel.setArg<cl::Buffer>(7, nb.getInteractingTiles().getDeviceBuffer());
pairEnergyKernel.setArg<cl_uint>(12, maxTiles);
pairEnergyKernel.setArg<cl::Buffer>(13, nb.getInteractingAtoms().getDeviceBuffer());
}
}
cl.executeKernel(pairValueKernel, nb.getNumForceThreadBlocks()*nb.getForceThreadBlockSize(), nb.getForceThreadBlockSize());
......@@ -3140,11 +3137,10 @@ void OpenCLCalcCustomGBForceKernel::copyParametersToContext(ContextImpl& context
// Record the per-particle parameters.
vector<vector<cl_float> > paramVector(numParticles);
vector<vector<cl_float> > paramVector(cl.getPaddedNumAtoms(), vector<cl_float>(force.getNumPerParticleParameters(), 0));
vector<double> parameters;
for (int i = 0; i < numParticles; i++) {
force.getParticleParameters(i, parameters);
paramVector[i].resize(parameters.size());
for (int j = 0; j < (int) parameters.size(); j++)
paramVector[i][j] = (cl_float) parameters[j];
}
......@@ -4573,7 +4569,7 @@ double OpenCLIntegrateVariableLangevinStepKernel::execute(ContextImpl& context,
selectSizeKernel.setArg<cl::Buffer>(5, cl.getVelm().getDeviceBuffer());
selectSizeKernel.setArg<cl::Buffer>(6, cl.getForce().getDeviceBuffer());
selectSizeKernel.setArg<cl::Buffer>(7, params->getDeviceBuffer());
int elementSize = (useDouble ? sizeof(cl_double) : sizeof(cl_float));
int elementSize = (useDouble ? sizeof(cl_double) : sizeof(cl_float));
selectSizeKernel.setArg(8, params->getSize()*elementSize, NULL);
selectSizeKernel.setArg(9, blockSize*elementSize, NULL);
}
......
......@@ -556,7 +556,7 @@ class OpenCLCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
public:
OpenCLCalcNonbondedForceKernel(std::string name, const Platform& platform, OpenCLContext& cl, System& system) : CalcNonbondedForceKernel(name, platform),
hasInitializedKernel(false), cl(cl), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), pmeGrid(NULL),
pmeGrid2(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeBsplineTheta(NULL), pmeBsplineDTheta(NULL),
pmeGrid2(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeBsplineTheta(NULL),
pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL), fft(NULL) {
}
~OpenCLCalcNonbondedForceKernel();
......@@ -586,15 +586,15 @@ public:
*/
void copyParametersToContext(ContextImpl& context, const NonbondedForce& force);
private:
struct SortTrait {
typedef mm_int2 DataType;
typedef cl_int KeyType;
static const char* clDataType() {return "int2";}
static const char* clKeyType() {return "int";}
static const char* clMinKey() {return "INT_MIN";}
static const char* clMaxKey() {return "INT_MAX";}
static const char* clMaxValue() {return "(int2) (INT_MAX, INT_MAX)";}
static const char* clSortKey() {return "value.y";}
class SortTrait : public OpenCLSort::SortTrait {
int getDataSize() const {return 8;}
int getKeySize() const {return 4;}
const char* getDataType() const {return "int2";}
const char* getKeyType() const {return "int";}
const char* getMinKey() const {return "INT_MIN";}
const char* getMaxKey() const {return "INT_MAX";}
const char* getMaxValue() const {return "(int2) (INT_MAX, INT_MAX)";}
const char* getSortKey() const {return "value.y";}
};
OpenCLContext& cl;
bool hasInitializedKernel;
......@@ -607,10 +607,9 @@ private:
OpenCLArray* pmeBsplineModuliY;
OpenCLArray* pmeBsplineModuliZ;
OpenCLArray* pmeBsplineTheta;
OpenCLArray* pmeBsplineDTheta;
OpenCLArray* pmeAtomRange;
OpenCLArray* pmeAtomGridIndex;
OpenCLSort<SortTrait>* sort;
OpenCLSort* sort;
OpenCLFFT3D* fft;
cl::Kernel ewaldSumsKernel;
cl::Kernel ewaldForcesKernel;
......@@ -625,7 +624,6 @@ private:
std::map<std::string, std::string> pmeDefines;
std::vector<std::pair<int, int> > exceptionAtoms;
double ewaldSelfEnergy, dispersionCoefficient, alpha;
int interpolateForceThreads;
bool hasCoulomb, hasLJ;
static const int PmeOrder = 5;
};
......@@ -775,6 +773,8 @@ private:
std::vector<bool> pairValueUsesParam, pairEnergyUsesParam, pairEnergyUsesValue;
System& system;
cl::Kernel pairValueKernel, perParticleValueKernel, pairEnergyKernel, perParticleEnergyKernel, gradientChainRuleKernel;
std::string pairValueSrc, pairEnergySrc;
std::map<std::string, std::string> pairValueDefines, pairEnergyDefines;
};
/**
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2012 Stanford University and the Authors. *
* Portions copyright (c) 2009-2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -29,6 +29,8 @@
#include "OpenCLArray.h"
#include "OpenCLKernelSources.h"
#include "OpenCLExpressionUtilities.h"
#include "OpenCLSort.h"
#include <algorithm>
#include <map>
#include <set>
#include <utility>
......@@ -36,13 +38,29 @@
using namespace OpenMM;
using namespace std;
OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : context(context), cutoff(-1.0), useCutoff(false), anyExclusions(false),
numForceBuffers(0), exclusionIndices(NULL), exclusionRowIndices(NULL), exclusions(NULL), interactingTiles(NULL), interactionFlags(NULL),
interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), nonbondedForceGroup(0) {
class OpenCLNonbondedUtilities::BlockSortTrait : public OpenCLSort::SortTrait {
public:
BlockSortTrait(bool useDouble) : useDouble(useDouble) {
}
int getDataSize() const {return useDouble ? sizeof(mm_double2) : sizeof(mm_float2);}
int getKeySize() const {return useDouble ? sizeof(cl_double) : sizeof(cl_float);}
const char* getDataType() const {return "real2";}
const char* getKeyType() const {return "real";}
const char* getMinKey() const {return "-MAXFLOAT";}
const char* getMaxKey() const {return "MAXFLOAT";}
const char* getMaxValue() const {return "(real2) (MAXFLOAT, MAXFLOAT)";}
const char* getSortKey() const {return "value.x";}
private:
bool useDouble;
};
OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : context(context), cutoff(-1.0), useCutoff(false), anyExclusions(false), usePadding(true),
numForceBuffers(0), exclusionIndices(NULL), exclusionRowIndices(NULL), exclusionTiles(NULL), exclusions(NULL), interactingTiles(NULL), interactingAtoms(NULL),
interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), sortedBlocks(NULL), sortedBlockCenter(NULL), sortedBlockBoundingBox(NULL),
oldPositions(NULL), rebuildNeighborList(NULL), blockSorter(NULL), nonbondedForceGroup(0) {
// Decide how many thread blocks and force buffers to use.
deviceIsCpu = (context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
forceBufferPerAtomBlock = false;
if (deviceIsCpu) {
numForceThreadBlocks = context.getNumThreadBlocks();
forceThreadBlockSize = 1;
......@@ -50,15 +68,15 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
}
else if (context.getSIMDWidth() == 32) {
if (context.getSupports64BitGlobalAtomics()) {
numForceThreadBlocks = 2*context.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
numForceThreadBlocks = 4*context.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
forceThreadBlockSize = 256;
// Even though using longForceBuffer, still need a single forceBuffer for the reduceForces kernel to convert the long results into float4 which will be used by later kernels.
numForceBuffers = 1;
}
else {
numForceThreadBlocks = 4*context.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
forceThreadBlockSize = 128;
numForceBuffers = numForceThreadBlocks;
numForceThreadBlocks = 3*context.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
forceThreadBlockSize = 256;
numForceBuffers = numForceThreadBlocks*forceThreadBlockSize/OpenCLContext::TileSize;
}
}
else {
......@@ -69,13 +87,7 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
numForceBuffers = 1;
}
else {
numForceBuffers = numForceThreadBlocks;
if (numForceBuffers >= context.getNumAtomBlocks()) {
// For small systems, it is more efficient to have one force buffer per block of 32 atoms instead of one per warp.
forceBufferPerAtomBlock = true;
numForceBuffers = context.getNumAtomBlocks();
}
numForceBuffers = numForceThreadBlocks*forceThreadBlockSize/OpenCLContext::TileSize;
}
}
}
......@@ -85,18 +97,32 @@ OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() {
delete exclusionIndices;
if (exclusionRowIndices != NULL)
delete exclusionRowIndices;
if (exclusionTiles != NULL)
delete exclusionTiles;
if (exclusions != NULL)
delete exclusions;
if (interactingTiles != NULL)
delete interactingTiles;
if (interactionFlags != NULL)
delete interactionFlags;
if (interactingAtoms != NULL)
delete interactingAtoms;
if (interactionCount != NULL)
delete interactionCount;
if (blockCenter != NULL)
delete blockCenter;
if (blockBoundingBox != NULL)
delete blockBoundingBox;
if (sortedBlocks != NULL)
delete sortedBlocks;
if (sortedBlockCenter != NULL)
delete sortedBlockCenter;
if (sortedBlockBoundingBox != NULL)
delete sortedBlockBoundingBox;
if (oldPositions != NULL)
delete oldPositions;
if (rebuildNeighborList != NULL)
delete rebuildNeighborList;
if (blockSorter != NULL)
delete blockSorter;
}
void OpenCLNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup) {
......@@ -149,6 +175,10 @@ void OpenCLNonbondedUtilities::requestExclusions(const vector<vector<int> >& exc
}
}
static bool compareUshort2(mm_ushort2 a, mm_ushort2 b) {
return ((a.y < b.y) || (a.y == b.y && a.x < b.x));
}
void OpenCLNonbondedUtilities::initialize(const System& system) {
if (atomExclusions.size() == 0) {
// No exclusions were specifically requested, so just mark every atom as not interacting with itself.
......@@ -161,14 +191,11 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
// Create the list of tiles.
int numAtomBlocks = context.getNumAtomBlocks();
int totalTiles = numAtomBlocks*(numAtomBlocks+1)/2;
int numContexts = context.getPlatformData().contexts.size();
startTileIndex = context.getContextIndex()*totalTiles/numContexts;
int endTileIndex = (context.getContextIndex()+1)*totalTiles/numContexts;
numTiles = endTileIndex-startTileIndex;
// Build a list of indices for the tiles with exclusions.
setAtomBlockRange(context.getContextIndex()/(double) numContexts, (context.getContextIndex()+1)/(double) numContexts);
// Build a list of tiles that contain exclusions.
set<pair<int, int> > tilesWithExclusions;
for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) {
int x = atom1/OpenCLContext::TileSize;
......@@ -178,19 +205,29 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
tilesWithExclusions.insert(make_pair(max(x, y), min(x, y)));
}
}
if (context.getPaddedNumAtoms() > context.getNumAtoms()) {
for (int i = 0; i < numAtomBlocks; ++i)
tilesWithExclusions.insert(make_pair(numAtomBlocks-1, i));
vector<mm_ushort2> exclusionTilesVec;
for (set<pair<int, int> >::const_iterator iter = tilesWithExclusions.begin(); iter != tilesWithExclusions.end(); ++iter)
exclusionTilesVec.push_back(mm_ushort2((unsigned short) iter->first, (unsigned short) iter->second));
sort(exclusionTilesVec.begin(), exclusionTilesVec.end(), compareUshort2);
exclusionTiles = OpenCLArray::create<mm_ushort2>(context, exclusionTilesVec.size(), "exclusionTiles");
exclusionTiles->upload(exclusionTilesVec);
map<pair<int, int>, int> exclusionTileMap;
for (int i = 0; i < (int) exclusionTilesVec.size(); i++) {
mm_ushort2 tile = exclusionTilesVec[i];
exclusionTileMap[make_pair(tile.x, tile.y)] = i;
}
vector<vector<int> > exclusionBlocksForBlock(numAtomBlocks);
for (set<pair<int, int> >::const_iterator iter = tilesWithExclusions.begin(); iter != tilesWithExclusions.end(); ++iter) {
exclusionBlocksForBlock[iter->first].push_back(iter->second);
if (iter->first != iter->second)
exclusionBlocksForBlock[iter->second].push_back(iter->first);
}
vector<cl_uint> exclusionRowIndicesVec(numAtomBlocks+1, 0);
vector<cl_uint> exclusionIndicesVec;
int currentRow = 0;
for (set<pair<int, int> >::const_iterator iter = tilesWithExclusions.begin(); iter != tilesWithExclusions.end(); ++iter) {
while (iter->first != currentRow)
exclusionRowIndicesVec[++currentRow] = exclusionIndicesVec.size();
exclusionIndicesVec.push_back(iter->second);
for (int i = 0; i < numAtomBlocks; i++) {
exclusionIndicesVec.insert(exclusionIndicesVec.end(), exclusionBlocksForBlock[i].begin(), exclusionBlocksForBlock[i].end());
exclusionRowIndicesVec[i+1] = exclusionIndicesVec.size();
}
exclusionRowIndicesVec[++currentRow] = exclusionIndicesVec.size();
exclusionIndices = OpenCLArray::create<cl_uint>(context, exclusionIndicesVec.size(), "exclusionIndices");
exclusionRowIndices = OpenCLArray::create<cl_uint>(context, exclusionRowIndicesVec.size(), "exclusionRowIndices");
exclusionIndices->upload(exclusionIndicesVec);
......@@ -199,7 +236,8 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
// Record the exclusion data.
exclusions = OpenCLArray::create<cl_uint>(context, tilesWithExclusions.size()*OpenCLContext::TileSize, "exclusions");
vector<cl_uint> exclusionVec(exclusions->getSize());
cl_uint allFlags = (cl_uint) -1;
vector<cl_uint> exclusionVec(exclusions->getSize(), allFlags);
for (int i = 0; i < exclusions->getSize(); ++i)
exclusionVec[i] = 0xFFFFFFFF;
for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) {
......@@ -210,31 +248,12 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
int y = atom2/OpenCLContext::TileSize;
int offset2 = atom2-y*OpenCLContext::TileSize;
if (x > y) {
int index = findExclusionIndex(x, y, exclusionIndicesVec, exclusionRowIndicesVec);
exclusionVec[index+offset1] &= 0xFFFFFFFF-(1<<offset2);
int index = exclusionTileMap[make_pair(x, y)]*OpenCLContext::TileSize;
exclusionVec[index+offset1] &= allFlags-(1<<offset2);
}
else {
int index = findExclusionIndex(y, x, exclusionIndicesVec, exclusionRowIndicesVec);
exclusionVec[index+offset2] &= 0xFFFFFFFF-(1<<offset1);
}
}
}
// Mark all interactions that involve a padding atom as being excluded.
for (int atom1 = context.getNumAtoms(); atom1 < context.getPaddedNumAtoms(); ++atom1) {
int x = atom1/OpenCLContext::TileSize;
int offset1 = atom1-x*OpenCLContext::TileSize;
for (int atom2 = 0; atom2 < context.getPaddedNumAtoms(); ++atom2) {
int y = atom2/OpenCLContext::TileSize;
int offset2 = atom2-y*OpenCLContext::TileSize;
if (x >= y) {
int index = findExclusionIndex(x, y, exclusionIndicesVec, exclusionRowIndicesVec);
exclusionVec[index+offset1] &= 0xFFFFFFFF-(1<<offset2);
}
if (y >= x) {
int index = findExclusionIndex(y, x, exclusionIndicesVec, exclusionRowIndicesVec);
exclusionVec[index+offset2] &= 0xFFFFFFFF-(1<<offset1);
int index = exclusionTileMap[make_pair(y, x)]*OpenCLContext::TileSize;
exclusionVec[index+offset2] &= allFlags-(1<<offset1);
}
}
}
......@@ -244,21 +263,35 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
// Create data structures for the neighbor list.
if (useCutoff) {
// Select a size for the arrays that hold the neighbor list. This estimate is intentionally very
// high, because if it ever is too small, we have to fall back to the N^2 algorithm.
mm_float4 boxSize = context.getPeriodicBoxSize();
int maxInteractingTiles = (int) (numTiles*(cutoff/boxSize.x+cutoff/boxSize.y+cutoff/boxSize.z));
if (maxInteractingTiles > numTiles)
maxInteractingTiles = numTiles;
if (maxInteractingTiles < 1)
maxInteractingTiles = 1;
interactingTiles = OpenCLArray::create<mm_ushort2>(context, maxInteractingTiles, "interactingTiles");
interactionFlags = OpenCLArray::create<cl_uint>(context, context.getSIMDWidth() == 32 ? maxInteractingTiles : (deviceIsCpu ? 2*maxInteractingTiles : 1), "interactionFlags");
// Select a size for the arrays that hold the neighbor list. We have to make a fairly
// arbitrary guess, but if this turns out to be too small we'll increase it later.
int maxTiles = 20*numAtomBlocks;
if (maxTiles > numTiles)
maxTiles = numTiles;
if (maxTiles < 1)
maxTiles = 1;
int numAtoms = context.getNumAtoms();
interactingTiles = OpenCLArray::create<mm_ushort2>(context, maxTiles, "interactingTiles");
interactingAtoms = OpenCLArray::create<cl_int>(context, OpenCLContext::TileSize*maxTiles, "interactingAtoms");
interactionCount = OpenCLArray::create<cl_uint>(context, 1, "interactionCount");
int elementSize = (context.getUseDoublePrecision() ? sizeof(mm_double4) : sizeof(mm_float4));
blockCenter = new OpenCLArray(context, numAtomBlocks, elementSize, "blockCenter");
blockBoundingBox = new OpenCLArray(context, numAtomBlocks, elementSize, "blockBoundingBox");
int elementSize = (context.getUseDoublePrecision() ? sizeof(cl_double) : sizeof(cl_float));
blockCenter = new OpenCLArray(context, numAtomBlocks, 4*elementSize, "blockCenter");
blockBoundingBox = new OpenCLArray(context, numAtomBlocks, 4*elementSize, "blockBoundingBox");
sortedBlocks = new OpenCLArray(context, numAtomBlocks, 2*elementSize, "sortedBlocks");
sortedBlockCenter = new OpenCLArray(context, numAtomBlocks+1, 4*elementSize, "sortedBlockCenter");
sortedBlockBoundingBox = new OpenCLArray(context, numAtomBlocks+1, 4*elementSize, "sortedBlockBoundingBox");
oldPositions = new OpenCLArray(context, numAtoms, 4*elementSize, "oldPositions");
if (context.getUseDoublePrecision()) {
vector<mm_double4> oldPositionsVec(numAtoms, mm_double4(1e30, 1e30, 1e30, 0));
oldPositions->upload(oldPositionsVec);
}
else {
vector<mm_float4> oldPositionsVec(numAtoms, mm_float4(1e30f, 1e30f, 1e30f, 0));
oldPositions->upload(oldPositionsVec);
}
rebuildNeighborList = OpenCLArray::create<int>(context, 1, "rebuildNeighborList");
blockSorter = new OpenCLSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks);
vector<cl_uint> count(1, 0);
interactionCount->upload(count);
}
......@@ -268,12 +301,24 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
if (kernelSource.size() > 0)
forceKernel = createInteractionKernel(kernelSource, parameters, arguments, true, true);
if (useCutoff) {
double padding = (usePadding ? 0.1*cutoff : 0.0);
double paddedCutoff = cutoff+padding;
map<string, string> defines;
defines["TILE_SIZE"] = context.intToString(OpenCLContext::TileSize);
defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms());
defines["PADDING"] = context.doubleToString(padding);
defines["PADDED_CUTOFF"] = context.doubleToString(paddedCutoff);
defines["PADDED_CUTOFF_SQUARED"] = context.doubleToString(paddedCutoff*paddedCutoff);
defines["NUM_TILES_WITH_EXCLUSIONS"] = context.intToString(exclusionTiles->getSize());
defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
if (forceBufferPerAtomBlock)
defines["USE_OUTPUT_BUFFER_PER_BLOCK"] = "1";
if (usePeriodic)
defines["USE_PERIODIC"] = "1";
int maxExclusions = 0;
for (int i = 0; i < (int) exclusionBlocksForBlock.size(); i++)
maxExclusions = (maxExclusions > exclusionBlocksForBlock[i].size() ? maxExclusions : exclusionBlocksForBlock[i].size());
defines["MAX_EXCLUSIONS"] = context.intToString(maxExclusions);
defines["GROUP_SIZE"] = (deviceIsCpu ? "32" : "256");
defines["BUFFER_GROUPS"] = (deviceIsCpu ? "4" : "2");
string file = (deviceIsCpu ? OpenCLKernelSources::findInteractingBlocks_cpu : OpenCLKernelSources::findInteractingBlocks);
cl::Program interactingBlocksProgram = context.createProgram(file, defines);
findBlockBoundsKernel = cl::Kernel(interactingBlocksProgram, "findBlockBounds");
......@@ -281,48 +326,38 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
findBlockBoundsKernel.setArg<cl::Buffer>(3, context.getPosq().getDeviceBuffer());
findBlockBoundsKernel.setArg<cl::Buffer>(4, blockCenter->getDeviceBuffer());
findBlockBoundsKernel.setArg<cl::Buffer>(5, blockBoundingBox->getDeviceBuffer());
findBlockBoundsKernel.setArg<cl::Buffer>(6, interactionCount->getDeviceBuffer());
findBlockBoundsKernel.setArg<cl::Buffer>(6, rebuildNeighborList->getDeviceBuffer());
findBlockBoundsKernel.setArg<cl::Buffer>(7, sortedBlocks->getDeviceBuffer());
sortBoxDataKernel = cl::Kernel(interactingBlocksProgram, "sortBoxData");
sortBoxDataKernel.setArg<cl::Buffer>(0, sortedBlocks->getDeviceBuffer());
sortBoxDataKernel.setArg<cl::Buffer>(1, blockCenter->getDeviceBuffer());
sortBoxDataKernel.setArg<cl::Buffer>(2, blockBoundingBox->getDeviceBuffer());
sortBoxDataKernel.setArg<cl::Buffer>(3, sortedBlockCenter->getDeviceBuffer());
sortBoxDataKernel.setArg<cl::Buffer>(4, sortedBlockBoundingBox->getDeviceBuffer());
sortBoxDataKernel.setArg<cl::Buffer>(5, context.getPosq().getDeviceBuffer());
sortBoxDataKernel.setArg<cl::Buffer>(6, oldPositions->getDeviceBuffer());
sortBoxDataKernel.setArg<cl::Buffer>(7, interactionCount->getDeviceBuffer());
sortBoxDataKernel.setArg<cl::Buffer>(8, rebuildNeighborList->getDeviceBuffer());
findInteractingBlocksKernel = cl::Kernel(interactingBlocksProgram, "findBlocksWithInteractions");
if (context.getUseDoublePrecision())
findInteractingBlocksKernel.setArg<cl_double>(0, cutoff*cutoff);
else
findInteractingBlocksKernel.setArg<cl_float>(0, (cl_float) (cutoff*cutoff));
findInteractingBlocksKernel.setArg<cl::Buffer>(3, blockCenter->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(4, blockBoundingBox->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(5, interactionCount->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactingTiles->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(8, context.getPosq().getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl_uint>(9, interactingTiles->getSize());
findInteractingBlocksKernel.setArg<cl_uint>(10, startTileIndex);
findInteractingBlocksKernel.setArg<cl_uint>(11, startTileIndex+numTiles);
if (context.getSIMDWidth() == 32 && !deviceIsCpu) {
findInteractionsWithinBlocksKernel = cl::Kernel(interactingBlocksProgram, "findInteractionsWithinBlocks");
if (context.getUseDoublePrecision())
findInteractionsWithinBlocksKernel.setArg<cl_double>(0, cutoff*cutoff);
else
findInteractionsWithinBlocksKernel.setArg<cl_float>(0, (cl_float) (cutoff*cutoff));
findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(3, context.getPosq().getDeviceBuffer());
findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(4, interactingTiles->getDeviceBuffer());
findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(5, blockCenter->getDeviceBuffer());
findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(6, blockBoundingBox->getDeviceBuffer());
findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer());
findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(8, interactionCount->getDeviceBuffer());
findInteractionsWithinBlocksKernel.setArg(9, 128*sizeof(cl_uint), NULL);
findInteractionsWithinBlocksKernel.setArg<cl_uint>(10, interactingTiles->getSize());
}
findInteractingBlocksKernel.setArg<cl::Buffer>(2, blockCenter->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(3, blockBoundingBox->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(4, interactionCount->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(5, interactingTiles->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactingAtoms->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(7, context.getPosq().getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl_uint>(8, interactingTiles->getSize());
findInteractingBlocksKernel.setArg<cl_uint>(9, startBlockIndex);
findInteractingBlocksKernel.setArg<cl_uint>(10, numBlocks);
findInteractingBlocksKernel.setArg<cl::Buffer>(11, sortedBlocks->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(12, sortedBlockCenter->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(13, sortedBlockBoundingBox->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(14, exclusionIndices->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(15, exclusionRowIndices->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(16, oldPositions->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(17, rebuildNeighborList->getDeviceBuffer());
}
}
int OpenCLNonbondedUtilities::findExclusionIndex(int x, int y, const vector<cl_uint>& exclusionIndices, const vector<cl_uint>& exclusionRowIndices) {
int start = exclusionRowIndices[x];
int end = exclusionRowIndices[x+1];
for (int i = start; i < end; i++)
if (exclusionIndices[i] == y)
return i*OpenCLContext::TileSize;
throw OpenMMException("Internal error: exclusion in unexpected tile");
}
static void setPeriodicBoxSizeArg(OpenCLContext& cl, cl::Kernel& kernel, int index) {
if (cl.getUseDoublePrecision())
kernel.setArg<mm_double4>(index, cl.getPeriodicBoxSizeDouble());
......@@ -352,23 +387,22 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
setPeriodicBoxSizeArg(context, findBlockBoundsKernel, 1);
setInvPeriodicBoxSizeArg(context, findBlockBoundsKernel, 2);
context.executeKernel(findBlockBoundsKernel, context.getNumAtoms());
setPeriodicBoxSizeArg(context, findInteractingBlocksKernel, 1);
setInvPeriodicBoxSizeArg(context, findInteractingBlocksKernel, 2);
context.executeKernel(findInteractingBlocksKernel, context.getNumAtoms(), deviceIsCpu ? 1 : -1);
if (context.getSIMDWidth() == 32 && !deviceIsCpu) {
setPeriodicBoxSizeArg(context, findInteractionsWithinBlocksKernel, 1);
setInvPeriodicBoxSizeArg(context, findInteractionsWithinBlocksKernel, 2);
context.executeKernel(findInteractionsWithinBlocksKernel, context.getNumAtoms(), 128);
}
blockSorter->sort(*sortedBlocks);
context.executeKernel(sortBoxDataKernel, context.getNumAtoms());
setPeriodicBoxSizeArg(context, findInteractingBlocksKernel, 0);
setInvPeriodicBoxSizeArg(context, findInteractingBlocksKernel, 1);
context.executeKernel(findInteractingBlocksKernel, context.getNumAtoms(), deviceIsCpu ? 1 : 256);
}
void OpenCLNonbondedUtilities::computeInteractions() {
if (kernelSource.size() > 0) {
if (useCutoff) {
setPeriodicBoxSizeArg(context, forceKernel, 10);
setInvPeriodicBoxSizeArg(context, forceKernel, 11);
setPeriodicBoxSizeArg(context, forceKernel, 9);
setInvPeriodicBoxSizeArg(context, forceKernel, 10);
}
context.executeKernel(forceKernel, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
if (context.getComputeForceCount() == 1)
updateNeighborListSize(); // This is the first time step, so check whether our initial guess was large enough.
}
}
......@@ -383,42 +417,52 @@ void OpenCLNonbondedUtilities::updateNeighborListSize() {
// The most recent timestep had too many interactions to fit in the arrays. Make the arrays bigger to prevent
// this from happening in the future.
int newSize = (int) (1.2*pinnedInteractionCount[0]);
int numTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2;
if (newSize > numTiles)
newSize = numTiles;
int maxTiles = (int) (1.2*pinnedInteractionCount[0]);
int totalTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2;
if (maxTiles > totalTiles)
maxTiles = totalTiles;
delete interactingTiles;
interactingTiles = OpenCLArray::create<mm_ushort2>(context, newSize, "interactingTiles");
forceKernel.setArg<cl::Buffer>(8, interactingTiles->getDeviceBuffer());
forceKernel.setArg<cl_uint>(12, newSize);
findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactingTiles->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl_uint>(9, newSize);
if (context.getSIMDWidth() == 32 || deviceIsCpu) {
delete interactionFlags;
interactionFlags = OpenCLArray::create<cl_uint>(context, deviceIsCpu ? 2*newSize : newSize, "interactionFlags");
forceKernel.setArg<cl::Buffer>(13, interactionFlags->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer());
if (!deviceIsCpu) {
findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(4, interactingTiles->getDeviceBuffer());
findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer());
findInteractionsWithinBlocksKernel.setArg<cl_uint>(10, newSize);
}
delete interactingAtoms;
interactingTiles = NULL; // Avoid an error in the destructor if the following allocation fails
interactingAtoms = NULL;
interactingTiles = OpenCLArray::create<mm_ushort2>(context, maxTiles, "interactingTiles");
interactingAtoms = OpenCLArray::create<cl_int>(context, OpenCLContext::TileSize*maxTiles, "interactingAtoms");
forceKernel.setArg<cl::Buffer>(7, interactingTiles->getDeviceBuffer());
forceKernel.setArg<cl_uint>(11, maxTiles);
forceKernel.setArg<cl::Buffer>(13, interactingAtoms->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(5, interactingTiles->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl::Buffer>(6, interactingAtoms->getDeviceBuffer());
findInteractingBlocksKernel.setArg<cl_uint>(8, maxTiles);
int numAtoms = context.getNumAtoms();
if (context.getUseDoublePrecision()) {
vector<mm_double4> oldPositionsVec(numAtoms, mm_double4(1e30, 1e30, 1e30, 0));
oldPositions->upload(oldPositionsVec);
}
else {
vector<mm_float4> oldPositionsVec(numAtoms, mm_float4(1e30f, 1e30f, 1e30f, 0));
oldPositions->upload(oldPositionsVec);
}
}
void OpenCLNonbondedUtilities::setTileRange(int startTileIndex, int numTiles) {
this->startTileIndex = startTileIndex;
this->numTiles = numTiles;
if (kernelSource.size() == 0)
return; // There are no nonbonded interactions in the System.
forceKernel.setArg<cl_uint>(6, startTileIndex);
forceKernel.setArg<cl_uint>(7, startTileIndex+numTiles);
if (useCutoff) {
findInteractingBlocksKernel.setArg<cl_uint>(10, startTileIndex);
findInteractingBlocksKernel.setArg<cl_uint>(11, startTileIndex+numTiles);
void OpenCLNonbondedUtilities::setUsePadding(bool padding) {
usePadding = padding;
}
void OpenCLNonbondedUtilities::setAtomBlockRange(double startFraction, double endFraction) {
int numAtomBlocks = context.getNumAtomBlocks();
startBlockIndex = (int) (startFraction*numAtomBlocks);
numBlocks = (int) (endFraction*numAtomBlocks)-startBlockIndex;
int totalTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2;
startTileIndex = (int) (startFraction*totalTiles);;
numTiles = (int) (endFraction*totalTiles)-startTileIndex;
if (useCutoff && interactingTiles != NULL) {
// We are using a cutoff, and the kernels have already been created.
forceKernel.setArg<cl_uint>(5, startTileIndex);
forceKernel.setArg<cl_uint>(6, numTiles);
findInteractingBlocksKernel.setArg<cl_uint>(9, startBlockIndex);
findInteractingBlocksKernel.setArg<cl_uint>(10, numBlocks);
}
else
forceKernel.setArg<cl_uint>(8, numTiles);
}
cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& source, const vector<ParameterInfo>& params, const vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric) const {
......@@ -510,8 +554,6 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
}
replacements["LOAD_ATOM2_PARAMETERS"] = load2j.str();
map<string, string> defines;
if (forceBufferPerAtomBlock)
defines["USE_OUTPUT_BUFFER_PER_BLOCK"] = "1";
if (useCutoff)
defines["USE_CUTOFF"] = "1";
if (usePeriodic)
......@@ -525,15 +567,21 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
defines["TILE_SIZE"] = context.intToString(OpenCLContext::TileSize);
int numExclusionTiles = exclusionTiles->getSize();
defines["NUM_TILES_WITH_EXCLUSIONS"] = context.intToString(numExclusionTiles);
int numContexts = context.getPlatformData().contexts.size();
int startExclusionIndex = context.getContextIndex()*numExclusionTiles/numContexts;
int endExclusionIndex = (context.getContextIndex()+1)*numExclusionTiles/numContexts;
defines["FIRST_EXCLUSION_TILE"] = context.intToString(startExclusionIndex);
defines["LAST_EXCLUSION_TILE"] = context.intToString(endExclusionIndex);
if ((localDataSize/4)%2 == 0)
defines["PARAMETER_SIZE_IS_EVEN"] = "1";
string file;
if (deviceIsCpu)
file = OpenCLKernelSources::nonbonded_cpu;
else if (context.getSIMDWidth() == 32)
file = OpenCLKernelSources::nonbonded_nvidia;
else
file = OpenCLKernelSources::nonbonded_default;
file = OpenCLKernelSources::nonbonded;
cl::Program program = context.createProgram(context.replaceStrings(file, replacements), defines);
cl::Kernel kernel(program, "computeNonbonded");
......@@ -547,19 +595,16 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
kernel.setArg<cl::Buffer>(index++, context.getEnergyBuffer().getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, context.getPosq().getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, exclusions->getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, exclusionIndices->getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, exclusionRowIndices->getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, exclusionTiles->getDeviceBuffer());
kernel.setArg<cl_uint>(index++, startTileIndex);
kernel.setArg<cl_uint>(index++, startTileIndex+numTiles);
kernel.setArg<cl_uint>(index++, numTiles);
if (useCutoff) {
kernel.setArg<cl::Buffer>(index++, interactingTiles->getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, interactionCount->getDeviceBuffer());
index += 2; // The periodic box size arguments are set when the kernel is executed.
kernel.setArg<cl_uint>(index++, interactingTiles->getSize());
kernel.setArg<cl::Buffer>(index++, interactionFlags->getDeviceBuffer());
}
else {
kernel.setArg<cl_uint>(index++, numTiles);
kernel.setArg<cl::Buffer>(index++, blockCenter->getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, interactingAtoms->getDeviceBuffer());
}
for (int i = 0; i < (int) params.size(); i++) {
kernel.setArg<cl::Memory>(index++, params[i].getMemory());
......
......@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2010 Stanford University and the Authors. *
* Portions copyright (c) 2009-2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -35,6 +35,8 @@
#include <vector>
namespace OpenMM {
class OpenCLSort;
/**
* This class provides a generic interface for calculating nonbonded interactions. It does this in two
......@@ -120,12 +122,6 @@ public:
bool getUsePeriodic() {
return usePeriodic;
}
/**
* Get whether there is one force buffer per atom block.
*/
bool getForceBufferPerAtomBlock() {
return forceBufferPerAtomBlock;
}
/**
* Get the number of work groups used for computing nonbonded forces.
*/
......@@ -193,10 +189,10 @@ public:
return *interactingTiles;
}
/**
* Get the array containing flags for tiles with interactions.
* Get the array containing the atoms in each tile with interactions.
*/
OpenCLArray& getInteractionFlags() {
return *interactionFlags;
OpenCLArray& getInteractingAtoms() {
return *interactingAtoms;
}
/**
* Get the array containing exclusion flags.
......@@ -204,6 +200,12 @@ public:
OpenCLArray& getExclusions() {
return *exclusions;
}
/**
* Get the array containing tiles with exclusions.
*/
OpenCLArray& getExclusionTiles() {
return *exclusionTiles;
}
/**
* Get the array containing the index into the exclusion array for each tile.
*/
......@@ -229,9 +231,17 @@ public:
return numTiles;
}
/**
* Set the range of tiles that should be processed by this context.
* Set whether to add padding to the cutoff distance when building the neighbor list.
* This increases the size of the neighbor list (and thus the cost of computing interactions),
* but also means we don't need to rebuild it every time step. The default value is true,
* since usually this improves performance. For very expensive interactions, however,
* it may be better to set this to false.
*/
void setUsePadding(bool padding);
/**
* Set the range of atom blocks and tiles that should be processed by this context.
*/
void setTileRange(int startTileIndex, int numTiles);
void setAtomBlockRange(double startFraction, double endFraction);
/**
* Create a Kernel for evaluating a nonbonded interaction. Cutoffs and periodic boundary conditions
* are assumed to be the same as those for the default interaction Kernel, since this kernel will use
......@@ -245,28 +255,36 @@ public:
*/
cl::Kernel createInteractionKernel(const std::string& source, const std::vector<ParameterInfo>& params, const std::vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric) const;
private:
static int findExclusionIndex(int x, int y, const std::vector<cl_uint>& exclusionIndices, const std::vector<cl_uint>& exclusionRowIndices);
class BlockSortTrait;
OpenCLContext& context;
cl::Kernel forceKernel;
cl::Kernel findBlockBoundsKernel;
cl::Kernel sortBoxDataKernel;
cl::Kernel findInteractingBlocksKernel;
cl::Kernel findInteractionsWithinBlocksKernel;
OpenCLArray* exclusionTiles;
OpenCLArray* exclusions;
OpenCLArray* exclusionIndices;
OpenCLArray* exclusionRowIndices;
OpenCLArray* interactingTiles;
OpenCLArray* interactionFlags;
OpenCLArray* interactingAtoms;
OpenCLArray* interactionCount;
OpenCLArray* blockCenter;
OpenCLArray* blockBoundingBox;
OpenCLArray* sortedBlocks;
OpenCLArray* sortedBlockCenter;
OpenCLArray* sortedBlockBoundingBox;
OpenCLArray* oldPositions;
OpenCLArray* rebuildNeighborList;
OpenCLSort* blockSorter;
std::vector<std::vector<int> > atomExclusions;
std::vector<ParameterInfo> parameters;
std::vector<ParameterInfo> arguments;
std::string kernelSource;
std::map<std::string, std::string> kernelDefines;
double cutoff;
bool useCutoff, usePeriodic, forceBufferPerAtomBlock, deviceIsCpu, anyExclusions;
int numForceBuffers, startTileIndex, numTiles, numForceThreadBlocks, forceThreadBlockSize, nonbondedForceGroup;
bool useCutoff, usePeriodic, deviceIsCpu, anyExclusions, usePadding;
int numForceBuffers, startTileIndex, numTiles, startBlockIndex, numBlocks, numForceThreadBlocks, forceThreadBlockSize, nonbondedForceGroup;
};
/**
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2011-2012 Stanford University and the Authors. *
* Portions copyright (c) 2011-2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -108,7 +108,7 @@ private:
};
OpenCLParallelCalcForcesAndEnergyKernel::OpenCLParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, OpenCLPlatform::PlatformData& data) :
CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextTiles(data.contexts.size()), contextForces(NULL),
CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextNonbondedFractions(data.contexts.size()), contextForces(NULL),
pinnedPositionBuffer(NULL), pinnedPositionMemory(NULL), pinnedForceBuffer(NULL), pinnedForceMemory(NULL) {
for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new OpenCLCalcForcesAndEnergyKernel(name, platform, *data.contexts[i])));
......@@ -126,6 +126,8 @@ OpenCLParallelCalcForcesAndEnergyKernel::~OpenCLParallelCalcForcesAndEnergyKerne
void OpenCLParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).initialize(system);
for (int i = 0; i < (int) contextNonbondedFractions.size(); i++)
contextNonbondedFractions[i] = 1/(double) contextNonbondedFractions.size();
}
void OpenCLParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
......@@ -172,30 +174,26 @@ double OpenCLParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& c
numAtoms*(data.contexts.size()-1)*elementSize, pinnedForceMemory);
cl.reduceBuffer(*contextForces, data.contexts.size());
// Balance work between the contexts by transferring a few nonbonded tiles from the context that
// Balance work between the contexts by transferring a little nonbonded work from the context that
// finished last to the one that finished first.
int firstIndex = 0, lastIndex = 0;
int totalTiles = 0;
for (int i = 0; i < (int) completionTimes.size(); i++) {
if (completionTimes[i] < completionTimes[firstIndex])
firstIndex = i;
if (completionTimes[i] > completionTimes[lastIndex])
lastIndex = i;
contextTiles[i] = data.contexts[i]->getNonbondedUtilities().getNumTiles();
totalTiles += contextTiles[i];
}
int tilesToTransfer = totalTiles/1000;
if (tilesToTransfer < 1)
tilesToTransfer = 1;
if (tilesToTransfer > contextTiles[lastIndex])
tilesToTransfer = contextTiles[lastIndex];
contextTiles[firstIndex] += tilesToTransfer;
contextTiles[lastIndex] -= tilesToTransfer;
int startIndex = 0;
for (int i = 0; i < (int) contextTiles.size(); i++) {
data.contexts[i]->getNonbondedUtilities().setTileRange(startIndex, contextTiles[i]);
startIndex += contextTiles[i];
double fractionToTransfer = min(0.001, contextNonbondedFractions[lastIndex]);
contextNonbondedFractions[firstIndex] += fractionToTransfer;
contextNonbondedFractions[lastIndex] -= fractionToTransfer;
double startFraction = 0.0;
for (int i = 0; i < (int) contextNonbondedFractions.size(); i++) {
double endFraction = startFraction+contextNonbondedFractions[i];
if (i == contextNonbondedFractions.size()-1)
endFraction = 1.0; // Avoid roundoff error
data.contexts[i]->getNonbondedUtilities().setAtomBlockRange(startFraction, endFraction);
startFraction = endFraction;
}
}
return energy;
......
......@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2011 Stanford University and the Authors. *
* Portions copyright (c) 2011-2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -80,7 +80,7 @@ private:
OpenCLPlatform::PlatformData& data;
std::vector<Kernel> kernels;
std::vector<long long> completionTimes;
std::vector<int> contextTiles;
std::vector<double> contextNonbondedFractions;
OpenCLArray* contextForces;
cl::Buffer* pinnedPositionBuffer;
cl::Buffer* pinnedForceBuffer;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment