Commit a381a3ab authored by peastman's avatar peastman
Browse files

Merge branch 'master' into gayberne

parents 5ecc8e00 1f7866ad
...@@ -73,6 +73,7 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const ...@@ -73,6 +73,7 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
COMPUTE_VALUE COMPUTE_VALUE
} }
value += tempValue1; value += tempValue1;
ADD_TEMP_DERIVS1
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
} }
#endif #endif
...@@ -121,6 +122,8 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const ...@@ -121,6 +122,8 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
} }
value += tempValue1; value += tempValue1;
localData[tbx+tj].value += tempValue2; localData[tbx+tj].value += tempValue2;
ADD_TEMP_DERIVS1
ADD_TEMP_DERIVS2
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
} }
#endif #endif
...@@ -133,11 +136,13 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const ...@@ -133,11 +136,13 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
// Write results. // Write results.
unsigned int offset = x*TILE_SIZE + tgx; unsigned int offset1 = x*TILE_SIZE + tgx;
atomicAdd(&global_value[offset], static_cast<unsigned long long>((long long) (value*0x100000000))); atomicAdd(&global_value[offset1], static_cast<unsigned long long>((long long) (value*0x100000000)));
STORE_PARAM_DERIVS1
if (x != y) { if (x != y) {
offset = y*TILE_SIZE + tgx; unsigned int offset2 = y*TILE_SIZE + tgx;
atomicAdd(&global_value[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].value*0x100000000))); atomicAdd(&global_value[offset2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].value*0x100000000)));
STORE_PARAM_DERIVS2
} }
} }
...@@ -146,6 +151,8 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const ...@@ -146,6 +151,8 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0]; unsigned int numTiles = interactionCount[0];
if (numTiles > maxTiles)
return; // There wasn't enough memory for the neighbor list.
int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps); int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps); int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
#else #else
...@@ -167,39 +174,35 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const ...@@ -167,39 +174,35 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
int x, y; int x, y;
bool singlePeriodicCopy = false; bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (numTiles <= maxTiles) { x = tiles[pos];
x = tiles[pos]; real4 blockSizeX = blockSize[x];
real4 blockSizeX = blockSize[x]; singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF && 0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF && 0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF); #else
} y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
else x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
#endif if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
{ y += (x < y ? -1 : 1);
y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2); x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error. }
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
// Skip over tiles that have exclusions, since they were already processed. // Skip over tiles that have exclusions, since they were already processed.
while (skipTiles[tbx+TILE_SIZE-1] < pos) { while (skipTiles[tbx+TILE_SIZE-1] < pos) {
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) { if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx]; ushort2 tile = exclusionTiles[skipBase+tgx];
skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2; skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
skipTiles[threadIdx.x] = end;
skipBase += TILE_SIZE;
currentSkipIndex = tbx;
} }
while (skipTiles[currentSkipIndex] < pos) else
currentSkipIndex++; skipTiles[threadIdx.x] = end;
includeTile = (skipTiles[currentSkipIndex] != pos); skipBase += TILE_SIZE;
currentSkipIndex = tbx;
} }
while (skipTiles[currentSkipIndex] < pos)
currentSkipIndex++;
includeTile = (skipTiles[currentSkipIndex] != pos);
#endif
if (includeTile) { if (includeTile) {
unsigned int atom1 = x*TILE_SIZE + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
...@@ -246,6 +249,8 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const ...@@ -246,6 +249,8 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
} }
value += tempValue1; value += tempValue1;
localData[tbx+tj].value += tempValue2; localData[tbx+tj].value += tempValue2;
ADD_TEMP_DERIVS1
ADD_TEMP_DERIVS2
} }
tj = (tj + 1) & (TILE_SIZE - 1); tj = (tj + 1) & (TILE_SIZE - 1);
} }
...@@ -278,6 +283,8 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const ...@@ -278,6 +283,8 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
} }
value += tempValue1; value += tempValue1;
localData[tbx+tj].value += tempValue2; localData[tbx+tj].value += tempValue2;
ADD_TEMP_DERIVS1
ADD_TEMP_DERIVS2
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
} }
#endif #endif
...@@ -287,14 +294,19 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const ...@@ -287,14 +294,19 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
// Write results. // Write results.
atomicAdd(&global_value[atom1], static_cast<unsigned long long>((long long) (value*0x100000000))); unsigned int offset1 = atom1;
atomicAdd(&global_value[offset1], static_cast<unsigned long long>((long long) (value*0x100000000)));
STORE_PARAM_DERIVS1
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int atom2 = atomIndices[threadIdx.x]; unsigned int atom2 = atomIndices[threadIdx.x];
#else #else
unsigned int atom2 = y*TILE_SIZE + tgx; unsigned int atom2 = y*TILE_SIZE + tgx;
#endif #endif
if (atom2 < PADDED_NUM_ATOMS) if (atom2 < PADDED_NUM_ATOMS) {
atomicAdd(&global_value[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].value*0x100000000))); unsigned int offset2 = atom2;
atomicAdd(&global_value[offset2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].value*0x100000000)));
STORE_PARAM_DERIVS2
}
} }
pos++; pos++;
} }
......
...@@ -8,6 +8,7 @@ extern "C" __global__ void computePerParticleValues(real4* posq, long long* valu ...@@ -8,6 +8,7 @@ extern "C" __global__ void computePerParticleValues(real4* posq, long long* valu
// Load the pairwise value // Load the pairwise value
real sum = valueBuffers[index]/(real) 0x100000000; real sum = valueBuffers[index]/(real) 0x100000000;
REDUCE_PARAM0_DERIV
// Now calculate other values // Now calculate other values
......
...@@ -33,7 +33,8 @@ inline __device__ mixed4 convertFromDouble4(double4 a) { ...@@ -33,7 +33,8 @@ inline __device__ mixed4 convertFromDouble4(double4 a) {
extern "C" __global__ void computePerDof(real4* __restrict__ posq, real4* __restrict__ posqCorrection, mixed4* __restrict__ posDelta, extern "C" __global__ void computePerDof(real4* __restrict__ posq, real4* __restrict__ posqCorrection, mixed4* __restrict__ posDelta,
mixed4* __restrict__ velm, const long long* __restrict__ force, const mixed2* __restrict__ dt, const mixed* __restrict__ globals, mixed4* __restrict__ velm, const long long* __restrict__ force, const mixed2* __restrict__ dt, const mixed* __restrict__ globals,
mixed* __restrict__ sum, const float4* __restrict__ gaussianValues, unsigned int gaussianBaseIndex, const float4* __restrict__ uniformValues, const real energy mixed* __restrict__ sum, const float4* __restrict__ gaussianValues, unsigned int gaussianBaseIndex, const float4* __restrict__ uniformValues,
const real energy, mixed* __restrict__ energyParamDerivs
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
mixed stepSize = dt[0].y; mixed stepSize = dt[0].y;
int index = blockIdx.x*blockDim.x+threadIdx.x; int index = blockIdx.x*blockDim.x+threadIdx.x;
......
...@@ -4,15 +4,18 @@ if (!isExcluded && r2 < CUTOFF_SQUARED) { ...@@ -4,15 +4,18 @@ if (!isExcluded && r2 < CUTOFF_SQUARED) {
if (!isExcluded) { if (!isExcluded) {
#endif #endif
real tempForce = 0; real tempForce = 0;
COMPUTE_FORCE real switchValue = 1, switchDeriv = 0;
#if USE_SWITCH #if USE_SWITCH
if (r > SWITCH_CUTOFF) { if (r > SWITCH_CUTOFF) {
real x = r-SWITCH_CUTOFF; real x = r-SWITCH_CUTOFF;
real switchValue = 1+x*x*x*(SWITCH_C3+x*(SWITCH_C4+x*SWITCH_C5)); switchValue = 1+x*x*x*(SWITCH_C3+x*(SWITCH_C4+x*SWITCH_C5));
real switchDeriv = x*x*(3*SWITCH_C3+x*(4*SWITCH_C4+x*5*SWITCH_C5)); switchDeriv = x*x*(3*SWITCH_C3+x*(4*SWITCH_C4+x*5*SWITCH_C5));
tempForce = tempForce*switchValue - tempEnergy*switchDeriv;
tempEnergy *= switchValue;
} }
#endif
COMPUTE_FORCE
#if USE_SWITCH
tempForce = tempForce*switchValue - tempEnergy*switchDeriv;
tempEnergy *= switchValue;
#endif #endif
dEdR += tempForce*invR; dEdR += tempForce*invR;
} }
...@@ -204,6 +204,8 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa ...@@ -204,6 +204,8 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0]; unsigned int numTiles = interactionCount[0];
if (numTiles > maxTiles)
return; // There wasn't enough memory for the neighbor list.
int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps); int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps); int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
#else #else
...@@ -225,39 +227,35 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa ...@@ -225,39 +227,35 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
int x, y; int x, y;
bool singlePeriodicCopy = false; bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
x = tiles[pos]; x = tiles[pos];
real4 blockSizeX = blockSize[x]; real4 blockSizeX = blockSize[x];
singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF && singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF && 0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF); 0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
} #else
else y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
#endif x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
{ if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos)); y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2); x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error. }
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
// Skip over tiles that have exclusions, since they were already processed. // Skip over tiles that have exclusions, since they were already processed.
while (skipTiles[tbx+TILE_SIZE-1] < pos) { while (skipTiles[tbx+TILE_SIZE-1] < pos) {
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) { if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx]; ushort2 tile = exclusionTiles[skipBase+tgx];
skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2; skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
skipTiles[threadIdx.x] = end;
skipBase += TILE_SIZE;
currentSkipIndex = tbx;
} }
while (skipTiles[currentSkipIndex] < pos) else
currentSkipIndex++; skipTiles[threadIdx.x] = end;
includeTile = (skipTiles[currentSkipIndex] != pos); skipBase += TILE_SIZE;
currentSkipIndex = tbx;
} }
while (skipTiles[currentSkipIndex] < pos)
currentSkipIndex++;
includeTile = (skipTiles[currentSkipIndex] != pos);
#endif
if (includeTile) { if (includeTile) {
unsigned int atom1 = x*TILE_SIZE + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
...@@ -559,6 +557,8 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo ...@@ -559,6 +557,8 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0]; unsigned int numTiles = interactionCount[0];
if (numTiles > maxTiles)
return; // There wasn't enough memory for the neighbor list.
int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps); int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps); int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
#else #else
...@@ -580,39 +580,35 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo ...@@ -580,39 +580,35 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
int x, y; int x, y;
bool singlePeriodicCopy = false; bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (numTiles <= maxTiles) { x = tiles[pos];
x = tiles[pos]; real4 blockSizeX = blockSize[x];
real4 blockSizeX = blockSize[x]; singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF && 0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF && 0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF); #else
} y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
else x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
#endif if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
{ y += (x < y ? -1 : 1);
y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2); x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error. }
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
// Skip over tiles that have exclusions, since they were already processed. // Skip over tiles that have exclusions, since they were already processed.
while (skipTiles[tbx+TILE_SIZE-1] < pos) { while (skipTiles[tbx+TILE_SIZE-1] < pos) {
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) { if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx]; ushort2 tile = exclusionTiles[skipBase+tgx];
skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2; skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
skipTiles[threadIdx.x] = end;
skipBase += TILE_SIZE;
currentSkipIndex = tbx;
} }
while (skipTiles[currentSkipIndex] < pos) else
currentSkipIndex++; skipTiles[threadIdx.x] = end;
includeTile = (skipTiles[currentSkipIndex] != pos); skipBase += TILE_SIZE;
currentSkipIndex = tbx;
} }
while (skipTiles[currentSkipIndex] < pos)
currentSkipIndex++;
includeTile = (skipTiles[currentSkipIndex] != pos);
#endif
if (includeTile) { if (includeTile) {
unsigned int atom1 = x*TILE_SIZE + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
......
...@@ -113,11 +113,14 @@ extern "C" __global__ void computeNonbonded( ...@@ -113,11 +113,14 @@ extern "C" __global__ void computeNonbonded(
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // index within the warp const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); // index within the warp
const unsigned int tbx = threadIdx.x - tgx; // block warpIndex const unsigned int tbx = threadIdx.x - tgx; // block warpIndex
mixed energy = 0; mixed energy = 0;
INIT_DERIVATIVES
// used shared memory if the device cannot shuffle // used shared memory if the device cannot shuffle
#ifndef ENABLE_SHUFFLE #ifndef ENABLE_SHUFFLE
__shared__ AtomData localData[THREAD_BLOCK_SIZE]; __shared__ AtomData localData[THREAD_BLOCK_SIZE];
#endif #endif
// First loop: process tiles that contain exclusions. // First loop: process tiles that contain exclusions.
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps; const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps; const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) { for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
...@@ -173,6 +176,7 @@ extern "C" __global__ void computeNonbonded( ...@@ -173,6 +176,7 @@ extern "C" __global__ void computeNonbonded(
bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1)); bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
#endif #endif
real tempEnergy = 0.0f; real tempEnergy = 0.0f;
const real interactionScale = 0.5f;
COMPUTE_INTERACTION COMPUTE_INTERACTION
energy += 0.5f*tempEnergy; energy += 0.5f*tempEnergy;
#ifdef INCLUDE_FORCES #ifdef INCLUDE_FORCES
...@@ -241,6 +245,7 @@ extern "C" __global__ void computeNonbonded( ...@@ -241,6 +245,7 @@ extern "C" __global__ void computeNonbonded(
bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1)); bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
#endif #endif
real tempEnergy = 0.0f; real tempEnergy = 0.0f;
const real interactionScale = 1.0f;
COMPUTE_INTERACTION COMPUTE_INTERACTION
energy += tempEnergy; energy += tempEnergy;
#ifdef INCLUDE_FORCES #ifdef INCLUDE_FORCES
...@@ -309,8 +314,11 @@ extern "C" __global__ void computeNonbonded( ...@@ -309,8 +314,11 @@ extern "C" __global__ void computeNonbonded(
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all // Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff). // of them (no cutoff).
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
const unsigned int numTiles = interactionCount[0]; const unsigned int numTiles = interactionCount[0];
if (numTiles > maxTiles)
return; // There wasn't enough memory for the neighbor list.
int pos = (int) (numTiles > maxTiles ? startTileIndex+warp*(long long)numTileIndices/totalWarps : warp*(long long)numTiles/totalWarps); int pos = (int) (numTiles > maxTiles ? startTileIndex+warp*(long long)numTileIndices/totalWarps : warp*(long long)numTiles/totalWarps);
int end = (int) (numTiles > maxTiles ? startTileIndex+(warp+1)*(long long)numTileIndices/totalWarps : (warp+1)*(long long)numTiles/totalWarps); int end = (int) (numTiles > maxTiles ? startTileIndex+(warp+1)*(long long)numTileIndices/totalWarps : (warp+1)*(long long)numTiles/totalWarps);
#else #else
...@@ -335,39 +343,35 @@ extern "C" __global__ void computeNonbonded( ...@@ -335,39 +343,35 @@ extern "C" __global__ void computeNonbonded(
int x, y; int x, y;
bool singlePeriodicCopy = false; bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (numTiles <= maxTiles) { x = tiles[pos];
x = tiles[pos]; real4 blockSizeX = blockSize[x];
real4 blockSizeX = blockSize[x]; singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= MAX_CUTOFF &&
singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= MAX_CUTOFF && 0.5f*periodicBoxSize.y-blockSizeX.y >= MAX_CUTOFF &&
0.5f*periodicBoxSize.y-blockSizeX.y >= MAX_CUTOFF && 0.5f*periodicBoxSize.z-blockSizeX.z >= MAX_CUTOFF);
0.5f*periodicBoxSize.z-blockSizeX.z >= MAX_CUTOFF); #else
} y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
else x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
#endif if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
{ y += (x < y ? -1 : 1);
y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2); x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error. }
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
// Skip over tiles that have exclusions, since they were already processed. // Skip over tiles that have exclusions, since they were already processed.
while (skipTiles[tbx+TILE_SIZE-1] < pos) { while (skipTiles[tbx+TILE_SIZE-1] < pos) {
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) { if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx]; ushort2 tile = exclusionTiles[skipBase+tgx];
skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2; skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
skipTiles[threadIdx.x] = end;
skipBase += TILE_SIZE;
currentSkipIndex = tbx;
} }
while (skipTiles[currentSkipIndex] < pos) else
currentSkipIndex++; skipTiles[threadIdx.x] = end;
includeTile = (skipTiles[currentSkipIndex] != pos); skipBase += TILE_SIZE;
currentSkipIndex = tbx;
} }
while (skipTiles[currentSkipIndex] < pos)
currentSkipIndex++;
includeTile = (skipTiles[currentSkipIndex] != pos);
#endif
if (includeTile) { if (includeTile) {
unsigned int atom1 = x*TILE_SIZE + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
// Load atom data for this tile. // Load atom data for this tile.
...@@ -447,6 +451,7 @@ extern "C" __global__ void computeNonbonded( ...@@ -447,6 +451,7 @@ extern "C" __global__ void computeNonbonded(
bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS); bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS);
#endif #endif
real tempEnergy = 0.0f; real tempEnergy = 0.0f;
const real interactionScale = 1.0f;
COMPUTE_INTERACTION COMPUTE_INTERACTION
energy += tempEnergy; energy += tempEnergy;
#ifdef INCLUDE_FORCES #ifdef INCLUDE_FORCES
...@@ -517,6 +522,7 @@ extern "C" __global__ void computeNonbonded( ...@@ -517,6 +522,7 @@ extern "C" __global__ void computeNonbonded(
bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS); bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS);
#endif #endif
real tempEnergy = 0.0f; real tempEnergy = 0.0f;
const real interactionScale = 1.0f;
COMPUTE_INTERACTION COMPUTE_INTERACTION
energy += tempEnergy; energy += tempEnergy;
#ifdef INCLUDE_FORCES #ifdef INCLUDE_FORCES
...@@ -585,4 +591,5 @@ extern "C" __global__ void computeNonbonded( ...@@ -585,4 +591,5 @@ extern "C" __global__ void computeNonbonded(
#ifdef INCLUDE_ENERGY #ifdef INCLUDE_ENERGY
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy; energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
#endif #endif
SAVE_DERIVATIVES
} }
\ No newline at end of file
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2011-2015 Stanford University and the Authors. * * Portions copyright (c) 2011-2016 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -100,6 +100,15 @@ public: ...@@ -100,6 +100,15 @@ public:
* refer to it by this name. * refer to it by this name.
*/ */
std::string addArgument(cl::Memory& data, const std::string& type); std::string addArgument(cl::Memory& data, const std::string& type);
/**
* Register that the interaction kernel will be computing the derivative of the potential energy
* with respect to a parameter.
*
* @param param the name of the parameter
* @return the variable that will be used to accumulate the derivative. Any code you pass to addInteraction() should
* add its contributions to this variable.
*/
std::string addEnergyParameterDerivative(const std::string& param);
/** /**
* Add some OpenCL code that should be included in the program, before the start of the kernel. * Add some OpenCL code that should be included in the program, before the start of the kernel.
* This can be used, for example, to define functions that will be called by the kernel. * This can be used, for example, to define functions that will be called by the kernel.
...@@ -137,6 +146,7 @@ private: ...@@ -137,6 +146,7 @@ private:
std::vector<OpenCLArray*> atomIndices; std::vector<OpenCLArray*> atomIndices;
std::vector<OpenCLArray*> bufferIndices; std::vector<OpenCLArray*> bufferIndices;
std::vector<std::string> prefixCode; std::vector<std::string> prefixCode;
std::vector<std::string> energyParameterDerivatives;
int numForceBuffers, maxBonds, allGroups; int numForceBuffers, maxBonds, allGroups;
bool hasInitializedKernels; bool hasInitializedKernels;
}; };
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009-2015 Stanford University and the Authors. * * Portions copyright (c) 2009-2016 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -264,6 +264,12 @@ public: ...@@ -264,6 +264,12 @@ public:
OpenCLArray& getEnergyBuffer() { OpenCLArray& getEnergyBuffer() {
return *energyBuffer; return *energyBuffer;
} }
/**
* Get the array which contains the buffer in which derivatives of the energy with respect to parameters are computed.
*/
OpenCLArray& getEnergyParamDerivBuffer() {
return *energyParamDerivBuffer;
}
/** /**
* Get a pointer to a block of pinned memory that can be used for efficient transfers between host and device. * Get a pointer to a block of pinned memory that can be used for efficient transfers between host and device.
* This is guaranteed to be at least as large as any of the arrays returned by methods of this class. * This is guaranteed to be at least as large as any of the arrays returned by methods of this class.
...@@ -408,6 +414,18 @@ public: ...@@ -408,6 +414,18 @@ public:
void setStepsSinceReorder(int steps) { void setStepsSinceReorder(int steps) {
stepsSinceReorder = steps; stepsSinceReorder = steps;
} }
/**
* Get the flag that marks whether the current force evaluation is valid.
*/
bool getForcesValid() const {
return forcesValid;
}
/**
* Get the flag that marks whether the current force evaluation is valid.
*/
void setForcesValid(bool valid) {
forcesValid = valid;
}
/** /**
* Get the number of atoms. * Get the number of atoms.
*/ */
...@@ -647,6 +665,27 @@ public: ...@@ -647,6 +665,27 @@ public:
std::vector<ForcePostComputation*>& getPostComputations() { std::vector<ForcePostComputation*>& getPostComputations() {
return postComputations; return postComputations;
} }
/**
* Get the names of all parameters with respect to which energy derivatives are computed.
*/
const std::vector<std::string>& getEnergyParamDerivNames() const {
return energyParamDerivNames;
}
/**
* Get a workspace data structure used for accumulating the values of derivatives of the energy
* with respect to parameters.
*/
std::map<std::string, double>& getEnergyParamDerivWorkspace() {
return energyParamDerivWorkspace;
}
/**
* Register that the derivative of potential energy with respect to a context parameter
* will need to be calculated. If this is called multiple times for a single parameter,
* it is only added to the list once.
*
* @param param the name of the parameter to add
*/
void addEnergyParameterDerivative(const std::string& param);
/** /**
* Mark that the current molecule definitions (and hence the atom order) may be invalid. * Mark that the current molecule definitions (and hence the atom order) may be invalid.
* This should be called whenever force field parameters change. It will cause the definitions * This should be called whenever force field parameters change. It will cause the definitions
...@@ -684,7 +723,7 @@ private: ...@@ -684,7 +723,7 @@ private:
int numThreadBlocks; int numThreadBlocks;
int numForceBuffers; int numForceBuffers;
int simdWidth; int simdWidth;
bool supports64BitGlobalAtomics, supportsDoublePrecision, useDoublePrecision, useMixedPrecision, atomsWereReordered, boxIsTriclinic; bool supports64BitGlobalAtomics, supportsDoublePrecision, useDoublePrecision, useMixedPrecision, atomsWereReordered, boxIsTriclinic, forcesValid;
mm_float4 periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ; mm_float4 periodicBoxSize, invPeriodicBoxSize, periodicBoxVecX, periodicBoxVecY, periodicBoxVecZ;
mm_double4 periodicBoxSizeDouble, invPeriodicBoxSizeDouble, periodicBoxVecXDouble, periodicBoxVecYDouble, periodicBoxVecZDouble; mm_double4 periodicBoxSizeDouble, invPeriodicBoxSizeDouble, periodicBoxVecXDouble, periodicBoxVecYDouble, periodicBoxVecZDouble;
std::string defaultOptimizationOptions; std::string defaultOptimizationOptions;
...@@ -713,7 +752,10 @@ private: ...@@ -713,7 +752,10 @@ private:
OpenCLArray* forceBuffers; OpenCLArray* forceBuffers;
OpenCLArray* longForceBuffer; OpenCLArray* longForceBuffer;
OpenCLArray* energyBuffer; OpenCLArray* energyBuffer;
OpenCLArray* energyParamDerivBuffer;
OpenCLArray* atomIndexDevice; OpenCLArray* atomIndexDevice;
std::vector<std::string> energyParamDerivNames;
std::map<std::string, double> energyParamDerivWorkspace;
std::vector<int> atomIndex; std::vector<int> atomIndex;
std::vector<cl::Memory*> autoclearBuffers; std::vector<cl::Memory*> autoclearBuffers;
std::vector<int> autoclearBufferSizes; std::vector<int> autoclearBufferSizes;
......
...@@ -141,6 +141,12 @@ public: ...@@ -141,6 +141,12 @@ public:
* @param forces on exit, this contains the forces * @param forces on exit, this contains the forces
*/ */
void getForces(ContextImpl& context, std::vector<Vec3>& forces); void getForces(ContextImpl& context, std::vector<Vec3>& forces);
/**
* Get the current derivatives of the energy with respect to context parameters.
*
* @param derivs on exit, this contains the derivatives
*/
void getEnergyParameterDerivatives(ContextImpl& context, std::map<std::string, double>& derivs);
/** /**
* Get the current periodic box vectors. * Get the current periodic box vectors.
* *
...@@ -709,6 +715,7 @@ private: ...@@ -709,6 +715,7 @@ private:
std::vector<cl_float> globalParamValues; std::vector<cl_float> globalParamValues;
std::vector<OpenCLArray*> tabulatedFunctions; std::vector<OpenCLArray*> tabulatedFunctions;
double longRangeCoefficient; double longRangeCoefficient;
std::vector<double> longRangeCoefficientDerivs;
bool hasInitializedLongRangeCorrection, hasInitializedKernel; bool hasInitializedLongRangeCorrection, hasInitializedKernel;
int numGroupThreadBlocks; int numGroupThreadBlocks;
CustomNonbondedForce* forceCopy; CustomNonbondedForce* forceCopy;
...@@ -801,13 +808,15 @@ public: ...@@ -801,13 +808,15 @@ public:
void copyParametersToContext(ContextImpl& context, const CustomGBForce& force); void copyParametersToContext(ContextImpl& context, const CustomGBForce& force);
private: private:
double cutoff; double cutoff;
bool hasInitializedKernels, needParameterGradient; bool hasInitializedKernels, needParameterGradient, needEnergyParamDerivs;
int maxTiles, numComputedValues; int maxTiles, numComputedValues;
OpenCLContext& cl; OpenCLContext& cl;
OpenCLParameterSet* params; OpenCLParameterSet* params;
OpenCLParameterSet* computedValues; OpenCLParameterSet* computedValues;
OpenCLParameterSet* energyDerivs; OpenCLParameterSet* energyDerivs;
OpenCLParameterSet* energyDerivChain; OpenCLParameterSet* energyDerivChain;
std::vector<OpenCLParameterSet*> dValuedParam;
std::vector<OpenCLArray*> dValue0dParam;
OpenCLArray* longEnergyDerivs; OpenCLArray* longEnergyDerivs;
OpenCLArray* globals; OpenCLArray* globals;
OpenCLArray* valueBuffers; OpenCLArray* valueBuffers;
...@@ -953,6 +962,7 @@ public: ...@@ -953,6 +962,7 @@ public:
private: private:
int numGroups, numBonds; int numGroups, numBonds;
bool needEnergyParamDerivs;
OpenCLContext& cl; OpenCLContext& cl;
OpenCLParameterSet* params; OpenCLParameterSet* params;
OpenCLArray* globals; OpenCLArray* globals;
...@@ -1339,7 +1349,7 @@ public: ...@@ -1339,7 +1349,7 @@ public:
enum GlobalTargetType {DT, VARIABLE, PARAMETER}; enum GlobalTargetType {DT, VARIABLE, PARAMETER};
OpenCLIntegrateCustomStepKernel(std::string name, const Platform& platform, OpenCLContext& cl) : IntegrateCustomStepKernel(name, platform), cl(cl), OpenCLIntegrateCustomStepKernel(std::string name, const Platform& platform, OpenCLContext& cl) : IntegrateCustomStepKernel(name, platform), cl(cl),
hasInitializedKernels(false), localValuesAreCurrent(false), globalValues(NULL), sumBuffer(NULL), summedValue(NULL), uniformRandoms(NULL), hasInitializedKernels(false), localValuesAreCurrent(false), globalValues(NULL), sumBuffer(NULL), summedValue(NULL), uniformRandoms(NULL),
randomSeed(NULL), perDofValues(NULL) { randomSeed(NULL), perDofEnergyParamDerivs(NULL), perDofValues(NULL), needsEnergyParamDerivs(false) {
} }
~OpenCLIntegrateCustomStepKernel(); ~OpenCLIntegrateCustomStepKernel();
/** /**
...@@ -1404,8 +1414,11 @@ public: ...@@ -1404,8 +1414,11 @@ public:
private: private:
class ReorderListener; class ReorderListener;
class GlobalTarget; class GlobalTarget;
class DerivFunction;
std::string createPerDofComputation(const std::string& variable, const Lepton::ParsedExpression& expr, int component, CustomIntegrator& integrator, const std::string& forceName, const std::string& energyName); std::string createPerDofComputation(const std::string& variable, const Lepton::ParsedExpression& expr, int component, CustomIntegrator& integrator, const std::string& forceName, const std::string& energyName);
void prepareForComputation(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid); void prepareForComputation(ContextImpl& context, CustomIntegrator& integrator, bool& forcesAreValid);
Lepton::ExpressionTreeNode replaceDerivFunctions(const Lepton::ExpressionTreeNode& node, OpenMM::ContextImpl& context);
void findExpressionsForDerivs(const Lepton::ExpressionTreeNode& node, std::vector<std::pair<Lepton::ExpressionTreeNode, std::string> >& variableNodes);
void recordGlobalValue(double value, GlobalTarget target); void recordGlobalValue(double value, GlobalTarget target);
void recordChangedParameters(ContextImpl& context); void recordChangedParameters(ContextImpl& context);
bool evaluateCondition(int step); bool evaluateCondition(int step);
...@@ -1413,18 +1426,23 @@ private: ...@@ -1413,18 +1426,23 @@ private:
double energy; double energy;
float energyFloat; float energyFloat;
int numGlobalVariables; int numGlobalVariables;
bool hasInitializedKernels, deviceValuesAreCurrent, deviceGlobalsAreCurrent, modifiesParameters, keNeedsForce, hasAnyConstraints; bool hasInitializedKernels, deviceValuesAreCurrent, deviceGlobalsAreCurrent, modifiesParameters, keNeedsForce, hasAnyConstraints, needsEnergyParamDerivs;
mutable bool localValuesAreCurrent; mutable bool localValuesAreCurrent;
OpenCLArray* globalValues; OpenCLArray* globalValues;
OpenCLArray* sumBuffer; OpenCLArray* sumBuffer;
OpenCLArray* summedValue; OpenCLArray* summedValue;
OpenCLArray* uniformRandoms; OpenCLArray* uniformRandoms;
OpenCLArray* randomSeed; OpenCLArray* randomSeed;
OpenCLArray* perDofEnergyParamDerivs;
std::map<int, OpenCLArray*> savedForces; std::map<int, OpenCLArray*> savedForces;
std::set<int> validSavedForces; std::set<int> validSavedForces;
OpenCLParameterSet* perDofValues; OpenCLParameterSet* perDofValues;
mutable std::vector<std::vector<cl_float> > localPerDofValuesFloat; mutable std::vector<std::vector<cl_float> > localPerDofValuesFloat;
mutable std::vector<std::vector<cl_double> > localPerDofValuesDouble; mutable std::vector<std::vector<cl_double> > localPerDofValuesDouble;
std::map<std::string, double> energyParamDerivs;
std::vector<std::string> perDofEnergyParamDerivNames;
std::vector<cl_float> localPerDofEnergyParamDerivsFloat;
std::vector<cl_double> localPerDofEnergyParamDerivsDouble;
std::vector<float> globalValuesFloat; std::vector<float> globalValuesFloat;
std::vector<double> globalValuesDouble; std::vector<double> globalValuesDouble;
std::vector<double> initialGlobalVariables; std::vector<double> initialGlobalVariables;
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009-2013 Stanford University and the Authors. * * Portions copyright (c) 2009-2016 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -88,6 +88,15 @@ public: ...@@ -88,6 +88,15 @@ public:
* Add an array (other than a per-atom parameter) that should be passed as an argument to the default interaction kernel. * Add an array (other than a per-atom parameter) that should be passed as an argument to the default interaction kernel.
*/ */
void addArgument(const ParameterInfo& parameter); void addArgument(const ParameterInfo& parameter);
/**
* Register that the interaction kernel will be computing the derivative of the potential energy
* with respect to a parameter.
*
* @param param the name of the parameter
* @return the variable that will be used to accumulate the derivative. Any code you pass to addInteraction() should
* add its contributions to this variable.
*/
std::string addEnergyParameterDerivative(const std::string& param);
/** /**
* Specify the list of exclusions that an interaction outside the default kernel will depend on. * Specify the list of exclusions that an interaction outside the default kernel will depend on.
* *
...@@ -281,9 +290,13 @@ private: ...@@ -281,9 +290,13 @@ private:
OpenCLArray* oldPositions; OpenCLArray* oldPositions;
OpenCLArray* rebuildNeighborList; OpenCLArray* rebuildNeighborList;
OpenCLSort* blockSorter; OpenCLSort* blockSorter;
cl::Event downloadCountEvent;
cl::Buffer* pinnedCountBuffer;
int* pinnedCountMemory;
std::vector<std::vector<int> > atomExclusions; std::vector<std::vector<int> > atomExclusions;
std::vector<ParameterInfo> parameters; std::vector<ParameterInfo> parameters;
std::vector<ParameterInfo> arguments; std::vector<ParameterInfo> arguments;
std::vector<std::string> energyParameterDerivatives;
std::map<int, double> groupCutoff; std::map<int, double> groupCutoff;
std::map<int, std::string> groupKernelSource; std::map<int, std::string> groupKernelSource;
double lastCutoff; double lastCutoff;
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2011-2015 Stanford University and the Authors. * * Portions copyright (c) 2011-2016 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -56,12 +56,25 @@ void OpenCLBondedUtilities::addInteraction(const vector<vector<int> >& atoms, co ...@@ -56,12 +56,25 @@ void OpenCLBondedUtilities::addInteraction(const vector<vector<int> >& atoms, co
} }
} }
std::string OpenCLBondedUtilities::addArgument(cl::Memory& data, const string& type) { string OpenCLBondedUtilities::addArgument(cl::Memory& data, const string& type) {
arguments.push_back(&data); arguments.push_back(&data);
argTypes.push_back(type); argTypes.push_back(type);
return "customArg"+context.intToString(arguments.size()); return "customArg"+context.intToString(arguments.size());
} }
string OpenCLBondedUtilities::addEnergyParameterDerivative(const string& param) {
// See if the parameter has already been added.
int index;
for (index = 0; index < energyParameterDerivatives.size(); index++)
if (param == energyParameterDerivatives[index])
break;
if (index == energyParameterDerivatives.size())
energyParameterDerivatives.push_back(param);
context.addEnergyParameterDerivative(param);
return string("energyParamDeriv")+context.intToString(index);
}
void OpenCLBondedUtilities::addPrefixCode(const string& source) { void OpenCLBondedUtilities::addPrefixCode(const string& source) {
for (int i = 0; i < (int) prefixCode.size(); i++) for (int i = 0; i < (int) prefixCode.size(); i++)
if (prefixCode[i] == source) if (prefixCode[i] == source)
...@@ -190,13 +203,23 @@ void OpenCLBondedUtilities::initialize(const System& system) { ...@@ -190,13 +203,23 @@ void OpenCLBondedUtilities::initialize(const System& system) {
} }
for (int i = 0; i < (int) arguments.size(); i++) for (int i = 0; i < (int) arguments.size(); i++)
s<<", __global "<<argTypes[i]<<"* customArg"<<(i+1); s<<", __global "<<argTypes[i]<<"* customArg"<<(i+1);
if (energyParameterDerivatives.size() > 0)
s<<", __global mixed* restrict energyParamDerivs";
s<<") {\n"; s<<") {\n";
s<<"mixed energy = 0;\n"; s<<"mixed energy = 0;\n";
for (int i = 0; i < energyParameterDerivatives.size(); i++)
s<<"mixed energyParamDeriv"<<i<<" = 0;\n";
for (int i = 0; i < setSize; i++) { for (int i = 0; i < setSize; i++) {
int force = set[i]; int force = set[i];
s<<createForceSource(i, forceAtoms[force].size(), forceAtoms[force][0].size(), forceGroup[force], forceSource[force]); s<<createForceSource(i, forceAtoms[force].size(), forceAtoms[force][0].size(), forceGroup[force], forceSource[force]);
} }
s<<"energyBuffer[get_global_id(0)] += energy;\n"; s<<"energyBuffer[get_global_id(0)] += energy;\n";
const vector<string>& allParamDerivNames = context.getEnergyParamDerivNames();
int numDerivs = allParamDerivNames.size();
for (int i = 0; i < energyParameterDerivatives.size(); i++)
for (int index = 0; index < numDerivs; index++)
if (allParamDerivNames[index] == energyParameterDerivatives[i])
s<<"energyParamDerivs[get_global_id(0)*"<<numDerivs<<"+"<<index<<"] += energyParamDeriv"<<i<<";\n";
s<<"}\n"; s<<"}\n";
map<string, string> defines; map<string, string> defines;
defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms()); defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
...@@ -274,6 +297,8 @@ void OpenCLBondedUtilities::computeInteractions(int groups) { ...@@ -274,6 +297,8 @@ void OpenCLBondedUtilities::computeInteractions(int groups) {
} }
for (int j = 0; j < (int) arguments.size(); j++) for (int j = 0; j < (int) arguments.size(); j++)
kernel.setArg<cl::Memory>(index++, *arguments[j]); kernel.setArg<cl::Memory>(index++, *arguments[j]);
if (energyParameterDerivatives.size() > 0)
kernel.setArg<cl::Memory>(index++, context.getEnergyParamDerivBuffer().getDeviceBuffer());
} }
} }
for (int i = 0; i < (int) kernels.size(); i++) { for (int i = 0; i < (int) kernels.size(); i++) {
......
...@@ -69,7 +69,7 @@ static void CL_CALLBACK errorCallback(const char* errinfo, const void* private_i ...@@ -69,7 +69,7 @@ static void CL_CALLBACK errorCallback(const char* errinfo, const void* private_i
OpenCLContext::OpenCLContext(const System& system, int platformIndex, int deviceIndex, const string& precision, OpenCLPlatform::PlatformData& platformData) : OpenCLContext::OpenCLContext(const System& system, int platformIndex, int deviceIndex, const string& precision, OpenCLPlatform::PlatformData& platformData) :
system(system), time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), stepsSinceReorder(99999), atomsWereReordered(false), posq(NULL), system(system), time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), stepsSinceReorder(99999), atomsWereReordered(false), posq(NULL),
posqCorrection(NULL), velm(NULL), forceBuffers(NULL), longForceBuffer(NULL), energyBuffer(NULL), atomIndexDevice(NULL), integration(NULL), posqCorrection(NULL), velm(NULL), forceBuffers(NULL), longForceBuffer(NULL), energyBuffer(NULL), energyParamDerivBuffer(NULL), atomIndexDevice(NULL), integration(NULL),
expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) { expression(NULL), bonded(NULL), nonbonded(NULL), thread(NULL) {
if (precision == "single") { if (precision == "single") {
useDoublePrecision = false; useDoublePrecision = false;
...@@ -179,10 +179,8 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device ...@@ -179,10 +179,8 @@ OpenCLContext::OpenCLContext(const System& system, int platformIndex, int device
compilationDefines["WORK_GROUP_SIZE"] = intToString(ThreadBlockSize); compilationDefines["WORK_GROUP_SIZE"] = intToString(ThreadBlockSize);
if (platformVendor.size() >= 5 && platformVendor.substr(0, 5) == "Intel") if (platformVendor.size() >= 5 && platformVendor.substr(0, 5) == "Intel")
defaultOptimizationOptions = ""; defaultOptimizationOptions = "";
else if (platformVendor == "Apple")
defaultOptimizationOptions = "-cl-mad-enable -cl-no-signed-zeros";
else else
defaultOptimizationOptions = "-cl-fast-relaxed-math"; defaultOptimizationOptions = "-cl-mad-enable -cl-no-signed-zeros";
supports64BitGlobalAtomics = (device.getInfo<CL_DEVICE_EXTENSIONS>().find("cl_khr_int64_base_atomics") != string::npos); supports64BitGlobalAtomics = (device.getInfo<CL_DEVICE_EXTENSIONS>().find("cl_khr_int64_base_atomics") != string::npos);
supportsDoublePrecision = (device.getInfo<CL_DEVICE_EXTENSIONS>().find("cl_khr_fp64") != string::npos); supportsDoublePrecision = (device.getInfo<CL_DEVICE_EXTENSIONS>().find("cl_khr_fp64") != string::npos);
if ((useDoublePrecision || useMixedPrecision) && !supportsDoublePrecision) if ((useDoublePrecision || useMixedPrecision) && !supportsDoublePrecision)
...@@ -437,6 +435,8 @@ OpenCLContext::~OpenCLContext() { ...@@ -437,6 +435,8 @@ OpenCLContext::~OpenCLContext() {
delete longForceBuffer; delete longForceBuffer;
if (energyBuffer != NULL) if (energyBuffer != NULL)
delete energyBuffer; delete energyBuffer;
if (energyParamDerivBuffer != NULL)
delete energyParamDerivBuffer;
if (atomIndexDevice != NULL) if (atomIndexDevice != NULL)
delete atomIndexDevice; delete atomIndexDevice;
if (integration != NULL) if (integration != NULL)
...@@ -457,15 +457,16 @@ void OpenCLContext::initialize() { ...@@ -457,15 +457,16 @@ void OpenCLContext::initialize() {
numForceBuffers = std::max(numForceBuffers, bonded->getNumForceBuffers()); numForceBuffers = std::max(numForceBuffers, bonded->getNumForceBuffers());
for (int i = 0; i < (int) forces.size(); i++) for (int i = 0; i < (int) forces.size(); i++)
numForceBuffers = std::max(numForceBuffers, forces[i]->getRequiredForceBuffers()); numForceBuffers = std::max(numForceBuffers, forces[i]->getRequiredForceBuffers());
int energyBufferSize = max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers());
if (useDoublePrecision) { if (useDoublePrecision) {
forceBuffers = OpenCLArray::create<mm_double4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers"); forceBuffers = OpenCLArray::create<mm_double4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers");
force = OpenCLArray::create<mm_double4>(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "force"); force = OpenCLArray::create<mm_double4>(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "force");
energyBuffer = OpenCLArray::create<cl_double>(*this, max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers()), "energyBuffer"); energyBuffer = OpenCLArray::create<cl_double>(*this, energyBufferSize, "energyBuffer");
} }
else { else {
forceBuffers = OpenCLArray::create<mm_float4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers"); forceBuffers = OpenCLArray::create<mm_float4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers");
force = OpenCLArray::create<mm_float4>(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "force"); force = OpenCLArray::create<mm_float4>(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "force");
energyBuffer = OpenCLArray::create<cl_double>(*this, max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers()), "energyBuffer"); energyBuffer = OpenCLArray::create<cl_double>(*this, energyBufferSize, "energyBuffer");
} }
if (supports64BitGlobalAtomics) { if (supports64BitGlobalAtomics) {
longForceBuffer = OpenCLArray::create<cl_long>(*this, 3*paddedNumAtoms, "longForceBuffer"); longForceBuffer = OpenCLArray::create<cl_long>(*this, 3*paddedNumAtoms, "longForceBuffer");
...@@ -477,7 +478,15 @@ void OpenCLContext::initialize() { ...@@ -477,7 +478,15 @@ void OpenCLContext::initialize() {
} }
addAutoclearBuffer(*forceBuffers); addAutoclearBuffer(*forceBuffers);
addAutoclearBuffer(*energyBuffer); addAutoclearBuffer(*energyBuffer);
int bufferBytes = max(velm->getSize()*velm->getElementSize(), energyBuffer->getSize()*energyBuffer->getElementSize()); int numEnergyParamDerivs = energyParamDerivNames.size();
if (numEnergyParamDerivs > 0) {
if (useDoublePrecision || useMixedPrecision)
energyParamDerivBuffer = OpenCLArray::create<cl_double>(*this, numEnergyParamDerivs*energyBufferSize, "energyParamDerivBuffer");
else
energyParamDerivBuffer = OpenCLArray::create<cl_float>(*this, numEnergyParamDerivs*energyBufferSize, "energyParamDerivBuffer");
addAutoclearBuffer(*energyParamDerivBuffer);
}
int bufferBytes = max(velm->getSize()*velm->getElementSize(), energyBufferSize*energyBuffer->getElementSize());
pinnedBuffer = new cl::Buffer(context, CL_MEM_ALLOC_HOST_PTR, bufferBytes); pinnedBuffer = new cl::Buffer(context, CL_MEM_ALLOC_HOST_PTR, bufferBytes);
pinnedMemory = currentQueue.enqueueMapBuffer(*pinnedBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, bufferBytes); pinnedMemory = currentQueue.enqueueMapBuffer(*pinnedBuffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, bufferBytes);
for (int i = 0; i < numAtoms; i++) { for (int i = 0; i < numAtoms; i++) {
...@@ -1052,7 +1061,6 @@ void OpenCLContext::reorderAtoms() { ...@@ -1052,7 +1061,6 @@ void OpenCLContext::reorderAtoms() {
reorderAtomsImpl<cl_float, mm_float4, cl_double, mm_double4>(); reorderAtomsImpl<cl_float, mm_float4, cl_double, mm_double4>();
else else
reorderAtomsImpl<cl_float, mm_float4, cl_float, mm_float4>(); reorderAtomsImpl<cl_float, mm_float4, cl_float, mm_float4>();
nonbonded->updateNeighborListSize();
} }
template <class Real, class Real4, class Mixed, class Mixed4> template <class Real, class Real4, class Mixed, class Mixed4>
...@@ -1232,6 +1240,15 @@ void OpenCLContext::addPostComputation(ForcePostComputation* computation) { ...@@ -1232,6 +1240,15 @@ void OpenCLContext::addPostComputation(ForcePostComputation* computation) {
postComputations.push_back(computation); postComputations.push_back(computation);
} }
void OpenCLContext::addEnergyParameterDerivative(const string& param) {
// See if this parameter has already been registered.
for (int i = 0; i < energyParamDerivNames.size(); i++)
if (param == energyParamDerivNames[i])
return;
energyParamDerivNames.push_back(param);
}
struct OpenCLContext::WorkThread::ThreadData { struct OpenCLContext::WorkThread::ThreadData {
ThreadData(std::queue<OpenCLContext::WorkTask*>& tasks, bool& waiting, bool& finished, ThreadData(std::queue<OpenCLContext::WorkTask*>& tasks, bool& waiting, bool& finished,
pthread_mutex_t& queueLock, pthread_cond_t& waitForTaskCondition, pthread_cond_t& queueEmptyCondition) : pthread_mutex_t& queueLock, pthread_cond_t& waitForTaskCondition, pthread_cond_t& queueEmptyCondition) :
......
...@@ -174,8 +174,8 @@ void OpenCLExpressionUtilities::processExpression(stringstream& out, const Expre ...@@ -174,8 +174,8 @@ void OpenCLExpressionUtilities::processExpression(stringstream& out, const Expre
out << "if (x >= " << paramsFloat[2] << " && x <= " << paramsFloat[3] << " && y >= " << paramsFloat[4] << " && y <= " << paramsFloat[5] << ") {\n"; out << "if (x >= " << paramsFloat[2] << " && x <= " << paramsFloat[3] << " && y >= " << paramsFloat[4] << " && y <= " << paramsFloat[5] << ") {\n";
out << "x = (x - " << paramsFloat[2] << ")*" << paramsFloat[6] << ";\n"; out << "x = (x - " << paramsFloat[2] << ")*" << paramsFloat[6] << ";\n";
out << "y = (y - " << paramsFloat[4] << ")*" << paramsFloat[7] << ";\n"; out << "y = (y - " << paramsFloat[4] << ")*" << paramsFloat[7] << ";\n";
out << "int s = min((int) floor(x), " << paramsInt[0] << ");\n"; out << "int s = min((int) floor(x), " << paramsInt[0] << "-1);\n";
out << "int t = min((int) floor(y), " << paramsInt[1] << ");\n"; out << "int t = min((int) floor(y), " << paramsInt[1] << "-1);\n";
out << "int coeffIndex = 4*(s+" << paramsInt[0] << "*t);\n"; out << "int coeffIndex = 4*(s+" << paramsInt[0] << "*t);\n";
out << "float4 c[4];\n"; out << "float4 c[4];\n";
for (int j = 0; j < 4; j++) for (int j = 0; j < 4; j++)
...@@ -217,9 +217,9 @@ void OpenCLExpressionUtilities::processExpression(stringstream& out, const Expre ...@@ -217,9 +217,9 @@ void OpenCLExpressionUtilities::processExpression(stringstream& out, const Expre
out << "x = (x - " << paramsFloat[3] << ")*" << paramsFloat[9] << ";\n"; out << "x = (x - " << paramsFloat[3] << ")*" << paramsFloat[9] << ";\n";
out << "y = (y - " << paramsFloat[5] << ")*" << paramsFloat[10] << ";\n"; out << "y = (y - " << paramsFloat[5] << ")*" << paramsFloat[10] << ";\n";
out << "z = (z - " << paramsFloat[7] << ")*" << paramsFloat[11] << ";\n"; out << "z = (z - " << paramsFloat[7] << ")*" << paramsFloat[11] << ";\n";
out << "int s = min((int) floor(x), " << paramsInt[0] << ");\n"; out << "int s = min((int) floor(x), " << paramsInt[0] << "-1);\n";
out << "int t = min((int) floor(y), " << paramsInt[1] << ");\n"; out << "int t = min((int) floor(y), " << paramsInt[1] << "-1);\n";
out << "int u = min((int) floor(z), " << paramsInt[2] << ");\n"; out << "int u = min((int) floor(z), " << paramsInt[2] << "-1);\n";
out << "int coeffIndex = 16*(s+" << paramsInt[0] << "*(t+" << paramsInt[1] << "*u));\n"; out << "int coeffIndex = 16*(s+" << paramsInt[0] << "*(t+" << paramsInt[1] << "*u));\n";
out << "float4 c[16];\n"; out << "float4 c[16];\n";
for (int j = 0; j < 16; j++) for (int j = 0; j < 16; j++)
......
This diff is collapsed.
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009-2015 Stanford University and the Authors. * * Portions copyright (c) 2009-2016 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -57,7 +57,7 @@ private: ...@@ -57,7 +57,7 @@ private:
OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : context(context), useCutoff(false), usePeriodic(false), anyExclusions(false), usePadding(true), OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : context(context), useCutoff(false), usePeriodic(false), anyExclusions(false), usePadding(true),
numForceBuffers(0), exclusionIndices(NULL), exclusionRowIndices(NULL), exclusionTiles(NULL), exclusions(NULL), interactingTiles(NULL), interactingAtoms(NULL), numForceBuffers(0), exclusionIndices(NULL), exclusionRowIndices(NULL), exclusionTiles(NULL), exclusions(NULL), interactingTiles(NULL), interactingAtoms(NULL),
interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), sortedBlocks(NULL), sortedBlockCenter(NULL), sortedBlockBoundingBox(NULL), interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), sortedBlocks(NULL), sortedBlockCenter(NULL), sortedBlockBoundingBox(NULL),
oldPositions(NULL), rebuildNeighborList(NULL), blockSorter(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0) { oldPositions(NULL), rebuildNeighborList(NULL), blockSorter(NULL), pinnedCountBuffer(NULL), pinnedCountMemory(NULL), forceRebuildNeighborList(true), lastCutoff(0.0), groupFlags(0) {
// Decide how many thread blocks and force buffers to use. // Decide how many thread blocks and force buffers to use.
deviceIsCpu = (context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU); deviceIsCpu = (context.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
...@@ -90,6 +90,8 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con ...@@ -90,6 +90,8 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
numForceBuffers = numForceThreadBlocks*forceThreadBlockSize/OpenCLContext::TileSize; numForceBuffers = numForceThreadBlocks*forceThreadBlockSize/OpenCLContext::TileSize;
} }
} }
pinnedCountBuffer = new cl::Buffer(context.getContext(), CL_MEM_ALLOC_HOST_PTR, sizeof(int));
pinnedCountMemory = (int*) context.getQueue().enqueueMapBuffer(*pinnedCountBuffer, CL_TRUE, CL_MAP_READ, 0, sizeof(int));
} }
OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() { OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() {
...@@ -123,6 +125,8 @@ OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() { ...@@ -123,6 +125,8 @@ OpenCLNonbondedUtilities::~OpenCLNonbondedUtilities() {
delete rebuildNeighborList; delete rebuildNeighborList;
if (blockSorter != NULL) if (blockSorter != NULL)
delete blockSorter; delete blockSorter;
if (pinnedCountBuffer != NULL)
delete pinnedCountBuffer;
} }
void OpenCLNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup) { void OpenCLNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup) {
...@@ -158,6 +162,19 @@ void OpenCLNonbondedUtilities::addArgument(const ParameterInfo& parameter) { ...@@ -158,6 +162,19 @@ void OpenCLNonbondedUtilities::addArgument(const ParameterInfo& parameter) {
arguments.push_back(parameter); arguments.push_back(parameter);
} }
string OpenCLNonbondedUtilities::addEnergyParameterDerivative(const string& param) {
// See if the parameter has already been added.
int index;
for (index = 0; index < energyParameterDerivatives.size(); index++)
if (param == energyParameterDerivatives[index])
break;
if (index == energyParameterDerivatives.size())
energyParameterDerivatives.push_back(param);
context.addEnergyParameterDerivative(param);
return string("energyParamDeriv")+context.intToString(index);
}
void OpenCLNonbondedUtilities::requestExclusions(const vector<vector<int> >& exclusionList) { void OpenCLNonbondedUtilities::requestExclusions(const vector<vector<int> >& exclusionList) {
if (anyExclusions) { if (anyExclusions) {
bool sameExclusions = (exclusionList.size() == atomExclusions.size()); bool sameExclusions = (exclusionList.size() == atomExclusions.size());
...@@ -357,20 +374,16 @@ void OpenCLNonbondedUtilities::prepareInteractions(int forceGroups) { ...@@ -357,20 +374,16 @@ void OpenCLNonbondedUtilities::prepareInteractions(int forceGroups) {
if (lastCutoff != kernels.cutoffDistance) if (lastCutoff != kernels.cutoffDistance)
forceRebuildNeighborList = true; forceRebuildNeighborList = true;
bool rebuild = false; setPeriodicBoxArgs(context, kernels.findBlockBoundsKernel, 1);
do { context.executeKernel(kernels.findBlockBoundsKernel, context.getNumAtoms());
setPeriodicBoxArgs(context, kernels.findBlockBoundsKernel, 1); blockSorter->sort(*sortedBlocks);
context.executeKernel(kernels.findBlockBoundsKernel, context.getNumAtoms()); kernels.sortBoxDataKernel.setArg<cl_int>(9, forceRebuildNeighborList);
blockSorter->sort(*sortedBlocks); context.executeKernel(kernels.sortBoxDataKernel, context.getNumAtoms());
kernels.sortBoxDataKernel.setArg<cl_int>(9, forceRebuildNeighborList); setPeriodicBoxArgs(context, kernels.findInteractingBlocksKernel, 0);
context.executeKernel(kernels.sortBoxDataKernel, context.getNumAtoms()); context.executeKernel(kernels.findInteractingBlocksKernel, context.getNumAtoms(), interactingBlocksThreadBlockSize);
setPeriodicBoxArgs(context, kernels.findInteractingBlocksKernel, 0); forceRebuildNeighborList = false;
context.executeKernel(kernels.findInteractingBlocksKernel, context.getNumAtoms(), interactingBlocksThreadBlockSize);
forceRebuildNeighborList = false;
if (context.getComputeForceCount() == 1)
rebuild = updateNeighborListSize(); // This is the first time step, so check whether our initial guess was large enough.
} while (rebuild);
lastCutoff = kernels.cutoffDistance; lastCutoff = kernels.cutoffDistance;
context.getQueue().enqueueReadBuffer(interactionCount->getDeviceBuffer(), CL_FALSE, 0, sizeof(int), pinnedCountMemory, NULL, &downloadCountEvent);
} }
void OpenCLNonbondedUtilities::computeInteractions(int forceGroups, bool includeForces, bool includeEnergy) { void OpenCLNonbondedUtilities::computeInteractions(int forceGroups, bool includeForces, bool includeEnergy) {
...@@ -385,20 +398,22 @@ void OpenCLNonbondedUtilities::computeInteractions(int forceGroups, bool include ...@@ -385,20 +398,22 @@ void OpenCLNonbondedUtilities::computeInteractions(int forceGroups, bool include
setPeriodicBoxArgs(context, kernel, 9); setPeriodicBoxArgs(context, kernel, 9);
context.executeKernel(kernel, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize); context.executeKernel(kernel, numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
} }
if (useCutoff && numTiles > 0) {
downloadCountEvent.wait();
updateNeighborListSize();
}
} }
bool OpenCLNonbondedUtilities::updateNeighborListSize() { bool OpenCLNonbondedUtilities::updateNeighborListSize() {
if (!useCutoff) if (!useCutoff)
return false; return false;
unsigned int* pinnedInteractionCount = (unsigned int*) context.getPinnedBuffer(); if (pinnedCountMemory[0] <= (unsigned int) interactingTiles->getSize())
interactionCount->download(pinnedInteractionCount);
if (pinnedInteractionCount[0] <= (unsigned int) interactingTiles->getSize())
return false; return false;
// The most recent timestep had too many interactions to fit in the arrays. Make the arrays bigger to prevent // The most recent timestep had too many interactions to fit in the arrays. Make the arrays bigger to prevent
// this from happening in the future. // this from happening in the future.
int maxTiles = (int) (1.2*pinnedInteractionCount[0]); int maxTiles = (int) (1.2*pinnedCountMemory[0]);
int totalTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2; int totalTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2;
if (maxTiles > totalTiles) if (maxTiles > totalTiles)
maxTiles = totalTiles; maxTiles = totalTiles;
...@@ -430,6 +445,7 @@ bool OpenCLNonbondedUtilities::updateNeighborListSize() { ...@@ -430,6 +445,7 @@ bool OpenCLNonbondedUtilities::updateNeighborListSize() {
kernels.findInteractingBlocksKernel.setArg<cl_uint>(9, maxTiles); kernels.findInteractingBlocksKernel.setArg<cl_uint>(9, maxTiles);
} }
forceRebuildNeighborList = true; forceRebuildNeighborList = true;
context.setForcesValid(false);
return true; return true;
} }
...@@ -588,6 +604,8 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc ...@@ -588,6 +604,8 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
args << arguments[i].getName(); args << arguments[i].getName();
} }
} }
if (energyParameterDerivatives.size() > 0)
args << ", __global mixed* restrict energyParamDerivs";
replacements["PARAMETER_ARGUMENTS"] = args.str(); replacements["PARAMETER_ARGUMENTS"] = args.str();
stringstream loadLocal1; stringstream loadLocal1;
for (int i = 0; i < (int) params.size(); i++) { for (int i = 0; i < (int) params.size(); i++) {
...@@ -638,6 +656,18 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc ...@@ -638,6 +656,18 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
} }
} }
replacements["LOAD_ATOM2_PARAMETERS"] = load2j.str(); replacements["LOAD_ATOM2_PARAMETERS"] = load2j.str();
stringstream initDerivs;
for (int i = 0; i < energyParameterDerivatives.size(); i++)
initDerivs<<"mixed energyParamDeriv"<<i<<" = 0;\n";
replacements["INIT_DERIVATIVES"] = initDerivs.str();
stringstream saveDerivs;
const vector<string>& allParamDerivNames = context.getEnergyParamDerivNames();
int numDerivs = allParamDerivNames.size();
for (int i = 0; i < energyParameterDerivatives.size(); i++)
for (int index = 0; index < numDerivs; index++)
if (allParamDerivNames[index] == energyParameterDerivatives[i])
saveDerivs<<"energyParamDerivs[get_global_id(0)*"<<numDerivs<<"+"<<index<<"] += energyParamDeriv"<<i<<";\n";
replacements["SAVE_DERIVATIVES"] = saveDerivs.str();
map<string, string> defines; map<string, string> defines;
if (useCutoff) if (useCutoff)
defines["USE_CUTOFF"] = "1"; defines["USE_CUTOFF"] = "1";
...@@ -713,5 +743,7 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc ...@@ -713,5 +743,7 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
for (int i = 0; i < (int) arguments.size(); i++) { for (int i = 0; i < (int) arguments.size(); i++) {
kernel.setArg<cl::Memory>(index++, arguments[i].getMemory()); kernel.setArg<cl::Memory>(index++, arguments[i].getMemory());
} }
if (energyParameterDerivatives.size() > 0)
kernel.setArg<cl::Memory>(index++, context.getEnergyParamDerivBuffer().getDeviceBuffer());
return kernel; return kernel;
} }
...@@ -30,6 +30,10 @@ ...@@ -30,6 +30,10 @@
tempForce = -prefactor*(erfAlphaR-alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI); tempForce = -prefactor*(erfAlphaR-alphaR*expAlphaRSqr*TWO_OVER_SQRT_PI);
tempEnergy += -prefactor*erfAlphaR; tempEnergy += -prefactor*erfAlphaR;
} }
else {
includeInteraction = false;
tempEnergy -= TWO_OVER_SQRT_PI*EWALD_ALPHA*138.935456f*posq1.w*posq2.w;
}
} }
else { else {
#if HAS_LENNARD_JONES #if HAS_LENNARD_JONES
......
...@@ -116,10 +116,12 @@ __kernel void computeGroupForces(__global long* restrict groupForce, __global mi ...@@ -116,10 +116,12 @@ __kernel void computeGroupForces(__global long* restrict groupForce, __global mi
__global const int* restrict bondGroups, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ __global const int* restrict bondGroups, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
EXTRA_ARGS) { EXTRA_ARGS) {
mixed energy = 0; mixed energy = 0;
INIT_PARAM_DERIVS
for (int index = get_global_id(0); index < NUM_BONDS; index += get_global_size(0)) { for (int index = get_global_id(0); index < NUM_BONDS; index += get_global_size(0)) {
COMPUTE_FORCE COMPUTE_FORCE
} }
energyBuffer[get_global_id(0)] += energy; energyBuffer[get_global_id(0)] += energy;
SAVE_PARAM_DERIVS
} }
/** /**
......
...@@ -32,6 +32,7 @@ __kernel void computeN2Energy( ...@@ -32,6 +32,7 @@ __kernel void computeN2Energy(
const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); const unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
const unsigned int tbx = get_local_id(0) - tgx; const unsigned int tbx = get_local_id(0) - tgx;
mixed energy = 0; mixed energy = 0;
INIT_PARAM_DERIVS
// First loop: process tiles that contain exclusions. // First loop: process tiles that contain exclusions.
...@@ -73,6 +74,7 @@ __kernel void computeN2Energy( ...@@ -73,6 +74,7 @@ __kernel void computeN2Energy(
atom2 = y*TILE_SIZE+j; atom2 = y*TILE_SIZE+j;
real dEdR = 0; real dEdR = 0;
real tempEnergy = 0; real tempEnergy = 0;
const real interactionScale = 0.5f;
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1); bool isExcluded = !(excl & 0x1);
#endif #endif
...@@ -123,6 +125,7 @@ __kernel void computeN2Energy( ...@@ -123,6 +125,7 @@ __kernel void computeN2Energy(
atom2 = y*TILE_SIZE+tj; atom2 = y*TILE_SIZE+tj;
real dEdR = 0; real dEdR = 0;
real tempEnergy = 0; real tempEnergy = 0;
const real interactionScale = 1.0f;
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1); bool isExcluded = !(excl & 0x1);
#endif #endif
...@@ -181,6 +184,8 @@ __kernel void computeN2Energy( ...@@ -181,6 +184,8 @@ __kernel void computeN2Energy(
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0]; unsigned int numTiles = interactionCount[0];
if (numTiles > maxTiles)
return; // There wasn't enough memory for the neighbor list.
int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps); int pos = (int) (warp*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps); int end = (int) ((warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : (long)numTiles)/totalWarps);
#else #else
...@@ -204,42 +209,38 @@ __kernel void computeN2Energy( ...@@ -204,42 +209,38 @@ __kernel void computeN2Energy(
int x, y; int x, y;
bool singlePeriodicCopy = false; bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (numTiles <= maxTiles) { x = tiles[pos];
x = tiles[pos]; real4 blockSizeX = blockSize[x];
real4 blockSizeX = blockSize[x]; singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF && 0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF && 0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF); #else
} y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
else x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
#endif if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
{ y += (x < y ? -1 : 1);
y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2); x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error. }
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
// Skip over tiles that have exclusions, since they were already processed. // Skip over tiles that have exclusions, since they were already processed.
SYNC_WARPS;
while (skipTiles[tbx+TILE_SIZE-1] < pos) {
SYNC_WARPS; SYNC_WARPS;
while (skipTiles[tbx+TILE_SIZE-1] < pos) { if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
SYNC_WARPS; ushort2 tile = exclusionTiles[skipBase+tgx];
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) { skipTiles[get_local_id(0)] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
ushort2 tile = exclusionTiles[skipBase+tgx];
skipTiles[get_local_id(0)] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
skipTiles[get_local_id(0)] = end;
skipBase += TILE_SIZE;
currentSkipIndex = tbx;
SYNC_WARPS;
} }
while (skipTiles[currentSkipIndex] < pos) else
currentSkipIndex++; skipTiles[get_local_id(0)] = end;
includeTile = (skipTiles[currentSkipIndex] != pos); skipBase += TILE_SIZE;
currentSkipIndex = tbx;
SYNC_WARPS;
} }
while (skipTiles[currentSkipIndex] < pos)
currentSkipIndex++;
includeTile = (skipTiles[currentSkipIndex] != pos);
#endif
if (includeTile) { if (includeTile) {
unsigned int atom1 = x*TILE_SIZE + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
...@@ -283,6 +284,7 @@ __kernel void computeN2Energy( ...@@ -283,6 +284,7 @@ __kernel void computeN2Energy(
atom2 = atomIndices[tbx+tj]; atom2 = atomIndices[tbx+tj];
real dEdR = 0; real dEdR = 0;
real tempEnergy = 0; real tempEnergy = 0;
const real interactionScale = 1.0f;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_INTERACTION COMPUTE_INTERACTION
dEdR /= -r; dEdR /= -r;
...@@ -321,6 +323,7 @@ __kernel void computeN2Energy( ...@@ -321,6 +323,7 @@ __kernel void computeN2Energy(
atom2 = atomIndices[tbx+tj]; atom2 = atomIndices[tbx+tj];
real dEdR = 0; real dEdR = 0;
real tempEnergy = 0; real tempEnergy = 0;
const real interactionScale = 1.0f;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_INTERACTION COMPUTE_INTERACTION
dEdR /= -r; dEdR /= -r;
...@@ -375,4 +378,5 @@ __kernel void computeN2Energy( ...@@ -375,4 +378,5 @@ __kernel void computeN2Energy(
pos++; pos++;
} }
energyBuffer[get_global_id(0)] += energy; energyBuffer[get_global_id(0)] += energy;
SAVE_PARAM_DERIVS
} }
...@@ -28,6 +28,7 @@ __kernel void computeN2Energy( ...@@ -28,6 +28,7 @@ __kernel void computeN2Energy(
#endif #endif
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
mixed energy = 0; mixed energy = 0;
INIT_PARAM_DERIVS
// First loop: process tiles that contain exclusions. // First loop: process tiles that contain exclusions.
...@@ -74,6 +75,7 @@ __kernel void computeN2Energy( ...@@ -74,6 +75,7 @@ __kernel void computeN2Energy(
atom2 = y*TILE_SIZE+j; atom2 = y*TILE_SIZE+j;
real dEdR = 0; real dEdR = 0;
real tempEnergy = 0; real tempEnergy = 0;
const real interactionScale = 0.5f;
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1); bool isExcluded = !(excl & 0x1);
#endif #endif
...@@ -140,6 +142,7 @@ __kernel void computeN2Energy( ...@@ -140,6 +142,7 @@ __kernel void computeN2Energy(
atom2 = y*TILE_SIZE+j; atom2 = y*TILE_SIZE+j;
real dEdR = 0; real dEdR = 0;
real tempEnergy = 0; real tempEnergy = 0;
const real interactionScale = 1.0f;
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1)); bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
if (!isExcluded) { if (!isExcluded) {
...@@ -201,6 +204,8 @@ __kernel void computeN2Energy( ...@@ -201,6 +204,8 @@ __kernel void computeN2Energy(
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
const unsigned int numTiles = interactionCount[0]; const unsigned int numTiles = interactionCount[0];
if (numTiles > maxTiles)
return; // There wasn't enough memory for the neighbor list.
int pos = (int) (get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0)); int pos = (int) (get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0));
int end = (int) ((get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0)); int end = (int) ((get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*((long)NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0));
#else #else
...@@ -220,35 +225,31 @@ __kernel void computeN2Energy( ...@@ -220,35 +225,31 @@ __kernel void computeN2Energy(
int x, y; int x, y;
bool singlePeriodicCopy = false; bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (numTiles <= maxTiles) { x = tiles[pos];
x = tiles[pos]; real4 blockSizeX = blockSize[x];
real4 blockSizeX = blockSize[x]; singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF &&
singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= CUTOFF && 0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF &&
0.5f*periodicBoxSize.y-blockSizeX.y >= CUTOFF && 0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF);
0.5f*periodicBoxSize.z-blockSizeX.z >= CUTOFF); #else
} y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
else x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
#endif if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
{ y += (x < y ? -1 : 1);
y = (int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2); x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error. }
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
// Skip over tiles that have exclusions, since they were already processed. // Skip over tiles that have exclusions, since they were already processed.
while (nextToSkip < pos) { while (nextToSkip < pos) {
if (currentSkipIndex < NUM_TILES_WITH_EXCLUSIONS) { if (currentSkipIndex < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[currentSkipIndex++]; ushort2 tile = exclusionTiles[currentSkipIndex++];
nextToSkip = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2; nextToSkip = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
nextToSkip = end;
} }
includeTile = (nextToSkip != pos); else
nextToSkip = end;
} }
includeTile = (nextToSkip != pos);
#endif
if (includeTile) { if (includeTile) {
// Load the data for this tile. // Load the data for this tile.
...@@ -293,6 +294,7 @@ __kernel void computeN2Energy( ...@@ -293,6 +294,7 @@ __kernel void computeN2Energy(
atom2 = atomIndices[j]; atom2 = atomIndices[j];
real dEdR = 0; real dEdR = 0;
real tempEnergy = 0; real tempEnergy = 0;
const real interactionScale = 1.0f;
COMPUTE_INTERACTION COMPUTE_INTERACTION
dEdR /= -r; dEdR /= -r;
energy += tempEnergy; energy += tempEnergy;
...@@ -349,6 +351,7 @@ __kernel void computeN2Energy( ...@@ -349,6 +351,7 @@ __kernel void computeN2Energy(
atom2 = atomIndices[j]; atom2 = atomIndices[j];
real dEdR = 0; real dEdR = 0;
real tempEnergy = 0; real tempEnergy = 0;
const real interactionScale = 1.0f;
COMPUTE_INTERACTION COMPUTE_INTERACTION
dEdR /= -r; dEdR /= -r;
energy += tempEnergy; energy += tempEnergy;
...@@ -402,4 +405,5 @@ __kernel void computeN2Energy( ...@@ -402,4 +405,5 @@ __kernel void computeN2Energy(
pos++; pos++;
} }
energyBuffer[get_global_id(0)] += energy; energyBuffer[get_global_id(0)] += energy;
SAVE_PARAM_DERIVS
} }
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
__kernel void computePerParticleEnergy(int bufferSize, int numBuffers, __global real4* restrict forceBuffers, __global mixed* restrict energyBuffer, __global const real4* restrict posq __kernel void computePerParticleEnergy(int bufferSize, int numBuffers, __global real4* restrict forceBuffers, __global mixed* restrict energyBuffer, __global const real4* restrict posq
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
mixed energy = 0; mixed energy = 0;
INIT_PARAM_DERIVS
unsigned int index = get_global_id(0); unsigned int index = get_global_id(0);
while (index < NUM_ATOMS) { while (index < NUM_ATOMS) {
// Reduce the derivatives // Reduce the derivatives
...@@ -27,4 +28,5 @@ __kernel void computePerParticleEnergy(int bufferSize, int numBuffers, __global ...@@ -27,4 +28,5 @@ __kernel void computePerParticleEnergy(int bufferSize, int numBuffers, __global
index += get_global_size(0); index += get_global_size(0);
} }
energyBuffer[get_global_id(0)] += energy; energyBuffer[get_global_id(0)] += energy;
SAVE_PARAM_DERIVS
} }
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment