Unverified Commit edbc8407 authored by peastman's avatar peastman Committed by GitHub
Browse files

Common compute framework to unify CUDA and OpenCL code (#2488)

* Began creating common compute framework to unify code between CUDA and OpenCL

* Began OpenCL implementation of common compute framework

* Common implementation of CMMotionRemover

* CUDA implementation of common compute interface

* Converted HarmonicBondForce to common compute API

* Converted standard bonded forces to common compute API

* Converted ExpressionUtilities to common compute API

* Created ComputeParameterSet

* Converted custom bonded forces to common compute API

* Converted CustomCentroidBondForce to common compute API

* Converted CustomManyParticleForce to common compute API

* Moved lots of duplicate code from CudaContext and OpenCLContext to ComputeContext

* Converted GayBerneForce to common compute API

* Removed obsolete kernels

* Converted verlet integrators to common compute API

* Converted Langevin and Brownian integrators to common compute API

* Converted CustomIntegrator to common compute API

* Converted CustomNonbondedForce to common compute API

* Removed uses of a deprecated API

* Fixed failing test cases

* Converted GBSAOBCForce to common compute API

* Began converting CustomGBForce to common compute API

* Finished converting CustomGBForce to common compute API

* Merged duplicated code in CudaIntegrationUtilities and OpenCLIntegrationUtilities

* Converted RMSDForce and AndersenThermostat to common compute API

* Converted CustomHbondForce to common compute API

* Merged scripts for encoding kernel sources

* Converted Drude plugin to common compute API

* Fixed errors in CMake scripts

* Attempt at fixing errors on Windows

* Added discussion of common compute API to developer guide

* Added Windows export macro for common classes

* Fixed error in CMMotionRemover

* Ubdated travis to newer Ubuntu version

* Fixed errors on CPU OpenCL

* Fixed Windows linking errors

* Added missing pragma for 32 bit atomics

* Replaced long long with mm_long

* More fixes to Windows linking

* Bug fix
parent 38beeefe
#ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2 && r2 < CUTOFF_SQUARED) {
#else
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#endif
#ifdef USE_SYMMETRIC
real tempForce = 0.0f;
#else
real4 tempForce1 = (real4) 0;
real4 tempForce2 = (real4) 0;
#endif
COMPUTE_FORCE
#ifdef USE_SYMMETRIC
dEdR += tempForce*invR;
#else
dEdR1 += tempForce1;
dEdR2 += tempForce2;
#endif
}
/**
* Compute chain rule terms for computed values that depend explicitly on particle coordinates.
*/
__kernel void computeGradientChainRuleTerms(__global real4* restrict forceBuffers, __global const real4* restrict posq
PARAMETER_ARGUMENTS) {
INIT_PARAM_DERIVS
unsigned int index = get_global_id(0);
while (index < NUM_ATOMS) {
real4 pos = posq[index];
real4 force = forceBuffers[index];
COMPUTE_FORCES
forceBuffers[index] = force;
index += get_global_size(0);
}
SAVE_PARAM_DERIVS
}
__kernel void computeFloatSum(__global const float* restrict sumBuffer, __global float* result, int bufferSize) {
__local float tempBuffer[WORK_GROUP_SIZE];
const unsigned int thread = get_local_id(0);
float sum = 0;
for (unsigned int index = thread; index < bufferSize; index += get_local_size(0))
sum += sumBuffer[index];
tempBuffer[thread] = sum;
for (int i = 1; i < WORK_GROUP_SIZE; i *= 2) {
barrier(CLK_LOCAL_MEM_FENCE);
if (thread%(i*2) == 0 && thread+i < WORK_GROUP_SIZE)
tempBuffer[thread] += tempBuffer[thread+i];
}
if (thread == 0)
*result = tempBuffer[0];
}
#ifdef SUPPORTS_DOUBLE_PRECISION
__kernel void computeDoubleSum(__global const double* restrict sumBuffer, __global double* result, int bufferSize) {
__local double tempBuffer[WORK_GROUP_SIZE];
const unsigned int thread = get_local_id(0);
double sum = 0;
for (unsigned int index = thread; index < bufferSize; index += get_local_size(0))
sum += sumBuffer[index];
tempBuffer[thread] = sum;
for (int i = 1; i < WORK_GROUP_SIZE; i *= 2) {
barrier(CLK_LOCAL_MEM_FENCE);
if (thread%(i*2) == 0 && thread+i < WORK_GROUP_SIZE)
tempBuffer[thread] += tempBuffer[thread+i];
}
if (thread == 0)
*result = tempBuffer[0];
}
#endif
__kernel void applyPositionDeltas(__global real4* restrict posq, __global real4* restrict posqCorrection, __global mixed4* restrict posDelta) {
for (unsigned int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
#ifdef USE_MIXED_PRECISION
real4 pos1 = posq[index];
real4 pos2 = posqCorrection[index];
mixed4 pos = (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
#else
real4 pos = posq[index];
#endif
pos.xyz += posDelta[index].xyz;
#ifdef USE_MIXED_PRECISION
posq[index] = (real4) ((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
#else
posq[index] = pos;
#endif
posDelta[index] = (mixed4) 0;
}
}
__kernel void generateRandomNumbers(int numValues, __global float4* restrict random, __global uint4* restrict seed) {
uint4 state = seed[get_global_id(0)];
unsigned int carry = 0;
for (int index = get_global_id(0); index < numValues; index += get_global_size(0)) {
// Generate three uniform random numbers.
state.x = state.x * 69069 + 1;
state.y ^= state.y << 13;
state.y ^= state.y >> 17;
state.y ^= state.y << 5;
unsigned int k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
unsigned int m = state.w + state.w + state.z + carry;
state.z = state.w;
state.w = m;
carry = k >> 30;
float x1 = (float)max(state.x + state.y + state.w, 0x00000001u) / (float)0xffffffff;
state.x = state.x * 69069 + 1;
state.y ^= state.y << 13;
state.y ^= state.y >> 17;
state.y ^= state.y << 5;
k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
m = state.w + state.w + state.z + carry;
state.z = state.w;
state.w = m;
carry = k >> 30;
float x2 = (float)max(state.x + state.y + state.w, 0x00000001u) / (float)0xffffffff;
state.x = state.x * 69069 + 1;
state.y ^= state.y << 13;
state.y ^= state.y >> 17;
state.y ^= state.y << 5;
k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
m = state.w + state.w + state.z + carry;
state.z = state.w;
state.w = m;
carry = k >> 30;
float x3 = (float)max(state.x + state.y + state.w, 0x00000001u) / (float)0xffffffff;
// Record the values.
random[index] = (float4) (x1, x2, x3, 0.0f);
}
seed[get_global_id(0)] = state;
}
/**
* Load the position of a particle.
*/
mixed4 loadPos(__global const real4* restrict posq, __global const real4* restrict posqCorrection, int index) {
#ifdef USE_MIXED_PRECISION
real4 pos1 = posq[index];
real4 pos2 = posqCorrection[index];
return (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
#else
return posq[index];
#endif
}
/**
* Store the position of a particle.
*/
void storePos(__global real4* restrict posq, __global real4* restrict posqCorrection, int index, mixed4 pos) {
#ifdef USE_MIXED_PRECISION
posq[index] = (real4) ((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
#else
posq[index] = pos;
#endif
}
__kernel void computePerDof(__global real4* restrict posq, __global real4* restrict posqCorrection, __global mixed4* restrict posDelta,
__global mixed4* restrict velm, __global const real4* restrict force, __global const mixed2* restrict dt, __global const mixed* restrict globals,
__global mixed* restrict sum, __global const float4* restrict gaussianValues, unsigned int gaussianBaseIndex, __global const float4* restrict uniformValues,
const mixed energy, __global mixed* restrict energyParamDerivs
PARAMETER_ARGUMENTS) {
mixed stepSize = dt[0].y;
int index = get_global_id(0);
while (index < NUM_ATOMS) {
#ifdef LOAD_POS_AS_DELTA
mixed4 position = loadPos(posq, posqCorrection, index)+posDelta[index];
#else
mixed4 position = loadPos(posq, posqCorrection, index);
#endif
mixed4 velocity = velm[index];
mixed4 f = convert_mixed4(force[index]);
mixed mass = 1/velocity.w;
if (velocity.w != 0.0) {
int gaussianIndex = gaussianBaseIndex;
int uniformIndex = 0;
COMPUTE_STEP
}
index += get_global_size(0);
}
}
#ifdef USE_CUTOFF
if (!isExcluded && r2 < CUTOFF_SQUARED) {
#else
if (!isExcluded) {
#endif
real tempForce = 0.0f;
real switchValue = 1, switchDeriv = 0;
#if USE_SWITCH
if (r > SWITCH_CUTOFF) {
real x = r-SWITCH_CUTOFF;
switchValue = 1+x*x*x*(SWITCH_C3+x*(SWITCH_C4+x*SWITCH_C5));
switchDeriv = x*x*(3*SWITCH_C3+x*(4*SWITCH_C4+x*5*SWITCH_C5));
}
#endif
COMPUTE_FORCE
#if USE_SWITCH
tempForce = tempForce*switchValue - customEnergy*switchDeriv;
tempEnergy += customEnergy*switchValue;
#else
tempEnergy += customEnergy;
#endif
dEdR += tempForce*invR;
}
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#define TILE_SIZE 32
#define NEIGHBOR_BLOCK_SIZE 32
/**
* Calculate the ellipsoid coordinate frames and associated matrices.
*/
__kernel void computeEllipsoidFrames(int numParticles, __global const real4* restrict posq, __global int2* const restrict axisParticleIndices,
__global const float4* restrict sigParams, __global const float4* restrict scale, __global real* restrict aMatrix,
__global real* restrict bMatrix, __global real* restrict gMatrix, __global const int* sortedParticles) {
for (int sortedIndex = get_global_id(0); sortedIndex < numParticles; sortedIndex += get_global_size(0)) {
// Compute the local coordinate system of the ellipsoid;
int originalIndex = sortedParticles[sortedIndex];
real3 pos = posq[originalIndex].xyz;
int2 axisParticles = axisParticleIndices[originalIndex];
real3 xdir, ydir, zdir;
if (axisParticles.x == -1) {
xdir = (real3) (1, 0, 0);
ydir = (real3) (0, 1, 0);
}
else {
xdir = pos-posq[axisParticles.x].xyz;
xdir = normalize(xdir);
if (axisParticles.y == -1) {
if (xdir.y > -0.5f && xdir.y < 0.5f)
ydir = (real3) (0, 1, 0);
else
ydir = (real3) (1, 0, 0);
}
else
ydir = pos-posq[axisParticles.y].xyz;
ydir -= xdir*dot(xdir, ydir);
ydir = normalize(ydir);
}
zdir = cross(xdir, ydir);
// Compute matrices we will need later.
__global real (*a)[3] = (__global real (*)[3]) (aMatrix+sortedIndex*9);
__global real (*b)[3] = (__global real (*)[3]) (bMatrix+sortedIndex*9);
__global real (*g)[3] = (__global real (*)[3]) (gMatrix+sortedIndex*9);
a[0][0] = xdir.x;
a[0][1] = xdir.y;
a[0][2] = xdir.z;
a[1][0] = ydir.x;
a[1][1] = ydir.y;
a[1][2] = ydir.z;
a[2][0] = zdir.x;
a[2][1] = zdir.y;
a[2][2] = zdir.z;
float4 sig = sigParams[originalIndex];
float3 r2 = sig.yzw;
float3 e2 = scale[originalIndex].xyz;
for (int i = 0; i < 3; i++)
for (int j = 0; j < 3; j++) {
b[i][j] = a[0][i]*e2.x*a[0][j] + a[1][i]*e2.y*a[1][j] + a[2][i]*e2.z*a[2][j];
g[i][j] = a[0][i]*r2.x*a[0][j] + a[1][i]*r2.y*a[1][j] + a[2][i]*r2.z*a[2][j];
}
}
}
/**
* Find a bounding box for the atoms in each block.
*/
__kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
__global const int* sortedAtoms, __global const real4* restrict posq, __global real4* restrict sortedPos, __global real4* restrict blockCenter,
__global real4* restrict blockBoundingBox, __global int* restrict neighborBlockCount) {
int index = get_global_id(0);
int base = index*TILE_SIZE;
while (base < numAtoms) {
real4 pos = posq[sortedAtoms[base]];
sortedPos[base] = pos;
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_POS(pos)
#endif
real4 minPos = pos;
real4 maxPos = pos;
int last = min(base+TILE_SIZE, numAtoms);
for (int i = base+1; i < last; i++) {
pos = posq[sortedAtoms[i]];
sortedPos[i] = pos;
#ifdef USE_PERIODIC
real4 center = 0.5f*(maxPos+minPos);
APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)
#endif
minPos = min(minPos, pos);
maxPos = max(maxPos, pos);
}
real4 blockSize = 0.5f*(maxPos-minPos);
blockBoundingBox[index] = blockSize;
blockCenter[index] = 0.5f*(maxPos+minPos);
index += get_global_size(0);
base = index*TILE_SIZE;
}
if (get_global_id(0) == 0)
*neighborBlockCount = 0;
}
/**
* This is called by findNeighbors() to write a block to the neighbor list.
*/
void storeNeighbors(int atom1, int* neighborBuffer, int numAtomsInBuffer, int maxNeighborBlocks, __global int* restrict neighbors,
__global int* restrict neighborIndex, __global int* restrict neighborBlockCount) {
int blockIndex = atom_add(neighborBlockCount, 1);
if (blockIndex >= maxNeighborBlocks)
return; // We don't have enough room for the neighbor list.
neighborIndex[blockIndex] = atom1;
int baseIndex = blockIndex*NEIGHBOR_BLOCK_SIZE;
for (int i = 0; i < numAtomsInBuffer; i++)
neighbors[baseIndex+i] = neighborBuffer[i];
for (int i = numAtomsInBuffer; i < NEIGHBOR_BLOCK_SIZE; i++)
neighbors[baseIndex+i] = -1;
}
/**
* Build a list of neighbors for each atom.
*/
__kernel void findNeighbors(int numAtoms, int maxNeighborBlocks, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
__global real4* restrict sortedPos, __global real4* restrict blockCenter, __global real4* restrict blockBoundingBox, __global int* restrict neighbors,
__global int* restrict neighborIndex, __global int* restrict neighborBlockCount, __global const int* restrict exclusions, __global const int* restrict exclusionStartIndex) {
const int numBlocks = (numAtoms+TILE_SIZE-1)/TILE_SIZE;
int neighborBuffer[NEIGHBOR_BLOCK_SIZE];
for (int atom1 = get_global_id(0); atom1 < numAtoms; atom1 += get_global_size(0)) {
int nextExclusion = exclusionStartIndex[atom1];
int lastExclusion = exclusionStartIndex[atom1+1];
real4 pos = sortedPos[atom1];
int nextBufferIndex = 0;
// Loop over atom blocks and compute the distance of this atom from each one's bounding box.
for (int block = (atom1+1)/TILE_SIZE; block < numBlocks; block++) {
real4 center = blockCenter[block];
real4 blockSize = blockBoundingBox[block];
real4 blockDelta = center-pos;
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(blockDelta)
#endif
blockDelta.x = max((real) 0, fabs(blockDelta.x)-blockSize.x);
blockDelta.y = max((real) 0, fabs(blockDelta.y)-blockSize.y);
blockDelta.z = max((real) 0, fabs(blockDelta.z)-blockSize.z);
if (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z >= CUTOFF_SQUARED)
continue;
// Loop over atoms within this block.
int first = max(block*TILE_SIZE, atom1+1);
int last = min((block+1)*TILE_SIZE, numAtoms);
for (int atom2 = first; atom2 < last; atom2++) {
// Skip over excluded interactions.
if (nextExclusion < lastExclusion && exclusions[nextExclusion] >= atom2) {
nextExclusion++;
continue;
}
real4 delta = pos-sortedPos[atom2];
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
if (r2 < CUTOFF_SQUARED) {
neighborBuffer[nextBufferIndex++] = atom2;
if (nextBufferIndex == NEIGHBOR_BLOCK_SIZE) {
storeNeighbors(atom1, neighborBuffer, nextBufferIndex, maxNeighborBlocks, neighbors, neighborIndex, neighborBlockCount);
nextBufferIndex = 0;
}
}
}
}
if (nextBufferIndex > 0)
storeNeighbors(atom1, neighborBuffer, nextBufferIndex, maxNeighborBlocks, neighbors, neighborIndex, neighborBlockCount);
}
}
typedef struct {
float4 sig;
float2 eps;
real3 pos;
real a[3][3], b[3][3], g[3][3];
} AtomData;
void loadAtomData(AtomData* data, int sortedIndex, int originalIndex, __global const real4* restrict pos, __global const float4* restrict sigParams,
__global const float2* restrict epsParams, __global const real* restrict aMatrix, __global const real* restrict bMatrix, __global const real* restrict gMatrix) {
data->sig = sigParams[originalIndex];
data->eps = epsParams[originalIndex];
data->pos = pos[sortedIndex].xyz;
for (int i = 0; i < 3; i++)
for (int j = 0; j < 3; j++) {
int k = 9*sortedIndex+3*i+j;
data->a[i][j] = aMatrix[k];
data->b[i][j] = bMatrix[k];
data->g[i][j] = gMatrix[k];
}
}
real3 matrixVectorProduct(real (*m)[3], real3 v) {
return (real3) (m[0][0]*v.x + m[0][1]*v.y + m[0][2]*v.z,
m[1][0]*v.x + m[1][1]*v.y + m[1][2]*v.z,
m[2][0]*v.x + m[2][1]*v.y + m[2][2]*v.z);
}
real3 vectorMatrixProduct(real3 v, real (*m)[3]) {
return (real3) (m[0][0]*v.x + m[1][0]*v.y + m[2][0]*v.z,
m[0][1]*v.x + m[1][1]*v.y + m[2][1]*v.z,
m[0][2]*v.x + m[1][2]*v.y + m[2][2]*v.z);
}
void matrixSum(real (*result)[3], real (*a)[3], real (*b)[3]) {
result[0][0] = a[0][0]+b[0][0];
result[0][1] = a[0][1]+b[0][1];
result[0][2] = a[0][2]+b[0][2];
result[1][0] = a[1][0]+b[1][0];
result[1][1] = a[1][1]+b[1][1];
result[1][2] = a[1][2]+b[1][2];
result[2][0] = a[2][0]+b[2][0];
result[2][1] = a[2][1]+b[2][1];
result[2][2] = a[2][2]+b[2][2];
}
real determinant(real (*m)[3]) {
return (m[0][0]*m[1][1]*m[2][2] + m[0][1]*m[1][2]*m[2][0] + m[0][2]*m[1][0]*m[2][1] -
m[0][0]*m[1][2]*m[2][1] - m[0][1]*m[1][0]*m[2][2] - m[0][2]*m[1][1]*m[2][0]);
}
void matrixInverse(real (*result)[3], real (*m)[3]) {
real invDet = RECIP(determinant(m));
result[0][0] = invDet*(m[1][1]*m[2][2] - m[1][2]*m[2][1]);
result[1][0] = -invDet*(m[1][0]*m[2][2] - m[1][2]*m[2][0]);
result[2][0] = invDet*(m[1][0]*m[2][1] - m[1][1]*m[2][0]);
result[0][1] = -invDet*(m[0][1]*m[2][2] - m[0][2]*m[2][1]);
result[1][1] = invDet*(m[0][0]*m[2][2] - m[0][2]*m[2][0]);
result[2][1] = -invDet*(m[0][0]*m[2][1] - m[0][1]*m[2][0]);
result[0][2] = invDet*(m[0][1]*m[1][2] - m[0][2]*m[1][1]);
result[1][2] = -invDet*(m[0][0]*m[1][2] - m[0][2]*m[1][0]);
result[2][2] = invDet*(m[0][0]*m[1][1] - m[0][1]*m[1][0]);
}
void computeOneInteraction(AtomData* data1, AtomData* data2, real sigma, real epsilon, real3 dr, real r2, real3* force1, real3* force2, real3* torque1, real3* torque2, mixed *totalEnergy) {
real rInv = RSQRT(r2);
real r = r2*rInv;
real3 drUnit = dr*rInv;
// Compute the switching function.
real switchValue = 1, switchDeriv = 0;
#if USE_SWITCH
if (r > SWITCH_CUTOFF) {
real x = r-SWITCH_CUTOFF;
switchValue = 1+x*x*x*(SWITCH_C3+x*(SWITCH_C4+x*SWITCH_C5));
switchDeriv = x*x*(3*SWITCH_C3+x*(4*SWITCH_C4+x*5*SWITCH_C5));
}
#endif
// Compute vectors and matrices we'll be needing.
real B12[3][3], G12[3][3], B12inv[3][3], G12inv[3][3];
matrixSum(B12, data1->b, data2->b);
matrixSum(G12, data1->g, data2->g);
matrixInverse(B12inv, B12);
matrixInverse(G12inv, G12);
real detG12 = determinant(G12);
// Estimate the distance between the ellipsoids and compute the first terms needed for the energy.
real sigma12 = 1/SQRT(0.5f*dot(drUnit, matrixVectorProduct(G12inv, drUnit)));
real h12 = r - sigma12;
real rho = sigma/(h12+sigma);
real rho2 = rho*rho;
real rho6 = rho2*rho2*rho2;
real u = 4*epsilon*(rho6*rho6-rho6);
real eta = SQRT(2*data1->eps.y*data2->eps.y/detG12);
real chi = 2*dot(drUnit, matrixVectorProduct(B12inv, drUnit));
chi *= chi;
real energy = u*eta*chi;
// Compute the terms needed for the force.
real3 kappa = matrixVectorProduct(G12inv, dr);
real3 iota = matrixVectorProduct(B12inv, dr);
real rInv2 = rInv*rInv;
real dUSLJdr = 24*epsilon*(2*rho6-1)*rho6*rho/sigma;
real temp = 0.5f*sigma12*sigma12*sigma12*rInv2;
real3 dudr = (drUnit + (kappa-drUnit*dot(kappa, drUnit))*temp)*dUSLJdr;
real3 dchidr = (iota-drUnit*dot(iota, drUnit))*(-8*rInv2*SQRT(chi));
real3 force = (dchidr*u + dudr*chi)*(eta*switchValue) - drUnit*(energy*switchDeriv);
*force1 += force;
*force2 -= force;
// Compute the terms needed for the torque.
for (int j = 0; j < 2; j++) {
real (*a)[3] = (j == 0 ? data1->a : data2->a);
real (*b)[3] = (j == 0 ? data1->b : data2->b);
real (*g)[3] = (j == 0 ? data1->g : data2->g);
float4 sig = (j == 0 ? data1->sig : data2->sig);
real3 dudq = cross(vectorMatrixProduct(kappa, g), kappa*(temp*dUSLJdr));
real3 dchidq = cross(vectorMatrixProduct(iota, b), iota)*(-4*rInv2);
real3 scale = (real3) (sig.y, sig.z, sig.w)*(-0.5f*eta/detG12);
real d[3][3];
d[0][0] = scale.x*(2*a[0][0]*(G12[1][1]*G12[2][2] - G12[1][2]*G12[2][1]) +
a[0][2]*(G12[1][2]*G12[0][1] + G12[1][0]*G12[2][1] - G12[1][1]*(G12[0][2] + G12[2][0])) +
a[0][1]*(G12[0][2]*G12[2][1] + G12[2][0]*G12[1][2] - G12[2][2]*(G12[0][1] + G12[1][0])));
d[0][1] = scale.x*( a[0][0]*(G12[0][2]*G12[2][1] + G12[2][0]*G12[1][2] - G12[2][2]*(G12[0][1] + G12[1][0])) +
2*a[0][1]*(G12[0][0]*G12[2][2] - G12[2][0]*G12[0][2]) +
a[0][2]*(G12[1][0]*G12[0][2] + G12[2][0]*G12[0][1] - G12[0][0]*(G12[1][2] + G12[2][1])));
d[0][2] = scale.x*( a[0][0]*(G12[0][1]*G12[1][2] + G12[1][0]*G12[2][1] - G12[1][1]*(G12[0][2] + G12[2][0])) +
a[0][1]*(G12[1][0]*G12[0][2] + G12[2][0]*G12[0][1] - G12[0][0]*(G12[1][2] + G12[2][1])) +
2*a[0][2]*(G12[1][1]*G12[0][0] - G12[1][0]*G12[0][1]));
d[1][0] = scale.y*(2*a[1][0]*(G12[1][1]*G12[2][2] - G12[1][2]*G12[2][1]) +
a[1][1]*(G12[0][2]*G12[2][1] + G12[2][0]*G12[1][2] - G12[2][2]*(G12[0][1] + G12[1][0])) +
a[1][2]*(G12[1][2]*G12[0][1] + G12[1][0]*G12[2][1] - G12[1][1]*(G12[0][2] + G12[2][0])));
d[1][1] = scale.y*( a[1][0]*(G12[0][2]*G12[2][1] + G12[2][0]*G12[1][2] - G12[2][2]*(G12[0][1] + G12[1][0])) +
2*a[1][1]*(G12[2][2]*G12[0][0] - G12[2][0]*G12[0][2]) +
a[1][2]*(G12[1][0]*G12[0][2] + G12[0][1]*G12[2][0] - G12[0][0]*(G12[1][2] + G12[2][1])));
d[1][2] = scale.y*( a[1][0]*(G12[0][1]*G12[1][2] + G12[1][0]*G12[2][1] - G12[1][1]*(G12[0][2] + G12[2][0])) +
a[1][1]*(G12[1][0]*G12[0][2] + G12[0][1]*G12[2][0] - G12[0][0]*(G12[1][2] + G12[2][1])) +
2*a[1][2]*(G12[1][1]*G12[0][0] - G12[1][0]*G12[0][1]));
d[2][0] = scale.z*(2*a[2][0]*(G12[1][1]*G12[2][2] - G12[2][1]*G12[1][2]) +
a[2][1]*(G12[0][2]*G12[2][1] + G12[1][2]*G12[2][0] - G12[2][2]*(G12[0][1] + G12[1][0])) +
a[2][2]*(G12[0][1]*G12[1][2] + G12[2][1]*G12[1][0] - G12[1][1]*(G12[0][2] + G12[2][0])));
d[2][1] = scale.z*( a[2][0]*(G12[0][2]*G12[2][1] + G12[1][2]*G12[2][0] - G12[2][2]*(G12[0][1] + G12[1][0])) +
2*a[2][1]*(G12[0][0]*G12[2][2] - G12[0][2]*G12[2][0]) +
a[2][2]*(G12[1][0]*G12[0][2] + G12[0][1]*G12[2][0] - G12[0][0]*(G12[1][2] + G12[2][1])));
d[2][2] = scale.z*( a[2][0]*(G12[0][1]*G12[1][2] + G12[2][1]*G12[1][0] - G12[1][1]*(G12[0][2] + G12[2][0])) +
a[2][1]*(G12[1][0]*G12[0][2] + G12[2][0]*G12[0][1] - G12[0][0]*(G12[1][2] + G12[2][1])) +
2*a[2][2]*(G12[1][1]*G12[0][0] - G12[1][0]*G12[0][1]));
real3 detadq = 0;
for (int i = 0; i < 3; i++)
detadq += cross((real3) (a[i][0], a[i][1], a[i][2]), (real3) (d[i][0], d[i][1], d[i][2]));
real3 torque = (dchidq*(u*eta) + detadq*(u*chi) + dudq*(eta*chi))*switchValue;
*(j == 0 ? torque1 : torque2) -= torque;
}
*totalEnergy += switchValue*energy;
}
/**
* Compute the interactions.
*/
__kernel void computeForce(
__global long* restrict forceBuffers, __global long* restrict torqueBuffers,
int numAtoms, int numExceptions, __global mixed* restrict energyBuffer, __global const real4* restrict pos,
__global const float4* restrict sigParams, __global const float2* restrict epsParams, __global const int* restrict sortedAtoms,
__global const real* restrict aMatrix, __global const real* restrict bMatrix, __global const real* restrict gMatrix,
__global const int* restrict exclusions, __global const int* restrict exclusionStartIndex,
__global const int4* restrict exceptionParticles, __global const float2* restrict exceptionParams
#ifdef USE_CUTOFF
, int maxNeighborBlocks, __global int* restrict neighbors, __global int* restrict neighborIndex, __global int* restrict neighborBlockCount,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
#endif
) {
const unsigned int warp = get_global_id(0)/TILE_SIZE;
mixed energy = 0;
#ifdef USE_CUTOFF
const int numBlocks = *neighborBlockCount;
if (numBlocks > maxNeighborBlocks)
return; // There wasn't enough memory for the neighbor list.
for (int block = get_global_id(0); block < numBlocks; block += get_global_size(0)) {
// Load parameters for atom1.
int atom1 = neighborIndex[block];
int index1 = sortedAtoms[atom1];
AtomData data1;
loadAtomData(&data1, atom1, index1, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix);
real3 force1 = 0.0f;
real3 torque1 = 0.0f;
for (int indexInBlock = 0; indexInBlock < NEIGHBOR_BLOCK_SIZE; indexInBlock++) {
// Load parameters for atom2.
int atom2 = neighbors[NEIGHBOR_BLOCK_SIZE*block+indexInBlock];
if (atom2 == -1)
continue;
int index2 = sortedAtoms[atom2];
AtomData data2;
loadAtomData(&data2, atom2, index2, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix);
real3 force2 = 0.0f;
real3 torque2 = 0.0f;
// Compute the interaction.
real3 delta = data1.pos-data2.pos;
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real sigma = data1.sig.x+data2.sig.x;
real epsilon = data1.eps.x*data2.eps.x;
computeOneInteraction(&data1, &data2, sigma, epsilon, delta, r2, &force1, &force2, &torque1, &torque2, &energy);
atom_add(&forceBuffers[index2], (long) (force2.x*0x100000000));
atom_add(&forceBuffers[index2+PADDED_NUM_ATOMS], (long) (force2.y*0x100000000));
atom_add(&forceBuffers[index2+2*PADDED_NUM_ATOMS], (long) (force2.z*0x100000000));
atom_add(&torqueBuffers[index2], (long) (torque2.x*0x100000000));
atom_add(&torqueBuffers[index2+PADDED_NUM_ATOMS], (long) (torque2.y*0x100000000));
atom_add(&torqueBuffers[index2+2*PADDED_NUM_ATOMS], (long) (torque2.z*0x100000000));
}
atom_add(&forceBuffers[index1], (long) (force1.x*0x100000000));
atom_add(&forceBuffers[index1+PADDED_NUM_ATOMS], (long) (force1.y*0x100000000));
atom_add(&forceBuffers[index1+2*PADDED_NUM_ATOMS], (long) (force1.z*0x100000000));
atom_add(&torqueBuffers[index1], (long) (torque1.x*0x100000000));
atom_add(&torqueBuffers[index1+PADDED_NUM_ATOMS], (long) (torque1.y*0x100000000));
atom_add(&torqueBuffers[index1+2*PADDED_NUM_ATOMS], (long) (torque1.z*0x100000000));
}
#else
for (int atom1 = get_global_id(0); atom1 < numAtoms; atom1 += get_global_size(0)) {
// Load parameters for atom1.
int index1 = sortedAtoms[atom1];
AtomData data1;
loadAtomData(&data1, atom1, index1, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix);
real3 force1 = 0.0f;
real3 torque1 = 0.0f;
int nextExclusion = exclusionStartIndex[atom1];
int lastExclusion = exclusionStartIndex[atom1+1];
for (int atom2 = atom1+1; atom2 < numAtoms; atom2++) {
// Skip over excluded interactions.
if (nextExclusion < lastExclusion && exclusions[nextExclusion] == atom2) {
nextExclusion++;
continue;
}
// Load parameters for atom2.
int index2 = sortedAtoms[atom2];
AtomData data2;
loadAtomData(&data2, atom2, index2, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix);
real3 force2 = 0.0f;
real3 torque2 = 0.0f;
// Compute the interaction.
real3 delta = data1.pos-data2.pos;
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real sigma = data1.sig.x+data2.sig.x;
real epsilon = data1.eps.x*data2.eps.x;
computeOneInteraction(&data1, &data2, sigma, epsilon, delta, r2, &force1, &force2, &torque1, &torque2, &energy);
atom_add(&forceBuffers[index2], (long) (force2.x*0x100000000));
atom_add(&forceBuffers[index2+PADDED_NUM_ATOMS], (long) (force2.y*0x100000000));
atom_add(&forceBuffers[index2+2*PADDED_NUM_ATOMS], (long) (force2.z*0x100000000));
atom_add(&torqueBuffers[index2], (long) (torque2.x*0x100000000));
atom_add(&torqueBuffers[index2+PADDED_NUM_ATOMS], (long) (torque2.y*0x100000000));
atom_add(&torqueBuffers[index2+2*PADDED_NUM_ATOMS], (long) (torque2.z*0x100000000));
}
atom_add(&forceBuffers[index1], (long) (force1.x*0x100000000));
atom_add(&forceBuffers[index1+PADDED_NUM_ATOMS], (long) (force1.y*0x100000000));
atom_add(&forceBuffers[index1+2*PADDED_NUM_ATOMS], (long) (force1.z*0x100000000));
atom_add(&torqueBuffers[index1], (long) (torque1.x*0x100000000));
atom_add(&torqueBuffers[index1+PADDED_NUM_ATOMS], (long) (torque1.y*0x100000000));
atom_add(&torqueBuffers[index1+2*PADDED_NUM_ATOMS], (long) (torque1.z*0x100000000));
}
#endif
// Now compute exceptions.
for (int index = get_global_id(0); index < numExceptions; index += get_global_size(0)) {
int4 atomIndices = exceptionParticles[index];
float2 params = exceptionParams[index];
int index1 = atomIndices.x, index2 = atomIndices.y;
int atom1 = atomIndices.z, atom2 = atomIndices.w;
AtomData data1, data2;
loadAtomData(&data1, atom1, index1, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix);
loadAtomData(&data2, atom2, index2, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix);
real3 force1 = 0, force2 = 0;
real3 torque1 = 0, torque2 = 0;
real3 delta = data1.pos-data2.pos;
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
computeOneInteraction(&data1, &data2, params.x, params.y, delta, r2, &force1, &force2, &torque1, &torque2, &energy);
atom_add(&forceBuffers[index1], (long) (force1.x*0x100000000));
atom_add(&forceBuffers[index1+PADDED_NUM_ATOMS], (long) (force1.y*0x100000000));
atom_add(&forceBuffers[index1+2*PADDED_NUM_ATOMS], (long) (force1.z*0x100000000));
atom_add(&forceBuffers[index2], (long) (force2.x*0x100000000));
atom_add(&forceBuffers[index2+PADDED_NUM_ATOMS], (long) (force2.y*0x100000000));
atom_add(&forceBuffers[index2+2*PADDED_NUM_ATOMS], (long) (force2.z*0x100000000));
atom_add(&torqueBuffers[index1], (long) (torque1.x*0x100000000));
atom_add(&torqueBuffers[index1+PADDED_NUM_ATOMS], (long) (torque1.y*0x100000000));
atom_add(&torqueBuffers[index1+2*PADDED_NUM_ATOMS], (long) (torque1.z*0x100000000));
atom_add(&torqueBuffers[index2], (long) (torque2.x*0x100000000));
atom_add(&torqueBuffers[index2+PADDED_NUM_ATOMS], (long) (torque2.y*0x100000000));
atom_add(&torqueBuffers[index2+2*PADDED_NUM_ATOMS], (long) (torque2.z*0x100000000));
#ifdef USE_CUTOFF
}
#endif
}
energyBuffer[get_global_id(0)] += energy;
}
/**
* Convert the torques to forces on the connected particles.
*/
__kernel void applyTorques(
__global long* restrict forceBuffers, __global long* restrict torqueBuffers,
int numParticles, __global const real4* restrict posq, __global int2* const restrict axisParticleIndices,
__global const int* sortedParticles) {
const unsigned int warp = get_global_id(0)/TILE_SIZE;
for (int sortedIndex = get_global_id(0); sortedIndex < numParticles; sortedIndex += get_global_size(0)) {
int originalIndex = sortedParticles[sortedIndex];
real3 pos = posq[originalIndex].xyz;
int2 axisParticles = axisParticleIndices[originalIndex];
if (axisParticles.x != -1) {
// Load the torque.
real scale = 1/(real) 0x100000000;
real3 torque = (real3) (scale*torqueBuffers[originalIndex], scale*torqueBuffers[originalIndex+PADDED_NUM_ATOMS], scale*torqueBuffers[originalIndex+2*PADDED_NUM_ATOMS]);
real3 force = 0, xforce = 0, yforce = 0;
// Apply a force to the x particle.
real3 dx = posq[axisParticles.x].xyz-pos;
real dx2 = dot(dx, dx);
real3 f = cross(torque, dx)/dx2;
xforce += f;
force -= f;
if (axisParticles.y != -1) {
// Apply a force to the y particle. This is based on the component of the torque
// that was not already applied to the x particle.
real3 dy = posq[axisParticles.y].xyz-pos;
real dy2 = dot(dy, dy);
real3 torque2 = dx*dot(torque, dx)/dx2;
f = cross(torque2, dy)/dy2;
yforce += f;
force -= f;
}
atom_add(&forceBuffers[originalIndex], (long) (force.x*0x100000000));
atom_add(&forceBuffers[originalIndex+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
atom_add(&forceBuffers[originalIndex+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
atom_add(&forceBuffers[axisParticles.x], (long) (xforce.x*0x100000000));
atom_add(&forceBuffers[axisParticles.x+PADDED_NUM_ATOMS], (long) (xforce.y*0x100000000));
atom_add(&forceBuffers[axisParticles.x+2*PADDED_NUM_ATOMS], (long) (xforce.z*0x100000000));
if (axisParticles.y != -1) {
atom_add(&forceBuffers[axisParticles.y], (long) (yforce.x*0x100000000));
atom_add(&forceBuffers[axisParticles.y+PADDED_NUM_ATOMS], (long) (yforce.y*0x100000000));
atom_add(&forceBuffers[axisParticles.y+2*PADDED_NUM_ATOMS], (long) (yforce.z*0x100000000));
}
}
}
}
{
real invRSquaredOver4 = 0.25f*invR*invR;
real rScaledRadiusJ = r+OBC_PARAMS2.y;
real rScaledRadiusI = r+OBC_PARAMS1.y;
real l_ijJ = RECIP(max((real) OBC_PARAMS1.x, fabs(r-OBC_PARAMS2.y)));
real l_ijI = RECIP(max((real) OBC_PARAMS2.x, fabs(r-OBC_PARAMS1.y)));
real u_ijJ = RECIP(rScaledRadiusJ);
real u_ijI = RECIP(rScaledRadiusI);
real l_ij2J = l_ijJ*l_ijJ;
real l_ij2I = l_ijI*l_ijI;
real u_ij2J = u_ijJ*u_ijJ;
real u_ij2I = u_ijI*u_ijI;
real t1J = LOG(u_ijJ*RECIP(l_ijJ));
real t1I = LOG(u_ijI*RECIP(l_ijI));
real t2J = (l_ij2J-u_ij2J);
real t2I = (l_ij2I-u_ij2I);
real term1 = (0.5f*(0.25f+OBC_PARAMS2.y*OBC_PARAMS2.y*invRSquaredOver4)*t2J + t1J*invRSquaredOver4)*invR;
real term2 = (0.5f*(0.25f+OBC_PARAMS1.y*OBC_PARAMS1.y*invRSquaredOver4)*t2I + t1I*invRSquaredOver4)*invR;
real tempdEdR = (OBC_PARAMS1.x < rScaledRadiusJ ? BORN_FORCE1*term1 : (real) 0);
tempdEdR += (OBC_PARAMS2.x < rScaledRadiusI ? BORN_FORCE2*term2 : (real) 0);
#ifdef USE_CUTOFF
bool includeInteraction = (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2 && r2 < CUTOFF_SQUARED);
#else
bool includeInteraction = (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2);
#endif
dEdR += (includeInteraction ? tempdEdR : (real) 0);
}
float2 angleParams = PARAMS[index];
real deltaIdeal = theta-angleParams.x;
energy += 0.5f*angleParams.y*deltaIdeal*deltaIdeal;
real dEdAngle = angleParams.y*deltaIdeal;
float2 bondParams = PARAMS[index];
real deltaIdeal = r-bondParams.x;
energy += 0.5f * bondParams.y*deltaIdeal*deltaIdeal;
real dEdR = bondParams.y * deltaIdeal;
/**
* Apply a time shift to the velocities before computing kinetic energy.
*/
__kernel void timeShiftVelocities(__global mixed4* restrict velm, __global const real4* restrict force, real timeShift) {
for (int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
mixed4 velocity = velm[index];
if (velocity.w != 0.0) {
mixed4 f = convert_mixed4(force[index]);
velocity.xyz += timeShift*f.xyz*velocity.w;
velm[index] = velocity;
}
}
}
\ No newline at end of file
enum {VelScale, ForceScale, NoiseScale, MaxParams};
/**
* Perform the first step of Langevin integration.
*/
__kernel void integrateLangevinPart1(__global mixed4* restrict velm, __global const real4* restrict force, __global mixed4* restrict posDelta,
__global const mixed* restrict paramBuffer, __global const mixed2* restrict dt, __global const float4* restrict random, unsigned int randomIndex) {
mixed vscale = paramBuffer[VelScale];
mixed fscale = paramBuffer[ForceScale];
mixed noisescale = paramBuffer[NoiseScale];
mixed stepSize = dt[0].y;
int index = get_global_id(0);
randomIndex += index;
while (index < NUM_ATOMS) {
mixed4 velocity = velm[index];
if (velocity.w != 0.0) {
mixed sqrtInvMass = sqrt(velocity.w);
velocity.x = vscale*velocity.x + fscale*velocity.w*force[index].x + noisescale*sqrtInvMass*random[randomIndex].x;
velocity.y = vscale*velocity.y + fscale*velocity.w*force[index].y + noisescale*sqrtInvMass*random[randomIndex].y;
velocity.z = vscale*velocity.z + fscale*velocity.w*force[index].z + noisescale*sqrtInvMass*random[randomIndex].z;
velm[index] = velocity;
posDelta[index] = stepSize*velocity;
}
randomIndex += get_global_size(0);
index += get_global_size(0);
}
}
/**
* Perform the second step of Langevin integration.
*/
__kernel void integrateLangevinPart2(__global real4* restrict posq, __global real4* restrict posqCorrection, __global const mixed4* restrict posDelta, __global mixed4* restrict velm, __global const mixed2* restrict dt) {
#ifdef SUPPORTS_DOUBLE_PRECISION
double invStepSize = 1.0/dt[0].y;
#else
float invStepSize = 1.0f/dt[0].y;
float correction = (1.0f-invStepSize*dt[0].y)/dt[0].y;
#endif
int index = get_global_id(0);
while (index < NUM_ATOMS) {
mixed4 vel = velm[index];
if (vel.w != 0.0) {
#ifdef USE_MIXED_PRECISION
real4 pos1 = posq[index];
real4 pos2 = posqCorrection[index];
mixed4 pos = (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
#else
real4 pos = posq[index];
#endif
mixed4 delta = posDelta[index];
pos.xyz += delta.xyz;
#ifdef SUPPORTS_DOUBLE_PRECISION
vel.xyz = convert_mixed4(invStepSize*convert_double4(delta)).xyz;
#else
vel.xyz = invStepSize*delta.xyz + correction*delta.xyz;
#endif
#ifdef USE_MIXED_PRECISION
posq[index] = convert_real4(pos);
posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
#else
posq[index] = pos;
#endif
velm[index] = vel;
}
index += get_global_size(0);
}
}
/**
* Select the step size to use for the next step.
*/
__kernel void selectLangevinStepSize(mixed maxStepSize, mixed errorTol, mixed friction, mixed kT, __global mixed2* restrict dt,
__global const mixed4* restrict velm, __global const real4* restrict force, __global mixed* restrict paramBuffer, __local mixed* restrict params, __local mixed* restrict error) {
// Calculate the error.
mixed err = 0.0f;
unsigned int index = get_local_id(0);
while (index < NUM_ATOMS) {
real4 f = force[index];
mixed invMass = velm[index].w;
err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass*invMass;
index += get_global_size(0);
}
error[get_local_id(0)] = err;
barrier(CLK_LOCAL_MEM_FENCE);
// Sum the errors from all threads.
for (unsigned int offset = 1; offset < get_local_size(0); offset *= 2) {
if (get_local_id(0)+offset < get_local_size(0) && (get_local_id(0)&(2*offset-1)) == 0)
error[get_local_id(0)] += error[get_local_id(0)+offset];
barrier(CLK_LOCAL_MEM_FENCE);
}
if (get_global_id(0) == 0) {
// Select the new step size.
mixed totalError = sqrt(error[0]/(NUM_ATOMS*3));
mixed newStepSize = sqrt(errorTol/totalError);
mixed oldStepSize = dt[0].y;
if (oldStepSize > 0.0f)
newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase.
if (newStepSize > oldStepSize && newStepSize < 1.1f*oldStepSize)
newStepSize = oldStepSize; // Keeping dt constant between steps improves the behavior of the integrator.
if (newStepSize > maxStepSize)
newStepSize = maxStepSize;
dt[0].y = newStepSize;
// Recalculate the integration parameters.
mixed vscale = exp(-newStepSize*friction);
mixed fscale = (friction == 0 ? newStepSize : (1-vscale)/friction);
mixed noisescale = sqrt(kT*(1-vscale*vscale));
params[VelScale] = vscale;
params[ForceScale] = fscale;
params[NoiseScale] = noisescale;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < MaxParams)
paramBuffer[get_local_id(0)] = params[get_local_id(0)];
}
float4 torsionParams = PARAMS[index];
real deltaAngle = torsionParams.z*theta-torsionParams.y;
energy += torsionParams.x*(1.0f+cos(deltaAngle));
real sinDeltaAngle = sin(deltaAngle);
real dEdAngle = -torsionParams.x*torsionParams.z*sinDeltaAngle;
/**
* Generate random numbers
*/
__kernel void generateRandomNumbers(int numValues, __global float4* restrict random, __global uint4* restrict seed) {
int index = get_global_id(0);
uint4 state = seed[index];
unsigned int carry = 0;
while (index < numValues) {
float4 value;
// Generate first two values.
state.x = state.x * 69069 + 1;
state.y ^= state.y << 13;
state.y ^= state.y >> 17;
state.y ^= state.y << 5;
unsigned int k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
unsigned int m = state.w + state.w + state.z + carry;
state.z = state.w;
state.w = m;
carry = k >> 30;
float x1 = (float)max(state.x + state.y + state.w, 0x00000001u) / (float)0xffffffff;
state.x = state.x * 69069 + 1;
state.y ^= state.y << 13;
state.y ^= state.y >> 17;
state.y ^= state.y << 5;
x1 = SQRT(-2.0f * LOG(x1));
k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
m = state.w + state.w + state.z + carry;
state.z = state.w;
state.w = m;
carry = k >> 30;
float x2 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
value.x = x1 * cos(2.0f * 3.14159265f * x2);
value.y = x1 * sin(2.0f * 3.14159265f * x2);
// Generate next two values.
state.x = state.x * 69069 + 1;
state.y ^= state.y << 13;
state.y ^= state.y >> 17;
state.y ^= state.y << 5;
k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
m = state.w + state.w + state.z + carry;
state.z = state.w;
state.w = m;
carry = k >> 30;
float x3 = (float)max(state.x + state.y + state.w, 0x00000001u) / (float)0xffffffff;
state.x = state.x * 69069 + 1;
state.y ^= state.y << 13;
state.y ^= state.y >> 17;
state.y ^= state.y << 5;
x3 = SQRT(-2.0f * LOG(x3));
k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
m = state.w + state.w + state.z + carry;
state.z = state.w;
state.w = m;
carry = k >> 30;
float x4 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
value.z = x3 * cos(2.0f * 3.14159265f * x4);
value.w = x3 * sin(2.0f * 3.14159265f * x4);
// Record the values.
random[index] = value;
index += get_global_size(0);
}
seed[get_global_id(0)] = state;
}
float8 torsionParams = PARAMS[index];
if (theta < 0.0f)
theta += PI;
else
theta -= PI;
cosangle = -cosangle;
real cosFactor = cosangle;
real dEdAngle = -torsionParams.s1;
real rbEnergy = torsionParams.s0;
rbEnergy += torsionParams.s1*cosFactor;
dEdAngle -= 2.0f*torsionParams.s2*cosFactor;
cosFactor *= cosangle;
dEdAngle -= 3.0f*torsionParams.s3*cosFactor;
rbEnergy += torsionParams.s2*cosFactor;
cosFactor *= cosangle;
dEdAngle -= 4.0f*torsionParams.s4*cosFactor;
rbEnergy += torsionParams.s3*cosFactor;
cosFactor *= cosangle;
dEdAngle -= 5.0f*torsionParams.s5*cosFactor;
rbEnergy += torsionParams.s4*cosFactor;
rbEnergy += torsionParams.s5*cosFactor*cosangle;
energy += rbEnergy;
dEdAngle *= sin(theta);
// This file contains kernels to compute the RMSD and its gradient using the algorithm described
// in Coutsias et al, "Using quaternions to calculate RMSD" (doi: 10.1002/jcc.20110).
/**
* Sum a value over all threads.
*/
real reduceValue(real value, __local volatile real* temp) {
const int thread = get_local_id(0);
barrier(CLK_LOCAL_MEM_FENCE);
temp[thread] = value;
barrier(CLK_LOCAL_MEM_FENCE);
for (uint step = 1; step < 32; step *= 2) {
if (thread+step < get_local_size(0) && thread%(2*step) == 0)
temp[thread] = temp[thread] + temp[thread+step];
SYNC_WARPS;
}
for (uint step = 32; step < get_local_size(0); step *= 2) {
if (thread+step < get_local_size(0) && thread%(2*step) == 0)
temp[thread] = temp[thread] + temp[thread+step];
barrier(CLK_LOCAL_MEM_FENCE);
}
return temp[0];
}
/**
* Perform the first step of computing the RMSD. This is executed as a single work group.
*/
__kernel void computeRMSDPart1(int numParticles, __global const real4* restrict posq, __global const real4* restrict referencePos,
__global const int* restrict particles, __global real* buffer, __local volatile real* restrict temp) {
// Compute the center of the particle positions.
real3 center = (real3) 0;
for (int i = get_local_id(0); i < numParticles; i += get_local_size(0))
center += posq[particles[i]].xyz;
center.x = reduceValue(center.x, temp)/numParticles;
center.y = reduceValue(center.y, temp)/numParticles;
center.z = reduceValue(center.z, temp)/numParticles;
// Compute the correlation matrix.
real R[3][3] = {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}};
real sum = 0;
for (int i = get_local_id(0); i < numParticles; i += get_local_size(0)) {
int index = particles[i];
real3 pos = posq[index].xyz - center;
real3 refPos = referencePos[index].xyz;
R[0][0] += pos.x*refPos.x;
R[0][1] += pos.x*refPos.y;
R[0][2] += pos.x*refPos.z;
R[1][0] += pos.y*refPos.x;
R[1][1] += pos.y*refPos.y;
R[1][2] += pos.y*refPos.z;
R[2][0] += pos.z*refPos.x;
R[2][1] += pos.z*refPos.y;
R[2][2] += pos.z*refPos.z;
sum += dot(pos, pos);
}
for (int i = 0; i < 3; i++)
for (int j = 0; j < 3; j++)
R[i][j] = reduceValue(R[i][j], temp);
sum = reduceValue(sum, temp);
// Copy everything into the output buffer to send back to the host.
if (get_local_id(0) == 0) {
for (int i = 0; i < 3; i++)
for (int j = 0; j < 3; j++)
buffer[3*i+j] = R[i][j];
buffer[9] = sum;
buffer[10] = center.x;
buffer[11] = center.y;
buffer[12] = center.z;
}
}
/**
* Apply forces based on the RMSD.
*/
__kernel void computeRMSDForces(int numParticles, __global const real4* restrict posq, __global const real4* restrict referencePos,
__global const int* restrict particles, __global const real* buffer, __global real4* restrict forceBuffers) {
real3 center = (real3) (buffer[10], buffer[11], buffer[12]);
real scale = 1 / (real) (buffer[9]*numParticles);
for (int i = get_global_id(0); i < numParticles; i += get_global_size(0)) {
int index = particles[i];
real3 pos = posq[index].xyz - center;
real3 refPos = referencePos[index].xyz;
real3 rotatedRef = (real3) (buffer[0]*refPos.x + buffer[3]*refPos.y + buffer[6]*refPos.z,
buffer[1]*refPos.x + buffer[4]*refPos.y + buffer[7]*refPos.z,
buffer[2]*refPos.x + buffer[5]*refPos.y + buffer[8]*refPos.z);
forceBuffers[index].xyz -= (pos-rotatedRef)*scale;
}
}
mixed4 loadPos(__global const real4* restrict posq, __global const real4* restrict posqCorrection, int index) {
#ifdef USE_MIXED_PRECISION
real4 pos1 = posq[index];
real4 pos2 = posqCorrection[index];
return (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
#else
return posq[index];
#endif
}
/**
* Enforce constraints on SETTLE clusters
*/
__kernel void applySettle(int numClusters, mixed tol, __global const real4* restrict oldPos, __global const real4* restrict posCorrection, __global mixed4* restrict posDelta, __global const mixed4* restrict velm, __global const int4* restrict clusterAtoms, __global const float2* restrict clusterParams) {
int index = get_global_id(0);
while (index < numClusters) {
// Load the data for this cluster.
int4 atoms = clusterAtoms[index];
float2 params = clusterParams[index];
mixed4 apos0 = loadPos(oldPos, posCorrection, atoms.x);
mixed4 xp0 = posDelta[atoms.x];
mixed4 apos1 = loadPos(oldPos, posCorrection, atoms.y);
mixed4 xp1 = posDelta[atoms.y];
mixed4 apos2 = loadPos(oldPos, posCorrection, atoms.z);
mixed4 xp2 = posDelta[atoms.z];
mixed m0 = 1/velm[atoms.x].w;
mixed m1 = 1/velm[atoms.y].w;
mixed m2 = 1/velm[atoms.z].w;
// Apply the SETTLE algorithm.
mixed xb0 = apos1.x-apos0.x;
mixed yb0 = apos1.y-apos0.y;
mixed zb0 = apos1.z-apos0.z;
mixed xc0 = apos2.x-apos0.x;
mixed yc0 = apos2.y-apos0.y;
mixed zc0 = apos2.z-apos0.z;
mixed invTotalMass = 1.0f/(m0+m1+m2);
mixed xcom = (xp0.x*m0 + (xb0+xp1.x)*m1 + (xc0+xp2.x)*m2) * invTotalMass;
mixed ycom = (xp0.y*m0 + (yb0+xp1.y)*m1 + (yc0+xp2.y)*m2) * invTotalMass;
mixed zcom = (xp0.z*m0 + (zb0+xp1.z)*m1 + (zc0+xp2.z)*m2) * invTotalMass;
mixed xa1 = xp0.x - xcom;
mixed ya1 = xp0.y - ycom;
mixed za1 = xp0.z - zcom;
mixed xb1 = xb0 + xp1.x - xcom;
mixed yb1 = yb0 + xp1.y - ycom;
mixed zb1 = zb0 + xp1.z - zcom;
mixed xc1 = xc0 + xp2.x - xcom;
mixed yc1 = yc0 + xp2.y - ycom;
mixed zc1 = zc0 + xp2.z - zcom;
mixed xaksZd = yb0*zc0 - zb0*yc0;
mixed yaksZd = zb0*xc0 - xb0*zc0;
mixed zaksZd = xb0*yc0 - yb0*xc0;
mixed xaksXd = ya1*zaksZd - za1*yaksZd;
mixed yaksXd = za1*xaksZd - xa1*zaksZd;
mixed zaksXd = xa1*yaksZd - ya1*xaksZd;
mixed xaksYd = yaksZd*zaksXd - zaksZd*yaksXd;
mixed yaksYd = zaksZd*xaksXd - xaksZd*zaksXd;
mixed zaksYd = xaksZd*yaksXd - yaksZd*xaksXd;
mixed axlng = sqrt(xaksXd*xaksXd + yaksXd*yaksXd + zaksXd*zaksXd);
mixed aylng = sqrt(xaksYd*xaksYd + yaksYd*yaksYd + zaksYd*zaksYd);
mixed azlng = sqrt(xaksZd*xaksZd + yaksZd*yaksZd + zaksZd*zaksZd);
mixed trns11 = xaksXd / axlng;
mixed trns21 = yaksXd / axlng;
mixed trns31 = zaksXd / axlng;
mixed trns12 = xaksYd / aylng;
mixed trns22 = yaksYd / aylng;
mixed trns32 = zaksYd / aylng;
mixed trns13 = xaksZd / azlng;
mixed trns23 = yaksZd / azlng;
mixed trns33 = zaksZd / azlng;
mixed xb0d = trns11*xb0 + trns21*yb0 + trns31*zb0;
mixed yb0d = trns12*xb0 + trns22*yb0 + trns32*zb0;
mixed xc0d = trns11*xc0 + trns21*yc0 + trns31*zc0;
mixed yc0d = trns12*xc0 + trns22*yc0 + trns32*zc0;
mixed za1d = trns13*xa1 + trns23*ya1 + trns33*za1;
mixed xb1d = trns11*xb1 + trns21*yb1 + trns31*zb1;
mixed yb1d = trns12*xb1 + trns22*yb1 + trns32*zb1;
mixed zb1d = trns13*xb1 + trns23*yb1 + trns33*zb1;
mixed xc1d = trns11*xc1 + trns21*yc1 + trns31*zc1;
mixed yc1d = trns12*xc1 + trns22*yc1 + trns32*zc1;
mixed zc1d = trns13*xc1 + trns23*yc1 + trns33*zc1;
// --- Step2 A2' ---
float rc = 0.5*params.y;
mixed rb = sqrt(params.x*params.x-rc*rc);
mixed ra = rb*(m1+m2)*invTotalMass;
rb -= ra;
mixed sinphi = za1d / ra;
mixed cosphi = sqrt(1.0f - sinphi*sinphi);
mixed sinpsi = (zb1d - zc1d) / (2*rc*cosphi);
mixed cospsi = sqrt(1.0f - sinpsi*sinpsi);
mixed ya2d = ra*cosphi;
mixed xb2d = - rc*cospsi;
mixed yb2d = - rb*cosphi - rc*sinpsi*sinphi;
mixed yc2d = - rb*cosphi + rc*sinpsi*sinphi;
mixed xb2d2 = xb2d*xb2d;
mixed hh2 = 4.0f*xb2d2 + (yb2d-yc2d)*(yb2d-yc2d) + (zb1d-zc1d)*(zb1d-zc1d);
mixed deltx = 2.0f*xb2d + sqrt(4.0f*xb2d2 - hh2 + params.y*params.y);
xb2d -= deltx*0.5;
// --- Step3 al,be,ga ---
mixed alpha = (xb2d*(xb0d-xc0d) + yb0d*yb2d + yc0d*yc2d);
mixed beta = (xb2d*(yc0d-yb0d) + xb0d*yb2d + xc0d*yc2d);
mixed gamma = xb0d*yb1d - xb1d*yb0d + xc0d*yc1d - xc1d*yc0d;
mixed al2be2 = alpha*alpha + beta*beta;
mixed sintheta = (alpha*gamma - beta*sqrt(al2be2 - gamma*gamma)) / al2be2;
// --- Step4 A3' ---
mixed costheta = sqrt(1.0f - sintheta*sintheta);
mixed xa3d = - ya2d*sintheta;
mixed ya3d = ya2d*costheta;
mixed za3d = za1d;
mixed xb3d = xb2d*costheta - yb2d*sintheta;
mixed yb3d = xb2d*sintheta + yb2d*costheta;
mixed zb3d = zb1d;
mixed xc3d = - xb2d*costheta - yc2d*sintheta;
mixed yc3d = - xb2d*sintheta + yc2d*costheta;
mixed zc3d = zc1d;
// --- Step5 A3 ---
mixed xa3 = trns11*xa3d + trns12*ya3d + trns13*za3d;
mixed ya3 = trns21*xa3d + trns22*ya3d + trns23*za3d;
mixed za3 = trns31*xa3d + trns32*ya3d + trns33*za3d;
mixed xb3 = trns11*xb3d + trns12*yb3d + trns13*zb3d;
mixed yb3 = trns21*xb3d + trns22*yb3d + trns23*zb3d;
mixed zb3 = trns31*xb3d + trns32*yb3d + trns33*zb3d;
mixed xc3 = trns11*xc3d + trns12*yc3d + trns13*zc3d;
mixed yc3 = trns21*xc3d + trns22*yc3d + trns23*zc3d;
mixed zc3 = trns31*xc3d + trns32*yc3d + trns33*zc3d;
xp0.x = xcom + xa3;
xp0.y = ycom + ya3;
xp0.z = zcom + za3;
xp1.x = xcom + xb3 - xb0;
xp1.y = ycom + yb3 - yb0;
xp1.z = zcom + zb3 - zb0;
xp2.x = xcom + xc3 - xc0;
xp2.y = ycom + yc3 - yc0;
xp2.z = zcom + zc3 - zc0;
// Record the new positions.
posDelta[atoms.x] = xp0;
posDelta[atoms.y] = xp1;
posDelta[atoms.z] = xp2;
index += get_global_size(0);
}
}
/**
* Enforce velocity constraints on SETTLE clusters
*/
__kernel void constrainVelocities(int numClusters, mixed tol, __global const real4* restrict oldPos, __global const real4* restrict posCorrection, __global mixed4* restrict posDelta, __global mixed4* restrict velm, __global const int4* restrict clusterAtoms, __global const float2* restrict clusterParams) {
for (int index = get_global_id(0); index < numClusters; index += get_global_size(0)) {
// Load the data for this cluster.
int4 atoms = clusterAtoms[index];
mixed4 apos0 = loadPos(oldPos, posCorrection, atoms.x);
mixed4 apos1 = loadPos(oldPos, posCorrection, atoms.y);
mixed4 apos2 = loadPos(oldPos, posCorrection, atoms.z);
mixed4 v0 = velm[atoms.x];
mixed4 v1 = velm[atoms.y];
mixed4 v2 = velm[atoms.z];
// Compute intermediate quantities: the atom masses, the bond directions, the relative velocities,
// and the angle cosines and sines.
mixed mA = 1/v0.w;
mixed mB = 1/v1.w;
mixed mC = 1/v2.w;
mixed4 eAB = apos1-apos0;
mixed4 eBC = apos2-apos1;
mixed4 eCA = apos0-apos2;
eAB.xyz /= sqrt(eAB.x*eAB.x + eAB.y*eAB.y + eAB.z*eAB.z);
eBC.xyz /= sqrt(eBC.x*eBC.x + eBC.y*eBC.y + eBC.z*eBC.z);
eCA.xyz /= sqrt(eCA.x*eCA.x + eCA.y*eCA.y + eCA.z*eCA.z);
mixed vAB = (v1.x-v0.x)*eAB.x + (v1.y-v0.y)*eAB.y + (v1.z-v0.z)*eAB.z;
mixed vBC = (v2.x-v1.x)*eBC.x + (v2.y-v1.y)*eBC.y + (v2.z-v1.z)*eBC.z;
mixed vCA = (v0.x-v2.x)*eCA.x + (v0.y-v2.y)*eCA.y + (v0.z-v2.z)*eCA.z;
mixed cA = -(eAB.x*eCA.x + eAB.y*eCA.y + eAB.z*eCA.z);
mixed cB = -(eAB.x*eBC.x + eAB.y*eBC.y + eAB.z*eBC.z);
mixed cC = -(eBC.x*eCA.x + eBC.y*eCA.y + eBC.z*eCA.z);
mixed s2A = 1-cA*cA;
mixed s2B = 1-cB*cB;
mixed s2C = 1-cC*cC;
// Solve the equations. These are different from those in the SETTLE paper (JCC 13(8), pp. 952-962, 1992), because
// in going from equations B1 to B2, they make the assumption that mB=mC (but don't bother to mention they're
// making that assumption). We allow all three atoms to have different masses.
mixed mABCinv = 1/(mA*mB*mC);
mixed denom = (((s2A*mB+s2B*mA)*mC+(s2A*mB*mB+2*(cA*cB*cC+1)*mA*mB+s2B*mA*mA))*mC+s2C*mA*mB*(mA+mB))*mABCinv;
mixed tab = ((cB*cC*mA-cA*mB-cA*mC)*vCA + (cA*cC*mB-cB*mC-cB*mA)*vBC + (s2C*mA*mA*mB*mB*mABCinv+(mA+mB+mC))*vAB)/denom;
mixed tbc = ((cA*cB*mC-cC*mB-cC*mA)*vCA + (s2A*mB*mB*mC*mC*mABCinv+(mA+mB+mC))*vBC + (cA*cC*mB-cB*mA-cB*mC)*vAB)/denom;
mixed tca = ((s2B*mA*mA*mC*mC*mABCinv+(mA+mB+mC))*vCA + (cA*cB*mC-cC*mB-cC*mA)*vBC + (cB*cC*mA-cA*mB-cA*mC)*vAB)/denom;
v0.xyz += (tab*eAB.xyz - tca*eCA.xyz)*v0.w;
v1.xyz += (tbc*eBC.xyz - tab*eAB.xyz)*v1.w;
v2.xyz += (tca*eCA.xyz - tbc*eBC.xyz)*v2.w;
velm[atoms.x] = v0;
velm[atoms.y] = v1;
velm[atoms.z] = v2;
}
}
\ No newline at end of file
mixed4 loadPos(__global const real4* restrict posq, __global const real4* restrict posqCorrection, int index) {
#ifdef USE_MIXED_PRECISION
real4 pos1 = posq[index];
real4 pos2 = posqCorrection[index];
return (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
#else
return posq[index];
#endif
}
/**
* Enforce constraints on SHAKE clusters
*/
__kernel void applyShakeToHydrogens(int numClusters, mixed tol, __global const real4* restrict oldPos, __global const real4* restrict posCorrection, __global mixed4* restrict posDelta, __global const int4* restrict clusterAtoms, __global const float4* restrict clusterParams) {
int index = get_global_id(0);
while (index < numClusters) {
// Load the data for this cluster.
int4 atoms = clusterAtoms[index];
float4 params = clusterParams[index];
mixed4 pos = loadPos(oldPos, posCorrection, atoms.x);
mixed4 xpi = posDelta[atoms.x];
mixed4 pos1 = loadPos(oldPos, posCorrection, atoms.y);
mixed4 xpj1 = posDelta[atoms.y];
mixed4 pos2 = {0.0f, 0.0f, 0.0f, 0.0f};
mixed4 xpj2 = {0.0f, 0.0f, 0.0f, 0.0f};
float invMassCentral = params.x;
float avgMass = params.y;
float d2 = params.z;
float invMassPeripheral = params.w;
if (atoms.z != -1) {
pos2 = loadPos(oldPos, posCorrection, atoms.z);
xpj2 = posDelta[atoms.z];
}
mixed4 pos3 = {0.0f, 0.0f, 0.0f, 0.0f};
mixed4 xpj3 = {0.0f, 0.0f, 0.0f, 0.0f};
if (atoms.w != -1) {
pos3 = loadPos(oldPos, posCorrection, atoms.w);
xpj3 = posDelta[atoms.w];
}
// Precompute quantities.
mixed4 rij1 = pos-pos1;
mixed4 rij2 = pos-pos2;
mixed4 rij3 = pos-pos3;
mixed rij1sq = rij1.x*rij1.x + rij1.y*rij1.y + rij1.z*rij1.z;
mixed rij2sq = rij2.x*rij2.x + rij2.y*rij2.y + rij2.z*rij2.z;
mixed rij3sq = rij3.x*rij3.x + rij3.y*rij3.y + rij3.z*rij3.z;
mixed ld1 = d2-rij1sq;
mixed ld2 = d2-rij2sq;
mixed ld3 = d2-rij3sq;
// Iterate until convergence.
int converged = false;
int iteration = 0;
while (iteration < 15 && !converged) {
converged = true;
#ifdef CONSTRAIN_VELOCITIES
mixed4 rpij = xpi-xpj1;
mixed rrpr = rpij.x*rij1.x + rpij.y*rij1.y + rpij.z*rij1.z;
mixed delta = -2.0f*avgMass*rrpr/rij1sq;
mixed4 dr = rij1*delta;
xpi.xyz += dr.xyz*invMassCentral;
xpj1.xyz -= dr.xyz*invMassPeripheral;
if (fabs(delta) > tol)
converged = false;
if (atoms.z != -1) {
rpij = xpi-xpj2;
rrpr = rpij.x*rij2.x + rpij.y*rij2.y + rpij.z*rij2.z;
delta = -2.0f*avgMass*rrpr/rij2sq;
dr = rij2*delta;
xpi.xyz += dr.xyz*invMassCentral;
xpj2.xyz -= dr.xyz*invMassPeripheral;
if (fabs(delta) > tol)
converged = false;
}
if (atoms.w != -1) {
rpij = xpi-xpj3;
rrpr = rpij.x*rij3.x + rpij.y*rij3.y + rpij.z*rij3.z;
delta = -2.0f*avgMass*rrpr/rij3sq;
dr = rij3*delta;
xpi.xyz += dr.xyz*invMassCentral;
xpj3.xyz -= dr.xyz*invMassPeripheral;
if (fabs(delta) > tol)
converged = false;
}
#else
mixed4 rpij = xpi-xpj1;
mixed rpsqij = rpij.x*rpij.x + rpij.y*rpij.y + rpij.z*rpij.z;
mixed rrpr = rij1.x*rpij.x + rij1.y*rpij.y + rij1.z*rpij.z;
mixed diff = fabs(ld1-2.0f*rrpr-rpsqij) / (d2*tol);
if (diff >= 1.0f) {
mixed acor = (ld1-2.0f*rrpr-rpsqij)*avgMass / (rrpr+rij1sq);
mixed4 dr = rij1*acor;
xpi.xyz += dr.xyz*invMassCentral;
xpj1.xyz -= dr.xyz*invMassPeripheral;
converged = false;
}
if (atoms.z != -1) {
rpij.xyz = xpi.xyz-xpj2.xyz;
rpsqij = rpij.x*rpij.x + rpij.y*rpij.y + rpij.z*rpij.z;
rrpr = rij2.x*rpij.x + rij2.y*rpij.y + rij2.z*rpij.z;
diff = fabs(ld2-2.0f*rrpr-rpsqij) / (d2*tol);
if (diff >= 1.0f) {
mixed acor = (ld2 - 2.0f*rrpr - rpsqij)*avgMass / (rrpr + rij2sq);
mixed4 dr = rij2*acor;
xpi.xyz += dr.xyz*invMassCentral;
xpj2.xyz -= dr.xyz*invMassPeripheral;
converged = false;
}
}
if (atoms.w != -1) {
rpij.xyz = xpi.xyz-xpj3.xyz;
rpsqij = rpij.x*rpij.x + rpij.y*rpij.y + rpij.z*rpij.z;
rrpr = rij3.x*rpij.x + rij3.y*rpij.y + rij3.z*rpij.z;
diff = fabs(ld3 - 2.0f*rrpr - rpsqij) / (d2*tol);
if (diff >= 1.0f) {
mixed acor = (ld3-2.0f*rrpr-rpsqij)*avgMass / (rrpr+rij3sq);
mixed4 dr = rij3*acor;
xpi.xyz += dr.xyz*invMassCentral;
xpj3.xyz -= dr.xyz*invMassPeripheral;
converged = false;
}
}
#endif
iteration++;
}
// Record the new positions.
posDelta[atoms.x] = xpi;
posDelta[atoms.y] = xpj1;
if (atoms.z != -1)
posDelta[atoms.z] = xpj2;
if (atoms.w != -1)
posDelta[atoms.w] = xpj3;
index += get_global_size(0);
}
}
......@@ -81,21 +81,26 @@ __kernel void reduceReal4Buffer(__global real4* restrict buffer, int bufferSize,
}
}
#ifdef SUPPORTS_64_BIT_ATOMICS
/**
* Sum the various buffers containing forces.
*/
__kernel void reduceForces(__global const long* restrict longBuffer, __global real4* restrict buffer, int bufferSize, int numBuffers) {
__kernel void reduceForces(__global long* restrict longBuffer, __global real4* restrict buffer, int bufferSize, int numBuffers) {
int totalSize = bufferSize*numBuffers;
real scale = 1/(real) 0x100000000;
for (int index = get_global_id(0); index < bufferSize; index += get_global_size(0)) {
#ifdef SUPPORTS_64_BIT_ATOMICS
real4 sum = (real4) (scale*longBuffer[index], scale*longBuffer[index+bufferSize], scale*longBuffer[index+2*bufferSize], 0);
#else
real4 sum = (real4) 0;
#endif
for (int i = index; i < totalSize; i += bufferSize)
sum += buffer[i];
buffer[index] = sum;
longBuffer[index] = (long) (sum.x*0x100000000);
longBuffer[index+bufferSize] = (long) (sum.y*0x100000000);
longBuffer[index+2*bufferSize] = (long) (sum.z*0x100000000);
}
}
#endif
/**
* Sum the energy buffer.
......
/**
* Perform the first step of verlet integration.
*/
__kernel void integrateVerletPart1(int numAtoms, __global const mixed2* restrict dt, __global const real4* restrict posq, __global const real4* restrict posqCorrection, __global mixed4* restrict velm, __global const real4* restrict force, __global mixed4* restrict posDelta) {
mixed2 stepSize = dt[0];
mixed dtPos = stepSize.y;
mixed dtVel = 0.5f*(stepSize.x+stepSize.y);
int index = get_global_id(0);
while (index < numAtoms) {
mixed4 velocity = velm[index];
if (velocity.w != 0.0) {
#ifdef USE_MIXED_PRECISION
real4 pos1 = posq[index];
real4 pos2 = posqCorrection[index];
mixed4 pos = (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
#else
real4 pos = posq[index];
#endif
velocity.x += force[index].x*dtVel*velocity.w;
velocity.y += force[index].y*dtVel*velocity.w;
velocity.z += force[index].z*dtVel*velocity.w;
pos.xyz = velocity.xyz*dtPos;
posDelta[index] = pos;
velm[index] = velocity;
}
index += get_global_size(0);
}
}
/**
* Perform the second step of verlet integration.
*/
__kernel void integrateVerletPart2(int numAtoms, __global mixed2* restrict dt, __global real4* restrict posq, __global real4* restrict posqCorrection, __global mixed4* restrict velm, __global const mixed4* restrict posDelta) {
mixed2 stepSize = dt[0];
#ifdef SUPPORTS_DOUBLE_PRECISION
double oneOverDt = 1.0/stepSize.y;
#else
float oneOverDt = 1.0f/stepSize.y;
float correction = (1.0f-oneOverDt*stepSize.y)/stepSize.y;
#endif
if (get_global_id(0) == 0)
dt[0].x = stepSize.y;
barrier(CLK_LOCAL_MEM_FENCE);
int index = get_global_id(0);
while (index < numAtoms) {
mixed4 velocity = velm[index];
if (velocity.w != 0.0) {
#ifdef USE_MIXED_PRECISION
real4 pos1 = posq[index];
real4 pos2 = posqCorrection[index];
mixed4 pos = (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
#else
real4 pos = posq[index];
#endif
mixed4 delta = posDelta[index];
pos.xyz += delta.xyz;
#ifdef SUPPORTS_DOUBLE_PRECISION
velocity.xyz = convert_mixed4(convert_double4(delta)*oneOverDt).xyz;
#else
velocity.xyz = delta.xyz*oneOverDt + delta.xyz*correction;
#endif
#ifdef USE_MIXED_PRECISION
posq[index] = convert_real4(pos);
posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
#else
posq[index] = pos;
#endif
velm[index] = velocity;
}
index += get_global_size(0);
}
}
/**
* Select the step size to use for the next step.
*/
__kernel void selectVerletStepSize(int numAtoms, mixed maxStepSize, mixed errorTol, __global mixed2* restrict dt, __global const mixed4* restrict velm, __global const real4* restrict force, __local mixed* restrict error) {
// Calculate the error.
mixed err = 0;
int index = get_local_id(0);
while (index < numAtoms) {
real4 f = force[index];
mixed invMass = velm[index].w;
err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass*invMass;
index += get_global_size(0);
}
error[get_local_id(0)] = err;
barrier(CLK_LOCAL_MEM_FENCE);
// Sum the errors from all threads.
for (unsigned int offset = 1; offset < get_local_size(0); offset *= 2) {
if (get_local_id(0)+offset < get_local_size(0) && (get_local_id(0)&(2*offset-1)) == 0)
error[get_local_id(0)] += error[get_local_id(0)+offset];
barrier(CLK_LOCAL_MEM_FENCE);
}
if (get_local_id(0) == 0) {
mixed totalError = sqrt(error[0]/(numAtoms*3));
mixed newStepSize = sqrt(errorTol/totalError);
mixed oldStepSize = dt[0].y;
if (oldStepSize > 0.0f)
newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase.
if (newStepSize > oldStepSize && newStepSize < 1.1f*oldStepSize)
newStepSize = oldStepSize; // Keeping dt constant between steps improves the behavior of the integrator.
if (newStepSize > maxStepSize)
newStepSize = maxStepSize;
dt[0].y = newStepSize;
}
}
/**
* Load the position of a particle.
*/
mixed4 loadPos(__global const real4* restrict posq, __global const real4* restrict posqCorrection, int index) {
#ifdef USE_MIXED_PRECISION
real4 pos1 = posq[index];
real4 pos2 = posqCorrection[index];
return (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
#else
return posq[index];
#endif
}
/**
* Store the position of a particle.
*/
void storePos(__global real4* restrict posq, __global real4* restrict posqCorrection, int index, mixed4 pos) {
#ifdef USE_MIXED_PRECISION
posq[index] = (real4) ((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
#else
posq[index] = pos;
#endif
}
/**
* Compute the positions of virtual sites
*/
__kernel void computeVirtualSites(__global real4* restrict posq,
#ifdef USE_MIXED_PRECISION
__global real4* restrict posqCorrection,
#endif
__global const int4* restrict avg2Atoms, __global const real2* restrict avg2Weights,
__global const int4* restrict avg3Atoms, __global const real4* restrict avg3Weights,
__global const int4* restrict outOfPlaneAtoms, __global const real4* restrict outOfPlaneWeights,
__global const int* restrict localCoordsIndex, __global const int* restrict localCoordsAtoms,
__global const real* restrict localCoordsWeights, __global const real4* restrict localCoordsPos,
__global const int* restrict localCoordsStartIndex) {
#ifndef USE_MIXED_PRECISION
__global real4* posqCorrection = 0;
#endif
// Two particle average sites.
for (int index = get_global_id(0); index < NUM_2_AVERAGE; index += get_global_size(0)) {
int4 atoms = avg2Atoms[index];
real2 weights = avg2Weights[index];
mixed4 pos = loadPos(posq, posqCorrection, atoms.x);
mixed4 pos1 = loadPos(posq, posqCorrection, atoms.y);
mixed4 pos2 = loadPos(posq, posqCorrection, atoms.z);
pos.xyz = pos1.xyz*weights.x + pos2.xyz*weights.y;
storePos(posq, posqCorrection, atoms.x, pos);
}
// Three particle average sites.
for (int index = get_global_id(0); index < NUM_3_AVERAGE; index += get_global_size(0)) {
int4 atoms = avg3Atoms[index];
real4 weights = avg3Weights[index];
mixed4 pos = loadPos(posq, posqCorrection, atoms.x);
mixed4 pos1 = loadPos(posq, posqCorrection, atoms.y);
mixed4 pos2 = loadPos(posq, posqCorrection, atoms.z);
mixed4 pos3 = loadPos(posq, posqCorrection, atoms.w);
pos.xyz = pos1.xyz*weights.x + pos2.xyz*weights.y + pos3.xyz*weights.z;
storePos(posq, posqCorrection, atoms.x, pos);
}
// Out of plane sites.
for (int index = get_global_id(0); index < NUM_OUT_OF_PLANE; index += get_global_size(0)) {
int4 atoms = outOfPlaneAtoms[index];
real4 weights = outOfPlaneWeights[index];
mixed4 pos = loadPos(posq, posqCorrection, atoms.x);
mixed4 pos1 = loadPos(posq, posqCorrection, atoms.y);
mixed4 pos2 = loadPos(posq, posqCorrection, atoms.z);
mixed4 pos3 = loadPos(posq, posqCorrection, atoms.w);
mixed4 v12 = pos2-pos1;
mixed4 v13 = pos3-pos1;
pos.xyz = pos1.xyz + v12.xyz*weights.x + v13.xyz*weights.y + cross(v12, v13).xyz*weights.z;
storePos(posq, posqCorrection, atoms.x, pos);
}
// Local coordinates sites.
for (int index = get_global_id(0); index < NUM_LOCAL_COORDS; index += get_global_size(0)) {
int siteAtomIndex = localCoordsIndex[index];
int start = localCoordsStartIndex[index];
int end = localCoordsStartIndex[index+1];
mixed3 origin = 0, xdir = 0, ydir = 0;
for (int j = start; j < end; j++) {
mixed3 pos = loadPos(posq, posqCorrection, localCoordsAtoms[j]).xyz;
origin += pos*localCoordsWeights[3*j];
xdir += pos*localCoordsWeights[3*j+1];
ydir += pos*localCoordsWeights[3*j+2];
}
mixed3 zdir = cross(xdir, ydir);
mixed normXdir = sqrt(xdir.x*xdir.x+xdir.y*xdir.y+xdir.z*xdir.z);
mixed normZdir = sqrt(zdir.x*zdir.x+zdir.y*zdir.y+zdir.z*zdir.z);
mixed invNormXdir = (normXdir > 0 ? 1/normXdir : 0);
mixed invNormZdir = (normZdir > 0 ? 1/normZdir : 0);
xdir *= invNormXdir;
zdir *= invNormZdir;
ydir = cross(zdir, xdir);
mixed3 localPosition = convert_mixed4(localCoordsPos[index]).xyz;
mixed4 pos = loadPos(posq, posqCorrection, siteAtomIndex);
pos.x = origin.x + xdir.x*localPosition.x + ydir.x*localPosition.y + zdir.x*localPosition.z;
pos.y = origin.y + xdir.y*localPosition.x + ydir.y*localPosition.y + zdir.y*localPosition.z;
pos.z = origin.z + xdir.z*localPosition.x + ydir.z*localPosition.y + zdir.z*localPosition.z;
storePos(posq, posqCorrection, siteAtomIndex, pos);
}
}
#ifdef HAS_OVERLAPPING_VSITES
#ifdef SUPPORTS_64_BIT_ATOMICS
// We will use 64 bit atomics for force redistribution.
#define ADD_FORCE(index, f) addForce(index, f, longForce);
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
void addForce(int index, real4 f, __global long* longForce) {
atom_add(&longForce[index], (long) (f.x*0x100000000));
atom_add(&longForce[index+PADDED_NUM_ATOMS], (long) (f.y*0x100000000));
atom_add(&longForce[index+2*PADDED_NUM_ATOMS], (long) (f.z*0x100000000));
}
__kernel void addDistributedForces(__global const long* restrict longForces, __global real4* restrict forces) {
real scale = 1/(real) 0x100000000;
for (int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
real4 f = (real4) (scale*longForces[index], scale*longForces[index+PADDED_NUM_ATOMS], scale*longForces[index+2*PADDED_NUM_ATOMS], 0);
forces[index] += f;
}
}
#else
// 64 bit atomics aren't supported, so we have to use atomic_cmpxchg() for force redistribution.
#define ADD_FORCE(index, f) addForce(index, f, force);
void atomicAddFloat(__global float* p, float v) {
__global int* ip = (__global int*) p;
while (true) {
union {
float f;
int i;
} oldval, newval;
oldval.f = *p;
newval.f = oldval.f+v;
if (atomic_cmpxchg(ip, oldval.i, newval.i) == oldval.i)
return;
}
}
void addForce(int index, float4 f, __global float4* force) {
__global float* components = (__global float*) force;
atomicAddFloat(&components[4*index], f.x);
atomicAddFloat(&components[4*index+1], f.y);
atomicAddFloat(&components[4*index+2], f.z);
}
#endif
#else
// There are no overlapping virtual sites, so we can just store forces directly.
#define ADD_FORCE(index, f) force[index].xyz += (f).xyz;
#endif
/**
* Distribute forces from virtual sites to the atoms they are based on.
*/
__kernel void distributeForces(__global const real4* restrict posq, __global real4* restrict force,
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict longForce,
#endif
#ifdef USE_MIXED_PRECISION
__global real4* restrict posqCorrection,
#endif
__global const int4* restrict avg2Atoms, __global const real2* restrict avg2Weights,
__global const int4* restrict avg3Atoms, __global const real4* restrict avg3Weights,
__global const int4* restrict outOfPlaneAtoms, __global const real4* restrict outOfPlaneWeights,
__global const int* restrict localCoordsIndex, __global const int* restrict localCoordsAtoms,
__global const real* restrict localCoordsWeights, __global const real4* restrict localCoordsPos,
__global const int* restrict localCoordsStartIndex) {
#ifndef USE_MIXED_PRECISION
__global real4* posqCorrection = 0;
#endif
// Two particle average sites.
for (int index = get_global_id(0); index < NUM_2_AVERAGE; index += get_global_size(0)) {
int4 atoms = avg2Atoms[index];
real2 weights = avg2Weights[index];
real4 f = force[atoms.x];
ADD_FORCE(atoms.y, f*weights.x);
ADD_FORCE(atoms.z, f*weights.y);
}
// Three particle average sites.
for (int index = get_global_id(0); index < NUM_3_AVERAGE; index += get_global_size(0)) {
int4 atoms = avg3Atoms[index];
real4 weights = avg3Weights[index];
real4 f = force[atoms.x];
ADD_FORCE(atoms.y, f*weights.x);
ADD_FORCE(atoms.z, f*weights.y);
ADD_FORCE(atoms.w, f*weights.z);
}
// Out of plane sites.
for (int index = get_global_id(0); index < NUM_OUT_OF_PLANE; index += get_global_size(0)) {
int4 atoms = outOfPlaneAtoms[index];
real4 weights = outOfPlaneWeights[index];
mixed4 pos1 = loadPos(posq, posqCorrection, atoms.y);
mixed4 pos2 = loadPos(posq, posqCorrection, atoms.z);
mixed4 pos3 = loadPos(posq, posqCorrection, atoms.w);
mixed4 v12 = pos2-pos1;
mixed4 v13 = pos3-pos1;
real4 f = force[atoms.x];
real4 fp2 = (real4) (weights.x*f.x - weights.z*v13.z*f.y + weights.z*v13.y*f.z,
weights.z*v13.z*f.x + weights.x*f.y - weights.z*v13.x*f.z,
-weights.z*v13.y*f.x + weights.z*v13.x*f.y + weights.x*f.z, 0.0f);
real4 fp3 = (real4) (weights.y*f.x + weights.z*v12.z*f.y - weights.z*v12.y*f.z,
-weights.z*v12.z*f.x + weights.y*f.y + weights.z*v12.x*f.z,
weights.z*v12.y*f.x - weights.z*v12.x*f.y + weights.y*f.z, 0.0f);
ADD_FORCE(atoms.y, f-fp2-fp3);
ADD_FORCE(atoms.z, fp2);
ADD_FORCE(atoms.w, fp3);
}
// Local coordinates sites.
for (int index = get_global_id(0); index < NUM_LOCAL_COORDS; index += get_global_size(0)) {
int siteAtomIndex = localCoordsIndex[index];
int start = localCoordsStartIndex[index];
int end = localCoordsStartIndex[index+1];
mixed3 origin = 0, xdir = 0, ydir = 0;
for (int j = start; j < end; j++) {
mixed3 pos = loadPos(posq, posqCorrection, localCoordsAtoms[j]).xyz;
origin += pos*localCoordsWeights[3*j];
xdir += pos*localCoordsWeights[3*j+1];
ydir += pos*localCoordsWeights[3*j+2];
}
mixed3 zdir = cross(xdir, ydir);
mixed normXdir = sqrt(xdir.x*xdir.x+xdir.y*xdir.y+xdir.z*xdir.z);
mixed normZdir = sqrt(zdir.x*zdir.x+zdir.y*zdir.y+zdir.z*zdir.z);
mixed invNormXdir = (normXdir > 0 ? 1/normXdir : 0);
mixed invNormZdir = (normZdir > 0 ? 1/normZdir : 0);
mixed3 dx = xdir*invNormXdir;
mixed3 dz = zdir*invNormZdir;
mixed3 dy = cross(dz, dx);
mixed3 localPosition = convert_mixed4(localCoordsPos[index]).xyz;
// The derivatives for this case are very complicated. They were computed with SymPy then simplified by hand.
real4 f = force[siteAtomIndex];
mixed3 fp1 = localPosition*f.x;
mixed3 fp2 = localPosition*f.y;
mixed3 fp3 = localPosition*f.z;
for (int j = start; j < end; j++) {
real originWeight = localCoordsWeights[3*j];
real wx = localCoordsWeights[3*j+1];
real wy = localCoordsWeights[3*j+2];
mixed wxScaled = wx*invNormXdir;
mixed t1 = (wx*ydir.x-wy*xdir.x)*invNormZdir;
mixed t2 = (wx*ydir.y-wy*xdir.y)*invNormZdir;
mixed t3 = (wx*ydir.z-wy*xdir.z)*invNormZdir;
mixed sx = t3*dz.y-t2*dz.z;
mixed sy = t1*dz.z-t3*dz.x;
mixed sz = t2*dz.x-t1*dz.y;
real4 fresult = 0;
fresult.x += fp1.x*wxScaled*(1-dx.x*dx.x) + fp1.z*(dz.x*sx ) + fp1.y*((-dx.x*dy.x )*wxScaled + dy.x*sx - dx.y*t2 - dx.z*t3) + f.x*originWeight;
fresult.y += fp1.x*wxScaled*( -dx.x*dx.y) + fp1.z*(dz.x*sy+t3) + fp1.y*((-dx.y*dy.x-dz.z)*wxScaled + dy.x*sy + dx.y*t1);
fresult.z += fp1.x*wxScaled*( -dx.x*dx.z) + fp1.z*(dz.x*sz-t2) + fp1.y*((-dx.z*dy.x+dz.y)*wxScaled + dy.x*sz + dx.z*t1);
fresult.x += fp2.x*wxScaled*( -dx.y*dx.x) + fp2.z*(dz.y*sx-t3) - fp2.y*(( dx.x*dy.y-dz.z)*wxScaled - dy.y*sx - dx.x*t2);
fresult.y += fp2.x*wxScaled*(1-dx.y*dx.y) + fp2.z*(dz.y*sy ) - fp2.y*(( dx.y*dy.y )*wxScaled - dy.y*sy + dx.x*t1 + dx.z*t3) + f.y*originWeight;
fresult.z += fp2.x*wxScaled*( -dx.y*dx.z) + fp2.z*(dz.y*sz+t1) - fp2.y*(( dx.z*dy.y+dz.x)*wxScaled - dy.y*sz - dx.z*t2);
fresult.x += fp3.x*wxScaled*( -dx.z*dx.x) + fp3.z*(dz.z*sx+t2) + fp3.y*((-dx.x*dy.z-dz.y)*wxScaled + dy.z*sx + dx.x*t3);
fresult.y += fp3.x*wxScaled*( -dx.z*dx.y) + fp3.z*(dz.z*sy-t1) + fp3.y*((-dx.y*dy.z+dz.x)*wxScaled + dy.z*sy + dx.y*t3);
fresult.z += fp3.x*wxScaled*(1-dx.z*dx.z) + fp3.z*(dz.z*sz ) + fp3.y*((-dx.z*dy.z )*wxScaled + dy.z*sz - dx.x*t1 - dx.y*t2) + f.z*originWeight;
ADD_FORCE(localCoordsAtoms[j], fresult);
}
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment