Commit 5a06df78 authored by tic20's avatar tic20
Browse files
parents 8dd60914 a9223eea
const real PI = 3.14159265358979323846f;
// Compute the first angle.
real4 v0a = (real4) (pos1.xyz-pos2.xyz, 0.0f);
real4 v1a = (real4) (pos3.xyz-pos2.xyz, 0.0f);
real4 v2a = (real4) (pos3.xyz-pos4.xyz, 0.0f);
#if APPLY_PERIODIC
APPLY_PERIODIC_TO_DELTA(v0a)
APPLY_PERIODIC_TO_DELTA(v1a)
APPLY_PERIODIC_TO_DELTA(v2a)
#endif
real4 cp0a = cross(v0a, v1a);
real4 cp1a = cross(v1a, v2a);
real cosangle = dot(normalize(cp0a), normalize(cp1a));
real angleA;
if (cosangle > 0.99f || cosangle < -0.99f) {
// We're close to the singularity in acos(), so take the cross product and use asin() instead.
real4 cross_prod = cross(cp0a, cp1a);
real scale = dot(cp0a, cp0a)*dot(cp1a, cp1a);
angleA = asin(SQRT(dot(cross_prod, cross_prod)/scale));
if (cosangle < 0.0f)
angleA = PI-angleA;
}
else
angleA = acos(cosangle);
angleA = (dot(v0a, cp1a) >= 0 ? angleA : -angleA);
angleA = fmod(angleA+2.0f*PI, 2.0f*PI);
// Compute the second angle.
real4 v0b = (real4) (pos5.xyz-pos6.xyz, 0.0f);
real4 v1b = (real4) (pos7.xyz-pos6.xyz, 0.0f);
real4 v2b = (real4) (pos7.xyz-pos8.xyz, 0.0f);
#if APPLY_PERIODIC
APPLY_PERIODIC_TO_DELTA(v0b)
APPLY_PERIODIC_TO_DELTA(v1b)
APPLY_PERIODIC_TO_DELTA(v2b)
#endif
real4 cp0b = cross(v0b, v1b);
real4 cp1b = cross(v1b, v2b);
cosangle = dot(normalize(cp0b), normalize(cp1b));
real angleB;
if (cosangle > 0.99f || cosangle < -0.99f) {
// We're close to the singularity in acos(), so take the cross product and use asin() instead.
real4 cross_prod = cross(cp0b, cp1b);
real scale = dot(cp0b, cp0b)*dot(cp1b, cp1b);
angleB = asin(SQRT(dot(cross_prod, cross_prod)/scale));
if (cosangle < 0.0f)
angleB = PI-angleB;
}
else
angleB = acos(cosangle);
angleB = (dot(v0b, cp1b) >= 0 ? angleB : -angleB);
angleB = fmod(angleB+2.0f*PI, 2.0f*PI);
// Identify which patch this is in.
int2 pos = MAP_POS[MAPS[index]];
int size = pos.y;
real delta = 2*PI/size;
int s = (int) (angleA/delta);
int t = (int) (angleB/delta);
float4 c[4];
int coeffIndex = pos.x+4*(s+size*t);
c[0] = COEFF[coeffIndex];
c[1] = COEFF[coeffIndex+1];
c[2] = COEFF[coeffIndex+2];
c[3] = COEFF[coeffIndex+3];
real da = angleA/delta-s;
real db = angleB/delta-t;
// Evaluate the spline to determine the energy and gradients.
real torsionEnergy = 0.0f;
real dEdA = 0.0f;
real dEdB = 0.0f;
torsionEnergy = da*torsionEnergy + ((c[3].w*db + c[3].z)*db + c[3].y)*db + c[3].x;
dEdA = db*dEdA + (3.0f*c[3].w*da + 2.0f*c[2].w)*da + c[1].w;
dEdB = da*dEdB + (3.0f*c[3].w*db + 2.0f*c[3].z)*db + c[3].y;
torsionEnergy = da*torsionEnergy + ((c[2].w*db + c[2].z)*db + c[2].y)*db + c[2].x;
dEdA = db*dEdA + (3.0f*c[3].z*da + 2.0f*c[2].z)*da + c[1].z;
dEdB = da*dEdB + (3.0f*c[2].w*db + 2.0f*c[2].z)*db + c[2].y;
torsionEnergy = da*torsionEnergy + ((c[1].w*db + c[1].z)*db + c[1].y)*db + c[1].x;
dEdA = db*dEdA + (3.0f*c[3].y*da + 2.0f*c[2].y)*da + c[1].y;
dEdB = da*dEdB + (3.0f*c[1].w*db + 2.0f*c[1].z)*db + c[1].y;
torsionEnergy = da*torsionEnergy + ((c[0].w*db + c[0].z)*db + c[0].y)*db + c[0].x;
dEdA = db*dEdA + (3.0f*c[3].x*da + 2.0f*c[2].x)*da + c[1].x;
dEdB = da*dEdB + (3.0f*c[0].w*db + 2.0f*c[0].z)*db + c[0].y;
dEdA /= delta;
dEdB /= delta;
energy += torsionEnergy;
// Apply the force to the first torsion.
real normCross1 = dot(cp0a, cp0a);
real normSqrBC = dot(v1a, v1a);
real normBC = SQRT(normSqrBC);
real normCross2 = dot(cp1a, cp1a);
real dp = 1.0f/normSqrBC;
real4 ff = (real4) ((-dEdA*normBC)/normCross1, dot(v0a, v1a)*dp, dot(v2a, v1a)*dp, (dEdA*normBC)/normCross2);
real4 force1 = ff.x*cp0a;
real4 force4 = ff.w*cp1a;
real4 d = ff.y*force1 - ff.z*force4;
real4 force2 = d-force1;
real4 force3 = -d-force4;
// Apply the force to the second torsion.
normCross1 = dot(cp0b, cp0b);
normSqrBC = dot(v1b, v1b);
normBC = SQRT(normSqrBC);
normCross2 = dot(cp1b, cp1b);
dp = 1.0f/normSqrBC;
ff = (real4) ((-dEdB*normBC)/normCross1, dot(v0b, v1b)*dp, dot(v2b, v1b)*dp, (dEdB*normBC)/normCross2);
real4 force5 = ff.x*cp0b;
real4 force8 = ff.w*cp1b;
d = ff.y*force5 - ff.z*force8;
real4 force6 = d-force5;
real4 force7 = -d-force8;
/**
* This file contains OpenCL definitions for the macros and functions needed for the
* common compute framework.
*/
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif
#define KERNEL __kernel
#define DEVICE
#define LOCAL __local
#define LOCAL_ARG __local
#define GLOBAL __global
#define RESTRICT restrict
#define LOCAL_ID get_local_id(0)
#define LOCAL_SIZE get_local_size(0)
#define GLOBAL_ID get_global_id(0)
#define GLOBAL_SIZE get_global_size(0)
#define GROUP_ID get_group_id(0)
#define NUM_GROUPS get_num_groups(0)
#define SYNC_THREADS barrier(CLK_LOCAL_MEM_FENCE+CLK_GLOBAL_MEM_FENCE);
#define MEM_FENCE mem_fence(CLK_LOCAL_MEM_FENCE+CLK_GLOBAL_MEM_FENCE);
#define ATOMIC_ADD(dest, value) atom_add(dest, value)
typedef long mm_long;
typedef unsigned long mm_ulong;
#define make_short2(x...) ((short2) (x))
#define make_short3(x...) ((short3) (x))
#define make_short4(x...) ((short4) (x))
#define make_int2(x...) ((int2) (x))
#define make_int3(x...) ((int3) (x))
#define make_int4(x...) ((int4) (x))
#define make_float2(x...) ((float2) (x))
#define make_float3(x...) ((float3) (x))
#define make_float4(x...) ((float4) (x))
#define make_double2(x...) ((double2) (x))
#define make_double3(x...) ((double3) (x))
#define make_double4(x...) ((double4) (x))
#define trimTo3(v) (v).xyz
// OpenCL has overloaded versions of standard math functions for single and double
// precision arguments. CUDA has separate functions. To allow them to be called
// consistently, we define the "single precision" functions to just be synonyms
// for the standard ones.
#define sqrtf(x) sqrt(x)
#define rsqrtf(x) rsqrt(x)
#define expf(x) exp(x)
#define logf(x) log(x)
#define powf(x) pow(x)
#define cosf(x) cos(x)
#define sinf(x) sin(x)
#define tanf(x) tan(x)
#define acosf(x) acos(x)
#define asinf(x) asin(x)
#define atanf(x) atan(x)
#define atan2f(x, y) atan2(x, y)
/**
* Compute the difference between two vectors, setting the fourth component to the squared magnitude.
*/
real4 ccb_delta(real4 vec1, real4 vec2, bool periodic, real4 periodicBoxSize, real4 invPeriodicBoxSize,
real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ) {
real4 result = (real4) (vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0);
if (periodic)
APPLY_PERIODIC_TO_DELTA(result);
result.w = result.x*result.x + result.y*result.y + result.z*result.z;
return result;
}
/**
* Compute the angle between two vectors. The w component of each vector should contain the squared magnitude.
*/
real ccb_computeAngle(real4 vec1, real4 vec2) {
real dotProduct = vec1.x*vec2.x + vec1.y*vec2.y + vec1.z*vec2.z;
real cosine = dotProduct*RSQRT(vec1.w*vec2.w);
real angle;
if (cosine > 0.99f || cosine < -0.99f) {
// We're close to the singularity in acos(), so take the cross product and use asin() instead.
real4 crossProduct = cross(vec1, vec2);
real scale = vec1.w*vec2.w;
angle = asin(SQRT(dot(crossProduct, crossProduct)/scale));
if (cosine < 0)
angle = M_PI-angle;
}
else
angle = acos(cosine);
return angle;
}
/**
* Compute the cross product of two vectors, setting the fourth component to the squared magnitude.
*/
real4 ccb_computeCross(real4 vec1, real4 vec2) {
real4 result = cross(vec1, vec2);
result.w = result.x*result.x + result.y*result.y + result.z*result.z;
return result;
}
COMPUTE_FORCE
real4 force1 = (real4) (-dEdX, -dEdY, -dEdZ, 0);
#ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2 && r2 < CUTOFF_SQUARED) {
#else
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#endif
#ifdef USE_SYMMETRIC
real tempForce = 0.0f;
#else
real4 tempForce1 = (real4) 0;
real4 tempForce2 = (real4) 0;
#endif
COMPUTE_FORCE
#ifdef USE_SYMMETRIC
dEdR += tempForce*invR;
#else
dEdR1 += tempForce1;
dEdR2 += tempForce2;
#endif
}
/**
* Compute chain rule terms for computed values that depend explicitly on particle coordinates.
*/
__kernel void computeGradientChainRuleTerms(__global real4* restrict forceBuffers, __global const real4* restrict posq
PARAMETER_ARGUMENTS) {
INIT_PARAM_DERIVS
unsigned int index = get_global_id(0);
while (index < NUM_ATOMS) {
real4 pos = posq[index];
real4 force = forceBuffers[index];
COMPUTE_FORCES
forceBuffers[index] = force;
index += get_global_size(0);
}
SAVE_PARAM_DERIVS
}
__kernel void computeFloatSum(__global const float* restrict sumBuffer, __global float* result, int bufferSize) {
__local float tempBuffer[WORK_GROUP_SIZE];
const unsigned int thread = get_local_id(0);
float sum = 0;
for (unsigned int index = thread; index < bufferSize; index += get_local_size(0))
sum += sumBuffer[index];
tempBuffer[thread] = sum;
for (int i = 1; i < WORK_GROUP_SIZE; i *= 2) {
barrier(CLK_LOCAL_MEM_FENCE);
if (thread%(i*2) == 0 && thread+i < WORK_GROUP_SIZE)
tempBuffer[thread] += tempBuffer[thread+i];
}
if (thread == 0)
*result = tempBuffer[0];
}
#ifdef SUPPORTS_DOUBLE_PRECISION
__kernel void computeDoubleSum(__global const double* restrict sumBuffer, __global double* result, int bufferSize) {
__local double tempBuffer[WORK_GROUP_SIZE];
const unsigned int thread = get_local_id(0);
double sum = 0;
for (unsigned int index = thread; index < bufferSize; index += get_local_size(0))
sum += sumBuffer[index];
tempBuffer[thread] = sum;
for (int i = 1; i < WORK_GROUP_SIZE; i *= 2) {
barrier(CLK_LOCAL_MEM_FENCE);
if (thread%(i*2) == 0 && thread+i < WORK_GROUP_SIZE)
tempBuffer[thread] += tempBuffer[thread+i];
}
if (thread == 0)
*result = tempBuffer[0];
}
#endif
__kernel void applyPositionDeltas(__global real4* restrict posq, __global real4* restrict posqCorrection, __global mixed4* restrict posDelta) {
for (unsigned int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
#ifdef USE_MIXED_PRECISION
real4 pos1 = posq[index];
real4 pos2 = posqCorrection[index];
mixed4 pos = (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
#else
real4 pos = posq[index];
#endif
pos.xyz += posDelta[index].xyz;
#ifdef USE_MIXED_PRECISION
posq[index] = (real4) ((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
#else
posq[index] = pos;
#endif
posDelta[index] = (mixed4) 0;
}
}
__kernel void generateRandomNumbers(int numValues, __global float4* restrict random, __global uint4* restrict seed) {
uint4 state = seed[get_global_id(0)];
unsigned int carry = 0;
for (int index = get_global_id(0); index < numValues; index += get_global_size(0)) {
// Generate three uniform random numbers.
state.x = state.x * 69069 + 1;
state.y ^= state.y << 13;
state.y ^= state.y >> 17;
state.y ^= state.y << 5;
unsigned int k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
unsigned int m = state.w + state.w + state.z + carry;
state.z = state.w;
state.w = m;
carry = k >> 30;
float x1 = (float)max(state.x + state.y + state.w, 0x00000001u) / (float)0xffffffff;
state.x = state.x * 69069 + 1;
state.y ^= state.y << 13;
state.y ^= state.y >> 17;
state.y ^= state.y << 5;
k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
m = state.w + state.w + state.z + carry;
state.z = state.w;
state.w = m;
carry = k >> 30;
float x2 = (float)max(state.x + state.y + state.w, 0x00000001u) / (float)0xffffffff;
state.x = state.x * 69069 + 1;
state.y ^= state.y << 13;
state.y ^= state.y >> 17;
state.y ^= state.y << 5;
k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
m = state.w + state.w + state.z + carry;
state.z = state.w;
state.w = m;
carry = k >> 30;
float x3 = (float)max(state.x + state.y + state.w, 0x00000001u) / (float)0xffffffff;
// Record the values.
random[index] = (float4) (x1, x2, x3, 0.0f);
}
seed[get_global_id(0)] = state;
}
/**
* Load the position of a particle.
*/
mixed4 loadPos(__global const real4* restrict posq, __global const real4* restrict posqCorrection, int index) {
#ifdef USE_MIXED_PRECISION
real4 pos1 = posq[index];
real4 pos2 = posqCorrection[index];
return (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
#else
return posq[index];
#endif
}
/**
* Store the position of a particle.
*/
void storePos(__global real4* restrict posq, __global real4* restrict posqCorrection, int index, mixed4 pos) {
#ifdef USE_MIXED_PRECISION
posq[index] = (real4) ((real) pos.x, (real) pos.y, (real) pos.z, (real) pos.w);
posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
#else
posq[index] = pos;
#endif
}
__kernel void computePerDof(__global real4* restrict posq, __global real4* restrict posqCorrection, __global mixed4* restrict posDelta,
__global mixed4* restrict velm, __global const real4* restrict force, __global const mixed2* restrict dt, __global const mixed* restrict globals,
__global mixed* restrict sum, __global const float4* restrict gaussianValues, unsigned int gaussianBaseIndex, __global const float4* restrict uniformValues,
const mixed energy, __global mixed* restrict energyParamDerivs
PARAMETER_ARGUMENTS) {
mixed stepSize = dt[0].y;
int index = get_global_id(0);
while (index < NUM_ATOMS) {
#ifdef LOAD_POS_AS_DELTA
mixed4 position = loadPos(posq, posqCorrection, index)+posDelta[index];
#else
mixed4 position = loadPos(posq, posqCorrection, index);
#endif
mixed4 velocity = velm[index];
mixed4 f = convert_mixed4(force[index]);
mixed mass = 1/velocity.w;
if (velocity.w != 0.0) {
int gaussianIndex = gaussianBaseIndex;
int uniformIndex = 0;
COMPUTE_STEP
}
index += get_global_size(0);
}
}
#ifdef USE_CUTOFF
if (!isExcluded && r2 < CUTOFF_SQUARED) {
#else
if (!isExcluded) {
#endif
real tempForce = 0.0f;
real switchValue = 1, switchDeriv = 0;
#if USE_SWITCH
if (r > SWITCH_CUTOFF) {
real x = r-SWITCH_CUTOFF;
switchValue = 1+x*x*x*(SWITCH_C3+x*(SWITCH_C4+x*SWITCH_C5));
switchDeriv = x*x*(3*SWITCH_C3+x*(4*SWITCH_C4+x*5*SWITCH_C5));
}
#endif
COMPUTE_FORCE
#if USE_SWITCH
tempForce = tempForce*switchValue - customEnergy*switchDeriv;
tempEnergy += customEnergy*switchValue;
#else
tempEnergy += customEnergy;
#endif
dEdR += tempForce*invR;
}
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#define TILE_SIZE 32
#define NEIGHBOR_BLOCK_SIZE 32
/**
* Calculate the ellipsoid coordinate frames and associated matrices.
*/
__kernel void computeEllipsoidFrames(int numParticles, __global const real4* restrict posq, __global int2* const restrict axisParticleIndices,
__global const float4* restrict sigParams, __global const float4* restrict scale, __global real* restrict aMatrix,
__global real* restrict bMatrix, __global real* restrict gMatrix, __global const int* sortedParticles) {
for (int sortedIndex = get_global_id(0); sortedIndex < numParticles; sortedIndex += get_global_size(0)) {
// Compute the local coordinate system of the ellipsoid;
int originalIndex = sortedParticles[sortedIndex];
real3 pos = posq[originalIndex].xyz;
int2 axisParticles = axisParticleIndices[originalIndex];
real3 xdir, ydir, zdir;
if (axisParticles.x == -1) {
xdir = (real3) (1, 0, 0);
ydir = (real3) (0, 1, 0);
}
else {
xdir = pos-posq[axisParticles.x].xyz;
xdir = normalize(xdir);
if (axisParticles.y == -1) {
if (xdir.y > -0.5f && xdir.y < 0.5f)
ydir = (real3) (0, 1, 0);
else
ydir = (real3) (1, 0, 0);
}
else
ydir = pos-posq[axisParticles.y].xyz;
ydir -= xdir*dot(xdir, ydir);
ydir = normalize(ydir);
}
zdir = cross(xdir, ydir);
// Compute matrices we will need later.
__global real (*a)[3] = (__global real (*)[3]) (aMatrix+sortedIndex*9);
__global real (*b)[3] = (__global real (*)[3]) (bMatrix+sortedIndex*9);
__global real (*g)[3] = (__global real (*)[3]) (gMatrix+sortedIndex*9);
a[0][0] = xdir.x;
a[0][1] = xdir.y;
a[0][2] = xdir.z;
a[1][0] = ydir.x;
a[1][1] = ydir.y;
a[1][2] = ydir.z;
a[2][0] = zdir.x;
a[2][1] = zdir.y;
a[2][2] = zdir.z;
float4 sig = sigParams[originalIndex];
float3 r2 = sig.yzw;
float3 e2 = scale[originalIndex].xyz;
for (int i = 0; i < 3; i++)
for (int j = 0; j < 3; j++) {
b[i][j] = a[0][i]*e2.x*a[0][j] + a[1][i]*e2.y*a[1][j] + a[2][i]*e2.z*a[2][j];
g[i][j] = a[0][i]*r2.x*a[0][j] + a[1][i]*r2.y*a[1][j] + a[2][i]*r2.z*a[2][j];
}
}
}
/**
* Find a bounding box for the atoms in each block.
*/
__kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
__global const int* sortedAtoms, __global const real4* restrict posq, __global real4* restrict sortedPos, __global real4* restrict blockCenter,
__global real4* restrict blockBoundingBox, __global int* restrict neighborBlockCount) {
int index = get_global_id(0);
int base = index*TILE_SIZE;
while (base < numAtoms) {
real4 pos = posq[sortedAtoms[base]];
sortedPos[base] = pos;
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_POS(pos)
#endif
real4 minPos = pos;
real4 maxPos = pos;
int last = min(base+TILE_SIZE, numAtoms);
for (int i = base+1; i < last; i++) {
pos = posq[sortedAtoms[i]];
sortedPos[i] = pos;
#ifdef USE_PERIODIC
real4 center = 0.5f*(maxPos+minPos);
APPLY_PERIODIC_TO_POS_WITH_CENTER(pos, center)
#endif
minPos = min(minPos, pos);
maxPos = max(maxPos, pos);
}
real4 blockSize = 0.5f*(maxPos-minPos);
blockBoundingBox[index] = blockSize;
blockCenter[index] = 0.5f*(maxPos+minPos);
index += get_global_size(0);
base = index*TILE_SIZE;
}
if (get_global_id(0) == 0)
*neighborBlockCount = 0;
}
/**
* This is called by findNeighbors() to write a block to the neighbor list.
*/
void storeNeighbors(int atom1, int* neighborBuffer, int numAtomsInBuffer, int maxNeighborBlocks, __global int* restrict neighbors,
__global int* restrict neighborIndex, __global int* restrict neighborBlockCount) {
int blockIndex = atom_add(neighborBlockCount, 1);
if (blockIndex >= maxNeighborBlocks)
return; // We don't have enough room for the neighbor list.
neighborIndex[blockIndex] = atom1;
int baseIndex = blockIndex*NEIGHBOR_BLOCK_SIZE;
for (int i = 0; i < numAtomsInBuffer; i++)
neighbors[baseIndex+i] = neighborBuffer[i];
for (int i = numAtomsInBuffer; i < NEIGHBOR_BLOCK_SIZE; i++)
neighbors[baseIndex+i] = -1;
}
/**
* Build a list of neighbors for each atom.
*/
__kernel void findNeighbors(int numAtoms, int maxNeighborBlocks, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
__global real4* restrict sortedPos, __global real4* restrict blockCenter, __global real4* restrict blockBoundingBox, __global int* restrict neighbors,
__global int* restrict neighborIndex, __global int* restrict neighborBlockCount, __global const int* restrict exclusions, __global const int* restrict exclusionStartIndex) {
const int numBlocks = (numAtoms+TILE_SIZE-1)/TILE_SIZE;
int neighborBuffer[NEIGHBOR_BLOCK_SIZE];
for (int atom1 = get_global_id(0); atom1 < numAtoms; atom1 += get_global_size(0)) {
int nextExclusion = exclusionStartIndex[atom1];
int lastExclusion = exclusionStartIndex[atom1+1];
real4 pos = sortedPos[atom1];
int nextBufferIndex = 0;
// Loop over atom blocks and compute the distance of this atom from each one's bounding box.
for (int block = (atom1+1)/TILE_SIZE; block < numBlocks; block++) {
real4 center = blockCenter[block];
real4 blockSize = blockBoundingBox[block];
real4 blockDelta = center-pos;
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(blockDelta)
#endif
blockDelta.x = max((real) 0, fabs(blockDelta.x)-blockSize.x);
blockDelta.y = max((real) 0, fabs(blockDelta.y)-blockSize.y);
blockDelta.z = max((real) 0, fabs(blockDelta.z)-blockSize.z);
if (blockDelta.x*blockDelta.x+blockDelta.y*blockDelta.y+blockDelta.z*blockDelta.z >= CUTOFF_SQUARED)
continue;
// Loop over atoms within this block.
int first = max(block*TILE_SIZE, atom1+1);
int last = min((block+1)*TILE_SIZE, numAtoms);
for (int atom2 = first; atom2 < last; atom2++) {
// Skip over excluded interactions.
if (nextExclusion < lastExclusion && exclusions[nextExclusion] >= atom2) {
nextExclusion++;
continue;
}
real4 delta = pos-sortedPos[atom2];
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
if (r2 < CUTOFF_SQUARED) {
neighborBuffer[nextBufferIndex++] = atom2;
if (nextBufferIndex == NEIGHBOR_BLOCK_SIZE) {
storeNeighbors(atom1, neighborBuffer, nextBufferIndex, maxNeighborBlocks, neighbors, neighborIndex, neighborBlockCount);
nextBufferIndex = 0;
}
}
}
}
if (nextBufferIndex > 0)
storeNeighbors(atom1, neighborBuffer, nextBufferIndex, maxNeighborBlocks, neighbors, neighborIndex, neighborBlockCount);
}
}
typedef struct {
float4 sig;
float2 eps;
real3 pos;
real a[3][3], b[3][3], g[3][3];
} AtomData;
void loadAtomData(AtomData* data, int sortedIndex, int originalIndex, __global const real4* restrict pos, __global const float4* restrict sigParams,
__global const float2* restrict epsParams, __global const real* restrict aMatrix, __global const real* restrict bMatrix, __global const real* restrict gMatrix) {
data->sig = sigParams[originalIndex];
data->eps = epsParams[originalIndex];
data->pos = pos[sortedIndex].xyz;
for (int i = 0; i < 3; i++)
for (int j = 0; j < 3; j++) {
int k = 9*sortedIndex+3*i+j;
data->a[i][j] = aMatrix[k];
data->b[i][j] = bMatrix[k];
data->g[i][j] = gMatrix[k];
}
}
real3 matrixVectorProduct(real (*m)[3], real3 v) {
return (real3) (m[0][0]*v.x + m[0][1]*v.y + m[0][2]*v.z,
m[1][0]*v.x + m[1][1]*v.y + m[1][2]*v.z,
m[2][0]*v.x + m[2][1]*v.y + m[2][2]*v.z);
}
real3 vectorMatrixProduct(real3 v, real (*m)[3]) {
return (real3) (m[0][0]*v.x + m[1][0]*v.y + m[2][0]*v.z,
m[0][1]*v.x + m[1][1]*v.y + m[2][1]*v.z,
m[0][2]*v.x + m[1][2]*v.y + m[2][2]*v.z);
}
void matrixSum(real (*result)[3], real (*a)[3], real (*b)[3]) {
result[0][0] = a[0][0]+b[0][0];
result[0][1] = a[0][1]+b[0][1];
result[0][2] = a[0][2]+b[0][2];
result[1][0] = a[1][0]+b[1][0];
result[1][1] = a[1][1]+b[1][1];
result[1][2] = a[1][2]+b[1][2];
result[2][0] = a[2][0]+b[2][0];
result[2][1] = a[2][1]+b[2][1];
result[2][2] = a[2][2]+b[2][2];
}
real determinant(real (*m)[3]) {
return (m[0][0]*m[1][1]*m[2][2] + m[0][1]*m[1][2]*m[2][0] + m[0][2]*m[1][0]*m[2][1] -
m[0][0]*m[1][2]*m[2][1] - m[0][1]*m[1][0]*m[2][2] - m[0][2]*m[1][1]*m[2][0]);
}
void matrixInverse(real (*result)[3], real (*m)[3]) {
real invDet = RECIP(determinant(m));
result[0][0] = invDet*(m[1][1]*m[2][2] - m[1][2]*m[2][1]);
result[1][0] = -invDet*(m[1][0]*m[2][2] - m[1][2]*m[2][0]);
result[2][0] = invDet*(m[1][0]*m[2][1] - m[1][1]*m[2][0]);
result[0][1] = -invDet*(m[0][1]*m[2][2] - m[0][2]*m[2][1]);
result[1][1] = invDet*(m[0][0]*m[2][2] - m[0][2]*m[2][0]);
result[2][1] = -invDet*(m[0][0]*m[2][1] - m[0][1]*m[2][0]);
result[0][2] = invDet*(m[0][1]*m[1][2] - m[0][2]*m[1][1]);
result[1][2] = -invDet*(m[0][0]*m[1][2] - m[0][2]*m[1][0]);
result[2][2] = invDet*(m[0][0]*m[1][1] - m[0][1]*m[1][0]);
}
void computeOneInteraction(AtomData* data1, AtomData* data2, real sigma, real epsilon, real3 dr, real r2, real3* force1, real3* force2, real3* torque1, real3* torque2, mixed *totalEnergy) {
real rInv = RSQRT(r2);
real r = r2*rInv;
real3 drUnit = dr*rInv;
// Compute the switching function.
real switchValue = 1, switchDeriv = 0;
#if USE_SWITCH
if (r > SWITCH_CUTOFF) {
real x = r-SWITCH_CUTOFF;
switchValue = 1+x*x*x*(SWITCH_C3+x*(SWITCH_C4+x*SWITCH_C5));
switchDeriv = x*x*(3*SWITCH_C3+x*(4*SWITCH_C4+x*5*SWITCH_C5));
}
#endif
// Compute vectors and matrices we'll be needing.
real B12[3][3], G12[3][3], B12inv[3][3], G12inv[3][3];
matrixSum(B12, data1->b, data2->b);
matrixSum(G12, data1->g, data2->g);
matrixInverse(B12inv, B12);
matrixInverse(G12inv, G12);
real detG12 = determinant(G12);
// Estimate the distance between the ellipsoids and compute the first terms needed for the energy.
real sigma12 = 1/SQRT(0.5f*dot(drUnit, matrixVectorProduct(G12inv, drUnit)));
real h12 = r - sigma12;
real rho = sigma/(h12+sigma);
real rho2 = rho*rho;
real rho6 = rho2*rho2*rho2;
real u = 4*epsilon*(rho6*rho6-rho6);
real eta = SQRT(2*data1->eps.y*data2->eps.y/detG12);
real chi = 2*dot(drUnit, matrixVectorProduct(B12inv, drUnit));
chi *= chi;
real energy = u*eta*chi;
// Compute the terms needed for the force.
real3 kappa = matrixVectorProduct(G12inv, dr);
real3 iota = matrixVectorProduct(B12inv, dr);
real rInv2 = rInv*rInv;
real dUSLJdr = 24*epsilon*(2*rho6-1)*rho6*rho/sigma;
real temp = 0.5f*sigma12*sigma12*sigma12*rInv2;
real3 dudr = (drUnit + (kappa-drUnit*dot(kappa, drUnit))*temp)*dUSLJdr;
real3 dchidr = (iota-drUnit*dot(iota, drUnit))*(-8*rInv2*SQRT(chi));
real3 force = (dchidr*u + dudr*chi)*(eta*switchValue) - drUnit*(energy*switchDeriv);
*force1 += force;
*force2 -= force;
// Compute the terms needed for the torque.
for (int j = 0; j < 2; j++) {
real (*a)[3] = (j == 0 ? data1->a : data2->a);
real (*b)[3] = (j == 0 ? data1->b : data2->b);
real (*g)[3] = (j == 0 ? data1->g : data2->g);
float4 sig = (j == 0 ? data1->sig : data2->sig);
real3 dudq = cross(vectorMatrixProduct(kappa, g), kappa*(temp*dUSLJdr));
real3 dchidq = cross(vectorMatrixProduct(iota, b), iota)*(-4*rInv2);
real3 scale = (real3) (sig.y, sig.z, sig.w)*(-0.5f*eta/detG12);
real d[3][3];
d[0][0] = scale.x*(2*a[0][0]*(G12[1][1]*G12[2][2] - G12[1][2]*G12[2][1]) +
a[0][2]*(G12[1][2]*G12[0][1] + G12[1][0]*G12[2][1] - G12[1][1]*(G12[0][2] + G12[2][0])) +
a[0][1]*(G12[0][2]*G12[2][1] + G12[2][0]*G12[1][2] - G12[2][2]*(G12[0][1] + G12[1][0])));
d[0][1] = scale.x*( a[0][0]*(G12[0][2]*G12[2][1] + G12[2][0]*G12[1][2] - G12[2][2]*(G12[0][1] + G12[1][0])) +
2*a[0][1]*(G12[0][0]*G12[2][2] - G12[2][0]*G12[0][2]) +
a[0][2]*(G12[1][0]*G12[0][2] + G12[2][0]*G12[0][1] - G12[0][0]*(G12[1][2] + G12[2][1])));
d[0][2] = scale.x*( a[0][0]*(G12[0][1]*G12[1][2] + G12[1][0]*G12[2][1] - G12[1][1]*(G12[0][2] + G12[2][0])) +
a[0][1]*(G12[1][0]*G12[0][2] + G12[2][0]*G12[0][1] - G12[0][0]*(G12[1][2] + G12[2][1])) +
2*a[0][2]*(G12[1][1]*G12[0][0] - G12[1][0]*G12[0][1]));
d[1][0] = scale.y*(2*a[1][0]*(G12[1][1]*G12[2][2] - G12[1][2]*G12[2][1]) +
a[1][1]*(G12[0][2]*G12[2][1] + G12[2][0]*G12[1][2] - G12[2][2]*(G12[0][1] + G12[1][0])) +
a[1][2]*(G12[1][2]*G12[0][1] + G12[1][0]*G12[2][1] - G12[1][1]*(G12[0][2] + G12[2][0])));
d[1][1] = scale.y*( a[1][0]*(G12[0][2]*G12[2][1] + G12[2][0]*G12[1][2] - G12[2][2]*(G12[0][1] + G12[1][0])) +
2*a[1][1]*(G12[2][2]*G12[0][0] - G12[2][0]*G12[0][2]) +
a[1][2]*(G12[1][0]*G12[0][2] + G12[0][1]*G12[2][0] - G12[0][0]*(G12[1][2] + G12[2][1])));
d[1][2] = scale.y*( a[1][0]*(G12[0][1]*G12[1][2] + G12[1][0]*G12[2][1] - G12[1][1]*(G12[0][2] + G12[2][0])) +
a[1][1]*(G12[1][0]*G12[0][2] + G12[0][1]*G12[2][0] - G12[0][0]*(G12[1][2] + G12[2][1])) +
2*a[1][2]*(G12[1][1]*G12[0][0] - G12[1][0]*G12[0][1]));
d[2][0] = scale.z*(2*a[2][0]*(G12[1][1]*G12[2][2] - G12[2][1]*G12[1][2]) +
a[2][1]*(G12[0][2]*G12[2][1] + G12[1][2]*G12[2][0] - G12[2][2]*(G12[0][1] + G12[1][0])) +
a[2][2]*(G12[0][1]*G12[1][2] + G12[2][1]*G12[1][0] - G12[1][1]*(G12[0][2] + G12[2][0])));
d[2][1] = scale.z*( a[2][0]*(G12[0][2]*G12[2][1] + G12[1][2]*G12[2][0] - G12[2][2]*(G12[0][1] + G12[1][0])) +
2*a[2][1]*(G12[0][0]*G12[2][2] - G12[0][2]*G12[2][0]) +
a[2][2]*(G12[1][0]*G12[0][2] + G12[0][1]*G12[2][0] - G12[0][0]*(G12[1][2] + G12[2][1])));
d[2][2] = scale.z*( a[2][0]*(G12[0][1]*G12[1][2] + G12[2][1]*G12[1][0] - G12[1][1]*(G12[0][2] + G12[2][0])) +
a[2][1]*(G12[1][0]*G12[0][2] + G12[2][0]*G12[0][1] - G12[0][0]*(G12[1][2] + G12[2][1])) +
2*a[2][2]*(G12[1][1]*G12[0][0] - G12[1][0]*G12[0][1]));
real3 detadq = 0;
for (int i = 0; i < 3; i++)
detadq += cross((real3) (a[i][0], a[i][1], a[i][2]), (real3) (d[i][0], d[i][1], d[i][2]));
real3 torque = (dchidq*(u*eta) + detadq*(u*chi) + dudq*(eta*chi))*switchValue;
*(j == 0 ? torque1 : torque2) -= torque;
}
*totalEnergy += switchValue*energy;
}
/**
* Compute the interactions.
*/
__kernel void computeForce(
__global long* restrict forceBuffers, __global long* restrict torqueBuffers,
int numAtoms, int numExceptions, __global mixed* restrict energyBuffer, __global const real4* restrict pos,
__global const float4* restrict sigParams, __global const float2* restrict epsParams, __global const int* restrict sortedAtoms,
__global const real* restrict aMatrix, __global const real* restrict bMatrix, __global const real* restrict gMatrix,
__global const int* restrict exclusions, __global const int* restrict exclusionStartIndex,
__global const int4* restrict exceptionParticles, __global const float2* restrict exceptionParams
#ifdef USE_CUTOFF
, int maxNeighborBlocks, __global int* restrict neighbors, __global int* restrict neighborIndex, __global int* restrict neighborBlockCount,
real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ
#endif
) {
const unsigned int warp = get_global_id(0)/TILE_SIZE;
mixed energy = 0;
#ifdef USE_CUTOFF
const int numBlocks = *neighborBlockCount;
if (numBlocks > maxNeighborBlocks)
return; // There wasn't enough memory for the neighbor list.
for (int block = get_global_id(0); block < numBlocks; block += get_global_size(0)) {
// Load parameters for atom1.
int atom1 = neighborIndex[block];
int index1 = sortedAtoms[atom1];
AtomData data1;
loadAtomData(&data1, atom1, index1, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix);
real3 force1 = 0.0f;
real3 torque1 = 0.0f;
for (int indexInBlock = 0; indexInBlock < NEIGHBOR_BLOCK_SIZE; indexInBlock++) {
// Load parameters for atom2.
int atom2 = neighbors[NEIGHBOR_BLOCK_SIZE*block+indexInBlock];
if (atom2 == -1)
continue;
int index2 = sortedAtoms[atom2];
AtomData data2;
loadAtomData(&data2, atom2, index2, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix);
real3 force2 = 0.0f;
real3 torque2 = 0.0f;
// Compute the interaction.
real3 delta = data1.pos-data2.pos;
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real sigma = data1.sig.x+data2.sig.x;
real epsilon = data1.eps.x*data2.eps.x;
computeOneInteraction(&data1, &data2, sigma, epsilon, delta, r2, &force1, &force2, &torque1, &torque2, &energy);
atom_add(&forceBuffers[index2], (long) (force2.x*0x100000000));
atom_add(&forceBuffers[index2+PADDED_NUM_ATOMS], (long) (force2.y*0x100000000));
atom_add(&forceBuffers[index2+2*PADDED_NUM_ATOMS], (long) (force2.z*0x100000000));
atom_add(&torqueBuffers[index2], (long) (torque2.x*0x100000000));
atom_add(&torqueBuffers[index2+PADDED_NUM_ATOMS], (long) (torque2.y*0x100000000));
atom_add(&torqueBuffers[index2+2*PADDED_NUM_ATOMS], (long) (torque2.z*0x100000000));
}
atom_add(&forceBuffers[index1], (long) (force1.x*0x100000000));
atom_add(&forceBuffers[index1+PADDED_NUM_ATOMS], (long) (force1.y*0x100000000));
atom_add(&forceBuffers[index1+2*PADDED_NUM_ATOMS], (long) (force1.z*0x100000000));
atom_add(&torqueBuffers[index1], (long) (torque1.x*0x100000000));
atom_add(&torqueBuffers[index1+PADDED_NUM_ATOMS], (long) (torque1.y*0x100000000));
atom_add(&torqueBuffers[index1+2*PADDED_NUM_ATOMS], (long) (torque1.z*0x100000000));
}
#else
for (int atom1 = get_global_id(0); atom1 < numAtoms; atom1 += get_global_size(0)) {
// Load parameters for atom1.
int index1 = sortedAtoms[atom1];
AtomData data1;
loadAtomData(&data1, atom1, index1, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix);
real3 force1 = 0.0f;
real3 torque1 = 0.0f;
int nextExclusion = exclusionStartIndex[atom1];
int lastExclusion = exclusionStartIndex[atom1+1];
for (int atom2 = atom1+1; atom2 < numAtoms; atom2++) {
// Skip over excluded interactions.
if (nextExclusion < lastExclusion && exclusions[nextExclusion] == atom2) {
nextExclusion++;
continue;
}
// Load parameters for atom2.
int index2 = sortedAtoms[atom2];
AtomData data2;
loadAtomData(&data2, atom2, index2, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix);
real3 force2 = 0.0f;
real3 torque2 = 0.0f;
// Compute the interaction.
real3 delta = data1.pos-data2.pos;
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real sigma = data1.sig.x+data2.sig.x;
real epsilon = data1.eps.x*data2.eps.x;
computeOneInteraction(&data1, &data2, sigma, epsilon, delta, r2, &force1, &force2, &torque1, &torque2, &energy);
atom_add(&forceBuffers[index2], (long) (force2.x*0x100000000));
atom_add(&forceBuffers[index2+PADDED_NUM_ATOMS], (long) (force2.y*0x100000000));
atom_add(&forceBuffers[index2+2*PADDED_NUM_ATOMS], (long) (force2.z*0x100000000));
atom_add(&torqueBuffers[index2], (long) (torque2.x*0x100000000));
atom_add(&torqueBuffers[index2+PADDED_NUM_ATOMS], (long) (torque2.y*0x100000000));
atom_add(&torqueBuffers[index2+2*PADDED_NUM_ATOMS], (long) (torque2.z*0x100000000));
}
atom_add(&forceBuffers[index1], (long) (force1.x*0x100000000));
atom_add(&forceBuffers[index1+PADDED_NUM_ATOMS], (long) (force1.y*0x100000000));
atom_add(&forceBuffers[index1+2*PADDED_NUM_ATOMS], (long) (force1.z*0x100000000));
atom_add(&torqueBuffers[index1], (long) (torque1.x*0x100000000));
atom_add(&torqueBuffers[index1+PADDED_NUM_ATOMS], (long) (torque1.y*0x100000000));
atom_add(&torqueBuffers[index1+2*PADDED_NUM_ATOMS], (long) (torque1.z*0x100000000));
}
#endif
// Now compute exceptions.
for (int index = get_global_id(0); index < numExceptions; index += get_global_size(0)) {
int4 atomIndices = exceptionParticles[index];
float2 params = exceptionParams[index];
int index1 = atomIndices.x, index2 = atomIndices.y;
int atom1 = atomIndices.z, atom2 = atomIndices.w;
AtomData data1, data2;
loadAtomData(&data1, atom1, index1, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix);
loadAtomData(&data2, atom2, index2, pos, sigParams, epsParams, aMatrix, bMatrix, gMatrix);
real3 force1 = 0, force2 = 0;
real3 torque1 = 0, torque2 = 0;
real3 delta = data1.pos-data2.pos;
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
computeOneInteraction(&data1, &data2, params.x, params.y, delta, r2, &force1, &force2, &torque1, &torque2, &energy);
atom_add(&forceBuffers[index1], (long) (force1.x*0x100000000));
atom_add(&forceBuffers[index1+PADDED_NUM_ATOMS], (long) (force1.y*0x100000000));
atom_add(&forceBuffers[index1+2*PADDED_NUM_ATOMS], (long) (force1.z*0x100000000));
atom_add(&forceBuffers[index2], (long) (force2.x*0x100000000));
atom_add(&forceBuffers[index2+PADDED_NUM_ATOMS], (long) (force2.y*0x100000000));
atom_add(&forceBuffers[index2+2*PADDED_NUM_ATOMS], (long) (force2.z*0x100000000));
atom_add(&torqueBuffers[index1], (long) (torque1.x*0x100000000));
atom_add(&torqueBuffers[index1+PADDED_NUM_ATOMS], (long) (torque1.y*0x100000000));
atom_add(&torqueBuffers[index1+2*PADDED_NUM_ATOMS], (long) (torque1.z*0x100000000));
atom_add(&torqueBuffers[index2], (long) (torque2.x*0x100000000));
atom_add(&torqueBuffers[index2+PADDED_NUM_ATOMS], (long) (torque2.y*0x100000000));
atom_add(&torqueBuffers[index2+2*PADDED_NUM_ATOMS], (long) (torque2.z*0x100000000));
#ifdef USE_CUTOFF
}
#endif
}
energyBuffer[get_global_id(0)] += energy;
}
/**
* Convert the torques to forces on the connected particles.
*/
__kernel void applyTorques(
__global long* restrict forceBuffers, __global long* restrict torqueBuffers,
int numParticles, __global const real4* restrict posq, __global int2* const restrict axisParticleIndices,
__global const int* sortedParticles) {
const unsigned int warp = get_global_id(0)/TILE_SIZE;
for (int sortedIndex = get_global_id(0); sortedIndex < numParticles; sortedIndex += get_global_size(0)) {
int originalIndex = sortedParticles[sortedIndex];
real3 pos = posq[originalIndex].xyz;
int2 axisParticles = axisParticleIndices[originalIndex];
if (axisParticles.x != -1) {
// Load the torque.
real scale = 1/(real) 0x100000000;
real3 torque = (real3) (scale*torqueBuffers[originalIndex], scale*torqueBuffers[originalIndex+PADDED_NUM_ATOMS], scale*torqueBuffers[originalIndex+2*PADDED_NUM_ATOMS]);
real3 force = 0, xforce = 0, yforce = 0;
// Apply a force to the x particle.
real3 dx = posq[axisParticles.x].xyz-pos;
real dx2 = dot(dx, dx);
real3 f = cross(torque, dx)/dx2;
xforce += f;
force -= f;
if (axisParticles.y != -1) {
// Apply a force to the y particle. This is based on the component of the torque
// that was not already applied to the x particle.
real3 dy = posq[axisParticles.y].xyz-pos;
real dy2 = dot(dy, dy);
real3 torque2 = dx*dot(torque, dx)/dx2;
f = cross(torque2, dy)/dy2;
yforce += f;
force -= f;
}
atom_add(&forceBuffers[originalIndex], (long) (force.x*0x100000000));
atom_add(&forceBuffers[originalIndex+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
atom_add(&forceBuffers[originalIndex+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
atom_add(&forceBuffers[axisParticles.x], (long) (xforce.x*0x100000000));
atom_add(&forceBuffers[axisParticles.x+PADDED_NUM_ATOMS], (long) (xforce.y*0x100000000));
atom_add(&forceBuffers[axisParticles.x+2*PADDED_NUM_ATOMS], (long) (xforce.z*0x100000000));
if (axisParticles.y != -1) {
atom_add(&forceBuffers[axisParticles.y], (long) (yforce.x*0x100000000));
atom_add(&forceBuffers[axisParticles.y+PADDED_NUM_ATOMS], (long) (yforce.y*0x100000000));
atom_add(&forceBuffers[axisParticles.y+2*PADDED_NUM_ATOMS], (long) (yforce.z*0x100000000));
}
}
}
}
{
real invRSquaredOver4 = 0.25f*invR*invR;
real rScaledRadiusJ = r+OBC_PARAMS2.y;
real rScaledRadiusI = r+OBC_PARAMS1.y;
real l_ijJ = RECIP(max((real) OBC_PARAMS1.x, fabs(r-OBC_PARAMS2.y)));
real l_ijI = RECIP(max((real) OBC_PARAMS2.x, fabs(r-OBC_PARAMS1.y)));
real u_ijJ = RECIP(rScaledRadiusJ);
real u_ijI = RECIP(rScaledRadiusI);
real l_ij2J = l_ijJ*l_ijJ;
real l_ij2I = l_ijI*l_ijI;
real u_ij2J = u_ijJ*u_ijJ;
real u_ij2I = u_ijI*u_ijI;
real t1J = LOG(u_ijJ*RECIP(l_ijJ));
real t1I = LOG(u_ijI*RECIP(l_ijI));
real t2J = (l_ij2J-u_ij2J);
real t2I = (l_ij2I-u_ij2I);
real term1 = (0.5f*(0.25f+OBC_PARAMS2.y*OBC_PARAMS2.y*invRSquaredOver4)*t2J + t1J*invRSquaredOver4)*invR;
real term2 = (0.5f*(0.25f+OBC_PARAMS1.y*OBC_PARAMS1.y*invRSquaredOver4)*t2I + t1I*invRSquaredOver4)*invR;
real tempdEdR = (OBC_PARAMS1.x < rScaledRadiusJ ? BORN_FORCE1*term1 : (real) 0);
tempdEdR += (OBC_PARAMS2.x < rScaledRadiusI ? BORN_FORCE2*term2 : (real) 0);
#ifdef USE_CUTOFF
bool includeInteraction = (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2 && r2 < CUTOFF_SQUARED);
#else
bool includeInteraction = (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2);
#endif
dEdR += (includeInteraction ? tempdEdR : (real) 0);
}
float2 angleParams = PARAMS[index];
real deltaIdeal = theta-angleParams.x;
energy += 0.5f*angleParams.y*deltaIdeal*deltaIdeal;
real dEdAngle = angleParams.y*deltaIdeal;
float2 bondParams = PARAMS[index];
real deltaIdeal = r-bondParams.x;
energy += 0.5f * bondParams.y*deltaIdeal*deltaIdeal;
real dEdR = bondParams.y * deltaIdeal;
/**
* Apply a time shift to the velocities before computing kinetic energy.
*/
__kernel void timeShiftVelocities(__global mixed4* restrict velm, __global const real4* restrict force, real timeShift) {
for (int index = get_global_id(0); index < NUM_ATOMS; index += get_global_size(0)) {
mixed4 velocity = velm[index];
if (velocity.w != 0.0) {
mixed4 f = convert_mixed4(force[index]);
velocity.xyz += timeShift*f.xyz*velocity.w;
velm[index] = velocity;
}
}
}
\ No newline at end of file
enum {VelScale, ForceScale, NoiseScale, MaxParams};
/**
* Perform the first step of Langevin integration.
*/
__kernel void integrateLangevinPart1(__global mixed4* restrict velm, __global const real4* restrict force, __global mixed4* restrict posDelta,
__global const mixed* restrict paramBuffer, __global const mixed2* restrict dt, __global const float4* restrict random, unsigned int randomIndex) {
mixed vscale = paramBuffer[VelScale];
mixed fscale = paramBuffer[ForceScale];
mixed noisescale = paramBuffer[NoiseScale];
mixed stepSize = dt[0].y;
int index = get_global_id(0);
randomIndex += index;
while (index < NUM_ATOMS) {
mixed4 velocity = velm[index];
if (velocity.w != 0.0) {
mixed sqrtInvMass = sqrt(velocity.w);
velocity.x = vscale*velocity.x + fscale*velocity.w*force[index].x + noisescale*sqrtInvMass*random[randomIndex].x;
velocity.y = vscale*velocity.y + fscale*velocity.w*force[index].y + noisescale*sqrtInvMass*random[randomIndex].y;
velocity.z = vscale*velocity.z + fscale*velocity.w*force[index].z + noisescale*sqrtInvMass*random[randomIndex].z;
velm[index] = velocity;
posDelta[index] = stepSize*velocity;
}
randomIndex += get_global_size(0);
index += get_global_size(0);
}
}
/**
* Perform the second step of Langevin integration.
*/
__kernel void integrateLangevinPart2(__global real4* restrict posq, __global real4* restrict posqCorrection, __global const mixed4* restrict posDelta, __global mixed4* restrict velm, __global const mixed2* restrict dt) {
#ifdef SUPPORTS_DOUBLE_PRECISION
double invStepSize = 1.0/dt[0].y;
#else
float invStepSize = 1.0f/dt[0].y;
float correction = (1.0f-invStepSize*dt[0].y)/dt[0].y;
#endif
int index = get_global_id(0);
while (index < NUM_ATOMS) {
mixed4 vel = velm[index];
if (vel.w != 0.0) {
#ifdef USE_MIXED_PRECISION
real4 pos1 = posq[index];
real4 pos2 = posqCorrection[index];
mixed4 pos = (mixed4) (pos1.x+(mixed)pos2.x, pos1.y+(mixed)pos2.y, pos1.z+(mixed)pos2.z, pos1.w);
#else
real4 pos = posq[index];
#endif
mixed4 delta = posDelta[index];
pos.xyz += delta.xyz;
#ifdef SUPPORTS_DOUBLE_PRECISION
vel.xyz = convert_mixed4(invStepSize*convert_double4(delta)).xyz;
#else
vel.xyz = invStepSize*delta.xyz + correction*delta.xyz;
#endif
#ifdef USE_MIXED_PRECISION
posq[index] = convert_real4(pos);
posqCorrection[index] = (real4) (pos.x-(real) pos.x, pos.y-(real) pos.y, pos.z-(real) pos.z, 0);
#else
posq[index] = pos;
#endif
velm[index] = vel;
}
index += get_global_size(0);
}
}
/**
* Select the step size to use for the next step.
*/
__kernel void selectLangevinStepSize(mixed maxStepSize, mixed errorTol, mixed friction, mixed kT, __global mixed2* restrict dt,
__global const mixed4* restrict velm, __global const real4* restrict force, __global mixed* restrict paramBuffer, __local mixed* restrict params, __local mixed* restrict error) {
// Calculate the error.
mixed err = 0.0f;
unsigned int index = get_local_id(0);
while (index < NUM_ATOMS) {
real4 f = force[index];
mixed invMass = velm[index].w;
err += (f.x*f.x + f.y*f.y + f.z*f.z)*invMass*invMass;
index += get_global_size(0);
}
error[get_local_id(0)] = err;
barrier(CLK_LOCAL_MEM_FENCE);
// Sum the errors from all threads.
for (unsigned int offset = 1; offset < get_local_size(0); offset *= 2) {
if (get_local_id(0)+offset < get_local_size(0) && (get_local_id(0)&(2*offset-1)) == 0)
error[get_local_id(0)] += error[get_local_id(0)+offset];
barrier(CLK_LOCAL_MEM_FENCE);
}
if (get_global_id(0) == 0) {
// Select the new step size.
mixed totalError = sqrt(error[0]/(NUM_ATOMS*3));
mixed newStepSize = sqrt(errorTol/totalError);
mixed oldStepSize = dt[0].y;
if (oldStepSize > 0.0f)
newStepSize = min(newStepSize, oldStepSize*2.0f); // For safety, limit how quickly dt can increase.
if (newStepSize > oldStepSize && newStepSize < 1.1f*oldStepSize)
newStepSize = oldStepSize; // Keeping dt constant between steps improves the behavior of the integrator.
if (newStepSize > maxStepSize)
newStepSize = maxStepSize;
dt[0].y = newStepSize;
// Recalculate the integration parameters.
mixed vscale = exp(-newStepSize*friction);
mixed fscale = (friction == 0 ? newStepSize : (1-vscale)/friction);
mixed noisescale = sqrt(kT*(1-vscale*vscale));
params[VelScale] = vscale;
params[ForceScale] = fscale;
params[NoiseScale] = noisescale;
}
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < MaxParams)
paramBuffer[get_local_id(0)] = params[get_local_id(0)];
}
float4 exceptionParams = PARAMS[index];
real4 delta = pos2-pos1;
#if APPLY_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real invR = RSQRT(r2);
real sig2 = invR*exceptionParams.y;
......
//#include <initializer_list>
__kernel void propagateNoseHooverChain(__global mixed2* restrict chainData, __global const mixed2 * restrict energySum, __global mixed2* restrict scaleFactor,
__global mixed* restrict chainMasses, __global mixed* restrict chainForces,
int chainType, int chainLength, int numMTS, int numDOFs, float timeStep,
mixed kT, float frequency){
const mixed kineticEnergy = chainType == 0 ? energySum[0].x : energySum[0].y;
mixed scale = 1;
if(kineticEnergy < 1e-8) return;
for (int bead = 0; bead < chainLength; ++bead) chainMasses[bead] = kT / (frequency * frequency);
chainMasses[0] *= numDOFs;
mixed KE2 = 2.0f * kineticEnergy;
mixed timeOverMTS = timeStep / numMTS;
chainForces[0] = (KE2 - numDOFs * kT) / chainMasses[0];
for (int bead = 0; bead < chainLength - 1; ++bead) {
chainForces[bead + 1] = (chainMasses[bead] * chainData[bead].y * chainData[bead].y - kT) / chainMasses[bead + 1];
}
for (int mts = 0; mts < numMTS; ++mts) {
BEGIN_YS_LOOP
mixed wdt = ys * timeOverMTS;
chainData[chainLength-1].y += 0.25f * wdt * chainForces[chainLength-1];
for (int bead = chainLength - 2; bead >= 0; --bead) {
mixed aa = EXP(-0.125f * wdt * chainData[bead + 1].y);
chainData[bead].y = aa * (chainData[bead].y * aa + 0.25f * wdt * chainForces[bead]);
}
// update particle velocities
mixed aa = EXP(-0.5f * wdt * chainData[0].y);
scale *= aa;
// update the thermostat positions
for (int bead = 0; bead < chainLength; ++bead) {
chainData[bead].x += 0.5f * chainData[bead].y * wdt;
}
// update the forces
chainForces[0] = (scale * scale * KE2 - numDOFs * kT) / chainMasses[0];
// update thermostat velocities
for (int bead = 0; bead < chainLength - 1; ++bead) {
mixed aa = EXP(-0.125f * wdt * chainData[bead + 1].y);
chainData[bead].y = aa * (aa * chainData[bead].y + 0.25f * wdt * chainForces[bead]);
chainForces[bead + 1] = (chainMasses[bead] * chainData[bead].y * chainData[bead].y - kT) / chainMasses[bead + 1];
}
chainData[chainLength-1].y += 0.25f * wdt * chainForces[chainLength-1];
END_YS_LOOP
} // MTS loop
if (chainType == 0) {
scaleFactor[0].x = scale;
} else {
scaleFactor[0].y = scale;
}
}
/**
* Compute total (potential + kinetic) energy of the Nose-Hoover beads
*/
__kernel void computeHeatBathEnergy(__global mixed* restrict heatBathEnergy, int chainLength, int numDOFs,
mixed kT, float frequency, __global const mixed2* restrict chainData){
// Note that this is always incremented; make sure it's zeroed properly before the first call
for(int i = 0; i < chainLength; ++i) {
mixed prefac = i ? 1 : numDOFs;
mixed mass = prefac * kT / (frequency * frequency);
mixed velocity = chainData[i].y;
// The kinetic energy of this bead
heatBathEnergy[0] += 0.5f * mass * velocity * velocity;
// The potential energy of this bead
mixed position = chainData[i].x;
heatBathEnergy[0] += prefac * kT * position;
}
}
__kernel void computeAtomsKineticEnergy(__global mixed2 * restrict energyBuffer, int numAtoms,
__global const mixed4* restrict velm, __global const int *restrict atoms){
mixed2 energy = (mixed2) (0,0);
//energy = 1; return;
int index = get_global_id(0);
while (index < numAtoms){
int atom = atoms[index];
mixed4 v = velm[atom];
mixed mass = v.w == 0 ? 0 : 1 / v.w;
energy.x += 0.5f * mass * (v.x*v.x + v.y*v.y + v.z*v.z);
index += get_global_size(0);
}
energyBuffer[get_global_id(0)] = energy;
}
__kernel void computePairsKineticEnergy(__global mixed2 * restrict energyBuffer, int numPairs,
__global const mixed4* restrict velm, __global const int2 *restrict pairs){
mixed2 energy = (mixed2) (0,0);
int index = get_global_id(0);
while (index < numPairs){
int2 pair = pairs[index];
int atom1 = pair.x;
int atom2 = pair.y;
mixed4 v1 = velm[atom1];
mixed4 v2 = velm[atom2];
mixed m1 = v1.w == 0 ? 0 : 1 / v1.w;
mixed m2 = v2.w == 0 ? 0 : 1 / v2.w;
mixed4 cv;
cv.x = (m1*v1.x + m2*v2.x) / (m1 + m2);
cv.y = (m1*v1.y + m2*v2.y) / (m1 + m2);
cv.z = (m1*v1.z + m2*v2.z) / (m1 + m2);
mixed4 rv;
rv.x = v2.x - v1.x;
rv.y = v2.y - v1.y;
rv.z = v2.z - v1.z;
energy.x += 0.5f * (m1 + m2) * (cv.x*cv.x + cv.y*cv.y + cv.z*cv.z);
energy.y += 0.5f * (m1 * m2 / (m1 + m2)) * (rv.x*rv.x + rv.y*rv.y + rv.z*rv.z);
index += get_global_size(0);
}
// The atoms version of this has been called already, so accumulate instead of assigning here
energyBuffer[get_global_id(0)].xy += energy.xy;
}
__kernel void scaleAtomsVelocities(__global mixed2* restrict scaleFactor, int numAtoms,
__global mixed4* restrict velm, __global const int *restrict atoms){
const mixed scale = scaleFactor[0].x;
int index = get_global_id(0);
while (index < numAtoms){
int atom = atoms[index];
velm[atom].x *= scale;
velm[atom].y *= scale;
velm[atom].z *= scale;
index += get_global_size(0);
}
}
__kernel void scalePairsVelocities(__global mixed2 * restrict scaleFactor, int numPairs,
__global mixed4* restrict velm, __global const int2 *restrict pairs){
int index = get_global_id(0);
while (index < numPairs){
int atom1 = pairs[index].x;
int atom2 = pairs[index].y;
mixed m1 = velm[atom1].w == 0 ? 0 : 1 / velm[atom1].w;
mixed m2 = velm[atom2].w == 0 ? 0 : 1 / velm[atom2].w;
mixed4 cv;
cv.xyz = (m1*velm[atom1].xyz + m2*velm[atom2].xyz) / (m1 + m2);
mixed4 rv;
rv.xyz = velm[atom2].xyz - velm[atom1].xyz;
velm[atom1].x = scaleFactor[0].x * cv.x - scaleFactor[0].y * rv.x * m2 / (m1 + m2);
velm[atom1].y = scaleFactor[0].x * cv.y - scaleFactor[0].y * rv.y * m2 / (m1 + m2);
velm[atom1].z = scaleFactor[0].x * cv.z - scaleFactor[0].y * rv.z * m2 / (m1 + m2);
velm[atom2].x = scaleFactor[0].x * cv.x + scaleFactor[0].y * rv.x * m1 / (m1 + m2);
velm[atom2].y = scaleFactor[0].x * cv.y + scaleFactor[0].y * rv.y * m1 / (m1 + m2);
velm[atom2].z = scaleFactor[0].x * cv.z + scaleFactor[0].y * rv.z * m1 / (m1 + m2);
index += get_global_size(0);
}
}
/**
* Sum the energy buffer containing a pair of energies stored as mixed2. This is copied from utilities.cu with small modifications
*/
__kernel void reduceEnergyPair(__global const mixed2* restrict energyBuffer, __global mixed2* restrict result, int bufferSize, int workGroupSize, __local mixed2* restrict tempBuffer) {
const unsigned int thread = get_local_id(0);
mixed2 sum = (mixed2) (0,0);
for (unsigned int index = thread; index < bufferSize; index += get_local_size(0)) {
sum.xy += energyBuffer[index].xy;
}
tempBuffer[thread].xy = sum.xy;
for (int i = 1; i < workGroupSize; i *= 2) {
barrier(CLK_LOCAL_MEM_FENCE);
if (thread%(i*2) == 0 && thread+i < workGroupSize) {
tempBuffer[thread].xy += tempBuffer[thread+i].xy;
}
}
if (thread == 0) {
*result = tempBuffer[0];
}
}
float4 torsionParams = PARAMS[index];
real deltaAngle = torsionParams.z*theta-torsionParams.y;
energy += torsionParams.x*(1.0f+cos(deltaAngle));
real sinDeltaAngle = sin(deltaAngle);
real dEdAngle = -torsionParams.x*torsionParams.z*sinDeltaAngle;
/**
* Generate random numbers
*/
__kernel void generateRandomNumbers(int numValues, __global float4* restrict random, __global uint4* restrict seed) {
int index = get_global_id(0);
uint4 state = seed[index];
unsigned int carry = 0;
while (index < numValues) {
float4 value;
// Generate first two values.
state.x = state.x * 69069 + 1;
state.y ^= state.y << 13;
state.y ^= state.y >> 17;
state.y ^= state.y << 5;
unsigned int k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
unsigned int m = state.w + state.w + state.z + carry;
state.z = state.w;
state.w = m;
carry = k >> 30;
float x1 = (float)max(state.x + state.y + state.w, 0x00000001u) / (float)0xffffffff;
state.x = state.x * 69069 + 1;
state.y ^= state.y << 13;
state.y ^= state.y >> 17;
state.y ^= state.y << 5;
x1 = SQRT(-2.0f * LOG(x1));
k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
m = state.w + state.w + state.z + carry;
state.z = state.w;
state.w = m;
carry = k >> 30;
float x2 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
value.x = x1 * cos(2.0f * 3.14159265f * x2);
value.y = x1 * sin(2.0f * 3.14159265f * x2);
// Generate next two values.
state.x = state.x * 69069 + 1;
state.y ^= state.y << 13;
state.y ^= state.y >> 17;
state.y ^= state.y << 5;
k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
m = state.w + state.w + state.z + carry;
state.z = state.w;
state.w = m;
carry = k >> 30;
float x3 = (float)max(state.x + state.y + state.w, 0x00000001u) / (float)0xffffffff;
state.x = state.x * 69069 + 1;
state.y ^= state.y << 13;
state.y ^= state.y >> 17;
state.y ^= state.y << 5;
x3 = SQRT(-2.0f * LOG(x3));
k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
m = state.w + state.w + state.z + carry;
state.z = state.w;
state.w = m;
carry = k >> 30;
float x4 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
value.z = x3 * cos(2.0f * 3.14159265f * x4);
value.w = x3 * sin(2.0f * 3.14159265f * x4);
// Record the values.
random[index] = value;
index += get_global_size(0);
}
seed[get_global_id(0)] = state;
}
float8 torsionParams = PARAMS[index];
if (theta < 0.0f)
theta += PI;
else
theta -= PI;
cosangle = -cosangle;
real cosFactor = cosangle;
real dEdAngle = -torsionParams.s1;
real rbEnergy = torsionParams.s0;
rbEnergy += torsionParams.s1*cosFactor;
dEdAngle -= 2.0f*torsionParams.s2*cosFactor;
cosFactor *= cosangle;
dEdAngle -= 3.0f*torsionParams.s3*cosFactor;
rbEnergy += torsionParams.s2*cosFactor;
cosFactor *= cosangle;
dEdAngle -= 4.0f*torsionParams.s4*cosFactor;
rbEnergy += torsionParams.s3*cosFactor;
cosFactor *= cosangle;
dEdAngle -= 5.0f*torsionParams.s5*cosFactor;
rbEnergy += torsionParams.s4*cosFactor;
rbEnergy += torsionParams.s5*cosFactor*cosangle;
energy += rbEnergy;
dEdAngle *= sin(theta);
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment