Commit c8dac206 authored by Peter Eastman's avatar Peter Eastman
Browse files

Continuing to implement double precision in OpenCL

parent 34938e2c
/**
* Compute the difference between two vectors, setting the fourth component to the squared magnitude.
*/
float4 delta(float4 vec1, float4 vec2) {
float4 result = (float4) (vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0.0f);
real4 delta(real4 vec1, real4 vec2) {
real4 result = (real4) (vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0);
result.w = result.x*result.x + result.y*result.y + result.z*result.z;
return result;
}
......@@ -11,8 +11,8 @@ float4 delta(float4 vec1, float4 vec2) {
* Compute the difference between two vectors, taking periodic boundary conditions into account
* and setting the fourth component to the squared magnitude.
*/
float4 deltaPeriodic(float4 vec1, float4 vec2, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
float4 result = (float4) (vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0.0f);
real4 deltaPeriodic(real4 vec1, real4 vec2, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
real4 result = (real4) (vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0);
#ifdef USE_PERIODIC
result.x -= floor(result.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
result.y -= floor(result.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
......@@ -25,15 +25,15 @@ float4 deltaPeriodic(float4 vec1, float4 vec2, float4 periodicBoxSize, float4 in
/**
* Compute the angle between two vectors. The w component of each vector should contain the squared magnitude.
*/
float computeAngle(float4 vec1, float4 vec2) {
float dotProduct = vec1.x*vec2.x + vec1.y*vec2.y + vec1.z*vec2.z;
float cosine = dotProduct*RSQRT(vec1.w*vec2.w);
float angle;
real computeAngle(real4 vec1, real4 vec2) {
real dotProduct = vec1.x*vec2.x + vec1.y*vec2.y + vec1.z*vec2.z;
real cosine = dotProduct*RSQRT(vec1.w*vec2.w);
real angle;
if (cosine > 0.99f || cosine < -0.99f) {
// We're close to the singularity in acos(), so take the cross product and use asin() instead.
float4 crossProduct = cross(vec1, vec2);
float scale = vec1.w*vec2.w;
real4 crossProduct = cross(vec1, vec2);
real scale = vec1.w*vec2.w;
angle = asin(SQRT(dot(crossProduct, crossProduct)/scale));
if (cosine < 0.0f)
angle = PI-angle;
......@@ -46,8 +46,8 @@ float computeAngle(float4 vec1, float4 vec2) {
/**
* Compute the cross product of two vectors, setting the fourth component to the squared magnitude.
*/
float4 computeCross(float4 vec1, float4 vec2) {
float4 result = cross(vec1, vec2);
real4 computeCross(real4 vec1, real4 vec2) {
real4 result = cross(vec1, vec2);
result.w = result.x*result.x + result.y*result.y + result.z*result.z;
return result;
}
......@@ -55,24 +55,24 @@ float4 computeCross(float4 vec1, float4 vec2) {
/**
* Compute forces on donors.
*/
__kernel void computeDonorForces(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const int4* restrict exclusions,
__global const int4* restrict donorAtoms, __global const int4* restrict acceptorAtoms, __global const int4* restrict donorBufferIndices, __local float4* posBuffer, float4 periodicBoxSize, float4 invPeriodicBoxSize
__kernel void computeDonorForces(__global real4* restrict forceBuffers, __global real* restrict energyBuffer, __global const real4* restrict posq, __global const int4* restrict exclusions,
__global const int4* restrict donorAtoms, __global const int4* restrict acceptorAtoms, __global const int4* restrict donorBufferIndices, __local real4* posBuffer, real4 periodicBoxSize, real4 invPeriodicBoxSize
PARAMETER_ARGUMENTS) {
float energy = 0.0f;
float4 f1 = (float4) 0;
float4 f2 = (float4) 0;
float4 f3 = (float4) 0;
real energy = 0;
real4 f1 = (real4) 0;
real4 f2 = (real4) 0;
real4 f3 = (real4) 0;
for (int donorStart = 0; donorStart < NUM_DONORS; donorStart += get_global_size(0)) {
// Load information about the donor this thread will compute forces on.
int donorIndex = donorStart+get_global_id(0);
int4 atoms, exclusionIndices;
float4 d1, d2, d3;
real4 d1, d2, d3;
if (donorIndex < NUM_DONORS) {
atoms = donorAtoms[donorIndex];
d1 = (atoms.x > -1 ? posq[atoms.x] : (float4) 0);
d2 = (atoms.y > -1 ? posq[atoms.y] : (float4) 0);
d3 = (atoms.z > -1 ? posq[atoms.z] : (float4) 0);
d1 = (atoms.x > -1 ? posq[atoms.x] : (real4) 0);
d2 = (atoms.y > -1 ? posq[atoms.y] : (real4) 0);
d3 = (atoms.z > -1 ? posq[atoms.z] : (real4) 0);
#ifdef USE_EXCLUSIONS
exclusionIndices = exclusions[donorIndex];
#endif
......@@ -85,9 +85,9 @@ __kernel void computeDonorForces(__global float4* restrict forceBuffers, __globa
int blockSize = min((int) get_local_size(0), NUM_ACCEPTORS-acceptorStart);
if (get_local_id(0) < blockSize) {
int4 atoms2 = acceptorAtoms[acceptorStart+get_local_id(0)];
posBuffer[3*get_local_id(0)] = (atoms2.x > -1 ? posq[atoms2.x] : (float4) 0);
posBuffer[3*get_local_id(0)+1] = (atoms2.y > -1 ? posq[atoms2.y] : (float4) 0);
posBuffer[3*get_local_id(0)+2] = (atoms2.z > -1 ? posq[atoms2.z] : (float4) 0);
posBuffer[3*get_local_id(0)] = (atoms2.x > -1 ? posq[atoms2.x] : (real4) 0);
posBuffer[3*get_local_id(0)+1] = (atoms2.y > -1 ? posq[atoms2.y] : (real4) 0);
posBuffer[3*get_local_id(0)+2] = (atoms2.z > -1 ? posq[atoms2.z] : (real4) 0);
}
barrier(CLK_LOCAL_MEM_FENCE);
if (donorIndex < NUM_DONORS) {
......@@ -99,10 +99,10 @@ __kernel void computeDonorForces(__global float4* restrict forceBuffers, __globa
#endif
// Compute the interaction between a donor and an acceptor.
float4 a1 = posBuffer[3*index];
float4 a2 = posBuffer[3*index+1];
float4 a3 = posBuffer[3*index+2];
float4 deltaD1A1 = deltaPeriodic(d1, a1, periodicBoxSize, invPeriodicBoxSize);
real4 a1 = posBuffer[3*index];
real4 a2 = posBuffer[3*index+1];
real4 a3 = posBuffer[3*index+2];
real4 deltaD1A1 = deltaPeriodic(d1, a1, periodicBoxSize, invPeriodicBoxSize);
#ifdef USE_CUTOFF
if (deltaD1A1.w < CUTOFF_SQUARED) {
#endif
......@@ -120,19 +120,19 @@ __kernel void computeDonorForces(__global float4* restrict forceBuffers, __globa
int4 bufferIndices = donorBufferIndices[donorIndex];
if (atoms.x > -1) {
unsigned int offset = atoms.x+bufferIndices.x*PADDED_NUM_ATOMS;
float4 force = forceBuffers[offset];
real4 force = forceBuffers[offset];
force.xyz += f1.xyz;
forceBuffers[offset] = force;
}
if (atoms.y > -1) {
unsigned int offset = atoms.y+bufferIndices.y*PADDED_NUM_ATOMS;
float4 force = forceBuffers[offset];
real4 force = forceBuffers[offset];
force.xyz += f2.xyz;
forceBuffers[offset] = force;
}
if (atoms.z > -1) {
unsigned int offset = atoms.z+bufferIndices.z*PADDED_NUM_ATOMS;
float4 force = forceBuffers[offset];
real4 force = forceBuffers[offset];
force.xyz += f3.xyz;
forceBuffers[offset] = force;
}
......@@ -143,23 +143,23 @@ __kernel void computeDonorForces(__global float4* restrict forceBuffers, __globa
/**
* Compute forces on acceptors.
*/
__kernel void computeAcceptorForces(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const int4* restrict exclusions,
__global const int4* restrict donorAtoms, __global const int4* restrict acceptorAtoms, __global const int4* restrict acceptorBufferIndices, __local float4* restrict posBuffer, float4 periodicBoxSize, float4 invPeriodicBoxSize
__kernel void computeAcceptorForces(__global real4* restrict forceBuffers, __global real* restrict energyBuffer, __global const real4* restrict posq, __global const int4* restrict exclusions,
__global const int4* restrict donorAtoms, __global const int4* restrict acceptorAtoms, __global const int4* restrict acceptorBufferIndices, __local real4* restrict posBuffer, real4 periodicBoxSize, real4 invPeriodicBoxSize
PARAMETER_ARGUMENTS) {
float4 f1 = (float4) 0;
float4 f2 = (float4) 0;
float4 f3 = (float4) 0;
real4 f1 = (real4) 0;
real4 f2 = (real4) 0;
real4 f3 = (real4) 0;
for (int acceptorStart = 0; acceptorStart < NUM_ACCEPTORS; acceptorStart += get_global_size(0)) {
// Load information about the acceptor this thread will compute forces on.
int acceptorIndex = acceptorStart+get_global_id(0);
int4 atoms, exclusionIndices;
float4 a1, a2, a3;
real4 a1, a2, a3;
if (acceptorIndex < NUM_ACCEPTORS) {
atoms = acceptorAtoms[acceptorIndex];
a1 = (atoms.x > -1 ? posq[atoms.x] : (float4) 0);
a2 = (atoms.y > -1 ? posq[atoms.y] : (float4) 0);
a3 = (atoms.z > -1 ? posq[atoms.z] : (float4) 0);
a1 = (atoms.x > -1 ? posq[atoms.x] : (real4) 0);
a2 = (atoms.y > -1 ? posq[atoms.y] : (real4) 0);
a3 = (atoms.z > -1 ? posq[atoms.z] : (real4) 0);
#ifdef USE_EXCLUSIONS
exclusionIndices = exclusions[acceptorIndex];
#endif
......@@ -172,9 +172,9 @@ __kernel void computeAcceptorForces(__global float4* restrict forceBuffers, __gl
int blockSize = min((int) get_local_size(0), NUM_DONORS-donorStart);
if (get_local_id(0) < blockSize) {
int4 atoms2 = donorAtoms[donorStart+get_local_id(0)];
posBuffer[3*get_local_id(0)] = (atoms2.x > -1 ? posq[atoms2.x] : (float4) 0);
posBuffer[3*get_local_id(0)+1] = (atoms2.y > -1 ? posq[atoms2.y] : (float4) 0);
posBuffer[3*get_local_id(0)+2] = (atoms2.z > -1 ? posq[atoms2.z] : (float4) 0);
posBuffer[3*get_local_id(0)] = (atoms2.x > -1 ? posq[atoms2.x] : (real4) 0);
posBuffer[3*get_local_id(0)+1] = (atoms2.y > -1 ? posq[atoms2.y] : (real4) 0);
posBuffer[3*get_local_id(0)+2] = (atoms2.z > -1 ? posq[atoms2.z] : (real4) 0);
}
barrier(CLK_LOCAL_MEM_FENCE);
if (acceptorIndex < NUM_ACCEPTORS) {
......@@ -186,10 +186,10 @@ __kernel void computeAcceptorForces(__global float4* restrict forceBuffers, __gl
#endif
// Compute the interaction between a donor and an acceptor.
float4 d1 = posBuffer[3*index];
float4 d2 = posBuffer[3*index+1];
float4 d3 = posBuffer[3*index+2];
float4 deltaD1A1 = deltaPeriodic(d1, a1, periodicBoxSize, invPeriodicBoxSize);
real4 d1 = posBuffer[3*index];
real4 d2 = posBuffer[3*index+1];
real4 d3 = posBuffer[3*index+2];
real4 deltaD1A1 = deltaPeriodic(d1, a1, periodicBoxSize, invPeriodicBoxSize);
#ifdef USE_CUTOFF
if (deltaD1A1.w < CUTOFF_SQUARED) {
#endif
......@@ -207,19 +207,19 @@ __kernel void computeAcceptorForces(__global float4* restrict forceBuffers, __gl
int4 bufferIndices = acceptorBufferIndices[acceptorIndex];
if (atoms.x > -1) {
unsigned int offset = atoms.x+bufferIndices.x*PADDED_NUM_ATOMS;
float4 force = forceBuffers[offset];
real4 force = forceBuffers[offset];
force.xyz += f1.xyz;
forceBuffers[offset] = force;
}
if (atoms.y > -1) {
unsigned int offset = atoms.y+bufferIndices.y*PADDED_NUM_ATOMS;
float4 force = forceBuffers[offset];
real4 force = forceBuffers[offset];
force.xyz += f2.xyz;
forceBuffers[offset] = force;
}
if (atoms.z > -1) {
unsigned int offset = atoms.z+bufferIndices.z*PADDED_NUM_ATOMS;
float4 force = forceBuffers[offset];
real4 force = forceBuffers[offset];
force.xyz += f3.xyz;
forceBuffers[offset] = force;
}
......
float2 multofFloat2(float2 a, float2 b) {
return (float2) (a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x);
real2 multofReal2(real2 a, real2 b) {
return (real2) (a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x);
}
/**
* Precompute the cosine and sine sums which appear in each force term.
*/
__kernel void calculateEwaldCosSinSums(__global float* restrict energyBuffer, __global const float4* restrict posq, __global float2* restrict cosSinSum, float4 reciprocalPeriodicBoxSize, float reciprocalCoefficient) {
__kernel void calculateEwaldCosSinSums(__global real* restrict energyBuffer, __global const real4* restrict posq, __global real2* restrict cosSinSum, real4 reciprocalPeriodicBoxSize, real reciprocalCoefficient) {
const unsigned int ksizex = 2*KMAX_X-1;
const unsigned int ksizey = 2*KMAX_Y-1;
const unsigned int ksizez = 2*KMAX_Z-1;
const unsigned int totalK = ksizex*ksizey*ksizez;
unsigned int index = get_global_id(0);
float energy = 0.0f;
real energy = 0.0f;
while (index < (KMAX_Y-1)*ksizez+KMAX_Z)
index += get_global_size(0);
while (index < totalK) {
......@@ -23,29 +23,29 @@ __kernel void calculateEwaldCosSinSums(__global float* restrict energyBuffer, __
int ry = remainder/ksizez;
int rz = remainder - ry*ksizez - KMAX_Z + 1;
ry += -KMAX_Y + 1;
float kx = rx*reciprocalPeriodicBoxSize.x;
float ky = ry*reciprocalPeriodicBoxSize.y;
float kz = rz*reciprocalPeriodicBoxSize.z;
real kx = rx*reciprocalPeriodicBoxSize.x;
real ky = ry*reciprocalPeriodicBoxSize.y;
real kz = rz*reciprocalPeriodicBoxSize.z;
// Compute the sum for this wave vector.
float2 sum = 0.0f;
real2 sum = 0.0f;
for (int atom = 0; atom < NUM_ATOMS; atom++) {
float4 apos = posq[atom];
float phase = apos.x*kx;
float2 structureFactor = (float2) (cos(phase), sin(phase));
real4 apos = posq[atom];
real phase = apos.x*kx;
real2 structureFactor = (real2) (cos(phase), sin(phase));
phase = apos.y*ky;
structureFactor = multofFloat2(structureFactor, (float2) (cos(phase), sin(phase)));
structureFactor = multofReal2(structureFactor, (real2) (cos(phase), sin(phase)));
phase = apos.z*kz;
structureFactor = multofFloat2(structureFactor, (float2) (cos(phase), sin(phase)));
structureFactor = multofReal2(structureFactor, (real2) (cos(phase), sin(phase)));
sum += apos.w*structureFactor;
}
cosSinSum[index] = sum;
// Compute the contribution to the energy.
float k2 = kx*kx + ky*ky + kz*kz;
float ak = EXP(k2*EXP_COEFFICIENT) / k2;
real k2 = kx*kx + ky*ky + kz*kz;
real ak = EXP(k2*EXP_COEFFICIENT) / k2;
energy += reciprocalCoefficient*ak*(sum.x*sum.x + sum.y*sum.y);
index += get_global_size(0);
}
......@@ -57,36 +57,36 @@ __kernel void calculateEwaldCosSinSums(__global float* restrict energyBuffer, __
* previous routine.
*/
__kernel void calculateEwaldForces(__global float4* restrict forceBuffers, __global const float4* restrict posq, __global const float2* restrict cosSinSum, float4 reciprocalPeriodicBoxSize, float reciprocalCoefficient) {
__kernel void calculateEwaldForces(__global real4* restrict forceBuffers, __global const real4* restrict posq, __global const real2* restrict cosSinSum, real4 reciprocalPeriodicBoxSize, real reciprocalCoefficient) {
unsigned int atom = get_global_id(0);
while (atom < NUM_ATOMS) {
float4 force = forceBuffers[atom];
float4 apos = posq[atom];
real4 force = forceBuffers[atom];
real4 apos = posq[atom];
// Loop over all wave vectors.
int lowry = 0;
int lowrz = 1;
for (int rx = 0; rx < KMAX_X; rx++) {
float kx = rx*reciprocalPeriodicBoxSize.x;
real kx = rx*reciprocalPeriodicBoxSize.x;
for (int ry = lowry; ry < KMAX_Y; ry++) {
float ky = ry*reciprocalPeriodicBoxSize.y;
float phase = apos.x*kx;
float2 tab_xy = (float2) (cos(phase), sin(phase));
real ky = ry*reciprocalPeriodicBoxSize.y;
real phase = apos.x*kx;
real2 tab_xy = (real2) (cos(phase), sin(phase));
phase = apos.y*ky;
tab_xy = multofFloat2(tab_xy, (float2) (cos(phase), sin(phase)));
tab_xy = multofReal2(tab_xy, (real2) (cos(phase), sin(phase)));
for (int rz = lowrz; rz < KMAX_Z; rz++) {
float kz = rz*reciprocalPeriodicBoxSize.z;
real kz = rz*reciprocalPeriodicBoxSize.z;
// Compute the force contribution of this wave vector.
int index = rx*(KMAX_Y*2-1)*(KMAX_Z*2-1) + (ry+KMAX_Y-1)*(KMAX_Z*2-1) + (rz+KMAX_Z-1);
float k2 = kx*kx + ky*ky + kz*kz;
float ak = EXP(k2*EXP_COEFFICIENT)/k2;
real k2 = kx*kx + ky*ky + kz*kz;
real ak = EXP(k2*EXP_COEFFICIENT)/k2;
phase = apos.z*kz;
float2 structureFactor = multofFloat2(tab_xy, (float2) (cos(phase), sin(phase)));
float2 sum = cosSinSum[index];
float dEdR = 2*reciprocalCoefficient*ak*apos.w*(sum.x*structureFactor.y - sum.y*structureFactor.x);
real2 structureFactor = multofReal2(tab_xy, (real2) (cos(phase), sin(phase)));
real2 sum = cosSinSum[index];
real dEdR = 2*reciprocalCoefficient*ak*apos.w*(sum.x*structureFactor.y - sum.y*structureFactor.x);
force.x += dEdR*kx;
force.y += dEdR*ky;
force.z += dEdR*kz;
......
float2 multiplyComplex(float2 c1, float2 c2) {
return (float2) (c1.x*c2.x-c1.y*c2.y, c1.x*c2.y+c1.y*c2.x);
real2 multiplyComplex(real2 c1, real2 c2) {
return (real2) (c1.x*c2.x-c1.y*c2.y, c1.x*c2.y+c1.y*c2.x);
}
/**
* Perform a 1D FFT on each row along one axis.
*/
__kernel void execFFT(__global const float2* restrict in, __global float2* restrict out, float sign, __local float2* restrict w,
__local float2* restrict data0, __local float2* restrict data1) {
__kernel void execFFT(__global const real2* restrict in, __global real2* restrict out, int sign, __local real2* restrict w,
__local real2* restrict data0, __local real2* restrict data1) {
for (int i = get_local_id(0); i < ZSIZE; i += get_local_size(0))
w[i] = (float2) (cos(-sign*i*2*M_PI/ZSIZE), sin(-sign*i*2*M_PI/ZSIZE));
w[i] = (real2) (cos(-sign*i*2*M_PI/ZSIZE), sin(-sign*i*2*M_PI/ZSIZE));
barrier(CLK_LOCAL_MEM_FENCE);
for (int index = get_group_id(0); index < XSIZE*YSIZE; index += get_num_groups(0)) {
int x = index/YSIZE;
......
......@@ -8,19 +8,19 @@
/**
* Find a bounding box for the atoms in each block.
*/
__kernel void findBlockBounds(int numAtoms, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global const float4* restrict posq, __global float4* restrict blockCenter, __global float4* restrict blockBoundingBox, __global unsigned int* restrict interactionCount) {
__kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, __global const real4* restrict posq, __global real4* restrict blockCenter, __global real4* restrict blockBoundingBox, __global unsigned int* restrict interactionCount) {
int index = get_global_id(0);
int base = index*TILE_SIZE;
while (base < numAtoms) {
float4 pos = posq[base];
real4 pos = posq[base];
#ifdef USE_PERIODIC
pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
float4 firstPoint = pos;
real4 firstPoint = pos;
#endif
float4 minPos = pos;
float4 maxPos = pos;
real4 minPos = pos;
real4 maxPos = pos;
int last = min(base+TILE_SIZE, numAtoms);
for (int i = base+1; i < last; i++) {
pos = posq[i];
......@@ -46,8 +46,8 @@ __kernel void findBlockBounds(int numAtoms, float4 periodicBoxSize, float4 invPe
* to global memory.
*/
void storeInteractionData(__local ushort2* buffer, __local int* valid, __local short* sum, __local ushort2* temp, __local int* baseIndex,
__global unsigned int* interactionCount, __global ushort2* interactingTiles, float cutoffSquared, float4 periodicBoxSize,
float4 invPeriodicBoxSize, __global const float4* posq, __global const float4* blockCenter, __global const float4* blockBoundingBox, unsigned int maxTiles) {
__global unsigned int* interactionCount, __global ushort2* interactingTiles, real cutoffSquared, real4 periodicBoxSize,
real4 invPeriodicBoxSize, __global const real4* posq, __global const real4* blockCenter, __global const real4* blockBoundingBox, unsigned int maxTiles) {
// The buffer is full, so we need to compact it and write out results. Start by doing a parallel prefix sum.
for (int i = get_local_id(0); i < BUFFER_SIZE; i += GROUP_SIZE)
......@@ -91,7 +91,7 @@ void storeInteractionData(__local ushort2* buffer, __local int* valid, __local s
int index = get_local_id(0)&(TILE_SIZE-1);
int group = get_local_id(0)/TILE_SIZE;
float4 center, boxSize, pos;
real4 center, boxSize, pos;
for (int tile = 0; tile < numValid; tile++) {
int x = temp[tile].x;
int y = temp[tile].y;
......@@ -103,12 +103,12 @@ void storeInteractionData(__local ushort2* buffer, __local int* valid, __local s
#ifdef MAC_AMD_WORKAROUND
int box = (group == 0 ? x : y);
int atom = (group == 0 ? y : x)*TILE_SIZE+index;
__global float* bc = (__global float*) blockCenter;
__global float* bb = (__global float*) blockBoundingBox;
__global float* ps = (__global float*) posq;
center = (float4) (bc[4*box], bc[4*box+1], bc[4*box+2], 0.0f);
boxSize = (float4) (bb[4*box], bb[4*box+1], bb[4*box+2], 0.0f);
pos = (float4) (ps[4*atom], ps[4*atom+1], ps[4*atom+2], 0.0f);
__global real* bc = (__global real*) blockCenter;
__global real* bb = (__global real*) blockBoundingBox;
__global real* ps = (__global real*) posq;
center = (real4) (bc[4*box], bc[4*box+1], bc[4*box+2], 0);
boxSize = (real4) (bb[4*box], bb[4*box+1], bb[4*box+2], 0);
pos = (real4) (ps[4*atom], ps[4*atom+1], ps[4*atom+2], 0);
#else
center = blockCenter[(group == 0 ? x : y)];
boxSize = blockBoundingBox[(group == 0 ? x : y)];
......@@ -117,13 +117,13 @@ void storeInteractionData(__local ushort2* buffer, __local int* valid, __local s
// Find the distance of the atom from the bounding box.
float4 delta = pos-center;
real4 delta = pos-center;
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
delta = max((float4) 0.0f, fabs(delta)-boxSize);
delta = max((real4) 0, fabs(delta)-boxSize);
__local ushort* flag = (__local ushort*) &buffer[tile];
if (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < cutoffSquared)
flag[group] = false;
......@@ -155,9 +155,9 @@ void storeInteractionData(__local ushort2* buffer, __local int* valid, __local s
* Compare the bounding boxes for each pair of blocks. If they are sufficiently far apart,
* mark them as non-interacting.
*/
__kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global const float4* restrict blockCenter,
__global const float4* restrict blockBoundingBox, __global unsigned int* restrict interactionCount, __global ushort2* restrict interactingTiles,
__global unsigned int* restrict interactionFlags, __global const float4* restrict posq, unsigned int maxTiles, unsigned int startTileIndex,
__kernel void findBlocksWithInteractions(real cutoffSquared, real4 periodicBoxSize, real4 invPeriodicBoxSize, __global const real4* restrict blockCenter,
__global const real4* restrict blockBoundingBox, __global unsigned int* restrict interactionCount, __global ushort2* restrict interactingTiles,
__global unsigned int* restrict interactionFlags, __global const real4* restrict posq, unsigned int maxTiles, unsigned int startTileIndex,
unsigned int endTileIndex) {
__local ushort2 buffer[BUFFER_SIZE];
__local int valid[BUFFER_SIZE];
......@@ -194,26 +194,26 @@ __kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBox
// Find the distance between the bounding boxes of the two cells.
#ifdef MAC_AMD_WORKAROUND
__global float* bc = (__global float*) blockCenter;
__global float* bb = (__global float*) blockBoundingBox;
float4 bcx = (float4) (bc[4*x], bc[4*x+1], bc[4*x+2], 0.0f);
float4 bcy = (float4) (bc[4*y], bc[4*y+1], bc[4*y+2], 0.0f);
float4 delta = bcx-bcy;
float4 boxSizea = (float4) (bb[4*x], bb[4*x+1], bb[4*x+2], 0.0f);
float4 boxSizeb = (float4) (bb[4*y], bb[4*y+1], bb[4*y+2], 0.0f);
__global real* bc = (__global real*) blockCenter;
__global real* bb = (__global real*) blockBoundingBox;
real4 bcx = (real4) (bc[4*x], bc[4*x+1], bc[4*x+2], 0);
real4 bcy = (real4) (bc[4*y], bc[4*y+1], bc[4*y+2], 0);
real4 delta = bcx-bcy;
real4 boxSizea = (real4) (bb[4*x], bb[4*x+1], bb[4*x+2], 0);
real4 boxSizeb = (real4) (bb[4*y], bb[4*y+1], bb[4*y+2], 0);
#else
float4 delta = blockCenter[x]-blockCenter[y];
float4 boxSizea = blockBoundingBox[x];
float4 boxSizeb = blockBoundingBox[y];
real4 delta = blockCenter[x]-blockCenter[y];
real4 boxSizea = blockBoundingBox[x];
real4 boxSizeb = blockBoundingBox[y];
#endif
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
delta.x = max(0.0f, fabs(delta.x)-boxSizea.x-boxSizeb.x);
delta.y = max(0.0f, fabs(delta.y)-boxSizea.y-boxSizeb.y);
delta.z = max(0.0f, fabs(delta.z)-boxSizea.z-boxSizeb.z);
delta.x = max((real) 0, fabs(delta.x)-boxSizea.x-boxSizeb.x);
delta.y = max((real) 0, fabs(delta.y)-boxSizea.y-boxSizeb.y);
delta.z = max((real) 0, fabs(delta.z)-boxSizea.z-boxSizeb.z);
if (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < cutoffSquared) {
// Add this tile to the buffer.
......@@ -241,8 +241,8 @@ __kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBox
* Compare each atom in one block to the bounding box of another block, and set
* flags for which ones are interacting.
*/
__kernel void findInteractionsWithinBlocks(float cutoffSquared, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global const float4* restrict posq, __global const ushort2* restrict tiles, __global const float4* restrict blockCenter,
__global const float4* restrict blockBoundingBox, __global unsigned int* restrict interactionFlags, __global const unsigned int* restrict interactionCount, __local volatile unsigned int* restrict flags, unsigned int maxTiles) {
__kernel void findInteractionsWithinBlocks(real cutoffSquared, real4 periodicBoxSize, real4 invPeriodicBoxSize, __global const real4* restrict posq, __global const ushort2* restrict tiles, __global const real4* restrict blockCenter,
__global const real4* restrict blockBoundingBox, __global unsigned int* restrict interactionFlags, __global const unsigned int* restrict interactionCount, __local volatile unsigned int* restrict flags, unsigned int maxTiles) {
unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
unsigned int warp = get_global_id(0)/TILE_SIZE;
unsigned int numTiles = interactionCount[0];
......@@ -253,7 +253,7 @@ __kernel void findInteractionsWithinBlocks(float cutoffSquared, float4 periodicB
if (numTiles > maxTiles)
return;
unsigned int lasty = 0xFFFFFFFF;
float4 apos;
real4 apos;
while (pos < end) {
// Extract the coordinates of this tile
ushort2 tileIndices = tiles[pos];
......@@ -266,20 +266,20 @@ __kernel void findInteractionsWithinBlocks(float cutoffSquared, float4 periodicB
else {
// Load the bounding box for x and the atom positions for y.
float4 center = blockCenter[x];
float4 boxSize = blockBoundingBox[x];
real4 center = blockCenter[x];
real4 boxSize = blockBoundingBox[x];
if (y != lasty)
apos = posq[y*TILE_SIZE+index];
// Find the distance of the atom from the bounding box.
float4 delta = apos-center;
real4 delta = apos-center;
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
delta = max((float4) 0.0f, fabs(delta)-boxSize);
delta = max((real4) 0, fabs(delta)-boxSize);
int thread = get_local_id(0);
flags[thread] = (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z > cutoffSquared ? 0 : 1 << index);
......
......@@ -8,19 +8,19 @@
/**
* Find a bounding box for the atoms in each block.
*/
__kernel void findBlockBounds(int numAtoms, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global const float4* restrict posq, __global float4* restrict blockCenter, __global float4* restrict blockBoundingBox, __global unsigned int* restrict interactionCount) {
__kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, __global const real4* restrict posq, __global real4* restrict blockCenter, __global real4* restrict blockBoundingBox, __global unsigned int* restrict interactionCount) {
int index = get_global_id(0);
int base = index*TILE_SIZE;
while (base < numAtoms) {
float4 pos = posq[base];
real4 pos = posq[base];
#ifdef USE_PERIODIC
pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
float4 firstPoint = pos;
real4 firstPoint = pos;
#endif
float4 minPos = pos;
float4 maxPos = pos;
real4 minPos = pos;
real4 maxPos = pos;
int last = min(base+TILE_SIZE, numAtoms);
for (int i = base+1; i < last; i++) {
pos = posq[i];
......@@ -46,14 +46,14 @@ __kernel void findBlockBounds(int numAtoms, float4 periodicBoxSize, float4 invPe
* to global memory.
*/
void storeInteractionData(ushort2* buffer, int numValid, __global unsigned int* interactionCount, __global ushort2* interactingTiles,
__global unsigned int* interactionFlags, float cutoffSquared, float4 periodicBoxSize, float4 invPeriodicBoxSize,
__global float4* posq, __global float4* blockCenter, __global float4* blockBoundingBox, unsigned int maxTiles) {
__global unsigned int* interactionFlags, real cutoffSquared, real4 periodicBoxSize, real4 invPeriodicBoxSize,
__global real4* posq, __global real4* blockCenter, __global real4* blockBoundingBox, unsigned int maxTiles) {
// Filter the list of tiles by comparing the distance from each atom to the other bounding box.
unsigned int flagsBuffer[2*BUFFER_SIZE];
float4 atomPositions[TILE_SIZE];
real4 atomPositions[TILE_SIZE];
int lasty = -1;
float4 centery, boxSizey;
real4 centery, boxSizey;
for (int tile = 0; tile < numValid; ) {
int x = buffer[tile].x;
int y = buffer[tile].y;
......@@ -64,8 +64,8 @@ void storeInteractionData(ushort2* buffer, int numValid, __global unsigned int*
// Load the atom positions and bounding boxes.
float4 centerx = blockCenter[x];
float4 boxSizex = blockBoundingBox[x];
real4 centerx = blockCenter[x];
real4 boxSizex = blockBoundingBox[x];
if (y != lasty) {
for (int atom = 0; atom < TILE_SIZE; atom++)
atomPositions[atom] = posq[y*TILE_SIZE+atom];
......@@ -78,18 +78,18 @@ void storeInteractionData(ushort2* buffer, int numValid, __global unsigned int*
unsigned int flags1 = 0, flags2 = 0;
for (int atom = 0; atom < TILE_SIZE; atom++) {
float4 delta = atomPositions[atom]-centerx;
real4 delta = atomPositions[atom]-centerx;
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
delta = max((float4) 0.0f, fabs(delta)-boxSizex);
delta = max((real4) 0, fabs(delta)-boxSizex);
if (dot(delta.xyz, delta.xyz) < cutoffSquared)
flags1 += 1 << atom;
delta = posq[x*TILE_SIZE+atom]-centery;
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
delta = max((float4) 0.0f, fabs(delta)-boxSizey);
delta = max((real4) 0, fabs(delta)-boxSizey);
if (dot(delta.xyz, delta.xyz) < cutoffSquared)
flags2 += 1 << atom;
}
......@@ -121,9 +121,9 @@ void storeInteractionData(ushort2* buffer, int numValid, __global unsigned int*
* Compare the bounding boxes for each pair of blocks. If they are sufficiently far apart,
* mark them as non-interacting.
*/
__kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global const float4* restrict blockCenter,
__global const float4* restrict blockBoundingBox, __global unsigned int* restrict interactionCount, __global ushort2* restrict interactingTiles,
__global unsigned int* restrict interactionFlags, __global const float4* restrict posq, unsigned int maxTiles, unsigned int startTileIndex,
__kernel void findBlocksWithInteractions(real cutoffSquared, real4 periodicBoxSize, real4 invPeriodicBoxSize, __global const real4* restrict blockCenter,
__global const real4* restrict blockBoundingBox, __global unsigned int* restrict interactionCount, __global ushort2* restrict interactingTiles,
__global unsigned int* restrict interactionFlags, __global const real4* restrict posq, unsigned int maxTiles, unsigned int startTileIndex,
unsigned int endTileIndex) {
ushort2 buffer[BUFFER_SIZE];
int valuesInBuffer = 0;
......@@ -142,17 +142,17 @@ __kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBox
// Find the distance between the bounding boxes of the two cells.
float4 delta = blockCenter[x]-blockCenter[y];
real4 delta = blockCenter[x]-blockCenter[y];
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float4 boxSizea = blockBoundingBox[x];
float4 boxSizeb = blockBoundingBox[y];
delta.x = max(0.0f, fabs(delta.x)-boxSizea.x-boxSizeb.x);
delta.y = max(0.0f, fabs(delta.y)-boxSizea.y-boxSizeb.y);
delta.z = max(0.0f, fabs(delta.z)-boxSizea.z-boxSizeb.z);
real4 boxSizea = blockBoundingBox[x];
real4 boxSizeb = blockBoundingBox[y];
delta.x = max((real) 0, fabs(delta.x)-boxSizea.x-boxSizeb.x);
delta.y = max((real) 0, fabs(delta.y)-boxSizea.y-boxSizeb.y);
delta.z = max((real) 0, fabs(delta.z)-boxSizea.z-boxSizeb.z);
if (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < cutoffSquared) {
// Add this tile to the buffer.
......
float2 angleParams = PARAMS[index];
float deltaIdeal = theta-angleParams.x;
real deltaIdeal = theta-angleParams.x;
energy += 0.5f*angleParams.y*deltaIdeal*deltaIdeal;
float dEdAngle = angleParams.y*deltaIdeal;
real dEdAngle = angleParams.y*deltaIdeal;
float2 bondParams = PARAMS[index];
float deltaIdeal = r-bondParams.x;
real deltaIdeal = r-bondParams.x;
energy += 0.5f * bondParams.y*deltaIdeal*deltaIdeal;
float dEdR = bondParams.y * deltaIdeal;
real dEdR = bondParams.y * deltaIdeal;
float4 exceptionParams = PARAMS[index];
float4 delta = pos2-pos1;
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = RSQRT(r2);
float sig2 = invR*exceptionParams.y;
real4 delta = pos2-pos1;
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real invR = RSQRT(r2);
real sig2 = invR*exceptionParams.y;
sig2 *= sig2;
float sig6 = sig2*sig2*sig2;
float dEdR = exceptionParams.z*(12.0f*sig6-6.0f)*sig6;
float tempEnergy = exceptionParams.z*(sig6-1.0f)*sig6;
real sig6 = sig2*sig2*sig2;
real dEdR = exceptionParams.z*(12.0f*sig6-6.0f)*sig6;
real tempEnergy = exceptionParams.z*(sig6-1.0f)*sig6;
dEdR += exceptionParams.x*invR;
dEdR *= invR*invR;
tempEnergy += exceptionParams.x*invR;
energy += tempEnergy;
delta.xyz *= dEdR;
float4 force1 = -delta;
float4 force2 = delta;
real4 force1 = -delta;
real4 force2 = delta;
#define TILE_SIZE 32
typedef struct {
float x, y, z;
float q;
float fx, fy, fz;
real x, y, z;
real q;
real fx, fy, fz;
ATOM_PARAMETER_DATA
} AtomData;
......@@ -11,11 +11,11 @@ typedef struct {
* Compute nonbonded interactions.
*/
__kernel void computeNonbonded(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const unsigned int* restrict exclusions,
__kernel void computeNonbonded(__global real4* restrict forceBuffers, __global real* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices,
unsigned int startTileIndex, unsigned int endTileIndex,
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
#else
unsigned int numTiles
#endif
......@@ -28,7 +28,7 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global
unsigned int pos = startTileIndex+get_group_id(0)*numTiles/get_num_groups(0);
unsigned int end = startTileIndex+(get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif
float energy = 0.0f;
real energy = 0;
unsigned int lasty = 0xFFFFFFFF;
__local AtomData localData[TILE_SIZE];
......@@ -71,7 +71,7 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global
if (lasty != y) {
for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
unsigned int j = y*TILE_SIZE + localAtomIndex;
float4 tempPosq = posq[j];
real4 tempPosq = posq[j];
localData[localAtomIndex].x = tempPosq.x;
localData[localAtomIndex].y = tempPosq.y;
localData[localAtomIndex].z = tempPosq.z;
......@@ -87,34 +87,34 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global
unsigned int excl = exclusions[exclusionIndex+tgx];
#endif
unsigned int atom1 = x*TILE_SIZE+tgx;
float4 force = 0.0f;
float4 posq1 = posq[atom1];
real4 force = 0;
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
float4 posq2 = (float4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
float r2 = dot(delta.xyz, delta.xyz);
real r2 = dot(delta.xyz, delta.xyz);
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
float invR = RSQRT(r2);
float r = RECIP(invR);
real invR = RSQRT(r2);
real r = RECIP(invR);
unsigned int atom2 = j;
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
#ifdef USE_SYMMETRIC
float dEdR = 0.0f;
real dEdR = 0;
#else
float4 dEdR1 = (float4) 0.0f;
float4 dEdR2 = (float4) 0.0f;
real4 dEdR1 = (real4) 0;
real4 dEdR2 = (real4) 0;
#endif
float tempEnergy = 0.0f;
real tempEnergy = 0;
COMPUTE_INTERACTION
energy += 0.5f*tempEnergy;
#ifdef USE_SYMMETRIC
......@@ -138,9 +138,9 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global
// This is an off-diagonal tile.
for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
localData[tgx].fx = 0.0f;
localData[tgx].fy = 0.0f;
localData[tgx].fz = 0.0f;
localData[tgx].fx = 0;
localData[tgx].fy = 0;
localData[tgx].fz = 0;
}
#ifdef USE_CUTOFF
unsigned int flags1 = (numTiles <= maxTiles ? interactionFlags[2*pos] : 0xFFFFFFFF);
......@@ -151,31 +151,31 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
if ((flags2&(1<<tgx)) != 0) {
unsigned int atom1 = x*TILE_SIZE+tgx;
float4 force = 0.0f;
float4 posq1 = posq[atom1];
real4 force = 0;
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags1&(1<<j)) != 0) {
bool isExcluded = false;
float4 posq2 = (float4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
float r2 = dot(delta.xyz, delta.xyz);
real r2 = dot(delta.xyz, delta.xyz);
if (r2 < CUTOFF_SQUARED) {
float invR = RSQRT(r2);
float r = RECIP(invR);
real invR = RSQRT(r2);
real r = RECIP(invR);
unsigned int atom2 = j;
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
#ifdef USE_SYMMETRIC
float dEdR = 0.0f;
real dEdR = 0;
#else
float4 dEdR1 = (float4) 0.0f;
float4 dEdR2 = (float4) 0.0f;
real4 dEdR1 = (real4) 0;
real4 dEdR2 = (real4) 0;
#endif
float tempEnergy = 0.0f;
real tempEnergy = 0;
COMPUTE_INTERACTION
energy += tempEnergy;
#ifdef USE_SYMMETRIC
......@@ -208,8 +208,8 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx;
float4 force = 0.0f;
float4 posq1 = posq[atom1];
real4 force = 0;
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex+tgx] : 0xFFFFFFFF);
......@@ -218,27 +218,27 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
float4 posq2 = (float4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
float r2 = dot(delta.xyz, delta.xyz);
real r2 = dot(delta.xyz, delta.xyz);
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
float invR = RSQRT(r2);
float r = RECIP(invR);
real invR = RSQRT(r2);
real r = RECIP(invR);
unsigned int atom2 = j;
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
#ifdef USE_SYMMETRIC
float dEdR = 0.0f;
real dEdR = 0;
#else
float4 dEdR1 = (float4) 0.0f;
float4 dEdR2 = (float4) 0.0f;
real4 dEdR1 = (real4) 0;
real4 dEdR2 = (real4) 0;
#endif
float tempEnergy = 0.0f;
real tempEnergy = 0;
COMPUTE_INTERACTION
energy += tempEnergy;
#ifdef USE_SYMMETRIC
......@@ -272,7 +272,7 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global
for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
float4 f = forceBuffers[offset];
real4 f = forceBuffers[offset];
f.x += localData[tgx].fx;
f.y += localData[tgx].fy;
f.z += localData[tgx].fz;
......
......@@ -10,16 +10,16 @@
// aligned which wastes space and causes LDS bank conflicts as stride is no
// longer odd DWORDS.
typedef struct {
float x, y, z;
} UnalignedFloat3;
real x, y, z;
} UnalignedReal3;
typedef struct {
float x, y, z;
float q;
float fx, fy, fz;
real x, y, z;
real q;
real fx, fy, fz;
ATOM_PARAMETER_DATA
#ifndef PARAMETER_SIZE_IS_EVEN
float padding;
real padding;
#endif
} AtomData;
......@@ -32,13 +32,13 @@ void computeNonbonded(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers,
#else
__global float4* restrict forceBuffers,
__global real4* restrict forceBuffers,
#endif
__global float* restrict energyBuffer, __global const float4* restrict posq, __global const unsigned int* restrict exclusions,
__global real* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices,
unsigned int startTileIndex, unsigned int endTileIndex,
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
#else
unsigned int numTiles
#endif
......@@ -51,10 +51,10 @@ void computeNonbonded(
unsigned int pos = startTileIndex+get_group_id(0)*numTiles/get_num_groups(0);
unsigned int end = startTileIndex+(get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif
float energy = 0.0f;
real energy = 0;
unsigned int lasty = 0xFFFFFFFF;
__local AtomData localData[TILE_SIZE];
__local UnalignedFloat3 localForce[FORCE_WORK_GROUP_SIZE];
__local UnalignedReal3 localForce[FORCE_WORK_GROUP_SIZE];
#ifdef USE_EXCLUSIONS
__local unsigned int exclusionRange[2];
__local int exclusionIndex[1];
......@@ -83,8 +83,8 @@ void computeNonbonded(
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int localForceOffset = get_local_id(0) & ~(TILE_SIZE-1);
unsigned int atom1 = x*TILE_SIZE + tgx;
float4 force = 0.0f;
float4 posq1 = posq[atom1];
real4 force = 0;
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
......@@ -121,25 +121,25 @@ void computeNonbonded(
bool isExcluded = !(excl & 0x1);
#endif
unsigned int atom2 = baseLocalAtom+j;
float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
real4 posq2 = (real4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = RSQRT(r2);
float r = RECIP(invR);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+baseLocalAtom+j;
#ifdef USE_SYMMETRIC
float dEdR = 0.0f;
real dEdR = 0;
#else
float4 dEdR1 = (float4) 0.0f;
float4 dEdR2 = (float4) 0.0f;
real4 dEdR1 = (real4) 0;
real4 dEdR2 = (real4) 0;
#endif
float tempEnergy = 0.0f;
real tempEnergy = 0;
COMPUTE_INTERACTION
energy += 0.5f*tempEnergy;
#ifdef USE_SYMMETRIC
......@@ -173,8 +173,8 @@ void computeNonbonded(
#else
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
// Cheaper to load/store float4 than float3.
float4 sum = forceBuffers[offset];
// Cheaper to load/store real4 than real3.
real4 sum = forceBuffers[offset];
sum.xyz += force.xyz;
forceBuffers[offset] = sum;
#endif
......@@ -187,16 +187,16 @@ void computeNonbonded(
if (lasty != y && get_local_id(0) < TILE_SIZE) {
const unsigned int localAtomIndex = tgx;
unsigned int j = y*TILE_SIZE + tgx;
float4 tempPosq = posq[j];
real4 tempPosq = posq[j];
localData[localAtomIndex].x = tempPosq.x;
localData[localAtomIndex].y = tempPosq.y;
localData[localAtomIndex].z = tempPosq.z;
localData[localAtomIndex].q = tempPosq.w;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
localForce[get_local_id(0)].x = 0.0f;
localForce[get_local_id(0)].y = 0.0f;
localForce[get_local_id(0)].z = 0.0f;
localForce[get_local_id(0)].x = 0;
localForce[get_local_id(0)].y = 0;
localForce[get_local_id(0)].z = 0;
barrier(CLK_LOCAL_MEM_FENCE);
// Compute the full set of interactions in this tile.
......@@ -210,26 +210,26 @@ void computeNonbonded(
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
float4 posq2 = (float4) (localData[tj].x, localData[tj].y, localData[tj].z, localData[tj].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
real4 posq2 = (real4) (localData[tj].x, localData[tj].y, localData[tj].z, localData[tj].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = RSQRT(r2);
float r = RECIP(invR);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real invR = RSQRT(r2);
real r = RECIP(invR);
int atom2 = tj;
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+tj;
#ifdef USE_SYMMETRIC
float dEdR = 0.0f;
real dEdR = 0;
#else
float4 dEdR1 = (float4) 0.0f;
float4 dEdR2 = (float4) 0.0f;
real4 dEdR1 = (real4) 0;
real4 dEdR2 = (real4) 0;
#endif
float tempEnergy = 0.0f;
real tempEnergy = 0;
COMPUTE_INTERACTION
energy += tempEnergy;
#ifdef USE_SYMMETRIC
......@@ -277,9 +277,9 @@ void computeNonbonded(
const unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
const unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
// Cheaper to load/store float4 than float3. Do all loads before all stores to minimize store-load waits.
float4 sum1 = forceBuffers[offset1];
float4 sum2 = forceBuffers[offset2];
// Cheaper to load/store real4 than real3. Do all loads before all stores to minimize store-load waits.
real4 sum1 = forceBuffers[offset1];
real4 sum2 = forceBuffers[offset2];
sum1.x += localData[tgx].fx + force.x;
sum1.y += localData[tgx].fy + force.y;
sum1.z += localData[tgx].fz + force.z;
......
......@@ -6,12 +6,12 @@
#define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE)
typedef struct {
float x, y, z;
float q;
float fx, fy, fz;
real x, y, z;
real q;
real fx, fy, fz;
ATOM_PARAMETER_DATA
#ifndef PARAMETER_SIZE_IS_EVEN
float padding;
real padding;
#endif
} AtomData;
......@@ -22,13 +22,13 @@ __kernel void computeNonbonded(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers,
#else
__global float4* restrict forceBuffers,
__global real4* restrict forceBuffers,
#endif
__global float* restrict energyBuffer, __global const float4* restrict posq, __global const unsigned int* restrict exclusions,
__global real* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices,
unsigned int startTileIndex, unsigned int endTileIndex,
#ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
#else
unsigned int numTiles
#endif
......@@ -43,9 +43,9 @@ __kernel void computeNonbonded(
unsigned int pos = startTileIndex+warp*numTiles/totalWarps;
unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps;
#endif
float energy = 0.0f;
real energy = 0;
__local AtomData localData[FORCE_WORK_GROUP_SIZE];
__local float tempBuffer[3*FORCE_WORK_GROUP_SIZE];
__local real tempBuffer[3*FORCE_WORK_GROUP_SIZE];
__local unsigned int exclusionRange[2*WARPS_PER_GROUP];
__local int exclusionIndex[WARPS_PER_GROUP];
__local int2* reservedBlocks = (__local int2*) exclusionRange;
......@@ -56,7 +56,7 @@ __kernel void computeNonbonded(
const unsigned int tbx = get_local_id(0) - tgx;
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
unsigned int x, y;
float4 force = 0.0f;
real4 force = 0;
if (pos < end) {
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
......@@ -75,7 +75,7 @@ __kernel void computeNonbonded(
}
}
unsigned int atom1 = x*TILE_SIZE + tgx;
float4 posq1 = posq[atom1];
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
......@@ -111,25 +111,25 @@ __kernel void computeNonbonded(
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = tbx+j;
float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
real4 posq2 = (real4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = RSQRT(r2);
float r = RECIP(invR);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
#ifdef USE_SYMMETRIC
float dEdR = 0.0f;
real dEdR = 0;
#else
float4 dEdR1 = (float4) 0.0f;
float4 dEdR2 = (float4) 0.0f;
real4 dEdR1 = (real4) 0;
real4 dEdR2 = (real4) 0;
#endif
float tempEnergy = 0.0f;
real tempEnergy = 0;
COMPUTE_INTERACTION
energy += 0.5f*tempEnergy;
#ifdef USE_SYMMETRIC
......@@ -147,15 +147,15 @@ __kernel void computeNonbonded(
const unsigned int localAtomIndex = get_local_id(0);
unsigned int j = y*TILE_SIZE + tgx;
float4 tempPosq = posq[j];
real4 tempPosq = posq[j];
localData[localAtomIndex].x = tempPosq.x;
localData[localAtomIndex].y = tempPosq.y;
localData[localAtomIndex].z = tempPosq.z;
localData[localAtomIndex].q = tempPosq.w;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData[localAtomIndex].fx = 0.0f;
localData[localAtomIndex].fy = 0.0f;
localData[localAtomIndex].fz = 0.0f;
localData[localAtomIndex].fx = 0;
localData[localAtomIndex].fy = 0;
localData[localAtomIndex].fz = 0;
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (!hasExclusions && flags != 0xFFFFFFFF) {
......@@ -171,25 +171,25 @@ __kernel void computeNonbonded(
int atom2 = tbx+j;
int bufferIndex = 3*get_local_id(0);
#ifdef USE_SYMMETRIC
float dEdR = 0.0f;
real dEdR = 0;
#else
float4 dEdR1 = (float4) 0.0f;
float4 dEdR2 = (float4) 0.0f;
real4 dEdR1 = (real4) 0;
real4 dEdR2 = (real4) 0;
#endif
float tempEnergy = 0.0f;
float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
real tempEnergy = 0;
real4 posq2 = (real4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
float invR = RSQRT(r2);
float r = RECIP(invR);
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
COMPUTE_INTERACTION
......@@ -241,28 +241,28 @@ __kernel void computeNonbonded(
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = tbx+tj;
float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
real4 posq2 = (real4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
float invR = RSQRT(r2);
float r = RECIP(invR);
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+tj;
#ifdef USE_SYMMETRIC
float dEdR = 0.0f;
real dEdR = 0;
#else
float4 dEdR1 = (float4) 0.0f;
float4 dEdR2 = (float4) 0.0f;
real4 dEdR1 = (real4) 0;
real4 dEdR2 = (real4) 0;
#endif
float tempEnergy = 0.0f;
real tempEnergy = 0;
COMPUTE_INTERACTION
energy += tempEnergy;
#ifdef USE_SYMMETRIC
......@@ -347,7 +347,7 @@ __kernel void computeNonbonded(
}
if (writeY > -1) {
const unsigned int offset = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset] += (float4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0.0f);
forceBuffers[offset] += (real4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0);
}
done = true;
if (tgx == 0)
......
float4 torsionParams = PARAMS[index];
float deltaAngle = torsionParams.z*theta-torsionParams.y;
real deltaAngle = torsionParams.z*theta-torsionParams.y;
energy += torsionParams.x*(1.0f+cos(deltaAngle));
float sinDeltaAngle = sin(deltaAngle);
float dEdAngle = -torsionParams.x*torsionParams.z*sinDeltaAngle;
real sinDeltaAngle = sin(deltaAngle);
real dEdAngle = -torsionParams.x*torsionParams.z*sinDeltaAngle;
__kernel void updateBsplines(__global const float4* restrict posq, __global float4* restrict pmeBsplineTheta, __local float4* restrict bsplinesCache,
__global int2* restrict pmeAtomGridIndex, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
const float4 scale = 1.0f/(PME_ORDER-1);
__kernel void updateBsplines(__global const real4* restrict posq, __global real4* restrict pmeBsplineTheta, __local real4* restrict bsplinesCache,
__global int2* restrict pmeAtomGridIndex, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
const real4 scale = 1/(real) (PME_ORDER-1);
for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) {
__local float4* data = &bsplinesCache[get_local_id(0)*PME_ORDER];
float4 pos = posq[i];
__local real4* data = &bsplinesCache[get_local_id(0)*PME_ORDER];
real4 pos = posq[i];
pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
float4 t = (float4) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
real4 t = (real4) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
(pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
(pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z, 0.0f);
float4 dr = (float4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f);
real4 dr = (real4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f);
int4 gridIndex = (int4) (((int) t.x) % GRID_SIZE_X,
((int) t.y) % GRID_SIZE_Y,
((int) t.z) % GRID_SIZE_Z, 0);
......@@ -19,15 +19,15 @@ __kernel void updateBsplines(__global const float4* restrict posq, __global floa
data[1] = dr;
data[0] = 1.0f-dr;
for (int j = 3; j < PME_ORDER; j++) {
float div = 1.0f/(j-1.0f);
real div = RECIP(j-1.0f);
data[j-1] = div*dr*data[j-2];
for (int k = 1; k < (j-1); k++)
data[j-k-1] = div*((dr+(float4) k) *data[j-k-2] + (-dr+(float4) (j-k))*data[j-k-1]);
data[j-k-1] = div*((dr+(real4) k) *data[j-k-2] + (-dr+(real4) (j-k))*data[j-k-1]);
data[0] = div*(- dr+1.0f)*data[0];
}
data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
for (int j = 1; j < (PME_ORDER-1); j++)
data[PME_ORDER-j-1] = scale*((dr+(float4) j)*data[PME_ORDER-j-2] + (-dr+(float4) (PME_ORDER-j))*data[PME_ORDER-j-1]);
data[PME_ORDER-j-1] = scale*((dr+(real4) j)*data[PME_ORDER-j-2] + (-dr+(real4) (PME_ORDER-j))*data[PME_ORDER-j-1]);
data[0] = scale*(-dr+1.0f)*data[0];
for (int j = 0; j < PME_ORDER; j++) {
data[j].w = pos.w; // Storing the charge here improves cache coherency in the charge spreading kernel
......@@ -39,7 +39,7 @@ __kernel void updateBsplines(__global const float4* restrict posq, __global floa
/**
* For each grid point, find the range of sorted atoms associated with that point.
*/
__kernel void findAtomRangeForGrid(__global int2* restrict pmeAtomGridIndex, __global int* restrict pmeAtomRange, __global const float4* restrict posq, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
__kernel void findAtomRangeForGrid(__global int2* restrict pmeAtomGridIndex, __global int* restrict pmeAtomRange, __global const real4* restrict posq, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
int start = (NUM_ATOMS*get_global_id(0))/get_global_size(0);
int end = (NUM_ATOMS*(get_global_id(0)+1))/get_global_size(0);
int last = (start == 0 ? -1 : pmeAtomGridIndex[start-1].y);
......@@ -66,11 +66,11 @@ __kernel void findAtomRangeForGrid(__global int2* restrict pmeAtomGridIndex, __g
* The grid index won't be needed again. Reuse that component to hold the z index, thus saving
* some work in the charge spreading kernel.
*/
__kernel void recordZIndex(__global int2* restrict pmeAtomGridIndex, __global const float4* restrict posq, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
__kernel void recordZIndex(__global int2* restrict pmeAtomGridIndex, __global const real4* restrict posq, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
int start = (NUM_ATOMS*get_global_id(0))/get_global_size(0);
int end = (NUM_ATOMS*(get_global_id(0)+1))/get_global_size(0);
for (int i = start; i < end; ++i) {
float posz = posq[pmeAtomGridIndex[i].x].z;
real posz = posq[pmeAtomGridIndex[i].x].z;
posz -= floor(posz*invPeriodicBoxSize.z)*periodicBoxSize.z;
int z = ((int) ((posz*invPeriodicBoxSize.z)*GRID_SIZE_Z)) % GRID_SIZE_Z;
pmeAtomGridIndex[i].y = z;
......@@ -83,14 +83,14 @@ __kernel void recordZIndex(__global int2* restrict pmeAtomGridIndex, __global co
#define BUFFER_SIZE (PME_ORDER*PME_ORDER*PME_ORDER)
__kernel __attribute__((reqd_work_group_size(BUFFER_SIZE, 1, 1)))
void gridSpreadCharge(__global const float4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
__global long* restrict pmeGrid, __global const float4* restrict pmeBsplineTheta, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
__global long* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
int ix = get_local_id(0)/(PME_ORDER*PME_ORDER);
int remainder = get_local_id(0)-ix*PME_ORDER*PME_ORDER;
int iy = remainder/PME_ORDER;
int iz = remainder-iy*PME_ORDER;
__local float4 theta[PME_ORDER];
__local float charge[BUFFER_SIZE];
__local real4 theta[PME_ORDER];
__local real charge[BUFFER_SIZE];
__local int basex[BUFFER_SIZE];
__local int basey[BUFFER_SIZE];
__local int basez[BUFFER_SIZE];
......@@ -101,7 +101,7 @@ void gridSpreadCharge(__global const float4* restrict posq, __global const int2*
if (get_local_id(0) < BUFFER_SIZE) {
int atomIndex = baseIndex+get_local_id(0);
if (atomIndex < NUM_ATOMS) {
float4 pos = posq[atomIndex];
real4 pos = posq[atomIndex];
charge[get_local_id(0)] = pos.w;
pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
......@@ -118,32 +118,40 @@ void gridSpreadCharge(__global const float4* restrict posq, __global const int2*
if (get_local_id(0) < PME_ORDER)
theta[get_local_id(0)] = pmeBsplineTheta[atomIndex+get_local_id(0)*NUM_ATOMS];
barrier(CLK_LOCAL_MEM_FENCE);
float add = charge[index]*theta[ix].x*theta[iy].y*theta[iz].z;
real add = charge[index]*theta[ix].x*theta[iy].y*theta[iz].z;
int x = basex[index]+ix;
int y = basey[index]+iy;
int z = basez[index]+iz;
x -= (x >= GRID_SIZE_X ? GRID_SIZE_X : 0);
y -= (y >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
z -= (z >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
#ifdef USE_DOUBLE_PRECISION
atom_add(&pmeGrid[2*(x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z)], (long) (add*0xFFFFFFFF));
#else
atom_add(&pmeGrid[x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z], (long) (add*0xFFFFFFFF));
#endif
}
}
}
}
__kernel void finishSpreadCharge(__global long* restrict pmeGrid) {
__global float2* floatGrid = (__global float2*) pmeGrid;
__global real2* realGrid = (__global real2*) pmeGrid;
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
float scale = EPSILON_FACTOR/(float) 0xFFFFFFFF;
real scale = EPSILON_FACTOR/(real) 0xFFFFFFFF;
for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
#ifdef USE_DOUBLE_PRECISION
long value = pmeGrid[2*index];
#else
long value = pmeGrid[index];
float2 floatValue = (float2) ((float) (value*scale), 0.0f);
floatGrid[index] = floatValue;
#endif
real2 realValue = (real2) ((real) (value*scale), 0);
realGrid[index] = realValue;
}
}
#else
__kernel void gridSpreadCharge(__global const float4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
__global float2* restrict pmeGrid, __global const float4* restrict pmeBsplineTheta) {
__kernel void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
__global real2* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta) {
unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
for (int gridIndex = get_global_id(0); gridIndex < numGridPoints; gridIndex += get_global_size(0)) {
// Compute the charge on a grid point.
......@@ -153,7 +161,7 @@ __kernel void gridSpreadCharge(__global const float4* restrict posq, __global co
int remainder = gridIndex-gridPoint.x*GRID_SIZE_Y*GRID_SIZE_Z;
gridPoint.y = remainder/GRID_SIZE_Z;
gridPoint.z = remainder-gridPoint.y*GRID_SIZE_Z;
float result = 0.0f;
real result = 0.0f;
// Loop over all atoms that affect this grid point.
......@@ -174,7 +182,7 @@ __kernel void gridSpreadCharge(__global const float4* restrict posq, __global co
int atomIndex = atomData.x;
int z = atomData.y;
int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
float atomCharge = pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].w;
real atomCharge = pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].w;
result += atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z;
}
if (z1 > gridPoint.z)
......@@ -189,21 +197,21 @@ __kernel void gridSpreadCharge(__global const float4* restrict posq, __global co
int atomIndex = atomData.x;
int z = atomData.y;
int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
float atomCharge = pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].w;
real atomCharge = pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].w;
result += atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z;
}
}
}
}
pmeGrid[gridIndex] = (float2) (result*EPSILON_FACTOR, 0.0f);
pmeGrid[gridIndex] = (real2) (result*EPSILON_FACTOR, 0);
}
}
#endif
__kernel void reciprocalConvolution(__global float2* restrict pmeGrid, __global float* restrict energyBuffer, __global const float* restrict pmeBsplineModuliX,
__global const float* restrict pmeBsplineModuliY, __global const float* restrict pmeBsplineModuliZ, float4 invPeriodicBoxSize, float recipScaleFactor) {
__kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global real* restrict energyBuffer, __global const real* restrict pmeBsplineModuliX,
__global const real* restrict pmeBsplineModuliY, __global const real* restrict pmeBsplineModuliZ, real4 invPeriodicBoxSize, real recipScaleFactor) {
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
float energy = 0.0f;
real energy = 0.0f;
for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
int kx = index/(GRID_SIZE_Y*GRID_SIZE_Z);
int remainder = index-kx*GRID_SIZE_Y*GRID_SIZE_Z;
......@@ -214,36 +222,36 @@ __kernel void reciprocalConvolution(__global float2* restrict pmeGrid, __global
int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
float mhx = mx*invPeriodicBoxSize.x;
float mhy = my*invPeriodicBoxSize.y;
float mhz = mz*invPeriodicBoxSize.z;
float bx = pmeBsplineModuliX[kx];
float by = pmeBsplineModuliY[ky];
float bz = pmeBsplineModuliZ[kz];
float2 grid = pmeGrid[index];
float m2 = mhx*mhx+mhy*mhy+mhz*mhz;
float denom = m2*bx*by*bz;
float eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
pmeGrid[index] = (float2) (grid.x*eterm, grid.y*eterm);
real mhx = mx*invPeriodicBoxSize.x;
real mhy = my*invPeriodicBoxSize.y;
real mhz = mz*invPeriodicBoxSize.z;
real bx = pmeBsplineModuliX[kx];
real by = pmeBsplineModuliY[ky];
real bz = pmeBsplineModuliZ[kz];
real2 grid = pmeGrid[index];
real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
real denom = m2*bx*by*bz;
real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
pmeGrid[index] = (real2) (grid.x*eterm, grid.y*eterm);
energy += eterm*(grid.x*grid.x + grid.y*grid.y);
}
energyBuffer[get_global_id(0)] += 0.5f*energy;
}
__kernel void gridInterpolateForce(__global const float4* restrict posq, __global float4* restrict forceBuffers, __global const float2* restrict pmeGrid,
float4 periodicBoxSize, float4 invPeriodicBoxSize, __local float4* restrict bsplinesCache) {
const float4 scale = 1.0f/(PME_ORDER-1);
__local float4* data = &bsplinesCache[get_local_id(0)*PME_ORDER];
__local float4* ddata = &bsplinesCache[get_local_id(0)*PME_ORDER + get_local_size(0)*PME_ORDER];
__kernel void gridInterpolateForce(__global const real4* restrict posq, __global real4* restrict forceBuffers, __global const real2* restrict pmeGrid,
real4 periodicBoxSize, real4 invPeriodicBoxSize, __local real4* restrict bsplinesCache) {
const real4 scale = 1/(real) (PME_ORDER-1);
__local real4* data = &bsplinesCache[get_local_id(0)*PME_ORDER];
__local real4* ddata = &bsplinesCache[get_local_id(0)*PME_ORDER + get_local_size(0)*PME_ORDER];
for (int atom = get_global_id(0); atom < NUM_ATOMS; atom += get_global_size(0)) {
float4 force = 0.0f;
float4 pos = posq[atom];
real4 force = 0.0f;
real4 pos = posq[atom];
pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
float4 t = (float4) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
(pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
(pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z, 0.0f);
real4 t = (real4) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
(pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
(pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z, 0.0f);
int4 gridIndex = (int4) (((int) t.x) % GRID_SIZE_X,
((int) t.y) % GRID_SIZE_Y,
((int) t.z) % GRID_SIZE_Z, 0);
......@@ -251,15 +259,15 @@ __kernel void gridInterpolateForce(__global const float4* restrict posq, __globa
// Since we need the full set of thetas, it's faster to compute them here than load them
// from global memory.
float4 dr = (float4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f);
real4 dr = (real4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f);
data[PME_ORDER-1] = 0.0f;
data[1] = dr;
data[0] = 1.0f-dr;
for (int j = 3; j < PME_ORDER; j++) {
float div = 1.0f/(j-1.0f);
real div = RECIP(j-1.0f);
data[j-1] = div*dr*data[j-2];
for (int k = 1; k < (j-1); k++)
data[j-k-1] = div*((dr+(float4) k) *data[j-k-2] + (-dr+(float4) (j-k))*data[j-k-1]);
data[j-k-1] = div*((dr+(real4) k) *data[j-k-2] + (-dr+(real4) (j-k))*data[j-k-1]);
data[0] = div*(- dr+1.0f)*data[0];
}
ddata[0] = -data[0];
......@@ -267,7 +275,7 @@ __kernel void gridInterpolateForce(__global const float4* restrict posq, __globa
ddata[j] = data[j-1]-data[j];
data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
for (int j = 1; j < (PME_ORDER-1); j++)
data[PME_ORDER-j-1] = scale*((dr+(float4) j)*data[PME_ORDER-j-2] + (-dr+(float4) (PME_ORDER-j))*data[PME_ORDER-j-1]);
data[PME_ORDER-j-1] = scale*((dr+(real4) j)*data[PME_ORDER-j-2] + (-dr+(real4) (PME_ORDER-j))*data[PME_ORDER-j-1]);
data[0] = scale*(-dr+1.0f)*data[0];
// Compute the force on this atom.
......@@ -282,7 +290,7 @@ __kernel void gridInterpolateForce(__global const float4* restrict posq, __globa
int zindex = gridIndex.z+iz;
zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
float gridvalue = pmeGrid[index].x;
real gridvalue = pmeGrid[index].x;
force.x += ddata[ix].x*data[iy].y*data[iz].z*gridvalue;
force.y += data[ix].x*ddata[iy].y*data[iz].z*gridvalue;
#ifndef MAC_AMD_WORKAROUND
......@@ -302,14 +310,14 @@ __kernel void gridInterpolateForce(__global const float4* restrict posq, __globa
int zindex = gridIndex.z+iz;
zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
float gridvalue = pmeGrid[index].x;
real gridvalue = pmeGrid[index].x;
force.z += data[ix].x*data[iy].y*ddata[iz].z*gridvalue;
}
}
}
#endif
float4 totalForce = forceBuffers[atom];
float q = pos.w*EPSILON_FACTOR;
real4 totalForce = forceBuffers[atom];
real q = pos.w*EPSILON_FACTOR;
totalForce.x -= q*force.x*GRID_SIZE_X*invPeriodicBoxSize.x;
totalForce.y -= q*force.y*GRID_SIZE_Y*invPeriodicBoxSize.y;
totalForce.z -= q*force.z*GRID_SIZE_Z*invPeriodicBoxSize.z;
......
......@@ -4,9 +4,9 @@ if (theta < 0.0f)
else
theta -= PI;
cosangle = -cosangle;
float cosFactor = cosangle;
float dEdAngle = -torsionParams.s1;
float rbEnergy = torsionParams.s0;
real cosFactor = cosangle;
real dEdAngle = -torsionParams.s1;
real rbEnergy = torsionParams.s0;
rbEnergy += torsionParams.s1*cosFactor;
dEdAngle -= 2.0f*torsionParams.s2*cosFactor;
cosFactor *= cosangle;
......
const float PI = 3.14159265358979323846f;
float4 v0 = (float4) (pos1.xyz-pos2.xyz, 0.0f);
float4 v1 = (float4) (pos3.xyz-pos2.xyz, 0.0f);
float4 v2 = (float4) (pos3.xyz-pos4.xyz, 0.0f);
float4 cp0 = cross(v0, v1);
float4 cp1 = cross(v1, v2);
float cosangle = dot(normalize(cp0), normalize(cp1));
float theta;
const real PI = 3.14159265358979323846f;
real4 v0 = (real4) (pos1.xyz-pos2.xyz, 0.0f);
real4 v1 = (real4) (pos3.xyz-pos2.xyz, 0.0f);
real4 v2 = (real4) (pos3.xyz-pos4.xyz, 0.0f);
real4 cp0 = cross(v0, v1);
real4 cp1 = cross(v1, v2);
real cosangle = dot(normalize(cp0), normalize(cp1));
real theta;
if (cosangle > 0.99f || cosangle < -0.99f) {
// We're close to the singularity in acos(), so take the cross product and use asin() instead.
float4 cross_prod = cross(cp0, cp1);
float scale = dot(cp0, cp0)*dot(cp1, cp1);
real4 cross_prod = cross(cp0, cp1);
real scale = dot(cp0, cp0)*dot(cp1, cp1);
theta = asin(sqrt(dot(cross_prod, cross_prod)/scale));
if (cosangle < 0.0f)
if (cosangle < 0)
theta = PI-theta;
}
else
theta = acos(cosangle);
theta = (dot(v0, cp1) >= 0 ? theta : -theta);
COMPUTE_FORCE
float normCross1 = dot(cp0, cp0);
float normSqrBC = dot(v1, v1);
float normBC = sqrt(normSqrBC);
float normCross2 = dot(cp1, cp1);
float dp = 1.0f/normSqrBC;
float4 ff = (float4) ((-dEdAngle*normBC)/normCross1, dot(v0, v1)*dp, dot(v2, v1)*dp, (dEdAngle*normBC)/normCross2);
float4 force1 = ff.x*cp0;
float4 force4 = ff.w*cp1;
float4 s = ff.y*force1 - ff.z*force4;
float4 force2 = s-force1;
float4 force3 = -s-force4;
real normCross1 = dot(cp0, cp0);
real normSqrBC = dot(v1, v1);
real normBC = sqrt(normSqrBC);
real normCross2 = dot(cp1, cp1);
real dp = 1.0f/normSqrBC;
real4 ff = (real4) ((-dEdAngle*normBC)/normCross1, dot(v0, v1)*dp, dot(v2, v1)*dp, (dEdAngle*normBC)/normCross2);
real4 force1 = ff.x*cp0;
real4 force4 = ff.w*cp1;
real4 s = ff.y*force1 - ff.z*force4;
real4 force2 = s-force1;
real4 force3 = -s-force4;
......@@ -69,11 +69,11 @@ __kernel void clearSixBuffers(__global int* restrict buffer1, int size1, __globa
* Sum a collection of buffers into the first one.
*/
__kernel void reduceFloat4Buffer(__global float4* restrict buffer, int bufferSize, int numBuffers) {
__kernel void reduceReal4Buffer(__global real4* restrict buffer, int bufferSize, int numBuffers) {
int index = get_global_id(0);
int totalSize = bufferSize*numBuffers;
while (index < bufferSize) {
float4 sum = buffer[index];
real4 sum = buffer[index];
for (int i = index+bufferSize; i < totalSize; i += bufferSize)
sum += buffer[i];
buffer[index] = sum;
......@@ -84,11 +84,11 @@ __kernel void reduceFloat4Buffer(__global float4* restrict buffer, int bufferSiz
/**
* Sum the various buffers containing forces.
*/
__kernel void reduceForces(__global const long* restrict longBuffer, __global float4* restrict buffer, int bufferSize, int numBuffers) {
__kernel void reduceForces(__global const long* restrict longBuffer, __global real4* restrict buffer, int bufferSize, int numBuffers) {
int totalSize = bufferSize*numBuffers;
float scale = 1.0f/(float) 0xFFFFFFFF;
real scale = 1/(real) 0xFFFFFFFF;
for (int index = get_global_id(0); index < bufferSize; index += get_global_size(0)) {
float4 sum = (float4) (scale*longBuffer[index], scale*longBuffer[index+bufferSize], scale*longBuffer[index+2*bufferSize], 0.0f);
real4 sum = (real4) (scale*longBuffer[index], scale*longBuffer[index+bufferSize], scale*longBuffer[index+2*bufferSize], 0);
for (int i = index; i < totalSize; i += bufferSize)
sum += buffer[i];
buffer[index] = sum;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment