Commit c8dac206 authored by Peter Eastman's avatar Peter Eastman
Browse files

Continuing to implement double precision in OpenCL

parent 34938e2c
/** /**
* Compute the difference between two vectors, setting the fourth component to the squared magnitude. * Compute the difference between two vectors, setting the fourth component to the squared magnitude.
*/ */
float4 delta(float4 vec1, float4 vec2) { real4 delta(real4 vec1, real4 vec2) {
float4 result = (float4) (vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0.0f); real4 result = (real4) (vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0);
result.w = result.x*result.x + result.y*result.y + result.z*result.z; result.w = result.x*result.x + result.y*result.y + result.z*result.z;
return result; return result;
} }
...@@ -11,8 +11,8 @@ float4 delta(float4 vec1, float4 vec2) { ...@@ -11,8 +11,8 @@ float4 delta(float4 vec1, float4 vec2) {
* Compute the difference between two vectors, taking periodic boundary conditions into account * Compute the difference between two vectors, taking periodic boundary conditions into account
* and setting the fourth component to the squared magnitude. * and setting the fourth component to the squared magnitude.
*/ */
float4 deltaPeriodic(float4 vec1, float4 vec2, float4 periodicBoxSize, float4 invPeriodicBoxSize) { real4 deltaPeriodic(real4 vec1, real4 vec2, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
float4 result = (float4) (vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0.0f); real4 result = (real4) (vec1.x-vec2.x, vec1.y-vec2.y, vec1.z-vec2.z, 0);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
result.x -= floor(result.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; result.x -= floor(result.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
result.y -= floor(result.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; result.y -= floor(result.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
...@@ -25,15 +25,15 @@ float4 deltaPeriodic(float4 vec1, float4 vec2, float4 periodicBoxSize, float4 in ...@@ -25,15 +25,15 @@ float4 deltaPeriodic(float4 vec1, float4 vec2, float4 periodicBoxSize, float4 in
/** /**
* Compute the angle between two vectors. The w component of each vector should contain the squared magnitude. * Compute the angle between two vectors. The w component of each vector should contain the squared magnitude.
*/ */
float computeAngle(float4 vec1, float4 vec2) { real computeAngle(real4 vec1, real4 vec2) {
float dotProduct = vec1.x*vec2.x + vec1.y*vec2.y + vec1.z*vec2.z; real dotProduct = vec1.x*vec2.x + vec1.y*vec2.y + vec1.z*vec2.z;
float cosine = dotProduct*RSQRT(vec1.w*vec2.w); real cosine = dotProduct*RSQRT(vec1.w*vec2.w);
float angle; real angle;
if (cosine > 0.99f || cosine < -0.99f) { if (cosine > 0.99f || cosine < -0.99f) {
// We're close to the singularity in acos(), so take the cross product and use asin() instead. // We're close to the singularity in acos(), so take the cross product and use asin() instead.
float4 crossProduct = cross(vec1, vec2); real4 crossProduct = cross(vec1, vec2);
float scale = vec1.w*vec2.w; real scale = vec1.w*vec2.w;
angle = asin(SQRT(dot(crossProduct, crossProduct)/scale)); angle = asin(SQRT(dot(crossProduct, crossProduct)/scale));
if (cosine < 0.0f) if (cosine < 0.0f)
angle = PI-angle; angle = PI-angle;
...@@ -46,8 +46,8 @@ float computeAngle(float4 vec1, float4 vec2) { ...@@ -46,8 +46,8 @@ float computeAngle(float4 vec1, float4 vec2) {
/** /**
* Compute the cross product of two vectors, setting the fourth component to the squared magnitude. * Compute the cross product of two vectors, setting the fourth component to the squared magnitude.
*/ */
float4 computeCross(float4 vec1, float4 vec2) { real4 computeCross(real4 vec1, real4 vec2) {
float4 result = cross(vec1, vec2); real4 result = cross(vec1, vec2);
result.w = result.x*result.x + result.y*result.y + result.z*result.z; result.w = result.x*result.x + result.y*result.y + result.z*result.z;
return result; return result;
} }
...@@ -55,24 +55,24 @@ float4 computeCross(float4 vec1, float4 vec2) { ...@@ -55,24 +55,24 @@ float4 computeCross(float4 vec1, float4 vec2) {
/** /**
* Compute forces on donors. * Compute forces on donors.
*/ */
__kernel void computeDonorForces(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const int4* restrict exclusions, __kernel void computeDonorForces(__global real4* restrict forceBuffers, __global real* restrict energyBuffer, __global const real4* restrict posq, __global const int4* restrict exclusions,
__global const int4* restrict donorAtoms, __global const int4* restrict acceptorAtoms, __global const int4* restrict donorBufferIndices, __local float4* posBuffer, float4 periodicBoxSize, float4 invPeriodicBoxSize __global const int4* restrict donorAtoms, __global const int4* restrict acceptorAtoms, __global const int4* restrict donorBufferIndices, __local real4* posBuffer, real4 periodicBoxSize, real4 invPeriodicBoxSize
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
float energy = 0.0f; real energy = 0;
float4 f1 = (float4) 0; real4 f1 = (real4) 0;
float4 f2 = (float4) 0; real4 f2 = (real4) 0;
float4 f3 = (float4) 0; real4 f3 = (real4) 0;
for (int donorStart = 0; donorStart < NUM_DONORS; donorStart += get_global_size(0)) { for (int donorStart = 0; donorStart < NUM_DONORS; donorStart += get_global_size(0)) {
// Load information about the donor this thread will compute forces on. // Load information about the donor this thread will compute forces on.
int donorIndex = donorStart+get_global_id(0); int donorIndex = donorStart+get_global_id(0);
int4 atoms, exclusionIndices; int4 atoms, exclusionIndices;
float4 d1, d2, d3; real4 d1, d2, d3;
if (donorIndex < NUM_DONORS) { if (donorIndex < NUM_DONORS) {
atoms = donorAtoms[donorIndex]; atoms = donorAtoms[donorIndex];
d1 = (atoms.x > -1 ? posq[atoms.x] : (float4) 0); d1 = (atoms.x > -1 ? posq[atoms.x] : (real4) 0);
d2 = (atoms.y > -1 ? posq[atoms.y] : (float4) 0); d2 = (atoms.y > -1 ? posq[atoms.y] : (real4) 0);
d3 = (atoms.z > -1 ? posq[atoms.z] : (float4) 0); d3 = (atoms.z > -1 ? posq[atoms.z] : (real4) 0);
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
exclusionIndices = exclusions[donorIndex]; exclusionIndices = exclusions[donorIndex];
#endif #endif
...@@ -85,9 +85,9 @@ __kernel void computeDonorForces(__global float4* restrict forceBuffers, __globa ...@@ -85,9 +85,9 @@ __kernel void computeDonorForces(__global float4* restrict forceBuffers, __globa
int blockSize = min((int) get_local_size(0), NUM_ACCEPTORS-acceptorStart); int blockSize = min((int) get_local_size(0), NUM_ACCEPTORS-acceptorStart);
if (get_local_id(0) < blockSize) { if (get_local_id(0) < blockSize) {
int4 atoms2 = acceptorAtoms[acceptorStart+get_local_id(0)]; int4 atoms2 = acceptorAtoms[acceptorStart+get_local_id(0)];
posBuffer[3*get_local_id(0)] = (atoms2.x > -1 ? posq[atoms2.x] : (float4) 0); posBuffer[3*get_local_id(0)] = (atoms2.x > -1 ? posq[atoms2.x] : (real4) 0);
posBuffer[3*get_local_id(0)+1] = (atoms2.y > -1 ? posq[atoms2.y] : (float4) 0); posBuffer[3*get_local_id(0)+1] = (atoms2.y > -1 ? posq[atoms2.y] : (real4) 0);
posBuffer[3*get_local_id(0)+2] = (atoms2.z > -1 ? posq[atoms2.z] : (float4) 0); posBuffer[3*get_local_id(0)+2] = (atoms2.z > -1 ? posq[atoms2.z] : (real4) 0);
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (donorIndex < NUM_DONORS) { if (donorIndex < NUM_DONORS) {
...@@ -99,10 +99,10 @@ __kernel void computeDonorForces(__global float4* restrict forceBuffers, __globa ...@@ -99,10 +99,10 @@ __kernel void computeDonorForces(__global float4* restrict forceBuffers, __globa
#endif #endif
// Compute the interaction between a donor and an acceptor. // Compute the interaction between a donor and an acceptor.
float4 a1 = posBuffer[3*index]; real4 a1 = posBuffer[3*index];
float4 a2 = posBuffer[3*index+1]; real4 a2 = posBuffer[3*index+1];
float4 a3 = posBuffer[3*index+2]; real4 a3 = posBuffer[3*index+2];
float4 deltaD1A1 = deltaPeriodic(d1, a1, periodicBoxSize, invPeriodicBoxSize); real4 deltaD1A1 = deltaPeriodic(d1, a1, periodicBoxSize, invPeriodicBoxSize);
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (deltaD1A1.w < CUTOFF_SQUARED) { if (deltaD1A1.w < CUTOFF_SQUARED) {
#endif #endif
...@@ -120,19 +120,19 @@ __kernel void computeDonorForces(__global float4* restrict forceBuffers, __globa ...@@ -120,19 +120,19 @@ __kernel void computeDonorForces(__global float4* restrict forceBuffers, __globa
int4 bufferIndices = donorBufferIndices[donorIndex]; int4 bufferIndices = donorBufferIndices[donorIndex];
if (atoms.x > -1) { if (atoms.x > -1) {
unsigned int offset = atoms.x+bufferIndices.x*PADDED_NUM_ATOMS; unsigned int offset = atoms.x+bufferIndices.x*PADDED_NUM_ATOMS;
float4 force = forceBuffers[offset]; real4 force = forceBuffers[offset];
force.xyz += f1.xyz; force.xyz += f1.xyz;
forceBuffers[offset] = force; forceBuffers[offset] = force;
} }
if (atoms.y > -1) { if (atoms.y > -1) {
unsigned int offset = atoms.y+bufferIndices.y*PADDED_NUM_ATOMS; unsigned int offset = atoms.y+bufferIndices.y*PADDED_NUM_ATOMS;
float4 force = forceBuffers[offset]; real4 force = forceBuffers[offset];
force.xyz += f2.xyz; force.xyz += f2.xyz;
forceBuffers[offset] = force; forceBuffers[offset] = force;
} }
if (atoms.z > -1) { if (atoms.z > -1) {
unsigned int offset = atoms.z+bufferIndices.z*PADDED_NUM_ATOMS; unsigned int offset = atoms.z+bufferIndices.z*PADDED_NUM_ATOMS;
float4 force = forceBuffers[offset]; real4 force = forceBuffers[offset];
force.xyz += f3.xyz; force.xyz += f3.xyz;
forceBuffers[offset] = force; forceBuffers[offset] = force;
} }
...@@ -143,23 +143,23 @@ __kernel void computeDonorForces(__global float4* restrict forceBuffers, __globa ...@@ -143,23 +143,23 @@ __kernel void computeDonorForces(__global float4* restrict forceBuffers, __globa
/** /**
* Compute forces on acceptors. * Compute forces on acceptors.
*/ */
__kernel void computeAcceptorForces(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const int4* restrict exclusions, __kernel void computeAcceptorForces(__global real4* restrict forceBuffers, __global real* restrict energyBuffer, __global const real4* restrict posq, __global const int4* restrict exclusions,
__global const int4* restrict donorAtoms, __global const int4* restrict acceptorAtoms, __global const int4* restrict acceptorBufferIndices, __local float4* restrict posBuffer, float4 periodicBoxSize, float4 invPeriodicBoxSize __global const int4* restrict donorAtoms, __global const int4* restrict acceptorAtoms, __global const int4* restrict acceptorBufferIndices, __local real4* restrict posBuffer, real4 periodicBoxSize, real4 invPeriodicBoxSize
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
float4 f1 = (float4) 0; real4 f1 = (real4) 0;
float4 f2 = (float4) 0; real4 f2 = (real4) 0;
float4 f3 = (float4) 0; real4 f3 = (real4) 0;
for (int acceptorStart = 0; acceptorStart < NUM_ACCEPTORS; acceptorStart += get_global_size(0)) { for (int acceptorStart = 0; acceptorStart < NUM_ACCEPTORS; acceptorStart += get_global_size(0)) {
// Load information about the acceptor this thread will compute forces on. // Load information about the acceptor this thread will compute forces on.
int acceptorIndex = acceptorStart+get_global_id(0); int acceptorIndex = acceptorStart+get_global_id(0);
int4 atoms, exclusionIndices; int4 atoms, exclusionIndices;
float4 a1, a2, a3; real4 a1, a2, a3;
if (acceptorIndex < NUM_ACCEPTORS) { if (acceptorIndex < NUM_ACCEPTORS) {
atoms = acceptorAtoms[acceptorIndex]; atoms = acceptorAtoms[acceptorIndex];
a1 = (atoms.x > -1 ? posq[atoms.x] : (float4) 0); a1 = (atoms.x > -1 ? posq[atoms.x] : (real4) 0);
a2 = (atoms.y > -1 ? posq[atoms.y] : (float4) 0); a2 = (atoms.y > -1 ? posq[atoms.y] : (real4) 0);
a3 = (atoms.z > -1 ? posq[atoms.z] : (float4) 0); a3 = (atoms.z > -1 ? posq[atoms.z] : (real4) 0);
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
exclusionIndices = exclusions[acceptorIndex]; exclusionIndices = exclusions[acceptorIndex];
#endif #endif
...@@ -172,9 +172,9 @@ __kernel void computeAcceptorForces(__global float4* restrict forceBuffers, __gl ...@@ -172,9 +172,9 @@ __kernel void computeAcceptorForces(__global float4* restrict forceBuffers, __gl
int blockSize = min((int) get_local_size(0), NUM_DONORS-donorStart); int blockSize = min((int) get_local_size(0), NUM_DONORS-donorStart);
if (get_local_id(0) < blockSize) { if (get_local_id(0) < blockSize) {
int4 atoms2 = donorAtoms[donorStart+get_local_id(0)]; int4 atoms2 = donorAtoms[donorStart+get_local_id(0)];
posBuffer[3*get_local_id(0)] = (atoms2.x > -1 ? posq[atoms2.x] : (float4) 0); posBuffer[3*get_local_id(0)] = (atoms2.x > -1 ? posq[atoms2.x] : (real4) 0);
posBuffer[3*get_local_id(0)+1] = (atoms2.y > -1 ? posq[atoms2.y] : (float4) 0); posBuffer[3*get_local_id(0)+1] = (atoms2.y > -1 ? posq[atoms2.y] : (real4) 0);
posBuffer[3*get_local_id(0)+2] = (atoms2.z > -1 ? posq[atoms2.z] : (float4) 0); posBuffer[3*get_local_id(0)+2] = (atoms2.z > -1 ? posq[atoms2.z] : (real4) 0);
} }
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
if (acceptorIndex < NUM_ACCEPTORS) { if (acceptorIndex < NUM_ACCEPTORS) {
...@@ -186,10 +186,10 @@ __kernel void computeAcceptorForces(__global float4* restrict forceBuffers, __gl ...@@ -186,10 +186,10 @@ __kernel void computeAcceptorForces(__global float4* restrict forceBuffers, __gl
#endif #endif
// Compute the interaction between a donor and an acceptor. // Compute the interaction between a donor and an acceptor.
float4 d1 = posBuffer[3*index]; real4 d1 = posBuffer[3*index];
float4 d2 = posBuffer[3*index+1]; real4 d2 = posBuffer[3*index+1];
float4 d3 = posBuffer[3*index+2]; real4 d3 = posBuffer[3*index+2];
float4 deltaD1A1 = deltaPeriodic(d1, a1, periodicBoxSize, invPeriodicBoxSize); real4 deltaD1A1 = deltaPeriodic(d1, a1, periodicBoxSize, invPeriodicBoxSize);
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (deltaD1A1.w < CUTOFF_SQUARED) { if (deltaD1A1.w < CUTOFF_SQUARED) {
#endif #endif
...@@ -207,19 +207,19 @@ __kernel void computeAcceptorForces(__global float4* restrict forceBuffers, __gl ...@@ -207,19 +207,19 @@ __kernel void computeAcceptorForces(__global float4* restrict forceBuffers, __gl
int4 bufferIndices = acceptorBufferIndices[acceptorIndex]; int4 bufferIndices = acceptorBufferIndices[acceptorIndex];
if (atoms.x > -1) { if (atoms.x > -1) {
unsigned int offset = atoms.x+bufferIndices.x*PADDED_NUM_ATOMS; unsigned int offset = atoms.x+bufferIndices.x*PADDED_NUM_ATOMS;
float4 force = forceBuffers[offset]; real4 force = forceBuffers[offset];
force.xyz += f1.xyz; force.xyz += f1.xyz;
forceBuffers[offset] = force; forceBuffers[offset] = force;
} }
if (atoms.y > -1) { if (atoms.y > -1) {
unsigned int offset = atoms.y+bufferIndices.y*PADDED_NUM_ATOMS; unsigned int offset = atoms.y+bufferIndices.y*PADDED_NUM_ATOMS;
float4 force = forceBuffers[offset]; real4 force = forceBuffers[offset];
force.xyz += f2.xyz; force.xyz += f2.xyz;
forceBuffers[offset] = force; forceBuffers[offset] = force;
} }
if (atoms.z > -1) { if (atoms.z > -1) {
unsigned int offset = atoms.z+bufferIndices.z*PADDED_NUM_ATOMS; unsigned int offset = atoms.z+bufferIndices.z*PADDED_NUM_ATOMS;
float4 force = forceBuffers[offset]; real4 force = forceBuffers[offset];
force.xyz += f3.xyz; force.xyz += f3.xyz;
forceBuffers[offset] = force; forceBuffers[offset] = force;
} }
......
float2 multofFloat2(float2 a, float2 b) { real2 multofReal2(real2 a, real2 b) {
return (float2) (a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x); return (real2) (a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x);
} }
/** /**
* Precompute the cosine and sine sums which appear in each force term. * Precompute the cosine and sine sums which appear in each force term.
*/ */
__kernel void calculateEwaldCosSinSums(__global float* restrict energyBuffer, __global const float4* restrict posq, __global float2* restrict cosSinSum, float4 reciprocalPeriodicBoxSize, float reciprocalCoefficient) { __kernel void calculateEwaldCosSinSums(__global real* restrict energyBuffer, __global const real4* restrict posq, __global real2* restrict cosSinSum, real4 reciprocalPeriodicBoxSize, real reciprocalCoefficient) {
const unsigned int ksizex = 2*KMAX_X-1; const unsigned int ksizex = 2*KMAX_X-1;
const unsigned int ksizey = 2*KMAX_Y-1; const unsigned int ksizey = 2*KMAX_Y-1;
const unsigned int ksizez = 2*KMAX_Z-1; const unsigned int ksizez = 2*KMAX_Z-1;
const unsigned int totalK = ksizex*ksizey*ksizez; const unsigned int totalK = ksizex*ksizey*ksizez;
unsigned int index = get_global_id(0); unsigned int index = get_global_id(0);
float energy = 0.0f; real energy = 0.0f;
while (index < (KMAX_Y-1)*ksizez+KMAX_Z) while (index < (KMAX_Y-1)*ksizez+KMAX_Z)
index += get_global_size(0); index += get_global_size(0);
while (index < totalK) { while (index < totalK) {
...@@ -23,29 +23,29 @@ __kernel void calculateEwaldCosSinSums(__global float* restrict energyBuffer, __ ...@@ -23,29 +23,29 @@ __kernel void calculateEwaldCosSinSums(__global float* restrict energyBuffer, __
int ry = remainder/ksizez; int ry = remainder/ksizez;
int rz = remainder - ry*ksizez - KMAX_Z + 1; int rz = remainder - ry*ksizez - KMAX_Z + 1;
ry += -KMAX_Y + 1; ry += -KMAX_Y + 1;
float kx = rx*reciprocalPeriodicBoxSize.x; real kx = rx*reciprocalPeriodicBoxSize.x;
float ky = ry*reciprocalPeriodicBoxSize.y; real ky = ry*reciprocalPeriodicBoxSize.y;
float kz = rz*reciprocalPeriodicBoxSize.z; real kz = rz*reciprocalPeriodicBoxSize.z;
// Compute the sum for this wave vector. // Compute the sum for this wave vector.
float2 sum = 0.0f; real2 sum = 0.0f;
for (int atom = 0; atom < NUM_ATOMS; atom++) { for (int atom = 0; atom < NUM_ATOMS; atom++) {
float4 apos = posq[atom]; real4 apos = posq[atom];
float phase = apos.x*kx; real phase = apos.x*kx;
float2 structureFactor = (float2) (cos(phase), sin(phase)); real2 structureFactor = (real2) (cos(phase), sin(phase));
phase = apos.y*ky; phase = apos.y*ky;
structureFactor = multofFloat2(structureFactor, (float2) (cos(phase), sin(phase))); structureFactor = multofReal2(structureFactor, (real2) (cos(phase), sin(phase)));
phase = apos.z*kz; phase = apos.z*kz;
structureFactor = multofFloat2(structureFactor, (float2) (cos(phase), sin(phase))); structureFactor = multofReal2(structureFactor, (real2) (cos(phase), sin(phase)));
sum += apos.w*structureFactor; sum += apos.w*structureFactor;
} }
cosSinSum[index] = sum; cosSinSum[index] = sum;
// Compute the contribution to the energy. // Compute the contribution to the energy.
float k2 = kx*kx + ky*ky + kz*kz; real k2 = kx*kx + ky*ky + kz*kz;
float ak = EXP(k2*EXP_COEFFICIENT) / k2; real ak = EXP(k2*EXP_COEFFICIENT) / k2;
energy += reciprocalCoefficient*ak*(sum.x*sum.x + sum.y*sum.y); energy += reciprocalCoefficient*ak*(sum.x*sum.x + sum.y*sum.y);
index += get_global_size(0); index += get_global_size(0);
} }
...@@ -57,36 +57,36 @@ __kernel void calculateEwaldCosSinSums(__global float* restrict energyBuffer, __ ...@@ -57,36 +57,36 @@ __kernel void calculateEwaldCosSinSums(__global float* restrict energyBuffer, __
* previous routine. * previous routine.
*/ */
__kernel void calculateEwaldForces(__global float4* restrict forceBuffers, __global const float4* restrict posq, __global const float2* restrict cosSinSum, float4 reciprocalPeriodicBoxSize, float reciprocalCoefficient) { __kernel void calculateEwaldForces(__global real4* restrict forceBuffers, __global const real4* restrict posq, __global const real2* restrict cosSinSum, real4 reciprocalPeriodicBoxSize, real reciprocalCoefficient) {
unsigned int atom = get_global_id(0); unsigned int atom = get_global_id(0);
while (atom < NUM_ATOMS) { while (atom < NUM_ATOMS) {
float4 force = forceBuffers[atom]; real4 force = forceBuffers[atom];
float4 apos = posq[atom]; real4 apos = posq[atom];
// Loop over all wave vectors. // Loop over all wave vectors.
int lowry = 0; int lowry = 0;
int lowrz = 1; int lowrz = 1;
for (int rx = 0; rx < KMAX_X; rx++) { for (int rx = 0; rx < KMAX_X; rx++) {
float kx = rx*reciprocalPeriodicBoxSize.x; real kx = rx*reciprocalPeriodicBoxSize.x;
for (int ry = lowry; ry < KMAX_Y; ry++) { for (int ry = lowry; ry < KMAX_Y; ry++) {
float ky = ry*reciprocalPeriodicBoxSize.y; real ky = ry*reciprocalPeriodicBoxSize.y;
float phase = apos.x*kx; real phase = apos.x*kx;
float2 tab_xy = (float2) (cos(phase), sin(phase)); real2 tab_xy = (real2) (cos(phase), sin(phase));
phase = apos.y*ky; phase = apos.y*ky;
tab_xy = multofFloat2(tab_xy, (float2) (cos(phase), sin(phase))); tab_xy = multofReal2(tab_xy, (real2) (cos(phase), sin(phase)));
for (int rz = lowrz; rz < KMAX_Z; rz++) { for (int rz = lowrz; rz < KMAX_Z; rz++) {
float kz = rz*reciprocalPeriodicBoxSize.z; real kz = rz*reciprocalPeriodicBoxSize.z;
// Compute the force contribution of this wave vector. // Compute the force contribution of this wave vector.
int index = rx*(KMAX_Y*2-1)*(KMAX_Z*2-1) + (ry+KMAX_Y-1)*(KMAX_Z*2-1) + (rz+KMAX_Z-1); int index = rx*(KMAX_Y*2-1)*(KMAX_Z*2-1) + (ry+KMAX_Y-1)*(KMAX_Z*2-1) + (rz+KMAX_Z-1);
float k2 = kx*kx + ky*ky + kz*kz; real k2 = kx*kx + ky*ky + kz*kz;
float ak = EXP(k2*EXP_COEFFICIENT)/k2; real ak = EXP(k2*EXP_COEFFICIENT)/k2;
phase = apos.z*kz; phase = apos.z*kz;
float2 structureFactor = multofFloat2(tab_xy, (float2) (cos(phase), sin(phase))); real2 structureFactor = multofReal2(tab_xy, (real2) (cos(phase), sin(phase)));
float2 sum = cosSinSum[index]; real2 sum = cosSinSum[index];
float dEdR = 2*reciprocalCoefficient*ak*apos.w*(sum.x*structureFactor.y - sum.y*structureFactor.x); real dEdR = 2*reciprocalCoefficient*ak*apos.w*(sum.x*structureFactor.y - sum.y*structureFactor.x);
force.x += dEdR*kx; force.x += dEdR*kx;
force.y += dEdR*ky; force.y += dEdR*ky;
force.z += dEdR*kz; force.z += dEdR*kz;
......
float2 multiplyComplex(float2 c1, float2 c2) { real2 multiplyComplex(real2 c1, real2 c2) {
return (float2) (c1.x*c2.x-c1.y*c2.y, c1.x*c2.y+c1.y*c2.x); return (real2) (c1.x*c2.x-c1.y*c2.y, c1.x*c2.y+c1.y*c2.x);
} }
/** /**
* Perform a 1D FFT on each row along one axis. * Perform a 1D FFT on each row along one axis.
*/ */
__kernel void execFFT(__global const float2* restrict in, __global float2* restrict out, float sign, __local float2* restrict w, __kernel void execFFT(__global const real2* restrict in, __global real2* restrict out, int sign, __local real2* restrict w,
__local float2* restrict data0, __local float2* restrict data1) { __local real2* restrict data0, __local real2* restrict data1) {
for (int i = get_local_id(0); i < ZSIZE; i += get_local_size(0)) for (int i = get_local_id(0); i < ZSIZE; i += get_local_size(0))
w[i] = (float2) (cos(-sign*i*2*M_PI/ZSIZE), sin(-sign*i*2*M_PI/ZSIZE)); w[i] = (real2) (cos(-sign*i*2*M_PI/ZSIZE), sin(-sign*i*2*M_PI/ZSIZE));
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
for (int index = get_group_id(0); index < XSIZE*YSIZE; index += get_num_groups(0)) { for (int index = get_group_id(0); index < XSIZE*YSIZE; index += get_num_groups(0)) {
int x = index/YSIZE; int x = index/YSIZE;
......
...@@ -8,19 +8,19 @@ ...@@ -8,19 +8,19 @@
/** /**
* Find a bounding box for the atoms in each block. * Find a bounding box for the atoms in each block.
*/ */
__kernel void findBlockBounds(int numAtoms, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global const float4* restrict posq, __global float4* restrict blockCenter, __global float4* restrict blockBoundingBox, __global unsigned int* restrict interactionCount) { __kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, __global const real4* restrict posq, __global real4* restrict blockCenter, __global real4* restrict blockBoundingBox, __global unsigned int* restrict interactionCount) {
int index = get_global_id(0); int index = get_global_id(0);
int base = index*TILE_SIZE; int base = index*TILE_SIZE;
while (base < numAtoms) { while (base < numAtoms) {
float4 pos = posq[base]; real4 pos = posq[base];
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x; pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y; pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z; pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
float4 firstPoint = pos; real4 firstPoint = pos;
#endif #endif
float4 minPos = pos; real4 minPos = pos;
float4 maxPos = pos; real4 maxPos = pos;
int last = min(base+TILE_SIZE, numAtoms); int last = min(base+TILE_SIZE, numAtoms);
for (int i = base+1; i < last; i++) { for (int i = base+1; i < last; i++) {
pos = posq[i]; pos = posq[i];
...@@ -46,8 +46,8 @@ __kernel void findBlockBounds(int numAtoms, float4 periodicBoxSize, float4 invPe ...@@ -46,8 +46,8 @@ __kernel void findBlockBounds(int numAtoms, float4 periodicBoxSize, float4 invPe
* to global memory. * to global memory.
*/ */
void storeInteractionData(__local ushort2* buffer, __local int* valid, __local short* sum, __local ushort2* temp, __local int* baseIndex, void storeInteractionData(__local ushort2* buffer, __local int* valid, __local short* sum, __local ushort2* temp, __local int* baseIndex,
__global unsigned int* interactionCount, __global ushort2* interactingTiles, float cutoffSquared, float4 periodicBoxSize, __global unsigned int* interactionCount, __global ushort2* interactingTiles, real cutoffSquared, real4 periodicBoxSize,
float4 invPeriodicBoxSize, __global const float4* posq, __global const float4* blockCenter, __global const float4* blockBoundingBox, unsigned int maxTiles) { real4 invPeriodicBoxSize, __global const real4* posq, __global const real4* blockCenter, __global const real4* blockBoundingBox, unsigned int maxTiles) {
// The buffer is full, so we need to compact it and write out results. Start by doing a parallel prefix sum. // The buffer is full, so we need to compact it and write out results. Start by doing a parallel prefix sum.
for (int i = get_local_id(0); i < BUFFER_SIZE; i += GROUP_SIZE) for (int i = get_local_id(0); i < BUFFER_SIZE; i += GROUP_SIZE)
...@@ -91,7 +91,7 @@ void storeInteractionData(__local ushort2* buffer, __local int* valid, __local s ...@@ -91,7 +91,7 @@ void storeInteractionData(__local ushort2* buffer, __local int* valid, __local s
int index = get_local_id(0)&(TILE_SIZE-1); int index = get_local_id(0)&(TILE_SIZE-1);
int group = get_local_id(0)/TILE_SIZE; int group = get_local_id(0)/TILE_SIZE;
float4 center, boxSize, pos; real4 center, boxSize, pos;
for (int tile = 0; tile < numValid; tile++) { for (int tile = 0; tile < numValid; tile++) {
int x = temp[tile].x; int x = temp[tile].x;
int y = temp[tile].y; int y = temp[tile].y;
...@@ -103,12 +103,12 @@ void storeInteractionData(__local ushort2* buffer, __local int* valid, __local s ...@@ -103,12 +103,12 @@ void storeInteractionData(__local ushort2* buffer, __local int* valid, __local s
#ifdef MAC_AMD_WORKAROUND #ifdef MAC_AMD_WORKAROUND
int box = (group == 0 ? x : y); int box = (group == 0 ? x : y);
int atom = (group == 0 ? y : x)*TILE_SIZE+index; int atom = (group == 0 ? y : x)*TILE_SIZE+index;
__global float* bc = (__global float*) blockCenter; __global real* bc = (__global real*) blockCenter;
__global float* bb = (__global float*) blockBoundingBox; __global real* bb = (__global real*) blockBoundingBox;
__global float* ps = (__global float*) posq; __global real* ps = (__global real*) posq;
center = (float4) (bc[4*box], bc[4*box+1], bc[4*box+2], 0.0f); center = (real4) (bc[4*box], bc[4*box+1], bc[4*box+2], 0);
boxSize = (float4) (bb[4*box], bb[4*box+1], bb[4*box+2], 0.0f); boxSize = (real4) (bb[4*box], bb[4*box+1], bb[4*box+2], 0);
pos = (float4) (ps[4*atom], ps[4*atom+1], ps[4*atom+2], 0.0f); pos = (real4) (ps[4*atom], ps[4*atom+1], ps[4*atom+2], 0);
#else #else
center = blockCenter[(group == 0 ? x : y)]; center = blockCenter[(group == 0 ? x : y)];
boxSize = blockBoundingBox[(group == 0 ? x : y)]; boxSize = blockBoundingBox[(group == 0 ? x : y)];
...@@ -117,13 +117,13 @@ void storeInteractionData(__local ushort2* buffer, __local int* valid, __local s ...@@ -117,13 +117,13 @@ void storeInteractionData(__local ushort2* buffer, __local int* valid, __local s
// Find the distance of the atom from the bounding box. // Find the distance of the atom from the bounding box.
float4 delta = pos-center; real4 delta = pos-center;
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
delta = max((float4) 0.0f, fabs(delta)-boxSize); delta = max((real4) 0, fabs(delta)-boxSize);
__local ushort* flag = (__local ushort*) &buffer[tile]; __local ushort* flag = (__local ushort*) &buffer[tile];
if (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < cutoffSquared) if (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < cutoffSquared)
flag[group] = false; flag[group] = false;
...@@ -155,9 +155,9 @@ void storeInteractionData(__local ushort2* buffer, __local int* valid, __local s ...@@ -155,9 +155,9 @@ void storeInteractionData(__local ushort2* buffer, __local int* valid, __local s
* Compare the bounding boxes for each pair of blocks. If they are sufficiently far apart, * Compare the bounding boxes for each pair of blocks. If they are sufficiently far apart,
* mark them as non-interacting. * mark them as non-interacting.
*/ */
__kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global const float4* restrict blockCenter, __kernel void findBlocksWithInteractions(real cutoffSquared, real4 periodicBoxSize, real4 invPeriodicBoxSize, __global const real4* restrict blockCenter,
__global const float4* restrict blockBoundingBox, __global unsigned int* restrict interactionCount, __global ushort2* restrict interactingTiles, __global const real4* restrict blockBoundingBox, __global unsigned int* restrict interactionCount, __global ushort2* restrict interactingTiles,
__global unsigned int* restrict interactionFlags, __global const float4* restrict posq, unsigned int maxTiles, unsigned int startTileIndex, __global unsigned int* restrict interactionFlags, __global const real4* restrict posq, unsigned int maxTiles, unsigned int startTileIndex,
unsigned int endTileIndex) { unsigned int endTileIndex) {
__local ushort2 buffer[BUFFER_SIZE]; __local ushort2 buffer[BUFFER_SIZE];
__local int valid[BUFFER_SIZE]; __local int valid[BUFFER_SIZE];
...@@ -194,26 +194,26 @@ __kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBox ...@@ -194,26 +194,26 @@ __kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBox
// Find the distance between the bounding boxes of the two cells. // Find the distance between the bounding boxes of the two cells.
#ifdef MAC_AMD_WORKAROUND #ifdef MAC_AMD_WORKAROUND
__global float* bc = (__global float*) blockCenter; __global real* bc = (__global real*) blockCenter;
__global float* bb = (__global float*) blockBoundingBox; __global real* bb = (__global real*) blockBoundingBox;
float4 bcx = (float4) (bc[4*x], bc[4*x+1], bc[4*x+2], 0.0f); real4 bcx = (real4) (bc[4*x], bc[4*x+1], bc[4*x+2], 0);
float4 bcy = (float4) (bc[4*y], bc[4*y+1], bc[4*y+2], 0.0f); real4 bcy = (real4) (bc[4*y], bc[4*y+1], bc[4*y+2], 0);
float4 delta = bcx-bcy; real4 delta = bcx-bcy;
float4 boxSizea = (float4) (bb[4*x], bb[4*x+1], bb[4*x+2], 0.0f); real4 boxSizea = (real4) (bb[4*x], bb[4*x+1], bb[4*x+2], 0);
float4 boxSizeb = (float4) (bb[4*y], bb[4*y+1], bb[4*y+2], 0.0f); real4 boxSizeb = (real4) (bb[4*y], bb[4*y+1], bb[4*y+2], 0);
#else #else
float4 delta = blockCenter[x]-blockCenter[y]; real4 delta = blockCenter[x]-blockCenter[y];
float4 boxSizea = blockBoundingBox[x]; real4 boxSizea = blockBoundingBox[x];
float4 boxSizeb = blockBoundingBox[y]; real4 boxSizeb = blockBoundingBox[y];
#endif #endif
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
delta.x = max(0.0f, fabs(delta.x)-boxSizea.x-boxSizeb.x); delta.x = max((real) 0, fabs(delta.x)-boxSizea.x-boxSizeb.x);
delta.y = max(0.0f, fabs(delta.y)-boxSizea.y-boxSizeb.y); delta.y = max((real) 0, fabs(delta.y)-boxSizea.y-boxSizeb.y);
delta.z = max(0.0f, fabs(delta.z)-boxSizea.z-boxSizeb.z); delta.z = max((real) 0, fabs(delta.z)-boxSizea.z-boxSizeb.z);
if (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < cutoffSquared) { if (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < cutoffSquared) {
// Add this tile to the buffer. // Add this tile to the buffer.
...@@ -241,8 +241,8 @@ __kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBox ...@@ -241,8 +241,8 @@ __kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBox
* Compare each atom in one block to the bounding box of another block, and set * Compare each atom in one block to the bounding box of another block, and set
* flags for which ones are interacting. * flags for which ones are interacting.
*/ */
__kernel void findInteractionsWithinBlocks(float cutoffSquared, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global const float4* restrict posq, __global const ushort2* restrict tiles, __global const float4* restrict blockCenter, __kernel void findInteractionsWithinBlocks(real cutoffSquared, real4 periodicBoxSize, real4 invPeriodicBoxSize, __global const real4* restrict posq, __global const ushort2* restrict tiles, __global const real4* restrict blockCenter,
__global const float4* restrict blockBoundingBox, __global unsigned int* restrict interactionFlags, __global const unsigned int* restrict interactionCount, __local volatile unsigned int* restrict flags, unsigned int maxTiles) { __global const real4* restrict blockBoundingBox, __global unsigned int* restrict interactionFlags, __global const unsigned int* restrict interactionCount, __local volatile unsigned int* restrict flags, unsigned int maxTiles) {
unsigned int totalWarps = get_global_size(0)/TILE_SIZE; unsigned int totalWarps = get_global_size(0)/TILE_SIZE;
unsigned int warp = get_global_id(0)/TILE_SIZE; unsigned int warp = get_global_id(0)/TILE_SIZE;
unsigned int numTiles = interactionCount[0]; unsigned int numTiles = interactionCount[0];
...@@ -253,7 +253,7 @@ __kernel void findInteractionsWithinBlocks(float cutoffSquared, float4 periodicB ...@@ -253,7 +253,7 @@ __kernel void findInteractionsWithinBlocks(float cutoffSquared, float4 periodicB
if (numTiles > maxTiles) if (numTiles > maxTiles)
return; return;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
float4 apos; real4 apos;
while (pos < end) { while (pos < end) {
// Extract the coordinates of this tile // Extract the coordinates of this tile
ushort2 tileIndices = tiles[pos]; ushort2 tileIndices = tiles[pos];
...@@ -266,20 +266,20 @@ __kernel void findInteractionsWithinBlocks(float cutoffSquared, float4 periodicB ...@@ -266,20 +266,20 @@ __kernel void findInteractionsWithinBlocks(float cutoffSquared, float4 periodicB
else { else {
// Load the bounding box for x and the atom positions for y. // Load the bounding box for x and the atom positions for y.
float4 center = blockCenter[x]; real4 center = blockCenter[x];
float4 boxSize = blockBoundingBox[x]; real4 boxSize = blockBoundingBox[x];
if (y != lasty) if (y != lasty)
apos = posq[y*TILE_SIZE+index]; apos = posq[y*TILE_SIZE+index];
// Find the distance of the atom from the bounding box. // Find the distance of the atom from the bounding box.
float4 delta = apos-center; real4 delta = apos-center;
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
delta = max((float4) 0.0f, fabs(delta)-boxSize); delta = max((real4) 0, fabs(delta)-boxSize);
int thread = get_local_id(0); int thread = get_local_id(0);
flags[thread] = (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z > cutoffSquared ? 0 : 1 << index); flags[thread] = (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z > cutoffSquared ? 0 : 1 << index);
......
...@@ -8,19 +8,19 @@ ...@@ -8,19 +8,19 @@
/** /**
* Find a bounding box for the atoms in each block. * Find a bounding box for the atoms in each block.
*/ */
__kernel void findBlockBounds(int numAtoms, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global const float4* restrict posq, __global float4* restrict blockCenter, __global float4* restrict blockBoundingBox, __global unsigned int* restrict interactionCount) { __kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, __global const real4* restrict posq, __global real4* restrict blockCenter, __global real4* restrict blockBoundingBox, __global unsigned int* restrict interactionCount) {
int index = get_global_id(0); int index = get_global_id(0);
int base = index*TILE_SIZE; int base = index*TILE_SIZE;
while (base < numAtoms) { while (base < numAtoms) {
float4 pos = posq[base]; real4 pos = posq[base];
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x; pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y; pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z; pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
float4 firstPoint = pos; real4 firstPoint = pos;
#endif #endif
float4 minPos = pos; real4 minPos = pos;
float4 maxPos = pos; real4 maxPos = pos;
int last = min(base+TILE_SIZE, numAtoms); int last = min(base+TILE_SIZE, numAtoms);
for (int i = base+1; i < last; i++) { for (int i = base+1; i < last; i++) {
pos = posq[i]; pos = posq[i];
...@@ -46,14 +46,14 @@ __kernel void findBlockBounds(int numAtoms, float4 periodicBoxSize, float4 invPe ...@@ -46,14 +46,14 @@ __kernel void findBlockBounds(int numAtoms, float4 periodicBoxSize, float4 invPe
* to global memory. * to global memory.
*/ */
void storeInteractionData(ushort2* buffer, int numValid, __global unsigned int* interactionCount, __global ushort2* interactingTiles, void storeInteractionData(ushort2* buffer, int numValid, __global unsigned int* interactionCount, __global ushort2* interactingTiles,
__global unsigned int* interactionFlags, float cutoffSquared, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global unsigned int* interactionFlags, real cutoffSquared, real4 periodicBoxSize, real4 invPeriodicBoxSize,
__global float4* posq, __global float4* blockCenter, __global float4* blockBoundingBox, unsigned int maxTiles) { __global real4* posq, __global real4* blockCenter, __global real4* blockBoundingBox, unsigned int maxTiles) {
// Filter the list of tiles by comparing the distance from each atom to the other bounding box. // Filter the list of tiles by comparing the distance from each atom to the other bounding box.
unsigned int flagsBuffer[2*BUFFER_SIZE]; unsigned int flagsBuffer[2*BUFFER_SIZE];
float4 atomPositions[TILE_SIZE]; real4 atomPositions[TILE_SIZE];
int lasty = -1; int lasty = -1;
float4 centery, boxSizey; real4 centery, boxSizey;
for (int tile = 0; tile < numValid; ) { for (int tile = 0; tile < numValid; ) {
int x = buffer[tile].x; int x = buffer[tile].x;
int y = buffer[tile].y; int y = buffer[tile].y;
...@@ -64,8 +64,8 @@ void storeInteractionData(ushort2* buffer, int numValid, __global unsigned int* ...@@ -64,8 +64,8 @@ void storeInteractionData(ushort2* buffer, int numValid, __global unsigned int*
// Load the atom positions and bounding boxes. // Load the atom positions and bounding boxes.
float4 centerx = blockCenter[x]; real4 centerx = blockCenter[x];
float4 boxSizex = blockBoundingBox[x]; real4 boxSizex = blockBoundingBox[x];
if (y != lasty) { if (y != lasty) {
for (int atom = 0; atom < TILE_SIZE; atom++) for (int atom = 0; atom < TILE_SIZE; atom++)
atomPositions[atom] = posq[y*TILE_SIZE+atom]; atomPositions[atom] = posq[y*TILE_SIZE+atom];
...@@ -78,18 +78,18 @@ void storeInteractionData(ushort2* buffer, int numValid, __global unsigned int* ...@@ -78,18 +78,18 @@ void storeInteractionData(ushort2* buffer, int numValid, __global unsigned int*
unsigned int flags1 = 0, flags2 = 0; unsigned int flags1 = 0, flags2 = 0;
for (int atom = 0; atom < TILE_SIZE; atom++) { for (int atom = 0; atom < TILE_SIZE; atom++) {
float4 delta = atomPositions[atom]-centerx; real4 delta = atomPositions[atom]-centerx;
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz; delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif #endif
delta = max((float4) 0.0f, fabs(delta)-boxSizex); delta = max((real4) 0, fabs(delta)-boxSizex);
if (dot(delta.xyz, delta.xyz) < cutoffSquared) if (dot(delta.xyz, delta.xyz) < cutoffSquared)
flags1 += 1 << atom; flags1 += 1 << atom;
delta = posq[x*TILE_SIZE+atom]-centery; delta = posq[x*TILE_SIZE+atom]-centery;
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz; delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif #endif
delta = max((float4) 0.0f, fabs(delta)-boxSizey); delta = max((real4) 0, fabs(delta)-boxSizey);
if (dot(delta.xyz, delta.xyz) < cutoffSquared) if (dot(delta.xyz, delta.xyz) < cutoffSquared)
flags2 += 1 << atom; flags2 += 1 << atom;
} }
...@@ -121,9 +121,9 @@ void storeInteractionData(ushort2* buffer, int numValid, __global unsigned int* ...@@ -121,9 +121,9 @@ void storeInteractionData(ushort2* buffer, int numValid, __global unsigned int*
* Compare the bounding boxes for each pair of blocks. If they are sufficiently far apart, * Compare the bounding boxes for each pair of blocks. If they are sufficiently far apart,
* mark them as non-interacting. * mark them as non-interacting.
*/ */
__kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBoxSize, float4 invPeriodicBoxSize, __global const float4* restrict blockCenter, __kernel void findBlocksWithInteractions(real cutoffSquared, real4 periodicBoxSize, real4 invPeriodicBoxSize, __global const real4* restrict blockCenter,
__global const float4* restrict blockBoundingBox, __global unsigned int* restrict interactionCount, __global ushort2* restrict interactingTiles, __global const real4* restrict blockBoundingBox, __global unsigned int* restrict interactionCount, __global ushort2* restrict interactingTiles,
__global unsigned int* restrict interactionFlags, __global const float4* restrict posq, unsigned int maxTiles, unsigned int startTileIndex, __global unsigned int* restrict interactionFlags, __global const real4* restrict posq, unsigned int maxTiles, unsigned int startTileIndex,
unsigned int endTileIndex) { unsigned int endTileIndex) {
ushort2 buffer[BUFFER_SIZE]; ushort2 buffer[BUFFER_SIZE];
int valuesInBuffer = 0; int valuesInBuffer = 0;
...@@ -142,17 +142,17 @@ __kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBox ...@@ -142,17 +142,17 @@ __kernel void findBlocksWithInteractions(float cutoffSquared, float4 periodicBox
// Find the distance between the bounding boxes of the two cells. // Find the distance between the bounding boxes of the two cells.
float4 delta = blockCenter[x]-blockCenter[y]; real4 delta = blockCenter[x]-blockCenter[y];
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
float4 boxSizea = blockBoundingBox[x]; real4 boxSizea = blockBoundingBox[x];
float4 boxSizeb = blockBoundingBox[y]; real4 boxSizeb = blockBoundingBox[y];
delta.x = max(0.0f, fabs(delta.x)-boxSizea.x-boxSizeb.x); delta.x = max((real) 0, fabs(delta.x)-boxSizea.x-boxSizeb.x);
delta.y = max(0.0f, fabs(delta.y)-boxSizea.y-boxSizeb.y); delta.y = max((real) 0, fabs(delta.y)-boxSizea.y-boxSizeb.y);
delta.z = max(0.0f, fabs(delta.z)-boxSizea.z-boxSizeb.z); delta.z = max((real) 0, fabs(delta.z)-boxSizea.z-boxSizeb.z);
if (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < cutoffSquared) { if (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < cutoffSquared) {
// Add this tile to the buffer. // Add this tile to the buffer.
......
float2 angleParams = PARAMS[index]; float2 angleParams = PARAMS[index];
float deltaIdeal = theta-angleParams.x; real deltaIdeal = theta-angleParams.x;
energy += 0.5f*angleParams.y*deltaIdeal*deltaIdeal; energy += 0.5f*angleParams.y*deltaIdeal*deltaIdeal;
float dEdAngle = angleParams.y*deltaIdeal; real dEdAngle = angleParams.y*deltaIdeal;
float2 bondParams = PARAMS[index]; float2 bondParams = PARAMS[index];
float deltaIdeal = r-bondParams.x; real deltaIdeal = r-bondParams.x;
energy += 0.5f * bondParams.y*deltaIdeal*deltaIdeal; energy += 0.5f * bondParams.y*deltaIdeal*deltaIdeal;
float dEdR = bondParams.y * deltaIdeal; real dEdR = bondParams.y * deltaIdeal;
float4 exceptionParams = PARAMS[index]; float4 exceptionParams = PARAMS[index];
float4 delta = pos2-pos1; real4 delta = pos2-pos1;
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = RSQRT(r2); real invR = RSQRT(r2);
float sig2 = invR*exceptionParams.y; real sig2 = invR*exceptionParams.y;
sig2 *= sig2; sig2 *= sig2;
float sig6 = sig2*sig2*sig2; real sig6 = sig2*sig2*sig2;
float dEdR = exceptionParams.z*(12.0f*sig6-6.0f)*sig6; real dEdR = exceptionParams.z*(12.0f*sig6-6.0f)*sig6;
float tempEnergy = exceptionParams.z*(sig6-1.0f)*sig6; real tempEnergy = exceptionParams.z*(sig6-1.0f)*sig6;
dEdR += exceptionParams.x*invR; dEdR += exceptionParams.x*invR;
dEdR *= invR*invR; dEdR *= invR*invR;
tempEnergy += exceptionParams.x*invR; tempEnergy += exceptionParams.x*invR;
energy += tempEnergy; energy += tempEnergy;
delta.xyz *= dEdR; delta.xyz *= dEdR;
float4 force1 = -delta; real4 force1 = -delta;
float4 force2 = delta; real4 force2 = delta;
#define TILE_SIZE 32 #define TILE_SIZE 32
typedef struct { typedef struct {
float x, y, z; real x, y, z;
float q; real q;
float fx, fy, fz; real fx, fy, fz;
ATOM_PARAMETER_DATA ATOM_PARAMETER_DATA
} AtomData; } AtomData;
...@@ -11,11 +11,11 @@ typedef struct { ...@@ -11,11 +11,11 @@ typedef struct {
* Compute nonbonded interactions. * Compute nonbonded interactions.
*/ */
__kernel void computeNonbonded(__global float4* restrict forceBuffers, __global float* restrict energyBuffer, __global const float4* restrict posq, __global const unsigned int* restrict exclusions, __kernel void computeNonbonded(__global real4* restrict forceBuffers, __global real* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices,
unsigned int startTileIndex, unsigned int endTileIndex, unsigned int startTileIndex, unsigned int endTileIndex,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
#else #else
unsigned int numTiles unsigned int numTiles
#endif #endif
...@@ -28,7 +28,7 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global ...@@ -28,7 +28,7 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global
unsigned int pos = startTileIndex+get_group_id(0)*numTiles/get_num_groups(0); unsigned int pos = startTileIndex+get_group_id(0)*numTiles/get_num_groups(0);
unsigned int end = startTileIndex+(get_group_id(0)+1)*numTiles/get_num_groups(0); unsigned int end = startTileIndex+(get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif #endif
float energy = 0.0f; real energy = 0;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
__local AtomData localData[TILE_SIZE]; __local AtomData localData[TILE_SIZE];
...@@ -71,7 +71,7 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global ...@@ -71,7 +71,7 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global
if (lasty != y) { if (lasty != y) {
for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) { for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
unsigned int j = y*TILE_SIZE + localAtomIndex; unsigned int j = y*TILE_SIZE + localAtomIndex;
float4 tempPosq = posq[j]; real4 tempPosq = posq[j];
localData[localAtomIndex].x = tempPosq.x; localData[localAtomIndex].x = tempPosq.x;
localData[localAtomIndex].y = tempPosq.y; localData[localAtomIndex].y = tempPosq.y;
localData[localAtomIndex].z = tempPosq.z; localData[localAtomIndex].z = tempPosq.z;
...@@ -87,34 +87,34 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global ...@@ -87,34 +87,34 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global
unsigned int excl = exclusions[exclusionIndex+tgx]; unsigned int excl = exclusions[exclusionIndex+tgx];
#endif #endif
unsigned int atom1 = x*TILE_SIZE+tgx; unsigned int atom1 = x*TILE_SIZE+tgx;
float4 force = 0.0f; real4 force = 0;
float4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1); bool isExcluded = !(excl & 0x1);
#endif #endif
float4 posq2 = (float4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q); real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz; delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif #endif
float r2 = dot(delta.xyz, delta.xyz); real r2 = dot(delta.xyz, delta.xyz);
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) { if (r2 < CUTOFF_SQUARED) {
#endif #endif
float invR = RSQRT(r2); real invR = RSQRT(r2);
float r = RECIP(invR); real r = RECIP(invR);
unsigned int atom2 = j; unsigned int atom2 = j;
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j; atom2 = y*TILE_SIZE+j;
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
float dEdR = 0.0f; real dEdR = 0;
#else #else
float4 dEdR1 = (float4) 0.0f; real4 dEdR1 = (real4) 0;
float4 dEdR2 = (float4) 0.0f; real4 dEdR2 = (real4) 0;
#endif #endif
float tempEnergy = 0.0f; real tempEnergy = 0;
COMPUTE_INTERACTION COMPUTE_INTERACTION
energy += 0.5f*tempEnergy; energy += 0.5f*tempEnergy;
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
...@@ -138,9 +138,9 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global ...@@ -138,9 +138,9 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global
// This is an off-diagonal tile. // This is an off-diagonal tile.
for (int tgx = 0; tgx < TILE_SIZE; tgx++) { for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
localData[tgx].fx = 0.0f; localData[tgx].fx = 0;
localData[tgx].fy = 0.0f; localData[tgx].fy = 0;
localData[tgx].fz = 0.0f; localData[tgx].fz = 0;
} }
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int flags1 = (numTiles <= maxTiles ? interactionFlags[2*pos] : 0xFFFFFFFF); unsigned int flags1 = (numTiles <= maxTiles ? interactionFlags[2*pos] : 0xFFFFFFFF);
...@@ -151,31 +151,31 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global ...@@ -151,31 +151,31 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) { for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
if ((flags2&(1<<tgx)) != 0) { if ((flags2&(1<<tgx)) != 0) {
unsigned int atom1 = x*TILE_SIZE+tgx; unsigned int atom1 = x*TILE_SIZE+tgx;
float4 force = 0.0f; real4 force = 0;
float4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags1&(1<<j)) != 0) { if ((flags1&(1<<j)) != 0) {
bool isExcluded = false; bool isExcluded = false;
float4 posq2 = (float4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q); real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz; delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif #endif
float r2 = dot(delta.xyz, delta.xyz); real r2 = dot(delta.xyz, delta.xyz);
if (r2 < CUTOFF_SQUARED) { if (r2 < CUTOFF_SQUARED) {
float invR = RSQRT(r2); real invR = RSQRT(r2);
float r = RECIP(invR); real r = RECIP(invR);
unsigned int atom2 = j; unsigned int atom2 = j;
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j; atom2 = y*TILE_SIZE+j;
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
float dEdR = 0.0f; real dEdR = 0;
#else #else
float4 dEdR1 = (float4) 0.0f; real4 dEdR1 = (real4) 0;
float4 dEdR2 = (float4) 0.0f; real4 dEdR2 = (real4) 0;
#endif #endif
float tempEnergy = 0.0f; real tempEnergy = 0;
COMPUTE_INTERACTION COMPUTE_INTERACTION
energy += tempEnergy; energy += tempEnergy;
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
...@@ -208,8 +208,8 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global ...@@ -208,8 +208,8 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) { for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx; unsigned int atom1 = x*TILE_SIZE+tgx;
float4 force = 0.0f; real4 force = 0;
float4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex+tgx] : 0xFFFFFFFF); unsigned int excl = (hasExclusions ? exclusions[exclusionIndex+tgx] : 0xFFFFFFFF);
...@@ -218,27 +218,27 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global ...@@ -218,27 +218,27 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1); bool isExcluded = !(excl & 0x1);
#endif #endif
float4 posq2 = (float4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q); real4 posq2 = (real4) (localData[j].x, localData[j].y, localData[j].z, localData[j].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz; delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif #endif
float r2 = dot(delta.xyz, delta.xyz); real r2 = dot(delta.xyz, delta.xyz);
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) { if (r2 < CUTOFF_SQUARED) {
#endif #endif
float invR = RSQRT(r2); real invR = RSQRT(r2);
float r = RECIP(invR); real r = RECIP(invR);
unsigned int atom2 = j; unsigned int atom2 = j;
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j; atom2 = y*TILE_SIZE+j;
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
float dEdR = 0.0f; real dEdR = 0;
#else #else
float4 dEdR1 = (float4) 0.0f; real4 dEdR1 = (real4) 0;
float4 dEdR2 = (float4) 0.0f; real4 dEdR2 = (real4) 0;
#endif #endif
float tempEnergy = 0.0f; real tempEnergy = 0;
COMPUTE_INTERACTION COMPUTE_INTERACTION
energy += tempEnergy; energy += tempEnergy;
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
...@@ -272,7 +272,7 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global ...@@ -272,7 +272,7 @@ __kernel void computeNonbonded(__global float4* restrict forceBuffers, __global
for (int tgx = 0; tgx < TILE_SIZE; tgx++) { for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
float4 f = forceBuffers[offset]; real4 f = forceBuffers[offset];
f.x += localData[tgx].fx; f.x += localData[tgx].fx;
f.y += localData[tgx].fy; f.y += localData[tgx].fy;
f.z += localData[tgx].fz; f.z += localData[tgx].fz;
......
...@@ -10,16 +10,16 @@ ...@@ -10,16 +10,16 @@
// aligned which wastes space and causes LDS bank conflicts as stride is no // aligned which wastes space and causes LDS bank conflicts as stride is no
// longer odd DWORDS. // longer odd DWORDS.
typedef struct { typedef struct {
float x, y, z; real x, y, z;
} UnalignedFloat3; } UnalignedReal3;
typedef struct { typedef struct {
float x, y, z; real x, y, z;
float q; real q;
float fx, fy, fz; real fx, fy, fz;
ATOM_PARAMETER_DATA ATOM_PARAMETER_DATA
#ifndef PARAMETER_SIZE_IS_EVEN #ifndef PARAMETER_SIZE_IS_EVEN
float padding; real padding;
#endif #endif
} AtomData; } AtomData;
...@@ -32,13 +32,13 @@ void computeNonbonded( ...@@ -32,13 +32,13 @@ void computeNonbonded(
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers, __global long* restrict forceBuffers,
#else #else
__global float4* restrict forceBuffers, __global real4* restrict forceBuffers,
#endif #endif
__global float* restrict energyBuffer, __global const float4* restrict posq, __global const unsigned int* restrict exclusions, __global real* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices,
unsigned int startTileIndex, unsigned int endTileIndex, unsigned int startTileIndex, unsigned int endTileIndex,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
#else #else
unsigned int numTiles unsigned int numTiles
#endif #endif
...@@ -51,10 +51,10 @@ void computeNonbonded( ...@@ -51,10 +51,10 @@ void computeNonbonded(
unsigned int pos = startTileIndex+get_group_id(0)*numTiles/get_num_groups(0); unsigned int pos = startTileIndex+get_group_id(0)*numTiles/get_num_groups(0);
unsigned int end = startTileIndex+(get_group_id(0)+1)*numTiles/get_num_groups(0); unsigned int end = startTileIndex+(get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif #endif
float energy = 0.0f; real energy = 0;
unsigned int lasty = 0xFFFFFFFF; unsigned int lasty = 0xFFFFFFFF;
__local AtomData localData[TILE_SIZE]; __local AtomData localData[TILE_SIZE];
__local UnalignedFloat3 localForce[FORCE_WORK_GROUP_SIZE]; __local UnalignedReal3 localForce[FORCE_WORK_GROUP_SIZE];
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
__local unsigned int exclusionRange[2]; __local unsigned int exclusionRange[2];
__local int exclusionIndex[1]; __local int exclusionIndex[1];
...@@ -83,8 +83,8 @@ void computeNonbonded( ...@@ -83,8 +83,8 @@ void computeNonbonded(
unsigned int tgx = get_local_id(0) & (TILE_SIZE-1); unsigned int tgx = get_local_id(0) & (TILE_SIZE-1);
unsigned int localForceOffset = get_local_id(0) & ~(TILE_SIZE-1); unsigned int localForceOffset = get_local_id(0) & ~(TILE_SIZE-1);
unsigned int atom1 = x*TILE_SIZE + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
float4 force = 0.0f; real4 force = 0;
float4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile. // Locate the exclusion data for this tile.
...@@ -121,25 +121,25 @@ void computeNonbonded( ...@@ -121,25 +121,25 @@ void computeNonbonded(
bool isExcluded = !(excl & 0x1); bool isExcluded = !(excl & 0x1);
#endif #endif
unsigned int atom2 = baseLocalAtom+j; unsigned int atom2 = baseLocalAtom+j;
float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q); real4 posq2 = (real4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = RSQRT(r2); real invR = RSQRT(r2);
float r = RECIP(invR); real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+baseLocalAtom+j; atom2 = y*TILE_SIZE+baseLocalAtom+j;
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
float dEdR = 0.0f; real dEdR = 0;
#else #else
float4 dEdR1 = (float4) 0.0f; real4 dEdR1 = (real4) 0;
float4 dEdR2 = (float4) 0.0f; real4 dEdR2 = (real4) 0;
#endif #endif
float tempEnergy = 0.0f; real tempEnergy = 0;
COMPUTE_INTERACTION COMPUTE_INTERACTION
energy += 0.5f*tempEnergy; energy += 0.5f*tempEnergy;
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
...@@ -173,8 +173,8 @@ void computeNonbonded( ...@@ -173,8 +173,8 @@ void computeNonbonded(
#else #else
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif #endif
// Cheaper to load/store float4 than float3. // Cheaper to load/store real4 than real3.
float4 sum = forceBuffers[offset]; real4 sum = forceBuffers[offset];
sum.xyz += force.xyz; sum.xyz += force.xyz;
forceBuffers[offset] = sum; forceBuffers[offset] = sum;
#endif #endif
...@@ -187,16 +187,16 @@ void computeNonbonded( ...@@ -187,16 +187,16 @@ void computeNonbonded(
if (lasty != y && get_local_id(0) < TILE_SIZE) { if (lasty != y && get_local_id(0) < TILE_SIZE) {
const unsigned int localAtomIndex = tgx; const unsigned int localAtomIndex = tgx;
unsigned int j = y*TILE_SIZE + tgx; unsigned int j = y*TILE_SIZE + tgx;
float4 tempPosq = posq[j]; real4 tempPosq = posq[j];
localData[localAtomIndex].x = tempPosq.x; localData[localAtomIndex].x = tempPosq.x;
localData[localAtomIndex].y = tempPosq.y; localData[localAtomIndex].y = tempPosq.y;
localData[localAtomIndex].z = tempPosq.z; localData[localAtomIndex].z = tempPosq.z;
localData[localAtomIndex].q = tempPosq.w; localData[localAtomIndex].q = tempPosq.w;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
} }
localForce[get_local_id(0)].x = 0.0f; localForce[get_local_id(0)].x = 0;
localForce[get_local_id(0)].y = 0.0f; localForce[get_local_id(0)].y = 0;
localForce[get_local_id(0)].z = 0.0f; localForce[get_local_id(0)].z = 0;
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
// Compute the full set of interactions in this tile. // Compute the full set of interactions in this tile.
...@@ -210,26 +210,26 @@ void computeNonbonded( ...@@ -210,26 +210,26 @@ void computeNonbonded(
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1); bool isExcluded = !(excl & 0x1);
#endif #endif
float4 posq2 = (float4) (localData[tj].x, localData[tj].y, localData[tj].z, localData[tj].q); real4 posq2 = (real4) (localData[tj].x, localData[tj].y, localData[tj].z, localData[tj].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = RSQRT(r2); real invR = RSQRT(r2);
float r = RECIP(invR); real r = RECIP(invR);
int atom2 = tj; int atom2 = tj;
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+tj; atom2 = y*TILE_SIZE+tj;
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
float dEdR = 0.0f; real dEdR = 0;
#else #else
float4 dEdR1 = (float4) 0.0f; real4 dEdR1 = (real4) 0;
float4 dEdR2 = (float4) 0.0f; real4 dEdR2 = (real4) 0;
#endif #endif
float tempEnergy = 0.0f; real tempEnergy = 0;
COMPUTE_INTERACTION COMPUTE_INTERACTION
energy += tempEnergy; energy += tempEnergy;
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
...@@ -277,9 +277,9 @@ void computeNonbonded( ...@@ -277,9 +277,9 @@ void computeNonbonded(
const unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; const unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
const unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; const unsigned int offset2 = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif #endif
// Cheaper to load/store float4 than float3. Do all loads before all stores to minimize store-load waits. // Cheaper to load/store real4 than real3. Do all loads before all stores to minimize store-load waits.
float4 sum1 = forceBuffers[offset1]; real4 sum1 = forceBuffers[offset1];
float4 sum2 = forceBuffers[offset2]; real4 sum2 = forceBuffers[offset2];
sum1.x += localData[tgx].fx + force.x; sum1.x += localData[tgx].fx + force.x;
sum1.y += localData[tgx].fy + force.y; sum1.y += localData[tgx].fy + force.y;
sum1.z += localData[tgx].fz + force.z; sum1.z += localData[tgx].fz + force.z;
......
...@@ -6,12 +6,12 @@ ...@@ -6,12 +6,12 @@
#define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE) #define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE)
typedef struct { typedef struct {
float x, y, z; real x, y, z;
float q; real q;
float fx, fy, fz; real fx, fy, fz;
ATOM_PARAMETER_DATA ATOM_PARAMETER_DATA
#ifndef PARAMETER_SIZE_IS_EVEN #ifndef PARAMETER_SIZE_IS_EVEN
float padding; real padding;
#endif #endif
} AtomData; } AtomData;
...@@ -22,13 +22,13 @@ __kernel void computeNonbonded( ...@@ -22,13 +22,13 @@ __kernel void computeNonbonded(
#ifdef SUPPORTS_64_BIT_ATOMICS #ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers, __global long* restrict forceBuffers,
#else #else
__global float4* restrict forceBuffers, __global real4* restrict forceBuffers,
#endif #endif
__global float* restrict energyBuffer, __global const float4* restrict posq, __global const unsigned int* restrict exclusions, __global real* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices,
unsigned int startTileIndex, unsigned int endTileIndex, unsigned int startTileIndex, unsigned int endTileIndex,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags __global const ushort2* restrict tiles, __global const unsigned int* restrict interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, __global const unsigned int* restrict interactionFlags
#else #else
unsigned int numTiles unsigned int numTiles
#endif #endif
...@@ -43,9 +43,9 @@ __kernel void computeNonbonded( ...@@ -43,9 +43,9 @@ __kernel void computeNonbonded(
unsigned int pos = startTileIndex+warp*numTiles/totalWarps; unsigned int pos = startTileIndex+warp*numTiles/totalWarps;
unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps; unsigned int end = startTileIndex+(warp+1)*numTiles/totalWarps;
#endif #endif
float energy = 0.0f; real energy = 0;
__local AtomData localData[FORCE_WORK_GROUP_SIZE]; __local AtomData localData[FORCE_WORK_GROUP_SIZE];
__local float tempBuffer[3*FORCE_WORK_GROUP_SIZE]; __local real tempBuffer[3*FORCE_WORK_GROUP_SIZE];
__local unsigned int exclusionRange[2*WARPS_PER_GROUP]; __local unsigned int exclusionRange[2*WARPS_PER_GROUP];
__local int exclusionIndex[WARPS_PER_GROUP]; __local int exclusionIndex[WARPS_PER_GROUP];
__local int2* reservedBlocks = (__local int2*) exclusionRange; __local int2* reservedBlocks = (__local int2*) exclusionRange;
...@@ -56,7 +56,7 @@ __kernel void computeNonbonded( ...@@ -56,7 +56,7 @@ __kernel void computeNonbonded(
const unsigned int tbx = get_local_id(0) - tgx; const unsigned int tbx = get_local_id(0) - tgx;
const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE; const unsigned int localGroupIndex = get_local_id(0)/TILE_SIZE;
unsigned int x, y; unsigned int x, y;
float4 force = 0.0f; real4 force = 0;
if (pos < end) { if (pos < end) {
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (numTiles <= maxTiles) { if (numTiles <= maxTiles) {
...@@ -75,7 +75,7 @@ __kernel void computeNonbonded( ...@@ -75,7 +75,7 @@ __kernel void computeNonbonded(
} }
} }
unsigned int atom1 = x*TILE_SIZE + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
float4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile. // Locate the exclusion data for this tile.
...@@ -111,25 +111,25 @@ __kernel void computeNonbonded( ...@@ -111,25 +111,25 @@ __kernel void computeNonbonded(
bool isExcluded = !(excl & 0x1); bool isExcluded = !(excl & 0x1);
#endif #endif
int atom2 = tbx+j; int atom2 = tbx+j;
float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q); real4 posq2 = (real4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = RSQRT(r2); real invR = RSQRT(r2);
float r = RECIP(invR); real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j; atom2 = y*TILE_SIZE+j;
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
float dEdR = 0.0f; real dEdR = 0;
#else #else
float4 dEdR1 = (float4) 0.0f; real4 dEdR1 = (real4) 0;
float4 dEdR2 = (float4) 0.0f; real4 dEdR2 = (real4) 0;
#endif #endif
float tempEnergy = 0.0f; real tempEnergy = 0;
COMPUTE_INTERACTION COMPUTE_INTERACTION
energy += 0.5f*tempEnergy; energy += 0.5f*tempEnergy;
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
...@@ -147,15 +147,15 @@ __kernel void computeNonbonded( ...@@ -147,15 +147,15 @@ __kernel void computeNonbonded(
const unsigned int localAtomIndex = get_local_id(0); const unsigned int localAtomIndex = get_local_id(0);
unsigned int j = y*TILE_SIZE + tgx; unsigned int j = y*TILE_SIZE + tgx;
float4 tempPosq = posq[j]; real4 tempPosq = posq[j];
localData[localAtomIndex].x = tempPosq.x; localData[localAtomIndex].x = tempPosq.x;
localData[localAtomIndex].y = tempPosq.y; localData[localAtomIndex].y = tempPosq.y;
localData[localAtomIndex].z = tempPosq.z; localData[localAtomIndex].z = tempPosq.z;
localData[localAtomIndex].q = tempPosq.w; localData[localAtomIndex].q = tempPosq.w;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData[localAtomIndex].fx = 0.0f; localData[localAtomIndex].fx = 0;
localData[localAtomIndex].fy = 0.0f; localData[localAtomIndex].fy = 0;
localData[localAtomIndex].fz = 0.0f; localData[localAtomIndex].fz = 0;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF); unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (!hasExclusions && flags != 0xFFFFFFFF) { if (!hasExclusions && flags != 0xFFFFFFFF) {
...@@ -171,25 +171,25 @@ __kernel void computeNonbonded( ...@@ -171,25 +171,25 @@ __kernel void computeNonbonded(
int atom2 = tbx+j; int atom2 = tbx+j;
int bufferIndex = 3*get_local_id(0); int bufferIndex = 3*get_local_id(0);
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
float dEdR = 0.0f; real dEdR = 0;
#else #else
float4 dEdR1 = (float4) 0.0f; real4 dEdR1 = (real4) 0;
float4 dEdR2 = (float4) 0.0f; real4 dEdR2 = (real4) 0;
#endif #endif
float tempEnergy = 0.0f; real tempEnergy = 0;
float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q); real4 posq2 = (real4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) { if (r2 < CUTOFF_SQUARED) {
#endif #endif
float invR = RSQRT(r2); real invR = RSQRT(r2);
float r = RECIP(invR); real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j; atom2 = y*TILE_SIZE+j;
COMPUTE_INTERACTION COMPUTE_INTERACTION
...@@ -241,28 +241,28 @@ __kernel void computeNonbonded( ...@@ -241,28 +241,28 @@ __kernel void computeNonbonded(
bool isExcluded = !(excl & 0x1); bool isExcluded = !(excl & 0x1);
#endif #endif
int atom2 = tbx+tj; int atom2 = tbx+tj;
float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q); real4 posq2 = (real4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f); real4 delta = (real4) (posq2.xyz - posq1.xyz, 0);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) { if (r2 < CUTOFF_SQUARED) {
#endif #endif
float invR = RSQRT(r2); real invR = RSQRT(r2);
float r = RECIP(invR); real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+tj; atom2 = y*TILE_SIZE+tj;
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
float dEdR = 0.0f; real dEdR = 0;
#else #else
float4 dEdR1 = (float4) 0.0f; real4 dEdR1 = (real4) 0;
float4 dEdR2 = (float4) 0.0f; real4 dEdR2 = (real4) 0;
#endif #endif
float tempEnergy = 0.0f; real tempEnergy = 0;
COMPUTE_INTERACTION COMPUTE_INTERACTION
energy += tempEnergy; energy += tempEnergy;
#ifdef USE_SYMMETRIC #ifdef USE_SYMMETRIC
...@@ -347,7 +347,7 @@ __kernel void computeNonbonded( ...@@ -347,7 +347,7 @@ __kernel void computeNonbonded(
} }
if (writeY > -1) { if (writeY > -1) {
const unsigned int offset = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS; const unsigned int offset = y*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset] += (float4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0.0f); forceBuffers[offset] += (real4) (localData[get_local_id(0)].fx, localData[get_local_id(0)].fy, localData[get_local_id(0)].fz, 0);
} }
done = true; done = true;
if (tgx == 0) if (tgx == 0)
......
float4 torsionParams = PARAMS[index]; float4 torsionParams = PARAMS[index];
float deltaAngle = torsionParams.z*theta-torsionParams.y; real deltaAngle = torsionParams.z*theta-torsionParams.y;
energy += torsionParams.x*(1.0f+cos(deltaAngle)); energy += torsionParams.x*(1.0f+cos(deltaAngle));
float sinDeltaAngle = sin(deltaAngle); real sinDeltaAngle = sin(deltaAngle);
float dEdAngle = -torsionParams.x*torsionParams.z*sinDeltaAngle; real dEdAngle = -torsionParams.x*torsionParams.z*sinDeltaAngle;
__kernel void updateBsplines(__global const float4* restrict posq, __global float4* restrict pmeBsplineTheta, __local float4* restrict bsplinesCache, __kernel void updateBsplines(__global const real4* restrict posq, __global real4* restrict pmeBsplineTheta, __local real4* restrict bsplinesCache,
__global int2* restrict pmeAtomGridIndex, float4 periodicBoxSize, float4 invPeriodicBoxSize) { __global int2* restrict pmeAtomGridIndex, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
const float4 scale = 1.0f/(PME_ORDER-1); const real4 scale = 1/(real) (PME_ORDER-1);
for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) { for (int i = get_global_id(0); i < NUM_ATOMS; i += get_global_size(0)) {
__local float4* data = &bsplinesCache[get_local_id(0)*PME_ORDER]; __local real4* data = &bsplinesCache[get_local_id(0)*PME_ORDER];
float4 pos = posq[i]; real4 pos = posq[i];
pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x; pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y; pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z; pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
float4 t = (float4) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X, real4 t = (real4) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
(pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y, (pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
(pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z, 0.0f); (pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z, 0.0f);
float4 dr = (float4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f); real4 dr = (real4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f);
int4 gridIndex = (int4) (((int) t.x) % GRID_SIZE_X, int4 gridIndex = (int4) (((int) t.x) % GRID_SIZE_X,
((int) t.y) % GRID_SIZE_Y, ((int) t.y) % GRID_SIZE_Y,
((int) t.z) % GRID_SIZE_Z, 0); ((int) t.z) % GRID_SIZE_Z, 0);
...@@ -19,15 +19,15 @@ __kernel void updateBsplines(__global const float4* restrict posq, __global floa ...@@ -19,15 +19,15 @@ __kernel void updateBsplines(__global const float4* restrict posq, __global floa
data[1] = dr; data[1] = dr;
data[0] = 1.0f-dr; data[0] = 1.0f-dr;
for (int j = 3; j < PME_ORDER; j++) { for (int j = 3; j < PME_ORDER; j++) {
float div = 1.0f/(j-1.0f); real div = RECIP(j-1.0f);
data[j-1] = div*dr*data[j-2]; data[j-1] = div*dr*data[j-2];
for (int k = 1; k < (j-1); k++) for (int k = 1; k < (j-1); k++)
data[j-k-1] = div*((dr+(float4) k) *data[j-k-2] + (-dr+(float4) (j-k))*data[j-k-1]); data[j-k-1] = div*((dr+(real4) k) *data[j-k-2] + (-dr+(real4) (j-k))*data[j-k-1]);
data[0] = div*(- dr+1.0f)*data[0]; data[0] = div*(- dr+1.0f)*data[0];
} }
data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2]; data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
for (int j = 1; j < (PME_ORDER-1); j++) for (int j = 1; j < (PME_ORDER-1); j++)
data[PME_ORDER-j-1] = scale*((dr+(float4) j)*data[PME_ORDER-j-2] + (-dr+(float4) (PME_ORDER-j))*data[PME_ORDER-j-1]); data[PME_ORDER-j-1] = scale*((dr+(real4) j)*data[PME_ORDER-j-2] + (-dr+(real4) (PME_ORDER-j))*data[PME_ORDER-j-1]);
data[0] = scale*(-dr+1.0f)*data[0]; data[0] = scale*(-dr+1.0f)*data[0];
for (int j = 0; j < PME_ORDER; j++) { for (int j = 0; j < PME_ORDER; j++) {
data[j].w = pos.w; // Storing the charge here improves cache coherency in the charge spreading kernel data[j].w = pos.w; // Storing the charge here improves cache coherency in the charge spreading kernel
...@@ -39,7 +39,7 @@ __kernel void updateBsplines(__global const float4* restrict posq, __global floa ...@@ -39,7 +39,7 @@ __kernel void updateBsplines(__global const float4* restrict posq, __global floa
/** /**
* For each grid point, find the range of sorted atoms associated with that point. * For each grid point, find the range of sorted atoms associated with that point.
*/ */
__kernel void findAtomRangeForGrid(__global int2* restrict pmeAtomGridIndex, __global int* restrict pmeAtomRange, __global const float4* restrict posq, float4 periodicBoxSize, float4 invPeriodicBoxSize) { __kernel void findAtomRangeForGrid(__global int2* restrict pmeAtomGridIndex, __global int* restrict pmeAtomRange, __global const real4* restrict posq, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
int start = (NUM_ATOMS*get_global_id(0))/get_global_size(0); int start = (NUM_ATOMS*get_global_id(0))/get_global_size(0);
int end = (NUM_ATOMS*(get_global_id(0)+1))/get_global_size(0); int end = (NUM_ATOMS*(get_global_id(0)+1))/get_global_size(0);
int last = (start == 0 ? -1 : pmeAtomGridIndex[start-1].y); int last = (start == 0 ? -1 : pmeAtomGridIndex[start-1].y);
...@@ -66,11 +66,11 @@ __kernel void findAtomRangeForGrid(__global int2* restrict pmeAtomGridIndex, __g ...@@ -66,11 +66,11 @@ __kernel void findAtomRangeForGrid(__global int2* restrict pmeAtomGridIndex, __g
* The grid index won't be needed again. Reuse that component to hold the z index, thus saving * The grid index won't be needed again. Reuse that component to hold the z index, thus saving
* some work in the charge spreading kernel. * some work in the charge spreading kernel.
*/ */
__kernel void recordZIndex(__global int2* restrict pmeAtomGridIndex, __global const float4* restrict posq, float4 periodicBoxSize, float4 invPeriodicBoxSize) { __kernel void recordZIndex(__global int2* restrict pmeAtomGridIndex, __global const real4* restrict posq, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
int start = (NUM_ATOMS*get_global_id(0))/get_global_size(0); int start = (NUM_ATOMS*get_global_id(0))/get_global_size(0);
int end = (NUM_ATOMS*(get_global_id(0)+1))/get_global_size(0); int end = (NUM_ATOMS*(get_global_id(0)+1))/get_global_size(0);
for (int i = start; i < end; ++i) { for (int i = start; i < end; ++i) {
float posz = posq[pmeAtomGridIndex[i].x].z; real posz = posq[pmeAtomGridIndex[i].x].z;
posz -= floor(posz*invPeriodicBoxSize.z)*periodicBoxSize.z; posz -= floor(posz*invPeriodicBoxSize.z)*periodicBoxSize.z;
int z = ((int) ((posz*invPeriodicBoxSize.z)*GRID_SIZE_Z)) % GRID_SIZE_Z; int z = ((int) ((posz*invPeriodicBoxSize.z)*GRID_SIZE_Z)) % GRID_SIZE_Z;
pmeAtomGridIndex[i].y = z; pmeAtomGridIndex[i].y = z;
...@@ -83,14 +83,14 @@ __kernel void recordZIndex(__global int2* restrict pmeAtomGridIndex, __global co ...@@ -83,14 +83,14 @@ __kernel void recordZIndex(__global int2* restrict pmeAtomGridIndex, __global co
#define BUFFER_SIZE (PME_ORDER*PME_ORDER*PME_ORDER) #define BUFFER_SIZE (PME_ORDER*PME_ORDER*PME_ORDER)
__kernel __attribute__((reqd_work_group_size(BUFFER_SIZE, 1, 1))) __kernel __attribute__((reqd_work_group_size(BUFFER_SIZE, 1, 1)))
void gridSpreadCharge(__global const float4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange, void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
__global long* restrict pmeGrid, __global const float4* restrict pmeBsplineTheta, float4 periodicBoxSize, float4 invPeriodicBoxSize) { __global long* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta, real4 periodicBoxSize, real4 invPeriodicBoxSize) {
int ix = get_local_id(0)/(PME_ORDER*PME_ORDER); int ix = get_local_id(0)/(PME_ORDER*PME_ORDER);
int remainder = get_local_id(0)-ix*PME_ORDER*PME_ORDER; int remainder = get_local_id(0)-ix*PME_ORDER*PME_ORDER;
int iy = remainder/PME_ORDER; int iy = remainder/PME_ORDER;
int iz = remainder-iy*PME_ORDER; int iz = remainder-iy*PME_ORDER;
__local float4 theta[PME_ORDER]; __local real4 theta[PME_ORDER];
__local float charge[BUFFER_SIZE]; __local real charge[BUFFER_SIZE];
__local int basex[BUFFER_SIZE]; __local int basex[BUFFER_SIZE];
__local int basey[BUFFER_SIZE]; __local int basey[BUFFER_SIZE];
__local int basez[BUFFER_SIZE]; __local int basez[BUFFER_SIZE];
...@@ -101,7 +101,7 @@ void gridSpreadCharge(__global const float4* restrict posq, __global const int2* ...@@ -101,7 +101,7 @@ void gridSpreadCharge(__global const float4* restrict posq, __global const int2*
if (get_local_id(0) < BUFFER_SIZE) { if (get_local_id(0) < BUFFER_SIZE) {
int atomIndex = baseIndex+get_local_id(0); int atomIndex = baseIndex+get_local_id(0);
if (atomIndex < NUM_ATOMS) { if (atomIndex < NUM_ATOMS) {
float4 pos = posq[atomIndex]; real4 pos = posq[atomIndex];
charge[get_local_id(0)] = pos.w; charge[get_local_id(0)] = pos.w;
pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x; pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y; pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
...@@ -118,32 +118,40 @@ void gridSpreadCharge(__global const float4* restrict posq, __global const int2* ...@@ -118,32 +118,40 @@ void gridSpreadCharge(__global const float4* restrict posq, __global const int2*
if (get_local_id(0) < PME_ORDER) if (get_local_id(0) < PME_ORDER)
theta[get_local_id(0)] = pmeBsplineTheta[atomIndex+get_local_id(0)*NUM_ATOMS]; theta[get_local_id(0)] = pmeBsplineTheta[atomIndex+get_local_id(0)*NUM_ATOMS];
barrier(CLK_LOCAL_MEM_FENCE); barrier(CLK_LOCAL_MEM_FENCE);
float add = charge[index]*theta[ix].x*theta[iy].y*theta[iz].z; real add = charge[index]*theta[ix].x*theta[iy].y*theta[iz].z;
int x = basex[index]+ix; int x = basex[index]+ix;
int y = basey[index]+iy; int y = basey[index]+iy;
int z = basez[index]+iz; int z = basez[index]+iz;
x -= (x >= GRID_SIZE_X ? GRID_SIZE_X : 0); x -= (x >= GRID_SIZE_X ? GRID_SIZE_X : 0);
y -= (y >= GRID_SIZE_Y ? GRID_SIZE_Y : 0); y -= (y >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
z -= (z >= GRID_SIZE_Z ? GRID_SIZE_Z : 0); z -= (z >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
#ifdef USE_DOUBLE_PRECISION
atom_add(&pmeGrid[2*(x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z)], (long) (add*0xFFFFFFFF));
#else
atom_add(&pmeGrid[x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z], (long) (add*0xFFFFFFFF)); atom_add(&pmeGrid[x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z], (long) (add*0xFFFFFFFF));
#endif
} }
} }
} }
} }
__kernel void finishSpreadCharge(__global long* restrict pmeGrid) { __kernel void finishSpreadCharge(__global long* restrict pmeGrid) {
__global float2* floatGrid = (__global float2*) pmeGrid; __global real2* realGrid = (__global real2*) pmeGrid;
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z; const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
float scale = EPSILON_FACTOR/(float) 0xFFFFFFFF; real scale = EPSILON_FACTOR/(real) 0xFFFFFFFF;
for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) { for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
#ifdef USE_DOUBLE_PRECISION
long value = pmeGrid[2*index];
#else
long value = pmeGrid[index]; long value = pmeGrid[index];
float2 floatValue = (float2) ((float) (value*scale), 0.0f); #endif
floatGrid[index] = floatValue; real2 realValue = (real2) ((real) (value*scale), 0);
realGrid[index] = realValue;
} }
} }
#else #else
__kernel void gridSpreadCharge(__global const float4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange, __kernel void gridSpreadCharge(__global const real4* restrict posq, __global const int2* restrict pmeAtomGridIndex, __global const int* restrict pmeAtomRange,
__global float2* restrict pmeGrid, __global const float4* restrict pmeBsplineTheta) { __global real2* restrict pmeGrid, __global const real4* restrict pmeBsplineTheta) {
unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z; unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
for (int gridIndex = get_global_id(0); gridIndex < numGridPoints; gridIndex += get_global_size(0)) { for (int gridIndex = get_global_id(0); gridIndex < numGridPoints; gridIndex += get_global_size(0)) {
// Compute the charge on a grid point. // Compute the charge on a grid point.
...@@ -153,7 +161,7 @@ __kernel void gridSpreadCharge(__global const float4* restrict posq, __global co ...@@ -153,7 +161,7 @@ __kernel void gridSpreadCharge(__global const float4* restrict posq, __global co
int remainder = gridIndex-gridPoint.x*GRID_SIZE_Y*GRID_SIZE_Z; int remainder = gridIndex-gridPoint.x*GRID_SIZE_Y*GRID_SIZE_Z;
gridPoint.y = remainder/GRID_SIZE_Z; gridPoint.y = remainder/GRID_SIZE_Z;
gridPoint.z = remainder-gridPoint.y*GRID_SIZE_Z; gridPoint.z = remainder-gridPoint.y*GRID_SIZE_Z;
float result = 0.0f; real result = 0.0f;
// Loop over all atoms that affect this grid point. // Loop over all atoms that affect this grid point.
...@@ -174,7 +182,7 @@ __kernel void gridSpreadCharge(__global const float4* restrict posq, __global co ...@@ -174,7 +182,7 @@ __kernel void gridSpreadCharge(__global const float4* restrict posq, __global co
int atomIndex = atomData.x; int atomIndex = atomData.x;
int z = atomData.y; int z = atomData.y;
int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z); int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
float atomCharge = pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].w; real atomCharge = pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].w;
result += atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z; result += atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z;
} }
if (z1 > gridPoint.z) if (z1 > gridPoint.z)
...@@ -189,21 +197,21 @@ __kernel void gridSpreadCharge(__global const float4* restrict posq, __global co ...@@ -189,21 +197,21 @@ __kernel void gridSpreadCharge(__global const float4* restrict posq, __global co
int atomIndex = atomData.x; int atomIndex = atomData.x;
int z = atomData.y; int z = atomData.y;
int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z); int iz = gridPoint.z-z+(gridPoint.z >= z ? 0 : GRID_SIZE_Z);
float atomCharge = pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].w; real atomCharge = pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].w;
result += atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z; result += atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z;
} }
} }
} }
} }
pmeGrid[gridIndex] = (float2) (result*EPSILON_FACTOR, 0.0f); pmeGrid[gridIndex] = (real2) (result*EPSILON_FACTOR, 0);
} }
} }
#endif #endif
__kernel void reciprocalConvolution(__global float2* restrict pmeGrid, __global float* restrict energyBuffer, __global const float* restrict pmeBsplineModuliX, __kernel void reciprocalConvolution(__global real2* restrict pmeGrid, __global real* restrict energyBuffer, __global const real* restrict pmeBsplineModuliX,
__global const float* restrict pmeBsplineModuliY, __global const float* restrict pmeBsplineModuliZ, float4 invPeriodicBoxSize, float recipScaleFactor) { __global const real* restrict pmeBsplineModuliY, __global const real* restrict pmeBsplineModuliZ, real4 invPeriodicBoxSize, real recipScaleFactor) {
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z; const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
float energy = 0.0f; real energy = 0.0f;
for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) { for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
int kx = index/(GRID_SIZE_Y*GRID_SIZE_Z); int kx = index/(GRID_SIZE_Y*GRID_SIZE_Z);
int remainder = index-kx*GRID_SIZE_Y*GRID_SIZE_Z; int remainder = index-kx*GRID_SIZE_Y*GRID_SIZE_Z;
...@@ -214,36 +222,36 @@ __kernel void reciprocalConvolution(__global float2* restrict pmeGrid, __global ...@@ -214,36 +222,36 @@ __kernel void reciprocalConvolution(__global float2* restrict pmeGrid, __global
int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X); int mx = (kx < (GRID_SIZE_X+1)/2) ? kx : (kx-GRID_SIZE_X);
int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y); int my = (ky < (GRID_SIZE_Y+1)/2) ? ky : (ky-GRID_SIZE_Y);
int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z); int mz = (kz < (GRID_SIZE_Z+1)/2) ? kz : (kz-GRID_SIZE_Z);
float mhx = mx*invPeriodicBoxSize.x; real mhx = mx*invPeriodicBoxSize.x;
float mhy = my*invPeriodicBoxSize.y; real mhy = my*invPeriodicBoxSize.y;
float mhz = mz*invPeriodicBoxSize.z; real mhz = mz*invPeriodicBoxSize.z;
float bx = pmeBsplineModuliX[kx]; real bx = pmeBsplineModuliX[kx];
float by = pmeBsplineModuliY[ky]; real by = pmeBsplineModuliY[ky];
float bz = pmeBsplineModuliZ[kz]; real bz = pmeBsplineModuliZ[kz];
float2 grid = pmeGrid[index]; real2 grid = pmeGrid[index];
float m2 = mhx*mhx+mhy*mhy+mhz*mhz; real m2 = mhx*mhx+mhy*mhy+mhz*mhz;
float denom = m2*bx*by*bz; real denom = m2*bx*by*bz;
float eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom; real eterm = recipScaleFactor*EXP(-RECIP_EXP_FACTOR*m2)/denom;
pmeGrid[index] = (float2) (grid.x*eterm, grid.y*eterm); pmeGrid[index] = (real2) (grid.x*eterm, grid.y*eterm);
energy += eterm*(grid.x*grid.x + grid.y*grid.y); energy += eterm*(grid.x*grid.x + grid.y*grid.y);
} }
energyBuffer[get_global_id(0)] += 0.5f*energy; energyBuffer[get_global_id(0)] += 0.5f*energy;
} }
__kernel void gridInterpolateForce(__global const float4* restrict posq, __global float4* restrict forceBuffers, __global const float2* restrict pmeGrid, __kernel void gridInterpolateForce(__global const real4* restrict posq, __global real4* restrict forceBuffers, __global const real2* restrict pmeGrid,
float4 periodicBoxSize, float4 invPeriodicBoxSize, __local float4* restrict bsplinesCache) { real4 periodicBoxSize, real4 invPeriodicBoxSize, __local real4* restrict bsplinesCache) {
const float4 scale = 1.0f/(PME_ORDER-1); const real4 scale = 1/(real) (PME_ORDER-1);
__local float4* data = &bsplinesCache[get_local_id(0)*PME_ORDER]; __local real4* data = &bsplinesCache[get_local_id(0)*PME_ORDER];
__local float4* ddata = &bsplinesCache[get_local_id(0)*PME_ORDER + get_local_size(0)*PME_ORDER]; __local real4* ddata = &bsplinesCache[get_local_id(0)*PME_ORDER + get_local_size(0)*PME_ORDER];
for (int atom = get_global_id(0); atom < NUM_ATOMS; atom += get_global_size(0)) { for (int atom = get_global_id(0); atom < NUM_ATOMS; atom += get_global_size(0)) {
float4 force = 0.0f; real4 force = 0.0f;
float4 pos = posq[atom]; real4 pos = posq[atom];
pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x; pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y; pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z; pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
float4 t = (float4) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X, real4 t = (real4) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X,
(pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y, (pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y,
(pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z, 0.0f); (pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z, 0.0f);
int4 gridIndex = (int4) (((int) t.x) % GRID_SIZE_X, int4 gridIndex = (int4) (((int) t.x) % GRID_SIZE_X,
((int) t.y) % GRID_SIZE_Y, ((int) t.y) % GRID_SIZE_Y,
((int) t.z) % GRID_SIZE_Z, 0); ((int) t.z) % GRID_SIZE_Z, 0);
...@@ -251,15 +259,15 @@ __kernel void gridInterpolateForce(__global const float4* restrict posq, __globa ...@@ -251,15 +259,15 @@ __kernel void gridInterpolateForce(__global const float4* restrict posq, __globa
// Since we need the full set of thetas, it's faster to compute them here than load them // Since we need the full set of thetas, it's faster to compute them here than load them
// from global memory. // from global memory.
float4 dr = (float4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f); real4 dr = (real4) (t.x-(int) t.x, t.y-(int) t.y, t.z-(int) t.z, 0.0f);
data[PME_ORDER-1] = 0.0f; data[PME_ORDER-1] = 0.0f;
data[1] = dr; data[1] = dr;
data[0] = 1.0f-dr; data[0] = 1.0f-dr;
for (int j = 3; j < PME_ORDER; j++) { for (int j = 3; j < PME_ORDER; j++) {
float div = 1.0f/(j-1.0f); real div = RECIP(j-1.0f);
data[j-1] = div*dr*data[j-2]; data[j-1] = div*dr*data[j-2];
for (int k = 1; k < (j-1); k++) for (int k = 1; k < (j-1); k++)
data[j-k-1] = div*((dr+(float4) k) *data[j-k-2] + (-dr+(float4) (j-k))*data[j-k-1]); data[j-k-1] = div*((dr+(real4) k) *data[j-k-2] + (-dr+(real4) (j-k))*data[j-k-1]);
data[0] = div*(- dr+1.0f)*data[0]; data[0] = div*(- dr+1.0f)*data[0];
} }
ddata[0] = -data[0]; ddata[0] = -data[0];
...@@ -267,7 +275,7 @@ __kernel void gridInterpolateForce(__global const float4* restrict posq, __globa ...@@ -267,7 +275,7 @@ __kernel void gridInterpolateForce(__global const float4* restrict posq, __globa
ddata[j] = data[j-1]-data[j]; ddata[j] = data[j-1]-data[j];
data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2]; data[PME_ORDER-1] = scale*dr*data[PME_ORDER-2];
for (int j = 1; j < (PME_ORDER-1); j++) for (int j = 1; j < (PME_ORDER-1); j++)
data[PME_ORDER-j-1] = scale*((dr+(float4) j)*data[PME_ORDER-j-2] + (-dr+(float4) (PME_ORDER-j))*data[PME_ORDER-j-1]); data[PME_ORDER-j-1] = scale*((dr+(real4) j)*data[PME_ORDER-j-2] + (-dr+(real4) (PME_ORDER-j))*data[PME_ORDER-j-1]);
data[0] = scale*(-dr+1.0f)*data[0]; data[0] = scale*(-dr+1.0f)*data[0];
// Compute the force on this atom. // Compute the force on this atom.
...@@ -282,7 +290,7 @@ __kernel void gridInterpolateForce(__global const float4* restrict posq, __globa ...@@ -282,7 +290,7 @@ __kernel void gridInterpolateForce(__global const float4* restrict posq, __globa
int zindex = gridIndex.z+iz; int zindex = gridIndex.z+iz;
zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0); zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex; int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
float gridvalue = pmeGrid[index].x; real gridvalue = pmeGrid[index].x;
force.x += ddata[ix].x*data[iy].y*data[iz].z*gridvalue; force.x += ddata[ix].x*data[iy].y*data[iz].z*gridvalue;
force.y += data[ix].x*ddata[iy].y*data[iz].z*gridvalue; force.y += data[ix].x*ddata[iy].y*data[iz].z*gridvalue;
#ifndef MAC_AMD_WORKAROUND #ifndef MAC_AMD_WORKAROUND
...@@ -302,14 +310,14 @@ __kernel void gridInterpolateForce(__global const float4* restrict posq, __globa ...@@ -302,14 +310,14 @@ __kernel void gridInterpolateForce(__global const float4* restrict posq, __globa
int zindex = gridIndex.z+iz; int zindex = gridIndex.z+iz;
zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0); zindex -= (zindex >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex; int index = xindex*GRID_SIZE_Y*GRID_SIZE_Z + yindex*GRID_SIZE_Z + zindex;
float gridvalue = pmeGrid[index].x; real gridvalue = pmeGrid[index].x;
force.z += data[ix].x*data[iy].y*ddata[iz].z*gridvalue; force.z += data[ix].x*data[iy].y*ddata[iz].z*gridvalue;
} }
} }
} }
#endif #endif
float4 totalForce = forceBuffers[atom]; real4 totalForce = forceBuffers[atom];
float q = pos.w*EPSILON_FACTOR; real q = pos.w*EPSILON_FACTOR;
totalForce.x -= q*force.x*GRID_SIZE_X*invPeriodicBoxSize.x; totalForce.x -= q*force.x*GRID_SIZE_X*invPeriodicBoxSize.x;
totalForce.y -= q*force.y*GRID_SIZE_Y*invPeriodicBoxSize.y; totalForce.y -= q*force.y*GRID_SIZE_Y*invPeriodicBoxSize.y;
totalForce.z -= q*force.z*GRID_SIZE_Z*invPeriodicBoxSize.z; totalForce.z -= q*force.z*GRID_SIZE_Z*invPeriodicBoxSize.z;
......
...@@ -4,9 +4,9 @@ if (theta < 0.0f) ...@@ -4,9 +4,9 @@ if (theta < 0.0f)
else else
theta -= PI; theta -= PI;
cosangle = -cosangle; cosangle = -cosangle;
float cosFactor = cosangle; real cosFactor = cosangle;
float dEdAngle = -torsionParams.s1; real dEdAngle = -torsionParams.s1;
float rbEnergy = torsionParams.s0; real rbEnergy = torsionParams.s0;
rbEnergy += torsionParams.s1*cosFactor; rbEnergy += torsionParams.s1*cosFactor;
dEdAngle -= 2.0f*torsionParams.s2*cosFactor; dEdAngle -= 2.0f*torsionParams.s2*cosFactor;
cosFactor *= cosangle; cosFactor *= cosangle;
......
const float PI = 3.14159265358979323846f; const real PI = 3.14159265358979323846f;
float4 v0 = (float4) (pos1.xyz-pos2.xyz, 0.0f); real4 v0 = (real4) (pos1.xyz-pos2.xyz, 0.0f);
float4 v1 = (float4) (pos3.xyz-pos2.xyz, 0.0f); real4 v1 = (real4) (pos3.xyz-pos2.xyz, 0.0f);
float4 v2 = (float4) (pos3.xyz-pos4.xyz, 0.0f); real4 v2 = (real4) (pos3.xyz-pos4.xyz, 0.0f);
float4 cp0 = cross(v0, v1); real4 cp0 = cross(v0, v1);
float4 cp1 = cross(v1, v2); real4 cp1 = cross(v1, v2);
float cosangle = dot(normalize(cp0), normalize(cp1)); real cosangle = dot(normalize(cp0), normalize(cp1));
float theta; real theta;
if (cosangle > 0.99f || cosangle < -0.99f) { if (cosangle > 0.99f || cosangle < -0.99f) {
// We're close to the singularity in acos(), so take the cross product and use asin() instead. // We're close to the singularity in acos(), so take the cross product and use asin() instead.
float4 cross_prod = cross(cp0, cp1); real4 cross_prod = cross(cp0, cp1);
float scale = dot(cp0, cp0)*dot(cp1, cp1); real scale = dot(cp0, cp0)*dot(cp1, cp1);
theta = asin(sqrt(dot(cross_prod, cross_prod)/scale)); theta = asin(sqrt(dot(cross_prod, cross_prod)/scale));
if (cosangle < 0.0f) if (cosangle < 0)
theta = PI-theta; theta = PI-theta;
} }
else else
theta = acos(cosangle); theta = acos(cosangle);
theta = (dot(v0, cp1) >= 0 ? theta : -theta); theta = (dot(v0, cp1) >= 0 ? theta : -theta);
COMPUTE_FORCE COMPUTE_FORCE
float normCross1 = dot(cp0, cp0); real normCross1 = dot(cp0, cp0);
float normSqrBC = dot(v1, v1); real normSqrBC = dot(v1, v1);
float normBC = sqrt(normSqrBC); real normBC = sqrt(normSqrBC);
float normCross2 = dot(cp1, cp1); real normCross2 = dot(cp1, cp1);
float dp = 1.0f/normSqrBC; real dp = 1.0f/normSqrBC;
float4 ff = (float4) ((-dEdAngle*normBC)/normCross1, dot(v0, v1)*dp, dot(v2, v1)*dp, (dEdAngle*normBC)/normCross2); real4 ff = (real4) ((-dEdAngle*normBC)/normCross1, dot(v0, v1)*dp, dot(v2, v1)*dp, (dEdAngle*normBC)/normCross2);
float4 force1 = ff.x*cp0; real4 force1 = ff.x*cp0;
float4 force4 = ff.w*cp1; real4 force4 = ff.w*cp1;
float4 s = ff.y*force1 - ff.z*force4; real4 s = ff.y*force1 - ff.z*force4;
float4 force2 = s-force1; real4 force2 = s-force1;
float4 force3 = -s-force4; real4 force3 = -s-force4;
...@@ -69,11 +69,11 @@ __kernel void clearSixBuffers(__global int* restrict buffer1, int size1, __globa ...@@ -69,11 +69,11 @@ __kernel void clearSixBuffers(__global int* restrict buffer1, int size1, __globa
* Sum a collection of buffers into the first one. * Sum a collection of buffers into the first one.
*/ */
__kernel void reduceFloat4Buffer(__global float4* restrict buffer, int bufferSize, int numBuffers) { __kernel void reduceReal4Buffer(__global real4* restrict buffer, int bufferSize, int numBuffers) {
int index = get_global_id(0); int index = get_global_id(0);
int totalSize = bufferSize*numBuffers; int totalSize = bufferSize*numBuffers;
while (index < bufferSize) { while (index < bufferSize) {
float4 sum = buffer[index]; real4 sum = buffer[index];
for (int i = index+bufferSize; i < totalSize; i += bufferSize) for (int i = index+bufferSize; i < totalSize; i += bufferSize)
sum += buffer[i]; sum += buffer[i];
buffer[index] = sum; buffer[index] = sum;
...@@ -84,11 +84,11 @@ __kernel void reduceFloat4Buffer(__global float4* restrict buffer, int bufferSiz ...@@ -84,11 +84,11 @@ __kernel void reduceFloat4Buffer(__global float4* restrict buffer, int bufferSiz
/** /**
* Sum the various buffers containing forces. * Sum the various buffers containing forces.
*/ */
__kernel void reduceForces(__global const long* restrict longBuffer, __global float4* restrict buffer, int bufferSize, int numBuffers) { __kernel void reduceForces(__global const long* restrict longBuffer, __global real4* restrict buffer, int bufferSize, int numBuffers) {
int totalSize = bufferSize*numBuffers; int totalSize = bufferSize*numBuffers;
float scale = 1.0f/(float) 0xFFFFFFFF; real scale = 1/(real) 0xFFFFFFFF;
for (int index = get_global_id(0); index < bufferSize; index += get_global_size(0)) { for (int index = get_global_id(0); index < bufferSize; index += get_global_size(0)) {
float4 sum = (float4) (scale*longBuffer[index], scale*longBuffer[index+bufferSize], scale*longBuffer[index+2*bufferSize], 0.0f); real4 sum = (real4) (scale*longBuffer[index], scale*longBuffer[index+bufferSize], scale*longBuffer[index+2*bufferSize], 0);
for (int i = index; i < totalSize; i += bufferSize) for (int i = index; i < totalSize; i += bufferSize)
sum += buffer[i]; sum += buffer[i];
buffer[index] = sum; buffer[index] = sum;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment