Commit 5ecc8e00 authored by Peter Eastman's avatar Peter Eastman
Browse files

OpenCL version of GayBerneForce requires 64 bit atomics

parent f1ca1e6d
...@@ -6051,6 +6051,8 @@ OpenCLCalcGayBerneForceKernel::~OpenCLCalcGayBerneForceKernel() { ...@@ -6051,6 +6051,8 @@ OpenCLCalcGayBerneForceKernel::~OpenCLCalcGayBerneForceKernel() {
} }
void OpenCLCalcGayBerneForceKernel::initialize(const System& system, const GayBerneForce& force) { void OpenCLCalcGayBerneForceKernel::initialize(const System& system, const GayBerneForce& force) {
if (!cl.getSupports64BitGlobalAtomics())
throw OpenMMException("GayBerneForce requires a device that supports 64 bit atomic operations");
// Initialize interactions. // Initialize interactions.
...@@ -6123,12 +6125,9 @@ void OpenCLCalcGayBerneForceKernel::initialize(const System& system, const GayBe ...@@ -6123,12 +6125,9 @@ void OpenCLCalcGayBerneForceKernel::initialize(const System& system, const GayBe
neighborIndex = OpenCLArray::create<cl_int>(cl, maxNeighborBlocks, "neighbors"); neighborIndex = OpenCLArray::create<cl_int>(cl, maxNeighborBlocks, "neighbors");
neighborBlockCount = OpenCLArray::create<cl_int>(cl, 1, "neighborBlockCount"); neighborBlockCount = OpenCLArray::create<cl_int>(cl, 1, "neighborBlockCount");
// Create arrays for accumulating torques. // Create array for accumulating torques.
if (cl.getSupports64BitGlobalAtomics()) torque = OpenCLArray::create<cl_long>(cl, 3*cl.getPaddedNumAtoms(), "torque");
torque = OpenCLArray::create<cl_long>(cl, 3*cl.getPaddedNumAtoms(), "torque");
else
torque = new OpenCLArray(cl, cl.getPaddedNumAtoms()*cl.getNonbondedUtilities().getNumForceBuffers(), 4*elementSize, "torque");
cl.addAutoclearBuffer(*torque); cl.addAutoclearBuffer(*torque);
// Create the kernels. // Create the kernels.
...@@ -6195,9 +6194,8 @@ double OpenCLCalcGayBerneForceKernel::execute(ContextImpl& context, bool include ...@@ -6195,9 +6194,8 @@ double OpenCLCalcGayBerneForceKernel::execute(ContextImpl& context, bool include
neighborsKernel.setArg<cl::Buffer>(12, neighborBlockCount->getDeviceBuffer()); neighborsKernel.setArg<cl::Buffer>(12, neighborBlockCount->getDeviceBuffer());
neighborsKernel.setArg<cl::Buffer>(13, exclusions->getDeviceBuffer()); neighborsKernel.setArg<cl::Buffer>(13, exclusions->getDeviceBuffer());
neighborsKernel.setArg<cl::Buffer>(14, exclusionStartIndex->getDeviceBuffer()); neighborsKernel.setArg<cl::Buffer>(14, exclusionStartIndex->getDeviceBuffer());
bool useLong = cl.getSupports64BitGlobalAtomics();
int index = 0; int index = 0;
forceKernel.setArg<cl::Buffer>(index++, (useLong ? cl.getLongForceBuffer().getDeviceBuffer() : cl.getForceBuffers().getDeviceBuffer())); forceKernel.setArg<cl::Buffer>(index++, cl.getLongForceBuffer().getDeviceBuffer());
forceKernel.setArg<cl::Buffer>(index++, torque->getDeviceBuffer()); forceKernel.setArg<cl::Buffer>(index++, torque->getDeviceBuffer());
forceKernel.setArg<cl_int>(index++, numRealParticles); forceKernel.setArg<cl_int>(index++, numRealParticles);
forceKernel.setArg<cl_int>(index++, exceptionAtoms.size()); forceKernel.setArg<cl_int>(index++, exceptionAtoms.size());
...@@ -6220,10 +6218,8 @@ double OpenCLCalcGayBerneForceKernel::execute(ContextImpl& context, bool include ...@@ -6220,10 +6218,8 @@ double OpenCLCalcGayBerneForceKernel::execute(ContextImpl& context, bool include
forceKernel.setArg<cl::Buffer>(index++, neighborBlockCount->getDeviceBuffer()); forceKernel.setArg<cl::Buffer>(index++, neighborBlockCount->getDeviceBuffer());
} }
index = 0; index = 0;
torqueKernel.setArg<cl::Buffer>(index++, (useLong ? cl.getLongForceBuffer().getDeviceBuffer() : cl.getForceBuffers().getDeviceBuffer())); torqueKernel.setArg<cl::Buffer>(index++, cl.getLongForceBuffer().getDeviceBuffer());
torqueKernel.setArg<cl::Buffer>(index++, torque->getDeviceBuffer()); torqueKernel.setArg<cl::Buffer>(index++, torque->getDeviceBuffer());
if (!useLong)
torqueKernel.setArg<cl_int>(index++, cl.getNumForceBuffers());
torqueKernel.setArg<cl_int>(index++, numRealParticles); torqueKernel.setArg<cl_int>(index++, numRealParticles);
torqueKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer()); torqueKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
torqueKernel.setArg<cl::Buffer>(index++, axisParticleIndices->getDeviceBuffer()); torqueKernel.setArg<cl::Buffer>(index++, axisParticleIndices->getDeviceBuffer());
......
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable #pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif
#define TILE_SIZE 32 #define TILE_SIZE 32
#define NEIGHBOR_BLOCK_SIZE 32 #define NEIGHBOR_BLOCK_SIZE 32
...@@ -343,11 +341,7 @@ void computeOneInteraction(AtomData* data1, AtomData* data2, real sigma, real ep ...@@ -343,11 +341,7 @@ void computeOneInteraction(AtomData* data1, AtomData* data2, real sigma, real ep
* Compute the interactions. * Compute the interactions.
*/ */
__kernel void computeForce( __kernel void computeForce(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers, __global long* restrict torqueBuffers, __global long* restrict forceBuffers, __global long* restrict torqueBuffers,
#else
__global real4* restrict forceBuffers, __global real4* restrict torqueBuffers,
#endif
int numAtoms, int numExceptions, __global mixed* restrict energyBuffer, __global const real4* restrict pos, int numAtoms, int numExceptions, __global mixed* restrict energyBuffer, __global const real4* restrict pos,
__global const float4* restrict sigParams, __global const float2* restrict epsParams, __global const int* restrict sortedAtoms, __global const float4* restrict sigParams, __global const float2* restrict epsParams, __global const int* restrict sortedAtoms,
__global const real* restrict aMatrix, __global const real* restrict bMatrix, __global const real* restrict gMatrix, __global const real* restrict aMatrix, __global const real* restrict bMatrix, __global const real* restrict gMatrix,
...@@ -395,31 +389,19 @@ __kernel void computeForce( ...@@ -395,31 +389,19 @@ __kernel void computeForce(
real sigma = data1.sig.x+data2.sig.x; real sigma = data1.sig.x+data2.sig.x;
real epsilon = data1.eps.x*data2.eps.x; real epsilon = data1.eps.x*data2.eps.x;
computeOneInteraction(&data1, &data2, sigma, epsilon, delta, r2, &force1, &force2, &torque1, &torque2, &energy); computeOneInteraction(&data1, &data2, sigma, epsilon, delta, r2, &force1, &force2, &torque1, &torque2, &energy);
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[index2], (long) (force2.x*0x100000000)); atom_add(&forceBuffers[index2], (long) (force2.x*0x100000000));
atom_add(&forceBuffers[index2+PADDED_NUM_ATOMS], (long) (force2.y*0x100000000)); atom_add(&forceBuffers[index2+PADDED_NUM_ATOMS], (long) (force2.y*0x100000000));
atom_add(&forceBuffers[index2+2*PADDED_NUM_ATOMS], (long) (force2.z*0x100000000)); atom_add(&forceBuffers[index2+2*PADDED_NUM_ATOMS], (long) (force2.z*0x100000000));
atom_add(&torqueBuffers[index2], (long) (torque2.x*0x100000000)); atom_add(&torqueBuffers[index2], (long) (torque2.x*0x100000000));
atom_add(&torqueBuffers[index2+PADDED_NUM_ATOMS], (long) (torque2.y*0x100000000)); atom_add(&torqueBuffers[index2+PADDED_NUM_ATOMS], (long) (torque2.y*0x100000000));
atom_add(&torqueBuffers[index2+2*PADDED_NUM_ATOMS], (long) (torque2.z*0x100000000)); atom_add(&torqueBuffers[index2+2*PADDED_NUM_ATOMS], (long) (torque2.z*0x100000000));
#else
unsigned int offset = index2 + warp*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += force2.xyz;
torqueBuffers[offset].xyz += torque2.xyz;
#endif
} }
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[index1], (long) (force1.x*0x100000000)); atom_add(&forceBuffers[index1], (long) (force1.x*0x100000000));
atom_add(&forceBuffers[index1+PADDED_NUM_ATOMS], (long) (force1.y*0x100000000)); atom_add(&forceBuffers[index1+PADDED_NUM_ATOMS], (long) (force1.y*0x100000000));
atom_add(&forceBuffers[index1+2*PADDED_NUM_ATOMS], (long) (force1.z*0x100000000)); atom_add(&forceBuffers[index1+2*PADDED_NUM_ATOMS], (long) (force1.z*0x100000000));
atom_add(&torqueBuffers[index1], (long) (torque1.x*0x100000000)); atom_add(&torqueBuffers[index1], (long) (torque1.x*0x100000000));
atom_add(&torqueBuffers[index1+PADDED_NUM_ATOMS], (long) (torque1.y*0x100000000)); atom_add(&torqueBuffers[index1+PADDED_NUM_ATOMS], (long) (torque1.y*0x100000000));
atom_add(&torqueBuffers[index1+2*PADDED_NUM_ATOMS], (long) (torque1.z*0x100000000)); atom_add(&torqueBuffers[index1+2*PADDED_NUM_ATOMS], (long) (torque1.z*0x100000000));
#else
unsigned int offset = index1 + warp*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += force1.xyz;
torqueBuffers[offset].xyz += torque1.xyz;
#endif
} }
#else #else
for (int atom1 = get_global_id(0); atom1 < numAtoms; atom1 += get_global_size(0)) { for (int atom1 = get_global_id(0); atom1 < numAtoms; atom1 += get_global_size(0)) {
...@@ -455,31 +437,19 @@ __kernel void computeForce( ...@@ -455,31 +437,19 @@ __kernel void computeForce(
real sigma = data1.sig.x+data2.sig.x; real sigma = data1.sig.x+data2.sig.x;
real epsilon = data1.eps.x*data2.eps.x; real epsilon = data1.eps.x*data2.eps.x;
computeOneInteraction(&data1, &data2, sigma, epsilon, delta, r2, &force1, &force2, &torque1, &torque2, &energy); computeOneInteraction(&data1, &data2, sigma, epsilon, delta, r2, &force1, &force2, &torque1, &torque2, &energy);
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[index2], (long) (force2.x*0x100000000)); atom_add(&forceBuffers[index2], (long) (force2.x*0x100000000));
atom_add(&forceBuffers[index2+PADDED_NUM_ATOMS], (long) (force2.y*0x100000000)); atom_add(&forceBuffers[index2+PADDED_NUM_ATOMS], (long) (force2.y*0x100000000));
atom_add(&forceBuffers[index2+2*PADDED_NUM_ATOMS], (long) (force2.z*0x100000000)); atom_add(&forceBuffers[index2+2*PADDED_NUM_ATOMS], (long) (force2.z*0x100000000));
atom_add(&torqueBuffers[index2], (long) (torque2.x*0x100000000)); atom_add(&torqueBuffers[index2], (long) (torque2.x*0x100000000));
atom_add(&torqueBuffers[index2+PADDED_NUM_ATOMS], (long) (torque2.y*0x100000000)); atom_add(&torqueBuffers[index2+PADDED_NUM_ATOMS], (long) (torque2.y*0x100000000));
atom_add(&torqueBuffers[index2+2*PADDED_NUM_ATOMS], (long) (torque2.z*0x100000000)); atom_add(&torqueBuffers[index2+2*PADDED_NUM_ATOMS], (long) (torque2.z*0x100000000));
#else
unsigned int offset = index2 + warp*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += force2.xyz;
torqueBuffers[offset].xyz += torque2.xyz;
#endif
} }
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[index1], (long) (force1.x*0x100000000)); atom_add(&forceBuffers[index1], (long) (force1.x*0x100000000));
atom_add(&forceBuffers[index1+PADDED_NUM_ATOMS], (long) (force1.y*0x100000000)); atom_add(&forceBuffers[index1+PADDED_NUM_ATOMS], (long) (force1.y*0x100000000));
atom_add(&forceBuffers[index1+2*PADDED_NUM_ATOMS], (long) (force1.z*0x100000000)); atom_add(&forceBuffers[index1+2*PADDED_NUM_ATOMS], (long) (force1.z*0x100000000));
atom_add(&torqueBuffers[index1], (long) (torque1.x*0x100000000)); atom_add(&torqueBuffers[index1], (long) (torque1.x*0x100000000));
atom_add(&torqueBuffers[index1+PADDED_NUM_ATOMS], (long) (torque1.y*0x100000000)); atom_add(&torqueBuffers[index1+PADDED_NUM_ATOMS], (long) (torque1.y*0x100000000));
atom_add(&torqueBuffers[index1+2*PADDED_NUM_ATOMS], (long) (torque1.z*0x100000000)); atom_add(&torqueBuffers[index1+2*PADDED_NUM_ATOMS], (long) (torque1.z*0x100000000));
#else
unsigned int offset = index1 + warp*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += force1.xyz;
torqueBuffers[offset].xyz += torque1.xyz;
#endif
} }
#endif #endif
...@@ -501,7 +471,6 @@ __kernel void computeForce( ...@@ -501,7 +471,6 @@ __kernel void computeForce(
if (r2 < CUTOFF_SQUARED) { if (r2 < CUTOFF_SQUARED) {
#endif #endif
computeOneInteraction(&data1, &data2, params.x, params.y, delta, r2, &force1, &force2, &torque1, &torque2, &energy); computeOneInteraction(&data1, &data2, params.x, params.y, delta, r2, &force1, &force2, &torque1, &torque2, &energy);
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[index1], (long) (force1.x*0x100000000)); atom_add(&forceBuffers[index1], (long) (force1.x*0x100000000));
atom_add(&forceBuffers[index1+PADDED_NUM_ATOMS], (long) (force1.y*0x100000000)); atom_add(&forceBuffers[index1+PADDED_NUM_ATOMS], (long) (force1.y*0x100000000));
atom_add(&forceBuffers[index1+2*PADDED_NUM_ATOMS], (long) (force1.z*0x100000000)); atom_add(&forceBuffers[index1+2*PADDED_NUM_ATOMS], (long) (force1.z*0x100000000));
...@@ -514,14 +483,6 @@ __kernel void computeForce( ...@@ -514,14 +483,6 @@ __kernel void computeForce(
atom_add(&torqueBuffers[index2], (long) (torque2.x*0x100000000)); atom_add(&torqueBuffers[index2], (long) (torque2.x*0x100000000));
atom_add(&torqueBuffers[index2+PADDED_NUM_ATOMS], (long) (torque2.y*0x100000000)); atom_add(&torqueBuffers[index2+PADDED_NUM_ATOMS], (long) (torque2.y*0x100000000));
atom_add(&torqueBuffers[index2+2*PADDED_NUM_ATOMS], (long) (torque2.z*0x100000000)); atom_add(&torqueBuffers[index2+2*PADDED_NUM_ATOMS], (long) (torque2.z*0x100000000));
#else
int offset = index1 + warp*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += force1.xyz;
torqueBuffers[offset].xyz += torque1.xyz;
offset = index2 + warp*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz += force2.xyz;
torqueBuffers[offset].xyz += torque2.xyz;
#endif
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
} }
#endif #endif
...@@ -533,11 +494,7 @@ __kernel void computeForce( ...@@ -533,11 +494,7 @@ __kernel void computeForce(
* Convert the torques to forces on the connected particles. * Convert the torques to forces on the connected particles.
*/ */
__kernel void applyTorques( __kernel void applyTorques(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers, __global long* restrict torqueBuffers, __global long* restrict forceBuffers, __global long* restrict torqueBuffers,
#else
__global real4* restrict forceBuffers, __global real4* restrict torqueBuffers, int numBuffers,
#endif
int numParticles, __global const real4* restrict posq, __global int2* const restrict axisParticleIndices, int numParticles, __global const real4* restrict posq, __global int2* const restrict axisParticleIndices,
__global const int* sortedParticles) { __global const int* sortedParticles) {
const unsigned int warp = get_global_id(0)/TILE_SIZE; const unsigned int warp = get_global_id(0)/TILE_SIZE;
...@@ -548,14 +505,8 @@ __kernel void applyTorques( ...@@ -548,14 +505,8 @@ __kernel void applyTorques(
if (axisParticles.x != -1) { if (axisParticles.x != -1) {
// Load the torque. // Load the torque.
#ifdef SUPPORTS_64_BIT_ATOMICS
real scale = 1/(real) 0x100000000; real scale = 1/(real) 0x100000000;
real3 torque = (real3) (scale*torqueBuffers[originalIndex], scale*torqueBuffers[originalIndex+PADDED_NUM_ATOMS], scale*torqueBuffers[originalIndex+2*PADDED_NUM_ATOMS]); real3 torque = (real3) (scale*torqueBuffers[originalIndex], scale*torqueBuffers[originalIndex+PADDED_NUM_ATOMS], scale*torqueBuffers[originalIndex+2*PADDED_NUM_ATOMS]);
#else
real3 torque = 0;
for (int i = 0; i < numBuffers; i++)
torque += torqueBuffers[originalIndex+i*PADDED_NUM_ATOMS].xyz;
#endif
real3 force = 0, xforce = 0, yforce = 0; real3 force = 0, xforce = 0, yforce = 0;
// Apply a force to the x particle. // Apply a force to the x particle.
...@@ -576,7 +527,6 @@ __kernel void applyTorques( ...@@ -576,7 +527,6 @@ __kernel void applyTorques(
yforce += f; yforce += f;
force -= f; force -= f;
} }
#ifdef SUPPORTS_64_BIT_ATOMICS
atom_add(&forceBuffers[originalIndex], (long) (force.x*0x100000000)); atom_add(&forceBuffers[originalIndex], (long) (force.x*0x100000000));
atom_add(&forceBuffers[originalIndex+PADDED_NUM_ATOMS], (long) (force.y*0x100000000)); atom_add(&forceBuffers[originalIndex+PADDED_NUM_ATOMS], (long) (force.y*0x100000000));
atom_add(&forceBuffers[originalIndex+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000)); atom_add(&forceBuffers[originalIndex+2*PADDED_NUM_ATOMS], (long) (force.z*0x100000000));
...@@ -588,12 +538,6 @@ __kernel void applyTorques( ...@@ -588,12 +538,6 @@ __kernel void applyTorques(
atom_add(&forceBuffers[axisParticles.y+PADDED_NUM_ATOMS], (long) (yforce.y*0x100000000)); atom_add(&forceBuffers[axisParticles.y+PADDED_NUM_ATOMS], (long) (yforce.y*0x100000000));
atom_add(&forceBuffers[axisParticles.y+2*PADDED_NUM_ATOMS], (long) (yforce.z*0x100000000)); atom_add(&forceBuffers[axisParticles.y+2*PADDED_NUM_ATOMS], (long) (yforce.z*0x100000000));
} }
#else
forceBuffers[originalIndex + warp*PADDED_NUM_ATOMS].xyz += force.xyz;
forceBuffers[axisParticles.x + warp*PADDED_NUM_ATOMS].xyz += xforce.xyz;
if (axisParticles.y != -1)
forceBuffers[axisParticles.y + warp*PADDED_NUM_ATOMS].xyz += yforce.xyz;
#endif
} }
} }
} }
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment