Commit e43f14bf authored by Peter Eastman's avatar Peter Eastman
Browse files

Created OpenCL implementation of NonbondedForce that works with non-Nvidia processors

parent 2529b922
...@@ -64,8 +64,12 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex) : time(0.0), ste ...@@ -64,8 +64,12 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex) : time(0.0), ste
throw OpenMMException("The specified OpenCL device is not compatible with OpenMM"); throw OpenMMException("The specified OpenCL device is not compatible with OpenMM");
compilationOptions = "-cl-fast-relaxed-math"; compilationOptions = "-cl-fast-relaxed-math";
string vendor = device.getInfo<CL_DEVICE_VENDOR>(); string vendor = device.getInfo<CL_DEVICE_VENDOR>();
if (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA") if (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA") {
compilationOptions += " -DWARPS_ARE_ATOMIC"; compilationOptions += " -DWARPS_ARE_ATOMIC";
simdWidth = 32;
}
else
simdWidth = 1;
queue = cl::CommandQueue(context, device); queue = cl::CommandQueue(context, device);
numAtoms = numParticles; numAtoms = numParticles;
paddedNumAtoms = TileSize*((numParticles+TileSize-1)/TileSize); paddedNumAtoms = TileSize*((numParticles+TileSize-1)/TileSize);
......
...@@ -251,6 +251,12 @@ public: ...@@ -251,6 +251,12 @@ public:
int getNumForceBuffers() const { int getNumForceBuffers() const {
return numForceBuffers; return numForceBuffers;
} }
/**
* Get the SIMD width of the device being used.
*/
int getSIMDWidth() const {
return simdWidth;
}
/** /**
* Get the OpenCLIntegrationUtilities for this context. * Get the OpenCLIntegrationUtilities for this context.
*/ */
...@@ -281,6 +287,7 @@ private: ...@@ -281,6 +287,7 @@ private:
int numAtomBlocks; int numAtomBlocks;
int numThreadBlocks; int numThreadBlocks;
int numForceBuffers; int numForceBuffers;
int simdWidth;
std::string compilationOptions; std::string compilationOptions;
cl::Context context; cl::Context context;
cl::Device device; cl::Device device;
......
...@@ -346,7 +346,8 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc ...@@ -346,7 +346,8 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
padded << context.getPaddedNumAtoms(); padded << context.getPaddedNumAtoms();
defines["NUM_ATOMS"] = natom.str(); defines["NUM_ATOMS"] = natom.str();
defines["PADDED_NUM_ATOMS"] = padded.str(); defines["PADDED_NUM_ATOMS"] = padded.str();
cl::Program program = context.createProgram(context.loadSourceFromFile("nonbonded.cl", replacements), defines); string filename = (context.getSIMDWidth() == 32 ? "nonbonded_nvidia.cl" : "nonbonded_default.cl");
cl::Program program = context.createProgram(context.loadSourceFromFile(filename, replacements), defines);
cl::Kernel kernel(program, "computeNonbonded"); cl::Kernel kernel(program, "computeNonbonded");
// Set arguments to the Kernel. // Set arguments to the Kernel.
...@@ -358,17 +359,17 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc ...@@ -358,17 +359,17 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
kernel.setArg<cl::Buffer>(4, exclusionIndex->getDeviceBuffer()); kernel.setArg<cl::Buffer>(4, exclusionIndex->getDeviceBuffer());
kernel.setArg(5, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL); kernel.setArg(5, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
kernel.setArg(6, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL); kernel.setArg(6, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
int paramBase = 9; kernel.setArg(7, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
int paramBase = 10;
if (useCutoff) { if (useCutoff) {
paramBase = 11; paramBase = 11;
kernel.setArg<cl::Buffer>(7, interactingTiles->getDeviceBuffer()); kernel.setArg<cl::Buffer>(8, interactingTiles->getDeviceBuffer());
kernel.setArg<cl::Buffer>(8, interactionFlags->getDeviceBuffer()); kernel.setArg<cl::Buffer>(9, interactionFlags->getDeviceBuffer());
kernel.setArg<cl::Buffer>(9, interactionCount->getDeviceBuffer()); kernel.setArg<cl::Buffer>(10, interactionCount->getDeviceBuffer());
kernel.setArg(10, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
} }
else { else {
kernel.setArg<cl::Buffer>(7, tiles->getDeviceBuffer()); kernel.setArg<cl::Buffer>(8, tiles->getDeviceBuffer());
kernel.setArg<cl_uint>(8, tiles->getSize()); kernel.setArg<cl_uint>(9, tiles->getSize());
} }
for (int i = 0; i < (int) params.size(); i++) { for (int i = 0; i < (int) params.size(); i++) {
kernel.setArg<cl::Buffer>(i*2+paramBase, params[i].getBuffer()); kernel.setArg<cl::Buffer>(i*2+paramBase, params[i].getBuffer());
......
const unsigned int TileSize = 32;
/**
* Compute nonbonded interactions.
*/
__kernel void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __local float4* local_posq, __local float4* local_force, __local float4* tempBuffer, __global unsigned int* tiles,
#ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount
#else
unsigned int numTiles
#endif
PARAMETER_ARGUMENTS) {
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
#endif
unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0);
unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF;
while (pos < end) {
// Extract the coordinates of this tile
unsigned int x = tiles[pos];
unsigned int y = ((x >> 2) & 0x7fff)*TileSize;
bool hasExclusions = (x & 0x1);
x = (x>>17)*TileSize;
unsigned int baseLocalAtom = (get_local_id(0) < TileSize ? 0 : TileSize/2);
unsigned int tgx = get_local_id(0) & (TileSize-1);
unsigned int forceBufferOffset = (tgx < TileSize/2 ? 0 : TileSize);
unsigned int atom1 = x + tgx;
float4 force = 0.0f;
float4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
if (x == y) {
// This tile is on the diagonal.
local_posq[get_local_id(0)] = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1
barrier(CLK_LOCAL_MEM_FENCE);
unsigned int xi = x/TileSize;
unsigned int tile = xi+xi*PADDED_NUM_ATOMS/TileSize-xi*(xi+1)/2;
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndices[tile]+tgx] >> baseLocalAtom;
#endif
for (unsigned int j = 0; j < TileSize/2; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = baseLocalAtom+j;
float4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
delta.z -= floor(delta.z/PERIODIC_BOX_SIZE_Z+0.5f)*PERIODIC_BOX_SIZE_Z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float r = sqrt(r2);
float invR = 1.0f/r;
LOAD_ATOM2_PARAMETERS
atom2 = y+baseLocalAtom+j;
float dEdR = 0.0f;
float tempEnergy = 0.0f;
COMPUTE_INTERACTION
energy += 0.5f*tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
excl >>= 1;
}
// Sum the forces and write results.
if (get_local_id(0) >= TileSize)
tempBuffer[get_local_id(0)] = force;
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TileSize) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset = x + tgx + (x/TileSize)*PADDED_NUM_ATOMS;
#else
unsigned int offset = x + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
forceBuffers[offset].xyz += force.xyz+tempBuffer[get_local_id(0)+TileSize].xyz;
}
}
else {
// This is an off-diagonal tile.
if (lasty != y && get_local_id(0) < TileSize) {
unsigned int j = y + tgx;
local_posq[get_local_id(0)] = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
local_force[get_local_id(0)] = 0.0f;
barrier(CLK_LOCAL_MEM_FENCE);
#ifdef USE_CUTOFF
unsigned int flags = interactionFlags[pos];
if (!hasExclusions && flags == 0) {
// No interactions in this tile.
}
else
#endif
{
// Compute the full set of interactions in this tile.
unsigned int xi = x/TileSize;
unsigned int yi = y/TileSize;
unsigned int tile = xi+yi*PADDED_NUM_ATOMS/TileSize-yi*(yi+1)/2;
#ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndices[tile]+tgx] : 0xFFFFFFFF);
excl = (excl >> tgx) | (excl << (TileSize - tgx));
excl >>= baseLocalAtom;
#endif
unsigned int tj = tgx%(TileSize/2);
for (unsigned int j = 0; j < TileSize/2; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = baseLocalAtom+tj;
float4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
delta.z -= floor(delta.z/PERIODIC_BOX_SIZE_Z+0.5f)*PERIODIC_BOX_SIZE_Z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float r = sqrt(r2);
float invR = 1.0f/r;
LOAD_ATOM2_PARAMETERS
atom2 = y+baseLocalAtom+tj;
float dEdR = 0.0f;
float tempEnergy = 0.0f;
COMPUTE_INTERACTION
energy += tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
local_force[baseLocalAtom+tj+forceBufferOffset].xyz += delta.xyz;
barrier(CLK_LOCAL_MEM_FENCE);
excl >>= 1;
tj = (tj+1)%(TileSize/2);
}
}
// Sum the forces and write results.
if (get_local_id(0) >= TileSize)
tempBuffer[get_local_id(0)] = force;
barrier(CLK_LOCAL_MEM_FENCE);
if (get_local_id(0) < TileSize) {
#ifdef USE_OUTPUT_BUFFER_PER_BLOCK
unsigned int offset1 = x + tgx + (y/TileSize)*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + (x/TileSize)*PADDED_NUM_ATOMS;
#else
unsigned int offset1 = x + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
unsigned int offset2 = y + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
#endif
forceBuffers[offset1].xyz += force.xyz+tempBuffer[get_local_id(0)+TileSize].xyz;
forceBuffers[offset2].xyz += local_force[get_local_id(0)].xyz+local_force[get_local_id(0)+TileSize].xyz;
}
lasty = y;
}
pos++;
}
energyBuffer[get_global_id(0)] += energy;
}
...@@ -4,10 +4,10 @@ const unsigned int TileSize = 32; ...@@ -4,10 +4,10 @@ const unsigned int TileSize = 32;
* Compute nonbonded interactions. * Compute nonbonded interactions.
*/ */
__kernel void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __kernel void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
__global unsigned int* exclusions, __global unsigned int* exclusionIndices, __local float4* local_posq, __local float4* local_force, __global unsigned int* tiles, __global unsigned int* exclusionIndices, __local float4* local_posq, __local float4* local_force, __local float4* tempBuffer, __global unsigned int* tiles,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
__global unsigned int* interactionFlags, __global unsigned int* interactionCount, __local float4* tempBuffer __global unsigned int* interactionFlags, __global unsigned int* interactionCount
#else #else
unsigned int numTiles unsigned int numTiles
#endif #endif
...@@ -30,7 +30,6 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en ...@@ -30,7 +30,6 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
x = (x>>17)*TileSize; x = (x>>17)*TileSize;
unsigned int tgx = get_local_id(0) & (TileSize-1); unsigned int tgx = get_local_id(0) & (TileSize-1);
unsigned int tbx = get_local_id(0) - tgx; unsigned int tbx = get_local_id(0) - tgx;
unsigned int tj = tgx;
unsigned int atom1 = x + tgx; unsigned int atom1 = x + tgx;
float4 force = 0.0f; float4 force = 0.0f;
float4 posq1 = posq[atom1]; float4 posq1 = posq[atom1];
...@@ -49,7 +48,9 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en ...@@ -49,7 +48,9 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1); bool isExcluded = !(excl & 0x1);
#endif #endif
float4 delta = (float4) (local_posq[tbx+j].xyz - posq1.xyz, 0.0f); int atom2 = tbx+j;
float4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X; delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y; delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
...@@ -58,8 +59,6 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en ...@@ -58,8 +59,6 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float r = sqrt(r2); float r = sqrt(r2);
float invR = 1.0f/r; float invR = 1.0f/r;
int atom2 = tbx+j;
float4 posq2 = local_posq[atom2];
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y+j; atom2 = y+j;
float dEdR = 0.0f; float dEdR = 0.0f;
...@@ -100,7 +99,9 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en ...@@ -100,7 +99,9 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
for (unsigned int j = 0; j < TileSize; j++) { for (unsigned int j = 0; j < TileSize; j++) {
if ((flags&(1<<j)) != 0) { if ((flags&(1<<j)) != 0) {
bool isExcluded = false; bool isExcluded = false;
float4 delta = (float4) (local_posq[tbx+j].xyz - posq1.xyz, 0.0f); int atom2 = tbx+j;
float4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X; delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y; delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
...@@ -109,8 +110,6 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en ...@@ -109,8 +110,6 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float r = sqrt(r2); float r = sqrt(r2);
float invR = 1.0f/r; float invR = 1.0f/r;
int atom2 = tbx+j;
float4 posq2 = local_posq[atom2];
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y+j; atom2 = y+j;
float dEdR = 0.0f; float dEdR = 0.0f;
...@@ -149,11 +148,14 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en ...@@ -149,11 +148,14 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
unsigned int excl = (hasExclusions ? exclusions[exclusionIndices[tile]+tgx] : 0xFFFFFFFF); unsigned int excl = (hasExclusions ? exclusions[exclusionIndices[tile]+tgx] : 0xFFFFFFFF);
excl = (excl >> tgx) | (excl << (TileSize - tgx)); excl = (excl >> tgx) | (excl << (TileSize - tgx));
#endif #endif
unsigned int tj = tgx;
for (unsigned int j = 0; j < TileSize; j++) { for (unsigned int j = 0; j < TileSize; j++) {
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1); bool isExcluded = !(excl & 0x1);
#endif #endif
float4 delta = (float4) (local_posq[tbx+tj].xyz - posq1.xyz, 0.0f); int atom2 = tbx+tj;
float4 posq2 = local_posq[atom2];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X; delta.x -= floor(delta.x/PERIODIC_BOX_SIZE_X+0.5f)*PERIODIC_BOX_SIZE_X;
delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y; delta.y -= floor(delta.y/PERIODIC_BOX_SIZE_Y+0.5f)*PERIODIC_BOX_SIZE_Y;
...@@ -162,8 +164,6 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en ...@@ -162,8 +164,6 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float r = sqrt(r2); float r = sqrt(r2);
float invR = 1.0f/r; float invR = 1.0f/r;
int atom2 = tbx+tj;
float4 posq2 = local_posq[atom2];
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y+tj; atom2 = y+tj;
float dEdR = 0.0f; float dEdR = 0.0f;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment