Commit ec3ee586 authored by Peter Eastman's avatar Peter Eastman
Browse files

Implemented faster version of charge spreading on GPUs that support 64 bit atomics

parent 548efdfb
......@@ -48,6 +48,9 @@ using namespace std;
#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
#endif
#ifndef CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV
#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
#endif
const int OpenCLContext::ThreadBlockSize = 64;
const int OpenCLContext::TileSize = 32;
......@@ -90,10 +93,20 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex, OpenCLPlatform::
throw OpenMMException("The specified OpenCL device is not compatible with OpenMM");
compilationOptions = "-DWORK_GROUP_SIZE="+OpenCLExpressionUtilities::intToString(ThreadBlockSize);
defaultOptimizationOptions = "-cl-fast-relaxed-math";
supports64BitGlobalAtomics = (device.getInfo<CL_DEVICE_EXTENSIONS>().find("cl_khr_int64_base_atomics") != string::npos);
string vendor = device.getInfo<CL_DEVICE_VENDOR>();
if (vendor.size() >= 6 && vendor.substr(0, 6) == "NVIDIA") {
compilationOptions += " -DWARPS_ARE_ATOMIC";
simdWidth = 32;
// Compute level 1.2 and later Nvidia GPUs support 64 bit atomics, even though they don't list the
// proper extension as supported.
cl_uint computeCapabilityMajor, computeCapabilityMinor;
clGetDeviceInfo(device(), CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, sizeof(cl_uint), &computeCapabilityMajor, NULL);
clGetDeviceInfo(device(), CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, sizeof(cl_uint), &computeCapabilityMinor, NULL);
if (computeCapabilityMajor > 1 || computeCapabilityMinor > 1)
supports64BitGlobalAtomics = true;
}
else if (vendor.size() >= 28 && vendor.substr(0, 28) == "Advanced Micro Devices, Inc.") {
// AMD APP SDK 2.4 has a performance problem with atomics. Enable the work around.
......@@ -104,6 +117,8 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex, OpenCLPlatform::
}
else
simdWidth = 1;
if (supports64BitGlobalAtomics)
compilationOptions += " -DSUPPORTS_64_BIT_ATOMICS";
queue = cl::CommandQueue(context, device);
numAtoms = numParticles;
paddedNumAtoms = TileSize*((numParticles+TileSize-1)/TileSize);
......
......@@ -399,6 +399,12 @@ public:
int getSIMDWidth() const {
return simdWidth;
}
/**
* Get whether the device being used supports 64 bit atomic operations on global memory.
*/
bool getSupports64BitGlobalAtomics() {
return supports64BitGlobalAtomics;
}
/**
* Get the size of the periodic box.
*/
......@@ -458,6 +464,7 @@ private:
int numThreadBlocks;
int numForceBuffers;
int simdWidth;
bool supports64BitGlobalAtomics;
mm_float4 periodicBoxSize;
mm_float4 invPeriodicBoxSize;
std::string compilationOptions, defaultOptimizationOptions;
......
......@@ -1445,6 +1445,10 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeInterpolateForceKernel.setArg<cl::Buffer>(2, pmeBsplineTheta->getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(3, pmeBsplineDtheta->getDeviceBuffer());
pmeInterpolateForceKernel.setArg<cl::Buffer>(4, pmeGrid->getDeviceBuffer());
if (cl.getSupports64BitGlobalAtomics()) {
pmeFinishSpreadChargeKernel = cl::Kernel(program, "finishSpreadCharge");
pmeFinishSpreadChargeKernel.setArg<cl::Buffer>(0, pmeGrid->getDeviceBuffer());
}
}
}
if (exceptionIndices != NULL)
......@@ -1476,7 +1480,15 @@ double OpenCLCalcNonbondedForceKernel::execute(ContextImpl& context, bool includ
pmeAtomRangeKernel.setArg<mm_float4>(3, boxSize);
pmeAtomRangeKernel.setArg<mm_float4>(4, invBoxSize);
cl.executeKernel(pmeAtomRangeKernel, cl.getNumAtoms());
cl.executeKernel(pmeSpreadChargeKernel, cl.getNumAtoms());
if (cl.getSupports64BitGlobalAtomics()) {
cl.clearBuffer(pmeGrid->getDeviceBuffer(), pmeGrid->getSize()*2);
pmeSpreadChargeKernel.setArg<mm_float4>(5, boxSize);
pmeSpreadChargeKernel.setArg<mm_float4>(6, invBoxSize);
cl.executeKernel(pmeSpreadChargeKernel, cl.getNumAtoms(), PmeOrder*PmeOrder*PmeOrder);
cl.executeKernel(pmeFinishSpreadChargeKernel, pmeGrid->getSize());
}
else
cl.executeKernel(pmeSpreadChargeKernel, cl.getNumAtoms());
}
fft->execFFT(*pmeGrid, true);
pmeConvolutionKernel.setArg<mm_float4>(5, invBoxSize);
......
......@@ -521,6 +521,7 @@ private:
cl::Kernel pmeAtomRangeKernel;
cl::Kernel pmeUpdateBsplinesKernel;
cl::Kernel pmeSpreadChargeKernel;
cl::Kernel pmeFinishSpreadChargeKernel;
cl::Kernel pmeConvolutionKernel;
cl::Kernel pmeInterpolateForceKernel;
std::map<std::string, std::string> pmeDefines;
......
......@@ -78,6 +78,44 @@ __kernel void findAtomRangeForGrid(__global int2* pmeAtomGridIndex, __global int
}
}
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
__kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGridIndex, __global int* pmeAtomRange, __global long* pmeGrid, __global float4* pmeBsplineTheta, float4 periodicBoxSize, float4 invPeriodicBoxSize) {
int ix = get_local_id(0)/(PME_ORDER*PME_ORDER);
int remainder = get_local_id(0)-ix*PME_ORDER*PME_ORDER;
int iy = remainder/PME_ORDER;
int iz = remainder-iy*PME_ORDER;
if (ix < PME_ORDER) {
for (int atomIndex = get_group_id(0); atomIndex < NUM_ATOMS; atomIndex += get_num_groups(0)) {
float4 pos = posq[atomIndex];
float atomCharge = pos.w;
float add = atomCharge*pmeBsplineTheta[atomIndex+ix*NUM_ATOMS].x*pmeBsplineTheta[atomIndex+iy*NUM_ATOMS].y*pmeBsplineTheta[atomIndex+iz*NUM_ATOMS].z;
pos.x -= floor(pos.x*invPeriodicBoxSize.x)*periodicBoxSize.x;
pos.y -= floor(pos.y*invPeriodicBoxSize.y)*periodicBoxSize.y;
pos.z -= floor(pos.z*invPeriodicBoxSize.z)*periodicBoxSize.z;
int x = (int) ((pos.x*invPeriodicBoxSize.x)*GRID_SIZE_X)+ix;
int y = (int) ((pos.y*invPeriodicBoxSize.y)*GRID_SIZE_Y)+iy;
int z = (int) ((pos.z*invPeriodicBoxSize.z)*GRID_SIZE_Z)+iz;
x -= (x >= GRID_SIZE_X ? GRID_SIZE_X : 0);
y -= (y >= GRID_SIZE_Y ? GRID_SIZE_Y : 0);
z -= (z >= GRID_SIZE_Z ? GRID_SIZE_Z : 0);
atom_add(&pmeGrid[x*GRID_SIZE_Y*GRID_SIZE_Z+y*GRID_SIZE_Z+z], (long) (add*0xFFFFFFFF));
}
}
}
__kernel void finishSpreadCharge(__global long* pmeGrid) {
__global float2* floatGrid = (__global float2*) pmeGrid;
const unsigned int gridSize = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
float scale = EPSILON_FACTOR/(float) 0xFFFFFFFF;
for (int index = get_global_id(0); index < gridSize; index += get_global_size(0)) {
long value = pmeGrid[index];
float2 floatValue = (float2) ((float) (value*scale), 0.0f);
floatGrid[index] = floatValue;
}
}
#else
__kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGridIndex, __global int* pmeAtomRange, __global float2* pmeGrid, __global float4* pmeBsplineTheta) {
unsigned int numGridPoints = GRID_SIZE_X*GRID_SIZE_Y*GRID_SIZE_Z;
for (int gridIndex = get_global_id(0); gridIndex < numGridPoints; gridIndex += get_global_size(0)) {
......@@ -91,7 +129,7 @@ __kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGrid
float result = 0.0f;
// Loop over all atoms that affect this grid point.
for (int ix = 0; ix < PME_ORDER; ++ix) {
int x = gridPoint.x-ix+(gridPoint.x >= ix ? 0 : GRID_SIZE_X);
for (int iy = 0; iy < PME_ORDER; ++iy) {
......@@ -133,6 +171,7 @@ __kernel void gridSpreadCharge(__global float4* posq, __global int2* pmeAtomGrid
pmeGrid[gridIndex] = (float2) (result*EPSILON_FACTOR, 0.0f);
}
}
#endif
__kernel void reciprocalConvolution(__global float2* pmeGrid, __global float* energyBuffer, __global float* pmeBsplineModuliX,
__global float* pmeBsplineModuliY, __global float* pmeBsplineModuliZ, float4 invPeriodicBoxSize, float recipScaleFactor) {
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment