Commit d9029f61 authored by Peter Eastman's avatar Peter Eastman
Browse files

On Fermi, store forces as 64 bit fixed point and update with atomic...

On Fermi, store forces as 64 bit fixed point and update with atomic operations.  Also various other minor optimizations.
parent 8ade176c
......@@ -61,7 +61,8 @@ static void CL_CALLBACK errorCallback(const char* errinfo, const void* private_i
OpenCLContext::OpenCLContext(int numParticles, int deviceIndex, OpenCLPlatform::PlatformData& platformData) :
time(0.0), platformData(platformData), stepCount(0), computeForceCount(0), posq(NULL), velm(NULL),
forceBuffers(NULL), energyBuffer(NULL), atomIndex(NULL), integration(NULL), nonbonded(NULL), thread(NULL) {
forceBuffers(NULL), longForceBuffer(NULL), energyBuffer(NULL), atomIndex(NULL), integration(NULL),
nonbonded(NULL), thread(NULL) {
try {
contextIndex = platformData.contexts.size();
std::vector<cl::Platform> platforms;
......@@ -148,6 +149,7 @@ OpenCLContext::OpenCLContext(int numParticles, int deviceIndex, OpenCLPlatform::
clearThreeBuffersKernel = cl::Kernel(utilities, "clearThreeBuffers");
clearFourBuffersKernel = cl::Kernel(utilities, "clearFourBuffers");
reduceFloat4Kernel = cl::Kernel(utilities, "reduceFloat4Buffer");
reduceForcesKernel = cl::Kernel(utilities, "reduceForces");
// Decide whether native_sqrt(), native_rsqrt(), and native_recip() are sufficiently accurate to use.
......@@ -195,6 +197,8 @@ OpenCLContext::~OpenCLContext() {
delete force;
if (forceBuffers != NULL)
delete forceBuffers;
if (longForceBuffer != NULL)
delete longForceBuffer;
if (energyBuffer != NULL)
delete energyBuffer;
if (atomIndex != NULL)
......@@ -215,6 +219,14 @@ void OpenCLContext::initialize(const System& system) {
for (int i = 0; i < (int) forces.size(); i++)
numForceBuffers = std::max(numForceBuffers, forces[i]->getRequiredForceBuffers());
forceBuffers = new OpenCLArray<mm_float4>(*this, paddedNumAtoms*numForceBuffers, "forceBuffers", false);
if (supports64BitGlobalAtomics) {
longForceBuffer = new OpenCLArray<cl_long>(*this, 3*paddedNumAtoms, "longForceBuffer", false);
reduceForcesKernel.setArg<cl::Buffer>(0, longForceBuffer->getDeviceBuffer());
reduceForcesKernel.setArg<cl::Buffer>(1, forceBuffers->getDeviceBuffer());
reduceForcesKernel.setArg<cl_int>(2, paddedNumAtoms);
reduceForcesKernel.setArg<cl_int>(3, numForceBuffers);
addAutoclearBuffer(longForceBuffer->getDeviceBuffer(), longForceBuffer->getSize()*2);
}
addAutoclearBuffer(forceBuffers->getDeviceBuffer(), forceBuffers->getSize()*4);
force = new OpenCLArray<mm_float4>(*this, &forceBuffers->getDeviceBuffer(), paddedNumAtoms, "force", true);
energyBuffer = new OpenCLArray<cl_float>(*this, max(numThreadBlocks*ThreadBlockSize, nonbonded->getNumEnergyBuffers()), "energyBuffer", true);
......@@ -328,7 +340,7 @@ void OpenCLContext::clearBuffer(OpenCLArray<mm_float4>& array) {
void OpenCLContext::clearBuffer(cl::Memory& memory, int size) {
clearBufferKernel.setArg<cl::Memory>(0, memory);
clearBufferKernel.setArg<cl_int>(1, size);
executeKernel(clearBufferKernel, size);
executeKernel(clearBufferKernel, size, 128);
}
void OpenCLContext::addAutoclearBuffer(cl::Memory& memory, int size) {
......@@ -348,7 +360,7 @@ void OpenCLContext::clearAutoclearBuffers() {
clearFourBuffersKernel.setArg<cl_int>(5, autoclearBufferSizes[base+2]);
clearFourBuffersKernel.setArg<cl::Memory>(6, *autoclearBuffers[base+3]);
clearFourBuffersKernel.setArg<cl_int>(7, autoclearBufferSizes[base+3]);
executeKernel(clearFourBuffersKernel, max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]));
executeKernel(clearFourBuffersKernel, max(max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), autoclearBufferSizes[base+3]), 128);
base += 4;
}
if (total-base == 3) {
......@@ -358,26 +370,33 @@ void OpenCLContext::clearAutoclearBuffers() {
clearThreeBuffersKernel.setArg<cl_int>(3, autoclearBufferSizes[base+1]);
clearThreeBuffersKernel.setArg<cl::Memory>(4, *autoclearBuffers[base+2]);
clearThreeBuffersKernel.setArg<cl_int>(5, autoclearBufferSizes[base+2]);
executeKernel(clearThreeBuffersKernel, max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]));
executeKernel(clearThreeBuffersKernel, max(max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), autoclearBufferSizes[base+2]), 128);
}
else if (total-base == 2) {
clearTwoBuffersKernel.setArg<cl::Memory>(0, *autoclearBuffers[base]);
clearTwoBuffersKernel.setArg<cl_int>(1, autoclearBufferSizes[base]);
clearTwoBuffersKernel.setArg<cl::Memory>(2, *autoclearBuffers[base+1]);
clearTwoBuffersKernel.setArg<cl_int>(3, autoclearBufferSizes[base+1]);
executeKernel(clearTwoBuffersKernel, max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]));
executeKernel(clearTwoBuffersKernel, max(autoclearBufferSizes[base], autoclearBufferSizes[base+1]), 128);
}
else if (total-base == 1) {
clearBuffer(*autoclearBuffers[base], autoclearBufferSizes[base]);
}
}
void OpenCLContext::reduceForces() {
if (supports64BitGlobalAtomics)
executeKernel(reduceForcesKernel, paddedNumAtoms, 128);
else
reduceBuffer(*forceBuffers, numForceBuffers);
}
void OpenCLContext::reduceBuffer(OpenCLArray<mm_float4>& array, int numBuffers) {
int bufferSize = array.getSize()/numBuffers;
reduceFloat4Kernel.setArg<cl::Buffer>(0, array.getDeviceBuffer());
reduceFloat4Kernel.setArg<cl_int>(1, bufferSize);
reduceFloat4Kernel.setArg<cl_int>(2, numBuffers);
executeKernel(reduceFloat4Kernel, bufferSize);
executeKernel(reduceFloat4Kernel, bufferSize, 128);
}
void OpenCLContext::tagAtomsInMolecule(int atom, int molecule, vector<int>& atomMolecule, vector<vector<int> >& atomBonds) {
......
......@@ -215,6 +215,12 @@ public:
OpenCLArray<mm_float4>& getForceBuffers() {
return *forceBuffers;
}
/**
* Get the array which contains a contribution to each force represented as 64 bit fixed point.
*/
OpenCLArray<cl_long>& getLongForceBuffer() {
return *longForceBuffer;
}
/**
* Get the array which contains the buffer in which energy is computed.
*/
......@@ -311,6 +317,10 @@ public:
* @param numBuffers the number of buffers packed into the array
*/
void reduceBuffer(OpenCLArray<mm_float4>& array, int numBuffers);
/**
* Sum the buffesr containing forces.
*/
void reduceForces();
/**
* Get the current simulation time.
*/
......@@ -463,6 +473,7 @@ private:
cl::Kernel clearThreeBuffersKernel;
cl::Kernel clearFourBuffersKernel;
cl::Kernel reduceFloat4Kernel;
cl::Kernel reduceForcesKernel;
std::vector<OpenCLForceInfo*> forces;
std::vector<MoleculeGroup> moleculeGroups;
std::vector<mm_int4> posCellOffsets;
......@@ -470,6 +481,7 @@ private:
OpenCLArray<mm_float4>* velm;
OpenCLArray<mm_float4>* force;
OpenCLArray<mm_float4>* forceBuffers;
OpenCLArray<cl_long>* longForceBuffer;
OpenCLArray<cl_float>* energyBuffer;
OpenCLArray<cl_int>* atomIndex;
std::vector<cl::Memory*> autoclearBuffers;
......
This diff is collapsed.
......@@ -572,7 +572,8 @@ private:
class OpenCLCalcGBSAOBCForceKernel : public CalcGBSAOBCForceKernel {
public:
OpenCLCalcGBSAOBCForceKernel(std::string name, const Platform& platform, OpenCLContext& cl) : CalcGBSAOBCForceKernel(name, platform), cl(cl),
hasCreatedKernels(false), params(NULL), bornSum(NULL), bornRadii(NULL), bornForce(NULL), obcChain(NULL) {
hasCreatedKernels(false), params(NULL), bornSum(NULL), longBornSum(NULL), bornRadii(NULL), bornForce(NULL),
longBornForce(NULL), obcChain(NULL) {
}
~OpenCLCalcGBSAOBCForceKernel();
/**
......@@ -598,8 +599,10 @@ private:
OpenCLContext& cl;
OpenCLArray<mm_float2>* params;
OpenCLArray<cl_float>* bornSum;
OpenCLArray<cl_long>* longBornSum;
OpenCLArray<cl_float>* bornRadii;
OpenCLArray<cl_float>* bornForce;
OpenCLArray<cl_long>* longBornForce;
OpenCLArray<cl_float>* obcChain;
cl::Kernel computeBornSumKernel;
cl::Kernel reduceBornSumKernel;
......@@ -613,8 +616,8 @@ private:
class OpenCLCalcCustomGBForceKernel : public CalcCustomGBForceKernel {
public:
OpenCLCalcCustomGBForceKernel(std::string name, const Platform& platform, OpenCLContext& cl, System& system) : CalcCustomGBForceKernel(name, platform),
hasInitializedKernels(false), cl(cl), params(NULL), computedValues(NULL), energyDerivs(NULL), globals(NULL), valueBuffers(NULL),
tabulatedFunctionParams(NULL), system(system) {
hasInitializedKernels(false), cl(cl), params(NULL), computedValues(NULL), energyDerivs(NULL), longEnergyDerivs(NULL), globals(NULL),
valueBuffers(NULL), longValueBuffers(NULL), tabulatedFunctionParams(NULL), system(system) {
}
~OpenCLCalcCustomGBForceKernel();
/**
......@@ -635,13 +638,15 @@ public:
double execute(ContextImpl& context, bool includeForces, bool includeEnergy);
private:
bool hasInitializedKernels, needParameterGradient;
int maxTiles;
int maxTiles, numComputedValues;
OpenCLContext& cl;
OpenCLParameterSet* params;
OpenCLParameterSet* computedValues;
OpenCLParameterSet* energyDerivs;
OpenCLArray<cl_long>* longEnergyDerivs;
OpenCLArray<cl_float>* globals;
OpenCLArray<cl_float>* valueBuffers;
OpenCLArray<cl_long>* longValueBuffers;
OpenCLArray<mm_float4>* tabulatedFunctionParams;
std::vector<std::string> globalParamNames;
std::vector<cl_float> globalParamValues;
......
......@@ -48,9 +48,16 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
numForceBuffers = numForceThreadBlocks;
}
else if (context.getSIMDWidth() == 32) {
numForceThreadBlocks = 4*context.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
forceThreadBlockSize = 128;
numForceBuffers = numForceThreadBlocks;
if (context.getSupports64BitGlobalAtomics()) {
numForceThreadBlocks = 4*context.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
forceThreadBlockSize = 256;
numForceBuffers = 2;
}
else {
numForceThreadBlocks = 4*context.getDevice().getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
forceThreadBlockSize = 128;
numForceBuffers = numForceThreadBlocks;
}
}
else {
numForceThreadBlocks = context.getNumThreadBlocks();
......@@ -282,7 +289,7 @@ void OpenCLNonbondedUtilities::initialize(const System& system) {
findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(6, blockBoundingBox->getDeviceBuffer());
findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(7, interactionFlags->getDeviceBuffer());
findInteractionsWithinBlocksKernel.setArg<cl::Buffer>(8, interactionCount->getDeviceBuffer());
findInteractionsWithinBlocksKernel.setArg(9, OpenCLContext::ThreadBlockSize*sizeof(cl_uint), NULL);
findInteractionsWithinBlocksKernel.setArg(9, 128*sizeof(cl_uint), NULL);
findInteractionsWithinBlocksKernel.setArg<cl_uint>(10, interactingTiles->getSize());
}
}
......@@ -312,7 +319,7 @@ void OpenCLNonbondedUtilities::prepareInteractions() {
if (context.getSIMDWidth() == 32 && !deviceIsCpu) {
findInteractionsWithinBlocksKernel.setArg<mm_float4>(1, context.getPeriodicBoxSize());
findInteractionsWithinBlocksKernel.setArg<mm_float4>(2, context.getInvPeriodicBoxSize());
context.executeKernel(findInteractionsWithinBlocksKernel, context.getNumAtoms());
context.executeKernel(findInteractionsWithinBlocksKernel, context.getNumAtoms(), 128);
}
}
......@@ -492,7 +499,10 @@ cl::Kernel OpenCLNonbondedUtilities::createInteractionKernel(const string& sourc
// Set arguments to the Kernel.
int index = 0;
kernel.setArg<cl::Buffer>(index++, context.getForceBuffers().getDeviceBuffer());
if (context.getSupports64BitGlobalAtomics())
kernel.setArg<cl::Memory>(index++, context.getLongForceBuffer().getDeviceBuffer());
else
kernel.setArg<cl::Buffer>(index++, context.getForceBuffers().getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, context.getEnergyBuffer().getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, context.getPosq().getDeviceBuffer());
kernel.setArg<cl::Buffer>(index++, exclusions->getDeviceBuffer());
......
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#define TILE_SIZE 32
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#define STORE_DERIVATIVE_1(INDEX) atom_add(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (long) (deriv##INDEX##_1*0xFFFFFFFF));
#define STORE_DERIVATIVE_2(INDEX) atom_add(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], (long) (local_deriv##INDEX[get_local_id(0)]*0xFFFFFFFF));
#else
#define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset] += deriv##INDEX##_1;
#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset] += local_deriv##INDEX[get_local_id(0)];
#endif
#define TILE_SIZE 32
/**
* Compute a force based on pair interactions.
*/
__kernel void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer, __local float4* local_force,
__kernel void computeN2Energy(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* forceBuffers,
#else
__global float4* forceBuffers,
#endif
__global float* energyBuffer, __local float4* local_force,
__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices,
__global unsigned int* exclusionRowIndices, __local float4* tempBuffer,
#ifdef USE_CUTOFF
......@@ -194,6 +206,22 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene
// Write results. We need to coordinate between warps to make sure no two of them
// ever try to write to the same piece of memory at the same time.
#ifdef SUPPORTS_64_BIT_ATOMICS
if (pos < end) {
const unsigned int offset = x*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset], (long) (force.x*0xFFFFFFFF));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0xFFFFFFFF));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0xFFFFFFFF));
STORE_DERIVATIVES_1
}
if (pos < end && x != y) {
const unsigned int offset = y*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset], (long) (local_force[get_local_id(0)].x*0xFFFFFFFF));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (local_force[get_local_id(0)].y*0xFFFFFFFF));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (local_force[get_local_id(0)].z*0xFFFFFFFF));
STORE_DERIVATIVES_2
}
#else
int writeX = (pos < end ? x : -1);
int writeY = (pos < end && x != y ? y : -1);
if (tgx == 0)
......@@ -245,6 +273,7 @@ __kernel void computeN2Energy(__global float4* forceBuffers, __global float* ene
}
}
}
#endif
pos++;
} while (pos < end);
energyBuffer[get_global_id(0)] += energy;
......
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif
#define TILE_SIZE 32
/**
* Compute a value based on pair interactions.
*/
__kernel void computeN2Value(__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __global float* global_value, __local float* local_value,
__local float* tempBuffer,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices,
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* global_value,
#else
__global float* global_value,
#endif
__local float* local_value, __local float* tempBuffer,
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
#else
......@@ -164,16 +172,10 @@ __kernel void computeN2Value(__global float4* posq, __local float4* local_posq,
// Sum the forces on atom2.
if (tgx % 2 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+1];
if (tgx % 4 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+2];
if (tgx % 8 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+4];
if (tgx % 16 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+8];
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+1]+tempBuffer[get_local_id(0)+2]+tempBuffer[get_local_id(0)+3];
if (tgx == 0)
local_value[tbx+j] += tempBuffer[get_local_id(0)] + tempBuffer[get_local_id(0)+16];
local_value[tbx+j] += tempBuffer[get_local_id(0)]+tempBuffer[get_local_id(0)+4]+tempBuffer[get_local_id(0)+8]+tempBuffer[get_local_id(0)+12]+tempBuffer[get_local_id(0)+16]+tempBuffer[get_local_id(0)+20]+tempBuffer[get_local_id(0)+24]+tempBuffer[get_local_id(0)+28];
}
}
}
......@@ -233,6 +235,16 @@ __kernel void computeN2Value(__global float4* posq, __local float4* local_posq,
// Write results. We need to coordinate between warps to make sure no two of them
// ever try to write to the same piece of memory at the same time.
#ifdef SUPPORTS_64_BIT_ATOMICS
if (pos < end) {
const unsigned int offset = x*TILE_SIZE + tgx;
atom_add(&global_value[offset], (long) (value*0xFFFFFFFF));
}
if (pos < end && x != y) {
const unsigned int offset = y*TILE_SIZE + tgx;
atom_add(&global_value[offset], (long) (local_value[get_local_id(0)]*0xFFFFFFFF));
}
#else
int writeX = (pos < end ? x : -1);
int writeY = (pos < end && x != y ? y : -1);
if (tgx == 0)
......@@ -282,6 +294,7 @@ __kernel void computeN2Value(__global float4* posq, __local float4* local_posq,
}
}
}
#endif
lasty = y;
pos++;
} while (pos < end);
......
......@@ -2,17 +2,26 @@
* Reduce a pairwise computed value, and compute per-particle values.
*/
__kernel void computePerParticleValues(int bufferSize, int numBuffers, __global float* valueBuffers, __global float4* posq
__kernel void computePerParticleValues(int bufferSize, int numBuffers, __global float4* posq,
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* valueBuffers
#else
__global float* valueBuffers
#endif
PARAMETER_ARGUMENTS) {
unsigned int index = get_global_id(0);
while (index < NUM_ATOMS) {
// Reduce the pairwise value
#ifdef SUPPORTS_64_BIT_ATOMICS
float sum = (1.0f/0xFFFFFFFF)*valueBuffers[index];
#else
int totalSize = bufferSize*numBuffers;
float sum = valueBuffers[index];
for (int i = index+bufferSize; i < totalSize; i += bufferSize)
sum += valueBuffers[i];
#endif
// Now calculate other values
float4 pos = posq[index];
......
......@@ -267,9 +267,9 @@ __kernel void findInteractionsWithinBlocks(float cutoffSquared, float4 periodicB
float4 delta = apos-center;
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
delta = max((float4) 0.0f, fabs(delta)-boxSize);
int thread = get_local_id(0);
......@@ -278,31 +278,16 @@ __kernel void findInteractionsWithinBlocks(float cutoffSquared, float4 periodicB
// Sum the flags.
#ifdef WARPS_ARE_ATOMIC
if (index % 2 == 0)
flags[thread] += flags[thread+1];
if (index % 4 == 0)
flags[thread] += flags[thread+2];
if (index % 8 == 0)
flags[thread] += flags[thread+4];
if (index % 16 == 0)
flags[thread] += flags[thread+8];
flags[thread] += flags[thread+1]+flags[thread+2]+flags[thread+3];
#else
barrier(CLK_LOCAL_MEM_FENCE);
if (index % 2 == 0)
flags[thread] += flags[thread+1];
barrier(CLK_LOCAL_MEM_FENCE);
if (index % 4 == 0)
flags[thread] += flags[thread+2];
barrier(CLK_LOCAL_MEM_FENCE);
if (index % 8 == 0)
flags[thread] += flags[thread+4];
barrier(CLK_LOCAL_MEM_FENCE);
if (index % 16 == 0)
flags[thread] += flags[thread+8];
flags[thread] += flags[thread+1]+flags[thread+2]+flags[thread+3];
barrier(CLK_LOCAL_MEM_FENCE);
#endif
if (index == 0) {
unsigned int allFlags = flags[thread] + flags[thread+16];
unsigned int allFlags = flags[thread]+flags[thread+4]+flags[thread+8]+flags[thread+12]+flags[thread+16]+flags[thread+20]+flags[thread+24]+flags[thread+28];
// Count how many flags are set, and based on that decide whether to compute all interactions
// or only a fraction of them.
......
......@@ -7,15 +7,24 @@
*/
__kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float beta, float gamma,
__global float* bornSum, __global float2* params, __global float* bornRadii, __global float* obcChain) {
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* bornSum,
#else
__global float* bornSum,
#endif
__global float2* params, __global float* bornRadii, __global float* obcChain) {
unsigned int index = get_global_id(0);
while (index < NUM_ATOMS) {
// Get summed Born data
int totalSize = bufferSize*numBuffers;
#ifdef SUPPORTS_64_BIT_ATOMICS
float sum = (1.0f/0xFFFFFFFF)*bornSum[index];
#else
float sum = bornSum[index];
for (int i = index+bufferSize; i < totalSize; i += bufferSize)
sum += bornSum[i];
#endif
// Now calculate Born radius and OBC term.
......@@ -38,18 +47,24 @@ __kernel void reduceBornSum(int bufferSize, int numBuffers, float alpha, float b
* Reduce the Born force.
*/
__kernel void reduceBornForce(int bufferSize, int numBuffers, __global float* bornForce, __global float* energyBuffer,
__global float2* params, __global float* bornRadii, __global float* obcChain) {
__kernel void reduceBornForce(int bufferSize, int numBuffers, __global float* bornForce,
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* bornForceIn,
#endif
__global float* energyBuffer, __global float2* params, __global float* bornRadii, __global float* obcChain) {
float energy = 0.0f;
unsigned int index = get_global_id(0);
while (index < NUM_ATOMS) {
// Sum the Born force
int totalSize = bufferSize*numBuffers;
#ifdef SUPPORTS_64_BIT_ATOMICS
float force = (1.0f/0xFFFFFFFF)*bornForceIn[index];
#else
float force = bornForce[index];
for (int i = index+bufferSize; i < totalSize; i += bufferSize)
force += bornForce[i];
#endif
// Now calculate the actual force
float offsetRadius = params[index].x;
......
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif
#define TILE_SIZE 32
typedef struct {
......@@ -14,7 +17,13 @@ typedef struct {
/**
* Compute the Born sum.
*/
__kernel void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params,
__kernel void computeBornSum(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* global_bornSum,
#else
__global float* global_bornSum,
#endif
__global float4* posq, __global float2* global_params,
__local AtomData* localData, __local float* tempBuffer,
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) {
......@@ -174,16 +183,10 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
// Sum the forces on atom j.
if (tgx % 2 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+1];
if (tgx % 4 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+2];
if (tgx % 8 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+4];
if (tgx % 16 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+8];
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+1]+tempBuffer[get_local_id(0)+2]+tempBuffer[get_local_id(0)+3];
if (tgx == 0)
localData[tbx+j].bornSum += tempBuffer[get_local_id(0)] + tempBuffer[get_local_id(0)+16];
localData[tbx+j].bornSum += tempBuffer[get_local_id(0)]+tempBuffer[get_local_id(0)+4]+tempBuffer[get_local_id(0)+8]+tempBuffer[get_local_id(0)+12]+tempBuffer[get_local_id(0)+16]+tempBuffer[get_local_id(0)+20]+tempBuffer[get_local_id(0)+24]+tempBuffer[get_local_id(0)+28];
}
}
}
......@@ -245,6 +248,16 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
// Write results. We need to coordinate between warps to make sure no two of them
// ever try to write to the same piece of memory at the same time.
#ifdef SUPPORTS_64_BIT_ATOMICS
if (pos < end) {
const unsigned int offset = x*TILE_SIZE + tgx;
atom_add(&global_bornSum[offset], (long) (bornSum*0xFFFFFFFF));
}
if (pos < end && x != y) {
const unsigned int offset = y*TILE_SIZE + tgx;
atom_add(&global_bornSum[offset], (long) (localData[get_local_id(0)].bornSum*0xFFFFFFFF));
}
#else
int writeX = (pos < end ? x : -1);
int writeY = (pos < end && x != y ? y : -1);
if (tgx == 0)
......@@ -294,6 +307,7 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
}
}
}
#endif
lasty = y;
pos++;
} while (pos < end);
......@@ -303,8 +317,13 @@ __kernel void computeBornSum(__global float* global_bornSum, __global float4* po
* First part of computing the GBSA interaction.
*/
__kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
__global float4* posq, __global float* global_bornRadii, __global float* global_bornForce,
__kernel void computeGBSAForce1(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* forceBuffers, __global long* global_bornForce,
#else
__global float4* forceBuffers, __global float* global_bornForce,
#endif
__global float* energyBuffer, __global float4* posq, __global float* global_bornRadii,
__local AtomData* localData, __local float4* tempBuffer,
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) {
......@@ -460,16 +479,10 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
// Sum the forces on atom j.
if (tgx % 2 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+1];
if (tgx % 4 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+2];
if (tgx % 8 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+4];
if (tgx % 16 == 0)
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+8];
tempBuffer[get_local_id(0)] += tempBuffer[get_local_id(0)+1]+tempBuffer[get_local_id(0)+2]+tempBuffer[get_local_id(0)+3];
if (tgx == 0) {
float4 sum = tempBuffer[get_local_id(0)] + tempBuffer[get_local_id(0)+16];
float4 sum = tempBuffer[get_local_id(0)]+tempBuffer[get_local_id(0)+4]+tempBuffer[get_local_id(0)+8]+tempBuffer[get_local_id(0)+12]+tempBuffer[get_local_id(0)+16]+tempBuffer[get_local_id(0)+20]+tempBuffer[get_local_id(0)+24]+tempBuffer[get_local_id(0)+28];
localData[tbx+j].fx += sum.x;
localData[tbx+j].fy += sum.y;
localData[tbx+j].fz += sum.z;
......@@ -532,6 +545,22 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
// Write results. We need to coordinate between warps to make sure no two of them
// ever try to write to the same piece of memory at the same time.
#ifdef SUPPORTS_64_BIT_ATOMICS
if (pos < end) {
const unsigned int offset = x*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset], (long) (force.x*0xFFFFFFFF));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0xFFFFFFFF));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0xFFFFFFFF));
atom_add(&global_bornForce[offset], (long) (force.w*0xFFFFFFFF));
}
if (pos < end && x != y) {
const unsigned int offset = y*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset], (long) (localData[get_local_id(0)].fx*0xFFFFFFFF));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fy*0xFFFFFFFF));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fz*0xFFFFFFFF));
atom_add(&global_bornForce[offset], (long) (localData[get_local_id(0)].fw*0xFFFFFFFF));
}
#else
int writeX = (pos < end ? x : -1);
int writeY = (pos < end && x != y ? y : -1);
if (tgx == 0)
......@@ -583,6 +612,7 @@ __kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* e
}
}
}
#endif
lasty = y;
pos++;
} while (pos < end);
......
#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable
#ifdef SUPPORTS_64_BIT_ATOMICS
#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
#endif
#define TILE_SIZE 32
typedef struct {
......@@ -11,7 +14,13 @@ typedef struct {
/**
* Compute nonbonded interactions.
*/
__kernel void computeNonbonded(__global float4* forceBuffers, __global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
__kernel void computeNonbonded(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* forceBuffers,
#else
__global float4* forceBuffers,
#endif
__global float* energyBuffer, __global float4* posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __local AtomData* localData, __local float* tempBuffer,
unsigned int startTileIndex, unsigned int endTileIndex,
#ifdef USE_CUTOFF
......@@ -157,6 +166,14 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
if ((flags&(1<<j)) != 0) {
bool isExcluded = false;
int atom2 = tbx+j;
int bufferIndex = 3*get_local_id(0);
#ifdef USE_SYMMETRIC
float dEdR = 0.0f;
#else
float4 dEdR1 = (float4) 0.0f;
float4 dEdR2 = (float4) 0.0f;
#endif
float tempEnergy = 0.0f;
float4 posq2 = (float4) (localData[atom2].x, localData[atom2].y, localData[atom2].z, localData[atom2].q);
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
......@@ -165,20 +182,18 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = RSQRT(r2);
float r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
#ifdef USE_SYMMETRIC
float dEdR = 0.0f;
#else
float4 dEdR1 = (float4) 0.0f;
float4 dEdR2 = (float4) 0.0f;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
float invR = RSQRT(r2);
float r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
COMPUTE_INTERACTION
energy += tempEnergy;
#ifdef USE_CUTOFF
}
#endif
float tempEnergy = 0.0f;
COMPUTE_INTERACTION
energy += tempEnergy;
int bufferIndex = 3*get_local_id(0);
#ifdef USE_SYMMETRIC
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
......@@ -194,30 +209,15 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
// Sum the forces on atom2.
if (tgx % 2 == 0) {
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5];
}
if (tgx % 4 == 0) {
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+6];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+7];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+8];
}
if (tgx % 8 == 0) {
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+12];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+13];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+14];
}
if (tgx % 16 == 0) {
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+24];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+25];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+26];
tempBuffer[bufferIndex] += tempBuffer[bufferIndex+3]+tempBuffer[bufferIndex+6]+tempBuffer[bufferIndex+9];
tempBuffer[bufferIndex+1] += tempBuffer[bufferIndex+4]+tempBuffer[bufferIndex+7]+tempBuffer[bufferIndex+10];
tempBuffer[bufferIndex+2] += tempBuffer[bufferIndex+5]+tempBuffer[bufferIndex+8]+tempBuffer[bufferIndex+11];
}
if (tgx == 0) {
localData[tbx+j].fx += tempBuffer[bufferIndex] + tempBuffer[bufferIndex+48];
localData[tbx+j].fy += tempBuffer[bufferIndex+1] + tempBuffer[bufferIndex+49];
localData[tbx+j].fz += tempBuffer[bufferIndex+2] + tempBuffer[bufferIndex+50];
localData[tbx+j].fx += tempBuffer[bufferIndex]+tempBuffer[bufferIndex+12]+tempBuffer[bufferIndex+24]+tempBuffer[bufferIndex+36]+tempBuffer[bufferIndex+48]+tempBuffer[bufferIndex+60]+tempBuffer[bufferIndex+72]+tempBuffer[bufferIndex+84];
localData[tbx+j].fy += tempBuffer[bufferIndex+1]+tempBuffer[bufferIndex+13]+tempBuffer[bufferIndex+25]+tempBuffer[bufferIndex+37]+tempBuffer[bufferIndex+49]+tempBuffer[bufferIndex+61]+tempBuffer[bufferIndex+73]+tempBuffer[bufferIndex+85];
localData[tbx+j].fz += tempBuffer[bufferIndex+2]+tempBuffer[bufferIndex+14]+tempBuffer[bufferIndex+26]+tempBuffer[bufferIndex+38]+tempBuffer[bufferIndex+50]+tempBuffer[bufferIndex+62]+tempBuffer[bufferIndex+74]+tempBuffer[bufferIndex+86];
}
}
}
......@@ -246,30 +246,36 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
float r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
float invR = RSQRT(r2);
float r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+tj;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
float invR = RSQRT(r2);
float r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+tj;
#ifdef USE_SYMMETRIC
float dEdR = 0.0f;
float dEdR = 0.0f;
#else
float4 dEdR1 = (float4) 0.0f;
float4 dEdR2 = (float4) 0.0f;
float4 dEdR1 = (float4) 0.0f;
float4 dEdR2 = (float4) 0.0f;
#endif
float tempEnergy = 0.0f;
COMPUTE_INTERACTION
energy += tempEnergy;
float tempEnergy = 0.0f;
COMPUTE_INTERACTION
energy += tempEnergy;
#ifdef USE_SYMMETRIC
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z;
#else
force.xyz -= dEdR1.xyz;
localData[tbx+tj].fx += dEdR2.x;
localData[tbx+tj].fy += dEdR2.y;
localData[tbx+tj].fz += dEdR2.z;
force.xyz -= dEdR1.xyz;
localData[tbx+tj].fx += dEdR2.x;
localData[tbx+tj].fy += dEdR2.y;
localData[tbx+tj].fz += dEdR2.z;
#endif
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
......@@ -283,6 +289,20 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
// Write results. We need to coordinate between warps to make sure no two of them
// ever try to write to the same piece of memory at the same time.
#ifdef SUPPORTS_64_BIT_ATOMICS
if (pos < end) {
const unsigned int offset = x*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset], (long) (force.x*0xFFFFFFFF));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (force.y*0xFFFFFFFF));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (force.z*0xFFFFFFFF));
}
if (pos < end && x != y) {
const unsigned int offset = y*TILE_SIZE + tgx;
atom_add(&forceBuffers[offset], (long) (localData[get_local_id(0)].fx*0xFFFFFFFF));
atom_add(&forceBuffers[offset+PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fy*0xFFFFFFFF));
atom_add(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (long) (localData[get_local_id(0)].fz*0xFFFFFFFF));
}
#else
int writeX = (pos < end ? x : -1);
int writeY = (pos < end && x != y ? y : -1);
if (tgx == 0)
......@@ -332,6 +352,7 @@ __kernel void computeNonbonded(__global float4* forceBuffers, __global float* en
}
}
}
#endif
lasty = y;
pos++;
} while (pos < end);
......
......@@ -2,23 +2,23 @@
* Fill a buffer with 0.
*/
__kernel void clearBuffer(__global float* buffer, int size) {
__kernel void clearBuffer(__global int* buffer, int size) {
int index = get_global_id(0);
__global float4* buffer4 = (__global float4*) buffer;
__global int4* buffer4 = (__global int4*) buffer;
int sizeDiv4 = size/4;
while (index < sizeDiv4) {
buffer4[index] = (float4) (0.0f);
buffer4[index] = (int4) 0;
index += get_global_size(0);
}
if (get_global_id(0) == 0)
for (int i = sizeDiv4*4; i < size; i++)
buffer[i] = 0.0f;
buffer[i] = 0;
}
/**
* Fill two buffers with 0.
*/
__kernel void clearTwoBuffers(__global float* buffer1, int size1, __global float* buffer2, int size2) {
__kernel void clearTwoBuffers(__global int* buffer1, int size1, __global int* buffer2, int size2) {
clearBuffer(buffer1, size1);
clearBuffer(buffer2, size2);
}
......@@ -26,7 +26,7 @@ __kernel void clearTwoBuffers(__global float* buffer1, int size1, __global float
/**
* Fill three buffers with 0.
*/
__kernel void clearThreeBuffers(__global float* buffer1, int size1, __global float* buffer2, int size2, __global float* buffer3, int size3) {
__kernel void clearThreeBuffers(__global int* buffer1, int size1, __global int* buffer2, int size2, __global int* buffer3, int size3) {
clearBuffer(buffer1, size1);
clearBuffer(buffer2, size2);
clearBuffer(buffer3, size3);
......@@ -35,7 +35,7 @@ __kernel void clearThreeBuffers(__global float* buffer1, int size1, __global flo
/**
* Fill four buffers with 0.
*/
__kernel void clearFourBuffers(__global float* buffer1, int size1, __global float* buffer2, int size2, __global float* buffer3, int size3, __global float* buffer4, int size4) {
__kernel void clearFourBuffers(__global int* buffer1, int size1, __global int* buffer2, int size2, __global int* buffer3, int size3, __global int* buffer4, int size4) {
clearBuffer(buffer1, size1);
clearBuffer(buffer2, size2);
clearBuffer(buffer3, size3);
......@@ -58,6 +58,20 @@ __kernel void reduceFloat4Buffer(__global float4* buffer, int bufferSize, int nu
}
}
/**
* Sum the various buffers containing forces.
*/
__kernel void reduceForces(__global long* longBuffer, __global float4* buffer, int bufferSize, int numBuffers) {
int totalSize = bufferSize*numBuffers;
float scale = 1.0f/(float) 0xFFFFFFFF;
for (int index = get_global_id(0); index < bufferSize; index += get_global_size(0)) {
float4 sum = (float4) (scale*longBuffer[index], scale*longBuffer[index+bufferSize], scale*longBuffer[index+2*bufferSize], 0.0f);
for (int i = index; i < totalSize; i += bufferSize)
sum += buffer[i];
buffer[index] = sum;
}
}
/**
* This is called to determine the accuracy of various native functions.
*/
......
......@@ -373,6 +373,12 @@ void testParallelComputation() {
vector<Vec3> positions(numParticles);
for (int i = 0; i < numParticles; i++)
positions[i] = Vec3(5*genrand_real2(sfmt), 5*genrand_real2(sfmt), 5*genrand_real2(sfmt));
for (int i = 0; i < numParticles; ++i)
for (int j = 0; j < i; ++j) {
Vec3 delta = positions[i]-positions[j];
if (delta.dot(delta) < 0.1)
force->addExclusion(i, j);
}
VerletIntegrator integrator1(0.01);
map<string, string> props1;
props1[OpenCLPlatform::OpenCLDeviceIndex()] = "0";
......
......@@ -691,6 +691,12 @@ void testParallelComputation() {
vector<Vec3> positions(numParticles);
for (int i = 0; i < numParticles; i++)
positions[i] = Vec3(5*genrand_real2(sfmt), 5*genrand_real2(sfmt), 5*genrand_real2(sfmt));
for (int i = 0; i < numParticles; ++i)
for (int j = 0; j < i; ++j) {
Vec3 delta = positions[i]-positions[j];
if (delta.dot(delta) < 0.1)
force->addException(i, j, 0, 1, 0);
}
VerletIntegrator integrator1(0.01);
map<string, string> props1;
props1[OpenCLPlatform::OpenCLDeviceIndex()] = "0";
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment