"docs-source/vscode:/vscode.git/clone" did not exist on "82e6bcd038b4389bf75ec6e58acf1e5154df87cb"
Commit 562cfb39 authored by Peter Eastman's avatar Peter Eastman
Browse files

Created CustomGBForce kernels optimized for CPU. Fixed bugs in CPU GBSA kernels.

parent cbfff447
......@@ -1715,7 +1715,7 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
computeBornSumKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
computeBornSumKernel.setArg<cl::Buffer>(index++, params->getDeviceBuffer());
computeBornSumKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : OpenCLContext::ThreadBlockSize)*13*sizeof(cl_float), NULL);
computeBornSumKernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
computeBornSumKernel.setArg(index++, (deviceIsCpu ? 1 : OpenCLContext::ThreadBlockSize)*sizeof(cl_float), NULL);
if (nb.getUseCutoff()) {
computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
computeBornSumKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
......@@ -1734,7 +1734,7 @@ double OpenCLCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeF
force1Kernel.setArg<cl::Buffer>(index++, bornRadii->getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(index++, bornForce->getDeviceBuffer());
force1Kernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : OpenCLContext::ThreadBlockSize)*13*sizeof(cl_float), NULL);
force1Kernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(mm_float4), NULL);
force1Kernel.setArg(index++, (deviceIsCpu ? 1 : OpenCLContext::ThreadBlockSize)*sizeof(mm_float4), NULL);
if (nb.getUseCutoff()) {
force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
force1Kernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
......@@ -1954,6 +1954,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
bool useCutoff = (force.getNonbondedMethod() != CustomGBForce::NoCutoff);
bool usePeriodic = (force.getNonbondedMethod() != CustomGBForce::NoCutoff && force.getNonbondedMethod() != CustomGBForce::CutoffNonPeriodic);
bool deviceIsCpu = (cl.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
{
// Create the N2 value kernel.
......@@ -1987,8 +1988,8 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
const OpenCLNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
string paramName = "params"+intToString(i+1);
extraArgs << ", __global " << buffer.getType() << "* global_" << paramName << ", __local " << buffer.getType() << "* local_" << paramName;
loadLocal1 << "local_" << paramName << "[get_local_id(0)] = " << paramName << "1;\n";
loadLocal2 << "local_" << paramName << "[get_local_id(0)] = global_" << paramName << "[j];\n";
loadLocal1 << "local_" << paramName << "[localAtomIndex] = " << paramName << "1;\n";
loadLocal2 << "local_" << paramName << "[localAtomIndex] = global_" << paramName << "[j];\n";
load1 << buffer.getType() << " " << paramName << "1 = global_" << paramName << "[atom1];\n";
load2 << buffer.getType() << " " << paramName << "2 = local_" << paramName << "[atom2];\n";
}
......@@ -2010,7 +2011,13 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
defines["NUM_ATOMS"] = intToString(cl.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = intToString(cl.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(cl.getNumAtomBlocks());
string file = (cl.getSIMDWidth() == 32 ? OpenCLKernelSources::customGBValueN2_nvidia : OpenCLKernelSources::customGBValueN2_default);
string file;
if (deviceIsCpu)
file = OpenCLKernelSources::customGBValueN2_cpu;
else if (cl.getSIMDWidth() == 32)
file = OpenCLKernelSources::customGBValueN2_nvidia;
else
OpenCLKernelSources::customGBValueN2_default;
cl::Program program = cl.createProgram(cl.replaceStrings(file, replacements), defines);
pairValueKernel = cl::Kernel(program, "computeN2Value");
}
......@@ -2106,8 +2113,8 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
const OpenCLNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
string paramName = "params"+intToString(i+1);
extraArgs << ", __global " << buffer.getType() << "* global_" << paramName << ", __local " << buffer.getType() << "* local_" << paramName;
loadLocal1 << "local_" << paramName << "[get_local_id(0)] = " << paramName << "1;\n";
loadLocal2 << "local_" << paramName << "[get_local_id(0)] = global_" << paramName << "[j];\n";
loadLocal1 << "local_" << paramName << "[localAtomIndex] = " << paramName << "1;\n";
loadLocal2 << "local_" << paramName << "[localAtomIndex] = global_" << paramName << "[j];\n";
load1 << buffer.getType() << " " << paramName << "1 = global_" << paramName << "[atom1];\n";
load2 << buffer.getType() << " " << paramName << "2 = local_" << paramName << "[atom2];\n";
}
......@@ -2115,8 +2122,8 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
const OpenCLNonbondedUtilities::ParameterInfo& buffer = computedValues->getBuffers()[i];
string valueName = "values"+intToString(i+1);
extraArgs << ", __global " << buffer.getType() << "* global_" << valueName << ", __local " << buffer.getType() << "* local_" << valueName;
loadLocal1 << "local_" << valueName << "[get_local_id(0)] = " << valueName << "1;\n";
loadLocal2 << "local_" << valueName << "[get_local_id(0)] = global_" << valueName << "[j];\n";
loadLocal1 << "local_" << valueName << "[localAtomIndex] = " << valueName << "1;\n";
loadLocal2 << "local_" << valueName << "[localAtomIndex] = global_" << valueName << "[j];\n";
load1 << buffer.getType() << " " << valueName << "1 = global_" << valueName << "[atom1];\n";
load2 << buffer.getType() << " " << valueName << "2 = local_" << valueName << "[atom2];\n";
}
......@@ -2124,7 +2131,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
const OpenCLNonbondedUtilities::ParameterInfo& buffer = energyDerivs->getBuffers()[i];
string index = intToString(i+1);
extraArgs << ", __global " << buffer.getType() << "* derivBuffers" << index << ", __local " << buffer.getType() << "* local_deriv" << index;
clearLocal << "local_deriv" << index << "[get_local_id(0)] = 0.0f;\n";
clearLocal << "local_deriv" << index << "[localAtomIndex] = 0.0f;\n";
load1 << buffer.getType() << " deriv" << index << "_1 = 0.0f;\n";
load2 << buffer.getType() << " deriv" << index << "_2 = 0.0f;\n";
recordDeriv << "local_deriv" << index << "[atom2] += deriv" << index << "_2;\n";
......@@ -2157,7 +2164,13 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
defines["NUM_ATOMS"] = intToString(cl.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = intToString(cl.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = OpenCLExpressionUtilities::intToString(cl.getNumAtomBlocks());
string file = (cl.getSIMDWidth() == 32 ? OpenCLKernelSources::customGBEnergyN2_nvidia : OpenCLKernelSources::customGBEnergyN2_default);
string file;
if (deviceIsCpu)
file = OpenCLKernelSources::customGBEnergyN2_cpu;
else if (cl.getSIMDWidth() == 32)
file = OpenCLKernelSources::customGBEnergyN2_nvidia;
else
file = OpenCLKernelSources::customGBEnergyN2_default;
cl::Program program = cl.createProgram(cl.replaceStrings(file, replacements), defines);
pairEnergyKernel = cl::Kernel(program, "computeN2Energy");
}
......@@ -2408,6 +2421,7 @@ void OpenCLCalcCustomGBForceKernel::initialize(const System& system, const Custo
}
double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeForces, bool includeEnergy) {
bool deviceIsCpu = (cl.getDevice().getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_CPU);
OpenCLNonbondedUtilities& nb = cl.getNonbondedUtilities();
if (!hasInitializedKernels) {
hasInitializedKernels = true;
......@@ -2422,14 +2436,14 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
pairValueKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionIndices().getDeviceBuffer());
pairValueKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionRowIndices().getDeviceBuffer());
pairValueKernel.setArg<cl::Buffer>(index++, valueBuffers->getDeviceBuffer());
pairValueKernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
pairValueKernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float), NULL);
pairValueKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : OpenCLContext::ThreadBlockSize)*sizeof(cl_float), NULL);
pairValueKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : OpenCLContext::ThreadBlockSize)*sizeof(cl_float), NULL);
if (nb.getUseCutoff()) {
pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
index += 2; // Periodic box size arguments are set when the kernel is executed.
pairValueKernel.setArg<cl_uint>(index++, maxTiles);
if (cl.getSIMDWidth() == 32)
if (cl.getSIMDWidth() == 32 || deviceIsCpu)
pairValueKernel.setArg<cl::Buffer>(index++, nb.getInteractionFlags().getDeviceBuffer());
}
else
......@@ -2465,19 +2479,19 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
index = 0;
pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getForceBuffers().getDeviceBuffer());
pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getEnergyBuffer().getDeviceBuffer());
pairEnergyKernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
pairEnergyKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : OpenCLContext::ThreadBlockSize)*sizeof(cl_float4), NULL);
pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getPosq().getDeviceBuffer());
pairEnergyKernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
pairEnergyKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : OpenCLContext::ThreadBlockSize)*sizeof(cl_float4), NULL);
pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusions().getDeviceBuffer());
pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionIndices().getDeviceBuffer());
pairEnergyKernel.setArg<cl::Buffer>(index++, cl.getNonbondedUtilities().getExclusionRowIndices().getDeviceBuffer());
pairEnergyKernel.setArg(index++, OpenCLContext::ThreadBlockSize*sizeof(cl_float4), NULL);
pairEnergyKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : OpenCLContext::ThreadBlockSize)*sizeof(cl_float4), NULL);
if (nb.getUseCutoff()) {
pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractingTiles().getDeviceBuffer());
pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractionCount().getDeviceBuffer());
index += 2; // Periodic box size arguments are set when the kernel is executed.
pairEnergyKernel.setArg<cl_uint>(index++, maxTiles);
if (cl.getSIMDWidth() == 32)
if (cl.getSIMDWidth() == 32 || deviceIsCpu)
pairEnergyKernel.setArg<cl::Buffer>(index++, nb.getInteractionFlags().getDeviceBuffer());
}
else
......@@ -2487,17 +2501,17 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
for (int i = 0; i < (int) params->getBuffers().size(); i++) {
const OpenCLNonbondedUtilities::ParameterInfo& buffer = params->getBuffers()[i];
pairEnergyKernel.setArg<cl::Memory>(index++, buffer.getMemory());
pairEnergyKernel.setArg(index++, OpenCLContext::ThreadBlockSize*buffer.getSize(), NULL);
pairEnergyKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : OpenCLContext::ThreadBlockSize)*buffer.getSize(), NULL);
}
for (int i = 0; i < (int) computedValues->getBuffers().size(); i++) {
const OpenCLNonbondedUtilities::ParameterInfo& buffer = computedValues->getBuffers()[i];
pairEnergyKernel.setArg<cl::Memory>(index++, buffer.getMemory());
pairEnergyKernel.setArg(index++, OpenCLContext::ThreadBlockSize*buffer.getSize(), NULL);
pairEnergyKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : OpenCLContext::ThreadBlockSize)*buffer.getSize(), NULL);
}
for (int i = 0; i < (int) energyDerivs->getBuffers().size(); i++) {
const OpenCLNonbondedUtilities::ParameterInfo& buffer = energyDerivs->getBuffers()[i];
pairEnergyKernel.setArg<cl::Memory>(index++, buffer.getMemory());
pairEnergyKernel.setArg(index++, OpenCLContext::ThreadBlockSize*buffer.getSize(), NULL);
pairEnergyKernel.setArg(index++, (deviceIsCpu ? OpenCLContext::TileSize : OpenCLContext::ThreadBlockSize)*buffer.getSize(), NULL);
}
if (tabulatedFunctionParams != NULL) {
for (int i = 0; i < (int) tabulatedFunctions.size(); i++)
......@@ -2560,9 +2574,9 @@ double OpenCLCalcCustomGBForceKernel::execute(ContextImpl& context, bool include
}
}
int numTiles = cl.getNumAtomBlocks()*(cl.getNumAtomBlocks()+1)/2;
cl.executeKernel(pairValueKernel, numTiles*OpenCLContext::TileSize);
cl.executeKernel(pairValueKernel, numTiles*OpenCLContext::TileSize, deviceIsCpu ? 1 : -1);
cl.executeKernel(perParticleValueKernel, cl.getPaddedNumAtoms());
cl.executeKernel(pairEnergyKernel, numTiles*OpenCLContext::TileSize);
cl.executeKernel(pairEnergyKernel, numTiles*OpenCLContext::TileSize, deviceIsCpu ? 1 : -1);
cl.executeKernel(perParticleEnergyKernel, cl.getPaddedNumAtoms());
if (needParameterGradient)
cl.executeKernel(gradientChainRuleKernel, cl.getPaddedNumAtoms());
......
#define TILE_SIZE 32
#define STORE_DERIVATIVE_1(INDEX) derivBuffers##INDEX[offset1] += deriv##INDEX##_1;
#define STORE_DERIVATIVE_2(INDEX) derivBuffers##INDEX[offset2] += local_deriv##INDEX[tgx];
/**
* Compute a force based on pair interactions.
*/
__kernel void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer, __local float4* local_force,
__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions, __global unsigned int* exclusionIndices,
__global unsigned int* exclusionRowIndices, __local float4* tempBuffer,
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
#else
unsigned int numTiles
#endif
PARAMETER_ARGUMENTS) {
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
unsigned int pos = get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
unsigned int end = (get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
#else
unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0);
unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif
float energy = 0.0f;
unsigned int lasty = 0xFFFFFFFF;
while (pos < end) {
// Extract the coordinates of this tile
unsigned int x, y;
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
y = tileIndices.y;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
// Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS
unsigned int exclusionStart = exclusionRowIndices[x];
unsigned int exclusionEnd = exclusionRowIndices[x+1];
int exclusionIndex = -1;
for (int i = exclusionStart; i < exclusionEnd; i++)
if (exclusionIndices[i] == y) {
exclusionIndex = i*TILE_SIZE;
break;
}
bool hasExclusions = (exclusionIndex > -1);
#else
bool hasExclusions = false;
#endif
// Load the data for this tile if we don't already have it cached.
if (lasty != y) {
for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
unsigned int j = y*TILE_SIZE + localAtomIndex;
local_posq[localAtomIndex] = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
}
if (x == y) {
// This tile is on the diagonal.
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndex+tgx];
#endif
unsigned int atom1 = x*TILE_SIZE+tgx;
float4 force = 0.0f;
float4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
float4 posq2 = local_posq[j];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
float r2 = dot(delta.xyz, delta.xyz);
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
float r = SQRT(r2);
unsigned int atom2 = j;
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
float dEdR = 0.0f;
float tempEnergy = 0.0f;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
COMPUTE_INTERACTION
dEdR /= -r;
}
energy += 0.5f*tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
}
// Write results
unsigned int offset1 = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset1].xyz += force.xyz;
STORE_DERIVATIVES_1
}
}
else {
// This is an off-diagonal tile.
for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
local_force[localAtomIndex] = 0.0f;
CLEAR_LOCAL_DERIVATIVES
}
#if defined(USE_CUTOFF) && defined(USE_EXCLUSIONS)
unsigned int flags1 = (numTiles <= maxTiles ? interactionFlags[2*pos] : 0xFFFFFFFF);
unsigned int flags2 = (numTiles <= maxTiles ? interactionFlags[2*pos+1] : 0xFFFFFFFF);
if (!hasExclusions && (flags1 != 0xFFFFFFFF || flags2 != 0xFFFFFFFF)) {
// Compute only a subset of the interactions in this tile.
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
if ((flags2&(1<<tgx)) != 0) {
unsigned int atom1 = x*TILE_SIZE+tgx;
float value = 0.0f;
float4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
float4 posq2 = local_posq[j];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
float r2 = dot(delta.xyz, delta.xyz);
if (r2 < CUTOFF_SQUARED) {
float r = SQRT(r2);
unsigned int atom2 = j;
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
float dEdR = 0.0f;
float tempEnergy = 0.0f;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_INTERACTION
dEdR /= -r;
}
energy += tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
atom2 = j;
local_force[atom2].xyz += delta.xyz;
RECORD_DERIVATIVE_2
}
}
}
// Write results for atom1.
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
global_value[offset] += value;
}
}
}
else
#endif
{
// Compute the full set of interactions in this tile.
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx;
float4 force = 0.0f;
float4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex+tgx] : 0xFFFFFFFF);
#endif
for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
float4 posq2 = local_posq[j];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
float r2 = dot(delta.xyz, delta.xyz);
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
float r = SQRT(r2);
unsigned int atom2 = j;
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
float dEdR = 0.0f;
float tempEnergy = 0.0f;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_INTERACTION
dEdR /= -r;
}
energy += tempEnergy;
delta.xyz *= dEdR;
force.xyz -= delta.xyz;
atom2 = j;
local_force[atom2].xyz += delta.xyz;
RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
}
// Write results for atom1.
unsigned int offset1 = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset1].xyz += force.xyz;
STORE_DERIVATIVES_1
}
}
// Write results
for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int offset2 = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset2].xyz += local_force[tgx].xyz;
STORE_DERIVATIVES_2
}
}
lasty = y;
pos++;
}
energyBuffer[get_global_id(0)] += energy;
}
......@@ -74,7 +74,8 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
if (x == y) {
// This tile is on the diagonal.
local_posq[get_local_id(0)] = posq1;
const unsigned int localAtomIndex = get_local_id(0);
local_posq[localAtomIndex] = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1
barrier(CLK_LOCAL_MEM_FENCE);
#ifdef USE_EXCLUSIONS
......@@ -136,12 +137,13 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
else {
// This is an off-diagonal tile.
const unsigned int localAtomIndex = get_local_id(0);
if (lasty != y && get_local_id(0) < TILE_SIZE) {
unsigned int j = y*TILE_SIZE + tgx;
local_posq[get_local_id(0)] = posq[j];
local_posq[localAtomIndex] = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
local_force[get_local_id(0)] = 0.0f;
local_force[localAtomIndex] = 0.0f;
CLEAR_LOCAL_DERIVATIVES
barrier(CLK_LOCAL_MEM_FENCE);
......
......@@ -75,7 +75,8 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
if (x == y) {
// This tile is on the diagonal.
local_posq[get_local_id(0)] = posq1;
const unsigned int localAtomIndex = get_local_id(0);
local_posq[localAtomIndex] = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
......@@ -128,12 +129,13 @@ void computeN2Energy(__global float4* forceBuffers, __global float* energyBuffer
else {
// This is an off-diagonal tile.
const unsigned int localAtomIndex = get_local_id(0);
if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx;
local_posq[get_local_id(0)] = posq[j];
local_posq[localAtomIndex] = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
local_force[get_local_id(0)] = 0.0f;
local_force[localAtomIndex] = 0.0f;
CLEAR_LOCAL_DERIVATIVES
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
......
#define TILE_SIZE 32
/**
* Compute a value based on pair interactions.
*/
__kernel void computeN2Value(__global float4* posq, __local float4* local_posq, __global unsigned int* exclusions,
__global unsigned int* exclusionIndices, __global unsigned int* exclusionRowIndices, __global float* global_value, __local float* local_value,
__local float* tempBuffer,
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags
#else
unsigned int numTiles
#endif
PARAMETER_ARGUMENTS) {
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
unsigned int pos = get_group_id(0)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
unsigned int end = (get_group_id(0)+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/get_num_groups(0);
#else
unsigned int pos = get_group_id(0)*numTiles/get_num_groups(0);
unsigned int end = (get_group_id(0)+1)*numTiles/get_num_groups(0);
#endif
unsigned int lasty = 0xFFFFFFFF;
while (pos < end) {
// Extract the coordinates of this tile
unsigned int x, y;
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
y = tileIndices.y;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y++;
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
// Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS
unsigned int exclusionStart = exclusionRowIndices[x];
unsigned int exclusionEnd = exclusionRowIndices[x+1];
int exclusionIndex = -1;
for (int i = exclusionStart; i < exclusionEnd; i++)
if (exclusionIndices[i] == y) {
exclusionIndex = i*TILE_SIZE;
break;
}
bool hasExclusions = (exclusionIndex > -1);
#else
bool hasExclusions = false;
#endif
// Load the data for this tile if we don't already have it cached.
if (lasty != y) {
for (int localAtomIndex = 0; localAtomIndex < TILE_SIZE; localAtomIndex++) {
unsigned int j = y*TILE_SIZE + localAtomIndex;
local_posq[localAtomIndex] = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
}
if (x == y) {
// This tile is on the diagonal.
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndex+tgx];
#endif
unsigned int atom1 = x*TILE_SIZE+tgx;
float value = 0.0f;
float4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
float4 posq2 = local_posq[j];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
float r2 = dot(delta.xyz, delta.xyz);
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
float r = SQRT(r2);
unsigned int atom2 = j;
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
float tempValue1 = 0.0f;
float tempValue2 = 0.0f;
#ifdef USE_EXCLUSIONS
if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#else
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#endif
COMPUTE_VALUE
}
value += tempValue1;
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
}
// Write results
unsigned int offset = x*TILE_SIZE + tgx + get_group_id(0)*PADDED_NUM_ATOMS;
global_value[offset] += value;
}
}
else {
// This is an off-diagonal tile.
for (int tgx = 0; tgx < TILE_SIZE; tgx++)
local_value[tgx] = 0.0f;
#if defined(USE_CUTOFF) && defined(USE_EXCLUSIONS)
unsigned int flags1 = (numTiles <= maxTiles ? interactionFlags[2*pos] : 0xFFFFFFFF);
unsigned int flags2 = (numTiles <= maxTiles ? interactionFlags[2*pos+1] : 0xFFFFFFFF);
if (!hasExclusions && (flags1 != 0xFFFFFFFF || flags2 != 0xFFFFFFFF)) {
// Compute only a subset of the interactions in this tile.
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
if ((flags2&(1<<tgx)) != 0) {
unsigned int atom1 = x*TILE_SIZE+tgx;
float value = 0.0f;
float4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
float4 posq2 = local_posq[j];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
float r2 = dot(delta.xyz, delta.xyz);
float tempValue1 = 0.0f;
float tempValue2 = 0.0f;
if (r2 < CUTOFF_SQUARED) {
float r = SQRT(r2);
unsigned int atom2 = j;
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_VALUE
}
value += tempValue1;
local_value[j] += tempValue2;
}
}
}
// Write results for atom1.
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
global_value[offset] += value;
}
}
}
else
#endif
{
// Compute the full set of interactions in this tile.
for (unsigned int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int atom1 = x*TILE_SIZE+tgx;
float value = 0.0f;
float4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex+tgx] : 0xFFFFFFFF);
#endif
for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
float4 posq2 = local_posq[j];
float4 delta = (float4) (posq2.xyz - posq1.xyz, 0.0f);
#ifdef USE_PERIODIC
delta.xyz -= floor(delta.xyz*invPeriodicBoxSize.xyz+0.5f)*periodicBoxSize.xyz;
#endif
float r2 = dot(delta.xyz, delta.xyz);
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
float r = SQRT(r2);
unsigned int atom2 = j;
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
float tempValue1 = 0.0f;
float tempValue2 = 0.0f;
#ifdef USE_EXCLUSIONS
if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#else
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#endif
COMPUTE_VALUE
}
value += tempValue1;
local_value[j] += tempValue2;
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
}
// Write results for atom1.
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
global_value[offset] += value;
}
}
// Write results
for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
global_value[offset] += local_value[tgx];
}
}
lasty = y;
pos++;
}
}
......@@ -71,7 +71,8 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
if (x == y) {
// This tile is on the diagonal.
local_posq[get_local_id(0)] = posq1;
const unsigned int localAtomIndex = get_local_id(0);
local_posq[localAtomIndex] = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1
barrier(CLK_LOCAL_MEM_FENCE);
#ifdef USE_EXCLUSIONS
......@@ -134,6 +135,7 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
if (lasty != y && get_local_id(0) < TILE_SIZE) {
unsigned int j = y*TILE_SIZE + tgx;
local_posq[get_local_id(0)] = posq[j];
const unsigned int localAtomIndex = get_local_id(0);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
local_value[get_local_id(0)] = 0.0f;
......
......@@ -73,7 +73,8 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
if (x == y) {
// This tile is on the diagonal.
local_posq[get_local_id(0)] = posq1;
const unsigned int localAtomIndex = get_local_id(0);
local_posq[localAtomIndex] = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
......@@ -129,6 +130,7 @@ void computeN2Value(__global float4* posq, __local float4* local_posq, __global
if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx;
local_posq[get_local_id(0)] = posq[j];
const unsigned int localAtomIndex = get_local_id(0);
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
local_value[get_local_id(0)] = 0.0f;
......
......@@ -14,8 +14,7 @@ typedef struct {
* Compute the Born sum.
*/
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params, __local AtomData* localData, __local float* tempBuffer,
__kernel void computeBornSum(__global float* global_bornSum, __global float4* posq, __global float2* global_params, __local AtomData* localData, __local float* tempBuffer,
#ifdef USE_CUTOFF
__global ushort2* tiles, __global unsigned int* interactionCount, float4 periodicBoxSize, float4 invPeriodicBoxSize, unsigned int maxTiles, __global unsigned int* interactionFlags) {
#else
......@@ -171,8 +170,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
// Write results for atom1.
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
global_bornSum[offset] += localData[tgx].bornSum;
}
global_bornSum[offset] += bornSum;
}
// Write results
......@@ -181,6 +179,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
global_bornSum[offset] += localData[tgx].bornSum;
}
}
lasty = y;
pos++;
}
......@@ -190,8 +189,7 @@ void computeBornSum(__global float* global_bornSum, __global float4* posq, __glo
* First part of computing the GBSA interaction.
*/
__kernel __attribute__((reqd_work_group_size(WORK_GROUP_SIZE, 1, 1)))
void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
__kernel void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuffer,
__global float4* posq, __global float* global_bornRadii,
__global float* global_bornForce, __local AtomData* localData, __local float4* tempBuffer,
#ifdef USE_CUTOFF
......@@ -344,7 +342,7 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
}
global_bornForce[offset] += force.w;
}
// Write results
......@@ -358,6 +356,7 @@ void computeGBSAForce1(__global float4* forceBuffers, __global float* energyBuff
forceBuffers[offset] = f;
global_bornForce[offset] += localData[tgx].fw;
}
}
lasty = y;
pos++;
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment