Commit 93c467b2 authored by Peter Eastman's avatar Peter Eastman
Browse files

Merged 5.1Optimizations branch back to trunk

parent f6d4557d
......@@ -61,7 +61,7 @@ using namespace OpenMM;
using namespace std;
const int CudaContext::ThreadBlockSize = 64;
const int CudaContext::TileSize = 32;
const int CudaContext::TileSize = sizeof(tileflags)*8;
bool CudaContext::hasInitializedCuda = false;
CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler,
......@@ -369,6 +369,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
src << "typedef float3 mixed3;\n";
src << "typedef float4 mixed4;\n";
}
src << "typedef unsigned int tileflags;\n";
for (map<string, string>::const_iterator iter = defines.begin(); iter != defines.end(); ++iter) {
src << "#define " << iter->first;
if (!iter->second.empty())
......
......@@ -42,6 +42,8 @@
#include "windowsExportCuda.h"
#include "CudaPlatform.h"
typedef unsigned int tileflags;
namespace OpenMM {
class CudaArray;
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2012 Stanford University and the Authors. *
* Portions copyright (c) 2009-2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -99,7 +99,7 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
posDelta(NULL), settleAtoms(NULL), settleParams(NULL), shakeAtoms(NULL), shakeParams(NULL),
random(NULL), randomSeed(NULL), randomPos(0), stepSize(NULL), ccmaAtoms(NULL), ccmaDistance(NULL),
ccmaReducedMass(NULL), ccmaAtomConstraints(NULL), ccmaNumAtomConstraints(NULL), ccmaConstraintMatrixColumn(NULL),
ccmaConstraintMatrixValue(NULL), ccmaDelta1(NULL), ccmaDelta2(NULL), ccmaConvergedMemory(NULL),
ccmaConstraintMatrixValue(NULL), ccmaDelta1(NULL), ccmaDelta2(NULL), ccmaConverged(NULL),
vsite2AvgAtoms(NULL), vsite2AvgWeights(NULL), vsite3AvgAtoms(NULL), vsite3AvgWeights(NULL),
vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL) {
// Create workspace arrays.
......@@ -466,9 +466,8 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
ccmaAtoms = CudaArray::create<int2>(context, numCCMA, "CcmaAtoms");
ccmaAtomConstraints = CudaArray::create<int>(context, numAtoms*maxAtomConstraints, "CcmaAtomConstraints");
ccmaNumAtomConstraints = CudaArray::create<int>(context, numAtoms, "CcmaAtomConstraintsIndex");
CHECK_RESULT2(cuMemHostAlloc((void**) &ccmaConvergedMemory, 2*sizeof(int), CU_MEMHOSTALLOC_DEVICEMAP), "Error allocating pinned memory");
CHECK_RESULT2(cuMemHostGetDevicePointer(&ccmaConvergedDeviceMemory, ccmaConvergedMemory, 0), "Error getting device address for pinned memory");
ccmaConstraintMatrixColumn = CudaArray::create<int>(context, numCCMA*maxRowElements, "ConstraintMatrixColumn");
ccmaConverged = CudaArray::create<int>(context, 2, "ccmaConverged");
vector<int2> atomsVec(ccmaAtoms->getSize());
vector<int> atomConstraintsVec(ccmaAtomConstraints->getSize());
vector<int> numAtomConstraintsVec(ccmaNumAtomConstraints->getSize());
......@@ -680,8 +679,8 @@ CudaIntegrationUtilities::~CudaIntegrationUtilities() {
delete ccmaDelta1;
if (ccmaDelta2 != NULL)
delete ccmaDelta2;
if (ccmaConvergedMemory != NULL)
cuMemFreeHost(ccmaConvergedMemory);
if (ccmaConverged != NULL)
delete ccmaConverged;
if (vsite2AvgAtoms != NULL)
delete vsite2AvgAtoms;
if (vsite2AvgWeights != NULL)
......@@ -734,33 +733,32 @@ void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double
context.executeKernel(shakeKernel, args, shakeAtoms->getSize());
}
if (ccmaAtoms != NULL) {
void* directionsArgs[] = {&ccmaAtoms->getDevicePointer(), &ccmaDistance->getDevicePointer(), &context.getPosq().getDevicePointer(), &posCorrection};
void* directionsArgs[] = {&ccmaAtoms->getDevicePointer(), &ccmaDistance->getDevicePointer(), &context.getPosq().getDevicePointer(), &posCorrection, &ccmaConverged->getDevicePointer()};
context.executeKernel(ccmaDirectionsKernel, directionsArgs, ccmaAtoms->getSize());
int i;
void* forceArgs[] = {&ccmaAtoms->getDevicePointer(), &ccmaDistance->getDevicePointer(),
constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta->getDevicePointer(),
&ccmaReducedMass->getDevicePointer(), &ccmaDelta1->getDevicePointer(), &ccmaConvergedDeviceMemory,
&ccmaReducedMass->getDevicePointer(), &ccmaDelta1->getDevicePointer(), &ccmaConverged->getDevicePointer(),
tolPointer, &i};
void* multiplyArgs[] = {&ccmaDelta1->getDevicePointer(), &ccmaDelta2->getDevicePointer(),
&ccmaConstraintMatrixColumn->getDevicePointer(), &ccmaConstraintMatrixValue->getDevicePointer(), &ccmaConvergedDeviceMemory, &i};
&ccmaConstraintMatrixColumn->getDevicePointer(), &ccmaConstraintMatrixValue->getDevicePointer(), &ccmaConverged->getDevicePointer(), &i};
void* updateArgs[] = {&ccmaNumAtomConstraints->getDevicePointer(), &ccmaAtomConstraints->getDevicePointer(), &ccmaDistance->getDevicePointer(),
constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta->getDevicePointer(),
&context.getVelm().getDevicePointer(), &ccmaDelta1->getDevicePointer(), &ccmaDelta2->getDevicePointer(),
&ccmaConvergedDeviceMemory, &i};
&ccmaConverged->getDevicePointer(), &i};
const int checkInterval = 4;
int* converged = (int*) context.getPinnedBuffer();
for (i = 0; i < 150; i++) {
if (i == 0) {
ccmaConvergedMemory[0] = 1;
ccmaConvergedMemory[1] = 0;
}
context.executeKernel(ccmaForceKernel, forceArgs, ccmaAtoms->getSize());
if ((i+1)%checkInterval == 0)
if ((i+1)%checkInterval == 0) {
ccmaConverged->download(converged, false);
CHECK_RESULT2(cuEventRecord(ccmaEvent, 0), "Error recording event for CCMA");
}
context.executeKernel(ccmaMultiplyKernel, multiplyArgs, ccmaAtoms->getSize());
context.executeKernel(ccmaUpdateKernel, updateArgs, context.getNumAtoms());
if ((i+1)%checkInterval == 0) {
CHECK_RESULT2(cuEventSynchronize(ccmaEvent), "Error synchronizing on event for CCMA");
if (ccmaConvergedMemory[i%2])
if (converged[i%2])
break;
}
}
......
......@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2012 Stanford University and the Authors. *
* Portions copyright (c) 2009-2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -140,8 +140,7 @@ private:
CudaArray* ccmaConstraintMatrixValue;
CudaArray* ccmaDelta1;
CudaArray* ccmaDelta2;
int* ccmaConvergedMemory;
CUdeviceptr ccmaConvergedDeviceMemory;
CudaArray* ccmaConverged;
CUevent ccmaEvent;
CudaArray* vsite2AvgAtoms;
CudaArray* vsite2AvgWeights;
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2008-2012 Stanford University and the Authors. *
* Portions copyright (c) 2008-2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -1351,10 +1351,6 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
delete pmeBsplineModuliY;
if (pmeBsplineModuliZ != NULL)
delete pmeBsplineModuliZ;
if (pmeBsplineTheta != NULL)
delete pmeBsplineTheta;
if (pmeBsplineDTheta != NULL)
delete pmeBsplineDTheta;
if (pmeAtomRange != NULL)
delete pmeAtomRange;
if (pmeAtomGridIndex != NULL)
......@@ -1507,13 +1503,13 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
if (cu.getUseDoublePrecision())
pmeDefines["USE_DOUBLE_PRECISION"] = "1";
CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::pme, pmeDefines);
pmeUpdateBsplinesKernel = cu.getKernel(module, "updateBsplines");
pmeAtomRangeKernel = cu.getKernel(module, "findAtomRangeForGrid");
pmeGridIndexKernel = cu.getKernel(module, "findAtomGridIndex");
pmeSpreadChargeKernel = cu.getKernel(module, "gridSpreadCharge");
pmeConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
pmeInterpolateForceKernel = cu.getKernel(module, "gridInterpolateForce");
pmeEvalEnergyKernel = cu.getKernel(module, "gridEvaluateEnergy");
pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
cuFuncSetCacheConfig(pmeSpreadChargeKernel, CU_FUNC_CACHE_PREFER_L1);
cuFuncSetCacheConfig(pmeInterpolateForceKernel, CU_FUNC_CACHE_PREFER_L1);
// Create required data structures.
......@@ -1528,7 +1524,6 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
pmeBsplineModuliX = new CudaArray(cu, gridSizeX, elementSize, "pmeBsplineModuliX");
pmeBsplineModuliY = new CudaArray(cu, gridSizeY, elementSize, "pmeBsplineModuliY");
pmeBsplineModuliZ = new CudaArray(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
pmeBsplineTheta = new CudaArray(cu, PmeOrder*numParticles, 4*elementSize, "pmeBsplineTheta");
pmeAtomRange = CudaArray::create<int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
pmeAtomGridIndex = CudaArray::create<int2>(cu, numParticles, "pmeAtomGridIndex");
sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms());
......@@ -1659,20 +1654,14 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
cu.executeKernel(ewaldForcesKernel, forcesArgs, cu.getNumAtoms());
}
if (directPmeGrid != NULL && cu.getContextIndex() == 0 && includeReciprocal) {
void* bsplinesArgs[] = {&cu.getPosq().getDevicePointer(), &pmeBsplineTheta->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(),
cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
int bsplinesSharedSize = cu.ThreadBlockSize*PmeOrder*(cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4));
cu.executeKernel(pmeUpdateBsplinesKernel, bsplinesArgs, cu.getNumAtoms(), cu.ThreadBlockSize, bsplinesSharedSize);
void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
cu.executeKernel(pmeGridIndexKernel, gridIndexArgs, cu.getNumAtoms());
sort->sort(*pmeAtomGridIndex);
void* rangeArgs[] = {&pmeAtomGridIndex->getDevicePointer(), &pmeAtomRange->getDevicePointer(), &cu.getPosq().getDevicePointer(),
cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
cu.executeKernel(pmeAtomRangeKernel, rangeArgs, cu.getNumAtoms());
void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &directPmeGrid->getDevicePointer(), &pmeBsplineTheta->getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
cu.executeKernel(pmeSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), PmeOrder*PmeOrder*PmeOrder);
void* finishSpreadArgs[] = {&directPmeGrid->getDevicePointer()};
void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &directPmeGrid->getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getInvPeriodicBoxSizePointer(), &pmeAtomGridIndex->getDevicePointer()};
cu.executeKernel(pmeSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128);
if (cu.getUseDoublePrecision() || cu.getComputeCapability() < 2.0) {
void* finishSpreadArgs[] = {&directPmeGrid->getDevicePointer()};
......@@ -1699,8 +1688,8 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &directPmeGrid->getDevicePointer(),
cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
cu.executeKernel(pmeInterpolateForceKernel, interpolateArgs, cu.getNumAtoms());
cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer(), &pmeAtomGridIndex->getDevicePointer()};
cu.executeKernel(pmeInterpolateForceKernel, interpolateArgs, cu.getNumAtoms(), 128);
}
double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0);
......@@ -2071,6 +2060,14 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks());
defines["FORCE_WORK_GROUP_SIZE"] = cu.intToString(nb.getForceThreadBlockSize());
defines["TILE_SIZE"] = cu.intToString(CudaContext::TileSize);
int numExclusionTiles = nb.getExclusionTiles().getSize();
defines["NUM_TILES_WITH_EXCLUSIONS"] = cu.intToString(numExclusionTiles);
int numContexts = cu.getPlatformData().contexts.size();
int startExclusionIndex = cu.getContextIndex()*numExclusionTiles/numContexts;
int endExclusionIndex = (cu.getContextIndex()+1)*numExclusionTiles/numContexts;
defines["FIRST_EXCLUSION_TILE"] = cu.intToString(startExclusionIndex);
defines["LAST_EXCLUSION_TILE"] = cu.intToString(endExclusionIndex);
map<string, string> replacements;
CUmodule module = cu.createModule(CudaKernelSources::vectorOps+cu.replaceStrings(CudaKernelSources::gbsaObc1, replacements), defines);
computeBornSumKernel = cu.getKernel(module, "computeBornSum");
......@@ -2083,12 +2080,12 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
computeSumArgs.push_back(cu.getPeriodicBoxSizePointer());
computeSumArgs.push_back(cu.getInvPeriodicBoxSizePointer());
computeSumArgs.push_back(&maxTiles);
computeSumArgs.push_back(&nb.getInteractionFlags().getDevicePointer());
computeSumArgs.push_back(&nb.getBlockCenters().getDevicePointer());
computeSumArgs.push_back(&nb.getInteractingAtoms().getDevicePointer());
}
else
computeSumArgs.push_back(&maxTiles);
computeSumArgs.push_back(&nb.getExclusionIndices().getDevicePointer());
computeSumArgs.push_back(&nb.getExclusionRowIndices().getDevicePointer());
computeSumArgs.push_back(&nb.getExclusionTiles().getDevicePointer());
force1Kernel = cu.getKernel(module, "computeGBSAForce1");
force1Args.push_back(&cu.getForce().getDevicePointer());
force1Args.push_back(&bornForce->getDevicePointer());
......@@ -2101,12 +2098,12 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
force1Args.push_back(cu.getPeriodicBoxSizePointer());
force1Args.push_back(cu.getInvPeriodicBoxSizePointer());
force1Args.push_back(&maxTiles);
force1Args.push_back(&nb.getInteractionFlags().getDevicePointer());
force1Args.push_back(&nb.getBlockCenters().getDevicePointer());
force1Args.push_back(&nb.getInteractingAtoms().getDevicePointer());
}
else
force1Args.push_back(&maxTiles);
force1Args.push_back(&nb.getExclusionIndices().getDevicePointer());
force1Args.push_back(&nb.getExclusionRowIndices().getDevicePointer());
force1Args.push_back(&nb.getExclusionTiles().getDevicePointer());
reduceBornSumKernel = cu.getKernel(module, "reduceBornSum");
reduceBornForceKernel = cu.getKernel(module, "reduceBornForce");
}
......@@ -2115,8 +2112,8 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
maxTiles = nb.getInteractingTiles().getSize();
computeSumArgs[3] = &nb.getInteractingTiles().getDevicePointer();
force1Args[5] = &nb.getInteractingTiles().getDevicePointer();
computeSumArgs[8] = &nb.getInteractionFlags().getDevicePointer();
force1Args[10] = &nb.getInteractionFlags().getDevicePointer();
computeSumArgs[9] = &nb.getInteractingAtoms().getDevicePointer();
force1Args[11] = &nb.getInteractingAtoms().getDevicePointer();
}
}
cu.executeKernel(computeBornSumKernel, &computeSumArgs[0], nb.getNumForceThreadBlocks()*nb.getForceThreadBlockSize(), nb.getForceThreadBlockSize());
......@@ -2244,16 +2241,17 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
// Record parameters and exclusions.
int numParticles = force.getNumParticles();
params = new CudaParameterSet(cu, force.getNumPerParticleParameters(), numParticles, "customGBParameters", true);
computedValues = new CudaParameterSet(cu, force.getNumComputedValues(), numParticles, "customGBComputedValues", true, cu.getUseDoublePrecision());
int paddedNumParticles = cu.getPaddedNumAtoms();
int numParams = force.getNumPerParticleParameters();
params = new CudaParameterSet(cu, force.getNumPerParticleParameters(), paddedNumParticles, "customGBParameters", true);
computedValues = new CudaParameterSet(cu, force.getNumComputedValues(), paddedNumParticles, "customGBComputedValues", true, cu.getUseDoublePrecision());
if (force.getNumGlobalParameters() > 0)
globals = CudaArray::create<float>(cu, force.getNumGlobalParameters(), "customGBGlobals");
vector<vector<float> > paramVector(numParticles);
vector<vector<float> > paramVector(paddedNumParticles, vector<float>(numParams, 0));
vector<vector<int> > exclusionList(numParticles);
for (int i = 0; i < numParticles; i++) {
vector<double> parameters;
force.getParticleParameters(i, parameters);
paramVector[i].resize(parameters.size());
for (int j = 0; j < (int) parameters.size(); j++)
paramVector[i][j] = (float) parameters[j];
exclusionList[i].push_back(i);
......@@ -2406,23 +2404,22 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
replacements["LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"] = loadLocal2.str();
replacements["LOAD_ATOM1_PARAMETERS"] = load1.str();
replacements["LOAD_ATOM2_PARAMETERS"] = load2.str();
map<string, string> defines;
if (useCutoff)
defines["USE_CUTOFF"] = "1";
pairValueDefines["USE_CUTOFF"] = "1";
if (usePeriodic)
defines["USE_PERIODIC"] = "1";
pairValueDefines["USE_PERIODIC"] = "1";
if (useExclusionsForValue)
defines["USE_EXCLUSIONS"] = "1";
pairValueDefines["USE_EXCLUSIONS"] = "1";
if (atomParamSize%2 == 0 && !cu.getUseDoublePrecision())
defines["NEED_PADDING"] = "1";
defines["WARPS_PER_GROUP"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize()/CudaContext::TileSize);
defines["THREAD_BLOCK_SIZE"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize());
defines["CUTOFF_SQUARED"] = cu.doubleToString(force.getCutoffDistance()*force.getCutoffDistance());
defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks());
CUmodule module = cu.createModule(CudaKernelSources::vectorOps+cu.replaceStrings(CudaKernelSources::customGBValueN2, replacements), defines);
pairValueKernel = cu.getKernel(module, "computeN2Value");
pairValueDefines["NEED_PADDING"] = "1";
pairValueDefines["WARPS_PER_GROUP"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize()/CudaContext::TileSize);
pairValueDefines["THREAD_BLOCK_SIZE"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize());
pairValueDefines["CUTOFF_SQUARED"] = cu.doubleToString(force.getCutoffDistance()*force.getCutoffDistance());
pairValueDefines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
pairValueDefines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
pairValueDefines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks());
pairValueDefines["TILE_SIZE"] = cu.intToString(CudaContext::TileSize);
pairValueSrc = cu.replaceStrings(CudaKernelSources::customGBValueN2, replacements);
if (useExclusionsForValue)
cu.getNonbondedUtilities().requestExclusions(exclusionList);
}
......@@ -2574,23 +2571,22 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
replacements["RECORD_DERIVATIVE_2"] = recordDeriv.str();
replacements["STORE_DERIVATIVES_1"] = storeDerivs1.str();
replacements["STORE_DERIVATIVES_2"] = storeDerivs2.str();
map<string, string> defines;
if (useCutoff)
defines["USE_CUTOFF"] = "1";
pairEnergyDefines["USE_CUTOFF"] = "1";
if (usePeriodic)
defines["USE_PERIODIC"] = "1";
pairEnergyDefines["USE_PERIODIC"] = "1";
if (anyExclusions)
defines["USE_EXCLUSIONS"] = "1";
pairEnergyDefines["USE_EXCLUSIONS"] = "1";
if (atomParamSize%2 == 0 && !cu.getUseDoublePrecision())
defines["NEED_PADDING"] = "1";
defines["THREAD_BLOCK_SIZE"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize());
defines["WARPS_PER_GROUP"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize()/CudaContext::TileSize);
defines["CUTOFF_SQUARED"] = cu.doubleToString(force.getCutoffDistance()*force.getCutoffDistance());
defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks());
CUmodule module = cu.createModule(CudaKernelSources::vectorOps+cu.replaceStrings(CudaKernelSources::customGBEnergyN2, replacements), defines);
pairEnergyKernel = cu.getKernel(module, "computeN2Energy");
pairEnergyDefines["NEED_PADDING"] = "1";
pairEnergyDefines["THREAD_BLOCK_SIZE"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize());
pairEnergyDefines["WARPS_PER_GROUP"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize()/CudaContext::TileSize);
pairEnergyDefines["CUTOFF_SQUARED"] = cu.doubleToString(force.getCutoffDistance()*force.getCutoffDistance());
pairEnergyDefines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
pairEnergyDefines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
pairEnergyDefines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks());
pairEnergyDefines["TILE_SIZE"] = cu.intToString(CudaContext::TileSize);
pairEnergySrc = cu.replaceStrings(CudaKernelSources::customGBEnergyN2, replacements);
}
{
// Create the kernel to reduce the derivatives and calculate per-particle energy terms.
......@@ -2834,14 +2830,46 @@ double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeFo
CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
if (!hasInitializedKernels) {
hasInitializedKernels = true;
// These two kernels can't be compiled in initialize(), because the nonbonded utilities object
// has not yet been initialized then.
{
int numExclusionTiles = cu.getNonbondedUtilities().getExclusionTiles().getSize();
pairValueDefines["NUM_TILES_WITH_EXCLUSIONS"] = cu.intToString(numExclusionTiles);
int numContexts = cu.getPlatformData().contexts.size();
int startExclusionIndex = cu.getContextIndex()*numExclusionTiles/numContexts;
int endExclusionIndex = (cu.getContextIndex()+1)*numExclusionTiles/numContexts;
pairValueDefines["FIRST_EXCLUSION_TILE"] = cu.intToString(startExclusionIndex);
pairValueDefines["LAST_EXCLUSION_TILE"] = cu.intToString(endExclusionIndex);
CUmodule module = cu.createModule(CudaKernelSources::vectorOps+pairValueSrc, pairValueDefines);
pairValueKernel = cu.getKernel(module, "computeN2Value");
pairValueSrc = "";
pairValueDefines.clear();
}
{
int numExclusionTiles = cu.getNonbondedUtilities().getExclusionTiles().getSize();
pairEnergyDefines["NUM_TILES_WITH_EXCLUSIONS"] = cu.intToString(numExclusionTiles);
int numContexts = cu.getPlatformData().contexts.size();
int startExclusionIndex = cu.getContextIndex()*numExclusionTiles/numContexts;
int endExclusionIndex = (cu.getContextIndex()+1)*numExclusionTiles/numContexts;
pairEnergyDefines["FIRST_EXCLUSION_TILE"] = cu.intToString(startExclusionIndex);
pairEnergyDefines["LAST_EXCLUSION_TILE"] = cu.intToString(endExclusionIndex);
CUmodule module = cu.createModule(CudaKernelSources::vectorOps+pairEnergySrc, pairEnergyDefines);
pairEnergyKernel = cu.getKernel(module, "computeN2Energy");
pairEnergySrc = "";
pairEnergyDefines.clear();
}
// Set arguments for kernels.
maxTiles = (nb.getUseCutoff() ? nb.getInteractingTiles().getSize() : cu.getNumAtomBlocks()*(cu.getNumAtomBlocks()+1)/2);
valueBuffers = CudaArray::create<long long>(cu, cu.getPaddedNumAtoms(), "customGBValueBuffers");
cu.addAutoclearBuffer(*valueBuffers);
cu.clearBuffer(valueBuffers->getDevicePointer(), sizeof(long long)*valueBuffers->getSize());
pairValueArgs.push_back(&cu.getPosq().getDevicePointer());
pairValueArgs.push_back(&cu.getNonbondedUtilities().getExclusions().getDevicePointer());
pairValueArgs.push_back(&cu.getNonbondedUtilities().getExclusionIndices().getDevicePointer());
pairValueArgs.push_back(&cu.getNonbondedUtilities().getExclusionRowIndices().getDevicePointer());
pairValueArgs.push_back(&cu.getNonbondedUtilities().getExclusionTiles().getDevicePointer());
pairValueArgs.push_back(&valueBuffers->getDevicePointer());
if (nb.getUseCutoff()) {
pairValueArgs.push_back(&nb.getInteractingTiles().getDevicePointer());
......@@ -2849,7 +2877,8 @@ double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeFo
pairValueArgs.push_back(cu.getPeriodicBoxSizePointer());
pairValueArgs.push_back(cu.getInvPeriodicBoxSizePointer());
pairValueArgs.push_back(&maxTiles);
pairValueArgs.push_back(&nb.getInteractionFlags().getDevicePointer());
pairValueArgs.push_back(&nb.getBlockCenters().getDevicePointer());
pairValueArgs.push_back(&nb.getInteractingAtoms().getDevicePointer());
}
else
pairValueArgs.push_back(&maxTiles);
......@@ -2881,15 +2910,15 @@ double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeFo
pairEnergyArgs.push_back(&cu.getEnergyBuffer().getDevicePointer());
pairEnergyArgs.push_back(&cu.getPosq().getDevicePointer());
pairEnergyArgs.push_back(&cu.getNonbondedUtilities().getExclusions().getDevicePointer());
pairEnergyArgs.push_back(&cu.getNonbondedUtilities().getExclusionIndices().getDevicePointer());
pairEnergyArgs.push_back(&cu.getNonbondedUtilities().getExclusionRowIndices().getDevicePointer());
pairEnergyArgs.push_back(&cu.getNonbondedUtilities().getExclusionTiles().getDevicePointer());
if (nb.getUseCutoff()) {
pairEnergyArgs.push_back(&nb.getInteractingTiles().getDevicePointer());
pairEnergyArgs.push_back(&nb.getInteractionCount().getDevicePointer());
pairEnergyArgs.push_back(cu.getPeriodicBoxSizePointer());
pairEnergyArgs.push_back(cu.getInvPeriodicBoxSizePointer());
pairEnergyArgs.push_back(&maxTiles);
pairEnergyArgs.push_back(&nb.getInteractionFlags().getDevicePointer());
pairEnergyArgs.push_back(&nb.getBlockCenters().getDevicePointer());
pairEnergyArgs.push_back(&nb.getInteractingAtoms().getDevicePointer());
}
else
pairEnergyArgs.push_back(&maxTiles);
......@@ -2953,10 +2982,10 @@ double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeFo
if (nb.getUseCutoff()) {
if (maxTiles < nb.getInteractingTiles().getSize()) {
maxTiles = nb.getInteractingTiles().getSize();
pairValueArgs[5] = &nb.getInteractingTiles().getDevicePointer();
pairEnergyArgs[6] = &nb.getInteractingTiles().getDevicePointer();
pairValueArgs[10] = &nb.getInteractionFlags().getDevicePointer();
pairEnergyArgs[11] = &nb.getInteractionFlags().getDevicePointer();
pairValueArgs[4] = &nb.getInteractingTiles().getDevicePointer();
pairEnergyArgs[5] = &nb.getInteractingTiles().getDevicePointer();
pairValueArgs[10] = &nb.getInteractingAtoms().getDevicePointer();
pairEnergyArgs[11] = &nb.getInteractingAtoms().getDevicePointer();
}
}
cu.executeKernel(pairValueKernel, &pairValueArgs[0], nb.getNumForceThreadBlocks()*nb.getForceThreadBlockSize(), nb.getForceThreadBlockSize());
......@@ -2976,11 +3005,10 @@ void CudaCalcCustomGBForceKernel::copyParametersToContext(ContextImpl& context,
// Record the per-particle parameters.
vector<vector<float> > paramVector(numParticles);
vector<vector<float> > paramVector(cu.getPaddedNumAtoms(), vector<float>(force.getNumPerParticleParameters(), 0));
vector<double> parameters;
for (int i = 0; i < numParticles; i++) {
force.getParticleParameters(i, parameters);
paramVector[i].resize(parameters.size());
for (int j = 0; j < (int) parameters.size(); j++)
paramVector[i][j] = (float) parameters[j];
}
......
......@@ -557,8 +557,7 @@ class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
public:
CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcNonbondedForceKernel(name, platform),
cu(cu), hasInitializedFFT(false), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), directPmeGrid(NULL), reciprocalPmeGrid(NULL),
pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeBsplineTheta(NULL), pmeBsplineDTheta(NULL),
pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL) {
pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL) {
}
~CudaCalcNonbondedForceKernel();
/**
......@@ -607,8 +606,6 @@ private:
CudaArray* pmeBsplineModuliX;
CudaArray* pmeBsplineModuliY;
CudaArray* pmeBsplineModuliZ;
CudaArray* pmeBsplineTheta;
CudaArray* pmeBsplineDTheta;
CudaArray* pmeAtomRange;
CudaArray* pmeAtomGridIndex;
CudaSort* sort;
......@@ -617,9 +614,6 @@ private:
CUfunction ewaldSumsKernel;
CUfunction ewaldForcesKernel;
CUfunction pmeGridIndexKernel;
CUfunction pmeAtomRangeKernel;
CUfunction pmeZIndexKernel;
CUfunction pmeUpdateBsplinesKernel;
CUfunction pmeSpreadChargeKernel;
CUfunction pmeFinishSpreadChargeKernel;
CUfunction pmeEvalEnergyKernel;
......@@ -776,6 +770,8 @@ private:
System& system;
CUfunction pairValueKernel, perParticleValueKernel, pairEnergyKernel, perParticleEnergyKernel, gradientChainRuleKernel;
std::vector<void*> pairValueArgs, perParticleValueArgs, pairEnergyArgs, perParticleEnergyArgs, gradientChainRuleArgs;
std::string pairValueSrc, pairEnergySrc;
std::map<std::string, std::string> pairValueDefines, pairEnergyDefines;
};
/**
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2012 Stanford University and the Authors. *
* Portions copyright (c) 2009-2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -29,6 +29,8 @@
#include "CudaArray.h"
#include "CudaKernelSources.h"
#include "CudaExpressionUtilities.h"
#include "CudaSort.h"
#include <algorithm>
#include <map>
#include <set>
#include <utility>
......@@ -43,15 +45,33 @@ using namespace std;
throw OpenMMException(m.str());\
}
CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(context), cutoff(-1.0), useCutoff(false), anyExclusions(false),
exclusionIndices(NULL), exclusionRowIndices(NULL), exclusions(NULL), interactingTiles(NULL), interactionFlags(NULL),
interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), nonbondedForceGroup(0) {
class CudaNonbondedUtilities::BlockSortTrait : public CudaSort::SortTrait {
public:
BlockSortTrait(bool useDouble) : useDouble(useDouble) {
}
int getDataSize() const {return useDouble ? sizeof(double2) : sizeof(float2);}
int getKeySize() const {return useDouble ? sizeof(double) : sizeof(float);}
const char* getDataType() const {return "real2";}
const char* getKeyType() const {return "real";}
const char* getMinKey() const {return "-3.40282e+38f";}
const char* getMaxKey() const {return "3.40282e+38f";}
const char* getMaxValue() const {return "make_real2(3.40282e+38f, 3.40282e+38f)";}
const char* getSortKey() const {return "value.x";}
private:
bool useDouble;
};
CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(context), cutoff(-1.0), useCutoff(false), anyExclusions(false), usePadding(true),
exclusionIndices(NULL), exclusionRowIndices(NULL), exclusionTiles(NULL), exclusions(NULL), interactingTiles(NULL), interactingAtoms(NULL),
interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), sortedBlocks(NULL), sortedBlockCenter(NULL), sortedBlockBoundingBox(NULL),
oldPositions(NULL), rebuildNeighborList(NULL), blockSorter(NULL), nonbondedForceGroup(0) {
// Decide how many thread blocks to use.
string errorMessage = "Error initializing nonbonded utilities";
int multiprocessors;
CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, context.getDevice()));
numForceThreadBlocks = 3*multiprocessors;
numForceThreadBlocks = 4*multiprocessors;
forceThreadBlockSize = (context.getComputeCapability() < 2.0 ? 128 : 256);
}
......@@ -60,18 +80,32 @@ CudaNonbondedUtilities::~CudaNonbondedUtilities() {
delete exclusionIndices;
if (exclusionRowIndices != NULL)
delete exclusionRowIndices;
if (exclusionTiles != NULL)
delete exclusionTiles;
if (exclusions != NULL)
delete exclusions;
if (interactingTiles != NULL)
delete interactingTiles;
if (interactionFlags != NULL)
delete interactionFlags;
if (interactingAtoms != NULL)
delete interactingAtoms;
if (interactionCount != NULL)
delete interactionCount;
if (blockCenter != NULL)
delete blockCenter;
if (blockBoundingBox != NULL)
delete blockBoundingBox;
if (sortedBlocks != NULL)
delete sortedBlocks;
if (sortedBlockCenter != NULL)
delete sortedBlockCenter;
if (sortedBlockBoundingBox != NULL)
delete sortedBlockBoundingBox;
if (oldPositions != NULL)
delete oldPositions;
if (rebuildNeighborList != NULL)
delete rebuildNeighborList;
if (blockSorter != NULL)
delete blockSorter;
}
void CudaNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup) {
......@@ -124,6 +158,10 @@ void CudaNonbondedUtilities::requestExclusions(const vector<vector<int> >& exclu
}
}
static bool compareUshort2(ushort2 a, ushort2 b) {
return ((a.y < b.y) || (a.y == b.y && a.x < b.x));
}
void CudaNonbondedUtilities::initialize(const System& system) {
string errorMessage = "Error initializing nonbonded utilities";
if (atomExclusions.size() == 0) {
......@@ -138,13 +176,10 @@ void CudaNonbondedUtilities::initialize(const System& system) {
numAtoms = context.getNumAtoms();
int numAtomBlocks = context.getNumAtomBlocks();
int totalTiles = numAtomBlocks*(numAtomBlocks+1)/2;
int numContexts = context.getPlatformData().contexts.size();
startTileIndex = context.getContextIndex()*totalTiles/numContexts;
int endTileIndex = (context.getContextIndex()+1)*totalTiles/numContexts;
numTiles = endTileIndex-startTileIndex;
setAtomBlockRange(context.getContextIndex()/(double) numContexts, (context.getContextIndex()+1)/(double) numContexts);
// Build a list of indices for the tiles with exclusions.
// Build a list of tiles that contain exclusions.
set<pair<int, int> > tilesWithExclusions;
for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) {
......@@ -155,19 +190,29 @@ void CudaNonbondedUtilities::initialize(const System& system) {
tilesWithExclusions.insert(make_pair(max(x, y), min(x, y)));
}
}
if (context.getPaddedNumAtoms() > context.getNumAtoms()) {
for (int i = 0; i < numAtomBlocks; ++i)
tilesWithExclusions.insert(make_pair(numAtomBlocks-1, i));
vector<ushort2> exclusionTilesVec;
for (set<pair<int, int> >::const_iterator iter = tilesWithExclusions.begin(); iter != tilesWithExclusions.end(); ++iter)
exclusionTilesVec.push_back(make_ushort2((unsigned short) iter->first, (unsigned short) iter->second));
sort(exclusionTilesVec.begin(), exclusionTilesVec.end(), compareUshort2);
exclusionTiles = CudaArray::create<ushort2>(context, exclusionTilesVec.size(), "exclusionTiles");
exclusionTiles->upload(exclusionTilesVec);
map<pair<int, int>, int> exclusionTileMap;
for (int i = 0; i < (int) exclusionTilesVec.size(); i++) {
ushort2 tile = exclusionTilesVec[i];
exclusionTileMap[make_pair(tile.x, tile.y)] = i;
}
vector<vector<int> > exclusionBlocksForBlock(numAtomBlocks);
for (set<pair<int, int> >::const_iterator iter = tilesWithExclusions.begin(); iter != tilesWithExclusions.end(); ++iter) {
exclusionBlocksForBlock[iter->first].push_back(iter->second);
if (iter->first != iter->second)
exclusionBlocksForBlock[iter->second].push_back(iter->first);
}
vector<unsigned int> exclusionRowIndicesVec(numAtomBlocks+1, 0);
vector<unsigned int> exclusionIndicesVec;
int currentRow = 0;
for (set<pair<int, int> >::const_iterator iter = tilesWithExclusions.begin(); iter != tilesWithExclusions.end(); ++iter) {
while (iter->first != currentRow)
exclusionRowIndicesVec[++currentRow] = exclusionIndicesVec.size();
exclusionIndicesVec.push_back(iter->second);
for (int i = 0; i < numAtomBlocks; i++) {
exclusionIndicesVec.insert(exclusionIndicesVec.end(), exclusionBlocksForBlock[i].begin(), exclusionBlocksForBlock[i].end());
exclusionRowIndicesVec[i+1] = exclusionIndicesVec.size();
}
exclusionRowIndicesVec[++currentRow] = exclusionIndicesVec.size();
exclusionIndices = CudaArray::create<unsigned int>(context, exclusionIndicesVec.size(), "exclusionIndices");
exclusionRowIndices = CudaArray::create<unsigned int>(context, exclusionRowIndicesVec.size(), "exclusionRowIndices");
exclusionIndices->upload(exclusionIndicesVec);
......@@ -175,8 +220,9 @@ void CudaNonbondedUtilities::initialize(const System& system) {
// Record the exclusion data.
exclusions = CudaArray::create<unsigned int>(context, tilesWithExclusions.size()*CudaContext::TileSize, "exclusions");
vector<unsigned int> exclusionVec(exclusions->getSize(), 0xFFFFFFFF);
exclusions = CudaArray::create<tileflags>(context, tilesWithExclusions.size()*CudaContext::TileSize, "exclusions");
tileflags allFlags = (tileflags) -1;
vector<tileflags> exclusionVec(exclusions->getSize(), allFlags);
for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) {
int x = atom1/CudaContext::TileSize;
int offset1 = atom1-x*CudaContext::TileSize;
......@@ -185,31 +231,12 @@ void CudaNonbondedUtilities::initialize(const System& system) {
int y = atom2/CudaContext::TileSize;
int offset2 = atom2-y*CudaContext::TileSize;
if (x > y) {
int index = findExclusionIndex(x, y, exclusionIndicesVec, exclusionRowIndicesVec);
exclusionVec[index+offset1] &= 0xFFFFFFFF-(1<<offset2);
int index = exclusionTileMap[make_pair(x, y)]*CudaContext::TileSize;
exclusionVec[index+offset1] &= allFlags-(1<<offset2);
}
else {
int index = findExclusionIndex(y, x, exclusionIndicesVec, exclusionRowIndicesVec);
exclusionVec[index+offset2] &= 0xFFFFFFFF-(1<<offset1);
}
}
}
// Mark all interactions that involve a padding atom as being excluded.
for (int atom1 = context.getNumAtoms(); atom1 < context.getPaddedNumAtoms(); ++atom1) {
int x = atom1/CudaContext::TileSize;
int offset1 = atom1-x*CudaContext::TileSize;
for (int atom2 = 0; atom2 < context.getPaddedNumAtoms(); ++atom2) {
int y = atom2/CudaContext::TileSize;
int offset2 = atom2-y*CudaContext::TileSize;
if (x >= y) {
int index = findExclusionIndex(x, y, exclusionIndicesVec, exclusionRowIndicesVec);
exclusionVec[index+offset1] &= 0xFFFFFFFF-(1<<offset2);
}
if (y >= x) {
int index = findExclusionIndex(y, x, exclusionIndicesVec, exclusionRowIndicesVec);
exclusionVec[index+offset2] &= 0xFFFFFFFF-(1<<offset1);
int index = exclusionTileMap[make_pair(y, x)]*CudaContext::TileSize;
exclusionVec[index+offset2] &= allFlags-(1<<offset1);
}
}
}
......@@ -219,26 +246,34 @@ void CudaNonbondedUtilities::initialize(const System& system) {
// Create data structures for the neighbor list.
if (useCutoff) {
// Select a size for the arrays that hold the neighbor list. This estimate is intentionally very
// high, because if it ever is too small, we have to fall back to the N^2 algorithm.
// Select a size for the arrays that hold the neighbor list. We have to make a fairly
// arbitrary guess, but if this turns out to be too small we'll increase it later.
double4 boxSize = context.getPeriodicBoxSize();
maxTiles = (int) (numTiles*(cutoff/boxSize.x+cutoff/boxSize.y+cutoff/boxSize.z));
maxTiles = 20*numAtomBlocks;
if (maxTiles > numTiles)
maxTiles = numTiles;
if (maxTiles < 1)
maxTiles = 1;
interactingTiles = CudaArray::create<ushort2>(context, maxTiles, "interactingTiles");
interactionFlags = CudaArray::create<unsigned int>(context, maxTiles, "interactionFlags");
interactingAtoms = CudaArray::create<int>(context, CudaContext::TileSize*maxTiles, "interactingAtoms");
interactionCount = CudaArray::create<unsigned int>(context, 1, "interactionCount");
int elementSize = (context.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
blockCenter = new CudaArray(context, numAtomBlocks, 4*elementSize, "blockCenter");
blockBoundingBox = new CudaArray(context, numAtomBlocks, 4*elementSize, "blockBoundingBox");
sortedBlocks = new CudaArray(context, numAtomBlocks, 2*elementSize, "sortedBlocks");
sortedBlockCenter = new CudaArray(context, numAtomBlocks+1, 4*elementSize, "sortedBlockCenter");
sortedBlockBoundingBox = new CudaArray(context, numAtomBlocks+1, 4*elementSize, "sortedBlockBoundingBox");
oldPositions = new CudaArray(context, numAtoms, 4*elementSize, "oldPositions");
if (context.getUseDoublePrecision()) {
blockCenter = CudaArray::create<double4>(context, numAtomBlocks, "blockCenter");
blockBoundingBox = CudaArray::create<double4>(context, numAtomBlocks, "blockBoundingBox");
vector<double4> oldPositionsVec(numAtoms, make_double4(1e30, 1e30, 1e30, 0));
oldPositions->upload(oldPositionsVec);
}
else {
blockCenter = CudaArray::create<float4>(context, numAtomBlocks, "blockCenter");
blockBoundingBox = CudaArray::create<float4>(context, numAtomBlocks, "blockBoundingBox");
vector<float4> oldPositionsVec(numAtoms, make_float4(1e30f, 1e30f, 1e30f, 0));
oldPositions->upload(oldPositionsVec);
}
rebuildNeighborList = CudaArray::create<int>(context, 1, "rebuildNeighborList");
blockSorter = new CudaSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks);
vector<unsigned int> count(1, 0);
interactionCount->upload(count);
}
......@@ -248,11 +283,22 @@ void CudaNonbondedUtilities::initialize(const System& system) {
if (kernelSource.size() > 0)
forceKernel = createInteractionKernel(kernelSource, parameters, arguments, true, true);
if (useCutoff) {
double padding = (usePadding ? 0.1*cutoff : 0.0);
double paddedCutoff = cutoff+padding;
map<string, string> defines;
defines["TILE_SIZE"] = context.intToString(CudaContext::TileSize);
defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
defines["CUTOFF_SQUARED"] = context.doubleToString(cutoff*cutoff);
defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms());
defines["PADDING"] = context.doubleToString(padding);
defines["PADDED_CUTOFF"] = context.doubleToString(paddedCutoff);
defines["PADDED_CUTOFF_SQUARED"] = context.doubleToString(paddedCutoff*paddedCutoff);
defines["NUM_TILES_WITH_EXCLUSIONS"] = context.intToString(exclusionTiles->getSize());
if (usePeriodic)
defines["USE_PERIODIC"] = "1";
int maxExclusions = 0;
for (int i = 0; i < (int) exclusionBlocksForBlock.size(); i++)
maxExclusions = (maxExclusions > exclusionBlocksForBlock[i].size() ? maxExclusions : exclusionBlocksForBlock[i].size());
defines["MAX_EXCLUSIONS"] = context.intToString(maxExclusions);
CUmodule interactingBlocksProgram = context.createModule(CudaKernelSources::vectorOps+CudaKernelSources::findInteractingBlocks, defines);
findBlockBoundsKernel = context.getKernel(interactingBlocksProgram, "findBlockBounds");
findBlockBoundsArgs.push_back(&numAtoms);
......@@ -261,7 +307,18 @@ void CudaNonbondedUtilities::initialize(const System& system) {
findBlockBoundsArgs.push_back(&context.getPosq().getDevicePointer());
findBlockBoundsArgs.push_back(&blockCenter->getDevicePointer());
findBlockBoundsArgs.push_back(&blockBoundingBox->getDevicePointer());
findBlockBoundsArgs.push_back(&interactionCount->getDevicePointer());
findBlockBoundsArgs.push_back(&rebuildNeighborList->getDevicePointer());
findBlockBoundsArgs.push_back(&sortedBlocks->getDevicePointer());
sortBoxDataKernel = context.getKernel(interactingBlocksProgram, "sortBoxData");
sortBoxDataArgs.push_back(&sortedBlocks->getDevicePointer());
sortBoxDataArgs.push_back(&blockCenter->getDevicePointer());
sortBoxDataArgs.push_back(&blockBoundingBox->getDevicePointer());
sortBoxDataArgs.push_back(&sortedBlockCenter->getDevicePointer());
sortBoxDataArgs.push_back(&sortedBlockBoundingBox->getDevicePointer());
sortBoxDataArgs.push_back(&context.getPosq().getDevicePointer());
sortBoxDataArgs.push_back(&oldPositions->getDevicePointer());
sortBoxDataArgs.push_back(&interactionCount->getDevicePointer());
sortBoxDataArgs.push_back(&rebuildNeighborList->getDevicePointer());
findInteractingBlocksKernel = context.getKernel(interactingBlocksProgram, "findBlocksWithInteractions");
findInteractingBlocksArgs.push_back(context.getPeriodicBoxSizePointer());
findInteractingBlocksArgs.push_back(context.getInvPeriodicBoxSizePointer());
......@@ -269,35 +326,21 @@ void CudaNonbondedUtilities::initialize(const System& system) {
findInteractingBlocksArgs.push_back(&blockBoundingBox->getDevicePointer());
findInteractingBlocksArgs.push_back(&interactionCount->getDevicePointer());
findInteractingBlocksArgs.push_back(&interactingTiles->getDevicePointer());
findInteractingBlocksArgs.push_back(&interactionFlags->getDevicePointer());
findInteractingBlocksArgs.push_back(&interactingAtoms->getDevicePointer());
findInteractingBlocksArgs.push_back(&context.getPosq().getDevicePointer());
findInteractingBlocksArgs.push_back(&maxTiles);
findInteractingBlocksArgs.push_back(&startTileIndex);
findInteractingBlocksArgs.push_back(&numTiles);
findInteractionsWithinBlocksKernel = context.getKernel(interactingBlocksProgram, "findInteractionsWithinBlocks");
findInteractionsWithinBlocksArgs.push_back(context.getPeriodicBoxSizePointer());
findInteractionsWithinBlocksArgs.push_back(context.getInvPeriodicBoxSizePointer());
findInteractionsWithinBlocksArgs.push_back(&context.getPosq().getDevicePointer());
findInteractionsWithinBlocksArgs.push_back(&interactingTiles->getDevicePointer());
findInteractionsWithinBlocksArgs.push_back(&blockCenter->getDevicePointer());
findInteractionsWithinBlocksArgs.push_back(&blockBoundingBox->getDevicePointer());
findInteractionsWithinBlocksArgs.push_back(&interactionFlags->getDevicePointer());
findInteractionsWithinBlocksArgs.push_back(&interactionCount->getDevicePointer());
findInteractionsWithinBlocksArgs.push_back(&maxTiles);
findInteractingBlocksArgs.push_back(&startBlockIndex);
findInteractingBlocksArgs.push_back(&numBlocks);
findInteractingBlocksArgs.push_back(&sortedBlocks->getDevicePointer());
findInteractingBlocksArgs.push_back(&sortedBlockCenter->getDevicePointer());
findInteractingBlocksArgs.push_back(&sortedBlockBoundingBox->getDevicePointer());
findInteractingBlocksArgs.push_back(&exclusionIndices->getDevicePointer());
findInteractingBlocksArgs.push_back(&exclusionRowIndices->getDevicePointer());
findInteractingBlocksArgs.push_back(&oldPositions->getDevicePointer());
findInteractingBlocksArgs.push_back(&rebuildNeighborList->getDevicePointer());
}
}
int CudaNonbondedUtilities::findExclusionIndex(int x, int y, const vector<unsigned int>& exclusionIndices, const vector<unsigned int>& exclusionRowIndices) {
if (x < y)
throw OpenMMException("Internal error: called findExclusionIndex with x<y");
int start = exclusionRowIndices[x];
int end = exclusionRowIndices[x+1];
for (int i = start; i < end; i++)
if (exclusionIndices[i] == y)
return i*CudaContext::TileSize;
throw OpenMMException("Internal error: exclusion in unexpected tile");
}
void CudaNonbondedUtilities::prepareInteractions() {
if (!useCutoff)
return;
......@@ -311,13 +354,17 @@ void CudaNonbondedUtilities::prepareInteractions() {
// Compute the neighbor list.
context.executeKernel(findBlockBoundsKernel, &findBlockBoundsArgs[0], context.getNumAtoms());
context.executeKernel(findInteractingBlocksKernel, &findInteractingBlocksArgs[0], context.getNumAtoms());
context.executeKernel(findInteractionsWithinBlocksKernel, &findInteractionsWithinBlocksArgs[0], context.getNumAtoms(), 128);
blockSorter->sort(*sortedBlocks);
context.executeKernel(sortBoxDataKernel, &sortBoxDataArgs[0], context.getNumAtoms());
context.executeKernel(findInteractingBlocksKernel, &findInteractingBlocksArgs[0], context.getNumAtoms(), 256);
}
void CudaNonbondedUtilities::computeInteractions() {
if (kernelSource.size() > 0)
if (kernelSource.size() > 0) {
context.executeKernel(forceKernel, &forceArgs[0], numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
if (context.getComputeForceCount() == 1)
updateNeighborListSize(); // This is the first time step, so check whether our initial guess was large enough.
}
}
void CudaNonbondedUtilities::updateNeighborListSize() {
......@@ -332,26 +379,42 @@ void CudaNonbondedUtilities::updateNeighborListSize() {
// this from happening in the future.
maxTiles = (int) (1.2*pinnedInteractionCount[0]);
int numTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2;
if (maxTiles > numTiles)
maxTiles = numTiles;
int totalTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2;
if (maxTiles > totalTiles)
maxTiles = totalTiles;
delete interactingTiles;
delete interactingAtoms;
interactingTiles = NULL; // Avoid an error in the destructor if the following allocation fails
interactingAtoms = NULL;
interactingTiles = CudaArray::create<ushort2>(context, maxTiles, "interactingTiles");
interactingAtoms = CudaArray::create<int>(context, CudaContext::TileSize*maxTiles, "interactingAtoms");
if (forceArgs.size() > 0)
forceArgs[8] = &interactingTiles->getDevicePointer();
forceArgs[7] = &interactingTiles->getDevicePointer();
findInteractingBlocksArgs[5] = &interactingTiles->getDevicePointer();
delete interactionFlags;
interactionFlags = CudaArray::create<unsigned int>(context, maxTiles, "interactionFlags");
if (forceArgs.size() > 0)
forceArgs[13] = &interactionFlags->getDevicePointer();
findInteractingBlocksArgs[6] = &interactionFlags->getDevicePointer();
findInteractionsWithinBlocksArgs[3] = &interactingTiles->getDevicePointer();
findInteractionsWithinBlocksArgs[6] = &interactionFlags->getDevicePointer();
forceArgs[13] = &interactingAtoms->getDevicePointer();
findInteractingBlocksArgs[6] = &interactingAtoms->getDevicePointer();
if (context.getUseDoublePrecision()) {
vector<double4> oldPositionsVec(numAtoms, make_double4(1e30, 1e30, 1e30, 0));
oldPositions->upload(oldPositionsVec);
}
else {
vector<float4> oldPositionsVec(numAtoms, make_float4(1e30f, 1e30f, 1e30f, 0));
oldPositions->upload(oldPositionsVec);
}
}
void CudaNonbondedUtilities::setTileRange(int startTileIndex, int numTiles) {
this->startTileIndex = startTileIndex;
this->numTiles = numTiles;
void CudaNonbondedUtilities::setUsePadding(bool padding) {
usePadding = padding;
}
void CudaNonbondedUtilities::setAtomBlockRange(double startFraction, double endFraction) {
int numAtomBlocks = context.getNumAtomBlocks();
startBlockIndex = (int) (startFraction*numAtomBlocks);
numBlocks = (int) (endFraction*numAtomBlocks)-startBlockIndex;
int totalTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2;
startTileIndex = (int) (startFraction*totalTiles);;
numTiles = (int) (endFraction*totalTiles)-startTileIndex;
}
CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, vector<ParameterInfo>& params, vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric) {
......@@ -447,6 +510,14 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
defines["TILE_SIZE"] = context.intToString(CudaContext::TileSize);
int numExclusionTiles = exclusionTiles->getSize();
defines["NUM_TILES_WITH_EXCLUSIONS"] = context.intToString(numExclusionTiles);
int numContexts = context.getPlatformData().contexts.size();
int startExclusionIndex = context.getContextIndex()*numExclusionTiles/numContexts;
int endExclusionIndex = (context.getContextIndex()+1)*numExclusionTiles/numContexts;
defines["FIRST_EXCLUSION_TILE"] = context.intToString(startExclusionIndex);
defines["LAST_EXCLUSION_TILE"] = context.intToString(endExclusionIndex);
if ((localDataSize/4)%2 == 0 && !context.getUseDoublePrecision())
defines["PARAMETER_SIZE_IS_EVEN"] = "1";
if (context.getComputeCapability() >= 3.0 && !context.getUseDoublePrecision())
......@@ -461,8 +532,7 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
forceArgs.push_back(&context.getEnergyBuffer().getDevicePointer());
forceArgs.push_back(&context.getPosq().getDevicePointer());
forceArgs.push_back(&exclusions->getDevicePointer());
forceArgs.push_back(&exclusionIndices->getDevicePointer());
forceArgs.push_back(&exclusionRowIndices->getDevicePointer());
forceArgs.push_back(&exclusionTiles->getDevicePointer());
forceArgs.push_back(&startTileIndex);
forceArgs.push_back(&numTiles);
if (useCutoff) {
......@@ -471,7 +541,8 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
forceArgs.push_back(context.getPeriodicBoxSizePointer());
forceArgs.push_back(context.getInvPeriodicBoxSizePointer());
forceArgs.push_back(&maxTiles);
forceArgs.push_back(&interactionFlags->getDevicePointer());
forceArgs.push_back(&blockCenter->getDevicePointer());
forceArgs.push_back(&interactingAtoms->getDevicePointer());
}
for (int i = 0; i < (int) params.size(); i++)
forceArgs.push_back(&params[i].getMemory());
......
......@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2009-2012 Stanford University and the Authors. *
* Portions copyright (c) 2009-2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -35,6 +35,8 @@
#include <vector>
namespace OpenMM {
class CudaSort;
/**
* This class provides a generic interface for calculating nonbonded interactions. It does this in two
......@@ -181,10 +183,10 @@ public:
return *interactingTiles;
}
/**
* Get the array containing flags for tiles with interactions.
* Get the array containing the atoms in each tile with interactions.
*/
CudaArray& getInteractionFlags() {
return *interactionFlags;
CudaArray& getInteractingAtoms() {
return *interactingAtoms;
}
/**
* Get the array containing exclusion flags.
......@@ -192,6 +194,12 @@ public:
CudaArray& getExclusions() {
return *exclusions;
}
/**
* Get the array containing tiles with exclusions.
*/
CudaArray& getExclusionTiles() {
return *exclusionTiles;
}
/**
* Get the array containing the index into the exclusion array for each tile.
*/
......@@ -217,9 +225,17 @@ public:
return numTiles;
}
/**
* Set the range of tiles that should be processed by this context.
* Set whether to add padding to the cutoff distance when building the neighbor list.
* This increases the size of the neighbor list (and thus the cost of computing interactions),
* but also means we don't need to rebuild it every time step. The default value is true,
* since usually this improves performance. For very expensive interactions, however,
* it may be better to set this to false.
*/
void setUsePadding(bool padding);
/**
* Set the range of atom blocks and tiles that should be processed by this context.
*/
void setTileRange(int startTileIndex, int numTiles);
void setAtomBlockRange(double startFraction, double endFraction);
/**
* Create a Kernel for evaluating a nonbonded interaction. Cutoffs and periodic boundary conditions
* are assumed to be the same as those for the default interaction Kernel, since this kernel will use
......@@ -232,42 +248,38 @@ public:
* @param isSymmetric specifies whether the interaction is symmetric
*/
CUfunction createInteractionKernel(const std::string& source, std::vector<ParameterInfo>& params, std::vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric);
/**
* This is a utility routine for locating data in the exclusions array. It takes the (x,y) indices of a tile,
* and returns the location in the array where the data for that tile begins.
*
* This routine requires that x >= y. If not, it will throw an exception.
*
* @param x the x index of the tile
* @param y the y index of the tile
* @param exclusionIndices the content of the exclusionIndices array
* @param exclusionRowIndices the content of the exclusionRowIndices array
* @return the index in the exclusions array at which the data for that tile begins
*/
static int findExclusionIndex(int x, int y, const std::vector<unsigned int>& exclusionIndices, const std::vector<unsigned int>& exclusionRowIndices);
private:
class BlockSortTrait;
CudaContext& context;
CUfunction forceKernel;
CUfunction findBlockBoundsKernel;
CUfunction sortBoxDataKernel;
CUfunction findInteractingBlocksKernel;
CUfunction findInteractionsWithinBlocksKernel;
CudaArray* exclusionTiles;
CudaArray* exclusions;
CudaArray* exclusionIndices;
CudaArray* exclusionRowIndices;
CudaArray* interactingTiles;
CudaArray* interactionFlags;
CudaArray* interactingAtoms;
CudaArray* interactionCount;
CudaArray* blockCenter;
CudaArray* blockBoundingBox;
std::vector<void*> forceArgs, findBlockBoundsArgs, findInteractingBlocksArgs, findInteractionsWithinBlocksArgs;
CudaArray* sortedBlocks;
CudaArray* sortedBlockCenter;
CudaArray* sortedBlockBoundingBox;
CudaArray* oldPositions;
CudaArray* rebuildNeighborList;
CudaSort* blockSorter;
std::vector<void*> forceArgs, findBlockBoundsArgs, sortBoxDataArgs, findInteractingBlocksArgs;
std::vector<std::vector<int> > atomExclusions;
std::vector<ParameterInfo> parameters;
std::vector<ParameterInfo> arguments;
std::string kernelSource;
std::map<std::string, std::string> kernelDefines;
double cutoff;
bool useCutoff, usePeriodic, anyExclusions;
int startTileIndex, numTiles, maxTiles, numForceThreadBlocks, forceThreadBlockSize, nonbondedForceGroup, numAtoms;
bool useCutoff, usePeriodic, anyExclusions, usePadding;
int startTileIndex, numTiles, startBlockIndex, numBlocks, maxTiles, numForceThreadBlocks, forceThreadBlockSize, nonbondedForceGroup, numAtoms;
};
/**
......
......@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2011-2012 Stanford University and the Authors. *
* Portions copyright (c) 2011-2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -118,7 +118,7 @@ private:
};
CudaParallelCalcForcesAndEnergyKernel::CudaParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, CudaPlatform::PlatformData& data) :
CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextTiles(data.contexts.size()), contextForces(NULL),
CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextNonbondedFractions(data.contexts.size()), contextForces(NULL),
pinnedPositionBuffer(NULL), pinnedForceBuffer(NULL) {
for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new CudaCalcForcesAndEnergyKernel(name, platform, *data.contexts[i])));
......@@ -141,6 +141,8 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
sumKernel = cu.getKernel(module, "sumForces");
for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).initialize(system);
for (int i = 0; i < (int) contextNonbondedFractions.size(); i++)
contextNonbondedFractions[i] = 1/(double) contextNonbondedFractions.size();
}
void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
......@@ -184,30 +186,26 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
void* args[] = {&cu.getForce().getDevicePointer(), &contextForces->getDevicePointer(), &bufferSize, &numBuffers};
cu.executeKernel(sumKernel, args, bufferSize);
// Balance work between the contexts by transferring a few nonbonded tiles from the context that
// Balance work between the contexts by transferring a little nonbonded work from the context that
// finished last to the one that finished first.
int firstIndex = 0, lastIndex = 0;
int totalTiles = 0;
for (int i = 0; i < (int) completionTimes.size(); i++) {
if (completionTimes[i] < completionTimes[firstIndex])
firstIndex = i;
if (completionTimes[i] > completionTimes[lastIndex])
lastIndex = i;
contextTiles[i] = data.contexts[i]->getNonbondedUtilities().getNumTiles();
totalTiles += contextTiles[i];
}
int tilesToTransfer = totalTiles/1000;
if (tilesToTransfer < 1)
tilesToTransfer = 1;
if (tilesToTransfer > contextTiles[lastIndex])
tilesToTransfer = contextTiles[lastIndex];
contextTiles[firstIndex] += tilesToTransfer;
contextTiles[lastIndex] -= tilesToTransfer;
int startIndex = 0;
for (int i = 0; i < (int) contextTiles.size(); i++) {
data.contexts[i]->getNonbondedUtilities().setTileRange(startIndex, contextTiles[i]);
startIndex += contextTiles[i];
double fractionToTransfer = min(0.001, contextNonbondedFractions[lastIndex]);
contextNonbondedFractions[firstIndex] += fractionToTransfer;
contextNonbondedFractions[lastIndex] -= fractionToTransfer;
double startFraction = 0.0;
for (int i = 0; i < (int) contextNonbondedFractions.size(); i++) {
double endFraction = startFraction+contextNonbondedFractions[i];
if (i == contextNonbondedFractions.size()-1)
endFraction = 1.0; // Avoid roundoff error
data.contexts[i]->getNonbondedUtilities().setAtomBlockRange(startFraction, endFraction);
startFraction = endFraction;
}
}
return energy;
......
......@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. *
* *
* Portions copyright (c) 2011-2012 Stanford University and the Authors. *
* Portions copyright (c) 2011-2013 Stanford University and the Authors. *
* Authors: Peter Eastman *
* Contributors: *
* *
......@@ -80,7 +80,7 @@ private:
CudaPlatform::PlatformData& data;
std::vector<Kernel> kernels;
std::vector<long long> completionTimes;
std::vector<int> contextTiles;
std::vector<double> contextNonbondedFractions;
CudaArray* contextForces;
void* pinnedPositionBuffer;
long long* pinnedForceBuffer;
......
......@@ -32,7 +32,7 @@ using namespace OpenMM;
using namespace std;
CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length) : context(context), trait(trait),
dataRange(NULL), bucketOfElement(NULL), offsetInBucket(NULL), bucketOffset(NULL), buckets(NULL) {
dataRange(NULL), bucketOfElement(NULL), offsetInBucket(NULL), bucketOffset(NULL), buckets(NULL), dataLength(length) {
// Create kernels.
map<string, string> replacements;
......@@ -43,6 +43,7 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
replacements["MAX_KEY"] = trait->getMaxKey();
replacements["MAX_VALUE"] = trait->getMaxValue();
CUmodule module = context.createModule(context.replaceStrings(CudaKernelSources::sort, replacements));
shortListKernel = context.getKernel(module, "sortShortList");
computeRangeKernel = context.getKernel(module, "computeRange");
assignElementsKernel = context.getKernel(module, "assignElementsToBuckets");
computeBucketPositionsKernel = context.getKernel(module, "computeBucketPositions");
......@@ -53,15 +54,16 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
int maxBlockSize;
cuDeviceGetAttribute(&maxBlockSize, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, context.getDevice());
int maxSharedMem;
cuDeviceGetAttribute(&maxSharedMem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, context.getDevice());
unsigned int maxLocalBuffer = (unsigned int) ((maxSharedMem/trait->getDataSize())/2);
isShortList = (length <= maxLocalBuffer);
for (rangeKernelSize = 1; rangeKernelSize*2 <= maxBlockSize; rangeKernelSize *= 2)
;
positionsKernelSize = rangeKernelSize;
sortKernelSize = rangeKernelSize/2;
sortKernelSize = (isShortList ? rangeKernelSize/2 : rangeKernelSize/4);
if (rangeKernelSize > length)
rangeKernelSize = length;
int maxSharedMem;
cuDeviceGetAttribute(&maxSharedMem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, context.getDevice());
unsigned int maxLocalBuffer = (unsigned int) ((maxSharedMem/trait->getDataSize())/2);
if (sortKernelSize > maxLocalBuffer)
sortKernelSize = maxLocalBuffer;
unsigned int targetBucketSize = sortKernelSize/2;
......@@ -73,11 +75,13 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
// Create workspace arrays.
dataRange = new CudaArray(context, 2, trait->getKeySize(), "sortDataRange");
bucketOffset = CudaArray::create<uint1>(context, numBuckets, "bucketOffset");
bucketOfElement = CudaArray::create<uint1>(context, length, "bucketOfElement");
offsetInBucket = CudaArray::create<uint1>(context, length, "offsetInBucket");
buckets = new CudaArray(context, length, trait->getDataSize(), "buckets");
if (!isShortList) {
dataRange = new CudaArray(context, 2, trait->getKeySize(), "sortDataRange");
bucketOffset = CudaArray::create<uint1>(context, numBuckets, "bucketOffset");
bucketOfElement = CudaArray::create<uint1>(context, length, "bucketOfElement");
offsetInBucket = CudaArray::create<uint1>(context, length, "offsetInBucket");
buckets = new CudaArray(context, length, trait->getDataSize(), "buckets");
}
}
CudaSort::~CudaSort() {
......@@ -95,38 +99,44 @@ CudaSort::~CudaSort() {
}
void CudaSort::sort(CudaArray& data) {
if (data.getSize() != bucketOfElement->getSize() || data.getElementSize() != trait->getDataSize())
if (data.getSize() != dataLength || data.getElementSize() != trait->getDataSize())
throw OpenMMException("CudaSort called with different data size");
if (data.getSize() == 0)
return;
if (isShortList) {
// We can use a simpler sort kernel that does the entire operation at once in local memory.
void* sortArgs[] = {&data.getDevicePointer(), &dataLength};
context.executeKernel(shortListKernel, sortArgs, sortKernelSize, sortKernelSize, dataLength*trait->getDataSize());
}
else {
// Compute the range of data values.
// Compute the range of data values.
unsigned int dataSize = data.getSize();
void* rangeArgs[] = {&data.getDevicePointer(), &dataSize, &dataRange->getDevicePointer()};
context.executeKernel(computeRangeKernel, rangeArgs, rangeKernelSize, rangeKernelSize, rangeKernelSize*trait->getKeySize());
void* rangeArgs[] = {&data.getDevicePointer(), &dataLength, &dataRange->getDevicePointer()};
context.executeKernel(computeRangeKernel, rangeArgs, rangeKernelSize, rangeKernelSize, rangeKernelSize*trait->getKeySize());
// Assign array elements to buckets.
// Assign array elements to buckets.
unsigned int numBuckets = bucketOffset->getSize();
context.clearBuffer(*bucketOffset);
void* elementsArgs[] = {&data.getDevicePointer(), &dataSize, &numBuckets, &dataRange->getDevicePointer(),
&bucketOffset->getDevicePointer(), &bucketOfElement->getDevicePointer(), &offsetInBucket->getDevicePointer()};
context.executeKernel(assignElementsKernel, elementsArgs, data.getSize());
unsigned int numBuckets = bucketOffset->getSize();
context.clearBuffer(*bucketOffset);
void* elementsArgs[] = {&data.getDevicePointer(), &dataLength, &numBuckets, &dataRange->getDevicePointer(),
&bucketOffset->getDevicePointer(), &bucketOfElement->getDevicePointer(), &offsetInBucket->getDevicePointer()};
context.executeKernel(assignElementsKernel, elementsArgs, data.getSize());
// Compute the position of each bucket.
// Compute the position of each bucket.
void* computeArgs[] = {&numBuckets, &bucketOffset->getDevicePointer()};
context.executeKernel(computeBucketPositionsKernel, computeArgs, positionsKernelSize, positionsKernelSize, positionsKernelSize*sizeof(int));
void* computeArgs[] = {&numBuckets, &bucketOffset->getDevicePointer()};
context.executeKernel(computeBucketPositionsKernel, computeArgs, positionsKernelSize, positionsKernelSize, positionsKernelSize*sizeof(int));
// Copy the data into the buckets.
// Copy the data into the buckets.
void* copyArgs[] = {&data.getDevicePointer(), &buckets->getDevicePointer(), &dataSize, &bucketOffset->getDevicePointer(),
&bucketOfElement->getDevicePointer(), &offsetInBucket->getDevicePointer()};
context.executeKernel(copyToBucketsKernel, copyArgs, data.getSize());
void* copyArgs[] = {&data.getDevicePointer(), &buckets->getDevicePointer(), &dataLength, &bucketOffset->getDevicePointer(),
&bucketOfElement->getDevicePointer(), &offsetInBucket->getDevicePointer()};
context.executeKernel(copyToBucketsKernel, copyArgs, data.getSize());
// Sort each bucket.
// Sort each bucket.
void* sortArgs[] = {&data.getDevicePointer(), &buckets->getDevicePointer(), &numBuckets, &bucketOffset->getDevicePointer()};
context.executeKernel(sortBucketsKernel, sortArgs, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize, sortKernelSize*trait->getDataSize());
void* sortArgs[] = {&data.getDevicePointer(), &buckets->getDevicePointer(), &numBuckets, &bucketOffset->getDevicePointer()};
context.executeKernel(sortBucketsKernel, sortArgs, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize, sortKernelSize*trait->getDataSize());
}
}
......@@ -92,8 +92,9 @@ private:
CudaArray* offsetInBucket;
CudaArray* bucketOffset;
CudaArray* buckets;
CUfunction computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel;
unsigned int rangeKernelSize, positionsKernelSize, sortKernelSize;
CUfunction shortListKernel, computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel;
unsigned int dataLength, rangeKernelSize, positionsKernelSize, sortKernelSize;
bool isShortList;
};
/**
......
#if USE_EWALD
bool needCorrection = isExcluded && atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS;
bool needCorrection = hasExclusions && isExcluded && atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS;
if (!isExcluded || needCorrection) {
real tempForce = 0.0f;
if (r2 < CUTOFF_SQUARED || needCorrection) {
const real alphaR = EWALD_ALPHA*r;
const real expAlphaRSqr = EXP(-alphaR*alphaR);
......@@ -16,6 +15,7 @@ if (!isExcluded || needCorrection) {
t *= t;
t *= t;
const real erfcAlphaR = RECIP(t*t);
real tempForce = 0.0f;
if (needCorrection) {
// Subtract off the part of this interaction that was included in the reciprocal space contribution.
......@@ -36,8 +36,8 @@ if (!isExcluded || needCorrection) {
tempEnergy += prefactor*erfcAlphaR;
#endif
}
dEdR += tempForce*invR*invR;
}
dEdR += tempForce*invR*invR;
}
#else
{
......
#define STORE_DERIVATIVE_1(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (deriv##INDEX##_1*0x100000000)));
#define STORE_DERIVATIVE_2(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].deriv##INDEX*0x100000000)));
#define TILE_SIZE 32
typedef struct {
real4 posq;
......@@ -15,169 +14,305 @@ typedef struct {
* Compute a force based on pair interactions.
*/
extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer,
const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions, const unsigned int* __restrict__ exclusionIndices,
const unsigned int* __restrict__ exclusionRowIndices,
const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions, const ushort2* __restrict__ exclusionTiles,
#ifdef USE_CUTOFF
const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags
const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const real4* __restrict__ blockCenter, const unsigned int* __restrict__ interactingAtoms
#else
unsigned int numTiles
#endif
PARAMETER_ARGUMENTS) {
unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
unsigned int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
unsigned int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
#else
unsigned int pos = warp*numTiles/totalWarps;
unsigned int end = (warp+1)*numTiles/totalWarps;
#endif
const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx;
real energy = 0;
unsigned int lasty = 0xFFFFFFFF;
__shared__ AtomData localData[THREAD_BLOCK_SIZE];
__shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
__shared__ int exclusionIndex[WARPS_PER_GROUP];
// First loop: process tiles that contain exclusions.
do {
// Extract the coordinates of this tile
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx;
const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
unsigned int x, y;
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
real3 force = make_real3(0);
DECLARE_ATOM1_DERIVATIVES
if (pos < end) {
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
y = tileIndices.y;
}
else
unsigned int atom1 = x*TILE_SIZE + tgx;
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
unsigned int atom1 = x*TILE_SIZE + tgx;
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
if (x == y) {
// This tile is on the diagonal.
const unsigned int localAtomIndex = threadIdx.x;
localData[localAtomIndex].posq = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1
for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+j;
real4 posq2 = localData[atom2].posq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
real dEdR = 0;
real tempEnergy = 0;
#ifdef USE_EXCLUSIONS
if (tgx < 2)
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
#else
bool hasExclusions = false;
bool isExcluded = !(excl & 0x1);
#endif
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
COMPUTE_INTERACTION
dEdR /= -r;
}
energy += 0.5f*tempEnergy;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
#ifdef USE_CUTOFF
}
#endif
if (pos >= end)
; // This warp is done.
else if (x == y) {
// This tile is on the diagonal.
const unsigned int localAtomIndex = threadIdx.x;
localData[localAtomIndex].posq = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
excl >>= 1;
#endif
for (unsigned int j = 0; j < TILE_SIZE; j++) {
}
}
else {
// This is an off-diagonal tile.
const unsigned int localAtomIndex = threadIdx.x;
unsigned int j = y*TILE_SIZE + tgx;
localData[localAtomIndex].posq = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData[localAtomIndex].force = make_real3(0);
CLEAR_LOCAL_DERIVATIVES
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
#endif
int atom2 = tbx+j;
real4 posq2 = localData[atom2].posq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj;
real4 posq2 = localData[atom2].posq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
atom2 = y*TILE_SIZE+tj;
real dEdR = 0;
real tempEnergy = 0;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_INTERACTION
dEdR /= -r;
}
energy += 0.5f*tempEnergy;
energy += tempEnergy;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
atom2 = tbx+tj;
localData[atom2].force.x += delta.x;
localData[atom2].force.y += delta.y;
localData[atom2].force.z += delta.z;
RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
}
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
excl >>= 1;
#endif
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
else {
// This is an off-diagonal tile.
const unsigned int localAtomIndex = threadIdx.x;
if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx;
localData[localAtomIndex].posq = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
localData[localAtomIndex].force = make_real3(0);
CLEAR_LOCAL_DERIVATIVES
}
// Write results.
unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
STORE_DERIVATIVES_1
if (x != y) {
offset = y*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
STORE_DERIVATIVES_2
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (!hasExclusions && flags == 0) {
// No interactions in this tile.
unsigned int numTiles = interactionCount[0];
int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
#else
int pos = warp*numTiles/totalWarps;
int end = (warp+1)*numTiles/totalWarps;
#endif
int skipBase = 0;
int currentSkipIndex = tbx;
__shared__ int atomIndices[THREAD_BLOCK_SIZE];
__shared__ int skipTiles[THREAD_BLOCK_SIZE];
skipTiles[threadIdx.x] = -1;
while (pos < end) {
const bool isExcluded = false;
real3 force = make_real3(0);
DECLARE_ATOM1_DERIVATIVES
bool includeTile = true;
// Extract the coordinates of this tile.
unsigned int x, y;
bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
singlePeriodicCopy = tileIndices.y;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
// Skip over tiles that have exclusions, since they were already processed.
while (skipTiles[tbx+TILE_SIZE-1] < pos) {
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx];
skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
skipTiles[threadIdx.x] = end;
skipBase += TILE_SIZE;
currentSkipIndex = tbx;
}
while (skipTiles[currentSkipIndex] < pos)
currentSkipIndex++;
includeTile = (skipTiles[currentSkipIndex] != pos);
}
if (includeTile) {
unsigned int atom1 = x*TILE_SIZE + tgx;
// Load atom data for this tile.
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
const unsigned int localAtomIndex = threadIdx.x;
#ifdef USE_CUTOFF
unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
#else
unsigned int j = y*TILE_SIZE + tgx;
#endif
{
// Compute the full set of interactions in this tile.
atomIndices[threadIdx.x] = j;
if (j < PADDED_NUM_ATOMS) {
localData[localAtomIndex].posq = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData[localAtomIndex].force = make_real3(0);
CLEAR_LOCAL_DERIVATIVES
}
#ifdef USE_PERIODIC
if (singlePeriodicCopy) {
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
#ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
real4 blockCenterX = blockCenter[x];
posq1.x -= floor((posq1.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
posq1.y -= floor((posq1.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
posq1.z -= floor((posq1.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
localData[threadIdx.x].posq.x -= floor((localData[threadIdx.x].posq.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
localData[threadIdx.x].posq.y -= floor((localData[threadIdx.x].posq.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
localData[threadIdx.x].posq.z -= floor((localData[threadIdx.x].posq.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj;
real4 posq2 = localData[atom2].posq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = atomIndices[tbx+tj];
real dEdR = 0;
real tempEnergy = 0;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_INTERACTION
dEdR /= -r;
}
energy += tempEnergy;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
atom2 = tbx+tj;
localData[atom2].force.x += delta.x;
localData[atom2].force.y += delta.y;
localData[atom2].force.z += delta.z;
RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
}
#endif
int atom2 = tbx+tj;
real4 posq2 = localData[atom2].posq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
else
#endif
{
// We need to apply periodic boundary conditions separately for each interaction.
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj;
real4 posq2 = localData[atom2].posq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+tj;
atom2 = atomIndices[tbx+tj];
real dEdR = 0;
real tempEnergy = 0;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
......@@ -195,35 +330,33 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
localData[atom2].force.z += delta.z;
RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
tj = (tj + 1) & (TILE_SIZE - 1);
}
#endif
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
}
lasty = y;
// Write results.
if (pos < end) {
const unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
// Write results.
atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
unsigned int offset = atom1;
STORE_DERIVATIVES_1
}
if (pos < end && x != y) {
const unsigned int offset = y*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
STORE_DERIVATIVES_2
#ifdef USE_CUTOFF
unsigned int atom2 = atomIndices[threadIdx.x];
#else
unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
if (atom2 < PADDED_NUM_ATOMS) {
atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
offset = atom2;
STORE_DERIVATIVES_2
}
}
pos++;
} while (pos < end);
}
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
}
#define TILE_SIZE 32
typedef struct {
real4 posq;
real value, temp;
......@@ -13,238 +11,295 @@ typedef struct {
* Compute a value based on pair interactions.
*/
extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions,
const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices, unsigned long long* __restrict__ global_value,
const ushort2* __restrict__ exclusionTiles, unsigned long long* __restrict__ global_value,
#ifdef USE_CUTOFF
const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags
const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const real4* __restrict__ blockCenter, const unsigned int* __restrict__ interactingAtoms
#else
unsigned int numTiles
#endif
PARAMETER_ARGUMENTS) {
unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
unsigned int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
unsigned int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
#else
unsigned int pos = warp*numTiles/totalWarps;
unsigned int end = (warp+1)*numTiles/totalWarps;
#endif
real energy = 0;
unsigned int lasty = 0xFFFFFFFF;
const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx;
__shared__ AtomData localData[THREAD_BLOCK_SIZE];
__shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
__shared__ int exclusionIndex[WARPS_PER_GROUP];
// First loop: process tiles that contain exclusions.
do {
// Extract the coordinates of this tile
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx;
const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
unsigned int x, y;
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
real value = 0;
if (pos < end) {
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
y = tileIndices.y;
}
else
unsigned int atom1 = x*TILE_SIZE + tgx;
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
unsigned int atom1 = x*TILE_SIZE + tgx;
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
if (x == y) {
// This tile is on the diagonal.
const unsigned int localAtomIndex = threadIdx.x;
localData[localAtomIndex].posq = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1
for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+j;
real4 posq2 = localData[atom2].posq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
real tempValue1 = 0;
real tempValue2 = 0;
#ifdef USE_EXCLUSIONS
if (tgx < 2)
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
if (!isExcluded && atom1 != atom2) {
#else
bool hasExclusions = false;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#endif
COMPUTE_VALUE
}
value += tempValue1;
#ifdef USE_CUTOFF
}
#endif
if (pos >= end)
; // This warp is done.
else if (x == y) {
// This tile is on the diagonal.
const unsigned int localAtomIndex = threadIdx.x;
localData[localAtomIndex].posq = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
excl >>= 1;
#endif
for (unsigned int j = 0; j < TILE_SIZE; j++) {
}
}
else {
// This is an off-diagonal tile.
const unsigned int localAtomIndex = threadIdx.x;
unsigned int j = y*TILE_SIZE + tgx;
localData[localAtomIndex].posq = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData[localAtomIndex].value = 0;
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
#endif
int atom2 = tbx+j;
real4 posq2 = localData[atom2].posq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj;
real4 posq2 = localData[atom2].posq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
atom2 = y*TILE_SIZE+tj;
real tempValue1 = 0;
real tempValue2 = 0;
#ifdef USE_EXCLUSIONS
if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
if (!isExcluded) {
#else
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#endif
COMPUTE_VALUE
}
value += tempValue1;
localData[tbx+tj].value += tempValue2;
#ifdef USE_CUTOFF
}
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
excl >>= 1;
#endif
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
else {
// This is an off-diagonal tile.
}
// Write results.
unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&global_value[offset], static_cast<unsigned long long>((long long) (value*0x100000000)));
if (x != y) {
offset = y*TILE_SIZE + tgx;
atomicAdd(&global_value[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].value*0x100000000)));
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx;
localData[threadIdx.x].posq = posq[j];
const unsigned int localAtomIndex = threadIdx.x;
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
localData[threadIdx.x].value = 0;
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (!hasExclusions && flags != 0xFFFFFFFF) {
if (flags == 0) {
// No interactions in this tile.
}
else {
// Compute only a subset of the interactions in this tile.
unsigned int numTiles = interactionCount[0];
int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
#else
int pos = warp*numTiles/totalWarps;
int end = (warp+1)*numTiles/totalWarps;
#endif
int skipBase = 0;
int currentSkipIndex = tbx;
__shared__ int atomIndices[THREAD_BLOCK_SIZE];
__shared__ int skipTiles[THREAD_BLOCK_SIZE];
skipTiles[threadIdx.x] = -1;
while (pos < end) {
real value = 0;
bool includeTile = true;
// Extract the coordinates of this tile.
unsigned int x, y;
bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
singlePeriodicCopy = tileIndices.y;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
int atom2 = tbx+j;
real4 posq2 = localData[atom2].posq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real tempValue1 = 0;
real tempValue2 = 0;
if (r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_VALUE
}
value += tempValue1;
}
localData[threadIdx.x].temp = tempValue2;
// Skip over tiles that have exclusions, since they were already processed.
// Sum the forces on atom2.
while (skipTiles[tbx+TILE_SIZE-1] < pos) {
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx];
skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
skipTiles[threadIdx.x] = end;
skipBase += TILE_SIZE;
currentSkipIndex = tbx;
}
while (skipTiles[currentSkipIndex] < pos)
currentSkipIndex++;
includeTile = (skipTiles[currentSkipIndex] != pos);
}
if (includeTile) {
unsigned int atom1 = x*TILE_SIZE + tgx;
// Load atom data for this tile.
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
const unsigned int localAtomIndex = threadIdx.x;
#ifdef USE_CUTOFF
unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
#else
unsigned int j = y*TILE_SIZE + tgx;
#endif
atomIndices[threadIdx.x] = j;
if (j < PADDED_NUM_ATOMS) {
localData[localAtomIndex].posq = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData[localAtomIndex].value = 0;
}
#ifdef USE_PERIODIC
if (singlePeriodicCopy) {
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
if (tgx % 4 == 0)
localData[threadIdx.x].temp += localData[threadIdx.x+1].temp+localData[threadIdx.x+2].temp+localData[threadIdx.x+3].temp;
if (tgx == 0)
localData[tbx+j].value += localData[threadIdx.x].temp+localData[threadIdx.x+4].temp+localData[threadIdx.x+8].temp+localData[threadIdx.x+12].temp+localData[threadIdx.x+16].temp+localData[threadIdx.x+20].temp+localData[threadIdx.x+24].temp+localData[threadIdx.x+28].temp;
}
real4 blockCenterX = blockCenter[x];
posq1.x -= floor((posq1.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
posq1.y -= floor((posq1.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
posq1.z -= floor((posq1.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
localData[threadIdx.x].posq.x -= floor((localData[threadIdx.x].posq.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
localData[threadIdx.x].posq.y -= floor((localData[threadIdx.x].posq.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
localData[threadIdx.x].posq.z -= floor((localData[threadIdx.x].posq.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj;
real4 posq2 = localData[atom2].posq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
if (r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = atomIndices[tbx+tj];
real tempValue1 = 0;
real tempValue2 = 0;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_VALUE
}
value += tempValue1;
localData[tbx+tj].value += tempValue2;
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
else
}
else
#endif
{
// Compute the full set of interactions in this tile.
{
// We need to apply periodic boundary conditions separately for each interaction.
#ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
#endif
unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = tbx+tj;
real4 posq2 = localData[atom2].posq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj;
real4 posq2 = localData[atom2].posq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+tj;
atom2 = atomIndices[tbx+tj];
real tempValue1 = 0;
real tempValue2 = 0;
#ifdef USE_EXCLUSIONS
if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#else
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#endif
COMPUTE_VALUE
}
value += tempValue1;
localData[tbx+tj].value += tempValue2;
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
tj = (tj + 1) & (TILE_SIZE - 1);
}
#endif
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
}
// Write results.
if (pos < end) {
const unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&global_value[offset], static_cast<unsigned long long>((long long) (value*0x100000000)));
}
if (pos < end && x != y) {
const unsigned int offset = y*TILE_SIZE + tgx;
atomicAdd(&global_value[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].value*0x100000000)));
// Write results.
atomicAdd(&global_value[atom1], static_cast<unsigned long long>((long long) (value*0x100000000)));
#ifdef USE_CUTOFF
unsigned int atom2 = atomIndices[threadIdx.x];
#else
unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
if (atom2 < PADDED_NUM_ATOMS)
atomicAdd(&global_value[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].value*0x100000000)));
}
lasty = y;
pos++;
} while (pos < end);
}
}
......@@ -48,12 +48,12 @@ inline __device__ real computeAngle(real4 vec1, real4 vec2) {
real3 crossProduct = cross(vec1, vec2);
real scale = vec1.w*vec2.w;
angle = asin(SQRT(dot(crossProduct, crossProduct)/scale));
angle = ASIN(SQRT(dot(crossProduct, crossProduct)/scale));
if (cosine < 0.0f)
angle = M_PI-angle;
}
else
angle = acos(cosine);
angle = ACOS(cosine);
return angle;
}
......
......@@ -35,11 +35,11 @@ extern "C" __global__ void calculateEwaldCosSinSums(real* __restrict__ energyBuf
for (int atom = 0; atom < NUM_ATOMS; atom++) {
real4 apos = posq[atom];
real phase = apos.x*kx;
real2 structureFactor = make_real2(cos(phase), sin(phase));
real2 structureFactor = make_real2(COS(phase), SIN(phase));
phase = apos.y*ky;
structureFactor = multofReal2(structureFactor, make_real2(cos(phase), sin(phase)));
structureFactor = multofReal2(structureFactor, make_real2(COS(phase), SIN(phase)));
phase = apos.z*kz;
structureFactor = multofReal2(structureFactor, make_real2(cos(phase), sin(phase)));
structureFactor = multofReal2(structureFactor, make_real2(COS(phase), SIN(phase)));
sum += apos.w*structureFactor;
}
cosSinSum[index] = sum;
......@@ -76,9 +76,9 @@ extern "C" __global__ void calculateEwaldForces(unsigned long long* __restrict__
for (int ry = lowry; ry < KMAX_Y; ry++) {
real ky = ry*reciprocalBoxSize.y;
real phase = apos.x*kx;
real2 tab_xy = make_real2(cos(phase), sin(phase));
real2 tab_xy = make_real2(COS(phase), SIN(phase));
phase = apos.y*ky;
tab_xy = multofReal2(tab_xy, make_real2(cos(phase), sin(phase)));
tab_xy = multofReal2(tab_xy, make_real2(COS(phase), SIN(phase)));
for (int rz = lowrz; rz < KMAX_Z; rz++) {
real kz = rz*reciprocalBoxSize.z;
......@@ -88,7 +88,7 @@ extern "C" __global__ void calculateEwaldForces(unsigned long long* __restrict__
real k2 = kx*kx + ky*ky + kz*kz;
real ak = EXP(k2*EXP_COEFFICIENT)/k2;
phase = apos.z*kz;
real2 structureFactor = multofReal2(tab_xy, make_real2(cos(phase), sin(phase)));
real2 structureFactor = multofReal2(tab_xy, make_real2(COS(phase), SIN(phase)));
real2 sum = cosSinSum[index];
real dEdR = 2*reciprocalCoefficient*ak*apos.w*(sum.x*structureFactor.y - sum.y*structureFactor.x);
force.x += dEdR*kx;
......
#define TILE_SIZE 32
#define GROUP_SIZE 64
#define BUFFER_GROUPS 4
#define GROUP_SIZE 256
#define BUFFER_GROUPS 2
#define BUFFER_SIZE BUFFER_GROUPS*GROUP_SIZE
#define WARP_SIZE 32
#define INVALID 0xFFFF
/**
* Find a bounding box for the atoms in each block.
*/
extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, const real4* __restrict__ posq, real4* __restrict__ blockCenter, real4* __restrict__ blockBoundingBox, unsigned int* __restrict__ interactionCount) {
extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, const real4* __restrict__ posq,
real4* __restrict__ blockCenter, real4* __restrict__ blockBoundingBox, int* __restrict__ rebuildNeighborList, real2* __restrict__ sortedBlocks) {
int index = blockIdx.x*blockDim.x+threadIdx.x;
int base = index*TILE_SIZE;
while (base < numAtoms) {
......@@ -30,68 +32,231 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize,
minPos = make_real4(min(minPos.x,pos.x), min(minPos.y,pos.y), min(minPos.z,pos.z), 0);
maxPos = make_real4(max(maxPos.x,pos.x), max(maxPos.y,pos.y), max(maxPos.z,pos.z), 0);
}
blockBoundingBox[index] = 0.5f*(maxPos-minPos);
real4 blockSize = 0.5f*(maxPos-minPos);
blockBoundingBox[index] = blockSize;
blockCenter[index] = 0.5f*(maxPos+minPos);
sortedBlocks[index] = make_real2(blockSize.x+blockSize.y+blockSize.z, index);
index += blockDim.x*gridDim.x;
base = index*TILE_SIZE;
}
if (blockIdx.x == 0 && threadIdx.x == 0)
interactionCount[0] = 0;
rebuildNeighborList[0] = 0;
}
/**
* This is called by findBlocksWithInteractions(). It compacts the list of blocks and writes them
* to global memory.
* Sort the data about bounding boxes so it can be accessed more efficiently in the next kernel.
*/
__device__ void storeInteractionData(ushort2* buffer, int* valid, short* sum, ushort2* temp, int* baseIndex,
unsigned int* interactionCount, ushort2* interactingTiles, real4 periodicBoxSize,
real4 invPeriodicBoxSize, const real4* posq, const real4* blockCenter, const real4* blockBoundingBox, unsigned int maxTiles) {
// The buffer is full, so we need to compact it and write out results. Start by doing a parallel prefix sum.
extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, const real4* __restrict__ blockCenter,
const real4* __restrict__ blockBoundingBox, real4* __restrict__ sortedBlockCenter,
real4* __restrict__ sortedBlockBoundingBox, const real4* __restrict__ posq, const real4* __restrict__ oldPositions,
unsigned int* __restrict__ interactionCount, int* __restrict__ rebuildNeighborList) {
for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_BLOCKS; i += blockDim.x*gridDim.x) {
int index = (int) sortedBlock[i].y;
sortedBlockCenter[i] = blockCenter[index];
sortedBlockBoundingBox[i] = blockBoundingBox[index];
}
// Also check whether any atom has moved enough so that we really need to rebuild the neighbor list.
bool rebuild = false;
for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
real4 delta = oldPositions[i]-posq[i];
if (delta.x*delta.x + delta.y*delta.y + delta.z*delta.z > 0.25f*PADDING*PADDING)
rebuild = true;
}
if (rebuild) {
rebuildNeighborList[0] = 1;
interactionCount[0] = 0;
}
}
for (int i = threadIdx.x; i < BUFFER_SIZE; i += GROUP_SIZE)
temp[i].x = (valid[i] ? 1 : 0);
/**
* Perform a parallel prefix sum over an array. The input values are all assumed to be 0 or 1.
*/
__device__ void prefixSum(short* sum, ushort2* temp) {
#if __CUDA_ARCH__ >= 300
const int indexInWarp = threadIdx.x%WARP_SIZE;
const int warpMask = (2<<indexInWarp)-1;
for (int base = 0; base < BUFFER_SIZE; base += blockDim.x)
temp[base+threadIdx.x].x = __popc(__ballot(sum[base+threadIdx.x])&warpMask);
__syncthreads();
if (threadIdx.x < BUFFER_SIZE/WARP_SIZE) {
int multiWarpSum = temp[(threadIdx.x+1)*WARP_SIZE-1].x;
for (int offset = 1; offset < BUFFER_SIZE/WARP_SIZE; offset *= 2) {
short n = __shfl_up(multiWarpSum, offset, WARP_SIZE);
if (indexInWarp >= offset)
multiWarpSum += n;
}
temp[threadIdx.x].y = multiWarpSum;
}
__syncthreads();
for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
sum[i] = temp[i].x+(i < WARP_SIZE ? 0 : temp[i/WARP_SIZE-1].y);
__syncthreads();
#else
for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
temp[i].x = sum[i];
__syncthreads();
int whichBuffer = 0;
for (int offset = 1; offset < BUFFER_SIZE; offset *= 2) {
if (whichBuffer == 0)
for (int i = threadIdx.x; i < BUFFER_SIZE; i += GROUP_SIZE)
for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
temp[i].y = (i < offset ? temp[i].x : temp[i].x+temp[i-offset].x);
else
for (int i = threadIdx.x; i < BUFFER_SIZE; i += GROUP_SIZE)
for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
temp[i].x = (i < offset ? temp[i].y : temp[i].y+temp[i-offset].y);
whichBuffer = 1-whichBuffer;
__syncthreads();
}
if (whichBuffer == 0)
for (int i = threadIdx.x; i < BUFFER_SIZE; i += GROUP_SIZE)
for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
sum[i] = temp[i].x;
else
for (int i = threadIdx.x; i < BUFFER_SIZE; i += GROUP_SIZE)
for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
sum[i] = temp[i].y;
__syncthreads();
int numValid = sum[BUFFER_SIZE-1];
__syncthreads();
// Compact the buffer.
#endif
}
for (int i = threadIdx.x; i < BUFFER_SIZE; i += GROUP_SIZE)
if (valid[i]) {
temp[sum[i]-1] = buffer[i];
sum[i] = valid[i];
valid[i] = false;
buffer[i] = make_ushort2(1, 1);
/**
* This is called by findBlocksWithInteractions(). It compacts the list of blocks, identifies interactions
* in them, and writes the result to global memory.
*/
__device__ void storeInteractionData(unsigned short x, unsigned short* buffer, short* sum, ushort2* temp, int* atoms, int& numAtoms,
int& baseIndex, unsigned int* interactionCount, ushort2* interactingTiles, unsigned int* interactingAtoms, real4 periodicBoxSize,
real4 invPeriodicBoxSize, const real4* posq, real3* posBuffer, real4 blockCenterX, real4 blockSizeX, unsigned int maxTiles, bool finish) {
const bool singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= PADDED_CUTOFF &&
0.5f*periodicBoxSize.y-blockSizeX.y >= PADDED_CUTOFF &&
0.5f*periodicBoxSize.z-blockSizeX.z >= PADDED_CUTOFF);
if (threadIdx.x < TILE_SIZE) {
real3 pos = trimTo3(posq[x*TILE_SIZE+threadIdx.x]);
posBuffer[threadIdx.x] = pos;
#ifdef USE_PERIODIC
if (singlePeriodicCopy) {
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
pos.x -= floor((pos.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
pos.y -= floor((pos.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
pos.z -= floor((pos.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
posBuffer[threadIdx.x] = pos;
}
#endif
}
// The buffer is full, so we need to compact it and write out results. Start by doing a parallel prefix sum.
for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
sum[i] = (buffer[i] == INVALID ? 0 : 1);
__syncthreads();
prefixSum(sum, temp);
int numValid = sum[BUFFER_SIZE-1];
// Store it to global memory.
// Compact the buffer.
if (threadIdx.x == 0)
*baseIndex = atomicAdd(interactionCount, numValid);
for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
if (buffer[i] != INVALID)
temp[sum[i]-1].x = buffer[i];
__syncthreads();
if (*baseIndex+numValid <= maxTiles)
for (int i = threadIdx.x; i < numValid; i += GROUP_SIZE)
interactingTiles[*baseIndex+i] = temp[i];
for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
buffer[i] = temp[i].x;
__syncthreads();
// Loop over the tiles and find specific interactions in them.
const int indexInWarp = threadIdx.x%WARP_SIZE;
for (int base = 0; base < numValid; base += BUFFER_SIZE/WARP_SIZE) {
for (int i = threadIdx.x/WARP_SIZE; i < BUFFER_SIZE/WARP_SIZE && base+i < numValid; i += GROUP_SIZE/WARP_SIZE) {
// Check each atom in block Y for interactions.
real3 pos = trimTo3(posq[buffer[base+i]*TILE_SIZE+indexInWarp]);
#ifdef USE_PERIODIC
if (singlePeriodicCopy) {
pos.x -= floor((pos.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
pos.y -= floor((pos.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
pos.z -= floor((pos.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
}
#endif
bool interacts = false;
#ifdef USE_PERIODIC
if (!singlePeriodicCopy) {
for (int j = 0; j < TILE_SIZE; j++) {
real3 delta = pos-posBuffer[j];
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
}
}
else {
#endif
for (int j = 0; j < TILE_SIZE; j++) {
real3 delta = pos-posBuffer[j];
interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
}
#ifdef USE_PERIODIC
}
#endif
sum[i*WARP_SIZE+indexInWarp] = (interacts ? 1 : 0);
}
for (int i = numValid-base+threadIdx.x/WARP_SIZE; i < BUFFER_SIZE/WARP_SIZE; i += GROUP_SIZE/WARP_SIZE)
sum[i*WARP_SIZE+indexInWarp] = 0;
// Compact the list of atoms.
__syncthreads();
prefixSum(sum, temp);
for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
if (sum[i] != (i == 0 ? 0 : sum[i-1]))
atoms[numAtoms+sum[i]-1] = buffer[base+i/WARP_SIZE]*TILE_SIZE+indexInWarp;
// Store them to global memory.
int atomsToStore = numAtoms+sum[BUFFER_SIZE-1];
bool storePartialTile = (finish && base >= numValid-BUFFER_SIZE/WARP_SIZE);
int tilesToStore = (storePartialTile ? (atomsToStore+TILE_SIZE-1)/TILE_SIZE : atomsToStore/TILE_SIZE);
if (tilesToStore > 0) {
if (threadIdx.x == 0)
baseIndex = atomicAdd(interactionCount, tilesToStore);
__syncthreads();
if (threadIdx.x == 0)
numAtoms = atomsToStore-tilesToStore*TILE_SIZE;
if (baseIndex+tilesToStore <= maxTiles) {
if (threadIdx.x < tilesToStore)
interactingTiles[baseIndex+threadIdx.x] = make_ushort2(x, singlePeriodicCopy);
for (int i = threadIdx.x; i < tilesToStore*TILE_SIZE; i += blockDim.x)
interactingAtoms[baseIndex*TILE_SIZE+i] = (i < atomsToStore ? atoms[i] : NUM_ATOMS);
}
}
else {
__syncthreads();
if (threadIdx.x == 0)
numAtoms += sum[BUFFER_SIZE-1];
}
__syncthreads();
if (threadIdx.x < numAtoms && !storePartialTile)
atoms[threadIdx.x] = atoms[tilesToStore*TILE_SIZE+threadIdx.x];
}
if (numValid == 0 && numAtoms > 0 && finish) {
// We didn't have any more tiles to process, but there were some atoms left over from a
// previous call to this function. Save them now.
if (threadIdx.x == 0)
baseIndex = atomicAdd(interactionCount, 1);
__syncthreads();
if (baseIndex < maxTiles) {
if (threadIdx.x == 0)
interactingTiles[baseIndex] = make_ushort2(x, singlePeriodicCopy);
if (threadIdx.x < TILE_SIZE)
interactingAtoms[baseIndex*TILE_SIZE+threadIdx.x] = (threadIdx.x < numAtoms ? atoms[threadIdx.x] : NUM_ATOMS);
}
}
// Reset the buffer for processing more tiles.
for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
buffer[i] = INVALID;
}
/**
......@@ -100,139 +265,92 @@ __device__ void storeInteractionData(ushort2* buffer, int* valid, short* sum, us
*/
extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, const real4* __restrict__ blockCenter,
const real4* __restrict__ blockBoundingBox, unsigned int* __restrict__ interactionCount, ushort2* __restrict__ interactingTiles,
unsigned int* __restrict__ interactionFlags, const real4* __restrict__ posq, unsigned int maxTiles, unsigned int startTileIndex,
unsigned int numTiles) {
__shared__ ushort2 buffer[BUFFER_SIZE];
__shared__ int valid[BUFFER_SIZE];
unsigned int* __restrict__ interactingAtoms, const real4* __restrict__ posq, unsigned int maxTiles, unsigned int startBlockIndex,
unsigned int numBlocks, real2* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter, const real4* __restrict__ sortedBlockBoundingBox,
const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices, real4* __restrict__ oldPositions,
const int* __restrict__ rebuildNeighborList) {
__shared__ unsigned short buffer[BUFFER_SIZE];
__shared__ short sum[BUFFER_SIZE];
__shared__ ushort2 temp[BUFFER_SIZE];
__shared__ int atoms[BUFFER_SIZE+TILE_SIZE];
__shared__ real3 posBuffer[TILE_SIZE];
__shared__ int exclusionsForX[MAX_EXCLUSIONS];
__shared__ int bufferFull;
__shared__ int globalIndex;
unsigned int endTileIndex = startTileIndex+numTiles;
__shared__ int numAtoms;
if (rebuildNeighborList[0] == 0)
return; // The neighbor list doesn't need to be rebuilt.
int valuesInBuffer = 0;
if (threadIdx.x == 0)
bufferFull = false;
for (int i = 0; i < BUFFER_GROUPS; ++i)
valid[i*GROUP_SIZE+threadIdx.x] = false;
buffer[i*GROUP_SIZE+threadIdx.x] = INVALID;
__syncthreads();
for (int baseIndex = startTileIndex+blockIdx.x*blockDim.x; baseIndex < endTileIndex; baseIndex += blockDim.x*gridDim.x) {
// Identify the pair of blocks to compare.
int index = baseIndex+threadIdx.x;
if (index < endTileIndex) {
unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*index));
unsigned int x = (index-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (index-y*NUM_BLOCKS+y*(y+1)/2);
}
// Find the distance between the bounding boxes of the two cells.
// Loop over blocks sorted by size.
for (int i = startBlockIndex+blockIdx.x; i < startBlockIndex+numBlocks; i += gridDim.x) {
if (threadIdx.x == blockDim.x-1)
numAtoms = 0;
real2 sortedKey = sortedBlocks[i];
unsigned short x = (unsigned short) sortedKey.y;
real4 blockCenterX = blockCenter[x];
real4 blockSizeX = blockBoundingBox[x];
real4 delta = blockCenter[x]-blockCenter[y];
real4 boxSizea = blockBoundingBox[x];
real4 boxSizeb = blockBoundingBox[y];
// Load exclusion data for block x.
const int exclusionStart = exclusionRowIndices[x];
const int exclusionEnd = exclusionRowIndices[x+1];
const int numExclusions = exclusionEnd-exclusionStart;
for (int j = threadIdx.x; j < numExclusions; j += blockDim.x)
exclusionsForX[j] = exclusionIndices[exclusionStart+j];
__syncthreads();
// Compare it to other blocks after this one in sorted order.
for (int base = i+1; base < NUM_BLOCKS; base += blockDim.x) {
int j = base+threadIdx.x;
real2 sortedKey2 = (j < NUM_BLOCKS ? sortedBlocks[j] : make_real2(0));
real4 blockCenterY = (j < NUM_BLOCKS ? sortedBlockCenter[j] : make_real4(0));
real4 blockSizeY = (j < NUM_BLOCKS ? sortedBlockBoundingBox[j] : make_real4(0));
unsigned short y = (unsigned short) sortedKey2.y;
real4 delta = blockCenterX-blockCenterY;
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
delta.x = max(0.0f, fabs(delta.x)-boxSizea.x-boxSizeb.x);
delta.y = max(0.0f, fabs(delta.y)-boxSizea.y-boxSizeb.y);
delta.z = max(0.0f, fabs(delta.z)-boxSizea.z-boxSizeb.z);
if (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < CUTOFF_SQUARED) {
delta.x = max(0.0f, fabs(delta.x)-blockSizeX.x-blockSizeY.x);
delta.y = max(0.0f, fabs(delta.y)-blockSizeX.y-blockSizeY.y);
delta.z = max(0.0f, fabs(delta.z)-blockSizeX.z-blockSizeY.z);
bool hasExclusions = false;
for (int k = 0; k < numExclusions; k++)
hasExclusions |= (exclusionsForX[k] == y);
if (j < NUM_BLOCKS && delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED && !hasExclusions) {
// Add this tile to the buffer.
int bufferIndex = valuesInBuffer*GROUP_SIZE+threadIdx.x;
valid[bufferIndex] = true;
buffer[bufferIndex] = make_ushort2(x, y);
buffer[bufferIndex] = y;
valuesInBuffer++;
if (!bufferFull && valuesInBuffer == BUFFER_GROUPS)
bufferFull = true;
}
}
__syncthreads();
if (bufferFull) {
storeInteractionData(buffer, valid, sum, temp, &globalIndex, interactionCount, interactingTiles, periodicBoxSize, invPeriodicBoxSize, posq, blockCenter, blockBoundingBox, maxTiles);
valuesInBuffer = 0;
if (threadIdx.x == 0)
bufferFull = false;
__syncthreads();
}
}
storeInteractionData(buffer, valid, sum, temp, &globalIndex, interactionCount, interactingTiles, periodicBoxSize, invPeriodicBoxSize, posq, blockCenter, blockBoundingBox, maxTiles);
}
/**
* Compare each atom in one block to the bounding box of another block, and set
* flags for which ones are interacting.
*/
extern "C" __global__ void findInteractionsWithinBlocks(real4 periodicBoxSize, real4 invPeriodicBoxSize, const real4* __restrict__ posq, const ushort2* __restrict__ tiles, const real4* __restrict__ blockCenter,
const real4* __restrict__ blockBoundingBox, unsigned int* __restrict__ interactionFlags, const unsigned int* __restrict__ interactionCount, unsigned int maxTiles) {
unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
unsigned int numTiles = interactionCount[0];
unsigned int pos = warp*numTiles/totalWarps;
unsigned int end = (warp+1)*numTiles/totalWarps;
unsigned int index = threadIdx.x & (TILE_SIZE - 1);
#if (__CUDA_ARCH__ < 200)
__shared__ unsigned int flags[128];
#endif
if (numTiles > maxTiles)
return;
unsigned int lasty = 0xFFFFFFFF;
real4 apos;
while (pos < end) {
// Extract the coordinates of this tile
ushort2 tileIndices = tiles[pos];
unsigned int x = tileIndices.x;
unsigned int y = tileIndices.y;
if (x == y) {
if (index == 0)
interactionFlags[pos] = 0xFFFFFFFF;
}
else {
// Load the bounding box for x and the atom positions for y.
real4 center = blockCenter[x];
real4 boxSize = blockBoundingBox[x];
if (y != lasty)
apos = posq[y*TILE_SIZE+index];
// Find the distance of the atom from the bounding box.
real4 delta = apos-center;
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
delta.x = max((real) 0, fabs(delta.x)-boxSize.x);
delta.y = max((real) 0, fabs(delta.y)-boxSize.y);
delta.z = max((real) 0, fabs(delta.z)-boxSize.z);
#if (__CUDA_ARCH__ < 200)
flags[threadIdx.x] = (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z > CUTOFF_SQUARED ? 0 : 1 << index);
if (index % 4 == 0)
flags[threadIdx.x] += flags[threadIdx.x+1]+flags[threadIdx.x+2]+flags[threadIdx.x+3];
unsigned int allFlags = 0;
if (index == 0)
allFlags = flags[threadIdx.x]+flags[threadIdx.x+4]+flags[threadIdx.x+8]+flags[threadIdx.x+12]+flags[threadIdx.x+16]+flags[threadIdx.x+20]+flags[threadIdx.x+24]+flags[threadIdx.x+28];
#else
unsigned int allFlags = __ballot(delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < CUTOFF_SQUARED);
#endif
// Sum the flags.
if (index == 0) {
// Count how many flags are set, and based on that decide whether to compute all interactions
// or only a fraction of them.
int bits = __popc(allFlags);
interactionFlags[pos] = (bits > 12 ? 0xFFFFFFFF : allFlags);
if (bufferFull) {
storeInteractionData(x, buffer, sum, temp, atoms, numAtoms, globalIndex, interactionCount, interactingTiles, interactingAtoms, periodicBoxSize, invPeriodicBoxSize, posq, posBuffer, blockCenterX, blockSizeX, maxTiles, false);
valuesInBuffer = 0;
if (threadIdx.x == 0)
bufferFull = false;
__syncthreads();
}
lasty = y;
}
pos++;
storeInteractionData(x, buffer, sum, temp, atoms, numAtoms, globalIndex, interactionCount, interactingTiles, interactingAtoms, periodicBoxSize, invPeriodicBoxSize, posq, posBuffer, blockCenterX, blockSizeX, maxTiles, true);
}
// Record the positions the neighbor list is based on.
for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x)
oldPositions[i] = posq[i];
}
#define DIELECTRIC_OFFSET 0.009f
#define PROBE_RADIUS 0.14f
#define SURFACE_AREA_FACTOR -170.351730667551f //-6.0f*3.14159265358979323846f*0.0216f*1000.0f*0.4184f;
#define TILE_SIZE 32
#define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE)
/**
......@@ -70,262 +69,332 @@ typedef struct {
*/
extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ global_bornSum, const real4* __restrict__ posq, const float2* __restrict__ global_params,
#ifdef USE_CUTOFF
const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags,
const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const real4* __restrict__ blockCenter, const unsigned int* __restrict__ interactingAtoms,
#else
unsigned int numTiles,
#endif
unsigned int* exclusionIndices, unsigned int* exclusionRowIndices) {
unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
const ushort2* __restrict__ exclusionTiles) {
const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx;
__shared__ AtomData1 localData[FORCE_WORK_GROUP_SIZE];
// First loop: process tiles that contain exclusions.
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
real bornSum = 0;
unsigned int atom1 = x*TILE_SIZE + tgx;
real4 posq1 = posq[atom1];
float2 params1 = global_params[atom1];
if (x == y) {
// This tile is on the diagonal.
localData[threadIdx.x].x = posq1.x;
localData[threadIdx.x].y = posq1.y;
localData[threadIdx.x].z = posq1.z;
localData[threadIdx.x].q = posq1.w;
localData[threadIdx.x].radius = params1.x;
localData[threadIdx.x].scaledRadius = params1.y;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real3 delta = make_real3(localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
float2 params2 = make_float2(localData[tbx+j].radius, localData[tbx+j].scaledRadius);
real rScaledRadiusJ = r+params2.y;
if ((j != tgx) && (params1.x < rScaledRadiusJ)) {
real l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
real u_ij = RECIP(rScaledRadiusJ);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
(params2.y*params2.y*invR)*(l_ij2-u_ij2));
bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
}
}
}
}
else {
// This is an off-diagonal tile.
unsigned int j = y*TILE_SIZE + tgx;
real4 tempPosq = posq[j];
localData[threadIdx.x].x = tempPosq.x;
localData[threadIdx.x].y = tempPosq.y;
localData[threadIdx.x].z = tempPosq.z;
localData[threadIdx.x].q = tempPosq.w;
float2 tempParams = global_params[j];
localData[threadIdx.x].radius = tempParams.x;
localData[threadIdx.x].scaledRadius = tempParams.y;
localData[threadIdx.x].bornSum = 0.0f;
// Compute the full set of interactions in this tile.
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
real3 delta = make_real3(localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
float2 params2 = make_float2(localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
real rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) {
real l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
real u_ij = RECIP(rScaledRadiusJ);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
(params2.y*params2.y*invR)*(l_ij2-u_ij2));
bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
}
real rScaledRadiusI = r+params1.y;
if (params2.x < rScaledRadiusI) {
real l_ij = RECIP(max(params2.x, fabs(r-params1.y)));
real u_ij = RECIP(rScaledRadiusI);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
(params1.y*params1.y*invR)*(l_ij2-u_ij2));
term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ij) : 0);
localData[tbx+tj].bornSum += term;
}
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
// Write results.
unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&global_bornSum[offset], static_cast<unsigned long long>((long long) (bornSum*0x100000000)));
if (x != y) {
offset = y*TILE_SIZE + tgx;
atomicAdd(&global_bornSum[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].bornSum*0x100000000)));
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
unsigned int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
unsigned int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
#else
unsigned int pos = warp*numTiles/totalWarps;
unsigned int end = (warp+1)*numTiles/totalWarps;
int pos = warp*numTiles/totalWarps;
int end = (warp+1)*numTiles/totalWarps;
#endif
unsigned int lasty = 0xFFFFFFFF;
__shared__ AtomData1 localData[FORCE_WORK_GROUP_SIZE];
__shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
__shared__ int exclusionIndex[WARPS_PER_GROUP];
#ifndef ENABLE_SHUFFLE
__shared__ real tempBuffer[FORCE_WORK_GROUP_SIZE];
#endif
do {
// Extract the coordinates of this tile
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx;
const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
unsigned int x, y;
int skipBase = 0;
int currentSkipIndex = tbx;
__shared__ int atomIndices[FORCE_WORK_GROUP_SIZE];
__shared__ int skipTiles[FORCE_WORK_GROUP_SIZE];
skipTiles[threadIdx.x] = -1;
while (pos < end) {
real bornSum = 0;
if (pos < end) {
bool includeTile = true;
// Extract the coordinates of this tile.
unsigned int x, y;
bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
y = tileIndices.y;
}
else
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
singlePeriodicCopy = tileIndices.y;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
// Skip over tiles that have exclusions, since they were already processed.
while (skipTiles[tbx+TILE_SIZE-1] < pos) {
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx];
skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
skipTiles[threadIdx.x] = end;
skipBase += TILE_SIZE;
currentSkipIndex = tbx;
}
while (skipTiles[currentSkipIndex] < pos)
currentSkipIndex++;
includeTile = (skipTiles[currentSkipIndex] != pos);
}
if (includeTile) {
unsigned int atom1 = x*TILE_SIZE + tgx;
// Load atom data for this tile.
real4 posq1 = posq[atom1];
float2 params1 = global_params[atom1];
if (pos >= end)
; // This warp is done.
else if (x == y) {
// This tile is on the diagonal.
localData[threadIdx.x].x = posq1.x;
localData[threadIdx.x].y = posq1.y;
localData[threadIdx.x].z = posq1.z;
localData[threadIdx.x].q = posq1.w;
localData[threadIdx.x].radius = params1.x;
localData[threadIdx.x].scaledRadius = params1.y;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real3 delta = make_real3(localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
#else
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
unsigned int j = y*TILE_SIZE + tgx;
#endif
atomIndices[threadIdx.x] = j;
if (j < PADDED_NUM_ATOMS) {
real4 tempPosq = posq[j];
localData[threadIdx.x].x = tempPosq.x;
localData[threadIdx.x].y = tempPosq.y;
localData[threadIdx.x].z = tempPosq.z;
localData[threadIdx.x].q = tempPosq.w;
float2 tempParams = global_params[j];
localData[threadIdx.x].radius = tempParams.x;
localData[threadIdx.x].scaledRadius = tempParams.y;
localData[threadIdx.x].bornSum = 0.0f;
}
#ifdef USE_PERIODIC
if (singlePeriodicCopy) {
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
real4 blockCenterX = blockCenter[x];
posq1.x -= floor((posq1.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
posq1.y -= floor((posq1.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
posq1.z -= floor((posq1.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
localData[threadIdx.x].x -= floor((localData[threadIdx.x].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
localData[threadIdx.x].y -= floor((localData[threadIdx.x].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
localData[threadIdx.x].z -= floor((localData[threadIdx.x].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
real3 delta = make_real3(localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
int atom2 = atomIndices[tbx+tj];
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2);
real r = RECIP(invR);
float2 params2 = make_float2(localData[tbx+j].radius, localData[tbx+j].scaledRadius);
float2 params2 = make_float2(localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
real rScaledRadiusJ = r+params2.y;
if ((j != tgx) && (params1.x < rScaledRadiusJ)) {
if (params1.x < rScaledRadiusJ) {
real l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
real u_ij = RECIP(rScaledRadiusJ);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
(params2.y*params2.y*invR)*(l_ij2-u_ij2));
if (params1.x < params2.y-r)
bornSum += 2.0f*(RECIP(params1.x)-l_ij);
(params2.y*params2.y*invR)*(l_ij2-u_ij2));
bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
}
real rScaledRadiusI = r+params1.y;
if (params2.x < rScaledRadiusI) {
real l_ij = RECIP(max(params2.x, fabs(r-params1.y)));
real u_ij = RECIP(rScaledRadiusI);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
(params1.y*params1.y*invR)*(l_ij2-u_ij2));
term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ij) : 0);
localData[tbx+tj].bornSum += term;
}
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
else {
// This is an off-diagonal tile.
if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx;
real4 tempPosq = posq[j];
localData[threadIdx.x].x = tempPosq.x;
localData[threadIdx.x].y = tempPosq.y;
localData[threadIdx.x].z = tempPosq.z;
localData[threadIdx.x].q = tempPosq.w;
float2 tempParams = global_params[j];
localData[threadIdx.x].radius = tempParams.x;
localData[threadIdx.x].scaledRadius = tempParams.y;
}
localData[threadIdx.x].bornSum = 0.0f;
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
bool computeSubset = false;
if (flags != 0xFFFFFFFF) {
if (tgx < 2)
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
computeSubset = (exclusionIndex[localGroupIndex] == -1);
}
if (computeSubset) {
if (flags == 0) {
// No interactions in this tile.
}
else {
// Compute only a subset of the interactions in this tile.
else
#endif
{
// We need to apply periodic boundary conditions separately for each interaction.
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
real3 delta = make_real3(localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z);
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
real3 delta = make_real3(localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real sum = 0;
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
int atom2 = atomIndices[tbx+tj];
#ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
float2 params2 = make_float2(localData[tbx+j].radius, localData[tbx+j].scaledRadius);
real rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) {
real l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
real u_ij = RECIP(rScaledRadiusJ);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
(params2.y*params2.y*invR)*(l_ij2-u_ij2));
if (params1.x < params2.y-r)
bornSum += 2.0f*(RECIP(params1.x)-l_ij);
}
real rScaledRadiusI = r+params1.y;
if (params2.x < rScaledRadiusI) {
real l_ij = RECIP(max(params2.x, fabs(r-params1.y)));
real u_ij = RECIP(rScaledRadiusI);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
(params1.y*params1.y*invR)*(l_ij2-u_ij2));
if (params2.x < params1.y-r)
term += 2.0f*(RECIP(params2.x)-l_ij);
sum = term;
}
}
// Sum the forces on atom j.
#ifdef ENABLE_SHUFFLE
for (int i = 16; i >= 1; i /= 2)
sum += __shfl_xor(sum, i, 32);
if (tgx == 0)
localData[tbx+j].bornSum += sum;
#else
tempBuffer[threadIdx.x] = sum;
if (tgx % 4 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1]+tempBuffer[threadIdx.x+2]+tempBuffer[threadIdx.x+3];
if (tgx == 0)
localData[tbx+j].bornSum += tempBuffer[threadIdx.x]+tempBuffer[threadIdx.x+4]+tempBuffer[threadIdx.x+8]+tempBuffer[threadIdx.x+12]+tempBuffer[threadIdx.x+16]+tempBuffer[threadIdx.x+20]+tempBuffer[threadIdx.x+24]+tempBuffer[threadIdx.x+28];
#endif
}
real invR = RSQRT(r2);
real r = RECIP(invR);
float2 params2 = make_float2(localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
real rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) {
real l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
real u_ij = RECIP(rScaledRadiusJ);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
(params2.y*params2.y*invR)*(l_ij2-u_ij2));
bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
}
real rScaledRadiusI = r+params1.y;
if (params2.x < rScaledRadiusI) {
real l_ij = RECIP(max(params2.x, fabs(r-params1.y)));
real u_ij = RECIP(rScaledRadiusI);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
(params1.y*params1.y*invR)*(l_ij2-u_ij2));
term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ij) : 0);
localData[tbx+tj].bornSum += term;
}
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
else
#endif
{
// Compute the full set of interactions in this tile.
}
// Write results.
unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
real3 delta = make_real3(localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
atomicAdd(&global_bornSum[atom1], static_cast<unsigned long long>((long long) (bornSum*0x100000000)));
#ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
unsigned int atom2 = atomIndices[threadIdx.x];
#else
if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
float2 params2 = make_float2(localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
real rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) {
real l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
real u_ij = RECIP(rScaledRadiusJ);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
(params2.y*params2.y*invR)*(l_ij2-u_ij2));
if (params1.x < params2.y-r)
bornSum += 2.0f*(RECIP(params1.x)-l_ij);
}
real rScaledRadiusI = r+params1.y;
if (params2.x < rScaledRadiusI) {
real l_ij = RECIP(max(params2.x, fabs(r-params1.y)));
real u_ij = RECIP(rScaledRadiusI);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
(params1.y*params1.y*invR)*(l_ij2-u_ij2));
if (params2.x < params1.y-r)
term += 2.0f*(RECIP(params2.x)-l_ij);
localData[tbx+tj].bornSum += term;
}
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
}
if (atom2 < PADDED_NUM_ATOMS)
atomicAdd(&global_bornSum[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].bornSum*0x100000000)));
}
// Write results.
if (pos < end) {
const unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&global_bornSum[offset], static_cast<unsigned long long>((long long) (bornSum*0x100000000)));
}
if (pos < end && x != y) {
const unsigned int offset = y*TILE_SIZE + tgx;
atomicAdd(&global_bornSum[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].bornSum*0x100000000)));
}
lasty = y;
pos++;
} while (pos < end);
}
}
typedef struct {
......@@ -342,77 +411,50 @@ typedef struct {
extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ forceBuffers, unsigned long long* __restrict__ global_bornForce,
real* __restrict__ energyBuffer, const real4* __restrict__ posq, const real* __restrict__ global_bornRadii,
#ifdef USE_CUTOFF
const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags,
const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const real4* __restrict__ blockCenter, const unsigned int* __restrict__ interactingAtoms,
#else
unsigned int numTiles,
#endif
unsigned int* exclusionIndices, unsigned int* exclusionRowIndices) {
unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
unsigned int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
unsigned int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
#else
unsigned int pos = warp*numTiles/totalWarps;
unsigned int end = (warp+1)*numTiles/totalWarps;
#endif
const ushort2* __restrict__ exclusionTiles) {
const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx;
real energy = 0;
unsigned int lasty = 0xFFFFFFFF;
__shared__ AtomData2 localData[FORCE_WORK_GROUP_SIZE];
__shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
__shared__ int exclusionIndex[WARPS_PER_GROUP];
#ifndef ENABLE_SHUFFLE
__shared__ real4 tempBuffer[FORCE_WORK_GROUP_SIZE];
#endif
do {
// Extract the coordinates of this tile
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx;
const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE;
unsigned int x, y;
// First loop: process tiles that contain exclusions.
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
real4 force = make_real4(0);
if (pos < end) {
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
y = tileIndices.y;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
unsigned int atom1 = x*TILE_SIZE + tgx;
real4 posq1 = posq[atom1];
real bornRadius1 = global_bornRadii[atom1];
if (x == y) {
// This tile is on the diagonal.
localData[threadIdx.x].x = posq1.x;
localData[threadIdx.x].y = posq1.y;
localData[threadIdx.x].z = posq1.z;
localData[threadIdx.x].q = posq1.w;
localData[threadIdx.x].bornRadius = bornRadius1;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
real4 posq2 = make_real4(localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
unsigned int atom1 = x*TILE_SIZE + tgx;
real4 posq1 = posq[atom1];
real bornRadius1 = global_bornRadii[atom1];
if (x == y) {
// This tile is on the diagonal.
localData[threadIdx.x].x = posq1.x;
localData[threadIdx.x].y = posq1.y;
localData[threadIdx.x].z = posq1.z;
localData[threadIdx.x].q = posq1.w;
localData[threadIdx.x].bornRadius = bornRadius1;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) {
real4 posq2 = make_real4(localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
......@@ -433,143 +475,233 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
force.y -= delta.y;
force.z -= delta.z;
#ifdef USE_CUTOFF
}
#endif
}
#endif
}
}
else {
// This is an off-diagonal tile.
if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx;
real4 tempPosq = posq[j];
localData[threadIdx.x].x = tempPosq.x;
localData[threadIdx.x].y = tempPosq.y;
localData[threadIdx.x].z = tempPosq.z;
localData[threadIdx.x].q = tempPosq.w;
localData[threadIdx.x].bornRadius = global_bornRadii[j];
}
localData[threadIdx.x].fx = 0.0f;
localData[threadIdx.x].fy = 0.0f;
localData[threadIdx.x].fz = 0.0f;
localData[threadIdx.x].fw = 0.0f;
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
bool computeSubset = false;
if (flags != 0xFFFFFFFF) {
if (tgx < 2)
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
computeSubset = (exclusionIndex[localGroupIndex] == -1);
}
if (computeSubset) {
if (flags == 0) {
// No interactions in this tile.
}
else {
// Compute only a subset of the interactions in this tile.
}
else {
// This is an off-diagonal tile.
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
real4 posq2 = make_real4(localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
real4 delta = make_real4(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z, 0);
unsigned int j = y*TILE_SIZE + tgx;
real4 tempPosq = posq[j];
localData[threadIdx.x].x = tempPosq.x;
localData[threadIdx.x].y = tempPosq.y;
localData[threadIdx.x].z = tempPosq.z;
localData[threadIdx.x].q = tempPosq.w;
localData[threadIdx.x].bornRadius = global_bornRadii[j];
localData[threadIdx.x].fx = 0.0f;
localData[threadIdx.x].fy = 0.0f;
localData[threadIdx.x].fz = 0.0f;
localData[threadIdx.x].fw = 0.0f;
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
real4 posq2 = make_real4(localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z, localData[tbx+tj].q);
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
real bornRadius2 = localData[tbx+tj].bornRadius;
real alpha2_ij = bornRadius1*bornRadius2;
real D_ij = r2*RECIP(4.0f*alpha2_ij);
real expTerm = EXP(-D_ij);
real denominator2 = r2 + alpha2_ij*expTerm;
real denominator = SQRT(denominator2);
real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
real Gpol = tempEnergy*RECIP(denominator2);
real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
real dEdR = Gpol*(1.0f - 0.25f*expTerm);
force.w += dGpol_dalpha2_ij*bornRadius2;
energy += tempEnergy;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z;
localData[tbx+tj].fw += dGpol_dalpha2_ij*bornRadius1;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
}
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
real bornRadius2 = localData[tbx+j].bornRadius;
real alpha2_ij = bornRadius1*bornRadius2;
real D_ij = r2*RECIP(4.0f*alpha2_ij);
real expTerm = EXP(-D_ij);
real denominator2 = r2 + alpha2_ij*expTerm;
real denominator = SQRT(denominator2);
real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
real Gpol = tempEnergy*RECIP(denominator2);
real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
real dEdR = Gpol*(1.0f - 0.25f*expTerm);
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
// Write results.
unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
atomicAdd(&global_bornForce[offset], static_cast<unsigned long long>((long long) (force.w*0x100000000)));
if (x != y) {
offset = y*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
atomicAdd(&global_bornForce[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fw*0x100000000)));
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
if (atom1 >= NUM_ATOMS || y*TILE_SIZE+j >= NUM_ATOMS || r2 > CUTOFF_SQUARED) {
unsigned int numTiles = interactionCount[0];
int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
#else
if (atom1 >= NUM_ATOMS || y*TILE_SIZE+j >= NUM_ATOMS) {
int pos = warp*numTiles/totalWarps;
int end = (warp+1)*numTiles/totalWarps;
#endif
dEdR = 0.0f;
dGpol_dalpha2_ij = 0.0f;
tempEnergy = 0.0f;
}
energy += tempEnergy;
force.w += dGpol_dalpha2_ij*bornRadius2;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
delta.w = dGpol_dalpha2_ij*bornRadius1;
int skipBase = 0;
int currentSkipIndex = tbx;
__shared__ int atomIndices[FORCE_WORK_GROUP_SIZE];
__shared__ int skipTiles[FORCE_WORK_GROUP_SIZE];
skipTiles[threadIdx.x] = -1;
while (pos < end) {
real4 force = make_real4(0);
bool includeTile = true;
// Extract the coordinates of this tile.
unsigned int x, y;
bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
}
else
delta = make_real4(0);
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
singlePeriodicCopy = tileIndices.y;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
// Skip over tiles that have exclusions, since they were already processed.
while (skipTiles[tbx+TILE_SIZE-1] < pos) {
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx];
skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
skipTiles[threadIdx.x] = end;
skipBase += TILE_SIZE;
currentSkipIndex = tbx;
}
while (skipTiles[currentSkipIndex] < pos)
currentSkipIndex++;
includeTile = (skipTiles[currentSkipIndex] != pos);
}
if (includeTile) {
unsigned int atom1 = x*TILE_SIZE + tgx;
// Sum the forces on atom j.
#ifdef ENABLE_SHUFFLE
for (int i = 16; i >= 1; i /= 2) {
delta.x += __shfl_xor(delta.x, i, 32);
delta.y += __shfl_xor(delta.y, i, 32);
delta.z += __shfl_xor(delta.z, i, 32);
delta.w += __shfl_xor(delta.w, i, 32);
}
if (tgx == 0) {
localData[tbx+j].fx += delta.x;
localData[tbx+j].fy += delta.y;
localData[tbx+j].fz += delta.z;
localData[tbx+j].fw += delta.w;
}
// Load atom data for this tile.
real4 posq1 = posq[atom1];
real bornRadius1 = global_bornRadii[atom1];
#ifdef USE_CUTOFF
unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
#else
tempBuffer[threadIdx.x] = delta;
if (tgx % 4 == 0)
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1]+tempBuffer[threadIdx.x+2]+tempBuffer[threadIdx.x+3];
if (tgx == 0) {
real4 sum = tempBuffer[threadIdx.x]+tempBuffer[threadIdx.x+4]+tempBuffer[threadIdx.x+8]+tempBuffer[threadIdx.x+12]+tempBuffer[threadIdx.x+16]+tempBuffer[threadIdx.x+20]+tempBuffer[threadIdx.x+24]+tempBuffer[threadIdx.x+28];
localData[tbx+j].fx += sum.x;
localData[tbx+j].fy += sum.y;
localData[tbx+j].fz += sum.z;
localData[tbx+j].fw += sum.w;
}
unsigned int j = y*TILE_SIZE + tgx;
#endif
}
atomIndices[threadIdx.x] = j;
if (j < PADDED_NUM_ATOMS) {
real4 tempPosq = posq[j];
localData[threadIdx.x].x = tempPosq.x;
localData[threadIdx.x].y = tempPosq.y;
localData[threadIdx.x].z = tempPosq.z;
localData[threadIdx.x].q = tempPosq.w;
localData[threadIdx.x].bornRadius = global_bornRadii[j];
localData[threadIdx.x].fx = 0.0f;
localData[threadIdx.x].fy = 0.0f;
localData[threadIdx.x].fz = 0.0f;
localData[threadIdx.x].fw = 0.0f;
}
#ifdef USE_PERIODIC
if (singlePeriodicCopy) {
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
real4 blockCenterX = blockCenter[x];
posq1.x -= floor((posq1.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
posq1.y -= floor((posq1.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
posq1.z -= floor((posq1.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
localData[threadIdx.x].x -= floor((localData[threadIdx.x].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
localData[threadIdx.x].y -= floor((localData[threadIdx.x].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
localData[threadIdx.x].z -= floor((localData[threadIdx.x].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = atomIndices[tbx+tj];
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real4 posq2 = make_real4(localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z, localData[tbx+tj].q);
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
if (r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2);
real r = RECIP(invR);
real bornRadius2 = localData[tbx+tj].bornRadius;
real alpha2_ij = bornRadius1*bornRadius2;
real D_ij = r2*RECIP(4.0f*alpha2_ij);
real expTerm = EXP(-D_ij);
real denominator2 = r2 + alpha2_ij*expTerm;
real denominator = SQRT(denominator2);
real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
real Gpol = tempEnergy*RECIP(denominator2);
real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
real dEdR = Gpol*(1.0f - 0.25f*expTerm);
force.w += dGpol_dalpha2_ij*bornRadius2;
energy += tempEnergy;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z;
localData[tbx+tj].fw += dGpol_dalpha2_ij*bornRadius1;
}
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
else
}
else
#endif
{
// Compute the full set of interactions in this tile.
unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
real4 posq2 = make_real4(localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z, localData[tbx+tj].q);
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
{
// We need to apply periodic boundary conditions separately for each interaction.
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = atomIndices[tbx+tj];
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real4 posq2 = make_real4(localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z, localData[tbx+tj].q);
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
......@@ -594,33 +726,32 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
localData[tbx+tj].fz += delta.z;
localData[tbx+tj].fw += dGpol_dalpha2_ij*bornRadius1;
#ifdef USE_CUTOFF
}
#endif
}
tj = (tj + 1) & (TILE_SIZE - 1);
#endif
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
// Write results.
atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
atomicAdd(&global_bornForce[atom1], static_cast<unsigned long long>((long long) (force.w*0x100000000)));
#ifdef USE_CUTOFF
unsigned int atom2 = atomIndices[threadIdx.x];
#else
unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
if (atom2 < PADDED_NUM_ATOMS) {
atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
atomicAdd(&global_bornForce[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fw*0x100000000)));
}
}
// Write results.
if (pos < end) {
const unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
atomicAdd(&global_bornForce[offset], static_cast<unsigned long long>((long long) (force.w*0x100000000)));
}
if (pos < end && x != y) {
const unsigned int offset = y*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
atomicAdd(&global_bornForce[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fw*0x100000000)));
}
lasty = y;
pos++;
} while (pos < end);
}
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
}
......@@ -24,14 +24,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
state.y ^= state.y << 13;
state.y ^= state.y >> 17;
state.y ^= state.y << 5;
x1 = sqrt(-2.0f * log(x1));
x1 = SQRT(-2.0f * LOG(x1));
k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
m = state.w + state.w + state.z + carry;
state.z = state.w;
state.w = m;
carry = k >> 30;
float x2 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
value.x = x1 * cos(2.0f * 3.14159265f * x2);
value.x = x1 * COS(2.0f * 3.14159265f * x2);
// Generate second value.
......@@ -49,14 +49,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
state.y ^= state.y << 13;
state.y ^= state.y >> 17;
state.y ^= state.y << 5;
x3 = sqrt(-2.0f * log(x3));
x3 = SQRT(-2.0f * LOG(x3));
k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
m = state.w + state.w + state.z + carry;
state.z = state.w;
state.w = m;
carry = k >> 30;
float x4 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
value.y = x3 * cos(2.0f * 3.14159265f * x4);
value.y = x3 * COS(2.0f * 3.14159265f * x4);
// Generate third value.
......@@ -74,14 +74,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
state.y ^= state.y << 13;
state.y ^= state.y >> 17;
state.y ^= state.y << 5;
x5 = sqrt(-2.0f * log(x5));
x5 = SQRT(-2.0f * LOG(x5));
k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
m = state.w + state.w + state.z + carry;
state.z = state.w;
state.w = m;
carry = k >> 30;
float x6 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
value.z = x5 * cos(2.0f * 3.14159265f * x6);
value.z = x5 * COS(2.0f * 3.14159265f * x6);
// Generate fourth value.
......@@ -99,14 +99,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
state.y ^= state.y << 13;
state.y ^= state.y >> 17;
state.y ^= state.y << 5;
x7 = sqrt(-2.0f * log(x7));
x7 = SQRT(-2.0f * LOG(x7));
k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
m = state.w + state.w + state.z + carry;
state.z = state.w;
state.w = m;
carry = k >> 30;
float x8 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
value.w = x7 * cos(2.0f * 3.14159265f * x8);
value.w = x7 * COS(2.0f * 3.14159265f * x8);
// Record the values.
......@@ -412,9 +412,9 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
mixed yaksYd = zaksZd*xaksXd - xaksZd*zaksXd;
mixed zaksYd = xaksZd*yaksXd - yaksZd*xaksXd;
mixed axlng = sqrt(xaksXd*xaksXd + yaksXd*yaksXd + zaksXd*zaksXd);
mixed aylng = sqrt(xaksYd*xaksYd + yaksYd*yaksYd + zaksYd*zaksYd);
mixed azlng = sqrt(xaksZd*xaksZd + yaksZd*yaksZd + zaksZd*zaksZd);
mixed axlng = SQRT(xaksXd*xaksXd + yaksXd*yaksXd + zaksXd*zaksXd);
mixed aylng = SQRT(xaksYd*xaksYd + yaksYd*yaksYd + zaksYd*zaksYd);
mixed azlng = SQRT(xaksZd*xaksZd + yaksZd*yaksZd + zaksZd*zaksZd);
mixed trns11 = xaksXd / axlng;
mixed trns21 = yaksXd / axlng;
mixed trns31 = zaksXd / axlng;
......@@ -440,13 +440,13 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
// --- Step2 A2' ---
float rc = 0.5f*params.y;
mixed rb = sqrt(params.x*params.x-rc*rc);
mixed rb = SQRT(params.x*params.x-rc*rc);
mixed ra = rb*(m1+m2)*invTotalMass;
rb -= ra;
mixed sinphi = za1d/ra;
mixed cosphi = sqrt(1-sinphi*sinphi);
mixed cosphi = SQRT(1-sinphi*sinphi);
mixed sinpsi = (zb1d-zc1d) / (2*rc*cosphi);
mixed cospsi = sqrt(1-sinpsi*sinpsi);
mixed cospsi = SQRT(1-sinpsi*sinpsi);
mixed ya2d = ra*cosphi;
mixed xb2d = - rc*cospsi;
......@@ -454,7 +454,7 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
mixed yc2d = - rb*cosphi + rc*sinpsi*sinphi;
mixed xb2d2 = xb2d*xb2d;
mixed hh2 = 4.0f*xb2d2 + (yb2d-yc2d)*(yb2d-yc2d) + (zb1d-zc1d)*(zb1d-zc1d);
mixed deltx = 2.0f*xb2d + sqrt(4.0f*xb2d2 - hh2 + params.y*params.y);
mixed deltx = 2.0f*xb2d + SQRT(4.0f*xb2d2 - hh2 + params.y*params.y);
xb2d -= deltx*0.5f;
// --- Step3 al,be,ga ---
......@@ -464,11 +464,11 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
mixed gamma = xb0d*yb1d - xb1d*yb0d + xc0d*yc1d - xc1d*yc0d;
mixed al2be2 = alpha*alpha + beta*beta;
mixed sintheta = (alpha*gamma - beta*sqrt(al2be2 - gamma*gamma)) / al2be2;
mixed sintheta = (alpha*gamma - beta*SQRT(al2be2 - gamma*gamma)) / al2be2;
// --- Step4 A3' ---
mixed costheta = sqrt(1-sintheta*sintheta);
mixed costheta = SQRT(1-sintheta*sintheta);
mixed xa3d = - ya2d*sintheta;
mixed ya3d = ya2d*costheta;
mixed za3d = za1d;
......@@ -534,9 +534,9 @@ extern "C" __global__ void applySettleToVelocities(int numClusters, mixed tol, c
mixed3 eAB = make_mixed3(apos1.x-apos0.x, apos1.y-apos0.y, apos1.z-apos0.z);
mixed3 eBC = make_mixed3(apos2.x-apos1.x, apos2.y-apos1.y, apos2.z-apos1.z);
mixed3 eCA = make_mixed3(apos0.x-apos2.x, apos0.y-apos2.y, apos0.z-apos2.z);
eAB *= rsqrt(eAB.x*eAB.x + eAB.y*eAB.y + eAB.z*eAB.z);
eBC *= rsqrt(eBC.x*eBC.x + eBC.y*eBC.y + eBC.z*eBC.z);
eCA *= rsqrt(eCA.x*eCA.x + eCA.y*eCA.y + eCA.z*eCA.z);
eAB *= RSQRT(eAB.x*eAB.x + eAB.y*eAB.y + eAB.z*eAB.z);
eBC *= RSQRT(eBC.x*eBC.x + eBC.y*eBC.y + eBC.z*eBC.z);
eCA *= RSQRT(eCA.x*eCA.x + eCA.y*eCA.y + eCA.z*eCA.z);
mixed vAB = (v1.x-v0.x)*eAB.x + (v1.y-v0.y)*eAB.y + (v1.z-v0.z)*eAB.z;
mixed vBC = (v2.x-v1.x)*eBC.x + (v2.y-v1.y)*eBC.y + (v2.z-v1.z)*eBC.z;
mixed vCA = (v0.x-v2.x)*eCA.x + (v0.y-v2.y)*eCA.y + (v0.z-v2.z)*eCA.z;
......@@ -574,7 +574,8 @@ extern "C" __global__ void applySettleToVelocities(int numClusters, mixed tol, c
/**
* Compute the direction each CCMA constraint is pointing in. This is called once at the beginning of constraint evaluation.
*/
extern "C" __global__ void computeCCMAConstraintDirections(const int2* __restrict__ constraintAtoms, mixed4* __restrict__ constraintDistance, const real4* __restrict__ atomPositions, const real4* __restrict__ posqCorrection) {
extern "C" __global__ void computeCCMAConstraintDirections(const int2* __restrict__ constraintAtoms, mixed4* __restrict__ constraintDistance,
const real4* __restrict__ atomPositions, const real4* __restrict__ posqCorrection, int* __restrict__ converged) {
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_CCMA_CONSTRAINTS; index += blockDim.x*gridDim.x) {
// Compute the direction for this constraint.
......@@ -587,6 +588,10 @@ extern "C" __global__ void computeCCMAConstraintDirections(const int2* __restric
dir.z = oldPos1.z-oldPos2.z;
constraintDistance[index] = dir;
}
if (threadIdx.x == 0 && blockIdx.x == 0) {
converged[0] = 1;
converged[1] = 0;
}
}
/**
......@@ -605,6 +610,7 @@ extern "C" __global__ void computeCCMAPositionConstraintForce(const int2* __rest
__syncthreads();
mixed lowerTol = 1-2*tol+tol*tol;
mixed upperTol = 1+2*tol+tol*tol;
bool threadConverged = true;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_CCMA_CONSTRAINTS; index += blockDim.x*gridDim.x) {
// Compute the force due to this constraint.
......@@ -620,14 +626,13 @@ extern "C" __global__ void computeCCMAPositionConstraintForce(const int2* __rest
mixed dist2 = dir.w*dir.w;
mixed diff = dist2 - rp2;
delta1[index] = (rrpr > d_ij2*1e-6f ? reducedMass[index]*diff/rrpr : 0.0f);
// See whether it has converged.
if (groupConverged && (rp2 < lowerTol*dist2 || rp2 > upperTol*dist2)) {
groupConverged = 0;
converged[iteration%2] = 0;
}
threadConverged &= (rp2 > lowerTol*dist2 && rp2 < upperTol*dist2);
}
if (groupConverged && !threadConverged)
groupConverged = 0;
__syncthreads();
if (threadIdx.x == 0 && !groupConverged)
converged[iteration%2] = 0;
}
/**
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment