Commit 93c467b2 authored by Peter Eastman
Browse files

Merged 5.1Optimizations branch back to trunk

parent f6d4557d
...@@ -61,7 +61,7 @@ using namespace OpenMM; ...@@ -61,7 +61,7 @@ using namespace OpenMM;
using namespace std; using namespace std;
const int CudaContext::ThreadBlockSize = 64; const int CudaContext::ThreadBlockSize = 64;
const int CudaContext::TileSize = 32; const int CudaContext::TileSize = sizeof(tileflags)*8;
bool CudaContext::hasInitializedCuda = false; bool CudaContext::hasInitializedCuda = false;
CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler, CudaContext::CudaContext(const System& system, int deviceIndex, bool useBlockingSync, const string& precision, const string& compiler,
...@@ -369,6 +369,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string ...@@ -369,6 +369,7 @@ CUmodule CudaContext::createModule(const string source, const map<string, string
src << "typedef float3 mixed3;\n"; src << "typedef float3 mixed3;\n";
src << "typedef float4 mixed4;\n"; src << "typedef float4 mixed4;\n";
} }
src << "typedef unsigned int tileflags;\n";
for (map<string, string>::const_iterator iter = defines.begin(); iter != defines.end(); ++iter) { for (map<string, string>::const_iterator iter = defines.begin(); iter != defines.end(); ++iter) {
src << "#define " << iter->first; src << "#define " << iter->first;
if (!iter->second.empty()) if (!iter->second.empty())
......
...@@ -42,6 +42,8 @@ ...@@ -42,6 +42,8 @@
#include "windowsExportCuda.h" #include "windowsExportCuda.h"
#include "CudaPlatform.h" #include "CudaPlatform.h"
typedef unsigned int tileflags;
namespace OpenMM { namespace OpenMM {
class CudaArray; class CudaArray;
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009-2012 Stanford University and the Authors. * * Portions copyright (c) 2009-2013 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -99,7 +99,7 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S ...@@ -99,7 +99,7 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
posDelta(NULL), settleAtoms(NULL), settleParams(NULL), shakeAtoms(NULL), shakeParams(NULL), posDelta(NULL), settleAtoms(NULL), settleParams(NULL), shakeAtoms(NULL), shakeParams(NULL),
random(NULL), randomSeed(NULL), randomPos(0), stepSize(NULL), ccmaAtoms(NULL), ccmaDistance(NULL), random(NULL), randomSeed(NULL), randomPos(0), stepSize(NULL), ccmaAtoms(NULL), ccmaDistance(NULL),
ccmaReducedMass(NULL), ccmaAtomConstraints(NULL), ccmaNumAtomConstraints(NULL), ccmaConstraintMatrixColumn(NULL), ccmaReducedMass(NULL), ccmaAtomConstraints(NULL), ccmaNumAtomConstraints(NULL), ccmaConstraintMatrixColumn(NULL),
ccmaConstraintMatrixValue(NULL), ccmaDelta1(NULL), ccmaDelta2(NULL), ccmaConvergedMemory(NULL), ccmaConstraintMatrixValue(NULL), ccmaDelta1(NULL), ccmaDelta2(NULL), ccmaConverged(NULL),
vsite2AvgAtoms(NULL), vsite2AvgWeights(NULL), vsite3AvgAtoms(NULL), vsite3AvgWeights(NULL), vsite2AvgAtoms(NULL), vsite2AvgWeights(NULL), vsite3AvgAtoms(NULL), vsite3AvgWeights(NULL),
vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL) { vsiteOutOfPlaneAtoms(NULL), vsiteOutOfPlaneWeights(NULL) {
// Create workspace arrays. // Create workspace arrays.
...@@ -466,9 +466,8 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S ...@@ -466,9 +466,8 @@ CudaIntegrationUtilities::CudaIntegrationUtilities(CudaContext& context, const S
ccmaAtoms = CudaArray::create<int2>(context, numCCMA, "CcmaAtoms"); ccmaAtoms = CudaArray::create<int2>(context, numCCMA, "CcmaAtoms");
ccmaAtomConstraints = CudaArray::create<int>(context, numAtoms*maxAtomConstraints, "CcmaAtomConstraints"); ccmaAtomConstraints = CudaArray::create<int>(context, numAtoms*maxAtomConstraints, "CcmaAtomConstraints");
ccmaNumAtomConstraints = CudaArray::create<int>(context, numAtoms, "CcmaAtomConstraintsIndex"); ccmaNumAtomConstraints = CudaArray::create<int>(context, numAtoms, "CcmaAtomConstraintsIndex");
CHECK_RESULT2(cuMemHostAlloc((void**) &ccmaConvergedMemory, 2*sizeof(int), CU_MEMHOSTALLOC_DEVICEMAP), "Error allocating pinned memory");
CHECK_RESULT2(cuMemHostGetDevicePointer(&ccmaConvergedDeviceMemory, ccmaConvergedMemory, 0), "Error getting device address for pinned memory");
ccmaConstraintMatrixColumn = CudaArray::create<int>(context, numCCMA*maxRowElements, "ConstraintMatrixColumn"); ccmaConstraintMatrixColumn = CudaArray::create<int>(context, numCCMA*maxRowElements, "ConstraintMatrixColumn");
ccmaConverged = CudaArray::create<int>(context, 2, "ccmaConverged");
vector<int2> atomsVec(ccmaAtoms->getSize()); vector<int2> atomsVec(ccmaAtoms->getSize());
vector<int> atomConstraintsVec(ccmaAtomConstraints->getSize()); vector<int> atomConstraintsVec(ccmaAtomConstraints->getSize());
vector<int> numAtomConstraintsVec(ccmaNumAtomConstraints->getSize()); vector<int> numAtomConstraintsVec(ccmaNumAtomConstraints->getSize());
...@@ -680,8 +679,8 @@ CudaIntegrationUtilities::~CudaIntegrationUtilities() { ...@@ -680,8 +679,8 @@ CudaIntegrationUtilities::~CudaIntegrationUtilities() {
delete ccmaDelta1; delete ccmaDelta1;
if (ccmaDelta2 != NULL) if (ccmaDelta2 != NULL)
delete ccmaDelta2; delete ccmaDelta2;
if (ccmaConvergedMemory != NULL) if (ccmaConverged != NULL)
cuMemFreeHost(ccmaConvergedMemory); delete ccmaConverged;
if (vsite2AvgAtoms != NULL) if (vsite2AvgAtoms != NULL)
delete vsite2AvgAtoms; delete vsite2AvgAtoms;
if (vsite2AvgWeights != NULL) if (vsite2AvgWeights != NULL)
...@@ -734,33 +733,32 @@ void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double ...@@ -734,33 +733,32 @@ void CudaIntegrationUtilities::applyConstraints(bool constrainVelocities, double
context.executeKernel(shakeKernel, args, shakeAtoms->getSize()); context.executeKernel(shakeKernel, args, shakeAtoms->getSize());
} }
if (ccmaAtoms != NULL) { if (ccmaAtoms != NULL) {
void* directionsArgs[] = {&ccmaAtoms->getDevicePointer(), &ccmaDistance->getDevicePointer(), &context.getPosq().getDevicePointer(), &posCorrection}; void* directionsArgs[] = {&ccmaAtoms->getDevicePointer(), &ccmaDistance->getDevicePointer(), &context.getPosq().getDevicePointer(), &posCorrection, &ccmaConverged->getDevicePointer()};
context.executeKernel(ccmaDirectionsKernel, directionsArgs, ccmaAtoms->getSize()); context.executeKernel(ccmaDirectionsKernel, directionsArgs, ccmaAtoms->getSize());
int i; int i;
void* forceArgs[] = {&ccmaAtoms->getDevicePointer(), &ccmaDistance->getDevicePointer(), void* forceArgs[] = {&ccmaAtoms->getDevicePointer(), &ccmaDistance->getDevicePointer(),
constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta->getDevicePointer(), constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta->getDevicePointer(),
&ccmaReducedMass->getDevicePointer(), &ccmaDelta1->getDevicePointer(), &ccmaConvergedDeviceMemory, &ccmaReducedMass->getDevicePointer(), &ccmaDelta1->getDevicePointer(), &ccmaConverged->getDevicePointer(),
tolPointer, &i}; tolPointer, &i};
void* multiplyArgs[] = {&ccmaDelta1->getDevicePointer(), &ccmaDelta2->getDevicePointer(), void* multiplyArgs[] = {&ccmaDelta1->getDevicePointer(), &ccmaDelta2->getDevicePointer(),
&ccmaConstraintMatrixColumn->getDevicePointer(), &ccmaConstraintMatrixValue->getDevicePointer(), &ccmaConvergedDeviceMemory, &i}; &ccmaConstraintMatrixColumn->getDevicePointer(), &ccmaConstraintMatrixValue->getDevicePointer(), &ccmaConverged->getDevicePointer(), &i};
void* updateArgs[] = {&ccmaNumAtomConstraints->getDevicePointer(), &ccmaAtomConstraints->getDevicePointer(), &ccmaDistance->getDevicePointer(), void* updateArgs[] = {&ccmaNumAtomConstraints->getDevicePointer(), &ccmaAtomConstraints->getDevicePointer(), &ccmaDistance->getDevicePointer(),
constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta->getDevicePointer(), constrainVelocities ? &context.getVelm().getDevicePointer() : &posDelta->getDevicePointer(),
&context.getVelm().getDevicePointer(), &ccmaDelta1->getDevicePointer(), &ccmaDelta2->getDevicePointer(), &context.getVelm().getDevicePointer(), &ccmaDelta1->getDevicePointer(), &ccmaDelta2->getDevicePointer(),
&ccmaConvergedDeviceMemory, &i}; &ccmaConverged->getDevicePointer(), &i};
const int checkInterval = 4; const int checkInterval = 4;
int* converged = (int*) context.getPinnedBuffer();
for (i = 0; i < 150; i++) { for (i = 0; i < 150; i++) {
if (i == 0) {
ccmaConvergedMemory[0] = 1;
ccmaConvergedMemory[1] = 0;
}
context.executeKernel(ccmaForceKernel, forceArgs, ccmaAtoms->getSize()); context.executeKernel(ccmaForceKernel, forceArgs, ccmaAtoms->getSize());
if ((i+1)%checkInterval == 0) if ((i+1)%checkInterval == 0) {
ccmaConverged->download(converged, false);
CHECK_RESULT2(cuEventRecord(ccmaEvent, 0), "Error recording event for CCMA"); CHECK_RESULT2(cuEventRecord(ccmaEvent, 0), "Error recording event for CCMA");
}
context.executeKernel(ccmaMultiplyKernel, multiplyArgs, ccmaAtoms->getSize()); context.executeKernel(ccmaMultiplyKernel, multiplyArgs, ccmaAtoms->getSize());
context.executeKernel(ccmaUpdateKernel, updateArgs, context.getNumAtoms()); context.executeKernel(ccmaUpdateKernel, updateArgs, context.getNumAtoms());
if ((i+1)%checkInterval == 0) { if ((i+1)%checkInterval == 0) {
CHECK_RESULT2(cuEventSynchronize(ccmaEvent), "Error synchronizing on event for CCMA"); CHECK_RESULT2(cuEventSynchronize(ccmaEvent), "Error synchronizing on event for CCMA");
if (ccmaConvergedMemory[i%2]) if (converged[i%2])
break; break;
} }
} }
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009-2012 Stanford University and the Authors. * * Portions copyright (c) 2009-2013 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -140,8 +140,7 @@ private: ...@@ -140,8 +140,7 @@ private:
CudaArray* ccmaConstraintMatrixValue; CudaArray* ccmaConstraintMatrixValue;
CudaArray* ccmaDelta1; CudaArray* ccmaDelta1;
CudaArray* ccmaDelta2; CudaArray* ccmaDelta2;
int* ccmaConvergedMemory; CudaArray* ccmaConverged;
CUdeviceptr ccmaConvergedDeviceMemory;
CUevent ccmaEvent; CUevent ccmaEvent;
CudaArray* vsite2AvgAtoms; CudaArray* vsite2AvgAtoms;
CudaArray* vsite2AvgWeights; CudaArray* vsite2AvgWeights;
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2008-2012 Stanford University and the Authors. * * Portions copyright (c) 2008-2013 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -1351,10 +1351,6 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() { ...@@ -1351,10 +1351,6 @@ CudaCalcNonbondedForceKernel::~CudaCalcNonbondedForceKernel() {
delete pmeBsplineModuliY; delete pmeBsplineModuliY;
if (pmeBsplineModuliZ != NULL) if (pmeBsplineModuliZ != NULL)
delete pmeBsplineModuliZ; delete pmeBsplineModuliZ;
if (pmeBsplineTheta != NULL)
delete pmeBsplineTheta;
if (pmeBsplineDTheta != NULL)
delete pmeBsplineDTheta;
if (pmeAtomRange != NULL) if (pmeAtomRange != NULL)
delete pmeAtomRange; delete pmeAtomRange;
if (pmeAtomGridIndex != NULL) if (pmeAtomGridIndex != NULL)
...@@ -1507,13 +1503,13 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1507,13 +1503,13 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
if (cu.getUseDoublePrecision()) if (cu.getUseDoublePrecision())
pmeDefines["USE_DOUBLE_PRECISION"] = "1"; pmeDefines["USE_DOUBLE_PRECISION"] = "1";
CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::pme, pmeDefines); CUmodule module = cu.createModule(CudaKernelSources::vectorOps+CudaKernelSources::pme, pmeDefines);
pmeUpdateBsplinesKernel = cu.getKernel(module, "updateBsplines"); pmeGridIndexKernel = cu.getKernel(module, "findAtomGridIndex");
pmeAtomRangeKernel = cu.getKernel(module, "findAtomRangeForGrid");
pmeSpreadChargeKernel = cu.getKernel(module, "gridSpreadCharge"); pmeSpreadChargeKernel = cu.getKernel(module, "gridSpreadCharge");
pmeConvolutionKernel = cu.getKernel(module, "reciprocalConvolution"); pmeConvolutionKernel = cu.getKernel(module, "reciprocalConvolution");
pmeInterpolateForceKernel = cu.getKernel(module, "gridInterpolateForce"); pmeInterpolateForceKernel = cu.getKernel(module, "gridInterpolateForce");
pmeEvalEnergyKernel = cu.getKernel(module, "gridEvaluateEnergy"); pmeEvalEnergyKernel = cu.getKernel(module, "gridEvaluateEnergy");
pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge"); pmeFinishSpreadChargeKernel = cu.getKernel(module, "finishSpreadCharge");
cuFuncSetCacheConfig(pmeSpreadChargeKernel, CU_FUNC_CACHE_PREFER_L1);
cuFuncSetCacheConfig(pmeInterpolateForceKernel, CU_FUNC_CACHE_PREFER_L1); cuFuncSetCacheConfig(pmeInterpolateForceKernel, CU_FUNC_CACHE_PREFER_L1);
// Create required data structures. // Create required data structures.
...@@ -1528,7 +1524,6 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon ...@@ -1528,7 +1524,6 @@ void CudaCalcNonbondedForceKernel::initialize(const System& system, const Nonbon
pmeBsplineModuliX = new CudaArray(cu, gridSizeX, elementSize, "pmeBsplineModuliX"); pmeBsplineModuliX = new CudaArray(cu, gridSizeX, elementSize, "pmeBsplineModuliX");
pmeBsplineModuliY = new CudaArray(cu, gridSizeY, elementSize, "pmeBsplineModuliY"); pmeBsplineModuliY = new CudaArray(cu, gridSizeY, elementSize, "pmeBsplineModuliY");
pmeBsplineModuliZ = new CudaArray(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ"); pmeBsplineModuliZ = new CudaArray(cu, gridSizeZ, elementSize, "pmeBsplineModuliZ");
pmeBsplineTheta = new CudaArray(cu, PmeOrder*numParticles, 4*elementSize, "pmeBsplineTheta");
pmeAtomRange = CudaArray::create<int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange"); pmeAtomRange = CudaArray::create<int>(cu, gridSizeX*gridSizeY*gridSizeZ+1, "pmeAtomRange");
pmeAtomGridIndex = CudaArray::create<int2>(cu, numParticles, "pmeAtomGridIndex"); pmeAtomGridIndex = CudaArray::create<int2>(cu, numParticles, "pmeAtomGridIndex");
sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms()); sort = new CudaSort(cu, new SortTrait(), cu.getNumAtoms());
...@@ -1659,20 +1654,14 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF ...@@ -1659,20 +1654,14 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
cu.executeKernel(ewaldForcesKernel, forcesArgs, cu.getNumAtoms()); cu.executeKernel(ewaldForcesKernel, forcesArgs, cu.getNumAtoms());
} }
if (directPmeGrid != NULL && cu.getContextIndex() == 0 && includeReciprocal) { if (directPmeGrid != NULL && cu.getContextIndex() == 0 && includeReciprocal) {
void* bsplinesArgs[] = {&cu.getPosq().getDevicePointer(), &pmeBsplineTheta->getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), void* gridIndexArgs[] = {&cu.getPosq().getDevicePointer(), &pmeAtomGridIndex->getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()}; cu.executeKernel(pmeGridIndexKernel, gridIndexArgs, cu.getNumAtoms());
int bsplinesSharedSize = cu.ThreadBlockSize*PmeOrder*(cu.getUseDoublePrecision() ? sizeof(double4) : sizeof(float4));
cu.executeKernel(pmeUpdateBsplinesKernel, bsplinesArgs, cu.getNumAtoms(), cu.ThreadBlockSize, bsplinesSharedSize);
sort->sort(*pmeAtomGridIndex); sort->sort(*pmeAtomGridIndex);
void* rangeArgs[] = {&pmeAtomGridIndex->getDevicePointer(), &pmeAtomRange->getDevicePointer(), &cu.getPosq().getDevicePointer(), void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &directPmeGrid->getDevicePointer(), cu.getPeriodicBoxSizePointer(),
cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()}; cu.getInvPeriodicBoxSizePointer(), &pmeAtomGridIndex->getDevicePointer()};
cu.executeKernel(pmeAtomRangeKernel, rangeArgs, cu.getNumAtoms()); cu.executeKernel(pmeSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), 128);
void* spreadArgs[] = {&cu.getPosq().getDevicePointer(), &directPmeGrid->getDevicePointer(), &pmeBsplineTheta->getDevicePointer(), cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()};
cu.executeKernel(pmeSpreadChargeKernel, spreadArgs, cu.getNumAtoms(), PmeOrder*PmeOrder*PmeOrder);
void* finishSpreadArgs[] = {&directPmeGrid->getDevicePointer()};
if (cu.getUseDoublePrecision() || cu.getComputeCapability() < 2.0) { if (cu.getUseDoublePrecision() || cu.getComputeCapability() < 2.0) {
void* finishSpreadArgs[] = {&directPmeGrid->getDevicePointer()}; void* finishSpreadArgs[] = {&directPmeGrid->getDevicePointer()};
...@@ -1699,8 +1688,8 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF ...@@ -1699,8 +1688,8 @@ double CudaCalcNonbondedForceKernel::execute(ContextImpl& context, bool includeF
void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &directPmeGrid->getDevicePointer(), void* interpolateArgs[] = {&cu.getPosq().getDevicePointer(), &cu.getForce().getDevicePointer(), &directPmeGrid->getDevicePointer(),
cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer()}; cu.getPeriodicBoxSizePointer(), cu.getInvPeriodicBoxSizePointer(), &pmeAtomGridIndex->getDevicePointer()};
cu.executeKernel(pmeInterpolateForceKernel, interpolateArgs, cu.getNumAtoms()); cu.executeKernel(pmeInterpolateForceKernel, interpolateArgs, cu.getNumAtoms(), 128);
} }
double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0); double energy = (includeReciprocal ? ewaldSelfEnergy : 0.0);
...@@ -2071,6 +2060,14 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor ...@@ -2071,6 +2060,14 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms()); defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks()); defines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks());
defines["FORCE_WORK_GROUP_SIZE"] = cu.intToString(nb.getForceThreadBlockSize()); defines["FORCE_WORK_GROUP_SIZE"] = cu.intToString(nb.getForceThreadBlockSize());
defines["TILE_SIZE"] = cu.intToString(CudaContext::TileSize);
int numExclusionTiles = nb.getExclusionTiles().getSize();
defines["NUM_TILES_WITH_EXCLUSIONS"] = cu.intToString(numExclusionTiles);
int numContexts = cu.getPlatformData().contexts.size();
int startExclusionIndex = cu.getContextIndex()*numExclusionTiles/numContexts;
int endExclusionIndex = (cu.getContextIndex()+1)*numExclusionTiles/numContexts;
defines["FIRST_EXCLUSION_TILE"] = cu.intToString(startExclusionIndex);
defines["LAST_EXCLUSION_TILE"] = cu.intToString(endExclusionIndex);
map<string, string> replacements; map<string, string> replacements;
CUmodule module = cu.createModule(CudaKernelSources::vectorOps+cu.replaceStrings(CudaKernelSources::gbsaObc1, replacements), defines); CUmodule module = cu.createModule(CudaKernelSources::vectorOps+cu.replaceStrings(CudaKernelSources::gbsaObc1, replacements), defines);
computeBornSumKernel = cu.getKernel(module, "computeBornSum"); computeBornSumKernel = cu.getKernel(module, "computeBornSum");
...@@ -2083,12 +2080,12 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor ...@@ -2083,12 +2080,12 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
computeSumArgs.push_back(cu.getPeriodicBoxSizePointer()); computeSumArgs.push_back(cu.getPeriodicBoxSizePointer());
computeSumArgs.push_back(cu.getInvPeriodicBoxSizePointer()); computeSumArgs.push_back(cu.getInvPeriodicBoxSizePointer());
computeSumArgs.push_back(&maxTiles); computeSumArgs.push_back(&maxTiles);
computeSumArgs.push_back(&nb.getInteractionFlags().getDevicePointer()); computeSumArgs.push_back(&nb.getBlockCenters().getDevicePointer());
computeSumArgs.push_back(&nb.getInteractingAtoms().getDevicePointer());
} }
else else
computeSumArgs.push_back(&maxTiles); computeSumArgs.push_back(&maxTiles);
computeSumArgs.push_back(&nb.getExclusionIndices().getDevicePointer()); computeSumArgs.push_back(&nb.getExclusionTiles().getDevicePointer());
computeSumArgs.push_back(&nb.getExclusionRowIndices().getDevicePointer());
force1Kernel = cu.getKernel(module, "computeGBSAForce1"); force1Kernel = cu.getKernel(module, "computeGBSAForce1");
force1Args.push_back(&cu.getForce().getDevicePointer()); force1Args.push_back(&cu.getForce().getDevicePointer());
force1Args.push_back(&bornForce->getDevicePointer()); force1Args.push_back(&bornForce->getDevicePointer());
...@@ -2101,12 +2098,12 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor ...@@ -2101,12 +2098,12 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
force1Args.push_back(cu.getPeriodicBoxSizePointer()); force1Args.push_back(cu.getPeriodicBoxSizePointer());
force1Args.push_back(cu.getInvPeriodicBoxSizePointer()); force1Args.push_back(cu.getInvPeriodicBoxSizePointer());
force1Args.push_back(&maxTiles); force1Args.push_back(&maxTiles);
force1Args.push_back(&nb.getInteractionFlags().getDevicePointer()); force1Args.push_back(&nb.getBlockCenters().getDevicePointer());
force1Args.push_back(&nb.getInteractingAtoms().getDevicePointer());
} }
else else
force1Args.push_back(&maxTiles); force1Args.push_back(&maxTiles);
force1Args.push_back(&nb.getExclusionIndices().getDevicePointer()); force1Args.push_back(&nb.getExclusionTiles().getDevicePointer());
force1Args.push_back(&nb.getExclusionRowIndices().getDevicePointer());
reduceBornSumKernel = cu.getKernel(module, "reduceBornSum"); reduceBornSumKernel = cu.getKernel(module, "reduceBornSum");
reduceBornForceKernel = cu.getKernel(module, "reduceBornForce"); reduceBornForceKernel = cu.getKernel(module, "reduceBornForce");
} }
...@@ -2115,8 +2112,8 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor ...@@ -2115,8 +2112,8 @@ double CudaCalcGBSAOBCForceKernel::execute(ContextImpl& context, bool includeFor
maxTiles = nb.getInteractingTiles().getSize(); maxTiles = nb.getInteractingTiles().getSize();
computeSumArgs[3] = &nb.getInteractingTiles().getDevicePointer(); computeSumArgs[3] = &nb.getInteractingTiles().getDevicePointer();
force1Args[5] = &nb.getInteractingTiles().getDevicePointer(); force1Args[5] = &nb.getInteractingTiles().getDevicePointer();
computeSumArgs[8] = &nb.getInteractionFlags().getDevicePointer(); computeSumArgs[9] = &nb.getInteractingAtoms().getDevicePointer();
force1Args[10] = &nb.getInteractionFlags().getDevicePointer(); force1Args[11] = &nb.getInteractingAtoms().getDevicePointer();
} }
} }
cu.executeKernel(computeBornSumKernel, &computeSumArgs[0], nb.getNumForceThreadBlocks()*nb.getForceThreadBlockSize(), nb.getForceThreadBlockSize()); cu.executeKernel(computeBornSumKernel, &computeSumArgs[0], nb.getNumForceThreadBlocks()*nb.getForceThreadBlockSize(), nb.getForceThreadBlockSize());
...@@ -2244,16 +2241,17 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG ...@@ -2244,16 +2241,17 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
// Record parameters and exclusions. // Record parameters and exclusions.
int numParticles = force.getNumParticles(); int numParticles = force.getNumParticles();
params = new CudaParameterSet(cu, force.getNumPerParticleParameters(), numParticles, "customGBParameters", true); int paddedNumParticles = cu.getPaddedNumAtoms();
computedValues = new CudaParameterSet(cu, force.getNumComputedValues(), numParticles, "customGBComputedValues", true, cu.getUseDoublePrecision()); int numParams = force.getNumPerParticleParameters();
params = new CudaParameterSet(cu, force.getNumPerParticleParameters(), paddedNumParticles, "customGBParameters", true);
computedValues = new CudaParameterSet(cu, force.getNumComputedValues(), paddedNumParticles, "customGBComputedValues", true, cu.getUseDoublePrecision());
if (force.getNumGlobalParameters() > 0) if (force.getNumGlobalParameters() > 0)
globals = CudaArray::create<float>(cu, force.getNumGlobalParameters(), "customGBGlobals"); globals = CudaArray::create<float>(cu, force.getNumGlobalParameters(), "customGBGlobals");
vector<vector<float> > paramVector(numParticles); vector<vector<float> > paramVector(paddedNumParticles, vector<float>(numParams, 0));
vector<vector<int> > exclusionList(numParticles); vector<vector<int> > exclusionList(numParticles);
for (int i = 0; i < numParticles; i++) { for (int i = 0; i < numParticles; i++) {
vector<double> parameters; vector<double> parameters;
force.getParticleParameters(i, parameters); force.getParticleParameters(i, parameters);
paramVector[i].resize(parameters.size());
for (int j = 0; j < (int) parameters.size(); j++) for (int j = 0; j < (int) parameters.size(); j++)
paramVector[i][j] = (float) parameters[j]; paramVector[i][j] = (float) parameters[j];
exclusionList[i].push_back(i); exclusionList[i].push_back(i);
...@@ -2406,23 +2404,22 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG ...@@ -2406,23 +2404,22 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
replacements["LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"] = loadLocal2.str(); replacements["LOAD_LOCAL_PARAMETERS_FROM_GLOBAL"] = loadLocal2.str();
replacements["LOAD_ATOM1_PARAMETERS"] = load1.str(); replacements["LOAD_ATOM1_PARAMETERS"] = load1.str();
replacements["LOAD_ATOM2_PARAMETERS"] = load2.str(); replacements["LOAD_ATOM2_PARAMETERS"] = load2.str();
map<string, string> defines;
if (useCutoff) if (useCutoff)
defines["USE_CUTOFF"] = "1"; pairValueDefines["USE_CUTOFF"] = "1";
if (usePeriodic) if (usePeriodic)
defines["USE_PERIODIC"] = "1"; pairValueDefines["USE_PERIODIC"] = "1";
if (useExclusionsForValue) if (useExclusionsForValue)
defines["USE_EXCLUSIONS"] = "1"; pairValueDefines["USE_EXCLUSIONS"] = "1";
if (atomParamSize%2 == 0 && !cu.getUseDoublePrecision()) if (atomParamSize%2 == 0 && !cu.getUseDoublePrecision())
defines["NEED_PADDING"] = "1"; pairValueDefines["NEED_PADDING"] = "1";
defines["WARPS_PER_GROUP"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize()/CudaContext::TileSize); pairValueDefines["WARPS_PER_GROUP"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize()/CudaContext::TileSize);
defines["THREAD_BLOCK_SIZE"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize()); pairValueDefines["THREAD_BLOCK_SIZE"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize());
defines["CUTOFF_SQUARED"] = cu.doubleToString(force.getCutoffDistance()*force.getCutoffDistance()); pairValueDefines["CUTOFF_SQUARED"] = cu.doubleToString(force.getCutoffDistance()*force.getCutoffDistance());
defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms()); pairValueDefines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms()); pairValueDefines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks()); pairValueDefines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks());
CUmodule module = cu.createModule(CudaKernelSources::vectorOps+cu.replaceStrings(CudaKernelSources::customGBValueN2, replacements), defines); pairValueDefines["TILE_SIZE"] = cu.intToString(CudaContext::TileSize);
pairValueKernel = cu.getKernel(module, "computeN2Value"); pairValueSrc = cu.replaceStrings(CudaKernelSources::customGBValueN2, replacements);
if (useExclusionsForValue) if (useExclusionsForValue)
cu.getNonbondedUtilities().requestExclusions(exclusionList); cu.getNonbondedUtilities().requestExclusions(exclusionList);
} }
...@@ -2574,23 +2571,22 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG ...@@ -2574,23 +2571,22 @@ void CudaCalcCustomGBForceKernel::initialize(const System& system, const CustomG
replacements["RECORD_DERIVATIVE_2"] = recordDeriv.str(); replacements["RECORD_DERIVATIVE_2"] = recordDeriv.str();
replacements["STORE_DERIVATIVES_1"] = storeDerivs1.str(); replacements["STORE_DERIVATIVES_1"] = storeDerivs1.str();
replacements["STORE_DERIVATIVES_2"] = storeDerivs2.str(); replacements["STORE_DERIVATIVES_2"] = storeDerivs2.str();
map<string, string> defines;
if (useCutoff) if (useCutoff)
defines["USE_CUTOFF"] = "1"; pairEnergyDefines["USE_CUTOFF"] = "1";
if (usePeriodic) if (usePeriodic)
defines["USE_PERIODIC"] = "1"; pairEnergyDefines["USE_PERIODIC"] = "1";
if (anyExclusions) if (anyExclusions)
defines["USE_EXCLUSIONS"] = "1"; pairEnergyDefines["USE_EXCLUSIONS"] = "1";
if (atomParamSize%2 == 0 && !cu.getUseDoublePrecision()) if (atomParamSize%2 == 0 && !cu.getUseDoublePrecision())
defines["NEED_PADDING"] = "1"; pairEnergyDefines["NEED_PADDING"] = "1";
defines["THREAD_BLOCK_SIZE"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize()); pairEnergyDefines["THREAD_BLOCK_SIZE"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize());
defines["WARPS_PER_GROUP"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize()/CudaContext::TileSize); pairEnergyDefines["WARPS_PER_GROUP"] = cu.intToString(cu.getNonbondedUtilities().getForceThreadBlockSize()/CudaContext::TileSize);
defines["CUTOFF_SQUARED"] = cu.doubleToString(force.getCutoffDistance()*force.getCutoffDistance()); pairEnergyDefines["CUTOFF_SQUARED"] = cu.doubleToString(force.getCutoffDistance()*force.getCutoffDistance());
defines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms()); pairEnergyDefines["NUM_ATOMS"] = cu.intToString(cu.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms()); pairEnergyDefines["PADDED_NUM_ATOMS"] = cu.intToString(cu.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks()); pairEnergyDefines["NUM_BLOCKS"] = cu.intToString(cu.getNumAtomBlocks());
CUmodule module = cu.createModule(CudaKernelSources::vectorOps+cu.replaceStrings(CudaKernelSources::customGBEnergyN2, replacements), defines); pairEnergyDefines["TILE_SIZE"] = cu.intToString(CudaContext::TileSize);
pairEnergyKernel = cu.getKernel(module, "computeN2Energy"); pairEnergySrc = cu.replaceStrings(CudaKernelSources::customGBEnergyN2, replacements);
} }
{ {
// Create the kernel to reduce the derivatives and calculate per-particle energy terms. // Create the kernel to reduce the derivatives and calculate per-particle energy terms.
...@@ -2834,14 +2830,46 @@ double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeFo ...@@ -2834,14 +2830,46 @@ double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeFo
CudaNonbondedUtilities& nb = cu.getNonbondedUtilities(); CudaNonbondedUtilities& nb = cu.getNonbondedUtilities();
if (!hasInitializedKernels) { if (!hasInitializedKernels) {
hasInitializedKernels = true; hasInitializedKernels = true;
// These two kernels can't be compiled in initialize(), because the nonbonded utilities object
// has not yet been initialized then.
{
int numExclusionTiles = cu.getNonbondedUtilities().getExclusionTiles().getSize();
pairValueDefines["NUM_TILES_WITH_EXCLUSIONS"] = cu.intToString(numExclusionTiles);
int numContexts = cu.getPlatformData().contexts.size();
int startExclusionIndex = cu.getContextIndex()*numExclusionTiles/numContexts;
int endExclusionIndex = (cu.getContextIndex()+1)*numExclusionTiles/numContexts;
pairValueDefines["FIRST_EXCLUSION_TILE"] = cu.intToString(startExclusionIndex);
pairValueDefines["LAST_EXCLUSION_TILE"] = cu.intToString(endExclusionIndex);
CUmodule module = cu.createModule(CudaKernelSources::vectorOps+pairValueSrc, pairValueDefines);
pairValueKernel = cu.getKernel(module, "computeN2Value");
pairValueSrc = "";
pairValueDefines.clear();
}
{
int numExclusionTiles = cu.getNonbondedUtilities().getExclusionTiles().getSize();
pairEnergyDefines["NUM_TILES_WITH_EXCLUSIONS"] = cu.intToString(numExclusionTiles);
int numContexts = cu.getPlatformData().contexts.size();
int startExclusionIndex = cu.getContextIndex()*numExclusionTiles/numContexts;
int endExclusionIndex = (cu.getContextIndex()+1)*numExclusionTiles/numContexts;
pairEnergyDefines["FIRST_EXCLUSION_TILE"] = cu.intToString(startExclusionIndex);
pairEnergyDefines["LAST_EXCLUSION_TILE"] = cu.intToString(endExclusionIndex);
CUmodule module = cu.createModule(CudaKernelSources::vectorOps+pairEnergySrc, pairEnergyDefines);
pairEnergyKernel = cu.getKernel(module, "computeN2Energy");
pairEnergySrc = "";
pairEnergyDefines.clear();
}
// Set arguments for kernels.
maxTiles = (nb.getUseCutoff() ? nb.getInteractingTiles().getSize() : cu.getNumAtomBlocks()*(cu.getNumAtomBlocks()+1)/2); maxTiles = (nb.getUseCutoff() ? nb.getInteractingTiles().getSize() : cu.getNumAtomBlocks()*(cu.getNumAtomBlocks()+1)/2);
valueBuffers = CudaArray::create<long long>(cu, cu.getPaddedNumAtoms(), "customGBValueBuffers"); valueBuffers = CudaArray::create<long long>(cu, cu.getPaddedNumAtoms(), "customGBValueBuffers");
cu.addAutoclearBuffer(*valueBuffers); cu.addAutoclearBuffer(*valueBuffers);
cu.clearBuffer(valueBuffers->getDevicePointer(), sizeof(long long)*valueBuffers->getSize()); cu.clearBuffer(valueBuffers->getDevicePointer(), sizeof(long long)*valueBuffers->getSize());
pairValueArgs.push_back(&cu.getPosq().getDevicePointer()); pairValueArgs.push_back(&cu.getPosq().getDevicePointer());
pairValueArgs.push_back(&cu.getNonbondedUtilities().getExclusions().getDevicePointer()); pairValueArgs.push_back(&cu.getNonbondedUtilities().getExclusions().getDevicePointer());
pairValueArgs.push_back(&cu.getNonbondedUtilities().getExclusionIndices().getDevicePointer()); pairValueArgs.push_back(&cu.getNonbondedUtilities().getExclusionTiles().getDevicePointer());
pairValueArgs.push_back(&cu.getNonbondedUtilities().getExclusionRowIndices().getDevicePointer());
pairValueArgs.push_back(&valueBuffers->getDevicePointer()); pairValueArgs.push_back(&valueBuffers->getDevicePointer());
if (nb.getUseCutoff()) { if (nb.getUseCutoff()) {
pairValueArgs.push_back(&nb.getInteractingTiles().getDevicePointer()); pairValueArgs.push_back(&nb.getInteractingTiles().getDevicePointer());
...@@ -2849,7 +2877,8 @@ double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeFo ...@@ -2849,7 +2877,8 @@ double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeFo
pairValueArgs.push_back(cu.getPeriodicBoxSizePointer()); pairValueArgs.push_back(cu.getPeriodicBoxSizePointer());
pairValueArgs.push_back(cu.getInvPeriodicBoxSizePointer()); pairValueArgs.push_back(cu.getInvPeriodicBoxSizePointer());
pairValueArgs.push_back(&maxTiles); pairValueArgs.push_back(&maxTiles);
pairValueArgs.push_back(&nb.getInteractionFlags().getDevicePointer()); pairValueArgs.push_back(&nb.getBlockCenters().getDevicePointer());
pairValueArgs.push_back(&nb.getInteractingAtoms().getDevicePointer());
} }
else else
pairValueArgs.push_back(&maxTiles); pairValueArgs.push_back(&maxTiles);
...@@ -2881,15 +2910,15 @@ double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeFo ...@@ -2881,15 +2910,15 @@ double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeFo
pairEnergyArgs.push_back(&cu.getEnergyBuffer().getDevicePointer()); pairEnergyArgs.push_back(&cu.getEnergyBuffer().getDevicePointer());
pairEnergyArgs.push_back(&cu.getPosq().getDevicePointer()); pairEnergyArgs.push_back(&cu.getPosq().getDevicePointer());
pairEnergyArgs.push_back(&cu.getNonbondedUtilities().getExclusions().getDevicePointer()); pairEnergyArgs.push_back(&cu.getNonbondedUtilities().getExclusions().getDevicePointer());
pairEnergyArgs.push_back(&cu.getNonbondedUtilities().getExclusionIndices().getDevicePointer()); pairEnergyArgs.push_back(&cu.getNonbondedUtilities().getExclusionTiles().getDevicePointer());
pairEnergyArgs.push_back(&cu.getNonbondedUtilities().getExclusionRowIndices().getDevicePointer());
if (nb.getUseCutoff()) { if (nb.getUseCutoff()) {
pairEnergyArgs.push_back(&nb.getInteractingTiles().getDevicePointer()); pairEnergyArgs.push_back(&nb.getInteractingTiles().getDevicePointer());
pairEnergyArgs.push_back(&nb.getInteractionCount().getDevicePointer()); pairEnergyArgs.push_back(&nb.getInteractionCount().getDevicePointer());
pairEnergyArgs.push_back(cu.getPeriodicBoxSizePointer()); pairEnergyArgs.push_back(cu.getPeriodicBoxSizePointer());
pairEnergyArgs.push_back(cu.getInvPeriodicBoxSizePointer()); pairEnergyArgs.push_back(cu.getInvPeriodicBoxSizePointer());
pairEnergyArgs.push_back(&maxTiles); pairEnergyArgs.push_back(&maxTiles);
pairEnergyArgs.push_back(&nb.getInteractionFlags().getDevicePointer()); pairEnergyArgs.push_back(&nb.getBlockCenters().getDevicePointer());
pairEnergyArgs.push_back(&nb.getInteractingAtoms().getDevicePointer());
} }
else else
pairEnergyArgs.push_back(&maxTiles); pairEnergyArgs.push_back(&maxTiles);
...@@ -2953,10 +2982,10 @@ double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeFo ...@@ -2953,10 +2982,10 @@ double CudaCalcCustomGBForceKernel::execute(ContextImpl& context, bool includeFo
if (nb.getUseCutoff()) { if (nb.getUseCutoff()) {
if (maxTiles < nb.getInteractingTiles().getSize()) { if (maxTiles < nb.getInteractingTiles().getSize()) {
maxTiles = nb.getInteractingTiles().getSize(); maxTiles = nb.getInteractingTiles().getSize();
pairValueArgs[5] = &nb.getInteractingTiles().getDevicePointer(); pairValueArgs[4] = &nb.getInteractingTiles().getDevicePointer();
pairEnergyArgs[6] = &nb.getInteractingTiles().getDevicePointer(); pairEnergyArgs[5] = &nb.getInteractingTiles().getDevicePointer();
pairValueArgs[10] = &nb.getInteractionFlags().getDevicePointer(); pairValueArgs[10] = &nb.getInteractingAtoms().getDevicePointer();
pairEnergyArgs[11] = &nb.getInteractionFlags().getDevicePointer(); pairEnergyArgs[11] = &nb.getInteractingAtoms().getDevicePointer();
} }
} }
cu.executeKernel(pairValueKernel, &pairValueArgs[0], nb.getNumForceThreadBlocks()*nb.getForceThreadBlockSize(), nb.getForceThreadBlockSize()); cu.executeKernel(pairValueKernel, &pairValueArgs[0], nb.getNumForceThreadBlocks()*nb.getForceThreadBlockSize(), nb.getForceThreadBlockSize());
...@@ -2976,11 +3005,10 @@ void CudaCalcCustomGBForceKernel::copyParametersToContext(ContextImpl& context, ...@@ -2976,11 +3005,10 @@ void CudaCalcCustomGBForceKernel::copyParametersToContext(ContextImpl& context,
// Record the per-particle parameters. // Record the per-particle parameters.
vector<vector<float> > paramVector(numParticles); vector<vector<float> > paramVector(cu.getPaddedNumAtoms(), vector<float>(force.getNumPerParticleParameters(), 0));
vector<double> parameters; vector<double> parameters;
for (int i = 0; i < numParticles; i++) { for (int i = 0; i < numParticles; i++) {
force.getParticleParameters(i, parameters); force.getParticleParameters(i, parameters);
paramVector[i].resize(parameters.size());
for (int j = 0; j < (int) parameters.size(); j++) for (int j = 0; j < (int) parameters.size(); j++)
paramVector[i][j] = (float) parameters[j]; paramVector[i][j] = (float) parameters[j];
} }
......
...@@ -557,8 +557,7 @@ class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel { ...@@ -557,8 +557,7 @@ class CudaCalcNonbondedForceKernel : public CalcNonbondedForceKernel {
public: public:
CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcNonbondedForceKernel(name, platform), CudaCalcNonbondedForceKernel(std::string name, const Platform& platform, CudaContext& cu, System& system) : CalcNonbondedForceKernel(name, platform),
cu(cu), hasInitializedFFT(false), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), directPmeGrid(NULL), reciprocalPmeGrid(NULL), cu(cu), hasInitializedFFT(false), sigmaEpsilon(NULL), exceptionParams(NULL), cosSinSums(NULL), directPmeGrid(NULL), reciprocalPmeGrid(NULL),
pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeBsplineTheta(NULL), pmeBsplineDTheta(NULL), pmeBsplineModuliX(NULL), pmeBsplineModuliY(NULL), pmeBsplineModuliZ(NULL), pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL) {
pmeAtomRange(NULL), pmeAtomGridIndex(NULL), sort(NULL) {
} }
~CudaCalcNonbondedForceKernel(); ~CudaCalcNonbondedForceKernel();
/** /**
...@@ -607,8 +606,6 @@ private: ...@@ -607,8 +606,6 @@ private:
CudaArray* pmeBsplineModuliX; CudaArray* pmeBsplineModuliX;
CudaArray* pmeBsplineModuliY; CudaArray* pmeBsplineModuliY;
CudaArray* pmeBsplineModuliZ; CudaArray* pmeBsplineModuliZ;
CudaArray* pmeBsplineTheta;
CudaArray* pmeBsplineDTheta;
CudaArray* pmeAtomRange; CudaArray* pmeAtomRange;
CudaArray* pmeAtomGridIndex; CudaArray* pmeAtomGridIndex;
CudaSort* sort; CudaSort* sort;
...@@ -617,9 +614,6 @@ private: ...@@ -617,9 +614,6 @@ private:
CUfunction ewaldSumsKernel; CUfunction ewaldSumsKernel;
CUfunction ewaldForcesKernel; CUfunction ewaldForcesKernel;
CUfunction pmeGridIndexKernel; CUfunction pmeGridIndexKernel;
CUfunction pmeAtomRangeKernel;
CUfunction pmeZIndexKernel;
CUfunction pmeUpdateBsplinesKernel;
CUfunction pmeSpreadChargeKernel; CUfunction pmeSpreadChargeKernel;
CUfunction pmeFinishSpreadChargeKernel; CUfunction pmeFinishSpreadChargeKernel;
CUfunction pmeEvalEnergyKernel; CUfunction pmeEvalEnergyKernel;
...@@ -776,6 +770,8 @@ private: ...@@ -776,6 +770,8 @@ private:
System& system; System& system;
CUfunction pairValueKernel, perParticleValueKernel, pairEnergyKernel, perParticleEnergyKernel, gradientChainRuleKernel; CUfunction pairValueKernel, perParticleValueKernel, pairEnergyKernel, perParticleEnergyKernel, gradientChainRuleKernel;
std::vector<void*> pairValueArgs, perParticleValueArgs, pairEnergyArgs, perParticleEnergyArgs, gradientChainRuleArgs; std::vector<void*> pairValueArgs, perParticleValueArgs, pairEnergyArgs, perParticleEnergyArgs, gradientChainRuleArgs;
std::string pairValueSrc, pairEnergySrc;
std::map<std::string, std::string> pairValueDefines, pairEnergyDefines;
}; };
/** /**
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009-2012 Stanford University and the Authors. * * Portions copyright (c) 2009-2013 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -29,6 +29,8 @@ ...@@ -29,6 +29,8 @@
#include "CudaArray.h" #include "CudaArray.h"
#include "CudaKernelSources.h" #include "CudaKernelSources.h"
#include "CudaExpressionUtilities.h" #include "CudaExpressionUtilities.h"
#include "CudaSort.h"
#include <algorithm>
#include <map> #include <map>
#include <set> #include <set>
#include <utility> #include <utility>
...@@ -43,15 +45,33 @@ using namespace std; ...@@ -43,15 +45,33 @@ using namespace std;
throw OpenMMException(m.str());\ throw OpenMMException(m.str());\
} }
CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(context), cutoff(-1.0), useCutoff(false), anyExclusions(false),
exclusionIndices(NULL), exclusionRowIndices(NULL), exclusions(NULL), interactingTiles(NULL), interactionFlags(NULL), class CudaNonbondedUtilities::BlockSortTrait : public CudaSort::SortTrait {
interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), nonbondedForceGroup(0) { public:
// BlockSortTrait describes the elements handled by the CudaSort that
// initialize() creates as blockSorter and that prepareInteractions() runs on
// sortedBlocks: each element is a "real2" and the sort key is its x component.
// NOTE(review): the returned strings look like source fragments consumed by
// CudaSort when it builds its kernels, with real/real2 presumably typedef'd
// per precision - confirm against CudaSort.
//
// useDouble: true reports double/double2 sizes, false reports float/float2.
BlockSortTrait(bool useDouble) : useDouble(useDouble) {
}
// Size in bytes of one sorted element (double2 or float2).
int getDataSize() const {return useDouble ? sizeof(double2) : sizeof(float2);}
// Size in bytes of one sort key (double or float).
int getKeySize() const {return useDouble ? sizeof(double) : sizeof(float);}
// Type names of the element and of its key.
const char* getDataType() const {return "real2";}
const char* getKeyType() const {return "real";}
// Smallest and largest key, written as float literals in both precisions.
// NOTE(review): float-range limits are used even when useDouble is true;
// confirm the keys can never fall outside that range.
const char* getMinKey() const {return "-3.40282e+38f";}
const char* getMaxKey() const {return "3.40282e+38f";}
// Expression producing an element whose two components both hold the maximum key.
const char* getMaxValue() const {return "make_real2(3.40282e+38f, 3.40282e+38f)";}
// Expression that extracts the key from an element named "value".
const char* getSortKey() const {return "value.x";}
private:
// Whether sizes are reported for double precision (true) or single precision (false).
bool useDouble;
};
/**
 * Construct the nonbonded utilities for a context.
 *
 * No device memory is allocated here: every CudaArray pointer and the block
 * sorter start out NULL, the cutoff is -1.0, cutoffs and exclusions are off,
 * usePadding is on and the nonbonded force group is 0. The body only chooses
 * the launch configuration stored in numForceThreadBlocks and
 * forceThreadBlockSize.
 *
 * @param context  the CudaContext whose device is queried; also stored in the context member
 */
CudaNonbondedUtilities::CudaNonbondedUtilities(CudaContext& context) : context(context), cutoff(-1.0), useCutoff(false), anyExclusions(false), usePadding(true),
exclusionIndices(NULL), exclusionRowIndices(NULL), exclusionTiles(NULL), exclusions(NULL), interactingTiles(NULL), interactingAtoms(NULL),
interactionCount(NULL), blockCenter(NULL), blockBoundingBox(NULL), sortedBlocks(NULL), sortedBlockCenter(NULL), sortedBlockBoundingBox(NULL),
oldPositions(NULL), rebuildNeighborList(NULL), blockSorter(NULL), nonbondedForceGroup(0) {
// Decide how many thread blocks to use. // Decide how many thread blocks to use.
// errorMessage is not referenced directly below.
// NOTE(review): it is presumably picked up by the CHECK_RESULT macro, whose visible tail throws an OpenMMException - confirm.
string errorMessage = "Error initializing nonbonded utilities"; string errorMessage = "Error initializing nonbonded utilities";
int multiprocessors; int multiprocessors;
// Ask the driver how many multiprocessors the context's device has.
CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, context.getDevice())); CHECK_RESULT(cuDeviceGetAttribute(&multiprocessors, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, context.getDevice()));
// The number of thread blocks is a fixed multiple of the multiprocessor count.
numForceThreadBlocks = 3*multiprocessors; numForceThreadBlocks = 4*multiprocessors;
// Devices with compute capability below 2.0 get 128 threads per block, all others 256.
forceThreadBlockSize = (context.getComputeCapability() < 2.0 ? 128 : 256); forceThreadBlockSize = (context.getComputeCapability() < 2.0 ? 128 : 256);
} }
...@@ -60,18 +80,32 @@ CudaNonbondedUtilities::~CudaNonbondedUtilities() { ...@@ -60,18 +80,32 @@ CudaNonbondedUtilities::~CudaNonbondedUtilities() {
delete exclusionIndices; delete exclusionIndices;
if (exclusionRowIndices != NULL) if (exclusionRowIndices != NULL)
delete exclusionRowIndices; delete exclusionRowIndices;
if (exclusionTiles != NULL)
delete exclusionTiles;
if (exclusions != NULL) if (exclusions != NULL)
delete exclusions; delete exclusions;
if (interactingTiles != NULL) if (interactingTiles != NULL)
delete interactingTiles; delete interactingTiles;
if (interactionFlags != NULL) if (interactingAtoms != NULL)
delete interactionFlags; delete interactingAtoms;
if (interactionCount != NULL) if (interactionCount != NULL)
delete interactionCount; delete interactionCount;
if (blockCenter != NULL) if (blockCenter != NULL)
delete blockCenter; delete blockCenter;
if (blockBoundingBox != NULL) if (blockBoundingBox != NULL)
delete blockBoundingBox; delete blockBoundingBox;
if (sortedBlocks != NULL)
delete sortedBlocks;
if (sortedBlockCenter != NULL)
delete sortedBlockCenter;
if (sortedBlockBoundingBox != NULL)
delete sortedBlockBoundingBox;
if (oldPositions != NULL)
delete oldPositions;
if (rebuildNeighborList != NULL)
delete rebuildNeighborList;
if (blockSorter != NULL)
delete blockSorter;
} }
void CudaNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup) { void CudaNonbondedUtilities::addInteraction(bool usesCutoff, bool usesPeriodic, bool usesExclusions, double cutoffDistance, const vector<vector<int> >& exclusionList, const string& kernel, int forceGroup) {
...@@ -124,6 +158,10 @@ void CudaNonbondedUtilities::requestExclusions(const vector<vector<int> >& exclu ...@@ -124,6 +158,10 @@ void CudaNonbondedUtilities::requestExclusions(const vector<vector<int> >& exclu
} }
} }
/**
 * Ordering predicate for ushort2 values: compare the y components first and
 * fall back to the x components when the y components are equal.
 *
 * @return true if a should be placed before b
 */
static bool compareUshort2(ushort2 a, ushort2 b) {
    if (a.y != b.y)
        return a.y < b.y;
    return a.x < b.x;
}
void CudaNonbondedUtilities::initialize(const System& system) { void CudaNonbondedUtilities::initialize(const System& system) {
string errorMessage = "Error initializing nonbonded utilities"; string errorMessage = "Error initializing nonbonded utilities";
if (atomExclusions.size() == 0) { if (atomExclusions.size() == 0) {
...@@ -138,13 +176,10 @@ void CudaNonbondedUtilities::initialize(const System& system) { ...@@ -138,13 +176,10 @@ void CudaNonbondedUtilities::initialize(const System& system) {
numAtoms = context.getNumAtoms(); numAtoms = context.getNumAtoms();
int numAtomBlocks = context.getNumAtomBlocks(); int numAtomBlocks = context.getNumAtomBlocks();
int totalTiles = numAtomBlocks*(numAtomBlocks+1)/2;
int numContexts = context.getPlatformData().contexts.size(); int numContexts = context.getPlatformData().contexts.size();
startTileIndex = context.getContextIndex()*totalTiles/numContexts; setAtomBlockRange(context.getContextIndex()/(double) numContexts, (context.getContextIndex()+1)/(double) numContexts);
int endTileIndex = (context.getContextIndex()+1)*totalTiles/numContexts;
numTiles = endTileIndex-startTileIndex;
// Build a list of indices for the tiles with exclusions. // Build a list of tiles that contain exclusions.
set<pair<int, int> > tilesWithExclusions; set<pair<int, int> > tilesWithExclusions;
for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) { for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) {
...@@ -155,19 +190,29 @@ void CudaNonbondedUtilities::initialize(const System& system) { ...@@ -155,19 +190,29 @@ void CudaNonbondedUtilities::initialize(const System& system) {
tilesWithExclusions.insert(make_pair(max(x, y), min(x, y))); tilesWithExclusions.insert(make_pair(max(x, y), min(x, y)));
} }
} }
if (context.getPaddedNumAtoms() > context.getNumAtoms()) { vector<ushort2> exclusionTilesVec;
for (int i = 0; i < numAtomBlocks; ++i) for (set<pair<int, int> >::const_iterator iter = tilesWithExclusions.begin(); iter != tilesWithExclusions.end(); ++iter)
tilesWithExclusions.insert(make_pair(numAtomBlocks-1, i)); exclusionTilesVec.push_back(make_ushort2((unsigned short) iter->first, (unsigned short) iter->second));
sort(exclusionTilesVec.begin(), exclusionTilesVec.end(), compareUshort2);
exclusionTiles = CudaArray::create<ushort2>(context, exclusionTilesVec.size(), "exclusionTiles");
exclusionTiles->upload(exclusionTilesVec);
map<pair<int, int>, int> exclusionTileMap;
for (int i = 0; i < (int) exclusionTilesVec.size(); i++) {
ushort2 tile = exclusionTilesVec[i];
exclusionTileMap[make_pair(tile.x, tile.y)] = i;
}
vector<vector<int> > exclusionBlocksForBlock(numAtomBlocks);
for (set<pair<int, int> >::const_iterator iter = tilesWithExclusions.begin(); iter != tilesWithExclusions.end(); ++iter) {
exclusionBlocksForBlock[iter->first].push_back(iter->second);
if (iter->first != iter->second)
exclusionBlocksForBlock[iter->second].push_back(iter->first);
} }
vector<unsigned int> exclusionRowIndicesVec(numAtomBlocks+1, 0); vector<unsigned int> exclusionRowIndicesVec(numAtomBlocks+1, 0);
vector<unsigned int> exclusionIndicesVec; vector<unsigned int> exclusionIndicesVec;
int currentRow = 0; for (int i = 0; i < numAtomBlocks; i++) {
for (set<pair<int, int> >::const_iterator iter = tilesWithExclusions.begin(); iter != tilesWithExclusions.end(); ++iter) { exclusionIndicesVec.insert(exclusionIndicesVec.end(), exclusionBlocksForBlock[i].begin(), exclusionBlocksForBlock[i].end());
while (iter->first != currentRow) exclusionRowIndicesVec[i+1] = exclusionIndicesVec.size();
exclusionRowIndicesVec[++currentRow] = exclusionIndicesVec.size();
exclusionIndicesVec.push_back(iter->second);
} }
exclusionRowIndicesVec[++currentRow] = exclusionIndicesVec.size();
exclusionIndices = CudaArray::create<unsigned int>(context, exclusionIndicesVec.size(), "exclusionIndices"); exclusionIndices = CudaArray::create<unsigned int>(context, exclusionIndicesVec.size(), "exclusionIndices");
exclusionRowIndices = CudaArray::create<unsigned int>(context, exclusionRowIndicesVec.size(), "exclusionRowIndices"); exclusionRowIndices = CudaArray::create<unsigned int>(context, exclusionRowIndicesVec.size(), "exclusionRowIndices");
exclusionIndices->upload(exclusionIndicesVec); exclusionIndices->upload(exclusionIndicesVec);
...@@ -175,8 +220,9 @@ void CudaNonbondedUtilities::initialize(const System& system) { ...@@ -175,8 +220,9 @@ void CudaNonbondedUtilities::initialize(const System& system) {
// Record the exclusion data. // Record the exclusion data.
exclusions = CudaArray::create<unsigned int>(context, tilesWithExclusions.size()*CudaContext::TileSize, "exclusions"); exclusions = CudaArray::create<tileflags>(context, tilesWithExclusions.size()*CudaContext::TileSize, "exclusions");
vector<unsigned int> exclusionVec(exclusions->getSize(), 0xFFFFFFFF); tileflags allFlags = (tileflags) -1;
vector<tileflags> exclusionVec(exclusions->getSize(), allFlags);
for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) { for (int atom1 = 0; atom1 < (int) atomExclusions.size(); ++atom1) {
int x = atom1/CudaContext::TileSize; int x = atom1/CudaContext::TileSize;
int offset1 = atom1-x*CudaContext::TileSize; int offset1 = atom1-x*CudaContext::TileSize;
...@@ -185,31 +231,12 @@ void CudaNonbondedUtilities::initialize(const System& system) { ...@@ -185,31 +231,12 @@ void CudaNonbondedUtilities::initialize(const System& system) {
int y = atom2/CudaContext::TileSize; int y = atom2/CudaContext::TileSize;
int offset2 = atom2-y*CudaContext::TileSize; int offset2 = atom2-y*CudaContext::TileSize;
if (x > y) { if (x > y) {
int index = findExclusionIndex(x, y, exclusionIndicesVec, exclusionRowIndicesVec); int index = exclusionTileMap[make_pair(x, y)]*CudaContext::TileSize;
exclusionVec[index+offset1] &= 0xFFFFFFFF-(1<<offset2); exclusionVec[index+offset1] &= allFlags-(1<<offset2);
} }
else { else {
int index = findExclusionIndex(y, x, exclusionIndicesVec, exclusionRowIndicesVec); int index = exclusionTileMap[make_pair(y, x)]*CudaContext::TileSize;
exclusionVec[index+offset2] &= 0xFFFFFFFF-(1<<offset1); exclusionVec[index+offset2] &= allFlags-(1<<offset1);
}
}
}
// Mark all interactions that involve a padding atom as being excluded.
for (int atom1 = context.getNumAtoms(); atom1 < context.getPaddedNumAtoms(); ++atom1) {
int x = atom1/CudaContext::TileSize;
int offset1 = atom1-x*CudaContext::TileSize;
for (int atom2 = 0; atom2 < context.getPaddedNumAtoms(); ++atom2) {
int y = atom2/CudaContext::TileSize;
int offset2 = atom2-y*CudaContext::TileSize;
if (x >= y) {
int index = findExclusionIndex(x, y, exclusionIndicesVec, exclusionRowIndicesVec);
exclusionVec[index+offset1] &= 0xFFFFFFFF-(1<<offset2);
}
if (y >= x) {
int index = findExclusionIndex(y, x, exclusionIndicesVec, exclusionRowIndicesVec);
exclusionVec[index+offset2] &= 0xFFFFFFFF-(1<<offset1);
} }
} }
} }
...@@ -219,26 +246,34 @@ void CudaNonbondedUtilities::initialize(const System& system) { ...@@ -219,26 +246,34 @@ void CudaNonbondedUtilities::initialize(const System& system) {
// Create data structures for the neighbor list. // Create data structures for the neighbor list.
if (useCutoff) { if (useCutoff) {
// Select a size for the arrays that hold the neighbor list. This estimate is intentionally very // Select a size for the arrays that hold the neighbor list. We have to make a fairly
// high, because if it ever is too small, we have to fall back to the N^2 algorithm. // arbitrary guess, but if this turns out to be too small we'll increase it later.
double4 boxSize = context.getPeriodicBoxSize(); maxTiles = 20*numAtomBlocks;
maxTiles = (int) (numTiles*(cutoff/boxSize.x+cutoff/boxSize.y+cutoff/boxSize.z));
if (maxTiles > numTiles) if (maxTiles > numTiles)
maxTiles = numTiles; maxTiles = numTiles;
if (maxTiles < 1) if (maxTiles < 1)
maxTiles = 1; maxTiles = 1;
interactingTiles = CudaArray::create<ushort2>(context, maxTiles, "interactingTiles"); interactingTiles = CudaArray::create<ushort2>(context, maxTiles, "interactingTiles");
interactionFlags = CudaArray::create<unsigned int>(context, maxTiles, "interactionFlags"); interactingAtoms = CudaArray::create<int>(context, CudaContext::TileSize*maxTiles, "interactingAtoms");
interactionCount = CudaArray::create<unsigned int>(context, 1, "interactionCount"); interactionCount = CudaArray::create<unsigned int>(context, 1, "interactionCount");
int elementSize = (context.getUseDoublePrecision() ? sizeof(double) : sizeof(float));
blockCenter = new CudaArray(context, numAtomBlocks, 4*elementSize, "blockCenter");
blockBoundingBox = new CudaArray(context, numAtomBlocks, 4*elementSize, "blockBoundingBox");
sortedBlocks = new CudaArray(context, numAtomBlocks, 2*elementSize, "sortedBlocks");
sortedBlockCenter = new CudaArray(context, numAtomBlocks+1, 4*elementSize, "sortedBlockCenter");
sortedBlockBoundingBox = new CudaArray(context, numAtomBlocks+1, 4*elementSize, "sortedBlockBoundingBox");
oldPositions = new CudaArray(context, numAtoms, 4*elementSize, "oldPositions");
if (context.getUseDoublePrecision()) { if (context.getUseDoublePrecision()) {
blockCenter = CudaArray::create<double4>(context, numAtomBlocks, "blockCenter"); vector<double4> oldPositionsVec(numAtoms, make_double4(1e30, 1e30, 1e30, 0));
blockBoundingBox = CudaArray::create<double4>(context, numAtomBlocks, "blockBoundingBox"); oldPositions->upload(oldPositionsVec);
} }
else { else {
blockCenter = CudaArray::create<float4>(context, numAtomBlocks, "blockCenter"); vector<float4> oldPositionsVec(numAtoms, make_float4(1e30f, 1e30f, 1e30f, 0));
blockBoundingBox = CudaArray::create<float4>(context, numAtomBlocks, "blockBoundingBox"); oldPositions->upload(oldPositionsVec);
} }
rebuildNeighborList = CudaArray::create<int>(context, 1, "rebuildNeighborList");
blockSorter = new CudaSort(context, new BlockSortTrait(context.getUseDoublePrecision()), numAtomBlocks);
vector<unsigned int> count(1, 0); vector<unsigned int> count(1, 0);
interactionCount->upload(count); interactionCount->upload(count);
} }
...@@ -248,11 +283,22 @@ void CudaNonbondedUtilities::initialize(const System& system) { ...@@ -248,11 +283,22 @@ void CudaNonbondedUtilities::initialize(const System& system) {
if (kernelSource.size() > 0) if (kernelSource.size() > 0)
forceKernel = createInteractionKernel(kernelSource, parameters, arguments, true, true); forceKernel = createInteractionKernel(kernelSource, parameters, arguments, true, true);
if (useCutoff) { if (useCutoff) {
double padding = (usePadding ? 0.1*cutoff : 0.0);
double paddedCutoff = cutoff+padding;
map<string, string> defines; map<string, string> defines;
defines["TILE_SIZE"] = context.intToString(CudaContext::TileSize);
defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks()); defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
defines["CUTOFF_SQUARED"] = context.doubleToString(cutoff*cutoff); defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms());
defines["PADDING"] = context.doubleToString(padding);
defines["PADDED_CUTOFF"] = context.doubleToString(paddedCutoff);
defines["PADDED_CUTOFF_SQUARED"] = context.doubleToString(paddedCutoff*paddedCutoff);
defines["NUM_TILES_WITH_EXCLUSIONS"] = context.intToString(exclusionTiles->getSize());
if (usePeriodic) if (usePeriodic)
defines["USE_PERIODIC"] = "1"; defines["USE_PERIODIC"] = "1";
int maxExclusions = 0;
for (int i = 0; i < (int) exclusionBlocksForBlock.size(); i++)
maxExclusions = (maxExclusions > exclusionBlocksForBlock[i].size() ? maxExclusions : exclusionBlocksForBlock[i].size());
defines["MAX_EXCLUSIONS"] = context.intToString(maxExclusions);
CUmodule interactingBlocksProgram = context.createModule(CudaKernelSources::vectorOps+CudaKernelSources::findInteractingBlocks, defines); CUmodule interactingBlocksProgram = context.createModule(CudaKernelSources::vectorOps+CudaKernelSources::findInteractingBlocks, defines);
findBlockBoundsKernel = context.getKernel(interactingBlocksProgram, "findBlockBounds"); findBlockBoundsKernel = context.getKernel(interactingBlocksProgram, "findBlockBounds");
findBlockBoundsArgs.push_back(&numAtoms); findBlockBoundsArgs.push_back(&numAtoms);
...@@ -261,7 +307,18 @@ void CudaNonbondedUtilities::initialize(const System& system) { ...@@ -261,7 +307,18 @@ void CudaNonbondedUtilities::initialize(const System& system) {
findBlockBoundsArgs.push_back(&context.getPosq().getDevicePointer()); findBlockBoundsArgs.push_back(&context.getPosq().getDevicePointer());
findBlockBoundsArgs.push_back(&blockCenter->getDevicePointer()); findBlockBoundsArgs.push_back(&blockCenter->getDevicePointer());
findBlockBoundsArgs.push_back(&blockBoundingBox->getDevicePointer()); findBlockBoundsArgs.push_back(&blockBoundingBox->getDevicePointer());
findBlockBoundsArgs.push_back(&interactionCount->getDevicePointer()); findBlockBoundsArgs.push_back(&rebuildNeighborList->getDevicePointer());
findBlockBoundsArgs.push_back(&sortedBlocks->getDevicePointer());
sortBoxDataKernel = context.getKernel(interactingBlocksProgram, "sortBoxData");
sortBoxDataArgs.push_back(&sortedBlocks->getDevicePointer());
sortBoxDataArgs.push_back(&blockCenter->getDevicePointer());
sortBoxDataArgs.push_back(&blockBoundingBox->getDevicePointer());
sortBoxDataArgs.push_back(&sortedBlockCenter->getDevicePointer());
sortBoxDataArgs.push_back(&sortedBlockBoundingBox->getDevicePointer());
sortBoxDataArgs.push_back(&context.getPosq().getDevicePointer());
sortBoxDataArgs.push_back(&oldPositions->getDevicePointer());
sortBoxDataArgs.push_back(&interactionCount->getDevicePointer());
sortBoxDataArgs.push_back(&rebuildNeighborList->getDevicePointer());
findInteractingBlocksKernel = context.getKernel(interactingBlocksProgram, "findBlocksWithInteractions"); findInteractingBlocksKernel = context.getKernel(interactingBlocksProgram, "findBlocksWithInteractions");
findInteractingBlocksArgs.push_back(context.getPeriodicBoxSizePointer()); findInteractingBlocksArgs.push_back(context.getPeriodicBoxSizePointer());
findInteractingBlocksArgs.push_back(context.getInvPeriodicBoxSizePointer()); findInteractingBlocksArgs.push_back(context.getInvPeriodicBoxSizePointer());
...@@ -269,35 +326,21 @@ void CudaNonbondedUtilities::initialize(const System& system) { ...@@ -269,35 +326,21 @@ void CudaNonbondedUtilities::initialize(const System& system) {
findInteractingBlocksArgs.push_back(&blockBoundingBox->getDevicePointer()); findInteractingBlocksArgs.push_back(&blockBoundingBox->getDevicePointer());
findInteractingBlocksArgs.push_back(&interactionCount->getDevicePointer()); findInteractingBlocksArgs.push_back(&interactionCount->getDevicePointer());
findInteractingBlocksArgs.push_back(&interactingTiles->getDevicePointer()); findInteractingBlocksArgs.push_back(&interactingTiles->getDevicePointer());
findInteractingBlocksArgs.push_back(&interactionFlags->getDevicePointer()); findInteractingBlocksArgs.push_back(&interactingAtoms->getDevicePointer());
findInteractingBlocksArgs.push_back(&context.getPosq().getDevicePointer()); findInteractingBlocksArgs.push_back(&context.getPosq().getDevicePointer());
findInteractingBlocksArgs.push_back(&maxTiles); findInteractingBlocksArgs.push_back(&maxTiles);
findInteractingBlocksArgs.push_back(&startTileIndex); findInteractingBlocksArgs.push_back(&startBlockIndex);
findInteractingBlocksArgs.push_back(&numTiles); findInteractingBlocksArgs.push_back(&numBlocks);
findInteractionsWithinBlocksKernel = context.getKernel(interactingBlocksProgram, "findInteractionsWithinBlocks"); findInteractingBlocksArgs.push_back(&sortedBlocks->getDevicePointer());
findInteractionsWithinBlocksArgs.push_back(context.getPeriodicBoxSizePointer()); findInteractingBlocksArgs.push_back(&sortedBlockCenter->getDevicePointer());
findInteractionsWithinBlocksArgs.push_back(context.getInvPeriodicBoxSizePointer()); findInteractingBlocksArgs.push_back(&sortedBlockBoundingBox->getDevicePointer());
findInteractionsWithinBlocksArgs.push_back(&context.getPosq().getDevicePointer()); findInteractingBlocksArgs.push_back(&exclusionIndices->getDevicePointer());
findInteractionsWithinBlocksArgs.push_back(&interactingTiles->getDevicePointer()); findInteractingBlocksArgs.push_back(&exclusionRowIndices->getDevicePointer());
findInteractionsWithinBlocksArgs.push_back(&blockCenter->getDevicePointer()); findInteractingBlocksArgs.push_back(&oldPositions->getDevicePointer());
findInteractionsWithinBlocksArgs.push_back(&blockBoundingBox->getDevicePointer()); findInteractingBlocksArgs.push_back(&rebuildNeighborList->getDevicePointer());
findInteractionsWithinBlocksArgs.push_back(&interactionFlags->getDevicePointer());
findInteractionsWithinBlocksArgs.push_back(&interactionCount->getDevicePointer());
findInteractionsWithinBlocksArgs.push_back(&maxTiles);
} }
} }
int CudaNonbondedUtilities::findExclusionIndex(int x, int y, const vector<unsigned int>& exclusionIndices, const vector<unsigned int>& exclusionRowIndices) {
if (x < y)
throw OpenMMException("Internal error: called findExclusionIndex with x<y");
int start = exclusionRowIndices[x];
int end = exclusionRowIndices[x+1];
for (int i = start; i < end; i++)
if (exclusionIndices[i] == y)
return i*CudaContext::TileSize;
throw OpenMMException("Internal error: exclusion in unexpected tile");
}
void CudaNonbondedUtilities::prepareInteractions() { void CudaNonbondedUtilities::prepareInteractions() {
if (!useCutoff) if (!useCutoff)
return; return;
...@@ -311,13 +354,17 @@ void CudaNonbondedUtilities::prepareInteractions() { ...@@ -311,13 +354,17 @@ void CudaNonbondedUtilities::prepareInteractions() {
// Compute the neighbor list. // Compute the neighbor list.
context.executeKernel(findBlockBoundsKernel, &findBlockBoundsArgs[0], context.getNumAtoms()); context.executeKernel(findBlockBoundsKernel, &findBlockBoundsArgs[0], context.getNumAtoms());
context.executeKernel(findInteractingBlocksKernel, &findInteractingBlocksArgs[0], context.getNumAtoms()); blockSorter->sort(*sortedBlocks);
context.executeKernel(findInteractionsWithinBlocksKernel, &findInteractionsWithinBlocksArgs[0], context.getNumAtoms(), 128); context.executeKernel(sortBoxDataKernel, &sortBoxDataArgs[0], context.getNumAtoms());
context.executeKernel(findInteractingBlocksKernel, &findInteractingBlocksArgs[0], context.getNumAtoms(), 256);
} }
/**
 * Run the nonbonded force kernel, but only when kernelSource is non-empty.
 *
 * forceKernel is launched through context.executeKernel() with forceArgs, using
 * numForceThreadBlocks*forceThreadBlockSize and forceThreadBlockSize as the two size
 * arguments (presumably total thread count and block size - confirm against executeKernel).
 *
 * NOTE(review): most lines of this block carry the statement text twice, which looks like
 * the before/after columns of a diff rendering rather than compilable source - confirm
 * before building. The lines that appear only once add a call to updateNeighborListSize()
 * when context.getComputeForceCount() == 1.
 */
void CudaNonbondedUtilities::computeInteractions() { void CudaNonbondedUtilities::computeInteractions() {
if (kernelSource.size() > 0) if (kernelSource.size() > 0) {
context.executeKernel(forceKernel, &forceArgs[0], numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize); context.executeKernel(forceKernel, &forceArgs[0], numForceThreadBlocks*forceThreadBlockSize, forceThreadBlockSize);
if (context.getComputeForceCount() == 1)
updateNeighborListSize(); // This is the first time step, so check whether our initial guess was large enough.
}
} }
void CudaNonbondedUtilities::updateNeighborListSize() { void CudaNonbondedUtilities::updateNeighborListSize() {
...@@ -332,26 +379,42 @@ void CudaNonbondedUtilities::updateNeighborListSize() { ...@@ -332,26 +379,42 @@ void CudaNonbondedUtilities::updateNeighborListSize() {
// this from happening in the future. // this from happening in the future.
maxTiles = (int) (1.2*pinnedInteractionCount[0]); maxTiles = (int) (1.2*pinnedInteractionCount[0]);
int numTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2; int totalTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2;
if (maxTiles > numTiles) if (maxTiles > totalTiles)
maxTiles = numTiles; maxTiles = totalTiles;
delete interactingTiles; delete interactingTiles;
delete interactingAtoms;
interactingTiles = NULL; // Avoid an error in the destructor if the following allocation fails
interactingAtoms = NULL;
interactingTiles = CudaArray::create<ushort2>(context, maxTiles, "interactingTiles"); interactingTiles = CudaArray::create<ushort2>(context, maxTiles, "interactingTiles");
interactingAtoms = CudaArray::create<int>(context, CudaContext::TileSize*maxTiles, "interactingAtoms");
if (forceArgs.size() > 0) if (forceArgs.size() > 0)
forceArgs[8] = &interactingTiles->getDevicePointer(); forceArgs[7] = &interactingTiles->getDevicePointer();
findInteractingBlocksArgs[5] = &interactingTiles->getDevicePointer(); findInteractingBlocksArgs[5] = &interactingTiles->getDevicePointer();
delete interactionFlags;
interactionFlags = CudaArray::create<unsigned int>(context, maxTiles, "interactionFlags");
if (forceArgs.size() > 0) if (forceArgs.size() > 0)
forceArgs[13] = &interactionFlags->getDevicePointer(); forceArgs[13] = &interactingAtoms->getDevicePointer();
findInteractingBlocksArgs[6] = &interactionFlags->getDevicePointer(); findInteractingBlocksArgs[6] = &interactingAtoms->getDevicePointer();
findInteractionsWithinBlocksArgs[3] = &interactingTiles->getDevicePointer(); if (context.getUseDoublePrecision()) {
findInteractionsWithinBlocksArgs[6] = &interactionFlags->getDevicePointer(); vector<double4> oldPositionsVec(numAtoms, make_double4(1e30, 1e30, 1e30, 0));
oldPositions->upload(oldPositionsVec);
}
else {
vector<float4> oldPositionsVec(numAtoms, make_float4(1e30f, 1e30f, 1e30f, 0));
oldPositions->upload(oldPositionsVec);
}
} }
/**
 * NOTE(review): the first three lines of this block fuse two different definitions on each
 * line - setTileRange(int, int), which stores startTileIndex and numTiles directly, and
 * setUsePadding(bool), which stores the usePadding flag. This looks like the before/after
 * columns of a diff rendering rather than compilable source - confirm before building.
 */
void CudaNonbondedUtilities::setTileRange(int startTileIndex, int numTiles) { void CudaNonbondedUtilities::setUsePadding(bool padding) {
this->startTileIndex = startTileIndex; usePadding = padding;
this->numTiles = numTiles; }
/**
 * Convert a fractional range of the nonbonded work into block and tile ranges.
 *
 * startBlockIndex/numBlocks are obtained by scaling the fractions by the number of atom
 * blocks; startTileIndex/numTiles by scaling them by numAtomBlocks*(numAtomBlocks+1)/2.
 * Each product is truncated by the (int) cast, and each count is computed as
 * (truncated end) - (truncated start).
 *
 * @param startFraction  fraction at which this context's range begins
 * @param endFraction    fraction at which this context's range ends
 */
void CudaNonbondedUtilities::setAtomBlockRange(double startFraction, double endFraction) {
int numAtomBlocks = context.getNumAtomBlocks();
startBlockIndex = (int) (startFraction*numAtomBlocks);
numBlocks = (int) (endFraction*numAtomBlocks)-startBlockIndex;
// NOTE(review): if getNumAtomBlocks() returns int, this product is evaluated in int before
// the division and could overflow for very large block counts - confirm the upper bound.
int totalTiles = context.getNumAtomBlocks()*(context.getNumAtomBlocks()+1)/2;
// The doubled ';' on the next line is an empty statement; it has no effect.
startTileIndex = (int) (startFraction*totalTiles);;
numTiles = (int) (endFraction*totalTiles)-startTileIndex;
} }
CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, vector<ParameterInfo>& params, vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric) { CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, vector<ParameterInfo>& params, vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric) {
...@@ -447,6 +510,14 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, ...@@ -447,6 +510,14 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms()); defines["NUM_ATOMS"] = context.intToString(context.getNumAtoms());
defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms()); defines["PADDED_NUM_ATOMS"] = context.intToString(context.getPaddedNumAtoms());
defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks()); defines["NUM_BLOCKS"] = context.intToString(context.getNumAtomBlocks());
defines["TILE_SIZE"] = context.intToString(CudaContext::TileSize);
int numExclusionTiles = exclusionTiles->getSize();
defines["NUM_TILES_WITH_EXCLUSIONS"] = context.intToString(numExclusionTiles);
int numContexts = context.getPlatformData().contexts.size();
int startExclusionIndex = context.getContextIndex()*numExclusionTiles/numContexts;
int endExclusionIndex = (context.getContextIndex()+1)*numExclusionTiles/numContexts;
defines["FIRST_EXCLUSION_TILE"] = context.intToString(startExclusionIndex);
defines["LAST_EXCLUSION_TILE"] = context.intToString(endExclusionIndex);
if ((localDataSize/4)%2 == 0 && !context.getUseDoublePrecision()) if ((localDataSize/4)%2 == 0 && !context.getUseDoublePrecision())
defines["PARAMETER_SIZE_IS_EVEN"] = "1"; defines["PARAMETER_SIZE_IS_EVEN"] = "1";
if (context.getComputeCapability() >= 3.0 && !context.getUseDoublePrecision()) if (context.getComputeCapability() >= 3.0 && !context.getUseDoublePrecision())
...@@ -461,8 +532,7 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, ...@@ -461,8 +532,7 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
forceArgs.push_back(&context.getEnergyBuffer().getDevicePointer()); forceArgs.push_back(&context.getEnergyBuffer().getDevicePointer());
forceArgs.push_back(&context.getPosq().getDevicePointer()); forceArgs.push_back(&context.getPosq().getDevicePointer());
forceArgs.push_back(&exclusions->getDevicePointer()); forceArgs.push_back(&exclusions->getDevicePointer());
forceArgs.push_back(&exclusionIndices->getDevicePointer()); forceArgs.push_back(&exclusionTiles->getDevicePointer());
forceArgs.push_back(&exclusionRowIndices->getDevicePointer());
forceArgs.push_back(&startTileIndex); forceArgs.push_back(&startTileIndex);
forceArgs.push_back(&numTiles); forceArgs.push_back(&numTiles);
if (useCutoff) { if (useCutoff) {
...@@ -471,7 +541,8 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source, ...@@ -471,7 +541,8 @@ CUfunction CudaNonbondedUtilities::createInteractionKernel(const string& source,
forceArgs.push_back(context.getPeriodicBoxSizePointer()); forceArgs.push_back(context.getPeriodicBoxSizePointer());
forceArgs.push_back(context.getInvPeriodicBoxSizePointer()); forceArgs.push_back(context.getInvPeriodicBoxSizePointer());
forceArgs.push_back(&maxTiles); forceArgs.push_back(&maxTiles);
forceArgs.push_back(&interactionFlags->getDevicePointer()); forceArgs.push_back(&blockCenter->getDevicePointer());
forceArgs.push_back(&interactingAtoms->getDevicePointer());
} }
for (int i = 0; i < (int) params.size(); i++) for (int i = 0; i < (int) params.size(); i++)
forceArgs.push_back(&params[i].getMemory()); forceArgs.push_back(&params[i].getMemory());
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2009-2012 Stanford University and the Authors. * * Portions copyright (c) 2009-2013 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -36,6 +36,8 @@ ...@@ -36,6 +36,8 @@
namespace OpenMM { namespace OpenMM {
class CudaSort;
/** /**
* This class provides a generic interface for calculating nonbonded interactions. It does this in two * This class provides a generic interface for calculating nonbonded interactions. It does this in two
* ways. First, it can be used to create kernels that evaluate nonbonded interactions. Clients * ways. First, it can be used to create kernels that evaluate nonbonded interactions. Clients
...@@ -181,10 +183,10 @@ public: ...@@ -181,10 +183,10 @@ public:
return *interactingTiles; return *interactingTiles;
} }
/** /**
* Get the array containing flags for tiles with interactions. * Get the array containing the atoms in each tile with interactions.
*/ */
CudaArray& getInteractionFlags() { CudaArray& getInteractingAtoms() {
return *interactionFlags; return *interactingAtoms;
} }
/** /**
* Get the array containing exclusion flags. * Get the array containing exclusion flags.
...@@ -192,6 +194,12 @@ public: ...@@ -192,6 +194,12 @@ public:
CudaArray& getExclusions() { CudaArray& getExclusions() {
return *exclusions; return *exclusions;
} }
/**
* Get the array containing tiles with exclusions.
*/
CudaArray& getExclusionTiles() {
return *exclusionTiles;
}
/** /**
* Get the array containing the index into the exclusion array for each tile. * Get the array containing the index into the exclusion array for each tile.
*/ */
...@@ -217,9 +225,17 @@ public: ...@@ -217,9 +225,17 @@ public:
return numTiles; return numTiles;
} }
/** /**
* Set the range of tiles that should be processed by this context. * Set whether to add padding to the cutoff distance when building the neighbor list.
* This increases the size of the neighbor list (and thus the cost of computing interactions),
* but also means we don't need to rebuild it every time step. The default value is true,
* since usually this improves performance. For very expensive interactions, however,
* it may be better to set this to false.
*/
void setUsePadding(bool padding);
/**
* Set the range of atom blocks and tiles that should be processed by this context.
*/ */
void setTileRange(int startTileIndex, int numTiles); void setAtomBlockRange(double startFraction, double endFraction);
/** /**
* Create a Kernel for evaluating a nonbonded interaction. Cutoffs and periodic boundary conditions * Create a Kernel for evaluating a nonbonded interaction. Cutoffs and periodic boundary conditions
* are assumed to be the same as those for the default interaction Kernel, since this kernel will use * are assumed to be the same as those for the default interaction Kernel, since this kernel will use
...@@ -232,42 +248,38 @@ public: ...@@ -232,42 +248,38 @@ public:
* @param isSymmetric specifies whether the interaction is symmetric * @param isSymmetric specifies whether the interaction is symmetric
*/ */
CUfunction createInteractionKernel(const std::string& source, std::vector<ParameterInfo>& params, std::vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric); CUfunction createInteractionKernel(const std::string& source, std::vector<ParameterInfo>& params, std::vector<ParameterInfo>& arguments, bool useExclusions, bool isSymmetric);
/**
* This is a utility routine for locating data in the exclusions array. It takes the (x,y) indices of a tile,
* and returns the location in the array where the data for that tile begins.
*
* This routine requires that x >= y. If not, it will throw an exception.
*
* @param x the x index of the tile
* @param y the y index of the tile
* @param exclusionIndices the content of the exclusionIndices array
* @param exclusionRowIndices the content of the exclusionRowIndices array
* @return the index in the exclusions array at which the data for that tile begins
*/
static int findExclusionIndex(int x, int y, const std::vector<unsigned int>& exclusionIndices, const std::vector<unsigned int>& exclusionRowIndices);
private: private:
class BlockSortTrait;
CudaContext& context; CudaContext& context;
CUfunction forceKernel; CUfunction forceKernel;
CUfunction findBlockBoundsKernel; CUfunction findBlockBoundsKernel;
CUfunction sortBoxDataKernel;
CUfunction findInteractingBlocksKernel; CUfunction findInteractingBlocksKernel;
CUfunction findInteractionsWithinBlocksKernel; CUfunction findInteractionsWithinBlocksKernel;
CudaArray* exclusionTiles;
CudaArray* exclusions; CudaArray* exclusions;
CudaArray* exclusionIndices; CudaArray* exclusionIndices;
CudaArray* exclusionRowIndices; CudaArray* exclusionRowIndices;
CudaArray* interactingTiles; CudaArray* interactingTiles;
CudaArray* interactionFlags; CudaArray* interactingAtoms;
CudaArray* interactionCount; CudaArray* interactionCount;
CudaArray* blockCenter; CudaArray* blockCenter;
CudaArray* blockBoundingBox; CudaArray* blockBoundingBox;
std::vector<void*> forceArgs, findBlockBoundsArgs, findInteractingBlocksArgs, findInteractionsWithinBlocksArgs; CudaArray* sortedBlocks;
CudaArray* sortedBlockCenter;
CudaArray* sortedBlockBoundingBox;
CudaArray* oldPositions;
CudaArray* rebuildNeighborList;
CudaSort* blockSorter;
std::vector<void*> forceArgs, findBlockBoundsArgs, sortBoxDataArgs, findInteractingBlocksArgs;
std::vector<std::vector<int> > atomExclusions; std::vector<std::vector<int> > atomExclusions;
std::vector<ParameterInfo> parameters; std::vector<ParameterInfo> parameters;
std::vector<ParameterInfo> arguments; std::vector<ParameterInfo> arguments;
std::string kernelSource; std::string kernelSource;
std::map<std::string, std::string> kernelDefines; std::map<std::string, std::string> kernelDefines;
double cutoff; double cutoff;
bool useCutoff, usePeriodic, anyExclusions; bool useCutoff, usePeriodic, anyExclusions, usePadding;
int startTileIndex, numTiles, maxTiles, numForceThreadBlocks, forceThreadBlockSize, nonbondedForceGroup, numAtoms; int startTileIndex, numTiles, startBlockIndex, numBlocks, maxTiles, numForceThreadBlocks, forceThreadBlockSize, nonbondedForceGroup, numAtoms;
}; };
/** /**
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2011-2012 Stanford University and the Authors. * * Portions copyright (c) 2011-2013 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -118,7 +118,7 @@ private: ...@@ -118,7 +118,7 @@ private:
}; };
CudaParallelCalcForcesAndEnergyKernel::CudaParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, CudaPlatform::PlatformData& data) : CudaParallelCalcForcesAndEnergyKernel::CudaParallelCalcForcesAndEnergyKernel(string name, const Platform& platform, CudaPlatform::PlatformData& data) :
CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextTiles(data.contexts.size()), contextForces(NULL), CalcForcesAndEnergyKernel(name, platform), data(data), completionTimes(data.contexts.size()), contextNonbondedFractions(data.contexts.size()), contextForces(NULL),
pinnedPositionBuffer(NULL), pinnedForceBuffer(NULL) { pinnedPositionBuffer(NULL), pinnedForceBuffer(NULL) {
for (int i = 0; i < (int) data.contexts.size(); i++) for (int i = 0; i < (int) data.contexts.size(); i++)
kernels.push_back(Kernel(new CudaCalcForcesAndEnergyKernel(name, platform, *data.contexts[i]))); kernels.push_back(Kernel(new CudaCalcForcesAndEnergyKernel(name, platform, *data.contexts[i])));
...@@ -141,6 +141,8 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) { ...@@ -141,6 +141,8 @@ void CudaParallelCalcForcesAndEnergyKernel::initialize(const System& system) {
sumKernel = cu.getKernel(module, "sumForces"); sumKernel = cu.getKernel(module, "sumForces");
for (int i = 0; i < (int) kernels.size(); i++) for (int i = 0; i < (int) kernels.size(); i++)
getKernel(i).initialize(system); getKernel(i).initialize(system);
for (int i = 0; i < (int) contextNonbondedFractions.size(); i++)
contextNonbondedFractions[i] = 1/(double) contextNonbondedFractions.size();
} }
void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) { void CudaParallelCalcForcesAndEnergyKernel::beginComputation(ContextImpl& context, bool includeForce, bool includeEnergy, int groups) {
...@@ -184,30 +186,26 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con ...@@ -184,30 +186,26 @@ double CudaParallelCalcForcesAndEnergyKernel::finishComputation(ContextImpl& con
void* args[] = {&cu.getForce().getDevicePointer(), &contextForces->getDevicePointer(), &bufferSize, &numBuffers}; void* args[] = {&cu.getForce().getDevicePointer(), &contextForces->getDevicePointer(), &bufferSize, &numBuffers};
cu.executeKernel(sumKernel, args, bufferSize); cu.executeKernel(sumKernel, args, bufferSize);
// Balance work between the contexts by transferring a few nonbonded tiles from the context that // Balance work between the contexts by transferring a little nonbonded work from the context that
// finished last to the one that finished first. // finished last to the one that finished first.
int firstIndex = 0, lastIndex = 0; int firstIndex = 0, lastIndex = 0;
int totalTiles = 0;
for (int i = 0; i < (int) completionTimes.size(); i++) { for (int i = 0; i < (int) completionTimes.size(); i++) {
if (completionTimes[i] < completionTimes[firstIndex]) if (completionTimes[i] < completionTimes[firstIndex])
firstIndex = i; firstIndex = i;
if (completionTimes[i] > completionTimes[lastIndex]) if (completionTimes[i] > completionTimes[lastIndex])
lastIndex = i; lastIndex = i;
contextTiles[i] = data.contexts[i]->getNonbondedUtilities().getNumTiles(); }
totalTiles += contextTiles[i]; double fractionToTransfer = min(0.001, contextNonbondedFractions[lastIndex]);
} contextNonbondedFractions[firstIndex] += fractionToTransfer;
int tilesToTransfer = totalTiles/1000; contextNonbondedFractions[lastIndex] -= fractionToTransfer;
if (tilesToTransfer < 1) double startFraction = 0.0;
tilesToTransfer = 1; for (int i = 0; i < (int) contextNonbondedFractions.size(); i++) {
if (tilesToTransfer > contextTiles[lastIndex]) double endFraction = startFraction+contextNonbondedFractions[i];
tilesToTransfer = contextTiles[lastIndex]; if (i == contextNonbondedFractions.size()-1)
contextTiles[firstIndex] += tilesToTransfer; endFraction = 1.0; // Avoid roundoff error
contextTiles[lastIndex] -= tilesToTransfer; data.contexts[i]->getNonbondedUtilities().setAtomBlockRange(startFraction, endFraction);
int startIndex = 0; startFraction = endFraction;
for (int i = 0; i < (int) contextTiles.size(); i++) {
data.contexts[i]->getNonbondedUtilities().setTileRange(startIndex, contextTiles[i]);
startIndex += contextTiles[i];
} }
} }
return energy; return energy;
......
...@@ -9,7 +9,7 @@ ...@@ -9,7 +9,7 @@
* Biological Structures at Stanford, funded under the NIH Roadmap for * * Biological Structures at Stanford, funded under the NIH Roadmap for *
* Medical Research, grant U54 GM072970. See https://simtk.org. * * Medical Research, grant U54 GM072970. See https://simtk.org. *
* * * *
* Portions copyright (c) 2011-2012 Stanford University and the Authors. * * Portions copyright (c) 2011-2013 Stanford University and the Authors. *
* Authors: Peter Eastman * * Authors: Peter Eastman *
* Contributors: * * Contributors: *
* * * *
...@@ -80,7 +80,7 @@ private: ...@@ -80,7 +80,7 @@ private:
CudaPlatform::PlatformData& data; CudaPlatform::PlatformData& data;
std::vector<Kernel> kernels; std::vector<Kernel> kernels;
std::vector<long long> completionTimes; std::vector<long long> completionTimes;
std::vector<int> contextTiles; std::vector<double> contextNonbondedFractions;
CudaArray* contextForces; CudaArray* contextForces;
void* pinnedPositionBuffer; void* pinnedPositionBuffer;
long long* pinnedForceBuffer; long long* pinnedForceBuffer;
......
...@@ -32,7 +32,7 @@ using namespace OpenMM; ...@@ -32,7 +32,7 @@ using namespace OpenMM;
using namespace std; using namespace std;
CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length) : context(context), trait(trait), CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length) : context(context), trait(trait),
dataRange(NULL), bucketOfElement(NULL), offsetInBucket(NULL), bucketOffset(NULL), buckets(NULL) { dataRange(NULL), bucketOfElement(NULL), offsetInBucket(NULL), bucketOffset(NULL), buckets(NULL), dataLength(length) {
// Create kernels. // Create kernels.
map<string, string> replacements; map<string, string> replacements;
...@@ -43,6 +43,7 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length) ...@@ -43,6 +43,7 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
replacements["MAX_KEY"] = trait->getMaxKey(); replacements["MAX_KEY"] = trait->getMaxKey();
replacements["MAX_VALUE"] = trait->getMaxValue(); replacements["MAX_VALUE"] = trait->getMaxValue();
CUmodule module = context.createModule(context.replaceStrings(CudaKernelSources::sort, replacements)); CUmodule module = context.createModule(context.replaceStrings(CudaKernelSources::sort, replacements));
shortListKernel = context.getKernel(module, "sortShortList");
computeRangeKernel = context.getKernel(module, "computeRange"); computeRangeKernel = context.getKernel(module, "computeRange");
assignElementsKernel = context.getKernel(module, "assignElementsToBuckets"); assignElementsKernel = context.getKernel(module, "assignElementsToBuckets");
computeBucketPositionsKernel = context.getKernel(module, "computeBucketPositions"); computeBucketPositionsKernel = context.getKernel(module, "computeBucketPositions");
...@@ -53,15 +54,16 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length) ...@@ -53,15 +54,16 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
int maxBlockSize; int maxBlockSize;
cuDeviceGetAttribute(&maxBlockSize, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, context.getDevice()); cuDeviceGetAttribute(&maxBlockSize, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, context.getDevice());
int maxSharedMem;
cuDeviceGetAttribute(&maxSharedMem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, context.getDevice());
unsigned int maxLocalBuffer = (unsigned int) ((maxSharedMem/trait->getDataSize())/2);
isShortList = (length <= maxLocalBuffer);
for (rangeKernelSize = 1; rangeKernelSize*2 <= maxBlockSize; rangeKernelSize *= 2) for (rangeKernelSize = 1; rangeKernelSize*2 <= maxBlockSize; rangeKernelSize *= 2)
; ;
positionsKernelSize = rangeKernelSize; positionsKernelSize = rangeKernelSize;
sortKernelSize = rangeKernelSize/2; sortKernelSize = (isShortList ? rangeKernelSize/2 : rangeKernelSize/4);
if (rangeKernelSize > length) if (rangeKernelSize > length)
rangeKernelSize = length; rangeKernelSize = length;
int maxSharedMem;
cuDeviceGetAttribute(&maxSharedMem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, context.getDevice());
unsigned int maxLocalBuffer = (unsigned int) ((maxSharedMem/trait->getDataSize())/2);
if (sortKernelSize > maxLocalBuffer) if (sortKernelSize > maxLocalBuffer)
sortKernelSize = maxLocalBuffer; sortKernelSize = maxLocalBuffer;
unsigned int targetBucketSize = sortKernelSize/2; unsigned int targetBucketSize = sortKernelSize/2;
...@@ -73,11 +75,13 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length) ...@@ -73,11 +75,13 @@ CudaSort::CudaSort(CudaContext& context, SortTrait* trait, unsigned int length)
// Create workspace arrays. // Create workspace arrays.
if (!isShortList) {
dataRange = new CudaArray(context, 2, trait->getKeySize(), "sortDataRange"); dataRange = new CudaArray(context, 2, trait->getKeySize(), "sortDataRange");
bucketOffset = CudaArray::create<uint1>(context, numBuckets, "bucketOffset"); bucketOffset = CudaArray::create<uint1>(context, numBuckets, "bucketOffset");
bucketOfElement = CudaArray::create<uint1>(context, length, "bucketOfElement"); bucketOfElement = CudaArray::create<uint1>(context, length, "bucketOfElement");
offsetInBucket = CudaArray::create<uint1>(context, length, "offsetInBucket"); offsetInBucket = CudaArray::create<uint1>(context, length, "offsetInBucket");
buckets = new CudaArray(context, length, trait->getDataSize(), "buckets"); buckets = new CudaArray(context, length, trait->getDataSize(), "buckets");
}
} }
CudaSort::~CudaSort() { CudaSort::~CudaSort() {
...@@ -95,22 +99,27 @@ CudaSort::~CudaSort() { ...@@ -95,22 +99,27 @@ CudaSort::~CudaSort() {
} }
void CudaSort::sort(CudaArray& data) { void CudaSort::sort(CudaArray& data) {
if (data.getSize() != bucketOfElement->getSize() || data.getElementSize() != trait->getDataSize()) if (data.getSize() != dataLength || data.getElementSize() != trait->getDataSize())
throw OpenMMException("CudaSort called with different data size"); throw OpenMMException("CudaSort called with different data size");
if (data.getSize() == 0) if (data.getSize() == 0)
return; return;
if (isShortList) {
// We can use a simpler sort kernel that does the entire operation at once in local memory.
void* sortArgs[] = {&data.getDevicePointer(), &dataLength};
context.executeKernel(shortListKernel, sortArgs, sortKernelSize, sortKernelSize, dataLength*trait->getDataSize());
}
else {
// Compute the range of data values. // Compute the range of data values.
unsigned int dataSize = data.getSize(); void* rangeArgs[] = {&data.getDevicePointer(), &dataLength, &dataRange->getDevicePointer()};
void* rangeArgs[] = {&data.getDevicePointer(), &dataSize, &dataRange->getDevicePointer()};
context.executeKernel(computeRangeKernel, rangeArgs, rangeKernelSize, rangeKernelSize, rangeKernelSize*trait->getKeySize()); context.executeKernel(computeRangeKernel, rangeArgs, rangeKernelSize, rangeKernelSize, rangeKernelSize*trait->getKeySize());
// Assign array elements to buckets. // Assign array elements to buckets.
unsigned int numBuckets = bucketOffset->getSize(); unsigned int numBuckets = bucketOffset->getSize();
context.clearBuffer(*bucketOffset); context.clearBuffer(*bucketOffset);
void* elementsArgs[] = {&data.getDevicePointer(), &dataSize, &numBuckets, &dataRange->getDevicePointer(), void* elementsArgs[] = {&data.getDevicePointer(), &dataLength, &numBuckets, &dataRange->getDevicePointer(),
&bucketOffset->getDevicePointer(), &bucketOfElement->getDevicePointer(), &offsetInBucket->getDevicePointer()}; &bucketOffset->getDevicePointer(), &bucketOfElement->getDevicePointer(), &offsetInBucket->getDevicePointer()};
context.executeKernel(assignElementsKernel, elementsArgs, data.getSize()); context.executeKernel(assignElementsKernel, elementsArgs, data.getSize());
...@@ -121,7 +130,7 @@ void CudaSort::sort(CudaArray& data) { ...@@ -121,7 +130,7 @@ void CudaSort::sort(CudaArray& data) {
// Copy the data into the buckets. // Copy the data into the buckets.
void* copyArgs[] = {&data.getDevicePointer(), &buckets->getDevicePointer(), &dataSize, &bucketOffset->getDevicePointer(), void* copyArgs[] = {&data.getDevicePointer(), &buckets->getDevicePointer(), &dataLength, &bucketOffset->getDevicePointer(),
&bucketOfElement->getDevicePointer(), &offsetInBucket->getDevicePointer()}; &bucketOfElement->getDevicePointer(), &offsetInBucket->getDevicePointer()};
context.executeKernel(copyToBucketsKernel, copyArgs, data.getSize()); context.executeKernel(copyToBucketsKernel, copyArgs, data.getSize());
...@@ -129,4 +138,5 @@ void CudaSort::sort(CudaArray& data) { ...@@ -129,4 +138,5 @@ void CudaSort::sort(CudaArray& data) {
void* sortArgs[] = {&data.getDevicePointer(), &buckets->getDevicePointer(), &numBuckets, &bucketOffset->getDevicePointer()}; void* sortArgs[] = {&data.getDevicePointer(), &buckets->getDevicePointer(), &numBuckets, &bucketOffset->getDevicePointer()};
context.executeKernel(sortBucketsKernel, sortArgs, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize, sortKernelSize*trait->getDataSize()); context.executeKernel(sortBucketsKernel, sortArgs, ((data.getSize()+sortKernelSize-1)/sortKernelSize)*sortKernelSize, sortKernelSize, sortKernelSize*trait->getDataSize());
}
} }
...@@ -92,8 +92,9 @@ private: ...@@ -92,8 +92,9 @@ private:
CudaArray* offsetInBucket; CudaArray* offsetInBucket;
CudaArray* bucketOffset; CudaArray* bucketOffset;
CudaArray* buckets; CudaArray* buckets;
CUfunction computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel; CUfunction shortListKernel, computeRangeKernel, assignElementsKernel, computeBucketPositionsKernel, copyToBucketsKernel, sortBucketsKernel;
unsigned int rangeKernelSize, positionsKernelSize, sortKernelSize; unsigned int dataLength, rangeKernelSize, positionsKernelSize, sortKernelSize;
bool isShortList;
}; };
/** /**
......
#if USE_EWALD #if USE_EWALD
bool needCorrection = isExcluded && atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS; bool needCorrection = hasExclusions && isExcluded && atom1 != atom2 && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS;
if (!isExcluded || needCorrection) { if (!isExcluded || needCorrection) {
real tempForce = 0.0f;
if (r2 < CUTOFF_SQUARED || needCorrection) { if (r2 < CUTOFF_SQUARED || needCorrection) {
const real alphaR = EWALD_ALPHA*r; const real alphaR = EWALD_ALPHA*r;
const real expAlphaRSqr = EXP(-alphaR*alphaR); const real expAlphaRSqr = EXP(-alphaR*alphaR);
...@@ -16,6 +15,7 @@ if (!isExcluded || needCorrection) { ...@@ -16,6 +15,7 @@ if (!isExcluded || needCorrection) {
t *= t; t *= t;
t *= t; t *= t;
const real erfcAlphaR = RECIP(t*t); const real erfcAlphaR = RECIP(t*t);
real tempForce = 0.0f;
if (needCorrection) { if (needCorrection) {
// Subtract off the part of this interaction that was included in the reciprocal space contribution. // Subtract off the part of this interaction that was included in the reciprocal space contribution.
...@@ -36,8 +36,8 @@ if (!isExcluded || needCorrection) { ...@@ -36,8 +36,8 @@ if (!isExcluded || needCorrection) {
tempEnergy += prefactor*erfcAlphaR; tempEnergy += prefactor*erfcAlphaR;
#endif #endif
} }
}
dEdR += tempForce*invR*invR; dEdR += tempForce*invR*invR;
}
} }
#else #else
{ {
......
#define STORE_DERIVATIVE_1(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (deriv##INDEX##_1*0x100000000))); #define STORE_DERIVATIVE_1(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (deriv##INDEX##_1*0x100000000)));
#define STORE_DERIVATIVE_2(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].deriv##INDEX*0x100000000))); #define STORE_DERIVATIVE_2(INDEX) atomicAdd(&derivBuffers[offset+(INDEX-1)*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].deriv##INDEX*0x100000000)));
#define TILE_SIZE 32
typedef struct { typedef struct {
real4 posq; real4 posq;
...@@ -15,88 +14,43 @@ typedef struct { ...@@ -15,88 +14,43 @@ typedef struct {
* Compute a force based on pair interactions. * Compute a force based on pair interactions.
*/ */
extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer, extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forceBuffers, real* __restrict__ energyBuffer,
const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions, const unsigned int* __restrict__ exclusionIndices, const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions, const ushort2* __restrict__ exclusionTiles,
const unsigned int* __restrict__ exclusionRowIndices,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const real4* __restrict__ blockCenter, const unsigned int* __restrict__ interactingAtoms
#else #else
unsigned int numTiles unsigned int numTiles
#endif #endif
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE; const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
#ifdef USE_CUTOFF const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
unsigned int numTiles = interactionCount[0]; const unsigned int tbx = threadIdx.x - tgx;
unsigned int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
unsigned int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
#else
unsigned int pos = warp*numTiles/totalWarps;
unsigned int end = (warp+1)*numTiles/totalWarps;
#endif
real energy = 0; real energy = 0;
unsigned int lasty = 0xFFFFFFFF;
__shared__ AtomData localData[THREAD_BLOCK_SIZE]; __shared__ AtomData localData[THREAD_BLOCK_SIZE];
__shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
__shared__ int exclusionIndex[WARPS_PER_GROUP];
do { // First loop: process tiles that contain exclusions.
// Extract the coordinates of this tile
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int tbx = threadIdx.x - tgx; const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE; for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
unsigned int x, y; const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
real3 force = make_real3(0); real3 force = make_real3(0);
DECLARE_ATOM1_DERIVATIVES DECLARE_ATOM1_DERIVATIVES
if (pos < end) {
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
y = tileIndices.y;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
unsigned int atom1 = x*TILE_SIZE + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
real4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
if (tgx < 2) unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
#else
bool hasExclusions = false;
#endif #endif
if (pos >= end) if (x == y) {
; // This warp is done.
else if (x == y) {
// This tile is on the diagonal. // This tile is on the diagonal.
const unsigned int localAtomIndex = threadIdx.x; const unsigned int localAtomIndex = threadIdx.x;
localData[localAtomIndex].posq = posq1; localData[localAtomIndex].posq = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1 LOAD_LOCAL_PARAMETERS_FROM_1
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
#endif
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = tbx+j; int atom2 = tbx+j;
real4 posq2 = localData[atom2].posq; real4 posq2 = localData[atom2].posq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z); real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
...@@ -115,6 +69,9 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc ...@@ -115,6 +69,9 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
atom2 = y*TILE_SIZE+j; atom2 = y*TILE_SIZE+j;
real dEdR = 0; real dEdR = 0;
real tempEnergy = 0; real tempEnergy = 0;
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
COMPUTE_INTERACTION COMPUTE_INTERACTION
dEdR /= -r; dEdR /= -r;
...@@ -136,32 +93,16 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc ...@@ -136,32 +93,16 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
// This is an off-diagonal tile. // This is an off-diagonal tile.
const unsigned int localAtomIndex = threadIdx.x; const unsigned int localAtomIndex = threadIdx.x;
if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx; unsigned int j = y*TILE_SIZE + tgx;
localData[localAtomIndex].posq = posq[j]; localData[localAtomIndex].posq = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
}
localData[localAtomIndex].force = make_real3(0); localData[localAtomIndex].force = make_real3(0);
CLEAR_LOCAL_DERIVATIVES CLEAR_LOCAL_DERIVATIVES
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
if (!hasExclusions && flags == 0) {
// No interactions in this tile.
}
else
#endif
{
// Compute the full set of interactions in this tile.
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx)); excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
#endif #endif
unsigned int tj = tgx; unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = tbx+tj; int atom2 = tbx+tj;
real4 posq2 = localData[atom2].posq; real4 posq2 = localData[atom2].posq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z); real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
...@@ -180,6 +121,9 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc ...@@ -180,6 +121,9 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
atom2 = y*TILE_SIZE+tj; atom2 = y*TILE_SIZE+tj;
real dEdR = 0; real dEdR = 0;
real tempEnergy = 0; real tempEnergy = 0;
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_INTERACTION COMPUTE_INTERACTION
dEdR /= -r; dEdR /= -r;
...@@ -203,27 +147,216 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc ...@@ -203,27 +147,216 @@ extern "C" __global__ void computeN2Energy(unsigned long long* __restrict__ forc
tj = (tj + 1) & (TILE_SIZE - 1); tj = (tj + 1) & (TILE_SIZE - 1);
} }
} }
}
}
lasty = y;
// Write results. // Write results.
if (pos < end) { unsigned int offset = x*TILE_SIZE + tgx;
const unsigned int offset = x*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0x100000000))); atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000))); atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000))); atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
STORE_DERIVATIVES_1 STORE_DERIVATIVES_1
} if (x != y) {
if (pos < end && x != y) { offset = y*TILE_SIZE + tgx;
const unsigned int offset = y*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000))); atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000))); atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000))); atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
STORE_DERIVATIVES_2 STORE_DERIVATIVES_2
} }
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
#else
int pos = warp*numTiles/totalWarps;
int end = (warp+1)*numTiles/totalWarps;
#endif
int skipBase = 0;
int currentSkipIndex = tbx;
__shared__ int atomIndices[THREAD_BLOCK_SIZE];
__shared__ int skipTiles[THREAD_BLOCK_SIZE];
skipTiles[threadIdx.x] = -1;
while (pos < end) {
const bool isExcluded = false;
real3 force = make_real3(0);
DECLARE_ATOM1_DERIVATIVES
bool includeTile = true;
// Extract the coordinates of this tile.
unsigned int x, y;
bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
singlePeriodicCopy = tileIndices.y;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
// Skip over tiles that have exclusions, since they were already processed.
while (skipTiles[tbx+TILE_SIZE-1] < pos) {
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx];
skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
skipTiles[threadIdx.x] = end;
skipBase += TILE_SIZE;
currentSkipIndex = tbx;
}
while (skipTiles[currentSkipIndex] < pos)
currentSkipIndex++;
includeTile = (skipTiles[currentSkipIndex] != pos);
}
if (includeTile) {
unsigned int atom1 = x*TILE_SIZE + tgx;
// Load atom data for this tile.
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
const unsigned int localAtomIndex = threadIdx.x;
#ifdef USE_CUTOFF
unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
#else
unsigned int j = y*TILE_SIZE + tgx;
#endif
atomIndices[threadIdx.x] = j;
if (j < PADDED_NUM_ATOMS) {
localData[localAtomIndex].posq = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData[localAtomIndex].force = make_real3(0);
CLEAR_LOCAL_DERIVATIVES
}
#ifdef USE_PERIODIC
if (singlePeriodicCopy) {
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
real4 blockCenterX = blockCenter[x];
posq1.x -= floor((posq1.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
posq1.y -= floor((posq1.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
posq1.z -= floor((posq1.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
localData[threadIdx.x].posq.x -= floor((localData[threadIdx.x].posq.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
localData[threadIdx.x].posq.y -= floor((localData[threadIdx.x].posq.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
localData[threadIdx.x].posq.z -= floor((localData[threadIdx.x].posq.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj;
real4 posq2 = localData[atom2].posq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = atomIndices[tbx+tj];
real dEdR = 0;
real tempEnergy = 0;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_INTERACTION
dEdR /= -r;
}
energy += tempEnergy;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
atom2 = tbx+tj;
localData[atom2].force.x += delta.x;
localData[atom2].force.y += delta.y;
localData[atom2].force.z += delta.z;
RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
}
#endif
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
else
#endif
{
// We need to apply periodic boundary conditions separately for each interaction.
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj;
real4 posq2 = localData[atom2].posq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
#ifdef USE_CUTOFF
if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = atomIndices[tbx+tj];
real dEdR = 0;
real tempEnergy = 0;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_INTERACTION
dEdR /= -r;
}
energy += tempEnergy;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
atom2 = tbx+tj;
localData[atom2].force.x += delta.x;
localData[atom2].force.y += delta.y;
localData[atom2].force.z += delta.z;
RECORD_DERIVATIVE_2
#ifdef USE_CUTOFF
}
#endif
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
// Write results.
atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
unsigned int offset = atom1;
STORE_DERIVATIVES_1
#ifdef USE_CUTOFF
unsigned int atom2 = atomIndices[threadIdx.x];
#else
unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
if (atom2 < PADDED_NUM_ATOMS) {
atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.x*0x100000000)));
atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.y*0x100000000)));
atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].force.z*0x100000000)));
offset = atom2;
STORE_DERIVATIVES_2
}
}
pos++; pos++;
} while (pos < end); }
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy; energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
} }
#define TILE_SIZE 32
typedef struct { typedef struct {
real4 posq; real4 posq;
real value, temp; real value, temp;
...@@ -13,86 +11,41 @@ typedef struct { ...@@ -13,86 +11,41 @@ typedef struct {
* Compute a value based on pair interactions. * Compute a value based on pair interactions.
*/ */
extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions, extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const unsigned int* __restrict__ exclusions,
const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices, unsigned long long* __restrict__ global_value, const ushort2* __restrict__ exclusionTiles, unsigned long long* __restrict__ global_value,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const real4* __restrict__ blockCenter, const unsigned int* __restrict__ interactingAtoms
#else #else
unsigned int numTiles unsigned int numTiles
#endif #endif
PARAMETER_ARGUMENTS) { PARAMETER_ARGUMENTS) {
unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE; const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
unsigned int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
unsigned int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
#else
unsigned int pos = warp*numTiles/totalWarps;
unsigned int end = (warp+1)*numTiles/totalWarps;
#endif
real energy = 0;
unsigned int lasty = 0xFFFFFFFF;
__shared__ AtomData localData[THREAD_BLOCK_SIZE];
__shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
__shared__ int exclusionIndex[WARPS_PER_GROUP];
do {
// Extract the coordinates of this tile
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx; const unsigned int tbx = threadIdx.x - tgx;
const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE; __shared__ AtomData localData[THREAD_BLOCK_SIZE];
unsigned int x, y;
// First loop: process tiles that contain exclusions.
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
real value = 0; real value = 0;
if (pos < end) {
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
y = tileIndices.y;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
unsigned int atom1 = x*TILE_SIZE + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
real4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS LOAD_ATOM1_PARAMETERS
// Locate the exclusion data for this tile.
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
if (tgx < 2) unsigned int excl = exclusions[pos*TILE_SIZE+tgx];
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
bool hasExclusions = (exclusionIndex[localGroupIndex] > -1);
#else
bool hasExclusions = false;
#endif #endif
if (pos >= end) if (x == y) {
; // This warp is done.
else if (x == y) {
// This tile is on the diagonal. // This tile is on the diagonal.
const unsigned int localAtomIndex = threadIdx.x; const unsigned int localAtomIndex = threadIdx.x;
localData[localAtomIndex].posq = posq1; localData[localAtomIndex].posq = posq1;
LOAD_LOCAL_PARAMETERS_FROM_1 LOAD_LOCAL_PARAMETERS_FROM_1
#ifdef USE_EXCLUSIONS
unsigned int excl = exclusions[exclusionIndex[localGroupIndex]+tgx];
#endif
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS
bool isExcluded = !(excl & 0x1);
#endif
int atom2 = tbx+j; int atom2 = tbx+j;
real4 posq2 = localData[atom2].posq; real4 posq2 = localData[atom2].posq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z); real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
...@@ -112,7 +65,8 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const ...@@ -112,7 +65,8 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
real tempValue1 = 0; real tempValue1 = 0;
real tempValue2 = 0; real tempValue2 = 0;
#ifdef USE_EXCLUSIONS #ifdef USE_EXCLUSIONS
if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) { bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
if (!isExcluded && atom1 != atom2) {
#else #else
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && atom1 != atom2) {
#endif #endif
...@@ -130,25 +84,17 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const ...@@ -130,25 +84,17 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
else { else {
// This is an off-diagonal tile. // This is an off-diagonal tile.
if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx;
localData[threadIdx.x].posq = posq[j];
const unsigned int localAtomIndex = threadIdx.x; const unsigned int localAtomIndex = threadIdx.x;
unsigned int j = y*TILE_SIZE + tgx;
localData[localAtomIndex].posq = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
} localData[localAtomIndex].value = 0;
localData[threadIdx.x].value = 0; #ifdef USE_EXCLUSIONS
#ifdef USE_CUTOFF excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx));
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF); #endif
if (!hasExclusions && flags != 0xFFFFFFFF) { unsigned int tj = tgx;
if (flags == 0) { for (j = 0; j < TILE_SIZE; j++) {
// No interactions in this tile. int atom2 = tbx+tj;
}
else {
// Compute only a subset of the interactions in this tile.
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
int atom2 = tbx+j;
real4 posq2 = localData[atom2].posq; real4 posq2 = localData[atom2].posq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z); real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
...@@ -157,44 +103,162 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const ...@@ -157,44 +103,162 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real tempValue1 = 0; #ifdef USE_CUTOFF
real tempValue2 = 0;
if (r2 < CUTOFF_SQUARED) { if (r2 < CUTOFF_SQUARED) {
#endif
real invR = RSQRT(r2); real invR = RSQRT(r2);
real r = RECIP(invR); real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+j; atom2 = y*TILE_SIZE+tj;
real tempValue1 = 0;
real tempValue2 = 0;
#ifdef USE_EXCLUSIONS
bool isExcluded = (atom1 >= NUM_ATOMS || atom2 >= NUM_ATOMS || !(excl & 0x1));
if (!isExcluded) {
#else
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#endif
COMPUTE_VALUE COMPUTE_VALUE
} }
value += tempValue1; value += tempValue1;
localData[tbx+tj].value += tempValue2;
#ifdef USE_CUTOFF
}
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif
tj = (tj + 1) & (TILE_SIZE - 1);
}
} }
localData[threadIdx.x].temp = tempValue2;
// Sum the forces on atom2. // Write results.
if (tgx % 4 == 0) unsigned int offset = x*TILE_SIZE + tgx;
localData[threadIdx.x].temp += localData[threadIdx.x+1].temp+localData[threadIdx.x+2].temp+localData[threadIdx.x+3].temp; atomicAdd(&global_value[offset], static_cast<unsigned long long>((long long) (value*0x100000000)));
if (tgx == 0) if (x != y) {
localData[tbx+j].value += localData[threadIdx.x].temp+localData[threadIdx.x+4].temp+localData[threadIdx.x+8].temp+localData[threadIdx.x+12].temp+localData[threadIdx.x+16].temp+localData[threadIdx.x+20].temp+localData[threadIdx.x+24].temp+localData[threadIdx.x+28].temp; offset = y*TILE_SIZE + tgx;
} atomicAdd(&global_value[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].value*0x100000000)));
} }
} }
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
#else
int pos = warp*numTiles/totalWarps;
int end = (warp+1)*numTiles/totalWarps;
#endif
int skipBase = 0;
int currentSkipIndex = tbx;
__shared__ int atomIndices[THREAD_BLOCK_SIZE];
__shared__ int skipTiles[THREAD_BLOCK_SIZE];
skipTiles[threadIdx.x] = -1;
while (pos < end) {
real value = 0;
bool includeTile = true;
// Extract the coordinates of this tile.
unsigned int x, y;
bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
singlePeriodicCopy = tileIndices.y;
} }
else else
#endif #endif
{ {
// Compute the full set of interactions in this tile. y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
#ifdef USE_EXCLUSIONS // Skip over tiles that have exclusions, since they were already processed.
unsigned int excl = (hasExclusions ? exclusions[exclusionIndex[localGroupIndex]+tgx] : 0xFFFFFFFF);
excl = (excl >> tgx) | (excl << (TILE_SIZE - tgx)); while (skipTiles[tbx+TILE_SIZE-1] < pos) {
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx];
skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
}
else
skipTiles[threadIdx.x] = end;
skipBase += TILE_SIZE;
currentSkipIndex = tbx;
}
while (skipTiles[currentSkipIndex] < pos)
currentSkipIndex++;
includeTile = (skipTiles[currentSkipIndex] != pos);
}
if (includeTile) {
unsigned int atom1 = x*TILE_SIZE + tgx;
// Load atom data for this tile.
real4 posq1 = posq[atom1];
LOAD_ATOM1_PARAMETERS
const unsigned int localAtomIndex = threadIdx.x;
#ifdef USE_CUTOFF
unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
#else
unsigned int j = y*TILE_SIZE + tgx;
#endif #endif
atomIndices[threadIdx.x] = j;
if (j < PADDED_NUM_ATOMS) {
localData[localAtomIndex].posq = posq[j];
LOAD_LOCAL_PARAMETERS_FROM_GLOBAL
localData[localAtomIndex].value = 0;
}
#ifdef USE_PERIODIC
if (singlePeriodicCopy) {
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
real4 blockCenterX = blockCenter[x];
posq1.x -= floor((posq1.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
posq1.y -= floor((posq1.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
posq1.z -= floor((posq1.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
localData[threadIdx.x].posq.x -= floor((localData[threadIdx.x].posq.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
localData[threadIdx.x].posq.y -= floor((localData[threadIdx.x].posq.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
localData[threadIdx.x].posq.z -= floor((localData[threadIdx.x].posq.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
unsigned int tj = tgx; unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (unsigned int j = 0; j < TILE_SIZE; j++) {
#ifdef USE_EXCLUSIONS int atom2 = tbx+tj;
bool isExcluded = !(excl & 0x1); real4 posq2 = localData[atom2].posq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
if (r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2);
real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS
atom2 = atomIndices[tbx+tj];
real tempValue1 = 0;
real tempValue2 = 0;
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
COMPUTE_VALUE
}
value += tempValue1;
localData[tbx+tj].value += tempValue2;
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
else
#endif #endif
{
// We need to apply periodic boundary conditions separately for each interaction.
unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) {
int atom2 = tbx+tj; int atom2 = tbx+tj;
real4 posq2 = localData[atom2].posq; real4 posq2 = localData[atom2].posq;
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z); real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
...@@ -210,41 +274,32 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const ...@@ -210,41 +274,32 @@ extern "C" __global__ void computeN2Value(const real4* __restrict__ posq, const
real invR = RSQRT(r2); real invR = RSQRT(r2);
real r = RECIP(invR); real r = RECIP(invR);
LOAD_ATOM2_PARAMETERS LOAD_ATOM2_PARAMETERS
atom2 = y*TILE_SIZE+tj; atom2 = atomIndices[tbx+tj];
real tempValue1 = 0; real tempValue1 = 0;
real tempValue2 = 0; real tempValue2 = 0;
#ifdef USE_EXCLUSIONS
if (!isExcluded && atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#else
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#endif
COMPUTE_VALUE COMPUTE_VALUE
} }
value += tempValue1; value += tempValue1;
localData[tbx+tj].value += tempValue2; localData[tbx+tj].value += tempValue2;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
} }
#endif
#ifdef USE_EXCLUSIONS
excl >>= 1;
#endif #endif
tj = (tj + 1) & (TILE_SIZE - 1); tj = (tj + 1) & (TILE_SIZE - 1);
} }
} }
}
}
// Write results. // Write results.
if (pos < end) { atomicAdd(&global_value[atom1], static_cast<unsigned long long>((long long) (value*0x100000000)));
const unsigned int offset = x*TILE_SIZE + tgx; #ifdef USE_CUTOFF
atomicAdd(&global_value[offset], static_cast<unsigned long long>((long long) (value*0x100000000))); unsigned int atom2 = atomIndices[threadIdx.x];
} #else
if (pos < end && x != y) { unsigned int atom2 = y*TILE_SIZE + tgx;
const unsigned int offset = y*TILE_SIZE + tgx; #endif
atomicAdd(&global_value[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].value*0x100000000))); if (atom2 < PADDED_NUM_ATOMS)
atomicAdd(&global_value[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].value*0x100000000)));
} }
lasty = y;
pos++; pos++;
} while (pos < end); }
} }
...@@ -48,12 +48,12 @@ inline __device__ real computeAngle(real4 vec1, real4 vec2) { ...@@ -48,12 +48,12 @@ inline __device__ real computeAngle(real4 vec1, real4 vec2) {
real3 crossProduct = cross(vec1, vec2); real3 crossProduct = cross(vec1, vec2);
real scale = vec1.w*vec2.w; real scale = vec1.w*vec2.w;
angle = asin(SQRT(dot(crossProduct, crossProduct)/scale)); angle = ASIN(SQRT(dot(crossProduct, crossProduct)/scale));
if (cosine < 0.0f) if (cosine < 0.0f)
angle = M_PI-angle; angle = M_PI-angle;
} }
else else
angle = acos(cosine); angle = ACOS(cosine);
return angle; return angle;
} }
......
...@@ -35,11 +35,11 @@ extern "C" __global__ void calculateEwaldCosSinSums(real* __restrict__ energyBuf ...@@ -35,11 +35,11 @@ extern "C" __global__ void calculateEwaldCosSinSums(real* __restrict__ energyBuf
for (int atom = 0; atom < NUM_ATOMS; atom++) { for (int atom = 0; atom < NUM_ATOMS; atom++) {
real4 apos = posq[atom]; real4 apos = posq[atom];
real phase = apos.x*kx; real phase = apos.x*kx;
real2 structureFactor = make_real2(cos(phase), sin(phase)); real2 structureFactor = make_real2(COS(phase), SIN(phase));
phase = apos.y*ky; phase = apos.y*ky;
structureFactor = multofReal2(structureFactor, make_real2(cos(phase), sin(phase))); structureFactor = multofReal2(structureFactor, make_real2(COS(phase), SIN(phase)));
phase = apos.z*kz; phase = apos.z*kz;
structureFactor = multofReal2(structureFactor, make_real2(cos(phase), sin(phase))); structureFactor = multofReal2(structureFactor, make_real2(COS(phase), SIN(phase)));
sum += apos.w*structureFactor; sum += apos.w*structureFactor;
} }
cosSinSum[index] = sum; cosSinSum[index] = sum;
...@@ -76,9 +76,9 @@ extern "C" __global__ void calculateEwaldForces(unsigned long long* __restrict__ ...@@ -76,9 +76,9 @@ extern "C" __global__ void calculateEwaldForces(unsigned long long* __restrict__
for (int ry = lowry; ry < KMAX_Y; ry++) { for (int ry = lowry; ry < KMAX_Y; ry++) {
real ky = ry*reciprocalBoxSize.y; real ky = ry*reciprocalBoxSize.y;
real phase = apos.x*kx; real phase = apos.x*kx;
real2 tab_xy = make_real2(cos(phase), sin(phase)); real2 tab_xy = make_real2(COS(phase), SIN(phase));
phase = apos.y*ky; phase = apos.y*ky;
tab_xy = multofReal2(tab_xy, make_real2(cos(phase), sin(phase))); tab_xy = multofReal2(tab_xy, make_real2(COS(phase), SIN(phase)));
for (int rz = lowrz; rz < KMAX_Z; rz++) { for (int rz = lowrz; rz < KMAX_Z; rz++) {
real kz = rz*reciprocalBoxSize.z; real kz = rz*reciprocalBoxSize.z;
...@@ -88,7 +88,7 @@ extern "C" __global__ void calculateEwaldForces(unsigned long long* __restrict__ ...@@ -88,7 +88,7 @@ extern "C" __global__ void calculateEwaldForces(unsigned long long* __restrict__
real k2 = kx*kx + ky*ky + kz*kz; real k2 = kx*kx + ky*ky + kz*kz;
real ak = EXP(k2*EXP_COEFFICIENT)/k2; real ak = EXP(k2*EXP_COEFFICIENT)/k2;
phase = apos.z*kz; phase = apos.z*kz;
real2 structureFactor = multofReal2(tab_xy, make_real2(cos(phase), sin(phase))); real2 structureFactor = multofReal2(tab_xy, make_real2(COS(phase), SIN(phase)));
real2 sum = cosSinSum[index]; real2 sum = cosSinSum[index];
real dEdR = 2*reciprocalCoefficient*ak*apos.w*(sum.x*structureFactor.y - sum.y*structureFactor.x); real dEdR = 2*reciprocalCoefficient*ak*apos.w*(sum.x*structureFactor.y - sum.y*structureFactor.x);
force.x += dEdR*kx; force.x += dEdR*kx;
......
#define TILE_SIZE 32 #define GROUP_SIZE 256
#define GROUP_SIZE 64 #define BUFFER_GROUPS 2
#define BUFFER_GROUPS 4
#define BUFFER_SIZE BUFFER_GROUPS*GROUP_SIZE #define BUFFER_SIZE BUFFER_GROUPS*GROUP_SIZE
#define WARP_SIZE 32
#define INVALID 0xFFFF
/** /**
* Find a bounding box for the atoms in each block. * Find a bounding box for the atoms in each block.
*/ */
extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, const real4* __restrict__ posq, real4* __restrict__ blockCenter, real4* __restrict__ blockBoundingBox, unsigned int* __restrict__ interactionCount) { extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, const real4* __restrict__ posq,
real4* __restrict__ blockCenter, real4* __restrict__ blockBoundingBox, int* __restrict__ rebuildNeighborList, real2* __restrict__ sortedBlocks) {
int index = blockIdx.x*blockDim.x+threadIdx.x; int index = blockIdx.x*blockDim.x+threadIdx.x;
int base = index*TILE_SIZE; int base = index*TILE_SIZE;
while (base < numAtoms) { while (base < numAtoms) {
...@@ -30,68 +32,231 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize, ...@@ -30,68 +32,231 @@ extern "C" __global__ void findBlockBounds(int numAtoms, real4 periodicBoxSize,
minPos = make_real4(min(minPos.x,pos.x), min(minPos.y,pos.y), min(minPos.z,pos.z), 0); minPos = make_real4(min(minPos.x,pos.x), min(minPos.y,pos.y), min(minPos.z,pos.z), 0);
maxPos = make_real4(max(maxPos.x,pos.x), max(maxPos.y,pos.y), max(maxPos.z,pos.z), 0); maxPos = make_real4(max(maxPos.x,pos.x), max(maxPos.y,pos.y), max(maxPos.z,pos.z), 0);
} }
blockBoundingBox[index] = 0.5f*(maxPos-minPos); real4 blockSize = 0.5f*(maxPos-minPos);
blockBoundingBox[index] = blockSize;
blockCenter[index] = 0.5f*(maxPos+minPos); blockCenter[index] = 0.5f*(maxPos+minPos);
sortedBlocks[index] = make_real2(blockSize.x+blockSize.y+blockSize.z, index);
index += blockDim.x*gridDim.x; index += blockDim.x*gridDim.x;
base = index*TILE_SIZE; base = index*TILE_SIZE;
} }
if (blockIdx.x == 0 && threadIdx.x == 0) if (blockIdx.x == 0 && threadIdx.x == 0)
interactionCount[0] = 0; rebuildNeighborList[0] = 0;
} }
/** /**
* This is called by findBlocksWithInteractions(). It compacts the list of blocks and writes them * Sort the data about bounding boxes so it can be accessed more efficiently in the next kernel.
* to global memory.
*/ */
__device__ void storeInteractionData(ushort2* buffer, int* valid, short* sum, ushort2* temp, int* baseIndex, extern "C" __global__ void sortBoxData(const real2* __restrict__ sortedBlock, const real4* __restrict__ blockCenter,
unsigned int* interactionCount, ushort2* interactingTiles, real4 periodicBoxSize, const real4* __restrict__ blockBoundingBox, real4* __restrict__ sortedBlockCenter,
real4 invPeriodicBoxSize, const real4* posq, const real4* blockCenter, const real4* blockBoundingBox, unsigned int maxTiles) { real4* __restrict__ sortedBlockBoundingBox, const real4* __restrict__ posq, const real4* __restrict__ oldPositions,
// The buffer is full, so we need to compact it and write out results. Start by doing a parallel prefix sum. unsigned int* __restrict__ interactionCount, int* __restrict__ rebuildNeighborList) {
for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_BLOCKS; i += blockDim.x*gridDim.x) {
int index = (int) sortedBlock[i].y;
sortedBlockCenter[i] = blockCenter[index];
sortedBlockBoundingBox[i] = blockBoundingBox[index];
}
// Also check whether any atom has moved enough so that we really need to rebuild the neighbor list.
for (int i = threadIdx.x; i < BUFFER_SIZE; i += GROUP_SIZE) bool rebuild = false;
temp[i].x = (valid[i] ? 1 : 0); for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x) {
real4 delta = oldPositions[i]-posq[i];
if (delta.x*delta.x + delta.y*delta.y + delta.z*delta.z > 0.25f*PADDING*PADDING)
rebuild = true;
}
if (rebuild) {
rebuildNeighborList[0] = 1;
interactionCount[0] = 0;
}
}
/**
* Perform a parallel prefix sum over an array. The input values are all assumed to be 0 or 1.
*/
__device__ void prefixSum(short* sum, ushort2* temp) {
#if __CUDA_ARCH__ >= 300
const int indexInWarp = threadIdx.x%WARP_SIZE;
const int warpMask = (2<<indexInWarp)-1;
for (int base = 0; base < BUFFER_SIZE; base += blockDim.x)
temp[base+threadIdx.x].x = __popc(__ballot(sum[base+threadIdx.x])&warpMask);
__syncthreads();
if (threadIdx.x < BUFFER_SIZE/WARP_SIZE) {
int multiWarpSum = temp[(threadIdx.x+1)*WARP_SIZE-1].x;
for (int offset = 1; offset < BUFFER_SIZE/WARP_SIZE; offset *= 2) {
short n = __shfl_up(multiWarpSum, offset, WARP_SIZE);
if (indexInWarp >= offset)
multiWarpSum += n;
}
temp[threadIdx.x].y = multiWarpSum;
}
__syncthreads();
for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
sum[i] = temp[i].x+(i < WARP_SIZE ? 0 : temp[i/WARP_SIZE-1].y);
__syncthreads();
#else
for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
temp[i].x = sum[i];
__syncthreads(); __syncthreads();
int whichBuffer = 0; int whichBuffer = 0;
for (int offset = 1; offset < BUFFER_SIZE; offset *= 2) { for (int offset = 1; offset < BUFFER_SIZE; offset *= 2) {
if (whichBuffer == 0) if (whichBuffer == 0)
for (int i = threadIdx.x; i < BUFFER_SIZE; i += GROUP_SIZE) for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
temp[i].y = (i < offset ? temp[i].x : temp[i].x+temp[i-offset].x); temp[i].y = (i < offset ? temp[i].x : temp[i].x+temp[i-offset].x);
else else
for (int i = threadIdx.x; i < BUFFER_SIZE; i += GROUP_SIZE) for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
temp[i].x = (i < offset ? temp[i].y : temp[i].y+temp[i-offset].y); temp[i].x = (i < offset ? temp[i].y : temp[i].y+temp[i-offset].y);
whichBuffer = 1-whichBuffer; whichBuffer = 1-whichBuffer;
__syncthreads(); __syncthreads();
} }
if (whichBuffer == 0) if (whichBuffer == 0)
for (int i = threadIdx.x; i < BUFFER_SIZE; i += GROUP_SIZE) for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
sum[i] = temp[i].x; sum[i] = temp[i].x;
else else
for (int i = threadIdx.x; i < BUFFER_SIZE; i += GROUP_SIZE) for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
sum[i] = temp[i].y; sum[i] = temp[i].y;
__syncthreads(); __syncthreads();
int numValid = sum[BUFFER_SIZE-1]; #endif
}
/**
* This is called by findBlocksWithInteractions(). It compacts the list of blocks, identifies interactions
* in them, and writes the result to global memory.
*/
__device__ void storeInteractionData(unsigned short x, unsigned short* buffer, short* sum, ushort2* temp, int* atoms, int& numAtoms,
int& baseIndex, unsigned int* interactionCount, ushort2* interactingTiles, unsigned int* interactingAtoms, real4 periodicBoxSize,
real4 invPeriodicBoxSize, const real4* posq, real3* posBuffer, real4 blockCenterX, real4 blockSizeX, unsigned int maxTiles, bool finish) {
const bool singlePeriodicCopy = (0.5f*periodicBoxSize.x-blockSizeX.x >= PADDED_CUTOFF &&
0.5f*periodicBoxSize.y-blockSizeX.y >= PADDED_CUTOFF &&
0.5f*periodicBoxSize.z-blockSizeX.z >= PADDED_CUTOFF);
if (threadIdx.x < TILE_SIZE) {
real3 pos = trimTo3(posq[x*TILE_SIZE+threadIdx.x]);
posBuffer[threadIdx.x] = pos;
#ifdef USE_PERIODIC
if (singlePeriodicCopy) {
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
pos.x -= floor((pos.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
pos.y -= floor((pos.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
pos.z -= floor((pos.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
posBuffer[threadIdx.x] = pos;
}
#endif
}
// The buffer is full, so we need to compact it and write out results. Start by doing a parallel prefix sum.
for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
sum[i] = (buffer[i] == INVALID ? 0 : 1);
__syncthreads(); __syncthreads();
prefixSum(sum, temp);
int numValid = sum[BUFFER_SIZE-1];
// Compact the buffer. // Compact the buffer.
for (int i = threadIdx.x; i < BUFFER_SIZE; i += GROUP_SIZE) for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
if (valid[i]) { if (buffer[i] != INVALID)
temp[sum[i]-1] = buffer[i]; temp[sum[i]-1].x = buffer[i];
sum[i] = valid[i]; __syncthreads();
valid[i] = false; for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
buffer[i] = make_ushort2(1, 1); buffer[i] = temp[i].x;
__syncthreads();
// Loop over the tiles and find specific interactions in them.
const int indexInWarp = threadIdx.x%WARP_SIZE;
for (int base = 0; base < numValid; base += BUFFER_SIZE/WARP_SIZE) {
for (int i = threadIdx.x/WARP_SIZE; i < BUFFER_SIZE/WARP_SIZE && base+i < numValid; i += GROUP_SIZE/WARP_SIZE) {
// Check each atom in block Y for interactions.
real3 pos = trimTo3(posq[buffer[base+i]*TILE_SIZE+indexInWarp]);
#ifdef USE_PERIODIC
if (singlePeriodicCopy) {
pos.x -= floor((pos.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
pos.y -= floor((pos.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
pos.z -= floor((pos.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
}
#endif
bool interacts = false;
#ifdef USE_PERIODIC
if (!singlePeriodicCopy) {
for (int j = 0; j < TILE_SIZE; j++) {
real3 delta = pos-posBuffer[j];
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
}
}
else {
#endif
for (int j = 0; j < TILE_SIZE; j++) {
real3 delta = pos-posBuffer[j];
interacts |= (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED);
}
#ifdef USE_PERIODIC
} }
#endif
sum[i*WARP_SIZE+indexInWarp] = (interacts ? 1 : 0);
}
for (int i = numValid-base+threadIdx.x/WARP_SIZE; i < BUFFER_SIZE/WARP_SIZE; i += GROUP_SIZE/WARP_SIZE)
sum[i*WARP_SIZE+indexInWarp] = 0;
// Compact the list of atoms.
__syncthreads(); __syncthreads();
prefixSum(sum, temp);
for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
if (sum[i] != (i == 0 ? 0 : sum[i-1]))
atoms[numAtoms+sum[i]-1] = buffer[base+i/WARP_SIZE]*TILE_SIZE+indexInWarp;
// Store it to global memory. // Store them to global memory.
int atomsToStore = numAtoms+sum[BUFFER_SIZE-1];
bool storePartialTile = (finish && base >= numValid-BUFFER_SIZE/WARP_SIZE);
int tilesToStore = (storePartialTile ? (atomsToStore+TILE_SIZE-1)/TILE_SIZE : atomsToStore/TILE_SIZE);
if (tilesToStore > 0) {
if (threadIdx.x == 0) if (threadIdx.x == 0)
*baseIndex = atomicAdd(interactionCount, numValid); baseIndex = atomicAdd(interactionCount, tilesToStore);
__syncthreads(); __syncthreads();
if (*baseIndex+numValid <= maxTiles) if (threadIdx.x == 0)
for (int i = threadIdx.x; i < numValid; i += GROUP_SIZE) numAtoms = atomsToStore-tilesToStore*TILE_SIZE;
interactingTiles[*baseIndex+i] = temp[i]; if (baseIndex+tilesToStore <= maxTiles) {
if (threadIdx.x < tilesToStore)
interactingTiles[baseIndex+threadIdx.x] = make_ushort2(x, singlePeriodicCopy);
for (int i = threadIdx.x; i < tilesToStore*TILE_SIZE; i += blockDim.x)
interactingAtoms[baseIndex*TILE_SIZE+i] = (i < atomsToStore ? atoms[i] : NUM_ATOMS);
}
}
else {
__syncthreads();
if (threadIdx.x == 0)
numAtoms += sum[BUFFER_SIZE-1];
}
__syncthreads(); __syncthreads();
if (threadIdx.x < numAtoms && !storePartialTile)
atoms[threadIdx.x] = atoms[tilesToStore*TILE_SIZE+threadIdx.x];
}
if (numValid == 0 && numAtoms > 0 && finish) {
// We didn't have any more tiles to process, but there were some atoms left over from a
// previous call to this function. Save them now.
if (threadIdx.x == 0)
baseIndex = atomicAdd(interactionCount, 1);
__syncthreads();
if (baseIndex < maxTiles) {
if (threadIdx.x == 0)
interactingTiles[baseIndex] = make_ushort2(x, singlePeriodicCopy);
if (threadIdx.x < TILE_SIZE)
interactingAtoms[baseIndex*TILE_SIZE+threadIdx.x] = (threadIdx.x < numAtoms ? atoms[threadIdx.x] : NUM_ATOMS);
}
}
// Reset the buffer for processing more tiles.
for (int i = threadIdx.x; i < BUFFER_SIZE; i += blockDim.x)
buffer[i] = INVALID;
} }
/** /**
...@@ -100,139 +265,92 @@ __device__ void storeInteractionData(ushort2* buffer, int* valid, short* sum, us ...@@ -100,139 +265,92 @@ __device__ void storeInteractionData(ushort2* buffer, int* valid, short* sum, us
*/ */
extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, const real4* __restrict__ blockCenter, extern "C" __global__ void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, const real4* __restrict__ blockCenter,
const real4* __restrict__ blockBoundingBox, unsigned int* __restrict__ interactionCount, ushort2* __restrict__ interactingTiles, const real4* __restrict__ blockBoundingBox, unsigned int* __restrict__ interactionCount, ushort2* __restrict__ interactingTiles,
unsigned int* __restrict__ interactionFlags, const real4* __restrict__ posq, unsigned int maxTiles, unsigned int startTileIndex, unsigned int* __restrict__ interactingAtoms, const real4* __restrict__ posq, unsigned int maxTiles, unsigned int startBlockIndex,
unsigned int numTiles) { unsigned int numBlocks, real2* __restrict__ sortedBlocks, const real4* __restrict__ sortedBlockCenter, const real4* __restrict__ sortedBlockBoundingBox,
__shared__ ushort2 buffer[BUFFER_SIZE]; const unsigned int* __restrict__ exclusionIndices, const unsigned int* __restrict__ exclusionRowIndices, real4* __restrict__ oldPositions,
__shared__ int valid[BUFFER_SIZE]; const int* __restrict__ rebuildNeighborList) {
__shared__ unsigned short buffer[BUFFER_SIZE];
__shared__ short sum[BUFFER_SIZE]; __shared__ short sum[BUFFER_SIZE];
__shared__ ushort2 temp[BUFFER_SIZE]; __shared__ ushort2 temp[BUFFER_SIZE];
__shared__ int atoms[BUFFER_SIZE+TILE_SIZE];
__shared__ real3 posBuffer[TILE_SIZE];
__shared__ int exclusionsForX[MAX_EXCLUSIONS];
__shared__ int bufferFull; __shared__ int bufferFull;
__shared__ int globalIndex; __shared__ int globalIndex;
unsigned int endTileIndex = startTileIndex+numTiles; __shared__ int numAtoms;
if (rebuildNeighborList[0] == 0)
return; // The neighbor list doesn't need to be rebuilt.
int valuesInBuffer = 0; int valuesInBuffer = 0;
if (threadIdx.x == 0) if (threadIdx.x == 0)
bufferFull = false; bufferFull = false;
for (int i = 0; i < BUFFER_GROUPS; ++i) for (int i = 0; i < BUFFER_GROUPS; ++i)
valid[i*GROUP_SIZE+threadIdx.x] = false; buffer[i*GROUP_SIZE+threadIdx.x] = INVALID;
__syncthreads(); __syncthreads();
for (int baseIndex = startTileIndex+blockIdx.x*blockDim.x; baseIndex < endTileIndex; baseIndex += blockDim.x*gridDim.x) {
// Identify the pair of blocks to compare.
int index = baseIndex+threadIdx.x; // Loop over blocks sorted by size.
if (index < endTileIndex) {
unsigned int y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*index)); for (int i = startBlockIndex+blockIdx.x; i < startBlockIndex+numBlocks; i += gridDim.x) {
unsigned int x = (index-y*NUM_BLOCKS+y*(y+1)/2); if (threadIdx.x == blockDim.x-1)
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error. numAtoms = 0;
y += (x < y ? -1 : 1); real2 sortedKey = sortedBlocks[i];
x = (index-y*NUM_BLOCKS+y*(y+1)/2); unsigned short x = (unsigned short) sortedKey.y;
} real4 blockCenterX = blockCenter[x];
real4 blockSizeX = blockBoundingBox[x];
// Find the distance between the bounding boxes of the two cells. // Load exclusion data for block x.
real4 delta = blockCenter[x]-blockCenter[y]; const int exclusionStart = exclusionRowIndices[x];
real4 boxSizea = blockBoundingBox[x]; const int exclusionEnd = exclusionRowIndices[x+1];
real4 boxSizeb = blockBoundingBox[y]; const int numExclusions = exclusionEnd-exclusionStart;
for (int j = threadIdx.x; j < numExclusions; j += blockDim.x)
exclusionsForX[j] = exclusionIndices[exclusionStart+j];
__syncthreads();
// Compare it to other blocks after this one in sorted order.
for (int base = i+1; base < NUM_BLOCKS; base += blockDim.x) {
int j = base+threadIdx.x;
real2 sortedKey2 = (j < NUM_BLOCKS ? sortedBlocks[j] : make_real2(0));
real4 blockCenterY = (j < NUM_BLOCKS ? sortedBlockCenter[j] : make_real4(0));
real4 blockSizeY = (j < NUM_BLOCKS ? sortedBlockBoundingBox[j] : make_real4(0));
unsigned short y = (unsigned short) sortedKey2.y;
real4 delta = blockCenterX-blockCenterY;
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
delta.x = max(0.0f, fabs(delta.x)-boxSizea.x-boxSizeb.x); delta.x = max(0.0f, fabs(delta.x)-blockSizeX.x-blockSizeY.x);
delta.y = max(0.0f, fabs(delta.y)-boxSizea.y-boxSizeb.y); delta.y = max(0.0f, fabs(delta.y)-blockSizeX.y-blockSizeY.y);
delta.z = max(0.0f, fabs(delta.z)-boxSizea.z-boxSizeb.z); delta.z = max(0.0f, fabs(delta.z)-blockSizeX.z-blockSizeY.z);
if (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < CUTOFF_SQUARED) { bool hasExclusions = false;
for (int k = 0; k < numExclusions; k++)
hasExclusions |= (exclusionsForX[k] == y);
if (j < NUM_BLOCKS && delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < PADDED_CUTOFF_SQUARED && !hasExclusions) {
// Add this tile to the buffer. // Add this tile to the buffer.
int bufferIndex = valuesInBuffer*GROUP_SIZE+threadIdx.x; int bufferIndex = valuesInBuffer*GROUP_SIZE+threadIdx.x;
valid[bufferIndex] = true; buffer[bufferIndex] = y;
buffer[bufferIndex] = make_ushort2(x, y);
valuesInBuffer++; valuesInBuffer++;
if (!bufferFull && valuesInBuffer == BUFFER_GROUPS) if (!bufferFull && valuesInBuffer == BUFFER_GROUPS)
bufferFull = true; bufferFull = true;
} }
}
__syncthreads(); __syncthreads();
if (bufferFull) { if (bufferFull) {
storeInteractionData(buffer, valid, sum, temp, &globalIndex, interactionCount, interactingTiles, periodicBoxSize, invPeriodicBoxSize, posq, blockCenter, blockBoundingBox, maxTiles); storeInteractionData(x, buffer, sum, temp, atoms, numAtoms, globalIndex, interactionCount, interactingTiles, interactingAtoms, periodicBoxSize, invPeriodicBoxSize, posq, posBuffer, blockCenterX, blockSizeX, maxTiles, false);
valuesInBuffer = 0; valuesInBuffer = 0;
if (threadIdx.x == 0) if (threadIdx.x == 0)
bufferFull = false; bufferFull = false;
__syncthreads(); __syncthreads();
} }
} }
storeInteractionData(buffer, valid, sum, temp, &globalIndex, interactionCount, interactingTiles, periodicBoxSize, invPeriodicBoxSize, posq, blockCenter, blockBoundingBox, maxTiles); storeInteractionData(x, buffer, sum, temp, atoms, numAtoms, globalIndex, interactionCount, interactingTiles, interactingAtoms, periodicBoxSize, invPeriodicBoxSize, posq, posBuffer, blockCenterX, blockSizeX, maxTiles, true);
}
/**
* Compare each atom in one block to the bounding box of another block, and set
* flags for which ones are interacting.
*/
extern "C" __global__ void findInteractionsWithinBlocks(real4 periodicBoxSize, real4 invPeriodicBoxSize, const real4* __restrict__ posq, const ushort2* __restrict__ tiles, const real4* __restrict__ blockCenter,
const real4* __restrict__ blockBoundingBox, unsigned int* __restrict__ interactionFlags, const unsigned int* __restrict__ interactionCount, unsigned int maxTiles) {
unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
unsigned int numTiles = interactionCount[0];
unsigned int pos = warp*numTiles/totalWarps;
unsigned int end = (warp+1)*numTiles/totalWarps;
unsigned int index = threadIdx.x & (TILE_SIZE - 1);
#if (__CUDA_ARCH__ < 200)
__shared__ unsigned int flags[128];
#endif
if (numTiles > maxTiles)
return;
unsigned int lasty = 0xFFFFFFFF;
real4 apos;
while (pos < end) {
// Extract the coordinates of this tile
ushort2 tileIndices = tiles[pos];
unsigned int x = tileIndices.x;
unsigned int y = tileIndices.y;
if (x == y) {
if (index == 0)
interactionFlags[pos] = 0xFFFFFFFF;
} }
else {
// Load the bounding box for x and the atom positions for y.
real4 center = blockCenter[x];
real4 boxSize = blockBoundingBox[x];
if (y != lasty)
apos = posq[y*TILE_SIZE+index];
// Find the distance of the atom from the bounding box. // Record the positions the neighbor list is based on.
real4 delta = apos-center; for (int i = threadIdx.x+blockIdx.x*blockDim.x; i < NUM_ATOMS; i += blockDim.x*gridDim.x)
#ifdef USE_PERIODIC oldPositions[i] = posq[i];
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif
delta.x = max((real) 0, fabs(delta.x)-boxSize.x);
delta.y = max((real) 0, fabs(delta.y)-boxSize.y);
delta.z = max((real) 0, fabs(delta.z)-boxSize.z);
#if (__CUDA_ARCH__ < 200)
flags[threadIdx.x] = (delta.x*delta.x+delta.y*delta.y+delta.z*delta.z > CUTOFF_SQUARED ? 0 : 1 << index);
if (index % 4 == 0)
flags[threadIdx.x] += flags[threadIdx.x+1]+flags[threadIdx.x+2]+flags[threadIdx.x+3];
unsigned int allFlags = 0;
if (index == 0)
allFlags = flags[threadIdx.x]+flags[threadIdx.x+4]+flags[threadIdx.x+8]+flags[threadIdx.x+12]+flags[threadIdx.x+16]+flags[threadIdx.x+20]+flags[threadIdx.x+24]+flags[threadIdx.x+28];
#else
unsigned int allFlags = __ballot(delta.x*delta.x+delta.y*delta.y+delta.z*delta.z < CUTOFF_SQUARED);
#endif
// Sum the flags.
if (index == 0) {
// Count how many flags are set, and based on that decide whether to compute all interactions
// or only a fraction of them.
int bits = __popc(allFlags);
interactionFlags[pos] = (bits > 12 ? 0xFFFFFFFF : allFlags);
}
lasty = y;
}
pos++;
}
} }
#define DIELECTRIC_OFFSET 0.009f #define DIELECTRIC_OFFSET 0.009f
#define PROBE_RADIUS 0.14f #define PROBE_RADIUS 0.14f
#define SURFACE_AREA_FACTOR -170.351730667551f //-6.0f*3.14159265358979323846f*0.0216f*1000.0f*0.4184f; #define SURFACE_AREA_FACTOR -170.351730667551f //-6.0f*3.14159265358979323846f*0.0216f*1000.0f*0.4184f;
#define TILE_SIZE 32
#define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE) #define WARPS_PER_GROUP (FORCE_WORK_GROUP_SIZE/TILE_SIZE)
/** /**
...@@ -70,58 +69,30 @@ typedef struct { ...@@ -70,58 +69,30 @@ typedef struct {
*/ */
extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ global_bornSum, const real4* __restrict__ posq, const float2* __restrict__ global_params, extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ global_bornSum, const real4* __restrict__ posq, const float2* __restrict__ global_params,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags, const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const real4* __restrict__ blockCenter, const unsigned int* __restrict__ interactingAtoms,
#else #else
unsigned int numTiles, unsigned int numTiles,
#endif #endif
unsigned int* exclusionIndices, unsigned int* exclusionRowIndices) { const ushort2* __restrict__ exclusionTiles) {
unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE; const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
unsigned int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
unsigned int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
#else
unsigned int pos = warp*numTiles/totalWarps;
unsigned int end = (warp+1)*numTiles/totalWarps;
#endif
unsigned int lasty = 0xFFFFFFFF;
__shared__ AtomData1 localData[FORCE_WORK_GROUP_SIZE];
__shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
__shared__ int exclusionIndex[WARPS_PER_GROUP];
#ifndef ENABLE_SHUFFLE
__shared__ real tempBuffer[FORCE_WORK_GROUP_SIZE];
#endif
do {
// Extract the coordinates of this tile
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
const unsigned int tbx = threadIdx.x - tgx; const unsigned int tbx = threadIdx.x - tgx;
const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE; __shared__ AtomData1 localData[FORCE_WORK_GROUP_SIZE];
unsigned int x, y;
// First loop: process tiles that contain exclusions.
const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
real bornSum = 0; real bornSum = 0;
if (pos < end) {
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
y = tileIndices.y;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
unsigned int atom1 = x*TILE_SIZE + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
real4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
float2 params1 = global_params[atom1]; float2 params1 = global_params[atom1];
if (pos >= end) if (x == y) {
; // This warp is done.
else if (x == y) {
// This tile is on the diagonal. // This tile is on the diagonal.
localData[threadIdx.x].x = posq1.x; localData[threadIdx.x].x = posq1.x;
...@@ -155,8 +126,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa ...@@ -155,8 +126,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
real ratio = LOG(u_ij * RECIP(l_ij)); real ratio = LOG(u_ij * RECIP(l_ij));
bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) + bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
(params2.y*params2.y*invR)*(l_ij2-u_ij2)); (params2.y*params2.y*invR)*(l_ij2-u_ij2));
if (params1.x < params2.y-r) bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
bornSum += 2.0f*(RECIP(params1.x)-l_ij);
} }
} }
} }
...@@ -164,7 +134,6 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa ...@@ -164,7 +134,6 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
else { else {
// This is an off-diagonal tile. // This is an off-diagonal tile.
if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx; unsigned int j = y*TILE_SIZE + tgx;
real4 tempPosq = posq[j]; real4 tempPosq = posq[j];
localData[threadIdx.x].x = tempPosq.x; localData[threadIdx.x].x = tempPosq.x;
...@@ -174,46 +143,27 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa ...@@ -174,46 +143,27 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
float2 tempParams = global_params[j]; float2 tempParams = global_params[j];
localData[threadIdx.x].radius = tempParams.x; localData[threadIdx.x].radius = tempParams.x;
localData[threadIdx.x].scaledRadius = tempParams.y; localData[threadIdx.x].scaledRadius = tempParams.y;
}
localData[threadIdx.x].bornSum = 0.0f; localData[threadIdx.x].bornSum = 0.0f;
#ifdef USE_CUTOFF
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF);
bool computeSubset = false;
if (flags != 0xFFFFFFFF) {
if (tgx < 2)
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
computeSubset = (exclusionIndex[localGroupIndex] == -1);
}
if (computeSubset) {
if (flags == 0) {
// No interactions in this tile.
}
else {
// Compute only a subset of the interactions in this tile.
for (unsigned int j = 0; j < TILE_SIZE; j++) { // Compute the full set of interactions in this tile.
if ((flags&(1<<j)) != 0) {
real3 delta = make_real3(localData[tbx+j].x-posq1.x, localData[tbx+j].y-posq1.y, localData[tbx+j].z-posq1.z); unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
real3 delta = make_real3(localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
real sum = 0;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS && r2 < CUTOFF_SQUARED) { if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else #else
if (atom1 < NUM_ATOMS && y*TILE_SIZE+j < NUM_ATOMS) { if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
#endif #endif
real invR = RSQRT(r2); real invR = RSQRT(r2);
real r = RECIP(invR); real r = RECIP(invR);
float2 params2 = make_float2(localData[tbx+j].radius, localData[tbx+j].scaledRadius); float2 params2 = make_float2(localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
real rScaledRadiusJ = r+params2.y; real rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) { if (params1.x < rScaledRadiusJ) {
real l_ij = RECIP(max(params1.x, fabs(r-params2.y))); real l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
...@@ -223,8 +173,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa ...@@ -223,8 +173,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
real ratio = LOG(u_ij * RECIP(l_ij)); real ratio = LOG(u_ij * RECIP(l_ij));
bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) + bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
(params2.y*params2.y*invR)*(l_ij2-u_ij2)); (params2.y*params2.y*invR)*(l_ij2-u_ij2));
if (params1.x < params2.y-r) bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
bornSum += 2.0f*(RECIP(params1.x)-l_ij);
} }
real rScaledRadiusI = r+params1.y; real rScaledRadiusI = r+params1.y;
if (params2.x < rScaledRadiusI) { if (params2.x < rScaledRadiusI) {
...@@ -235,37 +184,160 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa ...@@ -235,37 +184,160 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
real ratio = LOG(u_ij * RECIP(l_ij)); real ratio = LOG(u_ij * RECIP(l_ij));
real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) + real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
(params1.y*params1.y*invR)*(l_ij2-u_ij2)); (params1.y*params1.y*invR)*(l_ij2-u_ij2));
if (params2.x < params1.y-r) term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ij) : 0);
term += 2.0f*(RECIP(params2.x)-l_ij); localData[tbx+tj].bornSum += term;
sum = term; }
}
tj = (tj + 1) & (TILE_SIZE - 1);
} }
} }
// Sum the forces on atom j. // Write results.
#ifdef ENABLE_SHUFFLE unsigned int offset = x*TILE_SIZE + tgx;
for (int i = 16; i >= 1; i /= 2) atomicAdd(&global_bornSum[offset], static_cast<unsigned long long>((long long) (bornSum*0x100000000)));
sum += __shfl_xor(sum, i, 32); if (x != y) {
if (tgx == 0) offset = y*TILE_SIZE + tgx;
localData[tbx+j].bornSum += sum; atomicAdd(&global_bornSum[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].bornSum*0x100000000)));
}
}
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
#else #else
tempBuffer[threadIdx.x] = sum; int pos = warp*numTiles/totalWarps;
if (tgx % 4 == 0) int end = (warp+1)*numTiles/totalWarps;
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1]+tempBuffer[threadIdx.x+2]+tempBuffer[threadIdx.x+3];
if (tgx == 0)
localData[tbx+j].bornSum += tempBuffer[threadIdx.x]+tempBuffer[threadIdx.x+4]+tempBuffer[threadIdx.x+8]+tempBuffer[threadIdx.x+12]+tempBuffer[threadIdx.x+16]+tempBuffer[threadIdx.x+20]+tempBuffer[threadIdx.x+24]+tempBuffer[threadIdx.x+28];
#endif #endif
int skipBase = 0;
int currentSkipIndex = tbx;
__shared__ int atomIndices[FORCE_WORK_GROUP_SIZE];
__shared__ int skipTiles[FORCE_WORK_GROUP_SIZE];
skipTiles[threadIdx.x] = -1;
while (pos < end) {
real bornSum = 0;
bool includeTile = true;
// Extract the coordinates of this tile.
unsigned int x, y;
bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
singlePeriodicCopy = tileIndices.y;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
} }
// Skip over tiles that have exclusions, since they were already processed.
while (skipTiles[tbx+TILE_SIZE-1] < pos) {
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx];
skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
} }
else
skipTiles[threadIdx.x] = end;
skipBase += TILE_SIZE;
currentSkipIndex = tbx;
}
while (skipTiles[currentSkipIndex] < pos)
currentSkipIndex++;
includeTile = (skipTiles[currentSkipIndex] != pos);
}
if (includeTile) {
unsigned int atom1 = x*TILE_SIZE + tgx;
// Load atom data for this tile.
real4 posq1 = posq[atom1];
float2 params1 = global_params[atom1];
#ifdef USE_CUTOFF
unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
#else
unsigned int j = y*TILE_SIZE + tgx;
#endif
atomIndices[threadIdx.x] = j;
if (j < PADDED_NUM_ATOMS) {
real4 tempPosq = posq[j];
localData[threadIdx.x].x = tempPosq.x;
localData[threadIdx.x].y = tempPosq.y;
localData[threadIdx.x].z = tempPosq.z;
localData[threadIdx.x].q = tempPosq.w;
float2 tempParams = global_params[j];
localData[threadIdx.x].radius = tempParams.x;
localData[threadIdx.x].scaledRadius = tempParams.y;
localData[threadIdx.x].bornSum = 0.0f;
}
#ifdef USE_PERIODIC
if (singlePeriodicCopy) {
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
real4 blockCenterX = blockCenter[x];
posq1.x -= floor((posq1.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
posq1.y -= floor((posq1.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
posq1.z -= floor((posq1.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
localData[threadIdx.x].x -= floor((localData[threadIdx.x].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
localData[threadIdx.x].y -= floor((localData[threadIdx.x].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
localData[threadIdx.x].z -= floor((localData[threadIdx.x].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
real3 delta = make_real3(localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
int atom2 = atomIndices[tbx+tj];
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2);
real r = RECIP(invR);
float2 params2 = make_float2(localData[tbx+tj].radius, localData[tbx+tj].scaledRadius);
real rScaledRadiusJ = r+params2.y;
if (params1.x < rScaledRadiusJ) {
real l_ij = RECIP(max(params1.x, fabs(r-params2.y)));
real u_ij = RECIP(rScaledRadiusJ);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
(params2.y*params2.y*invR)*(l_ij2-u_ij2));
bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
}
real rScaledRadiusI = r+params1.y;
if (params2.x < rScaledRadiusI) {
real l_ij = RECIP(max(params2.x, fabs(r-params1.y)));
real u_ij = RECIP(rScaledRadiusI);
real l_ij2 = l_ij*l_ij;
real u_ij2 = u_ij*u_ij;
real ratio = LOG(u_ij * RECIP(l_ij));
real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
(params1.y*params1.y*invR)*(l_ij2-u_ij2));
term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ij) : 0);
localData[tbx+tj].bornSum += term;
}
}
tj = (tj + 1) & (TILE_SIZE - 1);
} }
} }
else else
#endif #endif
{ {
// Compute the full set of interactions in this tile. // We need to apply periodic boundary conditions separately for each interaction.
unsigned int tj = tgx; unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (j = 0; j < TILE_SIZE; j++) {
real3 delta = make_real3(localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z); real3 delta = make_real3(localData[tbx+tj].x-posq1.x, localData[tbx+tj].y-posq1.y, localData[tbx+tj].z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
...@@ -273,10 +345,11 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa ...@@ -273,10 +345,11 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z; delta.z -= floor(delta.z*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
#endif #endif
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z; real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
int atom2 = atomIndices[tbx+tj];
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS && r2 < CUTOFF_SQUARED) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS && r2 < CUTOFF_SQUARED) {
#else #else
if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) { if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
#endif #endif
real invR = RSQRT(r2); real invR = RSQRT(r2);
real r = RECIP(invR); real r = RECIP(invR);
...@@ -290,8 +363,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa ...@@ -290,8 +363,7 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
real ratio = LOG(u_ij * RECIP(l_ij)); real ratio = LOG(u_ij * RECIP(l_ij));
bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) + bornSum += l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
(params2.y*params2.y*invR)*(l_ij2-u_ij2)); (params2.y*params2.y*invR)*(l_ij2-u_ij2));
if (params1.x < params2.y-r) bornSum += (params1.x < params2.y-r ? 2.0f*(RECIP(params1.x)-l_ij) : 0);
bornSum += 2.0f*(RECIP(params1.x)-l_ij);
} }
real rScaledRadiusI = r+params1.y; real rScaledRadiusI = r+params1.y;
if (params2.x < rScaledRadiusI) { if (params2.x < rScaledRadiusI) {
...@@ -302,30 +374,27 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa ...@@ -302,30 +374,27 @@ extern "C" __global__ void computeBornSum(unsigned long long* __restrict__ globa
real ratio = LOG(u_ij * RECIP(l_ij)); real ratio = LOG(u_ij * RECIP(l_ij));
real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) + real term = l_ij - u_ij + (0.50f*invR*ratio) + 0.25f*(r*(u_ij2-l_ij2) +
(params1.y*params1.y*invR)*(l_ij2-u_ij2)); (params1.y*params1.y*invR)*(l_ij2-u_ij2));
if (params2.x < params1.y-r) term += (params2.x < params1.y-r ? 2.0f*(RECIP(params2.x)-l_ij) : 0);
term += 2.0f*(RECIP(params2.x)-l_ij);
localData[tbx+tj].bornSum += term; localData[tbx+tj].bornSum += term;
} }
} }
tj = (tj + 1) & (TILE_SIZE - 1); tj = (tj + 1) & (TILE_SIZE - 1);
} }
} }
}
}
// Write results. // Write results.
if (pos < end) { atomicAdd(&global_bornSum[atom1], static_cast<unsigned long long>((long long) (bornSum*0x100000000)));
const unsigned int offset = x*TILE_SIZE + tgx; #ifdef USE_CUTOFF
atomicAdd(&global_bornSum[offset], static_cast<unsigned long long>((long long) (bornSum*0x100000000))); unsigned int atom2 = atomIndices[threadIdx.x];
} #else
if (pos < end && x != y) { unsigned int atom2 = y*TILE_SIZE + tgx;
const unsigned int offset = y*TILE_SIZE + tgx; #endif
atomicAdd(&global_bornSum[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].bornSum*0x100000000))); if (atom2 < PADDED_NUM_ATOMS)
atomicAdd(&global_bornSum[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].bornSum*0x100000000)));
} }
lasty = y;
pos++; pos++;
} while (pos < end); }
} }
typedef struct { typedef struct {
...@@ -342,54 +411,27 @@ typedef struct { ...@@ -342,54 +411,27 @@ typedef struct {
extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ forceBuffers, unsigned long long* __restrict__ global_bornForce, extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ forceBuffers, unsigned long long* __restrict__ global_bornForce,
real* __restrict__ energyBuffer, const real4* __restrict__ posq, const real* __restrict__ global_bornRadii, real* __restrict__ energyBuffer, const real4* __restrict__ posq, const real* __restrict__ global_bornRadii,
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const unsigned int* __restrict__ interactionFlags, const ushort2* __restrict__ tiles, const unsigned int* __restrict__ interactionCount, real4 periodicBoxSize, real4 invPeriodicBoxSize, unsigned int maxTiles, const real4* __restrict__ blockCenter, const unsigned int* __restrict__ interactingAtoms,
#else #else
unsigned int numTiles, unsigned int numTiles,
#endif #endif
unsigned int* exclusionIndices, unsigned int* exclusionRowIndices) { const ushort2* __restrict__ exclusionTiles) {
unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE; const unsigned int totalWarps = (blockDim.x*gridDim.x)/TILE_SIZE;
unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE; const unsigned int warp = (blockIdx.x*blockDim.x+threadIdx.x)/TILE_SIZE;
#ifdef USE_CUTOFF const unsigned int tgx = threadIdx.x & (TILE_SIZE-1);
unsigned int numTiles = interactionCount[0]; const unsigned int tbx = threadIdx.x - tgx;
unsigned int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
unsigned int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
#else
unsigned int pos = warp*numTiles/totalWarps;
unsigned int end = (warp+1)*numTiles/totalWarps;
#endif
real energy = 0; real energy = 0;
unsigned int lasty = 0xFFFFFFFF;
__shared__ AtomData2 localData[FORCE_WORK_GROUP_SIZE]; __shared__ AtomData2 localData[FORCE_WORK_GROUP_SIZE];
__shared__ unsigned int exclusionRange[2*WARPS_PER_GROUP];
__shared__ int exclusionIndex[WARPS_PER_GROUP];
#ifndef ENABLE_SHUFFLE
__shared__ real4 tempBuffer[FORCE_WORK_GROUP_SIZE];
#endif
do { // First loop: process tiles that contain exclusions.
// Extract the coordinates of this tile
const unsigned int tgx = threadIdx.x & (TILE_SIZE-1); const unsigned int firstExclusionTile = FIRST_EXCLUSION_TILE+warp*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int tbx = threadIdx.x - tgx; const unsigned int lastExclusionTile = FIRST_EXCLUSION_TILE+(warp+1)*(LAST_EXCLUSION_TILE-FIRST_EXCLUSION_TILE)/totalWarps;
const unsigned int localGroupIndex = threadIdx.x/TILE_SIZE; for (int pos = firstExclusionTile; pos < lastExclusionTile; pos++) {
unsigned int x, y; const ushort2 tileIndices = exclusionTiles[pos];
const unsigned int x = tileIndices.x;
const unsigned int y = tileIndices.y;
real4 force = make_real4(0); real4 force = make_real4(0);
if (pos < end) {
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
y = tileIndices.y;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-sqrt((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
}
unsigned int atom1 = x*TILE_SIZE + tgx; unsigned int atom1 = x*TILE_SIZE + tgx;
real4 posq1 = posq[atom1]; real4 posq1 = posq[atom1];
real bornRadius1 = global_bornRadii[atom1]; real bornRadius1 = global_bornRadii[atom1];
...@@ -441,7 +483,6 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo ...@@ -441,7 +483,6 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
else { else {
// This is an off-diagonal tile. // This is an off-diagonal tile.
if (lasty != y) {
unsigned int j = y*TILE_SIZE + tgx; unsigned int j = y*TILE_SIZE + tgx;
real4 tempPosq = posq[j]; real4 tempPosq = posq[j];
localData[threadIdx.x].x = tempPosq.x; localData[threadIdx.x].x = tempPosq.x;
...@@ -449,35 +490,15 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo ...@@ -449,35 +490,15 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
localData[threadIdx.x].z = tempPosq.z; localData[threadIdx.x].z = tempPosq.z;
localData[threadIdx.x].q = tempPosq.w; localData[threadIdx.x].q = tempPosq.w;
localData[threadIdx.x].bornRadius = global_bornRadii[j]; localData[threadIdx.x].bornRadius = global_bornRadii[j];
}
localData[threadIdx.x].fx = 0.0f; localData[threadIdx.x].fx = 0.0f;
localData[threadIdx.x].fy = 0.0f; localData[threadIdx.x].fy = 0.0f;
localData[threadIdx.x].fz = 0.0f; localData[threadIdx.x].fz = 0.0f;
localData[threadIdx.x].fw = 0.0f; localData[threadIdx.x].fw = 0.0f;
#ifdef USE_CUTOFF unsigned int tj = tgx;
unsigned int flags = (numTiles <= maxTiles ? interactionFlags[pos] : 0xFFFFFFFF); for (j = 0; j < TILE_SIZE; j++) {
bool computeSubset = false; if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) {
if (flags != 0xFFFFFFFF) { real4 posq2 = make_real4(localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z, localData[tbx+tj].q);
if (tgx < 2) real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
exclusionRange[2*localGroupIndex+tgx] = exclusionRowIndices[x+tgx];
if (tgx == 0)
exclusionIndex[localGroupIndex] = -1;
for (unsigned int i = exclusionRange[2*localGroupIndex]+tgx; i < exclusionRange[2*localGroupIndex+1]; i += TILE_SIZE)
if (exclusionIndices[i] == y)
exclusionIndex[localGroupIndex] = i*TILE_SIZE;
computeSubset = (exclusionIndex[localGroupIndex] == -1);
}
if (computeSubset) {
if (flags == 0) {
// No interactions in this tile.
}
else {
// Compute only a subset of the interactions in this tile.
for (unsigned int j = 0; j < TILE_SIZE; j++) {
if ((flags&(1<<j)) != 0) {
real4 posq2 = make_real4(localData[tbx+j].x, localData[tbx+j].y, localData[tbx+j].z, localData[tbx+j].q);
real4 delta = make_real4(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z, 0);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x; delta.x -= floor(delta.x*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y; delta.y -= floor(delta.y*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
...@@ -489,7 +510,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo ...@@ -489,7 +510,7 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
#endif #endif
real invR = RSQRT(r2); real invR = RSQRT(r2);
real r = RECIP(invR); real r = RECIP(invR);
real bornRadius2 = localData[tbx+j].bornRadius; real bornRadius2 = localData[tbx+tj].bornRadius;
real alpha2_ij = bornRadius1*bornRadius2; real alpha2_ij = bornRadius1*bornRadius2;
real D_ij = r2*RECIP(4.0f*alpha2_ij); real D_ij = r2*RECIP(4.0f*alpha2_ij);
real expTerm = EXP(-D_ij); real expTerm = EXP(-D_ij);
...@@ -499,67 +520,178 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo ...@@ -499,67 +520,178 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
real Gpol = tempEnergy*RECIP(denominator2); real Gpol = tempEnergy*RECIP(denominator2);
real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij); real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
real dEdR = Gpol*(1.0f - 0.25f*expTerm); real dEdR = Gpol*(1.0f - 0.25f*expTerm);
#ifdef USE_CUTOFF
if (atom1 >= NUM_ATOMS || y*TILE_SIZE+j >= NUM_ATOMS || r2 > CUTOFF_SQUARED) {
#else
if (atom1 >= NUM_ATOMS || y*TILE_SIZE+j >= NUM_ATOMS) {
#endif
dEdR = 0.0f;
dGpol_dalpha2_ij = 0.0f;
tempEnergy = 0.0f;
}
energy += tempEnergy;
force.w += dGpol_dalpha2_ij*bornRadius2; force.w += dGpol_dalpha2_ij*bornRadius2;
energy += tempEnergy;
delta *= dEdR; delta *= dEdR;
force.x -= delta.x; force.x -= delta.x;
force.y -= delta.y; force.y -= delta.y;
force.z -= delta.z; force.z -= delta.z;
delta.w = dGpol_dalpha2_ij*bornRadius1; localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z;
localData[tbx+tj].fw += dGpol_dalpha2_ij*bornRadius1;
#ifdef USE_CUTOFF #ifdef USE_CUTOFF
} }
else
delta = make_real4(0);
#endif #endif
}
tj = (tj + 1) & (TILE_SIZE - 1);
}
}
// Sum the forces on atom j. // Write results.
#ifdef ENABLE_SHUFFLE unsigned int offset = x*TILE_SIZE + tgx;
for (int i = 16; i >= 1; i /= 2) { atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
delta.x += __shfl_xor(delta.x, i, 32); atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
delta.y += __shfl_xor(delta.y, i, 32); atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
delta.z += __shfl_xor(delta.z, i, 32); atomicAdd(&global_bornForce[offset], static_cast<unsigned long long>((long long) (force.w*0x100000000)));
delta.w += __shfl_xor(delta.w, i, 32); if (x != y) {
offset = y*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
atomicAdd(&global_bornForce[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fw*0x100000000)));
} }
if (tgx == 0) {
localData[tbx+j].fx += delta.x;
localData[tbx+j].fy += delta.y;
localData[tbx+j].fz += delta.z;
localData[tbx+j].fw += delta.w;
} }
// Second loop: tiles without exclusions, either from the neighbor list (with cutoff) or just enumerating all
// of them (no cutoff).
#ifdef USE_CUTOFF
unsigned int numTiles = interactionCount[0];
int pos = warp*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
int end = (warp+1)*(numTiles > maxTiles ? NUM_BLOCKS*(NUM_BLOCKS+1)/2 : numTiles)/totalWarps;
#else #else
tempBuffer[threadIdx.x] = delta; int pos = warp*numTiles/totalWarps;
if (tgx % 4 == 0) int end = (warp+1)*numTiles/totalWarps;
tempBuffer[threadIdx.x] += tempBuffer[threadIdx.x+1]+tempBuffer[threadIdx.x+2]+tempBuffer[threadIdx.x+3]; #endif
if (tgx == 0) { int skipBase = 0;
real4 sum = tempBuffer[threadIdx.x]+tempBuffer[threadIdx.x+4]+tempBuffer[threadIdx.x+8]+tempBuffer[threadIdx.x+12]+tempBuffer[threadIdx.x+16]+tempBuffer[threadIdx.x+20]+tempBuffer[threadIdx.x+24]+tempBuffer[threadIdx.x+28]; int currentSkipIndex = tbx;
localData[tbx+j].fx += sum.x; __shared__ int atomIndices[FORCE_WORK_GROUP_SIZE];
localData[tbx+j].fy += sum.y; __shared__ int skipTiles[FORCE_WORK_GROUP_SIZE];
localData[tbx+j].fz += sum.z; skipTiles[threadIdx.x] = -1;
localData[tbx+j].fw += sum.w;
while (pos < end) {
real4 force = make_real4(0);
bool includeTile = true;
// Extract the coordinates of this tile.
unsigned int x, y;
bool singlePeriodicCopy = false;
#ifdef USE_CUTOFF
if (numTiles <= maxTiles) {
ushort2 tileIndices = tiles[pos];
x = tileIndices.x;
singlePeriodicCopy = tileIndices.y;
}
else
#endif
{
y = (unsigned int) floor(NUM_BLOCKS+0.5f-SQRT((NUM_BLOCKS+0.5f)*(NUM_BLOCKS+0.5f)-2*pos));
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
if (x < y || x >= NUM_BLOCKS) { // Occasionally happens due to roundoff error.
y += (x < y ? -1 : 1);
x = (pos-y*NUM_BLOCKS+y*(y+1)/2);
}
// Skip over tiles that have exclusions, since they were already processed.
while (skipTiles[tbx+TILE_SIZE-1] < pos) {
if (skipBase+tgx < NUM_TILES_WITH_EXCLUSIONS) {
ushort2 tile = exclusionTiles[skipBase+tgx];
skipTiles[threadIdx.x] = tile.x + tile.y*NUM_BLOCKS - tile.y*(tile.y+1)/2;
} }
else
skipTiles[threadIdx.x] = end;
skipBase += TILE_SIZE;
currentSkipIndex = tbx;
}
while (skipTiles[currentSkipIndex] < pos)
currentSkipIndex++;
includeTile = (skipTiles[currentSkipIndex] != pos);
}
if (includeTile) {
unsigned int atom1 = x*TILE_SIZE + tgx;
// Load atom data for this tile.
real4 posq1 = posq[atom1];
real bornRadius1 = global_bornRadii[atom1];
#ifdef USE_CUTOFF
unsigned int j = (numTiles <= maxTiles ? interactingAtoms[pos*TILE_SIZE+tgx] : y*TILE_SIZE + tgx);
#else
unsigned int j = y*TILE_SIZE + tgx;
#endif #endif
atomIndices[threadIdx.x] = j;
if (j < PADDED_NUM_ATOMS) {
real4 tempPosq = posq[j];
localData[threadIdx.x].x = tempPosq.x;
localData[threadIdx.x].y = tempPosq.y;
localData[threadIdx.x].z = tempPosq.z;
localData[threadIdx.x].q = tempPosq.w;
localData[threadIdx.x].bornRadius = global_bornRadii[j];
localData[threadIdx.x].fx = 0.0f;
localData[threadIdx.x].fy = 0.0f;
localData[threadIdx.x].fz = 0.0f;
localData[threadIdx.x].fw = 0.0f;
}
#ifdef USE_PERIODIC
if (singlePeriodicCopy) {
// The box is small enough that we can just translate all the atoms into a single periodic
// box, then skip having to apply periodic boundary conditions later.
real4 blockCenterX = blockCenter[x];
posq1.x -= floor((posq1.x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
posq1.y -= floor((posq1.y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
posq1.z -= floor((posq1.z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
localData[threadIdx.x].x -= floor((localData[threadIdx.x].x-blockCenterX.x)*invPeriodicBoxSize.x+0.5f)*periodicBoxSize.x;
localData[threadIdx.x].y -= floor((localData[threadIdx.x].y-blockCenterX.y)*invPeriodicBoxSize.y+0.5f)*periodicBoxSize.y;
localData[threadIdx.x].z -= floor((localData[threadIdx.x].z-blockCenterX.z)*invPeriodicBoxSize.z+0.5f)*periodicBoxSize.z;
unsigned int tj = tgx;
for (j = 0; j < TILE_SIZE; j++) {
int atom2 = atomIndices[tbx+tj];
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real4 posq2 = make_real4(localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z, localData[tbx+tj].q);
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
real r2 = delta.x*delta.x + delta.y*delta.y + delta.z*delta.z;
if (r2 < CUTOFF_SQUARED) {
real invR = RSQRT(r2);
real r = RECIP(invR);
real bornRadius2 = localData[tbx+tj].bornRadius;
real alpha2_ij = bornRadius1*bornRadius2;
real D_ij = r2*RECIP(4.0f*alpha2_ij);
real expTerm = EXP(-D_ij);
real denominator2 = r2 + alpha2_ij*expTerm;
real denominator = SQRT(denominator2);
real tempEnergy = (PREFACTOR*posq1.w*posq2.w)*RECIP(denominator);
real Gpol = tempEnergy*RECIP(denominator2);
real dGpol_dalpha2_ij = -0.5f*Gpol*expTerm*(1.0f+D_ij);
real dEdR = Gpol*(1.0f - 0.25f*expTerm);
force.w += dGpol_dalpha2_ij*bornRadius2;
energy += tempEnergy;
delta *= dEdR;
force.x -= delta.x;
force.y -= delta.y;
force.z -= delta.z;
localData[tbx+tj].fx += delta.x;
localData[tbx+tj].fy += delta.y;
localData[tbx+tj].fz += delta.z;
localData[tbx+tj].fw += dGpol_dalpha2_ij*bornRadius1;
} }
} }
tj = (tj + 1) & (TILE_SIZE - 1);
} }
} }
else else
#endif #endif
{ {
// Compute the full set of interactions in this tile. // We need to apply periodic boundary conditions separately for each interaction.
unsigned int tj = tgx; unsigned int tj = tgx;
for (unsigned int j = 0; j < TILE_SIZE; j++) { for (j = 0; j < TILE_SIZE; j++) {
if (atom1 < NUM_ATOMS && y*TILE_SIZE+tj < NUM_ATOMS) { int atom2 = atomIndices[tbx+tj];
if (atom1 < NUM_ATOMS && atom2 < NUM_ATOMS) {
real4 posq2 = make_real4(localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z, localData[tbx+tj].q); real4 posq2 = make_real4(localData[tbx+tj].x, localData[tbx+tj].y, localData[tbx+tj].z, localData[tbx+tj].q);
real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z); real3 delta = make_real3(posq2.x-posq1.x, posq2.y-posq1.y, posq2.z-posq1.z);
#ifdef USE_PERIODIC #ifdef USE_PERIODIC
...@@ -600,27 +732,26 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo ...@@ -600,27 +732,26 @@ extern "C" __global__ void computeGBSAForce1(unsigned long long* __restrict__ fo
tj = (tj + 1) & (TILE_SIZE - 1); tj = (tj + 1) & (TILE_SIZE - 1);
} }
} }
}
}
// Write results. // Write results.
if (pos < end) { atomicAdd(&forceBuffers[atom1], static_cast<unsigned long long>((long long) (force.x*0x100000000)));
const unsigned int offset = x*TILE_SIZE + tgx; atomicAdd(&forceBuffers[atom1+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000)));
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (force.x*0x100000000))); atomicAdd(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.y*0x100000000))); atomicAdd(&global_bornForce[atom1], static_cast<unsigned long long>((long long) (force.w*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (force.z*0x100000000))); #ifdef USE_CUTOFF
atomicAdd(&global_bornForce[offset], static_cast<unsigned long long>((long long) (force.w*0x100000000))); unsigned int atom2 = atomIndices[threadIdx.x];
#else
unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
if (atom2 < PADDED_NUM_ATOMS) {
atomicAdd(&forceBuffers[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
atomicAdd(&forceBuffers[atom2+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
atomicAdd(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
atomicAdd(&global_bornForce[atom2], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fw*0x100000000)));
} }
if (pos < end && x != y) {
const unsigned int offset = y*TILE_SIZE + tgx;
atomicAdd(&forceBuffers[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fx*0x100000000)));
atomicAdd(&forceBuffers[offset+PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fy*0x100000000)));
atomicAdd(&forceBuffers[offset+2*PADDED_NUM_ATOMS], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fz*0x100000000)));
atomicAdd(&global_bornForce[offset], static_cast<unsigned long long>((long long) (localData[threadIdx.x].fw*0x100000000)));
} }
lasty = y;
pos++; pos++;
} while (pos < end); }
energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy; energyBuffer[blockIdx.x*blockDim.x+threadIdx.x] += energy;
} }
...@@ -24,14 +24,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri ...@@ -24,14 +24,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
state.y ^= state.y << 13; state.y ^= state.y << 13;
state.y ^= state.y >> 17; state.y ^= state.y >> 17;
state.y ^= state.y << 5; state.y ^= state.y << 5;
x1 = sqrt(-2.0f * log(x1)); x1 = SQRT(-2.0f * LOG(x1));
k = (state.z >> 2) + (state.w >> 3) + (carry >> 2); k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
m = state.w + state.w + state.z + carry; m = state.w + state.w + state.z + carry;
state.z = state.w; state.z = state.w;
state.w = m; state.w = m;
carry = k >> 30; carry = k >> 30;
float x2 = (float)(state.x + state.y + state.w) / (float)0xffffffff; float x2 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
value.x = x1 * cos(2.0f * 3.14159265f * x2); value.x = x1 * COS(2.0f * 3.14159265f * x2);
// Generate second value. // Generate second value.
...@@ -49,14 +49,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri ...@@ -49,14 +49,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
state.y ^= state.y << 13; state.y ^= state.y << 13;
state.y ^= state.y >> 17; state.y ^= state.y >> 17;
state.y ^= state.y << 5; state.y ^= state.y << 5;
x3 = sqrt(-2.0f * log(x3)); x3 = SQRT(-2.0f * LOG(x3));
k = (state.z >> 2) + (state.w >> 3) + (carry >> 2); k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
m = state.w + state.w + state.z + carry; m = state.w + state.w + state.z + carry;
state.z = state.w; state.z = state.w;
state.w = m; state.w = m;
carry = k >> 30; carry = k >> 30;
float x4 = (float)(state.x + state.y + state.w) / (float)0xffffffff; float x4 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
value.y = x3 * cos(2.0f * 3.14159265f * x4); value.y = x3 * COS(2.0f * 3.14159265f * x4);
// Generate third value. // Generate third value.
...@@ -74,14 +74,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri ...@@ -74,14 +74,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
state.y ^= state.y << 13; state.y ^= state.y << 13;
state.y ^= state.y >> 17; state.y ^= state.y >> 17;
state.y ^= state.y << 5; state.y ^= state.y << 5;
x5 = sqrt(-2.0f * log(x5)); x5 = SQRT(-2.0f * LOG(x5));
k = (state.z >> 2) + (state.w >> 3) + (carry >> 2); k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
m = state.w + state.w + state.z + carry; m = state.w + state.w + state.z + carry;
state.z = state.w; state.z = state.w;
state.w = m; state.w = m;
carry = k >> 30; carry = k >> 30;
float x6 = (float)(state.x + state.y + state.w) / (float)0xffffffff; float x6 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
value.z = x5 * cos(2.0f * 3.14159265f * x6); value.z = x5 * COS(2.0f * 3.14159265f * x6);
// Generate fourth value. // Generate fourth value.
...@@ -99,14 +99,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri ...@@ -99,14 +99,14 @@ extern "C" __global__ void generateRandomNumbers(int numValues, float4* __restri
state.y ^= state.y << 13; state.y ^= state.y << 13;
state.y ^= state.y >> 17; state.y ^= state.y >> 17;
state.y ^= state.y << 5; state.y ^= state.y << 5;
x7 = sqrt(-2.0f * log(x7)); x7 = SQRT(-2.0f * LOG(x7));
k = (state.z >> 2) + (state.w >> 3) + (carry >> 2); k = (state.z >> 2) + (state.w >> 3) + (carry >> 2);
m = state.w + state.w + state.z + carry; m = state.w + state.w + state.z + carry;
state.z = state.w; state.z = state.w;
state.w = m; state.w = m;
carry = k >> 30; carry = k >> 30;
float x8 = (float)(state.x + state.y + state.w) / (float)0xffffffff; float x8 = (float)(state.x + state.y + state.w) / (float)0xffffffff;
value.w = x7 * cos(2.0f * 3.14159265f * x8); value.w = x7 * COS(2.0f * 3.14159265f * x8);
// Record the values. // Record the values.
...@@ -412,9 +412,9 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co ...@@ -412,9 +412,9 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
mixed yaksYd = zaksZd*xaksXd - xaksZd*zaksXd; mixed yaksYd = zaksZd*xaksXd - xaksZd*zaksXd;
mixed zaksYd = xaksZd*yaksXd - yaksZd*xaksXd; mixed zaksYd = xaksZd*yaksXd - yaksZd*xaksXd;
mixed axlng = sqrt(xaksXd*xaksXd + yaksXd*yaksXd + zaksXd*zaksXd); mixed axlng = SQRT(xaksXd*xaksXd + yaksXd*yaksXd + zaksXd*zaksXd);
mixed aylng = sqrt(xaksYd*xaksYd + yaksYd*yaksYd + zaksYd*zaksYd); mixed aylng = SQRT(xaksYd*xaksYd + yaksYd*yaksYd + zaksYd*zaksYd);
mixed azlng = sqrt(xaksZd*xaksZd + yaksZd*yaksZd + zaksZd*zaksZd); mixed azlng = SQRT(xaksZd*xaksZd + yaksZd*yaksZd + zaksZd*zaksZd);
mixed trns11 = xaksXd / axlng; mixed trns11 = xaksXd / axlng;
mixed trns21 = yaksXd / axlng; mixed trns21 = yaksXd / axlng;
mixed trns31 = zaksXd / axlng; mixed trns31 = zaksXd / axlng;
...@@ -440,13 +440,13 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co ...@@ -440,13 +440,13 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
// --- Step2 A2' --- // --- Step2 A2' ---
float rc = 0.5f*params.y; float rc = 0.5f*params.y;
mixed rb = sqrt(params.x*params.x-rc*rc); mixed rb = SQRT(params.x*params.x-rc*rc);
mixed ra = rb*(m1+m2)*invTotalMass; mixed ra = rb*(m1+m2)*invTotalMass;
rb -= ra; rb -= ra;
mixed sinphi = za1d/ra; mixed sinphi = za1d/ra;
mixed cosphi = sqrt(1-sinphi*sinphi); mixed cosphi = SQRT(1-sinphi*sinphi);
mixed sinpsi = (zb1d-zc1d) / (2*rc*cosphi); mixed sinpsi = (zb1d-zc1d) / (2*rc*cosphi);
mixed cospsi = sqrt(1-sinpsi*sinpsi); mixed cospsi = SQRT(1-sinpsi*sinpsi);
mixed ya2d = ra*cosphi; mixed ya2d = ra*cosphi;
mixed xb2d = - rc*cospsi; mixed xb2d = - rc*cospsi;
...@@ -454,7 +454,7 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co ...@@ -454,7 +454,7 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
mixed yc2d = - rb*cosphi + rc*sinpsi*sinphi; mixed yc2d = - rb*cosphi + rc*sinpsi*sinphi;
mixed xb2d2 = xb2d*xb2d; mixed xb2d2 = xb2d*xb2d;
mixed hh2 = 4.0f*xb2d2 + (yb2d-yc2d)*(yb2d-yc2d) + (zb1d-zc1d)*(zb1d-zc1d); mixed hh2 = 4.0f*xb2d2 + (yb2d-yc2d)*(yb2d-yc2d) + (zb1d-zc1d)*(zb1d-zc1d);
mixed deltx = 2.0f*xb2d + sqrt(4.0f*xb2d2 - hh2 + params.y*params.y); mixed deltx = 2.0f*xb2d + SQRT(4.0f*xb2d2 - hh2 + params.y*params.y);
xb2d -= deltx*0.5f; xb2d -= deltx*0.5f;
// --- Step3 al,be,ga --- // --- Step3 al,be,ga ---
...@@ -464,11 +464,11 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co ...@@ -464,11 +464,11 @@ extern "C" __global__ void applySettleToPositions(int numClusters, mixed tol, co
mixed gamma = xb0d*yb1d - xb1d*yb0d + xc0d*yc1d - xc1d*yc0d; mixed gamma = xb0d*yb1d - xb1d*yb0d + xc0d*yc1d - xc1d*yc0d;
mixed al2be2 = alpha*alpha + beta*beta; mixed al2be2 = alpha*alpha + beta*beta;
mixed sintheta = (alpha*gamma - beta*sqrt(al2be2 - gamma*gamma)) / al2be2; mixed sintheta = (alpha*gamma - beta*SQRT(al2be2 - gamma*gamma)) / al2be2;
// --- Step4 A3' --- // --- Step4 A3' ---
mixed costheta = sqrt(1-sintheta*sintheta); mixed costheta = SQRT(1-sintheta*sintheta);
mixed xa3d = - ya2d*sintheta; mixed xa3d = - ya2d*sintheta;
mixed ya3d = ya2d*costheta; mixed ya3d = ya2d*costheta;
mixed za3d = za1d; mixed za3d = za1d;
...@@ -534,9 +534,9 @@ extern "C" __global__ void applySettleToVelocities(int numClusters, mixed tol, c ...@@ -534,9 +534,9 @@ extern "C" __global__ void applySettleToVelocities(int numClusters, mixed tol, c
mixed3 eAB = make_mixed3(apos1.x-apos0.x, apos1.y-apos0.y, apos1.z-apos0.z); mixed3 eAB = make_mixed3(apos1.x-apos0.x, apos1.y-apos0.y, apos1.z-apos0.z);
mixed3 eBC = make_mixed3(apos2.x-apos1.x, apos2.y-apos1.y, apos2.z-apos1.z); mixed3 eBC = make_mixed3(apos2.x-apos1.x, apos2.y-apos1.y, apos2.z-apos1.z);
mixed3 eCA = make_mixed3(apos0.x-apos2.x, apos0.y-apos2.y, apos0.z-apos2.z); mixed3 eCA = make_mixed3(apos0.x-apos2.x, apos0.y-apos2.y, apos0.z-apos2.z);
eAB *= rsqrt(eAB.x*eAB.x + eAB.y*eAB.y + eAB.z*eAB.z); eAB *= RSQRT(eAB.x*eAB.x + eAB.y*eAB.y + eAB.z*eAB.z);
eBC *= rsqrt(eBC.x*eBC.x + eBC.y*eBC.y + eBC.z*eBC.z); eBC *= RSQRT(eBC.x*eBC.x + eBC.y*eBC.y + eBC.z*eBC.z);
eCA *= rsqrt(eCA.x*eCA.x + eCA.y*eCA.y + eCA.z*eCA.z); eCA *= RSQRT(eCA.x*eCA.x + eCA.y*eCA.y + eCA.z*eCA.z);
mixed vAB = (v1.x-v0.x)*eAB.x + (v1.y-v0.y)*eAB.y + (v1.z-v0.z)*eAB.z; mixed vAB = (v1.x-v0.x)*eAB.x + (v1.y-v0.y)*eAB.y + (v1.z-v0.z)*eAB.z;
mixed vBC = (v2.x-v1.x)*eBC.x + (v2.y-v1.y)*eBC.y + (v2.z-v1.z)*eBC.z; mixed vBC = (v2.x-v1.x)*eBC.x + (v2.y-v1.y)*eBC.y + (v2.z-v1.z)*eBC.z;
mixed vCA = (v0.x-v2.x)*eCA.x + (v0.y-v2.y)*eCA.y + (v0.z-v2.z)*eCA.z; mixed vCA = (v0.x-v2.x)*eCA.x + (v0.y-v2.y)*eCA.y + (v0.z-v2.z)*eCA.z;
...@@ -574,7 +574,8 @@ extern "C" __global__ void applySettleToVelocities(int numClusters, mixed tol, c ...@@ -574,7 +574,8 @@ extern "C" __global__ void applySettleToVelocities(int numClusters, mixed tol, c
/** /**
* Compute the direction each CCMA constraint is pointing in. This is called once at the beginning of constraint evaluation. * Compute the direction each CCMA constraint is pointing in. This is called once at the beginning of constraint evaluation.
*/ */
extern "C" __global__ void computeCCMAConstraintDirections(const int2* __restrict__ constraintAtoms, mixed4* __restrict__ constraintDistance, const real4* __restrict__ atomPositions, const real4* __restrict__ posqCorrection) { extern "C" __global__ void computeCCMAConstraintDirections(const int2* __restrict__ constraintAtoms, mixed4* __restrict__ constraintDistance,
const real4* __restrict__ atomPositions, const real4* __restrict__ posqCorrection, int* __restrict__ converged) {
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_CCMA_CONSTRAINTS; index += blockDim.x*gridDim.x) { for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_CCMA_CONSTRAINTS; index += blockDim.x*gridDim.x) {
// Compute the direction for this constraint. // Compute the direction for this constraint.
...@@ -587,6 +588,10 @@ extern "C" __global__ void computeCCMAConstraintDirections(const int2* __restric ...@@ -587,6 +588,10 @@ extern "C" __global__ void computeCCMAConstraintDirections(const int2* __restric
dir.z = oldPos1.z-oldPos2.z; dir.z = oldPos1.z-oldPos2.z;
constraintDistance[index] = dir; constraintDistance[index] = dir;
} }
if (threadIdx.x == 0 && blockIdx.x == 0) {
converged[0] = 1;
converged[1] = 0;
}
} }
/** /**
...@@ -605,6 +610,7 @@ extern "C" __global__ void computeCCMAPositionConstraintForce(const int2* __rest ...@@ -605,6 +610,7 @@ extern "C" __global__ void computeCCMAPositionConstraintForce(const int2* __rest
__syncthreads(); __syncthreads();
mixed lowerTol = 1-2*tol+tol*tol; mixed lowerTol = 1-2*tol+tol*tol;
mixed upperTol = 1+2*tol+tol*tol; mixed upperTol = 1+2*tol+tol*tol;
bool threadConverged = true;
for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_CCMA_CONSTRAINTS; index += blockDim.x*gridDim.x) { for (int index = blockIdx.x*blockDim.x+threadIdx.x; index < NUM_CCMA_CONSTRAINTS; index += blockDim.x*gridDim.x) {
// Compute the force due to this constraint. // Compute the force due to this constraint.
...@@ -620,14 +626,13 @@ extern "C" __global__ void computeCCMAPositionConstraintForce(const int2* __rest ...@@ -620,14 +626,13 @@ extern "C" __global__ void computeCCMAPositionConstraintForce(const int2* __rest
mixed dist2 = dir.w*dir.w; mixed dist2 = dir.w*dir.w;
mixed diff = dist2 - rp2; mixed diff = dist2 - rp2;
delta1[index] = (rrpr > d_ij2*1e-6f ? reducedMass[index]*diff/rrpr : 0.0f); delta1[index] = (rrpr > d_ij2*1e-6f ? reducedMass[index]*diff/rrpr : 0.0f);
threadConverged &= (rp2 > lowerTol*dist2 && rp2 < upperTol*dist2);
// See whether it has converged. }
if (groupConverged && !threadConverged)
if (groupConverged && (rp2 < lowerTol*dist2 || rp2 > upperTol*dist2)) {
groupConverged = 0; groupConverged = 0;
__syncthreads();
if (threadIdx.x == 0 && !groupConverged)
converged[iteration%2] = 0; converged[iteration%2] = 0;
}
}
} }
/** /**
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment